diff options
Diffstat (limited to 'fs/f2fs')
-rw-r--r-- | fs/f2fs/Kconfig | 152 | ||||
-rw-r--r-- | fs/f2fs/Makefile | 12 | ||||
-rw-r--r-- | fs/f2fs/acl.c | 441 | ||||
-rw-r--r-- | fs/f2fs/acl.h | 51 | ||||
-rw-r--r-- | fs/f2fs/checkpoint.c | 1954 | ||||
-rw-r--r-- | fs/f2fs/compress.c | 2054 | ||||
-rw-r--r-- | fs/f2fs/data.c | 4235 | ||||
-rw-r--r-- | fs/f2fs/debug.c | 679 | ||||
-rw-r--r-- | fs/f2fs/dir.c | 1182 | ||||
-rw-r--r-- | fs/f2fs/extent_cache.c | 1061 | ||||
-rw-r--r-- | fs/f2fs/f2fs.h | 4555 | ||||
-rw-r--r-- | fs/f2fs/file.c | 4935 | ||||
-rw-r--r-- | fs/f2fs/gc.c | 2250 | ||||
-rw-r--r-- | fs/f2fs/gc.h | 168 | ||||
-rw-r--r-- | fs/f2fs/hash.c | 137 | ||||
-rw-r--r-- | fs/f2fs/inline.c | 815 | ||||
-rw-r--r-- | fs/f2fs/inode.c | 957 | ||||
-rw-r--r-- | fs/f2fs/iostat.c | 324 | ||||
-rw-r--r-- | fs/f2fs/iostat.h | 84 | ||||
-rw-r--r-- | fs/f2fs/namei.c | 1400 | ||||
-rw-r--r-- | fs/f2fs/node.c | 3438 | ||||
-rw-r--r-- | fs/f2fs/node.h | 429 | ||||
-rw-r--r-- | fs/f2fs/recovery.c | 934 | ||||
-rw-r--r-- | fs/f2fs/segment.c | 5251 | ||||
-rw-r--r-- | fs/f2fs/segment.h | 959 | ||||
-rw-r--r-- | fs/f2fs/shrinker.c | 141 | ||||
-rw-r--r-- | fs/f2fs/super.c | 4807 | ||||
-rw-r--r-- | fs/f2fs/sysfs.c | 1352 | ||||
-rw-r--r-- | fs/f2fs/verity.c | 292 | ||||
-rw-r--r-- | fs/f2fs/xattr.c | 844 | ||||
-rw-r--r-- | fs/f2fs/xattr.h | 166 |
31 files changed, 46059 insertions, 0 deletions
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig new file mode 100644 index 000000000..03ef08753 --- /dev/null +++ b/fs/f2fs/Kconfig @@ -0,0 +1,152 @@ +# SPDX-License-Identifier: GPL-2.0-only +config F2FS_FS + tristate "F2FS filesystem support" + depends on BLOCK + select NLS + select CRYPTO + select CRYPTO_CRC32 + select F2FS_FS_XATTR if FS_ENCRYPTION + select FS_ENCRYPTION_ALGS if FS_ENCRYPTION + select FS_IOMAP + select LZ4_COMPRESS if F2FS_FS_LZ4 + select LZ4_DECOMPRESS if F2FS_FS_LZ4 + select LZ4HC_COMPRESS if F2FS_FS_LZ4HC + select LZO_COMPRESS if F2FS_FS_LZO + select LZO_DECOMPRESS if F2FS_FS_LZO + select ZSTD_COMPRESS if F2FS_FS_ZSTD + select ZSTD_DECOMPRESS if F2FS_FS_ZSTD + help + F2FS is based on Log-structured File System (LFS), which supports + versatile "flash-friendly" features. The design has been focused on + addressing the fundamental issues in LFS, which are snowball effect + of wandering tree and high cleaning overhead. + + Since flash-based storages show different characteristics according to + the internal geometry or flash memory management schemes aka FTL, F2FS + and tools support various parameters not only for configuring on-disk + layout, but also for selecting allocation and cleaning algorithms. + + If unsure, say N. + +config F2FS_STAT_FS + bool "F2FS Status Information" + depends on F2FS_FS + default y + help + /sys/kernel/debug/f2fs/ contains information about all the partitions + mounted as f2fs. Each file shows the whole f2fs information. + + /sys/kernel/debug/f2fs/status includes: + - major filesystem information managed by f2fs currently + - average SIT information about whole segments + - current memory footprint consumed by f2fs. + +config F2FS_FS_XATTR + bool "F2FS extended attributes" + depends on F2FS_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page for details). + + If unsure, say N. + +config F2FS_FS_POSIX_ACL + bool "F2FS Access Control Lists" + depends on F2FS_FS_XATTR + select FS_POSIX_ACL + default y + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + If you don't know what Access Control Lists are, say N + +config F2FS_FS_SECURITY + bool "F2FS Security Labels" + depends on F2FS_FS_XATTR + help + Security labels provide an access control facility to support Linux + Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO + Linux. This option enables an extended attribute handler for file + security labels in the f2fs filesystem, so that it requires enabling + the extended attribute support in advance. In particular you need this + option if you use the setcap command to assign initial process capabi- + lities to executables (the security.* extended attributes). + + If you are not using a security module, say N. + +config F2FS_CHECK_FS + bool "F2FS consistency checking feature" + depends on F2FS_FS + help + Enables BUG_ONs which check the filesystem consistency in runtime. + + If you want to improve the performance, say N. + +config F2FS_FAULT_INJECTION + bool "F2FS fault injection facility" + depends on F2FS_FS + help + Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on. + + If unsure, say N. + +config F2FS_FS_COMPRESSION + bool "F2FS compression feature" + depends on F2FS_FS + help + Enable filesystem-level compression on f2fs regular files, + multiple back-end compression algorithms are supported. + +config F2FS_FS_LZO + bool "LZO compression support" + depends on F2FS_FS_COMPRESSION + default y + help + Support LZO compress algorithm, if unsure, say Y. + +config F2FS_FS_LZORLE + bool "LZO-RLE compression support" + depends on F2FS_FS_LZO + default y + help + Support LZO-RLE compress algorithm, if unsure, say Y. + +config F2FS_FS_LZ4 + bool "LZ4 compression support" + depends on F2FS_FS_COMPRESSION + default y + help + Support LZ4 compress algorithm, if unsure, say Y. + +config F2FS_FS_LZ4HC + bool "LZ4HC compression support" + depends on F2FS_FS_LZ4 + default y + help + Support LZ4HC compress algorithm, LZ4HC has compatible on-disk + layout with LZ4, if unsure, say Y. + +config F2FS_FS_ZSTD + bool "ZSTD compression support" + depends on F2FS_FS_COMPRESSION + default y + help + Support ZSTD compress algorithm, if unsure, say Y. + +config F2FS_IOSTAT + bool "F2FS IO statistics information" + depends on F2FS_FS + default y + help + Support getting IO statistics through sysfs and printing out periodic + IO statistics tracepoint events. You have to turn on "iostat_enable" + sysfs node to enable this feature. + +config F2FS_UNFAIR_RWSEM + bool "F2FS unfair rw_semaphore" + depends on F2FS_FS && BLK_CGROUP + help + Use unfair rw_semaphore, if system configured IO priority by block + cgroup. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile new file mode 100644 index 000000000..8a7322d22 --- /dev/null +++ b/fs/f2fs/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_F2FS_FS) += f2fs.o + +f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o +f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o +f2fs-y += shrinker.o extent_cache.o sysfs.o +f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o +f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o +f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o +f2fs-$(CONFIG_FS_VERITY) += verity.o +f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o +f2fs-$(CONFIG_F2FS_IOSTAT) += iostat.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c new file mode 100644 index 000000000..5bbc44a52 --- /dev/null +++ b/fs/f2fs/acl.c @@ -0,0 +1,441 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/acl.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext2/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> + */ +#include <linux/f2fs_fs.h> +#include "f2fs.h" +#include "xattr.h" +#include "acl.h" + +static inline size_t f2fs_acl_size(int count) +{ + if (count <= 4) { + return sizeof(struct f2fs_acl_header) + + count * sizeof(struct f2fs_acl_entry_short); + } else { + return sizeof(struct f2fs_acl_header) + + 4 * sizeof(struct f2fs_acl_entry_short) + + (count - 4) * sizeof(struct f2fs_acl_entry); + } +} + +static inline int f2fs_acl_count(size_t size) +{ + ssize_t s; + + size -= sizeof(struct f2fs_acl_header); + s = size - 4 * sizeof(struct f2fs_acl_entry_short); + if (s < 0) { + if (size % sizeof(struct f2fs_acl_entry_short)) + return -1; + return size / sizeof(struct f2fs_acl_entry_short); + } else { + if (s % sizeof(struct f2fs_acl_entry)) + return -1; + return s / sizeof(struct f2fs_acl_entry) + 4; + } +} + +static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size) +{ + int i, count; + struct posix_acl *acl; + struct f2fs_acl_header *hdr = (struct f2fs_acl_header *)value; + struct f2fs_acl_entry *entry = (struct f2fs_acl_entry *)(hdr + 1); + const char *end = value + size; + + if (size < sizeof(struct f2fs_acl_header)) + return ERR_PTR(-EINVAL); + + if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION)) + return ERR_PTR(-EINVAL); + + count = f2fs_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + + if ((char *)entry > end) + goto fail; + + acl->a_entries[i].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[i].e_perm = le16_to_cpu(entry->e_perm); + + switch (acl->a_entries[i].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry_short)); + break; + + case ACL_USER: + acl->a_entries[i].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry)); + break; + case ACL_GROUP: + acl->a_entries[i].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry)); + break; + default: + goto fail; + } + } + if ((char *)entry != end) + goto fail; + return acl; +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi, + const struct posix_acl *acl, size_t *size) +{ + struct f2fs_acl_header *f2fs_acl; + struct f2fs_acl_entry *entry; + int i; + + f2fs_acl = f2fs_kmalloc(sbi, sizeof(struct f2fs_acl_header) + + acl->a_count * sizeof(struct f2fs_acl_entry), + GFP_NOFS); + if (!f2fs_acl) + return ERR_PTR(-ENOMEM); + + f2fs_acl->a_version = cpu_to_le32(F2FS_ACL_VERSION); + entry = (struct f2fs_acl_entry *)(f2fs_acl + 1); + + for (i = 0; i < acl->a_count; i++) { + + entry->e_tag = cpu_to_le16(acl->a_entries[i].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[i].e_perm); + + switch (acl->a_entries[i].e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, + acl->a_entries[i].e_uid)); + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry)); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, + acl->a_entries[i].e_gid)); + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry)); + break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry_short)); + break; + default: + goto fail; + } + } + *size = f2fs_acl_size(acl->a_count); + return (void *)f2fs_acl; + +fail: + kfree(f2fs_acl); + return ERR_PTR(-EINVAL); +} + +static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, + struct page *dpage) +{ + int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; + void *value = NULL; + struct posix_acl *acl; + int retval; + + if (type == ACL_TYPE_ACCESS) + name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; + + retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage); + if (retval > 0) { + value = f2fs_kmalloc(F2FS_I_SB(inode), retval, GFP_F2FS_ZERO); + if (!value) + return ERR_PTR(-ENOMEM); + retval = f2fs_getxattr(inode, name_index, "", value, + retval, dpage); + } + + if (retval > 0) + acl = f2fs_acl_from_disk(value, retval); + else if (retval == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(retval); + kfree(value); + + return acl; +} + +struct posix_acl *f2fs_get_acl(struct inode *inode, int type, bool rcu) +{ + if (rcu) + return ERR_PTR(-ECHILD); + + return __f2fs_get_acl(inode, type, NULL); +} + +static int f2fs_acl_update_mode(struct user_namespace *mnt_userns, + struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + if (is_inode_flag_set(inode, FI_ACL_MODE)) + mode = F2FS_I(inode)->i_acl_mode; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} + +static int __f2fs_set_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, + struct posix_acl *acl, struct page *ipage) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int error; + umode_t mode = inode->i_mode; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl && !ipage) { + error = f2fs_acl_update_mode(mnt_userns, inode, + &mode, &acl); + if (error) + return error; + set_acl_inode(inode, mode); + } + break; + + case ACL_TYPE_DEFAULT: + name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + + default: + return -EINVAL; + } + + if (acl) { + value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); + if (IS_ERR(value)) { + clear_inode_flag(inode, FI_ACL_MODE); + return PTR_ERR(value); + } + } + + error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0); + + kfree(value); + if (!error) + set_cached_acl(inode, type, acl); + + clear_inode_flag(inode, FI_ACL_MODE); + return error; +} + +int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) +{ + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + + return __f2fs_set_acl(mnt_userns, inode, type, acl, NULL); +} + +/* + * Most part of f2fs_acl_clone, f2fs_acl_create_masq, f2fs_acl_create + * are copied from posix_acl.c + */ +static struct posix_acl *f2fs_acl_clone(const struct posix_acl *acl, + gfp_t flags) +{ + struct posix_acl *clone = NULL; + + if (acl) { + int size = sizeof(struct posix_acl) + acl->a_count * + sizeof(struct posix_acl_entry); + clone = kmemdup(acl, size, flags); + if (clone) + refcount_set(&clone->a_refcount, 1); + } + return clone; +} + +static int f2fs_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) +{ + struct posix_acl_entry *pa, *pe; + struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; + umode_t mode = *mode_p; + int not_equiv = 0; + + /* assert(atomic_read(acl->a_refcount) == 1); */ + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch (pa->e_tag) { + case ACL_USER_OBJ: + pa->e_perm &= (mode >> 6) | ~S_IRWXO; + mode &= (pa->e_perm << 6) | ~S_IRWXU; + break; + + case ACL_USER: + case ACL_GROUP: + not_equiv = 1; + break; + + case ACL_GROUP_OBJ: + group_obj = pa; + break; + + case ACL_OTHER: + pa->e_perm &= mode | ~S_IRWXO; + mode &= pa->e_perm | ~S_IRWXO; + break; + + case ACL_MASK: + mask_obj = pa; + not_equiv = 1; + break; + + default: + return -EIO; + } + } + + if (mask_obj) { + mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO; + mode &= (mask_obj->e_perm << 3) | ~S_IRWXG; + } else { + if (!group_obj) + return -EIO; + group_obj->e_perm &= (mode >> 3) | ~S_IRWXO; + mode &= (group_obj->e_perm << 3) | ~S_IRWXG; + } + + *mode_p = (*mode_p & ~S_IRWXUGO) | mode; + return not_equiv; +} + +static int f2fs_acl_create(struct inode *dir, umode_t *mode, + struct posix_acl **default_acl, struct posix_acl **acl, + struct page *dpage) +{ + struct posix_acl *p; + struct posix_acl *clone; + int ret; + + *acl = NULL; + *default_acl = NULL; + + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) + return 0; + + p = __f2fs_get_acl(dir, ACL_TYPE_DEFAULT, dpage); + if (!p || p == ERR_PTR(-EOPNOTSUPP)) { + *mode &= ~current_umask(); + return 0; + } + if (IS_ERR(p)) + return PTR_ERR(p); + + clone = f2fs_acl_clone(p, GFP_NOFS); + if (!clone) { + ret = -ENOMEM; + goto release_acl; + } + + ret = f2fs_acl_create_masq(clone, mode); + if (ret < 0) + goto release_clone; + + if (ret == 0) + posix_acl_release(clone); + else + *acl = clone; + + if (!S_ISDIR(*mode)) + posix_acl_release(p); + else + *default_acl = p; + + return 0; + +release_clone: + posix_acl_release(clone); +release_acl: + posix_acl_release(p); + return ret; +} + +int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, + struct page *dpage) +{ + struct posix_acl *default_acl = NULL, *acl = NULL; + int error; + + error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dpage); + if (error) + return error; + + f2fs_mark_inode_dirty_sync(inode, true); + + if (default_acl) { + error = __f2fs_set_acl(NULL, inode, ACL_TYPE_DEFAULT, default_acl, + ipage); + posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; + } + if (acl) { + if (!error) + error = __f2fs_set_acl(NULL, inode, ACL_TYPE_ACCESS, acl, + ipage); + posix_acl_release(acl); + } else { + inode->i_acl = NULL; + } + + return error; +} diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h new file mode 100644 index 000000000..a26e33cab --- /dev/null +++ b/fs/f2fs/acl.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * fs/f2fs/acl.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext2/acl.h + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> + */ +#ifndef __F2FS_ACL_H__ +#define __F2FS_ACL_H__ + +#include <linux/posix_acl_xattr.h> + +#define F2FS_ACL_VERSION 0x0001 + +struct f2fs_acl_entry { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +}; + +struct f2fs_acl_entry_short { + __le16 e_tag; + __le16 e_perm; +}; + +struct f2fs_acl_header { + __le32 a_version; +}; + +#ifdef CONFIG_F2FS_FS_POSIX_ACL + +extern struct posix_acl *f2fs_get_acl(struct inode *, int, bool); +extern int f2fs_set_acl(struct user_namespace *, struct inode *, + struct posix_acl *, int); +extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, + struct page *); +#else +#define f2fs_get_acl NULL +#define f2fs_set_acl NULL + +static inline int f2fs_init_acl(struct inode *inode, struct inode *dir, + struct page *ipage, struct page *dpage) +{ + return 0; +} +#endif +#endif /* __F2FS_ACL_H__ */ diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c new file mode 100644 index 000000000..eb4d69f53 --- /dev/null +++ b/fs/f2fs/checkpoint.c @@ -0,0 +1,1954 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/checkpoint.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + */ +#include <linux/fs.h> +#include <linux/bio.h> +#include <linux/mpage.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/f2fs_fs.h> +#include <linux/pagevec.h> +#include <linux/swap.h> +#include <linux/kthread.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "iostat.h" +#include <trace/events/f2fs.h> + +#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + +static struct kmem_cache *ino_entry_slab; +struct kmem_cache *f2fs_inode_entry_slab; + +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason) +{ + f2fs_build_fault_attr(sbi, 0, 0); + set_ckpt_flags(sbi, CP_ERROR_FLAG); + if (!end_io) { + f2fs_flush_merged_writes(sbi); + + f2fs_handle_stop(sbi, reason); + } +} + +/* + * We guarantee no failure on the returned page. + */ +struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ + struct address_space *mapping = META_MAPPING(sbi); + struct page *page; +repeat: + page = f2fs_grab_cache_page(mapping, index, false); + if (!page) { + cond_resched(); + goto repeat; + } + f2fs_wait_on_page_writeback(page, META, true, true); + if (!PageUptodate(page)) + SetPageUptodate(page); + return page; +} + +static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, + bool is_meta) +{ + struct address_space *mapping = META_MAPPING(sbi); + struct page *page; + struct f2fs_io_info fio = { + .sbi = sbi, + .type = META, + .op = REQ_OP_READ, + .op_flags = REQ_META | REQ_PRIO, + .old_blkaddr = index, + .new_blkaddr = index, + .encrypted_page = NULL, + .is_por = !is_meta, + }; + int err; + + if (unlikely(!is_meta)) + fio.op_flags &= ~REQ_META; +repeat: + page = f2fs_grab_cache_page(mapping, index, false); + if (!page) { + cond_resched(); + goto repeat; + } + if (PageUptodate(page)) + goto out; + + fio.page = page; + + err = f2fs_submit_page_bio(&fio); + if (err) { + f2fs_put_page(page, 1); + return ERR_PTR(err); + } + + f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE); + + lock_page(page); + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + + if (unlikely(!PageUptodate(page))) { + f2fs_handle_page_eio(sbi, page->index, META); + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } +out: + return page; +} + +struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ + return __get_meta_page(sbi, index, true); +} + +struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index) +{ + struct page *page; + int count = 0; + +retry: + page = __get_meta_page(sbi, index, true); + if (IS_ERR(page)) { + if (PTR_ERR(page) == -EIO && + ++count <= DEFAULT_RETRY_IO_COUNT) + goto retry; + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_META_PAGE); + } + return page; +} + +/* for POR only */ +struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ + return __get_meta_page(sbi, index, false); +} + +static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, + int type) +{ + struct seg_entry *se; + unsigned int segno, offset; + bool exist; + + if (type == DATA_GENERIC) + return true; + + segno = GET_SEGNO(sbi, blkaddr); + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + se = get_seg_entry(sbi, segno); + + exist = f2fs_test_bit(offset, se->cur_valid_map); + if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) { + f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", + blkaddr, exist); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return exist; + } + + if (!exist && type == DATA_GENERIC_ENHANCE) { + f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", + blkaddr, exist); + set_sbi_flag(sbi, SBI_NEED_FSCK); + dump_stack(); + } + return exist; +} + +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + switch (type) { + case META_NAT: + break; + case META_SIT: + if (unlikely(blkaddr >= SIT_BLK_CNT(sbi))) + return false; + break; + case META_SSA: + if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) || + blkaddr < SM_I(sbi)->ssa_blkaddr)) + return false; + break; + case META_CP: + if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr || + blkaddr < __start_cp_addr(sbi))) + return false; + break; + case META_POR: + if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || + blkaddr < MAIN_BLKADDR(sbi))) + return false; + break; + case DATA_GENERIC: + case DATA_GENERIC_ENHANCE: + case DATA_GENERIC_ENHANCE_READ: + case DATA_GENERIC_ENHANCE_UPDATE: + if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || + blkaddr < MAIN_BLKADDR(sbi))) { + f2fs_warn(sbi, "access invalid blkaddr:%u", + blkaddr); + set_sbi_flag(sbi, SBI_NEED_FSCK); + dump_stack(); + return false; + } else { + return __is_bitmap_valid(sbi, blkaddr, type); + } + break; + case META_GENERIC: + if (unlikely(blkaddr < SEG0_BLKADDR(sbi) || + blkaddr >= MAIN_BLKADDR(sbi))) + return false; + break; + default: + BUG(); + } + + return true; +} + +/* + * Readahead CP/NAT/SIT/SSA/POR pages + */ +int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync) +{ + struct page *page; + block_t blkno = start; + struct f2fs_io_info fio = { + .sbi = sbi, + .type = META, + .op = REQ_OP_READ, + .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD, + .encrypted_page = NULL, + .in_list = false, + .is_por = (type == META_POR), + }; + struct blk_plug plug; + int err; + + if (unlikely(type == META_POR)) + fio.op_flags &= ~REQ_META; + + blk_start_plug(&plug); + for (; nrpages-- > 0; blkno++) { + + if (!f2fs_is_valid_blkaddr(sbi, blkno, type)) + goto out; + + switch (type) { + case META_NAT: + if (unlikely(blkno >= + NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid))) + blkno = 0; + /* get nat block addr */ + fio.new_blkaddr = current_nat_addr(sbi, + blkno * NAT_ENTRY_PER_BLOCK); + break; + case META_SIT: + if (unlikely(blkno >= TOTAL_SEGS(sbi))) + goto out; + /* get sit block addr */ + fio.new_blkaddr = current_sit_addr(sbi, + blkno * SIT_ENTRY_PER_BLOCK); + break; + case META_SSA: + case META_CP: + case META_POR: + fio.new_blkaddr = blkno; + break; + default: + BUG(); + } + + page = f2fs_grab_cache_page(META_MAPPING(sbi), + fio.new_blkaddr, false); + if (!page) + continue; + if (PageUptodate(page)) { + f2fs_put_page(page, 1); + continue; + } + + fio.page = page; + err = f2fs_submit_page_bio(&fio); + f2fs_put_page(page, err ? 1 : 0); + + if (!err) + f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, + F2FS_BLKSIZE); + } +out: + blk_finish_plug(&plug); + return blkno - start; +} + +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, + unsigned int ra_blocks) +{ + struct page *page; + bool readahead = false; + + if (ra_blocks == RECOVERY_MIN_RA_BLOCKS) + return; + + page = find_get_page(META_MAPPING(sbi), index); + if (!page || !PageUptodate(page)) + readahead = true; + f2fs_put_page(page, 0); + + if (readahead) + f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true); +} + +static int __f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + + trace_f2fs_writepage(page, META); + + if (unlikely(f2fs_cp_error(sbi))) { + if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) { + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_META); + unlock_page(page); + return 0; + } + goto redirty_out; + } + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) + goto redirty_out; + + f2fs_do_write_meta_page(sbi, page, io_type); + dec_page_count(sbi, F2FS_DIRTY_META); + + if (wbc->for_reclaim) + f2fs_submit_merged_write_cond(sbi, NULL, page, 0, META); + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) + f2fs_submit_merged_write(sbi, META); + + return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; +} + +static int f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc) +{ + return __f2fs_write_meta_page(page, wbc, FS_META_IO); +} + +static int f2fs_write_meta_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + long diff, written; + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + + /* collect a number of dirty meta pages and write together */ + if (wbc->sync_mode != WB_SYNC_ALL && + get_pages(sbi, F2FS_DIRTY_META) < + nr_pages_to_skip(sbi, META)) + goto skip_write; + + /* if locked failed, cp will flush dirty pages instead */ + if (!f2fs_down_write_trylock(&sbi->cp_global_sem)) + goto skip_write; + + trace_f2fs_writepages(mapping->host, wbc, META); + diff = nr_pages_to_write(sbi, META, wbc); + written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); + f2fs_up_write(&sbi->cp_global_sem); + wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); + return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); + trace_f2fs_writepages(mapping->host, wbc, META); + return 0; +} + +long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, + long nr_to_write, enum iostat_type io_type) +{ + struct address_space *mapping = META_MAPPING(sbi); + pgoff_t index = 0, prev = ULONG_MAX; + struct pagevec pvec; + long nwritten = 0; + int nr_pages; + struct writeback_control wbc = { + .for_reclaim = 0, + }; + struct blk_plug plug; + + pagevec_init(&pvec); + + blk_start_plug(&plug); + + while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY))) { + int i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (prev == ULONG_MAX) + prev = page->index - 1; + if (nr_to_write != LONG_MAX && page->index != prev + 1) { + pagevec_release(&pvec); + goto stop; + } + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + f2fs_wait_on_page_writeback(page, META, true, true); + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + if (__f2fs_write_meta_page(page, &wbc, io_type)) { + unlock_page(page); + break; + } + nwritten++; + prev = page->index; + if (unlikely(nwritten >= nr_to_write)) + break; + } + pagevec_release(&pvec); + cond_resched(); + } +stop: + if (nwritten) + f2fs_submit_merged_write(sbi, type); + + blk_finish_plug(&plug); + + return nwritten; +} + +static bool f2fs_dirty_meta_folio(struct address_space *mapping, + struct folio *folio) +{ + trace_f2fs_set_page_dirty(&folio->page, META); + + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + if (filemap_dirty_folio(mapping, folio)) { + inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META); + set_page_private_reference(&folio->page); + return true; + } + return false; +} + +const struct address_space_operations f2fs_meta_aops = { + .writepage = f2fs_write_meta_page, + .writepages = f2fs_write_meta_pages, + .dirty_folio = f2fs_dirty_meta_folio, + .invalidate_folio = f2fs_invalidate_folio, + .release_folio = f2fs_release_folio, + .migrate_folio = filemap_migrate_folio, +}; + +static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + struct inode_management *im = &sbi->im[type]; + struct ino_entry *e = NULL, *new = NULL; + + if (type == FLUSH_INO) { + rcu_read_lock(); + e = radix_tree_lookup(&im->ino_root, ino); + rcu_read_unlock(); + } + +retry: + if (!e) + new = f2fs_kmem_cache_alloc(ino_entry_slab, + GFP_NOFS, true, NULL); + + radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + if (!e) { + if (!new) { + spin_unlock(&im->ino_lock); + goto retry; + } + e = new; + if (unlikely(radix_tree_insert(&im->ino_root, ino, e))) + f2fs_bug_on(sbi, 1); + + memset(e, 0, sizeof(struct ino_entry)); + e->ino = ino; + + list_add_tail(&e->list, &im->ino_list); + if (type != ORPHAN_INO) + im->ino_num++; + } + + if (type == FLUSH_INO) + f2fs_set_bit(devidx, (char *)&e->dirty_device); + + spin_unlock(&im->ino_lock); + radix_tree_preload_end(); + + if (new && e != new) + kmem_cache_free(ino_entry_slab, new); +} + +static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + struct inode_management *im = &sbi->im[type]; + struct ino_entry *e; + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + if (e) { + list_del(&e->list); + radix_tree_delete(&im->ino_root, ino); + im->ino_num--; + spin_unlock(&im->ino_lock); + kmem_cache_free(ino_entry_slab, e); + return; + } + spin_unlock(&im->ino_lock); +} + +void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + /* add new dirty ino entry into list */ + __add_ino_entry(sbi, ino, 0, type); +} + +void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + /* remove dirty ino entry from list */ + __remove_ino_entry(sbi, ino, type); +} + +/* mode should be APPEND_INO, UPDATE_INO or TRANS_DIR_INO */ +bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) +{ + struct inode_management *im = &sbi->im[mode]; + struct ino_entry *e; + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + spin_unlock(&im->ino_lock); + return e ? true : false; +} + +void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all) +{ + struct ino_entry *e, *tmp; + int i; + + for (i = all ? ORPHAN_INO : APPEND_INO; i < MAX_INO_ENTRY; i++) { + struct inode_management *im = &sbi->im[i]; + + spin_lock(&im->ino_lock); + list_for_each_entry_safe(e, tmp, &im->ino_list, list) { + list_del(&e->list); + radix_tree_delete(&im->ino_root, e->ino); + kmem_cache_free(ino_entry_slab, e); + im->ino_num--; + } + spin_unlock(&im->ino_lock); + } +} + +void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + __add_ino_entry(sbi, ino, devidx, type); +} + +bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + struct inode_management *im = &sbi->im[type]; + struct ino_entry *e; + bool is_dirty = false; + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + if (e && f2fs_test_bit(devidx, (char *)&e->dirty_device)) + is_dirty = true; + spin_unlock(&im->ino_lock); + return is_dirty; +} + +int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi) +{ + struct inode_management *im = &sbi->im[ORPHAN_INO]; + int err = 0; + + spin_lock(&im->ino_lock); + + if (time_to_inject(sbi, FAULT_ORPHAN)) { + spin_unlock(&im->ino_lock); + f2fs_show_injection_info(sbi, FAULT_ORPHAN); + return -ENOSPC; + } + + if (unlikely(im->ino_num >= sbi->max_orphans)) + err = -ENOSPC; + else + im->ino_num++; + spin_unlock(&im->ino_lock); + + return err; +} + +void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi) +{ + struct inode_management *im = &sbi->im[ORPHAN_INO]; + + spin_lock(&im->ino_lock); + f2fs_bug_on(sbi, im->ino_num == 0); + im->ino_num--; + spin_unlock(&im->ino_lock); +} + +void f2fs_add_orphan_inode(struct inode *inode) +{ + /* add new orphan ino entry into list */ + __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO); + f2fs_update_inode_page(inode); +} + +void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +{ + /* remove orphan entry from orphan list */ + __remove_ino_entry(sbi, ino, ORPHAN_INO); +} + +static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct inode *inode; + struct node_info ni; + int err; + + inode = f2fs_iget_retry(sbi->sb, ino); + if (IS_ERR(inode)) { + /* + * there should be a bug that we can't find the entry + * to orphan inode. + */ + f2fs_bug_on(sbi, PTR_ERR(inode) == -ENOENT); + return PTR_ERR(inode); + } + + err = f2fs_dquot_initialize(inode); + if (err) { + iput(inode); + goto err_out; + } + + clear_nlink(inode); + + /* truncate all the data during iput */ + iput(inode); + + err = f2fs_get_node_info(sbi, ino, &ni, false); + if (err) + goto err_out; + + /* ENOMEM was fully retried in f2fs_evict_inode. */ + if (ni.blk_addr != NULL_ADDR) { + err = -EIO; + goto err_out; + } + return 0; + +err_out: + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return err; +} + +int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) +{ + block_t start_blk, orphan_blocks, i, j; + unsigned int s_flags = sbi->sb->s_flags; + int err = 0; +#ifdef CONFIG_QUOTA + int quota_enabled; +#endif + + if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) + return 0; + + if (bdev_read_only(sbi->sb->s_bdev)) { + f2fs_info(sbi, "write access unavailable, skipping orphan cleanup"); + return 0; + } + + if (s_flags & SB_RDONLY) { + f2fs_info(sbi, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~SB_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); +#endif + + start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); + orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); + + f2fs_ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); + + for (i = 0; i < orphan_blocks; i++) { + struct page *page; + struct f2fs_orphan_block *orphan_blk; + + page = f2fs_get_meta_page(sbi, start_blk + i); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + + orphan_blk = (struct f2fs_orphan_block *)page_address(page); + for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { + nid_t ino = le32_to_cpu(orphan_blk->ino[j]); + + err = recover_orphan_inode(sbi, ino); + if (err) { + f2fs_put_page(page, 1); + goto out; + } + } + f2fs_put_page(page, 1); + } + /* clear Orphan Flag */ + clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); +out: + set_sbi_flag(sbi, SBI_IS_RECOVERED); + +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ + + return err; +} + +static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) +{ + struct list_head *head; + struct f2fs_orphan_block *orphan_blk = NULL; + unsigned int nentries = 0; + unsigned short index = 1; + unsigned short orphan_blocks; + struct page *page = NULL; + struct ino_entry *orphan = NULL; + struct inode_management *im = &sbi->im[ORPHAN_INO]; + + orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num); + + /* + * we don't need to do spin_lock(&im->ino_lock) here, since all the + * orphan inode operations are covered under f2fs_lock_op(). + * And, spin_lock should be avoided due to page operations below. + */ + head = &im->ino_list; + + /* loop for each orphan inode entry and write them in Jornal block */ + list_for_each_entry(orphan, head, list) { + if (!page) { + page = f2fs_grab_meta_page(sbi, start_blk++); + orphan_blk = + (struct f2fs_orphan_block *)page_address(page); + memset(orphan_blk, 0, sizeof(*orphan_blk)); + } + + orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); + + if (nentries == F2FS_ORPHANS_PER_BLOCK) { + /* + * an orphan block is full of 1020 entries, + * then we need to flush current orphan blocks + * and bring another one in memory + */ + orphan_blk->blk_addr = cpu_to_le16(index); + orphan_blk->blk_count = cpu_to_le16(orphan_blocks); + orphan_blk->entry_count = cpu_to_le32(nentries); + set_page_dirty(page); + f2fs_put_page(page, 1); + index++; + nentries = 0; + page = NULL; + } + } + + if (page) { + orphan_blk->blk_addr = cpu_to_le16(index); + orphan_blk->blk_count = cpu_to_le16(orphan_blocks); + orphan_blk->entry_count = cpu_to_le32(nentries); + set_page_dirty(page); + f2fs_put_page(page, 1); + } +} + +static __u32 f2fs_checkpoint_chksum(struct f2fs_sb_info *sbi, + struct f2fs_checkpoint *ckpt) +{ + unsigned int chksum_ofs = le32_to_cpu(ckpt->checksum_offset); + __u32 chksum; + + chksum = f2fs_crc32(sbi, ckpt, chksum_ofs); + if (chksum_ofs < CP_CHKSUM_OFFSET) { + chksum_ofs += sizeof(chksum); + chksum = f2fs_chksum(sbi, chksum, (__u8 *)ckpt + chksum_ofs, + F2FS_BLKSIZE - chksum_ofs); + } + return chksum; +} + +static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, + struct f2fs_checkpoint **cp_block, struct page **cp_page, + unsigned long long *version) +{ + size_t crc_offset = 0; + __u32 crc; + + *cp_page = f2fs_get_meta_page(sbi, cp_addr); + if (IS_ERR(*cp_page)) + return PTR_ERR(*cp_page); + + *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); + + crc_offset = le32_to_cpu((*cp_block)->checksum_offset); + if (crc_offset < CP_MIN_CHKSUM_OFFSET || + crc_offset > CP_CHKSUM_OFFSET) { + f2fs_put_page(*cp_page, 1); + f2fs_warn(sbi, "invalid crc_offset: %zu", crc_offset); + return -EINVAL; + } + + crc = f2fs_checkpoint_chksum(sbi, *cp_block); + if (crc != cur_cp_crc(*cp_block)) { + f2fs_put_page(*cp_page, 1); + f2fs_warn(sbi, "invalid crc value"); + return -EINVAL; + } + + *version = cur_cp_version(*cp_block); + return 0; +} + +static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, + block_t cp_addr, unsigned long long *version) +{ + struct page *cp_page_1 = NULL, *cp_page_2 = NULL; + struct f2fs_checkpoint *cp_block = NULL; + unsigned long long cur_version = 0, pre_version = 0; + unsigned int cp_blocks; + int err; + + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_1, version); + if (err) + return NULL; + + cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count); + + if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) { + f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u", + le32_to_cpu(cp_block->cp_pack_total_block_count)); + goto invalid_cp; + } + pre_version = *version; + + cp_addr += cp_blocks - 1; + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_2, version); + if (err) + goto invalid_cp; + cur_version = *version; + + if (cur_version == pre_version) { + *version = cur_version; + f2fs_put_page(cp_page_2, 1); + return cp_page_1; + } + f2fs_put_page(cp_page_2, 1); +invalid_cp: + f2fs_put_page(cp_page_1, 1); + return NULL; +} + +int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *cp_block; + struct f2fs_super_block *fsb = sbi->raw_super; + struct page *cp1, *cp2, *cur_page; + unsigned long blk_size = sbi->blocksize; + unsigned long long cp1_version = 0, cp2_version = 0; + unsigned long long cp_start_blk_no; + unsigned int cp_blks = 1 + __cp_payload(sbi); + block_t cp_blk_no; + int i; + int err; + + sbi->ckpt = f2fs_kvzalloc(sbi, array_size(blk_size, cp_blks), + GFP_KERNEL); + if (!sbi->ckpt) + return -ENOMEM; + /* + * Finding out valid cp block involves read both + * sets( cp pack 1 and cp pack 2) + */ + cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr); + cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version); + + /* The second checkpoint pack should start at the next segment */ + cp_start_blk_no += ((unsigned long long)1) << + le32_to_cpu(fsb->log_blocks_per_seg); + cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version); + + if (cp1 && cp2) { + if (ver_after(cp2_version, cp1_version)) + cur_page = cp2; + else + cur_page = cp1; + } else if (cp1) { + cur_page = cp1; + } else if (cp2) { + cur_page = cp2; + } else { + err = -EFSCORRUPTED; + goto fail_no_cp; + } + + cp_block = (struct f2fs_checkpoint *)page_address(cur_page); + memcpy(sbi->ckpt, cp_block, blk_size); + + if (cur_page == cp1) + sbi->cur_cp_pack = 1; + else + sbi->cur_cp_pack = 2; + + /* Sanity checking of checkpoint */ + if (f2fs_sanity_check_ckpt(sbi)) { + err = -EFSCORRUPTED; + goto free_fail_no_cp; + } + + if (cp_blks <= 1) + goto done; + + cp_blk_no = le32_to_cpu(fsb->cp_blkaddr); + if (cur_page == cp2) + cp_blk_no += BIT(le32_to_cpu(fsb->log_blocks_per_seg)); + + for (i = 1; i < cp_blks; i++) { + void *sit_bitmap_ptr; + unsigned char *ckpt = (unsigned char *)sbi->ckpt; + + cur_page = f2fs_get_meta_page(sbi, cp_blk_no + i); + if (IS_ERR(cur_page)) { + err = PTR_ERR(cur_page); + goto free_fail_no_cp; + } + sit_bitmap_ptr = page_address(cur_page); + memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size); + f2fs_put_page(cur_page, 1); + } +done: + f2fs_put_page(cp1, 1); + f2fs_put_page(cp2, 1); + return 0; + +free_fail_no_cp: + f2fs_put_page(cp1, 1); + f2fs_put_page(cp2, 1); +fail_no_cp: + kvfree(sbi->ckpt); + return err; +} + +static void __add_dirty_inode(struct inode *inode, enum inode_type type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; + + if (is_inode_flag_set(inode, flag)) + return; + + set_inode_flag(inode, flag); + list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); + stat_inc_dirty_inode(sbi, type); +} + +static void __remove_dirty_inode(struct inode *inode, enum inode_type type) +{ + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; + + if (get_dirty_pages(inode) || !is_inode_flag_set(inode, flag)) + return; + + list_del_init(&F2FS_I(inode)->dirty_list); + clear_inode_flag(inode, flag); + stat_dec_dirty_inode(F2FS_I_SB(inode), type); +} + +void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; + + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + return; + + spin_lock(&sbi->inode_lock[type]); + if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH)) + __add_dirty_inode(inode, type); + inode_inc_dirty_pages(inode); + spin_unlock(&sbi->inode_lock[type]); + + set_page_private_reference(&folio->page); +} + +void f2fs_remove_dirty_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; + + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + return; + + if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH)) + return; + + spin_lock(&sbi->inode_lock[type]); + __remove_dirty_inode(inode, type); + spin_unlock(&sbi->inode_lock[type]); +} + +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, + bool from_cp) +{ + struct list_head *head; + struct inode *inode; + struct f2fs_inode_info *fi; + bool is_dir = (type == DIR_INODE); + unsigned long ino = 0; + + trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); +retry: + if (unlikely(f2fs_cp_error(sbi))) { + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); + return -EIO; + } + + spin_lock(&sbi->inode_lock[type]); + + head = &sbi->inode_list[type]; + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[type]); + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); + return 0; + } + fi = list_first_entry(head, struct f2fs_inode_info, dirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[type]); + if (inode) { + unsigned long cur_ino = inode->i_ino; + + if (from_cp) + F2FS_I(inode)->cp_task = current; + F2FS_I(inode)->wb_task = current; + + filemap_fdatawrite(inode->i_mapping); + + F2FS_I(inode)->wb_task = NULL; + if (from_cp) + F2FS_I(inode)->cp_task = NULL; + + iput(inode); + /* We need to give cpu to another writers. */ + if (ino == cur_ino) + cond_resched(); + else + ino = cur_ino; + } else { + /* + * We should submit bio, since it exists several + * wribacking dentry pages in the freeing inode. + */ + f2fs_submit_merged_write(sbi, DATA); + cond_resched(); + } + goto retry; +} + +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &sbi->inode_list[DIRTY_META]; + struct inode *inode; + struct f2fs_inode_info *fi; + s64 total = get_pages(sbi, F2FS_DIRTY_IMETA); + + while (total--) { + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return 0; + } + fi = list_first_entry(head, struct f2fs_inode_info, + gdirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + if (inode) { + sync_inode_metadata(inode, 0); + + /* it's on eviction */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) + f2fs_update_inode_page(inode); + iput(inode); + } + } + return 0; +} + +static void __prepare_cp_block(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + nid_t last_nid = nm_i->next_scan_nid; + + next_free_nid(sbi, &last_nid); + ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); + ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); + ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); + ckpt->next_free_nid = cpu_to_le32(last_nid); +} + +static bool __need_flush_quota(struct f2fs_sb_info *sbi) +{ + bool ret = false; + + if (!is_journalled_quota(sbi)) + return false; + + if (!f2fs_down_write_trylock(&sbi->quota_sem)) + return true; + if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) { + ret = false; + } else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) { + ret = false; + } else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_FLUSH)) { + clear_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH); + ret = true; + } else if (get_pages(sbi, F2FS_DIRTY_QDATA)) { + ret = true; + } + f2fs_up_write(&sbi->quota_sem); + return ret; +} + +/* + * Freeze all the FS-operations for checkpoint. + */ +static int block_operations(struct f2fs_sb_info *sbi) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .for_reclaim = 0, + }; + int err = 0, cnt = 0; + + /* + * Let's flush inline_data in dirty node pages. + */ + f2fs_flush_inline_data(sbi); + +retry_flush_quotas: + f2fs_lock_all(sbi); + if (__need_flush_quota(sbi)) { + int locked; + + if (++cnt > DEFAULT_RETRY_QUOTA_FLUSH_COUNT) { + set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH); + set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH); + goto retry_flush_dents; + } + f2fs_unlock_all(sbi); + + /* only failed during mount/umount/freeze/quotactl */ + locked = down_read_trylock(&sbi->sb->s_umount); + f2fs_quota_sync(sbi->sb, -1); + if (locked) + up_read(&sbi->sb->s_umount); + cond_resched(); + goto retry_flush_quotas; + } + +retry_flush_dents: + /* write all the dirty dentry pages */ + if (get_pages(sbi, F2FS_DIRTY_DENTS)) { + f2fs_unlock_all(sbi); + err = f2fs_sync_dirty_inodes(sbi, DIR_INODE, true); + if (err) + return err; + cond_resched(); + goto retry_flush_quotas; + } + + /* + * POR: we should ensure that there are no dirty node pages + * until finishing nat/sit flush. inode->i_blocks can be updated. + */ + f2fs_down_write(&sbi->node_change); + + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + f2fs_up_write(&sbi->node_change); + f2fs_unlock_all(sbi); + err = f2fs_sync_inode_meta(sbi); + if (err) + return err; + cond_resched(); + goto retry_flush_quotas; + } + +retry_flush_nodes: + f2fs_down_write(&sbi->node_write); + + if (get_pages(sbi, F2FS_DIRTY_NODES)) { + f2fs_up_write(&sbi->node_write); + atomic_inc(&sbi->wb_sync_req[NODE]); + err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); + atomic_dec(&sbi->wb_sync_req[NODE]); + if (err) { + f2fs_up_write(&sbi->node_change); + f2fs_unlock_all(sbi); + return err; + } + cond_resched(); + goto retry_flush_nodes; + } + + /* + * sbi->node_change is used only for AIO write_begin path which produces + * dirty node blocks and some checkpoint values by block allocation. + */ + __prepare_cp_block(sbi); + f2fs_up_write(&sbi->node_change); + return err; +} + +static void unblock_operations(struct f2fs_sb_info *sbi) +{ + f2fs_up_write(&sbi->node_write); + f2fs_unlock_all(sbi); +} + +void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type) +{ + DEFINE_WAIT(wait); + + for (;;) { + if (!get_pages(sbi, type)) + break; + + if (unlikely(f2fs_cp_error(sbi) && + !is_sbi_flag_set(sbi, SBI_IS_CLOSE))) + break; + + if (type == F2FS_DIRTY_META) + f2fs_sync_meta_pages(sbi, META, LONG_MAX, + FS_CP_META_IO); + else if (type == F2FS_WB_CP_DATA) + f2fs_submit_merged_write(sbi, DATA); + + prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); + io_schedule_timeout(DEFAULT_IO_TIMEOUT); + } + finish_wait(&sbi->cp_wait, &wait); +} + +static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned long flags; + + if (cpc->reason & CP_UMOUNT) { + if (le32_to_cpu(ckpt->cp_pack_total_block_count) + + NM_I(sbi)->nat_bits_blocks > sbi->blocks_per_seg) { + clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + f2fs_notice(sbi, "Disable nat_bits due to no space"); + } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) && + f2fs_nat_bitmap_enabled(sbi)) { + f2fs_enable_nat_bits(sbi); + set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + f2fs_notice(sbi, "Rebuild and enable nat_bits"); + } + } + + spin_lock_irqsave(&sbi->cp_lock, flags); + + if (cpc->reason & CP_TRIMMED) + __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + else + __clear_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + + if (cpc->reason & CP_UMOUNT) + __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + + if (cpc->reason & CP_FASTBOOT) + __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + + if (orphan_num) + __set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + __set_ckpt_flags(ckpt, CP_FSCK_FLAG); + + if (is_sbi_flag_set(sbi, SBI_IS_RESIZEFS)) + __set_ckpt_flags(ckpt, CP_RESIZEFS_FLAG); + else + __clear_ckpt_flags(ckpt, CP_RESIZEFS_FLAG); + + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + __set_ckpt_flags(ckpt, CP_DISABLED_FLAG); + else + __clear_ckpt_flags(ckpt, CP_DISABLED_FLAG); + + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK)) + __set_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); + else + __clear_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); + + if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) + __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + else + __clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + + if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) + __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + + /* set this flag to activate crc|cp_ver for recovery */ + __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); + __clear_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG); + + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +static void commit_checkpoint(struct f2fs_sb_info *sbi, + void *src, block_t blk_addr) +{ + struct writeback_control wbc = { + .for_reclaim = 0, + }; + + /* + * pagevec_lookup_tag and lock_page again will take + * some extra time. Therefore, f2fs_update_meta_pages and + * f2fs_sync_meta_pages are combined in this function. + */ + struct page *page = f2fs_grab_meta_page(sbi, blk_addr); + int err; + + f2fs_wait_on_page_writeback(page, META, true, true); + + memcpy(page_address(page), src, PAGE_SIZE); + + set_page_dirty(page); + if (unlikely(!clear_page_dirty_for_io(page))) + f2fs_bug_on(sbi, 1); + + /* writeout cp pack 2 page */ + err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO); + if (unlikely(err && f2fs_cp_error(sbi))) { + f2fs_put_page(page, 1); + return; + } + + f2fs_bug_on(sbi, err); + f2fs_put_page(page, 0); + + /* submit checkpoint (with barrier if NOBARRIER is not set) */ + f2fs_submit_merged_write(sbi, META_FLUSH); +} + +static inline u64 get_sectors_written(struct block_device *bdev) +{ + return (u64)part_stat_read(bdev, sectors[STAT_WRITE]); +} + +u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi) +{ + if (f2fs_is_multi_device(sbi)) { + u64 sectors = 0; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + sectors += get_sectors_written(FDEV(i).bdev); + + return sectors; + } + + return get_sectors_written(sbi->sb->s_bdev); +} + +static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num, flags; + block_t start_blk; + unsigned int data_sum_blocks, orphan_blocks; + __u32 crc32 = 0; + int i; + int cp_payload_blks = __cp_payload(sbi); + struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); + u64 kbytes_written; + int err; + + /* Flush all the NAT/SIT pages */ + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + + /* start to update checkpoint, cp ver is already updated previously */ + ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true)); + ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + ckpt->cur_node_segno[i] = + cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE)); + ckpt->cur_node_blkoff[i] = + cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE)); + ckpt->alloc_type[i + CURSEG_HOT_NODE] = + curseg_alloc_type(sbi, i + CURSEG_HOT_NODE); + } + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + ckpt->cur_data_segno[i] = + cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA)); + ckpt->cur_data_blkoff[i] = + cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA)); + ckpt->alloc_type[i + CURSEG_HOT_DATA] = + curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); + } + + /* 2 cp + n data seg summary + orphan inode blocks */ + data_sum_blocks = f2fs_npages_for_summary_flush(sbi, false); + spin_lock_irqsave(&sbi->cp_lock, flags); + if (data_sum_blocks < NR_CURSEG_DATA_TYPE) + __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + else + __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + spin_unlock_irqrestore(&sbi->cp_lock, flags); + + orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); + ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + + orphan_blocks); + + if (__remain_node_summaries(cpc->reason)) + ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + + cp_payload_blks + data_sum_blocks + + orphan_blocks + NR_CURSEG_NODE_TYPE); + else + ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + + cp_payload_blks + data_sum_blocks + + orphan_blocks); + + /* update ckpt flag for checkpoint */ + update_ckpt_flags(sbi, cpc); + + /* update SIT/NAT bitmap */ + get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); + get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); + + crc32 = f2fs_checkpoint_chksum(sbi, ckpt); + *((__le32 *)((unsigned char *)ckpt + + le32_to_cpu(ckpt->checksum_offset))) + = cpu_to_le32(crc32); + + start_blk = __start_cp_next_addr(sbi); + + /* write nat bits */ + if ((cpc->reason & CP_UMOUNT) && + is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) { + __u64 cp_ver = cur_cp_version(ckpt); + block_t blk; + + cp_ver |= ((__u64)crc32 << 32); + *(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver); + + blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) + f2fs_update_meta_page(sbi, nm_i->nat_bits + + (i << F2FS_BLKSIZE_BITS), blk + i); + } + + /* write out checkpoint buffer at block 0 */ + f2fs_update_meta_page(sbi, ckpt, start_blk++); + + for (i = 1; i < 1 + cp_payload_blks; i++) + f2fs_update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE, + start_blk++); + + if (orphan_num) { + write_orphan_inodes(sbi, start_blk); + start_blk += orphan_blocks; + } + + f2fs_write_data_summaries(sbi, start_blk); + start_blk += data_sum_blocks; + + /* Record write statistics in the hot node summary */ + kbytes_written = sbi->kbytes_written; + kbytes_written += (f2fs_get_sectors_written(sbi) - + sbi->sectors_written_start) >> 1; + seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written); + + if (__remain_node_summaries(cpc->reason)) { + f2fs_write_node_summaries(sbi, start_blk); + start_blk += NR_CURSEG_NODE_TYPE; + } + + /* update user_block_counts */ + sbi->last_valid_block_count = sbi->total_valid_block_count; + percpu_counter_set(&sbi->alloc_valid_block_count, 0); + percpu_counter_set(&sbi->rf_node_block_count, 0); + + /* Here, we have one bio having CP pack except cp pack 2 page */ + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + /* Wait for all dirty meta pages to be submitted for IO */ + f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META); + + /* wait for previous submitted meta pages writeback */ + f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); + + /* flush all device cache */ + err = f2fs_flush_device_cache(sbi); + if (err) + return err; + + /* barrier and flush checkpoint cp pack 2 page if it can */ + commit_checkpoint(sbi, ckpt, start_blk); + f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); + + /* + * invalidate intermediate page cache borrowed from meta inode which are + * used for migration of encrypted, verity or compressed inode's blocks. + */ + if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi) || + f2fs_sb_has_compression(sbi)) + invalidate_mapping_pages(META_MAPPING(sbi), + MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1); + + f2fs_release_ino_entry(sbi, false); + + f2fs_reset_fsync_node_info(sbi); + + clear_sbi_flag(sbi, SBI_IS_DIRTY); + clear_sbi_flag(sbi, SBI_NEED_CP); + clear_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH); + + spin_lock(&sbi->stat_lock); + sbi->unusable_block_count = 0; + spin_unlock(&sbi->stat_lock); + + __set_cp_next_pack(sbi); + + /* + * redirty superblock if metadata like node page or inode cache is + * updated during writing checkpoint. + */ + if (get_pages(sbi, F2FS_DIRTY_NODES) || + get_pages(sbi, F2FS_DIRTY_IMETA)) + set_sbi_flag(sbi, SBI_IS_DIRTY); + + f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS)); + + return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0; +} + +int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned long long ckpt_ver; + int err = 0; + + if (f2fs_readonly(sbi->sb) || f2fs_hw_is_readonly(sbi)) + return -EROFS; + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (cpc->reason != CP_PAUSE) + return 0; + f2fs_warn(sbi, "Start checkpoint disabled!"); + } + if (cpc->reason != CP_RESIZE) + f2fs_down_write(&sbi->cp_global_sem); + + if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && + ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || + ((cpc->reason & CP_DISCARD) && !sbi->discard_blks))) + goto out; + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto out; + } + + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); + + err = block_operations(sbi); + if (err) + goto out; + + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); + + f2fs_flush_merged_writes(sbi); + + /* this is the case of multiple fstrims without any changes */ + if (cpc->reason & CP_DISCARD) { + if (!f2fs_exist_trim_candidates(sbi, cpc)) { + unblock_operations(sbi); + goto out; + } + + if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 && + SIT_I(sbi)->dirty_sentries == 0 && + prefree_segments(sbi) == 0) { + f2fs_flush_sit_entries(sbi, cpc); + f2fs_clear_prefree_segments(sbi, cpc); + unblock_operations(sbi); + goto out; + } + } + + /* + * update checkpoint pack index + * Increase the version number so that + * SIT entries and seg summaries are written at correct place + */ + ckpt_ver = cur_cp_version(ckpt); + ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); + + /* write cached NAT/SIT entries to NAT/SIT area */ + err = f2fs_flush_nat_entries(sbi, cpc); + if (err) { + f2fs_err(sbi, "f2fs_flush_nat_entries failed err:%d, stop checkpoint", err); + f2fs_bug_on(sbi, !f2fs_cp_error(sbi)); + goto stop; + } + + f2fs_flush_sit_entries(sbi, cpc); + + /* save inmem log status */ + f2fs_save_inmem_curseg(sbi); + + err = do_checkpoint(sbi, cpc); + if (err) { + f2fs_err(sbi, "do_checkpoint failed err:%d, stop checkpoint", err); + f2fs_bug_on(sbi, !f2fs_cp_error(sbi)); + f2fs_release_discard_addrs(sbi); + } else { + f2fs_clear_prefree_segments(sbi, cpc); + } + + f2fs_restore_inmem_curseg(sbi); +stop: + unblock_operations(sbi); + stat_inc_cp_count(sbi->stat_info); + + if (cpc->reason & CP_RECOVERY) + f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver); + + /* update CP_TIME to trigger checkpoint periodically */ + f2fs_update_time(sbi, CP_TIME); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); +out: + if (cpc->reason != CP_RESIZE) + f2fs_up_write(&sbi->cp_global_sem); + return err; +} + +void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < MAX_INO_ENTRY; i++) { + struct inode_management *im = &sbi->im[i]; + + INIT_RADIX_TREE(&im->ino_root, GFP_ATOMIC); + spin_lock_init(&im->ino_lock); + INIT_LIST_HEAD(&im->ino_list); + im->ino_num = 0; + } + + sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS - + NR_CURSEG_PERSIST_TYPE - __cp_payload(sbi)) * + F2FS_ORPHANS_PER_BLOCK; +} + +int __init f2fs_create_checkpoint_caches(void) +{ + ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry", + sizeof(struct ino_entry)); + if (!ino_entry_slab) + return -ENOMEM; + f2fs_inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry", + sizeof(struct inode_entry)); + if (!f2fs_inode_entry_slab) { + kmem_cache_destroy(ino_entry_slab); + return -ENOMEM; + } + return 0; +} + +void f2fs_destroy_checkpoint_caches(void) +{ + kmem_cache_destroy(ino_entry_slab); + kmem_cache_destroy(f2fs_inode_entry_slab); +} + +static int __write_checkpoint_sync(struct f2fs_sb_info *sbi) +{ + struct cp_control cpc = { .reason = CP_SYNC, }; + int err; + + f2fs_down_write(&sbi->gc_lock); + err = f2fs_write_checkpoint(sbi, &cpc); + f2fs_up_write(&sbi->gc_lock); + + return err; +} + +static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req *req, *next; + struct llist_node *dispatch_list; + u64 sum_diff = 0, diff, count = 0; + int ret; + + dispatch_list = llist_del_all(&cprc->issue_list); + if (!dispatch_list) + return; + dispatch_list = llist_reverse_order(dispatch_list); + + ret = __write_checkpoint_sync(sbi); + atomic_inc(&cprc->issued_ckpt); + + llist_for_each_entry_safe(req, next, dispatch_list, llnode) { + diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time); + req->ret = ret; + complete(&req->wait); + + sum_diff += diff; + count++; + } + atomic_sub(count, &cprc->queued_ckpt); + atomic_add(count, &cprc->total_ckpt); + + spin_lock(&cprc->stat_lock); + cprc->cur_time = (unsigned int)div64_u64(sum_diff, count); + if (cprc->peak_time < cprc->cur_time) + cprc->peak_time = cprc->cur_time; + spin_unlock(&cprc->stat_lock); +} + +static int issue_checkpoint_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct ckpt_req_control *cprc = &sbi->cprc_info; + wait_queue_head_t *q = &cprc->ckpt_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + if (!llist_empty(&cprc->issue_list)) + __checkpoint_and_complete_reqs(sbi); + + wait_event_interruptible(*q, + kthread_should_stop() || !llist_empty(&cprc->issue_list)); + goto repeat; +} + +static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi, + struct ckpt_req *wait_req) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (!llist_empty(&cprc->issue_list)) { + __checkpoint_and_complete_reqs(sbi); + } else { + /* already dispatched by issue_checkpoint_thread */ + if (wait_req) + wait_for_completion(&wait_req->wait); + } +} + +static void init_ckpt_req(struct ckpt_req *req) +{ + memset(req, 0, sizeof(struct ckpt_req)); + + init_completion(&req->wait); + req->queue_time = ktime_get(); +} + +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req req; + struct cp_control cpc; + + cpc.reason = __get_cp_reason(sbi); + if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) { + int ret; + + f2fs_down_write(&sbi->gc_lock); + ret = f2fs_write_checkpoint(sbi, &cpc); + f2fs_up_write(&sbi->gc_lock); + + return ret; + } + + if (!cprc->f2fs_issue_ckpt) + return __write_checkpoint_sync(sbi); + + init_ckpt_req(&req); + + llist_add(&req.llnode, &cprc->issue_list); + atomic_inc(&cprc->queued_ckpt); + + /* + * update issue_list before we wake up issue_checkpoint thread, + * this smp_mb() pairs with another barrier in ___wait_event(), + * see more details in comments of waitqueue_active(). + */ + smp_mb(); + + if (waitqueue_active(&cprc->ckpt_wait_queue)) + wake_up(&cprc->ckpt_wait_queue); + + if (cprc->f2fs_issue_ckpt) + wait_for_completion(&req.wait); + else + flush_remained_ckpt_reqs(sbi, &req); + + return req.ret; +} + +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (cprc->f2fs_issue_ckpt) + return 0; + + cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi, + "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(cprc->f2fs_issue_ckpt)) { + cprc->f2fs_issue_ckpt = NULL; + return -ENOMEM; + } + + set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); + + return 0; +} + +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct task_struct *ckpt_task; + + if (!cprc->f2fs_issue_ckpt) + return; + + ckpt_task = cprc->f2fs_issue_ckpt; + cprc->f2fs_issue_ckpt = NULL; + kthread_stop(ckpt_task); + + f2fs_flush_ckpt_thread(sbi); +} + +void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + flush_remained_ckpt_reqs(sbi, NULL); + + /* Let's wait for the previous dispatched checkpoint. */ + while (atomic_read(&cprc->queued_ckpt)) + io_schedule_timeout(DEFAULT_IO_TIMEOUT); +} + +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + atomic_set(&cprc->issued_ckpt, 0); + atomic_set(&cprc->total_ckpt, 0); + atomic_set(&cprc->queued_ckpt, 0); + cprc->ckpt_thread_ioprio = DEFAULT_CHECKPOINT_IOPRIO; + init_waitqueue_head(&cprc->ckpt_wait_queue); + init_llist_head(&cprc->issue_list); + spin_lock_init(&cprc->stat_lock); +} diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c new file mode 100644 index 000000000..4cb58e8d6 --- /dev/null +++ b/fs/f2fs/compress.c @@ -0,0 +1,2054 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs compress support + * + * Copyright (c) 2019 Chao Yu <chao@kernel.org> + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/moduleparam.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> +#include <linux/lzo.h> +#include <linux/lz4.h> +#include <linux/zstd.h> +#include <linux/pagevec.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include <trace/events/f2fs.h> + +static struct kmem_cache *cic_entry_slab; +static struct kmem_cache *dic_entry_slab; + +static void *page_array_alloc(struct inode *inode, int nr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int size = sizeof(struct page *) * nr; + + if (likely(size <= sbi->page_array_slab_size)) + return f2fs_kmem_cache_alloc(sbi->page_array_slab, + GFP_F2FS_ZERO, false, F2FS_I_SB(inode)); + return f2fs_kzalloc(sbi, size, GFP_NOFS); +} + +static void page_array_free(struct inode *inode, void *pages, int nr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int size = sizeof(struct page *) * nr; + + if (!pages) + return; + + if (likely(size <= sbi->page_array_slab_size)) + kmem_cache_free(sbi->page_array_slab, pages); + else + kfree(pages); +} + +struct f2fs_compress_ops { + int (*init_compress_ctx)(struct compress_ctx *cc); + void (*destroy_compress_ctx)(struct compress_ctx *cc); + int (*compress_pages)(struct compress_ctx *cc); + int (*init_decompress_ctx)(struct decompress_io_ctx *dic); + void (*destroy_decompress_ctx)(struct decompress_io_ctx *dic); + int (*decompress_pages)(struct decompress_io_ctx *dic); +}; + +static unsigned int offset_in_cluster(struct compress_ctx *cc, pgoff_t index) +{ + return index & (cc->cluster_size - 1); +} + +static pgoff_t cluster_idx(struct compress_ctx *cc, pgoff_t index) +{ + return index >> cc->log_cluster_size; +} + +static pgoff_t start_idx_of_cluster(struct compress_ctx *cc) +{ + return cc->cluster_idx << cc->log_cluster_size; +} + +bool f2fs_is_compressed_page(struct page *page) +{ + if (!PagePrivate(page)) + return false; + if (!page_private(page)) + return false; + if (page_private_nonpointer(page)) + return false; + + f2fs_bug_on(F2FS_M_SB(page->mapping), + *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC); + return true; +} + +static void f2fs_set_compressed_page(struct page *page, + struct inode *inode, pgoff_t index, void *data) +{ + attach_page_private(page, (void *)data); + + /* i_crypto_info and iv index */ + page->index = index; + page->mapping = inode->i_mapping; +} + +static void f2fs_drop_rpages(struct compress_ctx *cc, int len, bool unlock) +{ + int i; + + for (i = 0; i < len; i++) { + if (!cc->rpages[i]) + continue; + if (unlock) + unlock_page(cc->rpages[i]); + else + put_page(cc->rpages[i]); + } +} + +static void f2fs_put_rpages(struct compress_ctx *cc) +{ + f2fs_drop_rpages(cc, cc->cluster_size, false); +} + +static void f2fs_unlock_rpages(struct compress_ctx *cc, int len) +{ + f2fs_drop_rpages(cc, len, true); +} + +static void f2fs_put_rpages_wbc(struct compress_ctx *cc, + struct writeback_control *wbc, bool redirty, int unlock) +{ + unsigned int i; + + for (i = 0; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; + if (redirty) + redirty_page_for_writepage(wbc, cc->rpages[i]); + f2fs_put_page(cc->rpages[i], unlock); + } +} + +struct page *f2fs_compress_control_page(struct page *page) +{ + return ((struct compress_io_ctx *)page_private(page))->rpages[0]; +} + +int f2fs_init_compress_ctx(struct compress_ctx *cc) +{ + if (cc->rpages) + return 0; + + cc->rpages = page_array_alloc(cc->inode, cc->cluster_size); + return cc->rpages ? 0 : -ENOMEM; +} + +void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) +{ + page_array_free(cc->inode, cc->rpages, cc->cluster_size); + cc->rpages = NULL; + cc->nr_rpages = 0; + cc->nr_cpages = 0; + cc->valid_nr_cpages = 0; + if (!reuse) + cc->cluster_idx = NULL_CLUSTER; +} + +void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page) +{ + unsigned int cluster_ofs; + + if (!f2fs_cluster_can_merge_page(cc, page->index)) + f2fs_bug_on(F2FS_I_SB(cc->inode), 1); + + cluster_ofs = offset_in_cluster(cc, page->index); + cc->rpages[cluster_ofs] = page; + cc->nr_rpages++; + cc->cluster_idx = cluster_idx(cc, page->index); +} + +#ifdef CONFIG_F2FS_FS_LZO +static int lzo_init_compress_ctx(struct compress_ctx *cc) +{ + cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), + LZO1X_MEM_COMPRESS, GFP_NOFS); + if (!cc->private) + return -ENOMEM; + + cc->clen = lzo1x_worst_compress(PAGE_SIZE << cc->log_cluster_size); + return 0; +} + +static void lzo_destroy_compress_ctx(struct compress_ctx *cc) +{ + kvfree(cc->private); + cc->private = NULL; +} + +static int lzo_compress_pages(struct compress_ctx *cc) +{ + int ret; + + ret = lzo1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata, + &cc->clen, cc->private); + if (ret != LZO_E_OK) { + printk_ratelimited("%sF2FS-fs (%s): lzo compress failed, ret:%d\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret); + return -EIO; + } + return 0; +} + +static int lzo_decompress_pages(struct decompress_io_ctx *dic) +{ + int ret; + + ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen, + dic->rbuf, &dic->rlen); + if (ret != LZO_E_OK) { + printk_ratelimited("%sF2FS-fs (%s): lzo decompress failed, ret:%d\n", + KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret); + return -EIO; + } + + if (dic->rlen != PAGE_SIZE << dic->log_cluster_size) { + printk_ratelimited("%sF2FS-fs (%s): lzo invalid rlen:%zu, " + "expected:%lu\n", KERN_ERR, + F2FS_I_SB(dic->inode)->sb->s_id, + dic->rlen, + PAGE_SIZE << dic->log_cluster_size); + return -EIO; + } + return 0; +} + +static const struct f2fs_compress_ops f2fs_lzo_ops = { + .init_compress_ctx = lzo_init_compress_ctx, + .destroy_compress_ctx = lzo_destroy_compress_ctx, + .compress_pages = lzo_compress_pages, + .decompress_pages = lzo_decompress_pages, +}; +#endif + +#ifdef CONFIG_F2FS_FS_LZ4 +static int lz4_init_compress_ctx(struct compress_ctx *cc) +{ + unsigned int size = LZ4_MEM_COMPRESS; + +#ifdef CONFIG_F2FS_FS_LZ4HC + if (F2FS_I(cc->inode)->i_compress_level) + size = LZ4HC_MEM_COMPRESS; +#endif + + cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), size, GFP_NOFS); + if (!cc->private) + return -ENOMEM; + + /* + * we do not change cc->clen to LZ4_compressBound(inputsize) to + * adapt worst compress case, because lz4 compressor can handle + * output budget properly. + */ + cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE; + return 0; +} + +static void lz4_destroy_compress_ctx(struct compress_ctx *cc) +{ + kvfree(cc->private); + cc->private = NULL; +} + +#ifdef CONFIG_F2FS_FS_LZ4HC +static int lz4hc_compress_pages(struct compress_ctx *cc) +{ + unsigned char level = F2FS_I(cc->inode)->i_compress_level; + int len; + + if (level) + len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, level, cc->private); + else + len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, cc->private); + if (!len) + return -EAGAIN; + + cc->clen = len; + return 0; +} +#endif + +static int lz4_compress_pages(struct compress_ctx *cc) +{ + int len; + +#ifdef CONFIG_F2FS_FS_LZ4HC + return lz4hc_compress_pages(cc); +#endif + len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, cc->private); + if (!len) + return -EAGAIN; + + cc->clen = len; + return 0; +} + +static int lz4_decompress_pages(struct decompress_io_ctx *dic) +{ + int ret; + + ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf, + dic->clen, dic->rlen); + if (ret < 0) { + printk_ratelimited("%sF2FS-fs (%s): lz4 decompress failed, ret:%d\n", + KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret); + return -EIO; + } + + if (ret != PAGE_SIZE << dic->log_cluster_size) { + printk_ratelimited("%sF2FS-fs (%s): lz4 invalid ret:%d, " + "expected:%lu\n", KERN_ERR, + F2FS_I_SB(dic->inode)->sb->s_id, ret, + PAGE_SIZE << dic->log_cluster_size); + return -EIO; + } + return 0; +} + +static const struct f2fs_compress_ops f2fs_lz4_ops = { + .init_compress_ctx = lz4_init_compress_ctx, + .destroy_compress_ctx = lz4_destroy_compress_ctx, + .compress_pages = lz4_compress_pages, + .decompress_pages = lz4_decompress_pages, +}; +#endif + +#ifdef CONFIG_F2FS_FS_ZSTD +static int zstd_init_compress_ctx(struct compress_ctx *cc) +{ + zstd_parameters params; + zstd_cstream *stream; + void *workspace; + unsigned int workspace_size; + unsigned char level = F2FS_I(cc->inode)->i_compress_level; + + /* Need to remain this for backward compatibility */ + if (!level) + level = F2FS_ZSTD_DEFAULT_CLEVEL; + + params = zstd_get_params(level, cc->rlen); + workspace_size = zstd_cstream_workspace_bound(¶ms.cParams); + + workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), + workspace_size, GFP_NOFS); + if (!workspace) + return -ENOMEM; + + stream = zstd_init_cstream(¶ms, 0, workspace, workspace_size); + if (!stream) { + printk_ratelimited("%sF2FS-fs (%s): %s zstd_init_cstream failed\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, + __func__); + kvfree(workspace); + return -EIO; + } + + cc->private = workspace; + cc->private2 = stream; + + cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE; + return 0; +} + +static void zstd_destroy_compress_ctx(struct compress_ctx *cc) +{ + kvfree(cc->private); + cc->private = NULL; + cc->private2 = NULL; +} + +static int zstd_compress_pages(struct compress_ctx *cc) +{ + zstd_cstream *stream = cc->private2; + zstd_in_buffer inbuf; + zstd_out_buffer outbuf; + int src_size = cc->rlen; + int dst_size = src_size - PAGE_SIZE - COMPRESS_HEADER_SIZE; + int ret; + + inbuf.pos = 0; + inbuf.src = cc->rbuf; + inbuf.size = src_size; + + outbuf.pos = 0; + outbuf.dst = cc->cbuf->cdata; + outbuf.size = dst_size; + + ret = zstd_compress_stream(stream, &outbuf, &inbuf); + if (zstd_is_error(ret)) { + printk_ratelimited("%sF2FS-fs (%s): %s zstd_compress_stream failed, ret: %d\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, + __func__, zstd_get_error_code(ret)); + return -EIO; + } + + ret = zstd_end_stream(stream, &outbuf); + if (zstd_is_error(ret)) { + printk_ratelimited("%sF2FS-fs (%s): %s zstd_end_stream returned %d\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, + __func__, zstd_get_error_code(ret)); + return -EIO; + } + + /* + * there is compressed data remained in intermediate buffer due to + * no more space in cbuf.cdata + */ + if (ret) + return -EAGAIN; + + cc->clen = outbuf.pos; + return 0; +} + +static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic) +{ + zstd_dstream *stream; + void *workspace; + unsigned int workspace_size; + unsigned int max_window_size = + MAX_COMPRESS_WINDOW_SIZE(dic->log_cluster_size); + + workspace_size = zstd_dstream_workspace_bound(max_window_size); + + workspace = f2fs_kvmalloc(F2FS_I_SB(dic->inode), + workspace_size, GFP_NOFS); + if (!workspace) + return -ENOMEM; + + stream = zstd_init_dstream(max_window_size, workspace, workspace_size); + if (!stream) { + printk_ratelimited("%sF2FS-fs (%s): %s zstd_init_dstream failed\n", + KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, + __func__); + kvfree(workspace); + return -EIO; + } + + dic->private = workspace; + dic->private2 = stream; + + return 0; +} + +static void zstd_destroy_decompress_ctx(struct decompress_io_ctx *dic) +{ + kvfree(dic->private); + dic->private = NULL; + dic->private2 = NULL; +} + +static int zstd_decompress_pages(struct decompress_io_ctx *dic) +{ + zstd_dstream *stream = dic->private2; + zstd_in_buffer inbuf; + zstd_out_buffer outbuf; + int ret; + + inbuf.pos = 0; + inbuf.src = dic->cbuf->cdata; + inbuf.size = dic->clen; + + outbuf.pos = 0; + outbuf.dst = dic->rbuf; + outbuf.size = dic->rlen; + + ret = zstd_decompress_stream(stream, &outbuf, &inbuf); + if (zstd_is_error(ret)) { + printk_ratelimited("%sF2FS-fs (%s): %s zstd_decompress_stream failed, ret: %d\n", + KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, + __func__, zstd_get_error_code(ret)); + return -EIO; + } + + if (dic->rlen != outbuf.pos) { + printk_ratelimited("%sF2FS-fs (%s): %s ZSTD invalid rlen:%zu, " + "expected:%lu\n", KERN_ERR, + F2FS_I_SB(dic->inode)->sb->s_id, + __func__, dic->rlen, + PAGE_SIZE << dic->log_cluster_size); + return -EIO; + } + + return 0; +} + +static const struct f2fs_compress_ops f2fs_zstd_ops = { + .init_compress_ctx = zstd_init_compress_ctx, + .destroy_compress_ctx = zstd_destroy_compress_ctx, + .compress_pages = zstd_compress_pages, + .init_decompress_ctx = zstd_init_decompress_ctx, + .destroy_decompress_ctx = zstd_destroy_decompress_ctx, + .decompress_pages = zstd_decompress_pages, +}; +#endif + +#ifdef CONFIG_F2FS_FS_LZO +#ifdef CONFIG_F2FS_FS_LZORLE +static int lzorle_compress_pages(struct compress_ctx *cc) +{ + int ret; + + ret = lzorle1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata, + &cc->clen, cc->private); + if (ret != LZO_E_OK) { + printk_ratelimited("%sF2FS-fs (%s): lzo-rle compress failed, ret:%d\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret); + return -EIO; + } + return 0; +} + +static const struct f2fs_compress_ops f2fs_lzorle_ops = { + .init_compress_ctx = lzo_init_compress_ctx, + .destroy_compress_ctx = lzo_destroy_compress_ctx, + .compress_pages = lzorle_compress_pages, + .decompress_pages = lzo_decompress_pages, +}; +#endif +#endif + +static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = { +#ifdef CONFIG_F2FS_FS_LZO + &f2fs_lzo_ops, +#else + NULL, +#endif +#ifdef CONFIG_F2FS_FS_LZ4 + &f2fs_lz4_ops, +#else + NULL, +#endif +#ifdef CONFIG_F2FS_FS_ZSTD + &f2fs_zstd_ops, +#else + NULL, +#endif +#if defined(CONFIG_F2FS_FS_LZO) && defined(CONFIG_F2FS_FS_LZORLE) + &f2fs_lzorle_ops, +#else + NULL, +#endif +}; + +bool f2fs_is_compress_backend_ready(struct inode *inode) +{ + if (!f2fs_compressed_file(inode)) + return true; + return f2fs_cops[F2FS_I(inode)->i_compress_algorithm]; +} + +static mempool_t *compress_page_pool; +static int num_compress_pages = 512; +module_param(num_compress_pages, uint, 0444); +MODULE_PARM_DESC(num_compress_pages, + "Number of intermediate compress pages to preallocate"); + +int f2fs_init_compress_mempool(void) +{ + compress_page_pool = mempool_create_page_pool(num_compress_pages, 0); + if (!compress_page_pool) + return -ENOMEM; + + return 0; +} + +void f2fs_destroy_compress_mempool(void) +{ + mempool_destroy(compress_page_pool); +} + +static struct page *f2fs_compress_alloc_page(void) +{ + struct page *page; + + page = mempool_alloc(compress_page_pool, GFP_NOFS); + lock_page(page); + + return page; +} + +static void f2fs_compress_free_page(struct page *page) +{ + if (!page) + return; + detach_page_private(page); + page->mapping = NULL; + unlock_page(page); + mempool_free(page, compress_page_pool); +} + +#define MAX_VMAP_RETRIES 3 + +static void *f2fs_vmap(struct page **pages, unsigned int count) +{ + int i; + void *buf = NULL; + + for (i = 0; i < MAX_VMAP_RETRIES; i++) { + buf = vm_map_ram(pages, count, -1); + if (buf) + break; + vm_unmap_aliases(); + } + return buf; +} + +static int f2fs_compress_pages(struct compress_ctx *cc) +{ + struct f2fs_inode_info *fi = F2FS_I(cc->inode); + const struct f2fs_compress_ops *cops = + f2fs_cops[fi->i_compress_algorithm]; + unsigned int max_len, new_nr_cpages; + u32 chksum = 0; + int i, ret; + + trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx, + cc->cluster_size, fi->i_compress_algorithm); + + if (cops->init_compress_ctx) { + ret = cops->init_compress_ctx(cc); + if (ret) + goto out; + } + + max_len = COMPRESS_HEADER_SIZE + cc->clen; + cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); + cc->valid_nr_cpages = cc->nr_cpages; + + cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages); + if (!cc->cpages) { + ret = -ENOMEM; + goto destroy_compress_ctx; + } + + for (i = 0; i < cc->nr_cpages; i++) { + cc->cpages[i] = f2fs_compress_alloc_page(); + if (!cc->cpages[i]) { + ret = -ENOMEM; + goto out_free_cpages; + } + } + + cc->rbuf = f2fs_vmap(cc->rpages, cc->cluster_size); + if (!cc->rbuf) { + ret = -ENOMEM; + goto out_free_cpages; + } + + cc->cbuf = f2fs_vmap(cc->cpages, cc->nr_cpages); + if (!cc->cbuf) { + ret = -ENOMEM; + goto out_vunmap_rbuf; + } + + ret = cops->compress_pages(cc); + if (ret) + goto out_vunmap_cbuf; + + max_len = PAGE_SIZE * (cc->cluster_size - 1) - COMPRESS_HEADER_SIZE; + + if (cc->clen > max_len) { + ret = -EAGAIN; + goto out_vunmap_cbuf; + } + + cc->cbuf->clen = cpu_to_le32(cc->clen); + + if (fi->i_compress_flag & BIT(COMPRESS_CHKSUM)) + chksum = f2fs_crc32(F2FS_I_SB(cc->inode), + cc->cbuf->cdata, cc->clen); + cc->cbuf->chksum = cpu_to_le32(chksum); + + for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) + cc->cbuf->reserved[i] = cpu_to_le32(0); + + new_nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); + + /* zero out any unused part of the last page */ + memset(&cc->cbuf->cdata[cc->clen], 0, + (new_nr_cpages * PAGE_SIZE) - + (cc->clen + COMPRESS_HEADER_SIZE)); + + vm_unmap_ram(cc->cbuf, cc->nr_cpages); + vm_unmap_ram(cc->rbuf, cc->cluster_size); + + for (i = 0; i < cc->nr_cpages; i++) { + if (i < new_nr_cpages) + continue; + f2fs_compress_free_page(cc->cpages[i]); + cc->cpages[i] = NULL; + } + + if (cops->destroy_compress_ctx) + cops->destroy_compress_ctx(cc); + + cc->valid_nr_cpages = new_nr_cpages; + + trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, + cc->clen, ret); + return 0; + +out_vunmap_cbuf: + vm_unmap_ram(cc->cbuf, cc->nr_cpages); +out_vunmap_rbuf: + vm_unmap_ram(cc->rbuf, cc->cluster_size); +out_free_cpages: + for (i = 0; i < cc->nr_cpages; i++) { + if (cc->cpages[i]) + f2fs_compress_free_page(cc->cpages[i]); + } + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + cc->cpages = NULL; +destroy_compress_ctx: + if (cops->destroy_compress_ctx) + cops->destroy_compress_ctx(cc); +out: + trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, + cc->clen, ret); + return ret; +} + +static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, + bool pre_alloc); +static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, + bool bypass_destroy_callback, bool pre_alloc); + +void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + struct f2fs_inode_info *fi = F2FS_I(dic->inode); + const struct f2fs_compress_ops *cops = + f2fs_cops[fi->i_compress_algorithm]; + bool bypass_callback = false; + int ret; + + trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, + dic->cluster_size, fi->i_compress_algorithm); + + if (dic->failed) { + ret = -EIO; + goto out_end_io; + } + + ret = f2fs_prepare_decomp_mem(dic, false); + if (ret) { + bypass_callback = true; + goto out_release; + } + + dic->clen = le32_to_cpu(dic->cbuf->clen); + dic->rlen = PAGE_SIZE << dic->log_cluster_size; + + if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { + ret = -EFSCORRUPTED; + + /* Avoid f2fs_commit_super in irq context */ + if (!in_task) + f2fs_save_errors(sbi, ERROR_FAIL_DECOMPRESSION); + else + f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); + goto out_release; + } + + ret = cops->decompress_pages(dic); + + if (!ret && (fi->i_compress_flag & BIT(COMPRESS_CHKSUM))) { + u32 provided = le32_to_cpu(dic->cbuf->chksum); + u32 calculated = f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen); + + if (provided != calculated) { + if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) { + set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT); + printk_ratelimited( + "%sF2FS-fs (%s): checksum invalid, nid = %lu, %x vs %x", + KERN_INFO, sbi->sb->s_id, dic->inode->i_ino, + provided, calculated); + } + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + } + +out_release: + f2fs_release_decomp_mem(dic, bypass_callback, false); + +out_end_io: + trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, + dic->clen, ret); + f2fs_decompress_end_io(dic, ret, in_task); +} + +/* + * This is called when a page of a compressed cluster has been read from disk + * (or failed to be read from disk). It checks whether this page was the last + * page being waited on in the cluster, and if so, it decompresses the cluster + * (or in the case of a failure, cleans up without actually decompressing). + */ +void f2fs_end_read_compressed_page(struct page *page, bool failed, + block_t blkaddr, bool in_task) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + + dec_page_count(sbi, F2FS_RD_DATA); + + if (failed) + WRITE_ONCE(dic->failed, true); + else if (blkaddr && in_task) + f2fs_cache_compressed_page(sbi, page, + dic->inode->i_ino, blkaddr); + + if (atomic_dec_and_test(&dic->remaining_pages)) + f2fs_decompress_cluster(dic, in_task); +} + +static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) +{ + if (cc->cluster_idx == NULL_CLUSTER) + return true; + return cc->cluster_idx == cluster_idx(cc, index); +} + +bool f2fs_cluster_is_empty(struct compress_ctx *cc) +{ + return cc->nr_rpages == 0; +} + +static bool f2fs_cluster_is_full(struct compress_ctx *cc) +{ + return cc->cluster_size == cc->nr_rpages; +} + +bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) +{ + if (f2fs_cluster_is_empty(cc)) + return true; + return is_page_in_cluster(cc, index); +} + +bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages, + int index, int nr_pages, bool uptodate) +{ + unsigned long pgidx = pages[index]->index; + int i = uptodate ? 0 : 1; + + /* + * when uptodate set to true, try to check all pages in cluster is + * uptodate or not. + */ + if (uptodate && (pgidx % cc->cluster_size)) + return false; + + if (nr_pages - index < cc->cluster_size) + return false; + + for (; i < cc->cluster_size; i++) { + if (pages[index + i]->index != pgidx + i) + return false; + if (uptodate && !PageUptodate(pages[index + i])) + return false; + } + + return true; +} + +static bool cluster_has_invalid_data(struct compress_ctx *cc) +{ + loff_t i_size = i_size_read(cc->inode); + unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE); + int i; + + for (i = 0; i < cc->cluster_size; i++) { + struct page *page = cc->rpages[i]; + + f2fs_bug_on(F2FS_I_SB(cc->inode), !page); + + /* beyond EOF */ + if (page->index >= nr_pages) + return true; + } + return false; +} + +bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + bool compressed = dn->data_blkaddr == COMPRESS_ADDR; + int cluster_end = 0; + int i; + char *reason = ""; + + if (!compressed) + return false; + + /* [..., COMPR_ADDR, ...] */ + if (dn->ofs_in_node % cluster_size) { + reason = "[*|C|*|*]"; + goto out; + } + + for (i = 1; i < cluster_size; i++) { + block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + /* [COMPR_ADDR, ..., COMPR_ADDR] */ + if (blkaddr == COMPRESS_ADDR) { + reason = "[C|*|C|*]"; + goto out; + } + if (!__is_valid_data_blkaddr(blkaddr)) { + if (!cluster_end) + cluster_end = i; + continue; + } + /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ + if (cluster_end) { + reason = "[C|N|N|V]"; + goto out; + } + } + return false; +out: + f2fs_warn(sbi, "access invalid cluster, ino:%lu, nid:%u, ofs_in_node:%u, reason:%s", + dn->inode->i_ino, dn->nid, dn->ofs_in_node, reason); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return true; +} + +static int __f2fs_cluster_blocks(struct inode *inode, + unsigned int cluster_idx, bool compr) +{ + struct dnode_of_data dn; + unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + unsigned int start_idx = cluster_idx << + F2FS_I(inode)->i_log_cluster_size; + int ret; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); + if (ret) { + if (ret == -ENOENT) + ret = 0; + goto fail; + } + + if (f2fs_sanity_check_cluster(&dn)) { + ret = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_CLUSTER); + goto fail; + } + + if (dn.data_blkaddr == COMPRESS_ADDR) { + int i; + + ret = 1; + for (i = 1; i < cluster_size; i++) { + block_t blkaddr; + + blkaddr = data_blkaddr(dn.inode, + dn.node_page, dn.ofs_in_node + i); + if (compr) { + if (__is_valid_data_blkaddr(blkaddr)) + ret++; + } else { + if (blkaddr != NULL_ADDR) + ret++; + } + } + + f2fs_bug_on(F2FS_I_SB(inode), + !compr && ret != cluster_size && + !is_inode_flag_set(inode, FI_COMPRESS_RELEASED)); + } +fail: + f2fs_put_dnode(&dn); + return ret; +} + +/* return # of compressed blocks in compressed cluster */ +static int f2fs_compressed_blocks(struct compress_ctx *cc) +{ + return __f2fs_cluster_blocks(cc->inode, cc->cluster_idx, true); +} + +/* return # of valid blocks in compressed cluster */ +int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) +{ + return __f2fs_cluster_blocks(inode, + index >> F2FS_I(inode)->i_log_cluster_size, + false); +} + +static bool cluster_may_compress(struct compress_ctx *cc) +{ + if (!f2fs_need_compress_data(cc->inode)) + return false; + if (f2fs_is_atomic_file(cc->inode)) + return false; + if (!f2fs_cluster_is_full(cc)) + return false; + if (unlikely(f2fs_cp_error(F2FS_I_SB(cc->inode)))) + return false; + return !cluster_has_invalid_data(cc); +} + +static void set_cluster_writeback(struct compress_ctx *cc) +{ + int i; + + for (i = 0; i < cc->cluster_size; i++) { + if (cc->rpages[i]) + set_page_writeback(cc->rpages[i]); + } +} + +static void set_cluster_dirty(struct compress_ctx *cc) +{ + int i; + + for (i = 0; i < cc->cluster_size; i++) + if (cc->rpages[i]) + set_page_dirty(cc->rpages[i]); +} + +static int prepare_compress_overwrite(struct compress_ctx *cc, + struct page **pagep, pgoff_t index, void **fsdata) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + struct address_space *mapping = cc->inode->i_mapping; + struct page *page; + sector_t last_block_in_bio; + unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; + pgoff_t start_idx = start_idx_of_cluster(cc); + int i, ret; + +retry: + ret = f2fs_is_compressed_cluster(cc->inode, start_idx); + if (ret <= 0) + return ret; + + ret = f2fs_init_compress_ctx(cc); + if (ret) + return ret; + + /* keep page reference to avoid page reclaim */ + for (i = 0; i < cc->cluster_size; i++) { + page = f2fs_pagecache_get_page(mapping, start_idx + i, + fgp_flag, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + goto unlock_pages; + } + + if (PageUptodate(page)) + f2fs_put_page(page, 1); + else + f2fs_compress_ctx_add_page(cc, page); + } + + if (!f2fs_cluster_is_empty(cc)) { + struct bio *bio = NULL; + + ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, + &last_block_in_bio, false, true); + f2fs_put_rpages(cc); + f2fs_destroy_compress_ctx(cc, true); + if (ret) + goto out; + if (bio) + f2fs_submit_bio(sbi, bio, DATA); + + ret = f2fs_init_compress_ctx(cc); + if (ret) + goto out; + } + + for (i = 0; i < cc->cluster_size; i++) { + f2fs_bug_on(sbi, cc->rpages[i]); + + page = find_lock_page(mapping, start_idx + i); + if (!page) { + /* page can be truncated */ + goto release_and_retry; + } + + f2fs_wait_on_page_writeback(page, DATA, true, true); + f2fs_compress_ctx_add_page(cc, page); + + if (!PageUptodate(page)) { +release_and_retry: + f2fs_put_rpages(cc); + f2fs_unlock_rpages(cc, i + 1); + f2fs_destroy_compress_ctx(cc, true); + goto retry; + } + } + + if (likely(!ret)) { + *fsdata = cc->rpages; + *pagep = cc->rpages[offset_in_cluster(cc, index)]; + return cc->cluster_size; + } + +unlock_pages: + f2fs_put_rpages(cc); + f2fs_unlock_rpages(cc, i); + f2fs_destroy_compress_ctx(cc, true); +out: + return ret; +} + +int f2fs_prepare_compress_overwrite(struct inode *inode, + struct page **pagep, pgoff_t index, void **fsdata) +{ + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size, + .rpages = NULL, + .nr_rpages = 0, + }; + + return prepare_compress_overwrite(&cc, pagep, index, fsdata); +} + +bool f2fs_compress_write_end(struct inode *inode, void *fsdata, + pgoff_t index, unsigned copied) + +{ + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .rpages = fsdata, + }; + bool first_index = (index == cc.rpages[0]->index); + + if (copied) + set_cluster_dirty(&cc); + + f2fs_put_rpages_wbc(&cc, NULL, false, 1); + f2fs_destroy_compress_ctx(&cc, false); + + return first_index; +} + +int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) +{ + void *fsdata = NULL; + struct page *pagep; + int log_cluster_size = F2FS_I(inode)->i_log_cluster_size; + pgoff_t start_idx = from >> (PAGE_SHIFT + log_cluster_size) << + log_cluster_size; + int err; + + err = f2fs_is_compressed_cluster(inode, start_idx); + if (err < 0) + return err; + + /* truncate normal cluster */ + if (!err) + return f2fs_do_truncate_blocks(inode, from, lock); + + /* truncate compressed cluster */ + err = f2fs_prepare_compress_overwrite(inode, &pagep, + start_idx, &fsdata); + + /* should not be a normal cluster */ + f2fs_bug_on(F2FS_I_SB(inode), err == 0); + + if (err <= 0) + return err; + + if (err > 0) { + struct page **rpages = fsdata; + int cluster_size = F2FS_I(inode)->i_cluster_size; + int i; + + for (i = cluster_size - 1; i >= 0; i--) { + loff_t start = rpages[i]->index << PAGE_SHIFT; + + if (from <= start) { + zero_user_segment(rpages[i], 0, PAGE_SIZE); + } else { + zero_user_segment(rpages[i], from - start, + PAGE_SIZE); + break; + } + } + + f2fs_compress_write_end(inode, fsdata, start_idx, true); + } + return 0; +} + +static int f2fs_write_compressed_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct inode *inode = cc->inode; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = cc->inode->i_ino, + .type = DATA, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NEW_ADDR, + .page = NULL, + .encrypted_page = NULL, + .compressed_page = NULL, + .submitted = false, + .io_type = io_type, + .io_wbc = wbc, + .encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode), + }; + struct dnode_of_data dn; + struct node_info ni; + struct compress_io_ctx *cic; + pgoff_t start_idx = start_idx_of_cluster(cc); + unsigned int last_index = cc->cluster_size - 1; + loff_t psize; + int i, err; + bool quota_inode = IS_NOQUOTA(inode); + + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(cc->rpages[0]->mapping, -EIO); + goto out_free; + } + + if (quota_inode) { + /* + * We need to wait for node_write to avoid block allocation during + * checkpoint. This can only happen to quota writes which can cause + * the below discard race condition. + */ + f2fs_down_read(&sbi->node_write); + } else if (!f2fs_trylock_op(sbi)) { + goto out_free; + } + + set_new_dnode(&dn, cc->inode, NULL, NULL, 0); + + err = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); + if (err) + goto out_unlock_op; + + for (i = 0; i < cc->cluster_size; i++) { + if (data_blkaddr(dn.inode, dn.node_page, + dn.ofs_in_node + i) == NULL_ADDR) + goto out_put_dnode; + } + + psize = (loff_t)(cc->rpages[last_index]->index + 1) << PAGE_SHIFT; + + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); + if (err) + goto out_put_dnode; + + fio.version = ni.version; + + cic = f2fs_kmem_cache_alloc(cic_entry_slab, GFP_F2FS_ZERO, false, sbi); + if (!cic) + goto out_put_dnode; + + cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; + cic->inode = inode; + atomic_set(&cic->pending_pages, cc->valid_nr_cpages); + cic->rpages = page_array_alloc(cc->inode, cc->cluster_size); + if (!cic->rpages) + goto out_put_cic; + + cic->nr_rpages = cc->cluster_size; + + for (i = 0; i < cc->valid_nr_cpages; i++) { + f2fs_set_compressed_page(cc->cpages[i], inode, + cc->rpages[i + 1]->index, cic); + fio.compressed_page = cc->cpages[i]; + + fio.old_blkaddr = data_blkaddr(dn.inode, dn.node_page, + dn.ofs_in_node + i + 1); + + /* wait for GCed page writeback via META_MAPPING */ + f2fs_wait_on_block_writeback(inode, fio.old_blkaddr); + + if (fio.encrypted) { + fio.page = cc->rpages[i + 1]; + err = f2fs_encrypt_one_page(&fio); + if (err) + goto out_destroy_crypt; + cc->cpages[i] = fio.encrypted_page; + } + } + + set_cluster_writeback(cc); + + for (i = 0; i < cc->cluster_size; i++) + cic->rpages[i] = cc->rpages[i]; + + for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) { + block_t blkaddr; + + blkaddr = f2fs_data_blkaddr(&dn); + fio.page = cc->rpages[i]; + fio.old_blkaddr = blkaddr; + + /* cluster header */ + if (i == 0) { + if (blkaddr == COMPRESS_ADDR) + fio.compr_blocks++; + if (__is_valid_data_blkaddr(blkaddr)) + f2fs_invalidate_blocks(sbi, blkaddr); + f2fs_update_data_blkaddr(&dn, COMPRESS_ADDR); + goto unlock_continue; + } + + if (fio.compr_blocks && __is_valid_data_blkaddr(blkaddr)) + fio.compr_blocks++; + + if (i > cc->valid_nr_cpages) { + if (__is_valid_data_blkaddr(blkaddr)) { + f2fs_invalidate_blocks(sbi, blkaddr); + f2fs_update_data_blkaddr(&dn, NEW_ADDR); + } + goto unlock_continue; + } + + f2fs_bug_on(fio.sbi, blkaddr == NULL_ADDR); + + if (fio.encrypted) + fio.encrypted_page = cc->cpages[i - 1]; + else + fio.compressed_page = cc->cpages[i - 1]; + + cc->cpages[i - 1] = NULL; + f2fs_outplace_write_data(&dn, &fio); + (*submitted)++; +unlock_continue: + inode_dec_dirty_pages(cc->inode); + unlock_page(fio.page); + } + + if (fio.compr_blocks) + f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false); + f2fs_i_compr_blocks_update(inode, cc->valid_nr_cpages, true); + add_compr_block_stat(inode, cc->valid_nr_cpages); + + set_inode_flag(cc->inode, FI_APPEND_WRITE); + if (cc->cluster_idx == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + + f2fs_put_dnode(&dn); + if (quota_inode) + f2fs_up_read(&sbi->node_write); + else + f2fs_unlock_op(sbi); + + spin_lock(&fi->i_size_lock); + if (fi->last_disk_size < psize) + fi->last_disk_size = psize; + spin_unlock(&fi->i_size_lock); + + f2fs_put_rpages(cc); + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + cc->cpages = NULL; + f2fs_destroy_compress_ctx(cc, false); + return 0; + +out_destroy_crypt: + page_array_free(cc->inode, cic->rpages, cc->cluster_size); + + for (--i; i >= 0; i--) + fscrypt_finalize_bounce_page(&cc->cpages[i]); +out_put_cic: + kmem_cache_free(cic_entry_slab, cic); +out_put_dnode: + f2fs_put_dnode(&dn); +out_unlock_op: + if (quota_inode) + f2fs_up_read(&sbi->node_write); + else + f2fs_unlock_op(sbi); +out_free: + for (i = 0; i < cc->valid_nr_cpages; i++) { + f2fs_compress_free_page(cc->cpages[i]); + cc->cpages[i] = NULL; + } + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + cc->cpages = NULL; + return -EAGAIN; +} + +void f2fs_compress_write_end_io(struct bio *bio, struct page *page) +{ + struct f2fs_sb_info *sbi = bio->bi_private; + struct compress_io_ctx *cic = + (struct compress_io_ctx *)page_private(page); + int i; + + if (unlikely(bio->bi_status)) + mapping_set_error(cic->inode->i_mapping, -EIO); + + f2fs_compress_free_page(page); + + dec_page_count(sbi, F2FS_WB_DATA); + + if (atomic_dec_return(&cic->pending_pages)) + return; + + for (i = 0; i < cic->nr_rpages; i++) { + WARN_ON(!cic->rpages[i]); + clear_page_private_gcing(cic->rpages[i]); + end_page_writeback(cic->rpages[i]); + } + + page_array_free(cic->inode, cic->rpages, cic->nr_rpages); + kmem_cache_free(cic_entry_slab, cic); +} + +static int f2fs_write_raw_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct address_space *mapping = cc->inode->i_mapping; + int _submitted, compr_blocks, ret, i; + + compr_blocks = f2fs_compressed_blocks(cc); + + for (i = 0; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; + + redirty_page_for_writepage(wbc, cc->rpages[i]); + unlock_page(cc->rpages[i]); + } + + if (compr_blocks < 0) + return compr_blocks; + + for (i = 0; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; +retry_write: + lock_page(cc->rpages[i]); + + if (cc->rpages[i]->mapping != mapping) { +continue_unlock: + unlock_page(cc->rpages[i]); + continue; + } + + if (!PageDirty(cc->rpages[i])) + goto continue_unlock; + + if (PageWriteback(cc->rpages[i])) { + if (wbc->sync_mode == WB_SYNC_NONE) + goto continue_unlock; + f2fs_wait_on_page_writeback(cc->rpages[i], DATA, true, true); + } + + if (!clear_page_dirty_for_io(cc->rpages[i])) + goto continue_unlock; + + ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, + NULL, NULL, wbc, io_type, + compr_blocks, false); + if (ret) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(cc->rpages[i]); + ret = 0; + } else if (ret == -EAGAIN) { + /* + * for quota file, just redirty left pages to + * avoid deadlock caused by cluster update race + * from foreground operation. + */ + if (IS_NOQUOTA(cc->inode)) + return 0; + ret = 0; + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + goto retry_write; + } + return ret; + } + + *submitted += _submitted; + } + + f2fs_balance_fs(F2FS_M_SB(mapping), true); + + return 0; +} + +int f2fs_write_multi_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + int err; + + *submitted = 0; + if (cluster_may_compress(cc)) { + err = f2fs_compress_pages(cc); + if (err == -EAGAIN) { + add_compr_block_stat(cc->inode, cc->cluster_size); + goto write; + } else if (err) { + f2fs_put_rpages_wbc(cc, wbc, true, 1); + goto destroy_out; + } + + err = f2fs_write_compressed_pages(cc, submitted, + wbc, io_type); + if (!err) + return 0; + f2fs_bug_on(F2FS_I_SB(cc->inode), err != -EAGAIN); + } +write: + f2fs_bug_on(F2FS_I_SB(cc->inode), *submitted); + + err = f2fs_write_raw_pages(cc, submitted, wbc, io_type); + f2fs_put_rpages_wbc(cc, wbc, false, 0); +destroy_out: + f2fs_destroy_compress_ctx(cc, false); + return err; +} + +static inline bool allow_memalloc_for_decomp(struct f2fs_sb_info *sbi, + bool pre_alloc) +{ + return pre_alloc ^ f2fs_low_mem_mode(sbi); +} + +static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, + bool pre_alloc) +{ + const struct f2fs_compress_ops *cops = + f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + int i; + + if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) + return 0; + + dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); + if (!dic->tpages) + return -ENOMEM; + + for (i = 0; i < dic->cluster_size; i++) { + if (dic->rpages[i]) { + dic->tpages[i] = dic->rpages[i]; + continue; + } + + dic->tpages[i] = f2fs_compress_alloc_page(); + if (!dic->tpages[i]) + return -ENOMEM; + } + + dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); + if (!dic->rbuf) + return -ENOMEM; + + dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); + if (!dic->cbuf) + return -ENOMEM; + + if (cops->init_decompress_ctx) + return cops->init_decompress_ctx(dic); + + return 0; +} + +static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, + bool bypass_destroy_callback, bool pre_alloc) +{ + const struct f2fs_compress_ops *cops = + f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + + if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) + return; + + if (!bypass_destroy_callback && cops->destroy_decompress_ctx) + cops->destroy_decompress_ctx(dic); + + if (dic->cbuf) + vm_unmap_ram(dic->cbuf, dic->nr_cpages); + + if (dic->rbuf) + vm_unmap_ram(dic->rbuf, dic->cluster_size); +} + +static void f2fs_free_dic(struct decompress_io_ctx *dic, + bool bypass_destroy_callback); + +struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) +{ + struct decompress_io_ctx *dic; + pgoff_t start_idx = start_idx_of_cluster(cc); + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + int i, ret; + + dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, false, sbi); + if (!dic) + return ERR_PTR(-ENOMEM); + + dic->rpages = page_array_alloc(cc->inode, cc->cluster_size); + if (!dic->rpages) { + kmem_cache_free(dic_entry_slab, dic); + return ERR_PTR(-ENOMEM); + } + + dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; + dic->inode = cc->inode; + atomic_set(&dic->remaining_pages, cc->nr_cpages); + dic->cluster_idx = cc->cluster_idx; + dic->cluster_size = cc->cluster_size; + dic->log_cluster_size = cc->log_cluster_size; + dic->nr_cpages = cc->nr_cpages; + refcount_set(&dic->refcnt, 1); + dic->failed = false; + dic->need_verity = f2fs_need_verity(cc->inode, start_idx); + + for (i = 0; i < dic->cluster_size; i++) + dic->rpages[i] = cc->rpages[i]; + dic->nr_rpages = cc->cluster_size; + + dic->cpages = page_array_alloc(dic->inode, dic->nr_cpages); + if (!dic->cpages) { + ret = -ENOMEM; + goto out_free; + } + + for (i = 0; i < dic->nr_cpages; i++) { + struct page *page; + + page = f2fs_compress_alloc_page(); + if (!page) { + ret = -ENOMEM; + goto out_free; + } + + f2fs_set_compressed_page(page, cc->inode, + start_idx + i + 1, dic); + dic->cpages[i] = page; + } + + ret = f2fs_prepare_decomp_mem(dic, true); + if (ret) + goto out_free; + + return dic; + +out_free: + f2fs_free_dic(dic, true); + return ERR_PTR(ret); +} + +static void f2fs_free_dic(struct decompress_io_ctx *dic, + bool bypass_destroy_callback) +{ + int i; + + f2fs_release_decomp_mem(dic, bypass_destroy_callback, true); + + if (dic->tpages) { + for (i = 0; i < dic->cluster_size; i++) { + if (dic->rpages[i]) + continue; + if (!dic->tpages[i]) + continue; + f2fs_compress_free_page(dic->tpages[i]); + } + page_array_free(dic->inode, dic->tpages, dic->cluster_size); + } + + if (dic->cpages) { + for (i = 0; i < dic->nr_cpages; i++) { + if (!dic->cpages[i]) + continue; + f2fs_compress_free_page(dic->cpages[i]); + } + page_array_free(dic->inode, dic->cpages, dic->nr_cpages); + } + + page_array_free(dic->inode, dic->rpages, dic->nr_rpages); + kmem_cache_free(dic_entry_slab, dic); +} + +static void f2fs_late_free_dic(struct work_struct *work) +{ + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, free_work); + + f2fs_free_dic(dic, false); +} + +static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) +{ + if (refcount_dec_and_test(&dic->refcnt)) { + if (in_task) { + f2fs_free_dic(dic, false); + } else { + INIT_WORK(&dic->free_work, f2fs_late_free_dic); + queue_work(F2FS_I_SB(dic->inode)->post_read_wq, + &dic->free_work); + } + } +} + +/* + * Update and unlock the cluster's pagecache pages, and release the reference to + * the decompress_io_ctx that was being held for I/O completion. + */ +static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task) +{ + int i; + + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; + + if (!rpage) + continue; + + /* PG_error was set if verity failed. */ + if (failed || PageError(rpage)) { + ClearPageUptodate(rpage); + /* will re-read again later */ + ClearPageError(rpage); + } else { + SetPageUptodate(rpage); + } + unlock_page(rpage); + } + + f2fs_put_dic(dic, in_task); +} + +static void f2fs_verify_cluster(struct work_struct *work) +{ + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, verity_work); + int i; + + /* Verify the cluster's decompressed pages with fs-verity. */ + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; + + if (rpage && !fsverity_verify_page(rpage)) + SetPageError(rpage); + } + + __f2fs_decompress_end_io(dic, false, true); +} + +/* + * This is called when a compressed cluster has been decompressed + * (or failed to be read and/or decompressed). + */ +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task) +{ + if (!failed && dic->need_verity) { + /* + * Note that to avoid deadlocks, the verity work can't be done + * on the decompression workqueue. This is because verifying + * the data pages can involve reading metadata pages from the + * file, and these metadata pages may be compressed. + */ + INIT_WORK(&dic->verity_work, f2fs_verify_cluster); + fsverity_enqueue_verify_work(&dic->verity_work); + } else { + __f2fs_decompress_end_io(dic, failed, in_task); + } +} + +/* + * Put a reference to a compressed page's decompress_io_ctx. + * + * This is called when the page is no longer needed and can be freed. + */ +void f2fs_put_page_dic(struct page *page, bool in_task) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + + f2fs_put_dic(dic, in_task); +} + +/* + * check whether cluster blocks are contiguous, and add extent cache entry + * only if cluster blocks are logically and physically contiguous. + */ +unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) +{ + bool compressed = f2fs_data_blkaddr(dn) == COMPRESS_ADDR; + int i = compressed ? 1 : 0; + block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + for (i += 1; i < F2FS_I(dn->inode)->i_cluster_size; i++) { + block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + if (!__is_valid_data_blkaddr(blkaddr)) + break; + if (first_blkaddr + i - (compressed ? 1 : 0) != blkaddr) + return 0; + } + + return compressed ? i - 1 : i; +} + +const struct address_space_operations f2fs_compress_aops = { + .release_folio = f2fs_release_folio, + .invalidate_folio = f2fs_invalidate_folio, +}; + +struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->compress_inode->i_mapping; +} + +void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + if (!sbi->compress_inode) + return; + invalidate_mapping_pages(COMPRESS_MAPPING(sbi), blkaddr, blkaddr); +} + +void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + nid_t ino, block_t blkaddr) +{ + struct page *cpage; + int ret; + + if (!test_opt(sbi, COMPRESS_CACHE)) + return; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) + return; + + if (!f2fs_available_free_memory(sbi, COMPRESS_PAGE)) + return; + + cpage = find_get_page(COMPRESS_MAPPING(sbi), blkaddr); + if (cpage) { + f2fs_put_page(cpage, 0); + return; + } + + cpage = alloc_page(__GFP_NOWARN | __GFP_IO); + if (!cpage) + return; + + ret = add_to_page_cache_lru(cpage, COMPRESS_MAPPING(sbi), + blkaddr, GFP_NOFS); + if (ret) { + f2fs_put_page(cpage, 0); + return; + } + + set_page_private_data(cpage, ino); + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) + goto out; + + memcpy(page_address(cpage), page_address(page), PAGE_SIZE); + SetPageUptodate(cpage); +out: + f2fs_put_page(cpage, 1); +} + +bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + block_t blkaddr) +{ + struct page *cpage; + bool hitted = false; + + if (!test_opt(sbi, COMPRESS_CACHE)) + return false; + + cpage = f2fs_pagecache_get_page(COMPRESS_MAPPING(sbi), + blkaddr, FGP_LOCK | FGP_NOWAIT, GFP_NOFS); + if (cpage) { + if (PageUptodate(cpage)) { + atomic_inc(&sbi->compress_page_hit); + memcpy(page_address(page), + page_address(cpage), PAGE_SIZE); + hitted = true; + } + f2fs_put_page(cpage, 1); + } + + return hitted; +} + +void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct address_space *mapping = COMPRESS_MAPPING(sbi); + struct folio_batch fbatch; + pgoff_t index = 0; + pgoff_t end = MAX_BLKADDR(sbi); + + if (!mapping->nrpages) + return; + + folio_batch_init(&fbatch); + + do { + unsigned int nr, i; + + nr = filemap_get_folios(mapping, &index, end - 1, &fbatch); + if (!nr) + break; + + for (i = 0; i < nr; i++) { + struct folio *folio = fbatch.folios[i]; + + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); + continue; + } + + if (ino != get_page_private_data(&folio->page)) { + folio_unlock(folio); + continue; + } + + generic_error_remove_page(mapping, &folio->page); + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } while (index < end); +} + +int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) +{ + struct inode *inode; + + if (!test_opt(sbi, COMPRESS_CACHE)) + return 0; + + inode = f2fs_iget(sbi->sb, F2FS_COMPRESS_INO(sbi)); + if (IS_ERR(inode)) + return PTR_ERR(inode); + sbi->compress_inode = inode; + + sbi->compress_percent = COMPRESS_PERCENT; + sbi->compress_watermark = COMPRESS_WATERMARK; + + atomic_set(&sbi->compress_page_hit, 0); + + return 0; +} + +void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) +{ + if (!sbi->compress_inode) + return; + iput(sbi->compress_inode); + sbi->compress_inode = NULL; +} + +int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + char slab_name[35]; + + if (!f2fs_sb_has_compression(sbi)) + return 0; + + sprintf(slab_name, "f2fs_page_array_entry-%u:%u", MAJOR(dev), MINOR(dev)); + + sbi->page_array_slab_size = sizeof(struct page *) << + F2FS_OPTION(sbi).compress_log_size; + + sbi->page_array_slab = f2fs_kmem_cache_create(slab_name, + sbi->page_array_slab_size); + if (!sbi->page_array_slab) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) +{ + kmem_cache_destroy(sbi->page_array_slab); +} + +static int __init f2fs_init_cic_cache(void) +{ + cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry", + sizeof(struct compress_io_ctx)); + if (!cic_entry_slab) + return -ENOMEM; + return 0; +} + +static void f2fs_destroy_cic_cache(void) +{ + kmem_cache_destroy(cic_entry_slab); +} + +static int __init f2fs_init_dic_cache(void) +{ + dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry", + sizeof(struct decompress_io_ctx)); + if (!dic_entry_slab) + return -ENOMEM; + return 0; +} + +static void f2fs_destroy_dic_cache(void) +{ + kmem_cache_destroy(dic_entry_slab); +} + +int __init f2fs_init_compress_cache(void) +{ + int err; + + err = f2fs_init_cic_cache(); + if (err) + goto out; + err = f2fs_init_dic_cache(); + if (err) + goto free_cic; + return 0; +free_cic: + f2fs_destroy_cic_cache(); +out: + return -ENOMEM; +} + +void f2fs_destroy_compress_cache(void) +{ + f2fs_destroy_dic_cache(); + f2fs_destroy_cic_cache(); +} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c new file mode 100644 index 000000000..8b561af37 --- /dev/null +++ b/fs/f2fs/data.c @@ -0,0 +1,4235 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/data.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/buffer_head.h> +#include <linux/sched/mm.h> +#include <linux/mpage.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include <linux/blkdev.h> +#include <linux/bio.h> +#include <linux/blk-crypto.h> +#include <linux/swap.h> +#include <linux/prefetch.h> +#include <linux/uio.h> +#include <linux/sched/signal.h> +#include <linux/fiemap.h> +#include <linux/iomap.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "iostat.h" +#include <trace/events/f2fs.h> + +#define NUM_PREALLOC_POST_READ_CTXS 128 + +static struct kmem_cache *bio_post_read_ctx_cache; +static struct kmem_cache *bio_entry_slab; +static mempool_t *bio_post_read_ctx_pool; +static struct bio_set f2fs_bioset; + +#define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE + +int __init f2fs_init_bioset(void) +{ + if (bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, + 0, BIOSET_NEED_BVECS)) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_bioset(void) +{ + bioset_exit(&f2fs_bioset); +} + +static bool __is_cp_guaranteed(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct f2fs_sb_info *sbi; + + if (!mapping) + return false; + + inode = mapping->host; + sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_NODE_INO(sbi) || + S_ISDIR(inode->i_mode)) + return true; + + if (f2fs_is_compressed_page(page)) + return false; + if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || + page_private_gcing(page)) + return true; + return false; +} + +static enum count_type __read_io_type(struct page *page) +{ + struct address_space *mapping = page_file_mapping(page); + + if (mapping) { + struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi)) + return F2FS_RD_META; + + if (inode->i_ino == F2FS_NODE_INO(sbi)) + return F2FS_RD_NODE; + } + return F2FS_RD_DATA; +} + +/* postprocessing steps for read bios */ +enum bio_post_read_step { +#ifdef CONFIG_FS_ENCRYPTION + STEP_DECRYPT = BIT(0), +#else + STEP_DECRYPT = 0, /* compile out the decryption-related code */ +#endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + STEP_DECOMPRESS = BIT(1), +#else + STEP_DECOMPRESS = 0, /* compile out the decompression-related code */ +#endif +#ifdef CONFIG_FS_VERITY + STEP_VERITY = BIT(2), +#else + STEP_VERITY = 0, /* compile out the verity-related code */ +#endif +}; + +struct bio_post_read_ctx { + struct bio *bio; + struct f2fs_sb_info *sbi; + struct work_struct work; + unsigned int enabled_steps; + block_t fs_blkaddr; +}; + +static void f2fs_finish_read_bio(struct bio *bio, bool in_task) +{ + struct bio_vec *bv; + struct bvec_iter_all iter_all; + + /* + * Update and unlock the bio's pagecache pages, and put the + * decompression context for any compressed pages. + */ + bio_for_each_segment_all(bv, bio, iter_all) { + struct page *page = bv->bv_page; + + if (f2fs_is_compressed_page(page)) { + if (bio->bi_status) + f2fs_end_read_compressed_page(page, true, 0, + in_task); + f2fs_put_page_dic(page, in_task); + continue; + } + + /* PG_error was set if verity failed. */ + if (bio->bi_status || PageError(page)) { + ClearPageUptodate(page); + /* will re-read again later */ + ClearPageError(page); + } else { + SetPageUptodate(page); + } + dec_page_count(F2FS_P_SB(page), __read_io_type(page)); + unlock_page(page); + } + + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); +} + +static void f2fs_verify_bio(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; + bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS); + + /* + * fsverity_verify_bio() may call readahead() again, and while verity + * will be disabled for this, decryption and/or decompression may still + * be needed, resulting in another bio_post_read_ctx being allocated. + * So to prevent deadlocks we need to release the current ctx to the + * mempool first. This assumes that verity is the last post-read step. + */ + mempool_free(ctx, bio_post_read_ctx_pool); + bio->bi_private = NULL; + + /* + * Verify the bio's pages with fs-verity. Exclude compressed pages, + * as those were handled separately by f2fs_end_read_compressed_page(). + */ + if (may_have_compressed_pages) { + struct bio_vec *bv; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bv, bio, iter_all) { + struct page *page = bv->bv_page; + + if (!f2fs_is_compressed_page(page) && + !fsverity_verify_page(page)) + SetPageError(page); + } + } else { + fsverity_verify_bio(bio); + } + + f2fs_finish_read_bio(bio, true); +} + +/* + * If the bio's data needs to be verified with fs-verity, then enqueue the + * verity work for the bio. Otherwise finish the bio now. + * + * Note that to avoid deadlocks, the verity work can't be done on the + * decryption/decompression workqueue. This is because verifying the data pages + * can involve reading verity metadata pages from the file, and these verity + * metadata pages may be encrypted and/or compressed. + */ +static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task) +{ + struct bio_post_read_ctx *ctx = bio->bi_private; + + if (ctx && (ctx->enabled_steps & STEP_VERITY)) { + INIT_WORK(&ctx->work, f2fs_verify_bio); + fsverity_enqueue_verify_work(&ctx->work); + } else { + f2fs_finish_read_bio(bio, in_task); + } +} + +/* + * Handle STEP_DECOMPRESS by decompressing any compressed clusters whose last + * remaining page was read by @ctx->bio. + * + * Note that a bio may span clusters (even a mix of compressed and uncompressed + * clusters) or be for just part of a cluster. STEP_DECOMPRESS just indicates + * that the bio includes at least one compressed page. The actual decompression + * is done on a per-cluster basis, not a per-bio basis. + */ +static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, + bool in_task) +{ + struct bio_vec *bv; + struct bvec_iter_all iter_all; + bool all_compressed = true; + block_t blkaddr = ctx->fs_blkaddr; + + bio_for_each_segment_all(bv, ctx->bio, iter_all) { + struct page *page = bv->bv_page; + + if (f2fs_is_compressed_page(page)) + f2fs_end_read_compressed_page(page, false, blkaddr, + in_task); + else + all_compressed = false; + + blkaddr++; + } + + /* + * Optimization: if all the bio's pages are compressed, then scheduling + * the per-bio verity work is unnecessary, as verity will be fully + * handled at the compression cluster level. + */ + if (all_compressed) + ctx->enabled_steps &= ~STEP_VERITY; +} + +static void f2fs_post_read_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; + + if ((ctx->enabled_steps & STEP_DECRYPT) && !fscrypt_decrypt_bio(bio)) { + f2fs_finish_read_bio(bio, true); + return; + } + + if (ctx->enabled_steps & STEP_DECOMPRESS) + f2fs_handle_step_decompress(ctx, true); + + f2fs_verify_and_finish_bio(bio, true); +} + +static void f2fs_read_end_io(struct bio *bio) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); + struct bio_post_read_ctx *ctx; + bool intask = in_task(); + + iostat_update_and_unbind_ctx(bio, 0); + ctx = bio->bi_private; + + if (time_to_inject(sbi, FAULT_READ_IO)) { + f2fs_show_injection_info(sbi, FAULT_READ_IO); + bio->bi_status = BLK_STS_IOERR; + } + + if (bio->bi_status) { + f2fs_finish_read_bio(bio, intask); + return; + } + + if (ctx) { + unsigned int enabled_steps = ctx->enabled_steps & + (STEP_DECRYPT | STEP_DECOMPRESS); + + /* + * If we have only decompression step between decompression and + * decrypt, we don't need post processing for this. + */ + if (enabled_steps == STEP_DECOMPRESS && + !f2fs_low_mem_mode(sbi)) { + f2fs_handle_step_decompress(ctx, intask); + } else if (enabled_steps) { + INIT_WORK(&ctx->work, f2fs_post_read_work); + queue_work(ctx->sbi->post_read_wq, &ctx->work); + return; + } + } + + f2fs_verify_and_finish_bio(bio, intask); +} + +static void f2fs_write_end_io(struct bio *bio) +{ + struct f2fs_sb_info *sbi; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + iostat_update_and_unbind_ctx(bio, 1); + sbi = bio->bi_private; + + if (time_to_inject(sbi, FAULT_WRITE_IO)) { + f2fs_show_injection_info(sbi, FAULT_WRITE_IO); + bio->bi_status = BLK_STS_IOERR; + } + + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + enum count_type type = WB_DATA_TYPE(page); + + if (page_private_dummy(page)) { + clear_page_private_dummy(page); + unlock_page(page); + mempool_free(page, sbi->write_io_dummy); + + if (unlikely(bio->bi_status)) + f2fs_stop_checkpoint(sbi, true, + STOP_CP_REASON_WRITE_FAIL); + continue; + } + + fscrypt_finalize_bounce_page(&page); + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_is_compressed_page(page)) { + f2fs_compress_write_end_io(bio, page); + continue; + } +#endif + + if (unlikely(bio->bi_status)) { + mapping_set_error(page->mapping, -EIO); + if (type == F2FS_WB_CP_DATA) + f2fs_stop_checkpoint(sbi, true, + STOP_CP_REASON_WRITE_FAIL); + } + + f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && + page->index != nid_of_node(page)); + + dec_page_count(sbi, type); + if (f2fs_in_warm_node_list(sbi, page)) + f2fs_del_fsync_node_entry(sbi, page); + clear_page_private_gcing(page); + end_page_writeback(page); + } + if (!get_pages(sbi, F2FS_WB_CP_DATA) && + wq_has_sleeper(&sbi->cp_wait)) + wake_up(&sbi->cp_wait); + + bio_put(bio); +} + +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, sector_t *sector) +{ + struct block_device *bdev = sbi->sb->s_bdev; + int i; + + if (f2fs_is_multi_device(sbi)) { + for (i = 0; i < sbi->s_ndevs; i++) { + if (FDEV(i).start_blk <= blk_addr && + FDEV(i).end_blk >= blk_addr) { + blk_addr -= FDEV(i).start_blk; + bdev = FDEV(i).bdev; + break; + } + } + } + + if (sector) + *sector = SECTOR_FROM_BLOCK(blk_addr); + return bdev; +} + +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + int i; + + if (!f2fs_is_multi_device(sbi)) + return 0; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) + return i; + return 0; +} + +static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) +{ + unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0); + unsigned int fua_flag, meta_flag, io_flag; + blk_opf_t op_flags = 0; + + if (fio->op != REQ_OP_WRITE) + return 0; + if (fio->type == DATA) + io_flag = fio->sbi->data_io_flag; + else if (fio->type == NODE) + io_flag = fio->sbi->node_io_flag; + else + return 0; + + fua_flag = io_flag & temp_mask; + meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; + + /* + * data/node io flag bits per temp: + * REQ_META | REQ_FUA | + * 5 | 4 | 3 | 2 | 1 | 0 | + * Cold | Warm | Hot | Cold | Warm | Hot | + */ + if (BIT(fio->temp) & meta_flag) + op_flags |= REQ_META; + if (BIT(fio->temp) & fua_flag) + op_flags |= REQ_FUA; + return op_flags; +} + +static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) +{ + struct f2fs_sb_info *sbi = fio->sbi; + struct block_device *bdev; + sector_t sector; + struct bio *bio; + + bdev = f2fs_target_device(sbi, fio->new_blkaddr, §or); + bio = bio_alloc_bioset(bdev, npages, + fio->op | fio->op_flags | f2fs_io_flags(fio), + GFP_NOIO, &f2fs_bioset); + bio->bi_iter.bi_sector = sector; + if (is_read_io(fio->op)) { + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = NULL; + } else { + bio->bi_end_io = f2fs_write_end_io; + bio->bi_private = sbi; + } + iostat_alloc_and_bind_ctx(sbi, bio, NULL); + + if (fio->io_wbc) + wbc_init_bio(fio->io_wbc, bio); + + return bio; +} + +static void f2fs_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, + pgoff_t first_idx, + const struct f2fs_io_info *fio, + gfp_t gfp_mask) +{ + /* + * The f2fs garbage collector sets ->encrypted_page when it wants to + * read/write raw data without encryption. + */ + if (!fio || !fio->encrypted_page) + fscrypt_set_bio_crypt_ctx(bio, inode, first_idx, gfp_mask); +} + +static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode, + pgoff_t next_idx, + const struct f2fs_io_info *fio) +{ + /* + * The f2fs garbage collector sets ->encrypted_page when it wants to + * read/write raw data without encryption. + */ + if (fio && fio->encrypted_page) + return !bio_has_crypt_ctx(bio); + + return fscrypt_mergeable_bio(bio, inode, next_idx); +} + +static inline void __submit_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type) +{ + if (!is_read_io(bio_op(bio))) { + unsigned int start; + + if (type != DATA && type != NODE) + goto submit_io; + + if (f2fs_lfs_mode(sbi) && current->plug) + blk_finish_plug(current->plug); + + if (!F2FS_IO_ALIGNED(sbi)) + goto submit_io; + + start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; + start %= F2FS_IO_SIZE(sbi); + + if (start == 0) + goto submit_io; + + /* fill dummy pages */ + for (; start < F2FS_IO_SIZE(sbi); start++) { + struct page *page = + mempool_alloc(sbi->write_io_dummy, + GFP_NOIO | __GFP_NOFAIL); + f2fs_bug_on(sbi, !page); + + lock_page(page); + + zero_user_segment(page, 0, PAGE_SIZE); + set_page_private_dummy(page); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + f2fs_bug_on(sbi, 1); + } + /* + * In the NODE case, we lose next block address chain. So, we + * need to do checkpoint in f2fs_sync_file. + */ + if (type == NODE) + set_sbi_flag(sbi, SBI_NEED_CP); + } +submit_io: + if (is_read_io(bio_op(bio))) + trace_f2fs_submit_read_bio(sbi->sb, type, bio); + else + trace_f2fs_submit_write_bio(sbi->sb, type, bio); + + iostat_update_submit_ctx(bio, type); + submit_bio(bio); +} + +void f2fs_submit_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type) +{ + __submit_bio(sbi, bio, type); +} + +static void __submit_merged_bio(struct f2fs_bio_info *io) +{ + struct f2fs_io_info *fio = &io->fio; + + if (!io->bio) + return; + + if (is_read_io(fio->op)) + trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); + else + trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio); + + __submit_bio(io->sbi, io->bio, fio->type); + io->bio = NULL; +} + +static bool __has_merged_page(struct bio *bio, struct inode *inode, + struct page *page, nid_t ino) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + if (!bio) + return false; + + if (!inode && !page && !ino) + return true; + + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *target = bvec->bv_page; + + if (fscrypt_is_bounce_page(target)) { + target = fscrypt_pagecache_page(target); + if (IS_ERR(target)) + continue; + } + if (f2fs_is_compressed_page(target)) { + target = f2fs_compress_control_page(target); + if (IS_ERR(target)) + continue; + } + + if (inode && inode == target->mapping->host) + return true; + if (page && page == target) + return true; + if (ino && ino == ino_of_node(target)) + return true; + } + + return false; +} + +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_PAGE_TYPE; i++) { + int n = (i == META) ? 1 : NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = f2fs_kmalloc(sbi, + array_size(n, sizeof(struct f2fs_bio_info)), + GFP_KERNEL); + if (!sbi->write_io[i]) + return -ENOMEM; + + for (j = HOT; j < n; j++) { + init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); + init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); + } + } + + return 0; +} + +static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = sbi->write_io[btype] + temp; + + f2fs_down_write(&io->io_rwsem); + + if (!io->bio) + goto unlock_out; + + /* change META to META_FLUSH in the checkpoint procedure */ + if (type >= META_FLUSH) { + io->fio.type = META_FLUSH; + io->bio->bi_opf |= REQ_META | REQ_PRIO | REQ_SYNC; + if (!test_opt(sbi, NOBARRIER)) + io->bio->bi_opf |= REQ_PREFLUSH | REQ_FUA; + } + __submit_merged_bio(io); +unlock_out: + f2fs_up_write(&io->io_rwsem); +} + +static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, struct page *page, + nid_t ino, enum page_type type, bool force) +{ + enum temp_type temp; + bool ret = true; + + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + if (!force) { + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = sbi->write_io[btype] + temp; + + f2fs_down_read(&io->io_rwsem); + ret = __has_merged_page(io->bio, inode, page, ino); + f2fs_up_read(&io->io_rwsem); + } + if (ret) + __f2fs_submit_merged_write(sbi, type, temp); + + /* TODO: use HOT temp only for meta pages now. */ + if (type >= META) + break; + } +} + +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) +{ + __submit_merged_write_cond(sbi, NULL, NULL, 0, type, true); +} + +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, struct page *page, + nid_t ino, enum page_type type) +{ + __submit_merged_write_cond(sbi, inode, page, ino, type, false); +} + +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) +{ + f2fs_submit_merged_write(sbi, DATA); + f2fs_submit_merged_write(sbi, NODE); + f2fs_submit_merged_write(sbi, META); +} + +/* + * Fill the locked page with data located in the block address. + * A caller needs to unlock the page on failure. + */ +int f2fs_submit_page_bio(struct f2fs_io_info *fio) +{ + struct bio *bio; + struct page *page = fio->encrypted_page ? + fio->encrypted_page : fio->page; + + if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, + fio->is_por ? META_POR : (__is_meta_io(fio) ? + META_GENERIC : DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); + return -EFSCORRUPTED; + } + + trace_f2fs_submit_page_bio(page, fio); + + /* Allocate a new bio */ + bio = __bio_alloc(fio, 1); + + f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, + fio->page->index, fio, GFP_NOIO); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + return -EFAULT; + } + + if (fio->io_wbc && !is_read_io(fio->op)) + wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + + inc_page_count(fio->sbi, is_read_io(fio->op) ? + __read_io_type(page) : WB_DATA_TYPE(fio->page)); + + __submit_bio(fio->sbi, bio, fio->type); + return 0; +} + +static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, + block_t last_blkaddr, block_t cur_blkaddr) +{ + if (unlikely(sbi->max_io_bytes && + bio->bi_iter.bi_size >= sbi->max_io_bytes)) + return false; + if (last_blkaddr + 1 != cur_blkaddr) + return false; + return bio->bi_bdev == f2fs_target_device(sbi, cur_blkaddr, NULL); +} + +static bool io_type_is_mergeable(struct f2fs_bio_info *io, + struct f2fs_io_info *fio) +{ + if (io->fio.op != fio->op) + return false; + return io->fio.op_flags == fio->op_flags; +} + +static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, + struct f2fs_bio_info *io, + struct f2fs_io_info *fio, + block_t last_blkaddr, + block_t cur_blkaddr) +{ + if (F2FS_IO_ALIGNED(sbi) && (fio->type == DATA || fio->type == NODE)) { + unsigned int filled_blocks = + F2FS_BYTES_TO_BLK(bio->bi_iter.bi_size); + unsigned int io_size = F2FS_IO_SIZE(sbi); + unsigned int left_vecs = bio->bi_max_vecs - bio->bi_vcnt; + + /* IOs in bio is aligned and left space of vectors is not enough */ + if (!(filled_blocks % io_size) && left_vecs < io_size) + return false; + } + if (!page_is_mergeable(sbi, bio, last_blkaddr, cur_blkaddr)) + return false; + return io_type_is_mergeable(io, fio); +} + +static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio, + struct page *page, enum temp_type temp) +{ + struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; + struct bio_entry *be; + + be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS, true, NULL); + be->bio = bio; + bio_get(bio); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) + f2fs_bug_on(sbi, 1); + + f2fs_down_write(&io->bio_list_lock); + list_add_tail(&be->list, &io->bio_list); + f2fs_up_write(&io->bio_list_lock); +} + +static void del_bio_entry(struct bio_entry *be) +{ + list_del(&be->list); + kmem_cache_free(bio_entry_slab, be); +} + +static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, + struct page *page) +{ + struct f2fs_sb_info *sbi = fio->sbi; + enum temp_type temp; + bool found = false; + int ret = -EAGAIN; + + for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) { + struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; + struct list_head *head = &io->bio_list; + struct bio_entry *be; + + f2fs_down_write(&io->bio_list_lock); + list_for_each_entry(be, head, list) { + if (be->bio != *bio) + continue; + + found = true; + + f2fs_bug_on(sbi, !page_is_mergeable(sbi, *bio, + *fio->last_block, + fio->new_blkaddr)); + if (f2fs_crypt_mergeable_bio(*bio, + fio->page->mapping->host, + fio->page->index, fio) && + bio_add_page(*bio, page, PAGE_SIZE, 0) == + PAGE_SIZE) { + ret = 0; + break; + } + + /* page can't be merged into bio; submit the bio */ + del_bio_entry(be); + __submit_bio(sbi, *bio, DATA); + break; + } + f2fs_up_write(&io->bio_list_lock); + } + + if (ret) { + bio_put(*bio); + *bio = NULL; + } + + return ret; +} + +void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, + struct bio **bio, struct page *page) +{ + enum temp_type temp; + bool found = false; + struct bio *target = bio ? *bio : NULL; + + f2fs_bug_on(sbi, !target && !page); + + for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) { + struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; + struct list_head *head = &io->bio_list; + struct bio_entry *be; + + if (list_empty(head)) + continue; + + f2fs_down_read(&io->bio_list_lock); + list_for_each_entry(be, head, list) { + if (target) + found = (target == be->bio); + else + found = __has_merged_page(be->bio, NULL, + page, 0); + if (found) + break; + } + f2fs_up_read(&io->bio_list_lock); + + if (!found) + continue; + + found = false; + + f2fs_down_write(&io->bio_list_lock); + list_for_each_entry(be, head, list) { + if (target) + found = (target == be->bio); + else + found = __has_merged_page(be->bio, NULL, + page, 0); + if (found) { + target = be->bio; + del_bio_entry(be); + break; + } + } + f2fs_up_write(&io->bio_list_lock); + } + + if (found) + __submit_bio(sbi, target, DATA); + if (bio && *bio) { + bio_put(*bio); + *bio = NULL; + } +} + +int f2fs_merge_page_bio(struct f2fs_io_info *fio) +{ + struct bio *bio = *fio->bio; + struct page *page = fio->encrypted_page ? + fio->encrypted_page : fio->page; + + if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, + __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) { + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); + return -EFSCORRUPTED; + } + + trace_f2fs_submit_page_bio(page, fio); + + if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, + fio->new_blkaddr)) + f2fs_submit_merged_ipu_write(fio->sbi, &bio, NULL); +alloc_new: + if (!bio) { + bio = __bio_alloc(fio, BIO_MAX_VECS); + f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, + fio->page->index, fio, GFP_NOIO); + + add_bio_entry(fio->sbi, bio, page, fio->temp); + } else { + if (add_ipu_page(fio, &bio, page)) + goto alloc_new; + } + + if (fio->io_wbc) + wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + + inc_page_count(fio->sbi, WB_DATA_TYPE(page)); + + *fio->last_block = fio->new_blkaddr; + *fio->bio = bio; + + return 0; +} + +void f2fs_submit_page_write(struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = fio->sbi; + enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); + struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; + struct page *bio_page; + + f2fs_bug_on(sbi, is_read_io(fio->op)); + + f2fs_down_write(&io->io_rwsem); +next: + if (fio->in_list) { + spin_lock(&io->io_lock); + if (list_empty(&io->io_list)) { + spin_unlock(&io->io_lock); + goto out; + } + fio = list_first_entry(&io->io_list, + struct f2fs_io_info, list); + list_del(&fio->list); + spin_unlock(&io->io_lock); + } + + verify_fio_blkaddr(fio); + + if (fio->encrypted_page) + bio_page = fio->encrypted_page; + else if (fio->compressed_page) + bio_page = fio->compressed_page; + else + bio_page = fio->page; + + /* set submitted = true as a return value */ + fio->submitted = true; + + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + + if (io->bio && + (!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio, + fio->new_blkaddr) || + !f2fs_crypt_mergeable_bio(io->bio, fio->page->mapping->host, + bio_page->index, fio))) + __submit_merged_bio(io); +alloc_new: + if (io->bio == NULL) { + if (F2FS_IO_ALIGNED(sbi) && + (fio->type == DATA || fio->type == NODE) && + fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { + dec_page_count(sbi, WB_DATA_TYPE(bio_page)); + fio->retry = true; + goto skip; + } + io->bio = __bio_alloc(fio, BIO_MAX_VECS); + f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host, + bio_page->index, fio, GFP_NOIO); + io->fio = *fio; + } + + if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { + __submit_merged_bio(io); + goto alloc_new; + } + + if (fio->io_wbc) + wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + + io->last_block_in_bio = fio->new_blkaddr; + + trace_f2fs_submit_page_write(fio->page, fio); +skip: + if (fio->in_list) + goto next; +out: + if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || + !f2fs_is_checkpoint_ready(sbi)) + __submit_merged_bio(io); + f2fs_up_write(&io->io_rwsem); +} + +static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages, blk_opf_t op_flag, + pgoff_t first_idx, bool for_write) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct bio *bio; + struct bio_post_read_ctx *ctx = NULL; + unsigned int post_read_steps = 0; + sector_t sector; + struct block_device *bdev = f2fs_target_device(sbi, blkaddr, §or); + + bio = bio_alloc_bioset(bdev, bio_max_segs(nr_pages), + REQ_OP_READ | op_flag, + for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset); + if (!bio) + return ERR_PTR(-ENOMEM); + bio->bi_iter.bi_sector = sector; + f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS); + bio->bi_end_io = f2fs_read_end_io; + + if (fscrypt_inode_uses_fs_layer_crypto(inode)) + post_read_steps |= STEP_DECRYPT; + + if (f2fs_need_verity(inode, first_idx)) + post_read_steps |= STEP_VERITY; + + /* + * STEP_DECOMPRESS is handled specially, since a compressed file might + * contain both compressed and uncompressed clusters. We'll allocate a + * bio_post_read_ctx if the file is compressed, but the caller is + * responsible for enabling STEP_DECOMPRESS if it's actually needed. + */ + + if (post_read_steps || f2fs_compressed_file(inode)) { + /* Due to the mempool, this never fails. */ + ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); + ctx->bio = bio; + ctx->sbi = sbi; + ctx->enabled_steps = post_read_steps; + ctx->fs_blkaddr = blkaddr; + bio->bi_private = ctx; + } + iostat_alloc_and_bind_ctx(sbi, bio, ctx); + + return bio; +} + +/* This can handle encryption stuffs */ +static int f2fs_submit_page_read(struct inode *inode, struct page *page, + block_t blkaddr, blk_opf_t op_flags, + bool for_write) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct bio *bio; + + bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, + page->index, for_write); + if (IS_ERR(bio)) + return PTR_ERR(bio); + + /* wait for GCed page writeback via META_MAPPING */ + f2fs_wait_on_block_writeback(inode, blkaddr); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + return -EFAULT; + } + ClearPageError(page); + inc_page_count(sbi, F2FS_RD_DATA); + f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); + __submit_bio(sbi, bio, DATA); + return 0; +} + +static void __set_data_blkaddr(struct dnode_of_data *dn) +{ + struct f2fs_node *rn = F2FS_NODE(dn->node_page); + __le32 *addr_array; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); + + /* Get physical address of data block */ + addr_array = blkaddr_in_node(rn); + addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); +} + +/* + * Lock ordering for the change of data block address: + * ->data_page + * ->node_page + * update block addresses in the node page + */ +void f2fs_set_data_blkaddr(struct dnode_of_data *dn) +{ + f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); + __set_data_blkaddr(dn); + if (set_page_dirty(dn->node_page)) + dn->node_changed = true; +} + +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) +{ + dn->data_blkaddr = blkaddr; + f2fs_set_data_blkaddr(dn); + f2fs_update_read_extent_cache(dn); +} + +/* dn->ofs_in_node will be returned with up-to-date last block pointer */ +int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + int err; + + if (!count) + return 0; + + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) + return -EPERM; + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; + + trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, + dn->ofs_in_node, count); + + f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); + + for (; count > 0; dn->ofs_in_node++) { + block_t blkaddr = f2fs_data_blkaddr(dn); + + if (blkaddr == NULL_ADDR) { + dn->data_blkaddr = NEW_ADDR; + __set_data_blkaddr(dn); + count--; + } + } + + if (set_page_dirty(dn->node_page)) + dn->node_changed = true; + return 0; +} + +/* Should keep dn->ofs_in_node unchanged */ +int f2fs_reserve_new_block(struct dnode_of_data *dn) +{ + unsigned int ofs_in_node = dn->ofs_in_node; + int ret; + + ret = f2fs_reserve_new_blocks(dn, 1); + dn->ofs_in_node = ofs_in_node; + return ret; +} + +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) +{ + bool need_put = dn->inode_page ? false : true; + int err; + + err = f2fs_get_dnode_of_data(dn, index, ALLOC_NODE); + if (err) + return err; + + if (dn->data_blkaddr == NULL_ADDR) + err = f2fs_reserve_new_block(dn); + if (err || need_put) + f2fs_put_dnode(dn); + return err; +} + +int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) +{ + struct extent_info ei = {0, }; + struct inode *inode = dn->inode; + + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { + dn->data_blkaddr = ei.blk + index - ei.fofs; + return 0; + } + + return f2fs_reserve_block(dn, index); +} + +struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, + blk_opf_t op_flags, bool for_write, + pgoff_t *next_pgofs) +{ + struct address_space *mapping = inode->i_mapping; + struct dnode_of_data dn; + struct page *page; + struct extent_info ei = {0, }; + int err; + + page = f2fs_grab_cache_page(mapping, index, for_write); + if (!page) + return ERR_PTR(-ENOMEM); + + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, + DATA_GENERIC_ENHANCE_READ)) { + err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); + goto put_err; + } + goto got_it; + } + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) { + if (err == -ENOENT && next_pgofs) + *next_pgofs = f2fs_get_next_page_offset(&dn, index); + goto put_err; + } + f2fs_put_dnode(&dn); + + if (unlikely(dn.data_blkaddr == NULL_ADDR)) { + err = -ENOENT; + if (next_pgofs) + *next_pgofs = index + 1; + goto put_err; + } + if (dn.data_blkaddr != NEW_ADDR && + !f2fs_is_valid_blkaddr(F2FS_I_SB(inode), + dn.data_blkaddr, + DATA_GENERIC_ENHANCE)) { + err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); + goto put_err; + } +got_it: + if (PageUptodate(page)) { + unlock_page(page); + return page; + } + + /* + * A new dentry page is allocated but not able to be written, since its + * new inode page couldn't be allocated due to -ENOSPC. + * In such the case, its blkaddr can be remained as NEW_ADDR. + * see, f2fs_add_link -> f2fs_get_new_data_page -> + * f2fs_init_inode_metadata. + */ + if (dn.data_blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); + unlock_page(page); + return page; + } + + err = f2fs_submit_page_read(inode, page, dn.data_blkaddr, + op_flags, for_write); + if (err) + goto put_err; + return page; + +put_err: + f2fs_put_page(page, 1); + return ERR_PTR(err); +} + +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + + page = find_get_page(mapping, index); + if (page && PageUptodate(page)) + return page; + f2fs_put_page(page, 0); + + page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs); + if (IS_ERR(page)) + return page; + + if (PageUptodate(page)) + return page; + + wait_on_page_locked(page); + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 0); + return ERR_PTR(-EIO); + } + return page; +} + +/* + * If it tries to access a hole, return an error. + * Because, the callers, functions in dir.c and GC, should be able to know + * whether this page exists or not. + */ +struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, + bool for_write) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + + page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL); + if (IS_ERR(page)) + return page; + + /* wait for read completion */ + lock_page(page); + if (unlikely(page->mapping != mapping || !PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + return page; +} + +/* + * Caller ensures that this data page is never allocated. + * A new zero-filled data page is allocated in the page cache. + * + * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + * Note that, ipage is set only by make_empty_dir, and if any error occur, + * ipage should be released by this function. + */ +struct page *f2fs_get_new_data_page(struct inode *inode, + struct page *ipage, pgoff_t index, bool new_i_size) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + struct dnode_of_data dn; + int err; + + page = f2fs_grab_cache_page(mapping, index, true); + if (!page) { + /* + * before exiting, we should make sure ipage will be released + * if any error occur. + */ + f2fs_put_page(ipage, 1); + return ERR_PTR(-ENOMEM); + } + + set_new_dnode(&dn, inode, ipage, NULL, 0); + err = f2fs_reserve_block(&dn, index); + if (err) { + f2fs_put_page(page, 1); + return ERR_PTR(err); + } + if (!ipage) + f2fs_put_dnode(&dn); + + if (PageUptodate(page)) + goto got_it; + + if (dn.data_blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); + } else { + f2fs_put_page(page, 1); + + /* if ipage exists, blkaddr should be NEW_ADDR */ + f2fs_bug_on(F2FS_I_SB(inode), ipage); + page = f2fs_get_lock_data_page(inode, index, true); + if (IS_ERR(page)) + return page; + } +got_it: + if (new_i_size && i_size_read(inode) < + ((loff_t)(index + 1) << PAGE_SHIFT)) + f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT)); + return page; +} + +static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct f2fs_summary sum; + struct node_info ni; + block_t old_blkaddr; + blkcnt_t count = 1; + int err; + + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) + return -EPERM; + + err = f2fs_get_node_info(sbi, dn->nid, &ni, false); + if (err) + return err; + + dn->data_blkaddr = f2fs_data_blkaddr(dn); + if (dn->data_blkaddr != NULL_ADDR) + goto alloc; + + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; + +alloc: + set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); + old_blkaddr = dn->data_blkaddr; + f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, + &sum, seg_type, NULL); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { + invalidate_mapping_pages(META_MAPPING(sbi), + old_blkaddr, old_blkaddr); + f2fs_invalidate_compress_page(sbi, old_blkaddr); + } + f2fs_update_data_blkaddr(dn, dn->data_blkaddr); + return 0; +} + +void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +{ + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (lock) + f2fs_down_read(&sbi->node_change); + else + f2fs_up_read(&sbi->node_change); + } else { + if (lock) + f2fs_lock_op(sbi); + else + f2fs_unlock_op(sbi); + } +} + +/* + * f2fs_map_blocks() tries to find or build mapping relationship which + * maps continuous logical blocks to physical blocks, and return such + * info via f2fs_map_blocks structure. + */ +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, + int create, int flag) +{ + unsigned int maxblocks = map->m_len; + struct dnode_of_data dn; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int mode = map->m_may_create ? ALLOC_NODE : LOOKUP_NODE; + pgoff_t pgofs, end_offset, end; + int err = 0, ofs = 1; + unsigned int ofs_in_node, last_ofs_in_node; + blkcnt_t prealloc; + struct extent_info ei = {0, }; + block_t blkaddr; + unsigned int start_pgofs; + int bidx = 0; + + if (!maxblocks) + return 0; + + map->m_bdev = inode->i_sb->s_bdev; + map->m_multidev_dio = + f2fs_allow_multi_device_dio(F2FS_I_SB(inode), flag); + + map->m_len = 0; + map->m_flags = 0; + + /* it only supports block size == page size */ + pgofs = (pgoff_t)map->m_lblk; + end = pgofs + maxblocks; + + if (!create && f2fs_lookup_read_extent_cache(inode, pgofs, &ei)) { + if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && + map->m_may_create) + goto next_dnode; + + map->m_pblk = ei.blk + pgofs - ei.fofs; + map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); + map->m_flags = F2FS_MAP_MAPPED; + if (map->m_next_extent) + *map->m_next_extent = pgofs + map->m_len; + + /* for hardware encryption, but to avoid potential issue in future */ + if (flag == F2FS_GET_BLOCK_DIO) + f2fs_wait_on_block_writeback_range(inode, + map->m_pblk, map->m_len); + + if (map->m_multidev_dio) { + block_t blk_addr = map->m_pblk; + + bidx = f2fs_target_device_index(sbi, map->m_pblk); + + map->m_bdev = FDEV(bidx).bdev; + map->m_pblk -= FDEV(bidx).start_blk; + map->m_len = min(map->m_len, + FDEV(bidx).end_blk + 1 - map->m_pblk); + + if (map->m_may_create) + f2fs_update_device_state(sbi, inode->i_ino, + blk_addr, map->m_len); + } + goto out; + } + +next_dnode: + if (map->m_may_create) + f2fs_do_map_lock(sbi, flag, true); + + /* When reading holes, we need its node page */ + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, pgofs, mode); + if (err) { + if (flag == F2FS_GET_BLOCK_BMAP) + map->m_pblk = 0; + + if (err == -ENOENT) { + /* + * There is one exceptional case that read_node_page() + * may return -ENOENT due to filesystem has been + * shutdown or cp_error, so force to convert error + * number to EIO for such case. + */ + if (map->m_may_create && + (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || + f2fs_cp_error(sbi))) { + err = -EIO; + goto unlock_out; + } + + err = 0; + if (map->m_next_pgofs) + *map->m_next_pgofs = + f2fs_get_next_page_offset(&dn, pgofs); + if (map->m_next_extent) + *map->m_next_extent = + f2fs_get_next_page_offset(&dn, pgofs); + } + goto unlock_out; + } + + start_pgofs = pgofs; + prealloc = 0; + last_ofs_in_node = ofs_in_node = dn.ofs_in_node; + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + +next_block: + blkaddr = f2fs_data_blkaddr(&dn); + + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + goto sync_out; + } + + if (__is_valid_data_blkaddr(blkaddr)) { + /* use out-place-update for driect IO under LFS mode */ + if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && + map->m_may_create) { + err = __allocate_data_block(&dn, map->m_seg_type); + if (err) + goto sync_out; + blkaddr = dn.data_blkaddr; + set_inode_flag(inode, FI_APPEND_WRITE); + } + } else { + if (create) { + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto sync_out; + } + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (blkaddr == NULL_ADDR) { + prealloc++; + last_ofs_in_node = dn.ofs_in_node; + } + } else { + WARN_ON(flag != F2FS_GET_BLOCK_PRE_DIO && + flag != F2FS_GET_BLOCK_DIO); + err = __allocate_data_block(&dn, + map->m_seg_type); + if (!err) { + if (flag == F2FS_GET_BLOCK_PRE_DIO) + file_need_truncate(inode); + set_inode_flag(inode, FI_APPEND_WRITE); + } + } + if (err) + goto sync_out; + map->m_flags |= F2FS_MAP_NEW; + blkaddr = dn.data_blkaddr; + } else { + if (f2fs_compressed_file(inode) && + f2fs_sanity_check_cluster(&dn) && + (flag != F2FS_GET_BLOCK_FIEMAP || + IS_ENABLED(CONFIG_F2FS_CHECK_FS))) { + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_CORRUPTED_CLUSTER); + goto sync_out; + } + if (flag == F2FS_GET_BLOCK_BMAP) { + map->m_pblk = 0; + goto sync_out; + } + if (flag == F2FS_GET_BLOCK_PRECACHE) + goto sync_out; + if (flag == F2FS_GET_BLOCK_FIEMAP && + blkaddr == NULL_ADDR) { + if (map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; + goto sync_out; + } + if (flag != F2FS_GET_BLOCK_FIEMAP) { + /* for defragment case */ + if (map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; + goto sync_out; + } + } + } + + if (flag == F2FS_GET_BLOCK_PRE_AIO) + goto skip; + + if (map->m_multidev_dio) + bidx = f2fs_target_device_index(sbi, blkaddr); + + if (map->m_len == 0) { + /* preallocated unwritten block should be mapped for fiemap. */ + if (blkaddr == NEW_ADDR) + map->m_flags |= F2FS_MAP_UNWRITTEN; + map->m_flags |= F2FS_MAP_MAPPED; + + map->m_pblk = blkaddr; + map->m_len = 1; + + if (map->m_multidev_dio) + map->m_bdev = FDEV(bidx).bdev; + } else if ((map->m_pblk != NEW_ADDR && + blkaddr == (map->m_pblk + ofs)) || + (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) || + flag == F2FS_GET_BLOCK_PRE_DIO) { + if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev) + goto sync_out; + ofs++; + map->m_len++; + } else { + goto sync_out; + } + +skip: + dn.ofs_in_node++; + pgofs++; + + /* preallocate blocks in batch for one dnode page */ + if (flag == F2FS_GET_BLOCK_PRE_AIO && + (pgofs == end || dn.ofs_in_node == end_offset)) { + + dn.ofs_in_node = ofs_in_node; + err = f2fs_reserve_new_blocks(&dn, prealloc); + if (err) + goto sync_out; + + map->m_len += dn.ofs_in_node - ofs_in_node; + if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { + err = -ENOSPC; + goto sync_out; + } + dn.ofs_in_node = end_offset; + } + + if (pgofs >= end) + goto sync_out; + else if (dn.ofs_in_node < end_offset) + goto next_block; + + if (flag == F2FS_GET_BLOCK_PRECACHE) { + if (map->m_flags & F2FS_MAP_MAPPED) { + unsigned int ofs = start_pgofs - map->m_lblk; + + f2fs_update_read_extent_cache_range(&dn, + start_pgofs, map->m_pblk + ofs, + map->m_len - ofs); + } + } + + f2fs_put_dnode(&dn); + + if (map->m_may_create) { + f2fs_do_map_lock(sbi, flag, false); + f2fs_balance_fs(sbi, dn.node_changed); + } + goto next_dnode; + +sync_out: + + if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) { + /* + * for hardware encryption, but to avoid potential issue + * in future + */ + f2fs_wait_on_block_writeback_range(inode, + map->m_pblk, map->m_len); + + if (map->m_multidev_dio) { + block_t blk_addr = map->m_pblk; + + bidx = f2fs_target_device_index(sbi, map->m_pblk); + + map->m_bdev = FDEV(bidx).bdev; + map->m_pblk -= FDEV(bidx).start_blk; + + if (map->m_may_create) + f2fs_update_device_state(sbi, inode->i_ino, + blk_addr, map->m_len); + + f2fs_bug_on(sbi, blk_addr + map->m_len > + FDEV(bidx).end_blk + 1); + } + } + + if (flag == F2FS_GET_BLOCK_PRECACHE) { + if (map->m_flags & F2FS_MAP_MAPPED) { + unsigned int ofs = start_pgofs - map->m_lblk; + + f2fs_update_read_extent_cache_range(&dn, + start_pgofs, map->m_pblk + ofs, + map->m_len - ofs); + } + if (map->m_next_extent) + *map->m_next_extent = pgofs + 1; + } + f2fs_put_dnode(&dn); +unlock_out: + if (map->m_may_create) { + f2fs_do_map_lock(sbi, flag, false); + f2fs_balance_fs(sbi, dn.node_changed); + } +out: + trace_f2fs_map_blocks(inode, map, create, flag, err); + return err; +} + +bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) +{ + struct f2fs_map_blocks map; + block_t last_lblk; + int err; + + if (pos + len > i_size_read(inode)) + return false; + + map.m_lblk = F2FS_BYTES_TO_BLK(pos); + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; + last_lblk = F2FS_BLK_ALIGN(pos + len); + + while (map.m_lblk < last_lblk) { + map.m_len = last_lblk - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); + if (err || map.m_len == 0) + return false; + map.m_lblk += map.m_len; + } + return true; +} + +static inline u64 bytes_to_blks(struct inode *inode, u64 bytes) +{ + return (bytes >> inode->i_blkbits); +} + +static inline u64 blks_to_bytes(struct inode *inode, u64 blks) +{ + return (blks << inode->i_blkbits); +} + +static int f2fs_xattr_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct page *page; + struct node_info ni; + __u64 phys = 0, len; + __u32 flags; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + int err = 0; + + if (f2fs_has_inline_xattr(inode)) { + int offset; + + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), + inode->i_ino, false); + if (!page) + return -ENOMEM; + + err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); + if (err) { + f2fs_put_page(page, 1); + return err; + } + + phys = blks_to_bytes(inode, ni.blk_addr); + offset = offsetof(struct f2fs_inode, i_addr) + + sizeof(__le32) * (DEF_ADDRS_PER_INODE - + get_inline_xattr_addrs(inode)); + + phys += offset; + len = inline_xattr_size(inode); + + f2fs_put_page(page, 1); + + flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED; + + if (!xnid) + flags |= FIEMAP_EXTENT_LAST; + + err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); + trace_f2fs_fiemap(inode, 0, phys, len, flags, err); + if (err) + return err; + } + + if (xnid) { + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), xnid, false); + if (!page) + return -ENOMEM; + + err = f2fs_get_node_info(sbi, xnid, &ni, false); + if (err) { + f2fs_put_page(page, 1); + return err; + } + + phys = blks_to_bytes(inode, ni.blk_addr); + len = inode->i_sb->s_blocksize; + + f2fs_put_page(page, 1); + + flags = FIEMAP_EXTENT_LAST; + } + + if (phys) { + err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); + trace_f2fs_fiemap(inode, 0, phys, len, flags, err); + } + + return (err < 0 ? err : 0); +} + +static loff_t max_inode_blocks(struct inode *inode) +{ + loff_t result = ADDRS_PER_INODE(inode); + loff_t leaf_count = ADDRS_PER_BLOCK(inode); + + /* two direct node blocks */ + result += (leaf_count * 2); + + /* two indirect node blocks */ + leaf_count *= NIDS_PER_BLOCK; + result += (leaf_count * 2); + + /* one double indirect node block */ + leaf_count *= NIDS_PER_BLOCK; + result += leaf_count; + + return result; +} + +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + struct f2fs_map_blocks map; + sector_t start_blk, last_blk; + pgoff_t next_pgofs; + u64 logical = 0, phys = 0, size = 0; + u32 flags = 0; + int ret = 0; + bool compr_cluster = false, compr_appended; + unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + unsigned int count_in_cluster = 0; + loff_t maxbytes; + + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + ret = f2fs_precache_extents(inode); + if (ret) + return ret; + } + + ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_XATTR); + if (ret) + return ret; + + inode_lock(inode); + + maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS; + if (start > maxbytes) { + ret = -EFBIG; + goto out; + } + + if (len > maxbytes || (maxbytes - len) < start) + len = maxbytes - start; + + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + ret = f2fs_xattr_fiemap(inode, fieinfo); + goto out; + } + + if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) { + ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len); + if (ret != -EAGAIN) + goto out; + } + + if (bytes_to_blks(inode, len) == 0) + len = blks_to_bytes(inode, 1); + + start_blk = bytes_to_blks(inode, start); + last_blk = bytes_to_blks(inode, start + len - 1); + +next: + memset(&map, 0, sizeof(map)); + map.m_lblk = start_blk; + map.m_len = bytes_to_blks(inode, len); + map.m_next_pgofs = &next_pgofs; + map.m_seg_type = NO_CHECK_TYPE; + + if (compr_cluster) { + map.m_lblk += 1; + map.m_len = cluster_size - count_in_cluster; + } + + ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); + if (ret) + goto out; + + /* HOLE */ + if (!compr_cluster && !(map.m_flags & F2FS_MAP_FLAGS)) { + start_blk = next_pgofs; + + if (blks_to_bytes(inode, start_blk) < blks_to_bytes(inode, + max_inode_blocks(inode))) + goto prep_next; + + flags |= FIEMAP_EXTENT_LAST; + } + + compr_appended = false; + /* In a case of compressed cluster, append this to the last extent */ + if (compr_cluster && ((map.m_flags & F2FS_MAP_UNWRITTEN) || + !(map.m_flags & F2FS_MAP_FLAGS))) { + compr_appended = true; + goto skip_fill; + } + + if (size) { + flags |= FIEMAP_EXTENT_MERGED; + if (IS_ENCRYPTED(inode)) + flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; + + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + trace_f2fs_fiemap(inode, logical, phys, size, flags, ret); + if (ret) + goto out; + size = 0; + } + + if (start_blk > last_blk) + goto out; + +skip_fill: + if (map.m_pblk == COMPRESS_ADDR) { + compr_cluster = true; + count_in_cluster = 1; + } else if (compr_appended) { + unsigned int appended_blks = cluster_size - + count_in_cluster + 1; + size += blks_to_bytes(inode, appended_blks); + start_blk += appended_blks; + compr_cluster = false; + } else { + logical = blks_to_bytes(inode, start_blk); + phys = __is_valid_data_blkaddr(map.m_pblk) ? + blks_to_bytes(inode, map.m_pblk) : 0; + size = blks_to_bytes(inode, map.m_len); + flags = 0; + + if (compr_cluster) { + flags = FIEMAP_EXTENT_ENCODED; + count_in_cluster += map.m_len; + if (count_in_cluster == cluster_size) { + compr_cluster = false; + size += blks_to_bytes(inode, 1); + } + } else if (map.m_flags & F2FS_MAP_UNWRITTEN) { + flags = FIEMAP_EXTENT_UNWRITTEN; + } + + start_blk += bytes_to_blks(inode, size); + } + +prep_next: + cond_resched(); + if (fatal_signal_pending(current)) + ret = -EINTR; + else + goto next; +out: + if (ret == 1) + ret = 0; + + inode_unlock(inode); + return ret; +} + +static inline loff_t f2fs_readpage_limit(struct inode *inode) +{ + if (IS_ENABLED(CONFIG_FS_VERITY) && + (IS_VERITY(inode) || f2fs_verity_in_progress(inode))) + return inode->i_sb->s_maxbytes; + + return i_size_read(inode); +} + +static int f2fs_read_single_page(struct inode *inode, struct page *page, + unsigned nr_pages, + struct f2fs_map_blocks *map, + struct bio **bio_ret, + sector_t *last_block_in_bio, + bool is_readahead) +{ + struct bio *bio = *bio_ret; + const unsigned blocksize = blks_to_bytes(inode, 1); + sector_t block_in_file; + sector_t last_block; + sector_t last_block_in_file; + sector_t block_nr; + int ret = 0; + + block_in_file = (sector_t)page_index(page); + last_block = block_in_file + nr_pages; + last_block_in_file = bytes_to_blks(inode, + f2fs_readpage_limit(inode) + blocksize - 1); + if (last_block > last_block_in_file) + last_block = last_block_in_file; + + /* just zeroing out page which is beyond EOF */ + if (block_in_file >= last_block) + goto zero_out; + /* + * Map blocks using the previous result first. + */ + if ((map->m_flags & F2FS_MAP_MAPPED) && + block_in_file > map->m_lblk && + block_in_file < (map->m_lblk + map->m_len)) + goto got_it; + + /* + * Then do more f2fs_map_blocks() calls until we are + * done with this page. + */ + map->m_lblk = block_in_file; + map->m_len = last_block - block_in_file; + + ret = f2fs_map_blocks(inode, map, 0, F2FS_GET_BLOCK_DEFAULT); + if (ret) + goto out; +got_it: + if ((map->m_flags & F2FS_MAP_MAPPED)) { + block_nr = map->m_pblk + block_in_file - map->m_lblk; + SetPageMappedToDisk(page); + + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, + DATA_GENERIC_ENHANCE_READ)) { + ret = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); + goto out; + } + } else { +zero_out: + zero_user_segment(page, 0, PAGE_SIZE); + if (f2fs_need_verity(inode, page->index) && + !fsverity_verify_page(page)) { + ret = -EIO; + goto out; + } + if (!PageUptodate(page)) + SetPageUptodate(page); + unlock_page(page); + goto out; + } + + /* + * This page will go to BIO. Do we need to send this + * BIO off first? + */ + if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio, + *last_block_in_bio, block_nr) || + !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) { +submit_and_realloc: + __submit_bio(F2FS_I_SB(inode), bio, DATA); + bio = NULL; + } + if (bio == NULL) { + bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, + is_readahead ? REQ_RAHEAD : 0, page->index, + false); + if (IS_ERR(bio)) { + ret = PTR_ERR(bio); + bio = NULL; + goto out; + } + } + + /* + * If the page is under writeback, we need to wait for + * its completion to see the correct decrypted data. + */ + f2fs_wait_on_block_writeback(inode, block_nr); + + if (bio_add_page(bio, page, blocksize, 0) < blocksize) + goto submit_and_realloc; + + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, + F2FS_BLKSIZE); + ClearPageError(page); + *last_block_in_bio = block_nr; + goto out; +out: + *bio_ret = bio; + return ret; +} + +#ifdef CONFIG_F2FS_FS_COMPRESSION +int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, + unsigned nr_pages, sector_t *last_block_in_bio, + bool is_readahead, bool for_write) +{ + struct dnode_of_data dn; + struct inode *inode = cc->inode; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct bio *bio = *bio_ret; + unsigned int start_idx = cc->cluster_idx << cc->log_cluster_size; + sector_t last_block_in_file; + const unsigned blocksize = blks_to_bytes(inode, 1); + struct decompress_io_ctx *dic = NULL; + struct extent_info ei = {0, }; + bool from_dnode = true; + int i; + int ret = 0; + + f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); + + last_block_in_file = bytes_to_blks(inode, + f2fs_readpage_limit(inode) + blocksize - 1); + + /* get rid of pages beyond EOF */ + for (i = 0; i < cc->cluster_size; i++) { + struct page *page = cc->rpages[i]; + + if (!page) + continue; + if ((sector_t)page->index >= last_block_in_file) { + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); + } else if (!PageUptodate(page)) { + continue; + } + unlock_page(page); + if (for_write) + put_page(page); + cc->rpages[i] = NULL; + cc->nr_rpages--; + } + + /* we are done since all pages are beyond EOF */ + if (f2fs_cluster_is_empty(cc)) + goto out; + + if (f2fs_lookup_read_extent_cache(inode, start_idx, &ei)) + from_dnode = false; + + if (!from_dnode) + goto skip_reading_dnode; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); + if (ret) + goto out; + + f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR); + +skip_reading_dnode: + for (i = 1; i < cc->cluster_size; i++) { + block_t blkaddr; + + blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page, + dn.ofs_in_node + i) : + ei.blk + i - 1; + + if (!__is_valid_data_blkaddr(blkaddr)) + break; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) { + ret = -EFAULT; + goto out_put_dnode; + } + cc->nr_cpages++; + + if (!from_dnode && i >= ei.c_len) + break; + } + + /* nothing to decompress */ + if (cc->nr_cpages == 0) { + ret = 0; + goto out_put_dnode; + } + + dic = f2fs_alloc_dic(cc); + if (IS_ERR(dic)) { + ret = PTR_ERR(dic); + goto out_put_dnode; + } + + for (i = 0; i < cc->nr_cpages; i++) { + struct page *page = dic->cpages[i]; + block_t blkaddr; + struct bio_post_read_ctx *ctx; + + blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page, + dn.ofs_in_node + i + 1) : + ei.blk + i; + + f2fs_wait_on_block_writeback(inode, blkaddr); + + if (f2fs_load_compressed_page(sbi, page, blkaddr)) { + if (atomic_dec_and_test(&dic->remaining_pages)) { + f2fs_decompress_cluster(dic, true); + break; + } + continue; + } + + if (bio && (!page_is_mergeable(sbi, bio, + *last_block_in_bio, blkaddr) || + !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) { +submit_and_realloc: + __submit_bio(sbi, bio, DATA); + bio = NULL; + } + + if (!bio) { + bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, + is_readahead ? REQ_RAHEAD : 0, + page->index, for_write); + if (IS_ERR(bio)) { + ret = PTR_ERR(bio); + f2fs_decompress_end_io(dic, ret, true); + f2fs_put_dnode(&dn); + *bio_ret = NULL; + return ret; + } + } + + if (bio_add_page(bio, page, blocksize, 0) < blocksize) + goto submit_and_realloc; + + ctx = get_post_read_ctx(bio); + ctx->enabled_steps |= STEP_DECOMPRESS; + refcount_inc(&dic->refcnt); + + inc_page_count(sbi, F2FS_RD_DATA); + f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); + ClearPageError(page); + *last_block_in_bio = blkaddr; + } + + if (from_dnode) + f2fs_put_dnode(&dn); + + *bio_ret = bio; + return 0; + +out_put_dnode: + if (from_dnode) + f2fs_put_dnode(&dn); +out: + for (i = 0; i < cc->cluster_size; i++) { + if (cc->rpages[i]) { + ClearPageUptodate(cc->rpages[i]); + ClearPageError(cc->rpages[i]); + unlock_page(cc->rpages[i]); + } + } + *bio_ret = bio; + return ret; +} +#endif + +/* + * This function was originally taken from fs/mpage.c, and customized for f2fs. + * Major change was from block_size == page_size in f2fs by default. + */ +static int f2fs_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct page *page) +{ + struct bio *bio = NULL; + sector_t last_block_in_bio = 0; + struct f2fs_map_blocks map; +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = NULL_CLUSTER, + .rpages = NULL, + .cpages = NULL, + .nr_rpages = 0, + .nr_cpages = 0, + }; + pgoff_t nc_cluster_idx = NULL_CLUSTER; +#endif + unsigned nr_pages = rac ? readahead_count(rac) : 1; + unsigned max_nr_pages = nr_pages; + int ret = 0; + + map.m_pblk = 0; + map.m_lblk = 0; + map.m_len = 0; + map.m_flags = 0; + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; + + for (; nr_pages; nr_pages--) { + if (rac) { + page = readahead_page(rac); + prefetchw(&page->flags); + } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + /* there are remained comressed pages, submit them */ + if (!f2fs_cluster_can_merge_page(&cc, page->index)) { + ret = f2fs_read_multi_pages(&cc, &bio, + max_nr_pages, + &last_block_in_bio, + rac != NULL, false); + f2fs_destroy_compress_ctx(&cc, false); + if (ret) + goto set_error_page; + } + if (cc.cluster_idx == NULL_CLUSTER) { + if (nc_cluster_idx == + page->index >> cc.log_cluster_size) { + goto read_single_page; + } + + ret = f2fs_is_compressed_cluster(inode, page->index); + if (ret < 0) + goto set_error_page; + else if (!ret) { + nc_cluster_idx = + page->index >> cc.log_cluster_size; + goto read_single_page; + } + + nc_cluster_idx = NULL_CLUSTER; + } + ret = f2fs_init_compress_ctx(&cc); + if (ret) + goto set_error_page; + + f2fs_compress_ctx_add_page(&cc, page); + + goto next_page; + } +read_single_page: +#endif + + ret = f2fs_read_single_page(inode, page, max_nr_pages, &map, + &bio, &last_block_in_bio, rac); + if (ret) { +#ifdef CONFIG_F2FS_FS_COMPRESSION +set_error_page: +#endif + SetPageError(page); + zero_user_segment(page, 0, PAGE_SIZE); + unlock_page(page); + } +#ifdef CONFIG_F2FS_FS_COMPRESSION +next_page: +#endif + if (rac) + put_page(page); + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + /* last page */ + if (nr_pages == 1 && !f2fs_cluster_is_empty(&cc)) { + ret = f2fs_read_multi_pages(&cc, &bio, + max_nr_pages, + &last_block_in_bio, + rac != NULL, false); + f2fs_destroy_compress_ctx(&cc, false); + } + } +#endif + } + if (bio) + __submit_bio(F2FS_I_SB(inode), bio, DATA); + return ret; +} + +static int f2fs_read_data_folio(struct file *file, struct folio *folio) +{ + struct page *page = &folio->page; + struct inode *inode = page_file_mapping(page)->host; + int ret = -EAGAIN; + + trace_f2fs_readpage(page, DATA); + + if (!f2fs_is_compress_backend_ready(inode)) { + unlock_page(page); + return -EOPNOTSUPP; + } + + /* If the file has inline data, try to read it directly */ + if (f2fs_has_inline_data(inode)) + ret = f2fs_read_inline_data(inode, page); + if (ret == -EAGAIN) + ret = f2fs_mpage_readpages(inode, NULL, page); + return ret; +} + +static void f2fs_readahead(struct readahead_control *rac) +{ + struct inode *inode = rac->mapping->host; + + trace_f2fs_readpages(inode, readahead_index(rac), readahead_count(rac)); + + if (!f2fs_is_compress_backend_ready(inode)) + return; + + /* If the file has inline data, skip readahead */ + if (f2fs_has_inline_data(inode)) + return; + + f2fs_mpage_readpages(inode, rac, NULL); +} + +int f2fs_encrypt_one_page(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + struct page *mpage, *page; + gfp_t gfp_flags = GFP_NOFS; + + if (!f2fs_encrypted_file(inode)) + return 0; + + page = fio->compressed_page ? fio->compressed_page : fio->page; + + if (fscrypt_inode_uses_inline_crypto(inode)) + return 0; + +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page, + PAGE_SIZE, 0, gfp_flags); + if (IS_ERR(fio->encrypted_page)) { + /* flush pending IOs and wait for a while in the ENOMEM case */ + if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { + f2fs_flush_merged_writes(fio->sbi); + memalloc_retry_wait(GFP_NOFS); + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } + return PTR_ERR(fio->encrypted_page); + } + + mpage = find_lock_page(META_MAPPING(fio->sbi), fio->old_blkaddr); + if (mpage) { + if (PageUptodate(mpage)) + memcpy(page_address(mpage), + page_address(fio->encrypted_page), PAGE_SIZE); + f2fs_put_page(mpage, 1); + } + return 0; +} + +static inline bool check_inplace_update_policy(struct inode *inode, + struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int policy = SM_I(sbi)->ipu_policy; + + if (policy & (0x1 << F2FS_IPU_HONOR_OPU_WRITE) && + is_inode_flag_set(inode, FI_OPU_WRITE)) + return false; + if (policy & (0x1 << F2FS_IPU_FORCE)) + return true; + if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi)) + return true; + if (policy & (0x1 << F2FS_IPU_UTIL) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && f2fs_need_SSR(sbi) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + + /* + * IPU for rewrite async pages + */ + if (policy & (0x1 << F2FS_IPU_ASYNC) && + fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC) && + !IS_ENCRYPTED(inode)) + return true; + + /* this is only set during fdatasync */ + if (policy & (0x1 << F2FS_IPU_FSYNC) && + is_inode_flag_set(inode, FI_NEED_IPU)) + return true; + + if (unlikely(fio && is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + !f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) + return true; + + return false; +} + +bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) +{ + /* swap file is migrating in aligned write mode */ + if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) + return false; + + if (f2fs_is_pinned_file(inode)) + return true; + + /* if this is cold file, we should overwrite to avoid fragmentation */ + if (file_is_cold(inode) && !is_inode_flag_set(inode, FI_OPU_WRITE)) + return true; + + return check_inplace_update_policy(inode, fio); +} + +bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* The below cases were checked when setting it. */ + if (f2fs_is_pinned_file(inode)) + return false; + if (fio && is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + return true; + if (f2fs_lfs_mode(sbi)) + return true; + if (S_ISDIR(inode->i_mode)) + return true; + if (IS_NOQUOTA(inode)) + return true; + if (f2fs_is_atomic_file(inode)) + return true; + + /* swap file is migrating in aligned write mode */ + if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) + return true; + + if (is_inode_flag_set(inode, FI_OPU_WRITE)) + return true; + + if (fio) { + if (page_private_gcing(fio->page)) + return true; + if (page_private_dummy(fio->page)) + return true; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) + return true; + } + return false; +} + +static inline bool need_inplace_update(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + + if (f2fs_should_update_outplace(inode, fio)) + return false; + + return f2fs_should_update_inplace(inode, fio); +} + +int f2fs_do_write_data_page(struct f2fs_io_info *fio) +{ + struct page *page = fio->page; + struct inode *inode = page->mapping->host; + struct dnode_of_data dn; + struct extent_info ei = {0, }; + struct node_info ni; + bool ipu_force = false; + int err = 0; + + /* Use COW inode to make dnode_of_data for atomic write */ + if (f2fs_is_atomic_file(inode)) + set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0); + else + set_new_dnode(&dn, inode, NULL, NULL, 0); + + if (need_inplace_update(fio) && + f2fs_lookup_read_extent_cache(inode, page->index, &ei)) { + fio->old_blkaddr = ei.blk + page->index - ei.fofs; + + if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, + DATA_GENERIC_ENHANCE)) { + f2fs_handle_error(fio->sbi, + ERROR_INVALID_BLKADDR); + return -EFSCORRUPTED; + } + + ipu_force = true; + fio->need_lock = LOCK_DONE; + goto got_it; + } + + /* Deadlock due to between page->lock and f2fs_lock_op */ + if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) + return -EAGAIN; + + err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + if (err) + goto out; + + fio->old_blkaddr = dn.data_blkaddr; + + /* This page is already truncated */ + if (fio->old_blkaddr == NULL_ADDR) { + ClearPageUptodate(page); + clear_page_private_gcing(page); + goto out_writepage; + } +got_it: + if (__is_valid_data_blkaddr(fio->old_blkaddr) && + !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, + DATA_GENERIC_ENHANCE)) { + err = -EFSCORRUPTED; + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); + goto out_writepage; + } + + /* wait for GCed page writeback via META_MAPPING */ + if (fio->post_read) + f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); + + /* + * If current allocation needs SSR, + * it had better in-place writes for updated data. + */ + if (ipu_force || + (__is_valid_data_blkaddr(fio->old_blkaddr) && + need_inplace_update(fio))) { + err = f2fs_encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); + ClearPageError(page); + f2fs_put_dnode(&dn); + if (fio->need_lock == LOCK_REQ) + f2fs_unlock_op(fio->sbi); + err = f2fs_inplace_write_data(fio); + if (err) { + if (fscrypt_inode_uses_fs_layer_crypto(inode)) + fscrypt_finalize_bounce_page(&fio->encrypted_page); + if (PageWriteback(page)) + end_page_writeback(page); + } else { + set_inode_flag(inode, FI_UPDATE_WRITE); + } + trace_f2fs_do_write_data_page(fio->page, IPU); + return err; + } + + if (fio->need_lock == LOCK_RETRY) { + if (!f2fs_trylock_op(fio->sbi)) { + err = -EAGAIN; + goto out_writepage; + } + fio->need_lock = LOCK_REQ; + } + + err = f2fs_get_node_info(fio->sbi, dn.nid, &ni, false); + if (err) + goto out_writepage; + + fio->version = ni.version; + + err = f2fs_encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); + ClearPageError(page); + + if (fio->compr_blocks && fio->old_blkaddr == COMPRESS_ADDR) + f2fs_i_compr_blocks_update(inode, fio->compr_blocks - 1, false); + + /* LFS mode write path */ + f2fs_outplace_write_data(&dn, fio); + trace_f2fs_do_write_data_page(page, OPU); + set_inode_flag(inode, FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); +out_writepage: + f2fs_put_dnode(&dn); +out: + if (fio->need_lock == LOCK_REQ) + f2fs_unlock_op(fio->sbi); + return err; +} + +int f2fs_write_single_data_page(struct page *page, int *submitted, + struct bio **bio, + sector_t *last_block, + struct writeback_control *wbc, + enum iostat_type io_type, + int compr_blocks, + bool allow_balance) +{ + struct inode *inode = page->mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + loff_t i_size = i_size_read(inode); + const pgoff_t end_index = ((unsigned long long)i_size) + >> PAGE_SHIFT; + loff_t psize = (loff_t)(page->index + 1) << PAGE_SHIFT; + unsigned offset = 0; + bool need_balance_fs = false; + bool quota_inode = IS_NOQUOTA(inode); + int err = 0; + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = inode->i_ino, + .type = DATA, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NULL_ADDR, + .page = page, + .encrypted_page = NULL, + .submitted = false, + .compr_blocks = compr_blocks, + .need_lock = LOCK_RETRY, + .post_read = f2fs_post_read_required(inode), + .io_type = io_type, + .io_wbc = wbc, + .bio = bio, + .last_block = last_block, + }; + + trace_f2fs_writepage(page, DATA); + + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(page->mapping, -EIO); + /* + * don't drop any dirty dentry pages for keeping lastest + * directory structure. + */ + if (S_ISDIR(inode->i_mode) && + !is_sbi_flag_set(sbi, SBI_IS_CLOSE)) + goto redirty_out; + goto out; + } + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + + if (page->index < end_index || + f2fs_verity_in_progress(inode) || + compr_blocks) + goto write; + + /* + * If the offset is out-of-range of file size, + * this page does not have to be written to disk. + */ + offset = i_size & (PAGE_SIZE - 1); + if ((page->index >= end_index + 1) || !offset) + goto out; + + zero_user_segment(page, offset, PAGE_SIZE); +write: + if (f2fs_is_drop_cache(inode)) + goto out; + + /* Dentry/quota blocks are controlled by checkpoint */ + if (S_ISDIR(inode->i_mode) || quota_inode) { + /* + * We need to wait for node_write to avoid block allocation during + * checkpoint. This can only happen to quota writes which can cause + * the below discard race condition. + */ + if (quota_inode) + f2fs_down_read(&sbi->node_write); + + fio.need_lock = LOCK_DONE; + err = f2fs_do_write_data_page(&fio); + + if (quota_inode) + f2fs_up_read(&sbi->node_write); + + goto done; + } + + if (!wbc->for_reclaim) + need_balance_fs = true; + else if (has_not_enough_free_secs(sbi, 0, 0)) + goto redirty_out; + else + set_inode_flag(inode, FI_HOT_DATA); + + err = -EAGAIN; + if (f2fs_has_inline_data(inode)) { + err = f2fs_write_inline_data(inode, page); + if (!err) + goto out; + } + + if (err == -EAGAIN) { + err = f2fs_do_write_data_page(&fio); + if (err == -EAGAIN) { + fio.need_lock = LOCK_REQ; + err = f2fs_do_write_data_page(&fio); + } + } + + if (err) { + file_set_keep_isize(inode); + } else { + spin_lock(&F2FS_I(inode)->i_size_lock); + if (F2FS_I(inode)->last_disk_size < psize) + F2FS_I(inode)->last_disk_size = psize; + spin_unlock(&F2FS_I(inode)->i_size_lock); + } + +done: + if (err && err != -ENOENT) + goto redirty_out; + +out: + inode_dec_dirty_pages(inode); + if (err) { + ClearPageUptodate(page); + clear_page_private_gcing(page); + } + + if (wbc->for_reclaim) { + f2fs_submit_merged_write_cond(sbi, NULL, page, 0, DATA); + clear_inode_flag(inode, FI_HOT_DATA); + f2fs_remove_dirty_inode(inode); + submitted = NULL; + } + unlock_page(page); + if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && + !F2FS_I(inode)->wb_task && allow_balance) + f2fs_balance_fs(sbi, need_balance_fs); + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_submit_merged_write(sbi, DATA); + if (bio && *bio) + f2fs_submit_merged_ipu_write(sbi, bio, NULL); + submitted = NULL; + } + + if (submitted) + *submitted = fio.submitted ? 1 : 0; + + return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + /* + * pageout() in MM traslates EAGAIN, so calls handle_write_error() + * -> mapping_set_error() -> set_bit(AS_EIO, ...). + * file_write_and_wait_range() will see EIO error, which is critical + * to return value of fsync() followed by atomic_write failure to user. + */ + if (!err || wbc->for_reclaim) + return AOP_WRITEPAGE_ACTIVATE; + unlock_page(page); + return err; +} + +static int f2fs_write_data_page(struct page *page, + struct writeback_control *wbc) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct inode *inode = page->mapping->host; + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + goto out; + + if (f2fs_compressed_file(inode)) { + if (f2fs_is_compressed_cluster(inode, page->index)) { + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; + } + } +out: +#endif + + return f2fs_write_single_data_page(page, NULL, NULL, NULL, + wbc, FS_DATA_IO, 0, true); +} + +/* + * This function was copied from write_cche_pages from mm/page-writeback.c. + * The major change is making write step of cold data page separately from + * warm/hot data page. + */ +static int f2fs_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + int ret = 0; + int done = 0, retry = 0; + struct page *pages_local[F2FS_ONSTACK_PAGES]; + struct page **pages = pages_local; + struct folio_batch fbatch; + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct bio *bio = NULL; + sector_t last_block; +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct inode *inode = mapping->host; + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = NULL_CLUSTER, + .rpages = NULL, + .nr_rpages = 0, + .cpages = NULL, + .valid_nr_cpages = 0, + .rbuf = NULL, + .cbuf = NULL, + .rlen = PAGE_SIZE * F2FS_I(inode)->i_cluster_size, + .private = NULL, + }; +#endif + int nr_folios, p, idx; + int nr_pages; + unsigned int max_pages = F2FS_ONSTACK_PAGES; + pgoff_t index; + pgoff_t end; /* Inclusive */ + pgoff_t done_index; + int range_whole = 0; + xa_mark_t tag; + int nwritten = 0; + int submitted = 0; + int i; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode) && + 1 << cc.log_cluster_size > F2FS_ONSTACK_PAGES) { + pages = f2fs_kzalloc(sbi, sizeof(struct page *) << + cc.log_cluster_size, GFP_NOFS | __GFP_NOFAIL); + max_pages = 1 << cc.log_cluster_size; + } +#endif + + folio_batch_init(&fbatch); + + if (get_dirty_pages(mapping->host) <= + SM_I(F2FS_M_SB(mapping))->min_hot_blocks) + set_inode_flag(mapping->host, FI_HOT_DATA); + else + clear_inode_flag(mapping->host, FI_HOT_DATA); + + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + } + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; +retry: + retry = 0; + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + done_index = index; + while (!done && !retry && (index <= end)) { + nr_pages = 0; +again: + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + if (nr_folios == 0) { + if (nr_pages) + goto write; + break; + } + + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; + + idx = 0; + p = folio_nr_pages(folio); +add_more: + pages[nr_pages] = folio_page(folio, idx); + folio_get(folio); + if (++nr_pages == max_pages) { + index = folio->index + idx + 1; + folio_batch_release(&fbatch); + goto write; + } + if (++idx < p) + goto add_more; + } + folio_batch_release(&fbatch); + goto again; +write: + for (i = 0; i < nr_pages; i++) { + struct page *page = pages[i]; + struct folio *folio = page_folio(page); + bool need_readd; +readd: + need_readd = false; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + void *fsdata = NULL; + struct page *pagep; + int ret2; + + ret = f2fs_init_compress_ctx(&cc); + if (ret) { + done = 1; + break; + } + + if (!f2fs_cluster_can_merge_page(&cc, + folio->index)) { + ret = f2fs_write_multi_pages(&cc, + &submitted, wbc, io_type); + if (!ret) + need_readd = true; + goto result; + } + + if (unlikely(f2fs_cp_error(sbi))) + goto lock_folio; + + if (!f2fs_cluster_is_empty(&cc)) + goto lock_folio; + + if (f2fs_all_cluster_page_ready(&cc, + pages, i, nr_pages, true)) + goto lock_folio; + + ret2 = f2fs_prepare_compress_overwrite( + inode, &pagep, + folio->index, &fsdata); + if (ret2 < 0) { + ret = ret2; + done = 1; + break; + } else if (ret2 && + (!f2fs_compress_write_end(inode, + fsdata, folio->index, 1) || + !f2fs_all_cluster_page_ready(&cc, + pages, i, nr_pages, + false))) { + retry = 1; + break; + } + } +#endif + /* give a priority to WB_SYNC threads */ + if (atomic_read(&sbi->wb_sync_req[DATA]) && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } +#ifdef CONFIG_F2FS_FS_COMPRESSION +lock_folio: +#endif + done_index = folio->index; +retry_write: + folio_lock(folio); + + if (unlikely(folio->mapping != mapping)) { +continue_unlock: + folio_unlock(folio); + continue; + } + + if (!folio_test_dirty(folio)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (folio_test_writeback(folio)) { + if (wbc->sync_mode != WB_SYNC_NONE) + f2fs_wait_on_page_writeback( + &folio->page, + DATA, true, true); + else + goto continue_unlock; + } + + if (!folio_clear_dirty_for_io(folio)) + goto continue_unlock; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + folio_get(folio); + f2fs_compress_ctx_add_page(&cc, &folio->page); + continue; + } +#endif + ret = f2fs_write_single_data_page(&folio->page, + &submitted, &bio, &last_block, + wbc, io_type, 0, true); + if (ret == AOP_WRITEPAGE_ACTIVATE) + folio_unlock(folio); +#ifdef CONFIG_F2FS_FS_COMPRESSION +result: +#endif + nwritten += submitted; + wbc->nr_to_write -= submitted; + + if (unlikely(ret)) { + /* + * keep nr_to_write, since vfs uses this to + * get # of written pages. + */ + if (ret == AOP_WRITEPAGE_ACTIVATE) { + ret = 0; + goto next; + } else if (ret == -EAGAIN) { + ret = 0; + if (wbc->sync_mode == WB_SYNC_ALL) { + f2fs_io_schedule_timeout( + DEFAULT_IO_TIMEOUT); + goto retry_write; + } + goto next; + } + done_index = folio->index + + folio_nr_pages(folio); + done = 1; + break; + } + + if (wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } +next: + if (need_readd) + goto readd; + } + release_pages(pages, nr_pages); + cond_resched(); + } +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* flush remained pages in compress cluster */ + if (f2fs_compressed_file(inode) && !f2fs_cluster_is_empty(&cc)) { + ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type); + nwritten += submitted; + wbc->nr_to_write -= submitted; + if (ret) { + done = 1; + retry = 0; + } + } + if (f2fs_compressed_file(inode)) + f2fs_destroy_compress_ctx(&cc, false); +#endif + if (retry) { + index = 0; + end = -1; + goto retry; + } + if (wbc->range_cyclic && !done) + done_index = 0; + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = done_index; + + if (nwritten) + f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host, + NULL, 0, DATA); + /* submit cached bio of IPU write */ + if (bio) + f2fs_submit_merged_ipu_write(sbi, &bio, NULL); + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (pages != pages_local) + kfree(pages); +#endif + + return ret; +} + +static inline bool __should_serialize_io(struct inode *inode, + struct writeback_control *wbc) +{ + /* to avoid deadlock in path of data flush */ + if (F2FS_I(inode)->wb_task) + return false; + + if (!S_ISREG(inode->i_mode)) + return false; + if (IS_NOQUOTA(inode)) + return false; + + if (f2fs_need_compress_data(inode)) + return true; + if (wbc->sync_mode != WB_SYNC_ALL) + return true; + if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks) + return true; + return false; +} + +static int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct blk_plug plug; + int ret; + bool locked = false; + + /* deal with chardevs and other special file */ + if (!mapping->a_ops->writepage) + return 0; + + /* skip writing if there is no dirty page in this inode */ + if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) + return 0; + + /* during POR, we don't need to trigger writepage at all. */ + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + + if ((S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) && + wbc->sync_mode == WB_SYNC_NONE && + get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && + f2fs_available_free_memory(sbi, DIRTY_DENTS)) + goto skip_write; + + /* skip writing in file defragment preparing stage */ + if (is_inode_flag_set(inode, FI_SKIP_WRITES)) + goto skip_write; + + trace_f2fs_writepages(mapping->host, wbc, DATA); + + /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req[DATA]); + else if (atomic_read(&sbi->wb_sync_req[DATA])) { + /* to avoid potential deadlock */ + if (current->plug) + blk_finish_plug(current->plug); + goto skip_write; + } + + if (__should_serialize_io(inode, wbc)) { + mutex_lock(&sbi->writepages); + locked = true; + } + + blk_start_plug(&plug); + ret = f2fs_write_cache_pages(mapping, wbc, io_type); + blk_finish_plug(&plug); + + if (locked) + mutex_unlock(&sbi->writepages); + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req[DATA]); + /* + * if some pages were truncated, we cannot guarantee its mapping->host + * to detect pending bios. + */ + + f2fs_remove_dirty_inode(inode); + return ret; + +skip_write: + wbc->pages_skipped += get_dirty_pages(inode); + trace_f2fs_writepages(mapping->host, wbc, DATA); + return 0; +} + +static int f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + + return __f2fs_write_data_pages(mapping, wbc, + F2FS_I(inode)->cp_task == current ? + FS_CP_DATA_IO : FS_DATA_IO); +} + +void f2fs_write_failed(struct inode *inode, loff_t to) +{ + loff_t i_size = i_size_read(inode); + + if (IS_NOQUOTA(inode)) + return; + + /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ + if (to > i_size && !f2fs_verity_in_progress(inode)) { + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + + truncate_pagecache(inode, i_size); + f2fs_truncate_blocks(inode, i_size, true); + + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } +} + +static int prepare_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + pgoff_t index = page->index; + struct dnode_of_data dn; + struct page *ipage; + bool locked = false; + struct extent_info ei = {0, }; + int err = 0; + int flag; + + /* + * If a whole page is being written and we already preallocated all the + * blocks, then there is no need to get a block address now. + */ + if (len == PAGE_SIZE && is_inode_flag_set(inode, FI_PREALLOCATED_ALL)) + return 0; + + /* f2fs_lock_op avoids race between write CP and convert_inline_page */ + if (f2fs_has_inline_data(inode) && pos + len > MAX_INLINE_DATA(inode)) + flag = F2FS_GET_BLOCK_DEFAULT; + else + flag = F2FS_GET_BLOCK_PRE_AIO; + + if (f2fs_has_inline_data(inode) || + (pos & PAGE_MASK) >= i_size_read(inode)) { + f2fs_do_map_lock(sbi, flag, true); + locked = true; + } + +restart: + /* check inline_data */ + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_has_inline_data(inode)) { + if (pos + len <= MAX_INLINE_DATA(inode)) { + f2fs_do_read_inline_data(page, ipage); + set_inode_flag(inode, FI_DATA_EXIST); + if (inode->i_nlink) + set_page_private_inline(ipage); + } else { + err = f2fs_convert_inline_page(&dn, page); + if (err) + goto out; + if (dn.data_blkaddr == NULL_ADDR) + err = f2fs_get_block(&dn, index); + } + } else if (locked) { + err = f2fs_get_block(&dn, index); + } else { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + /* hole case */ + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err || dn.data_blkaddr == NULL_ADDR) { + f2fs_put_dnode(&dn); + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, + true); + WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); + locked = true; + goto restart; + } + } + } + + /* convert_inline_page can make node_changed */ + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; +out: + f2fs_put_dnode(&dn); +unlock_out: + if (locked) + f2fs_do_map_lock(sbi, flag, false); + return err; +} + +static int __find_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr) +{ + struct dnode_of_data dn; + struct page *ipage; + struct extent_info ei = {0, }; + int err = 0; + + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + /* hole case */ + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) { + dn.data_blkaddr = NULL_ADDR; + err = 0; + } + } + *blk_addr = dn.data_blkaddr; + f2fs_put_dnode(&dn); + return err; +} + +static int __reserve_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr, bool *node_changed) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct page *ipage; + int err = 0; + + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + set_new_dnode(&dn, inode, ipage, ipage, 0); + + err = f2fs_get_block(&dn, index); + + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; + f2fs_put_dnode(&dn); + +unlock_out: + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + return err; +} + +static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned int len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + struct inode *cow_inode = F2FS_I(inode)->cow_inode; + pgoff_t index = page->index; + int err = 0; + block_t ori_blk_addr = NULL_ADDR; + + /* If pos is beyond the end of file, reserve a new block in COW inode */ + if ((pos & PAGE_MASK) >= i_size_read(inode)) + goto reserve_block; + + /* Look for the block in COW inode first */ + err = __find_data_block(cow_inode, index, blk_addr); + if (err) + return err; + else if (*blk_addr != NULL_ADDR) + return 0; + + /* Look for the block in the original inode */ + err = __find_data_block(inode, index, &ori_blk_addr); + if (err) + return err; + +reserve_block: + /* Finally, we should reserve a new block in COW inode for the update */ + err = __reserve_data_block(cow_inode, index, blk_addr, node_changed); + if (err) + return err; + inc_atomic_write_cnt(inode); + + if (ori_blk_addr != NULL_ADDR) + *blk_addr = ori_blk_addr; + return 0; +} + +static int f2fs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct page *page = NULL; + pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; + bool need_balance = false; + block_t blkaddr = NULL_ADDR; + int err = 0; + + trace_f2fs_write_begin(inode, pos, len); + + if (!f2fs_is_checkpoint_ready(sbi)) { + err = -ENOSPC; + goto fail; + } + + /* + * We should check this at this moment to avoid deadlock on inode page + * and #0 page. The locking rule for inline_data conversion should be: + * lock_page(page #0) -> lock_page(inode_page) + */ + if (index != 0) { + err = f2fs_convert_inline_inode(inode); + if (err) + goto fail; + } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + int ret; + + *fsdata = NULL; + + if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode))) + goto repeat; + + ret = f2fs_prepare_compress_overwrite(inode, pagep, + index, fsdata); + if (ret < 0) { + err = ret; + goto fail; + } else if (ret) { + return 0; + } + } +#endif + +repeat: + /* + * Do not use grab_cache_page_write_begin() to avoid deadlock due to + * wait_for_stable_page. Will wait that below with our IO control. + */ + page = f2fs_pagecache_get_page(mapping, index, + FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); + if (!page) { + err = -ENOMEM; + goto fail; + } + + /* TODO: cluster can be compressed due to race with .writepage */ + + *pagep = page; + + if (f2fs_is_atomic_file(inode)) + err = prepare_atomic_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); + else + err = prepare_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); + if (err) + goto fail; + + if (need_balance && !IS_NOQUOTA(inode) && + has_not_enough_free_secs(sbi, 0, 0)) { + unlock_page(page); + f2fs_balance_fs(sbi, true); + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + f2fs_put_page(page, 1); + goto repeat; + } + } + + f2fs_wait_on_page_writeback(page, DATA, false, true); + + if (len == PAGE_SIZE || PageUptodate(page)) + return 0; + + if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode) && + !f2fs_verity_in_progress(inode)) { + zero_user_segment(page, len, PAGE_SIZE); + return 0; + } + + if (blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + } else { + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE_READ)) { + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + goto fail; + } + err = f2fs_submit_page_read(inode, page, blkaddr, 0, true); + if (err) + goto fail; + + lock_page(page); + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + if (unlikely(!PageUptodate(page))) { + err = -EIO; + goto fail; + } + } + return 0; + +fail: + f2fs_put_page(page, 1); + f2fs_write_failed(inode, pos + len); + return err; +} + +static int f2fs_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + + trace_f2fs_write_end(inode, pos, len, copied); + + /* + * This should be come from len == PAGE_SIZE, and we expect copied + * should be PAGE_SIZE. Otherwise, we treat it with zero copied and + * let generic_perform_write() try to copy data again through copied=0. + */ + if (!PageUptodate(page)) { + if (unlikely(copied != len)) + copied = 0; + else + SetPageUptodate(page); + } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* overwrite compressed file */ + if (f2fs_compressed_file(inode) && fsdata) { + f2fs_compress_write_end(inode, fsdata, page->index, copied); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + + if (pos + copied > i_size_read(inode) && + !f2fs_verity_in_progress(inode)) + f2fs_i_size_write(inode, pos + copied); + return copied; + } +#endif + + if (!copied) + goto unlock_out; + + set_page_dirty(page); + + if (pos + copied > i_size_read(inode) && + !f2fs_verity_in_progress(inode)) { + f2fs_i_size_write(inode, pos + copied); + if (f2fs_is_atomic_file(inode)) + f2fs_i_size_write(F2FS_I(inode)->cow_inode, + pos + copied); + } +unlock_out: + f2fs_put_page(page, 1); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return copied; +} + +void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) +{ + struct inode *inode = folio->mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino >= F2FS_ROOT_INO(sbi) && + (offset || length != folio_size(folio))) + return; + + if (folio_test_dirty(folio)) { + if (inode->i_ino == F2FS_META_INO(sbi)) { + dec_page_count(sbi, F2FS_DIRTY_META); + } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { + dec_page_count(sbi, F2FS_DIRTY_NODES); + } else { + inode_dec_dirty_pages(inode); + f2fs_remove_dirty_inode(inode); + } + } + + clear_page_private_gcing(&folio->page); + + if (test_opt(sbi, COMPRESS_CACHE) && + inode->i_ino == F2FS_COMPRESS_INO(sbi)) + clear_page_private_data(&folio->page); + + folio_detach_private(folio); +} + +bool f2fs_release_folio(struct folio *folio, gfp_t wait) +{ + struct f2fs_sb_info *sbi; + + /* If this is dirty folio, keep private data */ + if (folio_test_dirty(folio)) + return false; + + sbi = F2FS_M_SB(folio->mapping); + if (test_opt(sbi, COMPRESS_CACHE)) { + struct inode *inode = folio->mapping->host; + + if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) + clear_page_private_data(&folio->page); + } + + clear_page_private_gcing(&folio->page); + + folio_detach_private(folio); + return true; +} + +static bool f2fs_dirty_data_folio(struct address_space *mapping, + struct folio *folio) +{ + struct inode *inode = mapping->host; + + trace_f2fs_set_page_dirty(&folio->page, DATA); + + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + BUG_ON(folio_test_swapcache(folio)); + + if (filemap_dirty_folio(mapping, folio)) { + f2fs_update_dirty_folio(inode, folio); + return true; + } + return false; +} + + +static sector_t f2fs_bmap_compress(struct inode *inode, sector_t block) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct dnode_of_data dn; + sector_t start_idx, blknr = 0; + int ret; + + start_idx = round_down(block, F2FS_I(inode)->i_cluster_size); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); + if (ret) + return 0; + + if (dn.data_blkaddr != COMPRESS_ADDR) { + dn.ofs_in_node += block - start_idx; + blknr = f2fs_data_blkaddr(&dn); + if (!__is_valid_data_blkaddr(blknr)) + blknr = 0; + } + + f2fs_put_dnode(&dn); + return blknr; +#else + return 0; +#endif +} + + +static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + sector_t blknr = 0; + + if (f2fs_has_inline_data(inode)) + goto out; + + /* make sure allocating whole blocks */ + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + filemap_write_and_wait(mapping); + + /* Block number less than F2FS MAX BLOCKS */ + if (unlikely(block >= max_file_blocks(inode))) + goto out; + + if (f2fs_compressed_file(inode)) { + blknr = f2fs_bmap_compress(inode, block); + } else { + struct f2fs_map_blocks map; + + memset(&map, 0, sizeof(map)); + map.m_lblk = block; + map.m_len = 1; + map.m_next_pgofs = NULL; + map.m_seg_type = NO_CHECK_TYPE; + + if (!f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_BMAP)) + blknr = map.m_pblk; + } +out: + trace_f2fs_bmap(inode, block, blknr); + return blknr; +} + +#ifdef CONFIG_SWAP +static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, + unsigned int blkcnt) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int blkofs; + unsigned int blk_per_sec = BLKS_PER_SEC(sbi); + unsigned int secidx = start_blk / blk_per_sec; + unsigned int end_sec = secidx + blkcnt / blk_per_sec; + int ret = 0; + + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + + set_inode_flag(inode, FI_ALIGNED_WRITE); + set_inode_flag(inode, FI_OPU_WRITE); + + for (; secidx < end_sec; secidx++) { + f2fs_down_write(&sbi->pin_sem); + + f2fs_lock_op(sbi); + f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); + f2fs_unlock_op(sbi); + + set_inode_flag(inode, FI_SKIP_WRITES); + + for (blkofs = 0; blkofs < blk_per_sec; blkofs++) { + struct page *page; + unsigned int blkidx = secidx * blk_per_sec + blkofs; + + page = f2fs_get_lock_data_page(inode, blkidx, true); + if (IS_ERR(page)) { + f2fs_up_write(&sbi->pin_sem); + ret = PTR_ERR(page); + goto done; + } + + set_page_dirty(page); + f2fs_put_page(page, 1); + } + + clear_inode_flag(inode, FI_SKIP_WRITES); + + ret = filemap_fdatawrite(inode->i_mapping); + + f2fs_up_write(&sbi->pin_sem); + + if (ret) + break; + } + +done: + clear_inode_flag(inode, FI_SKIP_WRITES); + clear_inode_flag(inode, FI_OPU_WRITE); + clear_inode_flag(inode, FI_ALIGNED_WRITE); + + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + return ret; +} + +static int check_swap_activate(struct swap_info_struct *sis, + struct file *swap_file, sector_t *span) +{ + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + sector_t cur_lblock; + sector_t last_lblock; + sector_t pblock; + sector_t lowest_pblock = -1; + sector_t highest_pblock = 0; + int nr_extents = 0; + unsigned long nr_pblocks; + unsigned int blks_per_sec = BLKS_PER_SEC(sbi); + unsigned int sec_blks_mask = BLKS_PER_SEC(sbi) - 1; + unsigned int not_aligned = 0; + int ret = 0; + + /* + * Map all the blocks into the extent list. This code doesn't try + * to be very smart. + */ + cur_lblock = 0; + last_lblock = bytes_to_blks(inode, i_size_read(inode)); + + while (cur_lblock < last_lblock && cur_lblock < sis->max) { + struct f2fs_map_blocks map; +retry: + cond_resched(); + + memset(&map, 0, sizeof(map)); + map.m_lblk = cur_lblock; + map.m_len = last_lblock - cur_lblock; + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; + + ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); + if (ret) + goto out; + + /* hole */ + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + f2fs_err(sbi, "Swapfile has holes"); + ret = -EINVAL; + goto out; + } + + pblock = map.m_pblk; + nr_pblocks = map.m_len; + + if ((pblock - SM_I(sbi)->main_blkaddr) & sec_blks_mask || + nr_pblocks & sec_blks_mask) { + not_aligned++; + + nr_pblocks = roundup(nr_pblocks, blks_per_sec); + if (cur_lblock + nr_pblocks > sis->max) + nr_pblocks -= blks_per_sec; + + if (!nr_pblocks) { + /* this extent is last one */ + nr_pblocks = map.m_len; + f2fs_warn(sbi, "Swapfile: last extent is not aligned to section"); + goto next; + } + + ret = f2fs_migrate_blocks(inode, cur_lblock, + nr_pblocks); + if (ret) + goto out; + goto retry; + } +next: + if (cur_lblock + nr_pblocks >= sis->max) + nr_pblocks = sis->max - cur_lblock; + + if (cur_lblock) { /* exclude the header page */ + if (pblock < lowest_pblock) + lowest_pblock = pblock; + if (pblock + nr_pblocks - 1 > highest_pblock) + highest_pblock = pblock + nr_pblocks - 1; + } + + /* + * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks + */ + ret = add_swap_extent(sis, cur_lblock, nr_pblocks, pblock); + if (ret < 0) + goto out; + nr_extents += ret; + cur_lblock += nr_pblocks; + } + ret = nr_extents; + *span = 1 + highest_pblock - lowest_pblock; + if (cur_lblock == 0) + cur_lblock = 1; /* force Empty message */ + sis->max = cur_lblock; + sis->pages = cur_lblock - 1; + sis->highest_bit = cur_lblock - 1; +out: + if (not_aligned) + f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%u * N)", + not_aligned, blks_per_sec * F2FS_BLKSIZE); + return ret; +} + +static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + struct inode *inode = file_inode(file); + int ret; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (f2fs_readonly(F2FS_I_SB(inode)->sb)) + return -EROFS; + + if (f2fs_lfs_mode(F2FS_I_SB(inode))) { + f2fs_err(F2FS_I_SB(inode), + "Swapfile not supported in LFS mode"); + return -EINVAL; + } + + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + if (!f2fs_disable_compressed_file(inode)) + return -EINVAL; + + f2fs_precache_extents(inode); + + ret = check_swap_activate(sis, file, span); + if (ret < 0) + return ret; + + stat_inc_swapfile_inode(inode); + set_inode_flag(inode, FI_PIN_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return ret; +} + +static void f2fs_swap_deactivate(struct file *file) +{ + struct inode *inode = file_inode(file); + + stat_dec_swapfile_inode(inode); + clear_inode_flag(inode, FI_PIN_FILE); +} +#else +static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + return -EOPNOTSUPP; +} + +static void f2fs_swap_deactivate(struct file *file) +{ +} +#endif + +const struct address_space_operations f2fs_dblock_aops = { + .read_folio = f2fs_read_data_folio, + .readahead = f2fs_readahead, + .writepage = f2fs_write_data_page, + .writepages = f2fs_write_data_pages, + .write_begin = f2fs_write_begin, + .write_end = f2fs_write_end, + .dirty_folio = f2fs_dirty_data_folio, + .migrate_folio = filemap_migrate_folio, + .invalidate_folio = f2fs_invalidate_folio, + .release_folio = f2fs_release_folio, + .direct_IO = noop_direct_IO, + .bmap = f2fs_bmap, + .swap_activate = f2fs_swap_activate, + .swap_deactivate = f2fs_swap_deactivate, +}; + +void f2fs_clear_page_cache_dirty_tag(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + unsigned long flags; + + xa_lock_irqsave(&mapping->i_pages, flags); + __xa_clear_mark(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_DIRTY); + xa_unlock_irqrestore(&mapping->i_pages, flags); +} + +int __init f2fs_init_post_read_processing(void) +{ + bio_post_read_ctx_cache = + kmem_cache_create("f2fs_bio_post_read_ctx", + sizeof(struct bio_post_read_ctx), 0, 0, NULL); + if (!bio_post_read_ctx_cache) + goto fail; + bio_post_read_ctx_pool = + mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, + bio_post_read_ctx_cache); + if (!bio_post_read_ctx_pool) + goto fail_free_cache; + return 0; + +fail_free_cache: + kmem_cache_destroy(bio_post_read_ctx_cache); +fail: + return -ENOMEM; +} + +void f2fs_destroy_post_read_processing(void) +{ + mempool_destroy(bio_post_read_ctx_pool); + kmem_cache_destroy(bio_post_read_ctx_cache); +} + +int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi) +{ + if (!f2fs_sb_has_encrypt(sbi) && + !f2fs_sb_has_verity(sbi) && + !f2fs_sb_has_compression(sbi)) + return 0; + + sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq", + WQ_UNBOUND | WQ_HIGHPRI, + num_online_cpus()); + if (!sbi->post_read_wq) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) +{ + if (sbi->post_read_wq) + destroy_workqueue(sbi->post_read_wq); +} + +int __init f2fs_init_bio_entry_cache(void) +{ + bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab", + sizeof(struct bio_entry)); + if (!bio_entry_slab) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_bio_entry_cache(void) +{ + kmem_cache_destroy(bio_entry_slab); +} + +static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + struct f2fs_map_blocks map = {}; + pgoff_t next_pgofs = 0; + int err; + + map.m_lblk = bytes_to_blks(inode, offset); + map.m_len = bytes_to_blks(inode, offset + length - 1) - map.m_lblk + 1; + map.m_next_pgofs = &next_pgofs; + map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); + if (flags & IOMAP_WRITE) + map.m_may_create = true; + + err = f2fs_map_blocks(inode, &map, flags & IOMAP_WRITE, + F2FS_GET_BLOCK_DIO); + if (err) + return err; + + iomap->offset = blks_to_bytes(inode, map.m_lblk); + + /* + * When inline encryption is enabled, sometimes I/O to an encrypted file + * has to be broken up to guarantee DUN contiguity. Handle this by + * limiting the length of the mapping returned. + */ + map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); + + /* + * We should never see delalloc or compressed extents here based on + * prior flushing and checks. + */ + if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR)) + return -EINVAL; + if (WARN_ON_ONCE(map.m_pblk == COMPRESS_ADDR)) + return -EINVAL; + + if (map.m_pblk != NULL_ADDR) { + iomap->length = blks_to_bytes(inode, map.m_len); + iomap->type = IOMAP_MAPPED; + iomap->flags |= IOMAP_F_MERGED; + iomap->bdev = map.m_bdev; + iomap->addr = blks_to_bytes(inode, map.m_pblk); + } else { + if (flags & IOMAP_WRITE) + return -ENOTBLK; + iomap->length = blks_to_bytes(inode, next_pgofs) - + iomap->offset; + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + } + + if (map.m_flags & F2FS_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; + if ((inode->i_state & I_DIRTY_DATASYNC) || + offset + length > i_size_read(inode)) + iomap->flags |= IOMAP_F_DIRTY; + + return 0; +} + +const struct iomap_ops f2fs_iomap_ops = { + .iomap_begin = f2fs_iomap_begin, +}; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c new file mode 100644 index 000000000..a9baa121d --- /dev/null +++ b/fs/f2fs/debug.c @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs debugging statistics + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * Copyright (c) 2012 Linux Foundation + * Copyright (c) 2012 Greg Kroah-Hartman <gregkh@linuxfoundation.org> + */ + +#include <linux/fs.h> +#include <linux/backing-dev.h> +#include <linux/f2fs_fs.h> +#include <linux/blkdev.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "gc.h" + +static LIST_HEAD(f2fs_stat_list); +static DEFINE_RAW_SPINLOCK(f2fs_stat_lock); +#ifdef CONFIG_DEBUG_FS +static struct dentry *f2fs_debugfs_root; +#endif + +/* + * This function calculates BDF of every segments + */ +void f2fs_update_sit_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + unsigned long long blks_per_sec, hblks_per_sec, total_vblocks; + unsigned long long bimodal, dist; + unsigned int segno, vblocks; + int ndirty = 0; + + bimodal = 0; + total_vblocks = 0; + blks_per_sec = CAP_BLKS_PER_SEC(sbi); + hblks_per_sec = blks_per_sec / 2; + for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { + vblocks = get_valid_blocks(sbi, segno, true); + dist = abs(vblocks - hblks_per_sec); + bimodal += dist * dist; + + if (vblocks > 0 && vblocks < blks_per_sec) { + total_vblocks += vblocks; + ndirty++; + } + } + dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100); + si->bimodal = div64_u64(bimodal, dist); + if (si->dirty_count) + si->avg_vblocks = div_u64(total_vblocks, ndirty); + else + si->avg_vblocks = 0; +} + +#ifdef CONFIG_DEBUG_FS +static void update_general_status(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int i; + + /* these will be changed if online resize is done */ + si->main_area_segs = le32_to_cpu(raw_super->segment_count_main); + si->main_area_sections = le32_to_cpu(raw_super->section_count); + si->main_area_zones = si->main_area_sections / + le32_to_cpu(raw_super->secs_per_zone); + + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->hit_cached[i] = atomic64_read(&sbi->read_hit_cached[i]); + si->hit_rbtree[i] = atomic64_read(&sbi->read_hit_rbtree[i]); + si->total_ext[i] = atomic64_read(&sbi->total_hit_ext[i]); + si->hit_total[i] = si->hit_cached[i] + si->hit_rbtree[i]; + si->ext_tree[i] = atomic_read(&eti->total_ext_tree); + si->zombie_tree[i] = atomic_read(&eti->total_zombie_tree); + si->ext_node[i] = atomic_read(&eti->total_ext_node); + } + /* read extent_cache only */ + si->hit_largest = atomic64_read(&sbi->read_hit_largest); + si->hit_total[EX_READ] += si->hit_largest; + + /* validation check of the segment numbers */ + si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); + si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); + si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); + si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_qdata = get_pages(sbi, F2FS_DIRTY_QDATA); + si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); + si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; + si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + si->nquota_files = sbi->nquota_files; + si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; + si->aw_cnt = atomic_read(&sbi->atomic_files); + si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); + si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ); + si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE); + si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); + si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); + si->nr_rd_data = get_pages(sbi, F2FS_RD_DATA); + si->nr_rd_node = get_pages(sbi, F2FS_RD_NODE); + si->nr_rd_meta = get_pages(sbi, F2FS_RD_META); + if (SM_I(sbi)->fcc_info) { + si->nr_flushed = + atomic_read(&SM_I(sbi)->fcc_info->issued_flush); + si->nr_flushing = + atomic_read(&SM_I(sbi)->fcc_info->queued_flush); + si->flush_list_empty = + llist_empty(&SM_I(sbi)->fcc_info->issue_list); + } + if (SM_I(sbi)->dcc_info) { + si->nr_discarded = + atomic_read(&SM_I(sbi)->dcc_info->issued_discard); + si->nr_discarding = + atomic_read(&SM_I(sbi)->dcc_info->queued_discard); + si->nr_discard_cmd = + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; + } + si->nr_issued_ckpt = atomic_read(&sbi->cprc_info.issued_ckpt); + si->nr_total_ckpt = atomic_read(&sbi->cprc_info.total_ckpt); + si->nr_queued_ckpt = atomic_read(&sbi->cprc_info.queued_ckpt); + spin_lock(&sbi->cprc_info.stat_lock); + si->cur_ckpt_time = sbi->cprc_info.cur_time; + si->peak_ckpt_time = sbi->cprc_info.peak_time; + spin_unlock(&sbi->cprc_info.stat_lock); + si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; + si->rsvd_segs = reserved_segments(sbi); + si->overp_segs = overprovision_segments(sbi); + si->valid_count = valid_user_blocks(sbi); + si->discard_blks = discard_blocks(sbi); + si->valid_node_count = valid_node_count(sbi); + si->valid_inode_count = valid_inode_count(sbi); + si->inline_xattr = atomic_read(&sbi->inline_xattr); + si->inline_inode = atomic_read(&sbi->inline_inode); + si->inline_dir = atomic_read(&sbi->inline_dir); + si->compr_inode = atomic_read(&sbi->compr_inode); + si->swapfile_inode = atomic_read(&sbi->swapfile_inode); + si->compr_blocks = atomic64_read(&sbi->compr_blocks); + si->append = sbi->im[APPEND_INO].ino_num; + si->update = sbi->im[UPDATE_INO].ino_num; + si->orphans = sbi->im[ORPHAN_INO].ino_num; + si->utilization = utilization(sbi); + + si->free_segs = free_segments(sbi); + si->free_secs = free_sections(sbi); + si->prefree_count = prefree_segments(sbi); + si->dirty_count = dirty_segments(sbi); + if (sbi->node_inode) + si->node_pages = NODE_MAPPING(sbi)->nrpages; + if (sbi->meta_inode) + si->meta_pages = META_MAPPING(sbi)->nrpages; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (sbi->compress_inode) { + si->compress_pages = COMPRESS_MAPPING(sbi)->nrpages; + si->compress_page_hit = atomic_read(&sbi->compress_page_hit); + } +#endif + si->nats = NM_I(sbi)->nat_cnt[TOTAL_NAT]; + si->dirty_nats = NM_I(sbi)->nat_cnt[DIRTY_NAT]; + si->sits = MAIN_SEGS(sbi); + si->dirty_sits = SIT_I(sbi)->dirty_sentries; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID]; + si->avail_nids = NM_I(sbi)->available_nids; + si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; + si->io_skip_bggc = sbi->io_skip_bggc; + si->other_skip_bggc = sbi->other_skip_bggc; + si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) + * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) + / 2; + si->util_valid = (int)(written_block_count(sbi) >> + sbi->log_blocks_per_seg) + * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) + / 2; + si->util_invalid = 50 - si->util_free - si->util_valid; + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { + struct curseg_info *curseg = CURSEG_I(sbi, i); + + si->curseg[i] = curseg->segno; + si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); + si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); + } + + for (i = META_CP; i < META_MAX; i++) + si->meta_count[i] = atomic_read(&sbi->meta_count[i]); + + for (i = 0; i < NO_CHECK_TYPE; i++) { + si->dirty_seg[i] = 0; + si->full_seg[i] = 0; + si->valid_blks[i] = 0; + } + + for (i = 0; i < MAIN_SEGS(sbi); i++) { + int blks = get_seg_entry(sbi, i)->valid_blocks; + int type = get_seg_entry(sbi, i)->type; + + if (!blks) + continue; + + if (blks == sbi->blocks_per_seg) + si->full_seg[type]++; + else + si->dirty_seg[type]++; + si->valid_blks[type] += blks; + } + + for (i = 0; i < 2; i++) { + si->segment_count[i] = sbi->segment_count[i]; + si->block_count[i] = sbi->block_count[i]; + } + + si->inplace_count = atomic_read(&sbi->inplace_count); +} + +/* + * This function calculates memory footprint. + */ +static void update_mem_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + int i; + + if (si->base_mem) + goto get_cache; + + /* build stat */ + si->base_mem = sizeof(struct f2fs_stat_info); + + /* build superblock */ + si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; + si->base_mem += 2 * sizeof(struct f2fs_inode_info); + si->base_mem += sizeof(*sbi->ckpt); + + /* build sm */ + si->base_mem += sizeof(struct f2fs_sm_info); + + /* build sit */ + si->base_mem += sizeof(struct sit_info); + si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); + si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += SIT_VBLOCK_MAP_SIZE; + if (__is_large_section(sbi)) + si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); + si->base_mem += __bitmap_size(sbi, SIT_BITMAP); + + /* build free segmap */ + si->base_mem += sizeof(struct free_segmap_info); + si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi)); + + /* build curseg */ + si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; + si->base_mem += PAGE_SIZE * NR_CURSEG_TYPE; + + /* build dirty segmap */ + si->base_mem += sizeof(struct dirty_seglist_info); + si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi)); + + /* build nm */ + si->base_mem += sizeof(struct f2fs_nm_info); + si->base_mem += __bitmap_size(sbi, NAT_BITMAP); + si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); + si->base_mem += NM_I(sbi)->nat_blocks * + f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK); + si->base_mem += NM_I(sbi)->nat_blocks / 8; + si->base_mem += NM_I(sbi)->nat_blocks * sizeof(unsigned short); + +get_cache: + si->cache_mem = 0; + + /* build gc */ + if (sbi->gc_thread) + si->cache_mem += sizeof(struct f2fs_gc_kthread); + + /* build merge flush thread */ + if (SM_I(sbi)->fcc_info) + si->cache_mem += sizeof(struct flush_cmd_control); + if (SM_I(sbi)->dcc_info) { + si->cache_mem += sizeof(struct discard_cmd_control); + si->cache_mem += sizeof(struct discard_cmd) * + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + } + + /* free nids */ + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] + + NM_I(sbi)->nid_cnt[PREALLOC_NID]) * + sizeof(struct free_nid); + si->cache_mem += NM_I(sbi)->nat_cnt[TOTAL_NAT] * + sizeof(struct nat_entry); + si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] * + sizeof(struct nat_entry_set); + for (i = 0; i < MAX_INO_ENTRY; i++) + si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); + + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->ext_mem[i] = atomic_read(&eti->total_ext_tree) * + sizeof(struct extent_tree); + si->ext_mem[i] += atomic_read(&eti->total_ext_node) * + sizeof(struct extent_node); + si->cache_mem += si->ext_mem[i]; + } + + si->page_mem = 0; + if (sbi->node_inode) { + unsigned npages = NODE_MAPPING(sbi)->nrpages; + + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } + if (sbi->meta_inode) { + unsigned npages = META_MAPPING(sbi)->nrpages; + + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (sbi->compress_inode) { + unsigned npages = COMPRESS_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } +#endif +} + +static char *s_flag[] = { + [SBI_IS_DIRTY] = " fs_dirty", + [SBI_IS_CLOSE] = " closing", + [SBI_NEED_FSCK] = " need_fsck", + [SBI_POR_DOING] = " recovering", + [SBI_NEED_SB_WRITE] = " sb_dirty", + [SBI_NEED_CP] = " need_cp", + [SBI_IS_SHUTDOWN] = " shutdown", + [SBI_IS_RECOVERED] = " recovered", + [SBI_CP_DISABLED] = " cp_disabled", + [SBI_CP_DISABLED_QUICK] = " cp_disabled_quick", + [SBI_QUOTA_NEED_FLUSH] = " quota_need_flush", + [SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush", + [SBI_QUOTA_NEED_REPAIR] = " quota_need_repair", + [SBI_IS_RESIZEFS] = " resizefs", + [SBI_IS_FREEZING] = " freezefs", +}; + +static int stat_show(struct seq_file *s, void *v) +{ + struct f2fs_stat_info *si; + int i = 0, j = 0; + unsigned long flags; + + raw_spin_lock_irqsave(&f2fs_stat_lock, flags); + list_for_each_entry(si, &f2fs_stat_list, stat_list) { + update_general_status(si->sbi); + + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", + si->sbi->sb->s_bdev, i++, + f2fs_readonly(si->sbi->sb) ? "RO" : "RW", + is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ? + "Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good")); + if (si->sbi->s_flag) { + seq_puts(s, "[SBI:"); + for_each_set_bit(j, &si->sbi->s_flag, 32) + seq_puts(s, s_flag[j]); + seq_puts(s, "]\n"); + } + seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", + si->sit_area_segs, si->nat_area_segs); + seq_printf(s, "[SSA: %d] [MAIN: %d", + si->ssa_area_segs, si->main_area_segs); + seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", + si->overp_segs, si->rsvd_segs); + seq_printf(s, "Current Time Sec: %llu / Mounted Time Sec: %llu\n\n", + ktime_get_boottime_seconds(), + SIT_I(si->sbi)->mounted_time); + if (test_opt(si->sbi, DISCARD)) + seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n", + si->utilization, si->valid_count, si->discard_blks); + else + seq_printf(s, "Utilization: %u%% (%u valid blocks)\n", + si->utilization, si->valid_count); + + seq_printf(s, " - Node: %u (Inode: %u, ", + si->valid_node_count, si->valid_inode_count); + seq_printf(s, "Other: %u)\n - Data: %u\n", + si->valid_node_count - si->valid_inode_count, + si->valid_count - si->valid_node_count); + seq_printf(s, " - Inline_xattr Inode: %u\n", + si->inline_xattr); + seq_printf(s, " - Inline_data Inode: %u\n", + si->inline_inode); + seq_printf(s, " - Inline_dentry Inode: %u\n", + si->inline_dir); + seq_printf(s, " - Compressed Inode: %u, Blocks: %llu\n", + si->compr_inode, si->compr_blocks); + seq_printf(s, " - Swapfile Inode: %u\n", + si->swapfile_inode); + seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", + si->orphans, si->append, si->update); + seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", + si->main_area_segs, si->main_area_sections, + si->main_area_zones); + seq_printf(s, " TYPE %8s %8s %8s %10s %10s %10s\n", + "segno", "secno", "zoneno", "dirty_seg", "full_seg", "valid_blk"); + seq_printf(s, " - COLD data: %8d %8d %8d %10u %10u %10u\n", + si->curseg[CURSEG_COLD_DATA], + si->cursec[CURSEG_COLD_DATA], + si->curzone[CURSEG_COLD_DATA], + si->dirty_seg[CURSEG_COLD_DATA], + si->full_seg[CURSEG_COLD_DATA], + si->valid_blks[CURSEG_COLD_DATA]); + seq_printf(s, " - WARM data: %8d %8d %8d %10u %10u %10u\n", + si->curseg[CURSEG_WARM_DATA], + si->cursec[CURSEG_WARM_DATA], + si->curzone[CURSEG_WARM_DATA], + si->dirty_seg[CURSEG_WARM_DATA], + si->full_seg[CURSEG_WARM_DATA], + si->valid_blks[CURSEG_WARM_DATA]); + seq_printf(s, " - HOT data: %8d %8d %8d %10u %10u %10u\n", + si->curseg[CURSEG_HOT_DATA], + si->cursec[CURSEG_HOT_DATA], + si->curzone[CURSEG_HOT_DATA], + si->dirty_seg[CURSEG_HOT_DATA], + si->full_seg[CURSEG_HOT_DATA], + si->valid_blks[CURSEG_HOT_DATA]); + seq_printf(s, " - Dir dnode: %8d %8d %8d %10u %10u %10u\n", + si->curseg[CURSEG_HOT_NODE], + si->cursec[CURSEG_HOT_NODE], + si->curzone[CURSEG_HOT_NODE], + si->dirty_seg[CURSEG_HOT_NODE], + si->full_seg[CURSEG_HOT_NODE], + si->valid_blks[CURSEG_HOT_NODE]); + seq_printf(s, " - File dnode: %8d %8d %8d %10u %10u %10u\n", + si->curseg[CURSEG_WARM_NODE], + si->cursec[CURSEG_WARM_NODE], + si->curzone[CURSEG_WARM_NODE], + si->dirty_seg[CURSEG_WARM_NODE], + si->full_seg[CURSEG_WARM_NODE], + si->valid_blks[CURSEG_WARM_NODE]); + seq_printf(s, " - Indir nodes: %8d %8d %8d %10u %10u %10u\n", + si->curseg[CURSEG_COLD_NODE], + si->cursec[CURSEG_COLD_NODE], + si->curzone[CURSEG_COLD_NODE], + si->dirty_seg[CURSEG_COLD_NODE], + si->full_seg[CURSEG_COLD_NODE], + si->valid_blks[CURSEG_COLD_NODE]); + seq_printf(s, " - Pinned file: %8d %8d %8d\n", + si->curseg[CURSEG_COLD_DATA_PINNED], + si->cursec[CURSEG_COLD_DATA_PINNED], + si->curzone[CURSEG_COLD_DATA_PINNED]); + seq_printf(s, " - ATGC data: %8d %8d %8d\n", + si->curseg[CURSEG_ALL_DATA_ATGC], + si->cursec[CURSEG_ALL_DATA_ATGC], + si->curzone[CURSEG_ALL_DATA_ATGC]); + seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n", + si->main_area_segs - si->dirty_count - + si->prefree_count - si->free_segs, + si->dirty_count); + seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", + si->prefree_count, si->free_segs, si->free_secs); + seq_printf(s, "CP calls: %d (BG: %d)\n", + si->cp_count, si->bg_cp_count); + seq_printf(s, " - cp blocks : %u\n", si->meta_count[META_CP]); + seq_printf(s, " - sit blocks : %u\n", + si->meta_count[META_SIT]); + seq_printf(s, " - nat blocks : %u\n", + si->meta_count[META_NAT]); + seq_printf(s, " - ssa blocks : %u\n", + si->meta_count[META_SSA]); + seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " + "Cur time: %4d(ms), Peak time: %4d(ms))\n", + si->nr_queued_ckpt, si->nr_issued_ckpt, + si->nr_total_ckpt, si->cur_ckpt_time, + si->peak_ckpt_time); + seq_printf(s, "GC calls: %d (BG: %d)\n", + si->call_count, si->bg_gc); + seq_printf(s, " - data segments : %d (%d)\n", + si->data_segs, si->bg_data_segs); + seq_printf(s, " - node segments : %d (%d)\n", + si->node_segs, si->bg_node_segs); + seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), " + "Idle Greedy (%d), Idle AT (%d), " + "Urgent High (%d), Urgent Mid (%d), " + "Urgent Low (%d)\n", + si->sbi->gc_reclaimed_segs[GC_NORMAL], + si->sbi->gc_reclaimed_segs[GC_IDLE_CB], + si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY], + si->sbi->gc_reclaimed_segs[GC_IDLE_AT], + si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH], + si->sbi->gc_reclaimed_segs[GC_URGENT_MID], + si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); + seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, + si->bg_data_blks + si->bg_node_blks); + seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, + si->bg_data_blks); + seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, + si->bg_node_blks); + seq_printf(s, "BG skip : IO: %u, Other: %u\n", + si->io_skip_bggc, si->other_skip_bggc); + seq_puts(s, "\nExtent Cache (Read):\n"); + seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", + si->hit_largest, si->hit_cached[EX_READ], + si->hit_rbtree[EX_READ]); + seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", + !si->total_ext[EX_READ] ? 0 : + div64_u64(si->hit_total[EX_READ] * 100, + si->total_ext[EX_READ]), + si->hit_total[EX_READ], si->total_ext[EX_READ]); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree[EX_READ], si->zombie_tree[EX_READ], + si->ext_node[EX_READ]); + seq_puts(s, "\nBalancing F2FS Async:\n"); + seq_printf(s, " - DIO (R: %4d, W: %4d)\n", + si->nr_dio_read, si->nr_dio_write); + seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n", + si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta); + seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " + "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", + si->nr_wb_cp_data, si->nr_wb_data, + si->nr_flushing, si->nr_flushed, + si->flush_list_empty, + si->nr_discarding, si->nr_discarded, + si->nr_discard_cmd, si->undiscard_blks); + seq_printf(s, " - atomic IO: %4d (Max. %4d)\n", + si->aw_cnt, si->max_aw_cnt); + seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit); + seq_printf(s, " - nodes: %4d in %4d\n", + si->ndirty_node, si->node_pages); + seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", + si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); + seq_printf(s, " - datas: %4d in files:%4d\n", + si->ndirty_data, si->ndirty_files); + seq_printf(s, " - quota datas: %4d in quota files:%4d\n", + si->ndirty_qdata, si->nquota_files); + seq_printf(s, " - meta: %4d in %4d\n", + si->ndirty_meta, si->meta_pages); + seq_printf(s, " - imeta: %4d\n", + si->ndirty_imeta); + seq_printf(s, " - fsync mark: %4lld\n", + percpu_counter_sum_positive( + &si->sbi->rf_node_block_count)); + seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", + si->dirty_nats, si->nats, si->dirty_sits, si->sits); + seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", + si->free_nids, si->avail_nids, si->alloc_nids); + seq_puts(s, "\nDistribution of User Blocks:"); + seq_puts(s, " [ valid | invalid | free ]\n"); + seq_puts(s, " ["); + + for (j = 0; j < si->util_valid; j++) + seq_putc(s, '-'); + seq_putc(s, '|'); + + for (j = 0; j < si->util_invalid; j++) + seq_putc(s, '-'); + seq_putc(s, '|'); + + for (j = 0; j < si->util_free; j++) + seq_putc(s, '-'); + seq_puts(s, "]\n\n"); + seq_printf(s, "IPU: %u blocks\n", si->inplace_count); + seq_printf(s, "SSR: %u blocks in %u segments\n", + si->block_count[SSR], si->segment_count[SSR]); + seq_printf(s, "LFS: %u blocks in %u segments\n", + si->block_count[LFS], si->segment_count[LFS]); + + /* segment usage info */ + f2fs_update_sit_info(si->sbi); + seq_printf(s, "\nBDF: %u, avg. vblocks: %u\n", + si->bimodal, si->avg_vblocks); + + /* memory footprint */ + update_mem_info(si->sbi); + seq_printf(s, "\nMemory: %llu KB\n", + (si->base_mem + si->cache_mem + si->page_mem) >> 10); + seq_printf(s, " - static: %llu KB\n", + si->base_mem >> 10); + seq_printf(s, " - cached all: %llu KB\n", + si->cache_mem >> 10); + seq_printf(s, " - read extent cache: %llu KB\n", + si->ext_mem[EX_READ] >> 10); + seq_printf(s, " - paged : %llu KB\n", + si->page_mem >> 10); + } + raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(stat); +#endif + +int f2fs_build_stats(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + struct f2fs_stat_info *si; + unsigned long flags; + int i; + + si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL); + if (!si) + return -ENOMEM; + + si->all_area_segs = le32_to_cpu(raw_super->segment_count); + si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); + si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); + si->ssa_area_segs = le32_to_cpu(raw_super->segment_count_ssa); + si->main_area_segs = le32_to_cpu(raw_super->segment_count_main); + si->main_area_sections = le32_to_cpu(raw_super->section_count); + si->main_area_zones = si->main_area_sections / + le32_to_cpu(raw_super->secs_per_zone); + si->sbi = sbi; + sbi->stat_info = si; + + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + atomic64_set(&sbi->total_hit_ext[i], 0); + atomic64_set(&sbi->read_hit_rbtree[i], 0); + atomic64_set(&sbi->read_hit_cached[i], 0); + } + + /* read extent_cache only */ + atomic64_set(&sbi->read_hit_largest, 0); + + atomic_set(&sbi->inline_xattr, 0); + atomic_set(&sbi->inline_inode, 0); + atomic_set(&sbi->inline_dir, 0); + atomic_set(&sbi->compr_inode, 0); + atomic64_set(&sbi->compr_blocks, 0); + atomic_set(&sbi->swapfile_inode, 0); + atomic_set(&sbi->atomic_files, 0); + atomic_set(&sbi->inplace_count, 0); + for (i = META_CP; i < META_MAX; i++) + atomic_set(&sbi->meta_count[i], 0); + + atomic_set(&sbi->max_aw_cnt, 0); + + raw_spin_lock_irqsave(&f2fs_stat_lock, flags); + list_add_tail(&si->stat_list, &f2fs_stat_list); + raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); + + return 0; +} + +void f2fs_destroy_stats(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + unsigned long flags; + + raw_spin_lock_irqsave(&f2fs_stat_lock, flags); + list_del(&si->stat_list); + raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); + + kfree(si); +} + +void __init f2fs_create_root_stats(void) +{ +#ifdef CONFIG_DEBUG_FS + f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); + + debugfs_create_file("status", 0444, f2fs_debugfs_root, NULL, + &stat_fops); +#endif +} + +void f2fs_destroy_root_stats(void) +{ +#ifdef CONFIG_DEBUG_FS + debugfs_remove_recursive(f2fs_debugfs_root); + f2fs_debugfs_root = NULL; +#endif +} diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c new file mode 100644 index 000000000..510736d2a --- /dev/null +++ b/fs/f2fs/dir.c @@ -0,0 +1,1182 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/dir.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + */ +#include <asm/unaligned.h> +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/sched/signal.h> +#include <linux/unicode.h> +#include "f2fs.h" +#include "node.h" +#include "acl.h" +#include "xattr.h" +#include <trace/events/f2fs.h> + +#if IS_ENABLED(CONFIG_UNICODE) +extern struct kmem_cache *f2fs_cf_name_slab; +#endif + +static unsigned long dir_blocks(struct inode *inode) +{ + return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1)) + >> PAGE_SHIFT; +} + +static unsigned int dir_buckets(unsigned int level, int dir_level) +{ + if (level + dir_level < MAX_DIR_HASH_DEPTH / 2) + return BIT(level + dir_level); + else + return MAX_DIR_BUCKETS; +} + +static unsigned int bucket_blocks(unsigned int level) +{ + if (level < MAX_DIR_HASH_DEPTH / 2) + return 2; + else + return 4; +} + +static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { + [F2FS_FT_UNKNOWN] = DT_UNKNOWN, + [F2FS_FT_REG_FILE] = DT_REG, + [F2FS_FT_DIR] = DT_DIR, + [F2FS_FT_CHRDEV] = DT_CHR, + [F2FS_FT_BLKDEV] = DT_BLK, + [F2FS_FT_FIFO] = DT_FIFO, + [F2FS_FT_SOCK] = DT_SOCK, + [F2FS_FT_SYMLINK] = DT_LNK, +}; + +static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR, + [S_IFCHR >> S_SHIFT] = F2FS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = F2FS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = F2FS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = F2FS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK, +}; + +static void set_de_type(struct f2fs_dir_entry *de, umode_t mode) +{ + de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; +} + +unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de) +{ + if (de->file_type < F2FS_FT_MAX) + return f2fs_filetype_table[de->file_type]; + return DT_UNKNOWN; +} + +/* If @dir is casefolded, initialize @fname->cf_name from @fname->usr_fname. */ +int f2fs_init_casefolded_name(const struct inode *dir, + struct f2fs_filename *fname) +{ +#if IS_ENABLED(CONFIG_UNICODE) + struct super_block *sb = dir->i_sb; + + if (IS_CASEFOLDED(dir) && + !is_dot_dotdot(fname->usr_fname->name, fname->usr_fname->len)) { + fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab, + GFP_NOFS, false, F2FS_SB(sb)); + if (!fname->cf_name.name) + return -ENOMEM; + fname->cf_name.len = utf8_casefold(sb->s_encoding, + fname->usr_fname, + fname->cf_name.name, + F2FS_NAME_LEN); + if ((int)fname->cf_name.len <= 0) { + kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); + fname->cf_name.name = NULL; + if (sb_has_strict_encoding(sb)) + return -EINVAL; + /* fall back to treating name as opaque byte sequence */ + } + } +#endif + return 0; +} + +static int __f2fs_setup_filename(const struct inode *dir, + const struct fscrypt_name *crypt_name, + struct f2fs_filename *fname) +{ + int err; + + memset(fname, 0, sizeof(*fname)); + + fname->usr_fname = crypt_name->usr_fname; + fname->disk_name = crypt_name->disk_name; +#ifdef CONFIG_FS_ENCRYPTION + fname->crypto_buf = crypt_name->crypto_buf; +#endif + if (crypt_name->is_nokey_name) { + /* hash was decoded from the no-key name */ + fname->hash = cpu_to_le32(crypt_name->hash); + } else { + err = f2fs_init_casefolded_name(dir, fname); + if (err) { + f2fs_free_filename(fname); + return err; + } + f2fs_hash_filename(dir, fname); + } + return 0; +} + +/* + * Prepare to search for @iname in @dir. This is similar to + * fscrypt_setup_filename(), but this also handles computing the casefolded name + * and the f2fs dirhash if needed, then packing all the information about this + * filename up into a 'struct f2fs_filename'. + */ +int f2fs_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct f2fs_filename *fname) +{ + struct fscrypt_name crypt_name; + int err; + + err = fscrypt_setup_filename(dir, iname, lookup, &crypt_name); + if (err) + return err; + + return __f2fs_setup_filename(dir, &crypt_name, fname); +} + +/* + * Prepare to look up @dentry in @dir. This is similar to + * fscrypt_prepare_lookup(), but this also handles computing the casefolded name + * and the f2fs dirhash if needed, then packing all the information about this + * filename up into a 'struct f2fs_filename'. + */ +int f2fs_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct f2fs_filename *fname) +{ + struct fscrypt_name crypt_name; + int err; + + err = fscrypt_prepare_lookup(dir, dentry, &crypt_name); + if (err) + return err; + + return __f2fs_setup_filename(dir, &crypt_name, fname); +} + +void f2fs_free_filename(struct f2fs_filename *fname) +{ +#ifdef CONFIG_FS_ENCRYPTION + kfree(fname->crypto_buf.name); + fname->crypto_buf.name = NULL; +#endif +#if IS_ENABLED(CONFIG_UNICODE) + if (fname->cf_name.name) { + kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); + fname->cf_name.name = NULL; + } +#endif +} + +static unsigned long dir_block_index(unsigned int level, + int dir_level, unsigned int idx) +{ + unsigned long i; + unsigned long bidx = 0; + + for (i = 0; i < level; i++) + bidx += dir_buckets(i, dir_level) * bucket_blocks(i); + bidx += idx * bucket_blocks(level); + return bidx; +} + +static struct f2fs_dir_entry *find_in_block(struct inode *dir, + struct page *dentry_page, + const struct f2fs_filename *fname, + int *max_slots) +{ + struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr d; + + dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); + + make_dentry_ptr_block(dir, &d, dentry_blk); + return f2fs_find_target_dentry(&d, fname, max_slots); +} + +#if IS_ENABLED(CONFIG_UNICODE) +/* + * Test whether a case-insensitive directory entry matches the filename + * being searched for. + * + * Returns 1 for a match, 0 for no match, and -errno on an error. + */ +static int f2fs_match_ci_name(const struct inode *dir, const struct qstr *name, + const u8 *de_name, u32 de_name_len) +{ + const struct super_block *sb = dir->i_sb; + const struct unicode_map *um = sb->s_encoding; + struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); + struct qstr entry = QSTR_INIT(de_name, de_name_len); + int res; + + if (IS_ENCRYPTED(dir)) { + const struct fscrypt_str encrypted_name = + FSTR_INIT((u8 *)de_name, de_name_len); + + if (WARN_ON_ONCE(!fscrypt_has_encryption_key(dir))) + return -EINVAL; + + decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); + if (!decrypted_name.name) + return -ENOMEM; + res = fscrypt_fname_disk_to_usr(dir, 0, 0, &encrypted_name, + &decrypted_name); + if (res < 0) + goto out; + entry.name = decrypted_name.name; + entry.len = decrypted_name.len; + } + + res = utf8_strncasecmp_folded(um, name, &entry); + /* + * In strict mode, ignore invalid names. In non-strict mode, + * fall back to treating them as opaque byte sequences. + */ + if (res < 0 && !sb_has_strict_encoding(sb)) { + res = name->len == entry.len && + memcmp(name->name, entry.name, name->len) == 0; + } else { + /* utf8_strncasecmp_folded returns 0 on match */ + res = (res == 0); + } +out: + kfree(decrypted_name.name); + return res; +} +#endif /* CONFIG_UNICODE */ + +static inline int f2fs_match_name(const struct inode *dir, + const struct f2fs_filename *fname, + const u8 *de_name, u32 de_name_len) +{ + struct fscrypt_name f; + +#if IS_ENABLED(CONFIG_UNICODE) + if (fname->cf_name.name) { + struct qstr cf = FSTR_TO_QSTR(&fname->cf_name); + + return f2fs_match_ci_name(dir, &cf, de_name, de_name_len); + } +#endif + f.usr_fname = fname->usr_fname; + f.disk_name = fname->disk_name; +#ifdef CONFIG_FS_ENCRYPTION + f.crypto_buf = fname->crypto_buf; +#endif + return fscrypt_match_name(&f, de_name, de_name_len); +} + +struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, + const struct f2fs_filename *fname, int *max_slots) +{ + struct f2fs_dir_entry *de; + unsigned long bit_pos = 0; + int max_len = 0; + int res = 0; + + if (max_slots) + *max_slots = 0; + while (bit_pos < d->max) { + if (!test_bit_le(bit_pos, d->bitmap)) { + bit_pos++; + max_len++; + continue; + } + + de = &d->dentry[bit_pos]; + + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + + if (de->hash_code == fname->hash) { + res = f2fs_match_name(d->inode, fname, + d->filename[bit_pos], + le16_to_cpu(de->name_len)); + if (res < 0) + return ERR_PTR(res); + if (res) + goto found; + } + + if (max_slots && max_len > *max_slots) + *max_slots = max_len; + max_len = 0; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + } + + de = NULL; +found: + if (max_slots && max_len > *max_slots) + *max_slots = max_len; + return de; +} + +static struct f2fs_dir_entry *find_in_level(struct inode *dir, + unsigned int level, + const struct f2fs_filename *fname, + struct page **res_page) +{ + int s = GET_DENTRY_SLOTS(fname->disk_name.len); + unsigned int nbucket, nblock; + unsigned int bidx, end_block; + struct page *dentry_page; + struct f2fs_dir_entry *de = NULL; + pgoff_t next_pgofs; + bool room = false; + int max_slots; + + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); + nblock = bucket_blocks(level); + + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + le32_to_cpu(fname->hash) % nbucket); + end_block = bidx + nblock; + + while (bidx < end_block) { + /* no need to allocate new dentry pages to all the indices */ + dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); + if (IS_ERR(dentry_page)) { + if (PTR_ERR(dentry_page) == -ENOENT) { + room = true; + bidx = next_pgofs; + continue; + } else { + *res_page = dentry_page; + break; + } + } + + de = find_in_block(dir, dentry_page, fname, &max_slots); + if (IS_ERR(de)) { + *res_page = ERR_CAST(de); + de = NULL; + break; + } else if (de) { + *res_page = dentry_page; + break; + } + + if (max_slots >= s) + room = true; + f2fs_put_page(dentry_page, 0); + + bidx++; + } + + if (!de && room && F2FS_I(dir)->chash != fname->hash) { + F2FS_I(dir)->chash = fname->hash; + F2FS_I(dir)->clevel = level; + } + + return de; +} + +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + const struct f2fs_filename *fname, + struct page **res_page) +{ + unsigned long npages = dir_blocks(dir); + struct f2fs_dir_entry *de = NULL; + unsigned int max_depth; + unsigned int level; + + *res_page = NULL; + + if (f2fs_has_inline_dentry(dir)) { + de = f2fs_find_in_inline_dir(dir, fname, res_page); + goto out; + } + + if (npages == 0) + goto out; + + max_depth = F2FS_I(dir)->i_current_depth; + if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { + f2fs_warn(F2FS_I_SB(dir), "Corrupted max_depth of %lu: %u", + dir->i_ino, max_depth); + max_depth = MAX_DIR_HASH_DEPTH; + f2fs_i_depth_write(dir, max_depth); + } + + for (level = 0; level < max_depth; level++) { + de = find_in_level(dir, level, fname, res_page); + if (de || IS_ERR(*res_page)) + break; + } +out: + /* This is to increase the speed of f2fs_create */ + if (!de) + F2FS_I(dir)->task = current; + return de; +} + +/* + * Find an entry in the specified directory with the wanted name. + * It returns the page where the entry was found (as a parameter - res_page), + * and the entry itself. Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + const struct qstr *child, struct page **res_page) +{ + struct f2fs_dir_entry *de = NULL; + struct f2fs_filename fname; + int err; + + err = f2fs_setup_filename(dir, child, 1, &fname); + if (err) { + if (err == -ENOENT) + *res_page = NULL; + else + *res_page = ERR_PTR(err); + return NULL; + } + + de = __f2fs_find_entry(dir, &fname, res_page); + + f2fs_free_filename(&fname); + return de; +} + +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) +{ + return f2fs_find_entry(dir, &dotdot_name, p); +} + +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, + struct page **page) +{ + ino_t res = 0; + struct f2fs_dir_entry *de; + + de = f2fs_find_entry(dir, qstr, page); + if (de) { + res = le32_to_cpu(de->ino); + f2fs_put_page(*page, 0); + } + + return res; +} + +void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, + struct page *page, struct inode *inode) +{ + enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; + + lock_page(page); + f2fs_wait_on_page_writeback(page, type, true, true); + de->ino = cpu_to_le32(inode->i_ino); + set_de_type(de, inode->i_mode); + set_page_dirty(page); + + dir->i_mtime = dir->i_ctime = current_time(dir); + f2fs_mark_inode_dirty_sync(dir, false); + f2fs_put_page(page, 1); +} + +static void init_dent_inode(struct inode *dir, struct inode *inode, + const struct f2fs_filename *fname, + struct page *ipage) +{ + struct f2fs_inode *ri; + + if (!fname) /* tmpfile case? */ + return; + + f2fs_wait_on_page_writeback(ipage, NODE, true, true); + + /* copy name info. to this inode page */ + ri = F2FS_INODE(ipage); + ri->i_namelen = cpu_to_le32(fname->disk_name.len); + memcpy(ri->i_name, fname->disk_name.name, fname->disk_name.len); + if (IS_ENCRYPTED(dir)) { + file_set_enc_name(inode); + /* + * Roll-forward recovery doesn't have encryption keys available, + * so it can't compute the dirhash for encrypted+casefolded + * filenames. Append it to i_name if possible. Else, disable + * roll-forward recovery of the dentry (i.e., make fsync'ing the + * file force a checkpoint) by setting LOST_PINO. + */ + if (IS_CASEFOLDED(dir)) { + if (fname->disk_name.len + sizeof(f2fs_hash_t) <= + F2FS_NAME_LEN) + put_unaligned(fname->hash, (f2fs_hash_t *) + &ri->i_name[fname->disk_name.len]); + else + file_lost_pino(inode); + } + } + set_page_dirty(ipage); +} + +void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, + struct f2fs_dentry_ptr *d) +{ + struct fscrypt_str dot = FSTR_INIT(".", 1); + struct fscrypt_str dotdot = FSTR_INIT("..", 2); + + /* update dirent of "." */ + f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0); + + /* update dirent of ".." */ + f2fs_update_dentry(parent->i_ino, parent->i_mode, d, &dotdot, 0, 1); +} + +static int make_empty_dir(struct inode *inode, + struct inode *parent, struct page *page) +{ + struct page *dentry_page; + struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr d; + + if (f2fs_has_inline_dentry(inode)) + return f2fs_make_empty_inline_dir(inode, parent, page); + + dentry_page = f2fs_get_new_data_page(inode, page, 0, true); + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); + + dentry_blk = page_address(dentry_page); + + make_dentry_ptr_block(NULL, &d, dentry_blk); + f2fs_do_make_empty_dir(inode, parent, &d); + + set_page_dirty(dentry_page); + f2fs_put_page(dentry_page, 1); + return 0; +} + +struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, + const struct f2fs_filename *fname, struct page *dpage) +{ + struct page *page; + int err; + + if (is_inode_flag_set(inode, FI_NEW_INODE)) { + page = f2fs_new_inode_page(inode); + if (IS_ERR(page)) + return page; + + if (S_ISDIR(inode->i_mode)) { + /* in order to handle error case */ + get_page(page); + err = make_empty_dir(inode, dir, page); + if (err) { + lock_page(page); + goto put_error; + } + put_page(page); + } + + err = f2fs_init_acl(inode, dir, page, dpage); + if (err) + goto put_error; + + err = f2fs_init_security(inode, dir, + fname ? fname->usr_fname : NULL, page); + if (err) + goto put_error; + + if (IS_ENCRYPTED(inode)) { + err = fscrypt_set_context(inode, page); + if (err) + goto put_error; + } + } else { + page = f2fs_get_node_page(F2FS_I_SB(dir), inode->i_ino); + if (IS_ERR(page)) + return page; + } + + init_dent_inode(dir, inode, fname, page); + + /* + * This file should be checkpointed during fsync. + * We lost i_pino from now on. + */ + if (is_inode_flag_set(inode, FI_INC_LINK)) { + if (!S_ISDIR(inode->i_mode)) + file_lost_pino(inode); + /* + * If link the tmpfile to alias through linkat path, + * we should remove this inode from orphan list. + */ + if (inode->i_nlink == 0) + f2fs_remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); + f2fs_i_links_write(inode, true); + } + return page; + +put_error: + clear_nlink(inode); + f2fs_update_inode(inode, page); + f2fs_put_page(page, 1); + return ERR_PTR(err); +} + +void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, + unsigned int current_depth) +{ + if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) { + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, true); + clear_inode_flag(inode, FI_NEW_INODE); + } + dir->i_mtime = dir->i_ctime = current_time(dir); + f2fs_mark_inode_dirty_sync(dir, false); + + if (F2FS_I(dir)->i_current_depth != current_depth) + f2fs_i_depth_write(dir, current_depth); + + if (inode && is_inode_flag_set(inode, FI_INC_LINK)) + clear_inode_flag(inode, FI_INC_LINK); +} + +int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots) +{ + int bit_start = 0; + int zero_start, zero_end; +next: + zero_start = find_next_zero_bit_le(bitmap, max_slots, bit_start); + if (zero_start >= max_slots) + return max_slots; + + zero_end = find_next_bit_le(bitmap, max_slots, zero_start); + if (zero_end - zero_start >= slots) + return zero_start; + + bit_start = zero_end + 1; + + if (zero_end + 1 >= max_slots) + return max_slots; + goto next; +} + +bool f2fs_has_enough_room(struct inode *dir, struct page *ipage, + const struct f2fs_filename *fname) +{ + struct f2fs_dentry_ptr d; + unsigned int bit_pos; + int slots = GET_DENTRY_SLOTS(fname->disk_name.len); + + make_dentry_ptr_inline(dir, &d, inline_data_addr(dir, ipage)); + + bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); + + return bit_pos < d.max; +} + +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, + const struct fscrypt_str *name, f2fs_hash_t name_hash, + unsigned int bit_pos) +{ + struct f2fs_dir_entry *de; + int slots = GET_DENTRY_SLOTS(name->len); + int i; + + de = &d->dentry[bit_pos]; + de->hash_code = name_hash; + de->name_len = cpu_to_le16(name->len); + memcpy(d->filename[bit_pos], name->name, name->len); + de->ino = cpu_to_le32(ino); + set_de_type(de, mode); + for (i = 0; i < slots; i++) { + __set_bit_le(bit_pos + i, (void *)d->bitmap); + /* avoid wrong garbage data for readdir */ + if (i) + (de + i)->name_len = 0; + } +} + +int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname, + struct inode *inode, nid_t ino, umode_t mode) +{ + unsigned int bit_pos; + unsigned int level; + unsigned int current_depth; + unsigned long bidx, block; + unsigned int nbucket, nblock; + struct page *dentry_page = NULL; + struct f2fs_dentry_block *dentry_blk = NULL; + struct f2fs_dentry_ptr d; + struct page *page = NULL; + int slots, err = 0; + + level = 0; + slots = GET_DENTRY_SLOTS(fname->disk_name.len); + + current_depth = F2FS_I(dir)->i_current_depth; + if (F2FS_I(dir)->chash == fname->hash) { + level = F2FS_I(dir)->clevel; + F2FS_I(dir)->chash = 0; + } + +start: + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { + f2fs_show_injection_info(F2FS_I_SB(dir), FAULT_DIR_DEPTH); + return -ENOSPC; + } + + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) + return -ENOSPC; + + /* Increase the depth, if required */ + if (level == current_depth) + ++current_depth; + + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); + nblock = bucket_blocks(level); + + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + (le32_to_cpu(fname->hash) % nbucket)); + + for (block = bidx; block <= (bidx + nblock - 1); block++) { + dentry_page = f2fs_get_new_data_page(dir, NULL, block, true); + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); + + dentry_blk = page_address(dentry_page); + bit_pos = f2fs_room_for_filename(&dentry_blk->dentry_bitmap, + slots, NR_DENTRY_IN_BLOCK); + if (bit_pos < NR_DENTRY_IN_BLOCK) + goto add_dentry; + + f2fs_put_page(dentry_page, 1); + } + + /* Move to next level to find the empty slot for new dentry */ + ++level; + goto start; +add_dentry: + f2fs_wait_on_page_writeback(dentry_page, DATA, true, true); + + if (inode) { + f2fs_down_write(&F2FS_I(inode)->i_sem); + page = f2fs_init_inode_metadata(inode, dir, fname, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } + } + + make_dentry_ptr_block(NULL, &d, dentry_blk); + f2fs_update_dentry(ino, mode, &d, &fname->disk_name, fname->hash, + bit_pos); + + set_page_dirty(dentry_page); + + if (inode) { + f2fs_i_pino_write(inode, dir->i_ino); + + /* synchronize inode page's data from inode cache */ + if (is_inode_flag_set(inode, FI_NEW_INODE)) + f2fs_update_inode(inode, page); + + f2fs_put_page(page, 1); + } + + f2fs_update_parent_metadata(dir, inode, current_depth); +fail: + if (inode) + f2fs_up_write(&F2FS_I(inode)->i_sem); + + f2fs_put_page(dentry_page, 1); + + return err; +} + +int f2fs_add_dentry(struct inode *dir, const struct f2fs_filename *fname, + struct inode *inode, nid_t ino, umode_t mode) +{ + int err = -EAGAIN; + + if (f2fs_has_inline_dentry(dir)) { + /* + * Should get i_xattr_sem to keep the lock order: + * i_xattr_sem -> inode_page lock used by f2fs_setxattr. + */ + f2fs_down_read(&F2FS_I(dir)->i_xattr_sem); + err = f2fs_add_inline_entry(dir, fname, inode, ino, mode); + f2fs_up_read(&F2FS_I(dir)->i_xattr_sem); + } + if (err == -EAGAIN) + err = f2fs_add_regular_entry(dir, fname, inode, ino, mode); + + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + return err; +} + +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + */ +int f2fs_do_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct f2fs_filename fname; + struct page *page = NULL; + struct f2fs_dir_entry *de = NULL; + int err; + + err = f2fs_setup_filename(dir, name, 0, &fname); + if (err) + return err; + + /* + * An immature stackable filesystem shows a race condition between lookup + * and create. If we have same task when doing lookup and create, it's + * definitely fine as expected by VFS normally. Otherwise, let's just + * verify on-disk dentry one more time, which guarantees filesystem + * consistency more. + */ + if (current != F2FS_I(dir)->task) { + de = __f2fs_find_entry(dir, &fname, &page); + F2FS_I(dir)->task = NULL; + } + if (de) { + f2fs_put_page(page, 0); + err = -EEXIST; + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + } else { + err = f2fs_add_dentry(dir, &fname, inode, ino, mode); + } + f2fs_free_filename(&fname); + return err; +} + +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) +{ + struct page *page; + int err = 0; + + f2fs_down_write(&F2FS_I(inode)->i_sem); + page = f2fs_init_inode_metadata(inode, dir, NULL, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } + f2fs_put_page(page, 1); + + clear_inode_flag(inode, FI_NEW_INODE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); +fail: + f2fs_up_write(&F2FS_I(inode)->i_sem); + return err; +} + +void f2fs_drop_nlink(struct inode *dir, struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + + f2fs_down_write(&F2FS_I(inode)->i_sem); + + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, false); + inode->i_ctime = current_time(inode); + + f2fs_i_links_write(inode, false); + if (S_ISDIR(inode->i_mode)) { + f2fs_i_links_write(inode, false); + f2fs_i_size_write(inode, 0); + } + f2fs_up_write(&F2FS_I(inode)->i_sem); + + if (inode->i_nlink == 0) + f2fs_add_orphan_inode(inode); + else + f2fs_release_orphan_inode(sbi); +} + +/* + * It only removes the dentry from the dentry page, corresponding name + * entry in name page does not need to be touched during deletion. + */ +void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode) +{ + struct f2fs_dentry_block *dentry_blk; + unsigned int bit_pos; + int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + int i; + + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + + if (F2FS_OPTION(F2FS_I_SB(dir)).fsync_mode == FSYNC_MODE_STRICT) + f2fs_add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); + + if (f2fs_has_inline_dentry(dir)) + return f2fs_delete_inline_entry(dentry, page, dir, inode); + + lock_page(page); + f2fs_wait_on_page_writeback(page, DATA, true, true); + + dentry_blk = page_address(page); + bit_pos = dentry - dentry_blk->dentry; + for (i = 0; i < slots; i++) + __clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + + /* Let's check and deallocate this dentry page */ + bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, + NR_DENTRY_IN_BLOCK, + 0); + set_page_dirty(page); + + if (bit_pos == NR_DENTRY_IN_BLOCK && + !f2fs_truncate_hole(dir, page->index, page->index + 1)) { + f2fs_clear_page_cache_dirty_tag(page); + clear_page_dirty_for_io(page); + ClearPageUptodate(page); + + clear_page_private_gcing(page); + + inode_dec_dirty_pages(dir); + f2fs_remove_dirty_inode(dir); + + detach_page_private(page); + set_page_private(page, 0); + } + f2fs_put_page(page, 1); + + dir->i_ctime = dir->i_mtime = current_time(dir); + f2fs_mark_inode_dirty_sync(dir, false); + + if (inode) + f2fs_drop_nlink(dir, inode); +} + +bool f2fs_empty_dir(struct inode *dir) +{ + unsigned long bidx = 0; + struct page *dentry_page; + unsigned int bit_pos; + struct f2fs_dentry_block *dentry_blk; + unsigned long nblock = dir_blocks(dir); + + if (f2fs_has_inline_dentry(dir)) + return f2fs_empty_inline_dir(dir); + + while (bidx < nblock) { + pgoff_t next_pgofs; + + dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); + if (IS_ERR(dentry_page)) { + if (PTR_ERR(dentry_page) == -ENOENT) { + bidx = next_pgofs; + continue; + } else { + return false; + } + } + + dentry_blk = page_address(dentry_page); + if (bidx == 0) + bit_pos = 2; + else + bit_pos = 0; + bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, + NR_DENTRY_IN_BLOCK, + bit_pos); + + f2fs_put_page(dentry_page, 0); + + if (bit_pos < NR_DENTRY_IN_BLOCK) + return false; + + bidx++; + } + return true; +} + +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, + unsigned int start_pos, struct fscrypt_str *fstr) +{ + unsigned char d_type = DT_UNKNOWN; + unsigned int bit_pos; + struct f2fs_dir_entry *de = NULL; + struct fscrypt_str de_name = FSTR_INIT(NULL, 0); + struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); + struct blk_plug plug; + bool readdir_ra = sbi->readdir_ra == 1; + bool found_valid_dirent = false; + int err = 0; + + bit_pos = ((unsigned long)ctx->pos % d->max); + + if (readdir_ra) + blk_start_plug(&plug); + + while (bit_pos < d->max) { + bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos); + if (bit_pos >= d->max) + break; + + de = &d->dentry[bit_pos]; + if (de->name_len == 0) { + if (found_valid_dirent || !bit_pos) { + printk_ratelimited( + "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.", + KERN_WARNING, sbi->sb->s_id, + le32_to_cpu(de->ino)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + bit_pos++; + ctx->pos = start_pos + bit_pos; + continue; + } + + d_type = f2fs_get_de_type(de); + + de_name.name = d->filename[bit_pos]; + de_name.len = le16_to_cpu(de->name_len); + + /* check memory boundary before moving forward */ + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + if (unlikely(bit_pos > d->max || + le16_to_cpu(de->name_len) > F2FS_NAME_LEN)) { + f2fs_warn(sbi, "%s: corrupted namelen=%d, run fsck to fix.", + __func__, le16_to_cpu(de->name_len)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_DIRENT); + goto out; + } + + if (IS_ENCRYPTED(d->inode)) { + int save_len = fstr->len; + + err = fscrypt_fname_disk_to_usr(d->inode, + (u32)le32_to_cpu(de->hash_code), + 0, &de_name, fstr); + if (err) + goto out; + + de_name = *fstr; + fstr->len = save_len; + } + + if (!dir_emit(ctx, de_name.name, de_name.len, + le32_to_cpu(de->ino), d_type)) { + err = 1; + goto out; + } + + if (readdir_ra) + f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); + + ctx->pos = start_pos + bit_pos; + found_valid_dirent = true; + } +out: + if (readdir_ra) + blk_finish_plug(&plug); + return err; +} + +static int f2fs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + unsigned long npages = dir_blocks(inode); + struct f2fs_dentry_block *dentry_blk = NULL; + struct page *dentry_page = NULL; + struct file_ra_state *ra = &file->f_ra; + loff_t start_pos = ctx->pos; + unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); + struct f2fs_dentry_ptr d; + struct fscrypt_str fstr = FSTR_INIT(NULL, 0); + int err = 0; + + if (IS_ENCRYPTED(inode)) { + err = fscrypt_prepare_readdir(inode); + if (err) + goto out; + + err = fscrypt_fname_alloc_buffer(F2FS_NAME_LEN, &fstr); + if (err < 0) + goto out; + } + + if (f2fs_has_inline_dentry(inode)) { + err = f2fs_read_inline_dir(file, ctx, &fstr); + goto out_free; + } + + for (; n < npages; ctx->pos = n * NR_DENTRY_IN_BLOCK) { + pgoff_t next_pgofs; + + /* allow readdir() to be interrupted */ + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto out_free; + } + cond_resched(); + + /* readahead for multi pages of dir */ + if (npages - n > 1 && !ra_has_index(ra, n)) + page_cache_sync_readahead(inode->i_mapping, ra, file, n, + min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); + + dentry_page = f2fs_find_data_page(inode, n, &next_pgofs); + if (IS_ERR(dentry_page)) { + err = PTR_ERR(dentry_page); + if (err == -ENOENT) { + err = 0; + n = next_pgofs; + continue; + } else { + goto out_free; + } + } + + dentry_blk = page_address(dentry_page); + + make_dentry_ptr_block(inode, &d, dentry_blk); + + err = f2fs_fill_dentries(ctx, &d, + n * NR_DENTRY_IN_BLOCK, &fstr); + if (err) { + f2fs_put_page(dentry_page, 0); + break; + } + + f2fs_put_page(dentry_page, 0); + + n++; + } +out_free: + fscrypt_fname_free_buffer(&fstr); +out: + trace_f2fs_readdir(inode, start_pos, ctx->pos, err); + return err < 0 ? err : 0; +} + +const struct file_operations f2fs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = f2fs_readdir, + .fsync = f2fs_sync_file, + .unlocked_ioctl = f2fs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = f2fs_compat_ioctl, +#endif +}; diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c new file mode 100644 index 000000000..16692c96e --- /dev/null +++ b/fs/f2fs/extent_cache.c @@ -0,0 +1,1061 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs extent cache support + * + * Copyright (c) 2015 Motorola Mobility + * Copyright (c) 2015 Samsung Electronics + * Authors: Jaegeuk Kim <jaegeuk@kernel.org> + * Chao Yu <chao2.yu@samsung.com> + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "node.h" +#include <trace/events/f2fs.h> + +bool sanity_check_extent_cache(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct extent_info *ei; + + if (!fi->extent_tree[EX_READ]) + return true; + + ei = &fi->extent_tree[EX_READ]->largest; + + if (ei->len && + (!f2fs_is_valid_blkaddr(sbi, ei->blk, + DATA_GENERIC_ENHANCE) || + !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, + DATA_GENERIC_ENHANCE))) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix", + __func__, inode->i_ino, + ei->blk, ei->fofs, ei->len); + return false; + } + return true; +} + +static void __set_extent_info(struct extent_info *ei, + unsigned int fofs, unsigned int len, + block_t blk, bool keep_clen, + enum extent_type type) +{ + ei->fofs = fofs; + ei->len = len; + + if (type == EX_READ) { + ei->blk = blk; + if (keep_clen) + return; +#ifdef CONFIG_F2FS_FS_COMPRESSION + ei->c_len = 0; +#endif + } +} + +static bool __may_read_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, READ_EXTENT_CACHE)) + return false; + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return false; + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi)) + return false; + return S_ISREG(inode->i_mode); +} + +static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) +{ + if (type == EX_READ) + return __may_read_extent_tree(inode); + return false; +} + +static bool __may_extent_tree(struct inode *inode, enum extent_type type) +{ + /* + * for recovered files during mount do not create extents + * if shrinker is not registered. + */ + if (list_empty(&F2FS_I_SB(inode)->s_list)) + return false; + + return __init_may_extent_tree(inode, type); +} + +static void __try_update_largest_extent(struct extent_tree *et, + struct extent_node *en) +{ + if (et->type != EX_READ) + return; + if (en->ei.len <= et->largest.len) + return; + + et->largest = en->ei; + et->largest_updated = true; +} + +static bool __is_extent_mergeable(struct extent_info *back, + struct extent_info *front, enum extent_type type) +{ + if (type == EX_READ) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; +#endif + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); + } + return false; +} + +static bool __is_back_mergeable(struct extent_info *cur, + struct extent_info *back, enum extent_type type) +{ + return __is_extent_mergeable(back, cur, type); +} + +static bool __is_front_mergeable(struct extent_info *cur, + struct extent_info *front, enum extent_type type) +{ + return __is_extent_mergeable(cur, front, type); +} + +static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, + unsigned int ofs) +{ + if (cached_re) { + if (cached_re->ofs <= ofs && + cached_re->ofs + cached_re->len > ofs) { + return cached_re; + } + } + return NULL; +} + +static struct rb_entry *__lookup_rb_tree_slow(struct rb_root_cached *root, + unsigned int ofs) +{ + struct rb_node *node = root->rb_root.rb_node; + struct rb_entry *re; + + while (node) { + re = rb_entry(node, struct rb_entry, rb_node); + + if (ofs < re->ofs) + node = node->rb_left; + else if (ofs >= re->ofs + re->len) + node = node->rb_right; + else + return re; + } + return NULL; +} + +struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, + struct rb_entry *cached_re, unsigned int ofs) +{ + struct rb_entry *re; + + re = __lookup_rb_tree_fast(cached_re, ofs); + if (!re) + return __lookup_rb_tree_slow(root, ofs); + + return re; +} + +struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root_cached *root, + struct rb_node **parent, + unsigned int ofs, bool *leftmost) +{ + struct rb_node **p = &root->rb_root.rb_node; + struct rb_entry *re; + + while (*p) { + *parent = *p; + re = rb_entry(*parent, struct rb_entry, rb_node); + + if (ofs < re->ofs) { + p = &(*p)->rb_left; + } else if (ofs >= re->ofs + re->len) { + p = &(*p)->rb_right; + *leftmost = false; + } else { + f2fs_bug_on(sbi, 1); + } + } + + return p; +} + +/* + * lookup rb entry in position of @ofs in rb-tree, + * if hit, return the entry, otherwise, return NULL + * @prev_ex: extent before ofs + * @next_ex: extent after ofs + * @insert_p: insert point for new extent at ofs + * in order to simpfy the insertion after. + * tree must stay unchanged between lookup and insertion. + */ +struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, + struct rb_entry *cached_re, + unsigned int ofs, + struct rb_entry **prev_entry, + struct rb_entry **next_entry, + struct rb_node ***insert_p, + struct rb_node **insert_parent, + bool force, bool *leftmost) +{ + struct rb_node **pnode = &root->rb_root.rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct rb_entry *re = cached_re; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_entry = NULL; + *next_entry = NULL; + + if (RB_EMPTY_ROOT(&root->rb_root)) + return NULL; + + if (re) { + if (re->ofs <= ofs && re->ofs + re->len > ofs) + goto lookup_neighbors; + } + + if (leftmost) + *leftmost = true; + + while (*pnode) { + parent = *pnode; + re = rb_entry(*pnode, struct rb_entry, rb_node); + + if (ofs < re->ofs) { + pnode = &(*pnode)->rb_left; + } else if (ofs >= re->ofs + re->len) { + pnode = &(*pnode)->rb_right; + if (leftmost) + *leftmost = false; + } else { + goto lookup_neighbors; + } + } + + *insert_p = pnode; + *insert_parent = parent; + + re = rb_entry(parent, struct rb_entry, rb_node); + tmp_node = parent; + if (parent && ofs > re->ofs) + tmp_node = rb_next(parent); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + + tmp_node = parent; + if (parent && ofs < re->ofs) + tmp_node = rb_prev(parent); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + return NULL; + +lookup_neighbors: + if (ofs == re->ofs || force) { + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&re->rb_node); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + if (ofs == re->ofs + re->len - 1 || force) { + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&re->rb_node); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + return re; +} + +bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root_cached *root) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct rb_node *cur = rb_first_cached(root), *next; + struct rb_entry *cur_re, *next_re; + + if (!cur) + return true; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_re = rb_entry(cur, struct rb_entry, rb_node); + next_re = rb_entry(next, struct rb_entry, rb_node); + + if (cur_re->ofs + cur_re->len > next_re->ofs) { + f2fs_info(sbi, "inconsistent rbtree, cur(%u, %u) next(%u, %u)", + cur_re->ofs, cur_re->len, + next_re->ofs, next_re->len); + return false; + } + cur = next; + } +#endif + return true; +} + +static struct kmem_cache *extent_tree_slab; +static struct kmem_cache *extent_node_slab; + +static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct rb_node *parent, struct rb_node **p, + bool leftmost) +{ + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + struct extent_node *en; + + en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi); + if (!en) + return NULL; + + en->ei = *ei; + INIT_LIST_HEAD(&en->list); + en->et = et; + + rb_link_node(&en->rb_node, parent, p); + rb_insert_color_cached(&en->rb_node, &et->root, leftmost); + atomic_inc(&et->node_cnt); + atomic_inc(&eti->total_ext_node); + return en; +} + +static void __detach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + + rb_erase_cached(&en->rb_node, &et->root); + atomic_dec(&et->node_cnt); + atomic_dec(&eti->total_ext_node); + + if (et->cached_en == en) + et->cached_en = NULL; + kmem_cache_free(extent_node_slab, en); +} + +/* + * Flow to release an extent_node: + * 1. list_del_init + * 2. __detach_extent_node + * 3. kmem_cache_free. + */ +static void __release_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + + spin_lock(&eti->extent_lock); + f2fs_bug_on(sbi, list_empty(&en->list)); + list_del_init(&en->list); + spin_unlock(&eti->extent_lock); + + __detach_extent_node(sbi, et, en); +} + +static struct extent_tree *__grab_extent_tree(struct inode *inode, + enum extent_type type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et; + nid_t ino = inode->i_ino; + + mutex_lock(&eti->extent_tree_lock); + et = radix_tree_lookup(&eti->extent_tree_root, ino); + if (!et) { + et = f2fs_kmem_cache_alloc(extent_tree_slab, + GFP_NOFS, true, NULL); + f2fs_radix_tree_insert(&eti->extent_tree_root, ino, et); + memset(et, 0, sizeof(struct extent_tree)); + et->ino = ino; + et->type = type; + et->root = RB_ROOT_CACHED; + et->cached_en = NULL; + rwlock_init(&et->lock); + INIT_LIST_HEAD(&et->list); + atomic_set(&et->node_cnt, 0); + atomic_inc(&eti->total_ext_tree); + } else { + atomic_dec(&eti->total_zombie_tree); + list_del_init(&et->list); + } + mutex_unlock(&eti->extent_tree_lock); + + /* never died until evict_inode */ + F2FS_I(inode)->extent_tree[type] = et; + + return et; +} + +static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et) +{ + struct rb_node *node, *next; + struct extent_node *en; + unsigned int count = atomic_read(&et->node_cnt); + + node = rb_first_cached(&et->root); + while (node) { + next = rb_next(node); + en = rb_entry(node, struct extent_node, rb_node); + __release_extent_node(sbi, et, en); + node = next; + } + + return count - atomic_read(&et->node_cnt); +} + +static void __drop_largest_extent(struct extent_tree *et, + pgoff_t fofs, unsigned int len) +{ + if (fofs < et->largest.fofs + et->largest.len && + fofs + len > et->largest.fofs) { + et->largest.len = 0; + et->largest_updated = true; + } +} + +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext; + struct extent_tree *et; + struct extent_node *en; + struct extent_info ei; + + if (!__may_extent_tree(inode, EX_READ)) { + /* drop largest read extent */ + if (i_ext && i_ext->len) { + f2fs_wait_on_page_writeback(ipage, NODE, true, true); + i_ext->len = 0; + set_page_dirty(ipage); + } + goto out; + } + + et = __grab_extent_tree(inode, EX_READ); + + if (!i_ext || !i_ext->len) + goto out; + + get_read_extent_info(&ei, i_ext); + + write_lock(&et->lock); + if (atomic_read(&et->node_cnt)) + goto unlock_out; + + en = __attach_extent_node(sbi, et, &ei, NULL, + &et->root.rb_root.rb_node, true); + if (en) { + et->largest = en->ei; + et->cached_en = en; + + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); + spin_unlock(&eti->extent_lock); + } +unlock_out: + write_unlock(&et->lock); +out: + if (!F2FS_I(inode)->extent_tree[EX_READ]) + set_inode_flag(inode, FI_NO_EXTENT); +} + +void f2fs_init_extent_tree(struct inode *inode) +{ + /* initialize read cache */ + if (__init_may_extent_tree(inode, EX_READ)) + __grab_extent_tree(inode, EX_READ); +} + +static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei, enum extent_type type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; + struct extent_node *en; + bool ret = false; + + if (!et) + return false; + + trace_f2fs_lookup_extent_tree_start(inode, pgofs, type); + + read_lock(&et->lock); + + if (type == EX_READ && + et->largest.fofs <= pgofs && + et->largest.fofs + et->largest.len > pgofs) { + *ei = et->largest; + ret = true; + stat_inc_largest_node_hit(sbi); + goto out; + } + + en = (struct extent_node *)f2fs_lookup_rb_tree(&et->root, + (struct rb_entry *)et->cached_en, pgofs); + if (!en) + goto out; + + if (en == et->cached_en) + stat_inc_cached_node_hit(sbi, type); + else + stat_inc_rbtree_node_hit(sbi, type); + + *ei = en->ei; + spin_lock(&eti->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &eti->extent_list); + et->cached_en = en; + } + spin_unlock(&eti->extent_lock); + ret = true; +out: + stat_inc_total_hit(sbi, type); + read_unlock(&et->lock); + + if (type == EX_READ) + trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei); + return ret; +} + +static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct extent_node *prev_ex, + struct extent_node *next_ex) +{ + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + struct extent_node *en = NULL; + + if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei, et->type)) { + prev_ex->ei.len += ei->len; + ei = &prev_ex->ei; + en = prev_ex; + } + + if (next_ex && __is_front_mergeable(ei, &next_ex->ei, et->type)) { + next_ex->ei.fofs = ei->fofs; + next_ex->ei.len += ei->len; + if (et->type == EX_READ) + next_ex->ei.blk = ei->blk; + if (en) + __release_extent_node(sbi, et, prev_ex); + + en = next_ex; + } + + if (!en) + return NULL; + + __try_update_largest_extent(et, en); + + spin_lock(&eti->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &eti->extent_list); + et->cached_en = en; + } + spin_unlock(&eti->extent_lock); + return en; +} + +static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct rb_node **insert_p, + struct rb_node *insert_parent, + bool leftmost) +{ + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + struct rb_node **p; + struct rb_node *parent = NULL; + struct extent_node *en = NULL; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; + } + + leftmost = true; + + p = f2fs_lookup_rb_tree_for_insert(sbi, &et->root, &parent, + ei->fofs, &leftmost); +do_insert: + en = __attach_extent_node(sbi, et, ei, parent, p, leftmost); + if (!en) + return NULL; + + __try_update_largest_extent(et, en); + + /* update in global extent list */ + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); + et->cached_en = en; + spin_unlock(&eti->extent_lock); + return en; +} + +static void __update_extent_tree_range(struct inode *inode, + struct extent_info *tei, enum extent_type type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; + struct extent_node *en = NULL, *en1 = NULL; + struct extent_node *prev_en = NULL, *next_en = NULL; + struct extent_info ei, dei, prev; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + unsigned int fofs = tei->fofs, len = tei->len; + unsigned int end = fofs + len; + bool updated = false; + bool leftmost = false; + + if (!et) + return; + + if (type == EX_READ) + trace_f2fs_update_read_extent_tree_range(inode, fofs, len, + tei->blk, 0); + write_lock(&et->lock); + + if (type == EX_READ) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) { + write_unlock(&et->lock); + return; + } + + prev = et->largest; + dei.len = 0; + + /* + * drop largest extent before lookup, in case it's already + * been shrunk from extent tree + */ + __drop_largest_extent(et, fofs, len); + } + + /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ + en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, + (struct rb_entry *)et->cached_en, fofs, + (struct rb_entry **)&prev_en, + (struct rb_entry **)&next_en, + &insert_p, &insert_parent, false, + &leftmost); + if (!en) + en = next_en; + + /* 2. invlidate all extent nodes in range [fofs, fofs + len - 1] */ + while (en && en->ei.fofs < end) { + unsigned int org_end; + int parts = 0; /* # of parts current extent split into */ + + next_en = en1 = NULL; + + dei = en->ei; + org_end = dei.fofs + dei.len; + f2fs_bug_on(sbi, fofs >= org_end); + + if (fofs > dei.fofs && (type != EX_READ || + fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN)) { + en->ei.len = fofs - en->ei.fofs; + prev_en = en; + parts = 1; + } + + if (end < org_end && (type != EX_READ || + org_end - end >= F2FS_MIN_EXTENT_LEN)) { + if (parts) { + __set_extent_info(&ei, + end, org_end - end, + end - dei.fofs + dei.blk, false, + type); + en1 = __insert_extent_tree(sbi, et, &ei, + NULL, NULL, true); + next_en = en1; + } else { + __set_extent_info(&en->ei, + end, en->ei.len - (end - dei.fofs), + en->ei.blk + (end - dei.fofs), true, + type); + next_en = en; + } + parts++; + } + + if (!next_en) { + struct rb_node *node = rb_next(&en->rb_node); + + next_en = rb_entry_safe(node, struct extent_node, + rb_node); + } + + if (parts) + __try_update_largest_extent(et, en); + else + __release_extent_node(sbi, et, en); + + /* + * if original extent is split into zero or two parts, extent + * tree has been altered by deletion or insertion, therefore + * invalidate pointers regard to tree. + */ + if (parts != 1) { + insert_p = NULL; + insert_parent = NULL; + } + en = next_en; + } + + /* 3. update extent in read extent cache */ + BUG_ON(type != EX_READ); + + if (tei->blk) { + __set_extent_info(&ei, fofs, len, tei->blk, false, EX_READ); + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent, leftmost); + + /* give up extent_cache, if split and small updates happen */ + if (dei.len >= 1 && + prev.len < F2FS_MIN_EXTENT_LEN && + et->largest.len < F2FS_MIN_EXTENT_LEN) { + et->largest.len = 0; + et->largest_updated = true; + set_inode_flag(inode, FI_NO_EXTENT); + } + } + + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + __free_extent_tree(sbi, et); + + if (et->largest_updated) { + et->largest_updated = false; + updated = true; + } + + write_unlock(&et->lock); + + if (updated) + f2fs_mark_inode_dirty_sync(inode, true); +} + +#ifdef CONFIG_F2FS_FS_COMPRESSION +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int llen, + unsigned int c_len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; + struct extent_node *en = NULL; + struct extent_node *prev_en = NULL, *next_en = NULL; + struct extent_info ei; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + bool leftmost = false; + + trace_f2fs_update_read_extent_tree_range(inode, fofs, llen, + blkaddr, c_len); + + /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return; + + write_lock(&et->lock); + + en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, + (struct rb_entry *)et->cached_en, fofs, + (struct rb_entry **)&prev_en, + (struct rb_entry **)&next_en, + &insert_p, &insert_parent, false, + &leftmost); + if (en) + goto unlock_out; + + __set_extent_info(&ei, fofs, llen, blkaddr, true, EX_READ); + ei.c_len = c_len; + + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent, leftmost); +unlock_out: + write_unlock(&et->lock); +} +#endif + +static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) +{ + struct extent_info ei; + + if (!__may_extent_tree(dn->inode, type)) + return; + + ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + dn->ofs_in_node; + ei.len = 1; + + if (type == EX_READ) { + if (dn->data_blkaddr == NEW_ADDR) + ei.blk = NULL_ADDR; + else + ei.blk = dn->data_blkaddr; + } + __update_extent_tree_range(dn->inode, &ei, type); +} + +static unsigned int __shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink, + enum extent_type type) +{ + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et, *next; + struct extent_node *en; + unsigned int node_cnt = 0, tree_cnt = 0; + int remained; + + if (!atomic_read(&eti->total_zombie_tree)) + goto free_node; + + if (!mutex_trylock(&eti->extent_tree_lock)) + goto out; + + /* 1. remove unreferenced extent tree */ + list_for_each_entry_safe(et, next, &eti->zombie_list, list) { + if (atomic_read(&et->node_cnt)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et); + write_unlock(&et->lock); + } + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); + list_del_init(&et->list); + radix_tree_delete(&eti->extent_tree_root, et->ino); + kmem_cache_free(extent_tree_slab, et); + atomic_dec(&eti->total_ext_tree); + atomic_dec(&eti->total_zombie_tree); + tree_cnt++; + + if (node_cnt + tree_cnt >= nr_shrink) + goto unlock_out; + cond_resched(); + } + mutex_unlock(&eti->extent_tree_lock); + +free_node: + /* 2. remove LRU extent entries */ + if (!mutex_trylock(&eti->extent_tree_lock)) + goto out; + + remained = nr_shrink - (node_cnt + tree_cnt); + + spin_lock(&eti->extent_lock); + for (; remained > 0; remained--) { + if (list_empty(&eti->extent_list)) + break; + en = list_first_entry(&eti->extent_list, + struct extent_node, list); + et = en->et; + if (!write_trylock(&et->lock)) { + /* refresh this extent node's position in extent list */ + list_move_tail(&en->list, &eti->extent_list); + continue; + } + + list_del_init(&en->list); + spin_unlock(&eti->extent_lock); + + __detach_extent_node(sbi, et, en); + + write_unlock(&et->lock); + node_cnt++; + spin_lock(&eti->extent_lock); + } + spin_unlock(&eti->extent_lock); + +unlock_out: + mutex_unlock(&eti->extent_tree_lock); +out: + trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt, type); + + return node_cnt + tree_cnt; +} + +/* read extent cache operations */ +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_READ)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_READ); +} + +void f2fs_update_read_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_READ); +} + +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + .blk = blkaddr, + }; + + if (!__may_extent_tree(dn->inode, EX_READ)) + return; + + __update_extent_tree_range(dn->inode, &ei, EX_READ); +} + +unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if (!test_opt(sbi, READ_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_READ); +} + +static unsigned int __destroy_extent_node(struct inode *inode, + enum extent_type type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; + unsigned int node_cnt = 0; + + if (!et || !atomic_read(&et->node_cnt)) + return 0; + + write_lock(&et->lock); + node_cnt = __free_extent_tree(sbi, et); + write_unlock(&et->lock); + + return node_cnt; +} + +void f2fs_destroy_extent_node(struct inode *inode) +{ + __destroy_extent_node(inode, EX_READ); +} + +static void __drop_extent_tree(struct inode *inode, enum extent_type type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; + bool updated = false; + + if (!__may_extent_tree(inode, type)) + return; + + write_lock(&et->lock); + __free_extent_tree(sbi, et); + if (type == EX_READ) { + set_inode_flag(inode, FI_NO_EXTENT); + if (et->largest.len) { + et->largest.len = 0; + updated = true; + } + } + write_unlock(&et->lock); + if (updated) + f2fs_mark_inode_dirty_sync(inode, true); +} + +void f2fs_drop_extent_tree(struct inode *inode) +{ + __drop_extent_tree(inode, EX_READ); +} + +static void __destroy_extent_tree(struct inode *inode, enum extent_type type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; + unsigned int node_cnt = 0; + + if (!et) + return; + + if (inode->i_nlink && !is_bad_inode(inode) && + atomic_read(&et->node_cnt)) { + mutex_lock(&eti->extent_tree_lock); + list_add_tail(&et->list, &eti->zombie_list); + atomic_inc(&eti->total_zombie_tree); + mutex_unlock(&eti->extent_tree_lock); + return; + } + + /* free all extent info belong to this extent tree */ + node_cnt = __destroy_extent_node(inode, type); + + /* delete extent tree entry in radix tree */ + mutex_lock(&eti->extent_tree_lock); + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); + radix_tree_delete(&eti->extent_tree_root, inode->i_ino); + kmem_cache_free(extent_tree_slab, et); + atomic_dec(&eti->total_ext_tree); + mutex_unlock(&eti->extent_tree_lock); + + F2FS_I(inode)->extent_tree[type] = NULL; + + trace_f2fs_destroy_extent_tree(inode, node_cnt, type); +} + +void f2fs_destroy_extent_tree(struct inode *inode) +{ + __destroy_extent_tree(inode, EX_READ); +} + +static void __init_extent_tree_info(struct extent_tree_info *eti) +{ + INIT_RADIX_TREE(&eti->extent_tree_root, GFP_NOIO); + mutex_init(&eti->extent_tree_lock); + INIT_LIST_HEAD(&eti->extent_list); + spin_lock_init(&eti->extent_lock); + atomic_set(&eti->total_ext_tree, 0); + INIT_LIST_HEAD(&eti->zombie_list); + atomic_set(&eti->total_zombie_tree, 0); + atomic_set(&eti->total_ext_node, 0); +} + +void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) +{ + __init_extent_tree_info(&sbi->extent_tree[EX_READ]); +} + +int __init f2fs_create_extent_cache(void) +{ + extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree", + sizeof(struct extent_tree)); + if (!extent_tree_slab) + return -ENOMEM; + extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node", + sizeof(struct extent_node)); + if (!extent_node_slab) { + kmem_cache_destroy(extent_tree_slab); + return -ENOMEM; + } + return 0; +} + +void f2fs_destroy_extent_cache(void) +{ + kmem_cache_destroy(extent_node_slab); + kmem_cache_destroy(extent_tree_slab); +} diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h new file mode 100644 index 000000000..5c76ba764 --- /dev/null +++ b/fs/f2fs/f2fs.h @@ -0,0 +1,4555 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * fs/f2fs/f2fs.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + */ +#ifndef _LINUX_F2FS_H +#define _LINUX_F2FS_H + +#include <linux/uio.h> +#include <linux/types.h> +#include <linux/page-flags.h> +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/crc32.h> +#include <linux/magic.h> +#include <linux/kobject.h> +#include <linux/sched.h> +#include <linux/cred.h> +#include <linux/sched/mm.h> +#include <linux/vmalloc.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/quotaops.h> +#include <linux/part_stat.h> +#include <crypto/hash.h> + +#include <linux/fscrypt.h> +#include <linux/fsverity.h> + +struct pagevec; + +#ifdef CONFIG_F2FS_CHECK_FS +#define f2fs_bug_on(sbi, condition) BUG_ON(condition) +#else +#define f2fs_bug_on(sbi, condition) \ + do { \ + if (WARN_ON(condition)) \ + set_sbi_flag(sbi, SBI_NEED_FSCK); \ + } while (0) +#endif + +enum { + FAULT_KMALLOC, + FAULT_KVMALLOC, + FAULT_PAGE_ALLOC, + FAULT_PAGE_GET, + FAULT_ALLOC_BIO, /* it's obsolete due to bio_alloc() will never fail */ + FAULT_ALLOC_NID, + FAULT_ORPHAN, + FAULT_BLOCK, + FAULT_DIR_DEPTH, + FAULT_EVICT_INODE, + FAULT_TRUNCATE, + FAULT_READ_IO, + FAULT_CHECKPOINT, + FAULT_DISCARD, + FAULT_WRITE_IO, + FAULT_SLAB_ALLOC, + FAULT_DQUOT_INIT, + FAULT_LOCK_OP, + FAULT_MAX, +}; + +#ifdef CONFIG_F2FS_FAULT_INJECTION +#define F2FS_ALL_FAULT_TYPE (GENMASK(FAULT_MAX - 1, 0)) + +struct f2fs_fault_info { + atomic_t inject_ops; + unsigned int inject_rate; + unsigned int inject_type; +}; + +extern const char *f2fs_fault_name[FAULT_MAX]; +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & BIT(type)) +#endif + +/* + * For mount options + */ +#define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000002 +#define F2FS_MOUNT_DISCARD 0x00000004 +#define F2FS_MOUNT_NOHEAP 0x00000008 +#define F2FS_MOUNT_XATTR_USER 0x00000010 +#define F2FS_MOUNT_POSIX_ACL 0x00000020 +#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 +#define F2FS_MOUNT_INLINE_XATTR 0x00000080 +#define F2FS_MOUNT_INLINE_DATA 0x00000100 +#define F2FS_MOUNT_INLINE_DENTRY 0x00000200 +#define F2FS_MOUNT_FLUSH_MERGE 0x00000400 +#define F2FS_MOUNT_NOBARRIER 0x00000800 +#define F2FS_MOUNT_FASTBOOT 0x00001000 +#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00002000 +#define F2FS_MOUNT_DATA_FLUSH 0x00008000 +#define F2FS_MOUNT_FAULT_INJECTION 0x00010000 +#define F2FS_MOUNT_USRQUOTA 0x00080000 +#define F2FS_MOUNT_GRPQUOTA 0x00100000 +#define F2FS_MOUNT_PRJQUOTA 0x00200000 +#define F2FS_MOUNT_QUOTA 0x00400000 +#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 +#define F2FS_MOUNT_RESERVE_ROOT 0x01000000 +#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 +#define F2FS_MOUNT_NORECOVERY 0x04000000 +#define F2FS_MOUNT_ATGC 0x08000000 +#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 +#define F2FS_MOUNT_GC_MERGE 0x20000000 +#define F2FS_MOUNT_COMPRESS_CACHE 0x40000000 + +#define F2FS_OPTION(sbi) ((sbi)->mount_opt) +#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) (F2FS_OPTION(sbi).opt |= F2FS_MOUNT_##option) +#define test_opt(sbi, option) (F2FS_OPTION(sbi).opt & F2FS_MOUNT_##option) + +#define ver_after(a, b) (typecheck(unsigned long long, a) && \ + typecheck(unsigned long long, b) && \ + ((long long)((a) - (b)) > 0)) + +typedef u32 block_t; /* + * should not change u32, since it is the on-disk block + * address format, __le32. + */ +typedef u32 nid_t; + +#define COMPRESS_EXT_NUM 16 + +/* + * An implementation of an rwsem that is explicitly unfair to readers. This + * prevents priority inversion when a low-priority reader acquires the read lock + * while sleeping on the write lock but the write lock is needed by + * higher-priority clients. + */ + +struct f2fs_rwsem { + struct rw_semaphore internal_rwsem; +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + wait_queue_head_t read_waiters; +#endif +}; + +struct f2fs_mount_info { + unsigned int opt; + int write_io_size_bits; /* Write IO size bits */ + block_t root_reserved_blocks; /* root reserved blocks */ + kuid_t s_resuid; /* reserved blocks for uid */ + kgid_t s_resgid; /* reserved blocks for gid */ + int active_logs; /* # of active logs */ + int inline_xattr_size; /* inline xattr size */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info fault_info; /* For fault injection */ +#endif +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char *s_qf_names[MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif + /* For which write hints are passed down to block layer */ + int alloc_mode; /* segment allocation policy */ + int fsync_mode; /* fsync policy */ + int fs_mode; /* fs mode: LFS or ADAPTIVE */ + int bggc_mode; /* bggc mode: off, on or sync */ + int memory_mode; /* memory mode */ + int discard_unit; /* + * discard command's offset/size should + * be aligned to this unit: block, + * segment or section + */ + struct fscrypt_dummy_policy dummy_enc_policy; /* test dummy encryption */ + block_t unusable_cap_perc; /* percentage for cap */ + block_t unusable_cap; /* Amount of space allowed to be + * unusable when disabling checkpoint + */ + + /* For compression */ + unsigned char compress_algorithm; /* algorithm type */ + unsigned char compress_log_size; /* cluster log size */ + unsigned char compress_level; /* compress level */ + bool compress_chksum; /* compressed data chksum */ + unsigned char compress_ext_cnt; /* extension count */ + unsigned char nocompress_ext_cnt; /* nocompress extension count */ + int compress_mode; /* compression mode */ + unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ + unsigned char noextensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ +}; + +#define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 +#define F2FS_FEATURE_EXTRA_ATTR 0x0008 +#define F2FS_FEATURE_PRJQUOTA 0x0010 +#define F2FS_FEATURE_INODE_CHKSUM 0x0020 +#define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 +#define F2FS_FEATURE_QUOTA_INO 0x0080 +#define F2FS_FEATURE_INODE_CRTIME 0x0100 +#define F2FS_FEATURE_LOST_FOUND 0x0200 +#define F2FS_FEATURE_VERITY 0x0400 +#define F2FS_FEATURE_SB_CHKSUM 0x0800 +#define F2FS_FEATURE_CASEFOLD 0x1000 +#define F2FS_FEATURE_COMPRESSION 0x2000 +#define F2FS_FEATURE_RO 0x4000 + +#define __F2FS_HAS_FEATURE(raw_super, mask) \ + ((raw_super->feature & cpu_to_le32(mask)) != 0) +#define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask) +#define F2FS_SET_FEATURE(sbi, mask) \ + (sbi->raw_super->feature |= cpu_to_le32(mask)) +#define F2FS_CLEAR_FEATURE(sbi, mask) \ + (sbi->raw_super->feature &= ~cpu_to_le32(mask)) + +/* + * Default values for user and/or group using reserved blocks + */ +#define F2FS_DEF_RESUID 0 +#define F2FS_DEF_RESGID 0 + +/* + * For checkpoint manager + */ +enum { + NAT_BITMAP, + SIT_BITMAP +}; + +#define CP_UMOUNT 0x00000001 +#define CP_FASTBOOT 0x00000002 +#define CP_SYNC 0x00000004 +#define CP_RECOVERY 0x00000008 +#define CP_DISCARD 0x00000010 +#define CP_TRIMMED 0x00000020 +#define CP_PAUSE 0x00000040 +#define CP_RESIZE 0x00000080 + +#define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ +#define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ +#define DEF_MID_DISCARD_ISSUE_TIME 500 /* 500 ms, if device busy */ +#define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ +#define DEF_DISCARD_URGENT_UTIL 80 /* do more discard over 80% */ +#define DEF_CP_INTERVAL 60 /* 60 secs */ +#define DEF_IDLE_INTERVAL 5 /* 5 secs */ +#define DEF_DISABLE_INTERVAL 5 /* 5 secs */ +#define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ +#define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ + +struct cp_control { + int reason; + __u64 trim_start; + __u64 trim_end; + __u64 trim_minlen; +}; + +/* + * indicate meta/data type + */ +enum { + META_CP, + META_NAT, + META_SIT, + META_SSA, + META_MAX, + META_POR, + DATA_GENERIC, /* check range only */ + DATA_GENERIC_ENHANCE, /* strong check on range and segment bitmap */ + DATA_GENERIC_ENHANCE_READ, /* + * strong check on range and segment + * bitmap but no warning due to race + * condition of read on truncated area + * by extent_cache + */ + DATA_GENERIC_ENHANCE_UPDATE, /* + * strong check on range and segment + * bitmap for update case + */ + META_GENERIC, +}; + +/* for the list of ino */ +enum { + ORPHAN_INO, /* for orphan ino list */ + APPEND_INO, /* for append ino list */ + UPDATE_INO, /* for update ino list */ + TRANS_DIR_INO, /* for transactions dir ino list */ + FLUSH_INO, /* for multiple device flushing */ + MAX_INO_ENTRY, /* max. list */ +}; + +struct ino_entry { + struct list_head list; /* list head */ + nid_t ino; /* inode number */ + unsigned int dirty_device; /* dirty device bitmap */ +}; + +/* for the list of inodes to be GCed */ +struct inode_entry { + struct list_head list; /* list head */ + struct inode *inode; /* vfs inode pointer */ +}; + +struct fsync_node_entry { + struct list_head list; /* list head */ + struct page *page; /* warm node page pointer */ + unsigned int seq_id; /* sequence id */ +}; + +struct ckpt_req { + struct completion wait; /* completion for checkpoint done */ + struct llist_node llnode; /* llist_node to be linked in wait queue */ + int ret; /* return code of checkpoint */ + ktime_t queue_time; /* request queued time */ +}; + +struct ckpt_req_control { + struct task_struct *f2fs_issue_ckpt; /* checkpoint task */ + int ckpt_thread_ioprio; /* checkpoint merge thread ioprio */ + wait_queue_head_t ckpt_wait_queue; /* waiting queue for wake-up */ + atomic_t issued_ckpt; /* # of actually issued ckpts */ + atomic_t total_ckpt; /* # of total ckpts */ + atomic_t queued_ckpt; /* # of queued ckpts */ + struct llist_head issue_list; /* list for command issue */ + spinlock_t stat_lock; /* lock for below checkpoint time stats */ + unsigned int cur_time; /* cur wait time in msec for currently issued checkpoint */ + unsigned int peak_time; /* peak wait time in msec until now */ +}; + +/* for the bitmap indicate blocks to be discarded */ +struct discard_entry { + struct list_head list; /* list head */ + block_t start_blkaddr; /* start blockaddr of current segment */ + unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ +}; + +/* default discard granularity of inner discard thread, unit: block count */ +#define DEFAULT_DISCARD_GRANULARITY 16 + +/* max discard pend list number */ +#define MAX_PLIST_NUM 512 +#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ + (MAX_PLIST_NUM - 1) : ((blk_num) - 1)) + +enum { + D_PREP, /* initial */ + D_PARTIAL, /* partially submitted */ + D_SUBMIT, /* all submitted */ + D_DONE, /* finished */ +}; + +struct discard_info { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ +}; + +struct discard_cmd { + struct rb_node rb_node; /* rb node located in rb-tree */ + union { + struct { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ + }; + struct discard_info di; /* discard info */ + + }; + struct list_head list; /* command list */ + struct completion wait; /* compleation */ + struct block_device *bdev; /* bdev */ + unsigned short ref; /* reference count */ + unsigned char state; /* state */ + unsigned char queued; /* queued discard */ + int error; /* bio error */ + spinlock_t lock; /* for state/bio_ref updating */ + unsigned short bio_ref; /* bio reference count */ +}; + +enum { + DPOLICY_BG, + DPOLICY_FORCE, + DPOLICY_FSTRIM, + DPOLICY_UMOUNT, + MAX_DPOLICY, +}; + +struct discard_policy { + int type; /* type of discard */ + unsigned int min_interval; /* used for candidates exist */ + unsigned int mid_interval; /* used for device busy */ + unsigned int max_interval; /* used for candidates not exist */ + unsigned int max_requests; /* # of discards issued per round */ + unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ + bool io_aware; /* issue discard in idle time */ + bool sync; /* submit discard with REQ_SYNC flag */ + bool ordered; /* issue discard by lba order */ + bool timeout; /* discard timeout for put_super */ + unsigned int granularity; /* discard granularity */ +}; + +struct discard_cmd_control { + struct task_struct *f2fs_issue_discard; /* discard thread */ + struct list_head entry_list; /* 4KB discard entry list */ + struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ + struct list_head wait_list; /* store on-flushing entries */ + struct list_head fstrim_list; /* in-flight discard from fstrim */ + wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ + unsigned int discard_wake; /* to wake up discard thread */ + struct mutex cmd_lock; + unsigned int nr_discards; /* # of discards in the list */ + unsigned int max_discards; /* max. discards to be issued */ + unsigned int max_discard_request; /* max. discard request per round */ + unsigned int min_discard_issue_time; /* min. interval between discard issue */ + unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ + unsigned int max_discard_issue_time; /* max. interval between discard issue */ + unsigned int discard_granularity; /* discard granularity */ + unsigned int undiscard_blks; /* # of undiscard blocks */ + unsigned int next_pos; /* next discard position */ + atomic_t issued_discard; /* # of issued discard */ + atomic_t queued_discard; /* # of queued discard */ + atomic_t discard_cmd_cnt; /* # of cached cmd count */ + struct rb_root_cached root; /* root of discard rb-tree */ + bool rbtree_check; /* config for consistence check */ +}; + +/* for the list of fsync inodes, used only during recovery */ +struct fsync_inode_entry { + struct list_head list; /* list head */ + struct inode *inode; /* vfs inode pointer */ + block_t blkaddr; /* block address locating the last fsync */ + block_t last_dentry; /* block address locating the last dentry */ +}; + +#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats)) +#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits)) + +#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne) +#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid) +#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se) +#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno) + +#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) +#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) + +static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) +{ + int before = nats_in_cursum(journal); + + journal->n_nats = cpu_to_le16(before + i); + return before; +} + +static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) +{ + int before = sits_in_cursum(journal); + + journal->n_sits = cpu_to_le16(before + i); + return before; +} + +static inline bool __has_cursum_space(struct f2fs_journal *journal, + int size, int type) +{ + if (type == NAT_JOURNAL) + return size <= MAX_NAT_JENTRIES(journal); + return size <= MAX_SIT_JENTRIES(journal); +} + +/* for inline stuff */ +#define DEF_INLINE_RESERVED_SIZE 1 +static inline int get_extra_isize(struct inode *inode); +static inline int get_inline_xattr_addrs(struct inode *inode); +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + get_inline_xattr_addrs(inode) - \ + DEF_INLINE_RESERVED_SIZE)) + +/* for inline dir */ +#define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + BITS_PER_BYTE + 1)) +#define INLINE_DENTRY_BITMAP_SIZE(inode) \ + DIV_ROUND_UP(NR_INLINE_DENTRY(inode), BITS_PER_BYTE) +#define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + NR_INLINE_DENTRY(inode) + \ + INLINE_DENTRY_BITMAP_SIZE(inode))) + +/* + * For INODE and NODE manager + */ +/* for directory operations */ + +struct f2fs_filename { + /* + * The filename the user specified. This is NULL for some + * filesystem-internal operations, e.g. converting an inline directory + * to a non-inline one, or roll-forward recovering an encrypted dentry. + */ + const struct qstr *usr_fname; + + /* + * The on-disk filename. For encrypted directories, this is encrypted. + * This may be NULL for lookups in an encrypted dir without the key. + */ + struct fscrypt_str disk_name; + + /* The dirhash of this filename */ + f2fs_hash_t hash; + +#ifdef CONFIG_FS_ENCRYPTION + /* + * For lookups in encrypted directories: either the buffer backing + * disk_name, or a buffer that holds the decoded no-key name. + */ + struct fscrypt_str crypto_buf; +#endif +#if IS_ENABLED(CONFIG_UNICODE) + /* + * For casefolded directories: the casefolded name, but it's left NULL + * if the original name is not valid Unicode, if the original name is + * "." or "..", if the directory is both casefolded and encrypted and + * its encryption key is unavailable, or if the filesystem is doing an + * internal operation where usr_fname is also NULL. In all these cases + * we fall back to treating the name as an opaque byte sequence. + */ + struct fscrypt_str cf_name; +#endif +}; + +struct f2fs_dentry_ptr { + struct inode *inode; + void *bitmap; + struct f2fs_dir_entry *dentry; + __u8 (*filename)[F2FS_SLOT_LEN]; + int max; + int nr_bitmap; +}; + +static inline void make_dentry_ptr_block(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t) +{ + d->inode = inode; + d->max = NR_DENTRY_IN_BLOCK; + d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; + d->bitmap = t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; +} + +static inline void make_dentry_ptr_inline(struct inode *inode, + struct f2fs_dentry_ptr *d, void *t) +{ + int entry_cnt = NR_INLINE_DENTRY(inode); + int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode); + int reserved_size = INLINE_RESERVED_SIZE(inode); + + d->inode = inode; + d->max = entry_cnt; + d->nr_bitmap = bitmap_size; + d->bitmap = t; + d->dentry = t + bitmap_size + reserved_size; + d->filename = t + bitmap_size + reserved_size + + SIZE_OF_DIR_ENTRY * entry_cnt; +} + +/* + * XATTR_NODE_OFFSET stores xattrs to one node block per file keeping -1 + * as its node offset to distinguish from index node blocks. + * But some bits are used to mark the node block. + */ +#define XATTR_NODE_OFFSET ((((unsigned int)-1) << OFFSET_BIT_SHIFT) \ + >> OFFSET_BIT_SHIFT) +enum { + ALLOC_NODE, /* allocate a new node page if needed */ + LOOKUP_NODE, /* look up a node without readahead */ + LOOKUP_NODE_RA, /* + * look up a node with readahead called + * by get_data_block. + */ +}; + +#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO or flush count */ + +/* congestion wait timeout value, default: 20ms */ +#define DEFAULT_IO_TIMEOUT (msecs_to_jiffies(20)) + +/* maximum retry quota flush count */ +#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 + +/* maximum retry of EIO'ed page */ +#define MAX_RETRY_PAGE_EIO 100 + +#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ + +#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ + +/* dirty segments threshold for triggering CP */ +#define DEFAULT_DIRTY_THRESHOLD 4 + +#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS +#define RECOVERY_MIN_RA_BLOCKS 1 + +#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ + +/* for in-memory extent cache entry */ +#define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ + +/* number of extent info in extent cache we try to shrink */ +#define READ_EXTENT_CACHE_SHRINK_NUMBER 128 + +/* extent cache type */ +enum extent_type { + EX_READ, + NR_EXTENT_CACHES, +}; + +struct rb_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ +}; + +struct extent_info { + unsigned int fofs; /* start offset in a file */ + unsigned int len; /* length of the extent */ + union { + /* read extent_cache */ + struct { + /* start block address of the extent */ + block_t blk; +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* physical extent length of compressed blocks */ + unsigned int c_len; +#endif + }; + }; +}; + +struct extent_node { + struct rb_node rb_node; /* rb node located in rb-tree */ + struct extent_info ei; /* extent info */ + struct list_head list; /* node in global extent list of sbi */ + struct extent_tree *et; /* extent tree pointer */ +}; + +struct extent_tree { + nid_t ino; /* inode number */ + enum extent_type type; /* keep the extent tree type */ + struct rb_root_cached root; /* root of extent info rb-tree */ + struct extent_node *cached_en; /* recently accessed extent node */ + struct list_head list; /* to be used by sbi->zombie_list */ + rwlock_t lock; /* protect extent info rb-tree */ + atomic_t node_cnt; /* # of extent node in rb-tree*/ + bool largest_updated; /* largest extent updated */ + struct extent_info largest; /* largest cached extent for EX_READ */ +}; + +struct extent_tree_info { + struct radix_tree_root extent_tree_root;/* cache extent cache entries */ + struct mutex extent_tree_lock; /* locking extent radix tree */ + struct list_head extent_list; /* lru list for shrinker */ + spinlock_t extent_lock; /* locking extent lru list */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ + atomic_t total_ext_node; /* extent info count */ +}; + +/* + * This structure is taken from ext4_map_blocks. + * + * Note that, however, f2fs uses NEW and MAPPED flags for f2fs_map_blocks(). + */ +#define F2FS_MAP_NEW (1 << BH_New) +#define F2FS_MAP_MAPPED (1 << BH_Mapped) +#define F2FS_MAP_UNWRITTEN (1 << BH_Unwritten) +#define F2FS_MAP_FLAGS (F2FS_MAP_NEW | F2FS_MAP_MAPPED |\ + F2FS_MAP_UNWRITTEN) + +struct f2fs_map_blocks { + struct block_device *m_bdev; /* for multi-device dio */ + block_t m_pblk; + block_t m_lblk; + unsigned int m_len; + unsigned int m_flags; + pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ + pgoff_t *m_next_extent; /* point to next possible extent */ + int m_seg_type; + bool m_may_create; /* indicate it is from write path */ + bool m_multidev_dio; /* indicate it allows multi-device dio */ +}; + +/* for flag in get_data_block */ +enum { + F2FS_GET_BLOCK_DEFAULT, + F2FS_GET_BLOCK_FIEMAP, + F2FS_GET_BLOCK_BMAP, + F2FS_GET_BLOCK_DIO, + F2FS_GET_BLOCK_PRE_DIO, + F2FS_GET_BLOCK_PRE_AIO, + F2FS_GET_BLOCK_PRECACHE, +}; + +/* + * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. + */ +#define FADVISE_COLD_BIT 0x01 +#define FADVISE_LOST_PINO_BIT 0x02 +#define FADVISE_ENCRYPT_BIT 0x04 +#define FADVISE_ENC_NAME_BIT 0x08 +#define FADVISE_KEEP_SIZE_BIT 0x10 +#define FADVISE_HOT_BIT 0x20 +#define FADVISE_VERITY_BIT 0x40 +#define FADVISE_TRUNC_BIT 0x80 + +#define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) + +#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) +#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) +#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) + +#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) +#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) +#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) + +#define file_is_encrypt(inode) is_file(inode, FADVISE_ENCRYPT_BIT) +#define file_set_encrypt(inode) set_file(inode, FADVISE_ENCRYPT_BIT) + +#define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) +#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) + +#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) + +#define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT) +#define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT) +#define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT) + +#define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT) +#define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT) + +#define file_should_truncate(inode) is_file(inode, FADVISE_TRUNC_BIT) +#define file_need_truncate(inode) set_file(inode, FADVISE_TRUNC_BIT) +#define file_dont_truncate(inode) clear_file(inode, FADVISE_TRUNC_BIT) + +#define DEF_DIR_LEVEL 0 + +enum { + GC_FAILURE_PIN, + MAX_GC_FAILURE +}; + +/* used for f2fs_inode_info->flags */ +enum { + FI_NEW_INODE, /* indicate newly allocated inode */ + FI_DIRTY_INODE, /* indicate inode is dirty or not */ + FI_AUTO_RECOVER, /* indicate inode is recoverable */ + FI_DIRTY_DIR, /* indicate directory has dirty pages */ + FI_INC_LINK, /* need to increment i_nlink */ + FI_ACL_MODE, /* indicate acl mode */ + FI_NO_ALLOC, /* should not allocate any blocks */ + FI_FREE_NID, /* free allocated nide */ + FI_NO_EXTENT, /* not to use the extent cache */ + FI_INLINE_XATTR, /* used for inline xattr */ + FI_INLINE_DATA, /* used for inline data*/ + FI_INLINE_DENTRY, /* used for inline dentry */ + FI_APPEND_WRITE, /* inode has appended data */ + FI_UPDATE_WRITE, /* inode has in-place-update data */ + FI_NEED_IPU, /* used for ipu per file */ + FI_ATOMIC_FILE, /* indicate atomic file */ + FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ + FI_DROP_CACHE, /* drop dirty page cache */ + FI_DATA_EXIST, /* indicate data exists */ + FI_INLINE_DOTS, /* indicate inline dot dentries */ + FI_SKIP_WRITES, /* should skip data page writeback */ + FI_OPU_WRITE, /* used for opu per file */ + FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ + FI_PREALLOCATED_ALL, /* all blocks for write were preallocated */ + FI_HOT_DATA, /* indicate file is hot */ + FI_EXTRA_ATTR, /* indicate file has extra attribute */ + FI_PROJ_INHERIT, /* indicate file inherits projectid */ + FI_PIN_FILE, /* indicate file should not be gced */ + FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ + FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ + FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ + FI_MMAP_FILE, /* indicate file was mmapped */ + FI_ENABLE_COMPRESS, /* enable compression in "user" compression mode */ + FI_COMPRESS_RELEASED, /* compressed blocks were released */ + FI_ALIGNED_WRITE, /* enable aligned write */ + FI_COW_FILE, /* indicate COW file */ + FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */ + FI_MAX, /* max flag, never be used */ +}; + +struct f2fs_inode_info { + struct inode vfs_inode; /* serve a vfs inode */ + unsigned long i_flags; /* keep an inode flags for ioctl */ + unsigned char i_advise; /* use to give file attribute hints */ + unsigned char i_dir_level; /* use for dentry level for large dir */ + unsigned int i_current_depth; /* only for directory depth */ + /* for gc failure statistic */ + unsigned int i_gc_failures[MAX_GC_FAILURE]; + unsigned int i_pino; /* parent inode number */ + umode_t i_acl_mode; /* keep file acl mode temporarily */ + + /* Use below internally in f2fs*/ + unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */ + struct f2fs_rwsem i_sem; /* protect fi info */ + atomic_t dirty_pages; /* # of dirty pages */ + f2fs_hash_t chash; /* hash value of given file name */ + unsigned int clevel; /* maximum level of given file name */ + struct task_struct *task; /* lookup and create consistency */ + struct task_struct *cp_task; /* separate cp/wb IO stats*/ + struct task_struct *wb_task; /* indicate inode is in context of writeback */ + nid_t i_xattr_nid; /* node id that contains xattrs */ + loff_t last_disk_size; /* lastly written file size */ + spinlock_t i_size_lock; /* protect last_disk_size */ + +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; + + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif + struct list_head dirty_list; /* dirty list for dirs and files */ + struct list_head gdirty_list; /* linked in global dirty list */ + struct task_struct *atomic_write_task; /* store atomic write task */ + struct extent_tree *extent_tree[NR_EXTENT_CACHES]; + /* cached extent_tree entry */ + struct inode *cow_inode; /* copy-on-write inode for atomic write */ + + /* avoid racing between foreground op and gc */ + struct f2fs_rwsem i_gc_rwsem[2]; + struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */ + + int i_extra_isize; /* size of extra space located in i_addr */ + kprojid_t i_projid; /* id for project quota */ + int i_inline_xattr_size; /* inline xattr size */ + struct timespec64 i_crtime; /* inode creation time */ + struct timespec64 i_disk_time[4];/* inode disk times */ + + /* for file compress */ + atomic_t i_compr_blocks; /* # of compressed blocks */ + unsigned char i_compress_algorithm; /* algorithm type */ + unsigned char i_log_cluster_size; /* log of cluster size */ + unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ + unsigned char i_compress_flag; /* compress flag */ + unsigned int i_cluster_size; /* cluster size */ + + unsigned int atomic_write_cnt; + loff_t original_i_size; /* original i_size before atomic write */ +}; + +static inline void get_read_extent_info(struct extent_info *ext, + struct f2fs_extent *i_ext) +{ + ext->fofs = le32_to_cpu(i_ext->fofs); + ext->blk = le32_to_cpu(i_ext->blk); + ext->len = le32_to_cpu(i_ext->len); +} + +static inline void set_raw_read_extent(struct extent_info *ext, + struct f2fs_extent *i_ext) +{ + i_ext->fofs = cpu_to_le32(ext->fofs); + i_ext->blk = cpu_to_le32(ext->blk); + i_ext->len = cpu_to_le32(ext->len); +} + +static inline bool __is_discard_mergeable(struct discard_info *back, + struct discard_info *front, unsigned int max_len) +{ + return (back->lstart + back->len == front->lstart) && + (back->len + front->len <= max_len); +} + +static inline bool __is_discard_back_mergeable(struct discard_info *cur, + struct discard_info *back, unsigned int max_len) +{ + return __is_discard_mergeable(back, cur, max_len); +} + +static inline bool __is_discard_front_mergeable(struct discard_info *cur, + struct discard_info *front, unsigned int max_len) +{ + return __is_discard_mergeable(cur, front, max_len); +} + +/* + * For free nid management + */ +enum nid_state { + FREE_NID, /* newly added to free nid list */ + PREALLOC_NID, /* it is preallocated */ + MAX_NID_STATE, +}; + +enum nat_state { + TOTAL_NAT, + DIRTY_NAT, + RECLAIMABLE_NAT, + MAX_NAT_STATE, +}; + +struct f2fs_nm_info { + block_t nat_blkaddr; /* base disk address of NAT */ + nid_t max_nid; /* maximum possible node ids */ + nid_t available_nids; /* # of available node ids */ + nid_t next_scan_nid; /* the next nid to be scanned */ + nid_t max_rf_node_blocks; /* max # of nodes for recovery */ + unsigned int ram_thresh; /* control the memory footprint */ + unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ + unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */ + + /* NAT cache management */ + struct radix_tree_root nat_root;/* root of the nat entry cache */ + struct radix_tree_root nat_set_root;/* root of the nat set cache */ + struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */ + struct list_head nat_entries; /* cached nat entry list (clean) */ + spinlock_t nat_list_lock; /* protect clean nat entry list */ + unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ + unsigned int nat_blocks; /* # of nat blocks */ + + /* free node ids management */ + struct radix_tree_root free_nid_root;/* root of the free_nid cache */ + struct list_head free_nid_list; /* list for free nids excluding preallocated nids */ + unsigned int nid_cnt[MAX_NID_STATE]; /* the number of free node id */ + spinlock_t nid_list_lock; /* protect nid lists ops */ + struct mutex build_lock; /* lock for build free nids */ + unsigned char **free_nid_bitmap; + unsigned char *nat_block_bitmap; + unsigned short *free_nid_count; /* free nid count of NAT block */ + + /* for checkpoint */ + char *nat_bitmap; /* NAT bitmap pointer */ + + unsigned int nat_bits_blocks; /* # of nat bits blocks */ + unsigned char *nat_bits; /* NAT bits blocks */ + unsigned char *full_nat_bits; /* full NAT pages */ + unsigned char *empty_nat_bits; /* empty NAT pages */ +#ifdef CONFIG_F2FS_CHECK_FS + char *nat_bitmap_mir; /* NAT bitmap mirror */ +#endif + int bitmap_size; /* bitmap size */ +}; + +/* + * this structure is used as one of function parameters. + * all the information are dedicated to a given direct node block determined + * by the data offset in a file. + */ +struct dnode_of_data { + struct inode *inode; /* vfs inode pointer */ + struct page *inode_page; /* its inode page, NULL is possible */ + struct page *node_page; /* cached direct node page */ + nid_t nid; /* node id of the direct node block */ + unsigned int ofs_in_node; /* data offset in the node page */ + bool inode_page_locked; /* inode page is locked or not */ + bool node_changed; /* is node block changed */ + char cur_level; /* level of hole node page */ + char max_level; /* level of current page located */ + block_t data_blkaddr; /* block address of the node block */ +}; + +static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, + struct page *ipage, struct page *npage, nid_t nid) +{ + memset(dn, 0, sizeof(*dn)); + dn->inode = inode; + dn->inode_page = ipage; + dn->node_page = npage; + dn->nid = nid; +} + +/* + * For SIT manager + * + * By default, there are 6 active log areas across the whole main area. + * When considering hot and cold data separation to reduce cleaning overhead, + * we split 3 for data logs and 3 for node logs as hot, warm, and cold types, + * respectively. + * In the current design, you should not change the numbers intentionally. + * Instead, as a mount option such as active_logs=x, you can use 2, 4, and 6 + * logs individually according to the underlying devices. (default: 6) + * Just in case, on-disk layout covers maximum 16 logs that consist of 8 for + * data and 8 for node logs. + */ +#define NR_CURSEG_DATA_TYPE (3) +#define NR_CURSEG_NODE_TYPE (3) +#define NR_CURSEG_INMEM_TYPE (2) +#define NR_CURSEG_RO_TYPE (2) +#define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) +#define NR_CURSEG_TYPE (NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE) + +enum { + CURSEG_HOT_DATA = 0, /* directory entry blocks */ + CURSEG_WARM_DATA, /* data blocks */ + CURSEG_COLD_DATA, /* multimedia or GCed data blocks */ + CURSEG_HOT_NODE, /* direct node blocks of directory files */ + CURSEG_WARM_NODE, /* direct node blocks of normal files */ + CURSEG_COLD_NODE, /* indirect node blocks */ + NR_PERSISTENT_LOG, /* number of persistent log */ + CURSEG_COLD_DATA_PINNED = NR_PERSISTENT_LOG, + /* pinned file that needs consecutive block address */ + CURSEG_ALL_DATA_ATGC, /* SSR alloctor in hot/warm/cold data area */ + NO_CHECK_TYPE, /* number of persistent & inmem log */ +}; + +struct flush_cmd { + struct completion wait; + struct llist_node llnode; + nid_t ino; + int ret; +}; + +struct flush_cmd_control { + struct task_struct *f2fs_issue_flush; /* flush thread */ + wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + atomic_t issued_flush; /* # of issued flushes */ + atomic_t queued_flush; /* # of queued flushes */ + struct llist_head issue_list; /* list for command issue */ + struct llist_node *dispatch_list; /* list for command dispatch */ +}; + +struct f2fs_sm_info { + struct sit_info *sit_info; /* whole segment information */ + struct free_segmap_info *free_info; /* free segment information */ + struct dirty_seglist_info *dirty_info; /* dirty segment information */ + struct curseg_info *curseg_array; /* active segment information */ + + struct f2fs_rwsem curseg_lock; /* for preventing curseg change */ + + block_t seg0_blkaddr; /* block address of 0'th segment */ + block_t main_blkaddr; /* start block address of main area */ + block_t ssa_blkaddr; /* start block address of SSA area */ + + unsigned int segment_count; /* total # of segments */ + unsigned int main_segments; /* # of segments in main area */ + unsigned int reserved_segments; /* # of reserved segments */ + unsigned int additional_reserved_segments;/* reserved segs for IO align feature */ + unsigned int ovp_segments; /* # of overprovision segments */ + + /* a threshold to reclaim prefree segments */ + unsigned int rec_prefree_segments; + + /* for batched trimming */ + unsigned int trim_sections; /* # of sections to trim */ + + struct list_head sit_entry_set; /* sit entry set list */ + + unsigned int ipu_policy; /* in-place-update policy */ + unsigned int min_ipu_util; /* in-place-update threshold */ + unsigned int min_fsync_blocks; /* threshold for fsync */ + unsigned int min_seq_blocks; /* threshold for sequential blocks */ + unsigned int min_hot_blocks; /* threshold for hot block allocation */ + unsigned int min_ssr_sections; /* threshold to trigger SSR allocation */ + + /* for flush command control */ + struct flush_cmd_control *fcc_info; + + /* for discard command control */ + struct discard_cmd_control *dcc_info; +}; + +/* + * For superblock + */ +/* + * COUNT_TYPE for monitoring + * + * f2fs monitors the number of several block types such as on-writeback, + * dirty dentry blocks, dirty node blocks, and dirty meta blocks. + */ +#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) +enum count_type { + F2FS_DIRTY_DENTS, + F2FS_DIRTY_DATA, + F2FS_DIRTY_QDATA, + F2FS_DIRTY_NODES, + F2FS_DIRTY_META, + F2FS_DIRTY_IMETA, + F2FS_WB_CP_DATA, + F2FS_WB_DATA, + F2FS_RD_DATA, + F2FS_RD_NODE, + F2FS_RD_META, + F2FS_DIO_WRITE, + F2FS_DIO_READ, + NR_COUNT_TYPE, +}; + +/* + * The below are the page types of bios used in submit_bio(). + * The available types are: + * DATA User data pages. It operates as async mode. + * NODE Node pages. It operates as async mode. + * META FS metadata pages such as SIT, NAT, CP. + * NR_PAGE_TYPE The number of page types. + * META_FLUSH Make sure the previous pages are written + * with waiting the bio's completion + * ... Only can be used with META. + */ +#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type)) +enum page_type { + DATA = 0, + NODE = 1, /* should not change this */ + META, + NR_PAGE_TYPE, + META_FLUSH, + IPU, /* the below types are used by tracepoints only. */ + OPU, +}; + +enum temp_type { + HOT = 0, /* must be zero for meta bio */ + WARM, + COLD, + NR_TEMP_TYPE, +}; + +enum need_lock_type { + LOCK_REQ = 0, + LOCK_DONE, + LOCK_RETRY, +}; + +enum cp_reason_type { + CP_NO_NEEDED, + CP_NON_REGULAR, + CP_COMPRESSED, + CP_HARDLINK, + CP_SB_NEED_CP, + CP_WRONG_PINO, + CP_NO_SPC_ROLL, + CP_NODE_NEED_CP, + CP_FASTBOOT_MODE, + CP_SPEC_LOG_NUM, + CP_RECOVER_DIR, +}; + +enum iostat_type { + /* WRITE IO */ + APP_DIRECT_IO, /* app direct write IOs */ + APP_BUFFERED_IO, /* app buffered write IOs */ + APP_WRITE_IO, /* app write IOs */ + APP_MAPPED_IO, /* app mapped IOs */ + APP_BUFFERED_CDATA_IO, /* app buffered write IOs on compressed file */ + APP_MAPPED_CDATA_IO, /* app mapped write IOs on compressed file */ + FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_CDATA_IO, /* data IOs from kworker/fsync/reclaimer on compressed file */ + FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ + FS_META_IO, /* meta IOs from kworker/reclaimer */ + FS_GC_DATA_IO, /* data IOs from forground gc */ + FS_GC_NODE_IO, /* node IOs from forground gc */ + FS_CP_DATA_IO, /* data IOs from checkpoint */ + FS_CP_NODE_IO, /* node IOs from checkpoint */ + FS_CP_META_IO, /* meta IOs from checkpoint */ + + /* READ IO */ + APP_DIRECT_READ_IO, /* app direct read IOs */ + APP_BUFFERED_READ_IO, /* app buffered read IOs */ + APP_READ_IO, /* app read IOs */ + APP_MAPPED_READ_IO, /* app mapped read IOs */ + APP_BUFFERED_CDATA_READ_IO, /* app buffered read IOs on compressed file */ + APP_MAPPED_CDATA_READ_IO, /* app mapped read IOs on compressed file */ + FS_DATA_READ_IO, /* data read IOs */ + FS_GDATA_READ_IO, /* data read IOs from background gc */ + FS_CDATA_READ_IO, /* compressed data read IOs */ + FS_NODE_READ_IO, /* node read IOs */ + FS_META_READ_IO, /* meta read IOs */ + + /* other */ + FS_DISCARD, /* discard */ + NR_IO_TYPE, +}; + +struct f2fs_io_info { + struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ + nid_t ino; /* inode number */ + enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + enum temp_type temp; /* contains HOT/WARM/COLD */ + enum req_op op; /* contains REQ_OP_ */ + blk_opf_t op_flags; /* req_flag_bits */ + block_t new_blkaddr; /* new block address to be written */ + block_t old_blkaddr; /* old block address before Cow */ + struct page *page; /* page to be written */ + struct page *encrypted_page; /* encrypted page */ + struct page *compressed_page; /* compressed page */ + struct list_head list; /* serialize IOs */ + bool submitted; /* indicate IO submission */ + int need_lock; /* indicate we need to lock cp_rwsem */ + bool in_list; /* indicate fio is in io_list */ + bool is_por; /* indicate IO is from recovery or not */ + bool retry; /* need to reallocate block address */ + int compr_blocks; /* # of compressed block addresses */ + bool encrypted; /* indicate file is encrypted */ + bool post_read; /* require post read */ + enum iostat_type io_type; /* io type */ + struct writeback_control *io_wbc; /* writeback control */ + struct bio **bio; /* bio for ipu */ + sector_t *last_block; /* last block number in bio */ + unsigned char version; /* version of the node */ +}; + +struct bio_entry { + struct bio *bio; + struct list_head list; +}; + +#define is_read_io(rw) ((rw) == READ) +struct f2fs_bio_info { + struct f2fs_sb_info *sbi; /* f2fs superblock */ + struct bio *bio; /* bios to merge */ + sector_t last_block_in_bio; /* last block number */ + struct f2fs_io_info fio; /* store buffered io info. */ + struct f2fs_rwsem io_rwsem; /* blocking op for bio */ + spinlock_t io_lock; /* serialize DATA/NODE IOs */ + struct list_head io_list; /* track fios */ + struct list_head bio_list; /* bio entry list head */ + struct f2fs_rwsem bio_list_lock; /* lock to protect bio entry list */ +}; + +#define FDEV(i) (sbi->devs[i]) +#define RDEV(i) (raw_super->devs[i]) +struct f2fs_dev_info { + struct block_device *bdev; + char path[MAX_PATH_LEN]; + unsigned int total_segments; + block_t start_blk; + block_t end_blk; +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + unsigned long *blkz_seq; /* Bitmap indicating sequential zones */ +#endif +}; + +enum inode_type { + DIR_INODE, /* for dirty dir inode */ + FILE_INODE, /* for dirty regular/symlink inode */ + DIRTY_META, /* for all dirtied inode metadata */ + NR_INODE_TYPE, +}; + +/* for inner inode cache management */ +struct inode_management { + struct radix_tree_root ino_root; /* ino entry array */ + spinlock_t ino_lock; /* for ino entry lock */ + struct list_head ino_list; /* inode list head */ + unsigned long ino_num; /* number of entries */ +}; + +/* for GC_AT */ +struct atgc_management { + bool atgc_enabled; /* ATGC is enabled or not */ + struct rb_root_cached root; /* root of victim rb-tree */ + struct list_head victim_list; /* linked with all victim entries */ + unsigned int victim_count; /* victim count in rb-tree */ + unsigned int candidate_ratio; /* candidate ratio */ + unsigned int max_candidate_count; /* max candidate count */ + unsigned int age_weight; /* age weight, vblock_weight = 100 - age_weight */ + unsigned long long age_threshold; /* age threshold */ +}; + +struct f2fs_gc_control { + unsigned int victim_segno; /* target victim segment number */ + int init_gc_type; /* FG_GC or BG_GC */ + bool no_bg_gc; /* check the space and stop bg_gc */ + bool should_migrate_blocks; /* should migrate blocks */ + bool err_gc_skipped; /* return EAGAIN if GC skipped */ + unsigned int nr_free_secs; /* # of free sections to do GC */ +}; + +/* For s_flag in struct f2fs_sb_info */ +enum { + SBI_IS_DIRTY, /* dirty flag for checkpoint */ + SBI_IS_CLOSE, /* specify unmounting */ + SBI_NEED_FSCK, /* need fsck.f2fs to fix */ + SBI_POR_DOING, /* recovery is doing or not */ + SBI_NEED_SB_WRITE, /* need to recover superblock */ + SBI_NEED_CP, /* need to checkpoint */ + SBI_IS_SHUTDOWN, /* shutdown by ioctl */ + SBI_IS_RECOVERED, /* recovered orphan/data */ + SBI_CP_DISABLED, /* CP was disabled last mount */ + SBI_CP_DISABLED_QUICK, /* CP was disabled quickly */ + SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */ + SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ + SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ + SBI_IS_RESIZEFS, /* resizefs is in process */ + SBI_IS_FREEZING, /* freezefs is in process */ +}; + +enum { + CP_TIME, + REQ_TIME, + DISCARD_TIME, + GC_TIME, + DISABLE_TIME, + UMOUNT_DISCARD_TIMEOUT, + MAX_TIME, +}; + +enum { + GC_NORMAL, + GC_IDLE_CB, + GC_IDLE_GREEDY, + GC_IDLE_AT, + GC_URGENT_HIGH, + GC_URGENT_LOW, + GC_URGENT_MID, + MAX_GC_MODE, +}; + +enum { + BGGC_MODE_ON, /* background gc is on */ + BGGC_MODE_OFF, /* background gc is off */ + BGGC_MODE_SYNC, /* + * background gc is on, migrating blocks + * like foreground gc + */ +}; + +enum { + FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */ + FS_MODE_LFS, /* use lfs allocation only */ + FS_MODE_FRAGMENT_SEG, /* segment fragmentation mode */ + FS_MODE_FRAGMENT_BLK, /* block fragmentation mode */ +}; + +enum { + ALLOC_MODE_DEFAULT, /* stay default */ + ALLOC_MODE_REUSE, /* reuse segments as much as possible */ +}; + +enum fsync_mode { + FSYNC_MODE_POSIX, /* fsync follows posix semantics */ + FSYNC_MODE_STRICT, /* fsync behaves in line with ext4 */ + FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */ +}; + +enum { + COMPR_MODE_FS, /* + * automatically compress compression + * enabled files + */ + COMPR_MODE_USER, /* + * automatical compression is disabled. + * user can control the file compression + * using ioctls + */ +}; + +enum { + DISCARD_UNIT_BLOCK, /* basic discard unit is block */ + DISCARD_UNIT_SEGMENT, /* basic discard unit is segment */ + DISCARD_UNIT_SECTION, /* basic discard unit is section */ +}; + +enum { + MEMORY_MODE_NORMAL, /* memory mode for normal devices */ + MEMORY_MODE_LOW, /* memory mode for low memry devices */ +}; + + + +static inline int f2fs_test_bit(unsigned int nr, char *addr); +static inline void f2fs_set_bit(unsigned int nr, char *addr); +static inline void f2fs_clear_bit(unsigned int nr, char *addr); + +/* + * Layout of f2fs page.private: + * + * Layout A: lowest bit should be 1 + * | bit0 = 1 | bit1 | bit2 | ... | bit MAX | private data .... | + * bit 0 PAGE_PRIVATE_NOT_POINTER + * bit 1 PAGE_PRIVATE_ATOMIC_WRITE + * bit 2 PAGE_PRIVATE_DUMMY_WRITE + * bit 3 PAGE_PRIVATE_ONGOING_MIGRATION + * bit 4 PAGE_PRIVATE_INLINE_INODE + * bit 5 PAGE_PRIVATE_REF_RESOURCE + * bit 6- f2fs private data + * + * Layout B: lowest bit should be 0 + * page.private is a wrapped pointer. + */ +enum { + PAGE_PRIVATE_NOT_POINTER, /* private contains non-pointer data */ + PAGE_PRIVATE_ATOMIC_WRITE, /* data page from atomic write path */ + PAGE_PRIVATE_DUMMY_WRITE, /* data page for padding aligned IO */ + PAGE_PRIVATE_ONGOING_MIGRATION, /* data page which is on-going migrating */ + PAGE_PRIVATE_INLINE_INODE, /* inode page contains inline data */ + PAGE_PRIVATE_REF_RESOURCE, /* dirty page has referenced resources */ + PAGE_PRIVATE_MAX +}; + +#define PAGE_PRIVATE_GET_FUNC(name, flagname) \ +static inline bool page_private_##name(struct page *page) \ +{ \ + return PagePrivate(page) && \ + test_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)) && \ + test_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ +} + +#define PAGE_PRIVATE_SET_FUNC(name, flagname) \ +static inline void set_page_private_##name(struct page *page) \ +{ \ + if (!PagePrivate(page)) { \ + get_page(page); \ + SetPagePrivate(page); \ + set_page_private(page, 0); \ + } \ + set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); \ + set_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ +} + +#define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \ +static inline void clear_page_private_##name(struct page *page) \ +{ \ + clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ + if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) { \ + set_page_private(page, 0); \ + if (PagePrivate(page)) { \ + ClearPagePrivate(page); \ + put_page(page); \ + }\ + } \ +} + +PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER); +PAGE_PRIVATE_GET_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_GET_FUNC(atomic, ATOMIC_WRITE); +PAGE_PRIVATE_GET_FUNC(dummy, DUMMY_WRITE); + +PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_SET_FUNC(atomic, ATOMIC_WRITE); +PAGE_PRIVATE_SET_FUNC(dummy, DUMMY_WRITE); + +PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE); +PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE); + +static inline unsigned long get_page_private_data(struct page *page) +{ + unsigned long data = page_private(page); + + if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data)) + return 0; + return data >> PAGE_PRIVATE_MAX; +} + +static inline void set_page_private_data(struct page *page, unsigned long data) +{ + if (!PagePrivate(page)) { + get_page(page); + SetPagePrivate(page); + set_page_private(page, 0); + } + set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); + page_private(page) |= data << PAGE_PRIVATE_MAX; +} + +static inline void clear_page_private_data(struct page *page) +{ + page_private(page) &= GENMASK(PAGE_PRIVATE_MAX - 1, 0); + if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) { + set_page_private(page, 0); + if (PagePrivate(page)) { + ClearPagePrivate(page); + put_page(page); + } + } +} + +/* For compression */ +enum compress_algorithm_type { + COMPRESS_LZO, + COMPRESS_LZ4, + COMPRESS_ZSTD, + COMPRESS_LZORLE, + COMPRESS_MAX, +}; + +enum compress_flag { + COMPRESS_CHKSUM, + COMPRESS_MAX_FLAG, +}; + +#define COMPRESS_WATERMARK 20 +#define COMPRESS_PERCENT 20 + +#define COMPRESS_DATA_RESERVED_SIZE 4 +struct compress_data { + __le32 clen; /* compressed data size */ + __le32 chksum; /* compressed data chksum */ + __le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */ + u8 cdata[]; /* compressed data */ +}; + +#define COMPRESS_HEADER_SIZE (sizeof(struct compress_data)) + +#define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 + +#define F2FS_ZSTD_DEFAULT_CLEVEL 1 + +#define COMPRESS_LEVEL_OFFSET 8 + +/* compress context */ +struct compress_ctx { + struct inode *inode; /* inode the context belong to */ + pgoff_t cluster_idx; /* cluster index number */ + unsigned int cluster_size; /* page count in cluster */ + unsigned int log_cluster_size; /* log of cluster size */ + struct page **rpages; /* pages store raw data in cluster */ + unsigned int nr_rpages; /* total page number in rpages */ + struct page **cpages; /* pages store compressed data in cluster */ + unsigned int nr_cpages; /* total page number in cpages */ + unsigned int valid_nr_cpages; /* valid page number in cpages */ + void *rbuf; /* virtual mapped address on rpages */ + struct compress_data *cbuf; /* virtual mapped address on cpages */ + size_t rlen; /* valid data length in rbuf */ + size_t clen; /* valid data length in cbuf */ + void *private; /* payload buffer for specified compression algorithm */ + void *private2; /* extra payload buffer */ +}; + +/* compress context for write IO path */ +struct compress_io_ctx { + u32 magic; /* magic number to indicate page is compressed */ + struct inode *inode; /* inode the context belong to */ + struct page **rpages; /* pages store raw data in cluster */ + unsigned int nr_rpages; /* total page number in rpages */ + atomic_t pending_pages; /* in-flight compressed page count */ +}; + +/* Context for decompressing one cluster on the read IO path */ +struct decompress_io_ctx { + u32 magic; /* magic number to indicate page is compressed */ + struct inode *inode; /* inode the context belong to */ + pgoff_t cluster_idx; /* cluster index number */ + unsigned int cluster_size; /* page count in cluster */ + unsigned int log_cluster_size; /* log of cluster size */ + struct page **rpages; /* pages store raw data in cluster */ + unsigned int nr_rpages; /* total page number in rpages */ + struct page **cpages; /* pages store compressed data in cluster */ + unsigned int nr_cpages; /* total page number in cpages */ + struct page **tpages; /* temp pages to pad holes in cluster */ + void *rbuf; /* virtual mapped address on rpages */ + struct compress_data *cbuf; /* virtual mapped address on cpages */ + size_t rlen; /* valid data length in rbuf */ + size_t clen; /* valid data length in cbuf */ + + /* + * The number of compressed pages remaining to be read in this cluster. + * This is initially nr_cpages. It is decremented by 1 each time a page + * has been read (or failed to be read). When it reaches 0, the cluster + * is decompressed (or an error is reported). + * + * If an error occurs before all the pages have been submitted for I/O, + * then this will never reach 0. In this case the I/O submitter is + * responsible for calling f2fs_decompress_end_io() instead. + */ + atomic_t remaining_pages; + + /* + * Number of references to this decompress_io_ctx. + * + * One reference is held for I/O completion. This reference is dropped + * after the pagecache pages are updated and unlocked -- either after + * decompression (and verity if enabled), or after an error. + * + * In addition, each compressed page holds a reference while it is in a + * bio. These references are necessary prevent compressed pages from + * being freed while they are still in a bio. + */ + refcount_t refcnt; + + bool failed; /* IO error occurred before decompression? */ + bool need_verity; /* need fs-verity verification after decompression? */ + void *private; /* payload buffer for specified decompression algorithm */ + void *private2; /* extra payload buffer */ + struct work_struct verity_work; /* work to verify the decompressed pages */ + struct work_struct free_work; /* work for late free this structure itself */ +}; + +#define NULL_CLUSTER ((unsigned int)(~0)) +#define MIN_COMPRESS_LOG_SIZE 2 +#define MAX_COMPRESS_LOG_SIZE 8 +#define MAX_COMPRESS_WINDOW_SIZE(log_size) ((PAGE_SIZE) << (log_size)) + +struct f2fs_sb_info { + struct super_block *sb; /* pointer to VFS super block */ + struct proc_dir_entry *s_proc; /* proc entry */ + struct f2fs_super_block *raw_super; /* raw super block pointer */ + struct f2fs_rwsem sb_lock; /* lock for raw super block */ + int valid_super_block; /* valid super block no */ + unsigned long s_flag; /* flags for sbi */ + struct mutex writepages; /* mutex for writepages() */ + +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int blocks_per_blkz; /* F2FS blocks per zone */ + unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ +#endif + + /* for node-related operations */ + struct f2fs_nm_info *nm_info; /* node manager */ + struct inode *node_inode; /* cache node blocks */ + + /* for segment-related operations */ + struct f2fs_sm_info *sm_info; /* segment manager */ + + /* for bio operations */ + struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ + /* keep migration IO order for LFS mode */ + struct f2fs_rwsem io_order_lock; + mempool_t *write_io_dummy; /* Dummy pages */ + pgoff_t page_eio_ofs[NR_PAGE_TYPE]; /* EIO page offset */ + int page_eio_cnt[NR_PAGE_TYPE]; /* EIO count */ + + /* for checkpoint */ + struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + int cur_cp_pack; /* remain current cp pack */ + spinlock_t cp_lock; /* for flag in ckpt */ + struct inode *meta_inode; /* cache meta blocks */ + struct f2fs_rwsem cp_global_sem; /* checkpoint procedure lock */ + struct f2fs_rwsem cp_rwsem; /* blocking FS operations */ + struct f2fs_rwsem node_write; /* locking node writes */ + struct f2fs_rwsem node_change; /* locking node change */ + wait_queue_head_t cp_wait; + unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ + long interval_time[MAX_TIME]; /* to store thresholds */ + struct ckpt_req_control cprc_info; /* for checkpoint request control */ + + struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ + + spinlock_t fsync_node_lock; /* for node entry lock */ + struct list_head fsync_node_list; /* node list head */ + unsigned int fsync_seg_id; /* sequence id */ + unsigned int fsync_node_num; /* number of node entries */ + + /* for orphan inode, use 0'th array */ + unsigned int max_orphans; /* max orphan inodes */ + + /* for inode management */ + struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */ + spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */ + struct mutex flush_lock; /* for flush exclusion */ + + /* for extent tree cache */ + struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; + + /* basic filesystem units */ + unsigned int log_sectors_per_block; /* log2 sectors per block */ + unsigned int log_blocksize; /* log2 block size */ + unsigned int blocksize; /* block size */ + unsigned int root_ino_num; /* root inode number*/ + unsigned int node_ino_num; /* node inode number*/ + unsigned int meta_ino_num; /* meta inode number*/ + unsigned int log_blocks_per_seg; /* log2 blocks per segment */ + unsigned int blocks_per_seg; /* blocks per segment */ + unsigned int unusable_blocks_per_sec; /* unusable blocks per section */ + unsigned int segs_per_sec; /* segments per section */ + unsigned int secs_per_zone; /* sections per zone */ + unsigned int total_sections; /* total section count */ + unsigned int total_node_count; /* total node block count */ + unsigned int total_valid_node_count; /* valid node block count */ + int dir_level; /* directory level */ + int readdir_ra; /* readahead inode in readdir */ + u64 max_io_bytes; /* max io bytes to merge IOs */ + + block_t user_block_count; /* # of user blocks */ + block_t total_valid_block_count; /* # of valid blocks */ + block_t discard_blks; /* discard command candidats */ + block_t last_valid_block_count; /* for recovery */ + block_t reserved_blocks; /* configurable reserved blocks */ + block_t current_reserved_blocks; /* current reserved blocks */ + + /* Additional tracking for no checkpoint mode */ + block_t unusable_block_count; /* # of blocks saved by last cp */ + + unsigned int nquota_files; /* # of quota sysfile */ + struct f2fs_rwsem quota_sem; /* blocking cp for flags */ + + /* # of pages, see count_type */ + atomic_t nr_pages[NR_COUNT_TYPE]; + /* # of allocated blocks */ + struct percpu_counter alloc_valid_block_count; + /* # of node block writes as roll forward recovery */ + struct percpu_counter rf_node_block_count; + + /* writeback control */ + atomic_t wb_sync_req[META]; /* count # of WB_SYNC threads */ + + /* valid inode count */ + struct percpu_counter total_valid_inode_count; + + struct f2fs_mount_info mount_opt; /* mount options */ + + /* for cleaning operations */ + struct f2fs_rwsem gc_lock; /* + * semaphore for GC, avoid + * race between GC and GC or CP + */ + struct f2fs_gc_kthread *gc_thread; /* GC thread */ + struct atgc_management am; /* atgc management */ + unsigned int cur_victim_sec; /* current victim section num */ + unsigned int gc_mode; /* current GC state */ + unsigned int next_victim_seg[2]; /* next segment in victim section */ + spinlock_t gc_urgent_high_lock; + unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ + + /* for skip statistic */ + unsigned long long skipped_gc_rwsem; /* FG_GC only */ + + /* threshold for gc trials on pinned files */ + u64 gc_pin_file_threshold; + struct f2fs_rwsem pin_sem; + + /* maximum # of trials to find a victim segment for SSR and GC */ + unsigned int max_victim_search; + /* migration granularity of garbage collection, unit: segment */ + unsigned int migration_granularity; + + /* + * for stat information. + * one is for the LFS mode, and the other is for the SSR mode. + */ +#ifdef CONFIG_F2FS_STAT_FS + struct f2fs_stat_info *stat_info; /* FS status information */ + atomic_t meta_count[META_MAX]; /* # of meta blocks */ + unsigned int segment_count[2]; /* # of allocated segments */ + unsigned int block_count[2]; /* # of allocated blocks */ + atomic_t inplace_count; /* # of inplace update */ + /* # of lookup extent cache */ + atomic64_t total_hit_ext[NR_EXTENT_CACHES]; + /* # of hit rbtree extent node */ + atomic64_t read_hit_rbtree[NR_EXTENT_CACHES]; + /* # of hit cached extent node */ + atomic64_t read_hit_cached[NR_EXTENT_CACHES]; + /* # of hit largest extent node in read extent cache */ + atomic64_t read_hit_largest; + atomic_t inline_xattr; /* # of inline_xattr inodes */ + atomic_t inline_inode; /* # of inline_data inodes */ + atomic_t inline_dir; /* # of inline_dentry inodes */ + atomic_t compr_inode; /* # of compressed inodes */ + atomic64_t compr_blocks; /* # of compressed blocks */ + atomic_t swapfile_inode; /* # of swapfile inodes */ + atomic_t atomic_files; /* # of opened atomic file */ + atomic_t max_aw_cnt; /* max # of atomic writes */ + unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ + unsigned int other_skip_bggc; /* skip background gc for other reasons */ + unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ +#endif + spinlock_t stat_lock; /* lock for stat operations */ + + /* to attach REQ_META|REQ_FUA flags */ + unsigned int data_io_flag; + unsigned int node_io_flag; + + /* For sysfs support */ + struct kobject s_kobj; /* /sys/fs/f2fs/<devname> */ + struct completion s_kobj_unregister; + + struct kobject s_stat_kobj; /* /sys/fs/f2fs/<devname>/stat */ + struct completion s_stat_kobj_unregister; + + struct kobject s_feature_list_kobj; /* /sys/fs/f2fs/<devname>/feature_list */ + struct completion s_feature_list_kobj_unregister; + + /* For shrinker support */ + struct list_head s_list; + struct mutex umount_mutex; + unsigned int shrinker_run_no; + + /* For multi devices */ + int s_ndevs; /* number of devices */ + struct f2fs_dev_info *devs; /* for device list */ + unsigned int dirty_device; /* for checkpoint data flush */ + spinlock_t dev_lock; /* protect dirty_device */ + bool aligned_blksize; /* all devices has the same logical blksize */ + + /* For write statistics */ + u64 sectors_written_start; + u64 kbytes_written; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_chksum_seed; + + struct workqueue_struct *post_read_wq; /* post read workqueue */ + + unsigned char errors[MAX_F2FS_ERRORS]; /* error flags */ + spinlock_t error_lock; /* protect errors array */ + bool error_dirty; /* errors of sb is dirty */ + + struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ + unsigned int inline_xattr_slab_size; /* default inline xattr slab size */ + + /* For reclaimed segs statistics per each GC mode */ + unsigned int gc_segment_mode; /* GC state for reclaimed segments */ + unsigned int gc_reclaimed_segs[MAX_GC_MODE]; /* Reclaimed segs for each mode */ + + unsigned long seq_file_ra_mul; /* multiplier for ra_pages of seq. files in fadvise */ + + int max_fragment_chunk; /* max chunk size for block fragmentation mode */ + int max_fragment_hole; /* max hole size for block fragmentation mode */ + + /* For atomic write statistics */ + atomic64_t current_atomic_write; + s64 peak_atomic_write; + u64 committed_atomic_block; + u64 revoked_atomic_block; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct kmem_cache *page_array_slab; /* page array entry */ + unsigned int page_array_slab_size; /* default page array slab size */ + + /* For runtime compression statistics */ + u64 compr_written_block; + u64 compr_saved_block; + u32 compr_new_inode; + + /* For compressed block cache */ + struct inode *compress_inode; /* cache compressed blocks */ + unsigned int compress_percent; /* cache page percentage */ + unsigned int compress_watermark; /* cache page watermark */ + atomic_t compress_page_hit; /* cache hit count */ +#endif + +#ifdef CONFIG_F2FS_IOSTAT + /* For app/fs IO statistics */ + spinlock_t iostat_lock; + unsigned long long rw_iostat[NR_IO_TYPE]; + unsigned long long prev_rw_iostat[NR_IO_TYPE]; + bool iostat_enable; + unsigned long iostat_next_period; + unsigned int iostat_period_ms; + + /* For io latency related statistics info in one iostat period */ + spinlock_t iostat_lat_lock; + struct iostat_lat_info *iostat_io_lat; +#endif +}; + +#ifdef CONFIG_F2FS_FAULT_INJECTION +#define f2fs_show_injection_info(sbi, type) \ + printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", \ + KERN_INFO, sbi->sb->s_id, \ + f2fs_fault_name[type], \ + __func__, __builtin_return_address(0)) +static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; + + if (!ffi->inject_rate) + return false; + + if (!IS_FAULT_SET(ffi, type)) + return false; + + atomic_inc(&ffi->inject_ops); + if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { + atomic_set(&ffi->inject_ops, 0); + return true; + } + return false; +} +#else +#define f2fs_show_injection_info(sbi, type) do { } while (0) +static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) +{ + return false; +} +#endif + +/* + * Test if the mounted volume is a multi-device volume. + * - For a single regular disk volume, sbi->s_ndevs is 0. + * - For a single zoned disk volume, sbi->s_ndevs is 1. + * - For a multi-device volume, sbi->s_ndevs is always 2 or more. + */ +static inline bool f2fs_is_multi_device(struct f2fs_sb_info *sbi) +{ + return sbi->s_ndevs > 1; +} + +static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) +{ + unsigned long now = jiffies; + + sbi->last_time[type] = now; + + /* DISCARD_TIME and GC_TIME are based on REQ_TIME */ + if (type == REQ_TIME) { + sbi->last_time[DISCARD_TIME] = now; + sbi->last_time[GC_TIME] = now; + } +} + +static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) +{ + unsigned long interval = sbi->interval_time[type] * HZ; + + return time_after(jiffies, sbi->last_time[type] + interval); +} + +static inline unsigned int f2fs_time_to_wait(struct f2fs_sb_info *sbi, + int type) +{ + unsigned long interval = sbi->interval_time[type] * HZ; + unsigned int wait_ms = 0; + long delta; + + delta = (sbi->last_time[type] + interval) - jiffies; + if (delta > 0) + wait_ms = jiffies_to_msecs(delta); + + return wait_ms; +} + +/* + * Inline functions + */ +static inline u32 __f2fs_crc32(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + int err; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + +static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, + unsigned int length) +{ + return __f2fs_crc32(sbi, F2FS_SUPER_MAGIC, address, length); +} + +static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, + void *buf, size_t buf_size) +{ + return f2fs_crc32(sbi, buf, buf_size) == blk_crc; +} + +static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + return __f2fs_crc32(sbi, crc, address, length); +} + +static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) +{ + return container_of(inode, struct f2fs_inode_info, vfs_inode); +} + +static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode) +{ + return F2FS_SB(inode->i_sb); +} + +static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping) +{ + return F2FS_I_SB(mapping->host); +} + +static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page) +{ + return F2FS_M_SB(page_file_mapping(page)); +} + +static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_super_block *)(sbi->raw_super); +} + +static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_checkpoint *)(sbi->ckpt); +} + +static inline struct f2fs_node *F2FS_NODE(struct page *page) +{ + return (struct f2fs_node *)page_address(page); +} + +static inline struct f2fs_inode *F2FS_INODE(struct page *page) +{ + return &((struct f2fs_node *)page_address(page))->i; +} + +static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_nm_info *)(sbi->nm_info); +} + +static inline struct f2fs_sm_info *SM_I(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_sm_info *)(sbi->sm_info); +} + +static inline struct sit_info *SIT_I(struct f2fs_sb_info *sbi) +{ + return (struct sit_info *)(SM_I(sbi)->sit_info); +} + +static inline struct free_segmap_info *FREE_I(struct f2fs_sb_info *sbi) +{ + return (struct free_segmap_info *)(SM_I(sbi)->free_info); +} + +static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi) +{ + return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info); +} + +static inline struct address_space *META_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->meta_inode->i_mapping; +} + +static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->node_inode->i_mapping; +} + +static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type) +{ + return test_bit(type, &sbi->s_flag); +} + +static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) +{ + set_bit(type, &sbi->s_flag); +} + +static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) +{ + clear_bit(type, &sbi->s_flag); +} + +static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) +{ + return le64_to_cpu(cp->checkpoint_ver); +} + +static inline unsigned long f2fs_qf_ino(struct super_block *sb, int type) +{ + if (type < F2FS_MAX_QUOTAS) + return le32_to_cpu(F2FS_SB(sb)->raw_super->qf_ino[type]); + return 0; +} + +static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) +{ + size_t crc_offset = le32_to_cpu(cp->checksum_offset); + return le32_to_cpu(*((__le32 *)((unsigned char *)cp + crc_offset))); +} + +static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + + return ckpt_flags & f; +} + +static inline bool is_set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) +{ + return __is_set_ckpt_flags(F2FS_CKPT(sbi), f); +} + +static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); + ckpt_flags |= f; + cp->ckpt_flags = cpu_to_le32(ckpt_flags); +} + +static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) +{ + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); + __set_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); + ckpt_flags &= (~f); + cp->ckpt_flags = cpu_to_le32(ckpt_flags); +} + +static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) +{ + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); + __clear_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +#define init_f2fs_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_f2fs_rwsem((sem), #sem, &__key); \ +} while (0) + +static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem, + const char *sem_name, struct lock_class_key *key) +{ + __init_rwsem(&sem->internal_rwsem, sem_name, key); +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + init_waitqueue_head(&sem->read_waiters); +#endif +} + +static inline int f2fs_rwsem_is_locked(struct f2fs_rwsem *sem) +{ + return rwsem_is_locked(&sem->internal_rwsem); +} + +static inline int f2fs_rwsem_is_contended(struct f2fs_rwsem *sem) +{ + return rwsem_is_contended(&sem->internal_rwsem); +} + +static inline void f2fs_down_read(struct f2fs_rwsem *sem) +{ +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + wait_event(sem->read_waiters, down_read_trylock(&sem->internal_rwsem)); +#else + down_read(&sem->internal_rwsem); +#endif +} + +static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem) +{ + return down_read_trylock(&sem->internal_rwsem); +} + +static inline void f2fs_up_read(struct f2fs_rwsem *sem) +{ + up_read(&sem->internal_rwsem); +} + +static inline void f2fs_down_write(struct f2fs_rwsem *sem) +{ + down_write(&sem->internal_rwsem); +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass) +{ + down_read_nested(&sem->internal_rwsem, subclass); +} + +static inline void f2fs_down_write_nested(struct f2fs_rwsem *sem, int subclass) +{ + down_write_nested(&sem->internal_rwsem, subclass); +} +#else +#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem) +#define f2fs_down_write_nested(sem, subclass) f2fs_down_write(sem) +#endif + +static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem) +{ + return down_write_trylock(&sem->internal_rwsem); +} + +static inline void f2fs_up_write(struct f2fs_rwsem *sem) +{ + up_write(&sem->internal_rwsem); +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + wake_up_all(&sem->read_waiters); +#endif +} + +static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) +{ + f2fs_down_read(&sbi->cp_rwsem); +} + +static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) +{ + if (time_to_inject(sbi, FAULT_LOCK_OP)) { + f2fs_show_injection_info(sbi, FAULT_LOCK_OP); + return 0; + } + return f2fs_down_read_trylock(&sbi->cp_rwsem); +} + +static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) +{ + f2fs_up_read(&sbi->cp_rwsem); +} + +static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) +{ + f2fs_down_write(&sbi->cp_rwsem); +} + +static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) +{ + f2fs_up_write(&sbi->cp_rwsem); +} + +static inline int __get_cp_reason(struct f2fs_sb_info *sbi) +{ + int reason = CP_SYNC; + + if (test_opt(sbi, FASTBOOT)) + reason = CP_FASTBOOT; + if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) + reason = CP_UMOUNT; + return reason; +} + +static inline bool __remain_node_summaries(int reason) +{ + return (reason & (CP_UMOUNT | CP_FASTBOOT)); +} + +static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) +{ + return (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG) || + is_set_ckpt_flags(sbi, CP_FASTBOOT_FLAG)); +} + +/* + * Check whether the inode has blocks or not + */ +static inline int F2FS_HAS_BLOCKS(struct inode *inode) +{ + block_t xattr_block = F2FS_I(inode)->i_xattr_nid ? 1 : 0; + + return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > xattr_block; +} + +static inline bool f2fs_has_xattr_block(unsigned int ofs) +{ + return ofs == XATTR_NODE_OFFSET; +} + +static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, + struct inode *inode, bool cap) +{ + if (!inode) + return true; + if (!test_opt(sbi, RESERVE_ROOT)) + return false; + if (IS_NOQUOTA(inode)) + return true; + if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid())) + return true; + if (!gid_eq(F2FS_OPTION(sbi).s_resgid, GLOBAL_ROOT_GID) && + in_group_p(F2FS_OPTION(sbi).s_resgid)) + return true; + if (cap && capable(CAP_SYS_RESOURCE)) + return true; + return false; +} + +static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); +static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, + struct inode *inode, blkcnt_t *count) +{ + blkcnt_t diff = 0, release = 0; + block_t avail_user_block_count; + int ret; + + ret = dquot_reserve_block(inode, *count); + if (ret) + return ret; + + if (time_to_inject(sbi, FAULT_BLOCK)) { + f2fs_show_injection_info(sbi, FAULT_BLOCK); + release = *count; + goto release_quota; + } + + /* + * let's increase this in prior to actual block count change in order + * for f2fs_sync_file to avoid data races when deciding checkpoint. + */ + percpu_counter_add(&sbi->alloc_valid_block_count, (*count)); + + spin_lock(&sbi->stat_lock); + sbi->total_valid_block_count += (block_t)(*count); + avail_user_block_count = sbi->user_block_count - + sbi->current_reserved_blocks; + + if (!__allow_reserved_blocks(sbi, inode, true)) + avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; + + if (F2FS_IO_ALIGNED(sbi)) + avail_user_block_count -= sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments; + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (avail_user_block_count > sbi->unusable_block_count) + avail_user_block_count -= sbi->unusable_block_count; + else + avail_user_block_count = 0; + } + if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { + diff = sbi->total_valid_block_count - avail_user_block_count; + if (diff > *count) + diff = *count; + *count -= diff; + release = diff; + sbi->total_valid_block_count -= diff; + if (!*count) { + spin_unlock(&sbi->stat_lock); + goto enospc; + } + } + spin_unlock(&sbi->stat_lock); + + if (unlikely(release)) { + percpu_counter_sub(&sbi->alloc_valid_block_count, release); + dquot_release_reservation_block(inode, release); + } + f2fs_i_blocks_write(inode, *count, true, true); + return 0; + +enospc: + percpu_counter_sub(&sbi->alloc_valid_block_count, release); +release_quota: + dquot_release_reservation_block(inode, release); + return -ENOSPC; +} + +__printf(2, 3) +void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...); + +#define f2fs_err(sbi, fmt, ...) \ + f2fs_printk(sbi, KERN_ERR fmt, ##__VA_ARGS__) +#define f2fs_warn(sbi, fmt, ...) \ + f2fs_printk(sbi, KERN_WARNING fmt, ##__VA_ARGS__) +#define f2fs_notice(sbi, fmt, ...) \ + f2fs_printk(sbi, KERN_NOTICE fmt, ##__VA_ARGS__) +#define f2fs_info(sbi, fmt, ...) \ + f2fs_printk(sbi, KERN_INFO fmt, ##__VA_ARGS__) +#define f2fs_debug(sbi, fmt, ...) \ + f2fs_printk(sbi, KERN_DEBUG fmt, ##__VA_ARGS__) + +static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, + struct inode *inode, + block_t count) +{ + blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK; + + spin_lock(&sbi->stat_lock); + f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); + sbi->total_valid_block_count -= (block_t)count; + if (sbi->reserved_blocks && + sbi->current_reserved_blocks < sbi->reserved_blocks) + sbi->current_reserved_blocks = min(sbi->reserved_blocks, + sbi->current_reserved_blocks + count); + spin_unlock(&sbi->stat_lock); + if (unlikely(inode->i_blocks < sectors)) { + f2fs_warn(sbi, "Inconsistent i_blocks, ino:%lu, iblocks:%llu, sectors:%llu", + inode->i_ino, + (unsigned long long)inode->i_blocks, + (unsigned long long)sectors); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return; + } + f2fs_i_blocks_write(inode, count, false, true); +} + +static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) +{ + atomic_inc(&sbi->nr_pages[count_type]); + + if (count_type == F2FS_DIRTY_DENTS || + count_type == F2FS_DIRTY_NODES || + count_type == F2FS_DIRTY_META || + count_type == F2FS_DIRTY_QDATA || + count_type == F2FS_DIRTY_IMETA) + set_sbi_flag(sbi, SBI_IS_DIRTY); +} + +static inline void inode_inc_dirty_pages(struct inode *inode) +{ + atomic_inc(&F2FS_I(inode)->dirty_pages); + inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); + if (IS_NOQUOTA(inode)) + inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); +} + +static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) +{ + atomic_dec(&sbi->nr_pages[count_type]); +} + +static inline void inode_dec_dirty_pages(struct inode *inode) +{ + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + return; + + atomic_dec(&F2FS_I(inode)->dirty_pages); + dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); + if (IS_NOQUOTA(inode)) + dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); +} + +static inline void inc_atomic_write_cnt(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + u64 current_write; + + fi->atomic_write_cnt++; + atomic64_inc(&sbi->current_atomic_write); + current_write = atomic64_read(&sbi->current_atomic_write); + if (current_write > sbi->peak_atomic_write) + sbi->peak_atomic_write = current_write; +} + +static inline void release_atomic_write_cnt(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + + atomic64_sub(fi->atomic_write_cnt, &sbi->current_atomic_write); + fi->atomic_write_cnt = 0; +} + +static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) +{ + return atomic_read(&sbi->nr_pages[count_type]); +} + +static inline int get_dirty_pages(struct inode *inode) +{ + return atomic_read(&F2FS_I(inode)->dirty_pages); +} + +static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) +{ + unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; + unsigned int segs = (get_pages(sbi, block_type) + pages_per_sec - 1) >> + sbi->log_blocks_per_seg; + + return segs / sbi->segs_per_sec; +} + +static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) +{ + return sbi->total_valid_block_count; +} + +static inline block_t discard_blocks(struct f2fs_sb_info *sbi) +{ + return sbi->discard_blks; +} + +static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + + /* return NAT or SIT bitmap */ + if (flag == NAT_BITMAP) + return le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); + else if (flag == SIT_BITMAP) + return le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); + + return 0; +} + +static inline block_t __cp_payload(struct f2fs_sb_info *sbi) +{ + return le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); +} + +static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + void *tmp_ptr = &ckpt->sit_nat_version_bitmap; + int offset; + + if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) { + offset = (flag == SIT_BITMAP) ? + le32_to_cpu(ckpt->nat_ver_bitmap_bytesize) : 0; + /* + * if large_nat_bitmap feature is enabled, leave checksum + * protection for all nat/sit bitmaps. + */ + return tmp_ptr + offset + sizeof(__le32); + } + + if (__cp_payload(sbi) > 0) { + if (flag == NAT_BITMAP) + return tmp_ptr; + else + return (unsigned char *)ckpt + F2FS_BLKSIZE; + } else { + offset = (flag == NAT_BITMAP) ? + le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; + return tmp_ptr + offset; + } +} + +static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) +{ + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + + if (sbi->cur_cp_pack == 2) + start_addr += sbi->blocks_per_seg; + return start_addr; +} + +static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi) +{ + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + + if (sbi->cur_cp_pack == 1) + start_addr += sbi->blocks_per_seg; + return start_addr; +} + +static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi) +{ + sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 2 : 1; +} + +static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) +{ + return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); +} + +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); +static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, + struct inode *inode, bool is_inode) +{ + block_t valid_block_count; + unsigned int valid_node_count, user_block_count; + int err; + + if (is_inode) { + if (inode) { + err = dquot_alloc_inode(inode); + if (err) + return err; + } + } else { + err = dquot_reserve_block(inode, 1); + if (err) + return err; + } + + if (time_to_inject(sbi, FAULT_BLOCK)) { + f2fs_show_injection_info(sbi, FAULT_BLOCK); + goto enospc; + } + + spin_lock(&sbi->stat_lock); + + valid_block_count = sbi->total_valid_block_count + + sbi->current_reserved_blocks + 1; + + if (!__allow_reserved_blocks(sbi, inode, false)) + valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; + + if (F2FS_IO_ALIGNED(sbi)) + valid_block_count += sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments; + + user_block_count = sbi->user_block_count; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + user_block_count -= sbi->unusable_block_count; + + if (unlikely(valid_block_count > user_block_count)) { + spin_unlock(&sbi->stat_lock); + goto enospc; + } + + valid_node_count = sbi->total_valid_node_count + 1; + if (unlikely(valid_node_count > sbi->total_node_count)) { + spin_unlock(&sbi->stat_lock); + goto enospc; + } + + sbi->total_valid_node_count++; + sbi->total_valid_block_count++; + spin_unlock(&sbi->stat_lock); + + if (inode) { + if (is_inode) + f2fs_mark_inode_dirty_sync(inode, true); + else + f2fs_i_blocks_write(inode, 1, true, true); + } + + percpu_counter_inc(&sbi->alloc_valid_block_count); + return 0; + +enospc: + if (is_inode) { + if (inode) + dquot_free_inode(inode); + } else { + dquot_release_reservation_block(inode, 1); + } + return -ENOSPC; +} + +static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, + struct inode *inode, bool is_inode) +{ + spin_lock(&sbi->stat_lock); + + if (unlikely(!sbi->total_valid_block_count || + !sbi->total_valid_node_count)) { + f2fs_warn(sbi, "dec_valid_node_count: inconsistent block counts, total_valid_block:%u, total_valid_node:%u", + sbi->total_valid_block_count, + sbi->total_valid_node_count); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } else { + sbi->total_valid_block_count--; + sbi->total_valid_node_count--; + } + + if (sbi->reserved_blocks && + sbi->current_reserved_blocks < sbi->reserved_blocks) + sbi->current_reserved_blocks++; + + spin_unlock(&sbi->stat_lock); + + if (is_inode) { + dquot_free_inode(inode); + } else { + if (unlikely(inode->i_blocks == 0)) { + f2fs_warn(sbi, "dec_valid_node_count: inconsistent i_blocks, ino:%lu, iblocks:%llu", + inode->i_ino, + (unsigned long long)inode->i_blocks); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return; + } + f2fs_i_blocks_write(inode, 1, false, true); + } +} + +static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) +{ + return sbi->total_valid_node_count; +} + +static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) +{ + percpu_counter_inc(&sbi->total_valid_inode_count); +} + +static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) +{ + percpu_counter_dec(&sbi->total_valid_inode_count); +} + +static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) +{ + return percpu_counter_sum_positive(&sbi->total_valid_inode_count); +} + +static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, + pgoff_t index, bool for_write) +{ + struct page *page; + unsigned int flags; + + if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) { + if (!for_write) + page = find_get_page_flags(mapping, index, + FGP_LOCK | FGP_ACCESSED); + else + page = find_lock_page(mapping, index); + if (page) + return page; + + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { + f2fs_show_injection_info(F2FS_M_SB(mapping), + FAULT_PAGE_ALLOC); + return NULL; + } + } + + if (!for_write) + return grab_cache_page(mapping, index); + + flags = memalloc_nofs_save(); + page = grab_cache_page_write_begin(mapping, index); + memalloc_nofs_restore(flags); + + return page; +} + +static inline struct page *f2fs_pagecache_get_page( + struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp_mask) +{ + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) { + f2fs_show_injection_info(F2FS_M_SB(mapping), FAULT_PAGE_GET); + return NULL; + } + + return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); +} + +static inline void f2fs_put_page(struct page *page, int unlock) +{ + if (!page) + return; + + if (unlock) { + f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page)); + unlock_page(page); + } + put_page(page); +} + +static inline void f2fs_put_dnode(struct dnode_of_data *dn) +{ + if (dn->node_page) + f2fs_put_page(dn->node_page, 1); + if (dn->inode_page && dn->node_page != dn->inode_page) + f2fs_put_page(dn->inode_page, 0); + dn->node_page = NULL; + dn->inode_page = NULL; +} + +static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, + size_t size) +{ + return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); +} + +static inline void *f2fs_kmem_cache_alloc_nofail(struct kmem_cache *cachep, + gfp_t flags) +{ + void *entry; + + entry = kmem_cache_alloc(cachep, flags); + if (!entry) + entry = kmem_cache_alloc(cachep, flags | __GFP_NOFAIL); + return entry; +} + +static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, + gfp_t flags, bool nofail, struct f2fs_sb_info *sbi) +{ + if (nofail) + return f2fs_kmem_cache_alloc_nofail(cachep, flags); + + if (time_to_inject(sbi, FAULT_SLAB_ALLOC)) { + f2fs_show_injection_info(sbi, FAULT_SLAB_ALLOC); + return NULL; + } + + return kmem_cache_alloc(cachep, flags); +} + +static inline bool is_inflight_io(struct f2fs_sb_info *sbi, int type) +{ + if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) || + get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || + get_pages(sbi, F2FS_WB_CP_DATA) || + get_pages(sbi, F2FS_DIO_READ) || + get_pages(sbi, F2FS_DIO_WRITE)) + return true; + + if (type != DISCARD_TIME && SM_I(sbi) && SM_I(sbi)->dcc_info && + atomic_read(&SM_I(sbi)->dcc_info->queued_discard)) + return true; + + if (SM_I(sbi) && SM_I(sbi)->fcc_info && + atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) + return true; + return false; +} + +static inline bool is_idle(struct f2fs_sb_info *sbi, int type) +{ + if (sbi->gc_mode == GC_URGENT_HIGH) + return true; + + if (is_inflight_io(sbi, type)) + return false; + + if (sbi->gc_mode == GC_URGENT_MID) + return true; + + if (sbi->gc_mode == GC_URGENT_LOW && + (type == DISCARD_TIME || type == GC_TIME)) + return true; + + return f2fs_time_over(sbi, type); +} + +static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, + unsigned long index, void *item) +{ + while (radix_tree_insert(root, index, item)) + cond_resched(); +} + +#define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino) + +static inline bool IS_INODE(struct page *page) +{ + struct f2fs_node *p = F2FS_NODE(page); + + return RAW_IS_INODE(p); +} + +static inline int offset_in_addr(struct f2fs_inode *i) +{ + return (i->i_inline & F2FS_EXTRA_ATTR) ? + (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0; +} + +static inline __le32 *blkaddr_in_node(struct f2fs_node *node) +{ + return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr; +} + +static inline int f2fs_has_extra_attr(struct inode *inode); +static inline block_t data_blkaddr(struct inode *inode, + struct page *node_page, unsigned int offset) +{ + struct f2fs_node *raw_node; + __le32 *addr_array; + int base = 0; + bool is_inode = IS_INODE(node_page); + + raw_node = F2FS_NODE(node_page); + + if (is_inode) { + if (!inode) + /* from GC path only */ + base = offset_in_addr(&raw_node->i); + else if (f2fs_has_extra_attr(inode)) + base = get_extra_isize(inode); + } + + addr_array = blkaddr_in_node(raw_node); + return le32_to_cpu(addr_array[base + offset]); +} + +static inline block_t f2fs_data_blkaddr(struct dnode_of_data *dn) +{ + return data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node); +} + +static inline int f2fs_test_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = BIT(7 - (nr & 0x07)); + return mask & *addr; +} + +static inline void f2fs_set_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = BIT(7 - (nr & 0x07)); + *addr |= mask; +} + +static inline void f2fs_clear_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = BIT(7 - (nr & 0x07)); + *addr &= ~mask; +} + +static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr) +{ + int mask; + int ret; + + addr += (nr >> 3); + mask = BIT(7 - (nr & 0x07)); + ret = mask & *addr; + *addr |= mask; + return ret; +} + +static inline int f2fs_test_and_clear_bit(unsigned int nr, char *addr) +{ + int mask; + int ret; + + addr += (nr >> 3); + mask = BIT(7 - (nr & 0x07)); + ret = mask & *addr; + *addr &= ~mask; + return ret; +} + +static inline void f2fs_change_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = BIT(7 - (nr & 0x07)); + *addr ^= mask; +} + +/* + * On-disk inode flags (f2fs_inode::i_flags) + */ +#define F2FS_COMPR_FL 0x00000004 /* Compress file */ +#define F2FS_SYNC_FL 0x00000008 /* Synchronous updates */ +#define F2FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define F2FS_APPEND_FL 0x00000020 /* writes to file may only append */ +#define F2FS_NODUMP_FL 0x00000040 /* do not dump file */ +#define F2FS_NOATIME_FL 0x00000080 /* do not update atime */ +#define F2FS_NOCOMP_FL 0x00000400 /* Don't compress */ +#define F2FS_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define F2FS_CASEFOLD_FL 0x40000000 /* Casefolded file */ + +/* Flags that should be inherited by new inodes from their parent. */ +#define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \ + F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ + F2FS_CASEFOLD_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ + F2FS_CASEFOLD_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define F2FS_OTHER_FLMASK (F2FS_NODUMP_FL | F2FS_NOATIME_FL) + +static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & F2FS_REG_FLMASK; + else + return flags & F2FS_OTHER_FLMASK; +} + +static inline void __mark_inode_dirty_flag(struct inode *inode, + int flag, bool set) +{ + switch (flag) { + case FI_INLINE_XATTR: + case FI_INLINE_DATA: + case FI_INLINE_DENTRY: + case FI_NEW_INODE: + if (set) + return; + fallthrough; + case FI_DATA_EXIST: + case FI_INLINE_DOTS: + case FI_PIN_FILE: + case FI_COMPRESS_RELEASED: + f2fs_mark_inode_dirty_sync(inode, true); + } +} + +static inline void set_inode_flag(struct inode *inode, int flag) +{ + set_bit(flag, F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, true); +} + +static inline int is_inode_flag_set(struct inode *inode, int flag) +{ + return test_bit(flag, F2FS_I(inode)->flags); +} + +static inline void clear_inode_flag(struct inode *inode, int flag) +{ + clear_bit(flag, F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, false); +} + +static inline bool f2fs_verity_in_progress(struct inode *inode) +{ + return IS_ENABLED(CONFIG_FS_VERITY) && + is_inode_flag_set(inode, FI_VERITY_IN_PROGRESS); +} + +static inline void set_acl_inode(struct inode *inode, umode_t mode) +{ + F2FS_I(inode)->i_acl_mode = mode; + set_inode_flag(inode, FI_ACL_MODE); + f2fs_mark_inode_dirty_sync(inode, false); +} + +static inline void f2fs_i_links_write(struct inode *inode, bool inc) +{ + if (inc) + inc_nlink(inode); + else + drop_nlink(inode); + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void f2fs_i_blocks_write(struct inode *inode, + block_t diff, bool add, bool claim) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + /* add = 1, claim = 1 should be dquot_reserve_block in pair */ + if (add) { + if (claim) + dquot_claim_block(inode, diff); + else + dquot_alloc_block_nofail(inode, diff); + } else { + dquot_free_block(inode, diff); + } + + f2fs_mark_inode_dirty_sync(inode, true); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline bool f2fs_is_atomic_file(struct inode *inode); + +static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + if (i_size_read(inode) == i_size) + return; + + i_size_write(inode, i_size); + + if (f2fs_is_atomic_file(inode)) + return; + + f2fs_mark_inode_dirty_sync(inode, true); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) +{ + F2FS_I(inode)->i_current_depth = depth; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void f2fs_i_gc_failures_write(struct inode *inode, + unsigned int count) +{ + F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = count; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) +{ + F2FS_I(inode)->i_xattr_nid = xnid; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) +{ + F2FS_I(inode)->i_pino = pino; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + if (ri->i_inline & F2FS_INLINE_XATTR) + set_bit(FI_INLINE_XATTR, fi->flags); + if (ri->i_inline & F2FS_INLINE_DATA) + set_bit(FI_INLINE_DATA, fi->flags); + if (ri->i_inline & F2FS_INLINE_DENTRY) + set_bit(FI_INLINE_DENTRY, fi->flags); + if (ri->i_inline & F2FS_DATA_EXIST) + set_bit(FI_DATA_EXIST, fi->flags); + if (ri->i_inline & F2FS_INLINE_DOTS) + set_bit(FI_INLINE_DOTS, fi->flags); + if (ri->i_inline & F2FS_EXTRA_ATTR) + set_bit(FI_EXTRA_ATTR, fi->flags); + if (ri->i_inline & F2FS_PIN_FILE) + set_bit(FI_PIN_FILE, fi->flags); + if (ri->i_inline & F2FS_COMPRESS_RELEASED) + set_bit(FI_COMPRESS_RELEASED, fi->flags); +} + +static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) +{ + ri->i_inline = 0; + + if (is_inode_flag_set(inode, FI_INLINE_XATTR)) + ri->i_inline |= F2FS_INLINE_XATTR; + if (is_inode_flag_set(inode, FI_INLINE_DATA)) + ri->i_inline |= F2FS_INLINE_DATA; + if (is_inode_flag_set(inode, FI_INLINE_DENTRY)) + ri->i_inline |= F2FS_INLINE_DENTRY; + if (is_inode_flag_set(inode, FI_DATA_EXIST)) + ri->i_inline |= F2FS_DATA_EXIST; + if (is_inode_flag_set(inode, FI_INLINE_DOTS)) + ri->i_inline |= F2FS_INLINE_DOTS; + if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) + ri->i_inline |= F2FS_EXTRA_ATTR; + if (is_inode_flag_set(inode, FI_PIN_FILE)) + ri->i_inline |= F2FS_PIN_FILE; + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + ri->i_inline |= F2FS_COMPRESS_RELEASED; +} + +static inline int f2fs_has_extra_attr(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_EXTRA_ATTR); +} + +static inline int f2fs_has_inline_xattr(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_INLINE_XATTR); +} + +static inline int f2fs_compressed_file(struct inode *inode) +{ + return S_ISREG(inode->i_mode) && + is_inode_flag_set(inode, FI_COMPRESSED_FILE); +} + +static inline bool f2fs_need_compress_data(struct inode *inode) +{ + int compress_mode = F2FS_OPTION(F2FS_I_SB(inode)).compress_mode; + + if (!f2fs_compressed_file(inode)) + return false; + + if (compress_mode == COMPR_MODE_FS) + return true; + else if (compress_mode == COMPR_MODE_USER && + is_inode_flag_set(inode, FI_ENABLE_COMPRESS)) + return true; + + return false; +} + +static inline unsigned int addrs_per_inode(struct inode *inode) +{ + unsigned int addrs = CUR_ADDRS_PER_INODE(inode) - + get_inline_xattr_addrs(inode); + + if (!f2fs_compressed_file(inode)) + return addrs; + return ALIGN_DOWN(addrs, F2FS_I(inode)->i_cluster_size); +} + +static inline unsigned int addrs_per_block(struct inode *inode) +{ + if (!f2fs_compressed_file(inode)) + return DEF_ADDRS_PER_BLOCK; + return ALIGN_DOWN(DEF_ADDRS_PER_BLOCK, F2FS_I(inode)->i_cluster_size); +} + +static inline void *inline_xattr_addr(struct inode *inode, struct page *page) +{ + struct f2fs_inode *ri = F2FS_INODE(page); + + return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - + get_inline_xattr_addrs(inode)]); +} + +static inline int inline_xattr_size(struct inode *inode) +{ + if (f2fs_has_inline_xattr(inode)) + return get_inline_xattr_addrs(inode) * sizeof(__le32); + return 0; +} + +/* + * Notice: check inline_data flag without inode page lock is unsafe. + * It could change at any time by f2fs_convert_inline_page(). + */ +static inline int f2fs_has_inline_data(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_INLINE_DATA); +} + +static inline int f2fs_exist_data(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_DATA_EXIST); +} + +static inline int f2fs_has_inline_dots(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_INLINE_DOTS); +} + +static inline int f2fs_is_mmap_file(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_MMAP_FILE); +} + +static inline bool f2fs_is_pinned_file(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_PIN_FILE); +} + +static inline bool f2fs_is_atomic_file(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_ATOMIC_FILE); +} + +static inline bool f2fs_is_cow_file(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_COW_FILE); +} + +static inline bool f2fs_is_first_block_written(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); +} + +static inline bool f2fs_is_drop_cache(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_DROP_CACHE); +} + +static inline void *inline_data_addr(struct inode *inode, struct page *page) +{ + struct f2fs_inode *ri = F2FS_INODE(page); + int extra_size = get_extra_isize(inode); + + return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]); +} + +static inline int f2fs_has_inline_dentry(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_INLINE_DENTRY); +} + +static inline int is_file(struct inode *inode, int type) +{ + return F2FS_I(inode)->i_advise & type; +} + +static inline void set_file(struct inode *inode, int type) +{ + if (is_file(inode, type)) + return; + F2FS_I(inode)->i_advise |= type; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void clear_file(struct inode *inode, int type) +{ + if (!is_file(inode, type)) + return; + F2FS_I(inode)->i_advise &= ~type; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline bool f2fs_is_time_consistent(struct inode *inode) +{ + if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) + return false; + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) + return false; + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) + return false; + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3, + &F2FS_I(inode)->i_crtime)) + return false; + return true; +} + +static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) +{ + bool ret; + + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + file_keep_isize(inode) || + i_size_read(inode) & ~PAGE_MASK) + return false; + + if (!f2fs_is_time_consistent(inode)) + return false; + + spin_lock(&F2FS_I(inode)->i_size_lock); + ret = F2FS_I(inode)->last_disk_size == i_size_read(inode); + spin_unlock(&F2FS_I(inode)->i_size_lock); + + return ret; +} + +static inline bool f2fs_readonly(struct super_block *sb) +{ + return sb_rdonly(sb); +} + +static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) +{ + return is_set_ckpt_flags(sbi, CP_ERROR_FLAG); +} + +static inline bool is_dot_dotdot(const u8 *name, size_t len) +{ + if (len == 1 && name[0] == '.') + return true; + + if (len == 2 && name[0] == '.' && name[1] == '.') + return true; + + return false; +} + +static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + if (time_to_inject(sbi, FAULT_KMALLOC)) { + f2fs_show_injection_info(sbi, FAULT_KMALLOC); + return NULL; + } + + return kmalloc(size, flags); +} + +static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + return f2fs_kmalloc(sbi, size, flags | __GFP_ZERO); +} + +static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + if (time_to_inject(sbi, FAULT_KVMALLOC)) { + f2fs_show_injection_info(sbi, FAULT_KVMALLOC); + return NULL; + } + + return kvmalloc(size, flags); +} + +static inline void *f2fs_kvzalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + return f2fs_kvmalloc(sbi, size, flags | __GFP_ZERO); +} + +static inline int get_extra_isize(struct inode *inode) +{ + return F2FS_I(inode)->i_extra_isize / sizeof(__le32); +} + +static inline int get_inline_xattr_addrs(struct inode *inode) +{ + return F2FS_I(inode)->i_inline_xattr_size; +} + +#define f2fs_get_inode_mode(i) \ + ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ + (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) + +#define F2FS_TOTAL_EXTRA_ATTR_SIZE \ + (offsetof(struct f2fs_inode, i_extra_end) - \ + offsetof(struct f2fs_inode, i_extra_isize)) \ + +#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) +#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ + ((offsetof(typeof(*(f2fs_inode)), field) + \ + sizeof((f2fs_inode)->field)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \ + +#define __is_large_section(sbi) ((sbi)->segs_per_sec > 1) + +#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META) + +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type); +static inline void verify_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) { + f2fs_err(sbi, "invalid blkaddr: %u, type: %d, run fsck to fix.", + blkaddr, type); + f2fs_bug_on(sbi, 1); + } +} + +static inline bool __is_valid_data_blkaddr(block_t blkaddr) +{ + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR || + blkaddr == COMPRESS_ADDR) + return false; + return true; +} + +/* + * file.c + */ +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +void f2fs_truncate_data_blocks(struct dnode_of_data *dn); +int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock); +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); +int f2fs_truncate(struct inode *inode); +int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); +int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); +int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); +void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); +int f2fs_precache_extents(struct inode *inode); +int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int f2fs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid); +int f2fs_pin_file_control(struct inode *inode, bool inc); + +/* + * inode.c + */ +void f2fs_set_inode_flags(struct inode *inode); +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); +struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); +int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); +void f2fs_update_inode(struct inode *inode, struct page *node_page); +void f2fs_update_inode_page(struct inode *inode); +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); +void f2fs_evict_inode(struct inode *inode); +void f2fs_handle_failed_inode(struct inode *inode); + +/* + * namei.c + */ +int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool hot, bool set); +struct dentry *f2fs_get_parent(struct dentry *child); +int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct inode **new_inode); + +/* + * dir.c + */ +unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de); +int f2fs_init_casefolded_name(const struct inode *dir, + struct f2fs_filename *fname); +int f2fs_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct f2fs_filename *fname); +int f2fs_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct f2fs_filename *fname); +void f2fs_free_filename(struct f2fs_filename *fname); +struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, + const struct f2fs_filename *fname, int *max_slots); +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, + unsigned int start_pos, struct fscrypt_str *fstr); +void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, + struct f2fs_dentry_ptr *d); +struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, + const struct f2fs_filename *fname, struct page *dpage); +void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, + unsigned int current_depth); +int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots); +void f2fs_drop_nlink(struct inode *dir, struct inode *inode); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + const struct f2fs_filename *fname, + struct page **res_page); +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + const struct qstr *child, struct page **res_page); +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p); +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, + struct page **page); +void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, + struct page *page, struct inode *inode); +bool f2fs_has_enough_room(struct inode *dir, struct page *ipage, + const struct f2fs_filename *fname); +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, + const struct fscrypt_str *name, f2fs_hash_t name_hash, + unsigned int bit_pos); +int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname, + struct inode *inode, nid_t ino, umode_t mode); +int f2fs_add_dentry(struct inode *dir, const struct f2fs_filename *fname, + struct inode *inode, nid_t ino, umode_t mode); +int f2fs_do_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir); +bool f2fs_empty_dir(struct inode *dir); + +static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) +{ + if (fscrypt_is_nokey_name(dentry)) + return -ENOKEY; + return f2fs_do_add_link(d_inode(dentry->d_parent), &dentry->d_name, + inode, inode->i_ino, inode->i_mode); +} + +/* + * super.c + */ +int f2fs_inode_dirtied(struct inode *inode, bool sync); +void f2fs_inode_synced(struct inode *inode); +int f2fs_dquot_initialize(struct inode *inode); +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); +int f2fs_quota_sync(struct super_block *sb, int type); +loff_t max_file_blocks(struct inode *inode); +void f2fs_quota_off_umount(struct super_block *sb); +void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason); +void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag); +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); +int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); +int f2fs_sync_fs(struct super_block *sb, int sync); +int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); + +/* + * hash.c + */ +void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname); + +/* + * node.c + */ +struct node_info; + +int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi); +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi); +int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, + struct node_info *ni, bool checkpoint_context); +pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); +int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); +int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); +int f2fs_truncate_xattr_node(struct inode *inode); +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, + unsigned int seq_id); +bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi); +int f2fs_remove_inode_page(struct inode *inode); +struct page *f2fs_new_inode_page(struct inode *inode); +struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); +void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); +struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct page *f2fs_get_node_page_ra(struct page *parent, int start); +int f2fs_move_node_page(struct page *node_page, int gc_type); +void f2fs_flush_inline_data(struct f2fs_sb_info *sbi); +int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic, + unsigned int *seq_id); +int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, + struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type); +int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); +bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); +void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); +void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); +int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); +int f2fs_recover_inline_xattr(struct inode *inode, struct page *page); +int f2fs_recover_xattr_data(struct inode *inode, struct page *page); +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); +int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, + unsigned int segno, struct f2fs_summary_block *sum); +void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi); +int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_build_node_manager(struct f2fs_sb_info *sbi); +void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); +int __init f2fs_create_node_manager_caches(void); +void f2fs_destroy_node_manager_caches(void); + +/* + * segment.c + */ +bool f2fs_need_SSR(struct f2fs_sb_info *sbi); +int f2fs_commit_atomic_write(struct inode *inode); +void f2fs_abort_atomic_write(struct inode *inode, bool clean); +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg); +int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi); +int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); +void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); +void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); +bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +int f2fs_start_discard_thread(struct f2fs_sb_info *sbi); +void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); +void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); +bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi); +void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, + struct cp_control *cpc); +void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi); +block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi); +int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable); +void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); +int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno); +void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi); +void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi); +void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi); +void f2fs_get_new_segment(struct f2fs_sb_info *sbi, + unsigned int *newseg, bool new_sec, int dir); +void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, + unsigned int start, unsigned int end); +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force); +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); +bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, + struct cp_control *cpc); +struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); +void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src, + block_t blk_addr); +void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type); +void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio); +void f2fs_outplace_write_data(struct dnode_of_data *dn, + struct f2fs_io_info *fio); +int f2fs_inplace_write_data(struct f2fs_io_info *fio); +void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr, + bool recover_curseg, bool recover_newaddr, + bool from_gc); +void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, + block_t old_addr, block_t new_addr, + unsigned char version, bool recover_curseg, + bool recover_newaddr); +void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio); +void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, + block_t blkaddr, unsigned int blkcnt); +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type, bool ordered, bool locked); +void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); +void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, + block_t len); +void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, + unsigned int val, int alloc); +void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi); +int f2fs_check_write_pointer(struct f2fs_sb_info *sbi); +int f2fs_build_segment_manager(struct f2fs_sb_info *sbi); +void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); +int __init f2fs_create_segment_manager_caches(void); +void f2fs_destroy_segment_manager_caches(void); +int f2fs_rw_hint_to_seg_type(enum rw_hint hint); +unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, + unsigned int segno); +unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno); + +#define DEF_FRAGMENT_SIZE 4 +#define MIN_FRAGMENT_SIZE 1 +#define MAX_FRAGMENT_SIZE 512 + +static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG || + F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK; +} + +/* + * checkpoint.c + */ +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason); +void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi); +struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type); +int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync); +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, + unsigned int ra_blocks); +long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, + long nr_to_write, enum iostat_type io_type); +void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all); +bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type); +bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type); +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); +int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi); +void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi); +void f2fs_add_orphan_inode(struct inode *inode); +void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi); +int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); +void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio); +void f2fs_remove_dirty_inode(struct inode *inode); +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, + bool from_cp); +void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type); +u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi); +int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); +int __init f2fs_create_checkpoint_caches(void); +void f2fs_destroy_checkpoint_caches(void); +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi); +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); + +/* + * data.c + */ +int __init f2fs_init_bioset(void); +void f2fs_destroy_bioset(void); +int f2fs_init_bio_entry_cache(void); +void f2fs_destroy_bio_entry_cache(void); +void f2fs_submit_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type); +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi); +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, struct page *page, + nid_t ino, enum page_type type); +void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, + struct bio **bio, struct page *page); +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); +int f2fs_submit_page_bio(struct f2fs_io_info *fio); +int f2fs_merge_page_bio(struct f2fs_io_info *fio); +void f2fs_submit_page_write(struct f2fs_io_info *fio); +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, sector_t *sector); +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_set_data_blkaddr(struct dnode_of_data *dn); +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); +int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); +int f2fs_reserve_new_block(struct dnode_of_data *dn); +int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); +struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, + blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs); +struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, + bool for_write); +struct page *f2fs_get_new_data_page(struct inode *inode, + struct page *ipage, pgoff_t index, bool new_i_size); +int f2fs_do_write_data_page(struct f2fs_io_info *fio); +void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock); +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, + int create, int flag); +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); +int f2fs_encrypt_one_page(struct f2fs_io_info *fio); +bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); +bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); +int f2fs_write_single_data_page(struct page *page, int *submitted, + struct bio **bio, sector_t *last_block, + struct writeback_control *wbc, + enum iostat_type io_type, + int compr_blocks, bool allow_balance); +void f2fs_write_failed(struct inode *inode, loff_t to); +void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length); +bool f2fs_release_folio(struct folio *folio, gfp_t wait); +bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); +void f2fs_clear_page_cache_dirty_tag(struct page *page); +int f2fs_init_post_read_processing(void); +void f2fs_destroy_post_read_processing(void); +int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); +void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); +extern const struct iomap_ops f2fs_iomap_ops; + +/* + * gc.c + */ +int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); +void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); +block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control); +void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); +int f2fs_resize_fs(struct file *filp, __u64 block_count); +int __init f2fs_create_garbage_collection_cache(void); +void f2fs_destroy_garbage_collection_cache(void); + +/* + * recovery.c + */ +int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); +bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi); +int __init f2fs_create_recovery_cache(void); +void f2fs_destroy_recovery_cache(void); + +/* + * debug.c + */ +#ifdef CONFIG_F2FS_STAT_FS +struct f2fs_stat_info { + struct list_head stat_list; + struct f2fs_sb_info *sbi; + int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; + int main_area_segs, main_area_sections, main_area_zones; + unsigned long long hit_cached[NR_EXTENT_CACHES]; + unsigned long long hit_rbtree[NR_EXTENT_CACHES]; + unsigned long long total_ext[NR_EXTENT_CACHES]; + unsigned long long hit_total[NR_EXTENT_CACHES]; + int ext_tree[NR_EXTENT_CACHES]; + int zombie_tree[NR_EXTENT_CACHES]; + int ext_node[NR_EXTENT_CACHES]; + /* to count memory footprint */ + unsigned long long ext_mem[NR_EXTENT_CACHES]; + /* for read extent cache */ + unsigned long long hit_largest; + int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; + int ndirty_data, ndirty_qdata; + unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; + int nats, dirty_nats, sits, dirty_sits; + int free_nids, avail_nids, alloc_nids; + int total_count, utilization; + int bg_gc, nr_wb_cp_data, nr_wb_data; + int nr_rd_data, nr_rd_node, nr_rd_meta; + int nr_dio_read, nr_dio_write; + unsigned int io_skip_bggc, other_skip_bggc; + int nr_flushing, nr_flushed, flush_list_empty; + int nr_discarding, nr_discarded; + int nr_discard_cmd; + unsigned int undiscard_blks; + int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt; + unsigned int cur_ckpt_time, peak_ckpt_time; + int inline_xattr, inline_inode, inline_dir, append, update, orphans; + int compr_inode, swapfile_inode; + unsigned long long compr_blocks; + int aw_cnt, max_aw_cnt; + unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; + unsigned int bimodal, avg_vblocks; + int util_free, util_valid, util_invalid; + int rsvd_segs, overp_segs; + int dirty_count, node_pages, meta_pages, compress_pages; + int compress_page_hit; + int prefree_count, call_count, cp_count, bg_cp_count; + int tot_segs, node_segs, data_segs, free_segs, free_secs; + int bg_node_segs, bg_data_segs; + int tot_blks, data_blks, node_blks; + int bg_data_blks, bg_node_blks; + int curseg[NR_CURSEG_TYPE]; + int cursec[NR_CURSEG_TYPE]; + int curzone[NR_CURSEG_TYPE]; + unsigned int dirty_seg[NR_CURSEG_TYPE]; + unsigned int full_seg[NR_CURSEG_TYPE]; + unsigned int valid_blks[NR_CURSEG_TYPE]; + + unsigned int meta_count[META_MAX]; + unsigned int segment_count[2]; + unsigned int block_count[2]; + unsigned int inplace_count; + unsigned long long base_mem, cache_mem, page_mem; +}; + +static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_stat_info *)sbi->stat_info; +} + +#define stat_inc_cp_count(si) ((si)->cp_count++) +#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) +#define stat_inc_call_count(si) ((si)->call_count++) +#define stat_inc_bggc_count(si) ((si)->bg_gc++) +#define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++) +#define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) +#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) +#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) +#define stat_inc_total_hit(sbi, type) (atomic64_inc(&(sbi)->total_hit_ext[type])) +#define stat_inc_rbtree_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_rbtree[type])) +#define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) +#define stat_inc_cached_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_cached[type])) +#define stat_inc_inline_xattr(inode) \ + do { \ + if (f2fs_has_inline_xattr(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->inline_xattr)); \ + } while (0) +#define stat_dec_inline_xattr(inode) \ + do { \ + if (f2fs_has_inline_xattr(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->inline_xattr)); \ + } while (0) +#define stat_inc_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->inline_inode)); \ + } while (0) +#define stat_dec_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->inline_inode)); \ + } while (0) +#define stat_inc_inline_dir(inode) \ + do { \ + if (f2fs_has_inline_dentry(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->inline_dir)); \ + } while (0) +#define stat_dec_inline_dir(inode) \ + do { \ + if (f2fs_has_inline_dentry(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \ + } while (0) +#define stat_inc_compr_inode(inode) \ + do { \ + if (f2fs_compressed_file(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->compr_inode)); \ + } while (0) +#define stat_dec_compr_inode(inode) \ + do { \ + if (f2fs_compressed_file(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->compr_inode)); \ + } while (0) +#define stat_add_compr_blocks(inode, blocks) \ + (atomic64_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) +#define stat_sub_compr_blocks(inode, blocks) \ + (atomic64_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) +#define stat_inc_swapfile_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->swapfile_inode)) +#define stat_dec_swapfile_inode(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->swapfile_inode)) +#define stat_inc_atomic_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->atomic_files)) +#define stat_dec_atomic_inode(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->atomic_files)) +#define stat_inc_meta_count(sbi, blkaddr) \ + do { \ + if (blkaddr < SIT_I(sbi)->sit_base_addr) \ + atomic_inc(&(sbi)->meta_count[META_CP]); \ + else if (blkaddr < NM_I(sbi)->nat_blkaddr) \ + atomic_inc(&(sbi)->meta_count[META_SIT]); \ + else if (blkaddr < SM_I(sbi)->ssa_blkaddr) \ + atomic_inc(&(sbi)->meta_count[META_NAT]); \ + else if (blkaddr < SM_I(sbi)->main_blkaddr) \ + atomic_inc(&(sbi)->meta_count[META_SSA]); \ + } while (0) +#define stat_inc_seg_type(sbi, curseg) \ + ((sbi)->segment_count[(curseg)->alloc_type]++) +#define stat_inc_block_count(sbi, curseg) \ + ((sbi)->block_count[(curseg)->alloc_type]++) +#define stat_inc_inplace_blocks(sbi) \ + (atomic_inc(&(sbi)->inplace_count)) +#define stat_update_max_atomic_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->atomic_files); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ + } while (0) +#define stat_inc_seg_count(sbi, type, gc_type) \ + do { \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ + si->tot_segs++; \ + if ((type) == SUM_TYPE_DATA) { \ + si->data_segs++; \ + si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \ + } else { \ + si->node_segs++; \ + si->bg_node_segs += (gc_type == BG_GC) ? 1 : 0; \ + } \ + } while (0) + +#define stat_inc_tot_blk_count(si, blks) \ + ((si)->tot_blks += (blks)) + +#define stat_inc_data_blk_count(sbi, blks, gc_type) \ + do { \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ + stat_inc_tot_blk_count(si, blks); \ + si->data_blks += (blks); \ + si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ + } while (0) + +#define stat_inc_node_blk_count(sbi, blks, gc_type) \ + do { \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ + stat_inc_tot_blk_count(si, blks); \ + si->node_blks += (blks); \ + si->bg_node_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ + } while (0) + +int f2fs_build_stats(struct f2fs_sb_info *sbi); +void f2fs_destroy_stats(struct f2fs_sb_info *sbi); +void __init f2fs_create_root_stats(void); +void f2fs_destroy_root_stats(void); +void f2fs_update_sit_info(struct f2fs_sb_info *sbi); +#else +#define stat_inc_cp_count(si) do { } while (0) +#define stat_inc_bg_cp_count(si) do { } while (0) +#define stat_inc_call_count(si) do { } while (0) +#define stat_inc_bggc_count(si) do { } while (0) +#define stat_io_skip_bggc_count(sbi) do { } while (0) +#define stat_other_skip_bggc_count(sbi) do { } while (0) +#define stat_inc_dirty_inode(sbi, type) do { } while (0) +#define stat_dec_dirty_inode(sbi, type) do { } while (0) +#define stat_inc_total_hit(sbi, type) do { } while (0) +#define stat_inc_rbtree_node_hit(sbi, type) do { } while (0) +#define stat_inc_largest_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi, type) do { } while (0) +#define stat_inc_inline_xattr(inode) do { } while (0) +#define stat_dec_inline_xattr(inode) do { } while (0) +#define stat_inc_inline_inode(inode) do { } while (0) +#define stat_dec_inline_inode(inode) do { } while (0) +#define stat_inc_inline_dir(inode) do { } while (0) +#define stat_dec_inline_dir(inode) do { } while (0) +#define stat_inc_compr_inode(inode) do { } while (0) +#define stat_dec_compr_inode(inode) do { } while (0) +#define stat_add_compr_blocks(inode, blocks) do { } while (0) +#define stat_sub_compr_blocks(inode, blocks) do { } while (0) +#define stat_inc_swapfile_inode(inode) do { } while (0) +#define stat_dec_swapfile_inode(inode) do { } while (0) +#define stat_inc_atomic_inode(inode) do { } while (0) +#define stat_dec_atomic_inode(inode) do { } while (0) +#define stat_update_max_atomic_write(inode) do { } while (0) +#define stat_inc_meta_count(sbi, blkaddr) do { } while (0) +#define stat_inc_seg_type(sbi, curseg) do { } while (0) +#define stat_inc_block_count(sbi, curseg) do { } while (0) +#define stat_inc_inplace_blocks(sbi) do { } while (0) +#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_tot_blk_count(si, blks) do { } while (0) +#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0) +#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0) + +static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } +static inline void __init f2fs_create_root_stats(void) { } +static inline void f2fs_destroy_root_stats(void) { } +static inline void f2fs_update_sit_info(struct f2fs_sb_info *sbi) {} +#endif + +extern const struct file_operations f2fs_dir_operations; +extern const struct file_operations f2fs_file_operations; +extern const struct inode_operations f2fs_file_inode_operations; +extern const struct address_space_operations f2fs_dblock_aops; +extern const struct address_space_operations f2fs_node_aops; +extern const struct address_space_operations f2fs_meta_aops; +extern const struct inode_operations f2fs_dir_inode_operations; +extern const struct inode_operations f2fs_symlink_inode_operations; +extern const struct inode_operations f2fs_encrypted_symlink_inode_operations; +extern const struct inode_operations f2fs_special_inode_operations; +extern struct kmem_cache *f2fs_inode_entry_slab; + +/* + * inline.c + */ +bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_sanity_check_inline_data(struct inode *inode); +bool f2fs_may_inline_dentry(struct inode *inode); +void f2fs_do_read_inline_data(struct page *page, struct page *ipage); +void f2fs_truncate_inline_inode(struct inode *inode, + struct page *ipage, u64 from); +int f2fs_read_inline_data(struct inode *inode, struct page *page); +int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); +int f2fs_convert_inline_inode(struct inode *inode); +int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry); +int f2fs_write_inline_data(struct inode *inode, struct page *page); +int f2fs_recover_inline_data(struct inode *inode, struct page *npage); +struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, + const struct f2fs_filename *fname, + struct page **res_page); +int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, + struct page *ipage); +int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, + struct page *page, struct inode *dir, + struct inode *inode); +bool f2fs_empty_inline_dir(struct inode *dir); +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, + struct fscrypt_str *fstr); +int f2fs_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); + +/* + * shrinker.c + */ +unsigned long f2fs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); +unsigned long f2fs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); +void f2fs_join_shrinker(struct f2fs_sb_info *sbi); +void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); + +/* + * extent_cache.c + */ +bool sanity_check_extent_cache(struct inode *inode); +struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, + struct rb_entry *cached_re, unsigned int ofs); +struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root_cached *root, + struct rb_node **parent, + unsigned int ofs, bool *leftmost); +struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, + struct rb_entry *cached_re, unsigned int ofs, + struct rb_entry **prev_entry, struct rb_entry **next_entry, + struct rb_node ***insert_p, struct rb_node **insert_parent, + bool force, bool *leftmost); +bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root_cached *root); +void f2fs_init_extent_tree(struct inode *inode); +void f2fs_drop_extent_tree(struct inode *inode); +void f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_tree(struct inode *inode); +void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi); +int __init f2fs_create_extent_cache(void); +void f2fs_destroy_extent_cache(void); + +/* read extent cache ops */ +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage); +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_read_extent_cache(struct dnode_of_data *dn); +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len); +unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, + int nr_shrink); + +/* + * sysfs.c + */ +#define MIN_RA_MUL 2 +#define MAX_RA_MUL 256 + +int __init f2fs_init_sysfs(void); +void f2fs_exit_sysfs(void); +int f2fs_register_sysfs(struct f2fs_sb_info *sbi); +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); + +/* verity.c */ +extern const struct fsverity_operations f2fs_verityops; + +/* + * crypto support + */ +static inline bool f2fs_encrypted_file(struct inode *inode) +{ + return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); +} + +static inline void f2fs_set_encrypted_inode(struct inode *inode) +{ +#ifdef CONFIG_FS_ENCRYPTION + file_set_encrypt(inode); + f2fs_set_inode_flags(inode); +#endif +} + +/* + * Returns true if the reads of the inode's data need to undergo some + * postprocessing step, like decryption or authenticity verification. + */ +static inline bool f2fs_post_read_required(struct inode *inode) +{ + return f2fs_encrypted_file(inode) || fsverity_active(inode) || + f2fs_compressed_file(inode); +} + +/* + * compress.c + */ +#ifdef CONFIG_F2FS_FS_COMPRESSION +bool f2fs_is_compressed_page(struct page *page); +struct page *f2fs_compress_control_page(struct page *page); +int f2fs_prepare_compress_overwrite(struct inode *inode, + struct page **pagep, pgoff_t index, void **fsdata); +bool f2fs_compress_write_end(struct inode *inode, void *fsdata, + pgoff_t index, unsigned copied); +int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock); +void f2fs_compress_write_end_io(struct bio *bio, struct page *page); +bool f2fs_is_compress_backend_ready(struct inode *inode); +int f2fs_init_compress_mempool(void); +void f2fs_destroy_compress_mempool(void); +void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task); +void f2fs_end_read_compressed_page(struct page *page, bool failed, + block_t blkaddr, bool in_task); +bool f2fs_cluster_is_empty(struct compress_ctx *cc); +bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); +bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages, + int index, int nr_pages, bool uptodate); +bool f2fs_sanity_check_cluster(struct dnode_of_data *dn); +void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); +int f2fs_write_multi_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type); +int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len); +int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, + unsigned nr_pages, sector_t *last_block_in_bio, + bool is_readahead, bool for_write); +struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task); +void f2fs_put_page_dic(struct page *page, bool in_task); +unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn); +int f2fs_init_compress_ctx(struct compress_ctx *cc); +void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); +void f2fs_init_compress_info(struct f2fs_sb_info *sbi); +int f2fs_init_compress_inode(struct f2fs_sb_info *sbi); +void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi); +int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); +void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); +int __init f2fs_init_compress_cache(void); +void f2fs_destroy_compress_cache(void); +struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi); +void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + nid_t ino, block_t blkaddr); +bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + block_t blkaddr); +void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino); +#define inc_compr_inode_stat(inode) \ + do { \ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ + sbi->compr_new_inode++; \ + } while (0) +#define add_compr_block_stat(inode, blocks) \ + do { \ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ + int diff = F2FS_I(inode)->i_cluster_size - blocks; \ + sbi->compr_written_block += blocks; \ + sbi->compr_saved_block += diff; \ + } while (0) +#else +static inline bool f2fs_is_compressed_page(struct page *page) { return false; } +static inline bool f2fs_is_compress_backend_ready(struct inode *inode) +{ + if (!f2fs_compressed_file(inode)) + return true; + /* not support compression */ + return false; +} +static inline struct page *f2fs_compress_control_page(struct page *page) +{ + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +} +static inline int f2fs_init_compress_mempool(void) { return 0; } +static inline void f2fs_destroy_compress_mempool(void) { } +static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic, + bool in_task) { } +static inline void f2fs_end_read_compressed_page(struct page *page, + bool failed, block_t blkaddr, bool in_task) +{ + WARN_ON_ONCE(1); +} +static inline void f2fs_put_page_dic(struct page *page, bool in_task) +{ + WARN_ON_ONCE(1); +} +static inline unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) { return 0; } +static inline bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { return false; } +static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { } +static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } +static inline int __init f2fs_init_compress_cache(void) { return 0; } +static inline void f2fs_destroy_compress_cache(void) { } +static inline void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, + block_t blkaddr) { } +static inline void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, + struct page *page, nid_t ino, block_t blkaddr) { } +static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, + struct page *page, block_t blkaddr) { return false; } +static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, + nid_t ino) { } +#define inc_compr_inode_stat(inode) do { } while (0) +static inline void f2fs_update_read_extent_tree_range_compressed( + struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len) { } +#endif + +static inline int set_compress_context(struct inode *inode) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + F2FS_I(inode)->i_compress_algorithm = + F2FS_OPTION(sbi).compress_algorithm; + F2FS_I(inode)->i_log_cluster_size = + F2FS_OPTION(sbi).compress_log_size; + F2FS_I(inode)->i_compress_flag = + F2FS_OPTION(sbi).compress_chksum ? + BIT(COMPRESS_CHKSUM) : 0; + F2FS_I(inode)->i_cluster_size = + BIT(F2FS_I(inode)->i_log_cluster_size); + if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 || + F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) && + F2FS_OPTION(sbi).compress_level) + F2FS_I(inode)->i_compress_level = + F2FS_OPTION(sbi).compress_level; + F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; + set_inode_flag(inode, FI_COMPRESSED_FILE); + stat_inc_compr_inode(inode); + inc_compr_inode_stat(inode); + f2fs_mark_inode_dirty_sync(inode, true); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static inline bool f2fs_disable_compressed_file(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + if (!f2fs_compressed_file(inode)) + return true; + if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) + return false; + + fi->i_flags &= ~F2FS_COMPR_FL; + stat_dec_compr_inode(inode); + clear_inode_flag(inode, FI_COMPRESSED_FILE); + f2fs_mark_inode_dirty_sync(inode, true); + return true; +} + +#define F2FS_FEATURE_FUNCS(name, flagname) \ +static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ +{ \ + return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \ +} + +F2FS_FEATURE_FUNCS(encrypt, ENCRYPT); +F2FS_FEATURE_FUNCS(blkzoned, BLKZONED); +F2FS_FEATURE_FUNCS(extra_attr, EXTRA_ATTR); +F2FS_FEATURE_FUNCS(project_quota, PRJQUOTA); +F2FS_FEATURE_FUNCS(inode_chksum, INODE_CHKSUM); +F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); +F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO); +F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME); +F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); +F2FS_FEATURE_FUNCS(verity, VERITY); +F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); +F2FS_FEATURE_FUNCS(casefold, CASEFOLD); +F2FS_FEATURE_FUNCS(compression, COMPRESSION); +F2FS_FEATURE_FUNCS(readonly, RO); + +#ifdef CONFIG_BLK_DEV_ZONED +static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, + block_t blkaddr) +{ + unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + + return test_bit(zno, FDEV(devi).blkz_seq); +} +#endif + +static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) +{ + return f2fs_sb_has_blkzoned(sbi); +} + +static inline bool f2fs_bdev_support_discard(struct block_device *bdev) +{ + return bdev_max_discard_sectors(bdev) || bdev_is_zoned(bdev); +} + +static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi) +{ + int i; + + if (!f2fs_is_multi_device(sbi)) + return f2fs_bdev_support_discard(sbi->sb->s_bdev); + + for (i = 0; i < sbi->s_ndevs; i++) + if (f2fs_bdev_support_discard(FDEV(i).bdev)) + return true; + return false; +} + +static inline bool f2fs_realtime_discard_enable(struct f2fs_sb_info *sbi) +{ + return (test_opt(sbi, DISCARD) && f2fs_hw_support_discard(sbi)) || + f2fs_hw_should_discard(sbi); +} + +static inline bool f2fs_hw_is_readonly(struct f2fs_sb_info *sbi) +{ + int i; + + if (!f2fs_is_multi_device(sbi)) + return bdev_read_only(sbi->sb->s_bdev); + + for (i = 0; i < sbi->s_ndevs; i++) + if (bdev_read_only(FDEV(i).bdev)) + return true; + return false; +} + +static inline bool f2fs_dev_is_readonly(struct f2fs_sb_info *sbi) +{ + return f2fs_sb_has_readonly(sbi) || f2fs_hw_is_readonly(sbi); +} + +static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS; +} + +static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW; +} + +static inline bool f2fs_may_compress(struct inode *inode) +{ + if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || + f2fs_is_atomic_file(inode) || f2fs_has_inline_data(inode) || + f2fs_is_mmap_file(inode)) + return false; + return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); +} + +static inline void f2fs_i_compr_blocks_update(struct inode *inode, + u64 blocks, bool add) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + int diff = fi->i_cluster_size - blocks; + + /* don't update i_compr_blocks if saved blocks were released */ + if (!add && !atomic_read(&fi->i_compr_blocks)) + return; + + if (add) { + atomic_add(diff, &fi->i_compr_blocks); + stat_add_compr_blocks(inode, diff); + } else { + atomic_sub(diff, &fi->i_compr_blocks); + stat_sub_compr_blocks(inode, diff); + } + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi, + int flag) +{ + if (!f2fs_is_multi_device(sbi)) + return false; + if (flag != F2FS_GET_BLOCK_DIO) + return false; + return sbi->aligned_blksize; +} + +static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) +{ + return fsverity_active(inode) && + idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + +#ifdef CONFIG_F2FS_FAULT_INJECTION +extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, + unsigned int type); +#else +#define f2fs_build_fault_attr(sbi, rate, type) do { } while (0) +#endif + +static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) +{ +#ifdef CONFIG_QUOTA + if (f2fs_sb_has_quota_ino(sbi)) + return true; + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) + return true; +#endif + return false; +} + +static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK; +} + +static inline void f2fs_io_schedule_timeout(long timeout) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(timeout); +} + +static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs, + enum page_type type) +{ + if (unlikely(f2fs_cp_error(sbi))) + return; + + if (ofs == sbi->page_eio_ofs[type]) { + if (sbi->page_eio_cnt[type]++ == MAX_RETRY_PAGE_EIO) + set_ckpt_flags(sbi, CP_ERROR_FLAG); + } else { + sbi->page_eio_ofs[type] = ofs; + sbi->page_eio_cnt[type] = 0; + } +} + +#define EFSBADCRC EBADMSG /* Bad CRC detected */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + +#endif /* _LINUX_F2FS_H */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c new file mode 100644 index 000000000..fd22854db --- /dev/null +++ b/fs/f2fs/file.c @@ -0,0 +1,4935 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/file.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/stat.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/falloc.h> +#include <linux/types.h> +#include <linux/compat.h> +#include <linux/uaccess.h> +#include <linux/mount.h> +#include <linux/pagevec.h> +#include <linux/uio.h> +#include <linux/uuid.h> +#include <linux/file.h> +#include <linux/nls.h> +#include <linux/sched/signal.h> +#include <linux/fileattr.h> +#include <linux/fadvise.h> +#include <linux/iomap.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "xattr.h" +#include "acl.h" +#include "gc.h" +#include "iostat.h" +#include <trace/events/f2fs.h> +#include <uapi/linux/f2fs.h> + +static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + vm_fault_t ret; + + ret = filemap_fault(vmf); + if (ret & VM_FAULT_LOCKED) + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_MAPPED_READ_IO, F2FS_BLKSIZE); + + trace_f2fs_filemap_fault(inode, vmf->pgoff, (unsigned long)ret); + + return ret; +} + +static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vmf->vma->vm_file); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + bool need_alloc = true; + int err = 0; + + if (unlikely(IS_IMMUTABLE(inode))) + return VM_FAULT_SIGBUS; + + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + return VM_FAULT_SIGBUS; + + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto err; + } + + if (!f2fs_is_checkpoint_ready(sbi)) { + err = -ENOSPC; + goto err; + } + + err = f2fs_convert_inline_inode(inode); + if (err) + goto err; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + int ret = f2fs_is_compressed_cluster(inode, page->index); + + if (ret < 0) { + err = ret; + goto err; + } else if (ret) { + need_alloc = false; + } + } +#endif + /* should do out of any locked page */ + if (need_alloc) + f2fs_balance_fs(sbi, true); + + sb_start_pagefault(inode->i_sb); + + f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); + + file_update_time(vmf->vma->vm_file); + filemap_invalidate_lock_shared(inode->i_mapping); + lock_page(page); + if (unlikely(page->mapping != inode->i_mapping || + page_offset(page) > i_size_read(inode) || + !PageUptodate(page))) { + unlock_page(page); + err = -EFAULT; + goto out_sem; + } + + if (need_alloc) { + /* block allocation */ + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_block(&dn, page->index); + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!need_alloc) { + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + f2fs_put_dnode(&dn); + } +#endif + if (err) { + unlock_page(page); + goto out_sem; + } + + f2fs_wait_on_page_writeback(page, DATA, false, true); + + /* wait for GCed page writeback via META_MAPPING */ + f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); + + /* + * check to see if the page is mapped already (no holes) + */ + if (PageMappedToDisk(page)) + goto out_sem; + + /* page is wholly or partially inside EOF */ + if (((loff_t)(page->index + 1) << PAGE_SHIFT) > + i_size_read(inode)) { + loff_t offset; + + offset = i_size_read(inode) & ~PAGE_MASK; + zero_user_segment(page, offset, PAGE_SIZE); + } + set_page_dirty(page); + if (!PageUptodate(page)) + SetPageUptodate(page); + + f2fs_update_iostat(sbi, inode, APP_MAPPED_IO, F2FS_BLKSIZE); + f2fs_update_time(sbi, REQ_TIME); + + trace_f2fs_vm_page_mkwrite(page, DATA); +out_sem: + filemap_invalidate_unlock_shared(inode->i_mapping); + + sb_end_pagefault(inode->i_sb); +err: + return block_page_mkwrite_return(err); +} + +static const struct vm_operations_struct f2fs_file_vm_ops = { + .fault = f2fs_filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = f2fs_vm_page_mkwrite, +}; + +static int get_parent_ino(struct inode *inode, nid_t *pino) +{ + struct dentry *dentry; + + /* + * Make sure to get the non-deleted alias. The alias associated with + * the open file descriptor being fsync()'ed may be deleted already. + */ + dentry = d_find_alias(inode); + if (!dentry) + return 0; + + *pino = parent_ino(dentry); + dput(dentry); + return 1; +} + +static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + enum cp_reason_type cp_reason = CP_NO_NEEDED; + + if (!S_ISREG(inode->i_mode)) + cp_reason = CP_NON_REGULAR; + else if (f2fs_compressed_file(inode)) + cp_reason = CP_COMPRESSED; + else if (inode->i_nlink != 1) + cp_reason = CP_HARDLINK; + else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) + cp_reason = CP_SB_NEED_CP; + else if (file_wrong_pino(inode)) + cp_reason = CP_WRONG_PINO; + else if (!f2fs_space_for_roll_forward(sbi)) + cp_reason = CP_NO_SPC_ROLL; + else if (!f2fs_is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) + cp_reason = CP_NODE_NEED_CP; + else if (test_opt(sbi, FASTBOOT)) + cp_reason = CP_FASTBOOT_MODE; + else if (F2FS_OPTION(sbi).active_logs == 2) + cp_reason = CP_SPEC_LOG_NUM; + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT && + f2fs_need_dentry_mark(sbi, inode->i_ino) && + f2fs_exist_written_data(sbi, F2FS_I(inode)->i_pino, + TRANS_DIR_INO)) + cp_reason = CP_RECOVER_DIR; + + return cp_reason; +} + +static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct page *i = find_get_page(NODE_MAPPING(sbi), ino); + bool ret = false; + /* But we need to avoid that there are some inode updates */ + if ((i && PageDirty(i)) || f2fs_need_inode_block_update(sbi, ino)) + ret = true; + f2fs_put_page(i, 0); + return ret; +} + +static void try_to_fix_pino(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + nid_t pino; + + f2fs_down_write(&fi->i_sem); + if (file_wrong_pino(inode) && inode->i_nlink == 1 && + get_parent_ino(inode, &pino)) { + f2fs_i_pino_write(inode, pino); + file_got_pino(inode); + } + f2fs_up_write(&fi->i_sem); +} + +static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, + int datasync, bool atomic) +{ + struct inode *inode = file->f_mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t ino = inode->i_ino; + int ret = 0; + enum cp_reason_type cp_reason = 0; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .for_reclaim = 0, + }; + unsigned int seq_id = 0; + + if (unlikely(f2fs_readonly(inode->i_sb))) + return 0; + + trace_f2fs_sync_file_enter(inode); + + if (S_ISDIR(inode->i_mode)) + goto go_write; + + /* if fdatasync is triggered, let's do in-place-update */ + if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) + set_inode_flag(inode, FI_NEED_IPU); + ret = file_write_and_wait_range(file, start, end); + clear_inode_flag(inode, FI_NEED_IPU); + + if (ret || is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { + trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); + return ret; + } + + /* if the inode is dirty, let's recover all the time */ + if (!f2fs_skip_inode_update(inode, datasync)) { + f2fs_write_inode(inode, NULL); + goto go_write; + } + + /* + * if there is no written data, don't waste time to write recovery info. + */ + if (!is_inode_flag_set(inode, FI_APPEND_WRITE) && + !f2fs_exist_written_data(sbi, ino, APPEND_INO)) { + + /* it may call write_inode just prior to fsync */ + if (need_inode_page_update(sbi, ino)) + goto go_write; + + if (is_inode_flag_set(inode, FI_UPDATE_WRITE) || + f2fs_exist_written_data(sbi, ino, UPDATE_INO)) + goto flush_out; + goto out; + } else { + /* + * for OPU case, during fsync(), node can be persisted before + * data when lower device doesn't support write barrier, result + * in data corruption after SPO. + * So for strict fsync mode, force to use atomic write sematics + * to keep write order in between data/node and last node to + * avoid potential data corruption. + */ + if (F2FS_OPTION(sbi).fsync_mode == + FSYNC_MODE_STRICT && !atomic) + atomic = true; + } +go_write: + /* + * Both of fdatasync() and fsync() are able to be recovered from + * sudden-power-off. + */ + f2fs_down_read(&F2FS_I(inode)->i_sem); + cp_reason = need_do_checkpoint(inode); + f2fs_up_read(&F2FS_I(inode)->i_sem); + + if (cp_reason) { + /* all the dirty node pages should be flushed for POR */ + ret = f2fs_sync_fs(inode->i_sb, 1); + + /* + * We've secured consistency through sync_fs. Following pino + * will be used only for fsynced inodes after checkpoint. + */ + try_to_fix_pino(inode); + clear_inode_flag(inode, FI_APPEND_WRITE); + clear_inode_flag(inode, FI_UPDATE_WRITE); + goto out; + } +sync_nodes: + atomic_inc(&sbi->wb_sync_req[NODE]); + ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic, &seq_id); + atomic_dec(&sbi->wb_sync_req[NODE]); + if (ret) + goto out; + + /* if cp_error was enabled, we should avoid infinite loop */ + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; + goto out; + } + + if (f2fs_need_inode_block_update(sbi, ino)) { + f2fs_mark_inode_dirty_sync(inode, true); + f2fs_write_inode(inode, NULL); + goto sync_nodes; + } + + /* + * If it's atomic_write, it's just fine to keep write ordering. So + * here we don't need to wait for node write completion, since we use + * node chain which serializes node blocks. If one of node writes are + * reordered, we can see simply broken chain, resulting in stopping + * roll-forward recovery. It means we'll recover all or none node blocks + * given fsync mark. + */ + if (!atomic) { + ret = f2fs_wait_on_node_pages_writeback(sbi, seq_id); + if (ret) + goto out; + } + + /* once recovery info is written, don't need to tack this */ + f2fs_remove_ino_entry(sbi, ino, APPEND_INO); + clear_inode_flag(inode, FI_APPEND_WRITE); +flush_out: + if ((!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) || + (atomic && !test_opt(sbi, NOBARRIER) && f2fs_sb_has_blkzoned(sbi))) + ret = f2fs_issue_flush(sbi, inode->i_ino); + if (!ret) { + f2fs_remove_ino_entry(sbi, ino, UPDATE_INO); + clear_inode_flag(inode, FI_UPDATE_WRITE); + f2fs_remove_ino_entry(sbi, ino, FLUSH_INO); + } + f2fs_update_time(sbi, REQ_TIME); +out: + trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); + return ret; +} + +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file))))) + return -EIO; + return f2fs_do_sync_file(file, start, end, datasync, false); +} + +static bool __found_offset(struct address_space *mapping, block_t blkaddr, + pgoff_t index, int whence) +{ + switch (whence) { + case SEEK_DATA: + if (__is_valid_data_blkaddr(blkaddr)) + return true; + if (blkaddr == NEW_ADDR && + xa_get_mark(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY)) + return true; + break; + case SEEK_HOLE: + if (blkaddr == NULL_ADDR) + return true; + break; + } + return false; +} + +static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes = inode->i_sb->s_maxbytes; + struct dnode_of_data dn; + pgoff_t pgofs, end_offset; + loff_t data_ofs = offset; + loff_t isize; + int err = 0; + + inode_lock(inode); + + isize = i_size_read(inode); + if (offset >= isize) + goto fail; + + /* handle inline data case */ + if (f2fs_has_inline_data(inode)) { + if (whence == SEEK_HOLE) { + data_ofs = isize; + goto found; + } else if (whence == SEEK_DATA) { + data_ofs = offset; + goto found; + } + } + + pgofs = (pgoff_t)(offset >> PAGE_SHIFT); + + for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) { + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); + if (err && err != -ENOENT) { + goto fail; + } else if (err == -ENOENT) { + /* direct node does not exists */ + if (whence == SEEK_DATA) { + pgofs = f2fs_get_next_page_offset(&dn, pgofs); + continue; + } else { + goto found; + } + } + + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + + /* find data/hole in dnode block */ + for (; dn.ofs_in_node < end_offset; + dn.ofs_in_node++, pgofs++, + data_ofs = (loff_t)pgofs << PAGE_SHIFT) { + block_t blkaddr; + + blkaddr = f2fs_data_blkaddr(&dn); + + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(F2FS_I_SB(inode), + blkaddr, DATA_GENERIC_ENHANCE)) { + f2fs_put_dnode(&dn); + goto fail; + } + + if (__found_offset(file->f_mapping, blkaddr, + pgofs, whence)) { + f2fs_put_dnode(&dn); + goto found; + } + } + f2fs_put_dnode(&dn); + } + + if (whence == SEEK_DATA) + goto fail; +found: + if (whence == SEEK_HOLE && data_ofs > isize) + data_ofs = isize; + inode_unlock(inode); + return vfs_setpos(file, data_ofs, maxbytes); +fail: + inode_unlock(inode); + return -ENXIO; +} + +static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes = inode->i_sb->s_maxbytes; + + if (f2fs_compressed_file(inode)) + maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS; + + switch (whence) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + return generic_file_llseek_size(file, offset, whence, + maxbytes, i_size_read(inode)); + case SEEK_DATA: + case SEEK_HOLE: + if (offset < 0) + return -ENXIO; + return f2fs_seek_block(file, offset, whence); + } + + return -EINVAL; +} + +static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(file); + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + + file_accessed(file); + vma->vm_ops = &f2fs_file_vm_ops; + + f2fs_down_read(&F2FS_I(inode)->i_sem); + set_inode_flag(inode, FI_MMAP_FILE); + f2fs_up_read(&F2FS_I(inode)->i_sem); + + return 0; +} + +static int f2fs_file_open(struct inode *inode, struct file *filp) +{ + int err = fscrypt_file_open(inode, filp); + + if (err) + return err; + + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + + err = fsverity_file_open(inode, filp); + if (err) + return err; + + filp->f_mode |= FMODE_NOWAIT; + + return dquot_file_open(inode, filp); +} + +void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct f2fs_node *raw_node; + int nr_free = 0, ofs = dn->ofs_in_node, len = count; + __le32 *addr; + int base = 0; + bool compressed_cluster = false; + int cluster_index = 0, valid_blocks = 0; + int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + bool released = !atomic_read(&F2FS_I(dn->inode)->i_compr_blocks); + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); + + raw_node = F2FS_NODE(dn->node_page); + addr = blkaddr_in_node(raw_node) + base + ofs; + + /* Assumption: truncateion starts with cluster */ + for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) { + block_t blkaddr = le32_to_cpu(*addr); + + if (f2fs_compressed_file(dn->inode) && + !(cluster_index & (cluster_size - 1))) { + if (compressed_cluster) + f2fs_i_compr_blocks_update(dn->inode, + valid_blocks, false); + compressed_cluster = (blkaddr == COMPRESS_ADDR); + valid_blocks = 0; + } + + if (blkaddr == NULL_ADDR) + continue; + + dn->data_blkaddr = NULL_ADDR; + f2fs_set_data_blkaddr(dn); + + if (__is_valid_data_blkaddr(blkaddr)) { + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE)) + continue; + if (compressed_cluster) + valid_blocks++; + } + + if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) + clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); + + f2fs_invalidate_blocks(sbi, blkaddr); + + if (!released || blkaddr != COMPRESS_ADDR) + nr_free++; + } + + if (compressed_cluster) + f2fs_i_compr_blocks_update(dn->inode, valid_blocks, false); + + if (nr_free) { + pgoff_t fofs; + /* + * once we invalidate valid blkaddr in range [ofs, ofs + count], + * we will invalidate all blkaddr in the whole range. + */ + fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), + dn->inode) + ofs; + f2fs_update_read_extent_cache_range(dn, fofs, 0, len); + dec_valid_block_count(sbi, dn->inode, nr_free); + } + dn->ofs_in_node = ofs; + + f2fs_update_time(sbi, REQ_TIME); + trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, + dn->ofs_in_node, nr_free); +} + +void f2fs_truncate_data_blocks(struct dnode_of_data *dn) +{ + f2fs_truncate_data_blocks_range(dn, ADDRS_PER_BLOCK(dn->inode)); +} + +static int truncate_partial_data_page(struct inode *inode, u64 from, + bool cache_only) +{ + loff_t offset = from & (PAGE_SIZE - 1); + pgoff_t index = from >> PAGE_SHIFT; + struct address_space *mapping = inode->i_mapping; + struct page *page; + + if (!offset && !cache_only) + return 0; + + if (cache_only) { + page = find_lock_page(mapping, index); + if (page && PageUptodate(page)) + goto truncate_out; + f2fs_put_page(page, 1); + return 0; + } + + page = f2fs_get_lock_data_page(inode, index, true); + if (IS_ERR(page)) + return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); +truncate_out: + f2fs_wait_on_page_writeback(page, DATA, true, true); + zero_user(page, offset, PAGE_SIZE - offset); + + /* An encrypted inode should have a key and truncate the last page. */ + f2fs_bug_on(F2FS_I_SB(inode), cache_only && IS_ENCRYPTED(inode)); + if (!cache_only) + set_page_dirty(page); + f2fs_put_page(page, 1); + return 0; +} + +int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + pgoff_t free_from; + int count = 0, err = 0; + struct page *ipage; + bool truncate_page = false; + + trace_f2fs_truncate_blocks_enter(inode, from); + + free_from = (pgoff_t)F2FS_BLK_ALIGN(from); + + if (free_from >= max_file_blocks(inode)) + goto free_partial; + + if (lock) + f2fs_lock_op(sbi); + + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out; + } + + if (f2fs_has_inline_data(inode)) { + f2fs_truncate_inline_inode(inode, ipage, from); + f2fs_put_page(ipage, 1); + truncate_page = true; + goto out; + } + + set_new_dnode(&dn, inode, ipage, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); + if (err) { + if (err == -ENOENT) + goto free_next; + goto out; + } + + count = ADDRS_PER_PAGE(dn.node_page, inode); + + count -= dn.ofs_in_node; + f2fs_bug_on(sbi, count < 0); + + if (dn.ofs_in_node || IS_INODE(dn.node_page)) { + f2fs_truncate_data_blocks_range(&dn, count); + free_from += count; + } + + f2fs_put_dnode(&dn); +free_next: + err = f2fs_truncate_inode_blocks(inode, free_from); +out: + if (lock) + f2fs_unlock_op(sbi); +free_partial: + /* lastly zero out the first data page */ + if (!err) + err = truncate_partial_data_page(inode, from, truncate_page); + + trace_f2fs_truncate_blocks_exit(inode, err); + return err; +} + +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) +{ + u64 free_from = from; + int err; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* + * for compressed file, only support cluster size + * aligned truncation. + */ + if (f2fs_compressed_file(inode)) + free_from = round_up(from, + F2FS_I(inode)->i_cluster_size << PAGE_SHIFT); +#endif + + err = f2fs_do_truncate_blocks(inode, free_from, lock); + if (err) + return err; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* + * For compressed file, after release compress blocks, don't allow write + * direct, but we should allow write direct after truncate to zero. + */ + if (f2fs_compressed_file(inode) && !free_from + && is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + clear_inode_flag(inode, FI_COMPRESS_RELEASED); + + if (from != free_from) { + err = f2fs_truncate_partial_cluster(inode, from, lock); + if (err) + return err; + } +#endif + + return 0; +} + +int f2fs_truncate(struct inode *inode) +{ + int err; + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return 0; + + trace_f2fs_truncate(inode); + + if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { + f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_TRUNCATE); + return -EIO; + } + + err = f2fs_dquot_initialize(inode); + if (err) + return err; + + /* we should check inline_data size */ + if (!f2fs_may_inline_data(inode)) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } + + err = f2fs_truncate_blocks(inode, i_size_read(inode), true); + if (err) + return err; + + inode->i_mtime = inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, false); + return 0; +} + +static bool f2fs_force_buffered_io(struct inode *inode, int rw) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!fscrypt_dio_supported(inode)) + return true; + if (fsverity_active(inode)) + return true; + if (f2fs_compressed_file(inode)) + return true; + + /* disallow direct IO if any of devices has unaligned blksize */ + if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) + return true; + /* + * for blkzoned device, fallback direct IO to buffered IO, so + * all IOs can be serialized by log-structured write. + */ + if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE)) + return true; + if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi)) + return true; + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + return true; + + return false; +} + +int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_inode *ri = NULL; + unsigned int flags; + + if (f2fs_has_extra_attr(inode) && + f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) { + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = fi->i_crtime.tv_sec; + stat->btime.tv_nsec = fi->i_crtime.tv_nsec; + } + + /* + * Return the DIO alignment restrictions if requested. We only return + * this information when requested, since on encrypted files it might + * take a fair bit of work to get if the file wasn't opened recently. + * + * f2fs sometimes supports DIO reads but not DIO writes. STATX_DIOALIGN + * cannot represent that, so in that case we report no DIO support. + */ + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { + unsigned int bsize = i_blocksize(inode); + + stat->result_mask |= STATX_DIOALIGN; + if (!f2fs_force_buffered_io(inode, WRITE)) { + stat->dio_mem_align = bsize; + stat->dio_offset_align = bsize; + } + } + + flags = fi->i_flags; + if (flags & F2FS_COMPR_FL) + stat->attributes |= STATX_ATTR_COMPRESSED; + if (flags & F2FS_APPEND_FL) + stat->attributes |= STATX_ATTR_APPEND; + if (IS_ENCRYPTED(inode)) + stat->attributes |= STATX_ATTR_ENCRYPTED; + if (flags & F2FS_IMMUTABLE_FL) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (flags & F2FS_NODUMP_FL) + stat->attributes |= STATX_ATTR_NODUMP; + if (IS_VERITY(inode)) + stat->attributes |= STATX_ATTR_VERITY; + + stat->attributes_mask |= (STATX_ATTR_COMPRESSED | + STATX_ATTR_APPEND | + STATX_ATTR_ENCRYPTED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP | + STATX_ATTR_VERITY); + + generic_fillattr(mnt_userns, inode, stat); + + /* we need to show initial sectors used for inline_data/dentries */ + if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || + f2fs_has_inline_dentry(inode)) + stat->blocks += (stat->size + 511) >> 9; + + return 0; +} + +#ifdef CONFIG_F2FS_FS_POSIX_ACL +static void __setattr_copy(struct user_namespace *mnt_userns, + struct inode *inode, const struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); + if (ia_valid & ATTR_ATIME) + inode->i_atime = attr->ia_atime; + if (ia_valid & ATTR_MTIME) + inode->i_mtime = attr->ia_mtime; + if (ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; + if (ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + + if (!vfsgid_in_group_p(vfsgid) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + mode &= ~S_ISGID; + set_acl_inode(inode, mode); + } +} +#else +#define __setattr_copy setattr_copy +#endif + +int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + int err; + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + if (unlikely(IS_APPEND(inode) && + (attr->ia_valid & (ATTR_MODE | ATTR_UID | + ATTR_GID | ATTR_TIMES_SET)))) + return -EPERM; + + if ((attr->ia_valid & ATTR_SIZE) && + !f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + + err = setattr_prepare(mnt_userns, dentry, attr); + if (err) + return err; + + err = fscrypt_prepare_setattr(dentry, attr); + if (err) + return err; + + err = fsverity_prepare_setattr(dentry, attr); + if (err) + return err; + + if (is_quota_modification(mnt_userns, inode, attr)) { + err = f2fs_dquot_initialize(inode); + if (err) + return err; + } + if (i_uid_needs_update(mnt_userns, attr, inode) || + i_gid_needs_update(mnt_userns, attr, inode)) { + f2fs_lock_op(F2FS_I_SB(inode)); + err = dquot_transfer(mnt_userns, inode, attr); + if (err) { + set_sbi_flag(F2FS_I_SB(inode), + SBI_QUOTA_NEED_REPAIR); + f2fs_unlock_op(F2FS_I_SB(inode)); + return err; + } + /* + * update uid/gid under lock_op(), so that dquot and inode can + * be updated atomically. + */ + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); + f2fs_mark_inode_dirty_sync(inode, true); + f2fs_unlock_op(F2FS_I_SB(inode)); + } + + if (attr->ia_valid & ATTR_SIZE) { + loff_t old_size = i_size_read(inode); + + if (attr->ia_size > MAX_INLINE_DATA(inode)) { + /* + * should convert inline inode before i_size_write to + * keep smaller than inline_data size with inline flag. + */ + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } + + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + + truncate_setsize(inode, attr->ia_size); + + if (attr->ia_size <= old_size) + err = f2fs_truncate(inode); + /* + * do not trim all blocks after i_size if target size is + * larger than i_size. + */ + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (err) + return err; + + spin_lock(&F2FS_I(inode)->i_size_lock); + inode->i_mtime = inode->i_ctime = current_time(inode); + F2FS_I(inode)->last_disk_size = i_size_read(inode); + spin_unlock(&F2FS_I(inode)->i_size_lock); + } + + __setattr_copy(mnt_userns, inode, attr); + + if (attr->ia_valid & ATTR_MODE) { + err = posix_acl_chmod(mnt_userns, inode, f2fs_get_inode_mode(inode)); + + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + if (!err) + inode->i_mode = F2FS_I(inode)->i_acl_mode; + clear_inode_flag(inode, FI_ACL_MODE); + } + } + + /* file size may changed here */ + f2fs_mark_inode_dirty_sync(inode, true); + + /* inode change will produce dirty node pages flushed by checkpoint */ + f2fs_balance_fs(F2FS_I_SB(inode), true); + + return err; +} + +const struct inode_operations f2fs_file_inode_operations = { + .getattr = f2fs_getattr, + .setattr = f2fs_setattr, + .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, + .listxattr = f2fs_listxattr, + .fiemap = f2fs_fiemap, + .fileattr_get = f2fs_fileattr_get, + .fileattr_set = f2fs_fileattr_set, +}; + +static int fill_zero(struct inode *inode, pgoff_t index, + loff_t start, loff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct page *page; + + if (!len) + return 0; + + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + page = f2fs_get_new_data_page(inode, NULL, index, false); + f2fs_unlock_op(sbi); + + if (IS_ERR(page)) + return PTR_ERR(page); + + f2fs_wait_on_page_writeback(page, DATA, true, true); + zero_user(page, start, len); + set_page_dirty(page); + f2fs_put_page(page, 1); + return 0; +} + +int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) +{ + int err; + + while (pg_start < pg_end) { + struct dnode_of_data dn; + pgoff_t end_offset, count; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); + if (err) { + if (err == -ENOENT) { + pg_start = f2fs_get_next_page_offset(&dn, + pg_start); + continue; + } + return err; + } + + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); + + f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); + + f2fs_truncate_data_blocks_range(&dn, count); + f2fs_put_dnode(&dn); + + pg_start += count; + } + return 0; +} + +static int punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ + pgoff_t pg_start, pg_end; + loff_t off_start, off_end; + int ret; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; + + off_start = offset & (PAGE_SIZE - 1); + off_end = (offset + len) & (PAGE_SIZE - 1); + + if (pg_start == pg_end) { + ret = fill_zero(inode, pg_start, off_start, + off_end - off_start); + if (ret) + return ret; + } else { + if (off_start) { + ret = fill_zero(inode, pg_start++, off_start, + PAGE_SIZE - off_start); + if (ret) + return ret; + } + if (off_end) { + ret = fill_zero(inode, pg_end, 0, off_end); + if (ret) + return ret; + } + + if (pg_start < pg_end) { + loff_t blk_start, blk_end; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + f2fs_balance_fs(sbi, true); + + blk_start = (loff_t)pg_start << PAGE_SHIFT; + blk_end = (loff_t)pg_end << PAGE_SHIFT; + + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + + truncate_pagecache_range(inode, blk_start, blk_end - 1); + + f2fs_lock_op(sbi); + ret = f2fs_truncate_hole(inode, pg_start, pg_end); + f2fs_unlock_op(sbi); + + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } + } + + return ret; +} + +static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, pgoff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + int ret, done, i; + +next_dnode: + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + return ret; + } else if (ret == -ENOENT) { + if (dn.max_level == 0) + return -ENOENT; + done = min((pgoff_t)ADDRS_PER_BLOCK(inode) - + dn.ofs_in_node, len); + blkaddr += done; + do_replace += done; + goto next; + } + + done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - + dn.ofs_in_node, len); + for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { + *blkaddr = f2fs_data_blkaddr(&dn); + + if (__is_valid_data_blkaddr(*blkaddr) && + !f2fs_is_valid_blkaddr(sbi, *blkaddr, + DATA_GENERIC_ENHANCE)) { + f2fs_put_dnode(&dn); + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + return -EFSCORRUPTED; + } + + if (!f2fs_is_checkpointed_data(sbi, *blkaddr)) { + + if (f2fs_lfs_mode(sbi)) { + f2fs_put_dnode(&dn); + return -EOPNOTSUPP; + } + + /* do not invalidate this block address */ + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + *do_replace = 1; + } + } + f2fs_put_dnode(&dn); +next: + len -= done; + off += done; + if (len) + goto next_dnode; + return 0; +} + +static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, int len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + int ret, i; + + for (i = 0; i < len; i++, do_replace++, blkaddr++) { + if (*do_replace == 0) + continue; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA); + if (ret) { + dec_valid_block_count(sbi, inode, 1); + f2fs_invalidate_blocks(sbi, *blkaddr); + } else { + f2fs_update_data_blkaddr(&dn, *blkaddr); + } + f2fs_put_dnode(&dn); + } + return 0; +} + +static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, + block_t *blkaddr, int *do_replace, + pgoff_t src, pgoff_t dst, pgoff_t len, bool full) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(src_inode); + pgoff_t i = 0; + int ret; + + while (i < len) { + if (blkaddr[i] == NULL_ADDR && !full) { + i++; + continue; + } + + if (do_replace[i] || blkaddr[i] == NULL_ADDR) { + struct dnode_of_data dn; + struct node_info ni; + size_t new_size; + pgoff_t ilen; + + set_new_dnode(&dn, dst_inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, dst + i, ALLOC_NODE); + if (ret) + return ret; + + ret = f2fs_get_node_info(sbi, dn.nid, &ni, false); + if (ret) { + f2fs_put_dnode(&dn); + return ret; + } + + ilen = min((pgoff_t) + ADDRS_PER_PAGE(dn.node_page, dst_inode) - + dn.ofs_in_node, len - i); + do { + dn.data_blkaddr = f2fs_data_blkaddr(&dn); + f2fs_truncate_data_blocks_range(&dn, 1); + + if (do_replace[i]) { + f2fs_i_blocks_write(src_inode, + 1, false, false); + f2fs_i_blocks_write(dst_inode, + 1, true, false); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + blkaddr[i], ni.version, true, false); + + do_replace[i] = 0; + } + dn.ofs_in_node++; + i++; + new_size = (loff_t)(dst + i) << PAGE_SHIFT; + if (dst_inode->i_size < new_size) + f2fs_i_size_write(dst_inode, new_size); + } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR)); + + f2fs_put_dnode(&dn); + } else { + struct page *psrc, *pdst; + + psrc = f2fs_get_lock_data_page(src_inode, + src + i, true); + if (IS_ERR(psrc)) + return PTR_ERR(psrc); + pdst = f2fs_get_new_data_page(dst_inode, NULL, dst + i, + true); + if (IS_ERR(pdst)) { + f2fs_put_page(psrc, 1); + return PTR_ERR(pdst); + } + memcpy_page(pdst, 0, psrc, 0, PAGE_SIZE); + set_page_dirty(pdst); + f2fs_put_page(pdst, 1); + f2fs_put_page(psrc, 1); + + ret = f2fs_truncate_hole(src_inode, + src + i, src + i + 1); + if (ret) + return ret; + i++; + } + } + return 0; +} + +static int __exchange_data_block(struct inode *src_inode, + struct inode *dst_inode, pgoff_t src, pgoff_t dst, + pgoff_t len, bool full) +{ + block_t *src_blkaddr; + int *do_replace; + pgoff_t olen; + int ret; + + while (len) { + olen = min((pgoff_t)4 * ADDRS_PER_BLOCK(src_inode), len); + + src_blkaddr = f2fs_kvzalloc(F2FS_I_SB(src_inode), + array_size(olen, sizeof(block_t)), + GFP_NOFS); + if (!src_blkaddr) + return -ENOMEM; + + do_replace = f2fs_kvzalloc(F2FS_I_SB(src_inode), + array_size(olen, sizeof(int)), + GFP_NOFS); + if (!do_replace) { + kvfree(src_blkaddr); + return -ENOMEM; + } + + ret = __read_out_blkaddrs(src_inode, src_blkaddr, + do_replace, src, olen); + if (ret) + goto roll_back; + + ret = __clone_blkaddrs(src_inode, dst_inode, src_blkaddr, + do_replace, src, dst, olen, full); + if (ret) + goto roll_back; + + src += olen; + dst += olen; + len -= olen; + + kvfree(src_blkaddr); + kvfree(do_replace); + } + return 0; + +roll_back: + __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, olen); + kvfree(src_blkaddr); + kvfree(do_replace); + return ret; +} + +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t nrpages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + len) >> PAGE_SHIFT; + int ret; + + f2fs_balance_fs(sbi, true); + + /* avoid gc operation during block exchange */ + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + + f2fs_lock_op(sbi); + f2fs_drop_extent_tree(inode); + truncate_pagecache(inode, offset); + ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); + f2fs_unlock_op(sbi); + + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + return ret; +} + +static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) +{ + loff_t new_size; + int ret; + + if (offset + len >= i_size_read(inode)) + return -EINVAL; + + /* collapse range should be aligned to block size of f2fs. */ + if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + if (ret) + return ret; + + ret = f2fs_do_collapse(inode, offset, len); + if (ret) + return ret; + + /* write out all moved pages, if possible */ + filemap_invalidate_lock(inode->i_mapping); + filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + truncate_pagecache(inode, offset); + + new_size = i_size_read(inode) - len; + ret = f2fs_truncate_blocks(inode, new_size, true); + filemap_invalidate_unlock(inode->i_mapping); + if (!ret) + f2fs_i_size_write(inode, new_size); + return ret; +} + +static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, + pgoff_t end) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + pgoff_t index = start; + unsigned int ofs_in_node = dn->ofs_in_node; + blkcnt_t count = 0; + int ret; + + for (; index < end; index++, dn->ofs_in_node++) { + if (f2fs_data_blkaddr(dn) == NULL_ADDR) + count++; + } + + dn->ofs_in_node = ofs_in_node; + ret = f2fs_reserve_new_blocks(dn, count); + if (ret) + return ret; + + dn->ofs_in_node = ofs_in_node; + for (index = start; index < end; index++, dn->ofs_in_node++) { + dn->data_blkaddr = f2fs_data_blkaddr(dn); + /* + * f2fs_reserve_new_blocks will not guarantee entire block + * allocation. + */ + if (dn->data_blkaddr == NULL_ADDR) { + ret = -ENOSPC; + break; + } + + if (dn->data_blkaddr == NEW_ADDR) + continue; + + if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr, + DATA_GENERIC_ENHANCE)) { + ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + break; + } + + f2fs_invalidate_blocks(sbi, dn->data_blkaddr); + dn->data_blkaddr = NEW_ADDR; + f2fs_set_data_blkaddr(dn); + } + + f2fs_update_read_extent_cache_range(dn, start, 0, index - start); + + return ret; +} + +static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; + pgoff_t index, pg_start, pg_end; + loff_t new_size = i_size_read(inode); + loff_t off_start, off_end; + int ret = 0; + + ret = inode_newsize_ok(inode, (len + offset)); + if (ret) + return ret; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); + if (ret) + return ret; + + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; + + off_start = offset & (PAGE_SIZE - 1); + off_end = (offset + len) & (PAGE_SIZE - 1); + + if (pg_start == pg_end) { + ret = fill_zero(inode, pg_start, off_start, + off_end - off_start); + if (ret) + return ret; + + new_size = max_t(loff_t, new_size, offset + len); + } else { + if (off_start) { + ret = fill_zero(inode, pg_start++, off_start, + PAGE_SIZE - off_start); + if (ret) + return ret; + + new_size = max_t(loff_t, new_size, + (loff_t)pg_start << PAGE_SHIFT); + } + + for (index = pg_start; index < pg_end;) { + struct dnode_of_data dn; + unsigned int end_offset; + pgoff_t end; + + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(mapping); + + truncate_pagecache_range(inode, + (loff_t)index << PAGE_SHIFT, + ((loff_t)pg_end << PAGE_SHIFT) - 1); + + f2fs_lock_op(sbi); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); + if (ret) { + f2fs_unlock_op(sbi); + filemap_invalidate_unlock(mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + goto out; + } + + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + end = min(pg_end, end_offset - dn.ofs_in_node + index); + + ret = f2fs_do_zero_range(&dn, index, end); + f2fs_put_dnode(&dn); + + f2fs_unlock_op(sbi); + filemap_invalidate_unlock(mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + f2fs_balance_fs(sbi, dn.node_changed); + + if (ret) + goto out; + + index = end; + new_size = max_t(loff_t, new_size, + (loff_t)index << PAGE_SHIFT); + } + + if (off_end) { + ret = fill_zero(inode, pg_end, 0, off_end); + if (ret) + goto out; + + new_size = max_t(loff_t, new_size, offset + len); + } + } + +out: + if (new_size > i_size_read(inode)) { + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); + else + f2fs_i_size_write(inode, new_size); + } + return ret; +} + +static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; + pgoff_t nr, pg_start, pg_end, delta, idx; + loff_t new_size; + int ret = 0; + + new_size = i_size_read(inode) + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + return ret; + + if (offset >= i_size_read(inode)) + return -EINVAL; + + /* insert range should be aligned to block size of f2fs. */ + if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + f2fs_balance_fs(sbi, true); + + filemap_invalidate_lock(mapping); + ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); + filemap_invalidate_unlock(mapping); + if (ret) + return ret; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(mapping, offset, LLONG_MAX); + if (ret) + return ret; + + pg_start = offset >> PAGE_SHIFT; + pg_end = (offset + len) >> PAGE_SHIFT; + delta = pg_end - pg_start; + idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + /* avoid gc operation during block exchange */ + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(mapping); + truncate_pagecache(inode, offset); + + while (!ret && idx > pg_start) { + nr = idx - pg_start; + if (nr > delta) + nr = delta; + idx -= nr; + + f2fs_lock_op(sbi); + f2fs_drop_extent_tree(inode); + + ret = __exchange_data_block(inode, inode, idx, + idx + delta, nr, false); + f2fs_unlock_op(sbi); + } + filemap_invalidate_unlock(mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + /* write out all moved pages, if possible */ + filemap_invalidate_lock(mapping); + filemap_write_and_wait_range(mapping, offset, LLONG_MAX); + truncate_pagecache(inode, offset); + filemap_invalidate_unlock(mapping); + + if (!ret) + f2fs_i_size_write(inode, new_size); + return ret; +} + +static int expand_inode_data(struct inode *inode, loff_t offset, + loff_t len, int mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_map_blocks map = { .m_next_pgofs = NULL, + .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, + .m_may_create = true }; + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true, + .nr_free_secs = 0 }; + pgoff_t pg_start, pg_end; + loff_t new_size = i_size_read(inode); + loff_t off_end; + block_t expanded = 0; + int err; + + err = inode_newsize_ok(inode, (len + offset)); + if (err) + return err; + + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + + f2fs_balance_fs(sbi, true); + + pg_start = ((unsigned long long)offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT; + off_end = (offset + len) & (PAGE_SIZE - 1); + + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; + if (off_end) + map.m_len++; + + if (!map.m_len) + return 0; + + if (f2fs_is_pinned_file(inode)) { + block_t sec_blks = CAP_BLKS_PER_SEC(sbi); + block_t sec_len = roundup(map.m_len, sec_blks); + + map.m_len = sec_blks; +next_alloc: + if (has_not_enough_free_secs(sbi, 0, + GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { + f2fs_down_write(&sbi->gc_lock); + err = f2fs_gc(sbi, &gc_control); + if (err && err != -ENODATA) + goto out_err; + } + + f2fs_down_write(&sbi->pin_sem); + + f2fs_lock_op(sbi); + f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); + f2fs_unlock_op(sbi); + + map.m_seg_type = CURSEG_COLD_DATA_PINNED; + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + file_dont_truncate(inode); + + f2fs_up_write(&sbi->pin_sem); + + expanded += map.m_len; + sec_len -= map.m_len; + map.m_lblk += map.m_len; + if (!err && sec_len) + goto next_alloc; + + map.m_len = expanded; + } else { + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + expanded = map.m_len; + } +out_err: + if (err) { + pgoff_t last_off; + + if (!expanded) + return err; + + last_off = pg_start + expanded - 1; + + /* update new size to the failed position */ + new_size = (last_off == pg_end) ? offset + len : + (loff_t)(last_off + 1) << PAGE_SHIFT; + } else { + new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; + } + + if (new_size > i_size_read(inode)) { + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); + else + f2fs_i_size_write(inode, new_size); + } + + return err; +} + +static long f2fs_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + long ret = 0; + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode))) + return -ENOSPC; + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + + /* f2fs only support ->fallocate for regular file */ + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (IS_ENCRYPTED(inode) && + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) + return -EOPNOTSUPP; + + /* + * Pinned file should not support partial trucation since the block + * can be used by applications. + */ + if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) && + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | + FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE))) + return -EOPNOTSUPP; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_INSERT_RANGE)) + return -EOPNOTSUPP; + + inode_lock(inode); + + ret = file_modified(file); + if (ret) + goto out; + + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (offset >= inode->i_size) + goto out; + + ret = punch_hole(inode, offset, len); + } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { + ret = f2fs_collapse_range(inode, offset, len); + } else if (mode & FALLOC_FL_ZERO_RANGE) { + ret = f2fs_zero_range(inode, offset, len, mode); + } else if (mode & FALLOC_FL_INSERT_RANGE) { + ret = f2fs_insert_range(inode, offset, len); + } else { + ret = expand_inode_data(inode, offset, len, mode); + } + + if (!ret) { + inode->i_mtime = inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, false); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + } + +out: + inode_unlock(inode); + + trace_f2fs_fallocate(inode, mode, offset, len, ret); + return ret; +} + +static int f2fs_release_file(struct inode *inode, struct file *filp) +{ + /* + * f2fs_relase_file is called at every close calls. So we should + * not drop any inmemory pages by close called by other process. + */ + if (!(filp->f_mode & FMODE_WRITE) || + atomic_read(&inode->i_writecount) != 1) + return 0; + + inode_lock(inode); + f2fs_abort_atomic_write(inode, true); + inode_unlock(inode); + + return 0; +} + +static int f2fs_file_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + + /* + * If the process doing a transaction is crashed, we should do + * roll-back. Otherwise, other reader/write can see corrupted database + * until all the writers close its file. Since this should be done + * before dropping file lock, it needs to do in ->flush. + */ + if (F2FS_I(inode)->atomic_write_task == current && + (current->flags & PF_EXITING)) { + inode_lock(inode); + f2fs_abort_atomic_write(inode, true); + inode_unlock(inode); + } + + return 0; +} + +static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + u32 masked_flags = fi->i_flags & mask; + + /* mask can be shrunk by flags_valid selector */ + iflags &= mask; + + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + return -EPERM; + + if ((iflags ^ masked_flags) & F2FS_CASEFOLD_FL) { + if (!f2fs_sb_has_casefold(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + if (!f2fs_empty_dir(inode)) + return -ENOTEMPTY; + } + + if (iflags & (F2FS_COMPR_FL | F2FS_NOCOMP_FL)) { + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + if ((iflags & F2FS_COMPR_FL) && (iflags & F2FS_NOCOMP_FL)) + return -EINVAL; + } + + if ((iflags ^ masked_flags) & F2FS_COMPR_FL) { + if (masked_flags & F2FS_COMPR_FL) { + if (!f2fs_disable_compressed_file(inode)) + return -EINVAL; + } else { + /* try to convert inline_data to support compression */ + int err = f2fs_convert_inline_inode(inode); + if (err) + return err; + + f2fs_down_write(&F2FS_I(inode)->i_sem); + if (!f2fs_may_compress(inode) || + (S_ISREG(inode->i_mode) && + F2FS_HAS_BLOCKS(inode))) { + f2fs_up_write(&F2FS_I(inode)->i_sem); + return -EINVAL; + } + err = set_compress_context(inode); + f2fs_up_write(&F2FS_I(inode)->i_sem); + + if (err) + return err; + } + } + + fi->i_flags = iflags | (fi->i_flags & ~mask); + f2fs_bug_on(F2FS_I_SB(inode), (fi->i_flags & F2FS_COMPR_FL) && + (fi->i_flags & F2FS_NOCOMP_FL)); + + if (fi->i_flags & F2FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + else + clear_inode_flag(inode, FI_PROJ_INHERIT); + + inode->i_ctime = current_time(inode); + f2fs_set_inode_flags(inode); + f2fs_mark_inode_dirty_sync(inode, true); + return 0; +} + +/* FS_IOC_[GS]ETFLAGS and FS_IOC_FS[GS]ETXATTR support */ + +/* + * To make a new on-disk f2fs i_flag gettable via FS_IOC_GETFLAGS, add an entry + * for it to f2fs_fsflags_map[], and add its FS_*_FL equivalent to + * F2FS_GETTABLE_FS_FL. To also make it settable via FS_IOC_SETFLAGS, also add + * its FS_*_FL equivalent to F2FS_SETTABLE_FS_FL. + * + * Translating flags to fsx_flags value used by FS_IOC_FSGETXATTR and + * FS_IOC_FSSETXATTR is done by the VFS. + */ + +static const struct { + u32 iflag; + u32 fsflag; +} f2fs_fsflags_map[] = { + { F2FS_COMPR_FL, FS_COMPR_FL }, + { F2FS_SYNC_FL, FS_SYNC_FL }, + { F2FS_IMMUTABLE_FL, FS_IMMUTABLE_FL }, + { F2FS_APPEND_FL, FS_APPEND_FL }, + { F2FS_NODUMP_FL, FS_NODUMP_FL }, + { F2FS_NOATIME_FL, FS_NOATIME_FL }, + { F2FS_NOCOMP_FL, FS_NOCOMP_FL }, + { F2FS_INDEX_FL, FS_INDEX_FL }, + { F2FS_DIRSYNC_FL, FS_DIRSYNC_FL }, + { F2FS_PROJINHERIT_FL, FS_PROJINHERIT_FL }, + { F2FS_CASEFOLD_FL, FS_CASEFOLD_FL }, +}; + +#define F2FS_GETTABLE_FS_FL ( \ + FS_COMPR_FL | \ + FS_SYNC_FL | \ + FS_IMMUTABLE_FL | \ + FS_APPEND_FL | \ + FS_NODUMP_FL | \ + FS_NOATIME_FL | \ + FS_NOCOMP_FL | \ + FS_INDEX_FL | \ + FS_DIRSYNC_FL | \ + FS_PROJINHERIT_FL | \ + FS_ENCRYPT_FL | \ + FS_INLINE_DATA_FL | \ + FS_NOCOW_FL | \ + FS_VERITY_FL | \ + FS_CASEFOLD_FL) + +#define F2FS_SETTABLE_FS_FL ( \ + FS_COMPR_FL | \ + FS_SYNC_FL | \ + FS_IMMUTABLE_FL | \ + FS_APPEND_FL | \ + FS_NODUMP_FL | \ + FS_NOATIME_FL | \ + FS_NOCOMP_FL | \ + FS_DIRSYNC_FL | \ + FS_PROJINHERIT_FL | \ + FS_CASEFOLD_FL) + +/* Convert f2fs on-disk i_flags to FS_IOC_{GET,SET}FLAGS flags */ +static inline u32 f2fs_iflags_to_fsflags(u32 iflags) +{ + u32 fsflags = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(f2fs_fsflags_map); i++) + if (iflags & f2fs_fsflags_map[i].iflag) + fsflags |= f2fs_fsflags_map[i].fsflag; + + return fsflags; +} + +/* Convert FS_IOC_{GET,SET}FLAGS flags to f2fs on-disk i_flags */ +static inline u32 f2fs_fsflags_to_iflags(u32 fsflags) +{ + u32 iflags = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(f2fs_fsflags_map); i++) + if (fsflags & f2fs_fsflags_map[i].fsflag) + iflags |= f2fs_fsflags_map[i].iflag; + + return iflags; +} + +static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + + return put_user(inode->i_generation, (int __user *)arg); +} + +static int f2fs_ioc_start_atomic_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + struct user_namespace *mnt_userns = file_mnt_user_ns(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct inode *pinode; + loff_t isize; + int ret; + + if (!inode_owner_or_capable(mnt_userns, inode)) + return -EACCES; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (filp->f_flags & O_DIRECT) + return -EINVAL; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + if (!f2fs_disable_compressed_file(inode)) { + ret = -EINVAL; + goto out; + } + + if (f2fs_is_atomic_file(inode)) + goto out; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); + + /* + * Should wait end_io to count F2FS_WB_CP_DATA correctly by + * f2fs_is_atomic_file. + */ + if (get_dirty_pages(inode)) + f2fs_warn(sbi, "Unexpected flush for atomic writes: ino=%lu, npages=%u", + inode->i_ino, get_dirty_pages(inode)); + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + goto out; + } + + /* Check if the inode already has a COW inode */ + if (fi->cow_inode == NULL) { + /* Create a COW inode for atomic write */ + pinode = f2fs_iget(inode->i_sb, fi->i_pino); + if (IS_ERR(pinode)) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + ret = PTR_ERR(pinode); + goto out; + } + + ret = f2fs_get_tmpfile(mnt_userns, pinode, &fi->cow_inode); + iput(pinode); + if (ret) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + goto out; + } + + set_inode_flag(fi->cow_inode, FI_COW_FILE); + clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); + } else { + /* Reuse the already created COW inode */ + ret = f2fs_do_truncate_blocks(fi->cow_inode, 0, true); + if (ret) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + goto out; + } + } + + f2fs_write_inode(inode, NULL); + + isize = i_size_read(inode); + fi->original_i_size = isize; + f2fs_i_size_write(fi->cow_inode, isize); + + stat_inc_atomic_inode(inode); + + set_inode_flag(inode, FI_ATOMIC_FILE); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + + f2fs_update_time(sbi, REQ_TIME); + fi->atomic_write_task = current; + stat_update_max_atomic_write(inode); + fi->atomic_write_cnt = 0; +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_commit_atomic_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + struct user_namespace *mnt_userns = file_mnt_user_ns(filp); + int ret; + + if (!inode_owner_or_capable(mnt_userns, inode)) + return -EACCES; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + f2fs_balance_fs(F2FS_I_SB(inode), true); + + inode_lock(inode); + + if (f2fs_is_atomic_file(inode)) { + ret = f2fs_commit_atomic_write(inode); + if (!ret) + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + + f2fs_abort_atomic_write(inode, ret); + } else { + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); + } + + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_abort_atomic_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + struct user_namespace *mnt_userns = file_mnt_user_ns(filp); + int ret; + + if (!inode_owner_or_capable(mnt_userns, inode)) + return -EACCES; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + f2fs_abort_atomic_write(inode, true); + + inode_unlock(inode); + + mnt_drop_write_file(filp); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return ret; +} + +static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct super_block *sb = sbi->sb; + __u32 in; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(in, (__u32 __user *)arg)) + return -EFAULT; + + if (in != F2FS_GOING_DOWN_FULLSYNC) { + ret = mnt_want_write_file(filp); + if (ret) { + if (ret == -EROFS) { + ret = 0; + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_SHUTDOWN); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + trace_f2fs_shutdown(sbi, in, ret); + } + return ret; + } + } + + switch (in) { + case F2FS_GOING_DOWN_FULLSYNC: + ret = freeze_bdev(sb->s_bdev); + if (ret) + goto out; + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + thaw_bdev(sb->s_bdev); + break; + case F2FS_GOING_DOWN_METASYNC: + /* do checkpoint only */ + ret = f2fs_sync_fs(sb, 1); + if (ret) + goto out; + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + break; + case F2FS_GOING_DOWN_NOSYNC: + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + break; + case F2FS_GOING_DOWN_METAFLUSH: + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + break; + case F2FS_GOING_DOWN_NEED_FSCK: + set_sbi_flag(sbi, SBI_NEED_FSCK); + set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + set_sbi_flag(sbi, SBI_IS_DIRTY); + /* do checkpoint only */ + ret = f2fs_sync_fs(sb, 1); + goto out; + default: + ret = -EINVAL; + goto out; + } + + f2fs_stop_gc_thread(sbi); + f2fs_stop_discard_thread(sbi); + + f2fs_drop_discard_cmd(sbi); + clear_opt(sbi, DISCARD); + + f2fs_update_time(sbi, REQ_TIME); +out: + if (in != F2FS_GOING_DOWN_FULLSYNC) + mnt_drop_write_file(filp); + + trace_f2fs_shutdown(sbi, in, ret); + + return ret; +} + +static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + struct fstrim_range range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!f2fs_hw_support_discard(F2FS_SB(sb))) + return -EOPNOTSUPP; + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + range.minlen = max((unsigned int)range.minlen, + bdev_discard_granularity(sb->s_bdev)); + ret = f2fs_trim_fs(F2FS_SB(sb), &range); + mnt_drop_write_file(filp); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return 0; +} + +static bool uuid_is_nonzero(__u8 u[16]) +{ + int i; + + for (i = 0; i < 16; i++) + if (u[i]) + return true; + return false; +} + +static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + + if (!f2fs_sb_has_encrypt(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); +} + +static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); +} + +static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + u8 encrypt_pw_salt[16]; + int err; + + if (!f2fs_sb_has_encrypt(sbi)) + return -EOPNOTSUPP; + + err = mnt_want_write_file(filp); + if (err) + return err; + + f2fs_down_write(&sbi->sb_lock); + + if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) + goto got_it; + + /* update superblock with uuid */ + generate_random_uuid(sbi->raw_super->encrypt_pw_salt); + + err = f2fs_commit_super(sbi, false); + if (err) { + /* undo new data */ + memset(sbi->raw_super->encrypt_pw_salt, 0, 16); + goto out_err; + } +got_it: + memcpy(encrypt_pw_salt, sbi->raw_super->encrypt_pw_salt, 16); +out_err: + f2fs_up_write(&sbi->sb_lock); + mnt_drop_write_file(filp); + + if (!err && copy_to_user((__u8 __user *)arg, encrypt_pw_salt, 16)) + err = -EFAULT; + + return err; +} + +static int f2fs_ioc_get_encryption_policy_ex(struct file *filp, + unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg); +} + +static int f2fs_ioc_add_encryption_key(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_add_key(filp, (void __user *)arg); +} + +static int f2fs_ioc_remove_encryption_key(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_remove_key(filp, (void __user *)arg); +} + +static int f2fs_ioc_remove_encryption_key_all_users(struct file *filp, + unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_remove_key_all_users(filp, (void __user *)arg); +} + +static int f2fs_ioc_get_encryption_key_status(struct file *filp, + unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_get_key_status(filp, (void __user *)arg); +} + +static int f2fs_ioc_get_encryption_nonce(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_get_nonce(filp, (void __user *)arg); +} + +static int f2fs_ioc_gc(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .no_bg_gc = false, + .should_migrate_blocks = false, + .nr_free_secs = 0 }; + __u32 sync; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(sync, (__u32 __user *)arg)) + return -EFAULT; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (!sync) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + ret = -EBUSY; + goto out; + } + } else { + f2fs_down_write(&sbi->gc_lock); + } + + gc_control.init_gc_type = sync ? FG_GC : BG_GC; + gc_control.err_gc_skipped = sync; + ret = f2fs_gc(sbi, &gc_control); +out: + mnt_drop_write_file(filp); + return ret; +} + +static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); + struct f2fs_gc_control gc_control = { + .init_gc_type = range->sync ? FG_GC : BG_GC, + .no_bg_gc = false, + .should_migrate_blocks = false, + .err_gc_skipped = range->sync, + .nr_free_secs = 0 }; + u64 end; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + end = range->start + range->len; + if (end < range->start || range->start < MAIN_BLKADDR(sbi) || + end >= MAX_BLKADDR(sbi)) + return -EINVAL; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + +do_more: + if (!range->sync) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + ret = -EBUSY; + goto out; + } + } else { + f2fs_down_write(&sbi->gc_lock); + } + + gc_control.victim_segno = GET_SEGNO(sbi, range->start); + ret = f2fs_gc(sbi, &gc_control); + if (ret) { + if (ret == -EBUSY) + ret = -EAGAIN; + goto out; + } + range->start += CAP_BLKS_PER_SEC(sbi); + if (range->start <= end) + goto do_more; +out: + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) +{ + struct f2fs_gc_range range; + + if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg, + sizeof(range))) + return -EFAULT; + return __f2fs_ioc_gc_range(filp, &range); +} + +static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + f2fs_info(sbi, "Skipping Checkpoint. Checkpoints currently disabled."); + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = f2fs_sync_fs(sbi->sb, 1); + + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_defragment_range(struct f2fs_sb_info *sbi, + struct file *filp, + struct f2fs_defragment *range) +{ + struct inode *inode = file_inode(filp); + struct f2fs_map_blocks map = { .m_next_extent = NULL, + .m_seg_type = NO_CHECK_TYPE, + .m_may_create = false }; + struct extent_info ei = {0, }; + pgoff_t pg_start, pg_end, next_pgofs; + unsigned int blk_per_seg = sbi->blocks_per_seg; + unsigned int total = 0, sec_num; + block_t blk_end = 0; + bool fragmented = false; + int err; + + pg_start = range->start >> PAGE_SHIFT; + pg_end = (range->start + range->len) >> PAGE_SHIFT; + + f2fs_balance_fs(sbi, true); + + inode_lock(inode); + + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + err = -EINVAL; + goto unlock_out; + } + + /* if in-place-update policy is enabled, don't waste time here */ + set_inode_flag(inode, FI_OPU_WRITE); + if (f2fs_should_update_inplace(inode, NULL)) { + err = -EINVAL; + goto out; + } + + /* writeback all dirty pages in the range */ + err = filemap_write_and_wait_range(inode->i_mapping, range->start, + range->start + range->len - 1); + if (err) + goto out; + + /* + * lookup mapping info in extent cache, skip defragmenting if physical + * block addresses are continuous. + */ + if (f2fs_lookup_read_extent_cache(inode, pg_start, &ei)) { + if (ei.fofs + ei.len >= pg_end) + goto out; + } + + map.m_lblk = pg_start; + map.m_next_pgofs = &next_pgofs; + + /* + * lookup mapping info in dnode page cache, skip defragmenting if all + * physical block addresses are continuous even if there are hole(s) + * in logical blocks. + */ + while (map.m_lblk < pg_end) { + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); + if (err) + goto out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk = next_pgofs; + continue; + } + + if (blk_end && blk_end != map.m_pblk) + fragmented = true; + + /* record total count of block that we're going to move */ + total += map.m_len; + + blk_end = map.m_pblk + map.m_len; + + map.m_lblk += map.m_len; + } + + if (!fragmented) { + total = 0; + goto out; + } + + sec_num = DIV_ROUND_UP(total, CAP_BLKS_PER_SEC(sbi)); + + /* + * make sure there are enough free section for LFS allocation, this can + * avoid defragment running in SSR mode when free section are allocated + * intensively + */ + if (has_not_enough_free_secs(sbi, 0, sec_num)) { + err = -EAGAIN; + goto out; + } + + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; + total = 0; + + while (map.m_lblk < pg_end) { + pgoff_t idx; + int cnt = 0; + +do_map: + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); + if (err) + goto clear_out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk = next_pgofs; + goto check; + } + + set_inode_flag(inode, FI_SKIP_WRITES); + + idx = map.m_lblk; + while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { + struct page *page; + + page = f2fs_get_lock_data_page(inode, idx, true); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto clear_out; + } + + set_page_dirty(page); + set_page_private_gcing(page); + f2fs_put_page(page, 1); + + idx++; + cnt++; + total++; + } + + map.m_lblk = idx; +check: + if (map.m_lblk < pg_end && cnt < blk_per_seg) + goto do_map; + + clear_inode_flag(inode, FI_SKIP_WRITES); + + err = filemap_fdatawrite(inode->i_mapping); + if (err) + goto out; + } +clear_out: + clear_inode_flag(inode, FI_SKIP_WRITES); +out: + clear_inode_flag(inode, FI_OPU_WRITE); +unlock_out: + inode_unlock(inode); + if (!err) + range->len = (u64)total << PAGE_SHIFT; + return err; +} + +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_defragment range; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode)) + return -EINVAL; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) + return -EFAULT; + + /* verify alignment of offset & size */ + if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + if (unlikely((range.start + range.len) >> PAGE_SHIFT > + max_file_blocks(inode))) + return -EINVAL; + + err = mnt_want_write_file(filp); + if (err) + return err; + + err = f2fs_defragment_range(sbi, filp, &range); + mnt_drop_write_file(filp); + + f2fs_update_time(sbi, REQ_TIME); + if (err < 0) + return err; + + if (copy_to_user((struct f2fs_defragment __user *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; +} + +static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, size_t len) +{ + struct inode *src = file_inode(file_in); + struct inode *dst = file_inode(file_out); + struct f2fs_sb_info *sbi = F2FS_I_SB(src); + size_t olen = len, dst_max_i_size = 0; + size_t dst_osize; + int ret; + + if (file_in->f_path.mnt != file_out->f_path.mnt || + src->i_sb != dst->i_sb) + return -EXDEV; + + if (unlikely(f2fs_readonly(src->i_sb))) + return -EROFS; + + if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode)) + return -EINVAL; + + if (IS_ENCRYPTED(src) || IS_ENCRYPTED(dst)) + return -EOPNOTSUPP; + + if (pos_out < 0 || pos_in < 0) + return -EINVAL; + + if (src == dst) { + if (pos_in == pos_out) + return 0; + if (pos_out > pos_in && pos_out < pos_in + len) + return -EINVAL; + } + + inode_lock(src); + if (src != dst) { + ret = -EBUSY; + if (!inode_trylock(dst)) + goto out; + } + + if (f2fs_compressed_file(src) || f2fs_compressed_file(dst)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = -EINVAL; + if (pos_in + len > src->i_size || pos_in + len < pos_in) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - pos_in; + if (pos_in + len == src->i_size) + len = ALIGN(src->i_size, F2FS_BLKSIZE) - pos_in; + if (len == 0) { + ret = 0; + goto out_unlock; + } + + dst_osize = dst->i_size; + if (pos_out + olen > dst->i_size) + dst_max_i_size = pos_out + olen; + + /* verify the end result is block aligned */ + if (!IS_ALIGNED(pos_in, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_in + len, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_out, F2FS_BLKSIZE)) + goto out_unlock; + + ret = f2fs_convert_inline_inode(src); + if (ret) + goto out_unlock; + + ret = f2fs_convert_inline_inode(dst); + if (ret) + goto out_unlock; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(src->i_mapping, + pos_in, pos_in + len); + if (ret) + goto out_unlock; + + ret = filemap_write_and_wait_range(dst->i_mapping, + pos_out, pos_out + len); + if (ret) + goto out_unlock; + + f2fs_balance_fs(sbi, true); + + f2fs_down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + if (src != dst) { + ret = -EBUSY; + if (!f2fs_down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) + goto out_src; + } + + f2fs_lock_op(sbi); + ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, + pos_out >> F2FS_BLKSIZE_BITS, + len >> F2FS_BLKSIZE_BITS, false); + + if (!ret) { + if (dst_max_i_size) + f2fs_i_size_write(dst, dst_max_i_size); + else if (dst_osize != dst->i_size) + f2fs_i_size_write(dst, dst_osize); + } + f2fs_unlock_op(sbi); + + if (src != dst) + f2fs_up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); +out_src: + f2fs_up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); +out_unlock: + if (src != dst) + inode_unlock(dst); +out: + inode_unlock(src); + return ret; +} + +static int __f2fs_ioc_move_range(struct file *filp, + struct f2fs_move_range *range) +{ + struct fd dst; + int err; + + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + dst = fdget(range->dst_fd); + if (!dst.file) + return -EBADF; + + if (!(dst.file->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto err_out; + } + + err = mnt_want_write_file(filp); + if (err) + goto err_out; + + err = f2fs_move_file_range(filp, range->pos_in, dst.file, + range->pos_out, range->len); + + mnt_drop_write_file(filp); +err_out: + fdput(dst); + return err; +} + +static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) +{ + struct f2fs_move_range range; + + if (copy_from_user(&range, (struct f2fs_move_range __user *)arg, + sizeof(range))) + return -EFAULT; + return __f2fs_ioc_move_range(filp, &range); +} + +static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct sit_info *sm = SIT_I(sbi); + unsigned int start_segno = 0, end_segno = 0; + unsigned int dev_start_segno = 0, dev_end_segno = 0; + struct f2fs_flush_device range; + struct f2fs_gc_control gc_control = { + .init_gc_type = FG_GC, + .should_migrate_blocks = true, + .err_gc_skipped = true, + .nr_free_secs = 0 }; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return -EINVAL; + + if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg, + sizeof(range))) + return -EFAULT; + + if (!f2fs_is_multi_device(sbi) || sbi->s_ndevs - 1 <= range.dev_num || + __is_large_section(sbi)) { + f2fs_warn(sbi, "Can't flush %u in %d for segs_per_sec %u != 1", + range.dev_num, sbi->s_ndevs, sbi->segs_per_sec); + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (range.dev_num != 0) + dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk); + dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk); + + start_segno = sm->last_victim[FLUSH_DEVICE]; + if (start_segno < dev_start_segno || start_segno >= dev_end_segno) + start_segno = dev_start_segno; + end_segno = min(start_segno + range.segments, dev_end_segno); + + while (start_segno < end_segno) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + ret = -EBUSY; + goto out; + } + sm->last_victim[GC_CB] = end_segno + 1; + sm->last_victim[GC_GREEDY] = end_segno + 1; + sm->last_victim[ALLOC_NEXT] = end_segno + 1; + + gc_control.victim_segno = start_segno; + ret = f2fs_gc(sbi, &gc_control); + if (ret == -EAGAIN) + ret = 0; + else if (ret < 0) + break; + start_segno++; + } +out: + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + u32 sb_feature = le32_to_cpu(F2FS_I_SB(inode)->raw_super->feature); + + /* Must validate to set it with SQLite behavior in Android. */ + sb_feature |= F2FS_FEATURE_ATOMIC_WRITE; + + return put_user(sb_feature, (u32 __user *)arg); +} + +#ifdef CONFIG_QUOTA +int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid) +{ + struct dquot *transfer_to[MAXQUOTAS] = {}; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct super_block *sb = sbi->sb; + int err; + + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); + if (IS_ERR(transfer_to[PRJQUOTA])) + return PTR_ERR(transfer_to[PRJQUOTA]); + + err = __dquot_transfer(inode, transfer_to); + if (err) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + dqput(transfer_to[PRJQUOTA]); + return err; +} + +static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode *ri = NULL; + kprojid_t kprojid; + int err; + + if (!f2fs_sb_has_project_quota(sbi)) { + if (projid != F2FS_DEF_PROJID) + return -EOPNOTSUPP; + else + return 0; + } + + if (!f2fs_has_extra_attr(inode)) + return -EOPNOTSUPP; + + kprojid = make_kprojid(&init_user_ns, (projid_t)projid); + + if (projid_eq(kprojid, fi->i_projid)) + return 0; + + err = -EPERM; + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + return err; + + if (!F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + return -EOVERFLOW; + + err = f2fs_dquot_initialize(inode); + if (err) + return err; + + f2fs_lock_op(sbi); + err = f2fs_transfer_project_quota(inode, kprojid); + if (err) + goto out_unlock; + + fi->i_projid = kprojid; + inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, true); +out_unlock: + f2fs_unlock_op(sbi); + return err; +} +#else +int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid) +{ + return 0; +} + +static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) +{ + if (projid != F2FS_DEF_PROJID) + return -EOPNOTSUPP; + return 0; +} +#endif + +int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct f2fs_inode_info *fi = F2FS_I(inode); + u32 fsflags = f2fs_iflags_to_fsflags(fi->i_flags); + + if (IS_ENCRYPTED(inode)) + fsflags |= FS_ENCRYPT_FL; + if (IS_VERITY(inode)) + fsflags |= FS_VERITY_FL; + if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) + fsflags |= FS_INLINE_DATA_FL; + if (is_inode_flag_set(inode, FI_PIN_FILE)) + fsflags |= FS_NOCOW_FL; + + fileattr_fill_flags(fa, fsflags & F2FS_GETTABLE_FS_FL); + + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode))) + fa->fsx_projid = from_kprojid(&init_user_ns, fi->i_projid); + + return 0; +} + +int f2fs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + u32 fsflags = fa->flags, mask = F2FS_SETTABLE_FS_FL; + u32 iflags; + int err; + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode))) + return -ENOSPC; + if (fsflags & ~F2FS_GETTABLE_FS_FL) + return -EOPNOTSUPP; + fsflags &= F2FS_SETTABLE_FS_FL; + if (!fa->flags_valid) + mask &= FS_COMMON_FL; + + iflags = f2fs_fsflags_to_iflags(fsflags); + if (f2fs_mask_flags(inode->i_mode, iflags) != iflags) + return -EOPNOTSUPP; + + err = f2fs_setflags_common(inode, iflags, f2fs_fsflags_to_iflags(mask)); + if (!err) + err = f2fs_ioc_setproject(inode, fa->fsx_projid); + + return err; +} + +int f2fs_pin_file_control(struct inode *inode, bool inc) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* Use i_gc_failures for normal file as a risk signal. */ + if (inc) + f2fs_i_gc_failures_write(inode, + fi->i_gc_failures[GC_FAILURE_PIN] + 1); + + if (fi->i_gc_failures[GC_FAILURE_PIN] > sbi->gc_pin_file_threshold) { + f2fs_warn(sbi, "%s: Enable GC = ino %lx after %x GC trials", + __func__, inode->i_ino, + fi->i_gc_failures[GC_FAILURE_PIN]); + clear_inode_flag(inode, FI_PIN_FILE); + return -EAGAIN; + } + return 0; +} + +static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u32 pin; + int ret = 0; + + if (get_user(pin, (__u32 __user *)arg)) + return -EFAULT; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (f2fs_readonly(F2FS_I_SB(inode)->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + if (!pin) { + clear_inode_flag(inode, FI_PIN_FILE); + f2fs_i_gc_failures_write(inode, 0); + goto done; + } + + if (f2fs_should_update_outplace(inode, NULL)) { + ret = -EINVAL; + goto out; + } + + if (f2fs_pin_file_control(inode, false)) { + ret = -EAGAIN; + goto out; + } + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + if (!f2fs_disable_compressed_file(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + + set_inode_flag(inode, FI_PIN_FILE); + ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]; +done: + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u32 pin = 0; + + if (is_inode_flag_set(inode, FI_PIN_FILE)) + pin = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]; + return put_user(pin, (u32 __user *)arg); +} + +int f2fs_precache_extents(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_map_blocks map; + pgoff_t m_next_extent; + loff_t end; + int err; + + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return -EOPNOTSUPP; + + map.m_lblk = 0; + map.m_pblk = 0; + map.m_next_pgofs = NULL; + map.m_next_extent = &m_next_extent; + map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; + end = max_file_blocks(inode); + + while (map.m_lblk < end) { + map.m_len = end - map.m_lblk; + + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + if (err) + return err; + + map.m_lblk = m_next_extent; + } + + return 0; +} + +static int f2fs_ioc_precache_extents(struct file *filp, unsigned long arg) +{ + return f2fs_precache_extents(file_inode(filp)); +} + +static int f2fs_ioc_resize_fs(struct file *filp, unsigned long arg) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); + __u64 block_count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&block_count, (void __user *)arg, + sizeof(block_count))) + return -EFAULT; + + return f2fs_resize_fs(filp, block_count); +} + +static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + + if (!f2fs_sb_has_verity(F2FS_I_SB(inode))) { + f2fs_warn(F2FS_I_SB(inode), + "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem", + inode->i_ino); + return -EOPNOTSUPP; + } + + return fsverity_ioctl_enable(filp, (const void __user *)arg); +} + +static int f2fs_ioc_measure_verity(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fsverity_ioctl_measure(filp, (void __user *)arg); +} + +static int f2fs_ioc_read_verity_metadata(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fsverity_ioctl_read_metadata(filp, (const void __user *)arg); +} + +static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + char *vbuf; + int count; + int err = 0; + + vbuf = f2fs_kzalloc(sbi, MAX_VOLUME_NAME, GFP_KERNEL); + if (!vbuf) + return -ENOMEM; + + f2fs_down_read(&sbi->sb_lock); + count = utf16s_to_utf8s(sbi->raw_super->volume_name, + ARRAY_SIZE(sbi->raw_super->volume_name), + UTF16_LITTLE_ENDIAN, vbuf, MAX_VOLUME_NAME); + f2fs_up_read(&sbi->sb_lock); + + if (copy_to_user((char __user *)arg, vbuf, + min(FSLABEL_MAX, count))) + err = -EFAULT; + + kfree(vbuf); + return err; +} + +static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + char *vbuf; + int err = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vbuf = strndup_user((const char __user *)arg, FSLABEL_MAX); + if (IS_ERR(vbuf)) + return PTR_ERR(vbuf); + + err = mnt_want_write_file(filp); + if (err) + goto out; + + f2fs_down_write(&sbi->sb_lock); + + memset(sbi->raw_super->volume_name, 0, + sizeof(sbi->raw_super->volume_name)); + utf8s_to_utf16s(vbuf, strlen(vbuf), UTF16_LITTLE_ENDIAN, + sbi->raw_super->volume_name, + ARRAY_SIZE(sbi->raw_super->volume_name)); + + err = f2fs_commit_super(sbi, false); + + f2fs_up_write(&sbi->sb_lock); + + mnt_drop_write_file(filp); +out: + kfree(vbuf); + return err; +} + +static int f2fs_get_compress_blocks(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u64 blocks; + + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + blocks = atomic_read(&F2FS_I(inode)->i_compr_blocks); + return put_user(blocks, (u64 __user *)arg); +} + +static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + unsigned int released_blocks = 0; + int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + block_t blkaddr; + int i; + + for (i = 0; i < count; i++) { + blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + if (!__is_valid_data_blkaddr(blkaddr)) + continue; + if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + return -EFSCORRUPTED; + } + } + + while (count) { + int compr_blocks = 0; + + for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) { + blkaddr = f2fs_data_blkaddr(dn); + + if (i == 0) { + if (blkaddr == COMPRESS_ADDR) + continue; + dn->ofs_in_node += cluster_size; + goto next; + } + + if (__is_valid_data_blkaddr(blkaddr)) + compr_blocks++; + + if (blkaddr != NEW_ADDR) + continue; + + dn->data_blkaddr = NULL_ADDR; + f2fs_set_data_blkaddr(dn); + } + + f2fs_i_compr_blocks_update(dn->inode, compr_blocks, false); + dec_valid_block_count(sbi, dn->inode, + cluster_size - compr_blocks); + + released_blocks += cluster_size - compr_blocks; +next: + count -= cluster_size; + } + + return released_blocks; +} + +static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t page_idx = 0, last_idx; + unsigned int released_blocks = 0; + int ret; + int writecount; + + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + f2fs_balance_fs(F2FS_I_SB(inode), true); + + inode_lock(inode); + + writecount = atomic_read(&inode->i_writecount); + if ((filp->f_mode & FMODE_WRITE && writecount != 1) || + (!(filp->f_mode & FMODE_WRITE) && writecount)) { + ret = -EBUSY; + goto out; + } + + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + ret = -EINVAL; + goto out; + } + + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + goto out; + + set_inode_flag(inode, FI_COMPRESS_RELEASED); + inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, true); + + if (!atomic_read(&F2FS_I(inode)->i_compr_blocks)) + goto out; + + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + while (page_idx < last_idx) { + struct dnode_of_data dn; + pgoff_t end_offset, count; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE); + if (ret) { + if (ret == -ENOENT) { + page_idx = f2fs_get_next_page_offset(&dn, + page_idx); + ret = 0; + continue; + } + break; + } + + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); + count = round_up(count, F2FS_I(inode)->i_cluster_size); + + ret = release_compress_blocks(&dn, count); + + f2fs_put_dnode(&dn); + + if (ret < 0) + break; + + page_idx += count; + released_blocks += ret; + } + + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); +out: + inode_unlock(inode); + + mnt_drop_write_file(filp); + + if (ret >= 0) { + ret = put_user(released_blocks, (u64 __user *)arg); + } else if (released_blocks && + atomic_read(&F2FS_I(inode)->i_compr_blocks)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx " + "iblocks=%llu, released=%u, compr_blocks=%u, " + "run fsck to fix.", + __func__, inode->i_ino, inode->i_blocks, + released_blocks, + atomic_read(&F2FS_I(inode)->i_compr_blocks)); + } + + return ret; +} + +static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + unsigned int reserved_blocks = 0; + int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + block_t blkaddr; + int i; + + for (i = 0; i < count; i++) { + blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + if (!__is_valid_data_blkaddr(blkaddr)) + continue; + if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + return -EFSCORRUPTED; + } + } + + while (count) { + int compr_blocks = 0; + blkcnt_t reserved; + int ret; + + for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) { + blkaddr = f2fs_data_blkaddr(dn); + + if (i == 0) { + if (blkaddr == COMPRESS_ADDR) + continue; + dn->ofs_in_node += cluster_size; + goto next; + } + + if (__is_valid_data_blkaddr(blkaddr)) { + compr_blocks++; + continue; + } + + dn->data_blkaddr = NEW_ADDR; + f2fs_set_data_blkaddr(dn); + } + + reserved = cluster_size - compr_blocks; + ret = inc_valid_block_count(sbi, dn->inode, &reserved); + if (ret) + return ret; + + if (reserved != cluster_size - compr_blocks) + return -ENOSPC; + + f2fs_i_compr_blocks_update(dn->inode, compr_blocks, true); + + reserved_blocks += reserved; +next: + count -= cluster_size; + } + + return reserved_blocks; +} + +static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t page_idx = 0, last_idx; + unsigned int reserved_blocks = 0; + int ret; + + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (atomic_read(&F2FS_I(inode)->i_compr_blocks)) + goto out; + + f2fs_balance_fs(F2FS_I_SB(inode), true); + + inode_lock(inode); + + if (!is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + ret = -EINVAL; + goto unlock_inode; + } + + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + while (page_idx < last_idx) { + struct dnode_of_data dn; + pgoff_t end_offset, count; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE); + if (ret) { + if (ret == -ENOENT) { + page_idx = f2fs_get_next_page_offset(&dn, + page_idx); + ret = 0; + continue; + } + break; + } + + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); + count = round_up(count, F2FS_I(inode)->i_cluster_size); + + ret = reserve_compress_blocks(&dn, count); + + f2fs_put_dnode(&dn); + + if (ret < 0) + break; + + page_idx += count; + reserved_blocks += ret; + } + + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + if (ret >= 0) { + clear_inode_flag(inode, FI_COMPRESS_RELEASED); + inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, true); + } +unlock_inode: + inode_unlock(inode); +out: + mnt_drop_write_file(filp); + + if (ret >= 0) { + ret = put_user(reserved_blocks, (u64 __user *)arg); + } else if (reserved_blocks && + atomic_read(&F2FS_I(inode)->i_compr_blocks)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx " + "iblocks=%llu, reserved=%u, compr_blocks=%u, " + "run fsck to fix.", + __func__, inode->i_ino, inode->i_blocks, + reserved_blocks, + atomic_read(&F2FS_I(inode)->i_compr_blocks)); + } + + return ret; +} + +static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode, + pgoff_t off, block_t block, block_t len, u32 flags) +{ + sector_t sector = SECTOR_FROM_BLOCK(block); + sector_t nr_sects = SECTOR_FROM_BLOCK(len); + int ret = 0; + + if (flags & F2FS_TRIM_FILE_DISCARD) { + if (bdev_max_secure_erase_sectors(bdev)) + ret = blkdev_issue_secure_erase(bdev, sector, nr_sects, + GFP_NOFS); + else + ret = blkdev_issue_discard(bdev, sector, nr_sects, + GFP_NOFS); + } + + if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT)) { + if (IS_ENCRYPTED(inode)) + ret = fscrypt_zeroout_range(inode, off, block, len); + else + ret = blkdev_issue_zeroout(bdev, sector, nr_sects, + GFP_NOFS, 0); + } + + return ret; +} + +static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; + struct block_device *prev_bdev = NULL; + struct f2fs_sectrim_range range; + pgoff_t index, pg_end, prev_index = 0; + block_t prev_block = 0, len = 0; + loff_t end_addr; + bool to_end = false; + int ret = 0; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&range, (struct f2fs_sectrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + if (range.flags == 0 || (range.flags & ~F2FS_TRIM_FILE_MASK) || + !S_ISREG(inode->i_mode)) + return -EINVAL; + + if (((range.flags & F2FS_TRIM_FILE_DISCARD) && + !f2fs_hw_support_discard(sbi)) || + ((range.flags & F2FS_TRIM_FILE_ZEROOUT) && + IS_ENCRYPTED(inode) && f2fs_is_multi_device(sbi))) + return -EOPNOTSUPP; + + file_start_write(filp); + inode_lock(inode); + + if (f2fs_is_atomic_file(inode) || f2fs_compressed_file(inode) || + range.start >= inode->i_size) { + ret = -EINVAL; + goto err; + } + + if (range.len == 0) + goto err; + + if (inode->i_size - range.start > range.len) { + end_addr = range.start + range.len; + } else { + end_addr = range.len == (u64)-1 ? + sbi->sb->s_maxbytes : inode->i_size; + to_end = true; + } + + if (!IS_ALIGNED(range.start, F2FS_BLKSIZE) || + (!to_end && !IS_ALIGNED(end_addr, F2FS_BLKSIZE))) { + ret = -EINVAL; + goto err; + } + + index = F2FS_BYTES_TO_BLK(range.start); + pg_end = DIV_ROUND_UP(end_addr, F2FS_BLKSIZE); + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto err; + + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(mapping); + + ret = filemap_write_and_wait_range(mapping, range.start, + to_end ? LLONG_MAX : end_addr - 1); + if (ret) + goto out; + + truncate_inode_pages_range(mapping, range.start, + to_end ? -1 : end_addr - 1); + + while (index < pg_end) { + struct dnode_of_data dn; + pgoff_t end_offset, count; + int i; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (ret) { + if (ret == -ENOENT) { + index = f2fs_get_next_page_offset(&dn, index); + continue; + } + goto out; + } + + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + count = min(end_offset - dn.ofs_in_node, pg_end - index); + for (i = 0; i < count; i++, index++, dn.ofs_in_node++) { + struct block_device *cur_bdev; + block_t blkaddr = f2fs_data_blkaddr(&dn); + + if (!__is_valid_data_blkaddr(blkaddr)) + continue; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE)) { + ret = -EFSCORRUPTED; + f2fs_put_dnode(&dn); + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); + goto out; + } + + cur_bdev = f2fs_target_device(sbi, blkaddr, NULL); + if (f2fs_is_multi_device(sbi)) { + int di = f2fs_target_device_index(sbi, blkaddr); + + blkaddr -= FDEV(di).start_blk; + } + + if (len) { + if (prev_bdev == cur_bdev && + index == prev_index + len && + blkaddr == prev_block + len) { + len++; + } else { + ret = f2fs_secure_erase(prev_bdev, + inode, prev_index, prev_block, + len, range.flags); + if (ret) { + f2fs_put_dnode(&dn); + goto out; + } + + len = 0; + } + } + + if (!len) { + prev_bdev = cur_bdev; + prev_index = index; + prev_block = blkaddr; + len = 1; + } + } + + f2fs_put_dnode(&dn); + + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + cond_resched(); + } + + if (len) + ret = f2fs_secure_erase(prev_bdev, inode, prev_index, + prev_block, len, range.flags); +out: + filemap_invalidate_unlock(mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); +err: + inode_unlock(inode); + file_end_write(filp); + + return ret; +} + +static int f2fs_ioc_get_compress_option(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_comp_option option; + + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + inode_lock_shared(inode); + + if (!f2fs_compressed_file(inode)) { + inode_unlock_shared(inode); + return -ENODATA; + } + + option.algorithm = F2FS_I(inode)->i_compress_algorithm; + option.log_cluster_size = F2FS_I(inode)->i_log_cluster_size; + + inode_unlock_shared(inode); + + if (copy_to_user((struct f2fs_comp_option __user *)arg, &option, + sizeof(option))) + return -EFAULT; + + return 0; +} + +static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_comp_option option; + int ret = 0; + + if (!f2fs_sb_has_compression(sbi)) + return -EOPNOTSUPP; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&option, (struct f2fs_comp_option __user *)arg, + sizeof(option))) + return -EFAULT; + + if (!f2fs_compressed_file(inode) || + option.log_cluster_size < MIN_COMPRESS_LOG_SIZE || + option.log_cluster_size > MAX_COMPRESS_LOG_SIZE || + option.algorithm >= COMPRESS_MAX) + return -EINVAL; + + file_start_write(filp); + inode_lock(inode); + + f2fs_down_write(&F2FS_I(inode)->i_sem); + if (f2fs_is_mmap_file(inode) || get_dirty_pages(inode)) { + ret = -EBUSY; + goto out; + } + + if (F2FS_HAS_BLOCKS(inode)) { + ret = -EFBIG; + goto out; + } + + F2FS_I(inode)->i_compress_algorithm = option.algorithm; + F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size; + F2FS_I(inode)->i_cluster_size = BIT(option.log_cluster_size); + /* Set default level */ + if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) + F2FS_I(inode)->i_compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; + else + F2FS_I(inode)->i_compress_level = 0; + /* Adjust mount option level */ + if (option.algorithm == F2FS_OPTION(sbi).compress_algorithm && + F2FS_OPTION(sbi).compress_level) + F2FS_I(inode)->i_compress_level = F2FS_OPTION(sbi).compress_level; + f2fs_mark_inode_dirty_sync(inode, true); + + if (!f2fs_is_compress_backend_ready(inode)) + f2fs_warn(sbi, "compression algorithm is successfully set, " + "but current kernel doesn't support this algorithm."); +out: + f2fs_up_write(&F2FS_I(inode)->i_sem); + inode_unlock(inode); + file_end_write(filp); + + return ret; +} + +static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) +{ + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, page_idx); + struct address_space *mapping = inode->i_mapping; + struct page *page; + pgoff_t redirty_idx = page_idx; + int i, page_len = 0, ret = 0; + + page_cache_ra_unbounded(&ractl, len, 0); + + for (i = 0; i < len; i++, page_idx++) { + page = read_cache_page(mapping, page_idx, NULL, NULL); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + break; + } + page_len++; + } + + for (i = 0; i < page_len; i++, redirty_idx++) { + page = find_lock_page(mapping, redirty_idx); + + /* It will never fail, when page has pinned above */ + f2fs_bug_on(F2FS_I_SB(inode), !page); + + set_page_dirty(page); + f2fs_put_page(page, 1); + f2fs_put_page(page, 0); + } + + return ret; +} + +static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + pgoff_t page_idx = 0, last_idx; + unsigned int blk_per_seg = sbi->blocks_per_seg; + int cluster_size = fi->i_cluster_size; + int count, ret; + + if (!f2fs_sb_has_compression(sbi) || + F2FS_OPTION(sbi).compress_mode != COMPR_MODE_USER) + return -EOPNOTSUPP; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + f2fs_balance_fs(F2FS_I_SB(inode), true); + + file_start_write(filp); + inode_lock(inode); + + if (!f2fs_is_compress_backend_ready(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + ret = -EINVAL; + goto out; + } + + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + goto out; + + if (!atomic_read(&fi->i_compr_blocks)) + goto out; + + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + count = last_idx - page_idx; + while (count) { + int len = min(cluster_size, count); + + ret = redirty_blocks(inode, page_idx, len); + if (ret < 0) + break; + + if (get_dirty_pages(inode) >= blk_per_seg) + filemap_fdatawrite(inode->i_mapping); + + count -= len; + page_idx += len; + } + + if (!ret) + ret = filemap_write_and_wait_range(inode->i_mapping, 0, + LLONG_MAX); + + if (ret) + f2fs_warn(sbi, "%s: The file might be partially decompressed (errno=%d). Please delete the file.", + __func__, ret); +out: + inode_unlock(inode); + file_end_write(filp); + + return ret; +} + +static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t page_idx = 0, last_idx; + unsigned int blk_per_seg = sbi->blocks_per_seg; + int cluster_size = F2FS_I(inode)->i_cluster_size; + int count, ret; + + if (!f2fs_sb_has_compression(sbi) || + F2FS_OPTION(sbi).compress_mode != COMPR_MODE_USER) + return -EOPNOTSUPP; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + f2fs_balance_fs(F2FS_I_SB(inode), true); + + file_start_write(filp); + inode_lock(inode); + + if (!f2fs_is_compress_backend_ready(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + ret = -EINVAL; + goto out; + } + + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + goto out; + + set_inode_flag(inode, FI_ENABLE_COMPRESS); + + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + count = last_idx - page_idx; + while (count) { + int len = min(cluster_size, count); + + ret = redirty_blocks(inode, page_idx, len); + if (ret < 0) + break; + + if (get_dirty_pages(inode) >= blk_per_seg) + filemap_fdatawrite(inode->i_mapping); + + count -= len; + page_idx += len; + } + + if (!ret) + ret = filemap_write_and_wait_range(inode->i_mapping, 0, + LLONG_MAX); + + clear_inode_flag(inode, FI_ENABLE_COMPRESS); + + if (ret) + f2fs_warn(sbi, "%s: The file might be partially compressed (errno=%d). Please delete the file.", + __func__, ret); +out: + inode_unlock(inode); + file_end_write(filp); + + return ret; +} + +static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC_GETVERSION: + return f2fs_ioc_getversion(filp, arg); + case F2FS_IOC_START_ATOMIC_WRITE: + return f2fs_ioc_start_atomic_write(filp); + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + return f2fs_ioc_commit_atomic_write(filp); + case F2FS_IOC_ABORT_ATOMIC_WRITE: + return f2fs_ioc_abort_atomic_write(filp); + case F2FS_IOC_START_VOLATILE_WRITE: + case F2FS_IOC_RELEASE_VOLATILE_WRITE: + return -EOPNOTSUPP; + case F2FS_IOC_SHUTDOWN: + return f2fs_ioc_shutdown(filp, arg); + case FITRIM: + return f2fs_ioc_fitrim(filp, arg); + case FS_IOC_SET_ENCRYPTION_POLICY: + return f2fs_ioc_set_encryption_policy(filp, arg); + case FS_IOC_GET_ENCRYPTION_POLICY: + return f2fs_ioc_get_encryption_policy(filp, arg); + case FS_IOC_GET_ENCRYPTION_PWSALT: + return f2fs_ioc_get_encryption_pwsalt(filp, arg); + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + return f2fs_ioc_get_encryption_policy_ex(filp, arg); + case FS_IOC_ADD_ENCRYPTION_KEY: + return f2fs_ioc_add_encryption_key(filp, arg); + case FS_IOC_REMOVE_ENCRYPTION_KEY: + return f2fs_ioc_remove_encryption_key(filp, arg); + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + return f2fs_ioc_remove_encryption_key_all_users(filp, arg); + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + return f2fs_ioc_get_encryption_key_status(filp, arg); + case FS_IOC_GET_ENCRYPTION_NONCE: + return f2fs_ioc_get_encryption_nonce(filp, arg); + case F2FS_IOC_GARBAGE_COLLECT: + return f2fs_ioc_gc(filp, arg); + case F2FS_IOC_GARBAGE_COLLECT_RANGE: + return f2fs_ioc_gc_range(filp, arg); + case F2FS_IOC_WRITE_CHECKPOINT: + return f2fs_ioc_write_checkpoint(filp, arg); + case F2FS_IOC_DEFRAGMENT: + return f2fs_ioc_defragment(filp, arg); + case F2FS_IOC_MOVE_RANGE: + return f2fs_ioc_move_range(filp, arg); + case F2FS_IOC_FLUSH_DEVICE: + return f2fs_ioc_flush_device(filp, arg); + case F2FS_IOC_GET_FEATURES: + return f2fs_ioc_get_features(filp, arg); + case F2FS_IOC_GET_PIN_FILE: + return f2fs_ioc_get_pin_file(filp, arg); + case F2FS_IOC_SET_PIN_FILE: + return f2fs_ioc_set_pin_file(filp, arg); + case F2FS_IOC_PRECACHE_EXTENTS: + return f2fs_ioc_precache_extents(filp, arg); + case F2FS_IOC_RESIZE_FS: + return f2fs_ioc_resize_fs(filp, arg); + case FS_IOC_ENABLE_VERITY: + return f2fs_ioc_enable_verity(filp, arg); + case FS_IOC_MEASURE_VERITY: + return f2fs_ioc_measure_verity(filp, arg); + case FS_IOC_READ_VERITY_METADATA: + return f2fs_ioc_read_verity_metadata(filp, arg); + case FS_IOC_GETFSLABEL: + return f2fs_ioc_getfslabel(filp, arg); + case FS_IOC_SETFSLABEL: + return f2fs_ioc_setfslabel(filp, arg); + case F2FS_IOC_GET_COMPRESS_BLOCKS: + return f2fs_get_compress_blocks(filp, arg); + case F2FS_IOC_RELEASE_COMPRESS_BLOCKS: + return f2fs_release_compress_blocks(filp, arg); + case F2FS_IOC_RESERVE_COMPRESS_BLOCKS: + return f2fs_reserve_compress_blocks(filp, arg); + case F2FS_IOC_SEC_TRIM_FILE: + return f2fs_sec_trim_file(filp, arg); + case F2FS_IOC_GET_COMPRESS_OPTION: + return f2fs_ioc_get_compress_option(filp, arg); + case F2FS_IOC_SET_COMPRESS_OPTION: + return f2fs_ioc_set_compress_option(filp, arg); + case F2FS_IOC_DECOMPRESS_FILE: + return f2fs_ioc_decompress_file(filp, arg); + case F2FS_IOC_COMPRESS_FILE: + return f2fs_ioc_compress_file(filp, arg); + default: + return -ENOTTY; + } +} + +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) + return -EIO; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(filp)))) + return -ENOSPC; + + return __f2fs_ioctl(filp, cmd, arg); +} + +/* + * Return %true if the given read or write request should use direct I/O, or + * %false if it should use buffered I/O. + */ +static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb, + struct iov_iter *iter) +{ + unsigned int align; + + if (!(iocb->ki_flags & IOCB_DIRECT)) + return false; + + if (f2fs_force_buffered_io(inode, iov_iter_rw(iter))) + return false; + + /* + * Direct I/O not aligned to the disk's logical_block_size will be + * attempted, but will fail with -EINVAL. + * + * f2fs additionally requires that direct I/O be aligned to the + * filesystem block size, which is often a stricter requirement. + * However, f2fs traditionally falls back to buffered I/O on requests + * that are logical_block_size-aligned but not fs-block aligned. + * + * The below logic implements this behavior. + */ + align = iocb->ki_pos | iov_iter_alignment(iter); + if (!IS_ALIGNED(align, i_blocksize(inode)) && + IS_ALIGNED(align, bdev_logical_block_size(inode->i_sb->s_bdev))) + return false; + + return true; +} + +static int f2fs_dio_read_end_io(struct kiocb *iocb, ssize_t size, int error, + unsigned int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp)); + + dec_page_count(sbi, F2FS_DIO_READ); + if (error) + return error; + f2fs_update_iostat(sbi, NULL, APP_DIRECT_READ_IO, size); + return 0; +} + +static const struct iomap_dio_ops f2fs_iomap_dio_read_ops = { + .end_io = f2fs_dio_read_end_io, +}; + +static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + const loff_t pos = iocb->ki_pos; + const size_t count = iov_iter_count(to); + struct iomap_dio *dio; + ssize_t ret; + + if (count == 0) + return 0; /* skip atime update */ + + trace_f2fs_direct_IO_enter(inode, iocb, count, READ); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) { + ret = -EAGAIN; + goto out; + } + } else { + f2fs_down_read(&fi->i_gc_rwsem[READ]); + } + + /* + * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of + * the higher-level function iomap_dio_rw() in order to ensure that the + * F2FS_DIO_READ counter will be decremented correctly in all cases. + */ + inc_page_count(sbi, F2FS_DIO_READ); + dio = __iomap_dio_rw(iocb, to, &f2fs_iomap_ops, + &f2fs_iomap_dio_read_ops, 0, NULL, 0); + if (IS_ERR_OR_NULL(dio)) { + ret = PTR_ERR_OR_ZERO(dio); + if (ret != -EIOCBQUEUED) + dec_page_count(sbi, F2FS_DIO_READ); + } else { + ret = iomap_dio_complete(dio); + } + + f2fs_up_read(&fi->i_gc_rwsem[READ]); + + file_accessed(file); +out: + trace_f2fs_direct_IO_exit(inode, pos, count, READ, ret); + return ret; +} + +static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + const loff_t pos = iocb->ki_pos; + ssize_t ret; + + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + + if (trace_f2fs_dataread_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL); + char *path; + + if (!p) + goto skip_read_trace; + + path = dentry_path_raw(file_dentry(iocb->ki_filp), p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_read_trace; + } + + trace_f2fs_dataread_start(inode, pos, iov_iter_count(to), + current->pid, path, current->comm); + kfree(p); + } +skip_read_trace: + if (f2fs_should_use_dio(inode, iocb, to)) { + ret = f2fs_dio_read_iter(iocb, to); + } else { + ret = filemap_read(iocb, to, 0); + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_READ_IO, ret); + } + if (trace_f2fs_dataread_end_enabled()) + trace_f2fs_dataread_end(inode, pos, ret); + return ret; +} + +static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + ssize_t count; + int err; + + if (IS_IMMUTABLE(inode)) + return -EPERM; + + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + return -EPERM; + + count = generic_write_checks(iocb, from); + if (count <= 0) + return count; + + err = file_modified(file); + if (err) + return err; + return count; +} + +/* + * Preallocate blocks for a write request, if it is possible and helpful to do + * so. Returns a positive number if blocks may have been preallocated, 0 if no + * blocks were preallocated, or a negative errno value if something went + * seriously wrong. Also sets FI_PREALLOCATED_ALL on the inode if *all* the + * requested blocks (not just some of them) have been allocated. + */ +static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter, + bool dio) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + const loff_t pos = iocb->ki_pos; + const size_t count = iov_iter_count(iter); + struct f2fs_map_blocks map = {}; + int flag; + int ret; + + /* If it will be an out-of-place direct write, don't bother. */ + if (dio && f2fs_lfs_mode(sbi)) + return 0; + /* + * Don't preallocate holes aligned to DIO_SKIP_HOLES which turns into + * buffered IO, if DIO meets any holes. + */ + if (dio && i_size_read(inode) && + (F2FS_BYTES_TO_BLK(pos) < F2FS_BLK_ALIGN(i_size_read(inode)))) + return 0; + + /* No-wait I/O can't allocate blocks. */ + if (iocb->ki_flags & IOCB_NOWAIT) + return 0; + + /* If it will be a short write, don't bother. */ + if (fault_in_iov_iter_readable(iter, count)) + return 0; + + if (f2fs_has_inline_data(inode)) { + /* If the data will fit inline, don't bother. */ + if (pos + count <= MAX_INLINE_DATA(inode)) + return 0; + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + + /* Do not preallocate blocks that will be written partially in 4KB. */ + map.m_lblk = F2FS_BLK_ALIGN(pos); + map.m_len = F2FS_BYTES_TO_BLK(pos + count); + if (map.m_len > map.m_lblk) + map.m_len -= map.m_lblk; + else + map.m_len = 0; + map.m_may_create = true; + if (dio) { + map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); + flag = F2FS_GET_BLOCK_PRE_DIO; + } else { + map.m_seg_type = NO_CHECK_TYPE; + flag = F2FS_GET_BLOCK_PRE_AIO; + } + + ret = f2fs_map_blocks(inode, &map, 1, flag); + /* -ENOSPC|-EDQUOT are fine to report the number of allocated blocks. */ + if (ret < 0 && !((ret == -ENOSPC || ret == -EDQUOT) && map.m_len > 0)) + return ret; + if (ret == 0) + set_inode_flag(inode, FI_PREALLOCATED_ALL); + return map.m_len; +} + +static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + + current->backing_dev_info = inode_to_bdi(inode); + ret = generic_perform_write(iocb, from); + current->backing_dev_info = NULL; + + if (ret > 0) { + iocb->ki_pos += ret; + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_IO, ret); + } + return ret; +} + +static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, + unsigned int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp)); + + dec_page_count(sbi, F2FS_DIO_WRITE); + if (error) + return error; + f2fs_update_iostat(sbi, NULL, APP_DIRECT_IO, size); + return 0; +} + +static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = { + .end_io = f2fs_dio_write_end_io, +}; + +static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, + bool *may_need_sync) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + const bool do_opu = f2fs_lfs_mode(sbi); + const loff_t pos = iocb->ki_pos; + const ssize_t count = iov_iter_count(from); + unsigned int dio_flags; + struct iomap_dio *dio; + ssize_t ret; + + trace_f2fs_direct_IO_enter(inode, iocb, count, WRITE); + + if (iocb->ki_flags & IOCB_NOWAIT) { + /* f2fs_convert_inline_inode() and block allocation can block */ + if (f2fs_has_inline_data(inode) || + !f2fs_overwrite_io(inode, pos, count)) { + ret = -EAGAIN; + goto out; + } + + if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[WRITE])) { + ret = -EAGAIN; + goto out; + } + if (do_opu && !f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) { + f2fs_up_read(&fi->i_gc_rwsem[WRITE]); + ret = -EAGAIN; + goto out; + } + } else { + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + f2fs_down_read(&fi->i_gc_rwsem[WRITE]); + if (do_opu) + f2fs_down_read(&fi->i_gc_rwsem[READ]); + } + + /* + * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of + * the higher-level function iomap_dio_rw() in order to ensure that the + * F2FS_DIO_WRITE counter will be decremented correctly in all cases. + */ + inc_page_count(sbi, F2FS_DIO_WRITE); + dio_flags = 0; + if (pos + count > inode->i_size) + dio_flags |= IOMAP_DIO_FORCE_WAIT; + dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops, + &f2fs_iomap_dio_write_ops, dio_flags, NULL, 0); + if (IS_ERR_OR_NULL(dio)) { + ret = PTR_ERR_OR_ZERO(dio); + if (ret == -ENOTBLK) + ret = 0; + if (ret != -EIOCBQUEUED) + dec_page_count(sbi, F2FS_DIO_WRITE); + } else { + ret = iomap_dio_complete(dio); + } + + if (do_opu) + f2fs_up_read(&fi->i_gc_rwsem[READ]); + f2fs_up_read(&fi->i_gc_rwsem[WRITE]); + + if (ret < 0) + goto out; + if (pos + ret > inode->i_size) + f2fs_i_size_write(inode, pos + ret); + if (!do_opu) + set_inode_flag(inode, FI_UPDATE_WRITE); + + if (iov_iter_count(from)) { + ssize_t ret2; + loff_t bufio_start_pos = iocb->ki_pos; + + /* + * The direct write was partial, so we need to fall back to a + * buffered write for the remainder. + */ + + ret2 = f2fs_buffered_write_iter(iocb, from); + if (iov_iter_count(from)) + f2fs_write_failed(inode, iocb->ki_pos); + if (ret2 < 0) + goto out; + + /* + * Ensure that the pagecache pages are written to disk and + * invalidated to preserve the expected O_DIRECT semantics. + */ + if (ret2 > 0) { + loff_t bufio_end_pos = bufio_start_pos + ret2 - 1; + + ret += ret2; + + ret2 = filemap_write_and_wait_range(file->f_mapping, + bufio_start_pos, + bufio_end_pos); + if (ret2 < 0) + goto out; + invalidate_mapping_pages(file->f_mapping, + bufio_start_pos >> PAGE_SHIFT, + bufio_end_pos >> PAGE_SHIFT); + } + } else { + /* iomap_dio_rw() already handled the generic_write_sync(). */ + *may_need_sync = false; + } +out: + trace_f2fs_direct_IO_exit(inode, pos, count, WRITE, ret); + return ret; +} + +static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + const loff_t orig_pos = iocb->ki_pos; + const size_t orig_count = iov_iter_count(from); + loff_t target_size; + bool dio; + bool may_need_sync = true; + int preallocated; + ssize_t ret; + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { + ret = -EIO; + goto out; + } + + if (!f2fs_is_compress_backend_ready(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) { + ret = -EAGAIN; + goto out; + } + } else { + inode_lock(inode); + } + + ret = f2fs_write_checks(iocb, from); + if (ret <= 0) + goto out_unlock; + + /* Determine whether we will do a direct write or a buffered write. */ + dio = f2fs_should_use_dio(inode, iocb, from); + + /* Possibly preallocate the blocks for the write. */ + target_size = iocb->ki_pos + iov_iter_count(from); + preallocated = f2fs_preallocate_blocks(iocb, from, dio); + if (preallocated < 0) { + ret = preallocated; + } else { + if (trace_f2fs_datawrite_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), + PATH_MAX, GFP_KERNEL); + char *path; + + if (!p) + goto skip_write_trace; + path = dentry_path_raw(file_dentry(iocb->ki_filp), + p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_write_trace; + } + trace_f2fs_datawrite_start(inode, orig_pos, orig_count, + current->pid, path, current->comm); + kfree(p); + } +skip_write_trace: + /* Do the actual write. */ + ret = dio ? + f2fs_dio_write_iter(iocb, from, &may_need_sync) : + f2fs_buffered_write_iter(iocb, from); + + if (trace_f2fs_datawrite_end_enabled()) + trace_f2fs_datawrite_end(inode, orig_pos, ret); + } + + /* Don't leave any preallocated blocks around past i_size. */ + if (preallocated && i_size_read(inode) < target_size) { + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + if (!f2fs_truncate(inode)) + file_dont_truncate(inode); + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } else { + file_dont_truncate(inode); + } + + clear_inode_flag(inode, FI_PREALLOCATED_ALL); +out_unlock: + inode_unlock(inode); +out: + trace_f2fs_file_write_iter(inode, orig_pos, orig_count, ret); + if (ret > 0 && may_need_sync) + ret = generic_write_sync(iocb, ret); + return ret; +} + +static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, + int advice) +{ + struct address_space *mapping; + struct backing_dev_info *bdi; + struct inode *inode = file_inode(filp); + int err; + + if (advice == POSIX_FADV_SEQUENTIAL) { + if (S_ISFIFO(inode->i_mode)) + return -ESPIPE; + + mapping = filp->f_mapping; + if (!mapping || len < 0) + return -EINVAL; + + bdi = inode_to_bdi(mapping->host); + filp->f_ra.ra_pages = bdi->ra_pages * + F2FS_I_SB(inode)->seq_file_ra_mul; + spin_lock(&filp->f_lock); + filp->f_mode &= ~FMODE_RANDOM; + spin_unlock(&filp->f_lock); + return 0; + } + + err = generic_fadvise(filp, offset, len, advice); + if (!err && advice == POSIX_FADV_DONTNEED && + test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) && + f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino); + + return err; +} + +#ifdef CONFIG_COMPAT +struct compat_f2fs_gc_range { + u32 sync; + compat_u64 start; + compat_u64 len; +}; +#define F2FS_IOC32_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11,\ + struct compat_f2fs_gc_range) + +static int f2fs_compat_ioc_gc_range(struct file *file, unsigned long arg) +{ + struct compat_f2fs_gc_range __user *urange; + struct f2fs_gc_range range; + int err; + + urange = compat_ptr(arg); + err = get_user(range.sync, &urange->sync); + err |= get_user(range.start, &urange->start); + err |= get_user(range.len, &urange->len); + if (err) + return -EFAULT; + + return __f2fs_ioc_gc_range(file, &range); +} + +struct compat_f2fs_move_range { + u32 dst_fd; + compat_u64 pos_in; + compat_u64 pos_out; + compat_u64 len; +}; +#define F2FS_IOC32_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ + struct compat_f2fs_move_range) + +static int f2fs_compat_ioc_move_range(struct file *file, unsigned long arg) +{ + struct compat_f2fs_move_range __user *urange; + struct f2fs_move_range range; + int err; + + urange = compat_ptr(arg); + err = get_user(range.dst_fd, &urange->dst_fd); + err |= get_user(range.pos_in, &urange->pos_in); + err |= get_user(range.pos_out, &urange->pos_out); + err |= get_user(range.len, &urange->len); + if (err) + return -EFAULT; + + return __f2fs_ioc_move_range(file, &range); +} + +long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file))))) + return -EIO; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(file)))) + return -ENOSPC; + + switch (cmd) { + case FS_IOC32_GETVERSION: + cmd = FS_IOC_GETVERSION; + break; + case F2FS_IOC32_GARBAGE_COLLECT_RANGE: + return f2fs_compat_ioc_gc_range(file, arg); + case F2FS_IOC32_MOVE_RANGE: + return f2fs_compat_ioc_move_range(file, arg); + case F2FS_IOC_START_ATOMIC_WRITE: + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + case F2FS_IOC_START_VOLATILE_WRITE: + case F2FS_IOC_RELEASE_VOLATILE_WRITE: + case F2FS_IOC_ABORT_ATOMIC_WRITE: + case F2FS_IOC_SHUTDOWN: + case FITRIM: + case FS_IOC_SET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_PWSALT: + case FS_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + case FS_IOC_ADD_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + case FS_IOC_GET_ENCRYPTION_NONCE: + case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_WRITE_CHECKPOINT: + case F2FS_IOC_DEFRAGMENT: + case F2FS_IOC_FLUSH_DEVICE: + case F2FS_IOC_GET_FEATURES: + case F2FS_IOC_GET_PIN_FILE: + case F2FS_IOC_SET_PIN_FILE: + case F2FS_IOC_PRECACHE_EXTENTS: + case F2FS_IOC_RESIZE_FS: + case FS_IOC_ENABLE_VERITY: + case FS_IOC_MEASURE_VERITY: + case FS_IOC_READ_VERITY_METADATA: + case FS_IOC_GETFSLABEL: + case FS_IOC_SETFSLABEL: + case F2FS_IOC_GET_COMPRESS_BLOCKS: + case F2FS_IOC_RELEASE_COMPRESS_BLOCKS: + case F2FS_IOC_RESERVE_COMPRESS_BLOCKS: + case F2FS_IOC_SEC_TRIM_FILE: + case F2FS_IOC_GET_COMPRESS_OPTION: + case F2FS_IOC_SET_COMPRESS_OPTION: + case F2FS_IOC_DECOMPRESS_FILE: + case F2FS_IOC_COMPRESS_FILE: + break; + default: + return -ENOIOCTLCMD; + } + return __f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + +const struct file_operations f2fs_file_operations = { + .llseek = f2fs_llseek, + .read_iter = f2fs_file_read_iter, + .write_iter = f2fs_file_write_iter, + .open = f2fs_file_open, + .release = f2fs_release_file, + .mmap = f2fs_file_mmap, + .flush = f2fs_file_flush, + .fsync = f2fs_sync_file, + .fallocate = f2fs_fallocate, + .unlocked_ioctl = f2fs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = f2fs_compat_ioctl, +#endif + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .fadvise = f2fs_file_fadvise, +}; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c new file mode 100644 index 000000000..ec7212f7a --- /dev/null +++ b/fs/f2fs/gc.c @@ -0,0 +1,2250 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/gc.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + */ +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/f2fs_fs.h> +#include <linux/kthread.h> +#include <linux/delay.h> +#include <linux/freezer.h> +#include <linux/sched/signal.h> +#include <linux/random.h> +#include <linux/sched/mm.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "gc.h" +#include "iostat.h" +#include <trace/events/f2fs.h> + +static struct kmem_cache *victim_entry_slab; + +static unsigned int count_bits(const unsigned long *addr, + unsigned int offset, unsigned int len); + +static int gc_thread_func(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct f2fs_gc_kthread *gc_th = sbi->gc_thread; + wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; + wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq; + unsigned int wait_ms; + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .should_migrate_blocks = false, + .err_gc_skipped = false }; + + wait_ms = gc_th->min_sleep_time; + + set_freezable(); + do { + bool sync_mode, foreground = false; + + wait_event_interruptible_timeout(*wq, + kthread_should_stop() || freezing(current) || + waitqueue_active(fggc_wq) || + gc_th->gc_wake, + msecs_to_jiffies(wait_ms)); + + if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq)) + foreground = true; + + /* give it a try one time */ + if (gc_th->gc_wake) + gc_th->gc_wake = 0; + + if (try_to_freeze()) { + stat_other_skip_bggc_count(sbi); + continue; + } + if (kthread_should_stop()) + break; + + if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) { + increase_sleep_time(gc_th, &wait_ms); + stat_other_skip_bggc_count(sbi); + continue; + } + + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FAULT_INJECT); + } + + if (!sb_start_write_trylock(sbi->sb)) { + stat_other_skip_bggc_count(sbi); + continue; + } + + /* + * [GC triggering condition] + * 0. GC is not conducted currently. + * 1. There are enough dirty segments. + * 2. IO subsystem is idle by checking the # of writeback pages. + * 3. IO subsystem is idle by checking the # of requests in + * bdev's request list. + * + * Note) We have to avoid triggering GCs frequently. + * Because it is possible that some segments can be + * invalidated soon after by user update or deletion. + * So, I'd like to wait some time to collect dirty segments. + */ + if (sbi->gc_mode == GC_URGENT_HIGH || + sbi->gc_mode == GC_URGENT_MID) { + wait_ms = gc_th->urgent_sleep_time; + f2fs_down_write(&sbi->gc_lock); + goto do_gc; + } + + if (foreground) { + f2fs_down_write(&sbi->gc_lock); + goto do_gc; + } else if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + stat_other_skip_bggc_count(sbi); + goto next; + } + + if (!is_idle(sbi, GC_TIME)) { + increase_sleep_time(gc_th, &wait_ms); + f2fs_up_write(&sbi->gc_lock); + stat_io_skip_bggc_count(sbi); + goto next; + } + + if (has_enough_invalid_blocks(sbi)) + decrease_sleep_time(gc_th, &wait_ms); + else + increase_sleep_time(gc_th, &wait_ms); +do_gc: + if (!foreground) + stat_inc_bggc_count(sbi->stat_info); + + sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC; + + /* foreground GC was been triggered via f2fs_balance_fs() */ + if (foreground) + sync_mode = false; + + gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC; + gc_control.no_bg_gc = foreground; + gc_control.nr_free_secs = foreground ? 1 : 0; + + /* if return value is not zero, no victim was selected */ + if (f2fs_gc(sbi, &gc_control)) { + /* don't bother wait_ms by foreground gc */ + if (!foreground) + wait_ms = gc_th->no_gc_sleep_time; + } + + if (foreground) + wake_up_all(&gc_th->fggc_wq); + + trace_f2fs_background_gc(sbi->sb, wait_ms, + prefree_segments(sbi), free_segments(sbi)); + + /* balancing f2fs's metadata periodically */ + f2fs_balance_fs_bg(sbi, true); +next: + if (sbi->gc_mode == GC_URGENT_HIGH) { + spin_lock(&sbi->gc_urgent_high_lock); + if (sbi->gc_urgent_high_remaining) { + sbi->gc_urgent_high_remaining--; + if (!sbi->gc_urgent_high_remaining) + sbi->gc_mode = GC_NORMAL; + } + spin_unlock(&sbi->gc_urgent_high_lock); + } + sb_end_write(sbi->sb); + + } while (!kthread_should_stop()); + return 0; +} + +int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) +{ + struct f2fs_gc_kthread *gc_th; + dev_t dev = sbi->sb->s_bdev->bd_dev; + int err = 0; + + gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); + if (!gc_th) { + err = -ENOMEM; + goto out; + } + + gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; + gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; + gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; + gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; + + gc_th->gc_wake = 0; + + sbi->gc_thread = gc_th; + init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); + init_waitqueue_head(&sbi->gc_thread->fggc_wq); + sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, + "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(gc_th->f2fs_gc_task)) { + err = PTR_ERR(gc_th->f2fs_gc_task); + kfree(gc_th); + sbi->gc_thread = NULL; + } +out: + return err; +} + +void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) +{ + struct f2fs_gc_kthread *gc_th = sbi->gc_thread; + + if (!gc_th) + return; + kthread_stop(gc_th->f2fs_gc_task); + wake_up_all(&gc_th->fggc_wq); + kfree(gc_th); + sbi->gc_thread = NULL; +} + +static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type) +{ + int gc_mode; + + if (gc_type == BG_GC) { + if (sbi->am.atgc_enabled) + gc_mode = GC_AT; + else + gc_mode = GC_CB; + } else { + gc_mode = GC_GREEDY; + } + + switch (sbi->gc_mode) { + case GC_IDLE_CB: + gc_mode = GC_CB; + break; + case GC_IDLE_GREEDY: + case GC_URGENT_HIGH: + gc_mode = GC_GREEDY; + break; + case GC_IDLE_AT: + gc_mode = GC_AT; + break; + } + + return gc_mode; +} + +static void select_policy(struct f2fs_sb_info *sbi, int gc_type, + int type, struct victim_sel_policy *p) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + + if (p->alloc_mode == SSR) { + p->gc_mode = GC_GREEDY; + p->dirty_bitmap = dirty_i->dirty_segmap[type]; + p->max_search = dirty_i->nr_dirty[type]; + p->ofs_unit = 1; + } else if (p->alloc_mode == AT_SSR) { + p->gc_mode = GC_GREEDY; + p->dirty_bitmap = dirty_i->dirty_segmap[type]; + p->max_search = dirty_i->nr_dirty[type]; + p->ofs_unit = 1; + } else { + p->gc_mode = select_gc_type(sbi, gc_type); + p->ofs_unit = sbi->segs_per_sec; + if (__is_large_section(sbi)) { + p->dirty_bitmap = dirty_i->dirty_secmap; + p->max_search = count_bits(p->dirty_bitmap, + 0, MAIN_SECS(sbi)); + } else { + p->dirty_bitmap = dirty_i->dirty_segmap[DIRTY]; + p->max_search = dirty_i->nr_dirty[DIRTY]; + } + } + + /* + * adjust candidates range, should select all dirty segments for + * foreground GC and urgent GC cases. + */ + if (gc_type != FG_GC && + (sbi->gc_mode != GC_URGENT_HIGH) && + (p->gc_mode != GC_AT && p->alloc_mode != AT_SSR) && + p->max_search > sbi->max_victim_search) + p->max_search = sbi->max_victim_search; + + /* let's select beginning hot/small space first in no_heap mode*/ + if (f2fs_need_rand_seg(sbi)) + p->offset = prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); + else if (test_opt(sbi, NOHEAP) && + (type == CURSEG_HOT_DATA || IS_NODESEG(type))) + p->offset = 0; + else + p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; +} + +static unsigned int get_max_cost(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + /* SSR allocates in a segment unit */ + if (p->alloc_mode == SSR) + return sbi->blocks_per_seg; + else if (p->alloc_mode == AT_SSR) + return UINT_MAX; + + /* LFS */ + if (p->gc_mode == GC_GREEDY) + return 2 * sbi->blocks_per_seg * p->ofs_unit; + else if (p->gc_mode == GC_CB) + return UINT_MAX; + else if (p->gc_mode == GC_AT) + return UINT_MAX; + else /* No other gc_mode */ + return 0; +} + +static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int secno; + + /* + * If the gc_type is FG_GC, we can select victim segments + * selected by background GC before. + * Those segments guarantee they have small valid blocks. + */ + for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { + if (sec_usage_check(sbi, secno)) + continue; + clear_bit(secno, dirty_i->victim_secmap); + return GET_SEG_FROM_SEC(sbi, secno); + } + return NULL_SEGNO; +} + +static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start = GET_SEG_FROM_SEC(sbi, secno); + unsigned long long mtime = 0; + unsigned int vblocks; + unsigned char age = 0; + unsigned char u; + unsigned int i; + unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi, segno); + + for (i = 0; i < usable_segs_per_sec; i++) + mtime += get_seg_entry(sbi, start + i)->mtime; + vblocks = get_valid_blocks(sbi, segno, true); + + mtime = div_u64(mtime, usable_segs_per_sec); + vblocks = div_u64(vblocks, usable_segs_per_sec); + + u = (vblocks * 100) >> sbi->log_blocks_per_seg; + + /* Handle if the system time has changed by the user */ + if (mtime < sit_i->min_mtime) + sit_i->min_mtime = mtime; + if (mtime > sit_i->max_mtime) + sit_i->max_mtime = mtime; + if (sit_i->max_mtime != sit_i->min_mtime) + age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime), + sit_i->max_mtime - sit_i->min_mtime); + + return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); +} + +static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, + unsigned int segno, struct victim_sel_policy *p) +{ + if (p->alloc_mode == SSR) + return get_seg_entry(sbi, segno)->ckpt_valid_blocks; + + /* alloc_mode == LFS */ + if (p->gc_mode == GC_GREEDY) + return get_valid_blocks(sbi, segno, true); + else if (p->gc_mode == GC_CB) + return get_cb_cost(sbi, segno); + + f2fs_bug_on(sbi, 1); + return 0; +} + +static unsigned int count_bits(const unsigned long *addr, + unsigned int offset, unsigned int len) +{ + unsigned int end = offset + len, sum = 0; + + while (offset < end) { + if (test_bit(offset++, addr)) + ++sum; + } + return sum; +} + +static bool f2fs_check_victim_tree(struct f2fs_sb_info *sbi, + struct rb_root_cached *root) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct rb_node *cur = rb_first_cached(root), *next; + struct victim_entry *cur_ve, *next_ve; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_ve = rb_entry(cur, struct victim_entry, rb_node); + next_ve = rb_entry(next, struct victim_entry, rb_node); + + if (cur_ve->mtime > next_ve->mtime) { + f2fs_info(sbi, "broken victim_rbtree, " + "cur_mtime(%llu) next_mtime(%llu)", + cur_ve->mtime, next_ve->mtime); + return false; + } + cur = next; + } +#endif + return true; +} + +static struct victim_entry *__lookup_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime) +{ + struct atgc_management *am = &sbi->am; + struct rb_node *node = am->root.rb_root.rb_node; + struct victim_entry *ve = NULL; + + while (node) { + ve = rb_entry(node, struct victim_entry, rb_node); + + if (mtime < ve->mtime) + node = node->rb_left; + else + node = node->rb_right; + } + return ve; +} + +static struct victim_entry *__create_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime, unsigned int segno) +{ + struct atgc_management *am = &sbi->am; + struct victim_entry *ve; + + ve = f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS, true, NULL); + + ve->mtime = mtime; + ve->segno = segno; + + list_add_tail(&ve->list, &am->victim_list); + am->victim_count++; + + return ve; +} + +static void __insert_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime, unsigned int segno) +{ + struct atgc_management *am = &sbi->am; + struct rb_root_cached *root = &am->root; + struct rb_node **p = &root->rb_root.rb_node; + struct rb_node *parent = NULL; + struct victim_entry *ve; + bool left_most = true; + + /* look up rb tree to find parent node */ + while (*p) { + parent = *p; + ve = rb_entry(parent, struct victim_entry, rb_node); + + if (mtime < ve->mtime) { + p = &(*p)->rb_left; + } else { + p = &(*p)->rb_right; + left_most = false; + } + } + + ve = __create_victim_entry(sbi, mtime, segno); + + rb_link_node(&ve->rb_node, parent, p); + rb_insert_color_cached(&ve->rb_node, root, left_most); +} + +static void add_victim_entry(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p, unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start = GET_SEG_FROM_SEC(sbi, secno); + unsigned long long mtime = 0; + unsigned int i; + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (p->gc_mode == GC_AT && + get_valid_blocks(sbi, segno, true) == 0) + return; + } + + for (i = 0; i < sbi->segs_per_sec; i++) + mtime += get_seg_entry(sbi, start + i)->mtime; + mtime = div_u64(mtime, sbi->segs_per_sec); + + /* Handle if the system time has changed by the user */ + if (mtime < sit_i->min_mtime) + sit_i->min_mtime = mtime; + if (mtime > sit_i->max_mtime) + sit_i->max_mtime = mtime; + if (mtime < sit_i->dirty_min_mtime) + sit_i->dirty_min_mtime = mtime; + if (mtime > sit_i->dirty_max_mtime) + sit_i->dirty_max_mtime = mtime; + + /* don't choose young section as candidate */ + if (sit_i->dirty_max_mtime - mtime < p->age_threshold) + return; + + __insert_victim_entry(sbi, mtime, segno); +} + +static void atgc_lookup_victim(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct atgc_management *am = &sbi->am; + struct rb_root_cached *root = &am->root; + struct rb_node *node; + struct victim_entry *ve; + unsigned long long total_time; + unsigned long long age, u, accu; + unsigned long long max_mtime = sit_i->dirty_max_mtime; + unsigned long long min_mtime = sit_i->dirty_min_mtime; + unsigned int sec_blocks = CAP_BLKS_PER_SEC(sbi); + unsigned int vblocks; + unsigned int dirty_threshold = max(am->max_candidate_count, + am->candidate_ratio * + am->victim_count / 100); + unsigned int age_weight = am->age_weight; + unsigned int cost; + unsigned int iter = 0; + + if (max_mtime < min_mtime) + return; + + max_mtime += 1; + total_time = max_mtime - min_mtime; + + accu = div64_u64(ULLONG_MAX, total_time); + accu = min_t(unsigned long long, div_u64(accu, 100), + DEFAULT_ACCURACY_CLASS); + + node = rb_first_cached(root); +next: + ve = rb_entry_safe(node, struct victim_entry, rb_node); + if (!ve) + return; + + if (ve->mtime >= max_mtime || ve->mtime < min_mtime) + goto skip; + + /* age = 10000 * x% * 60 */ + age = div64_u64(accu * (max_mtime - ve->mtime), total_time) * + age_weight; + + vblocks = get_valid_blocks(sbi, ve->segno, true); + f2fs_bug_on(sbi, !vblocks || vblocks == sec_blocks); + + /* u = 10000 * x% * 40 */ + u = div64_u64(accu * (sec_blocks - vblocks), sec_blocks) * + (100 - age_weight); + + f2fs_bug_on(sbi, age + u >= UINT_MAX); + + cost = UINT_MAX - (age + u); + iter++; + + if (cost < p->min_cost || + (cost == p->min_cost && age > p->oldest_age)) { + p->min_cost = cost; + p->oldest_age = age; + p->min_segno = ve->segno; + } +skip: + if (iter < dirty_threshold) { + node = rb_next(node); + goto next; + } +} + +/* + * select candidates around source section in range of + * [target - dirty_threshold, target + dirty_threshold] + */ +static void atssr_lookup_victim(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct atgc_management *am = &sbi->am; + struct victim_entry *ve; + unsigned long long age; + unsigned long long max_mtime = sit_i->dirty_max_mtime; + unsigned long long min_mtime = sit_i->dirty_min_mtime; + unsigned int seg_blocks = sbi->blocks_per_seg; + unsigned int vblocks; + unsigned int dirty_threshold = max(am->max_candidate_count, + am->candidate_ratio * + am->victim_count / 100); + unsigned int cost, iter; + int stage = 0; + + if (max_mtime < min_mtime) + return; + max_mtime += 1; +next_stage: + iter = 0; + ve = __lookup_victim_entry(sbi, p->age); +next_node: + if (!ve) { + if (stage++ == 0) + goto next_stage; + return; + } + + if (ve->mtime >= max_mtime || ve->mtime < min_mtime) + goto skip_node; + + age = max_mtime - ve->mtime; + + vblocks = get_seg_entry(sbi, ve->segno)->ckpt_valid_blocks; + f2fs_bug_on(sbi, !vblocks); + + /* rare case */ + if (vblocks == seg_blocks) + goto skip_node; + + iter++; + + age = max_mtime - abs(p->age - age); + cost = UINT_MAX - vblocks; + + if (cost < p->min_cost || + (cost == p->min_cost && age > p->oldest_age)) { + p->min_cost = cost; + p->oldest_age = age; + p->min_segno = ve->segno; + } +skip_node: + if (iter < dirty_threshold) { + ve = rb_entry(stage == 0 ? rb_prev(&ve->rb_node) : + rb_next(&ve->rb_node), + struct victim_entry, rb_node); + goto next_node; + } + + if (stage++ == 0) + goto next_stage; +} + +static void lookup_victim_by_age(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + f2fs_bug_on(sbi, !f2fs_check_victim_tree(sbi, &sbi->am.root)); + + if (p->gc_mode == GC_AT) + atgc_lookup_victim(sbi, p); + else if (p->alloc_mode == AT_SSR) + atssr_lookup_victim(sbi, p); + else + f2fs_bug_on(sbi, 1); +} + +static void release_victim_entry(struct f2fs_sb_info *sbi) +{ + struct atgc_management *am = &sbi->am; + struct victim_entry *ve, *tmp; + + list_for_each_entry_safe(ve, tmp, &am->victim_list, list) { + list_del(&ve->list); + kmem_cache_free(victim_entry_slab, ve); + am->victim_count--; + } + + am->root = RB_ROOT_CACHED; + + f2fs_bug_on(sbi, am->victim_count); + f2fs_bug_on(sbi, !list_empty(&am->victim_list)); +} + +static bool f2fs_pin_section(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!dirty_i->enable_pin_section) + return false; + if (!test_and_set_bit(secno, dirty_i->pinned_secmap)) + dirty_i->pinned_secmap_cnt++; + return true; +} + +static bool f2fs_pinned_section_exists(struct dirty_seglist_info *dirty_i) +{ + return dirty_i->pinned_secmap_cnt; +} + +static bool f2fs_section_is_pinned(struct dirty_seglist_info *dirty_i, + unsigned int secno) +{ + return dirty_i->enable_pin_section && + f2fs_pinned_section_exists(dirty_i) && + test_bit(secno, dirty_i->pinned_secmap); +} + +static void f2fs_unpin_all_sections(struct f2fs_sb_info *sbi, bool enable) +{ + unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + + if (f2fs_pinned_section_exists(DIRTY_I(sbi))) { + memset(DIRTY_I(sbi)->pinned_secmap, 0, bitmap_size); + DIRTY_I(sbi)->pinned_secmap_cnt = 0; + } + DIRTY_I(sbi)->enable_pin_section = enable; +} + +static int f2fs_gc_pinned_control(struct inode *inode, int gc_type, + unsigned int segno) +{ + if (!f2fs_is_pinned_file(inode)) + return 0; + if (gc_type != FG_GC) + return -EBUSY; + if (!f2fs_pin_section(F2FS_I_SB(inode), segno)) + f2fs_pin_file_control(inode, true); + return -EAGAIN; +} + +/* + * This function is called from two paths. + * One is garbage collection and the other is SSR segment selection. + * When it is called during GC, it just gets a victim segment + * and it does not remove it from dirty seglist. + * When it is called from SSR segment selection, it finds a segment + * which has minimum valid blocks and removes it from dirty seglist. + */ +static int get_victim_by_default(struct f2fs_sb_info *sbi, + unsigned int *result, int gc_type, int type, + char alloc_mode, unsigned long long age) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct sit_info *sm = SIT_I(sbi); + struct victim_sel_policy p; + unsigned int secno, last_victim; + unsigned int last_segment; + unsigned int nsearched; + bool is_atgc; + int ret = 0; + + mutex_lock(&dirty_i->seglist_lock); + last_segment = MAIN_SECS(sbi) * sbi->segs_per_sec; + + p.alloc_mode = alloc_mode; + p.age = age; + p.age_threshold = sbi->am.age_threshold; + +retry: + select_policy(sbi, gc_type, type, &p); + p.min_segno = NULL_SEGNO; + p.oldest_age = 0; + p.min_cost = get_max_cost(sbi, &p); + + is_atgc = (p.gc_mode == GC_AT || p.alloc_mode == AT_SSR); + nsearched = 0; + + if (is_atgc) + SIT_I(sbi)->dirty_min_mtime = ULLONG_MAX; + + if (*result != NULL_SEGNO) { + if (!get_valid_blocks(sbi, *result, false)) { + ret = -ENODATA; + goto out; + } + + if (sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) + ret = -EBUSY; + else + p.min_segno = *result; + goto out; + } + + ret = -ENODATA; + if (p.max_search == 0) + goto out; + + if (__is_large_section(sbi) && p.alloc_mode == LFS) { + if (sbi->next_victim_seg[BG_GC] != NULL_SEGNO) { + p.min_segno = sbi->next_victim_seg[BG_GC]; + *result = p.min_segno; + sbi->next_victim_seg[BG_GC] = NULL_SEGNO; + goto got_result; + } + if (gc_type == FG_GC && + sbi->next_victim_seg[FG_GC] != NULL_SEGNO) { + p.min_segno = sbi->next_victim_seg[FG_GC]; + *result = p.min_segno; + sbi->next_victim_seg[FG_GC] = NULL_SEGNO; + goto got_result; + } + } + + last_victim = sm->last_victim[p.gc_mode]; + if (p.alloc_mode == LFS && gc_type == FG_GC) { + p.min_segno = check_bg_victims(sbi); + if (p.min_segno != NULL_SEGNO) + goto got_it; + } + + while (1) { + unsigned long cost, *dirty_bitmap; + unsigned int unit_no, segno; + + dirty_bitmap = p.dirty_bitmap; + unit_no = find_next_bit(dirty_bitmap, + last_segment / p.ofs_unit, + p.offset / p.ofs_unit); + segno = unit_no * p.ofs_unit; + if (segno >= last_segment) { + if (sm->last_victim[p.gc_mode]) { + last_segment = + sm->last_victim[p.gc_mode]; + sm->last_victim[p.gc_mode] = 0; + p.offset = 0; + continue; + } + break; + } + + p.offset = segno + p.ofs_unit; + nsearched++; + +#ifdef CONFIG_F2FS_CHECK_FS + /* + * skip selecting the invalid segno (that is failed due to block + * validity check failure during GC) to avoid endless GC loop in + * such cases. + */ + if (test_bit(segno, sm->invalid_segmap)) + goto next; +#endif + + secno = GET_SEC_FROM_SEG(sbi, segno); + + if (sec_usage_check(sbi, secno)) + goto next; + + /* Don't touch checkpointed data */ + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (p.alloc_mode == LFS) { + /* + * LFS is set to find source section during GC. + * The victim should have no checkpointed data. + */ + if (get_ckpt_valid_blocks(sbi, segno, true)) + goto next; + } else { + /* + * SSR | AT_SSR are set to find target segment + * for writes which can be full by checkpointed + * and newly written blocks. + */ + if (!f2fs_segment_has_free_slot(sbi, segno)) + goto next; + } + } + + if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) + goto next; + + if (gc_type == FG_GC && f2fs_section_is_pinned(dirty_i, secno)) + goto next; + + if (is_atgc) { + add_victim_entry(sbi, &p, segno); + goto next; + } + + cost = get_gc_cost(sbi, segno, &p); + + if (p.min_cost > cost) { + p.min_segno = segno; + p.min_cost = cost; + } +next: + if (nsearched >= p.max_search) { + if (!sm->last_victim[p.gc_mode] && segno <= last_victim) + sm->last_victim[p.gc_mode] = + last_victim + p.ofs_unit; + else + sm->last_victim[p.gc_mode] = segno + p.ofs_unit; + sm->last_victim[p.gc_mode] %= + (MAIN_SECS(sbi) * sbi->segs_per_sec); + break; + } + } + + /* get victim for GC_AT/AT_SSR */ + if (is_atgc) { + lookup_victim_by_age(sbi, &p); + release_victim_entry(sbi); + } + + if (is_atgc && p.min_segno == NULL_SEGNO && + sm->elapsed_time < p.age_threshold) { + p.age_threshold = 0; + goto retry; + } + + if (p.min_segno != NULL_SEGNO) { +got_it: + *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; +got_result: + if (p.alloc_mode == LFS) { + secno = GET_SEC_FROM_SEG(sbi, p.min_segno); + if (gc_type == FG_GC) + sbi->cur_victim_sec = secno; + else + set_bit(secno, dirty_i->victim_secmap); + } + ret = 0; + + } +out: + if (p.min_segno != NULL_SEGNO) + trace_f2fs_get_victim(sbi->sb, type, gc_type, &p, + sbi->cur_victim_sec, + prefree_segments(sbi), free_segments(sbi)); + mutex_unlock(&dirty_i->seglist_lock); + + return ret; +} + +static const struct victim_selection default_v_ops = { + .get_victim = get_victim_by_default, +}; + +static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino) +{ + struct inode_entry *ie; + + ie = radix_tree_lookup(&gc_list->iroot, ino); + if (ie) + return ie->inode; + return NULL; +} + +static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) +{ + struct inode_entry *new_ie; + + if (inode == find_gc_inode(gc_list, inode->i_ino)) { + iput(inode); + return; + } + new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, + GFP_NOFS, true, NULL); + new_ie->inode = inode; + + f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie); + list_add_tail(&new_ie->list, &gc_list->ilist); +} + +static void put_gc_inode(struct gc_inode_list *gc_list) +{ + struct inode_entry *ie, *next_ie; + + list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) { + radix_tree_delete(&gc_list->iroot, ie->inode->i_ino); + iput(ie->inode); + list_del(&ie->list); + kmem_cache_free(f2fs_inode_entry_slab, ie); + } +} + +static int check_valid_map(struct f2fs_sb_info *sbi, + unsigned int segno, int offset) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct seg_entry *sentry; + int ret; + + down_read(&sit_i->sentry_lock); + sentry = get_seg_entry(sbi, segno); + ret = f2fs_test_bit(offset, sentry->cur_valid_map); + up_read(&sit_i->sentry_lock); + return ret; +} + +/* + * This function compares node address got in summary with that in NAT. + * On validity, copy that node with cold status, otherwise (invalid node) + * ignore that. + */ +static int gc_node_segment(struct f2fs_sb_info *sbi, + struct f2fs_summary *sum, unsigned int segno, int gc_type) +{ + struct f2fs_summary *entry; + block_t start_addr; + int off; + int phase = 0; + bool fggc = (gc_type == FG_GC); + int submitted = 0; + unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); + + start_addr = START_BLOCK(sbi, segno); + +next_step: + entry = sum; + + if (fggc && phase == 2) + atomic_inc(&sbi->wb_sync_req[NODE]); + + for (off = 0; off < usable_blks_in_seg; off++, entry++) { + nid_t nid = le32_to_cpu(entry->nid); + struct page *node_page; + struct node_info ni; + int err; + + /* stop BG_GC if there is not enough free sections. */ + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) + return submitted; + + if (check_valid_map(sbi, segno, off) == 0) + continue; + + if (phase == 0) { + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { + f2fs_ra_node_page(sbi, nid); + continue; + } + + /* phase == 2 */ + node_page = f2fs_get_node_page(sbi, nid); + if (IS_ERR(node_page)) + continue; + + /* block may become invalid during f2fs_get_node_page */ + if (check_valid_map(sbi, segno, off) == 0) { + f2fs_put_page(node_page, 1); + continue; + } + + if (f2fs_get_node_info(sbi, nid, &ni, false)) { + f2fs_put_page(node_page, 1); + continue; + } + + if (ni.blk_addr != start_addr + off) { + f2fs_put_page(node_page, 1); + continue; + } + + err = f2fs_move_node_page(node_page, gc_type); + if (!err && gc_type == FG_GC) + submitted++; + stat_inc_node_blk_count(sbi, 1, gc_type); + } + + if (++phase < 3) + goto next_step; + + if (fggc) + atomic_dec(&sbi->wb_sync_req[NODE]); + return submitted; +} + +/* + * Calculate start block index indicating the given node offset. + * Be careful, caller should give this node offset only indicating direct node + * blocks. If any node offsets, which point the other types of node blocks such + * as indirect or double indirect node blocks, are given, it must be a caller's + * bug. + */ +block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode) +{ + unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; + unsigned int bidx; + + if (node_ofs == 0) + return 0; + + if (node_ofs <= 2) { + bidx = node_ofs - 1; + } else if (node_ofs <= indirect_blks) { + int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); + + bidx = node_ofs - 2 - dec; + } else { + int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); + + bidx = node_ofs - 5 - dec; + } + return bidx * ADDRS_PER_BLOCK(inode) + ADDRS_PER_INODE(inode); +} + +static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + struct node_info *dni, block_t blkaddr, unsigned int *nofs) +{ + struct page *node_page; + nid_t nid; + unsigned int ofs_in_node, max_addrs, base; + block_t source_blkaddr; + + nid = le32_to_cpu(sum->nid); + ofs_in_node = le16_to_cpu(sum->ofs_in_node); + + node_page = f2fs_get_node_page(sbi, nid); + if (IS_ERR(node_page)) + return false; + + if (f2fs_get_node_info(sbi, nid, dni, false)) { + f2fs_put_page(node_page, 1); + return false; + } + + if (sum->version != dni->version) { + f2fs_warn(sbi, "%s: valid data with mismatched node version.", + __func__); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + + if (f2fs_check_nid_range(sbi, dni->ino)) { + f2fs_put_page(node_page, 1); + return false; + } + + if (IS_INODE(node_page)) { + base = offset_in_addr(F2FS_INODE(node_page)); + max_addrs = DEF_ADDRS_PER_INODE; + } else { + base = 0; + max_addrs = DEF_ADDRS_PER_BLOCK; + } + + if (base + ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u", + base, ofs_in_node, max_addrs, dni->ino, dni->nid); + f2fs_put_page(node_page, 1); + return false; + } + + *nofs = ofs_of_node(node_page); + source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); + f2fs_put_page(node_page, 1); + + if (source_blkaddr != blkaddr) { +#ifdef CONFIG_F2FS_CHECK_FS + unsigned int segno = GET_SEGNO(sbi, blkaddr); + unsigned long offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + + if (unlikely(check_valid_map(sbi, segno, offset))) { + if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) { + f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u", + blkaddr, source_blkaddr, segno); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + } +#endif + return false; + } + return true; +} + +static int ra_data_block(struct inode *inode, pgoff_t index) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; + struct dnode_of_data dn; + struct page *page; + struct extent_info ei = {0, }; + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = inode->i_ino, + .type = DATA, + .temp = COLD, + .op = REQ_OP_READ, + .op_flags = 0, + .encrypted_page = NULL, + .in_list = false, + .retry = false, + }; + int err; + + page = f2fs_grab_cache_page(mapping, index, true); + if (!page) + return -ENOMEM; + + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, + DATA_GENERIC_ENHANCE_READ))) { + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + goto put_page; + } + goto got_it; + } + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) + goto put_page; + f2fs_put_dnode(&dn); + + if (!__is_valid_data_blkaddr(dn.data_blkaddr)) { + err = -ENOENT; + goto put_page; + } + if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, + DATA_GENERIC_ENHANCE))) { + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + goto put_page; + } +got_it: + /* read page */ + fio.page = page; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; + + /* + * don't cache encrypted data into meta inode until previous dirty + * data were writebacked to avoid racing between GC and flush. + */ + f2fs_wait_on_page_writeback(page, DATA, true, true); + + f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); + + fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi), + dn.data_blkaddr, + FGP_LOCK | FGP_CREAT, GFP_NOFS); + if (!fio.encrypted_page) { + err = -ENOMEM; + goto put_page; + } + + err = f2fs_submit_page_bio(&fio); + if (err) + goto put_encrypted_page; + f2fs_put_page(fio.encrypted_page, 0); + f2fs_put_page(page, 1); + + f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_GDATA_READ_IO, F2FS_BLKSIZE); + + return 0; +put_encrypted_page: + f2fs_put_page(fio.encrypted_page, 1); +put_page: + f2fs_put_page(page, 1); + return err; +} + +/* + * Move data block via META_MAPPING while keeping locked data page. + * This can be used to move blocks, aka LBAs, directly on disk. + */ +static int move_data_block(struct inode *inode, block_t bidx, + int gc_type, unsigned int segno, int off) +{ + struct f2fs_io_info fio = { + .sbi = F2FS_I_SB(inode), + .ino = inode->i_ino, + .type = DATA, + .temp = COLD, + .op = REQ_OP_READ, + .op_flags = 0, + .encrypted_page = NULL, + .in_list = false, + .retry = false, + }; + struct dnode_of_data dn; + struct f2fs_summary sum; + struct node_info ni; + struct page *page, *mpage; + block_t newaddr; + int err = 0; + bool lfs_mode = f2fs_lfs_mode(fio.sbi); + int type = fio.sbi->am.atgc_enabled && (gc_type == BG_GC) && + (fio.sbi->gc_mode != GC_URGENT_HIGH) ? + CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA; + + /* do not read out */ + page = f2fs_grab_cache_page(inode->i_mapping, bidx, false); + if (!page) + return -ENOMEM; + + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) { + err = -ENOENT; + goto out; + } + + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) + goto out; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE); + if (err) + goto out; + + if (unlikely(dn.data_blkaddr == NULL_ADDR)) { + ClearPageUptodate(page); + err = -ENOENT; + goto put_out; + } + + /* + * don't cache encrypted data into meta inode until previous dirty + * data were writebacked to avoid racing between GC and flush. + */ + f2fs_wait_on_page_writeback(page, DATA, true, true); + + f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); + + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); + if (err) + goto put_out; + + /* read page */ + fio.page = page; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; + + if (lfs_mode) + f2fs_down_write(&fio.sbi->io_order_lock); + + mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi), + fio.old_blkaddr, false); + if (!mpage) { + err = -ENOMEM; + goto up_out; + } + + fio.encrypted_page = mpage; + + /* read source block in mpage */ + if (!PageUptodate(mpage)) { + err = f2fs_submit_page_bio(&fio); + if (err) { + f2fs_put_page(mpage, 1); + goto up_out; + } + + f2fs_update_iostat(fio.sbi, inode, FS_DATA_READ_IO, + F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, NULL, FS_GDATA_READ_IO, + F2FS_BLKSIZE); + + lock_page(mpage); + if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) || + !PageUptodate(mpage))) { + err = -EIO; + f2fs_put_page(mpage, 1); + goto up_out; + } + } + + set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); + + /* allocate block address */ + f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, + &sum, type, NULL); + + fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), + newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); + if (!fio.encrypted_page) { + err = -ENOMEM; + f2fs_put_page(mpage, 1); + goto recover_block; + } + + /* write target block */ + f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true); + memcpy(page_address(fio.encrypted_page), + page_address(mpage), PAGE_SIZE); + f2fs_put_page(mpage, 1); + invalidate_mapping_pages(META_MAPPING(fio.sbi), + fio.old_blkaddr, fio.old_blkaddr); + f2fs_invalidate_compress_page(fio.sbi, fio.old_blkaddr); + + set_page_dirty(fio.encrypted_page); + if (clear_page_dirty_for_io(fio.encrypted_page)) + dec_page_count(fio.sbi, F2FS_DIRTY_META); + + set_page_writeback(fio.encrypted_page); + ClearPageError(page); + + fio.op = REQ_OP_WRITE; + fio.op_flags = REQ_SYNC; + fio.new_blkaddr = newaddr; + f2fs_submit_page_write(&fio); + if (fio.retry) { + err = -EAGAIN; + if (PageWriteback(fio.encrypted_page)) + end_page_writeback(fio.encrypted_page); + goto put_page_out; + } + + f2fs_update_iostat(fio.sbi, NULL, FS_GC_DATA_IO, F2FS_BLKSIZE); + + f2fs_update_data_blkaddr(&dn, newaddr); + set_inode_flag(inode, FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); +put_page_out: + f2fs_put_page(fio.encrypted_page, 1); +recover_block: + if (err) + f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, + true, true, true); +up_out: + if (lfs_mode) + f2fs_up_write(&fio.sbi->io_order_lock); +put_out: + f2fs_put_dnode(&dn); +out: + f2fs_put_page(page, 1); + return err; +} + +static int move_data_page(struct inode *inode, block_t bidx, int gc_type, + unsigned int segno, int off) +{ + struct page *page; + int err = 0; + + page = f2fs_get_lock_data_page(inode, bidx, true); + if (IS_ERR(page)) + return PTR_ERR(page); + + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) { + err = -ENOENT; + goto out; + } + + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) + goto out; + + if (gc_type == BG_GC) { + if (PageWriteback(page)) { + err = -EAGAIN; + goto out; + } + set_page_dirty(page); + set_page_private_gcing(page); + } else { + struct f2fs_io_info fio = { + .sbi = F2FS_I_SB(inode), + .ino = inode->i_ino, + .type = DATA, + .temp = COLD, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC, + .old_blkaddr = NULL_ADDR, + .page = page, + .encrypted_page = NULL, + .need_lock = LOCK_REQ, + .io_type = FS_GC_DATA_IO, + }; + bool is_dirty = PageDirty(page); + +retry: + f2fs_wait_on_page_writeback(page, DATA, true, true); + + set_page_dirty(page); + if (clear_page_dirty_for_io(page)) { + inode_dec_dirty_pages(inode); + f2fs_remove_dirty_inode(inode); + } + + set_page_private_gcing(page); + + err = f2fs_do_write_data_page(&fio); + if (err) { + clear_page_private_gcing(page); + if (err == -ENOMEM) { + memalloc_retry_wait(GFP_NOFS); + goto retry; + } + if (is_dirty) + set_page_dirty(page); + } + } +out: + f2fs_put_page(page, 1); + return err; +} + +/* + * This function tries to get parent node of victim data block, and identifies + * data block validity. If the block is valid, copy that with cold status and + * modify parent node. + * If the parent node is not valid or the data block address is different, + * the victim data block is ignored. + */ +static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + struct gc_inode_list *gc_list, unsigned int segno, int gc_type, + bool force_migrate) +{ + struct super_block *sb = sbi->sb; + struct f2fs_summary *entry; + block_t start_addr; + int off; + int phase = 0; + int submitted = 0; + unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); + + start_addr = START_BLOCK(sbi, segno); + +next_step: + entry = sum; + + for (off = 0; off < usable_blks_in_seg; off++, entry++) { + struct page *data_page; + struct inode *inode; + struct node_info dni; /* dnode info for the data */ + unsigned int ofs_in_node, nofs; + block_t start_bidx; + nid_t nid = le32_to_cpu(entry->nid); + + /* + * stop BG_GC if there is not enough free sections. + * Or, stop GC if the segment becomes fully valid caused by + * race condition along with SSR block allocation. + */ + if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) || + (!force_migrate && get_valid_blocks(sbi, segno, true) == + CAP_BLKS_PER_SEC(sbi))) + return submitted; + + if (check_valid_map(sbi, segno, off) == 0) + continue; + + if (phase == 0) { + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { + f2fs_ra_node_page(sbi, nid); + continue; + } + + /* Get an inode by ino with checking validity */ + if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs)) + continue; + + if (phase == 2) { + f2fs_ra_node_page(sbi, dni.ino); + continue; + } + + ofs_in_node = le16_to_cpu(entry->ofs_in_node); + + if (phase == 3) { + int err; + + inode = f2fs_iget(sb, dni.ino); + if (IS_ERR(inode) || is_bad_inode(inode) || + special_file(inode->i_mode)) + continue; + + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err == -EAGAIN) { + iput(inode); + return submitted; + } + + if (!f2fs_down_write_trylock( + &F2FS_I(inode)->i_gc_rwsem[WRITE])) { + iput(inode); + sbi->skipped_gc_rwsem++; + continue; + } + + start_bidx = f2fs_start_bidx_of_node(nofs, inode) + + ofs_in_node; + + if (f2fs_post_read_required(inode)) { + int err = ra_data_block(inode, start_bidx); + + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (err) { + iput(inode); + continue; + } + add_gc_inode(gc_list, inode); + continue; + } + + data_page = f2fs_get_read_data_page(inode, start_bidx, + REQ_RAHEAD, true, NULL); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (IS_ERR(data_page)) { + iput(inode); + continue; + } + + f2fs_put_page(data_page, 0); + add_gc_inode(gc_list, inode); + continue; + } + + /* phase 4 */ + inode = find_gc_inode(gc_list, dni.ino); + if (inode) { + struct f2fs_inode_info *fi = F2FS_I(inode); + bool locked = false; + int err; + + if (S_ISREG(inode->i_mode)) { + if (!f2fs_down_write_trylock(&fi->i_gc_rwsem[READ])) { + sbi->skipped_gc_rwsem++; + continue; + } + if (!f2fs_down_write_trylock( + &fi->i_gc_rwsem[WRITE])) { + sbi->skipped_gc_rwsem++; + f2fs_up_write(&fi->i_gc_rwsem[READ]); + continue; + } + locked = true; + + /* wait for all inflight aio data */ + inode_dio_wait(inode); + } + + start_bidx = f2fs_start_bidx_of_node(nofs, inode) + + ofs_in_node; + if (f2fs_post_read_required(inode)) + err = move_data_block(inode, start_bidx, + gc_type, segno, off); + else + err = move_data_page(inode, start_bidx, gc_type, + segno, off); + + if (!err && (gc_type == FG_GC || + f2fs_post_read_required(inode))) + submitted++; + + if (locked) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[READ]); + } + + stat_inc_data_blk_count(sbi, 1, gc_type); + } + } + + if (++phase < 5) + goto next_step; + + return submitted; +} + +static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, + int gc_type) +{ + struct sit_info *sit_i = SIT_I(sbi); + int ret; + + down_write(&sit_i->sentry_lock); + ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, + NO_CHECK_TYPE, LFS, 0); + up_write(&sit_i->sentry_lock); + return ret; +} + +static int do_garbage_collect(struct f2fs_sb_info *sbi, + unsigned int start_segno, + struct gc_inode_list *gc_list, int gc_type, + bool force_migrate) +{ + struct page *sum_page; + struct f2fs_summary_block *sum; + struct blk_plug plug; + unsigned int segno = start_segno; + unsigned int end_segno = start_segno + sbi->segs_per_sec; + int seg_freed = 0, migrated = 0; + unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? + SUM_TYPE_DATA : SUM_TYPE_NODE; + int submitted = 0; + + if (__is_large_section(sbi)) + end_segno = rounddown(end_segno, sbi->segs_per_sec); + + /* + * zone-capacity can be less than zone-size in zoned devices, + * resulting in less than expected usable segments in the zone, + * calculate the end segno in the zone which can be garbage collected + */ + if (f2fs_sb_has_blkzoned(sbi)) + end_segno -= sbi->segs_per_sec - + f2fs_usable_segs_in_sec(sbi, segno); + + sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type); + + /* readahead multi ssa blocks those have contiguous address */ + if (__is_large_section(sbi)) + f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), + end_segno - segno, META_SSA, true); + + /* reference all summary page */ + while (segno < end_segno) { + sum_page = f2fs_get_sum_page(sbi, segno++); + if (IS_ERR(sum_page)) { + int err = PTR_ERR(sum_page); + + end_segno = segno - 1; + for (segno = start_segno; segno < end_segno; segno++) { + sum_page = find_get_page(META_MAPPING(sbi), + GET_SUM_BLOCK(sbi, segno)); + f2fs_put_page(sum_page, 0); + f2fs_put_page(sum_page, 0); + } + return err; + } + unlock_page(sum_page); + } + + blk_start_plug(&plug); + + for (segno = start_segno; segno < end_segno; segno++) { + + /* find segment summary of victim */ + sum_page = find_get_page(META_MAPPING(sbi), + GET_SUM_BLOCK(sbi, segno)); + f2fs_put_page(sum_page, 0); + + if (get_valid_blocks(sbi, segno, false) == 0) + goto freed; + if (gc_type == BG_GC && __is_large_section(sbi) && + migrated >= sbi->migration_granularity) + goto skip; + if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) + goto skip; + + sum = page_address(sum_page); + if (type != GET_SUM_TYPE((&sum->footer))) { + f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT", + segno, type, GET_SUM_TYPE((&sum->footer))); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_CORRUPTED_SUMMARY); + goto skip; + } + + /* + * this is to avoid deadlock: + * - lock_page(sum_page) - f2fs_replace_block + * - check_valid_map() - down_write(sentry_lock) + * - down_read(sentry_lock) - change_curseg() + * - lock_page(sum_page) + */ + if (type == SUM_TYPE_NODE) + submitted += gc_node_segment(sbi, sum->entries, segno, + gc_type); + else + submitted += gc_data_segment(sbi, sum->entries, gc_list, + segno, gc_type, + force_migrate); + + stat_inc_seg_count(sbi, type, gc_type); + sbi->gc_reclaimed_segs[sbi->gc_mode]++; + migrated++; + +freed: + if (gc_type == FG_GC && + get_valid_blocks(sbi, segno, false) == 0) + seg_freed++; + + if (__is_large_section(sbi)) + sbi->next_victim_seg[gc_type] = + (segno + 1 < end_segno) ? segno + 1 : NULL_SEGNO; +skip: + f2fs_put_page(sum_page, 0); + } + + if (submitted) + f2fs_submit_merged_write(sbi, + (type == SUM_TYPE_NODE) ? NODE : DATA); + + blk_finish_plug(&plug); + + stat_inc_call_count(sbi->stat_info); + + return seg_freed; +} + +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) +{ + int gc_type = gc_control->init_gc_type; + unsigned int segno = gc_control->victim_segno; + int sec_freed = 0, seg_freed = 0, total_freed = 0; + int ret = 0; + struct cp_control cpc; + struct gc_inode_list gc_list = { + .ilist = LIST_HEAD_INIT(gc_list.ilist), + .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), + }; + unsigned int skipped_round = 0, round = 0; + unsigned int upper_secs; + + trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc, + gc_control->nr_free_secs, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + + cpc.reason = __get_cp_reason(sbi); +gc_more: + sbi->skipped_gc_rwsem = 0; + if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { + ret = -EINVAL; + goto stop; + } + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; + goto stop; + } + + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) { + /* + * For example, if there are many prefree_segments below given + * threshold, we can make them free by checkpoint. Then, we + * secure free segments which doesn't need fggc any more. + */ + if (prefree_segments(sbi)) { + ret = f2fs_write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } + if (has_not_enough_free_secs(sbi, 0, 0)) + gc_type = FG_GC; + } + + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + if (gc_type == BG_GC && gc_control->no_bg_gc) { + ret = -EINVAL; + goto stop; + } +retry: + ret = __get_victim(sbi, &segno, gc_type); + if (ret) { + /* allow to search victim from sections has pinned data */ + if (ret == -ENODATA && gc_type == FG_GC && + f2fs_pinned_section_exists(DIRTY_I(sbi))) { + f2fs_unpin_all_sections(sbi, false); + goto retry; + } + goto stop; + } + + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, + gc_control->should_migrate_blocks); + total_freed += seg_freed; + + if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) + sec_freed++; + + if (gc_type == FG_GC) + sbi->cur_victim_sec = NULL_SEGNO; + + if (gc_control->init_gc_type == FG_GC || + !has_not_enough_free_secs(sbi, + (gc_type == FG_GC) ? sec_freed : 0, 0)) { + if (gc_type == FG_GC && sec_freed < gc_control->nr_free_secs) + goto go_gc_more; + goto stop; + } + + /* FG_GC stops GC by skip_count */ + if (gc_type == FG_GC) { + if (sbi->skipped_gc_rwsem) + skipped_round++; + round++; + if (skipped_round > MAX_SKIP_GC_COUNT && + skipped_round * 2 >= round) { + ret = f2fs_write_checkpoint(sbi, &cpc); + goto stop; + } + } + + __get_secs_required(sbi, NULL, &upper_secs, NULL); + + /* + * Write checkpoint to reclaim prefree segments. + * We need more three extra sections for writer's data/node/dentry. + */ + if (free_sections(sbi) <= upper_secs + NR_GC_CHECKPOINT_SECS && + prefree_segments(sbi)) { + ret = f2fs_write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } +go_gc_more: + segno = NULL_SEGNO; + goto gc_more; + +stop: + SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = gc_control->victim_segno; + + if (gc_type == FG_GC) + f2fs_unpin_all_sections(sbi, true); + + trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + + f2fs_up_write(&sbi->gc_lock); + + put_gc_inode(&gc_list); + + if (gc_control->err_gc_skipped && !ret) + ret = sec_freed ? 0 : -EAGAIN; + return ret; +} + +int __init f2fs_create_garbage_collection_cache(void) +{ + victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry", + sizeof(struct victim_entry)); + if (!victim_entry_slab) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_garbage_collection_cache(void) +{ + kmem_cache_destroy(victim_entry_slab); +} + +static void init_atgc_management(struct f2fs_sb_info *sbi) +{ + struct atgc_management *am = &sbi->am; + + if (test_opt(sbi, ATGC) && + SIT_I(sbi)->elapsed_time >= DEF_GC_THREAD_AGE_THRESHOLD) + am->atgc_enabled = true; + + am->root = RB_ROOT_CACHED; + INIT_LIST_HEAD(&am->victim_list); + am->victim_count = 0; + + am->candidate_ratio = DEF_GC_THREAD_CANDIDATE_RATIO; + am->max_candidate_count = DEF_GC_THREAD_MAX_CANDIDATE_COUNT; + am->age_weight = DEF_GC_THREAD_AGE_WEIGHT; + am->age_threshold = DEF_GC_THREAD_AGE_THRESHOLD; +} + +void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) +{ + DIRTY_I(sbi)->v_ops = &default_v_ops; + + sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES; + + /* give warm/cold data area from slower device */ + if (f2fs_is_multi_device(sbi) && !__is_large_section(sbi)) + SIT_I(sbi)->last_victim[ALLOC_NEXT] = + GET_SEGNO(sbi, FDEV(0).end_blk) + 1; + + init_atgc_management(sbi); +} + +static int free_segment_range(struct f2fs_sb_info *sbi, + unsigned int secs, bool gc_only) +{ + unsigned int segno, next_inuse, start, end; + struct cp_control cpc = { CP_RESIZE, 0, 0, 0 }; + int gc_mode, gc_type; + int err = 0; + int type; + + /* Force block allocation for GC */ + MAIN_SECS(sbi) -= secs; + start = MAIN_SECS(sbi) * sbi->segs_per_sec; + end = MAIN_SEGS(sbi) - 1; + + mutex_lock(&DIRTY_I(sbi)->seglist_lock); + for (gc_mode = 0; gc_mode < MAX_GC_POLICY; gc_mode++) + if (SIT_I(sbi)->last_victim[gc_mode] >= start) + SIT_I(sbi)->last_victim[gc_mode] = 0; + + for (gc_type = BG_GC; gc_type <= FG_GC; gc_type++) + if (sbi->next_victim_seg[gc_type] >= start) + sbi->next_victim_seg[gc_type] = NULL_SEGNO; + mutex_unlock(&DIRTY_I(sbi)->seglist_lock); + + /* Move out cursegs from the target range */ + for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++) + f2fs_allocate_segment_for_resize(sbi, type, start, end); + + /* do GC to move out valid blocks in the range */ + for (segno = start; segno <= end; segno += sbi->segs_per_sec) { + struct gc_inode_list gc_list = { + .ilist = LIST_HEAD_INIT(gc_list.ilist), + .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), + }; + + do_garbage_collect(sbi, segno, &gc_list, FG_GC, true); + put_gc_inode(&gc_list); + + if (!gc_only && get_valid_blocks(sbi, segno, true)) { + err = -EAGAIN; + goto out; + } + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto out; + } + } + if (gc_only) + goto out; + + err = f2fs_write_checkpoint(sbi, &cpc); + if (err) + goto out; + + next_inuse = find_next_inuse(FREE_I(sbi), end + 1, start); + if (next_inuse <= end) { + f2fs_err(sbi, "segno %u should be free but still inuse!", + next_inuse); + f2fs_bug_on(sbi, 1); + } +out: + MAIN_SECS(sbi) += secs; + return err; +} + +static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) +{ + struct f2fs_super_block *raw_sb = F2FS_RAW_SUPER(sbi); + int section_count; + int segment_count; + int segment_count_main; + long long block_count; + int segs = secs * sbi->segs_per_sec; + + f2fs_down_write(&sbi->sb_lock); + + section_count = le32_to_cpu(raw_sb->section_count); + segment_count = le32_to_cpu(raw_sb->segment_count); + segment_count_main = le32_to_cpu(raw_sb->segment_count_main); + block_count = le64_to_cpu(raw_sb->block_count); + + raw_sb->section_count = cpu_to_le32(section_count + secs); + raw_sb->segment_count = cpu_to_le32(segment_count + segs); + raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs); + raw_sb->block_count = cpu_to_le64(block_count + + (long long)segs * sbi->blocks_per_seg); + if (f2fs_is_multi_device(sbi)) { + int last_dev = sbi->s_ndevs - 1; + int dev_segs = + le32_to_cpu(raw_sb->devs[last_dev].total_segments); + + raw_sb->devs[last_dev].total_segments = + cpu_to_le32(dev_segs + segs); + } + + f2fs_up_write(&sbi->sb_lock); +} + +static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) +{ + int segs = secs * sbi->segs_per_sec; + long long blks = (long long)segs * sbi->blocks_per_seg; + long long user_block_count = + le64_to_cpu(F2FS_CKPT(sbi)->user_block_count); + + SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs; + MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs; + MAIN_SECS(sbi) += secs; + FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs; + FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs; + F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks); + + if (f2fs_is_multi_device(sbi)) { + int last_dev = sbi->s_ndevs - 1; + + FDEV(last_dev).total_segments = + (int)FDEV(last_dev).total_segments + segs; + FDEV(last_dev).end_blk = + (long long)FDEV(last_dev).end_blk + blks; +#ifdef CONFIG_BLK_DEV_ZONED + FDEV(last_dev).nr_blkz = (int)FDEV(last_dev).nr_blkz + + (int)(blks >> sbi->log_blocks_per_blkz); +#endif + } +} + +int f2fs_resize_fs(struct file *filp, __u64 block_count) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); + __u64 old_block_count, shrunk_blocks; + struct cp_control cpc = { CP_RESIZE, 0, 0, 0 }; + unsigned int secs; + int err = 0; + __u32 rem; + + old_block_count = le64_to_cpu(F2FS_RAW_SUPER(sbi)->block_count); + if (block_count > old_block_count) + return -EINVAL; + + if (f2fs_is_multi_device(sbi)) { + int last_dev = sbi->s_ndevs - 1; + __u64 last_segs = FDEV(last_dev).total_segments; + + if (block_count + last_segs * sbi->blocks_per_seg <= + old_block_count) + return -EINVAL; + } + + /* new fs size should align to section size */ + div_u64_rem(block_count, BLKS_PER_SEC(sbi), &rem); + if (rem) + return -EINVAL; + + if (block_count == old_block_count) + return 0; + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + f2fs_err(sbi, "Should run fsck to repair first."); + return -EFSCORRUPTED; + } + + if (test_opt(sbi, DISABLE_CHECKPOINT)) { + f2fs_err(sbi, "Checkpoint should be enabled."); + return -EINVAL; + } + + err = mnt_want_write_file(filp); + if (err) + return err; + + shrunk_blocks = old_block_count - block_count; + secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi)); + + /* stop other GC */ + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + err = -EAGAIN; + goto out_drop_write; + } + + /* stop CP to protect MAIN_SEC in free_segment_range */ + f2fs_lock_op(sbi); + + spin_lock(&sbi->stat_lock); + if (shrunk_blocks + valid_user_blocks(sbi) + + sbi->current_reserved_blocks + sbi->unusable_block_count + + F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count) + err = -ENOSPC; + spin_unlock(&sbi->stat_lock); + + if (err) + goto out_unlock; + + err = free_segment_range(sbi, secs, true); + +out_unlock: + f2fs_unlock_op(sbi); + f2fs_up_write(&sbi->gc_lock); +out_drop_write: + mnt_drop_write_file(filp); + if (err) + return err; + + err = freeze_super(sbi->sb); + if (err) + return err; + + if (f2fs_readonly(sbi->sb)) { + thaw_super(sbi->sb); + return -EROFS; + } + + f2fs_down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->cp_global_sem); + + spin_lock(&sbi->stat_lock); + if (shrunk_blocks + valid_user_blocks(sbi) + + sbi->current_reserved_blocks + sbi->unusable_block_count + + F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count) + err = -ENOSPC; + else + sbi->user_block_count -= shrunk_blocks; + spin_unlock(&sbi->stat_lock); + if (err) + goto out_err; + + set_sbi_flag(sbi, SBI_IS_RESIZEFS); + err = free_segment_range(sbi, secs, false); + if (err) + goto recover_out; + + update_sb_metadata(sbi, -secs); + + err = f2fs_commit_super(sbi, false); + if (err) { + update_sb_metadata(sbi, secs); + goto recover_out; + } + + update_fs_metadata(sbi, -secs); + clear_sbi_flag(sbi, SBI_IS_RESIZEFS); + set_sbi_flag(sbi, SBI_IS_DIRTY); + + err = f2fs_write_checkpoint(sbi, &cpc); + if (err) { + update_fs_metadata(sbi, secs); + update_sb_metadata(sbi, secs); + f2fs_commit_super(sbi, false); + } +recover_out: + clear_sbi_flag(sbi, SBI_IS_RESIZEFS); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_err(sbi, "resize_fs failed, should run fsck to repair!"); + + spin_lock(&sbi->stat_lock); + sbi->user_block_count += shrunk_blocks; + spin_unlock(&sbi->stat_lock); + } +out_err: + f2fs_up_write(&sbi->cp_global_sem); + f2fs_up_write(&sbi->gc_lock); + thaw_super(sbi->sb); + return err; +} diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h new file mode 100644 index 000000000..47357101b --- /dev/null +++ b/fs/f2fs/gc.h @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * fs/f2fs/gc.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + */ +#define GC_THREAD_MIN_WB_PAGES 1 /* + * a threshold to determine + * whether IO subsystem is idle + * or not + */ +#define DEF_GC_THREAD_URGENT_SLEEP_TIME 500 /* 500 ms */ +#define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ +#define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 +#define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ + +/* choose candidates from sections which has age of more than 7 days */ +#define DEF_GC_THREAD_AGE_THRESHOLD (60 * 60 * 24 * 7) +#define DEF_GC_THREAD_CANDIDATE_RATIO 20 /* select 20% oldest sections as candidates */ +#define DEF_GC_THREAD_MAX_CANDIDATE_COUNT 10 /* select at most 10 sections as candidates */ +#define DEF_GC_THREAD_AGE_WEIGHT 60 /* age weight */ +#define DEFAULT_ACCURACY_CLASS 10000 /* accuracy class */ + +#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ +#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ + +#define DEF_GC_FAILED_PINNED_FILES 2048 + +/* Search max. number of dirty segments to select a victim segment */ +#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ + +#define NR_GC_CHECKPOINT_SECS (3) /* data/node/dentry sections */ + +struct f2fs_gc_kthread { + struct task_struct *f2fs_gc_task; + wait_queue_head_t gc_wait_queue_head; + + /* for gc sleep time */ + unsigned int urgent_sleep_time; + unsigned int min_sleep_time; + unsigned int max_sleep_time; + unsigned int no_gc_sleep_time; + + /* for changing gc mode */ + unsigned int gc_wake; + + /* for GC_MERGE mount option */ + wait_queue_head_t fggc_wq; /* + * caller of f2fs_balance_fs() + * will wait on this wait queue. + */ +}; + +struct gc_inode_list { + struct list_head ilist; + struct radix_tree_root iroot; +}; + +struct victim_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + unsigned long long mtime; /* mtime of section */ + unsigned int segno; /* segment No. */ + struct list_head list; +}; + +/* + * inline functions + */ + +/* + * On a Zoned device zone-capacity can be less than zone-size and if + * zone-capacity is not aligned to f2fs segment size(2MB), then the segment + * starting just before zone-capacity has some blocks spanning across the + * zone-capacity, these blocks are not usable. + * Such spanning segments can be in free list so calculate the sum of usable + * blocks in currently free segments including normal and spanning segments. + */ +static inline block_t free_segs_blk_count_zoned(struct f2fs_sb_info *sbi) +{ + block_t free_seg_blks = 0; + struct free_segmap_info *free_i = FREE_I(sbi); + int j; + + spin_lock(&free_i->segmap_lock); + for (j = 0; j < MAIN_SEGS(sbi); j++) + if (!test_bit(j, free_i->free_segmap)) + free_seg_blks += f2fs_usable_blks_in_seg(sbi, j); + spin_unlock(&free_i->segmap_lock); + + return free_seg_blks; +} + +static inline block_t free_segs_blk_count(struct f2fs_sb_info *sbi) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return free_segs_blk_count_zoned(sbi); + + return free_segments(sbi) << sbi->log_blocks_per_seg; +} + +static inline block_t free_user_blocks(struct f2fs_sb_info *sbi) +{ + block_t free_blks, ovp_blks; + + free_blks = free_segs_blk_count(sbi); + ovp_blks = overprovision_segments(sbi) << sbi->log_blocks_per_seg; + + if (free_blks < ovp_blks) + return 0; + + return free_blks - ovp_blks; +} + +static inline block_t limit_invalid_user_blocks(block_t user_block_count) +{ + return (long)(user_block_count * LIMIT_INVALID_BLOCK) / 100; +} + +static inline block_t limit_free_user_blocks(block_t reclaimable_user_blocks) +{ + return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; +} + +static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th, + unsigned int *wait) +{ + unsigned int min_time = gc_th->min_sleep_time; + unsigned int max_time = gc_th->max_sleep_time; + + if (*wait == gc_th->no_gc_sleep_time) + return; + + if ((long long)*wait + (long long)min_time > (long long)max_time) + *wait = max_time; + else + *wait += min_time; +} + +static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, + unsigned int *wait) +{ + unsigned int min_time = gc_th->min_sleep_time; + + if (*wait == gc_th->no_gc_sleep_time) + *wait = gc_th->max_sleep_time; + + if ((long long)*wait - (long long)min_time < (long long)min_time) + *wait = min_time; + else + *wait -= min_time; +} + +static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) +{ + block_t user_block_count = sbi->user_block_count; + block_t invalid_user_blocks = user_block_count - + written_block_count(sbi); + /* + * Background GC is triggered with the following conditions. + * 1. There are a number of invalid blocks. + * 2. There is not enough free space. + */ + return (invalid_user_blocks > + limit_invalid_user_blocks(user_block_count) && + free_user_blocks(sbi) < + limit_free_user_blocks(invalid_user_blocks)); +} diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c new file mode 100644 index 000000000..049ce50ce --- /dev/null +++ b/fs/f2fs/hash.c @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/hash.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext3/hash.c + * + * Copyright (C) 2002 by Theodore Ts'o + */ +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/pagemap.h> +#include <linux/unicode.h> + +#include "f2fs.h" + +/* + * Hashing code copied from ext3 + */ +#define DELTA 0x9E3779B9 + +static void TEA_transform(unsigned int buf[4], unsigned int const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while (--n); + + buf[0] += b0; + buf[1] += b1; +} + +static void str2hashbuf(const unsigned char *msg, size_t len, + unsigned int *buf, int num) +{ + unsigned pad, val; + int i; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num * 4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = msg[i] + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static u32 TEA_hash_name(const u8 *p, size_t len) +{ + __u32 in[8], buf[4]; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; + buf[1] = 0xefcdab89; + buf[2] = 0x98badcfe; + buf[3] = 0x10325476; + + while (1) { + str2hashbuf(p, len, in, 4); + TEA_transform(buf, in); + p += 16; + if (len <= 16) + break; + len -= 16; + } + return buf[0] & ~F2FS_HASH_COL_BIT; +} + +/* + * Compute @fname->hash. For all directories, @fname->disk_name must be set. + * For casefolded directories, @fname->usr_fname must be set, and also + * @fname->cf_name if the filename is valid Unicode and is not "." or "..". + */ +void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) +{ + const u8 *name = fname->disk_name.name; + size_t len = fname->disk_name.len; + + WARN_ON_ONCE(!name); + + if (is_dot_dotdot(name, len)) { + fname->hash = 0; + return; + } + +#if IS_ENABLED(CONFIG_UNICODE) + if (IS_CASEFOLDED(dir)) { + /* + * If the casefolded name is provided, hash it instead of the + * on-disk name. If the casefolded name is *not* provided, that + * should only be because the name wasn't valid Unicode or was + * "." or "..", so fall back to treating the name as an opaque + * byte sequence. Note that to handle encrypted directories, + * the fallback must use usr_fname (plaintext) rather than + * disk_name (ciphertext). + */ + WARN_ON_ONCE(!fname->usr_fname->name); + if (fname->cf_name.name) { + name = fname->cf_name.name; + len = fname->cf_name.len; + } else { + name = fname->usr_fname->name; + len = fname->usr_fname->len; + } + if (IS_ENCRYPTED(dir)) { + struct qstr tmp = QSTR_INIT(name, len); + + fname->hash = + cpu_to_le32(fscrypt_fname_siphash(dir, &tmp)); + return; + } + } +#endif + fname->hash = cpu_to_le32(TEA_hash_name(name, len)); +} diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c new file mode 100644 index 000000000..8747eec3d --- /dev/null +++ b/fs/f2fs/inline.c @@ -0,0 +1,815 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/inline.c + * Copyright (c) 2013, Intel Corporation + * Authors: Huajun Li <huajun.li@intel.com> + * Haicheng Li <haicheng.li@intel.com> + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/fiemap.h> + +#include "f2fs.h" +#include "node.h" +#include <trace/events/f2fs.h> + +static bool support_inline_data(struct inode *inode) +{ + if (f2fs_is_atomic_file(inode)) + return false; + if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) + return false; + if (i_size_read(inode) > MAX_INLINE_DATA(inode)) + return false; + return true; +} + +bool f2fs_may_inline_data(struct inode *inode) +{ + if (!support_inline_data(inode)) + return false; + + return !f2fs_post_read_required(inode); +} + +bool f2fs_sanity_check_inline_data(struct inode *inode) +{ + if (!f2fs_has_inline_data(inode)) + return false; + + if (!support_inline_data(inode)) + return true; + + /* + * used by sanity_check_inode(), when disk layout fields has not + * been synchronized to inmem fields. + */ + return (S_ISREG(inode->i_mode) && + (file_is_encrypt(inode) || file_is_verity(inode) || + (F2FS_I(inode)->i_flags & F2FS_COMPR_FL))); +} + +bool f2fs_may_inline_dentry(struct inode *inode) +{ + if (!test_opt(F2FS_I_SB(inode), INLINE_DENTRY)) + return false; + + if (!S_ISDIR(inode->i_mode)) + return false; + + return true; +} + +void f2fs_do_read_inline_data(struct page *page, struct page *ipage) +{ + struct inode *inode = page->mapping->host; + + if (PageUptodate(page)) + return; + + f2fs_bug_on(F2FS_P_SB(page), page->index); + + zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); + + /* Copy the whole inline data block */ + memcpy_to_page(page, 0, inline_data_addr(inode, ipage), + MAX_INLINE_DATA(inode)); + if (!PageUptodate(page)) + SetPageUptodate(page); +} + +void f2fs_truncate_inline_inode(struct inode *inode, + struct page *ipage, u64 from) +{ + void *addr; + + if (from >= MAX_INLINE_DATA(inode)) + return; + + addr = inline_data_addr(inode, ipage); + + f2fs_wait_on_page_writeback(ipage, NODE, true, true); + memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); + set_page_dirty(ipage); + + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); +} + +int f2fs_read_inline_data(struct inode *inode, struct page *page) +{ + struct page *ipage; + + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) { + unlock_page(page); + return PTR_ERR(ipage); + } + + if (!f2fs_has_inline_data(inode)) { + f2fs_put_page(ipage, 1); + return -EAGAIN; + } + + if (page->index) + zero_user_segment(page, 0, PAGE_SIZE); + else + f2fs_do_read_inline_data(page, ipage); + + if (!PageUptodate(page)) + SetPageUptodate(page); + f2fs_put_page(ipage, 1); + unlock_page(page); + return 0; +} + +int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) +{ + struct f2fs_io_info fio = { + .sbi = F2FS_I_SB(dn->inode), + .ino = dn->inode->i_ino, + .type = DATA, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_PRIO, + .page = page, + .encrypted_page = NULL, + .io_type = FS_DATA_IO, + }; + struct node_info ni; + int dirty, err; + + if (!f2fs_exist_data(dn->inode)) + goto clear_out; + + err = f2fs_reserve_block(dn, 0); + if (err) + return err; + + err = f2fs_get_node_info(fio.sbi, dn->nid, &ni, false); + if (err) { + f2fs_truncate_data_blocks_range(dn, 1); + f2fs_put_dnode(dn); + return err; + } + + fio.version = ni.version; + + if (unlikely(dn->data_blkaddr != NEW_ADDR)) { + f2fs_put_dnode(dn); + set_sbi_flag(fio.sbi, SBI_NEED_FSCK); + f2fs_warn(fio.sbi, "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", + __func__, dn->inode->i_ino, dn->data_blkaddr); + f2fs_handle_error(fio.sbi, ERROR_INVALID_BLKADDR); + return -EFSCORRUPTED; + } + + f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page)); + + f2fs_do_read_inline_data(page, dn->inode_page); + set_page_dirty(page); + + /* clear dirty state */ + dirty = clear_page_dirty_for_io(page); + + /* write data page to try to make data consistent */ + set_page_writeback(page); + ClearPageError(page); + fio.old_blkaddr = dn->data_blkaddr; + set_inode_flag(dn->inode, FI_HOT_DATA); + f2fs_outplace_write_data(dn, &fio); + f2fs_wait_on_page_writeback(page, DATA, true, true); + if (dirty) { + inode_dec_dirty_pages(dn->inode); + f2fs_remove_dirty_inode(dn->inode); + } + + /* this converted inline_data should be recovered. */ + set_inode_flag(dn->inode, FI_APPEND_WRITE); + + /* clear inline data and flag after data writeback */ + f2fs_truncate_inline_inode(dn->inode, dn->inode_page, 0); + clear_page_private_inline(dn->inode_page); +clear_out: + stat_dec_inline_inode(dn->inode); + clear_inode_flag(dn->inode, FI_INLINE_DATA); + f2fs_put_dnode(dn); + return 0; +} + +int f2fs_convert_inline_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct page *ipage, *page; + int err = 0; + + if (!f2fs_has_inline_data(inode) || + f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb)) + return 0; + + err = f2fs_dquot_initialize(inode); + if (err) + return err; + + page = f2fs_grab_cache_page(inode->i_mapping, 0, false); + if (!page) + return -ENOMEM; + + f2fs_lock_op(sbi); + + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out; + } + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_has_inline_data(inode)) + err = f2fs_convert_inline_page(&dn, page); + + f2fs_put_dnode(&dn); +out: + f2fs_unlock_op(sbi); + + f2fs_put_page(page, 1); + + if (!err) + f2fs_balance_fs(sbi, dn.node_changed); + + return err; +} + +int f2fs_write_inline_data(struct inode *inode, struct page *page) +{ + struct dnode_of_data dn; + int err; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, 0, LOOKUP_NODE); + if (err) + return err; + + if (!f2fs_has_inline_data(inode)) { + f2fs_put_dnode(&dn); + return -EAGAIN; + } + + f2fs_bug_on(F2FS_I_SB(inode), page->index); + + f2fs_wait_on_page_writeback(dn.inode_page, NODE, true, true); + memcpy_from_page(inline_data_addr(inode, dn.inode_page), + page, 0, MAX_INLINE_DATA(inode)); + set_page_dirty(dn.inode_page); + + f2fs_clear_page_cache_dirty_tag(page); + + set_inode_flag(inode, FI_APPEND_WRITE); + set_inode_flag(inode, FI_DATA_EXIST); + + clear_page_private_inline(dn.inode_page); + f2fs_put_dnode(&dn); + return 0; +} + +int f2fs_recover_inline_data(struct inode *inode, struct page *npage) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode *ri = NULL; + void *src_addr, *dst_addr; + struct page *ipage; + + /* + * The inline_data recovery policy is as follows. + * [prev.] [next] of inline_data flag + * o o -> recover inline_data + * o x -> remove inline_data, and then recover data blocks + * x o -> remove data blocks, and then recover inline_data + * x x -> recover data blocks + */ + if (IS_INODE(npage)) + ri = F2FS_INODE(npage); + + if (f2fs_has_inline_data(inode) && + ri && (ri->i_inline & F2FS_INLINE_DATA)) { +process_inline: + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + f2fs_wait_on_page_writeback(ipage, NODE, true, true); + + src_addr = inline_data_addr(inode, npage); + dst_addr = inline_data_addr(inode, ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); + + set_inode_flag(inode, FI_INLINE_DATA); + set_inode_flag(inode, FI_DATA_EXIST); + + set_page_dirty(ipage); + f2fs_put_page(ipage, 1); + return 1; + } + + if (f2fs_has_inline_data(inode)) { + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + f2fs_truncate_inline_inode(inode, ipage, 0); + stat_dec_inline_inode(inode); + clear_inode_flag(inode, FI_INLINE_DATA); + f2fs_put_page(ipage, 1); + } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { + int ret; + + ret = f2fs_truncate_blocks(inode, 0, false); + if (ret) + return ret; + stat_inc_inline_inode(inode); + goto process_inline; + } + return 0; +} + +struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, + const struct f2fs_filename *fname, + struct page **res_page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct f2fs_dir_entry *de; + struct f2fs_dentry_ptr d; + struct page *ipage; + void *inline_dentry; + + ipage = f2fs_get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) { + *res_page = ipage; + return NULL; + } + + inline_dentry = inline_data_addr(dir, ipage); + + make_dentry_ptr_inline(dir, &d, inline_dentry); + de = f2fs_find_target_dentry(&d, fname, NULL); + unlock_page(ipage); + if (IS_ERR(de)) { + *res_page = ERR_CAST(de); + de = NULL; + } + if (de) + *res_page = ipage; + else + f2fs_put_page(ipage, 0); + + return de; +} + +int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, + struct page *ipage) +{ + struct f2fs_dentry_ptr d; + void *inline_dentry; + + inline_dentry = inline_data_addr(inode, ipage); + + make_dentry_ptr_inline(inode, &d, inline_dentry); + f2fs_do_make_empty_dir(inode, parent, &d); + + set_page_dirty(ipage); + + /* update i_size to MAX_INLINE_DATA */ + if (i_size_read(inode) < MAX_INLINE_DATA(inode)) + f2fs_i_size_write(inode, MAX_INLINE_DATA(inode)); + return 0; +} + +/* + * NOTE: ipage is grabbed by caller, but if any error occurs, we should + * release ipage in this function. + */ +static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, + void *inline_dentry) +{ + struct page *page; + struct dnode_of_data dn; + struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr src, dst; + int err; + + page = f2fs_grab_cache_page(dir->i_mapping, 0, true); + if (!page) { + f2fs_put_page(ipage, 1); + return -ENOMEM; + } + + set_new_dnode(&dn, dir, ipage, NULL, 0); + err = f2fs_reserve_block(&dn, 0); + if (err) + goto out; + + if (unlikely(dn.data_blkaddr != NEW_ADDR)) { + f2fs_put_dnode(&dn); + set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK); + f2fs_warn(F2FS_P_SB(page), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", + __func__, dir->i_ino, dn.data_blkaddr); + f2fs_handle_error(F2FS_P_SB(page), ERROR_INVALID_BLKADDR); + err = -EFSCORRUPTED; + goto out; + } + + f2fs_wait_on_page_writeback(page, DATA, true, true); + + dentry_blk = page_address(page); + + /* + * Start by zeroing the full block, to ensure that all unused space is + * zeroed and no uninitialized memory is leaked to disk. + */ + memset(dentry_blk, 0, F2FS_BLKSIZE); + + make_dentry_ptr_inline(dir, &src, inline_dentry); + make_dentry_ptr_block(dir, &dst, dentry_blk); + + /* copy data from inline dentry block to new dentry block */ + memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); + memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); + memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); + + if (!PageUptodate(page)) + SetPageUptodate(page); + set_page_dirty(page); + + /* clear inline dir and flag after data writeback */ + f2fs_truncate_inline_inode(dir, ipage, 0); + + stat_dec_inline_dir(dir); + clear_inode_flag(dir, FI_INLINE_DENTRY); + + /* + * should retrieve reserved space which was used to keep + * inline_dentry's structure for backward compatibility. + */ + if (!f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(dir)) && + !f2fs_has_inline_xattr(dir)) + F2FS_I(dir)->i_inline_xattr_size = 0; + + f2fs_i_depth_write(dir, 1); + if (i_size_read(dir) < PAGE_SIZE) + f2fs_i_size_write(dir, PAGE_SIZE); +out: + f2fs_put_page(page, 1); + return err; +} + +static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) +{ + struct f2fs_dentry_ptr d; + unsigned long bit_pos = 0; + int err = 0; + + make_dentry_ptr_inline(dir, &d, inline_dentry); + + while (bit_pos < d.max) { + struct f2fs_dir_entry *de; + struct f2fs_filename fname; + nid_t ino; + umode_t fake_mode; + + if (!test_bit_le(bit_pos, d.bitmap)) { + bit_pos++; + continue; + } + + de = &d.dentry[bit_pos]; + + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + + /* + * We only need the disk_name and hash to move the dentry. + * We don't need the original or casefolded filenames. + */ + memset(&fname, 0, sizeof(fname)); + fname.disk_name.name = d.filename[bit_pos]; + fname.disk_name.len = le16_to_cpu(de->name_len); + fname.hash = de->hash_code; + + ino = le32_to_cpu(de->ino); + fake_mode = f2fs_get_de_type(de) << S_SHIFT; + + err = f2fs_add_regular_entry(dir, &fname, NULL, ino, fake_mode); + if (err) + goto punch_dentry_pages; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + } + return 0; +punch_dentry_pages: + truncate_inode_pages(&dir->i_data, 0); + f2fs_truncate_blocks(dir, 0, false); + f2fs_remove_dirty_inode(dir); + return err; +} + +static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, + void *inline_dentry) +{ + void *backup_dentry; + int err; + + backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), + MAX_INLINE_DATA(dir), GFP_F2FS_ZERO); + if (!backup_dentry) { + f2fs_put_page(ipage, 1); + return -ENOMEM; + } + + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); + f2fs_truncate_inline_inode(dir, ipage, 0); + + unlock_page(ipage); + + err = f2fs_add_inline_entries(dir, backup_dentry); + if (err) + goto recover; + + lock_page(ipage); + + stat_dec_inline_dir(dir); + clear_inode_flag(dir, FI_INLINE_DENTRY); + + /* + * should retrieve reserved space which was used to keep + * inline_dentry's structure for backward compatibility. + */ + if (!f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(dir)) && + !f2fs_has_inline_xattr(dir)) + F2FS_I(dir)->i_inline_xattr_size = 0; + + kfree(backup_dentry); + return 0; +recover: + lock_page(ipage); + f2fs_wait_on_page_writeback(ipage, NODE, true, true); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); + f2fs_i_depth_write(dir, 0); + f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); + set_page_dirty(ipage); + f2fs_put_page(ipage, 1); + + kfree(backup_dentry); + return err; +} + +static int do_convert_inline_dir(struct inode *dir, struct page *ipage, + void *inline_dentry) +{ + if (!F2FS_I(dir)->i_dir_level) + return f2fs_move_inline_dirents(dir, ipage, inline_dentry); + else + return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry); +} + +int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct page *ipage; + struct f2fs_filename fname; + void *inline_dentry = NULL; + int err = 0; + + if (!f2fs_has_inline_dentry(dir)) + return 0; + + f2fs_lock_op(sbi); + + err = f2fs_setup_filename(dir, &dentry->d_name, 0, &fname); + if (err) + goto out; + + ipage = f2fs_get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out_fname; + } + + if (f2fs_has_enough_room(dir, ipage, &fname)) { + f2fs_put_page(ipage, 1); + goto out_fname; + } + + inline_dentry = inline_data_addr(dir, ipage); + + err = do_convert_inline_dir(dir, ipage, inline_dentry); + if (!err) + f2fs_put_page(ipage, 1); +out_fname: + f2fs_free_filename(&fname); +out: + f2fs_unlock_op(sbi); + return err; +} + +int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct page *ipage; + unsigned int bit_pos; + void *inline_dentry = NULL; + struct f2fs_dentry_ptr d; + int slots = GET_DENTRY_SLOTS(fname->disk_name.len); + struct page *page = NULL; + int err = 0; + + ipage = f2fs_get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); + if (bit_pos >= d.max) { + err = do_convert_inline_dir(dir, ipage, inline_dentry); + if (err) + return err; + err = -EAGAIN; + goto out; + } + + if (inode) { + f2fs_down_write_nested(&F2FS_I(inode)->i_sem, + SINGLE_DEPTH_NESTING); + page = f2fs_init_inode_metadata(inode, dir, fname, ipage); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } + } + + f2fs_wait_on_page_writeback(ipage, NODE, true, true); + + f2fs_update_dentry(ino, mode, &d, &fname->disk_name, fname->hash, + bit_pos); + + set_page_dirty(ipage); + + /* we don't need to mark_inode_dirty now */ + if (inode) { + f2fs_i_pino_write(inode, dir->i_ino); + + /* synchronize inode page's data from inode cache */ + if (is_inode_flag_set(inode, FI_NEW_INODE)) + f2fs_update_inode(inode, page); + + f2fs_put_page(page, 1); + } + + f2fs_update_parent_metadata(dir, inode, 0); +fail: + if (inode) + f2fs_up_write(&F2FS_I(inode)->i_sem); +out: + f2fs_put_page(ipage, 1); + return err; +} + +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode) +{ + struct f2fs_dentry_ptr d; + void *inline_dentry; + int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + unsigned int bit_pos; + int i; + + lock_page(page); + f2fs_wait_on_page_writeback(page, NODE, true, true); + + inline_dentry = inline_data_addr(dir, page); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = dentry - d.dentry; + for (i = 0; i < slots; i++) + __clear_bit_le(bit_pos + i, d.bitmap); + + set_page_dirty(page); + f2fs_put_page(page, 1); + + dir->i_ctime = dir->i_mtime = current_time(dir); + f2fs_mark_inode_dirty_sync(dir, false); + + if (inode) + f2fs_drop_nlink(dir, inode); +} + +bool f2fs_empty_inline_dir(struct inode *dir) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct page *ipage; + unsigned int bit_pos = 2; + void *inline_dentry; + struct f2fs_dentry_ptr d; + + ipage = f2fs_get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) + return false; + + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos); + + f2fs_put_page(ipage, 1); + + if (bit_pos < d.max) + return false; + + return true; +} + +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, + struct fscrypt_str *fstr) +{ + struct inode *inode = file_inode(file); + struct page *ipage = NULL; + struct f2fs_dentry_ptr d; + void *inline_dentry = NULL; + int err; + + make_dentry_ptr_inline(inode, &d, inline_dentry); + + if (ctx->pos == d.max) + return 0; + + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + /* + * f2fs_readdir was protected by inode.i_rwsem, it is safe to access + * ipage without page's lock held. + */ + unlock_page(ipage); + + inline_dentry = inline_data_addr(inode, ipage); + + make_dentry_ptr_inline(inode, &d, inline_dentry); + + err = f2fs_fill_dentries(ctx, &d, 0, fstr); + if (!err) + ctx->pos = d.max; + + f2fs_put_page(ipage, 0); + return err < 0 ? err : 0; +} + +int f2fs_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) +{ + __u64 byteaddr, ilen; + __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED | + FIEMAP_EXTENT_LAST; + struct node_info ni; + struct page *ipage; + int err = 0; + + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + if ((S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !f2fs_has_inline_data(inode)) { + err = -EAGAIN; + goto out; + } + + if (S_ISDIR(inode->i_mode) && !f2fs_has_inline_dentry(inode)) { + err = -EAGAIN; + goto out; + } + + ilen = min_t(size_t, MAX_INLINE_DATA(inode), i_size_read(inode)); + if (start >= ilen) + goto out; + if (start + len < ilen) + ilen = start + len; + ilen -= start; + + err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni, false); + if (err) + goto out; + + byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; + byteaddr += (char *)inline_data_addr(inode, ipage) - + (char *)F2FS_INODE(ipage); + err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); + trace_f2fs_fiemap(inode, start, byteaddr, ilen, flags, err); +out: + f2fs_put_page(ipage, 1); + return err; +} diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c new file mode 100644 index 000000000..0010579f1 --- /dev/null +++ b/fs/f2fs/inode.c @@ -0,0 +1,957 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/inode.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/sched/mm.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "xattr.h" + +#include <trace/events/f2fs.h> + +#ifdef CONFIG_F2FS_FS_COMPRESSION +extern const struct address_space_operations f2fs_compress_aops; +#endif + +void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) +{ + if (is_inode_flag_set(inode, FI_NEW_INODE)) + return; + + if (f2fs_inode_dirtied(inode, sync)) + return; + + mark_inode_dirty_sync(inode); +} + +void f2fs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = F2FS_I(inode)->i_flags; + unsigned int new_fl = 0; + + if (flags & F2FS_SYNC_FL) + new_fl |= S_SYNC; + if (flags & F2FS_APPEND_FL) + new_fl |= S_APPEND; + if (flags & F2FS_IMMUTABLE_FL) + new_fl |= S_IMMUTABLE; + if (flags & F2FS_NOATIME_FL) + new_fl |= S_NOATIME; + if (flags & F2FS_DIRSYNC_FL) + new_fl |= S_DIRSYNC; + if (file_is_encrypt(inode)) + new_fl |= S_ENCRYPTED; + if (file_is_verity(inode)) + new_fl |= S_VERITY; + if (flags & F2FS_CASEFOLD_FL) + new_fl |= S_CASEFOLD; + inode_set_flags(inode, new_fl, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC| + S_ENCRYPTED|S_VERITY|S_CASEFOLD); +} + +static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) +{ + int extra_size = get_extra_isize(inode); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + if (ri->i_addr[extra_size]) + inode->i_rdev = old_decode_dev( + le32_to_cpu(ri->i_addr[extra_size])); + else + inode->i_rdev = new_decode_dev( + le32_to_cpu(ri->i_addr[extra_size + 1])); + } +} + +static int __written_first_block(struct f2fs_sb_info *sbi, + struct f2fs_inode *ri) +{ + block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); + + if (!__is_valid_data_blkaddr(addr)) + return 1; + if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + return -EFSCORRUPTED; + } + return 0; +} + +static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) +{ + int extra_size = get_extra_isize(inode); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + ri->i_addr[extra_size] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + ri->i_addr[extra_size + 1] = 0; + } else { + ri->i_addr[extra_size] = 0; + ri->i_addr[extra_size + 1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + ri->i_addr[extra_size + 2] = 0; + } + } +} + +static void __recover_inline_status(struct inode *inode, struct page *ipage) +{ + void *inline_data = inline_data_addr(inode, ipage); + __le32 *start = inline_data; + __le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32); + + while (start < end) { + if (*start++) { + f2fs_wait_on_page_writeback(ipage, NODE, true, true); + + set_inode_flag(inode, FI_DATA_EXIST); + set_raw_inline(inode, F2FS_INODE(ipage)); + set_page_dirty(ipage); + return; + } + } + return; +} + +static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + + if (!f2fs_sb_has_inode_chksum(sbi)) + return false; + + if (!IS_INODE(page) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + return false; + + if (!F2FS_FITS_IN_INODE(ri, le16_to_cpu(ri->i_extra_isize), + i_inode_checksum)) + return false; + + return true; +} + +static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_node *node = F2FS_NODE(page); + struct f2fs_inode *ri = &node->i; + __le32 ino = node->footer.ino; + __le32 gen = ri->i_generation; + __u32 chksum, chksum_seed; + __u32 dummy_cs = 0; + unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum); + unsigned int cs_size = sizeof(dummy_cs); + + chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino, + sizeof(ino)); + chksum_seed = f2fs_chksum(sbi, chksum, (__u8 *)&gen, sizeof(gen)); + + chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset); + chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size); + offset += cs_size; + chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset, + F2FS_BLKSIZE - offset); + return chksum; +} + +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri; + __u32 provided, calculated; + + if (unlikely(is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN))) + return true; + +#ifdef CONFIG_F2FS_CHECK_FS + if (!f2fs_enable_inode_chksum(sbi, page)) +#else + if (!f2fs_enable_inode_chksum(sbi, page) || + PageDirty(page) || PageWriteback(page)) +#endif + return true; + + ri = &F2FS_NODE(page)->i; + provided = le32_to_cpu(ri->i_inode_checksum); + calculated = f2fs_inode_chksum(sbi, page); + + if (provided != calculated) + f2fs_warn(sbi, "checksum invalid, nid = %lu, ino_of_node = %x, %x vs. %x", + page->index, ino_of_node(page), provided, calculated); + + return provided == calculated; +} + +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + + if (!f2fs_enable_inode_chksum(sbi, page)) + return; + + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); +} + +static bool sanity_check_inode(struct inode *inode, struct page *node_page) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_inode *ri = F2FS_INODE(node_page); + unsigned long long iblocks; + + iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks); + if (!iblocks) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, run fsck to fix.", + __func__, inode->i_ino, iblocks); + return false; + } + + if (ino_of_node(node_page) != nid_of_node(node_page)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: corrupted inode footer i_ino=%lx, ino,nid: [%u, %u] run fsck to fix.", + __func__, inode->i_ino, + ino_of_node(node_page), nid_of_node(node_page)); + return false; + } + + if (f2fs_sb_has_flexible_inline_xattr(sbi) + && !f2fs_has_extra_attr(inode)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: corrupted inode ino=%lx, run fsck to fix.", + __func__, inode->i_ino); + return false; + } + + if (f2fs_has_extra_attr(inode) && + !f2fs_sb_has_extra_attr(sbi)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) is with extra_attr, but extra_attr feature is off", + __func__, inode->i_ino); + return false; + } + + if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE || + fi->i_extra_isize % sizeof(__le32)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, max: %zu", + __func__, inode->i_ino, fi->i_extra_isize, + F2FS_TOTAL_EXTRA_ATTR_SIZE); + return false; + } + + if (f2fs_has_extra_attr(inode) && + f2fs_sb_has_flexible_inline_xattr(sbi) && + f2fs_has_inline_xattr(inode) && + (!fi->i_inline_xattr_size || + fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %zu", + __func__, inode->i_ino, fi->i_inline_xattr_size, + MAX_INLINE_XATTR_SIZE); + return false; + } + + if (f2fs_sanity_check_inline_data(inode)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", + __func__, inode->i_ino, inode->i_mode); + return false; + } + + if (f2fs_has_inline_dentry(inode) && !S_ISDIR(inode->i_mode)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_dentry, run fsck to fix", + __func__, inode->i_ino, inode->i_mode); + return false; + } + + if ((fi->i_flags & F2FS_CASEFOLD_FL) && !f2fs_sb_has_casefold(sbi)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) has casefold flag, but casefold feature is off", + __func__, inode->i_ino); + return false; + } + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) && + fi->i_flags & F2FS_COMPR_FL && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, + i_log_cluster_size)) { + if (ri->i_compress_algorithm >= COMPRESS_MAX) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " + "compress algorithm: %u, run fsck to fix", + __func__, inode->i_ino, + ri->i_compress_algorithm); + return false; + } + if (le64_to_cpu(ri->i_compr_blocks) > + SECTOR_TO_BLOCK(inode->i_blocks)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) has inconsistent " + "i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix", + __func__, inode->i_ino, + le64_to_cpu(ri->i_compr_blocks), + SECTOR_TO_BLOCK(inode->i_blocks)); + return false; + } + if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || + ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " + "log cluster size: %u, run fsck to fix", + __func__, inode->i_ino, + ri->i_log_cluster_size); + return false; + } + } + + return true; +} + +static void init_idisk_time(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + fi->i_disk_time[0] = inode->i_atime; + fi->i_disk_time[1] = inode->i_ctime; + fi->i_disk_time[2] = inode->i_mtime; + fi->i_disk_time[3] = fi->i_crtime; +} + +static int do_read_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct page *node_page; + struct f2fs_inode *ri; + projid_t i_projid; + int err; + + /* Check if ino is within scope */ + if (f2fs_check_nid_range(sbi, inode->i_ino)) + return -EINVAL; + + node_page = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(node_page)) + return PTR_ERR(node_page); + + ri = F2FS_INODE(node_page); + + inode->i_mode = le16_to_cpu(ri->i_mode); + i_uid_write(inode, le32_to_cpu(ri->i_uid)); + i_gid_write(inode, le32_to_cpu(ri->i_gid)); + set_nlink(inode, le32_to_cpu(ri->i_links)); + inode->i_size = le64_to_cpu(ri->i_size); + inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1); + + inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); + inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); + inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime); + inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); + inode->i_generation = le32_to_cpu(ri->i_generation); + if (S_ISDIR(inode->i_mode)) + fi->i_current_depth = le32_to_cpu(ri->i_current_depth); + else if (S_ISREG(inode->i_mode)) + fi->i_gc_failures[GC_FAILURE_PIN] = + le16_to_cpu(ri->i_gc_failures); + fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); + fi->i_flags = le32_to_cpu(ri->i_flags); + if (S_ISREG(inode->i_mode)) + fi->i_flags &= ~F2FS_PROJINHERIT_FL; + bitmap_zero(fi->flags, FI_MAX); + fi->i_advise = ri->i_advise; + fi->i_pino = le32_to_cpu(ri->i_pino); + fi->i_dir_level = ri->i_dir_level; + + get_inline_info(inode, ri); + + fi->i_extra_isize = f2fs_has_extra_attr(inode) ? + le16_to_cpu(ri->i_extra_isize) : 0; + + if (f2fs_sb_has_flexible_inline_xattr(sbi)) { + fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size); + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + fi->i_inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } else { + + /* + * Previous inline data or directory always reserved 200 bytes + * in inode layout, even if inline_xattr is disabled. In order + * to keep inline_dentry's structure for backward compatibility, + * we get the space back only from inline_data. + */ + fi->i_inline_xattr_size = 0; + } + + if (!sanity_check_inode(inode, node_page)) { + f2fs_put_page(node_page, 1); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return -EFSCORRUPTED; + } + + /* check data exist */ + if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) + __recover_inline_status(inode, node_page); + + /* try to recover cold bit for non-dir inode */ + if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_page)) { + f2fs_wait_on_page_writeback(node_page, NODE, true, true); + set_cold_node(node_page, false); + set_page_dirty(node_page); + } + + /* get rdev by using inline_info */ + __get_inode_rdev(inode, ri); + + if (S_ISREG(inode->i_mode)) { + err = __written_first_block(sbi, ri); + if (err < 0) { + f2fs_put_page(node_page, 1); + return err; + } + if (!err) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + } + + if (!f2fs_need_inode_block_update(sbi, inode->i_ino)) + fi->last_disk_size = inode->i_size; + + if (fi->i_flags & F2FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + i_projid = (projid_t)le32_to_cpu(ri->i_projid); + else + i_projid = F2FS_DEF_PROJID; + fi->i_projid = make_kprojid(&init_user_ns, i_projid); + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_inode_crtime(sbi) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) { + fi->i_crtime.tv_sec = le64_to_cpu(ri->i_crtime); + fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); + } + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) && + (fi->i_flags & F2FS_COMPR_FL)) { + if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, + i_log_cluster_size)) { + unsigned short compress_flag; + + atomic_set(&fi->i_compr_blocks, + le64_to_cpu(ri->i_compr_blocks)); + fi->i_compress_algorithm = ri->i_compress_algorithm; + fi->i_log_cluster_size = ri->i_log_cluster_size; + compress_flag = le16_to_cpu(ri->i_compress_flag); + fi->i_compress_level = compress_flag >> + COMPRESS_LEVEL_OFFSET; + fi->i_compress_flag = compress_flag & + GENMASK(COMPRESS_LEVEL_OFFSET - 1, 0); + fi->i_cluster_size = BIT(fi->i_log_cluster_size); + set_inode_flag(inode, FI_COMPRESSED_FILE); + } + } + + init_idisk_time(inode); + + /* Need all the flag bits */ + f2fs_init_read_extent_tree(inode, node_page); + + if (!sanity_check_extent_cache(inode)) { + f2fs_put_page(node_page, 1); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return -EFSCORRUPTED; + } + + f2fs_put_page(node_page, 1); + + stat_inc_inline_xattr(inode); + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + stat_inc_compr_inode(inode); + stat_add_compr_blocks(inode, atomic_read(&fi->i_compr_blocks)); + + return 0; +} + +static bool is_meta_ino(struct f2fs_sb_info *sbi, unsigned int ino) +{ + return ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi) || + ino == F2FS_COMPRESS_INO(sbi); +} + +struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *inode; + int ret = 0; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!(inode->i_state & I_NEW)) { + if (is_meta_ino(sbi, ino)) { + f2fs_err(sbi, "inaccessible inode: %lu, run fsck to repair", ino); + set_sbi_flag(sbi, SBI_NEED_FSCK); + ret = -EFSCORRUPTED; + trace_f2fs_iget_exit(inode, ret); + iput(inode); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return ERR_PTR(ret); + } + + trace_f2fs_iget(inode); + return inode; + } + + if (is_meta_ino(sbi, ino)) + goto make_now; + + ret = do_read_inode(inode); + if (ret) + goto bad_inode; +make_now: + if (ino == F2FS_NODE_INO(sbi)) { + inode->i_mapping->a_ops = &f2fs_node_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + } else if (ino == F2FS_META_INO(sbi)) { + inode->i_mapping->a_ops = &f2fs_meta_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + } else if (ino == F2FS_COMPRESS_INO(sbi)) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + inode->i_mapping->a_ops = &f2fs_compress_aops; + /* + * generic_error_remove_page only truncates pages of regular + * inode + */ + inode->i_mode |= S_IFREG; +#endif + mapping_set_gfp_mask(inode->i_mapping, + GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE); + } else if (S_ISREG(inode->i_mode)) { + inode->i_op = &f2fs_file_inode_operations; + inode->i_fop = &f2fs_file_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &f2fs_dir_inode_operations; + inode->i_fop = &f2fs_dir_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + } else if (S_ISLNK(inode->i_mode)) { + if (file_is_encrypt(inode)) + inode->i_op = &f2fs_encrypted_symlink_inode_operations; + else + inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &f2fs_dblock_aops; + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + inode->i_op = &f2fs_special_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + } else { + ret = -EIO; + goto bad_inode; + } + f2fs_set_inode_flags(inode); + + if (file_should_truncate(inode) && + !is_sbi_flag_set(sbi, SBI_POR_DOING)) { + ret = f2fs_truncate(inode); + if (ret) + goto bad_inode; + file_dont_truncate(inode); + } + + unlock_new_inode(inode); + trace_f2fs_iget(inode); + return inode; + +bad_inode: + f2fs_inode_synced(inode); + iget_failed(inode); + trace_f2fs_iget_exit(inode, ret); + return ERR_PTR(ret); +} + +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; +retry: + inode = f2fs_iget(sb, ino); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOMEM) { + memalloc_retry_wait(GFP_NOFS); + goto retry; + } + } + return inode; +} + +void f2fs_update_inode(struct inode *inode, struct page *node_page) +{ + struct f2fs_inode *ri; + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; + + f2fs_wait_on_page_writeback(node_page, NODE, true, true); + set_page_dirty(node_page); + + f2fs_inode_synced(inode); + + ri = F2FS_INODE(node_page); + + ri->i_mode = cpu_to_le16(inode->i_mode); + ri->i_advise = F2FS_I(inode)->i_advise; + ri->i_uid = cpu_to_le32(i_uid_read(inode)); + ri->i_gid = cpu_to_le32(i_gid_read(inode)); + ri->i_links = cpu_to_le32(inode->i_nlink); + ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); + + if (!f2fs_is_atomic_file(inode) || + is_inode_flag_set(inode, FI_ATOMIC_COMMITTED)) + ri->i_size = cpu_to_le64(i_size_read(inode)); + + if (et) { + read_lock(&et->lock); + set_raw_read_extent(&et->largest, &ri->i_ext); + read_unlock(&et->lock); + } else { + memset(&ri->i_ext, 0, sizeof(ri->i_ext)); + } + set_raw_inline(inode, ri); + + ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); + ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); + ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + if (S_ISDIR(inode->i_mode)) + ri->i_current_depth = + cpu_to_le32(F2FS_I(inode)->i_current_depth); + else if (S_ISREG(inode->i_mode)) + ri->i_gc_failures = + cpu_to_le16(F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]); + ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid); + ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); + ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); + ri->i_generation = cpu_to_le32(inode->i_generation); + ri->i_dir_level = F2FS_I(inode)->i_dir_level; + + if (f2fs_has_extra_attr(inode)) { + ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + + if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode))) + ri->i_inline_xattr_size = + cpu_to_le16(F2FS_I(inode)->i_inline_xattr_size); + + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_projid)) { + projid_t i_projid; + + i_projid = from_kprojid(&init_user_ns, + F2FS_I(inode)->i_projid); + ri->i_projid = cpu_to_le32(i_projid); + } + + if (f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_crtime)) { + ri->i_crtime = + cpu_to_le64(F2FS_I(inode)->i_crtime.tv_sec); + ri->i_crtime_nsec = + cpu_to_le32(F2FS_I(inode)->i_crtime.tv_nsec); + } + + if (f2fs_sb_has_compression(F2FS_I_SB(inode)) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_log_cluster_size)) { + unsigned short compress_flag; + + ri->i_compr_blocks = + cpu_to_le64(atomic_read( + &F2FS_I(inode)->i_compr_blocks)); + ri->i_compress_algorithm = + F2FS_I(inode)->i_compress_algorithm; + compress_flag = F2FS_I(inode)->i_compress_flag | + F2FS_I(inode)->i_compress_level << + COMPRESS_LEVEL_OFFSET; + ri->i_compress_flag = cpu_to_le16(compress_flag); + ri->i_log_cluster_size = + F2FS_I(inode)->i_log_cluster_size; + } + } + + __set_inode_rdev(inode, ri); + + /* deleted inode */ + if (inode->i_nlink == 0) + clear_page_private_inline(node_page); + + init_idisk_time(inode); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page); +#endif +} + +void f2fs_update_inode_page(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct page *node_page; + int count = 0; +retry: + node_page = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(node_page)) { + int err = PTR_ERR(node_page); + + /* The node block was truncated. */ + if (err == -ENOENT) + return; + + if (err == -ENOMEM || ++count <= DEFAULT_RETRY_IO_COUNT) + goto retry; + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_UPDATE_INODE); + return; + } + f2fs_update_inode(inode, node_page); + f2fs_put_page(node_page, 1); +} + +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + return 0; + + /* + * atime could be updated without dirtying f2fs inode in lazytime mode + */ + if (f2fs_is_time_consistent(inode) && + !is_inode_flag_set(inode, FI_DIRTY_INODE)) + return 0; + + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; + + /* + * We need to balance fs here to prevent from producing dirty node pages + * during the urgent cleaning time when running out of free sections. + */ + f2fs_update_inode_page(inode); + if (wbc && wbc->nr_to_write) + f2fs_balance_fs(sbi, true); + return 0; +} + +/* + * Called at the last iput() if i_nlink is zero + */ +void f2fs_evict_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + nid_t xnid = fi->i_xattr_nid; + int err = 0; + + f2fs_abort_atomic_write(inode, true); + + if (fi->cow_inode) { + clear_inode_flag(fi->cow_inode, FI_COW_FILE); + iput(fi->cow_inode); + fi->cow_inode = NULL; + } + + trace_f2fs_evict_inode(inode); + truncate_inode_pages_final(&inode->i_data); + + if ((inode->i_nlink || is_bad_inode(inode)) && + test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(sbi, inode->i_ino); + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_COMPRESS_INO(sbi)) + goto out_clear; + + f2fs_bug_on(sbi, get_dirty_pages(inode)); + f2fs_remove_dirty_inode(inode); + + f2fs_destroy_extent_tree(inode); + + if (inode->i_nlink || is_bad_inode(inode)) + goto no_delete; + + err = f2fs_dquot_initialize(inode); + if (err) { + err = 0; + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + } + + f2fs_remove_ino_entry(sbi, inode->i_ino, APPEND_INO); + f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); + + if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING)) + sb_start_intwrite(inode->i_sb); + set_inode_flag(inode, FI_NO_ALLOC); + i_size_write(inode, 0); +retry: + if (F2FS_HAS_BLOCKS(inode)) + err = f2fs_truncate(inode); + + if (time_to_inject(sbi, FAULT_EVICT_INODE)) { + f2fs_show_injection_info(sbi, FAULT_EVICT_INODE); + err = -EIO; + } + + if (!err) { + f2fs_lock_op(sbi); + err = f2fs_remove_inode_page(inode); + f2fs_unlock_op(sbi); + if (err == -ENOENT) { + err = 0; + + /* + * in fuzzed image, another node may has the same + * block address as inode's, if it was truncated + * previously, truncation of inode node will fail. + */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + f2fs_warn(F2FS_I_SB(inode), + "f2fs_evict_inode: inconsistent node id, ino:%lu", + inode->i_ino); + f2fs_inode_synced(inode); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + } + } + + /* give more chances, if ENOMEM case */ + if (err == -ENOMEM) { + err = 0; + goto retry; + } + + if (err) { + f2fs_update_inode_page(inode); + if (dquot_initialize_needed(inode)) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + } + if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING)) + sb_end_intwrite(inode->i_sb); +no_delete: + dquot_drop(inode); + + stat_dec_inline_xattr(inode); + stat_dec_inline_dir(inode); + stat_dec_inline_inode(inode); + stat_dec_compr_inode(inode); + stat_sub_compr_blocks(inode, + atomic_read(&fi->i_compr_blocks)); + + if (likely(!f2fs_cp_error(sbi) && + !is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + else + f2fs_inode_synced(inode); + + /* for the case f2fs_new_inode() was failed, .i_ino is zero, skip it */ + if (inode->i_ino) + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, + inode->i_ino); + if (xnid) + invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); + if (inode->i_nlink) { + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) + f2fs_add_ino_entry(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) + f2fs_add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + } + if (is_inode_flag_set(inode, FI_FREE_NID)) { + f2fs_alloc_nid_failed(sbi, inode->i_ino); + clear_inode_flag(inode, FI_FREE_NID); + } else { + /* + * If xattr nid is corrupted, we can reach out error condition, + * err & !f2fs_exist_written_data(sbi, inode->i_ino, ORPHAN_INO)). + * In that case, f2fs_check_nid_range() is enough to give a clue. + */ + } +out_clear: + fscrypt_put_encryption_info(inode); + fsverity_cleanup_inode(inode); + clear_inode(inode); +} + +/* caller should call f2fs_lock_op() */ +void f2fs_handle_failed_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct node_info ni; + int err; + + /* + * clear nlink of inode in order to release resource of inode + * immediately. + */ + clear_nlink(inode); + + /* + * we must call this to avoid inode being remained as dirty, resulting + * in a panic when flushing dirty inodes in gdirty_list. + */ + f2fs_update_inode_page(inode); + f2fs_inode_synced(inode); + + /* don't make bad inode, since it becomes a regular file. */ + unlock_new_inode(inode); + + /* + * Note: we should add inode to orphan list before f2fs_unlock_op() + * so we can prevent losing this orphan when encoutering checkpoint + * and following suddenly power-off. + */ + err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + set_inode_flag(inode, FI_FREE_NID); + f2fs_warn(sbi, "May loss orphan inode, run fsck to fix."); + goto out; + } + + if (ni.blk_addr != NULL_ADDR) { + err = f2fs_acquire_orphan_inode(sbi); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "Too many orphan inodes, run fsck to fix."); + } else { + f2fs_add_orphan_inode(inode); + } + f2fs_alloc_nid_done(sbi, inode->i_ino); + } else { + set_inode_flag(inode, FI_FREE_NID); + } + +out: + f2fs_unlock_op(sbi); + + /* iput will drop the inode object */ + iput(inode); +} diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c new file mode 100644 index 000000000..02393c95c --- /dev/null +++ b/fs/f2fs/iostat.c @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs iostat support + * + * Copyright 2021 Google LLC + * Author: Daeho Jeong <daehojeong@google.com> + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/seq_file.h> + +#include "f2fs.h" +#include "iostat.h" +#include <trace/events/f2fs.h> + +#define NUM_PREALLOC_IOSTAT_CTXS 128 +static struct kmem_cache *bio_iostat_ctx_cache; +static mempool_t *bio_iostat_ctx_pool; + +int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + time64_t now = ktime_get_real_seconds(); + + if (!sbi->iostat_enable) + return 0; + + seq_printf(seq, "time: %-16llu\n", now); + + /* print app write IOs */ + seq_puts(seq, "[WRITE]\n"); + seq_printf(seq, "app buffered data: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_IO]); + seq_printf(seq, "app direct data: %-16llu\n", + sbi->rw_iostat[APP_DIRECT_IO]); + seq_printf(seq, "app mapped data: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_IO]); + seq_printf(seq, "app buffered cdata: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_CDATA_IO]); + seq_printf(seq, "app mapped cdata: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_CDATA_IO]); + + /* print fs write IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->rw_iostat[FS_DATA_IO]); + seq_printf(seq, "fs cdata: %-16llu\n", + sbi->rw_iostat[FS_CDATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->rw_iostat[FS_NODE_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->rw_iostat[FS_META_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->rw_iostat[FS_GC_DATA_IO]); + seq_printf(seq, "fs gc node: %-16llu\n", + sbi->rw_iostat[FS_GC_NODE_IO]); + seq_printf(seq, "fs cp data: %-16llu\n", + sbi->rw_iostat[FS_CP_DATA_IO]); + seq_printf(seq, "fs cp node: %-16llu\n", + sbi->rw_iostat[FS_CP_NODE_IO]); + seq_printf(seq, "fs cp meta: %-16llu\n", + sbi->rw_iostat[FS_CP_META_IO]); + + /* print app read IOs */ + seq_puts(seq, "[READ]\n"); + seq_printf(seq, "app buffered data: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_READ_IO]); + seq_printf(seq, "app direct data: %-16llu\n", + sbi->rw_iostat[APP_DIRECT_READ_IO]); + seq_printf(seq, "app mapped data: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_READ_IO]); + seq_printf(seq, "app buffered cdata: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO]); + seq_printf(seq, "app mapped cdata: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO]); + + /* print fs read IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->rw_iostat[FS_DATA_READ_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->rw_iostat[FS_GDATA_READ_IO]); + seq_printf(seq, "fs cdata: %-16llu\n", + sbi->rw_iostat[FS_CDATA_READ_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->rw_iostat[FS_NODE_READ_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->rw_iostat[FS_META_READ_IO]); + + /* print other IOs */ + seq_puts(seq, "[OTHER]\n"); + seq_printf(seq, "fs discard: %-16llu\n", + sbi->rw_iostat[FS_DISCARD]); + + return 0; +} + +static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) +{ + int io, idx = 0; + unsigned int cnt; + struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + unsigned long flags; + + spin_lock_irqsave(&sbi->iostat_lat_lock, flags); + for (idx = 0; idx < MAX_IO_TYPE; idx++) { + for (io = 0; io < NR_PAGE_TYPE; io++) { + cnt = io_lat->bio_cnt[idx][io]; + iostat_lat[idx][io].peak_lat = + jiffies_to_msecs(io_lat->peak_lat[idx][io]); + iostat_lat[idx][io].cnt = cnt; + iostat_lat[idx][io].avg_lat = cnt ? + jiffies_to_msecs(io_lat->sum_lat[idx][io]) / cnt : 0; + io_lat->sum_lat[idx][io] = 0; + io_lat->peak_lat[idx][io] = 0; + io_lat->bio_cnt[idx][io] = 0; + } + } + spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); + + trace_f2fs_iostat_latency(sbi, iostat_lat); +} + +static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) +{ + unsigned long long iostat_diff[NR_IO_TYPE]; + int i; + unsigned long flags; + + if (time_is_after_jiffies(sbi->iostat_next_period)) + return; + + /* Need double check under the lock */ + spin_lock_irqsave(&sbi->iostat_lock, flags); + if (time_is_after_jiffies(sbi->iostat_next_period)) { + spin_unlock_irqrestore(&sbi->iostat_lock, flags); + return; + } + sbi->iostat_next_period = jiffies + + msecs_to_jiffies(sbi->iostat_period_ms); + + for (i = 0; i < NR_IO_TYPE; i++) { + iostat_diff[i] = sbi->rw_iostat[i] - + sbi->prev_rw_iostat[i]; + sbi->prev_rw_iostat[i] = sbi->rw_iostat[i]; + } + spin_unlock_irqrestore(&sbi->iostat_lock, flags); + + trace_f2fs_iostat(sbi, iostat_diff); + + __record_iostat_latency(sbi); +} + +void f2fs_reset_iostat(struct f2fs_sb_info *sbi) +{ + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + int i; + + spin_lock_irq(&sbi->iostat_lock); + for (i = 0; i < NR_IO_TYPE; i++) { + sbi->rw_iostat[i] = 0; + sbi->prev_rw_iostat[i] = 0; + } + spin_unlock_irq(&sbi->iostat_lock); + + spin_lock_irq(&sbi->iostat_lat_lock); + memset(io_lat, 0, sizeof(struct iostat_lat_info)); + spin_unlock_irq(&sbi->iostat_lat_lock); +} + +void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, + enum iostat_type type, unsigned long long io_bytes) +{ + unsigned long flags; + + if (!sbi->iostat_enable) + return; + + spin_lock_irqsave(&sbi->iostat_lock, flags); + sbi->rw_iostat[type] += io_bytes; + + if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO) + sbi->rw_iostat[APP_WRITE_IO] += io_bytes; + + if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO) + sbi->rw_iostat[APP_READ_IO] += io_bytes; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (inode && f2fs_compressed_file(inode)) { + if (type == APP_BUFFERED_IO) + sbi->rw_iostat[APP_BUFFERED_CDATA_IO] += io_bytes; + + if (type == APP_BUFFERED_READ_IO) + sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO] += io_bytes; + + if (type == APP_MAPPED_READ_IO) + sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO] += io_bytes; + + if (type == APP_MAPPED_IO) + sbi->rw_iostat[APP_MAPPED_CDATA_IO] += io_bytes; + + if (type == FS_DATA_READ_IO) + sbi->rw_iostat[FS_CDATA_READ_IO] += io_bytes; + + if (type == FS_DATA_IO) + sbi->rw_iostat[FS_CDATA_IO] += io_bytes; + } +#endif + + spin_unlock_irqrestore(&sbi->iostat_lock, flags); + + f2fs_record_iostat(sbi); +} + +static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, + int rw, bool is_sync) +{ + unsigned long ts_diff; + unsigned int iotype = iostat_ctx->type; + struct f2fs_sb_info *sbi = iostat_ctx->sbi; + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + int idx; + unsigned long flags; + + if (!sbi->iostat_enable) + return; + + ts_diff = jiffies - iostat_ctx->submit_ts; + if (iotype == META_FLUSH) { + iotype = META; + } else if (iotype >= NR_PAGE_TYPE) { + f2fs_warn(sbi, "%s: %d over NR_PAGE_TYPE", __func__, iotype); + return; + } + + if (rw == 0) { + idx = READ_IO; + } else { + if (is_sync) + idx = WRITE_SYNC_IO; + else + idx = WRITE_ASYNC_IO; + } + + spin_lock_irqsave(&sbi->iostat_lat_lock, flags); + io_lat->sum_lat[idx][iotype] += ts_diff; + io_lat->bio_cnt[idx][iotype]++; + if (ts_diff > io_lat->peak_lat[idx][iotype]) + io_lat->peak_lat[idx][iotype] = ts_diff; + spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); +} + +void iostat_update_and_unbind_ctx(struct bio *bio, int rw) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + bool is_sync = bio->bi_opf & REQ_SYNC; + + if (rw == 0) + bio->bi_private = iostat_ctx->post_read_ctx; + else + bio->bi_private = iostat_ctx->sbi; + __update_iostat_latency(iostat_ctx, rw, is_sync); + mempool_free(iostat_ctx, bio_iostat_ctx_pool); +} + +void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, + struct bio *bio, struct bio_post_read_ctx *ctx) +{ + struct bio_iostat_ctx *iostat_ctx; + /* Due to the mempool, this never fails. */ + iostat_ctx = mempool_alloc(bio_iostat_ctx_pool, GFP_NOFS); + iostat_ctx->sbi = sbi; + iostat_ctx->submit_ts = 0; + iostat_ctx->type = 0; + iostat_ctx->post_read_ctx = ctx; + bio->bi_private = iostat_ctx; +} + +int __init f2fs_init_iostat_processing(void) +{ + bio_iostat_ctx_cache = + kmem_cache_create("f2fs_bio_iostat_ctx", + sizeof(struct bio_iostat_ctx), 0, 0, NULL); + if (!bio_iostat_ctx_cache) + goto fail; + bio_iostat_ctx_pool = + mempool_create_slab_pool(NUM_PREALLOC_IOSTAT_CTXS, + bio_iostat_ctx_cache); + if (!bio_iostat_ctx_pool) + goto fail_free_cache; + return 0; + +fail_free_cache: + kmem_cache_destroy(bio_iostat_ctx_cache); +fail: + return -ENOMEM; +} + +void f2fs_destroy_iostat_processing(void) +{ + mempool_destroy(bio_iostat_ctx_pool); + kmem_cache_destroy(bio_iostat_ctx_cache); +} + +int f2fs_init_iostat(struct f2fs_sb_info *sbi) +{ + /* init iostat info */ + spin_lock_init(&sbi->iostat_lock); + spin_lock_init(&sbi->iostat_lat_lock); + sbi->iostat_enable = false; + sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS; + sbi->iostat_io_lat = f2fs_kzalloc(sbi, sizeof(struct iostat_lat_info), + GFP_KERNEL); + if (!sbi->iostat_io_lat) + return -ENOMEM; + + return 0; +} + +void f2fs_destroy_iostat(struct f2fs_sb_info *sbi) +{ + kfree(sbi->iostat_io_lat); +} diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h new file mode 100644 index 000000000..2c048307b --- /dev/null +++ b/fs/f2fs/iostat.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2021 Google LLC + * Author: Daeho Jeong <daehojeong@google.com> + */ +#ifndef __F2FS_IOSTAT_H__ +#define __F2FS_IOSTAT_H__ + +struct bio_post_read_ctx; + +#ifdef CONFIG_F2FS_IOSTAT + +#define DEFAULT_IOSTAT_PERIOD_MS 3000 +#define MIN_IOSTAT_PERIOD_MS 100 +/* maximum period of iostat tracing is 1 day */ +#define MAX_IOSTAT_PERIOD_MS 8640000 + +enum { + READ_IO, + WRITE_SYNC_IO, + WRITE_ASYNC_IO, + MAX_IO_TYPE, +}; + +struct iostat_lat_info { + unsigned long sum_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* sum of io latencies */ + unsigned long peak_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* peak io latency */ + unsigned int bio_cnt[MAX_IO_TYPE][NR_PAGE_TYPE]; /* bio count */ +}; + +extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq, + void *offset); +extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi); +extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, + enum iostat_type type, unsigned long long io_bytes); + +struct bio_iostat_ctx { + struct f2fs_sb_info *sbi; + unsigned long submit_ts; + enum page_type type; + struct bio_post_read_ctx *post_read_ctx; +}; + +static inline void iostat_update_submit_ctx(struct bio *bio, + enum page_type type) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + + iostat_ctx->submit_ts = jiffies; + iostat_ctx->type = type; +} + +static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + + return iostat_ctx->post_read_ctx; +} + +extern void iostat_update_and_unbind_ctx(struct bio *bio, int rw); +extern void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, + struct bio *bio, struct bio_post_read_ctx *ctx); +extern int f2fs_init_iostat_processing(void); +extern void f2fs_destroy_iostat_processing(void); +extern int f2fs_init_iostat(struct f2fs_sb_info *sbi); +extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi); +#else +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, + enum iostat_type type, unsigned long long io_bytes) {} +static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {} +static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, + struct bio *bio, struct bio_post_read_ctx *ctx) {} +static inline void iostat_update_submit_ctx(struct bio *bio, + enum page_type type) {} +static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio) +{ + return bio->bi_private; +} +static inline int f2fs_init_iostat_processing(void) { return 0; } +static inline void f2fs_destroy_iostat_processing(void) {} +static inline int f2fs_init_iostat(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_iostat(struct f2fs_sb_info *sbi) {} +#endif +#endif /* __F2FS_IOSTAT_H__ */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c new file mode 100644 index 000000000..328cd20b1 --- /dev/null +++ b/fs/f2fs/namei.c @@ -0,0 +1,1400 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/namei.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/pagemap.h> +#include <linux/sched.h> +#include <linux/ctype.h> +#include <linux/random.h> +#include <linux/dcache.h> +#include <linux/namei.h> +#include <linux/quotaops.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "xattr.h" +#include "acl.h" +#include <trace/events/f2fs.h> + +static inline int is_extension_exist(const unsigned char *s, const char *sub, + bool tmp_ext) +{ + size_t slen = strlen(s); + size_t sublen = strlen(sub); + int i; + + if (sublen == 1 && *sub == '*') + return 1; + + /* + * filename format of multimedia file should be defined as: + * "filename + '.' + extension + (optional: '.' + temp extension)". + */ + if (slen < sublen + 2) + return 0; + + if (!tmp_ext) { + /* file has no temp extension */ + if (s[slen - sublen - 1] != '.') + return 0; + return !strncasecmp(s + slen - sublen, sub, sublen); + } + + for (i = 1; i < slen - sublen; i++) { + if (s[i] != '.') + continue; + if (!strncasecmp(s + i + 1, sub, sublen)) + return 1; + } + + return 0; +} + +int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool hot, bool set) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; + int total_count = cold_count + hot_count; + int start, count; + int i; + + if (set) { + if (total_count == F2FS_MAX_EXTENSION) + return -EINVAL; + } else { + if (!hot && !cold_count) + return -EINVAL; + if (hot && !hot_count) + return -EINVAL; + } + + if (hot) { + start = cold_count; + count = total_count; + } else { + start = 0; + count = cold_count; + } + + for (i = start; i < count; i++) { + if (strcmp(name, extlist[i])) + continue; + + if (set) + return -EINVAL; + + memcpy(extlist[i], extlist[i + 1], + F2FS_EXTENSION_LEN * (total_count - i - 1)); + memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN); + if (hot) + sbi->raw_super->hot_ext_count = hot_count - 1; + else + sbi->raw_super->extension_count = + cpu_to_le32(cold_count - 1); + return 0; + } + + if (!set) + return -EINVAL; + + if (hot) { + memcpy(extlist[count], name, strlen(name)); + sbi->raw_super->hot_ext_count = hot_count + 1; + } else { + char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; + + memcpy(buf, &extlist[cold_count], + F2FS_EXTENSION_LEN * hot_count); + memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); + memcpy(extlist[cold_count], name, strlen(name)); + memcpy(&extlist[cold_count + 1], buf, + F2FS_EXTENSION_LEN * hot_count); + sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); + } + return 0; +} + +static void set_compress_new_inode(struct f2fs_sb_info *sbi, struct inode *dir, + struct inode *inode, const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + unsigned char (*noext)[F2FS_EXTENSION_LEN] = + F2FS_OPTION(sbi).noextensions; + unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; + unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + int i, cold_count, hot_count; + + if (!f2fs_sb_has_compression(sbi)) + return; + + if (S_ISDIR(inode->i_mode)) + goto inherit_comp; + + /* This name comes only from normal files. */ + if (!name) + return; + + /* Don't compress hot files. */ + f2fs_down_read(&sbi->sb_lock); + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + for (i = cold_count; i < cold_count + hot_count; i++) + if (is_extension_exist(name, extlist[i], false)) + break; + f2fs_up_read(&sbi->sb_lock); + if (i < (cold_count + hot_count)) + return; + + /* Don't compress unallowed extension. */ + for (i = 0; i < noext_cnt; i++) + if (is_extension_exist(name, noext[i], false)) + return; + + /* Compress wanting extension. */ + for (i = 0; i < ext_cnt; i++) { + if (is_extension_exist(name, ext[i], false)) { + set_compress_context(inode); + return; + } + } +inherit_comp: + /* Inherit the {no-}compression flag in directory */ + if (F2FS_I(dir)->i_flags & F2FS_NOCOMP_FL) { + F2FS_I(inode)->i_flags |= F2FS_NOCOMP_FL; + f2fs_mark_inode_dirty_sync(inode, true); + } else if (F2FS_I(dir)->i_flags & F2FS_COMPR_FL) { + set_compress_context(inode); + } +} + +static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, + struct inode *dir, umode_t mode, + const char *name) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + nid_t ino; + struct inode *inode; + bool nid_free = false; + bool encrypt = false; + int xattr_size = 0; + int err; + + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!f2fs_alloc_nid(sbi, &ino)) { + err = -ENOSPC; + goto fail; + } + + nid_free = true; + + inode_init_owner(mnt_userns, inode, dir, mode); + + inode->i_ino = ino; + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + F2FS_I(inode)->i_crtime = inode->i_mtime; + inode->i_generation = get_random_u32(); + + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_current_depth = 1; + + err = insert_inode_locked(inode); + if (err) { + err = -EINVAL; + goto fail; + } + + if (f2fs_sb_has_project_quota(sbi) && + (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL)) + F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; + else + F2FS_I(inode)->i_projid = make_kprojid(mnt_userns, + F2FS_DEF_PROJID); + + err = fscrypt_prepare_new_inode(dir, inode, &encrypt); + if (err) + goto fail_drop; + + err = f2fs_dquot_initialize(inode); + if (err) + goto fail_drop; + + set_inode_flag(inode, FI_NEW_INODE); + + if (encrypt) + f2fs_set_encrypted_inode(inode); + + if (f2fs_sb_has_extra_attr(sbi)) { + set_inode_flag(inode, FI_EXTRA_ATTR); + F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; + } + + if (test_opt(sbi, INLINE_XATTR)) + set_inode_flag(inode, FI_INLINE_XATTR); + + if (f2fs_may_inline_dentry(inode)) + set_inode_flag(inode, FI_INLINE_DENTRY); + + if (f2fs_sb_has_flexible_inline_xattr(sbi)) { + f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); + if (f2fs_has_inline_xattr(inode)) + xattr_size = F2FS_OPTION(sbi).inline_xattr_size; + /* Otherwise, will be 0 */ + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } + F2FS_I(inode)->i_inline_xattr_size = xattr_size; + + F2FS_I(inode)->i_flags = + f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_flags |= F2FS_INDEX_FL; + + if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + /* Check compression first. */ + set_compress_new_inode(sbi, dir, inode, name); + + /* Should enable inline_data after compression set */ + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) + set_inode_flag(inode, FI_INLINE_DATA); + + stat_inc_inline_xattr(inode); + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + + f2fs_set_inode_flags(inode); + + f2fs_init_extent_tree(inode); + + trace_f2fs_new_inode(inode, 0); + return inode; + +fail: + trace_f2fs_new_inode(inode, err); + make_bad_inode(inode); + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + iput(inode); + return ERR_PTR(err); +fail_drop: + trace_f2fs_new_inode(inode, err); + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(err); +} + +/* + * Set file's temperature for hot/cold data separation + */ +static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, + const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int i, cold_count, hot_count; + + f2fs_down_read(&sbi->sb_lock); + + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + + for (i = 0; i < cold_count + hot_count; i++) { + if (is_extension_exist(name, extlist[i], true)) + break; + } + + f2fs_up_read(&sbi->sb_lock); + + if (i == cold_count + hot_count) + return; + + if (i < cold_count) + file_set_cold(inode); + else + file_set_hot(inode); +} + +static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + nid_t ino = 0; + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; + + err = f2fs_dquot_initialize(dir); + if (err) + return err; + + inode = f2fs_new_inode(mnt_userns, dir, mode, dentry->d_name.name); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) + set_file_temperature(sbi, inode, dentry->d_name.name); + + inode->i_op = &f2fs_file_inode_operations; + inode->i_fop = &f2fs_file_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + ino = inode->i_ino; + + f2fs_lock_op(sbi); + err = f2fs_add_link(dentry, inode); + if (err) + goto out; + f2fs_unlock_op(sbi); + + f2fs_alloc_nid_done(sbi, ino); + + d_instantiate_new(dentry, inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); + return 0; +out: + f2fs_handle_failed_inode(inode); + return err; +} + +static int f2fs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; + + err = fscrypt_prepare_link(old_dentry, dir, dentry); + if (err) + return err; + + if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = f2fs_dquot_initialize(dir); + if (err) + return err; + + f2fs_balance_fs(sbi, true); + + inode->i_ctime = current_time(inode); + ihold(inode); + + set_inode_flag(inode, FI_INC_LINK); + f2fs_lock_op(sbi); + err = f2fs_add_link(dentry, inode); + if (err) + goto out; + f2fs_unlock_op(sbi); + + d_instantiate(dentry, inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + return 0; +out: + clear_inode_flag(inode, FI_INC_LINK); + iput(inode); + f2fs_unlock_op(sbi); + return err; +} + +struct dentry *f2fs_get_parent(struct dentry *child) +{ + struct page *page; + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot_name, &page); + + if (!ino) { + if (IS_ERR(page)) + return ERR_CAST(page); + return ERR_PTR(-ENOENT); + } + return d_obtain_alias(f2fs_iget(child->d_sb, ino)); +} + +static int __recover_dot_dentries(struct inode *dir, nid_t pino) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct qstr dot = QSTR_INIT(".", 1); + struct qstr dotdot = QSTR_INIT("..", 2); + struct f2fs_dir_entry *de; + struct page *page; + int err = 0; + + if (f2fs_readonly(sbi->sb)) { + f2fs_info(sbi, "skip recovering inline_dots inode (ino:%lu, pino:%u) in readonly mountpoint", + dir->i_ino, pino); + return 0; + } + + if (!S_ISDIR(dir->i_mode)) { + f2fs_err(sbi, "inconsistent inode status, skip recovering inline_dots inode (ino:%lu, i_mode:%u, pino:%u)", + dir->i_ino, dir->i_mode, pino); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -ENOTDIR; + } + + err = f2fs_dquot_initialize(dir); + if (err) + return err; + + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + + de = f2fs_find_entry(dir, &dot, &page); + if (de) { + f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } else { + err = f2fs_do_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); + if (err) + goto out; + } + + de = f2fs_find_entry(dir, &dotdot, &page); + if (de) + f2fs_put_page(page, 0); + else if (IS_ERR(page)) + err = PTR_ERR(page); + else + err = f2fs_do_add_link(dir, &dotdot, NULL, pino, S_IFDIR); +out: + if (!err) + clear_inode_flag(dir, FI_INLINE_DOTS); + + f2fs_unlock_op(sbi); + return err; +} + +static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct inode *inode = NULL; + struct f2fs_dir_entry *de; + struct page *page; + struct dentry *new; + nid_t ino = -1; + int err = 0; + unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir)); + struct f2fs_filename fname; + + trace_f2fs_lookup_start(dir, dentry, flags); + + if (dentry->d_name.len > F2FS_NAME_LEN) { + err = -ENAMETOOLONG; + goto out; + } + + err = f2fs_prepare_lookup(dir, dentry, &fname); + generic_set_encrypted_ci_d_ops(dentry); + if (err == -ENOENT) + goto out_splice; + if (err) + goto out; + de = __f2fs_find_entry(dir, &fname, &page); + f2fs_free_filename(&fname); + + if (!de) { + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + err = -ENOENT; + goto out_splice; + } + + ino = le32_to_cpu(de->ino); + f2fs_put_page(page, 0); + + inode = f2fs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + + if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) { + err = __recover_dot_dentries(dir, root_ino); + if (err) + goto out_iput; + } + + if (f2fs_has_inline_dots(inode)) { + err = __recover_dot_dentries(inode, dir->i_ino); + if (err) + goto out_iput; + } + if (IS_ENCRYPTED(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { + f2fs_warn(F2FS_I_SB(inode), "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); + err = -EPERM; + goto out_iput; + } +out_splice: +#if IS_ENABLED(CONFIG_UNICODE) + if (!inode && IS_CASEFOLDED(dir)) { + /* Eventually we want to call d_add_ci(dentry, NULL) + * for negative dentries in the encoding case as + * well. For now, prevent the negative dentry + * from being cached. + */ + trace_f2fs_lookup_end(dir, dentry, ino, err); + return NULL; + } +#endif + new = d_splice_alias(inode, dentry); + err = PTR_ERR_OR_ZERO(new); + trace_f2fs_lookup_end(dir, dentry, ino, !new ? -ENOENT : err); + return new; +out_iput: + iput(inode); +out: + trace_f2fs_lookup_end(dir, dentry, ino, err); + return ERR_PTR(err); +} + +static int f2fs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode = d_inode(dentry); + struct f2fs_dir_entry *de; + struct page *page; + int err; + + trace_f2fs_unlink_enter(dir, dentry); + + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto fail; + } + + err = f2fs_dquot_initialize(dir); + if (err) + goto fail; + err = f2fs_dquot_initialize(inode); + if (err) + goto fail; + + de = f2fs_find_entry(dir, &dentry->d_name, &page); + if (!de) { + if (IS_ERR(page)) + err = PTR_ERR(page); + goto fail; + } + + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + err = f2fs_acquire_orphan_inode(sbi); + if (err) { + f2fs_unlock_op(sbi); + f2fs_put_page(page, 0); + goto fail; + } + f2fs_delete_entry(de, page, dir, inode); +#if IS_ENABLED(CONFIG_UNICODE) + /* VFS negative dentries are incompatible with Encoding and + * Case-insensitiveness. Eventually we'll want avoid + * invalidating the dentries here, alongside with returning the + * negative dentries at f2fs_lookup(), when it is better + * supported by the VFS for the CI case. + */ + if (IS_CASEFOLDED(dir)) + d_invalidate(dentry); +#endif + f2fs_unlock_op(sbi); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); +fail: + trace_f2fs_unlink_exit(inode, err); + return err; +} + +static const char *f2fs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + const char *link = page_get_link(dentry, inode, done); + + if (!IS_ERR(link) && !*link) { + /* this is broken symlink case */ + do_delayed_call(done); + clear_delayed_call(done); + link = ERR_PTR(-ENOENT); + } + return link; +} + +static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + size_t len = strlen(symname); + struct fscrypt_str disk_link; + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; + + err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, + &disk_link); + if (err) + return err; + + err = f2fs_dquot_initialize(dir); + if (err) + return err; + + inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (IS_ENCRYPTED(inode)) + inode->i_op = &f2fs_encrypted_symlink_inode_operations; + else + inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &f2fs_dblock_aops; + + f2fs_lock_op(sbi); + err = f2fs_add_link(dentry, inode); + if (err) + goto out_f2fs_handle_failed_inode; + f2fs_unlock_op(sbi); + f2fs_alloc_nid_done(sbi, inode->i_ino); + + err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); + if (err) + goto err_out; + + err = page_symlink(inode, disk_link.name, disk_link.len); + +err_out: + d_instantiate_new(dentry, inode); + + /* + * Let's flush symlink data in order to avoid broken symlink as much as + * possible. Nevertheless, fsyncing is the best way, but there is no + * way to get a file descriptor in order to flush that. + * + * Note that, it needs to do dir->fsync to make this recoverable. + * If the symlink path is stored into inline_data, there is no + * performance regression. + */ + if (!err) { + filemap_write_and_wait_range(inode->i_mapping, 0, + disk_link.len - 1); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + } else { + f2fs_unlink(dir, dentry); + } + + f2fs_balance_fs(sbi, true); + goto out_free_encrypted_link; + +out_f2fs_handle_failed_inode: + f2fs_handle_failed_inode(inode); +out_free_encrypted_link: + if (disk_link.name != (unsigned char *)symname) + kfree(disk_link.name); + return err; +} + +static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + err = f2fs_dquot_initialize(dir); + if (err) + return err; + + inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &f2fs_dir_inode_operations; + inode->i_fop = &f2fs_dir_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + + set_inode_flag(inode, FI_INC_LINK); + f2fs_lock_op(sbi); + err = f2fs_add_link(dentry, inode); + if (err) + goto out_fail; + f2fs_unlock_op(sbi); + + f2fs_alloc_nid_done(sbi, inode->i_ino); + + d_instantiate_new(dentry, inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); + return 0; + +out_fail: + clear_inode_flag(inode, FI_INC_LINK); + f2fs_handle_failed_inode(inode); + return err; +} + +static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + if (f2fs_empty_dir(inode)) + return f2fs_unlink(dir, dentry); + return -ENOTEMPTY; +} + +static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + int err = 0; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; + + err = f2fs_dquot_initialize(dir); + if (err) + return err; + + inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + init_special_inode(inode, inode->i_mode, rdev); + inode->i_op = &f2fs_special_inode_operations; + + f2fs_lock_op(sbi); + err = f2fs_add_link(dentry, inode); + if (err) + goto out; + f2fs_unlock_op(sbi); + + f2fs_alloc_nid_done(sbi, inode->i_ino); + + d_instantiate_new(dentry, inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); + return 0; +out: + f2fs_handle_failed_inode(inode); + return err; +} + +static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct file *file, umode_t mode, bool is_whiteout, + struct inode **new_inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + int err; + + err = f2fs_dquot_initialize(dir); + if (err) + return err; + + inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (is_whiteout) { + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); + inode->i_op = &f2fs_special_inode_operations; + } else { + inode->i_op = &f2fs_file_inode_operations; + inode->i_fop = &f2fs_file_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + } + + f2fs_lock_op(sbi); + err = f2fs_acquire_orphan_inode(sbi); + if (err) + goto out; + + err = f2fs_do_tmpfile(inode, dir); + if (err) + goto release_out; + + /* + * add this non-linked tmpfile to orphan list, in this way we could + * remove all unused data of tmpfile after abnormal power-off. + */ + f2fs_add_orphan_inode(inode); + f2fs_alloc_nid_done(sbi, inode->i_ino); + + if (is_whiteout) { + f2fs_i_links_write(inode, false); + + spin_lock(&inode->i_lock); + inode->i_state |= I_LINKABLE; + spin_unlock(&inode->i_lock); + } else { + if (file) + d_tmpfile(file, inode); + else + f2fs_i_links_write(inode, false); + } + /* link_count was changed by d_tmpfile as well. */ + f2fs_unlock_op(sbi); + unlock_new_inode(inode); + + if (new_inode) + *new_inode = inode; + + f2fs_balance_fs(sbi, true); + return 0; + +release_out: + f2fs_release_orphan_inode(sbi); +out: + f2fs_handle_failed_inode(inode); + return err; +} + +static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct file *file, umode_t mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; + + err = __f2fs_tmpfile(mnt_userns, dir, file, mode, false, NULL); + + return finish_open_simple(file, err); +} + +static int f2fs_create_whiteout(struct user_namespace *mnt_userns, + struct inode *dir, struct inode **whiteout) +{ + if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) + return -EIO; + + return __f2fs_tmpfile(mnt_userns, dir, NULL, + S_IFCHR | WHITEOUT_MODE, true, whiteout); +} + +int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct inode **new_inode) +{ + return __f2fs_tmpfile(mnt_userns, dir, NULL, S_IFREG, false, new_inode); +} + +static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct inode *whiteout = NULL; + struct page *old_dir_page = NULL; + struct page *old_page, *new_page = NULL; + struct f2fs_dir_entry *old_dir_entry = NULL; + struct f2fs_dir_entry *old_entry; + struct f2fs_dir_entry *new_entry; + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; + + if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + + /* + * If new_inode is null, the below renaming flow will + * add a link in old_dir which can conver inline_dir. + * After then, if we failed to get the entry due to other + * reasons like ENOMEM, we had to remove the new entry. + * Instead of adding such the error handling routine, let's + * simply convert first here. + */ + if (old_dir == new_dir && !new_inode) { + err = f2fs_try_convert_inline_dir(old_dir, new_dentry); + if (err) + return err; + } + + if (flags & RENAME_WHITEOUT) { + err = f2fs_create_whiteout(mnt_userns, old_dir, &whiteout); + if (err) + return err; + } + + err = f2fs_dquot_initialize(old_dir); + if (err) + goto out; + + err = f2fs_dquot_initialize(new_dir); + if (err) + goto out; + + if (new_inode) { + err = f2fs_dquot_initialize(new_inode); + if (err) + goto out; + } + + err = -ENOENT; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (!old_entry) { + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); + goto out; + } + + if (S_ISDIR(old_inode->i_mode)) { + old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); + if (!old_dir_entry) { + if (IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); + goto out_old; + } + } + + if (new_inode) { + + err = -ENOTEMPTY; + if (old_dir_entry && !f2fs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, + &new_page); + if (!new_entry) { + if (IS_ERR(new_page)) + err = PTR_ERR(new_page); + goto out_dir; + } + + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + + err = f2fs_acquire_orphan_inode(sbi); + if (err) + goto put_out_dir; + + f2fs_set_link(new_dir, new_entry, new_page, old_inode); + new_page = NULL; + + new_inode->i_ctime = current_time(new_inode); + f2fs_down_write(&F2FS_I(new_inode)->i_sem); + if (old_dir_entry) + f2fs_i_links_write(new_inode, false); + f2fs_i_links_write(new_inode, false); + f2fs_up_write(&F2FS_I(new_inode)->i_sem); + + if (!new_inode->i_nlink) + f2fs_add_orphan_inode(new_inode); + else + f2fs_release_orphan_inode(sbi); + } else { + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + + err = f2fs_add_link(new_dentry, old_inode); + if (err) { + f2fs_unlock_op(sbi); + goto out_dir; + } + + if (old_dir_entry) + f2fs_i_links_write(new_dir, true); + } + + f2fs_down_write(&F2FS_I(old_inode)->i_sem); + if (!old_dir_entry || whiteout) + file_lost_pino(old_inode); + else + /* adjust dir's i_pino to pass fsck check */ + f2fs_i_pino_write(old_inode, new_dir->i_ino); + f2fs_up_write(&F2FS_I(old_inode)->i_sem); + + old_inode->i_ctime = current_time(old_inode); + f2fs_mark_inode_dirty_sync(old_inode, false); + + f2fs_delete_entry(old_entry, old_page, old_dir, NULL); + old_page = NULL; + + if (whiteout) { + set_inode_flag(whiteout, FI_INC_LINK); + err = f2fs_add_link(old_dentry, whiteout); + if (err) + goto put_out_dir; + + spin_lock(&whiteout->i_lock); + whiteout->i_state &= ~I_LINKABLE; + spin_unlock(&whiteout->i_lock); + + iput(whiteout); + } + + if (old_dir_entry) { + if (old_dir != new_dir) + f2fs_set_link(old_inode, old_dir_entry, + old_dir_page, new_dir); + else + f2fs_put_page(old_dir_page, 0); + f2fs_i_links_write(old_dir, false); + } + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { + f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + if (S_ISDIR(old_inode->i_mode)) + f2fs_add_ino_entry(sbi, old_inode->i_ino, + TRANS_DIR_INO); + } + + f2fs_unlock_op(sbi); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + f2fs_sync_fs(sbi->sb, 1); + + f2fs_update_time(sbi, REQ_TIME); + return 0; + +put_out_dir: + f2fs_unlock_op(sbi); + f2fs_put_page(new_page, 0); +out_dir: + if (old_dir_entry) + f2fs_put_page(old_dir_page, 0); +out_old: + f2fs_put_page(old_page, 0); +out: + iput(whiteout); + return err; +} + +static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct page *old_dir_page, *new_dir_page; + struct page *old_page, *new_page; + struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL; + struct f2fs_dir_entry *old_entry, *new_entry; + int old_nlink = 0, new_nlink = 0; + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; + + if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid)) || + (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(old_dir)->i_projid, + F2FS_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = f2fs_dquot_initialize(old_dir); + if (err) + goto out; + + err = f2fs_dquot_initialize(new_dir); + if (err) + goto out; + + err = -ENOENT; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (!old_entry) { + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); + goto out; + } + + new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); + if (!new_entry) { + if (IS_ERR(new_page)) + err = PTR_ERR(new_page); + goto out_old; + } + + /* prepare for updating ".." directory entry info later */ + if (old_dir != new_dir) { + if (S_ISDIR(old_inode->i_mode)) { + old_dir_entry = f2fs_parent_dir(old_inode, + &old_dir_page); + if (!old_dir_entry) { + if (IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); + goto out_new; + } + } + + if (S_ISDIR(new_inode->i_mode)) { + new_dir_entry = f2fs_parent_dir(new_inode, + &new_dir_page); + if (!new_dir_entry) { + if (IS_ERR(new_dir_page)) + err = PTR_ERR(new_dir_page); + goto out_old_dir; + } + } + } + + /* + * If cross rename between file and directory those are not + * in the same directory, we will inc nlink of file's parent + * later, so we should check upper boundary of its nlink. + */ + if ((!old_dir_entry || !new_dir_entry) && + old_dir_entry != new_dir_entry) { + old_nlink = old_dir_entry ? -1 : 1; + new_nlink = -old_nlink; + err = -EMLINK; + if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX)) + goto out_new_dir; + } + + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + + /* update ".." directory entry info of old dentry */ + if (old_dir_entry) + f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + + /* update ".." directory entry info of new dentry */ + if (new_dir_entry) + f2fs_set_link(new_inode, new_dir_entry, new_dir_page, old_dir); + + /* update directory entry info of old dir inode */ + f2fs_set_link(old_dir, old_entry, old_page, new_inode); + + f2fs_down_write(&F2FS_I(old_inode)->i_sem); + if (!old_dir_entry) + file_lost_pino(old_inode); + else + /* adjust dir's i_pino to pass fsck check */ + f2fs_i_pino_write(old_inode, new_dir->i_ino); + f2fs_up_write(&F2FS_I(old_inode)->i_sem); + + old_dir->i_ctime = current_time(old_dir); + if (old_nlink) { + f2fs_down_write(&F2FS_I(old_dir)->i_sem); + f2fs_i_links_write(old_dir, old_nlink > 0); + f2fs_up_write(&F2FS_I(old_dir)->i_sem); + } + f2fs_mark_inode_dirty_sync(old_dir, false); + + /* update directory entry info of new dir inode */ + f2fs_set_link(new_dir, new_entry, new_page, old_inode); + + f2fs_down_write(&F2FS_I(new_inode)->i_sem); + if (!new_dir_entry) + file_lost_pino(new_inode); + else + /* adjust dir's i_pino to pass fsck check */ + f2fs_i_pino_write(new_inode, old_dir->i_ino); + f2fs_up_write(&F2FS_I(new_inode)->i_sem); + + new_dir->i_ctime = current_time(new_dir); + if (new_nlink) { + f2fs_down_write(&F2FS_I(new_dir)->i_sem); + f2fs_i_links_write(new_dir, new_nlink > 0); + f2fs_up_write(&F2FS_I(new_dir)->i_sem); + } + f2fs_mark_inode_dirty_sync(new_dir, false); + + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { + f2fs_add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); + f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + } + + f2fs_unlock_op(sbi); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + f2fs_sync_fs(sbi->sb, 1); + + f2fs_update_time(sbi, REQ_TIME); + return 0; +out_new_dir: + if (new_dir_entry) { + f2fs_put_page(new_dir_page, 0); + } +out_old_dir: + if (old_dir_entry) { + f2fs_put_page(old_dir_page, 0); + } +out_new: + f2fs_put_page(new_page, 0); +out_old: + f2fs_put_page(old_page, 0); +out: + return err; +} + +static int f2fs_rename2(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + int err; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + return -EINVAL; + + err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); + if (err) + return err; + + if (flags & RENAME_EXCHANGE) { + return f2fs_cross_rename(old_dir, old_dentry, + new_dir, new_dentry); + } + /* + * VFS has already handled the new dentry existence case, + * here, we just deal with "RENAME_NOREPLACE" as regular rename. + */ + return f2fs_rename(mnt_userns, old_dir, old_dentry, + new_dir, new_dentry, flags); +} + +static const char *f2fs_encrypted_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct page *page; + const char *target; + + if (!dentry) + return ERR_PTR(-ECHILD); + + page = read_mapping_page(inode->i_mapping, 0, NULL); + if (IS_ERR(page)) + return ERR_CAST(page); + + target = fscrypt_get_symlink(inode, page_address(page), + inode->i_sb->s_blocksize, done); + put_page(page); + return target; +} + +static int f2fs_encrypted_symlink_getattr(struct user_namespace *mnt_userns, + const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + f2fs_getattr(mnt_userns, path, stat, request_mask, query_flags); + + return fscrypt_symlink_getattr(path, stat); +} + +const struct inode_operations f2fs_encrypted_symlink_inode_operations = { + .get_link = f2fs_encrypted_get_link, + .getattr = f2fs_encrypted_symlink_getattr, + .setattr = f2fs_setattr, + .listxattr = f2fs_listxattr, +}; + +const struct inode_operations f2fs_dir_inode_operations = { + .create = f2fs_create, + .lookup = f2fs_lookup, + .link = f2fs_link, + .unlink = f2fs_unlink, + .symlink = f2fs_symlink, + .mkdir = f2fs_mkdir, + .rmdir = f2fs_rmdir, + .mknod = f2fs_mknod, + .rename = f2fs_rename2, + .tmpfile = f2fs_tmpfile, + .getattr = f2fs_getattr, + .setattr = f2fs_setattr, + .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, + .listxattr = f2fs_listxattr, + .fiemap = f2fs_fiemap, + .fileattr_get = f2fs_fileattr_get, + .fileattr_set = f2fs_fileattr_set, +}; + +const struct inode_operations f2fs_symlink_inode_operations = { + .get_link = f2fs_get_link, + .getattr = f2fs_getattr, + .setattr = f2fs_setattr, + .listxattr = f2fs_listxattr, +}; + +const struct inode_operations f2fs_special_inode_operations = { + .getattr = f2fs_getattr, + .setattr = f2fs_setattr, + .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, + .listxattr = f2fs_listxattr, +}; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c new file mode 100644 index 000000000..c6d0e0709 --- /dev/null +++ b/fs/f2fs/node.c @@ -0,0 +1,3438 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/node.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/mpage.h> +#include <linux/sched/mm.h> +#include <linux/blkdev.h> +#include <linux/pagevec.h> +#include <linux/swap.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "xattr.h" +#include "iostat.h" +#include <trace/events/f2fs.h> + +#define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) + +static struct kmem_cache *nat_entry_slab; +static struct kmem_cache *free_nid_slab; +static struct kmem_cache *nat_entry_set_slab; +static struct kmem_cache *fsync_node_entry_slab; + +/* + * Check whether the given nid is within node id range. + */ +int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) +{ + if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.", + __func__, nid); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return -EFSCORRUPTED; + } + return 0; +} + +bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct sysinfo val; + unsigned long avail_ram; + unsigned long mem_size = 0; + bool res = false; + + if (!nm_i) + return true; + + si_meminfo(&val); + + /* only uses low memory */ + avail_ram = val.totalram - val.totalhigh; + + /* + * give 25%, 25%, 50%, 50%, 50% memory for each components respectively + */ + if (type == FREE_NIDS) { + mem_size = (nm_i->nid_cnt[FREE_NID] * + sizeof(struct free_nid)) >> PAGE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + } else if (type == NAT_ENTRIES) { + mem_size = (nm_i->nat_cnt[TOTAL_NAT] * + sizeof(struct nat_entry)) >> PAGE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + if (excess_cached_nats(sbi)) + res = false; + } else if (type == DIRTY_DENTS) { + if (sbi->sb->s_bdi->wb.dirty_exceeded) + return false; + mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == INO_ENTRIES) { + int i; + + for (i = 0; i < MAX_INO_ENTRY; i++) + mem_size += sbi->im[i].ino_num * + sizeof(struct ino_entry); + mem_size >>= PAGE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == READ_EXTENT_CACHE) { + struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + + mem_size = (atomic_read(&eti->total_ext_tree) * + sizeof(struct extent_tree) + + atomic_read(&eti->total_ext_node) * + sizeof(struct extent_node)) >> PAGE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == DISCARD_CACHE) { + mem_size = (atomic_read(&dcc->discard_cmd_cnt) * + sizeof(struct discard_cmd)) >> PAGE_SHIFT; + res = mem_size < (avail_ram * nm_i->ram_thresh / 100); + } else if (type == COMPRESS_PAGE) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + unsigned long free_ram = val.freeram; + + /* + * free memory is lower than watermark or cached page count + * exceed threshold, deny caching compress page. + */ + res = (free_ram > avail_ram * sbi->compress_watermark / 100) && + (COMPRESS_MAPPING(sbi)->nrpages < + free_ram * sbi->compress_percent / 100); +#else + res = false; +#endif + } else { + if (!sbi->sb->s_bdi->wb.dirty_exceeded) + return true; + } + return res; +} + +static void clear_node_page_dirty(struct page *page) +{ + if (PageDirty(page)) { + f2fs_clear_page_cache_dirty_tag(page); + clear_page_dirty_for_io(page); + dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); + } + ClearPageUptodate(page); +} + +static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) +{ + return f2fs_get_meta_page_retry(sbi, current_nat_addr(sbi, nid)); +} + +static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct page *src_page; + struct page *dst_page; + pgoff_t dst_off; + void *src_addr; + void *dst_addr; + struct f2fs_nm_info *nm_i = NM_I(sbi); + + dst_off = next_nat_addr(sbi, current_nat_addr(sbi, nid)); + + /* get current nat block page with lock */ + src_page = get_current_nat_page(sbi, nid); + if (IS_ERR(src_page)) + return src_page; + dst_page = f2fs_grab_meta_page(sbi, dst_off); + f2fs_bug_on(sbi, PageDirty(src_page)); + + src_addr = page_address(src_page); + dst_addr = page_address(dst_page); + memcpy(dst_addr, src_addr, PAGE_SIZE); + set_page_dirty(dst_page); + f2fs_put_page(src_page, 1); + + set_to_next_nat(nm_i, nid); + + return dst_page; +} + +static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi, + nid_t nid, bool no_fail) +{ + struct nat_entry *new; + + new = f2fs_kmem_cache_alloc(nat_entry_slab, + GFP_F2FS_ZERO, no_fail, sbi); + if (new) { + nat_set_nid(new, nid); + nat_reset_flag(new); + } + return new; +} + +static void __free_nat_entry(struct nat_entry *e) +{ + kmem_cache_free(nat_entry_slab, e); +} + +/* must be locked by nat_tree_lock */ +static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, + struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail) +{ + if (no_fail) + f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne); + else if (radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne)) + return NULL; + + if (raw_ne) + node_info_from_raw_nat(&ne->ni, raw_ne); + + spin_lock(&nm_i->nat_list_lock); + list_add_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + + nm_i->nat_cnt[TOTAL_NAT]++; + nm_i->nat_cnt[RECLAIMABLE_NAT]++; + return ne; +} + +static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) +{ + struct nat_entry *ne; + + ne = radix_tree_lookup(&nm_i->nat_root, n); + + /* for recent accessed nat entry, move it to tail of lru list */ + if (ne && !get_nat_flag(ne, IS_DIRTY)) { + spin_lock(&nm_i->nat_list_lock); + if (!list_empty(&ne->list)) + list_move_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + } + + return ne; +} + +static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, + nid_t start, unsigned int nr, struct nat_entry **ep) +{ + return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr); +} + +static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) +{ + radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); + nm_i->nat_cnt[TOTAL_NAT]--; + nm_i->nat_cnt[RECLAIMABLE_NAT]--; + __free_nat_entry(e); +} + +static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) +{ + nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); + struct nat_entry_set *head; + + head = radix_tree_lookup(&nm_i->nat_set_root, set); + if (!head) { + head = f2fs_kmem_cache_alloc(nat_entry_set_slab, + GFP_NOFS, true, NULL); + + INIT_LIST_HEAD(&head->entry_list); + INIT_LIST_HEAD(&head->set_list); + head->set = set; + head->entry_cnt = 0; + f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); + } + return head; +} + +static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) +{ + struct nat_entry_set *head; + bool new_ne = nat_get_blkaddr(ne) == NEW_ADDR; + + if (!new_ne) + head = __grab_nat_entry_set(nm_i, ne); + + /* + * update entry_cnt in below condition: + * 1. update NEW_ADDR to valid block address; + * 2. update old block address to new one; + */ + if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) || + !get_nat_flag(ne, IS_DIRTY))) + head->entry_cnt++; + + set_nat_flag(ne, IS_PREALLOC, new_ne); + + if (get_nat_flag(ne, IS_DIRTY)) + goto refresh_list; + + nm_i->nat_cnt[DIRTY_NAT]++; + nm_i->nat_cnt[RECLAIMABLE_NAT]--; + set_nat_flag(ne, IS_DIRTY, true); +refresh_list: + spin_lock(&nm_i->nat_list_lock); + if (new_ne) + list_del_init(&ne->list); + else + list_move_tail(&ne->list, &head->entry_list); + spin_unlock(&nm_i->nat_list_lock); +} + +static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry_set *set, struct nat_entry *ne) +{ + spin_lock(&nm_i->nat_list_lock); + list_move_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + + set_nat_flag(ne, IS_DIRTY, false); + set->entry_cnt--; + nm_i->nat_cnt[DIRTY_NAT]--; + nm_i->nat_cnt[RECLAIMABLE_NAT]++; +} + +static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, + nid_t start, unsigned int nr, struct nat_entry_set **ep) +{ + return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep, + start, nr); +} + +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page) +{ + return NODE_MAPPING(sbi) == page->mapping && + IS_DNODE(page) && is_cold_node(page); +} + +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi) +{ + spin_lock_init(&sbi->fsync_node_lock); + INIT_LIST_HEAD(&sbi->fsync_node_list); + sbi->fsync_seg_id = 0; + sbi->fsync_node_num = 0; +} + +static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi, + struct page *page) +{ + struct fsync_node_entry *fn; + unsigned long flags; + unsigned int seq_id; + + fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, + GFP_NOFS, true, NULL); + + get_page(page); + fn->page = page; + INIT_LIST_HEAD(&fn->list); + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + list_add_tail(&fn->list, &sbi->fsync_node_list); + fn->seq_id = sbi->fsync_seg_id++; + seq_id = fn->seq_id; + sbi->fsync_node_num++; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + + return seq_id; +} + +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page) +{ + struct fsync_node_entry *fn; + unsigned long flags; + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + list_for_each_entry(fn, &sbi->fsync_node_list, list) { + if (fn->page == page) { + list_del(&fn->list); + sbi->fsync_node_num--; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + kmem_cache_free(fsync_node_entry_slab, fn); + put_page(page); + return; + } + } + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + f2fs_bug_on(sbi, 1); +} + +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi) +{ + unsigned long flags; + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + sbi->fsync_seg_id = 0; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); +} + +int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + bool need = false; + + f2fs_down_read(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (e) { + if (!get_nat_flag(e, IS_CHECKPOINTED) && + !get_nat_flag(e, HAS_FSYNCED_INODE)) + need = true; + } + f2fs_up_read(&nm_i->nat_tree_lock); + return need; +} + +bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + bool is_cp = true; + + f2fs_down_read(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (e && !get_nat_flag(e, IS_CHECKPOINTED)) + is_cp = false; + f2fs_up_read(&nm_i->nat_tree_lock); + return is_cp; +} + +bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + bool need_update = true; + + f2fs_down_read(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, ino); + if (e && get_nat_flag(e, HAS_LAST_FSYNC) && + (get_nat_flag(e, IS_CHECKPOINTED) || + get_nat_flag(e, HAS_FSYNCED_INODE))) + need_update = false; + f2fs_up_read(&nm_i->nat_tree_lock); + return need_update; +} + +/* must be locked by nat_tree_lock */ +static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, + struct f2fs_nat_entry *ne) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *new, *e; + + /* Let's mitigate lock contention of nat_tree_lock during checkpoint */ + if (f2fs_rwsem_is_locked(&sbi->cp_global_sem)) + return; + + new = __alloc_nat_entry(sbi, nid, false); + if (!new) + return; + + f2fs_down_write(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (!e) + e = __init_nat_entry(nm_i, new, ne, false); + else + f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || + nat_get_blkaddr(e) != + le32_to_cpu(ne->block_addr) || + nat_get_version(e) != ne->version); + f2fs_up_write(&nm_i->nat_tree_lock); + if (e != new) + __free_nat_entry(new); +} + +static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, + block_t new_blkaddr, bool fsync_done) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true); + + f2fs_down_write(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, ni->nid); + if (!e) { + e = __init_nat_entry(nm_i, new, NULL, true); + copy_node_info(&e->ni, ni); + f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); + } else if (new_blkaddr == NEW_ADDR) { + /* + * when nid is reallocated, + * previous nat entry can be remained in nat cache. + * So, reinitialize it with new information. + */ + copy_node_info(&e->ni, ni); + f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR); + } + /* let's free early to reduce memory consumption */ + if (e != new) + __free_nat_entry(new); + + /* sanity check */ + f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr); + f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR && + new_blkaddr == NULL_ADDR); + f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR && + new_blkaddr == NEW_ADDR); + f2fs_bug_on(sbi, __is_valid_data_blkaddr(nat_get_blkaddr(e)) && + new_blkaddr == NEW_ADDR); + + /* increment version no as node is removed */ + if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { + unsigned char version = nat_get_version(e); + + nat_set_version(e, inc_node_version(version)); + } + + /* change address */ + nat_set_blkaddr(e, new_blkaddr); + if (!__is_valid_data_blkaddr(new_blkaddr)) + set_nat_flag(e, IS_CHECKPOINTED, false); + __set_nat_cache_dirty(nm_i, e); + + /* update fsync_mark if its inode nat entry is still alive */ + if (ni->nid != ni->ino) + e = __lookup_nat_cache(nm_i, ni->ino); + if (e) { + if (fsync_done && ni->nid == ni->ino) + set_nat_flag(e, HAS_FSYNCED_INODE, true); + set_nat_flag(e, HAS_LAST_FSYNC, fsync_done); + } + f2fs_up_write(&nm_i->nat_tree_lock); +} + +int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + int nr = nr_shrink; + + if (!f2fs_down_write_trylock(&nm_i->nat_tree_lock)) + return 0; + + spin_lock(&nm_i->nat_list_lock); + while (nr_shrink) { + struct nat_entry *ne; + + if (list_empty(&nm_i->nat_entries)) + break; + + ne = list_first_entry(&nm_i->nat_entries, + struct nat_entry, list); + list_del(&ne->list); + spin_unlock(&nm_i->nat_list_lock); + + __del_from_nat_cache(nm_i, ne); + nr_shrink--; + + spin_lock(&nm_i->nat_list_lock); + } + spin_unlock(&nm_i->nat_list_lock); + + f2fs_up_write(&nm_i->nat_tree_lock); + return nr - nr_shrink; +} + +int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, + struct node_info *ni, bool checkpoint_context) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + nid_t start_nid = START_NID(nid); + struct f2fs_nat_block *nat_blk; + struct page *page = NULL; + struct f2fs_nat_entry ne; + struct nat_entry *e; + pgoff_t index; + block_t blkaddr; + int i; + + ni->nid = nid; +retry: + /* Check nat cache */ + f2fs_down_read(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (e) { + ni->ino = nat_get_ino(e); + ni->blk_addr = nat_get_blkaddr(e); + ni->version = nat_get_version(e); + f2fs_up_read(&nm_i->nat_tree_lock); + return 0; + } + + /* + * Check current segment summary by trying to grab journal_rwsem first. + * This sem is on the critical path on the checkpoint requiring the above + * nat_tree_lock. Therefore, we should retry, if we failed to grab here + * while not bothering checkpoint. + */ + if (!f2fs_rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) { + down_read(&curseg->journal_rwsem); + } else if (f2fs_rwsem_is_contended(&nm_i->nat_tree_lock) || + !down_read_trylock(&curseg->journal_rwsem)) { + f2fs_up_read(&nm_i->nat_tree_lock); + goto retry; + } + + i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); + if (i >= 0) { + ne = nat_in_journal(journal, i); + node_info_from_raw_nat(ni, &ne); + } + up_read(&curseg->journal_rwsem); + if (i >= 0) { + f2fs_up_read(&nm_i->nat_tree_lock); + goto cache; + } + + /* Fill node_info from nat page */ + index = current_nat_addr(sbi, nid); + f2fs_up_read(&nm_i->nat_tree_lock); + + page = f2fs_get_meta_page(sbi, index); + if (IS_ERR(page)) + return PTR_ERR(page); + + nat_blk = (struct f2fs_nat_block *)page_address(page); + ne = nat_blk->entries[nid - start_nid]; + node_info_from_raw_nat(ni, &ne); + f2fs_put_page(page, 1); +cache: + blkaddr = le32_to_cpu(ne.block_addr); + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) + return -EFAULT; + + /* cache nat entry */ + cache_nat_entry(sbi, nid, &ne); + return 0; +} + +/* + * readahead MAX_RA_NODE number of node pages. + */ +static void f2fs_ra_node_pages(struct page *parent, int start, int n) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + struct blk_plug plug; + int i, end; + nid_t nid; + + blk_start_plug(&plug); + + /* Then, try readahead for siblings of the desired node */ + end = start + n; + end = min(end, NIDS_PER_BLOCK); + for (i = start; i < end; i++) { + nid = get_nid(parent, i, false); + f2fs_ra_node_page(sbi, nid); + } + + blk_finish_plug(&plug); +} + +pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) +{ + const long direct_index = ADDRS_PER_INODE(dn->inode); + const long direct_blks = ADDRS_PER_BLOCK(dn->inode); + const long indirect_blks = ADDRS_PER_BLOCK(dn->inode) * NIDS_PER_BLOCK; + unsigned int skipped_unit = ADDRS_PER_BLOCK(dn->inode); + int cur_level = dn->cur_level; + int max_level = dn->max_level; + pgoff_t base = 0; + + if (!dn->max_level) + return pgofs + 1; + + while (max_level-- > cur_level) + skipped_unit *= NIDS_PER_BLOCK; + + switch (dn->max_level) { + case 3: + base += 2 * indirect_blks; + fallthrough; + case 2: + base += 2 * direct_blks; + fallthrough; + case 1: + base += direct_index; + break; + default: + f2fs_bug_on(F2FS_I_SB(dn->inode), 1); + } + + return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base; +} + +/* + * The maximum depth is four. + * Offset[0] will have raw inode offset. + */ +static int get_node_path(struct inode *inode, long block, + int offset[4], unsigned int noffset[4]) +{ + const long direct_index = ADDRS_PER_INODE(inode); + const long direct_blks = ADDRS_PER_BLOCK(inode); + const long dptrs_per_blk = NIDS_PER_BLOCK; + const long indirect_blks = ADDRS_PER_BLOCK(inode) * NIDS_PER_BLOCK; + const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK; + int n = 0; + int level = 0; + + noffset[0] = 0; + + if (block < direct_index) { + offset[n] = block; + goto got; + } + block -= direct_index; + if (block < direct_blks) { + offset[n++] = NODE_DIR1_BLOCK; + noffset[n] = 1; + offset[n] = block; + level = 1; + goto got; + } + block -= direct_blks; + if (block < direct_blks) { + offset[n++] = NODE_DIR2_BLOCK; + noffset[n] = 2; + offset[n] = block; + level = 1; + goto got; + } + block -= direct_blks; + if (block < indirect_blks) { + offset[n++] = NODE_IND1_BLOCK; + noffset[n] = 3; + offset[n++] = block / direct_blks; + noffset[n] = 4 + offset[n - 1]; + offset[n] = block % direct_blks; + level = 2; + goto got; + } + block -= indirect_blks; + if (block < indirect_blks) { + offset[n++] = NODE_IND2_BLOCK; + noffset[n] = 4 + dptrs_per_blk; + offset[n++] = block / direct_blks; + noffset[n] = 5 + dptrs_per_blk + offset[n - 1]; + offset[n] = block % direct_blks; + level = 2; + goto got; + } + block -= indirect_blks; + if (block < dindirect_blks) { + offset[n++] = NODE_DIND_BLOCK; + noffset[n] = 5 + (dptrs_per_blk * 2); + offset[n++] = block / indirect_blks; + noffset[n] = 6 + (dptrs_per_blk * 2) + + offset[n - 1] * (dptrs_per_blk + 1); + offset[n++] = (block / direct_blks) % dptrs_per_blk; + noffset[n] = 7 + (dptrs_per_blk * 2) + + offset[n - 2] * (dptrs_per_blk + 1) + + offset[n - 1]; + offset[n] = block % direct_blks; + level = 3; + goto got; + } else { + return -E2BIG; + } +got: + return level; +} + +/* + * Caller should call f2fs_put_dnode(dn). + * Also, it should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op() only if mode is set with ALLOC_NODE. + */ +int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct page *npage[4]; + struct page *parent = NULL; + int offset[4]; + unsigned int noffset[4]; + nid_t nids[4]; + int level, i = 0; + int err = 0; + + level = get_node_path(dn->inode, index, offset, noffset); + if (level < 0) + return level; + + nids[0] = dn->inode->i_ino; + npage[0] = dn->inode_page; + + if (!npage[0]) { + npage[0] = f2fs_get_node_page(sbi, nids[0]); + if (IS_ERR(npage[0])) + return PTR_ERR(npage[0]); + } + + /* if inline_data is set, should not report any block indices */ + if (f2fs_has_inline_data(dn->inode) && index) { + err = -ENOENT; + f2fs_put_page(npage[0], 1); + goto release_out; + } + + parent = npage[0]; + if (level != 0) + nids[1] = get_nid(parent, offset[0], true); + dn->inode_page = npage[0]; + dn->inode_page_locked = true; + + /* get indirect or direct nodes */ + for (i = 1; i <= level; i++) { + bool done = false; + + if (!nids[i] && mode == ALLOC_NODE) { + /* alloc new node */ + if (!f2fs_alloc_nid(sbi, &(nids[i]))) { + err = -ENOSPC; + goto release_pages; + } + + dn->nid = nids[i]; + npage[i] = f2fs_new_node_page(dn, noffset[i]); + if (IS_ERR(npage[i])) { + f2fs_alloc_nid_failed(sbi, nids[i]); + err = PTR_ERR(npage[i]); + goto release_pages; + } + + set_nid(parent, offset[i - 1], nids[i], i == 1); + f2fs_alloc_nid_done(sbi, nids[i]); + done = true; + } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) { + npage[i] = f2fs_get_node_page_ra(parent, offset[i - 1]); + if (IS_ERR(npage[i])) { + err = PTR_ERR(npage[i]); + goto release_pages; + } + done = true; + } + if (i == 1) { + dn->inode_page_locked = false; + unlock_page(parent); + } else { + f2fs_put_page(parent, 1); + } + + if (!done) { + npage[i] = f2fs_get_node_page(sbi, nids[i]); + if (IS_ERR(npage[i])) { + err = PTR_ERR(npage[i]); + f2fs_put_page(npage[0], 0); + goto release_out; + } + } + if (i < level) { + parent = npage[i]; + nids[i + 1] = get_nid(parent, offset[i], false); + } + } + dn->nid = nids[level]; + dn->ofs_in_node = offset[level]; + dn->node_page = npage[level]; + dn->data_blkaddr = f2fs_data_blkaddr(dn); + + if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) && + f2fs_sb_has_readonly(sbi)) { + unsigned int c_len = f2fs_cluster_blocks_are_contiguous(dn); + block_t blkaddr; + + if (!c_len) + goto out; + + blkaddr = f2fs_data_blkaddr(dn); + if (blkaddr == COMPRESS_ADDR) + blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + 1); + + f2fs_update_read_extent_tree_range_compressed(dn->inode, + index, blkaddr, + F2FS_I(dn->inode)->i_cluster_size, + c_len); + } +out: + return 0; + +release_pages: + f2fs_put_page(parent, 1); + if (i > 1) + f2fs_put_page(npage[0], 0); +release_out: + dn->inode_page = NULL; + dn->node_page = NULL; + if (err == -ENOENT) { + dn->cur_level = i; + dn->max_level = level; + dn->ofs_in_node = offset[level]; + } + return err; +} + +static int truncate_node(struct dnode_of_data *dn) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct node_info ni; + int err; + pgoff_t index; + + err = f2fs_get_node_info(sbi, dn->nid, &ni, false); + if (err) + return err; + + /* Deallocate node address */ + f2fs_invalidate_blocks(sbi, ni.blk_addr); + dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); + set_node_addr(sbi, &ni, NULL_ADDR, false); + + if (dn->nid == dn->inode->i_ino) { + f2fs_remove_orphan_inode(sbi, dn->nid); + dec_valid_inode_count(sbi); + f2fs_inode_synced(dn->inode); + } + + clear_node_page_dirty(dn->node_page); + set_sbi_flag(sbi, SBI_IS_DIRTY); + + index = dn->node_page->index; + f2fs_put_page(dn->node_page, 1); + + invalidate_mapping_pages(NODE_MAPPING(sbi), + index, index); + + dn->node_page = NULL; + trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); + + return 0; +} + +static int truncate_dnode(struct dnode_of_data *dn) +{ + struct page *page; + int err; + + if (dn->nid == 0) + return 1; + + /* get direct node */ + page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid); + if (PTR_ERR(page) == -ENOENT) + return 1; + else if (IS_ERR(page)) + return PTR_ERR(page); + + /* Make dnode_of_data for parameter */ + dn->node_page = page; + dn->ofs_in_node = 0; + f2fs_truncate_data_blocks(dn); + err = truncate_node(dn); + if (err) { + f2fs_put_page(page, 1); + return err; + } + + return 1; +} + +static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, + int ofs, int depth) +{ + struct dnode_of_data rdn = *dn; + struct page *page; + struct f2fs_node *rn; + nid_t child_nid; + unsigned int child_nofs; + int freed = 0; + int i, ret; + + if (dn->nid == 0) + return NIDS_PER_BLOCK + 1; + + trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); + + page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid); + if (IS_ERR(page)) { + trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); + return PTR_ERR(page); + } + + f2fs_ra_node_pages(page, ofs, NIDS_PER_BLOCK); + + rn = F2FS_NODE(page); + if (depth < 3) { + for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { + child_nid = le32_to_cpu(rn->in.nid[i]); + if (child_nid == 0) + continue; + rdn.nid = child_nid; + ret = truncate_dnode(&rdn); + if (ret < 0) + goto out_err; + if (set_nid(page, i, 0, false)) + dn->node_changed = true; + } + } else { + child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1; + for (i = ofs; i < NIDS_PER_BLOCK; i++) { + child_nid = le32_to_cpu(rn->in.nid[i]); + if (child_nid == 0) { + child_nofs += NIDS_PER_BLOCK + 1; + continue; + } + rdn.nid = child_nid; + ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1); + if (ret == (NIDS_PER_BLOCK + 1)) { + if (set_nid(page, i, 0, false)) + dn->node_changed = true; + child_nofs += ret; + } else if (ret < 0 && ret != -ENOENT) { + goto out_err; + } + } + freed = child_nofs; + } + + if (!ofs) { + /* remove current indirect node */ + dn->node_page = page; + ret = truncate_node(dn); + if (ret) + goto out_err; + freed++; + } else { + f2fs_put_page(page, 1); + } + trace_f2fs_truncate_nodes_exit(dn->inode, freed); + return freed; + +out_err: + f2fs_put_page(page, 1); + trace_f2fs_truncate_nodes_exit(dn->inode, ret); + return ret; +} + +static int truncate_partial_nodes(struct dnode_of_data *dn, + struct f2fs_inode *ri, int *offset, int depth) +{ + struct page *pages[2]; + nid_t nid[3]; + nid_t child_nid; + int err = 0; + int i; + int idx = depth - 2; + + nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); + if (!nid[0]) + return 0; + + /* get indirect nodes in the path */ + for (i = 0; i < idx + 1; i++) { + /* reference count'll be increased */ + pages[i] = f2fs_get_node_page(F2FS_I_SB(dn->inode), nid[i]); + if (IS_ERR(pages[i])) { + err = PTR_ERR(pages[i]); + idx = i - 1; + goto fail; + } + nid[i + 1] = get_nid(pages[i], offset[i + 1], false); + } + + f2fs_ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK); + + /* free direct nodes linked to a partial indirect node */ + for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { + child_nid = get_nid(pages[idx], i, false); + if (!child_nid) + continue; + dn->nid = child_nid; + err = truncate_dnode(dn); + if (err < 0) + goto fail; + if (set_nid(pages[idx], i, 0, false)) + dn->node_changed = true; + } + + if (offset[idx + 1] == 0) { + dn->node_page = pages[idx]; + dn->nid = nid[idx]; + err = truncate_node(dn); + if (err) + goto fail; + } else { + f2fs_put_page(pages[idx], 1); + } + offset[idx]++; + offset[idx + 1] = 0; + idx--; +fail: + for (i = idx; i >= 0; i--) + f2fs_put_page(pages[i], 1); + + trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err); + + return err; +} + +/* + * All the block addresses of data and nodes should be nullified. + */ +int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int err = 0, cont = 1; + int level, offset[4], noffset[4]; + unsigned int nofs = 0; + struct f2fs_inode *ri; + struct dnode_of_data dn; + struct page *page; + + trace_f2fs_truncate_inode_blocks_enter(inode, from); + + level = get_node_path(inode, from, offset, noffset); + if (level < 0) { + trace_f2fs_truncate_inode_blocks_exit(inode, level); + return level; + } + + page = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) { + trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); + return PTR_ERR(page); + } + + set_new_dnode(&dn, inode, page, NULL, 0); + unlock_page(page); + + ri = F2FS_INODE(page); + switch (level) { + case 0: + case 1: + nofs = noffset[1]; + break; + case 2: + nofs = noffset[1]; + if (!offset[level - 1]) + goto skip_partial; + err = truncate_partial_nodes(&dn, ri, offset, level); + if (err < 0 && err != -ENOENT) + goto fail; + nofs += 1 + NIDS_PER_BLOCK; + break; + case 3: + nofs = 5 + 2 * NIDS_PER_BLOCK; + if (!offset[level - 1]) + goto skip_partial; + err = truncate_partial_nodes(&dn, ri, offset, level); + if (err < 0 && err != -ENOENT) + goto fail; + break; + default: + BUG(); + } + +skip_partial: + while (cont) { + dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); + switch (offset[0]) { + case NODE_DIR1_BLOCK: + case NODE_DIR2_BLOCK: + err = truncate_dnode(&dn); + break; + + case NODE_IND1_BLOCK: + case NODE_IND2_BLOCK: + err = truncate_nodes(&dn, nofs, offset[1], 2); + break; + + case NODE_DIND_BLOCK: + err = truncate_nodes(&dn, nofs, offset[1], 3); + cont = 0; + break; + + default: + BUG(); + } + if (err < 0 && err != -ENOENT) + goto fail; + if (offset[1] == 0 && + ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { + lock_page(page); + BUG_ON(page->mapping != NODE_MAPPING(sbi)); + f2fs_wait_on_page_writeback(page, NODE, true, true); + ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; + set_page_dirty(page); + unlock_page(page); + } + offset[1] = 0; + offset[0]++; + nofs += err; + } +fail: + f2fs_put_page(page, 0); + trace_f2fs_truncate_inode_blocks_exit(inode, err); + return err > 0 ? 0 : err; +} + +/* caller must lock inode page */ +int f2fs_truncate_xattr_node(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t nid = F2FS_I(inode)->i_xattr_nid; + struct dnode_of_data dn; + struct page *npage; + int err; + + if (!nid) + return 0; + + npage = f2fs_get_node_page(sbi, nid); + if (IS_ERR(npage)) + return PTR_ERR(npage); + + set_new_dnode(&dn, inode, NULL, npage, nid); + err = truncate_node(&dn); + if (err) { + f2fs_put_page(npage, 1); + return err; + } + + f2fs_i_xnid_write(inode, 0); + + return 0; +} + +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + */ +int f2fs_remove_inode_page(struct inode *inode) +{ + struct dnode_of_data dn; + int err; + + set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); + err = f2fs_get_dnode_of_data(&dn, 0, LOOKUP_NODE); + if (err) + return err; + + err = f2fs_truncate_xattr_node(inode); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + + /* remove potential inline_data blocks */ + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) + f2fs_truncate_data_blocks_range(&dn, 1); + + /* 0 is possible, after f2fs_new_inode() has failed */ + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { + f2fs_put_dnode(&dn); + return -EIO; + } + + if (unlikely(inode->i_blocks != 0 && inode->i_blocks != 8)) { + f2fs_warn(F2FS_I_SB(inode), + "f2fs_remove_inode_page: inconsistent i_blocks, ino:%lu, iblocks:%llu", + inode->i_ino, (unsigned long long)inode->i_blocks); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + } + + /* will put inode & node pages */ + err = truncate_node(&dn); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + return 0; +} + +struct page *f2fs_new_inode_page(struct inode *inode) +{ + struct dnode_of_data dn; + + /* allocate inode page for new inode */ + set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); + + /* caller should f2fs_put_page(page, 1); */ + return f2fs_new_node_page(&dn, 0); +} + +struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct node_info new_ni; + struct page *page; + int err; + + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) + return ERR_PTR(-EPERM); + + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false); + if (!page) + return ERR_PTR(-ENOMEM); + + if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs)))) + goto fail; + +#ifdef CONFIG_F2FS_CHECK_FS + err = f2fs_get_node_info(sbi, dn->nid, &new_ni, false); + if (err) { + dec_valid_node_count(sbi, dn->inode, !ofs); + goto fail; + } + if (unlikely(new_ni.blk_addr != NULL_ADDR)) { + err = -EFSCORRUPTED; + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + goto fail; + } +#endif + new_ni.nid = dn->nid; + new_ni.ino = dn->inode->i_ino; + new_ni.blk_addr = NULL_ADDR; + new_ni.flag = 0; + new_ni.version = 0; + set_node_addr(sbi, &new_ni, NEW_ADDR, false); + + f2fs_wait_on_page_writeback(page, NODE, true, true); + fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); + set_cold_node(page, S_ISDIR(dn->inode->i_mode)); + if (!PageUptodate(page)) + SetPageUptodate(page); + if (set_page_dirty(page)) + dn->node_changed = true; + + if (f2fs_has_xattr_block(ofs)) + f2fs_i_xnid_write(dn->inode, dn->nid); + + if (ofs == 0) + inc_valid_inode_count(sbi); + return page; + +fail: + clear_node_page_dirty(page); + f2fs_put_page(page, 1); + return ERR_PTR(err); +} + +/* + * Caller should do after getting the following values. + * 0: f2fs_put_page(page, 0) + * LOCKED_PAGE or error: f2fs_put_page(page, 1) + */ +static int read_node_page(struct page *page, blk_opf_t op_flags) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + struct node_info ni; + struct f2fs_io_info fio = { + .sbi = sbi, + .type = NODE, + .op = REQ_OP_READ, + .op_flags = op_flags, + .page = page, + .encrypted_page = NULL, + }; + int err; + + if (PageUptodate(page)) { + if (!f2fs_inode_chksum_verify(sbi, page)) { + ClearPageUptodate(page); + return -EFSBADCRC; + } + return LOCKED_PAGE; + } + + err = f2fs_get_node_info(sbi, page->index, &ni, false); + if (err) + return err; + + /* NEW_ADDR can be seen, after cp_error drops some dirty node pages */ + if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR)) { + ClearPageUptodate(page); + return -ENOENT; + } + + fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr; + + err = f2fs_submit_page_bio(&fio); + + if (!err) + f2fs_update_iostat(sbi, NULL, FS_NODE_READ_IO, F2FS_BLKSIZE); + + return err; +} + +/* + * Readahead a node page + */ +void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct page *apage; + int err; + + if (!nid) + return; + if (f2fs_check_nid_range(sbi, nid)) + return; + + apage = xa_load(&NODE_MAPPING(sbi)->i_pages, nid); + if (apage) + return; + + apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); + if (!apage) + return; + + err = read_node_page(apage, REQ_RAHEAD); + f2fs_put_page(apage, err ? 1 : 0); +} + +static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, + struct page *parent, int start) +{ + struct page *page; + int err; + + if (!nid) + return ERR_PTR(-ENOENT); + if (f2fs_check_nid_range(sbi, nid)) + return ERR_PTR(-EINVAL); +repeat: + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); + if (!page) + return ERR_PTR(-ENOMEM); + + err = read_node_page(page, 0); + if (err < 0) { + goto out_put_err; + } else if (err == LOCKED_PAGE) { + err = 0; + goto page_hit; + } + + if (parent) + f2fs_ra_node_pages(parent, start + 1, MAX_RA_NODE); + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + f2fs_put_page(page, 1); + goto repeat; + } + + if (unlikely(!PageUptodate(page))) { + err = -EIO; + goto out_err; + } + + if (!f2fs_inode_chksum_verify(sbi, page)) { + err = -EFSBADCRC; + goto out_err; + } +page_hit: + if (likely(nid == nid_of_node(page))) + return page; + + f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + nid, nid_of_node(page), ino_of_node(page), + ofs_of_node(page), cpver_of_node(page), + next_blkaddr_of_node(page)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); + err = -EFSCORRUPTED; +out_err: + ClearPageUptodate(page); +out_put_err: + /* ENOENT comes from read_node_page which is not an error. */ + if (err != -ENOENT) + f2fs_handle_page_eio(sbi, page->index, NODE); + f2fs_put_page(page, 1); + return ERR_PTR(err); +} + +struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +{ + return __get_node_page(sbi, nid, NULL, 0); +} + +struct page *f2fs_get_node_page_ra(struct page *parent, int start) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + nid_t nid = get_nid(parent, start, false); + + return __get_node_page(sbi, nid, parent, start); +} + +static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct inode *inode; + struct page *page; + int ret; + + /* should flush inline_data before evict_inode */ + inode = ilookup(sbi->sb, ino); + if (!inode) + return; + + page = f2fs_pagecache_get_page(inode->i_mapping, 0, + FGP_LOCK|FGP_NOWAIT, 0); + if (!page) + goto iput_out; + + if (!PageUptodate(page)) + goto page_out; + + if (!PageDirty(page)) + goto page_out; + + if (!clear_page_dirty_for_io(page)) + goto page_out; + + ret = f2fs_write_inline_data(inode, page); + inode_dec_dirty_pages(inode); + f2fs_remove_dirty_inode(inode); + if (ret) + set_page_dirty(page); +page_out: + f2fs_put_page(page, 1); +iput_out: + iput(inode); +} + +static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) +{ + pgoff_t index; + struct pagevec pvec; + struct page *last_page = NULL; + int nr_pages; + + pagevec_init(&pvec); + index = 0; + + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY))) { + int i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + return ERR_PTR(-EIO); + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (last_page) + f2fs_put_page(last_page, 0); + + get_page(page); + last_page = page; + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + return last_page; +} + +static int __write_node_page(struct page *page, bool atomic, bool *submitted, + struct writeback_control *wbc, bool do_balance, + enum iostat_type io_type, unsigned int *seq_id) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + nid_t nid; + struct node_info ni; + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = ino_of_node(page), + .type = NODE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .page = page, + .encrypted_page = NULL, + .submitted = false, + .io_type = io_type, + .io_wbc = wbc, + }; + unsigned int seq; + + trace_f2fs_writepage(page, NODE); + + if (unlikely(f2fs_cp_error(sbi))) { + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_NODES); + unlock_page(page); + return 0; + } + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + + if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + wbc->sync_mode == WB_SYNC_NONE && + IS_DNODE(page) && is_cold_node(page)) + goto redirty_out; + + /* get old block addr of this node page */ + nid = nid_of_node(page); + f2fs_bug_on(sbi, page->index != nid); + + if (f2fs_get_node_info(sbi, nid, &ni, !do_balance)) + goto redirty_out; + + if (wbc->for_reclaim) { + if (!f2fs_down_read_trylock(&sbi->node_write)) + goto redirty_out; + } else { + f2fs_down_read(&sbi->node_write); + } + + /* This page is already truncated */ + if (unlikely(ni.blk_addr == NULL_ADDR)) { + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_NODES); + f2fs_up_read(&sbi->node_write); + unlock_page(page); + return 0; + } + + if (__is_valid_data_blkaddr(ni.blk_addr) && + !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, + DATA_GENERIC_ENHANCE)) { + f2fs_up_read(&sbi->node_write); + goto redirty_out; + } + + if (atomic && !test_opt(sbi, NOBARRIER) && !f2fs_sb_has_blkzoned(sbi)) + fio.op_flags |= REQ_PREFLUSH | REQ_FUA; + + /* should add to global list before clearing PAGECACHE status */ + if (f2fs_in_warm_node_list(sbi, page)) { + seq = f2fs_add_fsync_node_entry(sbi, page); + if (seq_id) + *seq_id = seq; + } + + set_page_writeback(page); + ClearPageError(page); + + fio.old_blkaddr = ni.blk_addr; + f2fs_do_write_node_page(nid, &fio); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); + dec_page_count(sbi, F2FS_DIRTY_NODES); + f2fs_up_read(&sbi->node_write); + + if (wbc->for_reclaim) { + f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE); + submitted = NULL; + } + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_submit_merged_write(sbi, NODE); + submitted = NULL; + } + if (submitted) + *submitted = fio.submitted; + + if (do_balance) + f2fs_balance_fs(sbi, false); + return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; +} + +int f2fs_move_node_page(struct page *node_page, int gc_type) +{ + int err = 0; + + if (gc_type == FG_GC) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + .for_reclaim = 0, + }; + + f2fs_wait_on_page_writeback(node_page, NODE, true, true); + + set_page_dirty(node_page); + + if (!clear_page_dirty_for_io(node_page)) { + err = -EAGAIN; + goto out_page; + } + + if (__write_node_page(node_page, false, NULL, + &wbc, false, FS_GC_NODE_IO, NULL)) { + err = -EAGAIN; + unlock_page(node_page); + } + goto release_page; + } else { + /* set page dirty and write it */ + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } +out_page: + unlock_page(node_page); +release_page: + f2fs_put_page(node_page, 0); + return err; +} + +static int f2fs_write_node_page(struct page *page, + struct writeback_control *wbc) +{ + return __write_node_page(page, false, NULL, wbc, false, + FS_NODE_IO, NULL); +} + +int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic, + unsigned int *seq_id) +{ + pgoff_t index; + struct pagevec pvec; + int ret = 0; + struct page *last_page = NULL; + bool marked = false; + nid_t ino = inode->i_ino; + int nr_pages; + int nwritten = 0; + + if (atomic) { + last_page = last_fsync_dnode(sbi, ino); + if (IS_ERR_OR_NULL(last_page)) + return PTR_ERR_OR_ZERO(last_page); + } +retry: + pagevec_init(&pvec); + index = 0; + + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY))) { + int i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + bool submitted = false; + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + ret = -EIO; + goto out; + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page) && page != last_page) { + /* someone wrote it for us */ + goto continue_unlock; + } + + f2fs_wait_on_page_writeback(page, NODE, true, true); + + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); + + if (!atomic || page == last_page) { + set_fsync_mark(page, 1); + percpu_counter_inc(&sbi->rf_node_block_count); + if (IS_INODE(page)) { + if (is_inode_flag_set(inode, + FI_DIRTY_INODE)) + f2fs_update_inode(inode, page); + set_dentry_mark(page, + f2fs_need_dentry_mark(sbi, ino)); + } + /* may be written by other thread */ + if (!PageDirty(page)) + set_page_dirty(page); + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = __write_node_page(page, atomic && + page == last_page, + &submitted, wbc, true, + FS_NODE_IO, seq_id); + if (ret) { + unlock_page(page); + f2fs_put_page(last_page, 0); + break; + } else if (submitted) { + nwritten++; + } + + if (page == last_page) { + f2fs_put_page(page, 0); + marked = true; + break; + } + } + pagevec_release(&pvec); + cond_resched(); + + if (ret || marked) + break; + } + if (!ret && atomic && !marked) { + f2fs_debug(sbi, "Retry to write fsync mark: ino=%u, idx=%lx", + ino, last_page->index); + lock_page(last_page); + f2fs_wait_on_page_writeback(last_page, NODE, true, true); + set_page_dirty(last_page); + unlock_page(last_page); + goto retry; + } +out: + if (nwritten) + f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE); + return ret ? -EIO : 0; +} + +static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool clean; + + if (inode->i_ino != ino) + return 0; + + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) + return 0; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + clean = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + + if (clean) + return 0; + + inode = igrab(inode); + if (!inode) + return 0; + return 1; +} + +static bool flush_dirty_inode(struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + struct inode *inode; + nid_t ino = ino_of_node(page); + + inode = find_inode_nowait(sbi->sb, ino, f2fs_match_ino, NULL); + if (!inode) + return false; + + f2fs_update_inode(inode, page); + unlock_page(page); + + iput(inode); + return true; +} + +void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) +{ + pgoff_t index = 0; + struct pagevec pvec; + int nr_pages; + + pagevec_init(&pvec); + + while ((nr_pages = pagevec_lookup_tag(&pvec, + NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) { + int i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (!IS_DNODE(page)) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + /* flush inline_data, if it's async context. */ + if (page_private_inline(page)) { + clear_page_private_inline(page); + unlock_page(page); + flush_inline_data(sbi, ino_of_node(page)); + continue; + } + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, + struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type) +{ + pgoff_t index; + struct pagevec pvec; + int step = 0; + int nwritten = 0; + int ret = 0; + int nr_pages, done = 0; + + pagevec_init(&pvec); + +next_step: + index = 0; + + while (!done && (nr_pages = pagevec_lookup_tag(&pvec, + NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) { + int i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + bool submitted = false; + + /* give a priority to WB_SYNC threads */ + if (atomic_read(&sbi->wb_sync_req[NODE]) && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } + + /* + * flushing sequence with step: + * 0. indirect nodes + * 1. dentry dnodes + * 2. file dnodes + */ + if (step == 0 && IS_DNODE(page)) + continue; + if (step == 1 && (!IS_DNODE(page) || + is_cold_node(page))) + continue; + if (step == 2 && (!IS_DNODE(page) || + !is_cold_node(page))) + continue; +lock_node: + if (wbc->sync_mode == WB_SYNC_ALL) + lock_page(page); + else if (!trylock_page(page)) + continue; + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + /* flush inline_data/inode, if it's async context. */ + if (!do_balance) + goto write_node; + + /* flush inline_data */ + if (page_private_inline(page)) { + clear_page_private_inline(page); + unlock_page(page); + flush_inline_data(sbi, ino_of_node(page)); + goto lock_node; + } + + /* flush dirty inode */ + if (IS_INODE(page) && flush_dirty_inode(page)) + goto lock_node; +write_node: + f2fs_wait_on_page_writeback(page, NODE, true, true); + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); + + ret = __write_node_page(page, false, &submitted, + wbc, do_balance, io_type, NULL); + if (ret) + unlock_page(page); + else if (submitted) + nwritten++; + + if (--wbc->nr_to_write == 0) + break; + } + pagevec_release(&pvec); + cond_resched(); + + if (wbc->nr_to_write == 0) { + step = 2; + break; + } + } + + if (step < 2) { + if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + wbc->sync_mode == WB_SYNC_NONE && step == 1) + goto out; + step++; + goto next_step; + } +out: + if (nwritten) + f2fs_submit_merged_write(sbi, NODE); + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + return ret; +} + +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, + unsigned int seq_id) +{ + struct fsync_node_entry *fn; + struct page *page; + struct list_head *head = &sbi->fsync_node_list; + unsigned long flags; + unsigned int cur_seq_id = 0; + int ret2, ret = 0; + + while (seq_id && cur_seq_id < seq_id) { + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + break; + } + fn = list_first_entry(head, struct fsync_node_entry, list); + if (fn->seq_id > seq_id) { + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + break; + } + cur_seq_id = fn->seq_id; + page = fn->page; + get_page(page); + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + + f2fs_wait_on_page_writeback(page, NODE, true, false); + if (TestClearPageError(page)) + ret = -EIO; + + put_page(page); + + if (ret) + break; + } + + ret2 = filemap_check_errors(NODE_MAPPING(sbi)); + if (!ret) + ret = ret2; + + return ret; +} + +static int f2fs_write_node_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct blk_plug plug; + long diff; + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + + /* balancing f2fs's metadata in background */ + f2fs_balance_fs_bg(sbi, true); + + /* collect a number of dirty node pages and write together */ + if (wbc->sync_mode != WB_SYNC_ALL && + get_pages(sbi, F2FS_DIRTY_NODES) < + nr_pages_to_skip(sbi, NODE)) + goto skip_write; + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req[NODE]); + else if (atomic_read(&sbi->wb_sync_req[NODE])) { + /* to avoid potential deadlock */ + if (current->plug) + blk_finish_plug(current->plug); + goto skip_write; + } + + trace_f2fs_writepages(mapping->host, wbc, NODE); + + diff = nr_pages_to_write(sbi, NODE, wbc); + blk_start_plug(&plug); + f2fs_sync_node_pages(sbi, wbc, true, FS_NODE_IO); + blk_finish_plug(&plug); + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req[NODE]); + return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); + trace_f2fs_writepages(mapping->host, wbc, NODE); + return 0; +} + +static bool f2fs_dirty_node_folio(struct address_space *mapping, + struct folio *folio) +{ + trace_f2fs_set_page_dirty(&folio->page, NODE); + + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); +#ifdef CONFIG_F2FS_CHECK_FS + if (IS_INODE(&folio->page)) + f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page); +#endif + if (filemap_dirty_folio(mapping, folio)) { + inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); + set_page_private_reference(&folio->page); + return true; + } + return false; +} + +/* + * Structure of the f2fs node operations + */ +const struct address_space_operations f2fs_node_aops = { + .writepage = f2fs_write_node_page, + .writepages = f2fs_write_node_pages, + .dirty_folio = f2fs_dirty_node_folio, + .invalidate_folio = f2fs_invalidate_folio, + .release_folio = f2fs_release_folio, + .migrate_folio = filemap_migrate_folio, +}; + +static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, + nid_t n) +{ + return radix_tree_lookup(&nm_i->free_nid_root, n); +} + +static int __insert_free_nid(struct f2fs_sb_info *sbi, + struct free_nid *i) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + + if (err) + return err; + + nm_i->nid_cnt[FREE_NID]++; + list_add_tail(&i->list, &nm_i->free_nid_list); + return 0; +} + +static void __remove_free_nid(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_state state) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, state != i->state); + nm_i->nid_cnt[state]--; + if (state == FREE_NID) + list_del(&i->list); + radix_tree_delete(&nm_i->free_nid_root, i->nid); +} + +static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, + enum nid_state org_state, enum nid_state dst_state) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, org_state != i->state); + i->state = dst_state; + nm_i->nid_cnt[org_state]--; + nm_i->nid_cnt[dst_state]++; + + switch (dst_state) { + case PREALLOC_NID: + list_del(&i->list); + break; + case FREE_NID: + list_add_tail(&i->list, &nm_i->free_nid_list); + break; + default: + BUG_ON(1); + } +} + +bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i; + bool ret = true; + + f2fs_down_read(&nm_i->nat_tree_lock); + for (i = 0; i < nm_i->nat_blocks; i++) { + if (!test_bit_le(i, nm_i->nat_block_bitmap)) { + ret = false; + break; + } + } + f2fs_up_read(&nm_i->nat_tree_lock); + + return ret; +} + +static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, + bool set, bool build) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); + unsigned int nid_ofs = nid - START_NID(nid); + + if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + if (set) { + if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; + __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + nm_i->free_nid_count[nat_ofs]++; + } else { + if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + if (!build) + nm_i->free_nid_count[nat_ofs]--; + } +} + +/* return if the nid is recognized as free */ +static bool add_free_nid(struct f2fs_sb_info *sbi, + nid_t nid, bool build, bool update) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i, *e; + struct nat_entry *ne; + int err = -EINVAL; + bool ret = false; + + /* 0 nid should not be used */ + if (unlikely(nid == 0)) + return false; + + if (unlikely(f2fs_check_nid_range(sbi, nid))) + return false; + + i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS, true, NULL); + i->nid = nid; + i->state = FREE_NID; + + radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); + + spin_lock(&nm_i->nid_list_lock); + + if (build) { + /* + * Thread A Thread B + * - f2fs_create + * - f2fs_new_inode + * - f2fs_alloc_nid + * - __insert_nid_to_list(PREALLOC_NID) + * - f2fs_balance_fs_bg + * - f2fs_build_free_nids + * - __f2fs_build_free_nids + * - scan_nat_page + * - add_free_nid + * - __lookup_nat_cache + * - f2fs_add_link + * - f2fs_init_inode_metadata + * - f2fs_new_inode_page + * - f2fs_new_node_page + * - set_node_addr + * - f2fs_alloc_nid_done + * - __remove_nid_from_list(PREALLOC_NID) + * - __insert_nid_to_list(FREE_NID) + */ + ne = __lookup_nat_cache(nm_i, nid); + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || + nat_get_blkaddr(ne) != NULL_ADDR)) + goto err_out; + + e = __lookup_free_nid_list(nm_i, nid); + if (e) { + if (e->state == FREE_NID) + ret = true; + goto err_out; + } + } + ret = true; + err = __insert_free_nid(sbi, i); +err_out: + if (update) { + update_free_nid_bitmap(sbi, nid, ret, build); + if (!build) + nm_i->available_nids++; + } + spin_unlock(&nm_i->nid_list_lock); + radix_tree_preload_end(); + + if (err) + kmem_cache_free(free_nid_slab, i); + return ret; +} + +static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i; + bool need_free = false; + + spin_lock(&nm_i->nid_list_lock); + i = __lookup_free_nid_list(nm_i, nid); + if (i && i->state == FREE_NID) { + __remove_free_nid(sbi, i, FREE_NID); + need_free = true; + } + spin_unlock(&nm_i->nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); +} + +static int scan_nat_page(struct f2fs_sb_info *sbi, + struct page *nat_page, nid_t start_nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct f2fs_nat_block *nat_blk = page_address(nat_page); + block_t blk_addr; + unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); + int i; + + __set_bit_le(nat_ofs, nm_i->nat_block_bitmap); + + i = start_nid % NAT_ENTRY_PER_BLOCK; + + for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { + if (unlikely(start_nid >= nm_i->max_nid)) + break; + + blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); + + if (blk_addr == NEW_ADDR) + return -EINVAL; + + if (blk_addr == NULL_ADDR) { + add_free_nid(sbi, start_nid, true, true); + } else { + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, start_nid, false, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } + } + + return 0; +} + +static void scan_curseg_cache(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + int i; + + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + nid_t nid; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); + if (addr == NULL_ADDR) + add_free_nid(sbi, nid, true, false); + else + remove_free_nid(sbi, nid); + } + up_read(&curseg->journal_rwsem); +} + +static void scan_free_nid_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i, idx; + nid_t nid; + + f2fs_down_read(&nm_i->nat_tree_lock); + + for (i = 0; i < nm_i->nat_blocks; i++) { + if (!test_bit_le(i, nm_i->nat_block_bitmap)) + continue; + if (!nm_i->free_nid_count[i]) + continue; + for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { + idx = find_next_bit_le(nm_i->free_nid_bitmap[i], + NAT_ENTRY_PER_BLOCK, idx); + if (idx >= NAT_ENTRY_PER_BLOCK) + break; + + nid = i * NAT_ENTRY_PER_BLOCK + idx; + add_free_nid(sbi, nid, true, false); + + if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS) + goto out; + } + } +out: + scan_curseg_cache(sbi); + + f2fs_up_read(&nm_i->nat_tree_lock); +} + +static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, + bool sync, bool mount) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + int i = 0, ret; + nid_t nid = nm_i->next_scan_nid; + + if (unlikely(nid >= nm_i->max_nid)) + nid = 0; + + if (unlikely(nid % NAT_ENTRY_PER_BLOCK)) + nid = NAT_BLOCK_OFFSET(nid) * NAT_ENTRY_PER_BLOCK; + + /* Enough entries */ + if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) + return 0; + + if (!sync && !f2fs_available_free_memory(sbi, FREE_NIDS)) + return 0; + + if (!mount) { + /* try to find free nids in free_nid_bitmap */ + scan_free_nid_bits(sbi); + + if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) + return 0; + } + + /* readahead nat pages to be scanned */ + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, + META_NAT, true); + + f2fs_down_read(&nm_i->nat_tree_lock); + + while (1) { + if (!test_bit_le(NAT_BLOCK_OFFSET(nid), + nm_i->nat_block_bitmap)) { + struct page *page = get_current_nat_page(sbi, nid); + + if (IS_ERR(page)) { + ret = PTR_ERR(page); + } else { + ret = scan_nat_page(sbi, page, nid); + f2fs_put_page(page, 1); + } + + if (ret) { + f2fs_up_read(&nm_i->nat_tree_lock); + f2fs_err(sbi, "NAT is corrupt, run fsck to fix it"); + return ret; + } + } + + nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); + if (unlikely(nid >= nm_i->max_nid)) + nid = 0; + + if (++i >= FREE_NID_PAGES) + break; + } + + /* go to the next free nat pages to find free nids abundantly */ + nm_i->next_scan_nid = nid; + + /* find free nids from current sum_pages */ + scan_curseg_cache(sbi); + + f2fs_up_read(&nm_i->nat_tree_lock); + + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), + nm_i->ra_nid_pages, META_NAT, false); + + return 0; +} + +int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) +{ + int ret; + + mutex_lock(&NM_I(sbi)->build_lock); + ret = __f2fs_build_free_nids(sbi, sync, mount); + mutex_unlock(&NM_I(sbi)->build_lock); + + return ret; +} + +/* + * If this function returns success, caller can obtain a new nid + * from second parameter of this function. + * The returned nid could be used ino as well as nid when inode is created. + */ +bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i = NULL; +retry: + if (time_to_inject(sbi, FAULT_ALLOC_NID)) { + f2fs_show_injection_info(sbi, FAULT_ALLOC_NID); + return false; + } + + spin_lock(&nm_i->nid_list_lock); + + if (unlikely(nm_i->available_nids == 0)) { + spin_unlock(&nm_i->nid_list_lock); + return false; + } + + /* We should not use stale free nids created by f2fs_build_free_nids */ + if (nm_i->nid_cnt[FREE_NID] && !on_f2fs_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); + i = list_first_entry(&nm_i->free_nid_list, + struct free_nid, list); + *nid = i->nid; + + __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID); + nm_i->available_nids--; + + update_free_nid_bitmap(sbi, *nid, false, false); + + spin_unlock(&nm_i->nid_list_lock); + return true; + } + spin_unlock(&nm_i->nid_list_lock); + + /* Let's scan nat pages and its caches to get free nids */ + if (!f2fs_build_free_nids(sbi, true, false)) + goto retry; + return false; +} + +/* + * f2fs_alloc_nid() should be called prior to this function. + */ +void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i; + + spin_lock(&nm_i->nid_list_lock); + i = __lookup_free_nid_list(nm_i, nid); + f2fs_bug_on(sbi, !i); + __remove_free_nid(sbi, i, PREALLOC_NID); + spin_unlock(&nm_i->nid_list_lock); + + kmem_cache_free(free_nid_slab, i); +} + +/* + * f2fs_alloc_nid() should be called prior to this function. + */ +void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i; + bool need_free = false; + + if (!nid) + return; + + spin_lock(&nm_i->nid_list_lock); + i = __lookup_free_nid_list(nm_i, nid); + f2fs_bug_on(sbi, !i); + + if (!f2fs_available_free_memory(sbi, FREE_NIDS)) { + __remove_free_nid(sbi, i, PREALLOC_NID); + need_free = true; + } else { + __move_free_nid(sbi, i, PREALLOC_NID, FREE_NID); + } + + nm_i->available_nids++; + + update_free_nid_bitmap(sbi, nid, true, false); + + spin_unlock(&nm_i->nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); +} + +int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + int nr = nr_shrink; + + if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) + return 0; + + if (!mutex_trylock(&nm_i->build_lock)) + return 0; + + while (nr_shrink && nm_i->nid_cnt[FREE_NID] > MAX_FREE_NIDS) { + struct free_nid *i, *next; + unsigned int batch = SHRINK_NID_BATCH_SIZE; + + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { + if (!nr_shrink || !batch || + nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) + break; + __remove_free_nid(sbi, i, FREE_NID); + kmem_cache_free(free_nid_slab, i); + nr_shrink--; + batch--; + } + spin_unlock(&nm_i->nid_list_lock); + } + + mutex_unlock(&nm_i->build_lock); + + return nr - nr_shrink; +} + +int f2fs_recover_inline_xattr(struct inode *inode, struct page *page) +{ + void *src_addr, *dst_addr; + size_t inline_size; + struct page *ipage; + struct f2fs_inode *ri; + + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + ri = F2FS_INODE(page); + if (ri->i_inline & F2FS_INLINE_XATTR) { + if (!f2fs_has_inline_xattr(inode)) { + set_inode_flag(inode, FI_INLINE_XATTR); + stat_inc_inline_xattr(inode); + } + } else { + if (f2fs_has_inline_xattr(inode)) { + stat_dec_inline_xattr(inode); + clear_inode_flag(inode, FI_INLINE_XATTR); + } + goto update_inode; + } + + dst_addr = inline_xattr_addr(inode, ipage); + src_addr = inline_xattr_addr(inode, page); + inline_size = inline_xattr_size(inode); + + f2fs_wait_on_page_writeback(ipage, NODE, true, true); + memcpy(dst_addr, src_addr, inline_size); +update_inode: + f2fs_update_inode(inode, ipage); + f2fs_put_page(ipage, 1); + return 0; +} + +int f2fs_recover_xattr_data(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; + nid_t new_xnid; + struct dnode_of_data dn; + struct node_info ni; + struct page *xpage; + int err; + + if (!prev_xnid) + goto recover_xnid; + + /* 1: invalidate the previous xattr nid */ + err = f2fs_get_node_info(sbi, prev_xnid, &ni, false); + if (err) + return err; + + f2fs_invalidate_blocks(sbi, ni.blk_addr); + dec_valid_node_count(sbi, inode, false); + set_node_addr(sbi, &ni, NULL_ADDR, false); + +recover_xnid: + /* 2: update xattr nid in inode */ + if (!f2fs_alloc_nid(sbi, &new_xnid)) + return -ENOSPC; + + set_new_dnode(&dn, inode, NULL, NULL, new_xnid); + xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xpage)) { + f2fs_alloc_nid_failed(sbi, new_xnid); + return PTR_ERR(xpage); + } + + f2fs_alloc_nid_done(sbi, new_xnid); + f2fs_update_inode_page(inode); + + /* 3: update and set xattr node page dirty */ + if (page) { + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), + VALID_XATTR_BLOCK_SIZE); + set_page_dirty(xpage); + } + f2fs_put_page(xpage, 1); + + return 0; +} + +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *src, *dst; + nid_t ino = ino_of_node(page); + struct node_info old_ni, new_ni; + struct page *ipage; + int err; + + err = f2fs_get_node_info(sbi, ino, &old_ni, false); + if (err) + return err; + + if (unlikely(old_ni.blk_addr != NULL_ADDR)) + return -EINVAL; +retry: + ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); + if (!ipage) { + memalloc_retry_wait(GFP_NOFS); + goto retry; + } + + /* Should not use this inode from free nid list */ + remove_free_nid(sbi, ino); + + if (!PageUptodate(ipage)) + SetPageUptodate(ipage); + fill_node_footer(ipage, ino, ino, 0, true); + set_cold_node(ipage, false); + + src = F2FS_INODE(page); + dst = F2FS_INODE(ipage); + + memcpy(dst, src, offsetof(struct f2fs_inode, i_ext)); + dst->i_size = 0; + dst->i_blocks = cpu_to_le64(1); + dst->i_links = cpu_to_le32(1); + dst->i_xattr_nid = 0; + dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); + if (dst->i_inline & F2FS_EXTRA_ATTR) { + dst->i_extra_isize = src->i_extra_isize; + + if (f2fs_sb_has_flexible_inline_xattr(sbi) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_inline_xattr_size)) + dst->i_inline_xattr_size = src->i_inline_xattr_size; + + if (f2fs_sb_has_project_quota(sbi) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_projid)) + dst->i_projid = src->i_projid; + + if (f2fs_sb_has_inode_crtime(sbi) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_crtime_nsec)) { + dst->i_crtime = src->i_crtime; + dst->i_crtime_nsec = src->i_crtime_nsec; + } + } + + new_ni = old_ni; + new_ni.ino = ino; + + if (unlikely(inc_valid_node_count(sbi, NULL, true))) + WARN_ON(1); + set_node_addr(sbi, &new_ni, NEW_ADDR, false); + inc_valid_inode_count(sbi); + set_page_dirty(ipage); + f2fs_put_page(ipage, 1); + return 0; +} + +int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, + unsigned int segno, struct f2fs_summary_block *sum) +{ + struct f2fs_node *rn; + struct f2fs_summary *sum_entry; + block_t addr; + int i, idx, last_offset, nrpages; + + /* scan the node segment */ + last_offset = sbi->blocks_per_seg; + addr = START_BLOCK(sbi, segno); + sum_entry = &sum->entries[0]; + + for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { + nrpages = bio_max_segs(last_offset - i); + + /* readahead node pages */ + f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true); + + for (idx = addr; idx < addr + nrpages; idx++) { + struct page *page = f2fs_get_tmp_page(sbi, idx); + + if (IS_ERR(page)) + return PTR_ERR(page); + + rn = F2FS_NODE(page); + sum_entry->nid = rn->footer.nid; + sum_entry->version = 0; + sum_entry->ofs_in_node = 0; + sum_entry++; + f2fs_put_page(page, 1); + } + + invalidate_mapping_pages(META_MAPPING(sbi), addr, + addr + nrpages); + } + return 0; +} + +static void remove_nats_in_journal(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + int i; + + down_write(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + struct nat_entry *ne; + struct f2fs_nat_entry raw_ne; + nid_t nid = le32_to_cpu(nid_in_journal(journal, i)); + + if (f2fs_check_nid_range(sbi, nid)) + continue; + + raw_ne = nat_in_journal(journal, i); + + ne = __lookup_nat_cache(nm_i, nid); + if (!ne) { + ne = __alloc_nat_entry(sbi, nid, true); + __init_nat_entry(nm_i, ne, &raw_ne, true); + } + + /* + * if a free nat in journal has not been used after last + * checkpoint, we should remove it from available nids, + * since later we will add it again. + */ + if (!get_nat_flag(ne, IS_DIRTY) && + le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) { + spin_lock(&nm_i->nid_list_lock); + nm_i->available_nids--; + spin_unlock(&nm_i->nid_list_lock); + } + + __set_nat_cache_dirty(nm_i, ne); + } + update_nats_in_cursum(journal, -i); + up_write(&curseg->journal_rwsem); +} + +static void __adjust_nat_entry_set(struct nat_entry_set *nes, + struct list_head *head, int max) +{ + struct nat_entry_set *cur; + + if (nes->entry_cnt >= max) + goto add_out; + + list_for_each_entry(cur, head, set_list) { + if (cur->entry_cnt >= nes->entry_cnt) { + list_add(&nes->set_list, cur->set_list.prev); + return; + } + } +add_out: + list_add_tail(&nes->set_list, head); +} + +static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs, + unsigned int valid) +{ + if (valid == 0) { + __set_bit_le(nat_ofs, nm_i->empty_nat_bits); + __clear_bit_le(nat_ofs, nm_i->full_nat_bits); + return; + } + + __clear_bit_le(nat_ofs, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_ofs, nm_i->full_nat_bits); + else + __clear_bit_le(nat_ofs, nm_i->full_nat_bits); +} + +static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, + struct page *page) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; + struct f2fs_nat_block *nat_blk = page_address(page); + int valid = 0; + int i = 0; + + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + return; + + if (nat_index == 0) { + valid = 1; + i = 1; + } + for (; i < NAT_ENTRY_PER_BLOCK; i++) { + if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR) + valid++; + } + + __update_nat_bits(nm_i, nat_index, valid); +} + +void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs; + + f2fs_down_read(&nm_i->nat_tree_lock); + + for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) { + unsigned int valid = 0, nid_ofs = 0; + + /* handle nid zero due to it should never be used */ + if (unlikely(nat_ofs == 0)) { + valid = 1; + nid_ofs = 1; + } + + for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) { + if (!test_bit_le(nid_ofs, + nm_i->free_nid_bitmap[nat_ofs])) + valid++; + } + + __update_nat_bits(nm_i, nat_ofs, valid); + } + + f2fs_up_read(&nm_i->nat_tree_lock); +} + +static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, + struct nat_entry_set *set, struct cp_control *cpc) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK; + bool to_journal = true; + struct f2fs_nat_block *nat_blk; + struct nat_entry *ne, *cur; + struct page *page = NULL; + + /* + * there are two steps to flush nat entries: + * #1, flush nat entries to journal in current hot data summary block. + * #2, flush nat entries to nat page. + */ + if ((cpc->reason & CP_UMOUNT) || + !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) + to_journal = false; + + if (to_journal) { + down_write(&curseg->journal_rwsem); + } else { + page = get_next_nat_page(sbi, start_nid); + if (IS_ERR(page)) + return PTR_ERR(page); + + nat_blk = page_address(page); + f2fs_bug_on(sbi, !nat_blk); + } + + /* flush dirty nats in nat entry set */ + list_for_each_entry_safe(ne, cur, &set->entry_list, list) { + struct f2fs_nat_entry *raw_ne; + nid_t nid = nat_get_nid(ne); + int offset; + + f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR); + + if (to_journal) { + offset = f2fs_lookup_journal_in_cursum(journal, + NAT_JOURNAL, nid, 1); + f2fs_bug_on(sbi, offset < 0); + raw_ne = &nat_in_journal(journal, offset); + nid_in_journal(journal, offset) = cpu_to_le32(nid); + } else { + raw_ne = &nat_blk->entries[nid - start_nid]; + } + raw_nat_from_node_info(raw_ne, &ne->ni); + nat_reset_flag(ne); + __clear_nat_cache_dirty(NM_I(sbi), set, ne); + if (nat_get_blkaddr(ne) == NULL_ADDR) { + add_free_nid(sbi, nid, false, true); + } else { + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, nid, false, false); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } + } + + if (to_journal) { + up_write(&curseg->journal_rwsem); + } else { + update_nat_bits(sbi, start_nid, page); + f2fs_put_page(page, 1); + } + + /* Allow dirty nats by node block allocation in write_begin */ + if (!set->entry_cnt) { + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + kmem_cache_free(nat_entry_set_slab, set); + } + return 0; +} + +/* + * This function is called during the checkpointing process. + */ +int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + struct nat_entry_set *setvec[SETVEC_SIZE]; + struct nat_entry_set *set, *tmp; + unsigned int found; + nid_t set_idx = 0; + LIST_HEAD(sets); + int err = 0; + + /* + * during unmount, let's flush nat_bits before checking + * nat_cnt[DIRTY_NAT]. + */ + if (cpc->reason & CP_UMOUNT) { + f2fs_down_write(&nm_i->nat_tree_lock); + remove_nats_in_journal(sbi); + f2fs_up_write(&nm_i->nat_tree_lock); + } + + if (!nm_i->nat_cnt[DIRTY_NAT]) + return 0; + + f2fs_down_write(&nm_i->nat_tree_lock); + + /* + * if there are no enough space in journal to store dirty nat + * entries, remove all entries from journal and merge them + * into nat entry set. + */ + if (cpc->reason & CP_UMOUNT || + !__has_cursum_space(journal, + nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) + remove_nats_in_journal(sbi); + + while ((found = __gang_lookup_nat_set(nm_i, + set_idx, SETVEC_SIZE, setvec))) { + unsigned idx; + + set_idx = setvec[found - 1]->set + 1; + for (idx = 0; idx < found; idx++) + __adjust_nat_entry_set(setvec[idx], &sets, + MAX_NAT_JENTRIES(journal)); + } + + /* flush dirty nats in nat entry set */ + list_for_each_entry_safe(set, tmp, &sets, set_list) { + err = __flush_nat_entry_set(sbi, set, cpc); + if (err) + break; + } + + f2fs_up_write(&nm_i->nat_tree_lock); + /* Allow dirty nats by node block allocation in write_begin */ + + return err; +} + +static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE; + unsigned int i; + __u64 cp_ver = cur_cp_version(ckpt); + block_t nat_bits_addr; + + nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); + nm_i->nat_bits = f2fs_kvzalloc(sbi, + nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL); + if (!nm_i->nat_bits) + return -ENOMEM; + + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + return 0; + + nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - + nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) { + struct page *page; + + page = f2fs_get_meta_page(sbi, nat_bits_addr++); + if (IS_ERR(page)) + return PTR_ERR(page); + + memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), + page_address(page), F2FS_BLKSIZE); + f2fs_put_page(page, 1); + } + + cp_ver |= (cur_cp_crc(ckpt) << 32); + if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { + clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)", + cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits)); + return 0; + } + + f2fs_notice(sbi, "Found nat_bits in checkpoint"); + return 0; +} + +static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i = 0; + nid_t nid, last_nid; + + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + return; + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + + nid = i * NAT_ENTRY_PER_BLOCK; + last_nid = nid + NAT_ENTRY_PER_BLOCK; + + spin_lock(&NM_I(sbi)->nid_list_lock); + for (; nid < last_nid; nid++) + update_free_nid_bitmap(sbi, nid, true, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + } +} + +static int init_node_manager(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned char *version_bitmap; + unsigned int nat_segs; + int err; + + nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr); + + /* segment_count_nat includes pair segment so divide to 2. */ + nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; + nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks; + + /* not used nids: 0, node, meta, (and root counted as valid node) */ + nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - + F2FS_RESERVED_NODE_NUM; + nm_i->nid_cnt[FREE_NID] = 0; + nm_i->nid_cnt[PREALLOC_NID] = 0; + nm_i->ram_thresh = DEF_RAM_THRESHOLD; + nm_i->ra_nid_pages = DEF_RA_NID_PAGES; + nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; + nm_i->max_rf_node_blocks = DEF_RF_NODE_BLOCKS; + + INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); + INIT_LIST_HEAD(&nm_i->free_nid_list); + INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); + INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); + INIT_LIST_HEAD(&nm_i->nat_entries); + spin_lock_init(&nm_i->nat_list_lock); + + mutex_init(&nm_i->build_lock); + spin_lock_init(&nm_i->nid_list_lock); + init_f2fs_rwsem(&nm_i->nat_tree_lock); + + nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); + nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); + version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP); + nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size, + GFP_KERNEL); + if (!nm_i->nat_bitmap) + return -ENOMEM; + + err = __get_nat_bitmaps(sbi); + if (err) + return err; + +#ifdef CONFIG_F2FS_CHECK_FS + nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size, + GFP_KERNEL); + if (!nm_i->nat_bitmap_mir) + return -ENOMEM; +#endif + + return 0; +} + +static int init_free_nid_cache(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + int i; + + nm_i->free_nid_bitmap = + f2fs_kvzalloc(sbi, array_size(sizeof(unsigned char *), + nm_i->nat_blocks), + GFP_KERNEL); + if (!nm_i->free_nid_bitmap) + return -ENOMEM; + + for (i = 0; i < nm_i->nat_blocks; i++) { + nm_i->free_nid_bitmap[i] = f2fs_kvzalloc(sbi, + f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK), GFP_KERNEL); + if (!nm_i->free_nid_bitmap[i]) + return -ENOMEM; + } + + nm_i->nat_block_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks / 8, + GFP_KERNEL); + if (!nm_i->nat_block_bitmap) + return -ENOMEM; + + nm_i->free_nid_count = + f2fs_kvzalloc(sbi, array_size(sizeof(unsigned short), + nm_i->nat_blocks), + GFP_KERNEL); + if (!nm_i->free_nid_count) + return -ENOMEM; + return 0; +} + +int f2fs_build_node_manager(struct f2fs_sb_info *sbi) +{ + int err; + + sbi->nm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_nm_info), + GFP_KERNEL); + if (!sbi->nm_info) + return -ENOMEM; + + err = init_node_manager(sbi); + if (err) + return err; + + err = init_free_nid_cache(sbi); + if (err) + return err; + + /* load free nid status from nat_bits table */ + load_free_nid_bitmap(sbi); + + return f2fs_build_free_nids(sbi, true, true); +} + +void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i, *next_i; + struct nat_entry *natvec[NATVEC_SIZE]; + struct nat_entry_set *setvec[SETVEC_SIZE]; + nid_t nid = 0; + unsigned int found; + + if (!nm_i) + return; + + /* destroy free nid list */ + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { + __remove_free_nid(sbi, i, FREE_NID); + spin_unlock(&nm_i->nid_list_lock); + kmem_cache_free(free_nid_slab, i); + spin_lock(&nm_i->nid_list_lock); + } + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID]); + f2fs_bug_on(sbi, nm_i->nid_cnt[PREALLOC_NID]); + f2fs_bug_on(sbi, !list_empty(&nm_i->free_nid_list)); + spin_unlock(&nm_i->nid_list_lock); + + /* destroy nat cache */ + f2fs_down_write(&nm_i->nat_tree_lock); + while ((found = __gang_lookup_nat_cache(nm_i, + nid, NATVEC_SIZE, natvec))) { + unsigned idx; + + nid = nat_get_nid(natvec[found - 1]) + 1; + for (idx = 0; idx < found; idx++) { + spin_lock(&nm_i->nat_list_lock); + list_del(&natvec[idx]->list); + spin_unlock(&nm_i->nat_list_lock); + + __del_from_nat_cache(nm_i, natvec[idx]); + } + } + f2fs_bug_on(sbi, nm_i->nat_cnt[TOTAL_NAT]); + + /* destroy nat set cache */ + nid = 0; + while ((found = __gang_lookup_nat_set(nm_i, + nid, SETVEC_SIZE, setvec))) { + unsigned idx; + + nid = setvec[found - 1]->set + 1; + for (idx = 0; idx < found; idx++) { + /* entry_cnt is not zero, when cp_error was occurred */ + f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list)); + radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set); + kmem_cache_free(nat_entry_set_slab, setvec[idx]); + } + } + f2fs_up_write(&nm_i->nat_tree_lock); + + kvfree(nm_i->nat_block_bitmap); + if (nm_i->free_nid_bitmap) { + int i; + + for (i = 0; i < nm_i->nat_blocks; i++) + kvfree(nm_i->free_nid_bitmap[i]); + kvfree(nm_i->free_nid_bitmap); + } + kvfree(nm_i->free_nid_count); + + kvfree(nm_i->nat_bitmap); + kvfree(nm_i->nat_bits); +#ifdef CONFIG_F2FS_CHECK_FS + kvfree(nm_i->nat_bitmap_mir); +#endif + sbi->nm_info = NULL; + kfree(nm_i); +} + +int __init f2fs_create_node_manager_caches(void) +{ + nat_entry_slab = f2fs_kmem_cache_create("f2fs_nat_entry", + sizeof(struct nat_entry)); + if (!nat_entry_slab) + goto fail; + + free_nid_slab = f2fs_kmem_cache_create("f2fs_free_nid", + sizeof(struct free_nid)); + if (!free_nid_slab) + goto destroy_nat_entry; + + nat_entry_set_slab = f2fs_kmem_cache_create("f2fs_nat_entry_set", + sizeof(struct nat_entry_set)); + if (!nat_entry_set_slab) + goto destroy_free_nid; + + fsync_node_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_node_entry", + sizeof(struct fsync_node_entry)); + if (!fsync_node_entry_slab) + goto destroy_nat_entry_set; + return 0; + +destroy_nat_entry_set: + kmem_cache_destroy(nat_entry_set_slab); +destroy_free_nid: + kmem_cache_destroy(free_nid_slab); +destroy_nat_entry: + kmem_cache_destroy(nat_entry_slab); +fail: + return -ENOMEM; +} + +void f2fs_destroy_node_manager_caches(void) +{ + kmem_cache_destroy(fsync_node_entry_slab); + kmem_cache_destroy(nat_entry_set_slab); + kmem_cache_destroy(free_nid_slab); + kmem_cache_destroy(nat_entry_slab); +} diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h new file mode 100644 index 000000000..7068f3ac0 --- /dev/null +++ b/fs/f2fs/node.h @@ -0,0 +1,429 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * fs/f2fs/node.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + */ +/* start node id of a node block dedicated to the given node id */ +#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) + +/* node block offset on the NAT area dedicated to the given start node id */ +#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK) + +/* # of pages to perform synchronous readahead before building free nids */ +#define FREE_NID_PAGES 8 +#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) + +/* size of free nid batch when shrinking */ +#define SHRINK_NID_BATCH_SIZE 8 + +#define DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */ + +/* maximum readahead size for node during getting data blocks */ +#define MAX_RA_NODE 128 + +/* control the memory footprint threshold (10MB per 1GB ram) */ +#define DEF_RAM_THRESHOLD 1 + +/* control dirty nats ratio threshold (default: 10% over max nid count) */ +#define DEF_DIRTY_NAT_RATIO_THRESHOLD 10 +/* control total # of nats */ +#define DEF_NAT_CACHE_THRESHOLD 100000 + +/* control total # of node writes used for roll-fowrad recovery */ +#define DEF_RF_NODE_BLOCKS 0 + +/* vector size for gang look-up from nat cache that consists of radix tree */ +#define NATVEC_SIZE 64 +#define SETVEC_SIZE 32 + +/* return value for read_node_page */ +#define LOCKED_PAGE 1 + +/* check pinned file's alignment status of physical blocks */ +#define FILE_NOT_ALIGNED 1 + +/* For flag in struct node_info */ +enum { + IS_CHECKPOINTED, /* is it checkpointed before? */ + HAS_FSYNCED_INODE, /* is the inode fsynced before? */ + HAS_LAST_FSYNC, /* has the latest node fsync mark? */ + IS_DIRTY, /* this nat entry is dirty? */ + IS_PREALLOC, /* nat entry is preallocated */ +}; + +/* + * For node information + */ +struct node_info { + nid_t nid; /* node id */ + nid_t ino; /* inode number of the node's owner */ + block_t blk_addr; /* block address of the node */ + unsigned char version; /* version of the node */ + unsigned char flag; /* for node information bits */ +}; + +struct nat_entry { + struct list_head list; /* for clean or dirty nat list */ + struct node_info ni; /* in-memory node information */ +}; + +#define nat_get_nid(nat) ((nat)->ni.nid) +#define nat_set_nid(nat, n) ((nat)->ni.nid = (n)) +#define nat_get_blkaddr(nat) ((nat)->ni.blk_addr) +#define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b)) +#define nat_get_ino(nat) ((nat)->ni.ino) +#define nat_set_ino(nat, i) ((nat)->ni.ino = (i)) +#define nat_get_version(nat) ((nat)->ni.version) +#define nat_set_version(nat, v) ((nat)->ni.version = (v)) + +#define inc_node_version(version) (++(version)) + +static inline void copy_node_info(struct node_info *dst, + struct node_info *src) +{ + dst->nid = src->nid; + dst->ino = src->ino; + dst->blk_addr = src->blk_addr; + dst->version = src->version; + /* should not copy flag here */ +} + +static inline void set_nat_flag(struct nat_entry *ne, + unsigned int type, bool set) +{ + if (set) + ne->ni.flag |= BIT(type); + else + ne->ni.flag &= ~BIT(type); +} + +static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type) +{ + return ne->ni.flag & BIT(type); +} + +static inline void nat_reset_flag(struct nat_entry *ne) +{ + /* these states can be set only after checkpoint was done */ + set_nat_flag(ne, IS_CHECKPOINTED, true); + set_nat_flag(ne, HAS_FSYNCED_INODE, false); + set_nat_flag(ne, HAS_LAST_FSYNC, true); +} + +static inline void node_info_from_raw_nat(struct node_info *ni, + struct f2fs_nat_entry *raw_ne) +{ + ni->ino = le32_to_cpu(raw_ne->ino); + ni->blk_addr = le32_to_cpu(raw_ne->block_addr); + ni->version = raw_ne->version; +} + +static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, + struct node_info *ni) +{ + raw_ne->ino = cpu_to_le32(ni->ino); + raw_ne->block_addr = cpu_to_le32(ni->blk_addr); + raw_ne->version = ni->version; +} + +static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->nat_cnt[DIRTY_NAT] >= NM_I(sbi)->max_nid * + NM_I(sbi)->dirty_nats_ratio / 100; +} + +static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD; +} + +enum mem_type { + FREE_NIDS, /* indicates the free nid list */ + NAT_ENTRIES, /* indicates the cached nat entry */ + DIRTY_DENTS, /* indicates dirty dentry pages */ + INO_ENTRIES, /* indicates inode entries */ + READ_EXTENT_CACHE, /* indicates read extent cache */ + DISCARD_CACHE, /* indicates memory of cached discard cmds */ + COMPRESS_PAGE, /* indicates memory of cached compressed pages */ + BASE_CHECK, /* check kernel status */ +}; + +struct nat_entry_set { + struct list_head set_list; /* link with other nat sets */ + struct list_head entry_list; /* link with dirty nat entries */ + nid_t set; /* set number*/ + unsigned int entry_cnt; /* the # of nat entries in set */ +}; + +struct free_nid { + struct list_head list; /* for free node id list */ + nid_t nid; /* node id */ + int state; /* in use or not: FREE_NID or PREALLOC_NID */ +}; + +static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *fnid; + + spin_lock(&nm_i->nid_list_lock); + if (nm_i->nid_cnt[FREE_NID] <= 0) { + spin_unlock(&nm_i->nid_list_lock); + return; + } + fnid = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); + *nid = fnid->nid; + spin_unlock(&nm_i->nid_list_lock); +} + +/* + * inline functions + */ +static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(nm_i->nat_bitmap, nm_i->nat_bitmap_mir, + nm_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif + memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size); +} + +static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + pgoff_t block_off; + pgoff_t block_addr; + + /* + * block_off = segment_off * 512 + off_in_segment + * OLD = (segment_off * 512) * 2 + off_in_segment + * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment + */ + block_off = NAT_BLOCK_OFFSET(start); + + block_addr = (pgoff_t)(nm_i->nat_blkaddr + + (block_off << 1) - + (block_off & (sbi->blocks_per_seg - 1))); + + if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) + block_addr += sbi->blocks_per_seg; + + return block_addr; +} + +static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, + pgoff_t block_addr) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + block_addr -= nm_i->nat_blkaddr; + block_addr ^= BIT(sbi->log_blocks_per_seg); + return block_addr + nm_i->nat_blkaddr; +} + +static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) +{ + unsigned int block_off = NAT_BLOCK_OFFSET(start_nid); + + f2fs_change_bit(block_off, nm_i->nat_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, nm_i->nat_bitmap_mir); +#endif +} + +static inline nid_t ino_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.ino); +} + +static inline nid_t nid_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.nid); +} + +static inline unsigned int ofs_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + unsigned flag = le32_to_cpu(rn->footer.flag); + return flag >> OFFSET_BIT_SHIFT; +} + +static inline __u64 cpver_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le64_to_cpu(rn->footer.cp_ver); +} + +static inline block_t next_blkaddr_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.next_blkaddr); +} + +static inline void fill_node_footer(struct page *page, nid_t nid, + nid_t ino, unsigned int ofs, bool reset) +{ + struct f2fs_node *rn = F2FS_NODE(page); + unsigned int old_flag = 0; + + if (reset) + memset(rn, 0, sizeof(*rn)); + else + old_flag = le32_to_cpu(rn->footer.flag); + + rn->footer.nid = cpu_to_le32(nid); + rn->footer.ino = cpu_to_le32(ino); + + /* should remain old flag bits such as COLD_BIT_SHIFT */ + rn->footer.flag = cpu_to_le32((ofs << OFFSET_BIT_SHIFT) | + (old_flag & OFFSET_BIT_MASK)); +} + +static inline void copy_node_footer(struct page *dst, struct page *src) +{ + struct f2fs_node *src_rn = F2FS_NODE(src); + struct f2fs_node *dst_rn = F2FS_NODE(dst); + memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer)); +} + +static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); + struct f2fs_node *rn = F2FS_NODE(page); + __u64 cp_ver = cur_cp_version(ckpt); + + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); + + rn->footer.cp_ver = cpu_to_le64(cp_ver); + rn->footer.next_blkaddr = cpu_to_le32(blkaddr); +} + +static inline bool is_recoverable_dnode(struct page *page) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); + __u64 cp_ver = cur_cp_version(ckpt); + + /* Don't care crc part, if fsck.f2fs sets it. */ + if (__is_set_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG)) + return (cp_ver << 32) == (cpver_of_node(page) << 32); + + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); + + return cp_ver == cpver_of_node(page); +} + +/* + * f2fs assigns the following node offsets described as (num). + * N = NIDS_PER_BLOCK + * + * Inode block (0) + * |- direct node (1) + * |- direct node (2) + * |- indirect node (3) + * | `- direct node (4 => 4 + N - 1) + * |- indirect node (4 + N) + * | `- direct node (5 + N => 5 + 2N - 1) + * `- double indirect node (5 + 2N) + * `- indirect node (6 + 2N) + * `- direct node + * ...... + * `- indirect node ((6 + 2N) + x(N + 1)) + * `- direct node + * ...... + * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) + * `- direct node + */ +static inline bool IS_DNODE(struct page *node_page) +{ + unsigned int ofs = ofs_of_node(node_page); + + if (f2fs_has_xattr_block(ofs)) + return true; + + if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || + ofs == 5 + 2 * NIDS_PER_BLOCK) + return false; + if (ofs >= 6 + 2 * NIDS_PER_BLOCK) { + ofs -= 6 + 2 * NIDS_PER_BLOCK; + if (!((long int)ofs % (NIDS_PER_BLOCK + 1))) + return false; + } + return true; +} + +static inline int set_nid(struct page *p, int off, nid_t nid, bool i) +{ + struct f2fs_node *rn = F2FS_NODE(p); + + f2fs_wait_on_page_writeback(p, NODE, true, true); + + if (i) + rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); + else + rn->in.nid[off] = cpu_to_le32(nid); + return set_page_dirty(p); +} + +static inline nid_t get_nid(struct page *p, int off, bool i) +{ + struct f2fs_node *rn = F2FS_NODE(p); + + if (i) + return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]); + return le32_to_cpu(rn->in.nid[off]); +} + +/* + * Coldness identification: + * - Mark cold files in f2fs_inode_info + * - Mark cold node blocks in their node footer + * - Mark cold data pages in page cache + */ + +static inline int is_node(struct page *page, int type) +{ + struct f2fs_node *rn = F2FS_NODE(page); + return le32_to_cpu(rn->footer.flag) & BIT(type); +} + +#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT) +#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) +#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) + +static inline void set_cold_node(struct page *page, bool is_dir) +{ + struct f2fs_node *rn = F2FS_NODE(page); + unsigned int flag = le32_to_cpu(rn->footer.flag); + + if (is_dir) + flag &= ~BIT(COLD_BIT_SHIFT); + else + flag |= BIT(COLD_BIT_SHIFT); + rn->footer.flag = cpu_to_le32(flag); +} + +static inline void set_mark(struct page *page, int mark, int type) +{ + struct f2fs_node *rn = F2FS_NODE(page); + unsigned int flag = le32_to_cpu(rn->footer.flag); + if (mark) + flag |= BIT(type); + else + flag &= ~BIT(type); + rn->footer.flag = cpu_to_le32(flag); + +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_inode_chksum_set(F2FS_P_SB(page), page); +#endif +} +#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT) +#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c new file mode 100644 index 000000000..dea95b48b --- /dev/null +++ b/fs/f2fs/recovery.c @@ -0,0 +1,934 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/recovery.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + */ +#include <asm/unaligned.h> +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/sched/mm.h> +#include "f2fs.h" +#include "node.h" +#include "segment.h" + +/* + * Roll forward recovery scenarios. + * + * [Term] F: fsync_mark, D: dentry_mark + * + * 1. inode(x) | CP | inode(x) | dnode(F) + * -> Update the latest inode(x). + * + * 2. inode(x) | CP | inode(F) | dnode(F) + * -> No problem. + * + * 3. inode(x) | CP | dnode(F) | inode(x) + * -> Recover to the latest dnode(F), and drop the last inode(x) + * + * 4. inode(x) | CP | dnode(F) | inode(F) + * -> No problem. + * + * 5. CP | inode(x) | dnode(F) + * -> The inode(DF) was missing. Should drop this dnode(F). + * + * 6. CP | inode(DF) | dnode(F) + * -> No problem. + * + * 7. CP | dnode(F) | inode(DF) + * -> If f2fs_iget fails, then goto next to find inode(DF). + * + * 8. CP | dnode(F) | inode(x) + * -> If f2fs_iget fails, then goto next to find inode(DF). + * But it will fail due to no inode(DF). + */ + +static struct kmem_cache *fsync_entry_slab; + +#if IS_ENABLED(CONFIG_UNICODE) +extern struct kmem_cache *f2fs_cf_name_slab; +#endif + +bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi) +{ + s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); + + if (sbi->last_valid_block_count + nalloc > sbi->user_block_count) + return false; + if (NM_I(sbi)->max_rf_node_blocks && + percpu_counter_sum_positive(&sbi->rf_node_block_count) >= + NM_I(sbi)->max_rf_node_blocks) + return false; + return true; +} + +static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, + nid_t ino) +{ + struct fsync_inode_entry *entry; + + list_for_each_entry(entry, head, list) + if (entry->inode->i_ino == ino) + return entry; + + return NULL; +} + +static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, + struct list_head *head, nid_t ino, bool quota_inode) +{ + struct inode *inode; + struct fsync_inode_entry *entry; + int err; + + inode = f2fs_iget_retry(sbi->sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + err = f2fs_dquot_initialize(inode); + if (err) + goto err_out; + + if (quota_inode) { + err = dquot_alloc_inode(inode); + if (err) + goto err_out; + } + + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, + GFP_F2FS_ZERO, true, NULL); + entry->inode = inode; + list_add_tail(&entry->list, head); + + return entry; +err_out: + iput(inode); + return ERR_PTR(err); +} + +static void del_fsync_inode(struct fsync_inode_entry *entry, int drop) +{ + if (drop) { + /* inode should not be recovered, drop it */ + f2fs_inode_synced(entry->inode); + } + iput(entry->inode); + list_del(&entry->list); + kmem_cache_free(fsync_entry_slab, entry); +} + +static int init_recovered_filename(const struct inode *dir, + struct f2fs_inode *raw_inode, + struct f2fs_filename *fname, + struct qstr *usr_fname) +{ + int err; + + memset(fname, 0, sizeof(*fname)); + fname->disk_name.len = le32_to_cpu(raw_inode->i_namelen); + fname->disk_name.name = raw_inode->i_name; + + if (WARN_ON(fname->disk_name.len > F2FS_NAME_LEN)) + return -ENAMETOOLONG; + + if (!IS_ENCRYPTED(dir)) { + usr_fname->name = fname->disk_name.name; + usr_fname->len = fname->disk_name.len; + fname->usr_fname = usr_fname; + } + + /* Compute the hash of the filename */ + if (IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir)) { + /* + * In this case the hash isn't computable without the key, so it + * was saved on-disk. + */ + if (fname->disk_name.len + sizeof(f2fs_hash_t) > F2FS_NAME_LEN) + return -EINVAL; + fname->hash = get_unaligned((f2fs_hash_t *) + &raw_inode->i_name[fname->disk_name.len]); + } else if (IS_CASEFOLDED(dir)) { + err = f2fs_init_casefolded_name(dir, fname); + if (err) + return err; + f2fs_hash_filename(dir, fname); +#if IS_ENABLED(CONFIG_UNICODE) + /* Case-sensitive match is fine for recovery */ + kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); + fname->cf_name.name = NULL; +#endif + } else { + f2fs_hash_filename(dir, fname); + } + return 0; +} + +static int recover_dentry(struct inode *inode, struct page *ipage, + struct list_head *dir_list) +{ + struct f2fs_inode *raw_inode = F2FS_INODE(ipage); + nid_t pino = le32_to_cpu(raw_inode->i_pino); + struct f2fs_dir_entry *de; + struct f2fs_filename fname; + struct qstr usr_fname; + struct page *page; + struct inode *dir, *einode; + struct fsync_inode_entry *entry; + int err = 0; + char *name; + + entry = get_fsync_inode(dir_list, pino); + if (!entry) { + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, + pino, false); + if (IS_ERR(entry)) { + dir = ERR_CAST(entry); + err = PTR_ERR(entry); + goto out; + } + } + + dir = entry->inode; + err = init_recovered_filename(dir, raw_inode, &fname, &usr_fname); + if (err) + goto out; +retry: + de = __f2fs_find_entry(dir, &fname, &page); + if (de && inode->i_ino == le32_to_cpu(de->ino)) + goto out_put; + + if (de) { + einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino)); + if (IS_ERR(einode)) { + WARN_ON(1); + err = PTR_ERR(einode); + if (err == -ENOENT) + err = -EEXIST; + goto out_put; + } + + err = f2fs_dquot_initialize(einode); + if (err) { + iput(einode); + goto out_put; + } + + err = f2fs_acquire_orphan_inode(F2FS_I_SB(inode)); + if (err) { + iput(einode); + goto out_put; + } + f2fs_delete_entry(de, page, dir, einode); + iput(einode); + goto retry; + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + } else { + err = f2fs_add_dentry(dir, &fname, inode, + inode->i_ino, inode->i_mode); + } + if (err == -ENOMEM) + goto retry; + goto out; + +out_put: + f2fs_put_page(page, 0); +out: + if (file_enc_name(inode)) + name = "<encrypted>"; + else + name = raw_inode->i_name; + f2fs_notice(F2FS_I_SB(inode), "%s: ino = %x, name = %s, dir = %lx, err = %d", + __func__, ino_of_node(ipage), name, + IS_ERR(dir) ? 0 : dir->i_ino, err); + return err; +} + +static int recover_quota_data(struct inode *inode, struct page *page) +{ + struct f2fs_inode *raw = F2FS_INODE(page); + struct iattr attr; + uid_t i_uid = le32_to_cpu(raw->i_uid); + gid_t i_gid = le32_to_cpu(raw->i_gid); + int err; + + memset(&attr, 0, sizeof(attr)); + + attr.ia_vfsuid = VFSUIDT_INIT(make_kuid(inode->i_sb->s_user_ns, i_uid)); + attr.ia_vfsgid = VFSGIDT_INIT(make_kgid(inode->i_sb->s_user_ns, i_gid)); + + if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&init_user_ns, inode))) + attr.ia_valid |= ATTR_UID; + if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&init_user_ns, inode))) + attr.ia_valid |= ATTR_GID; + + if (!attr.ia_valid) + return 0; + + err = dquot_transfer(&init_user_ns, inode, &attr); + if (err) + set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); + return err; +} + +static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri) +{ + if (ri->i_inline & F2FS_PIN_FILE) + set_inode_flag(inode, FI_PIN_FILE); + else + clear_inode_flag(inode, FI_PIN_FILE); + if (ri->i_inline & F2FS_DATA_EXIST) + set_inode_flag(inode, FI_DATA_EXIST); + else + clear_inode_flag(inode, FI_DATA_EXIST); +} + +static int recover_inode(struct inode *inode, struct page *page) +{ + struct f2fs_inode *raw = F2FS_INODE(page); + char *name; + int err; + + inode->i_mode = le16_to_cpu(raw->i_mode); + + err = recover_quota_data(inode, page); + if (err) + return err; + + i_uid_write(inode, le32_to_cpu(raw->i_uid)); + i_gid_write(inode, le32_to_cpu(raw->i_gid)); + + if (raw->i_inline & F2FS_EXTRA_ATTR) { + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)) && + F2FS_FITS_IN_INODE(raw, le16_to_cpu(raw->i_extra_isize), + i_projid)) { + projid_t i_projid; + kprojid_t kprojid; + + i_projid = (projid_t)le32_to_cpu(raw->i_projid); + kprojid = make_kprojid(&init_user_ns, i_projid); + + if (!projid_eq(kprojid, F2FS_I(inode)->i_projid)) { + err = f2fs_transfer_project_quota(inode, + kprojid); + if (err) + return err; + F2FS_I(inode)->i_projid = kprojid; + } + } + } + + f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); + inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); + inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); + inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + + F2FS_I(inode)->i_advise = raw->i_advise; + F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags); + f2fs_set_inode_flags(inode); + F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = + le16_to_cpu(raw->i_gc_failures); + + recover_inline_flags(inode, raw); + + f2fs_mark_inode_dirty_sync(inode, true); + + if (file_enc_name(inode)) + name = "<encrypted>"; + else + name = F2FS_INODE(page)->i_name; + + f2fs_notice(F2FS_I_SB(inode), "recover_inode: ino = %x, name = %s, inline = %x", + ino_of_node(page), name, raw->i_inline); + return 0; +} + +static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi, + unsigned int ra_blocks, unsigned int blkaddr, + unsigned int next_blkaddr) +{ + if (blkaddr + 1 == next_blkaddr) + ra_blocks = min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS, + ra_blocks * 2); + else if (next_blkaddr % sbi->blocks_per_seg) + ra_blocks = max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS, + ra_blocks / 2); + return ra_blocks; +} + +static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, + bool check_only) +{ + struct curseg_info *curseg; + struct page *page = NULL; + block_t blkaddr; + unsigned int loop_cnt = 0; + unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; + unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg - + valid_user_blocks(sbi); + int err = 0; + + /* get node pages in the current segment */ + curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + + while (1) { + struct fsync_inode_entry *entry; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) + return 0; + + page = f2fs_get_tmp_page(sbi, blkaddr); + if (IS_ERR(page)) { + err = PTR_ERR(page); + break; + } + + if (!is_recoverable_dnode(page)) { + f2fs_put_page(page, 1); + break; + } + + if (!is_fsync_dnode(page)) + goto next; + + entry = get_fsync_inode(head, ino_of_node(page)); + if (!entry) { + bool quota_inode = false; + + if (!check_only && + IS_INODE(page) && is_dent_dnode(page)) { + err = f2fs_recover_inode_page(sbi, page); + if (err) { + f2fs_put_page(page, 1); + break; + } + quota_inode = true; + } + + /* + * CP | dnode(F) | inode(DF) + * For this case, we should not give up now. + */ + entry = add_fsync_inode(sbi, head, ino_of_node(page), + quota_inode); + if (IS_ERR(entry)) { + err = PTR_ERR(entry); + if (err == -ENOENT) { + err = 0; + goto next; + } + f2fs_put_page(page, 1); + break; + } + } + entry->blkaddr = blkaddr; + + if (IS_INODE(page) && is_dent_dnode(page)) + entry->last_dentry = blkaddr; +next: + /* sanity check in order to detect looped node chain */ + if (++loop_cnt >= free_blocks || + blkaddr == next_blkaddr_of_node(page)) { + f2fs_notice(sbi, "%s: detect looped node chain, blkaddr:%u, next:%u", + __func__, blkaddr, + next_blkaddr_of_node(page)); + f2fs_put_page(page, 1); + err = -EINVAL; + break; + } + + ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr, + next_blkaddr_of_node(page)); + + /* check next segment */ + blkaddr = next_blkaddr_of_node(page); + f2fs_put_page(page, 1); + + f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); + } + return err; +} + +static void destroy_fsync_dnodes(struct list_head *head, int drop) +{ + struct fsync_inode_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, head, list) + del_fsync_inode(entry, drop); +} + +static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, + block_t blkaddr, struct dnode_of_data *dn) +{ + struct seg_entry *sentry; + unsigned int segno = GET_SEGNO(sbi, blkaddr); + unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + struct f2fs_summary_block *sum_node; + struct f2fs_summary sum; + struct page *sum_page, *node_page; + struct dnode_of_data tdn = *dn; + nid_t ino, nid; + struct inode *inode; + unsigned int offset, ofs_in_node, max_addrs; + block_t bidx; + int i; + + sentry = get_seg_entry(sbi, segno); + if (!f2fs_test_bit(blkoff, sentry->cur_valid_map)) + return 0; + + /* Get the previous summary */ + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + struct curseg_info *curseg = CURSEG_I(sbi, i); + + if (curseg->segno == segno) { + sum = curseg->sum_blk->entries[blkoff]; + goto got_it; + } + } + + sum_page = f2fs_get_sum_page(sbi, segno); + if (IS_ERR(sum_page)) + return PTR_ERR(sum_page); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + sum = sum_node->entries[blkoff]; + f2fs_put_page(sum_page, 1); +got_it: + /* Use the locked dnode page and inode */ + nid = le32_to_cpu(sum.nid); + ofs_in_node = le16_to_cpu(sum.ofs_in_node); + + max_addrs = ADDRS_PER_PAGE(dn->node_page, dn->inode); + if (ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", + ofs_in_node, dn->inode->i_ino, nid, max_addrs); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUMMARY); + return -EFSCORRUPTED; + } + + if (dn->inode->i_ino == nid) { + tdn.nid = nid; + if (!dn->inode_page_locked) + lock_page(dn->inode_page); + tdn.node_page = dn->inode_page; + tdn.ofs_in_node = ofs_in_node; + goto truncate_out; + } else if (dn->nid == nid) { + tdn.ofs_in_node = ofs_in_node; + goto truncate_out; + } + + /* Get the node page */ + node_page = f2fs_get_node_page(sbi, nid); + if (IS_ERR(node_page)) + return PTR_ERR(node_page); + + offset = ofs_of_node(node_page); + ino = ino_of_node(node_page); + f2fs_put_page(node_page, 1); + + if (ino != dn->inode->i_ino) { + int ret; + + /* Deallocate previous index in the node page */ + inode = f2fs_iget_retry(sbi->sb, ino); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + ret = f2fs_dquot_initialize(inode); + if (ret) { + iput(inode); + return ret; + } + } else { + inode = dn->inode; + } + + bidx = f2fs_start_bidx_of_node(offset, inode) + + le16_to_cpu(sum.ofs_in_node); + + /* + * if inode page is locked, unlock temporarily, but its reference + * count keeps alive. + */ + if (ino == dn->inode->i_ino && dn->inode_page_locked) + unlock_page(dn->inode_page); + + set_new_dnode(&tdn, inode, NULL, NULL, 0); + if (f2fs_get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) + goto out; + + if (tdn.data_blkaddr == blkaddr) + f2fs_truncate_data_blocks_range(&tdn, 1); + + f2fs_put_dnode(&tdn); +out: + if (ino != dn->inode->i_ino) + iput(inode); + else if (dn->inode_page_locked) + lock_page(dn->inode_page); + return 0; + +truncate_out: + if (f2fs_data_blkaddr(&tdn) == blkaddr) + f2fs_truncate_data_blocks_range(&tdn, 1); + if (dn->inode->i_ino == nid && !dn->inode_page_locked) + unlock_page(dn->inode_page); + return 0; +} + +static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, + struct page *page) +{ + struct dnode_of_data dn; + struct node_info ni; + unsigned int start, end; + int err = 0, recovered = 0; + + /* step 1: recover xattr */ + if (IS_INODE(page)) { + err = f2fs_recover_inline_xattr(inode, page); + if (err) + goto out; + } else if (f2fs_has_xattr_block(ofs_of_node(page))) { + err = f2fs_recover_xattr_data(inode, page); + if (!err) + recovered++; + goto out; + } + + /* step 2: recover inline data */ + err = f2fs_recover_inline_data(inode, page); + if (err) { + if (err == 1) + err = 0; + goto out; + } + + /* step 3: recover data indices */ + start = f2fs_start_bidx_of_node(ofs_of_node(page), inode); + end = start + ADDRS_PER_PAGE(page, inode); + + set_new_dnode(&dn, inode, NULL, NULL, 0); +retry_dn: + err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE); + if (err) { + if (err == -ENOMEM) { + memalloc_retry_wait(GFP_NOFS); + goto retry_dn; + } + goto out; + } + + f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); + + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); + if (err) + goto err; + + f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); + + if (ofs_of_node(dn.node_page) != ofs_of_node(page)) { + f2fs_warn(sbi, "Inconsistent ofs_of_node, ino:%lu, ofs:%u, %u", + inode->i_ino, ofs_of_node(dn.node_page), + ofs_of_node(page)); + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); + goto err; + } + + for (; start < end; start++, dn.ofs_in_node++) { + block_t src, dest; + + src = f2fs_data_blkaddr(&dn); + dest = data_blkaddr(dn.inode, page, dn.ofs_in_node); + + if (__is_valid_data_blkaddr(src) && + !f2fs_is_valid_blkaddr(sbi, src, META_POR)) { + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + goto err; + } + + if (__is_valid_data_blkaddr(dest) && + !f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + goto err; + } + + /* skip recovering if dest is the same as src */ + if (src == dest) + continue; + + /* dest is invalid, just invalidate src block */ + if (dest == NULL_ADDR) { + f2fs_truncate_data_blocks_range(&dn, 1); + continue; + } + + if (!file_keep_isize(inode) && + (i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT))) + f2fs_i_size_write(inode, + (loff_t)(start + 1) << PAGE_SHIFT); + + /* + * dest is reserved block, invalidate src block + * and then reserve one new block in dnode page. + */ + if (dest == NEW_ADDR) { + f2fs_truncate_data_blocks_range(&dn, 1); + f2fs_reserve_new_block(&dn); + continue; + } + + /* dest is valid block, try to recover from src to dest */ + if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { + + if (src == NULL_ADDR) { + err = f2fs_reserve_new_block(&dn); + while (err && + IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) + err = f2fs_reserve_new_block(&dn); + /* We should not get -ENOSPC */ + f2fs_bug_on(sbi, err); + if (err) + goto err; + } +retry_prev: + /* Check the previous node page having this index */ + err = check_index_in_prev_nodes(sbi, dest, &dn); + if (err) { + if (err == -ENOMEM) { + memalloc_retry_wait(GFP_NOFS); + goto retry_prev; + } + goto err; + } + + if (f2fs_is_valid_blkaddr(sbi, dest, + DATA_GENERIC_ENHANCE_UPDATE)) { + f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u", + dest, inode->i_ino, dn.ofs_in_node); + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); + goto err; + } + + /* write dummy data page */ + f2fs_replace_block(sbi, &dn, src, dest, + ni.version, false, false); + recovered++; + } + } + + copy_node_footer(dn.node_page, page); + fill_node_footer(dn.node_page, dn.nid, ni.ino, + ofs_of_node(page), false); + set_page_dirty(dn.node_page); +err: + f2fs_put_dnode(&dn); +out: + f2fs_notice(sbi, "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d", + inode->i_ino, file_keep_isize(inode) ? "keep" : "recover", + recovered, err); + return err; +} + +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, + struct list_head *tmp_inode_list, struct list_head *dir_list) +{ + struct curseg_info *curseg; + struct page *page = NULL; + int err = 0; + block_t blkaddr; + unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; + + /* get node pages in the current segment */ + curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + + while (1) { + struct fsync_inode_entry *entry; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) + break; + + page = f2fs_get_tmp_page(sbi, blkaddr); + if (IS_ERR(page)) { + err = PTR_ERR(page); + break; + } + + if (!is_recoverable_dnode(page)) { + f2fs_put_page(page, 1); + break; + } + + entry = get_fsync_inode(inode_list, ino_of_node(page)); + if (!entry) + goto next; + /* + * inode(x) | CP | inode(x) | dnode(F) + * In this case, we can lose the latest inode(x). + * So, call recover_inode for the inode update. + */ + if (IS_INODE(page)) { + err = recover_inode(entry->inode, page); + if (err) { + f2fs_put_page(page, 1); + break; + } + } + if (entry->last_dentry == blkaddr) { + err = recover_dentry(entry->inode, page, dir_list); + if (err) { + f2fs_put_page(page, 1); + break; + } + } + err = do_recover_data(sbi, entry->inode, page); + if (err) { + f2fs_put_page(page, 1); + break; + } + + if (entry->blkaddr == blkaddr) + list_move_tail(&entry->list, tmp_inode_list); +next: + ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr, + next_blkaddr_of_node(page)); + + /* check next segment */ + blkaddr = next_blkaddr_of_node(page); + f2fs_put_page(page, 1); + + f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); + } + if (!err) + f2fs_allocate_new_segments(sbi); + return err; +} + +int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) +{ + struct list_head inode_list, tmp_inode_list; + struct list_head dir_list; + int err; + int ret = 0; + unsigned long s_flags = sbi->sb->s_flags; + bool need_writecp = false; + bool fix_curseg_write_pointer = false; +#ifdef CONFIG_QUOTA + int quota_enabled; +#endif + + if (s_flags & SB_RDONLY) { + f2fs_info(sbi, "recover fsync data on readonly fs"); + sbi->sb->s_flags &= ~SB_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Turn on quotas so that they are updated correctly */ + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); +#endif + + INIT_LIST_HEAD(&inode_list); + INIT_LIST_HEAD(&tmp_inode_list); + INIT_LIST_HEAD(&dir_list); + + /* prevent checkpoint */ + f2fs_down_write(&sbi->cp_global_sem); + + /* step #1: find fsynced inode numbers */ + err = find_fsync_dnodes(sbi, &inode_list, check_only); + if (err || list_empty(&inode_list)) + goto skip; + + if (check_only) { + ret = 1; + goto skip; + } + + need_writecp = true; + + /* step #2: recover data */ + err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list); + if (!err) + f2fs_bug_on(sbi, !list_empty(&inode_list)); + else + f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE); +skip: + fix_curseg_write_pointer = !check_only || list_empty(&inode_list); + + destroy_fsync_dnodes(&inode_list, err); + destroy_fsync_dnodes(&tmp_inode_list, err); + + /* truncate meta pages to be used by the recovery */ + truncate_inode_pages_range(META_MAPPING(sbi), + (loff_t)MAIN_BLKADDR(sbi) << PAGE_SHIFT, -1); + + if (err) { + truncate_inode_pages_final(NODE_MAPPING(sbi)); + truncate_inode_pages_final(META_MAPPING(sbi)); + } + + /* + * If fsync data succeeds or there is no fsync data to recover, + * and the f2fs is not read only, check and fix zoned block devices' + * write pointer consistency. + */ + if (!err && fix_curseg_write_pointer && !f2fs_readonly(sbi->sb) && + f2fs_sb_has_blkzoned(sbi)) { + err = f2fs_fix_curseg_write_pointer(sbi); + ret = err; + } + + if (!err) + clear_sbi_flag(sbi, SBI_POR_DOING); + + f2fs_up_write(&sbi->cp_global_sem); + + /* let's drop all the directory inodes for clean checkpoint */ + destroy_fsync_dnodes(&dir_list, err); + + if (need_writecp) { + set_sbi_flag(sbi, SBI_IS_RECOVERED); + + if (!err) { + struct cp_control cpc = { + .reason = CP_RECOVERY, + }; + err = f2fs_write_checkpoint(sbi, &cpc); + } + } + +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ + + return ret ? ret : err; +} + +int __init f2fs_create_recovery_cache(void) +{ + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", + sizeof(struct fsync_inode_entry)); + if (!fsync_entry_slab) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_recovery_cache(void) +{ + kmem_cache_destroy(fsync_entry_slab); +} diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c new file mode 100644 index 000000000..16bf9d5c8 --- /dev/null +++ b/fs/f2fs/segment.c @@ -0,0 +1,5251 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/segment.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/sched/mm.h> +#include <linux/prefetch.h> +#include <linux/kthread.h> +#include <linux/swap.h> +#include <linux/timer.h> +#include <linux/freezer.h> +#include <linux/sched/signal.h> +#include <linux/random.h> + +#include "f2fs.h" +#include "segment.h" +#include "node.h" +#include "gc.h" +#include "iostat.h" +#include <trace/events/f2fs.h> + +#define __reverse_ffz(x) __reverse_ffs(~(x)) + +static struct kmem_cache *discard_entry_slab; +static struct kmem_cache *discard_cmd_slab; +static struct kmem_cache *sit_entry_set_slab; +static struct kmem_cache *revoke_entry_slab; + +static unsigned long __reverse_ulong(unsigned char *str) +{ + unsigned long tmp = 0; + int shift = 24, idx = 0; + +#if BITS_PER_LONG == 64 + shift = 56; +#endif + while (shift >= 0) { + tmp |= (unsigned long)str[idx++] << shift; + shift -= BITS_PER_BYTE; + } + return tmp; +} + +/* + * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since + * MSB and LSB are reversed in a byte by f2fs_set_bit. + */ +static inline unsigned long __reverse_ffs(unsigned long word) +{ + int num = 0; + +#if BITS_PER_LONG == 64 + if ((word & 0xffffffff00000000UL) == 0) + num += 32; + else + word >>= 32; +#endif + if ((word & 0xffff0000) == 0) + num += 16; + else + word >>= 16; + + if ((word & 0xff00) == 0) + num += 8; + else + word >>= 8; + + if ((word & 0xf0) == 0) + num += 4; + else + word >>= 4; + + if ((word & 0xc) == 0) + num += 2; + else + word >>= 2; + + if ((word & 0x2) == 0) + num += 1; + return num; +} + +/* + * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because + * f2fs_set_bit makes MSB and LSB reversed in a byte. + * @size must be integral times of unsigned long. + * Example: + * MSB <--> LSB + * f2fs_set_bit(0, bitmap) => 1000 0000 + * f2fs_set_bit(7, bitmap) => 0000 0001 + */ +static unsigned long __find_rev_next_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = size; + unsigned long tmp; + + if (offset >= size) + return size; + + size -= (offset & ~(BITS_PER_LONG - 1)); + offset %= BITS_PER_LONG; + + while (1) { + if (*p == 0) + goto pass; + + tmp = __reverse_ulong((unsigned char *)p); + + tmp &= ~0UL >> offset; + if (size < BITS_PER_LONG) + tmp &= (~0UL << (BITS_PER_LONG - size)); + if (tmp) + goto found; +pass: + if (size <= BITS_PER_LONG) + break; + size -= BITS_PER_LONG; + offset = 0; + p++; + } + return result; +found: + return result - size + __reverse_ffs(tmp); +} + +static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = size; + unsigned long tmp; + + if (offset >= size) + return size; + + size -= (offset & ~(BITS_PER_LONG - 1)); + offset %= BITS_PER_LONG; + + while (1) { + if (*p == ~0UL) + goto pass; + + tmp = __reverse_ulong((unsigned char *)p); + + if (offset) + tmp |= ~0UL << (BITS_PER_LONG - offset); + if (size < BITS_PER_LONG) + tmp |= ~0UL >> size; + if (tmp != ~0UL) + goto found; +pass: + if (size <= BITS_PER_LONG) + break; + size -= BITS_PER_LONG; + offset = 0; + p++; + } + return result; +found: + return result - size + __reverse_ffz(tmp); +} + +bool f2fs_need_SSR(struct f2fs_sb_info *sbi) +{ + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + + if (f2fs_lfs_mode(sbi)) + return false; + if (sbi->gc_mode == GC_URGENT_HIGH) + return true; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return true; + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + + SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); +} + +void f2fs_abort_atomic_write(struct inode *inode, bool clean) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + if (!f2fs_is_atomic_file(inode)) + return; + + release_atomic_write_cnt(inode); + clear_inode_flag(inode, FI_ATOMIC_COMMITTED); + clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_inode(inode); + + F2FS_I(inode)->atomic_write_task = NULL; + + if (clean) { + truncate_inode_pages_final(inode->i_mapping); + f2fs_i_size_write(inode, fi->original_i_size); + fi->original_i_size = 0; + } + /* avoid stale dirty inode during eviction */ + sync_inode_metadata(inode, 0); +} + +static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, + block_t new_addr, block_t *old_addr, bool recover) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct node_info ni; + int err; + +retry: + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE_RA); + if (err) { + if (err == -ENOMEM) { + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + goto retry; + } + return err; + } + + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + + if (recover) { + /* dn.data_blkaddr is always valid */ + if (!__is_valid_data_blkaddr(new_addr)) { + if (new_addr == NULL_ADDR) + dec_valid_block_count(sbi, inode, 1); + f2fs_invalidate_blocks(sbi, dn.data_blkaddr); + f2fs_update_data_blkaddr(&dn, new_addr); + } else { + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + new_addr, ni.version, true, true); + } + } else { + blkcnt_t count = 1; + + err = inc_valid_block_count(sbi, inode, &count); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + + *old_addr = dn.data_blkaddr; + f2fs_truncate_data_blocks_range(&dn, 1); + dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count); + + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, + ni.version, true, false); + } + + f2fs_put_dnode(&dn); + + trace_f2fs_replace_atomic_write_block(inode, F2FS_I(inode)->cow_inode, + index, old_addr ? *old_addr : 0, new_addr, recover); + return 0; +} + +static void __complete_revoke_list(struct inode *inode, struct list_head *head, + bool revoke) +{ + struct revoke_entry *cur, *tmp; + + list_for_each_entry_safe(cur, tmp, head, list) { + if (revoke) + __replace_atomic_write_block(inode, cur->index, + cur->old_addr, NULL, true); + list_del(&cur->list); + kmem_cache_free(revoke_entry_slab, cur); + } +} + +static int __f2fs_commit_atomic_write(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct inode *cow_inode = fi->cow_inode; + struct revoke_entry *new; + struct list_head revoke_list; + block_t blkaddr; + struct dnode_of_data dn; + pgoff_t len = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + pgoff_t off = 0, blen, index; + int ret = 0, i; + + INIT_LIST_HEAD(&revoke_list); + + while (len) { + blen = min_t(pgoff_t, ADDRS_PER_BLOCK(cow_inode), len); + + set_new_dnode(&dn, cow_inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + ret = 0; + if (dn.max_level == 0) + goto out; + goto next; + } + + blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, cow_inode), + len); + index = off; + for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { + blkaddr = f2fs_data_blkaddr(&dn); + + if (!__is_valid_data_blkaddr(blkaddr)) { + continue; + } else if (!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE)) { + f2fs_put_dnode(&dn); + ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); + goto out; + } + + new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS, + true, NULL); + + ret = __replace_atomic_write_block(inode, index, blkaddr, + &new->old_addr, false); + if (ret) { + f2fs_put_dnode(&dn); + kmem_cache_free(revoke_entry_slab, new); + goto out; + } + + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + new->index = index; + list_add_tail(&new->list, &revoke_list); + } + f2fs_put_dnode(&dn); +next: + off += blen; + len -= blen; + } + +out: + if (ret) { + sbi->revoked_atomic_block += fi->atomic_write_cnt; + } else { + sbi->committed_atomic_block += fi->atomic_write_cnt; + set_inode_flag(inode, FI_ATOMIC_COMMITTED); + } + + __complete_revoke_list(inode, &revoke_list, ret ? true : false); + + return ret; +} + +int f2fs_commit_atomic_write(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + int err; + + err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (err) + return err; + + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); + f2fs_lock_op(sbi); + + err = __f2fs_commit_atomic_write(inode); + + f2fs_unlock_op(sbi); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + + return err; +} + +/* + * This function balances dirty node and dentry pages. + * In addition, it controls garbage collection. + */ +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) +{ + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); + } + + /* balance_fs_bg is able to be pending */ + if (need && excess_cached_nats(sbi)) + f2fs_balance_fs_bg(sbi, false); + + if (!f2fs_is_checkpoint_ready(sbi)) + return; + + /* + * We should do GC or end up with checkpoint, if there are so many dirty + * dir/node pages without enough free segments. + */ + if (has_not_enough_free_secs(sbi, 0, 0)) { + if (test_opt(sbi, GC_MERGE) && sbi->gc_thread && + sbi->gc_thread->f2fs_gc_task) { + DEFINE_WAIT(wait); + + prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait, + TASK_UNINTERRUPTIBLE); + wake_up(&sbi->gc_thread->gc_wait_queue_head); + io_schedule(); + finish_wait(&sbi->gc_thread->fggc_wq, &wait); + } else { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = BG_GC, + .no_bg_gc = true, + .should_migrate_blocks = false, + .err_gc_skipped = false, + .nr_free_secs = 1 }; + f2fs_down_write(&sbi->gc_lock); + f2fs_gc(sbi, &gc_control); + } + } +} + +static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi) +{ + int factor = f2fs_rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2; + unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA); + unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES); + unsigned int meta = get_pages(sbi, F2FS_DIRTY_META); + unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA); + unsigned int threshold = sbi->blocks_per_seg * factor * + DEFAULT_DIRTY_THRESHOLD; + unsigned int global_threshold = threshold * 3 / 2; + + if (dents >= threshold || qdata >= threshold || + nodes >= threshold || meta >= threshold || + imeta >= threshold) + return true; + return dents + qdata + nodes + meta + imeta > global_threshold; +} + +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) +{ + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return; + + /* try to shrink extent cache when there is no enough memory */ + if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) + f2fs_shrink_read_extent_tree(sbi, + READ_EXTENT_CACHE_SHRINK_NUMBER); + + /* check the # of cached NAT entries */ + if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) + f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); + + if (!f2fs_available_free_memory(sbi, FREE_NIDS)) + f2fs_try_to_free_nids(sbi, MAX_FREE_NIDS); + else + f2fs_build_free_nids(sbi, false, false); + + if (excess_dirty_nats(sbi) || excess_dirty_threshold(sbi) || + excess_prefree_segs(sbi) || !f2fs_space_for_roll_forward(sbi)) + goto do_sync; + + /* there is background inflight IO or foreground operation recently */ + if (is_inflight_io(sbi, REQ_TIME) || + (!f2fs_time_over(sbi, REQ_TIME) && f2fs_rwsem_is_locked(&sbi->cp_rwsem))) + return; + + /* exceed periodical checkpoint timeout threshold */ + if (f2fs_time_over(sbi, CP_TIME)) + goto do_sync; + + /* checkpoint is the only way to shrink partial cached entries */ + if (f2fs_available_free_memory(sbi, NAT_ENTRIES) && + f2fs_available_free_memory(sbi, INO_ENTRIES)) + return; + +do_sync: + if (test_opt(sbi, DATA_FLUSH) && from_bg) { + struct blk_plug plug; + + mutex_lock(&sbi->flush_lock); + + blk_start_plug(&plug); + f2fs_sync_dirty_inodes(sbi, FILE_INODE, false); + blk_finish_plug(&plug); + + mutex_unlock(&sbi->flush_lock); + } + f2fs_sync_fs(sbi->sb, 1); + stat_inc_bg_cp_count(sbi->stat_info); +} + +static int __submit_flush_wait(struct f2fs_sb_info *sbi, + struct block_device *bdev) +{ + int ret = blkdev_issue_flush(bdev); + + trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE), ret); + return ret; +} + +static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino) +{ + int ret = 0; + int i; + + if (!f2fs_is_multi_device(sbi)) + return __submit_flush_wait(sbi, sbi->sb->s_bdev); + + for (i = 0; i < sbi->s_ndevs; i++) { + if (!f2fs_is_dirty_device(sbi, ino, i, FLUSH_INO)) + continue; + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; + } + return ret; +} + +static int issue_flush_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; + wait_queue_head_t *q = &fcc->flush_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + if (!llist_empty(&fcc->issue_list)) { + struct flush_cmd *cmd, *next; + int ret; + + fcc->dispatch_list = llist_del_all(&fcc->issue_list); + fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); + + cmd = llist_entry(fcc->dispatch_list, struct flush_cmd, llnode); + + ret = submit_flush_wait(sbi, cmd->ino); + atomic_inc(&fcc->issued_flush); + + llist_for_each_entry_safe(cmd, next, + fcc->dispatch_list, llnode) { + cmd->ret = ret; + complete(&cmd->wait); + } + fcc->dispatch_list = NULL; + } + + wait_event_interruptible(*q, + kthread_should_stop() || !llist_empty(&fcc->issue_list)); + goto repeat; +} + +int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; + struct flush_cmd cmd; + int ret; + + if (test_opt(sbi, NOBARRIER)) + return 0; + + if (!test_opt(sbi, FLUSH_MERGE)) { + atomic_inc(&fcc->queued_flush); + ret = submit_flush_wait(sbi, ino); + atomic_dec(&fcc->queued_flush); + atomic_inc(&fcc->issued_flush); + return ret; + } + + if (atomic_inc_return(&fcc->queued_flush) == 1 || + f2fs_is_multi_device(sbi)) { + ret = submit_flush_wait(sbi, ino); + atomic_dec(&fcc->queued_flush); + + atomic_inc(&fcc->issued_flush); + return ret; + } + + cmd.ino = ino; + init_completion(&cmd.wait); + + llist_add(&cmd.llnode, &fcc->issue_list); + + /* + * update issue_list before we wake up issue_flush thread, this + * smp_mb() pairs with another barrier in ___wait_event(), see + * more details in comments of waitqueue_active(). + */ + smp_mb(); + + if (waitqueue_active(&fcc->flush_wait_queue)) + wake_up(&fcc->flush_wait_queue); + + if (fcc->f2fs_issue_flush) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->queued_flush); + } else { + struct llist_node *list; + + list = llist_del_all(&fcc->issue_list); + if (!list) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->queued_flush); + } else { + struct flush_cmd *tmp, *next; + + ret = submit_flush_wait(sbi, ino); + + llist_for_each_entry_safe(tmp, next, list, llnode) { + if (tmp == &cmd) { + cmd.ret = ret; + atomic_dec(&fcc->queued_flush); + continue; + } + tmp->ret = ret; + complete(&tmp->wait); + } + } + } + + return cmd.ret; +} + +int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct flush_cmd_control *fcc; + int err = 0; + + if (SM_I(sbi)->fcc_info) { + fcc = SM_I(sbi)->fcc_info; + if (fcc->f2fs_issue_flush) + return err; + goto init_thread; + } + + fcc = f2fs_kzalloc(sbi, sizeof(struct flush_cmd_control), GFP_KERNEL); + if (!fcc) + return -ENOMEM; + atomic_set(&fcc->issued_flush, 0); + atomic_set(&fcc->queued_flush, 0); + init_waitqueue_head(&fcc->flush_wait_queue); + init_llist_head(&fcc->issue_list); + SM_I(sbi)->fcc_info = fcc; + if (!test_opt(sbi, FLUSH_MERGE)) + return err; + +init_thread: + fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, + "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(fcc->f2fs_issue_flush)) { + err = PTR_ERR(fcc->f2fs_issue_flush); + kfree(fcc); + SM_I(sbi)->fcc_info = NULL; + return err; + } + + return err; +} + +void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) +{ + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; + + if (fcc && fcc->f2fs_issue_flush) { + struct task_struct *flush_thread = fcc->f2fs_issue_flush; + + fcc->f2fs_issue_flush = NULL; + kthread_stop(flush_thread); + } + if (free) { + kfree(fcc); + SM_I(sbi)->fcc_info = NULL; + } +} + +int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) +{ + int ret = 0, i; + + if (!f2fs_is_multi_device(sbi)) + return 0; + + if (test_opt(sbi, NOBARRIER)) + return 0; + + for (i = 1; i < sbi->s_ndevs; i++) { + int count = DEFAULT_RETRY_IO_COUNT; + + if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) + continue; + + do { + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + } while (ret && --count); + + if (ret) { + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FLUSH_FAIL); + break; + } + + spin_lock(&sbi->dev_lock); + f2fs_clear_bit(i, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } + + return ret; +} + +static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, + enum dirty_type dirty_type) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + + /* need not be added */ + if (IS_CURSEG(sbi, segno)) + return; + + if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) + dirty_i->nr_dirty[dirty_type]++; + + if (dirty_type == DIRTY) { + struct seg_entry *sentry = get_seg_entry(sbi, segno); + enum dirty_type t = sentry->type; + + if (unlikely(t >= DIRTY)) { + f2fs_bug_on(sbi, 1); + return; + } + if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]++; + + if (__is_large_section(sbi)) { + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + block_t valid_blocks = + get_valid_blocks(sbi, segno, true); + + f2fs_bug_on(sbi, unlikely(!valid_blocks || + valid_blocks == CAP_BLKS_PER_SEC(sbi))); + + if (!IS_CURSEC(sbi, secno)) + set_bit(secno, dirty_i->dirty_secmap); + } + } +} + +static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, + enum dirty_type dirty_type) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + block_t valid_blocks; + + if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type])) + dirty_i->nr_dirty[dirty_type]--; + + if (dirty_type == DIRTY) { + struct seg_entry *sentry = get_seg_entry(sbi, segno); + enum dirty_type t = sentry->type; + + if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]--; + + valid_blocks = get_valid_blocks(sbi, segno, true); + if (valid_blocks == 0) { + clear_bit(GET_SEC_FROM_SEG(sbi, segno), + dirty_i->victim_secmap); +#ifdef CONFIG_F2FS_CHECK_FS + clear_bit(segno, SIT_I(sbi)->invalid_segmap); +#endif + } + if (__is_large_section(sbi)) { + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!valid_blocks || + valid_blocks == CAP_BLKS_PER_SEC(sbi)) { + clear_bit(secno, dirty_i->dirty_secmap); + return; + } + + if (!IS_CURSEC(sbi, secno)) + set_bit(secno, dirty_i->dirty_secmap); + } + } +} + +/* + * Should not occur error such as -ENOMEM. + * Adding dirty entry into seglist is not critical operation. + * If a given segment is one of current working segments, it won't be added. + */ +static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned short valid_blocks, ckpt_valid_blocks; + unsigned int usable_blocks; + + if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno)) + return; + + usable_blocks = f2fs_usable_blks_in_seg(sbi, segno); + mutex_lock(&dirty_i->seglist_lock); + + valid_blocks = get_valid_blocks(sbi, segno, false); + ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno, false); + + if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) || + ckpt_valid_blocks == usable_blocks)) { + __locate_dirty_segment(sbi, segno, PRE); + __remove_dirty_segment(sbi, segno, DIRTY); + } else if (valid_blocks < usable_blocks) { + __locate_dirty_segment(sbi, segno, DIRTY); + } else { + /* Recovery routine with SSR needs this */ + __remove_dirty_segment(sbi, segno, DIRTY); + } + + mutex_unlock(&dirty_i->seglist_lock); +} + +/* This moves currently empty dirty blocks to prefree. Must hold seglist_lock */ +void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int segno; + + mutex_lock(&dirty_i->seglist_lock); + for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { + if (get_valid_blocks(sbi, segno, false)) + continue; + if (IS_CURSEG(sbi, segno)) + continue; + __locate_dirty_segment(sbi, segno, PRE); + __remove_dirty_segment(sbi, segno, DIRTY); + } + mutex_unlock(&dirty_i->seglist_lock); +} + +block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) +{ + int ovp_hole_segs = + (overprovision_segments(sbi) - reserved_segments(sbi)); + block_t ovp_holes = ovp_hole_segs << sbi->log_blocks_per_seg; + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + block_t holes[2] = {0, 0}; /* DATA and NODE */ + block_t unusable; + struct seg_entry *se; + unsigned int segno; + + mutex_lock(&dirty_i->seglist_lock); + for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { + se = get_seg_entry(sbi, segno); + if (IS_NODESEG(se->type)) + holes[NODE] += f2fs_usable_blks_in_seg(sbi, segno) - + se->valid_blocks; + else + holes[DATA] += f2fs_usable_blks_in_seg(sbi, segno) - + se->valid_blocks; + } + mutex_unlock(&dirty_i->seglist_lock); + + unusable = holes[DATA] > holes[NODE] ? holes[DATA] : holes[NODE]; + if (unusable > ovp_holes) + return unusable - ovp_holes; + return 0; +} + +int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable) +{ + int ovp_hole_segs = + (overprovision_segments(sbi) - reserved_segments(sbi)); + if (unusable > F2FS_OPTION(sbi).unusable_cap) + return -EAGAIN; + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) && + dirty_segments(sbi) > ovp_hole_segs) + return -EAGAIN; + return 0; +} + +/* This is only used by SBI_CP_DISABLED */ +static unsigned int get_free_segment(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int segno = 0; + + mutex_lock(&dirty_i->seglist_lock); + for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { + if (get_valid_blocks(sbi, segno, false)) + continue; + if (get_ckpt_valid_blocks(sbi, segno, false)) + continue; + mutex_unlock(&dirty_i->seglist_lock); + return segno; + } + mutex_unlock(&dirty_i->seglist_lock); + return NULL_SEGNO; +} + +static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc; + + f2fs_bug_on(sbi, !len); + + pend_list = &dcc->pend_list[plist_idx(len)]; + + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS, true, NULL); + INIT_LIST_HEAD(&dc->list); + dc->bdev = bdev; + dc->lstart = lstart; + dc->start = start; + dc->len = len; + dc->ref = 0; + dc->state = D_PREP; + dc->queued = 0; + dc->error = 0; + init_completion(&dc->wait); + list_add_tail(&dc->list, pend_list); + spin_lock_init(&dc->lock); + dc->bio_ref = 0; + atomic_inc(&dcc->discard_cmd_cnt); + dcc->undiscard_blks += len; + + return dc; +} + +static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node *parent, struct rb_node **p, + bool leftmost) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; + + dc = __create_discard_cmd(sbi, bdev, lstart, start, len); + + rb_link_node(&dc->rb_node, parent, p); + rb_insert_color_cached(&dc->rb_node, &dcc->root, leftmost); + + return dc; +} + +static void __detach_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + if (dc->state == D_DONE) + atomic_sub(dc->queued, &dcc->queued_discard); + + list_del(&dc->list); + rb_erase_cached(&dc->rb_node, &dcc->root); + dcc->undiscard_blks -= dc->len; + + kmem_cache_free(discard_cmd_slab, dc); + + atomic_dec(&dcc->discard_cmd_cnt); +} + +static void __remove_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + unsigned long flags; + + trace_f2fs_remove_discard(dc->bdev, dc->start, dc->len); + + spin_lock_irqsave(&dc->lock, flags); + if (dc->bio_ref) { + spin_unlock_irqrestore(&dc->lock, flags); + return; + } + spin_unlock_irqrestore(&dc->lock, flags); + + f2fs_bug_on(sbi, dc->ref); + + if (dc->error == -EOPNOTSUPP) + dc->error = 0; + + if (dc->error) + printk_ratelimited( + "%sF2FS-fs (%s): Issue discard(%u, %u, %u) failed, ret: %d", + KERN_INFO, sbi->sb->s_id, + dc->lstart, dc->start, dc->len, dc->error); + __detach_discard_cmd(dcc, dc); +} + +static void f2fs_submit_discard_endio(struct bio *bio) +{ + struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; + unsigned long flags; + + spin_lock_irqsave(&dc->lock, flags); + if (!dc->error) + dc->error = blk_status_to_errno(bio->bi_status); + dc->bio_ref--; + if (!dc->bio_ref && dc->state == D_SUBMIT) { + dc->state = D_DONE; + complete_all(&dc->wait); + } + spin_unlock_irqrestore(&dc->lock, flags); + bio_put(bio); +} + +static void __check_sit_bitmap(struct f2fs_sb_info *sbi, + block_t start, block_t end) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct seg_entry *sentry; + unsigned int segno; + block_t blk = start; + unsigned long offset, size, max_blocks = sbi->blocks_per_seg; + unsigned long *map; + + while (blk < end) { + segno = GET_SEGNO(sbi, blk); + sentry = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blk); + + if (end < START_BLOCK(sbi, segno + 1)) + size = GET_BLKOFF_FROM_SEG0(sbi, end); + else + size = max_blocks; + map = (unsigned long *)(sentry->cur_valid_map); + offset = __find_rev_next_bit(map, size, offset); + f2fs_bug_on(sbi, offset != size); + blk = START_BLOCK(sbi, segno + 1); + } +#endif +} + +static void __init_discard_policy(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + int discard_type, unsigned int granularity) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + /* common policy */ + dpolicy->type = discard_type; + dpolicy->sync = true; + dpolicy->ordered = false; + dpolicy->granularity = granularity; + + dpolicy->max_requests = dcc->max_discard_request; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->timeout = false; + + if (discard_type == DPOLICY_BG) { + dpolicy->min_interval = dcc->min_discard_issue_time; + dpolicy->mid_interval = dcc->mid_discard_issue_time; + dpolicy->max_interval = dcc->max_discard_issue_time; + dpolicy->io_aware = true; + dpolicy->sync = false; + dpolicy->ordered = true; + if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { + dpolicy->granularity = 1; + if (atomic_read(&dcc->discard_cmd_cnt)) + dpolicy->max_interval = + dcc->min_discard_issue_time; + } + } else if (discard_type == DPOLICY_FORCE) { + dpolicy->min_interval = dcc->min_discard_issue_time; + dpolicy->mid_interval = dcc->mid_discard_issue_time; + dpolicy->max_interval = dcc->max_discard_issue_time; + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_FSTRIM) { + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_UMOUNT) { + dpolicy->io_aware = false; + /* we need to issue all to keep CP_TRIMMED_FLAG */ + dpolicy->granularity = 1; + dpolicy->timeout = true; + } +} + +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len); +/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ +static int __submit_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + struct discard_cmd *dc, + unsigned int *issued) +{ + struct block_device *bdev = dc->bdev; + unsigned int max_discard_blocks = + SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? + &(dcc->fstrim_list) : &(dcc->wait_list); + blk_opf_t flag = dpolicy->sync ? REQ_SYNC : 0; + block_t lstart, start, len, total_len; + int err = 0; + + if (dc->state != D_PREP) + return 0; + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + return 0; + + trace_f2fs_issue_discard(bdev, dc->start, dc->len); + + lstart = dc->lstart; + start = dc->start; + len = dc->len; + total_len = len; + + dc->len = 0; + + while (total_len && *issued < dpolicy->max_requests && !err) { + struct bio *bio = NULL; + unsigned long flags; + bool last = true; + + if (len > max_discard_blocks) { + len = max_discard_blocks; + last = false; + } + + (*issued)++; + if (*issued == dpolicy->max_requests) + last = true; + + dc->len += len; + + if (time_to_inject(sbi, FAULT_DISCARD)) { + f2fs_show_injection_info(sbi, FAULT_DISCARD); + err = -EIO; + goto submit; + } + err = __blkdev_issue_discard(bdev, + SECTOR_FROM_BLOCK(start), + SECTOR_FROM_BLOCK(len), + GFP_NOFS, &bio); +submit: + if (err) { + spin_lock_irqsave(&dc->lock, flags); + if (dc->state == D_PARTIAL) + dc->state = D_SUBMIT; + spin_unlock_irqrestore(&dc->lock, flags); + + break; + } + + f2fs_bug_on(sbi, !bio); + + /* + * should keep before submission to avoid D_DONE + * right away + */ + spin_lock_irqsave(&dc->lock, flags); + if (last) + dc->state = D_SUBMIT; + else + dc->state = D_PARTIAL; + dc->bio_ref++; + spin_unlock_irqrestore(&dc->lock, flags); + + atomic_inc(&dcc->queued_discard); + dc->queued++; + list_move_tail(&dc->list, wait_list); + + /* sanity check on discard range */ + __check_sit_bitmap(sbi, lstart, lstart + len); + + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + bio->bi_opf |= flag; + submit_bio(bio); + + atomic_inc(&dcc->issued_discard); + + f2fs_update_iostat(sbi, NULL, FS_DISCARD, len * F2FS_BLKSIZE); + + lstart += len; + start += len; + total_len -= len; + len = total_len; + } + + if (!err && len) { + dcc->undiscard_blks -= len; + __update_discard_tree_range(sbi, bdev, lstart, start, len); + } + return err; +} + +static void __insert_discard_tree(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node **insert_p, + struct rb_node *insert_parent) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node **p; + struct rb_node *parent = NULL; + bool leftmost = true; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; + } + + p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, + lstart, &leftmost); +do_insert: + __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, + p, leftmost); +} + +static void __relocate_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]); +} + +static void __punch_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_info di = dc->di; + bool modified = false; + + if (dc->state == D_DONE || dc->len == 1) { + __remove_discard_cmd(sbi, dc); + return; + } + + dcc->undiscard_blks -= di.len; + + if (blkaddr > di.lstart) { + dc->len = blkaddr - dc->lstart; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + modified = true; + } + + if (blkaddr < di.lstart + di.len - 1) { + if (modified) { + __insert_discard_tree(sbi, dc->bdev, blkaddr + 1, + di.start + blkaddr + 1 - di.lstart, + di.lstart + di.len - 1 - blkaddr, + NULL, NULL); + } else { + dc->lstart++; + dc->len--; + dc->start++; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + } + } +} + +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct discard_cmd *dc; + struct discard_info di = {0}; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + unsigned int max_discard_blocks = + SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); + block_t end = lstart + len; + + dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, + NULL, lstart, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true, NULL); + if (dc) + prev_dc = dc; + + if (!prev_dc) { + di.lstart = lstart; + di.len = next_dc ? next_dc->lstart - lstart : len; + di.len = min(di.len, len); + di.start = start; + } + + while (1) { + struct rb_node *node; + bool merged = false; + struct discard_cmd *tdc = NULL; + + if (prev_dc) { + di.lstart = prev_dc->lstart + prev_dc->len; + if (di.lstart < lstart) + di.lstart = lstart; + if (di.lstart >= end) + break; + + if (!next_dc || next_dc->lstart > end) + di.len = end - di.lstart; + else + di.len = next_dc->lstart - di.lstart; + di.start = start + di.lstart - lstart; + } + + if (!di.len) + goto next; + + if (prev_dc && prev_dc->state == D_PREP && + prev_dc->bdev == bdev && + __is_discard_back_mergeable(&di, &prev_dc->di, + max_discard_blocks)) { + prev_dc->di.len += di.len; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, prev_dc); + di = prev_dc->di; + tdc = prev_dc; + merged = true; + } + + if (next_dc && next_dc->state == D_PREP && + next_dc->bdev == bdev && + __is_discard_front_mergeable(&di, &next_dc->di, + max_discard_blocks)) { + next_dc->di.lstart = di.lstart; + next_dc->di.len += di.len; + next_dc->di.start = di.start; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, next_dc); + if (tdc) + __remove_discard_cmd(sbi, tdc); + merged = true; + } + + if (!merged) { + __insert_discard_tree(sbi, bdev, di.lstart, di.start, + di.len, NULL, NULL); + } + next: + prev_dc = next_dc; + if (!prev_dc) + break; + + node = rb_next(&prev_dc->rb_node); + next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); + } +} + +static int __queue_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + block_t lblkstart = blkstart; + + if (!f2fs_bdev_support_discard(bdev)) + return 0; + + trace_f2fs_queue_discard(bdev, blkstart, blklen); + + if (f2fs_is_multi_device(sbi)) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } + mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); + __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); + mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); + return 0; +} + +static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct discard_cmd *dc; + struct blk_plug plug; + unsigned int pos = dcc->next_pos; + unsigned int issued = 0; + bool io_interrupted = false; + + mutex_lock(&dcc->cmd_lock); + dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, + NULL, pos, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true, NULL); + if (!dc) + dc = next_dc; + + blk_start_plug(&plug); + + while (dc) { + struct rb_node *node; + int err = 0; + + if (dc->state != D_PREP) + goto next; + + if (dpolicy->io_aware && !is_idle(sbi, DISCARD_TIME)) { + io_interrupted = true; + break; + } + + dcc->next_pos = dc->lstart + dc->len; + err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); + + if (issued >= dpolicy->max_requests) + break; +next: + node = rb_next(&dc->rb_node); + if (err) + __remove_discard_cmd(sbi, dc); + dc = rb_entry_safe(node, struct discard_cmd, rb_node); + } + + blk_finish_plug(&plug); + + if (!dc) + dcc->next_pos = 0; + + mutex_unlock(&dcc->cmd_lock); + + if (!issued && io_interrupted) + issued = -1; + + return issued; +} +static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy); + +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + int i, issued; + bool io_interrupted = false; + + if (dpolicy->timeout) + f2fs_update_time(sbi, UMOUNT_DISCARD_TIMEOUT); + +retry: + issued = 0; + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (dpolicy->timeout && + f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) + break; + + if (i + 1 < dpolicy->granularity) + break; + + if (i + 1 < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + return __issue_discard_cmd_orderly(sbi, dpolicy); + + pend_list = &dcc->pend_list[i]; + + mutex_lock(&dcc->cmd_lock); + if (list_empty(pend_list)) + goto next; + if (unlikely(dcc->rbtree_check)) + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, + &dcc->root)); + blk_start_plug(&plug); + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + + if (dpolicy->timeout && + f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) + break; + + if (dpolicy->io_aware && i < dpolicy->io_aware_gran && + !is_idle(sbi, DISCARD_TIME)) { + io_interrupted = true; + break; + } + + __submit_discard_cmd(sbi, dpolicy, dc, &issued); + + if (issued >= dpolicy->max_requests) + break; + } + blk_finish_plug(&plug); +next: + mutex_unlock(&dcc->cmd_lock); + + if (issued >= dpolicy->max_requests || io_interrupted) + break; + } + + if (dpolicy->type == DPOLICY_UMOUNT && issued) { + __wait_all_discard_cmd(sbi, dpolicy); + goto retry; + } + + if (!issued && io_interrupted) + issued = -1; + + return issued; +} + +static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + int i; + bool dropped = false; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + __remove_discard_cmd(sbi, dc); + dropped = true; + } + } + mutex_unlock(&dcc->cmd_lock); + + return dropped; +} + +void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + __drop_discard_cmd(sbi); +} + +static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + unsigned int len = 0; + + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) { + if (!dc->error) + len = dc->len; + __remove_discard_cmd(sbi, dc); + } + mutex_unlock(&dcc->cmd_lock); + + return len; +} + +static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + block_t start, block_t end) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? + &(dcc->fstrim_list) : &(dcc->wait_list); + struct discard_cmd *dc = NULL, *iter, *tmp; + unsigned int trimmed = 0; + +next: + dc = NULL; + + mutex_lock(&dcc->cmd_lock); + list_for_each_entry_safe(iter, tmp, wait_list, list) { + if (iter->lstart + iter->len <= start || end <= iter->lstart) + continue; + if (iter->len < dpolicy->granularity) + continue; + if (iter->state == D_DONE && !iter->ref) { + wait_for_completion_io(&iter->wait); + if (!iter->error) + trimmed += iter->len; + __remove_discard_cmd(sbi, iter); + } else { + iter->ref++; + dc = iter; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + + if (dc) { + trimmed += __wait_one_discard_bio(sbi, dc); + goto next; + } + + return trimmed; +} + +static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) +{ + struct discard_policy dp; + unsigned int discard_blks; + + if (dpolicy) + return __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); + + /* wait all */ + __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, 1); + discard_blks = __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, 1); + discard_blks += __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + + return discard_blks; +} + +/* This should be covered by global mutex, &sit_i->sentry_lock */ +static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; + bool need_wait = false; + + mutex_lock(&dcc->cmd_lock); + dc = (struct discard_cmd *)f2fs_lookup_rb_tree(&dcc->root, + NULL, blkaddr); + if (dc) { + if (dc->state == D_PREP) { + __punch_discard_cmd(sbi, dc, blkaddr); + } else { + dc->ref++; + need_wait = true; + } + } + mutex_unlock(&dcc->cmd_lock); + + if (need_wait) + __wait_one_discard_bio(sbi, dc); +} + +void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (dcc && dcc->f2fs_issue_discard) { + struct task_struct *discard_thread = dcc->f2fs_issue_discard; + + dcc->f2fs_issue_discard = NULL; + kthread_stop(discard_thread); + } +} + +/* This comes from f2fs_put_super */ +bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_policy dpolicy; + bool dropped; + + __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, + dcc->discard_granularity); + __issue_discard_cmd(sbi, &dpolicy); + dropped = __drop_discard_cmd(sbi); + + /* just to make sure there is no pending discard commands */ + __wait_all_discard_cmd(sbi, NULL); + + f2fs_bug_on(sbi, atomic_read(&dcc->discard_cmd_cnt)); + return dropped; +} + +static int issue_discard_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + wait_queue_head_t *q = &dcc->discard_wait_queue; + struct discard_policy dpolicy; + unsigned int wait_ms = dcc->min_discard_issue_time; + int issued; + + set_freezable(); + + do { + if (sbi->gc_mode == GC_URGENT_HIGH || + !f2fs_available_free_memory(sbi, DISCARD_CACHE)) + __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); + else + __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, + dcc->discard_granularity); + + if (!atomic_read(&dcc->discard_cmd_cnt)) + wait_ms = dpolicy.max_interval; + + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); + + if (dcc->discard_wake) + dcc->discard_wake = 0; + + /* clean up pending candidates before going to sleep */ + if (atomic_read(&dcc->queued_discard)) + __wait_all_discard_cmd(sbi, NULL); + + if (try_to_freeze()) + continue; + if (f2fs_readonly(sbi->sb)) + continue; + if (kthread_should_stop()) + return 0; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + wait_ms = dpolicy.max_interval; + continue; + } + if (!atomic_read(&dcc->discard_cmd_cnt)) + continue; + + sb_start_intwrite(sbi->sb); + + issued = __issue_discard_cmd(sbi, &dpolicy); + if (issued > 0) { + __wait_all_discard_cmd(sbi, &dpolicy); + wait_ms = dpolicy.min_interval; + } else if (issued == -1) { + wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME); + if (!wait_ms) + wait_ms = dpolicy.mid_interval; + } else { + wait_ms = dpolicy.max_interval; + } + + sb_end_intwrite(sbi->sb); + + } while (!kthread_should_stop()); + return 0; +} + +#ifdef CONFIG_BLK_DEV_ZONED +static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + sector_t sector, nr_sects; + block_t lblkstart = blkstart; + int devi = 0; + + if (f2fs_is_multi_device(sbi)) { + devi = f2fs_target_device_index(sbi, blkstart); + if (blkstart < FDEV(devi).start_blk || + blkstart > FDEV(devi).end_blk) { + f2fs_err(sbi, "Invalid block %x", blkstart); + return -EIO; + } + blkstart -= FDEV(devi).start_blk; + } + + /* For sequential zones, reset the zone write pointer */ + if (f2fs_blkz_is_seq(sbi, devi, blkstart)) { + sector = SECTOR_FROM_BLOCK(blkstart); + nr_sects = SECTOR_FROM_BLOCK(blklen); + + if (sector & (bdev_zone_sectors(bdev) - 1) || + nr_sects != bdev_zone_sectors(bdev)) { + f2fs_err(sbi, "(%d) %s: Unaligned zone reset attempted (block %x + %x)", + devi, sbi->s_ndevs ? FDEV(devi).path : "", + blkstart, blklen); + return -EIO; + } + trace_f2fs_issue_reset_zone(bdev, blkstart); + return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + sector, nr_sects, GFP_NOFS); + } + + /* For conventional zones, use regular discard if supported */ + return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); +} +#endif + +static int __issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) + return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); +#endif + return __queue_discard_cmd(sbi, bdev, blkstart, blklen); +} + +static int f2fs_issue_discard(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t start = blkstart, len = 0; + struct block_device *bdev; + struct seg_entry *se; + unsigned int offset; + block_t i; + int err = 0; + + bdev = f2fs_target_device(sbi, blkstart, NULL); + + for (i = blkstart; i < blkstart + blklen; i++, len++) { + if (i != start) { + struct block_device *bdev2 = + f2fs_target_device(sbi, i, NULL); + + if (bdev2 != bdev) { + err = __issue_discard_async(sbi, bdev, + start, len); + if (err) + return err; + bdev = bdev2; + start = i; + len = 0; + } + } + + se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); + offset = GET_BLKOFF_FROM_SEG0(sbi, i); + + if (f2fs_block_unit_discard(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) + sbi->discard_blks--; + } + + if (len) + err = __issue_discard_async(sbi, bdev, start, len); + return err; +} + +static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, + bool check_only) +{ + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + int max_blocks = sbi->blocks_per_seg; + struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start); + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *discard_map = (unsigned long *)se->discard_map; + unsigned long *dmap = SIT_I(sbi)->tmp_map; + unsigned int start = 0, end = -1; + bool force = (cpc->reason & CP_DISCARD); + struct discard_entry *de = NULL; + struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; + int i; + + if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi) || + !f2fs_block_unit_discard(sbi)) + return false; + + if (!force) { + if (!f2fs_realtime_discard_enable(sbi) || !se->valid_blocks || + SM_I(sbi)->dcc_info->nr_discards >= + SM_I(sbi)->dcc_info->max_discards) + return false; + } + + /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ + for (i = 0; i < entries; i++) + dmap[i] = force ? ~ckpt_map[i] & ~discard_map[i] : + (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; + + while (force || SM_I(sbi)->dcc_info->nr_discards <= + SM_I(sbi)->dcc_info->max_discards) { + start = __find_rev_next_bit(dmap, max_blocks, end + 1); + if (start >= max_blocks) + break; + + end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + if (force && start && end != max_blocks + && (end - start) < cpc->trim_minlen) + continue; + + if (check_only) + return true; + + if (!de) { + de = f2fs_kmem_cache_alloc(discard_entry_slab, + GFP_F2FS_ZERO, true, NULL); + de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); + list_add_tail(&de->list, head); + } + + for (i = start; i < end; i++) + __set_bit_le(i, (void *)de->discard_map); + + SM_I(sbi)->dcc_info->nr_discards += end - start; + } + return false; +} + +static void release_discard_addr(struct discard_entry *entry) +{ + list_del(&entry->list); + kmem_cache_free(discard_entry_slab, entry); +} + +void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); + struct discard_entry *entry, *this; + + /* drop caches */ + list_for_each_entry_safe(entry, this, head, list) + release_discard_addr(entry); +} + +/* + * Should call f2fs_clear_prefree_segments after checkpoint is done. + */ +static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int segno; + + mutex_lock(&dirty_i->seglist_lock); + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi)) + __set_test_and_free(sbi, segno, false); + mutex_unlock(&dirty_i->seglist_lock); +} + +void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *head = &dcc->entry_list; + struct discard_entry *entry, *this; + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; + unsigned int start = 0, end = -1; + unsigned int secno, start_segno; + bool force = (cpc->reason & CP_DISCARD); + bool section_alignment = F2FS_OPTION(sbi).discard_unit == + DISCARD_UNIT_SECTION; + + if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) + section_alignment = true; + + mutex_lock(&dirty_i->seglist_lock); + + while (1) { + int i; + + if (section_alignment && end != -1) + end--; + start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1); + if (start >= MAIN_SEGS(sbi)) + break; + end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi), + start + 1); + + if (section_alignment) { + start = rounddown(start, sbi->segs_per_sec); + end = roundup(end, sbi->segs_per_sec); + } + + for (i = start; i < end; i++) { + if (test_and_clear_bit(i, prefree_map)) + dirty_i->nr_dirty[PRE]--; + } + + if (!f2fs_realtime_discard_enable(sbi)) + continue; + + if (force && start >= cpc->trim_start && + (end - 1) <= cpc->trim_end) + continue; + + if (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi)) { + f2fs_issue_discard(sbi, START_BLOCK(sbi, start), + (end - start) << sbi->log_blocks_per_seg); + continue; + } +next: + secno = GET_SEC_FROM_SEG(sbi, start); + start_segno = GET_SEG_FROM_SEC(sbi, secno); + if (!IS_CURSEC(sbi, secno) && + !get_valid_blocks(sbi, start, true)) + f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), + sbi->segs_per_sec << sbi->log_blocks_per_seg); + + start = start_segno + sbi->segs_per_sec; + if (start < end) + goto next; + else + end = start - 1; + } + mutex_unlock(&dirty_i->seglist_lock); + + if (!f2fs_block_unit_discard(sbi)) + goto wakeup; + + /* send small discards */ + list_for_each_entry_safe(entry, this, head, list) { + unsigned int cur_pos = 0, next_pos, len, total_len = 0; + bool is_valid = test_bit_le(0, entry->discard_map); + +find_next: + if (is_valid) { + next_pos = find_next_zero_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + len = next_pos - cur_pos; + + if (f2fs_sb_has_blkzoned(sbi) || + (force && len < cpc->trim_minlen)) + goto skip; + + f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, + len); + total_len += len; + } else { + next_pos = find_next_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + } +skip: + cur_pos = next_pos; + is_valid = !is_valid; + + if (cur_pos < sbi->blocks_per_seg) + goto find_next; + + release_discard_addr(entry); + dcc->nr_discards -= total_len; + } + +wakeup: + wake_up_discard_thread(sbi, false); +} + +int f2fs_start_discard_thread(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int err = 0; + + if (!f2fs_realtime_discard_enable(sbi)) + return 0; + + dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, + "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(dcc->f2fs_issue_discard)) { + err = PTR_ERR(dcc->f2fs_issue_discard); + dcc->f2fs_issue_discard = NULL; + } + + return err; +} + +static int create_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc; + int err = 0, i; + + if (SM_I(sbi)->dcc_info) { + dcc = SM_I(sbi)->dcc_info; + goto init_thread; + } + + dcc = f2fs_kzalloc(sbi, sizeof(struct discard_cmd_control), GFP_KERNEL); + if (!dcc) + return -ENOMEM; + + dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) + dcc->discard_granularity = sbi->blocks_per_seg; + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) + dcc->discard_granularity = BLKS_PER_SEC(sbi); + + INIT_LIST_HEAD(&dcc->entry_list); + for (i = 0; i < MAX_PLIST_NUM; i++) + INIT_LIST_HEAD(&dcc->pend_list[i]); + INIT_LIST_HEAD(&dcc->wait_list); + INIT_LIST_HEAD(&dcc->fstrim_list); + mutex_init(&dcc->cmd_lock); + atomic_set(&dcc->issued_discard, 0); + atomic_set(&dcc->queued_discard, 0); + atomic_set(&dcc->discard_cmd_cnt, 0); + dcc->nr_discards = 0; + dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; + dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST; + dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; + dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; + dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME; + dcc->undiscard_blks = 0; + dcc->next_pos = 0; + dcc->root = RB_ROOT_CACHED; + dcc->rbtree_check = false; + + init_waitqueue_head(&dcc->discard_wait_queue); + SM_I(sbi)->dcc_info = dcc; +init_thread: + err = f2fs_start_discard_thread(sbi); + if (err) { + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; + } + + return err; +} + +static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (!dcc) + return; + + f2fs_stop_discard_thread(sbi); + + /* + * Recovery can cache discard commands, so in error path of + * fill_super(), it needs to give a chance to handle them. + */ + if (unlikely(atomic_read(&dcc->discard_cmd_cnt))) + f2fs_issue_discard_timeout(sbi); + + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; +} + +static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + + if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) { + sit_i->dirty_sentries++; + return false; + } + + return true; +} + +static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, + unsigned int segno, int modified) +{ + struct seg_entry *se = get_seg_entry(sbi, segno); + + se->type = type; + if (modified) + __mark_sit_entry_dirty(sbi, segno); +} + +static inline unsigned long long get_segment_mtime(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + unsigned int segno = GET_SEGNO(sbi, blkaddr); + + if (segno == NULL_SEGNO) + return 0; + return get_seg_entry(sbi, segno)->mtime; +} + +static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr, + unsigned long long old_mtime) +{ + struct seg_entry *se; + unsigned int segno = GET_SEGNO(sbi, blkaddr); + unsigned long long ctime = get_mtime(sbi, false); + unsigned long long mtime = old_mtime ? old_mtime : ctime; + + if (segno == NULL_SEGNO) + return; + + se = get_seg_entry(sbi, segno); + + if (!se->mtime) + se->mtime = mtime; + else + se->mtime = div_u64(se->mtime * se->valid_blocks + mtime, + se->valid_blocks + 1); + + if (ctime > SIT_I(sbi)->max_mtime) + SIT_I(sbi)->max_mtime = ctime; +} + +static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) +{ + struct seg_entry *se; + unsigned int segno, offset; + long int new_vblocks; + bool exist; +#ifdef CONFIG_F2FS_CHECK_FS + bool mir_exist; +#endif + + segno = GET_SEGNO(sbi, blkaddr); + + se = get_seg_entry(sbi, segno); + new_vblocks = se->valid_blocks + del; + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + + f2fs_bug_on(sbi, (new_vblocks < 0 || + (new_vblocks > f2fs_usable_blks_in_seg(sbi, segno)))); + + se->valid_blocks = new_vblocks; + + /* Update valid block bitmap */ + if (del > 0) { + exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + mir_exist = f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_err(sbi, "Inconsistent error when setting bitmap, blk:%u, old bit:%d", + blkaddr, exist); + f2fs_bug_on(sbi, 1); + } +#endif + if (unlikely(exist)) { + f2fs_err(sbi, "Bitmap was wrongly set, blk:%u", + blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks--; + del = 0; + } + + if (f2fs_block_unit_discard(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) + sbi->discard_blks--; + + /* + * SSR should never reuse block which is checkpointed + * or newly invalidated. + */ + if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { + if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) + se->ckpt_valid_blocks++; + } + } else { + exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + mir_exist = f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_err(sbi, "Inconsistent error when clearing bitmap, blk:%u, old bit:%d", + blkaddr, exist); + f2fs_bug_on(sbi, 1); + } +#endif + if (unlikely(!exist)) { + f2fs_err(sbi, "Bitmap was wrongly cleared, blk:%u", + blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks++; + del = 0; + } else if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + /* + * If checkpoints are off, we must not reuse data that + * was used in the previous checkpoint. If it was used + * before, we must track that to know how much space we + * really have. + */ + if (f2fs_test_bit(offset, se->ckpt_valid_map)) { + spin_lock(&sbi->stat_lock); + sbi->unusable_block_count++; + spin_unlock(&sbi->stat_lock); + } + } + + if (f2fs_block_unit_discard(sbi) && + f2fs_test_and_clear_bit(offset, se->discard_map)) + sbi->discard_blks++; + } + if (!f2fs_test_bit(offset, se->ckpt_valid_map)) + se->ckpt_valid_blocks += del; + + __mark_sit_entry_dirty(sbi, segno); + + /* update total number of valid blocks to be written in ckpt area */ + SIT_I(sbi)->written_valid_blocks += del; + + if (__is_large_section(sbi)) + get_sec_entry(sbi, segno)->valid_blocks += del; +} + +void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) +{ + unsigned int segno = GET_SEGNO(sbi, addr); + struct sit_info *sit_i = SIT_I(sbi); + + f2fs_bug_on(sbi, addr == NULL_ADDR); + if (addr == NEW_ADDR || addr == COMPRESS_ADDR) + return; + + invalidate_mapping_pages(META_MAPPING(sbi), addr, addr); + f2fs_invalidate_compress_page(sbi, addr); + + /* add it into sit main buffer */ + down_write(&sit_i->sentry_lock); + + update_segment_mtime(sbi, addr, 0); + update_sit_entry(sbi, addr, -1); + + /* add it into dirty seglist */ + locate_dirty_segment(sbi, segno); + + up_write(&sit_i->sentry_lock); +} + +bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int segno, offset; + struct seg_entry *se; + bool is_cp = false; + + if (!__is_valid_data_blkaddr(blkaddr)) + return true; + + down_read(&sit_i->sentry_lock); + + segno = GET_SEGNO(sbi, blkaddr); + se = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + + if (f2fs_test_bit(offset, se->ckpt_valid_map)) + is_cp = true; + + up_read(&sit_i->sentry_lock); + + return is_cp; +} + +/* + * This function should be resided under the curseg_mutex lock + */ +static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, + struct f2fs_summary *sum) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + void *addr = curseg->sum_blk; + + addr += curseg->next_blkoff * sizeof(struct f2fs_summary); + memcpy(addr, sum, sizeof(struct f2fs_summary)); +} + +/* + * Calculate the number of current summary pages for writing + */ +int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) +{ + int valid_sum_count = 0; + int i, sum_in_page; + + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + if (sbi->ckpt->alloc_type[i] == SSR) + valid_sum_count += sbi->blocks_per_seg; + else { + if (for_ra) + valid_sum_count += le16_to_cpu( + F2FS_CKPT(sbi)->cur_data_blkoff[i]); + else + valid_sum_count += curseg_blkoff(sbi, i); + } + } + + sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE - + SUM_FOOTER_SIZE) / SUMMARY_SIZE; + if (valid_sum_count <= sum_in_page) + return 1; + else if ((valid_sum_count - sum_in_page) <= + (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) + return 2; + return 3; +} + +/* + * Caller should put this summary page + */ +struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) +{ + if (unlikely(f2fs_cp_error(sbi))) + return ERR_PTR(-EIO); + return f2fs_get_meta_page_retry(sbi, GET_SUM_BLOCK(sbi, segno)); +} + +void f2fs_update_meta_page(struct f2fs_sb_info *sbi, + void *src, block_t blk_addr) +{ + struct page *page = f2fs_grab_meta_page(sbi, blk_addr); + + memcpy(page_address(page), src, PAGE_SIZE); + set_page_dirty(page); + f2fs_put_page(page, 1); +} + +static void write_sum_page(struct f2fs_sb_info *sbi, + struct f2fs_summary_block *sum_blk, block_t blk_addr) +{ + f2fs_update_meta_page(sbi, (void *)sum_blk, blk_addr); +} + +static void write_current_sum_page(struct f2fs_sb_info *sbi, + int type, block_t blk_addr) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + struct page *page = f2fs_grab_meta_page(sbi, blk_addr); + struct f2fs_summary_block *src = curseg->sum_blk; + struct f2fs_summary_block *dst; + + dst = (struct f2fs_summary_block *)page_address(page); + memset(dst, 0, PAGE_SIZE); + + mutex_lock(&curseg->curseg_mutex); + + down_read(&curseg->journal_rwsem); + memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE); + up_read(&curseg->journal_rwsem); + + memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE); + memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE); + + mutex_unlock(&curseg->curseg_mutex); + + set_page_dirty(page); + f2fs_put_page(page, 1); +} + +static int is_next_segment_free(struct f2fs_sb_info *sbi, + struct curseg_info *curseg, int type) +{ + unsigned int segno = curseg->segno + 1; + struct free_segmap_info *free_i = FREE_I(sbi); + + if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec) + return !test_bit(segno, free_i->free_segmap); + return 0; +} + +/* + * Find a new segment from the free segments bitmap to right order + * This function should be returned with success, otherwise BUG + */ +static void get_new_segment(struct f2fs_sb_info *sbi, + unsigned int *newseg, bool new_sec, int dir) +{ + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int segno, secno, zoneno; + unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; + unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); + unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); + unsigned int left_start = hint; + bool init = true; + int go_left = 0; + int i; + + spin_lock(&free_i->segmap_lock); + + if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { + segno = find_next_zero_bit(free_i->free_segmap, + GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); + if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) + goto got_it; + } +find_other_zone: + secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); + if (secno >= MAIN_SECS(sbi)) { + if (dir == ALLOC_RIGHT) { + secno = find_first_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi)); + f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi)); + } else { + go_left = 1; + left_start = hint - 1; + } + } + if (go_left == 0) + goto skip_left; + + while (test_bit(left_start, free_i->free_secmap)) { + if (left_start > 0) { + left_start--; + continue; + } + left_start = find_first_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi)); + f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi)); + break; + } + secno = left_start; +skip_left: + segno = GET_SEG_FROM_SEC(sbi, secno); + zoneno = GET_ZONE_FROM_SEC(sbi, secno); + + /* give up on finding another zone */ + if (!init) + goto got_it; + if (sbi->secs_per_zone == 1) + goto got_it; + if (zoneno == old_zoneno) + goto got_it; + if (dir == ALLOC_LEFT) { + if (!go_left && zoneno + 1 >= total_zones) + goto got_it; + if (go_left && zoneno == 0) + goto got_it; + } + for (i = 0; i < NR_CURSEG_TYPE; i++) + if (CURSEG_I(sbi, i)->zone == zoneno) + break; + + if (i < NR_CURSEG_TYPE) { + /* zone is in user, try another */ + if (go_left) + hint = zoneno * sbi->secs_per_zone - 1; + else if (zoneno + 1 >= total_zones) + hint = 0; + else + hint = (zoneno + 1) * sbi->secs_per_zone; + init = false; + goto find_other_zone; + } +got_it: + /* set it as dirty segment in free segmap */ + f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap)); + __set_inuse(sbi, segno); + *newseg = segno; + spin_unlock(&free_i->segmap_lock); +} + +static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + struct summary_footer *sum_footer; + unsigned short seg_type = curseg->seg_type; + + curseg->inited = true; + curseg->segno = curseg->next_segno; + curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); + curseg->next_blkoff = 0; + curseg->next_segno = NULL_SEGNO; + + sum_footer = &(curseg->sum_blk->footer); + memset(sum_footer, 0, sizeof(struct summary_footer)); + + sanity_check_seg_type(sbi, seg_type); + + if (IS_DATASEG(seg_type)) + SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA); + if (IS_NODESEG(seg_type)) + SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE); + __set_sit_entry_type(sbi, seg_type, curseg->segno, modified); +} + +static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned short seg_type = curseg->seg_type; + + sanity_check_seg_type(sbi, seg_type); + if (f2fs_need_rand_seg(sbi)) + return prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); + + /* if segs_per_sec is large than 1, we need to keep original policy. */ + if (__is_large_section(sbi)) + return curseg->segno; + + /* inmem log may not locate on any segment after mount */ + if (!curseg->inited) + return 0; + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return 0; + + if (test_opt(sbi, NOHEAP) && + (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type))) + return 0; + + if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) + return SIT_I(sbi)->last_victim[ALLOC_NEXT]; + + /* find segments from 0 to reuse freed segments */ + if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) + return 0; + + return curseg->segno; +} + +/* + * Allocate a current working segment. + * This function always allocates a free segment in LFS manner. + */ +static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned short seg_type = curseg->seg_type; + unsigned int segno = curseg->segno; + int dir = ALLOC_LEFT; + + if (curseg->inited) + write_sum_page(sbi, curseg->sum_blk, + GET_SUM_BLOCK(sbi, segno)); + if (seg_type == CURSEG_WARM_DATA || seg_type == CURSEG_COLD_DATA) + dir = ALLOC_RIGHT; + + if (test_opt(sbi, NOHEAP)) + dir = ALLOC_RIGHT; + + segno = __get_next_segno(sbi, type); + get_new_segment(sbi, &segno, new_sec, dir); + curseg->next_segno = segno; + reset_curseg(sbi, type, 1); + curseg->alloc_type = LFS; + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) + curseg->fragment_remained_chunk = + prandom_u32_max(sbi->max_fragment_chunk) + 1; +} + +static int __next_free_blkoff(struct f2fs_sb_info *sbi, + int segno, block_t start) +{ + struct seg_entry *se = get_seg_entry(sbi, segno); + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + unsigned long *target_map = SIT_I(sbi)->tmp_map; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + int i; + + for (i = 0; i < entries; i++) + target_map[i] = ckpt_map[i] | cur_map[i]; + + return __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); +} + +/* + * If a segment is written by LFS manner, next block offset is just obtained + * by increasing the current block offset. However, if a segment is written by + * SSR manner, next block offset obtained by calling __next_free_blkoff + */ +static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, + struct curseg_info *seg) +{ + if (seg->alloc_type == SSR) { + seg->next_blkoff = + __next_free_blkoff(sbi, seg->segno, + seg->next_blkoff + 1); + } else { + seg->next_blkoff++; + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) { + /* To allocate block chunks in different sizes, use random number */ + if (--seg->fragment_remained_chunk <= 0) { + seg->fragment_remained_chunk = + prandom_u32_max(sbi->max_fragment_chunk) + 1; + seg->next_blkoff += + prandom_u32_max(sbi->max_fragment_hole) + 1; + } + } + } +} + +bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) +{ + return __next_free_blkoff(sbi, segno, 0) < sbi->blocks_per_seg; +} + +/* + * This function always allocates a used segment(from dirty seglist) by SSR + * manner, so it should recover the existing segment information of valid blocks + */ +static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int new_segno = curseg->next_segno; + struct f2fs_summary_block *sum_node; + struct page *sum_page; + + if (flush) + write_sum_page(sbi, curseg->sum_blk, + GET_SUM_BLOCK(sbi, curseg->segno)); + + __set_test_and_inuse(sbi, new_segno); + + mutex_lock(&dirty_i->seglist_lock); + __remove_dirty_segment(sbi, new_segno, PRE); + __remove_dirty_segment(sbi, new_segno, DIRTY); + mutex_unlock(&dirty_i->seglist_lock); + + reset_curseg(sbi, type, 1); + curseg->alloc_type = SSR; + curseg->next_blkoff = __next_free_blkoff(sbi, curseg->segno, 0); + + sum_page = f2fs_get_sum_page(sbi, new_segno); + if (IS_ERR(sum_page)) { + /* GC won't be able to use stale summary pages by cp_error */ + memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE); + return; + } + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); + f2fs_put_page(sum_page, 1); +} + +static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, + int alloc_mode, unsigned long long age); + +static void get_atssr_segment(struct f2fs_sb_info *sbi, int type, + int target_type, int alloc_mode, + unsigned long long age) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + curseg->seg_type = target_type; + + if (get_ssr_segment(sbi, type, alloc_mode, age)) { + struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); + + curseg->seg_type = se->type; + change_curseg(sbi, type, true); + } else { + /* allocate cold segment by default */ + curseg->seg_type = CURSEG_COLD_DATA; + new_curseg(sbi, type, true); + } + stat_inc_seg_type(sbi, curseg); +} + +static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC); + + if (!sbi->am.atgc_enabled) + return; + + f2fs_down_read(&SM_I(sbi)->curseg_lock); + + mutex_lock(&curseg->curseg_mutex); + down_write(&SIT_I(sbi)->sentry_lock); + + get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, CURSEG_COLD_DATA, SSR, 0); + + up_write(&SIT_I(sbi)->sentry_lock); + mutex_unlock(&curseg->curseg_mutex); + + f2fs_up_read(&SM_I(sbi)->curseg_lock); + +} +void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi) +{ + __f2fs_init_atgc_curseg(sbi); +} + +static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + if (!curseg->inited) + goto out; + + if (get_valid_blocks(sbi, curseg->segno, false)) { + write_sum_page(sbi, curseg->sum_blk, + GET_SUM_BLOCK(sbi, curseg->segno)); + } else { + mutex_lock(&DIRTY_I(sbi)->seglist_lock); + __set_test_and_free(sbi, curseg->segno, true); + mutex_unlock(&DIRTY_I(sbi)->seglist_lock); + } +out: + mutex_unlock(&curseg->curseg_mutex); +} + +void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi) +{ + __f2fs_save_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); + + if (sbi->am.atgc_enabled) + __f2fs_save_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); +} + +static void __f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + if (!curseg->inited) + goto out; + if (get_valid_blocks(sbi, curseg->segno, false)) + goto out; + + mutex_lock(&DIRTY_I(sbi)->seglist_lock); + __set_test_and_inuse(sbi, curseg->segno); + mutex_unlock(&DIRTY_I(sbi)->seglist_lock); +out: + mutex_unlock(&curseg->curseg_mutex); +} + +void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi) +{ + __f2fs_restore_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); + + if (sbi->am.atgc_enabled) + __f2fs_restore_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); +} + +static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, + int alloc_mode, unsigned long long age) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + unsigned segno = NULL_SEGNO; + unsigned short seg_type = curseg->seg_type; + int i, cnt; + bool reversed = false; + + sanity_check_seg_type(sbi, seg_type); + + /* f2fs_need_SSR() already forces to do this */ + if (!v_ops->get_victim(sbi, &segno, BG_GC, seg_type, alloc_mode, age)) { + curseg->next_segno = segno; + return 1; + } + + /* For node segments, let's do SSR more intensively */ + if (IS_NODESEG(seg_type)) { + if (seg_type >= CURSEG_WARM_NODE) { + reversed = true; + i = CURSEG_COLD_NODE; + } else { + i = CURSEG_HOT_NODE; + } + cnt = NR_CURSEG_NODE_TYPE; + } else { + if (seg_type >= CURSEG_WARM_DATA) { + reversed = true; + i = CURSEG_COLD_DATA; + } else { + i = CURSEG_HOT_DATA; + } + cnt = NR_CURSEG_DATA_TYPE; + } + + for (; cnt-- > 0; reversed ? i-- : i++) { + if (i == seg_type) + continue; + if (!v_ops->get_victim(sbi, &segno, BG_GC, i, alloc_mode, age)) { + curseg->next_segno = segno; + return 1; + } + } + + /* find valid_blocks=0 in dirty list */ + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + segno = get_free_segment(sbi); + if (segno != NULL_SEGNO) { + curseg->next_segno = segno; + return 1; + } + } + return 0; +} + +/* + * flush out current segment and replace it with new segment + * This function should be returned with success, otherwise BUG + */ +static void allocate_segment_by_default(struct f2fs_sb_info *sbi, + int type, bool force) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + if (force) + new_curseg(sbi, type, true); + else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + curseg->seg_type == CURSEG_WARM_NODE) + new_curseg(sbi, type, false); + else if (curseg->alloc_type == LFS && + is_next_segment_free(sbi, curseg, type) && + likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + new_curseg(sbi, type, false); + else if (f2fs_need_SSR(sbi) && + get_ssr_segment(sbi, type, SSR, 0)) + change_curseg(sbi, type, true); + else + new_curseg(sbi, type, false); + + stat_inc_seg_type(sbi, curseg); +} + +void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, + unsigned int start, unsigned int end) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int segno; + + f2fs_down_read(&SM_I(sbi)->curseg_lock); + mutex_lock(&curseg->curseg_mutex); + down_write(&SIT_I(sbi)->sentry_lock); + + segno = CURSEG_I(sbi, type)->segno; + if (segno < start || segno > end) + goto unlock; + + if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) + change_curseg(sbi, type, true); + else + new_curseg(sbi, type, true); + + stat_inc_seg_type(sbi, curseg); + + locate_dirty_segment(sbi, segno); +unlock: + up_write(&SIT_I(sbi)->sentry_lock); + + if (segno != curseg->segno) + f2fs_notice(sbi, "For resize: curseg of type %d: %u ==> %u", + type, segno, curseg->segno); + + mutex_unlock(&curseg->curseg_mutex); + f2fs_up_read(&SM_I(sbi)->curseg_lock); +} + +static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, + bool new_sec, bool force) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int old_segno; + + if (!curseg->inited) + goto alloc; + + if (force || curseg->next_blkoff || + get_valid_blocks(sbi, curseg->segno, new_sec)) + goto alloc; + + if (!get_ckpt_valid_blocks(sbi, curseg->segno, new_sec)) + return; +alloc: + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); + locate_dirty_segment(sbi, old_segno); +} + +static void __allocate_new_section(struct f2fs_sb_info *sbi, + int type, bool force) +{ + __allocate_new_segment(sbi, type, true, force); +} + +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) +{ + f2fs_down_read(&SM_I(sbi)->curseg_lock); + down_write(&SIT_I(sbi)->sentry_lock); + __allocate_new_section(sbi, type, force); + up_write(&SIT_I(sbi)->sentry_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); +} + +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) +{ + int i; + + f2fs_down_read(&SM_I(sbi)->curseg_lock); + down_write(&SIT_I(sbi)->sentry_lock); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) + __allocate_new_segment(sbi, i, false, false); + up_write(&SIT_I(sbi)->sentry_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); +} + +static const struct segment_allocation default_salloc_ops = { + .allocate_segment = allocate_segment_by_default, +}; + +bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + __u64 trim_start = cpc->trim_start; + bool has_candidate = false; + + down_write(&SIT_I(sbi)->sentry_lock); + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { + if (add_discard_addrs(sbi, cpc, true)) { + has_candidate = true; + break; + } + } + up_write(&SIT_I(sbi)->sentry_lock); + + cpc->trim_start = trim_start; + return has_candidate; +} + +static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + unsigned int start, unsigned int end) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct discard_cmd *dc; + struct blk_plug plug; + int issued; + unsigned int trimmed = 0; + +next: + issued = 0; + + mutex_lock(&dcc->cmd_lock); + if (unlikely(dcc->rbtree_check)) + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, + &dcc->root)); + + dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, + NULL, start, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true, NULL); + if (!dc) + dc = next_dc; + + blk_start_plug(&plug); + + while (dc && dc->lstart <= end) { + struct rb_node *node; + int err = 0; + + if (dc->len < dpolicy->granularity) + goto skip; + + if (dc->state != D_PREP) { + list_move_tail(&dc->list, &dcc->fstrim_list); + goto skip; + } + + err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); + + if (issued >= dpolicy->max_requests) { + start = dc->lstart + dc->len; + + if (err) + __remove_discard_cmd(sbi, dc); + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + trimmed += __wait_all_discard_cmd(sbi, NULL); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + goto next; + } +skip: + node = rb_next(&dc->rb_node); + if (err) + __remove_discard_cmd(sbi, dc); + dc = rb_entry_safe(node, struct discard_cmd, rb_node); + + if (fatal_signal_pending(current)) + break; + } + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + return trimmed; +} + +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) +{ + __u64 start = F2FS_BYTES_TO_BLK(range->start); + __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; + unsigned int start_segno, end_segno; + block_t start_block, end_block; + struct cp_control cpc; + struct discard_policy dpolicy; + unsigned long long trimmed = 0; + int err = 0; + bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi); + + if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) + return -EINVAL; + + if (end < MAIN_BLKADDR(sbi)) + goto out; + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + f2fs_warn(sbi, "Found FS corruption, run fsck to fix."); + return -EFSCORRUPTED; + } + + /* start/end segment number in main_area */ + start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); + end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : + GET_SEGNO(sbi, end); + if (need_align) { + start_segno = rounddown(start_segno, sbi->segs_per_sec); + end_segno = roundup(end_segno + 1, sbi->segs_per_sec) - 1; + } + + cpc.reason = CP_DISCARD; + cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); + cpc.trim_start = start_segno; + cpc.trim_end = end_segno; + + if (sbi->discard_blks == 0) + goto out; + + f2fs_down_write(&sbi->gc_lock); + err = f2fs_write_checkpoint(sbi, &cpc); + f2fs_up_write(&sbi->gc_lock); + if (err) + goto out; + + /* + * We filed discard candidates, but actually we don't need to wait for + * all of them, since they'll be issued in idle time along with runtime + * discard option. User configuration looks like using runtime discard + * or periodic fstrim instead of it. + */ + if (f2fs_realtime_discard_enable(sbi)) + goto out; + + start_block = START_BLOCK(sbi, start_segno); + end_block = START_BLOCK(sbi, end_segno + 1); + + __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); + trimmed = __issue_discard_cmd_range(sbi, &dpolicy, + start_block, end_block); + + trimmed += __wait_discard_cmd_range(sbi, &dpolicy, + start_block, end_block); +out: + if (!err) + range->len = F2FS_BLK_TO_BYTES(trimmed); + return err; +} + +static bool __has_curseg_space(struct f2fs_sb_info *sbi, + struct curseg_info *curseg) +{ + return curseg->next_blkoff < f2fs_usable_blks_in_seg(sbi, + curseg->segno); +} + +int f2fs_rw_hint_to_seg_type(enum rw_hint hint) +{ + switch (hint) { + case WRITE_LIFE_SHORT: + return CURSEG_HOT_DATA; + case WRITE_LIFE_EXTREME: + return CURSEG_COLD_DATA; + default: + return CURSEG_WARM_DATA; + } +} + +static int __get_segment_type_2(struct f2fs_io_info *fio) +{ + if (fio->type == DATA) + return CURSEG_HOT_DATA; + else + return CURSEG_HOT_NODE; +} + +static int __get_segment_type_4(struct f2fs_io_info *fio) +{ + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; + + if (S_ISDIR(inode->i_mode)) + return CURSEG_HOT_DATA; + else + return CURSEG_COLD_DATA; + } else { + if (IS_DNODE(fio->page) && is_cold_node(fio->page)) + return CURSEG_WARM_NODE; + else + return CURSEG_COLD_NODE; + } +} + +static int __get_segment_type_6(struct f2fs_io_info *fio) +{ + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; + + if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) + return CURSEG_COLD_DATA_PINNED; + + if (page_private_gcing(fio->page)) { + if (fio->sbi->am.atgc_enabled && + (fio->io_type == FS_DATA_IO) && + (fio->sbi->gc_mode != GC_URGENT_HIGH)) + return CURSEG_ALL_DATA_ATGC; + else + return CURSEG_COLD_DATA; + } + if (file_is_cold(inode) || f2fs_need_compress_data(inode)) + return CURSEG_COLD_DATA; + if (file_is_hot(inode) || + is_inode_flag_set(inode, FI_HOT_DATA) || + f2fs_is_cow_file(inode)) + return CURSEG_HOT_DATA; + return f2fs_rw_hint_to_seg_type(inode->i_write_hint); + } else { + if (IS_DNODE(fio->page)) + return is_cold_node(fio->page) ? CURSEG_WARM_NODE : + CURSEG_HOT_NODE; + return CURSEG_COLD_NODE; + } +} + +static int __get_segment_type(struct f2fs_io_info *fio) +{ + int type = 0; + + switch (F2FS_OPTION(fio->sbi).active_logs) { + case 2: + type = __get_segment_type_2(fio); + break; + case 4: + type = __get_segment_type_4(fio); + break; + case 6: + type = __get_segment_type_6(fio); + break; + default: + f2fs_bug_on(fio->sbi, true); + } + + if (IS_HOT(type)) + fio->temp = HOT; + else if (IS_WARM(type)) + fio->temp = WARM; + else + fio->temp = COLD; + return type; +} + +void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned long long old_mtime; + bool from_gc = (type == CURSEG_ALL_DATA_ATGC); + struct seg_entry *se = NULL; + + f2fs_down_read(&SM_I(sbi)->curseg_lock); + + mutex_lock(&curseg->curseg_mutex); + down_write(&sit_i->sentry_lock); + + if (from_gc) { + f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO); + se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr)); + sanity_check_seg_type(sbi, se->type); + f2fs_bug_on(sbi, IS_NODESEG(se->type)); + } + *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + + f2fs_bug_on(sbi, curseg->next_blkoff >= sbi->blocks_per_seg); + + f2fs_wait_discard_bio(sbi, *new_blkaddr); + + /* + * __add_sum_entry should be resided under the curseg_mutex + * because, this function updates a summary entry in the + * current summary block. + */ + __add_sum_entry(sbi, type, sum); + + __refresh_next_blkoff(sbi, curseg); + + stat_inc_block_count(sbi, curseg); + + if (from_gc) { + old_mtime = get_segment_mtime(sbi, old_blkaddr); + } else { + update_segment_mtime(sbi, old_blkaddr, 0); + old_mtime = 0; + } + update_segment_mtime(sbi, *new_blkaddr, old_mtime); + + /* + * SIT information should be updated before segment allocation, + * since SSR needs latest valid block information. + */ + update_sit_entry(sbi, *new_blkaddr, 1); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + update_sit_entry(sbi, old_blkaddr, -1); + + if (!__has_curseg_space(sbi, curseg)) { + if (from_gc) + get_atssr_segment(sbi, type, se->type, + AT_SSR, se->mtime); + else + sit_i->s_ops->allocate_segment(sbi, type, false); + } + /* + * segment dirty status should be updated after segment allocation, + * so we just need to update status only one time after previous + * segment being closed. + */ + locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); + + up_write(&sit_i->sentry_lock); + + if (page && IS_NODESEG(type)) { + fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + + f2fs_inode_chksum_set(sbi, page); + } + + if (fio) { + struct f2fs_bio_info *io; + + if (F2FS_IO_ALIGNED(sbi)) + fio->retry = false; + + INIT_LIST_HEAD(&fio->list); + fio->in_list = true; + io = sbi->write_io[fio->type] + fio->temp; + spin_lock(&io->io_lock); + list_add_tail(&fio->list, &io->io_list); + spin_unlock(&io->io_lock); + } + + mutex_unlock(&curseg->curseg_mutex); + + f2fs_up_read(&SM_I(sbi)->curseg_lock); +} + +void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, + block_t blkaddr, unsigned int blkcnt) +{ + if (!f2fs_is_multi_device(sbi)) + return; + + while (1) { + unsigned int devidx = f2fs_target_device_index(sbi, blkaddr); + unsigned int blks = FDEV(devidx).end_blk - blkaddr + 1; + + /* update device state for fsync */ + f2fs_set_dirty_device(sbi, ino, devidx, FLUSH_INO); + + /* update device state for checkpoint */ + if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { + spin_lock(&sbi->dev_lock); + f2fs_set_bit(devidx, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } + + if (blkcnt <= blks) + break; + blkcnt -= blks; + blkaddr += blks; + } +} + +static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) +{ + int type = __get_segment_type(fio); + bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA); + + if (keep_order) + f2fs_down_read(&fio->sbi->io_order_lock); +reallocate: + f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, + &fio->new_blkaddr, sum, type, fio); + if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) { + invalidate_mapping_pages(META_MAPPING(fio->sbi), + fio->old_blkaddr, fio->old_blkaddr); + f2fs_invalidate_compress_page(fio->sbi, fio->old_blkaddr); + } + + /* writeout dirty page into bdev */ + f2fs_submit_page_write(fio); + if (fio->retry) { + fio->old_blkaddr = fio->new_blkaddr; + goto reallocate; + } + + f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); + + if (keep_order) + f2fs_up_read(&fio->sbi->io_order_lock); +} + +void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type) +{ + struct f2fs_io_info fio = { + .sbi = sbi, + .type = META, + .temp = HOT, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, + .old_blkaddr = page->index, + .new_blkaddr = page->index, + .page = page, + .encrypted_page = NULL, + .in_list = false, + }; + + if (unlikely(page->index >= MAIN_BLKADDR(sbi))) + fio.op_flags &= ~REQ_META; + + set_page_writeback(page); + ClearPageError(page); + f2fs_submit_page_write(&fio); + + stat_inc_meta_count(sbi, page->index); + f2fs_update_iostat(sbi, NULL, io_type, F2FS_BLKSIZE); +} + +void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) +{ + struct f2fs_summary sum; + + set_summary(&sum, nid, 0, 0); + do_write_page(&sum, fio); + + f2fs_update_iostat(fio->sbi, NULL, fio->io_type, F2FS_BLKSIZE); +} + +void f2fs_outplace_write_data(struct dnode_of_data *dn, + struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = fio->sbi; + struct f2fs_summary sum; + + f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); + set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); + do_write_page(&sum, fio); + f2fs_update_data_blkaddr(dn, fio->new_blkaddr); + + f2fs_update_iostat(sbi, dn->inode, fio->io_type, F2FS_BLKSIZE); +} + +int f2fs_inplace_write_data(struct f2fs_io_info *fio) +{ + int err; + struct f2fs_sb_info *sbi = fio->sbi; + unsigned int segno; + + fio->new_blkaddr = fio->old_blkaddr; + /* i/o temperature is needed for passing down write hints */ + __get_segment_type(fio); + + segno = GET_SEGNO(sbi, fio->new_blkaddr); + + if (!IS_DATASEG(get_seg_entry(sbi, segno)->type)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.", + __func__, segno); + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); + goto drop_bio; + } + + if (f2fs_cp_error(sbi)) { + err = -EIO; + goto drop_bio; + } + + if (fio->post_read) + invalidate_mapping_pages(META_MAPPING(sbi), + fio->new_blkaddr, fio->new_blkaddr); + + stat_inc_inplace_blocks(fio->sbi); + + if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE))) + err = f2fs_merge_page_bio(fio); + else + err = f2fs_submit_page_bio(fio); + if (!err) { + f2fs_update_device_state(fio->sbi, fio->ino, + fio->new_blkaddr, 1); + f2fs_update_iostat(fio->sbi, fio->page->mapping->host, + fio->io_type, F2FS_BLKSIZE); + } + + return err; +drop_bio: + if (fio->bio && *(fio->bio)) { + struct bio *bio = *(fio->bio); + + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + *(fio->bio) = NULL; + } + return err; +} + +static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + int i; + + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { + if (CURSEG_I(sbi, i)->segno == segno) + break; + } + return i; +} + +void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr, + bool recover_curseg, bool recover_newaddr, + bool from_gc) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct curseg_info *curseg; + unsigned int segno, old_cursegno; + struct seg_entry *se; + int type; + unsigned short old_blkoff; + unsigned char old_alloc_type; + + segno = GET_SEGNO(sbi, new_blkaddr); + se = get_seg_entry(sbi, segno); + type = se->type; + + f2fs_down_write(&SM_I(sbi)->curseg_lock); + + if (!recover_curseg) { + /* for recovery flow */ + if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { + if (old_blkaddr == NULL_ADDR) + type = CURSEG_COLD_DATA; + else + type = CURSEG_WARM_DATA; + } + } else { + if (IS_CURSEG(sbi, segno)) { + /* se->type is volatile as SSR allocation */ + type = __f2fs_get_curseg(sbi, segno); + f2fs_bug_on(sbi, type == NO_CHECK_TYPE); + } else { + type = CURSEG_WARM_DATA; + } + } + + f2fs_bug_on(sbi, !IS_DATASEG(type)); + curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + down_write(&sit_i->sentry_lock); + + old_cursegno = curseg->segno; + old_blkoff = curseg->next_blkoff; + old_alloc_type = curseg->alloc_type; + + /* change the current segment */ + if (segno != curseg->segno) { + curseg->next_segno = segno; + change_curseg(sbi, type, true); + } + + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); + __add_sum_entry(sbi, type, sum); + + if (!recover_curseg || recover_newaddr) { + if (!from_gc) + update_segment_mtime(sbi, new_blkaddr, 0); + update_sit_entry(sbi, new_blkaddr, 1); + } + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { + invalidate_mapping_pages(META_MAPPING(sbi), + old_blkaddr, old_blkaddr); + f2fs_invalidate_compress_page(sbi, old_blkaddr); + if (!from_gc) + update_segment_mtime(sbi, old_blkaddr, 0); + update_sit_entry(sbi, old_blkaddr, -1); + } + + locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr)); + + locate_dirty_segment(sbi, old_cursegno); + + if (recover_curseg) { + if (old_cursegno != curseg->segno) { + curseg->next_segno = old_cursegno; + change_curseg(sbi, type, true); + } + curseg->next_blkoff = old_blkoff; + curseg->alloc_type = old_alloc_type; + } + + up_write(&sit_i->sentry_lock); + mutex_unlock(&curseg->curseg_mutex); + f2fs_up_write(&SM_I(sbi)->curseg_lock); +} + +void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, + block_t old_addr, block_t new_addr, + unsigned char version, bool recover_curseg, + bool recover_newaddr) +{ + struct f2fs_summary sum; + + set_summary(&sum, dn->nid, dn->ofs_in_node, version); + + f2fs_do_replace_block(sbi, &sum, old_addr, new_addr, + recover_curseg, recover_newaddr, false); + + f2fs_update_data_blkaddr(dn, new_addr); +} + +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type, bool ordered, bool locked) +{ + if (PageWriteback(page)) { + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + + /* submit cached LFS IO */ + f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type); + /* sbumit cached IPU IO */ + f2fs_submit_merged_ipu_write(sbi, NULL, page); + if (ordered) { + wait_on_page_writeback(page); + f2fs_bug_on(sbi, locked && PageWriteback(page)); + } else { + wait_for_stable_page(page); + } + } +} + +void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct page *cpage; + + if (!f2fs_post_read_required(inode)) + return; + + if (!__is_valid_data_blkaddr(blkaddr)) + return; + + cpage = find_lock_page(META_MAPPING(sbi), blkaddr); + if (cpage) { + f2fs_wait_on_page_writeback(cpage, DATA, true, true); + f2fs_put_page(cpage, 1); + } +} + +void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, + block_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + block_t i; + + if (!f2fs_post_read_required(inode)) + return; + + for (i = 0; i < len; i++) + f2fs_wait_on_block_writeback(inode, blkaddr + i); + + invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr + len - 1); +} + +static int read_compacted_summaries(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct curseg_info *seg_i; + unsigned char *kaddr; + struct page *page; + block_t start; + int i, j, offset; + + start = start_sum_block(sbi); + + page = f2fs_get_meta_page(sbi, start++); + if (IS_ERR(page)) + return PTR_ERR(page); + kaddr = (unsigned char *)page_address(page); + + /* Step 1: restore nat cache */ + seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); + memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE); + + /* Step 2: restore sit cache */ + seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); + memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE); + offset = 2 * SUM_JOURNAL_SIZE; + + /* Step 3: restore summary entries */ + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + unsigned short blk_off; + unsigned int segno; + + seg_i = CURSEG_I(sbi, i); + segno = le32_to_cpu(ckpt->cur_data_segno[i]); + blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]); + seg_i->next_segno = segno; + reset_curseg(sbi, i, 0); + seg_i->alloc_type = ckpt->alloc_type[i]; + seg_i->next_blkoff = blk_off; + + if (seg_i->alloc_type == SSR) + blk_off = sbi->blocks_per_seg; + + for (j = 0; j < blk_off; j++) { + struct f2fs_summary *s; + + s = (struct f2fs_summary *)(kaddr + offset); + seg_i->sum_blk->entries[j] = *s; + offset += SUMMARY_SIZE; + if (offset + SUMMARY_SIZE <= PAGE_SIZE - + SUM_FOOTER_SIZE) + continue; + + f2fs_put_page(page, 1); + page = NULL; + + page = f2fs_get_meta_page(sbi, start++); + if (IS_ERR(page)) + return PTR_ERR(page); + kaddr = (unsigned char *)page_address(page); + offset = 0; + } + } + f2fs_put_page(page, 1); + return 0; +} + +static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_summary_block *sum; + struct curseg_info *curseg; + struct page *new; + unsigned short blk_off; + unsigned int segno = 0; + block_t blk_addr = 0; + int err = 0; + + /* get segment number and block addr */ + if (IS_DATASEG(type)) { + segno = le32_to_cpu(ckpt->cur_data_segno[type]); + blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - + CURSEG_HOT_DATA]); + if (__exist_node_summaries(sbi)) + blk_addr = sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type); + else + blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); + } else { + segno = le32_to_cpu(ckpt->cur_node_segno[type - + CURSEG_HOT_NODE]); + blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type - + CURSEG_HOT_NODE]); + if (__exist_node_summaries(sbi)) + blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE, + type - CURSEG_HOT_NODE); + else + blk_addr = GET_SUM_BLOCK(sbi, segno); + } + + new = f2fs_get_meta_page(sbi, blk_addr); + if (IS_ERR(new)) + return PTR_ERR(new); + sum = (struct f2fs_summary_block *)page_address(new); + + if (IS_NODESEG(type)) { + if (__exist_node_summaries(sbi)) { + struct f2fs_summary *ns = &sum->entries[0]; + int i; + + for (i = 0; i < sbi->blocks_per_seg; i++, ns++) { + ns->version = 0; + ns->ofs_in_node = 0; + } + } else { + err = f2fs_restore_node_summary(sbi, segno, sum); + if (err) + goto out; + } + } + + /* set uncompleted segment to curseg */ + curseg = CURSEG_I(sbi, type); + mutex_lock(&curseg->curseg_mutex); + + /* update journal info */ + down_write(&curseg->journal_rwsem); + memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE); + up_write(&curseg->journal_rwsem); + + memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE); + memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE); + curseg->next_segno = segno; + reset_curseg(sbi, type, 0); + curseg->alloc_type = ckpt->alloc_type[type]; + curseg->next_blkoff = blk_off; + mutex_unlock(&curseg->curseg_mutex); +out: + f2fs_put_page(new, 1); + return err; +} + +static int restore_curseg_summaries(struct f2fs_sb_info *sbi) +{ + struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal; + struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal; + int type = CURSEG_HOT_DATA; + int err; + + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) { + int npages = f2fs_npages_for_summary_flush(sbi, true); + + if (npages >= 2) + f2fs_ra_meta_pages(sbi, start_sum_block(sbi), npages, + META_CP, true); + + /* restore for compacted data summary */ + err = read_compacted_summaries(sbi); + if (err) + return err; + type = CURSEG_HOT_NODE; + } + + if (__exist_node_summaries(sbi)) + f2fs_ra_meta_pages(sbi, + sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type), + NR_CURSEG_PERSIST_TYPE - type, META_CP, true); + + for (; type <= CURSEG_COLD_NODE; type++) { + err = read_normal_summaries(sbi, type); + if (err) + return err; + } + + /* sanity check for summary blocks */ + if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || + sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) { + f2fs_err(sbi, "invalid journal entries nats %u sits %u", + nats_in_cursum(nat_j), sits_in_cursum(sit_j)); + return -EINVAL; + } + + return 0; +} + +static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct page *page; + unsigned char *kaddr; + struct f2fs_summary *summary; + struct curseg_info *seg_i; + int written_size = 0; + int i, j; + + page = f2fs_grab_meta_page(sbi, blkaddr++); + kaddr = (unsigned char *)page_address(page); + memset(kaddr, 0, PAGE_SIZE); + + /* Step 1: write nat cache */ + seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); + memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE); + written_size += SUM_JOURNAL_SIZE; + + /* Step 2: write sit cache */ + seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); + memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE); + written_size += SUM_JOURNAL_SIZE; + + /* Step 3: write summary entries */ + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + unsigned short blkoff; + + seg_i = CURSEG_I(sbi, i); + if (sbi->ckpt->alloc_type[i] == SSR) + blkoff = sbi->blocks_per_seg; + else + blkoff = curseg_blkoff(sbi, i); + + for (j = 0; j < blkoff; j++) { + if (!page) { + page = f2fs_grab_meta_page(sbi, blkaddr++); + kaddr = (unsigned char *)page_address(page); + memset(kaddr, 0, PAGE_SIZE); + written_size = 0; + } + summary = (struct f2fs_summary *)(kaddr + written_size); + *summary = seg_i->sum_blk->entries[j]; + written_size += SUMMARY_SIZE; + + if (written_size + SUMMARY_SIZE <= PAGE_SIZE - + SUM_FOOTER_SIZE) + continue; + + set_page_dirty(page); + f2fs_put_page(page, 1); + page = NULL; + } + } + if (page) { + set_page_dirty(page); + f2fs_put_page(page, 1); + } +} + +static void write_normal_summaries(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + int i, end; + + if (IS_DATASEG(type)) + end = type + NR_CURSEG_DATA_TYPE; + else + end = type + NR_CURSEG_NODE_TYPE; + + for (i = type; i < end; i++) + write_current_sum_page(sbi, i, blkaddr + (i - type)); +} + +void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +{ + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) + write_compacted_summaries(sbi, start_blk); + else + write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); +} + +void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +{ + write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); +} + +int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, + unsigned int val, int alloc) +{ + int i; + + if (type == NAT_JOURNAL) { + for (i = 0; i < nats_in_cursum(journal); i++) { + if (le32_to_cpu(nid_in_journal(journal, i)) == val) + return i; + } + if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL)) + return update_nats_in_cursum(journal, 1); + } else if (type == SIT_JOURNAL) { + for (i = 0; i < sits_in_cursum(journal); i++) + if (le32_to_cpu(segno_in_journal(journal, i)) == val) + return i; + if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL)) + return update_sits_in_cursum(journal, 1); + } + return -1; +} + +static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno)); +} + +static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, + unsigned int start) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct page *page; + pgoff_t src_off, dst_off; + + src_off = current_sit_addr(sbi, start); + dst_off = next_sit_addr(sbi, src_off); + + page = f2fs_grab_meta_page(sbi, dst_off); + seg_info_to_sit_page(sbi, page, start); + + set_page_dirty(page); + set_to_next_sit(sit_i, start); + + return page; +} + +static struct sit_entry_set *grab_sit_entry_set(void) +{ + struct sit_entry_set *ses = + f2fs_kmem_cache_alloc(sit_entry_set_slab, + GFP_NOFS, true, NULL); + + ses->entry_cnt = 0; + INIT_LIST_HEAD(&ses->set_list); + return ses; +} + +static void release_sit_entry_set(struct sit_entry_set *ses) +{ + list_del(&ses->set_list); + kmem_cache_free(sit_entry_set_slab, ses); +} + +static void adjust_sit_entry_set(struct sit_entry_set *ses, + struct list_head *head) +{ + struct sit_entry_set *next = ses; + + if (list_is_last(&ses->set_list, head)) + return; + + list_for_each_entry_continue(next, head, set_list) + if (ses->entry_cnt <= next->entry_cnt) { + list_move_tail(&ses->set_list, &next->set_list); + return; + } + + list_move_tail(&ses->set_list, head); +} + +static void add_sit_entry(unsigned int segno, struct list_head *head) +{ + struct sit_entry_set *ses; + unsigned int start_segno = START_SEGNO(segno); + + list_for_each_entry(ses, head, set_list) { + if (ses->start_segno == start_segno) { + ses->entry_cnt++; + adjust_sit_entry_set(ses, head); + return; + } + } + + ses = grab_sit_entry_set(); + + ses->start_segno = start_segno; + ses->entry_cnt++; + list_add(&ses->set_list, head); +} + +static void add_sits_in_set(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_info = SM_I(sbi); + struct list_head *set_list = &sm_info->sit_entry_set; + unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap; + unsigned int segno; + + for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi)) + add_sit_entry(segno, set_list); +} + +static void remove_sits_in_journal(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); + struct f2fs_journal *journal = curseg->journal; + int i; + + down_write(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { + unsigned int segno; + bool dirtied; + + segno = le32_to_cpu(segno_in_journal(journal, i)); + dirtied = __mark_sit_entry_dirty(sbi, segno); + + if (!dirtied) + add_sit_entry(segno, &SM_I(sbi)->sit_entry_set); + } + update_sits_in_cursum(journal, -i); + up_write(&curseg->journal_rwsem); +} + +/* + * CP calls this function, which flushes SIT entries including sit_journal, + * and moves prefree segs to free segs. + */ +void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned long *bitmap = sit_i->dirty_sentries_bitmap; + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); + struct f2fs_journal *journal = curseg->journal; + struct sit_entry_set *ses, *tmp; + struct list_head *head = &SM_I(sbi)->sit_entry_set; + bool to_journal = !is_sbi_flag_set(sbi, SBI_IS_RESIZEFS); + struct seg_entry *se; + + down_write(&sit_i->sentry_lock); + + if (!sit_i->dirty_sentries) + goto out; + + /* + * add and account sit entries of dirty bitmap in sit entry + * set temporarily + */ + add_sits_in_set(sbi); + + /* + * if there are no enough space in journal to store dirty sit + * entries, remove all entries from journal and add and account + * them in sit entry set. + */ + if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL) || + !to_journal) + remove_sits_in_journal(sbi); + + /* + * there are two steps to flush sit entries: + * #1, flush sit entries to journal in current cold data summary block. + * #2, flush sit entries to sit page. + */ + list_for_each_entry_safe(ses, tmp, head, set_list) { + struct page *page = NULL; + struct f2fs_sit_block *raw_sit = NULL; + unsigned int start_segno = ses->start_segno; + unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK, + (unsigned long)MAIN_SEGS(sbi)); + unsigned int segno = start_segno; + + if (to_journal && + !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL)) + to_journal = false; + + if (to_journal) { + down_write(&curseg->journal_rwsem); + } else { + page = get_next_sit_page(sbi, start_segno); + raw_sit = page_address(page); + } + + /* flush dirty sit entries in region of current sit set */ + for_each_set_bit_from(segno, bitmap, end) { + int offset, sit_offset; + + se = get_seg_entry(sbi, segno); +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(se->cur_valid_map, se->cur_valid_map_mir, + SIT_VBLOCK_MAP_SIZE)) + f2fs_bug_on(sbi, 1); +#endif + + /* add discard candidates */ + if (!(cpc->reason & CP_DISCARD)) { + cpc->trim_start = segno; + add_discard_addrs(sbi, cpc, false); + } + + if (to_journal) { + offset = f2fs_lookup_journal_in_cursum(journal, + SIT_JOURNAL, segno, 1); + f2fs_bug_on(sbi, offset < 0); + segno_in_journal(journal, offset) = + cpu_to_le32(segno); + seg_info_to_raw_sit(se, + &sit_in_journal(journal, offset)); + check_block_count(sbi, segno, + &sit_in_journal(journal, offset)); + } else { + sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); + seg_info_to_raw_sit(se, + &raw_sit->entries[sit_offset]); + check_block_count(sbi, segno, + &raw_sit->entries[sit_offset]); + } + + __clear_bit(segno, bitmap); + sit_i->dirty_sentries--; + ses->entry_cnt--; + } + + if (to_journal) + up_write(&curseg->journal_rwsem); + else + f2fs_put_page(page, 1); + + f2fs_bug_on(sbi, ses->entry_cnt); + release_sit_entry_set(ses); + } + + f2fs_bug_on(sbi, !list_empty(head)); + f2fs_bug_on(sbi, sit_i->dirty_sentries); +out: + if (cpc->reason & CP_DISCARD) { + __u64 trim_start = cpc->trim_start; + + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) + add_discard_addrs(sbi, cpc, false); + + cpc->trim_start = trim_start; + } + up_write(&sit_i->sentry_lock); + + set_prefree_as_free_segments(sbi); +} + +static int build_sit_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + struct sit_info *sit_i; + unsigned int sit_segs, start; + char *src_bitmap, *bitmap; + unsigned int bitmap_size, main_bitmap_size, sit_bitmap_size; + unsigned int discard_map = f2fs_block_unit_discard(sbi) ? 1 : 0; + + /* allocate memory for SIT information */ + sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL); + if (!sit_i) + return -ENOMEM; + + SM_I(sbi)->sit_info = sit_i; + + sit_i->sentries = + f2fs_kvzalloc(sbi, array_size(sizeof(struct seg_entry), + MAIN_SEGS(sbi)), + GFP_KERNEL); + if (!sit_i->sentries) + return -ENOMEM; + + main_bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); + sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(sbi, main_bitmap_size, + GFP_KERNEL); + if (!sit_i->dirty_sentries_bitmap) + return -ENOMEM; + +#ifdef CONFIG_F2FS_CHECK_FS + bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (3 + discard_map); +#else + bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (2 + discard_map); +#endif + sit_i->bitmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); + if (!sit_i->bitmap) + return -ENOMEM; + + bitmap = sit_i->bitmap; + + for (start = 0; start < MAIN_SEGS(sbi); start++) { + sit_i->sentries[start].cur_valid_map = bitmap; + bitmap += SIT_VBLOCK_MAP_SIZE; + + sit_i->sentries[start].ckpt_valid_map = bitmap; + bitmap += SIT_VBLOCK_MAP_SIZE; + +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sentries[start].cur_valid_map_mir = bitmap; + bitmap += SIT_VBLOCK_MAP_SIZE; +#endif + + if (discard_map) { + sit_i->sentries[start].discard_map = bitmap; + bitmap += SIT_VBLOCK_MAP_SIZE; + } + } + + sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->tmp_map) + return -ENOMEM; + + if (__is_large_section(sbi)) { + sit_i->sec_entries = + f2fs_kvzalloc(sbi, array_size(sizeof(struct sec_entry), + MAIN_SECS(sbi)), + GFP_KERNEL); + if (!sit_i->sec_entries) + return -ENOMEM; + } + + /* get information related with SIT */ + sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1; + + /* setup SIT bitmap from ckeckpoint pack */ + sit_bitmap_size = __bitmap_size(sbi, SIT_BITMAP); + src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); + + sit_i->sit_bitmap = kmemdup(src_bitmap, sit_bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap) + return -ENOMEM; + +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sit_bitmap_mir = kmemdup(src_bitmap, + sit_bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap_mir) + return -ENOMEM; + + sit_i->invalid_segmap = f2fs_kvzalloc(sbi, + main_bitmap_size, GFP_KERNEL); + if (!sit_i->invalid_segmap) + return -ENOMEM; +#endif + + /* init SIT information */ + sit_i->s_ops = &default_salloc_ops; + + sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); + sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; + sit_i->written_valid_blocks = 0; + sit_i->bitmap_size = sit_bitmap_size; + sit_i->dirty_sentries = 0; + sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; + sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); + sit_i->mounted_time = ktime_get_boottime_seconds(); + init_rwsem(&sit_i->sentry_lock); + return 0; +} + +static int build_free_segmap(struct f2fs_sb_info *sbi) +{ + struct free_segmap_info *free_i; + unsigned int bitmap_size, sec_bitmap_size; + + /* allocate memory for free segmap information */ + free_i = f2fs_kzalloc(sbi, sizeof(struct free_segmap_info), GFP_KERNEL); + if (!free_i) + return -ENOMEM; + + SM_I(sbi)->free_info = free_i; + + bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); + free_i->free_segmap = f2fs_kvmalloc(sbi, bitmap_size, GFP_KERNEL); + if (!free_i->free_segmap) + return -ENOMEM; + + sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + free_i->free_secmap = f2fs_kvmalloc(sbi, sec_bitmap_size, GFP_KERNEL); + if (!free_i->free_secmap) + return -ENOMEM; + + /* set all segments as dirty temporarily */ + memset(free_i->free_segmap, 0xff, bitmap_size); + memset(free_i->free_secmap, 0xff, sec_bitmap_size); + + /* init free segmap information */ + free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi)); + free_i->free_segments = 0; + free_i->free_sections = 0; + spin_lock_init(&free_i->segmap_lock); + return 0; +} + +static int build_curseg(struct f2fs_sb_info *sbi) +{ + struct curseg_info *array; + int i; + + array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, + sizeof(*array)), GFP_KERNEL); + if (!array) + return -ENOMEM; + + SM_I(sbi)->curseg_array = array; + + for (i = 0; i < NO_CHECK_TYPE; i++) { + mutex_init(&array[i].curseg_mutex); + array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL); + if (!array[i].sum_blk) + return -ENOMEM; + init_rwsem(&array[i].journal_rwsem); + array[i].journal = f2fs_kzalloc(sbi, + sizeof(struct f2fs_journal), GFP_KERNEL); + if (!array[i].journal) + return -ENOMEM; + if (i < NR_PERSISTENT_LOG) + array[i].seg_type = CURSEG_HOT_DATA + i; + else if (i == CURSEG_COLD_DATA_PINNED) + array[i].seg_type = CURSEG_COLD_DATA; + else if (i == CURSEG_ALL_DATA_ATGC) + array[i].seg_type = CURSEG_COLD_DATA; + array[i].segno = NULL_SEGNO; + array[i].next_blkoff = 0; + array[i].inited = false; + } + return restore_curseg_summaries(sbi); +} + +static int build_sit_entries(struct f2fs_sb_info *sbi) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); + struct f2fs_journal *journal = curseg->journal; + struct seg_entry *se; + struct f2fs_sit_entry sit; + int sit_blk_cnt = SIT_BLK_CNT(sbi); + unsigned int i, start, end; + unsigned int readed, start_blk = 0; + int err = 0; + block_t sit_valid_blocks[2] = {0, 0}; + + do { + readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS, + META_SIT, true); + + start = start_blk * sit_i->sents_per_block; + end = (start_blk + readed) * sit_i->sents_per_block; + + for (; start < end && start < MAIN_SEGS(sbi); start++) { + struct f2fs_sit_block *sit_blk; + struct page *page; + + se = &sit_i->sentries[start]; + page = get_current_sit_page(sbi, start); + if (IS_ERR(page)) + return PTR_ERR(page); + sit_blk = (struct f2fs_sit_block *)page_address(page); + sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; + f2fs_put_page(page, 1); + + err = check_block_count(sbi, start, &sit); + if (err) + return err; + seg_info_from_raw_sit(se, &sit); + + if (se->type >= NR_PERSISTENT_LOG) { + f2fs_err(sbi, "Invalid segment type: %u, segno: %u", + se->type, start); + f2fs_handle_error(sbi, + ERROR_INCONSISTENT_SUM_TYPE); + return -EFSCORRUPTED; + } + + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; + + if (f2fs_block_unit_discard(sbi)) { + /* build discard map only one time */ + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; + } + } + + if (__is_large_section(sbi)) + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks; + } + start_blk += readed; + } while (start_blk < sit_blk_cnt); + + down_read(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { + unsigned int old_valid_blocks; + + start = le32_to_cpu(segno_in_journal(journal, i)); + if (start >= MAIN_SEGS(sbi)) { + f2fs_err(sbi, "Wrong journal entry on segno %u", + start); + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_JOURNAL); + break; + } + + se = &sit_i->sentries[start]; + sit = sit_in_journal(journal, i); + + old_valid_blocks = se->valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] -= old_valid_blocks; + + err = check_block_count(sbi, start, &sit); + if (err) + break; + seg_info_from_raw_sit(se, &sit); + + if (se->type >= NR_PERSISTENT_LOG) { + f2fs_err(sbi, "Invalid segment type: %u, segno: %u", + se->type, start); + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); + break; + } + + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; + + if (f2fs_block_unit_discard(sbi)) { + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks; + sbi->discard_blks -= se->valid_blocks; + } + } + + if (__is_large_section(sbi)) { + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks; + get_sec_entry(sbi, start)->valid_blocks -= + old_valid_blocks; + } + } + up_read(&curseg->journal_rwsem); + + if (err) + return err; + + if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { + f2fs_err(sbi, "SIT is corrupted node# %u vs %u", + sit_valid_blocks[NODE], valid_node_count(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NODE_COUNT); + return -EFSCORRUPTED; + } + + if (sit_valid_blocks[DATA] + sit_valid_blocks[NODE] > + valid_user_blocks(sbi)) { + f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", + sit_valid_blocks[DATA], sit_valid_blocks[NODE], + valid_user_blocks(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_BLOCK_COUNT); + return -EFSCORRUPTED; + } + + return 0; +} + +static void init_free_segmap(struct f2fs_sb_info *sbi) +{ + unsigned int start; + int type; + struct seg_entry *sentry; + + for (start = 0; start < MAIN_SEGS(sbi); start++) { + if (f2fs_usable_blks_in_seg(sbi, start) == 0) + continue; + sentry = get_seg_entry(sbi, start); + if (!sentry->valid_blocks) + __set_free(sbi, start); + else + SIT_I(sbi)->written_valid_blocks += + sentry->valid_blocks; + } + + /* set use the current segments */ + for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) { + struct curseg_info *curseg_t = CURSEG_I(sbi, type); + + __set_test_and_inuse(sbi, curseg_t->segno); + } +} + +static void init_dirty_segmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int segno = 0, offset = 0, secno; + block_t valid_blocks, usable_blks_in_seg; + + while (1) { + /* find dirty segment based on free segmap */ + segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset); + if (segno >= MAIN_SEGS(sbi)) + break; + offset = segno + 1; + valid_blocks = get_valid_blocks(sbi, segno, false); + usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); + if (valid_blocks == usable_blks_in_seg || !valid_blocks) + continue; + if (valid_blocks > usable_blks_in_seg) { + f2fs_bug_on(sbi, 1); + continue; + } + mutex_lock(&dirty_i->seglist_lock); + __locate_dirty_segment(sbi, segno, DIRTY); + mutex_unlock(&dirty_i->seglist_lock); + } + + if (!__is_large_section(sbi)) + return; + + mutex_lock(&dirty_i->seglist_lock); + for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { + valid_blocks = get_valid_blocks(sbi, segno, true); + secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi)) + continue; + if (IS_CURSEC(sbi, secno)) + continue; + set_bit(secno, dirty_i->dirty_secmap); + } + mutex_unlock(&dirty_i->seglist_lock); +} + +static int init_victim_secmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + + dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); + if (!dirty_i->victim_secmap) + return -ENOMEM; + + dirty_i->pinned_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); + if (!dirty_i->pinned_secmap) + return -ENOMEM; + + dirty_i->pinned_secmap_cnt = 0; + dirty_i->enable_pin_section = true; + return 0; +} + +static int build_dirty_segmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i; + unsigned int bitmap_size, i; + + /* allocate memory for dirty segments list information */ + dirty_i = f2fs_kzalloc(sbi, sizeof(struct dirty_seglist_info), + GFP_KERNEL); + if (!dirty_i) + return -ENOMEM; + + SM_I(sbi)->dirty_info = dirty_i; + mutex_init(&dirty_i->seglist_lock); + + bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); + + for (i = 0; i < NR_DIRTY_TYPE; i++) { + dirty_i->dirty_segmap[i] = f2fs_kvzalloc(sbi, bitmap_size, + GFP_KERNEL); + if (!dirty_i->dirty_segmap[i]) + return -ENOMEM; + } + + if (__is_large_section(sbi)) { + bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + dirty_i->dirty_secmap = f2fs_kvzalloc(sbi, + bitmap_size, GFP_KERNEL); + if (!dirty_i->dirty_secmap) + return -ENOMEM; + } + + init_dirty_segmap(sbi); + return init_victim_secmap(sbi); +} + +static int sanity_check_curseg(struct f2fs_sb_info *sbi) +{ + int i; + + /* + * In LFS/SSR curseg, .next_blkoff should point to an unused blkaddr; + * In LFS curseg, all blkaddr after .next_blkoff should be unused. + */ + for (i = 0; i < NR_PERSISTENT_LOG; i++) { + struct curseg_info *curseg = CURSEG_I(sbi, i); + struct seg_entry *se = get_seg_entry(sbi, curseg->segno); + unsigned int blkofs = curseg->next_blkoff; + + if (f2fs_sb_has_readonly(sbi) && + i != CURSEG_HOT_DATA && i != CURSEG_HOT_NODE) + continue; + + sanity_check_seg_type(sbi, curseg->seg_type); + + if (curseg->alloc_type != LFS && curseg->alloc_type != SSR) { + f2fs_err(sbi, + "Current segment has invalid alloc_type:%d", + curseg->alloc_type); + f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); + return -EFSCORRUPTED; + } + + if (f2fs_test_bit(blkofs, se->cur_valid_map)) + goto out; + + if (curseg->alloc_type == SSR) + continue; + + for (blkofs += 1; blkofs < sbi->blocks_per_seg; blkofs++) { + if (!f2fs_test_bit(blkofs, se->cur_valid_map)) + continue; +out: + f2fs_err(sbi, + "Current segment's next free block offset is inconsistent with bitmap, logtype:%u, segno:%u, type:%u, next_blkoff:%u, blkofs:%u", + i, curseg->segno, curseg->alloc_type, + curseg->next_blkoff, blkofs); + f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); + return -EFSCORRUPTED; + } + } + return 0; +} + +#ifdef CONFIG_BLK_DEV_ZONED + +static int check_zone_write_pointer(struct f2fs_sb_info *sbi, + struct f2fs_dev_info *fdev, + struct blk_zone *zone) +{ + unsigned int wp_segno, wp_blkoff, zone_secno, zone_segno, segno; + block_t zone_block, wp_block, last_valid_block; + unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; + int i, s, b, ret; + struct seg_entry *se; + + if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return 0; + + wp_block = fdev->start_blk + (zone->wp >> log_sectors_per_block); + wp_segno = GET_SEGNO(sbi, wp_block); + wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); + zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block); + zone_segno = GET_SEGNO(sbi, zone_block); + zone_secno = GET_SEC_FROM_SEG(sbi, zone_segno); + + if (zone_segno >= MAIN_SEGS(sbi)) + return 0; + + /* + * Skip check of zones cursegs point to, since + * fix_curseg_write_pointer() checks them. + */ + for (i = 0; i < NO_CHECK_TYPE; i++) + if (zone_secno == GET_SEC_FROM_SEG(sbi, + CURSEG_I(sbi, i)->segno)) + return 0; + + /* + * Get last valid block of the zone. + */ + last_valid_block = zone_block - 1; + for (s = sbi->segs_per_sec - 1; s >= 0; s--) { + segno = zone_segno + s; + se = get_seg_entry(sbi, segno); + for (b = sbi->blocks_per_seg - 1; b >= 0; b--) + if (f2fs_test_bit(b, se->cur_valid_map)) { + last_valid_block = START_BLOCK(sbi, segno) + b; + break; + } + if (last_valid_block >= zone_block) + break; + } + + /* + * If last valid block is beyond the write pointer, report the + * inconsistency. This inconsistency does not cause write error + * because the zone will not be selected for write operation until + * it get discarded. Just report it. + */ + if (last_valid_block >= wp_block) { + f2fs_notice(sbi, "Valid block beyond write pointer: " + "valid block[0x%x,0x%x] wp[0x%x,0x%x]", + GET_SEGNO(sbi, last_valid_block), + GET_BLKOFF_FROM_SEG0(sbi, last_valid_block), + wp_segno, wp_blkoff); + return 0; + } + + /* + * If there is no valid block in the zone and if write pointer is + * not at zone start, reset the write pointer. + */ + if (last_valid_block + 1 == zone_block && zone->wp != zone->start) { + f2fs_notice(sbi, + "Zone without valid block has non-zero write " + "pointer. Reset the write pointer: wp[0x%x,0x%x]", + wp_segno, wp_blkoff); + ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block, + zone->len >> log_sectors_per_block); + if (ret) { + f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", + fdev->path, ret); + return ret; + } + } + + return 0; +} + +static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi, + block_t zone_blkaddr) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (!bdev_is_zoned(FDEV(i).bdev)) + continue; + if (sbi->s_ndevs == 1 || (FDEV(i).start_blk <= zone_blkaddr && + zone_blkaddr <= FDEV(i).end_blk)) + return &FDEV(i); + } + + return NULL; +} + +static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + memcpy(data, zone, sizeof(struct blk_zone)); + return 0; +} + +static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *cs = CURSEG_I(sbi, type); + struct f2fs_dev_info *zbd; + struct blk_zone zone; + unsigned int cs_section, wp_segno, wp_blkoff, wp_sector_off; + block_t cs_zone_block, wp_block; + unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; + sector_t zone_sector; + int err; + + cs_section = GET_SEC_FROM_SEG(sbi, cs->segno); + cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section)); + + zbd = get_target_zoned_dev(sbi, cs_zone_block); + if (!zbd) + return 0; + + /* report zone for the sector the curseg points to */ + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) + << log_sectors_per_block; + err = blkdev_report_zones(zbd->bdev, zone_sector, 1, + report_one_zone_cb, &zone); + if (err != 1) { + f2fs_err(sbi, "Report zone failed: %s errno=(%d)", + zbd->path, err); + return err; + } + + if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return 0; + + wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block); + wp_segno = GET_SEGNO(sbi, wp_block); + wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); + wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0); + + if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff && + wp_sector_off == 0) + return 0; + + f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: " + "curseg[0x%x,0x%x] wp[0x%x,0x%x]", + type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff); + + f2fs_notice(sbi, "Assign new section to curseg[%d]: " + "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); + + f2fs_allocate_new_section(sbi, type, true); + + /* check consistency of the zone curseg pointed to */ + if (check_zone_write_pointer(sbi, zbd, &zone)) + return -EIO; + + /* check newly assigned zone */ + cs_section = GET_SEC_FROM_SEG(sbi, cs->segno); + cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section)); + + zbd = get_target_zoned_dev(sbi, cs_zone_block); + if (!zbd) + return 0; + + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) + << log_sectors_per_block; + err = blkdev_report_zones(zbd->bdev, zone_sector, 1, + report_one_zone_cb, &zone); + if (err != 1) { + f2fs_err(sbi, "Report zone failed: %s errno=(%d)", + zbd->path, err); + return err; + } + + if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return 0; + + if (zone.wp != zone.start) { + f2fs_notice(sbi, + "New zone for curseg[%d] is not yet discarded. " + "Reset the zone: curseg[0x%x,0x%x]", + type, cs->segno, cs->next_blkoff); + err = __f2fs_issue_discard_zone(sbi, zbd->bdev, + zone_sector >> log_sectors_per_block, + zone.len >> log_sectors_per_block); + if (err) { + f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", + zbd->path, err); + return err; + } + } + + return 0; +} + +int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) +{ + int i, ret; + + for (i = 0; i < NR_PERSISTENT_LOG; i++) { + ret = fix_curseg_write_pointer(sbi, i); + if (ret) + return ret; + } + + return 0; +} + +struct check_zone_write_pointer_args { + struct f2fs_sb_info *sbi; + struct f2fs_dev_info *fdev; +}; + +static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct check_zone_write_pointer_args *args; + + args = (struct check_zone_write_pointer_args *)data; + + return check_zone_write_pointer(args->sbi, args->fdev, zone); +} + +int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) +{ + int i, ret; + struct check_zone_write_pointer_args args; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (!bdev_is_zoned(FDEV(i).bdev)) + continue; + + args.sbi = sbi; + args.fdev = &FDEV(i); + ret = blkdev_report_zones(FDEV(i).bdev, 0, BLK_ALL_ZONES, + check_zone_write_pointer_cb, &args); + if (ret < 0) + return ret; + } + + return 0; +} + +/* + * Return the number of usable blocks in a segment. The number of blocks + * returned is always equal to the number of blocks in a segment for + * segments fully contained within a sequential zone capacity or a + * conventional zone. For segments partially contained in a sequential + * zone capacity, the number of usable blocks up to the zone capacity + * is returned. 0 is returned in all other cases. + */ +static inline unsigned int f2fs_usable_zone_blks_in_seg( + struct f2fs_sb_info *sbi, unsigned int segno) +{ + block_t seg_start, sec_start_blkaddr, sec_cap_blkaddr; + unsigned int secno; + + if (!sbi->unusable_blocks_per_sec) + return sbi->blocks_per_seg; + + secno = GET_SEC_FROM_SEG(sbi, segno); + seg_start = START_BLOCK(sbi, segno); + sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); + sec_cap_blkaddr = sec_start_blkaddr + CAP_BLKS_PER_SEC(sbi); + + /* + * If segment starts before zone capacity and spans beyond + * zone capacity, then usable blocks are from seg start to + * zone capacity. If the segment starts after the zone capacity, + * then there are no usable blocks. + */ + if (seg_start >= sec_cap_blkaddr) + return 0; + if (seg_start + sbi->blocks_per_seg > sec_cap_blkaddr) + return sec_cap_blkaddr - seg_start; + + return sbi->blocks_per_seg; +} +#else +int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) +{ + return 0; +} + +int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) +{ + return 0; +} + +static inline unsigned int f2fs_usable_zone_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + return 0; +} + +#endif +unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return f2fs_usable_zone_blks_in_seg(sbi, segno); + + return sbi->blocks_per_seg; +} + +unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return CAP_SEGS_PER_SEC(sbi); + + return sbi->segs_per_sec; +} + +/* + * Update min, max modified time for cost-benefit GC algorithm + */ +static void init_min_max_mtime(struct f2fs_sb_info *sbi) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int segno; + + down_write(&sit_i->sentry_lock); + + sit_i->min_mtime = ULLONG_MAX; + + for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { + unsigned int i; + unsigned long long mtime = 0; + + for (i = 0; i < sbi->segs_per_sec; i++) + mtime += get_seg_entry(sbi, segno + i)->mtime; + + mtime = div_u64(mtime, sbi->segs_per_sec); + + if (sit_i->min_mtime > mtime) + sit_i->min_mtime = mtime; + } + sit_i->max_mtime = get_mtime(sbi, false); + sit_i->dirty_max_mtime = 0; + up_write(&sit_i->sentry_lock); +} + +int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_sm_info *sm_info; + int err; + + sm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_sm_info), GFP_KERNEL); + if (!sm_info) + return -ENOMEM; + + /* init sm info */ + sbi->sm_info = sm_info; + sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); + sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); + sm_info->segment_count = le32_to_cpu(raw_super->segment_count); + sm_info->reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); + sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); + sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); + sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + sm_info->rec_prefree_segments = sm_info->main_segments * + DEF_RECLAIM_PREFREE_SEGMENTS / 100; + if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS) + sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; + + if (!f2fs_lfs_mode(sbi)) + sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; + sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; + sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; + sm_info->min_seq_blocks = sbi->blocks_per_seg; + sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; + sm_info->min_ssr_sections = reserved_sections(sbi); + + INIT_LIST_HEAD(&sm_info->sit_entry_set); + + init_f2fs_rwsem(&sm_info->curseg_lock); + + if (!f2fs_readonly(sbi->sb)) { + err = f2fs_create_flush_cmd_control(sbi); + if (err) + return err; + } + + err = create_discard_cmd_control(sbi); + if (err) + return err; + + err = build_sit_info(sbi); + if (err) + return err; + err = build_free_segmap(sbi); + if (err) + return err; + err = build_curseg(sbi); + if (err) + return err; + + /* reinit free segmap based on SIT */ + err = build_sit_entries(sbi); + if (err) + return err; + + init_free_segmap(sbi); + err = build_dirty_segmap(sbi); + if (err) + return err; + + err = sanity_check_curseg(sbi); + if (err) + return err; + + init_min_max_mtime(sbi); + return 0; +} + +static void discard_dirty_segmap(struct f2fs_sb_info *sbi, + enum dirty_type dirty_type) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + + mutex_lock(&dirty_i->seglist_lock); + kvfree(dirty_i->dirty_segmap[dirty_type]); + dirty_i->nr_dirty[dirty_type] = 0; + mutex_unlock(&dirty_i->seglist_lock); +} + +static void destroy_victim_secmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + + kvfree(dirty_i->pinned_secmap); + kvfree(dirty_i->victim_secmap); +} + +static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + int i; + + if (!dirty_i) + return; + + /* discard pre-free/dirty segments list */ + for (i = 0; i < NR_DIRTY_TYPE; i++) + discard_dirty_segmap(sbi, i); + + if (__is_large_section(sbi)) { + mutex_lock(&dirty_i->seglist_lock); + kvfree(dirty_i->dirty_secmap); + mutex_unlock(&dirty_i->seglist_lock); + } + + destroy_victim_secmap(sbi); + SM_I(sbi)->dirty_info = NULL; + kfree(dirty_i); +} + +static void destroy_curseg(struct f2fs_sb_info *sbi) +{ + struct curseg_info *array = SM_I(sbi)->curseg_array; + int i; + + if (!array) + return; + SM_I(sbi)->curseg_array = NULL; + for (i = 0; i < NR_CURSEG_TYPE; i++) { + kfree(array[i].sum_blk); + kfree(array[i].journal); + } + kfree(array); +} + +static void destroy_free_segmap(struct f2fs_sb_info *sbi) +{ + struct free_segmap_info *free_i = SM_I(sbi)->free_info; + + if (!free_i) + return; + SM_I(sbi)->free_info = NULL; + kvfree(free_i->free_segmap); + kvfree(free_i->free_secmap); + kfree(free_i); +} + +static void destroy_sit_info(struct f2fs_sb_info *sbi) +{ + struct sit_info *sit_i = SIT_I(sbi); + + if (!sit_i) + return; + + if (sit_i->sentries) + kvfree(sit_i->bitmap); + kfree(sit_i->tmp_map); + + kvfree(sit_i->sentries); + kvfree(sit_i->sec_entries); + kvfree(sit_i->dirty_sentries_bitmap); + + SM_I(sbi)->sit_info = NULL; + kvfree(sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + kvfree(sit_i->sit_bitmap_mir); + kvfree(sit_i->invalid_segmap); +#endif + kfree(sit_i); +} + +void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_info = SM_I(sbi); + + if (!sm_info) + return; + f2fs_destroy_flush_cmd_control(sbi, true); + destroy_discard_cmd_control(sbi); + destroy_dirty_segmap(sbi); + destroy_curseg(sbi); + destroy_free_segmap(sbi); + destroy_sit_info(sbi); + sbi->sm_info = NULL; + kfree(sm_info); +} + +int __init f2fs_create_segment_manager_caches(void) +{ + discard_entry_slab = f2fs_kmem_cache_create("f2fs_discard_entry", + sizeof(struct discard_entry)); + if (!discard_entry_slab) + goto fail; + + discard_cmd_slab = f2fs_kmem_cache_create("f2fs_discard_cmd", + sizeof(struct discard_cmd)); + if (!discard_cmd_slab) + goto destroy_discard_entry; + + sit_entry_set_slab = f2fs_kmem_cache_create("f2fs_sit_entry_set", + sizeof(struct sit_entry_set)); + if (!sit_entry_set_slab) + goto destroy_discard_cmd; + + revoke_entry_slab = f2fs_kmem_cache_create("f2fs_revoke_entry", + sizeof(struct revoke_entry)); + if (!revoke_entry_slab) + goto destroy_sit_entry_set; + return 0; + +destroy_sit_entry_set: + kmem_cache_destroy(sit_entry_set_slab); +destroy_discard_cmd: + kmem_cache_destroy(discard_cmd_slab); +destroy_discard_entry: + kmem_cache_destroy(discard_entry_slab); +fail: + return -ENOMEM; +} + +void f2fs_destroy_segment_manager_caches(void) +{ + kmem_cache_destroy(sit_entry_set_slab); + kmem_cache_destroy(discard_cmd_slab); + kmem_cache_destroy(discard_entry_slab); + kmem_cache_destroy(revoke_entry_slab); +} diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h new file mode 100644 index 000000000..f3951e8ad --- /dev/null +++ b/fs/f2fs/segment.h @@ -0,0 +1,959 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * fs/f2fs/segment.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + */ +#include <linux/blkdev.h> +#include <linux/backing-dev.h> + +/* constant macro */ +#define NULL_SEGNO ((unsigned int)(~0)) +#define NULL_SECNO ((unsigned int)(~0)) + +#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ +#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ + +#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ +#define F2FS_MIN_META_SEGMENTS 8 /* SB + 2 (CP + SIT + NAT) + SSA */ + +/* L: Logical segment # in volume, R: Relative segment # in main area */ +#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) +#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) + +#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE) +#define SE_PAGETYPE(se) ((IS_NODESEG((se)->type) ? NODE : DATA)) + +static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, + unsigned short seg_type) +{ + f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG); +} + +#define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) +#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) +#define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA) + +#define IS_CURSEG(sbi, seg) \ + (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno)) + +#define IS_CURSEC(sbi, secno) \ + (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \ + (sbi)->segs_per_sec)) + +#define MAIN_BLKADDR(sbi) \ + (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->main_blkaddr)) +#define SEG0_BLKADDR(sbi) \ + (SM_I(sbi) ? SM_I(sbi)->seg0_blkaddr : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment0_blkaddr)) + +#define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) +#define MAIN_SECS(sbi) ((sbi)->total_sections) + +#define TOTAL_SEGS(sbi) \ + (SM_I(sbi) ? SM_I(sbi)->segment_count : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count)) +#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) + +#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) +#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \ + (sbi)->log_blocks_per_seg)) + +#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ + (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg)) + +#define NEXT_FREE_BLKADDR(sbi, curseg) \ + (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff) + +#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) +#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg) +#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) + +#define GET_SEGNO(sbi, blk_addr) \ + ((!__is_valid_data_blkaddr(blk_addr)) ? \ + NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ + GET_SEGNO_FROM_SEG0(sbi, blk_addr))) +#define BLKS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) +#define CAP_BLKS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec * (sbi)->blocks_per_seg - \ + (sbi)->unusable_blocks_per_sec) +#define CAP_SEGS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec - ((sbi)->unusable_blocks_per_sec >>\ + (sbi)->log_blocks_per_seg)) +#define GET_SEC_FROM_SEG(sbi, segno) \ + (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec) +#define GET_SEG_FROM_SEC(sbi, secno) \ + ((secno) * (sbi)->segs_per_sec) +#define GET_ZONE_FROM_SEC(sbi, secno) \ + (((secno) == -1) ? -1: (secno) / (sbi)->secs_per_zone) +#define GET_ZONE_FROM_SEG(sbi, segno) \ + GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) + +#define GET_SUM_BLOCK(sbi, segno) \ + ((sbi)->sm_info->ssa_blkaddr + (segno)) + +#define GET_SUM_TYPE(footer) ((footer)->entry_type) +#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) + +#define SIT_ENTRY_OFFSET(sit_i, segno) \ + ((segno) % (sit_i)->sents_per_block) +#define SIT_BLOCK_OFFSET(segno) \ + ((segno) / SIT_ENTRY_PER_BLOCK) +#define START_SEGNO(segno) \ + (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) +#define SIT_BLK_CNT(sbi) \ + DIV_ROUND_UP(MAIN_SEGS(sbi), SIT_ENTRY_PER_BLOCK) +#define f2fs_bitmap_size(nr) \ + (BITS_TO_LONGS(nr) * sizeof(unsigned long)) + +#define SECTOR_FROM_BLOCK(blk_addr) \ + (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) +#define SECTOR_TO_BLOCK(sectors) \ + ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) + +/* + * indicate a block allocation direction: RIGHT and LEFT. + * RIGHT means allocating new sections towards the end of volume. + * LEFT means the opposite direction. + */ +enum { + ALLOC_RIGHT = 0, + ALLOC_LEFT +}; + +/* + * In the victim_sel_policy->alloc_mode, there are three block allocation modes. + * LFS writes data sequentially with cleaning operations. + * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations. + * AT_SSR (Age Threshold based Slack Space Recycle) merges fragments into + * fragmented segment which has similar aging degree. + */ +enum { + LFS = 0, + SSR, + AT_SSR, +}; + +/* + * In the victim_sel_policy->gc_mode, there are three gc, aka cleaning, modes. + * GC_CB is based on cost-benefit algorithm. + * GC_GREEDY is based on greedy algorithm. + * GC_AT is based on age-threshold algorithm. + */ +enum { + GC_CB = 0, + GC_GREEDY, + GC_AT, + ALLOC_NEXT, + FLUSH_DEVICE, + MAX_GC_POLICY, +}; + +/* + * BG_GC means the background cleaning job. + * FG_GC means the on-demand cleaning job. + */ +enum { + BG_GC = 0, + FG_GC, +}; + +/* for a function parameter to select a victim segment */ +struct victim_sel_policy { + int alloc_mode; /* LFS or SSR */ + int gc_mode; /* GC_CB or GC_GREEDY */ + unsigned long *dirty_bitmap; /* dirty segment/section bitmap */ + unsigned int max_search; /* + * maximum # of segments/sections + * to search + */ + unsigned int offset; /* last scanned bitmap offset */ + unsigned int ofs_unit; /* bitmap search unit */ + unsigned int min_cost; /* minimum cost */ + unsigned long long oldest_age; /* oldest age of segments having the same min cost */ + unsigned int min_segno; /* segment # having min. cost */ + unsigned long long age; /* mtime of GCed section*/ + unsigned long long age_threshold;/* age threshold */ +}; + +struct seg_entry { + unsigned int type:6; /* segment type like CURSEG_XXX_TYPE */ + unsigned int valid_blocks:10; /* # of valid blocks */ + unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ + unsigned int padding:6; /* padding */ + unsigned char *cur_valid_map; /* validity bitmap of blocks */ +#ifdef CONFIG_F2FS_CHECK_FS + unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */ +#endif + /* + * # of valid blocks and the validity bitmap stored in the last + * checkpoint pack. This information is used by the SSR mode. + */ + unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */ + unsigned char *discard_map; + unsigned long long mtime; /* modification time of the segment */ +}; + +struct sec_entry { + unsigned int valid_blocks; /* # of valid blocks in a section */ +}; + +struct segment_allocation { + void (*allocate_segment)(struct f2fs_sb_info *, int, bool); +}; + +#define MAX_SKIP_GC_COUNT 16 + +struct revoke_entry { + struct list_head list; + block_t old_addr; /* for revoking when fail to commit */ + pgoff_t index; +}; + +struct sit_info { + const struct segment_allocation *s_ops; + + block_t sit_base_addr; /* start block address of SIT area */ + block_t sit_blocks; /* # of blocks used by SIT area */ + block_t written_valid_blocks; /* # of valid blocks in main area */ + char *bitmap; /* all bitmaps pointer */ + char *sit_bitmap; /* SIT bitmap pointer */ +#ifdef CONFIG_F2FS_CHECK_FS + char *sit_bitmap_mir; /* SIT bitmap mirror */ + + /* bitmap of segments to be ignored by GC in case of errors */ + unsigned long *invalid_segmap; +#endif + unsigned int bitmap_size; /* SIT bitmap size */ + + unsigned long *tmp_map; /* bitmap for temporal use */ + unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */ + unsigned int dirty_sentries; /* # of dirty sentries */ + unsigned int sents_per_block; /* # of SIT entries per block */ + struct rw_semaphore sentry_lock; /* to protect SIT cache */ + struct seg_entry *sentries; /* SIT segment-level cache */ + struct sec_entry *sec_entries; /* SIT section-level cache */ + + /* for cost-benefit algorithm in cleaning procedure */ + unsigned long long elapsed_time; /* elapsed time after mount */ + unsigned long long mounted_time; /* mount time */ + unsigned long long min_mtime; /* min. modification time */ + unsigned long long max_mtime; /* max. modification time */ + unsigned long long dirty_min_mtime; /* rerange candidates in GC_AT */ + unsigned long long dirty_max_mtime; /* rerange candidates in GC_AT */ + + unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ +}; + +struct free_segmap_info { + unsigned int start_segno; /* start segment number logically */ + unsigned int free_segments; /* # of free segments */ + unsigned int free_sections; /* # of free sections */ + spinlock_t segmap_lock; /* free segmap lock */ + unsigned long *free_segmap; /* free segment bitmap */ + unsigned long *free_secmap; /* free section bitmap */ +}; + +/* Notice: The order of dirty type is same with CURSEG_XXX in f2fs.h */ +enum dirty_type { + DIRTY_HOT_DATA, /* dirty segments assigned as hot data logs */ + DIRTY_WARM_DATA, /* dirty segments assigned as warm data logs */ + DIRTY_COLD_DATA, /* dirty segments assigned as cold data logs */ + DIRTY_HOT_NODE, /* dirty segments assigned as hot node logs */ + DIRTY_WARM_NODE, /* dirty segments assigned as warm node logs */ + DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */ + DIRTY, /* to count # of dirty segments */ + PRE, /* to count # of entirely obsolete segments */ + NR_DIRTY_TYPE +}; + +struct dirty_seglist_info { + const struct victim_selection *v_ops; /* victim selction operation */ + unsigned long *dirty_segmap[NR_DIRTY_TYPE]; + unsigned long *dirty_secmap; + struct mutex seglist_lock; /* lock for segment bitmaps */ + int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ + unsigned long *victim_secmap; /* background GC victims */ + unsigned long *pinned_secmap; /* pinned victims from foreground GC */ + unsigned int pinned_secmap_cnt; /* count of victims which has pinned data */ + bool enable_pin_section; /* enable pinning section */ +}; + +/* victim selection function for cleaning and SSR */ +struct victim_selection { + int (*get_victim)(struct f2fs_sb_info *, unsigned int *, + int, int, char, unsigned long long); +}; + +/* for active log information */ +struct curseg_info { + struct mutex curseg_mutex; /* lock for consistency */ + struct f2fs_summary_block *sum_blk; /* cached summary block */ + struct rw_semaphore journal_rwsem; /* protect journal area */ + struct f2fs_journal *journal; /* cached journal info */ + unsigned char alloc_type; /* current allocation type */ + unsigned short seg_type; /* segment type like CURSEG_XXX_TYPE */ + unsigned int segno; /* current segment number */ + unsigned short next_blkoff; /* next block offset to write */ + unsigned int zone; /* current zone number */ + unsigned int next_segno; /* preallocated segment */ + int fragment_remained_chunk; /* remained block size in a chunk for block fragmentation mode */ + bool inited; /* indicate inmem log is inited */ +}; + +struct sit_entry_set { + struct list_head set_list; /* link with all sit sets */ + unsigned int start_segno; /* start segno of sits in set */ + unsigned int entry_cnt; /* the # of sit entries in set */ +}; + +/* + * inline functions + */ +static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type) +{ + return (struct curseg_info *)(SM_I(sbi)->curseg_array + type); +} + +static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + return &sit_i->sentries[segno]; +} + +static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)]; +} + +static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, + unsigned int segno, bool use_section) +{ + /* + * In order to get # of valid blocks in a section instantly from many + * segments, f2fs manages two counting structures separately. + */ + if (use_section && __is_large_section(sbi)) + return get_sec_entry(sbi, segno)->valid_blocks; + else + return get_seg_entry(sbi, segno)->valid_blocks; +} + +static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi, + unsigned int segno, bool use_section) +{ + if (use_section && __is_large_section(sbi)) { + unsigned int start_segno = START_SEGNO(segno); + unsigned int blocks = 0; + int i; + + for (i = 0; i < sbi->segs_per_sec; i++, start_segno++) { + struct seg_entry *se = get_seg_entry(sbi, start_segno); + + blocks += se->ckpt_valid_blocks; + } + return blocks; + } + return get_seg_entry(sbi, segno)->ckpt_valid_blocks; +} + +static inline void seg_info_from_raw_sit(struct seg_entry *se, + struct f2fs_sit_entry *rs) +{ + se->valid_blocks = GET_SIT_VBLOCKS(rs); + se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs); + memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); + memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#ifdef CONFIG_F2FS_CHECK_FS + memcpy(se->cur_valid_map_mir, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#endif + se->type = GET_SIT_TYPE(rs); + se->mtime = le64_to_cpu(rs->mtime); +} + +static inline void __seg_info_to_raw_sit(struct seg_entry *se, + struct f2fs_sit_entry *rs) +{ + unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) | + se->valid_blocks; + rs->vblocks = cpu_to_le16(raw_vblocks); + memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); + rs->mtime = cpu_to_le64(se->mtime); +} + +static inline void seg_info_to_sit_page(struct f2fs_sb_info *sbi, + struct page *page, unsigned int start) +{ + struct f2fs_sit_block *raw_sit; + struct seg_entry *se; + struct f2fs_sit_entry *rs; + unsigned int end = min(start + SIT_ENTRY_PER_BLOCK, + (unsigned long)MAIN_SEGS(sbi)); + int i; + + raw_sit = (struct f2fs_sit_block *)page_address(page); + memset(raw_sit, 0, PAGE_SIZE); + for (i = 0; i < end - start; i++) { + rs = &raw_sit->entries[i]; + se = get_seg_entry(sbi, start + i); + __seg_info_to_raw_sit(se, rs); + } +} + +static inline void seg_info_to_raw_sit(struct seg_entry *se, + struct f2fs_sit_entry *rs) +{ + __seg_info_to_raw_sit(se, rs); + + memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); + se->ckpt_valid_blocks = se->valid_blocks; +} + +static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, + unsigned int max, unsigned int segno) +{ + unsigned int ret; + spin_lock(&free_i->segmap_lock); + ret = find_next_bit(free_i->free_segmap, max, segno); + spin_unlock(&free_i->segmap_lock); + return ret; +} + +static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); + unsigned int next; + unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno); + + spin_lock(&free_i->segmap_lock); + clear_bit(segno, free_i->free_segmap); + free_i->free_segments++; + + next = find_next_bit(free_i->free_segmap, + start_segno + sbi->segs_per_sec, start_segno); + if (next >= start_segno + usable_segs) { + clear_bit(secno, free_i->free_secmap); + free_i->free_sections++; + } + spin_unlock(&free_i->segmap_lock); +} + +static inline void __set_inuse(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + set_bit(segno, free_i->free_segmap); + free_i->free_segments--; + if (!test_and_set_bit(secno, free_i->free_secmap)) + free_i->free_sections--; +} + +static inline void __set_test_and_free(struct f2fs_sb_info *sbi, + unsigned int segno, bool inmem) +{ + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); + unsigned int next; + unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno); + + spin_lock(&free_i->segmap_lock); + if (test_and_clear_bit(segno, free_i->free_segmap)) { + free_i->free_segments++; + + if (!inmem && IS_CURSEC(sbi, secno)) + goto skip_free; + next = find_next_bit(free_i->free_segmap, + start_segno + sbi->segs_per_sec, start_segno); + if (next >= start_segno + usable_segs) { + if (test_and_clear_bit(secno, free_i->free_secmap)) + free_i->free_sections++; + } + } +skip_free: + spin_unlock(&free_i->segmap_lock); +} + +static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + spin_lock(&free_i->segmap_lock); + if (!test_and_set_bit(segno, free_i->free_segmap)) { + free_i->free_segments--; + if (!test_and_set_bit(secno, free_i->free_secmap)) + free_i->free_sections--; + } + spin_unlock(&free_i->segmap_lock); +} + +static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, + void *dst_addr) +{ + struct sit_info *sit_i = SIT_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(sit_i->sit_bitmap, sit_i->sit_bitmap_mir, + sit_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif + memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size); +} + +static inline block_t written_block_count(struct f2fs_sb_info *sbi) +{ + return SIT_I(sbi)->written_valid_blocks; +} + +static inline unsigned int free_segments(struct f2fs_sb_info *sbi) +{ + return FREE_I(sbi)->free_segments; +} + +static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi) +{ + return SM_I(sbi)->reserved_segments + + SM_I(sbi)->additional_reserved_segments; +} + +static inline unsigned int free_sections(struct f2fs_sb_info *sbi) +{ + return FREE_I(sbi)->free_sections; +} + +static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) +{ + return DIRTY_I(sbi)->nr_dirty[PRE]; +} + +static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi) +{ + return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] + + DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_DATA] + + DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_DATA] + + DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_NODE] + + DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_NODE] + + DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_NODE]; +} + +static inline int overprovision_segments(struct f2fs_sb_info *sbi) +{ + return SM_I(sbi)->ovp_segments; +} + +static inline int reserved_sections(struct f2fs_sb_info *sbi) +{ + return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi)); +} + +static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, + unsigned int node_blocks, unsigned int dent_blocks) +{ + + unsigned int segno, left_blocks; + int i; + + /* check current node segment */ + for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { + segno = CURSEG_I(sbi, i)->segno; + left_blocks = f2fs_usable_blks_in_seg(sbi, segno) - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; + + if (node_blocks > left_blocks) + return false; + } + + /* check current data segment */ + segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; + left_blocks = f2fs_usable_blks_in_seg(sbi, segno) - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; + if (dent_blocks > left_blocks) + return false; + return true; +} + +/* + * calculate needed sections for dirty node/dentry + * and call has_curseg_enough_space + */ +static inline void __get_secs_required(struct f2fs_sb_info *sbi, + unsigned int *lower_p, unsigned int *upper_p, bool *curseg_p) +{ + unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + + get_pages(sbi, F2FS_DIRTY_DENTS) + + get_pages(sbi, F2FS_DIRTY_IMETA); + unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int node_secs = total_node_blocks / CAP_BLKS_PER_SEC(sbi); + unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi); + unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi); + unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi); + + if (lower_p) + *lower_p = node_secs + dent_secs; + if (upper_p) + *upper_p = node_secs + dent_secs + + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0); + if (curseg_p) + *curseg_p = has_curseg_enough_space(sbi, + node_blocks, dent_blocks); +} + +static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, + int freed, int needed) +{ + unsigned int free_secs, lower_secs, upper_secs; + bool curseg_space; + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return false; + + __get_secs_required(sbi, &lower_secs, &upper_secs, &curseg_space); + + free_secs = free_sections(sbi) + freed; + lower_secs += needed + reserved_sections(sbi); + upper_secs += needed + reserved_sections(sbi); + + if (free_secs > upper_secs) + return false; + else if (free_secs <= lower_secs) + return true; + return !curseg_space; +} + +static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) +{ + if (likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return true; + if (likely(!has_not_enough_free_secs(sbi, 0, 0))) + return true; + return false; +} + +static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) +{ + return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments; +} + +static inline int utilization(struct f2fs_sb_info *sbi) +{ + return div_u64((u64)valid_user_blocks(sbi) * 100, + sbi->user_block_count); +} + +/* + * Sometimes f2fs may be better to drop out-of-place update policy. + * And, users can control the policy through sysfs entries. + * There are five policies with triggering conditions as follows. + * F2FS_IPU_FORCE - all the time, + * F2FS_IPU_SSR - if SSR mode is activated, + * F2FS_IPU_UTIL - if FS utilization is over threashold, + * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over + * threashold, + * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash + * storages. IPU will be triggered only if the # of dirty + * pages over min_fsync_blocks. (=default option) + * F2FS_IPU_ASYNC - do IPU given by asynchronous write requests. + * F2FS_IPU_NOCACHE - disable IPU bio cache. + * F2FS_IPU_HONOR_OPU_WRITE - use OPU write prior to IPU write if inode has + * FI_OPU_WRITE flag. + * F2FS_IPU_DISABLE - disable IPU. (=default option in LFS mode) + */ +#define DEF_MIN_IPU_UTIL 70 +#define DEF_MIN_FSYNC_BLOCKS 8 +#define DEF_MIN_HOT_BLOCKS 16 + +#define SMALL_VOLUME_SEGMENTS (16 * 512) /* 16GB */ + +enum { + F2FS_IPU_FORCE, + F2FS_IPU_SSR, + F2FS_IPU_UTIL, + F2FS_IPU_SSR_UTIL, + F2FS_IPU_FSYNC, + F2FS_IPU_ASYNC, + F2FS_IPU_NOCACHE, + F2FS_IPU_HONOR_OPU_WRITE, +}; + +static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, + int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + return curseg->segno; +} + +static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi, + int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + return curseg->alloc_type; +} + +static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + return curseg->next_blkoff; +} + +static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) +{ + f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1); +} + +static inline void verify_fio_blkaddr(struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = fio->sbi; + + if (__is_valid_data_blkaddr(fio->old_blkaddr)) + verify_blkaddr(sbi, fio->old_blkaddr, __is_meta_io(fio) ? + META_GENERIC : DATA_GENERIC); + verify_blkaddr(sbi, fio->new_blkaddr, __is_meta_io(fio) ? + META_GENERIC : DATA_GENERIC_ENHANCE); +} + +/* + * Summary block is always treated as an invalid block + */ +static inline int check_block_count(struct f2fs_sb_info *sbi, + int segno, struct f2fs_sit_entry *raw_sit) +{ + bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; + int valid_blocks = 0; + int cur_pos = 0, next_pos; + unsigned int usable_blks_per_seg = f2fs_usable_blks_in_seg(sbi, segno); + + /* check bitmap with valid block count */ + do { + if (is_valid) { + next_pos = find_next_zero_bit_le(&raw_sit->valid_map, + usable_blks_per_seg, + cur_pos); + valid_blocks += next_pos - cur_pos; + } else + next_pos = find_next_bit_le(&raw_sit->valid_map, + usable_blks_per_seg, + cur_pos); + cur_pos = next_pos; + is_valid = !is_valid; + } while (cur_pos < usable_blks_per_seg); + + if (unlikely(GET_SIT_VBLOCKS(raw_sit) != valid_blocks)) { + f2fs_err(sbi, "Mismatch valid blocks %d vs. %d", + GET_SIT_VBLOCKS(raw_sit), valid_blocks); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); + return -EFSCORRUPTED; + } + + if (usable_blks_per_seg < sbi->blocks_per_seg) + f2fs_bug_on(sbi, find_next_bit_le(&raw_sit->valid_map, + sbi->blocks_per_seg, + usable_blks_per_seg) != sbi->blocks_per_seg); + + /* check segment usage, and check boundary of a given segment number */ + if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg + || segno > TOTAL_SEGS(sbi) - 1)) { + f2fs_err(sbi, "Wrong valid blocks %d or segno %u", + GET_SIT_VBLOCKS(raw_sit), segno); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); + return -EFSCORRUPTED; + } + return 0; +} + +static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, + unsigned int start) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int offset = SIT_BLOCK_OFFSET(start); + block_t blk_addr = sit_i->sit_base_addr + offset; + + check_seg_range(sbi, start); + +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_bit(offset, sit_i->sit_bitmap) != + f2fs_test_bit(offset, sit_i->sit_bitmap_mir)) + f2fs_bug_on(sbi, 1); +#endif + + /* calculate sit block address */ + if (f2fs_test_bit(offset, sit_i->sit_bitmap)) + blk_addr += sit_i->sit_blocks; + + return blk_addr; +} + +static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi, + pgoff_t block_addr) +{ + struct sit_info *sit_i = SIT_I(sbi); + block_addr -= sit_i->sit_base_addr; + if (block_addr < sit_i->sit_blocks) + block_addr += sit_i->sit_blocks; + else + block_addr -= sit_i->sit_blocks; + + return block_addr + sit_i->sit_base_addr; +} + +static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) +{ + unsigned int block_off = SIT_BLOCK_OFFSET(start); + + f2fs_change_bit(block_off, sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, sit_i->sit_bitmap_mir); +#endif +} + +static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi, + bool base_time) +{ + struct sit_info *sit_i = SIT_I(sbi); + time64_t diff, now = ktime_get_boottime_seconds(); + + if (now >= sit_i->mounted_time) + return sit_i->elapsed_time + now - sit_i->mounted_time; + + /* system time is set to the past */ + if (!base_time) { + diff = sit_i->mounted_time - now; + if (sit_i->elapsed_time >= diff) + return sit_i->elapsed_time - diff; + return 0; + } + return sit_i->elapsed_time; +} + +static inline void set_summary(struct f2fs_summary *sum, nid_t nid, + unsigned int ofs_in_node, unsigned char version) +{ + sum->nid = cpu_to_le32(nid); + sum->ofs_in_node = cpu_to_le16(ofs_in_node); + sum->version = version; +} + +static inline block_t start_sum_block(struct f2fs_sb_info *sbi) +{ + return __start_cp_addr(sbi) + + le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); +} + +static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) +{ + return __start_cp_addr(sbi) + + le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count) + - (base + 1) + type; +} + +static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) +{ + if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) + return true; + return false; +} + +/* + * It is very important to gather dirty pages and write at once, so that we can + * submit a big bio without interfering other data writes. + * By default, 512 pages for directory data, + * 512 pages (2MB) * 8 for nodes, and + * 256 pages * 8 for meta are set. + */ +static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) +{ + if (sbi->sb->s_bdi->wb.dirty_exceeded) + return 0; + + if (type == DATA) + return sbi->blocks_per_seg; + else if (type == NODE) + return 8 * sbi->blocks_per_seg; + else if (type == META) + return 8 * BIO_MAX_VECS; + else + return 0; +} + +/* + * When writing pages, it'd better align nr_to_write for segment size. + */ +static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, + struct writeback_control *wbc) +{ + long nr_to_write, desired; + + if (wbc->sync_mode != WB_SYNC_NONE) + return 0; + + nr_to_write = wbc->nr_to_write; + desired = BIO_MAX_VECS; + if (type == NODE) + desired <<= 1; + + wbc->nr_to_write = desired; + return desired - nr_to_write; +} + +static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + bool wakeup = false; + int i; + + if (force) + goto wake_up; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (i + 1 < dcc->discard_granularity) + break; + if (!list_empty(&dcc->pend_list[i])) { + wakeup = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + if (!wakeup || !is_idle(sbi, DISCARD_TIME)) + return; +wake_up: + dcc->discard_wake = 1; + wake_up_interruptible_all(&dcc->discard_wait_queue); +} diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c new file mode 100644 index 000000000..33c490e69 --- /dev/null +++ b/fs/f2fs/shrinker.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs shrinker support + * the basic infra was copied from fs/ubifs/shrinker.c + * + * Copyright (c) 2015 Motorola Mobility + * Copyright (c) 2015 Jaegeuk Kim <jaegeuk@kernel.org> + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "node.h" + +static LIST_HEAD(f2fs_list); +static DEFINE_SPINLOCK(f2fs_list_lock); +static unsigned int shrinker_run_no; + +static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->nat_cnt[RECLAIMABLE_NAT]; +} + +static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) +{ + long count = NM_I(sbi)->nid_cnt[FREE_NID] - MAX_FREE_NIDS; + + return count > 0 ? count : 0; +} + +static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi, + enum extent_type type) +{ + struct extent_tree_info *eti = &sbi->extent_tree[type]; + + return atomic_read(&eti->total_zombie_tree) + + atomic_read(&eti->total_ext_node); +} + +unsigned long f2fs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct f2fs_sb_info *sbi; + struct list_head *p; + unsigned long count = 0; + + spin_lock(&f2fs_list_lock); + p = f2fs_list.next; + while (p != &f2fs_list) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + /* count read extent cache entries */ + count += __count_extent_cache(sbi, EX_READ); + + /* count clean nat cache entries */ + count += __count_nat_entries(sbi); + + /* count free nids cache entries */ + count += __count_free_nids(sbi); + + spin_lock(&f2fs_list_lock); + p = p->next; + mutex_unlock(&sbi->umount_mutex); + } + spin_unlock(&f2fs_list_lock); + return count; +} + +unsigned long f2fs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long nr = sc->nr_to_scan; + struct f2fs_sb_info *sbi; + struct list_head *p; + unsigned int run_no; + unsigned long freed = 0; + + spin_lock(&f2fs_list_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); + p = f2fs_list.next; + while (p != &f2fs_list) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + if (sbi->shrinker_run_no == run_no) + break; + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + sbi->shrinker_run_no = run_no; + + /* shrink read extent cache entries */ + freed += f2fs_shrink_read_extent_tree(sbi, nr >> 1); + + /* shrink clean nat cache entries */ + if (freed < nr) + freed += f2fs_try_to_free_nats(sbi, nr - freed); + + /* shrink free nids cache entries */ + if (freed < nr) + freed += f2fs_try_to_free_nids(sbi, nr - freed); + + spin_lock(&f2fs_list_lock); + p = p->next; + list_move_tail(&sbi->s_list, &f2fs_list); + mutex_unlock(&sbi->umount_mutex); + if (freed >= nr) + break; + } + spin_unlock(&f2fs_list_lock); + return freed; +} + +void f2fs_join_shrinker(struct f2fs_sb_info *sbi) +{ + spin_lock(&f2fs_list_lock); + list_add_tail(&sbi->s_list, &f2fs_list); + spin_unlock(&f2fs_list_lock); +} + +void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) +{ + f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ)); + + spin_lock(&f2fs_list_lock); + list_del_init(&sbi->s_list); + spin_unlock(&f2fs_list_lock); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c new file mode 100644 index 000000000..3805162dc --- /dev/null +++ b/fs/f2fs/super.c @@ -0,0 +1,4807 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/super.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + */ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/sched/mm.h> +#include <linux/statfs.h> +#include <linux/buffer_head.h> +#include <linux/kthread.h> +#include <linux/parser.h> +#include <linux/mount.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include <linux/random.h> +#include <linux/exportfs.h> +#include <linux/blkdev.h> +#include <linux/quotaops.h> +#include <linux/f2fs_fs.h> +#include <linux/sysfs.h> +#include <linux/quota.h> +#include <linux/unicode.h> +#include <linux/part_stat.h> +#include <linux/zstd.h> +#include <linux/lz4.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "xattr.h" +#include "gc.h" +#include "iostat.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/f2fs.h> + +static struct kmem_cache *f2fs_inode_cachep; + +#ifdef CONFIG_F2FS_FAULT_INJECTION + +const char *f2fs_fault_name[FAULT_MAX] = { + [FAULT_KMALLOC] = "kmalloc", + [FAULT_KVMALLOC] = "kvmalloc", + [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_PAGE_GET] = "page get", + [FAULT_ALLOC_NID] = "alloc nid", + [FAULT_ORPHAN] = "orphan", + [FAULT_BLOCK] = "no more block", + [FAULT_DIR_DEPTH] = "too big dir depth", + [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_TRUNCATE] = "truncate fail", + [FAULT_READ_IO] = "read IO error", + [FAULT_CHECKPOINT] = "checkpoint error", + [FAULT_DISCARD] = "discard error", + [FAULT_WRITE_IO] = "write IO error", + [FAULT_SLAB_ALLOC] = "slab alloc", + [FAULT_DQUOT_INIT] = "dquot initialize", + [FAULT_LOCK_OP] = "lock_op", +}; + +void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, + unsigned int type) +{ + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; + + if (rate) { + atomic_set(&ffi->inject_ops, 0); + ffi->inject_rate = rate; + } + + if (type) + ffi->inject_type = type; + + if (!rate && !type) + memset(ffi, 0, sizeof(struct f2fs_fault_info)); +} +#endif + +/* f2fs-wide shrinker description */ +static struct shrinker f2fs_shrinker_info = { + .scan_objects = f2fs_shrink_scan, + .count_objects = f2fs_shrink_count, + .seeks = DEFAULT_SEEKS, +}; + +enum { + Opt_gc_background, + Opt_disable_roll_forward, + Opt_norecovery, + Opt_discard, + Opt_nodiscard, + Opt_noheap, + Opt_heap, + Opt_user_xattr, + Opt_nouser_xattr, + Opt_acl, + Opt_noacl, + Opt_active_logs, + Opt_disable_ext_identify, + Opt_inline_xattr, + Opt_noinline_xattr, + Opt_inline_xattr_size, + Opt_inline_data, + Opt_inline_dentry, + Opt_noinline_dentry, + Opt_flush_merge, + Opt_noflush_merge, + Opt_nobarrier, + Opt_fastboot, + Opt_extent_cache, + Opt_noextent_cache, + Opt_noinline_data, + Opt_data_flush, + Opt_reserve_root, + Opt_resgid, + Opt_resuid, + Opt_mode, + Opt_io_size_bits, + Opt_fault_injection, + Opt_fault_type, + Opt_lazytime, + Opt_nolazytime, + Opt_quota, + Opt_noquota, + Opt_usrquota, + Opt_grpquota, + Opt_prjquota, + Opt_usrjquota, + Opt_grpjquota, + Opt_prjjquota, + Opt_offusrjquota, + Opt_offgrpjquota, + Opt_offprjjquota, + Opt_jqfmt_vfsold, + Opt_jqfmt_vfsv0, + Opt_jqfmt_vfsv1, + Opt_alloc, + Opt_fsync, + Opt_test_dummy_encryption, + Opt_inlinecrypt, + Opt_checkpoint_disable, + Opt_checkpoint_disable_cap, + Opt_checkpoint_disable_cap_perc, + Opt_checkpoint_enable, + Opt_checkpoint_merge, + Opt_nocheckpoint_merge, + Opt_compress_algorithm, + Opt_compress_log_size, + Opt_compress_extension, + Opt_nocompress_extension, + Opt_compress_chksum, + Opt_compress_mode, + Opt_compress_cache, + Opt_atgc, + Opt_gc_merge, + Opt_nogc_merge, + Opt_discard_unit, + Opt_memory_mode, + Opt_err, +}; + +static match_table_t f2fs_tokens = { + {Opt_gc_background, "background_gc=%s"}, + {Opt_disable_roll_forward, "disable_roll_forward"}, + {Opt_norecovery, "norecovery"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_noheap, "no_heap"}, + {Opt_heap, "heap"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_active_logs, "active_logs=%u"}, + {Opt_disable_ext_identify, "disable_ext_identify"}, + {Opt_inline_xattr, "inline_xattr"}, + {Opt_noinline_xattr, "noinline_xattr"}, + {Opt_inline_xattr_size, "inline_xattr_size=%u"}, + {Opt_inline_data, "inline_data"}, + {Opt_inline_dentry, "inline_dentry"}, + {Opt_noinline_dentry, "noinline_dentry"}, + {Opt_flush_merge, "flush_merge"}, + {Opt_noflush_merge, "noflush_merge"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_fastboot, "fastboot"}, + {Opt_extent_cache, "extent_cache"}, + {Opt_noextent_cache, "noextent_cache"}, + {Opt_noinline_data, "noinline_data"}, + {Opt_data_flush, "data_flush"}, + {Opt_reserve_root, "reserve_root=%u"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, "resuid=%u"}, + {Opt_mode, "mode=%s"}, + {Opt_io_size_bits, "io_bits=%u"}, + {Opt_fault_injection, "fault_injection=%u"}, + {Opt_fault_type, "fault_type=%u"}, + {Opt_lazytime, "lazytime"}, + {Opt_nolazytime, "nolazytime"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, + {Opt_prjquota, "prjquota"}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_prjjquota, "prjjquota=%s"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_offprjjquota, "prjjquota="}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, + {Opt_alloc, "alloc_mode=%s"}, + {Opt_fsync, "fsync_mode=%s"}, + {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, + {Opt_test_dummy_encryption, "test_dummy_encryption"}, + {Opt_inlinecrypt, "inlinecrypt"}, + {Opt_checkpoint_disable, "checkpoint=disable"}, + {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, + {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"}, + {Opt_checkpoint_enable, "checkpoint=enable"}, + {Opt_checkpoint_merge, "checkpoint_merge"}, + {Opt_nocheckpoint_merge, "nocheckpoint_merge"}, + {Opt_compress_algorithm, "compress_algorithm=%s"}, + {Opt_compress_log_size, "compress_log_size=%u"}, + {Opt_compress_extension, "compress_extension=%s"}, + {Opt_nocompress_extension, "nocompress_extension=%s"}, + {Opt_compress_chksum, "compress_chksum"}, + {Opt_compress_mode, "compress_mode=%s"}, + {Opt_compress_cache, "compress_cache"}, + {Opt_atgc, "atgc"}, + {Opt_gc_merge, "gc_merge"}, + {Opt_nogc_merge, "nogc_merge"}, + {Opt_discard_unit, "discard_unit=%s"}, + {Opt_memory_mode, "memory=%s"}, + {Opt_err, NULL}, +}; + +void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + int level; + + va_start(args, fmt); + + level = printk_get_level(fmt); + vaf.fmt = printk_skip_level(fmt); + vaf.va = &args; + printk("%c%cF2FS-fs (%s): %pV\n", + KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + + va_end(args); +} + +#if IS_ENABLED(CONFIG_UNICODE) +static const struct f2fs_sb_encodings { + __u16 magic; + char *name; + unsigned int version; +} f2fs_sb_encoding_map[] = { + {F2FS_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)}, +}; + +static const struct f2fs_sb_encodings * +f2fs_sb_read_encoding(const struct f2fs_super_block *sb) +{ + __u16 magic = le16_to_cpu(sb->s_encoding); + int i; + + for (i = 0; i < ARRAY_SIZE(f2fs_sb_encoding_map); i++) + if (magic == f2fs_sb_encoding_map[i].magic) + return &f2fs_sb_encoding_map[i]; + + return NULL; +} + +struct kmem_cache *f2fs_cf_name_slab; +static int __init f2fs_create_casefold_cache(void) +{ + f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name", + F2FS_NAME_LEN); + if (!f2fs_cf_name_slab) + return -ENOMEM; + return 0; +} + +static void f2fs_destroy_casefold_cache(void) +{ + kmem_cache_destroy(f2fs_cf_name_slab); +} +#else +static int __init f2fs_create_casefold_cache(void) { return 0; } +static void f2fs_destroy_casefold_cache(void) { } +#endif + +static inline void limit_reserve_root(struct f2fs_sb_info *sbi) +{ + block_t limit = min((sbi->user_block_count >> 3), + sbi->user_block_count - sbi->reserved_blocks); + + /* limit is 12.5% */ + if (test_opt(sbi, RESERVE_ROOT) && + F2FS_OPTION(sbi).root_reserved_blocks > limit) { + F2FS_OPTION(sbi).root_reserved_blocks = limit; + f2fs_info(sbi, "Reduce reserved blocks for root = %u", + F2FS_OPTION(sbi).root_reserved_blocks); + } + if (!test_opt(sbi, RESERVE_ROOT) && + (!uid_eq(F2FS_OPTION(sbi).s_resuid, + make_kuid(&init_user_ns, F2FS_DEF_RESUID)) || + !gid_eq(F2FS_OPTION(sbi).s_resgid, + make_kgid(&init_user_ns, F2FS_DEF_RESGID)))) + f2fs_info(sbi, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root", + from_kuid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resuid), + from_kgid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resgid)); +} + +static inline int adjust_reserved_segment(struct f2fs_sb_info *sbi) +{ + unsigned int sec_blks = sbi->blocks_per_seg * sbi->segs_per_sec; + unsigned int avg_vblocks; + unsigned int wanted_reserved_segments; + block_t avail_user_block_count; + + if (!F2FS_IO_ALIGNED(sbi)) + return 0; + + /* average valid block count in section in worst case */ + avg_vblocks = sec_blks / F2FS_IO_SIZE(sbi); + + /* + * we need enough free space when migrating one section in worst case + */ + wanted_reserved_segments = (F2FS_IO_SIZE(sbi) / avg_vblocks) * + reserved_segments(sbi); + wanted_reserved_segments -= reserved_segments(sbi); + + avail_user_block_count = sbi->user_block_count - + sbi->current_reserved_blocks - + F2FS_OPTION(sbi).root_reserved_blocks; + + if (wanted_reserved_segments * sbi->blocks_per_seg > + avail_user_block_count) { + f2fs_err(sbi, "IO align feature can't grab additional reserved segment: %u, available segments: %u", + wanted_reserved_segments, + avail_user_block_count >> sbi->log_blocks_per_seg); + return -ENOSPC; + } + + SM_I(sbi)->additional_reserved_segments = wanted_reserved_segments; + + f2fs_info(sbi, "IO align feature needs additional reserved segment: %u", + wanted_reserved_segments); + + return 0; +} + +static inline void adjust_unusable_cap_perc(struct f2fs_sb_info *sbi) +{ + if (!F2FS_OPTION(sbi).unusable_cap_perc) + return; + + if (F2FS_OPTION(sbi).unusable_cap_perc == 100) + F2FS_OPTION(sbi).unusable_cap = sbi->user_block_count; + else + F2FS_OPTION(sbi).unusable_cap = (sbi->user_block_count / 100) * + F2FS_OPTION(sbi).unusable_cap_perc; + + f2fs_info(sbi, "Adjust unusable cap for checkpoint=disable = %u / %u%%", + F2FS_OPTION(sbi).unusable_cap, + F2FS_OPTION(sbi).unusable_cap_perc); +} + +static void init_once(void *foo) +{ + struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; + + inode_init_once(&fi->vfs_inode); +} + +#ifdef CONFIG_QUOTA +static const char * const quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) +static int f2fs_set_qf_name(struct super_block *sb, int qtype, + substring_t *args) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + char *qname; + int ret = -EINVAL; + + if (sb_any_quota_loaded(sb) && !F2FS_OPTION(sbi).s_qf_names[qtype]) { + f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); + return -EINVAL; + } + if (f2fs_sb_has_quota_ino(sbi)) { + f2fs_info(sbi, "QUOTA feature is enabled, so ignore qf_name"); + return 0; + } + + qname = match_strdup(args); + if (!qname) { + f2fs_err(sbi, "Not enough memory for storing quotafile name"); + return -ENOMEM; + } + if (F2FS_OPTION(sbi).s_qf_names[qtype]) { + if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0) + ret = 0; + else + f2fs_err(sbi, "%s quota file already specified", + QTYPE2NAME(qtype)); + goto errout; + } + if (strchr(qname, '/')) { + f2fs_err(sbi, "quotafile must be on filesystem root"); + goto errout; + } + F2FS_OPTION(sbi).s_qf_names[qtype] = qname; + set_opt(sbi, QUOTA); + return 0; +errout: + kfree(qname); + return ret; +} + +static int f2fs_clear_qf_name(struct super_block *sb, int qtype) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) { + f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); + return -EINVAL; + } + kfree(F2FS_OPTION(sbi).s_qf_names[qtype]); + F2FS_OPTION(sbi).s_qf_names[qtype] = NULL; + return 0; +} + +static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) +{ + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi)) { + f2fs_err(sbi, "Project quota feature not enabled. Cannot enable project quota enforcement."); + return -1; + } + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) { + if (test_opt(sbi, USRQUOTA) && + F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) + clear_opt(sbi, USRQUOTA); + + if (test_opt(sbi, GRPQUOTA) && + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) + clear_opt(sbi, GRPQUOTA); + + if (test_opt(sbi, PRJQUOTA) && + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) + clear_opt(sbi, PRJQUOTA); + + if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || + test_opt(sbi, PRJQUOTA)) { + f2fs_err(sbi, "old and new quota format mixing"); + return -1; + } + + if (!F2FS_OPTION(sbi).s_jquota_fmt) { + f2fs_err(sbi, "journaled quota format not specified"); + return -1; + } + } + + if (f2fs_sb_has_quota_ino(sbi) && F2FS_OPTION(sbi).s_jquota_fmt) { + f2fs_info(sbi, "QUOTA feature is enabled, so ignore jquota_fmt"); + F2FS_OPTION(sbi).s_jquota_fmt = 0; + } + return 0; +} +#endif + +static int f2fs_set_test_dummy_encryption(struct super_block *sb, + const char *opt, + const substring_t *arg, + bool is_remount) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct fs_parameter param = { + .type = fs_value_is_string, + .string = arg->from ? arg->from : "", + }; + struct fscrypt_dummy_policy *policy = + &F2FS_OPTION(sbi).dummy_enc_policy; + int err; + + if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { + f2fs_warn(sbi, "test_dummy_encryption option not supported"); + return -EINVAL; + } + + if (!f2fs_sb_has_encrypt(sbi)) { + f2fs_err(sbi, "Encrypt feature is off"); + return -EINVAL; + } + + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. We do allow + * it to be specified during remount, but only if there is no change. + */ + if (is_remount && !fscrypt_is_dummy_policy_set(policy)) { + f2fs_warn(sbi, "Can't set test_dummy_encryption on remount"); + return -EINVAL; + } + + err = fscrypt_parse_test_dummy_encryption(¶m, policy); + if (err) { + if (err == -EEXIST) + f2fs_warn(sbi, + "Can't change test_dummy_encryption on remount"); + else if (err == -EINVAL) + f2fs_warn(sbi, "Value of option \"%s\" is unrecognized", + opt); + else + f2fs_warn(sbi, "Error processing option \"%s\" [%d]", + opt, err); + return -EINVAL; + } + err = fscrypt_add_test_dummy_key(sb, policy); + if (err) { + f2fs_warn(sbi, "Error adding test dummy encryption key [%d]", + err); + return err; + } + f2fs_warn(sbi, "Test dummy encryption mode enabled"); + return 0; +} + +#ifdef CONFIG_F2FS_FS_COMPRESSION +static bool is_compress_extension_exist(struct f2fs_sb_info *sbi, + const char *new_ext, bool is_ext) +{ + unsigned char (*ext)[F2FS_EXTENSION_LEN]; + int ext_cnt; + int i; + + if (is_ext) { + ext = F2FS_OPTION(sbi).extensions; + ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + } else { + ext = F2FS_OPTION(sbi).noextensions; + ext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + } + + for (i = 0; i < ext_cnt; i++) { + if (!strcasecmp(new_ext, ext[i])) + return true; + } + + return false; +} + +/* + * 1. The same extension name cannot not appear in both compress and non-compress extension + * at the same time. + * 2. If the compress extension specifies all files, the types specified by the non-compress + * extension will be treated as special cases and will not be compressed. + * 3. Don't allow the non-compress extension specifies all files. + */ +static int f2fs_test_compress_extension(struct f2fs_sb_info *sbi) +{ + unsigned char (*ext)[F2FS_EXTENSION_LEN]; + unsigned char (*noext)[F2FS_EXTENSION_LEN]; + int ext_cnt, noext_cnt, index = 0, no_index = 0; + + ext = F2FS_OPTION(sbi).extensions; + ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + noext = F2FS_OPTION(sbi).noextensions; + noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + + if (!noext_cnt) + return 0; + + for (no_index = 0; no_index < noext_cnt; no_index++) { + if (!strcasecmp("*", noext[no_index])) { + f2fs_info(sbi, "Don't allow the nocompress extension specifies all files"); + return -EINVAL; + } + for (index = 0; index < ext_cnt; index++) { + if (!strcasecmp(ext[index], noext[no_index])) { + f2fs_info(sbi, "Don't allow the same extension %s appear in both compress and nocompress extension", + ext[index]); + return -EINVAL; + } + } + } + return 0; +} + +#ifdef CONFIG_F2FS_FS_LZ4 +static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) +{ +#ifdef CONFIG_F2FS_FS_LZ4HC + unsigned int level; + + if (strlen(str) == 3) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } + + str += 3; + + if (str[0] != ':') { + f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>"); + return -EINVAL; + } + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + + if (level < LZ4HC_MIN_CLEVEL || level > LZ4HC_MAX_CLEVEL) { + f2fs_info(sbi, "invalid lz4hc compress level: %d", level); + return -EINVAL; + } + + F2FS_OPTION(sbi).compress_level = level; + return 0; +#else + if (strlen(str) == 3) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } + f2fs_info(sbi, "kernel doesn't support lz4hc compression"); + return -EINVAL; +#endif +} +#endif + +#ifdef CONFIG_F2FS_FS_ZSTD +static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) +{ + unsigned int level; + int len = 4; + + if (strlen(str) == len) { + F2FS_OPTION(sbi).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; + return 0; + } + + str += len; + + if (str[0] != ':') { + f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>"); + return -EINVAL; + } + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + + if (level < zstd_min_clevel() || level > zstd_max_clevel()) { + f2fs_info(sbi, "invalid zstd compress level: %d", level); + return -EINVAL; + } + + F2FS_OPTION(sbi).compress_level = level; + return 0; +} +#endif +#endif + +static int parse_options(struct super_block *sb, char *options, bool is_remount) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + substring_t args[MAX_OPT_ARGS]; +#ifdef CONFIG_F2FS_FS_COMPRESSION + unsigned char (*ext)[F2FS_EXTENSION_LEN]; + unsigned char (*noext)[F2FS_EXTENSION_LEN]; + int ext_cnt, noext_cnt; +#endif + char *p, *name; + int arg = 0; + kuid_t uid; + kgid_t gid; + int ret; + + if (!options) + goto default_check; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = NULL; + token = match_token(p, f2fs_tokens, args); + + switch (token) { + case Opt_gc_background: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (!strcmp(name, "on")) { + F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; + } else if (!strcmp(name, "off")) { + F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_OFF; + } else if (!strcmp(name, "sync")) { + F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_disable_roll_forward: + set_opt(sbi, DISABLE_ROLL_FORWARD); + break; + case Opt_norecovery: + /* this option mounts f2fs with ro */ + set_opt(sbi, NORECOVERY); + if (!f2fs_readonly(sb)) + return -EINVAL; + break; + case Opt_discard: + if (!f2fs_hw_support_discard(sbi)) { + f2fs_warn(sbi, "device does not support discard"); + break; + } + set_opt(sbi, DISCARD); + break; + case Opt_nodiscard: + if (f2fs_hw_should_discard(sbi)) { + f2fs_warn(sbi, "discard is required for zoned block devices"); + return -EINVAL; + } + clear_opt(sbi, DISCARD); + break; + case Opt_noheap: + set_opt(sbi, NOHEAP); + break; + case Opt_heap: + clear_opt(sbi, NOHEAP); + break; +#ifdef CONFIG_F2FS_FS_XATTR + case Opt_user_xattr: + set_opt(sbi, XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt(sbi, XATTR_USER); + break; + case Opt_inline_xattr: + set_opt(sbi, INLINE_XATTR); + break; + case Opt_noinline_xattr: + clear_opt(sbi, INLINE_XATTR); + break; + case Opt_inline_xattr_size: + if (args->from && match_int(args, &arg)) + return -EINVAL; + set_opt(sbi, INLINE_XATTR_SIZE); + F2FS_OPTION(sbi).inline_xattr_size = arg; + break; +#else + case Opt_user_xattr: + f2fs_info(sbi, "user_xattr options not supported"); + break; + case Opt_nouser_xattr: + f2fs_info(sbi, "nouser_xattr options not supported"); + break; + case Opt_inline_xattr: + f2fs_info(sbi, "inline_xattr options not supported"); + break; + case Opt_noinline_xattr: + f2fs_info(sbi, "noinline_xattr options not supported"); + break; +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL + case Opt_acl: + set_opt(sbi, POSIX_ACL); + break; + case Opt_noacl: + clear_opt(sbi, POSIX_ACL); + break; +#else + case Opt_acl: + f2fs_info(sbi, "acl options not supported"); + break; + case Opt_noacl: + f2fs_info(sbi, "noacl options not supported"); + break; +#endif + case Opt_active_logs: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg != 2 && arg != 4 && + arg != NR_CURSEG_PERSIST_TYPE) + return -EINVAL; + F2FS_OPTION(sbi).active_logs = arg; + break; + case Opt_disable_ext_identify: + set_opt(sbi, DISABLE_EXT_IDENTIFY); + break; + case Opt_inline_data: + set_opt(sbi, INLINE_DATA); + break; + case Opt_inline_dentry: + set_opt(sbi, INLINE_DENTRY); + break; + case Opt_noinline_dentry: + clear_opt(sbi, INLINE_DENTRY); + break; + case Opt_flush_merge: + set_opt(sbi, FLUSH_MERGE); + break; + case Opt_noflush_merge: + clear_opt(sbi, FLUSH_MERGE); + break; + case Opt_nobarrier: + set_opt(sbi, NOBARRIER); + break; + case Opt_fastboot: + set_opt(sbi, FASTBOOT); + break; + case Opt_extent_cache: + set_opt(sbi, READ_EXTENT_CACHE); + break; + case Opt_noextent_cache: + clear_opt(sbi, READ_EXTENT_CACHE); + break; + case Opt_noinline_data: + clear_opt(sbi, INLINE_DATA); + break; + case Opt_data_flush: + set_opt(sbi, DATA_FLUSH); + break; + case Opt_reserve_root: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (test_opt(sbi, RESERVE_ROOT)) { + f2fs_info(sbi, "Preserve previous reserve_root=%u", + F2FS_OPTION(sbi).root_reserved_blocks); + } else { + F2FS_OPTION(sbi).root_reserved_blocks = arg; + set_opt(sbi, RESERVE_ROOT); + } + break; + case Opt_resuid: + if (args->from && match_int(args, &arg)) + return -EINVAL; + uid = make_kuid(current_user_ns(), arg); + if (!uid_valid(uid)) { + f2fs_err(sbi, "Invalid uid value %d", arg); + return -EINVAL; + } + F2FS_OPTION(sbi).s_resuid = uid; + break; + case Opt_resgid: + if (args->from && match_int(args, &arg)) + return -EINVAL; + gid = make_kgid(current_user_ns(), arg); + if (!gid_valid(gid)) { + f2fs_err(sbi, "Invalid gid value %d", arg); + return -EINVAL; + } + F2FS_OPTION(sbi).s_resgid = gid; + break; + case Opt_mode: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (!strcmp(name, "adaptive")) { + F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; + } else if (!strcmp(name, "lfs")) { + F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; + } else if (!strcmp(name, "fragment:segment")) { + F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_SEG; + } else if (!strcmp(name, "fragment:block")) { + F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_BLK; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_io_size_bits: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) { + f2fs_warn(sbi, "Not support %ld, larger than %d", + BIT(arg), BIO_MAX_VECS); + return -EINVAL; + } + F2FS_OPTION(sbi).write_io_size_bits = arg; + break; +#ifdef CONFIG_F2FS_FAULT_INJECTION + case Opt_fault_injection: + if (args->from && match_int(args, &arg)) + return -EINVAL; + f2fs_build_fault_attr(sbi, arg, F2FS_ALL_FAULT_TYPE); + set_opt(sbi, FAULT_INJECTION); + break; + + case Opt_fault_type: + if (args->from && match_int(args, &arg)) + return -EINVAL; + f2fs_build_fault_attr(sbi, 0, arg); + set_opt(sbi, FAULT_INJECTION); + break; +#else + case Opt_fault_injection: + f2fs_info(sbi, "fault_injection options not supported"); + break; + + case Opt_fault_type: + f2fs_info(sbi, "fault_type options not supported"); + break; +#endif + case Opt_lazytime: + sb->s_flags |= SB_LAZYTIME; + break; + case Opt_nolazytime: + sb->s_flags &= ~SB_LAZYTIME; + break; +#ifdef CONFIG_QUOTA + case Opt_quota: + case Opt_usrquota: + set_opt(sbi, USRQUOTA); + break; + case Opt_grpquota: + set_opt(sbi, GRPQUOTA); + break; + case Opt_prjquota: + set_opt(sbi, PRJQUOTA); + break; + case Opt_usrjquota: + ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_grpjquota: + ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_prjjquota: + ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_offusrjquota: + ret = f2fs_clear_qf_name(sb, USRQUOTA); + if (ret) + return ret; + break; + case Opt_offgrpjquota: + ret = f2fs_clear_qf_name(sb, GRPQUOTA); + if (ret) + return ret; + break; + case Opt_offprjjquota: + ret = f2fs_clear_qf_name(sb, PRJQUOTA); + if (ret) + return ret; + break; + case Opt_jqfmt_vfsold: + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_OLD; + break; + case Opt_jqfmt_vfsv0: + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V0; + break; + case Opt_jqfmt_vfsv1: + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V1; + break; + case Opt_noquota: + clear_opt(sbi, QUOTA); + clear_opt(sbi, USRQUOTA); + clear_opt(sbi, GRPQUOTA); + clear_opt(sbi, PRJQUOTA); + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + case Opt_prjquota: + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_prjjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_offprjjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: + case Opt_noquota: + f2fs_info(sbi, "quota operations not supported"); + break; +#endif + case Opt_alloc: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + + if (!strcmp(name, "default")) { + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + } else if (!strcmp(name, "reuse")) { + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_fsync: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "posix")) { + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; + } else if (!strcmp(name, "strict")) { + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT; + } else if (!strcmp(name, "nobarrier")) { + F2FS_OPTION(sbi).fsync_mode = + FSYNC_MODE_NOBARRIER; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_test_dummy_encryption: + ret = f2fs_set_test_dummy_encryption(sb, p, &args[0], + is_remount); + if (ret) + return ret; + break; + case Opt_inlinecrypt: +#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT + sb->s_flags |= SB_INLINECRYPT; +#else + f2fs_info(sbi, "inline encryption not supported"); +#endif + break; + case Opt_checkpoint_disable_cap_perc: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg < 0 || arg > 100) + return -EINVAL; + F2FS_OPTION(sbi).unusable_cap_perc = arg; + set_opt(sbi, DISABLE_CHECKPOINT); + break; + case Opt_checkpoint_disable_cap: + if (args->from && match_int(args, &arg)) + return -EINVAL; + F2FS_OPTION(sbi).unusable_cap = arg; + set_opt(sbi, DISABLE_CHECKPOINT); + break; + case Opt_checkpoint_disable: + set_opt(sbi, DISABLE_CHECKPOINT); + break; + case Opt_checkpoint_enable: + clear_opt(sbi, DISABLE_CHECKPOINT); + break; + case Opt_checkpoint_merge: + set_opt(sbi, MERGE_CHECKPOINT); + break; + case Opt_nocheckpoint_merge: + clear_opt(sbi, MERGE_CHECKPOINT); + break; +#ifdef CONFIG_F2FS_FS_COMPRESSION + case Opt_compress_algorithm: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_info(sbi, "Image doesn't support compression"); + break; + } + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "lzo")) { +#ifdef CONFIG_F2FS_FS_LZO + F2FS_OPTION(sbi).compress_level = 0; + F2FS_OPTION(sbi).compress_algorithm = + COMPRESS_LZO; +#else + f2fs_info(sbi, "kernel doesn't support lzo compression"); +#endif + } else if (!strncmp(name, "lz4", 3)) { +#ifdef CONFIG_F2FS_FS_LZ4 + ret = f2fs_set_lz4hc_level(sbi, name); + if (ret) { + kfree(name); + return -EINVAL; + } + F2FS_OPTION(sbi).compress_algorithm = + COMPRESS_LZ4; +#else + f2fs_info(sbi, "kernel doesn't support lz4 compression"); +#endif + } else if (!strncmp(name, "zstd", 4)) { +#ifdef CONFIG_F2FS_FS_ZSTD + ret = f2fs_set_zstd_level(sbi, name); + if (ret) { + kfree(name); + return -EINVAL; + } + F2FS_OPTION(sbi).compress_algorithm = + COMPRESS_ZSTD; +#else + f2fs_info(sbi, "kernel doesn't support zstd compression"); +#endif + } else if (!strcmp(name, "lzo-rle")) { +#ifdef CONFIG_F2FS_FS_LZORLE + F2FS_OPTION(sbi).compress_level = 0; + F2FS_OPTION(sbi).compress_algorithm = + COMPRESS_LZORLE; +#else + f2fs_info(sbi, "kernel doesn't support lzorle compression"); +#endif + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_compress_log_size: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_info(sbi, "Image doesn't support compression"); + break; + } + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg < MIN_COMPRESS_LOG_SIZE || + arg > MAX_COMPRESS_LOG_SIZE) { + f2fs_err(sbi, + "Compress cluster log size is out of range"); + return -EINVAL; + } + F2FS_OPTION(sbi).compress_log_size = arg; + break; + case Opt_compress_extension: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_info(sbi, "Image doesn't support compression"); + break; + } + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + + ext = F2FS_OPTION(sbi).extensions; + ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + + if (strlen(name) >= F2FS_EXTENSION_LEN || + ext_cnt >= COMPRESS_EXT_NUM) { + f2fs_err(sbi, + "invalid extension length/number"); + kfree(name); + return -EINVAL; + } + + if (is_compress_extension_exist(sbi, name, true)) { + kfree(name); + break; + } + + strcpy(ext[ext_cnt], name); + F2FS_OPTION(sbi).compress_ext_cnt++; + kfree(name); + break; + case Opt_nocompress_extension: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_info(sbi, "Image doesn't support compression"); + break; + } + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + + noext = F2FS_OPTION(sbi).noextensions; + noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + + if (strlen(name) >= F2FS_EXTENSION_LEN || + noext_cnt >= COMPRESS_EXT_NUM) { + f2fs_err(sbi, + "invalid extension length/number"); + kfree(name); + return -EINVAL; + } + + if (is_compress_extension_exist(sbi, name, false)) { + kfree(name); + break; + } + + strcpy(noext[noext_cnt], name); + F2FS_OPTION(sbi).nocompress_ext_cnt++; + kfree(name); + break; + case Opt_compress_chksum: + F2FS_OPTION(sbi).compress_chksum = true; + break; + case Opt_compress_mode: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "fs")) { + F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; + } else if (!strcmp(name, "user")) { + F2FS_OPTION(sbi).compress_mode = COMPR_MODE_USER; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_compress_cache: + set_opt(sbi, COMPRESS_CACHE); + break; +#else + case Opt_compress_algorithm: + case Opt_compress_log_size: + case Opt_compress_extension: + case Opt_nocompress_extension: + case Opt_compress_chksum: + case Opt_compress_mode: + case Opt_compress_cache: + f2fs_info(sbi, "compression options not supported"); + break; +#endif + case Opt_atgc: + set_opt(sbi, ATGC); + break; + case Opt_gc_merge: + set_opt(sbi, GC_MERGE); + break; + case Opt_nogc_merge: + clear_opt(sbi, GC_MERGE); + break; + case Opt_discard_unit: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "block")) { + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_BLOCK; + } else if (!strcmp(name, "segment")) { + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_SEGMENT; + } else if (!strcmp(name, "section")) { + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_SECTION; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_memory_mode: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "normal")) { + F2FS_OPTION(sbi).memory_mode = + MEMORY_MODE_NORMAL; + } else if (!strcmp(name, "low")) { + F2FS_OPTION(sbi).memory_mode = + MEMORY_MODE_LOW; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + default: + f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", + p); + return -EINVAL; + } + } +default_check: +#ifdef CONFIG_QUOTA + if (f2fs_check_quota_options(sbi)) + return -EINVAL; +#else + if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sbi->sb)) { + f2fs_info(sbi, "Filesystem with quota feature cannot be mounted RDWR without CONFIG_QUOTA"); + return -EINVAL; + } + if (f2fs_sb_has_project_quota(sbi) && !f2fs_readonly(sbi->sb)) { + f2fs_err(sbi, "Filesystem with project quota feature cannot be mounted RDWR without CONFIG_QUOTA"); + return -EINVAL; + } +#endif +#if !IS_ENABLED(CONFIG_UNICODE) + if (f2fs_sb_has_casefold(sbi)) { + f2fs_err(sbi, + "Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE"); + return -EINVAL; + } +#endif + /* + * The BLKZONED feature indicates that the drive was formatted with + * zone alignment optimization. This is optional for host-aware + * devices, but mandatory for host-managed zoned block devices. + */ + if (f2fs_sb_has_blkzoned(sbi)) { +#ifdef CONFIG_BLK_DEV_ZONED + if (F2FS_OPTION(sbi).discard_unit != + DISCARD_UNIT_SECTION) { + f2fs_info(sbi, "Zoned block device doesn't need small discard, set discard_unit=section by default"); + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_SECTION; + } + + if (F2FS_OPTION(sbi).fs_mode != FS_MODE_LFS) { + f2fs_info(sbi, "Only lfs mode is allowed with zoned block device feature"); + return -EINVAL; + } +#else + f2fs_err(sbi, "Zoned block device support is not enabled"); + return -EINVAL; +#endif + } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_test_compress_extension(sbi)) { + f2fs_err(sbi, "invalid compress or nocompress extension"); + return -EINVAL; + } +#endif + + if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) { + f2fs_err(sbi, "Should set mode=lfs with %luKB-sized IO", + F2FS_IO_SIZE_KB(sbi)); + return -EINVAL; + } + + if (test_opt(sbi, INLINE_XATTR_SIZE)) { + int min_size, max_size; + + if (!f2fs_sb_has_extra_attr(sbi) || + !f2fs_sb_has_flexible_inline_xattr(sbi)) { + f2fs_err(sbi, "extra_attr or flexible_inline_xattr feature is off"); + return -EINVAL; + } + if (!test_opt(sbi, INLINE_XATTR)) { + f2fs_err(sbi, "inline_xattr_size option should be set with inline_xattr option"); + return -EINVAL; + } + + min_size = sizeof(struct f2fs_xattr_header) / sizeof(__le32); + max_size = MAX_INLINE_XATTR_SIZE; + + if (F2FS_OPTION(sbi).inline_xattr_size < min_size || + F2FS_OPTION(sbi).inline_xattr_size > max_size) { + f2fs_err(sbi, "inline xattr size is out of range: %d ~ %d", + min_size, max_size); + return -EINVAL; + } + } + + if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) { + f2fs_err(sbi, "LFS not compatible with checkpoint=disable"); + return -EINVAL; + } + + if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) { + f2fs_err(sbi, "LFS not compatible with ATGC"); + return -EINVAL; + } + + if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { + f2fs_err(sbi, "Allow to mount readonly mode only"); + return -EROFS; + } + return 0; +} + +static struct inode *f2fs_alloc_inode(struct super_block *sb) +{ + struct f2fs_inode_info *fi; + + if (time_to_inject(F2FS_SB(sb), FAULT_SLAB_ALLOC)) { + f2fs_show_injection_info(F2FS_SB(sb), FAULT_SLAB_ALLOC); + return NULL; + } + + fi = alloc_inode_sb(sb, f2fs_inode_cachep, GFP_F2FS_ZERO); + if (!fi) + return NULL; + + init_once((void *) fi); + + /* Initialize f2fs-specific inode info */ + atomic_set(&fi->dirty_pages, 0); + atomic_set(&fi->i_compr_blocks, 0); + init_f2fs_rwsem(&fi->i_sem); + spin_lock_init(&fi->i_size_lock); + INIT_LIST_HEAD(&fi->dirty_list); + INIT_LIST_HEAD(&fi->gdirty_list); + init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); + init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); + init_f2fs_rwsem(&fi->i_xattr_sem); + + /* Will be used by directory only */ + fi->i_dir_level = F2FS_SB(sb)->dir_level; + + return &fi->vfs_inode; +} + +static int f2fs_drop_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret; + + /* + * during filesystem shutdown, if checkpoint is disabled, + * drop useless meta/node dirty pages. + */ + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) { + trace_f2fs_drop_inode(inode, 1); + return 1; + } + } + + /* + * This is to avoid a deadlock condition like below. + * writeback_single_inode(inode) + * - f2fs_write_data_page + * - f2fs_gc -> iput -> evict + * - inode_wait_for_writeback(inode) + */ + if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { + if (!inode->i_nlink && !is_bad_inode(inode)) { + /* to avoid evict_inode call simultaneously */ + atomic_inc(&inode->i_count); + spin_unlock(&inode->i_lock); + + /* should remain fi->extent_tree for writepage */ + f2fs_destroy_extent_node(inode); + + sb_start_intwrite(inode->i_sb); + f2fs_i_size_write(inode, 0); + + f2fs_submit_merged_write_cond(F2FS_I_SB(inode), + inode, NULL, 0, DATA); + truncate_inode_pages_final(inode->i_mapping); + + if (F2FS_HAS_BLOCKS(inode)) + f2fs_truncate(inode); + + sb_end_intwrite(inode->i_sb); + + spin_lock(&inode->i_lock); + atomic_dec(&inode->i_count); + } + trace_f2fs_drop_inode(inode, 0); + return 0; + } + ret = generic_drop_inode(inode); + if (!ret) + ret = fscrypt_drop_inode(inode); + trace_f2fs_drop_inode(inode, ret); + return ret; +} + +int f2fs_inode_dirtied(struct inode *inode, bool sync) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret = 0; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + ret = 1; + } else { + set_inode_flag(inode, FI_DIRTY_INODE); + stat_inc_dirty_inode(sbi, DIRTY_META); + } + if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) { + list_add_tail(&F2FS_I(inode)->gdirty_list, + &sbi->inode_list[DIRTY_META]); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + } + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; +} + +void f2fs_inode_synced(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return; + } + if (!list_empty(&F2FS_I(inode)->gdirty_list)) { + list_del_init(&F2FS_I(inode)->gdirty_list); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + } + clear_inode_flag(inode, FI_DIRTY_INODE); + clear_inode_flag(inode, FI_AUTO_RECOVER); + stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); + spin_unlock(&sbi->inode_lock[DIRTY_META]); +} + +/* + * f2fs_dirty_inode() is called from __mark_inode_dirty() + * + * We should call set_dirty_inode to write the dirty inode through write_inode. + */ +static void f2fs_dirty_inode(struct inode *inode, int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + return; + + if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) + clear_inode_flag(inode, FI_AUTO_RECOVER); + + f2fs_inode_dirtied(inode, false); +} + +static void f2fs_free_inode(struct inode *inode) +{ + fscrypt_free_inode(inode); + kmem_cache_free(f2fs_inode_cachep, F2FS_I(inode)); +} + +static void destroy_percpu_info(struct f2fs_sb_info *sbi) +{ + percpu_counter_destroy(&sbi->total_valid_inode_count); + percpu_counter_destroy(&sbi->rf_node_block_count); + percpu_counter_destroy(&sbi->alloc_valid_block_count); +} + +static void destroy_device_list(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + blkdev_put(FDEV(i).bdev, FMODE_EXCL); +#ifdef CONFIG_BLK_DEV_ZONED + kvfree(FDEV(i).blkz_seq); +#endif + } + kvfree(sbi->devs); +} + +static void f2fs_put_super(struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i; + bool dropped; + + /* unregister procfs/sysfs entries in advance to avoid race case */ + f2fs_unregister_sysfs(sbi); + + f2fs_quota_off_umount(sb); + + /* prevent remaining shrinker jobs */ + mutex_lock(&sbi->umount_mutex); + + /* + * flush all issued checkpoints and stop checkpoint issue thread. + * after then, all checkpoints should be done by each process context. + */ + f2fs_stop_ckpt_thread(sbi); + + /* + * We don't need to do checkpoint when superblock is clean. + * But, the previous checkpoint was not done by umount, it needs to do + * clean checkpoint again. + */ + if ((is_sbi_flag_set(sbi, SBI_IS_DIRTY) || + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG))) { + struct cp_control cpc = { + .reason = CP_UMOUNT, + }; + f2fs_write_checkpoint(sbi, &cpc); + } + + /* be sure to wait for any on-going discard commands */ + dropped = f2fs_issue_discard_timeout(sbi); + + if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && + !sbi->discard_blks && !dropped) { + struct cp_control cpc = { + .reason = CP_UMOUNT | CP_TRIMMED, + }; + f2fs_write_checkpoint(sbi, &cpc); + } + + /* + * normally superblock is clean, so we need to release this. + * In addition, EIO will skip do checkpoint, we need this as well. + */ + f2fs_release_ino_entry(sbi, true); + + f2fs_leave_shrinker(sbi); + mutex_unlock(&sbi->umount_mutex); + + /* our cp_error case, we can wait for any writeback page */ + f2fs_flush_merged_writes(sbi); + + f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); + + f2fs_bug_on(sbi, sbi->fsync_node_num); + + f2fs_destroy_compress_inode(sbi); + + iput(sbi->node_inode); + sbi->node_inode = NULL; + + iput(sbi->meta_inode); + sbi->meta_inode = NULL; + + /* + * iput() can update stat information, if f2fs_write_checkpoint() + * above failed with error. + */ + f2fs_destroy_stats(sbi); + + /* destroy f2fs internal modules */ + f2fs_destroy_node_manager(sbi); + f2fs_destroy_segment_manager(sbi); + + f2fs_destroy_post_read_wq(sbi); + + kvfree(sbi->ckpt); + + sb->s_fs_info = NULL; + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); + kfree(sbi->raw_super); + + destroy_device_list(sbi); + f2fs_destroy_page_array_cache(sbi); + f2fs_destroy_xattr_caches(sbi); + mempool_destroy(sbi->write_io_dummy); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(F2FS_OPTION(sbi).s_qf_names[i]); +#endif + fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); + destroy_percpu_info(sbi); + f2fs_destroy_iostat(sbi); + for (i = 0; i < NR_PAGE_TYPE; i++) + kvfree(sbi->write_io[i]); +#if IS_ENABLED(CONFIG_UNICODE) + utf8_unload(sb->s_encoding); +#endif + kfree(sbi); +} + +int f2fs_sync_fs(struct super_block *sb, int sync) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err = 0; + + if (unlikely(f2fs_cp_error(sbi))) + return 0; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return 0; + + trace_f2fs_sync_fs(sb, sync); + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return -EAGAIN; + + if (sync) + err = f2fs_issue_checkpoint(sbi); + + return err; +} + +static int f2fs_freeze(struct super_block *sb) +{ + if (f2fs_readonly(sb)) + return 0; + + /* IO error happened before */ + if (unlikely(f2fs_cp_error(F2FS_SB(sb)))) + return -EIO; + + /* must be clean, since sync_filesystem() was already called */ + if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) + return -EINVAL; + + /* Let's flush checkpoints and stop the thread. */ + f2fs_flush_ckpt_thread(F2FS_SB(sb)); + + /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */ + set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); + return 0; +} + +static int f2fs_unfreeze(struct super_block *sb) +{ + clear_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); + return 0; +} + +#ifdef CONFIG_QUOTA +static int f2fs_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dquot->dq_dqb_lock); + + limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, + dquot->dq_dqb.dqb_bhardlimit); + if (limit) + limit >>= sb->s_blocksize_bits; + + if (limit && buf->f_blocks > limit) { + curblock = (dquot->dq_dqb.dqb_curspace + + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, + dquot->dq_dqb.dqb_ihardlimit); + + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? + (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dquot->dq_dqb_lock); + dqput(dquot); + return 0; +} +#endif + +static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + block_t total_count, user_block_count, start_count; + u64 avail_node_count; + unsigned int total_valid_node_count; + + total_count = le64_to_cpu(sbi->raw_super->block_count); + start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr); + buf->f_type = F2FS_SUPER_MAGIC; + buf->f_bsize = sbi->blocksize; + + buf->f_blocks = total_count - start_count; + + spin_lock(&sbi->stat_lock); + + user_block_count = sbi->user_block_count; + total_valid_node_count = valid_node_count(sbi); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + buf->f_bfree = user_block_count - valid_user_blocks(sbi) - + sbi->current_reserved_blocks; + + if (unlikely(buf->f_bfree <= sbi->unusable_block_count)) + buf->f_bfree = 0; + else + buf->f_bfree -= sbi->unusable_block_count; + spin_unlock(&sbi->stat_lock); + + if (buf->f_bfree > F2FS_OPTION(sbi).root_reserved_blocks) + buf->f_bavail = buf->f_bfree - + F2FS_OPTION(sbi).root_reserved_blocks; + else + buf->f_bavail = 0; + + if (avail_node_count > user_block_count) { + buf->f_files = user_block_count; + buf->f_ffree = buf->f_bavail; + } else { + buf->f_files = avail_node_count; + buf->f_ffree = min(avail_node_count - total_valid_node_count, + buf->f_bavail); + } + + buf->f_namelen = F2FS_NAME_LEN; + buf->f_fsid = u64_to_fsid(id); + +#ifdef CONFIG_QUOTA + if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf); + } +#endif + return 0; +} + +static inline void f2fs_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (F2FS_OPTION(sbi).s_jquota_fmt) { + char *fmtname = ""; + + switch (F2FS_OPTION(sbi).s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) + seq_show_option(seq, "usrjquota", + F2FS_OPTION(sbi).s_qf_names[USRQUOTA]); + + if (F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) + seq_show_option(seq, "grpjquota", + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]); + + if (F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) + seq_show_option(seq, "prjjquota", + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]); +#endif +} + +#ifdef CONFIG_F2FS_FS_COMPRESSION +static inline void f2fs_show_compress_options(struct seq_file *seq, + struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + char *algtype = ""; + int i; + + if (!f2fs_sb_has_compression(sbi)) + return; + + switch (F2FS_OPTION(sbi).compress_algorithm) { + case COMPRESS_LZO: + algtype = "lzo"; + break; + case COMPRESS_LZ4: + algtype = "lz4"; + break; + case COMPRESS_ZSTD: + algtype = "zstd"; + break; + case COMPRESS_LZORLE: + algtype = "lzo-rle"; + break; + } + seq_printf(seq, ",compress_algorithm=%s", algtype); + + if (F2FS_OPTION(sbi).compress_level) + seq_printf(seq, ":%d", F2FS_OPTION(sbi).compress_level); + + seq_printf(seq, ",compress_log_size=%u", + F2FS_OPTION(sbi).compress_log_size); + + for (i = 0; i < F2FS_OPTION(sbi).compress_ext_cnt; i++) { + seq_printf(seq, ",compress_extension=%s", + F2FS_OPTION(sbi).extensions[i]); + } + + for (i = 0; i < F2FS_OPTION(sbi).nocompress_ext_cnt; i++) { + seq_printf(seq, ",nocompress_extension=%s", + F2FS_OPTION(sbi).noextensions[i]); + } + + if (F2FS_OPTION(sbi).compress_chksum) + seq_puts(seq, ",compress_chksum"); + + if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_FS) + seq_printf(seq, ",compress_mode=%s", "fs"); + else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER) + seq_printf(seq, ",compress_mode=%s", "user"); + + if (test_opt(sbi, COMPRESS_CACHE)) + seq_puts(seq, ",compress_cache"); +} +#endif + +static int f2fs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); + + if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC) + seq_printf(seq, ",background_gc=%s", "sync"); + else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_ON) + seq_printf(seq, ",background_gc=%s", "on"); + else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) + seq_printf(seq, ",background_gc=%s", "off"); + + if (test_opt(sbi, GC_MERGE)) + seq_puts(seq, ",gc_merge"); + + if (test_opt(sbi, DISABLE_ROLL_FORWARD)) + seq_puts(seq, ",disable_roll_forward"); + if (test_opt(sbi, NORECOVERY)) + seq_puts(seq, ",norecovery"); + if (test_opt(sbi, DISCARD)) + seq_puts(seq, ",discard"); + else + seq_puts(seq, ",nodiscard"); + if (test_opt(sbi, NOHEAP)) + seq_puts(seq, ",no_heap"); + else + seq_puts(seq, ",heap"); +#ifdef CONFIG_F2FS_FS_XATTR + if (test_opt(sbi, XATTR_USER)) + seq_puts(seq, ",user_xattr"); + else + seq_puts(seq, ",nouser_xattr"); + if (test_opt(sbi, INLINE_XATTR)) + seq_puts(seq, ",inline_xattr"); + else + seq_puts(seq, ",noinline_xattr"); + if (test_opt(sbi, INLINE_XATTR_SIZE)) + seq_printf(seq, ",inline_xattr_size=%u", + F2FS_OPTION(sbi).inline_xattr_size); +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL + if (test_opt(sbi, POSIX_ACL)) + seq_puts(seq, ",acl"); + else + seq_puts(seq, ",noacl"); +#endif + if (test_opt(sbi, DISABLE_EXT_IDENTIFY)) + seq_puts(seq, ",disable_ext_identify"); + if (test_opt(sbi, INLINE_DATA)) + seq_puts(seq, ",inline_data"); + else + seq_puts(seq, ",noinline_data"); + if (test_opt(sbi, INLINE_DENTRY)) + seq_puts(seq, ",inline_dentry"); + else + seq_puts(seq, ",noinline_dentry"); + if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) + seq_puts(seq, ",flush_merge"); + if (test_opt(sbi, NOBARRIER)) + seq_puts(seq, ",nobarrier"); + if (test_opt(sbi, FASTBOOT)) + seq_puts(seq, ",fastboot"); + if (test_opt(sbi, READ_EXTENT_CACHE)) + seq_puts(seq, ",extent_cache"); + else + seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, DATA_FLUSH)) + seq_puts(seq, ",data_flush"); + + seq_puts(seq, ",mode="); + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_ADAPTIVE) + seq_puts(seq, "adaptive"); + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS) + seq_puts(seq, "lfs"); + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG) + seq_puts(seq, "fragment:segment"); + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) + seq_puts(seq, "fragment:block"); + seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); + if (test_opt(sbi, RESERVE_ROOT)) + seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", + F2FS_OPTION(sbi).root_reserved_blocks, + from_kuid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resuid), + from_kgid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resgid)); + if (F2FS_IO_SIZE_BITS(sbi)) + seq_printf(seq, ",io_bits=%u", + F2FS_OPTION(sbi).write_io_size_bits); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (test_opt(sbi, FAULT_INJECTION)) { + seq_printf(seq, ",fault_injection=%u", + F2FS_OPTION(sbi).fault_info.inject_rate); + seq_printf(seq, ",fault_type=%u", + F2FS_OPTION(sbi).fault_info.inject_type); + } +#endif +#ifdef CONFIG_QUOTA + if (test_opt(sbi, QUOTA)) + seq_puts(seq, ",quota"); + if (test_opt(sbi, USRQUOTA)) + seq_puts(seq, ",usrquota"); + if (test_opt(sbi, GRPQUOTA)) + seq_puts(seq, ",grpquota"); + if (test_opt(sbi, PRJQUOTA)) + seq_puts(seq, ",prjquota"); +#endif + f2fs_show_quota_options(seq, sbi->sb); + + fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb); + + if (sbi->sb->s_flags & SB_INLINECRYPT) + seq_puts(seq, ",inlinecrypt"); + + if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT) + seq_printf(seq, ",alloc_mode=%s", "default"); + else if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) + seq_printf(seq, ",alloc_mode=%s", "reuse"); + + if (test_opt(sbi, DISABLE_CHECKPOINT)) + seq_printf(seq, ",checkpoint=disable:%u", + F2FS_OPTION(sbi).unusable_cap); + if (test_opt(sbi, MERGE_CHECKPOINT)) + seq_puts(seq, ",checkpoint_merge"); + else + seq_puts(seq, ",nocheckpoint_merge"); + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX) + seq_printf(seq, ",fsync_mode=%s", "posix"); + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) + seq_printf(seq, ",fsync_mode=%s", "strict"); + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_NOBARRIER) + seq_printf(seq, ",fsync_mode=%s", "nobarrier"); + +#ifdef CONFIG_F2FS_FS_COMPRESSION + f2fs_show_compress_options(seq, sbi->sb); +#endif + + if (test_opt(sbi, ATGC)) + seq_puts(seq, ",atgc"); + + if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK) + seq_printf(seq, ",discard_unit=%s", "block"); + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) + seq_printf(seq, ",discard_unit=%s", "segment"); + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) + seq_printf(seq, ",discard_unit=%s", "section"); + + if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_NORMAL) + seq_printf(seq, ",memory=%s", "normal"); + else if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW) + seq_printf(seq, ",memory=%s", "low"); + + return 0; +} + +static void default_options(struct f2fs_sb_info *sbi) +{ + /* init some FS parameters */ + if (f2fs_sb_has_readonly(sbi)) + F2FS_OPTION(sbi).active_logs = NR_CURSEG_RO_TYPE; + else + F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; + + F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; + F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); + F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); + F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; + F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE; + F2FS_OPTION(sbi).compress_ext_cnt = 0; + F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; + F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; + F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL; + + sbi->sb->s_flags &= ~SB_INLINECRYPT; + + set_opt(sbi, INLINE_XATTR); + set_opt(sbi, INLINE_DATA); + set_opt(sbi, INLINE_DENTRY); + set_opt(sbi, READ_EXTENT_CACHE); + set_opt(sbi, NOHEAP); + clear_opt(sbi, DISABLE_CHECKPOINT); + set_opt(sbi, MERGE_CHECKPOINT); + F2FS_OPTION(sbi).unusable_cap = 0; + sbi->sb->s_flags |= SB_LAZYTIME; + set_opt(sbi, FLUSH_MERGE); + if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) + set_opt(sbi, DISCARD); + if (f2fs_sb_has_blkzoned(sbi)) { + F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; + F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; + } else { + F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; + F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_BLOCK; + } + +#ifdef CONFIG_F2FS_FS_XATTR + set_opt(sbi, XATTR_USER); +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL + set_opt(sbi, POSIX_ACL); +#endif + + f2fs_build_fault_attr(sbi, 0, 0); +} + +#ifdef CONFIG_QUOTA +static int f2fs_enable_quotas(struct super_block *sb); +#endif + +static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) +{ + unsigned int s_flags = sbi->sb->s_flags; + struct cp_control cpc; + unsigned int gc_mode = sbi->gc_mode; + int err = 0; + int ret; + block_t unusable; + + if (s_flags & SB_RDONLY) { + f2fs_err(sbi, "checkpoint=disable on readonly fs"); + return -EINVAL; + } + sbi->sb->s_flags |= SB_ACTIVE; + + /* check if we need more GC first */ + unusable = f2fs_get_unusable_blocks(sbi); + if (!f2fs_disable_cp_again(sbi, unusable)) + goto skip_gc; + + f2fs_update_time(sbi, DISABLE_TIME); + + sbi->gc_mode = GC_URGENT_HIGH; + + while (!f2fs_time_over(sbi, DISABLE_TIME)) { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true, + .nr_free_secs = 1 }; + + f2fs_down_write(&sbi->gc_lock); + err = f2fs_gc(sbi, &gc_control); + if (err == -ENODATA) { + err = 0; + break; + } + if (err && err != -EAGAIN) + break; + } + + ret = sync_filesystem(sbi->sb); + if (ret || err) { + err = ret ? ret : err; + goto restore_flag; + } + + unusable = f2fs_get_unusable_blocks(sbi); + if (f2fs_disable_cp_again(sbi, unusable)) { + err = -EAGAIN; + goto restore_flag; + } + +skip_gc: + f2fs_down_write(&sbi->gc_lock); + cpc.reason = CP_PAUSE; + set_sbi_flag(sbi, SBI_CP_DISABLED); + err = f2fs_write_checkpoint(sbi, &cpc); + if (err) + goto out_unlock; + + spin_lock(&sbi->stat_lock); + sbi->unusable_block_count = unusable; + spin_unlock(&sbi->stat_lock); + +out_unlock: + f2fs_up_write(&sbi->gc_lock); +restore_flag: + sbi->gc_mode = gc_mode; + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ + return err; +} + +static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) +{ + int retry = DEFAULT_RETRY_IO_COUNT; + + /* we should flush all the data to keep data consistency */ + do { + sync_inodes_sb(sbi->sb); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); + + if (unlikely(retry < 0)) + f2fs_warn(sbi, "checkpoint=enable has some unwritten data."); + + f2fs_down_write(&sbi->gc_lock); + f2fs_dirty_to_prefree(sbi); + + clear_sbi_flag(sbi, SBI_CP_DISABLED); + set_sbi_flag(sbi, SBI_IS_DIRTY); + f2fs_up_write(&sbi->gc_lock); + + f2fs_sync_fs(sbi->sb, 1); + + /* Let's ensure there's no pending checkpoint anymore */ + f2fs_flush_ckpt_thread(sbi); +} + +static int f2fs_remount(struct super_block *sb, int *flags, char *data) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_mount_info org_mount_opt; + unsigned long old_sb_flags; + int err; + bool need_restart_gc = false, need_stop_gc = false; + bool need_restart_ckpt = false, need_stop_ckpt = false; + bool need_restart_flush = false, need_stop_flush = false; + bool need_restart_discard = false, need_stop_discard = false; + bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); + bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); + bool no_io_align = !F2FS_IO_ALIGNED(sbi); + bool no_atgc = !test_opt(sbi, ATGC); + bool no_discard = !test_opt(sbi, DISCARD); + bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); + bool block_unit_discard = f2fs_block_unit_discard(sbi); + struct discard_cmd_control *dcc; +#ifdef CONFIG_QUOTA + int i, j; +#endif + + /* + * Save the old mount options in case we + * need to restore them. + */ + org_mount_opt = sbi->mount_opt; + old_sb_flags = sb->s_flags; + +#ifdef CONFIG_QUOTA + org_mount_opt.s_jquota_fmt = F2FS_OPTION(sbi).s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (F2FS_OPTION(sbi).s_qf_names[i]) { + org_mount_opt.s_qf_names[i] = + kstrdup(F2FS_OPTION(sbi).s_qf_names[i], + GFP_KERNEL); + if (!org_mount_opt.s_qf_names[i]) { + for (j = 0; j < i; j++) + kfree(org_mount_opt.s_qf_names[j]); + return -ENOMEM; + } + } else { + org_mount_opt.s_qf_names[i] = NULL; + } + } +#endif + + /* recover superblocks we couldn't write due to previous RO mount */ + if (!(*flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + err = f2fs_commit_super(sbi, false); + f2fs_info(sbi, "Try to recover all the superblocks, ret: %d", + err); + if (!err) + clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); + } + + default_options(sbi); + + /* parse mount options */ + err = parse_options(sb, data, true); + if (err) + goto restore_opts; + + /* + * Previous and new state of filesystem is RO, + * so skip checking GC and FLUSH_MERGE conditions. + */ + if (f2fs_readonly(sb) && (*flags & SB_RDONLY)) + goto skip; + + if (f2fs_dev_is_readonly(sbi) && !(*flags & SB_RDONLY)) { + err = -EROFS; + goto restore_opts; + } + +#ifdef CONFIG_QUOTA + if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + } else if (f2fs_readonly(sb) && !(*flags & SB_RDONLY)) { + /* dquot_resume needs RW */ + sb->s_flags &= ~SB_RDONLY; + if (sb_any_quota_suspended(sb)) { + dquot_resume(sb, -1); + } else if (f2fs_sb_has_quota_ino(sbi)) { + err = f2fs_enable_quotas(sb); + if (err) + goto restore_opts; + } + } +#endif + /* disallow enable atgc dynamically */ + if (no_atgc == !!test_opt(sbi, ATGC)) { + err = -EINVAL; + f2fs_warn(sbi, "switch atgc option is not allowed"); + goto restore_opts; + } + + /* disallow enable/disable extent_cache dynamically */ + if (no_read_extent_cache == !!test_opt(sbi, READ_EXTENT_CACHE)) { + err = -EINVAL; + f2fs_warn(sbi, "switch extent_cache option is not allowed"); + goto restore_opts; + } + + if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) { + err = -EINVAL; + f2fs_warn(sbi, "switch io_bits option is not allowed"); + goto restore_opts; + } + + if (no_compress_cache == !!test_opt(sbi, COMPRESS_CACHE)) { + err = -EINVAL; + f2fs_warn(sbi, "switch compress_cache option is not allowed"); + goto restore_opts; + } + + if (block_unit_discard != f2fs_block_unit_discard(sbi)) { + err = -EINVAL; + f2fs_warn(sbi, "switch discard_unit option is not allowed"); + goto restore_opts; + } + + if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { + err = -EINVAL; + f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); + goto restore_opts; + } + + /* + * We stop the GC thread if FS is mounted as RO + * or if background_gc = off is passed in mount + * option. Also sync the filesystem. + */ + if ((*flags & SB_RDONLY) || + (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF && + !test_opt(sbi, GC_MERGE))) { + if (sbi->gc_thread) { + f2fs_stop_gc_thread(sbi); + need_restart_gc = true; + } + } else if (!sbi->gc_thread) { + err = f2fs_start_gc_thread(sbi); + if (err) + goto restore_opts; + need_stop_gc = true; + } + + if (*flags & SB_RDONLY) { + sync_inodes_sb(sb); + + set_sbi_flag(sbi, SBI_IS_DIRTY); + set_sbi_flag(sbi, SBI_IS_CLOSE); + f2fs_sync_fs(sb, 1); + clear_sbi_flag(sbi, SBI_IS_CLOSE); + } + + if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || + !test_opt(sbi, MERGE_CHECKPOINT)) { + f2fs_stop_ckpt_thread(sbi); + need_restart_ckpt = true; + } else { + /* Flush if the prevous checkpoint, if exists. */ + f2fs_flush_ckpt_thread(sbi); + + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto restore_gc; + } + need_stop_ckpt = true; + } + + /* + * We stop issue flush thread if FS is mounted as RO + * or if flush_merge is not passed in mount option. + */ + if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { + clear_opt(sbi, FLUSH_MERGE); + f2fs_destroy_flush_cmd_control(sbi, false); + need_restart_flush = true; + } else { + err = f2fs_create_flush_cmd_control(sbi); + if (err) + goto restore_ckpt; + need_stop_flush = true; + } + + if (no_discard == !!test_opt(sbi, DISCARD)) { + if (test_opt(sbi, DISCARD)) { + err = f2fs_start_discard_thread(sbi); + if (err) + goto restore_flush; + need_stop_discard = true; + } else { + dcc = SM_I(sbi)->dcc_info; + f2fs_stop_discard_thread(sbi); + if (atomic_read(&dcc->discard_cmd_cnt)) + f2fs_issue_discard_timeout(sbi); + need_restart_discard = true; + } + } + + if (enable_checkpoint == !!test_opt(sbi, DISABLE_CHECKPOINT)) { + if (test_opt(sbi, DISABLE_CHECKPOINT)) { + err = f2fs_disable_checkpoint(sbi); + if (err) + goto restore_discard; + } else { + f2fs_enable_checkpoint(sbi); + } + } + +skip: +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + kfree(org_mount_opt.s_qf_names[i]); +#endif + /* Update the POSIXACL Flag */ + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); + + limit_reserve_root(sbi); + adjust_unusable_cap_perc(sbi); + *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); + return 0; +restore_discard: + if (need_restart_discard) { + if (f2fs_start_discard_thread(sbi)) + f2fs_warn(sbi, "discard has been stopped"); + } else if (need_stop_discard) { + f2fs_stop_discard_thread(sbi); + } +restore_flush: + if (need_restart_flush) { + if (f2fs_create_flush_cmd_control(sbi)) + f2fs_warn(sbi, "background flush thread has stopped"); + } else if (need_stop_flush) { + clear_opt(sbi, FLUSH_MERGE); + f2fs_destroy_flush_cmd_control(sbi, false); + } +restore_ckpt: + if (need_restart_ckpt) { + if (f2fs_start_ckpt_thread(sbi)) + f2fs_warn(sbi, "background ckpt thread has stopped"); + } else if (need_stop_ckpt) { + f2fs_stop_ckpt_thread(sbi); + } +restore_gc: + if (need_restart_gc) { + if (f2fs_start_gc_thread(sbi)) + f2fs_warn(sbi, "background gc thread has stopped"); + } else if (need_stop_gc) { + f2fs_stop_gc_thread(sbi); + } +restore_opts: +#ifdef CONFIG_QUOTA + F2FS_OPTION(sbi).s_jquota_fmt = org_mount_opt.s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + kfree(F2FS_OPTION(sbi).s_qf_names[i]); + F2FS_OPTION(sbi).s_qf_names[i] = org_mount_opt.s_qf_names[i]; + } +#endif + sbi->mount_opt = org_mount_opt; + sb->s_flags = old_sb_flags; + return err; +} + +#ifdef CONFIG_QUOTA +/* Read data from quotafile */ +static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + block_t blkidx = F2FS_BYTES_TO_BLK(off); + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + loff_t i_size = i_size_read(inode); + struct page *page; + + if (off > i_size) + return 0; + + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); +repeat: + page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS); + if (IS_ERR(page)) { + if (PTR_ERR(page) == -ENOMEM) { + memalloc_retry_wait(GFP_NOFS); + goto repeat; + } + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); + return PTR_ERR(page); + } + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); + return -EIO; + } + + memcpy_from_page(data, page, offset, tocopy); + f2fs_put_page(page, 1); + + offset = 0; + toread -= tocopy; + data += tocopy; + blkidx++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t f2fs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + int offset = off & (sb->s_blocksize - 1); + size_t towrite = len; + struct page *page; + void *fsdata = NULL; + int err = 0; + int tocopy; + + while (towrite > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, + towrite); +retry: + err = a_ops->write_begin(NULL, mapping, off, tocopy, + &page, &fsdata); + if (unlikely(err)) { + if (err == -ENOMEM) { + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + goto retry; + } + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); + break; + } + + memcpy_to_page(page, offset, data, tocopy); + + a_ops->write_end(NULL, mapping, off, tocopy, tocopy, + page, fsdata); + offset = 0; + towrite -= tocopy; + off += tocopy; + data += tocopy; + cond_resched(); + } + + if (len == towrite) + return err; + inode->i_mtime = inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, false); + return len - towrite; +} + +int f2fs_dquot_initialize(struct inode *inode) +{ + if (time_to_inject(F2FS_I_SB(inode), FAULT_DQUOT_INIT)) { + f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_DQUOT_INIT); + return -ESRCH; + } + + return dquot_initialize(inode); +} + +static struct dquot **f2fs_get_dquots(struct inode *inode) +{ + return F2FS_I(inode)->i_dquot; +} + +static qsize_t *f2fs_get_reserved_space(struct inode *inode) +{ + return &F2FS_I(inode)->i_reserved_quota; +} + +static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) +{ + if (is_set_ckpt_flags(sbi, CP_QUOTA_NEED_FSCK_FLAG)) { + f2fs_err(sbi, "quota sysfile may be corrupted, skip loading it"); + return 0; + } + + return dquot_quota_on_mount(sbi->sb, F2FS_OPTION(sbi).s_qf_names[type], + F2FS_OPTION(sbi).s_jquota_fmt, type); +} + +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) +{ + int enabled = 0; + int i, err; + + if (f2fs_sb_has_quota_ino(sbi) && rdonly) { + err = f2fs_enable_quotas(sbi->sb); + if (err) { + f2fs_err(sbi, "Cannot turn on quota_ino: %d", err); + return 0; + } + return 1; + } + + for (i = 0; i < MAXQUOTAS; i++) { + if (F2FS_OPTION(sbi).s_qf_names[i]) { + err = f2fs_quota_on_mount(sbi, i); + if (!err) { + enabled = 1; + continue; + } + f2fs_err(sbi, "Cannot turn on quotas: %d on %d", + err, i); + } + } + return enabled; +} + +static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, + unsigned int flags) +{ + struct inode *qf_inode; + unsigned long qf_inum; + int err; + + BUG_ON(!f2fs_sb_has_quota_ino(F2FS_SB(sb))); + + qf_inum = f2fs_qf_ino(sb, type); + if (!qf_inum) + return -EPERM; + + qf_inode = f2fs_iget(sb, qf_inum); + if (IS_ERR(qf_inode)) { + f2fs_err(F2FS_SB(sb), "Bad quota inode %u:%lu", type, qf_inum); + return PTR_ERR(qf_inode); + } + + /* Don't account quota for quota files to avoid recursion */ + qf_inode->i_flags |= S_NOQUOTA; + err = dquot_load_quota_inode(qf_inode, type, format_id, flags); + iput(qf_inode); + return err; +} + +static int f2fs_enable_quotas(struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int type, err = 0; + unsigned long qf_inum; + bool quota_mopt[MAXQUOTAS] = { + test_opt(sbi, USRQUOTA), + test_opt(sbi, GRPQUOTA), + test_opt(sbi, PRJQUOTA), + }; + + if (is_set_ckpt_flags(F2FS_SB(sb), CP_QUOTA_NEED_FSCK_FLAG)) { + f2fs_err(sbi, "quota file may be corrupted, skip loading it"); + return 0; + } + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + + for (type = 0; type < MAXQUOTAS; type++) { + qf_inum = f2fs_qf_ino(sb, type); + if (qf_inum) { + err = f2fs_quota_enable(sb, type, QFMT_VFS_V1, + DQUOT_USAGE_ENABLED | + (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); + if (err) { + f2fs_err(sbi, "Failed to enable quota tracking (type=%d, err=%d). Please run fsck to fix.", + type, err); + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + set_sbi_flag(F2FS_SB(sb), + SBI_QUOTA_NEED_REPAIR); + return err; + } + } + } + return 0; +} + +static int f2fs_quota_sync_file(struct f2fs_sb_info *sbi, int type) +{ + struct quota_info *dqopt = sb_dqopt(sbi->sb); + struct address_space *mapping = dqopt->files[type]->i_mapping; + int ret = 0; + + ret = dquot_writeback_dquots(sbi->sb, type); + if (ret) + goto out; + + ret = filemap_fdatawrite(mapping); + if (ret) + goto out; + + /* if we are using journalled quota */ + if (is_journalled_quota(sbi)) + goto out; + + ret = filemap_fdatawait(mapping); + + truncate_inode_pages(&dqopt->files[type]->i_data, 0); +out: + if (ret) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + return ret; +} + +int f2fs_quota_sync(struct super_block *sb, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; + int ret = 0; + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. + */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + + if (type != -1 && cnt != type) + continue; + + if (!sb_has_quota_active(sb, cnt)) + continue; + + if (!f2fs_sb_has_quota_ino(sbi)) + inode_lock(dqopt->files[cnt]); + + /* + * do_quotactl + * f2fs_quota_sync + * f2fs_down_read(quota_sem) + * dquot_writeback_dquots() + * f2fs_dquot_commit + * block_operation + * f2fs_down_read(quota_sem) + */ + f2fs_lock_op(sbi); + f2fs_down_read(&sbi->quota_sem); + + ret = f2fs_quota_sync_file(sbi, cnt); + + f2fs_up_read(&sbi->quota_sem); + f2fs_unlock_op(sbi); + + if (!f2fs_sb_has_quota_ino(sbi)) + inode_unlock(dqopt->files[cnt]); + + if (ret) + break; + } + return ret; +} + +static int f2fs_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path) +{ + struct inode *inode; + int err; + + /* if quota sysfile exists, deny enabling quota with specific file */ + if (f2fs_sb_has_quota_ino(F2FS_SB(sb))) { + f2fs_err(F2FS_SB(sb), "quota sysfile already exists"); + return -EBUSY; + } + + err = f2fs_quota_sync(sb, type); + if (err) + return err; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; + + inode = d_inode(path->dentry); + + inode_lock(inode); + F2FS_I(inode)->i_flags |= F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL; + f2fs_set_inode_flags(inode); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); + + return 0; +} + +static int __f2fs_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + int err; + + if (!inode || !igrab(inode)) + return dquot_quota_off(sb, type); + + err = f2fs_quota_sync(sb, type); + if (err) + goto out_put; + + err = dquot_quota_off(sb, type); + if (err || f2fs_sb_has_quota_ino(F2FS_SB(sb))) + goto out_put; + + inode_lock(inode); + F2FS_I(inode)->i_flags &= ~(F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL); + f2fs_set_inode_flags(inode); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); +out_put: + iput(inode); + return err; +} + +static int f2fs_quota_off(struct super_block *sb, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err; + + err = __f2fs_quota_off(sb, type); + + /* + * quotactl can shutdown journalled quota, result in inconsistence + * between quota record and fs data by following updates, tag the + * flag to let fsck be aware of it. + */ + if (is_journalled_quota(sbi)) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + return err; +} + +void f2fs_quota_off_umount(struct super_block *sb) +{ + int type; + int err; + + for (type = 0; type < MAXQUOTAS; type++) { + err = __f2fs_quota_off(sb, type); + if (err) { + int ret = dquot_quota_off(sb, type); + + f2fs_err(F2FS_SB(sb), "Fail to turn off disk quota (type: %d, err: %d, ret:%d), Please run fsck to fix it.", + type, err, ret); + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); + } + } + /* + * In case of checkpoint=disable, we must flush quota blocks. + * This can cause NULL exception for node_inode in end_io, since + * put_super already dropped it. + */ + sync_filesystem(sb); +} + +static void f2fs_truncate_quota_inode_pages(struct super_block *sb) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int type; + + for (type = 0; type < MAXQUOTAS; type++) { + if (!dqopt->files[type]) + continue; + f2fs_inode_synced(dqopt->files[type]); + } +} + +static int f2fs_dquot_commit(struct dquot *dquot) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); + int ret; + + f2fs_down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING); + ret = dquot_commit(dquot); + if (ret < 0) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + f2fs_up_read(&sbi->quota_sem); + return ret; +} + +static int f2fs_dquot_acquire(struct dquot *dquot) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); + int ret; + + f2fs_down_read(&sbi->quota_sem); + ret = dquot_acquire(dquot); + if (ret < 0) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + f2fs_up_read(&sbi->quota_sem); + return ret; +} + +static int f2fs_dquot_release(struct dquot *dquot) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); + int ret = dquot_release(dquot); + + if (ret < 0) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + return ret; +} + +static int f2fs_dquot_mark_dquot_dirty(struct dquot *dquot) +{ + struct super_block *sb = dquot->dq_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int ret = dquot_mark_dquot_dirty(dquot); + + /* if we are using journalled quota */ + if (is_journalled_quota(sbi)) + set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH); + + return ret; +} + +static int f2fs_dquot_commit_info(struct super_block *sb, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int ret = dquot_commit_info(sb, type); + + if (ret < 0) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + return ret; +} + +static int f2fs_get_projid(struct inode *inode, kprojid_t *projid) +{ + *projid = F2FS_I(inode)->i_projid; + return 0; +} + +static const struct dquot_operations f2fs_quota_operations = { + .get_reserved_space = f2fs_get_reserved_space, + .write_dquot = f2fs_dquot_commit, + .acquire_dquot = f2fs_dquot_acquire, + .release_dquot = f2fs_dquot_release, + .mark_dirty = f2fs_dquot_mark_dquot_dirty, + .write_info = f2fs_dquot_commit_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, + .get_projid = f2fs_get_projid, + .get_next_id = dquot_get_next_id, +}; + +static const struct quotactl_ops f2fs_quotactl_ops = { + .quota_on = f2fs_quota_on, + .quota_off = f2fs_quota_off, + .quota_sync = f2fs_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, + .get_nextdqblk = dquot_get_next_dqblk, +}; +#else +int f2fs_dquot_initialize(struct inode *inode) +{ + return 0; +} + +int f2fs_quota_sync(struct super_block *sb, int type) +{ + return 0; +} + +void f2fs_quota_off_umount(struct super_block *sb) +{ +} +#endif + +static const struct super_operations f2fs_sops = { + .alloc_inode = f2fs_alloc_inode, + .free_inode = f2fs_free_inode, + .drop_inode = f2fs_drop_inode, + .write_inode = f2fs_write_inode, + .dirty_inode = f2fs_dirty_inode, + .show_options = f2fs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = f2fs_quota_read, + .quota_write = f2fs_quota_write, + .get_dquots = f2fs_get_dquots, +#endif + .evict_inode = f2fs_evict_inode, + .put_super = f2fs_put_super, + .sync_fs = f2fs_sync_fs, + .freeze_fs = f2fs_freeze, + .unfreeze_fs = f2fs_unfreeze, + .statfs = f2fs_statfs, + .remount_fs = f2fs_remount, +}; + +#ifdef CONFIG_FS_ENCRYPTION +static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) +{ + return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, NULL); +} + +static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, + void *fs_data) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* + * Encrypting the root directory is not allowed because fsck + * expects lost+found directory to exist and remain unencrypted + * if LOST_FOUND feature is enabled. + * + */ + if (f2fs_sb_has_lost_found(sbi) && + inode->i_ino == F2FS_ROOT_INO(sbi)) + return -EPERM; + + return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, fs_data, XATTR_CREATE); +} + +static const union fscrypt_policy *f2fs_get_dummy_policy(struct super_block *sb) +{ + return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_policy.policy; +} + +static bool f2fs_has_stable_inodes(struct super_block *sb) +{ + return true; +} + +static void f2fs_get_ino_and_lblk_bits(struct super_block *sb, + int *ino_bits_ret, int *lblk_bits_ret) +{ + *ino_bits_ret = 8 * sizeof(nid_t); + *lblk_bits_ret = 8 * sizeof(block_t); +} + +static struct block_device **f2fs_get_devices(struct super_block *sb, + unsigned int *num_devs) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct block_device **devs; + int i; + + if (!f2fs_is_multi_device(sbi)) + return NULL; + + devs = kmalloc_array(sbi->s_ndevs, sizeof(*devs), GFP_KERNEL); + if (!devs) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < sbi->s_ndevs; i++) + devs[i] = FDEV(i).bdev; + *num_devs = sbi->s_ndevs; + return devs; +} + +static const struct fscrypt_operations f2fs_cryptops = { + .key_prefix = "f2fs:", + .get_context = f2fs_get_context, + .set_context = f2fs_set_context, + .get_dummy_policy = f2fs_get_dummy_policy, + .empty_dir = f2fs_empty_dir, + .has_stable_inodes = f2fs_has_stable_inodes, + .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits, + .get_devices = f2fs_get_devices, +}; +#endif + +static struct inode *f2fs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *inode; + + if (f2fs_check_nid_range(sbi, ino)) + return ERR_PTR(-ESTALE); + + /* + * f2fs_iget isn't quite right if the inode is currently unallocated! + * However f2fs_iget currently does appropriate checks to handle stale + * inodes so everything is OK. + */ + inode = f2fs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (unlikely(generation && inode->i_generation != generation)) { + /* we didn't find the right inode.. */ + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *f2fs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + f2fs_nfs_get_inode); +} + +static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + f2fs_nfs_get_inode); +} + +static const struct export_operations f2fs_export_ops = { + .fh_to_dentry = f2fs_fh_to_dentry, + .fh_to_parent = f2fs_fh_to_parent, + .get_parent = f2fs_get_parent, +}; + +loff_t max_file_blocks(struct inode *inode) +{ + loff_t result = 0; + loff_t leaf_count; + + /* + * note: previously, result is equal to (DEF_ADDRS_PER_INODE - + * DEFAULT_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * space in inode.i_addr, it will be more safe to reassign + * result as zero. + */ + + if (inode && f2fs_compressed_file(inode)) + leaf_count = ADDRS_PER_BLOCK(inode); + else + leaf_count = DEF_ADDRS_PER_BLOCK; + + /* two direct node blocks */ + result += (leaf_count * 2); + + /* two indirect node blocks */ + leaf_count *= NIDS_PER_BLOCK; + result += (leaf_count * 2); + + /* one double indirect node block */ + leaf_count *= NIDS_PER_BLOCK; + result += leaf_count; + + return result; +} + +static int __f2fs_commit_super(struct buffer_head *bh, + struct f2fs_super_block *super) +{ + lock_buffer(bh); + if (super) + memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); + set_buffer_dirty(bh); + unlock_buffer(bh); + + /* it's rare case, we can do fua all the time */ + return __sync_dirty_buffer(bh, REQ_SYNC | REQ_PREFLUSH | REQ_FUA); +} + +static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, + struct buffer_head *bh) +{ + struct f2fs_super_block *raw_super = (struct f2fs_super_block *) + (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; + u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); + u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr); + u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr); + u32 nat_blkaddr = le32_to_cpu(raw_super->nat_blkaddr); + u32 ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + u32 main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); + u32 segment_count_ckpt = le32_to_cpu(raw_super->segment_count_ckpt); + u32 segment_count_sit = le32_to_cpu(raw_super->segment_count_sit); + u32 segment_count_nat = le32_to_cpu(raw_super->segment_count_nat); + u32 segment_count_ssa = le32_to_cpu(raw_super->segment_count_ssa); + u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main); + u32 segment_count = le32_to_cpu(raw_super->segment_count); + u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + u64 main_end_blkaddr = main_blkaddr + + (segment_count_main << log_blocks_per_seg); + u64 seg_end_blkaddr = segment0_blkaddr + + (segment_count << log_blocks_per_seg); + + if (segment0_blkaddr != cp_blkaddr) { + f2fs_info(sbi, "Mismatch start address, segment0(%u) cp_blkaddr(%u)", + segment0_blkaddr, cp_blkaddr); + return true; + } + + if (cp_blkaddr + (segment_count_ckpt << log_blocks_per_seg) != + sit_blkaddr) { + f2fs_info(sbi, "Wrong CP boundary, start(%u) end(%u) blocks(%u)", + cp_blkaddr, sit_blkaddr, + segment_count_ckpt << log_blocks_per_seg); + return true; + } + + if (sit_blkaddr + (segment_count_sit << log_blocks_per_seg) != + nat_blkaddr) { + f2fs_info(sbi, "Wrong SIT boundary, start(%u) end(%u) blocks(%u)", + sit_blkaddr, nat_blkaddr, + segment_count_sit << log_blocks_per_seg); + return true; + } + + if (nat_blkaddr + (segment_count_nat << log_blocks_per_seg) != + ssa_blkaddr) { + f2fs_info(sbi, "Wrong NAT boundary, start(%u) end(%u) blocks(%u)", + nat_blkaddr, ssa_blkaddr, + segment_count_nat << log_blocks_per_seg); + return true; + } + + if (ssa_blkaddr + (segment_count_ssa << log_blocks_per_seg) != + main_blkaddr) { + f2fs_info(sbi, "Wrong SSA boundary, start(%u) end(%u) blocks(%u)", + ssa_blkaddr, main_blkaddr, + segment_count_ssa << log_blocks_per_seg); + return true; + } + + if (main_end_blkaddr > seg_end_blkaddr) { + f2fs_info(sbi, "Wrong MAIN_AREA boundary, start(%u) end(%llu) block(%u)", + main_blkaddr, seg_end_blkaddr, + segment_count_main << log_blocks_per_seg); + return true; + } else if (main_end_blkaddr < seg_end_blkaddr) { + int err = 0; + char *res; + + /* fix in-memory information all the time */ + raw_super->segment_count = cpu_to_le32((main_end_blkaddr - + segment0_blkaddr) >> log_blocks_per_seg); + + if (f2fs_readonly(sb) || bdev_read_only(sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + res = "internally"; + } else { + err = __f2fs_commit_super(bh, NULL); + res = err ? "failed" : "done"; + } + f2fs_info(sbi, "Fix alignment : %s, start(%u) end(%llu) block(%u)", + res, main_blkaddr, seg_end_blkaddr, + segment_count_main << log_blocks_per_seg); + if (err) + return true; + } + return false; +} + +static int sanity_check_raw_super(struct f2fs_sb_info *sbi, + struct buffer_head *bh) +{ + block_t segment_count, segs_per_sec, secs_per_zone, segment_count_main; + block_t total_sections, blocks_per_seg; + struct f2fs_super_block *raw_super = (struct f2fs_super_block *) + (bh->b_data + F2FS_SUPER_OFFSET); + size_t crc_offset = 0; + __u32 crc = 0; + + if (le32_to_cpu(raw_super->magic) != F2FS_SUPER_MAGIC) { + f2fs_info(sbi, "Magic Mismatch, valid(0x%x) - read(0x%x)", + F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic)); + return -EINVAL; + } + + /* Check checksum_offset and crc in superblock */ + if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_SB_CHKSUM)) { + crc_offset = le32_to_cpu(raw_super->checksum_offset); + if (crc_offset != + offsetof(struct f2fs_super_block, crc)) { + f2fs_info(sbi, "Invalid SB checksum offset: %zu", + crc_offset); + return -EFSCORRUPTED; + } + crc = le32_to_cpu(raw_super->crc); + if (!f2fs_crc_valid(sbi, crc, raw_super, crc_offset)) { + f2fs_info(sbi, "Invalid SB checksum value: %u", crc); + return -EFSCORRUPTED; + } + } + + /* Currently, support only 4KB block size */ + if (le32_to_cpu(raw_super->log_blocksize) != F2FS_BLKSIZE_BITS) { + f2fs_info(sbi, "Invalid log_blocksize (%u), supports only %u", + le32_to_cpu(raw_super->log_blocksize), + F2FS_BLKSIZE_BITS); + return -EFSCORRUPTED; + } + + /* check log blocks per segment */ + if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) { + f2fs_info(sbi, "Invalid log blocks per segment (%u)", + le32_to_cpu(raw_super->log_blocks_per_seg)); + return -EFSCORRUPTED; + } + + /* Currently, support 512/1024/2048/4096 bytes sector size */ + if (le32_to_cpu(raw_super->log_sectorsize) > + F2FS_MAX_LOG_SECTOR_SIZE || + le32_to_cpu(raw_super->log_sectorsize) < + F2FS_MIN_LOG_SECTOR_SIZE) { + f2fs_info(sbi, "Invalid log sectorsize (%u)", + le32_to_cpu(raw_super->log_sectorsize)); + return -EFSCORRUPTED; + } + if (le32_to_cpu(raw_super->log_sectors_per_block) + + le32_to_cpu(raw_super->log_sectorsize) != + F2FS_MAX_LOG_SECTOR_SIZE) { + f2fs_info(sbi, "Invalid log sectors per block(%u) log sectorsize(%u)", + le32_to_cpu(raw_super->log_sectors_per_block), + le32_to_cpu(raw_super->log_sectorsize)); + return -EFSCORRUPTED; + } + + segment_count = le32_to_cpu(raw_super->segment_count); + segment_count_main = le32_to_cpu(raw_super->segment_count_main); + segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); + secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); + total_sections = le32_to_cpu(raw_super->section_count); + + /* blocks_per_seg should be 512, given the above check */ + blocks_per_seg = BIT(le32_to_cpu(raw_super->log_blocks_per_seg)); + + if (segment_count > F2FS_MAX_SEGMENT || + segment_count < F2FS_MIN_SEGMENTS) { + f2fs_info(sbi, "Invalid segment count (%u)", segment_count); + return -EFSCORRUPTED; + } + + if (total_sections > segment_count_main || total_sections < 1 || + segs_per_sec > segment_count || !segs_per_sec) { + f2fs_info(sbi, "Invalid segment/section count (%u, %u x %u)", + segment_count, total_sections, segs_per_sec); + return -EFSCORRUPTED; + } + + if (segment_count_main != total_sections * segs_per_sec) { + f2fs_info(sbi, "Invalid segment/section count (%u != %u * %u)", + segment_count_main, total_sections, segs_per_sec); + return -EFSCORRUPTED; + } + + if ((segment_count / segs_per_sec) < total_sections) { + f2fs_info(sbi, "Small segment_count (%u < %u * %u)", + segment_count, segs_per_sec, total_sections); + return -EFSCORRUPTED; + } + + if (segment_count > (le64_to_cpu(raw_super->block_count) >> 9)) { + f2fs_info(sbi, "Wrong segment_count / block_count (%u > %llu)", + segment_count, le64_to_cpu(raw_super->block_count)); + return -EFSCORRUPTED; + } + + if (RDEV(0).path[0]) { + block_t dev_seg_count = le32_to_cpu(RDEV(0).total_segments); + int i = 1; + + while (i < MAX_DEVICES && RDEV(i).path[0]) { + dev_seg_count += le32_to_cpu(RDEV(i).total_segments); + i++; + } + if (segment_count != dev_seg_count) { + f2fs_info(sbi, "Segment count (%u) mismatch with total segments from devices (%u)", + segment_count, dev_seg_count); + return -EFSCORRUPTED; + } + } else { + if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_BLKZONED) && + !bdev_is_zoned(sbi->sb->s_bdev)) { + f2fs_info(sbi, "Zoned block device path is missing"); + return -EFSCORRUPTED; + } + } + + if (secs_per_zone > total_sections || !secs_per_zone) { + f2fs_info(sbi, "Wrong secs_per_zone / total_sections (%u, %u)", + secs_per_zone, total_sections); + return -EFSCORRUPTED; + } + if (le32_to_cpu(raw_super->extension_count) > F2FS_MAX_EXTENSION || + raw_super->hot_ext_count > F2FS_MAX_EXTENSION || + (le32_to_cpu(raw_super->extension_count) + + raw_super->hot_ext_count) > F2FS_MAX_EXTENSION) { + f2fs_info(sbi, "Corrupted extension count (%u + %u > %u)", + le32_to_cpu(raw_super->extension_count), + raw_super->hot_ext_count, + F2FS_MAX_EXTENSION); + return -EFSCORRUPTED; + } + + if (le32_to_cpu(raw_super->cp_payload) >= + (blocks_per_seg - F2FS_CP_PACKS - + NR_CURSEG_PERSIST_TYPE)) { + f2fs_info(sbi, "Insane cp_payload (%u >= %u)", + le32_to_cpu(raw_super->cp_payload), + blocks_per_seg - F2FS_CP_PACKS - + NR_CURSEG_PERSIST_TYPE); + return -EFSCORRUPTED; + } + + /* check reserved ino info */ + if (le32_to_cpu(raw_super->node_ino) != 1 || + le32_to_cpu(raw_super->meta_ino) != 2 || + le32_to_cpu(raw_super->root_ino) != 3) { + f2fs_info(sbi, "Invalid Fs Meta Ino: node(%u) meta(%u) root(%u)", + le32_to_cpu(raw_super->node_ino), + le32_to_cpu(raw_super->meta_ino), + le32_to_cpu(raw_super->root_ino)); + return -EFSCORRUPTED; + } + + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ + if (sanity_check_area_boundary(sbi, bh)) + return -EFSCORRUPTED; + + return 0; +} + +int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) +{ + unsigned int total, fsmeta; + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int ovp_segments, reserved_segments; + unsigned int main_segs, blocks_per_seg; + unsigned int sit_segs, nat_segs; + unsigned int sit_bitmap_size, nat_bitmap_size; + unsigned int log_blocks_per_seg; + unsigned int segment_count_main; + unsigned int cp_pack_start_sum, cp_payload; + block_t user_block_count, valid_user_blocks; + block_t avail_node_count, valid_node_count; + unsigned int nat_blocks, nat_bits_bytes, nat_bits_blocks; + int i, j; + + total = le32_to_cpu(raw_super->segment_count); + fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); + sit_segs = le32_to_cpu(raw_super->segment_count_sit); + fsmeta += sit_segs; + nat_segs = le32_to_cpu(raw_super->segment_count_nat); + fsmeta += nat_segs; + fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); + fsmeta += le32_to_cpu(raw_super->segment_count_ssa); + + if (unlikely(fsmeta >= total)) + return 1; + + ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); + reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); + + if (!f2fs_sb_has_readonly(sbi) && + unlikely(fsmeta < F2FS_MIN_META_SEGMENTS || + ovp_segments == 0 || reserved_segments == 0)) { + f2fs_err(sbi, "Wrong layout: check mkfs.f2fs version"); + return 1; + } + user_block_count = le64_to_cpu(ckpt->user_block_count); + segment_count_main = le32_to_cpu(raw_super->segment_count_main) + + (f2fs_sb_has_readonly(sbi) ? 1 : 0); + log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + if (!user_block_count || user_block_count >= + segment_count_main << log_blocks_per_seg) { + f2fs_err(sbi, "Wrong user_block_count: %u", + user_block_count); + return 1; + } + + valid_user_blocks = le64_to_cpu(ckpt->valid_block_count); + if (valid_user_blocks > user_block_count) { + f2fs_err(sbi, "Wrong valid_user_blocks: %u, user_block_count: %u", + valid_user_blocks, user_block_count); + return 1; + } + + valid_node_count = le32_to_cpu(ckpt->valid_node_count); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + if (valid_node_count > avail_node_count) { + f2fs_err(sbi, "Wrong valid_node_count: %u, avail_node_count: %u", + valid_node_count, avail_node_count); + return 1; + } + + main_segs = le32_to_cpu(raw_super->segment_count_main); + blocks_per_seg = sbi->blocks_per_seg; + + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) + return 1; + + if (f2fs_sb_has_readonly(sbi)) + goto check_data; + + for (j = i + 1; j < NR_CURSEG_NODE_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) == + le32_to_cpu(ckpt->cur_node_segno[j])) { + f2fs_err(sbi, "Node segment (%u, %u) has the same segno: %u", + i, j, + le32_to_cpu(ckpt->cur_node_segno[i])); + return 1; + } + } + } +check_data: + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) + return 1; + + if (f2fs_sb_has_readonly(sbi)) + goto skip_cross; + + for (j = i + 1; j < NR_CURSEG_DATA_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) == + le32_to_cpu(ckpt->cur_data_segno[j])) { + f2fs_err(sbi, "Data segment (%u, %u) has the same segno: %u", + i, j, + le32_to_cpu(ckpt->cur_data_segno[i])); + return 1; + } + } + } + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + for (j = 0; j < NR_CURSEG_DATA_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) == + le32_to_cpu(ckpt->cur_data_segno[j])) { + f2fs_err(sbi, "Node segment (%u) and Data segment (%u) has the same segno: %u", + i, j, + le32_to_cpu(ckpt->cur_node_segno[i])); + return 1; + } + } + } +skip_cross: + sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); + nat_bitmap_size = le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); + + if (sit_bitmap_size != ((sit_segs / 2) << log_blocks_per_seg) / 8 || + nat_bitmap_size != ((nat_segs / 2) << log_blocks_per_seg) / 8) { + f2fs_err(sbi, "Wrong bitmap size: sit: %u, nat:%u", + sit_bitmap_size, nat_bitmap_size); + return 1; + } + + cp_pack_start_sum = __start_sum_addr(sbi); + cp_payload = __cp_payload(sbi); + if (cp_pack_start_sum < cp_payload + 1 || + cp_pack_start_sum > blocks_per_seg - 1 - + NR_CURSEG_PERSIST_TYPE) { + f2fs_err(sbi, "Wrong cp_pack_start_sum: %u", + cp_pack_start_sum); + return 1; + } + + if (__is_set_ckpt_flags(ckpt, CP_LARGE_NAT_BITMAP_FLAG) && + le32_to_cpu(ckpt->checksum_offset) != CP_MIN_CHKSUM_OFFSET) { + f2fs_warn(sbi, "using deprecated layout of large_nat_bitmap, " + "please run fsck v1.13.0 or higher to repair, chksum_offset: %u, " + "fixed with patch: \"f2fs-tools: relocate chksum_offset for large_nat_bitmap feature\"", + le32_to_cpu(ckpt->checksum_offset)); + return 1; + } + + nat_blocks = nat_segs << log_blocks_per_seg; + nat_bits_bytes = nat_blocks / BITS_PER_BYTE; + nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); + if (__is_set_ckpt_flags(ckpt, CP_NAT_BITS_FLAG) && + (cp_payload + F2FS_CP_PACKS + + NR_CURSEG_PERSIST_TYPE + nat_bits_blocks >= blocks_per_seg)) { + f2fs_warn(sbi, "Insane cp_payload: %u, nat_bits_blocks: %u)", + cp_payload, nat_bits_blocks); + return 1; + } + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_err(sbi, "A bug case: need to run fsck"); + return 1; + } + return 0; +} + +static void init_sb_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = sbi->raw_super; + int i; + + sbi->log_sectors_per_block = + le32_to_cpu(raw_super->log_sectors_per_block); + sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize); + sbi->blocksize = BIT(sbi->log_blocksize); + sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + sbi->blocks_per_seg = BIT(sbi->log_blocks_per_seg); + sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); + sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); + sbi->total_sections = le32_to_cpu(raw_super->section_count); + sbi->total_node_count = + (le32_to_cpu(raw_super->segment_count_nat) / 2) + * sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK; + F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino); + F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino); + F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino); + sbi->cur_victim_sec = NULL_SECNO; + sbi->gc_mode = GC_NORMAL; + sbi->next_victim_seg[BG_GC] = NULL_SEGNO; + sbi->next_victim_seg[FG_GC] = NULL_SEGNO; + sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; + sbi->migration_granularity = sbi->segs_per_sec; + sbi->seq_file_ra_mul = MIN_RA_MUL; + sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; + sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; + spin_lock_init(&sbi->gc_urgent_high_lock); + atomic64_set(&sbi->current_atomic_write, 0); + + sbi->dir_level = DEF_DIR_LEVEL; + sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; + sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; + sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL; + sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL; + sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL; + sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] = + DEF_UMOUNT_DISCARD_TIMEOUT; + clear_sbi_flag(sbi, SBI_NEED_FSCK); + + for (i = 0; i < NR_COUNT_TYPE; i++) + atomic_set(&sbi->nr_pages[i], 0); + + for (i = 0; i < META; i++) + atomic_set(&sbi->wb_sync_req[i], 0); + + INIT_LIST_HEAD(&sbi->s_list); + mutex_init(&sbi->umount_mutex); + init_f2fs_rwsem(&sbi->io_order_lock); + spin_lock_init(&sbi->cp_lock); + + sbi->dirty_device = 0; + spin_lock_init(&sbi->dev_lock); + + init_f2fs_rwsem(&sbi->sb_lock); + init_f2fs_rwsem(&sbi->pin_sem); +} + +static int init_percpu_info(struct f2fs_sb_info *sbi) +{ + int err; + + err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); + if (err) + return err; + + err = percpu_counter_init(&sbi->rf_node_block_count, 0, GFP_KERNEL); + if (err) + goto err_valid_block; + + err = percpu_counter_init(&sbi->total_valid_inode_count, 0, + GFP_KERNEL); + if (err) + goto err_node_block; + return 0; + +err_node_block: + percpu_counter_destroy(&sbi->rf_node_block_count); +err_valid_block: + percpu_counter_destroy(&sbi->alloc_valid_block_count); + return err; +} + +#ifdef CONFIG_BLK_DEV_ZONED + +struct f2fs_report_zones_args { + struct f2fs_sb_info *sbi; + struct f2fs_dev_info *dev; +}; + +static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct f2fs_report_zones_args *rz_args = data; + block_t unusable_blocks = (zone->len - zone->capacity) >> + F2FS_LOG_SECTORS_PER_BLOCK; + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return 0; + + set_bit(idx, rz_args->dev->blkz_seq); + if (!rz_args->sbi->unusable_blocks_per_sec) { + rz_args->sbi->unusable_blocks_per_sec = unusable_blocks; + return 0; + } + if (rz_args->sbi->unusable_blocks_per_sec != unusable_blocks) { + f2fs_err(rz_args->sbi, "F2FS supports single zone capacity\n"); + return -EINVAL; + } + return 0; +} + +static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) +{ + struct block_device *bdev = FDEV(devi).bdev; + sector_t nr_sectors = bdev_nr_sectors(bdev); + struct f2fs_report_zones_args rep_zone_arg; + u64 zone_sectors; + int ret; + + if (!f2fs_sb_has_blkzoned(sbi)) + return 0; + + zone_sectors = bdev_zone_sectors(bdev); + if (!is_power_of_2(zone_sectors)) { + f2fs_err(sbi, "F2FS does not support non power of 2 zone sizes\n"); + return -EINVAL; + } + + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != + SECTOR_TO_BLOCK(zone_sectors)) + return -EINVAL; + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(zone_sectors); + if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != + __ilog2_u32(sbi->blocks_per_blkz)) + return -EINVAL; + sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); + FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; + if (nr_sectors & (zone_sectors - 1)) + FDEV(devi).nr_blkz++; + + FDEV(devi).blkz_seq = f2fs_kvzalloc(sbi, + BITS_TO_LONGS(FDEV(devi).nr_blkz) + * sizeof(unsigned long), + GFP_KERNEL); + if (!FDEV(devi).blkz_seq) + return -ENOMEM; + + rep_zone_arg.sbi = sbi; + rep_zone_arg.dev = &FDEV(devi); + + ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, f2fs_report_zone_cb, + &rep_zone_arg); + if (ret < 0) + return ret; + return 0; +} +#endif + +/* + * Read f2fs raw super block. + * Because we have two copies of super block, so read both of them + * to get the first valid one. If any one of them is broken, we pass + * them recovery flag back to the caller. + */ +static int read_raw_super_block(struct f2fs_sb_info *sbi, + struct f2fs_super_block **raw_super, + int *valid_super_block, int *recovery) +{ + struct super_block *sb = sbi->sb; + int block; + struct buffer_head *bh; + struct f2fs_super_block *super; + int err = 0; + + super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL); + if (!super) + return -ENOMEM; + + for (block = 0; block < 2; block++) { + bh = sb_bread(sb, block); + if (!bh) { + f2fs_err(sbi, "Unable to read %dth superblock", + block + 1); + err = -EIO; + *recovery = 1; + continue; + } + + /* sanity checking of raw super */ + err = sanity_check_raw_super(sbi, bh); + if (err) { + f2fs_err(sbi, "Can't find valid F2FS filesystem in %dth superblock", + block + 1); + brelse(bh); + *recovery = 1; + continue; + } + + if (!*raw_super) { + memcpy(super, bh->b_data + F2FS_SUPER_OFFSET, + sizeof(*super)); + *valid_super_block = block; + *raw_super = super; + } + brelse(bh); + } + + /* No valid superblock */ + if (!*raw_super) + kfree(super); + else + err = 0; + + return err; +} + +int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) +{ + struct buffer_head *bh; + __u32 crc = 0; + int err; + + if ((recover && f2fs_readonly(sbi->sb)) || + bdev_read_only(sbi->sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + return -EROFS; + } + + /* we should update superblock crc here */ + if (!recover && f2fs_sb_has_sb_chksum(sbi)) { + crc = f2fs_crc32(sbi, F2FS_RAW_SUPER(sbi), + offsetof(struct f2fs_super_block, crc)); + F2FS_RAW_SUPER(sbi)->crc = cpu_to_le32(crc); + } + + /* write back-up superblock first */ + bh = sb_bread(sbi->sb, sbi->valid_super_block ? 0 : 1); + if (!bh) + return -EIO; + err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); + brelse(bh); + + /* if we are in recovery path, skip writing valid superblock */ + if (recover || err) + return err; + + /* write current valid superblock */ + bh = sb_bread(sbi->sb, sbi->valid_super_block); + if (!bh) + return -EIO; + err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); + brelse(bh); + return err; +} + +void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int err; + + f2fs_down_write(&sbi->sb_lock); + + if (raw_super->s_stop_reason[reason] < GENMASK(BITS_PER_BYTE - 1, 0)) + raw_super->s_stop_reason[reason]++; + + err = f2fs_commit_super(sbi, false); + if (err) + f2fs_err(sbi, "f2fs_commit_super fails to record reason:%u err:%d", + reason, err); + f2fs_up_write(&sbi->sb_lock); +} + +void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) +{ + spin_lock(&sbi->error_lock); + if (!test_bit(flag, (unsigned long *)sbi->errors)) { + set_bit(flag, (unsigned long *)sbi->errors); + sbi->error_dirty = true; + } + spin_unlock(&sbi->error_lock); +} + +static bool f2fs_update_errors(struct f2fs_sb_info *sbi) +{ + bool need_update = false; + + spin_lock(&sbi->error_lock); + if (sbi->error_dirty) { + memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors, + MAX_F2FS_ERRORS); + sbi->error_dirty = false; + need_update = true; + } + spin_unlock(&sbi->error_lock); + + return need_update; +} + +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error) +{ + int err; + + f2fs_save_errors(sbi, error); + + f2fs_down_write(&sbi->sb_lock); + + if (!f2fs_update_errors(sbi)) + goto out_unlock; + + err = f2fs_commit_super(sbi, false); + if (err) + f2fs_err(sbi, "f2fs_commit_super fails to record errors:%u, err:%d", + error, err); +out_unlock: + f2fs_up_write(&sbi->sb_lock); +} + +static int f2fs_scan_devices(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + unsigned int max_devices = MAX_DEVICES; + unsigned int logical_blksize; + int i; + + /* Initialize single device information */ + if (!RDEV(0).path[0]) { + if (!bdev_is_zoned(sbi->sb->s_bdev)) + return 0; + max_devices = 1; + } + + /* + * Initialize multiple devices information, or single + * zoned block device information. + */ + sbi->devs = f2fs_kzalloc(sbi, + array_size(max_devices, + sizeof(struct f2fs_dev_info)), + GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; + + logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev); + sbi->aligned_blksize = true; + + for (i = 0; i < max_devices; i++) { + + if (i > 0 && !RDEV(i).path[0]) + break; + + if (max_devices == 1) { + /* Single zoned block device mount */ + FDEV(0).bdev = + blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, + sbi->sb->s_mode, sbi->sb->s_type); + } else { + /* Multi-device mount */ + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = + le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + sbi->sb->s_mode, sbi->sb->s_type); + } + if (IS_ERR(FDEV(i).bdev)) + return PTR_ERR(FDEV(i).bdev); + + /* to release errored devices */ + sbi->s_ndevs = i + 1; + + if (logical_blksize != bdev_logical_block_size(FDEV(i).bdev)) + sbi->aligned_blksize = false; + +#ifdef CONFIG_BLK_DEV_ZONED + if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && + !f2fs_sb_has_blkzoned(sbi)) { + f2fs_err(sbi, "Zoned block device feature not enabled"); + return -EINVAL; + } + if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { + if (init_blkz_info(sbi, i)) { + f2fs_err(sbi, "Failed to initialize F2FS blkzone information"); + return -EINVAL; + } + if (max_devices == 1) + break; + f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk, + bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ? + "Host-aware" : "Host-managed"); + continue; + } +#endif + f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk); + } + f2fs_info(sbi, + "IO Block Size: %8ld KB", F2FS_IO_SIZE_KB(sbi)); + return 0; +} + +static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) +{ +#if IS_ENABLED(CONFIG_UNICODE) + if (f2fs_sb_has_casefold(sbi) && !sbi->sb->s_encoding) { + const struct f2fs_sb_encodings *encoding_info; + struct unicode_map *encoding; + __u16 encoding_flags; + + encoding_info = f2fs_sb_read_encoding(sbi->raw_super); + if (!encoding_info) { + f2fs_err(sbi, + "Encoding requested by superblock is unknown"); + return -EINVAL; + } + + encoding_flags = le16_to_cpu(sbi->raw_super->s_encoding_flags); + encoding = utf8_load(encoding_info->version); + if (IS_ERR(encoding)) { + f2fs_err(sbi, + "can't mount with superblock charset: %s-%u.%u.%u " + "not supported by the kernel. flags: 0x%x.", + encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), + encoding_flags); + return PTR_ERR(encoding); + } + f2fs_info(sbi, "Using encoding defined by superblock: " + "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), + encoding_flags); + + sbi->sb->s_encoding = encoding; + sbi->sb->s_encoding_flags = encoding_flags; + } +#else + if (f2fs_sb_has_casefold(sbi)) { + f2fs_err(sbi, "Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE"); + return -EINVAL; + } +#endif + return 0; +} + +static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_i = SM_I(sbi); + + /* adjust parameters according to the volume size */ + if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + if (f2fs_block_unit_discard(sbi)) + sm_i->dcc_info->discard_granularity = 1; + sm_i->ipu_policy = 1 << F2FS_IPU_FORCE | + 1 << F2FS_IPU_HONOR_OPU_WRITE; + } + + sbi->readdir_ra = 1; +} + +static int f2fs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct f2fs_sb_info *sbi; + struct f2fs_super_block *raw_super; + struct inode *root; + int err; + bool skip_recovery = false, need_fsck = false; + char *options = NULL; + int recovery, i, valid_super_block; + struct curseg_info *seg_i; + int retry_cnt = 1; + +try_onemore: + err = -EINVAL; + raw_super = NULL; + valid_super_block = -1; + recovery = 0; + + /* allocate memory for f2fs-specific super block info */ + sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sbi->sb = sb; + + /* initialize locks within allocated memory */ + init_f2fs_rwsem(&sbi->gc_lock); + mutex_init(&sbi->writepages); + init_f2fs_rwsem(&sbi->cp_global_sem); + init_f2fs_rwsem(&sbi->node_write); + init_f2fs_rwsem(&sbi->node_change); + spin_lock_init(&sbi->stat_lock); + init_f2fs_rwsem(&sbi->cp_rwsem); + init_f2fs_rwsem(&sbi->quota_sem); + init_waitqueue_head(&sbi->cp_wait); + spin_lock_init(&sbi->error_lock); + + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } + mutex_init(&sbi->flush_lock); + + /* Load the checksum driver */ + sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); + if (IS_ERR(sbi->s_chksum_driver)) { + f2fs_err(sbi, "Cannot load crc32 driver."); + err = PTR_ERR(sbi->s_chksum_driver); + sbi->s_chksum_driver = NULL; + goto free_sbi; + } + + /* set a block size */ + if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { + f2fs_err(sbi, "unable to set blocksize"); + goto free_sbi; + } + + err = read_raw_super_block(sbi, &raw_super, &valid_super_block, + &recovery); + if (err) + goto free_sbi; + + sb->s_fs_info = sbi; + sbi->raw_super = raw_super; + + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + + /* precompute checksum seed for metadata */ + if (f2fs_sb_has_inode_chksum(sbi)) + sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, + sizeof(raw_super->uuid)); + + default_options(sbi); + /* parse mount options */ + options = kstrdup((const char *)data, GFP_KERNEL); + if (data && !options) { + err = -ENOMEM; + goto free_sb_buf; + } + + err = parse_options(sb, options, false); + if (err) + goto free_options; + + sb->s_maxbytes = max_file_blocks(NULL) << + le32_to_cpu(raw_super->log_blocksize); + sb->s_max_links = F2FS_LINK_MAX; + + err = f2fs_setup_casefold(sbi); + if (err) + goto free_options; + +#ifdef CONFIG_QUOTA + sb->dq_op = &f2fs_quota_operations; + sb->s_qcop = &f2fs_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; + + if (f2fs_sb_has_quota_ino(sbi)) { + for (i = 0; i < MAXQUOTAS; i++) { + if (f2fs_qf_ino(sbi->sb, i)) + sbi->nquota_files++; + } + } +#endif + + sb->s_op = &f2fs_sops; +#ifdef CONFIG_FS_ENCRYPTION + sb->s_cop = &f2fs_cryptops; +#endif +#ifdef CONFIG_FS_VERITY + sb->s_vop = &f2fs_verityops; +#endif + sb->s_xattr = f2fs_xattr_handlers; + sb->s_export_op = &f2fs_export_ops; + sb->s_magic = F2FS_SUPER_MAGIC; + sb->s_time_gran = 1; + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); + memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); + sb->s_iflags |= SB_I_CGROUPWB; + + /* init f2fs-specific super block info */ + sbi->valid_super_block = valid_super_block; + + /* disallow all the data/node/meta page writes */ + set_sbi_flag(sbi, SBI_POR_DOING); + + err = f2fs_init_write_merge_io(sbi); + if (err) + goto free_bio_info; + + init_sb_info(sbi); + + err = f2fs_init_iostat(sbi); + if (err) + goto free_bio_info; + + err = init_percpu_info(sbi); + if (err) + goto free_iostat; + + if (F2FS_IO_ALIGNED(sbi)) { + sbi->write_io_dummy = + mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); + if (!sbi->write_io_dummy) { + err = -ENOMEM; + goto free_percpu; + } + } + + /* init per sbi slab cache */ + err = f2fs_init_xattr_caches(sbi); + if (err) + goto free_io_dummy; + err = f2fs_init_page_array_cache(sbi); + if (err) + goto free_xattr_cache; + + /* get an inode for meta space */ + sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); + if (IS_ERR(sbi->meta_inode)) { + f2fs_err(sbi, "Failed to read F2FS meta data inode"); + err = PTR_ERR(sbi->meta_inode); + goto free_page_array_cache; + } + + err = f2fs_get_valid_checkpoint(sbi); + if (err) { + f2fs_err(sbi, "Failed to get valid F2FS checkpoint"); + goto free_meta_inode; + } + + if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_QUOTA_NEED_FSCK_FLAG)) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_DISABLED_QUICK_FLAG)) { + set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_QUICK_INTERVAL; + } + + if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FSCK_FLAG)) + set_sbi_flag(sbi, SBI_NEED_FSCK); + + /* Initialize device list */ + err = f2fs_scan_devices(sbi); + if (err) { + f2fs_err(sbi, "Failed to find devices"); + goto free_devices; + } + + err = f2fs_init_post_read_wq(sbi); + if (err) { + f2fs_err(sbi, "Failed to initialize post read workqueue"); + goto free_devices; + } + + sbi->total_valid_node_count = + le32_to_cpu(sbi->ckpt->valid_node_count); + percpu_counter_set(&sbi->total_valid_inode_count, + le32_to_cpu(sbi->ckpt->valid_inode_count)); + sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count); + sbi->total_valid_block_count = + le64_to_cpu(sbi->ckpt->valid_block_count); + sbi->last_valid_block_count = sbi->total_valid_block_count; + sbi->reserved_blocks = 0; + sbi->current_reserved_blocks = 0; + limit_reserve_root(sbi); + adjust_unusable_cap_perc(sbi); + + f2fs_init_extent_cache_info(sbi); + + f2fs_init_ino_entry_info(sbi); + + f2fs_init_fsync_node_info(sbi); + + /* setup checkpoint request control and start checkpoint issue thread */ + f2fs_init_ckpt_req_control(sbi); + if (!f2fs_readonly(sb) && !test_opt(sbi, DISABLE_CHECKPOINT) && + test_opt(sbi, MERGE_CHECKPOINT)) { + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto stop_ckpt_thread; + } + } + + /* setup f2fs internal modules */ + err = f2fs_build_segment_manager(sbi); + if (err) { + f2fs_err(sbi, "Failed to initialize F2FS segment manager (%d)", + err); + goto free_sm; + } + err = f2fs_build_node_manager(sbi); + if (err) { + f2fs_err(sbi, "Failed to initialize F2FS node manager (%d)", + err); + goto free_nm; + } + + err = adjust_reserved_segment(sbi); + if (err) + goto free_nm; + + /* For write statistics */ + sbi->sectors_written_start = f2fs_get_sectors_written(sbi); + + /* Read accumulated write IO statistics if exists */ + seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); + if (__exist_node_summaries(sbi)) + sbi->kbytes_written = + le64_to_cpu(seg_i->journal->info.kbytes_written); + + f2fs_build_gc_manager(sbi); + + err = f2fs_build_stats(sbi); + if (err) + goto free_nm; + + /* get an inode for node space */ + sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); + if (IS_ERR(sbi->node_inode)) { + f2fs_err(sbi, "Failed to read node inode"); + err = PTR_ERR(sbi->node_inode); + goto free_stats; + } + + /* read root inode and dentry */ + root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); + if (IS_ERR(root)) { + f2fs_err(sbi, "Failed to read root inode"); + err = PTR_ERR(root); + goto free_node_inode; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || + !root->i_size || !root->i_nlink) { + iput(root); + err = -EINVAL; + goto free_node_inode; + } + + sb->s_root = d_make_root(root); /* allocate root dentry */ + if (!sb->s_root) { + err = -ENOMEM; + goto free_node_inode; + } + + err = f2fs_init_compress_inode(sbi); + if (err) + goto free_root_inode; + + err = f2fs_register_sysfs(sbi); + if (err) + goto free_compress_inode; + +#ifdef CONFIG_QUOTA + /* Enable quota usage during mount */ + if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) { + err = f2fs_enable_quotas(sb); + if (err) + f2fs_err(sbi, "Cannot turn on quotas: error %d", err); + } +#endif + /* if there are any orphan inodes, free them */ + err = f2fs_recover_orphan_inodes(sbi); + if (err) + goto free_meta; + + if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) + goto reset_checkpoint; + + /* recover fsynced data */ + if (!test_opt(sbi, DISABLE_ROLL_FORWARD) && + !test_opt(sbi, NORECOVERY)) { + /* + * mount should be failed, when device has readonly mode, and + * previous checkpoint was not done by clean system shutdown. + */ + if (f2fs_hw_is_readonly(sbi)) { + if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + err = f2fs_recover_fsync_data(sbi, true); + if (err > 0) { + err = -EROFS; + f2fs_err(sbi, "Need to recover fsync data, but " + "write access unavailable, please try " + "mount w/ disable_roll_forward or norecovery"); + } + if (err < 0) + goto free_meta; + } + f2fs_info(sbi, "write access unavailable, skipping recovery"); + goto reset_checkpoint; + } + + if (need_fsck) + set_sbi_flag(sbi, SBI_NEED_FSCK); + + if (skip_recovery) + goto reset_checkpoint; + + err = f2fs_recover_fsync_data(sbi, false); + if (err < 0) { + if (err != -ENOMEM) + skip_recovery = true; + need_fsck = true; + f2fs_err(sbi, "Cannot recover all fsync data errno=%d", + err); + goto free_meta; + } + } else { + err = f2fs_recover_fsync_data(sbi, true); + + if (!f2fs_readonly(sb) && err > 0) { + err = -EINVAL; + f2fs_err(sbi, "Need to recover fsync data"); + goto free_meta; + } + } + + /* + * If the f2fs is not readonly and fsync data recovery succeeds, + * check zoned block devices' write pointer consistency. + */ + if (!err && !f2fs_readonly(sb) && f2fs_sb_has_blkzoned(sbi)) { + err = f2fs_check_write_pointer(sbi); + if (err) + goto free_meta; + } + +reset_checkpoint: + f2fs_init_inmem_curseg(sbi); + + /* f2fs_recover_fsync_data() cleared this already */ + clear_sbi_flag(sbi, SBI_POR_DOING); + + if (test_opt(sbi, DISABLE_CHECKPOINT)) { + err = f2fs_disable_checkpoint(sbi); + if (err) + goto sync_free_meta; + } else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) { + f2fs_enable_checkpoint(sbi); + } + + /* + * If filesystem is not mounted as read-only then + * do start the gc_thread. + */ + if ((F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF || + test_opt(sbi, GC_MERGE)) && !f2fs_readonly(sb)) { + /* After POR, we can run background GC thread.*/ + err = f2fs_start_gc_thread(sbi); + if (err) + goto sync_free_meta; + } + kvfree(options); + + /* recover broken superblock */ + if (recovery) { + err = f2fs_commit_super(sbi, true); + f2fs_info(sbi, "Try to recover %dth superblock, ret: %d", + sbi->valid_super_block ? 1 : 2, err); + } + + f2fs_join_shrinker(sbi); + + f2fs_tuning_parameters(sbi); + + f2fs_notice(sbi, "Mounted with checkpoint version = %llx", + cur_cp_version(F2FS_CKPT(sbi))); + f2fs_update_time(sbi, CP_TIME); + f2fs_update_time(sbi, REQ_TIME); + clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + return 0; + +sync_free_meta: + /* safe to flush all the data */ + sync_filesystem(sbi->sb); + retry_cnt = 0; + +free_meta: +#ifdef CONFIG_QUOTA + f2fs_truncate_quota_inode_pages(sb); + if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) + f2fs_quota_off_umount(sbi->sb); +#endif + /* + * Some dirty meta pages can be produced by f2fs_recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by f2fs_write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in f2fs_sync_meta_pages(). + */ + truncate_inode_pages_final(META_MAPPING(sbi)); + /* evict some inodes being cached by GC */ + evict_inodes(sb); + f2fs_unregister_sysfs(sbi); +free_compress_inode: + f2fs_destroy_compress_inode(sbi); +free_root_inode: + dput(sb->s_root); + sb->s_root = NULL; +free_node_inode: + f2fs_release_ino_entry(sbi, true); + truncate_inode_pages_final(NODE_MAPPING(sbi)); + iput(sbi->node_inode); + sbi->node_inode = NULL; +free_stats: + f2fs_destroy_stats(sbi); +free_nm: + /* stop discard thread before destroying node manager */ + f2fs_stop_discard_thread(sbi); + f2fs_destroy_node_manager(sbi); +free_sm: + f2fs_destroy_segment_manager(sbi); +stop_ckpt_thread: + f2fs_stop_ckpt_thread(sbi); + f2fs_destroy_post_read_wq(sbi); +free_devices: + destroy_device_list(sbi); + kvfree(sbi->ckpt); +free_meta_inode: + make_bad_inode(sbi->meta_inode); + iput(sbi->meta_inode); + sbi->meta_inode = NULL; +free_page_array_cache: + f2fs_destroy_page_array_cache(sbi); +free_xattr_cache: + f2fs_destroy_xattr_caches(sbi); +free_io_dummy: + mempool_destroy(sbi->write_io_dummy); +free_percpu: + destroy_percpu_info(sbi); +free_iostat: + f2fs_destroy_iostat(sbi); +free_bio_info: + for (i = 0; i < NR_PAGE_TYPE; i++) + kvfree(sbi->write_io[i]); + +#if IS_ENABLED(CONFIG_UNICODE) + utf8_unload(sb->s_encoding); + sb->s_encoding = NULL; +#endif +free_options: +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(F2FS_OPTION(sbi).s_qf_names[i]); +#endif + fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); + kvfree(options); +free_sb_buf: + kfree(raw_super); +free_sbi: + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); + kfree(sbi); + + /* give only one another chance */ + if (retry_cnt > 0 && skip_recovery) { + retry_cnt--; + shrink_dcache_sb(sb); + goto try_onemore; + } + return err; +} + +static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super); +} + +static void kill_f2fs_super(struct super_block *sb) +{ + if (sb->s_root) { + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + set_sbi_flag(sbi, SBI_IS_CLOSE); + f2fs_stop_gc_thread(sbi); + f2fs_stop_discard_thread(sbi); + +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* + * latter evict_inode() can bypass checking and invalidating + * compress inode cache. + */ + if (test_opt(sbi, COMPRESS_CACHE)) + truncate_inode_pages_final(COMPRESS_MAPPING(sbi)); +#endif + + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + struct cp_control cpc = { + .reason = CP_UMOUNT, + }; + f2fs_write_checkpoint(sbi, &cpc); + } + + if (is_sbi_flag_set(sbi, SBI_IS_RECOVERED) && f2fs_readonly(sb)) + sb->s_flags &= ~SB_RDONLY; + } + kill_block_super(sb); +} + +static struct file_system_type f2fs_fs_type = { + .owner = THIS_MODULE, + .name = "f2fs", + .mount = f2fs_mount, + .kill_sb = kill_f2fs_super, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, +}; +MODULE_ALIAS_FS("f2fs"); + +static int __init init_inodecache(void) +{ + f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", + sizeof(struct f2fs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL); + if (!f2fs_inode_cachep) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(f2fs_inode_cachep); +} + +static int __init init_f2fs_fs(void) +{ + int err; + + if (PAGE_SIZE != F2FS_BLKSIZE) { + printk("F2FS not supported on PAGE_SIZE(%lu) != %d\n", + PAGE_SIZE, F2FS_BLKSIZE); + return -EINVAL; + } + + err = init_inodecache(); + if (err) + goto fail; + err = f2fs_create_node_manager_caches(); + if (err) + goto free_inodecache; + err = f2fs_create_segment_manager_caches(); + if (err) + goto free_node_manager_caches; + err = f2fs_create_checkpoint_caches(); + if (err) + goto free_segment_manager_caches; + err = f2fs_create_recovery_cache(); + if (err) + goto free_checkpoint_caches; + err = f2fs_create_extent_cache(); + if (err) + goto free_recovery_cache; + err = f2fs_create_garbage_collection_cache(); + if (err) + goto free_extent_cache; + err = f2fs_init_sysfs(); + if (err) + goto free_garbage_collection_cache; + err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker"); + if (err) + goto free_sysfs; + err = register_filesystem(&f2fs_fs_type); + if (err) + goto free_shrinker; + f2fs_create_root_stats(); + err = f2fs_init_post_read_processing(); + if (err) + goto free_root_stats; + err = f2fs_init_iostat_processing(); + if (err) + goto free_post_read; + err = f2fs_init_bio_entry_cache(); + if (err) + goto free_iostat; + err = f2fs_init_bioset(); + if (err) + goto free_bio_enrty_cache; + err = f2fs_init_compress_mempool(); + if (err) + goto free_bioset; + err = f2fs_init_compress_cache(); + if (err) + goto free_compress_mempool; + err = f2fs_create_casefold_cache(); + if (err) + goto free_compress_cache; + return 0; +free_compress_cache: + f2fs_destroy_compress_cache(); +free_compress_mempool: + f2fs_destroy_compress_mempool(); +free_bioset: + f2fs_destroy_bioset(); +free_bio_enrty_cache: + f2fs_destroy_bio_entry_cache(); +free_iostat: + f2fs_destroy_iostat_processing(); +free_post_read: + f2fs_destroy_post_read_processing(); +free_root_stats: + f2fs_destroy_root_stats(); + unregister_filesystem(&f2fs_fs_type); +free_shrinker: + unregister_shrinker(&f2fs_shrinker_info); +free_sysfs: + f2fs_exit_sysfs(); +free_garbage_collection_cache: + f2fs_destroy_garbage_collection_cache(); +free_extent_cache: + f2fs_destroy_extent_cache(); +free_recovery_cache: + f2fs_destroy_recovery_cache(); +free_checkpoint_caches: + f2fs_destroy_checkpoint_caches(); +free_segment_manager_caches: + f2fs_destroy_segment_manager_caches(); +free_node_manager_caches: + f2fs_destroy_node_manager_caches(); +free_inodecache: + destroy_inodecache(); +fail: + return err; +} + +static void __exit exit_f2fs_fs(void) +{ + f2fs_destroy_casefold_cache(); + f2fs_destroy_compress_cache(); + f2fs_destroy_compress_mempool(); + f2fs_destroy_bioset(); + f2fs_destroy_bio_entry_cache(); + f2fs_destroy_iostat_processing(); + f2fs_destroy_post_read_processing(); + f2fs_destroy_root_stats(); + unregister_filesystem(&f2fs_fs_type); + unregister_shrinker(&f2fs_shrinker_info); + f2fs_exit_sysfs(); + f2fs_destroy_garbage_collection_cache(); + f2fs_destroy_extent_cache(); + f2fs_destroy_recovery_cache(); + f2fs_destroy_checkpoint_caches(); + f2fs_destroy_segment_manager_caches(); + f2fs_destroy_node_manager_caches(); + destroy_inodecache(); +} + +module_init(init_f2fs_fs) +module_exit(exit_f2fs_fs) + +MODULE_AUTHOR("Samsung Electronics's Praesto Team"); +MODULE_DESCRIPTION("Flash Friendly File System"); +MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: crc32"); + diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c new file mode 100644 index 000000000..751a108e6 --- /dev/null +++ b/fs/f2fs/sysfs.c @@ -0,0 +1,1352 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs sysfs interface + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * Copyright (c) 2017 Chao Yu <chao@kernel.org> + */ +#include <linux/compiler.h> +#include <linux/proc_fs.h> +#include <linux/f2fs_fs.h> +#include <linux/seq_file.h> +#include <linux/unicode.h> +#include <linux/ioprio.h> +#include <linux/sysfs.h> + +#include "f2fs.h" +#include "segment.h" +#include "gc.h" +#include "iostat.h" +#include <trace/events/f2fs.h> + +static struct proc_dir_entry *f2fs_proc_root; + +/* Sysfs support for f2fs */ +enum { + GC_THREAD, /* struct f2fs_gc_thread */ + SM_INFO, /* struct f2fs_sm_info */ + DCC_INFO, /* struct discard_cmd_control */ + NM_INFO, /* struct f2fs_nm_info */ + F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_STAT_FS + STAT_INFO, /* struct f2fs_stat_info */ +#endif +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif + RESERVED_BLOCKS, /* struct f2fs_sb_info */ + CPRC_INFO, /* struct ckpt_req_control */ + ATGC_INFO, /* struct atgc_management */ +}; + +static const char *gc_mode_names[MAX_GC_MODE] = { + "GC_NORMAL", + "GC_IDLE_CB", + "GC_IDLE_GREEDY", + "GC_IDLE_AT", + "GC_URGENT_HIGH", + "GC_URGENT_LOW", + "GC_URGENT_MID" +}; + +struct f2fs_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); + ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, + const char *, size_t); + int struct_type; + int offset; + int id; +}; + +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf); + +static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) +{ + if (struct_type == GC_THREAD) + return (unsigned char *)sbi->gc_thread; + else if (struct_type == SM_INFO) + return (unsigned char *)SM_I(sbi); + else if (struct_type == DCC_INFO) + return (unsigned char *)SM_I(sbi)->dcc_info; + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); + else if (struct_type == F2FS_SBI || struct_type == RESERVED_BLOCKS) + return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&F2FS_OPTION(sbi).fault_info; +#endif +#ifdef CONFIG_F2FS_STAT_FS + else if (struct_type == STAT_INFO) + return (unsigned char *)F2FS_STAT(sbi); +#endif + else if (struct_type == CPRC_INFO) + return (unsigned char *)&sbi->cprc_info; + else if (struct_type == ATGC_INFO) + return (unsigned char *)&sbi->am; + return NULL; +} + +static ssize_t dirty_segments_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%llu\n", + (unsigned long long)(dirty_segments(sbi))); +} + +static ssize_t free_segments_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%llu\n", + (unsigned long long)(free_segments(sbi))); +} + +static ssize_t ovp_segments_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%llu\n", + (unsigned long long)(overprovision_segments(sbi))); +} + +static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%llu\n", + (unsigned long long)(sbi->kbytes_written + + ((f2fs_get_sectors_written(sbi) - + sbi->sectors_written_start) >> 1))); +} + +static ssize_t sb_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%lx\n", sbi->s_flag); +} + +static ssize_t cp_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); +} + +static ssize_t pending_discard_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (!SM_I(sbi)->dcc_info) + return -EINVAL; + return sprintf(buf, "%llu\n", (unsigned long long)atomic_read( + &SM_I(sbi)->dcc_info->discard_cmd_cnt)); +} + +static ssize_t features_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + int len = 0; + + if (f2fs_sb_has_encrypt(sbi)) + len += scnprintf(buf, PAGE_SIZE - len, "%s", + "encryption"); + if (f2fs_sb_has_blkzoned(sbi)) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "blkzoned"); + if (f2fs_sb_has_extra_attr(sbi)) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "extra_attr"); + if (f2fs_sb_has_project_quota(sbi)) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "projquota"); + if (f2fs_sb_has_inode_chksum(sbi)) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "inode_checksum"); + if (f2fs_sb_has_flexible_inline_xattr(sbi)) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "flexible_inline_xattr"); + if (f2fs_sb_has_quota_ino(sbi)) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "quota_ino"); + if (f2fs_sb_has_inode_crtime(sbi)) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "inode_crtime"); + if (f2fs_sb_has_lost_found(sbi)) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "lost_found"); + if (f2fs_sb_has_verity(sbi)) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "verity"); + if (f2fs_sb_has_sb_chksum(sbi)) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "sb_checksum"); + if (f2fs_sb_has_casefold(sbi)) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "casefold"); + if (f2fs_sb_has_readonly(sbi)) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "readonly"); + if (f2fs_sb_has_compression(sbi)) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "compression"); + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "pin_file"); + len += scnprintf(buf + len, PAGE_SIZE - len, "\n"); + return len; +} + +static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%u\n", sbi->current_reserved_blocks); +} + +static ssize_t unusable_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + block_t unusable; + + if (test_opt(sbi, DISABLE_CHECKPOINT)) + unusable = sbi->unusable_block_count; + else + unusable = f2fs_get_unusable_blocks(sbi); + return sprintf(buf, "%llu\n", (unsigned long long)unusable); +} + +static ssize_t encoding_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ +#if IS_ENABLED(CONFIG_UNICODE) + struct super_block *sb = sbi->sb; + + if (f2fs_sb_has_casefold(sbi)) + return sysfs_emit(buf, "UTF-8 (%d.%d.%d)\n", + (sb->s_encoding->version >> 16) & 0xff, + (sb->s_encoding->version >> 8) & 0xff, + sb->s_encoding->version & 0xff); +#endif + return sprintf(buf, "(none)"); +} + +static ssize_t mounted_time_sec_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time); +} + +#ifdef CONFIG_F2FS_STAT_FS +static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + + return sprintf(buf, "%llu\n", + (unsigned long long)(si->tot_blks - + (si->bg_data_blks + si->bg_node_blks))); +} + +static ssize_t moved_blocks_background_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + + return sprintf(buf, "%llu\n", + (unsigned long long)(si->bg_data_blks + si->bg_node_blks)); +} + +static ssize_t avg_vblocks_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + + si->dirty_count = dirty_segments(sbi); + f2fs_update_sit_info(sbi); + return sprintf(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); +} +#endif + +static ssize_t main_blkaddr_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%llu\n", + (unsigned long long)MAIN_BLKADDR(sbi)); +} + +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + unsigned char *ptr = NULL; + unsigned int *ui; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + if (!strcmp(a->attr.name, "extension_list")) { + __u8 (*extlist)[F2FS_EXTENSION_LEN] = + sbi->raw_super->extension_list; + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; + int len = 0, i; + + len += scnprintf(buf + len, PAGE_SIZE - len, + "cold file extension:\n"); + for (i = 0; i < cold_count; i++) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n", + extlist[i]); + + len += scnprintf(buf + len, PAGE_SIZE - len, + "hot file extension:\n"); + for (i = cold_count; i < cold_count + hot_count; i++) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n", + extlist[i]); + return len; + } + + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + struct ckpt_req_control *cprc = &sbi->cprc_info; + int len = 0; + int class = IOPRIO_PRIO_CLASS(cprc->ckpt_thread_ioprio); + int data = IOPRIO_PRIO_DATA(cprc->ckpt_thread_ioprio); + + if (class == IOPRIO_CLASS_RT) + len += scnprintf(buf + len, PAGE_SIZE - len, "rt,"); + else if (class == IOPRIO_CLASS_BE) + len += scnprintf(buf + len, PAGE_SIZE - len, "be,"); + else + return -EINVAL; + + len += scnprintf(buf + len, PAGE_SIZE - len, "%d\n", data); + return len; + } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!strcmp(a->attr.name, "compr_written_block")) + return sysfs_emit(buf, "%llu\n", sbi->compr_written_block); + + if (!strcmp(a->attr.name, "compr_saved_block")) + return sysfs_emit(buf, "%llu\n", sbi->compr_saved_block); + + if (!strcmp(a->attr.name, "compr_new_inode")) + return sysfs_emit(buf, "%u\n", sbi->compr_new_inode); +#endif + + if (!strcmp(a->attr.name, "gc_urgent")) + return sysfs_emit(buf, "%s\n", + gc_mode_names[sbi->gc_mode]); + + if (!strcmp(a->attr.name, "gc_segment_mode")) + return sysfs_emit(buf, "%s\n", + gc_mode_names[sbi->gc_segment_mode]); + + if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { + return sysfs_emit(buf, "%u\n", + sbi->gc_reclaimed_segs[sbi->gc_segment_mode]); + } + + if (!strcmp(a->attr.name, "current_atomic_write")) { + s64 current_write = atomic64_read(&sbi->current_atomic_write); + + return sysfs_emit(buf, "%lld\n", current_write); + } + + if (!strcmp(a->attr.name, "peak_atomic_write")) + return sysfs_emit(buf, "%lld\n", sbi->peak_atomic_write); + + if (!strcmp(a->attr.name, "committed_atomic_block")) + return sysfs_emit(buf, "%llu\n", sbi->committed_atomic_block); + + if (!strcmp(a->attr.name, "revoked_atomic_block")) + return sysfs_emit(buf, "%llu\n", sbi->revoked_atomic_block); + + ui = (unsigned int *)(ptr + a->offset); + + return sprintf(buf, "%u\n", *ui); +} + +static ssize_t __sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned char *ptr; + unsigned long t; + unsigned int *ui; + ssize_t ret; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + if (!strcmp(a->attr.name, "extension_list")) { + const char *name = strim((char *)buf); + bool set = true, hot; + + if (!strncmp(name, "[h]", 3)) + hot = true; + else if (!strncmp(name, "[c]", 3)) + hot = false; + else + return -EINVAL; + + name += 3; + + if (*name == '!') { + name++; + set = false; + } + + if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN) + return -EINVAL; + + f2fs_down_write(&sbi->sb_lock); + + ret = f2fs_update_extension_list(sbi, name, hot, set); + if (ret) + goto out; + + ret = f2fs_commit_super(sbi, false); + if (ret) + f2fs_update_extension_list(sbi, name, hot, !set); +out: + f2fs_up_write(&sbi->sb_lock); + return ret ? ret : count; + } + + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + const char *name = strim((char *)buf); + struct ckpt_req_control *cprc = &sbi->cprc_info; + int class; + long data; + int ret; + + if (!strncmp(name, "rt,", 3)) + class = IOPRIO_CLASS_RT; + else if (!strncmp(name, "be,", 3)) + class = IOPRIO_CLASS_BE; + else + return -EINVAL; + + name += 3; + ret = kstrtol(name, 10, &data); + if (ret) + return ret; + if (data >= IOPRIO_NR_LEVELS || data < 0) + return -EINVAL; + + cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data); + if (test_opt(sbi, MERGE_CHECKPOINT)) { + ret = set_task_ioprio(cprc->f2fs_issue_ckpt, + cprc->ckpt_thread_ioprio); + if (ret) + return ret; + } + + return count; + } + + ui = (unsigned int *)(ptr + a->offset); + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret < 0) + return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= BIT(FAULT_MAX)) + return -EINVAL; + if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX) + return -EINVAL; +#endif + if (a->struct_type == RESERVED_BLOCKS) { + spin_lock(&sbi->stat_lock); + if (t > (unsigned long)(sbi->user_block_count - + F2FS_OPTION(sbi).root_reserved_blocks - + sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments)) { + spin_unlock(&sbi->stat_lock); + return -EINVAL; + } + *ui = t; + sbi->current_reserved_blocks = min(sbi->reserved_blocks, + sbi->user_block_count - valid_user_blocks(sbi)); + spin_unlock(&sbi->stat_lock); + return count; + } + + if (!strcmp(a->attr.name, "discard_granularity")) { + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (!f2fs_block_unit_discard(sbi)) + return -EINVAL; + if (t == *ui) + return count; + *ui = t; + return count; + } + + if (!strcmp(a->attr.name, "migration_granularity")) { + if (t == 0 || t > sbi->segs_per_sec) + return -EINVAL; + } + + if (!strcmp(a->attr.name, "trim_sections")) + return -EINVAL; + + if (!strcmp(a->attr.name, "gc_urgent")) { + if (t == 0) { + sbi->gc_mode = GC_NORMAL; + } else if (t == 1) { + sbi->gc_mode = GC_URGENT_HIGH; + if (sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; + wake_up_interruptible_all( + &sbi->gc_thread->gc_wait_queue_head); + wake_up_discard_thread(sbi, true); + } + } else if (t == 2) { + sbi->gc_mode = GC_URGENT_LOW; + } else if (t == 3) { + sbi->gc_mode = GC_URGENT_MID; + if (sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; + wake_up_interruptible_all( + &sbi->gc_thread->gc_wait_queue_head); + } + } else { + return -EINVAL; + } + return count; + } + if (!strcmp(a->attr.name, "gc_idle")) { + if (t == GC_IDLE_CB) { + sbi->gc_mode = GC_IDLE_CB; + } else if (t == GC_IDLE_GREEDY) { + sbi->gc_mode = GC_IDLE_GREEDY; + } else if (t == GC_IDLE_AT) { + if (!sbi->am.atgc_enabled) + return -EINVAL; + sbi->gc_mode = GC_IDLE_AT; + } else { + sbi->gc_mode = GC_NORMAL; + } + return count; + } + + if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { + spin_lock(&sbi->gc_urgent_high_lock); + sbi->gc_urgent_high_remaining = t; + spin_unlock(&sbi->gc_urgent_high_lock); + + return count; + } + +#ifdef CONFIG_F2FS_IOSTAT + if (!strcmp(a->attr.name, "iostat_enable")) { + sbi->iostat_enable = !!t; + if (!sbi->iostat_enable) + f2fs_reset_iostat(sbi); + return count; + } + + if (!strcmp(a->attr.name, "iostat_period_ms")) { + if (t < MIN_IOSTAT_PERIOD_MS || t > MAX_IOSTAT_PERIOD_MS) + return -EINVAL; + spin_lock_irq(&sbi->iostat_lock); + sbi->iostat_period_ms = (unsigned int)t; + spin_unlock_irq(&sbi->iostat_lock); + return count; + } +#endif + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!strcmp(a->attr.name, "compr_written_block") || + !strcmp(a->attr.name, "compr_saved_block")) { + if (t != 0) + return -EINVAL; + sbi->compr_written_block = 0; + sbi->compr_saved_block = 0; + return count; + } + + if (!strcmp(a->attr.name, "compr_new_inode")) { + if (t != 0) + return -EINVAL; + sbi->compr_new_inode = 0; + return count; + } +#endif + + if (!strcmp(a->attr.name, "atgc_candidate_ratio")) { + if (t > 100) + return -EINVAL; + sbi->am.candidate_ratio = t; + return count; + } + + if (!strcmp(a->attr.name, "atgc_age_weight")) { + if (t > 100) + return -EINVAL; + sbi->am.age_weight = t; + return count; + } + + if (!strcmp(a->attr.name, "gc_segment_mode")) { + if (t < MAX_GC_MODE) + sbi->gc_segment_mode = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { + if (t != 0) + return -EINVAL; + sbi->gc_reclaimed_segs[sbi->gc_segment_mode] = 0; + return count; + } + + if (!strcmp(a->attr.name, "seq_file_ra_mul")) { + if (t >= MIN_RA_MUL && t <= MAX_RA_MUL) + sbi->seq_file_ra_mul = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "max_fragment_chunk")) { + if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE) + sbi->max_fragment_chunk = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "max_fragment_hole")) { + if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE) + sbi->max_fragment_hole = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "peak_atomic_write")) { + if (t != 0) + return -EINVAL; + sbi->peak_atomic_write = 0; + return count; + } + + if (!strcmp(a->attr.name, "committed_atomic_block")) { + if (t != 0) + return -EINVAL; + sbi->committed_atomic_block = 0; + return count; + } + + if (!strcmp(a->attr.name, "revoked_atomic_block")) { + if (t != 0) + return -EINVAL; + sbi->revoked_atomic_block = 0; + return count; + } + + *ui = (unsigned int)t; + + return count; +} + +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + ssize_t ret; + bool gc_entry = (!strcmp(a->attr.name, "gc_urgent") || + a->struct_type == GC_THREAD); + + if (gc_entry) { + if (!down_read_trylock(&sbi->sb->s_umount)) + return -EAGAIN; + } + ret = __sbi_store(a, sbi, buf, count); + if (gc_entry) + up_read(&sbi->sb->s_umount); + + return ret; +} + +static ssize_t f2fs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_sb_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +/* + * Note that there are three feature list entries: + * 1) /sys/fs/f2fs/features + * : shows runtime features supported by in-kernel f2fs along with Kconfig. + * - ref. F2FS_FEATURE_RO_ATTR() + * + * 2) /sys/fs/f2fs/$s_id/features <deprecated> + * : shows on-disk features enabled by mkfs.f2fs, used for old kernels. This + * won't add new feature anymore, and thus, users should check entries in 3) + * instead of this 2). + * + * 3) /sys/fs/f2fs/$s_id/feature_list + * : shows on-disk features enabled by mkfs.f2fs per instance, which follows + * sysfs entry rule where each entry should expose single value. + * This list covers old feature list provided by 2) and beyond. Therefore, + * please add new on-disk feature in this list only. + * - ref. F2FS_SB_FEATURE_RO_ATTR() + */ +static ssize_t f2fs_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "supported\n"); +} + +#define F2FS_FEATURE_RO_ATTR(_name) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_feature_show, \ +} + +static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (F2FS_HAS_FEATURE(sbi, a->id)) + return sprintf(buf, "supported\n"); + return sprintf(buf, "unsupported\n"); +} + +#define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \ +static struct f2fs_attr f2fs_attr_sb_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_sb_feature_show, \ + .id = F2FS_FEATURE_##_feat, \ +} + +#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .struct_type = _struct_type, \ + .offset = _offset \ +} + +#define F2FS_RO_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0444, \ + f2fs_sbi_show, NULL, \ + offsetof(struct struct_name, elname)) + +#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0644, \ + f2fs_sbi_show, f2fs_sbi_store, \ + offsetof(struct struct_name, elname)) + +#define F2FS_GENERAL_RO_ATTR(name) \ +static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) + +#define F2FS_STAT_ATTR(_struct_type, _struct_name, _name, _elname) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_sbi_show, \ + .struct_type = _struct_type, \ + .offset = offsetof(struct _struct_name, _elname), \ +} + +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, + urgent_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_request); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); +F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_seq_blocks, min_seq_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, max_roll_forward_node_blocks, max_rf_node_blocks); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval, + interval_time[DISCARD_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, + umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]); +#ifdef CONFIG_F2FS_IOSTAT +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_period_ms, iostat_period_ms); +#endif +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_io_bytes, max_io_bytes); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining); +F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); +F2FS_GENERAL_RO_ATTR(dirty_segments); +F2FS_GENERAL_RO_ATTR(free_segments); +F2FS_GENERAL_RO_ATTR(ovp_segments); +F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); +F2FS_GENERAL_RO_ATTR(features); +F2FS_GENERAL_RO_ATTR(current_reserved_blocks); +F2FS_GENERAL_RO_ATTR(unusable); +F2FS_GENERAL_RO_ATTR(encoding); +F2FS_GENERAL_RO_ATTR(mounted_time_sec); +F2FS_GENERAL_RO_ATTR(main_blkaddr); +F2FS_GENERAL_RO_ATTR(pending_discard); +#ifdef CONFIG_F2FS_STAT_FS +F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); +F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); +F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, gc_foreground_calls, call_count); +F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, gc_background_calls, bg_gc); +F2FS_GENERAL_RO_ATTR(moved_blocks_background); +F2FS_GENERAL_RO_ATTR(moved_blocks_foreground); +F2FS_GENERAL_RO_ATTR(avg_vblocks); +#endif + +#ifdef CONFIG_FS_ENCRYPTION +F2FS_FEATURE_RO_ATTR(encryption); +F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2); +#if IS_ENABLED(CONFIG_UNICODE) +F2FS_FEATURE_RO_ATTR(encrypted_casefold); +#endif +#endif /* CONFIG_FS_ENCRYPTION */ +#ifdef CONFIG_BLK_DEV_ZONED +F2FS_FEATURE_RO_ATTR(block_zoned); +F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, unusable_blocks_per_sec, + unusable_blocks_per_sec); +#endif +F2FS_FEATURE_RO_ATTR(atomic_write); +F2FS_FEATURE_RO_ATTR(extra_attr); +F2FS_FEATURE_RO_ATTR(project_quota); +F2FS_FEATURE_RO_ATTR(inode_checksum); +F2FS_FEATURE_RO_ATTR(flexible_inline_xattr); +F2FS_FEATURE_RO_ATTR(quota_ino); +F2FS_FEATURE_RO_ATTR(inode_crtime); +F2FS_FEATURE_RO_ATTR(lost_found); +#ifdef CONFIG_FS_VERITY +F2FS_FEATURE_RO_ATTR(verity); +#endif +F2FS_FEATURE_RO_ATTR(sb_checksum); +#if IS_ENABLED(CONFIG_UNICODE) +F2FS_FEATURE_RO_ATTR(casefold); +#endif +F2FS_FEATURE_RO_ATTR(readonly); +#ifdef CONFIG_F2FS_FS_COMPRESSION +F2FS_FEATURE_RO_ATTR(compression); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode); +#endif +F2FS_FEATURE_RO_ATTR(pin_file); + +/* For ATGC */ +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_ratio, candidate_ratio); +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_count, max_candidate_count); +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_weight, age_weight); +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold); + +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, seq_file_ra_mul, seq_file_ra_mul); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_segment_mode, gc_segment_mode); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_chunk, max_fragment_chunk); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_hole, max_fragment_hole); + +/* For atomic write */ +F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, current_atomic_write, current_atomic_write); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, peak_atomic_write, peak_atomic_write); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, committed_atomic_block, committed_atomic_block); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block); + +#define ATTR_LIST(name) (&f2fs_attr_##name.attr) +static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_urgent_sleep_time), + ATTR_LIST(gc_min_sleep_time), + ATTR_LIST(gc_max_sleep_time), + ATTR_LIST(gc_no_gc_sleep_time), + ATTR_LIST(gc_idle), + ATTR_LIST(gc_urgent), + ATTR_LIST(reclaim_segments), + ATTR_LIST(main_blkaddr), + ATTR_LIST(max_small_discards), + ATTR_LIST(max_discard_request), + ATTR_LIST(min_discard_issue_time), + ATTR_LIST(mid_discard_issue_time), + ATTR_LIST(max_discard_issue_time), + ATTR_LIST(discard_granularity), + ATTR_LIST(pending_discard), + ATTR_LIST(batched_trim_sections), + ATTR_LIST(ipu_policy), + ATTR_LIST(min_ipu_util), + ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_seq_blocks), + ATTR_LIST(min_hot_blocks), + ATTR_LIST(min_ssr_sections), + ATTR_LIST(max_victim_search), + ATTR_LIST(migration_granularity), + ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), + ATTR_LIST(ra_nid_pages), + ATTR_LIST(dirty_nats_ratio), + ATTR_LIST(max_roll_forward_node_blocks), + ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), + ATTR_LIST(discard_idle_interval), + ATTR_LIST(gc_idle_interval), + ATTR_LIST(umount_discard_timeout), +#ifdef CONFIG_F2FS_IOSTAT + ATTR_LIST(iostat_enable), + ATTR_LIST(iostat_period_ms), +#endif + ATTR_LIST(readdir_ra), + ATTR_LIST(max_io_bytes), + ATTR_LIST(gc_pin_file_thresh), + ATTR_LIST(extension_list), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif + ATTR_LIST(data_io_flag), + ATTR_LIST(node_io_flag), + ATTR_LIST(gc_urgent_high_remaining), + ATTR_LIST(ckpt_thread_ioprio), + ATTR_LIST(dirty_segments), + ATTR_LIST(free_segments), + ATTR_LIST(ovp_segments), + ATTR_LIST(unusable), + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(features), + ATTR_LIST(reserved_blocks), + ATTR_LIST(current_reserved_blocks), + ATTR_LIST(encoding), + ATTR_LIST(mounted_time_sec), +#ifdef CONFIG_F2FS_STAT_FS + ATTR_LIST(cp_foreground_calls), + ATTR_LIST(cp_background_calls), + ATTR_LIST(gc_foreground_calls), + ATTR_LIST(gc_background_calls), + ATTR_LIST(moved_blocks_foreground), + ATTR_LIST(moved_blocks_background), + ATTR_LIST(avg_vblocks), +#endif +#ifdef CONFIG_BLK_DEV_ZONED + ATTR_LIST(unusable_blocks_per_sec), +#endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + ATTR_LIST(compr_written_block), + ATTR_LIST(compr_saved_block), + ATTR_LIST(compr_new_inode), +#endif + /* For ATGC */ + ATTR_LIST(atgc_candidate_ratio), + ATTR_LIST(atgc_candidate_count), + ATTR_LIST(atgc_age_weight), + ATTR_LIST(atgc_age_threshold), + ATTR_LIST(seq_file_ra_mul), + ATTR_LIST(gc_segment_mode), + ATTR_LIST(gc_reclaimed_segments), + ATTR_LIST(max_fragment_chunk), + ATTR_LIST(max_fragment_hole), + ATTR_LIST(current_atomic_write), + ATTR_LIST(peak_atomic_write), + ATTR_LIST(committed_atomic_block), + ATTR_LIST(revoked_atomic_block), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs); + +static struct attribute *f2fs_feat_attrs[] = { +#ifdef CONFIG_FS_ENCRYPTION + ATTR_LIST(encryption), + ATTR_LIST(test_dummy_encryption_v2), +#if IS_ENABLED(CONFIG_UNICODE) + ATTR_LIST(encrypted_casefold), +#endif +#endif /* CONFIG_FS_ENCRYPTION */ +#ifdef CONFIG_BLK_DEV_ZONED + ATTR_LIST(block_zoned), +#endif + ATTR_LIST(atomic_write), + ATTR_LIST(extra_attr), + ATTR_LIST(project_quota), + ATTR_LIST(inode_checksum), + ATTR_LIST(flexible_inline_xattr), + ATTR_LIST(quota_ino), + ATTR_LIST(inode_crtime), + ATTR_LIST(lost_found), +#ifdef CONFIG_FS_VERITY + ATTR_LIST(verity), +#endif + ATTR_LIST(sb_checksum), +#if IS_ENABLED(CONFIG_UNICODE) + ATTR_LIST(casefold), +#endif + ATTR_LIST(readonly), +#ifdef CONFIG_F2FS_FS_COMPRESSION + ATTR_LIST(compression), +#endif + ATTR_LIST(pin_file), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_feat); + +F2FS_GENERAL_RO_ATTR(sb_status); +F2FS_GENERAL_RO_ATTR(cp_status); +static struct attribute *f2fs_stat_attrs[] = { + ATTR_LIST(sb_status), + ATTR_LIST(cp_status), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_stat); + +F2FS_SB_FEATURE_RO_ATTR(encryption, ENCRYPT); +F2FS_SB_FEATURE_RO_ATTR(block_zoned, BLKZONED); +F2FS_SB_FEATURE_RO_ATTR(extra_attr, EXTRA_ATTR); +F2FS_SB_FEATURE_RO_ATTR(project_quota, PRJQUOTA); +F2FS_SB_FEATURE_RO_ATTR(inode_checksum, INODE_CHKSUM); +F2FS_SB_FEATURE_RO_ATTR(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); +F2FS_SB_FEATURE_RO_ATTR(quota_ino, QUOTA_INO); +F2FS_SB_FEATURE_RO_ATTR(inode_crtime, INODE_CRTIME); +F2FS_SB_FEATURE_RO_ATTR(lost_found, LOST_FOUND); +F2FS_SB_FEATURE_RO_ATTR(verity, VERITY); +F2FS_SB_FEATURE_RO_ATTR(sb_checksum, SB_CHKSUM); +F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD); +F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION); +F2FS_SB_FEATURE_RO_ATTR(readonly, RO); + +static struct attribute *f2fs_sb_feat_attrs[] = { + ATTR_LIST(sb_encryption), + ATTR_LIST(sb_block_zoned), + ATTR_LIST(sb_extra_attr), + ATTR_LIST(sb_project_quota), + ATTR_LIST(sb_inode_checksum), + ATTR_LIST(sb_flexible_inline_xattr), + ATTR_LIST(sb_quota_ino), + ATTR_LIST(sb_inode_crtime), + ATTR_LIST(sb_lost_found), + ATTR_LIST(sb_verity), + ATTR_LIST(sb_sb_checksum), + ATTR_LIST(sb_casefold), + ATTR_LIST(sb_compression), + ATTR_LIST(sb_readonly), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_sb_feat); + +static const struct sysfs_ops f2fs_attr_ops = { + .show = f2fs_attr_show, + .store = f2fs_attr_store, +}; + +static struct kobj_type f2fs_sb_ktype = { + .default_groups = f2fs_groups, + .sysfs_ops = &f2fs_attr_ops, + .release = f2fs_sb_release, +}; + +static struct kobj_type f2fs_ktype = { + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kset f2fs_kset = { + .kobj = {.ktype = &f2fs_ktype}, +}; + +static struct kobj_type f2fs_feat_ktype = { + .default_groups = f2fs_feat_groups, + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kobject f2fs_feat = { + .kset = &f2fs_kset, +}; + +static ssize_t f2fs_stat_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_stat_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_stat_kobj_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + complete(&sbi->s_stat_kobj_unregister); +} + +static const struct sysfs_ops f2fs_stat_attr_ops = { + .show = f2fs_stat_attr_show, + .store = f2fs_stat_attr_store, +}; + +static struct kobj_type f2fs_stat_ktype = { + .default_groups = f2fs_stat_groups, + .sysfs_ops = &f2fs_stat_attr_ops, + .release = f2fs_stat_kobj_release, +}; + +static ssize_t f2fs_sb_feat_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_feature_list_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static void f2fs_feature_list_kobj_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_feature_list_kobj); + complete(&sbi->s_feature_list_kobj_unregister); +} + +static const struct sysfs_ops f2fs_feature_list_attr_ops = { + .show = f2fs_sb_feat_attr_show, +}; + +static struct kobj_type f2fs_feature_list_ktype = { + .default_groups = f2fs_sb_feat_groups, + .sysfs_ops = &f2fs_feature_list_attr_ops, + .release = f2fs_feature_list_kobj_release, +}; + +static int __maybe_unused segment_info_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i; + + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u", se->type, se->valid_blocks); + if ((i % 10) == 9 || i == (total_segs - 1)) + seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + + return 0; +} + +static int __maybe_unused segment_bits_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, se->valid_blocks); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, " %.2x", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; +} + +static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + int i; + + seq_puts(seq, "format: victim_secmap bitmaps\n"); + + for (i = 0; i < MAIN_SECS(sbi); i++) { + if ((i % 10) == 0) + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d", test_bit(i, dirty_i->victim_secmap) ? 1 : 0); + if ((i % 10) == 9 || i == (MAIN_SECS(sbi) - 1)) + seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + return 0; +} + +int __init f2fs_init_sysfs(void) +{ + int ret; + + kobject_set_name(&f2fs_kset.kobj, "f2fs"); + f2fs_kset.kobj.parent = fs_kobj; + ret = kset_register(&f2fs_kset); + if (ret) + return ret; + + ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype, + NULL, "features"); + if (ret) { + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); + } else { + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + } + return ret; +} + +void f2fs_exit_sysfs(void) +{ + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); + remove_proc_entry("fs/f2fs", NULL); + f2fs_proc_root = NULL; +} + +int f2fs_register_sysfs(struct f2fs_sb_info *sbi) +{ + struct super_block *sb = sbi->sb; + int err; + + sbi->s_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, + "%s", sb->s_id); + if (err) + goto put_sb_kobj; + + sbi->s_stat_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_stat_kobj_unregister); + err = kobject_init_and_add(&sbi->s_stat_kobj, &f2fs_stat_ktype, + &sbi->s_kobj, "stat"); + if (err) + goto put_stat_kobj; + + sbi->s_feature_list_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_feature_list_kobj_unregister); + err = kobject_init_and_add(&sbi->s_feature_list_kobj, + &f2fs_feature_list_ktype, + &sbi->s_kobj, "feature_list"); + if (err) + goto put_feature_list_kobj; + + if (f2fs_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + + if (sbi->s_proc) { + proc_create_single_data("segment_info", 0444, sbi->s_proc, + segment_info_seq_show, sb); + proc_create_single_data("segment_bits", 0444, sbi->s_proc, + segment_bits_seq_show, sb); +#ifdef CONFIG_F2FS_IOSTAT + proc_create_single_data("iostat_info", 0444, sbi->s_proc, + iostat_info_seq_show, sb); +#endif + proc_create_single_data("victim_bits", 0444, sbi->s_proc, + victim_bits_seq_show, sb); + } + return 0; +put_feature_list_kobj: + kobject_put(&sbi->s_feature_list_kobj); + wait_for_completion(&sbi->s_feature_list_kobj_unregister); +put_stat_kobj: + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); +put_sb_kobj: + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return err; +} + +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) +{ + if (sbi->s_proc) { +#ifdef CONFIG_F2FS_IOSTAT + remove_proc_entry("iostat_info", sbi->s_proc); +#endif + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry("victim_bits", sbi->s_proc); + remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); + } + + kobject_del(&sbi->s_stat_kobj); + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); + kobject_del(&sbi->s_feature_list_kobj); + kobject_put(&sbi->s_feature_list_kobj); + wait_for_completion(&sbi->s_feature_list_kobj_unregister); + + kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); +} diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c new file mode 100644 index 000000000..3f4f3295f --- /dev/null +++ b/fs/f2fs/verity.c @@ -0,0 +1,292 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/verity.c: fs-verity support for f2fs + * + * Copyright 2019 Google LLC + */ + +/* + * Implementation of fsverity_operations for f2fs. + * + * Like ext4, f2fs stores the verity metadata (Merkle tree and + * fsverity_descriptor) past the end of the file, starting at the first 64K + * boundary beyond i_size. This approach works because (a) verity files are + * readonly, and (b) pages fully beyond i_size aren't visible to userspace but + * can be read/written internally by f2fs with only some relatively small + * changes to f2fs. Extended attributes cannot be used because (a) f2fs limits + * the total size of an inode's xattr entries to 4096 bytes, which wouldn't be + * enough for even a single Merkle tree block, and (b) f2fs encryption doesn't + * encrypt xattrs, yet the verity metadata *must* be encrypted when the file is + * because it contains hashes of the plaintext data. + * + * Using a 64K boundary rather than a 4K one keeps things ready for + * architectures with 64K pages, and it doesn't necessarily waste space on-disk + * since there can be a hole between i_size and the start of the Merkle tree. + */ + +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "xattr.h" + +#define F2FS_VERIFY_VER (1) + +static inline loff_t f2fs_verity_metadata_pos(const struct inode *inode) +{ + return round_up(inode->i_size, 65536); +} + +/* + * Read some verity metadata from the inode. __vfs_read() can't be used because + * we need to read beyond i_size. + */ +static int pagecache_read(struct inode *inode, void *buf, size_t count, + loff_t pos) +{ + while (count) { + size_t n = min_t(size_t, count, + PAGE_SIZE - offset_in_page(pos)); + struct page *page; + + page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, + NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + memcpy_from_page(buf, page, offset_in_page(pos), n); + + put_page(page); + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +/* + * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY. + * kernel_write() can't be used because the file descriptor is readonly. + */ +static int pagecache_write(struct inode *inode, const void *buf, size_t count, + loff_t pos) +{ + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *aops = mapping->a_ops; + + if (pos + count > inode->i_sb->s_maxbytes) + return -EFBIG; + + while (count) { + size_t n = min_t(size_t, count, + PAGE_SIZE - offset_in_page(pos)); + struct page *page; + void *fsdata = NULL; + int res; + + res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); + if (res) + return res; + + memcpy_to_page(page, offset_in_page(pos), buf, n); + + res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata); + if (res < 0) + return res; + if (res != n) + return -EIO; + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +/* + * Format of f2fs verity xattr. This points to the location of the verity + * descriptor within the file data rather than containing it directly because + * the verity descriptor *must* be encrypted when f2fs encryption is used. But, + * f2fs encryption does not encrypt xattrs. + */ +struct fsverity_descriptor_location { + __le32 version; + __le32 size; + __le64 pos; +}; + +static int f2fs_begin_enable_verity(struct file *filp) +{ + struct inode *inode = file_inode(filp); + int err; + + if (f2fs_verity_in_progress(inode)) + return -EBUSY; + + if (f2fs_is_atomic_file(inode)) + return -EOPNOTSUPP; + + /* + * Since the file was opened readonly, we have to initialize the quotas + * here and not rely on ->open() doing it. This must be done before + * evicting the inline data. + */ + err = f2fs_dquot_initialize(inode); + if (err) + return err; + + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + + set_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return 0; +} + +static int f2fs_end_enable_verity(struct file *filp, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size; + struct fsverity_descriptor_location dloc = { + .version = cpu_to_le32(F2FS_VERIFY_VER), + .size = cpu_to_le32(desc_size), + .pos = cpu_to_le64(desc_pos), + }; + int err = 0, err2 = 0; + + /* + * If an error already occurred (which fs/verity/ signals by passing + * desc == NULL), then only clean-up is needed. + */ + if (desc == NULL) + goto cleanup; + + /* Append the verity descriptor. */ + err = pagecache_write(inode, desc, desc_size, desc_pos); + if (err) + goto cleanup; + + /* + * Write all pages (both data and verity metadata). Note that this must + * happen before clearing FI_VERITY_IN_PROGRESS; otherwise pages beyond + * i_size won't be written properly. For crash consistency, this also + * must happen before the verity inode flag gets persisted. + */ + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto cleanup; + + /* Set the verity xattr. */ + err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY, + F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), + NULL, XATTR_CREATE); + if (err) + goto cleanup; + + /* Finally, set the verity inode flag. */ + file_set_verity(inode); + f2fs_set_inode_flags(inode); + f2fs_mark_inode_dirty_sync(inode, true); + + clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return 0; + +cleanup: + /* + * Verity failed to be enabled, so clean up by truncating any verity + * metadata that was written beyond i_size (both from cache and from + * disk) and clearing FI_VERITY_IN_PROGRESS. + * + * Taking i_gc_rwsem[WRITE] is needed to stop f2fs garbage collection + * from re-instantiating cached pages we are truncating (since unlike + * normal file accesses, garbage collection isn't limited by i_size). + */ + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + truncate_inode_pages(inode->i_mapping, inode->i_size); + err2 = f2fs_truncate(inode); + if (err2) { + f2fs_err(sbi, "Truncating verity metadata failed (errno=%d)", + err2); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return err ?: err2; +} + +static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + struct fsverity_descriptor_location dloc; + int res; + u32 size; + u64 pos; + + /* Get the descriptor location */ + res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_VERITY, + F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), NULL); + if (res < 0 && res != -ERANGE) + return res; + if (res != sizeof(dloc) || dloc.version != cpu_to_le32(F2FS_VERIFY_VER)) { + f2fs_warn(F2FS_I_SB(inode), "unknown verity xattr format"); + return -EINVAL; + } + size = le32_to_cpu(dloc.size); + pos = le64_to_cpu(dloc.pos); + + /* Get the descriptor */ + if (pos + size < pos || pos + size > inode->i_sb->s_maxbytes || + pos < f2fs_verity_metadata_pos(inode) || size > INT_MAX) { + f2fs_warn(F2FS_I_SB(inode), "invalid verity xattr"); + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_VERITY_XATTR); + return -EFSCORRUPTED; + } + if (buf_size) { + if (size > buf_size) + return -ERANGE; + res = pagecache_read(inode, buf, size, pos); + if (res) + return res; + } + return size; +} + +static struct page *f2fs_read_merkle_tree_page(struct inode *inode, + pgoff_t index, + unsigned long num_ra_pages) +{ + struct page *page; + + index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; + + page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); + if (!page || !PageUptodate(page)) { + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); + + if (page) + put_page(page); + else if (num_ra_pages > 1) + page_cache_ra_unbounded(&ractl, num_ra_pages, 0); + page = read_mapping_page(inode->i_mapping, index, NULL); + } + return page; +} + +static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf, + u64 index, int log_blocksize) +{ + loff_t pos = f2fs_verity_metadata_pos(inode) + (index << log_blocksize); + + return pagecache_write(inode, buf, 1 << log_blocksize, pos); +} + +const struct fsverity_operations f2fs_verityops = { + .begin_enable_verity = f2fs_begin_enable_verity, + .end_enable_verity = f2fs_end_enable_verity, + .get_verity_descriptor = f2fs_get_verity_descriptor, + .read_merkle_tree_page = f2fs_read_merkle_tree_page, + .write_merkle_tree_block = f2fs_write_merkle_tree_block, +}; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c new file mode 100644 index 000000000..0631b383e --- /dev/null +++ b/fs/f2fs/xattr.c @@ -0,0 +1,844 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/xattr.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext2/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher <agruen@suse.de> + * + * Fix by Harrison Xing <harrison@mountainviewdata.com>. + * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko <luka.renko@hermes.si>. + * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>, + * Red Hat Inc. + */ +#include <linux/rwsem.h> +#include <linux/f2fs_fs.h> +#include <linux/security.h> +#include <linux/posix_acl_xattr.h> +#include "f2fs.h" +#include "xattr.h" +#include "segment.h" + +static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline) +{ + if (likely(size == sbi->inline_xattr_slab_size)) { + *is_inline = true; + return f2fs_kmem_cache_alloc(sbi->inline_xattr_slab, + GFP_F2FS_ZERO, false, sbi); + } + *is_inline = false; + return f2fs_kzalloc(sbi, size, GFP_NOFS); +} + +static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr, + bool is_inline) +{ + if (is_inline) + kmem_cache_free(sbi->inline_xattr_slab, xattr_addr); + else + kfree(xattr_addr); +} + +static int f2fs_xattr_generic_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + + switch (handler->flags) { + case F2FS_XATTR_INDEX_USER: + if (!test_opt(sbi, XATTR_USER)) + return -EOPNOTSUPP; + break; + case F2FS_XATTR_INDEX_TRUSTED: + case F2FS_XATTR_INDEX_SECURITY: + break; + default: + return -EINVAL; + } + return f2fs_getxattr(inode, handler->flags, name, + buffer, size, NULL); +} + +static int f2fs_xattr_generic_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + + switch (handler->flags) { + case F2FS_XATTR_INDEX_USER: + if (!test_opt(sbi, XATTR_USER)) + return -EOPNOTSUPP; + break; + case F2FS_XATTR_INDEX_TRUSTED: + case F2FS_XATTR_INDEX_SECURITY: + break; + default: + return -EINVAL; + } + return f2fs_setxattr(inode, handler->flags, name, + value, size, NULL, flags); +} + +static bool f2fs_xattr_user_list(struct dentry *dentry) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + + return test_opt(sbi, XATTR_USER); +} + +static bool f2fs_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); +} + +static int f2fs_xattr_advise_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + if (buffer) + *((char *)buffer) = F2FS_I(inode)->i_advise; + return sizeof(char); +} + +static int f2fs_xattr_advise_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + unsigned char old_advise = F2FS_I(inode)->i_advise; + unsigned char new_advise; + + if (!inode_owner_or_capable(&init_user_ns, inode)) + return -EPERM; + if (value == NULL) + return -EINVAL; + + new_advise = *(char *)value; + if (new_advise & ~FADVISE_MODIFIABLE_BITS) + return -EINVAL; + + new_advise = new_advise & FADVISE_MODIFIABLE_BITS; + new_advise |= old_advise & ~FADVISE_MODIFIABLE_BITS; + + F2FS_I(inode)->i_advise = new_advise; + f2fs_mark_inode_dirty_sync(inode, true); + return 0; +} + +#ifdef CONFIG_F2FS_FS_SECURITY +static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *page) +{ + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, (struct page *)page, 0); + if (err < 0) + break; + } + return err; +} + +int f2fs_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr, struct page *ipage) +{ + return security_inode_init_security(inode, dir, qstr, + &f2fs_initxattrs, ipage); +} +#endif + +const struct xattr_handler f2fs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = F2FS_XATTR_INDEX_USER, + .list = f2fs_xattr_user_list, + .get = f2fs_xattr_generic_get, + .set = f2fs_xattr_generic_set, +}; + +const struct xattr_handler f2fs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = F2FS_XATTR_INDEX_TRUSTED, + .list = f2fs_xattr_trusted_list, + .get = f2fs_xattr_generic_get, + .set = f2fs_xattr_generic_set, +}; + +const struct xattr_handler f2fs_xattr_advise_handler = { + .name = F2FS_SYSTEM_ADVISE_NAME, + .flags = F2FS_XATTR_INDEX_ADVISE, + .get = f2fs_xattr_advise_get, + .set = f2fs_xattr_advise_set, +}; + +const struct xattr_handler f2fs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = F2FS_XATTR_INDEX_SECURITY, + .get = f2fs_xattr_generic_get, + .set = f2fs_xattr_generic_set, +}; + +static const struct xattr_handler *f2fs_xattr_handler_map[] = { + [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, +#ifdef CONFIG_F2FS_FS_POSIX_ACL + [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, +#endif + [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, +#ifdef CONFIG_F2FS_FS_SECURITY + [F2FS_XATTR_INDEX_SECURITY] = &f2fs_xattr_security_handler, +#endif + [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler, +}; + +const struct xattr_handler *f2fs_xattr_handlers[] = { + &f2fs_xattr_user_handler, +#ifdef CONFIG_F2FS_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &f2fs_xattr_trusted_handler, +#ifdef CONFIG_F2FS_FS_SECURITY + &f2fs_xattr_security_handler, +#endif + &f2fs_xattr_advise_handler, + NULL, +}; + +static inline const struct xattr_handler *f2fs_xattr_handler(int index) +{ + const struct xattr_handler *handler = NULL; + + if (index > 0 && index < ARRAY_SIZE(f2fs_xattr_handler_map)) + handler = f2fs_xattr_handler_map[index]; + return handler; +} + +static struct f2fs_xattr_entry *__find_xattr(void *base_addr, + void *last_base_addr, void **last_addr, + int index, size_t len, const char *name) +{ + struct f2fs_xattr_entry *entry; + + list_for_each_xattr(entry, base_addr) { + if ((void *)(entry) + sizeof(__u32) > last_base_addr || + (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { + if (last_addr) + *last_addr = entry; + return NULL; + } + + if (entry->e_name_index != index) + continue; + if (entry->e_name_len != len) + continue; + if (!memcmp(entry->e_name, name, len)) + break; + } + return entry; +} + +static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, + void *base_addr, void **last_addr, int index, + size_t len, const char *name) +{ + struct f2fs_xattr_entry *entry; + unsigned int inline_size = inline_xattr_size(inode); + void *max_addr = base_addr + inline_size; + + entry = __find_xattr(base_addr, max_addr, last_addr, index, len, name); + if (!entry) + return NULL; + + /* inline xattr header or entry across max inline xattr size */ + if (IS_XATTR_LAST_ENTRY(entry) && + (void *)entry + sizeof(__u32) > max_addr) { + *last_addr = entry; + return NULL; + } + return entry; +} + +static int read_inline_xattr(struct inode *inode, struct page *ipage, + void *txattr_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int inline_size = inline_xattr_size(inode); + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(inode, ipage); + } else { + page = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) + return PTR_ERR(page); + + inline_addr = inline_xattr_addr(inode, page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + + return 0; +} + +static int read_xattr_block(struct inode *inode, void *txattr_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int inline_size = inline_xattr_size(inode); + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block. */ + xpage = f2fs_get_node_page(sbi, xnid); + if (IS_ERR(xpage)) + return PTR_ERR(xpage); + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, VALID_XATTR_BLOCK_SIZE); + f2fs_put_page(xpage, 1); + + return 0; +} + +static int lookup_all_xattrs(struct inode *inode, struct page *ipage, + unsigned int index, unsigned int len, + const char *name, struct f2fs_xattr_entry **xe, + void **base_addr, int *base_size, + bool *is_inline) +{ + void *cur_addr, *txattr_addr, *last_txattr_addr; + void *last_addr = NULL; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int inline_size = inline_xattr_size(inode); + int err; + + if (!xnid && !inline_size) + return -ENODATA; + + *base_size = XATTR_SIZE(inode) + XATTR_PADDING_SIZE; + txattr_addr = xattr_alloc(F2FS_I_SB(inode), *base_size, is_inline); + if (!txattr_addr) + return -ENOMEM; + + last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(inode); + + /* read from inline xattr */ + if (inline_size) { + err = read_inline_xattr(inode, ipage, txattr_addr); + if (err) + goto out; + + *xe = __find_inline_xattr(inode, txattr_addr, &last_addr, + index, len, name); + if (*xe) { + *base_size = inline_size; + goto check; + } + } + + /* read from xattr node block */ + if (xnid) { + err = read_xattr_block(inode, txattr_addr); + if (err) + goto out; + } + + if (last_addr) + cur_addr = XATTR_HDR(last_addr) - 1; + else + cur_addr = txattr_addr; + + *xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name); + if (!*xe) { + f2fs_err(F2FS_I_SB(inode), "lookup inode (%lu) has corrupted xattr", + inode->i_ino); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + err = -ENODATA; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); + goto out; + } +check: + if (IS_XATTR_LAST_ENTRY(*xe)) { + err = -ENODATA; + goto out; + } + + *base_addr = txattr_addr; + return 0; +out: + xattr_free(F2FS_I_SB(inode), txattr_addr, *is_inline); + return err; +} + +static int read_all_xattrs(struct inode *inode, struct page *ipage, + void **base_addr) +{ + struct f2fs_xattr_header *header; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = VALID_XATTR_BLOCK_SIZE; + unsigned int inline_size = inline_xattr_size(inode); + void *txattr_addr; + int err; + + txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), + inline_size + size + XATTR_PADDING_SIZE, GFP_NOFS); + if (!txattr_addr) + return -ENOMEM; + + /* read from inline xattr */ + if (inline_size) { + err = read_inline_xattr(inode, ipage, txattr_addr); + if (err) + goto fail; + } + + /* read from xattr node block */ + if (xnid) { + err = read_xattr_block(inode, txattr_addr); + if (err) + goto fail; + } + + header = XATTR_HDR(txattr_addr); + + /* never been allocated xattrs */ + if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) { + header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); + header->h_refcount = cpu_to_le32(1); + } + *base_addr = txattr_addr; + return 0; +fail: + kfree(txattr_addr); + return err; +} + +static inline int write_all_xattrs(struct inode *inode, __u32 hsize, + void *txattr_addr, struct page *ipage) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + size_t inline_size = inline_xattr_size(inode); + struct page *in_page = NULL; + void *xattr_addr; + void *inline_addr = NULL; + struct page *xpage; + nid_t new_nid = 0; + int err = 0; + + if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) + if (!f2fs_alloc_nid(sbi, &new_nid)) + return -ENOSPC; + + /* write to inline xattr */ + if (inline_size) { + if (ipage) { + inline_addr = inline_xattr_addr(inode, ipage); + } else { + in_page = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(in_page)) { + f2fs_alloc_nid_failed(sbi, new_nid); + return PTR_ERR(in_page); + } + inline_addr = inline_xattr_addr(inode, in_page); + } + + f2fs_wait_on_page_writeback(ipage ? ipage : in_page, + NODE, true, true); + /* no need to use xattr node block */ + if (hsize <= inline_size) { + err = f2fs_truncate_xattr_node(inode); + f2fs_alloc_nid_failed(sbi, new_nid); + if (err) { + f2fs_put_page(in_page, 1); + return err; + } + memcpy(inline_addr, txattr_addr, inline_size); + set_page_dirty(ipage ? ipage : in_page); + goto in_page_out; + } + } + + /* write to xattr node block */ + if (F2FS_I(inode)->i_xattr_nid) { + xpage = f2fs_get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); + f2fs_alloc_nid_failed(sbi, new_nid); + goto in_page_out; + } + f2fs_bug_on(sbi, new_nid); + f2fs_wait_on_page_writeback(xpage, NODE, true, true); + } else { + struct dnode_of_data dn; + + set_new_dnode(&dn, inode, NULL, NULL, new_nid); + xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); + f2fs_alloc_nid_failed(sbi, new_nid); + goto in_page_out; + } + f2fs_alloc_nid_done(sbi, new_nid); + } + xattr_addr = page_address(xpage); + + if (inline_size) + memcpy(inline_addr, txattr_addr, inline_size); + memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); + + if (inline_size) + set_page_dirty(ipage ? ipage : in_page); + set_page_dirty(xpage); + + f2fs_put_page(xpage, 1); +in_page_out: + f2fs_put_page(in_page, 1); + return err; +} + +int f2fs_getxattr(struct inode *inode, int index, const char *name, + void *buffer, size_t buffer_size, struct page *ipage) +{ + struct f2fs_xattr_entry *entry = NULL; + int error; + unsigned int size, len; + void *base_addr = NULL; + int base_size; + bool is_inline; + + if (name == NULL) + return -EINVAL; + + len = strlen(name); + if (len > F2FS_NAME_LEN) + return -ERANGE; + + if (!ipage) + f2fs_down_read(&F2FS_I(inode)->i_xattr_sem); + error = lookup_all_xattrs(inode, ipage, index, len, name, + &entry, &base_addr, &base_size, &is_inline); + if (!ipage) + f2fs_up_read(&F2FS_I(inode)->i_xattr_sem); + if (error) + return error; + + size = le16_to_cpu(entry->e_value_size); + + if (buffer && size > buffer_size) { + error = -ERANGE; + goto out; + } + + if (buffer) { + char *pval = entry->e_name + entry->e_name_len; + + if (base_size - (pval - (char *)base_addr) < size) { + error = -ERANGE; + goto out; + } + memcpy(buffer, pval, size); + } + error = size; +out: + xattr_free(F2FS_I_SB(inode), base_addr, is_inline); + return error; +} + +ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = d_inode(dentry); + struct f2fs_xattr_entry *entry; + void *base_addr, *last_base_addr; + int error; + size_t rest = buffer_size; + + f2fs_down_read(&F2FS_I(inode)->i_xattr_sem); + error = read_all_xattrs(inode, NULL, &base_addr); + f2fs_up_read(&F2FS_I(inode)->i_xattr_sem); + if (error) + return error; + + last_base_addr = (void *)base_addr + XATTR_SIZE(inode); + + list_for_each_xattr(entry, base_addr) { + const struct xattr_handler *handler = + f2fs_xattr_handler(entry->e_name_index); + const char *prefix; + size_t prefix_len; + size_t size; + + if ((void *)(entry) + sizeof(__u32) > last_base_addr || + (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { + f2fs_err(F2FS_I_SB(inode), "list inode (%lu) has corrupted xattr", + inode->i_ino); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); + break; + } + + if (!handler || (handler->list && !handler->list(dentry))) + continue; + + prefix = xattr_prefix(handler); + prefix_len = strlen(prefix); + size = prefix_len + entry->e_name_len + 1; + if (buffer) { + if (size > rest) { + error = -ERANGE; + goto cleanup; + } + memcpy(buffer, prefix, prefix_len); + buffer += prefix_len; + memcpy(buffer, entry->e_name, entry->e_name_len); + buffer += entry->e_name_len; + *buffer++ = 0; + } + rest -= size; + } + error = buffer_size - rest; +cleanup: + kfree(base_addr); + return error; +} + +static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry, + const void *value, size_t size) +{ + void *pval = entry->e_name + entry->e_name_len; + + return (le16_to_cpu(entry->e_value_size) == size) && + !memcmp(pval, value, size); +} + +static int __f2fs_setxattr(struct inode *inode, int index, + const char *name, const void *value, size_t size, + struct page *ipage, int flags) +{ + struct f2fs_xattr_entry *here, *last; + void *base_addr, *last_base_addr; + int found, newsize; + size_t len; + __u32 new_hsize; + int error; + + if (name == NULL) + return -EINVAL; + + if (value == NULL) + size = 0; + + len = strlen(name); + + if (len > F2FS_NAME_LEN) + return -ERANGE; + + if (size > MAX_VALUE_LEN(inode)) + return -E2BIG; +retry: + error = read_all_xattrs(inode, ipage, &base_addr); + if (error) + return error; + + last_base_addr = (void *)base_addr + XATTR_SIZE(inode); + + /* find entry with wanted name. */ + here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name); + if (!here) { + if (!F2FS_I(inode)->i_xattr_nid) { + error = f2fs_recover_xattr_data(inode, NULL); + f2fs_notice(F2FS_I_SB(inode), + "recover xattr in inode (%lu), error(%d)", + inode->i_ino, error); + if (!error) { + kfree(base_addr); + goto retry; + } + } + f2fs_err(F2FS_I_SB(inode), "set inode (%lu) has corrupted xattr", + inode->i_ino); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); + goto exit; + } + + found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1; + + if (found) { + if ((flags & XATTR_CREATE)) { + error = -EEXIST; + goto exit; + } + + if (value && f2fs_xattr_value_same(here, value, size)) + goto same; + } else if ((flags & XATTR_REPLACE)) { + error = -ENODATA; + goto exit; + } + + last = here; + while (!IS_XATTR_LAST_ENTRY(last)) { + if ((void *)(last) + sizeof(__u32) > last_base_addr || + (void *)XATTR_NEXT_ENTRY(last) > last_base_addr) { + f2fs_err(F2FS_I_SB(inode), "inode (%lu) has invalid last xattr entry, entry_size: %zu", + inode->i_ino, ENTRY_SIZE(last)); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); + goto exit; + } + last = XATTR_NEXT_ENTRY(last); + } + + newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + len + size); + + /* 1. Check space */ + if (value) { + int free; + /* + * If value is NULL, it is remove operation. + * In case of update operation, we calculate free. + */ + free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr); + if (found) + free = free + ENTRY_SIZE(here); + + if (unlikely(free < newsize)) { + error = -E2BIG; + goto exit; + } + } + + /* 2. Remove old entry */ + if (found) { + /* + * If entry is found, remove old entry. + * If not found, remove operation is not needed. + */ + struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here); + int oldsize = ENTRY_SIZE(here); + + memmove(here, next, (char *)last - (char *)next); + last = (struct f2fs_xattr_entry *)((char *)last - oldsize); + memset(last, 0, oldsize); + } + + new_hsize = (char *)last - (char *)base_addr; + + /* 3. Write new entry */ + if (value) { + char *pval; + /* + * Before we come here, old entry is removed. + * We just write new entry. + */ + last->e_name_index = index; + last->e_name_len = len; + memcpy(last->e_name, name, len); + pval = last->e_name + len; + memcpy(pval, value, size); + last->e_value_size = cpu_to_le16(size); + new_hsize += newsize; + /* + * Explicitly add the null terminator. The unused xattr space + * is supposed to always be zeroed, which would make this + * unnecessary, but don't depend on that. + */ + *(u32 *)((u8 *)last + newsize) = 0; + } + + error = write_all_xattrs(inode, new_hsize, base_addr, ipage); + if (error) + goto exit; + + if (index == F2FS_XATTR_INDEX_ENCRYPTION && + !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) + f2fs_set_encrypted_inode(inode); + f2fs_mark_inode_dirty_sync(inode, true); + if (!error && S_ISDIR(inode->i_mode)) + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); + +same: + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + inode->i_ctime = current_time(inode); + clear_inode_flag(inode, FI_ACL_MODE); + } + +exit: + kfree(base_addr); + return error; +} + +int f2fs_setxattr(struct inode *inode, int index, const char *name, + const void *value, size_t size, + struct page *ipage, int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; + + err = f2fs_dquot_initialize(inode); + if (err) + return err; + + /* this case is only from f2fs_init_inode_metadata */ + if (ipage) + return __f2fs_setxattr(inode, index, name, value, + size, ipage, flags); + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + f2fs_down_write(&F2FS_I(inode)->i_xattr_sem); + err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); + f2fs_up_write(&F2FS_I(inode)->i_xattr_sem); + f2fs_unlock_op(sbi); + + f2fs_update_time(sbi, REQ_TIME); + return err; +} + +int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + char slab_name[32]; + + sprintf(slab_name, "f2fs_xattr_entry-%u:%u", MAJOR(dev), MINOR(dev)); + + sbi->inline_xattr_slab_size = F2FS_OPTION(sbi).inline_xattr_size * + sizeof(__le32) + XATTR_PADDING_SIZE; + + sbi->inline_xattr_slab = f2fs_kmem_cache_create(slab_name, + sbi->inline_xattr_slab_size); + if (!sbi->inline_xattr_slab) + return -ENOMEM; + + return 0; +} + +void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) +{ + kmem_cache_destroy(sbi->inline_xattr_slab); +} diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h new file mode 100644 index 000000000..416d65277 --- /dev/null +++ b/fs/f2fs/xattr.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * fs/f2fs/xattr.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext2/xattr.h + * + * On-disk format of extended attributes for the ext2 filesystem. + * + * (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> + */ +#ifndef __F2FS_XATTR_H__ +#define __F2FS_XATTR_H__ + +#include <linux/init.h> +#include <linux/xattr.h> + +/* Magic value in attribute blocks */ +#define F2FS_XATTR_MAGIC 0xF2F52011 + +/* Maximum number of references to one attribute block */ +#define F2FS_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define F2FS_SYSTEM_ADVISE_NAME "system.advise" +#define F2FS_XATTR_INDEX_USER 1 +#define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define F2FS_XATTR_INDEX_TRUSTED 4 +#define F2FS_XATTR_INDEX_LUSTRE 5 +#define F2FS_XATTR_INDEX_SECURITY 6 +#define F2FS_XATTR_INDEX_ADVISE 7 +/* Should be same as EXT4_XATTR_INDEX_ENCRYPTION */ +#define F2FS_XATTR_INDEX_ENCRYPTION 9 +#define F2FS_XATTR_INDEX_VERITY 11 + +#define F2FS_XATTR_NAME_ENCRYPTION_CONTEXT "c" +#define F2FS_XATTR_NAME_VERITY "v" + +struct f2fs_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __u32 h_reserved[4]; /* zero right now */ +}; + +struct f2fs_xattr_entry { + __u8 e_name_index; + __u8 e_name_len; + __le16 e_value_size; /* size of attribute value */ + char e_name[]; /* attribute name */ +}; + +#define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr)) +#define XATTR_ENTRY(ptr) ((struct f2fs_xattr_entry *)(ptr)) +#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1)) +#define XATTR_ROUND (3) + +#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND) + +#define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \ + (entry)->e_name_len + le16_to_cpu((entry)->e_value_size))) + +#define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\ + ENTRY_SIZE(entry))) + +#define IS_XATTR_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#define list_for_each_xattr(entry, addr) \ + for (entry = XATTR_FIRST_ENTRY(addr);\ + !IS_XATTR_LAST_ENTRY(entry);\ + entry = XATTR_NEXT_ENTRY(entry)) +#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) +#define XATTR_PADDING_SIZE (sizeof(__u32)) +#define XATTR_SIZE(i) ((F2FS_I(i)->i_xattr_nid ? \ + VALID_XATTR_BLOCK_SIZE : 0) + \ + (inline_xattr_size(i))) +#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ + VALID_XATTR_BLOCK_SIZE) + +#define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \ + sizeof(struct f2fs_xattr_header) - \ + sizeof(struct f2fs_xattr_entry)) + +#define MAX_INLINE_XATTR_SIZE \ + (DEF_ADDRS_PER_INODE - \ + F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - \ + DEF_INLINE_RESERVED_SIZE - \ + MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) + +/* + * On-disk structure of f2fs_xattr + * We use inline xattrs space + 1 block for xattr. + * + * +--------------------+ + * | f2fs_xattr_header | + * | | + * +--------------------+ + * | f2fs_xattr_entry | + * | .e_name_index = 1 | + * | .e_name_len = 3 | + * | .e_value_size = 14 | + * | .e_name = "foo" | + * | "value_of_xattr" |<- value_offs = e_name + e_name_len + * +--------------------+ + * | f2fs_xattr_entry | + * | .e_name_index = 4 | + * | .e_name = "bar" | + * +--------------------+ + * | | + * | Free | + * | | + * +--------------------+<- MIN_OFFSET + * | node_footer | + * | (nid, ino, offset) | + * +--------------------+ + * + **/ + +#ifdef CONFIG_F2FS_FS_XATTR +extern const struct xattr_handler f2fs_xattr_user_handler; +extern const struct xattr_handler f2fs_xattr_trusted_handler; +extern const struct xattr_handler f2fs_xattr_advise_handler; +extern const struct xattr_handler f2fs_xattr_security_handler; + +extern const struct xattr_handler *f2fs_xattr_handlers[]; + +extern int f2fs_setxattr(struct inode *, int, const char *, + const void *, size_t, struct page *, int); +extern int f2fs_getxattr(struct inode *, int, const char *, void *, + size_t, struct page *); +extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); +extern int f2fs_init_xattr_caches(struct f2fs_sb_info *); +extern void f2fs_destroy_xattr_caches(struct f2fs_sb_info *); +#else + +#define f2fs_xattr_handlers NULL +#define f2fs_listxattr NULL +static inline int f2fs_setxattr(struct inode *inode, int index, + const char *name, const void *value, size_t size, + struct page *page, int flags) +{ + return -EOPNOTSUPP; +} +static inline int f2fs_getxattr(struct inode *inode, int index, + const char *name, void *buffer, + size_t buffer_size, struct page *dpage) +{ + return -EOPNOTSUPP; +} +static inline int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) { } +#endif + +#ifdef CONFIG_F2FS_FS_SECURITY +extern int f2fs_init_security(struct inode *, struct inode *, + const struct qstr *, struct page *); +#else +static inline int f2fs_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr, struct page *ipage) +{ + return 0; +} +#endif +#endif /* __F2FS_XATTR_H__ */ |