diff options
Diffstat (limited to 'fs/f2fs')
-rw-r--r-- | fs/f2fs/Kconfig | 100 | ||||
-rw-r--r-- | fs/f2fs/Makefile | 10 | ||||
-rw-r--r-- | fs/f2fs/acl.c | 408 | ||||
-rw-r--r-- | fs/f2fs/acl.h | 53 | ||||
-rw-r--r-- | fs/f2fs/checkpoint.c | 1576 | ||||
-rw-r--r-- | fs/f2fs/data.c | 2761 | ||||
-rw-r--r-- | fs/f2fs/debug.c | 527 | ||||
-rw-r--r-- | fs/f2fs/dir.c | 940 | ||||
-rw-r--r-- | fs/f2fs/extent_cache.c | 835 | ||||
-rw-r--r-- | fs/f2fs/f2fs.h | 3504 | ||||
-rw-r--r-- | fs/f2fs/file.c | 3106 | ||||
-rw-r--r-- | fs/f2fs/gc.c | 1269 | ||||
-rw-r--r-- | fs/f2fs/gc.h | 113 | ||||
-rw-r--r-- | fs/f2fs/hash.c | 108 | ||||
-rw-r--r-- | fs/f2fs/inline.c | 735 | ||||
-rw-r--r-- | fs/f2fs/inode.c | 792 | ||||
-rw-r--r-- | fs/f2fs/namei.c | 1277 | ||||
-rw-r--r-- | fs/f2fs/node.c | 3174 | ||||
-rw-r--r-- | fs/f2fs/node.h | 458 | ||||
-rw-r--r-- | fs/f2fs/recovery.c | 766 | ||||
-rw-r--r-- | fs/f2fs/segment.c | 4397 | ||||
-rw-r--r-- | fs/f2fs/segment.h | 868 | ||||
-rw-r--r-- | fs/f2fs/shrinker.c | 143 | ||||
-rw-r--r-- | fs/f2fs/super.c | 3377 | ||||
-rw-r--r-- | fs/f2fs/sysfs.c | 721 | ||||
-rw-r--r-- | fs/f2fs/trace.c | 168 | ||||
-rw-r--r-- | fs/f2fs/trace.h | 46 | ||||
-rw-r--r-- | fs/f2fs/xattr.c | 770 | ||||
-rw-r--r-- | fs/f2fs/xattr.h | 160 |
29 files changed, 33162 insertions, 0 deletions
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig new file mode 100644 index 000000000..9a20ef42f --- /dev/null +++ b/fs/f2fs/Kconfig @@ -0,0 +1,100 @@ +config F2FS_FS + tristate "F2FS filesystem support" + depends on BLOCK + select CRYPTO + select CRYPTO_CRC32 + help + F2FS is based on Log-structured File System (LFS), which supports + versatile "flash-friendly" features. The design has been focused on + addressing the fundamental issues in LFS, which are snowball effect + of wandering tree and high cleaning overhead. + + Since flash-based storages show different characteristics according to + the internal geometry or flash memory management schemes aka FTL, F2FS + and tools support various parameters not only for configuring on-disk + layout, but also for selecting allocation and cleaning algorithms. + + If unsure, say N. + +config F2FS_STAT_FS + bool "F2FS Status Information" + depends on F2FS_FS && DEBUG_FS + default y + help + /sys/kernel/debug/f2fs/ contains information about all the partitions + mounted as f2fs. Each file shows the whole f2fs information. + + /sys/kernel/debug/f2fs/status includes: + - major filesystem information managed by f2fs currently + - average SIT information about whole segments + - current memory footprint consumed by f2fs. + +config F2FS_FS_XATTR + bool "F2FS extended attributes" + depends on F2FS_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page for details). + + If unsure, say N. + +config F2FS_FS_POSIX_ACL + bool "F2FS Access Control Lists" + depends on F2FS_FS_XATTR + select FS_POSIX_ACL + default y + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + If you don't know what Access Control Lists are, say N + +config F2FS_FS_SECURITY + bool "F2FS Security Labels" + depends on F2FS_FS_XATTR + help + Security labels provide an access control facility to support Linux + Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO + Linux. This option enables an extended attribute handler for file + security labels in the f2fs filesystem, so that it requires enabling + the extended attribute support in advance. + + If you are not using a security module, say N. + +config F2FS_CHECK_FS + bool "F2FS consistency checking feature" + depends on F2FS_FS + help + Enables BUG_ONs which check the filesystem consistency in runtime. + + If you want to improve the performance, say N. + +config F2FS_FS_ENCRYPTION + bool "F2FS Encryption" + depends on F2FS_FS + depends on F2FS_FS_XATTR + select FS_ENCRYPTION + help + Enable encryption of f2fs files and directories. This + feature is similar to ecryptfs, but it is more memory + efficient since it avoids caching the encrypted and + decrypted pages in the page cache. + +config F2FS_IO_TRACE + bool "F2FS IO tracer" + depends on F2FS_FS + depends on FUNCTION_TRACER + help + F2FS IO trace is based on a function trace, which gathers process + information and block IO patterns in the filesystem level. + + If unsure, say N. + +config F2FS_FAULT_INJECTION + bool "F2FS fault injection facility" + depends on F2FS_FS + help + Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on. + + If unsure, say N. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile new file mode 100644 index 000000000..776c4b936 --- /dev/null +++ b/fs/f2fs/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_F2FS_FS) += f2fs.o + +f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o +f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o +f2fs-y += shrinker.o extent_cache.o sysfs.o +f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o +f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o +f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o +f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c new file mode 100644 index 000000000..b9fe937a3 --- /dev/null +++ b/fs/f2fs/acl.c @@ -0,0 +1,408 @@ +/* + * fs/f2fs/acl.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext2/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/f2fs_fs.h> +#include "f2fs.h" +#include "xattr.h" +#include "acl.h" + +static inline size_t f2fs_acl_size(int count) +{ + if (count <= 4) { + return sizeof(struct f2fs_acl_header) + + count * sizeof(struct f2fs_acl_entry_short); + } else { + return sizeof(struct f2fs_acl_header) + + 4 * sizeof(struct f2fs_acl_entry_short) + + (count - 4) * sizeof(struct f2fs_acl_entry); + } +} + +static inline int f2fs_acl_count(size_t size) +{ + ssize_t s; + size -= sizeof(struct f2fs_acl_header); + s = size - 4 * sizeof(struct f2fs_acl_entry_short); + if (s < 0) { + if (size % sizeof(struct f2fs_acl_entry_short)) + return -1; + return size / sizeof(struct f2fs_acl_entry_short); + } else { + if (s % sizeof(struct f2fs_acl_entry)) + return -1; + return s / sizeof(struct f2fs_acl_entry) + 4; + } +} + +static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size) +{ + int i, count; + struct posix_acl *acl; + struct f2fs_acl_header *hdr = (struct f2fs_acl_header *)value; + struct f2fs_acl_entry *entry = (struct f2fs_acl_entry *)(hdr + 1); + const char *end = value + size; + + if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION)) + return ERR_PTR(-EINVAL); + + count = f2fs_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + + if ((char *)entry > end) + goto fail; + + acl->a_entries[i].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[i].e_perm = le16_to_cpu(entry->e_perm); + + switch (acl->a_entries[i].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry_short)); + break; + + case ACL_USER: + acl->a_entries[i].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry)); + break; + case ACL_GROUP: + acl->a_entries[i].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry)); + break; + default: + goto fail; + } + } + if ((char *)entry != end) + goto fail; + return acl; +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi, + const struct posix_acl *acl, size_t *size) +{ + struct f2fs_acl_header *f2fs_acl; + struct f2fs_acl_entry *entry; + int i; + + f2fs_acl = f2fs_kmalloc(sbi, sizeof(struct f2fs_acl_header) + + acl->a_count * sizeof(struct f2fs_acl_entry), + GFP_NOFS); + if (!f2fs_acl) + return ERR_PTR(-ENOMEM); + + f2fs_acl->a_version = cpu_to_le32(F2FS_ACL_VERSION); + entry = (struct f2fs_acl_entry *)(f2fs_acl + 1); + + for (i = 0; i < acl->a_count; i++) { + + entry->e_tag = cpu_to_le16(acl->a_entries[i].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[i].e_perm); + + switch (acl->a_entries[i].e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, + acl->a_entries[i].e_uid)); + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry)); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, + acl->a_entries[i].e_gid)); + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry)); + break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + entry = (struct f2fs_acl_entry *)((char *)entry + + sizeof(struct f2fs_acl_entry_short)); + break; + default: + goto fail; + } + } + *size = f2fs_acl_size(acl->a_count); + return (void *)f2fs_acl; + +fail: + kfree(f2fs_acl); + return ERR_PTR(-EINVAL); +} + +static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, + struct page *dpage) +{ + int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; + void *value = NULL; + struct posix_acl *acl; + int retval; + + if (type == ACL_TYPE_ACCESS) + name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; + + retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage); + if (retval > 0) { + value = f2fs_kmalloc(F2FS_I_SB(inode), retval, GFP_F2FS_ZERO); + if (!value) + return ERR_PTR(-ENOMEM); + retval = f2fs_getxattr(inode, name_index, "", value, + retval, dpage); + } + + if (retval > 0) + acl = f2fs_acl_from_disk(value, retval); + else if (retval == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(retval); + kfree(value); + + return acl; +} + +struct posix_acl *f2fs_get_acl(struct inode *inode, int type) +{ + return __f2fs_get_acl(inode, type, NULL); +} + +static int __f2fs_set_acl(struct inode *inode, int type, + struct posix_acl *acl, struct page *ipage) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int error; + umode_t mode = inode->i_mode; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl && !ipage) { + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) + return error; + set_acl_inode(inode, mode); + } + break; + + case ACL_TYPE_DEFAULT: + name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + + default: + return -EINVAL; + } + + if (acl) { + value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); + if (IS_ERR(value)) { + clear_inode_flag(inode, FI_ACL_MODE); + return PTR_ERR(value); + } + } + + error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0); + + kfree(value); + if (!error) + set_cached_acl(inode, type, acl); + + clear_inode_flag(inode, FI_ACL_MODE); + return error; +} + +int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + + return __f2fs_set_acl(inode, type, acl, NULL); +} + +/* + * Most part of f2fs_acl_clone, f2fs_acl_create_masq, f2fs_acl_create + * are copied from posix_acl.c + */ +static struct posix_acl *f2fs_acl_clone(const struct posix_acl *acl, + gfp_t flags) +{ + struct posix_acl *clone = NULL; + + if (acl) { + int size = sizeof(struct posix_acl) + acl->a_count * + sizeof(struct posix_acl_entry); + clone = kmemdup(acl, size, flags); + if (clone) + refcount_set(&clone->a_refcount, 1); + } + return clone; +} + +static int f2fs_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) +{ + struct posix_acl_entry *pa, *pe; + struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; + umode_t mode = *mode_p; + int not_equiv = 0; + + /* assert(atomic_read(acl->a_refcount) == 1); */ + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch(pa->e_tag) { + case ACL_USER_OBJ: + pa->e_perm &= (mode >> 6) | ~S_IRWXO; + mode &= (pa->e_perm << 6) | ~S_IRWXU; + break; + + case ACL_USER: + case ACL_GROUP: + not_equiv = 1; + break; + + case ACL_GROUP_OBJ: + group_obj = pa; + break; + + case ACL_OTHER: + pa->e_perm &= mode | ~S_IRWXO; + mode &= pa->e_perm | ~S_IRWXO; + break; + + case ACL_MASK: + mask_obj = pa; + not_equiv = 1; + break; + + default: + return -EIO; + } + } + + if (mask_obj) { + mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO; + mode &= (mask_obj->e_perm << 3) | ~S_IRWXG; + } else { + if (!group_obj) + return -EIO; + group_obj->e_perm &= (mode >> 3) | ~S_IRWXO; + mode &= (group_obj->e_perm << 3) | ~S_IRWXG; + } + + *mode_p = (*mode_p & ~S_IRWXUGO) | mode; + return not_equiv; +} + +static int f2fs_acl_create(struct inode *dir, umode_t *mode, + struct posix_acl **default_acl, struct posix_acl **acl, + struct page *dpage) +{ + struct posix_acl *p; + struct posix_acl *clone; + int ret; + + *acl = NULL; + *default_acl = NULL; + + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) + return 0; + + p = __f2fs_get_acl(dir, ACL_TYPE_DEFAULT, dpage); + if (!p || p == ERR_PTR(-EOPNOTSUPP)) { + *mode &= ~current_umask(); + return 0; + } + if (IS_ERR(p)) + return PTR_ERR(p); + + clone = f2fs_acl_clone(p, GFP_NOFS); + if (!clone) { + ret = -ENOMEM; + goto release_acl; + } + + ret = f2fs_acl_create_masq(clone, mode); + if (ret < 0) + goto release_clone; + + if (ret == 0) + posix_acl_release(clone); + else + *acl = clone; + + if (!S_ISDIR(*mode)) + posix_acl_release(p); + else + *default_acl = p; + + return 0; + +release_clone: + posix_acl_release(clone); +release_acl: + posix_acl_release(p); + return ret; +} + +int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, + struct page *dpage) +{ + struct posix_acl *default_acl = NULL, *acl = NULL; + int error = 0; + + error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dpage); + if (error) + return error; + + f2fs_mark_inode_dirty_sync(inode, true); + + if (default_acl) { + error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, + ipage); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, + ipage); + posix_acl_release(acl); + } + + return error; +} diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h new file mode 100644 index 000000000..2c685185c --- /dev/null +++ b/fs/f2fs/acl.h @@ -0,0 +1,53 @@ +/* + * fs/f2fs/acl.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext2/acl.h + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __F2FS_ACL_H__ +#define __F2FS_ACL_H__ + +#include <linux/posix_acl_xattr.h> + +#define F2FS_ACL_VERSION 0x0001 + +struct f2fs_acl_entry { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +}; + +struct f2fs_acl_entry_short { + __le16 e_tag; + __le16 e_perm; +}; + +struct f2fs_acl_header { + __le32 a_version; +}; + +#ifdef CONFIG_F2FS_FS_POSIX_ACL + +extern struct posix_acl *f2fs_get_acl(struct inode *, int); +extern int f2fs_set_acl(struct inode *, struct posix_acl *, int); +extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, + struct page *); +#else +#define f2fs_get_acl NULL +#define f2fs_set_acl NULL + +static inline int f2fs_init_acl(struct inode *inode, struct inode *dir, + struct page *ipage, struct page *dpage) +{ + return 0; +} +#endif +#endif /* __F2FS_ACL_H__ */ diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c new file mode 100644 index 000000000..a563de5cc --- /dev/null +++ b/fs/f2fs/checkpoint.c @@ -0,0 +1,1576 @@ +/* + * fs/f2fs/checkpoint.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/bio.h> +#include <linux/mpage.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/f2fs_fs.h> +#include <linux/pagevec.h> +#include <linux/swap.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "trace.h" +#include <trace/events/f2fs.h> + +static struct kmem_cache *ino_entry_slab; +struct kmem_cache *f2fs_inode_entry_slab; + +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) +{ + f2fs_build_fault_attr(sbi, 0, 0); + set_ckpt_flags(sbi, CP_ERROR_FLAG); + if (!end_io) + f2fs_flush_merged_writes(sbi); +} + +/* + * We guarantee no failure on the returned page. + */ +struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ + struct address_space *mapping = META_MAPPING(sbi); + struct page *page = NULL; +repeat: + page = f2fs_grab_cache_page(mapping, index, false); + if (!page) { + cond_resched(); + goto repeat; + } + f2fs_wait_on_page_writeback(page, META, true); + if (!PageUptodate(page)) + SetPageUptodate(page); + return page; +} + +/* + * We guarantee no failure on the returned page. + */ +static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, + bool is_meta) +{ + struct address_space *mapping = META_MAPPING(sbi); + struct page *page; + struct f2fs_io_info fio = { + .sbi = sbi, + .type = META, + .op = REQ_OP_READ, + .op_flags = REQ_META | REQ_PRIO, + .old_blkaddr = index, + .new_blkaddr = index, + .encrypted_page = NULL, + .is_meta = is_meta, + }; + int err; + + if (unlikely(!is_meta)) + fio.op_flags &= ~REQ_META; +repeat: + page = f2fs_grab_cache_page(mapping, index, false); + if (!page) { + cond_resched(); + goto repeat; + } + if (PageUptodate(page)) + goto out; + + fio.page = page; + + err = f2fs_submit_page_bio(&fio); + if (err) { + f2fs_put_page(page, 1); + return ERR_PTR(err); + } + + lock_page(page); + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } +out: + return page; +} + +struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ + return __get_meta_page(sbi, index, true); +} + +struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index) +{ + struct page *page; + int count = 0; + +retry: + page = __get_meta_page(sbi, index, true); + if (IS_ERR(page)) { + if (PTR_ERR(page) == -EIO && + ++count <= DEFAULT_RETRY_IO_COUNT) + goto retry; + + f2fs_stop_checkpoint(sbi, false); + f2fs_bug_on(sbi, 1); + } + + return page; +} + +/* for POR only */ +struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ + return __get_meta_page(sbi, index, false); +} + +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + switch (type) { + case META_NAT: + break; + case META_SIT: + if (unlikely(blkaddr >= SIT_BLK_CNT(sbi))) + return false; + break; + case META_SSA: + if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) || + blkaddr < SM_I(sbi)->ssa_blkaddr)) + return false; + break; + case META_CP: + if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr || + blkaddr < __start_cp_addr(sbi))) + return false; + break; + case META_POR: + case DATA_GENERIC: + if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || + blkaddr < MAIN_BLKADDR(sbi))) { + if (type == DATA_GENERIC) { + f2fs_msg(sbi->sb, KERN_WARNING, + "access invalid blkaddr:%u", blkaddr); + WARN_ON(1); + } + return false; + } + break; + case META_GENERIC: + if (unlikely(blkaddr < SEG0_BLKADDR(sbi) || + blkaddr >= MAIN_BLKADDR(sbi))) + return false; + break; + default: + BUG(); + } + + return true; +} + +/* + * Readahead CP/NAT/SIT/SSA pages + */ +int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync) +{ + struct page *page; + block_t blkno = start; + struct f2fs_io_info fio = { + .sbi = sbi, + .type = META, + .op = REQ_OP_READ, + .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD, + .encrypted_page = NULL, + .in_list = false, + .is_meta = (type != META_POR), + }; + struct blk_plug plug; + + if (unlikely(type == META_POR)) + fio.op_flags &= ~REQ_META; + + blk_start_plug(&plug); + for (; nrpages-- > 0; blkno++) { + + if (!f2fs_is_valid_blkaddr(sbi, blkno, type)) + goto out; + + switch (type) { + case META_NAT: + if (unlikely(blkno >= + NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid))) + blkno = 0; + /* get nat block addr */ + fio.new_blkaddr = current_nat_addr(sbi, + blkno * NAT_ENTRY_PER_BLOCK); + break; + case META_SIT: + if (unlikely(blkno >= TOTAL_SEGS(sbi))) + goto out; + /* get sit block addr */ + fio.new_blkaddr = current_sit_addr(sbi, + blkno * SIT_ENTRY_PER_BLOCK); + break; + case META_SSA: + case META_CP: + case META_POR: + fio.new_blkaddr = blkno; + break; + default: + BUG(); + } + + page = f2fs_grab_cache_page(META_MAPPING(sbi), + fio.new_blkaddr, false); + if (!page) + continue; + if (PageUptodate(page)) { + f2fs_put_page(page, 1); + continue; + } + + fio.page = page; + f2fs_submit_page_bio(&fio); + f2fs_put_page(page, 0); + } +out: + blk_finish_plug(&plug); + return blkno - start; +} + +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) +{ + struct page *page; + bool readahead = false; + + page = find_get_page(META_MAPPING(sbi), index); + if (!page || !PageUptodate(page)) + readahead = true; + f2fs_put_page(page, 0); + + if (readahead) + f2fs_ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); +} + +static int __f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + + trace_f2fs_writepage(page, META); + + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) + goto redirty_out; + + f2fs_do_write_meta_page(sbi, page, io_type); + dec_page_count(sbi, F2FS_DIRTY_META); + + if (wbc->for_reclaim) + f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, META); + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) + f2fs_submit_merged_write(sbi, META); + + return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; +} + +static int f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc) +{ + return __f2fs_write_meta_page(page, wbc, FS_META_IO); +} + +static int f2fs_write_meta_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + long diff, written; + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + + /* collect a number of dirty meta pages and write together */ + if (wbc->for_kupdate || + get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) + goto skip_write; + + /* if locked failed, cp will flush dirty pages instead */ + if (!mutex_trylock(&sbi->cp_mutex)) + goto skip_write; + + trace_f2fs_writepages(mapping->host, wbc, META); + diff = nr_pages_to_write(sbi, META, wbc); + written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); + mutex_unlock(&sbi->cp_mutex); + wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); + return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); + trace_f2fs_writepages(mapping->host, wbc, META); + return 0; +} + +long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, + long nr_to_write, enum iostat_type io_type) +{ + struct address_space *mapping = META_MAPPING(sbi); + pgoff_t index = 0, prev = ULONG_MAX; + struct pagevec pvec; + long nwritten = 0; + int nr_pages; + struct writeback_control wbc = { + .for_reclaim = 0, + }; + struct blk_plug plug; + + pagevec_init(&pvec); + + blk_start_plug(&plug); + + while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY))) { + int i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (prev == ULONG_MAX) + prev = page->index - 1; + if (nr_to_write != LONG_MAX && page->index != prev + 1) { + pagevec_release(&pvec); + goto stop; + } + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + f2fs_wait_on_page_writeback(page, META, true); + + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + if (__f2fs_write_meta_page(page, &wbc, io_type)) { + unlock_page(page); + break; + } + nwritten++; + prev = page->index; + if (unlikely(nwritten >= nr_to_write)) + break; + } + pagevec_release(&pvec); + cond_resched(); + } +stop: + if (nwritten) + f2fs_submit_merged_write(sbi, type); + + blk_finish_plug(&plug); + + return nwritten; +} + +static int f2fs_set_meta_page_dirty(struct page *page) +{ + trace_f2fs_set_page_dirty(page, META); + + if (!PageUptodate(page)) + SetPageUptodate(page); + if (!PageDirty(page)) { + __set_page_dirty_nobuffers(page); + inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); + SetPagePrivate(page); + f2fs_trace_pid(page); + return 1; + } + return 0; +} + +const struct address_space_operations f2fs_meta_aops = { + .writepage = f2fs_write_meta_page, + .writepages = f2fs_write_meta_pages, + .set_page_dirty = f2fs_set_meta_page_dirty, + .invalidatepage = f2fs_invalidate_page, + .releasepage = f2fs_release_page, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif +}; + +static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + struct inode_management *im = &sbi->im[type]; + struct ino_entry *e, *tmp; + + tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS); + + radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + if (!e) { + e = tmp; + if (unlikely(radix_tree_insert(&im->ino_root, ino, e))) + f2fs_bug_on(sbi, 1); + + memset(e, 0, sizeof(struct ino_entry)); + e->ino = ino; + + list_add_tail(&e->list, &im->ino_list); + if (type != ORPHAN_INO) + im->ino_num++; + } + + if (type == FLUSH_INO) + f2fs_set_bit(devidx, (char *)&e->dirty_device); + + spin_unlock(&im->ino_lock); + radix_tree_preload_end(); + + if (e != tmp) + kmem_cache_free(ino_entry_slab, tmp); +} + +static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + struct inode_management *im = &sbi->im[type]; + struct ino_entry *e; + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + if (e) { + list_del(&e->list); + radix_tree_delete(&im->ino_root, ino); + im->ino_num--; + spin_unlock(&im->ino_lock); + kmem_cache_free(ino_entry_slab, e); + return; + } + spin_unlock(&im->ino_lock); +} + +void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + /* add new dirty ino entry into list */ + __add_ino_entry(sbi, ino, 0, type); +} + +void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + /* remove dirty ino entry from list */ + __remove_ino_entry(sbi, ino, type); +} + +/* mode should be APPEND_INO or UPDATE_INO */ +bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) +{ + struct inode_management *im = &sbi->im[mode]; + struct ino_entry *e; + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + spin_unlock(&im->ino_lock); + return e ? true : false; +} + +void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all) +{ + struct ino_entry *e, *tmp; + int i; + + for (i = all ? ORPHAN_INO : APPEND_INO; i < MAX_INO_ENTRY; i++) { + struct inode_management *im = &sbi->im[i]; + + spin_lock(&im->ino_lock); + list_for_each_entry_safe(e, tmp, &im->ino_list, list) { + list_del(&e->list); + radix_tree_delete(&im->ino_root, e->ino); + kmem_cache_free(ino_entry_slab, e); + im->ino_num--; + } + spin_unlock(&im->ino_lock); + } +} + +void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + __add_ino_entry(sbi, ino, devidx, type); +} + +bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + struct inode_management *im = &sbi->im[type]; + struct ino_entry *e; + bool is_dirty = false; + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + if (e && f2fs_test_bit(devidx, (char *)&e->dirty_device)) + is_dirty = true; + spin_unlock(&im->ino_lock); + return is_dirty; +} + +int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi) +{ + struct inode_management *im = &sbi->im[ORPHAN_INO]; + int err = 0; + + spin_lock(&im->ino_lock); + + if (time_to_inject(sbi, FAULT_ORPHAN)) { + spin_unlock(&im->ino_lock); + f2fs_show_injection_info(FAULT_ORPHAN); + return -ENOSPC; + } + + if (unlikely(im->ino_num >= sbi->max_orphans)) + err = -ENOSPC; + else + im->ino_num++; + spin_unlock(&im->ino_lock); + + return err; +} + +void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi) +{ + struct inode_management *im = &sbi->im[ORPHAN_INO]; + + spin_lock(&im->ino_lock); + f2fs_bug_on(sbi, im->ino_num == 0); + im->ino_num--; + spin_unlock(&im->ino_lock); +} + +void f2fs_add_orphan_inode(struct inode *inode) +{ + /* add new orphan ino entry into list */ + __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO); + f2fs_update_inode_page(inode); +} + +void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +{ + /* remove orphan entry from orphan list */ + __remove_ino_entry(sbi, ino, ORPHAN_INO); +} + +static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct inode *inode; + struct node_info ni; + int err; + + inode = f2fs_iget_retry(sbi->sb, ino); + if (IS_ERR(inode)) { + /* + * there should be a bug that we can't find the entry + * to orphan inode. + */ + f2fs_bug_on(sbi, PTR_ERR(inode) == -ENOENT); + return PTR_ERR(inode); + } + + err = dquot_initialize(inode); + if (err) { + iput(inode); + goto err_out; + } + + clear_nlink(inode); + + /* truncate all the data during iput */ + iput(inode); + + err = f2fs_get_node_info(sbi, ino, &ni); + if (err) + goto err_out; + + /* ENOMEM was fully retried in f2fs_evict_inode. */ + if (ni.blk_addr != NULL_ADDR) { + err = -EIO; + goto err_out; + } + return 0; + +err_out: + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return err; +} + +int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) +{ + block_t start_blk, orphan_blocks, i, j; + unsigned int s_flags = sbi->sb->s_flags; + int err = 0; +#ifdef CONFIG_QUOTA + int quota_enabled; +#endif + + if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) + return 0; + + if (s_flags & SB_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~SB_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= SB_ACTIVE; + + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); +#endif + + start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); + orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); + + f2fs_ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); + + for (i = 0; i < orphan_blocks; i++) { + struct page *page; + struct f2fs_orphan_block *orphan_blk; + + page = f2fs_get_meta_page(sbi, start_blk + i); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + + orphan_blk = (struct f2fs_orphan_block *)page_address(page); + for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { + nid_t ino = le32_to_cpu(orphan_blk->ino[j]); + err = recover_orphan_inode(sbi, ino); + if (err) { + f2fs_put_page(page, 1); + goto out; + } + } + f2fs_put_page(page, 1); + } + /* clear Orphan Flag */ + clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); +out: + set_sbi_flag(sbi, SBI_IS_RECOVERED); + +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ + + return err; +} + +static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) +{ + struct list_head *head; + struct f2fs_orphan_block *orphan_blk = NULL; + unsigned int nentries = 0; + unsigned short index = 1; + unsigned short orphan_blocks; + struct page *page = NULL; + struct ino_entry *orphan = NULL; + struct inode_management *im = &sbi->im[ORPHAN_INO]; + + orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num); + + /* + * we don't need to do spin_lock(&im->ino_lock) here, since all the + * orphan inode operations are covered under f2fs_lock_op(). + * And, spin_lock should be avoided due to page operations below. + */ + head = &im->ino_list; + + /* loop for each orphan inode entry and write them in Jornal block */ + list_for_each_entry(orphan, head, list) { + if (!page) { + page = f2fs_grab_meta_page(sbi, start_blk++); + orphan_blk = + (struct f2fs_orphan_block *)page_address(page); + memset(orphan_blk, 0, sizeof(*orphan_blk)); + } + + orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); + + if (nentries == F2FS_ORPHANS_PER_BLOCK) { + /* + * an orphan block is full of 1020 entries, + * then we need to flush current orphan blocks + * and bring another one in memory + */ + orphan_blk->blk_addr = cpu_to_le16(index); + orphan_blk->blk_count = cpu_to_le16(orphan_blocks); + orphan_blk->entry_count = cpu_to_le32(nentries); + set_page_dirty(page); + f2fs_put_page(page, 1); + index++; + nentries = 0; + page = NULL; + } + } + + if (page) { + orphan_blk->blk_addr = cpu_to_le16(index); + orphan_blk->blk_count = cpu_to_le16(orphan_blocks); + orphan_blk->entry_count = cpu_to_le32(nentries); + set_page_dirty(page); + f2fs_put_page(page, 1); + } +} + +static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, + struct f2fs_checkpoint **cp_block, struct page **cp_page, + unsigned long long *version) +{ + unsigned long blk_size = sbi->blocksize; + size_t crc_offset = 0; + __u32 crc = 0; + + *cp_page = f2fs_get_meta_page(sbi, cp_addr); + if (IS_ERR(*cp_page)) + return PTR_ERR(*cp_page); + + *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); + + crc_offset = le32_to_cpu((*cp_block)->checksum_offset); + if (crc_offset > (blk_size - sizeof(__le32))) { + f2fs_put_page(*cp_page, 1); + f2fs_msg(sbi->sb, KERN_WARNING, + "invalid crc_offset: %zu", crc_offset); + return -EINVAL; + } + + crc = cur_cp_crc(*cp_block); + if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) { + f2fs_put_page(*cp_page, 1); + f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value"); + return -EINVAL; + } + + *version = cur_cp_version(*cp_block); + return 0; +} + +static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, + block_t cp_addr, unsigned long long *version) +{ + struct page *cp_page_1 = NULL, *cp_page_2 = NULL; + struct f2fs_checkpoint *cp_block = NULL; + unsigned long long cur_version = 0, pre_version = 0; + int err; + + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_1, version); + if (err) + return NULL; + + if (le32_to_cpu(cp_block->cp_pack_total_block_count) > + sbi->blocks_per_seg) { + f2fs_msg(sbi->sb, KERN_WARNING, + "invalid cp_pack_total_block_count:%u", + le32_to_cpu(cp_block->cp_pack_total_block_count)); + goto invalid_cp; + } + pre_version = *version; + + cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_2, version); + if (err) + goto invalid_cp; + cur_version = *version; + + if (cur_version == pre_version) { + *version = cur_version; + f2fs_put_page(cp_page_2, 1); + return cp_page_1; + } + f2fs_put_page(cp_page_2, 1); +invalid_cp: + f2fs_put_page(cp_page_1, 1); + return NULL; +} + +int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *cp_block; + struct f2fs_super_block *fsb = sbi->raw_super; + struct page *cp1, *cp2, *cur_page; + unsigned long blk_size = sbi->blocksize; + unsigned long long cp1_version = 0, cp2_version = 0; + unsigned long long cp_start_blk_no; + unsigned int cp_blks = 1 + __cp_payload(sbi); + block_t cp_blk_no; + int i; + int err; + + sbi->ckpt = f2fs_kzalloc(sbi, array_size(blk_size, cp_blks), + GFP_KERNEL); + if (!sbi->ckpt) + return -ENOMEM; + /* + * Finding out valid cp block involves read both + * sets( cp pack1 and cp pack 2) + */ + cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr); + cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version); + + /* The second checkpoint pack should start at the next segment */ + cp_start_blk_no += ((unsigned long long)1) << + le32_to_cpu(fsb->log_blocks_per_seg); + cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version); + + if (cp1 && cp2) { + if (ver_after(cp2_version, cp1_version)) + cur_page = cp2; + else + cur_page = cp1; + } else if (cp1) { + cur_page = cp1; + } else if (cp2) { + cur_page = cp2; + } else { + err = -EFSCORRUPTED; + goto fail_no_cp; + } + + cp_block = (struct f2fs_checkpoint *)page_address(cur_page); + memcpy(sbi->ckpt, cp_block, blk_size); + + if (cur_page == cp1) + sbi->cur_cp_pack = 1; + else + sbi->cur_cp_pack = 2; + + /* Sanity checking of checkpoint */ + if (f2fs_sanity_check_ckpt(sbi)) { + err = -EFSCORRUPTED; + goto free_fail_no_cp; + } + + if (cp_blks <= 1) + goto done; + + cp_blk_no = le32_to_cpu(fsb->cp_blkaddr); + if (cur_page == cp2) + cp_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); + + for (i = 1; i < cp_blks; i++) { + void *sit_bitmap_ptr; + unsigned char *ckpt = (unsigned char *)sbi->ckpt; + + cur_page = f2fs_get_meta_page(sbi, cp_blk_no + i); + if (IS_ERR(cur_page)) { + err = PTR_ERR(cur_page); + goto free_fail_no_cp; + } + sit_bitmap_ptr = page_address(cur_page); + memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size); + f2fs_put_page(cur_page, 1); + } +done: + f2fs_put_page(cp1, 1); + f2fs_put_page(cp2, 1); + return 0; + +free_fail_no_cp: + f2fs_put_page(cp1, 1); + f2fs_put_page(cp2, 1); +fail_no_cp: + kfree(sbi->ckpt); + return err; +} + +static void __add_dirty_inode(struct inode *inode, enum inode_type type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; + + if (is_inode_flag_set(inode, flag)) + return; + + set_inode_flag(inode, flag); + if (!f2fs_is_volatile_file(inode)) + list_add_tail(&F2FS_I(inode)->dirty_list, + &sbi->inode_list[type]); + stat_inc_dirty_inode(sbi, type); +} + +static void __remove_dirty_inode(struct inode *inode, enum inode_type type) +{ + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; + + if (get_dirty_pages(inode) || !is_inode_flag_set(inode, flag)) + return; + + list_del_init(&F2FS_I(inode)->dirty_list); + clear_inode_flag(inode, flag); + stat_dec_dirty_inode(F2FS_I_SB(inode), type); +} + +void f2fs_update_dirty_page(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; + + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + return; + + spin_lock(&sbi->inode_lock[type]); + if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH)) + __add_dirty_inode(inode, type); + inode_inc_dirty_pages(inode); + spin_unlock(&sbi->inode_lock[type]); + + SetPagePrivate(page); + f2fs_trace_pid(page); +} + +void f2fs_remove_dirty_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; + + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + return; + + if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH)) + return; + + spin_lock(&sbi->inode_lock[type]); + __remove_dirty_inode(inode, type); + spin_unlock(&sbi->inode_lock[type]); +} + +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) +{ + struct list_head *head; + struct inode *inode; + struct f2fs_inode_info *fi; + bool is_dir = (type == DIR_INODE); + unsigned long ino = 0; + + trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); +retry: + if (unlikely(f2fs_cp_error(sbi))) { + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); + return -EIO; + } + + spin_lock(&sbi->inode_lock[type]); + + head = &sbi->inode_list[type]; + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[type]); + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); + return 0; + } + fi = list_first_entry(head, struct f2fs_inode_info, dirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[type]); + if (inode) { + unsigned long cur_ino = inode->i_ino; + + if (is_dir) + F2FS_I(inode)->cp_task = current; + + filemap_fdatawrite(inode->i_mapping); + + if (is_dir) + F2FS_I(inode)->cp_task = NULL; + + iput(inode); + /* We need to give cpu to another writers. */ + if (ino == cur_ino) + cond_resched(); + else + ino = cur_ino; + } else { + /* + * We should submit bio, since it exists several + * wribacking dentry pages in the freeing inode. + */ + f2fs_submit_merged_write(sbi, DATA); + cond_resched(); + } + goto retry; +} + +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &sbi->inode_list[DIRTY_META]; + struct inode *inode; + struct f2fs_inode_info *fi; + s64 total = get_pages(sbi, F2FS_DIRTY_IMETA); + + while (total--) { + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return 0; + } + fi = list_first_entry(head, struct f2fs_inode_info, + gdirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + if (inode) { + sync_inode_metadata(inode, 0); + + /* it's on eviction */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) + f2fs_update_inode_page(inode); + iput(inode); + } + } + return 0; +} + +static void __prepare_cp_block(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + nid_t last_nid = nm_i->next_scan_nid; + + next_free_nid(sbi, &last_nid); + ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); + ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); + ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); + ckpt->next_free_nid = cpu_to_le32(last_nid); +} + +/* + * Freeze all the FS-operations for checkpoint. + */ +static int block_operations(struct f2fs_sb_info *sbi) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .for_reclaim = 0, + }; + struct blk_plug plug; + int err = 0; + + blk_start_plug(&plug); + +retry_flush_dents: + f2fs_lock_all(sbi); + /* write all the dirty dentry pages */ + if (get_pages(sbi, F2FS_DIRTY_DENTS)) { + f2fs_unlock_all(sbi); + err = f2fs_sync_dirty_inodes(sbi, DIR_INODE); + if (err) + goto out; + cond_resched(); + goto retry_flush_dents; + } + + /* + * POR: we should ensure that there are no dirty node pages + * until finishing nat/sit flush. inode->i_blocks can be updated. + */ + down_write(&sbi->node_change); + + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + up_write(&sbi->node_change); + f2fs_unlock_all(sbi); + err = f2fs_sync_inode_meta(sbi); + if (err) + goto out; + cond_resched(); + goto retry_flush_dents; + } + +retry_flush_nodes: + down_write(&sbi->node_write); + + if (get_pages(sbi, F2FS_DIRTY_NODES)) { + up_write(&sbi->node_write); + atomic_inc(&sbi->wb_sync_req[NODE]); + err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); + atomic_dec(&sbi->wb_sync_req[NODE]); + if (err) { + up_write(&sbi->node_change); + f2fs_unlock_all(sbi); + goto out; + } + cond_resched(); + goto retry_flush_nodes; + } + + /* + * sbi->node_change is used only for AIO write_begin path which produces + * dirty node blocks and some checkpoint values by block allocation. + */ + __prepare_cp_block(sbi); + up_write(&sbi->node_change); +out: + blk_finish_plug(&plug); + return err; +} + +static void unblock_operations(struct f2fs_sb_info *sbi) +{ + up_write(&sbi->node_write); + f2fs_unlock_all(sbi); +} + +void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) +{ + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); + + if (!get_pages(sbi, F2FS_WB_CP_DATA)) + break; + + if (unlikely(f2fs_cp_error(sbi))) + break; + + io_schedule_timeout(5*HZ); + } + finish_wait(&sbi->cp_wait, &wait); +} + +static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); + + if ((cpc->reason & CP_UMOUNT) && + le32_to_cpu(ckpt->cp_pack_total_block_count) > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) + disable_nat_bits(sbi, false); + + if (cpc->reason & CP_TRIMMED) + __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + else + __clear_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + + if (cpc->reason & CP_UMOUNT) + __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + + if (cpc->reason & CP_FASTBOOT) + __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + + if (orphan_num) + __set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + __set_ckpt_flags(ckpt, CP_FSCK_FLAG); + + /* set this flag to activate crc|cp_ver for recovery */ + __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); + __clear_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG); + + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +static void commit_checkpoint(struct f2fs_sb_info *sbi, + void *src, block_t blk_addr) +{ + struct writeback_control wbc = { + .for_reclaim = 0, + }; + + /* + * pagevec_lookup_tag and lock_page again will take + * some extra time. Therefore, f2fs_update_meta_pages and + * f2fs_sync_meta_pages are combined in this function. + */ + struct page *page = f2fs_grab_meta_page(sbi, blk_addr); + int err; + + memcpy(page_address(page), src, PAGE_SIZE); + set_page_dirty(page); + + f2fs_wait_on_page_writeback(page, META, true); + f2fs_bug_on(sbi, PageWriteback(page)); + if (unlikely(!clear_page_dirty_for_io(page))) + f2fs_bug_on(sbi, 1); + + /* writeout cp pack 2 page */ + err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO); + if (unlikely(err && f2fs_cp_error(sbi))) { + f2fs_put_page(page, 1); + return; + } + + f2fs_bug_on(sbi, err); + f2fs_put_page(page, 0); + + /* submit checkpoint (with barrier if NOBARRIER is not set) */ + f2fs_submit_merged_write(sbi, META_FLUSH); +} + +static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num, flags; + block_t start_blk; + unsigned int data_sum_blocks, orphan_blocks; + __u32 crc32 = 0; + int i; + int cp_payload_blks = __cp_payload(sbi); + struct super_block *sb = sbi->sb; + struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); + u64 kbytes_written; + int err; + + /* Flush all the NAT/SIT pages */ + while (get_pages(sbi, F2FS_DIRTY_META)) { + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + if (unlikely(f2fs_cp_error(sbi))) + break; + } + + /* + * modify checkpoint + * version number is already updated + */ + ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true)); + ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + ckpt->cur_node_segno[i] = + cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE)); + ckpt->cur_node_blkoff[i] = + cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE)); + ckpt->alloc_type[i + CURSEG_HOT_NODE] = + curseg_alloc_type(sbi, i + CURSEG_HOT_NODE); + } + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + ckpt->cur_data_segno[i] = + cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA)); + ckpt->cur_data_blkoff[i] = + cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA)); + ckpt->alloc_type[i + CURSEG_HOT_DATA] = + curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); + } + + /* 2 cp + n data seg summary + orphan inode blocks */ + data_sum_blocks = f2fs_npages_for_summary_flush(sbi, false); + spin_lock_irqsave(&sbi->cp_lock, flags); + if (data_sum_blocks < NR_CURSEG_DATA_TYPE) + __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + else + __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + spin_unlock_irqrestore(&sbi->cp_lock, flags); + + orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); + ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + + orphan_blocks); + + if (__remain_node_summaries(cpc->reason)) + ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+ + cp_payload_blks + data_sum_blocks + + orphan_blocks + NR_CURSEG_NODE_TYPE); + else + ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + + cp_payload_blks + data_sum_blocks + + orphan_blocks); + + /* update ckpt flag for checkpoint */ + update_ckpt_flags(sbi, cpc); + + /* update SIT/NAT bitmap */ + get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); + get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); + + crc32 = f2fs_crc32(sbi, ckpt, le32_to_cpu(ckpt->checksum_offset)); + *((__le32 *)((unsigned char *)ckpt + + le32_to_cpu(ckpt->checksum_offset))) + = cpu_to_le32(crc32); + + start_blk = __start_cp_next_addr(sbi); + + /* write nat bits */ + if (enabled_nat_bits(sbi, cpc)) { + __u64 cp_ver = cur_cp_version(ckpt); + block_t blk; + + cp_ver |= ((__u64)crc32 << 32); + *(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver); + + blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) + f2fs_update_meta_page(sbi, nm_i->nat_bits + + (i << F2FS_BLKSIZE_BITS), blk + i); + + /* Flush all the NAT BITS pages */ + while (get_pages(sbi, F2FS_DIRTY_META)) { + f2fs_sync_meta_pages(sbi, META, LONG_MAX, + FS_CP_META_IO); + if (unlikely(f2fs_cp_error(sbi))) + break; + } + } + + /* write out checkpoint buffer at block 0 */ + f2fs_update_meta_page(sbi, ckpt, start_blk++); + + for (i = 1; i < 1 + cp_payload_blks; i++) + f2fs_update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE, + start_blk++); + + if (orphan_num) { + write_orphan_inodes(sbi, start_blk); + start_blk += orphan_blocks; + } + + f2fs_write_data_summaries(sbi, start_blk); + start_blk += data_sum_blocks; + + /* Record write statistics in the hot node summary */ + kbytes_written = sbi->kbytes_written; + if (sb->s_bdev->bd_part) + kbytes_written += BD_PART_WRITTEN(sbi); + + seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written); + + if (__remain_node_summaries(cpc->reason)) { + f2fs_write_node_summaries(sbi, start_blk); + start_blk += NR_CURSEG_NODE_TYPE; + } + + /* update user_block_counts */ + sbi->last_valid_block_count = sbi->total_valid_block_count; + percpu_counter_set(&sbi->alloc_valid_block_count, 0); + + /* Here, we have one bio having CP pack except cp pack 2 page */ + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + + /* wait for previous submitted meta pages writeback */ + f2fs_wait_on_all_pages_writeback(sbi); + + /* flush all device cache */ + err = f2fs_flush_device_cache(sbi); + if (err) + return err; + + /* barrier and flush checkpoint cp pack 2 page if it can */ + commit_checkpoint(sbi, ckpt, start_blk); + f2fs_wait_on_all_pages_writeback(sbi); + + /* + * invalidate intermediate page cache borrowed from meta inode + * which are used for migration of encrypted inode's blocks. + */ + if (f2fs_sb_has_encrypt(sbi->sb)) + invalidate_mapping_pages(META_MAPPING(sbi), + MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1); + + f2fs_release_ino_entry(sbi, false); + + f2fs_reset_fsync_node_info(sbi); + + clear_sbi_flag(sbi, SBI_IS_DIRTY); + clear_sbi_flag(sbi, SBI_NEED_CP); + __set_cp_next_pack(sbi); + + /* + * redirty superblock if metadata like node page or inode cache is + * updated during writing checkpoint. + */ + if (get_pages(sbi, F2FS_DIRTY_NODES) || + get_pages(sbi, F2FS_DIRTY_IMETA)) + set_sbi_flag(sbi, SBI_IS_DIRTY); + + f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS)); + + return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0; +} + +/* + * We guarantee that this checkpoint procedure will not fail. + */ +int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned long long ckpt_ver; + int err = 0; + + mutex_lock(&sbi->cp_mutex); + + if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && + ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || + ((cpc->reason & CP_DISCARD) && !sbi->discard_blks))) + goto out; + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto out; + } + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; + goto out; + } + + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); + + err = block_operations(sbi); + if (err) + goto out; + + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); + + f2fs_flush_merged_writes(sbi); + + /* this is the case of multiple fstrims without any changes */ + if (cpc->reason & CP_DISCARD) { + if (!f2fs_exist_trim_candidates(sbi, cpc)) { + unblock_operations(sbi); + goto out; + } + + if (NM_I(sbi)->dirty_nat_cnt == 0 && + SIT_I(sbi)->dirty_sentries == 0 && + prefree_segments(sbi) == 0) { + f2fs_flush_sit_entries(sbi, cpc); + f2fs_clear_prefree_segments(sbi, cpc); + unblock_operations(sbi); + goto out; + } + } + + /* + * update checkpoint pack index + * Increase the version number so that + * SIT entries and seg summaries are written at correct place + */ + ckpt_ver = cur_cp_version(ckpt); + ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); + + /* write cached NAT/SIT entries to NAT/SIT area */ + f2fs_flush_nat_entries(sbi, cpc); + f2fs_flush_sit_entries(sbi, cpc); + + /* unlock all the fs_lock[] in do_checkpoint() */ + err = do_checkpoint(sbi, cpc); + if (err) + f2fs_release_discard_addrs(sbi); + else + f2fs_clear_prefree_segments(sbi, cpc); + + unblock_operations(sbi); + stat_inc_cp_count(sbi->stat_info); + + if (cpc->reason & CP_RECOVERY) + f2fs_msg(sbi->sb, KERN_NOTICE, + "checkpoint: version = %llx", ckpt_ver); + + /* do checkpoint periodically */ + f2fs_update_time(sbi, CP_TIME); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); +out: + mutex_unlock(&sbi->cp_mutex); + return err; +} + +void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < MAX_INO_ENTRY; i++) { + struct inode_management *im = &sbi->im[i]; + + INIT_RADIX_TREE(&im->ino_root, GFP_ATOMIC); + spin_lock_init(&im->ino_lock); + INIT_LIST_HEAD(&im->ino_list); + im->ino_num = 0; + } + + sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS - + NR_CURSEG_TYPE - __cp_payload(sbi)) * + F2FS_ORPHANS_PER_BLOCK; +} + +int __init f2fs_create_checkpoint_caches(void) +{ + ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry", + sizeof(struct ino_entry)); + if (!ino_entry_slab) + return -ENOMEM; + f2fs_inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry", + sizeof(struct inode_entry)); + if (!f2fs_inode_entry_slab) { + kmem_cache_destroy(ino_entry_slab); + return -ENOMEM; + } + return 0; +} + +void f2fs_destroy_checkpoint_caches(void) +{ + kmem_cache_destroy(ino_entry_slab); + kmem_cache_destroy(f2fs_inode_entry_slab); +} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c new file mode 100644 index 000000000..c63f5e326 --- /dev/null +++ b/fs/f2fs/data.c @@ -0,0 +1,2761 @@ +/* + * fs/f2fs/data.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> +#include <linux/pagevec.h> +#include <linux/blkdev.h> +#include <linux/bio.h> +#include <linux/prefetch.h> +#include <linux/uio.h> +#include <linux/cleancache.h> +#include <linux/sched/signal.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "trace.h" +#include <trace/events/f2fs.h> + +#define NUM_PREALLOC_POST_READ_CTXS 128 + +static struct kmem_cache *bio_post_read_ctx_cache; +static mempool_t *bio_post_read_ctx_pool; + +static bool __is_cp_guaranteed(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct f2fs_sb_info *sbi; + + if (!mapping) + return false; + + inode = mapping->host; + sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_NODE_INO(sbi) || + S_ISDIR(inode->i_mode) || + (S_ISREG(inode->i_mode) && + is_inode_flag_set(inode, FI_ATOMIC_FILE)) || + is_cold_data(page)) + return true; + return false; +} + +/* postprocessing steps for read bios */ +enum bio_post_read_step { + STEP_INITIAL = 0, + STEP_DECRYPT, +}; + +struct bio_post_read_ctx { + struct bio *bio; + struct work_struct work; + unsigned int cur_step; + unsigned int enabled_steps; +}; + +static void __read_end_io(struct bio *bio) +{ + struct page *page; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + page = bv->bv_page; + + /* PG_error was set if any post_read step failed */ + if (bio->bi_status || PageError(page)) { + ClearPageUptodate(page); + /* will re-read again later */ + ClearPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx); + +static void decrypt_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + + fscrypt_decrypt_bio(ctx->bio); + + bio_post_read_processing(ctx); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx) +{ + switch (++ctx->cur_step) { + case STEP_DECRYPT: + if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { + INIT_WORK(&ctx->work, decrypt_work); + fscrypt_enqueue_decrypt_work(&ctx->work); + return; + } + ctx->cur_step++; + /* fall-through */ + default: + __read_end_io(ctx->bio); + } +} + +static bool f2fs_bio_post_read_required(struct bio *bio) +{ + return bio->bi_private && !bio->bi_status; +} + +static void f2fs_read_end_io(struct bio *bio) +{ + if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), FAULT_IO)) { + f2fs_show_injection_info(FAULT_IO); + bio->bi_status = BLK_STS_IOERR; + } + + if (f2fs_bio_post_read_required(bio)) { + struct bio_post_read_ctx *ctx = bio->bi_private; + + ctx->cur_step = STEP_INITIAL; + bio_post_read_processing(ctx); + return; + } + + __read_end_io(bio); +} + +static void f2fs_write_end_io(struct bio *bio) +{ + struct f2fs_sb_info *sbi = bio->bi_private; + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + enum count_type type = WB_DATA_TYPE(page); + + if (IS_DUMMY_WRITTEN_PAGE(page)) { + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + unlock_page(page); + mempool_free(page, sbi->write_io_dummy); + + if (unlikely(bio->bi_status)) + f2fs_stop_checkpoint(sbi, true); + continue; + } + + fscrypt_pullback_bio_page(&page, true); + + if (unlikely(bio->bi_status)) { + mapping_set_error(page->mapping, -EIO); + if (type == F2FS_WB_CP_DATA) + f2fs_stop_checkpoint(sbi, true); + } + + f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && + page->index != nid_of_node(page)); + + dec_page_count(sbi, type); + if (f2fs_in_warm_node_list(sbi, page)) + f2fs_del_fsync_node_entry(sbi, page); + clear_cold_data(page); + end_page_writeback(page); + } + if (!get_pages(sbi, F2FS_WB_CP_DATA) && + wq_has_sleeper(&sbi->cp_wait)) + wake_up(&sbi->cp_wait); + + bio_put(bio); +} + +/* + * Return true, if pre_bio's bdev is same as its target device. + */ +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + struct block_device *bdev = sbi->sb->s_bdev; + int i; + + if (f2fs_is_multi_device(sbi)) { + for (i = 0; i < sbi->s_ndevs; i++) { + if (FDEV(i).start_blk <= blk_addr && + FDEV(i).end_blk >= blk_addr) { + blk_addr -= FDEV(i).start_blk; + bdev = FDEV(i).bdev; + break; + } + } + } + if (bio) { + bio_set_dev(bio, bdev); + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + } + return bdev; +} + +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + int i; + + if (!f2fs_is_multi_device(sbi)) + return 0; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) + return i; + return 0; +} + +static bool __same_bdev(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + struct block_device *b = f2fs_target_device(sbi, blk_addr, NULL); + return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno; +} + +/* + * Low-level block read/write IO operations. + */ +static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, + struct writeback_control *wbc, + int npages, bool is_read, + enum page_type type, enum temp_type temp) +{ + struct bio *bio; + + bio = f2fs_bio_alloc(sbi, npages, true); + + f2fs_target_device(sbi, blk_addr, bio); + if (is_read) { + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = NULL; + } else { + bio->bi_end_io = f2fs_write_end_io; + bio->bi_private = sbi; + bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, type, temp); + } + if (wbc) + wbc_init_bio(wbc, bio); + + return bio; +} + +static inline void __submit_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type) +{ + if (!is_read_io(bio_op(bio))) { + unsigned int start; + + if (type != DATA && type != NODE) + goto submit_io; + + if (test_opt(sbi, LFS) && current->plug) + blk_finish_plug(current->plug); + + start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; + start %= F2FS_IO_SIZE(sbi); + + if (start == 0) + goto submit_io; + + /* fill dummy pages */ + for (; start < F2FS_IO_SIZE(sbi); start++) { + struct page *page = + mempool_alloc(sbi->write_io_dummy, + GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL); + f2fs_bug_on(sbi, !page); + + SetPagePrivate(page); + set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); + lock_page(page); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + f2fs_bug_on(sbi, 1); + } + /* + * In the NODE case, we lose next block address chain. So, we + * need to do checkpoint in f2fs_sync_file. + */ + if (type == NODE) + set_sbi_flag(sbi, SBI_NEED_CP); + } +submit_io: + if (is_read_io(bio_op(bio))) + trace_f2fs_submit_read_bio(sbi->sb, type, bio); + else + trace_f2fs_submit_write_bio(sbi->sb, type, bio); + submit_bio(bio); +} + +static void __submit_merged_bio(struct f2fs_bio_info *io) +{ + struct f2fs_io_info *fio = &io->fio; + + if (!io->bio) + return; + + bio_set_op_attrs(io->bio, fio->op, fio->op_flags); + + if (is_read_io(fio->op)) + trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); + else + trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio); + + __submit_bio(io->sbi, io->bio, fio->type); + io->bio = NULL; +} + +static bool __has_merged_page(struct f2fs_bio_info *io, + struct inode *inode, nid_t ino, pgoff_t idx) +{ + struct bio_vec *bvec; + struct page *target; + int i; + + if (!io->bio) + return false; + + if (!inode && !ino) + return true; + + bio_for_each_segment_all(bvec, io->bio, i) { + + if (bvec->bv_page->mapping) + target = bvec->bv_page; + else + target = fscrypt_control_page(bvec->bv_page); + + if (idx != target->index) + continue; + + if (inode && inode == target->mapping->host) + return true; + if (ino && ino == ino_of_node(target)) + return true; + } + + return false; +} + +static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, + nid_t ino, pgoff_t idx, enum page_type type) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + enum temp_type temp; + struct f2fs_bio_info *io; + bool ret = false; + + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + io = sbi->write_io[btype] + temp; + + down_read(&io->io_rwsem); + ret = __has_merged_page(io, inode, ino, idx); + up_read(&io->io_rwsem); + + /* TODO: use HOT temp only for meta pages now. */ + if (ret || btype == META) + break; + } + return ret; +} + +static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = sbi->write_io[btype] + temp; + + down_write(&io->io_rwsem); + + /* change META to META_FLUSH in the checkpoint procedure */ + if (type >= META_FLUSH) { + io->fio.type = META_FLUSH; + io->fio.op = REQ_OP_WRITE; + io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC; + if (!test_opt(sbi, NOBARRIER)) + io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA; + } + __submit_merged_bio(io); + up_write(&io->io_rwsem); +} + +static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, bool force) +{ + enum temp_type temp; + + if (!force && !has_merged_page(sbi, inode, ino, idx, type)) + return; + + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + + __f2fs_submit_merged_write(sbi, type, temp); + + /* TODO: use HOT temp only for meta pages now. */ + if (type >= META) + break; + } +} + +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) +{ + __submit_merged_write_cond(sbi, NULL, 0, 0, type, true); +} + +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type) +{ + __submit_merged_write_cond(sbi, inode, ino, idx, type, false); +} + +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) +{ + f2fs_submit_merged_write(sbi, DATA); + f2fs_submit_merged_write(sbi, NODE); + f2fs_submit_merged_write(sbi, META); +} + +/* + * Fill the locked page with data located in the block address. + * A caller needs to unlock the page on failure. + */ +int f2fs_submit_page_bio(struct f2fs_io_info *fio) +{ + struct bio *bio; + struct page *page = fio->encrypted_page ? + fio->encrypted_page : fio->page; + + if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, + __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) + return -EFSCORRUPTED; + + trace_f2fs_submit_page_bio(page, fio); + f2fs_trace_ios(fio, 0); + + /* Allocate a new bio */ + bio = __bio_alloc(fio->sbi, fio->new_blkaddr, fio->io_wbc, + 1, is_read_io(fio->op), fio->type, fio->temp); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + return -EFAULT; + } + + if (fio->io_wbc && !is_read_io(fio->op)) + wbc_account_io(fio->io_wbc, page, PAGE_SIZE); + + bio_set_op_attrs(bio, fio->op, fio->op_flags); + + if (!is_read_io(fio->op)) + inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page)); + + __submit_bio(fio->sbi, bio, fio->type); + return 0; +} + +void f2fs_submit_page_write(struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = fio->sbi; + enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); + struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; + struct page *bio_page; + + f2fs_bug_on(sbi, is_read_io(fio->op)); + + down_write(&io->io_rwsem); +next: + if (fio->in_list) { + spin_lock(&io->io_lock); + if (list_empty(&io->io_list)) { + spin_unlock(&io->io_lock); + goto out; + } + fio = list_first_entry(&io->io_list, + struct f2fs_io_info, list); + list_del(&fio->list); + spin_unlock(&io->io_lock); + } + + if (__is_valid_data_blkaddr(fio->old_blkaddr)) + verify_block_addr(fio, fio->old_blkaddr); + verify_block_addr(fio, fio->new_blkaddr); + + bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; + + /* set submitted = true as a return value */ + fio->submitted = true; + + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + + if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || + (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || + !__same_bdev(sbi, fio->new_blkaddr, io->bio))) + __submit_merged_bio(io); +alloc_new: + if (io->bio == NULL) { + if ((fio->type == DATA || fio->type == NODE) && + fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { + dec_page_count(sbi, WB_DATA_TYPE(bio_page)); + fio->retry = true; + goto skip; + } + io->bio = __bio_alloc(sbi, fio->new_blkaddr, fio->io_wbc, + BIO_MAX_PAGES, false, + fio->type, fio->temp); + io->fio = *fio; + } + + if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { + __submit_merged_bio(io); + goto alloc_new; + } + + if (fio->io_wbc) + wbc_account_io(fio->io_wbc, bio_page, PAGE_SIZE); + + io->last_block_in_bio = fio->new_blkaddr; + f2fs_trace_ios(fio, 0); + + trace_f2fs_submit_page_write(fio->page, fio); +skip: + if (fio->in_list) + goto next; +out: + if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) + __submit_merged_bio(io); + up_write(&io->io_rwsem); +} + +static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages, unsigned op_flag) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct bio *bio; + struct bio_post_read_ctx *ctx; + unsigned int post_read_steps = 0; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) + return ERR_PTR(-EFAULT); + + bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false); + if (!bio) + return ERR_PTR(-ENOMEM); + f2fs_target_device(sbi, blkaddr, bio); + bio->bi_end_io = f2fs_read_end_io; + bio_set_op_attrs(bio, REQ_OP_READ, op_flag); + + if (f2fs_encrypted_file(inode)) + post_read_steps |= 1 << STEP_DECRYPT; + if (post_read_steps) { + ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); + if (!ctx) { + bio_put(bio); + return ERR_PTR(-ENOMEM); + } + ctx->bio = bio; + ctx->enabled_steps = post_read_steps; + bio->bi_private = ctx; + } + + return bio; +} + +/* This can handle encryption stuffs */ +static int f2fs_submit_page_read(struct inode *inode, struct page *page, + block_t blkaddr) +{ + struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + /* wait for GCed page writeback via META_MAPPING */ + f2fs_wait_on_block_writeback(inode, blkaddr); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + return -EFAULT; + } + ClearPageError(page); + __submit_bio(F2FS_I_SB(inode), bio, DATA); + return 0; +} + +static void __set_data_blkaddr(struct dnode_of_data *dn) +{ + struct f2fs_node *rn = F2FS_NODE(dn->node_page); + __le32 *addr_array; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); + + /* Get physical address of data block */ + addr_array = blkaddr_in_node(rn); + addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); +} + +/* + * Lock ordering for the change of data block address: + * ->data_page + * ->node_page + * update block addresses in the node page + */ +void f2fs_set_data_blkaddr(struct dnode_of_data *dn) +{ + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + __set_data_blkaddr(dn); + if (set_page_dirty(dn->node_page)) + dn->node_changed = true; +} + +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) +{ + dn->data_blkaddr = blkaddr; + f2fs_set_data_blkaddr(dn); + f2fs_update_extent_cache(dn); +} + +/* dn->ofs_in_node will be returned with up-to-date last block pointer */ +int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + int err; + + if (!count) + return 0; + + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) + return -EPERM; + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; + + trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, + dn->ofs_in_node, count); + + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + + for (; count > 0; dn->ofs_in_node++) { + block_t blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); + if (blkaddr == NULL_ADDR) { + dn->data_blkaddr = NEW_ADDR; + __set_data_blkaddr(dn); + count--; + } + } + + if (set_page_dirty(dn->node_page)) + dn->node_changed = true; + return 0; +} + +/* Should keep dn->ofs_in_node unchanged */ +int f2fs_reserve_new_block(struct dnode_of_data *dn) +{ + unsigned int ofs_in_node = dn->ofs_in_node; + int ret; + + ret = f2fs_reserve_new_blocks(dn, 1); + dn->ofs_in_node = ofs_in_node; + return ret; +} + +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) +{ + bool need_put = dn->inode_page ? false : true; + int err; + + err = f2fs_get_dnode_of_data(dn, index, ALLOC_NODE); + if (err) + return err; + + if (dn->data_blkaddr == NULL_ADDR) + err = f2fs_reserve_new_block(dn); + if (err || need_put) + f2fs_put_dnode(dn); + return err; +} + +int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) +{ + struct extent_info ei = {0,0,0}; + struct inode *inode = dn->inode; + + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn->data_blkaddr = ei.blk + index - ei.fofs; + return 0; + } + + return f2fs_reserve_block(dn, index); +} + +struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, + int op_flags, bool for_write) +{ + struct address_space *mapping = inode->i_mapping; + struct dnode_of_data dn; + struct page *page; + struct extent_info ei = {0,0,0}; + int err; + + page = f2fs_grab_cache_page(mapping, index, for_write); + if (!page) + return ERR_PTR(-ENOMEM); + + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + goto got_it; + } + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) + goto put_err; + f2fs_put_dnode(&dn); + + if (unlikely(dn.data_blkaddr == NULL_ADDR)) { + err = -ENOENT; + goto put_err; + } +got_it: + if (PageUptodate(page)) { + unlock_page(page); + return page; + } + + /* + * A new dentry page is allocated but not able to be written, since its + * new inode page couldn't be allocated due to -ENOSPC. + * In such the case, its blkaddr can be remained as NEW_ADDR. + * see, f2fs_add_link -> f2fs_get_new_data_page -> + * f2fs_init_inode_metadata. + */ + if (dn.data_blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); + unlock_page(page); + return page; + } + + err = f2fs_submit_page_read(inode, page, dn.data_blkaddr); + if (err) + goto put_err; + return page; + +put_err: + f2fs_put_page(page, 1); + return ERR_PTR(err); +} + +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + + page = find_get_page(mapping, index); + if (page && PageUptodate(page)) + return page; + f2fs_put_page(page, 0); + + page = f2fs_get_read_data_page(inode, index, 0, false); + if (IS_ERR(page)) + return page; + + if (PageUptodate(page)) + return page; + + wait_on_page_locked(page); + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 0); + return ERR_PTR(-EIO); + } + return page; +} + +/* + * If it tries to access a hole, return an error. + * Because, the callers, functions in dir.c and GC, should be able to know + * whether this page exists or not. + */ +struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, + bool for_write) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; +repeat: + page = f2fs_get_read_data_page(inode, index, 0, for_write); + if (IS_ERR(page)) + return page; + + /* wait for read completion */ + lock_page(page); + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + return page; +} + +/* + * Caller ensures that this data page is never allocated. + * A new zero-filled data page is allocated in the page cache. + * + * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + * Note that, ipage is set only by make_empty_dir, and if any error occur, + * ipage should be released by this function. + */ +struct page *f2fs_get_new_data_page(struct inode *inode, + struct page *ipage, pgoff_t index, bool new_i_size) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + struct dnode_of_data dn; + int err; + + page = f2fs_grab_cache_page(mapping, index, true); + if (!page) { + /* + * before exiting, we should make sure ipage will be released + * if any error occur. + */ + f2fs_put_page(ipage, 1); + return ERR_PTR(-ENOMEM); + } + + set_new_dnode(&dn, inode, ipage, NULL, 0); + err = f2fs_reserve_block(&dn, index); + if (err) { + f2fs_put_page(page, 1); + return ERR_PTR(err); + } + if (!ipage) + f2fs_put_dnode(&dn); + + if (PageUptodate(page)) + goto got_it; + + if (dn.data_blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); + } else { + f2fs_put_page(page, 1); + + /* if ipage exists, blkaddr should be NEW_ADDR */ + f2fs_bug_on(F2FS_I_SB(inode), ipage); + page = f2fs_get_lock_data_page(inode, index, true); + if (IS_ERR(page)) + return page; + } +got_it: + if (new_i_size && i_size_read(inode) < + ((loff_t)(index + 1) << PAGE_SHIFT)) + f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT)); + return page; +} + +static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct f2fs_summary sum; + struct node_info ni; + block_t old_blkaddr; + blkcnt_t count = 1; + int err; + + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) + return -EPERM; + + err = f2fs_get_node_info(sbi, dn->nid, &ni); + if (err) + return err; + + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); + if (dn->data_blkaddr == NEW_ADDR) + goto alloc; + + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; + +alloc: + set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); + old_blkaddr = dn->data_blkaddr; + f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, + &sum, seg_type, NULL, false); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + invalidate_mapping_pages(META_MAPPING(sbi), + old_blkaddr, old_blkaddr); + f2fs_set_data_blkaddr(dn); + + /* + * i_size will be updated by direct_IO. Otherwise, we'll get stale + * data from unwritten block via dio_read. + */ + return 0; +} + +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct f2fs_map_blocks map; + int flag; + int err = 0; + bool direct_io = iocb->ki_flags & IOCB_DIRECT; + + /* convert inline data for Direct I/O*/ + if (direct_io) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } + + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) + return 0; + + map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); + map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); + if (map.m_len > map.m_lblk) + map.m_len -= map.m_lblk; + else + map.m_len = 0; + + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + + if (direct_io) { + map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint); + flag = f2fs_force_buffered_io(inode, WRITE) ? + F2FS_GET_BLOCK_PRE_AIO : + F2FS_GET_BLOCK_PRE_DIO; + goto map_blocks; + } + if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } + if (f2fs_has_inline_data(inode)) + return err; + + flag = F2FS_GET_BLOCK_PRE_AIO; + +map_blocks: + err = f2fs_map_blocks(inode, &map, 1, flag); + if (map.m_len > 0 && err == -ENOSPC) { + if (!direct_io) + set_inode_flag(inode, FI_NO_PREALLOC); + err = 0; + } + return err; +} + +static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +{ + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (lock) + down_read(&sbi->node_change); + else + up_read(&sbi->node_change); + } else { + if (lock) + f2fs_lock_op(sbi); + else + f2fs_unlock_op(sbi); + } +} + +/* + * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with + * f2fs_map_blocks structure. + * If original data blocks are allocated, then give them to blockdev. + * Otherwise, + * a. preallocate requested block addresses + * b. do not use extent cache for better performance + * c. give the block addresses to blockdev + */ +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, + int create, int flag) +{ + unsigned int maxblocks = map->m_len; + struct dnode_of_data dn; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int mode = create ? ALLOC_NODE : LOOKUP_NODE; + pgoff_t pgofs, end_offset, end; + int err = 0, ofs = 1; + unsigned int ofs_in_node, last_ofs_in_node; + blkcnt_t prealloc; + struct extent_info ei = {0,0,0}; + block_t blkaddr; + unsigned int start_pgofs; + + if (!maxblocks) + return 0; + + map->m_len = 0; + map->m_flags = 0; + + /* it only supports block size == page size */ + pgofs = (pgoff_t)map->m_lblk; + end = pgofs + maxblocks; + + if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + map->m_pblk = ei.blk + pgofs - ei.fofs; + map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); + map->m_flags = F2FS_MAP_MAPPED; + if (map->m_next_extent) + *map->m_next_extent = pgofs + map->m_len; + goto out; + } + +next_dnode: + if (create) + __do_map_lock(sbi, flag, true); + + /* When reading holes, we need its node page */ + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, pgofs, mode); + if (err) { + if (flag == F2FS_GET_BLOCK_BMAP) + map->m_pblk = 0; + if (err == -ENOENT) { + err = 0; + if (map->m_next_pgofs) + *map->m_next_pgofs = + f2fs_get_next_page_offset(&dn, pgofs); + if (map->m_next_extent) + *map->m_next_extent = + f2fs_get_next_page_offset(&dn, pgofs); + } + goto unlock_out; + } + + start_pgofs = pgofs; + prealloc = 0; + last_ofs_in_node = ofs_in_node = dn.ofs_in_node; + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + +next_block: + blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) { + err = -EFSCORRUPTED; + goto sync_out; + } + + if (!is_valid_data_blkaddr(sbi, blkaddr)) { + if (create) { + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto sync_out; + } + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (blkaddr == NULL_ADDR) { + prealloc++; + last_ofs_in_node = dn.ofs_in_node; + } + } else { + WARN_ON(flag != F2FS_GET_BLOCK_PRE_DIO && + flag != F2FS_GET_BLOCK_DIO); + err = __allocate_data_block(&dn, + map->m_seg_type); + if (!err) + set_inode_flag(inode, FI_APPEND_WRITE); + } + if (err) + goto sync_out; + map->m_flags |= F2FS_MAP_NEW; + blkaddr = dn.data_blkaddr; + } else { + if (flag == F2FS_GET_BLOCK_BMAP) { + map->m_pblk = 0; + goto sync_out; + } + if (flag == F2FS_GET_BLOCK_PRECACHE) + goto sync_out; + if (flag == F2FS_GET_BLOCK_FIEMAP && + blkaddr == NULL_ADDR) { + if (map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; + goto sync_out; + } + if (flag != F2FS_GET_BLOCK_FIEMAP) { + /* for defragment case */ + if (map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; + goto sync_out; + } + } + } + + if (flag == F2FS_GET_BLOCK_PRE_AIO) + goto skip; + + if (map->m_len == 0) { + /* preallocated unwritten block should be mapped for fiemap. */ + if (blkaddr == NEW_ADDR) + map->m_flags |= F2FS_MAP_UNWRITTEN; + map->m_flags |= F2FS_MAP_MAPPED; + + map->m_pblk = blkaddr; + map->m_len = 1; + } else if ((map->m_pblk != NEW_ADDR && + blkaddr == (map->m_pblk + ofs)) || + (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) || + flag == F2FS_GET_BLOCK_PRE_DIO) { + ofs++; + map->m_len++; + } else { + goto sync_out; + } + +skip: + dn.ofs_in_node++; + pgofs++; + + /* preallocate blocks in batch for one dnode page */ + if (flag == F2FS_GET_BLOCK_PRE_AIO && + (pgofs == end || dn.ofs_in_node == end_offset)) { + + dn.ofs_in_node = ofs_in_node; + err = f2fs_reserve_new_blocks(&dn, prealloc); + if (err) + goto sync_out; + + map->m_len += dn.ofs_in_node - ofs_in_node; + if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { + err = -ENOSPC; + goto sync_out; + } + dn.ofs_in_node = end_offset; + } + + if (pgofs >= end) + goto sync_out; + else if (dn.ofs_in_node < end_offset) + goto next_block; + + if (flag == F2FS_GET_BLOCK_PRECACHE) { + if (map->m_flags & F2FS_MAP_MAPPED) { + unsigned int ofs = start_pgofs - map->m_lblk; + + f2fs_update_extent_cache_range(&dn, + start_pgofs, map->m_pblk + ofs, + map->m_len - ofs); + } + } + + f2fs_put_dnode(&dn); + + if (create) { + __do_map_lock(sbi, flag, false); + f2fs_balance_fs(sbi, dn.node_changed); + } + goto next_dnode; + +sync_out: + if (flag == F2FS_GET_BLOCK_PRECACHE) { + if (map->m_flags & F2FS_MAP_MAPPED) { + unsigned int ofs = start_pgofs - map->m_lblk; + + f2fs_update_extent_cache_range(&dn, + start_pgofs, map->m_pblk + ofs, + map->m_len - ofs); + } + if (map->m_next_extent) + *map->m_next_extent = pgofs + 1; + } + f2fs_put_dnode(&dn); +unlock_out: + if (create) { + __do_map_lock(sbi, flag, false); + f2fs_balance_fs(sbi, dn.node_changed); + } +out: + trace_f2fs_map_blocks(inode, map, err); + return err; +} + +bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) +{ + struct f2fs_map_blocks map; + block_t last_lblk; + int err; + + if (pos + len > i_size_read(inode)) + return false; + + map.m_lblk = F2FS_BYTES_TO_BLK(pos); + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + last_lblk = F2FS_BLK_ALIGN(pos + len); + + while (map.m_lblk < last_lblk) { + map.m_len = last_lblk - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); + if (err || map.m_len == 0) + return false; + map.m_lblk += map.m_len; + } + return true; +} + +static int __get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create, int flag, + pgoff_t *next_pgofs, int seg_type) +{ + struct f2fs_map_blocks map; + int err; + + map.m_lblk = iblock; + map.m_len = bh->b_size >> inode->i_blkbits; + map.m_next_pgofs = next_pgofs; + map.m_next_extent = NULL; + map.m_seg_type = seg_type; + + err = f2fs_map_blocks(inode, &map, create, flag); + if (!err) { + map_bh(bh, inode->i_sb, map.m_pblk); + bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; + bh->b_size = (u64)map.m_len << inode->i_blkbits; + } + return err; +} + +static int get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create, int flag, + pgoff_t *next_pgofs) +{ + return __get_data_block(inode, iblock, bh_result, create, + flag, next_pgofs, + NO_CHECK_TYPE); +} + +static int get_data_block_dio(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, + F2FS_GET_BLOCK_DIO, NULL, + f2fs_rw_hint_to_seg_type( + inode->i_write_hint)); +} + +static int get_data_block_bmap(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + /* Block number less than F2FS MAX BLOCKS */ + if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks)) + return -EFBIG; + + return __get_data_block(inode, iblock, bh_result, create, + F2FS_GET_BLOCK_BMAP, NULL, + NO_CHECK_TYPE); +} + +static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) +{ + return (offset >> inode->i_blkbits); +} + +static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) +{ + return (blk << inode->i_blkbits); +} + +static int f2fs_xattr_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct page *page; + struct node_info ni; + __u64 phys = 0, len; + __u32 flags; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + int err = 0; + + if (f2fs_has_inline_xattr(inode)) { + int offset; + + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), + inode->i_ino, false); + if (!page) + return -ENOMEM; + + err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + if (err) { + f2fs_put_page(page, 1); + return err; + } + + phys = (__u64)blk_to_logical(inode, ni.blk_addr); + offset = offsetof(struct f2fs_inode, i_addr) + + sizeof(__le32) * (DEF_ADDRS_PER_INODE - + get_inline_xattr_addrs(inode)); + + phys += offset; + len = inline_xattr_size(inode); + + f2fs_put_page(page, 1); + + flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED; + + if (!xnid) + flags |= FIEMAP_EXTENT_LAST; + + err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); + if (err || err == 1) + return err; + } + + if (xnid) { + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), xnid, false); + if (!page) + return -ENOMEM; + + err = f2fs_get_node_info(sbi, xnid, &ni); + if (err) { + f2fs_put_page(page, 1); + return err; + } + + phys = (__u64)blk_to_logical(inode, ni.blk_addr); + len = inode->i_sb->s_blocksize; + + f2fs_put_page(page, 1); + + flags = FIEMAP_EXTENT_LAST; + } + + if (phys) + err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); + + return (err < 0 ? err : 0); +} + +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + struct buffer_head map_bh; + sector_t start_blk, last_blk; + pgoff_t next_pgofs; + u64 logical = 0, phys = 0, size = 0; + u32 flags = 0; + int ret = 0; + + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + ret = f2fs_precache_extents(inode); + if (ret) + return ret; + } + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR); + if (ret) + return ret; + + inode_lock(inode); + + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + ret = f2fs_xattr_fiemap(inode, fieinfo); + goto out; + } + + if (f2fs_has_inline_data(inode)) { + ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len); + if (ret != -EAGAIN) + goto out; + } + + if (logical_to_blk(inode, len) == 0) + len = blk_to_logical(inode, 1); + + start_blk = logical_to_blk(inode, start); + last_blk = logical_to_blk(inode, start + len - 1); + +next: + memset(&map_bh, 0, sizeof(struct buffer_head)); + map_bh.b_size = len; + + ret = get_data_block(inode, start_blk, &map_bh, 0, + F2FS_GET_BLOCK_FIEMAP, &next_pgofs); + if (ret) + goto out; + + /* HOLE */ + if (!buffer_mapped(&map_bh)) { + start_blk = next_pgofs; + + if (blk_to_logical(inode, start_blk) < blk_to_logical(inode, + F2FS_I_SB(inode)->max_file_blocks)) + goto prep_next; + + flags |= FIEMAP_EXTENT_LAST; + } + + if (size) { + if (f2fs_encrypted_inode(inode)) + flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; + + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + } + + if (start_blk > last_blk || ret) + goto out; + + logical = blk_to_logical(inode, start_blk); + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; + flags = 0; + if (buffer_unwritten(&map_bh)) + flags = FIEMAP_EXTENT_UNWRITTEN; + + start_blk += logical_to_blk(inode, size); + +prep_next: + cond_resched(); + if (fatal_signal_pending(current)) + ret = -EINTR; + else + goto next; +out: + if (ret == 1) + ret = 0; + + inode_unlock(inode); + return ret; +} + +/* + * This function was originally taken from fs/mpage.c, and customized for f2fs. + * Major change was from block_size == page_size in f2fs by default. + * + * Note that the aops->readpages() function is ONLY used for read-ahead. If + * this function ever deviates from doing just read-ahead, it should either + * use ->readpage() or do the necessary surgery to decouple ->readpages() + * from read-ahead. + */ +static int f2fs_mpage_readpages(struct address_space *mapping, + struct list_head *pages, struct page *page, + unsigned nr_pages, bool is_readahead) +{ + struct bio *bio = NULL; + sector_t last_block_in_bio = 0; + struct inode *inode = mapping->host; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocksize = 1 << blkbits; + sector_t block_in_file; + sector_t last_block; + sector_t last_block_in_file; + sector_t block_nr; + struct f2fs_map_blocks map; + + map.m_pblk = 0; + map.m_lblk = 0; + map.m_len = 0; + map.m_flags = 0; + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + + for (; nr_pages; nr_pages--) { + if (pages) { + page = list_last_entry(pages, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, + page->index, + readahead_gfp_mask(mapping))) + goto next_page; + } + + block_in_file = (sector_t)page->index; + last_block = block_in_file + nr_pages; + last_block_in_file = (i_size_read(inode) + blocksize - 1) >> + blkbits; + if (last_block > last_block_in_file) + last_block = last_block_in_file; + + /* + * Map blocks using the previous result first. + */ + if ((map.m_flags & F2FS_MAP_MAPPED) && + block_in_file > map.m_lblk && + block_in_file < (map.m_lblk + map.m_len)) + goto got_it; + + /* + * Then do more f2fs_map_blocks() calls until we are + * done with this page. + */ + map.m_flags = 0; + + if (block_in_file < last_block) { + map.m_lblk = block_in_file; + map.m_len = last_block - block_in_file; + + if (f2fs_map_blocks(inode, &map, 0, + F2FS_GET_BLOCK_DEFAULT)) + goto set_error_page; + } +got_it: + if ((map.m_flags & F2FS_MAP_MAPPED)) { + block_nr = map.m_pblk + block_in_file - map.m_lblk; + SetPageMappedToDisk(page); + + if (!PageUptodate(page) && !cleancache_get_page(page)) { + SetPageUptodate(page); + goto confused; + } + + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, + DATA_GENERIC)) + goto set_error_page; + } else { + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); + unlock_page(page); + goto next_page; + } + + /* + * This page will go to BIO. Do we need to send this + * BIO off first? + */ + if (bio && (last_block_in_bio != block_nr - 1 || + !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { +submit_and_realloc: + __submit_bio(F2FS_I_SB(inode), bio, DATA); + bio = NULL; + } + if (bio == NULL) { + bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, + is_readahead ? REQ_RAHEAD : 0); + if (IS_ERR(bio)) { + bio = NULL; + goto set_error_page; + } + } + + /* + * If the page is under writeback, we need to wait for + * its completion to see the correct decrypted data. + */ + f2fs_wait_on_block_writeback(inode, block_nr); + + if (bio_add_page(bio, page, blocksize, 0) < blocksize) + goto submit_and_realloc; + + ClearPageError(page); + last_block_in_bio = block_nr; + goto next_page; +set_error_page: + SetPageError(page); + zero_user_segment(page, 0, PAGE_SIZE); + unlock_page(page); + goto next_page; +confused: + if (bio) { + __submit_bio(F2FS_I_SB(inode), bio, DATA); + bio = NULL; + } + unlock_page(page); +next_page: + if (pages) + put_page(page); + } + BUG_ON(pages && !list_empty(pages)); + if (bio) + __submit_bio(F2FS_I_SB(inode), bio, DATA); + return 0; +} + +static int f2fs_read_data_page(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + int ret = -EAGAIN; + + trace_f2fs_readpage(page, DATA); + + /* If the file has inline data, try to read it directly */ + if (f2fs_has_inline_data(inode)) + ret = f2fs_read_inline_data(inode, page); + if (ret == -EAGAIN) + ret = f2fs_mpage_readpages(page->mapping, NULL, page, 1, false); + return ret; +} + +static int f2fs_read_data_pages(struct file *file, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = mapping->host; + struct page *page = list_last_entry(pages, struct page, lru); + + trace_f2fs_readpages(inode, page, nr_pages); + + /* If the file has inline data, skip readpages */ + if (f2fs_has_inline_data(inode)) + return 0; + + return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true); +} + +static int encrypt_one_page(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + struct page *mpage; + gfp_t gfp_flags = GFP_NOFS; + + if (!f2fs_encrypted_file(inode)) + return 0; + + /* wait for GCed page writeback via META_MAPPING */ + f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); + +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + PAGE_SIZE, 0, fio->page->index, gfp_flags); + if (IS_ERR(fio->encrypted_page)) { + /* flush pending IOs and wait for a while in the ENOMEM case */ + if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { + f2fs_flush_merged_writes(fio->sbi); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } + return PTR_ERR(fio->encrypted_page); + } + + mpage = find_lock_page(META_MAPPING(fio->sbi), fio->old_blkaddr); + if (mpage) { + if (PageUptodate(mpage)) + memcpy(page_address(mpage), + page_address(fio->encrypted_page), PAGE_SIZE); + f2fs_put_page(mpage, 1); + } + return 0; +} + +static inline bool check_inplace_update_policy(struct inode *inode, + struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int policy = SM_I(sbi)->ipu_policy; + + if (policy & (0x1 << F2FS_IPU_FORCE)) + return true; + if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi)) + return true; + if (policy & (0x1 << F2FS_IPU_UTIL) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && f2fs_need_SSR(sbi) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + + /* + * IPU for rewrite async pages + */ + if (policy & (0x1 << F2FS_IPU_ASYNC) && + fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC) && + !f2fs_encrypted_inode(inode)) + return true; + + /* this is only set during fdatasync */ + if (policy & (0x1 << F2FS_IPU_FSYNC) && + is_inode_flag_set(inode, FI_NEED_IPU)) + return true; + + return false; +} + +bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) +{ + if (f2fs_is_pinned_file(inode)) + return true; + + /* if this is cold file, we should overwrite to avoid fragmentation */ + if (file_is_cold(inode)) + return true; + + return check_inplace_update_policy(inode, fio); +} + +bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (test_opt(sbi, LFS)) + return true; + if (S_ISDIR(inode->i_mode)) + return true; + if (f2fs_is_atomic_file(inode)) + return true; + if (fio) { + if (is_cold_data(fio->page)) + return true; + if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + return true; + } + return false; +} + +static inline bool need_inplace_update(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + + if (f2fs_should_update_outplace(inode, fio)) + return false; + + return f2fs_should_update_inplace(inode, fio); +} + +int f2fs_do_write_data_page(struct f2fs_io_info *fio) +{ + struct page *page = fio->page; + struct inode *inode = page->mapping->host; + struct dnode_of_data dn; + struct extent_info ei = {0,0,0}; + struct node_info ni; + bool ipu_force = false; + int err = 0; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && + f2fs_lookup_extent_cache(inode, page->index, &ei)) { + fio->old_blkaddr = ei.blk + page->index - ei.fofs; + + if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, + DATA_GENERIC)) + return -EFSCORRUPTED; + + ipu_force = true; + fio->need_lock = LOCK_DONE; + goto got_it; + } + + /* Deadlock due to between page->lock and f2fs_lock_op */ + if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) + return -EAGAIN; + + err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + if (err) + goto out; + + fio->old_blkaddr = dn.data_blkaddr; + + /* This page is already truncated */ + if (fio->old_blkaddr == NULL_ADDR) { + ClearPageUptodate(page); + clear_cold_data(page); + goto out_writepage; + } +got_it: + if (__is_valid_data_blkaddr(fio->old_blkaddr) && + !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, + DATA_GENERIC)) { + err = -EFSCORRUPTED; + goto out_writepage; + } + /* + * If current allocation needs SSR, + * it had better in-place writes for updated data. + */ + if (ipu_force || (is_valid_data_blkaddr(fio->sbi, fio->old_blkaddr) && + need_inplace_update(fio))) { + err = encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); + ClearPageError(page); + f2fs_put_dnode(&dn); + if (fio->need_lock == LOCK_REQ) + f2fs_unlock_op(fio->sbi); + err = f2fs_inplace_write_data(fio); + trace_f2fs_do_write_data_page(fio->page, IPU); + set_inode_flag(inode, FI_UPDATE_WRITE); + return err; + } + + if (fio->need_lock == LOCK_RETRY) { + if (!f2fs_trylock_op(fio->sbi)) { + err = -EAGAIN; + goto out_writepage; + } + fio->need_lock = LOCK_REQ; + } + + err = f2fs_get_node_info(fio->sbi, dn.nid, &ni); + if (err) + goto out_writepage; + + fio->version = ni.version; + + err = encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); + ClearPageError(page); + + /* LFS mode write path */ + f2fs_outplace_write_data(&dn, fio); + trace_f2fs_do_write_data_page(page, OPU); + set_inode_flag(inode, FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); +out_writepage: + f2fs_put_dnode(&dn); +out: + if (fio->need_lock == LOCK_REQ) + f2fs_unlock_op(fio->sbi); + return err; +} + +static int __write_data_page(struct page *page, bool *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct inode *inode = page->mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + loff_t i_size = i_size_read(inode); + const pgoff_t end_index = ((unsigned long long) i_size) + >> PAGE_SHIFT; + loff_t psize = (loff_t)(page->index + 1) << PAGE_SHIFT; + unsigned offset = 0; + bool need_balance_fs = false; + int err = 0; + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = inode->i_ino, + .type = DATA, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NULL_ADDR, + .page = page, + .encrypted_page = NULL, + .submitted = false, + .need_lock = LOCK_RETRY, + .io_type = io_type, + .io_wbc = wbc, + }; + + trace_f2fs_writepage(page, DATA); + + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(page->mapping, -EIO); + /* + * don't drop any dirty dentry pages for keeping lastest + * directory structure. + */ + if (S_ISDIR(inode->i_mode)) + goto redirty_out; + goto out; + } + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + + if (page->index < end_index) + goto write; + + /* + * If the offset is out-of-range of file size, + * this page does not have to be written to disk. + */ + offset = i_size & (PAGE_SIZE - 1); + if ((page->index >= end_index + 1) || !offset) + goto out; + + zero_user_segment(page, offset, PAGE_SIZE); +write: + if (f2fs_is_drop_cache(inode)) + goto out; + /* we should not write 0'th page having journal header */ + if (f2fs_is_volatile_file(inode) && (!page->index || + (!wbc->for_reclaim && + f2fs_available_free_memory(sbi, BASE_CHECK)))) + goto redirty_out; + + /* Dentry blocks are controlled by checkpoint */ + if (S_ISDIR(inode->i_mode)) { + fio.need_lock = LOCK_DONE; + err = f2fs_do_write_data_page(&fio); + goto done; + } + + if (!wbc->for_reclaim) + need_balance_fs = true; + else if (has_not_enough_free_secs(sbi, 0, 0)) + goto redirty_out; + else + set_inode_flag(inode, FI_HOT_DATA); + + err = -EAGAIN; + if (f2fs_has_inline_data(inode)) { + err = f2fs_write_inline_data(inode, page); + if (!err) + goto out; + } + + if (err == -EAGAIN) { + err = f2fs_do_write_data_page(&fio); + if (err == -EAGAIN) { + fio.need_lock = LOCK_REQ; + err = f2fs_do_write_data_page(&fio); + } + } + + if (err) { + file_set_keep_isize(inode); + } else { + down_write(&F2FS_I(inode)->i_sem); + if (F2FS_I(inode)->last_disk_size < psize) + F2FS_I(inode)->last_disk_size = psize; + up_write(&F2FS_I(inode)->i_sem); + } + +done: + if (err && err != -ENOENT) + goto redirty_out; + +out: + inode_dec_dirty_pages(inode); + if (err) { + ClearPageUptodate(page); + clear_cold_data(page); + } + + if (wbc->for_reclaim) { + f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA); + clear_inode_flag(inode, FI_HOT_DATA); + f2fs_remove_dirty_inode(inode); + submitted = NULL; + } + + unlock_page(page); + if (!S_ISDIR(inode->i_mode)) + f2fs_balance_fs(sbi, need_balance_fs); + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_submit_merged_write(sbi, DATA); + submitted = NULL; + } + + if (submitted) + *submitted = fio.submitted; + + return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + /* + * pageout() in MM traslates EAGAIN, so calls handle_write_error() + * -> mapping_set_error() -> set_bit(AS_EIO, ...). + * file_write_and_wait_range() will see EIO error, which is critical + * to return value of fsync() followed by atomic_write failure to user. + */ + if (!err || wbc->for_reclaim) + return AOP_WRITEPAGE_ACTIVATE; + unlock_page(page); + return err; +} + +static int f2fs_write_data_page(struct page *page, + struct writeback_control *wbc) +{ + return __write_data_page(page, NULL, wbc, FS_DATA_IO); +} + +/* + * This function was copied from write_cche_pages from mm/page-writeback.c. + * The major change is making write step of cold data page separately from + * warm/hot data page. + */ +static int f2fs_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + int ret = 0; + int done = 0; + struct pagevec pvec; + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + int nr_pages; + pgoff_t uninitialized_var(writeback_index); + pgoff_t index; + pgoff_t end; /* Inclusive */ + pgoff_t done_index; + pgoff_t last_idx = ULONG_MAX; + int cycled; + int range_whole = 0; + int tag; + + pagevec_init(&pvec); + + if (get_dirty_pages(mapping->host) <= + SM_I(F2FS_M_SB(mapping))->min_hot_blocks) + set_inode_flag(mapping->host, FI_HOT_DATA); + else + clear_inode_flag(mapping->host, FI_HOT_DATA); + + if (wbc->range_cyclic) { + writeback_index = mapping->writeback_index; /* prev offset */ + index = writeback_index; + if (index == 0) + cycled = 1; + else + cycled = 0; + end = -1; + } else { + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + cycled = 1; /* ignore range_cyclic tests */ + } + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; +retry: + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + done_index = index; + while (!done && (index <= end)) { + int i; + + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + bool submitted = false; + + /* give a priority to WB_SYNC threads */ + if (atomic_read(&sbi->wb_sync_req[DATA]) && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } + + done_index = page->index; +retry_write: + lock_page(page); + + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + f2fs_wait_on_page_writeback(page, + DATA, true); + else + goto continue_unlock; + } + + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = __write_data_page(page, &submitted, wbc, io_type); + if (unlikely(ret)) { + /* + * keep nr_to_write, since vfs uses this to + * get # of written pages. + */ + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + continue; + } else if (ret == -EAGAIN) { + ret = 0; + if (wbc->sync_mode == WB_SYNC_ALL) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, + HZ/50); + goto retry_write; + } + continue; + } + done_index = page->index + 1; + done = 1; + break; + } else if (submitted) { + last_idx = page->index; + } + + if (--wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } + } + pagevec_release(&pvec); + cond_resched(); + } + + if (!cycled && !done) { + cycled = 1; + index = 0; + end = writeback_index - 1; + goto retry; + } + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = done_index; + + if (last_idx != ULONG_MAX) + f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host, + 0, last_idx, DATA); + + return ret; +} + +static inline bool __should_serialize_io(struct inode *inode, + struct writeback_control *wbc) +{ + if (!S_ISREG(inode->i_mode)) + return false; + if (wbc->sync_mode != WB_SYNC_ALL) + return true; + if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks) + return true; + return false; +} + +static int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct blk_plug plug; + int ret; + bool locked = false; + + /* deal with chardevs and other special file */ + if (!mapping->a_ops->writepage) + return 0; + + /* skip writing if there is no dirty page in this inode */ + if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) + return 0; + + /* during POR, we don't need to trigger writepage at all. */ + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && + get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && + f2fs_available_free_memory(sbi, DIRTY_DENTS)) + goto skip_write; + + /* skip writing during file defragment */ + if (is_inode_flag_set(inode, FI_DO_DEFRAG)) + goto skip_write; + + trace_f2fs_writepages(mapping->host, wbc, DATA); + + /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req[DATA]); + else if (atomic_read(&sbi->wb_sync_req[DATA])) + goto skip_write; + + if (__should_serialize_io(inode, wbc)) { + mutex_lock(&sbi->writepages); + locked = true; + } + + blk_start_plug(&plug); + ret = f2fs_write_cache_pages(mapping, wbc, io_type); + blk_finish_plug(&plug); + + if (locked) + mutex_unlock(&sbi->writepages); + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req[DATA]); + /* + * if some pages were truncated, we cannot guarantee its mapping->host + * to detect pending bios. + */ + + f2fs_remove_dirty_inode(inode); + return ret; + +skip_write: + wbc->pages_skipped += get_dirty_pages(inode); + trace_f2fs_writepages(mapping->host, wbc, DATA); + return 0; +} + +static int f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + + return __f2fs_write_data_pages(mapping, wbc, + F2FS_I(inode)->cp_task == current ? + FS_CP_DATA_IO : FS_DATA_IO); +} + +static void f2fs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + loff_t i_size = i_size_read(inode); + + if (to > i_size) { + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + + truncate_pagecache(inode, i_size); + f2fs_truncate_blocks(inode, i_size, true); + + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } +} + +static int prepare_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + pgoff_t index = page->index; + struct dnode_of_data dn; + struct page *ipage; + bool locked = false; + struct extent_info ei = {0,0,0}; + int err = 0; + int flag; + + /* + * we already allocated all the blocks, so we don't need to get + * the block addresses when there is no need to fill the page. + */ + if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE && + !is_inode_flag_set(inode, FI_NO_PREALLOC)) + return 0; + + /* f2fs_lock_op avoids race between write CP and convert_inline_page */ + if (f2fs_has_inline_data(inode) && pos + len > MAX_INLINE_DATA(inode)) + flag = F2FS_GET_BLOCK_DEFAULT; + else + flag = F2FS_GET_BLOCK_PRE_AIO; + + if (f2fs_has_inline_data(inode) || + (pos & PAGE_MASK) >= i_size_read(inode)) { + __do_map_lock(sbi, flag, true); + locked = true; + } +restart: + /* check inline_data */ + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_has_inline_data(inode)) { + if (pos + len <= MAX_INLINE_DATA(inode)) { + f2fs_do_read_inline_data(page, ipage); + set_inode_flag(inode, FI_DATA_EXIST); + if (inode->i_nlink) + set_inline_node(ipage); + } else { + err = f2fs_convert_inline_page(&dn, page); + if (err) + goto out; + if (dn.data_blkaddr == NULL_ADDR) + err = f2fs_get_block(&dn, index); + } + } else if (locked) { + err = f2fs_get_block(&dn, index); + } else { + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + /* hole case */ + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err || dn.data_blkaddr == NULL_ADDR) { + f2fs_put_dnode(&dn); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, + true); + WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); + locked = true; + goto restart; + } + } + } + + /* convert_inline_page can make node_changed */ + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; +out: + f2fs_put_dnode(&dn); +unlock_out: + if (locked) + __do_map_lock(sbi, flag, false); + return err; +} + +static int f2fs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct page *page = NULL; + pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; + bool need_balance = false, drop_atomic = false; + block_t blkaddr = NULL_ADDR; + int err = 0; + + trace_f2fs_write_begin(inode, pos, len, flags); + + if ((f2fs_is_atomic_file(inode) && + !f2fs_available_free_memory(sbi, INMEM_PAGES)) || + is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { + err = -ENOMEM; + drop_atomic = true; + goto fail; + } + + /* + * We should check this at this moment to avoid deadlock on inode page + * and #0 page. The locking rule for inline_data conversion should be: + * lock_page(page #0) -> lock_page(inode_page) + */ + if (index != 0) { + err = f2fs_convert_inline_inode(inode); + if (err) + goto fail; + } +repeat: + /* + * Do not use grab_cache_page_write_begin() to avoid deadlock due to + * wait_for_stable_page. Will wait that below with our IO control. + */ + page = f2fs_pagecache_get_page(mapping, index, + FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); + if (!page) { + err = -ENOMEM; + goto fail; + } + + *pagep = page; + + err = prepare_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); + if (err) + goto fail; + + if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) { + unlock_page(page); + f2fs_balance_fs(sbi, true); + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + f2fs_put_page(page, 1); + goto repeat; + } + } + + f2fs_wait_on_page_writeback(page, DATA, false); + + if (len == PAGE_SIZE || PageUptodate(page)) + return 0; + + if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode)) { + zero_user_segment(page, len, PAGE_SIZE); + return 0; + } + + if (blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + } else { + err = f2fs_submit_page_read(inode, page, blkaddr); + if (err) + goto fail; + + lock_page(page); + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + if (unlikely(!PageUptodate(page))) { + err = -EIO; + goto fail; + } + } + return 0; + +fail: + f2fs_put_page(page, 1); + f2fs_write_failed(mapping, pos + len); + if (drop_atomic) + f2fs_drop_inmem_pages_all(sbi, false); + return err; +} + +static int f2fs_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + + trace_f2fs_write_end(inode, pos, len, copied); + + /* + * This should be come from len == PAGE_SIZE, and we expect copied + * should be PAGE_SIZE. Otherwise, we treat it with zero copied and + * let generic_perform_write() try to copy data again through copied=0. + */ + if (!PageUptodate(page)) { + if (unlikely(copied != len)) + copied = 0; + else + SetPageUptodate(page); + } + if (!copied) + goto unlock_out; + + set_page_dirty(page); + + if (pos + copied > i_size_read(inode)) + f2fs_i_size_write(inode, pos + copied); +unlock_out: + f2fs_put_page(page, 1); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return copied; +} + +static int check_direct_IO(struct inode *inode, struct iov_iter *iter, + loff_t offset) +{ + unsigned i_blkbits = READ_ONCE(inode->i_blkbits); + unsigned blkbits = i_blkbits; + unsigned blocksize_mask = (1 << blkbits) - 1; + unsigned long align = offset | iov_iter_alignment(iter); + struct block_device *bdev = inode->i_sb->s_bdev; + + if (iov_iter_rw(iter) == READ && offset >= i_size_read(inode)) + return 1; + + if (align & blocksize_mask) { + if (bdev) + blkbits = blksize_bits(bdev_logical_block_size(bdev)); + blocksize_mask = (1 << blkbits) - 1; + if (align & blocksize_mask) + return -EINVAL; + return 1; + } + return 0; +} + +static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + size_t count = iov_iter_count(iter); + loff_t offset = iocb->ki_pos; + int rw = iov_iter_rw(iter); + int err; + enum rw_hint hint = iocb->ki_hint; + int whint_mode = F2FS_OPTION(sbi).whint_mode; + + err = check_direct_IO(inode, iter, offset); + if (err) + return err < 0 ? err : 0; + + if (f2fs_force_buffered_io(inode, rw)) + return 0; + + trace_f2fs_direct_IO_enter(inode, offset, count, rw); + + if (rw == WRITE && whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = WRITE_LIFE_NOT_SET; + + if (!down_read_trylock(&F2FS_I(inode)->i_gc_rwsem[rw])) { + if (iocb->ki_flags & IOCB_NOWAIT) { + iocb->ki_hint = hint; + err = -EAGAIN; + goto out; + } + down_read(&F2FS_I(inode)->i_gc_rwsem[rw]); + } + + err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio); + up_read(&F2FS_I(inode)->i_gc_rwsem[rw]); + + if (rw == WRITE) { + if (whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = hint; + if (err > 0) { + f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, + err); + set_inode_flag(inode, FI_UPDATE_WRITE); + } else if (err < 0) { + f2fs_write_failed(mapping, offset + count); + } + } + +out: + trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); + + return err; +} + +void f2fs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length) +{ + struct inode *inode = page->mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino >= F2FS_ROOT_INO(sbi) && + (offset % PAGE_SIZE || length != PAGE_SIZE)) + return; + + if (PageDirty(page)) { + if (inode->i_ino == F2FS_META_INO(sbi)) { + dec_page_count(sbi, F2FS_DIRTY_META); + } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { + dec_page_count(sbi, F2FS_DIRTY_NODES); + } else { + inode_dec_dirty_pages(inode); + f2fs_remove_dirty_inode(inode); + } + } + + clear_cold_data(page); + + /* This is atomic written page, keep Private */ + if (IS_ATOMIC_WRITTEN_PAGE(page)) + return f2fs_drop_inmem_page(inode, page); + + set_page_private(page, 0); + ClearPagePrivate(page); +} + +int f2fs_release_page(struct page *page, gfp_t wait) +{ + /* If this is dirty page, keep PagePrivate */ + if (PageDirty(page)) + return 0; + + /* This is atomic written page, keep Private */ + if (IS_ATOMIC_WRITTEN_PAGE(page)) + return 0; + + clear_cold_data(page); + set_page_private(page, 0); + ClearPagePrivate(page); + return 1; +} + +static int f2fs_set_data_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + + trace_f2fs_set_page_dirty(page, DATA); + + if (!PageUptodate(page)) + SetPageUptodate(page); + + if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { + if (!IS_ATOMIC_WRITTEN_PAGE(page)) { + f2fs_register_inmem_page(inode, page); + return 1; + } + /* + * Previously, this page has been registered, we just + * return here. + */ + return 0; + } + + if (!PageDirty(page)) { + __set_page_dirty_nobuffers(page); + f2fs_update_dirty_page(inode, page); + return 1; + } + return 0; +} + +static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + + if (f2fs_has_inline_data(inode)) + return 0; + + /* make sure allocating whole blocks */ + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + filemap_write_and_wait(mapping); + + return generic_block_bmap(mapping, block, get_data_block_bmap); +} + +#ifdef CONFIG_MIGRATION +#include <linux/migrate.h> + +int f2fs_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + int rc, extra_count; + struct f2fs_inode_info *fi = F2FS_I(mapping->host); + bool atomic_written = IS_ATOMIC_WRITTEN_PAGE(page); + + BUG_ON(PageWriteback(page)); + + /* migrating an atomic written page is safe with the inmem_lock hold */ + if (atomic_written) { + if (mode != MIGRATE_SYNC) + return -EBUSY; + if (!mutex_trylock(&fi->inmem_lock)) + return -EAGAIN; + } + + /* + * A reference is expected if PagePrivate set when move mapping, + * however F2FS breaks this for maintaining dirty page counts when + * truncating pages. So here adjusting the 'extra_count' make it work. + */ + extra_count = (atomic_written ? 1 : 0) - page_has_private(page); + rc = migrate_page_move_mapping(mapping, newpage, + page, NULL, mode, extra_count); + if (rc != MIGRATEPAGE_SUCCESS) { + if (atomic_written) + mutex_unlock(&fi->inmem_lock); + return rc; + } + + if (atomic_written) { + struct inmem_pages *cur; + list_for_each_entry(cur, &fi->inmem_pages, list) + if (cur->page == page) { + cur->page = newpage; + break; + } + mutex_unlock(&fi->inmem_lock); + put_page(page); + get_page(newpage); + } + + if (PagePrivate(page)) + SetPagePrivate(newpage); + set_page_private(newpage, page_private(page)); + + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); + + return MIGRATEPAGE_SUCCESS; +} +#endif + +const struct address_space_operations f2fs_dblock_aops = { + .readpage = f2fs_read_data_page, + .readpages = f2fs_read_data_pages, + .writepage = f2fs_write_data_page, + .writepages = f2fs_write_data_pages, + .write_begin = f2fs_write_begin, + .write_end = f2fs_write_end, + .set_page_dirty = f2fs_set_data_page_dirty, + .invalidatepage = f2fs_invalidate_page, + .releasepage = f2fs_release_page, + .direct_IO = f2fs_direct_IO, + .bmap = f2fs_bmap, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif +}; + +void f2fs_clear_radix_tree_dirty_tag(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + unsigned long flags; + + xa_lock_irqsave(&mapping->i_pages, flags); + radix_tree_tag_clear(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_DIRTY); + xa_unlock_irqrestore(&mapping->i_pages, flags); +} + +int __init f2fs_init_post_read_processing(void) +{ + bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, 0); + if (!bio_post_read_ctx_cache) + goto fail; + bio_post_read_ctx_pool = + mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, + bio_post_read_ctx_cache); + if (!bio_post_read_ctx_pool) + goto fail_free_cache; + return 0; + +fail_free_cache: + kmem_cache_destroy(bio_post_read_ctx_cache); +fail: + return -ENOMEM; +} + +void __exit f2fs_destroy_post_read_processing(void) +{ + mempool_destroy(bio_post_read_ctx_pool); + kmem_cache_destroy(bio_post_read_ctx_cache); +} diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c new file mode 100644 index 000000000..bbe155465 --- /dev/null +++ b/fs/f2fs/debug.c @@ -0,0 +1,527 @@ +/* + * f2fs debugging statistics + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * Copyright (c) 2012 Linux Foundation + * Copyright (c) 2012 Greg Kroah-Hartman <gregkh@linuxfoundation.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/backing-dev.h> +#include <linux/f2fs_fs.h> +#include <linux/blkdev.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "gc.h" + +static LIST_HEAD(f2fs_stat_list); +static struct dentry *f2fs_debugfs_root; +static DEFINE_MUTEX(f2fs_stat_mutex); + +static void update_general_status(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + int i; + + /* validation check of the segment numbers */ + si->hit_largest = atomic64_read(&sbi->read_hit_largest); + si->hit_cached = atomic64_read(&sbi->read_hit_cached); + si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); + si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; + si->total_ext = atomic64_read(&sbi->total_hit_ext); + si->ext_tree = atomic_read(&sbi->total_ext_tree); + si->zombie_tree = atomic_read(&sbi->total_zombie_tree); + si->ext_node = atomic_read(&sbi->total_ext_node); + si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); + si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); + si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); + si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_qdata = get_pages(sbi, F2FS_DIRTY_QDATA); + si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); + si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; + si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + si->nquota_files = sbi->nquota_files; + si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; + si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); + si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->vw_cnt = atomic_read(&sbi->vw_cnt); + si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); + si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); + si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); + si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); + if (SM_I(sbi) && SM_I(sbi)->fcc_info) { + si->nr_flushed = + atomic_read(&SM_I(sbi)->fcc_info->issued_flush); + si->nr_flushing = + atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + si->flush_list_empty = + llist_empty(&SM_I(sbi)->fcc_info->issue_list); + } + if (SM_I(sbi) && SM_I(sbi)->dcc_info) { + si->nr_discarded = + atomic_read(&SM_I(sbi)->dcc_info->issued_discard); + si->nr_discarding = + atomic_read(&SM_I(sbi)->dcc_info->issing_discard); + si->nr_discard_cmd = + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; + } + si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; + si->rsvd_segs = reserved_segments(sbi); + si->overp_segs = overprovision_segments(sbi); + si->valid_count = valid_user_blocks(sbi); + si->discard_blks = discard_blocks(sbi); + si->valid_node_count = valid_node_count(sbi); + si->valid_inode_count = valid_inode_count(sbi); + si->inline_xattr = atomic_read(&sbi->inline_xattr); + si->inline_inode = atomic_read(&sbi->inline_inode); + si->inline_dir = atomic_read(&sbi->inline_dir); + si->append = sbi->im[APPEND_INO].ino_num; + si->update = sbi->im[UPDATE_INO].ino_num; + si->orphans = sbi->im[ORPHAN_INO].ino_num; + si->utilization = utilization(sbi); + + si->free_segs = free_segments(sbi); + si->free_secs = free_sections(sbi); + si->prefree_count = prefree_segments(sbi); + si->dirty_count = dirty_segments(sbi); + if (sbi->node_inode) + si->node_pages = NODE_MAPPING(sbi)->nrpages; + if (sbi->meta_inode) + si->meta_pages = META_MAPPING(sbi)->nrpages; + si->nats = NM_I(sbi)->nat_cnt; + si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; + si->sits = MAIN_SEGS(sbi); + si->dirty_sits = SIT_I(sbi)->dirty_sentries; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID]; + si->avail_nids = NM_I(sbi)->available_nids; + si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; + si->bg_gc = sbi->bg_gc; + si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC]; + si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC]; + si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) + * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) + / 2; + si->util_valid = (int)(written_block_count(sbi) >> + sbi->log_blocks_per_seg) + * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) + / 2; + si->util_invalid = 50 - si->util_free - si->util_valid; + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { + struct curseg_info *curseg = CURSEG_I(sbi, i); + si->curseg[i] = curseg->segno; + si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); + si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); + } + + for (i = 0; i < 2; i++) { + si->segment_count[i] = sbi->segment_count[i]; + si->block_count[i] = sbi->block_count[i]; + } + + si->inplace_count = atomic_read(&sbi->inplace_count); +} + +/* + * This function calculates BDF of every segments + */ +static void update_sit_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + unsigned long long blks_per_sec, hblks_per_sec, total_vblocks; + unsigned long long bimodal, dist; + unsigned int segno, vblocks; + int ndirty = 0; + + bimodal = 0; + total_vblocks = 0; + blks_per_sec = BLKS_PER_SEC(sbi); + hblks_per_sec = blks_per_sec / 2; + for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { + vblocks = get_valid_blocks(sbi, segno, true); + dist = abs(vblocks - hblks_per_sec); + bimodal += dist * dist; + + if (vblocks > 0 && vblocks < blks_per_sec) { + total_vblocks += vblocks; + ndirty++; + } + } + dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100); + si->bimodal = div64_u64(bimodal, dist); + if (si->dirty_count) + si->avg_vblocks = div_u64(total_vblocks, ndirty); + else + si->avg_vblocks = 0; +} + +/* + * This function calculates memory footprint. + */ +static void update_mem_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + int i; + + if (si->base_mem) + goto get_cache; + + /* build stat */ + si->base_mem = sizeof(struct f2fs_stat_info); + + /* build superblock */ + si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; + si->base_mem += 2 * sizeof(struct f2fs_inode_info); + si->base_mem += sizeof(*sbi->ckpt); + + /* build sm */ + si->base_mem += sizeof(struct f2fs_sm_info); + + /* build sit */ + si->base_mem += sizeof(struct sit_info); + si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); + si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += SIT_VBLOCK_MAP_SIZE; + if (sbi->segs_per_sec > 1) + si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); + si->base_mem += __bitmap_size(sbi, SIT_BITMAP); + + /* build free segmap */ + si->base_mem += sizeof(struct free_segmap_info); + si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi)); + + /* build curseg */ + si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; + si->base_mem += PAGE_SIZE * NR_CURSEG_TYPE; + + /* build dirty segmap */ + si->base_mem += sizeof(struct dirty_seglist_info); + si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi)); + + /* build nm */ + si->base_mem += sizeof(struct f2fs_nm_info); + si->base_mem += __bitmap_size(sbi, NAT_BITMAP); + si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); + si->base_mem += NM_I(sbi)->nat_blocks * + f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK); + si->base_mem += NM_I(sbi)->nat_blocks / 8; + si->base_mem += NM_I(sbi)->nat_blocks * sizeof(unsigned short); + +get_cache: + si->cache_mem = 0; + + /* build gc */ + if (sbi->gc_thread) + si->cache_mem += sizeof(struct f2fs_gc_kthread); + + /* build merge flush thread */ + if (SM_I(sbi)->fcc_info) + si->cache_mem += sizeof(struct flush_cmd_control); + if (SM_I(sbi)->dcc_info) { + si->cache_mem += sizeof(struct discard_cmd_control); + si->cache_mem += sizeof(struct discard_cmd) * + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + } + + /* free nids */ + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] + + NM_I(sbi)->nid_cnt[PREALLOC_NID]) * + sizeof(struct free_nid); + si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); + si->cache_mem += NM_I(sbi)->dirty_nat_cnt * + sizeof(struct nat_entry_set); + si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); + for (i = 0; i < MAX_INO_ENTRY; i++) + si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); + si->cache_mem += atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree); + si->cache_mem += atomic_read(&sbi->total_ext_node) * + sizeof(struct extent_node); + + si->page_mem = 0; + if (sbi->node_inode) { + unsigned npages = NODE_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } + if (sbi->meta_inode) { + unsigned npages = META_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } +} + +static int stat_show(struct seq_file *s, void *v) +{ + struct f2fs_stat_info *si; + int i = 0; + int j; + + mutex_lock(&f2fs_stat_mutex); + list_for_each_entry(si, &f2fs_stat_list, stat_list) { + update_general_status(si->sbi); + + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", + si->sbi->sb->s_bdev, i++, + f2fs_readonly(si->sbi->sb) ? "RO": "RW", + f2fs_cp_error(si->sbi) ? "Error": "Good"); + seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", + si->sit_area_segs, si->nat_area_segs); + seq_printf(s, "[SSA: %d] [MAIN: %d", + si->ssa_area_segs, si->main_area_segs); + seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", + si->overp_segs, si->rsvd_segs); + if (test_opt(si->sbi, DISCARD)) + seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n", + si->utilization, si->valid_count, si->discard_blks); + else + seq_printf(s, "Utilization: %u%% (%u valid blocks)\n", + si->utilization, si->valid_count); + + seq_printf(s, " - Node: %u (Inode: %u, ", + si->valid_node_count, si->valid_inode_count); + seq_printf(s, "Other: %u)\n - Data: %u\n", + si->valid_node_count - si->valid_inode_count, + si->valid_count - si->valid_node_count); + seq_printf(s, " - Inline_xattr Inode: %u\n", + si->inline_xattr); + seq_printf(s, " - Inline_data Inode: %u\n", + si->inline_inode); + seq_printf(s, " - Inline_dentry Inode: %u\n", + si->inline_dir); + seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", + si->orphans, si->append, si->update); + seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", + si->main_area_segs, si->main_area_sections, + si->main_area_zones); + seq_printf(s, " - COLD data: %d, %d, %d\n", + si->curseg[CURSEG_COLD_DATA], + si->cursec[CURSEG_COLD_DATA], + si->curzone[CURSEG_COLD_DATA]); + seq_printf(s, " - WARM data: %d, %d, %d\n", + si->curseg[CURSEG_WARM_DATA], + si->cursec[CURSEG_WARM_DATA], + si->curzone[CURSEG_WARM_DATA]); + seq_printf(s, " - HOT data: %d, %d, %d\n", + si->curseg[CURSEG_HOT_DATA], + si->cursec[CURSEG_HOT_DATA], + si->curzone[CURSEG_HOT_DATA]); + seq_printf(s, " - Dir dnode: %d, %d, %d\n", + si->curseg[CURSEG_HOT_NODE], + si->cursec[CURSEG_HOT_NODE], + si->curzone[CURSEG_HOT_NODE]); + seq_printf(s, " - File dnode: %d, %d, %d\n", + si->curseg[CURSEG_WARM_NODE], + si->cursec[CURSEG_WARM_NODE], + si->curzone[CURSEG_WARM_NODE]); + seq_printf(s, " - Indir nodes: %d, %d, %d\n", + si->curseg[CURSEG_COLD_NODE], + si->cursec[CURSEG_COLD_NODE], + si->curzone[CURSEG_COLD_NODE]); + seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n", + si->main_area_segs - si->dirty_count - + si->prefree_count - si->free_segs, + si->dirty_count); + seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", + si->prefree_count, si->free_segs, si->free_secs); + seq_printf(s, "CP calls: %d (BG: %d)\n", + si->cp_count, si->bg_cp_count); + seq_printf(s, "GC calls: %d (BG: %d)\n", + si->call_count, si->bg_gc); + seq_printf(s, " - data segments : %d (%d)\n", + si->data_segs, si->bg_data_segs); + seq_printf(s, " - node segments : %d (%d)\n", + si->node_segs, si->bg_node_segs); + seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, + si->bg_data_blks + si->bg_node_blks); + seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, + si->bg_data_blks); + seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, + si->bg_node_blks); + seq_printf(s, "Skipped : atomic write %llu (%llu)\n", + si->skipped_atomic_files[BG_GC] + + si->skipped_atomic_files[FG_GC], + si->skipped_atomic_files[BG_GC]); + seq_puts(s, "\nExtent Cache:\n"); + seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", + si->hit_largest, si->hit_cached, + si->hit_rbtree); + seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", + !si->total_ext ? 0 : + div64_u64(si->hit_total * 100, si->total_ext), + si->hit_total, si->total_ext); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree, si->zombie_tree, si->ext_node); + seq_puts(s, "\nBalancing F2FS Async:\n"); + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " + "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", + si->nr_wb_cp_data, si->nr_wb_data, + si->nr_flushing, si->nr_flushed, + si->flush_list_empty, + si->nr_discarding, si->nr_discarded, + si->nr_discard_cmd, si->undiscard_blks); + seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " + "volatile IO: %4d (Max. %4d)\n", + si->inmem_pages, si->aw_cnt, si->max_aw_cnt, + si->vw_cnt, si->max_vw_cnt); + seq_printf(s, " - nodes: %4d in %4d\n", + si->ndirty_node, si->node_pages); + seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", + si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); + seq_printf(s, " - datas: %4d in files:%4d\n", + si->ndirty_data, si->ndirty_files); + seq_printf(s, " - quota datas: %4d in quota files:%4d\n", + si->ndirty_qdata, si->nquota_files); + seq_printf(s, " - meta: %4d in %4d\n", + si->ndirty_meta, si->meta_pages); + seq_printf(s, " - imeta: %4d\n", + si->ndirty_imeta); + seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", + si->dirty_nats, si->nats, si->dirty_sits, si->sits); + seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", + si->free_nids, si->avail_nids, si->alloc_nids); + seq_puts(s, "\nDistribution of User Blocks:"); + seq_puts(s, " [ valid | invalid | free ]\n"); + seq_puts(s, " ["); + + for (j = 0; j < si->util_valid; j++) + seq_putc(s, '-'); + seq_putc(s, '|'); + + for (j = 0; j < si->util_invalid; j++) + seq_putc(s, '-'); + seq_putc(s, '|'); + + for (j = 0; j < si->util_free; j++) + seq_putc(s, '-'); + seq_puts(s, "]\n\n"); + seq_printf(s, "IPU: %u blocks\n", si->inplace_count); + seq_printf(s, "SSR: %u blocks in %u segments\n", + si->block_count[SSR], si->segment_count[SSR]); + seq_printf(s, "LFS: %u blocks in %u segments\n", + si->block_count[LFS], si->segment_count[LFS]); + + /* segment usage info */ + update_sit_info(si->sbi); + seq_printf(s, "\nBDF: %u, avg. vblocks: %u\n", + si->bimodal, si->avg_vblocks); + + /* memory footprint */ + update_mem_info(si->sbi); + seq_printf(s, "\nMemory: %llu KB\n", + (si->base_mem + si->cache_mem + si->page_mem) >> 10); + seq_printf(s, " - static: %llu KB\n", + si->base_mem >> 10); + seq_printf(s, " - cached: %llu KB\n", + si->cache_mem >> 10); + seq_printf(s, " - paged : %llu KB\n", + si->page_mem >> 10); + } + mutex_unlock(&f2fs_stat_mutex); + return 0; +} + +static int stat_open(struct inode *inode, struct file *file) +{ + return single_open(file, stat_show, inode->i_private); +} + +static const struct file_operations stat_fops = { + .owner = THIS_MODULE, + .open = stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int f2fs_build_stats(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + struct f2fs_stat_info *si; + + si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL); + if (!si) + return -ENOMEM; + + si->all_area_segs = le32_to_cpu(raw_super->segment_count); + si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); + si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); + si->ssa_area_segs = le32_to_cpu(raw_super->segment_count_ssa); + si->main_area_segs = le32_to_cpu(raw_super->segment_count_main); + si->main_area_sections = le32_to_cpu(raw_super->section_count); + si->main_area_zones = si->main_area_sections / + le32_to_cpu(raw_super->secs_per_zone); + si->sbi = sbi; + sbi->stat_info = si; + + atomic64_set(&sbi->total_hit_ext, 0); + atomic64_set(&sbi->read_hit_rbtree, 0); + atomic64_set(&sbi->read_hit_largest, 0); + atomic64_set(&sbi->read_hit_cached, 0); + + atomic_set(&sbi->inline_xattr, 0); + atomic_set(&sbi->inline_inode, 0); + atomic_set(&sbi->inline_dir, 0); + atomic_set(&sbi->inplace_count, 0); + + atomic_set(&sbi->aw_cnt, 0); + atomic_set(&sbi->vw_cnt, 0); + atomic_set(&sbi->max_aw_cnt, 0); + atomic_set(&sbi->max_vw_cnt, 0); + + mutex_lock(&f2fs_stat_mutex); + list_add_tail(&si->stat_list, &f2fs_stat_list); + mutex_unlock(&f2fs_stat_mutex); + + return 0; +} + +void f2fs_destroy_stats(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + + mutex_lock(&f2fs_stat_mutex); + list_del(&si->stat_list); + mutex_unlock(&f2fs_stat_mutex); + + kfree(si); +} + +int __init f2fs_create_root_stats(void) +{ + struct dentry *file; + + f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); + if (!f2fs_debugfs_root) + return -ENOMEM; + + file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, + NULL, &stat_fops); + if (!file) { + debugfs_remove(f2fs_debugfs_root); + f2fs_debugfs_root = NULL; + return -ENOMEM; + } + + return 0; +} + +void f2fs_destroy_root_stats(void) +{ + if (!f2fs_debugfs_root) + return; + + debugfs_remove_recursive(f2fs_debugfs_root); + f2fs_debugfs_root = NULL; +} diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c new file mode 100644 index 000000000..2cd85ce3e --- /dev/null +++ b/fs/f2fs/dir.c @@ -0,0 +1,940 @@ +/* + * fs/f2fs/dir.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/sched/signal.h> +#include "f2fs.h" +#include "node.h" +#include "acl.h" +#include "xattr.h" +#include <trace/events/f2fs.h> + +static unsigned long dir_blocks(struct inode *inode) +{ + return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1)) + >> PAGE_SHIFT; +} + +static unsigned int dir_buckets(unsigned int level, int dir_level) +{ + if (level + dir_level < MAX_DIR_HASH_DEPTH / 2) + return 1 << (level + dir_level); + else + return MAX_DIR_BUCKETS; +} + +static unsigned int bucket_blocks(unsigned int level) +{ + if (level < MAX_DIR_HASH_DEPTH / 2) + return 2; + else + return 4; +} + +static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { + [F2FS_FT_UNKNOWN] = DT_UNKNOWN, + [F2FS_FT_REG_FILE] = DT_REG, + [F2FS_FT_DIR] = DT_DIR, + [F2FS_FT_CHRDEV] = DT_CHR, + [F2FS_FT_BLKDEV] = DT_BLK, + [F2FS_FT_FIFO] = DT_FIFO, + [F2FS_FT_SOCK] = DT_SOCK, + [F2FS_FT_SYMLINK] = DT_LNK, +}; + +static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR, + [S_IFCHR >> S_SHIFT] = F2FS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = F2FS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = F2FS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = F2FS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK, +}; + +static void set_de_type(struct f2fs_dir_entry *de, umode_t mode) +{ + de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; +} + +unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de) +{ + if (de->file_type < F2FS_FT_MAX) + return f2fs_filetype_table[de->file_type]; + return DT_UNKNOWN; +} + +static unsigned long dir_block_index(unsigned int level, + int dir_level, unsigned int idx) +{ + unsigned long i; + unsigned long bidx = 0; + + for (i = 0; i < level; i++) + bidx += dir_buckets(i, dir_level) * bucket_blocks(i); + bidx += idx * bucket_blocks(level); + return bidx; +} + +static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, + struct fscrypt_name *fname, + f2fs_hash_t namehash, + int *max_slots, + struct page **res_page) +{ + struct f2fs_dentry_block *dentry_blk; + struct f2fs_dir_entry *de; + struct f2fs_dentry_ptr d; + + dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); + + make_dentry_ptr_block(NULL, &d, dentry_blk); + de = f2fs_find_target_dentry(fname, namehash, max_slots, &d); + if (de) + *res_page = dentry_page; + + return de; +} + +struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, + f2fs_hash_t namehash, int *max_slots, + struct f2fs_dentry_ptr *d) +{ + struct f2fs_dir_entry *de; + unsigned long bit_pos = 0; + int max_len = 0; + + if (max_slots) + *max_slots = 0; + while (bit_pos < d->max) { + if (!test_bit_le(bit_pos, d->bitmap)) { + bit_pos++; + max_len++; + continue; + } + + de = &d->dentry[bit_pos]; + + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + + if (de->hash_code == namehash && + fscrypt_match_name(fname, d->filename[bit_pos], + le16_to_cpu(de->name_len))) + goto found; + + if (max_slots && max_len > *max_slots) + *max_slots = max_len; + max_len = 0; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + } + + de = NULL; +found: + if (max_slots && max_len > *max_slots) + *max_slots = max_len; + return de; +} + +static struct f2fs_dir_entry *find_in_level(struct inode *dir, + unsigned int level, + struct fscrypt_name *fname, + struct page **res_page) +{ + struct qstr name = FSTR_TO_QSTR(&fname->disk_name); + int s = GET_DENTRY_SLOTS(name.len); + unsigned int nbucket, nblock; + unsigned int bidx, end_block; + struct page *dentry_page; + struct f2fs_dir_entry *de = NULL; + bool room = false; + int max_slots; + f2fs_hash_t namehash = f2fs_dentry_hash(&name, fname); + + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); + nblock = bucket_blocks(level); + + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + le32_to_cpu(namehash) % nbucket); + end_block = bidx + nblock; + + for (; bidx < end_block; bidx++) { + /* no need to allocate new dentry pages to all the indices */ + dentry_page = f2fs_find_data_page(dir, bidx); + if (IS_ERR(dentry_page)) { + if (PTR_ERR(dentry_page) == -ENOENT) { + room = true; + continue; + } else { + *res_page = dentry_page; + break; + } + } + + de = find_in_block(dentry_page, fname, namehash, &max_slots, + res_page); + if (de) + break; + + if (max_slots >= s) + room = true; + f2fs_put_page(dentry_page, 0); + } + + if (!de && room && F2FS_I(dir)->chash != namehash) { + F2FS_I(dir)->chash = namehash; + F2FS_I(dir)->clevel = level; + } + + return de; +} + +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page) +{ + unsigned long npages = dir_blocks(dir); + struct f2fs_dir_entry *de = NULL; + unsigned int max_depth; + unsigned int level; + + *res_page = NULL; + + if (f2fs_has_inline_dentry(dir)) { + de = f2fs_find_in_inline_dir(dir, fname, res_page); + goto out; + } + + if (npages == 0) + goto out; + + max_depth = F2FS_I(dir)->i_current_depth; + if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { + f2fs_msg(F2FS_I_SB(dir)->sb, KERN_WARNING, + "Corrupted max_depth of %lu: %u", + dir->i_ino, max_depth); + max_depth = MAX_DIR_HASH_DEPTH; + f2fs_i_depth_write(dir, max_depth); + } + + for (level = 0; level < max_depth; level++) { + de = find_in_level(dir, level, fname, res_page); + if (de || IS_ERR(*res_page)) + break; + } +out: + /* This is to increase the speed of f2fs_create */ + if (!de) + F2FS_I(dir)->task = current; + return de; +} + +/* + * Find an entry in the specified directory with the wanted name. + * It returns the page where the entry was found (as a parameter - res_page), + * and the entry itself. Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + const struct qstr *child, struct page **res_page) +{ + struct f2fs_dir_entry *de = NULL; + struct fscrypt_name fname; + int err; + + err = fscrypt_setup_filename(dir, child, 1, &fname); + if (err) { + if (err == -ENOENT) + *res_page = NULL; + else + *res_page = ERR_PTR(err); + return NULL; + } + + de = __f2fs_find_entry(dir, &fname, res_page); + + fscrypt_free_filename(&fname); + return de; +} + +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) +{ + struct qstr dotdot = QSTR_INIT("..", 2); + + return f2fs_find_entry(dir, &dotdot, p); +} + +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, + struct page **page) +{ + ino_t res = 0; + struct f2fs_dir_entry *de; + + de = f2fs_find_entry(dir, qstr, page); + if (de) { + res = le32_to_cpu(de->ino); + f2fs_put_page(*page, 0); + } + + return res; +} + +void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, + struct page *page, struct inode *inode) +{ + enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; + lock_page(page); + f2fs_wait_on_page_writeback(page, type, true); + de->ino = cpu_to_le32(inode->i_ino); + set_de_type(de, inode->i_mode); + set_page_dirty(page); + + dir->i_mtime = dir->i_ctime = current_time(dir); + f2fs_mark_inode_dirty_sync(dir, false); + f2fs_put_page(page, 1); +} + +static void init_dent_inode(const struct qstr *name, struct page *ipage) +{ + struct f2fs_inode *ri; + + f2fs_wait_on_page_writeback(ipage, NODE, true); + + /* copy name info. to this inode page */ + ri = F2FS_INODE(ipage); + ri->i_namelen = cpu_to_le32(name->len); + memcpy(ri->i_name, name->name, name->len); + set_page_dirty(ipage); +} + +void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, + struct f2fs_dentry_ptr *d) +{ + struct qstr dot = QSTR_INIT(".", 1); + struct qstr dotdot = QSTR_INIT("..", 2); + + /* update dirent of "." */ + f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0); + + /* update dirent of ".." */ + f2fs_update_dentry(parent->i_ino, parent->i_mode, d, &dotdot, 0, 1); +} + +static int make_empty_dir(struct inode *inode, + struct inode *parent, struct page *page) +{ + struct page *dentry_page; + struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr d; + + if (f2fs_has_inline_dentry(inode)) + return f2fs_make_empty_inline_dir(inode, parent, page); + + dentry_page = f2fs_get_new_data_page(inode, page, 0, true); + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); + + dentry_blk = page_address(dentry_page); + + make_dentry_ptr_block(NULL, &d, dentry_blk); + f2fs_do_make_empty_dir(inode, parent, &d); + + set_page_dirty(dentry_page); + f2fs_put_page(dentry_page, 1); + return 0; +} + +struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, + const struct qstr *new_name, const struct qstr *orig_name, + struct page *dpage) +{ + struct page *page; + int dummy_encrypt = DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(dir)); + int err; + + if (is_inode_flag_set(inode, FI_NEW_INODE)) { + page = f2fs_new_inode_page(inode); + if (IS_ERR(page)) + return page; + + if (S_ISDIR(inode->i_mode)) { + /* in order to handle error case */ + get_page(page); + err = make_empty_dir(inode, dir, page); + if (err) { + lock_page(page); + goto put_error; + } + put_page(page); + } + + err = f2fs_init_acl(inode, dir, page, dpage); + if (err) + goto put_error; + + err = f2fs_init_security(inode, dir, orig_name, page); + if (err) + goto put_error; + + if ((f2fs_encrypted_inode(dir) || dummy_encrypt) && + f2fs_may_encrypt(inode)) { + err = fscrypt_inherit_context(dir, inode, page, false); + if (err) + goto put_error; + } + } else { + page = f2fs_get_node_page(F2FS_I_SB(dir), inode->i_ino); + if (IS_ERR(page)) + return page; + } + + if (new_name) { + init_dent_inode(new_name, page); + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); + } + + /* + * This file should be checkpointed during fsync. + * We lost i_pino from now on. + */ + if (is_inode_flag_set(inode, FI_INC_LINK)) { + if (!S_ISDIR(inode->i_mode)) + file_lost_pino(inode); + /* + * If link the tmpfile to alias through linkat path, + * we should remove this inode from orphan list. + */ + if (inode->i_nlink == 0) + f2fs_remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); + f2fs_i_links_write(inode, true); + } + return page; + +put_error: + clear_nlink(inode); + f2fs_update_inode(inode, page); + f2fs_put_page(page, 1); + return ERR_PTR(err); +} + +void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, + unsigned int current_depth) +{ + if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) { + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, true); + clear_inode_flag(inode, FI_NEW_INODE); + } + dir->i_mtime = dir->i_ctime = current_time(dir); + f2fs_mark_inode_dirty_sync(dir, false); + + if (F2FS_I(dir)->i_current_depth != current_depth) + f2fs_i_depth_write(dir, current_depth); + + if (inode && is_inode_flag_set(inode, FI_INC_LINK)) + clear_inode_flag(inode, FI_INC_LINK); +} + +int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots) +{ + int bit_start = 0; + int zero_start, zero_end; +next: + zero_start = find_next_zero_bit_le(bitmap, max_slots, bit_start); + if (zero_start >= max_slots) + return max_slots; + + zero_end = find_next_bit_le(bitmap, max_slots, zero_start); + if (zero_end - zero_start >= slots) + return zero_start; + + bit_start = zero_end + 1; + + if (zero_end + 1 >= max_slots) + return max_slots; + goto next; +} + +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, + const struct qstr *name, f2fs_hash_t name_hash, + unsigned int bit_pos) +{ + struct f2fs_dir_entry *de; + int slots = GET_DENTRY_SLOTS(name->len); + int i; + + de = &d->dentry[bit_pos]; + de->hash_code = name_hash; + de->name_len = cpu_to_le16(name->len); + memcpy(d->filename[bit_pos], name->name, name->len); + de->ino = cpu_to_le32(ino); + set_de_type(de, mode); + for (i = 0; i < slots; i++) { + __set_bit_le(bit_pos + i, (void *)d->bitmap); + /* avoid wrong garbage data for readdir */ + if (i) + (de + i)->name_len = 0; + } +} + +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode) +{ + unsigned int bit_pos; + unsigned int level; + unsigned int current_depth; + unsigned long bidx, block; + f2fs_hash_t dentry_hash; + unsigned int nbucket, nblock; + struct page *dentry_page = NULL; + struct f2fs_dentry_block *dentry_blk = NULL; + struct f2fs_dentry_ptr d; + struct page *page = NULL; + int slots, err = 0; + + level = 0; + slots = GET_DENTRY_SLOTS(new_name->len); + dentry_hash = f2fs_dentry_hash(new_name, NULL); + + current_depth = F2FS_I(dir)->i_current_depth; + if (F2FS_I(dir)->chash == dentry_hash) { + level = F2FS_I(dir)->clevel; + F2FS_I(dir)->chash = 0; + } + +start: + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { + f2fs_show_injection_info(FAULT_DIR_DEPTH); + return -ENOSPC; + } + + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) + return -ENOSPC; + + /* Increase the depth, if required */ + if (level == current_depth) + ++current_depth; + + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); + nblock = bucket_blocks(level); + + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + (le32_to_cpu(dentry_hash) % nbucket)); + + for (block = bidx; block <= (bidx + nblock - 1); block++) { + dentry_page = f2fs_get_new_data_page(dir, NULL, block, true); + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); + + dentry_blk = page_address(dentry_page); + bit_pos = f2fs_room_for_filename(&dentry_blk->dentry_bitmap, + slots, NR_DENTRY_IN_BLOCK); + if (bit_pos < NR_DENTRY_IN_BLOCK) + goto add_dentry; + + f2fs_put_page(dentry_page, 1); + } + + /* Move to next level to find the empty slot for new dentry */ + ++level; + goto start; +add_dentry: + f2fs_wait_on_page_writeback(dentry_page, DATA, true); + + if (inode) { + down_write(&F2FS_I(inode)->i_sem); + page = f2fs_init_inode_metadata(inode, dir, new_name, + orig_name, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } + } + + make_dentry_ptr_block(NULL, &d, dentry_blk); + f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); + + set_page_dirty(dentry_page); + + if (inode) { + f2fs_i_pino_write(inode, dir->i_ino); + + /* synchronize inode page's data from inode cache */ + if (is_inode_flag_set(inode, FI_NEW_INODE)) + f2fs_update_inode(inode, page); + + f2fs_put_page(page, 1); + } + + f2fs_update_parent_metadata(dir, inode, current_depth); +fail: + if (inode) + up_write(&F2FS_I(inode)->i_sem); + + f2fs_put_page(dentry_page, 1); + + return err; +} + +int f2fs_add_dentry(struct inode *dir, struct fscrypt_name *fname, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct qstr new_name; + int err = -EAGAIN; + + new_name.name = fname_name(fname); + new_name.len = fname_len(fname); + + if (f2fs_has_inline_dentry(dir)) + err = f2fs_add_inline_entry(dir, &new_name, fname->usr_fname, + inode, ino, mode); + if (err == -EAGAIN) + err = f2fs_add_regular_entry(dir, &new_name, fname->usr_fname, + inode, ino, mode); + + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + return err; +} + +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + */ +int f2fs_do_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct fscrypt_name fname; + struct page *page = NULL; + struct f2fs_dir_entry *de = NULL; + int err; + + err = fscrypt_setup_filename(dir, name, 0, &fname); + if (err) + return err; + + /* + * An immature stakable filesystem shows a race condition between lookup + * and create. If we have same task when doing lookup and create, it's + * definitely fine as expected by VFS normally. Otherwise, let's just + * verify on-disk dentry one more time, which guarantees filesystem + * consistency more. + */ + if (current != F2FS_I(dir)->task) { + de = __f2fs_find_entry(dir, &fname, &page); + F2FS_I(dir)->task = NULL; + } + if (de) { + f2fs_put_page(page, 0); + err = -EEXIST; + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + } else { + err = f2fs_add_dentry(dir, &fname, inode, ino, mode); + } + fscrypt_free_filename(&fname); + return err; +} + +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) +{ + struct page *page; + int err = 0; + + down_write(&F2FS_I(inode)->i_sem); + page = f2fs_init_inode_metadata(inode, dir, NULL, NULL, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } + f2fs_put_page(page, 1); + + clear_inode_flag(inode, FI_NEW_INODE); +fail: + up_write(&F2FS_I(inode)->i_sem); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return err; +} + +void f2fs_drop_nlink(struct inode *dir, struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + + down_write(&F2FS_I(inode)->i_sem); + + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, false); + inode->i_ctime = current_time(inode); + + f2fs_i_links_write(inode, false); + if (S_ISDIR(inode->i_mode)) { + f2fs_i_links_write(inode, false); + f2fs_i_size_write(inode, 0); + } + up_write(&F2FS_I(inode)->i_sem); + + if (inode->i_nlink == 0) + f2fs_add_orphan_inode(inode); + else + f2fs_release_orphan_inode(sbi); +} + +/* + * It only removes the dentry from the dentry page, corresponding name + * entry in name page does not need to be touched during deletion. + */ +void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode) +{ + struct f2fs_dentry_block *dentry_blk; + unsigned int bit_pos; + int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + int i; + + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + + if (F2FS_OPTION(F2FS_I_SB(dir)).fsync_mode == FSYNC_MODE_STRICT) + f2fs_add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); + + if (f2fs_has_inline_dentry(dir)) + return f2fs_delete_inline_entry(dentry, page, dir, inode); + + lock_page(page); + f2fs_wait_on_page_writeback(page, DATA, true); + + dentry_blk = page_address(page); + bit_pos = dentry - dentry_blk->dentry; + for (i = 0; i < slots; i++) + __clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + + /* Let's check and deallocate this dentry page */ + bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, + NR_DENTRY_IN_BLOCK, + 0); + set_page_dirty(page); + + dir->i_ctime = dir->i_mtime = current_time(dir); + f2fs_mark_inode_dirty_sync(dir, false); + + if (inode) + f2fs_drop_nlink(dir, inode); + + if (bit_pos == NR_DENTRY_IN_BLOCK && + !f2fs_truncate_hole(dir, page->index, page->index + 1)) { + f2fs_clear_radix_tree_dirty_tag(page); + clear_page_dirty_for_io(page); + ClearPagePrivate(page); + ClearPageUptodate(page); + clear_cold_data(page); + inode_dec_dirty_pages(dir); + f2fs_remove_dirty_inode(dir); + } + f2fs_put_page(page, 1); +} + +bool f2fs_empty_dir(struct inode *dir) +{ + unsigned long bidx; + struct page *dentry_page; + unsigned int bit_pos; + struct f2fs_dentry_block *dentry_blk; + unsigned long nblock = dir_blocks(dir); + + if (f2fs_has_inline_dentry(dir)) + return f2fs_empty_inline_dir(dir); + + for (bidx = 0; bidx < nblock; bidx++) { + dentry_page = f2fs_get_lock_data_page(dir, bidx, false); + if (IS_ERR(dentry_page)) { + if (PTR_ERR(dentry_page) == -ENOENT) + continue; + else + return false; + } + + dentry_blk = page_address(dentry_page); + if (bidx == 0) + bit_pos = 2; + else + bit_pos = 0; + bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, + NR_DENTRY_IN_BLOCK, + bit_pos); + + f2fs_put_page(dentry_page, 1); + + if (bit_pos < NR_DENTRY_IN_BLOCK) + return false; + } + return true; +} + +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, + unsigned int start_pos, struct fscrypt_str *fstr) +{ + unsigned char d_type = DT_UNKNOWN; + unsigned int bit_pos; + struct f2fs_dir_entry *de = NULL; + struct fscrypt_str de_name = FSTR_INIT(NULL, 0); + struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); + int err = 0; + + bit_pos = ((unsigned long)ctx->pos % d->max); + + while (bit_pos < d->max) { + bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos); + if (bit_pos >= d->max) + break; + + de = &d->dentry[bit_pos]; + if (de->name_len == 0) { + bit_pos++; + ctx->pos = start_pos + bit_pos; + continue; + } + + d_type = f2fs_get_de_type(de); + + de_name.name = d->filename[bit_pos]; + de_name.len = le16_to_cpu(de->name_len); + + /* check memory boundary before moving forward */ + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + if (unlikely(bit_pos > d->max || + le16_to_cpu(de->name_len) > F2FS_NAME_LEN)) { + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: corrupted namelen=%d, run fsck to fix.", + __func__, le16_to_cpu(de->name_len)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EINVAL; + goto out; + } + + if (f2fs_encrypted_inode(d->inode)) { + int save_len = fstr->len; + + err = fscrypt_fname_disk_to_usr(d->inode, + (u32)de->hash_code, 0, + &de_name, fstr); + if (err) + return err; + + de_name = *fstr; + fstr->len = save_len; + } + + if (!dir_emit(ctx, de_name.name, de_name.len, + le32_to_cpu(de->ino), d_type)) + return 1; + + if (sbi->readdir_ra == 1) + f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); + + ctx->pos = start_pos + bit_pos; + } +out: + return err; +} + +static int f2fs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + unsigned long npages = dir_blocks(inode); + struct f2fs_dentry_block *dentry_blk = NULL; + struct page *dentry_page = NULL; + struct file_ra_state *ra = &file->f_ra; + loff_t start_pos = ctx->pos; + unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); + struct f2fs_dentry_ptr d; + struct fscrypt_str fstr = FSTR_INIT(NULL, 0); + int err = 0; + + if (f2fs_encrypted_inode(inode)) { + err = fscrypt_get_encryption_info(inode); + if (err && err != -ENOKEY) + goto out; + + err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr); + if (err < 0) + goto out; + } + + if (f2fs_has_inline_dentry(inode)) { + err = f2fs_read_inline_dir(file, ctx, &fstr); + goto out_free; + } + + for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) { + + /* allow readdir() to be interrupted */ + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto out_free; + } + cond_resched(); + + /* readahead for multi pages of dir */ + if (npages - n > 1 && !ra_has_index(ra, n)) + page_cache_sync_readahead(inode->i_mapping, ra, file, n, + min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); + + dentry_page = f2fs_get_lock_data_page(inode, n, false); + if (IS_ERR(dentry_page)) { + err = PTR_ERR(dentry_page); + if (err == -ENOENT) { + err = 0; + continue; + } else { + goto out_free; + } + } + + dentry_blk = page_address(dentry_page); + + make_dentry_ptr_block(inode, &d, dentry_blk); + + err = f2fs_fill_dentries(ctx, &d, + n * NR_DENTRY_IN_BLOCK, &fstr); + if (err) { + f2fs_put_page(dentry_page, 1); + break; + } + + f2fs_put_page(dentry_page, 1); + } +out_free: + fscrypt_fname_free_buffer(&fstr); +out: + trace_f2fs_readdir(inode, start_pos, ctx->pos, err); + return err < 0 ? err : 0; +} + +static int f2fs_dir_open(struct inode *inode, struct file *filp) +{ + if (f2fs_encrypted_inode(inode)) + return fscrypt_get_encryption_info(inode) ? -EACCES : 0; + return 0; +} + +const struct file_operations f2fs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = f2fs_readdir, + .fsync = f2fs_sync_file, + .open = f2fs_dir_open, + .unlocked_ioctl = f2fs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = f2fs_compat_ioctl, +#endif +}; diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c new file mode 100644 index 000000000..a70cd2580 --- /dev/null +++ b/fs/f2fs/extent_cache.c @@ -0,0 +1,835 @@ +/* + * f2fs extent cache support + * + * Copyright (c) 2015 Motorola Mobility + * Copyright (c) 2015 Samsung Electronics + * Authors: Jaegeuk Kim <jaegeuk@kernel.org> + * Chao Yu <chao2.yu@samsung.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "node.h" +#include <trace/events/f2fs.h> + +static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, + unsigned int ofs) +{ + if (cached_re) { + if (cached_re->ofs <= ofs && + cached_re->ofs + cached_re->len > ofs) { + return cached_re; + } + } + return NULL; +} + +static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, + unsigned int ofs) +{ + struct rb_node *node = root->rb_node; + struct rb_entry *re; + + while (node) { + re = rb_entry(node, struct rb_entry, rb_node); + + if (ofs < re->ofs) + node = node->rb_left; + else if (ofs >= re->ofs + re->len) + node = node->rb_right; + else + return re; + } + return NULL; +} + +struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs) +{ + struct rb_entry *re; + + re = __lookup_rb_tree_fast(cached_re, ofs); + if (!re) + return __lookup_rb_tree_slow(root, ofs); + + return re; +} + +struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs) +{ + struct rb_node **p = &root->rb_node; + struct rb_entry *re; + + while (*p) { + *parent = *p; + re = rb_entry(*parent, struct rb_entry, rb_node); + + if (ofs < re->ofs) + p = &(*p)->rb_left; + else if (ofs >= re->ofs + re->len) + p = &(*p)->rb_right; + else + f2fs_bug_on(sbi, 1); + } + + return p; +} + +/* + * lookup rb entry in position of @ofs in rb-tree, + * if hit, return the entry, otherwise, return NULL + * @prev_ex: extent before ofs + * @next_ex: extent after ofs + * @insert_p: insert point for new extent at ofs + * in order to simpfy the insertion after. + * tree must stay unchanged between lookup and insertion. + */ +struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, + unsigned int ofs, + struct rb_entry **prev_entry, + struct rb_entry **next_entry, + struct rb_node ***insert_p, + struct rb_node **insert_parent, + bool force) +{ + struct rb_node **pnode = &root->rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct rb_entry *re = cached_re; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_entry = NULL; + *next_entry = NULL; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + if (re) { + if (re->ofs <= ofs && re->ofs + re->len > ofs) + goto lookup_neighbors; + } + + while (*pnode) { + parent = *pnode; + re = rb_entry(*pnode, struct rb_entry, rb_node); + + if (ofs < re->ofs) + pnode = &(*pnode)->rb_left; + else if (ofs >= re->ofs + re->len) + pnode = &(*pnode)->rb_right; + else + goto lookup_neighbors; + } + + *insert_p = pnode; + *insert_parent = parent; + + re = rb_entry(parent, struct rb_entry, rb_node); + tmp_node = parent; + if (parent && ofs > re->ofs) + tmp_node = rb_next(parent); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + + tmp_node = parent; + if (parent && ofs < re->ofs) + tmp_node = rb_prev(parent); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + return NULL; + +lookup_neighbors: + if (ofs == re->ofs || force) { + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&re->rb_node); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + if (ofs == re->ofs + re->len - 1 || force) { + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&re->rb_node); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + return re; +} + +bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct rb_node *cur = rb_first(root), *next; + struct rb_entry *cur_re, *next_re; + + if (!cur) + return true; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_re = rb_entry(cur, struct rb_entry, rb_node); + next_re = rb_entry(next, struct rb_entry, rb_node); + + if (cur_re->ofs + cur_re->len > next_re->ofs) { + f2fs_msg(sbi->sb, KERN_INFO, "inconsistent rbtree, " + "cur(%u, %u) next(%u, %u)", + cur_re->ofs, cur_re->len, + next_re->ofs, next_re->len); + return false; + } + + cur = next; + } +#endif + return true; +} + +static struct kmem_cache *extent_tree_slab; +static struct kmem_cache *extent_node_slab; + +static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct rb_node *parent, struct rb_node **p) +{ + struct extent_node *en; + + en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC); + if (!en) + return NULL; + + en->ei = *ei; + INIT_LIST_HEAD(&en->list); + en->et = et; + + rb_link_node(&en->rb_node, parent, p); + rb_insert_color(&en->rb_node, &et->root); + atomic_inc(&et->node_cnt); + atomic_inc(&sbi->total_ext_node); + return en; +} + +static void __detach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + rb_erase(&en->rb_node, &et->root); + atomic_dec(&et->node_cnt); + atomic_dec(&sbi->total_ext_node); + + if (et->cached_en == en) + et->cached_en = NULL; + kmem_cache_free(extent_node_slab, en); +} + +/* + * Flow to release an extent_node: + * 1. list_del_init + * 2. __detach_extent_node + * 3. kmem_cache_free. + */ +static void __release_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + spin_lock(&sbi->extent_lock); + f2fs_bug_on(sbi, list_empty(&en->list)); + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + + __detach_extent_node(sbi, et, en); +} + +static struct extent_tree *__grab_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + nid_t ino = inode->i_ino; + + mutex_lock(&sbi->extent_tree_lock); + et = radix_tree_lookup(&sbi->extent_tree_root, ino); + if (!et) { + et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); + f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); + memset(et, 0, sizeof(struct extent_tree)); + et->ino = ino; + et->root = RB_ROOT; + et->cached_en = NULL; + rwlock_init(&et->lock); + INIT_LIST_HEAD(&et->list); + atomic_set(&et->node_cnt, 0); + atomic_inc(&sbi->total_ext_tree); + } else { + atomic_dec(&sbi->total_zombie_tree); + list_del_init(&et->list); + } + mutex_unlock(&sbi->extent_tree_lock); + + /* never died until evict_inode */ + F2FS_I(inode)->extent_tree = et; + + return et; +} + +static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei) +{ + struct rb_node **p = &et->root.rb_node; + struct extent_node *en; + + en = __attach_extent_node(sbi, et, ei, NULL, p); + if (!en) + return NULL; + + et->largest = en->ei; + et->cached_en = en; + return en; +} + +static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et) +{ + struct rb_node *node, *next; + struct extent_node *en; + unsigned int count = atomic_read(&et->node_cnt); + + node = rb_first(&et->root); + while (node) { + next = rb_next(node); + en = rb_entry(node, struct extent_node, rb_node); + __release_extent_node(sbi, et, en); + node = next; + } + + return count - atomic_read(&et->node_cnt); +} + +static void __drop_largest_extent(struct extent_tree *et, + pgoff_t fofs, unsigned int len) +{ + if (fofs < et->largest.fofs + et->largest.len && + fofs + len > et->largest.fofs) { + et->largest.len = 0; + et->largest_updated = true; + } +} + +/* return true, if inode page is changed */ +static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + struct extent_node *en; + struct extent_info ei; + + if (!f2fs_may_extent_tree(inode)) { + /* drop largest extent */ + if (i_ext && i_ext->len) { + i_ext->len = 0; + return true; + } + return false; + } + + et = __grab_extent_tree(inode); + + if (!i_ext || !i_ext->len) + return false; + + get_extent_info(&ei, i_ext); + + write_lock(&et->lock); + if (atomic_read(&et->node_cnt)) + goto out; + + en = __init_extent_tree(sbi, et, &ei); + if (en) { + spin_lock(&sbi->extent_lock); + list_add_tail(&en->list, &sbi->extent_list); + spin_unlock(&sbi->extent_lock); + } +out: + write_unlock(&et->lock); + return false; +} + +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +{ + bool ret = __f2fs_init_extent_tree(inode, i_ext); + + if (!F2FS_I(inode)->extent_tree) + set_inode_flag(inode, FI_NO_EXTENT); + + return ret; +} + +static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_node *en; + bool ret = false; + + f2fs_bug_on(sbi, !et); + + trace_f2fs_lookup_extent_tree_start(inode, pgofs); + + read_lock(&et->lock); + + if (et->largest.fofs <= pgofs && + et->largest.fofs + et->largest.len > pgofs) { + *ei = et->largest; + ret = true; + stat_inc_largest_node_hit(sbi); + goto out; + } + + en = (struct extent_node *)f2fs_lookup_rb_tree(&et->root, + (struct rb_entry *)et->cached_en, pgofs); + if (!en) + goto out; + + if (en == et->cached_en) + stat_inc_cached_node_hit(sbi); + else + stat_inc_rbtree_node_hit(sbi); + + *ei = en->ei; + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &sbi->extent_list); + et->cached_en = en; + } + spin_unlock(&sbi->extent_lock); + ret = true; +out: + stat_inc_total_hit(sbi); + read_unlock(&et->lock); + + trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei); + return ret; +} + +static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct extent_node *prev_ex, + struct extent_node *next_ex) +{ + struct extent_node *en = NULL; + + if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { + prev_ex->ei.len += ei->len; + ei = &prev_ex->ei; + en = prev_ex; + } + + if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { + next_ex->ei.fofs = ei->fofs; + next_ex->ei.blk = ei->blk; + next_ex->ei.len += ei->len; + if (en) + __release_extent_node(sbi, et, prev_ex); + + en = next_ex; + } + + if (!en) + return NULL; + + __try_update_largest_extent(et, en); + + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &sbi->extent_list); + et->cached_en = en; + } + spin_unlock(&sbi->extent_lock); + return en; +} + +static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct rb_node **insert_p, + struct rb_node *insert_parent) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct extent_node *en = NULL; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; + } + + p = f2fs_lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); +do_insert: + en = __attach_extent_node(sbi, et, ei, parent, p); + if (!en) + return NULL; + + __try_update_largest_extent(et, en); + + /* update in global extent list */ + spin_lock(&sbi->extent_lock); + list_add_tail(&en->list, &sbi->extent_list); + et->cached_en = en; + spin_unlock(&sbi->extent_lock); + return en; +} + +static void f2fs_update_extent_tree_range(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_node *en = NULL, *en1 = NULL; + struct extent_node *prev_en = NULL, *next_en = NULL; + struct extent_info ei, dei, prev; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + unsigned int end = fofs + len; + unsigned int pos = (unsigned int)fofs; + bool updated = false; + + if (!et) + return; + + trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); + + write_lock(&et->lock); + + if (is_inode_flag_set(inode, FI_NO_EXTENT)) { + write_unlock(&et->lock); + return; + } + + prev = et->largest; + dei.len = 0; + + /* + * drop largest extent before lookup, in case it's already + * been shrunk from extent tree + */ + __drop_largest_extent(et, fofs, len); + + /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ + en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, + (struct rb_entry *)et->cached_en, fofs, + (struct rb_entry **)&prev_en, + (struct rb_entry **)&next_en, + &insert_p, &insert_parent, false); + if (!en) + en = next_en; + + /* 2. invlidate all extent nodes in range [fofs, fofs + len - 1] */ + while (en && en->ei.fofs < end) { + unsigned int org_end; + int parts = 0; /* # of parts current extent split into */ + + next_en = en1 = NULL; + + dei = en->ei; + org_end = dei.fofs + dei.len; + f2fs_bug_on(sbi, pos >= org_end); + + if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { + en->ei.len = pos - en->ei.fofs; + prev_en = en; + parts = 1; + } + + if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { + if (parts) { + set_extent_info(&ei, end, + end - dei.fofs + dei.blk, + org_end - end); + en1 = __insert_extent_tree(sbi, et, &ei, + NULL, NULL); + next_en = en1; + } else { + en->ei.fofs = end; + en->ei.blk += end - dei.fofs; + en->ei.len -= end - dei.fofs; + next_en = en; + } + parts++; + } + + if (!next_en) { + struct rb_node *node = rb_next(&en->rb_node); + + next_en = rb_entry_safe(node, struct extent_node, + rb_node); + } + + if (parts) + __try_update_largest_extent(et, en); + else + __release_extent_node(sbi, et, en); + + /* + * if original extent is split into zero or two parts, extent + * tree has been altered by deletion or insertion, therefore + * invalidate pointers regard to tree. + */ + if (parts != 1) { + insert_p = NULL; + insert_parent = NULL; + } + en = next_en; + } + + /* 3. update extent in extent cache */ + if (blkaddr) { + + set_extent_info(&ei, fofs, blkaddr, len); + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent); + + /* give up extent_cache, if split and small updates happen */ + if (dei.len >= 1 && + prev.len < F2FS_MIN_EXTENT_LEN && + et->largest.len < F2FS_MIN_EXTENT_LEN) { + et->largest.len = 0; + et->largest_updated = true; + set_inode_flag(inode, FI_NO_EXTENT); + } + } + + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + __free_extent_tree(sbi, et); + + if (et->largest_updated) { + et->largest_updated = false; + updated = true; + } + + write_unlock(&et->lock); + + if (updated) + f2fs_mark_inode_dirty_sync(inode, true); +} + +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + struct extent_tree *et, *next; + struct extent_node *en; + unsigned int node_cnt = 0, tree_cnt = 0; + int remained; + + if (!test_opt(sbi, EXTENT_CACHE)) + return 0; + + if (!atomic_read(&sbi->total_zombie_tree)) + goto free_node; + + if (!mutex_trylock(&sbi->extent_tree_lock)) + goto out; + + /* 1. remove unreferenced extent tree */ + list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + if (atomic_read(&et->node_cnt)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et); + write_unlock(&et->lock); + } + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); + list_del_init(&et->list); + radix_tree_delete(&sbi->extent_tree_root, et->ino); + kmem_cache_free(extent_tree_slab, et); + atomic_dec(&sbi->total_ext_tree); + atomic_dec(&sbi->total_zombie_tree); + tree_cnt++; + + if (node_cnt + tree_cnt >= nr_shrink) + goto unlock_out; + cond_resched(); + } + mutex_unlock(&sbi->extent_tree_lock); + +free_node: + /* 2. remove LRU extent entries */ + if (!mutex_trylock(&sbi->extent_tree_lock)) + goto out; + + remained = nr_shrink - (node_cnt + tree_cnt); + + spin_lock(&sbi->extent_lock); + for (; remained > 0; remained--) { + if (list_empty(&sbi->extent_list)) + break; + en = list_first_entry(&sbi->extent_list, + struct extent_node, list); + et = en->et; + if (!write_trylock(&et->lock)) { + /* refresh this extent node's position in extent list */ + list_move_tail(&en->list, &sbi->extent_list); + continue; + } + + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + + __detach_extent_node(sbi, et, en); + + write_unlock(&et->lock); + node_cnt++; + spin_lock(&sbi->extent_lock); + } + spin_unlock(&sbi->extent_lock); + +unlock_out: + mutex_unlock(&sbi->extent_tree_lock); +out: + trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); + + return node_cnt + tree_cnt; +} + +unsigned int f2fs_destroy_extent_node(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + unsigned int node_cnt = 0; + + if (!et || !atomic_read(&et->node_cnt)) + return 0; + + write_lock(&et->lock); + node_cnt = __free_extent_tree(sbi, et); + write_unlock(&et->lock); + + return node_cnt; +} + +void f2fs_drop_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + bool updated = false; + + if (!f2fs_may_extent_tree(inode)) + return; + + set_inode_flag(inode, FI_NO_EXTENT); + + write_lock(&et->lock); + __free_extent_tree(sbi, et); + if (et->largest.len) { + et->largest.len = 0; + updated = true; + } + write_unlock(&et->lock); + if (updated) + f2fs_mark_inode_dirty_sync(inode, true); +} + +void f2fs_destroy_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + unsigned int node_cnt = 0; + + if (!et) + return; + + if (inode->i_nlink && !is_bad_inode(inode) && + atomic_read(&et->node_cnt)) { + mutex_lock(&sbi->extent_tree_lock); + list_add_tail(&et->list, &sbi->zombie_list); + atomic_inc(&sbi->total_zombie_tree); + mutex_unlock(&sbi->extent_tree_lock); + return; + } + + /* free all extent info belong to this extent tree */ + node_cnt = f2fs_destroy_extent_node(inode); + + /* delete extent tree entry in radix tree */ + mutex_lock(&sbi->extent_tree_lock); + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); + radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); + kmem_cache_free(extent_tree_slab, et); + atomic_dec(&sbi->total_ext_tree); + mutex_unlock(&sbi->extent_tree_lock); + + F2FS_I(inode)->extent_tree = NULL; + + trace_f2fs_destroy_extent_tree(inode, node_cnt); +} + +bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!f2fs_may_extent_tree(inode)) + return false; + + return f2fs_lookup_extent_tree(inode, pgofs, ei); +} + +void f2fs_update_extent_cache(struct dnode_of_data *dn) +{ + pgoff_t fofs; + block_t blkaddr; + + if (!f2fs_may_extent_tree(dn->inode)) + return; + + if (dn->data_blkaddr == NEW_ADDR) + blkaddr = NULL_ADDR; + else + blkaddr = dn->data_blkaddr; + + fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + dn->ofs_in_node; + f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); +} + +void f2fs_update_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len) + +{ + if (!f2fs_may_extent_tree(dn->inode)) + return; + + f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); +} + +void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) +{ + INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); + mutex_init(&sbi->extent_tree_lock); + INIT_LIST_HEAD(&sbi->extent_list); + spin_lock_init(&sbi->extent_lock); + atomic_set(&sbi->total_ext_tree, 0); + INIT_LIST_HEAD(&sbi->zombie_list); + atomic_set(&sbi->total_zombie_tree, 0); + atomic_set(&sbi->total_ext_node, 0); +} + +int __init f2fs_create_extent_cache(void) +{ + extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree", + sizeof(struct extent_tree)); + if (!extent_tree_slab) + return -ENOMEM; + extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node", + sizeof(struct extent_node)); + if (!extent_node_slab) { + kmem_cache_destroy(extent_tree_slab); + return -ENOMEM; + } + return 0; +} + +void f2fs_destroy_extent_cache(void) +{ + kmem_cache_destroy(extent_node_slab); + kmem_cache_destroy(extent_tree_slab); +} diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h new file mode 100644 index 000000000..aacd8e117 --- /dev/null +++ b/fs/f2fs/f2fs.h @@ -0,0 +1,3504 @@ +/* + * fs/f2fs/f2fs.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef _LINUX_F2FS_H +#define _LINUX_F2FS_H + +#include <linux/types.h> +#include <linux/page-flags.h> +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/crc32.h> +#include <linux/magic.h> +#include <linux/kobject.h> +#include <linux/sched.h> +#include <linux/cred.h> +#include <linux/vmalloc.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/quotaops.h> +#include <crypto/hash.h> + +#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION) +#include <linux/fscrypt.h> + +#ifdef CONFIG_F2FS_CHECK_FS +#define f2fs_bug_on(sbi, condition) BUG_ON(condition) +#else +#define f2fs_bug_on(sbi, condition) \ + do { \ + if (unlikely(condition)) { \ + WARN_ON(1); \ + set_sbi_flag(sbi, SBI_NEED_FSCK); \ + } \ + } while (0) +#endif + +enum { + FAULT_KMALLOC, + FAULT_KVMALLOC, + FAULT_PAGE_ALLOC, + FAULT_PAGE_GET, + FAULT_ALLOC_BIO, + FAULT_ALLOC_NID, + FAULT_ORPHAN, + FAULT_BLOCK, + FAULT_DIR_DEPTH, + FAULT_EVICT_INODE, + FAULT_TRUNCATE, + FAULT_IO, + FAULT_CHECKPOINT, + FAULT_DISCARD, + FAULT_MAX, +}; + +#ifdef CONFIG_F2FS_FAULT_INJECTION +#define F2FS_ALL_FAULT_TYPE ((1 << FAULT_MAX) - 1) + +struct f2fs_fault_info { + atomic_t inject_ops; + unsigned int inject_rate; + unsigned int inject_type; +}; + +extern char *f2fs_fault_name[FAULT_MAX]; +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) +#endif + +/* + * For mount options + */ +#define F2FS_MOUNT_BG_GC 0x00000001 +#define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000002 +#define F2FS_MOUNT_DISCARD 0x00000004 +#define F2FS_MOUNT_NOHEAP 0x00000008 +#define F2FS_MOUNT_XATTR_USER 0x00000010 +#define F2FS_MOUNT_POSIX_ACL 0x00000020 +#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 +#define F2FS_MOUNT_INLINE_XATTR 0x00000080 +#define F2FS_MOUNT_INLINE_DATA 0x00000100 +#define F2FS_MOUNT_INLINE_DENTRY 0x00000200 +#define F2FS_MOUNT_FLUSH_MERGE 0x00000400 +#define F2FS_MOUNT_NOBARRIER 0x00000800 +#define F2FS_MOUNT_FASTBOOT 0x00001000 +#define F2FS_MOUNT_EXTENT_CACHE 0x00002000 +#define F2FS_MOUNT_FORCE_FG_GC 0x00004000 +#define F2FS_MOUNT_DATA_FLUSH 0x00008000 +#define F2FS_MOUNT_FAULT_INJECTION 0x00010000 +#define F2FS_MOUNT_ADAPTIVE 0x00020000 +#define F2FS_MOUNT_LFS 0x00040000 +#define F2FS_MOUNT_USRQUOTA 0x00080000 +#define F2FS_MOUNT_GRPQUOTA 0x00100000 +#define F2FS_MOUNT_PRJQUOTA 0x00200000 +#define F2FS_MOUNT_QUOTA 0x00400000 +#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 +#define F2FS_MOUNT_RESERVE_ROOT 0x01000000 + +#define F2FS_OPTION(sbi) ((sbi)->mount_opt) +#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) (F2FS_OPTION(sbi).opt |= F2FS_MOUNT_##option) +#define test_opt(sbi, option) (F2FS_OPTION(sbi).opt & F2FS_MOUNT_##option) + +#define ver_after(a, b) (typecheck(unsigned long long, a) && \ + typecheck(unsigned long long, b) && \ + ((long long)((a) - (b)) > 0)) + +typedef u32 block_t; /* + * should not change u32, since it is the on-disk block + * address format, __le32. + */ +typedef u32 nid_t; + +struct f2fs_mount_info { + unsigned int opt; + int write_io_size_bits; /* Write IO size bits */ + block_t root_reserved_blocks; /* root reserved blocks */ + kuid_t s_resuid; /* reserved blocks for uid */ + kgid_t s_resgid; /* reserved blocks for gid */ + int active_logs; /* # of active logs */ + int inline_xattr_size; /* inline xattr size */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info fault_info; /* For fault injection */ +#endif +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char *s_qf_names[MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif + /* For which write hints are passed down to block layer */ + int whint_mode; + int alloc_mode; /* segment allocation policy */ + int fsync_mode; /* fsync policy */ + bool test_dummy_encryption; /* test dummy encryption */ +}; + +#define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 +#define F2FS_FEATURE_EXTRA_ATTR 0x0008 +#define F2FS_FEATURE_PRJQUOTA 0x0010 +#define F2FS_FEATURE_INODE_CHKSUM 0x0020 +#define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 +#define F2FS_FEATURE_QUOTA_INO 0x0080 +#define F2FS_FEATURE_INODE_CRTIME 0x0100 +#define F2FS_FEATURE_LOST_FOUND 0x0200 +#define F2FS_FEATURE_VERITY 0x0400 /* reserved */ + +#define F2FS_HAS_FEATURE(sb, mask) \ + ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) +#define F2FS_SET_FEATURE(sb, mask) \ + (F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask)) +#define F2FS_CLEAR_FEATURE(sb, mask) \ + (F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)) + +/* + * Default values for user and/or group using reserved blocks + */ +#define F2FS_DEF_RESUID 0 +#define F2FS_DEF_RESGID 0 + +/* + * For checkpoint manager + */ +enum { + NAT_BITMAP, + SIT_BITMAP +}; + +#define CP_UMOUNT 0x00000001 +#define CP_FASTBOOT 0x00000002 +#define CP_SYNC 0x00000004 +#define CP_RECOVERY 0x00000008 +#define CP_DISCARD 0x00000010 +#define CP_TRIMMED 0x00000020 + +#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) +#define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ +#define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ +#define DEF_MID_DISCARD_ISSUE_TIME 500 /* 500 ms, if device busy */ +#define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ +#define DEF_DISCARD_URGENT_UTIL 80 /* do more discard over 80% */ +#define DEF_CP_INTERVAL 60 /* 60 secs */ +#define DEF_IDLE_INTERVAL 5 /* 5 secs */ + +struct cp_control { + int reason; + __u64 trim_start; + __u64 trim_end; + __u64 trim_minlen; +}; + +/* + * indicate meta/data type + */ +enum { + META_CP, + META_NAT, + META_SIT, + META_SSA, + META_POR, + DATA_GENERIC, + META_GENERIC, +}; + +/* for the list of ino */ +enum { + ORPHAN_INO, /* for orphan ino list */ + APPEND_INO, /* for append ino list */ + UPDATE_INO, /* for update ino list */ + TRANS_DIR_INO, /* for trasactions dir ino list */ + FLUSH_INO, /* for multiple device flushing */ + MAX_INO_ENTRY, /* max. list */ +}; + +struct ino_entry { + struct list_head list; /* list head */ + nid_t ino; /* inode number */ + unsigned int dirty_device; /* dirty device bitmap */ +}; + +/* for the list of inodes to be GCed */ +struct inode_entry { + struct list_head list; /* list head */ + struct inode *inode; /* vfs inode pointer */ +}; + +struct fsync_node_entry { + struct list_head list; /* list head */ + struct page *page; /* warm node page pointer */ + unsigned int seq_id; /* sequence id */ +}; + +/* for the bitmap indicate blocks to be discarded */ +struct discard_entry { + struct list_head list; /* list head */ + block_t start_blkaddr; /* start blockaddr of current segment */ + unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ +}; + +/* default discard granularity of inner discard thread, unit: block count */ +#define DEFAULT_DISCARD_GRANULARITY 16 + +/* max discard pend list number */ +#define MAX_PLIST_NUM 512 +#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ + (MAX_PLIST_NUM - 1) : (blk_num - 1)) + +enum { + D_PREP, /* initial */ + D_PARTIAL, /* partially submitted */ + D_SUBMIT, /* all submitted */ + D_DONE, /* finished */ +}; + +struct discard_info { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ +}; + +struct discard_cmd { + struct rb_node rb_node; /* rb node located in rb-tree */ + union { + struct { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ + }; + struct discard_info di; /* discard info */ + + }; + struct list_head list; /* command list */ + struct completion wait; /* compleation */ + struct block_device *bdev; /* bdev */ + unsigned short ref; /* reference count */ + unsigned char state; /* state */ + unsigned char issuing; /* issuing discard */ + int error; /* bio error */ + spinlock_t lock; /* for state/bio_ref updating */ + unsigned short bio_ref; /* bio reference count */ +}; + +enum { + DPOLICY_BG, + DPOLICY_FORCE, + DPOLICY_FSTRIM, + DPOLICY_UMOUNT, + MAX_DPOLICY, +}; + +struct discard_policy { + int type; /* type of discard */ + unsigned int min_interval; /* used for candidates exist */ + unsigned int mid_interval; /* used for device busy */ + unsigned int max_interval; /* used for candidates not exist */ + unsigned int max_requests; /* # of discards issued per round */ + unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ + bool io_aware; /* issue discard in idle time */ + bool sync; /* submit discard with REQ_SYNC flag */ + bool ordered; /* issue discard by lba order */ + unsigned int granularity; /* discard granularity */ +}; + +struct discard_cmd_control { + struct task_struct *f2fs_issue_discard; /* discard thread */ + struct list_head entry_list; /* 4KB discard entry list */ + struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ + struct list_head wait_list; /* store on-flushing entries */ + struct list_head fstrim_list; /* in-flight discard from fstrim */ + wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ + unsigned int discard_wake; /* to wake up discard thread */ + struct mutex cmd_lock; + unsigned int nr_discards; /* # of discards in the list */ + unsigned int max_discards; /* max. discards to be issued */ + unsigned int discard_granularity; /* discard granularity */ + unsigned int undiscard_blks; /* # of undiscard blocks */ + unsigned int next_pos; /* next discard position */ + atomic_t issued_discard; /* # of issued discard */ + atomic_t issing_discard; /* # of issing discard */ + atomic_t discard_cmd_cnt; /* # of cached cmd count */ + struct rb_root root; /* root of discard rb-tree */ + bool rbtree_check; /* config for consistence check */ +}; + +/* for the list of fsync inodes, used only during recovery */ +struct fsync_inode_entry { + struct list_head list; /* list head */ + struct inode *inode; /* vfs inode pointer */ + block_t blkaddr; /* block address locating the last fsync */ + block_t last_dentry; /* block address locating the last dentry */ +}; + +#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats)) +#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits)) + +#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne) +#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid) +#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se) +#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno) + +#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) +#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) + +static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) +{ + int before = nats_in_cursum(journal); + + journal->n_nats = cpu_to_le16(before + i); + return before; +} + +static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) +{ + int before = sits_in_cursum(journal); + + journal->n_sits = cpu_to_le16(before + i); + return before; +} + +static inline bool __has_cursum_space(struct f2fs_journal *journal, + int size, int type) +{ + if (type == NAT_JOURNAL) + return size <= MAX_NAT_JENTRIES(journal); + return size <= MAX_SIT_JENTRIES(journal); +} + +/* + * ioctl commands + */ +#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS +#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS +#define F2FS_IOC_GETVERSION FS_IOC_GETVERSION + +#define F2FS_IOCTL_MAGIC 0xf5 +#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1) +#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2) +#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) +#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) +#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) +#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32) +#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) +#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ + struct f2fs_defragment) +#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ + struct f2fs_move_range) +#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ + struct f2fs_flush_device) +#define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ + struct f2fs_gc_range) +#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) +#define F2FS_IOC_SET_PIN_FILE _IOW(F2FS_IOCTL_MAGIC, 13, __u32) +#define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32) +#define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15) + +#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY +#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY +#define F2FS_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT + +/* + * should be same as XFS_IOC_GOINGDOWN. + * Flags for going down operation used by FS_IOC_GOINGDOWN + */ +#define F2FS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* Shutdown */ +#define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */ +#define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */ +#define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */ +#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */ + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION +#endif + +#define F2FS_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define F2FS_IOC_FSSETXATTR FS_IOC_FSSETXATTR + +struct f2fs_gc_range { + u32 sync; + u64 start; + u64 len; +}; + +struct f2fs_defragment { + u64 start; + u64 len; +}; + +struct f2fs_move_range { + u32 dst_fd; /* destination fd */ + u64 pos_in; /* start position in src_fd */ + u64 pos_out; /* start position in dst_fd */ + u64 len; /* size to move */ +}; + +struct f2fs_flush_device { + u32 dev_num; /* device number to flush */ + u32 segments; /* # of segments to flush */ +}; + +/* for inline stuff */ +#define DEF_INLINE_RESERVED_SIZE 1 +static inline int get_extra_isize(struct inode *inode); +static inline int get_inline_xattr_addrs(struct inode *inode); +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + get_inline_xattr_addrs(inode) - \ + DEF_INLINE_RESERVED_SIZE)) + +/* for inline dir */ +#define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + BITS_PER_BYTE + 1)) +#define INLINE_DENTRY_BITMAP_SIZE(inode) ((NR_INLINE_DENTRY(inode) + \ + BITS_PER_BYTE - 1) / BITS_PER_BYTE) +#define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + NR_INLINE_DENTRY(inode) + \ + INLINE_DENTRY_BITMAP_SIZE(inode))) + +/* + * For INODE and NODE manager + */ +/* for directory operations */ +struct f2fs_dentry_ptr { + struct inode *inode; + void *bitmap; + struct f2fs_dir_entry *dentry; + __u8 (*filename)[F2FS_SLOT_LEN]; + int max; + int nr_bitmap; +}; + +static inline void make_dentry_ptr_block(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t) +{ + d->inode = inode; + d->max = NR_DENTRY_IN_BLOCK; + d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; + d->bitmap = t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; +} + +static inline void make_dentry_ptr_inline(struct inode *inode, + struct f2fs_dentry_ptr *d, void *t) +{ + int entry_cnt = NR_INLINE_DENTRY(inode); + int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode); + int reserved_size = INLINE_RESERVED_SIZE(inode); + + d->inode = inode; + d->max = entry_cnt; + d->nr_bitmap = bitmap_size; + d->bitmap = t; + d->dentry = t + bitmap_size + reserved_size; + d->filename = t + bitmap_size + reserved_size + + SIZE_OF_DIR_ENTRY * entry_cnt; +} + +/* + * XATTR_NODE_OFFSET stores xattrs to one node block per file keeping -1 + * as its node offset to distinguish from index node blocks. + * But some bits are used to mark the node block. + */ +#define XATTR_NODE_OFFSET ((((unsigned int)-1) << OFFSET_BIT_SHIFT) \ + >> OFFSET_BIT_SHIFT) +enum { + ALLOC_NODE, /* allocate a new node page if needed */ + LOOKUP_NODE, /* look up a node without readahead */ + LOOKUP_NODE_RA, /* + * look up a node with readahead called + * by get_data_block. + */ +}; + +#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */ + +#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ + +#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ + +/* for in-memory extent cache entry */ +#define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ + +/* number of extent info in extent cache we try to shrink */ +#define EXTENT_CACHE_SHRINK_NUMBER 128 + +struct rb_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ +}; + +struct extent_info { + unsigned int fofs; /* start offset in a file */ + unsigned int len; /* length of the extent */ + u32 blk; /* start block address of the extent */ +}; + +struct extent_node { + struct rb_node rb_node; + union { + struct { + unsigned int fofs; + unsigned int len; + u32 blk; + }; + struct extent_info ei; /* extent info */ + + }; + struct list_head list; /* node in global extent list of sbi */ + struct extent_tree *et; /* extent tree pointer */ +}; + +struct extent_tree { + nid_t ino; /* inode number */ + struct rb_root root; /* root of extent info rb-tree */ + struct extent_node *cached_en; /* recently accessed extent node */ + struct extent_info largest; /* largested extent info */ + struct list_head list; /* to be used by sbi->zombie_list */ + rwlock_t lock; /* protect extent info rb-tree */ + atomic_t node_cnt; /* # of extent node in rb-tree*/ + bool largest_updated; /* largest extent updated */ +}; + +/* + * This structure is taken from ext4_map_blocks. + * + * Note that, however, f2fs uses NEW and MAPPED flags for f2fs_map_blocks(). + */ +#define F2FS_MAP_NEW (1 << BH_New) +#define F2FS_MAP_MAPPED (1 << BH_Mapped) +#define F2FS_MAP_UNWRITTEN (1 << BH_Unwritten) +#define F2FS_MAP_FLAGS (F2FS_MAP_NEW | F2FS_MAP_MAPPED |\ + F2FS_MAP_UNWRITTEN) + +struct f2fs_map_blocks { + block_t m_pblk; + block_t m_lblk; + unsigned int m_len; + unsigned int m_flags; + pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ + pgoff_t *m_next_extent; /* point to next possible extent */ + int m_seg_type; +}; + +/* for flag in get_data_block */ +enum { + F2FS_GET_BLOCK_DEFAULT, + F2FS_GET_BLOCK_FIEMAP, + F2FS_GET_BLOCK_BMAP, + F2FS_GET_BLOCK_DIO, + F2FS_GET_BLOCK_PRE_DIO, + F2FS_GET_BLOCK_PRE_AIO, + F2FS_GET_BLOCK_PRECACHE, +}; + +/* + * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. + */ +#define FADVISE_COLD_BIT 0x01 +#define FADVISE_LOST_PINO_BIT 0x02 +#define FADVISE_ENCRYPT_BIT 0x04 +#define FADVISE_ENC_NAME_BIT 0x08 +#define FADVISE_KEEP_SIZE_BIT 0x10 +#define FADVISE_HOT_BIT 0x20 +#define FADVISE_VERITY_BIT 0x40 /* reserved */ + +#define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) + +#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) +#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) +#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) +#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) +#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) +#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) +#define file_is_encrypt(inode) is_file(inode, FADVISE_ENCRYPT_BIT) +#define file_set_encrypt(inode) set_file(inode, FADVISE_ENCRYPT_BIT) +#define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) +#define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) +#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) +#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT) +#define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT) +#define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT) + +#define DEF_DIR_LEVEL 0 + +enum { + GC_FAILURE_PIN, + GC_FAILURE_ATOMIC, + MAX_GC_FAILURE +}; + +struct f2fs_inode_info { + struct inode vfs_inode; /* serve a vfs inode */ + unsigned long i_flags; /* keep an inode flags for ioctl */ + unsigned char i_advise; /* use to give file attribute hints */ + unsigned char i_dir_level; /* use for dentry level for large dir */ + unsigned int i_current_depth; /* only for directory depth */ + /* for gc failure statistic */ + unsigned int i_gc_failures[MAX_GC_FAILURE]; + unsigned int i_pino; /* parent inode number */ + umode_t i_acl_mode; /* keep file acl mode temporarily */ + + /* Use below internally in f2fs*/ + unsigned long flags; /* use to pass per-file flags */ + struct rw_semaphore i_sem; /* protect fi info */ + atomic_t dirty_pages; /* # of dirty pages */ + f2fs_hash_t chash; /* hash value of given file name */ + unsigned int clevel; /* maximum level of given file name */ + struct task_struct *task; /* lookup and create consistency */ + struct task_struct *cp_task; /* separate cp/wb IO stats*/ + nid_t i_xattr_nid; /* node id that contains xattrs */ + loff_t last_disk_size; /* lastly written file size */ + +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; + + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif + struct list_head dirty_list; /* dirty list for dirs and files */ + struct list_head gdirty_list; /* linked in global dirty list */ + struct list_head inmem_ilist; /* list for inmem inodes */ + struct list_head inmem_pages; /* inmemory pages managed by f2fs */ + struct task_struct *inmem_task; /* store inmemory task */ + struct mutex inmem_lock; /* lock for inmemory pages */ + struct extent_tree *extent_tree; /* cached extent_tree entry */ + + /* avoid racing between foreground op and gc */ + struct rw_semaphore i_gc_rwsem[2]; + struct rw_semaphore i_mmap_sem; + struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ + + int i_extra_isize; /* size of extra space located in i_addr */ + kprojid_t i_projid; /* id for project quota */ + int i_inline_xattr_size; /* inline xattr size */ + struct timespec64 i_crtime; /* inode creation time */ + struct timespec64 i_disk_time[4];/* inode disk times */ +}; + +static inline void get_extent_info(struct extent_info *ext, + struct f2fs_extent *i_ext) +{ + ext->fofs = le32_to_cpu(i_ext->fofs); + ext->blk = le32_to_cpu(i_ext->blk); + ext->len = le32_to_cpu(i_ext->len); +} + +static inline void set_raw_extent(struct extent_info *ext, + struct f2fs_extent *i_ext) +{ + i_ext->fofs = cpu_to_le32(ext->fofs); + i_ext->blk = cpu_to_le32(ext->blk); + i_ext->len = cpu_to_le32(ext->len); +} + +static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, + u32 blk, unsigned int len) +{ + ei->fofs = fofs; + ei->blk = blk; + ei->len = len; +} + +static inline bool __is_discard_mergeable(struct discard_info *back, + struct discard_info *front, unsigned int max_len) +{ + return (back->lstart + back->len == front->lstart) && + (back->len + front->len <= max_len); +} + +static inline bool __is_discard_back_mergeable(struct discard_info *cur, + struct discard_info *back, unsigned int max_len) +{ + return __is_discard_mergeable(back, cur, max_len); +} + +static inline bool __is_discard_front_mergeable(struct discard_info *cur, + struct discard_info *front, unsigned int max_len) +{ + return __is_discard_mergeable(cur, front, max_len); +} + +static inline bool __is_extent_mergeable(struct extent_info *back, + struct extent_info *front) +{ + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); +} + +static inline bool __is_back_mergeable(struct extent_info *cur, + struct extent_info *back) +{ + return __is_extent_mergeable(back, cur); +} + +static inline bool __is_front_mergeable(struct extent_info *cur, + struct extent_info *front) +{ + return __is_extent_mergeable(cur, front); +} + +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); +static inline void __try_update_largest_extent(struct extent_tree *et, + struct extent_node *en) +{ + if (en->ei.len > et->largest.len) { + et->largest = en->ei; + et->largest_updated = true; + } +} + +/* + * For free nid management + */ +enum nid_state { + FREE_NID, /* newly added to free nid list */ + PREALLOC_NID, /* it is preallocated */ + MAX_NID_STATE, +}; + +struct f2fs_nm_info { + block_t nat_blkaddr; /* base disk address of NAT */ + nid_t max_nid; /* maximum possible node ids */ + nid_t available_nids; /* # of available node ids */ + nid_t next_scan_nid; /* the next nid to be scanned */ + unsigned int ram_thresh; /* control the memory footprint */ + unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ + unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */ + + /* NAT cache management */ + struct radix_tree_root nat_root;/* root of the nat entry cache */ + struct radix_tree_root nat_set_root;/* root of the nat set cache */ + struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ + struct list_head nat_entries; /* cached nat entry list (clean) */ + spinlock_t nat_list_lock; /* protect clean nat entry list */ + unsigned int nat_cnt; /* the # of cached nat entries */ + unsigned int dirty_nat_cnt; /* total num of nat entries in set */ + unsigned int nat_blocks; /* # of nat blocks */ + + /* free node ids management */ + struct radix_tree_root free_nid_root;/* root of the free_nid cache */ + struct list_head free_nid_list; /* list for free nids excluding preallocated nids */ + unsigned int nid_cnt[MAX_NID_STATE]; /* the number of free node id */ + spinlock_t nid_list_lock; /* protect nid lists ops */ + struct mutex build_lock; /* lock for build free nids */ + unsigned char **free_nid_bitmap; + unsigned char *nat_block_bitmap; + unsigned short *free_nid_count; /* free nid count of NAT block */ + + /* for checkpoint */ + char *nat_bitmap; /* NAT bitmap pointer */ + + unsigned int nat_bits_blocks; /* # of nat bits blocks */ + unsigned char *nat_bits; /* NAT bits blocks */ + unsigned char *full_nat_bits; /* full NAT pages */ + unsigned char *empty_nat_bits; /* empty NAT pages */ +#ifdef CONFIG_F2FS_CHECK_FS + char *nat_bitmap_mir; /* NAT bitmap mirror */ +#endif + int bitmap_size; /* bitmap size */ +}; + +/* + * this structure is used as one of function parameters. + * all the information are dedicated to a given direct node block determined + * by the data offset in a file. + */ +struct dnode_of_data { + struct inode *inode; /* vfs inode pointer */ + struct page *inode_page; /* its inode page, NULL is possible */ + struct page *node_page; /* cached direct node page */ + nid_t nid; /* node id of the direct node block */ + unsigned int ofs_in_node; /* data offset in the node page */ + bool inode_page_locked; /* inode page is locked or not */ + bool node_changed; /* is node block changed */ + char cur_level; /* level of hole node page */ + char max_level; /* level of current page located */ + block_t data_blkaddr; /* block address of the node block */ +}; + +static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, + struct page *ipage, struct page *npage, nid_t nid) +{ + memset(dn, 0, sizeof(*dn)); + dn->inode = inode; + dn->inode_page = ipage; + dn->node_page = npage; + dn->nid = nid; +} + +/* + * For SIT manager + * + * By default, there are 6 active log areas across the whole main area. + * When considering hot and cold data separation to reduce cleaning overhead, + * we split 3 for data logs and 3 for node logs as hot, warm, and cold types, + * respectively. + * In the current design, you should not change the numbers intentionally. + * Instead, as a mount option such as active_logs=x, you can use 2, 4, and 6 + * logs individually according to the underlying devices. (default: 6) + * Just in case, on-disk layout covers maximum 16 logs that consist of 8 for + * data and 8 for node logs. + */ +#define NR_CURSEG_DATA_TYPE (3) +#define NR_CURSEG_NODE_TYPE (3) +#define NR_CURSEG_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) + +enum { + CURSEG_HOT_DATA = 0, /* directory entry blocks */ + CURSEG_WARM_DATA, /* data blocks */ + CURSEG_COLD_DATA, /* multimedia or GCed data blocks */ + CURSEG_HOT_NODE, /* direct node blocks of directory files */ + CURSEG_WARM_NODE, /* direct node blocks of normal files */ + CURSEG_COLD_NODE, /* indirect node blocks */ + NO_CHECK_TYPE, +}; + +struct flush_cmd { + struct completion wait; + struct llist_node llnode; + nid_t ino; + int ret; +}; + +struct flush_cmd_control { + struct task_struct *f2fs_issue_flush; /* flush thread */ + wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + atomic_t issued_flush; /* # of issued flushes */ + atomic_t issing_flush; /* # of issing flushes */ + struct llist_head issue_list; /* list for command issue */ + struct llist_node *dispatch_list; /* list for command dispatch */ +}; + +struct f2fs_sm_info { + struct sit_info *sit_info; /* whole segment information */ + struct free_segmap_info *free_info; /* free segment information */ + struct dirty_seglist_info *dirty_info; /* dirty segment information */ + struct curseg_info *curseg_array; /* active segment information */ + + struct rw_semaphore curseg_lock; /* for preventing curseg change */ + + block_t seg0_blkaddr; /* block address of 0'th segment */ + block_t main_blkaddr; /* start block address of main area */ + block_t ssa_blkaddr; /* start block address of SSA area */ + + unsigned int segment_count; /* total # of segments */ + unsigned int main_segments; /* # of segments in main area */ + unsigned int reserved_segments; /* # of reserved segments */ + unsigned int ovp_segments; /* # of overprovision segments */ + + /* a threshold to reclaim prefree segments */ + unsigned int rec_prefree_segments; + + /* for batched trimming */ + unsigned int trim_sections; /* # of sections to trim */ + + struct list_head sit_entry_set; /* sit entry set list */ + + unsigned int ipu_policy; /* in-place-update policy */ + unsigned int min_ipu_util; /* in-place-update threshold */ + unsigned int min_fsync_blocks; /* threshold for fsync */ + unsigned int min_seq_blocks; /* threshold for sequential blocks */ + unsigned int min_hot_blocks; /* threshold for hot block allocation */ + unsigned int min_ssr_sections; /* threshold to trigger SSR allocation */ + + /* for flush command control */ + struct flush_cmd_control *fcc_info; + + /* for discard command control */ + struct discard_cmd_control *dcc_info; +}; + +/* + * For superblock + */ +/* + * COUNT_TYPE for monitoring + * + * f2fs monitors the number of several block types such as on-writeback, + * dirty dentry blocks, dirty node blocks, and dirty meta blocks. + */ +#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) +enum count_type { + F2FS_DIRTY_DENTS, + F2FS_DIRTY_DATA, + F2FS_DIRTY_QDATA, + F2FS_DIRTY_NODES, + F2FS_DIRTY_META, + F2FS_INMEM_PAGES, + F2FS_DIRTY_IMETA, + F2FS_WB_CP_DATA, + F2FS_WB_DATA, + NR_COUNT_TYPE, +}; + +/* + * The below are the page types of bios used in submit_bio(). + * The available types are: + * DATA User data pages. It operates as async mode. + * NODE Node pages. It operates as async mode. + * META FS metadata pages such as SIT, NAT, CP. + * NR_PAGE_TYPE The number of page types. + * META_FLUSH Make sure the previous pages are written + * with waiting the bio's completion + * ... Only can be used with META. + */ +#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type)) +enum page_type { + DATA, + NODE, + META, + NR_PAGE_TYPE, + META_FLUSH, + INMEM, /* the below types are used by tracepoints only. */ + INMEM_DROP, + INMEM_INVALIDATE, + INMEM_REVOKE, + IPU, + OPU, +}; + +enum temp_type { + HOT = 0, /* must be zero for meta bio */ + WARM, + COLD, + NR_TEMP_TYPE, +}; + +enum need_lock_type { + LOCK_REQ = 0, + LOCK_DONE, + LOCK_RETRY, +}; + +enum cp_reason_type { + CP_NO_NEEDED, + CP_NON_REGULAR, + CP_HARDLINK, + CP_SB_NEED_CP, + CP_WRONG_PINO, + CP_NO_SPC_ROLL, + CP_NODE_NEED_CP, + CP_FASTBOOT_MODE, + CP_SPEC_LOG_NUM, + CP_RECOVER_DIR, +}; + +enum iostat_type { + APP_DIRECT_IO, /* app direct IOs */ + APP_BUFFERED_IO, /* app buffered IOs */ + APP_WRITE_IO, /* app write IOs */ + APP_MAPPED_IO, /* app mapped IOs */ + FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ + FS_META_IO, /* meta IOs from kworker/reclaimer */ + FS_GC_DATA_IO, /* data IOs from forground gc */ + FS_GC_NODE_IO, /* node IOs from forground gc */ + FS_CP_DATA_IO, /* data IOs from checkpoint */ + FS_CP_NODE_IO, /* node IOs from checkpoint */ + FS_CP_META_IO, /* meta IOs from checkpoint */ + FS_DISCARD, /* discard */ + NR_IO_TYPE, +}; + +struct f2fs_io_info { + struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ + nid_t ino; /* inode number */ + enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + enum temp_type temp; /* contains HOT/WARM/COLD */ + int op; /* contains REQ_OP_ */ + int op_flags; /* req_flag_bits */ + block_t new_blkaddr; /* new block address to be written */ + block_t old_blkaddr; /* old block address before Cow */ + struct page *page; /* page to be written */ + struct page *encrypted_page; /* encrypted page */ + struct list_head list; /* serialize IOs */ + bool submitted; /* indicate IO submission */ + int need_lock; /* indicate we need to lock cp_rwsem */ + bool in_list; /* indicate fio is in io_list */ + bool is_meta; /* indicate borrow meta inode mapping or not */ + bool retry; /* need to reallocate block address */ + enum iostat_type io_type; /* io type */ + struct writeback_control *io_wbc; /* writeback control */ + unsigned char version; /* version of the node */ +}; + +#define is_read_io(rw) ((rw) == READ) +struct f2fs_bio_info { + struct f2fs_sb_info *sbi; /* f2fs superblock */ + struct bio *bio; /* bios to merge */ + sector_t last_block_in_bio; /* last block number */ + struct f2fs_io_info fio; /* store buffered io info. */ + struct rw_semaphore io_rwsem; /* blocking op for bio */ + spinlock_t io_lock; /* serialize DATA/NODE IOs */ + struct list_head io_list; /* track fios */ +}; + +#define FDEV(i) (sbi->devs[i]) +#define RDEV(i) (raw_super->devs[i]) +struct f2fs_dev_info { + struct block_device *bdev; + char path[MAX_PATH_LEN]; + unsigned int total_segments; + block_t start_blk; + block_t end_blk; +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + u8 *blkz_type; /* Array of zones type */ +#endif +}; + +enum inode_type { + DIR_INODE, /* for dirty dir inode */ + FILE_INODE, /* for dirty regular/symlink inode */ + DIRTY_META, /* for all dirtied inode metadata */ + ATOMIC_FILE, /* for all atomic files */ + NR_INODE_TYPE, +}; + +/* for inner inode cache management */ +struct inode_management { + struct radix_tree_root ino_root; /* ino entry array */ + spinlock_t ino_lock; /* for ino entry lock */ + struct list_head ino_list; /* inode list head */ + unsigned long ino_num; /* number of entries */ +}; + +/* For s_flag in struct f2fs_sb_info */ +enum { + SBI_IS_DIRTY, /* dirty flag for checkpoint */ + SBI_IS_CLOSE, /* specify unmounting */ + SBI_NEED_FSCK, /* need fsck.f2fs to fix */ + SBI_POR_DOING, /* recovery is doing or not */ + SBI_NEED_SB_WRITE, /* need to recover superblock */ + SBI_NEED_CP, /* need to checkpoint */ + SBI_IS_SHUTDOWN, /* shutdown by ioctl */ + SBI_IS_RECOVERED, /* recovered orphan/data */ +}; + +enum { + CP_TIME, + REQ_TIME, + MAX_TIME, +}; + +enum { + GC_NORMAL, + GC_IDLE_CB, + GC_IDLE_GREEDY, + GC_URGENT, +}; + +enum { + WHINT_MODE_OFF, /* not pass down write hints */ + WHINT_MODE_USER, /* try to pass down hints given by users */ + WHINT_MODE_FS, /* pass down hints with F2FS policy */ +}; + +enum { + ALLOC_MODE_DEFAULT, /* stay default */ + ALLOC_MODE_REUSE, /* reuse segments as much as possible */ +}; + +enum fsync_mode { + FSYNC_MODE_POSIX, /* fsync follows posix semantics */ + FSYNC_MODE_STRICT, /* fsync behaves in line with ext4 */ + FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */ +}; + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#define DUMMY_ENCRYPTION_ENABLED(sbi) \ + (unlikely(F2FS_OPTION(sbi).test_dummy_encryption)) +#else +#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) +#endif + +struct f2fs_sb_info { + struct super_block *sb; /* pointer to VFS super block */ + struct proc_dir_entry *s_proc; /* proc entry */ + struct f2fs_super_block *raw_super; /* raw super block pointer */ + struct rw_semaphore sb_lock; /* lock for raw super block */ + int valid_super_block; /* valid super block no */ + unsigned long s_flag; /* flags for sbi */ + struct mutex writepages; /* mutex for writepages() */ + +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int blocks_per_blkz; /* F2FS blocks per zone */ + unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ +#endif + + /* for node-related operations */ + struct f2fs_nm_info *nm_info; /* node manager */ + struct inode *node_inode; /* cache node blocks */ + + /* for segment-related operations */ + struct f2fs_sm_info *sm_info; /* segment manager */ + + /* for bio operations */ + struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ + struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; + /* bio ordering for NODE/DATA */ + /* keep migration IO order for LFS mode */ + struct rw_semaphore io_order_lock; + mempool_t *write_io_dummy; /* Dummy pages */ + + /* for checkpoint */ + struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + int cur_cp_pack; /* remain current cp pack */ + spinlock_t cp_lock; /* for flag in ckpt */ + struct inode *meta_inode; /* cache meta blocks */ + struct mutex cp_mutex; /* checkpoint procedure lock */ + struct rw_semaphore cp_rwsem; /* blocking FS operations */ + struct rw_semaphore node_write; /* locking node writes */ + struct rw_semaphore node_change; /* locking node change */ + wait_queue_head_t cp_wait; + unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ + long interval_time[MAX_TIME]; /* to store thresholds */ + + struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ + + spinlock_t fsync_node_lock; /* for node entry lock */ + struct list_head fsync_node_list; /* node list head */ + unsigned int fsync_seg_id; /* sequence id */ + unsigned int fsync_node_num; /* number of node entries */ + + /* for orphan inode, use 0'th array */ + unsigned int max_orphans; /* max orphan inodes */ + + /* for inode management */ + struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */ + spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */ + + /* for extent tree cache */ + struct radix_tree_root extent_tree_root;/* cache extent cache entries */ + struct mutex extent_tree_lock; /* locking extent radix tree */ + struct list_head extent_list; /* lru list for shrinker */ + spinlock_t extent_lock; /* locking extent lru list */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ + atomic_t total_ext_node; /* extent info count */ + + /* basic filesystem units */ + unsigned int log_sectors_per_block; /* log2 sectors per block */ + unsigned int log_blocksize; /* log2 block size */ + unsigned int blocksize; /* block size */ + unsigned int root_ino_num; /* root inode number*/ + unsigned int node_ino_num; /* node inode number*/ + unsigned int meta_ino_num; /* meta inode number*/ + unsigned int log_blocks_per_seg; /* log2 blocks per segment */ + unsigned int blocks_per_seg; /* blocks per segment */ + unsigned int segs_per_sec; /* segments per section */ + unsigned int secs_per_zone; /* sections per zone */ + unsigned int total_sections; /* total section count */ + unsigned int total_node_count; /* total node block count */ + unsigned int total_valid_node_count; /* valid node block count */ + loff_t max_file_blocks; /* max block index of file */ + int dir_level; /* directory level */ + unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */ + int readdir_ra; /* readahead inode in readdir */ + + block_t user_block_count; /* # of user blocks */ + block_t total_valid_block_count; /* # of valid blocks */ + block_t discard_blks; /* discard command candidats */ + block_t last_valid_block_count; /* for recovery */ + block_t reserved_blocks; /* configurable reserved blocks */ + block_t current_reserved_blocks; /* current reserved blocks */ + + unsigned int nquota_files; /* # of quota sysfile */ + + u32 s_next_generation; /* for NFS support */ + + /* # of pages, see count_type */ + atomic_t nr_pages[NR_COUNT_TYPE]; + /* # of allocated blocks */ + struct percpu_counter alloc_valid_block_count; + + /* writeback control */ + atomic_t wb_sync_req[META]; /* count # of WB_SYNC threads */ + + /* valid inode count */ + struct percpu_counter total_valid_inode_count; + + struct f2fs_mount_info mount_opt; /* mount options */ + + /* for cleaning operations */ + struct mutex gc_mutex; /* mutex for GC */ + struct f2fs_gc_kthread *gc_thread; /* GC thread */ + unsigned int cur_victim_sec; /* current victim section num */ + unsigned int gc_mode; /* current GC state */ + /* for skip statistic */ + unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ + unsigned long long skipped_gc_rwsem; /* FG_GC only */ + + /* threshold for gc trials on pinned files */ + u64 gc_pin_file_threshold; + + /* maximum # of trials to find a victim segment for SSR and GC */ + unsigned int max_victim_search; + + /* + * for stat information. + * one is for the LFS mode, and the other is for the SSR mode. + */ +#ifdef CONFIG_F2FS_STAT_FS + struct f2fs_stat_info *stat_info; /* FS status information */ + unsigned int segment_count[2]; /* # of allocated segments */ + unsigned int block_count[2]; /* # of allocated blocks */ + atomic_t inplace_count; /* # of inplace update */ + atomic64_t total_hit_ext; /* # of lookup extent cache */ + atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */ + atomic64_t read_hit_largest; /* # of hit largest extent node */ + atomic64_t read_hit_cached; /* # of hit cached extent node */ + atomic_t inline_xattr; /* # of inline_xattr inodes */ + atomic_t inline_inode; /* # of inline_data inodes */ + atomic_t inline_dir; /* # of inline_dentry inodes */ + atomic_t aw_cnt; /* # of atomic writes */ + atomic_t vw_cnt; /* # of volatile writes */ + atomic_t max_aw_cnt; /* max # of atomic writes */ + atomic_t max_vw_cnt; /* max # of volatile writes */ + int bg_gc; /* background gc calls */ + unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ +#endif + spinlock_t stat_lock; /* lock for stat operations */ + + /* For app/fs IO statistics */ + spinlock_t iostat_lock; + unsigned long long write_iostat[NR_IO_TYPE]; + bool iostat_enable; + + /* For sysfs suppport */ + struct kobject s_kobj; + struct completion s_kobj_unregister; + + /* For shrinker support */ + struct list_head s_list; + int s_ndevs; /* number of devices */ + struct f2fs_dev_info *devs; /* for device list */ + unsigned int dirty_device; /* for checkpoint data flush */ + spinlock_t dev_lock; /* protect dirty_device */ + struct mutex umount_mutex; + unsigned int shrinker_run_no; + + /* For write statistics */ + u64 sectors_written_start; + u64 kbytes_written; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_chksum_seed; +}; + +#ifdef CONFIG_F2FS_FAULT_INJECTION +#define f2fs_show_injection_info(type) \ + printk("%sF2FS-fs : inject %s in %s of %pF\n", \ + KERN_INFO, f2fs_fault_name[type], \ + __func__, __builtin_return_address(0)) +static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; + + if (!ffi->inject_rate) + return false; + + if (!IS_FAULT_SET(ffi, type)) + return false; + + atomic_inc(&ffi->inject_ops); + if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { + atomic_set(&ffi->inject_ops, 0); + return true; + } + return false; +} +#else +#define f2fs_show_injection_info(type) do { } while (0) +static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) +{ + return false; +} +#endif + +/* + * Test if the mounted volume is a multi-device volume. + * - For a single regular disk volume, sbi->s_ndevs is 0. + * - For a single zoned disk volume, sbi->s_ndevs is 1. + * - For a multi-device volume, sbi->s_ndevs is always 2 or more. + */ +static inline bool f2fs_is_multi_device(struct f2fs_sb_info *sbi) +{ + return sbi->s_ndevs > 1; +} + +/* For write statistics. Suppose sector size is 512 bytes, + * and the return value is in kbytes. s is of struct f2fs_sb_info. + */ +#define BD_PART_WRITTEN(s) \ +(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[STAT_WRITE]) - \ + (s)->sectors_written_start) >> 1) + +static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) +{ + sbi->last_time[type] = jiffies; +} + +static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) +{ + unsigned long interval = sbi->interval_time[type] * HZ; + + return time_after(jiffies, sbi->last_time[type] + interval); +} + +static inline bool is_idle(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct request_queue *q = bdev_get_queue(bdev); + struct request_list *rl = &q->root_rl; + + if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC]) + return false; + + return f2fs_time_over(sbi, REQ_TIME); +} + +/* + * Inline functions + */ +static inline u32 __f2fs_crc32(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + int err; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + +static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, + unsigned int length) +{ + return __f2fs_crc32(sbi, F2FS_SUPER_MAGIC, address, length); +} + +static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, + void *buf, size_t buf_size) +{ + return f2fs_crc32(sbi, buf, buf_size) == blk_crc; +} + +static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + return __f2fs_crc32(sbi, crc, address, length); +} + +static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) +{ + return container_of(inode, struct f2fs_inode_info, vfs_inode); +} + +static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode) +{ + return F2FS_SB(inode->i_sb); +} + +static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping) +{ + return F2FS_I_SB(mapping->host); +} + +static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page) +{ + return F2FS_M_SB(page->mapping); +} + +static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_super_block *)(sbi->raw_super); +} + +static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_checkpoint *)(sbi->ckpt); +} + +static inline struct f2fs_node *F2FS_NODE(struct page *page) +{ + return (struct f2fs_node *)page_address(page); +} + +static inline struct f2fs_inode *F2FS_INODE(struct page *page) +{ + return &((struct f2fs_node *)page_address(page))->i; +} + +static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_nm_info *)(sbi->nm_info); +} + +static inline struct f2fs_sm_info *SM_I(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_sm_info *)(sbi->sm_info); +} + +static inline struct sit_info *SIT_I(struct f2fs_sb_info *sbi) +{ + return (struct sit_info *)(SM_I(sbi)->sit_info); +} + +static inline struct free_segmap_info *FREE_I(struct f2fs_sb_info *sbi) +{ + return (struct free_segmap_info *)(SM_I(sbi)->free_info); +} + +static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi) +{ + return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info); +} + +static inline struct address_space *META_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->meta_inode->i_mapping; +} + +static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->node_inode->i_mapping; +} + +static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type) +{ + return test_bit(type, &sbi->s_flag); +} + +static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) +{ + set_bit(type, &sbi->s_flag); +} + +static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) +{ + clear_bit(type, &sbi->s_flag); +} + +static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) +{ + return le64_to_cpu(cp->checkpoint_ver); +} + +static inline unsigned long f2fs_qf_ino(struct super_block *sb, int type) +{ + if (type < F2FS_MAX_QUOTAS) + return le32_to_cpu(F2FS_SB(sb)->raw_super->qf_ino[type]); + return 0; +} + +static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) +{ + size_t crc_offset = le32_to_cpu(cp->checksum_offset); + return le32_to_cpu(*((__le32 *)((unsigned char *)cp + crc_offset))); +} + +static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + + return ckpt_flags & f; +} + +static inline bool is_set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) +{ + return __is_set_ckpt_flags(F2FS_CKPT(sbi), f); +} + +static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); + ckpt_flags |= f; + cp->ckpt_flags = cpu_to_le32(ckpt_flags); +} + +static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) +{ + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); + __set_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); + ckpt_flags &= (~f); + cp->ckpt_flags = cpu_to_le32(ckpt_flags); +} + +static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) +{ + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); + __clear_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) +{ + unsigned long flags; + + set_sbi_flag(sbi, SBI_NEED_FSCK); + + if (lock) + spin_lock_irqsave(&sbi->cp_lock, flags); + __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); + kfree(NM_I(sbi)->nat_bits); + NM_I(sbi)->nat_bits = NULL; + if (lock) + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + + return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set; +} + +static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) +{ + down_read(&sbi->cp_rwsem); +} + +static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) +{ + return down_read_trylock(&sbi->cp_rwsem); +} + +static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) +{ + up_read(&sbi->cp_rwsem); +} + +static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) +{ + down_write(&sbi->cp_rwsem); +} + +static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) +{ + up_write(&sbi->cp_rwsem); +} + +static inline int __get_cp_reason(struct f2fs_sb_info *sbi) +{ + int reason = CP_SYNC; + + if (test_opt(sbi, FASTBOOT)) + reason = CP_FASTBOOT; + if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) + reason = CP_UMOUNT; + return reason; +} + +static inline bool __remain_node_summaries(int reason) +{ + return (reason & (CP_UMOUNT | CP_FASTBOOT)); +} + +static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) +{ + return (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG) || + is_set_ckpt_flags(sbi, CP_FASTBOOT_FLAG)); +} + +/* + * Check whether the inode has blocks or not + */ +static inline int F2FS_HAS_BLOCKS(struct inode *inode) +{ + block_t xattr_block = F2FS_I(inode)->i_xattr_nid ? 1 : 0; + + return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > xattr_block; +} + +static inline bool f2fs_has_xattr_block(unsigned int ofs) +{ + return ofs == XATTR_NODE_OFFSET; +} + +static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, + struct inode *inode, bool cap) +{ + if (!inode) + return true; + if (!test_opt(sbi, RESERVE_ROOT)) + return false; + if (IS_NOQUOTA(inode)) + return true; + if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid())) + return true; + if (!gid_eq(F2FS_OPTION(sbi).s_resgid, GLOBAL_ROOT_GID) && + in_group_p(F2FS_OPTION(sbi).s_resgid)) + return true; + if (cap && capable(CAP_SYS_RESOURCE)) + return true; + return false; +} + +static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); +static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, + struct inode *inode, blkcnt_t *count) +{ + blkcnt_t diff = 0, release = 0; + block_t avail_user_block_count; + int ret; + + ret = dquot_reserve_block(inode, *count); + if (ret) + return ret; + + if (time_to_inject(sbi, FAULT_BLOCK)) { + f2fs_show_injection_info(FAULT_BLOCK); + release = *count; + goto release_quota; + } + + /* + * let's increase this in prior to actual block count change in order + * for f2fs_sync_file to avoid data races when deciding checkpoint. + */ + percpu_counter_add(&sbi->alloc_valid_block_count, (*count)); + + spin_lock(&sbi->stat_lock); + sbi->total_valid_block_count += (block_t)(*count); + avail_user_block_count = sbi->user_block_count - + sbi->current_reserved_blocks; + + if (!__allow_reserved_blocks(sbi, inode, true)) + avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; + + if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { + diff = sbi->total_valid_block_count - avail_user_block_count; + if (diff > *count) + diff = *count; + *count -= diff; + release = diff; + sbi->total_valid_block_count -= diff; + if (!*count) { + spin_unlock(&sbi->stat_lock); + goto enospc; + } + } + spin_unlock(&sbi->stat_lock); + + if (unlikely(release)) { + percpu_counter_sub(&sbi->alloc_valid_block_count, release); + dquot_release_reservation_block(inode, release); + } + f2fs_i_blocks_write(inode, *count, true, true); + return 0; + +enospc: + percpu_counter_sub(&sbi->alloc_valid_block_count, release); +release_quota: + dquot_release_reservation_block(inode, release); + return -ENOSPC; +} + +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); +static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, + struct inode *inode, + block_t count) +{ + blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK; + + spin_lock(&sbi->stat_lock); + f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); + sbi->total_valid_block_count -= (block_t)count; + if (sbi->reserved_blocks && + sbi->current_reserved_blocks < sbi->reserved_blocks) + sbi->current_reserved_blocks = min(sbi->reserved_blocks, + sbi->current_reserved_blocks + count); + spin_unlock(&sbi->stat_lock); + if (unlikely(inode->i_blocks < sectors)) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Inconsistent i_blocks, ino:%lu, iblocks:%llu, sectors:%llu", + inode->i_ino, + (unsigned long long)inode->i_blocks, + (unsigned long long)sectors); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return; + } + f2fs_i_blocks_write(inode, count, false, true); +} + +static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) +{ + atomic_inc(&sbi->nr_pages[count_type]); + + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || + count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA) + return; + + set_sbi_flag(sbi, SBI_IS_DIRTY); +} + +static inline void inode_inc_dirty_pages(struct inode *inode) +{ + atomic_inc(&F2FS_I(inode)->dirty_pages); + inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); + if (IS_NOQUOTA(inode)) + inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); +} + +static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) +{ + atomic_dec(&sbi->nr_pages[count_type]); +} + +static inline void inode_dec_dirty_pages(struct inode *inode) +{ + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + return; + + atomic_dec(&F2FS_I(inode)->dirty_pages); + dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); + if (IS_NOQUOTA(inode)) + dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); +} + +static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) +{ + return atomic_read(&sbi->nr_pages[count_type]); +} + +static inline int get_dirty_pages(struct inode *inode) +{ + return atomic_read(&F2FS_I(inode)->dirty_pages); +} + +static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) +{ + unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; + unsigned int segs = (get_pages(sbi, block_type) + pages_per_sec - 1) >> + sbi->log_blocks_per_seg; + + return segs / sbi->segs_per_sec; +} + +static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) +{ + return sbi->total_valid_block_count; +} + +static inline block_t discard_blocks(struct f2fs_sb_info *sbi) +{ + return sbi->discard_blks; +} + +static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + + /* return NAT or SIT bitmap */ + if (flag == NAT_BITMAP) + return le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); + else if (flag == SIT_BITMAP) + return le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); + + return 0; +} + +static inline block_t __cp_payload(struct f2fs_sb_info *sbi) +{ + return le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); +} + +static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + int offset; + + if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) { + offset = (flag == SIT_BITMAP) ? + le32_to_cpu(ckpt->nat_ver_bitmap_bytesize) : 0; + return &ckpt->sit_nat_version_bitmap + offset; + } + + if (__cp_payload(sbi) > 0) { + if (flag == NAT_BITMAP) + return &ckpt->sit_nat_version_bitmap; + else + return (unsigned char *)ckpt + F2FS_BLKSIZE; + } else { + offset = (flag == NAT_BITMAP) ? + le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; + return &ckpt->sit_nat_version_bitmap + offset; + } +} + +static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) +{ + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + + if (sbi->cur_cp_pack == 2) + start_addr += sbi->blocks_per_seg; + return start_addr; +} + +static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi) +{ + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + + if (sbi->cur_cp_pack == 1) + start_addr += sbi->blocks_per_seg; + return start_addr; +} + +static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi) +{ + sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 2 : 1; +} + +static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) +{ + return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); +} + +static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, + struct inode *inode, bool is_inode) +{ + block_t valid_block_count; + unsigned int valid_node_count; + bool quota = inode && !is_inode; + + if (quota) { + int ret = dquot_reserve_block(inode, 1); + if (ret) + return ret; + } + + if (time_to_inject(sbi, FAULT_BLOCK)) { + f2fs_show_injection_info(FAULT_BLOCK); + goto enospc; + } + + spin_lock(&sbi->stat_lock); + + valid_block_count = sbi->total_valid_block_count + + sbi->current_reserved_blocks + 1; + + if (!__allow_reserved_blocks(sbi, inode, false)) + valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; + + if (unlikely(valid_block_count > sbi->user_block_count)) { + spin_unlock(&sbi->stat_lock); + goto enospc; + } + + valid_node_count = sbi->total_valid_node_count + 1; + if (unlikely(valid_node_count > sbi->total_node_count)) { + spin_unlock(&sbi->stat_lock); + goto enospc; + } + + sbi->total_valid_node_count++; + sbi->total_valid_block_count++; + spin_unlock(&sbi->stat_lock); + + if (inode) { + if (is_inode) + f2fs_mark_inode_dirty_sync(inode, true); + else + f2fs_i_blocks_write(inode, 1, true, true); + } + + percpu_counter_inc(&sbi->alloc_valid_block_count); + return 0; + +enospc: + if (quota) + dquot_release_reservation_block(inode, 1); + return -ENOSPC; +} + +static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, + struct inode *inode, bool is_inode) +{ + spin_lock(&sbi->stat_lock); + + f2fs_bug_on(sbi, !sbi->total_valid_block_count); + f2fs_bug_on(sbi, !sbi->total_valid_node_count); + f2fs_bug_on(sbi, !is_inode && !inode->i_blocks); + + sbi->total_valid_node_count--; + sbi->total_valid_block_count--; + if (sbi->reserved_blocks && + sbi->current_reserved_blocks < sbi->reserved_blocks) + sbi->current_reserved_blocks++; + + spin_unlock(&sbi->stat_lock); + + if (!is_inode) + f2fs_i_blocks_write(inode, 1, false, true); +} + +static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) +{ + return sbi->total_valid_node_count; +} + +static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) +{ + percpu_counter_inc(&sbi->total_valid_inode_count); +} + +static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) +{ + percpu_counter_dec(&sbi->total_valid_inode_count); +} + +static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) +{ + return percpu_counter_sum_positive(&sbi->total_valid_inode_count); +} + +static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, + pgoff_t index, bool for_write) +{ + struct page *page; + + if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) { + if (!for_write) + page = find_get_page_flags(mapping, index, + FGP_LOCK | FGP_ACCESSED); + else + page = find_lock_page(mapping, index); + if (page) + return page; + + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { + f2fs_show_injection_info(FAULT_PAGE_ALLOC); + return NULL; + } + } + + if (!for_write) + return grab_cache_page(mapping, index); + return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); +} + +static inline struct page *f2fs_pagecache_get_page( + struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp_mask) +{ + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) { + f2fs_show_injection_info(FAULT_PAGE_GET); + return NULL; + } + + return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); +} + +static inline void f2fs_copy_page(struct page *src, struct page *dst) +{ + char *src_kaddr = kmap(src); + char *dst_kaddr = kmap(dst); + + memcpy(dst_kaddr, src_kaddr, PAGE_SIZE); + kunmap(dst); + kunmap(src); +} + +static inline void f2fs_put_page(struct page *page, int unlock) +{ + if (!page) + return; + + if (unlock) { + f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page)); + unlock_page(page); + } + put_page(page); +} + +static inline void f2fs_put_dnode(struct dnode_of_data *dn) +{ + if (dn->node_page) + f2fs_put_page(dn->node_page, 1); + if (dn->inode_page && dn->node_page != dn->inode_page) + f2fs_put_page(dn->inode_page, 0); + dn->node_page = NULL; + dn->inode_page = NULL; +} + +static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, + size_t size) +{ + return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); +} + +static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, + gfp_t flags) +{ + void *entry; + + entry = kmem_cache_alloc(cachep, flags); + if (!entry) + entry = kmem_cache_alloc(cachep, flags | __GFP_NOFAIL); + return entry; +} + +static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, + int npages, bool no_fail) +{ + struct bio *bio; + + if (no_fail) { + /* No failure on bio allocation */ + bio = bio_alloc(GFP_NOIO, npages); + if (!bio) + bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); + return bio; + } + if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { + f2fs_show_injection_info(FAULT_ALLOC_BIO); + return NULL; + } + + return bio_alloc(GFP_KERNEL, npages); +} + +static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, + unsigned long index, void *item) +{ + while (radix_tree_insert(root, index, item)) + cond_resched(); +} + +#define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino) + +static inline bool IS_INODE(struct page *page) +{ + struct f2fs_node *p = F2FS_NODE(page); + + return RAW_IS_INODE(p); +} + +static inline int offset_in_addr(struct f2fs_inode *i) +{ + return (i->i_inline & F2FS_EXTRA_ATTR) ? + (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0; +} + +static inline __le32 *blkaddr_in_node(struct f2fs_node *node) +{ + return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr; +} + +static inline int f2fs_has_extra_attr(struct inode *inode); +static inline block_t datablock_addr(struct inode *inode, + struct page *node_page, unsigned int offset) +{ + struct f2fs_node *raw_node; + __le32 *addr_array; + int base = 0; + bool is_inode = IS_INODE(node_page); + + raw_node = F2FS_NODE(node_page); + + /* from GC path only */ + if (is_inode) { + if (!inode) + base = offset_in_addr(&raw_node->i); + else if (f2fs_has_extra_attr(inode)) + base = get_extra_isize(inode); + } + + addr_array = blkaddr_in_node(raw_node); + return le32_to_cpu(addr_array[base + offset]); +} + +static inline int f2fs_test_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + return mask & *addr; +} + +static inline void f2fs_set_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + *addr |= mask; +} + +static inline void f2fs_clear_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + *addr &= ~mask; +} + +static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr) +{ + int mask; + int ret; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + ret = mask & *addr; + *addr |= mask; + return ret; +} + +static inline int f2fs_test_and_clear_bit(unsigned int nr, char *addr) +{ + int mask; + int ret; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + ret = mask & *addr; + *addr &= ~mask; + return ret; +} + +static inline void f2fs_change_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + *addr ^= mask; +} + +/* + * Inode flags + */ +#define F2FS_SECRM_FL 0x00000001 /* Secure deletion */ +#define F2FS_UNRM_FL 0x00000002 /* Undelete */ +#define F2FS_COMPR_FL 0x00000004 /* Compress file */ +#define F2FS_SYNC_FL 0x00000008 /* Synchronous updates */ +#define F2FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define F2FS_APPEND_FL 0x00000020 /* writes to file may only append */ +#define F2FS_NODUMP_FL 0x00000040 /* do not dump file */ +#define F2FS_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define F2FS_DIRTY_FL 0x00000100 +#define F2FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define F2FS_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define F2FS_ENCRYPT_FL 0x00000800 /* encrypted file */ +/* End compression flags --- maybe not all used */ +#define F2FS_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define F2FS_IMAGIC_FL 0x00002000 /* AFS directory */ +#define F2FS_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define F2FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define F2FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define F2FS_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define F2FS_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define F2FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define F2FS_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define F2FS_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ +#define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define F2FS_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define F2FS_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ +#define F2FS_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ + +/* Flags we can manipulate with through F2FS_IOC_FSSETXATTR */ +#define F2FS_FL_XFLAG_VISIBLE (F2FS_SYNC_FL | \ + F2FS_IMMUTABLE_FL | \ + F2FS_APPEND_FL | \ + F2FS_NODUMP_FL | \ + F2FS_NOATIME_FL | \ + F2FS_PROJINHERIT_FL) + +/* Flags that should be inherited by new inodes from their parent. */ +#define F2FS_FL_INHERITED (F2FS_SECRM_FL | F2FS_UNRM_FL | F2FS_COMPR_FL |\ + F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL |\ + F2FS_NOCOMPR_FL | F2FS_JOURNAL_DATA_FL |\ + F2FS_NOTAIL_FL | F2FS_DIRSYNC_FL |\ + F2FS_PROJINHERIT_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define F2FS_OTHER_FLMASK (F2FS_NODUMP_FL | F2FS_NOATIME_FL) + +static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & F2FS_REG_FLMASK; + else + return flags & F2FS_OTHER_FLMASK; +} + +/* used for f2fs_inode_info->flags */ +enum { + FI_NEW_INODE, /* indicate newly allocated inode */ + FI_DIRTY_INODE, /* indicate inode is dirty or not */ + FI_AUTO_RECOVER, /* indicate inode is recoverable */ + FI_DIRTY_DIR, /* indicate directory has dirty pages */ + FI_INC_LINK, /* need to increment i_nlink */ + FI_ACL_MODE, /* indicate acl mode */ + FI_NO_ALLOC, /* should not allocate any blocks */ + FI_FREE_NID, /* free allocated nide */ + FI_NO_EXTENT, /* not to use the extent cache */ + FI_INLINE_XATTR, /* used for inline xattr */ + FI_INLINE_DATA, /* used for inline data*/ + FI_INLINE_DENTRY, /* used for inline dentry */ + FI_APPEND_WRITE, /* inode has appended data */ + FI_UPDATE_WRITE, /* inode has in-place-update data */ + FI_NEED_IPU, /* used for ipu per file */ + FI_ATOMIC_FILE, /* indicate atomic file */ + FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ + FI_VOLATILE_FILE, /* indicate volatile file */ + FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ + FI_DROP_CACHE, /* drop dirty page cache */ + FI_DATA_EXIST, /* indicate data exists */ + FI_INLINE_DOTS, /* indicate inline dot dentries */ + FI_DO_DEFRAG, /* indicate defragment is running */ + FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ + FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_HOT_DATA, /* indicate file is hot */ + FI_EXTRA_ATTR, /* indicate file has extra attribute */ + FI_PROJ_INHERIT, /* indicate file inherits projectid */ + FI_PIN_FILE, /* indicate file should not be gced */ + FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ +}; + +static inline void __mark_inode_dirty_flag(struct inode *inode, + int flag, bool set) +{ + switch (flag) { + case FI_INLINE_XATTR: + case FI_INLINE_DATA: + case FI_INLINE_DENTRY: + case FI_NEW_INODE: + if (set) + return; + case FI_DATA_EXIST: + case FI_INLINE_DOTS: + case FI_PIN_FILE: + f2fs_mark_inode_dirty_sync(inode, true); + } +} + +static inline void set_inode_flag(struct inode *inode, int flag) +{ + if (!test_bit(flag, &F2FS_I(inode)->flags)) + set_bit(flag, &F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, true); +} + +static inline int is_inode_flag_set(struct inode *inode, int flag) +{ + return test_bit(flag, &F2FS_I(inode)->flags); +} + +static inline void clear_inode_flag(struct inode *inode, int flag) +{ + if (test_bit(flag, &F2FS_I(inode)->flags)) + clear_bit(flag, &F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, false); +} + +static inline void set_acl_inode(struct inode *inode, umode_t mode) +{ + F2FS_I(inode)->i_acl_mode = mode; + set_inode_flag(inode, FI_ACL_MODE); + f2fs_mark_inode_dirty_sync(inode, false); +} + +static inline void f2fs_i_links_write(struct inode *inode, bool inc) +{ + if (inc) + inc_nlink(inode); + else + drop_nlink(inode); + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void f2fs_i_blocks_write(struct inode *inode, + block_t diff, bool add, bool claim) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + /* add = 1, claim = 1 should be dquot_reserve_block in pair */ + if (add) { + if (claim) + dquot_claim_block(inode, diff); + else + dquot_alloc_block_nofail(inode, diff); + } else { + dquot_free_block(inode, diff); + } + + f2fs_mark_inode_dirty_sync(inode, true); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + if (i_size_read(inode) == i_size) + return; + + i_size_write(inode, i_size); + f2fs_mark_inode_dirty_sync(inode, true); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) +{ + F2FS_I(inode)->i_current_depth = depth; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void f2fs_i_gc_failures_write(struct inode *inode, + unsigned int count) +{ + F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = count; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) +{ + F2FS_I(inode)->i_xattr_nid = xnid; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) +{ + F2FS_I(inode)->i_pino = pino; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + if (ri->i_inline & F2FS_INLINE_XATTR) + set_bit(FI_INLINE_XATTR, &fi->flags); + if (ri->i_inline & F2FS_INLINE_DATA) + set_bit(FI_INLINE_DATA, &fi->flags); + if (ri->i_inline & F2FS_INLINE_DENTRY) + set_bit(FI_INLINE_DENTRY, &fi->flags); + if (ri->i_inline & F2FS_DATA_EXIST) + set_bit(FI_DATA_EXIST, &fi->flags); + if (ri->i_inline & F2FS_INLINE_DOTS) + set_bit(FI_INLINE_DOTS, &fi->flags); + if (ri->i_inline & F2FS_EXTRA_ATTR) + set_bit(FI_EXTRA_ATTR, &fi->flags); + if (ri->i_inline & F2FS_PIN_FILE) + set_bit(FI_PIN_FILE, &fi->flags); +} + +static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) +{ + ri->i_inline = 0; + + if (is_inode_flag_set(inode, FI_INLINE_XATTR)) + ri->i_inline |= F2FS_INLINE_XATTR; + if (is_inode_flag_set(inode, FI_INLINE_DATA)) + ri->i_inline |= F2FS_INLINE_DATA; + if (is_inode_flag_set(inode, FI_INLINE_DENTRY)) + ri->i_inline |= F2FS_INLINE_DENTRY; + if (is_inode_flag_set(inode, FI_DATA_EXIST)) + ri->i_inline |= F2FS_DATA_EXIST; + if (is_inode_flag_set(inode, FI_INLINE_DOTS)) + ri->i_inline |= F2FS_INLINE_DOTS; + if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) + ri->i_inline |= F2FS_EXTRA_ATTR; + if (is_inode_flag_set(inode, FI_PIN_FILE)) + ri->i_inline |= F2FS_PIN_FILE; +} + +static inline int f2fs_has_extra_attr(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_EXTRA_ATTR); +} + +static inline int f2fs_has_inline_xattr(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_INLINE_XATTR); +} + +static inline unsigned int addrs_per_inode(struct inode *inode) +{ + return CUR_ADDRS_PER_INODE(inode) - get_inline_xattr_addrs(inode); +} + +static inline void *inline_xattr_addr(struct inode *inode, struct page *page) +{ + struct f2fs_inode *ri = F2FS_INODE(page); + + return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - + get_inline_xattr_addrs(inode)]); +} + +static inline int inline_xattr_size(struct inode *inode) +{ + if (f2fs_has_inline_xattr(inode)) + return get_inline_xattr_addrs(inode) * sizeof(__le32); + return 0; +} + +static inline int f2fs_has_inline_data(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_INLINE_DATA); +} + +static inline int f2fs_exist_data(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_DATA_EXIST); +} + +static inline int f2fs_has_inline_dots(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_INLINE_DOTS); +} + +static inline bool f2fs_is_pinned_file(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_PIN_FILE); +} + +static inline bool f2fs_is_atomic_file(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_ATOMIC_FILE); +} + +static inline bool f2fs_is_commit_atomic_write(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_ATOMIC_COMMIT); +} + +static inline bool f2fs_is_volatile_file(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_VOLATILE_FILE); +} + +static inline bool f2fs_is_first_block_written(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); +} + +static inline bool f2fs_is_drop_cache(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_DROP_CACHE); +} + +static inline void *inline_data_addr(struct inode *inode, struct page *page) +{ + struct f2fs_inode *ri = F2FS_INODE(page); + int extra_size = get_extra_isize(inode); + + return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]); +} + +static inline int f2fs_has_inline_dentry(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_INLINE_DENTRY); +} + +static inline int is_file(struct inode *inode, int type) +{ + return F2FS_I(inode)->i_advise & type; +} + +static inline void set_file(struct inode *inode, int type) +{ + F2FS_I(inode)->i_advise |= type; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void clear_file(struct inode *inode, int type) +{ + F2FS_I(inode)->i_advise &= ~type; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline bool f2fs_is_time_consistent(struct inode *inode) +{ + if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) + return false; + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) + return false; + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) + return false; + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3, + &F2FS_I(inode)->i_crtime)) + return false; + return true; +} + +static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) +{ + bool ret; + + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + file_keep_isize(inode) || + i_size_read(inode) & ~PAGE_MASK) + return false; + + if (!f2fs_is_time_consistent(inode)) + return false; + + down_read(&F2FS_I(inode)->i_sem); + ret = F2FS_I(inode)->last_disk_size == i_size_read(inode); + up_read(&F2FS_I(inode)->i_sem); + + return ret; +} + +static inline bool f2fs_readonly(struct super_block *sb) +{ + return sb_rdonly(sb); +} + +static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) +{ + return is_set_ckpt_flags(sbi, CP_ERROR_FLAG); +} + +static inline bool is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] == '.') + return true; + + if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') + return true; + + return false; +} + +static inline bool f2fs_may_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, EXTENT_CACHE) || + is_inode_flag_set(inode, FI_NO_EXTENT)) + return false; + + /* + * for recovered files during mount do not create extents + * if shrinker is not registered. + */ + if (list_empty(&sbi->s_list)) + return false; + + return S_ISREG(inode->i_mode); +} + +static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + if (time_to_inject(sbi, FAULT_KMALLOC)) { + f2fs_show_injection_info(FAULT_KMALLOC); + return NULL; + } + + return kmalloc(size, flags); +} + +static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + return f2fs_kmalloc(sbi, size, flags | __GFP_ZERO); +} + +static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + if (time_to_inject(sbi, FAULT_KVMALLOC)) { + f2fs_show_injection_info(FAULT_KVMALLOC); + return NULL; + } + + return kvmalloc(size, flags); +} + +static inline void *f2fs_kvzalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + return f2fs_kvmalloc(sbi, size, flags | __GFP_ZERO); +} + +static inline int get_extra_isize(struct inode *inode) +{ + return F2FS_I(inode)->i_extra_isize / sizeof(__le32); +} + +static inline int get_inline_xattr_addrs(struct inode *inode) +{ + return F2FS_I(inode)->i_inline_xattr_size; +} + +#define f2fs_get_inode_mode(i) \ + ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ + (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) + +#define F2FS_TOTAL_EXTRA_ATTR_SIZE \ + (offsetof(struct f2fs_inode, i_extra_end) - \ + offsetof(struct f2fs_inode, i_extra_isize)) \ + +#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) +#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ + ((offsetof(typeof(*f2fs_inode), field) + \ + sizeof((f2fs_inode)->field)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + +static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) +{ + int i; + + spin_lock(&sbi->iostat_lock); + for (i = 0; i < NR_IO_TYPE; i++) + sbi->write_iostat[i] = 0; + spin_unlock(&sbi->iostat_lock); +} + +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) +{ + if (!sbi->iostat_enable) + return; + spin_lock(&sbi->iostat_lock); + sbi->write_iostat[type] += io_bytes; + + if (type == APP_WRITE_IO || type == APP_DIRECT_IO) + sbi->write_iostat[APP_BUFFERED_IO] = + sbi->write_iostat[APP_WRITE_IO] - + sbi->write_iostat[APP_DIRECT_IO]; + spin_unlock(&sbi->iostat_lock); +} + +#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \ + (!is_read_io(fio->op) || fio->is_meta)) + +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type); +static inline void verify_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) { + f2fs_msg(sbi->sb, KERN_ERR, + "invalid blkaddr: %u, type: %d, run fsck to fix.", + blkaddr, type); + f2fs_bug_on(sbi, 1); + } +} + +static inline bool __is_valid_data_blkaddr(block_t blkaddr) +{ + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) + return false; + return true; +} + +static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + if (!__is_valid_data_blkaddr(blkaddr)) + return false; + verify_blkaddr(sbi, blkaddr, DATA_GENERIC); + return true; +} + +/* + * file.c + */ +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +void f2fs_truncate_data_blocks(struct dnode_of_data *dn); +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); +int f2fs_truncate(struct inode *inode); +int f2fs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); +int f2fs_setattr(struct dentry *dentry, struct iattr *attr); +int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); +void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); +int f2fs_precache_extents(struct inode *inode); +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int f2fs_pin_file_control(struct inode *inode, bool inc); + +/* + * inode.c + */ +void f2fs_set_inode_flags(struct inode *inode); +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); +struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); +int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); +void f2fs_update_inode(struct inode *inode, struct page *node_page); +void f2fs_update_inode_page(struct inode *inode); +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); +void f2fs_evict_inode(struct inode *inode); +void f2fs_handle_failed_inode(struct inode *inode); + +/* + * namei.c + */ +int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool hot, bool set); +struct dentry *f2fs_get_parent(struct dentry *child); + +/* + * dir.c + */ +unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de); +struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, + f2fs_hash_t namehash, int *max_slots, + struct f2fs_dentry_ptr *d); +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, + unsigned int start_pos, struct fscrypt_str *fstr); +void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, + struct f2fs_dentry_ptr *d); +struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, + const struct qstr *new_name, + const struct qstr *orig_name, struct page *dpage); +void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, + unsigned int current_depth); +int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots); +void f2fs_drop_nlink(struct inode *dir, struct inode *inode); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + const struct qstr *child, struct page **res_page); +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p); +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, + struct page **page); +void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, + struct page *page, struct inode *inode); +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, + const struct qstr *name, f2fs_hash_t name_hash, + unsigned int bit_pos); +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); +int f2fs_add_dentry(struct inode *dir, struct fscrypt_name *fname, + struct inode *inode, nid_t ino, umode_t mode); +int f2fs_do_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir); +bool f2fs_empty_dir(struct inode *dir); + +static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) +{ + if (fscrypt_is_nokey_name(dentry)) + return -ENOKEY; + return f2fs_do_add_link(d_inode(dentry->d_parent), &dentry->d_name, + inode, inode->i_ino, inode->i_mode); +} + +/* + * super.c + */ +int f2fs_inode_dirtied(struct inode *inode, bool sync); +void f2fs_inode_synced(struct inode *inode); +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); +void f2fs_quota_off_umount(struct super_block *sb); +int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); +int f2fs_sync_fs(struct super_block *sb, int sync); +extern __printf(3, 4) +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); +int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); + +/* + * hash.c + */ +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname); + +/* + * node.c + */ +struct dnode_of_data; +struct node_info; + +int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi); +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi); +int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, + struct node_info *ni); +pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); +int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); +int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); +int f2fs_truncate_xattr_node(struct inode *inode); +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, + unsigned int seq_id); +int f2fs_remove_inode_page(struct inode *inode); +struct page *f2fs_new_inode_page(struct inode *inode); +struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); +void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); +struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct page *f2fs_get_node_page_ra(struct page *parent, int start); +void f2fs_move_node_page(struct page *node_page, int gc_type); +int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic, + unsigned int *seq_id); +int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, + struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type); +int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); +bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); +void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); +void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); +int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); +int f2fs_recover_inline_xattr(struct inode *inode, struct page *page); +int f2fs_recover_xattr_data(struct inode *inode, struct page *page); +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); +int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, + unsigned int segno, struct f2fs_summary_block *sum); +void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_build_node_manager(struct f2fs_sb_info *sbi); +void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); +int __init f2fs_create_node_manager_caches(void); +void f2fs_destroy_node_manager_caches(void); + +/* + * segment.c + */ +bool f2fs_need_SSR(struct f2fs_sb_info *sbi); +void f2fs_register_inmem_page(struct inode *inode, struct page *page); +void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure); +void f2fs_drop_inmem_pages(struct inode *inode); +void f2fs_drop_inmem_page(struct inode *inode, struct page *page); +int f2fs_commit_inmem_pages(struct inode *inode); +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); +int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi); +int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); +void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); +void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); +bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); +void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); +bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, + struct cp_control *cpc); +void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); +int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); +bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, + struct cp_control *cpc); +struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); +void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src, + block_t blk_addr); +void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type); +void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio); +void f2fs_outplace_write_data(struct dnode_of_data *dn, + struct f2fs_io_info *fio); +int f2fs_inplace_write_data(struct f2fs_io_info *fio); +void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr, + bool recover_curseg, bool recover_newaddr); +void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, + block_t old_addr, block_t new_addr, + unsigned char version, bool recover_curseg, + bool recover_newaddr); +void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list); +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type, bool ordered); +void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); +void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, + unsigned int val, int alloc); +void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_build_segment_manager(struct f2fs_sb_info *sbi); +void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); +int __init f2fs_create_segment_manager_caches(void); +void f2fs_destroy_segment_manager_caches(void); +int f2fs_rw_hint_to_seg_type(enum rw_hint hint); +enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp); + +/* + * checkpoint.c + */ +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type); +int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync); +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); +long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, + long nr_to_write, enum iostat_type io_type); +void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all); +bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type); +bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type); +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); +int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi); +void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi); +void f2fs_add_orphan_inode(struct inode *inode); +void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi); +int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); +void f2fs_update_dirty_page(struct inode *inode, struct page *page); +void f2fs_remove_dirty_inode(struct inode *inode); +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi); +int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); +int __init f2fs_create_checkpoint_caches(void); +void f2fs_destroy_checkpoint_caches(void); + +/* + * data.c + */ +int f2fs_init_post_read_processing(void); +void f2fs_destroy_post_read_processing(void); +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type); +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); +int f2fs_submit_page_bio(struct f2fs_io_info *fio); +void f2fs_submit_page_write(struct f2fs_io_info *fio); +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio); +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_set_data_blkaddr(struct dnode_of_data *dn); +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); +int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); +int f2fs_reserve_new_block(struct dnode_of_data *dn); +int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); +struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, + int op_flags, bool for_write); +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); +struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, + bool for_write); +struct page *f2fs_get_new_data_page(struct inode *inode, + struct page *ipage, pgoff_t index, bool new_i_size); +int f2fs_do_write_data_page(struct f2fs_io_info *fio); +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, + int create, int flag); +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); +bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); +bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); +void f2fs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length); +int f2fs_release_page(struct page *page, gfp_t wait); +#ifdef CONFIG_MIGRATION +int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode); +#endif +bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); +void f2fs_clear_radix_tree_dirty_tag(struct page *page); + +/* + * gc.c + */ +int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); +void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); +block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, + unsigned int segno); +void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); + +/* + * recovery.c + */ +int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); +bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi); + +/* + * debug.c + */ +#ifdef CONFIG_F2FS_STAT_FS +struct f2fs_stat_info { + struct list_head stat_list; + struct f2fs_sb_info *sbi; + int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; + int main_area_segs, main_area_sections, main_area_zones; + unsigned long long hit_largest, hit_cached, hit_rbtree; + unsigned long long hit_total, total_ext; + int ext_tree, zombie_tree, ext_node; + int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; + int ndirty_data, ndirty_qdata; + int inmem_pages; + unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; + int nats, dirty_nats, sits, dirty_sits; + int free_nids, avail_nids, alloc_nids; + int total_count, utilization; + int bg_gc, nr_wb_cp_data, nr_wb_data; + int nr_flushing, nr_flushed, flush_list_empty; + int nr_discarding, nr_discarded; + int nr_discard_cmd; + unsigned int undiscard_blks; + int inline_xattr, inline_inode, inline_dir, append, update, orphans; + int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; + unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; + unsigned int bimodal, avg_vblocks; + int util_free, util_valid, util_invalid; + int rsvd_segs, overp_segs; + int dirty_count, node_pages, meta_pages; + int prefree_count, call_count, cp_count, bg_cp_count; + int tot_segs, node_segs, data_segs, free_segs, free_secs; + int bg_node_segs, bg_data_segs; + int tot_blks, data_blks, node_blks; + int bg_data_blks, bg_node_blks; + unsigned long long skipped_atomic_files[2]; + int curseg[NR_CURSEG_TYPE]; + int cursec[NR_CURSEG_TYPE]; + int curzone[NR_CURSEG_TYPE]; + + unsigned int segment_count[2]; + unsigned int block_count[2]; + unsigned int inplace_count; + unsigned long long base_mem, cache_mem, page_mem; +}; + +static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_stat_info *)sbi->stat_info; +} + +#define stat_inc_cp_count(si) ((si)->cp_count++) +#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) +#define stat_inc_call_count(si) ((si)->call_count++) +#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) +#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) +#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) +#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) +#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) +#define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) +#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached)) +#define stat_inc_inline_xattr(inode) \ + do { \ + if (f2fs_has_inline_xattr(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->inline_xattr)); \ + } while (0) +#define stat_dec_inline_xattr(inode) \ + do { \ + if (f2fs_has_inline_xattr(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->inline_xattr)); \ + } while (0) +#define stat_inc_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->inline_inode)); \ + } while (0) +#define stat_dec_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->inline_inode)); \ + } while (0) +#define stat_inc_inline_dir(inode) \ + do { \ + if (f2fs_has_inline_dentry(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->inline_dir)); \ + } while (0) +#define stat_dec_inline_dir(inode) \ + do { \ + if (f2fs_has_inline_dentry(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \ + } while (0) +#define stat_inc_seg_type(sbi, curseg) \ + ((sbi)->segment_count[(curseg)->alloc_type]++) +#define stat_inc_block_count(sbi, curseg) \ + ((sbi)->block_count[(curseg)->alloc_type]++) +#define stat_inc_inplace_blocks(sbi) \ + (atomic_inc(&(sbi)->inplace_count)) +#define stat_inc_atomic_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)) +#define stat_dec_atomic_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)) +#define stat_update_max_atomic_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ + } while (0) +#define stat_inc_volatile_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_dec_volatile_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_update_max_volatile_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ + } while (0) +#define stat_inc_seg_count(sbi, type, gc_type) \ + do { \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ + si->tot_segs++; \ + if ((type) == SUM_TYPE_DATA) { \ + si->data_segs++; \ + si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \ + } else { \ + si->node_segs++; \ + si->bg_node_segs += (gc_type == BG_GC) ? 1 : 0; \ + } \ + } while (0) + +#define stat_inc_tot_blk_count(si, blks) \ + ((si)->tot_blks += (blks)) + +#define stat_inc_data_blk_count(sbi, blks, gc_type) \ + do { \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ + stat_inc_tot_blk_count(si, blks); \ + si->data_blks += (blks); \ + si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ + } while (0) + +#define stat_inc_node_blk_count(sbi, blks, gc_type) \ + do { \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ + stat_inc_tot_blk_count(si, blks); \ + si->node_blks += (blks); \ + si->bg_node_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ + } while (0) + +int f2fs_build_stats(struct f2fs_sb_info *sbi); +void f2fs_destroy_stats(struct f2fs_sb_info *sbi); +int __init f2fs_create_root_stats(void); +void f2fs_destroy_root_stats(void); +#else +#define stat_inc_cp_count(si) do { } while (0) +#define stat_inc_bg_cp_count(si) do { } while (0) +#define stat_inc_call_count(si) do { } while (0) +#define stat_inc_bggc_count(si) do { } while (0) +#define stat_inc_dirty_inode(sbi, type) do { } while (0) +#define stat_dec_dirty_inode(sbi, type) do { } while (0) +#define stat_inc_total_hit(sb) do { } while (0) +#define stat_inc_rbtree_node_hit(sb) do { } while (0) +#define stat_inc_largest_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_inline_xattr(inode) do { } while (0) +#define stat_dec_inline_xattr(inode) do { } while (0) +#define stat_inc_inline_inode(inode) do { } while (0) +#define stat_dec_inline_inode(inode) do { } while (0) +#define stat_inc_inline_dir(inode) do { } while (0) +#define stat_dec_inline_dir(inode) do { } while (0) +#define stat_inc_atomic_write(inode) do { } while (0) +#define stat_dec_atomic_write(inode) do { } while (0) +#define stat_update_max_atomic_write(inode) do { } while (0) +#define stat_inc_volatile_write(inode) do { } while (0) +#define stat_dec_volatile_write(inode) do { } while (0) +#define stat_update_max_volatile_write(inode) do { } while (0) +#define stat_inc_seg_type(sbi, curseg) do { } while (0) +#define stat_inc_block_count(sbi, curseg) do { } while (0) +#define stat_inc_inplace_blocks(sbi) do { } while (0) +#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_tot_blk_count(si, blks) do { } while (0) +#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0) +#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0) + +static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } +static inline int __init f2fs_create_root_stats(void) { return 0; } +static inline void f2fs_destroy_root_stats(void) { } +#endif + +extern const struct file_operations f2fs_dir_operations; +extern const struct file_operations f2fs_file_operations; +extern const struct inode_operations f2fs_file_inode_operations; +extern const struct address_space_operations f2fs_dblock_aops; +extern const struct address_space_operations f2fs_node_aops; +extern const struct address_space_operations f2fs_meta_aops; +extern const struct inode_operations f2fs_dir_inode_operations; +extern const struct inode_operations f2fs_symlink_inode_operations; +extern const struct inode_operations f2fs_encrypted_symlink_inode_operations; +extern const struct inode_operations f2fs_special_inode_operations; +extern struct kmem_cache *f2fs_inode_entry_slab; + +/* + * inline.c + */ +bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_may_inline_dentry(struct inode *inode); +void f2fs_do_read_inline_data(struct page *page, struct page *ipage); +void f2fs_truncate_inline_inode(struct inode *inode, + struct page *ipage, u64 from); +int f2fs_read_inline_data(struct inode *inode, struct page *page); +int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); +int f2fs_convert_inline_inode(struct inode *inode); +int f2fs_write_inline_data(struct inode *inode, struct page *page); +int f2fs_recover_inline_data(struct inode *inode, struct page *npage); +struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, + struct page *ipage); +int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, + struct page *page, struct inode *dir, + struct inode *inode); +bool f2fs_empty_inline_dir(struct inode *dir); +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, + struct fscrypt_str *fstr); +int f2fs_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); + +/* + * shrinker.c + */ +unsigned long f2fs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); +unsigned long f2fs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); +void f2fs_join_shrinker(struct f2fs_sb_info *sbi); +void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); + +/* + * extent_cache.c + */ +struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs); +struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs); +struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs, + struct rb_entry **prev_entry, struct rb_entry **next_entry, + struct rb_node ***insert_p, struct rb_node **insert_parent, + bool force); +bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root); +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); +void f2fs_drop_extent_tree(struct inode *inode); +unsigned int f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_tree(struct inode *inode); +bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_extent_cache(struct dnode_of_data *dn); +void f2fs_update_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len); +void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi); +int __init f2fs_create_extent_cache(void); +void f2fs_destroy_extent_cache(void); + +/* + * sysfs.c + */ +int __init f2fs_init_sysfs(void); +void f2fs_exit_sysfs(void); +int f2fs_register_sysfs(struct f2fs_sb_info *sbi); +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); + +/* + * crypto support + */ +static inline bool f2fs_encrypted_inode(struct inode *inode) +{ + return file_is_encrypt(inode); +} + +static inline bool f2fs_encrypted_file(struct inode *inode) +{ + return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode); +} + +static inline void f2fs_set_encrypted_inode(struct inode *inode) +{ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + file_set_encrypt(inode); + f2fs_set_inode_flags(inode); +#endif +} + +/* + * Returns true if the reads of the inode's data need to undergo some + * postprocessing step, like decryption or authenticity verification. + */ +static inline bool f2fs_post_read_required(struct inode *inode) +{ + return f2fs_encrypted_file(inode); +} + +#define F2FS_FEATURE_FUNCS(name, flagname) \ +static inline int f2fs_sb_has_##name(struct super_block *sb) \ +{ \ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_##flagname); \ +} + +F2FS_FEATURE_FUNCS(encrypt, ENCRYPT); +F2FS_FEATURE_FUNCS(blkzoned, BLKZONED); +F2FS_FEATURE_FUNCS(extra_attr, EXTRA_ATTR); +F2FS_FEATURE_FUNCS(project_quota, PRJQUOTA); +F2FS_FEATURE_FUNCS(inode_chksum, INODE_CHKSUM); +F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); +F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO); +F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME); +F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); + +#ifdef CONFIG_BLK_DEV_ZONED +static inline int get_blkz_type(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkaddr) +{ + unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).bdev == bdev) + return FDEV(i).blkz_type[zno]; + return -EINVAL; +} +#endif + +static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) +{ + return f2fs_sb_has_blkzoned(sbi->sb); +} + +static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi) +{ + return blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev)); +} + +static inline bool f2fs_realtime_discard_enable(struct f2fs_sb_info *sbi) +{ + return (test_opt(sbi, DISCARD) && f2fs_hw_support_discard(sbi)) || + f2fs_hw_should_discard(sbi); +} + +static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) +{ + clear_opt(sbi, ADAPTIVE); + clear_opt(sbi, LFS); + + switch (mt) { + case F2FS_MOUNT_ADAPTIVE: + set_opt(sbi, ADAPTIVE); + break; + case F2FS_MOUNT_LFS: + set_opt(sbi, LFS); + break; + } +} + +static inline bool f2fs_may_encrypt(struct inode *inode) +{ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + umode_t mode = inode->i_mode; + + return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); +#else + return false; +#endif +} + +static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) +{ + return (f2fs_post_read_required(inode) || + (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || + f2fs_is_multi_device(F2FS_I_SB(inode))); +} + +#ifdef CONFIG_F2FS_FAULT_INJECTION +extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, + unsigned int type); +#else +#define f2fs_build_fault_attr(sbi, rate, type) do { } while (0) +#endif + +#endif + +#define EFSBADCRC EBADMSG /* Bad CRC detected */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c new file mode 100644 index 000000000..2a7249496 --- /dev/null +++ b/fs/f2fs/file.c @@ -0,0 +1,3106 @@ +/* + * fs/f2fs/file.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/stat.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/falloc.h> +#include <linux/types.h> +#include <linux/compat.h> +#include <linux/uaccess.h> +#include <linux/mount.h> +#include <linux/pagevec.h> +#include <linux/uio.h> +#include <linux/uuid.h> +#include <linux/file.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "xattr.h" +#include "acl.h" +#include "gc.h" +#include "trace.h" +#include <trace/events/f2fs.h> + +static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + vm_fault_t ret; + + down_read(&F2FS_I(inode)->i_mmap_sem); + ret = filemap_fault(vmf); + up_read(&F2FS_I(inode)->i_mmap_sem); + + return ret; +} + +static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vmf->vma->vm_file); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + int err; + + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto err; + } + + sb_start_pagefault(inode->i_sb); + + f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); + + /* block allocation */ + f2fs_lock_op(sbi); + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_reserve_block(&dn, page->index); + if (err) { + f2fs_unlock_op(sbi); + goto out; + } + f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); + + f2fs_balance_fs(sbi, dn.node_changed); + + file_update_time(vmf->vma->vm_file); + down_read(&F2FS_I(inode)->i_mmap_sem); + lock_page(page); + if (unlikely(page->mapping != inode->i_mapping || + page_offset(page) > i_size_read(inode) || + !PageUptodate(page))) { + unlock_page(page); + err = -EFAULT; + goto out_sem; + } + + /* + * check to see if the page is mapped already (no holes) + */ + if (PageMappedToDisk(page)) + goto mapped; + + /* page is wholly or partially inside EOF */ + if (((loff_t)(page->index + 1) << PAGE_SHIFT) > + i_size_read(inode)) { + loff_t offset; + + offset = i_size_read(inode) & ~PAGE_MASK; + zero_user_segment(page, offset, PAGE_SIZE); + } + set_page_dirty(page); + if (!PageUptodate(page)) + SetPageUptodate(page); + + f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + + trace_f2fs_vm_page_mkwrite(page, DATA); +mapped: + /* fill the page */ + f2fs_wait_on_page_writeback(page, DATA, false); + + /* wait for GCed page writeback via META_MAPPING */ + f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); + +out_sem: + up_read(&F2FS_I(inode)->i_mmap_sem); +out: + sb_end_pagefault(inode->i_sb); + f2fs_update_time(sbi, REQ_TIME); +err: + return block_page_mkwrite_return(err); +} + +static const struct vm_operations_struct f2fs_file_vm_ops = { + .fault = f2fs_filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = f2fs_vm_page_mkwrite, +}; + +static int get_parent_ino(struct inode *inode, nid_t *pino) +{ + struct dentry *dentry; + + inode = igrab(inode); + dentry = d_find_any_alias(inode); + iput(inode); + if (!dentry) + return 0; + + *pino = parent_ino(dentry); + dput(dentry); + return 1; +} + +static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + enum cp_reason_type cp_reason = CP_NO_NEEDED; + + if (!S_ISREG(inode->i_mode)) + cp_reason = CP_NON_REGULAR; + else if (inode->i_nlink != 1) + cp_reason = CP_HARDLINK; + else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) + cp_reason = CP_SB_NEED_CP; + else if (file_wrong_pino(inode)) + cp_reason = CP_WRONG_PINO; + else if (!f2fs_space_for_roll_forward(sbi)) + cp_reason = CP_NO_SPC_ROLL; + else if (!f2fs_is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) + cp_reason = CP_NODE_NEED_CP; + else if (test_opt(sbi, FASTBOOT)) + cp_reason = CP_FASTBOOT_MODE; + else if (F2FS_OPTION(sbi).active_logs == 2) + cp_reason = CP_SPEC_LOG_NUM; + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT && + f2fs_need_dentry_mark(sbi, inode->i_ino) && + f2fs_exist_written_data(sbi, F2FS_I(inode)->i_pino, + TRANS_DIR_INO)) + cp_reason = CP_RECOVER_DIR; + + return cp_reason; +} + +static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct page *i = find_get_page(NODE_MAPPING(sbi), ino); + bool ret = false; + /* But we need to avoid that there are some inode updates */ + if ((i && PageDirty(i)) || f2fs_need_inode_block_update(sbi, ino)) + ret = true; + f2fs_put_page(i, 0); + return ret; +} + +static void try_to_fix_pino(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + nid_t pino; + + down_write(&fi->i_sem); + if (file_wrong_pino(inode) && inode->i_nlink == 1 && + get_parent_ino(inode, &pino)) { + f2fs_i_pino_write(inode, pino); + file_got_pino(inode); + } + up_write(&fi->i_sem); +} + +static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, + int datasync, bool atomic) +{ + struct inode *inode = file->f_mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t ino = inode->i_ino; + int ret = 0; + enum cp_reason_type cp_reason = 0; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .for_reclaim = 0, + }; + unsigned int seq_id = 0; + + if (unlikely(f2fs_readonly(inode->i_sb))) + return 0; + + trace_f2fs_sync_file_enter(inode); + + if (S_ISDIR(inode->i_mode)) + goto go_write; + + /* if fdatasync is triggered, let's do in-place-update */ + if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) + set_inode_flag(inode, FI_NEED_IPU); + ret = file_write_and_wait_range(file, start, end); + clear_inode_flag(inode, FI_NEED_IPU); + + if (ret) { + trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); + return ret; + } + + /* if the inode is dirty, let's recover all the time */ + if (!f2fs_skip_inode_update(inode, datasync)) { + f2fs_write_inode(inode, NULL); + goto go_write; + } + + /* + * if there is no written data, don't waste time to write recovery info. + */ + if (!is_inode_flag_set(inode, FI_APPEND_WRITE) && + !f2fs_exist_written_data(sbi, ino, APPEND_INO)) { + + /* it may call write_inode just prior to fsync */ + if (need_inode_page_update(sbi, ino)) + goto go_write; + + if (is_inode_flag_set(inode, FI_UPDATE_WRITE) || + f2fs_exist_written_data(sbi, ino, UPDATE_INO)) + goto flush_out; + goto out; + } +go_write: + /* + * Both of fdatasync() and fsync() are able to be recovered from + * sudden-power-off. + */ + down_read(&F2FS_I(inode)->i_sem); + cp_reason = need_do_checkpoint(inode); + up_read(&F2FS_I(inode)->i_sem); + + if (cp_reason) { + /* all the dirty node pages should be flushed for POR */ + ret = f2fs_sync_fs(inode->i_sb, 1); + + /* + * We've secured consistency through sync_fs. Following pino + * will be used only for fsynced inodes after checkpoint. + */ + try_to_fix_pino(inode); + clear_inode_flag(inode, FI_APPEND_WRITE); + clear_inode_flag(inode, FI_UPDATE_WRITE); + goto out; + } +sync_nodes: + atomic_inc(&sbi->wb_sync_req[NODE]); + ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic, &seq_id); + atomic_dec(&sbi->wb_sync_req[NODE]); + if (ret) + goto out; + + /* if cp_error was enabled, we should avoid infinite loop */ + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; + goto out; + } + + if (f2fs_need_inode_block_update(sbi, ino)) { + f2fs_mark_inode_dirty_sync(inode, true); + f2fs_write_inode(inode, NULL); + goto sync_nodes; + } + + /* + * If it's atomic_write, it's just fine to keep write ordering. So + * here we don't need to wait for node write completion, since we use + * node chain which serializes node blocks. If one of node writes are + * reordered, we can see simply broken chain, resulting in stopping + * roll-forward recovery. It means we'll recover all or none node blocks + * given fsync mark. + */ + if (!atomic) { + ret = f2fs_wait_on_node_pages_writeback(sbi, seq_id); + if (ret) + goto out; + } + + /* once recovery info is written, don't need to tack this */ + f2fs_remove_ino_entry(sbi, ino, APPEND_INO); + clear_inode_flag(inode, FI_APPEND_WRITE); +flush_out: + if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) + ret = f2fs_issue_flush(sbi, inode->i_ino); + if (!ret) { + f2fs_remove_ino_entry(sbi, ino, UPDATE_INO); + clear_inode_flag(inode, FI_UPDATE_WRITE); + f2fs_remove_ino_entry(sbi, ino, FLUSH_INO); + } + f2fs_update_time(sbi, REQ_TIME); +out: + trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); + f2fs_trace_ios(NULL, 1); + return ret; +} + +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file))))) + return -EIO; + return f2fs_do_sync_file(file, start, end, datasync, false); +} + +static pgoff_t __get_first_dirty_index(struct address_space *mapping, + pgoff_t pgofs, int whence) +{ + struct page *page; + int nr_pages; + + if (whence != SEEK_DATA) + return 0; + + /* find first dirty page index */ + nr_pages = find_get_pages_tag(mapping, &pgofs, PAGECACHE_TAG_DIRTY, + 1, &page); + if (!nr_pages) + return ULONG_MAX; + pgofs = page->index; + put_page(page); + return pgofs; +} + +static bool __found_offset(struct f2fs_sb_info *sbi, block_t blkaddr, + pgoff_t dirty, pgoff_t pgofs, int whence) +{ + switch (whence) { + case SEEK_DATA: + if ((blkaddr == NEW_ADDR && dirty == pgofs) || + is_valid_data_blkaddr(sbi, blkaddr)) + return true; + break; + case SEEK_HOLE: + if (blkaddr == NULL_ADDR) + return true; + break; + } + return false; +} + +static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes = inode->i_sb->s_maxbytes; + struct dnode_of_data dn; + pgoff_t pgofs, end_offset, dirty; + loff_t data_ofs = offset; + loff_t isize; + int err = 0; + + inode_lock(inode); + + isize = i_size_read(inode); + if (offset >= isize) + goto fail; + + /* handle inline data case */ + if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) { + if (whence == SEEK_HOLE) + data_ofs = isize; + goto found; + } + + pgofs = (pgoff_t)(offset >> PAGE_SHIFT); + + dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); + + for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) { + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); + if (err && err != -ENOENT) { + goto fail; + } else if (err == -ENOENT) { + /* direct node does not exists */ + if (whence == SEEK_DATA) { + pgofs = f2fs_get_next_page_offset(&dn, pgofs); + continue; + } else { + goto found; + } + } + + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + + /* find data/hole in dnode block */ + for (; dn.ofs_in_node < end_offset; + dn.ofs_in_node++, pgofs++, + data_ofs = (loff_t)pgofs << PAGE_SHIFT) { + block_t blkaddr; + + blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); + + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(F2FS_I_SB(inode), + blkaddr, DATA_GENERIC)) { + f2fs_put_dnode(&dn); + goto fail; + } + + if (__found_offset(F2FS_I_SB(inode), blkaddr, dirty, + pgofs, whence)) { + f2fs_put_dnode(&dn); + goto found; + } + } + f2fs_put_dnode(&dn); + } + + if (whence == SEEK_DATA) + goto fail; +found: + if (whence == SEEK_HOLE && data_ofs > isize) + data_ofs = isize; + inode_unlock(inode); + return vfs_setpos(file, data_ofs, maxbytes); +fail: + inode_unlock(inode); + return -ENXIO; +} + +static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes = inode->i_sb->s_maxbytes; + + switch (whence) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + return generic_file_llseek_size(file, offset, whence, + maxbytes, i_size_read(inode)); + case SEEK_DATA: + case SEEK_HOLE: + if (offset < 0) + return -ENXIO; + return f2fs_seek_block(file, offset, whence); + } + + return -EINVAL; +} + +static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(file); + int err; + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + + /* we don't need to use inline_data strictly */ + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + + file_accessed(file); + vma->vm_ops = &f2fs_file_vm_ops; + return 0; +} + +static int f2fs_file_open(struct inode *inode, struct file *filp) +{ + int err = fscrypt_file_open(inode, filp); + + if (err) + return err; + + filp->f_mode |= FMODE_NOWAIT; + + return dquot_file_open(inode, filp); +} + +void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct f2fs_node *raw_node; + int nr_free = 0, ofs = dn->ofs_in_node, len = count; + __le32 *addr; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); + + raw_node = F2FS_NODE(dn->node_page); + addr = blkaddr_in_node(raw_node) + base + ofs; + + for (; count > 0; count--, addr++, dn->ofs_in_node++) { + block_t blkaddr = le32_to_cpu(*addr); + + if (blkaddr == NULL_ADDR) + continue; + + dn->data_blkaddr = NULL_ADDR; + f2fs_set_data_blkaddr(dn); + + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) + continue; + + f2fs_invalidate_blocks(sbi, blkaddr); + if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) + clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); + nr_free++; + } + + if (nr_free) { + pgoff_t fofs; + /* + * once we invalidate valid blkaddr in range [ofs, ofs + count], + * we will invalidate all blkaddr in the whole range. + */ + fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), + dn->inode) + ofs; + f2fs_update_extent_cache_range(dn, fofs, 0, len); + dec_valid_block_count(sbi, dn->inode, nr_free); + } + dn->ofs_in_node = ofs; + + f2fs_update_time(sbi, REQ_TIME); + trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, + dn->ofs_in_node, nr_free); +} + +void f2fs_truncate_data_blocks(struct dnode_of_data *dn) +{ + f2fs_truncate_data_blocks_range(dn, ADDRS_PER_BLOCK); +} + +static int truncate_partial_data_page(struct inode *inode, u64 from, + bool cache_only) +{ + loff_t offset = from & (PAGE_SIZE - 1); + pgoff_t index = from >> PAGE_SHIFT; + struct address_space *mapping = inode->i_mapping; + struct page *page; + + if (!offset && !cache_only) + return 0; + + if (cache_only) { + page = find_lock_page(mapping, index); + if (page && PageUptodate(page)) + goto truncate_out; + f2fs_put_page(page, 1); + return 0; + } + + page = f2fs_get_lock_data_page(inode, index, true); + if (IS_ERR(page)) + return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); +truncate_out: + f2fs_wait_on_page_writeback(page, DATA, true); + zero_user(page, offset, PAGE_SIZE - offset); + + /* An encrypted inode should have a key and truncate the last page. */ + f2fs_bug_on(F2FS_I_SB(inode), cache_only && f2fs_encrypted_inode(inode)); + if (!cache_only) + set_page_dirty(page); + f2fs_put_page(page, 1); + return 0; +} + +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + pgoff_t free_from; + int count = 0, err = 0; + struct page *ipage; + bool truncate_page = false; + + trace_f2fs_truncate_blocks_enter(inode, from); + + free_from = (pgoff_t)F2FS_BLK_ALIGN(from); + + if (free_from >= sbi->max_file_blocks) + goto free_partial; + + if (lock) + f2fs_lock_op(sbi); + + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out; + } + + if (f2fs_has_inline_data(inode)) { + f2fs_truncate_inline_inode(inode, ipage, from); + f2fs_put_page(ipage, 1); + truncate_page = true; + goto out; + } + + set_new_dnode(&dn, inode, ipage, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); + if (err) { + if (err == -ENOENT) + goto free_next; + goto out; + } + + count = ADDRS_PER_PAGE(dn.node_page, inode); + + count -= dn.ofs_in_node; + f2fs_bug_on(sbi, count < 0); + + if (dn.ofs_in_node || IS_INODE(dn.node_page)) { + f2fs_truncate_data_blocks_range(&dn, count); + free_from += count; + } + + f2fs_put_dnode(&dn); +free_next: + err = f2fs_truncate_inode_blocks(inode, free_from); +out: + if (lock) + f2fs_unlock_op(sbi); +free_partial: + /* lastly zero out the first data page */ + if (!err) + err = truncate_partial_data_page(inode, from, truncate_page); + + trace_f2fs_truncate_blocks_exit(inode, err); + return err; +} + +int f2fs_truncate(struct inode *inode) +{ + int err; + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return 0; + + trace_f2fs_truncate(inode); + + if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { + f2fs_show_injection_info(FAULT_TRUNCATE); + return -EIO; + } + + err = dquot_initialize(inode); + if (err) + return err; + + /* we should check inline_data size */ + if (!f2fs_may_inline_data(inode)) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } + + err = f2fs_truncate_blocks(inode, i_size_read(inode), true); + if (err) + return err; + + inode->i_mtime = inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, false); + return 0; +} + +int f2fs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_inode *ri; + unsigned int flags; + + if (f2fs_has_extra_attr(inode) && + f2fs_sb_has_inode_crtime(inode->i_sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) { + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = fi->i_crtime.tv_sec; + stat->btime.tv_nsec = fi->i_crtime.tv_nsec; + } + + flags = fi->i_flags & F2FS_FL_USER_VISIBLE; + if (flags & F2FS_APPEND_FL) + stat->attributes |= STATX_ATTR_APPEND; + if (flags & F2FS_COMPR_FL) + stat->attributes |= STATX_ATTR_COMPRESSED; + if (f2fs_encrypted_inode(inode)) + stat->attributes |= STATX_ATTR_ENCRYPTED; + if (flags & F2FS_IMMUTABLE_FL) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (flags & F2FS_NODUMP_FL) + stat->attributes |= STATX_ATTR_NODUMP; + + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_ENCRYPTED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); + + generic_fillattr(inode, stat); + + /* we need to show initial sectors used for inline_data/dentries */ + if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || + f2fs_has_inline_dentry(inode)) + stat->blocks += (stat->size + 511) >> 9; + + return 0; +} + +#ifdef CONFIG_F2FS_FS_POSIX_ACL +static void __setattr_copy(struct inode *inode, const struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + + if (ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + if (ia_valid & ATTR_ATIME) + inode->i_atime = timespec64_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + inode->i_mtime = timespec64_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + inode->i_ctime = timespec64_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + set_acl_inode(inode, mode); + } +} +#else +#define __setattr_copy setattr_copy +#endif + +int f2fs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + int err; + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + + err = setattr_prepare(dentry, attr); + if (err) + return err; + + err = fscrypt_prepare_setattr(dentry, attr); + if (err) + return err; + + if (is_quota_modification(inode, attr)) { + err = dquot_initialize(inode); + if (err) + return err; + } + if ((attr->ia_valid & ATTR_UID && + !uid_eq(attr->ia_uid, inode->i_uid)) || + (attr->ia_valid & ATTR_GID && + !gid_eq(attr->ia_gid, inode->i_gid))) { + err = dquot_transfer(inode, attr); + if (err) + return err; + } + + if (attr->ia_valid & ATTR_SIZE) { + bool to_smaller = (attr->ia_size <= i_size_read(inode)); + + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + + truncate_setsize(inode, attr->ia_size); + + if (to_smaller) + err = f2fs_truncate(inode); + /* + * do not trim all blocks after i_size if target size is + * larger than i_size. + */ + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + if (err) + return err; + + if (!to_smaller) { + /* should convert inline inode here */ + if (!f2fs_may_inline_data(inode)) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } + inode->i_mtime = inode->i_ctime = current_time(inode); + } + + down_write(&F2FS_I(inode)->i_sem); + F2FS_I(inode)->last_disk_size = i_size_read(inode); + up_write(&F2FS_I(inode)->i_sem); + } + + __setattr_copy(inode, attr); + + if (attr->ia_valid & ATTR_MODE) { + err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode)); + if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + clear_inode_flag(inode, FI_ACL_MODE); + } + } + + /* file size may changed here */ + f2fs_mark_inode_dirty_sync(inode, true); + + /* inode change will produce dirty node pages flushed by checkpoint */ + f2fs_balance_fs(F2FS_I_SB(inode), true); + + return err; +} + +const struct inode_operations f2fs_file_inode_operations = { + .getattr = f2fs_getattr, + .setattr = f2fs_setattr, + .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, +#ifdef CONFIG_F2FS_FS_XATTR + .listxattr = f2fs_listxattr, +#endif + .fiemap = f2fs_fiemap, +}; + +static int fill_zero(struct inode *inode, pgoff_t index, + loff_t start, loff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct page *page; + + if (!len) + return 0; + + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + page = f2fs_get_new_data_page(inode, NULL, index, false); + f2fs_unlock_op(sbi); + + if (IS_ERR(page)) + return PTR_ERR(page); + + f2fs_wait_on_page_writeback(page, DATA, true); + zero_user(page, start, len); + set_page_dirty(page); + f2fs_put_page(page, 1); + return 0; +} + +int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) +{ + int err; + + while (pg_start < pg_end) { + struct dnode_of_data dn; + pgoff_t end_offset, count; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); + if (err) { + if (err == -ENOENT) { + pg_start = f2fs_get_next_page_offset(&dn, + pg_start); + continue; + } + return err; + } + + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); + + f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); + + f2fs_truncate_data_blocks_range(&dn, count); + f2fs_put_dnode(&dn); + + pg_start += count; + } + return 0; +} + +static int punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ + pgoff_t pg_start, pg_end; + loff_t off_start, off_end; + int ret; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; + + off_start = offset & (PAGE_SIZE - 1); + off_end = (offset + len) & (PAGE_SIZE - 1); + + if (pg_start == pg_end) { + ret = fill_zero(inode, pg_start, off_start, + off_end - off_start); + if (ret) + return ret; + } else { + if (off_start) { + ret = fill_zero(inode, pg_start++, off_start, + PAGE_SIZE - off_start); + if (ret) + return ret; + } + if (off_end) { + ret = fill_zero(inode, pg_end, 0, off_end); + if (ret) + return ret; + } + + if (pg_start < pg_end) { + loff_t blk_start, blk_end; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + f2fs_balance_fs(sbi, true); + + blk_start = (loff_t)pg_start << PAGE_SHIFT; + blk_end = (loff_t)pg_end << PAGE_SHIFT; + + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + + truncate_pagecache_range(inode, blk_start, blk_end - 1); + + f2fs_lock_op(sbi); + ret = f2fs_truncate_hole(inode, pg_start, pg_end); + f2fs_unlock_op(sbi); + + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } + } + + return ret; +} + +static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, pgoff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + int ret, done, i; + +next_dnode: + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + return ret; + } else if (ret == -ENOENT) { + if (dn.max_level == 0) + return -ENOENT; + done = min((pgoff_t)ADDRS_PER_BLOCK - dn.ofs_in_node, len); + blkaddr += done; + do_replace += done; + goto next; + } + + done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - + dn.ofs_in_node, len); + for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { + *blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); + if (!f2fs_is_checkpointed_data(sbi, *blkaddr)) { + + if (test_opt(sbi, LFS)) { + f2fs_put_dnode(&dn); + return -ENOTSUPP; + } + + /* do not invalidate this block address */ + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + *do_replace = 1; + } + } + f2fs_put_dnode(&dn); +next: + len -= done; + off += done; + if (len) + goto next_dnode; + return 0; +} + +static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, int len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + int ret, i; + + for (i = 0; i < len; i++, do_replace++, blkaddr++) { + if (*do_replace == 0) + continue; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA); + if (ret) { + dec_valid_block_count(sbi, inode, 1); + f2fs_invalidate_blocks(sbi, *blkaddr); + } else { + f2fs_update_data_blkaddr(&dn, *blkaddr); + } + f2fs_put_dnode(&dn); + } + return 0; +} + +static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, + block_t *blkaddr, int *do_replace, + pgoff_t src, pgoff_t dst, pgoff_t len, bool full) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(src_inode); + pgoff_t i = 0; + int ret; + + while (i < len) { + if (blkaddr[i] == NULL_ADDR && !full) { + i++; + continue; + } + + if (do_replace[i] || blkaddr[i] == NULL_ADDR) { + struct dnode_of_data dn; + struct node_info ni; + size_t new_size; + pgoff_t ilen; + + set_new_dnode(&dn, dst_inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, dst + i, ALLOC_NODE); + if (ret) + return ret; + + ret = f2fs_get_node_info(sbi, dn.nid, &ni); + if (ret) { + f2fs_put_dnode(&dn); + return ret; + } + + ilen = min((pgoff_t) + ADDRS_PER_PAGE(dn.node_page, dst_inode) - + dn.ofs_in_node, len - i); + do { + dn.data_blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); + f2fs_truncate_data_blocks_range(&dn, 1); + + if (do_replace[i]) { + f2fs_i_blocks_write(src_inode, + 1, false, false); + f2fs_i_blocks_write(dst_inode, + 1, true, false); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + blkaddr[i], ni.version, true, false); + + do_replace[i] = 0; + } + dn.ofs_in_node++; + i++; + new_size = (loff_t)(dst + i) << PAGE_SHIFT; + if (dst_inode->i_size < new_size) + f2fs_i_size_write(dst_inode, new_size); + } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR)); + + f2fs_put_dnode(&dn); + } else { + struct page *psrc, *pdst; + + psrc = f2fs_get_lock_data_page(src_inode, + src + i, true); + if (IS_ERR(psrc)) + return PTR_ERR(psrc); + pdst = f2fs_get_new_data_page(dst_inode, NULL, dst + i, + true); + if (IS_ERR(pdst)) { + f2fs_put_page(psrc, 1); + return PTR_ERR(pdst); + } + f2fs_copy_page(psrc, pdst); + set_page_dirty(pdst); + f2fs_put_page(pdst, 1); + f2fs_put_page(psrc, 1); + + ret = f2fs_truncate_hole(src_inode, + src + i, src + i + 1); + if (ret) + return ret; + i++; + } + } + return 0; +} + +static int __exchange_data_block(struct inode *src_inode, + struct inode *dst_inode, pgoff_t src, pgoff_t dst, + pgoff_t len, bool full) +{ + block_t *src_blkaddr; + int *do_replace; + pgoff_t olen; + int ret; + + while (len) { + olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); + + src_blkaddr = f2fs_kvzalloc(F2FS_I_SB(src_inode), + array_size(olen, sizeof(block_t)), + GFP_KERNEL); + if (!src_blkaddr) + return -ENOMEM; + + do_replace = f2fs_kvzalloc(F2FS_I_SB(src_inode), + array_size(olen, sizeof(int)), + GFP_KERNEL); + if (!do_replace) { + kvfree(src_blkaddr); + return -ENOMEM; + } + + ret = __read_out_blkaddrs(src_inode, src_blkaddr, + do_replace, src, olen); + if (ret) + goto roll_back; + + ret = __clone_blkaddrs(src_inode, dst_inode, src_blkaddr, + do_replace, src, dst, olen, full); + if (ret) + goto roll_back; + + src += olen; + dst += olen; + len -= olen; + + kvfree(src_blkaddr); + kvfree(do_replace); + } + return 0; + +roll_back: + __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, olen); + kvfree(src_blkaddr); + kvfree(do_replace); + return ret; +} + +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + len) >> PAGE_SHIFT; + int ret; + + f2fs_balance_fs(sbi, true); + + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + + f2fs_lock_op(sbi); + f2fs_drop_extent_tree(inode); + truncate_pagecache(inode, offset); + ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); + f2fs_unlock_op(sbi); + + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + return ret; +} + +static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) +{ + loff_t new_size; + int ret; + + if (offset + len >= i_size_read(inode)) + return -EINVAL; + + /* collapse range should be aligned to block size of f2fs. */ + if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + if (ret) + return ret; + + ret = f2fs_do_collapse(inode, offset, len); + if (ret) + return ret; + + /* write out all moved pages, if possible */ + down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + truncate_pagecache(inode, offset); + + new_size = i_size_read(inode) - len; + truncate_pagecache(inode, new_size); + + ret = f2fs_truncate_blocks(inode, new_size, true); + up_write(&F2FS_I(inode)->i_mmap_sem); + if (!ret) + f2fs_i_size_write(inode, new_size); + return ret; +} + +static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, + pgoff_t end) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + pgoff_t index = start; + unsigned int ofs_in_node = dn->ofs_in_node; + blkcnt_t count = 0; + int ret; + + for (; index < end; index++, dn->ofs_in_node++) { + if (datablock_addr(dn->inode, dn->node_page, + dn->ofs_in_node) == NULL_ADDR) + count++; + } + + dn->ofs_in_node = ofs_in_node; + ret = f2fs_reserve_new_blocks(dn, count); + if (ret) + return ret; + + dn->ofs_in_node = ofs_in_node; + for (index = start; index < end; index++, dn->ofs_in_node++) { + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); + /* + * f2fs_reserve_new_blocks will not guarantee entire block + * allocation. + */ + if (dn->data_blkaddr == NULL_ADDR) { + ret = -ENOSPC; + break; + } + if (dn->data_blkaddr != NEW_ADDR) { + f2fs_invalidate_blocks(sbi, dn->data_blkaddr); + dn->data_blkaddr = NEW_ADDR; + f2fs_set_data_blkaddr(dn); + } + } + + f2fs_update_extent_cache_range(dn, start, 0, index - start); + + return ret; +} + +static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; + pgoff_t index, pg_start, pg_end; + loff_t new_size = i_size_read(inode); + loff_t off_start, off_end; + int ret = 0; + + ret = inode_newsize_ok(inode, (len + offset)); + if (ret) + return ret; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); + if (ret) + return ret; + + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; + + off_start = offset & (PAGE_SIZE - 1); + off_end = (offset + len) & (PAGE_SIZE - 1); + + if (pg_start == pg_end) { + ret = fill_zero(inode, pg_start, off_start, + off_end - off_start); + if (ret) + return ret; + + new_size = max_t(loff_t, new_size, offset + len); + } else { + if (off_start) { + ret = fill_zero(inode, pg_start++, off_start, + PAGE_SIZE - off_start); + if (ret) + return ret; + + new_size = max_t(loff_t, new_size, + (loff_t)pg_start << PAGE_SHIFT); + } + + for (index = pg_start; index < pg_end;) { + struct dnode_of_data dn; + unsigned int end_offset; + pgoff_t end; + + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + + truncate_pagecache_range(inode, + (loff_t)index << PAGE_SHIFT, + ((loff_t)pg_end << PAGE_SHIFT) - 1); + + f2fs_lock_op(sbi); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); + if (ret) { + f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + goto out; + } + + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + end = min(pg_end, end_offset - dn.ofs_in_node + index); + + ret = f2fs_do_zero_range(&dn, index, end); + f2fs_put_dnode(&dn); + + f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + f2fs_balance_fs(sbi, dn.node_changed); + + if (ret) + goto out; + + index = end; + new_size = max_t(loff_t, new_size, + (loff_t)index << PAGE_SHIFT); + } + + if (off_end) { + ret = fill_zero(inode, pg_end, 0, off_end); + if (ret) + goto out; + + new_size = max_t(loff_t, new_size, offset + len); + } + } + +out: + if (new_size > i_size_read(inode)) { + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); + else + f2fs_i_size_write(inode, new_size); + } + return ret; +} + +static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t nr, pg_start, pg_end, delta, idx; + loff_t new_size; + int ret = 0; + + new_size = i_size_read(inode) + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + return ret; + + if (offset >= i_size_read(inode)) + return -EINVAL; + + /* insert range should be aligned to block size of f2fs. */ + if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + f2fs_balance_fs(sbi, true); + + down_write(&F2FS_I(inode)->i_mmap_sem); + ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); + up_write(&F2FS_I(inode)->i_mmap_sem); + if (ret) + return ret; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + if (ret) + return ret; + + pg_start = offset >> PAGE_SHIFT; + pg_end = (offset + len) >> PAGE_SHIFT; + delta = pg_end - pg_start; + idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + truncate_pagecache(inode, offset); + + while (!ret && idx > pg_start) { + nr = idx - pg_start; + if (nr > delta) + nr = delta; + idx -= nr; + + f2fs_lock_op(sbi); + f2fs_drop_extent_tree(inode); + + ret = __exchange_data_block(inode, inode, idx, + idx + delta, nr, false); + f2fs_unlock_op(sbi); + } + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + /* write out all moved pages, if possible */ + down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + truncate_pagecache(inode, offset); + up_write(&F2FS_I(inode)->i_mmap_sem); + + if (!ret) + f2fs_i_size_write(inode, new_size); + return ret; +} + +static int expand_inode_data(struct inode *inode, loff_t offset, + loff_t len, int mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_map_blocks map = { .m_next_pgofs = NULL, + .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE }; + pgoff_t pg_end; + loff_t new_size = i_size_read(inode); + loff_t off_end; + int err; + + err = inode_newsize_ok(inode, (len + offset)); + if (err) + return err; + + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + + f2fs_balance_fs(sbi, true); + + pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT; + off_end = (offset + len) & (PAGE_SIZE - 1); + + map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT; + map.m_len = pg_end - map.m_lblk; + if (off_end) + map.m_len++; + + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (err) { + pgoff_t last_off; + + if (!map.m_len) + return err; + + last_off = map.m_lblk + map.m_len - 1; + + /* update new size to the failed position */ + new_size = (last_off == pg_end) ? offset + len : + (loff_t)(last_off + 1) << PAGE_SHIFT; + } else { + new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; + } + + if (new_size > i_size_read(inode)) { + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); + else + f2fs_i_size_write(inode, new_size); + } + + return err; +} + +static long f2fs_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + long ret = 0; + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + + /* f2fs only support ->fallocate for regular file */ + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (f2fs_encrypted_inode(inode) && + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) + return -EOPNOTSUPP; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_INSERT_RANGE)) + return -EOPNOTSUPP; + + inode_lock(inode); + + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (offset >= inode->i_size) + goto out; + + ret = punch_hole(inode, offset, len); + } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { + ret = f2fs_collapse_range(inode, offset, len); + } else if (mode & FALLOC_FL_ZERO_RANGE) { + ret = f2fs_zero_range(inode, offset, len, mode); + } else if (mode & FALLOC_FL_INSERT_RANGE) { + ret = f2fs_insert_range(inode, offset, len); + } else { + ret = expand_inode_data(inode, offset, len, mode); + } + + if (!ret) { + inode->i_mtime = inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, false); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + } + +out: + inode_unlock(inode); + + trace_f2fs_fallocate(inode, mode, offset, len, ret); + return ret; +} + +static int f2fs_release_file(struct inode *inode, struct file *filp) +{ + /* + * f2fs_relase_file is called at every close calls. So we should + * not drop any inmemory pages by close called by other process. + */ + if (!(filp->f_mode & FMODE_WRITE) || + atomic_read(&inode->i_writecount) != 1) + return 0; + + /* some remained atomic pages should discarded */ + if (f2fs_is_atomic_file(inode)) + f2fs_drop_inmem_pages(inode); + if (f2fs_is_volatile_file(inode)) { + set_inode_flag(inode, FI_DROP_CACHE); + filemap_fdatawrite(inode->i_mapping); + clear_inode_flag(inode, FI_DROP_CACHE); + clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); + } + return 0; +} + +static int f2fs_file_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + + /* + * If the process doing a transaction is crashed, we should do + * roll-back. Otherwise, other reader/write can see corrupted database + * until all the writers close its file. Since this should be done + * before dropping file lock, it needs to do in ->flush. + */ + if (f2fs_is_atomic_file(inode) && + F2FS_I(inode)->inmem_task == current) + f2fs_drop_inmem_pages(inode); + return 0; +} + +static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + unsigned int flags = fi->i_flags; + + if (f2fs_encrypted_inode(inode)) + flags |= F2FS_ENCRYPT_FL; + if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) + flags |= F2FS_INLINE_DATA_FL; + + flags &= F2FS_FL_USER_VISIBLE; + + return put_user(flags, (int __user *)arg); +} + +static int __f2fs_ioc_setflags(struct inode *inode, unsigned int flags) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + unsigned int oldflags; + + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + return -EPERM; + + flags = f2fs_mask_flags(inode->i_mode, flags); + + oldflags = fi->i_flags; + + if ((flags ^ oldflags) & (F2FS_APPEND_FL | F2FS_IMMUTABLE_FL)) + if (!capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + flags = flags & F2FS_FL_USER_MODIFIABLE; + flags |= oldflags & ~F2FS_FL_USER_MODIFIABLE; + fi->i_flags = flags; + + if (fi->i_flags & F2FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + else + clear_inode_flag(inode, FI_PROJ_INHERIT); + + inode->i_ctime = current_time(inode); + f2fs_set_inode_flags(inode); + f2fs_mark_inode_dirty_sync(inode, true); + return 0; +} + +static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + unsigned int flags; + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *)arg)) + return -EFAULT; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + ret = __f2fs_ioc_setflags(inode, flags); + + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + + return put_user(inode->i_generation, (int __user *)arg); +} + +static int f2fs_ioc_start_atomic_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + if (f2fs_is_atomic_file(inode)) { + if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) + ret = -EINVAL; + goto out; + } + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + /* + * Should wait end_io to count F2FS_WB_CP_DATA correctly by + * f2fs_is_atomic_file. + */ + if (get_dirty_pages(inode)) + f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, + "Unexpected flush for atomic writes: ino=%lu, npages=%u", + inode->i_ino, get_dirty_pages(inode)); + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) { + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + goto out; + } + + set_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + F2FS_I(inode)->inmem_task = current; + stat_inc_atomic_write(inode); + stat_update_max_atomic_write(inode); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_commit_atomic_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + f2fs_balance_fs(F2FS_I_SB(inode), true); + + inode_lock(inode); + + if (f2fs_is_volatile_file(inode)) { + ret = -EINVAL; + goto err_out; + } + + if (f2fs_is_atomic_file(inode)) { + ret = f2fs_commit_inmem_pages(inode); + if (ret) + goto err_out; + + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + if (!ret) { + clear_inode_flag(inode, FI_ATOMIC_FILE); + F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC] = 0; + stat_dec_atomic_write(inode); + } + } else { + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); + } +err_out: + if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { + clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); + ret = -EINVAL; + } + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_start_volatile_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + if (f2fs_is_volatile_file(inode)) + goto out; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + stat_inc_volatile_write(inode); + stat_update_max_volatile_write(inode); + + set_inode_flag(inode, FI_VOLATILE_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_release_volatile_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + if (!f2fs_is_volatile_file(inode)) + goto out; + + if (!f2fs_is_first_block_written(inode)) { + ret = truncate_partial_data_page(inode, 0, true); + goto out; + } + + ret = punch_hole(inode, 0, F2FS_BLKSIZE); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_abort_volatile_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + if (f2fs_is_atomic_file(inode)) + f2fs_drop_inmem_pages(inode); + if (f2fs_is_volatile_file(inode)) { + clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + } + + clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); + + inode_unlock(inode); + + mnt_drop_write_file(filp); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return ret; +} + +static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct super_block *sb = sbi->sb; + __u32 in; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(in, (__u32 __user *)arg)) + return -EFAULT; + + if (in != F2FS_GOING_DOWN_FULLSYNC) { + ret = mnt_want_write_file(filp); + if (ret) + return ret; + } + + switch (in) { + case F2FS_GOING_DOWN_FULLSYNC: + sb = freeze_bdev(sb->s_bdev); + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + goto out; + } + if (sb) { + f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + thaw_bdev(sb->s_bdev, sb); + } + break; + case F2FS_GOING_DOWN_METASYNC: + /* do checkpoint only */ + ret = f2fs_sync_fs(sb, 1); + if (ret) + goto out; + f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + break; + case F2FS_GOING_DOWN_NOSYNC: + f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + break; + case F2FS_GOING_DOWN_METAFLUSH: + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); + f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + break; + default: + ret = -EINVAL; + goto out; + } + + f2fs_stop_gc_thread(sbi); + f2fs_stop_discard_thread(sbi); + + f2fs_drop_discard_cmd(sbi); + clear_opt(sbi, DISCARD); + + f2fs_update_time(sbi, REQ_TIME); +out: + if (in != F2FS_GOING_DOWN_FULLSYNC) + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + struct fstrim_range range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!f2fs_hw_support_discard(F2FS_SB(sb))) + return -EOPNOTSUPP; + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + range.minlen = max((unsigned int)range.minlen, + q->limits.discard_granularity); + ret = f2fs_trim_fs(F2FS_SB(sb), &range); + mnt_drop_write_file(filp); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return 0; +} + +static bool uuid_is_nonzero(__u8 u[16]) +{ + int i; + + for (i = 0; i < 16; i++) + if (u[i]) + return true; + return false; +} + +static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + + if (!f2fs_sb_has_encrypt(inode->i_sb)) + return -EOPNOTSUPP; + + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); +} + +static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(file_inode(filp)->i_sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); +} + +static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int err; + + if (!f2fs_sb_has_encrypt(inode->i_sb)) + return -EOPNOTSUPP; + + err = mnt_want_write_file(filp); + if (err) + return err; + + down_write(&sbi->sb_lock); + + if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) + goto got_it; + + /* update superblock with uuid */ + generate_random_uuid(sbi->raw_super->encrypt_pw_salt); + + err = f2fs_commit_super(sbi, false); + if (err) { + /* undo new data */ + memset(sbi->raw_super->encrypt_pw_salt, 0, 16); + goto out_err; + } +got_it: + if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, + 16)) + err = -EFAULT; +out_err: + up_write(&sbi->sb_lock); + mnt_drop_write_file(filp); + return err; +} + +static int f2fs_ioc_gc(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + __u32 sync; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(sync, (__u32 __user *)arg)) + return -EFAULT; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (!sync) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + } else { + mutex_lock(&sbi->gc_mutex); + } + + ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); +out: + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_range range; + u64 end; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg, + sizeof(range))) + return -EFAULT; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + end = range.start + range.len; + if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) { + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + +do_more: + if (!range.sync) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + } else { + mutex_lock(&sbi->gc_mutex); + } + + ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start)); + range.start += BLKS_PER_SEC(sbi); + if (range.start <= end) + goto do_more; +out: + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = f2fs_sync_fs(sbi->sb, 1); + + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_defragment_range(struct f2fs_sb_info *sbi, + struct file *filp, + struct f2fs_defragment *range) +{ + struct inode *inode = file_inode(filp); + struct f2fs_map_blocks map = { .m_next_extent = NULL, + .m_seg_type = NO_CHECK_TYPE }; + struct extent_info ei = {0, 0, 0}; + pgoff_t pg_start, pg_end, next_pgofs; + unsigned int blk_per_seg = sbi->blocks_per_seg; + unsigned int total = 0, sec_num; + block_t blk_end = 0; + bool fragmented = false; + int err; + + /* if in-place-update policy is enabled, don't waste time here */ + if (f2fs_should_update_inplace(inode, NULL)) + return -EINVAL; + + pg_start = range->start >> PAGE_SHIFT; + pg_end = (range->start + range->len) >> PAGE_SHIFT; + + f2fs_balance_fs(sbi, true); + + inode_lock(inode); + + /* writeback all dirty pages in the range */ + err = filemap_write_and_wait_range(inode->i_mapping, range->start, + range->start + range->len - 1); + if (err) + goto out; + + /* + * lookup mapping info in extent cache, skip defragmenting if physical + * block addresses are continuous. + */ + if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (ei.fofs + ei.len >= pg_end) + goto out; + } + + map.m_lblk = pg_start; + map.m_next_pgofs = &next_pgofs; + + /* + * lookup mapping info in dnode page cache, skip defragmenting if all + * physical block addresses are continuous even if there are hole(s) + * in logical blocks. + */ + while (map.m_lblk < pg_end) { + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); + if (err) + goto out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk = next_pgofs; + continue; + } + + if (blk_end && blk_end != map.m_pblk) + fragmented = true; + + /* record total count of block that we're going to move */ + total += map.m_len; + + blk_end = map.m_pblk + map.m_len; + + map.m_lblk += map.m_len; + } + + if (!fragmented) + goto out; + + sec_num = (total + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi); + + /* + * make sure there are enough free section for LFS allocation, this can + * avoid defragment running in SSR mode when free section are allocated + * intensively + */ + if (has_not_enough_free_secs(sbi, 0, sec_num)) { + err = -EAGAIN; + goto out; + } + + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; + total = 0; + + while (map.m_lblk < pg_end) { + pgoff_t idx; + int cnt = 0; + +do_map: + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); + if (err) + goto clear_out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk = next_pgofs; + continue; + } + + set_inode_flag(inode, FI_DO_DEFRAG); + + idx = map.m_lblk; + while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { + struct page *page; + + page = f2fs_get_lock_data_page(inode, idx, true); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto clear_out; + } + + set_page_dirty(page); + f2fs_put_page(page, 1); + + idx++; + cnt++; + total++; + } + + map.m_lblk = idx; + + if (idx < pg_end && cnt < blk_per_seg) + goto do_map; + + clear_inode_flag(inode, FI_DO_DEFRAG); + + err = filemap_fdatawrite(inode->i_mapping); + if (err) + goto out; + } +clear_out: + clear_inode_flag(inode, FI_DO_DEFRAG); +out: + inode_unlock(inode); + if (!err) + range->len = (u64)total << PAGE_SHIFT; + return err; +} + +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_defragment range; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode)) + return -EINVAL; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) + return -EFAULT; + + /* verify alignment of offset & size */ + if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + if (unlikely((range.start + range.len) >> PAGE_SHIFT > + sbi->max_file_blocks)) + return -EINVAL; + + err = mnt_want_write_file(filp); + if (err) + return err; + + err = f2fs_defragment_range(sbi, filp, &range); + mnt_drop_write_file(filp); + + f2fs_update_time(sbi, REQ_TIME); + if (err < 0) + return err; + + if (copy_to_user((struct f2fs_defragment __user *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; +} + +static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, size_t len) +{ + struct inode *src = file_inode(file_in); + struct inode *dst = file_inode(file_out); + struct f2fs_sb_info *sbi = F2FS_I_SB(src); + size_t olen = len, dst_max_i_size = 0; + size_t dst_osize; + int ret; + + if (file_in->f_path.mnt != file_out->f_path.mnt || + src->i_sb != dst->i_sb) + return -EXDEV; + + if (unlikely(f2fs_readonly(src->i_sb))) + return -EROFS; + + if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode)) + return -EINVAL; + + if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst)) + return -EOPNOTSUPP; + + if (src == dst) { + if (pos_in == pos_out) + return 0; + if (pos_out > pos_in && pos_out < pos_in + len) + return -EINVAL; + } + + inode_lock(src); + if (src != dst) { + ret = -EBUSY; + if (!inode_trylock(dst)) + goto out; + } + + ret = -EINVAL; + if (pos_in + len > src->i_size || pos_in + len < pos_in) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - pos_in; + if (pos_in + len == src->i_size) + len = ALIGN(src->i_size, F2FS_BLKSIZE) - pos_in; + if (len == 0) { + ret = 0; + goto out_unlock; + } + + dst_osize = dst->i_size; + if (pos_out + olen > dst->i_size) + dst_max_i_size = pos_out + olen; + + /* verify the end result is block aligned */ + if (!IS_ALIGNED(pos_in, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_in + len, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_out, F2FS_BLKSIZE)) + goto out_unlock; + + ret = f2fs_convert_inline_inode(src); + if (ret) + goto out_unlock; + + ret = f2fs_convert_inline_inode(dst); + if (ret) + goto out_unlock; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(src->i_mapping, + pos_in, pos_in + len); + if (ret) + goto out_unlock; + + ret = filemap_write_and_wait_range(dst->i_mapping, + pos_out, pos_out + len); + if (ret) + goto out_unlock; + + f2fs_balance_fs(sbi, true); + + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + if (src != dst) { + ret = -EBUSY; + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) + goto out_src; + } + + f2fs_lock_op(sbi); + ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, + pos_out >> F2FS_BLKSIZE_BITS, + len >> F2FS_BLKSIZE_BITS, false); + + if (!ret) { + if (dst_max_i_size) + f2fs_i_size_write(dst, dst_max_i_size); + else if (dst_osize != dst->i_size) + f2fs_i_size_write(dst, dst_osize); + } + f2fs_unlock_op(sbi); + + if (src != dst) + up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); +out_src: + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); +out_unlock: + if (src != dst) + inode_unlock(dst); +out: + inode_unlock(src); + return ret; +} + +static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) +{ + struct f2fs_move_range range; + struct fd dst; + int err; + + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&range, (struct f2fs_move_range __user *)arg, + sizeof(range))) + return -EFAULT; + + dst = fdget(range.dst_fd); + if (!dst.file) + return -EBADF; + + if (!(dst.file->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto err_out; + } + + err = mnt_want_write_file(filp); + if (err) + goto err_out; + + err = f2fs_move_file_range(filp, range.pos_in, dst.file, + range.pos_out, range.len); + + mnt_drop_write_file(filp); + if (err) + goto err_out; + + if (copy_to_user((struct f2fs_move_range __user *)arg, + &range, sizeof(range))) + err = -EFAULT; +err_out: + fdput(dst); + return err; +} + +static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct sit_info *sm = SIT_I(sbi); + unsigned int start_segno = 0, end_segno = 0; + unsigned int dev_start_segno = 0, dev_end_segno = 0; + struct f2fs_flush_device range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg, + sizeof(range))) + return -EFAULT; + + if (!f2fs_is_multi_device(sbi) || sbi->s_ndevs - 1 <= range.dev_num || + sbi->segs_per_sec != 1) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Can't flush %u in %d for segs_per_sec %u != 1\n", + range.dev_num, sbi->s_ndevs, + sbi->segs_per_sec); + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (range.dev_num != 0) + dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk); + dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk); + + start_segno = sm->last_victim[FLUSH_DEVICE]; + if (start_segno < dev_start_segno || start_segno >= dev_end_segno) + start_segno = dev_start_segno; + end_segno = min(start_segno + range.segments, dev_end_segno); + + while (start_segno < end_segno) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + sm->last_victim[GC_CB] = end_segno + 1; + sm->last_victim[GC_GREEDY] = end_segno + 1; + sm->last_victim[ALLOC_NEXT] = end_segno + 1; + ret = f2fs_gc(sbi, true, true, start_segno); + if (ret == -EAGAIN) + ret = 0; + else if (ret < 0) + break; + start_segno++; + } +out: + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + u32 sb_feature = le32_to_cpu(F2FS_I_SB(inode)->raw_super->feature); + + /* Must validate to set it with SQLite behavior in Android. */ + sb_feature |= F2FS_FEATURE_ATOMIC_WRITE; + + return put_user(sb_feature, (u32 __user *)arg); +} + +#ifdef CONFIG_QUOTA +static int f2fs_ioc_setproject(struct file *filp, __u32 projid) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct super_block *sb = sbi->sb; + struct dquot *transfer_to[MAXQUOTAS] = {}; + struct page *ipage; + kprojid_t kprojid; + int err; + + if (!f2fs_sb_has_project_quota(sb)) { + if (projid != F2FS_DEF_PROJID) + return -EOPNOTSUPP; + else + return 0; + } + + if (!f2fs_has_extra_attr(inode)) + return -EOPNOTSUPP; + + kprojid = make_kprojid(&init_user_ns, (projid_t)projid); + + if (projid_eq(kprojid, F2FS_I(inode)->i_projid)) + return 0; + + err = -EPERM; + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + return err; + + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + if (!F2FS_FITS_IN_INODE(F2FS_INODE(ipage), fi->i_extra_isize, + i_projid)) { + err = -EOVERFLOW; + f2fs_put_page(ipage, 1); + return err; + } + f2fs_put_page(ipage, 1); + + err = dquot_initialize(inode); + if (err) + return err; + + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); + if (!IS_ERR(transfer_to[PRJQUOTA])) { + err = __dquot_transfer(inode, transfer_to); + dqput(transfer_to[PRJQUOTA]); + if (err) + goto out_dirty; + } + + F2FS_I(inode)->i_projid = kprojid; + inode->i_ctime = current_time(inode); +out_dirty: + f2fs_mark_inode_dirty_sync(inode, true); + return err; +} +#else +static int f2fs_ioc_setproject(struct file *filp, __u32 projid) +{ + if (projid != F2FS_DEF_PROJID) + return -EOPNOTSUPP; + return 0; +} +#endif + +/* Transfer internal flags to xflags */ +static inline __u32 f2fs_iflags_to_xflags(unsigned long iflags) +{ + __u32 xflags = 0; + + if (iflags & F2FS_SYNC_FL) + xflags |= FS_XFLAG_SYNC; + if (iflags & F2FS_IMMUTABLE_FL) + xflags |= FS_XFLAG_IMMUTABLE; + if (iflags & F2FS_APPEND_FL) + xflags |= FS_XFLAG_APPEND; + if (iflags & F2FS_NODUMP_FL) + xflags |= FS_XFLAG_NODUMP; + if (iflags & F2FS_NOATIME_FL) + xflags |= FS_XFLAG_NOATIME; + if (iflags & F2FS_PROJINHERIT_FL) + xflags |= FS_XFLAG_PROJINHERIT; + return xflags; +} + +#define F2FS_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \ + FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \ + FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT) + +/* Transfer xflags flags to internal */ +static inline unsigned long f2fs_xflags_to_iflags(__u32 xflags) +{ + unsigned long iflags = 0; + + if (xflags & FS_XFLAG_SYNC) + iflags |= F2FS_SYNC_FL; + if (xflags & FS_XFLAG_IMMUTABLE) + iflags |= F2FS_IMMUTABLE_FL; + if (xflags & FS_XFLAG_APPEND) + iflags |= F2FS_APPEND_FL; + if (xflags & FS_XFLAG_NODUMP) + iflags |= F2FS_NODUMP_FL; + if (xflags & FS_XFLAG_NOATIME) + iflags |= F2FS_NOATIME_FL; + if (xflags & FS_XFLAG_PROJINHERIT) + iflags |= F2FS_PROJINHERIT_FL; + + return iflags; +} + +static int f2fs_ioc_fsgetxattr(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct fsxattr fa; + + memset(&fa, 0, sizeof(struct fsxattr)); + fa.fsx_xflags = f2fs_iflags_to_xflags(fi->i_flags & + F2FS_FL_USER_VISIBLE); + + if (f2fs_sb_has_project_quota(inode->i_sb)) + fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, + fi->i_projid); + + if (copy_to_user((struct fsxattr __user *)arg, &fa, sizeof(fa))) + return -EFAULT; + return 0; +} + +static int f2fs_ioctl_check_project(struct inode *inode, struct fsxattr *fa) +{ + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. Everything else is allowed in user namespaces. + */ + if (current_user_ns() == &init_user_ns) + return 0; + + if (__kprojid_val(F2FS_I(inode)->i_projid) != fa->fsx_projid) + return -EINVAL; + + if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) { + if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT)) + return -EINVAL; + } else { + if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) + return -EINVAL; + } + + return 0; +} + +static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct fsxattr fa; + unsigned int flags; + int err; + + if (copy_from_user(&fa, (struct fsxattr __user *)arg, sizeof(fa))) + return -EFAULT; + + /* Make sure caller has proper permission */ + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (fa.fsx_xflags & ~F2FS_SUPPORTED_FS_XFLAGS) + return -EOPNOTSUPP; + + flags = f2fs_xflags_to_iflags(fa.fsx_xflags); + if (f2fs_mask_flags(inode->i_mode, flags) != flags) + return -EOPNOTSUPP; + + err = mnt_want_write_file(filp); + if (err) + return err; + + inode_lock(inode); + err = f2fs_ioctl_check_project(inode, &fa); + if (err) + goto out; + flags = (fi->i_flags & ~F2FS_FL_XFLAG_VISIBLE) | + (flags & F2FS_FL_XFLAG_VISIBLE); + err = __f2fs_ioc_setflags(inode, flags); + if (err) + goto out; + + err = f2fs_ioc_setproject(filp, fa.fsx_projid); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return err; +} + +int f2fs_pin_file_control(struct inode *inode, bool inc) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* Use i_gc_failures for normal file as a risk signal. */ + if (inc) + f2fs_i_gc_failures_write(inode, + fi->i_gc_failures[GC_FAILURE_PIN] + 1); + + if (fi->i_gc_failures[GC_FAILURE_PIN] > sbi->gc_pin_file_threshold) { + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: Enable GC = ino %lx after %x GC trials\n", + __func__, inode->i_ino, + fi->i_gc_failures[GC_FAILURE_PIN]); + clear_inode_flag(inode, FI_PIN_FILE); + return -EAGAIN; + } + return 0; +} + +static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u32 pin; + int ret = 0; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(pin, (__u32 __user *)arg)) + return -EFAULT; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (f2fs_readonly(F2FS_I_SB(inode)->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + if (f2fs_should_update_outplace(inode, NULL)) { + ret = -EINVAL; + goto out; + } + + if (!pin) { + clear_inode_flag(inode, FI_PIN_FILE); + f2fs_i_gc_failures_write(inode, 0); + goto done; + } + + if (f2fs_pin_file_control(inode, false)) { + ret = -EAGAIN; + goto out; + } + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + set_inode_flag(inode, FI_PIN_FILE); + ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]; +done: + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u32 pin = 0; + + if (is_inode_flag_set(inode, FI_PIN_FILE)) + pin = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]; + return put_user(pin, (u32 __user *)arg); +} + +int f2fs_precache_extents(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_map_blocks map; + pgoff_t m_next_extent; + loff_t end; + int err; + + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return -EOPNOTSUPP; + + map.m_lblk = 0; + map.m_next_pgofs = NULL; + map.m_next_extent = &m_next_extent; + map.m_seg_type = NO_CHECK_TYPE; + end = F2FS_I_SB(inode)->max_file_blocks; + + while (map.m_lblk < end) { + map.m_len = end - map.m_lblk; + + down_write(&fi->i_gc_rwsem[WRITE]); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE); + up_write(&fi->i_gc_rwsem[WRITE]); + if (err) + return err; + + map.m_lblk = m_next_extent; + } + + return err; +} + +static int f2fs_ioc_precache_extents(struct file *filp, unsigned long arg) +{ + return f2fs_precache_extents(file_inode(filp)); +} + +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) + return -EIO; + + switch (cmd) { + case F2FS_IOC_GETFLAGS: + return f2fs_ioc_getflags(filp, arg); + case F2FS_IOC_SETFLAGS: + return f2fs_ioc_setflags(filp, arg); + case F2FS_IOC_GETVERSION: + return f2fs_ioc_getversion(filp, arg); + case F2FS_IOC_START_ATOMIC_WRITE: + return f2fs_ioc_start_atomic_write(filp); + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + return f2fs_ioc_commit_atomic_write(filp); + case F2FS_IOC_START_VOLATILE_WRITE: + return f2fs_ioc_start_volatile_write(filp); + case F2FS_IOC_RELEASE_VOLATILE_WRITE: + return f2fs_ioc_release_volatile_write(filp); + case F2FS_IOC_ABORT_VOLATILE_WRITE: + return f2fs_ioc_abort_volatile_write(filp); + case F2FS_IOC_SHUTDOWN: + return f2fs_ioc_shutdown(filp, arg); + case FITRIM: + return f2fs_ioc_fitrim(filp, arg); + case F2FS_IOC_SET_ENCRYPTION_POLICY: + return f2fs_ioc_set_encryption_policy(filp, arg); + case F2FS_IOC_GET_ENCRYPTION_POLICY: + return f2fs_ioc_get_encryption_policy(filp, arg); + case F2FS_IOC_GET_ENCRYPTION_PWSALT: + return f2fs_ioc_get_encryption_pwsalt(filp, arg); + case F2FS_IOC_GARBAGE_COLLECT: + return f2fs_ioc_gc(filp, arg); + case F2FS_IOC_GARBAGE_COLLECT_RANGE: + return f2fs_ioc_gc_range(filp, arg); + case F2FS_IOC_WRITE_CHECKPOINT: + return f2fs_ioc_write_checkpoint(filp, arg); + case F2FS_IOC_DEFRAGMENT: + return f2fs_ioc_defragment(filp, arg); + case F2FS_IOC_MOVE_RANGE: + return f2fs_ioc_move_range(filp, arg); + case F2FS_IOC_FLUSH_DEVICE: + return f2fs_ioc_flush_device(filp, arg); + case F2FS_IOC_GET_FEATURES: + return f2fs_ioc_get_features(filp, arg); + case F2FS_IOC_FSGETXATTR: + return f2fs_ioc_fsgetxattr(filp, arg); + case F2FS_IOC_FSSETXATTR: + return f2fs_ioc_fssetxattr(filp, arg); + case F2FS_IOC_GET_PIN_FILE: + return f2fs_ioc_get_pin_file(filp, arg); + case F2FS_IOC_SET_PIN_FILE: + return f2fs_ioc_set_pin_file(filp, arg); + case F2FS_IOC_PRECACHE_EXTENTS: + return f2fs_ioc_precache_extents(filp, arg); + default: + return -ENOTTY; + } +} + +static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + ssize_t ret; + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; + + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } + + ret = generic_write_checks(iocb, from); + if (ret > 0) { + bool preallocated = false; + size_t target_size = 0; + int err; + + if (iov_iter_fault_in_readable(from, iov_iter_count(from))) + set_inode_flag(inode, FI_NO_PREALLOC); + + if ((iocb->ki_flags & IOCB_NOWAIT) && + (iocb->ki_flags & IOCB_DIRECT)) { + if (!f2fs_overwrite_io(inode, iocb->ki_pos, + iov_iter_count(from)) || + f2fs_has_inline_data(inode) || + f2fs_force_buffered_io(inode, WRITE)) { + clear_inode_flag(inode, + FI_NO_PREALLOC); + inode_unlock(inode); + return -EAGAIN; + } + + } else { + preallocated = true; + target_size = iocb->ki_pos + iov_iter_count(from); + + err = f2fs_preallocate_blocks(iocb, from); + if (err) { + clear_inode_flag(inode, FI_NO_PREALLOC); + inode_unlock(inode); + return err; + } + } + ret = __generic_file_write_iter(iocb, from); + clear_inode_flag(inode, FI_NO_PREALLOC); + + /* if we couldn't write data, we should deallocate blocks. */ + if (preallocated && i_size_read(inode) < target_size) + f2fs_truncate(inode); + + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); + } + inode_unlock(inode); + + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} + +#ifdef CONFIG_COMPAT +long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case F2FS_IOC32_GETFLAGS: + cmd = F2FS_IOC_GETFLAGS; + break; + case F2FS_IOC32_SETFLAGS: + cmd = F2FS_IOC_SETFLAGS; + break; + case F2FS_IOC32_GETVERSION: + cmd = F2FS_IOC_GETVERSION; + break; + case F2FS_IOC_START_ATOMIC_WRITE: + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + case F2FS_IOC_START_VOLATILE_WRITE: + case F2FS_IOC_RELEASE_VOLATILE_WRITE: + case F2FS_IOC_ABORT_VOLATILE_WRITE: + case F2FS_IOC_SHUTDOWN: + case F2FS_IOC_SET_ENCRYPTION_POLICY: + case F2FS_IOC_GET_ENCRYPTION_PWSALT: + case F2FS_IOC_GET_ENCRYPTION_POLICY: + case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_GARBAGE_COLLECT_RANGE: + case F2FS_IOC_WRITE_CHECKPOINT: + case F2FS_IOC_DEFRAGMENT: + case F2FS_IOC_MOVE_RANGE: + case F2FS_IOC_FLUSH_DEVICE: + case F2FS_IOC_GET_FEATURES: + case F2FS_IOC_FSGETXATTR: + case F2FS_IOC_FSSETXATTR: + case F2FS_IOC_GET_PIN_FILE: + case F2FS_IOC_SET_PIN_FILE: + case F2FS_IOC_PRECACHE_EXTENTS: + break; + default: + return -ENOIOCTLCMD; + } + return f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + +const struct file_operations f2fs_file_operations = { + .llseek = f2fs_llseek, + .read_iter = generic_file_read_iter, + .write_iter = f2fs_file_write_iter, + .open = f2fs_file_open, + .release = f2fs_release_file, + .mmap = f2fs_file_mmap, + .flush = f2fs_file_flush, + .fsync = f2fs_sync_file, + .fallocate = f2fs_fallocate, + .unlocked_ioctl = f2fs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = f2fs_compat_ioctl, +#endif + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, +}; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c new file mode 100644 index 000000000..ff447bbb5 --- /dev/null +++ b/fs/f2fs/gc.c @@ -0,0 +1,1269 @@ +/* + * fs/f2fs/gc.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/init.h> +#include <linux/f2fs_fs.h> +#include <linux/kthread.h> +#include <linux/delay.h> +#include <linux/freezer.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "gc.h" +#include <trace/events/f2fs.h> + +static int gc_thread_func(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct f2fs_gc_kthread *gc_th = sbi->gc_thread; + wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; + unsigned int wait_ms; + + wait_ms = gc_th->min_sleep_time; + + set_freezable(); + do { + wait_event_interruptible_timeout(*wq, + kthread_should_stop() || freezing(current) || + gc_th->gc_wake, + msecs_to_jiffies(wait_ms)); + + /* give it a try one time */ + if (gc_th->gc_wake) + gc_th->gc_wake = 0; + + if (try_to_freeze()) + continue; + if (kthread_should_stop()) + break; + + if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) { + increase_sleep_time(gc_th, &wait_ms); + continue; + } + + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); + f2fs_stop_checkpoint(sbi, false); + } + + if (!sb_start_write_trylock(sbi->sb)) + continue; + + /* + * [GC triggering condition] + * 0. GC is not conducted currently. + * 1. There are enough dirty segments. + * 2. IO subsystem is idle by checking the # of writeback pages. + * 3. IO subsystem is idle by checking the # of requests in + * bdev's request list. + * + * Note) We have to avoid triggering GCs frequently. + * Because it is possible that some segments can be + * invalidated soon after by user update or deletion. + * So, I'd like to wait some time to collect dirty segments. + */ + if (sbi->gc_mode == GC_URGENT) { + wait_ms = gc_th->urgent_sleep_time; + mutex_lock(&sbi->gc_mutex); + goto do_gc; + } + + if (!mutex_trylock(&sbi->gc_mutex)) + goto next; + + if (!is_idle(sbi)) { + increase_sleep_time(gc_th, &wait_ms); + mutex_unlock(&sbi->gc_mutex); + goto next; + } + + if (has_enough_invalid_blocks(sbi)) + decrease_sleep_time(gc_th, &wait_ms); + else + increase_sleep_time(gc_th, &wait_ms); +do_gc: + stat_inc_bggc_count(sbi); + + /* if return value is not zero, no victim was selected */ + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO)) + wait_ms = gc_th->no_gc_sleep_time; + + trace_f2fs_background_gc(sbi->sb, wait_ms, + prefree_segments(sbi), free_segments(sbi)); + + /* balancing f2fs's metadata periodically */ + f2fs_balance_fs_bg(sbi); +next: + sb_end_write(sbi->sb); + + } while (!kthread_should_stop()); + return 0; +} + +int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) +{ + struct f2fs_gc_kthread *gc_th; + dev_t dev = sbi->sb->s_bdev->bd_dev; + int err = 0; + + gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); + if (!gc_th) { + err = -ENOMEM; + goto out; + } + + gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; + gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; + gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; + gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; + + gc_th->gc_wake= 0; + + sbi->gc_thread = gc_th; + init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); + sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, + "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(gc_th->f2fs_gc_task)) { + err = PTR_ERR(gc_th->f2fs_gc_task); + kfree(gc_th); + sbi->gc_thread = NULL; + } +out: + return err; +} + +void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) +{ + struct f2fs_gc_kthread *gc_th = sbi->gc_thread; + if (!gc_th) + return; + kthread_stop(gc_th->f2fs_gc_task); + kfree(gc_th); + sbi->gc_thread = NULL; +} + +static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type) +{ + int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY; + + switch (sbi->gc_mode) { + case GC_IDLE_CB: + gc_mode = GC_CB; + break; + case GC_IDLE_GREEDY: + case GC_URGENT: + gc_mode = GC_GREEDY; + break; + } + return gc_mode; +} + +static void select_policy(struct f2fs_sb_info *sbi, int gc_type, + int type, struct victim_sel_policy *p) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + + if (p->alloc_mode == SSR) { + p->gc_mode = GC_GREEDY; + p->dirty_segmap = dirty_i->dirty_segmap[type]; + p->max_search = dirty_i->nr_dirty[type]; + p->ofs_unit = 1; + } else { + p->gc_mode = select_gc_type(sbi, gc_type); + p->dirty_segmap = dirty_i->dirty_segmap[DIRTY]; + p->max_search = dirty_i->nr_dirty[DIRTY]; + p->ofs_unit = sbi->segs_per_sec; + } + + /* we need to check every dirty segments in the FG_GC case */ + if (gc_type != FG_GC && + (sbi->gc_mode != GC_URGENT) && + p->max_search > sbi->max_victim_search) + p->max_search = sbi->max_victim_search; + + /* let's select beginning hot/small space first in no_heap mode*/ + if (test_opt(sbi, NOHEAP) && + (type == CURSEG_HOT_DATA || IS_NODESEG(type))) + p->offset = 0; + else + p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; +} + +static unsigned int get_max_cost(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + /* SSR allocates in a segment unit */ + if (p->alloc_mode == SSR) + return sbi->blocks_per_seg; + if (p->gc_mode == GC_GREEDY) + return 2 * sbi->blocks_per_seg * p->ofs_unit; + else if (p->gc_mode == GC_CB) + return UINT_MAX; + else /* No other gc_mode */ + return 0; +} + +static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int secno; + + /* + * If the gc_type is FG_GC, we can select victim segments + * selected by background GC before. + * Those segments guarantee they have small valid blocks. + */ + for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { + if (sec_usage_check(sbi, secno)) + continue; + clear_bit(secno, dirty_i->victim_secmap); + return GET_SEG_FROM_SEC(sbi, secno); + } + return NULL_SEGNO; +} + +static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start = GET_SEG_FROM_SEC(sbi, secno); + unsigned long long mtime = 0; + unsigned int vblocks; + unsigned char age = 0; + unsigned char u; + unsigned int i; + + for (i = 0; i < sbi->segs_per_sec; i++) + mtime += get_seg_entry(sbi, start + i)->mtime; + vblocks = get_valid_blocks(sbi, segno, true); + + mtime = div_u64(mtime, sbi->segs_per_sec); + vblocks = div_u64(vblocks, sbi->segs_per_sec); + + u = (vblocks * 100) >> sbi->log_blocks_per_seg; + + /* Handle if the system time has changed by the user */ + if (mtime < sit_i->min_mtime) + sit_i->min_mtime = mtime; + if (mtime > sit_i->max_mtime) + sit_i->max_mtime = mtime; + if (sit_i->max_mtime != sit_i->min_mtime) + age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime), + sit_i->max_mtime - sit_i->min_mtime); + + return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); +} + +static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, + unsigned int segno, struct victim_sel_policy *p) +{ + if (p->alloc_mode == SSR) + return get_seg_entry(sbi, segno)->ckpt_valid_blocks; + + /* alloc_mode == LFS */ + if (p->gc_mode == GC_GREEDY) + return get_valid_blocks(sbi, segno, true); + else + return get_cb_cost(sbi, segno); +} + +static unsigned int count_bits(const unsigned long *addr, + unsigned int offset, unsigned int len) +{ + unsigned int end = offset + len, sum = 0; + + while (offset < end) { + if (test_bit(offset++, addr)) + ++sum; + } + return sum; +} + +/* + * This function is called from two paths. + * One is garbage collection and the other is SSR segment selection. + * When it is called during GC, it just gets a victim segment + * and it does not remove it from dirty seglist. + * When it is called from SSR segment selection, it finds a segment + * which has minimum valid blocks and removes it from dirty seglist. + */ +static int get_victim_by_default(struct f2fs_sb_info *sbi, + unsigned int *result, int gc_type, int type, char alloc_mode) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct sit_info *sm = SIT_I(sbi); + struct victim_sel_policy p; + unsigned int secno, last_victim; + unsigned int last_segment = MAIN_SEGS(sbi); + unsigned int nsearched = 0; + + mutex_lock(&dirty_i->seglist_lock); + + p.alloc_mode = alloc_mode; + select_policy(sbi, gc_type, type, &p); + + p.min_segno = NULL_SEGNO; + p.min_cost = get_max_cost(sbi, &p); + + if (*result != NULL_SEGNO) { + if (get_valid_blocks(sbi, *result, false) && + !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) + p.min_segno = *result; + goto out; + } + + if (p.max_search == 0) + goto out; + + last_victim = sm->last_victim[p.gc_mode]; + if (p.alloc_mode == LFS && gc_type == FG_GC) { + p.min_segno = check_bg_victims(sbi); + if (p.min_segno != NULL_SEGNO) + goto got_it; + } + + while (1) { + unsigned long cost; + unsigned int segno; + + segno = find_next_bit(p.dirty_segmap, last_segment, p.offset); + if (segno >= last_segment) { + if (sm->last_victim[p.gc_mode]) { + last_segment = + sm->last_victim[p.gc_mode]; + sm->last_victim[p.gc_mode] = 0; + p.offset = 0; + continue; + } + break; + } + + p.offset = segno + p.ofs_unit; + if (p.ofs_unit > 1) { + p.offset -= segno % p.ofs_unit; + nsearched += count_bits(p.dirty_segmap, + p.offset - p.ofs_unit, + p.ofs_unit); + } else { + nsearched++; + } + + secno = GET_SEC_FROM_SEG(sbi, segno); + + if (sec_usage_check(sbi, secno)) + goto next; + if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) + goto next; + + cost = get_gc_cost(sbi, segno, &p); + + if (p.min_cost > cost) { + p.min_segno = segno; + p.min_cost = cost; + } +next: + if (nsearched >= p.max_search) { + if (!sm->last_victim[p.gc_mode] && segno <= last_victim) + sm->last_victim[p.gc_mode] = last_victim + 1; + else + sm->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi); + break; + } + } + if (p.min_segno != NULL_SEGNO) { +got_it: + if (p.alloc_mode == LFS) { + secno = GET_SEC_FROM_SEG(sbi, p.min_segno); + if (gc_type == FG_GC) + sbi->cur_victim_sec = secno; + else + set_bit(secno, dirty_i->victim_secmap); + } + *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; + + trace_f2fs_get_victim(sbi->sb, type, gc_type, &p, + sbi->cur_victim_sec, + prefree_segments(sbi), free_segments(sbi)); + } +out: + mutex_unlock(&dirty_i->seglist_lock); + + return (p.min_segno == NULL_SEGNO) ? 0 : 1; +} + +static const struct victim_selection default_v_ops = { + .get_victim = get_victim_by_default, +}; + +static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino) +{ + struct inode_entry *ie; + + ie = radix_tree_lookup(&gc_list->iroot, ino); + if (ie) + return ie->inode; + return NULL; +} + +static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) +{ + struct inode_entry *new_ie; + + if (inode == find_gc_inode(gc_list, inode->i_ino)) { + iput(inode); + return; + } + new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS); + new_ie->inode = inode; + + f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie); + list_add_tail(&new_ie->list, &gc_list->ilist); +} + +static void put_gc_inode(struct gc_inode_list *gc_list) +{ + struct inode_entry *ie, *next_ie; + list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) { + radix_tree_delete(&gc_list->iroot, ie->inode->i_ino); + iput(ie->inode); + list_del(&ie->list); + kmem_cache_free(f2fs_inode_entry_slab, ie); + } +} + +static int check_valid_map(struct f2fs_sb_info *sbi, + unsigned int segno, int offset) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct seg_entry *sentry; + int ret; + + down_read(&sit_i->sentry_lock); + sentry = get_seg_entry(sbi, segno); + ret = f2fs_test_bit(offset, sentry->cur_valid_map); + up_read(&sit_i->sentry_lock); + return ret; +} + +/* + * This function compares node address got in summary with that in NAT. + * On validity, copy that node with cold status, otherwise (invalid node) + * ignore that. + */ +static void gc_node_segment(struct f2fs_sb_info *sbi, + struct f2fs_summary *sum, unsigned int segno, int gc_type) +{ + struct f2fs_summary *entry; + block_t start_addr; + int off; + int phase = 0; + bool fggc = (gc_type == FG_GC); + + start_addr = START_BLOCK(sbi, segno); + +next_step: + entry = sum; + + if (fggc && phase == 2) + atomic_inc(&sbi->wb_sync_req[NODE]); + + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { + nid_t nid = le32_to_cpu(entry->nid); + struct page *node_page; + struct node_info ni; + + /* stop BG_GC if there is not enough free sections. */ + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) + return; + + if (check_valid_map(sbi, segno, off) == 0) + continue; + + if (phase == 0) { + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { + f2fs_ra_node_page(sbi, nid); + continue; + } + + /* phase == 2 */ + node_page = f2fs_get_node_page(sbi, nid); + if (IS_ERR(node_page)) + continue; + + /* block may become invalid during f2fs_get_node_page */ + if (check_valid_map(sbi, segno, off) == 0) { + f2fs_put_page(node_page, 1); + continue; + } + + if (f2fs_get_node_info(sbi, nid, &ni)) { + f2fs_put_page(node_page, 1); + continue; + } + + if (ni.blk_addr != start_addr + off) { + f2fs_put_page(node_page, 1); + continue; + } + + f2fs_move_node_page(node_page, gc_type); + stat_inc_node_blk_count(sbi, 1, gc_type); + } + + if (++phase < 3) + goto next_step; + + if (fggc) + atomic_dec(&sbi->wb_sync_req[NODE]); +} + +/* + * Calculate start block index indicating the given node offset. + * Be careful, caller should give this node offset only indicating direct node + * blocks. If any node offsets, which point the other types of node blocks such + * as indirect or double indirect node blocks, are given, it must be a caller's + * bug. + */ +block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode) +{ + unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; + unsigned int bidx; + + if (node_ofs == 0) + return 0; + + if (node_ofs <= 2) { + bidx = node_ofs - 1; + } else if (node_ofs <= indirect_blks) { + int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 2 - dec; + } else { + int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 5 - dec; + } + return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode); +} + +static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + struct node_info *dni, block_t blkaddr, unsigned int *nofs) +{ + struct page *node_page; + nid_t nid; + unsigned int ofs_in_node; + block_t source_blkaddr; + + nid = le32_to_cpu(sum->nid); + ofs_in_node = le16_to_cpu(sum->ofs_in_node); + + node_page = f2fs_get_node_page(sbi, nid); + if (IS_ERR(node_page)) + return false; + + if (f2fs_get_node_info(sbi, nid, dni)) { + f2fs_put_page(node_page, 1); + return false; + } + + if (sum->version != dni->version) { + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: valid data with mismatched node version.", + __func__); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + + if (f2fs_check_nid_range(sbi, dni->ino)) { + f2fs_put_page(node_page, 1); + return false; + } + + *nofs = ofs_of_node(node_page); + source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node); + f2fs_put_page(node_page, 1); + + if (source_blkaddr != blkaddr) + return false; + return true; +} + +static int ra_data_block(struct inode *inode, pgoff_t index) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; + struct dnode_of_data dn; + struct page *page; + struct extent_info ei = {0, 0, 0}; + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = inode->i_ino, + .type = DATA, + .temp = COLD, + .op = REQ_OP_READ, + .op_flags = 0, + .encrypted_page = NULL, + .in_list = false, + .retry = false, + }; + int err; + + page = f2fs_grab_cache_page(mapping, index, true); + if (!page) + return -ENOMEM; + + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + goto got_it; + } + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) + goto put_page; + f2fs_put_dnode(&dn); + + if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, + DATA_GENERIC))) { + err = -EFSCORRUPTED; + goto put_page; + } +got_it: + /* read page */ + fio.page = page; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; + + fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi), + dn.data_blkaddr, + FGP_LOCK | FGP_CREAT, GFP_NOFS); + if (!fio.encrypted_page) { + err = -ENOMEM; + goto put_page; + } + + err = f2fs_submit_page_bio(&fio); + if (err) + goto put_encrypted_page; + f2fs_put_page(fio.encrypted_page, 0); + f2fs_put_page(page, 1); + return 0; +put_encrypted_page: + f2fs_put_page(fio.encrypted_page, 1); +put_page: + f2fs_put_page(page, 1); + return err; +} + +/* + * Move data block via META_MAPPING while keeping locked data page. + * This can be used to move blocks, aka LBAs, directly on disk. + */ +static void move_data_block(struct inode *inode, block_t bidx, + int gc_type, unsigned int segno, int off) +{ + struct f2fs_io_info fio = { + .sbi = F2FS_I_SB(inode), + .ino = inode->i_ino, + .type = DATA, + .temp = COLD, + .op = REQ_OP_READ, + .op_flags = 0, + .encrypted_page = NULL, + .in_list = false, + .retry = false, + }; + struct dnode_of_data dn; + struct f2fs_summary sum; + struct node_info ni; + struct page *page, *mpage; + block_t newaddr; + int err; + bool lfs_mode = test_opt(fio.sbi, LFS); + + /* do not read out */ + page = f2fs_grab_cache_page(inode->i_mapping, bidx, false); + if (!page) + return; + + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + + if (f2fs_is_atomic_file(inode)) { + F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; + F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; + goto out; + } + + if (f2fs_is_pinned_file(inode)) { + f2fs_pin_file_control(inode, true); + goto out; + } + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE); + if (err) + goto out; + + if (unlikely(dn.data_blkaddr == NULL_ADDR)) { + ClearPageUptodate(page); + goto put_out; + } + + /* + * don't cache encrypted data into meta inode until previous dirty + * data were writebacked to avoid racing between GC and flush. + */ + f2fs_wait_on_page_writeback(page, DATA, true); + + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + if (err) + goto put_out; + + set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); + + /* read page */ + fio.page = page; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; + + if (lfs_mode) + down_write(&fio.sbi->io_order_lock); + + f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, + &sum, CURSEG_COLD_DATA, NULL, false); + + fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), + newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); + if (!fio.encrypted_page) { + err = -ENOMEM; + goto recover_block; + } + + mpage = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), + fio.old_blkaddr, FGP_LOCK, GFP_NOFS); + if (mpage) { + bool updated = false; + + if (PageUptodate(mpage)) { + memcpy(page_address(fio.encrypted_page), + page_address(mpage), PAGE_SIZE); + updated = true; + } + f2fs_put_page(mpage, 1); + invalidate_mapping_pages(META_MAPPING(fio.sbi), + fio.old_blkaddr, fio.old_blkaddr); + if (updated) + goto write_page; + } + + err = f2fs_submit_page_bio(&fio); + if (err) + goto put_page_out; + + /* write page */ + lock_page(fio.encrypted_page); + + if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) { + err = -EIO; + goto put_page_out; + } + if (unlikely(!PageUptodate(fio.encrypted_page))) { + err = -EIO; + goto put_page_out; + } + +write_page: + set_page_dirty(fio.encrypted_page); + f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true); + if (clear_page_dirty_for_io(fio.encrypted_page)) + dec_page_count(fio.sbi, F2FS_DIRTY_META); + + set_page_writeback(fio.encrypted_page); + ClearPageError(page); + + /* allocate block address */ + f2fs_wait_on_page_writeback(dn.node_page, NODE, true); + + fio.op = REQ_OP_WRITE; + fio.op_flags = REQ_SYNC; + fio.new_blkaddr = newaddr; + f2fs_submit_page_write(&fio); + if (fio.retry) { + if (PageWriteback(fio.encrypted_page)) + end_page_writeback(fio.encrypted_page); + goto put_page_out; + } + + f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); + + f2fs_update_data_blkaddr(&dn, newaddr); + set_inode_flag(inode, FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); +put_page_out: + f2fs_put_page(fio.encrypted_page, 1); +recover_block: + if (lfs_mode) + up_write(&fio.sbi->io_order_lock); + if (err) + f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, + true, true); +put_out: + f2fs_put_dnode(&dn); +out: + f2fs_put_page(page, 1); +} + +static void move_data_page(struct inode *inode, block_t bidx, int gc_type, + unsigned int segno, int off) +{ + struct page *page; + + page = f2fs_get_lock_data_page(inode, bidx, true); + if (IS_ERR(page)) + return; + + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + + if (f2fs_is_atomic_file(inode)) { + F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; + F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; + goto out; + } + if (f2fs_is_pinned_file(inode)) { + if (gc_type == FG_GC) + f2fs_pin_file_control(inode, true); + goto out; + } + + if (gc_type == BG_GC) { + if (PageWriteback(page)) + goto out; + set_page_dirty(page); + set_cold_data(page); + } else { + struct f2fs_io_info fio = { + .sbi = F2FS_I_SB(inode), + .ino = inode->i_ino, + .type = DATA, + .temp = COLD, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC, + .old_blkaddr = NULL_ADDR, + .page = page, + .encrypted_page = NULL, + .need_lock = LOCK_REQ, + .io_type = FS_GC_DATA_IO, + }; + bool is_dirty = PageDirty(page); + int err; + +retry: + set_page_dirty(page); + f2fs_wait_on_page_writeback(page, DATA, true); + if (clear_page_dirty_for_io(page)) { + inode_dec_dirty_pages(inode); + f2fs_remove_dirty_inode(inode); + } + + set_cold_data(page); + + err = f2fs_do_write_data_page(&fio); + if (err) { + clear_cold_data(page); + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + if (is_dirty) + set_page_dirty(page); + } + } +out: + f2fs_put_page(page, 1); +} + +/* + * This function tries to get parent node of victim data block, and identifies + * data block validity. If the block is valid, copy that with cold status and + * modify parent node. + * If the parent node is not valid or the data block address is different, + * the victim data block is ignored. + */ +static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + struct gc_inode_list *gc_list, unsigned int segno, int gc_type) +{ + struct super_block *sb = sbi->sb; + struct f2fs_summary *entry; + block_t start_addr; + int off; + int phase = 0; + + start_addr = START_BLOCK(sbi, segno); + +next_step: + entry = sum; + + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { + struct page *data_page; + struct inode *inode; + struct node_info dni; /* dnode info for the data */ + unsigned int ofs_in_node, nofs; + block_t start_bidx; + nid_t nid = le32_to_cpu(entry->nid); + + /* stop BG_GC if there is not enough free sections. */ + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) + return; + + if (check_valid_map(sbi, segno, off) == 0) + continue; + + if (phase == 0) { + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { + f2fs_ra_node_page(sbi, nid); + continue; + } + + /* Get an inode by ino with checking validity */ + if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs)) + continue; + + if (phase == 2) { + f2fs_ra_node_page(sbi, dni.ino); + continue; + } + + ofs_in_node = le16_to_cpu(entry->ofs_in_node); + + if (phase == 3) { + inode = f2fs_iget(sb, dni.ino); + if (IS_ERR(inode) || is_bad_inode(inode)) + continue; + + if (!down_write_trylock( + &F2FS_I(inode)->i_gc_rwsem[WRITE])) { + iput(inode); + sbi->skipped_gc_rwsem++; + continue; + } + + start_bidx = f2fs_start_bidx_of_node(nofs, inode) + + ofs_in_node; + + if (f2fs_post_read_required(inode)) { + int err = ra_data_block(inode, start_bidx); + + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (err) { + iput(inode); + continue; + } + add_gc_inode(gc_list, inode); + continue; + } + + data_page = f2fs_get_read_data_page(inode, + start_bidx, REQ_RAHEAD, true); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (IS_ERR(data_page)) { + iput(inode); + continue; + } + + f2fs_put_page(data_page, 0); + add_gc_inode(gc_list, inode); + continue; + } + + /* phase 4 */ + inode = find_gc_inode(gc_list, dni.ino); + if (inode) { + struct f2fs_inode_info *fi = F2FS_I(inode); + bool locked = false; + + if (S_ISREG(inode->i_mode)) { + if (!down_write_trylock(&fi->i_gc_rwsem[READ])) { + sbi->skipped_gc_rwsem++; + continue; + } + if (!down_write_trylock( + &fi->i_gc_rwsem[WRITE])) { + sbi->skipped_gc_rwsem++; + up_write(&fi->i_gc_rwsem[READ]); + continue; + } + locked = true; + + /* wait for all inflight aio data */ + inode_dio_wait(inode); + } + + start_bidx = f2fs_start_bidx_of_node(nofs, inode) + + ofs_in_node; + if (f2fs_post_read_required(inode)) + move_data_block(inode, start_bidx, gc_type, + segno, off); + else + move_data_page(inode, start_bidx, gc_type, + segno, off); + + if (locked) { + up_write(&fi->i_gc_rwsem[WRITE]); + up_write(&fi->i_gc_rwsem[READ]); + } + + stat_inc_data_blk_count(sbi, 1, gc_type); + } + } + + if (++phase < 5) + goto next_step; +} + +static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, + int gc_type) +{ + struct sit_info *sit_i = SIT_I(sbi); + int ret; + + down_write(&sit_i->sentry_lock); + ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, + NO_CHECK_TYPE, LFS); + up_write(&sit_i->sentry_lock); + return ret; +} + +static int do_garbage_collect(struct f2fs_sb_info *sbi, + unsigned int start_segno, + struct gc_inode_list *gc_list, int gc_type) +{ + struct page *sum_page; + struct f2fs_summary_block *sum; + struct blk_plug plug; + unsigned int segno = start_segno; + unsigned int end_segno = start_segno + sbi->segs_per_sec; + int seg_freed = 0; + unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? + SUM_TYPE_DATA : SUM_TYPE_NODE; + + /* readahead multi ssa blocks those have contiguous address */ + if (sbi->segs_per_sec > 1) + f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), + sbi->segs_per_sec, META_SSA, true); + + /* reference all summary page */ + while (segno < end_segno) { + sum_page = f2fs_get_sum_page(sbi, segno++); + unlock_page(sum_page); + } + + blk_start_plug(&plug); + + for (segno = start_segno; segno < end_segno; segno++) { + + /* find segment summary of victim */ + sum_page = find_get_page(META_MAPPING(sbi), + GET_SUM_BLOCK(sbi, segno)); + f2fs_put_page(sum_page, 0); + + if (get_valid_blocks(sbi, segno, false) == 0) + goto freed; + if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) + goto next; + + sum = page_address(sum_page); + if (type != GET_SUM_TYPE((&sum->footer))) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent segment (%u) " + "type [%d, %d] in SSA and SIT", + segno, type, GET_SUM_TYPE((&sum->footer))); + set_sbi_flag(sbi, SBI_NEED_FSCK); + goto next; + } + + /* + * this is to avoid deadlock: + * - lock_page(sum_page) - f2fs_replace_block + * - check_valid_map() - down_write(sentry_lock) + * - down_read(sentry_lock) - change_curseg() + * - lock_page(sum_page) + */ + if (type == SUM_TYPE_NODE) + gc_node_segment(sbi, sum->entries, segno, gc_type); + else + gc_data_segment(sbi, sum->entries, gc_list, segno, + gc_type); + + stat_inc_seg_count(sbi, type, gc_type); + +freed: + if (gc_type == FG_GC && + get_valid_blocks(sbi, segno, false) == 0) + seg_freed++; +next: + f2fs_put_page(sum_page, 0); + } + + if (gc_type == FG_GC) + f2fs_submit_merged_write(sbi, + (type == SUM_TYPE_NODE) ? NODE : DATA); + + blk_finish_plug(&plug); + + stat_inc_call_count(sbi->stat_info); + + return seg_freed; +} + +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, + bool background, unsigned int segno) +{ + int gc_type = sync ? FG_GC : BG_GC; + int sec_freed = 0, seg_freed = 0, total_freed = 0; + int ret = 0; + struct cp_control cpc; + unsigned int init_segno = segno; + struct gc_inode_list gc_list = { + .ilist = LIST_HEAD_INIT(gc_list.ilist), + .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), + }; + unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; + unsigned long long first_skipped; + unsigned int skipped_round = 0, round = 0; + + trace_f2fs_gc_begin(sbi->sb, sync, background, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + + cpc.reason = __get_cp_reason(sbi); + sbi->skipped_gc_rwsem = 0; + first_skipped = last_skipped; +gc_more: + if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { + ret = -EINVAL; + goto stop; + } + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; + goto stop; + } + + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) { + /* + * For example, if there are many prefree_segments below given + * threshold, we can make them free by checkpoint. Then, we + * secure free segments which doesn't need fggc any more. + */ + if (prefree_segments(sbi)) { + ret = f2fs_write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } + if (has_not_enough_free_secs(sbi, 0, 0)) + gc_type = FG_GC; + } + + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + if (gc_type == BG_GC && !background) { + ret = -EINVAL; + goto stop; + } + if (!__get_victim(sbi, &segno, gc_type)) { + ret = -ENODATA; + goto stop; + } + + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); + if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) + sec_freed++; + total_freed += seg_freed; + + if (gc_type == FG_GC) { + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || + sbi->skipped_gc_rwsem) + skipped_round++; + last_skipped = sbi->skipped_atomic_files[FG_GC]; + round++; + } + + if (gc_type == FG_GC) + sbi->cur_victim_sec = NULL_SEGNO; + + if (sync) + goto stop; + + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + if (skipped_round <= MAX_SKIP_GC_COUNT || + skipped_round * 2 < round) { + segno = NULL_SEGNO; + goto gc_more; + } + + if (first_skipped < last_skipped && + (last_skipped - first_skipped) > + sbi->skipped_gc_rwsem) { + f2fs_drop_inmem_pages_all(sbi, true); + segno = NULL_SEGNO; + goto gc_more; + } + if (gc_type == FG_GC) + ret = f2fs_write_checkpoint(sbi, &cpc); + } +stop: + SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + + trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + + mutex_unlock(&sbi->gc_mutex); + + put_gc_inode(&gc_list); + + if (sync && !ret) + ret = sec_freed ? 0 : -EAGAIN; + return ret; +} + +void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) +{ + DIRTY_I(sbi)->v_ops = &default_v_ops; + + sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES; + + /* give warm/cold data area from slower device */ + if (f2fs_is_multi_device(sbi) && sbi->segs_per_sec == 1) + SIT_I(sbi)->last_victim[ALLOC_NEXT] = + GET_SEGNO(sbi, FDEV(0).end_blk) + 1; +} diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h new file mode 100644 index 000000000..c8619e408 --- /dev/null +++ b/fs/f2fs/gc.h @@ -0,0 +1,113 @@ +/* + * fs/f2fs/gc.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define GC_THREAD_MIN_WB_PAGES 1 /* + * a threshold to determine + * whether IO subsystem is idle + * or not + */ +#define DEF_GC_THREAD_URGENT_SLEEP_TIME 500 /* 500 ms */ +#define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ +#define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 +#define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ +#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ +#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ + +#define DEF_GC_FAILED_PINNED_FILES 2048 + +/* Search max. number of dirty segments to select a victim segment */ +#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ + +struct f2fs_gc_kthread { + struct task_struct *f2fs_gc_task; + wait_queue_head_t gc_wait_queue_head; + + /* for gc sleep time */ + unsigned int urgent_sleep_time; + unsigned int min_sleep_time; + unsigned int max_sleep_time; + unsigned int no_gc_sleep_time; + + /* for changing gc mode */ + unsigned int gc_wake; +}; + +struct gc_inode_list { + struct list_head ilist; + struct radix_tree_root iroot; +}; + +/* + * inline functions + */ +static inline block_t free_user_blocks(struct f2fs_sb_info *sbi) +{ + if (free_segments(sbi) < overprovision_segments(sbi)) + return 0; + else + return (free_segments(sbi) - overprovision_segments(sbi)) + << sbi->log_blocks_per_seg; +} + +static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi) +{ + return (long)(sbi->user_block_count * LIMIT_INVALID_BLOCK) / 100; +} + +static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) +{ + block_t reclaimable_user_blocks = sbi->user_block_count - + written_block_count(sbi); + return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; +} + +static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th, + unsigned int *wait) +{ + unsigned int min_time = gc_th->min_sleep_time; + unsigned int max_time = gc_th->max_sleep_time; + + if (*wait == gc_th->no_gc_sleep_time) + return; + + if ((long long)*wait + (long long)min_time > (long long)max_time) + *wait = max_time; + else + *wait += min_time; +} + +static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, + unsigned int *wait) +{ + unsigned int min_time = gc_th->min_sleep_time; + + if (*wait == gc_th->no_gc_sleep_time) + *wait = gc_th->max_sleep_time; + + if ((long long)*wait - (long long)min_time < (long long)min_time) + *wait = min_time; + else + *wait -= min_time; +} + +static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) +{ + block_t invalid_user_blocks = sbi->user_block_count - + written_block_count(sbi); + /* + * Background GC is triggered with the following conditions. + * 1. There are a number of invalid blocks. + * 2. There is not enough free space. + */ + if (invalid_user_blocks > limit_invalid_user_blocks(sbi) && + free_user_blocks(sbi) < limit_free_user_blocks(sbi)) + return true; + return false; +} diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c new file mode 100644 index 000000000..eb2e031ea --- /dev/null +++ b/fs/f2fs/hash.c @@ -0,0 +1,108 @@ +/* + * fs/f2fs/hash.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext3/hash.c + * + * Copyright (C) 2002 by Theodore Ts'o + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/cryptohash.h> +#include <linux/pagemap.h> + +#include "f2fs.h" + +/* + * Hashing code copied from ext3 + */ +#define DELTA 0x9E3779B9 + +static void TEA_transform(unsigned int buf[4], unsigned int const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while (--n); + + buf[0] += b0; + buf[1] += b1; +} + +static void str2hashbuf(const unsigned char *msg, size_t len, + unsigned int *buf, int num) +{ + unsigned pad, val; + int i; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num * 4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = msg[i] + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname) +{ + __u32 hash; + f2fs_hash_t f2fs_hash; + const unsigned char *p; + __u32 in[8], buf[4]; + const unsigned char *name = name_info->name; + size_t len = name_info->len; + + /* encrypted bigname case */ + if (fname && !fname->disk_name.name) + return cpu_to_le32(fname->hash); + + if (is_dot_dotdot(name_info)) + return 0; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; + buf[1] = 0xefcdab89; + buf[2] = 0x98badcfe; + buf[3] = 0x10325476; + + p = name; + while (1) { + str2hashbuf(p, len, in, 4); + TEA_transform(buf, in); + p += 16; + if (len <= 16) + break; + len -= 16; + } + hash = buf[0]; + f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT); + return f2fs_hash; +} diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c new file mode 100644 index 000000000..6bf78cf63 --- /dev/null +++ b/fs/f2fs/inline.c @@ -0,0 +1,735 @@ +/* + * fs/f2fs/inline.c + * Copyright (c) 2013, Intel Corporation + * Authors: Huajun Li <huajun.li@intel.com> + * Haicheng Li <haicheng.li@intel.com> + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "node.h" + +bool f2fs_may_inline_data(struct inode *inode) +{ + if (f2fs_is_atomic_file(inode)) + return false; + + if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) + return false; + + if (i_size_read(inode) > MAX_INLINE_DATA(inode)) + return false; + + if (f2fs_post_read_required(inode)) + return false; + + return true; +} + +bool f2fs_may_inline_dentry(struct inode *inode) +{ + if (!test_opt(F2FS_I_SB(inode), INLINE_DENTRY)) + return false; + + if (!S_ISDIR(inode->i_mode)) + return false; + + return true; +} + +void f2fs_do_read_inline_data(struct page *page, struct page *ipage) +{ + struct inode *inode = page->mapping->host; + void *src_addr, *dst_addr; + + if (PageUptodate(page)) + return; + + f2fs_bug_on(F2FS_P_SB(page), page->index); + + zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); + + /* Copy the whole inline data block */ + src_addr = inline_data_addr(inode, ipage); + dst_addr = kmap_atomic(page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); + flush_dcache_page(page); + kunmap_atomic(dst_addr); + if (!PageUptodate(page)) + SetPageUptodate(page); +} + +void f2fs_truncate_inline_inode(struct inode *inode, + struct page *ipage, u64 from) +{ + void *addr; + + if (from >= MAX_INLINE_DATA(inode)) + return; + + addr = inline_data_addr(inode, ipage); + + f2fs_wait_on_page_writeback(ipage, NODE, true); + memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); + set_page_dirty(ipage); + + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); +} + +int f2fs_read_inline_data(struct inode *inode, struct page *page) +{ + struct page *ipage; + + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) { + unlock_page(page); + return PTR_ERR(ipage); + } + + if (!f2fs_has_inline_data(inode)) { + f2fs_put_page(ipage, 1); + return -EAGAIN; + } + + if (page->index) + zero_user_segment(page, 0, PAGE_SIZE); + else + f2fs_do_read_inline_data(page, ipage); + + if (!PageUptodate(page)) + SetPageUptodate(page); + f2fs_put_page(ipage, 1); + unlock_page(page); + return 0; +} + +int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) +{ + struct f2fs_io_info fio = { + .sbi = F2FS_I_SB(dn->inode), + .ino = dn->inode->i_ino, + .type = DATA, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_PRIO, + .page = page, + .encrypted_page = NULL, + .io_type = FS_DATA_IO, + }; + struct node_info ni; + int dirty, err; + + if (!f2fs_exist_data(dn->inode)) + goto clear_out; + + err = f2fs_reserve_block(dn, 0); + if (err) + return err; + + err = f2fs_get_node_info(fio.sbi, dn->nid, &ni); + if (err) { + f2fs_truncate_data_blocks_range(dn, 1); + f2fs_put_dnode(dn); + return err; + } + + fio.version = ni.version; + + if (unlikely(dn->data_blkaddr != NEW_ADDR)) { + f2fs_put_dnode(dn); + set_sbi_flag(fio.sbi, SBI_NEED_FSCK); + f2fs_msg(fio.sbi->sb, KERN_WARNING, + "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, " + "run fsck to fix.", + __func__, dn->inode->i_ino, dn->data_blkaddr); + return -EFSCORRUPTED; + } + + f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page)); + + f2fs_do_read_inline_data(page, dn->inode_page); + set_page_dirty(page); + + /* clear dirty state */ + dirty = clear_page_dirty_for_io(page); + + /* write data page to try to make data consistent */ + set_page_writeback(page); + ClearPageError(page); + fio.old_blkaddr = dn->data_blkaddr; + set_inode_flag(dn->inode, FI_HOT_DATA); + f2fs_outplace_write_data(dn, &fio); + f2fs_wait_on_page_writeback(page, DATA, true); + if (dirty) { + inode_dec_dirty_pages(dn->inode); + f2fs_remove_dirty_inode(dn->inode); + } + + /* this converted inline_data should be recovered. */ + set_inode_flag(dn->inode, FI_APPEND_WRITE); + + /* clear inline data and flag after data writeback */ + f2fs_truncate_inline_inode(dn->inode, dn->inode_page, 0); + clear_inline_node(dn->inode_page); +clear_out: + stat_dec_inline_inode(dn->inode); + clear_inode_flag(dn->inode, FI_INLINE_DATA); + f2fs_put_dnode(dn); + return 0; +} + +int f2fs_convert_inline_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct page *ipage, *page; + int err = 0; + + if (!f2fs_has_inline_data(inode)) + return 0; + + err = dquot_initialize(inode); + if (err) + return err; + + page = f2fs_grab_cache_page(inode->i_mapping, 0, false); + if (!page) + return -ENOMEM; + + f2fs_lock_op(sbi); + + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out; + } + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_has_inline_data(inode)) + err = f2fs_convert_inline_page(&dn, page); + + f2fs_put_dnode(&dn); +out: + f2fs_unlock_op(sbi); + + f2fs_put_page(page, 1); + + if (!err) + f2fs_balance_fs(sbi, dn.node_changed); + + return err; +} + +int f2fs_write_inline_data(struct inode *inode, struct page *page) +{ + void *src_addr, *dst_addr; + struct dnode_of_data dn; + int err; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, 0, LOOKUP_NODE); + if (err) + return err; + + if (!f2fs_has_inline_data(inode)) { + f2fs_put_dnode(&dn); + return -EAGAIN; + } + + f2fs_bug_on(F2FS_I_SB(inode), page->index); + + f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); + src_addr = kmap_atomic(page); + dst_addr = inline_data_addr(inode, dn.inode_page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); + kunmap_atomic(src_addr); + set_page_dirty(dn.inode_page); + + f2fs_clear_radix_tree_dirty_tag(page); + + set_inode_flag(inode, FI_APPEND_WRITE); + set_inode_flag(inode, FI_DATA_EXIST); + + clear_inline_node(dn.inode_page); + f2fs_put_dnode(&dn); + return 0; +} + +int f2fs_recover_inline_data(struct inode *inode, struct page *npage) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode *ri = NULL; + void *src_addr, *dst_addr; + struct page *ipage; + + /* + * The inline_data recovery policy is as follows. + * [prev.] [next] of inline_data flag + * o o -> recover inline_data + * o x -> remove inline_data, and then recover data blocks + * x o -> remove inline_data, and then recover inline_data + * x x -> recover data blocks + */ + if (IS_INODE(npage)) + ri = F2FS_INODE(npage); + + if (f2fs_has_inline_data(inode) && + ri && (ri->i_inline & F2FS_INLINE_DATA)) { +process_inline: + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + f2fs_wait_on_page_writeback(ipage, NODE, true); + + src_addr = inline_data_addr(inode, npage); + dst_addr = inline_data_addr(inode, ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); + + set_inode_flag(inode, FI_INLINE_DATA); + set_inode_flag(inode, FI_DATA_EXIST); + + set_page_dirty(ipage); + f2fs_put_page(ipage, 1); + return 1; + } + + if (f2fs_has_inline_data(inode)) { + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + f2fs_truncate_inline_inode(inode, ipage, 0); + clear_inode_flag(inode, FI_INLINE_DATA); + f2fs_put_page(ipage, 1); + } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { + int ret; + + ret = f2fs_truncate_blocks(inode, 0, false); + if (ret) + return ret; + goto process_inline; + } + return 0; +} + +struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct qstr name = FSTR_TO_QSTR(&fname->disk_name); + struct f2fs_dir_entry *de; + struct f2fs_dentry_ptr d; + struct page *ipage; + void *inline_dentry; + f2fs_hash_t namehash; + + ipage = f2fs_get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) { + *res_page = ipage; + return NULL; + } + + namehash = f2fs_dentry_hash(&name, fname); + + inline_dentry = inline_data_addr(dir, ipage); + + make_dentry_ptr_inline(dir, &d, inline_dentry); + de = f2fs_find_target_dentry(fname, namehash, NULL, &d); + unlock_page(ipage); + if (de) + *res_page = ipage; + else + f2fs_put_page(ipage, 0); + + return de; +} + +int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, + struct page *ipage) +{ + struct f2fs_dentry_ptr d; + void *inline_dentry; + + inline_dentry = inline_data_addr(inode, ipage); + + make_dentry_ptr_inline(inode, &d, inline_dentry); + f2fs_do_make_empty_dir(inode, parent, &d); + + set_page_dirty(ipage); + + /* update i_size to MAX_INLINE_DATA */ + if (i_size_read(inode) < MAX_INLINE_DATA(inode)) + f2fs_i_size_write(inode, MAX_INLINE_DATA(inode)); + return 0; +} + +/* + * NOTE: ipage is grabbed by caller, but if any error occurs, we should + * release ipage in this function. + */ +static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, + void *inline_dentry) +{ + struct page *page; + struct dnode_of_data dn; + struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr src, dst; + int err; + + page = f2fs_grab_cache_page(dir->i_mapping, 0, false); + if (!page) { + f2fs_put_page(ipage, 1); + return -ENOMEM; + } + + set_new_dnode(&dn, dir, ipage, NULL, 0); + err = f2fs_reserve_block(&dn, 0); + if (err) + goto out; + + if (unlikely(dn.data_blkaddr != NEW_ADDR)) { + f2fs_put_dnode(&dn); + set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK); + f2fs_msg(F2FS_P_SB(page)->sb, KERN_WARNING, + "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, " + "run fsck to fix.", + __func__, dir->i_ino, dn.data_blkaddr); + err = -EFSCORRUPTED; + goto out; + } + + f2fs_wait_on_page_writeback(page, DATA, true); + + dentry_blk = page_address(page); + + make_dentry_ptr_inline(dir, &src, inline_dentry); + make_dentry_ptr_block(dir, &dst, dentry_blk); + + /* copy data from inline dentry block to new dentry block */ + memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); + memset(dst.bitmap + src.nr_bitmap, 0, dst.nr_bitmap - src.nr_bitmap); + /* + * we do not need to zero out remainder part of dentry and filename + * field, since we have used bitmap for marking the usage status of + * them, besides, we can also ignore copying/zeroing reserved space + * of dentry block, because them haven't been used so far. + */ + memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); + memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); + + if (!PageUptodate(page)) + SetPageUptodate(page); + set_page_dirty(page); + + /* clear inline dir and flag after data writeback */ + f2fs_truncate_inline_inode(dir, ipage, 0); + + stat_dec_inline_dir(dir); + clear_inode_flag(dir, FI_INLINE_DENTRY); + + f2fs_i_depth_write(dir, 1); + if (i_size_read(dir) < PAGE_SIZE) + f2fs_i_size_write(dir, PAGE_SIZE); +out: + f2fs_put_page(page, 1); + return err; +} + +static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) +{ + struct f2fs_dentry_ptr d; + unsigned long bit_pos = 0; + int err = 0; + + make_dentry_ptr_inline(dir, &d, inline_dentry); + + while (bit_pos < d.max) { + struct f2fs_dir_entry *de; + struct qstr new_name; + nid_t ino; + umode_t fake_mode; + + if (!test_bit_le(bit_pos, d.bitmap)) { + bit_pos++; + continue; + } + + de = &d.dentry[bit_pos]; + + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + + new_name.name = d.filename[bit_pos]; + new_name.len = le16_to_cpu(de->name_len); + + ino = le32_to_cpu(de->ino); + fake_mode = f2fs_get_de_type(de) << S_SHIFT; + + err = f2fs_add_regular_entry(dir, &new_name, NULL, NULL, + ino, fake_mode); + if (err) + goto punch_dentry_pages; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + } + return 0; +punch_dentry_pages: + truncate_inode_pages(&dir->i_data, 0); + f2fs_truncate_blocks(dir, 0, false); + f2fs_remove_dirty_inode(dir); + return err; +} + +static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, + void *inline_dentry) +{ + void *backup_dentry; + int err; + + backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), + MAX_INLINE_DATA(dir), GFP_F2FS_ZERO); + if (!backup_dentry) { + f2fs_put_page(ipage, 1); + return -ENOMEM; + } + + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); + f2fs_truncate_inline_inode(dir, ipage, 0); + + unlock_page(ipage); + + err = f2fs_add_inline_entries(dir, backup_dentry); + if (err) + goto recover; + + lock_page(ipage); + + stat_dec_inline_dir(dir); + clear_inode_flag(dir, FI_INLINE_DENTRY); + kfree(backup_dentry); + return 0; +recover: + lock_page(ipage); + f2fs_wait_on_page_writeback(ipage, NODE, true); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); + f2fs_i_depth_write(dir, 0); + f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); + set_page_dirty(ipage); + f2fs_put_page(ipage, 1); + + kfree(backup_dentry); + return err; +} + +static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, + void *inline_dentry) +{ + if (!F2FS_I(dir)->i_dir_level) + return f2fs_move_inline_dirents(dir, ipage, inline_dentry); + else + return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry); +} + +int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct page *ipage; + unsigned int bit_pos; + f2fs_hash_t name_hash; + void *inline_dentry = NULL; + struct f2fs_dentry_ptr d; + int slots = GET_DENTRY_SLOTS(new_name->len); + struct page *page = NULL; + int err = 0; + + ipage = f2fs_get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); + if (bit_pos >= d.max) { + err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); + if (err) + return err; + err = -EAGAIN; + goto out; + } + + if (inode) { + down_write(&F2FS_I(inode)->i_sem); + page = f2fs_init_inode_metadata(inode, dir, new_name, + orig_name, ipage); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } + } + + f2fs_wait_on_page_writeback(ipage, NODE, true); + + name_hash = f2fs_dentry_hash(new_name, NULL); + f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); + + set_page_dirty(ipage); + + /* we don't need to mark_inode_dirty now */ + if (inode) { + f2fs_i_pino_write(inode, dir->i_ino); + + /* synchronize inode page's data from inode cache */ + if (is_inode_flag_set(inode, FI_NEW_INODE)) + f2fs_update_inode(inode, page); + + f2fs_put_page(page, 1); + } + + f2fs_update_parent_metadata(dir, inode, 0); +fail: + if (inode) + up_write(&F2FS_I(inode)->i_sem); +out: + f2fs_put_page(ipage, 1); + return err; +} + +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode) +{ + struct f2fs_dentry_ptr d; + void *inline_dentry; + int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + unsigned int bit_pos; + int i; + + lock_page(page); + f2fs_wait_on_page_writeback(page, NODE, true); + + inline_dentry = inline_data_addr(dir, page); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = dentry - d.dentry; + for (i = 0; i < slots; i++) + __clear_bit_le(bit_pos + i, d.bitmap); + + set_page_dirty(page); + f2fs_put_page(page, 1); + + dir->i_ctime = dir->i_mtime = current_time(dir); + f2fs_mark_inode_dirty_sync(dir, false); + + if (inode) + f2fs_drop_nlink(dir, inode); +} + +bool f2fs_empty_inline_dir(struct inode *dir) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct page *ipage; + unsigned int bit_pos = 2; + void *inline_dentry; + struct f2fs_dentry_ptr d; + + ipage = f2fs_get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) + return false; + + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos); + + f2fs_put_page(ipage, 1); + + if (bit_pos < d.max) + return false; + + return true; +} + +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, + struct fscrypt_str *fstr) +{ + struct inode *inode = file_inode(file); + struct page *ipage = NULL; + struct f2fs_dentry_ptr d; + void *inline_dentry = NULL; + int err; + + make_dentry_ptr_inline(inode, &d, inline_dentry); + + if (ctx->pos == d.max) + return 0; + + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + /* + * f2fs_readdir was protected by inode.i_rwsem, it is safe to access + * ipage without page's lock held. + */ + unlock_page(ipage); + + inline_dentry = inline_data_addr(inode, ipage); + + make_dentry_ptr_inline(inode, &d, inline_dentry); + + err = f2fs_fill_dentries(ctx, &d, 0, fstr); + if (!err) + ctx->pos = d.max; + + f2fs_put_page(ipage, 0); + return err < 0 ? err : 0; +} + +int f2fs_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) +{ + __u64 byteaddr, ilen; + __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED | + FIEMAP_EXTENT_LAST; + struct node_info ni; + struct page *ipage; + int err = 0; + + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + if (!f2fs_has_inline_data(inode)) { + err = -EAGAIN; + goto out; + } + + ilen = min_t(size_t, MAX_INLINE_DATA(inode), i_size_read(inode)); + if (start >= ilen) + goto out; + if (start + len < ilen) + ilen = start + len; + ilen -= start; + + err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + if (err) + goto out; + + byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; + byteaddr += (char *)inline_data_addr(inode, ipage) - + (char *)F2FS_INODE(ipage); + err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); +out: + f2fs_put_page(ipage, 1); + return err; +} diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c new file mode 100644 index 000000000..15ff5d9b8 --- /dev/null +++ b/fs/f2fs/inode.c @@ -0,0 +1,792 @@ +/* + * fs/f2fs/inode.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/buffer_head.h> +#include <linux/backing-dev.h> +#include <linux/writeback.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" + +#include <trace/events/f2fs.h> + +void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) +{ + if (is_inode_flag_set(inode, FI_NEW_INODE)) + return; + + if (f2fs_inode_dirtied(inode, sync)) + return; + + mark_inode_dirty_sync(inode); +} + +void f2fs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = F2FS_I(inode)->i_flags; + unsigned int new_fl = 0; + + if (flags & F2FS_SYNC_FL) + new_fl |= S_SYNC; + if (flags & F2FS_APPEND_FL) + new_fl |= S_APPEND; + if (flags & F2FS_IMMUTABLE_FL) + new_fl |= S_IMMUTABLE; + if (flags & F2FS_NOATIME_FL) + new_fl |= S_NOATIME; + if (flags & F2FS_DIRSYNC_FL) + new_fl |= S_DIRSYNC; + if (f2fs_encrypted_inode(inode)) + new_fl |= S_ENCRYPTED; + inode_set_flags(inode, new_fl, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC| + S_ENCRYPTED); +} + +static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) +{ + int extra_size = get_extra_isize(inode); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + if (ri->i_addr[extra_size]) + inode->i_rdev = old_decode_dev( + le32_to_cpu(ri->i_addr[extra_size])); + else + inode->i_rdev = new_decode_dev( + le32_to_cpu(ri->i_addr[extra_size + 1])); + } +} + +static int __written_first_block(struct f2fs_sb_info *sbi, + struct f2fs_inode *ri) +{ + block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); + + if (!__is_valid_data_blkaddr(addr)) + return 1; + if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC)) + return -EFSCORRUPTED; + return 0; +} + +static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) +{ + int extra_size = get_extra_isize(inode); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + ri->i_addr[extra_size] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + ri->i_addr[extra_size + 1] = 0; + } else { + ri->i_addr[extra_size] = 0; + ri->i_addr[extra_size + 1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + ri->i_addr[extra_size + 2] = 0; + } + } +} + +static void __recover_inline_status(struct inode *inode, struct page *ipage) +{ + void *inline_data = inline_data_addr(inode, ipage); + __le32 *start = inline_data; + __le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32); + + while (start < end) { + if (*start++) { + f2fs_wait_on_page_writeback(ipage, NODE, true); + + set_inode_flag(inode, FI_DATA_EXIST); + set_raw_inline(inode, F2FS_INODE(ipage)); + set_page_dirty(ipage); + return; + } + } + return; +} + +static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + + if (!f2fs_sb_has_inode_chksum(sbi->sb)) + return false; + + if (!IS_INODE(page) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + return false; + + if (!F2FS_FITS_IN_INODE(ri, le16_to_cpu(ri->i_extra_isize), + i_inode_checksum)) + return false; + + return true; +} + +static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_node *node = F2FS_NODE(page); + struct f2fs_inode *ri = &node->i; + __le32 ino = node->footer.ino; + __le32 gen = ri->i_generation; + __u32 chksum, chksum_seed; + __u32 dummy_cs = 0; + unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum); + unsigned int cs_size = sizeof(dummy_cs); + + chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino, + sizeof(ino)); + chksum_seed = f2fs_chksum(sbi, chksum, (__u8 *)&gen, sizeof(gen)); + + chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset); + chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size); + offset += cs_size; + chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset, + F2FS_BLKSIZE - offset); + return chksum; +} + +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri; + __u32 provided, calculated; + + if (unlikely(is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN))) + return true; + +#ifdef CONFIG_F2FS_CHECK_FS + if (!f2fs_enable_inode_chksum(sbi, page)) +#else + if (!f2fs_enable_inode_chksum(sbi, page) || + PageDirty(page) || PageWriteback(page)) +#endif + return true; + + ri = &F2FS_NODE(page)->i; + provided = le32_to_cpu(ri->i_inode_checksum); + calculated = f2fs_inode_chksum(sbi, page); + + if (provided != calculated) + f2fs_msg(sbi->sb, KERN_WARNING, + "checksum invalid, nid = %lu, ino_of_node = %x, %x vs. %x", + page->index, ino_of_node(page), provided, calculated); + + return provided == calculated; +} + +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + + if (!f2fs_enable_inode_chksum(sbi, page)) + return; + + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); +} + +static bool sanity_check_inode(struct inode *inode, struct page *node_page) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + unsigned long long iblocks; + + iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks); + if (!iblocks) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, " + "run fsck to fix.", + __func__, inode->i_ino, iblocks); + return false; + } + + if (ino_of_node(node_page) != nid_of_node(node_page)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: corrupted inode footer i_ino=%lx, ino,nid: " + "[%u, %u] run fsck to fix.", + __func__, inode->i_ino, + ino_of_node(node_page), nid_of_node(node_page)); + return false; + } + + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) + && !f2fs_has_extra_attr(inode)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: corrupted inode ino=%lx, run fsck to fix.", + __func__, inode->i_ino); + return false; + } + + if (f2fs_has_extra_attr(inode) && + !f2fs_sb_has_extra_attr(sbi->sb)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) is with extra_attr, " + "but extra_attr feature is off", + __func__, inode->i_ino); + return false; + } + + if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE || + fi->i_extra_isize % sizeof(__le32)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, " + "max: %zu", + __func__, inode->i_ino, fi->i_extra_isize, + F2FS_TOTAL_EXTRA_ATTR_SIZE); + return false; + } + + if (F2FS_I(inode)->extent_tree) { + struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; + + if (ei->len && + (!f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC) || + !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, + DATA_GENERIC))) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) extent info [%u, %u, %u] " + "is incorrect, run fsck to fix", + __func__, inode->i_ino, + ei->blk, ei->fofs, ei->len); + return false; + } + } + + if (f2fs_has_inline_data(inode) && + (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx, mode=%u) should not have " + "inline_data, run fsck to fix", + __func__, inode->i_ino, inode->i_mode); + return false; + } + + if (f2fs_has_inline_dentry(inode) && !S_ISDIR(inode->i_mode)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx, mode=%u) should not have " + "inline_dentry, run fsck to fix", + __func__, inode->i_ino, inode->i_mode); + return false; + } + + return true; +} + +static int do_read_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct page *node_page; + struct f2fs_inode *ri; + projid_t i_projid; + int err; + + /* Check if ino is within scope */ + if (f2fs_check_nid_range(sbi, inode->i_ino)) + return -EINVAL; + + node_page = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(node_page)) + return PTR_ERR(node_page); + + ri = F2FS_INODE(node_page); + + inode->i_mode = le16_to_cpu(ri->i_mode); + i_uid_write(inode, le32_to_cpu(ri->i_uid)); + i_gid_write(inode, le32_to_cpu(ri->i_gid)); + set_nlink(inode, le32_to_cpu(ri->i_links)); + inode->i_size = le64_to_cpu(ri->i_size); + inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1); + + inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); + inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); + inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime); + inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); + inode->i_generation = le32_to_cpu(ri->i_generation); + if (S_ISDIR(inode->i_mode)) + fi->i_current_depth = le32_to_cpu(ri->i_current_depth); + else if (S_ISREG(inode->i_mode)) + fi->i_gc_failures[GC_FAILURE_PIN] = + le16_to_cpu(ri->i_gc_failures); + fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); + fi->i_flags = le32_to_cpu(ri->i_flags); + fi->flags = 0; + fi->i_advise = ri->i_advise; + fi->i_pino = le32_to_cpu(ri->i_pino); + fi->i_dir_level = ri->i_dir_level; + + if (f2fs_init_extent_tree(inode, &ri->i_ext)) + set_page_dirty(node_page); + + get_inline_info(inode, ri); + + fi->i_extra_isize = f2fs_has_extra_attr(inode) ? + le16_to_cpu(ri->i_extra_isize) : 0; + + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size); + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + fi->i_inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } else { + + /* + * Previous inline data or directory always reserved 200 bytes + * in inode layout, even if inline_xattr is disabled. In order + * to keep inline_dentry's structure for backward compatibility, + * we get the space back only from inline_data. + */ + fi->i_inline_xattr_size = 0; + } + + if (!sanity_check_inode(inode, node_page)) { + f2fs_put_page(node_page, 1); + return -EFSCORRUPTED; + } + + /* check data exist */ + if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) + __recover_inline_status(inode, node_page); + + /* try to recover cold bit for non-dir inode */ + if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_page)) { + set_cold_node(node_page, false); + set_page_dirty(node_page); + } + + /* get rdev by using inline_info */ + __get_inode_rdev(inode, ri); + + if (S_ISREG(inode->i_mode)) { + err = __written_first_block(sbi, ri); + if (err < 0) { + f2fs_put_page(node_page, 1); + return err; + } + if (!err) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + } + + if (!f2fs_need_inode_block_update(sbi, inode->i_ino)) + fi->last_disk_size = inode->i_size; + + if (fi->i_flags & F2FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + i_projid = (projid_t)le32_to_cpu(ri->i_projid); + else + i_projid = F2FS_DEF_PROJID; + fi->i_projid = make_kprojid(&init_user_ns, i_projid); + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_inode_crtime(sbi->sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) { + fi->i_crtime.tv_sec = le64_to_cpu(ri->i_crtime); + fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); + } + + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; + F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; + F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; + F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; + f2fs_put_page(node_page, 1); + + stat_inc_inline_xattr(inode); + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + + return 0; +} + +struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *inode; + int ret = 0; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!(inode->i_state & I_NEW)) { + trace_f2fs_iget(inode); + return inode; + } + if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) + goto make_now; + + ret = do_read_inode(inode); + if (ret) + goto bad_inode; +make_now: + if (ino == F2FS_NODE_INO(sbi)) { + inode->i_mapping->a_ops = &f2fs_node_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + } else if (ino == F2FS_META_INO(sbi)) { + inode->i_mapping->a_ops = &f2fs_meta_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + } else if (S_ISREG(inode->i_mode)) { + inode->i_op = &f2fs_file_inode_operations; + inode->i_fop = &f2fs_file_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &f2fs_dir_inode_operations; + inode->i_fop = &f2fs_dir_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + } else if (S_ISLNK(inode->i_mode)) { + if (f2fs_encrypted_inode(inode)) + inode->i_op = &f2fs_encrypted_symlink_inode_operations; + else + inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &f2fs_dblock_aops; + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + inode->i_op = &f2fs_special_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + } else { + ret = -EIO; + goto bad_inode; + } + f2fs_set_inode_flags(inode); + unlock_new_inode(inode); + trace_f2fs_iget(inode); + return inode; + +bad_inode: + f2fs_inode_synced(inode); + iget_failed(inode); + trace_f2fs_iget_exit(inode, ret); + return ERR_PTR(ret); +} + +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; +retry: + inode = f2fs_iget(sb, ino); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + } + return inode; +} + +void f2fs_update_inode(struct inode *inode, struct page *node_page) +{ + struct f2fs_inode *ri; + struct extent_tree *et = F2FS_I(inode)->extent_tree; + + f2fs_wait_on_page_writeback(node_page, NODE, true); + set_page_dirty(node_page); + + f2fs_inode_synced(inode); + + ri = F2FS_INODE(node_page); + + ri->i_mode = cpu_to_le16(inode->i_mode); + ri->i_advise = F2FS_I(inode)->i_advise; + ri->i_uid = cpu_to_le32(i_uid_read(inode)); + ri->i_gid = cpu_to_le32(i_gid_read(inode)); + ri->i_links = cpu_to_le32(inode->i_nlink); + ri->i_size = cpu_to_le64(i_size_read(inode)); + ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); + + if (et) { + read_lock(&et->lock); + set_raw_extent(&et->largest, &ri->i_ext); + read_unlock(&et->lock); + } else { + memset(&ri->i_ext, 0, sizeof(ri->i_ext)); + } + set_raw_inline(inode, ri); + + ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); + ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); + ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + if (S_ISDIR(inode->i_mode)) + ri->i_current_depth = + cpu_to_le32(F2FS_I(inode)->i_current_depth); + else if (S_ISREG(inode->i_mode)) + ri->i_gc_failures = + cpu_to_le16(F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]); + ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid); + ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); + ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); + ri->i_generation = cpu_to_le32(inode->i_generation); + ri->i_dir_level = F2FS_I(inode)->i_dir_level; + + if (f2fs_has_extra_attr(inode)) { + ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + + if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode)->sb)) + ri->i_inline_xattr_size = + cpu_to_le16(F2FS_I(inode)->i_inline_xattr_size); + + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_projid)) { + projid_t i_projid; + + i_projid = from_kprojid(&init_user_ns, + F2FS_I(inode)->i_projid); + ri->i_projid = cpu_to_le32(i_projid); + } + + if (f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_crtime)) { + ri->i_crtime = + cpu_to_le64(F2FS_I(inode)->i_crtime.tv_sec); + ri->i_crtime_nsec = + cpu_to_le32(F2FS_I(inode)->i_crtime.tv_nsec); + } + } + + __set_inode_rdev(inode, ri); + + /* deleted inode */ + if (inode->i_nlink == 0) + clear_inline_node(node_page); + + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; + F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; + F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; + F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; + +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page); +#endif +} + +void f2fs_update_inode_page(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct page *node_page; +retry: + node_page = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(node_page)) { + int err = PTR_ERR(node_page); + if (err == -ENOMEM) { + cond_resched(); + goto retry; + } else if (err != -ENOENT) { + f2fs_stop_checkpoint(sbi, false); + } + return; + } + f2fs_update_inode(inode, node_page); + f2fs_put_page(node_page, 1); +} + +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + return 0; + + /* + * atime could be updated without dirtying f2fs inode in lazytime mode + */ + if (f2fs_is_time_consistent(inode) && + !is_inode_flag_set(inode, FI_DIRTY_INODE)) + return 0; + + /* + * We need to balance fs here to prevent from producing dirty node pages + * during the urgent cleaning time when runing out of free sections. + */ + f2fs_update_inode_page(inode); + if (wbc && wbc->nr_to_write) + f2fs_balance_fs(sbi, true); + return 0; +} + +/* + * Called at the last iput() if i_nlink is zero + */ +void f2fs_evict_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + int err = 0; + + /* some remained atomic pages should discarded */ + if (f2fs_is_atomic_file(inode)) + f2fs_drop_inmem_pages(inode); + + trace_f2fs_evict_inode(inode); + truncate_inode_pages_final(&inode->i_data); + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + goto out_clear; + + f2fs_bug_on(sbi, get_dirty_pages(inode)); + f2fs_remove_dirty_inode(inode); + + f2fs_destroy_extent_tree(inode); + + if (inode->i_nlink || is_bad_inode(inode)) + goto no_delete; + + dquot_initialize(inode); + + f2fs_remove_ino_entry(sbi, inode->i_ino, APPEND_INO); + f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); + + sb_start_intwrite(inode->i_sb); + set_inode_flag(inode, FI_NO_ALLOC); + i_size_write(inode, 0); +retry: + if (F2FS_HAS_BLOCKS(inode)) + err = f2fs_truncate(inode); + + if (time_to_inject(sbi, FAULT_EVICT_INODE)) { + f2fs_show_injection_info(FAULT_EVICT_INODE); + err = -EIO; + } + + if (!err) { + f2fs_lock_op(sbi); + err = f2fs_remove_inode_page(inode); + f2fs_unlock_op(sbi); + if (err == -ENOENT) + err = 0; + } + + /* give more chances, if ENOMEM case */ + if (err == -ENOMEM) { + err = 0; + goto retry; + } + + if (err) + f2fs_update_inode_page(inode); + dquot_free_inode(inode); + sb_end_intwrite(inode->i_sb); +no_delete: + dquot_drop(inode); + + stat_dec_inline_xattr(inode); + stat_dec_inline_dir(inode); + stat_dec_inline_inode(inode); + + if (likely(!is_set_ckpt_flags(sbi, CP_ERROR_FLAG))) + f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + else + f2fs_inode_synced(inode); + + /* ino == 0, if f2fs_new_inode() was failed t*/ + if (inode->i_ino) + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, + inode->i_ino); + if (xnid) + invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); + if (inode->i_nlink) { + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) + f2fs_add_ino_entry(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) + f2fs_add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + } + if (is_inode_flag_set(inode, FI_FREE_NID)) { + f2fs_alloc_nid_failed(sbi, inode->i_ino); + clear_inode_flag(inode, FI_FREE_NID); + } else { + /* + * If xattr nid is corrupted, we can reach out error condition, + * err & !f2fs_exist_written_data(sbi, inode->i_ino, ORPHAN_INO)). + * In that case, f2fs_check_nid_range() is enough to give a clue. + */ + } +out_clear: + fscrypt_put_encryption_info(inode); + clear_inode(inode); +} + +/* caller should call f2fs_lock_op() */ +void f2fs_handle_failed_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct node_info ni; + int err; + + /* + * clear nlink of inode in order to release resource of inode + * immediately. + */ + clear_nlink(inode); + + /* + * we must call this to avoid inode being remained as dirty, resulting + * in a panic when flushing dirty inodes in gdirty_list. + */ + f2fs_update_inode_page(inode); + f2fs_inode_synced(inode); + + /* don't make bad inode, since it becomes a regular file. */ + unlock_new_inode(inode); + + /* + * Note: we should add inode to orphan list before f2fs_unlock_op() + * so we can prevent losing this orphan when encoutering checkpoint + * and following suddenly power-off. + */ + err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "May loss orphan inode, run fsck to fix."); + goto out; + } + + if (ni.blk_addr != NULL_ADDR) { + err = f2fs_acquire_orphan_inode(sbi); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "Too many orphan inodes, run fsck to fix."); + } else { + f2fs_add_orphan_inode(inode); + } + f2fs_alloc_nid_done(sbi, inode->i_ino); + } else { + set_inode_flag(inode, FI_FREE_NID); + } + +out: + f2fs_unlock_op(sbi); + + /* iput will drop the inode object */ + iput(inode); +} diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c new file mode 100644 index 000000000..9e4c38481 --- /dev/null +++ b/fs/f2fs/namei.c @@ -0,0 +1,1277 @@ +/* + * fs/f2fs/namei.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/pagemap.h> +#include <linux/sched.h> +#include <linux/ctype.h> +#include <linux/dcache.h> +#include <linux/namei.h> +#include <linux/quotaops.h> + +#include "f2fs.h" +#include "node.h" +#include "xattr.h" +#include "acl.h" +#include <trace/events/f2fs.h> + +static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + nid_t ino; + struct inode *inode; + bool nid_free = false; + int xattr_size = 0; + int err; + + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + f2fs_lock_op(sbi); + if (!f2fs_alloc_nid(sbi, &ino)) { + f2fs_unlock_op(sbi); + err = -ENOSPC; + goto fail; + } + f2fs_unlock_op(sbi); + + nid_free = true; + + inode_init_owner(inode, dir, mode); + + inode->i_ino = ino; + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + F2FS_I(inode)->i_crtime = inode->i_mtime; + inode->i_generation = sbi->s_next_generation++; + + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_current_depth = 1; + + err = insert_inode_locked(inode); + if (err) { + err = -EINVAL; + goto fail; + } + + if (f2fs_sb_has_project_quota(sbi->sb) && + (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL)) + F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; + else + F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, + F2FS_DEF_PROJID); + + err = dquot_initialize(inode); + if (err) + goto fail_drop; + + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + + set_inode_flag(inode, FI_NEW_INODE); + + /* If the directory encrypted, then we should encrypt the inode. */ + if ((f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && + f2fs_may_encrypt(inode)) + f2fs_set_encrypted_inode(inode); + + if (f2fs_sb_has_extra_attr(sbi->sb)) { + set_inode_flag(inode, FI_EXTRA_ATTR); + F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; + } + + if (test_opt(sbi, INLINE_XATTR)) + set_inode_flag(inode, FI_INLINE_XATTR); + + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) + set_inode_flag(inode, FI_INLINE_DATA); + if (f2fs_may_inline_dentry(inode)) + set_inode_flag(inode, FI_INLINE_DENTRY); + + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); + if (f2fs_has_inline_xattr(inode)) + xattr_size = F2FS_OPTION(sbi).inline_xattr_size; + /* Otherwise, will be 0 */ + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } + F2FS_I(inode)->i_inline_xattr_size = xattr_size; + + f2fs_init_extent_tree(inode, NULL); + + stat_inc_inline_xattr(inode); + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + + F2FS_I(inode)->i_flags = + f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_flags |= F2FS_INDEX_FL; + + if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + f2fs_set_inode_flags(inode); + + trace_f2fs_new_inode(inode, 0); + return inode; + +fail: + trace_f2fs_new_inode(inode, err); + make_bad_inode(inode); + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + iput(inode); + return ERR_PTR(err); +fail_drop: + trace_f2fs_new_inode(inode, err); + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(err); +} + +static int is_extension_exist(const unsigned char *s, const char *sub) +{ + size_t slen = strlen(s); + size_t sublen = strlen(sub); + int i; + + /* + * filename format of multimedia file should be defined as: + * "filename + '.' + extension + (optional: '.' + temp extension)". + */ + if (slen < sublen + 2) + return 0; + + for (i = 1; i < slen - sublen; i++) { + if (s[i] != '.') + continue; + if (!strncasecmp(s + i + 1, sub, sublen)) + return 1; + } + + return 0; +} + +/* + * Set multimedia files as cold files for hot/cold data separation + */ +static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, + const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int i, cold_count, hot_count; + + down_read(&sbi->sb_lock); + + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + + for (i = 0; i < cold_count + hot_count; i++) { + if (!is_extension_exist(name, extlist[i])) + continue; + if (i < cold_count) + file_set_cold(inode); + else + file_set_hot(inode); + break; + } + + up_read(&sbi->sb_lock); +} + +int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool hot, bool set) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; + int total_count = cold_count + hot_count; + int start, count; + int i; + + if (set) { + if (total_count == F2FS_MAX_EXTENSION) + return -EINVAL; + } else { + if (!hot && !cold_count) + return -EINVAL; + if (hot && !hot_count) + return -EINVAL; + } + + if (hot) { + start = cold_count; + count = total_count; + } else { + start = 0; + count = cold_count; + } + + for (i = start; i < count; i++) { + if (strcmp(name, extlist[i])) + continue; + + if (set) + return -EINVAL; + + memcpy(extlist[i], extlist[i + 1], + F2FS_EXTENSION_LEN * (total_count - i - 1)); + memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN); + if (hot) + sbi->raw_super->hot_ext_count = hot_count - 1; + else + sbi->raw_super->extension_count = + cpu_to_le32(cold_count - 1); + return 0; + } + + if (!set) + return -EINVAL; + + if (hot) { + memcpy(extlist[count], name, strlen(name)); + sbi->raw_super->hot_ext_count = hot_count + 1; + } else { + char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; + + memcpy(buf, &extlist[cold_count], + F2FS_EXTENSION_LEN * hot_count); + memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); + memcpy(extlist[cold_count], name, strlen(name)); + memcpy(&extlist[cold_count + 1], buf, + F2FS_EXTENSION_LEN * hot_count); + sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); + } + return 0; +} + +static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + nid_t ino = 0; + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + err = dquot_initialize(dir); + if (err) + return err; + + inode = f2fs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) + set_file_temperature(sbi, inode, dentry->d_name.name); + + inode->i_op = &f2fs_file_inode_operations; + inode->i_fop = &f2fs_file_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + ino = inode->i_ino; + + f2fs_lock_op(sbi); + err = f2fs_add_link(dentry, inode); + if (err) + goto out; + f2fs_unlock_op(sbi); + + f2fs_alloc_nid_done(sbi, ino); + + d_instantiate_new(dentry, inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); + return 0; +out: + f2fs_handle_failed_inode(inode); + return err; +} + +static int f2fs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + err = fscrypt_prepare_link(old_dentry, dir, dentry); + if (err) + return err; + + if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(dir); + if (err) + return err; + + f2fs_balance_fs(sbi, true); + + inode->i_ctime = current_time(inode); + ihold(inode); + + set_inode_flag(inode, FI_INC_LINK); + f2fs_lock_op(sbi); + err = f2fs_add_link(dentry, inode); + if (err) + goto out; + f2fs_unlock_op(sbi); + + d_instantiate(dentry, inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + return 0; +out: + clear_inode_flag(inode, FI_INC_LINK); + iput(inode); + f2fs_unlock_op(sbi); + return err; +} + +struct dentry *f2fs_get_parent(struct dentry *child) +{ + struct qstr dotdot = QSTR_INIT("..", 2); + struct page *page; + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page); + if (!ino) { + if (IS_ERR(page)) + return ERR_CAST(page); + return ERR_PTR(-ENOENT); + } + return d_obtain_alias(f2fs_iget(child->d_sb, ino)); +} + +static int __recover_dot_dentries(struct inode *dir, nid_t pino) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct qstr dot = QSTR_INIT(".", 1); + struct qstr dotdot = QSTR_INIT("..", 2); + struct f2fs_dir_entry *de; + struct page *page; + int err = 0; + + if (f2fs_readonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "skip recovering inline_dots inode (ino:%lu, pino:%u) " + "in readonly mountpoint", dir->i_ino, pino); + return 0; + } + + err = dquot_initialize(dir); + if (err) + return err; + + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + + de = f2fs_find_entry(dir, &dot, &page); + if (de) { + f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } else { + err = f2fs_do_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); + if (err) + goto out; + } + + de = f2fs_find_entry(dir, &dotdot, &page); + if (de) + f2fs_put_page(page, 0); + else if (IS_ERR(page)) + err = PTR_ERR(page); + else + err = f2fs_do_add_link(dir, &dotdot, NULL, pino, S_IFDIR); +out: + if (!err) + clear_inode_flag(dir, FI_INLINE_DOTS); + + f2fs_unlock_op(sbi); + return err; +} + +static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct inode *inode = NULL; + struct f2fs_dir_entry *de; + struct page *page; + struct dentry *new; + nid_t ino = -1; + int err = 0; + unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir)); + struct fscrypt_name fname; + + trace_f2fs_lookup_start(dir, dentry, flags); + + if (dentry->d_name.len > F2FS_NAME_LEN) { + err = -ENAMETOOLONG; + goto out; + } + + err = fscrypt_prepare_lookup(dir, dentry, &fname); + if (err == -ENOENT) + goto out_splice; + if (err) + goto out; + de = __f2fs_find_entry(dir, &fname, &page); + fscrypt_free_filename(&fname); + + if (!de) { + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + goto out_splice; + } + + ino = le32_to_cpu(de->ino); + f2fs_put_page(page, 0); + + inode = f2fs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + + if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) { + err = __recover_dot_dentries(dir, root_ino); + if (err) + goto out_iput; + } + + if (f2fs_has_inline_dots(inode)) { + err = __recover_dot_dentries(inode, dir->i_ino); + if (err) + goto out_iput; + } + if (f2fs_encrypted_inode(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { + f2fs_msg(inode->i_sb, KERN_WARNING, + "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); + err = -EPERM; + goto out_iput; + } +out_splice: + new = d_splice_alias(inode, dentry); + err = PTR_ERR_OR_ZERO(new); + trace_f2fs_lookup_end(dir, dentry, ino, err); + return new; +out_iput: + iput(inode); +out: + trace_f2fs_lookup_end(dir, dentry, ino, err); + return ERR_PTR(err); +} + +static int f2fs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode = d_inode(dentry); + struct f2fs_dir_entry *de; + struct page *page; + int err = -ENOENT; + + trace_f2fs_unlink_enter(dir, dentry); + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + err = dquot_initialize(dir); + if (err) + return err; + err = dquot_initialize(inode); + if (err) + return err; + + de = f2fs_find_entry(dir, &dentry->d_name, &page); + if (!de) { + if (IS_ERR(page)) + err = PTR_ERR(page); + goto fail; + } + + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + err = f2fs_acquire_orphan_inode(sbi); + if (err) { + f2fs_unlock_op(sbi); + f2fs_put_page(page, 0); + goto fail; + } + f2fs_delete_entry(de, page, dir, inode); + f2fs_unlock_op(sbi); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); +fail: + trace_f2fs_unlink_exit(inode, err); + return err; +} + +static const char *f2fs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + const char *link = page_get_link(dentry, inode, done); + if (!IS_ERR(link) && !*link) { + /* this is broken symlink case */ + do_delayed_call(done); + clear_delayed_call(done); + link = ERR_PTR(-ENOENT); + } + return link; +} + +static int f2fs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + size_t len = strlen(symname); + struct fscrypt_str disk_link; + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, + &disk_link); + if (err) + return err; + + err = dquot_initialize(dir); + if (err) + return err; + + inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (IS_ENCRYPTED(inode)) + inode->i_op = &f2fs_encrypted_symlink_inode_operations; + else + inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &f2fs_dblock_aops; + + f2fs_lock_op(sbi); + err = f2fs_add_link(dentry, inode); + if (err) + goto out_f2fs_handle_failed_inode; + f2fs_unlock_op(sbi); + f2fs_alloc_nid_done(sbi, inode->i_ino); + + err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); + if (err) + goto err_out; + + err = page_symlink(inode, disk_link.name, disk_link.len); + +err_out: + d_instantiate_new(dentry, inode); + + /* + * Let's flush symlink data in order to avoid broken symlink as much as + * possible. Nevertheless, fsyncing is the best way, but there is no + * way to get a file descriptor in order to flush that. + * + * Note that, it needs to do dir->fsync to make this recoverable. + * If the symlink path is stored into inline_data, there is no + * performance regression. + */ + if (!err) { + filemap_write_and_wait_range(inode->i_mapping, 0, + disk_link.len - 1); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + } else { + f2fs_unlink(dir, dentry); + } + + f2fs_balance_fs(sbi, true); + goto out_free_encrypted_link; + +out_f2fs_handle_failed_inode: + f2fs_handle_failed_inode(inode); +out_free_encrypted_link: + if (disk_link.name != (unsigned char *)symname) + kfree(disk_link.name); + return err; +} + +static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + err = dquot_initialize(dir); + if (err) + return err; + + inode = f2fs_new_inode(dir, S_IFDIR | mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &f2fs_dir_inode_operations; + inode->i_fop = &f2fs_dir_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + + set_inode_flag(inode, FI_INC_LINK); + f2fs_lock_op(sbi); + err = f2fs_add_link(dentry, inode); + if (err) + goto out_fail; + f2fs_unlock_op(sbi); + + f2fs_alloc_nid_done(sbi, inode->i_ino); + + d_instantiate_new(dentry, inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); + return 0; + +out_fail: + clear_inode_flag(inode, FI_INC_LINK); + f2fs_handle_failed_inode(inode); + return err; +} + +static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + if (f2fs_empty_dir(inode)) + return f2fs_unlink(dir, dentry); + return -ENOTEMPTY; +} + +static int f2fs_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + int err = 0; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + err = dquot_initialize(dir); + if (err) + return err; + + inode = f2fs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + init_special_inode(inode, inode->i_mode, rdev); + inode->i_op = &f2fs_special_inode_operations; + + f2fs_lock_op(sbi); + err = f2fs_add_link(dentry, inode); + if (err) + goto out; + f2fs_unlock_op(sbi); + + f2fs_alloc_nid_done(sbi, inode->i_ino); + + d_instantiate_new(dentry, inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); + return 0; +out: + f2fs_handle_failed_inode(inode); + return err; +} + +static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, + umode_t mode, struct inode **whiteout) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + int err; + + err = dquot_initialize(dir); + if (err) + return err; + + inode = f2fs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (whiteout) { + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); + inode->i_op = &f2fs_special_inode_operations; + } else { + inode->i_op = &f2fs_file_inode_operations; + inode->i_fop = &f2fs_file_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + } + + f2fs_lock_op(sbi); + err = f2fs_acquire_orphan_inode(sbi); + if (err) + goto out; + + err = f2fs_do_tmpfile(inode, dir); + if (err) + goto release_out; + + /* + * add this non-linked tmpfile to orphan list, in this way we could + * remove all unused data of tmpfile after abnormal power-off. + */ + f2fs_add_orphan_inode(inode); + f2fs_alloc_nid_done(sbi, inode->i_ino); + + if (whiteout) { + f2fs_i_links_write(inode, false); + + spin_lock(&inode->i_lock); + inode->i_state |= I_LINKABLE; + spin_unlock(&inode->i_lock); + + *whiteout = inode; + } else { + d_tmpfile(dentry, inode); + } + /* link_count was changed by d_tmpfile as well. */ + f2fs_unlock_op(sbi); + unlock_new_inode(inode); + + f2fs_balance_fs(sbi, true); + return 0; + +release_out: + f2fs_release_orphan_inode(sbi); +out: + f2fs_handle_failed_inode(inode); + return err; +} + +static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + if (f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) { + int err = fscrypt_get_encryption_info(dir); + if (err) + return err; + } + + return __f2fs_tmpfile(dir, dentry, mode, NULL); +} + +static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout) +{ + if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) + return -EIO; + + return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout); +} + +static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct inode *whiteout = NULL; + struct page *old_dir_page; + struct page *old_page, *new_page = NULL; + struct f2fs_dir_entry *old_dir_entry = NULL; + struct f2fs_dir_entry *old_entry; + struct f2fs_dir_entry *new_entry; + bool is_old_inline = f2fs_has_inline_dentry(old_dir); + int err = -ENOENT; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + + if (flags & RENAME_WHITEOUT) { + err = f2fs_create_whiteout(old_dir, &whiteout); + if (err) + return err; + } + + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + + if (new_inode) { + err = dquot_initialize(new_inode); + if (err) + goto out; + } + + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (!old_entry) { + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); + goto out; + } + + if (S_ISDIR(old_inode->i_mode)) { + old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); + if (!old_dir_entry) { + if (IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); + goto out_old; + } + } + + if (new_inode) { + + err = -ENOTEMPTY; + if (old_dir_entry && !f2fs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, + &new_page); + if (!new_entry) { + if (IS_ERR(new_page)) + err = PTR_ERR(new_page); + goto out_dir; + } + + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + + err = f2fs_acquire_orphan_inode(sbi); + if (err) + goto put_out_dir; + + f2fs_set_link(new_dir, new_entry, new_page, old_inode); + + new_inode->i_ctime = current_time(new_inode); + down_write(&F2FS_I(new_inode)->i_sem); + if (old_dir_entry) + f2fs_i_links_write(new_inode, false); + f2fs_i_links_write(new_inode, false); + up_write(&F2FS_I(new_inode)->i_sem); + + if (!new_inode->i_nlink) + f2fs_add_orphan_inode(new_inode); + else + f2fs_release_orphan_inode(sbi); + } else { + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + + err = f2fs_add_link(new_dentry, old_inode); + if (err) { + f2fs_unlock_op(sbi); + goto out_dir; + } + + if (old_dir_entry) + f2fs_i_links_write(new_dir, true); + + /* + * old entry and new entry can locate in the same inline + * dentry in inode, when attaching new entry in inline dentry, + * it could force inline dentry conversion, after that, + * old_entry and old_page will point to wrong address, in + * order to avoid this, let's do the check and update here. + */ + if (is_old_inline && !f2fs_has_inline_dentry(old_dir)) { + f2fs_put_page(old_page, 0); + old_page = NULL; + + old_entry = f2fs_find_entry(old_dir, + &old_dentry->d_name, &old_page); + if (!old_entry) { + err = -ENOENT; + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); + f2fs_unlock_op(sbi); + goto out_dir; + } + } + } + + down_write(&F2FS_I(old_inode)->i_sem); + if (!old_dir_entry || whiteout) + file_lost_pino(old_inode); + else + /* adjust dir's i_pino to pass fsck check */ + f2fs_i_pino_write(old_inode, new_dir->i_ino); + up_write(&F2FS_I(old_inode)->i_sem); + + old_inode->i_ctime = current_time(old_inode); + f2fs_mark_inode_dirty_sync(old_inode, false); + + f2fs_delete_entry(old_entry, old_page, old_dir, NULL); + + if (whiteout) { + set_inode_flag(whiteout, FI_INC_LINK); + err = f2fs_add_link(old_dentry, whiteout); + if (err) + goto put_out_dir; + + spin_lock(&whiteout->i_lock); + whiteout->i_state &= ~I_LINKABLE; + spin_unlock(&whiteout->i_lock); + + iput(whiteout); + } + + if (old_dir_entry) { + if (old_dir != new_dir && !whiteout) + f2fs_set_link(old_inode, old_dir_entry, + old_dir_page, new_dir); + else + f2fs_put_page(old_dir_page, 0); + f2fs_i_links_write(old_dir, false); + } + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { + f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + if (S_ISDIR(old_inode->i_mode)) + f2fs_add_ino_entry(sbi, old_inode->i_ino, + TRANS_DIR_INO); + } + + f2fs_unlock_op(sbi); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + f2fs_sync_fs(sbi->sb, 1); + return 0; + +put_out_dir: + f2fs_unlock_op(sbi); + if (new_page) + f2fs_put_page(new_page, 0); +out_dir: + if (old_dir_entry) + f2fs_put_page(old_dir_page, 0); +out_old: + f2fs_put_page(old_page, 0); +out: + if (whiteout) + iput(whiteout); + return err; +} + +static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct page *old_dir_page, *new_dir_page; + struct page *old_page, *new_page; + struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL; + struct f2fs_dir_entry *old_entry, *new_entry; + int old_nlink = 0, new_nlink = 0; + int err = -ENOENT; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid)) || + (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(old_dir)->i_projid, + F2FS_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (!old_entry) { + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); + goto out; + } + + new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); + if (!new_entry) { + if (IS_ERR(new_page)) + err = PTR_ERR(new_page); + goto out_old; + } + + /* prepare for updating ".." directory entry info later */ + if (old_dir != new_dir) { + if (S_ISDIR(old_inode->i_mode)) { + old_dir_entry = f2fs_parent_dir(old_inode, + &old_dir_page); + if (!old_dir_entry) { + if (IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); + goto out_new; + } + } + + if (S_ISDIR(new_inode->i_mode)) { + new_dir_entry = f2fs_parent_dir(new_inode, + &new_dir_page); + if (!new_dir_entry) { + if (IS_ERR(new_dir_page)) + err = PTR_ERR(new_dir_page); + goto out_old_dir; + } + } + } + + /* + * If cross rename between file and directory those are not + * in the same directory, we will inc nlink of file's parent + * later, so we should check upper boundary of its nlink. + */ + if ((!old_dir_entry || !new_dir_entry) && + old_dir_entry != new_dir_entry) { + old_nlink = old_dir_entry ? -1 : 1; + new_nlink = -old_nlink; + err = -EMLINK; + if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX)) + goto out_new_dir; + } + + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + + /* update ".." directory entry info of old dentry */ + if (old_dir_entry) + f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + + /* update ".." directory entry info of new dentry */ + if (new_dir_entry) + f2fs_set_link(new_inode, new_dir_entry, new_dir_page, old_dir); + + /* update directory entry info of old dir inode */ + f2fs_set_link(old_dir, old_entry, old_page, new_inode); + + down_write(&F2FS_I(old_inode)->i_sem); + if (!old_dir_entry) + file_lost_pino(old_inode); + else + /* adjust dir's i_pino to pass fsck check */ + f2fs_i_pino_write(old_inode, new_dir->i_ino); + up_write(&F2FS_I(old_inode)->i_sem); + + old_dir->i_ctime = current_time(old_dir); + if (old_nlink) { + down_write(&F2FS_I(old_dir)->i_sem); + f2fs_i_links_write(old_dir, old_nlink > 0); + up_write(&F2FS_I(old_dir)->i_sem); + } + f2fs_mark_inode_dirty_sync(old_dir, false); + + /* update directory entry info of new dir inode */ + f2fs_set_link(new_dir, new_entry, new_page, old_inode); + + down_write(&F2FS_I(new_inode)->i_sem); + if (!new_dir_entry) + file_lost_pino(new_inode); + else + /* adjust dir's i_pino to pass fsck check */ + f2fs_i_pino_write(new_inode, old_dir->i_ino); + up_write(&F2FS_I(new_inode)->i_sem); + + new_dir->i_ctime = current_time(new_dir); + if (new_nlink) { + down_write(&F2FS_I(new_dir)->i_sem); + f2fs_i_links_write(new_dir, new_nlink > 0); + up_write(&F2FS_I(new_dir)->i_sem); + } + f2fs_mark_inode_dirty_sync(new_dir, false); + + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { + f2fs_add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); + f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + } + + f2fs_unlock_op(sbi); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + f2fs_sync_fs(sbi->sb, 1); + return 0; +out_new_dir: + if (new_dir_entry) { + f2fs_put_page(new_dir_page, 0); + } +out_old_dir: + if (old_dir_entry) { + f2fs_put_page(old_dir_page, 0); + } +out_new: + f2fs_put_page(new_page, 0); +out_old: + f2fs_put_page(old_page, 0); +out: + return err; +} + +static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + int err; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + return -EINVAL; + + err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); + if (err) + return err; + + if (flags & RENAME_EXCHANGE) { + return f2fs_cross_rename(old_dir, old_dentry, + new_dir, new_dentry); + } + /* + * VFS has already handled the new dentry existence case, + * here, we just deal with "RENAME_NOREPLACE" as regular rename. + */ + return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); +} + +static const char *f2fs_encrypted_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct page *page; + const char *target; + + if (!dentry) + return ERR_PTR(-ECHILD); + + page = read_mapping_page(inode->i_mapping, 0, NULL); + if (IS_ERR(page)) + return ERR_CAST(page); + + target = fscrypt_get_symlink(inode, page_address(page), + inode->i_sb->s_blocksize, done); + put_page(page); + return target; +} + +static int f2fs_encrypted_symlink_getattr(const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + f2fs_getattr(path, stat, request_mask, query_flags); + + return fscrypt_symlink_getattr(path, stat); +} + +const struct inode_operations f2fs_encrypted_symlink_inode_operations = { + .get_link = f2fs_encrypted_get_link, + .getattr = f2fs_encrypted_symlink_getattr, + .setattr = f2fs_setattr, +#ifdef CONFIG_F2FS_FS_XATTR + .listxattr = f2fs_listxattr, +#endif +}; + +const struct inode_operations f2fs_dir_inode_operations = { + .create = f2fs_create, + .lookup = f2fs_lookup, + .link = f2fs_link, + .unlink = f2fs_unlink, + .symlink = f2fs_symlink, + .mkdir = f2fs_mkdir, + .rmdir = f2fs_rmdir, + .mknod = f2fs_mknod, + .rename = f2fs_rename2, + .tmpfile = f2fs_tmpfile, + .getattr = f2fs_getattr, + .setattr = f2fs_setattr, + .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, +#ifdef CONFIG_F2FS_FS_XATTR + .listxattr = f2fs_listxattr, +#endif +}; + +const struct inode_operations f2fs_symlink_inode_operations = { + .get_link = f2fs_get_link, + .getattr = f2fs_getattr, + .setattr = f2fs_setattr, +#ifdef CONFIG_F2FS_FS_XATTR + .listxattr = f2fs_listxattr, +#endif +}; + +const struct inode_operations f2fs_special_inode_operations = { + .getattr = f2fs_getattr, + .setattr = f2fs_setattr, + .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, +#ifdef CONFIG_F2FS_FS_XATTR + .listxattr = f2fs_listxattr, +#endif +}; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c new file mode 100644 index 000000000..ff3f97ba1 --- /dev/null +++ b/fs/f2fs/node.c @@ -0,0 +1,3174 @@ +/* + * fs/f2fs/node.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/mpage.h> +#include <linux/backing-dev.h> +#include <linux/blkdev.h> +#include <linux/pagevec.h> +#include <linux/swap.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "xattr.h" +#include "trace.h" +#include <trace/events/f2fs.h> + +#define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) + +static struct kmem_cache *nat_entry_slab; +static struct kmem_cache *free_nid_slab; +static struct kmem_cache *nat_entry_set_slab; +static struct kmem_cache *fsync_node_entry_slab; + +/* + * Check whether the given nid is within node id range. + */ +int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) +{ + if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: out-of-range nid=%x, run fsck to fix.", + __func__, nid); + return -EFSCORRUPTED; + } + return 0; +} + +bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct sysinfo val; + unsigned long avail_ram; + unsigned long mem_size = 0; + bool res = false; + + si_meminfo(&val); + + /* only uses low memory */ + avail_ram = val.totalram - val.totalhigh; + + /* + * give 25%, 25%, 50%, 50%, 50% memory for each components respectively + */ + if (type == FREE_NIDS) { + mem_size = (nm_i->nid_cnt[FREE_NID] * + sizeof(struct free_nid)) >> PAGE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + } else if (type == NAT_ENTRIES) { + mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> + PAGE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + if (excess_cached_nats(sbi)) + res = false; + } else if (type == DIRTY_DENTS) { + if (sbi->sb->s_bdi->wb.dirty_exceeded) + return false; + mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == INO_ENTRIES) { + int i; + + for (i = 0; i < MAX_INO_ENTRY; i++) + mem_size += sbi->im[i].ino_num * + sizeof(struct ino_entry); + mem_size >>= PAGE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == EXTENT_CACHE) { + mem_size = (atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree) + + atomic_read(&sbi->total_ext_node) * + sizeof(struct extent_node)) >> PAGE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == INMEM_PAGES) { + /* it allows 20% / total_ram for inmemory pages */ + mem_size = get_pages(sbi, F2FS_INMEM_PAGES); + res = mem_size < (val.totalram / 5); + } else { + if (!sbi->sb->s_bdi->wb.dirty_exceeded) + return true; + } + return res; +} + +static void clear_node_page_dirty(struct page *page) +{ + if (PageDirty(page)) { + f2fs_clear_radix_tree_dirty_tag(page); + clear_page_dirty_for_io(page); + dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); + } + ClearPageUptodate(page); +} + +static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) +{ + return f2fs_get_meta_page_nofail(sbi, current_nat_addr(sbi, nid)); +} + +static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct page *src_page; + struct page *dst_page; + pgoff_t dst_off; + void *src_addr; + void *dst_addr; + struct f2fs_nm_info *nm_i = NM_I(sbi); + + dst_off = next_nat_addr(sbi, current_nat_addr(sbi, nid)); + + /* get current nat block page with lock */ + src_page = get_current_nat_page(sbi, nid); + dst_page = f2fs_grab_meta_page(sbi, dst_off); + f2fs_bug_on(sbi, PageDirty(src_page)); + + src_addr = page_address(src_page); + dst_addr = page_address(dst_page); + memcpy(dst_addr, src_addr, PAGE_SIZE); + set_page_dirty(dst_page); + f2fs_put_page(src_page, 1); + + set_to_next_nat(nm_i, nid); + + return dst_page; +} + +static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail) +{ + struct nat_entry *new; + + if (no_fail) + new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); + else + new = kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); + if (new) { + nat_set_nid(new, nid); + nat_reset_flag(new); + } + return new; +} + +static void __free_nat_entry(struct nat_entry *e) +{ + kmem_cache_free(nat_entry_slab, e); +} + +/* must be locked by nat_tree_lock */ +static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, + struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail) +{ + if (no_fail) + f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne); + else if (radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne)) + return NULL; + + if (raw_ne) + node_info_from_raw_nat(&ne->ni, raw_ne); + + spin_lock(&nm_i->nat_list_lock); + list_add_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + + nm_i->nat_cnt++; + return ne; +} + +static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) +{ + struct nat_entry *ne; + + ne = radix_tree_lookup(&nm_i->nat_root, n); + + /* for recent accessed nat entry, move it to tail of lru list */ + if (ne && !get_nat_flag(ne, IS_DIRTY)) { + spin_lock(&nm_i->nat_list_lock); + if (!list_empty(&ne->list)) + list_move_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + } + + return ne; +} + +static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, + nid_t start, unsigned int nr, struct nat_entry **ep) +{ + return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr); +} + +static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) +{ + radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); + nm_i->nat_cnt--; + __free_nat_entry(e); +} + +static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) +{ + nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); + struct nat_entry_set *head; + + head = radix_tree_lookup(&nm_i->nat_set_root, set); + if (!head) { + head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS); + + INIT_LIST_HEAD(&head->entry_list); + INIT_LIST_HEAD(&head->set_list); + head->set = set; + head->entry_cnt = 0; + f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); + } + return head; +} + +static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) +{ + struct nat_entry_set *head; + bool new_ne = nat_get_blkaddr(ne) == NEW_ADDR; + + if (!new_ne) + head = __grab_nat_entry_set(nm_i, ne); + + /* + * update entry_cnt in below condition: + * 1. update NEW_ADDR to valid block address; + * 2. update old block address to new one; + */ + if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) || + !get_nat_flag(ne, IS_DIRTY))) + head->entry_cnt++; + + set_nat_flag(ne, IS_PREALLOC, new_ne); + + if (get_nat_flag(ne, IS_DIRTY)) + goto refresh_list; + + nm_i->dirty_nat_cnt++; + set_nat_flag(ne, IS_DIRTY, true); +refresh_list: + spin_lock(&nm_i->nat_list_lock); + if (new_ne) + list_del_init(&ne->list); + else + list_move_tail(&ne->list, &head->entry_list); + spin_unlock(&nm_i->nat_list_lock); +} + +static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry_set *set, struct nat_entry *ne) +{ + spin_lock(&nm_i->nat_list_lock); + list_move_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + + set_nat_flag(ne, IS_DIRTY, false); + set->entry_cnt--; + nm_i->dirty_nat_cnt--; +} + +static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, + nid_t start, unsigned int nr, struct nat_entry_set **ep) +{ + return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep, + start, nr); +} + +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page) +{ + return NODE_MAPPING(sbi) == page->mapping && + IS_DNODE(page) && is_cold_node(page); +} + +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi) +{ + spin_lock_init(&sbi->fsync_node_lock); + INIT_LIST_HEAD(&sbi->fsync_node_list); + sbi->fsync_seg_id = 0; + sbi->fsync_node_num = 0; +} + +static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi, + struct page *page) +{ + struct fsync_node_entry *fn; + unsigned long flags; + unsigned int seq_id; + + fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS); + + get_page(page); + fn->page = page; + INIT_LIST_HEAD(&fn->list); + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + list_add_tail(&fn->list, &sbi->fsync_node_list); + fn->seq_id = sbi->fsync_seg_id++; + seq_id = fn->seq_id; + sbi->fsync_node_num++; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + + return seq_id; +} + +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page) +{ + struct fsync_node_entry *fn; + unsigned long flags; + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + list_for_each_entry(fn, &sbi->fsync_node_list, list) { + if (fn->page == page) { + list_del(&fn->list); + sbi->fsync_node_num--; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + kmem_cache_free(fsync_node_entry_slab, fn); + put_page(page); + return; + } + } + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + f2fs_bug_on(sbi, 1); +} + +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi) +{ + unsigned long flags; + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + sbi->fsync_seg_id = 0; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); +} + +int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + bool need = false; + + down_read(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (e) { + if (!get_nat_flag(e, IS_CHECKPOINTED) && + !get_nat_flag(e, HAS_FSYNCED_INODE)) + need = true; + } + up_read(&nm_i->nat_tree_lock); + return need; +} + +bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + bool is_cp = true; + + down_read(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (e && !get_nat_flag(e, IS_CHECKPOINTED)) + is_cp = false; + up_read(&nm_i->nat_tree_lock); + return is_cp; +} + +bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + bool need_update = true; + + down_read(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, ino); + if (e && get_nat_flag(e, HAS_LAST_FSYNC) && + (get_nat_flag(e, IS_CHECKPOINTED) || + get_nat_flag(e, HAS_FSYNCED_INODE))) + need_update = false; + up_read(&nm_i->nat_tree_lock); + return need_update; +} + +/* must be locked by nat_tree_lock */ +static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, + struct f2fs_nat_entry *ne) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *new, *e; + + new = __alloc_nat_entry(nid, false); + if (!new) + return; + + down_write(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (!e) + e = __init_nat_entry(nm_i, new, ne, false); + else + f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || + nat_get_blkaddr(e) != + le32_to_cpu(ne->block_addr) || + nat_get_version(e) != ne->version); + up_write(&nm_i->nat_tree_lock); + if (e != new) + __free_nat_entry(new); +} + +static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, + block_t new_blkaddr, bool fsync_done) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + struct nat_entry *new = __alloc_nat_entry(ni->nid, true); + + down_write(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, ni->nid); + if (!e) { + e = __init_nat_entry(nm_i, new, NULL, true); + copy_node_info(&e->ni, ni); + f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); + } else if (new_blkaddr == NEW_ADDR) { + /* + * when nid is reallocated, + * previous nat entry can be remained in nat cache. + * So, reinitialize it with new information. + */ + copy_node_info(&e->ni, ni); + f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR); + } + /* let's free early to reduce memory consumption */ + if (e != new) + __free_nat_entry(new); + + /* sanity check */ + f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr); + f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR && + new_blkaddr == NULL_ADDR); + f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR && + new_blkaddr == NEW_ADDR); + f2fs_bug_on(sbi, is_valid_data_blkaddr(sbi, nat_get_blkaddr(e)) && + new_blkaddr == NEW_ADDR); + + /* increment version no as node is removed */ + if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { + unsigned char version = nat_get_version(e); + nat_set_version(e, inc_node_version(version)); + } + + /* change address */ + nat_set_blkaddr(e, new_blkaddr); + if (!is_valid_data_blkaddr(sbi, new_blkaddr)) + set_nat_flag(e, IS_CHECKPOINTED, false); + __set_nat_cache_dirty(nm_i, e); + + /* update fsync_mark if its inode nat entry is still alive */ + if (ni->nid != ni->ino) + e = __lookup_nat_cache(nm_i, ni->ino); + if (e) { + if (fsync_done && ni->nid == ni->ino) + set_nat_flag(e, HAS_FSYNCED_INODE, true); + set_nat_flag(e, HAS_LAST_FSYNC, fsync_done); + } + up_write(&nm_i->nat_tree_lock); +} + +int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + int nr = nr_shrink; + + if (!down_write_trylock(&nm_i->nat_tree_lock)) + return 0; + + spin_lock(&nm_i->nat_list_lock); + while (nr_shrink) { + struct nat_entry *ne; + + if (list_empty(&nm_i->nat_entries)) + break; + + ne = list_first_entry(&nm_i->nat_entries, + struct nat_entry, list); + list_del(&ne->list); + spin_unlock(&nm_i->nat_list_lock); + + __del_from_nat_cache(nm_i, ne); + nr_shrink--; + + spin_lock(&nm_i->nat_list_lock); + } + spin_unlock(&nm_i->nat_list_lock); + + up_write(&nm_i->nat_tree_lock); + return nr - nr_shrink; +} + +/* + * This function always returns success + */ +int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, + struct node_info *ni) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + nid_t start_nid = START_NID(nid); + struct f2fs_nat_block *nat_blk; + struct page *page = NULL; + struct f2fs_nat_entry ne; + struct nat_entry *e; + pgoff_t index; + int i; + + ni->nid = nid; + + /* Check nat cache */ + down_read(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (e) { + ni->ino = nat_get_ino(e); + ni->blk_addr = nat_get_blkaddr(e); + ni->version = nat_get_version(e); + up_read(&nm_i->nat_tree_lock); + return 0; + } + + memset(&ne, 0, sizeof(struct f2fs_nat_entry)); + + /* Check current segment summary */ + down_read(&curseg->journal_rwsem); + i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); + if (i >= 0) { + ne = nat_in_journal(journal, i); + node_info_from_raw_nat(ni, &ne); + } + up_read(&curseg->journal_rwsem); + if (i >= 0) { + up_read(&nm_i->nat_tree_lock); + goto cache; + } + + /* Fill node_info from nat page */ + index = current_nat_addr(sbi, nid); + up_read(&nm_i->nat_tree_lock); + + page = f2fs_get_meta_page(sbi, index); + if (IS_ERR(page)) + return PTR_ERR(page); + + nat_blk = (struct f2fs_nat_block *)page_address(page); + ne = nat_blk->entries[nid - start_nid]; + node_info_from_raw_nat(ni, &ne); + f2fs_put_page(page, 1); +cache: + /* cache nat entry */ + cache_nat_entry(sbi, nid, &ne); + return 0; +} + +/* + * readahead MAX_RA_NODE number of node pages. + */ +static void f2fs_ra_node_pages(struct page *parent, int start, int n) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + struct blk_plug plug; + int i, end; + nid_t nid; + + blk_start_plug(&plug); + + /* Then, try readahead for siblings of the desired node */ + end = start + n; + end = min(end, NIDS_PER_BLOCK); + for (i = start; i < end; i++) { + nid = get_nid(parent, i, false); + f2fs_ra_node_page(sbi, nid); + } + + blk_finish_plug(&plug); +} + +pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) +{ + const long direct_index = ADDRS_PER_INODE(dn->inode); + const long direct_blks = ADDRS_PER_BLOCK; + const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; + unsigned int skipped_unit = ADDRS_PER_BLOCK; + int cur_level = dn->cur_level; + int max_level = dn->max_level; + pgoff_t base = 0; + + if (!dn->max_level) + return pgofs + 1; + + while (max_level-- > cur_level) + skipped_unit *= NIDS_PER_BLOCK; + + switch (dn->max_level) { + case 3: + base += 2 * indirect_blks; + case 2: + base += 2 * direct_blks; + case 1: + base += direct_index; + break; + default: + f2fs_bug_on(F2FS_I_SB(dn->inode), 1); + } + + return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base; +} + +/* + * The maximum depth is four. + * Offset[0] will have raw inode offset. + */ +static int get_node_path(struct inode *inode, long block, + int offset[4], unsigned int noffset[4]) +{ + const long direct_index = ADDRS_PER_INODE(inode); + const long direct_blks = ADDRS_PER_BLOCK; + const long dptrs_per_blk = NIDS_PER_BLOCK; + const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; + const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK; + int n = 0; + int level = 0; + + noffset[0] = 0; + + if (block < direct_index) { + offset[n] = block; + goto got; + } + block -= direct_index; + if (block < direct_blks) { + offset[n++] = NODE_DIR1_BLOCK; + noffset[n] = 1; + offset[n] = block; + level = 1; + goto got; + } + block -= direct_blks; + if (block < direct_blks) { + offset[n++] = NODE_DIR2_BLOCK; + noffset[n] = 2; + offset[n] = block; + level = 1; + goto got; + } + block -= direct_blks; + if (block < indirect_blks) { + offset[n++] = NODE_IND1_BLOCK; + noffset[n] = 3; + offset[n++] = block / direct_blks; + noffset[n] = 4 + offset[n - 1]; + offset[n] = block % direct_blks; + level = 2; + goto got; + } + block -= indirect_blks; + if (block < indirect_blks) { + offset[n++] = NODE_IND2_BLOCK; + noffset[n] = 4 + dptrs_per_blk; + offset[n++] = block / direct_blks; + noffset[n] = 5 + dptrs_per_blk + offset[n - 1]; + offset[n] = block % direct_blks; + level = 2; + goto got; + } + block -= indirect_blks; + if (block < dindirect_blks) { + offset[n++] = NODE_DIND_BLOCK; + noffset[n] = 5 + (dptrs_per_blk * 2); + offset[n++] = block / indirect_blks; + noffset[n] = 6 + (dptrs_per_blk * 2) + + offset[n - 1] * (dptrs_per_blk + 1); + offset[n++] = (block / direct_blks) % dptrs_per_blk; + noffset[n] = 7 + (dptrs_per_blk * 2) + + offset[n - 2] * (dptrs_per_blk + 1) + + offset[n - 1]; + offset[n] = block % direct_blks; + level = 3; + goto got; + } else { + return -E2BIG; + } +got: + return level; +} + +/* + * Caller should call f2fs_put_dnode(dn). + * Also, it should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op() only if ro is not set RDONLY_NODE. + * In the case of RDONLY_NODE, we don't need to care about mutex. + */ +int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct page *npage[4]; + struct page *parent = NULL; + int offset[4]; + unsigned int noffset[4]; + nid_t nids[4]; + int level, i = 0; + int err = 0; + + level = get_node_path(dn->inode, index, offset, noffset); + if (level < 0) + return level; + + nids[0] = dn->inode->i_ino; + npage[0] = dn->inode_page; + + if (!npage[0]) { + npage[0] = f2fs_get_node_page(sbi, nids[0]); + if (IS_ERR(npage[0])) + return PTR_ERR(npage[0]); + } + + /* if inline_data is set, should not report any block indices */ + if (f2fs_has_inline_data(dn->inode) && index) { + err = -ENOENT; + f2fs_put_page(npage[0], 1); + goto release_out; + } + + parent = npage[0]; + if (level != 0) + nids[1] = get_nid(parent, offset[0], true); + dn->inode_page = npage[0]; + dn->inode_page_locked = true; + + /* get indirect or direct nodes */ + for (i = 1; i <= level; i++) { + bool done = false; + + if (!nids[i] && mode == ALLOC_NODE) { + /* alloc new node */ + if (!f2fs_alloc_nid(sbi, &(nids[i]))) { + err = -ENOSPC; + goto release_pages; + } + + dn->nid = nids[i]; + npage[i] = f2fs_new_node_page(dn, noffset[i]); + if (IS_ERR(npage[i])) { + f2fs_alloc_nid_failed(sbi, nids[i]); + err = PTR_ERR(npage[i]); + goto release_pages; + } + + set_nid(parent, offset[i - 1], nids[i], i == 1); + f2fs_alloc_nid_done(sbi, nids[i]); + done = true; + } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) { + npage[i] = f2fs_get_node_page_ra(parent, offset[i - 1]); + if (IS_ERR(npage[i])) { + err = PTR_ERR(npage[i]); + goto release_pages; + } + done = true; + } + if (i == 1) { + dn->inode_page_locked = false; + unlock_page(parent); + } else { + f2fs_put_page(parent, 1); + } + + if (!done) { + npage[i] = f2fs_get_node_page(sbi, nids[i]); + if (IS_ERR(npage[i])) { + err = PTR_ERR(npage[i]); + f2fs_put_page(npage[0], 0); + goto release_out; + } + } + if (i < level) { + parent = npage[i]; + nids[i + 1] = get_nid(parent, offset[i], false); + } + } + dn->nid = nids[level]; + dn->ofs_in_node = offset[level]; + dn->node_page = npage[level]; + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); + return 0; + +release_pages: + f2fs_put_page(parent, 1); + if (i > 1) + f2fs_put_page(npage[0], 0); +release_out: + dn->inode_page = NULL; + dn->node_page = NULL; + if (err == -ENOENT) { + dn->cur_level = i; + dn->max_level = level; + dn->ofs_in_node = offset[level]; + } + return err; +} + +static int truncate_node(struct dnode_of_data *dn) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct node_info ni; + int err; + pgoff_t index; + + err = f2fs_get_node_info(sbi, dn->nid, &ni); + if (err) + return err; + + /* Deallocate node address */ + f2fs_invalidate_blocks(sbi, ni.blk_addr); + dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); + set_node_addr(sbi, &ni, NULL_ADDR, false); + + if (dn->nid == dn->inode->i_ino) { + f2fs_remove_orphan_inode(sbi, dn->nid); + dec_valid_inode_count(sbi); + f2fs_inode_synced(dn->inode); + } + + clear_node_page_dirty(dn->node_page); + set_sbi_flag(sbi, SBI_IS_DIRTY); + + index = dn->node_page->index; + f2fs_put_page(dn->node_page, 1); + + invalidate_mapping_pages(NODE_MAPPING(sbi), + index, index); + + dn->node_page = NULL; + trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); + + return 0; +} + +static int truncate_dnode(struct dnode_of_data *dn) +{ + struct page *page; + int err; + + if (dn->nid == 0) + return 1; + + /* get direct node */ + page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid); + if (IS_ERR(page) && PTR_ERR(page) == -ENOENT) + return 1; + else if (IS_ERR(page)) + return PTR_ERR(page); + + /* Make dnode_of_data for parameter */ + dn->node_page = page; + dn->ofs_in_node = 0; + f2fs_truncate_data_blocks(dn); + err = truncate_node(dn); + if (err) + return err; + + return 1; +} + +static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, + int ofs, int depth) +{ + struct dnode_of_data rdn = *dn; + struct page *page; + struct f2fs_node *rn; + nid_t child_nid; + unsigned int child_nofs; + int freed = 0; + int i, ret; + + if (dn->nid == 0) + return NIDS_PER_BLOCK + 1; + + trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); + + page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid); + if (IS_ERR(page)) { + trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); + return PTR_ERR(page); + } + + f2fs_ra_node_pages(page, ofs, NIDS_PER_BLOCK); + + rn = F2FS_NODE(page); + if (depth < 3) { + for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { + child_nid = le32_to_cpu(rn->in.nid[i]); + if (child_nid == 0) + continue; + rdn.nid = child_nid; + ret = truncate_dnode(&rdn); + if (ret < 0) + goto out_err; + if (set_nid(page, i, 0, false)) + dn->node_changed = true; + } + } else { + child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1; + for (i = ofs; i < NIDS_PER_BLOCK; i++) { + child_nid = le32_to_cpu(rn->in.nid[i]); + if (child_nid == 0) { + child_nofs += NIDS_PER_BLOCK + 1; + continue; + } + rdn.nid = child_nid; + ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1); + if (ret == (NIDS_PER_BLOCK + 1)) { + if (set_nid(page, i, 0, false)) + dn->node_changed = true; + child_nofs += ret; + } else if (ret < 0 && ret != -ENOENT) { + goto out_err; + } + } + freed = child_nofs; + } + + if (!ofs) { + /* remove current indirect node */ + dn->node_page = page; + ret = truncate_node(dn); + if (ret) + goto out_err; + freed++; + } else { + f2fs_put_page(page, 1); + } + trace_f2fs_truncate_nodes_exit(dn->inode, freed); + return freed; + +out_err: + f2fs_put_page(page, 1); + trace_f2fs_truncate_nodes_exit(dn->inode, ret); + return ret; +} + +static int truncate_partial_nodes(struct dnode_of_data *dn, + struct f2fs_inode *ri, int *offset, int depth) +{ + struct page *pages[2]; + nid_t nid[3]; + nid_t child_nid; + int err = 0; + int i; + int idx = depth - 2; + + nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); + if (!nid[0]) + return 0; + + /* get indirect nodes in the path */ + for (i = 0; i < idx + 1; i++) { + /* reference count'll be increased */ + pages[i] = f2fs_get_node_page(F2FS_I_SB(dn->inode), nid[i]); + if (IS_ERR(pages[i])) { + err = PTR_ERR(pages[i]); + idx = i - 1; + goto fail; + } + nid[i + 1] = get_nid(pages[i], offset[i + 1], false); + } + + f2fs_ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK); + + /* free direct nodes linked to a partial indirect node */ + for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { + child_nid = get_nid(pages[idx], i, false); + if (!child_nid) + continue; + dn->nid = child_nid; + err = truncate_dnode(dn); + if (err < 0) + goto fail; + if (set_nid(pages[idx], i, 0, false)) + dn->node_changed = true; + } + + if (offset[idx + 1] == 0) { + dn->node_page = pages[idx]; + dn->nid = nid[idx]; + err = truncate_node(dn); + if (err) + goto fail; + } else { + f2fs_put_page(pages[idx], 1); + } + offset[idx]++; + offset[idx + 1] = 0; + idx--; +fail: + for (i = idx; i >= 0; i--) + f2fs_put_page(pages[i], 1); + + trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err); + + return err; +} + +/* + * All the block addresses of data and nodes should be nullified. + */ +int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int err = 0, cont = 1; + int level, offset[4], noffset[4]; + unsigned int nofs = 0; + struct f2fs_inode *ri; + struct dnode_of_data dn; + struct page *page; + + trace_f2fs_truncate_inode_blocks_enter(inode, from); + + level = get_node_path(inode, from, offset, noffset); + if (level < 0) + return level; + + page = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) { + trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); + return PTR_ERR(page); + } + + set_new_dnode(&dn, inode, page, NULL, 0); + unlock_page(page); + + ri = F2FS_INODE(page); + switch (level) { + case 0: + case 1: + nofs = noffset[1]; + break; + case 2: + nofs = noffset[1]; + if (!offset[level - 1]) + goto skip_partial; + err = truncate_partial_nodes(&dn, ri, offset, level); + if (err < 0 && err != -ENOENT) + goto fail; + nofs += 1 + NIDS_PER_BLOCK; + break; + case 3: + nofs = 5 + 2 * NIDS_PER_BLOCK; + if (!offset[level - 1]) + goto skip_partial; + err = truncate_partial_nodes(&dn, ri, offset, level); + if (err < 0 && err != -ENOENT) + goto fail; + break; + default: + BUG(); + } + +skip_partial: + while (cont) { + dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); + switch (offset[0]) { + case NODE_DIR1_BLOCK: + case NODE_DIR2_BLOCK: + err = truncate_dnode(&dn); + break; + + case NODE_IND1_BLOCK: + case NODE_IND2_BLOCK: + err = truncate_nodes(&dn, nofs, offset[1], 2); + break; + + case NODE_DIND_BLOCK: + err = truncate_nodes(&dn, nofs, offset[1], 3); + cont = 0; + break; + + default: + BUG(); + } + if (err < 0 && err != -ENOENT) + goto fail; + if (offset[1] == 0 && + ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { + lock_page(page); + BUG_ON(page->mapping != NODE_MAPPING(sbi)); + f2fs_wait_on_page_writeback(page, NODE, true); + ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; + set_page_dirty(page); + unlock_page(page); + } + offset[1] = 0; + offset[0]++; + nofs += err; + } +fail: + f2fs_put_page(page, 0); + trace_f2fs_truncate_inode_blocks_exit(inode, err); + return err > 0 ? 0 : err; +} + +/* caller must lock inode page */ +int f2fs_truncate_xattr_node(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t nid = F2FS_I(inode)->i_xattr_nid; + struct dnode_of_data dn; + struct page *npage; + int err; + + if (!nid) + return 0; + + npage = f2fs_get_node_page(sbi, nid); + if (IS_ERR(npage)) + return PTR_ERR(npage); + + set_new_dnode(&dn, inode, NULL, npage, nid); + err = truncate_node(&dn); + if (err) { + f2fs_put_page(npage, 1); + return err; + } + + f2fs_i_xnid_write(inode, 0); + + return 0; +} + +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + */ +int f2fs_remove_inode_page(struct inode *inode) +{ + struct dnode_of_data dn; + int err; + + set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); + err = f2fs_get_dnode_of_data(&dn, 0, LOOKUP_NODE); + if (err) + return err; + + err = f2fs_truncate_xattr_node(inode); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + + /* remove potential inline_data blocks */ + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) + f2fs_truncate_data_blocks_range(&dn, 1); + + /* 0 is possible, after f2fs_new_inode() has failed */ + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { + f2fs_put_dnode(&dn); + return -EIO; + } + + if (unlikely(inode->i_blocks != 0 && inode->i_blocks != 8)) { + f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, + "Inconsistent i_blocks, ino:%lu, iblocks:%llu", + inode->i_ino, + (unsigned long long)inode->i_blocks); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + } + + /* will put inode & node pages */ + err = truncate_node(&dn); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + return 0; +} + +struct page *f2fs_new_inode_page(struct inode *inode) +{ + struct dnode_of_data dn; + + /* allocate inode page for new inode */ + set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); + + /* caller should f2fs_put_page(page, 1); */ + return f2fs_new_node_page(&dn, 0); +} + +struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct node_info new_ni; + struct page *page; + int err; + + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) + return ERR_PTR(-EPERM); + + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false); + if (!page) + return ERR_PTR(-ENOMEM); + + if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs)))) + goto fail; + +#ifdef CONFIG_F2FS_CHECK_FS + err = f2fs_get_node_info(sbi, dn->nid, &new_ni); + if (err) { + dec_valid_node_count(sbi, dn->inode, !ofs); + goto fail; + } + f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); +#endif + new_ni.nid = dn->nid; + new_ni.ino = dn->inode->i_ino; + new_ni.blk_addr = NULL_ADDR; + new_ni.flag = 0; + new_ni.version = 0; + set_node_addr(sbi, &new_ni, NEW_ADDR, false); + + f2fs_wait_on_page_writeback(page, NODE, true); + fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); + set_cold_node(page, S_ISDIR(dn->inode->i_mode)); + if (!PageUptodate(page)) + SetPageUptodate(page); + if (set_page_dirty(page)) + dn->node_changed = true; + + if (f2fs_has_xattr_block(ofs)) + f2fs_i_xnid_write(dn->inode, dn->nid); + + if (ofs == 0) + inc_valid_inode_count(sbi); + return page; + +fail: + clear_node_page_dirty(page); + f2fs_put_page(page, 1); + return ERR_PTR(err); +} + +/* + * Caller should do after getting the following values. + * 0: f2fs_put_page(page, 0) + * LOCKED_PAGE or error: f2fs_put_page(page, 1) + */ +static int read_node_page(struct page *page, int op_flags) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + struct node_info ni; + struct f2fs_io_info fio = { + .sbi = sbi, + .type = NODE, + .op = REQ_OP_READ, + .op_flags = op_flags, + .page = page, + .encrypted_page = NULL, + }; + int err; + + if (PageUptodate(page)) { + if (!f2fs_inode_chksum_verify(sbi, page)) { + ClearPageUptodate(page); + return -EFSBADCRC; + } + return LOCKED_PAGE; + } + + err = f2fs_get_node_info(sbi, page->index, &ni); + if (err) + return err; + + if (unlikely(ni.blk_addr == NULL_ADDR) || + is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) { + ClearPageUptodate(page); + return -ENOENT; + } + + fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr; + return f2fs_submit_page_bio(&fio); +} + +/* + * Readahead a node page + */ +void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct page *apage; + int err; + + if (!nid) + return; + if (f2fs_check_nid_range(sbi, nid)) + return; + + rcu_read_lock(); + apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid); + rcu_read_unlock(); + if (apage) + return; + + apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); + if (!apage) + return; + + err = read_node_page(apage, REQ_RAHEAD); + f2fs_put_page(apage, err ? 1 : 0); +} + +static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, + struct page *parent, int start) +{ + struct page *page; + int err; + + if (!nid) + return ERR_PTR(-ENOENT); + if (f2fs_check_nid_range(sbi, nid)) + return ERR_PTR(-EINVAL); +repeat: + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); + if (!page) + return ERR_PTR(-ENOMEM); + + err = read_node_page(page, 0); + if (err < 0) { + f2fs_put_page(page, 1); + return ERR_PTR(err); + } else if (err == LOCKED_PAGE) { + err = 0; + goto page_hit; + } + + if (parent) + f2fs_ra_node_pages(parent, start + 1, MAX_RA_NODE); + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + f2fs_put_page(page, 1); + goto repeat; + } + + if (unlikely(!PageUptodate(page))) { + err = -EIO; + goto out_err; + } + + if (!f2fs_inode_chksum_verify(sbi, page)) { + err = -EFSBADCRC; + goto out_err; + } +page_hit: + if(unlikely(nid != nid_of_node(page))) { + f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, " + "nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + nid, nid_of_node(page), ino_of_node(page), + ofs_of_node(page), cpver_of_node(page), + next_blkaddr_of_node(page)); + err = -EINVAL; +out_err: + ClearPageUptodate(page); + f2fs_put_page(page, 1); + return ERR_PTR(err); + } + return page; +} + +struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +{ + return __get_node_page(sbi, nid, NULL, 0); +} + +struct page *f2fs_get_node_page_ra(struct page *parent, int start) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + nid_t nid = get_nid(parent, start, false); + + return __get_node_page(sbi, nid, parent, start); +} + +static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct inode *inode; + struct page *page; + int ret; + + /* should flush inline_data before evict_inode */ + inode = ilookup(sbi->sb, ino); + if (!inode) + return; + + page = f2fs_pagecache_get_page(inode->i_mapping, 0, + FGP_LOCK|FGP_NOWAIT, 0); + if (!page) + goto iput_out; + + if (!PageUptodate(page)) + goto page_out; + + if (!PageDirty(page)) + goto page_out; + + if (!clear_page_dirty_for_io(page)) + goto page_out; + + ret = f2fs_write_inline_data(inode, page); + inode_dec_dirty_pages(inode); + f2fs_remove_dirty_inode(inode); + if (ret) + set_page_dirty(page); +page_out: + f2fs_put_page(page, 1); +iput_out: + iput(inode); +} + +static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) +{ + pgoff_t index; + struct pagevec pvec; + struct page *last_page = NULL; + int nr_pages; + + pagevec_init(&pvec); + index = 0; + + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY))) { + int i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + return ERR_PTR(-EIO); + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (last_page) + f2fs_put_page(last_page, 0); + + get_page(page); + last_page = page; + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + return last_page; +} + +static int __write_node_page(struct page *page, bool atomic, bool *submitted, + struct writeback_control *wbc, bool do_balance, + enum iostat_type io_type, unsigned int *seq_id) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + nid_t nid; + struct node_info ni; + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = ino_of_node(page), + .type = NODE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .page = page, + .encrypted_page = NULL, + .submitted = false, + .io_type = io_type, + .io_wbc = wbc, + }; + unsigned int seq; + + trace_f2fs_writepage(page, NODE); + + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + + if (wbc->sync_mode == WB_SYNC_NONE && + IS_DNODE(page) && is_cold_node(page)) + goto redirty_out; + + /* get old block addr of this node page */ + nid = nid_of_node(page); + f2fs_bug_on(sbi, page->index != nid); + + if (f2fs_get_node_info(sbi, nid, &ni)) + goto redirty_out; + + if (wbc->for_reclaim) { + if (!down_read_trylock(&sbi->node_write)) + goto redirty_out; + } else { + down_read(&sbi->node_write); + } + + /* This page is already truncated */ + if (unlikely(ni.blk_addr == NULL_ADDR)) { + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + unlock_page(page); + return 0; + } + + if (__is_valid_data_blkaddr(ni.blk_addr) && + !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC)) { + up_read(&sbi->node_write); + goto redirty_out; + } + + if (atomic && !test_opt(sbi, NOBARRIER)) + fio.op_flags |= REQ_PREFLUSH | REQ_FUA; + + /* should add to global list before clearing PAGECACHE status */ + if (f2fs_in_warm_node_list(sbi, page)) { + seq = f2fs_add_fsync_node_entry(sbi, page); + if (seq_id) + *seq_id = seq; + } + + set_page_writeback(page); + ClearPageError(page); + + fio.old_blkaddr = ni.blk_addr; + f2fs_do_write_node_page(nid, &fio); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + + if (wbc->for_reclaim) { + f2fs_submit_merged_write_cond(sbi, page->mapping->host, 0, + page->index, NODE); + submitted = NULL; + } + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_submit_merged_write(sbi, NODE); + submitted = NULL; + } + if (submitted) + *submitted = fio.submitted; + + if (do_balance) + f2fs_balance_fs(sbi, false); + return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; +} + +void f2fs_move_node_page(struct page *node_page, int gc_type) +{ + if (gc_type == FG_GC) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + .for_reclaim = 0, + }; + + set_page_dirty(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, true); + + f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page)); + if (!clear_page_dirty_for_io(node_page)) + goto out_page; + + if (__write_node_page(node_page, false, NULL, + &wbc, false, FS_GC_NODE_IO, NULL)) + unlock_page(node_page); + goto release_page; + } else { + /* set page dirty and write it */ + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } +out_page: + unlock_page(node_page); +release_page: + f2fs_put_page(node_page, 0); +} + +static int f2fs_write_node_page(struct page *page, + struct writeback_control *wbc) +{ + return __write_node_page(page, false, NULL, wbc, false, + FS_NODE_IO, NULL); +} + +int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic, + unsigned int *seq_id) +{ + pgoff_t index; + pgoff_t last_idx = ULONG_MAX; + struct pagevec pvec; + int ret = 0; + struct page *last_page = NULL; + bool marked = false; + nid_t ino = inode->i_ino; + int nr_pages; + + if (atomic) { + last_page = last_fsync_dnode(sbi, ino); + if (IS_ERR_OR_NULL(last_page)) + return PTR_ERR_OR_ZERO(last_page); + } +retry: + pagevec_init(&pvec); + index = 0; + + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY))) { + int i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + bool submitted = false; + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + ret = -EIO; + goto out; + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page) && page != last_page) { + /* someone wrote it for us */ + goto continue_unlock; + } + + f2fs_wait_on_page_writeback(page, NODE, true); + BUG_ON(PageWriteback(page)); + + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); + + if (!atomic || page == last_page) { + set_fsync_mark(page, 1); + if (IS_INODE(page)) { + if (is_inode_flag_set(inode, + FI_DIRTY_INODE)) + f2fs_update_inode(inode, page); + set_dentry_mark(page, + f2fs_need_dentry_mark(sbi, ino)); + } + /* may be written by other thread */ + if (!PageDirty(page)) + set_page_dirty(page); + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = __write_node_page(page, atomic && + page == last_page, + &submitted, wbc, true, + FS_NODE_IO, seq_id); + if (ret) { + unlock_page(page); + f2fs_put_page(last_page, 0); + break; + } else if (submitted) { + last_idx = page->index; + } + + if (page == last_page) { + f2fs_put_page(page, 0); + marked = true; + break; + } + } + pagevec_release(&pvec); + cond_resched(); + + if (ret || marked) + break; + } + if (!ret && atomic && !marked) { + f2fs_msg(sbi->sb, KERN_DEBUG, + "Retry to write fsync mark: ino=%u, idx=%lx", + ino, last_page->index); + lock_page(last_page); + f2fs_wait_on_page_writeback(last_page, NODE, true); + set_page_dirty(last_page); + unlock_page(last_page); + goto retry; + } +out: + if (last_idx != ULONG_MAX) + f2fs_submit_merged_write_cond(sbi, NULL, ino, last_idx, NODE); + return ret ? -EIO: 0; +} + +int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, + struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type) +{ + pgoff_t index; + struct pagevec pvec; + int step = 0; + int nwritten = 0; + int ret = 0; + int nr_pages, done = 0; + + pagevec_init(&pvec); + +next_step: + index = 0; + + while (!done && (nr_pages = pagevec_lookup_tag(&pvec, + NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) { + int i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + bool submitted = false; + + /* give a priority to WB_SYNC threads */ + if (atomic_read(&sbi->wb_sync_req[NODE]) && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } + + /* + * flushing sequence with step: + * 0. indirect nodes + * 1. dentry dnodes + * 2. file dnodes + */ + if (step == 0 && IS_DNODE(page)) + continue; + if (step == 1 && (!IS_DNODE(page) || + is_cold_node(page))) + continue; + if (step == 2 && (!IS_DNODE(page) || + !is_cold_node(page))) + continue; +lock_node: + if (wbc->sync_mode == WB_SYNC_ALL) + lock_page(page); + else if (!trylock_page(page)) + continue; + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + /* flush inline_data */ + if (is_inline_node(page)) { + clear_inline_node(page); + unlock_page(page); + flush_inline_data(sbi, ino_of_node(page)); + goto lock_node; + } + + f2fs_wait_on_page_writeback(page, NODE, true); + + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); + + ret = __write_node_page(page, false, &submitted, + wbc, do_balance, io_type, NULL); + if (ret) + unlock_page(page); + else if (submitted) + nwritten++; + + if (--wbc->nr_to_write == 0) + break; + } + pagevec_release(&pvec); + cond_resched(); + + if (wbc->nr_to_write == 0) { + step = 2; + break; + } + } + + if (step < 2) { + if (wbc->sync_mode == WB_SYNC_NONE && step == 1) + goto out; + step++; + goto next_step; + } +out: + if (nwritten) + f2fs_submit_merged_write(sbi, NODE); + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + return ret; +} + +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, + unsigned int seq_id) +{ + struct fsync_node_entry *fn; + struct page *page; + struct list_head *head = &sbi->fsync_node_list; + unsigned long flags; + unsigned int cur_seq_id = 0; + int ret2, ret = 0; + + while (seq_id && cur_seq_id < seq_id) { + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + break; + } + fn = list_first_entry(head, struct fsync_node_entry, list); + if (fn->seq_id > seq_id) { + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + break; + } + cur_seq_id = fn->seq_id; + page = fn->page; + get_page(page); + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + + f2fs_wait_on_page_writeback(page, NODE, true); + if (TestClearPageError(page)) + ret = -EIO; + + put_page(page); + + if (ret) + break; + } + + ret2 = filemap_check_errors(NODE_MAPPING(sbi)); + if (!ret) + ret = ret2; + + return ret; +} + +static int f2fs_write_node_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct blk_plug plug; + long diff; + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + + /* balancing f2fs's metadata in background */ + f2fs_balance_fs_bg(sbi); + + /* collect a number of dirty node pages and write together */ + if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) + goto skip_write; + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req[NODE]); + else if (atomic_read(&sbi->wb_sync_req[NODE])) + goto skip_write; + + trace_f2fs_writepages(mapping->host, wbc, NODE); + + diff = nr_pages_to_write(sbi, NODE, wbc); + blk_start_plug(&plug); + f2fs_sync_node_pages(sbi, wbc, true, FS_NODE_IO); + blk_finish_plug(&plug); + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req[NODE]); + return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); + trace_f2fs_writepages(mapping->host, wbc, NODE); + return 0; +} + +static int f2fs_set_node_page_dirty(struct page *page) +{ + trace_f2fs_set_page_dirty(page, NODE); + + if (!PageUptodate(page)) + SetPageUptodate(page); +#ifdef CONFIG_F2FS_CHECK_FS + if (IS_INODE(page)) + f2fs_inode_chksum_set(F2FS_P_SB(page), page); +#endif + if (!PageDirty(page)) { + __set_page_dirty_nobuffers(page); + inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); + SetPagePrivate(page); + f2fs_trace_pid(page); + return 1; + } + return 0; +} + +/* + * Structure of the f2fs node operations + */ +const struct address_space_operations f2fs_node_aops = { + .writepage = f2fs_write_node_page, + .writepages = f2fs_write_node_pages, + .set_page_dirty = f2fs_set_node_page_dirty, + .invalidatepage = f2fs_invalidate_page, + .releasepage = f2fs_release_page, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif +}; + +static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, + nid_t n) +{ + return radix_tree_lookup(&nm_i->free_nid_root, n); +} + +static int __insert_free_nid(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_state state) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) + return err; + + f2fs_bug_on(sbi, state != i->state); + nm_i->nid_cnt[state]++; + if (state == FREE_NID) + list_add_tail(&i->list, &nm_i->free_nid_list); + return 0; +} + +static void __remove_free_nid(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_state state) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, state != i->state); + nm_i->nid_cnt[state]--; + if (state == FREE_NID) + list_del(&i->list); + radix_tree_delete(&nm_i->free_nid_root, i->nid); +} + +static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, + enum nid_state org_state, enum nid_state dst_state) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, org_state != i->state); + i->state = dst_state; + nm_i->nid_cnt[org_state]--; + nm_i->nid_cnt[dst_state]++; + + switch (dst_state) { + case PREALLOC_NID: + list_del(&i->list); + break; + case FREE_NID: + list_add_tail(&i->list, &nm_i->free_nid_list); + break; + default: + BUG_ON(1); + } +} + +static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, + bool set, bool build) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); + unsigned int nid_ofs = nid - START_NID(nid); + + if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + if (set) { + if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; + __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + nm_i->free_nid_count[nat_ofs]++; + } else { + if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + if (!build) + nm_i->free_nid_count[nat_ofs]--; + } +} + +/* return if the nid is recognized as free */ +static bool add_free_nid(struct f2fs_sb_info *sbi, + nid_t nid, bool build, bool update) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i, *e; + struct nat_entry *ne; + int err = -EINVAL; + bool ret = false; + + /* 0 nid should not be used */ + if (unlikely(nid == 0)) + return false; + + if (unlikely(f2fs_check_nid_range(sbi, nid))) + return false; + + i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); + i->nid = nid; + i->state = FREE_NID; + + radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); + + spin_lock(&nm_i->nid_list_lock); + + if (build) { + /* + * Thread A Thread B + * - f2fs_create + * - f2fs_new_inode + * - f2fs_alloc_nid + * - __insert_nid_to_list(PREALLOC_NID) + * - f2fs_balance_fs_bg + * - f2fs_build_free_nids + * - __f2fs_build_free_nids + * - scan_nat_page + * - add_free_nid + * - __lookup_nat_cache + * - f2fs_add_link + * - f2fs_init_inode_metadata + * - f2fs_new_inode_page + * - f2fs_new_node_page + * - set_node_addr + * - f2fs_alloc_nid_done + * - __remove_nid_from_list(PREALLOC_NID) + * - __insert_nid_to_list(FREE_NID) + */ + ne = __lookup_nat_cache(nm_i, nid); + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || + nat_get_blkaddr(ne) != NULL_ADDR)) + goto err_out; + + e = __lookup_free_nid_list(nm_i, nid); + if (e) { + if (e->state == FREE_NID) + ret = true; + goto err_out; + } + } + ret = true; + err = __insert_free_nid(sbi, i, FREE_NID); +err_out: + if (update) { + update_free_nid_bitmap(sbi, nid, ret, build); + if (!build) + nm_i->available_nids++; + } + spin_unlock(&nm_i->nid_list_lock); + radix_tree_preload_end(); + + if (err) + kmem_cache_free(free_nid_slab, i); + return ret; +} + +static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i; + bool need_free = false; + + spin_lock(&nm_i->nid_list_lock); + i = __lookup_free_nid_list(nm_i, nid); + if (i && i->state == FREE_NID) { + __remove_free_nid(sbi, i, FREE_NID); + need_free = true; + } + spin_unlock(&nm_i->nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); +} + +static int scan_nat_page(struct f2fs_sb_info *sbi, + struct page *nat_page, nid_t start_nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct f2fs_nat_block *nat_blk = page_address(nat_page); + block_t blk_addr; + unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); + int i; + + __set_bit_le(nat_ofs, nm_i->nat_block_bitmap); + + i = start_nid % NAT_ENTRY_PER_BLOCK; + + for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { + if (unlikely(start_nid >= nm_i->max_nid)) + break; + + blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); + + if (blk_addr == NEW_ADDR) + return -EINVAL; + + if (blk_addr == NULL_ADDR) { + add_free_nid(sbi, start_nid, true, true); + } else { + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, start_nid, false, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } + } + + return 0; +} + +static void scan_curseg_cache(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + int i; + + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + nid_t nid; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); + if (addr == NULL_ADDR) + add_free_nid(sbi, nid, true, false); + else + remove_free_nid(sbi, nid); + } + up_read(&curseg->journal_rwsem); +} + +static void scan_free_nid_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i, idx; + nid_t nid; + + down_read(&nm_i->nat_tree_lock); + + for (i = 0; i < nm_i->nat_blocks; i++) { + if (!test_bit_le(i, nm_i->nat_block_bitmap)) + continue; + if (!nm_i->free_nid_count[i]) + continue; + for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { + idx = find_next_bit_le(nm_i->free_nid_bitmap[i], + NAT_ENTRY_PER_BLOCK, idx); + if (idx >= NAT_ENTRY_PER_BLOCK) + break; + + nid = i * NAT_ENTRY_PER_BLOCK + idx; + add_free_nid(sbi, nid, true, false); + + if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS) + goto out; + } + } +out: + scan_curseg_cache(sbi); + + up_read(&nm_i->nat_tree_lock); +} + +static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, + bool sync, bool mount) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + int i = 0, ret; + nid_t nid = nm_i->next_scan_nid; + + if (unlikely(nid >= nm_i->max_nid)) + nid = 0; + + if (unlikely(nid % NAT_ENTRY_PER_BLOCK)) + nid = NAT_BLOCK_OFFSET(nid) * NAT_ENTRY_PER_BLOCK; + + /* Enough entries */ + if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) + return 0; + + if (!sync && !f2fs_available_free_memory(sbi, FREE_NIDS)) + return 0; + + if (!mount) { + /* try to find free nids in free_nid_bitmap */ + scan_free_nid_bits(sbi); + + if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) + return 0; + } + + /* readahead nat pages to be scanned */ + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, + META_NAT, true); + + down_read(&nm_i->nat_tree_lock); + + while (1) { + if (!test_bit_le(NAT_BLOCK_OFFSET(nid), + nm_i->nat_block_bitmap)) { + struct page *page = get_current_nat_page(sbi, nid); + + ret = scan_nat_page(sbi, page, nid); + f2fs_put_page(page, 1); + + if (ret) { + up_read(&nm_i->nat_tree_lock); + f2fs_bug_on(sbi, !mount); + f2fs_msg(sbi->sb, KERN_ERR, + "NAT is corrupt, run fsck to fix it"); + return -EINVAL; + } + } + + nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); + if (unlikely(nid >= nm_i->max_nid)) + nid = 0; + + if (++i >= FREE_NID_PAGES) + break; + } + + /* go to the next free nat pages to find free nids abundantly */ + nm_i->next_scan_nid = nid; + + /* find free nids from current sum_pages */ + scan_curseg_cache(sbi); + + up_read(&nm_i->nat_tree_lock); + + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), + nm_i->ra_nid_pages, META_NAT, false); + + return 0; +} + +int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) +{ + int ret; + + mutex_lock(&NM_I(sbi)->build_lock); + ret = __f2fs_build_free_nids(sbi, sync, mount); + mutex_unlock(&NM_I(sbi)->build_lock); + + return ret; +} + +/* + * If this function returns success, caller can obtain a new nid + * from second parameter of this function. + * The returned nid could be used ino as well as nid when inode is created. + */ +bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i = NULL; +retry: + if (time_to_inject(sbi, FAULT_ALLOC_NID)) { + f2fs_show_injection_info(FAULT_ALLOC_NID); + return false; + } + + spin_lock(&nm_i->nid_list_lock); + + if (unlikely(nm_i->available_nids == 0)) { + spin_unlock(&nm_i->nid_list_lock); + return false; + } + + /* We should not use stale free nids created by f2fs_build_free_nids */ + if (nm_i->nid_cnt[FREE_NID] && !on_f2fs_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); + i = list_first_entry(&nm_i->free_nid_list, + struct free_nid, list); + *nid = i->nid; + + __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID); + nm_i->available_nids--; + + update_free_nid_bitmap(sbi, *nid, false, false); + + spin_unlock(&nm_i->nid_list_lock); + return true; + } + spin_unlock(&nm_i->nid_list_lock); + + /* Let's scan nat pages and its caches to get free nids */ + if (!f2fs_build_free_nids(sbi, true, false)) + goto retry; + return false; +} + +/* + * f2fs_alloc_nid() should be called prior to this function. + */ +void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i; + + spin_lock(&nm_i->nid_list_lock); + i = __lookup_free_nid_list(nm_i, nid); + f2fs_bug_on(sbi, !i); + __remove_free_nid(sbi, i, PREALLOC_NID); + spin_unlock(&nm_i->nid_list_lock); + + kmem_cache_free(free_nid_slab, i); +} + +/* + * f2fs_alloc_nid() should be called prior to this function. + */ +void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i; + bool need_free = false; + + if (!nid) + return; + + spin_lock(&nm_i->nid_list_lock); + i = __lookup_free_nid_list(nm_i, nid); + f2fs_bug_on(sbi, !i); + + if (!f2fs_available_free_memory(sbi, FREE_NIDS)) { + __remove_free_nid(sbi, i, PREALLOC_NID); + need_free = true; + } else { + __move_free_nid(sbi, i, PREALLOC_NID, FREE_NID); + } + + nm_i->available_nids++; + + update_free_nid_bitmap(sbi, nid, true, false); + + spin_unlock(&nm_i->nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); +} + +int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i, *next; + int nr = nr_shrink; + + if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) + return 0; + + if (!mutex_trylock(&nm_i->build_lock)) + return 0; + + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { + if (nr_shrink <= 0 || + nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) + break; + + __remove_free_nid(sbi, i, FREE_NID); + kmem_cache_free(free_nid_slab, i); + nr_shrink--; + } + spin_unlock(&nm_i->nid_list_lock); + mutex_unlock(&nm_i->build_lock); + + return nr - nr_shrink; +} + +int f2fs_recover_inline_xattr(struct inode *inode, struct page *page) +{ + void *src_addr, *dst_addr; + size_t inline_size; + struct page *ipage; + struct f2fs_inode *ri; + + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + ri = F2FS_INODE(page); + if (ri->i_inline & F2FS_INLINE_XATTR) { + set_inode_flag(inode, FI_INLINE_XATTR); + } else { + clear_inode_flag(inode, FI_INLINE_XATTR); + goto update_inode; + } + + dst_addr = inline_xattr_addr(inode, ipage); + src_addr = inline_xattr_addr(inode, page); + inline_size = inline_xattr_size(inode); + + f2fs_wait_on_page_writeback(ipage, NODE, true); + memcpy(dst_addr, src_addr, inline_size); +update_inode: + f2fs_update_inode(inode, ipage); + f2fs_put_page(ipage, 1); + return 0; +} + +int f2fs_recover_xattr_data(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; + nid_t new_xnid; + struct dnode_of_data dn; + struct node_info ni; + struct page *xpage; + int err; + + if (!prev_xnid) + goto recover_xnid; + + /* 1: invalidate the previous xattr nid */ + err = f2fs_get_node_info(sbi, prev_xnid, &ni); + if (err) + return err; + + f2fs_invalidate_blocks(sbi, ni.blk_addr); + dec_valid_node_count(sbi, inode, false); + set_node_addr(sbi, &ni, NULL_ADDR, false); + +recover_xnid: + /* 2: update xattr nid in inode */ + if (!f2fs_alloc_nid(sbi, &new_xnid)) + return -ENOSPC; + + set_new_dnode(&dn, inode, NULL, NULL, new_xnid); + xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xpage)) { + f2fs_alloc_nid_failed(sbi, new_xnid); + return PTR_ERR(xpage); + } + + f2fs_alloc_nid_done(sbi, new_xnid); + f2fs_update_inode_page(inode); + + /* 3: update and set xattr node page dirty */ + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); + + set_page_dirty(xpage); + f2fs_put_page(xpage, 1); + + return 0; +} + +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *src, *dst; + nid_t ino = ino_of_node(page); + struct node_info old_ni, new_ni; + struct page *ipage; + int err; + + err = f2fs_get_node_info(sbi, ino, &old_ni); + if (err) + return err; + + if (unlikely(old_ni.blk_addr != NULL_ADDR)) + return -EINVAL; +retry: + ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); + if (!ipage) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + + /* Should not use this inode from free nid list */ + remove_free_nid(sbi, ino); + + if (!PageUptodate(ipage)) + SetPageUptodate(ipage); + fill_node_footer(ipage, ino, ino, 0, true); + set_cold_node(ipage, false); + + src = F2FS_INODE(page); + dst = F2FS_INODE(ipage); + + memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src); + dst->i_size = 0; + dst->i_blocks = cpu_to_le64(1); + dst->i_links = cpu_to_le32(1); + dst->i_xattr_nid = 0; + dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); + if (dst->i_inline & F2FS_EXTRA_ATTR) { + dst->i_extra_isize = src->i_extra_isize; + + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_inline_xattr_size)) + dst->i_inline_xattr_size = src->i_inline_xattr_size; + + if (f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_projid)) + dst->i_projid = src->i_projid; + + if (f2fs_sb_has_inode_crtime(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_crtime_nsec)) { + dst->i_crtime = src->i_crtime; + dst->i_crtime_nsec = src->i_crtime_nsec; + } + } + + new_ni = old_ni; + new_ni.ino = ino; + + if (unlikely(inc_valid_node_count(sbi, NULL, true))) + WARN_ON(1); + set_node_addr(sbi, &new_ni, NEW_ADDR, false); + inc_valid_inode_count(sbi); + set_page_dirty(ipage); + f2fs_put_page(ipage, 1); + return 0; +} + +int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, + unsigned int segno, struct f2fs_summary_block *sum) +{ + struct f2fs_node *rn; + struct f2fs_summary *sum_entry; + block_t addr; + int i, idx, last_offset, nrpages; + + /* scan the node segment */ + last_offset = sbi->blocks_per_seg; + addr = START_BLOCK(sbi, segno); + sum_entry = &sum->entries[0]; + + for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { + nrpages = min(last_offset - i, BIO_MAX_PAGES); + + /* readahead node pages */ + f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true); + + for (idx = addr; idx < addr + nrpages; idx++) { + struct page *page = f2fs_get_tmp_page(sbi, idx); + + if (IS_ERR(page)) + return PTR_ERR(page); + + rn = F2FS_NODE(page); + sum_entry->nid = rn->footer.nid; + sum_entry->version = 0; + sum_entry->ofs_in_node = 0; + sum_entry++; + f2fs_put_page(page, 1); + } + + invalidate_mapping_pages(META_MAPPING(sbi), addr, + addr + nrpages); + } + return 0; +} + +static void remove_nats_in_journal(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + int i; + + down_write(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + struct nat_entry *ne; + struct f2fs_nat_entry raw_ne; + nid_t nid = le32_to_cpu(nid_in_journal(journal, i)); + + if (f2fs_check_nid_range(sbi, nid)) + continue; + + raw_ne = nat_in_journal(journal, i); + + ne = __lookup_nat_cache(nm_i, nid); + if (!ne) { + ne = __alloc_nat_entry(nid, true); + __init_nat_entry(nm_i, ne, &raw_ne, true); + } + + /* + * if a free nat in journal has not been used after last + * checkpoint, we should remove it from available nids, + * since later we will add it again. + */ + if (!get_nat_flag(ne, IS_DIRTY) && + le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) { + spin_lock(&nm_i->nid_list_lock); + nm_i->available_nids--; + spin_unlock(&nm_i->nid_list_lock); + } + + __set_nat_cache_dirty(nm_i, ne); + } + update_nats_in_cursum(journal, -i); + up_write(&curseg->journal_rwsem); +} + +static void __adjust_nat_entry_set(struct nat_entry_set *nes, + struct list_head *head, int max) +{ + struct nat_entry_set *cur; + + if (nes->entry_cnt >= max) + goto add_out; + + list_for_each_entry(cur, head, set_list) { + if (cur->entry_cnt >= nes->entry_cnt) { + list_add(&nes->set_list, cur->set_list.prev); + return; + } + } +add_out: + list_add_tail(&nes->set_list, head); +} + +static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, + struct page *page) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; + struct f2fs_nat_block *nat_blk = page_address(page); + int valid = 0; + int i = 0; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + if (nat_index == 0) { + valid = 1; + i = 1; + } + for (; i < NAT_ENTRY_PER_BLOCK; i++) { + if (nat_blk->entries[i].block_addr != NULL_ADDR) + valid++; + } + if (valid == 0) { + __set_bit_le(nat_index, nm_i->empty_nat_bits); + __clear_bit_le(nat_index, nm_i->full_nat_bits); + return; + } + + __clear_bit_le(nat_index, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_index, nm_i->full_nat_bits); + else + __clear_bit_le(nat_index, nm_i->full_nat_bits); +} + +static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, + struct nat_entry_set *set, struct cp_control *cpc) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK; + bool to_journal = true; + struct f2fs_nat_block *nat_blk; + struct nat_entry *ne, *cur; + struct page *page = NULL; + + /* + * there are two steps to flush nat entries: + * #1, flush nat entries to journal in current hot data summary block. + * #2, flush nat entries to nat page. + */ + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) + to_journal = false; + + if (to_journal) { + down_write(&curseg->journal_rwsem); + } else { + page = get_next_nat_page(sbi, start_nid); + nat_blk = page_address(page); + f2fs_bug_on(sbi, !nat_blk); + } + + /* flush dirty nats in nat entry set */ + list_for_each_entry_safe(ne, cur, &set->entry_list, list) { + struct f2fs_nat_entry *raw_ne; + nid_t nid = nat_get_nid(ne); + int offset; + + f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR); + + if (to_journal) { + offset = f2fs_lookup_journal_in_cursum(journal, + NAT_JOURNAL, nid, 1); + f2fs_bug_on(sbi, offset < 0); + raw_ne = &nat_in_journal(journal, offset); + nid_in_journal(journal, offset) = cpu_to_le32(nid); + } else { + raw_ne = &nat_blk->entries[nid - start_nid]; + } + raw_nat_from_node_info(raw_ne, &ne->ni); + nat_reset_flag(ne); + __clear_nat_cache_dirty(NM_I(sbi), set, ne); + if (nat_get_blkaddr(ne) == NULL_ADDR) { + add_free_nid(sbi, nid, false, true); + } else { + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, nid, false, false); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } + } + + if (to_journal) { + up_write(&curseg->journal_rwsem); + } else { + __update_nat_bits(sbi, start_nid, page); + f2fs_put_page(page, 1); + } + + /* Allow dirty nats by node block allocation in write_begin */ + if (!set->entry_cnt) { + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + kmem_cache_free(nat_entry_set_slab, set); + } +} + +/* + * This function is called during the checkpointing process. + */ +void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + struct nat_entry_set *setvec[SETVEC_SIZE]; + struct nat_entry_set *set, *tmp; + unsigned int found; + nid_t set_idx = 0; + LIST_HEAD(sets); + + /* during unmount, let's flush nat_bits before checking dirty_nat_cnt */ + if (enabled_nat_bits(sbi, cpc)) { + down_write(&nm_i->nat_tree_lock); + remove_nats_in_journal(sbi); + up_write(&nm_i->nat_tree_lock); + } + + if (!nm_i->dirty_nat_cnt) + return; + + down_write(&nm_i->nat_tree_lock); + + /* + * if there are no enough space in journal to store dirty nat + * entries, remove all entries from journal and merge them + * into nat entry set. + */ + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + remove_nats_in_journal(sbi); + + while ((found = __gang_lookup_nat_set(nm_i, + set_idx, SETVEC_SIZE, setvec))) { + unsigned idx; + set_idx = setvec[found - 1]->set + 1; + for (idx = 0; idx < found; idx++) + __adjust_nat_entry_set(setvec[idx], &sets, + MAX_NAT_JENTRIES(journal)); + } + + /* flush dirty nats in nat entry set */ + list_for_each_entry_safe(set, tmp, &sets, set_list) + __flush_nat_entry_set(sbi, set, cpc); + + up_write(&nm_i->nat_tree_lock); + /* Allow dirty nats by node block allocation in write_begin */ +} + +static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE; + unsigned int i; + __u64 cp_ver = cur_cp_version(ckpt); + block_t nat_bits_addr; + + if (!enabled_nat_bits(sbi, NULL)) + return 0; + + nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); + nm_i->nat_bits = f2fs_kzalloc(sbi, + nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL); + if (!nm_i->nat_bits) + return -ENOMEM; + + nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - + nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) { + struct page *page; + + page = f2fs_get_meta_page(sbi, nat_bits_addr++); + if (IS_ERR(page)) { + disable_nat_bits(sbi, true); + return PTR_ERR(page); + } + + memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), + page_address(page), F2FS_BLKSIZE); + f2fs_put_page(page, 1); + } + + cp_ver |= (cur_cp_crc(ckpt) << 32); + if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { + disable_nat_bits(sbi, true); + return 0; + } + + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + + f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint"); + return 0; +} + +static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i = 0; + nid_t nid, last_nid; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + + nid = i * NAT_ENTRY_PER_BLOCK; + last_nid = nid + NAT_ENTRY_PER_BLOCK; + + spin_lock(&NM_I(sbi)->nid_list_lock); + for (; nid < last_nid; nid++) + update_free_nid_bitmap(sbi, nid, true, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + } +} + +static int init_node_manager(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned char *version_bitmap; + unsigned int nat_segs; + int err; + + nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr); + + /* segment_count_nat includes pair segment so divide to 2. */ + nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; + nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks; + + /* not used nids: 0, node, meta, (and root counted as valid node) */ + nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - + sbi->nquota_files - F2FS_RESERVED_NODE_NUM; + nm_i->nid_cnt[FREE_NID] = 0; + nm_i->nid_cnt[PREALLOC_NID] = 0; + nm_i->nat_cnt = 0; + nm_i->ram_thresh = DEF_RAM_THRESHOLD; + nm_i->ra_nid_pages = DEF_RA_NID_PAGES; + nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; + + INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); + INIT_LIST_HEAD(&nm_i->free_nid_list); + INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); + INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); + INIT_LIST_HEAD(&nm_i->nat_entries); + spin_lock_init(&nm_i->nat_list_lock); + + mutex_init(&nm_i->build_lock); + spin_lock_init(&nm_i->nid_list_lock); + init_rwsem(&nm_i->nat_tree_lock); + + nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); + nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); + version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP); + if (!version_bitmap) + return -EFAULT; + + nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size, + GFP_KERNEL); + if (!nm_i->nat_bitmap) + return -ENOMEM; + + err = __get_nat_bitmaps(sbi); + if (err) + return err; + +#ifdef CONFIG_F2FS_CHECK_FS + nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size, + GFP_KERNEL); + if (!nm_i->nat_bitmap_mir) + return -ENOMEM; +#endif + + return 0; +} + +static int init_free_nid_cache(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + int i; + + nm_i->free_nid_bitmap = + f2fs_kzalloc(sbi, array_size(sizeof(unsigned char *), + nm_i->nat_blocks), + GFP_KERNEL); + if (!nm_i->free_nid_bitmap) + return -ENOMEM; + + for (i = 0; i < nm_i->nat_blocks; i++) { + nm_i->free_nid_bitmap[i] = f2fs_kvzalloc(sbi, + f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK), GFP_KERNEL); + if (!nm_i->free_nid_bitmap[i]) + return -ENOMEM; + } + + nm_i->nat_block_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks / 8, + GFP_KERNEL); + if (!nm_i->nat_block_bitmap) + return -ENOMEM; + + nm_i->free_nid_count = + f2fs_kvzalloc(sbi, array_size(sizeof(unsigned short), + nm_i->nat_blocks), + GFP_KERNEL); + if (!nm_i->free_nid_count) + return -ENOMEM; + return 0; +} + +int f2fs_build_node_manager(struct f2fs_sb_info *sbi) +{ + int err; + + sbi->nm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_nm_info), + GFP_KERNEL); + if (!sbi->nm_info) + return -ENOMEM; + + err = init_node_manager(sbi); + if (err) + return err; + + err = init_free_nid_cache(sbi); + if (err) + return err; + + /* load free nid status from nat_bits table */ + load_free_nid_bitmap(sbi); + + return f2fs_build_free_nids(sbi, true, true); +} + +void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i, *next_i; + struct nat_entry *natvec[NATVEC_SIZE]; + struct nat_entry_set *setvec[SETVEC_SIZE]; + nid_t nid = 0; + unsigned int found; + + if (!nm_i) + return; + + /* destroy free nid list */ + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { + __remove_free_nid(sbi, i, FREE_NID); + spin_unlock(&nm_i->nid_list_lock); + kmem_cache_free(free_nid_slab, i); + spin_lock(&nm_i->nid_list_lock); + } + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID]); + f2fs_bug_on(sbi, nm_i->nid_cnt[PREALLOC_NID]); + f2fs_bug_on(sbi, !list_empty(&nm_i->free_nid_list)); + spin_unlock(&nm_i->nid_list_lock); + + /* destroy nat cache */ + down_write(&nm_i->nat_tree_lock); + while ((found = __gang_lookup_nat_cache(nm_i, + nid, NATVEC_SIZE, natvec))) { + unsigned idx; + + nid = nat_get_nid(natvec[found - 1]) + 1; + for (idx = 0; idx < found; idx++) { + spin_lock(&nm_i->nat_list_lock); + list_del(&natvec[idx]->list); + spin_unlock(&nm_i->nat_list_lock); + + __del_from_nat_cache(nm_i, natvec[idx]); + } + } + f2fs_bug_on(sbi, nm_i->nat_cnt); + + /* destroy nat set cache */ + nid = 0; + while ((found = __gang_lookup_nat_set(nm_i, + nid, SETVEC_SIZE, setvec))) { + unsigned idx; + + nid = setvec[found - 1]->set + 1; + for (idx = 0; idx < found; idx++) { + /* entry_cnt is not zero, when cp_error was occurred */ + f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list)); + radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set); + kmem_cache_free(nat_entry_set_slab, setvec[idx]); + } + } + up_write(&nm_i->nat_tree_lock); + + kvfree(nm_i->nat_block_bitmap); + if (nm_i->free_nid_bitmap) { + int i; + + for (i = 0; i < nm_i->nat_blocks; i++) + kvfree(nm_i->free_nid_bitmap[i]); + kfree(nm_i->free_nid_bitmap); + } + kvfree(nm_i->free_nid_count); + + kfree(nm_i->nat_bitmap); + kfree(nm_i->nat_bits); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(nm_i->nat_bitmap_mir); +#endif + sbi->nm_info = NULL; + kfree(nm_i); +} + +int __init f2fs_create_node_manager_caches(void) +{ + nat_entry_slab = f2fs_kmem_cache_create("nat_entry", + sizeof(struct nat_entry)); + if (!nat_entry_slab) + goto fail; + + free_nid_slab = f2fs_kmem_cache_create("free_nid", + sizeof(struct free_nid)); + if (!free_nid_slab) + goto destroy_nat_entry; + + nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set", + sizeof(struct nat_entry_set)); + if (!nat_entry_set_slab) + goto destroy_free_nid; + + fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry", + sizeof(struct fsync_node_entry)); + if (!fsync_node_entry_slab) + goto destroy_nat_entry_set; + return 0; + +destroy_nat_entry_set: + kmem_cache_destroy(nat_entry_set_slab); +destroy_free_nid: + kmem_cache_destroy(free_nid_slab); +destroy_nat_entry: + kmem_cache_destroy(nat_entry_slab); +fail: + return -ENOMEM; +} + +void f2fs_destroy_node_manager_caches(void) +{ + kmem_cache_destroy(fsync_node_entry_slab); + kmem_cache_destroy(nat_entry_set_slab); + kmem_cache_destroy(free_nid_slab); + kmem_cache_destroy(nat_entry_slab); +} diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h new file mode 100644 index 000000000..0f4db7a61 --- /dev/null +++ b/fs/f2fs/node.h @@ -0,0 +1,458 @@ +/* + * fs/f2fs/node.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +/* start node id of a node block dedicated to the given node id */ +#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) + +/* node block offset on the NAT area dedicated to the given start node id */ +#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK) + +/* # of pages to perform synchronous readahead before building free nids */ +#define FREE_NID_PAGES 8 +#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) + +#define DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */ + +/* maximum readahead size for node during getting data blocks */ +#define MAX_RA_NODE 128 + +/* control the memory footprint threshold (10MB per 1GB ram) */ +#define DEF_RAM_THRESHOLD 1 + +/* control dirty nats ratio threshold (default: 10% over max nid count) */ +#define DEF_DIRTY_NAT_RATIO_THRESHOLD 10 +/* control total # of nats */ +#define DEF_NAT_CACHE_THRESHOLD 100000 + +/* vector size for gang look-up from nat cache that consists of radix tree */ +#define NATVEC_SIZE 64 +#define SETVEC_SIZE 32 + +/* return value for read_node_page */ +#define LOCKED_PAGE 1 + +/* For flag in struct node_info */ +enum { + IS_CHECKPOINTED, /* is it checkpointed before? */ + HAS_FSYNCED_INODE, /* is the inode fsynced before? */ + HAS_LAST_FSYNC, /* has the latest node fsync mark? */ + IS_DIRTY, /* this nat entry is dirty? */ + IS_PREALLOC, /* nat entry is preallocated */ +}; + +/* + * For node information + */ +struct node_info { + nid_t nid; /* node id */ + nid_t ino; /* inode number of the node's owner */ + block_t blk_addr; /* block address of the node */ + unsigned char version; /* version of the node */ + unsigned char flag; /* for node information bits */ +}; + +struct nat_entry { + struct list_head list; /* for clean or dirty nat list */ + struct node_info ni; /* in-memory node information */ +}; + +#define nat_get_nid(nat) ((nat)->ni.nid) +#define nat_set_nid(nat, n) ((nat)->ni.nid = (n)) +#define nat_get_blkaddr(nat) ((nat)->ni.blk_addr) +#define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b)) +#define nat_get_ino(nat) ((nat)->ni.ino) +#define nat_set_ino(nat, i) ((nat)->ni.ino = (i)) +#define nat_get_version(nat) ((nat)->ni.version) +#define nat_set_version(nat, v) ((nat)->ni.version = (v)) + +#define inc_node_version(version) (++(version)) + +static inline void copy_node_info(struct node_info *dst, + struct node_info *src) +{ + dst->nid = src->nid; + dst->ino = src->ino; + dst->blk_addr = src->blk_addr; + dst->version = src->version; + /* should not copy flag here */ +} + +static inline void set_nat_flag(struct nat_entry *ne, + unsigned int type, bool set) +{ + unsigned char mask = 0x01 << type; + if (set) + ne->ni.flag |= mask; + else + ne->ni.flag &= ~mask; +} + +static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type) +{ + unsigned char mask = 0x01 << type; + return ne->ni.flag & mask; +} + +static inline void nat_reset_flag(struct nat_entry *ne) +{ + /* these states can be set only after checkpoint was done */ + set_nat_flag(ne, IS_CHECKPOINTED, true); + set_nat_flag(ne, HAS_FSYNCED_INODE, false); + set_nat_flag(ne, HAS_LAST_FSYNC, true); +} + +static inline void node_info_from_raw_nat(struct node_info *ni, + struct f2fs_nat_entry *raw_ne) +{ + ni->ino = le32_to_cpu(raw_ne->ino); + ni->blk_addr = le32_to_cpu(raw_ne->block_addr); + ni->version = raw_ne->version; +} + +static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, + struct node_info *ni) +{ + raw_ne->ino = cpu_to_le32(ni->ino); + raw_ne->block_addr = cpu_to_le32(ni->blk_addr); + raw_ne->version = ni->version; +} + +static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid * + NM_I(sbi)->dirty_nats_ratio / 100; +} + +static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; +} + +static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi) +{ + return get_pages(sbi, F2FS_DIRTY_NODES) >= sbi->blocks_per_seg * 8; +} + +enum mem_type { + FREE_NIDS, /* indicates the free nid list */ + NAT_ENTRIES, /* indicates the cached nat entry */ + DIRTY_DENTS, /* indicates dirty dentry pages */ + INO_ENTRIES, /* indicates inode entries */ + EXTENT_CACHE, /* indicates extent cache */ + INMEM_PAGES, /* indicates inmemory pages */ + BASE_CHECK, /* check kernel status */ +}; + +struct nat_entry_set { + struct list_head set_list; /* link with other nat sets */ + struct list_head entry_list; /* link with dirty nat entries */ + nid_t set; /* set number*/ + unsigned int entry_cnt; /* the # of nat entries in set */ +}; + +struct free_nid { + struct list_head list; /* for free node id list */ + nid_t nid; /* node id */ + int state; /* in use or not: FREE_NID or PREALLOC_NID */ +}; + +static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *fnid; + + spin_lock(&nm_i->nid_list_lock); + if (nm_i->nid_cnt[FREE_NID] <= 0) { + spin_unlock(&nm_i->nid_list_lock); + return; + } + fnid = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); + *nid = fnid->nid; + spin_unlock(&nm_i->nid_list_lock); +} + +/* + * inline functions + */ +static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(nm_i->nat_bitmap, nm_i->nat_bitmap_mir, + nm_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif + memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size); +} + +static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + pgoff_t block_off; + pgoff_t block_addr; + + /* + * block_off = segment_off * 512 + off_in_segment + * OLD = (segment_off * 512) * 2 + off_in_segment + * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment + */ + block_off = NAT_BLOCK_OFFSET(start); + + block_addr = (pgoff_t)(nm_i->nat_blkaddr + + (block_off << 1) - + (block_off & (sbi->blocks_per_seg - 1))); + + if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) + block_addr += sbi->blocks_per_seg; + + return block_addr; +} + +static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, + pgoff_t block_addr) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + block_addr -= nm_i->nat_blkaddr; + block_addr ^= 1 << sbi->log_blocks_per_seg; + return block_addr + nm_i->nat_blkaddr; +} + +static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) +{ + unsigned int block_off = NAT_BLOCK_OFFSET(start_nid); + + f2fs_change_bit(block_off, nm_i->nat_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, nm_i->nat_bitmap_mir); +#endif +} + +static inline nid_t ino_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.ino); +} + +static inline nid_t nid_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.nid); +} + +static inline unsigned int ofs_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + unsigned flag = le32_to_cpu(rn->footer.flag); + return flag >> OFFSET_BIT_SHIFT; +} + +static inline __u64 cpver_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le64_to_cpu(rn->footer.cp_ver); +} + +static inline block_t next_blkaddr_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.next_blkaddr); +} + +static inline void fill_node_footer(struct page *page, nid_t nid, + nid_t ino, unsigned int ofs, bool reset) +{ + struct f2fs_node *rn = F2FS_NODE(page); + unsigned int old_flag = 0; + + if (reset) + memset(rn, 0, sizeof(*rn)); + else + old_flag = le32_to_cpu(rn->footer.flag); + + rn->footer.nid = cpu_to_le32(nid); + rn->footer.ino = cpu_to_le32(ino); + + /* should remain old flag bits such as COLD_BIT_SHIFT */ + rn->footer.flag = cpu_to_le32((ofs << OFFSET_BIT_SHIFT) | + (old_flag & OFFSET_BIT_MASK)); +} + +static inline void copy_node_footer(struct page *dst, struct page *src) +{ + struct f2fs_node *src_rn = F2FS_NODE(src); + struct f2fs_node *dst_rn = F2FS_NODE(dst); + memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer)); +} + +static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); + struct f2fs_node *rn = F2FS_NODE(page); + __u64 cp_ver = cur_cp_version(ckpt); + + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); + + rn->footer.cp_ver = cpu_to_le64(cp_ver); + rn->footer.next_blkaddr = cpu_to_le32(blkaddr); +} + +static inline bool is_recoverable_dnode(struct page *page) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); + __u64 cp_ver = cur_cp_version(ckpt); + + /* Don't care crc part, if fsck.f2fs sets it. */ + if (__is_set_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG)) + return (cp_ver << 32) == (cpver_of_node(page) << 32); + + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); + + return cp_ver == cpver_of_node(page); +} + +/* + * f2fs assigns the following node offsets described as (num). + * N = NIDS_PER_BLOCK + * + * Inode block (0) + * |- direct node (1) + * |- direct node (2) + * |- indirect node (3) + * | `- direct node (4 => 4 + N - 1) + * |- indirect node (4 + N) + * | `- direct node (5 + N => 5 + 2N - 1) + * `- double indirect node (5 + 2N) + * `- indirect node (6 + 2N) + * `- direct node + * ...... + * `- indirect node ((6 + 2N) + x(N + 1)) + * `- direct node + * ...... + * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) + * `- direct node + */ +static inline bool IS_DNODE(struct page *node_page) +{ + unsigned int ofs = ofs_of_node(node_page); + + if (f2fs_has_xattr_block(ofs)) + return true; + + if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || + ofs == 5 + 2 * NIDS_PER_BLOCK) + return false; + if (ofs >= 6 + 2 * NIDS_PER_BLOCK) { + ofs -= 6 + 2 * NIDS_PER_BLOCK; + if (!((long int)ofs % (NIDS_PER_BLOCK + 1))) + return false; + } + return true; +} + +static inline int set_nid(struct page *p, int off, nid_t nid, bool i) +{ + struct f2fs_node *rn = F2FS_NODE(p); + + f2fs_wait_on_page_writeback(p, NODE, true); + + if (i) + rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); + else + rn->in.nid[off] = cpu_to_le32(nid); + return set_page_dirty(p); +} + +static inline nid_t get_nid(struct page *p, int off, bool i) +{ + struct f2fs_node *rn = F2FS_NODE(p); + + if (i) + return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]); + return le32_to_cpu(rn->in.nid[off]); +} + +/* + * Coldness identification: + * - Mark cold files in f2fs_inode_info + * - Mark cold node blocks in their node footer + * - Mark cold data pages in page cache + */ +static inline int is_cold_data(struct page *page) +{ + return PageChecked(page); +} + +static inline void set_cold_data(struct page *page) +{ + SetPageChecked(page); +} + +static inline void clear_cold_data(struct page *page) +{ + ClearPageChecked(page); +} + +static inline int is_node(struct page *page, int type) +{ + struct f2fs_node *rn = F2FS_NODE(page); + return le32_to_cpu(rn->footer.flag) & (1 << type); +} + +#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT) +#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) +#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) + +static inline int is_inline_node(struct page *page) +{ + return PageChecked(page); +} + +static inline void set_inline_node(struct page *page) +{ + SetPageChecked(page); +} + +static inline void clear_inline_node(struct page *page) +{ + ClearPageChecked(page); +} + +static inline void set_cold_node(struct page *page, bool is_dir) +{ + struct f2fs_node *rn = F2FS_NODE(page); + unsigned int flag = le32_to_cpu(rn->footer.flag); + + if (is_dir) + flag &= ~(0x1 << COLD_BIT_SHIFT); + else + flag |= (0x1 << COLD_BIT_SHIFT); + rn->footer.flag = cpu_to_le32(flag); +} + +static inline void set_mark(struct page *page, int mark, int type) +{ + struct f2fs_node *rn = F2FS_NODE(page); + unsigned int flag = le32_to_cpu(rn->footer.flag); + if (mark) + flag |= (0x1 << type); + else + flag &= ~(0x1 << type); + rn->footer.flag = cpu_to_le32(flag); + +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_inode_chksum_set(F2FS_P_SB(page), page); +#endif +} +#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT) +#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c new file mode 100644 index 000000000..ad0486bee --- /dev/null +++ b/fs/f2fs/recovery.c @@ -0,0 +1,766 @@ +/* + * fs/f2fs/recovery.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include "f2fs.h" +#include "node.h" +#include "segment.h" + +/* + * Roll forward recovery scenarios. + * + * [Term] F: fsync_mark, D: dentry_mark + * + * 1. inode(x) | CP | inode(x) | dnode(F) + * -> Update the latest inode(x). + * + * 2. inode(x) | CP | inode(F) | dnode(F) + * -> No problem. + * + * 3. inode(x) | CP | dnode(F) | inode(x) + * -> Recover to the latest dnode(F), and drop the last inode(x) + * + * 4. inode(x) | CP | dnode(F) | inode(F) + * -> No problem. + * + * 5. CP | inode(x) | dnode(F) + * -> The inode(DF) was missing. Should drop this dnode(F). + * + * 6. CP | inode(DF) | dnode(F) + * -> No problem. + * + * 7. CP | dnode(F) | inode(DF) + * -> If f2fs_iget fails, then goto next to find inode(DF). + * + * 8. CP | dnode(F) | inode(x) + * -> If f2fs_iget fails, then goto next to find inode(DF). + * But it will fail due to no inode(DF). + */ + +static struct kmem_cache *fsync_entry_slab; + +bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi) +{ + s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); + + if (sbi->last_valid_block_count + nalloc > sbi->user_block_count) + return false; + return true; +} + +static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, + nid_t ino) +{ + struct fsync_inode_entry *entry; + + list_for_each_entry(entry, head, list) + if (entry->inode->i_ino == ino) + return entry; + + return NULL; +} + +static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, + struct list_head *head, nid_t ino, bool quota_inode) +{ + struct inode *inode; + struct fsync_inode_entry *entry; + int err; + + inode = f2fs_iget_retry(sbi->sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + err = dquot_initialize(inode); + if (err) + goto err_out; + + if (quota_inode) { + err = dquot_alloc_inode(inode); + if (err) + goto err_out; + } + + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); + entry->inode = inode; + list_add_tail(&entry->list, head); + + return entry; +err_out: + iput(inode); + return ERR_PTR(err); +} + +static void del_fsync_inode(struct fsync_inode_entry *entry, int drop) +{ + if (drop) { + /* inode should not be recovered, drop it */ + f2fs_inode_synced(entry->inode); + } + iput(entry->inode); + list_del(&entry->list); + kmem_cache_free(fsync_entry_slab, entry); +} + +static int recover_dentry(struct inode *inode, struct page *ipage, + struct list_head *dir_list) +{ + struct f2fs_inode *raw_inode = F2FS_INODE(ipage); + nid_t pino = le32_to_cpu(raw_inode->i_pino); + struct f2fs_dir_entry *de; + struct fscrypt_name fname; + struct page *page; + struct inode *dir, *einode; + struct fsync_inode_entry *entry; + int err = 0; + char *name; + + entry = get_fsync_inode(dir_list, pino); + if (!entry) { + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, + pino, false); + if (IS_ERR(entry)) { + dir = ERR_CAST(entry); + err = PTR_ERR(entry); + goto out; + } + } + + dir = entry->inode; + + memset(&fname, 0, sizeof(struct fscrypt_name)); + fname.disk_name.len = le32_to_cpu(raw_inode->i_namelen); + fname.disk_name.name = raw_inode->i_name; + + if (unlikely(fname.disk_name.len > F2FS_NAME_LEN)) { + WARN_ON(1); + err = -ENAMETOOLONG; + goto out; + } +retry: + de = __f2fs_find_entry(dir, &fname, &page); + if (de && inode->i_ino == le32_to_cpu(de->ino)) + goto out_put; + + if (de) { + einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino)); + if (IS_ERR(einode)) { + WARN_ON(1); + err = PTR_ERR(einode); + if (err == -ENOENT) + err = -EEXIST; + goto out_put; + } + + err = dquot_initialize(einode); + if (err) { + iput(einode); + goto out_put; + } + + err = f2fs_acquire_orphan_inode(F2FS_I_SB(inode)); + if (err) { + iput(einode); + goto out_put; + } + f2fs_delete_entry(de, page, dir, einode); + iput(einode); + goto retry; + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + } else { + err = f2fs_add_dentry(dir, &fname, inode, + inode->i_ino, inode->i_mode); + } + if (err == -ENOMEM) + goto retry; + goto out; + +out_put: + f2fs_put_page(page, 0); +out: + if (file_enc_name(inode)) + name = "<encrypted>"; + else + name = raw_inode->i_name; + f2fs_msg(inode->i_sb, KERN_NOTICE, + "%s: ino = %x, name = %s, dir = %lx, err = %d", + __func__, ino_of_node(ipage), name, + IS_ERR(dir) ? 0 : dir->i_ino, err); + return err; +} + +static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri) +{ + if (ri->i_inline & F2FS_PIN_FILE) + set_inode_flag(inode, FI_PIN_FILE); + else + clear_inode_flag(inode, FI_PIN_FILE); + if (ri->i_inline & F2FS_DATA_EXIST) + set_inode_flag(inode, FI_DATA_EXIST); + else + clear_inode_flag(inode, FI_DATA_EXIST); +} + +static void recover_inode(struct inode *inode, struct page *page) +{ + struct f2fs_inode *raw = F2FS_INODE(page); + char *name; + + inode->i_mode = le16_to_cpu(raw->i_mode); + i_uid_write(inode, le32_to_cpu(raw->i_uid)); + i_gid_write(inode, le32_to_cpu(raw->i_gid)); + + if (raw->i_inline & F2FS_EXTRA_ATTR) { + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(raw, le16_to_cpu(raw->i_extra_isize), + i_projid)) { + projid_t i_projid; + + i_projid = (projid_t)le32_to_cpu(raw->i_projid); + F2FS_I(inode)->i_projid = + make_kprojid(&init_user_ns, i_projid); + } + } + + f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); + inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); + inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); + inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + + F2FS_I(inode)->i_advise = raw->i_advise; + F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags); + f2fs_set_inode_flags(inode); + F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = + le16_to_cpu(raw->i_gc_failures); + + recover_inline_flags(inode, raw); + + f2fs_mark_inode_dirty_sync(inode, true); + + if (file_enc_name(inode)) + name = "<encrypted>"; + else + name = F2FS_INODE(page)->i_name; + + f2fs_msg(inode->i_sb, KERN_NOTICE, + "recover_inode: ino = %x, name = %s, inline = %x", + ino_of_node(page), name, raw->i_inline); +} + +static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, + bool check_only) +{ + struct curseg_info *curseg; + struct page *page = NULL; + block_t blkaddr; + unsigned int loop_cnt = 0; + unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg - + valid_user_blocks(sbi); + int err = 0; + + /* get node pages in the current segment */ + curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + + while (1) { + struct fsync_inode_entry *entry; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) + return 0; + + page = f2fs_get_tmp_page(sbi, blkaddr); + if (IS_ERR(page)) { + err = PTR_ERR(page); + break; + } + + if (!is_recoverable_dnode(page)) + break; + + if (!is_fsync_dnode(page)) + goto next; + + entry = get_fsync_inode(head, ino_of_node(page)); + if (!entry) { + bool quota_inode = false; + + if (!check_only && + IS_INODE(page) && is_dent_dnode(page)) { + err = f2fs_recover_inode_page(sbi, page); + if (err) + break; + quota_inode = true; + } + + /* + * CP | dnode(F) | inode(DF) + * For this case, we should not give up now. + */ + entry = add_fsync_inode(sbi, head, ino_of_node(page), + quota_inode); + if (IS_ERR(entry)) { + err = PTR_ERR(entry); + if (err == -ENOENT) { + err = 0; + goto next; + } + break; + } + } + entry->blkaddr = blkaddr; + + if (IS_INODE(page) && is_dent_dnode(page)) + entry->last_dentry = blkaddr; +next: + /* sanity check in order to detect looped node chain */ + if (++loop_cnt >= free_blocks || + blkaddr == next_blkaddr_of_node(page)) { + f2fs_msg(sbi->sb, KERN_NOTICE, + "%s: detect looped node chain, " + "blkaddr:%u, next:%u", + __func__, blkaddr, next_blkaddr_of_node(page)); + err = -EINVAL; + break; + } + + /* check next segment */ + blkaddr = next_blkaddr_of_node(page); + f2fs_put_page(page, 1); + + f2fs_ra_meta_pages_cond(sbi, blkaddr); + } + f2fs_put_page(page, 1); + return err; +} + +static void destroy_fsync_dnodes(struct list_head *head, int drop) +{ + struct fsync_inode_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, head, list) + del_fsync_inode(entry, drop); +} + +static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, + block_t blkaddr, struct dnode_of_data *dn) +{ + struct seg_entry *sentry; + unsigned int segno = GET_SEGNO(sbi, blkaddr); + unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + struct f2fs_summary_block *sum_node; + struct f2fs_summary sum; + struct page *sum_page, *node_page; + struct dnode_of_data tdn = *dn; + nid_t ino, nid; + struct inode *inode; + unsigned int offset; + block_t bidx; + int i; + + sentry = get_seg_entry(sbi, segno); + if (!f2fs_test_bit(blkoff, sentry->cur_valid_map)) + return 0; + + /* Get the previous summary */ + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + struct curseg_info *curseg = CURSEG_I(sbi, i); + if (curseg->segno == segno) { + sum = curseg->sum_blk->entries[blkoff]; + goto got_it; + } + } + + sum_page = f2fs_get_sum_page(sbi, segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + sum = sum_node->entries[blkoff]; + f2fs_put_page(sum_page, 1); +got_it: + /* Use the locked dnode page and inode */ + nid = le32_to_cpu(sum.nid); + if (dn->inode->i_ino == nid) { + tdn.nid = nid; + if (!dn->inode_page_locked) + lock_page(dn->inode_page); + tdn.node_page = dn->inode_page; + tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + goto truncate_out; + } else if (dn->nid == nid) { + tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + goto truncate_out; + } + + /* Get the node page */ + node_page = f2fs_get_node_page(sbi, nid); + if (IS_ERR(node_page)) + return PTR_ERR(node_page); + + offset = ofs_of_node(node_page); + ino = ino_of_node(node_page); + f2fs_put_page(node_page, 1); + + if (ino != dn->inode->i_ino) { + int ret; + + /* Deallocate previous index in the node page */ + inode = f2fs_iget_retry(sbi->sb, ino); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + ret = dquot_initialize(inode); + if (ret) { + iput(inode); + return ret; + } + } else { + inode = dn->inode; + } + + bidx = f2fs_start_bidx_of_node(offset, inode) + + le16_to_cpu(sum.ofs_in_node); + + /* + * if inode page is locked, unlock temporarily, but its reference + * count keeps alive. + */ + if (ino == dn->inode->i_ino && dn->inode_page_locked) + unlock_page(dn->inode_page); + + set_new_dnode(&tdn, inode, NULL, NULL, 0); + if (f2fs_get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) + goto out; + + if (tdn.data_blkaddr == blkaddr) + f2fs_truncate_data_blocks_range(&tdn, 1); + + f2fs_put_dnode(&tdn); +out: + if (ino != dn->inode->i_ino) + iput(inode); + else if (dn->inode_page_locked) + lock_page(dn->inode_page); + return 0; + +truncate_out: + if (datablock_addr(tdn.inode, tdn.node_page, + tdn.ofs_in_node) == blkaddr) + f2fs_truncate_data_blocks_range(&tdn, 1); + if (dn->inode->i_ino == nid && !dn->inode_page_locked) + unlock_page(dn->inode_page); + return 0; +} + +static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, + struct page *page) +{ + struct dnode_of_data dn; + struct node_info ni; + unsigned int start, end; + int err = 0, recovered = 0; + + /* step 1: recover xattr */ + if (IS_INODE(page)) { + err = f2fs_recover_inline_xattr(inode, page); + if (err) + goto out; + } else if (f2fs_has_xattr_block(ofs_of_node(page))) { + err = f2fs_recover_xattr_data(inode, page); + if (!err) + recovered++; + goto out; + } + + /* step 2: recover inline data */ + err = f2fs_recover_inline_data(inode, page); + if (err) { + if (err == 1) + err = 0; + goto out; + } + + /* step 3: recover data indices */ + start = f2fs_start_bidx_of_node(ofs_of_node(page), inode); + end = start + ADDRS_PER_PAGE(page, inode); + + set_new_dnode(&dn, inode, NULL, NULL, 0); +retry_dn: + err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE); + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_dn; + } + goto out; + } + + f2fs_wait_on_page_writeback(dn.node_page, NODE, true); + + err = f2fs_get_node_info(sbi, dn.nid, &ni); + if (err) + goto err; + + f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); + + if (ofs_of_node(dn.node_page) != ofs_of_node(page)) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Inconsistent ofs_of_node, ino:%lu, ofs:%u, %u", + inode->i_ino, ofs_of_node(dn.node_page), + ofs_of_node(page)); + err = -EFSCORRUPTED; + goto err; + } + + for (; start < end; start++, dn.ofs_in_node++) { + block_t src, dest; + + src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + dest = datablock_addr(dn.inode, page, dn.ofs_in_node); + + /* skip recovering if dest is the same as src */ + if (src == dest) + continue; + + /* dest is invalid, just invalidate src block */ + if (dest == NULL_ADDR) { + f2fs_truncate_data_blocks_range(&dn, 1); + continue; + } + + if (!file_keep_isize(inode) && + (i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT))) + f2fs_i_size_write(inode, + (loff_t)(start + 1) << PAGE_SHIFT); + + /* + * dest is reserved block, invalidate src block + * and then reserve one new block in dnode page. + */ + if (dest == NEW_ADDR) { + f2fs_truncate_data_blocks_range(&dn, 1); + f2fs_reserve_new_block(&dn); + continue; + } + + /* dest is valid block, try to recover from src to dest */ + if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { + + if (src == NULL_ADDR) { + err = f2fs_reserve_new_block(&dn); + while (err && + IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) + err = f2fs_reserve_new_block(&dn); + /* We should not get -ENOSPC */ + f2fs_bug_on(sbi, err); + if (err) + goto err; + } +retry_prev: + /* Check the previous node page having this index */ + err = check_index_in_prev_nodes(sbi, dest, &dn); + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_prev; + } + goto err; + } + + /* write dummy data page */ + f2fs_replace_block(sbi, &dn, src, dest, + ni.version, false, false); + recovered++; + } + } + + copy_node_footer(dn.node_page, page); + fill_node_footer(dn.node_page, dn.nid, ni.ino, + ofs_of_node(page), false); + set_page_dirty(dn.node_page); +err: + f2fs_put_dnode(&dn); +out: + f2fs_msg(sbi->sb, KERN_NOTICE, + "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d", + inode->i_ino, + file_keep_isize(inode) ? "keep" : "recover", + recovered, err); + return err; +} + +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, + struct list_head *tmp_inode_list, struct list_head *dir_list) +{ + struct curseg_info *curseg; + struct page *page = NULL; + int err = 0; + block_t blkaddr; + + /* get node pages in the current segment */ + curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + + while (1) { + struct fsync_inode_entry *entry; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) + break; + + f2fs_ra_meta_pages_cond(sbi, blkaddr); + + page = f2fs_get_tmp_page(sbi, blkaddr); + if (IS_ERR(page)) { + err = PTR_ERR(page); + break; + } + + if (!is_recoverable_dnode(page)) { + f2fs_put_page(page, 1); + break; + } + + entry = get_fsync_inode(inode_list, ino_of_node(page)); + if (!entry) + goto next; + /* + * inode(x) | CP | inode(x) | dnode(F) + * In this case, we can lose the latest inode(x). + * So, call recover_inode for the inode update. + */ + if (IS_INODE(page)) + recover_inode(entry->inode, page); + if (entry->last_dentry == blkaddr) { + err = recover_dentry(entry->inode, page, dir_list); + if (err) { + f2fs_put_page(page, 1); + break; + } + } + err = do_recover_data(sbi, entry->inode, page); + if (err) { + f2fs_put_page(page, 1); + break; + } + + if (entry->blkaddr == blkaddr) + list_move_tail(&entry->list, tmp_inode_list); +next: + /* check next segment */ + blkaddr = next_blkaddr_of_node(page); + f2fs_put_page(page, 1); + } + if (!err) + f2fs_allocate_new_segments(sbi); + return err; +} + +int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) +{ + struct list_head inode_list, tmp_inode_list; + struct list_head dir_list; + int err; + int ret = 0; + unsigned long s_flags = sbi->sb->s_flags; + bool need_writecp = false; +#ifdef CONFIG_QUOTA + int quota_enabled; +#endif + + if (s_flags & SB_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, + "recover fsync data on readonly fs"); + sbi->sb->s_flags &= ~SB_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= SB_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); +#endif + + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", + sizeof(struct fsync_inode_entry)); + if (!fsync_entry_slab) { + err = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&inode_list); + INIT_LIST_HEAD(&tmp_inode_list); + INIT_LIST_HEAD(&dir_list); + + /* prevent checkpoint */ + mutex_lock(&sbi->cp_mutex); + + /* step #1: find fsynced inode numbers */ + err = find_fsync_dnodes(sbi, &inode_list, check_only); + if (err || list_empty(&inode_list)) + goto skip; + + if (check_only) { + ret = 1; + goto skip; + } + + need_writecp = true; + + /* step #2: recover data */ + err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list); + if (!err) + f2fs_bug_on(sbi, !list_empty(&inode_list)); + else { + /* restore s_flags to let iput() trash data */ + sbi->sb->s_flags = s_flags; + } +skip: + destroy_fsync_dnodes(&inode_list, err); + destroy_fsync_dnodes(&tmp_inode_list, err); + + /* truncate meta pages to be used by the recovery */ + truncate_inode_pages_range(META_MAPPING(sbi), + (loff_t)MAIN_BLKADDR(sbi) << PAGE_SHIFT, -1); + + if (err) { + truncate_inode_pages_final(NODE_MAPPING(sbi)); + truncate_inode_pages_final(META_MAPPING(sbi)); + } else { + clear_sbi_flag(sbi, SBI_POR_DOING); + } + mutex_unlock(&sbi->cp_mutex); + + /* let's drop all the directory inodes for clean checkpoint */ + destroy_fsync_dnodes(&dir_list, err); + + if (need_writecp) { + set_sbi_flag(sbi, SBI_IS_RECOVERED); + + if (!err) { + struct cp_control cpc = { + .reason = CP_RECOVERY, + }; + err = f2fs_write_checkpoint(sbi, &cpc); + } + } + + kmem_cache_destroy(fsync_entry_slab); +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ + + return ret ? ret: err; +} diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c new file mode 100644 index 000000000..6fbf04713 --- /dev/null +++ b/fs/f2fs/segment.c @@ -0,0 +1,4397 @@ +/* + * fs/f2fs/segment.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/prefetch.h> +#include <linux/kthread.h> +#include <linux/swap.h> +#include <linux/timer.h> +#include <linux/freezer.h> +#include <linux/sched/signal.h> + +#include "f2fs.h" +#include "segment.h" +#include "node.h" +#include "gc.h" +#include "trace.h" +#include <trace/events/f2fs.h> + +#define __reverse_ffz(x) __reverse_ffs(~(x)) + +static struct kmem_cache *discard_entry_slab; +static struct kmem_cache *discard_cmd_slab; +static struct kmem_cache *sit_entry_set_slab; +static struct kmem_cache *inmem_entry_slab; + +static unsigned long __reverse_ulong(unsigned char *str) +{ + unsigned long tmp = 0; + int shift = 24, idx = 0; + +#if BITS_PER_LONG == 64 + shift = 56; +#endif + while (shift >= 0) { + tmp |= (unsigned long)str[idx++] << shift; + shift -= BITS_PER_BYTE; + } + return tmp; +} + +/* + * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since + * MSB and LSB are reversed in a byte by f2fs_set_bit. + */ +static inline unsigned long __reverse_ffs(unsigned long word) +{ + int num = 0; + +#if BITS_PER_LONG == 64 + if ((word & 0xffffffff00000000UL) == 0) + num += 32; + else + word >>= 32; +#endif + if ((word & 0xffff0000) == 0) + num += 16; + else + word >>= 16; + + if ((word & 0xff00) == 0) + num += 8; + else + word >>= 8; + + if ((word & 0xf0) == 0) + num += 4; + else + word >>= 4; + + if ((word & 0xc) == 0) + num += 2; + else + word >>= 2; + + if ((word & 0x2) == 0) + num += 1; + return num; +} + +/* + * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because + * f2fs_set_bit makes MSB and LSB reversed in a byte. + * @size must be integral times of unsigned long. + * Example: + * MSB <--> LSB + * f2fs_set_bit(0, bitmap) => 1000 0000 + * f2fs_set_bit(7, bitmap) => 0000 0001 + */ +static unsigned long __find_rev_next_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = size; + unsigned long tmp; + + if (offset >= size) + return size; + + size -= (offset & ~(BITS_PER_LONG - 1)); + offset %= BITS_PER_LONG; + + while (1) { + if (*p == 0) + goto pass; + + tmp = __reverse_ulong((unsigned char *)p); + + tmp &= ~0UL >> offset; + if (size < BITS_PER_LONG) + tmp &= (~0UL << (BITS_PER_LONG - size)); + if (tmp) + goto found; +pass: + if (size <= BITS_PER_LONG) + break; + size -= BITS_PER_LONG; + offset = 0; + p++; + } + return result; +found: + return result - size + __reverse_ffs(tmp); +} + +static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = size; + unsigned long tmp; + + if (offset >= size) + return size; + + size -= (offset & ~(BITS_PER_LONG - 1)); + offset %= BITS_PER_LONG; + + while (1) { + if (*p == ~0UL) + goto pass; + + tmp = __reverse_ulong((unsigned char *)p); + + if (offset) + tmp |= ~0UL << (BITS_PER_LONG - offset); + if (size < BITS_PER_LONG) + tmp |= ~0UL >> size; + if (tmp != ~0UL) + goto found; +pass: + if (size <= BITS_PER_LONG) + break; + size -= BITS_PER_LONG; + offset = 0; + p++; + } + return result; +found: + return result - size + __reverse_ffz(tmp); +} + +bool f2fs_need_SSR(struct f2fs_sb_info *sbi) +{ + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + + if (test_opt(sbi, LFS)) + return false; + if (sbi->gc_mode == GC_URGENT) + return true; + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + + SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); +} + +void f2fs_register_inmem_page(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct inmem_pages *new; + + f2fs_trace_pid(page); + + set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); + SetPagePrivate(page); + + new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); + + /* add atomic page indices to the list */ + new->page = page; + INIT_LIST_HEAD(&new->list); + + /* increase reference count with clean state */ + mutex_lock(&fi->inmem_lock); + get_page(page); + list_add_tail(&new->list, &fi->inmem_pages); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (list_empty(&fi->inmem_ilist)) + list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + mutex_unlock(&fi->inmem_lock); + + trace_f2fs_register_inmem_page(page, INMEM); +} + +static int __revoke_inmem_pages(struct inode *inode, + struct list_head *head, bool drop, bool recover, + bool trylock) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct inmem_pages *cur, *tmp; + int err = 0; + + list_for_each_entry_safe(cur, tmp, head, list) { + struct page *page = cur->page; + + if (drop) + trace_f2fs_commit_inmem_page(page, INMEM_DROP); + + if (trylock) { + /* + * to avoid deadlock in between page lock and + * inmem_lock. + */ + if (!trylock_page(page)) + continue; + } else { + lock_page(page); + } + + f2fs_wait_on_page_writeback(page, DATA, true); + + if (recover) { + struct dnode_of_data dn; + struct node_info ni; + + trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); +retry: + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, page->index, + LOOKUP_NODE); + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } + err = -EAGAIN; + goto next; + } + + err = f2fs_get_node_info(sbi, dn.nid, &ni); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + + if (cur->old_addr == NEW_ADDR) { + f2fs_invalidate_blocks(sbi, dn.data_blkaddr); + f2fs_update_data_blkaddr(&dn, NEW_ADDR); + } else + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + cur->old_addr, ni.version, true, true); + f2fs_put_dnode(&dn); + } +next: + /* we don't need to invalidate this in the sccessful status */ + if (drop || recover) { + ClearPageUptodate(page); + clear_cold_data(page); + } + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 1); + + list_del(&cur->list); + kmem_cache_free(inmem_entry_slab, cur); + dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + } + return err; +} + +void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) +{ + struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; + struct inode *inode; + struct f2fs_inode_info *fi; +next: + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + return; + } + fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + + if (inode) { + if (gc_failure) { + if (fi->i_gc_failures[GC_FAILURE_ATOMIC]) + goto drop; + goto skip; + } +drop: + set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); + f2fs_drop_inmem_pages(inode); + iput(inode); + } +skip: + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto next; +} + +void f2fs_drop_inmem_pages(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + + while (!list_empty(&fi->inmem_pages)) { + mutex_lock(&fi->inmem_lock); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, true); + + if (list_empty(&fi->inmem_pages)) { + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } + mutex_unlock(&fi->inmem_lock); + } + + clear_inode_flag(inode, FI_ATOMIC_FILE); + fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; + stat_dec_atomic_write(inode); +} + +void f2fs_drop_inmem_page(struct inode *inode, struct page *page) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct list_head *head = &fi->inmem_pages; + struct inmem_pages *cur = NULL; + struct inmem_pages *tmp; + + f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page)); + + mutex_lock(&fi->inmem_lock); + list_for_each_entry(tmp, head, list) { + if (tmp->page == page) { + cur = tmp; + break; + } + } + + f2fs_bug_on(sbi, !cur); + list_del(&cur->list); + mutex_unlock(&fi->inmem_lock); + + dec_page_count(sbi, F2FS_INMEM_PAGES); + kmem_cache_free(inmem_entry_slab, cur); + + ClearPageUptodate(page); + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 0); + + trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); +} + +static int __f2fs_commit_inmem_pages(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct inmem_pages *cur, *tmp; + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = inode->i_ino, + .type = DATA, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_PRIO, + .io_type = FS_DATA_IO, + }; + struct list_head revoke_list; + pgoff_t last_idx = ULONG_MAX; + int err = 0; + + INIT_LIST_HEAD(&revoke_list); + + list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { + struct page *page = cur->page; + + lock_page(page); + if (page->mapping == inode->i_mapping) { + trace_f2fs_commit_inmem_page(page, INMEM); + + set_page_dirty(page); + f2fs_wait_on_page_writeback(page, DATA, true); + if (clear_page_dirty_for_io(page)) { + inode_dec_dirty_pages(inode); + f2fs_remove_dirty_inode(inode); + } +retry: + fio.page = page; + fio.old_blkaddr = NULL_ADDR; + fio.encrypted_page = NULL; + fio.need_lock = LOCK_DONE; + err = f2fs_do_write_data_page(&fio); + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } + unlock_page(page); + break; + } + /* record old blkaddr for revoking */ + cur->old_addr = fio.old_blkaddr; + last_idx = page->index; + } + unlock_page(page); + list_move_tail(&cur->list, &revoke_list); + } + + if (last_idx != ULONG_MAX) + f2fs_submit_merged_write_cond(sbi, inode, 0, last_idx, DATA); + + if (err) { + /* + * try to revoke all committed pages, but still we could fail + * due to no memory or other reason, if that happened, EAGAIN + * will be returned, which means in such case, transaction is + * already not integrity, caller should use journal to do the + * recovery or rewrite & commit last transaction. For other + * error number, revoking was done by filesystem itself. + */ + err = __revoke_inmem_pages(inode, &revoke_list, + false, true, false); + + /* drop all uncommitted pages */ + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, false); + } else { + __revoke_inmem_pages(inode, &revoke_list, + false, false, false); + } + + return err; +} + +int f2fs_commit_inmem_pages(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + int err; + + f2fs_balance_fs(sbi, true); + + down_write(&fi->i_gc_rwsem[WRITE]); + + f2fs_lock_op(sbi); + set_inode_flag(inode, FI_ATOMIC_COMMIT); + + mutex_lock(&fi->inmem_lock); + err = __f2fs_commit_inmem_pages(inode); + + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + mutex_unlock(&fi->inmem_lock); + + clear_inode_flag(inode, FI_ATOMIC_COMMIT); + + f2fs_unlock_op(sbi); + up_write(&fi->i_gc_rwsem[WRITE]); + + return err; +} + +/* + * This function balances dirty node and dentry pages. + * In addition, it controls garbage collection. + */ +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) +{ + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); + f2fs_stop_checkpoint(sbi, false); + } + + /* balance_fs_bg is able to be pending */ + if (need && excess_cached_nats(sbi)) + f2fs_balance_fs_bg(sbi); + + /* + * We should do GC or end up with checkpoint, if there are so many dirty + * dir/node pages without enough free segments. + */ + if (has_not_enough_free_secs(sbi, 0, 0)) { + mutex_lock(&sbi->gc_mutex); + f2fs_gc(sbi, false, false, NULL_SEGNO); + } +} + +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) +{ + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return; + + /* try to shrink extent cache when there is no enough memory */ + if (!f2fs_available_free_memory(sbi, EXTENT_CACHE)) + f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + + /* check the # of cached NAT entries */ + if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) + f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); + + if (!f2fs_available_free_memory(sbi, FREE_NIDS)) + f2fs_try_to_free_nids(sbi, MAX_FREE_NIDS); + else + f2fs_build_free_nids(sbi, false, false); + + if (!is_idle(sbi) && + (!excess_dirty_nats(sbi) && !excess_dirty_nodes(sbi))) + return; + + /* checkpoint is the only way to shrink partial cached entries */ + if (!f2fs_available_free_memory(sbi, NAT_ENTRIES) || + !f2fs_available_free_memory(sbi, INO_ENTRIES) || + excess_prefree_segs(sbi) || + excess_dirty_nats(sbi) || + excess_dirty_nodes(sbi) || + f2fs_time_over(sbi, CP_TIME)) { + if (test_opt(sbi, DATA_FLUSH)) { + struct blk_plug plug; + + blk_start_plug(&plug); + f2fs_sync_dirty_inodes(sbi, FILE_INODE); + blk_finish_plug(&plug); + } + f2fs_sync_fs(sbi->sb, true); + stat_inc_bg_cp_count(sbi->stat_info); + } +} + +static int __submit_flush_wait(struct f2fs_sb_info *sbi, + struct block_device *bdev) +{ + struct bio *bio = f2fs_bio_alloc(sbi, 0, true); + int ret; + + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; + bio_set_dev(bio, bdev); + ret = submit_bio_wait(bio); + bio_put(bio); + + trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE), ret); + return ret; +} + +static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino) +{ + int ret = 0; + int i; + + if (!f2fs_is_multi_device(sbi)) + return __submit_flush_wait(sbi, sbi->sb->s_bdev); + + for (i = 0; i < sbi->s_ndevs; i++) { + if (!f2fs_is_dirty_device(sbi, ino, i, FLUSH_INO)) + continue; + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; + } + return ret; +} + +static int issue_flush_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; + wait_queue_head_t *q = &fcc->flush_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + sb_start_intwrite(sbi->sb); + + if (!llist_empty(&fcc->issue_list)) { + struct flush_cmd *cmd, *next; + int ret; + + fcc->dispatch_list = llist_del_all(&fcc->issue_list); + fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); + + cmd = llist_entry(fcc->dispatch_list, struct flush_cmd, llnode); + + ret = submit_flush_wait(sbi, cmd->ino); + atomic_inc(&fcc->issued_flush); + + llist_for_each_entry_safe(cmd, next, + fcc->dispatch_list, llnode) { + cmd->ret = ret; + complete(&cmd->wait); + } + fcc->dispatch_list = NULL; + } + + sb_end_intwrite(sbi->sb); + + wait_event_interruptible(*q, + kthread_should_stop() || !llist_empty(&fcc->issue_list)); + goto repeat; +} + +int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; + struct flush_cmd cmd; + int ret; + + if (test_opt(sbi, NOBARRIER)) + return 0; + + if (!test_opt(sbi, FLUSH_MERGE)) { + atomic_inc(&fcc->issing_flush); + ret = submit_flush_wait(sbi, ino); + atomic_dec(&fcc->issing_flush); + atomic_inc(&fcc->issued_flush); + return ret; + } + + if (atomic_inc_return(&fcc->issing_flush) == 1 || + f2fs_is_multi_device(sbi)) { + ret = submit_flush_wait(sbi, ino); + atomic_dec(&fcc->issing_flush); + + atomic_inc(&fcc->issued_flush); + return ret; + } + + cmd.ino = ino; + init_completion(&cmd.wait); + + llist_add(&cmd.llnode, &fcc->issue_list); + + /* update issue_list before we wake up issue_flush thread */ + smp_mb(); + + if (waitqueue_active(&fcc->flush_wait_queue)) + wake_up(&fcc->flush_wait_queue); + + if (fcc->f2fs_issue_flush) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->issing_flush); + } else { + struct llist_node *list; + + list = llist_del_all(&fcc->issue_list); + if (!list) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->issing_flush); + } else { + struct flush_cmd *tmp, *next; + + ret = submit_flush_wait(sbi, ino); + + llist_for_each_entry_safe(tmp, next, list, llnode) { + if (tmp == &cmd) { + cmd.ret = ret; + atomic_dec(&fcc->issing_flush); + continue; + } + tmp->ret = ret; + complete(&tmp->wait); + } + } + } + + return cmd.ret; +} + +int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct flush_cmd_control *fcc; + int err = 0; + + if (SM_I(sbi)->fcc_info) { + fcc = SM_I(sbi)->fcc_info; + if (fcc->f2fs_issue_flush) + return err; + goto init_thread; + } + + fcc = f2fs_kzalloc(sbi, sizeof(struct flush_cmd_control), GFP_KERNEL); + if (!fcc) + return -ENOMEM; + atomic_set(&fcc->issued_flush, 0); + atomic_set(&fcc->issing_flush, 0); + init_waitqueue_head(&fcc->flush_wait_queue); + init_llist_head(&fcc->issue_list); + SM_I(sbi)->fcc_info = fcc; + if (!test_opt(sbi, FLUSH_MERGE)) + return err; + +init_thread: + fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, + "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(fcc->f2fs_issue_flush)) { + err = PTR_ERR(fcc->f2fs_issue_flush); + kfree(fcc); + SM_I(sbi)->fcc_info = NULL; + return err; + } + + return err; +} + +void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) +{ + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; + + if (fcc && fcc->f2fs_issue_flush) { + struct task_struct *flush_thread = fcc->f2fs_issue_flush; + + fcc->f2fs_issue_flush = NULL; + kthread_stop(flush_thread); + } + if (free) { + kfree(fcc); + SM_I(sbi)->fcc_info = NULL; + } +} + +int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) +{ + int ret = 0, i; + + if (!f2fs_is_multi_device(sbi)) + return 0; + + for (i = 1; i < sbi->s_ndevs; i++) { + if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) + continue; + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; + + spin_lock(&sbi->dev_lock); + f2fs_clear_bit(i, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } + + return ret; +} + +static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, + enum dirty_type dirty_type) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + + /* need not be added */ + if (IS_CURSEG(sbi, segno)) + return; + + if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) + dirty_i->nr_dirty[dirty_type]++; + + if (dirty_type == DIRTY) { + struct seg_entry *sentry = get_seg_entry(sbi, segno); + enum dirty_type t = sentry->type; + + if (unlikely(t >= DIRTY)) { + f2fs_bug_on(sbi, 1); + return; + } + if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]++; + } +} + +static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, + enum dirty_type dirty_type) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + + if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type])) + dirty_i->nr_dirty[dirty_type]--; + + if (dirty_type == DIRTY) { + struct seg_entry *sentry = get_seg_entry(sbi, segno); + enum dirty_type t = sentry->type; + + if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]--; + + if (get_valid_blocks(sbi, segno, true) == 0) + clear_bit(GET_SEC_FROM_SEG(sbi, segno), + dirty_i->victim_secmap); + } +} + +/* + * Should not occur error such as -ENOMEM. + * Adding dirty entry into seglist is not critical operation. + * If a given segment is one of current working segments, it won't be added. + */ +static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned short valid_blocks; + + if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno)) + return; + + mutex_lock(&dirty_i->seglist_lock); + + valid_blocks = get_valid_blocks(sbi, segno, false); + + if (valid_blocks == 0) { + __locate_dirty_segment(sbi, segno, PRE); + __remove_dirty_segment(sbi, segno, DIRTY); + } else if (valid_blocks < sbi->blocks_per_seg) { + __locate_dirty_segment(sbi, segno, DIRTY); + } else { + /* Recovery routine with SSR needs this */ + __remove_dirty_segment(sbi, segno, DIRTY); + } + + mutex_unlock(&dirty_i->seglist_lock); +} + +static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc; + + f2fs_bug_on(sbi, !len); + + pend_list = &dcc->pend_list[plist_idx(len)]; + + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); + INIT_LIST_HEAD(&dc->list); + dc->bdev = bdev; + dc->lstart = lstart; + dc->start = start; + dc->len = len; + dc->ref = 0; + dc->state = D_PREP; + dc->issuing = 0; + dc->error = 0; + init_completion(&dc->wait); + list_add_tail(&dc->list, pend_list); + spin_lock_init(&dc->lock); + dc->bio_ref = 0; + atomic_inc(&dcc->discard_cmd_cnt); + dcc->undiscard_blks += len; + + return dc; +} + +static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node *parent, struct rb_node **p) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; + + dc = __create_discard_cmd(sbi, bdev, lstart, start, len); + + rb_link_node(&dc->rb_node, parent, p); + rb_insert_color(&dc->rb_node, &dcc->root); + + return dc; +} + +static void __detach_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + if (dc->state == D_DONE) + atomic_sub(dc->issuing, &dcc->issing_discard); + + list_del(&dc->list); + rb_erase(&dc->rb_node, &dcc->root); + dcc->undiscard_blks -= dc->len; + + kmem_cache_free(discard_cmd_slab, dc); + + atomic_dec(&dcc->discard_cmd_cnt); +} + +static void __remove_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + unsigned long flags; + + trace_f2fs_remove_discard(dc->bdev, dc->start, dc->len); + + spin_lock_irqsave(&dc->lock, flags); + if (dc->bio_ref) { + spin_unlock_irqrestore(&dc->lock, flags); + return; + } + spin_unlock_irqrestore(&dc->lock, flags); + + f2fs_bug_on(sbi, dc->ref); + + if (dc->error == -EOPNOTSUPP) + dc->error = 0; + + if (dc->error) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard(%u, %u, %u) failed, ret: %d", + dc->lstart, dc->start, dc->len, dc->error); + __detach_discard_cmd(dcc, dc); +} + +static void f2fs_submit_discard_endio(struct bio *bio) +{ + struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; + unsigned long flags; + + dc->error = blk_status_to_errno(bio->bi_status); + + spin_lock_irqsave(&dc->lock, flags); + dc->bio_ref--; + if (!dc->bio_ref && dc->state == D_SUBMIT) { + dc->state = D_DONE; + complete_all(&dc->wait); + } + spin_unlock_irqrestore(&dc->lock, flags); + bio_put(bio); +} + +static void __check_sit_bitmap(struct f2fs_sb_info *sbi, + block_t start, block_t end) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct seg_entry *sentry; + unsigned int segno; + block_t blk = start; + unsigned long offset, size, max_blocks = sbi->blocks_per_seg; + unsigned long *map; + + while (blk < end) { + segno = GET_SEGNO(sbi, blk); + sentry = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blk); + + if (end < START_BLOCK(sbi, segno + 1)) + size = GET_BLKOFF_FROM_SEG0(sbi, end); + else + size = max_blocks; + map = (unsigned long *)(sentry->cur_valid_map); + offset = __find_rev_next_bit(map, size, offset); + f2fs_bug_on(sbi, offset != size); + blk = START_BLOCK(sbi, segno + 1); + } +#endif +} + +static void __init_discard_policy(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + int discard_type, unsigned int granularity) +{ + /* common policy */ + dpolicy->type = discard_type; + dpolicy->sync = true; + dpolicy->ordered = false; + dpolicy->granularity = granularity; + + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + + if (discard_type == DPOLICY_BG) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->io_aware = true; + dpolicy->sync = false; + dpolicy->ordered = true; + if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { + dpolicy->granularity = 1; + dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME; + } + } else if (discard_type == DPOLICY_FORCE) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_FSTRIM) { + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_UMOUNT) { + dpolicy->max_requests = UINT_MAX; + dpolicy->io_aware = false; + } +} + +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len); +/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ +static int __submit_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + struct discard_cmd *dc, + unsigned int *issued) +{ + struct block_device *bdev = dc->bdev; + struct request_queue *q = bdev_get_queue(bdev); + unsigned int max_discard_blocks = + SECTOR_TO_BLOCK(q->limits.max_discard_sectors); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? + &(dcc->fstrim_list) : &(dcc->wait_list); + int flag = dpolicy->sync ? REQ_SYNC : 0; + block_t lstart, start, len, total_len; + int err = 0; + + if (dc->state != D_PREP) + return 0; + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + return 0; + + trace_f2fs_issue_discard(bdev, dc->start, dc->len); + + lstart = dc->lstart; + start = dc->start; + len = dc->len; + total_len = len; + + dc->len = 0; + + while (total_len && *issued < dpolicy->max_requests && !err) { + struct bio *bio = NULL; + unsigned long flags; + bool last = true; + + if (len > max_discard_blocks) { + len = max_discard_blocks; + last = false; + } + + (*issued)++; + if (*issued == dpolicy->max_requests) + last = true; + + dc->len += len; + + if (time_to_inject(sbi, FAULT_DISCARD)) { + f2fs_show_injection_info(FAULT_DISCARD); + err = -EIO; + goto submit; + } + err = __blkdev_issue_discard(bdev, + SECTOR_FROM_BLOCK(start), + SECTOR_FROM_BLOCK(len), + GFP_NOFS, 0, &bio); +submit: + if (err) { + spin_lock_irqsave(&dc->lock, flags); + if (dc->state == D_PARTIAL) + dc->state = D_SUBMIT; + spin_unlock_irqrestore(&dc->lock, flags); + + break; + } + + f2fs_bug_on(sbi, !bio); + + /* + * should keep before submission to avoid D_DONE + * right away + */ + spin_lock_irqsave(&dc->lock, flags); + if (last) + dc->state = D_SUBMIT; + else + dc->state = D_PARTIAL; + dc->bio_ref++; + spin_unlock_irqrestore(&dc->lock, flags); + + atomic_inc(&dcc->issing_discard); + dc->issuing++; + list_move_tail(&dc->list, wait_list); + + /* sanity check on discard range */ + __check_sit_bitmap(sbi, lstart, lstart + len); + + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + bio->bi_opf |= flag; + submit_bio(bio); + + atomic_inc(&dcc->issued_discard); + + f2fs_update_iostat(sbi, FS_DISCARD, 1); + + lstart += len; + start += len; + total_len -= len; + len = total_len; + } + + if (!err && len) + __update_discard_tree_range(sbi, bdev, lstart, start, len); + return err; +} + +static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node **insert_p, + struct rb_node *insert_parent) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node **p; + struct rb_node *parent = NULL; + struct discard_cmd *dc = NULL; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; + } + + p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); +do_insert: + dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p); + if (!dc) + return NULL; + + return dc; +} + +static void __relocate_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]); +} + +static void __punch_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_info di = dc->di; + bool modified = false; + + if (dc->state == D_DONE || dc->len == 1) { + __remove_discard_cmd(sbi, dc); + return; + } + + dcc->undiscard_blks -= di.len; + + if (blkaddr > di.lstart) { + dc->len = blkaddr - dc->lstart; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + modified = true; + } + + if (blkaddr < di.lstart + di.len - 1) { + if (modified) { + __insert_discard_tree(sbi, dc->bdev, blkaddr + 1, + di.start + blkaddr + 1 - di.lstart, + di.lstart + di.len - 1 - blkaddr, + NULL, NULL); + } else { + dc->lstart++; + dc->len--; + dc->start++; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + } + } +} + +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct discard_cmd *dc; + struct discard_info di = {0}; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct request_queue *q = bdev_get_queue(bdev); + unsigned int max_discard_blocks = + SECTOR_TO_BLOCK(q->limits.max_discard_sectors); + block_t end = lstart + len; + + dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, + NULL, lstart, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (dc) + prev_dc = dc; + + if (!prev_dc) { + di.lstart = lstart; + di.len = next_dc ? next_dc->lstart - lstart : len; + di.len = min(di.len, len); + di.start = start; + } + + while (1) { + struct rb_node *node; + bool merged = false; + struct discard_cmd *tdc = NULL; + + if (prev_dc) { + di.lstart = prev_dc->lstart + prev_dc->len; + if (di.lstart < lstart) + di.lstart = lstart; + if (di.lstart >= end) + break; + + if (!next_dc || next_dc->lstart > end) + di.len = end - di.lstart; + else + di.len = next_dc->lstart - di.lstart; + di.start = start + di.lstart - lstart; + } + + if (!di.len) + goto next; + + if (prev_dc && prev_dc->state == D_PREP && + prev_dc->bdev == bdev && + __is_discard_back_mergeable(&di, &prev_dc->di, + max_discard_blocks)) { + prev_dc->di.len += di.len; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, prev_dc); + di = prev_dc->di; + tdc = prev_dc; + merged = true; + } + + if (next_dc && next_dc->state == D_PREP && + next_dc->bdev == bdev && + __is_discard_front_mergeable(&di, &next_dc->di, + max_discard_blocks)) { + next_dc->di.lstart = di.lstart; + next_dc->di.len += di.len; + next_dc->di.start = di.start; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, next_dc); + if (tdc) + __remove_discard_cmd(sbi, tdc); + merged = true; + } + + if (!merged) { + __insert_discard_tree(sbi, bdev, di.lstart, di.start, + di.len, NULL, NULL); + } + next: + prev_dc = next_dc; + if (!prev_dc) + break; + + node = rb_next(&prev_dc->rb_node); + next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); + } +} + +static int __queue_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + block_t lblkstart = blkstart; + + trace_f2fs_queue_discard(bdev, blkstart, blklen); + + if (f2fs_is_multi_device(sbi)) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } + mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); + __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); + mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); + return 0; +} + +static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct discard_cmd *dc; + struct blk_plug plug; + unsigned int pos = dcc->next_pos; + unsigned int issued = 0; + bool io_interrupted = false; + + mutex_lock(&dcc->cmd_lock); + dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, + NULL, pos, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (!dc) + dc = next_dc; + + blk_start_plug(&plug); + + while (dc) { + struct rb_node *node; + int err = 0; + + if (dc->state != D_PREP) + goto next; + + if (dpolicy->io_aware && !is_idle(sbi)) { + io_interrupted = true; + break; + } + + dcc->next_pos = dc->lstart + dc->len; + err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); + + if (issued >= dpolicy->max_requests) + break; +next: + node = rb_next(&dc->rb_node); + if (err) + __remove_discard_cmd(sbi, dc); + dc = rb_entry_safe(node, struct discard_cmd, rb_node); + } + + blk_finish_plug(&plug); + + if (!dc) + dcc->next_pos = 0; + + mutex_unlock(&dcc->cmd_lock); + + if (!issued && io_interrupted) + issued = -1; + + return issued; +} + +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + int i, issued = 0; + bool io_interrupted = false; + + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (i + 1 < dpolicy->granularity) + break; + + if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + return __issue_discard_cmd_orderly(sbi, dpolicy); + + pend_list = &dcc->pend_list[i]; + + mutex_lock(&dcc->cmd_lock); + if (list_empty(pend_list)) + goto next; + if (unlikely(dcc->rbtree_check)) + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, + &dcc->root)); + blk_start_plug(&plug); + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + + if (dpolicy->io_aware && i < dpolicy->io_aware_gran && + !is_idle(sbi)) { + io_interrupted = true; + break; + } + + __submit_discard_cmd(sbi, dpolicy, dc, &issued); + + if (issued >= dpolicy->max_requests) + break; + } + blk_finish_plug(&plug); +next: + mutex_unlock(&dcc->cmd_lock); + + if (issued >= dpolicy->max_requests || io_interrupted) + break; + } + + if (!issued && io_interrupted) + issued = -1; + + return issued; +} + +static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + int i; + bool dropped = false; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + __remove_discard_cmd(sbi, dc); + dropped = true; + } + } + mutex_unlock(&dcc->cmd_lock); + + return dropped; +} + +void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + __drop_discard_cmd(sbi); +} + +static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + unsigned int len = 0; + + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) { + if (!dc->error) + len = dc->len; + __remove_discard_cmd(sbi, dc); + } + mutex_unlock(&dcc->cmd_lock); + + return len; +} + +static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + block_t start, block_t end) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? + &(dcc->fstrim_list) : &(dcc->wait_list); + struct discard_cmd *dc, *tmp; + bool need_wait; + unsigned int trimmed = 0; + +next: + need_wait = false; + + mutex_lock(&dcc->cmd_lock); + list_for_each_entry_safe(dc, tmp, wait_list, list) { + if (dc->lstart + dc->len <= start || end <= dc->lstart) + continue; + if (dc->len < dpolicy->granularity) + continue; + if (dc->state == D_DONE && !dc->ref) { + wait_for_completion_io(&dc->wait); + if (!dc->error) + trimmed += dc->len; + __remove_discard_cmd(sbi, dc); + } else { + dc->ref++; + need_wait = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + + if (need_wait) { + trimmed += __wait_one_discard_bio(sbi, dc); + goto next; + } + + return trimmed; +} + +static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) +{ + struct discard_policy dp; + unsigned int discard_blks; + + if (dpolicy) + return __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); + + /* wait all */ + __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, 1); + discard_blks = __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, 1); + discard_blks += __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + + return discard_blks; +} + +/* This should be covered by global mutex, &sit_i->sentry_lock */ +static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; + bool need_wait = false; + + mutex_lock(&dcc->cmd_lock); + dc = (struct discard_cmd *)f2fs_lookup_rb_tree(&dcc->root, + NULL, blkaddr); + if (dc) { + if (dc->state == D_PREP) { + __punch_discard_cmd(sbi, dc, blkaddr); + } else { + dc->ref++; + need_wait = true; + } + } + mutex_unlock(&dcc->cmd_lock); + + if (need_wait) + __wait_one_discard_bio(sbi, dc); +} + +void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (dcc && dcc->f2fs_issue_discard) { + struct task_struct *discard_thread = dcc->f2fs_issue_discard; + + dcc->f2fs_issue_discard = NULL; + kthread_stop(discard_thread); + } +} + +/* This comes from f2fs_put_super */ +bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_policy dpolicy; + bool dropped; + + __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, + dcc->discard_granularity); + __issue_discard_cmd(sbi, &dpolicy); + dropped = __drop_discard_cmd(sbi); + + /* just to make sure there is no pending discard commands */ + __wait_all_discard_cmd(sbi, NULL); + + f2fs_bug_on(sbi, atomic_read(&dcc->discard_cmd_cnt)); + return dropped; +} + +static int issue_discard_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + wait_queue_head_t *q = &dcc->discard_wait_queue; + struct discard_policy dpolicy; + unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + int issued; + + set_freezable(); + + do { + __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, + dcc->discard_granularity); + + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); + + if (dcc->discard_wake) + dcc->discard_wake = 0; + + if (try_to_freeze()) + continue; + if (f2fs_readonly(sbi->sb)) + continue; + if (kthread_should_stop()) + return 0; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + wait_ms = dpolicy.max_interval; + continue; + } + + if (sbi->gc_mode == GC_URGENT) + __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); + + sb_start_intwrite(sbi->sb); + + issued = __issue_discard_cmd(sbi, &dpolicy); + if (issued > 0) { + __wait_all_discard_cmd(sbi, &dpolicy); + wait_ms = dpolicy.min_interval; + } else if (issued == -1){ + wait_ms = dpolicy.mid_interval; + } else { + wait_ms = dpolicy.max_interval; + } + + sb_end_intwrite(sbi->sb); + + } while (!kthread_should_stop()); + return 0; +} + +#ifdef CONFIG_BLK_DEV_ZONED +static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + sector_t sector, nr_sects; + block_t lblkstart = blkstart; + int devi = 0; + + if (f2fs_is_multi_device(sbi)) { + devi = f2fs_target_device_index(sbi, blkstart); + blkstart -= FDEV(devi).start_blk; + } + + /* + * We need to know the type of the zone: for conventional zones, + * use regular discard if the drive supports it. For sequential + * zones, reset the zone write pointer. + */ + switch (get_blkz_type(sbi, bdev, blkstart)) { + + case BLK_ZONE_TYPE_CONVENTIONAL: + if (!blk_queue_discard(bdev_get_queue(bdev))) + return 0; + return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + sector = SECTOR_FROM_BLOCK(blkstart); + nr_sects = SECTOR_FROM_BLOCK(blklen); + + if (sector & (bdev_zone_sectors(bdev) - 1) || + nr_sects != bdev_zone_sectors(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ? FDEV(devi).path: "", + blkstart, blklen); + return -EIO; + } + trace_f2fs_issue_reset_zone(bdev, blkstart); + return blkdev_reset_zones(bdev, sector, + nr_sects, GFP_NOFS); + default: + /* Unknown zone type: broken device ? */ + return -EIO; + } +} +#endif + +static int __issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi->sb) && + bdev_zoned_model(bdev) != BLK_ZONED_NONE) + return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); +#endif + return __queue_discard_cmd(sbi, bdev, blkstart, blklen); +} + +static int f2fs_issue_discard(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t start = blkstart, len = 0; + struct block_device *bdev; + struct seg_entry *se; + unsigned int offset; + block_t i; + int err = 0; + + bdev = f2fs_target_device(sbi, blkstart, NULL); + + for (i = blkstart; i < blkstart + blklen; i++, len++) { + if (i != start) { + struct block_device *bdev2 = + f2fs_target_device(sbi, i, NULL); + + if (bdev2 != bdev) { + err = __issue_discard_async(sbi, bdev, + start, len); + if (err) + return err; + bdev = bdev2; + start = i; + len = 0; + } + } + + se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); + offset = GET_BLKOFF_FROM_SEG0(sbi, i); + + if (!f2fs_test_and_set_bit(offset, se->discard_map)) + sbi->discard_blks--; + } + + if (len) + err = __issue_discard_async(sbi, bdev, start, len); + return err; +} + +static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, + bool check_only) +{ + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + int max_blocks = sbi->blocks_per_seg; + struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start); + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *discard_map = (unsigned long *)se->discard_map; + unsigned long *dmap = SIT_I(sbi)->tmp_map; + unsigned int start = 0, end = -1; + bool force = (cpc->reason & CP_DISCARD); + struct discard_entry *de = NULL; + struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; + int i; + + if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi)) + return false; + + if (!force) { + if (!f2fs_realtime_discard_enable(sbi) || !se->valid_blocks || + SM_I(sbi)->dcc_info->nr_discards >= + SM_I(sbi)->dcc_info->max_discards) + return false; + } + + /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ + for (i = 0; i < entries; i++) + dmap[i] = force ? ~ckpt_map[i] & ~discard_map[i] : + (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; + + while (force || SM_I(sbi)->dcc_info->nr_discards <= + SM_I(sbi)->dcc_info->max_discards) { + start = __find_rev_next_bit(dmap, max_blocks, end + 1); + if (start >= max_blocks) + break; + + end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + if (force && start && end != max_blocks + && (end - start) < cpc->trim_minlen) + continue; + + if (check_only) + return true; + + if (!de) { + de = f2fs_kmem_cache_alloc(discard_entry_slab, + GFP_F2FS_ZERO); + de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); + list_add_tail(&de->list, head); + } + + for (i = start; i < end; i++) + __set_bit_le(i, (void *)de->discard_map); + + SM_I(sbi)->dcc_info->nr_discards += end - start; + } + return false; +} + +static void release_discard_addr(struct discard_entry *entry) +{ + list_del(&entry->list); + kmem_cache_free(discard_entry_slab, entry); +} + +void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); + struct discard_entry *entry, *this; + + /* drop caches */ + list_for_each_entry_safe(entry, this, head, list) + release_discard_addr(entry); +} + +/* + * Should call f2fs_clear_prefree_segments after checkpoint is done. + */ +static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int segno; + + mutex_lock(&dirty_i->seglist_lock); + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi)) + __set_test_and_free(sbi, segno); + mutex_unlock(&dirty_i->seglist_lock); +} + +void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *head = &dcc->entry_list; + struct discard_entry *entry, *this; + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; + unsigned int start = 0, end = -1; + unsigned int secno, start_segno; + bool force = (cpc->reason & CP_DISCARD); + bool need_align = test_opt(sbi, LFS) && sbi->segs_per_sec > 1; + + mutex_lock(&dirty_i->seglist_lock); + + while (1) { + int i; + + if (need_align && end != -1) + end--; + start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1); + if (start >= MAIN_SEGS(sbi)) + break; + end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi), + start + 1); + + if (need_align) { + start = rounddown(start, sbi->segs_per_sec); + end = roundup(end, sbi->segs_per_sec); + } + + for (i = start; i < end; i++) { + if (test_and_clear_bit(i, prefree_map)) + dirty_i->nr_dirty[PRE]--; + } + + if (!f2fs_realtime_discard_enable(sbi)) + continue; + + if (force && start >= cpc->trim_start && + (end - 1) <= cpc->trim_end) + continue; + + if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) { + f2fs_issue_discard(sbi, START_BLOCK(sbi, start), + (end - start) << sbi->log_blocks_per_seg); + continue; + } +next: + secno = GET_SEC_FROM_SEG(sbi, start); + start_segno = GET_SEG_FROM_SEC(sbi, secno); + if (!IS_CURSEC(sbi, secno) && + !get_valid_blocks(sbi, start, true)) + f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), + sbi->segs_per_sec << sbi->log_blocks_per_seg); + + start = start_segno + sbi->segs_per_sec; + if (start < end) + goto next; + else + end = start - 1; + } + mutex_unlock(&dirty_i->seglist_lock); + + /* send small discards */ + list_for_each_entry_safe(entry, this, head, list) { + unsigned int cur_pos = 0, next_pos, len, total_len = 0; + bool is_valid = test_bit_le(0, entry->discard_map); + +find_next: + if (is_valid) { + next_pos = find_next_zero_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + len = next_pos - cur_pos; + + if (f2fs_sb_has_blkzoned(sbi->sb) || + (force && len < cpc->trim_minlen)) + goto skip; + + f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, + len); + total_len += len; + } else { + next_pos = find_next_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + } +skip: + cur_pos = next_pos; + is_valid = !is_valid; + + if (cur_pos < sbi->blocks_per_seg) + goto find_next; + + release_discard_addr(entry); + dcc->nr_discards -= total_len; + } + + wake_up_discard_thread(sbi, false); +} + +static int create_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct discard_cmd_control *dcc; + int err = 0, i; + + if (SM_I(sbi)->dcc_info) { + dcc = SM_I(sbi)->dcc_info; + goto init_thread; + } + + dcc = f2fs_kzalloc(sbi, sizeof(struct discard_cmd_control), GFP_KERNEL); + if (!dcc) + return -ENOMEM; + + dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + INIT_LIST_HEAD(&dcc->entry_list); + for (i = 0; i < MAX_PLIST_NUM; i++) + INIT_LIST_HEAD(&dcc->pend_list[i]); + INIT_LIST_HEAD(&dcc->wait_list); + INIT_LIST_HEAD(&dcc->fstrim_list); + mutex_init(&dcc->cmd_lock); + atomic_set(&dcc->issued_discard, 0); + atomic_set(&dcc->issing_discard, 0); + atomic_set(&dcc->discard_cmd_cnt, 0); + dcc->nr_discards = 0; + dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; + dcc->undiscard_blks = 0; + dcc->next_pos = 0; + dcc->root = RB_ROOT; + dcc->rbtree_check = false; + + init_waitqueue_head(&dcc->discard_wait_queue); + SM_I(sbi)->dcc_info = dcc; +init_thread: + dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, + "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(dcc->f2fs_issue_discard)) { + err = PTR_ERR(dcc->f2fs_issue_discard); + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; + return err; + } + + return err; +} + +static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (!dcc) + return; + + f2fs_stop_discard_thread(sbi); + + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; +} + +static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + + if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) { + sit_i->dirty_sentries++; + return false; + } + + return true; +} + +static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, + unsigned int segno, int modified) +{ + struct seg_entry *se = get_seg_entry(sbi, segno); + se->type = type; + if (modified) + __mark_sit_entry_dirty(sbi, segno); +} + +static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) +{ + struct seg_entry *se; + unsigned int segno, offset; + long int new_vblocks; + bool exist; +#ifdef CONFIG_F2FS_CHECK_FS + bool mir_exist; +#endif + + segno = GET_SEGNO(sbi, blkaddr); + + se = get_seg_entry(sbi, segno); + new_vblocks = se->valid_blocks + del; + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + + f2fs_bug_on(sbi, (new_vblocks >> (sizeof(unsigned short) << 3) || + (new_vblocks > sbi->blocks_per_seg))); + + se->valid_blocks = new_vblocks; + se->mtime = get_mtime(sbi, false); + if (se->mtime > SIT_I(sbi)->max_mtime) + SIT_I(sbi)->max_mtime = se->mtime; + + /* Update valid block bitmap */ + if (del > 0) { + exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + mir_exist = f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when setting bitmap, blk:%u, old bit:%d", + blkaddr, exist); + f2fs_bug_on(sbi, 1); + } +#endif + if (unlikely(exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly set, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks--; + del = 0; + } + + if (!f2fs_test_and_set_bit(offset, se->discard_map)) + sbi->discard_blks--; + + /* don't overwrite by SSR to keep node chain */ + if (IS_NODESEG(se->type)) { + if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) + se->ckpt_valid_blocks++; + } + } else { + exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + mir_exist = f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when clearing bitmap, blk:%u, old bit:%d", + blkaddr, exist); + f2fs_bug_on(sbi, 1); + } +#endif + if (unlikely(!exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly cleared, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks++; + del = 0; + } + + if (f2fs_test_and_clear_bit(offset, se->discard_map)) + sbi->discard_blks++; + } + if (!f2fs_test_bit(offset, se->ckpt_valid_map)) + se->ckpt_valid_blocks += del; + + __mark_sit_entry_dirty(sbi, segno); + + /* update total number of valid blocks to be written in ckpt area */ + SIT_I(sbi)->written_valid_blocks += del; + + if (sbi->segs_per_sec > 1) + get_sec_entry(sbi, segno)->valid_blocks += del; +} + +void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) +{ + unsigned int segno = GET_SEGNO(sbi, addr); + struct sit_info *sit_i = SIT_I(sbi); + + f2fs_bug_on(sbi, addr == NULL_ADDR); + if (addr == NEW_ADDR) + return; + + invalidate_mapping_pages(META_MAPPING(sbi), addr, addr); + + /* add it into sit main buffer */ + down_write(&sit_i->sentry_lock); + + update_sit_entry(sbi, addr, -1); + + /* add it into dirty seglist */ + locate_dirty_segment(sbi, segno); + + up_write(&sit_i->sentry_lock); +} + +bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int segno, offset; + struct seg_entry *se; + bool is_cp = false; + + if (!is_valid_data_blkaddr(sbi, blkaddr)) + return true; + + down_read(&sit_i->sentry_lock); + + segno = GET_SEGNO(sbi, blkaddr); + se = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + + if (f2fs_test_bit(offset, se->ckpt_valid_map)) + is_cp = true; + + up_read(&sit_i->sentry_lock); + + return is_cp; +} + +/* + * This function should be resided under the curseg_mutex lock + */ +static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, + struct f2fs_summary *sum) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + void *addr = curseg->sum_blk; + addr += curseg->next_blkoff * sizeof(struct f2fs_summary); + memcpy(addr, sum, sizeof(struct f2fs_summary)); +} + +/* + * Calculate the number of current summary pages for writing + */ +int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) +{ + int valid_sum_count = 0; + int i, sum_in_page; + + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + if (sbi->ckpt->alloc_type[i] == SSR) + valid_sum_count += sbi->blocks_per_seg; + else { + if (for_ra) + valid_sum_count += le16_to_cpu( + F2FS_CKPT(sbi)->cur_data_blkoff[i]); + else + valid_sum_count += curseg_blkoff(sbi, i); + } + } + + sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE - + SUM_FOOTER_SIZE) / SUMMARY_SIZE; + if (valid_sum_count <= sum_in_page) + return 1; + else if ((valid_sum_count - sum_in_page) <= + (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) + return 2; + return 3; +} + +/* + * Caller should put this summary page + */ +struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) +{ + return f2fs_get_meta_page_nofail(sbi, GET_SUM_BLOCK(sbi, segno)); +} + +void f2fs_update_meta_page(struct f2fs_sb_info *sbi, + void *src, block_t blk_addr) +{ + struct page *page = f2fs_grab_meta_page(sbi, blk_addr); + + memcpy(page_address(page), src, PAGE_SIZE); + set_page_dirty(page); + f2fs_put_page(page, 1); +} + +static void write_sum_page(struct f2fs_sb_info *sbi, + struct f2fs_summary_block *sum_blk, block_t blk_addr) +{ + f2fs_update_meta_page(sbi, (void *)sum_blk, blk_addr); +} + +static void write_current_sum_page(struct f2fs_sb_info *sbi, + int type, block_t blk_addr) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + struct page *page = f2fs_grab_meta_page(sbi, blk_addr); + struct f2fs_summary_block *src = curseg->sum_blk; + struct f2fs_summary_block *dst; + + dst = (struct f2fs_summary_block *)page_address(page); + memset(dst, 0, PAGE_SIZE); + + mutex_lock(&curseg->curseg_mutex); + + down_read(&curseg->journal_rwsem); + memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE); + up_read(&curseg->journal_rwsem); + + memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE); + memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE); + + mutex_unlock(&curseg->curseg_mutex); + + set_page_dirty(page); + f2fs_put_page(page, 1); +} + +static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int segno = curseg->segno + 1; + struct free_segmap_info *free_i = FREE_I(sbi); + + if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec) + return !test_bit(segno, free_i->free_segmap); + return 0; +} + +/* + * Find a new segment from the free segments bitmap to right order + * This function should be returned with success, otherwise BUG + */ +static void get_new_segment(struct f2fs_sb_info *sbi, + unsigned int *newseg, bool new_sec, int dir) +{ + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int segno, secno, zoneno; + unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; + unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); + unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); + unsigned int left_start = hint; + bool init = true; + int go_left = 0; + int i; + + spin_lock(&free_i->segmap_lock); + + if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { + segno = find_next_zero_bit(free_i->free_segmap, + GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); + if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) + goto got_it; + } +find_other_zone: + secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); + if (secno >= MAIN_SECS(sbi)) { + if (dir == ALLOC_RIGHT) { + secno = find_next_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi), 0); + f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi)); + } else { + go_left = 1; + left_start = hint - 1; + } + } + if (go_left == 0) + goto skip_left; + + while (test_bit(left_start, free_i->free_secmap)) { + if (left_start > 0) { + left_start--; + continue; + } + left_start = find_next_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi), 0); + f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi)); + break; + } + secno = left_start; +skip_left: + segno = GET_SEG_FROM_SEC(sbi, secno); + zoneno = GET_ZONE_FROM_SEC(sbi, secno); + + /* give up on finding another zone */ + if (!init) + goto got_it; + if (sbi->secs_per_zone == 1) + goto got_it; + if (zoneno == old_zoneno) + goto got_it; + if (dir == ALLOC_LEFT) { + if (!go_left && zoneno + 1 >= total_zones) + goto got_it; + if (go_left && zoneno == 0) + goto got_it; + } + for (i = 0; i < NR_CURSEG_TYPE; i++) + if (CURSEG_I(sbi, i)->zone == zoneno) + break; + + if (i < NR_CURSEG_TYPE) { + /* zone is in user, try another */ + if (go_left) + hint = zoneno * sbi->secs_per_zone - 1; + else if (zoneno + 1 >= total_zones) + hint = 0; + else + hint = (zoneno + 1) * sbi->secs_per_zone; + init = false; + goto find_other_zone; + } +got_it: + /* set it as dirty segment in free segmap */ + f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap)); + __set_inuse(sbi, segno); + *newseg = segno; + spin_unlock(&free_i->segmap_lock); +} + +static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + struct summary_footer *sum_footer; + + curseg->segno = curseg->next_segno; + curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); + curseg->next_blkoff = 0; + curseg->next_segno = NULL_SEGNO; + + sum_footer = &(curseg->sum_blk->footer); + memset(sum_footer, 0, sizeof(struct summary_footer)); + if (IS_DATASEG(type)) + SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA); + if (IS_NODESEG(type)) + SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE); + __set_sit_entry_type(sbi, type, curseg->segno, modified); +} + +static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) +{ + /* if segs_per_sec is large than 1, we need to keep original policy. */ + if (sbi->segs_per_sec != 1) + return CURSEG_I(sbi, type)->segno; + + if (test_opt(sbi, NOHEAP) && + (type == CURSEG_HOT_DATA || IS_NODESEG(type))) + return 0; + + if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) + return SIT_I(sbi)->last_victim[ALLOC_NEXT]; + + /* find segments from 0 to reuse freed segments */ + if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) + return 0; + + return CURSEG_I(sbi, type)->segno; +} + +/* + * Allocate a current working segment. + * This function always allocates a free segment in LFS manner. + */ +static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int segno = curseg->segno; + int dir = ALLOC_LEFT; + + write_sum_page(sbi, curseg->sum_blk, + GET_SUM_BLOCK(sbi, segno)); + if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA) + dir = ALLOC_RIGHT; + + if (test_opt(sbi, NOHEAP)) + dir = ALLOC_RIGHT; + + segno = __get_next_segno(sbi, type); + get_new_segment(sbi, &segno, new_sec, dir); + curseg->next_segno = segno; + reset_curseg(sbi, type, 1); + curseg->alloc_type = LFS; +} + +static void __next_free_blkoff(struct f2fs_sb_info *sbi, + struct curseg_info *seg, block_t start) +{ + struct seg_entry *se = get_seg_entry(sbi, seg->segno); + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + unsigned long *target_map = SIT_I(sbi)->tmp_map; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + int i, pos; + + for (i = 0; i < entries; i++) + target_map[i] = ckpt_map[i] | cur_map[i]; + + pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); + + seg->next_blkoff = pos; +} + +/* + * If a segment is written by LFS manner, next block offset is just obtained + * by increasing the current block offset. However, if a segment is written by + * SSR manner, next block offset obtained by calling __next_free_blkoff + */ +static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, + struct curseg_info *seg) +{ + if (seg->alloc_type == SSR) + __next_free_blkoff(sbi, seg, seg->next_blkoff + 1); + else + seg->next_blkoff++; +} + +/* + * This function always allocates a used segment(from dirty seglist) by SSR + * manner, so it should recover the existing segment information of valid blocks + */ +static void change_curseg(struct f2fs_sb_info *sbi, int type) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int new_segno = curseg->next_segno; + struct f2fs_summary_block *sum_node; + struct page *sum_page; + + write_sum_page(sbi, curseg->sum_blk, + GET_SUM_BLOCK(sbi, curseg->segno)); + __set_test_and_inuse(sbi, new_segno); + + mutex_lock(&dirty_i->seglist_lock); + __remove_dirty_segment(sbi, new_segno, PRE); + __remove_dirty_segment(sbi, new_segno, DIRTY); + mutex_unlock(&dirty_i->seglist_lock); + + reset_curseg(sbi, type, 1); + curseg->alloc_type = SSR; + __next_free_blkoff(sbi, curseg, 0); + + sum_page = f2fs_get_sum_page(sbi, new_segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); + f2fs_put_page(sum_page, 1); +} + +static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + unsigned segno = NULL_SEGNO; + int i, cnt; + bool reversed = false; + + /* f2fs_need_SSR() already forces to do this */ + if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { + curseg->next_segno = segno; + return 1; + } + + /* For node segments, let's do SSR more intensively */ + if (IS_NODESEG(type)) { + if (type >= CURSEG_WARM_NODE) { + reversed = true; + i = CURSEG_COLD_NODE; + } else { + i = CURSEG_HOT_NODE; + } + cnt = NR_CURSEG_NODE_TYPE; + } else { + if (type >= CURSEG_WARM_DATA) { + reversed = true; + i = CURSEG_COLD_DATA; + } else { + i = CURSEG_HOT_DATA; + } + cnt = NR_CURSEG_DATA_TYPE; + } + + for (; cnt-- > 0; reversed ? i-- : i++) { + if (i == type) + continue; + if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) { + curseg->next_segno = segno; + return 1; + } + } + return 0; +} + +/* + * flush out current segment and replace it with new segment + * This function should be returned with success, otherwise BUG + */ +static void allocate_segment_by_default(struct f2fs_sb_info *sbi, + int type, bool force) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + if (force) + new_curseg(sbi, type, true); + else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + type == CURSEG_WARM_NODE) + new_curseg(sbi, type, false); + else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) + new_curseg(sbi, type, false); + else if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type)) + change_curseg(sbi, type); + else + new_curseg(sbi, type, false); + + stat_inc_seg_type(sbi, curseg); +} + +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg; + unsigned int old_segno; + int i; + + down_write(&SIT_I(sbi)->sentry_lock); + + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + curseg = CURSEG_I(sbi, i); + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); + locate_dirty_segment(sbi, old_segno); + } + + up_write(&SIT_I(sbi)->sentry_lock); +} + +static const struct segment_allocation default_salloc_ops = { + .allocate_segment = allocate_segment_by_default, +}; + +bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + __u64 trim_start = cpc->trim_start; + bool has_candidate = false; + + down_write(&SIT_I(sbi)->sentry_lock); + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { + if (add_discard_addrs(sbi, cpc, true)) { + has_candidate = true; + break; + } + } + up_write(&SIT_I(sbi)->sentry_lock); + + cpc->trim_start = trim_start; + return has_candidate; +} + +static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + unsigned int start, unsigned int end) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct discard_cmd *dc; + struct blk_plug plug; + int issued; + unsigned int trimmed = 0; + +next: + issued = 0; + + mutex_lock(&dcc->cmd_lock); + if (unlikely(dcc->rbtree_check)) + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, + &dcc->root)); + + dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, + NULL, start, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (!dc) + dc = next_dc; + + blk_start_plug(&plug); + + while (dc && dc->lstart <= end) { + struct rb_node *node; + int err = 0; + + if (dc->len < dpolicy->granularity) + goto skip; + + if (dc->state != D_PREP) { + list_move_tail(&dc->list, &dcc->fstrim_list); + goto skip; + } + + err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); + + if (issued >= dpolicy->max_requests) { + start = dc->lstart + dc->len; + + if (err) + __remove_discard_cmd(sbi, dc); + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + trimmed += __wait_all_discard_cmd(sbi, NULL); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto next; + } +skip: + node = rb_next(&dc->rb_node); + if (err) + __remove_discard_cmd(sbi, dc); + dc = rb_entry_safe(node, struct discard_cmd, rb_node); + + if (fatal_signal_pending(current)) + break; + } + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + return trimmed; +} + +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) +{ + __u64 start = F2FS_BYTES_TO_BLK(range->start); + __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; + unsigned int start_segno, end_segno; + block_t start_block, end_block; + struct cp_control cpc; + struct discard_policy dpolicy; + unsigned long long trimmed = 0; + int err = 0; + bool need_align = test_opt(sbi, LFS) && sbi->segs_per_sec > 1; + + if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) + return -EINVAL; + + if (end < MAIN_BLKADDR(sbi)) + goto out; + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Found FS corruption, run fsck to fix."); + return -EFSCORRUPTED; + } + + /* start/end segment number in main_area */ + start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); + end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : + GET_SEGNO(sbi, end); + if (need_align) { + start_segno = rounddown(start_segno, sbi->segs_per_sec); + end_segno = roundup(end_segno + 1, sbi->segs_per_sec) - 1; + } + + cpc.reason = CP_DISCARD; + cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); + cpc.trim_start = start_segno; + cpc.trim_end = end_segno; + + if (sbi->discard_blks == 0) + goto out; + + mutex_lock(&sbi->gc_mutex); + err = f2fs_write_checkpoint(sbi, &cpc); + mutex_unlock(&sbi->gc_mutex); + if (err) + goto out; + + /* + * We filed discard candidates, but actually we don't need to wait for + * all of them, since they'll be issued in idle time along with runtime + * discard option. User configuration looks like using runtime discard + * or periodic fstrim instead of it. + */ + if (f2fs_realtime_discard_enable(sbi)) + goto out; + + start_block = START_BLOCK(sbi, start_segno); + end_block = START_BLOCK(sbi, end_segno + 1); + + __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); + trimmed = __issue_discard_cmd_range(sbi, &dpolicy, + start_block, end_block); + + trimmed += __wait_discard_cmd_range(sbi, &dpolicy, + start_block, end_block); +out: + if (!err) + range->len = F2FS_BLK_TO_BYTES(trimmed); + return err; +} + +static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + if (curseg->next_blkoff < sbi->blocks_per_seg) + return true; + return false; +} + +int f2fs_rw_hint_to_seg_type(enum rw_hint hint) +{ + switch (hint) { + case WRITE_LIFE_SHORT: + return CURSEG_HOT_DATA; + case WRITE_LIFE_EXTREME: + return CURSEG_COLD_DATA; + default: + return CURSEG_WARM_DATA; + } +} + +/* This returns write hints for each segment type. This hints will be + * passed down to block layer. There are mapping tables which depend on + * the mount option 'whint_mode'. + * + * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. + * + * 2) whint_mode=user-based. F2FS tries to pass down hints given by users. + * + * User F2FS Block + * ---- ---- ----- + * META WRITE_LIFE_NOT_SET + * HOT_NODE " + * WARM_NODE " + * COLD_NODE " + * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME + * extension list " " + * + * -- buffered io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " " + * WRITE_LIFE_MEDIUM " " + * WRITE_LIFE_LONG " " + * + * -- direct io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " WRITE_LIFE_NONE + * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM + * WRITE_LIFE_LONG " WRITE_LIFE_LONG + * + * 3) whint_mode=fs-based. F2FS passes down hints with its policy. + * + * User F2FS Block + * ---- ---- ----- + * META WRITE_LIFE_MEDIUM; + * HOT_NODE WRITE_LIFE_NOT_SET + * WARM_NODE " + * COLD_NODE WRITE_LIFE_NONE + * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME + * extension list " " + * + * -- buffered io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG + * WRITE_LIFE_NONE " " + * WRITE_LIFE_MEDIUM " " + * WRITE_LIFE_LONG " " + * + * -- direct io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " WRITE_LIFE_NONE + * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM + * WRITE_LIFE_LONG " WRITE_LIFE_LONG + */ + +enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp) +{ + if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) { + if (type == DATA) { + if (temp == WARM) + return WRITE_LIFE_NOT_SET; + else if (temp == HOT) + return WRITE_LIFE_SHORT; + else if (temp == COLD) + return WRITE_LIFE_EXTREME; + } else { + return WRITE_LIFE_NOT_SET; + } + } else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) { + if (type == DATA) { + if (temp == WARM) + return WRITE_LIFE_LONG; + else if (temp == HOT) + return WRITE_LIFE_SHORT; + else if (temp == COLD) + return WRITE_LIFE_EXTREME; + } else if (type == NODE) { + if (temp == WARM || temp == HOT) + return WRITE_LIFE_NOT_SET; + else if (temp == COLD) + return WRITE_LIFE_NONE; + } else if (type == META) { + return WRITE_LIFE_MEDIUM; + } + } + return WRITE_LIFE_NOT_SET; +} + +static int __get_segment_type_2(struct f2fs_io_info *fio) +{ + if (fio->type == DATA) + return CURSEG_HOT_DATA; + else + return CURSEG_HOT_NODE; +} + +static int __get_segment_type_4(struct f2fs_io_info *fio) +{ + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; + + if (S_ISDIR(inode->i_mode)) + return CURSEG_HOT_DATA; + else + return CURSEG_COLD_DATA; + } else { + if (IS_DNODE(fio->page) && is_cold_node(fio->page)) + return CURSEG_WARM_NODE; + else + return CURSEG_COLD_NODE; + } +} + +static int __get_segment_type_6(struct f2fs_io_info *fio) +{ + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; + + if (is_cold_data(fio->page) || file_is_cold(inode)) + return CURSEG_COLD_DATA; + if (file_is_hot(inode) || + is_inode_flag_set(inode, FI_HOT_DATA) || + f2fs_is_atomic_file(inode) || + f2fs_is_volatile_file(inode)) + return CURSEG_HOT_DATA; + return f2fs_rw_hint_to_seg_type(inode->i_write_hint); + } else { + if (IS_DNODE(fio->page)) + return is_cold_node(fio->page) ? CURSEG_WARM_NODE : + CURSEG_HOT_NODE; + return CURSEG_COLD_NODE; + } +} + +static int __get_segment_type(struct f2fs_io_info *fio) +{ + int type = 0; + + switch (F2FS_OPTION(fio->sbi).active_logs) { + case 2: + type = __get_segment_type_2(fio); + break; + case 4: + type = __get_segment_type_4(fio); + break; + case 6: + type = __get_segment_type_6(fio); + break; + default: + f2fs_bug_on(fio->sbi, true); + } + + if (IS_HOT(type)) + fio->temp = HOT; + else if (IS_WARM(type)) + fio->temp = WARM; + else + fio->temp = COLD; + return type; +} + +void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, type); + + down_read(&SM_I(sbi)->curseg_lock); + + mutex_lock(&curseg->curseg_mutex); + down_write(&sit_i->sentry_lock); + + *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + + f2fs_wait_discard_bio(sbi, *new_blkaddr); + + /* + * __add_sum_entry should be resided under the curseg_mutex + * because, this function updates a summary entry in the + * current summary block. + */ + __add_sum_entry(sbi, type, sum); + + __refresh_next_blkoff(sbi, curseg); + + stat_inc_block_count(sbi, curseg); + + /* + * SIT information should be updated before segment allocation, + * since SSR needs latest valid block information. + */ + update_sit_entry(sbi, *new_blkaddr, 1); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + update_sit_entry(sbi, old_blkaddr, -1); + + if (!__has_curseg_space(sbi, type)) + sit_i->s_ops->allocate_segment(sbi, type, false); + + /* + * segment dirty status should be updated after segment allocation, + * so we just need to update status only one time after previous + * segment being closed. + */ + locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); + + up_write(&sit_i->sentry_lock); + + if (page && IS_NODESEG(type)) { + fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + + f2fs_inode_chksum_set(sbi, page); + } + + if (add_list) { + struct f2fs_bio_info *io; + + INIT_LIST_HEAD(&fio->list); + fio->in_list = true; + fio->retry = false; + io = sbi->write_io[fio->type] + fio->temp; + spin_lock(&io->io_lock); + list_add_tail(&fio->list, &io->io_list); + spin_unlock(&io->io_lock); + } + + mutex_unlock(&curseg->curseg_mutex); + + up_read(&SM_I(sbi)->curseg_lock); +} + +static void update_device_state(struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = fio->sbi; + unsigned int devidx; + + if (!f2fs_is_multi_device(sbi)) + return; + + devidx = f2fs_target_device_index(sbi, fio->new_blkaddr); + + /* update device state for fsync */ + f2fs_set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); + + /* update device state for checkpoint */ + if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { + spin_lock(&sbi->dev_lock); + f2fs_set_bit(devidx, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } +} + +static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) +{ + int type = __get_segment_type(fio); + bool keep_order = (test_opt(fio->sbi, LFS) && type == CURSEG_COLD_DATA); + + if (keep_order) + down_read(&fio->sbi->io_order_lock); +reallocate: + f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, + &fio->new_blkaddr, sum, type, fio, true); + if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) + invalidate_mapping_pages(META_MAPPING(fio->sbi), + fio->old_blkaddr, fio->old_blkaddr); + + /* writeout dirty page into bdev */ + f2fs_submit_page_write(fio); + if (fio->retry) { + fio->old_blkaddr = fio->new_blkaddr; + goto reallocate; + } + + update_device_state(fio); + + if (keep_order) + up_read(&fio->sbi->io_order_lock); +} + +void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type) +{ + struct f2fs_io_info fio = { + .sbi = sbi, + .type = META, + .temp = HOT, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, + .old_blkaddr = page->index, + .new_blkaddr = page->index, + .page = page, + .encrypted_page = NULL, + .in_list = false, + }; + + if (unlikely(page->index >= MAIN_BLKADDR(sbi))) + fio.op_flags &= ~REQ_META; + + set_page_writeback(page); + ClearPageError(page); + f2fs_submit_page_write(&fio); + + f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); +} + +void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) +{ + struct f2fs_summary sum; + + set_summary(&sum, nid, 0, 0); + do_write_page(&sum, fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); +} + +void f2fs_outplace_write_data(struct dnode_of_data *dn, + struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = fio->sbi; + struct f2fs_summary sum; + + f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); + set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); + do_write_page(&sum, fio); + f2fs_update_data_blkaddr(dn, fio->new_blkaddr); + + f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); +} + +int f2fs_inplace_write_data(struct f2fs_io_info *fio) +{ + int err; + struct f2fs_sb_info *sbi = fio->sbi; + unsigned int segno; + + fio->new_blkaddr = fio->old_blkaddr; + /* i/o temperature is needed for passing down write hints */ + __get_segment_type(fio); + + segno = GET_SEGNO(sbi, fio->new_blkaddr); + + if (!IS_DATASEG(get_seg_entry(sbi, segno)->type)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -EFSCORRUPTED; + } + + stat_inc_inplace_blocks(fio->sbi); + + err = f2fs_submit_page_bio(fio); + if (!err) + update_device_state(fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + + return err; +} + +static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + int i; + + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { + if (CURSEG_I(sbi, i)->segno == segno) + break; + } + return i; +} + +void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr, + bool recover_curseg, bool recover_newaddr) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct curseg_info *curseg; + unsigned int segno, old_cursegno; + struct seg_entry *se; + int type; + unsigned short old_blkoff; + + segno = GET_SEGNO(sbi, new_blkaddr); + se = get_seg_entry(sbi, segno); + type = se->type; + + down_write(&SM_I(sbi)->curseg_lock); + + if (!recover_curseg) { + /* for recovery flow */ + if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { + if (old_blkaddr == NULL_ADDR) + type = CURSEG_COLD_DATA; + else + type = CURSEG_WARM_DATA; + } + } else { + if (IS_CURSEG(sbi, segno)) { + /* se->type is volatile as SSR allocation */ + type = __f2fs_get_curseg(sbi, segno); + f2fs_bug_on(sbi, type == NO_CHECK_TYPE); + } else { + type = CURSEG_WARM_DATA; + } + } + + f2fs_bug_on(sbi, !IS_DATASEG(type)); + curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + down_write(&sit_i->sentry_lock); + + old_cursegno = curseg->segno; + old_blkoff = curseg->next_blkoff; + + /* change the current segment */ + if (segno != curseg->segno) { + curseg->next_segno = segno; + change_curseg(sbi, type); + } + + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); + __add_sum_entry(sbi, type, sum); + + if (!recover_curseg || recover_newaddr) + update_sit_entry(sbi, new_blkaddr, 1); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { + invalidate_mapping_pages(META_MAPPING(sbi), + old_blkaddr, old_blkaddr); + update_sit_entry(sbi, old_blkaddr, -1); + } + + locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr)); + + locate_dirty_segment(sbi, old_cursegno); + + if (recover_curseg) { + if (old_cursegno != curseg->segno) { + curseg->next_segno = old_cursegno; + change_curseg(sbi, type); + } + curseg->next_blkoff = old_blkoff; + } + + up_write(&sit_i->sentry_lock); + mutex_unlock(&curseg->curseg_mutex); + up_write(&SM_I(sbi)->curseg_lock); +} + +void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, + block_t old_addr, block_t new_addr, + unsigned char version, bool recover_curseg, + bool recover_newaddr) +{ + struct f2fs_summary sum; + + set_summary(&sum, dn->nid, dn->ofs_in_node, version); + + f2fs_do_replace_block(sbi, &sum, old_addr, new_addr, + recover_curseg, recover_newaddr); + + f2fs_update_data_blkaddr(dn, new_addr); +} + +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type, bool ordered) +{ + if (PageWriteback(page)) { + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + + f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, type); + if (ordered) + wait_on_page_writeback(page); + else + wait_for_stable_page(page); + } +} + +void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct page *cpage; + + if (!f2fs_post_read_required(inode)) + return; + + if (!is_valid_data_blkaddr(sbi, blkaddr)) + return; + + cpage = find_lock_page(META_MAPPING(sbi), blkaddr); + if (cpage) { + f2fs_wait_on_page_writeback(cpage, DATA, true); + f2fs_put_page(cpage, 1); + } +} + +static int read_compacted_summaries(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct curseg_info *seg_i; + unsigned char *kaddr; + struct page *page; + block_t start; + int i, j, offset; + + start = start_sum_block(sbi); + + page = f2fs_get_meta_page(sbi, start++); + if (IS_ERR(page)) + return PTR_ERR(page); + kaddr = (unsigned char *)page_address(page); + + /* Step 1: restore nat cache */ + seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); + memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE); + + /* Step 2: restore sit cache */ + seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); + memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE); + offset = 2 * SUM_JOURNAL_SIZE; + + /* Step 3: restore summary entries */ + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + unsigned short blk_off; + unsigned int segno; + + seg_i = CURSEG_I(sbi, i); + segno = le32_to_cpu(ckpt->cur_data_segno[i]); + blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]); + seg_i->next_segno = segno; + reset_curseg(sbi, i, 0); + seg_i->alloc_type = ckpt->alloc_type[i]; + seg_i->next_blkoff = blk_off; + + if (seg_i->alloc_type == SSR) + blk_off = sbi->blocks_per_seg; + + for (j = 0; j < blk_off; j++) { + struct f2fs_summary *s; + s = (struct f2fs_summary *)(kaddr + offset); + seg_i->sum_blk->entries[j] = *s; + offset += SUMMARY_SIZE; + if (offset + SUMMARY_SIZE <= PAGE_SIZE - + SUM_FOOTER_SIZE) + continue; + + f2fs_put_page(page, 1); + page = NULL; + + page = f2fs_get_meta_page(sbi, start++); + if (IS_ERR(page)) + return PTR_ERR(page); + kaddr = (unsigned char *)page_address(page); + offset = 0; + } + } + f2fs_put_page(page, 1); + return 0; +} + +static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_summary_block *sum; + struct curseg_info *curseg; + struct page *new; + unsigned short blk_off; + unsigned int segno = 0; + block_t blk_addr = 0; + int err = 0; + + /* get segment number and block addr */ + if (IS_DATASEG(type)) { + segno = le32_to_cpu(ckpt->cur_data_segno[type]); + blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - + CURSEG_HOT_DATA]); + if (__exist_node_summaries(sbi)) + blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type); + else + blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); + } else { + segno = le32_to_cpu(ckpt->cur_node_segno[type - + CURSEG_HOT_NODE]); + blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type - + CURSEG_HOT_NODE]); + if (__exist_node_summaries(sbi)) + blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE, + type - CURSEG_HOT_NODE); + else + blk_addr = GET_SUM_BLOCK(sbi, segno); + } + + new = f2fs_get_meta_page(sbi, blk_addr); + if (IS_ERR(new)) + return PTR_ERR(new); + sum = (struct f2fs_summary_block *)page_address(new); + + if (IS_NODESEG(type)) { + if (__exist_node_summaries(sbi)) { + struct f2fs_summary *ns = &sum->entries[0]; + int i; + for (i = 0; i < sbi->blocks_per_seg; i++, ns++) { + ns->version = 0; + ns->ofs_in_node = 0; + } + } else { + err = f2fs_restore_node_summary(sbi, segno, sum); + if (err) + goto out; + } + } + + /* set uncompleted segment to curseg */ + curseg = CURSEG_I(sbi, type); + mutex_lock(&curseg->curseg_mutex); + + /* update journal info */ + down_write(&curseg->journal_rwsem); + memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE); + up_write(&curseg->journal_rwsem); + + memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE); + memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE); + curseg->next_segno = segno; + reset_curseg(sbi, type, 0); + curseg->alloc_type = ckpt->alloc_type[type]; + curseg->next_blkoff = blk_off; + mutex_unlock(&curseg->curseg_mutex); +out: + f2fs_put_page(new, 1); + return err; +} + +static int restore_curseg_summaries(struct f2fs_sb_info *sbi) +{ + struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal; + struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal; + int type = CURSEG_HOT_DATA; + int err; + + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) { + int npages = f2fs_npages_for_summary_flush(sbi, true); + + if (npages >= 2) + f2fs_ra_meta_pages(sbi, start_sum_block(sbi), npages, + META_CP, true); + + /* restore for compacted data summary */ + err = read_compacted_summaries(sbi); + if (err) + return err; + type = CURSEG_HOT_NODE; + } + + if (__exist_node_summaries(sbi)) + f2fs_ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), + NR_CURSEG_TYPE - type, META_CP, true); + + for (; type <= CURSEG_COLD_NODE; type++) { + err = read_normal_summaries(sbi, type); + if (err) + return err; + } + + /* sanity check for summary blocks */ + if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || + sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) + return -EINVAL; + + return 0; +} + +static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct page *page; + unsigned char *kaddr; + struct f2fs_summary *summary; + struct curseg_info *seg_i; + int written_size = 0; + int i, j; + + page = f2fs_grab_meta_page(sbi, blkaddr++); + kaddr = (unsigned char *)page_address(page); + memset(kaddr, 0, PAGE_SIZE); + + /* Step 1: write nat cache */ + seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); + memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE); + written_size += SUM_JOURNAL_SIZE; + + /* Step 2: write sit cache */ + seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); + memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE); + written_size += SUM_JOURNAL_SIZE; + + /* Step 3: write summary entries */ + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + unsigned short blkoff; + seg_i = CURSEG_I(sbi, i); + if (sbi->ckpt->alloc_type[i] == SSR) + blkoff = sbi->blocks_per_seg; + else + blkoff = curseg_blkoff(sbi, i); + + for (j = 0; j < blkoff; j++) { + if (!page) { + page = f2fs_grab_meta_page(sbi, blkaddr++); + kaddr = (unsigned char *)page_address(page); + memset(kaddr, 0, PAGE_SIZE); + written_size = 0; + } + summary = (struct f2fs_summary *)(kaddr + written_size); + *summary = seg_i->sum_blk->entries[j]; + written_size += SUMMARY_SIZE; + + if (written_size + SUMMARY_SIZE <= PAGE_SIZE - + SUM_FOOTER_SIZE) + continue; + + set_page_dirty(page); + f2fs_put_page(page, 1); + page = NULL; + } + } + if (page) { + set_page_dirty(page); + f2fs_put_page(page, 1); + } +} + +static void write_normal_summaries(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + int i, end; + if (IS_DATASEG(type)) + end = type + NR_CURSEG_DATA_TYPE; + else + end = type + NR_CURSEG_NODE_TYPE; + + for (i = type; i < end; i++) + write_current_sum_page(sbi, i, blkaddr + (i - type)); +} + +void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +{ + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) + write_compacted_summaries(sbi, start_blk); + else + write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); +} + +void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +{ + write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); +} + +int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, + unsigned int val, int alloc) +{ + int i; + + if (type == NAT_JOURNAL) { + for (i = 0; i < nats_in_cursum(journal); i++) { + if (le32_to_cpu(nid_in_journal(journal, i)) == val) + return i; + } + if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL)) + return update_nats_in_cursum(journal, 1); + } else if (type == SIT_JOURNAL) { + for (i = 0; i < sits_in_cursum(journal); i++) + if (le32_to_cpu(segno_in_journal(journal, i)) == val) + return i; + if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL)) + return update_sits_in_cursum(journal, 1); + } + return -1; +} + +static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + return f2fs_get_meta_page_nofail(sbi, current_sit_addr(sbi, segno)); +} + +static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, + unsigned int start) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct page *page; + pgoff_t src_off, dst_off; + + src_off = current_sit_addr(sbi, start); + dst_off = next_sit_addr(sbi, src_off); + + page = f2fs_grab_meta_page(sbi, dst_off); + seg_info_to_sit_page(sbi, page, start); + + set_page_dirty(page); + set_to_next_sit(sit_i, start); + + return page; +} + +static struct sit_entry_set *grab_sit_entry_set(void) +{ + struct sit_entry_set *ses = + f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS); + + ses->entry_cnt = 0; + INIT_LIST_HEAD(&ses->set_list); + return ses; +} + +static void release_sit_entry_set(struct sit_entry_set *ses) +{ + list_del(&ses->set_list); + kmem_cache_free(sit_entry_set_slab, ses); +} + +static void adjust_sit_entry_set(struct sit_entry_set *ses, + struct list_head *head) +{ + struct sit_entry_set *next = ses; + + if (list_is_last(&ses->set_list, head)) + return; + + list_for_each_entry_continue(next, head, set_list) + if (ses->entry_cnt <= next->entry_cnt) + break; + + list_move_tail(&ses->set_list, &next->set_list); +} + +static void add_sit_entry(unsigned int segno, struct list_head *head) +{ + struct sit_entry_set *ses; + unsigned int start_segno = START_SEGNO(segno); + + list_for_each_entry(ses, head, set_list) { + if (ses->start_segno == start_segno) { + ses->entry_cnt++; + adjust_sit_entry_set(ses, head); + return; + } + } + + ses = grab_sit_entry_set(); + + ses->start_segno = start_segno; + ses->entry_cnt++; + list_add(&ses->set_list, head); +} + +static void add_sits_in_set(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_info = SM_I(sbi); + struct list_head *set_list = &sm_info->sit_entry_set; + unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap; + unsigned int segno; + + for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi)) + add_sit_entry(segno, set_list); +} + +static void remove_sits_in_journal(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); + struct f2fs_journal *journal = curseg->journal; + int i; + + down_write(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { + unsigned int segno; + bool dirtied; + + segno = le32_to_cpu(segno_in_journal(journal, i)); + dirtied = __mark_sit_entry_dirty(sbi, segno); + + if (!dirtied) + add_sit_entry(segno, &SM_I(sbi)->sit_entry_set); + } + update_sits_in_cursum(journal, -i); + up_write(&curseg->journal_rwsem); +} + +/* + * CP calls this function, which flushes SIT entries including sit_journal, + * and moves prefree segs to free segs. + */ +void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned long *bitmap = sit_i->dirty_sentries_bitmap; + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); + struct f2fs_journal *journal = curseg->journal; + struct sit_entry_set *ses, *tmp; + struct list_head *head = &SM_I(sbi)->sit_entry_set; + bool to_journal = true; + struct seg_entry *se; + + down_write(&sit_i->sentry_lock); + + if (!sit_i->dirty_sentries) + goto out; + + /* + * add and account sit entries of dirty bitmap in sit entry + * set temporarily + */ + add_sits_in_set(sbi); + + /* + * if there are no enough space in journal to store dirty sit + * entries, remove all entries from journal and add and account + * them in sit entry set. + */ + if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL)) + remove_sits_in_journal(sbi); + + /* + * there are two steps to flush sit entries: + * #1, flush sit entries to journal in current cold data summary block. + * #2, flush sit entries to sit page. + */ + list_for_each_entry_safe(ses, tmp, head, set_list) { + struct page *page = NULL; + struct f2fs_sit_block *raw_sit = NULL; + unsigned int start_segno = ses->start_segno; + unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK, + (unsigned long)MAIN_SEGS(sbi)); + unsigned int segno = start_segno; + + if (to_journal && + !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL)) + to_journal = false; + + if (to_journal) { + down_write(&curseg->journal_rwsem); + } else { + page = get_next_sit_page(sbi, start_segno); + raw_sit = page_address(page); + } + + /* flush dirty sit entries in region of current sit set */ + for_each_set_bit_from(segno, bitmap, end) { + int offset, sit_offset; + + se = get_seg_entry(sbi, segno); +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(se->cur_valid_map, se->cur_valid_map_mir, + SIT_VBLOCK_MAP_SIZE)) + f2fs_bug_on(sbi, 1); +#endif + + /* add discard candidates */ + if (!(cpc->reason & CP_DISCARD)) { + cpc->trim_start = segno; + add_discard_addrs(sbi, cpc, false); + } + + if (to_journal) { + offset = f2fs_lookup_journal_in_cursum(journal, + SIT_JOURNAL, segno, 1); + f2fs_bug_on(sbi, offset < 0); + segno_in_journal(journal, offset) = + cpu_to_le32(segno); + seg_info_to_raw_sit(se, + &sit_in_journal(journal, offset)); + check_block_count(sbi, segno, + &sit_in_journal(journal, offset)); + } else { + sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); + seg_info_to_raw_sit(se, + &raw_sit->entries[sit_offset]); + check_block_count(sbi, segno, + &raw_sit->entries[sit_offset]); + } + + __clear_bit(segno, bitmap); + sit_i->dirty_sentries--; + ses->entry_cnt--; + } + + if (to_journal) + up_write(&curseg->journal_rwsem); + else + f2fs_put_page(page, 1); + + f2fs_bug_on(sbi, ses->entry_cnt); + release_sit_entry_set(ses); + } + + f2fs_bug_on(sbi, !list_empty(head)); + f2fs_bug_on(sbi, sit_i->dirty_sentries); +out: + if (cpc->reason & CP_DISCARD) { + __u64 trim_start = cpc->trim_start; + + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) + add_discard_addrs(sbi, cpc, false); + + cpc->trim_start = trim_start; + } + up_write(&sit_i->sentry_lock); + + set_prefree_as_free_segments(sbi); +} + +static int build_sit_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + struct sit_info *sit_i; + unsigned int sit_segs, start; + char *src_bitmap; + unsigned int bitmap_size; + + /* allocate memory for SIT information */ + sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL); + if (!sit_i) + return -ENOMEM; + + SM_I(sbi)->sit_info = sit_i; + + sit_i->sentries = + f2fs_kvzalloc(sbi, array_size(sizeof(struct seg_entry), + MAIN_SEGS(sbi)), + GFP_KERNEL); + if (!sit_i->sentries) + return -ENOMEM; + + bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); + sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(sbi, bitmap_size, + GFP_KERNEL); + if (!sit_i->dirty_sentries_bitmap) + return -ENOMEM; + + for (start = 0; start < MAIN_SEGS(sbi); start++) { + sit_i->sentries[start].cur_valid_map + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + sit_i->sentries[start].ckpt_valid_map + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].cur_valid_map || + !sit_i->sentries[start].ckpt_valid_map) + return -ENOMEM; + +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sentries[start].cur_valid_map_mir + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].cur_valid_map_mir) + return -ENOMEM; +#endif + + sit_i->sentries[start].discard_map + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, + GFP_KERNEL); + if (!sit_i->sentries[start].discard_map) + return -ENOMEM; + } + + sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->tmp_map) + return -ENOMEM; + + if (sbi->segs_per_sec > 1) { + sit_i->sec_entries = + f2fs_kvzalloc(sbi, array_size(sizeof(struct sec_entry), + MAIN_SECS(sbi)), + GFP_KERNEL); + if (!sit_i->sec_entries) + return -ENOMEM; + } + + /* get information related with SIT */ + sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1; + + /* setup SIT bitmap from ckeckpoint pack */ + bitmap_size = __bitmap_size(sbi, SIT_BITMAP); + src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); + + sit_i->sit_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap) + return -ENOMEM; + +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sit_bitmap_mir = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap_mir) + return -ENOMEM; +#endif + + /* init SIT information */ + sit_i->s_ops = &default_salloc_ops; + + sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); + sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; + sit_i->written_valid_blocks = 0; + sit_i->bitmap_size = bitmap_size; + sit_i->dirty_sentries = 0; + sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; + sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); + sit_i->mounted_time = ktime_get_real_seconds(); + init_rwsem(&sit_i->sentry_lock); + return 0; +} + +static int build_free_segmap(struct f2fs_sb_info *sbi) +{ + struct free_segmap_info *free_i; + unsigned int bitmap_size, sec_bitmap_size; + + /* allocate memory for free segmap information */ + free_i = f2fs_kzalloc(sbi, sizeof(struct free_segmap_info), GFP_KERNEL); + if (!free_i) + return -ENOMEM; + + SM_I(sbi)->free_info = free_i; + + bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); + free_i->free_segmap = f2fs_kvmalloc(sbi, bitmap_size, GFP_KERNEL); + if (!free_i->free_segmap) + return -ENOMEM; + + sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + free_i->free_secmap = f2fs_kvmalloc(sbi, sec_bitmap_size, GFP_KERNEL); + if (!free_i->free_secmap) + return -ENOMEM; + + /* set all segments as dirty temporarily */ + memset(free_i->free_segmap, 0xff, bitmap_size); + memset(free_i->free_secmap, 0xff, sec_bitmap_size); + + /* init free segmap information */ + free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi)); + free_i->free_segments = 0; + free_i->free_sections = 0; + spin_lock_init(&free_i->segmap_lock); + return 0; +} + +static int build_curseg(struct f2fs_sb_info *sbi) +{ + struct curseg_info *array; + int i; + + array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, sizeof(*array)), + GFP_KERNEL); + if (!array) + return -ENOMEM; + + SM_I(sbi)->curseg_array = array; + + for (i = 0; i < NR_CURSEG_TYPE; i++) { + mutex_init(&array[i].curseg_mutex); + array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL); + if (!array[i].sum_blk) + return -ENOMEM; + init_rwsem(&array[i].journal_rwsem); + array[i].journal = f2fs_kzalloc(sbi, + sizeof(struct f2fs_journal), GFP_KERNEL); + if (!array[i].journal) + return -ENOMEM; + array[i].segno = NULL_SEGNO; + array[i].next_blkoff = 0; + } + return restore_curseg_summaries(sbi); +} + +static int build_sit_entries(struct f2fs_sb_info *sbi) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); + struct f2fs_journal *journal = curseg->journal; + struct seg_entry *se; + struct f2fs_sit_entry sit; + int sit_blk_cnt = SIT_BLK_CNT(sbi); + unsigned int i, start, end; + unsigned int readed, start_blk = 0; + int err = 0; + block_t total_node_blocks = 0; + + do { + readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + META_SIT, true); + + start = start_blk * sit_i->sents_per_block; + end = (start_blk + readed) * sit_i->sents_per_block; + + for (; start < end && start < MAIN_SEGS(sbi); start++) { + struct f2fs_sit_block *sit_blk; + struct page *page; + + se = &sit_i->sentries[start]; + page = get_current_sit_page(sbi, start); + sit_blk = (struct f2fs_sit_block *)page_address(page); + sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; + f2fs_put_page(page, 1); + + err = check_block_count(sbi, start, &sit); + if (err) + return err; + seg_info_from_raw_sit(se, &sit); + if (IS_NODESEG(se->type)) + total_node_blocks += se->valid_blocks; + + /* build discard map only one time */ + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; + } + + if (sbi->segs_per_sec > 1) + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks; + } + start_blk += readed; + } while (start_blk < sit_blk_cnt); + + down_read(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { + unsigned int old_valid_blocks; + + start = le32_to_cpu(segno_in_journal(journal, i)); + if (start >= MAIN_SEGS(sbi)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong journal entry on segno %u", + start); + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EFSCORRUPTED; + break; + } + + se = &sit_i->sentries[start]; + sit = sit_in_journal(journal, i); + + old_valid_blocks = se->valid_blocks; + if (IS_NODESEG(se->type)) + total_node_blocks -= old_valid_blocks; + + err = check_block_count(sbi, start, &sit); + if (err) + break; + seg_info_from_raw_sit(se, &sit); + if (IS_NODESEG(se->type)) + total_node_blocks += se->valid_blocks; + + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks; + sbi->discard_blks -= se->valid_blocks; + } + + if (sbi->segs_per_sec > 1) { + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks; + get_sec_entry(sbi, start)->valid_blocks -= + old_valid_blocks; + } + } + up_read(&curseg->journal_rwsem); + + if (!err && total_node_blocks != valid_node_count(sbi)) { + f2fs_msg(sbi->sb, KERN_ERR, + "SIT is corrupted node# %u vs %u", + total_node_blocks, valid_node_count(sbi)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EFSCORRUPTED; + } + + return err; +} + +static void init_free_segmap(struct f2fs_sb_info *sbi) +{ + unsigned int start; + int type; + + for (start = 0; start < MAIN_SEGS(sbi); start++) { + struct seg_entry *sentry = get_seg_entry(sbi, start); + if (!sentry->valid_blocks) + __set_free(sbi, start); + else + SIT_I(sbi)->written_valid_blocks += + sentry->valid_blocks; + } + + /* set use the current segments */ + for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) { + struct curseg_info *curseg_t = CURSEG_I(sbi, type); + __set_test_and_inuse(sbi, curseg_t->segno); + } +} + +static void init_dirty_segmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int segno = 0, offset = 0; + unsigned short valid_blocks; + + while (1) { + /* find dirty segment based on free segmap */ + segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset); + if (segno >= MAIN_SEGS(sbi)) + break; + offset = segno + 1; + valid_blocks = get_valid_blocks(sbi, segno, false); + if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) + continue; + if (valid_blocks > sbi->blocks_per_seg) { + f2fs_bug_on(sbi, 1); + continue; + } + mutex_lock(&dirty_i->seglist_lock); + __locate_dirty_segment(sbi, segno, DIRTY); + mutex_unlock(&dirty_i->seglist_lock); + } +} + +static int init_victim_secmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + + dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); + if (!dirty_i->victim_secmap) + return -ENOMEM; + return 0; +} + +static int build_dirty_segmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i; + unsigned int bitmap_size, i; + + /* allocate memory for dirty segments list information */ + dirty_i = f2fs_kzalloc(sbi, sizeof(struct dirty_seglist_info), + GFP_KERNEL); + if (!dirty_i) + return -ENOMEM; + + SM_I(sbi)->dirty_info = dirty_i; + mutex_init(&dirty_i->seglist_lock); + + bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); + + for (i = 0; i < NR_DIRTY_TYPE; i++) { + dirty_i->dirty_segmap[i] = f2fs_kvzalloc(sbi, bitmap_size, + GFP_KERNEL); + if (!dirty_i->dirty_segmap[i]) + return -ENOMEM; + } + + init_dirty_segmap(sbi); + return init_victim_secmap(sbi); +} + +static int sanity_check_curseg(struct f2fs_sb_info *sbi) +{ + int i; + + /* + * In LFS/SSR curseg, .next_blkoff should point to an unused blkaddr; + * In LFS curseg, all blkaddr after .next_blkoff should be unused. + */ + for (i = 0; i < NO_CHECK_TYPE; i++) { + struct curseg_info *curseg = CURSEG_I(sbi, i); + struct seg_entry *se = get_seg_entry(sbi, curseg->segno); + unsigned int blkofs = curseg->next_blkoff; + + if (f2fs_test_bit(blkofs, se->cur_valid_map)) + goto out; + + if (curseg->alloc_type == SSR) + continue; + + for (blkofs += 1; blkofs < sbi->blocks_per_seg; blkofs++) { + if (!f2fs_test_bit(blkofs, se->cur_valid_map)) + continue; +out: + f2fs_msg(sbi->sb, KERN_ERR, + "Current segment's next free block offset is " + "inconsistent with bitmap, logtype:%u, " + "segno:%u, type:%u, next_blkoff:%u, blkofs:%u", + i, curseg->segno, curseg->alloc_type, + curseg->next_blkoff, blkofs); + return -EFSCORRUPTED; + } + } + return 0; +} + +/* + * Update min, max modified time for cost-benefit GC algorithm + */ +static void init_min_max_mtime(struct f2fs_sb_info *sbi) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int segno; + + down_write(&sit_i->sentry_lock); + + sit_i->min_mtime = ULLONG_MAX; + + for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { + unsigned int i; + unsigned long long mtime = 0; + + for (i = 0; i < sbi->segs_per_sec; i++) + mtime += get_seg_entry(sbi, segno + i)->mtime; + + mtime = div_u64(mtime, sbi->segs_per_sec); + + if (sit_i->min_mtime > mtime) + sit_i->min_mtime = mtime; + } + sit_i->max_mtime = get_mtime(sbi, false); + up_write(&sit_i->sentry_lock); +} + +int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_sm_info *sm_info; + int err; + + sm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_sm_info), GFP_KERNEL); + if (!sm_info) + return -ENOMEM; + + /* init sm info */ + sbi->sm_info = sm_info; + sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); + sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); + sm_info->segment_count = le32_to_cpu(raw_super->segment_count); + sm_info->reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); + sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); + sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); + sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + sm_info->rec_prefree_segments = sm_info->main_segments * + DEF_RECLAIM_PREFREE_SEGMENTS / 100; + if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS) + sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; + + if (!test_opt(sbi, LFS)) + sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; + sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; + sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; + sm_info->min_seq_blocks = sbi->blocks_per_seg * sbi->segs_per_sec; + sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; + sm_info->min_ssr_sections = reserved_sections(sbi); + + INIT_LIST_HEAD(&sm_info->sit_entry_set); + + init_rwsem(&sm_info->curseg_lock); + + if (!f2fs_readonly(sbi->sb)) { + err = f2fs_create_flush_cmd_control(sbi); + if (err) + return err; + } + + err = create_discard_cmd_control(sbi); + if (err) + return err; + + err = build_sit_info(sbi); + if (err) + return err; + err = build_free_segmap(sbi); + if (err) + return err; + err = build_curseg(sbi); + if (err) + return err; + + /* reinit free segmap based on SIT */ + err = build_sit_entries(sbi); + if (err) + return err; + + init_free_segmap(sbi); + err = build_dirty_segmap(sbi); + if (err) + return err; + + err = sanity_check_curseg(sbi); + if (err) + return err; + + init_min_max_mtime(sbi); + return 0; +} + +static void discard_dirty_segmap(struct f2fs_sb_info *sbi, + enum dirty_type dirty_type) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + + mutex_lock(&dirty_i->seglist_lock); + kvfree(dirty_i->dirty_segmap[dirty_type]); + dirty_i->nr_dirty[dirty_type] = 0; + mutex_unlock(&dirty_i->seglist_lock); +} + +static void destroy_victim_secmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + kvfree(dirty_i->victim_secmap); +} + +static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + int i; + + if (!dirty_i) + return; + + /* discard pre-free/dirty segments list */ + for (i = 0; i < NR_DIRTY_TYPE; i++) + discard_dirty_segmap(sbi, i); + + destroy_victim_secmap(sbi); + SM_I(sbi)->dirty_info = NULL; + kfree(dirty_i); +} + +static void destroy_curseg(struct f2fs_sb_info *sbi) +{ + struct curseg_info *array = SM_I(sbi)->curseg_array; + int i; + + if (!array) + return; + SM_I(sbi)->curseg_array = NULL; + for (i = 0; i < NR_CURSEG_TYPE; i++) { + kfree(array[i].sum_blk); + kfree(array[i].journal); + } + kfree(array); +} + +static void destroy_free_segmap(struct f2fs_sb_info *sbi) +{ + struct free_segmap_info *free_i = SM_I(sbi)->free_info; + if (!free_i) + return; + SM_I(sbi)->free_info = NULL; + kvfree(free_i->free_segmap); + kvfree(free_i->free_secmap); + kfree(free_i); +} + +static void destroy_sit_info(struct f2fs_sb_info *sbi) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int start; + + if (!sit_i) + return; + + if (sit_i->sentries) { + for (start = 0; start < MAIN_SEGS(sbi); start++) { + kfree(sit_i->sentries[start].cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(sit_i->sentries[start].cur_valid_map_mir); +#endif + kfree(sit_i->sentries[start].ckpt_valid_map); + kfree(sit_i->sentries[start].discard_map); + } + } + kfree(sit_i->tmp_map); + + kvfree(sit_i->sentries); + kvfree(sit_i->sec_entries); + kvfree(sit_i->dirty_sentries_bitmap); + + SM_I(sbi)->sit_info = NULL; + kfree(sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(sit_i->sit_bitmap_mir); +#endif + kfree(sit_i); +} + +void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_info = SM_I(sbi); + + if (!sm_info) + return; + f2fs_destroy_flush_cmd_control(sbi, true); + destroy_discard_cmd_control(sbi); + destroy_dirty_segmap(sbi); + destroy_curseg(sbi); + destroy_free_segmap(sbi); + destroy_sit_info(sbi); + sbi->sm_info = NULL; + kfree(sm_info); +} + +int __init f2fs_create_segment_manager_caches(void) +{ + discard_entry_slab = f2fs_kmem_cache_create("discard_entry", + sizeof(struct discard_entry)); + if (!discard_entry_slab) + goto fail; + + discard_cmd_slab = f2fs_kmem_cache_create("discard_cmd", + sizeof(struct discard_cmd)); + if (!discard_cmd_slab) + goto destroy_discard_entry; + + sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", + sizeof(struct sit_entry_set)); + if (!sit_entry_set_slab) + goto destroy_discard_cmd; + + inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", + sizeof(struct inmem_pages)); + if (!inmem_entry_slab) + goto destroy_sit_entry_set; + return 0; + +destroy_sit_entry_set: + kmem_cache_destroy(sit_entry_set_slab); +destroy_discard_cmd: + kmem_cache_destroy(discard_cmd_slab); +destroy_discard_entry: + kmem_cache_destroy(discard_entry_slab); +fail: + return -ENOMEM; +} + +void f2fs_destroy_segment_manager_caches(void) +{ + kmem_cache_destroy(sit_entry_set_slab); + kmem_cache_destroy(discard_cmd_slab); + kmem_cache_destroy(discard_entry_slab); + kmem_cache_destroy(inmem_entry_slab); +} diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h new file mode 100644 index 000000000..d5f9c9289 --- /dev/null +++ b/fs/f2fs/segment.h @@ -0,0 +1,868 @@ +/* + * fs/f2fs/segment.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/blkdev.h> +#include <linux/backing-dev.h> + +/* constant macro */ +#define NULL_SEGNO ((unsigned int)(~0)) +#define NULL_SECNO ((unsigned int)(~0)) + +#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ +#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ + +#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ + +/* L: Logical segment # in volume, R: Relative segment # in main area */ +#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) +#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) + +#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) + +#define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) +#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) +#define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA) + +#define IS_CURSEG(sbi, seg) \ + (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + +#define IS_CURSEC(sbi, secno) \ + (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ + (sbi)->segs_per_sec)) \ + +#define MAIN_BLKADDR(sbi) \ + (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->main_blkaddr)) +#define SEG0_BLKADDR(sbi) \ + (SM_I(sbi) ? SM_I(sbi)->seg0_blkaddr : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment0_blkaddr)) + +#define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) +#define MAIN_SECS(sbi) ((sbi)->total_sections) + +#define TOTAL_SEGS(sbi) \ + (SM_I(sbi) ? SM_I(sbi)->segment_count : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count)) +#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) + +#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) +#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \ + (sbi)->log_blocks_per_seg)) + +#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ + (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg)) + +#define NEXT_FREE_BLKADDR(sbi, curseg) \ + (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff) + +#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) +#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg) +#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) + +#define GET_SEGNO(sbi, blk_addr) \ + ((!is_valid_data_blkaddr(sbi, blk_addr)) ? \ + NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ + GET_SEGNO_FROM_SEG0(sbi, blk_addr))) +#define BLKS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) +#define GET_SEC_FROM_SEG(sbi, segno) \ + (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec) +#define GET_SEG_FROM_SEC(sbi, secno) \ + ((secno) * (sbi)->segs_per_sec) +#define GET_ZONE_FROM_SEC(sbi, secno) \ + (((secno) == -1) ? -1: (secno) / (sbi)->secs_per_zone) +#define GET_ZONE_FROM_SEG(sbi, segno) \ + GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) + +#define GET_SUM_BLOCK(sbi, segno) \ + ((sbi)->sm_info->ssa_blkaddr + (segno)) + +#define GET_SUM_TYPE(footer) ((footer)->entry_type) +#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) + +#define SIT_ENTRY_OFFSET(sit_i, segno) \ + ((segno) % (sit_i)->sents_per_block) +#define SIT_BLOCK_OFFSET(segno) \ + ((segno) / SIT_ENTRY_PER_BLOCK) +#define START_SEGNO(segno) \ + (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) +#define SIT_BLK_CNT(sbi) \ + ((MAIN_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK) +#define f2fs_bitmap_size(nr) \ + (BITS_TO_LONGS(nr) * sizeof(unsigned long)) + +#define SECTOR_FROM_BLOCK(blk_addr) \ + (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) +#define SECTOR_TO_BLOCK(sectors) \ + ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) + +/* + * indicate a block allocation direction: RIGHT and LEFT. + * RIGHT means allocating new sections towards the end of volume. + * LEFT means the opposite direction. + */ +enum { + ALLOC_RIGHT = 0, + ALLOC_LEFT +}; + +/* + * In the victim_sel_policy->alloc_mode, there are two block allocation modes. + * LFS writes data sequentially with cleaning operations. + * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations. + */ +enum { + LFS = 0, + SSR +}; + +/* + * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes. + * GC_CB is based on cost-benefit algorithm. + * GC_GREEDY is based on greedy algorithm. + */ +enum { + GC_CB = 0, + GC_GREEDY, + ALLOC_NEXT, + FLUSH_DEVICE, + MAX_GC_POLICY, +}; + +/* + * BG_GC means the background cleaning job. + * FG_GC means the on-demand cleaning job. + * FORCE_FG_GC means on-demand cleaning job in background. + */ +enum { + BG_GC = 0, + FG_GC, + FORCE_FG_GC, +}; + +/* for a function parameter to select a victim segment */ +struct victim_sel_policy { + int alloc_mode; /* LFS or SSR */ + int gc_mode; /* GC_CB or GC_GREEDY */ + unsigned long *dirty_segmap; /* dirty segment bitmap */ + unsigned int max_search; /* maximum # of segments to search */ + unsigned int offset; /* last scanned bitmap offset */ + unsigned int ofs_unit; /* bitmap search unit */ + unsigned int min_cost; /* minimum cost */ + unsigned int min_segno; /* segment # having min. cost */ +}; + +struct seg_entry { + unsigned int type:6; /* segment type like CURSEG_XXX_TYPE */ + unsigned int valid_blocks:10; /* # of valid blocks */ + unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ + unsigned int padding:6; /* padding */ + unsigned char *cur_valid_map; /* validity bitmap of blocks */ +#ifdef CONFIG_F2FS_CHECK_FS + unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */ +#endif + /* + * # of valid blocks and the validity bitmap stored in the the last + * checkpoint pack. This information is used by the SSR mode. + */ + unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */ + unsigned char *discard_map; + unsigned long long mtime; /* modification time of the segment */ +}; + +struct sec_entry { + unsigned int valid_blocks; /* # of valid blocks in a section */ +}; + +struct segment_allocation { + void (*allocate_segment)(struct f2fs_sb_info *, int, bool); +}; + +/* + * this value is set in page as a private data which indicate that + * the page is atomically written, and it is in inmem_pages list. + */ +#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) +#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) + +#define IS_ATOMIC_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) +#define IS_DUMMY_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) + +#define MAX_SKIP_GC_COUNT 16 + +struct inmem_pages { + struct list_head list; + struct page *page; + block_t old_addr; /* for revoking when fail to commit */ +}; + +struct sit_info { + const struct segment_allocation *s_ops; + + block_t sit_base_addr; /* start block address of SIT area */ + block_t sit_blocks; /* # of blocks used by SIT area */ + block_t written_valid_blocks; /* # of valid blocks in main area */ + char *sit_bitmap; /* SIT bitmap pointer */ +#ifdef CONFIG_F2FS_CHECK_FS + char *sit_bitmap_mir; /* SIT bitmap mirror */ +#endif + unsigned int bitmap_size; /* SIT bitmap size */ + + unsigned long *tmp_map; /* bitmap for temporal use */ + unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */ + unsigned int dirty_sentries; /* # of dirty sentries */ + unsigned int sents_per_block; /* # of SIT entries per block */ + struct rw_semaphore sentry_lock; /* to protect SIT cache */ + struct seg_entry *sentries; /* SIT segment-level cache */ + struct sec_entry *sec_entries; /* SIT section-level cache */ + + /* for cost-benefit algorithm in cleaning procedure */ + unsigned long long elapsed_time; /* elapsed time after mount */ + unsigned long long mounted_time; /* mount time */ + unsigned long long min_mtime; /* min. modification time */ + unsigned long long max_mtime; /* max. modification time */ + + unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ +}; + +struct free_segmap_info { + unsigned int start_segno; /* start segment number logically */ + unsigned int free_segments; /* # of free segments */ + unsigned int free_sections; /* # of free sections */ + spinlock_t segmap_lock; /* free segmap lock */ + unsigned long *free_segmap; /* free segment bitmap */ + unsigned long *free_secmap; /* free section bitmap */ +}; + +/* Notice: The order of dirty type is same with CURSEG_XXX in f2fs.h */ +enum dirty_type { + DIRTY_HOT_DATA, /* dirty segments assigned as hot data logs */ + DIRTY_WARM_DATA, /* dirty segments assigned as warm data logs */ + DIRTY_COLD_DATA, /* dirty segments assigned as cold data logs */ + DIRTY_HOT_NODE, /* dirty segments assigned as hot node logs */ + DIRTY_WARM_NODE, /* dirty segments assigned as warm node logs */ + DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */ + DIRTY, /* to count # of dirty segments */ + PRE, /* to count # of entirely obsolete segments */ + NR_DIRTY_TYPE +}; + +struct dirty_seglist_info { + const struct victim_selection *v_ops; /* victim selction operation */ + unsigned long *dirty_segmap[NR_DIRTY_TYPE]; + struct mutex seglist_lock; /* lock for segment bitmaps */ + int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ + unsigned long *victim_secmap; /* background GC victims */ +}; + +/* victim selection function for cleaning and SSR */ +struct victim_selection { + int (*get_victim)(struct f2fs_sb_info *, unsigned int *, + int, int, char); +}; + +/* for active log information */ +struct curseg_info { + struct mutex curseg_mutex; /* lock for consistency */ + struct f2fs_summary_block *sum_blk; /* cached summary block */ + struct rw_semaphore journal_rwsem; /* protect journal area */ + struct f2fs_journal *journal; /* cached journal info */ + unsigned char alloc_type; /* current allocation type */ + unsigned int segno; /* current segment number */ + unsigned short next_blkoff; /* next block offset to write */ + unsigned int zone; /* current zone number */ + unsigned int next_segno; /* preallocated segment */ +}; + +struct sit_entry_set { + struct list_head set_list; /* link with all sit sets */ + unsigned int start_segno; /* start segno of sits in set */ + unsigned int entry_cnt; /* the # of sit entries in set */ +}; + +/* + * inline functions + */ +static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type) +{ + return (struct curseg_info *)(SM_I(sbi)->curseg_array + type); +} + +static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + return &sit_i->sentries[segno]; +} + +static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)]; +} + +static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, + unsigned int segno, bool use_section) +{ + /* + * In order to get # of valid blocks in a section instantly from many + * segments, f2fs manages two counting structures separately. + */ + if (use_section && sbi->segs_per_sec > 1) + return get_sec_entry(sbi, segno)->valid_blocks; + else + return get_seg_entry(sbi, segno)->valid_blocks; +} + +static inline void seg_info_from_raw_sit(struct seg_entry *se, + struct f2fs_sit_entry *rs) +{ + se->valid_blocks = GET_SIT_VBLOCKS(rs); + se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs); + memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); + memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#ifdef CONFIG_F2FS_CHECK_FS + memcpy(se->cur_valid_map_mir, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#endif + se->type = GET_SIT_TYPE(rs); + se->mtime = le64_to_cpu(rs->mtime); +} + +static inline void __seg_info_to_raw_sit(struct seg_entry *se, + struct f2fs_sit_entry *rs) +{ + unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) | + se->valid_blocks; + rs->vblocks = cpu_to_le16(raw_vblocks); + memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); + rs->mtime = cpu_to_le64(se->mtime); +} + +static inline void seg_info_to_sit_page(struct f2fs_sb_info *sbi, + struct page *page, unsigned int start) +{ + struct f2fs_sit_block *raw_sit; + struct seg_entry *se; + struct f2fs_sit_entry *rs; + unsigned int end = min(start + SIT_ENTRY_PER_BLOCK, + (unsigned long)MAIN_SEGS(sbi)); + int i; + + raw_sit = (struct f2fs_sit_block *)page_address(page); + memset(raw_sit, 0, PAGE_SIZE); + for (i = 0; i < end - start; i++) { + rs = &raw_sit->entries[i]; + se = get_seg_entry(sbi, start + i); + __seg_info_to_raw_sit(se, rs); + } +} + +static inline void seg_info_to_raw_sit(struct seg_entry *se, + struct f2fs_sit_entry *rs) +{ + __seg_info_to_raw_sit(se, rs); + + memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); + se->ckpt_valid_blocks = se->valid_blocks; +} + +static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, + unsigned int max, unsigned int segno) +{ + unsigned int ret; + spin_lock(&free_i->segmap_lock); + ret = find_next_bit(free_i->free_segmap, max, segno); + spin_unlock(&free_i->segmap_lock); + return ret; +} + +static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); + unsigned int next; + + spin_lock(&free_i->segmap_lock); + clear_bit(segno, free_i->free_segmap); + free_i->free_segments++; + + next = find_next_bit(free_i->free_segmap, + start_segno + sbi->segs_per_sec, start_segno); + if (next >= start_segno + sbi->segs_per_sec) { + clear_bit(secno, free_i->free_secmap); + free_i->free_sections++; + } + spin_unlock(&free_i->segmap_lock); +} + +static inline void __set_inuse(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + set_bit(segno, free_i->free_segmap); + free_i->free_segments--; + if (!test_and_set_bit(secno, free_i->free_secmap)) + free_i->free_sections--; +} + +static inline void __set_test_and_free(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); + unsigned int next; + + spin_lock(&free_i->segmap_lock); + if (test_and_clear_bit(segno, free_i->free_segmap)) { + free_i->free_segments++; + + if (IS_CURSEC(sbi, secno)) + goto skip_free; + next = find_next_bit(free_i->free_segmap, + start_segno + sbi->segs_per_sec, start_segno); + if (next >= start_segno + sbi->segs_per_sec) { + if (test_and_clear_bit(secno, free_i->free_secmap)) + free_i->free_sections++; + } + } +skip_free: + spin_unlock(&free_i->segmap_lock); +} + +static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct free_segmap_info *free_i = FREE_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + spin_lock(&free_i->segmap_lock); + if (!test_and_set_bit(segno, free_i->free_segmap)) { + free_i->free_segments--; + if (!test_and_set_bit(secno, free_i->free_secmap)) + free_i->free_sections--; + } + spin_unlock(&free_i->segmap_lock); +} + +static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, + void *dst_addr) +{ + struct sit_info *sit_i = SIT_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(sit_i->sit_bitmap, sit_i->sit_bitmap_mir, + sit_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif + memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size); +} + +static inline block_t written_block_count(struct f2fs_sb_info *sbi) +{ + return SIT_I(sbi)->written_valid_blocks; +} + +static inline unsigned int free_segments(struct f2fs_sb_info *sbi) +{ + return FREE_I(sbi)->free_segments; +} + +static inline int reserved_segments(struct f2fs_sb_info *sbi) +{ + return SM_I(sbi)->reserved_segments; +} + +static inline unsigned int free_sections(struct f2fs_sb_info *sbi) +{ + return FREE_I(sbi)->free_sections; +} + +static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) +{ + return DIRTY_I(sbi)->nr_dirty[PRE]; +} + +static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi) +{ + return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] + + DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_DATA] + + DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_DATA] + + DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_NODE] + + DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_NODE] + + DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_NODE]; +} + +static inline int overprovision_segments(struct f2fs_sb_info *sbi) +{ + return SM_I(sbi)->ovp_segments; +} + +static inline int reserved_sections(struct f2fs_sb_info *sbi) +{ + return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); +} + +static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, + unsigned int node_blocks, unsigned int dent_blocks) +{ + + unsigned int segno, left_blocks; + int i; + + /* check current node segment */ + for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { + segno = CURSEG_I(sbi, i)->segno; + left_blocks = sbi->blocks_per_seg - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; + + if (node_blocks > left_blocks) + return false; + } + + /* check current data segment */ + segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; + left_blocks = sbi->blocks_per_seg - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; + if (dent_blocks > left_blocks) + return false; + return true; +} + +static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, + int freed, int needed) +{ + unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + + get_pages(sbi, F2FS_DIRTY_DENTS) + + get_pages(sbi, F2FS_DIRTY_IMETA); + unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int node_secs = total_node_blocks / BLKS_PER_SEC(sbi); + unsigned int dent_secs = total_dent_blocks / BLKS_PER_SEC(sbi); + unsigned int node_blocks = total_node_blocks % BLKS_PER_SEC(sbi); + unsigned int dent_blocks = total_dent_blocks % BLKS_PER_SEC(sbi); + unsigned int free, need_lower, need_upper; + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return false; + + free = free_sections(sbi) + freed; + need_lower = node_secs + dent_secs + reserved_sections(sbi) + needed; + need_upper = need_lower + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0); + + if (free > need_upper) + return false; + else if (free <= need_lower) + return true; + return !has_curseg_enough_space(sbi, node_blocks, dent_blocks); +} + +static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) +{ + return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments; +} + +static inline int utilization(struct f2fs_sb_info *sbi) +{ + return div_u64((u64)valid_user_blocks(sbi) * 100, + sbi->user_block_count); +} + +/* + * Sometimes f2fs may be better to drop out-of-place update policy. + * And, users can control the policy through sysfs entries. + * There are five policies with triggering conditions as follows. + * F2FS_IPU_FORCE - all the time, + * F2FS_IPU_SSR - if SSR mode is activated, + * F2FS_IPU_UTIL - if FS utilization is over threashold, + * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over + * threashold, + * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash + * storages. IPU will be triggered only if the # of dirty + * pages over min_fsync_blocks. + * F2FS_IPUT_DISABLE - disable IPU. (=default option) + */ +#define DEF_MIN_IPU_UTIL 70 +#define DEF_MIN_FSYNC_BLOCKS 8 +#define DEF_MIN_HOT_BLOCKS 16 + +#define SMALL_VOLUME_SEGMENTS (16 * 512) /* 16GB */ + +enum { + F2FS_IPU_FORCE, + F2FS_IPU_SSR, + F2FS_IPU_UTIL, + F2FS_IPU_SSR_UTIL, + F2FS_IPU_FSYNC, + F2FS_IPU_ASYNC, +}; + +static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, + int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + return curseg->segno; +} + +static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi, + int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + return curseg->alloc_type; +} + +static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + return curseg->next_blkoff; +} + +static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) +{ + f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1); +} + +static inline void verify_block_addr(struct f2fs_io_info *fio, block_t blk_addr) +{ + struct f2fs_sb_info *sbi = fio->sbi; + + if (__is_meta_io(fio)) + verify_blkaddr(sbi, blk_addr, META_GENERIC); + else + verify_blkaddr(sbi, blk_addr, DATA_GENERIC); +} + +/* + * Summary block is always treated as an invalid block + */ +static inline int check_block_count(struct f2fs_sb_info *sbi, + int segno, struct f2fs_sit_entry *raw_sit) +{ + bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; + int valid_blocks = 0; + int cur_pos = 0, next_pos; + + /* check bitmap with valid block count */ + do { + if (is_valid) { + next_pos = find_next_zero_bit_le(&raw_sit->valid_map, + sbi->blocks_per_seg, + cur_pos); + valid_blocks += next_pos - cur_pos; + } else + next_pos = find_next_bit_le(&raw_sit->valid_map, + sbi->blocks_per_seg, + cur_pos); + cur_pos = next_pos; + is_valid = !is_valid; + } while (cur_pos < sbi->blocks_per_seg); + + if (unlikely(GET_SIT_VBLOCKS(raw_sit) != valid_blocks)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Mismatch valid blocks %d vs. %d", + GET_SIT_VBLOCKS(raw_sit), valid_blocks); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -EFSCORRUPTED; + } + + /* check segment usage, and check boundary of a given segment number */ + if (unlikely(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg + || segno > TOTAL_SEGS(sbi) - 1)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong valid blocks %d or segno %u", + GET_SIT_VBLOCKS(raw_sit), segno); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -EFSCORRUPTED; + } + return 0; +} + +static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, + unsigned int start) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int offset = SIT_BLOCK_OFFSET(start); + block_t blk_addr = sit_i->sit_base_addr + offset; + + check_seg_range(sbi, start); + +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_bit(offset, sit_i->sit_bitmap) != + f2fs_test_bit(offset, sit_i->sit_bitmap_mir)) + f2fs_bug_on(sbi, 1); +#endif + + /* calculate sit block address */ + if (f2fs_test_bit(offset, sit_i->sit_bitmap)) + blk_addr += sit_i->sit_blocks; + + return blk_addr; +} + +static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi, + pgoff_t block_addr) +{ + struct sit_info *sit_i = SIT_I(sbi); + block_addr -= sit_i->sit_base_addr; + if (block_addr < sit_i->sit_blocks) + block_addr += sit_i->sit_blocks; + else + block_addr -= sit_i->sit_blocks; + + return block_addr + sit_i->sit_base_addr; +} + +static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) +{ + unsigned int block_off = SIT_BLOCK_OFFSET(start); + + f2fs_change_bit(block_off, sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, sit_i->sit_bitmap_mir); +#endif +} + +static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi, + bool base_time) +{ + struct sit_info *sit_i = SIT_I(sbi); + time64_t diff, now = ktime_get_real_seconds(); + + if (now >= sit_i->mounted_time) + return sit_i->elapsed_time + now - sit_i->mounted_time; + + /* system time is set to the past */ + if (!base_time) { + diff = sit_i->mounted_time - now; + if (sit_i->elapsed_time >= diff) + return sit_i->elapsed_time - diff; + return 0; + } + return sit_i->elapsed_time; +} + +static inline void set_summary(struct f2fs_summary *sum, nid_t nid, + unsigned int ofs_in_node, unsigned char version) +{ + sum->nid = cpu_to_le32(nid); + sum->ofs_in_node = cpu_to_le16(ofs_in_node); + sum->version = version; +} + +static inline block_t start_sum_block(struct f2fs_sb_info *sbi) +{ + return __start_cp_addr(sbi) + + le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); +} + +static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) +{ + return __start_cp_addr(sbi) + + le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count) + - (base + 1) + type; +} + +static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) +{ + if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) + return true; + return false; +} + +/* + * It is very important to gather dirty pages and write at once, so that we can + * submit a big bio without interfering other data writes. + * By default, 512 pages for directory data, + * 512 pages (2MB) * 8 for nodes, and + * 256 pages * 8 for meta are set. + */ +static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) +{ + if (sbi->sb->s_bdi->wb.dirty_exceeded) + return 0; + + if (type == DATA) + return sbi->blocks_per_seg; + else if (type == NODE) + return 8 * sbi->blocks_per_seg; + else if (type == META) + return 8 * BIO_MAX_PAGES; + else + return 0; +} + +/* + * When writing pages, it'd better align nr_to_write for segment size. + */ +static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, + struct writeback_control *wbc) +{ + long nr_to_write, desired; + + if (wbc->sync_mode != WB_SYNC_NONE) + return 0; + + nr_to_write = wbc->nr_to_write; + desired = BIO_MAX_PAGES; + if (type == NODE) + desired <<= 1; + + wbc->nr_to_write = desired; + return desired - nr_to_write; +} + +static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + bool wakeup = false; + int i; + + if (force) + goto wake_up; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (i + 1 < dcc->discard_granularity) + break; + if (!list_empty(&dcc->pend_list[i])) { + wakeup = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + if (!wakeup) + return; +wake_up: + dcc->discard_wake = 1; + wake_up_interruptible_all(&dcc->discard_wait_queue); +} diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c new file mode 100644 index 000000000..29042e6d5 --- /dev/null +++ b/fs/f2fs/shrinker.c @@ -0,0 +1,143 @@ +/* + * f2fs shrinker support + * the basic infra was copied from fs/ubifs/shrinker.c + * + * Copyright (c) 2015 Motorola Mobility + * Copyright (c) 2015 Jaegeuk Kim <jaegeuk@kernel.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "node.h" + +static LIST_HEAD(f2fs_list); +static DEFINE_SPINLOCK(f2fs_list_lock); +static unsigned int shrinker_run_no; + +static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) +{ + long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + + return count > 0 ? count : 0; +} + +static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) +{ + long count = NM_I(sbi)->nid_cnt[FREE_NID] - MAX_FREE_NIDS; + + return count > 0 ? count : 0; +} + +static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) +{ + return atomic_read(&sbi->total_zombie_tree) + + atomic_read(&sbi->total_ext_node); +} + +unsigned long f2fs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct f2fs_sb_info *sbi; + struct list_head *p; + unsigned long count = 0; + + spin_lock(&f2fs_list_lock); + p = f2fs_list.next; + while (p != &f2fs_list) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + /* count extent cache entries */ + count += __count_extent_cache(sbi); + + /* shrink clean nat cache entries */ + count += __count_nat_entries(sbi); + + /* count free nids cache entries */ + count += __count_free_nids(sbi); + + spin_lock(&f2fs_list_lock); + p = p->next; + mutex_unlock(&sbi->umount_mutex); + } + spin_unlock(&f2fs_list_lock); + return count; +} + +unsigned long f2fs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long nr = sc->nr_to_scan; + struct f2fs_sb_info *sbi; + struct list_head *p; + unsigned int run_no; + unsigned long freed = 0; + + spin_lock(&f2fs_list_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); + p = f2fs_list.next; + while (p != &f2fs_list) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + if (sbi->shrinker_run_no == run_no) + break; + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + sbi->shrinker_run_no = run_no; + + /* shrink extent cache entries */ + freed += f2fs_shrink_extent_tree(sbi, nr >> 1); + + /* shrink clean nat cache entries */ + if (freed < nr) + freed += f2fs_try_to_free_nats(sbi, nr - freed); + + /* shrink free nids cache entries */ + if (freed < nr) + freed += f2fs_try_to_free_nids(sbi, nr - freed); + + spin_lock(&f2fs_list_lock); + p = p->next; + list_move_tail(&sbi->s_list, &f2fs_list); + mutex_unlock(&sbi->umount_mutex); + if (freed >= nr) + break; + } + spin_unlock(&f2fs_list_lock); + return freed; +} + +void f2fs_join_shrinker(struct f2fs_sb_info *sbi) +{ + spin_lock(&f2fs_list_lock); + list_add_tail(&sbi->s_list, &f2fs_list); + spin_unlock(&f2fs_list_lock); +} + +void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) +{ + f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi)); + + spin_lock(&f2fs_list_lock); + list_del_init(&sbi->s_list); + spin_unlock(&f2fs_list_lock); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c new file mode 100644 index 000000000..89fc8a4ce --- /dev/null +++ b/fs/f2fs/super.c @@ -0,0 +1,3377 @@ +/* + * fs/f2fs/super.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/statfs.h> +#include <linux/buffer_head.h> +#include <linux/backing-dev.h> +#include <linux/kthread.h> +#include <linux/parser.h> +#include <linux/mount.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include <linux/random.h> +#include <linux/exportfs.h> +#include <linux/blkdev.h> +#include <linux/quotaops.h> +#include <linux/f2fs_fs.h> +#include <linux/sysfs.h> +#include <linux/quota.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "xattr.h" +#include "gc.h" +#include "trace.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/f2fs.h> + +static struct kmem_cache *f2fs_inode_cachep; + +#ifdef CONFIG_F2FS_FAULT_INJECTION + +char *f2fs_fault_name[FAULT_MAX] = { + [FAULT_KMALLOC] = "kmalloc", + [FAULT_KVMALLOC] = "kvmalloc", + [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_PAGE_GET] = "page get", + [FAULT_ALLOC_BIO] = "alloc bio", + [FAULT_ALLOC_NID] = "alloc nid", + [FAULT_ORPHAN] = "orphan", + [FAULT_BLOCK] = "no more block", + [FAULT_DIR_DEPTH] = "too big dir depth", + [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_TRUNCATE] = "truncate fail", + [FAULT_IO] = "IO error", + [FAULT_CHECKPOINT] = "checkpoint error", + [FAULT_DISCARD] = "discard error", +}; + +void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, + unsigned int type) +{ + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; + + if (rate) { + atomic_set(&ffi->inject_ops, 0); + ffi->inject_rate = rate; + } + + if (type) + ffi->inject_type = type; + + if (!rate && !type) + memset(ffi, 0, sizeof(struct f2fs_fault_info)); +} +#endif + +/* f2fs-wide shrinker description */ +static struct shrinker f2fs_shrinker_info = { + .scan_objects = f2fs_shrink_scan, + .count_objects = f2fs_shrink_count, + .seeks = DEFAULT_SEEKS, +}; + +enum { + Opt_gc_background, + Opt_disable_roll_forward, + Opt_norecovery, + Opt_discard, + Opt_nodiscard, + Opt_noheap, + Opt_heap, + Opt_user_xattr, + Opt_nouser_xattr, + Opt_acl, + Opt_noacl, + Opt_active_logs, + Opt_disable_ext_identify, + Opt_inline_xattr, + Opt_noinline_xattr, + Opt_inline_xattr_size, + Opt_inline_data, + Opt_inline_dentry, + Opt_noinline_dentry, + Opt_flush_merge, + Opt_noflush_merge, + Opt_nobarrier, + Opt_fastboot, + Opt_extent_cache, + Opt_noextent_cache, + Opt_noinline_data, + Opt_data_flush, + Opt_reserve_root, + Opt_resgid, + Opt_resuid, + Opt_mode, + Opt_io_size_bits, + Opt_fault_injection, + Opt_fault_type, + Opt_lazytime, + Opt_nolazytime, + Opt_quota, + Opt_noquota, + Opt_usrquota, + Opt_grpquota, + Opt_prjquota, + Opt_usrjquota, + Opt_grpjquota, + Opt_prjjquota, + Opt_offusrjquota, + Opt_offgrpjquota, + Opt_offprjjquota, + Opt_jqfmt_vfsold, + Opt_jqfmt_vfsv0, + Opt_jqfmt_vfsv1, + Opt_whint, + Opt_alloc, + Opt_fsync, + Opt_test_dummy_encryption, + Opt_err, +}; + +static match_table_t f2fs_tokens = { + {Opt_gc_background, "background_gc=%s"}, + {Opt_disable_roll_forward, "disable_roll_forward"}, + {Opt_norecovery, "norecovery"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_noheap, "no_heap"}, + {Opt_heap, "heap"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_active_logs, "active_logs=%u"}, + {Opt_disable_ext_identify, "disable_ext_identify"}, + {Opt_inline_xattr, "inline_xattr"}, + {Opt_noinline_xattr, "noinline_xattr"}, + {Opt_inline_xattr_size, "inline_xattr_size=%u"}, + {Opt_inline_data, "inline_data"}, + {Opt_inline_dentry, "inline_dentry"}, + {Opt_noinline_dentry, "noinline_dentry"}, + {Opt_flush_merge, "flush_merge"}, + {Opt_noflush_merge, "noflush_merge"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_fastboot, "fastboot"}, + {Opt_extent_cache, "extent_cache"}, + {Opt_noextent_cache, "noextent_cache"}, + {Opt_noinline_data, "noinline_data"}, + {Opt_data_flush, "data_flush"}, + {Opt_reserve_root, "reserve_root=%u"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, "resuid=%u"}, + {Opt_mode, "mode=%s"}, + {Opt_io_size_bits, "io_bits=%u"}, + {Opt_fault_injection, "fault_injection=%u"}, + {Opt_fault_type, "fault_type=%u"}, + {Opt_lazytime, "lazytime"}, + {Opt_nolazytime, "nolazytime"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, + {Opt_prjquota, "prjquota"}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_prjjquota, "prjjquota=%s"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_offprjjquota, "prjjquota="}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, + {Opt_whint, "whint_mode=%s"}, + {Opt_alloc, "alloc_mode=%s"}, + {Opt_fsync, "fsync_mode=%s"}, + {Opt_test_dummy_encryption, "test_dummy_encryption"}, + {Opt_err, NULL}, +}; + +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk_ratelimited("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + va_end(args); +} + +static inline void limit_reserve_root(struct f2fs_sb_info *sbi) +{ + block_t limit = (sbi->user_block_count << 1) / 1000; + + /* limit is 0.2% */ + if (test_opt(sbi, RESERVE_ROOT) && + F2FS_OPTION(sbi).root_reserved_blocks > limit) { + F2FS_OPTION(sbi).root_reserved_blocks = limit; + f2fs_msg(sbi->sb, KERN_INFO, + "Reduce reserved blocks for root = %u", + F2FS_OPTION(sbi).root_reserved_blocks); + } + if (!test_opt(sbi, RESERVE_ROOT) && + (!uid_eq(F2FS_OPTION(sbi).s_resuid, + make_kuid(&init_user_ns, F2FS_DEF_RESUID)) || + !gid_eq(F2FS_OPTION(sbi).s_resgid, + make_kgid(&init_user_ns, F2FS_DEF_RESGID)))) + f2fs_msg(sbi->sb, KERN_INFO, + "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root", + from_kuid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resuid), + from_kgid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resgid)); +} + +static void init_once(void *foo) +{ + struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; + + inode_init_once(&fi->vfs_inode); +} + +#ifdef CONFIG_QUOTA +static const char * const quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) +static int f2fs_set_qf_name(struct super_block *sb, int qtype, + substring_t *args) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + char *qname; + int ret = -EINVAL; + + if (sb_any_quota_loaded(sb) && !F2FS_OPTION(sbi).s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return -EINVAL; + } + if (f2fs_sb_has_quota_ino(sb)) { + f2fs_msg(sb, KERN_INFO, + "QUOTA feature is enabled, so ignore qf_name"); + return 0; + } + + qname = match_strdup(args); + if (!qname) { + f2fs_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return -EINVAL; + } + if (F2FS_OPTION(sbi).s_qf_names[qtype]) { + if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0) + ret = 0; + else + f2fs_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + goto errout; + } + if (strchr(qname, '/')) { + f2fs_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + goto errout; + } + F2FS_OPTION(sbi).s_qf_names[qtype] = qname; + set_opt(sbi, QUOTA); + return 0; +errout: + kfree(qname); + return ret; +} + +static int f2fs_clear_qf_name(struct super_block *sb, int qtype) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return -EINVAL; + } + kfree(F2FS_OPTION(sbi).s_qf_names[qtype]); + F2FS_OPTION(sbi).s_qf_names[qtype] = NULL; + return 0; +} + +static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) +{ + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, "Project quota feature not enabled. " + "Cannot enable project quota enforcement."); + return -1; + } + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) { + if (test_opt(sbi, USRQUOTA) && + F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) + clear_opt(sbi, USRQUOTA); + + if (test_opt(sbi, GRPQUOTA) && + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) + clear_opt(sbi, GRPQUOTA); + + if (test_opt(sbi, PRJQUOTA) && + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) + clear_opt(sbi, PRJQUOTA); + + if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || + test_opt(sbi, PRJQUOTA)) { + f2fs_msg(sbi->sb, KERN_ERR, "old and new quota " + "format mixing"); + return -1; + } + + if (!F2FS_OPTION(sbi).s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format " + "not specified"); + return -1; + } + } + + if (f2fs_sb_has_quota_ino(sbi->sb) && F2FS_OPTION(sbi).s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_INFO, + "QUOTA feature is enabled, so ignore jquota_fmt"); + F2FS_OPTION(sbi).s_jquota_fmt = 0; + } + return 0; +} +#endif + +static int parse_options(struct super_block *sb, char *options) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + substring_t args[MAX_OPT_ARGS]; + char *p, *name; + int arg = 0; + kuid_t uid; + kgid_t gid; +#ifdef CONFIG_QUOTA + int ret; +#endif + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = NULL; + token = match_token(p, f2fs_tokens, args); + + switch (token) { + case Opt_gc_background: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (strlen(name) == 2 && !strncmp(name, "on", 2)) { + set_opt(sbi, BG_GC); + clear_opt(sbi, FORCE_FG_GC); + } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) { + clear_opt(sbi, BG_GC); + clear_opt(sbi, FORCE_FG_GC); + } else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) { + set_opt(sbi, BG_GC); + set_opt(sbi, FORCE_FG_GC); + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_disable_roll_forward: + set_opt(sbi, DISABLE_ROLL_FORWARD); + break; + case Opt_norecovery: + /* this option mounts f2fs with ro */ + set_opt(sbi, DISABLE_ROLL_FORWARD); + if (!f2fs_readonly(sb)) + return -EINVAL; + break; + case Opt_discard: + set_opt(sbi, DISCARD); + break; + case Opt_nodiscard: + if (f2fs_sb_has_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "discard is required for zoned block devices"); + return -EINVAL; + } + clear_opt(sbi, DISCARD); + break; + case Opt_noheap: + set_opt(sbi, NOHEAP); + break; + case Opt_heap: + clear_opt(sbi, NOHEAP); + break; +#ifdef CONFIG_F2FS_FS_XATTR + case Opt_user_xattr: + set_opt(sbi, XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt(sbi, XATTR_USER); + break; + case Opt_inline_xattr: + set_opt(sbi, INLINE_XATTR); + break; + case Opt_noinline_xattr: + clear_opt(sbi, INLINE_XATTR); + break; + case Opt_inline_xattr_size: + if (args->from && match_int(args, &arg)) + return -EINVAL; + set_opt(sbi, INLINE_XATTR_SIZE); + F2FS_OPTION(sbi).inline_xattr_size = arg; + break; +#else + case Opt_user_xattr: + f2fs_msg(sb, KERN_INFO, + "user_xattr options not supported"); + break; + case Opt_nouser_xattr: + f2fs_msg(sb, KERN_INFO, + "nouser_xattr options not supported"); + break; + case Opt_inline_xattr: + f2fs_msg(sb, KERN_INFO, + "inline_xattr options not supported"); + break; + case Opt_noinline_xattr: + f2fs_msg(sb, KERN_INFO, + "noinline_xattr options not supported"); + break; +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL + case Opt_acl: + set_opt(sbi, POSIX_ACL); + break; + case Opt_noacl: + clear_opt(sbi, POSIX_ACL); + break; +#else + case Opt_acl: + f2fs_msg(sb, KERN_INFO, "acl options not supported"); + break; + case Opt_noacl: + f2fs_msg(sb, KERN_INFO, "noacl options not supported"); + break; +#endif + case Opt_active_logs: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) + return -EINVAL; + F2FS_OPTION(sbi).active_logs = arg; + break; + case Opt_disable_ext_identify: + set_opt(sbi, DISABLE_EXT_IDENTIFY); + break; + case Opt_inline_data: + set_opt(sbi, INLINE_DATA); + break; + case Opt_inline_dentry: + set_opt(sbi, INLINE_DENTRY); + break; + case Opt_noinline_dentry: + clear_opt(sbi, INLINE_DENTRY); + break; + case Opt_flush_merge: + set_opt(sbi, FLUSH_MERGE); + break; + case Opt_noflush_merge: + clear_opt(sbi, FLUSH_MERGE); + break; + case Opt_nobarrier: + set_opt(sbi, NOBARRIER); + break; + case Opt_fastboot: + set_opt(sbi, FASTBOOT); + break; + case Opt_extent_cache: + set_opt(sbi, EXTENT_CACHE); + break; + case Opt_noextent_cache: + clear_opt(sbi, EXTENT_CACHE); + break; + case Opt_noinline_data: + clear_opt(sbi, INLINE_DATA); + break; + case Opt_data_flush: + set_opt(sbi, DATA_FLUSH); + break; + case Opt_reserve_root: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (test_opt(sbi, RESERVE_ROOT)) { + f2fs_msg(sb, KERN_INFO, + "Preserve previous reserve_root=%u", + F2FS_OPTION(sbi).root_reserved_blocks); + } else { + F2FS_OPTION(sbi).root_reserved_blocks = arg; + set_opt(sbi, RESERVE_ROOT); + } + break; + case Opt_resuid: + if (args->from && match_int(args, &arg)) + return -EINVAL; + uid = make_kuid(current_user_ns(), arg); + if (!uid_valid(uid)) { + f2fs_msg(sb, KERN_ERR, + "Invalid uid value %d", arg); + return -EINVAL; + } + F2FS_OPTION(sbi).s_resuid = uid; + break; + case Opt_resgid: + if (args->from && match_int(args, &arg)) + return -EINVAL; + gid = make_kgid(current_user_ns(), arg); + if (!gid_valid(gid)) { + f2fs_msg(sb, KERN_ERR, + "Invalid gid value %d", arg); + return -EINVAL; + } + F2FS_OPTION(sbi).s_resgid = gid; + break; + case Opt_mode: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (strlen(name) == 8 && + !strncmp(name, "adaptive", 8)) { + if (f2fs_sb_has_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "adaptive mode is not allowed with " + "zoned block device feature"); + kfree(name); + return -EINVAL; + } + set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + } else if (strlen(name) == 3 && + !strncmp(name, "lfs", 3)) { + set_opt_mode(sbi, F2FS_MOUNT_LFS); + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_io_size_bits: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg > __ilog2_u32(BIO_MAX_PAGES)) { + f2fs_msg(sb, KERN_WARNING, + "Not support %d, larger than %d", + 1 << arg, BIO_MAX_PAGES); + return -EINVAL; + } + F2FS_OPTION(sbi).write_io_size_bits = arg; + break; + case Opt_fault_injection: + if (args->from && match_int(args, &arg)) + return -EINVAL; +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(sbi, arg, F2FS_ALL_FAULT_TYPE); + set_opt(sbi, FAULT_INJECTION); +#else + f2fs_msg(sb, KERN_INFO, + "FAULT_INJECTION was not selected"); +#endif + break; + case Opt_fault_type: + if (args->from && match_int(args, &arg)) + return -EINVAL; +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(sbi, 0, arg); + set_opt(sbi, FAULT_INJECTION); +#else + f2fs_msg(sb, KERN_INFO, + "FAULT_INJECTION was not selected"); +#endif + break; + case Opt_lazytime: + sb->s_flags |= SB_LAZYTIME; + break; + case Opt_nolazytime: + sb->s_flags &= ~SB_LAZYTIME; + break; +#ifdef CONFIG_QUOTA + case Opt_quota: + case Opt_usrquota: + set_opt(sbi, USRQUOTA); + break; + case Opt_grpquota: + set_opt(sbi, GRPQUOTA); + break; + case Opt_prjquota: + set_opt(sbi, PRJQUOTA); + break; + case Opt_usrjquota: + ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_grpjquota: + ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_prjjquota: + ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_offusrjquota: + ret = f2fs_clear_qf_name(sb, USRQUOTA); + if (ret) + return ret; + break; + case Opt_offgrpjquota: + ret = f2fs_clear_qf_name(sb, GRPQUOTA); + if (ret) + return ret; + break; + case Opt_offprjjquota: + ret = f2fs_clear_qf_name(sb, PRJQUOTA); + if (ret) + return ret; + break; + case Opt_jqfmt_vfsold: + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_OLD; + break; + case Opt_jqfmt_vfsv0: + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V0; + break; + case Opt_jqfmt_vfsv1: + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V1; + break; + case Opt_noquota: + clear_opt(sbi, QUOTA); + clear_opt(sbi, USRQUOTA); + clear_opt(sbi, GRPQUOTA); + clear_opt(sbi, PRJQUOTA); + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + case Opt_prjquota: + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_prjjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_offprjjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: + case Opt_noquota: + f2fs_msg(sb, KERN_INFO, + "quota operations not supported"); + break; +#endif + case Opt_whint: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (strlen(name) == 10 && + !strncmp(name, "user-based", 10)) { + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER; + } else if (strlen(name) == 3 && + !strncmp(name, "off", 3)) { + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; + } else if (strlen(name) == 8 && + !strncmp(name, "fs-based", 8)) { + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_alloc: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + + if (strlen(name) == 7 && + !strncmp(name, "default", 7)) { + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + } else if (strlen(name) == 5 && + !strncmp(name, "reuse", 5)) { + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_fsync: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (strlen(name) == 5 && + !strncmp(name, "posix", 5)) { + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; + } else if (strlen(name) == 6 && + !strncmp(name, "strict", 6)) { + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT; + } else if (strlen(name) == 9 && + !strncmp(name, "nobarrier", 9)) { + F2FS_OPTION(sbi).fsync_mode = + FSYNC_MODE_NOBARRIER; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_test_dummy_encryption: +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (!f2fs_sb_has_encrypt(sb)) { + f2fs_msg(sb, KERN_ERR, "Encrypt feature is off"); + return -EINVAL; + } + + F2FS_OPTION(sbi).test_dummy_encryption = true; + f2fs_msg(sb, KERN_INFO, + "Test dummy encryption mode enabled"); +#else + f2fs_msg(sb, KERN_INFO, + "Test dummy encryption mount option ignored"); +#endif + break; + default: + f2fs_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" or missing value", + p); + return -EINVAL; + } + } +#ifdef CONFIG_QUOTA + if (f2fs_check_quota_options(sbi)) + return -EINVAL; +#else + if (f2fs_sb_has_quota_ino(sbi->sb) && !f2fs_readonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "Filesystem with quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return -EINVAL; + } + if (f2fs_sb_has_project_quota(sbi->sb) && !f2fs_readonly(sbi->sb)) { + f2fs_msg(sb, KERN_ERR, + "Filesystem with project quota feature cannot be " + "mounted RDWR without CONFIG_QUOTA"); + return -EINVAL; + } +#endif + + if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { + f2fs_msg(sb, KERN_ERR, + "Should set mode=lfs with %uKB-sized IO", + F2FS_IO_SIZE_KB(sbi)); + return -EINVAL; + } + + if (test_opt(sbi, INLINE_XATTR_SIZE)) { + if (!f2fs_sb_has_extra_attr(sb) || + !f2fs_sb_has_flexible_inline_xattr(sb)) { + f2fs_msg(sb, KERN_ERR, + "extra_attr or flexible_inline_xattr " + "feature is off"); + return -EINVAL; + } + if (!test_opt(sbi, INLINE_XATTR)) { + f2fs_msg(sb, KERN_ERR, + "inline_xattr_size option should be " + "set with inline_xattr option"); + return -EINVAL; + } + if (F2FS_OPTION(sbi).inline_xattr_size < + sizeof(struct f2fs_xattr_header) / sizeof(__le32) || + F2FS_OPTION(sbi).inline_xattr_size > + DEF_ADDRS_PER_INODE - + F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - + DEF_INLINE_RESERVED_SIZE - + MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) { + f2fs_msg(sb, KERN_ERR, + "inline xattr size is out of range"); + return -EINVAL; + } + } + + /* Not pass down write hints if the number of active logs is lesser + * than NR_CURSEG_TYPE. + */ + if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE) + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; + return 0; +} + +static struct inode *f2fs_alloc_inode(struct super_block *sb) +{ + struct f2fs_inode_info *fi; + + fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO); + if (!fi) + return NULL; + + init_once((void *) fi); + + /* Initialize f2fs-specific inode info */ + atomic_set(&fi->dirty_pages, 0); + init_rwsem(&fi->i_sem); + INIT_LIST_HEAD(&fi->dirty_list); + INIT_LIST_HEAD(&fi->gdirty_list); + INIT_LIST_HEAD(&fi->inmem_ilist); + INIT_LIST_HEAD(&fi->inmem_pages); + mutex_init(&fi->inmem_lock); + init_rwsem(&fi->i_gc_rwsem[READ]); + init_rwsem(&fi->i_gc_rwsem[WRITE]); + init_rwsem(&fi->i_mmap_sem); + init_rwsem(&fi->i_xattr_sem); + + /* Will be used by directory only */ + fi->i_dir_level = F2FS_SB(sb)->dir_level; + + return &fi->vfs_inode; +} + +static int f2fs_drop_inode(struct inode *inode) +{ + int ret; + /* + * This is to avoid a deadlock condition like below. + * writeback_single_inode(inode) + * - f2fs_write_data_page + * - f2fs_gc -> iput -> evict + * - inode_wait_for_writeback(inode) + */ + if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { + if (!inode->i_nlink && !is_bad_inode(inode)) { + /* to avoid evict_inode call simultaneously */ + atomic_inc(&inode->i_count); + spin_unlock(&inode->i_lock); + + /* some remained atomic pages should discarded */ + if (f2fs_is_atomic_file(inode)) + f2fs_drop_inmem_pages(inode); + + /* should remain fi->extent_tree for writepage */ + f2fs_destroy_extent_node(inode); + + sb_start_intwrite(inode->i_sb); + f2fs_i_size_write(inode, 0); + + if (F2FS_HAS_BLOCKS(inode)) + f2fs_truncate(inode); + + sb_end_intwrite(inode->i_sb); + + spin_lock(&inode->i_lock); + atomic_dec(&inode->i_count); + } + trace_f2fs_drop_inode(inode, 0); + return 0; + } + ret = generic_drop_inode(inode); + trace_f2fs_drop_inode(inode, ret); + return ret; +} + +int f2fs_inode_dirtied(struct inode *inode, bool sync) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret = 0; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + ret = 1; + } else { + set_inode_flag(inode, FI_DIRTY_INODE); + stat_inc_dirty_inode(sbi, DIRTY_META); + } + if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) { + list_add_tail(&F2FS_I(inode)->gdirty_list, + &sbi->inode_list[DIRTY_META]); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + } + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; +} + +void f2fs_inode_synced(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return; + } + if (!list_empty(&F2FS_I(inode)->gdirty_list)) { + list_del_init(&F2FS_I(inode)->gdirty_list); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + } + clear_inode_flag(inode, FI_DIRTY_INODE); + clear_inode_flag(inode, FI_AUTO_RECOVER); + stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); + spin_unlock(&sbi->inode_lock[DIRTY_META]); +} + +/* + * f2fs_dirty_inode() is called from __mark_inode_dirty() + * + * We should call set_dirty_inode to write the dirty inode through write_inode. + */ +static void f2fs_dirty_inode(struct inode *inode, int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + return; + + if (flags == I_DIRTY_TIME) + return; + + if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) + clear_inode_flag(inode, FI_AUTO_RECOVER); + + f2fs_inode_dirtied(inode, false); +} + +static void f2fs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(f2fs_inode_cachep, F2FS_I(inode)); +} + +static void f2fs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, f2fs_i_callback); +} + +static void destroy_percpu_info(struct f2fs_sb_info *sbi) +{ + percpu_counter_destroy(&sbi->alloc_valid_block_count); + percpu_counter_destroy(&sbi->total_valid_inode_count); +} + +static void destroy_device_list(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + blkdev_put(FDEV(i).bdev, FMODE_EXCL); +#ifdef CONFIG_BLK_DEV_ZONED + kfree(FDEV(i).blkz_type); +#endif + } + kfree(sbi->devs); +} + +static void f2fs_put_super(struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i; + bool dropped; + + /* unregister procfs/sysfs entries in advance to avoid race case */ + f2fs_unregister_sysfs(sbi); + + f2fs_quota_off_umount(sb); + + /* prevent remaining shrinker jobs */ + mutex_lock(&sbi->umount_mutex); + + /* + * We don't need to do checkpoint when superblock is clean. + * But, the previous checkpoint was not done by umount, it needs to do + * clean checkpoint again. + */ + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + struct cp_control cpc = { + .reason = CP_UMOUNT, + }; + f2fs_write_checkpoint(sbi, &cpc); + } + + /* be sure to wait for any on-going discard commands */ + dropped = f2fs_wait_discard_bios(sbi); + + if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && + !sbi->discard_blks && !dropped) { + struct cp_control cpc = { + .reason = CP_UMOUNT | CP_TRIMMED, + }; + f2fs_write_checkpoint(sbi, &cpc); + } + + /* + * normally superblock is clean, so we need to release this. + * In addition, EIO will skip do checkpoint, we need this as well. + */ + f2fs_release_ino_entry(sbi, true); + + f2fs_leave_shrinker(sbi); + mutex_unlock(&sbi->umount_mutex); + + /* our cp_error case, we can wait for any writeback page */ + f2fs_flush_merged_writes(sbi); + + f2fs_wait_on_all_pages_writeback(sbi); + + f2fs_bug_on(sbi, sbi->fsync_node_num); + + iput(sbi->node_inode); + sbi->node_inode = NULL; + + iput(sbi->meta_inode); + sbi->meta_inode = NULL; + + /* + * iput() can update stat information, if f2fs_write_checkpoint() + * above failed with error. + */ + f2fs_destroy_stats(sbi); + + /* destroy f2fs internal modules */ + f2fs_destroy_node_manager(sbi); + f2fs_destroy_segment_manager(sbi); + + kfree(sbi->ckpt); + + sb->s_fs_info = NULL; + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); + kfree(sbi->raw_super); + + destroy_device_list(sbi); + mempool_destroy(sbi->write_io_dummy); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(F2FS_OPTION(sbi).s_qf_names[i]); +#endif + destroy_percpu_info(sbi); + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); + kfree(sbi); +} + +int f2fs_sync_fs(struct super_block *sb, int sync) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err = 0; + + if (unlikely(f2fs_cp_error(sbi))) + return 0; + + trace_f2fs_sync_fs(sb, sync); + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return -EAGAIN; + + if (sync) { + struct cp_control cpc; + + cpc.reason = __get_cp_reason(sbi); + + mutex_lock(&sbi->gc_mutex); + err = f2fs_write_checkpoint(sbi, &cpc); + mutex_unlock(&sbi->gc_mutex); + } + f2fs_trace_ios(NULL, 1); + + return err; +} + +static int f2fs_freeze(struct super_block *sb) +{ + if (f2fs_readonly(sb)) + return 0; + + /* IO error happened before */ + if (unlikely(f2fs_cp_error(F2FS_SB(sb)))) + return -EIO; + + /* must be clean, since sync_filesystem() was already called */ + if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) + return -EINVAL; + return 0; +} + +static int f2fs_unfreeze(struct super_block *sb) +{ + return 0; +} + +#ifdef CONFIG_QUOTA +static int f2fs_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dquot->dq_dqb_lock); + + limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, + dquot->dq_dqb.dqb_bhardlimit); + if (limit) + limit >>= sb->s_blocksize_bits; + + if (limit && buf->f_blocks > limit) { + curblock = (dquot->dq_dqb.dqb_curspace + + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, + dquot->dq_dqb.dqb_ihardlimit); + + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? + (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dquot->dq_dqb_lock); + dqput(dquot); + return 0; +} +#endif + +static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + block_t total_count, user_block_count, start_count; + u64 avail_node_count; + + total_count = le64_to_cpu(sbi->raw_super->block_count); + user_block_count = sbi->user_block_count; + start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr); + buf->f_type = F2FS_SUPER_MAGIC; + buf->f_bsize = sbi->blocksize; + + buf->f_blocks = total_count - start_count; + buf->f_bfree = user_block_count - valid_user_blocks(sbi) - + sbi->current_reserved_blocks; + if (buf->f_bfree > F2FS_OPTION(sbi).root_reserved_blocks) + buf->f_bavail = buf->f_bfree - + F2FS_OPTION(sbi).root_reserved_blocks; + else + buf->f_bavail = 0; + + avail_node_count = sbi->total_node_count - sbi->nquota_files - + F2FS_RESERVED_NODE_NUM; + + if (avail_node_count > user_block_count) { + buf->f_files = user_block_count; + buf->f_ffree = buf->f_bavail; + } else { + buf->f_files = avail_node_count; + buf->f_ffree = min(avail_node_count - valid_node_count(sbi), + buf->f_bavail); + } + + buf->f_namelen = F2FS_NAME_LEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + +#ifdef CONFIG_QUOTA + if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf); + } +#endif + return 0; +} + +static inline void f2fs_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (F2FS_OPTION(sbi).s_jquota_fmt) { + char *fmtname = ""; + + switch (F2FS_OPTION(sbi).s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) + seq_show_option(seq, "usrjquota", + F2FS_OPTION(sbi).s_qf_names[USRQUOTA]); + + if (F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) + seq_show_option(seq, "grpjquota", + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]); + + if (F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) + seq_show_option(seq, "prjjquota", + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]); +#endif +} + +static int f2fs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); + + if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) { + if (test_opt(sbi, FORCE_FG_GC)) + seq_printf(seq, ",background_gc=%s", "sync"); + else + seq_printf(seq, ",background_gc=%s", "on"); + } else { + seq_printf(seq, ",background_gc=%s", "off"); + } + if (test_opt(sbi, DISABLE_ROLL_FORWARD)) + seq_puts(seq, ",disable_roll_forward"); + if (test_opt(sbi, DISCARD)) + seq_puts(seq, ",discard"); + if (test_opt(sbi, NOHEAP)) + seq_puts(seq, ",no_heap"); + else + seq_puts(seq, ",heap"); +#ifdef CONFIG_F2FS_FS_XATTR + if (test_opt(sbi, XATTR_USER)) + seq_puts(seq, ",user_xattr"); + else + seq_puts(seq, ",nouser_xattr"); + if (test_opt(sbi, INLINE_XATTR)) + seq_puts(seq, ",inline_xattr"); + else + seq_puts(seq, ",noinline_xattr"); + if (test_opt(sbi, INLINE_XATTR_SIZE)) + seq_printf(seq, ",inline_xattr_size=%u", + F2FS_OPTION(sbi).inline_xattr_size); +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL + if (test_opt(sbi, POSIX_ACL)) + seq_puts(seq, ",acl"); + else + seq_puts(seq, ",noacl"); +#endif + if (test_opt(sbi, DISABLE_EXT_IDENTIFY)) + seq_puts(seq, ",disable_ext_identify"); + if (test_opt(sbi, INLINE_DATA)) + seq_puts(seq, ",inline_data"); + else + seq_puts(seq, ",noinline_data"); + if (test_opt(sbi, INLINE_DENTRY)) + seq_puts(seq, ",inline_dentry"); + else + seq_puts(seq, ",noinline_dentry"); + if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) + seq_puts(seq, ",flush_merge"); + if (test_opt(sbi, NOBARRIER)) + seq_puts(seq, ",nobarrier"); + if (test_opt(sbi, FASTBOOT)) + seq_puts(seq, ",fastboot"); + if (test_opt(sbi, EXTENT_CACHE)) + seq_puts(seq, ",extent_cache"); + else + seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, DATA_FLUSH)) + seq_puts(seq, ",data_flush"); + + seq_puts(seq, ",mode="); + if (test_opt(sbi, ADAPTIVE)) + seq_puts(seq, "adaptive"); + else if (test_opt(sbi, LFS)) + seq_puts(seq, "lfs"); + seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); + if (test_opt(sbi, RESERVE_ROOT)) + seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", + F2FS_OPTION(sbi).root_reserved_blocks, + from_kuid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resuid), + from_kgid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resgid)); + if (F2FS_IO_SIZE_BITS(sbi)) + seq_printf(seq, ",io_bits=%u", + F2FS_OPTION(sbi).write_io_size_bits); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (test_opt(sbi, FAULT_INJECTION)) { + seq_printf(seq, ",fault_injection=%u", + F2FS_OPTION(sbi).fault_info.inject_rate); + seq_printf(seq, ",fault_type=%u", + F2FS_OPTION(sbi).fault_info.inject_type); + } +#endif +#ifdef CONFIG_QUOTA + if (test_opt(sbi, QUOTA)) + seq_puts(seq, ",quota"); + if (test_opt(sbi, USRQUOTA)) + seq_puts(seq, ",usrquota"); + if (test_opt(sbi, GRPQUOTA)) + seq_puts(seq, ",grpquota"); + if (test_opt(sbi, PRJQUOTA)) + seq_puts(seq, ",prjquota"); +#endif + f2fs_show_quota_options(seq, sbi->sb); + if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) + seq_printf(seq, ",whint_mode=%s", "user-based"); + else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) + seq_printf(seq, ",whint_mode=%s", "fs-based"); +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (F2FS_OPTION(sbi).test_dummy_encryption) + seq_puts(seq, ",test_dummy_encryption"); +#endif + + if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT) + seq_printf(seq, ",alloc_mode=%s", "default"); + else if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) + seq_printf(seq, ",alloc_mode=%s", "reuse"); + + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX) + seq_printf(seq, ",fsync_mode=%s", "posix"); + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) + seq_printf(seq, ",fsync_mode=%s", "strict"); + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_NOBARRIER) + seq_printf(seq, ",fsync_mode=%s", "nobarrier"); + return 0; +} + +static void default_options(struct f2fs_sb_info *sbi) +{ + /* init some FS parameters */ + F2FS_OPTION(sbi).active_logs = NR_CURSEG_TYPE; + F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; + F2FS_OPTION(sbi).test_dummy_encryption = false; + F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); + F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); + + set_opt(sbi, BG_GC); + set_opt(sbi, INLINE_XATTR); + set_opt(sbi, INLINE_DATA); + set_opt(sbi, INLINE_DENTRY); + set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, NOHEAP); + sbi->sb->s_flags |= SB_LAZYTIME; + set_opt(sbi, FLUSH_MERGE); + set_opt(sbi, DISCARD); + if (f2fs_sb_has_blkzoned(sbi->sb)) + set_opt_mode(sbi, F2FS_MOUNT_LFS); + else + set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + +#ifdef CONFIG_F2FS_FS_XATTR + set_opt(sbi, XATTR_USER); +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL + set_opt(sbi, POSIX_ACL); +#endif + + f2fs_build_fault_attr(sbi, 0, 0); +} + +#ifdef CONFIG_QUOTA +static int f2fs_enable_quotas(struct super_block *sb); +#endif +static int f2fs_remount(struct super_block *sb, int *flags, char *data) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_mount_info org_mount_opt; + unsigned long old_sb_flags; + int err; + bool need_restart_gc = false; + bool need_stop_gc = false; + bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); +#ifdef CONFIG_QUOTA + int i, j; +#endif + + /* + * Save the old mount options in case we + * need to restore them. + */ + org_mount_opt = sbi->mount_opt; + old_sb_flags = sb->s_flags; + +#ifdef CONFIG_QUOTA + org_mount_opt.s_jquota_fmt = F2FS_OPTION(sbi).s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (F2FS_OPTION(sbi).s_qf_names[i]) { + org_mount_opt.s_qf_names[i] = + kstrdup(F2FS_OPTION(sbi).s_qf_names[i], + GFP_KERNEL); + if (!org_mount_opt.s_qf_names[i]) { + for (j = 0; j < i; j++) + kfree(org_mount_opt.s_qf_names[j]); + return -ENOMEM; + } + } else { + org_mount_opt.s_qf_names[i] = NULL; + } + } +#endif + + /* recover superblocks we couldn't write due to previous RO mount */ + if (!(*flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + err = f2fs_commit_super(sbi, false); + f2fs_msg(sb, KERN_INFO, + "Try to recover all the superblocks, ret: %d", err); + if (!err) + clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); + } + + default_options(sbi); + + /* parse mount options */ + err = parse_options(sb, data); + if (err) + goto restore_opts; + + /* + * Previous and new state of filesystem is RO, + * so skip checking GC and FLUSH_MERGE conditions. + */ + if (f2fs_readonly(sb) && (*flags & SB_RDONLY)) + goto skip; + +#ifdef CONFIG_QUOTA + if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + } else if (f2fs_readonly(sb) && !(*flags & MS_RDONLY)) { + /* dquot_resume needs RW */ + sb->s_flags &= ~SB_RDONLY; + if (sb_any_quota_suspended(sb)) { + dquot_resume(sb, -1); + } else if (f2fs_sb_has_quota_ino(sb)) { + err = f2fs_enable_quotas(sb); + if (err) + goto restore_opts; + } + } +#endif + /* disallow enable/disable extent_cache dynamically */ + if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { + err = -EINVAL; + f2fs_msg(sbi->sb, KERN_WARNING, + "switch extent_cache option is not allowed"); + goto restore_opts; + } + + /* + * We stop the GC thread if FS is mounted as RO + * or if background_gc = off is passed in mount + * option. Also sync the filesystem. + */ + if ((*flags & SB_RDONLY) || !test_opt(sbi, BG_GC)) { + if (sbi->gc_thread) { + f2fs_stop_gc_thread(sbi); + need_restart_gc = true; + } + } else if (!sbi->gc_thread) { + err = f2fs_start_gc_thread(sbi); + if (err) + goto restore_opts; + need_stop_gc = true; + } + + if (*flags & SB_RDONLY || + F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) { + writeback_inodes_sb(sb, WB_REASON_SYNC); + sync_inodes_sb(sb); + + set_sbi_flag(sbi, SBI_IS_DIRTY); + set_sbi_flag(sbi, SBI_IS_CLOSE); + f2fs_sync_fs(sb, 1); + clear_sbi_flag(sbi, SBI_IS_CLOSE); + } + + /* + * We stop issue flush thread if FS is mounted as RO + * or if flush_merge is not passed in mount option. + */ + if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { + clear_opt(sbi, FLUSH_MERGE); + f2fs_destroy_flush_cmd_control(sbi, false); + } else { + err = f2fs_create_flush_cmd_control(sbi); + if (err) + goto restore_gc; + } +skip: +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + kfree(org_mount_opt.s_qf_names[i]); +#endif + /* Update the POSIXACL Flag */ + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); + + limit_reserve_root(sbi); + *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); + return 0; +restore_gc: + if (need_restart_gc) { + if (f2fs_start_gc_thread(sbi)) + f2fs_msg(sbi->sb, KERN_WARNING, + "background gc thread has stopped"); + } else if (need_stop_gc) { + f2fs_stop_gc_thread(sbi); + } +restore_opts: +#ifdef CONFIG_QUOTA + F2FS_OPTION(sbi).s_jquota_fmt = org_mount_opt.s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + kfree(F2FS_OPTION(sbi).s_qf_names[i]); + F2FS_OPTION(sbi).s_qf_names[i] = org_mount_opt.s_qf_names[i]; + } +#endif + sbi->mount_opt = org_mount_opt; + sb->s_flags = old_sb_flags; + return err; +} + +#ifdef CONFIG_QUOTA +/* Read data from quotafile */ +static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + block_t blkidx = F2FS_BYTES_TO_BLK(off); + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + loff_t i_size = i_size_read(inode); + struct page *page; + char *kaddr; + + if (off > i_size) + return 0; + + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); +repeat: + page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS); + if (IS_ERR(page)) { + if (PTR_ERR(page) == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto repeat; + } + return PTR_ERR(page); + } + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return -EIO; + } + + kaddr = kmap_atomic(page); + memcpy(data, kaddr + offset, tocopy); + kunmap_atomic(kaddr); + f2fs_put_page(page, 1); + + offset = 0; + toread -= tocopy; + data += tocopy; + blkidx++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t f2fs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + int offset = off & (sb->s_blocksize - 1); + size_t towrite = len; + struct page *page; + void *fsdata = NULL; + char *kaddr; + int err = 0; + int tocopy; + + while (towrite > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, + towrite); +retry: + err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, + &page, &fsdata); + if (unlikely(err)) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + break; + } + + kaddr = kmap_atomic(page); + memcpy(kaddr + offset, data, tocopy); + kunmap_atomic(kaddr); + flush_dcache_page(page); + + a_ops->write_end(NULL, mapping, off, tocopy, tocopy, + page, fsdata); + offset = 0; + towrite -= tocopy; + off += tocopy; + data += tocopy; + cond_resched(); + } + + if (len == towrite) + return err; + inode->i_mtime = inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, false); + return len - towrite; +} + +static struct dquot **f2fs_get_dquots(struct inode *inode) +{ + return F2FS_I(inode)->i_dquot; +} + +static qsize_t *f2fs_get_reserved_space(struct inode *inode) +{ + return &F2FS_I(inode)->i_reserved_quota; +} + +static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) +{ + return dquot_quota_on_mount(sbi->sb, F2FS_OPTION(sbi).s_qf_names[type], + F2FS_OPTION(sbi).s_jquota_fmt, type); +} + +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) +{ + int enabled = 0; + int i, err; + + if (f2fs_sb_has_quota_ino(sbi->sb) && rdonly) { + err = f2fs_enable_quotas(sbi->sb); + if (err) { + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on quota_ino: %d", err); + return 0; + } + return 1; + } + + for (i = 0; i < MAXQUOTAS; i++) { + if (F2FS_OPTION(sbi).s_qf_names[i]) { + err = f2fs_quota_on_mount(sbi, i); + if (!err) { + enabled = 1; + continue; + } + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on quotas: %d on %d", err, i); + } + } + return enabled; +} + +static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, + unsigned int flags) +{ + struct inode *qf_inode; + unsigned long qf_inum; + int err; + + BUG_ON(!f2fs_sb_has_quota_ino(sb)); + + qf_inum = f2fs_qf_ino(sb, type); + if (!qf_inum) + return -EPERM; + + qf_inode = f2fs_iget(sb, qf_inum); + if (IS_ERR(qf_inode)) { + f2fs_msg(sb, KERN_ERR, + "Bad quota inode %u:%lu", type, qf_inum); + return PTR_ERR(qf_inode); + } + + /* Don't account quota for quota files to avoid recursion */ + qf_inode->i_flags |= S_NOQUOTA; + err = dquot_enable(qf_inode, type, format_id, flags); + iput(qf_inode); + return err; +} + +static int f2fs_enable_quotas(struct super_block *sb) +{ + int type, err = 0; + unsigned long qf_inum; + bool quota_mopt[MAXQUOTAS] = { + test_opt(F2FS_SB(sb), USRQUOTA), + test_opt(F2FS_SB(sb), GRPQUOTA), + test_opt(F2FS_SB(sb), PRJQUOTA), + }; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; + for (type = 0; type < MAXQUOTAS; type++) { + qf_inum = f2fs_qf_ino(sb, type); + if (qf_inum) { + err = f2fs_quota_enable(sb, type, QFMT_VFS_V1, + DQUOT_USAGE_ENABLED | + (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to enable quota tracking " + "(type=%d, err=%d). Please run " + "fsck to fix.", type, err); + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + return err; + } + } + } + return 0; +} + +static int f2fs_quota_sync(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; + int ret; + + ret = dquot_writeback_dquots(sb, type); + if (ret) + return ret; + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. + */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + + ret = filemap_write_and_wait(dqopt->files[cnt]->i_mapping); + if (ret) + return ret; + + inode_lock(dqopt->files[cnt]); + truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); + inode_unlock(dqopt->files[cnt]); + } + return 0; +} + +static int f2fs_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path) +{ + struct inode *inode; + int err; + + err = f2fs_quota_sync(sb, type); + if (err) + return err; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; + + inode = d_inode(path->dentry); + + inode_lock(inode); + F2FS_I(inode)->i_flags |= F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL; + f2fs_set_inode_flags(inode); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); + + return 0; +} + +static int f2fs_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + int err; + + if (!inode || !igrab(inode)) + return dquot_quota_off(sb, type); + + err = f2fs_quota_sync(sb, type); + if (err) + goto out_put; + + err = dquot_quota_off(sb, type); + if (err || f2fs_sb_has_quota_ino(sb)) + goto out_put; + + inode_lock(inode); + F2FS_I(inode)->i_flags &= ~(F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL); + f2fs_set_inode_flags(inode); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); +out_put: + iput(inode); + return err; +} + +void f2fs_quota_off_umount(struct super_block *sb) +{ + int type; + int err; + + for (type = 0; type < MAXQUOTAS; type++) { + err = f2fs_quota_off(sb, type); + if (err) { + int ret = dquot_quota_off(sb, type); + + f2fs_msg(sb, KERN_ERR, + "Fail to turn off disk quota " + "(type: %d, err: %d, ret:%d), Please " + "run fsck to fix it.", type, err, ret); + set_sbi_flag(F2FS_SB(sb), SBI_NEED_FSCK); + } + } + /* + * In case of checkpoint=disable, we must flush quota blocks. + * This can cause NULL exception for node_inode in end_io, since + * put_super already dropped it. + */ + sync_filesystem(sb); +} + +static void f2fs_truncate_quota_inode_pages(struct super_block *sb) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int type; + + for (type = 0; type < MAXQUOTAS; type++) { + if (!dqopt->files[type]) + continue; + f2fs_inode_synced(dqopt->files[type]); + } +} + + +static int f2fs_get_projid(struct inode *inode, kprojid_t *projid) +{ + *projid = F2FS_I(inode)->i_projid; + return 0; +} + +static const struct dquot_operations f2fs_quota_operations = { + .get_reserved_space = f2fs_get_reserved_space, + .write_dquot = dquot_commit, + .acquire_dquot = dquot_acquire, + .release_dquot = dquot_release, + .mark_dirty = dquot_mark_dquot_dirty, + .write_info = dquot_commit_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, + .get_projid = f2fs_get_projid, + .get_next_id = dquot_get_next_id, +}; + +static const struct quotactl_ops f2fs_quotactl_ops = { + .quota_on = f2fs_quota_on, + .quota_off = f2fs_quota_off, + .quota_sync = f2fs_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, + .get_nextdqblk = dquot_get_next_dqblk, +}; +#else +void f2fs_quota_off_umount(struct super_block *sb) +{ +} +#endif + +static const struct super_operations f2fs_sops = { + .alloc_inode = f2fs_alloc_inode, + .drop_inode = f2fs_drop_inode, + .destroy_inode = f2fs_destroy_inode, + .write_inode = f2fs_write_inode, + .dirty_inode = f2fs_dirty_inode, + .show_options = f2fs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = f2fs_quota_read, + .quota_write = f2fs_quota_write, + .get_dquots = f2fs_get_dquots, +#endif + .evict_inode = f2fs_evict_inode, + .put_super = f2fs_put_super, + .sync_fs = f2fs_sync_fs, + .freeze_fs = f2fs_freeze, + .unfreeze_fs = f2fs_unfreeze, + .statfs = f2fs_statfs, + .remount_fs = f2fs_remount, +}; + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) +{ + return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, NULL); +} + +static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, + void *fs_data) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* + * Encrypting the root directory is not allowed because fsck + * expects lost+found directory to exist and remain unencrypted + * if LOST_FOUND feature is enabled. + * + */ + if (f2fs_sb_has_lost_found(sbi->sb) && + inode->i_ino == F2FS_ROOT_INO(sbi)) + return -EPERM; + + return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, fs_data, XATTR_CREATE); +} + +static bool f2fs_dummy_context(struct inode *inode) +{ + return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode)); +} + +static const struct fscrypt_operations f2fs_cryptops = { + .key_prefix = "f2fs:", + .get_context = f2fs_get_context, + .set_context = f2fs_set_context, + .dummy_context = f2fs_dummy_context, + .empty_dir = f2fs_empty_dir, + .max_namelen = F2FS_NAME_LEN, +}; +#endif + +static struct inode *f2fs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *inode; + + if (f2fs_check_nid_range(sbi, ino)) + return ERR_PTR(-ESTALE); + + /* + * f2fs_iget isn't quite right if the inode is currently unallocated! + * However f2fs_iget currently does appropriate checks to handle stale + * inodes so everything is OK. + */ + inode = f2fs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (unlikely(generation && inode->i_generation != generation)) { + /* we didn't find the right inode.. */ + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *f2fs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + f2fs_nfs_get_inode); +} + +static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + f2fs_nfs_get_inode); +} + +static const struct export_operations f2fs_export_ops = { + .fh_to_dentry = f2fs_fh_to_dentry, + .fh_to_parent = f2fs_fh_to_parent, + .get_parent = f2fs_get_parent, +}; + +static loff_t max_file_blocks(void) +{ + loff_t result = 0; + loff_t leaf_count = ADDRS_PER_BLOCK; + + /* + * note: previously, result is equal to (DEF_ADDRS_PER_INODE - + * DEFAULT_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * space in inode.i_addr, it will be more safe to reassign + * result as zero. + */ + + /* two direct node blocks */ + result += (leaf_count * 2); + + /* two indirect node blocks */ + leaf_count *= NIDS_PER_BLOCK; + result += (leaf_count * 2); + + /* one double indirect node block */ + leaf_count *= NIDS_PER_BLOCK; + result += leaf_count; + + return result; +} + +static int __f2fs_commit_super(struct buffer_head *bh, + struct f2fs_super_block *super) +{ + lock_buffer(bh); + if (super) + memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); + set_buffer_dirty(bh); + unlock_buffer(bh); + + /* it's rare case, we can do fua all the time */ + return __sync_dirty_buffer(bh, REQ_SYNC | REQ_PREFLUSH | REQ_FUA); +} + +static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, + struct buffer_head *bh) +{ + struct f2fs_super_block *raw_super = (struct f2fs_super_block *) + (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; + u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); + u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr); + u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr); + u32 nat_blkaddr = le32_to_cpu(raw_super->nat_blkaddr); + u32 ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + u32 main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); + u32 segment_count_ckpt = le32_to_cpu(raw_super->segment_count_ckpt); + u32 segment_count_sit = le32_to_cpu(raw_super->segment_count_sit); + u32 segment_count_nat = le32_to_cpu(raw_super->segment_count_nat); + u32 segment_count_ssa = le32_to_cpu(raw_super->segment_count_ssa); + u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main); + u32 segment_count = le32_to_cpu(raw_super->segment_count); + u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + u64 main_end_blkaddr = main_blkaddr + + (segment_count_main << log_blocks_per_seg); + u64 seg_end_blkaddr = segment0_blkaddr + + (segment_count << log_blocks_per_seg); + + if (segment0_blkaddr != cp_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Mismatch start address, segment0(%u) cp_blkaddr(%u)", + segment0_blkaddr, cp_blkaddr); + return true; + } + + if (cp_blkaddr + (segment_count_ckpt << log_blocks_per_seg) != + sit_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong CP boundary, start(%u) end(%u) blocks(%u)", + cp_blkaddr, sit_blkaddr, + segment_count_ckpt << log_blocks_per_seg); + return true; + } + + if (sit_blkaddr + (segment_count_sit << log_blocks_per_seg) != + nat_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong SIT boundary, start(%u) end(%u) blocks(%u)", + sit_blkaddr, nat_blkaddr, + segment_count_sit << log_blocks_per_seg); + return true; + } + + if (nat_blkaddr + (segment_count_nat << log_blocks_per_seg) != + ssa_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong NAT boundary, start(%u) end(%u) blocks(%u)", + nat_blkaddr, ssa_blkaddr, + segment_count_nat << log_blocks_per_seg); + return true; + } + + if (ssa_blkaddr + (segment_count_ssa << log_blocks_per_seg) != + main_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong SSA boundary, start(%u) end(%u) blocks(%u)", + ssa_blkaddr, main_blkaddr, + segment_count_ssa << log_blocks_per_seg); + return true; + } + + if (main_end_blkaddr > seg_end_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong MAIN_AREA boundary, start(%u) end(%u) block(%u)", + main_blkaddr, + segment0_blkaddr + + (segment_count << log_blocks_per_seg), + segment_count_main << log_blocks_per_seg); + return true; + } else if (main_end_blkaddr < seg_end_blkaddr) { + int err = 0; + char *res; + + /* fix in-memory information all the time */ + raw_super->segment_count = cpu_to_le32((main_end_blkaddr - + segment0_blkaddr) >> log_blocks_per_seg); + + if (f2fs_readonly(sb) || bdev_read_only(sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + res = "internally"; + } else { + err = __f2fs_commit_super(bh, NULL); + res = err ? "failed" : "done"; + } + f2fs_msg(sb, KERN_INFO, + "Fix alignment : %s, start(%u) end(%u) block(%u)", + res, main_blkaddr, + segment0_blkaddr + + (segment_count << log_blocks_per_seg), + segment_count_main << log_blocks_per_seg); + if (err) + return true; + } + return false; +} + +static int sanity_check_raw_super(struct f2fs_sb_info *sbi, + struct buffer_head *bh) +{ + block_t segment_count, segs_per_sec, secs_per_zone; + block_t total_sections, blocks_per_seg; + struct f2fs_super_block *raw_super = (struct f2fs_super_block *) + (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; + unsigned int blocksize; + + if (le32_to_cpu(raw_super->magic) != F2FS_SUPER_MAGIC) { + f2fs_msg(sb, KERN_INFO, + "Magic Mismatch, valid(0x%x) - read(0x%x)", + F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic)); + return -EINVAL; + } + + /* Currently, support only 4KB page cache size */ + if (F2FS_BLKSIZE != PAGE_SIZE) { + f2fs_msg(sb, KERN_INFO, + "Invalid page_cache_size (%lu), supports only 4KB\n", + PAGE_SIZE); + return -EFSCORRUPTED; + } + + /* Currently, support only 4KB block size */ + blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); + if (blocksize != F2FS_BLKSIZE) { + f2fs_msg(sb, KERN_INFO, + "Invalid blocksize (%u), supports only 4KB\n", + blocksize); + return -EFSCORRUPTED; + } + + /* check log blocks per segment */ + if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) { + f2fs_msg(sb, KERN_INFO, + "Invalid log blocks per segment (%u)\n", + le32_to_cpu(raw_super->log_blocks_per_seg)); + return -EFSCORRUPTED; + } + + /* Currently, support 512/1024/2048/4096 bytes sector size */ + if (le32_to_cpu(raw_super->log_sectorsize) > + F2FS_MAX_LOG_SECTOR_SIZE || + le32_to_cpu(raw_super->log_sectorsize) < + F2FS_MIN_LOG_SECTOR_SIZE) { + f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize (%u)", + le32_to_cpu(raw_super->log_sectorsize)); + return -EFSCORRUPTED; + } + if (le32_to_cpu(raw_super->log_sectors_per_block) + + le32_to_cpu(raw_super->log_sectorsize) != + F2FS_MAX_LOG_SECTOR_SIZE) { + f2fs_msg(sb, KERN_INFO, + "Invalid log sectors per block(%u) log sectorsize(%u)", + le32_to_cpu(raw_super->log_sectors_per_block), + le32_to_cpu(raw_super->log_sectorsize)); + return -EFSCORRUPTED; + } + + segment_count = le32_to_cpu(raw_super->segment_count); + segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); + secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); + total_sections = le32_to_cpu(raw_super->section_count); + + /* blocks_per_seg should be 512, given the above check */ + blocks_per_seg = 1 << le32_to_cpu(raw_super->log_blocks_per_seg); + + if (segment_count > F2FS_MAX_SEGMENT || + segment_count < F2FS_MIN_SEGMENTS) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment count (%u)", + segment_count); + return -EFSCORRUPTED; + } + + if (total_sections > segment_count || + total_sections < F2FS_MIN_SEGMENTS || + segs_per_sec > segment_count || !segs_per_sec) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment/section count (%u, %u x %u)", + segment_count, total_sections, segs_per_sec); + return -EFSCORRUPTED; + } + + if ((segment_count / segs_per_sec) < total_sections) { + f2fs_msg(sb, KERN_INFO, + "Small segment_count (%u < %u * %u)", + segment_count, segs_per_sec, total_sections); + return -EFSCORRUPTED; + } + + if (segment_count > (le64_to_cpu(raw_super->block_count) >> 9)) { + f2fs_msg(sb, KERN_INFO, + "Wrong segment_count / block_count (%u > %llu)", + segment_count, le64_to_cpu(raw_super->block_count)); + return -EFSCORRUPTED; + } + + if (secs_per_zone > total_sections || !secs_per_zone) { + f2fs_msg(sb, KERN_INFO, + "Wrong secs_per_zone / total_sections (%u, %u)", + secs_per_zone, total_sections); + return -EFSCORRUPTED; + } + if (le32_to_cpu(raw_super->extension_count) > F2FS_MAX_EXTENSION || + raw_super->hot_ext_count > F2FS_MAX_EXTENSION || + (le32_to_cpu(raw_super->extension_count) + + raw_super->hot_ext_count) > F2FS_MAX_EXTENSION) { + f2fs_msg(sb, KERN_INFO, + "Corrupted extension count (%u + %u > %u)", + le32_to_cpu(raw_super->extension_count), + raw_super->hot_ext_count, + F2FS_MAX_EXTENSION); + return -EFSCORRUPTED; + } + + if (le32_to_cpu(raw_super->cp_payload) > + (blocks_per_seg - F2FS_CP_PACKS)) { + f2fs_msg(sb, KERN_INFO, + "Insane cp_payload (%u > %u)", + le32_to_cpu(raw_super->cp_payload), + blocks_per_seg - F2FS_CP_PACKS); + return -EFSCORRUPTED; + } + + /* check reserved ino info */ + if (le32_to_cpu(raw_super->node_ino) != 1 || + le32_to_cpu(raw_super->meta_ino) != 2 || + le32_to_cpu(raw_super->root_ino) != 3) { + f2fs_msg(sb, KERN_INFO, + "Invalid Fs Meta Ino: node(%u) meta(%u) root(%u)", + le32_to_cpu(raw_super->node_ino), + le32_to_cpu(raw_super->meta_ino), + le32_to_cpu(raw_super->root_ino)); + return -EFSCORRUPTED; + } + + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ + if (sanity_check_area_boundary(sbi, bh)) + return -EFSCORRUPTED; + + return 0; +} + +int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) +{ + unsigned int total, fsmeta; + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int ovp_segments, reserved_segments; + unsigned int main_segs, blocks_per_seg; + unsigned int sit_segs, nat_segs; + unsigned int sit_bitmap_size, nat_bitmap_size; + unsigned int log_blocks_per_seg; + unsigned int segment_count_main; + unsigned int cp_pack_start_sum, cp_payload; + block_t user_block_count; + int i, j; + + total = le32_to_cpu(raw_super->segment_count); + fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); + sit_segs = le32_to_cpu(raw_super->segment_count_sit); + fsmeta += sit_segs; + nat_segs = le32_to_cpu(raw_super->segment_count_nat); + fsmeta += nat_segs; + fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); + fsmeta += le32_to_cpu(raw_super->segment_count_ssa); + + if (unlikely(fsmeta >= total)) + return 1; + + ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); + reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); + + if (unlikely(fsmeta < F2FS_MIN_SEGMENTS || + ovp_segments == 0 || reserved_segments == 0)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong layout: check mkfs.f2fs version"); + return 1; + } + + user_block_count = le64_to_cpu(ckpt->user_block_count); + segment_count_main = le32_to_cpu(raw_super->segment_count_main); + log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + if (!user_block_count || user_block_count >= + segment_count_main << log_blocks_per_seg) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong user_block_count: %u", user_block_count); + return 1; + } + + main_segs = le32_to_cpu(raw_super->segment_count_main); + blocks_per_seg = sbi->blocks_per_seg; + + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) + return 1; + for (j = i + 1; j < NR_CURSEG_NODE_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) == + le32_to_cpu(ckpt->cur_node_segno[j])) { + f2fs_msg(sbi->sb, KERN_ERR, + "Node segment (%u, %u) has the same " + "segno: %u", i, j, + le32_to_cpu(ckpt->cur_node_segno[i])); + return 1; + } + } + } + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) + return 1; + for (j = i + 1; j < NR_CURSEG_DATA_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) == + le32_to_cpu(ckpt->cur_data_segno[j])) { + f2fs_msg(sbi->sb, KERN_ERR, + "Data segment (%u, %u) has the same " + "segno: %u", i, j, + le32_to_cpu(ckpt->cur_data_segno[i])); + return 1; + } + } + } + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + for (j = 0; j < NR_CURSEG_DATA_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) == + le32_to_cpu(ckpt->cur_data_segno[j])) { + f2fs_msg(sbi->sb, KERN_ERR, + "Node segment (%u) and Data segment (%u)" + " has the same segno: %u", i, j, + le32_to_cpu(ckpt->cur_node_segno[i])); + return 1; + } + } + } + + sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); + nat_bitmap_size = le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); + + if (sit_bitmap_size != ((sit_segs / 2) << log_blocks_per_seg) / 8 || + nat_bitmap_size != ((nat_segs / 2) << log_blocks_per_seg) / 8) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong bitmap size: sit: %u, nat:%u", + sit_bitmap_size, nat_bitmap_size); + return 1; + } + + cp_pack_start_sum = __start_sum_addr(sbi); + cp_payload = __cp_payload(sbi); + if (cp_pack_start_sum < cp_payload + 1 || + cp_pack_start_sum > blocks_per_seg - 1 - + NR_CURSEG_TYPE) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong cp_pack_start_sum: %u", + cp_pack_start_sum); + return 1; + } + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); + return 1; + } + return 0; +} + +static void init_sb_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = sbi->raw_super; + int i, j; + + sbi->log_sectors_per_block = + le32_to_cpu(raw_super->log_sectors_per_block); + sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize); + sbi->blocksize = 1 << sbi->log_blocksize; + sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg; + sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); + sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); + sbi->total_sections = le32_to_cpu(raw_super->section_count); + sbi->total_node_count = + (le32_to_cpu(raw_super->segment_count_nat) / 2) + * sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK; + sbi->root_ino_num = le32_to_cpu(raw_super->root_ino); + sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); + sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); + sbi->cur_victim_sec = NULL_SECNO; + sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; + + sbi->dir_level = DEF_DIR_LEVEL; + sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; + sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; + clear_sbi_flag(sbi, SBI_NEED_FSCK); + + for (i = 0; i < NR_COUNT_TYPE; i++) + atomic_set(&sbi->nr_pages[i], 0); + + for (i = 0; i < META; i++) + atomic_set(&sbi->wb_sync_req[i], 0); + + INIT_LIST_HEAD(&sbi->s_list); + mutex_init(&sbi->umount_mutex); + for (i = 0; i < NR_PAGE_TYPE - 1; i++) + for (j = HOT; j < NR_TEMP_TYPE; j++) + mutex_init(&sbi->wio_mutex[i][j]); + init_rwsem(&sbi->io_order_lock); + spin_lock_init(&sbi->cp_lock); + + sbi->dirty_device = 0; + spin_lock_init(&sbi->dev_lock); + + init_rwsem(&sbi->sb_lock); +} + +static int init_percpu_info(struct f2fs_sb_info *sbi) +{ + int err; + + err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); + if (err) + return err; + + err = percpu_counter_init(&sbi->total_valid_inode_count, 0, + GFP_KERNEL); + if (err) + percpu_counter_destroy(&sbi->alloc_valid_block_count); + + return err; +} + +#ifdef CONFIG_BLK_DEV_ZONED +static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) +{ + struct block_device *bdev = FDEV(devi).bdev; + sector_t nr_sectors = bdev->bd_part->nr_sects; + sector_t sector = 0; + struct blk_zone *zones; + unsigned int i, nr_zones; + unsigned int n = 0; + int err = -EIO; + + if (!f2fs_sb_has_blkzoned(sbi->sb)) + return 0; + + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != + SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) + return -EINVAL; + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); + if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != + __ilog2_u32(sbi->blocks_per_blkz)) + return -EINVAL; + sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); + FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; + if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) + FDEV(devi).nr_blkz++; + + FDEV(devi).blkz_type = f2fs_kmalloc(sbi, FDEV(devi).nr_blkz, + GFP_KERNEL); + if (!FDEV(devi).blkz_type) + return -ENOMEM; + +#define F2FS_REPORT_NR_ZONES 4096 + + zones = f2fs_kzalloc(sbi, + array_size(F2FS_REPORT_NR_ZONES, + sizeof(struct blk_zone)), + GFP_KERNEL); + if (!zones) + return -ENOMEM; + + /* Get block zones type */ + while (zones && sector < nr_sectors) { + + nr_zones = F2FS_REPORT_NR_ZONES; + err = blkdev_report_zones(bdev, sector, + zones, &nr_zones, + GFP_KERNEL); + if (err) + break; + if (!nr_zones) { + err = -EIO; + break; + } + + for (i = 0; i < nr_zones; i++) { + FDEV(devi).blkz_type[n] = zones[i].type; + sector += zones[i].len; + n++; + } + } + + kfree(zones); + + return err; +} +#endif + +/* + * Read f2fs raw super block. + * Because we have two copies of super block, so read both of them + * to get the first valid one. If any one of them is broken, we pass + * them recovery flag back to the caller. + */ +static int read_raw_super_block(struct f2fs_sb_info *sbi, + struct f2fs_super_block **raw_super, + int *valid_super_block, int *recovery) +{ + struct super_block *sb = sbi->sb; + int block; + struct buffer_head *bh; + struct f2fs_super_block *super; + int err = 0; + + super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL); + if (!super) + return -ENOMEM; + + for (block = 0; block < 2; block++) { + bh = sb_bread(sb, block); + if (!bh) { + f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", + block + 1); + err = -EIO; + continue; + } + + /* sanity checking of raw super */ + err = sanity_check_raw_super(sbi, bh); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Can't find valid F2FS filesystem in %dth superblock", + block + 1); + brelse(bh); + continue; + } + + if (!*raw_super) { + memcpy(super, bh->b_data + F2FS_SUPER_OFFSET, + sizeof(*super)); + *valid_super_block = block; + *raw_super = super; + } + brelse(bh); + } + + /* Fail to read any one of the superblocks*/ + if (err < 0) + *recovery = 1; + + /* No valid superblock */ + if (!*raw_super) + kfree(super); + else + err = 0; + + return err; +} + +int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) +{ + struct buffer_head *bh; + int err; + + if ((recover && f2fs_readonly(sbi->sb)) || + bdev_read_only(sbi->sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + return -EROFS; + } + + /* write back-up superblock first */ + bh = sb_bread(sbi->sb, sbi->valid_super_block ? 0 : 1); + if (!bh) + return -EIO; + err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); + brelse(bh); + + /* if we are in recovery path, skip writing valid superblock */ + if (recover || err) + return err; + + /* write current valid superblock */ + bh = sb_bread(sbi->sb, sbi->valid_super_block); + if (!bh) + return -EIO; + err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); + brelse(bh); + return err; +} + +static int f2fs_scan_devices(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + unsigned int max_devices = MAX_DEVICES; + int i; + + /* Initialize single device information */ + if (!RDEV(0).path[0]) { + if (!bdev_is_zoned(sbi->sb->s_bdev)) + return 0; + max_devices = 1; + } + + /* + * Initialize multiple devices information, or single + * zoned block device information. + */ + sbi->devs = f2fs_kzalloc(sbi, + array_size(max_devices, + sizeof(struct f2fs_dev_info)), + GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; + + for (i = 0; i < max_devices; i++) { + + if (i > 0 && !RDEV(i).path[0]) + break; + + if (max_devices == 1) { + /* Single zoned block device mount */ + FDEV(0).bdev = + blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, + sbi->sb->s_mode, sbi->sb->s_type); + } else { + /* Multi-device mount */ + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = + le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + sbi->sb->s_mode, sbi->sb->s_type); + } + if (IS_ERR(FDEV(i).bdev)) + return PTR_ERR(FDEV(i).bdev); + + /* to release errored devices */ + sbi->s_ndevs = i + 1; + +#ifdef CONFIG_BLK_DEV_ZONED + if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && + !f2fs_sb_has_blkzoned(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Zoned block device feature not enabled\n"); + return -EINVAL; + } + if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { + if (init_blkz_info(sbi, i)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Failed to initialize F2FS blkzone information"); + return -EINVAL; + } + if (max_devices == 1) + break; + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk, + bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ? + "Host-aware" : "Host-managed"); + continue; + } +#endif + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk); + } + f2fs_msg(sbi->sb, KERN_INFO, + "IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi)); + return 0; +} + +static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_i = SM_I(sbi); + + /* adjust parameters according to the volume size */ + if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + sm_i->dcc_info->discard_granularity = 1; + sm_i->ipu_policy = 1 << F2FS_IPU_FORCE; + } + + sbi->readdir_ra = 1; +} + +static int f2fs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct f2fs_sb_info *sbi; + struct f2fs_super_block *raw_super; + struct inode *root; + int err; + bool retry = true, need_fsck = false; + char *options = NULL; + int recovery, i, valid_super_block; + struct curseg_info *seg_i; + +try_onemore: + err = -EINVAL; + raw_super = NULL; + valid_super_block = -1; + recovery = 0; + + /* allocate memory for f2fs-specific super block info */ + sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sbi->sb = sb; + + /* Load the checksum driver */ + sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); + if (IS_ERR(sbi->s_chksum_driver)) { + f2fs_msg(sb, KERN_ERR, "Cannot load crc32 driver."); + err = PTR_ERR(sbi->s_chksum_driver); + sbi->s_chksum_driver = NULL; + goto free_sbi; + } + + /* set a block size */ + if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { + f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); + goto free_sbi; + } + + err = read_raw_super_block(sbi, &raw_super, &valid_super_block, + &recovery); + if (err) + goto free_sbi; + + sb->s_fs_info = sbi; + sbi->raw_super = raw_super; + + /* precompute checksum seed for metadata */ + if (f2fs_sb_has_inode_chksum(sb)) + sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, + sizeof(raw_super->uuid)); + + /* + * The BLKZONED feature indicates that the drive was formatted with + * zone alignment optimization. This is optional for host-aware + * devices, but mandatory for host-managed zoned block devices. + */ +#ifndef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sb)) { + f2fs_msg(sb, KERN_ERR, + "Zoned block device support is not enabled\n"); + err = -EOPNOTSUPP; + goto free_sb_buf; + } +#endif + default_options(sbi); + /* parse mount options */ + options = kstrdup((const char *)data, GFP_KERNEL); + if (data && !options) { + err = -ENOMEM; + goto free_sb_buf; + } + + err = parse_options(sb, options); + if (err) + goto free_options; + + sbi->max_file_blocks = max_file_blocks(); + sb->s_maxbytes = sbi->max_file_blocks << + le32_to_cpu(raw_super->log_blocksize); + sb->s_max_links = F2FS_LINK_MAX; + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + +#ifdef CONFIG_QUOTA + sb->dq_op = &f2fs_quota_operations; + if (f2fs_sb_has_quota_ino(sb)) + sb->s_qcop = &dquot_quotactl_sysfile_ops; + else + sb->s_qcop = &f2fs_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; + + if (f2fs_sb_has_quota_ino(sbi->sb)) { + for (i = 0; i < MAXQUOTAS; i++) { + if (f2fs_qf_ino(sbi->sb, i)) + sbi->nquota_files++; + } + } +#endif + + sb->s_op = &f2fs_sops; +#ifdef CONFIG_F2FS_FS_ENCRYPTION + sb->s_cop = &f2fs_cryptops; +#endif + sb->s_xattr = f2fs_xattr_handlers; + sb->s_export_op = &f2fs_export_ops; + sb->s_magic = F2FS_SUPER_MAGIC; + sb->s_time_gran = 1; + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); + memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); + sb->s_iflags |= SB_I_CGROUPWB; + + /* init f2fs-specific super block info */ + sbi->valid_super_block = valid_super_block; + mutex_init(&sbi->gc_mutex); + mutex_init(&sbi->writepages); + mutex_init(&sbi->cp_mutex); + init_rwsem(&sbi->node_write); + init_rwsem(&sbi->node_change); + + /* disallow all the data/node/meta page writes */ + set_sbi_flag(sbi, SBI_POR_DOING); + spin_lock_init(&sbi->stat_lock); + + /* init iostat info */ + spin_lock_init(&sbi->iostat_lock); + sbi->iostat_enable = false; + + for (i = 0; i < NR_PAGE_TYPE; i++) { + int n = (i == META) ? 1: NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = + f2fs_kmalloc(sbi, + array_size(n, + sizeof(struct f2fs_bio_info)), + GFP_KERNEL); + if (!sbi->write_io[i]) { + err = -ENOMEM; + goto free_bio_info; + } + + for (j = HOT; j < n; j++) { + init_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + } + } + + init_rwsem(&sbi->cp_rwsem); + init_waitqueue_head(&sbi->cp_wait); + init_sb_info(sbi); + + err = init_percpu_info(sbi); + if (err) + goto free_bio_info; + + if (F2FS_IO_SIZE(sbi) > 1) { + sbi->write_io_dummy = + mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); + if (!sbi->write_io_dummy) { + err = -ENOMEM; + goto free_percpu; + } + } + + /* get an inode for meta space */ + sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); + if (IS_ERR(sbi->meta_inode)) { + f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); + err = PTR_ERR(sbi->meta_inode); + goto free_io_dummy; + } + + err = f2fs_get_valid_checkpoint(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint"); + goto free_meta_inode; + } + + /* Initialize device list */ + err = f2fs_scan_devices(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to find devices"); + goto free_devices; + } + + sbi->total_valid_node_count = + le32_to_cpu(sbi->ckpt->valid_node_count); + percpu_counter_set(&sbi->total_valid_inode_count, + le32_to_cpu(sbi->ckpt->valid_inode_count)); + sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count); + sbi->total_valid_block_count = + le64_to_cpu(sbi->ckpt->valid_block_count); + sbi->last_valid_block_count = sbi->total_valid_block_count; + sbi->reserved_blocks = 0; + sbi->current_reserved_blocks = 0; + limit_reserve_root(sbi); + + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } + + f2fs_init_extent_cache_info(sbi); + + f2fs_init_ino_entry_info(sbi); + + f2fs_init_fsync_node_info(sbi); + + /* setup f2fs internal modules */ + err = f2fs_build_segment_manager(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS segment manager"); + goto free_sm; + } + err = f2fs_build_node_manager(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS node manager"); + goto free_nm; + } + + /* For write statistics */ + if (sb->s_bdev->bd_part) + sbi->sectors_written_start = + (u64)part_stat_read(sb->s_bdev->bd_part, + sectors[STAT_WRITE]); + + /* Read accumulated write IO statistics if exists */ + seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); + if (__exist_node_summaries(sbi)) + sbi->kbytes_written = + le64_to_cpu(seg_i->journal->info.kbytes_written); + + f2fs_build_gc_manager(sbi); + + err = f2fs_build_stats(sbi); + if (err) + goto free_nm; + + /* get an inode for node space */ + sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); + if (IS_ERR(sbi->node_inode)) { + f2fs_msg(sb, KERN_ERR, "Failed to read node inode"); + err = PTR_ERR(sbi->node_inode); + goto free_stats; + } + + /* read root inode and dentry */ + root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); + if (IS_ERR(root)) { + f2fs_msg(sb, KERN_ERR, "Failed to read root inode"); + err = PTR_ERR(root); + goto free_node_inode; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || + !root->i_size || !root->i_nlink) { + iput(root); + err = -EINVAL; + goto free_node_inode; + } + + sb->s_root = d_make_root(root); /* allocate root dentry */ + if (!sb->s_root) { + err = -ENOMEM; + goto free_root_inode; + } + + err = f2fs_register_sysfs(sbi); + if (err) + goto free_root_inode; + +#ifdef CONFIG_QUOTA + /* Enable quota usage during mount */ + if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) { + err = f2fs_enable_quotas(sb); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", err); + goto free_sysfs; + } + } +#endif + /* if there are nt orphan nodes free them */ + err = f2fs_recover_orphan_inodes(sbi); + if (err) + goto free_meta; + + /* recover fsynced data */ + if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { + /* + * mount should be failed, when device has readonly mode, and + * previous checkpoint was not done by clean system shutdown. + */ + if (bdev_read_only(sb->s_bdev) && + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + err = -EROFS; + goto free_meta; + } + + if (need_fsck) + set_sbi_flag(sbi, SBI_NEED_FSCK); + + if (!retry) + goto skip_recovery; + + err = f2fs_recover_fsync_data(sbi, false); + if (err < 0) { + need_fsck = true; + f2fs_msg(sb, KERN_ERR, + "Cannot recover all fsync data errno=%d", err); + goto free_meta; + } + } else { + err = f2fs_recover_fsync_data(sbi, true); + + if (!f2fs_readonly(sb) && err > 0) { + err = -EINVAL; + f2fs_msg(sb, KERN_ERR, + "Need to recover fsync data"); + goto free_meta; + } + } +skip_recovery: + /* f2fs_recover_fsync_data() cleared this already */ + clear_sbi_flag(sbi, SBI_POR_DOING); + + /* + * If filesystem is not mounted as read-only then + * do start the gc_thread. + */ + if (test_opt(sbi, BG_GC) && !f2fs_readonly(sb)) { + /* After POR, we can run background GC thread.*/ + err = f2fs_start_gc_thread(sbi); + if (err) + goto free_meta; + } + kfree(options); + + /* recover broken superblock */ + if (recovery) { + err = f2fs_commit_super(sbi, true); + f2fs_msg(sb, KERN_INFO, + "Try to recover %dth superblock, ret: %d", + sbi->valid_super_block ? 1 : 2, err); + } + + f2fs_join_shrinker(sbi); + + f2fs_tuning_parameters(sbi); + + f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx", + cur_cp_version(F2FS_CKPT(sbi))); + f2fs_update_time(sbi, CP_TIME); + f2fs_update_time(sbi, REQ_TIME); + return 0; + +free_meta: +#ifdef CONFIG_QUOTA + f2fs_truncate_quota_inode_pages(sb); + if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) + f2fs_quota_off_umount(sbi->sb); +#endif + /* + * Some dirty meta pages can be produced by f2fs_recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by f2fs_write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in f2fs_sync_meta_pages(). + */ + truncate_inode_pages_final(META_MAPPING(sbi)); +#ifdef CONFIG_QUOTA +free_sysfs: +#endif + f2fs_unregister_sysfs(sbi); +free_root_inode: + dput(sb->s_root); + sb->s_root = NULL; +free_node_inode: + f2fs_release_ino_entry(sbi, true); + truncate_inode_pages_final(NODE_MAPPING(sbi)); + iput(sbi->node_inode); + sbi->node_inode = NULL; +free_stats: + f2fs_destroy_stats(sbi); +free_nm: + f2fs_destroy_node_manager(sbi); +free_sm: + f2fs_destroy_segment_manager(sbi); +free_devices: + destroy_device_list(sbi); + kfree(sbi->ckpt); +free_meta_inode: + make_bad_inode(sbi->meta_inode); + iput(sbi->meta_inode); + sbi->meta_inode = NULL; +free_io_dummy: + mempool_destroy(sbi->write_io_dummy); +free_percpu: + destroy_percpu_info(sbi); +free_bio_info: + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); +free_options: +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(F2FS_OPTION(sbi).s_qf_names[i]); +#endif + kfree(options); +free_sb_buf: + kfree(raw_super); +free_sbi: + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); + kfree(sbi); + + /* give only one another chance */ + if (retry) { + retry = false; + shrink_dcache_sb(sb); + goto try_onemore; + } + return err; +} + +static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super); +} + +static void kill_f2fs_super(struct super_block *sb) +{ + if (sb->s_root) { + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + set_sbi_flag(sbi, SBI_IS_CLOSE); + f2fs_stop_gc_thread(sbi); + f2fs_stop_discard_thread(sbi); + + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + struct cp_control cpc = { + .reason = CP_UMOUNT, + }; + f2fs_write_checkpoint(sbi, &cpc); + } + + if (is_sbi_flag_set(sbi, SBI_IS_RECOVERED) && f2fs_readonly(sb)) + sb->s_flags &= ~SB_RDONLY; + } + kill_block_super(sb); +} + +static struct file_system_type f2fs_fs_type = { + .owner = THIS_MODULE, + .name = "f2fs", + .mount = f2fs_mount, + .kill_sb = kill_f2fs_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("f2fs"); + +static int __init init_inodecache(void) +{ + f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", + sizeof(struct f2fs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL); + if (!f2fs_inode_cachep) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(f2fs_inode_cachep); +} + +static int __init init_f2fs_fs(void) +{ + int err; + + if (PAGE_SIZE != F2FS_BLKSIZE) { + printk("F2FS not supported on PAGE_SIZE(%lu) != %d\n", + PAGE_SIZE, F2FS_BLKSIZE); + return -EINVAL; + } + + f2fs_build_trace_ios(); + + err = init_inodecache(); + if (err) + goto fail; + err = f2fs_create_node_manager_caches(); + if (err) + goto free_inodecache; + err = f2fs_create_segment_manager_caches(); + if (err) + goto free_node_manager_caches; + err = f2fs_create_checkpoint_caches(); + if (err) + goto free_segment_manager_caches; + err = f2fs_create_extent_cache(); + if (err) + goto free_checkpoint_caches; + err = f2fs_init_sysfs(); + if (err) + goto free_extent_cache; + err = register_shrinker(&f2fs_shrinker_info); + if (err) + goto free_sysfs; + err = register_filesystem(&f2fs_fs_type); + if (err) + goto free_shrinker; + err = f2fs_create_root_stats(); + if (err) + goto free_filesystem; + err = f2fs_init_post_read_processing(); + if (err) + goto free_root_stats; + return 0; + +free_root_stats: + f2fs_destroy_root_stats(); +free_filesystem: + unregister_filesystem(&f2fs_fs_type); +free_shrinker: + unregister_shrinker(&f2fs_shrinker_info); +free_sysfs: + f2fs_exit_sysfs(); +free_extent_cache: + f2fs_destroy_extent_cache(); +free_checkpoint_caches: + f2fs_destroy_checkpoint_caches(); +free_segment_manager_caches: + f2fs_destroy_segment_manager_caches(); +free_node_manager_caches: + f2fs_destroy_node_manager_caches(); +free_inodecache: + destroy_inodecache(); +fail: + return err; +} + +static void __exit exit_f2fs_fs(void) +{ + f2fs_destroy_post_read_processing(); + f2fs_destroy_root_stats(); + unregister_filesystem(&f2fs_fs_type); + unregister_shrinker(&f2fs_shrinker_info); + f2fs_exit_sysfs(); + f2fs_destroy_extent_cache(); + f2fs_destroy_checkpoint_caches(); + f2fs_destroy_segment_manager_caches(); + f2fs_destroy_node_manager_caches(); + destroy_inodecache(); + f2fs_destroy_trace_ios(); +} + +module_init(init_f2fs_fs) +module_exit(exit_f2fs_fs) + +MODULE_AUTHOR("Samsung Electronics's Praesto Team"); +MODULE_DESCRIPTION("Flash Friendly File System"); +MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: crc32"); + diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c new file mode 100644 index 000000000..89b6c33ba --- /dev/null +++ b/fs/f2fs/sysfs.c @@ -0,0 +1,721 @@ +/* + * f2fs sysfs interface + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * Copyright (c) 2017 Chao Yu <chao@kernel.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/compiler.h> +#include <linux/proc_fs.h> +#include <linux/f2fs_fs.h> +#include <linux/seq_file.h> + +#include "f2fs.h" +#include "segment.h" +#include "gc.h" + +static struct proc_dir_entry *f2fs_proc_root; + +/* Sysfs support for f2fs */ +enum { + GC_THREAD, /* struct f2fs_gc_thread */ + SM_INFO, /* struct f2fs_sm_info */ + DCC_INFO, /* struct discard_cmd_control */ + NM_INFO, /* struct f2fs_nm_info */ + F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif + RESERVED_BLOCKS, /* struct f2fs_sb_info */ +}; + +struct f2fs_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); + ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, + const char *, size_t); + int struct_type; + int offset; + int id; +}; + +static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) +{ + if (struct_type == GC_THREAD) + return (unsigned char *)sbi->gc_thread; + else if (struct_type == SM_INFO) + return (unsigned char *)SM_I(sbi); + else if (struct_type == DCC_INFO) + return (unsigned char *)SM_I(sbi)->dcc_info; + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); + else if (struct_type == F2FS_SBI || struct_type == RESERVED_BLOCKS) + return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&F2FS_OPTION(sbi).fault_info; +#endif + return NULL; +} + +static ssize_t dirty_segments_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(dirty_segments(sbi))); +} + +static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->kbytes_written + + BD_PART_WRITTEN(sbi))); +} + +static ssize_t features_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + int len = 0; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + if (f2fs_sb_has_encrypt(sb)) + len += snprintf(buf, PAGE_SIZE - len, "%s", + "encryption"); + if (f2fs_sb_has_blkzoned(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "blkzoned"); + if (f2fs_sb_has_extra_attr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "extra_attr"); + if (f2fs_sb_has_project_quota(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "projquota"); + if (f2fs_sb_has_inode_chksum(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "inode_checksum"); + if (f2fs_sb_has_flexible_inline_xattr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "flexible_inline_xattr"); + if (f2fs_sb_has_quota_ino(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "quota_ino"); + if (f2fs_sb_has_inode_crtime(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "inode_crtime"); + if (f2fs_sb_has_lost_found(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "lost_found"); + len += snprintf(buf + len, PAGE_SIZE - len, "\n"); + return len; +} + +static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->current_reserved_blocks); +} + +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + unsigned char *ptr = NULL; + unsigned int *ui; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + if (!strcmp(a->attr.name, "extension_list")) { + __u8 (*extlist)[F2FS_EXTENSION_LEN] = + sbi->raw_super->extension_list; + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; + int len = 0, i; + + len += snprintf(buf + len, PAGE_SIZE - len, + "cold file extension:\n"); + for (i = 0; i < cold_count; i++) + len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", + extlist[i]); + + len += snprintf(buf + len, PAGE_SIZE - len, + "hot file extension:\n"); + for (i = cold_count; i < cold_count + hot_count; i++) + len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", + extlist[i]); + return len; + } + + ui = (unsigned int *)(ptr + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t __sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned char *ptr; + unsigned long t; + unsigned int *ui; + ssize_t ret; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + if (!strcmp(a->attr.name, "extension_list")) { + const char *name = strim((char *)buf); + bool set = true, hot; + + if (!strncmp(name, "[h]", 3)) + hot = true; + else if (!strncmp(name, "[c]", 3)) + hot = false; + else + return -EINVAL; + + name += 3; + + if (*name == '!') { + name++; + set = false; + } + + if (strlen(name) >= F2FS_EXTENSION_LEN) + return -EINVAL; + + down_write(&sbi->sb_lock); + + ret = f2fs_update_extension_list(sbi, name, hot, set); + if (ret) + goto out; + + ret = f2fs_commit_super(sbi, false); + if (ret) + f2fs_update_extension_list(sbi, name, hot, !set); +out: + up_write(&sbi->sb_lock); + return ret ? ret : count; + } + + ui = (unsigned int *)(ptr + a->offset); + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret < 0) + return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + return -EINVAL; +#endif + if (a->struct_type == RESERVED_BLOCKS) { + spin_lock(&sbi->stat_lock); + if (t > (unsigned long)(sbi->user_block_count - + F2FS_OPTION(sbi).root_reserved_blocks)) { + spin_unlock(&sbi->stat_lock); + return -EINVAL; + } + *ui = t; + sbi->current_reserved_blocks = min(sbi->reserved_blocks, + sbi->user_block_count - valid_user_blocks(sbi)); + spin_unlock(&sbi->stat_lock); + return count; + } + + if (!strcmp(a->attr.name, "discard_granularity")) { + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (t == *ui) + return count; + *ui = t; + return count; + } + + if (!strcmp(a->attr.name, "trim_sections")) + return -EINVAL; + + if (!strcmp(a->attr.name, "gc_urgent")) { + if (t >= 1) { + sbi->gc_mode = GC_URGENT; + if (sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; + wake_up_interruptible_all( + &sbi->gc_thread->gc_wait_queue_head); + wake_up_discard_thread(sbi, true); + } + } else { + sbi->gc_mode = GC_NORMAL; + } + return count; + } + if (!strcmp(a->attr.name, "gc_idle")) { + if (t == GC_IDLE_CB) + sbi->gc_mode = GC_IDLE_CB; + else if (t == GC_IDLE_GREEDY) + sbi->gc_mode = GC_IDLE_GREEDY; + else + sbi->gc_mode = GC_NORMAL; + return count; + } + + + if (!strcmp(a->attr.name, "iostat_enable")) { + sbi->iostat_enable = !!t; + if (!sbi->iostat_enable) + f2fs_reset_iostat(sbi); + return count; + } + + *ui = (unsigned int)t; + + return count; +} + +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + ssize_t ret; + bool gc_entry = (!strcmp(a->attr.name, "gc_urgent") || + a->struct_type == GC_THREAD); + + if (gc_entry) { + if (!down_read_trylock(&sbi->sb->s_umount)) + return -EAGAIN; + } + ret = __sbi_store(a, sbi, buf, count); + if (gc_entry) + up_read(&sbi->sb->s_umount); + + return ret; +} + +static ssize_t f2fs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_sb_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +enum feat_id { + FEAT_CRYPTO = 0, + FEAT_BLKZONED, + FEAT_ATOMIC_WRITE, + FEAT_EXTRA_ATTR, + FEAT_PROJECT_QUOTA, + FEAT_INODE_CHECKSUM, + FEAT_FLEXIBLE_INLINE_XATTR, + FEAT_QUOTA_INO, + FEAT_INODE_CRTIME, + FEAT_LOST_FOUND, +}; + +static ssize_t f2fs_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + switch (a->id) { + case FEAT_CRYPTO: + case FEAT_BLKZONED: + case FEAT_ATOMIC_WRITE: + case FEAT_EXTRA_ATTR: + case FEAT_PROJECT_QUOTA: + case FEAT_INODE_CHECKSUM: + case FEAT_FLEXIBLE_INLINE_XATTR: + case FEAT_QUOTA_INO: + case FEAT_INODE_CRTIME: + case FEAT_LOST_FOUND: + return snprintf(buf, PAGE_SIZE, "supported\n"); + } + return 0; +} + +#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .struct_type = _struct_type, \ + .offset = _offset \ +} + +#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0644, \ + f2fs_sbi_show, f2fs_sbi_store, \ + offsetof(struct struct_name, elname)) + +#define F2FS_GENERAL_RO_ATTR(name) \ +static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) + +#define F2FS_FEATURE_RO_ATTR(_name, _id) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_feature_show, \ + .id = _id, \ +} + +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, + urgent_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); +F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_seq_blocks, min_seq_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif +F2FS_GENERAL_RO_ATTR(dirty_segments); +F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); +F2FS_GENERAL_RO_ATTR(features); +F2FS_GENERAL_RO_ATTR(current_reserved_blocks); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); +#endif +#ifdef CONFIG_BLK_DEV_ZONED +F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); +#endif +F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); +F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); +F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); +F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); +F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); +F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); +F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME); +F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND); + +#define ATTR_LIST(name) (&f2fs_attr_##name.attr) +static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_urgent_sleep_time), + ATTR_LIST(gc_min_sleep_time), + ATTR_LIST(gc_max_sleep_time), + ATTR_LIST(gc_no_gc_sleep_time), + ATTR_LIST(gc_idle), + ATTR_LIST(gc_urgent), + ATTR_LIST(reclaim_segments), + ATTR_LIST(max_small_discards), + ATTR_LIST(discard_granularity), + ATTR_LIST(batched_trim_sections), + ATTR_LIST(ipu_policy), + ATTR_LIST(min_ipu_util), + ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_seq_blocks), + ATTR_LIST(min_hot_blocks), + ATTR_LIST(min_ssr_sections), + ATTR_LIST(max_victim_search), + ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), + ATTR_LIST(ra_nid_pages), + ATTR_LIST(dirty_nats_ratio), + ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), + ATTR_LIST(iostat_enable), + ATTR_LIST(readdir_ra), + ATTR_LIST(gc_pin_file_thresh), + ATTR_LIST(extension_list), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif + ATTR_LIST(dirty_segments), + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(features), + ATTR_LIST(reserved_blocks), + ATTR_LIST(current_reserved_blocks), + NULL, +}; + +static struct attribute *f2fs_feat_attrs[] = { +#ifdef CONFIG_F2FS_FS_ENCRYPTION + ATTR_LIST(encryption), +#endif +#ifdef CONFIG_BLK_DEV_ZONED + ATTR_LIST(block_zoned), +#endif + ATTR_LIST(atomic_write), + ATTR_LIST(extra_attr), + ATTR_LIST(project_quota), + ATTR_LIST(inode_checksum), + ATTR_LIST(flexible_inline_xattr), + ATTR_LIST(quota_ino), + ATTR_LIST(inode_crtime), + ATTR_LIST(lost_found), + NULL, +}; + +static const struct sysfs_ops f2fs_attr_ops = { + .show = f2fs_attr_show, + .store = f2fs_attr_store, +}; + +static struct kobj_type f2fs_sb_ktype = { + .default_attrs = f2fs_attrs, + .sysfs_ops = &f2fs_attr_ops, + .release = f2fs_sb_release, +}; + +static struct kobj_type f2fs_ktype = { + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kset f2fs_kset = { + .kobj = {.ktype = &f2fs_ktype}, +}; + +static struct kobj_type f2fs_feat_ktype = { + .default_attrs = f2fs_feat_attrs, + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kobject f2fs_feat = { + .kset = &f2fs_kset, +}; + +static int __maybe_unused segment_info_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i; + + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u", se->type, + get_valid_blocks(sbi, i, false)); + if ((i % 10) == 9 || i == (total_segs - 1)) + seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + + return 0; +} + +static int __maybe_unused segment_bits_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, + get_valid_blocks(sbi, i, false)); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, " %.2x", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; +} + +static int __maybe_unused iostat_info_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + time64_t now = ktime_get_real_seconds(); + + if (!sbi->iostat_enable) + return 0; + + seq_printf(seq, "time: %-16llu\n", now); + + /* print app IOs */ + seq_printf(seq, "app buffered: %-16llu\n", + sbi->write_iostat[APP_BUFFERED_IO]); + seq_printf(seq, "app direct: %-16llu\n", + sbi->write_iostat[APP_DIRECT_IO]); + seq_printf(seq, "app mapped: %-16llu\n", + sbi->write_iostat[APP_MAPPED_IO]); + + /* print fs IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->write_iostat[FS_DATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->write_iostat[FS_NODE_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->write_iostat[FS_META_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->write_iostat[FS_GC_DATA_IO]); + seq_printf(seq, "fs gc node: %-16llu\n", + sbi->write_iostat[FS_GC_NODE_IO]); + seq_printf(seq, "fs cp data: %-16llu\n", + sbi->write_iostat[FS_CP_DATA_IO]); + seq_printf(seq, "fs cp node: %-16llu\n", + sbi->write_iostat[FS_CP_NODE_IO]); + seq_printf(seq, "fs cp meta: %-16llu\n", + sbi->write_iostat[FS_CP_META_IO]); + seq_printf(seq, "fs discard: %-16llu\n", + sbi->write_iostat[FS_DISCARD]); + + return 0; +} + +static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + int i; + + seq_puts(seq, "format: victim_secmap bitmaps\n"); + + for (i = 0; i < MAIN_SECS(sbi); i++) { + if ((i % 10) == 0) + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d", test_bit(i, dirty_i->victim_secmap) ? 1 : 0); + if ((i % 10) == 9 || i == (MAIN_SECS(sbi) - 1)) + seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + return 0; +} + +int __init f2fs_init_sysfs(void) +{ + int ret; + + kobject_set_name(&f2fs_kset.kobj, "f2fs"); + f2fs_kset.kobj.parent = fs_kobj; + ret = kset_register(&f2fs_kset); + if (ret) + return ret; + + ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype, + NULL, "features"); + if (ret) { + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); + } else { + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + } + return ret; +} + +void f2fs_exit_sysfs(void) +{ + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); + remove_proc_entry("fs/f2fs", NULL); + f2fs_proc_root = NULL; +} + +int f2fs_register_sysfs(struct f2fs_sb_info *sbi) +{ + struct super_block *sb = sbi->sb; + int err; + + sbi->s_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, + "%s", sb->s_id); + if (err) { + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return err; + } + + if (f2fs_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + + if (sbi->s_proc) { + proc_create_single_data("segment_info", S_IRUGO, sbi->s_proc, + segment_info_seq_show, sb); + proc_create_single_data("segment_bits", S_IRUGO, sbi->s_proc, + segment_bits_seq_show, sb); + proc_create_single_data("iostat_info", S_IRUGO, sbi->s_proc, + iostat_info_seq_show, sb); + proc_create_single_data("victim_bits", S_IRUGO, sbi->s_proc, + victim_bits_seq_show, sb); + } + return 0; +} + +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) +{ + if (sbi->s_proc) { + remove_proc_entry("iostat_info", sbi->s_proc); + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry("victim_bits", sbi->s_proc); + remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); + } + kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); +} diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c new file mode 100644 index 000000000..8ac1851a2 --- /dev/null +++ b/fs/f2fs/trace.c @@ -0,0 +1,168 @@ +/* + * f2fs IO tracer + * + * Copyright (c) 2014 Motorola Mobility + * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/sched.h> +#include <linux/radix-tree.h> + +#include "f2fs.h" +#include "trace.h" + +static RADIX_TREE(pids, GFP_ATOMIC); +static spinlock_t pids_lock; +static struct last_io_info last_io; + +static inline void __print_last_io(void) +{ + if (!last_io.len) + return; + + trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n", + last_io.major, last_io.minor, + last_io.pid, "----------------", + last_io.type, + last_io.fio.op, last_io.fio.op_flags, + last_io.fio.new_blkaddr, + last_io.len); + memset(&last_io, 0, sizeof(last_io)); +} + +static int __file_type(struct inode *inode, pid_t pid) +{ + if (f2fs_is_atomic_file(inode)) + return __ATOMIC_FILE; + else if (f2fs_is_volatile_file(inode)) + return __VOLATILE_FILE; + else if (S_ISDIR(inode->i_mode)) + return __DIR_FILE; + else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode))) + return __NODE_FILE; + else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode))) + return __META_FILE; + else if (pid) + return __NORMAL_FILE; + else + return __MISC_FILE; +} + +void f2fs_trace_pid(struct page *page) +{ + struct inode *inode = page->mapping->host; + pid_t pid = task_pid_nr(current); + void *p; + + set_page_private(page, (unsigned long)pid); + +retry: + if (radix_tree_preload(GFP_NOFS)) + return; + + spin_lock(&pids_lock); + p = radix_tree_lookup(&pids, pid); + if (p == current) + goto out; + if (p) + radix_tree_delete(&pids, pid); + + if (radix_tree_insert(&pids, pid, current)) { + spin_unlock(&pids_lock); + radix_tree_preload_end(); + cond_resched(); + goto retry; + } + + trace_printk("%3x:%3x %4x %-16s\n", + MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), + pid, current->comm); +out: + spin_unlock(&pids_lock); + radix_tree_preload_end(); +} + +void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) +{ + struct inode *inode; + pid_t pid; + int major, minor; + + if (flush) { + __print_last_io(); + return; + } + + inode = fio->page->mapping->host; + pid = page_private(fio->page); + + major = MAJOR(inode->i_sb->s_dev); + minor = MINOR(inode->i_sb->s_dev); + + if (last_io.major == major && last_io.minor == minor && + last_io.pid == pid && + last_io.type == __file_type(inode, pid) && + last_io.fio.op == fio->op && + last_io.fio.op_flags == fio->op_flags && + last_io.fio.new_blkaddr + last_io.len == + fio->new_blkaddr) { + last_io.len++; + return; + } + + __print_last_io(); + + last_io.major = major; + last_io.minor = minor; + last_io.pid = pid; + last_io.type = __file_type(inode, pid); + last_io.fio = *fio; + last_io.len = 1; + return; +} + +void f2fs_build_trace_ios(void) +{ + spin_lock_init(&pids_lock); +} + +#define PIDVEC_SIZE 128 +static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, + unsigned int max_items) +{ + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; + + if (unlikely(!max_items)) + return 0; + + radix_tree_for_each_slot(slot, &pids, &iter, first_index) { + results[ret] = iter.index; + if (++ret == max_items) + break; + } + return ret; +} + +void f2fs_destroy_trace_ios(void) +{ + pid_t pid[PIDVEC_SIZE]; + pid_t next_pid = 0; + unsigned int found; + + spin_lock(&pids_lock); + while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { + unsigned idx; + + next_pid = pid[found - 1] + 1; + for (idx = 0; idx < found; idx++) + radix_tree_delete(&pids, pid[idx]); + } + spin_unlock(&pids_lock); +} diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h new file mode 100644 index 000000000..67db24ac1 --- /dev/null +++ b/fs/f2fs/trace.h @@ -0,0 +1,46 @@ +/* + * f2fs IO tracer + * + * Copyright (c) 2014 Motorola Mobility + * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __F2FS_TRACE_H__ +#define __F2FS_TRACE_H__ + +#ifdef CONFIG_F2FS_IO_TRACE +#include <trace/events/f2fs.h> + +enum file_type { + __NORMAL_FILE, + __DIR_FILE, + __NODE_FILE, + __META_FILE, + __ATOMIC_FILE, + __VOLATILE_FILE, + __MISC_FILE, +}; + +struct last_io_info { + int major, minor; + pid_t pid; + enum file_type type; + struct f2fs_io_info fio; + block_t len; +}; + +extern void f2fs_trace_pid(struct page *); +extern void f2fs_trace_ios(struct f2fs_io_info *, int); +extern void f2fs_build_trace_ios(void); +extern void f2fs_destroy_trace_ios(void); +#else +#define f2fs_trace_pid(p) +#define f2fs_trace_ios(i, n) +#define f2fs_build_trace_ios() +#define f2fs_destroy_trace_ios() + +#endif +#endif /* __F2FS_TRACE_H__ */ diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c new file mode 100644 index 000000000..64352d283 --- /dev/null +++ b/fs/f2fs/xattr.c @@ -0,0 +1,770 @@ +/* + * fs/f2fs/xattr.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext2/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher <agruen@suse.de> + * + * Fix by Harrison Xing <harrison@mountainviewdata.com>. + * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko <luka.renko@hermes.si>. + * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>, + * Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/rwsem.h> +#include <linux/f2fs_fs.h> +#include <linux/security.h> +#include <linux/posix_acl_xattr.h> +#include "f2fs.h" +#include "xattr.h" + +static int f2fs_xattr_generic_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + + switch (handler->flags) { + case F2FS_XATTR_INDEX_USER: + if (!test_opt(sbi, XATTR_USER)) + return -EOPNOTSUPP; + break; + case F2FS_XATTR_INDEX_TRUSTED: + case F2FS_XATTR_INDEX_SECURITY: + break; + default: + return -EINVAL; + } + return f2fs_getxattr(inode, handler->flags, name, + buffer, size, NULL); +} + +static int f2fs_xattr_generic_set(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + + switch (handler->flags) { + case F2FS_XATTR_INDEX_USER: + if (!test_opt(sbi, XATTR_USER)) + return -EOPNOTSUPP; + break; + case F2FS_XATTR_INDEX_TRUSTED: + case F2FS_XATTR_INDEX_SECURITY: + break; + default: + return -EINVAL; + } + return f2fs_setxattr(inode, handler->flags, name, + value, size, NULL, flags); +} + +static bool f2fs_xattr_user_list(struct dentry *dentry) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + + return test_opt(sbi, XATTR_USER); +} + +static bool f2fs_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); +} + +static int f2fs_xattr_advise_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + if (buffer) + *((char *)buffer) = F2FS_I(inode)->i_advise; + return sizeof(char); +} + +static int f2fs_xattr_advise_set(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + unsigned char old_advise = F2FS_I(inode)->i_advise; + unsigned char new_advise; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + if (value == NULL) + return -EINVAL; + + new_advise = *(char *)value; + if (new_advise & ~FADVISE_MODIFIABLE_BITS) + return -EINVAL; + + new_advise = new_advise & FADVISE_MODIFIABLE_BITS; + new_advise |= old_advise & ~FADVISE_MODIFIABLE_BITS; + + F2FS_I(inode)->i_advise = new_advise; + f2fs_mark_inode_dirty_sync(inode, true); + return 0; +} + +#ifdef CONFIG_F2FS_FS_SECURITY +static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *page) +{ + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, (struct page *)page, 0); + if (err < 0) + break; + } + return err; +} + +int f2fs_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr, struct page *ipage) +{ + return security_inode_init_security(inode, dir, qstr, + &f2fs_initxattrs, ipage); +} +#endif + +const struct xattr_handler f2fs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = F2FS_XATTR_INDEX_USER, + .list = f2fs_xattr_user_list, + .get = f2fs_xattr_generic_get, + .set = f2fs_xattr_generic_set, +}; + +const struct xattr_handler f2fs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = F2FS_XATTR_INDEX_TRUSTED, + .list = f2fs_xattr_trusted_list, + .get = f2fs_xattr_generic_get, + .set = f2fs_xattr_generic_set, +}; + +const struct xattr_handler f2fs_xattr_advise_handler = { + .name = F2FS_SYSTEM_ADVISE_NAME, + .flags = F2FS_XATTR_INDEX_ADVISE, + .get = f2fs_xattr_advise_get, + .set = f2fs_xattr_advise_set, +}; + +const struct xattr_handler f2fs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = F2FS_XATTR_INDEX_SECURITY, + .get = f2fs_xattr_generic_get, + .set = f2fs_xattr_generic_set, +}; + +static const struct xattr_handler *f2fs_xattr_handler_map[] = { + [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, +#ifdef CONFIG_F2FS_FS_POSIX_ACL + [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, +#endif + [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, +#ifdef CONFIG_F2FS_FS_SECURITY + [F2FS_XATTR_INDEX_SECURITY] = &f2fs_xattr_security_handler, +#endif + [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler, +}; + +const struct xattr_handler *f2fs_xattr_handlers[] = { + &f2fs_xattr_user_handler, +#ifdef CONFIG_F2FS_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &f2fs_xattr_trusted_handler, +#ifdef CONFIG_F2FS_FS_SECURITY + &f2fs_xattr_security_handler, +#endif + &f2fs_xattr_advise_handler, + NULL, +}; + +static inline const struct xattr_handler *f2fs_xattr_handler(int index) +{ + const struct xattr_handler *handler = NULL; + + if (index > 0 && index < ARRAY_SIZE(f2fs_xattr_handler_map)) + handler = f2fs_xattr_handler_map[index]; + return handler; +} + +static struct f2fs_xattr_entry *__find_xattr(void *base_addr, + void *last_base_addr, int index, + size_t len, const char *name) +{ + struct f2fs_xattr_entry *entry; + + list_for_each_xattr(entry, base_addr) { + if ((void *)(entry) + sizeof(__u32) > last_base_addr || + (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) + return NULL; + + if (entry->e_name_index != index) + continue; + if (entry->e_name_len != len) + continue; + if (!memcmp(entry->e_name, name, len)) + break; + } + return entry; +} + +static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, + void *base_addr, void **last_addr, int index, + size_t len, const char *name) +{ + struct f2fs_xattr_entry *entry; + unsigned int inline_size = inline_xattr_size(inode); + void *max_addr = base_addr + inline_size; + + list_for_each_xattr(entry, base_addr) { + if ((void *)entry + sizeof(__u32) > max_addr || + (void *)XATTR_NEXT_ENTRY(entry) > max_addr) { + *last_addr = entry; + return NULL; + } + if (entry->e_name_index != index) + continue; + if (entry->e_name_len != len) + continue; + if (!memcmp(entry->e_name, name, len)) + break; + } + + /* inline xattr header or entry across max inline xattr size */ + if (IS_XATTR_LAST_ENTRY(entry) && + (void *)entry + sizeof(__u32) > max_addr) { + *last_addr = entry; + return NULL; + } + return entry; +} + +static int read_inline_xattr(struct inode *inode, struct page *ipage, + void *txattr_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int inline_size = inline_xattr_size(inode); + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(inode, ipage); + } else { + page = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) + return PTR_ERR(page); + + inline_addr = inline_xattr_addr(inode, page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + + return 0; +} + +static int read_xattr_block(struct inode *inode, void *txattr_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int inline_size = inline_xattr_size(inode); + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block. */ + xpage = f2fs_get_node_page(sbi, xnid); + if (IS_ERR(xpage)) + return PTR_ERR(xpage); + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, VALID_XATTR_BLOCK_SIZE); + f2fs_put_page(xpage, 1); + + return 0; +} + +static int lookup_all_xattrs(struct inode *inode, struct page *ipage, + unsigned int index, unsigned int len, + const char *name, struct f2fs_xattr_entry **xe, + void **base_addr, int *base_size) +{ + void *cur_addr, *txattr_addr, *last_txattr_addr; + void *last_addr = NULL; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int inline_size = inline_xattr_size(inode); + int err = 0; + + if (!xnid && !inline_size) + return -ENODATA; + + *base_size = XATTR_SIZE(xnid, inode) + XATTR_PADDING_SIZE; + txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), *base_size, GFP_NOFS); + if (!txattr_addr) + return -ENOMEM; + + last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(xnid, inode); + + /* read from inline xattr */ + if (inline_size) { + err = read_inline_xattr(inode, ipage, txattr_addr); + if (err) + goto out; + + *xe = __find_inline_xattr(inode, txattr_addr, &last_addr, + index, len, name); + if (*xe) { + *base_size = inline_size; + goto check; + } + } + + /* read from xattr node block */ + if (xnid) { + err = read_xattr_block(inode, txattr_addr); + if (err) + goto out; + } + + if (last_addr) + cur_addr = XATTR_HDR(last_addr) - 1; + else + cur_addr = txattr_addr; + + *xe = __find_xattr(cur_addr, last_txattr_addr, index, len, name); + if (!*xe) { + err = -EFSCORRUPTED; + goto out; + } +check: + if (IS_XATTR_LAST_ENTRY(*xe)) { + err = -ENODATA; + goto out; + } + + *base_addr = txattr_addr; + return 0; +out: + kzfree(txattr_addr); + return err; +} + +static int read_all_xattrs(struct inode *inode, struct page *ipage, + void **base_addr) +{ + struct f2fs_xattr_header *header; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = VALID_XATTR_BLOCK_SIZE; + unsigned int inline_size = inline_xattr_size(inode); + void *txattr_addr; + int err; + + txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), + inline_size + size + XATTR_PADDING_SIZE, GFP_NOFS); + if (!txattr_addr) + return -ENOMEM; + + /* read from inline xattr */ + if (inline_size) { + err = read_inline_xattr(inode, ipage, txattr_addr); + if (err) + goto fail; + } + + /* read from xattr node block */ + if (xnid) { + err = read_xattr_block(inode, txattr_addr); + if (err) + goto fail; + } + + header = XATTR_HDR(txattr_addr); + + /* never been allocated xattrs */ + if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) { + header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); + header->h_refcount = cpu_to_le32(1); + } + *base_addr = txattr_addr; + return 0; +fail: + kzfree(txattr_addr); + return err; +} + +static inline int write_all_xattrs(struct inode *inode, __u32 hsize, + void *txattr_addr, struct page *ipage) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + size_t inline_size = inline_xattr_size(inode); + struct page *in_page = NULL; + void *xattr_addr; + void *inline_addr = NULL; + struct page *xpage; + nid_t new_nid = 0; + int err = 0; + + if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) + if (!f2fs_alloc_nid(sbi, &new_nid)) + return -ENOSPC; + + /* write to inline xattr */ + if (inline_size) { + if (ipage) { + inline_addr = inline_xattr_addr(inode, ipage); + } else { + in_page = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(in_page)) { + f2fs_alloc_nid_failed(sbi, new_nid); + return PTR_ERR(in_page); + } + inline_addr = inline_xattr_addr(inode, in_page); + } + + f2fs_wait_on_page_writeback(ipage ? ipage : in_page, + NODE, true); + /* no need to use xattr node block */ + if (hsize <= inline_size) { + err = f2fs_truncate_xattr_node(inode); + f2fs_alloc_nid_failed(sbi, new_nid); + if (err) { + f2fs_put_page(in_page, 1); + return err; + } + memcpy(inline_addr, txattr_addr, inline_size); + set_page_dirty(ipage ? ipage : in_page); + goto in_page_out; + } + } + + /* write to xattr node block */ + if (F2FS_I(inode)->i_xattr_nid) { + xpage = f2fs_get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); + f2fs_alloc_nid_failed(sbi, new_nid); + goto in_page_out; + } + f2fs_bug_on(sbi, new_nid); + f2fs_wait_on_page_writeback(xpage, NODE, true); + } else { + struct dnode_of_data dn; + set_new_dnode(&dn, inode, NULL, NULL, new_nid); + xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); + f2fs_alloc_nid_failed(sbi, new_nid); + goto in_page_out; + } + f2fs_alloc_nid_done(sbi, new_nid); + } + xattr_addr = page_address(xpage); + + if (inline_size) + memcpy(inline_addr, txattr_addr, inline_size); + memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); + + if (inline_size) + set_page_dirty(ipage ? ipage : in_page); + set_page_dirty(xpage); + + f2fs_put_page(xpage, 1); +in_page_out: + f2fs_put_page(in_page, 1); + return err; +} + +int f2fs_getxattr(struct inode *inode, int index, const char *name, + void *buffer, size_t buffer_size, struct page *ipage) +{ + struct f2fs_xattr_entry *entry = NULL; + int error = 0; + unsigned int size, len; + void *base_addr = NULL; + int base_size; + + if (name == NULL) + return -EINVAL; + + len = strlen(name); + if (len > F2FS_NAME_LEN) + return -ERANGE; + + down_read(&F2FS_I(inode)->i_xattr_sem); + error = lookup_all_xattrs(inode, ipage, index, len, name, + &entry, &base_addr, &base_size); + up_read(&F2FS_I(inode)->i_xattr_sem); + if (error) + return error; + + size = le16_to_cpu(entry->e_value_size); + + if (buffer && size > buffer_size) { + error = -ERANGE; + goto out; + } + + if (buffer) { + char *pval = entry->e_name + entry->e_name_len; + + if (base_size - (pval - (char *)base_addr) < size) { + error = -ERANGE; + goto out; + } + memcpy(buffer, pval, size); + } + error = size; +out: + kzfree(base_addr); + return error; +} + +ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = d_inode(dentry); + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + struct f2fs_xattr_entry *entry; + void *base_addr, *last_base_addr; + int error = 0; + size_t rest = buffer_size; + + down_read(&F2FS_I(inode)->i_xattr_sem); + error = read_all_xattrs(inode, NULL, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); + if (error) + return error; + + last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode); + + list_for_each_xattr(entry, base_addr) { + const struct xattr_handler *handler = + f2fs_xattr_handler(entry->e_name_index); + const char *prefix; + size_t prefix_len; + size_t size; + + if ((void *)(entry) + sizeof(__u32) > last_base_addr || + (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { + f2fs_msg(dentry->d_sb, KERN_ERR, + "inode (%lu) has corrupted xattr", + inode->i_ino); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + error = -EFSCORRUPTED; + goto cleanup; + } + + if (!handler || (handler->list && !handler->list(dentry))) + continue; + + prefix = handler->prefix ?: handler->name; + prefix_len = strlen(prefix); + size = prefix_len + entry->e_name_len + 1; + if (buffer) { + if (size > rest) { + error = -ERANGE; + goto cleanup; + } + memcpy(buffer, prefix, prefix_len); + buffer += prefix_len; + memcpy(buffer, entry->e_name, entry->e_name_len); + buffer += entry->e_name_len; + *buffer++ = 0; + } + rest -= size; + } + error = buffer_size - rest; +cleanup: + kzfree(base_addr); + return error; +} + +static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry, + const void *value, size_t size) +{ + void *pval = entry->e_name + entry->e_name_len; + + return (le16_to_cpu(entry->e_value_size) == size) && + !memcmp(pval, value, size); +} + +static int __f2fs_setxattr(struct inode *inode, int index, + const char *name, const void *value, size_t size, + struct page *ipage, int flags) +{ + struct f2fs_xattr_entry *here, *last; + void *base_addr, *last_base_addr; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + int found, newsize; + size_t len; + __u32 new_hsize; + int error = 0; + + if (name == NULL) + return -EINVAL; + + if (value == NULL) + size = 0; + + len = strlen(name); + + if (len > F2FS_NAME_LEN) + return -ERANGE; + + if (size > MAX_VALUE_LEN(inode)) + return -E2BIG; + + error = read_all_xattrs(inode, ipage, &base_addr); + if (error) + return error; + + last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode); + + /* find entry with wanted name. */ + here = __find_xattr(base_addr, last_base_addr, index, len, name); + if (!here) { + error = -EFSCORRUPTED; + goto exit; + } + + found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1; + + if (found) { + if ((flags & XATTR_CREATE)) { + error = -EEXIST; + goto exit; + } + + if (value && f2fs_xattr_value_same(here, value, size)) + goto exit; + } else if ((flags & XATTR_REPLACE)) { + error = -ENODATA; + goto exit; + } + + last = here; + while (!IS_XATTR_LAST_ENTRY(last)) { + if ((void *)(last) + sizeof(__u32) > last_base_addr || + (void *)XATTR_NEXT_ENTRY(last) > last_base_addr) { + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + error = -EFSCORRUPTED; + goto exit; + } + last = XATTR_NEXT_ENTRY(last); + } + + newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + len + size); + + /* 1. Check space */ + if (value) { + int free; + /* + * If value is NULL, it is remove operation. + * In case of update operation, we calculate free. + */ + free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr); + if (found) + free = free + ENTRY_SIZE(here); + + if (unlikely(free < newsize)) { + error = -E2BIG; + goto exit; + } + } + + /* 2. Remove old entry */ + if (found) { + /* + * If entry is found, remove old entry. + * If not found, remove operation is not needed. + */ + struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here); + int oldsize = ENTRY_SIZE(here); + + memmove(here, next, (char *)last - (char *)next); + last = (struct f2fs_xattr_entry *)((char *)last - oldsize); + memset(last, 0, oldsize); + } + + new_hsize = (char *)last - (char *)base_addr; + + /* 3. Write new entry */ + if (value) { + char *pval; + /* + * Before we come here, old entry is removed. + * We just write new entry. + */ + last->e_name_index = index; + last->e_name_len = len; + memcpy(last->e_name, name, len); + pval = last->e_name + len; + memcpy(pval, value, size); + last->e_value_size = cpu_to_le16(size); + new_hsize += newsize; + } + + error = write_all_xattrs(inode, new_hsize, base_addr, ipage); + if (error) + goto exit; + + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + inode->i_ctime = current_time(inode); + clear_inode_flag(inode, FI_ACL_MODE); + } + if (index == F2FS_XATTR_INDEX_ENCRYPTION && + !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) + f2fs_set_encrypted_inode(inode); + f2fs_mark_inode_dirty_sync(inode, true); + if (!error && S_ISDIR(inode->i_mode)) + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); +exit: + kzfree(base_addr); + return error; +} + +int f2fs_setxattr(struct inode *inode, int index, const char *name, + const void *value, size_t size, + struct page *ipage, int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int err; + + err = dquot_initialize(inode); + if (err) + return err; + + /* this case is only from f2fs_init_inode_metadata */ + if (ipage) + return __f2fs_setxattr(inode, index, name, value, + size, ipage, flags); + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + /* protect xattr_ver */ + down_write(&F2FS_I(inode)->i_sem); + down_write(&F2FS_I(inode)->i_xattr_sem); + err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); + up_write(&F2FS_I(inode)->i_xattr_sem); + up_write(&F2FS_I(inode)->i_sem); + f2fs_unlock_op(sbi); + + f2fs_update_time(sbi, REQ_TIME); + return err; +} diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h new file mode 100644 index 000000000..2a4ecaf33 --- /dev/null +++ b/fs/f2fs/xattr.h @@ -0,0 +1,160 @@ +/* + * fs/f2fs/xattr.h + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * Portions of this code from linux/fs/ext2/xattr.h + * + * On-disk format of extended attributes for the ext2 filesystem. + * + * (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __F2FS_XATTR_H__ +#define __F2FS_XATTR_H__ + +#include <linux/init.h> +#include <linux/xattr.h> + +/* Magic value in attribute blocks */ +#define F2FS_XATTR_MAGIC 0xF2F52011 + +/* Maximum number of references to one attribute block */ +#define F2FS_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define F2FS_SYSTEM_ADVISE_NAME "system.advise" +#define F2FS_XATTR_INDEX_USER 1 +#define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define F2FS_XATTR_INDEX_TRUSTED 4 +#define F2FS_XATTR_INDEX_LUSTRE 5 +#define F2FS_XATTR_INDEX_SECURITY 6 +#define F2FS_XATTR_INDEX_ADVISE 7 +/* Should be same as EXT4_XATTR_INDEX_ENCRYPTION */ +#define F2FS_XATTR_INDEX_ENCRYPTION 9 + +#define F2FS_XATTR_NAME_ENCRYPTION_CONTEXT "c" + +struct f2fs_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __u32 h_reserved[4]; /* zero right now */ +}; + +struct f2fs_xattr_entry { + __u8 e_name_index; + __u8 e_name_len; + __le16 e_value_size; /* size of attribute value */ + char e_name[0]; /* attribute name */ +}; + +#define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr)) +#define XATTR_ENTRY(ptr) ((struct f2fs_xattr_entry *)(ptr)) +#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1)) +#define XATTR_ROUND (3) + +#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND) + +#define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \ + (entry)->e_name_len + le16_to_cpu((entry)->e_value_size))) + +#define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\ + ENTRY_SIZE(entry))) + +#define IS_XATTR_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#define list_for_each_xattr(entry, addr) \ + for (entry = XATTR_FIRST_ENTRY(addr);\ + !IS_XATTR_LAST_ENTRY(entry);\ + entry = XATTR_NEXT_ENTRY(entry)) +#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) +#define XATTR_PADDING_SIZE (sizeof(__u32)) +#define XATTR_SIZE(x,i) (((x) ? VALID_XATTR_BLOCK_SIZE : 0) + \ + (inline_xattr_size(i))) +#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ + VALID_XATTR_BLOCK_SIZE) + +#define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \ + sizeof(struct f2fs_xattr_header) - \ + sizeof(struct f2fs_xattr_entry)) + +/* + * On-disk structure of f2fs_xattr + * We use inline xattrs space + 1 block for xattr. + * + * +--------------------+ + * | f2fs_xattr_header | + * | | + * +--------------------+ + * | f2fs_xattr_entry | + * | .e_name_index = 1 | + * | .e_name_len = 3 | + * | .e_value_size = 14 | + * | .e_name = "foo" | + * | "value_of_xattr" |<- value_offs = e_name + e_name_len + * +--------------------+ + * | f2fs_xattr_entry | + * | .e_name_index = 4 | + * | .e_name = "bar" | + * +--------------------+ + * | | + * | Free | + * | | + * +--------------------+<- MIN_OFFSET + * | node_footer | + * | (nid, ino, offset) | + * +--------------------+ + * + **/ + +#ifdef CONFIG_F2FS_FS_XATTR +extern const struct xattr_handler f2fs_xattr_user_handler; +extern const struct xattr_handler f2fs_xattr_trusted_handler; +extern const struct xattr_handler f2fs_xattr_advise_handler; +extern const struct xattr_handler f2fs_xattr_security_handler; + +extern const struct xattr_handler *f2fs_xattr_handlers[]; + +extern int f2fs_setxattr(struct inode *, int, const char *, + const void *, size_t, struct page *, int); +extern int f2fs_getxattr(struct inode *, int, const char *, void *, + size_t, struct page *); +extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); +#else + +#define f2fs_xattr_handlers NULL +static inline int f2fs_setxattr(struct inode *inode, int index, + const char *name, const void *value, size_t size, + struct page *page, int flags) +{ + return -EOPNOTSUPP; +} +static inline int f2fs_getxattr(struct inode *inode, int index, + const char *name, void *buffer, + size_t buffer_size, struct page *dpage) +{ + return -EOPNOTSUPP; +} +static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, + size_t buffer_size) +{ + return -EOPNOTSUPP; +} +#endif + +#ifdef CONFIG_F2FS_FS_SECURITY +extern int f2fs_init_security(struct inode *, struct inode *, + const struct qstr *, struct page *); +#else +static inline int f2fs_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr, struct page *ipage) +{ + return 0; +} +#endif +#endif /* __F2FS_XATTR_H__ */ |