From 2c3c1048746a4622d8c89a29670120dc8fab93c4 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 20:49:45 +0200 Subject: Adding upstream version 6.1.76. Signed-off-by: Daniel Baumann --- fs/fuse/Kconfig | 54 + fs/fuse/Makefile | 13 + fs/fuse/acl.c | 114 ++ fs/fuse/control.c | 384 ++++++ fs/fuse/cuse.c | 655 +++++++++++ fs/fuse/dax.c | 1391 ++++++++++++++++++++++ fs/fuse/dev.c | 2337 +++++++++++++++++++++++++++++++++++++ fs/fuse/dir.c | 2013 ++++++++++++++++++++++++++++++++ fs/fuse/file.c | 3216 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_i.h | 1337 +++++++++++++++++++++ fs/fuse/inode.c | 2068 +++++++++++++++++++++++++++++++++ fs/fuse/ioctl.c | 515 +++++++++ fs/fuse/readdir.c | 604 ++++++++++ fs/fuse/virtio_fs.c | 1541 ++++++++++++++++++++++++ fs/fuse/xattr.c | 266 +++++ 15 files changed, 16508 insertions(+) create mode 100644 fs/fuse/Kconfig create mode 100644 fs/fuse/Makefile create mode 100644 fs/fuse/acl.c create mode 100644 fs/fuse/control.c create mode 100644 fs/fuse/cuse.c create mode 100644 fs/fuse/dax.c create mode 100644 fs/fuse/dev.c create mode 100644 fs/fuse/dir.c create mode 100644 fs/fuse/file.c create mode 100644 fs/fuse/fuse_i.h create mode 100644 fs/fuse/inode.c create mode 100644 fs/fuse/ioctl.c create mode 100644 fs/fuse/readdir.c create mode 100644 fs/fuse/virtio_fs.c create mode 100644 fs/fuse/xattr.c (limited to 'fs/fuse') diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig new file mode 100644 index 000000000..038ed0b9a --- /dev/null +++ b/fs/fuse/Kconfig @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: GPL-2.0-only +config FUSE_FS + tristate "FUSE (Filesystem in Userspace) support" + select FS_POSIX_ACL + help + With FUSE it is possible to implement a fully functional filesystem + in a userspace program. + + There's also a companion library: libfuse2. This library is available + from the FUSE homepage: + + although chances are your distribution already has that library + installed if you've installed the "fuse" package itself. + + See for more information. + See for needed library/utility version. + + If you want to develop a userspace FS, or if you want to use + a filesystem based on FUSE, answer Y or M. + +config CUSE + tristate "Character device in Userspace support" + depends on FUSE_FS + help + This FUSE extension allows character devices to be + implemented in userspace. + + If you want to develop or use a userspace character device + based on CUSE, answer Y or M. + +config VIRTIO_FS + tristate "Virtio Filesystem" + depends on FUSE_FS + select VIRTIO + help + The Virtio Filesystem allows guests to mount file systems from the + host. + + If you want to share files between guests or with the host, answer Y + or M. + +config FUSE_DAX + bool "Virtio Filesystem Direct Host Memory Access support" + default y + select INTERVAL_TREE + depends on VIRTIO_FS + depends on FS_DAX + depends on DAX + help + This allows bypassing guest page cache and allows mapping host page + cache directly in guest address space. + + If you want to allow mounting a Virtio Filesystem with the "dax" + option, answer Y. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile new file mode 100644 index 000000000..0c48b35c0 --- /dev/null +++ b/fs/fuse/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for the FUSE filesystem. +# + +obj-$(CONFIG_FUSE_FS) += fuse.o +obj-$(CONFIG_CUSE) += cuse.o +obj-$(CONFIG_VIRTIO_FS) += virtiofs.o + +fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o +fuse-$(CONFIG_FUSE_DAX) += dax.o + +virtiofs-y := virtio_fs.o diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c new file mode 100644 index 000000000..337cb29a8 --- /dev/null +++ b/fs/fuse/acl.c @@ -0,0 +1,114 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2016 Canonical Ltd. + * + * This program can be distributed under the terms of the GNU GPL. + * See the file COPYING. + */ + +#include "fuse_i.h" + +#include +#include + +struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + int size; + const char *name; + void *value = NULL; + struct posix_acl *acl; + + if (rcu) + return ERR_PTR(-ECHILD); + + if (fuse_is_bad(inode)) + return ERR_PTR(-EIO); + + if (!fc->posix_acl || fc->no_getxattr) + return NULL; + + if (type == ACL_TYPE_ACCESS) + name = XATTR_NAME_POSIX_ACL_ACCESS; + else if (type == ACL_TYPE_DEFAULT) + name = XATTR_NAME_POSIX_ACL_DEFAULT; + else + return ERR_PTR(-EOPNOTSUPP); + + value = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + size = fuse_getxattr(inode, name, value, PAGE_SIZE); + if (size > 0) + acl = posix_acl_from_xattr(fc->user_ns, value, size); + else if ((size == 0) || (size == -ENODATA) || + (size == -EOPNOTSUPP && fc->no_getxattr)) + acl = NULL; + else if (size == -ERANGE) + acl = ERR_PTR(-E2BIG); + else + acl = ERR_PTR(size); + + kfree(value); + return acl; +} + +int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + const char *name; + int ret; + + if (fuse_is_bad(inode)) + return -EIO; + + if (!fc->posix_acl || fc->no_setxattr) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_ACCESS) + name = XATTR_NAME_POSIX_ACL_ACCESS; + else if (type == ACL_TYPE_DEFAULT) + name = XATTR_NAME_POSIX_ACL_DEFAULT; + else + return -EINVAL; + + if (acl) { + unsigned int extra_flags = 0; + /* + * Fuse userspace is responsible for updating access + * permissions in the inode, if needed. fuse_setxattr + * invalidates the inode attributes, which will force + * them to be refreshed the next time they are used, + * and it also updates i_ctime. + */ + size_t size = posix_acl_xattr_size(acl->a_count); + void *value; + + if (size > PAGE_SIZE) + return -E2BIG; + + value = kmalloc(size, GFP_KERNEL); + if (!value) + return -ENOMEM; + + ret = posix_acl_to_xattr(fc->user_ns, acl, value, size); + if (ret < 0) { + kfree(value); + return ret; + } + + if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) && + !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) + extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; + + ret = fuse_setxattr(inode, name, value, size, 0, extra_flags); + kfree(value); + } else { + ret = fuse_removexattr(inode, name); + } + forget_all_cached_acls(inode); + fuse_invalidate_attr(inode); + + return ret; +} diff --git a/fs/fuse/control.c b/fs/fuse/control.c new file mode 100644 index 000000000..247ef4f76 --- /dev/null +++ b/fs/fuse/control.c @@ -0,0 +1,384 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include +#include +#include + +#define FUSE_CTL_SUPER_MAGIC 0x65735543 + +/* + * This is non-NULL when the single instance of the control filesystem + * exists. Protected by fuse_mutex + */ +static struct super_block *fuse_control_sb; + +static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file) +{ + struct fuse_conn *fc; + mutex_lock(&fuse_mutex); + fc = file_inode(file)->i_private; + if (fc) + fc = fuse_conn_get(fc); + mutex_unlock(&fuse_mutex); + return fc; +} + +static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (fc) { + if (fc->abort_err) + fc->aborted = true; + fuse_abort_conn(fc); + fuse_conn_put(fc); + } + return count; +} + +static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + char tmp[32]; + size_t size; + + if (!*ppos) { + long value; + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + value = atomic_read(&fc->num_waiting); + file->private_data = (void *)value; + fuse_conn_put(fc); + } + size = sprintf(tmp, "%ld\n", (long)file->private_data); + return simple_read_from_buffer(buf, len, ppos, tmp, size); +} + +static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos, unsigned val) +{ + char tmp[32]; + size_t size = sprintf(tmp, "%u\n", val); + + return simple_read_from_buffer(buf, len, ppos, tmp, size); +} + +static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, unsigned *val, + unsigned global_limit) +{ + unsigned long t; + unsigned limit = (1 << 16) - 1; + int err; + + if (*ppos) + return -EINVAL; + + err = kstrtoul_from_user(buf, count, 0, &t); + if (err) + return err; + + if (!capable(CAP_SYS_ADMIN)) + limit = min(limit, global_limit); + + if (t > limit) + return -EINVAL; + + *val = t; + + return count; +} + +static ssize_t fuse_conn_max_background_read(struct file *file, + char __user *buf, size_t len, + loff_t *ppos) +{ + struct fuse_conn *fc; + unsigned val; + + fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + val = READ_ONCE(fc->max_background); + fuse_conn_put(fc); + + return fuse_conn_limit_read(file, buf, len, ppos, val); +} + +static ssize_t fuse_conn_max_background_write(struct file *file, + const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned val; + ssize_t ret; + + ret = fuse_conn_limit_write(file, buf, count, ppos, &val, + max_user_bgreq); + if (ret > 0) { + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (fc) { + spin_lock(&fc->bg_lock); + fc->max_background = val; + fc->blocked = fc->num_background >= fc->max_background; + if (!fc->blocked) + wake_up(&fc->blocked_waitq); + spin_unlock(&fc->bg_lock); + fuse_conn_put(fc); + } + } + + return ret; +} + +static ssize_t fuse_conn_congestion_threshold_read(struct file *file, + char __user *buf, size_t len, + loff_t *ppos) +{ + struct fuse_conn *fc; + unsigned val; + + fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + val = READ_ONCE(fc->congestion_threshold); + fuse_conn_put(fc); + + return fuse_conn_limit_read(file, buf, len, ppos, val); +} + +static ssize_t fuse_conn_congestion_threshold_write(struct file *file, + const char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned val; + struct fuse_conn *fc; + ssize_t ret; + + ret = fuse_conn_limit_write(file, buf, count, ppos, &val, + max_user_congthresh); + if (ret <= 0) + goto out; + fc = fuse_ctl_file_conn_get(file); + if (!fc) + goto out; + + down_read(&fc->killsb); + spin_lock(&fc->bg_lock); + fc->congestion_threshold = val; + spin_unlock(&fc->bg_lock); + up_read(&fc->killsb); + fuse_conn_put(fc); +out: + return ret; +} + +static const struct file_operations fuse_ctl_abort_ops = { + .open = nonseekable_open, + .write = fuse_conn_abort_write, + .llseek = no_llseek, +}; + +static const struct file_operations fuse_ctl_waiting_ops = { + .open = nonseekable_open, + .read = fuse_conn_waiting_read, + .llseek = no_llseek, +}; + +static const struct file_operations fuse_conn_max_background_ops = { + .open = nonseekable_open, + .read = fuse_conn_max_background_read, + .write = fuse_conn_max_background_write, + .llseek = no_llseek, +}; + +static const struct file_operations fuse_conn_congestion_threshold_ops = { + .open = nonseekable_open, + .read = fuse_conn_congestion_threshold_read, + .write = fuse_conn_congestion_threshold_write, + .llseek = no_llseek, +}; + +static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, + struct fuse_conn *fc, + const char *name, + int mode, int nlink, + const struct inode_operations *iop, + const struct file_operations *fop) +{ + struct dentry *dentry; + struct inode *inode; + + BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES); + dentry = d_alloc_name(parent, name); + if (!dentry) + return NULL; + + inode = new_inode(fuse_control_sb); + if (!inode) { + dput(dentry); + return NULL; + } + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_uid = fc->user_id; + inode->i_gid = fc->group_id; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + /* setting ->i_op to NULL is not allowed */ + if (iop) + inode->i_op = iop; + inode->i_fop = fop; + set_nlink(inode, nlink); + inode->i_private = fc; + d_add(dentry, inode); + + fc->ctl_dentry[fc->ctl_ndents++] = dentry; + + return dentry; +} + +/* + * Add a connection to the control filesystem (if it exists). Caller + * must hold fuse_mutex + */ +int fuse_ctl_add_conn(struct fuse_conn *fc) +{ + struct dentry *parent; + char name[32]; + + if (!fuse_control_sb || fc->no_control) + return 0; + + parent = fuse_control_sb->s_root; + inc_nlink(d_inode(parent)); + sprintf(name, "%u", fc->dev); + parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2, + &simple_dir_inode_operations, + &simple_dir_operations); + if (!parent) + goto err; + + if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1, + NULL, &fuse_ctl_waiting_ops) || + !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1, + NULL, &fuse_ctl_abort_ops) || + !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600, + 1, NULL, &fuse_conn_max_background_ops) || + !fuse_ctl_add_dentry(parent, fc, "congestion_threshold", + S_IFREG | 0600, 1, NULL, + &fuse_conn_congestion_threshold_ops)) + goto err; + + return 0; + + err: + fuse_ctl_remove_conn(fc); + return -ENOMEM; +} + +/* + * Remove a connection from the control filesystem (if it exists). + * Caller must hold fuse_mutex + */ +void fuse_ctl_remove_conn(struct fuse_conn *fc) +{ + int i; + + if (!fuse_control_sb || fc->no_control) + return; + + for (i = fc->ctl_ndents - 1; i >= 0; i--) { + struct dentry *dentry = fc->ctl_dentry[i]; + d_inode(dentry)->i_private = NULL; + if (!i) { + /* Get rid of submounts: */ + d_invalidate(dentry); + } + dput(dentry); + } + drop_nlink(d_inode(fuse_control_sb->s_root)); +} + +static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc) +{ + static const struct tree_descr empty_descr = {""}; + struct fuse_conn *fc; + int err; + + err = simple_fill_super(sb, FUSE_CTL_SUPER_MAGIC, &empty_descr); + if (err) + return err; + + mutex_lock(&fuse_mutex); + BUG_ON(fuse_control_sb); + fuse_control_sb = sb; + list_for_each_entry(fc, &fuse_conn_list, entry) { + err = fuse_ctl_add_conn(fc); + if (err) { + fuse_control_sb = NULL; + mutex_unlock(&fuse_mutex); + return err; + } + } + mutex_unlock(&fuse_mutex); + + return 0; +} + +static int fuse_ctl_get_tree(struct fs_context *fsc) +{ + return get_tree_single(fsc, fuse_ctl_fill_super); +} + +static const struct fs_context_operations fuse_ctl_context_ops = { + .get_tree = fuse_ctl_get_tree, +}; + +static int fuse_ctl_init_fs_context(struct fs_context *fsc) +{ + fsc->ops = &fuse_ctl_context_ops; + return 0; +} + +static void fuse_ctl_kill_sb(struct super_block *sb) +{ + struct fuse_conn *fc; + + mutex_lock(&fuse_mutex); + fuse_control_sb = NULL; + list_for_each_entry(fc, &fuse_conn_list, entry) + fc->ctl_ndents = 0; + mutex_unlock(&fuse_mutex); + + kill_litter_super(sb); +} + +static struct file_system_type fuse_ctl_fs_type = { + .owner = THIS_MODULE, + .name = "fusectl", + .init_fs_context = fuse_ctl_init_fs_context, + .kill_sb = fuse_ctl_kill_sb, +}; +MODULE_ALIAS_FS("fusectl"); + +int __init fuse_ctl_init(void) +{ + return register_filesystem(&fuse_ctl_fs_type); +} + +void __exit fuse_ctl_cleanup(void) +{ + unregister_filesystem(&fuse_ctl_fs_type); +} diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c new file mode 100644 index 000000000..c7d882a9f --- /dev/null +++ b/fs/fuse/cuse.c @@ -0,0 +1,655 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * CUSE: Character device in Userspace + * + * Copyright (C) 2008-2009 SUSE Linux Products GmbH + * Copyright (C) 2008-2009 Tejun Heo + * + * CUSE enables character devices to be implemented from userland much + * like FUSE allows filesystems. On initialization /dev/cuse is + * created. By opening the file and replying to the CUSE_INIT request + * userland CUSE server can create a character device. After that the + * operation is very similar to FUSE. + * + * A CUSE instance involves the following objects. + * + * cuse_conn : contains fuse_conn and serves as bonding structure + * channel : file handle connected to the userland CUSE server + * cdev : the implemented character device + * dev : generic device for cdev + * + * Note that 'channel' is what 'dev' is in FUSE. As CUSE deals with + * devices, it's called 'channel' to reduce confusion. + * + * channel determines when the character device dies. When channel is + * closed, everything begins to destruct. The cuse_conn is taken off + * the lookup table preventing further access from cdev, cdev and + * generic device are removed and the base reference of cuse_conn is + * put. + * + * On each open, the matching cuse_conn is looked up and if found an + * additional reference is taken which is released when the file is + * closed. + */ + +#define pr_fmt(fmt) "CUSE: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fuse_i.h" + +#define CUSE_CONNTBL_LEN 64 + +struct cuse_conn { + struct list_head list; /* linked on cuse_conntbl */ + struct fuse_mount fm; /* Dummy mount referencing fc */ + struct fuse_conn fc; /* fuse connection */ + struct cdev *cdev; /* associated character device */ + struct device *dev; /* device representing @cdev */ + + /* init parameters, set once during initialization */ + bool unrestricted_ioctl; +}; + +static DEFINE_MUTEX(cuse_lock); /* protects registration */ +static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN]; +static struct class *cuse_class; + +static struct cuse_conn *fc_to_cc(struct fuse_conn *fc) +{ + return container_of(fc, struct cuse_conn, fc); +} + +static struct list_head *cuse_conntbl_head(dev_t devt) +{ + return &cuse_conntbl[(MAJOR(devt) + MINOR(devt)) % CUSE_CONNTBL_LEN]; +} + + +/************************************************************************** + * CUSE frontend operations + * + * These are file operations for the character device. + * + * On open, CUSE opens a file from the FUSE mnt and stores it to + * private_data of the open file. All other ops call FUSE ops on the + * FUSE file. + */ + +static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to) +{ + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb); + loff_t pos = 0; + + return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE); +} + +static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from) +{ + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb); + loff_t pos = 0; + /* + * No locking or generic_write_checks(), the server is + * responsible for locking and sanity checks. + */ + return fuse_direct_io(&io, from, &pos, + FUSE_DIO_WRITE | FUSE_DIO_CUSE); +} + +static int cuse_open(struct inode *inode, struct file *file) +{ + dev_t devt = inode->i_cdev->dev; + struct cuse_conn *cc = NULL, *pos; + int rc; + + /* look up and get the connection */ + mutex_lock(&cuse_lock); + list_for_each_entry(pos, cuse_conntbl_head(devt), list) + if (pos->dev->devt == devt) { + fuse_conn_get(&pos->fc); + cc = pos; + break; + } + mutex_unlock(&cuse_lock); + + /* dead? */ + if (!cc) + return -ENODEV; + + /* + * Generic permission check is already done against the chrdev + * file, proceed to open. + */ + rc = fuse_do_open(&cc->fm, 0, file, 0); + if (rc) + fuse_conn_put(&cc->fc); + return rc; +} + +static int cuse_release(struct inode *inode, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_mount *fm = ff->fm; + + fuse_sync_release(NULL, ff, file->f_flags); + fuse_conn_put(fm->fc); + + return 0; +} + +static long cuse_file_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_file *ff = file->private_data; + struct cuse_conn *cc = fc_to_cc(ff->fm->fc); + unsigned int flags = 0; + + if (cc->unrestricted_ioctl) + flags |= FUSE_IOCTL_UNRESTRICTED; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_file *ff = file->private_data; + struct cuse_conn *cc = fc_to_cc(ff->fm->fc); + unsigned int flags = FUSE_IOCTL_COMPAT; + + if (cc->unrestricted_ioctl) + flags |= FUSE_IOCTL_UNRESTRICTED; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +static const struct file_operations cuse_frontend_fops = { + .owner = THIS_MODULE, + .read_iter = cuse_read_iter, + .write_iter = cuse_write_iter, + .open = cuse_open, + .release = cuse_release, + .unlocked_ioctl = cuse_file_ioctl, + .compat_ioctl = cuse_file_compat_ioctl, + .poll = fuse_file_poll, + .llseek = noop_llseek, +}; + + +/************************************************************************** + * CUSE channel initialization and destruction + */ + +struct cuse_devinfo { + const char *name; +}; + +/** + * cuse_parse_one - parse one key=value pair + * @pp: i/o parameter for the current position + * @end: points to one past the end of the packed string + * @keyp: out parameter for key + * @valp: out parameter for value + * + * *@pp points to packed strings - "key0=val0\0key1=val1\0" which ends + * at @end - 1. This function parses one pair and set *@keyp to the + * start of the key and *@valp to the start of the value. Note that + * the original string is modified such that the key string is + * terminated with '\0'. *@pp is updated to point to the next string. + * + * RETURNS: + * 1 on successful parse, 0 on EOF, -errno on failure. + */ +static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp) +{ + char *p = *pp; + char *key, *val; + + while (p < end && *p == '\0') + p++; + if (p == end) + return 0; + + if (end[-1] != '\0') { + pr_err("info not properly terminated\n"); + return -EINVAL; + } + + key = val = p; + p += strlen(p); + + if (valp) { + strsep(&val, "="); + if (!val) + val = key + strlen(key); + key = strstrip(key); + val = strstrip(val); + } else + key = strstrip(key); + + if (!strlen(key)) { + pr_err("zero length info key specified\n"); + return -EINVAL; + } + + *pp = p; + *keyp = key; + if (valp) + *valp = val; + + return 1; +} + +/** + * cuse_parse_dev_info - parse device info + * @p: device info string + * @len: length of device info string + * @devinfo: out parameter for parsed device info + * + * Parse @p to extract device info and store it into @devinfo. String + * pointed to by @p is modified by parsing and @devinfo points into + * them, so @p shouldn't be freed while @devinfo is in use. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo) +{ + char *end = p + len; + char *key, *val; + int rc; + + while (true) { + rc = cuse_parse_one(&p, end, &key, &val); + if (rc < 0) + return rc; + if (!rc) + break; + if (strcmp(key, "DEVNAME") == 0) + devinfo->name = val; + else + pr_warn("unknown device info \"%s\"\n", key); + } + + if (!devinfo->name || !strlen(devinfo->name)) { + pr_err("DEVNAME unspecified\n"); + return -EINVAL; + } + + return 0; +} + +static void cuse_gendev_release(struct device *dev) +{ + kfree(dev); +} + +struct cuse_init_args { + struct fuse_args_pages ap; + struct cuse_init_in in; + struct cuse_init_out out; + struct page *page; + struct fuse_page_desc desc; +}; + +/** + * cuse_process_init_reply - finish initializing CUSE channel + * + * This function creates the character device and sets up all the + * required data structures for it. Please read the comment at the + * top of this file for high level overview. + */ +static void cuse_process_init_reply(struct fuse_mount *fm, + struct fuse_args *args, int error) +{ + struct fuse_conn *fc = fm->fc; + struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args); + struct fuse_args_pages *ap = &ia->ap; + struct cuse_conn *cc = fc_to_cc(fc), *pos; + struct cuse_init_out *arg = &ia->out; + struct page *page = ap->pages[0]; + struct cuse_devinfo devinfo = { }; + struct device *dev; + struct cdev *cdev; + dev_t devt; + int rc, i; + + if (error || arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) + goto err; + + fc->minor = arg->minor; + fc->max_read = max_t(unsigned, arg->max_read, 4096); + fc->max_write = max_t(unsigned, arg->max_write, 4096); + + /* parse init reply */ + cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL; + + rc = cuse_parse_devinfo(page_address(page), ap->args.out_args[1].size, + &devinfo); + if (rc) + goto err; + + /* determine and reserve devt */ + devt = MKDEV(arg->dev_major, arg->dev_minor); + if (!MAJOR(devt)) + rc = alloc_chrdev_region(&devt, MINOR(devt), 1, devinfo.name); + else + rc = register_chrdev_region(devt, 1, devinfo.name); + if (rc) { + pr_err("failed to register chrdev region\n"); + goto err; + } + + /* devt determined, create device */ + rc = -ENOMEM; + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + goto err_region; + + device_initialize(dev); + dev_set_uevent_suppress(dev, 1); + dev->class = cuse_class; + dev->devt = devt; + dev->release = cuse_gendev_release; + dev_set_drvdata(dev, cc); + dev_set_name(dev, "%s", devinfo.name); + + mutex_lock(&cuse_lock); + + /* make sure the device-name is unique */ + for (i = 0; i < CUSE_CONNTBL_LEN; ++i) { + list_for_each_entry(pos, &cuse_conntbl[i], list) + if (!strcmp(dev_name(pos->dev), dev_name(dev))) + goto err_unlock; + } + + rc = device_add(dev); + if (rc) + goto err_unlock; + + /* register cdev */ + rc = -ENOMEM; + cdev = cdev_alloc(); + if (!cdev) + goto err_unlock; + + cdev->owner = THIS_MODULE; + cdev->ops = &cuse_frontend_fops; + + rc = cdev_add(cdev, devt, 1); + if (rc) + goto err_cdev; + + cc->dev = dev; + cc->cdev = cdev; + + /* make the device available */ + list_add(&cc->list, cuse_conntbl_head(devt)); + mutex_unlock(&cuse_lock); + + /* announce device availability */ + dev_set_uevent_suppress(dev, 0); + kobject_uevent(&dev->kobj, KOBJ_ADD); +out: + kfree(ia); + __free_page(page); + return; + +err_cdev: + cdev_del(cdev); +err_unlock: + mutex_unlock(&cuse_lock); + put_device(dev); +err_region: + unregister_chrdev_region(devt, 1); +err: + fuse_abort_conn(fc); + goto out; +} + +static int cuse_send_init(struct cuse_conn *cc) +{ + int rc; + struct page *page; + struct fuse_mount *fm = &cc->fm; + struct cuse_init_args *ia; + struct fuse_args_pages *ap; + + BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); + + rc = -ENOMEM; + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto err; + + ia = kzalloc(sizeof(*ia), GFP_KERNEL); + if (!ia) + goto err_free_page; + + ap = &ia->ap; + ia->in.major = FUSE_KERNEL_VERSION; + ia->in.minor = FUSE_KERNEL_MINOR_VERSION; + ia->in.flags |= CUSE_UNRESTRICTED_IOCTL; + ap->args.opcode = CUSE_INIT; + ap->args.in_numargs = 1; + ap->args.in_args[0].size = sizeof(ia->in); + ap->args.in_args[0].value = &ia->in; + ap->args.out_numargs = 2; + ap->args.out_args[0].size = sizeof(ia->out); + ap->args.out_args[0].value = &ia->out; + ap->args.out_args[1].size = CUSE_INIT_INFO_MAX; + ap->args.out_argvar = true; + ap->args.out_pages = true; + ap->num_pages = 1; + ap->pages = &ia->page; + ap->descs = &ia->desc; + ia->page = page; + ia->desc.length = ap->args.out_args[1].size; + ap->args.end = cuse_process_init_reply; + + rc = fuse_simple_background(fm, &ap->args, GFP_KERNEL); + if (rc) { + kfree(ia); +err_free_page: + __free_page(page); + } +err: + return rc; +} + +static void cuse_fc_release(struct fuse_conn *fc) +{ + struct cuse_conn *cc = fc_to_cc(fc); + kfree_rcu(cc, fc.rcu); +} + +/** + * cuse_channel_open - open method for /dev/cuse + * @inode: inode for /dev/cuse + * @file: file struct being opened + * + * Userland CUSE server can create a CUSE device by opening /dev/cuse + * and replying to the initialization request kernel sends. This + * function is responsible for handling CUSE device initialization. + * Because the fd opened by this function is used during + * initialization, this function only creates cuse_conn and sends + * init. The rest is delegated to a kthread. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int cuse_channel_open(struct inode *inode, struct file *file) +{ + struct fuse_dev *fud; + struct cuse_conn *cc; + int rc; + + /* set up cuse_conn */ + cc = kzalloc(sizeof(*cc), GFP_KERNEL); + if (!cc) + return -ENOMEM; + + /* + * Limit the cuse channel to requests that can + * be represented in file->f_cred->user_ns. + */ + fuse_conn_init(&cc->fc, &cc->fm, file->f_cred->user_ns, + &fuse_dev_fiq_ops, NULL); + + cc->fc.release = cuse_fc_release; + fud = fuse_dev_alloc_install(&cc->fc); + fuse_conn_put(&cc->fc); + if (!fud) + return -ENOMEM; + + INIT_LIST_HEAD(&cc->list); + + cc->fc.initialized = 1; + rc = cuse_send_init(cc); + if (rc) { + fuse_dev_free(fud); + return rc; + } + file->private_data = fud; + + return 0; +} + +/** + * cuse_channel_release - release method for /dev/cuse + * @inode: inode for /dev/cuse + * @file: file struct being closed + * + * Disconnect the channel, deregister CUSE device and initiate + * destruction by putting the default reference. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int cuse_channel_release(struct inode *inode, struct file *file) +{ + struct fuse_dev *fud = file->private_data; + struct cuse_conn *cc = fc_to_cc(fud->fc); + int rc; + + /* remove from the conntbl, no more access from this point on */ + mutex_lock(&cuse_lock); + list_del_init(&cc->list); + mutex_unlock(&cuse_lock); + + /* remove device */ + if (cc->dev) + device_unregister(cc->dev); + if (cc->cdev) { + unregister_chrdev_region(cc->cdev->dev, 1); + cdev_del(cc->cdev); + } + + rc = fuse_dev_release(inode, file); /* puts the base reference */ + + return rc; +} + +static struct file_operations cuse_channel_fops; /* initialized during init */ + + +/************************************************************************** + * Misc stuff and module initializatiion + * + * CUSE exports the same set of attributes to sysfs as fusectl. + */ + +static ssize_t cuse_class_waiting_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cuse_conn *cc = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting)); +} +static DEVICE_ATTR(waiting, 0400, cuse_class_waiting_show, NULL); + +static ssize_t cuse_class_abort_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cuse_conn *cc = dev_get_drvdata(dev); + + fuse_abort_conn(&cc->fc); + return count; +} +static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store); + +static struct attribute *cuse_class_dev_attrs[] = { + &dev_attr_waiting.attr, + &dev_attr_abort.attr, + NULL, +}; +ATTRIBUTE_GROUPS(cuse_class_dev); + +static struct miscdevice cuse_miscdev = { + .minor = CUSE_MINOR, + .name = "cuse", + .fops = &cuse_channel_fops, +}; + +MODULE_ALIAS_MISCDEV(CUSE_MINOR); +MODULE_ALIAS("devname:cuse"); + +static int __init cuse_init(void) +{ + int i, rc; + + /* init conntbl */ + for (i = 0; i < CUSE_CONNTBL_LEN; i++) + INIT_LIST_HEAD(&cuse_conntbl[i]); + + /* inherit and extend fuse_dev_operations */ + cuse_channel_fops = fuse_dev_operations; + cuse_channel_fops.owner = THIS_MODULE; + cuse_channel_fops.open = cuse_channel_open; + cuse_channel_fops.release = cuse_channel_release; + /* CUSE is not prepared for FUSE_DEV_IOC_CLONE */ + cuse_channel_fops.unlocked_ioctl = NULL; + + cuse_class = class_create(THIS_MODULE, "cuse"); + if (IS_ERR(cuse_class)) + return PTR_ERR(cuse_class); + + cuse_class->dev_groups = cuse_class_dev_groups; + + rc = misc_register(&cuse_miscdev); + if (rc) { + class_destroy(cuse_class); + return rc; + } + + return 0; +} + +static void __exit cuse_exit(void) +{ + misc_deregister(&cuse_miscdev); + class_destroy(cuse_class); +} + +module_init(cuse_init); +module_exit(cuse_exit); + +MODULE_AUTHOR("Tejun Heo "); +MODULE_DESCRIPTION("Character device in Userspace"); +MODULE_LICENSE("GPL"); diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c new file mode 100644 index 000000000..6e71904c3 --- /dev/null +++ b/fs/fuse/dax.c @@ -0,0 +1,1391 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * dax: direct host memory access + * Copyright (C) 2020 Red Hat, Inc. + */ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include +#include +#include + +/* + * Default memory range size. A power of 2 so it agrees with common FUSE_INIT + * map_alignment values 4KB and 64KB. + */ +#define FUSE_DAX_SHIFT 21 +#define FUSE_DAX_SZ (1 << FUSE_DAX_SHIFT) +#define FUSE_DAX_PAGES (FUSE_DAX_SZ / PAGE_SIZE) + +/* Number of ranges reclaimer will try to free in one invocation */ +#define FUSE_DAX_RECLAIM_CHUNK (10) + +/* + * Dax memory reclaim threshold in percetage of total ranges. When free + * number of free ranges drops below this threshold, reclaim can trigger + * Default is 20% + */ +#define FUSE_DAX_RECLAIM_THRESHOLD (20) + +/** Translation information for file offsets to DAX window offsets */ +struct fuse_dax_mapping { + /* Pointer to inode where this memory range is mapped */ + struct inode *inode; + + /* Will connect in fcd->free_ranges to keep track of free memory */ + struct list_head list; + + /* For interval tree in file/inode */ + struct interval_tree_node itn; + + /* Will connect in fc->busy_ranges to keep track busy memory */ + struct list_head busy_list; + + /** Position in DAX window */ + u64 window_offset; + + /** Length of mapping, in bytes */ + loff_t length; + + /* Is this mapping read-only or read-write */ + bool writable; + + /* reference count when the mapping is used by dax iomap. */ + refcount_t refcnt; +}; + +/* Per-inode dax map */ +struct fuse_inode_dax { + /* Semaphore to protect modifications to the dmap tree */ + struct rw_semaphore sem; + + /* Sorted rb tree of struct fuse_dax_mapping elements */ + struct rb_root_cached tree; + unsigned long nr; +}; + +struct fuse_conn_dax { + /* DAX device */ + struct dax_device *dev; + + /* Lock protecting accessess to members of this structure */ + spinlock_t lock; + + /* List of memory ranges which are busy */ + unsigned long nr_busy_ranges; + struct list_head busy_ranges; + + /* Worker to free up memory ranges */ + struct delayed_work free_work; + + /* Wait queue for a dax range to become free */ + wait_queue_head_t range_waitq; + + /* DAX Window Free Ranges */ + long nr_free_ranges; + struct list_head free_ranges; + + unsigned long nr_ranges; +}; + +static inline struct fuse_dax_mapping * +node_to_dmap(struct interval_tree_node *node) +{ + if (!node) + return NULL; + + return container_of(node, struct fuse_dax_mapping, itn); +} + +static struct fuse_dax_mapping * +alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode); + +static void +__kick_dmap_free_worker(struct fuse_conn_dax *fcd, unsigned long delay_ms) +{ + unsigned long free_threshold; + + /* If number of free ranges are below threshold, start reclaim */ + free_threshold = max_t(unsigned long, fcd->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD / 100, + 1); + if (fcd->nr_free_ranges < free_threshold) + queue_delayed_work(system_long_wq, &fcd->free_work, + msecs_to_jiffies(delay_ms)); +} + +static void kick_dmap_free_worker(struct fuse_conn_dax *fcd, + unsigned long delay_ms) +{ + spin_lock(&fcd->lock); + __kick_dmap_free_worker(fcd, delay_ms); + spin_unlock(&fcd->lock); +} + +static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd) +{ + struct fuse_dax_mapping *dmap; + + spin_lock(&fcd->lock); + dmap = list_first_entry_or_null(&fcd->free_ranges, + struct fuse_dax_mapping, list); + if (dmap) { + list_del_init(&dmap->list); + WARN_ON(fcd->nr_free_ranges <= 0); + fcd->nr_free_ranges--; + } + __kick_dmap_free_worker(fcd, 0); + spin_unlock(&fcd->lock); + + return dmap; +} + +/* This assumes fcd->lock is held */ +static void __dmap_remove_busy_list(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + list_del_init(&dmap->busy_list); + WARN_ON(fcd->nr_busy_ranges == 0); + fcd->nr_busy_ranges--; +} + +static void dmap_remove_busy_list(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + spin_lock(&fcd->lock); + __dmap_remove_busy_list(fcd, dmap); + spin_unlock(&fcd->lock); +} + +/* This assumes fcd->lock is held */ +static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + list_add_tail(&dmap->list, &fcd->free_ranges); + fcd->nr_free_ranges++; + wake_up(&fcd->range_waitq); +} + +static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + /* Return fuse_dax_mapping to free list */ + spin_lock(&fcd->lock); + __dmap_add_to_free_pool(fcd, dmap); + spin_unlock(&fcd->lock); +} + +static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx, + struct fuse_dax_mapping *dmap, bool writable, + bool upgrade) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn_dax *fcd = fm->fc->dax; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_setupmapping_in inarg; + loff_t offset = start_idx << FUSE_DAX_SHIFT; + FUSE_ARGS(args); + ssize_t err; + + WARN_ON(fcd->nr_free_ranges < 0); + + /* Ask fuse daemon to setup mapping */ + memset(&inarg, 0, sizeof(inarg)); + inarg.foffset = offset; + inarg.fh = -1; + inarg.moffset = dmap->window_offset; + inarg.len = FUSE_DAX_SZ; + inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ; + if (writable) + inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE; + args.opcode = FUSE_SETUPMAPPING; + args.nodeid = fi->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + err = fuse_simple_request(fm, &args); + if (err < 0) + return err; + dmap->writable = writable; + if (!upgrade) { + /* + * We don't take a reference on inode. inode is valid right now + * and when inode is going away, cleanup logic should first + * cleanup dmap entries. + */ + dmap->inode = inode; + dmap->itn.start = dmap->itn.last = start_idx; + /* Protected by fi->dax->sem */ + interval_tree_insert(&dmap->itn, &fi->dax->tree); + fi->dax->nr++; + spin_lock(&fcd->lock); + list_add_tail(&dmap->busy_list, &fcd->busy_ranges); + fcd->nr_busy_ranges++; + spin_unlock(&fcd->lock); + } + return 0; +} + +static int fuse_send_removemapping(struct inode *inode, + struct fuse_removemapping_in *inargp, + struct fuse_removemapping_one *remove_one) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_mount *fm = get_fuse_mount(inode); + FUSE_ARGS(args); + + args.opcode = FUSE_REMOVEMAPPING; + args.nodeid = fi->nodeid; + args.in_numargs = 2; + args.in_args[0].size = sizeof(*inargp); + args.in_args[0].value = inargp; + args.in_args[1].size = inargp->count * sizeof(*remove_one); + args.in_args[1].value = remove_one; + return fuse_simple_request(fm, &args); +} + +static int dmap_removemapping_list(struct inode *inode, unsigned int num, + struct list_head *to_remove) +{ + struct fuse_removemapping_one *remove_one, *ptr; + struct fuse_removemapping_in inarg; + struct fuse_dax_mapping *dmap; + int ret, i = 0, nr_alloc; + + nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY); + remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS); + if (!remove_one) + return -ENOMEM; + + ptr = remove_one; + list_for_each_entry(dmap, to_remove, list) { + ptr->moffset = dmap->window_offset; + ptr->len = dmap->length; + ptr++; + i++; + num--; + if (i >= nr_alloc || num == 0) { + memset(&inarg, 0, sizeof(inarg)); + inarg.count = i; + ret = fuse_send_removemapping(inode, &inarg, + remove_one); + if (ret) + goto out; + ptr = remove_one; + i = 0; + } + } +out: + kfree(remove_one); + return ret; +} + +/* + * Cleanup dmap entry and add back to free list. This should be called with + * fcd->lock held. + */ +static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n", + dmap->itn.start, dmap->itn.last, dmap->window_offset, + dmap->length); + __dmap_remove_busy_list(fcd, dmap); + dmap->inode = NULL; + dmap->itn.start = dmap->itn.last = 0; + __dmap_add_to_free_pool(fcd, dmap); +} + +/* + * Free inode dmap entries whose range falls inside [start, end]. + * Does not take any locks. At this point of time it should only be + * called from evict_inode() path where we know all dmap entries can be + * reclaimed. + */ +static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd, + struct inode *inode, + loff_t start, loff_t end) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap, *n; + int err, num = 0; + LIST_HEAD(to_remove); + unsigned long start_idx = start >> FUSE_DAX_SHIFT; + unsigned long end_idx = end >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + while (1) { + node = interval_tree_iter_first(&fi->dax->tree, start_idx, + end_idx); + if (!node) + break; + dmap = node_to_dmap(node); + /* inode is going away. There should not be any users of dmap */ + WARN_ON(refcount_read(&dmap->refcnt) > 1); + interval_tree_remove(&dmap->itn, &fi->dax->tree); + num++; + list_add(&dmap->list, &to_remove); + } + + /* Nothing to remove */ + if (list_empty(&to_remove)) + return; + + WARN_ON(fi->dax->nr < num); + fi->dax->nr -= num; + err = dmap_removemapping_list(inode, num, &to_remove); + if (err && err != -ENOTCONN) { + pr_warn("Failed to removemappings. start=0x%llx end=0x%llx\n", + start, end); + } + spin_lock(&fcd->lock); + list_for_each_entry_safe(dmap, n, &to_remove, list) { + list_del_init(&dmap->list); + dmap_reinit_add_to_free_pool(fcd, dmap); + } + spin_unlock(&fcd->lock); +} + +static int dmap_removemapping_one(struct inode *inode, + struct fuse_dax_mapping *dmap) +{ + struct fuse_removemapping_one forget_one; + struct fuse_removemapping_in inarg; + + memset(&inarg, 0, sizeof(inarg)); + inarg.count = 1; + memset(&forget_one, 0, sizeof(forget_one)); + forget_one.moffset = dmap->window_offset; + forget_one.len = dmap->length; + + return fuse_send_removemapping(inode, &inarg, &forget_one); +} + +/* + * It is called from evict_inode() and by that time inode is going away. So + * this function does not take any locks like fi->dax->sem for traversing + * that fuse inode interval tree. If that lock is taken then lock validator + * complains of deadlock situation w.r.t fs_reclaim lock. + */ +void fuse_dax_inode_cleanup(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + /* + * fuse_evict_inode() has already called truncate_inode_pages_final() + * before we arrive here. So we should not have to worry about any + * pages/exception entries still associated with inode. + */ + inode_reclaim_dmap_range(fc->dax, inode, 0, -1); + WARN_ON(fi->dax->nr); +} + +static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length) +{ + iomap->addr = IOMAP_NULL_ADDR; + iomap->length = length; + iomap->type = IOMAP_HOLE; +} + +static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap, struct fuse_dax_mapping *dmap, + unsigned int flags) +{ + loff_t offset, len; + loff_t i_size = i_size_read(inode); + + offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT); + len = min(length, dmap->length - offset); + + /* If length is beyond end of file, truncate further */ + if (pos + len > i_size) + len = i_size - pos; + + if (len > 0) { + iomap->addr = dmap->window_offset + offset; + iomap->length = len; + if (flags & IOMAP_FAULT) + iomap->length = ALIGN(len, PAGE_SIZE); + iomap->type = IOMAP_MAPPED; + /* + * increace refcnt so that reclaim code knows this dmap is in + * use. This assumes fi->dax->sem mutex is held either + * shared/exclusive. + */ + refcount_inc(&dmap->refcnt); + + /* iomap->private should be NULL */ + WARN_ON_ONCE(iomap->private); + iomap->private = dmap; + } else { + /* Mapping beyond end of file is hole */ + fuse_fill_iomap_hole(iomap, length); + } +} + +static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos, + loff_t length, unsigned int flags, + struct iomap *iomap) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_conn_dax *fcd = fc->dax; + struct fuse_dax_mapping *dmap, *alloc_dmap = NULL; + int ret; + bool writable = flags & IOMAP_WRITE; + unsigned long start_idx = pos >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + /* + * Can't do inline reclaim in fault path. We call + * dax_layout_busy_page() before we free a range. And + * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it. + * In fault path we enter with mapping->invalidate_lock held and can't + * drop it. Also in fault path we hold mapping->invalidate_lock shared + * and not exclusive, so that creates further issues with + * fuse_wait_dax_page(). Hence return -EAGAIN and fuse_dax_fault() + * will wait for a memory range to become free and retry. + */ + if (flags & IOMAP_FAULT) { + alloc_dmap = alloc_dax_mapping(fcd); + if (!alloc_dmap) + return -EAGAIN; + } else { + alloc_dmap = alloc_dax_mapping_reclaim(fcd, inode); + if (IS_ERR(alloc_dmap)) + return PTR_ERR(alloc_dmap); + } + + /* If we are here, we should have memory allocated */ + if (WARN_ON(!alloc_dmap)) + return -EIO; + + /* + * Take write lock so that only one caller can try to setup mapping + * and other waits. + */ + down_write(&fi->dax->sem); + /* + * We dropped lock. Check again if somebody else setup + * mapping already. + */ + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + if (node) { + dmap = node_to_dmap(node); + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); + dmap_add_to_free_pool(fcd, alloc_dmap); + up_write(&fi->dax->sem); + return 0; + } + + /* Setup one mapping */ + ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap, + writable, false); + if (ret < 0) { + dmap_add_to_free_pool(fcd, alloc_dmap); + up_write(&fi->dax->sem); + return ret; + } + fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags); + up_write(&fi->dax->sem); + return 0; +} + +static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos, + loff_t length, unsigned int flags, + struct iomap *iomap) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + int ret; + unsigned long idx = pos >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + /* + * Take exclusive lock so that only one caller can try to setup + * mapping and others wait. + */ + down_write(&fi->dax->sem); + node = interval_tree_iter_first(&fi->dax->tree, idx, idx); + + /* We are holding either inode lock or invalidate_lock, and that should + * ensure that dmap can't be truncated. We are holding a reference + * on dmap and that should make sure it can't be reclaimed. So dmap + * should still be there in tree despite the fact we dropped and + * re-acquired the fi->dax->sem lock. + */ + ret = -EIO; + if (WARN_ON(!node)) + goto out_err; + + dmap = node_to_dmap(node); + + /* We took an extra reference on dmap to make sure its not reclaimd. + * Now we hold fi->dax->sem lock and that reference is not needed + * anymore. Drop it. + */ + if (refcount_dec_and_test(&dmap->refcnt)) { + /* refcount should not hit 0. This object only goes + * away when fuse connection goes away + */ + WARN_ON_ONCE(1); + } + + /* Maybe another thread already upgraded mapping while we were not + * holding lock. + */ + if (dmap->writable) { + ret = 0; + goto out_fill_iomap; + } + + ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true, + true); + if (ret < 0) + goto out_err; +out_fill_iomap: + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); +out_err: + up_write(&fi->dax->sem); + return ret; +} + +/* This is just for DAX and the mapping is ephemeral, do not use it for other + * purposes since there is no block device with a permanent mapping. + */ +static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_dax_mapping *dmap; + bool writable = flags & IOMAP_WRITE; + unsigned long start_idx = pos >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + /* We don't support FIEMAP */ + if (WARN_ON(flags & IOMAP_REPORT)) + return -EIO; + + iomap->offset = pos; + iomap->flags = 0; + iomap->bdev = NULL; + iomap->dax_dev = fc->dax->dev; + + /* + * Both read/write and mmap path can race here. So we need something + * to make sure if we are setting up mapping, then other path waits + * + * For now, use a semaphore for this. It probably needs to be + * optimized later. + */ + down_read(&fi->dax->sem); + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + if (node) { + dmap = node_to_dmap(node); + if (writable && !dmap->writable) { + /* Upgrade read-only mapping to read-write. This will + * require exclusive fi->dax->sem lock as we don't want + * two threads to be trying to this simultaneously + * for same dmap. So drop shared lock and acquire + * exclusive lock. + * + * Before dropping fi->dax->sem lock, take reference + * on dmap so that its not freed by range reclaim. + */ + refcount_inc(&dmap->refcnt); + up_read(&fi->dax->sem); + pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n", + __func__, pos, length); + return fuse_upgrade_dax_mapping(inode, pos, length, + flags, iomap); + } else { + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); + up_read(&fi->dax->sem); + return 0; + } + } else { + up_read(&fi->dax->sem); + pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n", + __func__, pos, length); + if (pos >= i_size_read(inode)) + goto iomap_hole; + + return fuse_setup_new_dax_mapping(inode, pos, length, flags, + iomap); + } + + /* + * If read beyond end of file happens, fs code seems to return + * it as hole + */ +iomap_hole: + fuse_fill_iomap_hole(iomap, length); + pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n", + __func__, pos, length, iomap->length); + return 0; +} + +static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, + struct iomap *iomap) +{ + struct fuse_dax_mapping *dmap = iomap->private; + + if (dmap) { + if (refcount_dec_and_test(&dmap->refcnt)) { + /* refcount should not hit 0. This object only goes + * away when fuse connection goes away + */ + WARN_ON_ONCE(1); + } + } + + /* DAX writes beyond end-of-file aren't handled using iomap, so the + * file size is unchanged and there is nothing to do here. + */ + return 0; +} + +static const struct iomap_ops fuse_iomap_ops = { + .iomap_begin = fuse_iomap_begin, + .iomap_end = fuse_iomap_end, +}; + +static void fuse_wait_dax_page(struct inode *inode) +{ + filemap_invalidate_unlock(inode->i_mapping); + schedule(); + filemap_invalidate_lock(inode->i_mapping); +} + +/* Should be called with mapping->invalidate_lock held exclusively */ +static int __fuse_dax_break_layouts(struct inode *inode, bool *retry, + loff_t start, loff_t end) +{ + struct page *page; + + page = dax_layout_busy_page_range(inode->i_mapping, start, end); + if (!page) + return 0; + + *retry = true; + return ___wait_var_event(&page->_refcount, + atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, + 0, 0, fuse_wait_dax_page(inode)); +} + +/* dmap_end == 0 leads to unmapping of whole file */ +int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, + u64 dmap_end) +{ + bool retry; + int ret; + + do { + retry = false; + ret = __fuse_dax_break_layouts(inode, &retry, dmap_start, + dmap_end); + } while (ret == 0 && retry); + + return ret; +} + +ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + + ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops); + inode_unlock_shared(inode); + + /* TODO file_accessed(iocb->f_filp) */ + return ret; +} + +static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return (iov_iter_rw(from) == WRITE && + ((iocb->ki_pos) >= i_size_read(inode) || + (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode)))); +} + +static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + ssize_t ret; + + ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); + + fuse_write_update_attr(inode, iocb->ki_pos, ret); + return ret; +} + +ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + + ret = file_remove_privs(iocb->ki_filp); + if (ret) + goto out; + /* TODO file_update_time() but we don't want metadata I/O */ + + /* Do not use dax for file extending writes as write and on + * disk i_size increase are not atomic otherwise. + */ + if (file_extending_write(iocb, from)) + ret = fuse_dax_direct_write(iocb, from); + else + ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops); + +out: + inode_unlock(inode); + + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} + +static int fuse_dax_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + + return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc); +} + +static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, + enum page_entry_size pe_size, bool write) +{ + vm_fault_t ret; + struct inode *inode = file_inode(vmf->vma->vm_file); + struct super_block *sb = inode->i_sb; + pfn_t pfn; + int error = 0; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_conn_dax *fcd = fc->dax; + bool retry = false; + + if (write) + sb_start_pagefault(sb); +retry: + if (retry && !(fcd->nr_free_ranges > 0)) + wait_event(fcd->range_waitq, (fcd->nr_free_ranges > 0)); + + /* + * We need to serialize against not only truncate but also against + * fuse dax memory range reclaim. While a range is being reclaimed, + * we do not want any read/write/mmap to make progress and try + * to populate page cache or access memory we are trying to free. + */ + filemap_invalidate_lock_shared(inode->i_mapping); + ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); + if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) { + error = 0; + retry = true; + filemap_invalidate_unlock_shared(inode->i_mapping); + goto retry; + } + + if (ret & VM_FAULT_NEEDDSYNC) + ret = dax_finish_sync_fault(vmf, pe_size, pfn); + filemap_invalidate_unlock_shared(inode->i_mapping); + + if (write) + sb_end_pagefault(sb); + + return ret; +} + +static vm_fault_t fuse_dax_fault(struct vm_fault *vmf) +{ + return __fuse_dax_fault(vmf, PE_SIZE_PTE, + vmf->flags & FAULT_FLAG_WRITE); +} + +static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) +{ + return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE); +} + +static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf) +{ + return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); +} + +static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf) +{ + return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); +} + +static const struct vm_operations_struct fuse_dax_vm_ops = { + .fault = fuse_dax_fault, + .huge_fault = fuse_dax_huge_fault, + .page_mkwrite = fuse_dax_page_mkwrite, + .pfn_mkwrite = fuse_dax_pfn_mkwrite, +}; + +int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &fuse_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + return 0; +} + +static int dmap_writeback_invalidate(struct inode *inode, + struct fuse_dax_mapping *dmap) +{ + int ret; + loff_t start_pos = dmap->itn.start << FUSE_DAX_SHIFT; + loff_t end_pos = (start_pos + FUSE_DAX_SZ - 1); + + ret = filemap_fdatawrite_range(inode->i_mapping, start_pos, end_pos); + if (ret) { + pr_debug("fuse: filemap_fdatawrite_range() failed. err=%d start_pos=0x%llx, end_pos=0x%llx\n", + ret, start_pos, end_pos); + return ret; + } + + ret = invalidate_inode_pages2_range(inode->i_mapping, + start_pos >> PAGE_SHIFT, + end_pos >> PAGE_SHIFT); + if (ret) + pr_debug("fuse: invalidate_inode_pages2_range() failed err=%d\n", + ret); + + return ret; +} + +static int reclaim_one_dmap_locked(struct inode *inode, + struct fuse_dax_mapping *dmap) +{ + int ret; + struct fuse_inode *fi = get_fuse_inode(inode); + + /* + * igrab() was done to make sure inode won't go under us, and this + * further avoids the race with evict(). + */ + ret = dmap_writeback_invalidate(inode, dmap); + if (ret) + return ret; + + /* Remove dax mapping from inode interval tree now */ + interval_tree_remove(&dmap->itn, &fi->dax->tree); + fi->dax->nr--; + + /* It is possible that umount/shutdown has killed the fuse connection + * and worker thread is trying to reclaim memory in parallel. Don't + * warn in that case. + */ + ret = dmap_removemapping_one(inode, dmap); + if (ret && ret != -ENOTCONN) { + pr_warn("Failed to remove mapping. offset=0x%llx len=0x%llx ret=%d\n", + dmap->window_offset, dmap->length, ret); + } + return 0; +} + +/* Find first mapped dmap for an inode and return file offset. Caller needs + * to hold fi->dax->sem lock either shared or exclusive. + */ +static struct fuse_dax_mapping *inode_lookup_first_dmap(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + struct interval_tree_node *node; + + for (node = interval_tree_iter_first(&fi->dax->tree, 0, -1); node; + node = interval_tree_iter_next(node, 0, -1)) { + dmap = node_to_dmap(node); + /* still in use. */ + if (refcount_read(&dmap->refcnt) > 1) + continue; + + return dmap; + } + + return NULL; +} + +/* + * Find first mapping in the tree and free it and return it. Do not add + * it back to free pool. + */ +static struct fuse_dax_mapping * +inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode, + bool *retry) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + u64 dmap_start, dmap_end; + unsigned long start_idx; + int ret; + struct interval_tree_node *node; + + filemap_invalidate_lock(inode->i_mapping); + + /* Lookup a dmap and corresponding file offset to reclaim. */ + down_read(&fi->dax->sem); + dmap = inode_lookup_first_dmap(inode); + if (dmap) { + start_idx = dmap->itn.start; + dmap_start = start_idx << FUSE_DAX_SHIFT; + dmap_end = dmap_start + FUSE_DAX_SZ - 1; + } + up_read(&fi->dax->sem); + + if (!dmap) + goto out_mmap_sem; + /* + * Make sure there are no references to inode pages using + * get_user_pages() + */ + ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end); + if (ret) { + pr_debug("fuse: fuse_dax_break_layouts() failed. err=%d\n", + ret); + dmap = ERR_PTR(ret); + goto out_mmap_sem; + } + + down_write(&fi->dax->sem); + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + /* Range already got reclaimed by somebody else */ + if (!node) { + if (retry) + *retry = true; + goto out_write_dmap_sem; + } + + dmap = node_to_dmap(node); + /* still in use. */ + if (refcount_read(&dmap->refcnt) > 1) { + dmap = NULL; + if (retry) + *retry = true; + goto out_write_dmap_sem; + } + + ret = reclaim_one_dmap_locked(inode, dmap); + if (ret < 0) { + dmap = ERR_PTR(ret); + goto out_write_dmap_sem; + } + + /* Clean up dmap. Do not add back to free list */ + dmap_remove_busy_list(fcd, dmap); + dmap->inode = NULL; + dmap->itn.start = dmap->itn.last = 0; + + pr_debug("fuse: %s: inline reclaimed memory range. inode=%p, window_offset=0x%llx, length=0x%llx\n", + __func__, inode, dmap->window_offset, dmap->length); + +out_write_dmap_sem: + up_write(&fi->dax->sem); +out_mmap_sem: + filemap_invalidate_unlock(inode->i_mapping); + return dmap; +} + +static struct fuse_dax_mapping * +alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode) +{ + struct fuse_dax_mapping *dmap; + struct fuse_inode *fi = get_fuse_inode(inode); + + while (1) { + bool retry = false; + + dmap = alloc_dax_mapping(fcd); + if (dmap) + return dmap; + + dmap = inode_inline_reclaim_one_dmap(fcd, inode, &retry); + /* + * Either we got a mapping or it is an error, return in both + * the cases. + */ + if (dmap) + return dmap; + + /* If we could not reclaim a mapping because it + * had a reference or some other temporary failure, + * Try again. We want to give up inline reclaim only + * if there is no range assigned to this node. Otherwise + * if a deadlock is possible if we sleep with + * mapping->invalidate_lock held and worker to free memory + * can't make progress due to unavailability of + * mapping->invalidate_lock. So sleep only if fi->dax->nr=0 + */ + if (retry) + continue; + /* + * There are no mappings which can be reclaimed. Wait for one. + * We are not holding fi->dax->sem. So it is possible + * that range gets added now. But as we are not holding + * mapping->invalidate_lock, worker should still be able to + * free up a range and wake us up. + */ + if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) { + if (wait_event_killable_exclusive(fcd->range_waitq, + (fcd->nr_free_ranges > 0))) { + return ERR_PTR(-EINTR); + } + } + } +} + +static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd, + struct inode *inode, + unsigned long start_idx) +{ + int ret; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + struct interval_tree_node *node; + + /* Find fuse dax mapping at file offset inode. */ + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + + /* Range already got cleaned up by somebody else */ + if (!node) + return 0; + dmap = node_to_dmap(node); + + /* still in use. */ + if (refcount_read(&dmap->refcnt) > 1) + return 0; + + ret = reclaim_one_dmap_locked(inode, dmap); + if (ret < 0) + return ret; + + /* Cleanup dmap entry and add back to free list */ + spin_lock(&fcd->lock); + dmap_reinit_add_to_free_pool(fcd, dmap); + spin_unlock(&fcd->lock); + return ret; +} + +/* + * Free a range of memory. + * Locking: + * 1. Take mapping->invalidate_lock to block dax faults. + * 2. Take fi->dax->sem to protect interval tree and also to make sure + * read/write can not reuse a dmap which we might be freeing. + */ +static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd, + struct inode *inode, + unsigned long start_idx, + unsigned long end_idx) +{ + int ret; + struct fuse_inode *fi = get_fuse_inode(inode); + loff_t dmap_start = start_idx << FUSE_DAX_SHIFT; + loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1; + + filemap_invalidate_lock(inode->i_mapping); + ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end); + if (ret) { + pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n", + ret); + goto out_mmap_sem; + } + + down_write(&fi->dax->sem); + ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx); + up_write(&fi->dax->sem); +out_mmap_sem: + filemap_invalidate_unlock(inode->i_mapping); + return ret; +} + +static int try_to_free_dmap_chunks(struct fuse_conn_dax *fcd, + unsigned long nr_to_free) +{ + struct fuse_dax_mapping *dmap, *pos, *temp; + int ret, nr_freed = 0; + unsigned long start_idx = 0, end_idx = 0; + struct inode *inode = NULL; + + /* Pick first busy range and free it for now*/ + while (1) { + if (nr_freed >= nr_to_free) + break; + + dmap = NULL; + spin_lock(&fcd->lock); + + if (!fcd->nr_busy_ranges) { + spin_unlock(&fcd->lock); + return 0; + } + + list_for_each_entry_safe(pos, temp, &fcd->busy_ranges, + busy_list) { + /* skip this range if it's in use. */ + if (refcount_read(&pos->refcnt) > 1) + continue; + + inode = igrab(pos->inode); + /* + * This inode is going away. That will free + * up all the ranges anyway, continue to + * next range. + */ + if (!inode) + continue; + /* + * Take this element off list and add it tail. If + * this element can't be freed, it will help with + * selecting new element in next iteration of loop. + */ + dmap = pos; + list_move_tail(&dmap->busy_list, &fcd->busy_ranges); + start_idx = end_idx = dmap->itn.start; + break; + } + spin_unlock(&fcd->lock); + if (!dmap) + return 0; + + ret = lookup_and_reclaim_dmap(fcd, inode, start_idx, end_idx); + iput(inode); + if (ret) + return ret; + nr_freed++; + } + return 0; +} + +static void fuse_dax_free_mem_worker(struct work_struct *work) +{ + int ret; + struct fuse_conn_dax *fcd = container_of(work, struct fuse_conn_dax, + free_work.work); + ret = try_to_free_dmap_chunks(fcd, FUSE_DAX_RECLAIM_CHUNK); + if (ret) { + pr_debug("fuse: try_to_free_dmap_chunks() failed with err=%d\n", + ret); + } + + /* If number of free ranges are still below threshold, requeue */ + kick_dmap_free_worker(fcd, 1); +} + +static void fuse_free_dax_mem_ranges(struct list_head *mem_list) +{ + struct fuse_dax_mapping *range, *temp; + + /* Free All allocated elements */ + list_for_each_entry_safe(range, temp, mem_list, list) { + list_del(&range->list); + if (!list_empty(&range->busy_list)) + list_del(&range->busy_list); + kfree(range); + } +} + +void fuse_dax_conn_free(struct fuse_conn *fc) +{ + if (fc->dax) { + fuse_free_dax_mem_ranges(&fc->dax->free_ranges); + kfree(fc->dax); + fc->dax = NULL; + } +} + +static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd) +{ + long nr_pages, nr_ranges; + struct fuse_dax_mapping *range; + int ret, id; + size_t dax_size = -1; + unsigned long i; + + init_waitqueue_head(&fcd->range_waitq); + INIT_LIST_HEAD(&fcd->free_ranges); + INIT_LIST_HEAD(&fcd->busy_ranges); + INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker); + + id = dax_read_lock(); + nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), + DAX_ACCESS, NULL, NULL); + dax_read_unlock(id); + if (nr_pages < 0) { + pr_debug("dax_direct_access() returned %ld\n", nr_pages); + return nr_pages; + } + + nr_ranges = nr_pages/FUSE_DAX_PAGES; + pr_debug("%s: dax mapped %ld pages. nr_ranges=%ld\n", + __func__, nr_pages, nr_ranges); + + for (i = 0; i < nr_ranges; i++) { + range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL); + ret = -ENOMEM; + if (!range) + goto out_err; + + /* TODO: This offset only works if virtio-fs driver is not + * having some memory hidden at the beginning. This needs + * better handling + */ + range->window_offset = i * FUSE_DAX_SZ; + range->length = FUSE_DAX_SZ; + INIT_LIST_HEAD(&range->busy_list); + refcount_set(&range->refcnt, 1); + list_add_tail(&range->list, &fcd->free_ranges); + } + + fcd->nr_free_ranges = nr_ranges; + fcd->nr_ranges = nr_ranges; + return 0; +out_err: + /* Free All allocated elements */ + fuse_free_dax_mem_ranges(&fcd->free_ranges); + return ret; +} + +int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode dax_mode, + struct dax_device *dax_dev) +{ + struct fuse_conn_dax *fcd; + int err; + + fc->dax_mode = dax_mode; + + if (!dax_dev) + return 0; + + fcd = kzalloc(sizeof(*fcd), GFP_KERNEL); + if (!fcd) + return -ENOMEM; + + spin_lock_init(&fcd->lock); + fcd->dev = dax_dev; + err = fuse_dax_mem_range_init(fcd); + if (err) { + kfree(fcd); + return err; + } + + fc->dax = fcd; + return 0; +} + +bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + fi->dax = NULL; + if (fc->dax) { + fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT); + if (!fi->dax) + return false; + + init_rwsem(&fi->dax->sem); + fi->dax->tree = RB_ROOT_CACHED; + } + + return true; +} + +static const struct address_space_operations fuse_dax_file_aops = { + .writepages = fuse_dax_writepages, + .direct_IO = noop_direct_IO, + .dirty_folio = noop_dirty_folio, +}; + +static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + enum fuse_dax_mode dax_mode = fc->dax_mode; + + if (dax_mode == FUSE_DAX_NEVER) + return false; + + /* + * fc->dax may be NULL in 'inode' mode when filesystem device doesn't + * support DAX, in which case it will silently fallback to 'never' mode. + */ + if (!fc->dax) + return false; + + if (dax_mode == FUSE_DAX_ALWAYS) + return true; + + /* dax_mode is FUSE_DAX_INODE* */ + return fc->inode_dax && (flags & FUSE_ATTR_DAX); +} + +void fuse_dax_inode_init(struct inode *inode, unsigned int flags) +{ + if (!fuse_should_enable_dax(inode, flags)) + return; + + inode->i_flags |= S_DAX; + inode->i_data.a_ops = &fuse_dax_file_aops; +} + +void fuse_dax_dontcache(struct inode *inode, unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + if (fuse_is_inode_dax_mode(fc->dax_mode) && + ((bool) IS_DAX(inode) != (bool) (flags & FUSE_ATTR_DAX))) + d_mark_dontcache(inode); +} + +bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment) +{ + if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) { + pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n", + map_alignment, FUSE_DAX_SZ); + return false; + } + return true; +} + +void fuse_dax_cancel_work(struct fuse_conn *fc) +{ + struct fuse_conn_dax *fcd = fc->dax; + + if (fcd) + cancel_delayed_work_sync(&fcd->free_work); + +} +EXPORT_SYMBOL_GPL(fuse_dax_cancel_work); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c new file mode 100644 index 000000000..b4a6e0a1b --- /dev/null +++ b/fs/fuse/dev.c @@ -0,0 +1,2337 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_ALIAS_MISCDEV(FUSE_MINOR); +MODULE_ALIAS("devname:fuse"); + +/* Ordinary requests have even IDs, while interrupts IDs are odd */ +#define FUSE_INT_REQ_BIT (1ULL << 0) +#define FUSE_REQ_ID_STEP (1ULL << 1) + +static struct kmem_cache *fuse_req_cachep; + +static struct fuse_dev *fuse_get_dev(struct file *file) +{ + /* + * Lockless access is OK, because file->private data is set + * once during mount and is valid until the file is released. + */ + return READ_ONCE(file->private_data); +} + +static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) +{ + INIT_LIST_HEAD(&req->list); + INIT_LIST_HEAD(&req->intr_entry); + init_waitqueue_head(&req->waitq); + refcount_set(&req->count, 1); + __set_bit(FR_PENDING, &req->flags); + req->fm = fm; +} + +static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags) +{ + struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); + if (req) + fuse_request_init(fm, req); + + return req; +} + +static void fuse_request_free(struct fuse_req *req) +{ + kmem_cache_free(fuse_req_cachep, req); +} + +static void __fuse_get_request(struct fuse_req *req) +{ + refcount_inc(&req->count); +} + +/* Must be called with > 1 refcount */ +static void __fuse_put_request(struct fuse_req *req) +{ + refcount_dec(&req->count); +} + +void fuse_set_initialized(struct fuse_conn *fc) +{ + /* Make sure stores before this are seen on another CPU */ + smp_wmb(); + fc->initialized = 1; +} + +static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) +{ + return !fc->initialized || (for_background && fc->blocked); +} + +static void fuse_drop_waiting(struct fuse_conn *fc) +{ + /* + * lockess check of fc->connected is okay, because atomic_dec_and_test() + * provides a memory barrier matched with the one in fuse_wait_aborted() + * to ensure no wake-up is missed. + */ + if (atomic_dec_and_test(&fc->num_waiting) && + !READ_ONCE(fc->connected)) { + /* wake up aborters */ + wake_up_all(&fc->blocked_waitq); + } +} + +static void fuse_put_request(struct fuse_req *req); + +static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background) +{ + struct fuse_conn *fc = fm->fc; + struct fuse_req *req; + int err; + atomic_inc(&fc->num_waiting); + + if (fuse_block_alloc(fc, for_background)) { + err = -EINTR; + if (wait_event_killable_exclusive(fc->blocked_waitq, + !fuse_block_alloc(fc, for_background))) + goto out; + } + /* Matches smp_wmb() in fuse_set_initialized() */ + smp_rmb(); + + err = -ENOTCONN; + if (!fc->connected) + goto out; + + err = -ECONNREFUSED; + if (fc->conn_error) + goto out; + + req = fuse_request_alloc(fm, GFP_KERNEL); + err = -ENOMEM; + if (!req) { + if (for_background) + wake_up(&fc->blocked_waitq); + goto out; + } + + req->in.h.uid = from_kuid(fc->user_ns, current_fsuid()); + req->in.h.gid = from_kgid(fc->user_ns, current_fsgid()); + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); + + __set_bit(FR_WAITING, &req->flags); + if (for_background) + __set_bit(FR_BACKGROUND, &req->flags); + + if (unlikely(req->in.h.uid == ((uid_t)-1) || + req->in.h.gid == ((gid_t)-1))) { + fuse_put_request(req); + return ERR_PTR(-EOVERFLOW); + } + return req; + + out: + fuse_drop_waiting(fc); + return ERR_PTR(err); +} + +static void fuse_put_request(struct fuse_req *req) +{ + struct fuse_conn *fc = req->fm->fc; + + if (refcount_dec_and_test(&req->count)) { + if (test_bit(FR_BACKGROUND, &req->flags)) { + /* + * We get here in the unlikely case that a background + * request was allocated but not sent + */ + spin_lock(&fc->bg_lock); + if (!fc->blocked) + wake_up(&fc->blocked_waitq); + spin_unlock(&fc->bg_lock); + } + + if (test_bit(FR_WAITING, &req->flags)) { + __clear_bit(FR_WAITING, &req->flags); + fuse_drop_waiting(fc); + } + + fuse_request_free(req); + } +} + +unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args) +{ + unsigned nbytes = 0; + unsigned i; + + for (i = 0; i < numargs; i++) + nbytes += args[i].size; + + return nbytes; +} +EXPORT_SYMBOL_GPL(fuse_len_args); + +u64 fuse_get_unique(struct fuse_iqueue *fiq) +{ + fiq->reqctr += FUSE_REQ_ID_STEP; + return fiq->reqctr; +} +EXPORT_SYMBOL_GPL(fuse_get_unique); + +static unsigned int fuse_req_hash(u64 unique) +{ + return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); +} + +/** + * A new request is available, wake fiq->waitq + */ +static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq) +__releases(fiq->lock) +{ + wake_up(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + spin_unlock(&fiq->lock); +} + +const struct fuse_iqueue_ops fuse_dev_fiq_ops = { + .wake_forget_and_unlock = fuse_dev_wake_and_unlock, + .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock, + .wake_pending_and_unlock = fuse_dev_wake_and_unlock, +}; +EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops); + +static void queue_request_and_unlock(struct fuse_iqueue *fiq, + struct fuse_req *req) +__releases(fiq->lock) +{ + req->in.h.len = sizeof(struct fuse_in_header) + + fuse_len_args(req->args->in_numargs, + (struct fuse_arg *) req->args->in_args); + list_add_tail(&req->list, &fiq->pending); + fiq->ops->wake_pending_and_unlock(fiq); +} + +void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, + u64 nodeid, u64 nlookup) +{ + struct fuse_iqueue *fiq = &fc->iq; + + forget->forget_one.nodeid = nodeid; + forget->forget_one.nlookup = nlookup; + + spin_lock(&fiq->lock); + if (fiq->connected) { + fiq->forget_list_tail->next = forget; + fiq->forget_list_tail = forget; + fiq->ops->wake_forget_and_unlock(fiq); + } else { + kfree(forget); + spin_unlock(&fiq->lock); + } +} + +static void flush_bg_queue(struct fuse_conn *fc) +{ + struct fuse_iqueue *fiq = &fc->iq; + + while (fc->active_background < fc->max_background && + !list_empty(&fc->bg_queue)) { + struct fuse_req *req; + + req = list_first_entry(&fc->bg_queue, struct fuse_req, list); + list_del(&req->list); + fc->active_background++; + spin_lock(&fiq->lock); + req->in.h.unique = fuse_get_unique(fiq); + queue_request_and_unlock(fiq, req); + } +} + +/* + * This function is called when a request is finished. Either a reply + * has arrived or it was aborted (and not yet sent) or some error + * occurred during communication with userspace, or the device file + * was closed. The requester thread is woken up (if still waiting), + * the 'end' callback is called if given, else the reference to the + * request is released + */ +void fuse_request_end(struct fuse_req *req) +{ + struct fuse_mount *fm = req->fm; + struct fuse_conn *fc = fm->fc; + struct fuse_iqueue *fiq = &fc->iq; + + if (test_and_set_bit(FR_FINISHED, &req->flags)) + goto put_request; + + /* + * test_and_set_bit() implies smp_mb() between bit + * changing and below FR_INTERRUPTED check. Pairs with + * smp_mb() from queue_interrupt(). + */ + if (test_bit(FR_INTERRUPTED, &req->flags)) { + spin_lock(&fiq->lock); + list_del_init(&req->intr_entry); + spin_unlock(&fiq->lock); + } + WARN_ON(test_bit(FR_PENDING, &req->flags)); + WARN_ON(test_bit(FR_SENT, &req->flags)); + if (test_bit(FR_BACKGROUND, &req->flags)) { + spin_lock(&fc->bg_lock); + clear_bit(FR_BACKGROUND, &req->flags); + if (fc->num_background == fc->max_background) { + fc->blocked = 0; + wake_up(&fc->blocked_waitq); + } else if (!fc->blocked) { + /* + * Wake up next waiter, if any. It's okay to use + * waitqueue_active(), as we've already synced up + * fc->blocked with waiters with the wake_up() call + * above. + */ + if (waitqueue_active(&fc->blocked_waitq)) + wake_up(&fc->blocked_waitq); + } + + fc->num_background--; + fc->active_background--; + flush_bg_queue(fc); + spin_unlock(&fc->bg_lock); + } else { + /* Wake up waiter sleeping in request_wait_answer() */ + wake_up(&req->waitq); + } + + if (test_bit(FR_ASYNC, &req->flags)) + req->args->end(fm, req->args, req->out.h.error); +put_request: + fuse_put_request(req); +} +EXPORT_SYMBOL_GPL(fuse_request_end); + +static int queue_interrupt(struct fuse_req *req) +{ + struct fuse_iqueue *fiq = &req->fm->fc->iq; + + spin_lock(&fiq->lock); + /* Check for we've sent request to interrupt this req */ + if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { + spin_unlock(&fiq->lock); + return -EINVAL; + } + + if (list_empty(&req->intr_entry)) { + list_add_tail(&req->intr_entry, &fiq->interrupts); + /* + * Pairs with smp_mb() implied by test_and_set_bit() + * from fuse_request_end(). + */ + smp_mb(); + if (test_bit(FR_FINISHED, &req->flags)) { + list_del_init(&req->intr_entry); + spin_unlock(&fiq->lock); + return 0; + } + fiq->ops->wake_interrupt_and_unlock(fiq); + } else { + spin_unlock(&fiq->lock); + } + return 0; +} + +static void request_wait_answer(struct fuse_req *req) +{ + struct fuse_conn *fc = req->fm->fc; + struct fuse_iqueue *fiq = &fc->iq; + int err; + + if (!fc->no_interrupt) { + /* Any signal may interrupt this */ + err = wait_event_interruptible(req->waitq, + test_bit(FR_FINISHED, &req->flags)); + if (!err) + return; + + set_bit(FR_INTERRUPTED, &req->flags); + /* matches barrier in fuse_dev_do_read() */ + smp_mb__after_atomic(); + if (test_bit(FR_SENT, &req->flags)) + queue_interrupt(req); + } + + if (!test_bit(FR_FORCE, &req->flags)) { + /* Only fatal signals may interrupt this */ + err = wait_event_killable(req->waitq, + test_bit(FR_FINISHED, &req->flags)); + if (!err) + return; + + spin_lock(&fiq->lock); + /* Request is not yet in userspace, bail out */ + if (test_bit(FR_PENDING, &req->flags)) { + list_del(&req->list); + spin_unlock(&fiq->lock); + __fuse_put_request(req); + req->out.h.error = -EINTR; + return; + } + spin_unlock(&fiq->lock); + } + + /* + * Either request is already in userspace, or it was forced. + * Wait it out. + */ + wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags)); +} + +static void __fuse_request_send(struct fuse_req *req) +{ + struct fuse_iqueue *fiq = &req->fm->fc->iq; + + BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); + spin_lock(&fiq->lock); + if (!fiq->connected) { + spin_unlock(&fiq->lock); + req->out.h.error = -ENOTCONN; + } else { + req->in.h.unique = fuse_get_unique(fiq); + /* acquire extra reference, since request is still needed + after fuse_request_end() */ + __fuse_get_request(req); + queue_request_and_unlock(fiq, req); + + request_wait_answer(req); + /* Pairs with smp_wmb() in fuse_request_end() */ + smp_rmb(); + } +} + +static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) +{ + if (fc->minor < 4 && args->opcode == FUSE_STATFS) + args->out_args[0].size = FUSE_COMPAT_STATFS_SIZE; + + if (fc->minor < 9) { + switch (args->opcode) { + case FUSE_LOOKUP: + case FUSE_CREATE: + case FUSE_MKNOD: + case FUSE_MKDIR: + case FUSE_SYMLINK: + case FUSE_LINK: + args->out_args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + break; + case FUSE_GETATTR: + case FUSE_SETATTR: + args->out_args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + break; + } + } + if (fc->minor < 12) { + switch (args->opcode) { + case FUSE_CREATE: + args->in_args[0].size = sizeof(struct fuse_open_in); + break; + case FUSE_MKNOD: + args->in_args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE; + break; + } + } +} + +static void fuse_force_creds(struct fuse_req *req) +{ + struct fuse_conn *fc = req->fm->fc; + + req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); + req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); +} + +static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) +{ + req->in.h.opcode = args->opcode; + req->in.h.nodeid = args->nodeid; + req->args = args; + if (args->end) + __set_bit(FR_ASYNC, &req->flags); +} + +ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) +{ + struct fuse_conn *fc = fm->fc; + struct fuse_req *req; + ssize_t ret; + + if (args->force) { + atomic_inc(&fc->num_waiting); + req = fuse_request_alloc(fm, GFP_KERNEL | __GFP_NOFAIL); + + if (!args->nocreds) + fuse_force_creds(req); + + __set_bit(FR_WAITING, &req->flags); + __set_bit(FR_FORCE, &req->flags); + } else { + WARN_ON(args->nocreds); + req = fuse_get_req(fm, false); + if (IS_ERR(req)) + return PTR_ERR(req); + } + + /* Needs to be done after fuse_get_req() so that fc->minor is valid */ + fuse_adjust_compat(fc, args); + fuse_args_to_req(req, args); + + if (!args->noreply) + __set_bit(FR_ISREPLY, &req->flags); + __fuse_request_send(req); + ret = req->out.h.error; + if (!ret && args->out_argvar) { + BUG_ON(args->out_numargs == 0); + ret = args->out_args[args->out_numargs - 1].size; + } + fuse_put_request(req); + + return ret; +} + +static bool fuse_request_queue_background(struct fuse_req *req) +{ + struct fuse_mount *fm = req->fm; + struct fuse_conn *fc = fm->fc; + bool queued = false; + + WARN_ON(!test_bit(FR_BACKGROUND, &req->flags)); + if (!test_bit(FR_WAITING, &req->flags)) { + __set_bit(FR_WAITING, &req->flags); + atomic_inc(&fc->num_waiting); + } + __set_bit(FR_ISREPLY, &req->flags); + spin_lock(&fc->bg_lock); + if (likely(fc->connected)) { + fc->num_background++; + if (fc->num_background == fc->max_background) + fc->blocked = 1; + list_add_tail(&req->list, &fc->bg_queue); + flush_bg_queue(fc); + queued = true; + } + spin_unlock(&fc->bg_lock); + + return queued; +} + +int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, + gfp_t gfp_flags) +{ + struct fuse_req *req; + + if (args->force) { + WARN_ON(!args->nocreds); + req = fuse_request_alloc(fm, gfp_flags); + if (!req) + return -ENOMEM; + __set_bit(FR_BACKGROUND, &req->flags); + } else { + WARN_ON(args->nocreds); + req = fuse_get_req(fm, true); + if (IS_ERR(req)) + return PTR_ERR(req); + } + + fuse_args_to_req(req, args); + + if (!fuse_request_queue_background(req)) { + fuse_put_request(req); + return -ENOTCONN; + } + + return 0; +} +EXPORT_SYMBOL_GPL(fuse_simple_background); + +static int fuse_simple_notify_reply(struct fuse_mount *fm, + struct fuse_args *args, u64 unique) +{ + struct fuse_req *req; + struct fuse_iqueue *fiq = &fm->fc->iq; + int err = 0; + + req = fuse_get_req(fm, false); + if (IS_ERR(req)) + return PTR_ERR(req); + + __clear_bit(FR_ISREPLY, &req->flags); + req->in.h.unique = unique; + + fuse_args_to_req(req, args); + + spin_lock(&fiq->lock); + if (fiq->connected) { + queue_request_and_unlock(fiq, req); + } else { + err = -ENODEV; + spin_unlock(&fiq->lock); + fuse_put_request(req); + } + + return err; +} + +/* + * Lock the request. Up to the next unlock_request() there mustn't be + * anything that could cause a page-fault. If the request was already + * aborted bail out. + */ +static int lock_request(struct fuse_req *req) +{ + int err = 0; + if (req) { + spin_lock(&req->waitq.lock); + if (test_bit(FR_ABORTED, &req->flags)) + err = -ENOENT; + else + set_bit(FR_LOCKED, &req->flags); + spin_unlock(&req->waitq.lock); + } + return err; +} + +/* + * Unlock request. If it was aborted while locked, caller is responsible + * for unlocking and ending the request. + */ +static int unlock_request(struct fuse_req *req) +{ + int err = 0; + if (req) { + spin_lock(&req->waitq.lock); + if (test_bit(FR_ABORTED, &req->flags)) + err = -ENOENT; + else + clear_bit(FR_LOCKED, &req->flags); + spin_unlock(&req->waitq.lock); + } + return err; +} + +struct fuse_copy_state { + int write; + struct fuse_req *req; + struct iov_iter *iter; + struct pipe_buffer *pipebufs; + struct pipe_buffer *currbuf; + struct pipe_inode_info *pipe; + unsigned long nr_segs; + struct page *pg; + unsigned len; + unsigned offset; + unsigned move_pages:1; +}; + +static void fuse_copy_init(struct fuse_copy_state *cs, int write, + struct iov_iter *iter) +{ + memset(cs, 0, sizeof(*cs)); + cs->write = write; + cs->iter = iter; +} + +/* Unmap and put previous page of userspace buffer */ +static void fuse_copy_finish(struct fuse_copy_state *cs) +{ + if (cs->currbuf) { + struct pipe_buffer *buf = cs->currbuf; + + if (cs->write) + buf->len = PAGE_SIZE - cs->len; + cs->currbuf = NULL; + } else if (cs->pg) { + if (cs->write) { + flush_dcache_page(cs->pg); + set_page_dirty_lock(cs->pg); + } + put_page(cs->pg); + } + cs->pg = NULL; +} + +/* + * Get another pagefull of userspace buffer, and map it to kernel + * address space, and lock request + */ +static int fuse_copy_fill(struct fuse_copy_state *cs) +{ + struct page *page; + int err; + + err = unlock_request(cs->req); + if (err) + return err; + + fuse_copy_finish(cs); + if (cs->pipebufs) { + struct pipe_buffer *buf = cs->pipebufs; + + if (!cs->write) { + err = pipe_buf_confirm(cs->pipe, buf); + if (err) + return err; + + BUG_ON(!cs->nr_segs); + cs->currbuf = buf; + cs->pg = buf->page; + cs->offset = buf->offset; + cs->len = buf->len; + cs->pipebufs++; + cs->nr_segs--; + } else { + if (cs->nr_segs >= cs->pipe->max_usage) + return -EIO; + + page = alloc_page(GFP_HIGHUSER); + if (!page) + return -ENOMEM; + + buf->page = page; + buf->offset = 0; + buf->len = 0; + + cs->currbuf = buf; + cs->pg = page; + cs->offset = 0; + cs->len = PAGE_SIZE; + cs->pipebufs++; + cs->nr_segs++; + } + } else { + size_t off; + err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off); + if (err < 0) + return err; + BUG_ON(!err); + cs->len = err; + cs->offset = off; + cs->pg = page; + } + + return lock_request(cs->req); +} + +/* Do as much copy to/from userspace buffer as we can */ +static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) +{ + unsigned ncpy = min(*size, cs->len); + if (val) { + void *pgaddr = kmap_local_page(cs->pg); + void *buf = pgaddr + cs->offset; + + if (cs->write) + memcpy(buf, *val, ncpy); + else + memcpy(*val, buf, ncpy); + + kunmap_local(pgaddr); + *val += ncpy; + } + *size -= ncpy; + cs->len -= ncpy; + cs->offset += ncpy; + return ncpy; +} + +static int fuse_check_page(struct page *page) +{ + if (page_mapcount(page) || + page->mapping != NULL || + (page->flags & PAGE_FLAGS_CHECK_AT_PREP & + ~(1 << PG_locked | + 1 << PG_referenced | + 1 << PG_uptodate | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_workingset | + 1 << PG_reclaim | + 1 << PG_waiters | + LRU_GEN_MASK | LRU_REFS_MASK))) { + dump_page(page, "fuse: trying to steal weird page"); + return 1; + } + return 0; +} + +static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) +{ + int err; + struct page *oldpage = *pagep; + struct page *newpage; + struct pipe_buffer *buf = cs->pipebufs; + + get_page(oldpage); + err = unlock_request(cs->req); + if (err) + goto out_put_old; + + fuse_copy_finish(cs); + + err = pipe_buf_confirm(cs->pipe, buf); + if (err) + goto out_put_old; + + BUG_ON(!cs->nr_segs); + cs->currbuf = buf; + cs->len = buf->len; + cs->pipebufs++; + cs->nr_segs--; + + if (cs->len != PAGE_SIZE) + goto out_fallback; + + if (!pipe_buf_try_steal(cs->pipe, buf)) + goto out_fallback; + + newpage = buf->page; + + if (!PageUptodate(newpage)) + SetPageUptodate(newpage); + + ClearPageMappedToDisk(newpage); + + if (fuse_check_page(newpage) != 0) + goto out_fallback_unlock; + + /* + * This is a new and locked page, it shouldn't be mapped or + * have any special flags on it + */ + if (WARN_ON(page_mapped(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(page_has_private(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(PageMlocked(oldpage))) + goto out_fallback_unlock; + + replace_page_cache_page(oldpage, newpage); + + get_page(newpage); + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + lru_cache_add(newpage); + + /* + * Release while we have extra ref on stolen page. Otherwise + * anon_pipe_buf_release() might think the page can be reused. + */ + pipe_buf_release(cs->pipe, buf); + + err = 0; + spin_lock(&cs->req->waitq.lock); + if (test_bit(FR_ABORTED, &cs->req->flags)) + err = -ENOENT; + else + *pagep = newpage; + spin_unlock(&cs->req->waitq.lock); + + if (err) { + unlock_page(newpage); + put_page(newpage); + goto out_put_old; + } + + unlock_page(oldpage); + /* Drop ref for ap->pages[] array */ + put_page(oldpage); + cs->len = 0; + + err = 0; +out_put_old: + /* Drop ref obtained in this function */ + put_page(oldpage); + return err; + +out_fallback_unlock: + unlock_page(newpage); +out_fallback: + cs->pg = buf->page; + cs->offset = buf->offset; + + err = lock_request(cs->req); + if (!err) + err = 1; + + goto out_put_old; +} + +static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, + unsigned offset, unsigned count) +{ + struct pipe_buffer *buf; + int err; + + if (cs->nr_segs >= cs->pipe->max_usage) + return -EIO; + + get_page(page); + err = unlock_request(cs->req); + if (err) { + put_page(page); + return err; + } + + fuse_copy_finish(cs); + + buf = cs->pipebufs; + buf->page = page; + buf->offset = offset; + buf->len = count; + + cs->pipebufs++; + cs->nr_segs++; + cs->len = 0; + + return 0; +} + +/* + * Copy a page in the request to/from the userspace buffer. Must be + * done atomically + */ +static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, + unsigned offset, unsigned count, int zeroing) +{ + int err; + struct page *page = *pagep; + + if (page && zeroing && count < PAGE_SIZE) + clear_highpage(page); + + while (count) { + if (cs->write && cs->pipebufs && page) { + /* + * Can't control lifetime of pipe buffers, so always + * copy user pages. + */ + if (cs->req->args->user_pages) { + err = fuse_copy_fill(cs); + if (err) + return err; + } else { + return fuse_ref_page(cs, page, offset, count); + } + } else if (!cs->len) { + if (cs->move_pages && page && + offset == 0 && count == PAGE_SIZE) { + err = fuse_try_move_page(cs, pagep); + if (err <= 0) + return err; + } else { + err = fuse_copy_fill(cs); + if (err) + return err; + } + } + if (page) { + void *mapaddr = kmap_local_page(page); + void *buf = mapaddr + offset; + offset += fuse_copy_do(cs, &buf, &count); + kunmap_local(mapaddr); + } else + offset += fuse_copy_do(cs, NULL, &count); + } + if (page && !cs->write) + flush_dcache_page(page); + return 0; +} + +/* Copy pages in the request to/from userspace buffer */ +static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, + int zeroing) +{ + unsigned i; + struct fuse_req *req = cs->req; + struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args); + + + for (i = 0; i < ap->num_pages && (nbytes || zeroing); i++) { + int err; + unsigned int offset = ap->descs[i].offset; + unsigned int count = min(nbytes, ap->descs[i].length); + + err = fuse_copy_page(cs, &ap->pages[i], offset, count, zeroing); + if (err) + return err; + + nbytes -= count; + } + return 0; +} + +/* Copy a single argument in the request to/from userspace buffer */ +static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) +{ + while (size) { + if (!cs->len) { + int err = fuse_copy_fill(cs); + if (err) + return err; + } + fuse_copy_do(cs, &val, &size); + } + return 0; +} + +/* Copy request arguments to/from userspace buffer */ +static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, + unsigned argpages, struct fuse_arg *args, + int zeroing) +{ + int err = 0; + unsigned i; + + for (i = 0; !err && i < numargs; i++) { + struct fuse_arg *arg = &args[i]; + if (i == numargs - 1 && argpages) + err = fuse_copy_pages(cs, arg->size, zeroing); + else + err = fuse_copy_one(cs, arg->value, arg->size); + } + return err; +} + +static int forget_pending(struct fuse_iqueue *fiq) +{ + return fiq->forget_list_head.next != NULL; +} + +static int request_pending(struct fuse_iqueue *fiq) +{ + return !list_empty(&fiq->pending) || !list_empty(&fiq->interrupts) || + forget_pending(fiq); +} + +/* + * Transfer an interrupt request to userspace + * + * Unlike other requests this is assembled on demand, without a need + * to allocate a separate fuse_req structure. + * + * Called with fiq->lock held, releases it + */ +static int fuse_read_interrupt(struct fuse_iqueue *fiq, + struct fuse_copy_state *cs, + size_t nbytes, struct fuse_req *req) +__releases(fiq->lock) +{ + struct fuse_in_header ih; + struct fuse_interrupt_in arg; + unsigned reqsize = sizeof(ih) + sizeof(arg); + int err; + + list_del_init(&req->intr_entry); + memset(&ih, 0, sizeof(ih)); + memset(&arg, 0, sizeof(arg)); + ih.len = reqsize; + ih.opcode = FUSE_INTERRUPT; + ih.unique = (req->in.h.unique | FUSE_INT_REQ_BIT); + arg.unique = req->in.h.unique; + + spin_unlock(&fiq->lock); + if (nbytes < reqsize) + return -EINVAL; + + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + fuse_copy_finish(cs); + + return err ? err : reqsize; +} + +struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, + unsigned int max, + unsigned int *countp) +{ + struct fuse_forget_link *head = fiq->forget_list_head.next; + struct fuse_forget_link **newhead = &head; + unsigned count; + + for (count = 0; *newhead != NULL && count < max; count++) + newhead = &(*newhead)->next; + + fiq->forget_list_head.next = *newhead; + *newhead = NULL; + if (fiq->forget_list_head.next == NULL) + fiq->forget_list_tail = &fiq->forget_list_head; + + if (countp != NULL) + *countp = count; + + return head; +} +EXPORT_SYMBOL(fuse_dequeue_forget); + +static int fuse_read_single_forget(struct fuse_iqueue *fiq, + struct fuse_copy_state *cs, + size_t nbytes) +__releases(fiq->lock) +{ + int err; + struct fuse_forget_link *forget = fuse_dequeue_forget(fiq, 1, NULL); + struct fuse_forget_in arg = { + .nlookup = forget->forget_one.nlookup, + }; + struct fuse_in_header ih = { + .opcode = FUSE_FORGET, + .nodeid = forget->forget_one.nodeid, + .unique = fuse_get_unique(fiq), + .len = sizeof(ih) + sizeof(arg), + }; + + spin_unlock(&fiq->lock); + kfree(forget); + if (nbytes < ih.len) + return -EINVAL; + + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + fuse_copy_finish(cs); + + if (err) + return err; + + return ih.len; +} + +static int fuse_read_batch_forget(struct fuse_iqueue *fiq, + struct fuse_copy_state *cs, size_t nbytes) +__releases(fiq->lock) +{ + int err; + unsigned max_forgets; + unsigned count; + struct fuse_forget_link *head; + struct fuse_batch_forget_in arg = { .count = 0 }; + struct fuse_in_header ih = { + .opcode = FUSE_BATCH_FORGET, + .unique = fuse_get_unique(fiq), + .len = sizeof(ih) + sizeof(arg), + }; + + if (nbytes < ih.len) { + spin_unlock(&fiq->lock); + return -EINVAL; + } + + max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); + head = fuse_dequeue_forget(fiq, max_forgets, &count); + spin_unlock(&fiq->lock); + + arg.count = count; + ih.len += count * sizeof(struct fuse_forget_one); + err = fuse_copy_one(cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(cs, &arg, sizeof(arg)); + + while (head) { + struct fuse_forget_link *forget = head; + + if (!err) { + err = fuse_copy_one(cs, &forget->forget_one, + sizeof(forget->forget_one)); + } + head = forget->next; + kfree(forget); + } + + fuse_copy_finish(cs); + + if (err) + return err; + + return ih.len; +} + +static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq, + struct fuse_copy_state *cs, + size_t nbytes) +__releases(fiq->lock) +{ + if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL) + return fuse_read_single_forget(fiq, cs, nbytes); + else + return fuse_read_batch_forget(fiq, cs, nbytes); +} + +/* + * Read a single request into the userspace filesystem's buffer. This + * function waits until a request is available, then removes it from + * the pending list and copies request data to userspace buffer. If + * no reply is needed (FORGET) or request has been aborted or there + * was an error during the copying then it's finished by calling + * fuse_request_end(). Otherwise add it to the processing list, and set + * the 'sent' flag. + */ +static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, + struct fuse_copy_state *cs, size_t nbytes) +{ + ssize_t err; + struct fuse_conn *fc = fud->fc; + struct fuse_iqueue *fiq = &fc->iq; + struct fuse_pqueue *fpq = &fud->pq; + struct fuse_req *req; + struct fuse_args *args; + unsigned reqsize; + unsigned int hash; + + /* + * Require sane minimum read buffer - that has capacity for fixed part + * of any request header + negotiated max_write room for data. + * + * Historically libfuse reserves 4K for fixed header room, but e.g. + * GlusterFS reserves only 80 bytes + * + * = `sizeof(fuse_in_header) + sizeof(fuse_write_in)` + * + * which is the absolute minimum any sane filesystem should be using + * for header room. + */ + if (nbytes < max_t(size_t, FUSE_MIN_READ_BUFFER, + sizeof(struct fuse_in_header) + + sizeof(struct fuse_write_in) + + fc->max_write)) + return -EINVAL; + + restart: + for (;;) { + spin_lock(&fiq->lock); + if (!fiq->connected || request_pending(fiq)) + break; + spin_unlock(&fiq->lock); + + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + err = wait_event_interruptible_exclusive(fiq->waitq, + !fiq->connected || request_pending(fiq)); + if (err) + return err; + } + + if (!fiq->connected) { + err = fc->aborted ? -ECONNABORTED : -ENODEV; + goto err_unlock; + } + + if (!list_empty(&fiq->interrupts)) { + req = list_entry(fiq->interrupts.next, struct fuse_req, + intr_entry); + return fuse_read_interrupt(fiq, cs, nbytes, req); + } + + if (forget_pending(fiq)) { + if (list_empty(&fiq->pending) || fiq->forget_batch-- > 0) + return fuse_read_forget(fc, fiq, cs, nbytes); + + if (fiq->forget_batch <= -8) + fiq->forget_batch = 16; + } + + req = list_entry(fiq->pending.next, struct fuse_req, list); + clear_bit(FR_PENDING, &req->flags); + list_del_init(&req->list); + spin_unlock(&fiq->lock); + + args = req->args; + reqsize = req->in.h.len; + + /* If request is too large, reply with an error and restart the read */ + if (nbytes < reqsize) { + req->out.h.error = -EIO; + /* SETXATTR is special, since it may contain too large data */ + if (args->opcode == FUSE_SETXATTR) + req->out.h.error = -E2BIG; + fuse_request_end(req); + goto restart; + } + spin_lock(&fpq->lock); + /* + * Must not put request on fpq->io queue after having been shut down by + * fuse_abort_conn() + */ + if (!fpq->connected) { + req->out.h.error = err = -ECONNABORTED; + goto out_end; + + } + list_add(&req->list, &fpq->io); + spin_unlock(&fpq->lock); + cs->req = req; + err = fuse_copy_one(cs, &req->in.h, sizeof(req->in.h)); + if (!err) + err = fuse_copy_args(cs, args->in_numargs, args->in_pages, + (struct fuse_arg *) args->in_args, 0); + fuse_copy_finish(cs); + spin_lock(&fpq->lock); + clear_bit(FR_LOCKED, &req->flags); + if (!fpq->connected) { + err = fc->aborted ? -ECONNABORTED : -ENODEV; + goto out_end; + } + if (err) { + req->out.h.error = -EIO; + goto out_end; + } + if (!test_bit(FR_ISREPLY, &req->flags)) { + err = reqsize; + goto out_end; + } + hash = fuse_req_hash(req->in.h.unique); + list_move_tail(&req->list, &fpq->processing[hash]); + __fuse_get_request(req); + set_bit(FR_SENT, &req->flags); + spin_unlock(&fpq->lock); + /* matches barrier in request_wait_answer() */ + smp_mb__after_atomic(); + if (test_bit(FR_INTERRUPTED, &req->flags)) + queue_interrupt(req); + fuse_put_request(req); + + return reqsize; + +out_end: + if (!test_bit(FR_PRIVATE, &req->flags)) + list_del_init(&req->list); + spin_unlock(&fpq->lock); + fuse_request_end(req); + return err; + + err_unlock: + spin_unlock(&fiq->lock); + return err; +} + +static int fuse_dev_open(struct inode *inode, struct file *file) +{ + /* + * The fuse device's file's private_data is used to hold + * the fuse_conn(ection) when it is mounted, and is used to + * keep track of whether the file has been mounted already. + */ + file->private_data = NULL; + return 0; +} + +static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) +{ + struct fuse_copy_state cs; + struct file *file = iocb->ki_filp; + struct fuse_dev *fud = fuse_get_dev(file); + + if (!fud) + return -EPERM; + + if (!user_backed_iter(to)) + return -EINVAL; + + fuse_copy_init(&cs, 1, to); + + return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to)); +} + +static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + int total, ret; + int page_nr = 0; + struct pipe_buffer *bufs; + struct fuse_copy_state cs; + struct fuse_dev *fud = fuse_get_dev(in); + + if (!fud) + return -EPERM; + + bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer), + GFP_KERNEL); + if (!bufs) + return -ENOMEM; + + fuse_copy_init(&cs, 1, NULL); + cs.pipebufs = bufs; + cs.pipe = pipe; + ret = fuse_dev_do_read(fud, in, &cs, len); + if (ret < 0) + goto out; + + if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->max_usage) { + ret = -EIO; + goto out; + } + + for (ret = total = 0; page_nr < cs.nr_segs; total += ret) { + /* + * Need to be careful about this. Having buf->ops in module + * code can Oops if the buffer persists after module unload. + */ + bufs[page_nr].ops = &nosteal_pipe_buf_ops; + bufs[page_nr].flags = 0; + ret = add_to_pipe(pipe, &bufs[page_nr++]); + if (unlikely(ret < 0)) + break; + } + if (total) + ret = total; +out: + for (; page_nr < cs.nr_segs; page_nr++) + put_page(bufs[page_nr].page); + + kvfree(bufs); + return ret; +} + +static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_poll_wakeup_out outarg; + int err = -EINVAL; + + if (size != sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + + fuse_copy_finish(cs); + return fuse_notify_poll_wakeup(fc, &outarg); + +err: + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_inval_inode_out outarg; + int err = -EINVAL; + + if (size != sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + fuse_copy_finish(cs); + + down_read(&fc->killsb); + err = fuse_reverse_inval_inode(fc, outarg.ino, + outarg.off, outarg.len); + up_read(&fc->killsb); + return err; + +err: + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_inval_entry_out outarg; + int err = -ENOMEM; + char *buf; + struct qstr name; + + buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); + if (!buf) + goto err; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + + err = -ENAMETOOLONG; + if (outarg.namelen > FUSE_NAME_MAX) + goto err; + + err = -EINVAL; + if (size != sizeof(outarg) + outarg.namelen + 1) + goto err; + + name.name = buf; + name.len = outarg.namelen; + err = fuse_copy_one(cs, buf, outarg.namelen + 1); + if (err) + goto err; + fuse_copy_finish(cs); + buf[outarg.namelen] = 0; + + down_read(&fc->killsb); + err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name); + up_read(&fc->killsb); + kfree(buf); + return err; + +err: + kfree(buf); + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_delete_out outarg; + int err = -ENOMEM; + char *buf; + struct qstr name; + + buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); + if (!buf) + goto err; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + + err = -ENAMETOOLONG; + if (outarg.namelen > FUSE_NAME_MAX) + goto err; + + err = -EINVAL; + if (size != sizeof(outarg) + outarg.namelen + 1) + goto err; + + name.name = buf; + name.len = outarg.namelen; + err = fuse_copy_one(cs, buf, outarg.namelen + 1); + if (err) + goto err; + fuse_copy_finish(cs); + buf[outarg.namelen] = 0; + + down_read(&fc->killsb); + err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name); + up_read(&fc->killsb); + kfree(buf); + return err; + +err: + kfree(buf); + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_store_out outarg; + struct inode *inode; + struct address_space *mapping; + u64 nodeid; + int err; + pgoff_t index; + unsigned int offset; + unsigned int num; + loff_t file_size; + loff_t end; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto out_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto out_finish; + + err = -EINVAL; + if (size - sizeof(outarg) != outarg.size) + goto out_finish; + + nodeid = outarg.nodeid; + + down_read(&fc->killsb); + + err = -ENOENT; + inode = fuse_ilookup(fc, nodeid, NULL); + if (!inode) + goto out_up_killsb; + + mapping = inode->i_mapping; + index = outarg.offset >> PAGE_SHIFT; + offset = outarg.offset & ~PAGE_MASK; + file_size = i_size_read(inode); + end = outarg.offset + outarg.size; + if (end > file_size) { + file_size = end; + fuse_write_update_attr(inode, file_size, outarg.size); + } + + num = outarg.size; + while (num) { + struct page *page; + unsigned int this_num; + + err = -ENOMEM; + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + if (!page) + goto out_iput; + + this_num = min_t(unsigned, num, PAGE_SIZE - offset); + err = fuse_copy_page(cs, &page, offset, this_num, 0); + if (!err && offset == 0 && + (this_num == PAGE_SIZE || file_size == end)) + SetPageUptodate(page); + unlock_page(page); + put_page(page); + + if (err) + goto out_iput; + + num -= this_num; + offset = 0; + index++; + } + + err = 0; + +out_iput: + iput(inode); +out_up_killsb: + up_read(&fc->killsb); +out_finish: + fuse_copy_finish(cs); + return err; +} + +struct fuse_retrieve_args { + struct fuse_args_pages ap; + struct fuse_notify_retrieve_in inarg; +}; + +static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args, + int error) +{ + struct fuse_retrieve_args *ra = + container_of(args, typeof(*ra), ap.args); + + release_pages(ra->ap.pages, ra->ap.num_pages); + kfree(ra); +} + +static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, + struct fuse_notify_retrieve_out *outarg) +{ + int err; + struct address_space *mapping = inode->i_mapping; + pgoff_t index; + loff_t file_size; + unsigned int num; + unsigned int offset; + size_t total_len = 0; + unsigned int num_pages; + struct fuse_conn *fc = fm->fc; + struct fuse_retrieve_args *ra; + size_t args_size = sizeof(*ra); + struct fuse_args_pages *ap; + struct fuse_args *args; + + offset = outarg->offset & ~PAGE_MASK; + file_size = i_size_read(inode); + + num = min(outarg->size, fc->max_write); + if (outarg->offset > file_size) + num = 0; + else if (outarg->offset + num > file_size) + num = file_size - outarg->offset; + + num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + num_pages = min(num_pages, fc->max_pages); + + args_size += num_pages * (sizeof(ap->pages[0]) + sizeof(ap->descs[0])); + + ra = kzalloc(args_size, GFP_KERNEL); + if (!ra) + return -ENOMEM; + + ap = &ra->ap; + ap->pages = (void *) (ra + 1); + ap->descs = (void *) (ap->pages + num_pages); + + args = &ap->args; + args->nodeid = outarg->nodeid; + args->opcode = FUSE_NOTIFY_REPLY; + args->in_numargs = 2; + args->in_pages = true; + args->end = fuse_retrieve_end; + + index = outarg->offset >> PAGE_SHIFT; + + while (num && ap->num_pages < num_pages) { + struct page *page; + unsigned int this_num; + + page = find_get_page(mapping, index); + if (!page) + break; + + this_num = min_t(unsigned, num, PAGE_SIZE - offset); + ap->pages[ap->num_pages] = page; + ap->descs[ap->num_pages].offset = offset; + ap->descs[ap->num_pages].length = this_num; + ap->num_pages++; + + offset = 0; + num -= this_num; + total_len += this_num; + index++; + } + ra->inarg.offset = outarg->offset; + ra->inarg.size = total_len; + args->in_args[0].size = sizeof(ra->inarg); + args->in_args[0].value = &ra->inarg; + args->in_args[1].size = total_len; + + err = fuse_simple_notify_reply(fm, args, outarg->notify_unique); + if (err) + fuse_retrieve_end(fm, args, err); + + return err; +} + +static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_retrieve_out outarg; + struct fuse_mount *fm; + struct inode *inode; + u64 nodeid; + int err; + + err = -EINVAL; + if (size != sizeof(outarg)) + goto copy_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto copy_finish; + + fuse_copy_finish(cs); + + down_read(&fc->killsb); + err = -ENOENT; + nodeid = outarg.nodeid; + + inode = fuse_ilookup(fc, nodeid, &fm); + if (inode) { + err = fuse_retrieve(fm, inode, &outarg); + iput(inode); + } + up_read(&fc->killsb); + + return err; + +copy_finish: + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, + unsigned int size, struct fuse_copy_state *cs) +{ + /* Don't try to move pages (yet) */ + cs->move_pages = 0; + + switch (code) { + case FUSE_NOTIFY_POLL: + return fuse_notify_poll(fc, size, cs); + + case FUSE_NOTIFY_INVAL_INODE: + return fuse_notify_inval_inode(fc, size, cs); + + case FUSE_NOTIFY_INVAL_ENTRY: + return fuse_notify_inval_entry(fc, size, cs); + + case FUSE_NOTIFY_STORE: + return fuse_notify_store(fc, size, cs); + + case FUSE_NOTIFY_RETRIEVE: + return fuse_notify_retrieve(fc, size, cs); + + case FUSE_NOTIFY_DELETE: + return fuse_notify_delete(fc, size, cs); + + default: + fuse_copy_finish(cs); + return -EINVAL; + } +} + +/* Look up request on processing list by unique ID */ +static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) +{ + unsigned int hash = fuse_req_hash(unique); + struct fuse_req *req; + + list_for_each_entry(req, &fpq->processing[hash], list) { + if (req->in.h.unique == unique) + return req; + } + return NULL; +} + +static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, + unsigned nbytes) +{ + unsigned reqsize = sizeof(struct fuse_out_header); + + reqsize += fuse_len_args(args->out_numargs, args->out_args); + + if (reqsize < nbytes || (reqsize > nbytes && !args->out_argvar)) + return -EINVAL; + else if (reqsize > nbytes) { + struct fuse_arg *lastarg = &args->out_args[args->out_numargs-1]; + unsigned diffsize = reqsize - nbytes; + + if (diffsize > lastarg->size) + return -EINVAL; + lastarg->size -= diffsize; + } + return fuse_copy_args(cs, args->out_numargs, args->out_pages, + args->out_args, args->page_zeroing); +} + +/* + * Write a single reply to a request. First the header is copied from + * the write buffer. The request is then searched on the processing + * list by the unique ID found in the header. If found, then remove + * it from the list and copy the rest of the buffer to the request. + * The request is finished by calling fuse_request_end(). + */ +static ssize_t fuse_dev_do_write(struct fuse_dev *fud, + struct fuse_copy_state *cs, size_t nbytes) +{ + int err; + struct fuse_conn *fc = fud->fc; + struct fuse_pqueue *fpq = &fud->pq; + struct fuse_req *req; + struct fuse_out_header oh; + + err = -EINVAL; + if (nbytes < sizeof(struct fuse_out_header)) + goto out; + + err = fuse_copy_one(cs, &oh, sizeof(oh)); + if (err) + goto copy_finish; + + err = -EINVAL; + if (oh.len != nbytes) + goto copy_finish; + + /* + * Zero oh.unique indicates unsolicited notification message + * and error contains notification code. + */ + if (!oh.unique) { + err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs); + goto out; + } + + err = -EINVAL; + if (oh.error <= -512 || oh.error > 0) + goto copy_finish; + + spin_lock(&fpq->lock); + req = NULL; + if (fpq->connected) + req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); + + err = -ENOENT; + if (!req) { + spin_unlock(&fpq->lock); + goto copy_finish; + } + + /* Is it an interrupt reply ID? */ + if (oh.unique & FUSE_INT_REQ_BIT) { + __fuse_get_request(req); + spin_unlock(&fpq->lock); + + err = 0; + if (nbytes != sizeof(struct fuse_out_header)) + err = -EINVAL; + else if (oh.error == -ENOSYS) + fc->no_interrupt = 1; + else if (oh.error == -EAGAIN) + err = queue_interrupt(req); + + fuse_put_request(req); + + goto copy_finish; + } + + clear_bit(FR_SENT, &req->flags); + list_move(&req->list, &fpq->io); + req->out.h = oh; + set_bit(FR_LOCKED, &req->flags); + spin_unlock(&fpq->lock); + cs->req = req; + if (!req->args->page_replace) + cs->move_pages = 0; + + if (oh.error) + err = nbytes != sizeof(oh) ? -EINVAL : 0; + else + err = copy_out_args(cs, req->args, nbytes); + fuse_copy_finish(cs); + + spin_lock(&fpq->lock); + clear_bit(FR_LOCKED, &req->flags); + if (!fpq->connected) + err = -ENOENT; + else if (err) + req->out.h.error = -EIO; + if (!test_bit(FR_PRIVATE, &req->flags)) + list_del_init(&req->list); + spin_unlock(&fpq->lock); + + fuse_request_end(req); +out: + return err ? err : nbytes; + +copy_finish: + fuse_copy_finish(cs); + goto out; +} + +static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct fuse_copy_state cs; + struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp); + + if (!fud) + return -EPERM; + + if (!user_backed_iter(from)) + return -EINVAL; + + fuse_copy_init(&cs, 0, from); + + return fuse_dev_do_write(fud, &cs, iov_iter_count(from)); +} + +static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + unsigned int head, tail, mask, count; + unsigned nbuf; + unsigned idx; + struct pipe_buffer *bufs; + struct fuse_copy_state cs; + struct fuse_dev *fud; + size_t rem; + ssize_t ret; + + fud = fuse_get_dev(out); + if (!fud) + return -EPERM; + + pipe_lock(pipe); + + head = pipe->head; + tail = pipe->tail; + mask = pipe->ring_size - 1; + count = head - tail; + + bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL); + if (!bufs) { + pipe_unlock(pipe); + return -ENOMEM; + } + + nbuf = 0; + rem = 0; + for (idx = tail; idx != head && rem < len; idx++) + rem += pipe->bufs[idx & mask].len; + + ret = -EINVAL; + if (rem < len) + goto out_free; + + rem = len; + while (rem) { + struct pipe_buffer *ibuf; + struct pipe_buffer *obuf; + + if (WARN_ON(nbuf >= count || tail == head)) + goto out_free; + + ibuf = &pipe->bufs[tail & mask]; + obuf = &bufs[nbuf]; + + if (rem >= ibuf->len) { + *obuf = *ibuf; + ibuf->ops = NULL; + tail++; + pipe->tail = tail; + } else { + if (!pipe_buf_get(pipe, ibuf)) + goto out_free; + + *obuf = *ibuf; + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + obuf->len = rem; + ibuf->offset += obuf->len; + ibuf->len -= obuf->len; + } + nbuf++; + rem -= obuf->len; + } + pipe_unlock(pipe); + + fuse_copy_init(&cs, 0, NULL); + cs.pipebufs = bufs; + cs.nr_segs = nbuf; + cs.pipe = pipe; + + if (flags & SPLICE_F_MOVE) + cs.move_pages = 1; + + ret = fuse_dev_do_write(fud, &cs, len); + + pipe_lock(pipe); +out_free: + for (idx = 0; idx < nbuf; idx++) { + struct pipe_buffer *buf = &bufs[idx]; + + if (buf->ops) + pipe_buf_release(pipe, buf); + } + pipe_unlock(pipe); + + kvfree(bufs); + return ret; +} + +static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) +{ + __poll_t mask = EPOLLOUT | EPOLLWRNORM; + struct fuse_iqueue *fiq; + struct fuse_dev *fud = fuse_get_dev(file); + + if (!fud) + return EPOLLERR; + + fiq = &fud->fc->iq; + poll_wait(file, &fiq->waitq, wait); + + spin_lock(&fiq->lock); + if (!fiq->connected) + mask = EPOLLERR; + else if (request_pending(fiq)) + mask |= EPOLLIN | EPOLLRDNORM; + spin_unlock(&fiq->lock); + + return mask; +} + +/* Abort all requests on the given list (pending or processing) */ +static void end_requests(struct list_head *head) +{ + while (!list_empty(head)) { + struct fuse_req *req; + req = list_entry(head->next, struct fuse_req, list); + req->out.h.error = -ECONNABORTED; + clear_bit(FR_SENT, &req->flags); + list_del_init(&req->list); + fuse_request_end(req); + } +} + +static void end_polls(struct fuse_conn *fc) +{ + struct rb_node *p; + + p = rb_first(&fc->polled_files); + + while (p) { + struct fuse_file *ff; + ff = rb_entry(p, struct fuse_file, polled_node); + wake_up_interruptible_all(&ff->poll_wait); + + p = rb_next(p); + } +} + +/* + * Abort all requests. + * + * Emergency exit in case of a malicious or accidental deadlock, or just a hung + * filesystem. + * + * The same effect is usually achievable through killing the filesystem daemon + * and all users of the filesystem. The exception is the combination of an + * asynchronous request and the tricky deadlock (see + * Documentation/filesystems/fuse.rst). + * + * Aborting requests under I/O goes as follows: 1: Separate out unlocked + * requests, they should be finished off immediately. Locked requests will be + * finished after unlock; see unlock_request(). 2: Finish off the unlocked + * requests. It is possible that some request will finish before we can. This + * is OK, the request will in that case be removed from the list before we touch + * it. + */ +void fuse_abort_conn(struct fuse_conn *fc) +{ + struct fuse_iqueue *fiq = &fc->iq; + + spin_lock(&fc->lock); + if (fc->connected) { + struct fuse_dev *fud; + struct fuse_req *req, *next; + LIST_HEAD(to_end); + unsigned int i; + + /* Background queuing checks fc->connected under bg_lock */ + spin_lock(&fc->bg_lock); + fc->connected = 0; + spin_unlock(&fc->bg_lock); + + fuse_set_initialized(fc); + list_for_each_entry(fud, &fc->devices, entry) { + struct fuse_pqueue *fpq = &fud->pq; + + spin_lock(&fpq->lock); + fpq->connected = 0; + list_for_each_entry_safe(req, next, &fpq->io, list) { + req->out.h.error = -ECONNABORTED; + spin_lock(&req->waitq.lock); + set_bit(FR_ABORTED, &req->flags); + if (!test_bit(FR_LOCKED, &req->flags)) { + set_bit(FR_PRIVATE, &req->flags); + __fuse_get_request(req); + list_move(&req->list, &to_end); + } + spin_unlock(&req->waitq.lock); + } + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + list_splice_tail_init(&fpq->processing[i], + &to_end); + spin_unlock(&fpq->lock); + } + spin_lock(&fc->bg_lock); + fc->blocked = 0; + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + spin_unlock(&fc->bg_lock); + + spin_lock(&fiq->lock); + fiq->connected = 0; + list_for_each_entry(req, &fiq->pending, list) + clear_bit(FR_PENDING, &req->flags); + list_splice_tail_init(&fiq->pending, &to_end); + while (forget_pending(fiq)) + kfree(fuse_dequeue_forget(fiq, 1, NULL)); + wake_up_all(&fiq->waitq); + spin_unlock(&fiq->lock); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + end_polls(fc); + wake_up_all(&fc->blocked_waitq); + spin_unlock(&fc->lock); + + end_requests(&to_end); + } else { + spin_unlock(&fc->lock); + } +} +EXPORT_SYMBOL_GPL(fuse_abort_conn); + +void fuse_wait_aborted(struct fuse_conn *fc) +{ + /* matches implicit memory barrier in fuse_drop_waiting() */ + smp_mb(); + wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0); +} + +int fuse_dev_release(struct inode *inode, struct file *file) +{ + struct fuse_dev *fud = fuse_get_dev(file); + + if (fud) { + struct fuse_conn *fc = fud->fc; + struct fuse_pqueue *fpq = &fud->pq; + LIST_HEAD(to_end); + unsigned int i; + + spin_lock(&fpq->lock); + WARN_ON(!list_empty(&fpq->io)); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + list_splice_init(&fpq->processing[i], &to_end); + spin_unlock(&fpq->lock); + + end_requests(&to_end); + + /* Are we the last open device? */ + if (atomic_dec_and_test(&fc->dev_count)) { + WARN_ON(fc->iq.fasync != NULL); + fuse_abort_conn(fc); + } + fuse_dev_free(fud); + } + return 0; +} +EXPORT_SYMBOL_GPL(fuse_dev_release); + +static int fuse_dev_fasync(int fd, struct file *file, int on) +{ + struct fuse_dev *fud = fuse_get_dev(file); + + if (!fud) + return -EPERM; + + /* No locking - fasync_helper does its own locking */ + return fasync_helper(fd, file, on, &fud->fc->iq.fasync); +} + +static int fuse_device_clone(struct fuse_conn *fc, struct file *new) +{ + struct fuse_dev *fud; + + if (new->private_data) + return -EINVAL; + + fud = fuse_dev_alloc_install(fc); + if (!fud) + return -ENOMEM; + + new->private_data = fud; + atomic_inc(&fc->dev_count); + + return 0; +} + +static long fuse_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int res; + int oldfd; + struct fuse_dev *fud = NULL; + + switch (cmd) { + case FUSE_DEV_IOC_CLONE: + res = -EFAULT; + if (!get_user(oldfd, (__u32 __user *)arg)) { + struct file *old = fget(oldfd); + + res = -EINVAL; + if (old) { + /* + * Check against file->f_op because CUSE + * uses the same ioctl handler. + */ + if (old->f_op == file->f_op && + old->f_cred->user_ns == file->f_cred->user_ns) + fud = fuse_get_dev(old); + + if (fud) { + mutex_lock(&fuse_mutex); + res = fuse_device_clone(fud->fc, file); + mutex_unlock(&fuse_mutex); + } + fput(old); + } + } + break; + default: + res = -ENOTTY; + break; + } + return res; +} + +const struct file_operations fuse_dev_operations = { + .owner = THIS_MODULE, + .open = fuse_dev_open, + .llseek = no_llseek, + .read_iter = fuse_dev_read, + .splice_read = fuse_dev_splice_read, + .write_iter = fuse_dev_write, + .splice_write = fuse_dev_splice_write, + .poll = fuse_dev_poll, + .release = fuse_dev_release, + .fasync = fuse_dev_fasync, + .unlocked_ioctl = fuse_dev_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; +EXPORT_SYMBOL_GPL(fuse_dev_operations); + +static struct miscdevice fuse_miscdevice = { + .minor = FUSE_MINOR, + .name = "fuse", + .fops = &fuse_dev_operations, +}; + +int __init fuse_dev_init(void) +{ + int err = -ENOMEM; + fuse_req_cachep = kmem_cache_create("fuse_request", + sizeof(struct fuse_req), + 0, 0, NULL); + if (!fuse_req_cachep) + goto out; + + err = misc_register(&fuse_miscdevice); + if (err) + goto out_cache_clean; + + return 0; + + out_cache_clean: + kmem_cache_destroy(fuse_req_cachep); + out: + return err; +} + +void fuse_dev_cleanup(void) +{ + misc_deregister(&fuse_miscdevice); + kmem_cache_destroy(fuse_req_cachep); +} diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c new file mode 100644 index 000000000..5e408e7ec --- /dev/null +++ b/fs/fuse/dir.c @@ -0,0 +1,2013 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static bool __read_mostly allow_sys_admin_access; +module_param(allow_sys_admin_access, bool, 0644); +MODULE_PARM_DESC(allow_sys_admin_access, + "Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check"); + +static void fuse_advise_use_readdirplus(struct inode *dir) +{ + struct fuse_inode *fi = get_fuse_inode(dir); + + set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); +} + +#if BITS_PER_LONG >= 64 +static inline void __fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_fsdata = (void *) time; +} + +static inline u64 fuse_dentry_time(const struct dentry *entry) +{ + return (u64)entry->d_fsdata; +} + +#else +union fuse_dentry { + u64 time; + struct rcu_head rcu; +}; + +static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time) +{ + ((union fuse_dentry *) dentry->d_fsdata)->time = time; +} + +static inline u64 fuse_dentry_time(const struct dentry *entry) +{ + return ((union fuse_dentry *) entry->d_fsdata)->time; +} +#endif + +static void fuse_dentry_settime(struct dentry *dentry, u64 time) +{ + struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb); + bool delete = !time && fc->delete_stale; + /* + * Mess with DCACHE_OP_DELETE because dput() will be faster without it. + * Don't care about races, either way it's just an optimization + */ + if ((!delete && (dentry->d_flags & DCACHE_OP_DELETE)) || + (delete && !(dentry->d_flags & DCACHE_OP_DELETE))) { + spin_lock(&dentry->d_lock); + if (!delete) + dentry->d_flags &= ~DCACHE_OP_DELETE; + else + dentry->d_flags |= DCACHE_OP_DELETE; + spin_unlock(&dentry->d_lock); + } + + __fuse_dentry_settime(dentry, time); +} + +/* + * FUSE caches dentries and attributes with separate timeout. The + * time in jiffies until the dentry/attributes are valid is stored in + * dentry->d_fsdata and fuse_inode->i_time respectively. + */ + +/* + * Calculate the time in jiffies until a dentry/attributes are valid + */ +static u64 time_to_jiffies(u64 sec, u32 nsec) +{ + if (sec || nsec) { + struct timespec64 ts = { + sec, + min_t(u32, nsec, NSEC_PER_SEC - 1) + }; + + return get_jiffies_64() + timespec64_to_jiffies(&ts); + } else + return 0; +} + +/* + * Set dentry and possibly attribute timeouts from the lookup/mk* + * replies + */ +void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o) +{ + fuse_dentry_settime(entry, + time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); +} + +static u64 attr_timeout(struct fuse_attr_out *o) +{ + return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); +} + +u64 entry_attr_timeout(struct fuse_entry_out *o) +{ + return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); +} + +void fuse_invalidate_attr_mask(struct inode *inode, u32 mask) +{ + set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask); +} + +/* + * Mark the attributes as stale, so that at the next call to + * ->getattr() they will be fetched from userspace + */ +void fuse_invalidate_attr(struct inode *inode) +{ + fuse_invalidate_attr_mask(inode, STATX_BASIC_STATS); +} + +static void fuse_dir_changed(struct inode *dir) +{ + fuse_invalidate_attr(dir); + inode_maybe_inc_iversion(dir, false); +} + +/** + * Mark the attributes as stale due to an atime change. Avoid the invalidate if + * atime is not used. + */ +void fuse_invalidate_atime(struct inode *inode) +{ + if (!IS_RDONLY(inode)) + fuse_invalidate_attr_mask(inode, STATX_ATIME); +} + +/* + * Just mark the entry as stale, so that a next attempt to look it up + * will result in a new lookup call to userspace + * + * This is called when a dentry is about to become negative and the + * timeout is unknown (unlink, rmdir, rename and in some cases + * lookup) + */ +void fuse_invalidate_entry_cache(struct dentry *entry) +{ + fuse_dentry_settime(entry, 0); +} + +/* + * Same as fuse_invalidate_entry_cache(), but also try to remove the + * dentry from the hash + */ +static void fuse_invalidate_entry(struct dentry *entry) +{ + d_invalidate(entry); + fuse_invalidate_entry_cache(entry); +} + +static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, + u64 nodeid, const struct qstr *name, + struct fuse_entry_out *outarg) +{ + memset(outarg, 0, sizeof(struct fuse_entry_out)); + args->opcode = FUSE_LOOKUP; + args->nodeid = nodeid; + args->in_numargs = 1; + args->in_args[0].size = name->len + 1; + args->in_args[0].value = name->name; + args->out_numargs = 1; + args->out_args[0].size = sizeof(struct fuse_entry_out); + args->out_args[0].value = outarg; +} + +/* + * Check whether the dentry is still valid + * + * If the entry validity timeout has expired and the dentry is + * positive, try to redo the lookup. If the lookup results in a + * different inode, then let the VFS invalidate the dentry and redo + * the lookup once more. If the lookup results in the same inode, + * then refresh the attributes, timeouts and mark the dentry valid. + */ +static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) +{ + struct inode *inode; + struct dentry *parent; + struct fuse_mount *fm; + struct fuse_inode *fi; + int ret; + + inode = d_inode_rcu(entry); + if (inode && fuse_is_bad(inode)) + goto invalid; + else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || + (flags & (LOOKUP_EXCL | LOOKUP_REVAL | LOOKUP_RENAME_TARGET))) { + struct fuse_entry_out outarg; + FUSE_ARGS(args); + struct fuse_forget_link *forget; + u64 attr_version; + + /* For negative dentries, always do a fresh lookup */ + if (!inode) + goto invalid; + + ret = -ECHILD; + if (flags & LOOKUP_RCU) + goto out; + + fm = get_fuse_mount(inode); + + forget = fuse_alloc_forget(); + ret = -ENOMEM; + if (!forget) + goto out; + + attr_version = fuse_get_attr_version(fm->fc); + + parent = dget_parent(entry); + fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)), + &entry->d_name, &outarg); + ret = fuse_simple_request(fm, &args); + dput(parent); + /* Zero nodeid is same as -ENOENT */ + if (!ret && !outarg.nodeid) + ret = -ENOENT; + if (!ret) { + fi = get_fuse_inode(inode); + if (outarg.nodeid != get_node_id(inode) || + (bool) IS_AUTOMOUNT(inode) != (bool) (outarg.attr.flags & FUSE_ATTR_SUBMOUNT)) { + fuse_queue_forget(fm->fc, forget, + outarg.nodeid, 1); + goto invalid; + } + spin_lock(&fi->lock); + fi->nlookup++; + spin_unlock(&fi->lock); + } + kfree(forget); + if (ret == -ENOMEM || ret == -EINTR) + goto out; + if (ret || fuse_invalid_attr(&outarg.attr) || + fuse_stale_inode(inode, outarg.generation, &outarg.attr)) + goto invalid; + + forget_all_cached_acls(inode); + fuse_change_attributes(inode, &outarg.attr, + entry_attr_timeout(&outarg), + attr_version); + fuse_change_entry_timeout(entry, &outarg); + } else if (inode) { + fi = get_fuse_inode(inode); + if (flags & LOOKUP_RCU) { + if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state)) + return -ECHILD; + } else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) { + parent = dget_parent(entry); + fuse_advise_use_readdirplus(d_inode(parent)); + dput(parent); + } + } + ret = 1; +out: + return ret; + +invalid: + ret = 0; + goto out; +} + +#if BITS_PER_LONG < 64 +static int fuse_dentry_init(struct dentry *dentry) +{ + dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), + GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); + + return dentry->d_fsdata ? 0 : -ENOMEM; +} +static void fuse_dentry_release(struct dentry *dentry) +{ + union fuse_dentry *fd = dentry->d_fsdata; + + kfree_rcu(fd, rcu); +} +#endif + +static int fuse_dentry_delete(const struct dentry *dentry) +{ + return time_before64(fuse_dentry_time(dentry), get_jiffies_64()); +} + +/* + * Create a fuse_mount object with a new superblock (with path->dentry + * as the root), and return that mount so it can be auto-mounted on + * @path. + */ +static struct vfsmount *fuse_dentry_automount(struct path *path) +{ + struct fs_context *fsc; + struct vfsmount *mnt; + struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry)); + + fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry); + if (IS_ERR(fsc)) + return ERR_CAST(fsc); + + /* Pass the FUSE inode of the mount for fuse_get_tree_submount() */ + fsc->fs_private = mp_fi; + + /* Create the submount */ + mnt = fc_mount(fsc); + if (!IS_ERR(mnt)) + mntget(mnt); + + put_fs_context(fsc); + return mnt; +} + +const struct dentry_operations fuse_dentry_operations = { + .d_revalidate = fuse_dentry_revalidate, + .d_delete = fuse_dentry_delete, +#if BITS_PER_LONG < 64 + .d_init = fuse_dentry_init, + .d_release = fuse_dentry_release, +#endif + .d_automount = fuse_dentry_automount, +}; + +const struct dentry_operations fuse_root_dentry_operations = { +#if BITS_PER_LONG < 64 + .d_init = fuse_dentry_init, + .d_release = fuse_dentry_release, +#endif +}; + +int fuse_valid_type(int m) +{ + return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) || + S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); +} + +bool fuse_invalid_attr(struct fuse_attr *attr) +{ + return !fuse_valid_type(attr->mode) || + attr->size > LLONG_MAX; +} + +int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, + struct fuse_entry_out *outarg, struct inode **inode) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + FUSE_ARGS(args); + struct fuse_forget_link *forget; + u64 attr_version; + int err; + + *inode = NULL; + err = -ENAMETOOLONG; + if (name->len > FUSE_NAME_MAX) + goto out; + + + forget = fuse_alloc_forget(); + err = -ENOMEM; + if (!forget) + goto out; + + attr_version = fuse_get_attr_version(fm->fc); + + fuse_lookup_init(fm->fc, &args, nodeid, name, outarg); + err = fuse_simple_request(fm, &args); + /* Zero nodeid is same as -ENOENT, but with valid timeout */ + if (err || !outarg->nodeid) + goto out_put_forget; + + err = -EIO; + if (!outarg->nodeid) + goto out_put_forget; + if (fuse_invalid_attr(&outarg->attr)) + goto out_put_forget; + + *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, + &outarg->attr, entry_attr_timeout(outarg), + attr_version); + err = -ENOMEM; + if (!*inode) { + fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1); + goto out; + } + err = 0; + + out_put_forget: + kfree(forget); + out: + return err; +} + +static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, + unsigned int flags) +{ + int err; + struct fuse_entry_out outarg; + struct inode *inode; + struct dentry *newent; + bool outarg_valid = true; + bool locked; + + if (fuse_is_bad(dir)) + return ERR_PTR(-EIO); + + locked = fuse_lock_inode(dir); + err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, + &outarg, &inode); + fuse_unlock_inode(dir, locked); + if (err == -ENOENT) { + outarg_valid = false; + err = 0; + } + if (err) + goto out_err; + + err = -EIO; + if (inode && get_node_id(inode) == FUSE_ROOT_ID) + goto out_iput; + + newent = d_splice_alias(inode, entry); + err = PTR_ERR(newent); + if (IS_ERR(newent)) + goto out_err; + + entry = newent ? newent : entry; + if (outarg_valid) + fuse_change_entry_timeout(entry, &outarg); + else + fuse_invalidate_entry_cache(entry); + + if (inode) + fuse_advise_use_readdirplus(dir); + return newent; + + out_iput: + iput(inode); + out_err: + return ERR_PTR(err); +} + +static int get_security_context(struct dentry *entry, umode_t mode, + void **security_ctx, u32 *security_ctxlen) +{ + struct fuse_secctx *fctx; + struct fuse_secctx_header *header; + void *ctx = NULL, *ptr; + u32 ctxlen, total_len = sizeof(*header); + int err, nr_ctx = 0; + const char *name; + size_t namelen; + + err = security_dentry_init_security(entry, mode, &entry->d_name, + &name, &ctx, &ctxlen); + if (err) { + if (err != -EOPNOTSUPP) + goto out_err; + /* No LSM is supporting this security hook. Ignore error */ + ctxlen = 0; + ctx = NULL; + } + + if (ctxlen) { + nr_ctx = 1; + namelen = strlen(name) + 1; + err = -EIO; + if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || ctxlen > S32_MAX)) + goto out_err; + total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + ctxlen); + } + + err = -ENOMEM; + header = ptr = kzalloc(total_len, GFP_KERNEL); + if (!ptr) + goto out_err; + + header->nr_secctx = nr_ctx; + header->size = total_len; + ptr += sizeof(*header); + if (nr_ctx) { + fctx = ptr; + fctx->size = ctxlen; + ptr += sizeof(*fctx); + + strcpy(ptr, name); + ptr += namelen; + + memcpy(ptr, ctx, ctxlen); + } + *security_ctxlen = total_len; + *security_ctx = header; + err = 0; +out_err: + kfree(ctx); + return err; +} + +/* + * Atomic create+open operation + * + * If the filesystem doesn't support this, then fall back to separate + * 'mknod' + 'open' requests. + */ +static int fuse_create_open(struct inode *dir, struct dentry *entry, + struct file *file, unsigned int flags, + umode_t mode, u32 opcode) +{ + int err; + struct inode *inode; + struct fuse_mount *fm = get_fuse_mount(dir); + FUSE_ARGS(args); + struct fuse_forget_link *forget; + struct fuse_create_in inarg; + struct fuse_open_out outopen; + struct fuse_entry_out outentry; + struct fuse_inode *fi; + struct fuse_file *ff; + void *security_ctx = NULL; + u32 security_ctxlen; + bool trunc = flags & O_TRUNC; + + /* Userspace expects S_IFREG in create mode */ + BUG_ON((mode & S_IFMT) != S_IFREG); + + forget = fuse_alloc_forget(); + err = -ENOMEM; + if (!forget) + goto out_err; + + err = -ENOMEM; + ff = fuse_file_alloc(fm); + if (!ff) + goto out_put_forget_req; + + if (!fm->fc->dont_mask) + mode &= ~current_umask(); + + flags &= ~O_NOCTTY; + memset(&inarg, 0, sizeof(inarg)); + memset(&outentry, 0, sizeof(outentry)); + inarg.flags = flags; + inarg.mode = mode; + inarg.umask = current_umask(); + + if (fm->fc->handle_killpriv_v2 && trunc && + !(flags & O_EXCL) && !capable(CAP_FSETID)) { + inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; + } + + args.opcode = opcode; + args.nodeid = get_node_id(dir); + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; + args.out_numargs = 2; + args.out_args[0].size = sizeof(outentry); + args.out_args[0].value = &outentry; + args.out_args[1].size = sizeof(outopen); + args.out_args[1].value = &outopen; + + if (fm->fc->init_security) { + err = get_security_context(entry, mode, &security_ctx, + &security_ctxlen); + if (err) + goto out_put_forget_req; + + args.in_numargs = 3; + args.in_args[2].size = security_ctxlen; + args.in_args[2].value = security_ctx; + } + + err = fuse_simple_request(fm, &args); + kfree(security_ctx); + if (err) + goto out_free_ff; + + err = -EIO; + if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid) || + fuse_invalid_attr(&outentry.attr)) + goto out_free_ff; + + ff->fh = outopen.fh; + ff->nodeid = outentry.nodeid; + ff->open_flags = outopen.open_flags; + inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, + &outentry.attr, entry_attr_timeout(&outentry), 0); + if (!inode) { + flags &= ~(O_CREAT | O_EXCL | O_TRUNC); + fuse_sync_release(NULL, ff, flags); + fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); + err = -ENOMEM; + goto out_err; + } + kfree(forget); + d_instantiate(entry, inode); + fuse_change_entry_timeout(entry, &outentry); + fuse_dir_changed(dir); + err = finish_open(file, entry, generic_file_open); + if (err) { + fi = get_fuse_inode(inode); + fuse_sync_release(fi, ff, flags); + } else { + file->private_data = ff; + fuse_finish_open(inode, file); + if (fm->fc->atomic_o_trunc && trunc) + truncate_pagecache(inode, 0); + else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); + } + return err; + +out_free_ff: + fuse_file_free(ff); +out_put_forget_req: + kfree(forget); +out_err: + return err; +} + +static int fuse_mknod(struct user_namespace *, struct inode *, struct dentry *, + umode_t, dev_t); +static int fuse_atomic_open(struct inode *dir, struct dentry *entry, + struct file *file, unsigned flags, + umode_t mode) +{ + int err; + struct fuse_conn *fc = get_fuse_conn(dir); + struct dentry *res = NULL; + + if (fuse_is_bad(dir)) + return -EIO; + + if (d_in_lookup(entry)) { + res = fuse_lookup(dir, entry, 0); + if (IS_ERR(res)) + return PTR_ERR(res); + + if (res) + entry = res; + } + + if (!(flags & O_CREAT) || d_really_is_positive(entry)) + goto no_open; + + /* Only creates */ + file->f_mode |= FMODE_CREATED; + + if (fc->no_create) + goto mknod; + + err = fuse_create_open(dir, entry, file, flags, mode, FUSE_CREATE); + if (err == -ENOSYS) { + fc->no_create = 1; + goto mknod; + } +out_dput: + dput(res); + return err; + +mknod: + err = fuse_mknod(&init_user_ns, dir, entry, mode, 0); + if (err) + goto out_dput; +no_open: + return finish_no_open(file, res); +} + +/* + * Code shared between mknod, mkdir, symlink and link + */ +static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, + struct inode *dir, struct dentry *entry, + umode_t mode) +{ + struct fuse_entry_out outarg; + struct inode *inode; + struct dentry *d; + int err; + struct fuse_forget_link *forget; + void *security_ctx = NULL; + u32 security_ctxlen; + + if (fuse_is_bad(dir)) + return -EIO; + + forget = fuse_alloc_forget(); + if (!forget) + return -ENOMEM; + + memset(&outarg, 0, sizeof(outarg)); + args->nodeid = get_node_id(dir); + args->out_numargs = 1; + args->out_args[0].size = sizeof(outarg); + args->out_args[0].value = &outarg; + + if (fm->fc->init_security && args->opcode != FUSE_LINK) { + err = get_security_context(entry, mode, &security_ctx, + &security_ctxlen); + if (err) + goto out_put_forget_req; + + BUG_ON(args->in_numargs != 2); + + args->in_numargs = 3; + args->in_args[2].size = security_ctxlen; + args->in_args[2].value = security_ctx; + } + + err = fuse_simple_request(fm, args); + kfree(security_ctx); + if (err) + goto out_put_forget_req; + + err = -EIO; + if (invalid_nodeid(outarg.nodeid) || fuse_invalid_attr(&outarg.attr)) + goto out_put_forget_req; + + if ((outarg.attr.mode ^ mode) & S_IFMT) + goto out_put_forget_req; + + inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, + &outarg.attr, entry_attr_timeout(&outarg), 0); + if (!inode) { + fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); + return -ENOMEM; + } + kfree(forget); + + d_drop(entry); + d = d_splice_alias(inode, entry); + if (IS_ERR(d)) + return PTR_ERR(d); + + if (d) { + fuse_change_entry_timeout(d, &outarg); + dput(d); + } else { + fuse_change_entry_timeout(entry, &outarg); + } + fuse_dir_changed(dir); + return 0; + + out_put_forget_req: + kfree(forget); + return err; +} + +static int fuse_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *entry, umode_t mode, dev_t rdev) +{ + struct fuse_mknod_in inarg; + struct fuse_mount *fm = get_fuse_mount(dir); + FUSE_ARGS(args); + + if (!fm->fc->dont_mask) + mode &= ~current_umask(); + + memset(&inarg, 0, sizeof(inarg)); + inarg.mode = mode; + inarg.rdev = new_encode_dev(rdev); + inarg.umask = current_umask(); + args.opcode = FUSE_MKNOD; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; + return create_new_entry(fm, &args, dir, entry, mode); +} + +static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *entry, umode_t mode, bool excl) +{ + return fuse_mknod(&init_user_ns, dir, entry, mode, 0); +} + +static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct file *file, umode_t mode) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + int err; + + if (fc->no_tmpfile) + return -EOPNOTSUPP; + + err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE); + if (err == -ENOSYS) { + fc->no_tmpfile = 1; + err = -EOPNOTSUPP; + } + return err; +} + +static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *entry, umode_t mode) +{ + struct fuse_mkdir_in inarg; + struct fuse_mount *fm = get_fuse_mount(dir); + FUSE_ARGS(args); + + if (!fm->fc->dont_mask) + mode &= ~current_umask(); + + memset(&inarg, 0, sizeof(inarg)); + inarg.mode = mode; + inarg.umask = current_umask(); + args.opcode = FUSE_MKDIR; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; + return create_new_entry(fm, &args, dir, entry, S_IFDIR); +} + +static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *entry, const char *link) +{ + struct fuse_mount *fm = get_fuse_mount(dir); + unsigned len = strlen(link) + 1; + FUSE_ARGS(args); + + args.opcode = FUSE_SYMLINK; + args.in_numargs = 2; + args.in_args[0].size = entry->d_name.len + 1; + args.in_args[0].value = entry->d_name.name; + args.in_args[1].size = len; + args.in_args[1].value = link; + return create_new_entry(fm, &args, dir, entry, S_IFLNK); +} + +void fuse_flush_time_update(struct inode *inode) +{ + int err = sync_inode_metadata(inode, 1); + + mapping_set_error(inode->i_mapping, err); +} + +static void fuse_update_ctime_in_cache(struct inode *inode) +{ + if (!IS_NOCMTIME(inode)) { + inode->i_ctime = current_time(inode); + mark_inode_dirty_sync(inode); + fuse_flush_time_update(inode); + } +} + +void fuse_update_ctime(struct inode *inode) +{ + fuse_invalidate_attr_mask(inode, STATX_CTIME); + fuse_update_ctime_in_cache(inode); +} + +static void fuse_entry_unlinked(struct dentry *entry) +{ + struct inode *inode = d_inode(entry); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + /* + * If i_nlink == 0 then unlink doesn't make sense, yet this can + * happen if userspace filesystem is careless. It would be + * difficult to enforce correct nlink usage so just ignore this + * condition here + */ + if (S_ISDIR(inode->i_mode)) + clear_nlink(inode); + else if (inode->i_nlink > 0) + drop_nlink(inode); + spin_unlock(&fi->lock); + fuse_invalidate_entry_cache(entry); + fuse_update_ctime(inode); +} + +static int fuse_unlink(struct inode *dir, struct dentry *entry) +{ + int err; + struct fuse_mount *fm = get_fuse_mount(dir); + FUSE_ARGS(args); + + if (fuse_is_bad(dir)) + return -EIO; + + args.opcode = FUSE_UNLINK; + args.nodeid = get_node_id(dir); + args.in_numargs = 1; + args.in_args[0].size = entry->d_name.len + 1; + args.in_args[0].value = entry->d_name.name; + err = fuse_simple_request(fm, &args); + if (!err) { + fuse_dir_changed(dir); + fuse_entry_unlinked(entry); + } else if (err == -EINTR) + fuse_invalidate_entry(entry); + return err; +} + +static int fuse_rmdir(struct inode *dir, struct dentry *entry) +{ + int err; + struct fuse_mount *fm = get_fuse_mount(dir); + FUSE_ARGS(args); + + if (fuse_is_bad(dir)) + return -EIO; + + args.opcode = FUSE_RMDIR; + args.nodeid = get_node_id(dir); + args.in_numargs = 1; + args.in_args[0].size = entry->d_name.len + 1; + args.in_args[0].value = entry->d_name.name; + err = fuse_simple_request(fm, &args); + if (!err) { + fuse_dir_changed(dir); + fuse_entry_unlinked(entry); + } else if (err == -EINTR) + fuse_invalidate_entry(entry); + return err; +} + +static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags, int opcode, size_t argsize) +{ + int err; + struct fuse_rename2_in inarg; + struct fuse_mount *fm = get_fuse_mount(olddir); + FUSE_ARGS(args); + + memset(&inarg, 0, argsize); + inarg.newdir = get_node_id(newdir); + inarg.flags = flags; + args.opcode = opcode; + args.nodeid = get_node_id(olddir); + args.in_numargs = 3; + args.in_args[0].size = argsize; + args.in_args[0].value = &inarg; + args.in_args[1].size = oldent->d_name.len + 1; + args.in_args[1].value = oldent->d_name.name; + args.in_args[2].size = newent->d_name.len + 1; + args.in_args[2].value = newent->d_name.name; + err = fuse_simple_request(fm, &args); + if (!err) { + /* ctime changes */ + fuse_update_ctime(d_inode(oldent)); + + if (flags & RENAME_EXCHANGE) + fuse_update_ctime(d_inode(newent)); + + fuse_dir_changed(olddir); + if (olddir != newdir) + fuse_dir_changed(newdir); + + /* newent will end up negative */ + if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) + fuse_entry_unlinked(newent); + } else if (err == -EINTR) { + /* If request was interrupted, DEITY only knows if the + rename actually took place. If the invalidation + fails (e.g. some process has CWD under the renamed + directory), then there can be inconsistency between + the dcache and the real filesystem. Tough luck. */ + fuse_invalidate_entry(oldent); + if (d_really_is_positive(newent)) + fuse_invalidate_entry(newent); + } + + return err; +} + +static int fuse_rename2(struct user_namespace *mnt_userns, struct inode *olddir, + struct dentry *oldent, struct inode *newdir, + struct dentry *newent, unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(olddir); + int err; + + if (fuse_is_bad(olddir)) + return -EIO; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + return -EINVAL; + + if (flags) { + if (fc->no_rename2 || fc->minor < 23) + return -EINVAL; + + err = fuse_rename_common(olddir, oldent, newdir, newent, flags, + FUSE_RENAME2, + sizeof(struct fuse_rename2_in)); + if (err == -ENOSYS) { + fc->no_rename2 = 1; + err = -EINVAL; + } + } else { + err = fuse_rename_common(olddir, oldent, newdir, newent, 0, + FUSE_RENAME, + sizeof(struct fuse_rename_in)); + } + + return err; +} + +static int fuse_link(struct dentry *entry, struct inode *newdir, + struct dentry *newent) +{ + int err; + struct fuse_link_in inarg; + struct inode *inode = d_inode(entry); + struct fuse_mount *fm = get_fuse_mount(inode); + FUSE_ARGS(args); + + memset(&inarg, 0, sizeof(inarg)); + inarg.oldnodeid = get_node_id(inode); + args.opcode = FUSE_LINK; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = newent->d_name.len + 1; + args.in_args[1].value = newent->d_name.name; + err = create_new_entry(fm, &args, newdir, newent, inode->i_mode); + if (!err) + fuse_update_ctime_in_cache(inode); + else if (err == -EINTR) + fuse_invalidate_attr(inode); + + return err; +} + +static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, + struct kstat *stat) +{ + unsigned int blkbits; + struct fuse_conn *fc = get_fuse_conn(inode); + + stat->dev = inode->i_sb->s_dev; + stat->ino = attr->ino; + stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); + stat->nlink = attr->nlink; + stat->uid = make_kuid(fc->user_ns, attr->uid); + stat->gid = make_kgid(fc->user_ns, attr->gid); + stat->rdev = inode->i_rdev; + stat->atime.tv_sec = attr->atime; + stat->atime.tv_nsec = attr->atimensec; + stat->mtime.tv_sec = attr->mtime; + stat->mtime.tv_nsec = attr->mtimensec; + stat->ctime.tv_sec = attr->ctime; + stat->ctime.tv_nsec = attr->ctimensec; + stat->size = attr->size; + stat->blocks = attr->blocks; + + if (attr->blksize != 0) + blkbits = ilog2(attr->blksize); + else + blkbits = inode->i_sb->s_blocksize_bits; + + stat->blksize = 1 << blkbits; +} + +static int fuse_do_getattr(struct inode *inode, struct kstat *stat, + struct file *file) +{ + int err; + struct fuse_getattr_in inarg; + struct fuse_attr_out outarg; + struct fuse_mount *fm = get_fuse_mount(inode); + FUSE_ARGS(args); + u64 attr_version; + + attr_version = fuse_get_attr_version(fm->fc); + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + /* Directories have separate file-handle space */ + if (file && S_ISREG(inode->i_mode)) { + struct fuse_file *ff = file->private_data; + + inarg.getattr_flags |= FUSE_GETATTR_FH; + inarg.fh = ff->fh; + } + args.opcode = FUSE_GETATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (!err) { + if (fuse_invalid_attr(&outarg.attr) || + inode_wrong_type(inode, outarg.attr.mode)) { + fuse_make_bad(inode); + err = -EIO; + } else { + fuse_change_attributes(inode, &outarg.attr, + attr_timeout(&outarg), + attr_version); + if (stat) + fuse_fillattr(inode, &outarg.attr, stat); + } + } + return err; +} + +static int fuse_update_get_attr(struct inode *inode, struct file *file, + struct kstat *stat, u32 request_mask, + unsigned int flags) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + int err = 0; + bool sync; + u32 inval_mask = READ_ONCE(fi->inval_mask); + u32 cache_mask = fuse_get_cache_mask(inode); + + if (flags & AT_STATX_FORCE_SYNC) + sync = true; + else if (flags & AT_STATX_DONT_SYNC) + sync = false; + else if (request_mask & inval_mask & ~cache_mask) + sync = true; + else + sync = time_before64(fi->i_time, get_jiffies_64()); + + if (sync) { + forget_all_cached_acls(inode); + err = fuse_do_getattr(inode, stat, file); + } else if (stat) { + generic_fillattr(&init_user_ns, inode, stat); + stat->mode = fi->orig_i_mode; + stat->ino = fi->orig_ino; + } + + return err; +} + +int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask) +{ + return fuse_update_get_attr(inode, file, NULL, mask, 0); +} + +int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, + u64 child_nodeid, struct qstr *name) +{ + int err = -ENOTDIR; + struct inode *parent; + struct dentry *dir; + struct dentry *entry; + + parent = fuse_ilookup(fc, parent_nodeid, NULL); + if (!parent) + return -ENOENT; + + inode_lock_nested(parent, I_MUTEX_PARENT); + if (!S_ISDIR(parent->i_mode)) + goto unlock; + + err = -ENOENT; + dir = d_find_alias(parent); + if (!dir) + goto unlock; + + name->hash = full_name_hash(dir, name->name, name->len); + entry = d_lookup(dir, name); + dput(dir); + if (!entry) + goto unlock; + + fuse_dir_changed(parent); + fuse_invalidate_entry(entry); + + if (child_nodeid != 0 && d_really_is_positive(entry)) { + inode_lock(d_inode(entry)); + if (get_node_id(d_inode(entry)) != child_nodeid) { + err = -ENOENT; + goto badentry; + } + if (d_mountpoint(entry)) { + err = -EBUSY; + goto badentry; + } + if (d_is_dir(entry)) { + shrink_dcache_parent(entry); + if (!simple_empty(entry)) { + err = -ENOTEMPTY; + goto badentry; + } + d_inode(entry)->i_flags |= S_DEAD; + } + dont_mount(entry); + clear_nlink(d_inode(entry)); + err = 0; + badentry: + inode_unlock(d_inode(entry)); + if (!err) + d_delete(entry); + } else { + err = 0; + } + dput(entry); + + unlock: + inode_unlock(parent); + iput(parent); + return err; +} + +/* + * Calling into a user-controlled filesystem gives the filesystem + * daemon ptrace-like capabilities over the current process. This + * means, that the filesystem daemon is able to record the exact + * filesystem operations performed, and can also control the behavior + * of the requester process in otherwise impossible ways. For example + * it can delay the operation for arbitrary length of time allowing + * DoS against the requester. + * + * For this reason only those processes can call into the filesystem, + * for which the owner of the mount has ptrace privilege. This + * excludes processes started by other users, suid or sgid processes. + */ +int fuse_allow_current_process(struct fuse_conn *fc) +{ + const struct cred *cred; + + if (allow_sys_admin_access && capable(CAP_SYS_ADMIN)) + return 1; + + if (fc->allow_other) + return current_in_userns(fc->user_ns); + + cred = current_cred(); + if (uid_eq(cred->euid, fc->user_id) && + uid_eq(cred->suid, fc->user_id) && + uid_eq(cred->uid, fc->user_id) && + gid_eq(cred->egid, fc->group_id) && + gid_eq(cred->sgid, fc->group_id) && + gid_eq(cred->gid, fc->group_id)) + return 1; + + return 0; +} + +static int fuse_access(struct inode *inode, int mask) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + FUSE_ARGS(args); + struct fuse_access_in inarg; + int err; + + BUG_ON(mask & MAY_NOT_BLOCK); + + if (fm->fc->no_access) + return 0; + + memset(&inarg, 0, sizeof(inarg)); + inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); + args.opcode = FUSE_ACCESS; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + fm->fc->no_access = 1; + err = 0; + } + return err; +} + +static int fuse_perm_getattr(struct inode *inode, int mask) +{ + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + forget_all_cached_acls(inode); + return fuse_do_getattr(inode, NULL, NULL); +} + +/* + * Check permission. The two basic access models of FUSE are: + * + * 1) Local access checking ('default_permissions' mount option) based + * on file mode. This is the plain old disk filesystem permission + * modell. + * + * 2) "Remote" access checking, where server is responsible for + * checking permission in each inode operation. An exception to this + * is if ->permission() was invoked from sys_access() in which case an + * access request is sent. Execute permission is still checked + * locally based on file mode. + */ +static int fuse_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + bool refreshed = false; + int err = 0; + + if (fuse_is_bad(inode)) + return -EIO; + + if (!fuse_allow_current_process(fc)) + return -EACCES; + + /* + * If attributes are needed, refresh them before proceeding + */ + if (fc->default_permissions || + ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { + struct fuse_inode *fi = get_fuse_inode(inode); + u32 perm_mask = STATX_MODE | STATX_UID | STATX_GID; + + if (perm_mask & READ_ONCE(fi->inval_mask) || + time_before64(fi->i_time, get_jiffies_64())) { + refreshed = true; + + err = fuse_perm_getattr(inode, mask); + if (err) + return err; + } + } + + if (fc->default_permissions) { + err = generic_permission(&init_user_ns, inode, mask); + + /* If permission is denied, try to refresh file + attributes. This is also needed, because the root + node will at first have no permissions */ + if (err == -EACCES && !refreshed) { + err = fuse_perm_getattr(inode, mask); + if (!err) + err = generic_permission(&init_user_ns, + inode, mask); + } + + /* Note: the opposite of the above test does not + exist. So if permissions are revoked this won't be + noticed immediately, only after the attribute + timeout has expired */ + } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { + err = fuse_access(inode, mask); + } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { + if (!(inode->i_mode & S_IXUGO)) { + if (refreshed) + return -EACCES; + + err = fuse_perm_getattr(inode, mask); + if (!err && !(inode->i_mode & S_IXUGO)) + return -EACCES; + } + } + return err; +} + +static int fuse_readlink_page(struct inode *inode, struct page *page) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 }; + struct fuse_args_pages ap = { + .num_pages = 1, + .pages = &page, + .descs = &desc, + }; + char *link; + ssize_t res; + + ap.args.opcode = FUSE_READLINK; + ap.args.nodeid = get_node_id(inode); + ap.args.out_pages = true; + ap.args.out_argvar = true; + ap.args.page_zeroing = true; + ap.args.out_numargs = 1; + ap.args.out_args[0].size = desc.length; + res = fuse_simple_request(fm, &ap.args); + + fuse_invalidate_atime(inode); + + if (res < 0) + return res; + + if (WARN_ON(res >= PAGE_SIZE)) + return -EIO; + + link = page_address(page); + link[res] = '\0'; + + return 0; +} + +static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct page *page; + int err; + + err = -EIO; + if (fuse_is_bad(inode)) + goto out_err; + + if (fc->cache_symlinks) + return page_get_link(dentry, inode, callback); + + err = -ECHILD; + if (!dentry) + goto out_err; + + page = alloc_page(GFP_KERNEL); + err = -ENOMEM; + if (!page) + goto out_err; + + err = fuse_readlink_page(inode, page); + if (err) { + __free_page(page); + goto out_err; + } + + set_delayed_call(callback, page_put_link, page); + + return page_address(page); + +out_err: + return ERR_PTR(err); +} + +static int fuse_dir_open(struct inode *inode, struct file *file) +{ + return fuse_open_common(inode, file, true); +} + +static int fuse_dir_release(struct inode *inode, struct file *file) +{ + fuse_release_common(file, true); + + return 0; +} + +static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + if (fuse_is_bad(inode)) + return -EIO; + + if (fc->no_fsyncdir) + return 0; + + inode_lock(inode); + err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNCDIR); + if (err == -ENOSYS) { + fc->no_fsyncdir = 1; + err = 0; + } + inode_unlock(inode); + + return err; +} + +static long fuse_dir_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); + + /* FUSE_IOCTL_DIR only supported for API version >= 7.18 */ + if (fc->minor < 18) + return -ENOTTY; + + return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR); +} + +static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); + + if (fc->minor < 18) + return -ENOTTY; + + return fuse_ioctl_common(file, cmd, arg, + FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); +} + +static bool update_mtime(unsigned ivalid, bool trust_local_mtime) +{ + /* Always update if mtime is explicitly set */ + if (ivalid & ATTR_MTIME_SET) + return true; + + /* Or if kernel i_mtime is the official one */ + if (trust_local_mtime) + return true; + + /* If it's an open(O_TRUNC) or an ftruncate(), don't update */ + if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) + return false; + + /* In all other cases update */ + return true; +} + +static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr, + struct fuse_setattr_in *arg, bool trust_local_cmtime) +{ + unsigned ivalid = iattr->ia_valid; + + if (ivalid & ATTR_MODE) + arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; + if (ivalid & ATTR_UID) + arg->valid |= FATTR_UID, arg->uid = from_kuid(fc->user_ns, iattr->ia_uid); + if (ivalid & ATTR_GID) + arg->valid |= FATTR_GID, arg->gid = from_kgid(fc->user_ns, iattr->ia_gid); + if (ivalid & ATTR_SIZE) + arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; + if (ivalid & ATTR_ATIME) { + arg->valid |= FATTR_ATIME; + arg->atime = iattr->ia_atime.tv_sec; + arg->atimensec = iattr->ia_atime.tv_nsec; + if (!(ivalid & ATTR_ATIME_SET)) + arg->valid |= FATTR_ATIME_NOW; + } + if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) { + arg->valid |= FATTR_MTIME; + arg->mtime = iattr->ia_mtime.tv_sec; + arg->mtimensec = iattr->ia_mtime.tv_nsec; + if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime) + arg->valid |= FATTR_MTIME_NOW; + } + if ((ivalid & ATTR_CTIME) && trust_local_cmtime) { + arg->valid |= FATTR_CTIME; + arg->ctime = iattr->ia_ctime.tv_sec; + arg->ctimensec = iattr->ia_ctime.tv_nsec; + } +} + +/* + * Prevent concurrent writepages on inode + * + * This is done by adding a negative bias to the inode write counter + * and waiting for all pending writes to finish. + */ +void fuse_set_nowrite(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + BUG_ON(!inode_is_locked(inode)); + + spin_lock(&fi->lock); + BUG_ON(fi->writectr < 0); + fi->writectr += FUSE_NOWRITE; + spin_unlock(&fi->lock); + wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE); +} + +/* + * Allow writepages on inode + * + * Remove the bias from the writecounter and send any queued + * writepages. + */ +static void __fuse_release_nowrite(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + BUG_ON(fi->writectr != FUSE_NOWRITE); + fi->writectr = 0; + fuse_flush_writepages(inode); +} + +void fuse_release_nowrite(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + __fuse_release_nowrite(inode); + spin_unlock(&fi->lock); +} + +static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, + struct inode *inode, + struct fuse_setattr_in *inarg_p, + struct fuse_attr_out *outarg_p) +{ + args->opcode = FUSE_SETATTR; + args->nodeid = get_node_id(inode); + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg_p); + args->in_args[0].value = inarg_p; + args->out_numargs = 1; + args->out_args[0].size = sizeof(*outarg_p); + args->out_args[0].value = outarg_p; +} + +/* + * Flush inode->i_mtime to the server + */ +int fuse_flush_times(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + FUSE_ARGS(args); + struct fuse_setattr_in inarg; + struct fuse_attr_out outarg; + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + + inarg.valid = FATTR_MTIME; + inarg.mtime = inode->i_mtime.tv_sec; + inarg.mtimensec = inode->i_mtime.tv_nsec; + if (fm->fc->minor >= 23) { + inarg.valid |= FATTR_CTIME; + inarg.ctime = inode->i_ctime.tv_sec; + inarg.ctimensec = inode->i_ctime.tv_nsec; + } + if (ff) { + inarg.valid |= FATTR_FH; + inarg.fh = ff->fh; + } + fuse_setattr_fill(fm->fc, &args, inode, &inarg, &outarg); + + return fuse_simple_request(fm, &args); +} + +/* + * Set attributes, and at the same time refresh them. + * + * Truncation is slightly complicated, because the 'truncate' request + * may fail, in which case we don't want to touch the mapping. + * vmtruncate() doesn't allow for this case, so do the rlimit checking + * and the actual truncation by hand. + */ +int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, + struct file *file) +{ + struct inode *inode = d_inode(dentry); + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn *fc = fm->fc; + struct fuse_inode *fi = get_fuse_inode(inode); + struct address_space *mapping = inode->i_mapping; + FUSE_ARGS(args); + struct fuse_setattr_in inarg; + struct fuse_attr_out outarg; + bool is_truncate = false; + bool is_wb = fc->writeback_cache && S_ISREG(inode->i_mode); + loff_t oldsize; + int err; + bool trust_local_cmtime = is_wb; + bool fault_blocked = false; + + if (!fc->default_permissions) + attr->ia_valid |= ATTR_FORCE; + + err = setattr_prepare(&init_user_ns, dentry, attr); + if (err) + return err; + + if (attr->ia_valid & ATTR_SIZE) { + if (WARN_ON(!S_ISREG(inode->i_mode))) + return -EIO; + is_truncate = true; + } + + if (FUSE_IS_DAX(inode) && is_truncate) { + filemap_invalidate_lock(mapping); + fault_blocked = true; + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) { + filemap_invalidate_unlock(mapping); + return err; + } + } + + if (attr->ia_valid & ATTR_OPEN) { + /* This is coming from open(..., ... | O_TRUNC); */ + WARN_ON(!(attr->ia_valid & ATTR_SIZE)); + WARN_ON(attr->ia_size != 0); + if (fc->atomic_o_trunc) { + /* + * No need to send request to userspace, since actual + * truncation has already been done by OPEN. But still + * need to truncate page cache. + */ + i_size_write(inode, 0); + truncate_pagecache(inode, 0); + goto out; + } + file = NULL; + } + + /* Flush dirty data/metadata before non-truncate SETATTR */ + if (is_wb && + attr->ia_valid & + (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET | + ATTR_TIMES_SET)) { + err = write_inode_now(inode, true); + if (err) + return err; + + fuse_set_nowrite(inode); + fuse_release_nowrite(inode); + } + + if (is_truncate) { + fuse_set_nowrite(inode); + set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + if (trust_local_cmtime && attr->ia_size != inode->i_size) + attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; + } + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime); + if (file) { + struct fuse_file *ff = file->private_data; + inarg.valid |= FATTR_FH; + inarg.fh = ff->fh; + } + + /* Kill suid/sgid for non-directory chown unconditionally */ + if (fc->handle_killpriv_v2 && !S_ISDIR(inode->i_mode) && + attr->ia_valid & (ATTR_UID | ATTR_GID)) + inarg.valid |= FATTR_KILL_SUIDGID; + + if (attr->ia_valid & ATTR_SIZE) { + /* For mandatory locking in truncate */ + inarg.valid |= FATTR_LOCKOWNER; + inarg.lock_owner = fuse_lock_owner_id(fc, current->files); + + /* Kill suid/sgid for truncate only if no CAP_FSETID */ + if (fc->handle_killpriv_v2 && !capable(CAP_FSETID)) + inarg.valid |= FATTR_KILL_SUIDGID; + } + fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); + err = fuse_simple_request(fm, &args); + if (err) { + if (err == -EINTR) + fuse_invalidate_attr(inode); + goto error; + } + + if (fuse_invalid_attr(&outarg.attr) || + inode_wrong_type(inode, outarg.attr.mode)) { + fuse_make_bad(inode); + err = -EIO; + goto error; + } + + spin_lock(&fi->lock); + /* the kernel maintains i_mtime locally */ + if (trust_local_cmtime) { + if (attr->ia_valid & ATTR_MTIME) + inode->i_mtime = attr->ia_mtime; + if (attr->ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; + /* FIXME: clear I_DIRTY_SYNC? */ + } + + fuse_change_attributes_common(inode, &outarg.attr, + attr_timeout(&outarg), + fuse_get_cache_mask(inode)); + oldsize = inode->i_size; + /* see the comment in fuse_change_attributes() */ + if (!is_wb || is_truncate) + i_size_write(inode, outarg.attr.size); + + if (is_truncate) { + /* NOTE: this may release/reacquire fi->lock */ + __fuse_release_nowrite(inode); + } + spin_unlock(&fi->lock); + + /* + * Only call invalidate_inode_pages2() after removing + * FUSE_NOWRITE, otherwise fuse_launder_folio() would deadlock. + */ + if ((is_truncate || !is_wb) && + S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { + truncate_pagecache(inode, outarg.attr.size); + invalidate_inode_pages2(mapping); + } + + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); +out: + if (fault_blocked) + filemap_invalidate_unlock(mapping); + + return 0; + +error: + if (is_truncate) + fuse_release_nowrite(inode); + + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + + if (fault_blocked) + filemap_invalidate_unlock(mapping); + return err; +} + +static int fuse_setattr(struct user_namespace *mnt_userns, struct dentry *entry, + struct iattr *attr) +{ + struct inode *inode = d_inode(entry); + struct fuse_conn *fc = get_fuse_conn(inode); + struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL; + int ret; + + if (fuse_is_bad(inode)) + return -EIO; + + if (!fuse_allow_current_process(get_fuse_conn(inode))) + return -EACCES; + + if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) { + attr->ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | + ATTR_MODE); + + /* + * The only sane way to reliably kill suid/sgid is to do it in + * the userspace filesystem + * + * This should be done on write(), truncate() and chown(). + */ + if (!fc->handle_killpriv && !fc->handle_killpriv_v2) { + /* + * ia_mode calculation may have used stale i_mode. + * Refresh and recalculate. + */ + ret = fuse_do_getattr(inode, NULL, file); + if (ret) + return ret; + + attr->ia_mode = inode->i_mode; + if (inode->i_mode & S_ISUID) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode &= ~S_ISUID; + } + if ((inode->i_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode &= ~S_ISGID; + } + } + } + if (!attr->ia_valid) + return 0; + + ret = fuse_do_setattr(entry, attr, file); + if (!ret) { + /* + * If filesystem supports acls it may have updated acl xattrs in + * the filesystem, so forget cached acls for the inode. + */ + if (fc->posix_acl) + forget_all_cached_acls(inode); + + /* Directory mode changed, may need to revalidate access */ + if (d_is_dir(entry) && (attr->ia_valid & ATTR_MODE)) + fuse_invalidate_entry_cache(entry); + } + return ret; +} + +static int fuse_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + struct inode *inode = d_inode(path->dentry); + struct fuse_conn *fc = get_fuse_conn(inode); + + if (fuse_is_bad(inode)) + return -EIO; + + if (!fuse_allow_current_process(fc)) { + if (!request_mask) { + /* + * If user explicitly requested *nothing* then don't + * error out, but return st_dev only. + */ + stat->result_mask = 0; + stat->dev = inode->i_sb->s_dev; + return 0; + } + return -EACCES; + } + + return fuse_update_get_attr(inode, NULL, stat, request_mask, flags); +} + +static const struct inode_operations fuse_dir_inode_operations = { + .lookup = fuse_lookup, + .mkdir = fuse_mkdir, + .symlink = fuse_symlink, + .unlink = fuse_unlink, + .rmdir = fuse_rmdir, + .rename = fuse_rename2, + .link = fuse_link, + .setattr = fuse_setattr, + .create = fuse_create, + .atomic_open = fuse_atomic_open, + .tmpfile = fuse_tmpfile, + .mknod = fuse_mknod, + .permission = fuse_permission, + .getattr = fuse_getattr, + .listxattr = fuse_listxattr, + .get_acl = fuse_get_acl, + .set_acl = fuse_set_acl, + .fileattr_get = fuse_fileattr_get, + .fileattr_set = fuse_fileattr_set, +}; + +static const struct file_operations fuse_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = fuse_readdir, + .open = fuse_dir_open, + .release = fuse_dir_release, + .fsync = fuse_dir_fsync, + .unlocked_ioctl = fuse_dir_ioctl, + .compat_ioctl = fuse_dir_compat_ioctl, +}; + +static const struct inode_operations fuse_common_inode_operations = { + .setattr = fuse_setattr, + .permission = fuse_permission, + .getattr = fuse_getattr, + .listxattr = fuse_listxattr, + .get_acl = fuse_get_acl, + .set_acl = fuse_set_acl, + .fileattr_get = fuse_fileattr_get, + .fileattr_set = fuse_fileattr_set, +}; + +static const struct inode_operations fuse_symlink_inode_operations = { + .setattr = fuse_setattr, + .get_link = fuse_get_link, + .getattr = fuse_getattr, + .listxattr = fuse_listxattr, +}; + +void fuse_init_common(struct inode *inode) +{ + inode->i_op = &fuse_common_inode_operations; +} + +void fuse_init_dir(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + inode->i_op = &fuse_dir_inode_operations; + inode->i_fop = &fuse_dir_operations; + + spin_lock_init(&fi->rdc.lock); + fi->rdc.cached = false; + fi->rdc.size = 0; + fi->rdc.pos = 0; + fi->rdc.version = 0; +} + +static int fuse_symlink_read_folio(struct file *null, struct folio *folio) +{ + int err = fuse_readlink_page(folio->mapping->host, &folio->page); + + if (!err) + folio_mark_uptodate(folio); + + folio_unlock(folio); + + return err; +} + +static const struct address_space_operations fuse_symlink_aops = { + .read_folio = fuse_symlink_read_folio, +}; + +void fuse_init_symlink(struct inode *inode) +{ + inode->i_op = &fuse_symlink_inode_operations; + inode->i_data.a_ops = &fuse_symlink_aops; + inode_nohighmem(inode); +} diff --git a/fs/fuse/file.c b/fs/fuse/file.c new file mode 100644 index 000000000..c996c0ef8 --- /dev/null +++ b/fs/fuse/file.c @@ -0,0 +1,3216 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, + unsigned int open_flags, int opcode, + struct fuse_open_out *outargp) +{ + struct fuse_open_in inarg; + FUSE_ARGS(args); + + memset(&inarg, 0, sizeof(inarg)); + inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); + if (!fm->fc->atomic_o_trunc) + inarg.flags &= ~O_TRUNC; + + if (fm->fc->handle_killpriv_v2 && + (inarg.flags & O_TRUNC) && !capable(CAP_FSETID)) { + inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; + } + + args.opcode = opcode; + args.nodeid = nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(*outargp); + args.out_args[0].value = outargp; + + return fuse_simple_request(fm, &args); +} + +struct fuse_release_args { + struct fuse_args args; + struct fuse_release_in inarg; + struct inode *inode; +}; + +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) +{ + struct fuse_file *ff; + + ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT); + if (unlikely(!ff)) + return NULL; + + ff->fm = fm; + ff->release_args = kzalloc(sizeof(*ff->release_args), + GFP_KERNEL_ACCOUNT); + if (!ff->release_args) { + kfree(ff); + return NULL; + } + + INIT_LIST_HEAD(&ff->write_entry); + mutex_init(&ff->readdir.lock); + refcount_set(&ff->count, 1); + RB_CLEAR_NODE(&ff->polled_node); + init_waitqueue_head(&ff->poll_wait); + + ff->kh = atomic64_inc_return(&fm->fc->khctr); + + return ff; +} + +void fuse_file_free(struct fuse_file *ff) +{ + kfree(ff->release_args); + mutex_destroy(&ff->readdir.lock); + kfree(ff); +} + +static struct fuse_file *fuse_file_get(struct fuse_file *ff) +{ + refcount_inc(&ff->count); + return ff; +} + +static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args, + int error) +{ + struct fuse_release_args *ra = container_of(args, typeof(*ra), args); + + iput(ra->inode); + kfree(ra); +} + +static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) +{ + if (refcount_dec_and_test(&ff->count)) { + struct fuse_args *args = &ff->release_args->args; + + if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) { + /* Do nothing when client does not implement 'open' */ + fuse_release_end(ff->fm, args, 0); + } else if (sync) { + fuse_simple_request(ff->fm, args); + fuse_release_end(ff->fm, args, 0); + } else { + args->end = fuse_release_end; + if (fuse_simple_background(ff->fm, args, + GFP_KERNEL | __GFP_NOFAIL)) + fuse_release_end(ff->fm, args, -ENOTCONN); + } + kfree(ff); + } +} + +struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, + unsigned int open_flags, bool isdir) +{ + struct fuse_conn *fc = fm->fc; + struct fuse_file *ff; + int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; + + ff = fuse_file_alloc(fm); + if (!ff) + return ERR_PTR(-ENOMEM); + + ff->fh = 0; + /* Default for no-open */ + ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0); + if (isdir ? !fc->no_opendir : !fc->no_open) { + struct fuse_open_out outarg; + int err; + + err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg); + if (!err) { + ff->fh = outarg.fh; + ff->open_flags = outarg.open_flags; + + } else if (err != -ENOSYS) { + fuse_file_free(ff); + return ERR_PTR(err); + } else { + if (isdir) + fc->no_opendir = 1; + else + fc->no_open = 1; + } + } + + if (isdir) + ff->open_flags &= ~FOPEN_DIRECT_IO; + + ff->nodeid = nodeid; + + return ff; +} + +int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, + bool isdir) +{ + struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir); + + if (!IS_ERR(ff)) + file->private_data = ff; + + return PTR_ERR_OR_ZERO(ff); +} +EXPORT_SYMBOL_GPL(fuse_do_open); + +static void fuse_link_write_file(struct file *file) +{ + struct inode *inode = file_inode(file); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff = file->private_data; + /* + * file may be written through mmap, so chain it onto the + * inodes's write_file list + */ + spin_lock(&fi->lock); + if (list_empty(&ff->write_entry)) + list_add(&ff->write_entry, &fi->write_files); + spin_unlock(&fi->lock); +} + +void fuse_finish_open(struct inode *inode, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + + if (ff->open_flags & FOPEN_STREAM) + stream_open(inode, file); + else if (ff->open_flags & FOPEN_NONSEEKABLE) + nonseekable_open(inode, file); + + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + i_size_write(inode, 0); + spin_unlock(&fi->lock); + file_update_time(file); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); + } + if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) + fuse_link_write_file(file); +} + +int fuse_open_common(struct inode *inode, struct file *file, bool isdir) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn *fc = fm->fc; + int err; + bool is_wb_truncate = (file->f_flags & O_TRUNC) && + fc->atomic_o_trunc && + fc->writeback_cache; + bool dax_truncate = (file->f_flags & O_TRUNC) && + fc->atomic_o_trunc && FUSE_IS_DAX(inode); + + if (fuse_is_bad(inode)) + return -EIO; + + err = generic_file_open(inode, file); + if (err) + return err; + + if (is_wb_truncate || dax_truncate) + inode_lock(inode); + + if (dax_truncate) { + filemap_invalidate_lock(inode->i_mapping); + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) + goto out_inode_unlock; + } + + if (is_wb_truncate || dax_truncate) + fuse_set_nowrite(inode); + + err = fuse_do_open(fm, get_node_id(inode), file, isdir); + if (!err) + fuse_finish_open(inode, file); + + if (is_wb_truncate || dax_truncate) + fuse_release_nowrite(inode); + if (!err) { + struct fuse_file *ff = file->private_data; + + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) + truncate_pagecache(inode, 0); + else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); + } + if (dax_truncate) + filemap_invalidate_unlock(inode->i_mapping); +out_inode_unlock: + if (is_wb_truncate || dax_truncate) + inode_unlock(inode); + + return err; +} + +static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, + unsigned int flags, int opcode) +{ + struct fuse_conn *fc = ff->fm->fc; + struct fuse_release_args *ra = ff->release_args; + + /* Inode is NULL on error path of fuse_create_open() */ + if (likely(fi)) { + spin_lock(&fi->lock); + list_del(&ff->write_entry); + spin_unlock(&fi->lock); + } + spin_lock(&fc->lock); + if (!RB_EMPTY_NODE(&ff->polled_node)) + rb_erase(&ff->polled_node, &fc->polled_files); + spin_unlock(&fc->lock); + + wake_up_interruptible_all(&ff->poll_wait); + + ra->inarg.fh = ff->fh; + ra->inarg.flags = flags; + ra->args.in_numargs = 1; + ra->args.in_args[0].size = sizeof(struct fuse_release_in); + ra->args.in_args[0].value = &ra->inarg; + ra->args.opcode = opcode; + ra->args.nodeid = ff->nodeid; + ra->args.force = true; + ra->args.nocreds = true; +} + +void fuse_file_release(struct inode *inode, struct fuse_file *ff, + unsigned int open_flags, fl_owner_t id, bool isdir) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_release_args *ra = ff->release_args; + int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; + + fuse_prepare_release(fi, ff, open_flags, opcode); + + if (ff->flock) { + ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; + ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id); + } + /* Hold inode until release is finished */ + ra->inode = igrab(inode); + + /* + * Normally this will send the RELEASE request, however if + * some asynchronous READ or WRITE requests are outstanding, + * the sending will be delayed. + * + * Make the release synchronous if this is a fuseblk mount, + * synchronous RELEASE is allowed (and desirable) in this case + * because the server can be trusted not to screw up. + */ + fuse_file_put(ff, ff->fm->fc->destroy, isdir); +} + +void fuse_release_common(struct file *file, bool isdir) +{ + fuse_file_release(file_inode(file), file->private_data, file->f_flags, + (fl_owner_t) file, isdir); +} + +static int fuse_open(struct inode *inode, struct file *file) +{ + return fuse_open_common(inode, file, false); +} + +static int fuse_release(struct inode *inode, struct file *file) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + /* + * Dirty pages might remain despite write_inode_now() call from + * fuse_flush() due to writes racing with the close. + */ + if (fc->writeback_cache) + write_inode_now(inode, 1); + + fuse_release_common(file, false); + + /* return value is ignored by VFS */ + return 0; +} + +void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, + unsigned int flags) +{ + WARN_ON(refcount_read(&ff->count) > 1); + fuse_prepare_release(fi, ff, flags, FUSE_RELEASE); + /* + * iput(NULL) is a no-op and since the refcount is 1 and everything's + * synchronous, we are fine with not doing igrab() here" + */ + fuse_file_put(ff, true, false); +} +EXPORT_SYMBOL_GPL(fuse_sync_release); + +/* + * Scramble the ID space with XTEA, so that the value of the files_struct + * pointer is not exposed to userspace. + */ +u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) +{ + u32 *k = fc->scramble_key; + u64 v = (unsigned long) id; + u32 v0 = v; + u32 v1 = v >> 32; + u32 sum = 0; + int i; + + for (i = 0; i < 32; i++) { + v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]); + sum += 0x9E3779B9; + v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]); + } + + return (u64) v0 + ((u64) v1 << 32); +} + +struct fuse_writepage_args { + struct fuse_io_args ia; + struct rb_node writepages_entry; + struct list_head queue_entry; + struct fuse_writepage_args *next; + struct inode *inode; + struct fuse_sync_bucket *bucket; +}; + +static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, + pgoff_t idx_from, pgoff_t idx_to) +{ + struct rb_node *n; + + n = fi->writepages.rb_node; + + while (n) { + struct fuse_writepage_args *wpa; + pgoff_t curr_index; + + wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); + WARN_ON(get_fuse_inode(wpa->inode) != fi); + curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; + if (idx_from >= curr_index + wpa->ia.ap.num_pages) + n = n->rb_right; + else if (idx_to < curr_index) + n = n->rb_left; + else + return wpa; + } + return NULL; +} + +/* + * Check if any page in a range is under writeback + * + * This is currently done by walking the list of writepage requests + * for the inode, which can be pretty inefficient. + */ +static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, + pgoff_t idx_to) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + bool found; + + spin_lock(&fi->lock); + found = fuse_find_writeback(fi, idx_from, idx_to); + spin_unlock(&fi->lock); + + return found; +} + +static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) +{ + return fuse_range_is_writeback(inode, index, index); +} + +/* + * Wait for page writeback to be completed. + * + * Since fuse doesn't rely on the VM writeback tracking, this has to + * use some other means. + */ +static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); +} + +/* + * Wait for all pending writepages on the inode to finish. + * + * This is currently done by blocking further writes with FUSE_NOWRITE + * and waiting for all sent writes to complete. + * + * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage + * could conflict with truncation. + */ +static void fuse_sync_writes(struct inode *inode) +{ + fuse_set_nowrite(inode); + fuse_release_nowrite(inode); +} + +static int fuse_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_file *ff = file->private_data; + struct fuse_flush_in inarg; + FUSE_ARGS(args); + int err; + + if (fuse_is_bad(inode)) + return -EIO; + + if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache) + return 0; + + err = write_inode_now(inode, 1); + if (err) + return err; + + inode_lock(inode); + fuse_sync_writes(inode); + inode_unlock(inode); + + err = filemap_check_errors(file->f_mapping); + if (err) + return err; + + err = 0; + if (fm->fc->no_flush) + goto inval_attr_out; + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); + args.opcode = FUSE_FLUSH; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.force = true; + + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + fm->fc->no_flush = 1; + err = 0; + } + +inval_attr_out: + /* + * In memory i_blocks is not maintained by fuse, if writeback cache is + * enabled, i_blocks from cached attr may not be accurate. + */ + if (!err && fm->fc->writeback_cache) + fuse_invalidate_attr_mask(inode, STATX_BLOCKS); + return err; +} + +int fuse_fsync_common(struct file *file, loff_t start, loff_t end, + int datasync, int opcode) +{ + struct inode *inode = file->f_mapping->host; + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_file *ff = file->private_data; + FUSE_ARGS(args); + struct fuse_fsync_in inarg; + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0; + args.opcode = opcode; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + return fuse_simple_request(fm, &args); +} + +static int fuse_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + if (fuse_is_bad(inode)) + return -EIO; + + inode_lock(inode); + + /* + * Start writeback against all dirty pages of the inode, then + * wait for all outstanding writes, before sending the FSYNC + * request. + */ + err = file_write_and_wait_range(file, start, end); + if (err) + goto out; + + fuse_sync_writes(inode); + + /* + * Due to implementation of fuse writeback + * file_write_and_wait_range() does not catch errors. + * We have to do this directly after fuse_sync_writes() + */ + err = file_check_and_advance_wb_err(file); + if (err) + goto out; + + err = sync_inode_metadata(inode, 1); + if (err) + goto out; + + if (fc->no_fsync) + goto out; + + err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC); + if (err == -ENOSYS) { + fc->no_fsync = 1; + err = 0; + } +out: + inode_unlock(inode); + + return err; +} + +void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, + size_t count, int opcode) +{ + struct fuse_file *ff = file->private_data; + struct fuse_args *args = &ia->ap.args; + + ia->read.in.fh = ff->fh; + ia->read.in.offset = pos; + ia->read.in.size = count; + ia->read.in.flags = file->f_flags; + args->opcode = opcode; + args->nodeid = ff->nodeid; + args->in_numargs = 1; + args->in_args[0].size = sizeof(ia->read.in); + args->in_args[0].value = &ia->read.in; + args->out_argvar = true; + args->out_numargs = 1; + args->out_args[0].size = count; +} + +static void fuse_release_user_pages(struct fuse_args_pages *ap, + bool should_dirty) +{ + unsigned int i; + + for (i = 0; i < ap->num_pages; i++) { + if (should_dirty) + set_page_dirty_lock(ap->pages[i]); + put_page(ap->pages[i]); + } +} + +static void fuse_io_release(struct kref *kref) +{ + kfree(container_of(kref, struct fuse_io_priv, refcnt)); +} + +static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) +{ + if (io->err) + return io->err; + + if (io->bytes >= 0 && io->write) + return -EIO; + + return io->bytes < 0 ? io->size : io->bytes; +} + +/** + * In case of short read, the caller sets 'pos' to the position of + * actual end of fuse request in IO request. Otherwise, if bytes_requested + * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1. + * + * An example: + * User requested DIO read of 64K. It was split into two 32K fuse requests, + * both submitted asynchronously. The first of them was ACKed by userspace as + * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The + * second request was ACKed as short, e.g. only 1K was read, resulting in + * pos == 33K. + * + * Thus, when all fuse requests are completed, the minimal non-negative 'pos' + * will be equal to the length of the longest contiguous fragment of + * transferred data starting from the beginning of IO request. + */ +static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) +{ + int left; + + spin_lock(&io->lock); + if (err) + io->err = io->err ? : err; + else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes)) + io->bytes = pos; + + left = --io->reqs; + if (!left && io->blocking) + complete(io->done); + spin_unlock(&io->lock); + + if (!left && !io->blocking) { + ssize_t res = fuse_get_res_by_io(io); + + if (res >= 0) { + struct inode *inode = file_inode(io->iocb->ki_filp); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + spin_unlock(&fi->lock); + } + + io->iocb->ki_complete(io->iocb, res); + } + + kref_put(&io->refcnt, fuse_io_release); +} + +static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, + unsigned int npages) +{ + struct fuse_io_args *ia; + + ia = kzalloc(sizeof(*ia), GFP_KERNEL); + if (ia) { + ia->io = io; + ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL, + &ia->ap.descs); + if (!ia->ap.pages) { + kfree(ia); + ia = NULL; + } + } + return ia; +} + +static void fuse_io_free(struct fuse_io_args *ia) +{ + kfree(ia->ap.pages); + kfree(ia); +} + +static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, + int err) +{ + struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); + struct fuse_io_priv *io = ia->io; + ssize_t pos = -1; + + fuse_release_user_pages(&ia->ap, io->should_dirty); + + if (err) { + /* Nothing */ + } else if (io->write) { + if (ia->write.out.size > ia->write.in.size) { + err = -EIO; + } else if (ia->write.in.size != ia->write.out.size) { + pos = ia->write.in.offset - io->offset + + ia->write.out.size; + } + } else { + u32 outsize = args->out_args[0].size; + + if (ia->read.in.size != outsize) + pos = ia->read.in.offset - io->offset + outsize; + } + + fuse_aio_complete(io, err, pos); + fuse_io_free(ia); +} + +static ssize_t fuse_async_req_send(struct fuse_mount *fm, + struct fuse_io_args *ia, size_t num_bytes) +{ + ssize_t err; + struct fuse_io_priv *io = ia->io; + + spin_lock(&io->lock); + kref_get(&io->refcnt); + io->size += num_bytes; + io->reqs++; + spin_unlock(&io->lock); + + ia->ap.args.end = fuse_aio_complete_req; + ia->ap.args.may_block = io->should_dirty; + err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL); + if (err) + fuse_aio_complete_req(fm, &ia->ap.args, err); + + return num_bytes; +} + +static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, + fl_owner_t owner) +{ + struct file *file = ia->io->iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct fuse_mount *fm = ff->fm; + + fuse_read_args_fill(ia, file, pos, count, FUSE_READ); + if (owner != NULL) { + ia->read.in.read_flags |= FUSE_READ_LOCKOWNER; + ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner); + } + + if (ia->io->async) + return fuse_async_req_send(fm, ia, count); + + return fuse_simple_request(fm, &ia->ap.args); +} + +static void fuse_read_update_size(struct inode *inode, loff_t size, + u64 attr_ver) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + if (attr_ver >= fi->attr_version && size < inode->i_size && + !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { + fi->attr_version = atomic64_inc_return(&fc->attr_version); + i_size_write(inode, size); + } + spin_unlock(&fi->lock); +} + +static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, + struct fuse_args_pages *ap) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + /* + * If writeback_cache is enabled, a short read means there's a hole in + * the file. Some data after the hole is in page cache, but has not + * reached the client fs yet. So the hole is not present there. + */ + if (!fc->writeback_cache) { + loff_t pos = page_offset(ap->pages[0]) + num_read; + fuse_read_update_size(inode, pos, attr_ver); + } +} + +static int fuse_do_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct fuse_mount *fm = get_fuse_mount(inode); + loff_t pos = page_offset(page); + struct fuse_page_desc desc = { .length = PAGE_SIZE }; + struct fuse_io_args ia = { + .ap.args.page_zeroing = true, + .ap.args.out_pages = true, + .ap.num_pages = 1, + .ap.pages = &page, + .ap.descs = &desc, + }; + ssize_t res; + u64 attr_ver; + + /* + * Page writeback can extend beyond the lifetime of the + * page-cache page, so make sure we read a properly synced + * page. + */ + fuse_wait_on_page_writeback(inode, page->index); + + attr_ver = fuse_get_attr_version(fm->fc); + + /* Don't overflow end offset */ + if (pos + (desc.length - 1) == LLONG_MAX) + desc.length--; + + fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); + res = fuse_simple_request(fm, &ia.ap.args); + if (res < 0) + return res; + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (res < desc.length) + fuse_short_read(inode, attr_ver, res, &ia.ap); + + SetPageUptodate(page); + + return 0; +} + +static int fuse_read_folio(struct file *file, struct folio *folio) +{ + struct page *page = &folio->page; + struct inode *inode = page->mapping->host; + int err; + + err = -EIO; + if (fuse_is_bad(inode)) + goto out; + + err = fuse_do_readpage(file, page); + fuse_invalidate_atime(inode); + out: + unlock_page(page); + return err; +} + +static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, + int err) +{ + int i; + struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); + struct fuse_args_pages *ap = &ia->ap; + size_t count = ia->read.in.size; + size_t num_read = args->out_args[0].size; + struct address_space *mapping = NULL; + + for (i = 0; mapping == NULL && i < ap->num_pages; i++) + mapping = ap->pages[i]->mapping; + + if (mapping) { + struct inode *inode = mapping->host; + + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (!err && num_read < count) + fuse_short_read(inode, ia->read.attr_ver, num_read, ap); + + fuse_invalidate_atime(inode); + } + + for (i = 0; i < ap->num_pages; i++) { + struct page *page = ap->pages[i]; + + if (!err) + SetPageUptodate(page); + else + SetPageError(page); + unlock_page(page); + put_page(page); + } + if (ia->ff) + fuse_file_put(ia->ff, false, false); + + fuse_io_free(ia); +} + +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_mount *fm = ff->fm; + struct fuse_args_pages *ap = &ia->ap; + loff_t pos = page_offset(ap->pages[0]); + size_t count = ap->num_pages << PAGE_SHIFT; + ssize_t res; + int err; + + ap->args.out_pages = true; + ap->args.page_zeroing = true; + ap->args.page_replace = true; + + /* Don't overflow end offset */ + if (pos + (count - 1) == LLONG_MAX) { + count--; + ap->descs[ap->num_pages - 1].length--; + } + WARN_ON((loff_t) (pos + count) < 0); + + fuse_read_args_fill(ia, file, pos, count, FUSE_READ); + ia->read.attr_ver = fuse_get_attr_version(fm->fc); + if (fm->fc->async_read) { + ia->ff = fuse_file_get(ff); + ap->args.end = fuse_readpages_end; + err = fuse_simple_background(fm, &ap->args, GFP_KERNEL); + if (!err) + return; + } else { + res = fuse_simple_request(fm, &ap->args); + err = res < 0 ? res : 0; + } + fuse_readpages_end(fm, &ap->args, err); +} + +static void fuse_readahead(struct readahead_control *rac) +{ + struct inode *inode = rac->mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + unsigned int i, max_pages, nr_pages = 0; + + if (fuse_is_bad(inode)) + return; + + max_pages = min_t(unsigned int, fc->max_pages, + fc->max_read / PAGE_SIZE); + + for (;;) { + struct fuse_io_args *ia; + struct fuse_args_pages *ap; + + if (fc->num_background >= fc->congestion_threshold && + rac->ra->async_size >= readahead_count(rac)) + /* + * Congested and only async pages left, so skip the + * rest. + */ + break; + + nr_pages = readahead_count(rac) - nr_pages; + if (nr_pages > max_pages) + nr_pages = max_pages; + if (nr_pages == 0) + break; + ia = fuse_io_alloc(NULL, nr_pages); + if (!ia) + return; + ap = &ia->ap; + nr_pages = __readahead_batch(rac, ap->pages, nr_pages); + for (i = 0; i < nr_pages; i++) { + fuse_wait_on_page_writeback(inode, + readahead_index(rac) + i); + ap->descs[i].length = PAGE_SIZE; + } + ap->num_pages = nr_pages; + fuse_send_readpages(ia, rac->file); + } +} + +static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + + /* + * In auto invalidate mode, always update attributes on read. + * Otherwise, only update if we attempt to read past EOF (to ensure + * i_size is up to date). + */ + if (fc->auto_inval_data || + (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) { + int err; + err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE); + if (err) + return err; + } + + return generic_file_read_iter(iocb, to); +} + +static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, + loff_t pos, size_t count) +{ + struct fuse_args *args = &ia->ap.args; + + ia->write.in.fh = ff->fh; + ia->write.in.offset = pos; + ia->write.in.size = count; + args->opcode = FUSE_WRITE; + args->nodeid = ff->nodeid; + args->in_numargs = 2; + if (ff->fm->fc->minor < 9) + args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; + else + args->in_args[0].size = sizeof(ia->write.in); + args->in_args[0].value = &ia->write.in; + args->in_args[1].size = count; + args->out_numargs = 1; + args->out_args[0].size = sizeof(ia->write.out); + args->out_args[0].value = &ia->write.out; +} + +static unsigned int fuse_write_flags(struct kiocb *iocb) +{ + unsigned int flags = iocb->ki_filp->f_flags; + + if (iocb_is_dsync(iocb)) + flags |= O_DSYNC; + if (iocb->ki_flags & IOCB_SYNC) + flags |= O_SYNC; + + return flags; +} + +static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, + size_t count, fl_owner_t owner) +{ + struct kiocb *iocb = ia->io->iocb; + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct fuse_mount *fm = ff->fm; + struct fuse_write_in *inarg = &ia->write.in; + ssize_t err; + + fuse_write_args_fill(ia, ff, pos, count); + inarg->flags = fuse_write_flags(iocb); + if (owner != NULL) { + inarg->write_flags |= FUSE_WRITE_LOCKOWNER; + inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner); + } + + if (ia->io->async) + return fuse_async_req_send(fm, ia, count); + + err = fuse_simple_request(fm, &ia->ap.args); + if (!err && ia->write.out.size > count) + err = -EIO; + + return err ?: ia->write.out.size; +} + +bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + bool ret = false; + + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + if (written > 0 && pos > inode->i_size) { + i_size_write(inode, pos); + ret = true; + } + spin_unlock(&fi->lock); + + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); + + return ret; +} + +static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, + struct kiocb *iocb, struct inode *inode, + loff_t pos, size_t count) +{ + struct fuse_args_pages *ap = &ia->ap; + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct fuse_mount *fm = ff->fm; + unsigned int offset, i; + bool short_write; + int err; + + for (i = 0; i < ap->num_pages; i++) + fuse_wait_on_page_writeback(inode, ap->pages[i]->index); + + fuse_write_args_fill(ia, ff, pos, count); + ia->write.in.flags = fuse_write_flags(iocb); + if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID)) + ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID; + + err = fuse_simple_request(fm, &ap->args); + if (!err && ia->write.out.size > count) + err = -EIO; + + short_write = ia->write.out.size < count; + offset = ap->descs[0].offset; + count = ia->write.out.size; + for (i = 0; i < ap->num_pages; i++) { + struct page *page = ap->pages[i]; + + if (err) { + ClearPageUptodate(page); + } else { + if (count >= PAGE_SIZE - offset) + count -= PAGE_SIZE - offset; + else { + if (short_write) + ClearPageUptodate(page); + count = 0; + } + offset = 0; + } + if (ia->write.page_locked && (i == ap->num_pages - 1)) + unlock_page(page); + put_page(page); + } + + return err; +} + +static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos, + unsigned int max_pages) +{ + struct fuse_args_pages *ap = &ia->ap; + struct fuse_conn *fc = get_fuse_conn(mapping->host); + unsigned offset = pos & (PAGE_SIZE - 1); + size_t count = 0; + int err; + + ap->args.in_pages = true; + ap->descs[0].offset = offset; + + do { + size_t tmp; + struct page *page; + pgoff_t index = pos >> PAGE_SHIFT; + size_t bytes = min_t(size_t, PAGE_SIZE - offset, + iov_iter_count(ii)); + + bytes = min_t(size_t, bytes, fc->max_write - count); + + again: + err = -EFAULT; + if (fault_in_iov_iter_readable(ii, bytes)) + break; + + err = -ENOMEM; + page = grab_cache_page_write_begin(mapping, index); + if (!page) + break; + + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + + tmp = copy_page_from_iter_atomic(page, offset, bytes, ii); + flush_dcache_page(page); + + if (!tmp) { + unlock_page(page); + put_page(page); + goto again; + } + + err = 0; + ap->pages[ap->num_pages] = page; + ap->descs[ap->num_pages].length = tmp; + ap->num_pages++; + + count += tmp; + pos += tmp; + offset += tmp; + if (offset == PAGE_SIZE) + offset = 0; + + /* If we copied full page, mark it uptodate */ + if (tmp == PAGE_SIZE) + SetPageUptodate(page); + + if (PageUptodate(page)) { + unlock_page(page); + } else { + ia->write.page_locked = true; + break; + } + if (!fc->big_writes) + break; + } while (iov_iter_count(ii) && count < fc->max_write && + ap->num_pages < max_pages && offset == 0); + + return count > 0 ? count : err; +} + +static inline unsigned int fuse_wr_pages(loff_t pos, size_t len, + unsigned int max_pages) +{ + return min_t(unsigned int, + ((pos + len - 1) >> PAGE_SHIFT) - + (pos >> PAGE_SHIFT) + 1, + max_pages); +} + +static ssize_t fuse_perform_write(struct kiocb *iocb, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + int err = 0; + ssize_t res = 0; + + if (inode->i_size < pos + iov_iter_count(ii)) + set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + + do { + ssize_t count; + struct fuse_io_args ia = {}; + struct fuse_args_pages *ap = &ia.ap; + unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), + fc->max_pages); + + ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs); + if (!ap->pages) { + err = -ENOMEM; + break; + } + + count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages); + if (count <= 0) { + err = count; + } else { + err = fuse_send_write_pages(&ia, iocb, inode, + pos, count); + if (!err) { + size_t num_written = ia.write.out.size; + + res += num_written; + pos += num_written; + + /* break out of the loop on short write */ + if (num_written != count) + err = -EIO; + } + } + kfree(ap->pages); + } while (!err && iov_iter_count(ii)); + + fuse_write_update_attr(inode, pos, res); + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + + return res > 0 ? res : err; +} + +static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + ssize_t written = 0; + ssize_t written_buffered = 0; + struct inode *inode = mapping->host; + ssize_t err; + struct fuse_conn *fc = get_fuse_conn(inode); + loff_t endbyte = 0; + + if (fc->writeback_cache) { + /* Update size (EOF optimization) and mode (SUID clearing) */ + err = fuse_update_attributes(mapping->host, file, + STATX_SIZE | STATX_MODE); + if (err) + return err; + + if (fc->handle_killpriv_v2 && + setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) { + goto writethrough; + } + + return generic_file_write_iter(iocb, from); + } + +writethrough: + inode_lock(inode); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = inode_to_bdi(inode); + + err = generic_write_checks(iocb, from); + if (err <= 0) + goto out; + + err = file_remove_privs(file); + if (err) + goto out; + + err = file_update_time(file); + if (err) + goto out; + + if (iocb->ki_flags & IOCB_DIRECT) { + loff_t pos = iocb->ki_pos; + written = generic_file_direct_write(iocb, from); + if (written < 0 || !iov_iter_count(from)) + goto out; + + pos += written; + + written_buffered = fuse_perform_write(iocb, mapping, from, pos); + if (written_buffered < 0) { + err = written_buffered; + goto out; + } + endbyte = pos + written_buffered - 1; + + err = filemap_write_and_wait_range(file->f_mapping, pos, + endbyte); + if (err) + goto out; + + invalidate_mapping_pages(file->f_mapping, + pos >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); + + written += written_buffered; + iocb->ki_pos = pos + written_buffered; + } else { + written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos); + if (written >= 0) + iocb->ki_pos += written; + } +out: + current->backing_dev_info = NULL; + inode_unlock(inode); + if (written > 0) + written = generic_write_sync(iocb, written); + + return written ? written : err; +} + +static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) +{ + return (unsigned long)ii->iov->iov_base + ii->iov_offset; +} + +static inline size_t fuse_get_frag_size(const struct iov_iter *ii, + size_t max_size) +{ + return min(iov_iter_single_seg_count(ii), max_size); +} + +static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, + size_t *nbytesp, int write, + unsigned int max_pages) +{ + size_t nbytes = 0; /* # bytes already packed in req */ + ssize_t ret = 0; + + /* Special case for kernel I/O: can copy directly into the buffer */ + if (iov_iter_is_kvec(ii)) { + unsigned long user_addr = fuse_get_user_addr(ii); + size_t frag_size = fuse_get_frag_size(ii, *nbytesp); + + if (write) + ap->args.in_args[1].value = (void *) user_addr; + else + ap->args.out_args[0].value = (void *) user_addr; + + iov_iter_advance(ii, frag_size); + *nbytesp = frag_size; + return 0; + } + + while (nbytes < *nbytesp && ap->num_pages < max_pages) { + unsigned npages; + size_t start; + ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages], + *nbytesp - nbytes, + max_pages - ap->num_pages, + &start); + if (ret < 0) + break; + + nbytes += ret; + + ret += start; + npages = DIV_ROUND_UP(ret, PAGE_SIZE); + + ap->descs[ap->num_pages].offset = start; + fuse_page_descs_length_init(ap->descs, ap->num_pages, npages); + + ap->num_pages += npages; + ap->descs[ap->num_pages - 1].length -= + (PAGE_SIZE - ret) & (PAGE_SIZE - 1); + } + + ap->args.user_pages = true; + if (write) + ap->args.in_pages = true; + else + ap->args.out_pages = true; + + *nbytesp = nbytes; + + return ret < 0 ? ret : 0; +} + +ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, + loff_t *ppos, int flags) +{ + int write = flags & FUSE_DIO_WRITE; + int cuse = flags & FUSE_DIO_CUSE; + struct file *file = io->iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fm->fc; + size_t nmax = write ? fc->max_write : fc->max_read; + loff_t pos = *ppos; + size_t count = iov_iter_count(iter); + pgoff_t idx_from = pos >> PAGE_SHIFT; + pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT; + ssize_t res = 0; + int err = 0; + struct fuse_io_args *ia; + unsigned int max_pages; + + max_pages = iov_iter_npages(iter, fc->max_pages); + ia = fuse_io_alloc(io, max_pages); + if (!ia) + return -ENOMEM; + + if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { + if (!write) + inode_lock(inode); + fuse_sync_writes(inode); + if (!write) + inode_unlock(inode); + } + + io->should_dirty = !write && user_backed_iter(iter); + while (count) { + ssize_t nres; + fl_owner_t owner = current->files; + size_t nbytes = min(count, nmax); + + err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, + max_pages); + if (err && !nbytes) + break; + + if (write) { + if (!capable(CAP_FSETID)) + ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID; + + nres = fuse_send_write(ia, pos, nbytes, owner); + } else { + nres = fuse_send_read(ia, pos, nbytes, owner); + } + + if (!io->async || nres < 0) { + fuse_release_user_pages(&ia->ap, io->should_dirty); + fuse_io_free(ia); + } + ia = NULL; + if (nres < 0) { + iov_iter_revert(iter, nbytes); + err = nres; + break; + } + WARN_ON(nres > nbytes); + + count -= nres; + res += nres; + pos += nres; + if (nres != nbytes) { + iov_iter_revert(iter, nbytes - nres); + break; + } + if (count) { + max_pages = iov_iter_npages(iter, fc->max_pages); + ia = fuse_io_alloc(io, max_pages); + if (!ia) + break; + } + } + if (ia) + fuse_io_free(ia); + if (res > 0) + *ppos = pos; + + return res > 0 ? res : err; +} +EXPORT_SYMBOL_GPL(fuse_direct_io); + +static ssize_t __fuse_direct_read(struct fuse_io_priv *io, + struct iov_iter *iter, + loff_t *ppos) +{ + ssize_t res; + struct inode *inode = file_inode(io->iocb->ki_filp); + + res = fuse_direct_io(io, iter, ppos, 0); + + fuse_invalidate_atime(inode); + + return res; +} + +static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter); + +static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t res; + + if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + res = fuse_direct_IO(iocb, to); + } else { + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + + res = __fuse_direct_read(&io, to, &iocb->ki_pos); + } + + return res; +} + +static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + ssize_t res; + + /* Don't allow parallel writes to the same file */ + inode_lock(inode); + res = generic_write_checks(iocb, from); + if (res > 0) { + if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + res = fuse_direct_IO(iocb, from); + } else { + res = fuse_direct_io(&io, from, &iocb->ki_pos, + FUSE_DIO_WRITE); + fuse_write_update_attr(inode, iocb->ki_pos, res); + } + } + inode_unlock(inode); + + return res; +} + +static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + + if (fuse_is_bad(inode)) + return -EIO; + + if (FUSE_IS_DAX(inode)) + return fuse_dax_read_iter(iocb, to); + + if (!(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_cache_read_iter(iocb, to); + else + return fuse_direct_read_iter(iocb, to); +} + +static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + + if (fuse_is_bad(inode)) + return -EIO; + + if (FUSE_IS_DAX(inode)) + return fuse_dax_write_iter(iocb, from); + + if (!(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_cache_write_iter(iocb, from); + else + return fuse_direct_write_iter(iocb, from); +} + +static void fuse_writepage_free(struct fuse_writepage_args *wpa) +{ + struct fuse_args_pages *ap = &wpa->ia.ap; + int i; + + if (wpa->bucket) + fuse_sync_bucket_dec(wpa->bucket); + + for (i = 0; i < ap->num_pages; i++) + __free_page(ap->pages[i]); + + if (wpa->ia.ff) + fuse_file_put(wpa->ia.ff, false, false); + + kfree(ap->pages); + kfree(wpa); +} + +static void fuse_writepage_finish(struct fuse_mount *fm, + struct fuse_writepage_args *wpa) +{ + struct fuse_args_pages *ap = &wpa->ia.ap; + struct inode *inode = wpa->inode; + struct fuse_inode *fi = get_fuse_inode(inode); + struct backing_dev_info *bdi = inode_to_bdi(inode); + int i; + + for (i = 0; i < ap->num_pages; i++) { + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); + wb_writeout_inc(&bdi->wb); + } + wake_up(&fi->page_waitq); +} + +/* Called under fi->lock, may release and reacquire it */ +static void fuse_send_writepage(struct fuse_mount *fm, + struct fuse_writepage_args *wpa, loff_t size) +__releases(fi->lock) +__acquires(fi->lock) +{ + struct fuse_writepage_args *aux, *next; + struct fuse_inode *fi = get_fuse_inode(wpa->inode); + struct fuse_write_in *inarg = &wpa->ia.write.in; + struct fuse_args *args = &wpa->ia.ap.args; + __u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE; + int err; + + fi->writectr++; + if (inarg->offset + data_size <= size) { + inarg->size = data_size; + } else if (inarg->offset < size) { + inarg->size = size - inarg->offset; + } else { + /* Got truncated off completely */ + goto out_free; + } + + args->in_args[1].size = inarg->size; + args->force = true; + args->nocreds = true; + + err = fuse_simple_background(fm, args, GFP_ATOMIC); + if (err == -ENOMEM) { + spin_unlock(&fi->lock); + err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL); + spin_lock(&fi->lock); + } + + /* Fails on broken connection only */ + if (unlikely(err)) + goto out_free; + + return; + + out_free: + fi->writectr--; + rb_erase(&wpa->writepages_entry, &fi->writepages); + fuse_writepage_finish(fm, wpa); + spin_unlock(&fi->lock); + + /* After fuse_writepage_finish() aux request list is private */ + for (aux = wpa->next; aux; aux = next) { + next = aux->next; + aux->next = NULL; + fuse_writepage_free(aux); + } + + fuse_writepage_free(wpa); + spin_lock(&fi->lock); +} + +/* + * If fi->writectr is positive (no truncate or fsync going on) send + * all queued writepage requests. + * + * Called with fi->lock + */ +void fuse_flush_writepages(struct inode *inode) +__releases(fi->lock) +__acquires(fi->lock) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + loff_t crop = i_size_read(inode); + struct fuse_writepage_args *wpa; + + while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { + wpa = list_entry(fi->queued_writes.next, + struct fuse_writepage_args, queue_entry); + list_del_init(&wpa->queue_entry); + fuse_send_writepage(fm, wpa, crop); + } +} + +static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, + struct fuse_writepage_args *wpa) +{ + pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; + pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + + WARN_ON(!wpa->ia.ap.num_pages); + while (*p) { + struct fuse_writepage_args *curr; + pgoff_t curr_index; + + parent = *p; + curr = rb_entry(parent, struct fuse_writepage_args, + writepages_entry); + WARN_ON(curr->inode != wpa->inode); + curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; + + if (idx_from >= curr_index + curr->ia.ap.num_pages) + p = &(*p)->rb_right; + else if (idx_to < curr_index) + p = &(*p)->rb_left; + else + return curr; + } + + rb_link_node(&wpa->writepages_entry, parent, p); + rb_insert_color(&wpa->writepages_entry, root); + return NULL; +} + +static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) +{ + WARN_ON(fuse_insert_writeback(root, wpa)); +} + +static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, + int error) +{ + struct fuse_writepage_args *wpa = + container_of(args, typeof(*wpa), ia.ap.args); + struct inode *inode = wpa->inode; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); + + mapping_set_error(inode->i_mapping, error); + /* + * A writeback finished and this might have updated mtime/ctime on + * server making local mtime/ctime stale. Hence invalidate attrs. + * Do this only if writeback_cache is not enabled. If writeback_cache + * is enabled, we trust local ctime/mtime. + */ + if (!fc->writeback_cache) + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY); + spin_lock(&fi->lock); + rb_erase(&wpa->writepages_entry, &fi->writepages); + while (wpa->next) { + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_write_in *inarg = &wpa->ia.write.in; + struct fuse_writepage_args *next = wpa->next; + + wpa->next = next->next; + next->next = NULL; + next->ia.ff = fuse_file_get(wpa->ia.ff); + tree_insert(&fi->writepages, next); + + /* + * Skip fuse_flush_writepages() to make it easy to crop requests + * based on primary request size. + * + * 1st case (trivial): there are no concurrent activities using + * fuse_set/release_nowrite. Then we're on safe side because + * fuse_flush_writepages() would call fuse_send_writepage() + * anyway. + * + * 2nd case: someone called fuse_set_nowrite and it is waiting + * now for completion of all in-flight requests. This happens + * rarely and no more than once per page, so this should be + * okay. + * + * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle + * of fuse_set_nowrite..fuse_release_nowrite section. The fact + * that fuse_set_nowrite returned implies that all in-flight + * requests were completed along with all of their secondary + * requests. Further primary requests are blocked by negative + * writectr. Hence there cannot be any in-flight requests and + * no invocations of fuse_writepage_end() while we're in + * fuse_set_nowrite..fuse_release_nowrite section. + */ + fuse_send_writepage(fm, next, inarg->offset + inarg->size); + } + fi->writectr--; + fuse_writepage_finish(fm, wpa); + spin_unlock(&fi->lock); + fuse_writepage_free(wpa); +} + +static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi) +{ + struct fuse_file *ff; + + spin_lock(&fi->lock); + ff = list_first_entry_or_null(&fi->write_files, struct fuse_file, + write_entry); + if (ff) + fuse_file_get(ff); + spin_unlock(&fi->lock); + + return ff; +} + +static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi) +{ + struct fuse_file *ff = __fuse_write_file_get(fi); + WARN_ON(!ff); + return ff; +} + +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff; + int err; + + /* + * Inode is always written before the last reference is dropped and + * hence this should not be reached from reclaim. + * + * Writing back the inode from reclaim can deadlock if the request + * processing itself needs an allocation. Allocations triggering + * reclaim while serving a request can't be prevented, because it can + * involve any number of unrelated userspace processes. + */ + WARN_ON(wbc->for_reclaim); + + ff = __fuse_write_file_get(fi); + err = fuse_flush_times(inode, ff); + if (ff) + fuse_file_put(ff, false, false); + + return err; +} + +static struct fuse_writepage_args *fuse_writepage_args_alloc(void) +{ + struct fuse_writepage_args *wpa; + struct fuse_args_pages *ap; + + wpa = kzalloc(sizeof(*wpa), GFP_NOFS); + if (wpa) { + ap = &wpa->ia.ap; + ap->num_pages = 0; + ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs); + if (!ap->pages) { + kfree(wpa); + wpa = NULL; + } + } + return wpa; + +} + +static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, + struct fuse_writepage_args *wpa) +{ + if (!fc->sync_fs) + return; + + rcu_read_lock(); + /* Prevent resurrection of dead bucket in unlikely race with syncfs */ + do { + wpa->bucket = rcu_dereference(fc->curr_bucket); + } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count))); + rcu_read_unlock(); +} + +static int fuse_writepage_locked(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_writepage_args *wpa; + struct fuse_args_pages *ap; + struct page *tmp_page; + int error = -ENOMEM; + + set_page_writeback(page); + + wpa = fuse_writepage_args_alloc(); + if (!wpa) + goto err; + ap = &wpa->ia.ap; + + tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!tmp_page) + goto err_free; + + error = -EIO; + wpa->ia.ff = fuse_write_file_get(fi); + if (!wpa->ia.ff) + goto err_nofile; + + fuse_writepage_add_to_bucket(fc, wpa); + fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); + + copy_highpage(tmp_page, page); + wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; + wpa->next = NULL; + ap->args.in_pages = true; + ap->num_pages = 1; + ap->pages[0] = tmp_page; + ap->descs[0].offset = 0; + ap->descs[0].length = PAGE_SIZE; + ap->args.end = fuse_writepage_end; + wpa->inode = inode; + + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); + inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); + + spin_lock(&fi->lock); + tree_insert(&fi->writepages, wpa); + list_add_tail(&wpa->queue_entry, &fi->queued_writes); + fuse_flush_writepages(inode); + spin_unlock(&fi->lock); + + end_page_writeback(page); + + return 0; + +err_nofile: + __free_page(tmp_page); +err_free: + kfree(wpa); +err: + mapping_set_error(page->mapping, error); + end_page_writeback(page); + return error; +} + +static int fuse_writepage(struct page *page, struct writeback_control *wbc) +{ + struct fuse_conn *fc = get_fuse_conn(page->mapping->host); + int err; + + if (fuse_page_is_writeback(page->mapping->host, page->index)) { + /* + * ->writepages() should be called for sync() and friends. We + * should only get here on direct reclaim and then we are + * allowed to skip a page which is already in flight + */ + WARN_ON(wbc->sync_mode == WB_SYNC_ALL); + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + + if (wbc->sync_mode == WB_SYNC_NONE && + fc->num_background >= fc->congestion_threshold) + return AOP_WRITEPAGE_ACTIVATE; + + err = fuse_writepage_locked(page); + unlock_page(page); + + return err; +} + +struct fuse_fill_wb_data { + struct fuse_writepage_args *wpa; + struct fuse_file *ff; + struct inode *inode; + struct page **orig_pages; + unsigned int max_pages; +}; + +static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) +{ + struct fuse_args_pages *ap = &data->wpa->ia.ap; + struct fuse_conn *fc = get_fuse_conn(data->inode); + struct page **pages; + struct fuse_page_desc *descs; + unsigned int npages = min_t(unsigned int, + max_t(unsigned int, data->max_pages * 2, + FUSE_DEFAULT_MAX_PAGES_PER_REQ), + fc->max_pages); + WARN_ON(npages <= data->max_pages); + + pages = fuse_pages_alloc(npages, GFP_NOFS, &descs); + if (!pages) + return false; + + memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages); + memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages); + kfree(ap->pages); + ap->pages = pages; + ap->descs = descs; + data->max_pages = npages; + + return true; +} + +static void fuse_writepages_send(struct fuse_fill_wb_data *data) +{ + struct fuse_writepage_args *wpa = data->wpa; + struct inode *inode = data->inode; + struct fuse_inode *fi = get_fuse_inode(inode); + int num_pages = wpa->ia.ap.num_pages; + int i; + + wpa->ia.ff = fuse_file_get(data->ff); + spin_lock(&fi->lock); + list_add_tail(&wpa->queue_entry, &fi->queued_writes); + fuse_flush_writepages(inode); + spin_unlock(&fi->lock); + + for (i = 0; i < num_pages; i++) + end_page_writeback(data->orig_pages[i]); +} + +/* + * Check under fi->lock if the page is under writeback, and insert it onto the + * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's + * one already added for a page at this offset. If there's none, then insert + * this new request onto the auxiliary list, otherwise reuse the existing one by + * swapping the new temp page with the old one. + */ +static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, + struct page *page) +{ + struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); + struct fuse_writepage_args *tmp; + struct fuse_writepage_args *old_wpa; + struct fuse_args_pages *new_ap = &new_wpa->ia.ap; + + WARN_ON(new_ap->num_pages != 0); + new_ap->num_pages = 1; + + spin_lock(&fi->lock); + old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); + if (!old_wpa) { + spin_unlock(&fi->lock); + return true; + } + + for (tmp = old_wpa->next; tmp; tmp = tmp->next) { + pgoff_t curr_index; + + WARN_ON(tmp->inode != new_wpa->inode); + curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; + if (curr_index == page->index) { + WARN_ON(tmp->ia.ap.num_pages != 1); + swap(tmp->ia.ap.pages[0], new_ap->pages[0]); + break; + } + } + + if (!tmp) { + new_wpa->next = old_wpa->next; + old_wpa->next = new_wpa; + } + + spin_unlock(&fi->lock); + + if (tmp) { + struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode); + + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP); + wb_writeout_inc(&bdi->wb); + fuse_writepage_free(new_wpa); + } + + return false; +} + +static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, + struct fuse_args_pages *ap, + struct fuse_fill_wb_data *data) +{ + WARN_ON(!ap->num_pages); + + /* + * Being under writeback is unlikely but possible. For example direct + * read to an mmaped fuse file will set the page dirty twice; once when + * the pages are faulted with get_user_pages(), and then after the read + * completed. + */ + if (fuse_page_is_writeback(data->inode, page->index)) + return true; + + /* Reached max pages */ + if (ap->num_pages == fc->max_pages) + return true; + + /* Reached max write bytes */ + if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write) + return true; + + /* Discontinuity */ + if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index) + return true; + + /* Need to grow the pages array? If so, did the expansion fail? */ + if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data)) + return true; + + return false; +} + +static int fuse_writepages_fill(struct page *page, + struct writeback_control *wbc, void *_data) +{ + struct fuse_fill_wb_data *data = _data; + struct fuse_writepage_args *wpa = data->wpa; + struct fuse_args_pages *ap = &wpa->ia.ap; + struct inode *inode = data->inode; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); + struct page *tmp_page; + int err; + + if (!data->ff) { + err = -EIO; + data->ff = fuse_write_file_get(fi); + if (!data->ff) + goto out_unlock; + } + + if (wpa && fuse_writepage_need_send(fc, page, ap, data)) { + fuse_writepages_send(data); + data->wpa = NULL; + } + + err = -ENOMEM; + tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!tmp_page) + goto out_unlock; + + /* + * The page must not be redirtied until the writeout is completed + * (i.e. userspace has sent a reply to the write request). Otherwise + * there could be more than one temporary page instance for each real + * page. + * + * This is ensured by holding the page lock in page_mkwrite() while + * checking fuse_page_is_writeback(). We already hold the page lock + * since clear_page_dirty_for_io() and keep it held until we add the + * request to the fi->writepages list and increment ap->num_pages. + * After this fuse_page_is_writeback() will indicate that the page is + * under writeback, so we can release the page lock. + */ + if (data->wpa == NULL) { + err = -ENOMEM; + wpa = fuse_writepage_args_alloc(); + if (!wpa) { + __free_page(tmp_page); + goto out_unlock; + } + fuse_writepage_add_to_bucket(fc, wpa); + + data->max_pages = 1; + + ap = &wpa->ia.ap; + fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0); + wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; + wpa->next = NULL; + ap->args.in_pages = true; + ap->args.end = fuse_writepage_end; + ap->num_pages = 0; + wpa->inode = inode; + } + set_page_writeback(page); + + copy_highpage(tmp_page, page); + ap->pages[ap->num_pages] = tmp_page; + ap->descs[ap->num_pages].offset = 0; + ap->descs[ap->num_pages].length = PAGE_SIZE; + data->orig_pages[ap->num_pages] = page; + + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); + inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); + + err = 0; + if (data->wpa) { + /* + * Protected by fi->lock against concurrent access by + * fuse_page_is_writeback(). + */ + spin_lock(&fi->lock); + ap->num_pages++; + spin_unlock(&fi->lock); + } else if (fuse_writepage_add(wpa, page)) { + data->wpa = wpa; + } else { + end_page_writeback(page); + } +out_unlock: + unlock_page(page); + + return err; +} + +static int fuse_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_fill_wb_data data; + int err; + + err = -EIO; + if (fuse_is_bad(inode)) + goto out; + + if (wbc->sync_mode == WB_SYNC_NONE && + fc->num_background >= fc->congestion_threshold) + return 0; + + data.inode = inode; + data.wpa = NULL; + data.ff = NULL; + + err = -ENOMEM; + data.orig_pages = kcalloc(fc->max_pages, + sizeof(struct page *), + GFP_NOFS); + if (!data.orig_pages) + goto out; + + err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); + if (data.wpa) { + WARN_ON(!data.wpa->ia.ap.num_pages); + fuse_writepages_send(&data); + } + if (data.ff) + fuse_file_put(data.ff, false, false); + + kfree(data.orig_pages); +out: + return err; +} + +/* + * It's worthy to make sure that space is reserved on disk for the write, + * but how to implement it without killing performance need more thinking. + */ +static int fuse_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_SHIFT; + struct fuse_conn *fc = get_fuse_conn(file_inode(file)); + struct page *page; + loff_t fsize; + int err = -ENOMEM; + + WARN_ON(!fc->writeback_cache); + + page = grab_cache_page_write_begin(mapping, index); + if (!page) + goto error; + + fuse_wait_on_page_writeback(mapping->host, page->index); + + if (PageUptodate(page) || len == PAGE_SIZE) + goto success; + /* + * Check if the start this page comes after the end of file, in which + * case the readpage can be optimized away. + */ + fsize = i_size_read(mapping->host); + if (fsize <= (pos & PAGE_MASK)) { + size_t off = pos & ~PAGE_MASK; + if (off) + zero_user_segment(page, 0, off); + goto success; + } + err = fuse_do_readpage(file, page); + if (err) + goto cleanup; +success: + *pagep = page; + return 0; + +cleanup: + unlock_page(page); + put_page(page); +error: + return err; +} + +static int fuse_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + + /* Haven't copied anything? Skip zeroing, size extending, dirtying. */ + if (!copied) + goto unlock; + + pos += copied; + if (!PageUptodate(page)) { + /* Zero any unwritten bytes at the end of the page */ + size_t endoff = pos & ~PAGE_MASK; + if (endoff) + zero_user_segment(page, endoff, PAGE_SIZE); + SetPageUptodate(page); + } + + if (pos > inode->i_size) + i_size_write(inode, pos); + + set_page_dirty(page); + +unlock: + unlock_page(page); + put_page(page); + + return copied; +} + +static int fuse_launder_folio(struct folio *folio) +{ + int err = 0; + if (folio_clear_dirty_for_io(folio)) { + struct inode *inode = folio->mapping->host; + + /* Serialize with pending writeback for the same page */ + fuse_wait_on_page_writeback(inode, folio->index); + err = fuse_writepage_locked(&folio->page); + if (!err) + fuse_wait_on_page_writeback(inode, folio->index); + } + return err; +} + +/* + * Write back dirty data/metadata now (there may not be any suitable + * open files later for data) + */ +static void fuse_vma_close(struct vm_area_struct *vma) +{ + int err; + + err = write_inode_now(vma->vm_file->f_mapping->host, 1); + mapping_set_error(vma->vm_file->f_mapping, err); +} + +/* + * Wait for writeback against this page to complete before allowing it + * to be marked dirty again, and hence written back again, possibly + * before the previous writepage completed. + * + * Block here, instead of in ->writepage(), so that the userspace fs + * can only block processes actually operating on the filesystem. + * + * Otherwise unprivileged userspace fs would be able to block + * unrelated: + * + * - page migration + * - sync(2) + * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER + */ +static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vmf->vma->vm_file); + + file_update_time(vmf->vma->vm_file); + lock_page(page); + if (page->mapping != inode->i_mapping) { + unlock_page(page); + return VM_FAULT_NOPAGE; + } + + fuse_wait_on_page_writeback(inode, page->index); + return VM_FAULT_LOCKED; +} + +static const struct vm_operations_struct fuse_file_vm_ops = { + .close = fuse_vma_close, + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = fuse_page_mkwrite, +}; + +static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct fuse_file *ff = file->private_data; + + /* DAX mmap is superior to direct_io mmap */ + if (FUSE_IS_DAX(file_inode(file))) + return fuse_dax_mmap(file, vma); + + if (ff->open_flags & FOPEN_DIRECT_IO) { + /* Can't provide the coherency needed for MAP_SHARED */ + if (vma->vm_flags & VM_MAYSHARE) + return -ENODEV; + + invalidate_inode_pages2(file->f_mapping); + + return generic_file_mmap(file, vma); + } + + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + fuse_link_write_file(file); + + file_accessed(file); + vma->vm_ops = &fuse_file_vm_ops; + return 0; +} + +static int convert_fuse_file_lock(struct fuse_conn *fc, + const struct fuse_file_lock *ffl, + struct file_lock *fl) +{ + switch (ffl->type) { + case F_UNLCK: + break; + + case F_RDLCK: + case F_WRLCK: + if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX || + ffl->end < ffl->start) + return -EIO; + + fl->fl_start = ffl->start; + fl->fl_end = ffl->end; + + /* + * Convert pid into init's pid namespace. The locks API will + * translate it into the caller's pid namespace. + */ + rcu_read_lock(); + fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); + rcu_read_unlock(); + break; + + default: + return -EIO; + } + fl->fl_type = ffl->type; + return 0; +} + +static void fuse_lk_fill(struct fuse_args *args, struct file *file, + const struct file_lock *fl, int opcode, pid_t pid, + int flock, struct fuse_lk_in *inarg) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_file *ff = file->private_data; + + memset(inarg, 0, sizeof(*inarg)); + inarg->fh = ff->fh; + inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner); + inarg->lk.start = fl->fl_start; + inarg->lk.end = fl->fl_end; + inarg->lk.type = fl->fl_type; + inarg->lk.pid = pid; + if (flock) + inarg->lk_flags |= FUSE_LK_FLOCK; + args->opcode = opcode; + args->nodeid = get_node_id(inode); + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg); + args->in_args[0].value = inarg; +} + +static int fuse_getlk(struct file *file, struct file_lock *fl) +{ + struct inode *inode = file_inode(file); + struct fuse_mount *fm = get_fuse_mount(inode); + FUSE_ARGS(args); + struct fuse_lk_in inarg; + struct fuse_lk_out outarg; + int err; + + fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg); + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (!err) + err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl); + + return err; +} + +static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) +{ + struct inode *inode = file_inode(file); + struct fuse_mount *fm = get_fuse_mount(inode); + FUSE_ARGS(args); + struct fuse_lk_in inarg; + int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; + struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL; + pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns); + int err; + + if (fl->fl_lmops && fl->fl_lmops->lm_grant) { + /* NLM needs asynchronous locks, which we don't support yet */ + return -ENOLCK; + } + + /* Unlock on close is handled by the flush method */ + if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) + return 0; + + fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); + err = fuse_simple_request(fm, &args); + + /* locking is restartable */ + if (err == -EINTR) + err = -ERESTARTSYS; + + return err; +} + +static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + if (cmd == F_CANCELLK) { + err = 0; + } else if (cmd == F_GETLK) { + if (fc->no_lock) { + posix_test_lock(file, fl); + err = 0; + } else + err = fuse_getlk(file, fl); + } else { + if (fc->no_lock) + err = posix_lock_file(file, fl, NULL); + else + err = fuse_setlk(file, fl, 0); + } + return err; +} + +static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + if (fc->no_flock) { + err = locks_lock_file_wait(file, fl); + } else { + struct fuse_file *ff = file->private_data; + + /* emulate flock with POSIX locks */ + ff->flock = true; + err = fuse_setlk(file, fl, 1); + } + + return err; +} + +static sector_t fuse_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + struct fuse_mount *fm = get_fuse_mount(inode); + FUSE_ARGS(args); + struct fuse_bmap_in inarg; + struct fuse_bmap_out outarg; + int err; + + if (!inode->i_sb->s_bdev || fm->fc->no_bmap) + return 0; + + memset(&inarg, 0, sizeof(inarg)); + inarg.block = block; + inarg.blocksize = inode->i_sb->s_blocksize; + args.opcode = FUSE_BMAP; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) + fm->fc->no_bmap = 1; + + return err ? 0 : outarg.block; +} + +static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_file *ff = file->private_data; + FUSE_ARGS(args); + struct fuse_lseek_in inarg = { + .fh = ff->fh, + .offset = offset, + .whence = whence + }; + struct fuse_lseek_out outarg; + int err; + + if (fm->fc->no_lseek) + goto fallback; + + args.opcode = FUSE_LSEEK; + args.nodeid = ff->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (err) { + if (err == -ENOSYS) { + fm->fc->no_lseek = 1; + goto fallback; + } + return err; + } + + return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes); + +fallback: + err = fuse_update_attributes(inode, file, STATX_SIZE); + if (!err) + return generic_file_llseek(file, offset, whence); + else + return err; +} + +static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) +{ + loff_t retval; + struct inode *inode = file_inode(file); + + switch (whence) { + case SEEK_SET: + case SEEK_CUR: + /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ + retval = generic_file_llseek(file, offset, whence); + break; + case SEEK_END: + inode_lock(inode); + retval = fuse_update_attributes(inode, file, STATX_SIZE); + if (!retval) + retval = generic_file_llseek(file, offset, whence); + inode_unlock(inode); + break; + case SEEK_HOLE: + case SEEK_DATA: + inode_lock(inode); + retval = fuse_lseek(file, offset, whence); + inode_unlock(inode); + break; + default: + retval = -EINVAL; + } + + return retval; +} + +/* + * All files which have been polled are linked to RB tree + * fuse_conn->polled_files which is indexed by kh. Walk the tree and + * find the matching one. + */ +static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh, + struct rb_node **parent_out) +{ + struct rb_node **link = &fc->polled_files.rb_node; + struct rb_node *last = NULL; + + while (*link) { + struct fuse_file *ff; + + last = *link; + ff = rb_entry(last, struct fuse_file, polled_node); + + if (kh < ff->kh) + link = &last->rb_left; + else if (kh > ff->kh) + link = &last->rb_right; + else + return link; + } + + if (parent_out) + *parent_out = last; + return link; +} + +/* + * The file is about to be polled. Make sure it's on the polled_files + * RB tree. Note that files once added to the polled_files tree are + * not removed before the file is released. This is because a file + * polled once is likely to be polled again. + */ +static void fuse_register_polled_file(struct fuse_conn *fc, + struct fuse_file *ff) +{ + spin_lock(&fc->lock); + if (RB_EMPTY_NODE(&ff->polled_node)) { + struct rb_node **link, *parent; + + link = fuse_find_polled_node(fc, ff->kh, &parent); + BUG_ON(*link); + rb_link_node(&ff->polled_node, parent, link); + rb_insert_color(&ff->polled_node, &fc->polled_files); + } + spin_unlock(&fc->lock); +} + +__poll_t fuse_file_poll(struct file *file, poll_table *wait) +{ + struct fuse_file *ff = file->private_data; + struct fuse_mount *fm = ff->fm; + struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; + struct fuse_poll_out outarg; + FUSE_ARGS(args); + int err; + + if (fm->fc->no_poll) + return DEFAULT_POLLMASK; + + poll_wait(file, &ff->poll_wait, wait); + inarg.events = mangle_poll(poll_requested_events(wait)); + + /* + * Ask for notification iff there's someone waiting for it. + * The client may ignore the flag and always notify. + */ + if (waitqueue_active(&ff->poll_wait)) { + inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; + fuse_register_polled_file(fm->fc, ff); + } + + args.opcode = FUSE_POLL; + args.nodeid = ff->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + + if (!err) + return demangle_poll(outarg.revents); + if (err == -ENOSYS) { + fm->fc->no_poll = 1; + return DEFAULT_POLLMASK; + } + return EPOLLERR; +} +EXPORT_SYMBOL_GPL(fuse_file_poll); + +/* + * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and + * wakes up the poll waiters. + */ +int fuse_notify_poll_wakeup(struct fuse_conn *fc, + struct fuse_notify_poll_wakeup_out *outarg) +{ + u64 kh = outarg->kh; + struct rb_node **link; + + spin_lock(&fc->lock); + + link = fuse_find_polled_node(fc, kh, NULL); + if (*link) { + struct fuse_file *ff; + + ff = rb_entry(*link, struct fuse_file, polled_node); + wake_up_interruptible_sync(&ff->poll_wait); + } + + spin_unlock(&fc->lock); + return 0; +} + +static void fuse_do_truncate(struct file *file) +{ + struct inode *inode = file->f_mapping->host; + struct iattr attr; + + attr.ia_valid = ATTR_SIZE; + attr.ia_size = i_size_read(inode); + + attr.ia_file = file; + attr.ia_valid |= ATTR_FILE; + + fuse_do_setattr(file_dentry(file), &attr, file); +} + +static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off) +{ + return round_up(off, fc->max_pages << PAGE_SHIFT); +} + +static ssize_t +fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + DECLARE_COMPLETION_ONSTACK(wait); + ssize_t ret = 0; + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + loff_t pos = 0; + struct inode *inode; + loff_t i_size; + size_t count = iov_iter_count(iter), shortened = 0; + loff_t offset = iocb->ki_pos; + struct fuse_io_priv *io; + + pos = offset; + inode = file->f_mapping->host; + i_size = i_size_read(inode); + + if ((iov_iter_rw(iter) == READ) && (offset >= i_size)) + return 0; + + io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); + if (!io) + return -ENOMEM; + spin_lock_init(&io->lock); + kref_init(&io->refcnt); + io->reqs = 1; + io->bytes = -1; + io->size = 0; + io->offset = offset; + io->write = (iov_iter_rw(iter) == WRITE); + io->err = 0; + /* + * By default, we want to optimize all I/Os with async request + * submission to the client filesystem if supported. + */ + io->async = ff->fm->fc->async_dio; + io->iocb = iocb; + io->blocking = is_sync_kiocb(iocb); + + /* optimization for short read */ + if (io->async && !io->write && offset + count > i_size) { + iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset)); + shortened = count - iov_iter_count(iter); + count -= shortened; + } + + /* + * We cannot asynchronously extend the size of a file. + * In such case the aio will behave exactly like sync io. + */ + if ((offset + count > i_size) && io->write) + io->blocking = true; + + if (io->async && io->blocking) { + /* + * Additional reference to keep io around after + * calling fuse_aio_complete() + */ + kref_get(&io->refcnt); + io->done = &wait; + } + + if (iov_iter_rw(iter) == WRITE) { + ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); + } else { + ret = __fuse_direct_read(io, iter, &pos); + } + iov_iter_reexpand(iter, iov_iter_count(iter) + shortened); + + if (io->async) { + bool blocking = io->blocking; + + fuse_aio_complete(io, ret < 0 ? ret : 0, -1); + + /* we have a non-extending, async request, so return */ + if (!blocking) + return -EIOCBQUEUED; + + wait_for_completion(&wait); + ret = fuse_get_res_by_io(io); + } + + kref_put(&io->refcnt, fuse_io_release); + + if (iov_iter_rw(iter) == WRITE) { + fuse_write_update_attr(inode, pos, ret); + if (ret < 0 && offset + count > i_size) + fuse_do_truncate(file); + } + + return ret; +} + +static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) +{ + int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX); + + if (!err) + fuse_sync_writes(inode); + + return err; +} + +static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, + loff_t length) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_mount *fm = ff->fm; + FUSE_ARGS(args); + struct fuse_fallocate_in inarg = { + .fh = ff->fh, + .offset = offset, + .length = length, + .mode = mode + }; + int err; + bool block_faults = FUSE_IS_DAX(inode) && + (!(mode & FALLOC_FL_KEEP_SIZE) || + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))); + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_ZERO_RANGE)) + return -EOPNOTSUPP; + + if (fm->fc->no_fallocate) + return -EOPNOTSUPP; + + inode_lock(inode); + if (block_faults) { + filemap_invalidate_lock(inode->i_mapping); + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) + goto out; + } + + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) { + loff_t endbyte = offset + length - 1; + + err = fuse_writeback_range(inode, offset, endbyte); + if (err) + goto out; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + length > i_size_read(inode)) { + err = inode_newsize_ok(inode, offset + length); + if (err) + goto out; + } + + err = file_modified(file); + if (err) + goto out; + + if (!(mode & FALLOC_FL_KEEP_SIZE)) + set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + + args.opcode = FUSE_FALLOCATE; + args.nodeid = ff->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + fm->fc->no_fallocate = 1; + err = -EOPNOTSUPP; + } + if (err) + goto out; + + /* we could have extended the file */ + if (!(mode & FALLOC_FL_KEEP_SIZE)) { + if (fuse_write_update_attr(inode, offset + length, length)) + file_update_time(file); + } + + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) + truncate_pagecache_range(inode, offset, offset + length - 1); + + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); + +out: + if (!(mode & FALLOC_FL_KEEP_SIZE)) + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + + if (block_faults) + filemap_invalidate_unlock(inode->i_mapping); + + inode_unlock(inode); + + fuse_flush_time_update(inode); + + return err; +} + +static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags) +{ + struct fuse_file *ff_in = file_in->private_data; + struct fuse_file *ff_out = file_out->private_data; + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + struct fuse_inode *fi_out = get_fuse_inode(inode_out); + struct fuse_mount *fm = ff_in->fm; + struct fuse_conn *fc = fm->fc; + FUSE_ARGS(args); + struct fuse_copy_file_range_in inarg = { + .fh_in = ff_in->fh, + .off_in = pos_in, + .nodeid_out = ff_out->nodeid, + .fh_out = ff_out->fh, + .off_out = pos_out, + .len = len, + .flags = flags + }; + struct fuse_write_out outarg; + ssize_t err; + /* mark unstable when write-back is not used, and file_out gets + * extended */ + bool is_unstable = (!fc->writeback_cache) && + ((pos_out + len) > inode_out->i_size); + + if (fc->no_copy_file_range) + return -EOPNOTSUPP; + + if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) + return -EXDEV; + + inode_lock(inode_in); + err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1); + inode_unlock(inode_in); + if (err) + return err; + + inode_lock(inode_out); + + err = file_modified(file_out); + if (err) + goto out; + + /* + * Write out dirty pages in the destination file before sending the COPY + * request to userspace. After the request is completed, truncate off + * pages (including partial ones) from the cache that have been copied, + * since these contain stale data at that point. + * + * This should be mostly correct, but if the COPY writes to partial + * pages (at the start or end) and the parts not covered by the COPY are + * written through a memory map after calling fuse_writeback_range(), + * then these partial page modifications will be lost on truncation. + * + * It is unlikely that someone would rely on such mixed style + * modifications. Yet this does give less guarantees than if the + * copying was performed with write(2). + * + * To fix this a mapping->invalidate_lock could be used to prevent new + * faults while the copy is ongoing. + */ + err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); + if (err) + goto out; + + if (is_unstable) + set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); + + args.opcode = FUSE_COPY_FILE_RANGE; + args.nodeid = ff_in->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + fc->no_copy_file_range = 1; + err = -EOPNOTSUPP; + } + if (err) + goto out; + + truncate_inode_pages_range(inode_out->i_mapping, + ALIGN_DOWN(pos_out, PAGE_SIZE), + ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1); + + file_update_time(file_out); + fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size); + + err = outarg.size; +out: + if (is_unstable) + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); + + inode_unlock(inode_out); + file_accessed(file_in); + + fuse_flush_time_update(inode_out); + + return err; +} + +static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, + size_t len, unsigned int flags) +{ + ssize_t ret; + + ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off, + len, flags); + + if (ret == -EOPNOTSUPP || ret == -EXDEV) + ret = generic_copy_file_range(src_file, src_off, dst_file, + dst_off, len, flags); + return ret; +} + +static const struct file_operations fuse_file_operations = { + .llseek = fuse_file_llseek, + .read_iter = fuse_file_read_iter, + .write_iter = fuse_file_write_iter, + .mmap = fuse_file_mmap, + .open = fuse_open, + .flush = fuse_flush, + .release = fuse_release, + .fsync = fuse_fsync, + .lock = fuse_file_lock, + .get_unmapped_area = thp_get_unmapped_area, + .flock = fuse_file_flock, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .unlocked_ioctl = fuse_file_ioctl, + .compat_ioctl = fuse_file_compat_ioctl, + .poll = fuse_file_poll, + .fallocate = fuse_file_fallocate, + .copy_file_range = fuse_copy_file_range, +}; + +static const struct address_space_operations fuse_file_aops = { + .read_folio = fuse_read_folio, + .readahead = fuse_readahead, + .writepage = fuse_writepage, + .writepages = fuse_writepages, + .launder_folio = fuse_launder_folio, + .dirty_folio = filemap_dirty_folio, + .bmap = fuse_bmap, + .direct_IO = fuse_direct_IO, + .write_begin = fuse_write_begin, + .write_end = fuse_write_end, +}; + +void fuse_init_file_inode(struct inode *inode, unsigned int flags) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + inode->i_fop = &fuse_file_operations; + inode->i_data.a_ops = &fuse_file_aops; + + INIT_LIST_HEAD(&fi->write_files); + INIT_LIST_HEAD(&fi->queued_writes); + fi->writectr = 0; + init_waitqueue_head(&fi->page_waitq); + fi->writepages = RB_ROOT; + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_inode_init(inode, flags); +} diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h new file mode 100644 index 000000000..a9681fecb --- /dev/null +++ b/fs/fuse/fuse_i.h @@ -0,0 +1,1337 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#ifndef _FS_FUSE_I_H +#define _FS_FUSE_I_H + +#ifndef pr_fmt +# define pr_fmt(fmt) "fuse: " fmt +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** Default max number of pages that can be used in a single read request */ +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 + +/** Maximum of max_pages received in init_out */ +#define FUSE_MAX_MAX_PAGES 256 + +/** Bias for fi->writectr, meaning new writepages must not be sent */ +#define FUSE_NOWRITE INT_MIN + +/** It could be as large as PATH_MAX, but would that have any uses? */ +#define FUSE_NAME_MAX 1024 + +/** Number of dentries for each connection in the control filesystem */ +#define FUSE_CTL_NUM_DENTRIES 5 + +/** List of active connections */ +extern struct list_head fuse_conn_list; + +/** Global mutex protecting fuse_conn_list and the control filesystem */ +extern struct mutex fuse_mutex; + +/** Module parameters */ +extern unsigned max_user_bgreq; +extern unsigned max_user_congthresh; + +/* One forget request */ +struct fuse_forget_link { + struct fuse_forget_one forget_one; + struct fuse_forget_link *next; +}; + +/* Submount lookup tracking */ +struct fuse_submount_lookup { + /** Refcount */ + refcount_t count; + + /** Unique ID, which identifies the inode between userspace + * and kernel */ + u64 nodeid; + + /** The request used for sending the FORGET message */ + struct fuse_forget_link *forget; +}; + +/** FUSE inode */ +struct fuse_inode { + /** Inode data */ + struct inode inode; + + /** Unique ID, which identifies the inode between userspace + * and kernel */ + u64 nodeid; + + /** Number of lookups on this inode */ + u64 nlookup; + + /** The request used for sending the FORGET message */ + struct fuse_forget_link *forget; + + /** Time in jiffies until the file attributes are valid */ + u64 i_time; + + /* Which attributes are invalid */ + u32 inval_mask; + + /** The sticky bit in inode->i_mode may have been removed, so + preserve the original mode */ + umode_t orig_i_mode; + + /** 64 bit inode number */ + u64 orig_ino; + + /** Version of last attribute change */ + u64 attr_version; + + union { + /* Write related fields (regular file only) */ + struct { + /* Files usable in writepage. Protected by fi->lock */ + struct list_head write_files; + + /* Writepages pending on truncate or fsync */ + struct list_head queued_writes; + + /* Number of sent writes, a negative bias + * (FUSE_NOWRITE) means more writes are blocked */ + int writectr; + + /* Waitq for writepage completion */ + wait_queue_head_t page_waitq; + + /* List of writepage requestst (pending or sent) */ + struct rb_root writepages; + }; + + /* readdir cache (directory only) */ + struct { + /* true if fully cached */ + bool cached; + + /* size of cache */ + loff_t size; + + /* position at end of cache (position of next entry) */ + loff_t pos; + + /* version of the cache */ + u64 version; + + /* modification time of directory when cache was + * started */ + struct timespec64 mtime; + + /* iversion of directory when cache was started */ + u64 iversion; + + /* protects above fields */ + spinlock_t lock; + } rdc; + }; + + /** Miscellaneous bits describing inode state */ + unsigned long state; + + /** Lock for serializing lookup and readdir for back compatibility*/ + struct mutex mutex; + + /** Lock to protect write related fields */ + spinlock_t lock; + +#ifdef CONFIG_FUSE_DAX + /* + * Dax specific inode data + */ + struct fuse_inode_dax *dax; +#endif + /** Submount specific lookup tracking */ + struct fuse_submount_lookup *submount_lookup; +}; + +/** FUSE inode state bits */ +enum { + /** Advise readdirplus */ + FUSE_I_ADVISE_RDPLUS, + /** Initialized with readdirplus */ + FUSE_I_INIT_RDPLUS, + /** An operation changing file size is in progress */ + FUSE_I_SIZE_UNSTABLE, + /* Bad inode */ + FUSE_I_BAD, +}; + +struct fuse_conn; +struct fuse_mount; +struct fuse_release_args; + +/** FUSE specific file data */ +struct fuse_file { + /** Fuse connection for this file */ + struct fuse_mount *fm; + + /* Argument space reserved for release */ + struct fuse_release_args *release_args; + + /** Kernel file handle guaranteed to be unique */ + u64 kh; + + /** File handle used by userspace */ + u64 fh; + + /** Node id of this file */ + u64 nodeid; + + /** Refcount */ + refcount_t count; + + /** FOPEN_* flags returned by open */ + u32 open_flags; + + /** Entry on inode's write_files list */ + struct list_head write_entry; + + /* Readdir related */ + struct { + /* + * Protects below fields against (crazy) parallel readdir on + * same open file. Uncontended in the normal case. + */ + struct mutex lock; + + /* Dir stream position */ + loff_t pos; + + /* Offset in cache */ + loff_t cache_off; + + /* Version of cache we are reading */ + u64 version; + + } readdir; + + /** RB node to be linked on fuse_conn->polled_files */ + struct rb_node polled_node; + + /** Wait queue head for poll */ + wait_queue_head_t poll_wait; + + /** Has flock been performed on this file? */ + bool flock:1; +}; + +/** One input argument of a request */ +struct fuse_in_arg { + unsigned size; + const void *value; +}; + +/** One output argument of a request */ +struct fuse_arg { + unsigned size; + void *value; +}; + +/** FUSE page descriptor */ +struct fuse_page_desc { + unsigned int length; + unsigned int offset; +}; + +struct fuse_args { + uint64_t nodeid; + uint32_t opcode; + unsigned short in_numargs; + unsigned short out_numargs; + bool force:1; + bool noreply:1; + bool nocreds:1; + bool in_pages:1; + bool out_pages:1; + bool user_pages:1; + bool out_argvar:1; + bool page_zeroing:1; + bool page_replace:1; + bool may_block:1; + struct fuse_in_arg in_args[3]; + struct fuse_arg out_args[2]; + void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); +}; + +struct fuse_args_pages { + struct fuse_args args; + struct page **pages; + struct fuse_page_desc *descs; + unsigned int num_pages; +}; + +#define FUSE_ARGS(args) struct fuse_args args = {} + +/** The request IO state (for asynchronous processing) */ +struct fuse_io_priv { + struct kref refcnt; + int async; + spinlock_t lock; + unsigned reqs; + ssize_t bytes; + size_t size; + __u64 offset; + bool write; + bool should_dirty; + int err; + struct kiocb *iocb; + struct completion *done; + bool blocking; +}; + +#define FUSE_IO_PRIV_SYNC(i) \ +{ \ + .refcnt = KREF_INIT(1), \ + .async = 0, \ + .iocb = i, \ +} + +/** + * Request flags + * + * FR_ISREPLY: set if the request has reply + * FR_FORCE: force sending of the request even if interrupted + * FR_BACKGROUND: request is sent in the background + * FR_WAITING: request is counted as "waiting" + * FR_ABORTED: the request was aborted + * FR_INTERRUPTED: the request has been interrupted + * FR_LOCKED: data is being copied to/from the request + * FR_PENDING: request is not yet in userspace + * FR_SENT: request is in userspace, waiting for an answer + * FR_FINISHED: request is finished + * FR_PRIVATE: request is on private list + * FR_ASYNC: request is asynchronous + */ +enum fuse_req_flag { + FR_ISREPLY, + FR_FORCE, + FR_BACKGROUND, + FR_WAITING, + FR_ABORTED, + FR_INTERRUPTED, + FR_LOCKED, + FR_PENDING, + FR_SENT, + FR_FINISHED, + FR_PRIVATE, + FR_ASYNC, +}; + +/** + * A request to the client + * + * .waitq.lock protects the following fields: + * - FR_ABORTED + * - FR_LOCKED (may also be modified under fc->lock, tested under both) + */ +struct fuse_req { + /** This can be on either pending processing or io lists in + fuse_conn */ + struct list_head list; + + /** Entry on the interrupts list */ + struct list_head intr_entry; + + /* Input/output arguments */ + struct fuse_args *args; + + /** refcount */ + refcount_t count; + + /* Request flags, updated with test/set/clear_bit() */ + unsigned long flags; + + /* The request input header */ + struct { + struct fuse_in_header h; + } in; + + /* The request output header */ + struct { + struct fuse_out_header h; + } out; + + /** Used to wake up the task waiting for completion of request*/ + wait_queue_head_t waitq; + +#if IS_ENABLED(CONFIG_VIRTIO_FS) + /** virtio-fs's physically contiguous buffer for in and out args */ + void *argbuf; +#endif + + /** fuse_mount this request belongs to */ + struct fuse_mount *fm; +}; + +struct fuse_iqueue; + +/** + * Input queue callbacks + * + * Input queue signalling is device-specific. For example, the /dev/fuse file + * uses fiq->waitq and fasync to wake processes that are waiting on queue + * readiness. These callbacks allow other device types to respond to input + * queue activity. + */ +struct fuse_iqueue_ops { + /** + * Signal that a forget has been queued + */ + void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiq->lock); + + /** + * Signal that an INTERRUPT request has been queued + */ + void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiq->lock); + + /** + * Signal that a request has been queued + */ + void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiq->lock); + + /** + * Clean up when fuse_iqueue is destroyed + */ + void (*release)(struct fuse_iqueue *fiq); +}; + +/** /dev/fuse input queue operations */ +extern const struct fuse_iqueue_ops fuse_dev_fiq_ops; + +struct fuse_iqueue { + /** Connection established */ + unsigned connected; + + /** Lock protecting accesses to members of this structure */ + spinlock_t lock; + + /** Readers of the connection are waiting on this */ + wait_queue_head_t waitq; + + /** The next unique request id */ + u64 reqctr; + + /** The list of pending requests */ + struct list_head pending; + + /** Pending interrupts */ + struct list_head interrupts; + + /** Queue of pending forgets */ + struct fuse_forget_link forget_list_head; + struct fuse_forget_link *forget_list_tail; + + /** Batching of FORGET requests (positive indicates FORGET batch) */ + int forget_batch; + + /** O_ASYNC requests */ + struct fasync_struct *fasync; + + /** Device-specific callbacks */ + const struct fuse_iqueue_ops *ops; + + /** Device-specific state */ + void *priv; +}; + +#define FUSE_PQ_HASH_BITS 8 +#define FUSE_PQ_HASH_SIZE (1 << FUSE_PQ_HASH_BITS) + +struct fuse_pqueue { + /** Connection established */ + unsigned connected; + + /** Lock protecting accessess to members of this structure */ + spinlock_t lock; + + /** Hash table of requests being processed */ + struct list_head *processing; + + /** The list of requests under I/O */ + struct list_head io; +}; + +/** + * Fuse device instance + */ +struct fuse_dev { + /** Fuse connection for this device */ + struct fuse_conn *fc; + + /** Processing queue */ + struct fuse_pqueue pq; + + /** list entry on fc->devices */ + struct list_head entry; +}; + +enum fuse_dax_mode { + FUSE_DAX_INODE_DEFAULT, /* default */ + FUSE_DAX_ALWAYS, /* "-o dax=always" */ + FUSE_DAX_NEVER, /* "-o dax=never" */ + FUSE_DAX_INODE_USER, /* "-o dax=inode" */ +}; + +static inline bool fuse_is_inode_dax_mode(enum fuse_dax_mode mode) +{ + return mode == FUSE_DAX_INODE_DEFAULT || mode == FUSE_DAX_INODE_USER; +} + +struct fuse_fs_context { + int fd; + struct file *file; + unsigned int rootmode; + kuid_t user_id; + kgid_t group_id; + bool is_bdev:1; + bool fd_present:1; + bool rootmode_present:1; + bool user_id_present:1; + bool group_id_present:1; + bool default_permissions:1; + bool allow_other:1; + bool destroy:1; + bool no_control:1; + bool no_force_umount:1; + bool legacy_opts_show:1; + enum fuse_dax_mode dax_mode; + unsigned int max_read; + unsigned int blksize; + const char *subtype; + + /* DAX device, may be NULL */ + struct dax_device *dax_dev; + + /* fuse_dev pointer to fill in, should contain NULL on entry */ + void **fudptr; +}; + +struct fuse_sync_bucket { + /* count is a possible scalability bottleneck */ + atomic_t count; + wait_queue_head_t waitq; + struct rcu_head rcu; +}; + +/** + * A Fuse connection. + * + * This structure is created, when the root filesystem is mounted, and + * is destroyed, when the client device is closed and the last + * fuse_mount is destroyed. + */ +struct fuse_conn { + /** Lock protecting accessess to members of this structure */ + spinlock_t lock; + + /** Refcount */ + refcount_t count; + + /** Number of fuse_dev's */ + atomic_t dev_count; + + struct rcu_head rcu; + + /** The user id for this mount */ + kuid_t user_id; + + /** The group id for this mount */ + kgid_t group_id; + + /** The pid namespace for this mount */ + struct pid_namespace *pid_ns; + + /** The user namespace for this mount */ + struct user_namespace *user_ns; + + /** Maximum read size */ + unsigned max_read; + + /** Maximum write size */ + unsigned max_write; + + /** Maximum number of pages that can be used in a single request */ + unsigned int max_pages; + + /** Constrain ->max_pages to this value during feature negotiation */ + unsigned int max_pages_limit; + + /** Input queue */ + struct fuse_iqueue iq; + + /** The next unique kernel file handle */ + atomic64_t khctr; + + /** rbtree of fuse_files waiting for poll events indexed by ph */ + struct rb_root polled_files; + + /** Maximum number of outstanding background requests */ + unsigned max_background; + + /** Number of background requests at which congestion starts */ + unsigned congestion_threshold; + + /** Number of requests currently in the background */ + unsigned num_background; + + /** Number of background requests currently queued for userspace */ + unsigned active_background; + + /** The list of background requests set aside for later queuing */ + struct list_head bg_queue; + + /** Protects: max_background, congestion_threshold, num_background, + * active_background, bg_queue, blocked */ + spinlock_t bg_lock; + + /** Flag indicating that INIT reply has been received. Allocating + * any fuse request will be suspended until the flag is set */ + int initialized; + + /** Flag indicating if connection is blocked. This will be + the case before the INIT reply is received, and if there + are too many outstading backgrounds requests */ + int blocked; + + /** waitq for blocked connection */ + wait_queue_head_t blocked_waitq; + + /** Connection established, cleared on umount, connection + abort and device release */ + unsigned connected; + + /** Connection aborted via sysfs */ + bool aborted; + + /** Connection failed (version mismatch). Cannot race with + setting other bitfields since it is only set once in INIT + reply, before any other request, and never cleared */ + unsigned conn_error:1; + + /** Connection successful. Only set in INIT */ + unsigned conn_init:1; + + /** Do readahead asynchronously? Only set in INIT */ + unsigned async_read:1; + + /** Return an unique read error after abort. Only set in INIT */ + unsigned abort_err:1; + + /** Do not send separate SETATTR request before open(O_TRUNC) */ + unsigned atomic_o_trunc:1; + + /** Filesystem supports NFS exporting. Only set in INIT */ + unsigned export_support:1; + + /** write-back cache policy (default is write-through) */ + unsigned writeback_cache:1; + + /** allow parallel lookups and readdir (default is serialized) */ + unsigned parallel_dirops:1; + + /** handle fs handles killing suid/sgid/cap on write/chown/trunc */ + unsigned handle_killpriv:1; + + /** cache READLINK responses in page cache */ + unsigned cache_symlinks:1; + + /* show legacy mount options */ + unsigned int legacy_opts_show:1; + + /* + * fs kills suid/sgid/cap on write/chown/trunc. suid is killed on + * write/trunc only if caller did not have CAP_FSETID. sgid is killed + * on write/truncate only if caller did not have CAP_FSETID as well as + * file has group execute permission. + */ + unsigned handle_killpriv_v2:1; + + /* + * The following bitfields are only for optimization purposes + * and hence races in setting them will not cause malfunction + */ + + /** Is open/release not implemented by fs? */ + unsigned no_open:1; + + /** Is opendir/releasedir not implemented by fs? */ + unsigned no_opendir:1; + + /** Is fsync not implemented by fs? */ + unsigned no_fsync:1; + + /** Is fsyncdir not implemented by fs? */ + unsigned no_fsyncdir:1; + + /** Is flush not implemented by fs? */ + unsigned no_flush:1; + + /** Is setxattr not implemented by fs? */ + unsigned no_setxattr:1; + + /** Does file server support extended setxattr */ + unsigned setxattr_ext:1; + + /** Is getxattr not implemented by fs? */ + unsigned no_getxattr:1; + + /** Is listxattr not implemented by fs? */ + unsigned no_listxattr:1; + + /** Is removexattr not implemented by fs? */ + unsigned no_removexattr:1; + + /** Are posix file locking primitives not implemented by fs? */ + unsigned no_lock:1; + + /** Is access not implemented by fs? */ + unsigned no_access:1; + + /** Is create not implemented by fs? */ + unsigned no_create:1; + + /** Is interrupt not implemented by fs? */ + unsigned no_interrupt:1; + + /** Is bmap not implemented by fs? */ + unsigned no_bmap:1; + + /** Is poll not implemented by fs? */ + unsigned no_poll:1; + + /** Do multi-page cached writes */ + unsigned big_writes:1; + + /** Don't apply umask to creation modes */ + unsigned dont_mask:1; + + /** Are BSD file locking primitives not implemented by fs? */ + unsigned no_flock:1; + + /** Is fallocate not implemented by fs? */ + unsigned no_fallocate:1; + + /** Is rename with flags implemented by fs? */ + unsigned no_rename2:1; + + /** Use enhanced/automatic page cache invalidation. */ + unsigned auto_inval_data:1; + + /** Filesystem is fully responsible for page cache invalidation. */ + unsigned explicit_inval_data:1; + + /** Does the filesystem support readdirplus? */ + unsigned do_readdirplus:1; + + /** Does the filesystem want adaptive readdirplus? */ + unsigned readdirplus_auto:1; + + /** Does the filesystem support asynchronous direct-IO submission? */ + unsigned async_dio:1; + + /** Is lseek not implemented by fs? */ + unsigned no_lseek:1; + + /** Does the filesystem support posix acls? */ + unsigned posix_acl:1; + + /** Check permissions based on the file mode or not? */ + unsigned default_permissions:1; + + /** Allow other than the mounter user to access the filesystem ? */ + unsigned allow_other:1; + + /** Does the filesystem support copy_file_range? */ + unsigned no_copy_file_range:1; + + /* Send DESTROY request */ + unsigned int destroy:1; + + /* Delete dentries that have gone stale */ + unsigned int delete_stale:1; + + /** Do not create entry in fusectl fs */ + unsigned int no_control:1; + + /** Do not allow MNT_FORCE umount */ + unsigned int no_force_umount:1; + + /* Auto-mount submounts announced by the server */ + unsigned int auto_submounts:1; + + /* Propagate syncfs() to server */ + unsigned int sync_fs:1; + + /* Initialize security xattrs when creating a new inode */ + unsigned int init_security:1; + + /* Does the filesystem support per inode DAX? */ + unsigned int inode_dax:1; + + /* Is tmpfile not implemented by fs? */ + unsigned int no_tmpfile:1; + + /** The number of requests waiting for completion */ + atomic_t num_waiting; + + /** Negotiated minor version */ + unsigned minor; + + /** Entry on the fuse_mount_list */ + struct list_head entry; + + /** Device ID from the root super block */ + dev_t dev; + + /** Dentries in the control filesystem */ + struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES]; + + /** number of dentries used in the above array */ + int ctl_ndents; + + /** Key for lock owner ID scrambling */ + u32 scramble_key[4]; + + /** Version counter for attribute changes */ + atomic64_t attr_version; + + /** Called on final put */ + void (*release)(struct fuse_conn *); + + /** + * Read/write semaphore to hold when accessing the sb of any + * fuse_mount belonging to this connection + */ + struct rw_semaphore killsb; + + /** List of device instances belonging to this connection */ + struct list_head devices; + +#ifdef CONFIG_FUSE_DAX + /* Dax mode */ + enum fuse_dax_mode dax_mode; + + /* Dax specific conn data, non-NULL if DAX is enabled */ + struct fuse_conn_dax *dax; +#endif + + /** List of filesystems using this connection */ + struct list_head mounts; + + /* New writepages go into this bucket */ + struct fuse_sync_bucket __rcu *curr_bucket; +}; + +/* + * Represents a mounted filesystem, potentially a submount. + * + * This object allows sharing a fuse_conn between separate mounts to + * allow submounts with dedicated superblocks and thus separate device + * IDs. + */ +struct fuse_mount { + /* Underlying (potentially shared) connection to the FUSE server */ + struct fuse_conn *fc; + + /* + * Super block for this connection (fc->killsb must be held when + * accessing this). + */ + struct super_block *sb; + + /* Entry on fc->mounts */ + struct list_head fc_entry; +}; + +static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) +{ + return get_fuse_mount_super(sb)->fc; +} + +static inline struct fuse_mount *get_fuse_mount(struct inode *inode) +{ + return get_fuse_mount_super(inode->i_sb); +} + +static inline struct fuse_conn *get_fuse_conn(struct inode *inode) +{ + return get_fuse_mount_super(inode->i_sb)->fc; +} + +static inline struct fuse_inode *get_fuse_inode(struct inode *inode) +{ + return container_of(inode, struct fuse_inode, inode); +} + +static inline u64 get_node_id(struct inode *inode) +{ + return get_fuse_inode(inode)->nodeid; +} + +static inline int invalid_nodeid(u64 nodeid) +{ + return !nodeid || nodeid == FUSE_ROOT_ID; +} + +static inline u64 fuse_get_attr_version(struct fuse_conn *fc) +{ + return atomic64_read(&fc->attr_version); +} + +static inline bool fuse_stale_inode(const struct inode *inode, int generation, + struct fuse_attr *attr) +{ + return inode->i_generation != generation || + inode_wrong_type(inode, attr->mode); +} + +static inline void fuse_make_bad(struct inode *inode) +{ + remove_inode_hash(inode); + set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state); +} + +static inline bool fuse_is_bad(struct inode *inode) +{ + return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state)); +} + +static inline struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, + struct fuse_page_desc **desc) +{ + struct page **pages; + + pages = kzalloc(npages * (sizeof(struct page *) + + sizeof(struct fuse_page_desc)), flags); + *desc = (void *) (pages + npages); + + return pages; +} + +static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, + unsigned int index, + unsigned int nr_pages) +{ + int i; + + for (i = index; i < index + nr_pages; i++) + descs[i].length = PAGE_SIZE - descs[i].offset; +} + +static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket) +{ + /* Need RCU protection to prevent use after free after the decrement */ + rcu_read_lock(); + if (atomic_dec_and_test(&bucket->count)) + wake_up(&bucket->waitq); + rcu_read_unlock(); +} + +/** Device operations */ +extern const struct file_operations fuse_dev_operations; + +extern const struct dentry_operations fuse_dentry_operations; +extern const struct dentry_operations fuse_root_dentry_operations; + +/** + * Get a filled in inode + */ +struct inode *fuse_iget(struct super_block *sb, u64 nodeid, + int generation, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version); + +int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, + struct fuse_entry_out *outarg, struct inode **inode); + +/** + * Send FORGET command + */ +void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, + u64 nodeid, u64 nlookup); + +struct fuse_forget_link *fuse_alloc_forget(void); + +struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, + unsigned int max, + unsigned int *countp); + +/* + * Initialize READ or READDIR request + */ +struct fuse_io_args { + union { + struct { + struct fuse_read_in in; + u64 attr_ver; + } read; + struct { + struct fuse_write_in in; + struct fuse_write_out out; + bool page_locked; + } write; + }; + struct fuse_args_pages ap; + struct fuse_io_priv *io; + struct fuse_file *ff; +}; + +void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, + size_t count, int opcode); + + +/** + * Send OPEN or OPENDIR request + */ +int fuse_open_common(struct inode *inode, struct file *file, bool isdir); + +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm); +void fuse_file_free(struct fuse_file *ff); +void fuse_finish_open(struct inode *inode, struct file *file); + +void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, + unsigned int flags); + +/** + * Send RELEASE or RELEASEDIR request + */ +void fuse_release_common(struct file *file, bool isdir); + +/** + * Send FSYNC or FSYNCDIR request + */ +int fuse_fsync_common(struct file *file, loff_t start, loff_t end, + int datasync, int opcode); + +/** + * Notify poll wakeup + */ +int fuse_notify_poll_wakeup(struct fuse_conn *fc, + struct fuse_notify_poll_wakeup_out *outarg); + +/** + * Initialize file operations on a regular file + */ +void fuse_init_file_inode(struct inode *inode, unsigned int flags); + +/** + * Initialize inode operations on regular files and special files + */ +void fuse_init_common(struct inode *inode); + +/** + * Initialize inode and file operations on a directory + */ +void fuse_init_dir(struct inode *inode); + +/** + * Initialize inode operations on a symlink + */ +void fuse_init_symlink(struct inode *inode); + +/** + * Change attributes of an inode + */ +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version); + +void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid, u32 cache_mask); + +u32 fuse_get_cache_mask(struct inode *inode); + +/** + * Initialize the client device + */ +int fuse_dev_init(void); + +/** + * Cleanup the client device + */ +void fuse_dev_cleanup(void); + +int fuse_ctl_init(void); +void __exit fuse_ctl_cleanup(void); + +/** + * Simple request sending that does request allocation and freeing + */ +ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args); +int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, + gfp_t gfp_flags); + +/** + * End a finished request + */ +void fuse_request_end(struct fuse_req *req); + +/* Abort all requests */ +void fuse_abort_conn(struct fuse_conn *fc); +void fuse_wait_aborted(struct fuse_conn *fc); + +/** + * Invalidate inode attributes + */ + +/* Attributes possibly changed on data modification */ +#define FUSE_STATX_MODIFY (STATX_MTIME | STATX_CTIME | STATX_BLOCKS) + +/* Attributes possibly changed on data and/or size modification */ +#define FUSE_STATX_MODSIZE (FUSE_STATX_MODIFY | STATX_SIZE) + +void fuse_invalidate_attr(struct inode *inode); +void fuse_invalidate_attr_mask(struct inode *inode, u32 mask); + +void fuse_invalidate_entry_cache(struct dentry *entry); + +void fuse_invalidate_atime(struct inode *inode); + +u64 entry_attr_timeout(struct fuse_entry_out *o); +void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o); + +/** + * Acquire reference to fuse_conn + */ +struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); + +/** + * Initialize fuse_conn + */ +void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, + struct user_namespace *user_ns, + const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv); + +/** + * Release reference to fuse_conn + */ +void fuse_conn_put(struct fuse_conn *fc); + +struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); +struct fuse_dev *fuse_dev_alloc(void); +void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); +void fuse_dev_free(struct fuse_dev *fud); +void fuse_send_init(struct fuse_mount *fm); + +/** + * Fill in superblock and initialize fuse connection + * @sb: partially-initialized superblock to fill in + * @ctx: mount context + */ +int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx); + +/* + * Remove the mount from the connection + * + * Returns whether this was the last mount + */ +bool fuse_mount_remove(struct fuse_mount *fm); + +/* + * Setup context ops for submounts + */ +int fuse_init_fs_context_submount(struct fs_context *fsc); + +/* + * Shut down the connection (possibly sending DESTROY request). + */ +void fuse_conn_destroy(struct fuse_mount *fm); + +/* Drop the connection and free the fuse mount */ +void fuse_mount_destroy(struct fuse_mount *fm); + +/** + * Add connection to control filesystem + */ +int fuse_ctl_add_conn(struct fuse_conn *fc); + +/** + * Remove connection from control filesystem + */ +void fuse_ctl_remove_conn(struct fuse_conn *fc); + +/** + * Is file type valid? + */ +int fuse_valid_type(int m); + +bool fuse_invalid_attr(struct fuse_attr *attr); + +/** + * Is current process allowed to perform filesystem operation? + */ +int fuse_allow_current_process(struct fuse_conn *fc); + +u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); + +void fuse_flush_time_update(struct inode *inode); +void fuse_update_ctime(struct inode *inode); + +int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask); + +void fuse_flush_writepages(struct inode *inode); + +void fuse_set_nowrite(struct inode *inode); +void fuse_release_nowrite(struct inode *inode); + +/** + * Scan all fuse_mounts belonging to fc to find the first where + * ilookup5() returns a result. Return that result and the + * respective fuse_mount in *fm (unless fm is NULL). + * + * The caller must hold fc->killsb. + */ +struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, + struct fuse_mount **fm); + +/** + * File-system tells the kernel to invalidate cache for the given node id. + */ +int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, + loff_t offset, loff_t len); + +/** + * File-system tells the kernel to invalidate parent attributes and + * the dentry matching parent/name. + * + * If the child_nodeid is non-zero and: + * - matches the inode number for the dentry matching parent/name, + * - is not a mount point + * - is a file or oan empty directory + * then the dentry is unhashed (d_delete()). + */ +int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, + u64 child_nodeid, struct qstr *name); + +int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, + bool isdir); + +/** + * fuse_direct_io() flags + */ + +/** If set, it is WRITE; otherwise - READ */ +#define FUSE_DIO_WRITE (1 << 0) + +/** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */ +#define FUSE_DIO_CUSE (1 << 1) + +ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, + loff_t *ppos, int flags); +long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, + unsigned int flags); +long fuse_ioctl_common(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags); +__poll_t fuse_file_poll(struct file *file, poll_table *wait); +int fuse_dev_release(struct inode *inode, struct file *file); + +bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written); + +int fuse_flush_times(struct inode *inode, struct fuse_file *ff); +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); + +int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, + struct file *file); + +void fuse_set_initialized(struct fuse_conn *fc); + +void fuse_unlock_inode(struct inode *inode, bool locked); +bool fuse_lock_inode(struct inode *inode); + +int fuse_setxattr(struct inode *inode, const char *name, const void *value, + size_t size, int flags, unsigned int extra_flags); +ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, + size_t size); +ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); +int fuse_removexattr(struct inode *inode, const char *name); +extern const struct xattr_handler *fuse_xattr_handlers[]; +extern const struct xattr_handler *fuse_acl_xattr_handlers[]; +extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; + +struct posix_acl; +struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu); +int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); + +/* readdir.c */ +int fuse_readdir(struct file *file, struct dir_context *ctx); + +/** + * Return the number of bytes in an arguments list + */ +unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args); + +/** + * Get the next unique ID for a request + */ +u64 fuse_get_unique(struct fuse_iqueue *fiq); +void fuse_free_conn(struct fuse_conn *fc); + +/* dax.c */ + +#define FUSE_IS_DAX(inode) (IS_ENABLED(CONFIG_FUSE_DAX) && IS_DAX(inode)) + +ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); +ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); +int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma); +int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end); +int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode mode, + struct dax_device *dax_dev); +void fuse_dax_conn_free(struct fuse_conn *fc); +bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi); +void fuse_dax_inode_init(struct inode *inode, unsigned int flags); +void fuse_dax_inode_cleanup(struct inode *inode); +void fuse_dax_dontcache(struct inode *inode, unsigned int flags); +bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment); +void fuse_dax_cancel_work(struct fuse_conn *fc); + +/* ioctl.c */ +long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg); +int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int fuse_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); + +/* file.c */ + +struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, + unsigned int open_flags, bool isdir); +void fuse_file_release(struct inode *inode, struct fuse_file *ff, + unsigned int open_flags, fl_owner_t id, bool isdir); + +#endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c new file mode 100644 index 000000000..f81000d96 --- /dev/null +++ b/fs/fuse/inode.c @@ -0,0 +1,2068 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Miklos Szeredi "); +MODULE_DESCRIPTION("Filesystem in Userspace"); +MODULE_LICENSE("GPL"); + +static struct kmem_cache *fuse_inode_cachep; +struct list_head fuse_conn_list; +DEFINE_MUTEX(fuse_mutex); + +static int set_global_limit(const char *val, const struct kernel_param *kp); + +unsigned max_user_bgreq; +module_param_call(max_user_bgreq, set_global_limit, param_get_uint, + &max_user_bgreq, 0644); +__MODULE_PARM_TYPE(max_user_bgreq, "uint"); +MODULE_PARM_DESC(max_user_bgreq, + "Global limit for the maximum number of backgrounded requests an " + "unprivileged user can set"); + +unsigned max_user_congthresh; +module_param_call(max_user_congthresh, set_global_limit, param_get_uint, + &max_user_congthresh, 0644); +__MODULE_PARM_TYPE(max_user_congthresh, "uint"); +MODULE_PARM_DESC(max_user_congthresh, + "Global limit for the maximum congestion threshold an " + "unprivileged user can set"); + +#define FUSE_DEFAULT_BLKSIZE 512 + +/** Maximum number of outstanding background requests */ +#define FUSE_DEFAULT_MAX_BACKGROUND 12 + +/** Congestion starts at 75% of maximum */ +#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) + +#ifdef CONFIG_BLOCK +static struct file_system_type fuseblk_fs_type; +#endif + +struct fuse_forget_link *fuse_alloc_forget(void) +{ + return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT); +} + +static struct fuse_submount_lookup *fuse_alloc_submount_lookup(void) +{ + struct fuse_submount_lookup *sl; + + sl = kzalloc(sizeof(struct fuse_submount_lookup), GFP_KERNEL_ACCOUNT); + if (!sl) + return NULL; + sl->forget = fuse_alloc_forget(); + if (!sl->forget) + goto out_free; + + return sl; + +out_free: + kfree(sl); + return NULL; +} + +static struct inode *fuse_alloc_inode(struct super_block *sb) +{ + struct fuse_inode *fi; + + fi = alloc_inode_sb(sb, fuse_inode_cachep, GFP_KERNEL); + if (!fi) + return NULL; + + fi->i_time = 0; + fi->inval_mask = 0; + fi->nodeid = 0; + fi->nlookup = 0; + fi->attr_version = 0; + fi->orig_ino = 0; + fi->state = 0; + fi->submount_lookup = NULL; + mutex_init(&fi->mutex); + spin_lock_init(&fi->lock); + fi->forget = fuse_alloc_forget(); + if (!fi->forget) + goto out_free; + + if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi)) + goto out_free_forget; + + return &fi->inode; + +out_free_forget: + kfree(fi->forget); +out_free: + kmem_cache_free(fuse_inode_cachep, fi); + return NULL; +} + +static void fuse_free_inode(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + mutex_destroy(&fi->mutex); + kfree(fi->forget); +#ifdef CONFIG_FUSE_DAX + kfree(fi->dax); +#endif + kmem_cache_free(fuse_inode_cachep, fi); +} + +static void fuse_cleanup_submount_lookup(struct fuse_conn *fc, + struct fuse_submount_lookup *sl) +{ + if (!refcount_dec_and_test(&sl->count)) + return; + + fuse_queue_forget(fc, sl->forget, sl->nodeid, 1); + sl->forget = NULL; + kfree(sl); +} + +static void fuse_evict_inode(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + /* Will write inode on close/munmap and in all other dirtiers */ + WARN_ON(inode->i_state & I_DIRTY_INODE); + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + if (inode->i_sb->s_flags & SB_ACTIVE) { + struct fuse_conn *fc = get_fuse_conn(inode); + + if (FUSE_IS_DAX(inode)) + fuse_dax_inode_cleanup(inode); + if (fi->nlookup) { + fuse_queue_forget(fc, fi->forget, fi->nodeid, + fi->nlookup); + fi->forget = NULL; + } + + if (fi->submount_lookup) { + fuse_cleanup_submount_lookup(fc, fi->submount_lookup); + fi->submount_lookup = NULL; + } + } + if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { + WARN_ON(!list_empty(&fi->write_files)); + WARN_ON(!list_empty(&fi->queued_writes)); + } +} + +static int fuse_reconfigure(struct fs_context *fsc) +{ + struct super_block *sb = fsc->root->d_sb; + + sync_filesystem(sb); + if (fsc->sb_flags & SB_MANDLOCK) + return -EINVAL; + + return 0; +} + +/* + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. + */ +static ino_t fuse_squash_ino(u64 ino64) +{ + ino_t ino = (ino_t) ino64; + if (sizeof(ino_t) < sizeof(u64)) + ino ^= ino64 >> (sizeof(u64) - sizeof(ino_t)) * 8; + return ino; +} + +void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid, u32 cache_mask) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + lockdep_assert_held(&fi->lock); + + fi->attr_version = atomic64_inc_return(&fc->attr_version); + fi->i_time = attr_valid; + WRITE_ONCE(fi->inval_mask, 0); + + inode->i_ino = fuse_squash_ino(attr->ino); + inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); + set_nlink(inode, attr->nlink); + inode->i_uid = make_kuid(fc->user_ns, attr->uid); + inode->i_gid = make_kgid(fc->user_ns, attr->gid); + inode->i_blocks = attr->blocks; + + /* Sanitize nsecs */ + attr->atimensec = min_t(u32, attr->atimensec, NSEC_PER_SEC - 1); + attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1); + attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1); + + inode->i_atime.tv_sec = attr->atime; + inode->i_atime.tv_nsec = attr->atimensec; + /* mtime from server may be stale due to local buffered write */ + if (!(cache_mask & STATX_MTIME)) { + inode->i_mtime.tv_sec = attr->mtime; + inode->i_mtime.tv_nsec = attr->mtimensec; + } + if (!(cache_mask & STATX_CTIME)) { + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; + } + + if (attr->blksize != 0) + inode->i_blkbits = ilog2(attr->blksize); + else + inode->i_blkbits = inode->i_sb->s_blocksize_bits; + + /* + * Don't set the sticky bit in i_mode, unless we want the VFS + * to check permissions. This prevents failures due to the + * check in may_delete(). + */ + fi->orig_i_mode = inode->i_mode; + if (!fc->default_permissions) + inode->i_mode &= ~S_ISVTX; + + fi->orig_ino = attr->ino; + + /* + * We are refreshing inode data and it is possible that another + * client set suid/sgid or security.capability xattr. So clear + * S_NOSEC. Ideally, we could have cleared it only if suid/sgid + * was set or if security.capability xattr was set. But we don't + * know if security.capability has been set or not. So clear it + * anyway. Its less efficient but should be safe. + */ + inode->i_flags &= ~S_NOSEC; +} + +u32 fuse_get_cache_mask(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) + return 0; + + return STATX_MTIME | STATX_CTIME | STATX_SIZE; +} + +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + u32 cache_mask; + loff_t oldsize; + struct timespec64 old_mtime; + + spin_lock(&fi->lock); + /* + * In case of writeback_cache enabled, writes update mtime, ctime and + * may update i_size. In these cases trust the cached value in the + * inode. + */ + cache_mask = fuse_get_cache_mask(inode); + if (cache_mask & STATX_SIZE) + attr->size = i_size_read(inode); + + if (cache_mask & STATX_MTIME) { + attr->mtime = inode->i_mtime.tv_sec; + attr->mtimensec = inode->i_mtime.tv_nsec; + } + if (cache_mask & STATX_CTIME) { + attr->ctime = inode->i_ctime.tv_sec; + attr->ctimensec = inode->i_ctime.tv_nsec; + } + + if ((attr_version != 0 && fi->attr_version > attr_version) || + test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { + spin_unlock(&fi->lock); + return; + } + + old_mtime = inode->i_mtime; + fuse_change_attributes_common(inode, attr, attr_valid, cache_mask); + + oldsize = inode->i_size; + /* + * In case of writeback_cache enabled, the cached writes beyond EOF + * extend local i_size without keeping userspace server in sync. So, + * attr->size coming from server can be stale. We cannot trust it. + */ + if (!(cache_mask & STATX_SIZE)) + i_size_write(inode, attr->size); + spin_unlock(&fi->lock); + + if (!cache_mask && S_ISREG(inode->i_mode)) { + bool inval = false; + + if (oldsize != attr->size) { + truncate_pagecache(inode, attr->size); + if (!fc->explicit_inval_data) + inval = true; + } else if (fc->auto_inval_data) { + struct timespec64 new_mtime = { + .tv_sec = attr->mtime, + .tv_nsec = attr->mtimensec, + }; + + /* + * Auto inval mode also checks and invalidates if mtime + * has changed. + */ + if (!timespec64_equal(&old_mtime, &new_mtime)) + inval = true; + } + + if (inval) + invalidate_inode_pages2(inode->i_mapping); + } + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_dontcache(inode, attr->flags); +} + +static void fuse_init_submount_lookup(struct fuse_submount_lookup *sl, + u64 nodeid) +{ + sl->nodeid = nodeid; + refcount_set(&sl->count, 1); +} + +static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) +{ + inode->i_mode = attr->mode & S_IFMT; + inode->i_size = attr->size; + inode->i_mtime.tv_sec = attr->mtime; + inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; + if (S_ISREG(inode->i_mode)) { + fuse_init_common(inode); + fuse_init_file_inode(inode, attr->flags); + } else if (S_ISDIR(inode->i_mode)) + fuse_init_dir(inode); + else if (S_ISLNK(inode->i_mode)) + fuse_init_symlink(inode); + else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + fuse_init_common(inode); + init_special_inode(inode, inode->i_mode, + new_decode_dev(attr->rdev)); + } else + BUG(); +} + +static int fuse_inode_eq(struct inode *inode, void *_nodeidp) +{ + u64 nodeid = *(u64 *) _nodeidp; + if (get_node_id(inode) == nodeid) + return 1; + else + return 0; +} + +static int fuse_inode_set(struct inode *inode, void *_nodeidp) +{ + u64 nodeid = *(u64 *) _nodeidp; + get_fuse_inode(inode)->nodeid = nodeid; + return 0; +} + +struct inode *fuse_iget(struct super_block *sb, u64 nodeid, + int generation, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version) +{ + struct inode *inode; + struct fuse_inode *fi; + struct fuse_conn *fc = get_fuse_conn_super(sb); + + /* + * Auto mount points get their node id from the submount root, which is + * not a unique identifier within this filesystem. + * + * To avoid conflicts, do not place submount points into the inode hash + * table. + */ + if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) && + S_ISDIR(attr->mode)) { + struct fuse_inode *fi; + + inode = new_inode(sb); + if (!inode) + return NULL; + + fuse_init_inode(inode, attr); + fi = get_fuse_inode(inode); + fi->nodeid = nodeid; + fi->submount_lookup = fuse_alloc_submount_lookup(); + if (!fi->submount_lookup) { + iput(inode); + return NULL; + } + /* Sets nlookup = 1 on fi->submount_lookup->nlookup */ + fuse_init_submount_lookup(fi->submount_lookup, nodeid); + inode->i_flags |= S_AUTOMOUNT; + goto done; + } + +retry: + inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid); + if (!inode) + return NULL; + + if ((inode->i_state & I_NEW)) { + inode->i_flags |= S_NOATIME; + if (!fc->writeback_cache || !S_ISREG(attr->mode)) + inode->i_flags |= S_NOCMTIME; + inode->i_generation = generation; + fuse_init_inode(inode, attr); + unlock_new_inode(inode); + } else if (fuse_stale_inode(inode, generation, attr)) { + /* nodeid was reused, any I/O on the old inode should fail */ + fuse_make_bad(inode); + iput(inode); + goto retry; + } + fi = get_fuse_inode(inode); + spin_lock(&fi->lock); + fi->nlookup++; + spin_unlock(&fi->lock); +done: + fuse_change_attributes(inode, attr, attr_valid, attr_version); + + return inode; +} + +struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, + struct fuse_mount **fm) +{ + struct fuse_mount *fm_iter; + struct inode *inode; + + WARN_ON(!rwsem_is_locked(&fc->killsb)); + list_for_each_entry(fm_iter, &fc->mounts, fc_entry) { + if (!fm_iter->sb) + continue; + + inode = ilookup5(fm_iter->sb, nodeid, fuse_inode_eq, &nodeid); + if (inode) { + if (fm) + *fm = fm_iter; + return inode; + } + } + + return NULL; +} + +int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, + loff_t offset, loff_t len) +{ + struct fuse_inode *fi; + struct inode *inode; + pgoff_t pg_start; + pgoff_t pg_end; + + inode = fuse_ilookup(fc, nodeid, NULL); + if (!inode) + return -ENOENT; + + fi = get_fuse_inode(inode); + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + spin_unlock(&fi->lock); + + fuse_invalidate_attr(inode); + forget_all_cached_acls(inode); + if (offset >= 0) { + pg_start = offset >> PAGE_SHIFT; + if (len <= 0) + pg_end = -1; + else + pg_end = (offset + len - 1) >> PAGE_SHIFT; + invalidate_inode_pages2_range(inode->i_mapping, + pg_start, pg_end); + } + iput(inode); + return 0; +} + +bool fuse_lock_inode(struct inode *inode) +{ + bool locked = false; + + if (!get_fuse_conn(inode)->parallel_dirops) { + mutex_lock(&get_fuse_inode(inode)->mutex); + locked = true; + } + + return locked; +} + +void fuse_unlock_inode(struct inode *inode, bool locked) +{ + if (locked) + mutex_unlock(&get_fuse_inode(inode)->mutex); +} + +static void fuse_umount_begin(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + if (fc->no_force_umount) + return; + + fuse_abort_conn(fc); + + // Only retire block-device-based superblocks. + if (sb->s_bdev != NULL) + retire_super(sb); +} + +static void fuse_send_destroy(struct fuse_mount *fm) +{ + if (fm->fc->conn_init) { + FUSE_ARGS(args); + + args.opcode = FUSE_DESTROY; + args.force = true; + args.nocreds = true; + fuse_simple_request(fm, &args); + } +} + +static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) +{ + stbuf->f_type = FUSE_SUPER_MAGIC; + stbuf->f_bsize = attr->bsize; + stbuf->f_frsize = attr->frsize; + stbuf->f_blocks = attr->blocks; + stbuf->f_bfree = attr->bfree; + stbuf->f_bavail = attr->bavail; + stbuf->f_files = attr->files; + stbuf->f_ffree = attr->ffree; + stbuf->f_namelen = attr->namelen; + /* fsid is left zero */ +} + +static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct fuse_mount *fm = get_fuse_mount_super(sb); + FUSE_ARGS(args); + struct fuse_statfs_out outarg; + int err; + + if (!fuse_allow_current_process(fm->fc)) { + buf->f_type = FUSE_SUPER_MAGIC; + return 0; + } + + memset(&outarg, 0, sizeof(outarg)); + args.in_numargs = 0; + args.opcode = FUSE_STATFS; + args.nodeid = get_node_id(d_inode(dentry)); + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (!err) + convert_fuse_statfs(buf, &outarg.st); + return err; +} + +static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void) +{ + struct fuse_sync_bucket *bucket; + + bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL); + if (bucket) { + init_waitqueue_head(&bucket->waitq); + /* Initial active count */ + atomic_set(&bucket->count, 1); + } + return bucket; +} + +static void fuse_sync_fs_writes(struct fuse_conn *fc) +{ + struct fuse_sync_bucket *bucket, *new_bucket; + int count; + + new_bucket = fuse_sync_bucket_alloc(); + spin_lock(&fc->lock); + bucket = rcu_dereference_protected(fc->curr_bucket, 1); + count = atomic_read(&bucket->count); + WARN_ON(count < 1); + /* No outstanding writes? */ + if (count == 1) { + spin_unlock(&fc->lock); + kfree(new_bucket); + return; + } + + /* + * Completion of new bucket depends on completion of this bucket, so add + * one more count. + */ + atomic_inc(&new_bucket->count); + rcu_assign_pointer(fc->curr_bucket, new_bucket); + spin_unlock(&fc->lock); + /* + * Drop initial active count. At this point if all writes in this and + * ancestor buckets complete, the count will go to zero and this task + * will be woken up. + */ + atomic_dec(&bucket->count); + + wait_event(bucket->waitq, atomic_read(&bucket->count) == 0); + + /* Drop temp count on descendant bucket */ + fuse_sync_bucket_dec(new_bucket); + kfree_rcu(bucket, rcu); +} + +static int fuse_sync_fs(struct super_block *sb, int wait) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct fuse_conn *fc = fm->fc; + struct fuse_syncfs_in inarg; + FUSE_ARGS(args); + int err; + + /* + * Userspace cannot handle the wait == 0 case. Avoid a + * gratuitous roundtrip. + */ + if (!wait) + return 0; + + /* The filesystem is being unmounted. Nothing to do. */ + if (!sb->s_root) + return 0; + + if (!fc->sync_fs) + return 0; + + fuse_sync_fs_writes(fc); + + memset(&inarg, 0, sizeof(inarg)); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.opcode = FUSE_SYNCFS; + args.nodeid = get_node_id(sb->s_root->d_inode); + args.out_numargs = 0; + + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + fc->sync_fs = 0; + err = 0; + } + + return err; +} + +enum { + OPT_SOURCE, + OPT_SUBTYPE, + OPT_FD, + OPT_ROOTMODE, + OPT_USER_ID, + OPT_GROUP_ID, + OPT_DEFAULT_PERMISSIONS, + OPT_ALLOW_OTHER, + OPT_MAX_READ, + OPT_BLKSIZE, + OPT_ERR +}; + +static const struct fs_parameter_spec fuse_fs_parameters[] = { + fsparam_string ("source", OPT_SOURCE), + fsparam_u32 ("fd", OPT_FD), + fsparam_u32oct ("rootmode", OPT_ROOTMODE), + fsparam_u32 ("user_id", OPT_USER_ID), + fsparam_u32 ("group_id", OPT_GROUP_ID), + fsparam_flag ("default_permissions", OPT_DEFAULT_PERMISSIONS), + fsparam_flag ("allow_other", OPT_ALLOW_OTHER), + fsparam_u32 ("max_read", OPT_MAX_READ), + fsparam_u32 ("blksize", OPT_BLKSIZE), + fsparam_string ("subtype", OPT_SUBTYPE), + {} +}; + +static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param) +{ + struct fs_parse_result result; + struct fuse_fs_context *ctx = fsc->fs_private; + int opt; + + if (fsc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + /* + * Ignore options coming from mount(MS_REMOUNT) for backward + * compatibility. + */ + if (fsc->oldapi) + return 0; + + return invalfc(fsc, "No changes allowed in reconfigure"); + } + + opt = fs_parse(fsc, fuse_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case OPT_SOURCE: + if (fsc->source) + return invalfc(fsc, "Multiple sources specified"); + fsc->source = param->string; + param->string = NULL; + break; + + case OPT_SUBTYPE: + if (ctx->subtype) + return invalfc(fsc, "Multiple subtypes specified"); + ctx->subtype = param->string; + param->string = NULL; + return 0; + + case OPT_FD: + ctx->fd = result.uint_32; + ctx->fd_present = true; + break; + + case OPT_ROOTMODE: + if (!fuse_valid_type(result.uint_32)) + return invalfc(fsc, "Invalid rootmode"); + ctx->rootmode = result.uint_32; + ctx->rootmode_present = true; + break; + + case OPT_USER_ID: + ctx->user_id = make_kuid(fsc->user_ns, result.uint_32); + if (!uid_valid(ctx->user_id)) + return invalfc(fsc, "Invalid user_id"); + ctx->user_id_present = true; + break; + + case OPT_GROUP_ID: + ctx->group_id = make_kgid(fsc->user_ns, result.uint_32); + if (!gid_valid(ctx->group_id)) + return invalfc(fsc, "Invalid group_id"); + ctx->group_id_present = true; + break; + + case OPT_DEFAULT_PERMISSIONS: + ctx->default_permissions = true; + break; + + case OPT_ALLOW_OTHER: + ctx->allow_other = true; + break; + + case OPT_MAX_READ: + ctx->max_read = result.uint_32; + break; + + case OPT_BLKSIZE: + if (!ctx->is_bdev) + return invalfc(fsc, "blksize only supported for fuseblk"); + ctx->blksize = result.uint_32; + break; + + default: + return -EINVAL; + } + + return 0; +} + +static void fuse_free_fsc(struct fs_context *fsc) +{ + struct fuse_fs_context *ctx = fsc->fs_private; + + if (ctx) { + kfree(ctx->subtype); + kfree(ctx); + } +} + +static int fuse_show_options(struct seq_file *m, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct fuse_conn *fc = get_fuse_conn_super(sb); + + if (fc->legacy_opts_show) { + seq_printf(m, ",user_id=%u", + from_kuid_munged(fc->user_ns, fc->user_id)); + seq_printf(m, ",group_id=%u", + from_kgid_munged(fc->user_ns, fc->group_id)); + if (fc->default_permissions) + seq_puts(m, ",default_permissions"); + if (fc->allow_other) + seq_puts(m, ",allow_other"); + if (fc->max_read != ~0) + seq_printf(m, ",max_read=%u", fc->max_read); + if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) + seq_printf(m, ",blksize=%lu", sb->s_blocksize); + } +#ifdef CONFIG_FUSE_DAX + if (fc->dax_mode == FUSE_DAX_ALWAYS) + seq_puts(m, ",dax=always"); + else if (fc->dax_mode == FUSE_DAX_NEVER) + seq_puts(m, ",dax=never"); + else if (fc->dax_mode == FUSE_DAX_INODE_USER) + seq_puts(m, ",dax=inode"); +#endif + + return 0; +} + +static void fuse_iqueue_init(struct fuse_iqueue *fiq, + const struct fuse_iqueue_ops *ops, + void *priv) +{ + memset(fiq, 0, sizeof(struct fuse_iqueue)); + spin_lock_init(&fiq->lock); + init_waitqueue_head(&fiq->waitq); + INIT_LIST_HEAD(&fiq->pending); + INIT_LIST_HEAD(&fiq->interrupts); + fiq->forget_list_tail = &fiq->forget_list_head; + fiq->connected = 1; + fiq->ops = ops; + fiq->priv = priv; +} + +static void fuse_pqueue_init(struct fuse_pqueue *fpq) +{ + unsigned int i; + + spin_lock_init(&fpq->lock); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + INIT_LIST_HEAD(&fpq->processing[i]); + INIT_LIST_HEAD(&fpq->io); + fpq->connected = 1; +} + +void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, + struct user_namespace *user_ns, + const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv) +{ + memset(fc, 0, sizeof(*fc)); + spin_lock_init(&fc->lock); + spin_lock_init(&fc->bg_lock); + init_rwsem(&fc->killsb); + refcount_set(&fc->count, 1); + atomic_set(&fc->dev_count, 1); + init_waitqueue_head(&fc->blocked_waitq); + fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); + INIT_LIST_HEAD(&fc->bg_queue); + INIT_LIST_HEAD(&fc->entry); + INIT_LIST_HEAD(&fc->devices); + atomic_set(&fc->num_waiting, 0); + fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; + fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; + atomic64_set(&fc->khctr, 0); + fc->polled_files = RB_ROOT; + fc->blocked = 0; + fc->initialized = 0; + fc->connected = 1; + atomic64_set(&fc->attr_version, 1); + get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); + fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); + fc->user_ns = get_user_ns(user_ns); + fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; + fc->max_pages_limit = FUSE_MAX_MAX_PAGES; + + INIT_LIST_HEAD(&fc->mounts); + list_add(&fm->fc_entry, &fc->mounts); + fm->fc = fc; +} +EXPORT_SYMBOL_GPL(fuse_conn_init); + +void fuse_conn_put(struct fuse_conn *fc) +{ + if (refcount_dec_and_test(&fc->count)) { + struct fuse_iqueue *fiq = &fc->iq; + struct fuse_sync_bucket *bucket; + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_conn_free(fc); + if (fiq->ops->release) + fiq->ops->release(fiq); + put_pid_ns(fc->pid_ns); + put_user_ns(fc->user_ns); + bucket = rcu_dereference_protected(fc->curr_bucket, 1); + if (bucket) { + WARN_ON(atomic_read(&bucket->count) != 1); + kfree(bucket); + } + fc->release(fc); + } +} +EXPORT_SYMBOL_GPL(fuse_conn_put); + +struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) +{ + refcount_inc(&fc->count); + return fc; +} +EXPORT_SYMBOL_GPL(fuse_conn_get); + +static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) +{ + struct fuse_attr attr; + memset(&attr, 0, sizeof(attr)); + + attr.mode = mode; + attr.ino = FUSE_ROOT_ID; + attr.nlink = 1; + return fuse_iget(sb, 1, 0, &attr, 0, 0); +} + +struct fuse_inode_handle { + u64 nodeid; + u32 generation; +}; + +static struct dentry *fuse_get_dentry(struct super_block *sb, + struct fuse_inode_handle *handle) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct inode *inode; + struct dentry *entry; + int err = -ESTALE; + + if (handle->nodeid == 0) + goto out_err; + + inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid); + if (!inode) { + struct fuse_entry_out outarg; + const struct qstr name = QSTR_INIT(".", 1); + + if (!fc->export_support) + goto out_err; + + err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg, + &inode); + if (err && err != -ENOENT) + goto out_err; + if (err || !inode) { + err = -ESTALE; + goto out_err; + } + err = -EIO; + if (get_node_id(inode) != handle->nodeid) + goto out_iput; + } + err = -ESTALE; + if (inode->i_generation != handle->generation) + goto out_iput; + + entry = d_obtain_alias(inode); + if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) + fuse_invalidate_entry_cache(entry); + + return entry; + + out_iput: + iput(inode); + out_err: + return ERR_PTR(err); +} + +static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) +{ + int len = parent ? 6 : 3; + u64 nodeid; + u32 generation; + + if (*max_len < len) { + *max_len = len; + return FILEID_INVALID; + } + + nodeid = get_fuse_inode(inode)->nodeid; + generation = inode->i_generation; + + fh[0] = (u32)(nodeid >> 32); + fh[1] = (u32)(nodeid & 0xffffffff); + fh[2] = generation; + + if (parent) { + nodeid = get_fuse_inode(parent)->nodeid; + generation = parent->i_generation; + + fh[3] = (u32)(nodeid >> 32); + fh[4] = (u32)(nodeid & 0xffffffff); + fh[5] = generation; + } + + *max_len = len; + return parent ? 0x82 : 0x81; +} + +static struct dentry *fuse_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct fuse_inode_handle handle; + + if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3) + return NULL; + + handle.nodeid = (u64) fid->raw[0] << 32; + handle.nodeid |= (u64) fid->raw[1]; + handle.generation = fid->raw[2]; + return fuse_get_dentry(sb, &handle); +} + +static struct dentry *fuse_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct fuse_inode_handle parent; + + if (fh_type != 0x82 || fh_len < 6) + return NULL; + + parent.nodeid = (u64) fid->raw[3] << 32; + parent.nodeid |= (u64) fid->raw[4]; + parent.generation = fid->raw[5]; + return fuse_get_dentry(sb, &parent); +} + +static struct dentry *fuse_get_parent(struct dentry *child) +{ + struct inode *child_inode = d_inode(child); + struct fuse_conn *fc = get_fuse_conn(child_inode); + struct inode *inode; + struct dentry *parent; + struct fuse_entry_out outarg; + int err; + + if (!fc->export_support) + return ERR_PTR(-ESTALE); + + err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), + &dotdot_name, &outarg, &inode); + if (err) { + if (err == -ENOENT) + return ERR_PTR(-ESTALE); + return ERR_PTR(err); + } + + parent = d_obtain_alias(inode); + if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) + fuse_invalidate_entry_cache(parent); + + return parent; +} + +static const struct export_operations fuse_export_operations = { + .fh_to_dentry = fuse_fh_to_dentry, + .fh_to_parent = fuse_fh_to_parent, + .encode_fh = fuse_encode_fh, + .get_parent = fuse_get_parent, +}; + +static const struct super_operations fuse_super_operations = { + .alloc_inode = fuse_alloc_inode, + .free_inode = fuse_free_inode, + .evict_inode = fuse_evict_inode, + .write_inode = fuse_write_inode, + .drop_inode = generic_delete_inode, + .umount_begin = fuse_umount_begin, + .statfs = fuse_statfs, + .sync_fs = fuse_sync_fs, + .show_options = fuse_show_options, +}; + +static void sanitize_global_limit(unsigned *limit) +{ + /* + * The default maximum number of async requests is calculated to consume + * 1/2^13 of the total memory, assuming 392 bytes per request. + */ + if (*limit == 0) + *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / 392; + + if (*limit >= 1 << 16) + *limit = (1 << 16) - 1; +} + +static int set_global_limit(const char *val, const struct kernel_param *kp) +{ + int rv; + + rv = param_set_uint(val, kp); + if (rv) + return rv; + + sanitize_global_limit((unsigned *)kp->arg); + + return 0; +} + +static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) +{ + int cap_sys_admin = capable(CAP_SYS_ADMIN); + + if (arg->minor < 13) + return; + + sanitize_global_limit(&max_user_bgreq); + sanitize_global_limit(&max_user_congthresh); + + spin_lock(&fc->bg_lock); + if (arg->max_background) { + fc->max_background = arg->max_background; + + if (!cap_sys_admin && fc->max_background > max_user_bgreq) + fc->max_background = max_user_bgreq; + } + if (arg->congestion_threshold) { + fc->congestion_threshold = arg->congestion_threshold; + + if (!cap_sys_admin && + fc->congestion_threshold > max_user_congthresh) + fc->congestion_threshold = max_user_congthresh; + } + spin_unlock(&fc->bg_lock); +} + +struct fuse_init_args { + struct fuse_args args; + struct fuse_init_in in; + struct fuse_init_out out; +}; + +static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, + int error) +{ + struct fuse_conn *fc = fm->fc; + struct fuse_init_args *ia = container_of(args, typeof(*ia), args); + struct fuse_init_out *arg = &ia->out; + bool ok = true; + + if (error || arg->major != FUSE_KERNEL_VERSION) + ok = false; + else { + unsigned long ra_pages; + + process_init_limits(fc, arg); + + if (arg->minor >= 6) { + u64 flags = arg->flags; + + if (flags & FUSE_INIT_EXT) + flags |= (u64) arg->flags2 << 32; + + ra_pages = arg->max_readahead / PAGE_SIZE; + if (flags & FUSE_ASYNC_READ) + fc->async_read = 1; + if (!(flags & FUSE_POSIX_LOCKS)) + fc->no_lock = 1; + if (arg->minor >= 17) { + if (!(flags & FUSE_FLOCK_LOCKS)) + fc->no_flock = 1; + } else { + if (!(flags & FUSE_POSIX_LOCKS)) + fc->no_flock = 1; + } + if (flags & FUSE_ATOMIC_O_TRUNC) + fc->atomic_o_trunc = 1; + if (arg->minor >= 9) { + /* LOOKUP has dependency on proto version */ + if (flags & FUSE_EXPORT_SUPPORT) + fc->export_support = 1; + } + if (flags & FUSE_BIG_WRITES) + fc->big_writes = 1; + if (flags & FUSE_DONT_MASK) + fc->dont_mask = 1; + if (flags & FUSE_AUTO_INVAL_DATA) + fc->auto_inval_data = 1; + else if (flags & FUSE_EXPLICIT_INVAL_DATA) + fc->explicit_inval_data = 1; + if (flags & FUSE_DO_READDIRPLUS) { + fc->do_readdirplus = 1; + if (flags & FUSE_READDIRPLUS_AUTO) + fc->readdirplus_auto = 1; + } + if (flags & FUSE_ASYNC_DIO) + fc->async_dio = 1; + if (flags & FUSE_WRITEBACK_CACHE) + fc->writeback_cache = 1; + if (flags & FUSE_PARALLEL_DIROPS) + fc->parallel_dirops = 1; + if (flags & FUSE_HANDLE_KILLPRIV) + fc->handle_killpriv = 1; + if (arg->time_gran && arg->time_gran <= 1000000000) + fm->sb->s_time_gran = arg->time_gran; + if ((flags & FUSE_POSIX_ACL)) { + fc->default_permissions = 1; + fc->posix_acl = 1; + fm->sb->s_xattr = fuse_acl_xattr_handlers; + } + if (flags & FUSE_CACHE_SYMLINKS) + fc->cache_symlinks = 1; + if (flags & FUSE_ABORT_ERROR) + fc->abort_err = 1; + if (flags & FUSE_MAX_PAGES) { + fc->max_pages = + min_t(unsigned int, fc->max_pages_limit, + max_t(unsigned int, arg->max_pages, 1)); + } + if (IS_ENABLED(CONFIG_FUSE_DAX)) { + if (flags & FUSE_MAP_ALIGNMENT && + !fuse_dax_check_alignment(fc, arg->map_alignment)) { + ok = false; + } + if (flags & FUSE_HAS_INODE_DAX) + fc->inode_dax = 1; + } + if (flags & FUSE_HANDLE_KILLPRIV_V2) { + fc->handle_killpriv_v2 = 1; + fm->sb->s_flags |= SB_NOSEC; + } + if (flags & FUSE_SETXATTR_EXT) + fc->setxattr_ext = 1; + if (flags & FUSE_SECURITY_CTX) + fc->init_security = 1; + } else { + ra_pages = fc->max_read / PAGE_SIZE; + fc->no_lock = 1; + fc->no_flock = 1; + } + + fm->sb->s_bdi->ra_pages = + min(fm->sb->s_bdi->ra_pages, ra_pages); + fc->minor = arg->minor; + fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; + fc->max_write = max_t(unsigned, 4096, fc->max_write); + fc->conn_init = 1; + } + kfree(ia); + + if (!ok) { + fc->conn_init = 0; + fc->conn_error = 1; + } + + fuse_set_initialized(fc); + wake_up_all(&fc->blocked_waitq); +} + +void fuse_send_init(struct fuse_mount *fm) +{ + struct fuse_init_args *ia; + u64 flags; + + ia = kzalloc(sizeof(*ia), GFP_KERNEL | __GFP_NOFAIL); + + ia->in.major = FUSE_KERNEL_VERSION; + ia->in.minor = FUSE_KERNEL_MINOR_VERSION; + ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE; + flags = + FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | + FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | + FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | + FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | + FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | + FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | + FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | + FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | + FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | + FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | + FUSE_SECURITY_CTX; +#ifdef CONFIG_FUSE_DAX + if (fm->fc->dax) + flags |= FUSE_MAP_ALIGNMENT; + if (fuse_is_inode_dax_mode(fm->fc->dax_mode)) + flags |= FUSE_HAS_INODE_DAX; +#endif + if (fm->fc->auto_submounts) + flags |= FUSE_SUBMOUNTS; + + ia->in.flags = flags; + ia->in.flags2 = flags >> 32; + + ia->args.opcode = FUSE_INIT; + ia->args.in_numargs = 1; + ia->args.in_args[0].size = sizeof(ia->in); + ia->args.in_args[0].value = &ia->in; + ia->args.out_numargs = 1; + /* Variable length argument used for backward compatibility + with interface version < 7.5. Rest of init_out is zeroed + by do_get_request(), so a short reply is not a problem */ + ia->args.out_argvar = true; + ia->args.out_args[0].size = sizeof(ia->out); + ia->args.out_args[0].value = &ia->out; + ia->args.force = true; + ia->args.nocreds = true; + ia->args.end = process_init_reply; + + if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) + process_init_reply(fm, &ia->args, -ENOTCONN); +} +EXPORT_SYMBOL_GPL(fuse_send_init); + +void fuse_free_conn(struct fuse_conn *fc) +{ + WARN_ON(!list_empty(&fc->devices)); + kfree_rcu(fc, rcu); +} +EXPORT_SYMBOL_GPL(fuse_free_conn); + +static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) +{ + int err; + char *suffix = ""; + + if (sb->s_bdev) { + suffix = "-fuseblk"; + /* + * sb->s_bdi points to blkdev's bdi however we want to redirect + * it to our private bdi... + */ + bdi_put(sb->s_bdi); + sb->s_bdi = &noop_backing_dev_info; + } + err = super_setup_bdi_name(sb, "%u:%u%s", MAJOR(fc->dev), + MINOR(fc->dev), suffix); + if (err) + return err; + + /* fuse does it's own writeback accounting */ + sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT; + sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT; + + /* + * For a single fuse filesystem use max 1% of dirty + + * writeback threshold. + * + * This gives about 1M of write buffer for memory maps on a + * machine with 1G and 10% dirty_ratio, which should be more + * than enough. + * + * Privileged users can raise it by writing to + * + * /sys/class/bdi//max_ratio + */ + bdi_set_max_ratio(sb->s_bdi, 1); + + return 0; +} + +struct fuse_dev *fuse_dev_alloc(void) +{ + struct fuse_dev *fud; + struct list_head *pq; + + fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL); + if (!fud) + return NULL; + + pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); + if (!pq) { + kfree(fud); + return NULL; + } + + fud->pq.processing = pq; + fuse_pqueue_init(&fud->pq); + + return fud; +} +EXPORT_SYMBOL_GPL(fuse_dev_alloc); + +void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc) +{ + fud->fc = fuse_conn_get(fc); + spin_lock(&fc->lock); + list_add_tail(&fud->entry, &fc->devices); + spin_unlock(&fc->lock); +} +EXPORT_SYMBOL_GPL(fuse_dev_install); + +struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc) +{ + struct fuse_dev *fud; + + fud = fuse_dev_alloc(); + if (!fud) + return NULL; + + fuse_dev_install(fud, fc); + return fud; +} +EXPORT_SYMBOL_GPL(fuse_dev_alloc_install); + +void fuse_dev_free(struct fuse_dev *fud) +{ + struct fuse_conn *fc = fud->fc; + + if (fc) { + spin_lock(&fc->lock); + list_del(&fud->entry); + spin_unlock(&fc->lock); + + fuse_conn_put(fc); + } + kfree(fud->pq.processing); + kfree(fud); +} +EXPORT_SYMBOL_GPL(fuse_dev_free); + +static void fuse_fill_attr_from_inode(struct fuse_attr *attr, + const struct fuse_inode *fi) +{ + *attr = (struct fuse_attr){ + .ino = fi->inode.i_ino, + .size = fi->inode.i_size, + .blocks = fi->inode.i_blocks, + .atime = fi->inode.i_atime.tv_sec, + .mtime = fi->inode.i_mtime.tv_sec, + .ctime = fi->inode.i_ctime.tv_sec, + .atimensec = fi->inode.i_atime.tv_nsec, + .mtimensec = fi->inode.i_mtime.tv_nsec, + .ctimensec = fi->inode.i_ctime.tv_nsec, + .mode = fi->inode.i_mode, + .nlink = fi->inode.i_nlink, + .uid = fi->inode.i_uid.val, + .gid = fi->inode.i_gid.val, + .rdev = fi->inode.i_rdev, + .blksize = 1u << fi->inode.i_blkbits, + }; +} + +static void fuse_sb_defaults(struct super_block *sb) +{ + sb->s_magic = FUSE_SUPER_MAGIC; + sb->s_op = &fuse_super_operations; + sb->s_xattr = fuse_xattr_handlers; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_time_gran = 1; + sb->s_export_op = &fuse_export_operations; + sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; + if (sb->s_user_ns != &init_user_ns) + sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; + sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); + + /* + * If we are not in the initial user namespace posix + * acls must be translated. + */ + if (sb->s_user_ns != &init_user_ns) + sb->s_xattr = fuse_no_acl_xattr_handlers; +} + +static int fuse_fill_super_submount(struct super_block *sb, + struct fuse_inode *parent_fi) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct super_block *parent_sb = parent_fi->inode.i_sb; + struct fuse_attr root_attr; + struct inode *root; + struct fuse_submount_lookup *sl; + struct fuse_inode *fi; + + fuse_sb_defaults(sb); + fm->sb = sb; + + WARN_ON(sb->s_bdi != &noop_backing_dev_info); + sb->s_bdi = bdi_get(parent_sb->s_bdi); + + sb->s_xattr = parent_sb->s_xattr; + sb->s_time_gran = parent_sb->s_time_gran; + sb->s_blocksize = parent_sb->s_blocksize; + sb->s_blocksize_bits = parent_sb->s_blocksize_bits; + sb->s_subtype = kstrdup(parent_sb->s_subtype, GFP_KERNEL); + if (parent_sb->s_subtype && !sb->s_subtype) + return -ENOMEM; + + fuse_fill_attr_from_inode(&root_attr, parent_fi); + root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0); + /* + * This inode is just a duplicate, so it is not looked up and + * its nlookup should not be incremented. fuse_iget() does + * that, though, so undo it here. + */ + fi = get_fuse_inode(root); + fi->nlookup--; + + sb->s_d_op = &fuse_dentry_operations; + sb->s_root = d_make_root(root); + if (!sb->s_root) + return -ENOMEM; + + /* + * Grab the parent's submount_lookup pointer and take a + * reference on the shared nlookup from the parent. This is to + * prevent the last forget for this nodeid from getting + * triggered until all users have finished with it. + */ + sl = parent_fi->submount_lookup; + WARN_ON(!sl); + if (sl) { + refcount_inc(&sl->count); + fi->submount_lookup = sl; + } + + return 0; +} + +/* Filesystem context private data holds the FUSE inode of the mount point */ +static int fuse_get_tree_submount(struct fs_context *fsc) +{ + struct fuse_mount *fm; + struct fuse_inode *mp_fi = fsc->fs_private; + struct fuse_conn *fc = get_fuse_conn(&mp_fi->inode); + struct super_block *sb; + int err; + + fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); + if (!fm) + return -ENOMEM; + + fm->fc = fuse_conn_get(fc); + fsc->s_fs_info = fm; + sb = sget_fc(fsc, NULL, set_anon_super_fc); + if (fsc->s_fs_info) + fuse_mount_destroy(fm); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + /* Initialize superblock, making @mp_fi its root */ + err = fuse_fill_super_submount(sb, mp_fi); + if (err) { + deactivate_locked_super(sb); + return err; + } + + down_write(&fc->killsb); + list_add_tail(&fm->fc_entry, &fc->mounts); + up_write(&fc->killsb); + + sb->s_flags |= SB_ACTIVE; + fsc->root = dget(sb->s_root); + + return 0; +} + +static const struct fs_context_operations fuse_context_submount_ops = { + .get_tree = fuse_get_tree_submount, +}; + +int fuse_init_fs_context_submount(struct fs_context *fsc) +{ + fsc->ops = &fuse_context_submount_ops; + return 0; +} +EXPORT_SYMBOL_GPL(fuse_init_fs_context_submount); + +int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) +{ + struct fuse_dev *fud = NULL; + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct fuse_conn *fc = fm->fc; + struct inode *root; + struct dentry *root_dentry; + int err; + + err = -EINVAL; + if (sb->s_flags & SB_MANDLOCK) + goto err; + + rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc()); + fuse_sb_defaults(sb); + + if (ctx->is_bdev) { +#ifdef CONFIG_BLOCK + err = -EINVAL; + if (!sb_set_blocksize(sb, ctx->blksize)) + goto err; +#endif + } else { + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + } + + sb->s_subtype = ctx->subtype; + ctx->subtype = NULL; + if (IS_ENABLED(CONFIG_FUSE_DAX)) { + err = fuse_dax_conn_alloc(fc, ctx->dax_mode, ctx->dax_dev); + if (err) + goto err; + } + + if (ctx->fudptr) { + err = -ENOMEM; + fud = fuse_dev_alloc_install(fc); + if (!fud) + goto err_free_dax; + } + + fc->dev = sb->s_dev; + fm->sb = sb; + err = fuse_bdi_init(fc, sb); + if (err) + goto err_dev_free; + + /* Handle umasking inside the fuse code */ + if (sb->s_flags & SB_POSIXACL) + fc->dont_mask = 1; + sb->s_flags |= SB_POSIXACL; + + fc->default_permissions = ctx->default_permissions; + fc->allow_other = ctx->allow_other; + fc->user_id = ctx->user_id; + fc->group_id = ctx->group_id; + fc->legacy_opts_show = ctx->legacy_opts_show; + fc->max_read = max_t(unsigned int, 4096, ctx->max_read); + fc->destroy = ctx->destroy; + fc->no_control = ctx->no_control; + fc->no_force_umount = ctx->no_force_umount; + + err = -ENOMEM; + root = fuse_get_root_inode(sb, ctx->rootmode); + sb->s_d_op = &fuse_root_dentry_operations; + root_dentry = d_make_root(root); + if (!root_dentry) + goto err_dev_free; + /* Root dentry doesn't have .d_revalidate */ + sb->s_d_op = &fuse_dentry_operations; + + mutex_lock(&fuse_mutex); + err = -EINVAL; + if (ctx->fudptr && *ctx->fudptr) + goto err_unlock; + + err = fuse_ctl_add_conn(fc); + if (err) + goto err_unlock; + + list_add_tail(&fc->entry, &fuse_conn_list); + sb->s_root = root_dentry; + if (ctx->fudptr) + *ctx->fudptr = fud; + mutex_unlock(&fuse_mutex); + return 0; + + err_unlock: + mutex_unlock(&fuse_mutex); + dput(root_dentry); + err_dev_free: + if (fud) + fuse_dev_free(fud); + err_free_dax: + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_conn_free(fc); + err: + return err; +} +EXPORT_SYMBOL_GPL(fuse_fill_super_common); + +static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) +{ + struct fuse_fs_context *ctx = fsc->fs_private; + int err; + + if (!ctx->file || !ctx->rootmode_present || + !ctx->user_id_present || !ctx->group_id_present) + return -EINVAL; + + /* + * Require mount to happen from the same user namespace which + * opened /dev/fuse to prevent potential attacks. + */ + if ((ctx->file->f_op != &fuse_dev_operations) || + (ctx->file->f_cred->user_ns != sb->s_user_ns)) + return -EINVAL; + ctx->fudptr = &ctx->file->private_data; + + err = fuse_fill_super_common(sb, ctx); + if (err) + return err; + /* file->private_data shall be visible on all CPUs after this */ + smp_mb(); + fuse_send_init(get_fuse_mount_super(sb)); + return 0; +} + +/* + * This is the path where user supplied an already initialized fuse dev. In + * this case never create a new super if the old one is gone. + */ +static int fuse_set_no_super(struct super_block *sb, struct fs_context *fsc) +{ + return -ENOTCONN; +} + +static int fuse_test_super(struct super_block *sb, struct fs_context *fsc) +{ + + return fsc->sget_key == get_fuse_conn_super(sb); +} + +static int fuse_get_tree(struct fs_context *fsc) +{ + struct fuse_fs_context *ctx = fsc->fs_private; + struct fuse_dev *fud; + struct fuse_conn *fc; + struct fuse_mount *fm; + struct super_block *sb; + int err; + + fc = kmalloc(sizeof(*fc), GFP_KERNEL); + if (!fc) + return -ENOMEM; + + fm = kzalloc(sizeof(*fm), GFP_KERNEL); + if (!fm) { + kfree(fc); + return -ENOMEM; + } + + fuse_conn_init(fc, fm, fsc->user_ns, &fuse_dev_fiq_ops, NULL); + fc->release = fuse_free_conn; + + fsc->s_fs_info = fm; + + if (ctx->fd_present) + ctx->file = fget(ctx->fd); + + if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) { + err = get_tree_bdev(fsc, fuse_fill_super); + goto out; + } + /* + * While block dev mount can be initialized with a dummy device fd + * (found by device name), normal fuse mounts can't + */ + err = -EINVAL; + if (!ctx->file) + goto out; + + /* + * Allow creating a fuse mount with an already initialized fuse + * connection + */ + fud = READ_ONCE(ctx->file->private_data); + if (ctx->file->f_op == &fuse_dev_operations && fud) { + fsc->sget_key = fud->fc; + sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super); + err = PTR_ERR_OR_ZERO(sb); + if (!IS_ERR(sb)) + fsc->root = dget(sb->s_root); + } else { + err = get_tree_nodev(fsc, fuse_fill_super); + } +out: + if (fsc->s_fs_info) + fuse_mount_destroy(fm); + if (ctx->file) + fput(ctx->file); + return err; +} + +static const struct fs_context_operations fuse_context_ops = { + .free = fuse_free_fsc, + .parse_param = fuse_parse_param, + .reconfigure = fuse_reconfigure, + .get_tree = fuse_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +static int fuse_init_fs_context(struct fs_context *fsc) +{ + struct fuse_fs_context *ctx; + + ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->max_read = ~0; + ctx->blksize = FUSE_DEFAULT_BLKSIZE; + ctx->legacy_opts_show = true; + +#ifdef CONFIG_BLOCK + if (fsc->fs_type == &fuseblk_fs_type) { + ctx->is_bdev = true; + ctx->destroy = true; + } +#endif + + fsc->fs_private = ctx; + fsc->ops = &fuse_context_ops; + return 0; +} + +bool fuse_mount_remove(struct fuse_mount *fm) +{ + struct fuse_conn *fc = fm->fc; + bool last = false; + + down_write(&fc->killsb); + list_del_init(&fm->fc_entry); + if (list_empty(&fc->mounts)) + last = true; + up_write(&fc->killsb); + + return last; +} +EXPORT_SYMBOL_GPL(fuse_mount_remove); + +void fuse_conn_destroy(struct fuse_mount *fm) +{ + struct fuse_conn *fc = fm->fc; + + if (fc->destroy) + fuse_send_destroy(fm); + + fuse_abort_conn(fc); + fuse_wait_aborted(fc); + + if (!list_empty(&fc->entry)) { + mutex_lock(&fuse_mutex); + list_del(&fc->entry); + fuse_ctl_remove_conn(fc); + mutex_unlock(&fuse_mutex); + } +} +EXPORT_SYMBOL_GPL(fuse_conn_destroy); + +static void fuse_sb_destroy(struct super_block *sb) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + bool last; + + if (sb->s_root) { + last = fuse_mount_remove(fm); + if (last) + fuse_conn_destroy(fm); + } +} + +void fuse_mount_destroy(struct fuse_mount *fm) +{ + fuse_conn_put(fm->fc); + kfree(fm); +} +EXPORT_SYMBOL(fuse_mount_destroy); + +static void fuse_kill_sb_anon(struct super_block *sb) +{ + fuse_sb_destroy(sb); + kill_anon_super(sb); + fuse_mount_destroy(get_fuse_mount_super(sb)); +} + +static struct file_system_type fuse_fs_type = { + .owner = THIS_MODULE, + .name = "fuse", + .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT, + .init_fs_context = fuse_init_fs_context, + .parameters = fuse_fs_parameters, + .kill_sb = fuse_kill_sb_anon, +}; +MODULE_ALIAS_FS("fuse"); + +#ifdef CONFIG_BLOCK +static void fuse_kill_sb_blk(struct super_block *sb) +{ + fuse_sb_destroy(sb); + kill_block_super(sb); + fuse_mount_destroy(get_fuse_mount_super(sb)); +} + +static struct file_system_type fuseblk_fs_type = { + .owner = THIS_MODULE, + .name = "fuseblk", + .init_fs_context = fuse_init_fs_context, + .parameters = fuse_fs_parameters, + .kill_sb = fuse_kill_sb_blk, + .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, +}; +MODULE_ALIAS_FS("fuseblk"); + +static inline int register_fuseblk(void) +{ + return register_filesystem(&fuseblk_fs_type); +} + +static inline void unregister_fuseblk(void) +{ + unregister_filesystem(&fuseblk_fs_type); +} +#else +static inline int register_fuseblk(void) +{ + return 0; +} + +static inline void unregister_fuseblk(void) +{ +} +#endif + +static void fuse_inode_init_once(void *foo) +{ + struct inode *inode = foo; + + inode_init_once(inode); +} + +static int __init fuse_fs_init(void) +{ + int err; + + fuse_inode_cachep = kmem_cache_create("fuse_inode", + sizeof(struct fuse_inode), 0, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT, + fuse_inode_init_once); + err = -ENOMEM; + if (!fuse_inode_cachep) + goto out; + + err = register_fuseblk(); + if (err) + goto out2; + + err = register_filesystem(&fuse_fs_type); + if (err) + goto out3; + + return 0; + + out3: + unregister_fuseblk(); + out2: + kmem_cache_destroy(fuse_inode_cachep); + out: + return err; +} + +static void fuse_fs_cleanup(void) +{ + unregister_filesystem(&fuse_fs_type); + unregister_fuseblk(); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(fuse_inode_cachep); +} + +static struct kobject *fuse_kobj; + +static int fuse_sysfs_init(void) +{ + int err; + + fuse_kobj = kobject_create_and_add("fuse", fs_kobj); + if (!fuse_kobj) { + err = -ENOMEM; + goto out_err; + } + + err = sysfs_create_mount_point(fuse_kobj, "connections"); + if (err) + goto out_fuse_unregister; + + return 0; + + out_fuse_unregister: + kobject_put(fuse_kobj); + out_err: + return err; +} + +static void fuse_sysfs_cleanup(void) +{ + sysfs_remove_mount_point(fuse_kobj, "connections"); + kobject_put(fuse_kobj); +} + +static int __init fuse_init(void) +{ + int res; + + pr_info("init (API version %i.%i)\n", + FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); + + INIT_LIST_HEAD(&fuse_conn_list); + res = fuse_fs_init(); + if (res) + goto err; + + res = fuse_dev_init(); + if (res) + goto err_fs_cleanup; + + res = fuse_sysfs_init(); + if (res) + goto err_dev_cleanup; + + res = fuse_ctl_init(); + if (res) + goto err_sysfs_cleanup; + + sanitize_global_limit(&max_user_bgreq); + sanitize_global_limit(&max_user_congthresh); + + return 0; + + err_sysfs_cleanup: + fuse_sysfs_cleanup(); + err_dev_cleanup: + fuse_dev_cleanup(); + err_fs_cleanup: + fuse_fs_cleanup(); + err: + return res; +} + +static void __exit fuse_exit(void) +{ + pr_debug("exit\n"); + + fuse_ctl_cleanup(); + fuse_sysfs_cleanup(); + fuse_fs_cleanup(); + fuse_dev_cleanup(); +} + +module_init(fuse_init); +module_exit(fuse_exit); diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c new file mode 100644 index 000000000..5a6c715b9 --- /dev/null +++ b/fs/fuse/ioctl.c @@ -0,0 +1,515 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 Red Hat, Inc. + */ + +#include "fuse_i.h" + +#include +#include +#include + +static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args, + struct fuse_ioctl_out *outarg) +{ + ssize_t ret; + + args->out_args[0].size = sizeof(*outarg); + args->out_args[0].value = outarg; + + ret = fuse_simple_request(fm, args); + + /* Translate ENOSYS, which shouldn't be returned from fs */ + if (ret == -ENOSYS) + ret = -ENOTTY; + + if (ret >= 0 && outarg->result == -ENOSYS) + outarg->result = -ENOTTY; + + return ret; +} + +/* + * CUSE servers compiled on 32bit broke on 64bit kernels because the + * ABI was defined to be 'struct iovec' which is different on 32bit + * and 64bit. Fortunately we can determine which structure the server + * used from the size of the reply. + */ +static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, + size_t transferred, unsigned count, + bool is_compat) +{ +#ifdef CONFIG_COMPAT + if (count * sizeof(struct compat_iovec) == transferred) { + struct compat_iovec *ciov = src; + unsigned i; + + /* + * With this interface a 32bit server cannot support + * non-compat (i.e. ones coming from 64bit apps) ioctl + * requests + */ + if (!is_compat) + return -EINVAL; + + for (i = 0; i < count; i++) { + dst[i].iov_base = compat_ptr(ciov[i].iov_base); + dst[i].iov_len = ciov[i].iov_len; + } + return 0; + } +#endif + + if (count * sizeof(struct iovec) != transferred) + return -EIO; + + memcpy(dst, src, transferred); + return 0; +} + +/* Make sure iov_length() won't overflow */ +static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov, + size_t count) +{ + size_t n; + u32 max = fc->max_pages << PAGE_SHIFT; + + for (n = 0; n < count; n++, iov++) { + if (iov->iov_len > (size_t) max) + return -ENOMEM; + max -= iov->iov_len; + } + return 0; +} + +static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, + void *src, size_t transferred, unsigned count, + bool is_compat) +{ + unsigned i; + struct fuse_ioctl_iovec *fiov = src; + + if (fc->minor < 16) { + return fuse_copy_ioctl_iovec_old(dst, src, transferred, + count, is_compat); + } + + if (count * sizeof(struct fuse_ioctl_iovec) != transferred) + return -EIO; + + for (i = 0; i < count; i++) { + /* Did the server supply an inappropriate value? */ + if (fiov[i].base != (unsigned long) fiov[i].base || + fiov[i].len != (unsigned long) fiov[i].len) + return -EIO; + + dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base; + dst[i].iov_len = (size_t) fiov[i].len; + +#ifdef CONFIG_COMPAT + if (is_compat && + (ptr_to_compat(dst[i].iov_base) != fiov[i].base || + (compat_size_t) dst[i].iov_len != fiov[i].len)) + return -EIO; +#endif + } + + return 0; +} + + +/* + * For ioctls, there is no generic way to determine how much memory + * needs to be read and/or written. Furthermore, ioctls are allowed + * to dereference the passed pointer, so the parameter requires deep + * copying but FUSE has no idea whatsoever about what to copy in or + * out. + * + * This is solved by allowing FUSE server to retry ioctl with + * necessary in/out iovecs. Let's assume the ioctl implementation + * needs to read in the following structure. + * + * struct a { + * char *buf; + * size_t buflen; + * } + * + * On the first callout to FUSE server, inarg->in_size and + * inarg->out_size will be NULL; then, the server completes the ioctl + * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and + * the actual iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } } + * + * which tells FUSE to copy in the requested area and retry the ioctl. + * On the second round, the server has access to the structure and + * from that it can tell what to look for next, so on the invocation, + * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) }, + * { .iov_base = a.buf, .iov_len = a.buflen } } + * + * FUSE will copy both struct a and the pointed buffer from the + * process doing the ioctl and retry ioctl with both struct a and the + * buffer. + * + * This time, FUSE server has everything it needs and completes ioctl + * without FUSE_IOCTL_RETRY which finishes the ioctl call. + * + * Copying data out works the same way. + * + * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel + * automatically initializes in and out iovs by decoding @cmd with + * _IOC_* macros and the server is not allowed to request RETRY. This + * limits ioctl data transfers to well-formed ioctls and is the forced + * behavior for all FUSE servers. + */ +long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, + unsigned int flags) +{ + struct fuse_file *ff = file->private_data; + struct fuse_mount *fm = ff->fm; + struct fuse_ioctl_in inarg = { + .fh = ff->fh, + .cmd = cmd, + .arg = arg, + .flags = flags + }; + struct fuse_ioctl_out outarg; + struct iovec *iov_page = NULL; + struct iovec *in_iov = NULL, *out_iov = NULL; + unsigned int in_iovs = 0, out_iovs = 0, max_pages; + size_t in_size, out_size, c; + ssize_t transferred; + int err, i; + struct iov_iter ii; + struct fuse_args_pages ap = {}; + +#if BITS_PER_LONG == 32 + inarg.flags |= FUSE_IOCTL_32BIT; +#else + if (flags & FUSE_IOCTL_COMPAT) { + inarg.flags |= FUSE_IOCTL_32BIT; +#ifdef CONFIG_X86_X32_ABI + if (in_x32_syscall()) + inarg.flags |= FUSE_IOCTL_COMPAT_X32; +#endif + } +#endif + + /* assume all the iovs returned by client always fits in a page */ + BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); + + err = -ENOMEM; + ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs); + iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); + if (!ap.pages || !iov_page) + goto out; + + fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages); + + /* + * If restricted, initialize IO parameters as encoded in @cmd. + * RETRY from server is not allowed. + */ + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { + struct iovec *iov = iov_page; + + iov->iov_base = (void __user *)arg; + iov->iov_len = _IOC_SIZE(cmd); + + if (_IOC_DIR(cmd) & _IOC_WRITE) { + in_iov = iov; + in_iovs = 1; + } + + if (_IOC_DIR(cmd) & _IOC_READ) { + out_iov = iov; + out_iovs = 1; + } + } + + retry: + inarg.in_size = in_size = iov_length(in_iov, in_iovs); + inarg.out_size = out_size = iov_length(out_iov, out_iovs); + + /* + * Out data can be used either for actual out data or iovs, + * make sure there always is at least one page. + */ + out_size = max_t(size_t, out_size, PAGE_SIZE); + max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE); + + /* make sure there are enough buffer pages and init request with them */ + err = -ENOMEM; + if (max_pages > fm->fc->max_pages) + goto out; + while (ap.num_pages < max_pages) { + ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!ap.pages[ap.num_pages]) + goto out; + ap.num_pages++; + } + + + /* okay, let's send it to the client */ + ap.args.opcode = FUSE_IOCTL; + ap.args.nodeid = ff->nodeid; + ap.args.in_numargs = 1; + ap.args.in_args[0].size = sizeof(inarg); + ap.args.in_args[0].value = &inarg; + if (in_size) { + ap.args.in_numargs++; + ap.args.in_args[1].size = in_size; + ap.args.in_pages = true; + + err = -EFAULT; + iov_iter_init(&ii, ITER_SOURCE, in_iov, in_iovs, in_size); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { + c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); + if (c != PAGE_SIZE && iov_iter_count(&ii)) + goto out; + } + } + + ap.args.out_numargs = 2; + ap.args.out_args[1].size = out_size; + ap.args.out_pages = true; + ap.args.out_argvar = true; + + transferred = fuse_send_ioctl(fm, &ap.args, &outarg); + err = transferred; + if (transferred < 0) + goto out; + + /* did it ask for retry? */ + if (outarg.flags & FUSE_IOCTL_RETRY) { + void *vaddr; + + /* no retry if in restricted mode */ + err = -EIO; + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) + goto out; + + in_iovs = outarg.in_iovs; + out_iovs = outarg.out_iovs; + + /* + * Make sure things are in boundary, separate checks + * are to protect against overflow. + */ + err = -ENOMEM; + if (in_iovs > FUSE_IOCTL_MAX_IOV || + out_iovs > FUSE_IOCTL_MAX_IOV || + in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) + goto out; + + vaddr = kmap_local_page(ap.pages[0]); + err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr, + transferred, in_iovs + out_iovs, + (flags & FUSE_IOCTL_COMPAT) != 0); + kunmap_local(vaddr); + if (err) + goto out; + + in_iov = iov_page; + out_iov = in_iov + in_iovs; + + err = fuse_verify_ioctl_iov(fm->fc, in_iov, in_iovs); + if (err) + goto out; + + err = fuse_verify_ioctl_iov(fm->fc, out_iov, out_iovs); + if (err) + goto out; + + goto retry; + } + + err = -EIO; + if (transferred > inarg.out_size) + goto out; + + err = -EFAULT; + iov_iter_init(&ii, ITER_DEST, out_iov, out_iovs, transferred); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { + c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); + if (c != PAGE_SIZE && iov_iter_count(&ii)) + goto out; + } + err = 0; + out: + free_page((unsigned long) iov_page); + while (ap.num_pages) + __free_page(ap.pages[--ap.num_pages]); + kfree(ap.pages); + + return err ? err : outarg.result; +} +EXPORT_SYMBOL_GPL(fuse_do_ioctl); + +long fuse_ioctl_common(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fuse_allow_current_process(fc)) + return -EACCES; + + if (fuse_is_bad(inode)) + return -EIO; + + return fuse_do_ioctl(file, cmd, arg, flags); +} + +long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + return fuse_ioctl_common(file, cmd, arg, 0); +} + +long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); +} + +static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff, + unsigned int cmd, void *ptr, size_t size) +{ + struct fuse_mount *fm = ff->fm; + struct fuse_ioctl_in inarg; + struct fuse_ioctl_out outarg; + FUSE_ARGS(args); + int err; + + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + inarg.cmd = cmd; + +#if BITS_PER_LONG == 32 + inarg.flags |= FUSE_IOCTL_32BIT; +#endif + if (S_ISDIR(inode->i_mode)) + inarg.flags |= FUSE_IOCTL_DIR; + + if (_IOC_DIR(cmd) & _IOC_READ) + inarg.out_size = size; + if (_IOC_DIR(cmd) & _IOC_WRITE) + inarg.in_size = size; + + args.opcode = FUSE_IOCTL; + args.nodeid = ff->nodeid; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = inarg.in_size; + args.in_args[1].value = ptr; + args.out_numargs = 2; + args.out_args[1].size = inarg.out_size; + args.out_args[1].value = ptr; + + err = fuse_send_ioctl(fm, &args, &outarg); + if (!err) { + if (outarg.result < 0) + err = outarg.result; + else if (outarg.flags & FUSE_IOCTL_RETRY) + err = -EIO; + } + return err; +} + +static struct fuse_file *fuse_priv_ioctl_prepare(struct inode *inode) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + bool isdir = S_ISDIR(inode->i_mode); + + if (!fuse_allow_current_process(fm->fc)) + return ERR_PTR(-EACCES); + + if (fuse_is_bad(inode)) + return ERR_PTR(-EIO); + + if (!S_ISREG(inode->i_mode) && !isdir) + return ERR_PTR(-ENOTTY); + + return fuse_file_open(fm, get_node_id(inode), O_RDONLY, isdir); +} + +static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff) +{ + fuse_file_release(inode, ff, O_RDONLY, NULL, S_ISDIR(inode->i_mode)); +} + +int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct fuse_file *ff; + unsigned int flags; + struct fsxattr xfa; + int err; + + ff = fuse_priv_ioctl_prepare(inode); + if (IS_ERR(ff)) + return PTR_ERR(ff); + + if (fa->flags_valid) { + err = fuse_priv_ioctl(inode, ff, FS_IOC_GETFLAGS, + &flags, sizeof(flags)); + if (err) + goto cleanup; + + fileattr_fill_flags(fa, flags); + } else { + err = fuse_priv_ioctl(inode, ff, FS_IOC_FSGETXATTR, + &xfa, sizeof(xfa)); + if (err) + goto cleanup; + + fileattr_fill_xflags(fa, xfa.fsx_xflags); + fa->fsx_extsize = xfa.fsx_extsize; + fa->fsx_nextents = xfa.fsx_nextents; + fa->fsx_projid = xfa.fsx_projid; + fa->fsx_cowextsize = xfa.fsx_cowextsize; + } +cleanup: + fuse_priv_ioctl_cleanup(inode, ff); + + return err; +} + +int fuse_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct fuse_file *ff; + unsigned int flags = fa->flags; + struct fsxattr xfa; + int err; + + ff = fuse_priv_ioctl_prepare(inode); + if (IS_ERR(ff)) + return PTR_ERR(ff); + + if (fa->flags_valid) { + err = fuse_priv_ioctl(inode, ff, FS_IOC_SETFLAGS, + &flags, sizeof(flags)); + if (err) + goto cleanup; + } else { + memset(&xfa, 0, sizeof(xfa)); + xfa.fsx_xflags = fa->fsx_xflags; + xfa.fsx_extsize = fa->fsx_extsize; + xfa.fsx_nextents = fa->fsx_nextents; + xfa.fsx_projid = fa->fsx_projid; + xfa.fsx_cowextsize = fa->fsx_cowextsize; + + err = fuse_priv_ioctl(inode, ff, FS_IOC_FSSETXATTR, + &xfa, sizeof(xfa)); + } + +cleanup: + fuse_priv_ioctl_cleanup(inode, ff); + + return err; +} diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c new file mode 100644 index 000000000..f40bc51fa --- /dev/null +++ b/fs/fuse/readdir.c @@ -0,0 +1,604 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2018 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + + +#include "fuse_i.h" +#include +#include +#include +#include + +static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_inode *fi = get_fuse_inode(dir); + + if (!fc->do_readdirplus) + return false; + if (!fc->readdirplus_auto) + return true; + if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) + return true; + if (ctx->pos == 0) + return true; + return false; +} + +static void fuse_add_dirent_to_cache(struct file *file, + struct fuse_dirent *dirent, loff_t pos) +{ + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); + size_t reclen = FUSE_DIRENT_SIZE(dirent); + pgoff_t index; + struct page *page; + loff_t size; + u64 version; + unsigned int offset; + void *addr; + + spin_lock(&fi->rdc.lock); + /* + * Is cache already completed? Or this entry does not go at the end of + * cache? + */ + if (fi->rdc.cached || pos != fi->rdc.pos) { + spin_unlock(&fi->rdc.lock); + return; + } + version = fi->rdc.version; + size = fi->rdc.size; + offset = size & ~PAGE_MASK; + index = size >> PAGE_SHIFT; + /* Dirent doesn't fit in current page? Jump to next page. */ + if (offset + reclen > PAGE_SIZE) { + index++; + offset = 0; + } + spin_unlock(&fi->rdc.lock); + + if (offset) { + page = find_lock_page(file->f_mapping, index); + } else { + page = find_or_create_page(file->f_mapping, index, + mapping_gfp_mask(file->f_mapping)); + } + if (!page) + return; + + spin_lock(&fi->rdc.lock); + /* Raced with another readdir */ + if (fi->rdc.version != version || fi->rdc.size != size || + WARN_ON(fi->rdc.pos != pos)) + goto unlock; + + addr = kmap_local_page(page); + if (!offset) { + clear_page(addr); + SetPageUptodate(page); + } + memcpy(addr + offset, dirent, reclen); + kunmap_local(addr); + fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen; + fi->rdc.pos = dirent->off; +unlock: + spin_unlock(&fi->rdc.lock); + unlock_page(page); + put_page(page); +} + +static void fuse_readdir_cache_end(struct file *file, loff_t pos) +{ + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); + loff_t end; + + spin_lock(&fi->rdc.lock); + /* does cache end position match current position? */ + if (fi->rdc.pos != pos) { + spin_unlock(&fi->rdc.lock); + return; + } + + fi->rdc.cached = true; + end = ALIGN(fi->rdc.size, PAGE_SIZE); + spin_unlock(&fi->rdc.lock); + + /* truncate unused tail of cache */ + truncate_inode_pages(file->f_mapping, end); +} + +static bool fuse_emit(struct file *file, struct dir_context *ctx, + struct fuse_dirent *dirent) +{ + struct fuse_file *ff = file->private_data; + + if (ff->open_flags & FOPEN_CACHE_DIR) + fuse_add_dirent_to_cache(file, dirent, ctx->pos); + + return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, + dirent->type); +} + +static int parse_dirfile(char *buf, size_t nbytes, struct file *file, + struct dir_context *ctx) +{ + while (nbytes >= FUSE_NAME_OFFSET) { + struct fuse_dirent *dirent = (struct fuse_dirent *) buf; + size_t reclen = FUSE_DIRENT_SIZE(dirent); + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + if (memchr(dirent->name, '/', dirent->namelen) != NULL) + return -EIO; + + if (!fuse_emit(file, ctx, dirent)) + break; + + buf += reclen; + nbytes -= reclen; + ctx->pos = dirent->off; + } + + return 0; +} + +static int fuse_direntplus_link(struct file *file, + struct fuse_direntplus *direntplus, + u64 attr_version) +{ + struct fuse_entry_out *o = &direntplus->entry_out; + struct fuse_dirent *dirent = &direntplus->dirent; + struct dentry *parent = file->f_path.dentry; + struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); + struct dentry *dentry; + struct dentry *alias; + struct inode *dir = d_inode(parent); + struct fuse_conn *fc; + struct inode *inode; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + + if (!o->nodeid) { + /* + * Unlike in the case of fuse_lookup, zero nodeid does not mean + * ENOENT. Instead, it only means the userspace filesystem did + * not want to return attributes/handle for this entry. + * + * So do nothing. + */ + return 0; + } + + if (name.name[0] == '.') { + /* + * We could potentially refresh the attributes of the directory + * and its parent? + */ + if (name.len == 1) + return 0; + if (name.name[1] == '.' && name.len == 2) + return 0; + } + + if (invalid_nodeid(o->nodeid)) + return -EIO; + if (fuse_invalid_attr(&o->attr)) + return -EIO; + + fc = get_fuse_conn(dir); + + name.hash = full_name_hash(parent, name.name, name.len); + dentry = d_lookup(parent, &name); + if (!dentry) { +retry: + dentry = d_alloc_parallel(parent, &name, &wq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + } + if (!d_in_lookup(dentry)) { + struct fuse_inode *fi; + inode = d_inode(dentry); + if (inode && get_node_id(inode) != o->nodeid) + inode = NULL; + if (!inode || + fuse_stale_inode(inode, o->generation, &o->attr)) { + if (inode) + fuse_make_bad(inode); + d_invalidate(dentry); + dput(dentry); + goto retry; + } + if (fuse_is_bad(inode)) { + dput(dentry); + return -EIO; + } + + fi = get_fuse_inode(inode); + spin_lock(&fi->lock); + fi->nlookup++; + spin_unlock(&fi->lock); + + forget_all_cached_acls(inode); + fuse_change_attributes(inode, &o->attr, + entry_attr_timeout(o), + attr_version); + /* + * The other branch comes via fuse_iget() + * which bumps nlookup inside + */ + } else { + inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, + &o->attr, entry_attr_timeout(o), + attr_version); + if (!inode) + inode = ERR_PTR(-ENOMEM); + + alias = d_splice_alias(inode, dentry); + d_lookup_done(dentry); + if (alias) { + dput(dentry); + dentry = alias; + } + if (IS_ERR(dentry)) { + if (!IS_ERR(inode)) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->nlookup--; + spin_unlock(&fi->lock); + } + return PTR_ERR(dentry); + } + } + if (fc->readdirplus_auto) + set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); + fuse_change_entry_timeout(dentry, o); + + dput(dentry); + return 0; +} + +static void fuse_force_forget(struct file *file, u64 nodeid) +{ + struct inode *inode = file_inode(file); + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_forget_in inarg; + FUSE_ARGS(args); + + memset(&inarg, 0, sizeof(inarg)); + inarg.nlookup = 1; + args.opcode = FUSE_FORGET; + args.nodeid = nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.force = true; + args.noreply = true; + + fuse_simple_request(fm, &args); + /* ignore errors */ +} + +static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, + struct dir_context *ctx, u64 attr_version) +{ + struct fuse_direntplus *direntplus; + struct fuse_dirent *dirent; + size_t reclen; + int over = 0; + int ret; + + while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { + direntplus = (struct fuse_direntplus *) buf; + dirent = &direntplus->dirent; + reclen = FUSE_DIRENTPLUS_SIZE(direntplus); + + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + if (memchr(dirent->name, '/', dirent->namelen) != NULL) + return -EIO; + + if (!over) { + /* We fill entries into dstbuf only as much as + it can hold. But we still continue iterating + over remaining entries to link them. If not, + we need to send a FORGET for each of those + which we did not link. + */ + over = !fuse_emit(file, ctx, dirent); + if (!over) + ctx->pos = dirent->off; + } + + buf += reclen; + nbytes -= reclen; + + ret = fuse_direntplus_link(file, direntplus, attr_version); + if (ret) + fuse_force_forget(file, direntplus->entry_out.nodeid); + } + + return 0; +} + +static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) +{ + int plus; + ssize_t res; + struct page *page; + struct inode *inode = file_inode(file); + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_io_args ia = {}; + struct fuse_args_pages *ap = &ia.ap; + struct fuse_page_desc desc = { .length = PAGE_SIZE }; + u64 attr_version = 0; + bool locked; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + plus = fuse_use_readdirplus(inode, ctx); + ap->args.out_pages = true; + ap->num_pages = 1; + ap->pages = &page; + ap->descs = &desc; + if (plus) { + attr_version = fuse_get_attr_version(fm->fc); + fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, + FUSE_READDIRPLUS); + } else { + fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, + FUSE_READDIR); + } + locked = fuse_lock_inode(inode); + res = fuse_simple_request(fm, &ap->args); + fuse_unlock_inode(inode, locked); + if (res >= 0) { + if (!res) { + struct fuse_file *ff = file->private_data; + + if (ff->open_flags & FOPEN_CACHE_DIR) + fuse_readdir_cache_end(file, ctx->pos); + } else if (plus) { + res = parse_dirplusfile(page_address(page), res, + file, ctx, attr_version); + } else { + res = parse_dirfile(page_address(page), res, file, + ctx); + } + } + + __free_page(page); + fuse_invalidate_atime(inode); + return res; +} + +enum fuse_parse_result { + FOUND_ERR = -1, + FOUND_NONE = 0, + FOUND_SOME, + FOUND_ALL, +}; + +static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff, + void *addr, unsigned int size, + struct dir_context *ctx) +{ + unsigned int offset = ff->readdir.cache_off & ~PAGE_MASK; + enum fuse_parse_result res = FOUND_NONE; + + WARN_ON(offset >= size); + + for (;;) { + struct fuse_dirent *dirent = addr + offset; + unsigned int nbytes = size - offset; + size_t reclen; + + if (nbytes < FUSE_NAME_OFFSET || !dirent->namelen) + break; + + reclen = FUSE_DIRENT_SIZE(dirent); /* derefs ->namelen */ + + if (WARN_ON(dirent->namelen > FUSE_NAME_MAX)) + return FOUND_ERR; + if (WARN_ON(reclen > nbytes)) + return FOUND_ERR; + if (WARN_ON(memchr(dirent->name, '/', dirent->namelen) != NULL)) + return FOUND_ERR; + + if (ff->readdir.pos == ctx->pos) { + res = FOUND_SOME; + if (!dir_emit(ctx, dirent->name, dirent->namelen, + dirent->ino, dirent->type)) + return FOUND_ALL; + ctx->pos = dirent->off; + } + ff->readdir.pos = dirent->off; + ff->readdir.cache_off += reclen; + + offset += reclen; + } + + return res; +} + +static void fuse_rdc_reset(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + fi->rdc.cached = false; + fi->rdc.version++; + fi->rdc.size = 0; + fi->rdc.pos = 0; +} + +#define UNCACHED 1 + +static int fuse_readdir_cached(struct file *file, struct dir_context *ctx) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + enum fuse_parse_result res; + pgoff_t index; + unsigned int size; + struct page *page; + void *addr; + + /* Seeked? If so, reset the cache stream */ + if (ff->readdir.pos != ctx->pos) { + ff->readdir.pos = 0; + ff->readdir.cache_off = 0; + } + + /* + * We're just about to start reading into the cache or reading the + * cache; both cases require an up-to-date mtime value. + */ + if (!ctx->pos && fc->auto_inval_data) { + int err = fuse_update_attributes(inode, file, STATX_MTIME); + + if (err) + return err; + } + +retry: + spin_lock(&fi->rdc.lock); +retry_locked: + if (!fi->rdc.cached) { + /* Starting cache? Set cache mtime. */ + if (!ctx->pos && !fi->rdc.size) { + fi->rdc.mtime = inode->i_mtime; + fi->rdc.iversion = inode_query_iversion(inode); + } + spin_unlock(&fi->rdc.lock); + return UNCACHED; + } + /* + * When at the beginning of the directory (i.e. just after opendir(3) or + * rewinddir(3)), then need to check whether directory contents have + * changed, and reset the cache if so. + */ + if (!ctx->pos) { + if (inode_peek_iversion(inode) != fi->rdc.iversion || + !timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) { + fuse_rdc_reset(inode); + goto retry_locked; + } + } + + /* + * If cache version changed since the last getdents() call, then reset + * the cache stream. + */ + if (ff->readdir.version != fi->rdc.version) { + ff->readdir.pos = 0; + ff->readdir.cache_off = 0; + } + /* + * If at the beginning of the cache, than reset version to + * current. + */ + if (ff->readdir.pos == 0) + ff->readdir.version = fi->rdc.version; + + WARN_ON(fi->rdc.size < ff->readdir.cache_off); + + index = ff->readdir.cache_off >> PAGE_SHIFT; + + if (index == (fi->rdc.size >> PAGE_SHIFT)) + size = fi->rdc.size & ~PAGE_MASK; + else + size = PAGE_SIZE; + spin_unlock(&fi->rdc.lock); + + /* EOF? */ + if ((ff->readdir.cache_off & ~PAGE_MASK) == size) + return 0; + + page = find_get_page_flags(file->f_mapping, index, + FGP_ACCESSED | FGP_LOCK); + /* Page gone missing, then re-added to cache, but not initialized? */ + if (page && !PageUptodate(page)) { + unlock_page(page); + put_page(page); + page = NULL; + } + spin_lock(&fi->rdc.lock); + if (!page) { + /* + * Uh-oh: page gone missing, cache is useless + */ + if (fi->rdc.version == ff->readdir.version) + fuse_rdc_reset(inode); + goto retry_locked; + } + + /* Make sure it's still the same version after getting the page. */ + if (ff->readdir.version != fi->rdc.version) { + spin_unlock(&fi->rdc.lock); + unlock_page(page); + put_page(page); + goto retry; + } + spin_unlock(&fi->rdc.lock); + + /* + * Contents of the page are now protected against changing by holding + * the page lock. + */ + addr = kmap(page); + res = fuse_parse_cache(ff, addr, size, ctx); + kunmap(page); + unlock_page(page); + put_page(page); + + if (res == FOUND_ERR) + return -EIO; + + if (res == FOUND_ALL) + return 0; + + if (size == PAGE_SIZE) { + /* We hit end of page: skip to next page. */ + ff->readdir.cache_off = ALIGN(ff->readdir.cache_off, PAGE_SIZE); + goto retry; + } + + /* + * End of cache reached. If found position, then we are done, otherwise + * need to fall back to uncached, since the position we were looking for + * wasn't in the cache. + */ + return res == FOUND_SOME ? 0 : UNCACHED; +} + +int fuse_readdir(struct file *file, struct dir_context *ctx) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + int err; + + if (fuse_is_bad(inode)) + return -EIO; + + mutex_lock(&ff->readdir.lock); + + err = UNCACHED; + if (ff->open_flags & FOPEN_CACHE_DIR) + err = fuse_readdir_cached(file, ctx); + if (err == UNCACHED) + err = fuse_readdir_uncached(file, ctx); + + mutex_unlock(&ff->readdir.lock); + + return err; +} diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c new file mode 100644 index 000000000..4d8d4f16c --- /dev/null +++ b/fs/fuse/virtio_fs.c @@ -0,0 +1,1541 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * virtio-fs: Virtio Filesystem + * Copyright (C) 2018 Red Hat, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fuse_i.h" + +/* Used to help calculate the FUSE connection's max_pages limit for a request's + * size. Parts of the struct fuse_req are sliced into scattergather lists in + * addition to the pages used, so this can help account for that overhead. + */ +#define FUSE_HEADER_OVERHEAD 4 + +/* List of virtio-fs device instances and a lock for the list. Also provides + * mutual exclusion in device removal and mounting path + */ +static DEFINE_MUTEX(virtio_fs_mutex); +static LIST_HEAD(virtio_fs_instances); + +enum { + VQ_HIPRIO, + VQ_REQUEST +}; + +#define VQ_NAME_LEN 24 + +/* Per-virtqueue state */ +struct virtio_fs_vq { + spinlock_t lock; + struct virtqueue *vq; /* protected by ->lock */ + struct work_struct done_work; + struct list_head queued_reqs; + struct list_head end_reqs; /* End these requests */ + struct delayed_work dispatch_work; + struct fuse_dev *fud; + bool connected; + long in_flight; + struct completion in_flight_zero; /* No inflight requests */ + char name[VQ_NAME_LEN]; +} ____cacheline_aligned_in_smp; + +/* A virtio-fs device instance */ +struct virtio_fs { + struct kref refcount; + struct list_head list; /* on virtio_fs_instances */ + char *tag; + struct virtio_fs_vq *vqs; + unsigned int nvqs; /* number of virtqueues */ + unsigned int num_request_queues; /* number of request queues */ + struct dax_device *dax_dev; + + /* DAX memory window where file contents are mapped */ + void *window_kaddr; + phys_addr_t window_phys_addr; + size_t window_len; +}; + +struct virtio_fs_forget_req { + struct fuse_in_header ih; + struct fuse_forget_in arg; +}; + +struct virtio_fs_forget { + /* This request can be temporarily queued on virt queue */ + struct list_head list; + struct virtio_fs_forget_req req; +}; + +struct virtio_fs_req_work { + struct fuse_req *req; + struct virtio_fs_vq *fsvq; + struct work_struct done_work; +}; + +static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, + struct fuse_req *req, bool in_flight); + +static const struct constant_table dax_param_enums[] = { + {"always", FUSE_DAX_ALWAYS }, + {"never", FUSE_DAX_NEVER }, + {"inode", FUSE_DAX_INODE_USER }, + {} +}; + +enum { + OPT_DAX, + OPT_DAX_ENUM, +}; + +static const struct fs_parameter_spec virtio_fs_parameters[] = { + fsparam_flag("dax", OPT_DAX), + fsparam_enum("dax", OPT_DAX_ENUM, dax_param_enums), + {} +}; + +static int virtio_fs_parse_param(struct fs_context *fsc, + struct fs_parameter *param) +{ + struct fs_parse_result result; + struct fuse_fs_context *ctx = fsc->fs_private; + int opt; + + opt = fs_parse(fsc, virtio_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case OPT_DAX: + ctx->dax_mode = FUSE_DAX_ALWAYS; + break; + case OPT_DAX_ENUM: + ctx->dax_mode = result.uint_32; + break; + default: + return -EINVAL; + } + + return 0; +} + +static void virtio_fs_free_fsc(struct fs_context *fsc) +{ + struct fuse_fs_context *ctx = fsc->fs_private; + + kfree(ctx); +} + +static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq) +{ + struct virtio_fs *fs = vq->vdev->priv; + + return &fs->vqs[vq->index]; +} + +/* Should be called with fsvq->lock held. */ +static inline void inc_in_flight_req(struct virtio_fs_vq *fsvq) +{ + fsvq->in_flight++; +} + +/* Should be called with fsvq->lock held. */ +static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq) +{ + WARN_ON(fsvq->in_flight <= 0); + fsvq->in_flight--; + if (!fsvq->in_flight) + complete(&fsvq->in_flight_zero); +} + +static void release_virtio_fs_obj(struct kref *ref) +{ + struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount); + + kfree(vfs->vqs); + kfree(vfs); +} + +/* Make sure virtiofs_mutex is held */ +static void virtio_fs_put(struct virtio_fs *fs) +{ + kref_put(&fs->refcount, release_virtio_fs_obj); +} + +static void virtio_fs_fiq_release(struct fuse_iqueue *fiq) +{ + struct virtio_fs *vfs = fiq->priv; + + mutex_lock(&virtio_fs_mutex); + virtio_fs_put(vfs); + mutex_unlock(&virtio_fs_mutex); +} + +static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) +{ + WARN_ON(fsvq->in_flight < 0); + + /* Wait for in flight requests to finish.*/ + spin_lock(&fsvq->lock); + if (fsvq->in_flight) { + /* We are holding virtio_fs_mutex. There should not be any + * waiters waiting for completion. + */ + reinit_completion(&fsvq->in_flight_zero); + spin_unlock(&fsvq->lock); + wait_for_completion(&fsvq->in_flight_zero); + } else { + spin_unlock(&fsvq->lock); + } + + flush_work(&fsvq->done_work); + flush_delayed_work(&fsvq->dispatch_work); +} + +static void virtio_fs_drain_all_queues_locked(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + int i; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + virtio_fs_drain_queue(fsvq); + } +} + +static void virtio_fs_drain_all_queues(struct virtio_fs *fs) +{ + /* Provides mutual exclusion between ->remove and ->kill_sb + * paths. We don't want both of these draining queue at the + * same time. Current completion logic reinits completion + * and that means there should not be any other thread + * doing reinit or waiting for completion already. + */ + mutex_lock(&virtio_fs_mutex); + virtio_fs_drain_all_queues_locked(fs); + mutex_unlock(&virtio_fs_mutex); +} + +static void virtio_fs_start_all_queues(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + int i; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + spin_lock(&fsvq->lock); + fsvq->connected = true; + spin_unlock(&fsvq->lock); + } +} + +/* Add a new instance to the list or return -EEXIST if tag name exists*/ +static int virtio_fs_add_instance(struct virtio_fs *fs) +{ + struct virtio_fs *fs2; + bool duplicate = false; + + mutex_lock(&virtio_fs_mutex); + + list_for_each_entry(fs2, &virtio_fs_instances, list) { + if (strcmp(fs->tag, fs2->tag) == 0) + duplicate = true; + } + + if (!duplicate) + list_add_tail(&fs->list, &virtio_fs_instances); + + mutex_unlock(&virtio_fs_mutex); + + if (duplicate) + return -EEXIST; + return 0; +} + +/* Return the virtio_fs with a given tag, or NULL */ +static struct virtio_fs *virtio_fs_find_instance(const char *tag) +{ + struct virtio_fs *fs; + + mutex_lock(&virtio_fs_mutex); + + list_for_each_entry(fs, &virtio_fs_instances, list) { + if (strcmp(fs->tag, tag) == 0) { + kref_get(&fs->refcount); + goto found; + } + } + + fs = NULL; /* not found */ + +found: + mutex_unlock(&virtio_fs_mutex); + + return fs; +} + +static void virtio_fs_free_devs(struct virtio_fs *fs) +{ + unsigned int i; + + for (i = 0; i < fs->nvqs; i++) { + struct virtio_fs_vq *fsvq = &fs->vqs[i]; + + if (!fsvq->fud) + continue; + + fuse_dev_free(fsvq->fud); + fsvq->fud = NULL; + } +} + +/* Read filesystem name from virtio config into fs->tag (must kfree()). */ +static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs) +{ + char tag_buf[sizeof_field(struct virtio_fs_config, tag)]; + char *end; + size_t len; + + virtio_cread_bytes(vdev, offsetof(struct virtio_fs_config, tag), + &tag_buf, sizeof(tag_buf)); + end = memchr(tag_buf, '\0', sizeof(tag_buf)); + if (end == tag_buf) + return -EINVAL; /* empty tag */ + if (!end) + end = &tag_buf[sizeof(tag_buf)]; + + len = end - tag_buf; + fs->tag = devm_kmalloc(&vdev->dev, len + 1, GFP_KERNEL); + if (!fs->tag) + return -ENOMEM; + memcpy(fs->tag, tag_buf, len); + fs->tag[len] = '\0'; + return 0; +} + +/* Work function for hiprio completion */ +static void virtio_fs_hiprio_done_work(struct work_struct *work) +{ + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, + done_work); + struct virtqueue *vq = fsvq->vq; + + /* Free completed FUSE_FORGET requests */ + spin_lock(&fsvq->lock); + do { + unsigned int len; + void *req; + + virtqueue_disable_cb(vq); + + while ((req = virtqueue_get_buf(vq, &len)) != NULL) { + kfree(req); + dec_in_flight_req(fsvq); + } + } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); + spin_unlock(&fsvq->lock); +} + +static void virtio_fs_request_dispatch_work(struct work_struct *work) +{ + struct fuse_req *req; + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, + dispatch_work.work); + int ret; + + pr_debug("virtio-fs: worker %s called.\n", __func__); + while (1) { + spin_lock(&fsvq->lock); + req = list_first_entry_or_null(&fsvq->end_reqs, struct fuse_req, + list); + if (!req) { + spin_unlock(&fsvq->lock); + break; + } + + list_del_init(&req->list); + spin_unlock(&fsvq->lock); + fuse_request_end(req); + } + + /* Dispatch pending requests */ + while (1) { + spin_lock(&fsvq->lock); + req = list_first_entry_or_null(&fsvq->queued_reqs, + struct fuse_req, list); + if (!req) { + spin_unlock(&fsvq->lock); + return; + } + list_del_init(&req->list); + spin_unlock(&fsvq->lock); + + ret = virtio_fs_enqueue_req(fsvq, req, true); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOSPC) { + spin_lock(&fsvq->lock); + list_add_tail(&req->list, &fsvq->queued_reqs); + schedule_delayed_work(&fsvq->dispatch_work, + msecs_to_jiffies(1)); + spin_unlock(&fsvq->lock); + return; + } + req->out.h.error = ret; + spin_lock(&fsvq->lock); + dec_in_flight_req(fsvq); + spin_unlock(&fsvq->lock); + pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", + ret); + fuse_request_end(req); + } + } +} + +/* + * Returns 1 if queue is full and sender should wait a bit before sending + * next request, 0 otherwise. + */ +static int send_forget_request(struct virtio_fs_vq *fsvq, + struct virtio_fs_forget *forget, + bool in_flight) +{ + struct scatterlist sg; + struct virtqueue *vq; + int ret = 0; + bool notify; + struct virtio_fs_forget_req *req = &forget->req; + + spin_lock(&fsvq->lock); + if (!fsvq->connected) { + if (in_flight) + dec_in_flight_req(fsvq); + kfree(forget); + goto out; + } + + sg_init_one(&sg, req, sizeof(*req)); + vq = fsvq->vq; + dev_dbg(&vq->vdev->dev, "%s\n", __func__); + + ret = virtqueue_add_outbuf(vq, &sg, 1, forget, GFP_ATOMIC); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOSPC) { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n", + ret); + list_add_tail(&forget->list, &fsvq->queued_reqs); + schedule_delayed_work(&fsvq->dispatch_work, + msecs_to_jiffies(1)); + if (!in_flight) + inc_in_flight_req(fsvq); + /* Queue is full */ + ret = 1; + } else { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n", + ret); + kfree(forget); + if (in_flight) + dec_in_flight_req(fsvq); + } + goto out; + } + + if (!in_flight) + inc_in_flight_req(fsvq); + notify = virtqueue_kick_prepare(vq); + spin_unlock(&fsvq->lock); + + if (notify) + virtqueue_notify(vq); + return ret; +out: + spin_unlock(&fsvq->lock); + return ret; +} + +static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) +{ + struct virtio_fs_forget *forget; + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, + dispatch_work.work); + pr_debug("virtio-fs: worker %s called.\n", __func__); + while (1) { + spin_lock(&fsvq->lock); + forget = list_first_entry_or_null(&fsvq->queued_reqs, + struct virtio_fs_forget, list); + if (!forget) { + spin_unlock(&fsvq->lock); + return; + } + + list_del(&forget->list); + spin_unlock(&fsvq->lock); + if (send_forget_request(fsvq, forget, true)) + return; + } +} + +/* Allocate and copy args into req->argbuf */ +static int copy_args_to_argbuf(struct fuse_req *req) +{ + struct fuse_args *args = req->args; + unsigned int offset = 0; + unsigned int num_in; + unsigned int num_out; + unsigned int len; + unsigned int i; + + num_in = args->in_numargs - args->in_pages; + num_out = args->out_numargs - args->out_pages; + len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) + + fuse_len_args(num_out, args->out_args); + + req->argbuf = kmalloc(len, GFP_ATOMIC); + if (!req->argbuf) + return -ENOMEM; + + for (i = 0; i < num_in; i++) { + memcpy(req->argbuf + offset, + args->in_args[i].value, + args->in_args[i].size); + offset += args->in_args[i].size; + } + + return 0; +} + +/* Copy args out of and free req->argbuf */ +static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req) +{ + unsigned int remaining; + unsigned int offset; + unsigned int num_in; + unsigned int num_out; + unsigned int i; + + remaining = req->out.h.len - sizeof(req->out.h); + num_in = args->in_numargs - args->in_pages; + num_out = args->out_numargs - args->out_pages; + offset = fuse_len_args(num_in, (struct fuse_arg *)args->in_args); + + for (i = 0; i < num_out; i++) { + unsigned int argsize = args->out_args[i].size; + + if (args->out_argvar && + i == args->out_numargs - 1 && + argsize > remaining) { + argsize = remaining; + } + + memcpy(args->out_args[i].value, req->argbuf + offset, argsize); + offset += argsize; + + if (i != args->out_numargs - 1) + remaining -= argsize; + } + + /* Store the actual size of the variable-length arg */ + if (args->out_argvar) + args->out_args[args->out_numargs - 1].size = remaining; + + kfree(req->argbuf); + req->argbuf = NULL; +} + +/* Work function for request completion */ +static void virtio_fs_request_complete(struct fuse_req *req, + struct virtio_fs_vq *fsvq) +{ + struct fuse_pqueue *fpq = &fsvq->fud->pq; + struct fuse_args *args; + struct fuse_args_pages *ap; + unsigned int len, i, thislen; + struct page *page; + + /* + * TODO verify that server properly follows FUSE protocol + * (oh.uniq, oh.len) + */ + args = req->args; + copy_args_from_argbuf(args, req); + + if (args->out_pages && args->page_zeroing) { + len = args->out_args[args->out_numargs - 1].size; + ap = container_of(args, typeof(*ap), args); + for (i = 0; i < ap->num_pages; i++) { + thislen = ap->descs[i].length; + if (len < thislen) { + WARN_ON(ap->descs[i].offset); + page = ap->pages[i]; + zero_user_segment(page, len, thislen); + len = 0; + } else { + len -= thislen; + } + } + } + + spin_lock(&fpq->lock); + clear_bit(FR_SENT, &req->flags); + spin_unlock(&fpq->lock); + + fuse_request_end(req); + spin_lock(&fsvq->lock); + dec_in_flight_req(fsvq); + spin_unlock(&fsvq->lock); +} + +static void virtio_fs_complete_req_work(struct work_struct *work) +{ + struct virtio_fs_req_work *w = + container_of(work, typeof(*w), done_work); + + virtio_fs_request_complete(w->req, w->fsvq); + kfree(w); +} + +static void virtio_fs_requests_done_work(struct work_struct *work) +{ + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, + done_work); + struct fuse_pqueue *fpq = &fsvq->fud->pq; + struct virtqueue *vq = fsvq->vq; + struct fuse_req *req; + struct fuse_req *next; + unsigned int len; + LIST_HEAD(reqs); + + /* Collect completed requests off the virtqueue */ + spin_lock(&fsvq->lock); + do { + virtqueue_disable_cb(vq); + + while ((req = virtqueue_get_buf(vq, &len)) != NULL) { + spin_lock(&fpq->lock); + list_move_tail(&req->list, &reqs); + spin_unlock(&fpq->lock); + } + } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); + spin_unlock(&fsvq->lock); + + /* End requests */ + list_for_each_entry_safe(req, next, &reqs, list) { + list_del_init(&req->list); + + /* blocking async request completes in a worker context */ + if (req->args->may_block) { + struct virtio_fs_req_work *w; + + w = kzalloc(sizeof(*w), GFP_NOFS | __GFP_NOFAIL); + INIT_WORK(&w->done_work, virtio_fs_complete_req_work); + w->fsvq = fsvq; + w->req = req; + schedule_work(&w->done_work); + } else { + virtio_fs_request_complete(req, fsvq); + } + } +} + +/* Virtqueue interrupt handler */ +static void virtio_fs_vq_done(struct virtqueue *vq) +{ + struct virtio_fs_vq *fsvq = vq_to_fsvq(vq); + + dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name); + + schedule_work(&fsvq->done_work); +} + +static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name, + int vq_type) +{ + strscpy(fsvq->name, name, VQ_NAME_LEN); + spin_lock_init(&fsvq->lock); + INIT_LIST_HEAD(&fsvq->queued_reqs); + INIT_LIST_HEAD(&fsvq->end_reqs); + init_completion(&fsvq->in_flight_zero); + + if (vq_type == VQ_REQUEST) { + INIT_WORK(&fsvq->done_work, virtio_fs_requests_done_work); + INIT_DELAYED_WORK(&fsvq->dispatch_work, + virtio_fs_request_dispatch_work); + } else { + INIT_WORK(&fsvq->done_work, virtio_fs_hiprio_done_work); + INIT_DELAYED_WORK(&fsvq->dispatch_work, + virtio_fs_hiprio_dispatch_work); + } +} + +/* Initialize virtqueues */ +static int virtio_fs_setup_vqs(struct virtio_device *vdev, + struct virtio_fs *fs) +{ + struct virtqueue **vqs; + vq_callback_t **callbacks; + const char **names; + unsigned int i; + int ret = 0; + + virtio_cread_le(vdev, struct virtio_fs_config, num_request_queues, + &fs->num_request_queues); + if (fs->num_request_queues == 0) + return -EINVAL; + + fs->nvqs = VQ_REQUEST + fs->num_request_queues; + fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL); + if (!fs->vqs) + return -ENOMEM; + + vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL); + callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]), + GFP_KERNEL); + names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL); + if (!vqs || !callbacks || !names) { + ret = -ENOMEM; + goto out; + } + + /* Initialize the hiprio/forget request virtqueue */ + callbacks[VQ_HIPRIO] = virtio_fs_vq_done; + virtio_fs_init_vq(&fs->vqs[VQ_HIPRIO], "hiprio", VQ_HIPRIO); + names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name; + + /* Initialize the requests virtqueues */ + for (i = VQ_REQUEST; i < fs->nvqs; i++) { + char vq_name[VQ_NAME_LEN]; + + snprintf(vq_name, VQ_NAME_LEN, "requests.%u", i - VQ_REQUEST); + virtio_fs_init_vq(&fs->vqs[i], vq_name, VQ_REQUEST); + callbacks[i] = virtio_fs_vq_done; + names[i] = fs->vqs[i].name; + } + + ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL); + if (ret < 0) + goto out; + + for (i = 0; i < fs->nvqs; i++) + fs->vqs[i].vq = vqs[i]; + + virtio_fs_start_all_queues(fs); +out: + kfree(names); + kfree(callbacks); + kfree(vqs); + if (ret) + kfree(fs->vqs); + return ret; +} + +/* Free virtqueues (device must already be reset) */ +static void virtio_fs_cleanup_vqs(struct virtio_device *vdev) +{ + vdev->config->del_vqs(vdev); +} + +/* Map a window offset to a page frame number. The window offset will have + * been produced by .iomap_begin(), which maps a file offset to a window + * offset. + */ +static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, + long nr_pages, enum dax_access_mode mode, + void **kaddr, pfn_t *pfn) +{ + struct virtio_fs *fs = dax_get_private(dax_dev); + phys_addr_t offset = PFN_PHYS(pgoff); + size_t max_nr_pages = fs->window_len / PAGE_SIZE - pgoff; + + if (kaddr) + *kaddr = fs->window_kaddr + offset; + if (pfn) + *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, + PFN_DEV | PFN_MAP); + return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; +} + +static int virtio_fs_zero_page_range(struct dax_device *dax_dev, + pgoff_t pgoff, size_t nr_pages) +{ + long rc; + void *kaddr; + + rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, &kaddr, + NULL); + if (rc < 0) + return rc; + memset(kaddr, 0, nr_pages << PAGE_SHIFT); + dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT); + return 0; +} + +static const struct dax_operations virtio_fs_dax_ops = { + .direct_access = virtio_fs_direct_access, + .zero_page_range = virtio_fs_zero_page_range, +}; + +static void virtio_fs_cleanup_dax(void *data) +{ + struct dax_device *dax_dev = data; + + kill_dax(dax_dev); + put_dax(dax_dev); +} + +static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) +{ + struct virtio_shm_region cache_reg; + struct dev_pagemap *pgmap; + bool have_cache; + + if (!IS_ENABLED(CONFIG_FUSE_DAX)) + return 0; + + /* Get cache region */ + have_cache = virtio_get_shm_region(vdev, &cache_reg, + (u8)VIRTIO_FS_SHMCAP_ID_CACHE); + if (!have_cache) { + dev_notice(&vdev->dev, "%s: No cache capability\n", __func__); + return 0; + } + + if (!devm_request_mem_region(&vdev->dev, cache_reg.addr, cache_reg.len, + dev_name(&vdev->dev))) { + dev_warn(&vdev->dev, "could not reserve region addr=0x%llx len=0x%llx\n", + cache_reg.addr, cache_reg.len); + return -EBUSY; + } + + dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", cache_reg.len, + cache_reg.addr); + + pgmap = devm_kzalloc(&vdev->dev, sizeof(*pgmap), GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + + pgmap->type = MEMORY_DEVICE_FS_DAX; + + /* Ideally we would directly use the PCI BAR resource but + * devm_memremap_pages() wants its own copy in pgmap. So + * initialize a struct resource from scratch (only the start + * and end fields will be used). + */ + pgmap->range = (struct range) { + .start = (phys_addr_t) cache_reg.addr, + .end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1, + }; + pgmap->nr_range = 1; + + fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap); + if (IS_ERR(fs->window_kaddr)) + return PTR_ERR(fs->window_kaddr); + + fs->window_phys_addr = (phys_addr_t) cache_reg.addr; + fs->window_len = (phys_addr_t) cache_reg.len; + + dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n", + __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len); + + fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops); + if (IS_ERR(fs->dax_dev)) + return PTR_ERR(fs->dax_dev); + + return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, + fs->dax_dev); +} + +static int virtio_fs_probe(struct virtio_device *vdev) +{ + struct virtio_fs *fs; + int ret; + + fs = kzalloc(sizeof(*fs), GFP_KERNEL); + if (!fs) + return -ENOMEM; + kref_init(&fs->refcount); + vdev->priv = fs; + + ret = virtio_fs_read_tag(vdev, fs); + if (ret < 0) + goto out; + + ret = virtio_fs_setup_vqs(vdev, fs); + if (ret < 0) + goto out; + + /* TODO vq affinity */ + + ret = virtio_fs_setup_dax(vdev, fs); + if (ret < 0) + goto out_vqs; + + /* Bring the device online in case the filesystem is mounted and + * requests need to be sent before we return. + */ + virtio_device_ready(vdev); + + ret = virtio_fs_add_instance(fs); + if (ret < 0) + goto out_vqs; + + return 0; + +out_vqs: + virtio_reset_device(vdev); + virtio_fs_cleanup_vqs(vdev); + kfree(fs->vqs); + +out: + vdev->priv = NULL; + kfree(fs); + return ret; +} + +static void virtio_fs_stop_all_queues(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + int i; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + spin_lock(&fsvq->lock); + fsvq->connected = false; + spin_unlock(&fsvq->lock); + } +} + +static void virtio_fs_remove(struct virtio_device *vdev) +{ + struct virtio_fs *fs = vdev->priv; + + mutex_lock(&virtio_fs_mutex); + /* This device is going away. No one should get new reference */ + list_del_init(&fs->list); + virtio_fs_stop_all_queues(fs); + virtio_fs_drain_all_queues_locked(fs); + virtio_reset_device(vdev); + virtio_fs_cleanup_vqs(vdev); + + vdev->priv = NULL; + /* Put device reference on virtio_fs object */ + virtio_fs_put(fs); + mutex_unlock(&virtio_fs_mutex); +} + +#ifdef CONFIG_PM_SLEEP +static int virtio_fs_freeze(struct virtio_device *vdev) +{ + /* TODO need to save state here */ + pr_warn("virtio-fs: suspend/resume not yet supported\n"); + return -EOPNOTSUPP; +} + +static int virtio_fs_restore(struct virtio_device *vdev) +{ + /* TODO need to restore state here */ + return 0; +} +#endif /* CONFIG_PM_SLEEP */ + +static const struct virtio_device_id id_table[] = { + { VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID }, + {}, +}; + +static const unsigned int feature_table[] = {}; + +static struct virtio_driver virtio_fs_driver = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .feature_table = feature_table, + .feature_table_size = ARRAY_SIZE(feature_table), + .probe = virtio_fs_probe, + .remove = virtio_fs_remove, +#ifdef CONFIG_PM_SLEEP + .freeze = virtio_fs_freeze, + .restore = virtio_fs_restore, +#endif +}; + +static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq) +__releases(fiq->lock) +{ + struct fuse_forget_link *link; + struct virtio_fs_forget *forget; + struct virtio_fs_forget_req *req; + struct virtio_fs *fs; + struct virtio_fs_vq *fsvq; + u64 unique; + + link = fuse_dequeue_forget(fiq, 1, NULL); + unique = fuse_get_unique(fiq); + + fs = fiq->priv; + fsvq = &fs->vqs[VQ_HIPRIO]; + spin_unlock(&fiq->lock); + + /* Allocate a buffer for the request */ + forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL); + req = &forget->req; + + req->ih = (struct fuse_in_header){ + .opcode = FUSE_FORGET, + .nodeid = link->forget_one.nodeid, + .unique = unique, + .len = sizeof(*req), + }; + req->arg = (struct fuse_forget_in){ + .nlookup = link->forget_one.nlookup, + }; + + send_forget_request(fsvq, forget, false); + kfree(link); +} + +static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq) +__releases(fiq->lock) +{ + /* + * TODO interrupts. + * + * Normal fs operations on a local filesystems aren't interruptible. + * Exceptions are blocking lock operations; for example fcntl(F_SETLKW) + * with shared lock between host and guest. + */ + spin_unlock(&fiq->lock); +} + +/* Count number of scatter-gather elements required */ +static unsigned int sg_count_fuse_pages(struct fuse_page_desc *page_descs, + unsigned int num_pages, + unsigned int total_len) +{ + unsigned int i; + unsigned int this_len; + + for (i = 0; i < num_pages && total_len; i++) { + this_len = min(page_descs[i].length, total_len); + total_len -= this_len; + } + + return i; +} + +/* Return the number of scatter-gather list elements required */ +static unsigned int sg_count_fuse_req(struct fuse_req *req) +{ + struct fuse_args *args = req->args; + struct fuse_args_pages *ap = container_of(args, typeof(*ap), args); + unsigned int size, total_sgs = 1 /* fuse_in_header */; + + if (args->in_numargs - args->in_pages) + total_sgs += 1; + + if (args->in_pages) { + size = args->in_args[args->in_numargs - 1].size; + total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages, + size); + } + + if (!test_bit(FR_ISREPLY, &req->flags)) + return total_sgs; + + total_sgs += 1 /* fuse_out_header */; + + if (args->out_numargs - args->out_pages) + total_sgs += 1; + + if (args->out_pages) { + size = args->out_args[args->out_numargs - 1].size; + total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages, + size); + } + + return total_sgs; +} + +/* Add pages to scatter-gather list and return number of elements used */ +static unsigned int sg_init_fuse_pages(struct scatterlist *sg, + struct page **pages, + struct fuse_page_desc *page_descs, + unsigned int num_pages, + unsigned int total_len) +{ + unsigned int i; + unsigned int this_len; + + for (i = 0; i < num_pages && total_len; i++) { + sg_init_table(&sg[i], 1); + this_len = min(page_descs[i].length, total_len); + sg_set_page(&sg[i], pages[i], this_len, page_descs[i].offset); + total_len -= this_len; + } + + return i; +} + +/* Add args to scatter-gather list and return number of elements used */ +static unsigned int sg_init_fuse_args(struct scatterlist *sg, + struct fuse_req *req, + struct fuse_arg *args, + unsigned int numargs, + bool argpages, + void *argbuf, + unsigned int *len_used) +{ + struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args); + unsigned int total_sgs = 0; + unsigned int len; + + len = fuse_len_args(numargs - argpages, args); + if (len) + sg_init_one(&sg[total_sgs++], argbuf, len); + + if (argpages) + total_sgs += sg_init_fuse_pages(&sg[total_sgs], + ap->pages, ap->descs, + ap->num_pages, + args[numargs - 1].size); + + if (len_used) + *len_used = len; + + return total_sgs; +} + +/* Add a request to a virtqueue and kick the device */ +static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, + struct fuse_req *req, bool in_flight) +{ + /* requests need at least 4 elements */ + struct scatterlist *stack_sgs[6]; + struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)]; + struct scatterlist **sgs = stack_sgs; + struct scatterlist *sg = stack_sg; + struct virtqueue *vq; + struct fuse_args *args = req->args; + unsigned int argbuf_used = 0; + unsigned int out_sgs = 0; + unsigned int in_sgs = 0; + unsigned int total_sgs; + unsigned int i; + int ret; + bool notify; + struct fuse_pqueue *fpq; + + /* Does the sglist fit on the stack? */ + total_sgs = sg_count_fuse_req(req); + if (total_sgs > ARRAY_SIZE(stack_sgs)) { + sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC); + sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC); + if (!sgs || !sg) { + ret = -ENOMEM; + goto out; + } + } + + /* Use a bounce buffer since stack args cannot be mapped */ + ret = copy_args_to_argbuf(req); + if (ret < 0) + goto out; + + /* Request elements */ + sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h)); + out_sgs += sg_init_fuse_args(&sg[out_sgs], req, + (struct fuse_arg *)args->in_args, + args->in_numargs, args->in_pages, + req->argbuf, &argbuf_used); + + /* Reply elements */ + if (test_bit(FR_ISREPLY, &req->flags)) { + sg_init_one(&sg[out_sgs + in_sgs++], + &req->out.h, sizeof(req->out.h)); + in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req, + args->out_args, args->out_numargs, + args->out_pages, + req->argbuf + argbuf_used, NULL); + } + + WARN_ON(out_sgs + in_sgs != total_sgs); + + for (i = 0; i < total_sgs; i++) + sgs[i] = &sg[i]; + + spin_lock(&fsvq->lock); + + if (!fsvq->connected) { + spin_unlock(&fsvq->lock); + ret = -ENOTCONN; + goto out; + } + + vq = fsvq->vq; + ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC); + if (ret < 0) { + spin_unlock(&fsvq->lock); + goto out; + } + + /* Request successfully sent. */ + fpq = &fsvq->fud->pq; + spin_lock(&fpq->lock); + list_add_tail(&req->list, fpq->processing); + spin_unlock(&fpq->lock); + set_bit(FR_SENT, &req->flags); + /* matches barrier in request_wait_answer() */ + smp_mb__after_atomic(); + + if (!in_flight) + inc_in_flight_req(fsvq); + notify = virtqueue_kick_prepare(vq); + + spin_unlock(&fsvq->lock); + + if (notify) + virtqueue_notify(vq); + +out: + if (ret < 0 && req->argbuf) { + kfree(req->argbuf); + req->argbuf = NULL; + } + if (sgs != stack_sgs) { + kfree(sgs); + kfree(sg); + } + + return ret; +} + +static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq) +__releases(fiq->lock) +{ + unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */ + struct virtio_fs *fs; + struct fuse_req *req; + struct virtio_fs_vq *fsvq; + int ret; + + WARN_ON(list_empty(&fiq->pending)); + req = list_last_entry(&fiq->pending, struct fuse_req, list); + clear_bit(FR_PENDING, &req->flags); + list_del_init(&req->list); + WARN_ON(!list_empty(&fiq->pending)); + spin_unlock(&fiq->lock); + + fs = fiq->priv; + + pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n", + __func__, req->in.h.opcode, req->in.h.unique, + req->in.h.nodeid, req->in.h.len, + fuse_len_args(req->args->out_numargs, req->args->out_args)); + + fsvq = &fs->vqs[queue_id]; + ret = virtio_fs_enqueue_req(fsvq, req, false); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOSPC) { + /* + * Virtqueue full. Retry submission from worker + * context as we might be holding fc->bg_lock. + */ + spin_lock(&fsvq->lock); + list_add_tail(&req->list, &fsvq->queued_reqs); + inc_in_flight_req(fsvq); + schedule_delayed_work(&fsvq->dispatch_work, + msecs_to_jiffies(1)); + spin_unlock(&fsvq->lock); + return; + } + req->out.h.error = ret; + pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret); + + /* Can't end request in submission context. Use a worker */ + spin_lock(&fsvq->lock); + list_add_tail(&req->list, &fsvq->end_reqs); + schedule_delayed_work(&fsvq->dispatch_work, 0); + spin_unlock(&fsvq->lock); + return; + } +} + +static const struct fuse_iqueue_ops virtio_fs_fiq_ops = { + .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock, + .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock, + .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock, + .release = virtio_fs_fiq_release, +}; + +static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx) +{ + ctx->rootmode = S_IFDIR; + ctx->default_permissions = 1; + ctx->allow_other = 1; + ctx->max_read = UINT_MAX; + ctx->blksize = 512; + ctx->destroy = true; + ctx->no_control = true; + ctx->no_force_umount = true; +} + +static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct fuse_conn *fc = fm->fc; + struct virtio_fs *fs = fc->iq.priv; + struct fuse_fs_context *ctx = fsc->fs_private; + unsigned int i; + int err; + + virtio_fs_ctx_set_defaults(ctx); + mutex_lock(&virtio_fs_mutex); + + /* After holding mutex, make sure virtiofs device is still there. + * Though we are holding a reference to it, drive ->remove might + * still have cleaned up virtual queues. In that case bail out. + */ + err = -EINVAL; + if (list_empty(&fs->list)) { + pr_info("virtio-fs: tag <%s> not found\n", fs->tag); + goto err; + } + + err = -ENOMEM; + /* Allocate fuse_dev for hiprio and notification queues */ + for (i = 0; i < fs->nvqs; i++) { + struct virtio_fs_vq *fsvq = &fs->vqs[i]; + + fsvq->fud = fuse_dev_alloc(); + if (!fsvq->fud) + goto err_free_fuse_devs; + } + + /* virtiofs allocates and installs its own fuse devices */ + ctx->fudptr = NULL; + if (ctx->dax_mode != FUSE_DAX_NEVER) { + if (ctx->dax_mode == FUSE_DAX_ALWAYS && !fs->dax_dev) { + err = -EINVAL; + pr_err("virtio-fs: dax can't be enabled as filesystem" + " device does not support it.\n"); + goto err_free_fuse_devs; + } + ctx->dax_dev = fs->dax_dev; + } + err = fuse_fill_super_common(sb, ctx); + if (err < 0) + goto err_free_fuse_devs; + + for (i = 0; i < fs->nvqs; i++) { + struct virtio_fs_vq *fsvq = &fs->vqs[i]; + + fuse_dev_install(fsvq->fud, fc); + } + + /* Previous unmount will stop all queues. Start these again */ + virtio_fs_start_all_queues(fs); + fuse_send_init(fm); + mutex_unlock(&virtio_fs_mutex); + return 0; + +err_free_fuse_devs: + virtio_fs_free_devs(fs); +err: + mutex_unlock(&virtio_fs_mutex); + return err; +} + +static void virtio_fs_conn_destroy(struct fuse_mount *fm) +{ + struct fuse_conn *fc = fm->fc; + struct virtio_fs *vfs = fc->iq.priv; + struct virtio_fs_vq *fsvq = &vfs->vqs[VQ_HIPRIO]; + + /* Stop dax worker. Soon evict_inodes() will be called which + * will free all memory ranges belonging to all inodes. + */ + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_cancel_work(fc); + + /* Stop forget queue. Soon destroy will be sent */ + spin_lock(&fsvq->lock); + fsvq->connected = false; + spin_unlock(&fsvq->lock); + virtio_fs_drain_all_queues(vfs); + + fuse_conn_destroy(fm); + + /* fuse_conn_destroy() must have sent destroy. Stop all queues + * and drain one more time and free fuse devices. Freeing fuse + * devices will drop their reference on fuse_conn and that in + * turn will drop its reference on virtio_fs object. + */ + virtio_fs_stop_all_queues(vfs); + virtio_fs_drain_all_queues(vfs); + virtio_fs_free_devs(vfs); +} + +static void virtio_kill_sb(struct super_block *sb) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + bool last; + + /* If mount failed, we can still be called without any fc */ + if (sb->s_root) { + last = fuse_mount_remove(fm); + if (last) + virtio_fs_conn_destroy(fm); + } + kill_anon_super(sb); + fuse_mount_destroy(fm); +} + +static int virtio_fs_test_super(struct super_block *sb, + struct fs_context *fsc) +{ + struct fuse_mount *fsc_fm = fsc->s_fs_info; + struct fuse_mount *sb_fm = get_fuse_mount_super(sb); + + return fsc_fm->fc->iq.priv == sb_fm->fc->iq.priv; +} + +static int virtio_fs_get_tree(struct fs_context *fsc) +{ + struct virtio_fs *fs; + struct super_block *sb; + struct fuse_conn *fc = NULL; + struct fuse_mount *fm; + unsigned int virtqueue_size; + int err = -EIO; + + /* This gets a reference on virtio_fs object. This ptr gets installed + * in fc->iq->priv. Once fuse_conn is going away, it calls ->put() + * to drop the reference to this object. + */ + fs = virtio_fs_find_instance(fsc->source); + if (!fs) { + pr_info("virtio-fs: tag <%s> not found\n", fsc->source); + return -EINVAL; + } + + virtqueue_size = virtqueue_get_vring_size(fs->vqs[VQ_REQUEST].vq); + if (WARN_ON(virtqueue_size <= FUSE_HEADER_OVERHEAD)) + goto out_err; + + err = -ENOMEM; + fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL); + if (!fc) + goto out_err; + + fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); + if (!fm) + goto out_err; + + fuse_conn_init(fc, fm, fsc->user_ns, &virtio_fs_fiq_ops, fs); + fc->release = fuse_free_conn; + fc->delete_stale = true; + fc->auto_submounts = true; + fc->sync_fs = true; + + /* Tell FUSE to split requests that exceed the virtqueue's size */ + fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, + virtqueue_size - FUSE_HEADER_OVERHEAD); + + fsc->s_fs_info = fm; + sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc); + if (fsc->s_fs_info) + fuse_mount_destroy(fm); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + if (!sb->s_root) { + err = virtio_fs_fill_super(sb, fsc); + if (err) { + deactivate_locked_super(sb); + return err; + } + + sb->s_flags |= SB_ACTIVE; + } + + WARN_ON(fsc->root); + fsc->root = dget(sb->s_root); + return 0; + +out_err: + kfree(fc); + mutex_lock(&virtio_fs_mutex); + virtio_fs_put(fs); + mutex_unlock(&virtio_fs_mutex); + return err; +} + +static const struct fs_context_operations virtio_fs_context_ops = { + .free = virtio_fs_free_fsc, + .parse_param = virtio_fs_parse_param, + .get_tree = virtio_fs_get_tree, +}; + +static int virtio_fs_init_fs_context(struct fs_context *fsc) +{ + struct fuse_fs_context *ctx; + + if (fsc->purpose == FS_CONTEXT_FOR_SUBMOUNT) + return fuse_init_fs_context_submount(fsc); + + ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + fsc->fs_private = ctx; + fsc->ops = &virtio_fs_context_ops; + return 0; +} + +static struct file_system_type virtio_fs_type = { + .owner = THIS_MODULE, + .name = "virtiofs", + .init_fs_context = virtio_fs_init_fs_context, + .kill_sb = virtio_kill_sb, +}; + +static int __init virtio_fs_init(void) +{ + int ret; + + ret = register_virtio_driver(&virtio_fs_driver); + if (ret < 0) + return ret; + + ret = register_filesystem(&virtio_fs_type); + if (ret < 0) { + unregister_virtio_driver(&virtio_fs_driver); + return ret; + } + + return 0; +} +module_init(virtio_fs_init); + +static void __exit virtio_fs_exit(void) +{ + unregister_filesystem(&virtio_fs_type); + unregister_virtio_driver(&virtio_fs_driver); +} +module_exit(virtio_fs_exit); + +MODULE_AUTHOR("Stefan Hajnoczi "); +MODULE_DESCRIPTION("Virtio Filesystem"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_FS(KBUILD_MODNAME); +MODULE_DEVICE_TABLE(virtio, id_table); diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c new file mode 100644 index 000000000..0d3e7177f --- /dev/null +++ b/fs/fuse/xattr.c @@ -0,0 +1,266 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2016 Miklos Szeredi + * + * This program can be distributed under the terms of the GNU GPL. + * See the file COPYING. + */ + +#include "fuse_i.h" + +#include +#include + +int fuse_setxattr(struct inode *inode, const char *name, const void *value, + size_t size, int flags, unsigned int extra_flags) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + FUSE_ARGS(args); + struct fuse_setxattr_in inarg; + int err; + + if (fm->fc->no_setxattr) + return -EOPNOTSUPP; + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + inarg.flags = flags; + inarg.setxattr_flags = extra_flags; + + args.opcode = FUSE_SETXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 3; + args.in_args[0].size = fm->fc->setxattr_ext ? + sizeof(inarg) : FUSE_COMPAT_SETXATTR_IN_SIZE; + args.in_args[0].value = &inarg; + args.in_args[1].size = strlen(name) + 1; + args.in_args[1].value = name; + args.in_args[2].size = size; + args.in_args[2].value = value; + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + fm->fc->no_setxattr = 1; + err = -EOPNOTSUPP; + } + if (!err) + fuse_update_ctime(inode); + + return err; +} + +ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, + size_t size) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + FUSE_ARGS(args); + struct fuse_getxattr_in inarg; + struct fuse_getxattr_out outarg; + ssize_t ret; + + if (fm->fc->no_getxattr) + return -EOPNOTSUPP; + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + args.opcode = FUSE_GETXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = strlen(name) + 1; + args.in_args[1].value = name; + /* This is really two different operations rolled into one */ + args.out_numargs = 1; + if (size) { + args.out_argvar = true; + args.out_args[0].size = size; + args.out_args[0].value = value; + } else { + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + } + ret = fuse_simple_request(fm, &args); + if (!ret && !size) + ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX); + if (ret == -ENOSYS) { + fm->fc->no_getxattr = 1; + ret = -EOPNOTSUPP; + } + return ret; +} + +static int fuse_verify_xattr_list(char *list, size_t size) +{ + size_t origsize = size; + + while (size) { + size_t thislen = strnlen(list, size); + + if (!thislen || thislen == size) + return -EIO; + + size -= thislen + 1; + list += thislen + 1; + } + + return origsize; +} + +ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) +{ + struct inode *inode = d_inode(entry); + struct fuse_mount *fm = get_fuse_mount(inode); + FUSE_ARGS(args); + struct fuse_getxattr_in inarg; + struct fuse_getxattr_out outarg; + ssize_t ret; + + if (fuse_is_bad(inode)) + return -EIO; + + if (!fuse_allow_current_process(fm->fc)) + return -EACCES; + + if (fm->fc->no_listxattr) + return -EOPNOTSUPP; + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + args.opcode = FUSE_LISTXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + /* This is really two different operations rolled into one */ + args.out_numargs = 1; + if (size) { + args.out_argvar = true; + args.out_args[0].size = size; + args.out_args[0].value = list; + } else { + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + } + ret = fuse_simple_request(fm, &args); + if (!ret && !size) + ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX); + if (ret > 0 && size) + ret = fuse_verify_xattr_list(list, ret); + if (ret == -ENOSYS) { + fm->fc->no_listxattr = 1; + ret = -EOPNOTSUPP; + } + return ret; +} + +int fuse_removexattr(struct inode *inode, const char *name) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + FUSE_ARGS(args); + int err; + + if (fm->fc->no_removexattr) + return -EOPNOTSUPP; + + args.opcode = FUSE_REMOVEXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = strlen(name) + 1; + args.in_args[0].value = name; + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + fm->fc->no_removexattr = 1; + err = -EOPNOTSUPP; + } + if (!err) + fuse_update_ctime(inode); + + return err; +} + +static int fuse_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *value, size_t size) +{ + if (fuse_is_bad(inode)) + return -EIO; + + return fuse_getxattr(inode, name, value, size); +} + +static int fuse_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) +{ + if (fuse_is_bad(inode)) + return -EIO; + + if (!value) + return fuse_removexattr(inode, name); + + return fuse_setxattr(inode, name, value, size, flags, 0); +} + +static bool no_xattr_list(struct dentry *dentry) +{ + return false; +} + +static int no_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *value, size_t size) +{ + return -EOPNOTSUPP; +} + +static int no_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *dentry, struct inode *nodee, + const char *name, const void *value, + size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static const struct xattr_handler fuse_xattr_handler = { + .prefix = "", + .get = fuse_xattr_get, + .set = fuse_xattr_set, +}; + +const struct xattr_handler *fuse_xattr_handlers[] = { + &fuse_xattr_handler, + NULL +}; + +const struct xattr_handler *fuse_acl_xattr_handlers[] = { + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + &fuse_xattr_handler, + NULL +}; + +static const struct xattr_handler fuse_no_acl_access_xattr_handler = { + .name = XATTR_NAME_POSIX_ACL_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = no_xattr_list, + .get = no_xattr_get, + .set = no_xattr_set, +}; + +static const struct xattr_handler fuse_no_acl_default_xattr_handler = { + .name = XATTR_NAME_POSIX_ACL_DEFAULT, + .flags = ACL_TYPE_ACCESS, + .list = no_xattr_list, + .get = no_xattr_get, + .set = no_xattr_set, +}; + +const struct xattr_handler *fuse_no_acl_xattr_handlers[] = { + &fuse_no_acl_access_xattr_handler, + &fuse_no_acl_default_xattr_handler, + &fuse_xattr_handler, + NULL +}; -- cgit v1.2.3