summaryrefslogtreecommitdiffstats
path: root/fs/fuse
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:49:45 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:49:45 +0000
commit2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch)
tree848558de17fb3008cdf4d861b01ac7781903ce39 /fs/fuse
parentInitial commit. (diff)
downloadlinux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz
linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip
Adding upstream version 6.1.76.upstream/6.1.76upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/fuse')
-rw-r--r--fs/fuse/Kconfig54
-rw-r--r--fs/fuse/Makefile13
-rw-r--r--fs/fuse/acl.c114
-rw-r--r--fs/fuse/control.c384
-rw-r--r--fs/fuse/cuse.c655
-rw-r--r--fs/fuse/dax.c1391
-rw-r--r--fs/fuse/dev.c2337
-rw-r--r--fs/fuse/dir.c2013
-rw-r--r--fs/fuse/file.c3216
-rw-r--r--fs/fuse/fuse_i.h1337
-rw-r--r--fs/fuse/inode.c2068
-rw-r--r--fs/fuse/ioctl.c515
-rw-r--r--fs/fuse/readdir.c604
-rw-r--r--fs/fuse/virtio_fs.c1541
-rw-r--r--fs/fuse/xattr.c266
15 files changed, 16508 insertions, 0 deletions
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
new file mode 100644
index 000000000..038ed0b9a
--- /dev/null
+++ b/fs/fuse/Kconfig
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config FUSE_FS
+ tristate "FUSE (Filesystem in Userspace) support"
+ select FS_POSIX_ACL
+ help
+ With FUSE it is possible to implement a fully functional filesystem
+ in a userspace program.
+
+ There's also a companion library: libfuse2. This library is available
+ from the FUSE homepage:
+ <https://github.com/libfuse/>
+ although chances are your distribution already has that library
+ installed if you've installed the "fuse" package itself.
+
+ See <file:Documentation/filesystems/fuse.rst> for more information.
+ See <file:Documentation/Changes> for needed library/utility version.
+
+ If you want to develop a userspace FS, or if you want to use
+ a filesystem based on FUSE, answer Y or M.
+
+config CUSE
+ tristate "Character device in Userspace support"
+ depends on FUSE_FS
+ help
+ This FUSE extension allows character devices to be
+ implemented in userspace.
+
+ If you want to develop or use a userspace character device
+ based on CUSE, answer Y or M.
+
+config VIRTIO_FS
+ tristate "Virtio Filesystem"
+ depends on FUSE_FS
+ select VIRTIO
+ help
+ The Virtio Filesystem allows guests to mount file systems from the
+ host.
+
+ If you want to share files between guests or with the host, answer Y
+ or M.
+
+config FUSE_DAX
+ bool "Virtio Filesystem Direct Host Memory Access support"
+ default y
+ select INTERVAL_TREE
+ depends on VIRTIO_FS
+ depends on FS_DAX
+ depends on DAX
+ help
+ This allows bypassing guest page cache and allows mapping host page
+ cache directly in guest address space.
+
+ If you want to allow mounting a Virtio Filesystem with the "dax"
+ option, answer Y.
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
new file mode 100644
index 000000000..0c48b35c0
--- /dev/null
+++ b/fs/fuse/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the FUSE filesystem.
+#
+
+obj-$(CONFIG_FUSE_FS) += fuse.o
+obj-$(CONFIG_CUSE) += cuse.o
+obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
+
+fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
+fuse-$(CONFIG_FUSE_DAX) += dax.o
+
+virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
new file mode 100644
index 000000000..337cb29a8
--- /dev/null
+++ b/fs/fuse/acl.c
@@ -0,0 +1,114 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2016 Canonical Ltd. <seth.forshee@canonical.com>
+ *
+ * This program can be distributed under the terms of the GNU GPL.
+ * See the file COPYING.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+
+struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ int size;
+ const char *name;
+ void *value = NULL;
+ struct posix_acl *acl;
+
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
+ if (fuse_is_bad(inode))
+ return ERR_PTR(-EIO);
+
+ if (!fc->posix_acl || fc->no_getxattr)
+ return NULL;
+
+ if (type == ACL_TYPE_ACCESS)
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ else if (type == ACL_TYPE_DEFAULT)
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ else
+ return ERR_PTR(-EOPNOTSUPP);
+
+ value = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ size = fuse_getxattr(inode, name, value, PAGE_SIZE);
+ if (size > 0)
+ acl = posix_acl_from_xattr(fc->user_ns, value, size);
+ else if ((size == 0) || (size == -ENODATA) ||
+ (size == -EOPNOTSUPP && fc->no_getxattr))
+ acl = NULL;
+ else if (size == -ERANGE)
+ acl = ERR_PTR(-E2BIG);
+ else
+ acl = ERR_PTR(size);
+
+ kfree(value);
+ return acl;
+}
+
+int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ const char *name;
+ int ret;
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ if (!fc->posix_acl || fc->no_setxattr)
+ return -EOPNOTSUPP;
+
+ if (type == ACL_TYPE_ACCESS)
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ else if (type == ACL_TYPE_DEFAULT)
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ else
+ return -EINVAL;
+
+ if (acl) {
+ unsigned int extra_flags = 0;
+ /*
+ * Fuse userspace is responsible for updating access
+ * permissions in the inode, if needed. fuse_setxattr
+ * invalidates the inode attributes, which will force
+ * them to be refreshed the next time they are used,
+ * and it also updates i_ctime.
+ */
+ size_t size = posix_acl_xattr_size(acl->a_count);
+ void *value;
+
+ if (size > PAGE_SIZE)
+ return -E2BIG;
+
+ value = kmalloc(size, GFP_KERNEL);
+ if (!value)
+ return -ENOMEM;
+
+ ret = posix_acl_to_xattr(fc->user_ns, acl, value, size);
+ if (ret < 0) {
+ kfree(value);
+ return ret;
+ }
+
+ if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) &&
+ !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
+ extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID;
+
+ ret = fuse_setxattr(inode, name, value, size, 0, extra_flags);
+ kfree(value);
+ } else {
+ ret = fuse_removexattr(inode, name);
+ }
+ forget_all_cached_acls(inode);
+ fuse_invalidate_attr(inode);
+
+ return ret;
+}
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
new file mode 100644
index 000000000..247ef4f76
--- /dev/null
+++ b/fs/fuse/control.c
@@ -0,0 +1,384 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs_context.h>
+
+#define FUSE_CTL_SUPER_MAGIC 0x65735543
+
+/*
+ * This is non-NULL when the single instance of the control filesystem
+ * exists. Protected by fuse_mutex
+ */
+static struct super_block *fuse_control_sb;
+
+static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)
+{
+ struct fuse_conn *fc;
+ mutex_lock(&fuse_mutex);
+ fc = file_inode(file)->i_private;
+ if (fc)
+ fc = fuse_conn_get(fc);
+ mutex_unlock(&fuse_mutex);
+ return fc;
+}
+
+static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+ if (fc) {
+ if (fc->abort_err)
+ fc->aborted = true;
+ fuse_abort_conn(fc);
+ fuse_conn_put(fc);
+ }
+ return count;
+}
+
+static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ char tmp[32];
+ size_t size;
+
+ if (!*ppos) {
+ long value;
+ struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+ if (!fc)
+ return 0;
+
+ value = atomic_read(&fc->num_waiting);
+ file->private_data = (void *)value;
+ fuse_conn_put(fc);
+ }
+ size = sprintf(tmp, "%ld\n", (long)file->private_data);
+ return simple_read_from_buffer(buf, len, ppos, tmp, size);
+}
+
+static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos, unsigned val)
+{
+ char tmp[32];
+ size_t size = sprintf(tmp, "%u\n", val);
+
+ return simple_read_from_buffer(buf, len, ppos, tmp, size);
+}
+
+static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos, unsigned *val,
+ unsigned global_limit)
+{
+ unsigned long t;
+ unsigned limit = (1 << 16) - 1;
+ int err;
+
+ if (*ppos)
+ return -EINVAL;
+
+ err = kstrtoul_from_user(buf, count, 0, &t);
+ if (err)
+ return err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ limit = min(limit, global_limit);
+
+ if (t > limit)
+ return -EINVAL;
+
+ *val = t;
+
+ return count;
+}
+
+static ssize_t fuse_conn_max_background_read(struct file *file,
+ char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct fuse_conn *fc;
+ unsigned val;
+
+ fc = fuse_ctl_file_conn_get(file);
+ if (!fc)
+ return 0;
+
+ val = READ_ONCE(fc->max_background);
+ fuse_conn_put(fc);
+
+ return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_max_background_write(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned val;
+ ssize_t ret;
+
+ ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+ max_user_bgreq);
+ if (ret > 0) {
+ struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+ if (fc) {
+ spin_lock(&fc->bg_lock);
+ fc->max_background = val;
+ fc->blocked = fc->num_background >= fc->max_background;
+ if (!fc->blocked)
+ wake_up(&fc->blocked_waitq);
+ spin_unlock(&fc->bg_lock);
+ fuse_conn_put(fc);
+ }
+ }
+
+ return ret;
+}
+
+static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
+ char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct fuse_conn *fc;
+ unsigned val;
+
+ fc = fuse_ctl_file_conn_get(file);
+ if (!fc)
+ return 0;
+
+ val = READ_ONCE(fc->congestion_threshold);
+ fuse_conn_put(fc);
+
+ return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned val;
+ struct fuse_conn *fc;
+ ssize_t ret;
+
+ ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+ max_user_congthresh);
+ if (ret <= 0)
+ goto out;
+ fc = fuse_ctl_file_conn_get(file);
+ if (!fc)
+ goto out;
+
+ down_read(&fc->killsb);
+ spin_lock(&fc->bg_lock);
+ fc->congestion_threshold = val;
+ spin_unlock(&fc->bg_lock);
+ up_read(&fc->killsb);
+ fuse_conn_put(fc);
+out:
+ return ret;
+}
+
+static const struct file_operations fuse_ctl_abort_ops = {
+ .open = nonseekable_open,
+ .write = fuse_conn_abort_write,
+ .llseek = no_llseek,
+};
+
+static const struct file_operations fuse_ctl_waiting_ops = {
+ .open = nonseekable_open,
+ .read = fuse_conn_waiting_read,
+ .llseek = no_llseek,
+};
+
+static const struct file_operations fuse_conn_max_background_ops = {
+ .open = nonseekable_open,
+ .read = fuse_conn_max_background_read,
+ .write = fuse_conn_max_background_write,
+ .llseek = no_llseek,
+};
+
+static const struct file_operations fuse_conn_congestion_threshold_ops = {
+ .open = nonseekable_open,
+ .read = fuse_conn_congestion_threshold_read,
+ .write = fuse_conn_congestion_threshold_write,
+ .llseek = no_llseek,
+};
+
+static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
+ struct fuse_conn *fc,
+ const char *name,
+ int mode, int nlink,
+ const struct inode_operations *iop,
+ const struct file_operations *fop)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+
+ BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES);
+ dentry = d_alloc_name(parent, name);
+ if (!dentry)
+ return NULL;
+
+ inode = new_inode(fuse_control_sb);
+ if (!inode) {
+ dput(dentry);
+ return NULL;
+ }
+
+ inode->i_ino = get_next_ino();
+ inode->i_mode = mode;
+ inode->i_uid = fc->user_id;
+ inode->i_gid = fc->group_id;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ /* setting ->i_op to NULL is not allowed */
+ if (iop)
+ inode->i_op = iop;
+ inode->i_fop = fop;
+ set_nlink(inode, nlink);
+ inode->i_private = fc;
+ d_add(dentry, inode);
+
+ fc->ctl_dentry[fc->ctl_ndents++] = dentry;
+
+ return dentry;
+}
+
+/*
+ * Add a connection to the control filesystem (if it exists). Caller
+ * must hold fuse_mutex
+ */
+int fuse_ctl_add_conn(struct fuse_conn *fc)
+{
+ struct dentry *parent;
+ char name[32];
+
+ if (!fuse_control_sb || fc->no_control)
+ return 0;
+
+ parent = fuse_control_sb->s_root;
+ inc_nlink(d_inode(parent));
+ sprintf(name, "%u", fc->dev);
+ parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2,
+ &simple_dir_inode_operations,
+ &simple_dir_operations);
+ if (!parent)
+ goto err;
+
+ if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
+ NULL, &fuse_ctl_waiting_ops) ||
+ !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
+ NULL, &fuse_ctl_abort_ops) ||
+ !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600,
+ 1, NULL, &fuse_conn_max_background_ops) ||
+ !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
+ S_IFREG | 0600, 1, NULL,
+ &fuse_conn_congestion_threshold_ops))
+ goto err;
+
+ return 0;
+
+ err:
+ fuse_ctl_remove_conn(fc);
+ return -ENOMEM;
+}
+
+/*
+ * Remove a connection from the control filesystem (if it exists).
+ * Caller must hold fuse_mutex
+ */
+void fuse_ctl_remove_conn(struct fuse_conn *fc)
+{
+ int i;
+
+ if (!fuse_control_sb || fc->no_control)
+ return;
+
+ for (i = fc->ctl_ndents - 1; i >= 0; i--) {
+ struct dentry *dentry = fc->ctl_dentry[i];
+ d_inode(dentry)->i_private = NULL;
+ if (!i) {
+ /* Get rid of submounts: */
+ d_invalidate(dentry);
+ }
+ dput(dentry);
+ }
+ drop_nlink(d_inode(fuse_control_sb->s_root));
+}
+
+static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc)
+{
+ static const struct tree_descr empty_descr = {""};
+ struct fuse_conn *fc;
+ int err;
+
+ err = simple_fill_super(sb, FUSE_CTL_SUPER_MAGIC, &empty_descr);
+ if (err)
+ return err;
+
+ mutex_lock(&fuse_mutex);
+ BUG_ON(fuse_control_sb);
+ fuse_control_sb = sb;
+ list_for_each_entry(fc, &fuse_conn_list, entry) {
+ err = fuse_ctl_add_conn(fc);
+ if (err) {
+ fuse_control_sb = NULL;
+ mutex_unlock(&fuse_mutex);
+ return err;
+ }
+ }
+ mutex_unlock(&fuse_mutex);
+
+ return 0;
+}
+
+static int fuse_ctl_get_tree(struct fs_context *fsc)
+{
+ return get_tree_single(fsc, fuse_ctl_fill_super);
+}
+
+static const struct fs_context_operations fuse_ctl_context_ops = {
+ .get_tree = fuse_ctl_get_tree,
+};
+
+static int fuse_ctl_init_fs_context(struct fs_context *fsc)
+{
+ fsc->ops = &fuse_ctl_context_ops;
+ return 0;
+}
+
+static void fuse_ctl_kill_sb(struct super_block *sb)
+{
+ struct fuse_conn *fc;
+
+ mutex_lock(&fuse_mutex);
+ fuse_control_sb = NULL;
+ list_for_each_entry(fc, &fuse_conn_list, entry)
+ fc->ctl_ndents = 0;
+ mutex_unlock(&fuse_mutex);
+
+ kill_litter_super(sb);
+}
+
+static struct file_system_type fuse_ctl_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "fusectl",
+ .init_fs_context = fuse_ctl_init_fs_context,
+ .kill_sb = fuse_ctl_kill_sb,
+};
+MODULE_ALIAS_FS("fusectl");
+
+int __init fuse_ctl_init(void)
+{
+ return register_filesystem(&fuse_ctl_fs_type);
+}
+
+void __exit fuse_ctl_cleanup(void)
+{
+ unregister_filesystem(&fuse_ctl_fs_type);
+}
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
new file mode 100644
index 000000000..c7d882a9f
--- /dev/null
+++ b/fs/fuse/cuse.c
@@ -0,0 +1,655 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CUSE: Character device in Userspace
+ *
+ * Copyright (C) 2008-2009 SUSE Linux Products GmbH
+ * Copyright (C) 2008-2009 Tejun Heo <tj@kernel.org>
+ *
+ * CUSE enables character devices to be implemented from userland much
+ * like FUSE allows filesystems. On initialization /dev/cuse is
+ * created. By opening the file and replying to the CUSE_INIT request
+ * userland CUSE server can create a character device. After that the
+ * operation is very similar to FUSE.
+ *
+ * A CUSE instance involves the following objects.
+ *
+ * cuse_conn : contains fuse_conn and serves as bonding structure
+ * channel : file handle connected to the userland CUSE server
+ * cdev : the implemented character device
+ * dev : generic device for cdev
+ *
+ * Note that 'channel' is what 'dev' is in FUSE. As CUSE deals with
+ * devices, it's called 'channel' to reduce confusion.
+ *
+ * channel determines when the character device dies. When channel is
+ * closed, everything begins to destruct. The cuse_conn is taken off
+ * the lookup table preventing further access from cdev, cdev and
+ * generic device are removed and the base reference of cuse_conn is
+ * put.
+ *
+ * On each open, the matching cuse_conn is looked up and if found an
+ * additional reference is taken which is released when the file is
+ * closed.
+ */
+
+#define pr_fmt(fmt) "CUSE: " fmt
+
+#include <linux/fuse.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kdev_t.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/magic.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/module.h>
+#include <linux/uio.h>
+#include <linux/user_namespace.h>
+
+#include "fuse_i.h"
+
+#define CUSE_CONNTBL_LEN 64
+
+struct cuse_conn {
+ struct list_head list; /* linked on cuse_conntbl */
+ struct fuse_mount fm; /* Dummy mount referencing fc */
+ struct fuse_conn fc; /* fuse connection */
+ struct cdev *cdev; /* associated character device */
+ struct device *dev; /* device representing @cdev */
+
+ /* init parameters, set once during initialization */
+ bool unrestricted_ioctl;
+};
+
+static DEFINE_MUTEX(cuse_lock); /* protects registration */
+static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN];
+static struct class *cuse_class;
+
+static struct cuse_conn *fc_to_cc(struct fuse_conn *fc)
+{
+ return container_of(fc, struct cuse_conn, fc);
+}
+
+static struct list_head *cuse_conntbl_head(dev_t devt)
+{
+ return &cuse_conntbl[(MAJOR(devt) + MINOR(devt)) % CUSE_CONNTBL_LEN];
+}
+
+
+/**************************************************************************
+ * CUSE frontend operations
+ *
+ * These are file operations for the character device.
+ *
+ * On open, CUSE opens a file from the FUSE mnt and stores it to
+ * private_data of the open file. All other ops call FUSE ops on the
+ * FUSE file.
+ */
+
+static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to)
+{
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb);
+ loff_t pos = 0;
+
+ return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE);
+}
+
+static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from)
+{
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb);
+ loff_t pos = 0;
+ /*
+ * No locking or generic_write_checks(), the server is
+ * responsible for locking and sanity checks.
+ */
+ return fuse_direct_io(&io, from, &pos,
+ FUSE_DIO_WRITE | FUSE_DIO_CUSE);
+}
+
+static int cuse_open(struct inode *inode, struct file *file)
+{
+ dev_t devt = inode->i_cdev->dev;
+ struct cuse_conn *cc = NULL, *pos;
+ int rc;
+
+ /* look up and get the connection */
+ mutex_lock(&cuse_lock);
+ list_for_each_entry(pos, cuse_conntbl_head(devt), list)
+ if (pos->dev->devt == devt) {
+ fuse_conn_get(&pos->fc);
+ cc = pos;
+ break;
+ }
+ mutex_unlock(&cuse_lock);
+
+ /* dead? */
+ if (!cc)
+ return -ENODEV;
+
+ /*
+ * Generic permission check is already done against the chrdev
+ * file, proceed to open.
+ */
+ rc = fuse_do_open(&cc->fm, 0, file, 0);
+ if (rc)
+ fuse_conn_put(&cc->fc);
+ return rc;
+}
+
+static int cuse_release(struct inode *inode, struct file *file)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_mount *fm = ff->fm;
+
+ fuse_sync_release(NULL, ff, file->f_flags);
+ fuse_conn_put(fm->fc);
+
+ return 0;
+}
+
+static long cuse_file_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct fuse_file *ff = file->private_data;
+ struct cuse_conn *cc = fc_to_cc(ff->fm->fc);
+ unsigned int flags = 0;
+
+ if (cc->unrestricted_ioctl)
+ flags |= FUSE_IOCTL_UNRESTRICTED;
+
+ return fuse_do_ioctl(file, cmd, arg, flags);
+}
+
+static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct fuse_file *ff = file->private_data;
+ struct cuse_conn *cc = fc_to_cc(ff->fm->fc);
+ unsigned int flags = FUSE_IOCTL_COMPAT;
+
+ if (cc->unrestricted_ioctl)
+ flags |= FUSE_IOCTL_UNRESTRICTED;
+
+ return fuse_do_ioctl(file, cmd, arg, flags);
+}
+
+static const struct file_operations cuse_frontend_fops = {
+ .owner = THIS_MODULE,
+ .read_iter = cuse_read_iter,
+ .write_iter = cuse_write_iter,
+ .open = cuse_open,
+ .release = cuse_release,
+ .unlocked_ioctl = cuse_file_ioctl,
+ .compat_ioctl = cuse_file_compat_ioctl,
+ .poll = fuse_file_poll,
+ .llseek = noop_llseek,
+};
+
+
+/**************************************************************************
+ * CUSE channel initialization and destruction
+ */
+
+struct cuse_devinfo {
+ const char *name;
+};
+
+/**
+ * cuse_parse_one - parse one key=value pair
+ * @pp: i/o parameter for the current position
+ * @end: points to one past the end of the packed string
+ * @keyp: out parameter for key
+ * @valp: out parameter for value
+ *
+ * *@pp points to packed strings - "key0=val0\0key1=val1\0" which ends
+ * at @end - 1. This function parses one pair and set *@keyp to the
+ * start of the key and *@valp to the start of the value. Note that
+ * the original string is modified such that the key string is
+ * terminated with '\0'. *@pp is updated to point to the next string.
+ *
+ * RETURNS:
+ * 1 on successful parse, 0 on EOF, -errno on failure.
+ */
+static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
+{
+ char *p = *pp;
+ char *key, *val;
+
+ while (p < end && *p == '\0')
+ p++;
+ if (p == end)
+ return 0;
+
+ if (end[-1] != '\0') {
+ pr_err("info not properly terminated\n");
+ return -EINVAL;
+ }
+
+ key = val = p;
+ p += strlen(p);
+
+ if (valp) {
+ strsep(&val, "=");
+ if (!val)
+ val = key + strlen(key);
+ key = strstrip(key);
+ val = strstrip(val);
+ } else
+ key = strstrip(key);
+
+ if (!strlen(key)) {
+ pr_err("zero length info key specified\n");
+ return -EINVAL;
+ }
+
+ *pp = p;
+ *keyp = key;
+ if (valp)
+ *valp = val;
+
+ return 1;
+}
+
+/**
+ * cuse_parse_dev_info - parse device info
+ * @p: device info string
+ * @len: length of device info string
+ * @devinfo: out parameter for parsed device info
+ *
+ * Parse @p to extract device info and store it into @devinfo. String
+ * pointed to by @p is modified by parsing and @devinfo points into
+ * them, so @p shouldn't be freed while @devinfo is in use.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
+{
+ char *end = p + len;
+ char *key, *val;
+ int rc;
+
+ while (true) {
+ rc = cuse_parse_one(&p, end, &key, &val);
+ if (rc < 0)
+ return rc;
+ if (!rc)
+ break;
+ if (strcmp(key, "DEVNAME") == 0)
+ devinfo->name = val;
+ else
+ pr_warn("unknown device info \"%s\"\n", key);
+ }
+
+ if (!devinfo->name || !strlen(devinfo->name)) {
+ pr_err("DEVNAME unspecified\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void cuse_gendev_release(struct device *dev)
+{
+ kfree(dev);
+}
+
+struct cuse_init_args {
+ struct fuse_args_pages ap;
+ struct cuse_init_in in;
+ struct cuse_init_out out;
+ struct page *page;
+ struct fuse_page_desc desc;
+};
+
+/**
+ * cuse_process_init_reply - finish initializing CUSE channel
+ *
+ * This function creates the character device and sets up all the
+ * required data structures for it. Please read the comment at the
+ * top of this file for high level overview.
+ */
+static void cuse_process_init_reply(struct fuse_mount *fm,
+ struct fuse_args *args, int error)
+{
+ struct fuse_conn *fc = fm->fc;
+ struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args);
+ struct fuse_args_pages *ap = &ia->ap;
+ struct cuse_conn *cc = fc_to_cc(fc), *pos;
+ struct cuse_init_out *arg = &ia->out;
+ struct page *page = ap->pages[0];
+ struct cuse_devinfo devinfo = { };
+ struct device *dev;
+ struct cdev *cdev;
+ dev_t devt;
+ int rc, i;
+
+ if (error || arg->major != FUSE_KERNEL_VERSION || arg->minor < 11)
+ goto err;
+
+ fc->minor = arg->minor;
+ fc->max_read = max_t(unsigned, arg->max_read, 4096);
+ fc->max_write = max_t(unsigned, arg->max_write, 4096);
+
+ /* parse init reply */
+ cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL;
+
+ rc = cuse_parse_devinfo(page_address(page), ap->args.out_args[1].size,
+ &devinfo);
+ if (rc)
+ goto err;
+
+ /* determine and reserve devt */
+ devt = MKDEV(arg->dev_major, arg->dev_minor);
+ if (!MAJOR(devt))
+ rc = alloc_chrdev_region(&devt, MINOR(devt), 1, devinfo.name);
+ else
+ rc = register_chrdev_region(devt, 1, devinfo.name);
+ if (rc) {
+ pr_err("failed to register chrdev region\n");
+ goto err;
+ }
+
+ /* devt determined, create device */
+ rc = -ENOMEM;
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ goto err_region;
+
+ device_initialize(dev);
+ dev_set_uevent_suppress(dev, 1);
+ dev->class = cuse_class;
+ dev->devt = devt;
+ dev->release = cuse_gendev_release;
+ dev_set_drvdata(dev, cc);
+ dev_set_name(dev, "%s", devinfo.name);
+
+ mutex_lock(&cuse_lock);
+
+ /* make sure the device-name is unique */
+ for (i = 0; i < CUSE_CONNTBL_LEN; ++i) {
+ list_for_each_entry(pos, &cuse_conntbl[i], list)
+ if (!strcmp(dev_name(pos->dev), dev_name(dev)))
+ goto err_unlock;
+ }
+
+ rc = device_add(dev);
+ if (rc)
+ goto err_unlock;
+
+ /* register cdev */
+ rc = -ENOMEM;
+ cdev = cdev_alloc();
+ if (!cdev)
+ goto err_unlock;
+
+ cdev->owner = THIS_MODULE;
+ cdev->ops = &cuse_frontend_fops;
+
+ rc = cdev_add(cdev, devt, 1);
+ if (rc)
+ goto err_cdev;
+
+ cc->dev = dev;
+ cc->cdev = cdev;
+
+ /* make the device available */
+ list_add(&cc->list, cuse_conntbl_head(devt));
+ mutex_unlock(&cuse_lock);
+
+ /* announce device availability */
+ dev_set_uevent_suppress(dev, 0);
+ kobject_uevent(&dev->kobj, KOBJ_ADD);
+out:
+ kfree(ia);
+ __free_page(page);
+ return;
+
+err_cdev:
+ cdev_del(cdev);
+err_unlock:
+ mutex_unlock(&cuse_lock);
+ put_device(dev);
+err_region:
+ unregister_chrdev_region(devt, 1);
+err:
+ fuse_abort_conn(fc);
+ goto out;
+}
+
+static int cuse_send_init(struct cuse_conn *cc)
+{
+ int rc;
+ struct page *page;
+ struct fuse_mount *fm = &cc->fm;
+ struct cuse_init_args *ia;
+ struct fuse_args_pages *ap;
+
+ BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
+
+ rc = -ENOMEM;
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ goto err;
+
+ ia = kzalloc(sizeof(*ia), GFP_KERNEL);
+ if (!ia)
+ goto err_free_page;
+
+ ap = &ia->ap;
+ ia->in.major = FUSE_KERNEL_VERSION;
+ ia->in.minor = FUSE_KERNEL_MINOR_VERSION;
+ ia->in.flags |= CUSE_UNRESTRICTED_IOCTL;
+ ap->args.opcode = CUSE_INIT;
+ ap->args.in_numargs = 1;
+ ap->args.in_args[0].size = sizeof(ia->in);
+ ap->args.in_args[0].value = &ia->in;
+ ap->args.out_numargs = 2;
+ ap->args.out_args[0].size = sizeof(ia->out);
+ ap->args.out_args[0].value = &ia->out;
+ ap->args.out_args[1].size = CUSE_INIT_INFO_MAX;
+ ap->args.out_argvar = true;
+ ap->args.out_pages = true;
+ ap->num_pages = 1;
+ ap->pages = &ia->page;
+ ap->descs = &ia->desc;
+ ia->page = page;
+ ia->desc.length = ap->args.out_args[1].size;
+ ap->args.end = cuse_process_init_reply;
+
+ rc = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
+ if (rc) {
+ kfree(ia);
+err_free_page:
+ __free_page(page);
+ }
+err:
+ return rc;
+}
+
+static void cuse_fc_release(struct fuse_conn *fc)
+{
+ struct cuse_conn *cc = fc_to_cc(fc);
+ kfree_rcu(cc, fc.rcu);
+}
+
+/**
+ * cuse_channel_open - open method for /dev/cuse
+ * @inode: inode for /dev/cuse
+ * @file: file struct being opened
+ *
+ * Userland CUSE server can create a CUSE device by opening /dev/cuse
+ * and replying to the initialization request kernel sends. This
+ * function is responsible for handling CUSE device initialization.
+ * Because the fd opened by this function is used during
+ * initialization, this function only creates cuse_conn and sends
+ * init. The rest is delegated to a kthread.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int cuse_channel_open(struct inode *inode, struct file *file)
+{
+ struct fuse_dev *fud;
+ struct cuse_conn *cc;
+ int rc;
+
+ /* set up cuse_conn */
+ cc = kzalloc(sizeof(*cc), GFP_KERNEL);
+ if (!cc)
+ return -ENOMEM;
+
+ /*
+ * Limit the cuse channel to requests that can
+ * be represented in file->f_cred->user_ns.
+ */
+ fuse_conn_init(&cc->fc, &cc->fm, file->f_cred->user_ns,
+ &fuse_dev_fiq_ops, NULL);
+
+ cc->fc.release = cuse_fc_release;
+ fud = fuse_dev_alloc_install(&cc->fc);
+ fuse_conn_put(&cc->fc);
+ if (!fud)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&cc->list);
+
+ cc->fc.initialized = 1;
+ rc = cuse_send_init(cc);
+ if (rc) {
+ fuse_dev_free(fud);
+ return rc;
+ }
+ file->private_data = fud;
+
+ return 0;
+}
+
+/**
+ * cuse_channel_release - release method for /dev/cuse
+ * @inode: inode for /dev/cuse
+ * @file: file struct being closed
+ *
+ * Disconnect the channel, deregister CUSE device and initiate
+ * destruction by putting the default reference.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int cuse_channel_release(struct inode *inode, struct file *file)
+{
+ struct fuse_dev *fud = file->private_data;
+ struct cuse_conn *cc = fc_to_cc(fud->fc);
+ int rc;
+
+ /* remove from the conntbl, no more access from this point on */
+ mutex_lock(&cuse_lock);
+ list_del_init(&cc->list);
+ mutex_unlock(&cuse_lock);
+
+ /* remove device */
+ if (cc->dev)
+ device_unregister(cc->dev);
+ if (cc->cdev) {
+ unregister_chrdev_region(cc->cdev->dev, 1);
+ cdev_del(cc->cdev);
+ }
+
+ rc = fuse_dev_release(inode, file); /* puts the base reference */
+
+ return rc;
+}
+
+static struct file_operations cuse_channel_fops; /* initialized during init */
+
+
+/**************************************************************************
+ * Misc stuff and module initializatiion
+ *
+ * CUSE exports the same set of attributes to sysfs as fusectl.
+ */
+
+static ssize_t cuse_class_waiting_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cuse_conn *cc = dev_get_drvdata(dev);
+
+ return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
+}
+static DEVICE_ATTR(waiting, 0400, cuse_class_waiting_show, NULL);
+
+static ssize_t cuse_class_abort_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cuse_conn *cc = dev_get_drvdata(dev);
+
+ fuse_abort_conn(&cc->fc);
+ return count;
+}
+static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store);
+
+static struct attribute *cuse_class_dev_attrs[] = {
+ &dev_attr_waiting.attr,
+ &dev_attr_abort.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(cuse_class_dev);
+
+static struct miscdevice cuse_miscdev = {
+ .minor = CUSE_MINOR,
+ .name = "cuse",
+ .fops = &cuse_channel_fops,
+};
+
+MODULE_ALIAS_MISCDEV(CUSE_MINOR);
+MODULE_ALIAS("devname:cuse");
+
+static int __init cuse_init(void)
+{
+ int i, rc;
+
+ /* init conntbl */
+ for (i = 0; i < CUSE_CONNTBL_LEN; i++)
+ INIT_LIST_HEAD(&cuse_conntbl[i]);
+
+ /* inherit and extend fuse_dev_operations */
+ cuse_channel_fops = fuse_dev_operations;
+ cuse_channel_fops.owner = THIS_MODULE;
+ cuse_channel_fops.open = cuse_channel_open;
+ cuse_channel_fops.release = cuse_channel_release;
+ /* CUSE is not prepared for FUSE_DEV_IOC_CLONE */
+ cuse_channel_fops.unlocked_ioctl = NULL;
+
+ cuse_class = class_create(THIS_MODULE, "cuse");
+ if (IS_ERR(cuse_class))
+ return PTR_ERR(cuse_class);
+
+ cuse_class->dev_groups = cuse_class_dev_groups;
+
+ rc = misc_register(&cuse_miscdev);
+ if (rc) {
+ class_destroy(cuse_class);
+ return rc;
+ }
+
+ return 0;
+}
+
+static void __exit cuse_exit(void)
+{
+ misc_deregister(&cuse_miscdev);
+ class_destroy(cuse_class);
+}
+
+module_init(cuse_init);
+module_exit(cuse_exit);
+
+MODULE_AUTHOR("Tejun Heo <tj@kernel.org>");
+MODULE_DESCRIPTION("Character device in Userspace");
+MODULE_LICENSE("GPL");
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
new file mode 100644
index 000000000..6e71904c3
--- /dev/null
+++ b/fs/fuse/dax.c
@@ -0,0 +1,1391 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * dax: direct host memory access
+ * Copyright (C) 2020 Red Hat, Inc.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/delay.h>
+#include <linux/dax.h>
+#include <linux/uio.h>
+#include <linux/pagemap.h>
+#include <linux/pfn_t.h>
+#include <linux/iomap.h>
+#include <linux/interval_tree.h>
+
+/*
+ * Default memory range size. A power of 2 so it agrees with common FUSE_INIT
+ * map_alignment values 4KB and 64KB.
+ */
+#define FUSE_DAX_SHIFT 21
+#define FUSE_DAX_SZ (1 << FUSE_DAX_SHIFT)
+#define FUSE_DAX_PAGES (FUSE_DAX_SZ / PAGE_SIZE)
+
+/* Number of ranges reclaimer will try to free in one invocation */
+#define FUSE_DAX_RECLAIM_CHUNK (10)
+
+/*
+ * Dax memory reclaim threshold in percetage of total ranges. When free
+ * number of free ranges drops below this threshold, reclaim can trigger
+ * Default is 20%
+ */
+#define FUSE_DAX_RECLAIM_THRESHOLD (20)
+
+/** Translation information for file offsets to DAX window offsets */
+struct fuse_dax_mapping {
+ /* Pointer to inode where this memory range is mapped */
+ struct inode *inode;
+
+ /* Will connect in fcd->free_ranges to keep track of free memory */
+ struct list_head list;
+
+ /* For interval tree in file/inode */
+ struct interval_tree_node itn;
+
+ /* Will connect in fc->busy_ranges to keep track busy memory */
+ struct list_head busy_list;
+
+ /** Position in DAX window */
+ u64 window_offset;
+
+ /** Length of mapping, in bytes */
+ loff_t length;
+
+ /* Is this mapping read-only or read-write */
+ bool writable;
+
+ /* reference count when the mapping is used by dax iomap. */
+ refcount_t refcnt;
+};
+
+/* Per-inode dax map */
+struct fuse_inode_dax {
+ /* Semaphore to protect modifications to the dmap tree */
+ struct rw_semaphore sem;
+
+ /* Sorted rb tree of struct fuse_dax_mapping elements */
+ struct rb_root_cached tree;
+ unsigned long nr;
+};
+
+struct fuse_conn_dax {
+ /* DAX device */
+ struct dax_device *dev;
+
+ /* Lock protecting accessess to members of this structure */
+ spinlock_t lock;
+
+ /* List of memory ranges which are busy */
+ unsigned long nr_busy_ranges;
+ struct list_head busy_ranges;
+
+ /* Worker to free up memory ranges */
+ struct delayed_work free_work;
+
+ /* Wait queue for a dax range to become free */
+ wait_queue_head_t range_waitq;
+
+ /* DAX Window Free Ranges */
+ long nr_free_ranges;
+ struct list_head free_ranges;
+
+ unsigned long nr_ranges;
+};
+
+static inline struct fuse_dax_mapping *
+node_to_dmap(struct interval_tree_node *node)
+{
+ if (!node)
+ return NULL;
+
+ return container_of(node, struct fuse_dax_mapping, itn);
+}
+
+static struct fuse_dax_mapping *
+alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode);
+
+static void
+__kick_dmap_free_worker(struct fuse_conn_dax *fcd, unsigned long delay_ms)
+{
+ unsigned long free_threshold;
+
+ /* If number of free ranges are below threshold, start reclaim */
+ free_threshold = max_t(unsigned long, fcd->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD / 100,
+ 1);
+ if (fcd->nr_free_ranges < free_threshold)
+ queue_delayed_work(system_long_wq, &fcd->free_work,
+ msecs_to_jiffies(delay_ms));
+}
+
+static void kick_dmap_free_worker(struct fuse_conn_dax *fcd,
+ unsigned long delay_ms)
+{
+ spin_lock(&fcd->lock);
+ __kick_dmap_free_worker(fcd, delay_ms);
+ spin_unlock(&fcd->lock);
+}
+
+static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd)
+{
+ struct fuse_dax_mapping *dmap;
+
+ spin_lock(&fcd->lock);
+ dmap = list_first_entry_or_null(&fcd->free_ranges,
+ struct fuse_dax_mapping, list);
+ if (dmap) {
+ list_del_init(&dmap->list);
+ WARN_ON(fcd->nr_free_ranges <= 0);
+ fcd->nr_free_ranges--;
+ }
+ __kick_dmap_free_worker(fcd, 0);
+ spin_unlock(&fcd->lock);
+
+ return dmap;
+}
+
+/* This assumes fcd->lock is held */
+static void __dmap_remove_busy_list(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ list_del_init(&dmap->busy_list);
+ WARN_ON(fcd->nr_busy_ranges == 0);
+ fcd->nr_busy_ranges--;
+}
+
+static void dmap_remove_busy_list(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ spin_lock(&fcd->lock);
+ __dmap_remove_busy_list(fcd, dmap);
+ spin_unlock(&fcd->lock);
+}
+
+/* This assumes fcd->lock is held */
+static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ list_add_tail(&dmap->list, &fcd->free_ranges);
+ fcd->nr_free_ranges++;
+ wake_up(&fcd->range_waitq);
+}
+
+static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ /* Return fuse_dax_mapping to free list */
+ spin_lock(&fcd->lock);
+ __dmap_add_to_free_pool(fcd, dmap);
+ spin_unlock(&fcd->lock);
+}
+
+static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx,
+ struct fuse_dax_mapping *dmap, bool writable,
+ bool upgrade)
+{
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_conn_dax *fcd = fm->fc->dax;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_setupmapping_in inarg;
+ loff_t offset = start_idx << FUSE_DAX_SHIFT;
+ FUSE_ARGS(args);
+ ssize_t err;
+
+ WARN_ON(fcd->nr_free_ranges < 0);
+
+ /* Ask fuse daemon to setup mapping */
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.foffset = offset;
+ inarg.fh = -1;
+ inarg.moffset = dmap->window_offset;
+ inarg.len = FUSE_DAX_SZ;
+ inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
+ if (writable)
+ inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
+ args.opcode = FUSE_SETUPMAPPING;
+ args.nodeid = fi->nodeid;
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ err = fuse_simple_request(fm, &args);
+ if (err < 0)
+ return err;
+ dmap->writable = writable;
+ if (!upgrade) {
+ /*
+ * We don't take a reference on inode. inode is valid right now
+ * and when inode is going away, cleanup logic should first
+ * cleanup dmap entries.
+ */
+ dmap->inode = inode;
+ dmap->itn.start = dmap->itn.last = start_idx;
+ /* Protected by fi->dax->sem */
+ interval_tree_insert(&dmap->itn, &fi->dax->tree);
+ fi->dax->nr++;
+ spin_lock(&fcd->lock);
+ list_add_tail(&dmap->busy_list, &fcd->busy_ranges);
+ fcd->nr_busy_ranges++;
+ spin_unlock(&fcd->lock);
+ }
+ return 0;
+}
+
+static int fuse_send_removemapping(struct inode *inode,
+ struct fuse_removemapping_in *inargp,
+ struct fuse_removemapping_one *remove_one)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ FUSE_ARGS(args);
+
+ args.opcode = FUSE_REMOVEMAPPING;
+ args.nodeid = fi->nodeid;
+ args.in_numargs = 2;
+ args.in_args[0].size = sizeof(*inargp);
+ args.in_args[0].value = inargp;
+ args.in_args[1].size = inargp->count * sizeof(*remove_one);
+ args.in_args[1].value = remove_one;
+ return fuse_simple_request(fm, &args);
+}
+
+static int dmap_removemapping_list(struct inode *inode, unsigned int num,
+ struct list_head *to_remove)
+{
+ struct fuse_removemapping_one *remove_one, *ptr;
+ struct fuse_removemapping_in inarg;
+ struct fuse_dax_mapping *dmap;
+ int ret, i = 0, nr_alloc;
+
+ nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY);
+ remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS);
+ if (!remove_one)
+ return -ENOMEM;
+
+ ptr = remove_one;
+ list_for_each_entry(dmap, to_remove, list) {
+ ptr->moffset = dmap->window_offset;
+ ptr->len = dmap->length;
+ ptr++;
+ i++;
+ num--;
+ if (i >= nr_alloc || num == 0) {
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.count = i;
+ ret = fuse_send_removemapping(inode, &inarg,
+ remove_one);
+ if (ret)
+ goto out;
+ ptr = remove_one;
+ i = 0;
+ }
+ }
+out:
+ kfree(remove_one);
+ return ret;
+}
+
+/*
+ * Cleanup dmap entry and add back to free list. This should be called with
+ * fcd->lock held.
+ */
+static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd,
+ struct fuse_dax_mapping *dmap)
+{
+ pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n",
+ dmap->itn.start, dmap->itn.last, dmap->window_offset,
+ dmap->length);
+ __dmap_remove_busy_list(fcd, dmap);
+ dmap->inode = NULL;
+ dmap->itn.start = dmap->itn.last = 0;
+ __dmap_add_to_free_pool(fcd, dmap);
+}
+
+/*
+ * Free inode dmap entries whose range falls inside [start, end].
+ * Does not take any locks. At this point of time it should only be
+ * called from evict_inode() path where we know all dmap entries can be
+ * reclaimed.
+ */
+static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd,
+ struct inode *inode,
+ loff_t start, loff_t end)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap, *n;
+ int err, num = 0;
+ LIST_HEAD(to_remove);
+ unsigned long start_idx = start >> FUSE_DAX_SHIFT;
+ unsigned long end_idx = end >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ while (1) {
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx,
+ end_idx);
+ if (!node)
+ break;
+ dmap = node_to_dmap(node);
+ /* inode is going away. There should not be any users of dmap */
+ WARN_ON(refcount_read(&dmap->refcnt) > 1);
+ interval_tree_remove(&dmap->itn, &fi->dax->tree);
+ num++;
+ list_add(&dmap->list, &to_remove);
+ }
+
+ /* Nothing to remove */
+ if (list_empty(&to_remove))
+ return;
+
+ WARN_ON(fi->dax->nr < num);
+ fi->dax->nr -= num;
+ err = dmap_removemapping_list(inode, num, &to_remove);
+ if (err && err != -ENOTCONN) {
+ pr_warn("Failed to removemappings. start=0x%llx end=0x%llx\n",
+ start, end);
+ }
+ spin_lock(&fcd->lock);
+ list_for_each_entry_safe(dmap, n, &to_remove, list) {
+ list_del_init(&dmap->list);
+ dmap_reinit_add_to_free_pool(fcd, dmap);
+ }
+ spin_unlock(&fcd->lock);
+}
+
+static int dmap_removemapping_one(struct inode *inode,
+ struct fuse_dax_mapping *dmap)
+{
+ struct fuse_removemapping_one forget_one;
+ struct fuse_removemapping_in inarg;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.count = 1;
+ memset(&forget_one, 0, sizeof(forget_one));
+ forget_one.moffset = dmap->window_offset;
+ forget_one.len = dmap->length;
+
+ return fuse_send_removemapping(inode, &inarg, &forget_one);
+}
+
+/*
+ * It is called from evict_inode() and by that time inode is going away. So
+ * this function does not take any locks like fi->dax->sem for traversing
+ * that fuse inode interval tree. If that lock is taken then lock validator
+ * complains of deadlock situation w.r.t fs_reclaim lock.
+ */
+void fuse_dax_inode_cleanup(struct inode *inode)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ /*
+ * fuse_evict_inode() has already called truncate_inode_pages_final()
+ * before we arrive here. So we should not have to worry about any
+ * pages/exception entries still associated with inode.
+ */
+ inode_reclaim_dmap_range(fc->dax, inode, 0, -1);
+ WARN_ON(fi->dax->nr);
+}
+
+static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
+{
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->length = length;
+ iomap->type = IOMAP_HOLE;
+}
+
+static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap *iomap, struct fuse_dax_mapping *dmap,
+ unsigned int flags)
+{
+ loff_t offset, len;
+ loff_t i_size = i_size_read(inode);
+
+ offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT);
+ len = min(length, dmap->length - offset);
+
+ /* If length is beyond end of file, truncate further */
+ if (pos + len > i_size)
+ len = i_size - pos;
+
+ if (len > 0) {
+ iomap->addr = dmap->window_offset + offset;
+ iomap->length = len;
+ if (flags & IOMAP_FAULT)
+ iomap->length = ALIGN(len, PAGE_SIZE);
+ iomap->type = IOMAP_MAPPED;
+ /*
+ * increace refcnt so that reclaim code knows this dmap is in
+ * use. This assumes fi->dax->sem mutex is held either
+ * shared/exclusive.
+ */
+ refcount_inc(&dmap->refcnt);
+
+ /* iomap->private should be NULL */
+ WARN_ON_ONCE(iomap->private);
+ iomap->private = dmap;
+ } else {
+ /* Mapping beyond end of file is hole */
+ fuse_fill_iomap_hole(iomap, length);
+ }
+}
+
+static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_conn_dax *fcd = fc->dax;
+ struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
+ int ret;
+ bool writable = flags & IOMAP_WRITE;
+ unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ /*
+ * Can't do inline reclaim in fault path. We call
+ * dax_layout_busy_page() before we free a range. And
+ * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it.
+ * In fault path we enter with mapping->invalidate_lock held and can't
+ * drop it. Also in fault path we hold mapping->invalidate_lock shared
+ * and not exclusive, so that creates further issues with
+ * fuse_wait_dax_page(). Hence return -EAGAIN and fuse_dax_fault()
+ * will wait for a memory range to become free and retry.
+ */
+ if (flags & IOMAP_FAULT) {
+ alloc_dmap = alloc_dax_mapping(fcd);
+ if (!alloc_dmap)
+ return -EAGAIN;
+ } else {
+ alloc_dmap = alloc_dax_mapping_reclaim(fcd, inode);
+ if (IS_ERR(alloc_dmap))
+ return PTR_ERR(alloc_dmap);
+ }
+
+ /* If we are here, we should have memory allocated */
+ if (WARN_ON(!alloc_dmap))
+ return -EIO;
+
+ /*
+ * Take write lock so that only one caller can try to setup mapping
+ * and other waits.
+ */
+ down_write(&fi->dax->sem);
+ /*
+ * We dropped lock. Check again if somebody else setup
+ * mapping already.
+ */
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+ if (node) {
+ dmap = node_to_dmap(node);
+ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+ dmap_add_to_free_pool(fcd, alloc_dmap);
+ up_write(&fi->dax->sem);
+ return 0;
+ }
+
+ /* Setup one mapping */
+ ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap,
+ writable, false);
+ if (ret < 0) {
+ dmap_add_to_free_pool(fcd, alloc_dmap);
+ up_write(&fi->dax->sem);
+ return ret;
+ }
+ fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
+ up_write(&fi->dax->sem);
+ return 0;
+}
+
+static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ int ret;
+ unsigned long idx = pos >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ /*
+ * Take exclusive lock so that only one caller can try to setup
+ * mapping and others wait.
+ */
+ down_write(&fi->dax->sem);
+ node = interval_tree_iter_first(&fi->dax->tree, idx, idx);
+
+ /* We are holding either inode lock or invalidate_lock, and that should
+ * ensure that dmap can't be truncated. We are holding a reference
+ * on dmap and that should make sure it can't be reclaimed. So dmap
+ * should still be there in tree despite the fact we dropped and
+ * re-acquired the fi->dax->sem lock.
+ */
+ ret = -EIO;
+ if (WARN_ON(!node))
+ goto out_err;
+
+ dmap = node_to_dmap(node);
+
+ /* We took an extra reference on dmap to make sure its not reclaimd.
+ * Now we hold fi->dax->sem lock and that reference is not needed
+ * anymore. Drop it.
+ */
+ if (refcount_dec_and_test(&dmap->refcnt)) {
+ /* refcount should not hit 0. This object only goes
+ * away when fuse connection goes away
+ */
+ WARN_ON_ONCE(1);
+ }
+
+ /* Maybe another thread already upgraded mapping while we were not
+ * holding lock.
+ */
+ if (dmap->writable) {
+ ret = 0;
+ goto out_fill_iomap;
+ }
+
+ ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true,
+ true);
+ if (ret < 0)
+ goto out_err;
+out_fill_iomap:
+ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+out_err:
+ up_write(&fi->dax->sem);
+ return ret;
+}
+
+/* This is just for DAX and the mapping is ephemeral, do not use it for other
+ * purposes since there is no block device with a permanent mapping.
+ */
+static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+ unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_dax_mapping *dmap;
+ bool writable = flags & IOMAP_WRITE;
+ unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+ struct interval_tree_node *node;
+
+ /* We don't support FIEMAP */
+ if (WARN_ON(flags & IOMAP_REPORT))
+ return -EIO;
+
+ iomap->offset = pos;
+ iomap->flags = 0;
+ iomap->bdev = NULL;
+ iomap->dax_dev = fc->dax->dev;
+
+ /*
+ * Both read/write and mmap path can race here. So we need something
+ * to make sure if we are setting up mapping, then other path waits
+ *
+ * For now, use a semaphore for this. It probably needs to be
+ * optimized later.
+ */
+ down_read(&fi->dax->sem);
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+ if (node) {
+ dmap = node_to_dmap(node);
+ if (writable && !dmap->writable) {
+ /* Upgrade read-only mapping to read-write. This will
+ * require exclusive fi->dax->sem lock as we don't want
+ * two threads to be trying to this simultaneously
+ * for same dmap. So drop shared lock and acquire
+ * exclusive lock.
+ *
+ * Before dropping fi->dax->sem lock, take reference
+ * on dmap so that its not freed by range reclaim.
+ */
+ refcount_inc(&dmap->refcnt);
+ up_read(&fi->dax->sem);
+ pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n",
+ __func__, pos, length);
+ return fuse_upgrade_dax_mapping(inode, pos, length,
+ flags, iomap);
+ } else {
+ fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+ up_read(&fi->dax->sem);
+ return 0;
+ }
+ } else {
+ up_read(&fi->dax->sem);
+ pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
+ __func__, pos, length);
+ if (pos >= i_size_read(inode))
+ goto iomap_hole;
+
+ return fuse_setup_new_dax_mapping(inode, pos, length, flags,
+ iomap);
+ }
+
+ /*
+ * If read beyond end of file happens, fs code seems to return
+ * it as hole
+ */
+iomap_hole:
+ fuse_fill_iomap_hole(iomap, length);
+ pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n",
+ __func__, pos, length, iomap->length);
+ return 0;
+}
+
+static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags,
+ struct iomap *iomap)
+{
+ struct fuse_dax_mapping *dmap = iomap->private;
+
+ if (dmap) {
+ if (refcount_dec_and_test(&dmap->refcnt)) {
+ /* refcount should not hit 0. This object only goes
+ * away when fuse connection goes away
+ */
+ WARN_ON_ONCE(1);
+ }
+ }
+
+ /* DAX writes beyond end-of-file aren't handled using iomap, so the
+ * file size is unchanged and there is nothing to do here.
+ */
+ return 0;
+}
+
+static const struct iomap_ops fuse_iomap_ops = {
+ .iomap_begin = fuse_iomap_begin,
+ .iomap_end = fuse_iomap_end,
+};
+
+static void fuse_wait_dax_page(struct inode *inode)
+{
+ filemap_invalidate_unlock(inode->i_mapping);
+ schedule();
+ filemap_invalidate_lock(inode->i_mapping);
+}
+
+/* Should be called with mapping->invalidate_lock held exclusively */
+static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
+ loff_t start, loff_t end)
+{
+ struct page *page;
+
+ page = dax_layout_busy_page_range(inode->i_mapping, start, end);
+ if (!page)
+ return 0;
+
+ *retry = true;
+ return ___wait_var_event(&page->_refcount,
+ atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
+ 0, 0, fuse_wait_dax_page(inode));
+}
+
+/* dmap_end == 0 leads to unmapping of whole file */
+int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
+ u64 dmap_end)
+{
+ bool retry;
+ int ret;
+
+ do {
+ retry = false;
+ ret = __fuse_dax_break_layouts(inode, &retry, dmap_start,
+ dmap_end);
+ } while (ret == 0 && retry);
+
+ return ret;
+}
+
+ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock_shared(inode);
+ }
+
+ ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops);
+ inode_unlock_shared(inode);
+
+ /* TODO file_accessed(iocb->f_filp) */
+ return ret;
+}
+
+static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ return (iov_iter_rw(from) == WRITE &&
+ ((iocb->ki_pos) >= i_size_read(inode) ||
+ (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode))));
+}
+
+static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+ ssize_t ret;
+
+ ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
+
+ fuse_write_update_attr(inode, iocb->ki_pos, ret);
+ return ret;
+}
+
+ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock(inode);
+ }
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+
+ ret = file_remove_privs(iocb->ki_filp);
+ if (ret)
+ goto out;
+ /* TODO file_update_time() but we don't want metadata I/O */
+
+ /* Do not use dax for file extending writes as write and on
+ * disk i_size increase are not atomic otherwise.
+ */
+ if (file_extending_write(iocb, from))
+ ret = fuse_dax_direct_write(iocb, from);
+ else
+ ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops);
+
+out:
+ inode_unlock(inode);
+
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
+}
+
+static int fuse_dax_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+
+ struct inode *inode = mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc);
+}
+
+static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size, bool write)
+{
+ vm_fault_t ret;
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct super_block *sb = inode->i_sb;
+ pfn_t pfn;
+ int error = 0;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_conn_dax *fcd = fc->dax;
+ bool retry = false;
+
+ if (write)
+ sb_start_pagefault(sb);
+retry:
+ if (retry && !(fcd->nr_free_ranges > 0))
+ wait_event(fcd->range_waitq, (fcd->nr_free_ranges > 0));
+
+ /*
+ * We need to serialize against not only truncate but also against
+ * fuse dax memory range reclaim. While a range is being reclaimed,
+ * we do not want any read/write/mmap to make progress and try
+ * to populate page cache or access memory we are trying to free.
+ */
+ filemap_invalidate_lock_shared(inode->i_mapping);
+ ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops);
+ if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) {
+ error = 0;
+ retry = true;
+ filemap_invalidate_unlock_shared(inode->i_mapping);
+ goto retry;
+ }
+
+ if (ret & VM_FAULT_NEEDDSYNC)
+ ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
+
+ if (write)
+ sb_end_pagefault(sb);
+
+ return ret;
+}
+
+static vm_fault_t fuse_dax_fault(struct vm_fault *vmf)
+{
+ return __fuse_dax_fault(vmf, PE_SIZE_PTE,
+ vmf->flags & FAULT_FLAG_WRITE);
+}
+
+static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size)
+{
+ return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE);
+}
+
+static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf)
+{
+ return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
+}
+
+static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf)
+{
+ return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
+}
+
+static const struct vm_operations_struct fuse_dax_vm_ops = {
+ .fault = fuse_dax_fault,
+ .huge_fault = fuse_dax_huge_fault,
+ .page_mkwrite = fuse_dax_page_mkwrite,
+ .pfn_mkwrite = fuse_dax_pfn_mkwrite,
+};
+
+int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ file_accessed(file);
+ vma->vm_ops = &fuse_dax_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ return 0;
+}
+
+static int dmap_writeback_invalidate(struct inode *inode,
+ struct fuse_dax_mapping *dmap)
+{
+ int ret;
+ loff_t start_pos = dmap->itn.start << FUSE_DAX_SHIFT;
+ loff_t end_pos = (start_pos + FUSE_DAX_SZ - 1);
+
+ ret = filemap_fdatawrite_range(inode->i_mapping, start_pos, end_pos);
+ if (ret) {
+ pr_debug("fuse: filemap_fdatawrite_range() failed. err=%d start_pos=0x%llx, end_pos=0x%llx\n",
+ ret, start_pos, end_pos);
+ return ret;
+ }
+
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ start_pos >> PAGE_SHIFT,
+ end_pos >> PAGE_SHIFT);
+ if (ret)
+ pr_debug("fuse: invalidate_inode_pages2_range() failed err=%d\n",
+ ret);
+
+ return ret;
+}
+
+static int reclaim_one_dmap_locked(struct inode *inode,
+ struct fuse_dax_mapping *dmap)
+{
+ int ret;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ /*
+ * igrab() was done to make sure inode won't go under us, and this
+ * further avoids the race with evict().
+ */
+ ret = dmap_writeback_invalidate(inode, dmap);
+ if (ret)
+ return ret;
+
+ /* Remove dax mapping from inode interval tree now */
+ interval_tree_remove(&dmap->itn, &fi->dax->tree);
+ fi->dax->nr--;
+
+ /* It is possible that umount/shutdown has killed the fuse connection
+ * and worker thread is trying to reclaim memory in parallel. Don't
+ * warn in that case.
+ */
+ ret = dmap_removemapping_one(inode, dmap);
+ if (ret && ret != -ENOTCONN) {
+ pr_warn("Failed to remove mapping. offset=0x%llx len=0x%llx ret=%d\n",
+ dmap->window_offset, dmap->length, ret);
+ }
+ return 0;
+}
+
+/* Find first mapped dmap for an inode and return file offset. Caller needs
+ * to hold fi->dax->sem lock either shared or exclusive.
+ */
+static struct fuse_dax_mapping *inode_lookup_first_dmap(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ struct interval_tree_node *node;
+
+ for (node = interval_tree_iter_first(&fi->dax->tree, 0, -1); node;
+ node = interval_tree_iter_next(node, 0, -1)) {
+ dmap = node_to_dmap(node);
+ /* still in use. */
+ if (refcount_read(&dmap->refcnt) > 1)
+ continue;
+
+ return dmap;
+ }
+
+ return NULL;
+}
+
+/*
+ * Find first mapping in the tree and free it and return it. Do not add
+ * it back to free pool.
+ */
+static struct fuse_dax_mapping *
+inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
+ bool *retry)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ u64 dmap_start, dmap_end;
+ unsigned long start_idx;
+ int ret;
+ struct interval_tree_node *node;
+
+ filemap_invalidate_lock(inode->i_mapping);
+
+ /* Lookup a dmap and corresponding file offset to reclaim. */
+ down_read(&fi->dax->sem);
+ dmap = inode_lookup_first_dmap(inode);
+ if (dmap) {
+ start_idx = dmap->itn.start;
+ dmap_start = start_idx << FUSE_DAX_SHIFT;
+ dmap_end = dmap_start + FUSE_DAX_SZ - 1;
+ }
+ up_read(&fi->dax->sem);
+
+ if (!dmap)
+ goto out_mmap_sem;
+ /*
+ * Make sure there are no references to inode pages using
+ * get_user_pages()
+ */
+ ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
+ if (ret) {
+ pr_debug("fuse: fuse_dax_break_layouts() failed. err=%d\n",
+ ret);
+ dmap = ERR_PTR(ret);
+ goto out_mmap_sem;
+ }
+
+ down_write(&fi->dax->sem);
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+ /* Range already got reclaimed by somebody else */
+ if (!node) {
+ if (retry)
+ *retry = true;
+ goto out_write_dmap_sem;
+ }
+
+ dmap = node_to_dmap(node);
+ /* still in use. */
+ if (refcount_read(&dmap->refcnt) > 1) {
+ dmap = NULL;
+ if (retry)
+ *retry = true;
+ goto out_write_dmap_sem;
+ }
+
+ ret = reclaim_one_dmap_locked(inode, dmap);
+ if (ret < 0) {
+ dmap = ERR_PTR(ret);
+ goto out_write_dmap_sem;
+ }
+
+ /* Clean up dmap. Do not add back to free list */
+ dmap_remove_busy_list(fcd, dmap);
+ dmap->inode = NULL;
+ dmap->itn.start = dmap->itn.last = 0;
+
+ pr_debug("fuse: %s: inline reclaimed memory range. inode=%p, window_offset=0x%llx, length=0x%llx\n",
+ __func__, inode, dmap->window_offset, dmap->length);
+
+out_write_dmap_sem:
+ up_write(&fi->dax->sem);
+out_mmap_sem:
+ filemap_invalidate_unlock(inode->i_mapping);
+ return dmap;
+}
+
+static struct fuse_dax_mapping *
+alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
+{
+ struct fuse_dax_mapping *dmap;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ while (1) {
+ bool retry = false;
+
+ dmap = alloc_dax_mapping(fcd);
+ if (dmap)
+ return dmap;
+
+ dmap = inode_inline_reclaim_one_dmap(fcd, inode, &retry);
+ /*
+ * Either we got a mapping or it is an error, return in both
+ * the cases.
+ */
+ if (dmap)
+ return dmap;
+
+ /* If we could not reclaim a mapping because it
+ * had a reference or some other temporary failure,
+ * Try again. We want to give up inline reclaim only
+ * if there is no range assigned to this node. Otherwise
+ * if a deadlock is possible if we sleep with
+ * mapping->invalidate_lock held and worker to free memory
+ * can't make progress due to unavailability of
+ * mapping->invalidate_lock. So sleep only if fi->dax->nr=0
+ */
+ if (retry)
+ continue;
+ /*
+ * There are no mappings which can be reclaimed. Wait for one.
+ * We are not holding fi->dax->sem. So it is possible
+ * that range gets added now. But as we are not holding
+ * mapping->invalidate_lock, worker should still be able to
+ * free up a range and wake us up.
+ */
+ if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) {
+ if (wait_event_killable_exclusive(fcd->range_waitq,
+ (fcd->nr_free_ranges > 0))) {
+ return ERR_PTR(-EINTR);
+ }
+ }
+ }
+}
+
+static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd,
+ struct inode *inode,
+ unsigned long start_idx)
+{
+ int ret;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_dax_mapping *dmap;
+ struct interval_tree_node *node;
+
+ /* Find fuse dax mapping at file offset inode. */
+ node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+
+ /* Range already got cleaned up by somebody else */
+ if (!node)
+ return 0;
+ dmap = node_to_dmap(node);
+
+ /* still in use. */
+ if (refcount_read(&dmap->refcnt) > 1)
+ return 0;
+
+ ret = reclaim_one_dmap_locked(inode, dmap);
+ if (ret < 0)
+ return ret;
+
+ /* Cleanup dmap entry and add back to free list */
+ spin_lock(&fcd->lock);
+ dmap_reinit_add_to_free_pool(fcd, dmap);
+ spin_unlock(&fcd->lock);
+ return ret;
+}
+
+/*
+ * Free a range of memory.
+ * Locking:
+ * 1. Take mapping->invalidate_lock to block dax faults.
+ * 2. Take fi->dax->sem to protect interval tree and also to make sure
+ * read/write can not reuse a dmap which we might be freeing.
+ */
+static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
+ struct inode *inode,
+ unsigned long start_idx,
+ unsigned long end_idx)
+{
+ int ret;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ loff_t dmap_start = start_idx << FUSE_DAX_SHIFT;
+ loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1;
+
+ filemap_invalidate_lock(inode->i_mapping);
+ ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
+ if (ret) {
+ pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n",
+ ret);
+ goto out_mmap_sem;
+ }
+
+ down_write(&fi->dax->sem);
+ ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx);
+ up_write(&fi->dax->sem);
+out_mmap_sem:
+ filemap_invalidate_unlock(inode->i_mapping);
+ return ret;
+}
+
+static int try_to_free_dmap_chunks(struct fuse_conn_dax *fcd,
+ unsigned long nr_to_free)
+{
+ struct fuse_dax_mapping *dmap, *pos, *temp;
+ int ret, nr_freed = 0;
+ unsigned long start_idx = 0, end_idx = 0;
+ struct inode *inode = NULL;
+
+ /* Pick first busy range and free it for now*/
+ while (1) {
+ if (nr_freed >= nr_to_free)
+ break;
+
+ dmap = NULL;
+ spin_lock(&fcd->lock);
+
+ if (!fcd->nr_busy_ranges) {
+ spin_unlock(&fcd->lock);
+ return 0;
+ }
+
+ list_for_each_entry_safe(pos, temp, &fcd->busy_ranges,
+ busy_list) {
+ /* skip this range if it's in use. */
+ if (refcount_read(&pos->refcnt) > 1)
+ continue;
+
+ inode = igrab(pos->inode);
+ /*
+ * This inode is going away. That will free
+ * up all the ranges anyway, continue to
+ * next range.
+ */
+ if (!inode)
+ continue;
+ /*
+ * Take this element off list and add it tail. If
+ * this element can't be freed, it will help with
+ * selecting new element in next iteration of loop.
+ */
+ dmap = pos;
+ list_move_tail(&dmap->busy_list, &fcd->busy_ranges);
+ start_idx = end_idx = dmap->itn.start;
+ break;
+ }
+ spin_unlock(&fcd->lock);
+ if (!dmap)
+ return 0;
+
+ ret = lookup_and_reclaim_dmap(fcd, inode, start_idx, end_idx);
+ iput(inode);
+ if (ret)
+ return ret;
+ nr_freed++;
+ }
+ return 0;
+}
+
+static void fuse_dax_free_mem_worker(struct work_struct *work)
+{
+ int ret;
+ struct fuse_conn_dax *fcd = container_of(work, struct fuse_conn_dax,
+ free_work.work);
+ ret = try_to_free_dmap_chunks(fcd, FUSE_DAX_RECLAIM_CHUNK);
+ if (ret) {
+ pr_debug("fuse: try_to_free_dmap_chunks() failed with err=%d\n",
+ ret);
+ }
+
+ /* If number of free ranges are still below threshold, requeue */
+ kick_dmap_free_worker(fcd, 1);
+}
+
+static void fuse_free_dax_mem_ranges(struct list_head *mem_list)
+{
+ struct fuse_dax_mapping *range, *temp;
+
+ /* Free All allocated elements */
+ list_for_each_entry_safe(range, temp, mem_list, list) {
+ list_del(&range->list);
+ if (!list_empty(&range->busy_list))
+ list_del(&range->busy_list);
+ kfree(range);
+ }
+}
+
+void fuse_dax_conn_free(struct fuse_conn *fc)
+{
+ if (fc->dax) {
+ fuse_free_dax_mem_ranges(&fc->dax->free_ranges);
+ kfree(fc->dax);
+ fc->dax = NULL;
+ }
+}
+
+static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd)
+{
+ long nr_pages, nr_ranges;
+ struct fuse_dax_mapping *range;
+ int ret, id;
+ size_t dax_size = -1;
+ unsigned long i;
+
+ init_waitqueue_head(&fcd->range_waitq);
+ INIT_LIST_HEAD(&fcd->free_ranges);
+ INIT_LIST_HEAD(&fcd->busy_ranges);
+ INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker);
+
+ id = dax_read_lock();
+ nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size),
+ DAX_ACCESS, NULL, NULL);
+ dax_read_unlock(id);
+ if (nr_pages < 0) {
+ pr_debug("dax_direct_access() returned %ld\n", nr_pages);
+ return nr_pages;
+ }
+
+ nr_ranges = nr_pages/FUSE_DAX_PAGES;
+ pr_debug("%s: dax mapped %ld pages. nr_ranges=%ld\n",
+ __func__, nr_pages, nr_ranges);
+
+ for (i = 0; i < nr_ranges; i++) {
+ range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!range)
+ goto out_err;
+
+ /* TODO: This offset only works if virtio-fs driver is not
+ * having some memory hidden at the beginning. This needs
+ * better handling
+ */
+ range->window_offset = i * FUSE_DAX_SZ;
+ range->length = FUSE_DAX_SZ;
+ INIT_LIST_HEAD(&range->busy_list);
+ refcount_set(&range->refcnt, 1);
+ list_add_tail(&range->list, &fcd->free_ranges);
+ }
+
+ fcd->nr_free_ranges = nr_ranges;
+ fcd->nr_ranges = nr_ranges;
+ return 0;
+out_err:
+ /* Free All allocated elements */
+ fuse_free_dax_mem_ranges(&fcd->free_ranges);
+ return ret;
+}
+
+int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode dax_mode,
+ struct dax_device *dax_dev)
+{
+ struct fuse_conn_dax *fcd;
+ int err;
+
+ fc->dax_mode = dax_mode;
+
+ if (!dax_dev)
+ return 0;
+
+ fcd = kzalloc(sizeof(*fcd), GFP_KERNEL);
+ if (!fcd)
+ return -ENOMEM;
+
+ spin_lock_init(&fcd->lock);
+ fcd->dev = dax_dev;
+ err = fuse_dax_mem_range_init(fcd);
+ if (err) {
+ kfree(fcd);
+ return err;
+ }
+
+ fc->dax = fcd;
+ return 0;
+}
+
+bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+ fi->dax = NULL;
+ if (fc->dax) {
+ fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT);
+ if (!fi->dax)
+ return false;
+
+ init_rwsem(&fi->dax->sem);
+ fi->dax->tree = RB_ROOT_CACHED;
+ }
+
+ return true;
+}
+
+static const struct address_space_operations fuse_dax_file_aops = {
+ .writepages = fuse_dax_writepages,
+ .direct_IO = noop_direct_IO,
+ .dirty_folio = noop_dirty_folio,
+};
+
+static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ enum fuse_dax_mode dax_mode = fc->dax_mode;
+
+ if (dax_mode == FUSE_DAX_NEVER)
+ return false;
+
+ /*
+ * fc->dax may be NULL in 'inode' mode when filesystem device doesn't
+ * support DAX, in which case it will silently fallback to 'never' mode.
+ */
+ if (!fc->dax)
+ return false;
+
+ if (dax_mode == FUSE_DAX_ALWAYS)
+ return true;
+
+ /* dax_mode is FUSE_DAX_INODE* */
+ return fc->inode_dax && (flags & FUSE_ATTR_DAX);
+}
+
+void fuse_dax_inode_init(struct inode *inode, unsigned int flags)
+{
+ if (!fuse_should_enable_dax(inode, flags))
+ return;
+
+ inode->i_flags |= S_DAX;
+ inode->i_data.a_ops = &fuse_dax_file_aops;
+}
+
+void fuse_dax_dontcache(struct inode *inode, unsigned int flags)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (fuse_is_inode_dax_mode(fc->dax_mode) &&
+ ((bool) IS_DAX(inode) != (bool) (flags & FUSE_ATTR_DAX)))
+ d_mark_dontcache(inode);
+}
+
+bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment)
+{
+ if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) {
+ pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n",
+ map_alignment, FUSE_DAX_SZ);
+ return false;
+ }
+ return true;
+}
+
+void fuse_dax_cancel_work(struct fuse_conn *fc)
+{
+ struct fuse_conn_dax *fcd = fc->dax;
+
+ if (fcd)
+ cancel_delayed_work_sync(&fcd->free_work);
+
+}
+EXPORT_SYMBOL_GPL(fuse_dax_cancel_work);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
new file mode 100644
index 000000000..b4a6e0a1b
--- /dev/null
+++ b/fs/fuse/dev.c
@@ -0,0 +1,2337 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/sched/signal.h>
+#include <linux/uio.h>
+#include <linux/miscdevice.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/swap.h>
+#include <linux/splice.h>
+#include <linux/sched.h>
+
+MODULE_ALIAS_MISCDEV(FUSE_MINOR);
+MODULE_ALIAS("devname:fuse");
+
+/* Ordinary requests have even IDs, while interrupts IDs are odd */
+#define FUSE_INT_REQ_BIT (1ULL << 0)
+#define FUSE_REQ_ID_STEP (1ULL << 1)
+
+static struct kmem_cache *fuse_req_cachep;
+
+static struct fuse_dev *fuse_get_dev(struct file *file)
+{
+ /*
+ * Lockless access is OK, because file->private data is set
+ * once during mount and is valid until the file is released.
+ */
+ return READ_ONCE(file->private_data);
+}
+
+static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
+{
+ INIT_LIST_HEAD(&req->list);
+ INIT_LIST_HEAD(&req->intr_entry);
+ init_waitqueue_head(&req->waitq);
+ refcount_set(&req->count, 1);
+ __set_bit(FR_PENDING, &req->flags);
+ req->fm = fm;
+}
+
+static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags)
+{
+ struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags);
+ if (req)
+ fuse_request_init(fm, req);
+
+ return req;
+}
+
+static void fuse_request_free(struct fuse_req *req)
+{
+ kmem_cache_free(fuse_req_cachep, req);
+}
+
+static void __fuse_get_request(struct fuse_req *req)
+{
+ refcount_inc(&req->count);
+}
+
+/* Must be called with > 1 refcount */
+static void __fuse_put_request(struct fuse_req *req)
+{
+ refcount_dec(&req->count);
+}
+
+void fuse_set_initialized(struct fuse_conn *fc)
+{
+ /* Make sure stores before this are seen on another CPU */
+ smp_wmb();
+ fc->initialized = 1;
+}
+
+static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
+{
+ return !fc->initialized || (for_background && fc->blocked);
+}
+
+static void fuse_drop_waiting(struct fuse_conn *fc)
+{
+ /*
+ * lockess check of fc->connected is okay, because atomic_dec_and_test()
+ * provides a memory barrier matched with the one in fuse_wait_aborted()
+ * to ensure no wake-up is missed.
+ */
+ if (atomic_dec_and_test(&fc->num_waiting) &&
+ !READ_ONCE(fc->connected)) {
+ /* wake up aborters */
+ wake_up_all(&fc->blocked_waitq);
+ }
+}
+
+static void fuse_put_request(struct fuse_req *req);
+
+static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background)
+{
+ struct fuse_conn *fc = fm->fc;
+ struct fuse_req *req;
+ int err;
+ atomic_inc(&fc->num_waiting);
+
+ if (fuse_block_alloc(fc, for_background)) {
+ err = -EINTR;
+ if (wait_event_killable_exclusive(fc->blocked_waitq,
+ !fuse_block_alloc(fc, for_background)))
+ goto out;
+ }
+ /* Matches smp_wmb() in fuse_set_initialized() */
+ smp_rmb();
+
+ err = -ENOTCONN;
+ if (!fc->connected)
+ goto out;
+
+ err = -ECONNREFUSED;
+ if (fc->conn_error)
+ goto out;
+
+ req = fuse_request_alloc(fm, GFP_KERNEL);
+ err = -ENOMEM;
+ if (!req) {
+ if (for_background)
+ wake_up(&fc->blocked_waitq);
+ goto out;
+ }
+
+ req->in.h.uid = from_kuid(fc->user_ns, current_fsuid());
+ req->in.h.gid = from_kgid(fc->user_ns, current_fsgid());
+ req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
+
+ __set_bit(FR_WAITING, &req->flags);
+ if (for_background)
+ __set_bit(FR_BACKGROUND, &req->flags);
+
+ if (unlikely(req->in.h.uid == ((uid_t)-1) ||
+ req->in.h.gid == ((gid_t)-1))) {
+ fuse_put_request(req);
+ return ERR_PTR(-EOVERFLOW);
+ }
+ return req;
+
+ out:
+ fuse_drop_waiting(fc);
+ return ERR_PTR(err);
+}
+
+static void fuse_put_request(struct fuse_req *req)
+{
+ struct fuse_conn *fc = req->fm->fc;
+
+ if (refcount_dec_and_test(&req->count)) {
+ if (test_bit(FR_BACKGROUND, &req->flags)) {
+ /*
+ * We get here in the unlikely case that a background
+ * request was allocated but not sent
+ */
+ spin_lock(&fc->bg_lock);
+ if (!fc->blocked)
+ wake_up(&fc->blocked_waitq);
+ spin_unlock(&fc->bg_lock);
+ }
+
+ if (test_bit(FR_WAITING, &req->flags)) {
+ __clear_bit(FR_WAITING, &req->flags);
+ fuse_drop_waiting(fc);
+ }
+
+ fuse_request_free(req);
+ }
+}
+
+unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args)
+{
+ unsigned nbytes = 0;
+ unsigned i;
+
+ for (i = 0; i < numargs; i++)
+ nbytes += args[i].size;
+
+ return nbytes;
+}
+EXPORT_SYMBOL_GPL(fuse_len_args);
+
+u64 fuse_get_unique(struct fuse_iqueue *fiq)
+{
+ fiq->reqctr += FUSE_REQ_ID_STEP;
+ return fiq->reqctr;
+}
+EXPORT_SYMBOL_GPL(fuse_get_unique);
+
+static unsigned int fuse_req_hash(u64 unique)
+{
+ return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS);
+}
+
+/**
+ * A new request is available, wake fiq->waitq
+ */
+static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->lock)
+{
+ wake_up(&fiq->waitq);
+ kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+ spin_unlock(&fiq->lock);
+}
+
+const struct fuse_iqueue_ops fuse_dev_fiq_ops = {
+ .wake_forget_and_unlock = fuse_dev_wake_and_unlock,
+ .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock,
+ .wake_pending_and_unlock = fuse_dev_wake_and_unlock,
+};
+EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops);
+
+static void queue_request_and_unlock(struct fuse_iqueue *fiq,
+ struct fuse_req *req)
+__releases(fiq->lock)
+{
+ req->in.h.len = sizeof(struct fuse_in_header) +
+ fuse_len_args(req->args->in_numargs,
+ (struct fuse_arg *) req->args->in_args);
+ list_add_tail(&req->list, &fiq->pending);
+ fiq->ops->wake_pending_and_unlock(fiq);
+}
+
+void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
+ u64 nodeid, u64 nlookup)
+{
+ struct fuse_iqueue *fiq = &fc->iq;
+
+ forget->forget_one.nodeid = nodeid;
+ forget->forget_one.nlookup = nlookup;
+
+ spin_lock(&fiq->lock);
+ if (fiq->connected) {
+ fiq->forget_list_tail->next = forget;
+ fiq->forget_list_tail = forget;
+ fiq->ops->wake_forget_and_unlock(fiq);
+ } else {
+ kfree(forget);
+ spin_unlock(&fiq->lock);
+ }
+}
+
+static void flush_bg_queue(struct fuse_conn *fc)
+{
+ struct fuse_iqueue *fiq = &fc->iq;
+
+ while (fc->active_background < fc->max_background &&
+ !list_empty(&fc->bg_queue)) {
+ struct fuse_req *req;
+
+ req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
+ list_del(&req->list);
+ fc->active_background++;
+ spin_lock(&fiq->lock);
+ req->in.h.unique = fuse_get_unique(fiq);
+ queue_request_and_unlock(fiq, req);
+ }
+}
+
+/*
+ * This function is called when a request is finished. Either a reply
+ * has arrived or it was aborted (and not yet sent) or some error
+ * occurred during communication with userspace, or the device file
+ * was closed. The requester thread is woken up (if still waiting),
+ * the 'end' callback is called if given, else the reference to the
+ * request is released
+ */
+void fuse_request_end(struct fuse_req *req)
+{
+ struct fuse_mount *fm = req->fm;
+ struct fuse_conn *fc = fm->fc;
+ struct fuse_iqueue *fiq = &fc->iq;
+
+ if (test_and_set_bit(FR_FINISHED, &req->flags))
+ goto put_request;
+
+ /*
+ * test_and_set_bit() implies smp_mb() between bit
+ * changing and below FR_INTERRUPTED check. Pairs with
+ * smp_mb() from queue_interrupt().
+ */
+ if (test_bit(FR_INTERRUPTED, &req->flags)) {
+ spin_lock(&fiq->lock);
+ list_del_init(&req->intr_entry);
+ spin_unlock(&fiq->lock);
+ }
+ WARN_ON(test_bit(FR_PENDING, &req->flags));
+ WARN_ON(test_bit(FR_SENT, &req->flags));
+ if (test_bit(FR_BACKGROUND, &req->flags)) {
+ spin_lock(&fc->bg_lock);
+ clear_bit(FR_BACKGROUND, &req->flags);
+ if (fc->num_background == fc->max_background) {
+ fc->blocked = 0;
+ wake_up(&fc->blocked_waitq);
+ } else if (!fc->blocked) {
+ /*
+ * Wake up next waiter, if any. It's okay to use
+ * waitqueue_active(), as we've already synced up
+ * fc->blocked with waiters with the wake_up() call
+ * above.
+ */
+ if (waitqueue_active(&fc->blocked_waitq))
+ wake_up(&fc->blocked_waitq);
+ }
+
+ fc->num_background--;
+ fc->active_background--;
+ flush_bg_queue(fc);
+ spin_unlock(&fc->bg_lock);
+ } else {
+ /* Wake up waiter sleeping in request_wait_answer() */
+ wake_up(&req->waitq);
+ }
+
+ if (test_bit(FR_ASYNC, &req->flags))
+ req->args->end(fm, req->args, req->out.h.error);
+put_request:
+ fuse_put_request(req);
+}
+EXPORT_SYMBOL_GPL(fuse_request_end);
+
+static int queue_interrupt(struct fuse_req *req)
+{
+ struct fuse_iqueue *fiq = &req->fm->fc->iq;
+
+ spin_lock(&fiq->lock);
+ /* Check for we've sent request to interrupt this req */
+ if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
+ spin_unlock(&fiq->lock);
+ return -EINVAL;
+ }
+
+ if (list_empty(&req->intr_entry)) {
+ list_add_tail(&req->intr_entry, &fiq->interrupts);
+ /*
+ * Pairs with smp_mb() implied by test_and_set_bit()
+ * from fuse_request_end().
+ */
+ smp_mb();
+ if (test_bit(FR_FINISHED, &req->flags)) {
+ list_del_init(&req->intr_entry);
+ spin_unlock(&fiq->lock);
+ return 0;
+ }
+ fiq->ops->wake_interrupt_and_unlock(fiq);
+ } else {
+ spin_unlock(&fiq->lock);
+ }
+ return 0;
+}
+
+static void request_wait_answer(struct fuse_req *req)
+{
+ struct fuse_conn *fc = req->fm->fc;
+ struct fuse_iqueue *fiq = &fc->iq;
+ int err;
+
+ if (!fc->no_interrupt) {
+ /* Any signal may interrupt this */
+ err = wait_event_interruptible(req->waitq,
+ test_bit(FR_FINISHED, &req->flags));
+ if (!err)
+ return;
+
+ set_bit(FR_INTERRUPTED, &req->flags);
+ /* matches barrier in fuse_dev_do_read() */
+ smp_mb__after_atomic();
+ if (test_bit(FR_SENT, &req->flags))
+ queue_interrupt(req);
+ }
+
+ if (!test_bit(FR_FORCE, &req->flags)) {
+ /* Only fatal signals may interrupt this */
+ err = wait_event_killable(req->waitq,
+ test_bit(FR_FINISHED, &req->flags));
+ if (!err)
+ return;
+
+ spin_lock(&fiq->lock);
+ /* Request is not yet in userspace, bail out */
+ if (test_bit(FR_PENDING, &req->flags)) {
+ list_del(&req->list);
+ spin_unlock(&fiq->lock);
+ __fuse_put_request(req);
+ req->out.h.error = -EINTR;
+ return;
+ }
+ spin_unlock(&fiq->lock);
+ }
+
+ /*
+ * Either request is already in userspace, or it was forced.
+ * Wait it out.
+ */
+ wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags));
+}
+
+static void __fuse_request_send(struct fuse_req *req)
+{
+ struct fuse_iqueue *fiq = &req->fm->fc->iq;
+
+ BUG_ON(test_bit(FR_BACKGROUND, &req->flags));
+ spin_lock(&fiq->lock);
+ if (!fiq->connected) {
+ spin_unlock(&fiq->lock);
+ req->out.h.error = -ENOTCONN;
+ } else {
+ req->in.h.unique = fuse_get_unique(fiq);
+ /* acquire extra reference, since request is still needed
+ after fuse_request_end() */
+ __fuse_get_request(req);
+ queue_request_and_unlock(fiq, req);
+
+ request_wait_answer(req);
+ /* Pairs with smp_wmb() in fuse_request_end() */
+ smp_rmb();
+ }
+}
+
+static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args)
+{
+ if (fc->minor < 4 && args->opcode == FUSE_STATFS)
+ args->out_args[0].size = FUSE_COMPAT_STATFS_SIZE;
+
+ if (fc->minor < 9) {
+ switch (args->opcode) {
+ case FUSE_LOOKUP:
+ case FUSE_CREATE:
+ case FUSE_MKNOD:
+ case FUSE_MKDIR:
+ case FUSE_SYMLINK:
+ case FUSE_LINK:
+ args->out_args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
+ break;
+ case FUSE_GETATTR:
+ case FUSE_SETATTR:
+ args->out_args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
+ break;
+ }
+ }
+ if (fc->minor < 12) {
+ switch (args->opcode) {
+ case FUSE_CREATE:
+ args->in_args[0].size = sizeof(struct fuse_open_in);
+ break;
+ case FUSE_MKNOD:
+ args->in_args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE;
+ break;
+ }
+ }
+}
+
+static void fuse_force_creds(struct fuse_req *req)
+{
+ struct fuse_conn *fc = req->fm->fc;
+
+ req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
+ req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
+ req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
+}
+
+static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args)
+{
+ req->in.h.opcode = args->opcode;
+ req->in.h.nodeid = args->nodeid;
+ req->args = args;
+ if (args->end)
+ __set_bit(FR_ASYNC, &req->flags);
+}
+
+ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args)
+{
+ struct fuse_conn *fc = fm->fc;
+ struct fuse_req *req;
+ ssize_t ret;
+
+ if (args->force) {
+ atomic_inc(&fc->num_waiting);
+ req = fuse_request_alloc(fm, GFP_KERNEL | __GFP_NOFAIL);
+
+ if (!args->nocreds)
+ fuse_force_creds(req);
+
+ __set_bit(FR_WAITING, &req->flags);
+ __set_bit(FR_FORCE, &req->flags);
+ } else {
+ WARN_ON(args->nocreds);
+ req = fuse_get_req(fm, false);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ }
+
+ /* Needs to be done after fuse_get_req() so that fc->minor is valid */
+ fuse_adjust_compat(fc, args);
+ fuse_args_to_req(req, args);
+
+ if (!args->noreply)
+ __set_bit(FR_ISREPLY, &req->flags);
+ __fuse_request_send(req);
+ ret = req->out.h.error;
+ if (!ret && args->out_argvar) {
+ BUG_ON(args->out_numargs == 0);
+ ret = args->out_args[args->out_numargs - 1].size;
+ }
+ fuse_put_request(req);
+
+ return ret;
+}
+
+static bool fuse_request_queue_background(struct fuse_req *req)
+{
+ struct fuse_mount *fm = req->fm;
+ struct fuse_conn *fc = fm->fc;
+ bool queued = false;
+
+ WARN_ON(!test_bit(FR_BACKGROUND, &req->flags));
+ if (!test_bit(FR_WAITING, &req->flags)) {
+ __set_bit(FR_WAITING, &req->flags);
+ atomic_inc(&fc->num_waiting);
+ }
+ __set_bit(FR_ISREPLY, &req->flags);
+ spin_lock(&fc->bg_lock);
+ if (likely(fc->connected)) {
+ fc->num_background++;
+ if (fc->num_background == fc->max_background)
+ fc->blocked = 1;
+ list_add_tail(&req->list, &fc->bg_queue);
+ flush_bg_queue(fc);
+ queued = true;
+ }
+ spin_unlock(&fc->bg_lock);
+
+ return queued;
+}
+
+int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
+ gfp_t gfp_flags)
+{
+ struct fuse_req *req;
+
+ if (args->force) {
+ WARN_ON(!args->nocreds);
+ req = fuse_request_alloc(fm, gfp_flags);
+ if (!req)
+ return -ENOMEM;
+ __set_bit(FR_BACKGROUND, &req->flags);
+ } else {
+ WARN_ON(args->nocreds);
+ req = fuse_get_req(fm, true);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ }
+
+ fuse_args_to_req(req, args);
+
+ if (!fuse_request_queue_background(req)) {
+ fuse_put_request(req);
+ return -ENOTCONN;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fuse_simple_background);
+
+static int fuse_simple_notify_reply(struct fuse_mount *fm,
+ struct fuse_args *args, u64 unique)
+{
+ struct fuse_req *req;
+ struct fuse_iqueue *fiq = &fm->fc->iq;
+ int err = 0;
+
+ req = fuse_get_req(fm, false);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ __clear_bit(FR_ISREPLY, &req->flags);
+ req->in.h.unique = unique;
+
+ fuse_args_to_req(req, args);
+
+ spin_lock(&fiq->lock);
+ if (fiq->connected) {
+ queue_request_and_unlock(fiq, req);
+ } else {
+ err = -ENODEV;
+ spin_unlock(&fiq->lock);
+ fuse_put_request(req);
+ }
+
+ return err;
+}
+
+/*
+ * Lock the request. Up to the next unlock_request() there mustn't be
+ * anything that could cause a page-fault. If the request was already
+ * aborted bail out.
+ */
+static int lock_request(struct fuse_req *req)
+{
+ int err = 0;
+ if (req) {
+ spin_lock(&req->waitq.lock);
+ if (test_bit(FR_ABORTED, &req->flags))
+ err = -ENOENT;
+ else
+ set_bit(FR_LOCKED, &req->flags);
+ spin_unlock(&req->waitq.lock);
+ }
+ return err;
+}
+
+/*
+ * Unlock request. If it was aborted while locked, caller is responsible
+ * for unlocking and ending the request.
+ */
+static int unlock_request(struct fuse_req *req)
+{
+ int err = 0;
+ if (req) {
+ spin_lock(&req->waitq.lock);
+ if (test_bit(FR_ABORTED, &req->flags))
+ err = -ENOENT;
+ else
+ clear_bit(FR_LOCKED, &req->flags);
+ spin_unlock(&req->waitq.lock);
+ }
+ return err;
+}
+
+struct fuse_copy_state {
+ int write;
+ struct fuse_req *req;
+ struct iov_iter *iter;
+ struct pipe_buffer *pipebufs;
+ struct pipe_buffer *currbuf;
+ struct pipe_inode_info *pipe;
+ unsigned long nr_segs;
+ struct page *pg;
+ unsigned len;
+ unsigned offset;
+ unsigned move_pages:1;
+};
+
+static void fuse_copy_init(struct fuse_copy_state *cs, int write,
+ struct iov_iter *iter)
+{
+ memset(cs, 0, sizeof(*cs));
+ cs->write = write;
+ cs->iter = iter;
+}
+
+/* Unmap and put previous page of userspace buffer */
+static void fuse_copy_finish(struct fuse_copy_state *cs)
+{
+ if (cs->currbuf) {
+ struct pipe_buffer *buf = cs->currbuf;
+
+ if (cs->write)
+ buf->len = PAGE_SIZE - cs->len;
+ cs->currbuf = NULL;
+ } else if (cs->pg) {
+ if (cs->write) {
+ flush_dcache_page(cs->pg);
+ set_page_dirty_lock(cs->pg);
+ }
+ put_page(cs->pg);
+ }
+ cs->pg = NULL;
+}
+
+/*
+ * Get another pagefull of userspace buffer, and map it to kernel
+ * address space, and lock request
+ */
+static int fuse_copy_fill(struct fuse_copy_state *cs)
+{
+ struct page *page;
+ int err;
+
+ err = unlock_request(cs->req);
+ if (err)
+ return err;
+
+ fuse_copy_finish(cs);
+ if (cs->pipebufs) {
+ struct pipe_buffer *buf = cs->pipebufs;
+
+ if (!cs->write) {
+ err = pipe_buf_confirm(cs->pipe, buf);
+ if (err)
+ return err;
+
+ BUG_ON(!cs->nr_segs);
+ cs->currbuf = buf;
+ cs->pg = buf->page;
+ cs->offset = buf->offset;
+ cs->len = buf->len;
+ cs->pipebufs++;
+ cs->nr_segs--;
+ } else {
+ if (cs->nr_segs >= cs->pipe->max_usage)
+ return -EIO;
+
+ page = alloc_page(GFP_HIGHUSER);
+ if (!page)
+ return -ENOMEM;
+
+ buf->page = page;
+ buf->offset = 0;
+ buf->len = 0;
+
+ cs->currbuf = buf;
+ cs->pg = page;
+ cs->offset = 0;
+ cs->len = PAGE_SIZE;
+ cs->pipebufs++;
+ cs->nr_segs++;
+ }
+ } else {
+ size_t off;
+ err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off);
+ if (err < 0)
+ return err;
+ BUG_ON(!err);
+ cs->len = err;
+ cs->offset = off;
+ cs->pg = page;
+ }
+
+ return lock_request(cs->req);
+}
+
+/* Do as much copy to/from userspace buffer as we can */
+static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
+{
+ unsigned ncpy = min(*size, cs->len);
+ if (val) {
+ void *pgaddr = kmap_local_page(cs->pg);
+ void *buf = pgaddr + cs->offset;
+
+ if (cs->write)
+ memcpy(buf, *val, ncpy);
+ else
+ memcpy(*val, buf, ncpy);
+
+ kunmap_local(pgaddr);
+ *val += ncpy;
+ }
+ *size -= ncpy;
+ cs->len -= ncpy;
+ cs->offset += ncpy;
+ return ncpy;
+}
+
+static int fuse_check_page(struct page *page)
+{
+ if (page_mapcount(page) ||
+ page->mapping != NULL ||
+ (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
+ ~(1 << PG_locked |
+ 1 << PG_referenced |
+ 1 << PG_uptodate |
+ 1 << PG_lru |
+ 1 << PG_active |
+ 1 << PG_workingset |
+ 1 << PG_reclaim |
+ 1 << PG_waiters |
+ LRU_GEN_MASK | LRU_REFS_MASK))) {
+ dump_page(page, "fuse: trying to steal weird page");
+ return 1;
+ }
+ return 0;
+}
+
+static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
+{
+ int err;
+ struct page *oldpage = *pagep;
+ struct page *newpage;
+ struct pipe_buffer *buf = cs->pipebufs;
+
+ get_page(oldpage);
+ err = unlock_request(cs->req);
+ if (err)
+ goto out_put_old;
+
+ fuse_copy_finish(cs);
+
+ err = pipe_buf_confirm(cs->pipe, buf);
+ if (err)
+ goto out_put_old;
+
+ BUG_ON(!cs->nr_segs);
+ cs->currbuf = buf;
+ cs->len = buf->len;
+ cs->pipebufs++;
+ cs->nr_segs--;
+
+ if (cs->len != PAGE_SIZE)
+ goto out_fallback;
+
+ if (!pipe_buf_try_steal(cs->pipe, buf))
+ goto out_fallback;
+
+ newpage = buf->page;
+
+ if (!PageUptodate(newpage))
+ SetPageUptodate(newpage);
+
+ ClearPageMappedToDisk(newpage);
+
+ if (fuse_check_page(newpage) != 0)
+ goto out_fallback_unlock;
+
+ /*
+ * This is a new and locked page, it shouldn't be mapped or
+ * have any special flags on it
+ */
+ if (WARN_ON(page_mapped(oldpage)))
+ goto out_fallback_unlock;
+ if (WARN_ON(page_has_private(oldpage)))
+ goto out_fallback_unlock;
+ if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
+ goto out_fallback_unlock;
+ if (WARN_ON(PageMlocked(oldpage)))
+ goto out_fallback_unlock;
+
+ replace_page_cache_page(oldpage, newpage);
+
+ get_page(newpage);
+
+ if (!(buf->flags & PIPE_BUF_FLAG_LRU))
+ lru_cache_add(newpage);
+
+ /*
+ * Release while we have extra ref on stolen page. Otherwise
+ * anon_pipe_buf_release() might think the page can be reused.
+ */
+ pipe_buf_release(cs->pipe, buf);
+
+ err = 0;
+ spin_lock(&cs->req->waitq.lock);
+ if (test_bit(FR_ABORTED, &cs->req->flags))
+ err = -ENOENT;
+ else
+ *pagep = newpage;
+ spin_unlock(&cs->req->waitq.lock);
+
+ if (err) {
+ unlock_page(newpage);
+ put_page(newpage);
+ goto out_put_old;
+ }
+
+ unlock_page(oldpage);
+ /* Drop ref for ap->pages[] array */
+ put_page(oldpage);
+ cs->len = 0;
+
+ err = 0;
+out_put_old:
+ /* Drop ref obtained in this function */
+ put_page(oldpage);
+ return err;
+
+out_fallback_unlock:
+ unlock_page(newpage);
+out_fallback:
+ cs->pg = buf->page;
+ cs->offset = buf->offset;
+
+ err = lock_request(cs->req);
+ if (!err)
+ err = 1;
+
+ goto out_put_old;
+}
+
+static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
+ unsigned offset, unsigned count)
+{
+ struct pipe_buffer *buf;
+ int err;
+
+ if (cs->nr_segs >= cs->pipe->max_usage)
+ return -EIO;
+
+ get_page(page);
+ err = unlock_request(cs->req);
+ if (err) {
+ put_page(page);
+ return err;
+ }
+
+ fuse_copy_finish(cs);
+
+ buf = cs->pipebufs;
+ buf->page = page;
+ buf->offset = offset;
+ buf->len = count;
+
+ cs->pipebufs++;
+ cs->nr_segs++;
+ cs->len = 0;
+
+ return 0;
+}
+
+/*
+ * Copy a page in the request to/from the userspace buffer. Must be
+ * done atomically
+ */
+static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
+ unsigned offset, unsigned count, int zeroing)
+{
+ int err;
+ struct page *page = *pagep;
+
+ if (page && zeroing && count < PAGE_SIZE)
+ clear_highpage(page);
+
+ while (count) {
+ if (cs->write && cs->pipebufs && page) {
+ /*
+ * Can't control lifetime of pipe buffers, so always
+ * copy user pages.
+ */
+ if (cs->req->args->user_pages) {
+ err = fuse_copy_fill(cs);
+ if (err)
+ return err;
+ } else {
+ return fuse_ref_page(cs, page, offset, count);
+ }
+ } else if (!cs->len) {
+ if (cs->move_pages && page &&
+ offset == 0 && count == PAGE_SIZE) {
+ err = fuse_try_move_page(cs, pagep);
+ if (err <= 0)
+ return err;
+ } else {
+ err = fuse_copy_fill(cs);
+ if (err)
+ return err;
+ }
+ }
+ if (page) {
+ void *mapaddr = kmap_local_page(page);
+ void *buf = mapaddr + offset;
+ offset += fuse_copy_do(cs, &buf, &count);
+ kunmap_local(mapaddr);
+ } else
+ offset += fuse_copy_do(cs, NULL, &count);
+ }
+ if (page && !cs->write)
+ flush_dcache_page(page);
+ return 0;
+}
+
+/* Copy pages in the request to/from userspace buffer */
+static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
+ int zeroing)
+{
+ unsigned i;
+ struct fuse_req *req = cs->req;
+ struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args);
+
+
+ for (i = 0; i < ap->num_pages && (nbytes || zeroing); i++) {
+ int err;
+ unsigned int offset = ap->descs[i].offset;
+ unsigned int count = min(nbytes, ap->descs[i].length);
+
+ err = fuse_copy_page(cs, &ap->pages[i], offset, count, zeroing);
+ if (err)
+ return err;
+
+ nbytes -= count;
+ }
+ return 0;
+}
+
+/* Copy a single argument in the request to/from userspace buffer */
+static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
+{
+ while (size) {
+ if (!cs->len) {
+ int err = fuse_copy_fill(cs);
+ if (err)
+ return err;
+ }
+ fuse_copy_do(cs, &val, &size);
+ }
+ return 0;
+}
+
+/* Copy request arguments to/from userspace buffer */
+static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
+ unsigned argpages, struct fuse_arg *args,
+ int zeroing)
+{
+ int err = 0;
+ unsigned i;
+
+ for (i = 0; !err && i < numargs; i++) {
+ struct fuse_arg *arg = &args[i];
+ if (i == numargs - 1 && argpages)
+ err = fuse_copy_pages(cs, arg->size, zeroing);
+ else
+ err = fuse_copy_one(cs, arg->value, arg->size);
+ }
+ return err;
+}
+
+static int forget_pending(struct fuse_iqueue *fiq)
+{
+ return fiq->forget_list_head.next != NULL;
+}
+
+static int request_pending(struct fuse_iqueue *fiq)
+{
+ return !list_empty(&fiq->pending) || !list_empty(&fiq->interrupts) ||
+ forget_pending(fiq);
+}
+
+/*
+ * Transfer an interrupt request to userspace
+ *
+ * Unlike other requests this is assembled on demand, without a need
+ * to allocate a separate fuse_req structure.
+ *
+ * Called with fiq->lock held, releases it
+ */
+static int fuse_read_interrupt(struct fuse_iqueue *fiq,
+ struct fuse_copy_state *cs,
+ size_t nbytes, struct fuse_req *req)
+__releases(fiq->lock)
+{
+ struct fuse_in_header ih;
+ struct fuse_interrupt_in arg;
+ unsigned reqsize = sizeof(ih) + sizeof(arg);
+ int err;
+
+ list_del_init(&req->intr_entry);
+ memset(&ih, 0, sizeof(ih));
+ memset(&arg, 0, sizeof(arg));
+ ih.len = reqsize;
+ ih.opcode = FUSE_INTERRUPT;
+ ih.unique = (req->in.h.unique | FUSE_INT_REQ_BIT);
+ arg.unique = req->in.h.unique;
+
+ spin_unlock(&fiq->lock);
+ if (nbytes < reqsize)
+ return -EINVAL;
+
+ err = fuse_copy_one(cs, &ih, sizeof(ih));
+ if (!err)
+ err = fuse_copy_one(cs, &arg, sizeof(arg));
+ fuse_copy_finish(cs);
+
+ return err ? err : reqsize;
+}
+
+struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
+ unsigned int max,
+ unsigned int *countp)
+{
+ struct fuse_forget_link *head = fiq->forget_list_head.next;
+ struct fuse_forget_link **newhead = &head;
+ unsigned count;
+
+ for (count = 0; *newhead != NULL && count < max; count++)
+ newhead = &(*newhead)->next;
+
+ fiq->forget_list_head.next = *newhead;
+ *newhead = NULL;
+ if (fiq->forget_list_head.next == NULL)
+ fiq->forget_list_tail = &fiq->forget_list_head;
+
+ if (countp != NULL)
+ *countp = count;
+
+ return head;
+}
+EXPORT_SYMBOL(fuse_dequeue_forget);
+
+static int fuse_read_single_forget(struct fuse_iqueue *fiq,
+ struct fuse_copy_state *cs,
+ size_t nbytes)
+__releases(fiq->lock)
+{
+ int err;
+ struct fuse_forget_link *forget = fuse_dequeue_forget(fiq, 1, NULL);
+ struct fuse_forget_in arg = {
+ .nlookup = forget->forget_one.nlookup,
+ };
+ struct fuse_in_header ih = {
+ .opcode = FUSE_FORGET,
+ .nodeid = forget->forget_one.nodeid,
+ .unique = fuse_get_unique(fiq),
+ .len = sizeof(ih) + sizeof(arg),
+ };
+
+ spin_unlock(&fiq->lock);
+ kfree(forget);
+ if (nbytes < ih.len)
+ return -EINVAL;
+
+ err = fuse_copy_one(cs, &ih, sizeof(ih));
+ if (!err)
+ err = fuse_copy_one(cs, &arg, sizeof(arg));
+ fuse_copy_finish(cs);
+
+ if (err)
+ return err;
+
+ return ih.len;
+}
+
+static int fuse_read_batch_forget(struct fuse_iqueue *fiq,
+ struct fuse_copy_state *cs, size_t nbytes)
+__releases(fiq->lock)
+{
+ int err;
+ unsigned max_forgets;
+ unsigned count;
+ struct fuse_forget_link *head;
+ struct fuse_batch_forget_in arg = { .count = 0 };
+ struct fuse_in_header ih = {
+ .opcode = FUSE_BATCH_FORGET,
+ .unique = fuse_get_unique(fiq),
+ .len = sizeof(ih) + sizeof(arg),
+ };
+
+ if (nbytes < ih.len) {
+ spin_unlock(&fiq->lock);
+ return -EINVAL;
+ }
+
+ max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
+ head = fuse_dequeue_forget(fiq, max_forgets, &count);
+ spin_unlock(&fiq->lock);
+
+ arg.count = count;
+ ih.len += count * sizeof(struct fuse_forget_one);
+ err = fuse_copy_one(cs, &ih, sizeof(ih));
+ if (!err)
+ err = fuse_copy_one(cs, &arg, sizeof(arg));
+
+ while (head) {
+ struct fuse_forget_link *forget = head;
+
+ if (!err) {
+ err = fuse_copy_one(cs, &forget->forget_one,
+ sizeof(forget->forget_one));
+ }
+ head = forget->next;
+ kfree(forget);
+ }
+
+ fuse_copy_finish(cs);
+
+ if (err)
+ return err;
+
+ return ih.len;
+}
+
+static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq,
+ struct fuse_copy_state *cs,
+ size_t nbytes)
+__releases(fiq->lock)
+{
+ if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL)
+ return fuse_read_single_forget(fiq, cs, nbytes);
+ else
+ return fuse_read_batch_forget(fiq, cs, nbytes);
+}
+
+/*
+ * Read a single request into the userspace filesystem's buffer. This
+ * function waits until a request is available, then removes it from
+ * the pending list and copies request data to userspace buffer. If
+ * no reply is needed (FORGET) or request has been aborted or there
+ * was an error during the copying then it's finished by calling
+ * fuse_request_end(). Otherwise add it to the processing list, and set
+ * the 'sent' flag.
+ */
+static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
+ struct fuse_copy_state *cs, size_t nbytes)
+{
+ ssize_t err;
+ struct fuse_conn *fc = fud->fc;
+ struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_pqueue *fpq = &fud->pq;
+ struct fuse_req *req;
+ struct fuse_args *args;
+ unsigned reqsize;
+ unsigned int hash;
+
+ /*
+ * Require sane minimum read buffer - that has capacity for fixed part
+ * of any request header + negotiated max_write room for data.
+ *
+ * Historically libfuse reserves 4K for fixed header room, but e.g.
+ * GlusterFS reserves only 80 bytes
+ *
+ * = `sizeof(fuse_in_header) + sizeof(fuse_write_in)`
+ *
+ * which is the absolute minimum any sane filesystem should be using
+ * for header room.
+ */
+ if (nbytes < max_t(size_t, FUSE_MIN_READ_BUFFER,
+ sizeof(struct fuse_in_header) +
+ sizeof(struct fuse_write_in) +
+ fc->max_write))
+ return -EINVAL;
+
+ restart:
+ for (;;) {
+ spin_lock(&fiq->lock);
+ if (!fiq->connected || request_pending(fiq))
+ break;
+ spin_unlock(&fiq->lock);
+
+ if (file->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+ err = wait_event_interruptible_exclusive(fiq->waitq,
+ !fiq->connected || request_pending(fiq));
+ if (err)
+ return err;
+ }
+
+ if (!fiq->connected) {
+ err = fc->aborted ? -ECONNABORTED : -ENODEV;
+ goto err_unlock;
+ }
+
+ if (!list_empty(&fiq->interrupts)) {
+ req = list_entry(fiq->interrupts.next, struct fuse_req,
+ intr_entry);
+ return fuse_read_interrupt(fiq, cs, nbytes, req);
+ }
+
+ if (forget_pending(fiq)) {
+ if (list_empty(&fiq->pending) || fiq->forget_batch-- > 0)
+ return fuse_read_forget(fc, fiq, cs, nbytes);
+
+ if (fiq->forget_batch <= -8)
+ fiq->forget_batch = 16;
+ }
+
+ req = list_entry(fiq->pending.next, struct fuse_req, list);
+ clear_bit(FR_PENDING, &req->flags);
+ list_del_init(&req->list);
+ spin_unlock(&fiq->lock);
+
+ args = req->args;
+ reqsize = req->in.h.len;
+
+ /* If request is too large, reply with an error and restart the read */
+ if (nbytes < reqsize) {
+ req->out.h.error = -EIO;
+ /* SETXATTR is special, since it may contain too large data */
+ if (args->opcode == FUSE_SETXATTR)
+ req->out.h.error = -E2BIG;
+ fuse_request_end(req);
+ goto restart;
+ }
+ spin_lock(&fpq->lock);
+ /*
+ * Must not put request on fpq->io queue after having been shut down by
+ * fuse_abort_conn()
+ */
+ if (!fpq->connected) {
+ req->out.h.error = err = -ECONNABORTED;
+ goto out_end;
+
+ }
+ list_add(&req->list, &fpq->io);
+ spin_unlock(&fpq->lock);
+ cs->req = req;
+ err = fuse_copy_one(cs, &req->in.h, sizeof(req->in.h));
+ if (!err)
+ err = fuse_copy_args(cs, args->in_numargs, args->in_pages,
+ (struct fuse_arg *) args->in_args, 0);
+ fuse_copy_finish(cs);
+ spin_lock(&fpq->lock);
+ clear_bit(FR_LOCKED, &req->flags);
+ if (!fpq->connected) {
+ err = fc->aborted ? -ECONNABORTED : -ENODEV;
+ goto out_end;
+ }
+ if (err) {
+ req->out.h.error = -EIO;
+ goto out_end;
+ }
+ if (!test_bit(FR_ISREPLY, &req->flags)) {
+ err = reqsize;
+ goto out_end;
+ }
+ hash = fuse_req_hash(req->in.h.unique);
+ list_move_tail(&req->list, &fpq->processing[hash]);
+ __fuse_get_request(req);
+ set_bit(FR_SENT, &req->flags);
+ spin_unlock(&fpq->lock);
+ /* matches barrier in request_wait_answer() */
+ smp_mb__after_atomic();
+ if (test_bit(FR_INTERRUPTED, &req->flags))
+ queue_interrupt(req);
+ fuse_put_request(req);
+
+ return reqsize;
+
+out_end:
+ if (!test_bit(FR_PRIVATE, &req->flags))
+ list_del_init(&req->list);
+ spin_unlock(&fpq->lock);
+ fuse_request_end(req);
+ return err;
+
+ err_unlock:
+ spin_unlock(&fiq->lock);
+ return err;
+}
+
+static int fuse_dev_open(struct inode *inode, struct file *file)
+{
+ /*
+ * The fuse device's file's private_data is used to hold
+ * the fuse_conn(ection) when it is mounted, and is used to
+ * keep track of whether the file has been mounted already.
+ */
+ file->private_data = NULL;
+ return 0;
+}
+
+static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct fuse_copy_state cs;
+ struct file *file = iocb->ki_filp;
+ struct fuse_dev *fud = fuse_get_dev(file);
+
+ if (!fud)
+ return -EPERM;
+
+ if (!user_backed_iter(to))
+ return -EINVAL;
+
+ fuse_copy_init(&cs, 1, to);
+
+ return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to));
+}
+
+static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ int total, ret;
+ int page_nr = 0;
+ struct pipe_buffer *bufs;
+ struct fuse_copy_state cs;
+ struct fuse_dev *fud = fuse_get_dev(in);
+
+ if (!fud)
+ return -EPERM;
+
+ bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer),
+ GFP_KERNEL);
+ if (!bufs)
+ return -ENOMEM;
+
+ fuse_copy_init(&cs, 1, NULL);
+ cs.pipebufs = bufs;
+ cs.pipe = pipe;
+ ret = fuse_dev_do_read(fud, in, &cs, len);
+ if (ret < 0)
+ goto out;
+
+ if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->max_usage) {
+ ret = -EIO;
+ goto out;
+ }
+
+ for (ret = total = 0; page_nr < cs.nr_segs; total += ret) {
+ /*
+ * Need to be careful about this. Having buf->ops in module
+ * code can Oops if the buffer persists after module unload.
+ */
+ bufs[page_nr].ops = &nosteal_pipe_buf_ops;
+ bufs[page_nr].flags = 0;
+ ret = add_to_pipe(pipe, &bufs[page_nr++]);
+ if (unlikely(ret < 0))
+ break;
+ }
+ if (total)
+ ret = total;
+out:
+ for (; page_nr < cs.nr_segs; page_nr++)
+ put_page(bufs[page_nr].page);
+
+ kvfree(bufs);
+ return ret;
+}
+
+static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_poll_wakeup_out outarg;
+ int err = -EINVAL;
+
+ if (size != sizeof(outarg))
+ goto err;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ goto err;
+
+ fuse_copy_finish(cs);
+ return fuse_notify_poll_wakeup(fc, &outarg);
+
+err:
+ fuse_copy_finish(cs);
+ return err;
+}
+
+static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_inval_inode_out outarg;
+ int err = -EINVAL;
+
+ if (size != sizeof(outarg))
+ goto err;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ goto err;
+ fuse_copy_finish(cs);
+
+ down_read(&fc->killsb);
+ err = fuse_reverse_inval_inode(fc, outarg.ino,
+ outarg.off, outarg.len);
+ up_read(&fc->killsb);
+ return err;
+
+err:
+ fuse_copy_finish(cs);
+ return err;
+}
+
+static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_inval_entry_out outarg;
+ int err = -ENOMEM;
+ char *buf;
+ struct qstr name;
+
+ buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
+ if (!buf)
+ goto err;
+
+ err = -EINVAL;
+ if (size < sizeof(outarg))
+ goto err;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ goto err;
+
+ err = -ENAMETOOLONG;
+ if (outarg.namelen > FUSE_NAME_MAX)
+ goto err;
+
+ err = -EINVAL;
+ if (size != sizeof(outarg) + outarg.namelen + 1)
+ goto err;
+
+ name.name = buf;
+ name.len = outarg.namelen;
+ err = fuse_copy_one(cs, buf, outarg.namelen + 1);
+ if (err)
+ goto err;
+ fuse_copy_finish(cs);
+ buf[outarg.namelen] = 0;
+
+ down_read(&fc->killsb);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name);
+ up_read(&fc->killsb);
+ kfree(buf);
+ return err;
+
+err:
+ kfree(buf);
+ fuse_copy_finish(cs);
+ return err;
+}
+
+static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_delete_out outarg;
+ int err = -ENOMEM;
+ char *buf;
+ struct qstr name;
+
+ buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
+ if (!buf)
+ goto err;
+
+ err = -EINVAL;
+ if (size < sizeof(outarg))
+ goto err;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ goto err;
+
+ err = -ENAMETOOLONG;
+ if (outarg.namelen > FUSE_NAME_MAX)
+ goto err;
+
+ err = -EINVAL;
+ if (size != sizeof(outarg) + outarg.namelen + 1)
+ goto err;
+
+ name.name = buf;
+ name.len = outarg.namelen;
+ err = fuse_copy_one(cs, buf, outarg.namelen + 1);
+ if (err)
+ goto err;
+ fuse_copy_finish(cs);
+ buf[outarg.namelen] = 0;
+
+ down_read(&fc->killsb);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name);
+ up_read(&fc->killsb);
+ kfree(buf);
+ return err;
+
+err:
+ kfree(buf);
+ fuse_copy_finish(cs);
+ return err;
+}
+
+static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_store_out outarg;
+ struct inode *inode;
+ struct address_space *mapping;
+ u64 nodeid;
+ int err;
+ pgoff_t index;
+ unsigned int offset;
+ unsigned int num;
+ loff_t file_size;
+ loff_t end;
+
+ err = -EINVAL;
+ if (size < sizeof(outarg))
+ goto out_finish;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ goto out_finish;
+
+ err = -EINVAL;
+ if (size - sizeof(outarg) != outarg.size)
+ goto out_finish;
+
+ nodeid = outarg.nodeid;
+
+ down_read(&fc->killsb);
+
+ err = -ENOENT;
+ inode = fuse_ilookup(fc, nodeid, NULL);
+ if (!inode)
+ goto out_up_killsb;
+
+ mapping = inode->i_mapping;
+ index = outarg.offset >> PAGE_SHIFT;
+ offset = outarg.offset & ~PAGE_MASK;
+ file_size = i_size_read(inode);
+ end = outarg.offset + outarg.size;
+ if (end > file_size) {
+ file_size = end;
+ fuse_write_update_attr(inode, file_size, outarg.size);
+ }
+
+ num = outarg.size;
+ while (num) {
+ struct page *page;
+ unsigned int this_num;
+
+ err = -ENOMEM;
+ page = find_or_create_page(mapping, index,
+ mapping_gfp_mask(mapping));
+ if (!page)
+ goto out_iput;
+
+ this_num = min_t(unsigned, num, PAGE_SIZE - offset);
+ err = fuse_copy_page(cs, &page, offset, this_num, 0);
+ if (!err && offset == 0 &&
+ (this_num == PAGE_SIZE || file_size == end))
+ SetPageUptodate(page);
+ unlock_page(page);
+ put_page(page);
+
+ if (err)
+ goto out_iput;
+
+ num -= this_num;
+ offset = 0;
+ index++;
+ }
+
+ err = 0;
+
+out_iput:
+ iput(inode);
+out_up_killsb:
+ up_read(&fc->killsb);
+out_finish:
+ fuse_copy_finish(cs);
+ return err;
+}
+
+struct fuse_retrieve_args {
+ struct fuse_args_pages ap;
+ struct fuse_notify_retrieve_in inarg;
+};
+
+static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args,
+ int error)
+{
+ struct fuse_retrieve_args *ra =
+ container_of(args, typeof(*ra), ap.args);
+
+ release_pages(ra->ap.pages, ra->ap.num_pages);
+ kfree(ra);
+}
+
+static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
+ struct fuse_notify_retrieve_out *outarg)
+{
+ int err;
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t index;
+ loff_t file_size;
+ unsigned int num;
+ unsigned int offset;
+ size_t total_len = 0;
+ unsigned int num_pages;
+ struct fuse_conn *fc = fm->fc;
+ struct fuse_retrieve_args *ra;
+ size_t args_size = sizeof(*ra);
+ struct fuse_args_pages *ap;
+ struct fuse_args *args;
+
+ offset = outarg->offset & ~PAGE_MASK;
+ file_size = i_size_read(inode);
+
+ num = min(outarg->size, fc->max_write);
+ if (outarg->offset > file_size)
+ num = 0;
+ else if (outarg->offset + num > file_size)
+ num = file_size - outarg->offset;
+
+ num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ num_pages = min(num_pages, fc->max_pages);
+
+ args_size += num_pages * (sizeof(ap->pages[0]) + sizeof(ap->descs[0]));
+
+ ra = kzalloc(args_size, GFP_KERNEL);
+ if (!ra)
+ return -ENOMEM;
+
+ ap = &ra->ap;
+ ap->pages = (void *) (ra + 1);
+ ap->descs = (void *) (ap->pages + num_pages);
+
+ args = &ap->args;
+ args->nodeid = outarg->nodeid;
+ args->opcode = FUSE_NOTIFY_REPLY;
+ args->in_numargs = 2;
+ args->in_pages = true;
+ args->end = fuse_retrieve_end;
+
+ index = outarg->offset >> PAGE_SHIFT;
+
+ while (num && ap->num_pages < num_pages) {
+ struct page *page;
+ unsigned int this_num;
+
+ page = find_get_page(mapping, index);
+ if (!page)
+ break;
+
+ this_num = min_t(unsigned, num, PAGE_SIZE - offset);
+ ap->pages[ap->num_pages] = page;
+ ap->descs[ap->num_pages].offset = offset;
+ ap->descs[ap->num_pages].length = this_num;
+ ap->num_pages++;
+
+ offset = 0;
+ num -= this_num;
+ total_len += this_num;
+ index++;
+ }
+ ra->inarg.offset = outarg->offset;
+ ra->inarg.size = total_len;
+ args->in_args[0].size = sizeof(ra->inarg);
+ args->in_args[0].value = &ra->inarg;
+ args->in_args[1].size = total_len;
+
+ err = fuse_simple_notify_reply(fm, args, outarg->notify_unique);
+ if (err)
+ fuse_retrieve_end(fm, args, err);
+
+ return err;
+}
+
+static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_retrieve_out outarg;
+ struct fuse_mount *fm;
+ struct inode *inode;
+ u64 nodeid;
+ int err;
+
+ err = -EINVAL;
+ if (size != sizeof(outarg))
+ goto copy_finish;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ goto copy_finish;
+
+ fuse_copy_finish(cs);
+
+ down_read(&fc->killsb);
+ err = -ENOENT;
+ nodeid = outarg.nodeid;
+
+ inode = fuse_ilookup(fc, nodeid, &fm);
+ if (inode) {
+ err = fuse_retrieve(fm, inode, &outarg);
+ iput(inode);
+ }
+ up_read(&fc->killsb);
+
+ return err;
+
+copy_finish:
+ fuse_copy_finish(cs);
+ return err;
+}
+
+static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
+ unsigned int size, struct fuse_copy_state *cs)
+{
+ /* Don't try to move pages (yet) */
+ cs->move_pages = 0;
+
+ switch (code) {
+ case FUSE_NOTIFY_POLL:
+ return fuse_notify_poll(fc, size, cs);
+
+ case FUSE_NOTIFY_INVAL_INODE:
+ return fuse_notify_inval_inode(fc, size, cs);
+
+ case FUSE_NOTIFY_INVAL_ENTRY:
+ return fuse_notify_inval_entry(fc, size, cs);
+
+ case FUSE_NOTIFY_STORE:
+ return fuse_notify_store(fc, size, cs);
+
+ case FUSE_NOTIFY_RETRIEVE:
+ return fuse_notify_retrieve(fc, size, cs);
+
+ case FUSE_NOTIFY_DELETE:
+ return fuse_notify_delete(fc, size, cs);
+
+ default:
+ fuse_copy_finish(cs);
+ return -EINVAL;
+ }
+}
+
+/* Look up request on processing list by unique ID */
+static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique)
+{
+ unsigned int hash = fuse_req_hash(unique);
+ struct fuse_req *req;
+
+ list_for_each_entry(req, &fpq->processing[hash], list) {
+ if (req->in.h.unique == unique)
+ return req;
+ }
+ return NULL;
+}
+
+static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
+ unsigned nbytes)
+{
+ unsigned reqsize = sizeof(struct fuse_out_header);
+
+ reqsize += fuse_len_args(args->out_numargs, args->out_args);
+
+ if (reqsize < nbytes || (reqsize > nbytes && !args->out_argvar))
+ return -EINVAL;
+ else if (reqsize > nbytes) {
+ struct fuse_arg *lastarg = &args->out_args[args->out_numargs-1];
+ unsigned diffsize = reqsize - nbytes;
+
+ if (diffsize > lastarg->size)
+ return -EINVAL;
+ lastarg->size -= diffsize;
+ }
+ return fuse_copy_args(cs, args->out_numargs, args->out_pages,
+ args->out_args, args->page_zeroing);
+}
+
+/*
+ * Write a single reply to a request. First the header is copied from
+ * the write buffer. The request is then searched on the processing
+ * list by the unique ID found in the header. If found, then remove
+ * it from the list and copy the rest of the buffer to the request.
+ * The request is finished by calling fuse_request_end().
+ */
+static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
+ struct fuse_copy_state *cs, size_t nbytes)
+{
+ int err;
+ struct fuse_conn *fc = fud->fc;
+ struct fuse_pqueue *fpq = &fud->pq;
+ struct fuse_req *req;
+ struct fuse_out_header oh;
+
+ err = -EINVAL;
+ if (nbytes < sizeof(struct fuse_out_header))
+ goto out;
+
+ err = fuse_copy_one(cs, &oh, sizeof(oh));
+ if (err)
+ goto copy_finish;
+
+ err = -EINVAL;
+ if (oh.len != nbytes)
+ goto copy_finish;
+
+ /*
+ * Zero oh.unique indicates unsolicited notification message
+ * and error contains notification code.
+ */
+ if (!oh.unique) {
+ err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
+ goto out;
+ }
+
+ err = -EINVAL;
+ if (oh.error <= -512 || oh.error > 0)
+ goto copy_finish;
+
+ spin_lock(&fpq->lock);
+ req = NULL;
+ if (fpq->connected)
+ req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT);
+
+ err = -ENOENT;
+ if (!req) {
+ spin_unlock(&fpq->lock);
+ goto copy_finish;
+ }
+
+ /* Is it an interrupt reply ID? */
+ if (oh.unique & FUSE_INT_REQ_BIT) {
+ __fuse_get_request(req);
+ spin_unlock(&fpq->lock);
+
+ err = 0;
+ if (nbytes != sizeof(struct fuse_out_header))
+ err = -EINVAL;
+ else if (oh.error == -ENOSYS)
+ fc->no_interrupt = 1;
+ else if (oh.error == -EAGAIN)
+ err = queue_interrupt(req);
+
+ fuse_put_request(req);
+
+ goto copy_finish;
+ }
+
+ clear_bit(FR_SENT, &req->flags);
+ list_move(&req->list, &fpq->io);
+ req->out.h = oh;
+ set_bit(FR_LOCKED, &req->flags);
+ spin_unlock(&fpq->lock);
+ cs->req = req;
+ if (!req->args->page_replace)
+ cs->move_pages = 0;
+
+ if (oh.error)
+ err = nbytes != sizeof(oh) ? -EINVAL : 0;
+ else
+ err = copy_out_args(cs, req->args, nbytes);
+ fuse_copy_finish(cs);
+
+ spin_lock(&fpq->lock);
+ clear_bit(FR_LOCKED, &req->flags);
+ if (!fpq->connected)
+ err = -ENOENT;
+ else if (err)
+ req->out.h.error = -EIO;
+ if (!test_bit(FR_PRIVATE, &req->flags))
+ list_del_init(&req->list);
+ spin_unlock(&fpq->lock);
+
+ fuse_request_end(req);
+out:
+ return err ? err : nbytes;
+
+copy_finish:
+ fuse_copy_finish(cs);
+ goto out;
+}
+
+static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct fuse_copy_state cs;
+ struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp);
+
+ if (!fud)
+ return -EPERM;
+
+ if (!user_backed_iter(from))
+ return -EINVAL;
+
+ fuse_copy_init(&cs, 0, from);
+
+ return fuse_dev_do_write(fud, &cs, iov_iter_count(from));
+}
+
+static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
+ struct file *out, loff_t *ppos,
+ size_t len, unsigned int flags)
+{
+ unsigned int head, tail, mask, count;
+ unsigned nbuf;
+ unsigned idx;
+ struct pipe_buffer *bufs;
+ struct fuse_copy_state cs;
+ struct fuse_dev *fud;
+ size_t rem;
+ ssize_t ret;
+
+ fud = fuse_get_dev(out);
+ if (!fud)
+ return -EPERM;
+
+ pipe_lock(pipe);
+
+ head = pipe->head;
+ tail = pipe->tail;
+ mask = pipe->ring_size - 1;
+ count = head - tail;
+
+ bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL);
+ if (!bufs) {
+ pipe_unlock(pipe);
+ return -ENOMEM;
+ }
+
+ nbuf = 0;
+ rem = 0;
+ for (idx = tail; idx != head && rem < len; idx++)
+ rem += pipe->bufs[idx & mask].len;
+
+ ret = -EINVAL;
+ if (rem < len)
+ goto out_free;
+
+ rem = len;
+ while (rem) {
+ struct pipe_buffer *ibuf;
+ struct pipe_buffer *obuf;
+
+ if (WARN_ON(nbuf >= count || tail == head))
+ goto out_free;
+
+ ibuf = &pipe->bufs[tail & mask];
+ obuf = &bufs[nbuf];
+
+ if (rem >= ibuf->len) {
+ *obuf = *ibuf;
+ ibuf->ops = NULL;
+ tail++;
+ pipe->tail = tail;
+ } else {
+ if (!pipe_buf_get(pipe, ibuf))
+ goto out_free;
+
+ *obuf = *ibuf;
+ obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+ obuf->len = rem;
+ ibuf->offset += obuf->len;
+ ibuf->len -= obuf->len;
+ }
+ nbuf++;
+ rem -= obuf->len;
+ }
+ pipe_unlock(pipe);
+
+ fuse_copy_init(&cs, 0, NULL);
+ cs.pipebufs = bufs;
+ cs.nr_segs = nbuf;
+ cs.pipe = pipe;
+
+ if (flags & SPLICE_F_MOVE)
+ cs.move_pages = 1;
+
+ ret = fuse_dev_do_write(fud, &cs, len);
+
+ pipe_lock(pipe);
+out_free:
+ for (idx = 0; idx < nbuf; idx++) {
+ struct pipe_buffer *buf = &bufs[idx];
+
+ if (buf->ops)
+ pipe_buf_release(pipe, buf);
+ }
+ pipe_unlock(pipe);
+
+ kvfree(bufs);
+ return ret;
+}
+
+static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
+{
+ __poll_t mask = EPOLLOUT | EPOLLWRNORM;
+ struct fuse_iqueue *fiq;
+ struct fuse_dev *fud = fuse_get_dev(file);
+
+ if (!fud)
+ return EPOLLERR;
+
+ fiq = &fud->fc->iq;
+ poll_wait(file, &fiq->waitq, wait);
+
+ spin_lock(&fiq->lock);
+ if (!fiq->connected)
+ mask = EPOLLERR;
+ else if (request_pending(fiq))
+ mask |= EPOLLIN | EPOLLRDNORM;
+ spin_unlock(&fiq->lock);
+
+ return mask;
+}
+
+/* Abort all requests on the given list (pending or processing) */
+static void end_requests(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ struct fuse_req *req;
+ req = list_entry(head->next, struct fuse_req, list);
+ req->out.h.error = -ECONNABORTED;
+ clear_bit(FR_SENT, &req->flags);
+ list_del_init(&req->list);
+ fuse_request_end(req);
+ }
+}
+
+static void end_polls(struct fuse_conn *fc)
+{
+ struct rb_node *p;
+
+ p = rb_first(&fc->polled_files);
+
+ while (p) {
+ struct fuse_file *ff;
+ ff = rb_entry(p, struct fuse_file, polled_node);
+ wake_up_interruptible_all(&ff->poll_wait);
+
+ p = rb_next(p);
+ }
+}
+
+/*
+ * Abort all requests.
+ *
+ * Emergency exit in case of a malicious or accidental deadlock, or just a hung
+ * filesystem.
+ *
+ * The same effect is usually achievable through killing the filesystem daemon
+ * and all users of the filesystem. The exception is the combination of an
+ * asynchronous request and the tricky deadlock (see
+ * Documentation/filesystems/fuse.rst).
+ *
+ * Aborting requests under I/O goes as follows: 1: Separate out unlocked
+ * requests, they should be finished off immediately. Locked requests will be
+ * finished after unlock; see unlock_request(). 2: Finish off the unlocked
+ * requests. It is possible that some request will finish before we can. This
+ * is OK, the request will in that case be removed from the list before we touch
+ * it.
+ */
+void fuse_abort_conn(struct fuse_conn *fc)
+{
+ struct fuse_iqueue *fiq = &fc->iq;
+
+ spin_lock(&fc->lock);
+ if (fc->connected) {
+ struct fuse_dev *fud;
+ struct fuse_req *req, *next;
+ LIST_HEAD(to_end);
+ unsigned int i;
+
+ /* Background queuing checks fc->connected under bg_lock */
+ spin_lock(&fc->bg_lock);
+ fc->connected = 0;
+ spin_unlock(&fc->bg_lock);
+
+ fuse_set_initialized(fc);
+ list_for_each_entry(fud, &fc->devices, entry) {
+ struct fuse_pqueue *fpq = &fud->pq;
+
+ spin_lock(&fpq->lock);
+ fpq->connected = 0;
+ list_for_each_entry_safe(req, next, &fpq->io, list) {
+ req->out.h.error = -ECONNABORTED;
+ spin_lock(&req->waitq.lock);
+ set_bit(FR_ABORTED, &req->flags);
+ if (!test_bit(FR_LOCKED, &req->flags)) {
+ set_bit(FR_PRIVATE, &req->flags);
+ __fuse_get_request(req);
+ list_move(&req->list, &to_end);
+ }
+ spin_unlock(&req->waitq.lock);
+ }
+ for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+ list_splice_tail_init(&fpq->processing[i],
+ &to_end);
+ spin_unlock(&fpq->lock);
+ }
+ spin_lock(&fc->bg_lock);
+ fc->blocked = 0;
+ fc->max_background = UINT_MAX;
+ flush_bg_queue(fc);
+ spin_unlock(&fc->bg_lock);
+
+ spin_lock(&fiq->lock);
+ fiq->connected = 0;
+ list_for_each_entry(req, &fiq->pending, list)
+ clear_bit(FR_PENDING, &req->flags);
+ list_splice_tail_init(&fiq->pending, &to_end);
+ while (forget_pending(fiq))
+ kfree(fuse_dequeue_forget(fiq, 1, NULL));
+ wake_up_all(&fiq->waitq);
+ spin_unlock(&fiq->lock);
+ kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+ end_polls(fc);
+ wake_up_all(&fc->blocked_waitq);
+ spin_unlock(&fc->lock);
+
+ end_requests(&to_end);
+ } else {
+ spin_unlock(&fc->lock);
+ }
+}
+EXPORT_SYMBOL_GPL(fuse_abort_conn);
+
+void fuse_wait_aborted(struct fuse_conn *fc)
+{
+ /* matches implicit memory barrier in fuse_drop_waiting() */
+ smp_mb();
+ wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0);
+}
+
+int fuse_dev_release(struct inode *inode, struct file *file)
+{
+ struct fuse_dev *fud = fuse_get_dev(file);
+
+ if (fud) {
+ struct fuse_conn *fc = fud->fc;
+ struct fuse_pqueue *fpq = &fud->pq;
+ LIST_HEAD(to_end);
+ unsigned int i;
+
+ spin_lock(&fpq->lock);
+ WARN_ON(!list_empty(&fpq->io));
+ for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+ list_splice_init(&fpq->processing[i], &to_end);
+ spin_unlock(&fpq->lock);
+
+ end_requests(&to_end);
+
+ /* Are we the last open device? */
+ if (atomic_dec_and_test(&fc->dev_count)) {
+ WARN_ON(fc->iq.fasync != NULL);
+ fuse_abort_conn(fc);
+ }
+ fuse_dev_free(fud);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fuse_dev_release);
+
+static int fuse_dev_fasync(int fd, struct file *file, int on)
+{
+ struct fuse_dev *fud = fuse_get_dev(file);
+
+ if (!fud)
+ return -EPERM;
+
+ /* No locking - fasync_helper does its own locking */
+ return fasync_helper(fd, file, on, &fud->fc->iq.fasync);
+}
+
+static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
+{
+ struct fuse_dev *fud;
+
+ if (new->private_data)
+ return -EINVAL;
+
+ fud = fuse_dev_alloc_install(fc);
+ if (!fud)
+ return -ENOMEM;
+
+ new->private_data = fud;
+ atomic_inc(&fc->dev_count);
+
+ return 0;
+}
+
+static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ int res;
+ int oldfd;
+ struct fuse_dev *fud = NULL;
+
+ switch (cmd) {
+ case FUSE_DEV_IOC_CLONE:
+ res = -EFAULT;
+ if (!get_user(oldfd, (__u32 __user *)arg)) {
+ struct file *old = fget(oldfd);
+
+ res = -EINVAL;
+ if (old) {
+ /*
+ * Check against file->f_op because CUSE
+ * uses the same ioctl handler.
+ */
+ if (old->f_op == file->f_op &&
+ old->f_cred->user_ns == file->f_cred->user_ns)
+ fud = fuse_get_dev(old);
+
+ if (fud) {
+ mutex_lock(&fuse_mutex);
+ res = fuse_device_clone(fud->fc, file);
+ mutex_unlock(&fuse_mutex);
+ }
+ fput(old);
+ }
+ }
+ break;
+ default:
+ res = -ENOTTY;
+ break;
+ }
+ return res;
+}
+
+const struct file_operations fuse_dev_operations = {
+ .owner = THIS_MODULE,
+ .open = fuse_dev_open,
+ .llseek = no_llseek,
+ .read_iter = fuse_dev_read,
+ .splice_read = fuse_dev_splice_read,
+ .write_iter = fuse_dev_write,
+ .splice_write = fuse_dev_splice_write,
+ .poll = fuse_dev_poll,
+ .release = fuse_dev_release,
+ .fasync = fuse_dev_fasync,
+ .unlocked_ioctl = fuse_dev_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+};
+EXPORT_SYMBOL_GPL(fuse_dev_operations);
+
+static struct miscdevice fuse_miscdevice = {
+ .minor = FUSE_MINOR,
+ .name = "fuse",
+ .fops = &fuse_dev_operations,
+};
+
+int __init fuse_dev_init(void)
+{
+ int err = -ENOMEM;
+ fuse_req_cachep = kmem_cache_create("fuse_request",
+ sizeof(struct fuse_req),
+ 0, 0, NULL);
+ if (!fuse_req_cachep)
+ goto out;
+
+ err = misc_register(&fuse_miscdevice);
+ if (err)
+ goto out_cache_clean;
+
+ return 0;
+
+ out_cache_clean:
+ kmem_cache_destroy(fuse_req_cachep);
+ out:
+ return err;
+}
+
+void fuse_dev_cleanup(void)
+{
+ misc_deregister(&fuse_miscdevice);
+ kmem_cache_destroy(fuse_req_cachep);
+}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
new file mode 100644
index 000000000..5e408e7ec
--- /dev/null
+++ b/fs/fuse/dir.c
@@ -0,0 +1,2013 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/fs_context.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/iversion.h>
+#include <linux/posix_acl.h>
+#include <linux/security.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+
+static bool __read_mostly allow_sys_admin_access;
+module_param(allow_sys_admin_access, bool, 0644);
+MODULE_PARM_DESC(allow_sys_admin_access,
+ "Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check");
+
+static void fuse_advise_use_readdirplus(struct inode *dir)
+{
+ struct fuse_inode *fi = get_fuse_inode(dir);
+
+ set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
+}
+
+#if BITS_PER_LONG >= 64
+static inline void __fuse_dentry_settime(struct dentry *entry, u64 time)
+{
+ entry->d_fsdata = (void *) time;
+}
+
+static inline u64 fuse_dentry_time(const struct dentry *entry)
+{
+ return (u64)entry->d_fsdata;
+}
+
+#else
+union fuse_dentry {
+ u64 time;
+ struct rcu_head rcu;
+};
+
+static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time)
+{
+ ((union fuse_dentry *) dentry->d_fsdata)->time = time;
+}
+
+static inline u64 fuse_dentry_time(const struct dentry *entry)
+{
+ return ((union fuse_dentry *) entry->d_fsdata)->time;
+}
+#endif
+
+static void fuse_dentry_settime(struct dentry *dentry, u64 time)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb);
+ bool delete = !time && fc->delete_stale;
+ /*
+ * Mess with DCACHE_OP_DELETE because dput() will be faster without it.
+ * Don't care about races, either way it's just an optimization
+ */
+ if ((!delete && (dentry->d_flags & DCACHE_OP_DELETE)) ||
+ (delete && !(dentry->d_flags & DCACHE_OP_DELETE))) {
+ spin_lock(&dentry->d_lock);
+ if (!delete)
+ dentry->d_flags &= ~DCACHE_OP_DELETE;
+ else
+ dentry->d_flags |= DCACHE_OP_DELETE;
+ spin_unlock(&dentry->d_lock);
+ }
+
+ __fuse_dentry_settime(dentry, time);
+}
+
+/*
+ * FUSE caches dentries and attributes with separate timeout. The
+ * time in jiffies until the dentry/attributes are valid is stored in
+ * dentry->d_fsdata and fuse_inode->i_time respectively.
+ */
+
+/*
+ * Calculate the time in jiffies until a dentry/attributes are valid
+ */
+static u64 time_to_jiffies(u64 sec, u32 nsec)
+{
+ if (sec || nsec) {
+ struct timespec64 ts = {
+ sec,
+ min_t(u32, nsec, NSEC_PER_SEC - 1)
+ };
+
+ return get_jiffies_64() + timespec64_to_jiffies(&ts);
+ } else
+ return 0;
+}
+
+/*
+ * Set dentry and possibly attribute timeouts from the lookup/mk*
+ * replies
+ */
+void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o)
+{
+ fuse_dentry_settime(entry,
+ time_to_jiffies(o->entry_valid, o->entry_valid_nsec));
+}
+
+static u64 attr_timeout(struct fuse_attr_out *o)
+{
+ return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
+}
+
+u64 entry_attr_timeout(struct fuse_entry_out *o)
+{
+ return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
+}
+
+void fuse_invalidate_attr_mask(struct inode *inode, u32 mask)
+{
+ set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask);
+}
+
+/*
+ * Mark the attributes as stale, so that at the next call to
+ * ->getattr() they will be fetched from userspace
+ */
+void fuse_invalidate_attr(struct inode *inode)
+{
+ fuse_invalidate_attr_mask(inode, STATX_BASIC_STATS);
+}
+
+static void fuse_dir_changed(struct inode *dir)
+{
+ fuse_invalidate_attr(dir);
+ inode_maybe_inc_iversion(dir, false);
+}
+
+/**
+ * Mark the attributes as stale due to an atime change. Avoid the invalidate if
+ * atime is not used.
+ */
+void fuse_invalidate_atime(struct inode *inode)
+{
+ if (!IS_RDONLY(inode))
+ fuse_invalidate_attr_mask(inode, STATX_ATIME);
+}
+
+/*
+ * Just mark the entry as stale, so that a next attempt to look it up
+ * will result in a new lookup call to userspace
+ *
+ * This is called when a dentry is about to become negative and the
+ * timeout is unknown (unlink, rmdir, rename and in some cases
+ * lookup)
+ */
+void fuse_invalidate_entry_cache(struct dentry *entry)
+{
+ fuse_dentry_settime(entry, 0);
+}
+
+/*
+ * Same as fuse_invalidate_entry_cache(), but also try to remove the
+ * dentry from the hash
+ */
+static void fuse_invalidate_entry(struct dentry *entry)
+{
+ d_invalidate(entry);
+ fuse_invalidate_entry_cache(entry);
+}
+
+static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args,
+ u64 nodeid, const struct qstr *name,
+ struct fuse_entry_out *outarg)
+{
+ memset(outarg, 0, sizeof(struct fuse_entry_out));
+ args->opcode = FUSE_LOOKUP;
+ args->nodeid = nodeid;
+ args->in_numargs = 1;
+ args->in_args[0].size = name->len + 1;
+ args->in_args[0].value = name->name;
+ args->out_numargs = 1;
+ args->out_args[0].size = sizeof(struct fuse_entry_out);
+ args->out_args[0].value = outarg;
+}
+
+/*
+ * Check whether the dentry is still valid
+ *
+ * If the entry validity timeout has expired and the dentry is
+ * positive, try to redo the lookup. If the lookup results in a
+ * different inode, then let the VFS invalidate the dentry and redo
+ * the lookup once more. If the lookup results in the same inode,
+ * then refresh the attributes, timeouts and mark the dentry valid.
+ */
+static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
+{
+ struct inode *inode;
+ struct dentry *parent;
+ struct fuse_mount *fm;
+ struct fuse_inode *fi;
+ int ret;
+
+ inode = d_inode_rcu(entry);
+ if (inode && fuse_is_bad(inode))
+ goto invalid;
+ else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) ||
+ (flags & (LOOKUP_EXCL | LOOKUP_REVAL | LOOKUP_RENAME_TARGET))) {
+ struct fuse_entry_out outarg;
+ FUSE_ARGS(args);
+ struct fuse_forget_link *forget;
+ u64 attr_version;
+
+ /* For negative dentries, always do a fresh lookup */
+ if (!inode)
+ goto invalid;
+
+ ret = -ECHILD;
+ if (flags & LOOKUP_RCU)
+ goto out;
+
+ fm = get_fuse_mount(inode);
+
+ forget = fuse_alloc_forget();
+ ret = -ENOMEM;
+ if (!forget)
+ goto out;
+
+ attr_version = fuse_get_attr_version(fm->fc);
+
+ parent = dget_parent(entry);
+ fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)),
+ &entry->d_name, &outarg);
+ ret = fuse_simple_request(fm, &args);
+ dput(parent);
+ /* Zero nodeid is same as -ENOENT */
+ if (!ret && !outarg.nodeid)
+ ret = -ENOENT;
+ if (!ret) {
+ fi = get_fuse_inode(inode);
+ if (outarg.nodeid != get_node_id(inode) ||
+ (bool) IS_AUTOMOUNT(inode) != (bool) (outarg.attr.flags & FUSE_ATTR_SUBMOUNT)) {
+ fuse_queue_forget(fm->fc, forget,
+ outarg.nodeid, 1);
+ goto invalid;
+ }
+ spin_lock(&fi->lock);
+ fi->nlookup++;
+ spin_unlock(&fi->lock);
+ }
+ kfree(forget);
+ if (ret == -ENOMEM || ret == -EINTR)
+ goto out;
+ if (ret || fuse_invalid_attr(&outarg.attr) ||
+ fuse_stale_inode(inode, outarg.generation, &outarg.attr))
+ goto invalid;
+
+ forget_all_cached_acls(inode);
+ fuse_change_attributes(inode, &outarg.attr,
+ entry_attr_timeout(&outarg),
+ attr_version);
+ fuse_change_entry_timeout(entry, &outarg);
+ } else if (inode) {
+ fi = get_fuse_inode(inode);
+ if (flags & LOOKUP_RCU) {
+ if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state))
+ return -ECHILD;
+ } else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) {
+ parent = dget_parent(entry);
+ fuse_advise_use_readdirplus(d_inode(parent));
+ dput(parent);
+ }
+ }
+ ret = 1;
+out:
+ return ret;
+
+invalid:
+ ret = 0;
+ goto out;
+}
+
+#if BITS_PER_LONG < 64
+static int fuse_dentry_init(struct dentry *dentry)
+{
+ dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry),
+ GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
+
+ return dentry->d_fsdata ? 0 : -ENOMEM;
+}
+static void fuse_dentry_release(struct dentry *dentry)
+{
+ union fuse_dentry *fd = dentry->d_fsdata;
+
+ kfree_rcu(fd, rcu);
+}
+#endif
+
+static int fuse_dentry_delete(const struct dentry *dentry)
+{
+ return time_before64(fuse_dentry_time(dentry), get_jiffies_64());
+}
+
+/*
+ * Create a fuse_mount object with a new superblock (with path->dentry
+ * as the root), and return that mount so it can be auto-mounted on
+ * @path.
+ */
+static struct vfsmount *fuse_dentry_automount(struct path *path)
+{
+ struct fs_context *fsc;
+ struct vfsmount *mnt;
+ struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry));
+
+ fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry);
+ if (IS_ERR(fsc))
+ return ERR_CAST(fsc);
+
+ /* Pass the FUSE inode of the mount for fuse_get_tree_submount() */
+ fsc->fs_private = mp_fi;
+
+ /* Create the submount */
+ mnt = fc_mount(fsc);
+ if (!IS_ERR(mnt))
+ mntget(mnt);
+
+ put_fs_context(fsc);
+ return mnt;
+}
+
+const struct dentry_operations fuse_dentry_operations = {
+ .d_revalidate = fuse_dentry_revalidate,
+ .d_delete = fuse_dentry_delete,
+#if BITS_PER_LONG < 64
+ .d_init = fuse_dentry_init,
+ .d_release = fuse_dentry_release,
+#endif
+ .d_automount = fuse_dentry_automount,
+};
+
+const struct dentry_operations fuse_root_dentry_operations = {
+#if BITS_PER_LONG < 64
+ .d_init = fuse_dentry_init,
+ .d_release = fuse_dentry_release,
+#endif
+};
+
+int fuse_valid_type(int m)
+{
+ return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) ||
+ S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
+}
+
+bool fuse_invalid_attr(struct fuse_attr *attr)
+{
+ return !fuse_valid_type(attr->mode) ||
+ attr->size > LLONG_MAX;
+}
+
+int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
+ struct fuse_entry_out *outarg, struct inode **inode)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ FUSE_ARGS(args);
+ struct fuse_forget_link *forget;
+ u64 attr_version;
+ int err;
+
+ *inode = NULL;
+ err = -ENAMETOOLONG;
+ if (name->len > FUSE_NAME_MAX)
+ goto out;
+
+
+ forget = fuse_alloc_forget();
+ err = -ENOMEM;
+ if (!forget)
+ goto out;
+
+ attr_version = fuse_get_attr_version(fm->fc);
+
+ fuse_lookup_init(fm->fc, &args, nodeid, name, outarg);
+ err = fuse_simple_request(fm, &args);
+ /* Zero nodeid is same as -ENOENT, but with valid timeout */
+ if (err || !outarg->nodeid)
+ goto out_put_forget;
+
+ err = -EIO;
+ if (!outarg->nodeid)
+ goto out_put_forget;
+ if (fuse_invalid_attr(&outarg->attr))
+ goto out_put_forget;
+
+ *inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
+ &outarg->attr, entry_attr_timeout(outarg),
+ attr_version);
+ err = -ENOMEM;
+ if (!*inode) {
+ fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1);
+ goto out;
+ }
+ err = 0;
+
+ out_put_forget:
+ kfree(forget);
+ out:
+ return err;
+}
+
+static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
+ unsigned int flags)
+{
+ int err;
+ struct fuse_entry_out outarg;
+ struct inode *inode;
+ struct dentry *newent;
+ bool outarg_valid = true;
+ bool locked;
+
+ if (fuse_is_bad(dir))
+ return ERR_PTR(-EIO);
+
+ locked = fuse_lock_inode(dir);
+ err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name,
+ &outarg, &inode);
+ fuse_unlock_inode(dir, locked);
+ if (err == -ENOENT) {
+ outarg_valid = false;
+ err = 0;
+ }
+ if (err)
+ goto out_err;
+
+ err = -EIO;
+ if (inode && get_node_id(inode) == FUSE_ROOT_ID)
+ goto out_iput;
+
+ newent = d_splice_alias(inode, entry);
+ err = PTR_ERR(newent);
+ if (IS_ERR(newent))
+ goto out_err;
+
+ entry = newent ? newent : entry;
+ if (outarg_valid)
+ fuse_change_entry_timeout(entry, &outarg);
+ else
+ fuse_invalidate_entry_cache(entry);
+
+ if (inode)
+ fuse_advise_use_readdirplus(dir);
+ return newent;
+
+ out_iput:
+ iput(inode);
+ out_err:
+ return ERR_PTR(err);
+}
+
+static int get_security_context(struct dentry *entry, umode_t mode,
+ void **security_ctx, u32 *security_ctxlen)
+{
+ struct fuse_secctx *fctx;
+ struct fuse_secctx_header *header;
+ void *ctx = NULL, *ptr;
+ u32 ctxlen, total_len = sizeof(*header);
+ int err, nr_ctx = 0;
+ const char *name;
+ size_t namelen;
+
+ err = security_dentry_init_security(entry, mode, &entry->d_name,
+ &name, &ctx, &ctxlen);
+ if (err) {
+ if (err != -EOPNOTSUPP)
+ goto out_err;
+ /* No LSM is supporting this security hook. Ignore error */
+ ctxlen = 0;
+ ctx = NULL;
+ }
+
+ if (ctxlen) {
+ nr_ctx = 1;
+ namelen = strlen(name) + 1;
+ err = -EIO;
+ if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || ctxlen > S32_MAX))
+ goto out_err;
+ total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + ctxlen);
+ }
+
+ err = -ENOMEM;
+ header = ptr = kzalloc(total_len, GFP_KERNEL);
+ if (!ptr)
+ goto out_err;
+
+ header->nr_secctx = nr_ctx;
+ header->size = total_len;
+ ptr += sizeof(*header);
+ if (nr_ctx) {
+ fctx = ptr;
+ fctx->size = ctxlen;
+ ptr += sizeof(*fctx);
+
+ strcpy(ptr, name);
+ ptr += namelen;
+
+ memcpy(ptr, ctx, ctxlen);
+ }
+ *security_ctxlen = total_len;
+ *security_ctx = header;
+ err = 0;
+out_err:
+ kfree(ctx);
+ return err;
+}
+
+/*
+ * Atomic create+open operation
+ *
+ * If the filesystem doesn't support this, then fall back to separate
+ * 'mknod' + 'open' requests.
+ */
+static int fuse_create_open(struct inode *dir, struct dentry *entry,
+ struct file *file, unsigned int flags,
+ umode_t mode, u32 opcode)
+{
+ int err;
+ struct inode *inode;
+ struct fuse_mount *fm = get_fuse_mount(dir);
+ FUSE_ARGS(args);
+ struct fuse_forget_link *forget;
+ struct fuse_create_in inarg;
+ struct fuse_open_out outopen;
+ struct fuse_entry_out outentry;
+ struct fuse_inode *fi;
+ struct fuse_file *ff;
+ void *security_ctx = NULL;
+ u32 security_ctxlen;
+ bool trunc = flags & O_TRUNC;
+
+ /* Userspace expects S_IFREG in create mode */
+ BUG_ON((mode & S_IFMT) != S_IFREG);
+
+ forget = fuse_alloc_forget();
+ err = -ENOMEM;
+ if (!forget)
+ goto out_err;
+
+ err = -ENOMEM;
+ ff = fuse_file_alloc(fm);
+ if (!ff)
+ goto out_put_forget_req;
+
+ if (!fm->fc->dont_mask)
+ mode &= ~current_umask();
+
+ flags &= ~O_NOCTTY;
+ memset(&inarg, 0, sizeof(inarg));
+ memset(&outentry, 0, sizeof(outentry));
+ inarg.flags = flags;
+ inarg.mode = mode;
+ inarg.umask = current_umask();
+
+ if (fm->fc->handle_killpriv_v2 && trunc &&
+ !(flags & O_EXCL) && !capable(CAP_FSETID)) {
+ inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
+ }
+
+ args.opcode = opcode;
+ args.nodeid = get_node_id(dir);
+ args.in_numargs = 2;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.in_args[1].size = entry->d_name.len + 1;
+ args.in_args[1].value = entry->d_name.name;
+ args.out_numargs = 2;
+ args.out_args[0].size = sizeof(outentry);
+ args.out_args[0].value = &outentry;
+ args.out_args[1].size = sizeof(outopen);
+ args.out_args[1].value = &outopen;
+
+ if (fm->fc->init_security) {
+ err = get_security_context(entry, mode, &security_ctx,
+ &security_ctxlen);
+ if (err)
+ goto out_put_forget_req;
+
+ args.in_numargs = 3;
+ args.in_args[2].size = security_ctxlen;
+ args.in_args[2].value = security_ctx;
+ }
+
+ err = fuse_simple_request(fm, &args);
+ kfree(security_ctx);
+ if (err)
+ goto out_free_ff;
+
+ err = -EIO;
+ if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid) ||
+ fuse_invalid_attr(&outentry.attr))
+ goto out_free_ff;
+
+ ff->fh = outopen.fh;
+ ff->nodeid = outentry.nodeid;
+ ff->open_flags = outopen.open_flags;
+ inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
+ &outentry.attr, entry_attr_timeout(&outentry), 0);
+ if (!inode) {
+ flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
+ fuse_sync_release(NULL, ff, flags);
+ fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1);
+ err = -ENOMEM;
+ goto out_err;
+ }
+ kfree(forget);
+ d_instantiate(entry, inode);
+ fuse_change_entry_timeout(entry, &outentry);
+ fuse_dir_changed(dir);
+ err = finish_open(file, entry, generic_file_open);
+ if (err) {
+ fi = get_fuse_inode(inode);
+ fuse_sync_release(fi, ff, flags);
+ } else {
+ file->private_data = ff;
+ fuse_finish_open(inode, file);
+ if (fm->fc->atomic_o_trunc && trunc)
+ truncate_pagecache(inode, 0);
+ else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+ invalidate_inode_pages2(inode->i_mapping);
+ }
+ return err;
+
+out_free_ff:
+ fuse_file_free(ff);
+out_put_forget_req:
+ kfree(forget);
+out_err:
+ return err;
+}
+
+static int fuse_mknod(struct user_namespace *, struct inode *, struct dentry *,
+ umode_t, dev_t);
+static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
+ struct file *file, unsigned flags,
+ umode_t mode)
+{
+ int err;
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ struct dentry *res = NULL;
+
+ if (fuse_is_bad(dir))
+ return -EIO;
+
+ if (d_in_lookup(entry)) {
+ res = fuse_lookup(dir, entry, 0);
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+
+ if (res)
+ entry = res;
+ }
+
+ if (!(flags & O_CREAT) || d_really_is_positive(entry))
+ goto no_open;
+
+ /* Only creates */
+ file->f_mode |= FMODE_CREATED;
+
+ if (fc->no_create)
+ goto mknod;
+
+ err = fuse_create_open(dir, entry, file, flags, mode, FUSE_CREATE);
+ if (err == -ENOSYS) {
+ fc->no_create = 1;
+ goto mknod;
+ }
+out_dput:
+ dput(res);
+ return err;
+
+mknod:
+ err = fuse_mknod(&init_user_ns, dir, entry, mode, 0);
+ if (err)
+ goto out_dput;
+no_open:
+ return finish_no_open(file, res);
+}
+
+/*
+ * Code shared between mknod, mkdir, symlink and link
+ */
+static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
+ struct inode *dir, struct dentry *entry,
+ umode_t mode)
+{
+ struct fuse_entry_out outarg;
+ struct inode *inode;
+ struct dentry *d;
+ int err;
+ struct fuse_forget_link *forget;
+ void *security_ctx = NULL;
+ u32 security_ctxlen;
+
+ if (fuse_is_bad(dir))
+ return -EIO;
+
+ forget = fuse_alloc_forget();
+ if (!forget)
+ return -ENOMEM;
+
+ memset(&outarg, 0, sizeof(outarg));
+ args->nodeid = get_node_id(dir);
+ args->out_numargs = 1;
+ args->out_args[0].size = sizeof(outarg);
+ args->out_args[0].value = &outarg;
+
+ if (fm->fc->init_security && args->opcode != FUSE_LINK) {
+ err = get_security_context(entry, mode, &security_ctx,
+ &security_ctxlen);
+ if (err)
+ goto out_put_forget_req;
+
+ BUG_ON(args->in_numargs != 2);
+
+ args->in_numargs = 3;
+ args->in_args[2].size = security_ctxlen;
+ args->in_args[2].value = security_ctx;
+ }
+
+ err = fuse_simple_request(fm, args);
+ kfree(security_ctx);
+ if (err)
+ goto out_put_forget_req;
+
+ err = -EIO;
+ if (invalid_nodeid(outarg.nodeid) || fuse_invalid_attr(&outarg.attr))
+ goto out_put_forget_req;
+
+ if ((outarg.attr.mode ^ mode) & S_IFMT)
+ goto out_put_forget_req;
+
+ inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
+ &outarg.attr, entry_attr_timeout(&outarg), 0);
+ if (!inode) {
+ fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1);
+ return -ENOMEM;
+ }
+ kfree(forget);
+
+ d_drop(entry);
+ d = d_splice_alias(inode, entry);
+ if (IS_ERR(d))
+ return PTR_ERR(d);
+
+ if (d) {
+ fuse_change_entry_timeout(d, &outarg);
+ dput(d);
+ } else {
+ fuse_change_entry_timeout(entry, &outarg);
+ }
+ fuse_dir_changed(dir);
+ return 0;
+
+ out_put_forget_req:
+ kfree(forget);
+ return err;
+}
+
+static int fuse_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *entry, umode_t mode, dev_t rdev)
+{
+ struct fuse_mknod_in inarg;
+ struct fuse_mount *fm = get_fuse_mount(dir);
+ FUSE_ARGS(args);
+
+ if (!fm->fc->dont_mask)
+ mode &= ~current_umask();
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.mode = mode;
+ inarg.rdev = new_encode_dev(rdev);
+ inarg.umask = current_umask();
+ args.opcode = FUSE_MKNOD;
+ args.in_numargs = 2;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.in_args[1].size = entry->d_name.len + 1;
+ args.in_args[1].value = entry->d_name.name;
+ return create_new_entry(fm, &args, dir, entry, mode);
+}
+
+static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *entry, umode_t mode, bool excl)
+{
+ return fuse_mknod(&init_user_ns, dir, entry, mode, 0);
+}
+
+static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct file *file, umode_t mode)
+{
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ int err;
+
+ if (fc->no_tmpfile)
+ return -EOPNOTSUPP;
+
+ err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE);
+ if (err == -ENOSYS) {
+ fc->no_tmpfile = 1;
+ err = -EOPNOTSUPP;
+ }
+ return err;
+}
+
+static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *entry, umode_t mode)
+{
+ struct fuse_mkdir_in inarg;
+ struct fuse_mount *fm = get_fuse_mount(dir);
+ FUSE_ARGS(args);
+
+ if (!fm->fc->dont_mask)
+ mode &= ~current_umask();
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.mode = mode;
+ inarg.umask = current_umask();
+ args.opcode = FUSE_MKDIR;
+ args.in_numargs = 2;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.in_args[1].size = entry->d_name.len + 1;
+ args.in_args[1].value = entry->d_name.name;
+ return create_new_entry(fm, &args, dir, entry, S_IFDIR);
+}
+
+static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *entry, const char *link)
+{
+ struct fuse_mount *fm = get_fuse_mount(dir);
+ unsigned len = strlen(link) + 1;
+ FUSE_ARGS(args);
+
+ args.opcode = FUSE_SYMLINK;
+ args.in_numargs = 2;
+ args.in_args[0].size = entry->d_name.len + 1;
+ args.in_args[0].value = entry->d_name.name;
+ args.in_args[1].size = len;
+ args.in_args[1].value = link;
+ return create_new_entry(fm, &args, dir, entry, S_IFLNK);
+}
+
+void fuse_flush_time_update(struct inode *inode)
+{
+ int err = sync_inode_metadata(inode, 1);
+
+ mapping_set_error(inode->i_mapping, err);
+}
+
+static void fuse_update_ctime_in_cache(struct inode *inode)
+{
+ if (!IS_NOCMTIME(inode)) {
+ inode->i_ctime = current_time(inode);
+ mark_inode_dirty_sync(inode);
+ fuse_flush_time_update(inode);
+ }
+}
+
+void fuse_update_ctime(struct inode *inode)
+{
+ fuse_invalidate_attr_mask(inode, STATX_CTIME);
+ fuse_update_ctime_in_cache(inode);
+}
+
+static void fuse_entry_unlinked(struct dentry *entry)
+{
+ struct inode *inode = d_inode(entry);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ /*
+ * If i_nlink == 0 then unlink doesn't make sense, yet this can
+ * happen if userspace filesystem is careless. It would be
+ * difficult to enforce correct nlink usage so just ignore this
+ * condition here
+ */
+ if (S_ISDIR(inode->i_mode))
+ clear_nlink(inode);
+ else if (inode->i_nlink > 0)
+ drop_nlink(inode);
+ spin_unlock(&fi->lock);
+ fuse_invalidate_entry_cache(entry);
+ fuse_update_ctime(inode);
+}
+
+static int fuse_unlink(struct inode *dir, struct dentry *entry)
+{
+ int err;
+ struct fuse_mount *fm = get_fuse_mount(dir);
+ FUSE_ARGS(args);
+
+ if (fuse_is_bad(dir))
+ return -EIO;
+
+ args.opcode = FUSE_UNLINK;
+ args.nodeid = get_node_id(dir);
+ args.in_numargs = 1;
+ args.in_args[0].size = entry->d_name.len + 1;
+ args.in_args[0].value = entry->d_name.name;
+ err = fuse_simple_request(fm, &args);
+ if (!err) {
+ fuse_dir_changed(dir);
+ fuse_entry_unlinked(entry);
+ } else if (err == -EINTR)
+ fuse_invalidate_entry(entry);
+ return err;
+}
+
+static int fuse_rmdir(struct inode *dir, struct dentry *entry)
+{
+ int err;
+ struct fuse_mount *fm = get_fuse_mount(dir);
+ FUSE_ARGS(args);
+
+ if (fuse_is_bad(dir))
+ return -EIO;
+
+ args.opcode = FUSE_RMDIR;
+ args.nodeid = get_node_id(dir);
+ args.in_numargs = 1;
+ args.in_args[0].size = entry->d_name.len + 1;
+ args.in_args[0].value = entry->d_name.name;
+ err = fuse_simple_request(fm, &args);
+ if (!err) {
+ fuse_dir_changed(dir);
+ fuse_entry_unlinked(entry);
+ } else if (err == -EINTR)
+ fuse_invalidate_entry(entry);
+ return err;
+}
+
+static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
+ struct inode *newdir, struct dentry *newent,
+ unsigned int flags, int opcode, size_t argsize)
+{
+ int err;
+ struct fuse_rename2_in inarg;
+ struct fuse_mount *fm = get_fuse_mount(olddir);
+ FUSE_ARGS(args);
+
+ memset(&inarg, 0, argsize);
+ inarg.newdir = get_node_id(newdir);
+ inarg.flags = flags;
+ args.opcode = opcode;
+ args.nodeid = get_node_id(olddir);
+ args.in_numargs = 3;
+ args.in_args[0].size = argsize;
+ args.in_args[0].value = &inarg;
+ args.in_args[1].size = oldent->d_name.len + 1;
+ args.in_args[1].value = oldent->d_name.name;
+ args.in_args[2].size = newent->d_name.len + 1;
+ args.in_args[2].value = newent->d_name.name;
+ err = fuse_simple_request(fm, &args);
+ if (!err) {
+ /* ctime changes */
+ fuse_update_ctime(d_inode(oldent));
+
+ if (flags & RENAME_EXCHANGE)
+ fuse_update_ctime(d_inode(newent));
+
+ fuse_dir_changed(olddir);
+ if (olddir != newdir)
+ fuse_dir_changed(newdir);
+
+ /* newent will end up negative */
+ if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent))
+ fuse_entry_unlinked(newent);
+ } else if (err == -EINTR) {
+ /* If request was interrupted, DEITY only knows if the
+ rename actually took place. If the invalidation
+ fails (e.g. some process has CWD under the renamed
+ directory), then there can be inconsistency between
+ the dcache and the real filesystem. Tough luck. */
+ fuse_invalidate_entry(oldent);
+ if (d_really_is_positive(newent))
+ fuse_invalidate_entry(newent);
+ }
+
+ return err;
+}
+
+static int fuse_rename2(struct user_namespace *mnt_userns, struct inode *olddir,
+ struct dentry *oldent, struct inode *newdir,
+ struct dentry *newent, unsigned int flags)
+{
+ struct fuse_conn *fc = get_fuse_conn(olddir);
+ int err;
+
+ if (fuse_is_bad(olddir))
+ return -EIO;
+
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
+ return -EINVAL;
+
+ if (flags) {
+ if (fc->no_rename2 || fc->minor < 23)
+ return -EINVAL;
+
+ err = fuse_rename_common(olddir, oldent, newdir, newent, flags,
+ FUSE_RENAME2,
+ sizeof(struct fuse_rename2_in));
+ if (err == -ENOSYS) {
+ fc->no_rename2 = 1;
+ err = -EINVAL;
+ }
+ } else {
+ err = fuse_rename_common(olddir, oldent, newdir, newent, 0,
+ FUSE_RENAME,
+ sizeof(struct fuse_rename_in));
+ }
+
+ return err;
+}
+
+static int fuse_link(struct dentry *entry, struct inode *newdir,
+ struct dentry *newent)
+{
+ int err;
+ struct fuse_link_in inarg;
+ struct inode *inode = d_inode(entry);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ FUSE_ARGS(args);
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.oldnodeid = get_node_id(inode);
+ args.opcode = FUSE_LINK;
+ args.in_numargs = 2;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.in_args[1].size = newent->d_name.len + 1;
+ args.in_args[1].value = newent->d_name.name;
+ err = create_new_entry(fm, &args, newdir, newent, inode->i_mode);
+ if (!err)
+ fuse_update_ctime_in_cache(inode);
+ else if (err == -EINTR)
+ fuse_invalidate_attr(inode);
+
+ return err;
+}
+
+static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
+ struct kstat *stat)
+{
+ unsigned int blkbits;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ stat->dev = inode->i_sb->s_dev;
+ stat->ino = attr->ino;
+ stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
+ stat->nlink = attr->nlink;
+ stat->uid = make_kuid(fc->user_ns, attr->uid);
+ stat->gid = make_kgid(fc->user_ns, attr->gid);
+ stat->rdev = inode->i_rdev;
+ stat->atime.tv_sec = attr->atime;
+ stat->atime.tv_nsec = attr->atimensec;
+ stat->mtime.tv_sec = attr->mtime;
+ stat->mtime.tv_nsec = attr->mtimensec;
+ stat->ctime.tv_sec = attr->ctime;
+ stat->ctime.tv_nsec = attr->ctimensec;
+ stat->size = attr->size;
+ stat->blocks = attr->blocks;
+
+ if (attr->blksize != 0)
+ blkbits = ilog2(attr->blksize);
+ else
+ blkbits = inode->i_sb->s_blocksize_bits;
+
+ stat->blksize = 1 << blkbits;
+}
+
+static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
+ struct file *file)
+{
+ int err;
+ struct fuse_getattr_in inarg;
+ struct fuse_attr_out outarg;
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ FUSE_ARGS(args);
+ u64 attr_version;
+
+ attr_version = fuse_get_attr_version(fm->fc);
+
+ memset(&inarg, 0, sizeof(inarg));
+ memset(&outarg, 0, sizeof(outarg));
+ /* Directories have separate file-handle space */
+ if (file && S_ISREG(inode->i_mode)) {
+ struct fuse_file *ff = file->private_data;
+
+ inarg.getattr_flags |= FUSE_GETATTR_FH;
+ inarg.fh = ff->fh;
+ }
+ args.opcode = FUSE_GETATTR;
+ args.nodeid = get_node_id(inode);
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.out_numargs = 1;
+ args.out_args[0].size = sizeof(outarg);
+ args.out_args[0].value = &outarg;
+ err = fuse_simple_request(fm, &args);
+ if (!err) {
+ if (fuse_invalid_attr(&outarg.attr) ||
+ inode_wrong_type(inode, outarg.attr.mode)) {
+ fuse_make_bad(inode);
+ err = -EIO;
+ } else {
+ fuse_change_attributes(inode, &outarg.attr,
+ attr_timeout(&outarg),
+ attr_version);
+ if (stat)
+ fuse_fillattr(inode, &outarg.attr, stat);
+ }
+ }
+ return err;
+}
+
+static int fuse_update_get_attr(struct inode *inode, struct file *file,
+ struct kstat *stat, u32 request_mask,
+ unsigned int flags)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ int err = 0;
+ bool sync;
+ u32 inval_mask = READ_ONCE(fi->inval_mask);
+ u32 cache_mask = fuse_get_cache_mask(inode);
+
+ if (flags & AT_STATX_FORCE_SYNC)
+ sync = true;
+ else if (flags & AT_STATX_DONT_SYNC)
+ sync = false;
+ else if (request_mask & inval_mask & ~cache_mask)
+ sync = true;
+ else
+ sync = time_before64(fi->i_time, get_jiffies_64());
+
+ if (sync) {
+ forget_all_cached_acls(inode);
+ err = fuse_do_getattr(inode, stat, file);
+ } else if (stat) {
+ generic_fillattr(&init_user_ns, inode, stat);
+ stat->mode = fi->orig_i_mode;
+ stat->ino = fi->orig_ino;
+ }
+
+ return err;
+}
+
+int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask)
+{
+ return fuse_update_get_attr(inode, file, NULL, mask, 0);
+}
+
+int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
+ u64 child_nodeid, struct qstr *name)
+{
+ int err = -ENOTDIR;
+ struct inode *parent;
+ struct dentry *dir;
+ struct dentry *entry;
+
+ parent = fuse_ilookup(fc, parent_nodeid, NULL);
+ if (!parent)
+ return -ENOENT;
+
+ inode_lock_nested(parent, I_MUTEX_PARENT);
+ if (!S_ISDIR(parent->i_mode))
+ goto unlock;
+
+ err = -ENOENT;
+ dir = d_find_alias(parent);
+ if (!dir)
+ goto unlock;
+
+ name->hash = full_name_hash(dir, name->name, name->len);
+ entry = d_lookup(dir, name);
+ dput(dir);
+ if (!entry)
+ goto unlock;
+
+ fuse_dir_changed(parent);
+ fuse_invalidate_entry(entry);
+
+ if (child_nodeid != 0 && d_really_is_positive(entry)) {
+ inode_lock(d_inode(entry));
+ if (get_node_id(d_inode(entry)) != child_nodeid) {
+ err = -ENOENT;
+ goto badentry;
+ }
+ if (d_mountpoint(entry)) {
+ err = -EBUSY;
+ goto badentry;
+ }
+ if (d_is_dir(entry)) {
+ shrink_dcache_parent(entry);
+ if (!simple_empty(entry)) {
+ err = -ENOTEMPTY;
+ goto badentry;
+ }
+ d_inode(entry)->i_flags |= S_DEAD;
+ }
+ dont_mount(entry);
+ clear_nlink(d_inode(entry));
+ err = 0;
+ badentry:
+ inode_unlock(d_inode(entry));
+ if (!err)
+ d_delete(entry);
+ } else {
+ err = 0;
+ }
+ dput(entry);
+
+ unlock:
+ inode_unlock(parent);
+ iput(parent);
+ return err;
+}
+
+/*
+ * Calling into a user-controlled filesystem gives the filesystem
+ * daemon ptrace-like capabilities over the current process. This
+ * means, that the filesystem daemon is able to record the exact
+ * filesystem operations performed, and can also control the behavior
+ * of the requester process in otherwise impossible ways. For example
+ * it can delay the operation for arbitrary length of time allowing
+ * DoS against the requester.
+ *
+ * For this reason only those processes can call into the filesystem,
+ * for which the owner of the mount has ptrace privilege. This
+ * excludes processes started by other users, suid or sgid processes.
+ */
+int fuse_allow_current_process(struct fuse_conn *fc)
+{
+ const struct cred *cred;
+
+ if (allow_sys_admin_access && capable(CAP_SYS_ADMIN))
+ return 1;
+
+ if (fc->allow_other)
+ return current_in_userns(fc->user_ns);
+
+ cred = current_cred();
+ if (uid_eq(cred->euid, fc->user_id) &&
+ uid_eq(cred->suid, fc->user_id) &&
+ uid_eq(cred->uid, fc->user_id) &&
+ gid_eq(cred->egid, fc->group_id) &&
+ gid_eq(cred->sgid, fc->group_id) &&
+ gid_eq(cred->gid, fc->group_id))
+ return 1;
+
+ return 0;
+}
+
+static int fuse_access(struct inode *inode, int mask)
+{
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ FUSE_ARGS(args);
+ struct fuse_access_in inarg;
+ int err;
+
+ BUG_ON(mask & MAY_NOT_BLOCK);
+
+ if (fm->fc->no_access)
+ return 0;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC);
+ args.opcode = FUSE_ACCESS;
+ args.nodeid = get_node_id(inode);
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ err = fuse_simple_request(fm, &args);
+ if (err == -ENOSYS) {
+ fm->fc->no_access = 1;
+ err = 0;
+ }
+ return err;
+}
+
+static int fuse_perm_getattr(struct inode *inode, int mask)
+{
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+
+ forget_all_cached_acls(inode);
+ return fuse_do_getattr(inode, NULL, NULL);
+}
+
+/*
+ * Check permission. The two basic access models of FUSE are:
+ *
+ * 1) Local access checking ('default_permissions' mount option) based
+ * on file mode. This is the plain old disk filesystem permission
+ * modell.
+ *
+ * 2) "Remote" access checking, where server is responsible for
+ * checking permission in each inode operation. An exception to this
+ * is if ->permission() was invoked from sys_access() in which case an
+ * access request is sent. Execute permission is still checked
+ * locally based on file mode.
+ */
+static int fuse_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ bool refreshed = false;
+ int err = 0;
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ if (!fuse_allow_current_process(fc))
+ return -EACCES;
+
+ /*
+ * If attributes are needed, refresh them before proceeding
+ */
+ if (fc->default_permissions ||
+ ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) {
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ u32 perm_mask = STATX_MODE | STATX_UID | STATX_GID;
+
+ if (perm_mask & READ_ONCE(fi->inval_mask) ||
+ time_before64(fi->i_time, get_jiffies_64())) {
+ refreshed = true;
+
+ err = fuse_perm_getattr(inode, mask);
+ if (err)
+ return err;
+ }
+ }
+
+ if (fc->default_permissions) {
+ err = generic_permission(&init_user_ns, inode, mask);
+
+ /* If permission is denied, try to refresh file
+ attributes. This is also needed, because the root
+ node will at first have no permissions */
+ if (err == -EACCES && !refreshed) {
+ err = fuse_perm_getattr(inode, mask);
+ if (!err)
+ err = generic_permission(&init_user_ns,
+ inode, mask);
+ }
+
+ /* Note: the opposite of the above test does not
+ exist. So if permissions are revoked this won't be
+ noticed immediately, only after the attribute
+ timeout has expired */
+ } else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
+ err = fuse_access(inode, mask);
+ } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
+ if (!(inode->i_mode & S_IXUGO)) {
+ if (refreshed)
+ return -EACCES;
+
+ err = fuse_perm_getattr(inode, mask);
+ if (!err && !(inode->i_mode & S_IXUGO))
+ return -EACCES;
+ }
+ }
+ return err;
+}
+
+static int fuse_readlink_page(struct inode *inode, struct page *page)
+{
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 };
+ struct fuse_args_pages ap = {
+ .num_pages = 1,
+ .pages = &page,
+ .descs = &desc,
+ };
+ char *link;
+ ssize_t res;
+
+ ap.args.opcode = FUSE_READLINK;
+ ap.args.nodeid = get_node_id(inode);
+ ap.args.out_pages = true;
+ ap.args.out_argvar = true;
+ ap.args.page_zeroing = true;
+ ap.args.out_numargs = 1;
+ ap.args.out_args[0].size = desc.length;
+ res = fuse_simple_request(fm, &ap.args);
+
+ fuse_invalidate_atime(inode);
+
+ if (res < 0)
+ return res;
+
+ if (WARN_ON(res >= PAGE_SIZE))
+ return -EIO;
+
+ link = page_address(page);
+ link[res] = '\0';
+
+ return 0;
+}
+
+static const char *fuse_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct page *page;
+ int err;
+
+ err = -EIO;
+ if (fuse_is_bad(inode))
+ goto out_err;
+
+ if (fc->cache_symlinks)
+ return page_get_link(dentry, inode, callback);
+
+ err = -ECHILD;
+ if (!dentry)
+ goto out_err;
+
+ page = alloc_page(GFP_KERNEL);
+ err = -ENOMEM;
+ if (!page)
+ goto out_err;
+
+ err = fuse_readlink_page(inode, page);
+ if (err) {
+ __free_page(page);
+ goto out_err;
+ }
+
+ set_delayed_call(callback, page_put_link, page);
+
+ return page_address(page);
+
+out_err:
+ return ERR_PTR(err);
+}
+
+static int fuse_dir_open(struct inode *inode, struct file *file)
+{
+ return fuse_open_common(inode, file, true);
+}
+
+static int fuse_dir_release(struct inode *inode, struct file *file)
+{
+ fuse_release_common(file, true);
+
+ return 0;
+}
+
+static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ int err;
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ if (fc->no_fsyncdir)
+ return 0;
+
+ inode_lock(inode);
+ err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNCDIR);
+ if (err == -ENOSYS) {
+ fc->no_fsyncdir = 1;
+ err = 0;
+ }
+ inode_unlock(inode);
+
+ return err;
+}
+
+static long fuse_dir_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
+
+ /* FUSE_IOCTL_DIR only supported for API version >= 7.18 */
+ if (fc->minor < 18)
+ return -ENOTTY;
+
+ return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR);
+}
+
+static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
+
+ if (fc->minor < 18)
+ return -ENOTTY;
+
+ return fuse_ioctl_common(file, cmd, arg,
+ FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR);
+}
+
+static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
+{
+ /* Always update if mtime is explicitly set */
+ if (ivalid & ATTR_MTIME_SET)
+ return true;
+
+ /* Or if kernel i_mtime is the official one */
+ if (trust_local_mtime)
+ return true;
+
+ /* If it's an open(O_TRUNC) or an ftruncate(), don't update */
+ if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE)))
+ return false;
+
+ /* In all other cases update */
+ return true;
+}
+
+static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr,
+ struct fuse_setattr_in *arg, bool trust_local_cmtime)
+{
+ unsigned ivalid = iattr->ia_valid;
+
+ if (ivalid & ATTR_MODE)
+ arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode;
+ if (ivalid & ATTR_UID)
+ arg->valid |= FATTR_UID, arg->uid = from_kuid(fc->user_ns, iattr->ia_uid);
+ if (ivalid & ATTR_GID)
+ arg->valid |= FATTR_GID, arg->gid = from_kgid(fc->user_ns, iattr->ia_gid);
+ if (ivalid & ATTR_SIZE)
+ arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size;
+ if (ivalid & ATTR_ATIME) {
+ arg->valid |= FATTR_ATIME;
+ arg->atime = iattr->ia_atime.tv_sec;
+ arg->atimensec = iattr->ia_atime.tv_nsec;
+ if (!(ivalid & ATTR_ATIME_SET))
+ arg->valid |= FATTR_ATIME_NOW;
+ }
+ if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) {
+ arg->valid |= FATTR_MTIME;
+ arg->mtime = iattr->ia_mtime.tv_sec;
+ arg->mtimensec = iattr->ia_mtime.tv_nsec;
+ if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime)
+ arg->valid |= FATTR_MTIME_NOW;
+ }
+ if ((ivalid & ATTR_CTIME) && trust_local_cmtime) {
+ arg->valid |= FATTR_CTIME;
+ arg->ctime = iattr->ia_ctime.tv_sec;
+ arg->ctimensec = iattr->ia_ctime.tv_nsec;
+ }
+}
+
+/*
+ * Prevent concurrent writepages on inode
+ *
+ * This is done by adding a negative bias to the inode write counter
+ * and waiting for all pending writes to finish.
+ */
+void fuse_set_nowrite(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ BUG_ON(!inode_is_locked(inode));
+
+ spin_lock(&fi->lock);
+ BUG_ON(fi->writectr < 0);
+ fi->writectr += FUSE_NOWRITE;
+ spin_unlock(&fi->lock);
+ wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE);
+}
+
+/*
+ * Allow writepages on inode
+ *
+ * Remove the bias from the writecounter and send any queued
+ * writepages.
+ */
+static void __fuse_release_nowrite(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ BUG_ON(fi->writectr != FUSE_NOWRITE);
+ fi->writectr = 0;
+ fuse_flush_writepages(inode);
+}
+
+void fuse_release_nowrite(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fi->lock);
+ __fuse_release_nowrite(inode);
+ spin_unlock(&fi->lock);
+}
+
+static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args,
+ struct inode *inode,
+ struct fuse_setattr_in *inarg_p,
+ struct fuse_attr_out *outarg_p)
+{
+ args->opcode = FUSE_SETATTR;
+ args->nodeid = get_node_id(inode);
+ args->in_numargs = 1;
+ args->in_args[0].size = sizeof(*inarg_p);
+ args->in_args[0].value = inarg_p;
+ args->out_numargs = 1;
+ args->out_args[0].size = sizeof(*outarg_p);
+ args->out_args[0].value = outarg_p;
+}
+
+/*
+ * Flush inode->i_mtime to the server
+ */
+int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
+{
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ FUSE_ARGS(args);
+ struct fuse_setattr_in inarg;
+ struct fuse_attr_out outarg;
+
+ memset(&inarg, 0, sizeof(inarg));
+ memset(&outarg, 0, sizeof(outarg));
+
+ inarg.valid = FATTR_MTIME;
+ inarg.mtime = inode->i_mtime.tv_sec;
+ inarg.mtimensec = inode->i_mtime.tv_nsec;
+ if (fm->fc->minor >= 23) {
+ inarg.valid |= FATTR_CTIME;
+ inarg.ctime = inode->i_ctime.tv_sec;
+ inarg.ctimensec = inode->i_ctime.tv_nsec;
+ }
+ if (ff) {
+ inarg.valid |= FATTR_FH;
+ inarg.fh = ff->fh;
+ }
+ fuse_setattr_fill(fm->fc, &args, inode, &inarg, &outarg);
+
+ return fuse_simple_request(fm, &args);
+}
+
+/*
+ * Set attributes, and at the same time refresh them.
+ *
+ * Truncation is slightly complicated, because the 'truncate' request
+ * may fail, in which case we don't want to touch the mapping.
+ * vmtruncate() doesn't allow for this case, so do the rlimit checking
+ * and the actual truncation by hand.
+ */
+int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
+ struct file *file)
+{
+ struct inode *inode = d_inode(dentry);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_conn *fc = fm->fc;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct address_space *mapping = inode->i_mapping;
+ FUSE_ARGS(args);
+ struct fuse_setattr_in inarg;
+ struct fuse_attr_out outarg;
+ bool is_truncate = false;
+ bool is_wb = fc->writeback_cache && S_ISREG(inode->i_mode);
+ loff_t oldsize;
+ int err;
+ bool trust_local_cmtime = is_wb;
+ bool fault_blocked = false;
+
+ if (!fc->default_permissions)
+ attr->ia_valid |= ATTR_FORCE;
+
+ err = setattr_prepare(&init_user_ns, dentry, attr);
+ if (err)
+ return err;
+
+ if (attr->ia_valid & ATTR_SIZE) {
+ if (WARN_ON(!S_ISREG(inode->i_mode)))
+ return -EIO;
+ is_truncate = true;
+ }
+
+ if (FUSE_IS_DAX(inode) && is_truncate) {
+ filemap_invalidate_lock(mapping);
+ fault_blocked = true;
+ err = fuse_dax_break_layouts(inode, 0, 0);
+ if (err) {
+ filemap_invalidate_unlock(mapping);
+ return err;
+ }
+ }
+
+ if (attr->ia_valid & ATTR_OPEN) {
+ /* This is coming from open(..., ... | O_TRUNC); */
+ WARN_ON(!(attr->ia_valid & ATTR_SIZE));
+ WARN_ON(attr->ia_size != 0);
+ if (fc->atomic_o_trunc) {
+ /*
+ * No need to send request to userspace, since actual
+ * truncation has already been done by OPEN. But still
+ * need to truncate page cache.
+ */
+ i_size_write(inode, 0);
+ truncate_pagecache(inode, 0);
+ goto out;
+ }
+ file = NULL;
+ }
+
+ /* Flush dirty data/metadata before non-truncate SETATTR */
+ if (is_wb &&
+ attr->ia_valid &
+ (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET |
+ ATTR_TIMES_SET)) {
+ err = write_inode_now(inode, true);
+ if (err)
+ return err;
+
+ fuse_set_nowrite(inode);
+ fuse_release_nowrite(inode);
+ }
+
+ if (is_truncate) {
+ fuse_set_nowrite(inode);
+ set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+ if (trust_local_cmtime && attr->ia_size != inode->i_size)
+ attr->ia_valid |= ATTR_MTIME | ATTR_CTIME;
+ }
+
+ memset(&inarg, 0, sizeof(inarg));
+ memset(&outarg, 0, sizeof(outarg));
+ iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime);
+ if (file) {
+ struct fuse_file *ff = file->private_data;
+ inarg.valid |= FATTR_FH;
+ inarg.fh = ff->fh;
+ }
+
+ /* Kill suid/sgid for non-directory chown unconditionally */
+ if (fc->handle_killpriv_v2 && !S_ISDIR(inode->i_mode) &&
+ attr->ia_valid & (ATTR_UID | ATTR_GID))
+ inarg.valid |= FATTR_KILL_SUIDGID;
+
+ if (attr->ia_valid & ATTR_SIZE) {
+ /* For mandatory locking in truncate */
+ inarg.valid |= FATTR_LOCKOWNER;
+ inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
+
+ /* Kill suid/sgid for truncate only if no CAP_FSETID */
+ if (fc->handle_killpriv_v2 && !capable(CAP_FSETID))
+ inarg.valid |= FATTR_KILL_SUIDGID;
+ }
+ fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
+ err = fuse_simple_request(fm, &args);
+ if (err) {
+ if (err == -EINTR)
+ fuse_invalidate_attr(inode);
+ goto error;
+ }
+
+ if (fuse_invalid_attr(&outarg.attr) ||
+ inode_wrong_type(inode, outarg.attr.mode)) {
+ fuse_make_bad(inode);
+ err = -EIO;
+ goto error;
+ }
+
+ spin_lock(&fi->lock);
+ /* the kernel maintains i_mtime locally */
+ if (trust_local_cmtime) {
+ if (attr->ia_valid & ATTR_MTIME)
+ inode->i_mtime = attr->ia_mtime;
+ if (attr->ia_valid & ATTR_CTIME)
+ inode->i_ctime = attr->ia_ctime;
+ /* FIXME: clear I_DIRTY_SYNC? */
+ }
+
+ fuse_change_attributes_common(inode, &outarg.attr,
+ attr_timeout(&outarg),
+ fuse_get_cache_mask(inode));
+ oldsize = inode->i_size;
+ /* see the comment in fuse_change_attributes() */
+ if (!is_wb || is_truncate)
+ i_size_write(inode, outarg.attr.size);
+
+ if (is_truncate) {
+ /* NOTE: this may release/reacquire fi->lock */
+ __fuse_release_nowrite(inode);
+ }
+ spin_unlock(&fi->lock);
+
+ /*
+ * Only call invalidate_inode_pages2() after removing
+ * FUSE_NOWRITE, otherwise fuse_launder_folio() would deadlock.
+ */
+ if ((is_truncate || !is_wb) &&
+ S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
+ truncate_pagecache(inode, outarg.attr.size);
+ invalidate_inode_pages2(mapping);
+ }
+
+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+out:
+ if (fault_blocked)
+ filemap_invalidate_unlock(mapping);
+
+ return 0;
+
+error:
+ if (is_truncate)
+ fuse_release_nowrite(inode);
+
+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+ if (fault_blocked)
+ filemap_invalidate_unlock(mapping);
+ return err;
+}
+
+static int fuse_setattr(struct user_namespace *mnt_userns, struct dentry *entry,
+ struct iattr *attr)
+{
+ struct inode *inode = d_inode(entry);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL;
+ int ret;
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ if (!fuse_allow_current_process(get_fuse_conn(inode)))
+ return -EACCES;
+
+ if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) {
+ attr->ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID |
+ ATTR_MODE);
+
+ /*
+ * The only sane way to reliably kill suid/sgid is to do it in
+ * the userspace filesystem
+ *
+ * This should be done on write(), truncate() and chown().
+ */
+ if (!fc->handle_killpriv && !fc->handle_killpriv_v2) {
+ /*
+ * ia_mode calculation may have used stale i_mode.
+ * Refresh and recalculate.
+ */
+ ret = fuse_do_getattr(inode, NULL, file);
+ if (ret)
+ return ret;
+
+ attr->ia_mode = inode->i_mode;
+ if (inode->i_mode & S_ISUID) {
+ attr->ia_valid |= ATTR_MODE;
+ attr->ia_mode &= ~S_ISUID;
+ }
+ if ((inode->i_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
+ attr->ia_valid |= ATTR_MODE;
+ attr->ia_mode &= ~S_ISGID;
+ }
+ }
+ }
+ if (!attr->ia_valid)
+ return 0;
+
+ ret = fuse_do_setattr(entry, attr, file);
+ if (!ret) {
+ /*
+ * If filesystem supports acls it may have updated acl xattrs in
+ * the filesystem, so forget cached acls for the inode.
+ */
+ if (fc->posix_acl)
+ forget_all_cached_acls(inode);
+
+ /* Directory mode changed, may need to revalidate access */
+ if (d_is_dir(entry) && (attr->ia_valid & ATTR_MODE))
+ fuse_invalidate_entry_cache(entry);
+ }
+ return ret;
+}
+
+static int fuse_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ if (!fuse_allow_current_process(fc)) {
+ if (!request_mask) {
+ /*
+ * If user explicitly requested *nothing* then don't
+ * error out, but return st_dev only.
+ */
+ stat->result_mask = 0;
+ stat->dev = inode->i_sb->s_dev;
+ return 0;
+ }
+ return -EACCES;
+ }
+
+ return fuse_update_get_attr(inode, NULL, stat, request_mask, flags);
+}
+
+static const struct inode_operations fuse_dir_inode_operations = {
+ .lookup = fuse_lookup,
+ .mkdir = fuse_mkdir,
+ .symlink = fuse_symlink,
+ .unlink = fuse_unlink,
+ .rmdir = fuse_rmdir,
+ .rename = fuse_rename2,
+ .link = fuse_link,
+ .setattr = fuse_setattr,
+ .create = fuse_create,
+ .atomic_open = fuse_atomic_open,
+ .tmpfile = fuse_tmpfile,
+ .mknod = fuse_mknod,
+ .permission = fuse_permission,
+ .getattr = fuse_getattr,
+ .listxattr = fuse_listxattr,
+ .get_acl = fuse_get_acl,
+ .set_acl = fuse_set_acl,
+ .fileattr_get = fuse_fileattr_get,
+ .fileattr_set = fuse_fileattr_set,
+};
+
+static const struct file_operations fuse_dir_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate_shared = fuse_readdir,
+ .open = fuse_dir_open,
+ .release = fuse_dir_release,
+ .fsync = fuse_dir_fsync,
+ .unlocked_ioctl = fuse_dir_ioctl,
+ .compat_ioctl = fuse_dir_compat_ioctl,
+};
+
+static const struct inode_operations fuse_common_inode_operations = {
+ .setattr = fuse_setattr,
+ .permission = fuse_permission,
+ .getattr = fuse_getattr,
+ .listxattr = fuse_listxattr,
+ .get_acl = fuse_get_acl,
+ .set_acl = fuse_set_acl,
+ .fileattr_get = fuse_fileattr_get,
+ .fileattr_set = fuse_fileattr_set,
+};
+
+static const struct inode_operations fuse_symlink_inode_operations = {
+ .setattr = fuse_setattr,
+ .get_link = fuse_get_link,
+ .getattr = fuse_getattr,
+ .listxattr = fuse_listxattr,
+};
+
+void fuse_init_common(struct inode *inode)
+{
+ inode->i_op = &fuse_common_inode_operations;
+}
+
+void fuse_init_dir(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ inode->i_op = &fuse_dir_inode_operations;
+ inode->i_fop = &fuse_dir_operations;
+
+ spin_lock_init(&fi->rdc.lock);
+ fi->rdc.cached = false;
+ fi->rdc.size = 0;
+ fi->rdc.pos = 0;
+ fi->rdc.version = 0;
+}
+
+static int fuse_symlink_read_folio(struct file *null, struct folio *folio)
+{
+ int err = fuse_readlink_page(folio->mapping->host, &folio->page);
+
+ if (!err)
+ folio_mark_uptodate(folio);
+
+ folio_unlock(folio);
+
+ return err;
+}
+
+static const struct address_space_operations fuse_symlink_aops = {
+ .read_folio = fuse_symlink_read_folio,
+};
+
+void fuse_init_symlink(struct inode *inode)
+{
+ inode->i_op = &fuse_symlink_inode_operations;
+ inode->i_data.a_ops = &fuse_symlink_aops;
+ inode_nohighmem(inode);
+}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
new file mode 100644
index 000000000..c996c0ef8
--- /dev/null
+++ b/fs/fuse/file.c
@@ -0,0 +1,3216 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/falloc.h>
+#include <linux/uio.h>
+#include <linux/fs.h>
+
+static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
+ unsigned int open_flags, int opcode,
+ struct fuse_open_out *outargp)
+{
+ struct fuse_open_in inarg;
+ FUSE_ARGS(args);
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
+ if (!fm->fc->atomic_o_trunc)
+ inarg.flags &= ~O_TRUNC;
+
+ if (fm->fc->handle_killpriv_v2 &&
+ (inarg.flags & O_TRUNC) && !capable(CAP_FSETID)) {
+ inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
+ }
+
+ args.opcode = opcode;
+ args.nodeid = nodeid;
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.out_numargs = 1;
+ args.out_args[0].size = sizeof(*outargp);
+ args.out_args[0].value = outargp;
+
+ return fuse_simple_request(fm, &args);
+}
+
+struct fuse_release_args {
+ struct fuse_args args;
+ struct fuse_release_in inarg;
+ struct inode *inode;
+};
+
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
+{
+ struct fuse_file *ff;
+
+ ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT);
+ if (unlikely(!ff))
+ return NULL;
+
+ ff->fm = fm;
+ ff->release_args = kzalloc(sizeof(*ff->release_args),
+ GFP_KERNEL_ACCOUNT);
+ if (!ff->release_args) {
+ kfree(ff);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&ff->write_entry);
+ mutex_init(&ff->readdir.lock);
+ refcount_set(&ff->count, 1);
+ RB_CLEAR_NODE(&ff->polled_node);
+ init_waitqueue_head(&ff->poll_wait);
+
+ ff->kh = atomic64_inc_return(&fm->fc->khctr);
+
+ return ff;
+}
+
+void fuse_file_free(struct fuse_file *ff)
+{
+ kfree(ff->release_args);
+ mutex_destroy(&ff->readdir.lock);
+ kfree(ff);
+}
+
+static struct fuse_file *fuse_file_get(struct fuse_file *ff)
+{
+ refcount_inc(&ff->count);
+ return ff;
+}
+
+static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
+ int error)
+{
+ struct fuse_release_args *ra = container_of(args, typeof(*ra), args);
+
+ iput(ra->inode);
+ kfree(ra);
+}
+
+static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
+{
+ if (refcount_dec_and_test(&ff->count)) {
+ struct fuse_args *args = &ff->release_args->args;
+
+ if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) {
+ /* Do nothing when client does not implement 'open' */
+ fuse_release_end(ff->fm, args, 0);
+ } else if (sync) {
+ fuse_simple_request(ff->fm, args);
+ fuse_release_end(ff->fm, args, 0);
+ } else {
+ args->end = fuse_release_end;
+ if (fuse_simple_background(ff->fm, args,
+ GFP_KERNEL | __GFP_NOFAIL))
+ fuse_release_end(ff->fm, args, -ENOTCONN);
+ }
+ kfree(ff);
+ }
+}
+
+struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
+ unsigned int open_flags, bool isdir)
+{
+ struct fuse_conn *fc = fm->fc;
+ struct fuse_file *ff;
+ int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
+
+ ff = fuse_file_alloc(fm);
+ if (!ff)
+ return ERR_PTR(-ENOMEM);
+
+ ff->fh = 0;
+ /* Default for no-open */
+ ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
+ if (isdir ? !fc->no_opendir : !fc->no_open) {
+ struct fuse_open_out outarg;
+ int err;
+
+ err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg);
+ if (!err) {
+ ff->fh = outarg.fh;
+ ff->open_flags = outarg.open_flags;
+
+ } else if (err != -ENOSYS) {
+ fuse_file_free(ff);
+ return ERR_PTR(err);
+ } else {
+ if (isdir)
+ fc->no_opendir = 1;
+ else
+ fc->no_open = 1;
+ }
+ }
+
+ if (isdir)
+ ff->open_flags &= ~FOPEN_DIRECT_IO;
+
+ ff->nodeid = nodeid;
+
+ return ff;
+}
+
+int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
+ bool isdir)
+{
+ struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir);
+
+ if (!IS_ERR(ff))
+ file->private_data = ff;
+
+ return PTR_ERR_OR_ZERO(ff);
+}
+EXPORT_SYMBOL_GPL(fuse_do_open);
+
+static void fuse_link_write_file(struct file *file)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_file *ff = file->private_data;
+ /*
+ * file may be written through mmap, so chain it onto the
+ * inodes's write_file list
+ */
+ spin_lock(&fi->lock);
+ if (list_empty(&ff->write_entry))
+ list_add(&ff->write_entry, &fi->write_files);
+ spin_unlock(&fi->lock);
+}
+
+void fuse_finish_open(struct inode *inode, struct file *file)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (ff->open_flags & FOPEN_STREAM)
+ stream_open(inode, file);
+ else if (ff->open_flags & FOPEN_NONSEEKABLE)
+ nonseekable_open(inode, file);
+
+ if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ i_size_write(inode, 0);
+ spin_unlock(&fi->lock);
+ file_update_time(file);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
+ }
+ if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
+ fuse_link_write_file(file);
+}
+
+int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
+{
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_conn *fc = fm->fc;
+ int err;
+ bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
+ fc->atomic_o_trunc &&
+ fc->writeback_cache;
+ bool dax_truncate = (file->f_flags & O_TRUNC) &&
+ fc->atomic_o_trunc && FUSE_IS_DAX(inode);
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ err = generic_file_open(inode, file);
+ if (err)
+ return err;
+
+ if (is_wb_truncate || dax_truncate)
+ inode_lock(inode);
+
+ if (dax_truncate) {
+ filemap_invalidate_lock(inode->i_mapping);
+ err = fuse_dax_break_layouts(inode, 0, 0);
+ if (err)
+ goto out_inode_unlock;
+ }
+
+ if (is_wb_truncate || dax_truncate)
+ fuse_set_nowrite(inode);
+
+ err = fuse_do_open(fm, get_node_id(inode), file, isdir);
+ if (!err)
+ fuse_finish_open(inode, file);
+
+ if (is_wb_truncate || dax_truncate)
+ fuse_release_nowrite(inode);
+ if (!err) {
+ struct fuse_file *ff = file->private_data;
+
+ if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC))
+ truncate_pagecache(inode, 0);
+ else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+ invalidate_inode_pages2(inode->i_mapping);
+ }
+ if (dax_truncate)
+ filemap_invalidate_unlock(inode->i_mapping);
+out_inode_unlock:
+ if (is_wb_truncate || dax_truncate)
+ inode_unlock(inode);
+
+ return err;
+}
+
+static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
+ unsigned int flags, int opcode)
+{
+ struct fuse_conn *fc = ff->fm->fc;
+ struct fuse_release_args *ra = ff->release_args;
+
+ /* Inode is NULL on error path of fuse_create_open() */
+ if (likely(fi)) {
+ spin_lock(&fi->lock);
+ list_del(&ff->write_entry);
+ spin_unlock(&fi->lock);
+ }
+ spin_lock(&fc->lock);
+ if (!RB_EMPTY_NODE(&ff->polled_node))
+ rb_erase(&ff->polled_node, &fc->polled_files);
+ spin_unlock(&fc->lock);
+
+ wake_up_interruptible_all(&ff->poll_wait);
+
+ ra->inarg.fh = ff->fh;
+ ra->inarg.flags = flags;
+ ra->args.in_numargs = 1;
+ ra->args.in_args[0].size = sizeof(struct fuse_release_in);
+ ra->args.in_args[0].value = &ra->inarg;
+ ra->args.opcode = opcode;
+ ra->args.nodeid = ff->nodeid;
+ ra->args.force = true;
+ ra->args.nocreds = true;
+}
+
+void fuse_file_release(struct inode *inode, struct fuse_file *ff,
+ unsigned int open_flags, fl_owner_t id, bool isdir)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_release_args *ra = ff->release_args;
+ int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
+
+ fuse_prepare_release(fi, ff, open_flags, opcode);
+
+ if (ff->flock) {
+ ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
+ ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id);
+ }
+ /* Hold inode until release is finished */
+ ra->inode = igrab(inode);
+
+ /*
+ * Normally this will send the RELEASE request, however if
+ * some asynchronous READ or WRITE requests are outstanding,
+ * the sending will be delayed.
+ *
+ * Make the release synchronous if this is a fuseblk mount,
+ * synchronous RELEASE is allowed (and desirable) in this case
+ * because the server can be trusted not to screw up.
+ */
+ fuse_file_put(ff, ff->fm->fc->destroy, isdir);
+}
+
+void fuse_release_common(struct file *file, bool isdir)
+{
+ fuse_file_release(file_inode(file), file->private_data, file->f_flags,
+ (fl_owner_t) file, isdir);
+}
+
+static int fuse_open(struct inode *inode, struct file *file)
+{
+ return fuse_open_common(inode, file, false);
+}
+
+static int fuse_release(struct inode *inode, struct file *file)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ /*
+ * Dirty pages might remain despite write_inode_now() call from
+ * fuse_flush() due to writes racing with the close.
+ */
+ if (fc->writeback_cache)
+ write_inode_now(inode, 1);
+
+ fuse_release_common(file, false);
+
+ /* return value is ignored by VFS */
+ return 0;
+}
+
+void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
+ unsigned int flags)
+{
+ WARN_ON(refcount_read(&ff->count) > 1);
+ fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
+ /*
+ * iput(NULL) is a no-op and since the refcount is 1 and everything's
+ * synchronous, we are fine with not doing igrab() here"
+ */
+ fuse_file_put(ff, true, false);
+}
+EXPORT_SYMBOL_GPL(fuse_sync_release);
+
+/*
+ * Scramble the ID space with XTEA, so that the value of the files_struct
+ * pointer is not exposed to userspace.
+ */
+u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
+{
+ u32 *k = fc->scramble_key;
+ u64 v = (unsigned long) id;
+ u32 v0 = v;
+ u32 v1 = v >> 32;
+ u32 sum = 0;
+ int i;
+
+ for (i = 0; i < 32; i++) {
+ v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
+ sum += 0x9E3779B9;
+ v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
+ }
+
+ return (u64) v0 + ((u64) v1 << 32);
+}
+
+struct fuse_writepage_args {
+ struct fuse_io_args ia;
+ struct rb_node writepages_entry;
+ struct list_head queue_entry;
+ struct fuse_writepage_args *next;
+ struct inode *inode;
+ struct fuse_sync_bucket *bucket;
+};
+
+static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
+ pgoff_t idx_from, pgoff_t idx_to)
+{
+ struct rb_node *n;
+
+ n = fi->writepages.rb_node;
+
+ while (n) {
+ struct fuse_writepage_args *wpa;
+ pgoff_t curr_index;
+
+ wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry);
+ WARN_ON(get_fuse_inode(wpa->inode) != fi);
+ curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
+ if (idx_from >= curr_index + wpa->ia.ap.num_pages)
+ n = n->rb_right;
+ else if (idx_to < curr_index)
+ n = n->rb_left;
+ else
+ return wpa;
+ }
+ return NULL;
+}
+
+/*
+ * Check if any page in a range is under writeback
+ *
+ * This is currently done by walking the list of writepage requests
+ * for the inode, which can be pretty inefficient.
+ */
+static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
+ pgoff_t idx_to)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ bool found;
+
+ spin_lock(&fi->lock);
+ found = fuse_find_writeback(fi, idx_from, idx_to);
+ spin_unlock(&fi->lock);
+
+ return found;
+}
+
+static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
+{
+ return fuse_range_is_writeback(inode, index, index);
+}
+
+/*
+ * Wait for page writeback to be completed.
+ *
+ * Since fuse doesn't rely on the VM writeback tracking, this has to
+ * use some other means.
+ */
+static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
+}
+
+/*
+ * Wait for all pending writepages on the inode to finish.
+ *
+ * This is currently done by blocking further writes with FUSE_NOWRITE
+ * and waiting for all sent writes to complete.
+ *
+ * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
+ * could conflict with truncation.
+ */
+static void fuse_sync_writes(struct inode *inode)
+{
+ fuse_set_nowrite(inode);
+ fuse_release_nowrite(inode);
+}
+
+static int fuse_flush(struct file *file, fl_owner_t id)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_file *ff = file->private_data;
+ struct fuse_flush_in inarg;
+ FUSE_ARGS(args);
+ int err;
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache)
+ return 0;
+
+ err = write_inode_now(inode, 1);
+ if (err)
+ return err;
+
+ inode_lock(inode);
+ fuse_sync_writes(inode);
+ inode_unlock(inode);
+
+ err = filemap_check_errors(file->f_mapping);
+ if (err)
+ return err;
+
+ err = 0;
+ if (fm->fc->no_flush)
+ goto inval_attr_out;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.fh = ff->fh;
+ inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
+ args.opcode = FUSE_FLUSH;
+ args.nodeid = get_node_id(inode);
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.force = true;
+
+ err = fuse_simple_request(fm, &args);
+ if (err == -ENOSYS) {
+ fm->fc->no_flush = 1;
+ err = 0;
+ }
+
+inval_attr_out:
+ /*
+ * In memory i_blocks is not maintained by fuse, if writeback cache is
+ * enabled, i_blocks from cached attr may not be accurate.
+ */
+ if (!err && fm->fc->writeback_cache)
+ fuse_invalidate_attr_mask(inode, STATX_BLOCKS);
+ return err;
+}
+
+int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
+ int datasync, int opcode)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_file *ff = file->private_data;
+ FUSE_ARGS(args);
+ struct fuse_fsync_in inarg;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.fh = ff->fh;
+ inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0;
+ args.opcode = opcode;
+ args.nodeid = get_node_id(inode);
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ return fuse_simple_request(fm, &args);
+}
+
+static int fuse_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ int err;
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ inode_lock(inode);
+
+ /*
+ * Start writeback against all dirty pages of the inode, then
+ * wait for all outstanding writes, before sending the FSYNC
+ * request.
+ */
+ err = file_write_and_wait_range(file, start, end);
+ if (err)
+ goto out;
+
+ fuse_sync_writes(inode);
+
+ /*
+ * Due to implementation of fuse writeback
+ * file_write_and_wait_range() does not catch errors.
+ * We have to do this directly after fuse_sync_writes()
+ */
+ err = file_check_and_advance_wb_err(file);
+ if (err)
+ goto out;
+
+ err = sync_inode_metadata(inode, 1);
+ if (err)
+ goto out;
+
+ if (fc->no_fsync)
+ goto out;
+
+ err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC);
+ if (err == -ENOSYS) {
+ fc->no_fsync = 1;
+ err = 0;
+ }
+out:
+ inode_unlock(inode);
+
+ return err;
+}
+
+void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
+ size_t count, int opcode)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_args *args = &ia->ap.args;
+
+ ia->read.in.fh = ff->fh;
+ ia->read.in.offset = pos;
+ ia->read.in.size = count;
+ ia->read.in.flags = file->f_flags;
+ args->opcode = opcode;
+ args->nodeid = ff->nodeid;
+ args->in_numargs = 1;
+ args->in_args[0].size = sizeof(ia->read.in);
+ args->in_args[0].value = &ia->read.in;
+ args->out_argvar = true;
+ args->out_numargs = 1;
+ args->out_args[0].size = count;
+}
+
+static void fuse_release_user_pages(struct fuse_args_pages *ap,
+ bool should_dirty)
+{
+ unsigned int i;
+
+ for (i = 0; i < ap->num_pages; i++) {
+ if (should_dirty)
+ set_page_dirty_lock(ap->pages[i]);
+ put_page(ap->pages[i]);
+ }
+}
+
+static void fuse_io_release(struct kref *kref)
+{
+ kfree(container_of(kref, struct fuse_io_priv, refcnt));
+}
+
+static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
+{
+ if (io->err)
+ return io->err;
+
+ if (io->bytes >= 0 && io->write)
+ return -EIO;
+
+ return io->bytes < 0 ? io->size : io->bytes;
+}
+
+/**
+ * In case of short read, the caller sets 'pos' to the position of
+ * actual end of fuse request in IO request. Otherwise, if bytes_requested
+ * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
+ *
+ * An example:
+ * User requested DIO read of 64K. It was split into two 32K fuse requests,
+ * both submitted asynchronously. The first of them was ACKed by userspace as
+ * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
+ * second request was ACKed as short, e.g. only 1K was read, resulting in
+ * pos == 33K.
+ *
+ * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
+ * will be equal to the length of the longest contiguous fragment of
+ * transferred data starting from the beginning of IO request.
+ */
+static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
+{
+ int left;
+
+ spin_lock(&io->lock);
+ if (err)
+ io->err = io->err ? : err;
+ else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
+ io->bytes = pos;
+
+ left = --io->reqs;
+ if (!left && io->blocking)
+ complete(io->done);
+ spin_unlock(&io->lock);
+
+ if (!left && !io->blocking) {
+ ssize_t res = fuse_get_res_by_io(io);
+
+ if (res >= 0) {
+ struct inode *inode = file_inode(io->iocb->ki_filp);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ spin_unlock(&fi->lock);
+ }
+
+ io->iocb->ki_complete(io->iocb, res);
+ }
+
+ kref_put(&io->refcnt, fuse_io_release);
+}
+
+static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
+ unsigned int npages)
+{
+ struct fuse_io_args *ia;
+
+ ia = kzalloc(sizeof(*ia), GFP_KERNEL);
+ if (ia) {
+ ia->io = io;
+ ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL,
+ &ia->ap.descs);
+ if (!ia->ap.pages) {
+ kfree(ia);
+ ia = NULL;
+ }
+ }
+ return ia;
+}
+
+static void fuse_io_free(struct fuse_io_args *ia)
+{
+ kfree(ia->ap.pages);
+ kfree(ia);
+}
+
+static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args,
+ int err)
+{
+ struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
+ struct fuse_io_priv *io = ia->io;
+ ssize_t pos = -1;
+
+ fuse_release_user_pages(&ia->ap, io->should_dirty);
+
+ if (err) {
+ /* Nothing */
+ } else if (io->write) {
+ if (ia->write.out.size > ia->write.in.size) {
+ err = -EIO;
+ } else if (ia->write.in.size != ia->write.out.size) {
+ pos = ia->write.in.offset - io->offset +
+ ia->write.out.size;
+ }
+ } else {
+ u32 outsize = args->out_args[0].size;
+
+ if (ia->read.in.size != outsize)
+ pos = ia->read.in.offset - io->offset + outsize;
+ }
+
+ fuse_aio_complete(io, err, pos);
+ fuse_io_free(ia);
+}
+
+static ssize_t fuse_async_req_send(struct fuse_mount *fm,
+ struct fuse_io_args *ia, size_t num_bytes)
+{
+ ssize_t err;
+ struct fuse_io_priv *io = ia->io;
+
+ spin_lock(&io->lock);
+ kref_get(&io->refcnt);
+ io->size += num_bytes;
+ io->reqs++;
+ spin_unlock(&io->lock);
+
+ ia->ap.args.end = fuse_aio_complete_req;
+ ia->ap.args.may_block = io->should_dirty;
+ err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL);
+ if (err)
+ fuse_aio_complete_req(fm, &ia->ap.args, err);
+
+ return num_bytes;
+}
+
+static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
+ fl_owner_t owner)
+{
+ struct file *file = ia->io->iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+ struct fuse_mount *fm = ff->fm;
+
+ fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
+ if (owner != NULL) {
+ ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
+ ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner);
+ }
+
+ if (ia->io->async)
+ return fuse_async_req_send(fm, ia, count);
+
+ return fuse_simple_request(fm, &ia->ap.args);
+}
+
+static void fuse_read_update_size(struct inode *inode, loff_t size,
+ u64 attr_ver)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fi->lock);
+ if (attr_ver >= fi->attr_version && size < inode->i_size &&
+ !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ i_size_write(inode, size);
+ }
+ spin_unlock(&fi->lock);
+}
+
+static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
+ struct fuse_args_pages *ap)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ /*
+ * If writeback_cache is enabled, a short read means there's a hole in
+ * the file. Some data after the hole is in page cache, but has not
+ * reached the client fs yet. So the hole is not present there.
+ */
+ if (!fc->writeback_cache) {
+ loff_t pos = page_offset(ap->pages[0]) + num_read;
+ fuse_read_update_size(inode, pos, attr_ver);
+ }
+}
+
+static int fuse_do_readpage(struct file *file, struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ loff_t pos = page_offset(page);
+ struct fuse_page_desc desc = { .length = PAGE_SIZE };
+ struct fuse_io_args ia = {
+ .ap.args.page_zeroing = true,
+ .ap.args.out_pages = true,
+ .ap.num_pages = 1,
+ .ap.pages = &page,
+ .ap.descs = &desc,
+ };
+ ssize_t res;
+ u64 attr_ver;
+
+ /*
+ * Page writeback can extend beyond the lifetime of the
+ * page-cache page, so make sure we read a properly synced
+ * page.
+ */
+ fuse_wait_on_page_writeback(inode, page->index);
+
+ attr_ver = fuse_get_attr_version(fm->fc);
+
+ /* Don't overflow end offset */
+ if (pos + (desc.length - 1) == LLONG_MAX)
+ desc.length--;
+
+ fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
+ res = fuse_simple_request(fm, &ia.ap.args);
+ if (res < 0)
+ return res;
+ /*
+ * Short read means EOF. If file size is larger, truncate it
+ */
+ if (res < desc.length)
+ fuse_short_read(inode, attr_ver, res, &ia.ap);
+
+ SetPageUptodate(page);
+
+ return 0;
+}
+
+static int fuse_read_folio(struct file *file, struct folio *folio)
+{
+ struct page *page = &folio->page;
+ struct inode *inode = page->mapping->host;
+ int err;
+
+ err = -EIO;
+ if (fuse_is_bad(inode))
+ goto out;
+
+ err = fuse_do_readpage(file, page);
+ fuse_invalidate_atime(inode);
+ out:
+ unlock_page(page);
+ return err;
+}
+
+static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
+ int err)
+{
+ int i;
+ struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
+ struct fuse_args_pages *ap = &ia->ap;
+ size_t count = ia->read.in.size;
+ size_t num_read = args->out_args[0].size;
+ struct address_space *mapping = NULL;
+
+ for (i = 0; mapping == NULL && i < ap->num_pages; i++)
+ mapping = ap->pages[i]->mapping;
+
+ if (mapping) {
+ struct inode *inode = mapping->host;
+
+ /*
+ * Short read means EOF. If file size is larger, truncate it
+ */
+ if (!err && num_read < count)
+ fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
+
+ fuse_invalidate_atime(inode);
+ }
+
+ for (i = 0; i < ap->num_pages; i++) {
+ struct page *page = ap->pages[i];
+
+ if (!err)
+ SetPageUptodate(page);
+ else
+ SetPageError(page);
+ unlock_page(page);
+ put_page(page);
+ }
+ if (ia->ff)
+ fuse_file_put(ia->ff, false, false);
+
+ fuse_io_free(ia);
+}
+
+static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_mount *fm = ff->fm;
+ struct fuse_args_pages *ap = &ia->ap;
+ loff_t pos = page_offset(ap->pages[0]);
+ size_t count = ap->num_pages << PAGE_SHIFT;
+ ssize_t res;
+ int err;
+
+ ap->args.out_pages = true;
+ ap->args.page_zeroing = true;
+ ap->args.page_replace = true;
+
+ /* Don't overflow end offset */
+ if (pos + (count - 1) == LLONG_MAX) {
+ count--;
+ ap->descs[ap->num_pages - 1].length--;
+ }
+ WARN_ON((loff_t) (pos + count) < 0);
+
+ fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
+ ia->read.attr_ver = fuse_get_attr_version(fm->fc);
+ if (fm->fc->async_read) {
+ ia->ff = fuse_file_get(ff);
+ ap->args.end = fuse_readpages_end;
+ err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
+ if (!err)
+ return;
+ } else {
+ res = fuse_simple_request(fm, &ap->args);
+ err = res < 0 ? res : 0;
+ }
+ fuse_readpages_end(fm, &ap->args, err);
+}
+
+static void fuse_readahead(struct readahead_control *rac)
+{
+ struct inode *inode = rac->mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ unsigned int i, max_pages, nr_pages = 0;
+
+ if (fuse_is_bad(inode))
+ return;
+
+ max_pages = min_t(unsigned int, fc->max_pages,
+ fc->max_read / PAGE_SIZE);
+
+ for (;;) {
+ struct fuse_io_args *ia;
+ struct fuse_args_pages *ap;
+
+ if (fc->num_background >= fc->congestion_threshold &&
+ rac->ra->async_size >= readahead_count(rac))
+ /*
+ * Congested and only async pages left, so skip the
+ * rest.
+ */
+ break;
+
+ nr_pages = readahead_count(rac) - nr_pages;
+ if (nr_pages > max_pages)
+ nr_pages = max_pages;
+ if (nr_pages == 0)
+ break;
+ ia = fuse_io_alloc(NULL, nr_pages);
+ if (!ia)
+ return;
+ ap = &ia->ap;
+ nr_pages = __readahead_batch(rac, ap->pages, nr_pages);
+ for (i = 0; i < nr_pages; i++) {
+ fuse_wait_on_page_writeback(inode,
+ readahead_index(rac) + i);
+ ap->descs[i].length = PAGE_SIZE;
+ }
+ ap->num_pages = nr_pages;
+ fuse_send_readpages(ia, rac->file);
+ }
+}
+
+static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ /*
+ * In auto invalidate mode, always update attributes on read.
+ * Otherwise, only update if we attempt to read past EOF (to ensure
+ * i_size is up to date).
+ */
+ if (fc->auto_inval_data ||
+ (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
+ int err;
+ err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE);
+ if (err)
+ return err;
+ }
+
+ return generic_file_read_iter(iocb, to);
+}
+
+static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
+ loff_t pos, size_t count)
+{
+ struct fuse_args *args = &ia->ap.args;
+
+ ia->write.in.fh = ff->fh;
+ ia->write.in.offset = pos;
+ ia->write.in.size = count;
+ args->opcode = FUSE_WRITE;
+ args->nodeid = ff->nodeid;
+ args->in_numargs = 2;
+ if (ff->fm->fc->minor < 9)
+ args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
+ else
+ args->in_args[0].size = sizeof(ia->write.in);
+ args->in_args[0].value = &ia->write.in;
+ args->in_args[1].size = count;
+ args->out_numargs = 1;
+ args->out_args[0].size = sizeof(ia->write.out);
+ args->out_args[0].value = &ia->write.out;
+}
+
+static unsigned int fuse_write_flags(struct kiocb *iocb)
+{
+ unsigned int flags = iocb->ki_filp->f_flags;
+
+ if (iocb_is_dsync(iocb))
+ flags |= O_DSYNC;
+ if (iocb->ki_flags & IOCB_SYNC)
+ flags |= O_SYNC;
+
+ return flags;
+}
+
+static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
+ size_t count, fl_owner_t owner)
+{
+ struct kiocb *iocb = ia->io->iocb;
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+ struct fuse_mount *fm = ff->fm;
+ struct fuse_write_in *inarg = &ia->write.in;
+ ssize_t err;
+
+ fuse_write_args_fill(ia, ff, pos, count);
+ inarg->flags = fuse_write_flags(iocb);
+ if (owner != NULL) {
+ inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
+ inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner);
+ }
+
+ if (ia->io->async)
+ return fuse_async_req_send(fm, ia, count);
+
+ err = fuse_simple_request(fm, &ia->ap.args);
+ if (!err && ia->write.out.size > count)
+ err = -EIO;
+
+ return err ?: ia->write.out.size;
+}
+
+bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ bool ret = false;
+
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ if (written > 0 && pos > inode->i_size) {
+ i_size_write(inode, pos);
+ ret = true;
+ }
+ spin_unlock(&fi->lock);
+
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
+
+ return ret;
+}
+
+static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
+ struct kiocb *iocb, struct inode *inode,
+ loff_t pos, size_t count)
+{
+ struct fuse_args_pages *ap = &ia->ap;
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+ struct fuse_mount *fm = ff->fm;
+ unsigned int offset, i;
+ bool short_write;
+ int err;
+
+ for (i = 0; i < ap->num_pages; i++)
+ fuse_wait_on_page_writeback(inode, ap->pages[i]->index);
+
+ fuse_write_args_fill(ia, ff, pos, count);
+ ia->write.in.flags = fuse_write_flags(iocb);
+ if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID))
+ ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;
+
+ err = fuse_simple_request(fm, &ap->args);
+ if (!err && ia->write.out.size > count)
+ err = -EIO;
+
+ short_write = ia->write.out.size < count;
+ offset = ap->descs[0].offset;
+ count = ia->write.out.size;
+ for (i = 0; i < ap->num_pages; i++) {
+ struct page *page = ap->pages[i];
+
+ if (err) {
+ ClearPageUptodate(page);
+ } else {
+ if (count >= PAGE_SIZE - offset)
+ count -= PAGE_SIZE - offset;
+ else {
+ if (short_write)
+ ClearPageUptodate(page);
+ count = 0;
+ }
+ offset = 0;
+ }
+ if (ia->write.page_locked && (i == ap->num_pages - 1))
+ unlock_page(page);
+ put_page(page);
+ }
+
+ return err;
+}
+
+static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
+ struct address_space *mapping,
+ struct iov_iter *ii, loff_t pos,
+ unsigned int max_pages)
+{
+ struct fuse_args_pages *ap = &ia->ap;
+ struct fuse_conn *fc = get_fuse_conn(mapping->host);
+ unsigned offset = pos & (PAGE_SIZE - 1);
+ size_t count = 0;
+ int err;
+
+ ap->args.in_pages = true;
+ ap->descs[0].offset = offset;
+
+ do {
+ size_t tmp;
+ struct page *page;
+ pgoff_t index = pos >> PAGE_SHIFT;
+ size_t bytes = min_t(size_t, PAGE_SIZE - offset,
+ iov_iter_count(ii));
+
+ bytes = min_t(size_t, bytes, fc->max_write - count);
+
+ again:
+ err = -EFAULT;
+ if (fault_in_iov_iter_readable(ii, bytes))
+ break;
+
+ err = -ENOMEM;
+ page = grab_cache_page_write_begin(mapping, index);
+ if (!page)
+ break;
+
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_page(page);
+
+ tmp = copy_page_from_iter_atomic(page, offset, bytes, ii);
+ flush_dcache_page(page);
+
+ if (!tmp) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+
+ err = 0;
+ ap->pages[ap->num_pages] = page;
+ ap->descs[ap->num_pages].length = tmp;
+ ap->num_pages++;
+
+ count += tmp;
+ pos += tmp;
+ offset += tmp;
+ if (offset == PAGE_SIZE)
+ offset = 0;
+
+ /* If we copied full page, mark it uptodate */
+ if (tmp == PAGE_SIZE)
+ SetPageUptodate(page);
+
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ } else {
+ ia->write.page_locked = true;
+ break;
+ }
+ if (!fc->big_writes)
+ break;
+ } while (iov_iter_count(ii) && count < fc->max_write &&
+ ap->num_pages < max_pages && offset == 0);
+
+ return count > 0 ? count : err;
+}
+
+static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
+ unsigned int max_pages)
+{
+ return min_t(unsigned int,
+ ((pos + len - 1) >> PAGE_SHIFT) -
+ (pos >> PAGE_SHIFT) + 1,
+ max_pages);
+}
+
+static ssize_t fuse_perform_write(struct kiocb *iocb,
+ struct address_space *mapping,
+ struct iov_iter *ii, loff_t pos)
+{
+ struct inode *inode = mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ int err = 0;
+ ssize_t res = 0;
+
+ if (inode->i_size < pos + iov_iter_count(ii))
+ set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+ do {
+ ssize_t count;
+ struct fuse_io_args ia = {};
+ struct fuse_args_pages *ap = &ia.ap;
+ unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
+ fc->max_pages);
+
+ ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs);
+ if (!ap->pages) {
+ err = -ENOMEM;
+ break;
+ }
+
+ count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages);
+ if (count <= 0) {
+ err = count;
+ } else {
+ err = fuse_send_write_pages(&ia, iocb, inode,
+ pos, count);
+ if (!err) {
+ size_t num_written = ia.write.out.size;
+
+ res += num_written;
+ pos += num_written;
+
+ /* break out of the loop on short write */
+ if (num_written != count)
+ err = -EIO;
+ }
+ }
+ kfree(ap->pages);
+ } while (!err && iov_iter_count(ii));
+
+ fuse_write_update_attr(inode, pos, res);
+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+ return res > 0 ? res : err;
+}
+
+static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ ssize_t written = 0;
+ ssize_t written_buffered = 0;
+ struct inode *inode = mapping->host;
+ ssize_t err;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ loff_t endbyte = 0;
+
+ if (fc->writeback_cache) {
+ /* Update size (EOF optimization) and mode (SUID clearing) */
+ err = fuse_update_attributes(mapping->host, file,
+ STATX_SIZE | STATX_MODE);
+ if (err)
+ return err;
+
+ if (fc->handle_killpriv_v2 &&
+ setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) {
+ goto writethrough;
+ }
+
+ return generic_file_write_iter(iocb, from);
+ }
+
+writethrough:
+ inode_lock(inode);
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = inode_to_bdi(inode);
+
+ err = generic_write_checks(iocb, from);
+ if (err <= 0)
+ goto out;
+
+ err = file_remove_privs(file);
+ if (err)
+ goto out;
+
+ err = file_update_time(file);
+ if (err)
+ goto out;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ loff_t pos = iocb->ki_pos;
+ written = generic_file_direct_write(iocb, from);
+ if (written < 0 || !iov_iter_count(from))
+ goto out;
+
+ pos += written;
+
+ written_buffered = fuse_perform_write(iocb, mapping, from, pos);
+ if (written_buffered < 0) {
+ err = written_buffered;
+ goto out;
+ }
+ endbyte = pos + written_buffered - 1;
+
+ err = filemap_write_and_wait_range(file->f_mapping, pos,
+ endbyte);
+ if (err)
+ goto out;
+
+ invalidate_mapping_pages(file->f_mapping,
+ pos >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
+
+ written += written_buffered;
+ iocb->ki_pos = pos + written_buffered;
+ } else {
+ written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos);
+ if (written >= 0)
+ iocb->ki_pos += written;
+ }
+out:
+ current->backing_dev_info = NULL;
+ inode_unlock(inode);
+ if (written > 0)
+ written = generic_write_sync(iocb, written);
+
+ return written ? written : err;
+}
+
+static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
+{
+ return (unsigned long)ii->iov->iov_base + ii->iov_offset;
+}
+
+static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
+ size_t max_size)
+{
+ return min(iov_iter_single_seg_count(ii), max_size);
+}
+
+static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
+ size_t *nbytesp, int write,
+ unsigned int max_pages)
+{
+ size_t nbytes = 0; /* # bytes already packed in req */
+ ssize_t ret = 0;
+
+ /* Special case for kernel I/O: can copy directly into the buffer */
+ if (iov_iter_is_kvec(ii)) {
+ unsigned long user_addr = fuse_get_user_addr(ii);
+ size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
+
+ if (write)
+ ap->args.in_args[1].value = (void *) user_addr;
+ else
+ ap->args.out_args[0].value = (void *) user_addr;
+
+ iov_iter_advance(ii, frag_size);
+ *nbytesp = frag_size;
+ return 0;
+ }
+
+ while (nbytes < *nbytesp && ap->num_pages < max_pages) {
+ unsigned npages;
+ size_t start;
+ ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages],
+ *nbytesp - nbytes,
+ max_pages - ap->num_pages,
+ &start);
+ if (ret < 0)
+ break;
+
+ nbytes += ret;
+
+ ret += start;
+ npages = DIV_ROUND_UP(ret, PAGE_SIZE);
+
+ ap->descs[ap->num_pages].offset = start;
+ fuse_page_descs_length_init(ap->descs, ap->num_pages, npages);
+
+ ap->num_pages += npages;
+ ap->descs[ap->num_pages - 1].length -=
+ (PAGE_SIZE - ret) & (PAGE_SIZE - 1);
+ }
+
+ ap->args.user_pages = true;
+ if (write)
+ ap->args.in_pages = true;
+ else
+ ap->args.out_pages = true;
+
+ *nbytesp = nbytes;
+
+ return ret < 0 ? ret : 0;
+}
+
+ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
+ loff_t *ppos, int flags)
+{
+ int write = flags & FUSE_DIO_WRITE;
+ int cuse = flags & FUSE_DIO_CUSE;
+ struct file *file = io->iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = ff->fm->fc;
+ size_t nmax = write ? fc->max_write : fc->max_read;
+ loff_t pos = *ppos;
+ size_t count = iov_iter_count(iter);
+ pgoff_t idx_from = pos >> PAGE_SHIFT;
+ pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
+ ssize_t res = 0;
+ int err = 0;
+ struct fuse_io_args *ia;
+ unsigned int max_pages;
+
+ max_pages = iov_iter_npages(iter, fc->max_pages);
+ ia = fuse_io_alloc(io, max_pages);
+ if (!ia)
+ return -ENOMEM;
+
+ if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
+ if (!write)
+ inode_lock(inode);
+ fuse_sync_writes(inode);
+ if (!write)
+ inode_unlock(inode);
+ }
+
+ io->should_dirty = !write && user_backed_iter(iter);
+ while (count) {
+ ssize_t nres;
+ fl_owner_t owner = current->files;
+ size_t nbytes = min(count, nmax);
+
+ err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
+ max_pages);
+ if (err && !nbytes)
+ break;
+
+ if (write) {
+ if (!capable(CAP_FSETID))
+ ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;
+
+ nres = fuse_send_write(ia, pos, nbytes, owner);
+ } else {
+ nres = fuse_send_read(ia, pos, nbytes, owner);
+ }
+
+ if (!io->async || nres < 0) {
+ fuse_release_user_pages(&ia->ap, io->should_dirty);
+ fuse_io_free(ia);
+ }
+ ia = NULL;
+ if (nres < 0) {
+ iov_iter_revert(iter, nbytes);
+ err = nres;
+ break;
+ }
+ WARN_ON(nres > nbytes);
+
+ count -= nres;
+ res += nres;
+ pos += nres;
+ if (nres != nbytes) {
+ iov_iter_revert(iter, nbytes - nres);
+ break;
+ }
+ if (count) {
+ max_pages = iov_iter_npages(iter, fc->max_pages);
+ ia = fuse_io_alloc(io, max_pages);
+ if (!ia)
+ break;
+ }
+ }
+ if (ia)
+ fuse_io_free(ia);
+ if (res > 0)
+ *ppos = pos;
+
+ return res > 0 ? res : err;
+}
+EXPORT_SYMBOL_GPL(fuse_direct_io);
+
+static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
+ struct iov_iter *iter,
+ loff_t *ppos)
+{
+ ssize_t res;
+ struct inode *inode = file_inode(io->iocb->ki_filp);
+
+ res = fuse_direct_io(io, iter, ppos, 0);
+
+ fuse_invalidate_atime(inode);
+
+ return res;
+}
+
+static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
+
+static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ ssize_t res;
+
+ if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
+ res = fuse_direct_IO(iocb, to);
+ } else {
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+
+ res = __fuse_direct_read(&io, to, &iocb->ki_pos);
+ }
+
+ return res;
+}
+
+static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+ ssize_t res;
+
+ /* Don't allow parallel writes to the same file */
+ inode_lock(inode);
+ res = generic_write_checks(iocb, from);
+ if (res > 0) {
+ if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
+ res = fuse_direct_IO(iocb, from);
+ } else {
+ res = fuse_direct_io(&io, from, &iocb->ki_pos,
+ FUSE_DIO_WRITE);
+ fuse_write_update_attr(inode, iocb->ki_pos, res);
+ }
+ }
+ inode_unlock(inode);
+
+ return res;
+}
+
+static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ if (FUSE_IS_DAX(inode))
+ return fuse_dax_read_iter(iocb, to);
+
+ if (!(ff->open_flags & FOPEN_DIRECT_IO))
+ return fuse_cache_read_iter(iocb, to);
+ else
+ return fuse_direct_read_iter(iocb, to);
+}
+
+static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ if (FUSE_IS_DAX(inode))
+ return fuse_dax_write_iter(iocb, from);
+
+ if (!(ff->open_flags & FOPEN_DIRECT_IO))
+ return fuse_cache_write_iter(iocb, from);
+ else
+ return fuse_direct_write_iter(iocb, from);
+}
+
+static void fuse_writepage_free(struct fuse_writepage_args *wpa)
+{
+ struct fuse_args_pages *ap = &wpa->ia.ap;
+ int i;
+
+ if (wpa->bucket)
+ fuse_sync_bucket_dec(wpa->bucket);
+
+ for (i = 0; i < ap->num_pages; i++)
+ __free_page(ap->pages[i]);
+
+ if (wpa->ia.ff)
+ fuse_file_put(wpa->ia.ff, false, false);
+
+ kfree(ap->pages);
+ kfree(wpa);
+}
+
+static void fuse_writepage_finish(struct fuse_mount *fm,
+ struct fuse_writepage_args *wpa)
+{
+ struct fuse_args_pages *ap = &wpa->ia.ap;
+ struct inode *inode = wpa->inode;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+ int i;
+
+ for (i = 0; i < ap->num_pages; i++) {
+ dec_wb_stat(&bdi->wb, WB_WRITEBACK);
+ dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
+ wb_writeout_inc(&bdi->wb);
+ }
+ wake_up(&fi->page_waitq);
+}
+
+/* Called under fi->lock, may release and reacquire it */
+static void fuse_send_writepage(struct fuse_mount *fm,
+ struct fuse_writepage_args *wpa, loff_t size)
+__releases(fi->lock)
+__acquires(fi->lock)
+{
+ struct fuse_writepage_args *aux, *next;
+ struct fuse_inode *fi = get_fuse_inode(wpa->inode);
+ struct fuse_write_in *inarg = &wpa->ia.write.in;
+ struct fuse_args *args = &wpa->ia.ap.args;
+ __u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE;
+ int err;
+
+ fi->writectr++;
+ if (inarg->offset + data_size <= size) {
+ inarg->size = data_size;
+ } else if (inarg->offset < size) {
+ inarg->size = size - inarg->offset;
+ } else {
+ /* Got truncated off completely */
+ goto out_free;
+ }
+
+ args->in_args[1].size = inarg->size;
+ args->force = true;
+ args->nocreds = true;
+
+ err = fuse_simple_background(fm, args, GFP_ATOMIC);
+ if (err == -ENOMEM) {
+ spin_unlock(&fi->lock);
+ err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL);
+ spin_lock(&fi->lock);
+ }
+
+ /* Fails on broken connection only */
+ if (unlikely(err))
+ goto out_free;
+
+ return;
+
+ out_free:
+ fi->writectr--;
+ rb_erase(&wpa->writepages_entry, &fi->writepages);
+ fuse_writepage_finish(fm, wpa);
+ spin_unlock(&fi->lock);
+
+ /* After fuse_writepage_finish() aux request list is private */
+ for (aux = wpa->next; aux; aux = next) {
+ next = aux->next;
+ aux->next = NULL;
+ fuse_writepage_free(aux);
+ }
+
+ fuse_writepage_free(wpa);
+ spin_lock(&fi->lock);
+}
+
+/*
+ * If fi->writectr is positive (no truncate or fsync going on) send
+ * all queued writepage requests.
+ *
+ * Called with fi->lock
+ */
+void fuse_flush_writepages(struct inode *inode)
+__releases(fi->lock)
+__acquires(fi->lock)
+{
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ loff_t crop = i_size_read(inode);
+ struct fuse_writepage_args *wpa;
+
+ while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
+ wpa = list_entry(fi->queued_writes.next,
+ struct fuse_writepage_args, queue_entry);
+ list_del_init(&wpa->queue_entry);
+ fuse_send_writepage(fm, wpa, crop);
+ }
+}
+
+static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root,
+ struct fuse_writepage_args *wpa)
+{
+ pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
+ pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1;
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+
+ WARN_ON(!wpa->ia.ap.num_pages);
+ while (*p) {
+ struct fuse_writepage_args *curr;
+ pgoff_t curr_index;
+
+ parent = *p;
+ curr = rb_entry(parent, struct fuse_writepage_args,
+ writepages_entry);
+ WARN_ON(curr->inode != wpa->inode);
+ curr_index = curr->ia.write.in.offset >> PAGE_SHIFT;
+
+ if (idx_from >= curr_index + curr->ia.ap.num_pages)
+ p = &(*p)->rb_right;
+ else if (idx_to < curr_index)
+ p = &(*p)->rb_left;
+ else
+ return curr;
+ }
+
+ rb_link_node(&wpa->writepages_entry, parent, p);
+ rb_insert_color(&wpa->writepages_entry, root);
+ return NULL;
+}
+
+static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
+{
+ WARN_ON(fuse_insert_writeback(root, wpa));
+}
+
+static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
+ int error)
+{
+ struct fuse_writepage_args *wpa =
+ container_of(args, typeof(*wpa), ia.ap.args);
+ struct inode *inode = wpa->inode;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ mapping_set_error(inode->i_mapping, error);
+ /*
+ * A writeback finished and this might have updated mtime/ctime on
+ * server making local mtime/ctime stale. Hence invalidate attrs.
+ * Do this only if writeback_cache is not enabled. If writeback_cache
+ * is enabled, we trust local ctime/mtime.
+ */
+ if (!fc->writeback_cache)
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
+ spin_lock(&fi->lock);
+ rb_erase(&wpa->writepages_entry, &fi->writepages);
+ while (wpa->next) {
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_write_in *inarg = &wpa->ia.write.in;
+ struct fuse_writepage_args *next = wpa->next;
+
+ wpa->next = next->next;
+ next->next = NULL;
+ next->ia.ff = fuse_file_get(wpa->ia.ff);
+ tree_insert(&fi->writepages, next);
+
+ /*
+ * Skip fuse_flush_writepages() to make it easy to crop requests
+ * based on primary request size.
+ *
+ * 1st case (trivial): there are no concurrent activities using
+ * fuse_set/release_nowrite. Then we're on safe side because
+ * fuse_flush_writepages() would call fuse_send_writepage()
+ * anyway.
+ *
+ * 2nd case: someone called fuse_set_nowrite and it is waiting
+ * now for completion of all in-flight requests. This happens
+ * rarely and no more than once per page, so this should be
+ * okay.
+ *
+ * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
+ * of fuse_set_nowrite..fuse_release_nowrite section. The fact
+ * that fuse_set_nowrite returned implies that all in-flight
+ * requests were completed along with all of their secondary
+ * requests. Further primary requests are blocked by negative
+ * writectr. Hence there cannot be any in-flight requests and
+ * no invocations of fuse_writepage_end() while we're in
+ * fuse_set_nowrite..fuse_release_nowrite section.
+ */
+ fuse_send_writepage(fm, next, inarg->offset + inarg->size);
+ }
+ fi->writectr--;
+ fuse_writepage_finish(fm, wpa);
+ spin_unlock(&fi->lock);
+ fuse_writepage_free(wpa);
+}
+
+static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi)
+{
+ struct fuse_file *ff;
+
+ spin_lock(&fi->lock);
+ ff = list_first_entry_or_null(&fi->write_files, struct fuse_file,
+ write_entry);
+ if (ff)
+ fuse_file_get(ff);
+ spin_unlock(&fi->lock);
+
+ return ff;
+}
+
+static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi)
+{
+ struct fuse_file *ff = __fuse_write_file_get(fi);
+ WARN_ON(!ff);
+ return ff;
+}
+
+int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_file *ff;
+ int err;
+
+ /*
+ * Inode is always written before the last reference is dropped and
+ * hence this should not be reached from reclaim.
+ *
+ * Writing back the inode from reclaim can deadlock if the request
+ * processing itself needs an allocation. Allocations triggering
+ * reclaim while serving a request can't be prevented, because it can
+ * involve any number of unrelated userspace processes.
+ */
+ WARN_ON(wbc->for_reclaim);
+
+ ff = __fuse_write_file_get(fi);
+ err = fuse_flush_times(inode, ff);
+ if (ff)
+ fuse_file_put(ff, false, false);
+
+ return err;
+}
+
+static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
+{
+ struct fuse_writepage_args *wpa;
+ struct fuse_args_pages *ap;
+
+ wpa = kzalloc(sizeof(*wpa), GFP_NOFS);
+ if (wpa) {
+ ap = &wpa->ia.ap;
+ ap->num_pages = 0;
+ ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs);
+ if (!ap->pages) {
+ kfree(wpa);
+ wpa = NULL;
+ }
+ }
+ return wpa;
+
+}
+
+static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
+ struct fuse_writepage_args *wpa)
+{
+ if (!fc->sync_fs)
+ return;
+
+ rcu_read_lock();
+ /* Prevent resurrection of dead bucket in unlikely race with syncfs */
+ do {
+ wpa->bucket = rcu_dereference(fc->curr_bucket);
+ } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
+ rcu_read_unlock();
+}
+
+static int fuse_writepage_locked(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_writepage_args *wpa;
+ struct fuse_args_pages *ap;
+ struct page *tmp_page;
+ int error = -ENOMEM;
+
+ set_page_writeback(page);
+
+ wpa = fuse_writepage_args_alloc();
+ if (!wpa)
+ goto err;
+ ap = &wpa->ia.ap;
+
+ tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!tmp_page)
+ goto err_free;
+
+ error = -EIO;
+ wpa->ia.ff = fuse_write_file_get(fi);
+ if (!wpa->ia.ff)
+ goto err_nofile;
+
+ fuse_writepage_add_to_bucket(fc, wpa);
+ fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
+
+ copy_highpage(tmp_page, page);
+ wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
+ wpa->next = NULL;
+ ap->args.in_pages = true;
+ ap->num_pages = 1;
+ ap->pages[0] = tmp_page;
+ ap->descs[0].offset = 0;
+ ap->descs[0].length = PAGE_SIZE;
+ ap->args.end = fuse_writepage_end;
+ wpa->inode = inode;
+
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
+ inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
+
+ spin_lock(&fi->lock);
+ tree_insert(&fi->writepages, wpa);
+ list_add_tail(&wpa->queue_entry, &fi->queued_writes);
+ fuse_flush_writepages(inode);
+ spin_unlock(&fi->lock);
+
+ end_page_writeback(page);
+
+ return 0;
+
+err_nofile:
+ __free_page(tmp_page);
+err_free:
+ kfree(wpa);
+err:
+ mapping_set_error(page->mapping, error);
+ end_page_writeback(page);
+ return error;
+}
+
+static int fuse_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct fuse_conn *fc = get_fuse_conn(page->mapping->host);
+ int err;
+
+ if (fuse_page_is_writeback(page->mapping->host, page->index)) {
+ /*
+ * ->writepages() should be called for sync() and friends. We
+ * should only get here on direct reclaim and then we are
+ * allowed to skip a page which is already in flight
+ */
+ WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
+
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fc->num_background >= fc->congestion_threshold)
+ return AOP_WRITEPAGE_ACTIVATE;
+
+ err = fuse_writepage_locked(page);
+ unlock_page(page);
+
+ return err;
+}
+
+struct fuse_fill_wb_data {
+ struct fuse_writepage_args *wpa;
+ struct fuse_file *ff;
+ struct inode *inode;
+ struct page **orig_pages;
+ unsigned int max_pages;
+};
+
+static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
+{
+ struct fuse_args_pages *ap = &data->wpa->ia.ap;
+ struct fuse_conn *fc = get_fuse_conn(data->inode);
+ struct page **pages;
+ struct fuse_page_desc *descs;
+ unsigned int npages = min_t(unsigned int,
+ max_t(unsigned int, data->max_pages * 2,
+ FUSE_DEFAULT_MAX_PAGES_PER_REQ),
+ fc->max_pages);
+ WARN_ON(npages <= data->max_pages);
+
+ pages = fuse_pages_alloc(npages, GFP_NOFS, &descs);
+ if (!pages)
+ return false;
+
+ memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages);
+ memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages);
+ kfree(ap->pages);
+ ap->pages = pages;
+ ap->descs = descs;
+ data->max_pages = npages;
+
+ return true;
+}
+
+static void fuse_writepages_send(struct fuse_fill_wb_data *data)
+{
+ struct fuse_writepage_args *wpa = data->wpa;
+ struct inode *inode = data->inode;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ int num_pages = wpa->ia.ap.num_pages;
+ int i;
+
+ wpa->ia.ff = fuse_file_get(data->ff);
+ spin_lock(&fi->lock);
+ list_add_tail(&wpa->queue_entry, &fi->queued_writes);
+ fuse_flush_writepages(inode);
+ spin_unlock(&fi->lock);
+
+ for (i = 0; i < num_pages; i++)
+ end_page_writeback(data->orig_pages[i]);
+}
+
+/*
+ * Check under fi->lock if the page is under writeback, and insert it onto the
+ * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's
+ * one already added for a page at this offset. If there's none, then insert
+ * this new request onto the auxiliary list, otherwise reuse the existing one by
+ * swapping the new temp page with the old one.
+ */
+static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
+ struct page *page)
+{
+ struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
+ struct fuse_writepage_args *tmp;
+ struct fuse_writepage_args *old_wpa;
+ struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
+
+ WARN_ON(new_ap->num_pages != 0);
+ new_ap->num_pages = 1;
+
+ spin_lock(&fi->lock);
+ old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa);
+ if (!old_wpa) {
+ spin_unlock(&fi->lock);
+ return true;
+ }
+
+ for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
+ pgoff_t curr_index;
+
+ WARN_ON(tmp->inode != new_wpa->inode);
+ curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
+ if (curr_index == page->index) {
+ WARN_ON(tmp->ia.ap.num_pages != 1);
+ swap(tmp->ia.ap.pages[0], new_ap->pages[0]);
+ break;
+ }
+ }
+
+ if (!tmp) {
+ new_wpa->next = old_wpa->next;
+ old_wpa->next = new_wpa;
+ }
+
+ spin_unlock(&fi->lock);
+
+ if (tmp) {
+ struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode);
+
+ dec_wb_stat(&bdi->wb, WB_WRITEBACK);
+ dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP);
+ wb_writeout_inc(&bdi->wb);
+ fuse_writepage_free(new_wpa);
+ }
+
+ return false;
+}
+
+static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
+ struct fuse_args_pages *ap,
+ struct fuse_fill_wb_data *data)
+{
+ WARN_ON(!ap->num_pages);
+
+ /*
+ * Being under writeback is unlikely but possible. For example direct
+ * read to an mmaped fuse file will set the page dirty twice; once when
+ * the pages are faulted with get_user_pages(), and then after the read
+ * completed.
+ */
+ if (fuse_page_is_writeback(data->inode, page->index))
+ return true;
+
+ /* Reached max pages */
+ if (ap->num_pages == fc->max_pages)
+ return true;
+
+ /* Reached max write bytes */
+ if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write)
+ return true;
+
+ /* Discontinuity */
+ if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)
+ return true;
+
+ /* Need to grow the pages array? If so, did the expansion fail? */
+ if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data))
+ return true;
+
+ return false;
+}
+
+static int fuse_writepages_fill(struct page *page,
+ struct writeback_control *wbc, void *_data)
+{
+ struct fuse_fill_wb_data *data = _data;
+ struct fuse_writepage_args *wpa = data->wpa;
+ struct fuse_args_pages *ap = &wpa->ia.ap;
+ struct inode *inode = data->inode;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct page *tmp_page;
+ int err;
+
+ if (!data->ff) {
+ err = -EIO;
+ data->ff = fuse_write_file_get(fi);
+ if (!data->ff)
+ goto out_unlock;
+ }
+
+ if (wpa && fuse_writepage_need_send(fc, page, ap, data)) {
+ fuse_writepages_send(data);
+ data->wpa = NULL;
+ }
+
+ err = -ENOMEM;
+ tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!tmp_page)
+ goto out_unlock;
+
+ /*
+ * The page must not be redirtied until the writeout is completed
+ * (i.e. userspace has sent a reply to the write request). Otherwise
+ * there could be more than one temporary page instance for each real
+ * page.
+ *
+ * This is ensured by holding the page lock in page_mkwrite() while
+ * checking fuse_page_is_writeback(). We already hold the page lock
+ * since clear_page_dirty_for_io() and keep it held until we add the
+ * request to the fi->writepages list and increment ap->num_pages.
+ * After this fuse_page_is_writeback() will indicate that the page is
+ * under writeback, so we can release the page lock.
+ */
+ if (data->wpa == NULL) {
+ err = -ENOMEM;
+ wpa = fuse_writepage_args_alloc();
+ if (!wpa) {
+ __free_page(tmp_page);
+ goto out_unlock;
+ }
+ fuse_writepage_add_to_bucket(fc, wpa);
+
+ data->max_pages = 1;
+
+ ap = &wpa->ia.ap;
+ fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0);
+ wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
+ wpa->next = NULL;
+ ap->args.in_pages = true;
+ ap->args.end = fuse_writepage_end;
+ ap->num_pages = 0;
+ wpa->inode = inode;
+ }
+ set_page_writeback(page);
+
+ copy_highpage(tmp_page, page);
+ ap->pages[ap->num_pages] = tmp_page;
+ ap->descs[ap->num_pages].offset = 0;
+ ap->descs[ap->num_pages].length = PAGE_SIZE;
+ data->orig_pages[ap->num_pages] = page;
+
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
+ inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
+
+ err = 0;
+ if (data->wpa) {
+ /*
+ * Protected by fi->lock against concurrent access by
+ * fuse_page_is_writeback().
+ */
+ spin_lock(&fi->lock);
+ ap->num_pages++;
+ spin_unlock(&fi->lock);
+ } else if (fuse_writepage_add(wpa, page)) {
+ data->wpa = wpa;
+ } else {
+ end_page_writeback(page);
+ }
+out_unlock:
+ unlock_page(page);
+
+ return err;
+}
+
+static int fuse_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_fill_wb_data data;
+ int err;
+
+ err = -EIO;
+ if (fuse_is_bad(inode))
+ goto out;
+
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fc->num_background >= fc->congestion_threshold)
+ return 0;
+
+ data.inode = inode;
+ data.wpa = NULL;
+ data.ff = NULL;
+
+ err = -ENOMEM;
+ data.orig_pages = kcalloc(fc->max_pages,
+ sizeof(struct page *),
+ GFP_NOFS);
+ if (!data.orig_pages)
+ goto out;
+
+ err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
+ if (data.wpa) {
+ WARN_ON(!data.wpa->ia.ap.num_pages);
+ fuse_writepages_send(&data);
+ }
+ if (data.ff)
+ fuse_file_put(data.ff, false, false);
+
+ kfree(data.orig_pages);
+out:
+ return err;
+}
+
+/*
+ * It's worthy to make sure that space is reserved on disk for the write,
+ * but how to implement it without killing performance need more thinking.
+ */
+static int fuse_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata)
+{
+ pgoff_t index = pos >> PAGE_SHIFT;
+ struct fuse_conn *fc = get_fuse_conn(file_inode(file));
+ struct page *page;
+ loff_t fsize;
+ int err = -ENOMEM;
+
+ WARN_ON(!fc->writeback_cache);
+
+ page = grab_cache_page_write_begin(mapping, index);
+ if (!page)
+ goto error;
+
+ fuse_wait_on_page_writeback(mapping->host, page->index);
+
+ if (PageUptodate(page) || len == PAGE_SIZE)
+ goto success;
+ /*
+ * Check if the start this page comes after the end of file, in which
+ * case the readpage can be optimized away.
+ */
+ fsize = i_size_read(mapping->host);
+ if (fsize <= (pos & PAGE_MASK)) {
+ size_t off = pos & ~PAGE_MASK;
+ if (off)
+ zero_user_segment(page, 0, off);
+ goto success;
+ }
+ err = fuse_do_readpage(file, page);
+ if (err)
+ goto cleanup;
+success:
+ *pagep = page;
+ return 0;
+
+cleanup:
+ unlock_page(page);
+ put_page(page);
+error:
+ return err;
+}
+
+static int fuse_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = page->mapping->host;
+
+ /* Haven't copied anything? Skip zeroing, size extending, dirtying. */
+ if (!copied)
+ goto unlock;
+
+ pos += copied;
+ if (!PageUptodate(page)) {
+ /* Zero any unwritten bytes at the end of the page */
+ size_t endoff = pos & ~PAGE_MASK;
+ if (endoff)
+ zero_user_segment(page, endoff, PAGE_SIZE);
+ SetPageUptodate(page);
+ }
+
+ if (pos > inode->i_size)
+ i_size_write(inode, pos);
+
+ set_page_dirty(page);
+
+unlock:
+ unlock_page(page);
+ put_page(page);
+
+ return copied;
+}
+
+static int fuse_launder_folio(struct folio *folio)
+{
+ int err = 0;
+ if (folio_clear_dirty_for_io(folio)) {
+ struct inode *inode = folio->mapping->host;
+
+ /* Serialize with pending writeback for the same page */
+ fuse_wait_on_page_writeback(inode, folio->index);
+ err = fuse_writepage_locked(&folio->page);
+ if (!err)
+ fuse_wait_on_page_writeback(inode, folio->index);
+ }
+ return err;
+}
+
+/*
+ * Write back dirty data/metadata now (there may not be any suitable
+ * open files later for data)
+ */
+static void fuse_vma_close(struct vm_area_struct *vma)
+{
+ int err;
+
+ err = write_inode_now(vma->vm_file->f_mapping->host, 1);
+ mapping_set_error(vma->vm_file->f_mapping, err);
+}
+
+/*
+ * Wait for writeback against this page to complete before allowing it
+ * to be marked dirty again, and hence written back again, possibly
+ * before the previous writepage completed.
+ *
+ * Block here, instead of in ->writepage(), so that the userspace fs
+ * can only block processes actually operating on the filesystem.
+ *
+ * Otherwise unprivileged userspace fs would be able to block
+ * unrelated:
+ *
+ * - page migration
+ * - sync(2)
+ * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
+ */
+static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+
+ file_update_time(vmf->vma->vm_file);
+ lock_page(page);
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ return VM_FAULT_NOPAGE;
+ }
+
+ fuse_wait_on_page_writeback(inode, page->index);
+ return VM_FAULT_LOCKED;
+}
+
+static const struct vm_operations_struct fuse_file_vm_ops = {
+ .close = fuse_vma_close,
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = fuse_page_mkwrite,
+};
+
+static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct fuse_file *ff = file->private_data;
+
+ /* DAX mmap is superior to direct_io mmap */
+ if (FUSE_IS_DAX(file_inode(file)))
+ return fuse_dax_mmap(file, vma);
+
+ if (ff->open_flags & FOPEN_DIRECT_IO) {
+ /* Can't provide the coherency needed for MAP_SHARED */
+ if (vma->vm_flags & VM_MAYSHARE)
+ return -ENODEV;
+
+ invalidate_inode_pages2(file->f_mapping);
+
+ return generic_file_mmap(file, vma);
+ }
+
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ fuse_link_write_file(file);
+
+ file_accessed(file);
+ vma->vm_ops = &fuse_file_vm_ops;
+ return 0;
+}
+
+static int convert_fuse_file_lock(struct fuse_conn *fc,
+ const struct fuse_file_lock *ffl,
+ struct file_lock *fl)
+{
+ switch (ffl->type) {
+ case F_UNLCK:
+ break;
+
+ case F_RDLCK:
+ case F_WRLCK:
+ if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
+ ffl->end < ffl->start)
+ return -EIO;
+
+ fl->fl_start = ffl->start;
+ fl->fl_end = ffl->end;
+
+ /*
+ * Convert pid into init's pid namespace. The locks API will
+ * translate it into the caller's pid namespace.
+ */
+ rcu_read_lock();
+ fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
+ rcu_read_unlock();
+ break;
+
+ default:
+ return -EIO;
+ }
+ fl->fl_type = ffl->type;
+ return 0;
+}
+
+static void fuse_lk_fill(struct fuse_args *args, struct file *file,
+ const struct file_lock *fl, int opcode, pid_t pid,
+ int flock, struct fuse_lk_in *inarg)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_file *ff = file->private_data;
+
+ memset(inarg, 0, sizeof(*inarg));
+ inarg->fh = ff->fh;
+ inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
+ inarg->lk.start = fl->fl_start;
+ inarg->lk.end = fl->fl_end;
+ inarg->lk.type = fl->fl_type;
+ inarg->lk.pid = pid;
+ if (flock)
+ inarg->lk_flags |= FUSE_LK_FLOCK;
+ args->opcode = opcode;
+ args->nodeid = get_node_id(inode);
+ args->in_numargs = 1;
+ args->in_args[0].size = sizeof(*inarg);
+ args->in_args[0].value = inarg;
+}
+
+static int fuse_getlk(struct file *file, struct file_lock *fl)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ FUSE_ARGS(args);
+ struct fuse_lk_in inarg;
+ struct fuse_lk_out outarg;
+ int err;
+
+ fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
+ args.out_numargs = 1;
+ args.out_args[0].size = sizeof(outarg);
+ args.out_args[0].value = &outarg;
+ err = fuse_simple_request(fm, &args);
+ if (!err)
+ err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl);
+
+ return err;
+}
+
+static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ FUSE_ARGS(args);
+ struct fuse_lk_in inarg;
+ int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
+ struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
+ pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns);
+ int err;
+
+ if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
+ /* NLM needs asynchronous locks, which we don't support yet */
+ return -ENOLCK;
+ }
+
+ /* Unlock on close is handled by the flush method */
+ if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
+ return 0;
+
+ fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
+ err = fuse_simple_request(fm, &args);
+
+ /* locking is restartable */
+ if (err == -EINTR)
+ err = -ERESTARTSYS;
+
+ return err;
+}
+
+static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ int err;
+
+ if (cmd == F_CANCELLK) {
+ err = 0;
+ } else if (cmd == F_GETLK) {
+ if (fc->no_lock) {
+ posix_test_lock(file, fl);
+ err = 0;
+ } else
+ err = fuse_getlk(file, fl);
+ } else {
+ if (fc->no_lock)
+ err = posix_lock_file(file, fl, NULL);
+ else
+ err = fuse_setlk(file, fl, 0);
+ }
+ return err;
+}
+
+static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ int err;
+
+ if (fc->no_flock) {
+ err = locks_lock_file_wait(file, fl);
+ } else {
+ struct fuse_file *ff = file->private_data;
+
+ /* emulate flock with POSIX locks */
+ ff->flock = true;
+ err = fuse_setlk(file, fl, 1);
+ }
+
+ return err;
+}
+
+static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
+{
+ struct inode *inode = mapping->host;
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ FUSE_ARGS(args);
+ struct fuse_bmap_in inarg;
+ struct fuse_bmap_out outarg;
+ int err;
+
+ if (!inode->i_sb->s_bdev || fm->fc->no_bmap)
+ return 0;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.block = block;
+ inarg.blocksize = inode->i_sb->s_blocksize;
+ args.opcode = FUSE_BMAP;
+ args.nodeid = get_node_id(inode);
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.out_numargs = 1;
+ args.out_args[0].size = sizeof(outarg);
+ args.out_args[0].value = &outarg;
+ err = fuse_simple_request(fm, &args);
+ if (err == -ENOSYS)
+ fm->fc->no_bmap = 1;
+
+ return err ? 0 : outarg.block;
+}
+
+static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_file *ff = file->private_data;
+ FUSE_ARGS(args);
+ struct fuse_lseek_in inarg = {
+ .fh = ff->fh,
+ .offset = offset,
+ .whence = whence
+ };
+ struct fuse_lseek_out outarg;
+ int err;
+
+ if (fm->fc->no_lseek)
+ goto fallback;
+
+ args.opcode = FUSE_LSEEK;
+ args.nodeid = ff->nodeid;
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.out_numargs = 1;
+ args.out_args[0].size = sizeof(outarg);
+ args.out_args[0].value = &outarg;
+ err = fuse_simple_request(fm, &args);
+ if (err) {
+ if (err == -ENOSYS) {
+ fm->fc->no_lseek = 1;
+ goto fallback;
+ }
+ return err;
+ }
+
+ return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);
+
+fallback:
+ err = fuse_update_attributes(inode, file, STATX_SIZE);
+ if (!err)
+ return generic_file_llseek(file, offset, whence);
+ else
+ return err;
+}
+
+static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ loff_t retval;
+ struct inode *inode = file_inode(file);
+
+ switch (whence) {
+ case SEEK_SET:
+ case SEEK_CUR:
+ /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
+ retval = generic_file_llseek(file, offset, whence);
+ break;
+ case SEEK_END:
+ inode_lock(inode);
+ retval = fuse_update_attributes(inode, file, STATX_SIZE);
+ if (!retval)
+ retval = generic_file_llseek(file, offset, whence);
+ inode_unlock(inode);
+ break;
+ case SEEK_HOLE:
+ case SEEK_DATA:
+ inode_lock(inode);
+ retval = fuse_lseek(file, offset, whence);
+ inode_unlock(inode);
+ break;
+ default:
+ retval = -EINVAL;
+ }
+
+ return retval;
+}
+
+/*
+ * All files which have been polled are linked to RB tree
+ * fuse_conn->polled_files which is indexed by kh. Walk the tree and
+ * find the matching one.
+ */
+static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
+ struct rb_node **parent_out)
+{
+ struct rb_node **link = &fc->polled_files.rb_node;
+ struct rb_node *last = NULL;
+
+ while (*link) {
+ struct fuse_file *ff;
+
+ last = *link;
+ ff = rb_entry(last, struct fuse_file, polled_node);
+
+ if (kh < ff->kh)
+ link = &last->rb_left;
+ else if (kh > ff->kh)
+ link = &last->rb_right;
+ else
+ return link;
+ }
+
+ if (parent_out)
+ *parent_out = last;
+ return link;
+}
+
+/*
+ * The file is about to be polled. Make sure it's on the polled_files
+ * RB tree. Note that files once added to the polled_files tree are
+ * not removed before the file is released. This is because a file
+ * polled once is likely to be polled again.
+ */
+static void fuse_register_polled_file(struct fuse_conn *fc,
+ struct fuse_file *ff)
+{
+ spin_lock(&fc->lock);
+ if (RB_EMPTY_NODE(&ff->polled_node)) {
+ struct rb_node **link, *parent;
+
+ link = fuse_find_polled_node(fc, ff->kh, &parent);
+ BUG_ON(*link);
+ rb_link_node(&ff->polled_node, parent, link);
+ rb_insert_color(&ff->polled_node, &fc->polled_files);
+ }
+ spin_unlock(&fc->lock);
+}
+
+__poll_t fuse_file_poll(struct file *file, poll_table *wait)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_mount *fm = ff->fm;
+ struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
+ struct fuse_poll_out outarg;
+ FUSE_ARGS(args);
+ int err;
+
+ if (fm->fc->no_poll)
+ return DEFAULT_POLLMASK;
+
+ poll_wait(file, &ff->poll_wait, wait);
+ inarg.events = mangle_poll(poll_requested_events(wait));
+
+ /*
+ * Ask for notification iff there's someone waiting for it.
+ * The client may ignore the flag and always notify.
+ */
+ if (waitqueue_active(&ff->poll_wait)) {
+ inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
+ fuse_register_polled_file(fm->fc, ff);
+ }
+
+ args.opcode = FUSE_POLL;
+ args.nodeid = ff->nodeid;
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.out_numargs = 1;
+ args.out_args[0].size = sizeof(outarg);
+ args.out_args[0].value = &outarg;
+ err = fuse_simple_request(fm, &args);
+
+ if (!err)
+ return demangle_poll(outarg.revents);
+ if (err == -ENOSYS) {
+ fm->fc->no_poll = 1;
+ return DEFAULT_POLLMASK;
+ }
+ return EPOLLERR;
+}
+EXPORT_SYMBOL_GPL(fuse_file_poll);
+
+/*
+ * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
+ * wakes up the poll waiters.
+ */
+int fuse_notify_poll_wakeup(struct fuse_conn *fc,
+ struct fuse_notify_poll_wakeup_out *outarg)
+{
+ u64 kh = outarg->kh;
+ struct rb_node **link;
+
+ spin_lock(&fc->lock);
+
+ link = fuse_find_polled_node(fc, kh, NULL);
+ if (*link) {
+ struct fuse_file *ff;
+
+ ff = rb_entry(*link, struct fuse_file, polled_node);
+ wake_up_interruptible_sync(&ff->poll_wait);
+ }
+
+ spin_unlock(&fc->lock);
+ return 0;
+}
+
+static void fuse_do_truncate(struct file *file)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct iattr attr;
+
+ attr.ia_valid = ATTR_SIZE;
+ attr.ia_size = i_size_read(inode);
+
+ attr.ia_file = file;
+ attr.ia_valid |= ATTR_FILE;
+
+ fuse_do_setattr(file_dentry(file), &attr, file);
+}
+
+static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
+{
+ return round_up(off, fc->max_pages << PAGE_SHIFT);
+}
+
+static ssize_t
+fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+ ssize_t ret = 0;
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+ loff_t pos = 0;
+ struct inode *inode;
+ loff_t i_size;
+ size_t count = iov_iter_count(iter), shortened = 0;
+ loff_t offset = iocb->ki_pos;
+ struct fuse_io_priv *io;
+
+ pos = offset;
+ inode = file->f_mapping->host;
+ i_size = i_size_read(inode);
+
+ if ((iov_iter_rw(iter) == READ) && (offset >= i_size))
+ return 0;
+
+ io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
+ if (!io)
+ return -ENOMEM;
+ spin_lock_init(&io->lock);
+ kref_init(&io->refcnt);
+ io->reqs = 1;
+ io->bytes = -1;
+ io->size = 0;
+ io->offset = offset;
+ io->write = (iov_iter_rw(iter) == WRITE);
+ io->err = 0;
+ /*
+ * By default, we want to optimize all I/Os with async request
+ * submission to the client filesystem if supported.
+ */
+ io->async = ff->fm->fc->async_dio;
+ io->iocb = iocb;
+ io->blocking = is_sync_kiocb(iocb);
+
+ /* optimization for short read */
+ if (io->async && !io->write && offset + count > i_size) {
+ iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset));
+ shortened = count - iov_iter_count(iter);
+ count -= shortened;
+ }
+
+ /*
+ * We cannot asynchronously extend the size of a file.
+ * In such case the aio will behave exactly like sync io.
+ */
+ if ((offset + count > i_size) && io->write)
+ io->blocking = true;
+
+ if (io->async && io->blocking) {
+ /*
+ * Additional reference to keep io around after
+ * calling fuse_aio_complete()
+ */
+ kref_get(&io->refcnt);
+ io->done = &wait;
+ }
+
+ if (iov_iter_rw(iter) == WRITE) {
+ ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
+ } else {
+ ret = __fuse_direct_read(io, iter, &pos);
+ }
+ iov_iter_reexpand(iter, iov_iter_count(iter) + shortened);
+
+ if (io->async) {
+ bool blocking = io->blocking;
+
+ fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
+
+ /* we have a non-extending, async request, so return */
+ if (!blocking)
+ return -EIOCBQUEUED;
+
+ wait_for_completion(&wait);
+ ret = fuse_get_res_by_io(io);
+ }
+
+ kref_put(&io->refcnt, fuse_io_release);
+
+ if (iov_iter_rw(iter) == WRITE) {
+ fuse_write_update_attr(inode, pos, ret);
+ if (ret < 0 && offset + count > i_size)
+ fuse_do_truncate(file);
+ }
+
+ return ret;
+}
+
+static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
+{
+ int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX);
+
+ if (!err)
+ fuse_sync_writes(inode);
+
+ return err;
+}
+
+static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t length)
+{
+ struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_mount *fm = ff->fm;
+ FUSE_ARGS(args);
+ struct fuse_fallocate_in inarg = {
+ .fh = ff->fh,
+ .offset = offset,
+ .length = length,
+ .mode = mode
+ };
+ int err;
+ bool block_faults = FUSE_IS_DAX(inode) &&
+ (!(mode & FALLOC_FL_KEEP_SIZE) ||
+ (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)));
+
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_ZERO_RANGE))
+ return -EOPNOTSUPP;
+
+ if (fm->fc->no_fallocate)
+ return -EOPNOTSUPP;
+
+ inode_lock(inode);
+ if (block_faults) {
+ filemap_invalidate_lock(inode->i_mapping);
+ err = fuse_dax_break_layouts(inode, 0, 0);
+ if (err)
+ goto out;
+ }
+
+ if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) {
+ loff_t endbyte = offset + length - 1;
+
+ err = fuse_writeback_range(inode, offset, endbyte);
+ if (err)
+ goto out;
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + length > i_size_read(inode)) {
+ err = inode_newsize_ok(inode, offset + length);
+ if (err)
+ goto out;
+ }
+
+ err = file_modified(file);
+ if (err)
+ goto out;
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE))
+ set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+ args.opcode = FUSE_FALLOCATE;
+ args.nodeid = ff->nodeid;
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ err = fuse_simple_request(fm, &args);
+ if (err == -ENOSYS) {
+ fm->fc->no_fallocate = 1;
+ err = -EOPNOTSUPP;
+ }
+ if (err)
+ goto out;
+
+ /* we could have extended the file */
+ if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+ if (fuse_write_update_attr(inode, offset + length, length))
+ file_update_time(file);
+ }
+
+ if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
+ truncate_pagecache_range(inode, offset, offset + length - 1);
+
+ fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
+
+out:
+ if (!(mode & FALLOC_FL_KEEP_SIZE))
+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+ if (block_faults)
+ filemap_invalidate_unlock(inode->i_mapping);
+
+ inode_unlock(inode);
+
+ fuse_flush_time_update(inode);
+
+ return err;
+}
+
+static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags)
+{
+ struct fuse_file *ff_in = file_in->private_data;
+ struct fuse_file *ff_out = file_out->private_data;
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct fuse_inode *fi_out = get_fuse_inode(inode_out);
+ struct fuse_mount *fm = ff_in->fm;
+ struct fuse_conn *fc = fm->fc;
+ FUSE_ARGS(args);
+ struct fuse_copy_file_range_in inarg = {
+ .fh_in = ff_in->fh,
+ .off_in = pos_in,
+ .nodeid_out = ff_out->nodeid,
+ .fh_out = ff_out->fh,
+ .off_out = pos_out,
+ .len = len,
+ .flags = flags
+ };
+ struct fuse_write_out outarg;
+ ssize_t err;
+ /* mark unstable when write-back is not used, and file_out gets
+ * extended */
+ bool is_unstable = (!fc->writeback_cache) &&
+ ((pos_out + len) > inode_out->i_size);
+
+ if (fc->no_copy_file_range)
+ return -EOPNOTSUPP;
+
+ if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
+ return -EXDEV;
+
+ inode_lock(inode_in);
+ err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
+ inode_unlock(inode_in);
+ if (err)
+ return err;
+
+ inode_lock(inode_out);
+
+ err = file_modified(file_out);
+ if (err)
+ goto out;
+
+ /*
+ * Write out dirty pages in the destination file before sending the COPY
+ * request to userspace. After the request is completed, truncate off
+ * pages (including partial ones) from the cache that have been copied,
+ * since these contain stale data at that point.
+ *
+ * This should be mostly correct, but if the COPY writes to partial
+ * pages (at the start or end) and the parts not covered by the COPY are
+ * written through a memory map after calling fuse_writeback_range(),
+ * then these partial page modifications will be lost on truncation.
+ *
+ * It is unlikely that someone would rely on such mixed style
+ * modifications. Yet this does give less guarantees than if the
+ * copying was performed with write(2).
+ *
+ * To fix this a mapping->invalidate_lock could be used to prevent new
+ * faults while the copy is ongoing.
+ */
+ err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
+ if (err)
+ goto out;
+
+ if (is_unstable)
+ set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
+
+ args.opcode = FUSE_COPY_FILE_RANGE;
+ args.nodeid = ff_in->nodeid;
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.out_numargs = 1;
+ args.out_args[0].size = sizeof(outarg);
+ args.out_args[0].value = &outarg;
+ err = fuse_simple_request(fm, &args);
+ if (err == -ENOSYS) {
+ fc->no_copy_file_range = 1;
+ err = -EOPNOTSUPP;
+ }
+ if (err)
+ goto out;
+
+ truncate_inode_pages_range(inode_out->i_mapping,
+ ALIGN_DOWN(pos_out, PAGE_SIZE),
+ ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
+
+ file_update_time(file_out);
+ fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size);
+
+ err = outarg.size;
+out:
+ if (is_unstable)
+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
+
+ inode_unlock(inode_out);
+ file_accessed(file_in);
+
+ fuse_flush_time_update(inode_out);
+
+ return err;
+}
+
+static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off,
+ size_t len, unsigned int flags)
+{
+ ssize_t ret;
+
+ ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
+ len, flags);
+
+ if (ret == -EOPNOTSUPP || ret == -EXDEV)
+ ret = generic_copy_file_range(src_file, src_off, dst_file,
+ dst_off, len, flags);
+ return ret;
+}
+
+static const struct file_operations fuse_file_operations = {
+ .llseek = fuse_file_llseek,
+ .read_iter = fuse_file_read_iter,
+ .write_iter = fuse_file_write_iter,
+ .mmap = fuse_file_mmap,
+ .open = fuse_open,
+ .flush = fuse_flush,
+ .release = fuse_release,
+ .fsync = fuse_fsync,
+ .lock = fuse_file_lock,
+ .get_unmapped_area = thp_get_unmapped_area,
+ .flock = fuse_file_flock,
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
+ .unlocked_ioctl = fuse_file_ioctl,
+ .compat_ioctl = fuse_file_compat_ioctl,
+ .poll = fuse_file_poll,
+ .fallocate = fuse_file_fallocate,
+ .copy_file_range = fuse_copy_file_range,
+};
+
+static const struct address_space_operations fuse_file_aops = {
+ .read_folio = fuse_read_folio,
+ .readahead = fuse_readahead,
+ .writepage = fuse_writepage,
+ .writepages = fuse_writepages,
+ .launder_folio = fuse_launder_folio,
+ .dirty_folio = filemap_dirty_folio,
+ .bmap = fuse_bmap,
+ .direct_IO = fuse_direct_IO,
+ .write_begin = fuse_write_begin,
+ .write_end = fuse_write_end,
+};
+
+void fuse_init_file_inode(struct inode *inode, unsigned int flags)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ inode->i_fop = &fuse_file_operations;
+ inode->i_data.a_ops = &fuse_file_aops;
+
+ INIT_LIST_HEAD(&fi->write_files);
+ INIT_LIST_HEAD(&fi->queued_writes);
+ fi->writectr = 0;
+ init_waitqueue_head(&fi->page_waitq);
+ fi->writepages = RB_ROOT;
+
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_inode_init(inode, flags);
+}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
new file mode 100644
index 000000000..a9681fecb
--- /dev/null
+++ b/fs/fuse/fuse_i.h
@@ -0,0 +1,1337 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+*/
+
+#ifndef _FS_FUSE_I_H
+#define _FS_FUSE_I_H
+
+#ifndef pr_fmt
+# define pr_fmt(fmt) "fuse: " fmt
+#endif
+
+#include <linux/fuse.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/backing-dev.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/poll.h>
+#include <linux/workqueue.h>
+#include <linux/kref.h>
+#include <linux/xattr.h>
+#include <linux/pid_namespace.h>
+#include <linux/refcount.h>
+#include <linux/user_namespace.h>
+
+/** Default max number of pages that can be used in a single read request */
+#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
+
+/** Maximum of max_pages received in init_out */
+#define FUSE_MAX_MAX_PAGES 256
+
+/** Bias for fi->writectr, meaning new writepages must not be sent */
+#define FUSE_NOWRITE INT_MIN
+
+/** It could be as large as PATH_MAX, but would that have any uses? */
+#define FUSE_NAME_MAX 1024
+
+/** Number of dentries for each connection in the control filesystem */
+#define FUSE_CTL_NUM_DENTRIES 5
+
+/** List of active connections */
+extern struct list_head fuse_conn_list;
+
+/** Global mutex protecting fuse_conn_list and the control filesystem */
+extern struct mutex fuse_mutex;
+
+/** Module parameters */
+extern unsigned max_user_bgreq;
+extern unsigned max_user_congthresh;
+
+/* One forget request */
+struct fuse_forget_link {
+ struct fuse_forget_one forget_one;
+ struct fuse_forget_link *next;
+};
+
+/* Submount lookup tracking */
+struct fuse_submount_lookup {
+ /** Refcount */
+ refcount_t count;
+
+ /** Unique ID, which identifies the inode between userspace
+ * and kernel */
+ u64 nodeid;
+
+ /** The request used for sending the FORGET message */
+ struct fuse_forget_link *forget;
+};
+
+/** FUSE inode */
+struct fuse_inode {
+ /** Inode data */
+ struct inode inode;
+
+ /** Unique ID, which identifies the inode between userspace
+ * and kernel */
+ u64 nodeid;
+
+ /** Number of lookups on this inode */
+ u64 nlookup;
+
+ /** The request used for sending the FORGET message */
+ struct fuse_forget_link *forget;
+
+ /** Time in jiffies until the file attributes are valid */
+ u64 i_time;
+
+ /* Which attributes are invalid */
+ u32 inval_mask;
+
+ /** The sticky bit in inode->i_mode may have been removed, so
+ preserve the original mode */
+ umode_t orig_i_mode;
+
+ /** 64 bit inode number */
+ u64 orig_ino;
+
+ /** Version of last attribute change */
+ u64 attr_version;
+
+ union {
+ /* Write related fields (regular file only) */
+ struct {
+ /* Files usable in writepage. Protected by fi->lock */
+ struct list_head write_files;
+
+ /* Writepages pending on truncate or fsync */
+ struct list_head queued_writes;
+
+ /* Number of sent writes, a negative bias
+ * (FUSE_NOWRITE) means more writes are blocked */
+ int writectr;
+
+ /* Waitq for writepage completion */
+ wait_queue_head_t page_waitq;
+
+ /* List of writepage requestst (pending or sent) */
+ struct rb_root writepages;
+ };
+
+ /* readdir cache (directory only) */
+ struct {
+ /* true if fully cached */
+ bool cached;
+
+ /* size of cache */
+ loff_t size;
+
+ /* position at end of cache (position of next entry) */
+ loff_t pos;
+
+ /* version of the cache */
+ u64 version;
+
+ /* modification time of directory when cache was
+ * started */
+ struct timespec64 mtime;
+
+ /* iversion of directory when cache was started */
+ u64 iversion;
+
+ /* protects above fields */
+ spinlock_t lock;
+ } rdc;
+ };
+
+ /** Miscellaneous bits describing inode state */
+ unsigned long state;
+
+ /** Lock for serializing lookup and readdir for back compatibility*/
+ struct mutex mutex;
+
+ /** Lock to protect write related fields */
+ spinlock_t lock;
+
+#ifdef CONFIG_FUSE_DAX
+ /*
+ * Dax specific inode data
+ */
+ struct fuse_inode_dax *dax;
+#endif
+ /** Submount specific lookup tracking */
+ struct fuse_submount_lookup *submount_lookup;
+};
+
+/** FUSE inode state bits */
+enum {
+ /** Advise readdirplus */
+ FUSE_I_ADVISE_RDPLUS,
+ /** Initialized with readdirplus */
+ FUSE_I_INIT_RDPLUS,
+ /** An operation changing file size is in progress */
+ FUSE_I_SIZE_UNSTABLE,
+ /* Bad inode */
+ FUSE_I_BAD,
+};
+
+struct fuse_conn;
+struct fuse_mount;
+struct fuse_release_args;
+
+/** FUSE specific file data */
+struct fuse_file {
+ /** Fuse connection for this file */
+ struct fuse_mount *fm;
+
+ /* Argument space reserved for release */
+ struct fuse_release_args *release_args;
+
+ /** Kernel file handle guaranteed to be unique */
+ u64 kh;
+
+ /** File handle used by userspace */
+ u64 fh;
+
+ /** Node id of this file */
+ u64 nodeid;
+
+ /** Refcount */
+ refcount_t count;
+
+ /** FOPEN_* flags returned by open */
+ u32 open_flags;
+
+ /** Entry on inode's write_files list */
+ struct list_head write_entry;
+
+ /* Readdir related */
+ struct {
+ /*
+ * Protects below fields against (crazy) parallel readdir on
+ * same open file. Uncontended in the normal case.
+ */
+ struct mutex lock;
+
+ /* Dir stream position */
+ loff_t pos;
+
+ /* Offset in cache */
+ loff_t cache_off;
+
+ /* Version of cache we are reading */
+ u64 version;
+
+ } readdir;
+
+ /** RB node to be linked on fuse_conn->polled_files */
+ struct rb_node polled_node;
+
+ /** Wait queue head for poll */
+ wait_queue_head_t poll_wait;
+
+ /** Has flock been performed on this file? */
+ bool flock:1;
+};
+
+/** One input argument of a request */
+struct fuse_in_arg {
+ unsigned size;
+ const void *value;
+};
+
+/** One output argument of a request */
+struct fuse_arg {
+ unsigned size;
+ void *value;
+};
+
+/** FUSE page descriptor */
+struct fuse_page_desc {
+ unsigned int length;
+ unsigned int offset;
+};
+
+struct fuse_args {
+ uint64_t nodeid;
+ uint32_t opcode;
+ unsigned short in_numargs;
+ unsigned short out_numargs;
+ bool force:1;
+ bool noreply:1;
+ bool nocreds:1;
+ bool in_pages:1;
+ bool out_pages:1;
+ bool user_pages:1;
+ bool out_argvar:1;
+ bool page_zeroing:1;
+ bool page_replace:1;
+ bool may_block:1;
+ struct fuse_in_arg in_args[3];
+ struct fuse_arg out_args[2];
+ void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error);
+};
+
+struct fuse_args_pages {
+ struct fuse_args args;
+ struct page **pages;
+ struct fuse_page_desc *descs;
+ unsigned int num_pages;
+};
+
+#define FUSE_ARGS(args) struct fuse_args args = {}
+
+/** The request IO state (for asynchronous processing) */
+struct fuse_io_priv {
+ struct kref refcnt;
+ int async;
+ spinlock_t lock;
+ unsigned reqs;
+ ssize_t bytes;
+ size_t size;
+ __u64 offset;
+ bool write;
+ bool should_dirty;
+ int err;
+ struct kiocb *iocb;
+ struct completion *done;
+ bool blocking;
+};
+
+#define FUSE_IO_PRIV_SYNC(i) \
+{ \
+ .refcnt = KREF_INIT(1), \
+ .async = 0, \
+ .iocb = i, \
+}
+
+/**
+ * Request flags
+ *
+ * FR_ISREPLY: set if the request has reply
+ * FR_FORCE: force sending of the request even if interrupted
+ * FR_BACKGROUND: request is sent in the background
+ * FR_WAITING: request is counted as "waiting"
+ * FR_ABORTED: the request was aborted
+ * FR_INTERRUPTED: the request has been interrupted
+ * FR_LOCKED: data is being copied to/from the request
+ * FR_PENDING: request is not yet in userspace
+ * FR_SENT: request is in userspace, waiting for an answer
+ * FR_FINISHED: request is finished
+ * FR_PRIVATE: request is on private list
+ * FR_ASYNC: request is asynchronous
+ */
+enum fuse_req_flag {
+ FR_ISREPLY,
+ FR_FORCE,
+ FR_BACKGROUND,
+ FR_WAITING,
+ FR_ABORTED,
+ FR_INTERRUPTED,
+ FR_LOCKED,
+ FR_PENDING,
+ FR_SENT,
+ FR_FINISHED,
+ FR_PRIVATE,
+ FR_ASYNC,
+};
+
+/**
+ * A request to the client
+ *
+ * .waitq.lock protects the following fields:
+ * - FR_ABORTED
+ * - FR_LOCKED (may also be modified under fc->lock, tested under both)
+ */
+struct fuse_req {
+ /** This can be on either pending processing or io lists in
+ fuse_conn */
+ struct list_head list;
+
+ /** Entry on the interrupts list */
+ struct list_head intr_entry;
+
+ /* Input/output arguments */
+ struct fuse_args *args;
+
+ /** refcount */
+ refcount_t count;
+
+ /* Request flags, updated with test/set/clear_bit() */
+ unsigned long flags;
+
+ /* The request input header */
+ struct {
+ struct fuse_in_header h;
+ } in;
+
+ /* The request output header */
+ struct {
+ struct fuse_out_header h;
+ } out;
+
+ /** Used to wake up the task waiting for completion of request*/
+ wait_queue_head_t waitq;
+
+#if IS_ENABLED(CONFIG_VIRTIO_FS)
+ /** virtio-fs's physically contiguous buffer for in and out args */
+ void *argbuf;
+#endif
+
+ /** fuse_mount this request belongs to */
+ struct fuse_mount *fm;
+};
+
+struct fuse_iqueue;
+
+/**
+ * Input queue callbacks
+ *
+ * Input queue signalling is device-specific. For example, the /dev/fuse file
+ * uses fiq->waitq and fasync to wake processes that are waiting on queue
+ * readiness. These callbacks allow other device types to respond to input
+ * queue activity.
+ */
+struct fuse_iqueue_ops {
+ /**
+ * Signal that a forget has been queued
+ */
+ void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq)
+ __releases(fiq->lock);
+
+ /**
+ * Signal that an INTERRUPT request has been queued
+ */
+ void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq)
+ __releases(fiq->lock);
+
+ /**
+ * Signal that a request has been queued
+ */
+ void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq)
+ __releases(fiq->lock);
+
+ /**
+ * Clean up when fuse_iqueue is destroyed
+ */
+ void (*release)(struct fuse_iqueue *fiq);
+};
+
+/** /dev/fuse input queue operations */
+extern const struct fuse_iqueue_ops fuse_dev_fiq_ops;
+
+struct fuse_iqueue {
+ /** Connection established */
+ unsigned connected;
+
+ /** Lock protecting accesses to members of this structure */
+ spinlock_t lock;
+
+ /** Readers of the connection are waiting on this */
+ wait_queue_head_t waitq;
+
+ /** The next unique request id */
+ u64 reqctr;
+
+ /** The list of pending requests */
+ struct list_head pending;
+
+ /** Pending interrupts */
+ struct list_head interrupts;
+
+ /** Queue of pending forgets */
+ struct fuse_forget_link forget_list_head;
+ struct fuse_forget_link *forget_list_tail;
+
+ /** Batching of FORGET requests (positive indicates FORGET batch) */
+ int forget_batch;
+
+ /** O_ASYNC requests */
+ struct fasync_struct *fasync;
+
+ /** Device-specific callbacks */
+ const struct fuse_iqueue_ops *ops;
+
+ /** Device-specific state */
+ void *priv;
+};
+
+#define FUSE_PQ_HASH_BITS 8
+#define FUSE_PQ_HASH_SIZE (1 << FUSE_PQ_HASH_BITS)
+
+struct fuse_pqueue {
+ /** Connection established */
+ unsigned connected;
+
+ /** Lock protecting accessess to members of this structure */
+ spinlock_t lock;
+
+ /** Hash table of requests being processed */
+ struct list_head *processing;
+
+ /** The list of requests under I/O */
+ struct list_head io;
+};
+
+/**
+ * Fuse device instance
+ */
+struct fuse_dev {
+ /** Fuse connection for this device */
+ struct fuse_conn *fc;
+
+ /** Processing queue */
+ struct fuse_pqueue pq;
+
+ /** list entry on fc->devices */
+ struct list_head entry;
+};
+
+enum fuse_dax_mode {
+ FUSE_DAX_INODE_DEFAULT, /* default */
+ FUSE_DAX_ALWAYS, /* "-o dax=always" */
+ FUSE_DAX_NEVER, /* "-o dax=never" */
+ FUSE_DAX_INODE_USER, /* "-o dax=inode" */
+};
+
+static inline bool fuse_is_inode_dax_mode(enum fuse_dax_mode mode)
+{
+ return mode == FUSE_DAX_INODE_DEFAULT || mode == FUSE_DAX_INODE_USER;
+}
+
+struct fuse_fs_context {
+ int fd;
+ struct file *file;
+ unsigned int rootmode;
+ kuid_t user_id;
+ kgid_t group_id;
+ bool is_bdev:1;
+ bool fd_present:1;
+ bool rootmode_present:1;
+ bool user_id_present:1;
+ bool group_id_present:1;
+ bool default_permissions:1;
+ bool allow_other:1;
+ bool destroy:1;
+ bool no_control:1;
+ bool no_force_umount:1;
+ bool legacy_opts_show:1;
+ enum fuse_dax_mode dax_mode;
+ unsigned int max_read;
+ unsigned int blksize;
+ const char *subtype;
+
+ /* DAX device, may be NULL */
+ struct dax_device *dax_dev;
+
+ /* fuse_dev pointer to fill in, should contain NULL on entry */
+ void **fudptr;
+};
+
+struct fuse_sync_bucket {
+ /* count is a possible scalability bottleneck */
+ atomic_t count;
+ wait_queue_head_t waitq;
+ struct rcu_head rcu;
+};
+
+/**
+ * A Fuse connection.
+ *
+ * This structure is created, when the root filesystem is mounted, and
+ * is destroyed, when the client device is closed and the last
+ * fuse_mount is destroyed.
+ */
+struct fuse_conn {
+ /** Lock protecting accessess to members of this structure */
+ spinlock_t lock;
+
+ /** Refcount */
+ refcount_t count;
+
+ /** Number of fuse_dev's */
+ atomic_t dev_count;
+
+ struct rcu_head rcu;
+
+ /** The user id for this mount */
+ kuid_t user_id;
+
+ /** The group id for this mount */
+ kgid_t group_id;
+
+ /** The pid namespace for this mount */
+ struct pid_namespace *pid_ns;
+
+ /** The user namespace for this mount */
+ struct user_namespace *user_ns;
+
+ /** Maximum read size */
+ unsigned max_read;
+
+ /** Maximum write size */
+ unsigned max_write;
+
+ /** Maximum number of pages that can be used in a single request */
+ unsigned int max_pages;
+
+ /** Constrain ->max_pages to this value during feature negotiation */
+ unsigned int max_pages_limit;
+
+ /** Input queue */
+ struct fuse_iqueue iq;
+
+ /** The next unique kernel file handle */
+ atomic64_t khctr;
+
+ /** rbtree of fuse_files waiting for poll events indexed by ph */
+ struct rb_root polled_files;
+
+ /** Maximum number of outstanding background requests */
+ unsigned max_background;
+
+ /** Number of background requests at which congestion starts */
+ unsigned congestion_threshold;
+
+ /** Number of requests currently in the background */
+ unsigned num_background;
+
+ /** Number of background requests currently queued for userspace */
+ unsigned active_background;
+
+ /** The list of background requests set aside for later queuing */
+ struct list_head bg_queue;
+
+ /** Protects: max_background, congestion_threshold, num_background,
+ * active_background, bg_queue, blocked */
+ spinlock_t bg_lock;
+
+ /** Flag indicating that INIT reply has been received. Allocating
+ * any fuse request will be suspended until the flag is set */
+ int initialized;
+
+ /** Flag indicating if connection is blocked. This will be
+ the case before the INIT reply is received, and if there
+ are too many outstading backgrounds requests */
+ int blocked;
+
+ /** waitq for blocked connection */
+ wait_queue_head_t blocked_waitq;
+
+ /** Connection established, cleared on umount, connection
+ abort and device release */
+ unsigned connected;
+
+ /** Connection aborted via sysfs */
+ bool aborted;
+
+ /** Connection failed (version mismatch). Cannot race with
+ setting other bitfields since it is only set once in INIT
+ reply, before any other request, and never cleared */
+ unsigned conn_error:1;
+
+ /** Connection successful. Only set in INIT */
+ unsigned conn_init:1;
+
+ /** Do readahead asynchronously? Only set in INIT */
+ unsigned async_read:1;
+
+ /** Return an unique read error after abort. Only set in INIT */
+ unsigned abort_err:1;
+
+ /** Do not send separate SETATTR request before open(O_TRUNC) */
+ unsigned atomic_o_trunc:1;
+
+ /** Filesystem supports NFS exporting. Only set in INIT */
+ unsigned export_support:1;
+
+ /** write-back cache policy (default is write-through) */
+ unsigned writeback_cache:1;
+
+ /** allow parallel lookups and readdir (default is serialized) */
+ unsigned parallel_dirops:1;
+
+ /** handle fs handles killing suid/sgid/cap on write/chown/trunc */
+ unsigned handle_killpriv:1;
+
+ /** cache READLINK responses in page cache */
+ unsigned cache_symlinks:1;
+
+ /* show legacy mount options */
+ unsigned int legacy_opts_show:1;
+
+ /*
+ * fs kills suid/sgid/cap on write/chown/trunc. suid is killed on
+ * write/trunc only if caller did not have CAP_FSETID. sgid is killed
+ * on write/truncate only if caller did not have CAP_FSETID as well as
+ * file has group execute permission.
+ */
+ unsigned handle_killpriv_v2:1;
+
+ /*
+ * The following bitfields are only for optimization purposes
+ * and hence races in setting them will not cause malfunction
+ */
+
+ /** Is open/release not implemented by fs? */
+ unsigned no_open:1;
+
+ /** Is opendir/releasedir not implemented by fs? */
+ unsigned no_opendir:1;
+
+ /** Is fsync not implemented by fs? */
+ unsigned no_fsync:1;
+
+ /** Is fsyncdir not implemented by fs? */
+ unsigned no_fsyncdir:1;
+
+ /** Is flush not implemented by fs? */
+ unsigned no_flush:1;
+
+ /** Is setxattr not implemented by fs? */
+ unsigned no_setxattr:1;
+
+ /** Does file server support extended setxattr */
+ unsigned setxattr_ext:1;
+
+ /** Is getxattr not implemented by fs? */
+ unsigned no_getxattr:1;
+
+ /** Is listxattr not implemented by fs? */
+ unsigned no_listxattr:1;
+
+ /** Is removexattr not implemented by fs? */
+ unsigned no_removexattr:1;
+
+ /** Are posix file locking primitives not implemented by fs? */
+ unsigned no_lock:1;
+
+ /** Is access not implemented by fs? */
+ unsigned no_access:1;
+
+ /** Is create not implemented by fs? */
+ unsigned no_create:1;
+
+ /** Is interrupt not implemented by fs? */
+ unsigned no_interrupt:1;
+
+ /** Is bmap not implemented by fs? */
+ unsigned no_bmap:1;
+
+ /** Is poll not implemented by fs? */
+ unsigned no_poll:1;
+
+ /** Do multi-page cached writes */
+ unsigned big_writes:1;
+
+ /** Don't apply umask to creation modes */
+ unsigned dont_mask:1;
+
+ /** Are BSD file locking primitives not implemented by fs? */
+ unsigned no_flock:1;
+
+ /** Is fallocate not implemented by fs? */
+ unsigned no_fallocate:1;
+
+ /** Is rename with flags implemented by fs? */
+ unsigned no_rename2:1;
+
+ /** Use enhanced/automatic page cache invalidation. */
+ unsigned auto_inval_data:1;
+
+ /** Filesystem is fully responsible for page cache invalidation. */
+ unsigned explicit_inval_data:1;
+
+ /** Does the filesystem support readdirplus? */
+ unsigned do_readdirplus:1;
+
+ /** Does the filesystem want adaptive readdirplus? */
+ unsigned readdirplus_auto:1;
+
+ /** Does the filesystem support asynchronous direct-IO submission? */
+ unsigned async_dio:1;
+
+ /** Is lseek not implemented by fs? */
+ unsigned no_lseek:1;
+
+ /** Does the filesystem support posix acls? */
+ unsigned posix_acl:1;
+
+ /** Check permissions based on the file mode or not? */
+ unsigned default_permissions:1;
+
+ /** Allow other than the mounter user to access the filesystem ? */
+ unsigned allow_other:1;
+
+ /** Does the filesystem support copy_file_range? */
+ unsigned no_copy_file_range:1;
+
+ /* Send DESTROY request */
+ unsigned int destroy:1;
+
+ /* Delete dentries that have gone stale */
+ unsigned int delete_stale:1;
+
+ /** Do not create entry in fusectl fs */
+ unsigned int no_control:1;
+
+ /** Do not allow MNT_FORCE umount */
+ unsigned int no_force_umount:1;
+
+ /* Auto-mount submounts announced by the server */
+ unsigned int auto_submounts:1;
+
+ /* Propagate syncfs() to server */
+ unsigned int sync_fs:1;
+
+ /* Initialize security xattrs when creating a new inode */
+ unsigned int init_security:1;
+
+ /* Does the filesystem support per inode DAX? */
+ unsigned int inode_dax:1;
+
+ /* Is tmpfile not implemented by fs? */
+ unsigned int no_tmpfile:1;
+
+ /** The number of requests waiting for completion */
+ atomic_t num_waiting;
+
+ /** Negotiated minor version */
+ unsigned minor;
+
+ /** Entry on the fuse_mount_list */
+ struct list_head entry;
+
+ /** Device ID from the root super block */
+ dev_t dev;
+
+ /** Dentries in the control filesystem */
+ struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES];
+
+ /** number of dentries used in the above array */
+ int ctl_ndents;
+
+ /** Key for lock owner ID scrambling */
+ u32 scramble_key[4];
+
+ /** Version counter for attribute changes */
+ atomic64_t attr_version;
+
+ /** Called on final put */
+ void (*release)(struct fuse_conn *);
+
+ /**
+ * Read/write semaphore to hold when accessing the sb of any
+ * fuse_mount belonging to this connection
+ */
+ struct rw_semaphore killsb;
+
+ /** List of device instances belonging to this connection */
+ struct list_head devices;
+
+#ifdef CONFIG_FUSE_DAX
+ /* Dax mode */
+ enum fuse_dax_mode dax_mode;
+
+ /* Dax specific conn data, non-NULL if DAX is enabled */
+ struct fuse_conn_dax *dax;
+#endif
+
+ /** List of filesystems using this connection */
+ struct list_head mounts;
+
+ /* New writepages go into this bucket */
+ struct fuse_sync_bucket __rcu *curr_bucket;
+};
+
+/*
+ * Represents a mounted filesystem, potentially a submount.
+ *
+ * This object allows sharing a fuse_conn between separate mounts to
+ * allow submounts with dedicated superblocks and thus separate device
+ * IDs.
+ */
+struct fuse_mount {
+ /* Underlying (potentially shared) connection to the FUSE server */
+ struct fuse_conn *fc;
+
+ /*
+ * Super block for this connection (fc->killsb must be held when
+ * accessing this).
+ */
+ struct super_block *sb;
+
+ /* Entry on fc->mounts */
+ struct list_head fc_entry;
+};
+
+static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+{
+ return get_fuse_mount_super(sb)->fc;
+}
+
+static inline struct fuse_mount *get_fuse_mount(struct inode *inode)
+{
+ return get_fuse_mount_super(inode->i_sb);
+}
+
+static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
+{
+ return get_fuse_mount_super(inode->i_sb)->fc;
+}
+
+static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
+{
+ return container_of(inode, struct fuse_inode, inode);
+}
+
+static inline u64 get_node_id(struct inode *inode)
+{
+ return get_fuse_inode(inode)->nodeid;
+}
+
+static inline int invalid_nodeid(u64 nodeid)
+{
+ return !nodeid || nodeid == FUSE_ROOT_ID;
+}
+
+static inline u64 fuse_get_attr_version(struct fuse_conn *fc)
+{
+ return atomic64_read(&fc->attr_version);
+}
+
+static inline bool fuse_stale_inode(const struct inode *inode, int generation,
+ struct fuse_attr *attr)
+{
+ return inode->i_generation != generation ||
+ inode_wrong_type(inode, attr->mode);
+}
+
+static inline void fuse_make_bad(struct inode *inode)
+{
+ remove_inode_hash(inode);
+ set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state);
+}
+
+static inline bool fuse_is_bad(struct inode *inode)
+{
+ return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state));
+}
+
+static inline struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
+ struct fuse_page_desc **desc)
+{
+ struct page **pages;
+
+ pages = kzalloc(npages * (sizeof(struct page *) +
+ sizeof(struct fuse_page_desc)), flags);
+ *desc = (void *) (pages + npages);
+
+ return pages;
+}
+
+static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
+ unsigned int index,
+ unsigned int nr_pages)
+{
+ int i;
+
+ for (i = index; i < index + nr_pages; i++)
+ descs[i].length = PAGE_SIZE - descs[i].offset;
+}
+
+static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket)
+{
+ /* Need RCU protection to prevent use after free after the decrement */
+ rcu_read_lock();
+ if (atomic_dec_and_test(&bucket->count))
+ wake_up(&bucket->waitq);
+ rcu_read_unlock();
+}
+
+/** Device operations */
+extern const struct file_operations fuse_dev_operations;
+
+extern const struct dentry_operations fuse_dentry_operations;
+extern const struct dentry_operations fuse_root_dentry_operations;
+
+/**
+ * Get a filled in inode
+ */
+struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
+ int generation, struct fuse_attr *attr,
+ u64 attr_valid, u64 attr_version);
+
+int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
+ struct fuse_entry_out *outarg, struct inode **inode);
+
+/**
+ * Send FORGET command
+ */
+void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
+ u64 nodeid, u64 nlookup);
+
+struct fuse_forget_link *fuse_alloc_forget(void);
+
+struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
+ unsigned int max,
+ unsigned int *countp);
+
+/*
+ * Initialize READ or READDIR request
+ */
+struct fuse_io_args {
+ union {
+ struct {
+ struct fuse_read_in in;
+ u64 attr_ver;
+ } read;
+ struct {
+ struct fuse_write_in in;
+ struct fuse_write_out out;
+ bool page_locked;
+ } write;
+ };
+ struct fuse_args_pages ap;
+ struct fuse_io_priv *io;
+ struct fuse_file *ff;
+};
+
+void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
+ size_t count, int opcode);
+
+
+/**
+ * Send OPEN or OPENDIR request
+ */
+int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
+
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm);
+void fuse_file_free(struct fuse_file *ff);
+void fuse_finish_open(struct inode *inode, struct file *file);
+
+void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
+ unsigned int flags);
+
+/**
+ * Send RELEASE or RELEASEDIR request
+ */
+void fuse_release_common(struct file *file, bool isdir);
+
+/**
+ * Send FSYNC or FSYNCDIR request
+ */
+int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
+ int datasync, int opcode);
+
+/**
+ * Notify poll wakeup
+ */
+int fuse_notify_poll_wakeup(struct fuse_conn *fc,
+ struct fuse_notify_poll_wakeup_out *outarg);
+
+/**
+ * Initialize file operations on a regular file
+ */
+void fuse_init_file_inode(struct inode *inode, unsigned int flags);
+
+/**
+ * Initialize inode operations on regular files and special files
+ */
+void fuse_init_common(struct inode *inode);
+
+/**
+ * Initialize inode and file operations on a directory
+ */
+void fuse_init_dir(struct inode *inode);
+
+/**
+ * Initialize inode operations on a symlink
+ */
+void fuse_init_symlink(struct inode *inode);
+
+/**
+ * Change attributes of an inode
+ */
+void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
+ u64 attr_valid, u64 attr_version);
+
+void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
+ u64 attr_valid, u32 cache_mask);
+
+u32 fuse_get_cache_mask(struct inode *inode);
+
+/**
+ * Initialize the client device
+ */
+int fuse_dev_init(void);
+
+/**
+ * Cleanup the client device
+ */
+void fuse_dev_cleanup(void);
+
+int fuse_ctl_init(void);
+void __exit fuse_ctl_cleanup(void);
+
+/**
+ * Simple request sending that does request allocation and freeing
+ */
+ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args);
+int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
+ gfp_t gfp_flags);
+
+/**
+ * End a finished request
+ */
+void fuse_request_end(struct fuse_req *req);
+
+/* Abort all requests */
+void fuse_abort_conn(struct fuse_conn *fc);
+void fuse_wait_aborted(struct fuse_conn *fc);
+
+/**
+ * Invalidate inode attributes
+ */
+
+/* Attributes possibly changed on data modification */
+#define FUSE_STATX_MODIFY (STATX_MTIME | STATX_CTIME | STATX_BLOCKS)
+
+/* Attributes possibly changed on data and/or size modification */
+#define FUSE_STATX_MODSIZE (FUSE_STATX_MODIFY | STATX_SIZE)
+
+void fuse_invalidate_attr(struct inode *inode);
+void fuse_invalidate_attr_mask(struct inode *inode, u32 mask);
+
+void fuse_invalidate_entry_cache(struct dentry *entry);
+
+void fuse_invalidate_atime(struct inode *inode);
+
+u64 entry_attr_timeout(struct fuse_entry_out *o);
+void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o);
+
+/**
+ * Acquire reference to fuse_conn
+ */
+struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
+
+/**
+ * Initialize fuse_conn
+ */
+void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
+ struct user_namespace *user_ns,
+ const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv);
+
+/**
+ * Release reference to fuse_conn
+ */
+void fuse_conn_put(struct fuse_conn *fc);
+
+struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
+struct fuse_dev *fuse_dev_alloc(void);
+void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
+void fuse_dev_free(struct fuse_dev *fud);
+void fuse_send_init(struct fuse_mount *fm);
+
+/**
+ * Fill in superblock and initialize fuse connection
+ * @sb: partially-initialized superblock to fill in
+ * @ctx: mount context
+ */
+int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx);
+
+/*
+ * Remove the mount from the connection
+ *
+ * Returns whether this was the last mount
+ */
+bool fuse_mount_remove(struct fuse_mount *fm);
+
+/*
+ * Setup context ops for submounts
+ */
+int fuse_init_fs_context_submount(struct fs_context *fsc);
+
+/*
+ * Shut down the connection (possibly sending DESTROY request).
+ */
+void fuse_conn_destroy(struct fuse_mount *fm);
+
+/* Drop the connection and free the fuse mount */
+void fuse_mount_destroy(struct fuse_mount *fm);
+
+/**
+ * Add connection to control filesystem
+ */
+int fuse_ctl_add_conn(struct fuse_conn *fc);
+
+/**
+ * Remove connection from control filesystem
+ */
+void fuse_ctl_remove_conn(struct fuse_conn *fc);
+
+/**
+ * Is file type valid?
+ */
+int fuse_valid_type(int m);
+
+bool fuse_invalid_attr(struct fuse_attr *attr);
+
+/**
+ * Is current process allowed to perform filesystem operation?
+ */
+int fuse_allow_current_process(struct fuse_conn *fc);
+
+u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
+
+void fuse_flush_time_update(struct inode *inode);
+void fuse_update_ctime(struct inode *inode);
+
+int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask);
+
+void fuse_flush_writepages(struct inode *inode);
+
+void fuse_set_nowrite(struct inode *inode);
+void fuse_release_nowrite(struct inode *inode);
+
+/**
+ * Scan all fuse_mounts belonging to fc to find the first where
+ * ilookup5() returns a result. Return that result and the
+ * respective fuse_mount in *fm (unless fm is NULL).
+ *
+ * The caller must hold fc->killsb.
+ */
+struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid,
+ struct fuse_mount **fm);
+
+/**
+ * File-system tells the kernel to invalidate cache for the given node id.
+ */
+int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
+ loff_t offset, loff_t len);
+
+/**
+ * File-system tells the kernel to invalidate parent attributes and
+ * the dentry matching parent/name.
+ *
+ * If the child_nodeid is non-zero and:
+ * - matches the inode number for the dentry matching parent/name,
+ * - is not a mount point
+ * - is a file or oan empty directory
+ * then the dentry is unhashed (d_delete()).
+ */
+int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
+ u64 child_nodeid, struct qstr *name);
+
+int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
+ bool isdir);
+
+/**
+ * fuse_direct_io() flags
+ */
+
+/** If set, it is WRITE; otherwise - READ */
+#define FUSE_DIO_WRITE (1 << 0)
+
+/** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */
+#define FUSE_DIO_CUSE (1 << 1)
+
+ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
+ loff_t *ppos, int flags);
+long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
+ unsigned int flags);
+long fuse_ioctl_common(struct file *file, unsigned int cmd,
+ unsigned long arg, unsigned int flags);
+__poll_t fuse_file_poll(struct file *file, poll_table *wait);
+int fuse_dev_release(struct inode *inode, struct file *file);
+
+bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written);
+
+int fuse_flush_times(struct inode *inode, struct fuse_file *ff);
+int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
+
+int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
+ struct file *file);
+
+void fuse_set_initialized(struct fuse_conn *fc);
+
+void fuse_unlock_inode(struct inode *inode, bool locked);
+bool fuse_lock_inode(struct inode *inode);
+
+int fuse_setxattr(struct inode *inode, const char *name, const void *value,
+ size_t size, int flags, unsigned int extra_flags);
+ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
+ size_t size);
+ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size);
+int fuse_removexattr(struct inode *inode, const char *name);
+extern const struct xattr_handler *fuse_xattr_handlers[];
+extern const struct xattr_handler *fuse_acl_xattr_handlers[];
+extern const struct xattr_handler *fuse_no_acl_xattr_handlers[];
+
+struct posix_acl;
+struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu);
+int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type);
+
+/* readdir.c */
+int fuse_readdir(struct file *file, struct dir_context *ctx);
+
+/**
+ * Return the number of bytes in an arguments list
+ */
+unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args);
+
+/**
+ * Get the next unique ID for a request
+ */
+u64 fuse_get_unique(struct fuse_iqueue *fiq);
+void fuse_free_conn(struct fuse_conn *fc);
+
+/* dax.c */
+
+#define FUSE_IS_DAX(inode) (IS_ENABLED(CONFIG_FUSE_DAX) && IS_DAX(inode))
+
+ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
+ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
+int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma);
+int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end);
+int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode mode,
+ struct dax_device *dax_dev);
+void fuse_dax_conn_free(struct fuse_conn *fc);
+bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi);
+void fuse_dax_inode_init(struct inode *inode, unsigned int flags);
+void fuse_dax_inode_cleanup(struct inode *inode);
+void fuse_dax_dontcache(struct inode *inode, unsigned int flags);
+bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment);
+void fuse_dax_cancel_work(struct fuse_conn *fc);
+
+/* ioctl.c */
+long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg);
+int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int fuse_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
+
+/* file.c */
+
+struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
+ unsigned int open_flags, bool isdir);
+void fuse_file_release(struct inode *inode, struct fuse_file *ff,
+ unsigned int open_flags, fl_owner_t id, bool isdir);
+
+#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
new file mode 100644
index 000000000..f81000d96
--- /dev/null
+++ b/fs/fuse/inode.c
@@ -0,0 +1,2068 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/statfs.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/exportfs.h>
+#include <linux/posix_acl.h>
+#include <linux/pid_namespace.h>
+#include <uapi/linux/magic.h>
+
+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
+MODULE_DESCRIPTION("Filesystem in Userspace");
+MODULE_LICENSE("GPL");
+
+static struct kmem_cache *fuse_inode_cachep;
+struct list_head fuse_conn_list;
+DEFINE_MUTEX(fuse_mutex);
+
+static int set_global_limit(const char *val, const struct kernel_param *kp);
+
+unsigned max_user_bgreq;
+module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
+ &max_user_bgreq, 0644);
+__MODULE_PARM_TYPE(max_user_bgreq, "uint");
+MODULE_PARM_DESC(max_user_bgreq,
+ "Global limit for the maximum number of backgrounded requests an "
+ "unprivileged user can set");
+
+unsigned max_user_congthresh;
+module_param_call(max_user_congthresh, set_global_limit, param_get_uint,
+ &max_user_congthresh, 0644);
+__MODULE_PARM_TYPE(max_user_congthresh, "uint");
+MODULE_PARM_DESC(max_user_congthresh,
+ "Global limit for the maximum congestion threshold an "
+ "unprivileged user can set");
+
+#define FUSE_DEFAULT_BLKSIZE 512
+
+/** Maximum number of outstanding background requests */
+#define FUSE_DEFAULT_MAX_BACKGROUND 12
+
+/** Congestion starts at 75% of maximum */
+#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
+
+#ifdef CONFIG_BLOCK
+static struct file_system_type fuseblk_fs_type;
+#endif
+
+struct fuse_forget_link *fuse_alloc_forget(void)
+{
+ return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT);
+}
+
+static struct fuse_submount_lookup *fuse_alloc_submount_lookup(void)
+{
+ struct fuse_submount_lookup *sl;
+
+ sl = kzalloc(sizeof(struct fuse_submount_lookup), GFP_KERNEL_ACCOUNT);
+ if (!sl)
+ return NULL;
+ sl->forget = fuse_alloc_forget();
+ if (!sl->forget)
+ goto out_free;
+
+ return sl;
+
+out_free:
+ kfree(sl);
+ return NULL;
+}
+
+static struct inode *fuse_alloc_inode(struct super_block *sb)
+{
+ struct fuse_inode *fi;
+
+ fi = alloc_inode_sb(sb, fuse_inode_cachep, GFP_KERNEL);
+ if (!fi)
+ return NULL;
+
+ fi->i_time = 0;
+ fi->inval_mask = 0;
+ fi->nodeid = 0;
+ fi->nlookup = 0;
+ fi->attr_version = 0;
+ fi->orig_ino = 0;
+ fi->state = 0;
+ fi->submount_lookup = NULL;
+ mutex_init(&fi->mutex);
+ spin_lock_init(&fi->lock);
+ fi->forget = fuse_alloc_forget();
+ if (!fi->forget)
+ goto out_free;
+
+ if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi))
+ goto out_free_forget;
+
+ return &fi->inode;
+
+out_free_forget:
+ kfree(fi->forget);
+out_free:
+ kmem_cache_free(fuse_inode_cachep, fi);
+ return NULL;
+}
+
+static void fuse_free_inode(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ mutex_destroy(&fi->mutex);
+ kfree(fi->forget);
+#ifdef CONFIG_FUSE_DAX
+ kfree(fi->dax);
+#endif
+ kmem_cache_free(fuse_inode_cachep, fi);
+}
+
+static void fuse_cleanup_submount_lookup(struct fuse_conn *fc,
+ struct fuse_submount_lookup *sl)
+{
+ if (!refcount_dec_and_test(&sl->count))
+ return;
+
+ fuse_queue_forget(fc, sl->forget, sl->nodeid, 1);
+ sl->forget = NULL;
+ kfree(sl);
+}
+
+static void fuse_evict_inode(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ /* Will write inode on close/munmap and in all other dirtiers */
+ WARN_ON(inode->i_state & I_DIRTY_INODE);
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+ if (inode->i_sb->s_flags & SB_ACTIVE) {
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (FUSE_IS_DAX(inode))
+ fuse_dax_inode_cleanup(inode);
+ if (fi->nlookup) {
+ fuse_queue_forget(fc, fi->forget, fi->nodeid,
+ fi->nlookup);
+ fi->forget = NULL;
+ }
+
+ if (fi->submount_lookup) {
+ fuse_cleanup_submount_lookup(fc, fi->submount_lookup);
+ fi->submount_lookup = NULL;
+ }
+ }
+ if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) {
+ WARN_ON(!list_empty(&fi->write_files));
+ WARN_ON(!list_empty(&fi->queued_writes));
+ }
+}
+
+static int fuse_reconfigure(struct fs_context *fsc)
+{
+ struct super_block *sb = fsc->root->d_sb;
+
+ sync_filesystem(sb);
+ if (fsc->sb_flags & SB_MANDLOCK)
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down
+ * so that it will fit.
+ */
+static ino_t fuse_squash_ino(u64 ino64)
+{
+ ino_t ino = (ino_t) ino64;
+ if (sizeof(ino_t) < sizeof(u64))
+ ino ^= ino64 >> (sizeof(u64) - sizeof(ino_t)) * 8;
+ return ino;
+}
+
+void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
+ u64 attr_valid, u32 cache_mask)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ lockdep_assert_held(&fi->lock);
+
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ fi->i_time = attr_valid;
+ WRITE_ONCE(fi->inval_mask, 0);
+
+ inode->i_ino = fuse_squash_ino(attr->ino);
+ inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
+ set_nlink(inode, attr->nlink);
+ inode->i_uid = make_kuid(fc->user_ns, attr->uid);
+ inode->i_gid = make_kgid(fc->user_ns, attr->gid);
+ inode->i_blocks = attr->blocks;
+
+ /* Sanitize nsecs */
+ attr->atimensec = min_t(u32, attr->atimensec, NSEC_PER_SEC - 1);
+ attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1);
+ attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1);
+
+ inode->i_atime.tv_sec = attr->atime;
+ inode->i_atime.tv_nsec = attr->atimensec;
+ /* mtime from server may be stale due to local buffered write */
+ if (!(cache_mask & STATX_MTIME)) {
+ inode->i_mtime.tv_sec = attr->mtime;
+ inode->i_mtime.tv_nsec = attr->mtimensec;
+ }
+ if (!(cache_mask & STATX_CTIME)) {
+ inode->i_ctime.tv_sec = attr->ctime;
+ inode->i_ctime.tv_nsec = attr->ctimensec;
+ }
+
+ if (attr->blksize != 0)
+ inode->i_blkbits = ilog2(attr->blksize);
+ else
+ inode->i_blkbits = inode->i_sb->s_blocksize_bits;
+
+ /*
+ * Don't set the sticky bit in i_mode, unless we want the VFS
+ * to check permissions. This prevents failures due to the
+ * check in may_delete().
+ */
+ fi->orig_i_mode = inode->i_mode;
+ if (!fc->default_permissions)
+ inode->i_mode &= ~S_ISVTX;
+
+ fi->orig_ino = attr->ino;
+
+ /*
+ * We are refreshing inode data and it is possible that another
+ * client set suid/sgid or security.capability xattr. So clear
+ * S_NOSEC. Ideally, we could have cleared it only if suid/sgid
+ * was set or if security.capability xattr was set. But we don't
+ * know if security.capability has been set or not. So clear it
+ * anyway. Its less efficient but should be safe.
+ */
+ inode->i_flags &= ~S_NOSEC;
+}
+
+u32 fuse_get_cache_mask(struct inode *inode)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (!fc->writeback_cache || !S_ISREG(inode->i_mode))
+ return 0;
+
+ return STATX_MTIME | STATX_CTIME | STATX_SIZE;
+}
+
+void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
+ u64 attr_valid, u64 attr_version)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ u32 cache_mask;
+ loff_t oldsize;
+ struct timespec64 old_mtime;
+
+ spin_lock(&fi->lock);
+ /*
+ * In case of writeback_cache enabled, writes update mtime, ctime and
+ * may update i_size. In these cases trust the cached value in the
+ * inode.
+ */
+ cache_mask = fuse_get_cache_mask(inode);
+ if (cache_mask & STATX_SIZE)
+ attr->size = i_size_read(inode);
+
+ if (cache_mask & STATX_MTIME) {
+ attr->mtime = inode->i_mtime.tv_sec;
+ attr->mtimensec = inode->i_mtime.tv_nsec;
+ }
+ if (cache_mask & STATX_CTIME) {
+ attr->ctime = inode->i_ctime.tv_sec;
+ attr->ctimensec = inode->i_ctime.tv_nsec;
+ }
+
+ if ((attr_version != 0 && fi->attr_version > attr_version) ||
+ test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
+ spin_unlock(&fi->lock);
+ return;
+ }
+
+ old_mtime = inode->i_mtime;
+ fuse_change_attributes_common(inode, attr, attr_valid, cache_mask);
+
+ oldsize = inode->i_size;
+ /*
+ * In case of writeback_cache enabled, the cached writes beyond EOF
+ * extend local i_size without keeping userspace server in sync. So,
+ * attr->size coming from server can be stale. We cannot trust it.
+ */
+ if (!(cache_mask & STATX_SIZE))
+ i_size_write(inode, attr->size);
+ spin_unlock(&fi->lock);
+
+ if (!cache_mask && S_ISREG(inode->i_mode)) {
+ bool inval = false;
+
+ if (oldsize != attr->size) {
+ truncate_pagecache(inode, attr->size);
+ if (!fc->explicit_inval_data)
+ inval = true;
+ } else if (fc->auto_inval_data) {
+ struct timespec64 new_mtime = {
+ .tv_sec = attr->mtime,
+ .tv_nsec = attr->mtimensec,
+ };
+
+ /*
+ * Auto inval mode also checks and invalidates if mtime
+ * has changed.
+ */
+ if (!timespec64_equal(&old_mtime, &new_mtime))
+ inval = true;
+ }
+
+ if (inval)
+ invalidate_inode_pages2(inode->i_mapping);
+ }
+
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_dontcache(inode, attr->flags);
+}
+
+static void fuse_init_submount_lookup(struct fuse_submount_lookup *sl,
+ u64 nodeid)
+{
+ sl->nodeid = nodeid;
+ refcount_set(&sl->count, 1);
+}
+
+static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
+{
+ inode->i_mode = attr->mode & S_IFMT;
+ inode->i_size = attr->size;
+ inode->i_mtime.tv_sec = attr->mtime;
+ inode->i_mtime.tv_nsec = attr->mtimensec;
+ inode->i_ctime.tv_sec = attr->ctime;
+ inode->i_ctime.tv_nsec = attr->ctimensec;
+ if (S_ISREG(inode->i_mode)) {
+ fuse_init_common(inode);
+ fuse_init_file_inode(inode, attr->flags);
+ } else if (S_ISDIR(inode->i_mode))
+ fuse_init_dir(inode);
+ else if (S_ISLNK(inode->i_mode))
+ fuse_init_symlink(inode);
+ else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+ fuse_init_common(inode);
+ init_special_inode(inode, inode->i_mode,
+ new_decode_dev(attr->rdev));
+ } else
+ BUG();
+}
+
+static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
+{
+ u64 nodeid = *(u64 *) _nodeidp;
+ if (get_node_id(inode) == nodeid)
+ return 1;
+ else
+ return 0;
+}
+
+static int fuse_inode_set(struct inode *inode, void *_nodeidp)
+{
+ u64 nodeid = *(u64 *) _nodeidp;
+ get_fuse_inode(inode)->nodeid = nodeid;
+ return 0;
+}
+
+struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
+ int generation, struct fuse_attr *attr,
+ u64 attr_valid, u64 attr_version)
+{
+ struct inode *inode;
+ struct fuse_inode *fi;
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+ /*
+ * Auto mount points get their node id from the submount root, which is
+ * not a unique identifier within this filesystem.
+ *
+ * To avoid conflicts, do not place submount points into the inode hash
+ * table.
+ */
+ if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) &&
+ S_ISDIR(attr->mode)) {
+ struct fuse_inode *fi;
+
+ inode = new_inode(sb);
+ if (!inode)
+ return NULL;
+
+ fuse_init_inode(inode, attr);
+ fi = get_fuse_inode(inode);
+ fi->nodeid = nodeid;
+ fi->submount_lookup = fuse_alloc_submount_lookup();
+ if (!fi->submount_lookup) {
+ iput(inode);
+ return NULL;
+ }
+ /* Sets nlookup = 1 on fi->submount_lookup->nlookup */
+ fuse_init_submount_lookup(fi->submount_lookup, nodeid);
+ inode->i_flags |= S_AUTOMOUNT;
+ goto done;
+ }
+
+retry:
+ inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid);
+ if (!inode)
+ return NULL;
+
+ if ((inode->i_state & I_NEW)) {
+ inode->i_flags |= S_NOATIME;
+ if (!fc->writeback_cache || !S_ISREG(attr->mode))
+ inode->i_flags |= S_NOCMTIME;
+ inode->i_generation = generation;
+ fuse_init_inode(inode, attr);
+ unlock_new_inode(inode);
+ } else if (fuse_stale_inode(inode, generation, attr)) {
+ /* nodeid was reused, any I/O on the old inode should fail */
+ fuse_make_bad(inode);
+ iput(inode);
+ goto retry;
+ }
+ fi = get_fuse_inode(inode);
+ spin_lock(&fi->lock);
+ fi->nlookup++;
+ spin_unlock(&fi->lock);
+done:
+ fuse_change_attributes(inode, attr, attr_valid, attr_version);
+
+ return inode;
+}
+
+struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid,
+ struct fuse_mount **fm)
+{
+ struct fuse_mount *fm_iter;
+ struct inode *inode;
+
+ WARN_ON(!rwsem_is_locked(&fc->killsb));
+ list_for_each_entry(fm_iter, &fc->mounts, fc_entry) {
+ if (!fm_iter->sb)
+ continue;
+
+ inode = ilookup5(fm_iter->sb, nodeid, fuse_inode_eq, &nodeid);
+ if (inode) {
+ if (fm)
+ *fm = fm_iter;
+ return inode;
+ }
+ }
+
+ return NULL;
+}
+
+int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
+ loff_t offset, loff_t len)
+{
+ struct fuse_inode *fi;
+ struct inode *inode;
+ pgoff_t pg_start;
+ pgoff_t pg_end;
+
+ inode = fuse_ilookup(fc, nodeid, NULL);
+ if (!inode)
+ return -ENOENT;
+
+ fi = get_fuse_inode(inode);
+ spin_lock(&fi->lock);
+ fi->attr_version = atomic64_inc_return(&fc->attr_version);
+ spin_unlock(&fi->lock);
+
+ fuse_invalidate_attr(inode);
+ forget_all_cached_acls(inode);
+ if (offset >= 0) {
+ pg_start = offset >> PAGE_SHIFT;
+ if (len <= 0)
+ pg_end = -1;
+ else
+ pg_end = (offset + len - 1) >> PAGE_SHIFT;
+ invalidate_inode_pages2_range(inode->i_mapping,
+ pg_start, pg_end);
+ }
+ iput(inode);
+ return 0;
+}
+
+bool fuse_lock_inode(struct inode *inode)
+{
+ bool locked = false;
+
+ if (!get_fuse_conn(inode)->parallel_dirops) {
+ mutex_lock(&get_fuse_inode(inode)->mutex);
+ locked = true;
+ }
+
+ return locked;
+}
+
+void fuse_unlock_inode(struct inode *inode, bool locked)
+{
+ if (locked)
+ mutex_unlock(&get_fuse_inode(inode)->mutex);
+}
+
+static void fuse_umount_begin(struct super_block *sb)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+ if (fc->no_force_umount)
+ return;
+
+ fuse_abort_conn(fc);
+
+ // Only retire block-device-based superblocks.
+ if (sb->s_bdev != NULL)
+ retire_super(sb);
+}
+
+static void fuse_send_destroy(struct fuse_mount *fm)
+{
+ if (fm->fc->conn_init) {
+ FUSE_ARGS(args);
+
+ args.opcode = FUSE_DESTROY;
+ args.force = true;
+ args.nocreds = true;
+ fuse_simple_request(fm, &args);
+ }
+}
+
+static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
+{
+ stbuf->f_type = FUSE_SUPER_MAGIC;
+ stbuf->f_bsize = attr->bsize;
+ stbuf->f_frsize = attr->frsize;
+ stbuf->f_blocks = attr->blocks;
+ stbuf->f_bfree = attr->bfree;
+ stbuf->f_bavail = attr->bavail;
+ stbuf->f_files = attr->files;
+ stbuf->f_ffree = attr->ffree;
+ stbuf->f_namelen = attr->namelen;
+ /* fsid is left zero */
+}
+
+static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ FUSE_ARGS(args);
+ struct fuse_statfs_out outarg;
+ int err;
+
+ if (!fuse_allow_current_process(fm->fc)) {
+ buf->f_type = FUSE_SUPER_MAGIC;
+ return 0;
+ }
+
+ memset(&outarg, 0, sizeof(outarg));
+ args.in_numargs = 0;
+ args.opcode = FUSE_STATFS;
+ args.nodeid = get_node_id(d_inode(dentry));
+ args.out_numargs = 1;
+ args.out_args[0].size = sizeof(outarg);
+ args.out_args[0].value = &outarg;
+ err = fuse_simple_request(fm, &args);
+ if (!err)
+ convert_fuse_statfs(buf, &outarg.st);
+ return err;
+}
+
+static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void)
+{
+ struct fuse_sync_bucket *bucket;
+
+ bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL);
+ if (bucket) {
+ init_waitqueue_head(&bucket->waitq);
+ /* Initial active count */
+ atomic_set(&bucket->count, 1);
+ }
+ return bucket;
+}
+
+static void fuse_sync_fs_writes(struct fuse_conn *fc)
+{
+ struct fuse_sync_bucket *bucket, *new_bucket;
+ int count;
+
+ new_bucket = fuse_sync_bucket_alloc();
+ spin_lock(&fc->lock);
+ bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+ count = atomic_read(&bucket->count);
+ WARN_ON(count < 1);
+ /* No outstanding writes? */
+ if (count == 1) {
+ spin_unlock(&fc->lock);
+ kfree(new_bucket);
+ return;
+ }
+
+ /*
+ * Completion of new bucket depends on completion of this bucket, so add
+ * one more count.
+ */
+ atomic_inc(&new_bucket->count);
+ rcu_assign_pointer(fc->curr_bucket, new_bucket);
+ spin_unlock(&fc->lock);
+ /*
+ * Drop initial active count. At this point if all writes in this and
+ * ancestor buckets complete, the count will go to zero and this task
+ * will be woken up.
+ */
+ atomic_dec(&bucket->count);
+
+ wait_event(bucket->waitq, atomic_read(&bucket->count) == 0);
+
+ /* Drop temp count on descendant bucket */
+ fuse_sync_bucket_dec(new_bucket);
+ kfree_rcu(bucket, rcu);
+}
+
+static int fuse_sync_fs(struct super_block *sb, int wait)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct fuse_conn *fc = fm->fc;
+ struct fuse_syncfs_in inarg;
+ FUSE_ARGS(args);
+ int err;
+
+ /*
+ * Userspace cannot handle the wait == 0 case. Avoid a
+ * gratuitous roundtrip.
+ */
+ if (!wait)
+ return 0;
+
+ /* The filesystem is being unmounted. Nothing to do. */
+ if (!sb->s_root)
+ return 0;
+
+ if (!fc->sync_fs)
+ return 0;
+
+ fuse_sync_fs_writes(fc);
+
+ memset(&inarg, 0, sizeof(inarg));
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.opcode = FUSE_SYNCFS;
+ args.nodeid = get_node_id(sb->s_root->d_inode);
+ args.out_numargs = 0;
+
+ err = fuse_simple_request(fm, &args);
+ if (err == -ENOSYS) {
+ fc->sync_fs = 0;
+ err = 0;
+ }
+
+ return err;
+}
+
+enum {
+ OPT_SOURCE,
+ OPT_SUBTYPE,
+ OPT_FD,
+ OPT_ROOTMODE,
+ OPT_USER_ID,
+ OPT_GROUP_ID,
+ OPT_DEFAULT_PERMISSIONS,
+ OPT_ALLOW_OTHER,
+ OPT_MAX_READ,
+ OPT_BLKSIZE,
+ OPT_ERR
+};
+
+static const struct fs_parameter_spec fuse_fs_parameters[] = {
+ fsparam_string ("source", OPT_SOURCE),
+ fsparam_u32 ("fd", OPT_FD),
+ fsparam_u32oct ("rootmode", OPT_ROOTMODE),
+ fsparam_u32 ("user_id", OPT_USER_ID),
+ fsparam_u32 ("group_id", OPT_GROUP_ID),
+ fsparam_flag ("default_permissions", OPT_DEFAULT_PERMISSIONS),
+ fsparam_flag ("allow_other", OPT_ALLOW_OTHER),
+ fsparam_u32 ("max_read", OPT_MAX_READ),
+ fsparam_u32 ("blksize", OPT_BLKSIZE),
+ fsparam_string ("subtype", OPT_SUBTYPE),
+ {}
+};
+
+static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param)
+{
+ struct fs_parse_result result;
+ struct fuse_fs_context *ctx = fsc->fs_private;
+ int opt;
+
+ if (fsc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ /*
+ * Ignore options coming from mount(MS_REMOUNT) for backward
+ * compatibility.
+ */
+ if (fsc->oldapi)
+ return 0;
+
+ return invalfc(fsc, "No changes allowed in reconfigure");
+ }
+
+ opt = fs_parse(fsc, fuse_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case OPT_SOURCE:
+ if (fsc->source)
+ return invalfc(fsc, "Multiple sources specified");
+ fsc->source = param->string;
+ param->string = NULL;
+ break;
+
+ case OPT_SUBTYPE:
+ if (ctx->subtype)
+ return invalfc(fsc, "Multiple subtypes specified");
+ ctx->subtype = param->string;
+ param->string = NULL;
+ return 0;
+
+ case OPT_FD:
+ ctx->fd = result.uint_32;
+ ctx->fd_present = true;
+ break;
+
+ case OPT_ROOTMODE:
+ if (!fuse_valid_type(result.uint_32))
+ return invalfc(fsc, "Invalid rootmode");
+ ctx->rootmode = result.uint_32;
+ ctx->rootmode_present = true;
+ break;
+
+ case OPT_USER_ID:
+ ctx->user_id = make_kuid(fsc->user_ns, result.uint_32);
+ if (!uid_valid(ctx->user_id))
+ return invalfc(fsc, "Invalid user_id");
+ ctx->user_id_present = true;
+ break;
+
+ case OPT_GROUP_ID:
+ ctx->group_id = make_kgid(fsc->user_ns, result.uint_32);
+ if (!gid_valid(ctx->group_id))
+ return invalfc(fsc, "Invalid group_id");
+ ctx->group_id_present = true;
+ break;
+
+ case OPT_DEFAULT_PERMISSIONS:
+ ctx->default_permissions = true;
+ break;
+
+ case OPT_ALLOW_OTHER:
+ ctx->allow_other = true;
+ break;
+
+ case OPT_MAX_READ:
+ ctx->max_read = result.uint_32;
+ break;
+
+ case OPT_BLKSIZE:
+ if (!ctx->is_bdev)
+ return invalfc(fsc, "blksize only supported for fuseblk");
+ ctx->blksize = result.uint_32;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void fuse_free_fsc(struct fs_context *fsc)
+{
+ struct fuse_fs_context *ctx = fsc->fs_private;
+
+ if (ctx) {
+ kfree(ctx->subtype);
+ kfree(ctx);
+ }
+}
+
+static int fuse_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct super_block *sb = root->d_sb;
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+ if (fc->legacy_opts_show) {
+ seq_printf(m, ",user_id=%u",
+ from_kuid_munged(fc->user_ns, fc->user_id));
+ seq_printf(m, ",group_id=%u",
+ from_kgid_munged(fc->user_ns, fc->group_id));
+ if (fc->default_permissions)
+ seq_puts(m, ",default_permissions");
+ if (fc->allow_other)
+ seq_puts(m, ",allow_other");
+ if (fc->max_read != ~0)
+ seq_printf(m, ",max_read=%u", fc->max_read);
+ if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
+ seq_printf(m, ",blksize=%lu", sb->s_blocksize);
+ }
+#ifdef CONFIG_FUSE_DAX
+ if (fc->dax_mode == FUSE_DAX_ALWAYS)
+ seq_puts(m, ",dax=always");
+ else if (fc->dax_mode == FUSE_DAX_NEVER)
+ seq_puts(m, ",dax=never");
+ else if (fc->dax_mode == FUSE_DAX_INODE_USER)
+ seq_puts(m, ",dax=inode");
+#endif
+
+ return 0;
+}
+
+static void fuse_iqueue_init(struct fuse_iqueue *fiq,
+ const struct fuse_iqueue_ops *ops,
+ void *priv)
+{
+ memset(fiq, 0, sizeof(struct fuse_iqueue));
+ spin_lock_init(&fiq->lock);
+ init_waitqueue_head(&fiq->waitq);
+ INIT_LIST_HEAD(&fiq->pending);
+ INIT_LIST_HEAD(&fiq->interrupts);
+ fiq->forget_list_tail = &fiq->forget_list_head;
+ fiq->connected = 1;
+ fiq->ops = ops;
+ fiq->priv = priv;
+}
+
+static void fuse_pqueue_init(struct fuse_pqueue *fpq)
+{
+ unsigned int i;
+
+ spin_lock_init(&fpq->lock);
+ for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&fpq->processing[i]);
+ INIT_LIST_HEAD(&fpq->io);
+ fpq->connected = 1;
+}
+
+void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
+ struct user_namespace *user_ns,
+ const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv)
+{
+ memset(fc, 0, sizeof(*fc));
+ spin_lock_init(&fc->lock);
+ spin_lock_init(&fc->bg_lock);
+ init_rwsem(&fc->killsb);
+ refcount_set(&fc->count, 1);
+ atomic_set(&fc->dev_count, 1);
+ init_waitqueue_head(&fc->blocked_waitq);
+ fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv);
+ INIT_LIST_HEAD(&fc->bg_queue);
+ INIT_LIST_HEAD(&fc->entry);
+ INIT_LIST_HEAD(&fc->devices);
+ atomic_set(&fc->num_waiting, 0);
+ fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
+ fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
+ atomic64_set(&fc->khctr, 0);
+ fc->polled_files = RB_ROOT;
+ fc->blocked = 0;
+ fc->initialized = 0;
+ fc->connected = 1;
+ atomic64_set(&fc->attr_version, 1);
+ get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
+ fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
+ fc->user_ns = get_user_ns(user_ns);
+ fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
+ fc->max_pages_limit = FUSE_MAX_MAX_PAGES;
+
+ INIT_LIST_HEAD(&fc->mounts);
+ list_add(&fm->fc_entry, &fc->mounts);
+ fm->fc = fc;
+}
+EXPORT_SYMBOL_GPL(fuse_conn_init);
+
+void fuse_conn_put(struct fuse_conn *fc)
+{
+ if (refcount_dec_and_test(&fc->count)) {
+ struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_sync_bucket *bucket;
+
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_conn_free(fc);
+ if (fiq->ops->release)
+ fiq->ops->release(fiq);
+ put_pid_ns(fc->pid_ns);
+ put_user_ns(fc->user_ns);
+ bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+ if (bucket) {
+ WARN_ON(atomic_read(&bucket->count) != 1);
+ kfree(bucket);
+ }
+ fc->release(fc);
+ }
+}
+EXPORT_SYMBOL_GPL(fuse_conn_put);
+
+struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
+{
+ refcount_inc(&fc->count);
+ return fc;
+}
+EXPORT_SYMBOL_GPL(fuse_conn_get);
+
+static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
+{
+ struct fuse_attr attr;
+ memset(&attr, 0, sizeof(attr));
+
+ attr.mode = mode;
+ attr.ino = FUSE_ROOT_ID;
+ attr.nlink = 1;
+ return fuse_iget(sb, 1, 0, &attr, 0, 0);
+}
+
+struct fuse_inode_handle {
+ u64 nodeid;
+ u32 generation;
+};
+
+static struct dentry *fuse_get_dentry(struct super_block *sb,
+ struct fuse_inode_handle *handle)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct inode *inode;
+ struct dentry *entry;
+ int err = -ESTALE;
+
+ if (handle->nodeid == 0)
+ goto out_err;
+
+ inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid);
+ if (!inode) {
+ struct fuse_entry_out outarg;
+ const struct qstr name = QSTR_INIT(".", 1);
+
+ if (!fc->export_support)
+ goto out_err;
+
+ err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg,
+ &inode);
+ if (err && err != -ENOENT)
+ goto out_err;
+ if (err || !inode) {
+ err = -ESTALE;
+ goto out_err;
+ }
+ err = -EIO;
+ if (get_node_id(inode) != handle->nodeid)
+ goto out_iput;
+ }
+ err = -ESTALE;
+ if (inode->i_generation != handle->generation)
+ goto out_iput;
+
+ entry = d_obtain_alias(inode);
+ if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID)
+ fuse_invalidate_entry_cache(entry);
+
+ return entry;
+
+ out_iput:
+ iput(inode);
+ out_err:
+ return ERR_PTR(err);
+}
+
+static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
+{
+ int len = parent ? 6 : 3;
+ u64 nodeid;
+ u32 generation;
+
+ if (*max_len < len) {
+ *max_len = len;
+ return FILEID_INVALID;
+ }
+
+ nodeid = get_fuse_inode(inode)->nodeid;
+ generation = inode->i_generation;
+
+ fh[0] = (u32)(nodeid >> 32);
+ fh[1] = (u32)(nodeid & 0xffffffff);
+ fh[2] = generation;
+
+ if (parent) {
+ nodeid = get_fuse_inode(parent)->nodeid;
+ generation = parent->i_generation;
+
+ fh[3] = (u32)(nodeid >> 32);
+ fh[4] = (u32)(nodeid & 0xffffffff);
+ fh[5] = generation;
+ }
+
+ *max_len = len;
+ return parent ? 0x82 : 0x81;
+}
+
+static struct dentry *fuse_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ struct fuse_inode_handle handle;
+
+ if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3)
+ return NULL;
+
+ handle.nodeid = (u64) fid->raw[0] << 32;
+ handle.nodeid |= (u64) fid->raw[1];
+ handle.generation = fid->raw[2];
+ return fuse_get_dentry(sb, &handle);
+}
+
+static struct dentry *fuse_fh_to_parent(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ struct fuse_inode_handle parent;
+
+ if (fh_type != 0x82 || fh_len < 6)
+ return NULL;
+
+ parent.nodeid = (u64) fid->raw[3] << 32;
+ parent.nodeid |= (u64) fid->raw[4];
+ parent.generation = fid->raw[5];
+ return fuse_get_dentry(sb, &parent);
+}
+
+static struct dentry *fuse_get_parent(struct dentry *child)
+{
+ struct inode *child_inode = d_inode(child);
+ struct fuse_conn *fc = get_fuse_conn(child_inode);
+ struct inode *inode;
+ struct dentry *parent;
+ struct fuse_entry_out outarg;
+ int err;
+
+ if (!fc->export_support)
+ return ERR_PTR(-ESTALE);
+
+ err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode),
+ &dotdot_name, &outarg, &inode);
+ if (err) {
+ if (err == -ENOENT)
+ return ERR_PTR(-ESTALE);
+ return ERR_PTR(err);
+ }
+
+ parent = d_obtain_alias(inode);
+ if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID)
+ fuse_invalidate_entry_cache(parent);
+
+ return parent;
+}
+
+static const struct export_operations fuse_export_operations = {
+ .fh_to_dentry = fuse_fh_to_dentry,
+ .fh_to_parent = fuse_fh_to_parent,
+ .encode_fh = fuse_encode_fh,
+ .get_parent = fuse_get_parent,
+};
+
+static const struct super_operations fuse_super_operations = {
+ .alloc_inode = fuse_alloc_inode,
+ .free_inode = fuse_free_inode,
+ .evict_inode = fuse_evict_inode,
+ .write_inode = fuse_write_inode,
+ .drop_inode = generic_delete_inode,
+ .umount_begin = fuse_umount_begin,
+ .statfs = fuse_statfs,
+ .sync_fs = fuse_sync_fs,
+ .show_options = fuse_show_options,
+};
+
+static void sanitize_global_limit(unsigned *limit)
+{
+ /*
+ * The default maximum number of async requests is calculated to consume
+ * 1/2^13 of the total memory, assuming 392 bytes per request.
+ */
+ if (*limit == 0)
+ *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / 392;
+
+ if (*limit >= 1 << 16)
+ *limit = (1 << 16) - 1;
+}
+
+static int set_global_limit(const char *val, const struct kernel_param *kp)
+{
+ int rv;
+
+ rv = param_set_uint(val, kp);
+ if (rv)
+ return rv;
+
+ sanitize_global_limit((unsigned *)kp->arg);
+
+ return 0;
+}
+
+static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
+{
+ int cap_sys_admin = capable(CAP_SYS_ADMIN);
+
+ if (arg->minor < 13)
+ return;
+
+ sanitize_global_limit(&max_user_bgreq);
+ sanitize_global_limit(&max_user_congthresh);
+
+ spin_lock(&fc->bg_lock);
+ if (arg->max_background) {
+ fc->max_background = arg->max_background;
+
+ if (!cap_sys_admin && fc->max_background > max_user_bgreq)
+ fc->max_background = max_user_bgreq;
+ }
+ if (arg->congestion_threshold) {
+ fc->congestion_threshold = arg->congestion_threshold;
+
+ if (!cap_sys_admin &&
+ fc->congestion_threshold > max_user_congthresh)
+ fc->congestion_threshold = max_user_congthresh;
+ }
+ spin_unlock(&fc->bg_lock);
+}
+
+struct fuse_init_args {
+ struct fuse_args args;
+ struct fuse_init_in in;
+ struct fuse_init_out out;
+};
+
+static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
+ int error)
+{
+ struct fuse_conn *fc = fm->fc;
+ struct fuse_init_args *ia = container_of(args, typeof(*ia), args);
+ struct fuse_init_out *arg = &ia->out;
+ bool ok = true;
+
+ if (error || arg->major != FUSE_KERNEL_VERSION)
+ ok = false;
+ else {
+ unsigned long ra_pages;
+
+ process_init_limits(fc, arg);
+
+ if (arg->minor >= 6) {
+ u64 flags = arg->flags;
+
+ if (flags & FUSE_INIT_EXT)
+ flags |= (u64) arg->flags2 << 32;
+
+ ra_pages = arg->max_readahead / PAGE_SIZE;
+ if (flags & FUSE_ASYNC_READ)
+ fc->async_read = 1;
+ if (!(flags & FUSE_POSIX_LOCKS))
+ fc->no_lock = 1;
+ if (arg->minor >= 17) {
+ if (!(flags & FUSE_FLOCK_LOCKS))
+ fc->no_flock = 1;
+ } else {
+ if (!(flags & FUSE_POSIX_LOCKS))
+ fc->no_flock = 1;
+ }
+ if (flags & FUSE_ATOMIC_O_TRUNC)
+ fc->atomic_o_trunc = 1;
+ if (arg->minor >= 9) {
+ /* LOOKUP has dependency on proto version */
+ if (flags & FUSE_EXPORT_SUPPORT)
+ fc->export_support = 1;
+ }
+ if (flags & FUSE_BIG_WRITES)
+ fc->big_writes = 1;
+ if (flags & FUSE_DONT_MASK)
+ fc->dont_mask = 1;
+ if (flags & FUSE_AUTO_INVAL_DATA)
+ fc->auto_inval_data = 1;
+ else if (flags & FUSE_EXPLICIT_INVAL_DATA)
+ fc->explicit_inval_data = 1;
+ if (flags & FUSE_DO_READDIRPLUS) {
+ fc->do_readdirplus = 1;
+ if (flags & FUSE_READDIRPLUS_AUTO)
+ fc->readdirplus_auto = 1;
+ }
+ if (flags & FUSE_ASYNC_DIO)
+ fc->async_dio = 1;
+ if (flags & FUSE_WRITEBACK_CACHE)
+ fc->writeback_cache = 1;
+ if (flags & FUSE_PARALLEL_DIROPS)
+ fc->parallel_dirops = 1;
+ if (flags & FUSE_HANDLE_KILLPRIV)
+ fc->handle_killpriv = 1;
+ if (arg->time_gran && arg->time_gran <= 1000000000)
+ fm->sb->s_time_gran = arg->time_gran;
+ if ((flags & FUSE_POSIX_ACL)) {
+ fc->default_permissions = 1;
+ fc->posix_acl = 1;
+ fm->sb->s_xattr = fuse_acl_xattr_handlers;
+ }
+ if (flags & FUSE_CACHE_SYMLINKS)
+ fc->cache_symlinks = 1;
+ if (flags & FUSE_ABORT_ERROR)
+ fc->abort_err = 1;
+ if (flags & FUSE_MAX_PAGES) {
+ fc->max_pages =
+ min_t(unsigned int, fc->max_pages_limit,
+ max_t(unsigned int, arg->max_pages, 1));
+ }
+ if (IS_ENABLED(CONFIG_FUSE_DAX)) {
+ if (flags & FUSE_MAP_ALIGNMENT &&
+ !fuse_dax_check_alignment(fc, arg->map_alignment)) {
+ ok = false;
+ }
+ if (flags & FUSE_HAS_INODE_DAX)
+ fc->inode_dax = 1;
+ }
+ if (flags & FUSE_HANDLE_KILLPRIV_V2) {
+ fc->handle_killpriv_v2 = 1;
+ fm->sb->s_flags |= SB_NOSEC;
+ }
+ if (flags & FUSE_SETXATTR_EXT)
+ fc->setxattr_ext = 1;
+ if (flags & FUSE_SECURITY_CTX)
+ fc->init_security = 1;
+ } else {
+ ra_pages = fc->max_read / PAGE_SIZE;
+ fc->no_lock = 1;
+ fc->no_flock = 1;
+ }
+
+ fm->sb->s_bdi->ra_pages =
+ min(fm->sb->s_bdi->ra_pages, ra_pages);
+ fc->minor = arg->minor;
+ fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
+ fc->max_write = max_t(unsigned, 4096, fc->max_write);
+ fc->conn_init = 1;
+ }
+ kfree(ia);
+
+ if (!ok) {
+ fc->conn_init = 0;
+ fc->conn_error = 1;
+ }
+
+ fuse_set_initialized(fc);
+ wake_up_all(&fc->blocked_waitq);
+}
+
+void fuse_send_init(struct fuse_mount *fm)
+{
+ struct fuse_init_args *ia;
+ u64 flags;
+
+ ia = kzalloc(sizeof(*ia), GFP_KERNEL | __GFP_NOFAIL);
+
+ ia->in.major = FUSE_KERNEL_VERSION;
+ ia->in.minor = FUSE_KERNEL_MINOR_VERSION;
+ ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE;
+ flags =
+ FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
+ FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
+ FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
+ FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
+ FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
+ FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
+ FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
+ FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
+ FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
+ FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
+ FUSE_SECURITY_CTX;
+#ifdef CONFIG_FUSE_DAX
+ if (fm->fc->dax)
+ flags |= FUSE_MAP_ALIGNMENT;
+ if (fuse_is_inode_dax_mode(fm->fc->dax_mode))
+ flags |= FUSE_HAS_INODE_DAX;
+#endif
+ if (fm->fc->auto_submounts)
+ flags |= FUSE_SUBMOUNTS;
+
+ ia->in.flags = flags;
+ ia->in.flags2 = flags >> 32;
+
+ ia->args.opcode = FUSE_INIT;
+ ia->args.in_numargs = 1;
+ ia->args.in_args[0].size = sizeof(ia->in);
+ ia->args.in_args[0].value = &ia->in;
+ ia->args.out_numargs = 1;
+ /* Variable length argument used for backward compatibility
+ with interface version < 7.5. Rest of init_out is zeroed
+ by do_get_request(), so a short reply is not a problem */
+ ia->args.out_argvar = true;
+ ia->args.out_args[0].size = sizeof(ia->out);
+ ia->args.out_args[0].value = &ia->out;
+ ia->args.force = true;
+ ia->args.nocreds = true;
+ ia->args.end = process_init_reply;
+
+ if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0)
+ process_init_reply(fm, &ia->args, -ENOTCONN);
+}
+EXPORT_SYMBOL_GPL(fuse_send_init);
+
+void fuse_free_conn(struct fuse_conn *fc)
+{
+ WARN_ON(!list_empty(&fc->devices));
+ kfree_rcu(fc, rcu);
+}
+EXPORT_SYMBOL_GPL(fuse_free_conn);
+
+static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
+{
+ int err;
+ char *suffix = "";
+
+ if (sb->s_bdev) {
+ suffix = "-fuseblk";
+ /*
+ * sb->s_bdi points to blkdev's bdi however we want to redirect
+ * it to our private bdi...
+ */
+ bdi_put(sb->s_bdi);
+ sb->s_bdi = &noop_backing_dev_info;
+ }
+ err = super_setup_bdi_name(sb, "%u:%u%s", MAJOR(fc->dev),
+ MINOR(fc->dev), suffix);
+ if (err)
+ return err;
+
+ /* fuse does it's own writeback accounting */
+ sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT;
+ sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT;
+
+ /*
+ * For a single fuse filesystem use max 1% of dirty +
+ * writeback threshold.
+ *
+ * This gives about 1M of write buffer for memory maps on a
+ * machine with 1G and 10% dirty_ratio, which should be more
+ * than enough.
+ *
+ * Privileged users can raise it by writing to
+ *
+ * /sys/class/bdi/<bdi>/max_ratio
+ */
+ bdi_set_max_ratio(sb->s_bdi, 1);
+
+ return 0;
+}
+
+struct fuse_dev *fuse_dev_alloc(void)
+{
+ struct fuse_dev *fud;
+ struct list_head *pq;
+
+ fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL);
+ if (!fud)
+ return NULL;
+
+ pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL);
+ if (!pq) {
+ kfree(fud);
+ return NULL;
+ }
+
+ fud->pq.processing = pq;
+ fuse_pqueue_init(&fud->pq);
+
+ return fud;
+}
+EXPORT_SYMBOL_GPL(fuse_dev_alloc);
+
+void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc)
+{
+ fud->fc = fuse_conn_get(fc);
+ spin_lock(&fc->lock);
+ list_add_tail(&fud->entry, &fc->devices);
+ spin_unlock(&fc->lock);
+}
+EXPORT_SYMBOL_GPL(fuse_dev_install);
+
+struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc)
+{
+ struct fuse_dev *fud;
+
+ fud = fuse_dev_alloc();
+ if (!fud)
+ return NULL;
+
+ fuse_dev_install(fud, fc);
+ return fud;
+}
+EXPORT_SYMBOL_GPL(fuse_dev_alloc_install);
+
+void fuse_dev_free(struct fuse_dev *fud)
+{
+ struct fuse_conn *fc = fud->fc;
+
+ if (fc) {
+ spin_lock(&fc->lock);
+ list_del(&fud->entry);
+ spin_unlock(&fc->lock);
+
+ fuse_conn_put(fc);
+ }
+ kfree(fud->pq.processing);
+ kfree(fud);
+}
+EXPORT_SYMBOL_GPL(fuse_dev_free);
+
+static void fuse_fill_attr_from_inode(struct fuse_attr *attr,
+ const struct fuse_inode *fi)
+{
+ *attr = (struct fuse_attr){
+ .ino = fi->inode.i_ino,
+ .size = fi->inode.i_size,
+ .blocks = fi->inode.i_blocks,
+ .atime = fi->inode.i_atime.tv_sec,
+ .mtime = fi->inode.i_mtime.tv_sec,
+ .ctime = fi->inode.i_ctime.tv_sec,
+ .atimensec = fi->inode.i_atime.tv_nsec,
+ .mtimensec = fi->inode.i_mtime.tv_nsec,
+ .ctimensec = fi->inode.i_ctime.tv_nsec,
+ .mode = fi->inode.i_mode,
+ .nlink = fi->inode.i_nlink,
+ .uid = fi->inode.i_uid.val,
+ .gid = fi->inode.i_gid.val,
+ .rdev = fi->inode.i_rdev,
+ .blksize = 1u << fi->inode.i_blkbits,
+ };
+}
+
+static void fuse_sb_defaults(struct super_block *sb)
+{
+ sb->s_magic = FUSE_SUPER_MAGIC;
+ sb->s_op = &fuse_super_operations;
+ sb->s_xattr = fuse_xattr_handlers;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_time_gran = 1;
+ sb->s_export_op = &fuse_export_operations;
+ sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE;
+ if (sb->s_user_ns != &init_user_ns)
+ sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
+ sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
+
+ /*
+ * If we are not in the initial user namespace posix
+ * acls must be translated.
+ */
+ if (sb->s_user_ns != &init_user_ns)
+ sb->s_xattr = fuse_no_acl_xattr_handlers;
+}
+
+static int fuse_fill_super_submount(struct super_block *sb,
+ struct fuse_inode *parent_fi)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct super_block *parent_sb = parent_fi->inode.i_sb;
+ struct fuse_attr root_attr;
+ struct inode *root;
+ struct fuse_submount_lookup *sl;
+ struct fuse_inode *fi;
+
+ fuse_sb_defaults(sb);
+ fm->sb = sb;
+
+ WARN_ON(sb->s_bdi != &noop_backing_dev_info);
+ sb->s_bdi = bdi_get(parent_sb->s_bdi);
+
+ sb->s_xattr = parent_sb->s_xattr;
+ sb->s_time_gran = parent_sb->s_time_gran;
+ sb->s_blocksize = parent_sb->s_blocksize;
+ sb->s_blocksize_bits = parent_sb->s_blocksize_bits;
+ sb->s_subtype = kstrdup(parent_sb->s_subtype, GFP_KERNEL);
+ if (parent_sb->s_subtype && !sb->s_subtype)
+ return -ENOMEM;
+
+ fuse_fill_attr_from_inode(&root_attr, parent_fi);
+ root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0);
+ /*
+ * This inode is just a duplicate, so it is not looked up and
+ * its nlookup should not be incremented. fuse_iget() does
+ * that, though, so undo it here.
+ */
+ fi = get_fuse_inode(root);
+ fi->nlookup--;
+
+ sb->s_d_op = &fuse_dentry_operations;
+ sb->s_root = d_make_root(root);
+ if (!sb->s_root)
+ return -ENOMEM;
+
+ /*
+ * Grab the parent's submount_lookup pointer and take a
+ * reference on the shared nlookup from the parent. This is to
+ * prevent the last forget for this nodeid from getting
+ * triggered until all users have finished with it.
+ */
+ sl = parent_fi->submount_lookup;
+ WARN_ON(!sl);
+ if (sl) {
+ refcount_inc(&sl->count);
+ fi->submount_lookup = sl;
+ }
+
+ return 0;
+}
+
+/* Filesystem context private data holds the FUSE inode of the mount point */
+static int fuse_get_tree_submount(struct fs_context *fsc)
+{
+ struct fuse_mount *fm;
+ struct fuse_inode *mp_fi = fsc->fs_private;
+ struct fuse_conn *fc = get_fuse_conn(&mp_fi->inode);
+ struct super_block *sb;
+ int err;
+
+ fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL);
+ if (!fm)
+ return -ENOMEM;
+
+ fm->fc = fuse_conn_get(fc);
+ fsc->s_fs_info = fm;
+ sb = sget_fc(fsc, NULL, set_anon_super_fc);
+ if (fsc->s_fs_info)
+ fuse_mount_destroy(fm);
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
+
+ /* Initialize superblock, making @mp_fi its root */
+ err = fuse_fill_super_submount(sb, mp_fi);
+ if (err) {
+ deactivate_locked_super(sb);
+ return err;
+ }
+
+ down_write(&fc->killsb);
+ list_add_tail(&fm->fc_entry, &fc->mounts);
+ up_write(&fc->killsb);
+
+ sb->s_flags |= SB_ACTIVE;
+ fsc->root = dget(sb->s_root);
+
+ return 0;
+}
+
+static const struct fs_context_operations fuse_context_submount_ops = {
+ .get_tree = fuse_get_tree_submount,
+};
+
+int fuse_init_fs_context_submount(struct fs_context *fsc)
+{
+ fsc->ops = &fuse_context_submount_ops;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fuse_init_fs_context_submount);
+
+int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
+{
+ struct fuse_dev *fud = NULL;
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct fuse_conn *fc = fm->fc;
+ struct inode *root;
+ struct dentry *root_dentry;
+ int err;
+
+ err = -EINVAL;
+ if (sb->s_flags & SB_MANDLOCK)
+ goto err;
+
+ rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc());
+ fuse_sb_defaults(sb);
+
+ if (ctx->is_bdev) {
+#ifdef CONFIG_BLOCK
+ err = -EINVAL;
+ if (!sb_set_blocksize(sb, ctx->blksize))
+ goto err;
+#endif
+ } else {
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
+ }
+
+ sb->s_subtype = ctx->subtype;
+ ctx->subtype = NULL;
+ if (IS_ENABLED(CONFIG_FUSE_DAX)) {
+ err = fuse_dax_conn_alloc(fc, ctx->dax_mode, ctx->dax_dev);
+ if (err)
+ goto err;
+ }
+
+ if (ctx->fudptr) {
+ err = -ENOMEM;
+ fud = fuse_dev_alloc_install(fc);
+ if (!fud)
+ goto err_free_dax;
+ }
+
+ fc->dev = sb->s_dev;
+ fm->sb = sb;
+ err = fuse_bdi_init(fc, sb);
+ if (err)
+ goto err_dev_free;
+
+ /* Handle umasking inside the fuse code */
+ if (sb->s_flags & SB_POSIXACL)
+ fc->dont_mask = 1;
+ sb->s_flags |= SB_POSIXACL;
+
+ fc->default_permissions = ctx->default_permissions;
+ fc->allow_other = ctx->allow_other;
+ fc->user_id = ctx->user_id;
+ fc->group_id = ctx->group_id;
+ fc->legacy_opts_show = ctx->legacy_opts_show;
+ fc->max_read = max_t(unsigned int, 4096, ctx->max_read);
+ fc->destroy = ctx->destroy;
+ fc->no_control = ctx->no_control;
+ fc->no_force_umount = ctx->no_force_umount;
+
+ err = -ENOMEM;
+ root = fuse_get_root_inode(sb, ctx->rootmode);
+ sb->s_d_op = &fuse_root_dentry_operations;
+ root_dentry = d_make_root(root);
+ if (!root_dentry)
+ goto err_dev_free;
+ /* Root dentry doesn't have .d_revalidate */
+ sb->s_d_op = &fuse_dentry_operations;
+
+ mutex_lock(&fuse_mutex);
+ err = -EINVAL;
+ if (ctx->fudptr && *ctx->fudptr)
+ goto err_unlock;
+
+ err = fuse_ctl_add_conn(fc);
+ if (err)
+ goto err_unlock;
+
+ list_add_tail(&fc->entry, &fuse_conn_list);
+ sb->s_root = root_dentry;
+ if (ctx->fudptr)
+ *ctx->fudptr = fud;
+ mutex_unlock(&fuse_mutex);
+ return 0;
+
+ err_unlock:
+ mutex_unlock(&fuse_mutex);
+ dput(root_dentry);
+ err_dev_free:
+ if (fud)
+ fuse_dev_free(fud);
+ err_free_dax:
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_conn_free(fc);
+ err:
+ return err;
+}
+EXPORT_SYMBOL_GPL(fuse_fill_super_common);
+
+static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
+{
+ struct fuse_fs_context *ctx = fsc->fs_private;
+ int err;
+
+ if (!ctx->file || !ctx->rootmode_present ||
+ !ctx->user_id_present || !ctx->group_id_present)
+ return -EINVAL;
+
+ /*
+ * Require mount to happen from the same user namespace which
+ * opened /dev/fuse to prevent potential attacks.
+ */
+ if ((ctx->file->f_op != &fuse_dev_operations) ||
+ (ctx->file->f_cred->user_ns != sb->s_user_ns))
+ return -EINVAL;
+ ctx->fudptr = &ctx->file->private_data;
+
+ err = fuse_fill_super_common(sb, ctx);
+ if (err)
+ return err;
+ /* file->private_data shall be visible on all CPUs after this */
+ smp_mb();
+ fuse_send_init(get_fuse_mount_super(sb));
+ return 0;
+}
+
+/*
+ * This is the path where user supplied an already initialized fuse dev. In
+ * this case never create a new super if the old one is gone.
+ */
+static int fuse_set_no_super(struct super_block *sb, struct fs_context *fsc)
+{
+ return -ENOTCONN;
+}
+
+static int fuse_test_super(struct super_block *sb, struct fs_context *fsc)
+{
+
+ return fsc->sget_key == get_fuse_conn_super(sb);
+}
+
+static int fuse_get_tree(struct fs_context *fsc)
+{
+ struct fuse_fs_context *ctx = fsc->fs_private;
+ struct fuse_dev *fud;
+ struct fuse_conn *fc;
+ struct fuse_mount *fm;
+ struct super_block *sb;
+ int err;
+
+ fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+ if (!fc)
+ return -ENOMEM;
+
+ fm = kzalloc(sizeof(*fm), GFP_KERNEL);
+ if (!fm) {
+ kfree(fc);
+ return -ENOMEM;
+ }
+
+ fuse_conn_init(fc, fm, fsc->user_ns, &fuse_dev_fiq_ops, NULL);
+ fc->release = fuse_free_conn;
+
+ fsc->s_fs_info = fm;
+
+ if (ctx->fd_present)
+ ctx->file = fget(ctx->fd);
+
+ if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) {
+ err = get_tree_bdev(fsc, fuse_fill_super);
+ goto out;
+ }
+ /*
+ * While block dev mount can be initialized with a dummy device fd
+ * (found by device name), normal fuse mounts can't
+ */
+ err = -EINVAL;
+ if (!ctx->file)
+ goto out;
+
+ /*
+ * Allow creating a fuse mount with an already initialized fuse
+ * connection
+ */
+ fud = READ_ONCE(ctx->file->private_data);
+ if (ctx->file->f_op == &fuse_dev_operations && fud) {
+ fsc->sget_key = fud->fc;
+ sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super);
+ err = PTR_ERR_OR_ZERO(sb);
+ if (!IS_ERR(sb))
+ fsc->root = dget(sb->s_root);
+ } else {
+ err = get_tree_nodev(fsc, fuse_fill_super);
+ }
+out:
+ if (fsc->s_fs_info)
+ fuse_mount_destroy(fm);
+ if (ctx->file)
+ fput(ctx->file);
+ return err;
+}
+
+static const struct fs_context_operations fuse_context_ops = {
+ .free = fuse_free_fsc,
+ .parse_param = fuse_parse_param,
+ .reconfigure = fuse_reconfigure,
+ .get_tree = fuse_get_tree,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int fuse_init_fs_context(struct fs_context *fsc)
+{
+ struct fuse_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->max_read = ~0;
+ ctx->blksize = FUSE_DEFAULT_BLKSIZE;
+ ctx->legacy_opts_show = true;
+
+#ifdef CONFIG_BLOCK
+ if (fsc->fs_type == &fuseblk_fs_type) {
+ ctx->is_bdev = true;
+ ctx->destroy = true;
+ }
+#endif
+
+ fsc->fs_private = ctx;
+ fsc->ops = &fuse_context_ops;
+ return 0;
+}
+
+bool fuse_mount_remove(struct fuse_mount *fm)
+{
+ struct fuse_conn *fc = fm->fc;
+ bool last = false;
+
+ down_write(&fc->killsb);
+ list_del_init(&fm->fc_entry);
+ if (list_empty(&fc->mounts))
+ last = true;
+ up_write(&fc->killsb);
+
+ return last;
+}
+EXPORT_SYMBOL_GPL(fuse_mount_remove);
+
+void fuse_conn_destroy(struct fuse_mount *fm)
+{
+ struct fuse_conn *fc = fm->fc;
+
+ if (fc->destroy)
+ fuse_send_destroy(fm);
+
+ fuse_abort_conn(fc);
+ fuse_wait_aborted(fc);
+
+ if (!list_empty(&fc->entry)) {
+ mutex_lock(&fuse_mutex);
+ list_del(&fc->entry);
+ fuse_ctl_remove_conn(fc);
+ mutex_unlock(&fuse_mutex);
+ }
+}
+EXPORT_SYMBOL_GPL(fuse_conn_destroy);
+
+static void fuse_sb_destroy(struct super_block *sb)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ bool last;
+
+ if (sb->s_root) {
+ last = fuse_mount_remove(fm);
+ if (last)
+ fuse_conn_destroy(fm);
+ }
+}
+
+void fuse_mount_destroy(struct fuse_mount *fm)
+{
+ fuse_conn_put(fm->fc);
+ kfree(fm);
+}
+EXPORT_SYMBOL(fuse_mount_destroy);
+
+static void fuse_kill_sb_anon(struct super_block *sb)
+{
+ fuse_sb_destroy(sb);
+ kill_anon_super(sb);
+ fuse_mount_destroy(get_fuse_mount_super(sb));
+}
+
+static struct file_system_type fuse_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "fuse",
+ .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT,
+ .init_fs_context = fuse_init_fs_context,
+ .parameters = fuse_fs_parameters,
+ .kill_sb = fuse_kill_sb_anon,
+};
+MODULE_ALIAS_FS("fuse");
+
+#ifdef CONFIG_BLOCK
+static void fuse_kill_sb_blk(struct super_block *sb)
+{
+ fuse_sb_destroy(sb);
+ kill_block_super(sb);
+ fuse_mount_destroy(get_fuse_mount_super(sb));
+}
+
+static struct file_system_type fuseblk_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "fuseblk",
+ .init_fs_context = fuse_init_fs_context,
+ .parameters = fuse_fs_parameters,
+ .kill_sb = fuse_kill_sb_blk,
+ .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
+};
+MODULE_ALIAS_FS("fuseblk");
+
+static inline int register_fuseblk(void)
+{
+ return register_filesystem(&fuseblk_fs_type);
+}
+
+static inline void unregister_fuseblk(void)
+{
+ unregister_filesystem(&fuseblk_fs_type);
+}
+#else
+static inline int register_fuseblk(void)
+{
+ return 0;
+}
+
+static inline void unregister_fuseblk(void)
+{
+}
+#endif
+
+static void fuse_inode_init_once(void *foo)
+{
+ struct inode *inode = foo;
+
+ inode_init_once(inode);
+}
+
+static int __init fuse_fs_init(void)
+{
+ int err;
+
+ fuse_inode_cachep = kmem_cache_create("fuse_inode",
+ sizeof(struct fuse_inode), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT,
+ fuse_inode_init_once);
+ err = -ENOMEM;
+ if (!fuse_inode_cachep)
+ goto out;
+
+ err = register_fuseblk();
+ if (err)
+ goto out2;
+
+ err = register_filesystem(&fuse_fs_type);
+ if (err)
+ goto out3;
+
+ return 0;
+
+ out3:
+ unregister_fuseblk();
+ out2:
+ kmem_cache_destroy(fuse_inode_cachep);
+ out:
+ return err;
+}
+
+static void fuse_fs_cleanup(void)
+{
+ unregister_filesystem(&fuse_fs_type);
+ unregister_fuseblk();
+
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
+ kmem_cache_destroy(fuse_inode_cachep);
+}
+
+static struct kobject *fuse_kobj;
+
+static int fuse_sysfs_init(void)
+{
+ int err;
+
+ fuse_kobj = kobject_create_and_add("fuse", fs_kobj);
+ if (!fuse_kobj) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ err = sysfs_create_mount_point(fuse_kobj, "connections");
+ if (err)
+ goto out_fuse_unregister;
+
+ return 0;
+
+ out_fuse_unregister:
+ kobject_put(fuse_kobj);
+ out_err:
+ return err;
+}
+
+static void fuse_sysfs_cleanup(void)
+{
+ sysfs_remove_mount_point(fuse_kobj, "connections");
+ kobject_put(fuse_kobj);
+}
+
+static int __init fuse_init(void)
+{
+ int res;
+
+ pr_info("init (API version %i.%i)\n",
+ FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
+
+ INIT_LIST_HEAD(&fuse_conn_list);
+ res = fuse_fs_init();
+ if (res)
+ goto err;
+
+ res = fuse_dev_init();
+ if (res)
+ goto err_fs_cleanup;
+
+ res = fuse_sysfs_init();
+ if (res)
+ goto err_dev_cleanup;
+
+ res = fuse_ctl_init();
+ if (res)
+ goto err_sysfs_cleanup;
+
+ sanitize_global_limit(&max_user_bgreq);
+ sanitize_global_limit(&max_user_congthresh);
+
+ return 0;
+
+ err_sysfs_cleanup:
+ fuse_sysfs_cleanup();
+ err_dev_cleanup:
+ fuse_dev_cleanup();
+ err_fs_cleanup:
+ fuse_fs_cleanup();
+ err:
+ return res;
+}
+
+static void __exit fuse_exit(void)
+{
+ pr_debug("exit\n");
+
+ fuse_ctl_cleanup();
+ fuse_sysfs_cleanup();
+ fuse_fs_cleanup();
+ fuse_dev_cleanup();
+}
+
+module_init(fuse_init);
+module_exit(fuse_exit);
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
new file mode 100644
index 000000000..5a6c715b9
--- /dev/null
+++ b/fs/fuse/ioctl.c
@@ -0,0 +1,515 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017 Red Hat, Inc.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/uio.h>
+#include <linux/compat.h>
+#include <linux/fileattr.h>
+
+static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args,
+ struct fuse_ioctl_out *outarg)
+{
+ ssize_t ret;
+
+ args->out_args[0].size = sizeof(*outarg);
+ args->out_args[0].value = outarg;
+
+ ret = fuse_simple_request(fm, args);
+
+ /* Translate ENOSYS, which shouldn't be returned from fs */
+ if (ret == -ENOSYS)
+ ret = -ENOTTY;
+
+ if (ret >= 0 && outarg->result == -ENOSYS)
+ outarg->result = -ENOTTY;
+
+ return ret;
+}
+
+/*
+ * CUSE servers compiled on 32bit broke on 64bit kernels because the
+ * ABI was defined to be 'struct iovec' which is different on 32bit
+ * and 64bit. Fortunately we can determine which structure the server
+ * used from the size of the reply.
+ */
+static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
+ size_t transferred, unsigned count,
+ bool is_compat)
+{
+#ifdef CONFIG_COMPAT
+ if (count * sizeof(struct compat_iovec) == transferred) {
+ struct compat_iovec *ciov = src;
+ unsigned i;
+
+ /*
+ * With this interface a 32bit server cannot support
+ * non-compat (i.e. ones coming from 64bit apps) ioctl
+ * requests
+ */
+ if (!is_compat)
+ return -EINVAL;
+
+ for (i = 0; i < count; i++) {
+ dst[i].iov_base = compat_ptr(ciov[i].iov_base);
+ dst[i].iov_len = ciov[i].iov_len;
+ }
+ return 0;
+ }
+#endif
+
+ if (count * sizeof(struct iovec) != transferred)
+ return -EIO;
+
+ memcpy(dst, src, transferred);
+ return 0;
+}
+
+/* Make sure iov_length() won't overflow */
+static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov,
+ size_t count)
+{
+ size_t n;
+ u32 max = fc->max_pages << PAGE_SHIFT;
+
+ for (n = 0; n < count; n++, iov++) {
+ if (iov->iov_len > (size_t) max)
+ return -ENOMEM;
+ max -= iov->iov_len;
+ }
+ return 0;
+}
+
+static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
+ void *src, size_t transferred, unsigned count,
+ bool is_compat)
+{
+ unsigned i;
+ struct fuse_ioctl_iovec *fiov = src;
+
+ if (fc->minor < 16) {
+ return fuse_copy_ioctl_iovec_old(dst, src, transferred,
+ count, is_compat);
+ }
+
+ if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
+ return -EIO;
+
+ for (i = 0; i < count; i++) {
+ /* Did the server supply an inappropriate value? */
+ if (fiov[i].base != (unsigned long) fiov[i].base ||
+ fiov[i].len != (unsigned long) fiov[i].len)
+ return -EIO;
+
+ dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
+ dst[i].iov_len = (size_t) fiov[i].len;
+
+#ifdef CONFIG_COMPAT
+ if (is_compat &&
+ (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
+ (compat_size_t) dst[i].iov_len != fiov[i].len))
+ return -EIO;
+#endif
+ }
+
+ return 0;
+}
+
+
+/*
+ * For ioctls, there is no generic way to determine how much memory
+ * needs to be read and/or written. Furthermore, ioctls are allowed
+ * to dereference the passed pointer, so the parameter requires deep
+ * copying but FUSE has no idea whatsoever about what to copy in or
+ * out.
+ *
+ * This is solved by allowing FUSE server to retry ioctl with
+ * necessary in/out iovecs. Let's assume the ioctl implementation
+ * needs to read in the following structure.
+ *
+ * struct a {
+ * char *buf;
+ * size_t buflen;
+ * }
+ *
+ * On the first callout to FUSE server, inarg->in_size and
+ * inarg->out_size will be NULL; then, the server completes the ioctl
+ * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
+ * the actual iov array to
+ *
+ * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } }
+ *
+ * which tells FUSE to copy in the requested area and retry the ioctl.
+ * On the second round, the server has access to the structure and
+ * from that it can tell what to look for next, so on the invocation,
+ * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
+ *
+ * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) },
+ * { .iov_base = a.buf, .iov_len = a.buflen } }
+ *
+ * FUSE will copy both struct a and the pointed buffer from the
+ * process doing the ioctl and retry ioctl with both struct a and the
+ * buffer.
+ *
+ * This time, FUSE server has everything it needs and completes ioctl
+ * without FUSE_IOCTL_RETRY which finishes the ioctl call.
+ *
+ * Copying data out works the same way.
+ *
+ * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
+ * automatically initializes in and out iovs by decoding @cmd with
+ * _IOC_* macros and the server is not allowed to request RETRY. This
+ * limits ioctl data transfers to well-formed ioctls and is the forced
+ * behavior for all FUSE servers.
+ */
+long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
+ unsigned int flags)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_mount *fm = ff->fm;
+ struct fuse_ioctl_in inarg = {
+ .fh = ff->fh,
+ .cmd = cmd,
+ .arg = arg,
+ .flags = flags
+ };
+ struct fuse_ioctl_out outarg;
+ struct iovec *iov_page = NULL;
+ struct iovec *in_iov = NULL, *out_iov = NULL;
+ unsigned int in_iovs = 0, out_iovs = 0, max_pages;
+ size_t in_size, out_size, c;
+ ssize_t transferred;
+ int err, i;
+ struct iov_iter ii;
+ struct fuse_args_pages ap = {};
+
+#if BITS_PER_LONG == 32
+ inarg.flags |= FUSE_IOCTL_32BIT;
+#else
+ if (flags & FUSE_IOCTL_COMPAT) {
+ inarg.flags |= FUSE_IOCTL_32BIT;
+#ifdef CONFIG_X86_X32_ABI
+ if (in_x32_syscall())
+ inarg.flags |= FUSE_IOCTL_COMPAT_X32;
+#endif
+ }
+#endif
+
+ /* assume all the iovs returned by client always fits in a page */
+ BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
+
+ err = -ENOMEM;
+ ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs);
+ iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
+ if (!ap.pages || !iov_page)
+ goto out;
+
+ fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages);
+
+ /*
+ * If restricted, initialize IO parameters as encoded in @cmd.
+ * RETRY from server is not allowed.
+ */
+ if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
+ struct iovec *iov = iov_page;
+
+ iov->iov_base = (void __user *)arg;
+ iov->iov_len = _IOC_SIZE(cmd);
+
+ if (_IOC_DIR(cmd) & _IOC_WRITE) {
+ in_iov = iov;
+ in_iovs = 1;
+ }
+
+ if (_IOC_DIR(cmd) & _IOC_READ) {
+ out_iov = iov;
+ out_iovs = 1;
+ }
+ }
+
+ retry:
+ inarg.in_size = in_size = iov_length(in_iov, in_iovs);
+ inarg.out_size = out_size = iov_length(out_iov, out_iovs);
+
+ /*
+ * Out data can be used either for actual out data or iovs,
+ * make sure there always is at least one page.
+ */
+ out_size = max_t(size_t, out_size, PAGE_SIZE);
+ max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
+
+ /* make sure there are enough buffer pages and init request with them */
+ err = -ENOMEM;
+ if (max_pages > fm->fc->max_pages)
+ goto out;
+ while (ap.num_pages < max_pages) {
+ ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ if (!ap.pages[ap.num_pages])
+ goto out;
+ ap.num_pages++;
+ }
+
+
+ /* okay, let's send it to the client */
+ ap.args.opcode = FUSE_IOCTL;
+ ap.args.nodeid = ff->nodeid;
+ ap.args.in_numargs = 1;
+ ap.args.in_args[0].size = sizeof(inarg);
+ ap.args.in_args[0].value = &inarg;
+ if (in_size) {
+ ap.args.in_numargs++;
+ ap.args.in_args[1].size = in_size;
+ ap.args.in_pages = true;
+
+ err = -EFAULT;
+ iov_iter_init(&ii, ITER_SOURCE, in_iov, in_iovs, in_size);
+ for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
+ c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
+ if (c != PAGE_SIZE && iov_iter_count(&ii))
+ goto out;
+ }
+ }
+
+ ap.args.out_numargs = 2;
+ ap.args.out_args[1].size = out_size;
+ ap.args.out_pages = true;
+ ap.args.out_argvar = true;
+
+ transferred = fuse_send_ioctl(fm, &ap.args, &outarg);
+ err = transferred;
+ if (transferred < 0)
+ goto out;
+
+ /* did it ask for retry? */
+ if (outarg.flags & FUSE_IOCTL_RETRY) {
+ void *vaddr;
+
+ /* no retry if in restricted mode */
+ err = -EIO;
+ if (!(flags & FUSE_IOCTL_UNRESTRICTED))
+ goto out;
+
+ in_iovs = outarg.in_iovs;
+ out_iovs = outarg.out_iovs;
+
+ /*
+ * Make sure things are in boundary, separate checks
+ * are to protect against overflow.
+ */
+ err = -ENOMEM;
+ if (in_iovs > FUSE_IOCTL_MAX_IOV ||
+ out_iovs > FUSE_IOCTL_MAX_IOV ||
+ in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
+ goto out;
+
+ vaddr = kmap_local_page(ap.pages[0]);
+ err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr,
+ transferred, in_iovs + out_iovs,
+ (flags & FUSE_IOCTL_COMPAT) != 0);
+ kunmap_local(vaddr);
+ if (err)
+ goto out;
+
+ in_iov = iov_page;
+ out_iov = in_iov + in_iovs;
+
+ err = fuse_verify_ioctl_iov(fm->fc, in_iov, in_iovs);
+ if (err)
+ goto out;
+
+ err = fuse_verify_ioctl_iov(fm->fc, out_iov, out_iovs);
+ if (err)
+ goto out;
+
+ goto retry;
+ }
+
+ err = -EIO;
+ if (transferred > inarg.out_size)
+ goto out;
+
+ err = -EFAULT;
+ iov_iter_init(&ii, ITER_DEST, out_iov, out_iovs, transferred);
+ for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
+ c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
+ if (c != PAGE_SIZE && iov_iter_count(&ii))
+ goto out;
+ }
+ err = 0;
+ out:
+ free_page((unsigned long) iov_page);
+ while (ap.num_pages)
+ __free_page(ap.pages[--ap.num_pages]);
+ kfree(ap.pages);
+
+ return err ? err : outarg.result;
+}
+EXPORT_SYMBOL_GPL(fuse_do_ioctl);
+
+long fuse_ioctl_common(struct file *file, unsigned int cmd,
+ unsigned long arg, unsigned int flags)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (!fuse_allow_current_process(fc))
+ return -EACCES;
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ return fuse_do_ioctl(file, cmd, arg, flags);
+}
+
+long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ return fuse_ioctl_common(file, cmd, arg, 0);
+}
+
+long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
+}
+
+static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff,
+ unsigned int cmd, void *ptr, size_t size)
+{
+ struct fuse_mount *fm = ff->fm;
+ struct fuse_ioctl_in inarg;
+ struct fuse_ioctl_out outarg;
+ FUSE_ARGS(args);
+ int err;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.fh = ff->fh;
+ inarg.cmd = cmd;
+
+#if BITS_PER_LONG == 32
+ inarg.flags |= FUSE_IOCTL_32BIT;
+#endif
+ if (S_ISDIR(inode->i_mode))
+ inarg.flags |= FUSE_IOCTL_DIR;
+
+ if (_IOC_DIR(cmd) & _IOC_READ)
+ inarg.out_size = size;
+ if (_IOC_DIR(cmd) & _IOC_WRITE)
+ inarg.in_size = size;
+
+ args.opcode = FUSE_IOCTL;
+ args.nodeid = ff->nodeid;
+ args.in_numargs = 2;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.in_args[1].size = inarg.in_size;
+ args.in_args[1].value = ptr;
+ args.out_numargs = 2;
+ args.out_args[1].size = inarg.out_size;
+ args.out_args[1].value = ptr;
+
+ err = fuse_send_ioctl(fm, &args, &outarg);
+ if (!err) {
+ if (outarg.result < 0)
+ err = outarg.result;
+ else if (outarg.flags & FUSE_IOCTL_RETRY)
+ err = -EIO;
+ }
+ return err;
+}
+
+static struct fuse_file *fuse_priv_ioctl_prepare(struct inode *inode)
+{
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ bool isdir = S_ISDIR(inode->i_mode);
+
+ if (!fuse_allow_current_process(fm->fc))
+ return ERR_PTR(-EACCES);
+
+ if (fuse_is_bad(inode))
+ return ERR_PTR(-EIO);
+
+ if (!S_ISREG(inode->i_mode) && !isdir)
+ return ERR_PTR(-ENOTTY);
+
+ return fuse_file_open(fm, get_node_id(inode), O_RDONLY, isdir);
+}
+
+static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff)
+{
+ fuse_file_release(inode, ff, O_RDONLY, NULL, S_ISDIR(inode->i_mode));
+}
+
+int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct fuse_file *ff;
+ unsigned int flags;
+ struct fsxattr xfa;
+ int err;
+
+ ff = fuse_priv_ioctl_prepare(inode);
+ if (IS_ERR(ff))
+ return PTR_ERR(ff);
+
+ if (fa->flags_valid) {
+ err = fuse_priv_ioctl(inode, ff, FS_IOC_GETFLAGS,
+ &flags, sizeof(flags));
+ if (err)
+ goto cleanup;
+
+ fileattr_fill_flags(fa, flags);
+ } else {
+ err = fuse_priv_ioctl(inode, ff, FS_IOC_FSGETXATTR,
+ &xfa, sizeof(xfa));
+ if (err)
+ goto cleanup;
+
+ fileattr_fill_xflags(fa, xfa.fsx_xflags);
+ fa->fsx_extsize = xfa.fsx_extsize;
+ fa->fsx_nextents = xfa.fsx_nextents;
+ fa->fsx_projid = xfa.fsx_projid;
+ fa->fsx_cowextsize = xfa.fsx_cowextsize;
+ }
+cleanup:
+ fuse_priv_ioctl_cleanup(inode, ff);
+
+ return err;
+}
+
+int fuse_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct fuse_file *ff;
+ unsigned int flags = fa->flags;
+ struct fsxattr xfa;
+ int err;
+
+ ff = fuse_priv_ioctl_prepare(inode);
+ if (IS_ERR(ff))
+ return PTR_ERR(ff);
+
+ if (fa->flags_valid) {
+ err = fuse_priv_ioctl(inode, ff, FS_IOC_SETFLAGS,
+ &flags, sizeof(flags));
+ if (err)
+ goto cleanup;
+ } else {
+ memset(&xfa, 0, sizeof(xfa));
+ xfa.fsx_xflags = fa->fsx_xflags;
+ xfa.fsx_extsize = fa->fsx_extsize;
+ xfa.fsx_nextents = fa->fsx_nextents;
+ xfa.fsx_projid = fa->fsx_projid;
+ xfa.fsx_cowextsize = fa->fsx_cowextsize;
+
+ err = fuse_priv_ioctl(inode, ff, FS_IOC_FSSETXATTR,
+ &xfa, sizeof(xfa));
+ }
+
+cleanup:
+ fuse_priv_ioctl_cleanup(inode, ff);
+
+ return err;
+}
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
new file mode 100644
index 000000000..f40bc51fa
--- /dev/null
+++ b/fs/fuse/readdir.c
@@ -0,0 +1,604 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2018 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+*/
+
+
+#include "fuse_i.h"
+#include <linux/iversion.h>
+#include <linux/posix_acl.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+
+static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
+{
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_inode *fi = get_fuse_inode(dir);
+
+ if (!fc->do_readdirplus)
+ return false;
+ if (!fc->readdirplus_auto)
+ return true;
+ if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
+ return true;
+ if (ctx->pos == 0)
+ return true;
+ return false;
+}
+
+static void fuse_add_dirent_to_cache(struct file *file,
+ struct fuse_dirent *dirent, loff_t pos)
+{
+ struct fuse_inode *fi = get_fuse_inode(file_inode(file));
+ size_t reclen = FUSE_DIRENT_SIZE(dirent);
+ pgoff_t index;
+ struct page *page;
+ loff_t size;
+ u64 version;
+ unsigned int offset;
+ void *addr;
+
+ spin_lock(&fi->rdc.lock);
+ /*
+ * Is cache already completed? Or this entry does not go at the end of
+ * cache?
+ */
+ if (fi->rdc.cached || pos != fi->rdc.pos) {
+ spin_unlock(&fi->rdc.lock);
+ return;
+ }
+ version = fi->rdc.version;
+ size = fi->rdc.size;
+ offset = size & ~PAGE_MASK;
+ index = size >> PAGE_SHIFT;
+ /* Dirent doesn't fit in current page? Jump to next page. */
+ if (offset + reclen > PAGE_SIZE) {
+ index++;
+ offset = 0;
+ }
+ spin_unlock(&fi->rdc.lock);
+
+ if (offset) {
+ page = find_lock_page(file->f_mapping, index);
+ } else {
+ page = find_or_create_page(file->f_mapping, index,
+ mapping_gfp_mask(file->f_mapping));
+ }
+ if (!page)
+ return;
+
+ spin_lock(&fi->rdc.lock);
+ /* Raced with another readdir */
+ if (fi->rdc.version != version || fi->rdc.size != size ||
+ WARN_ON(fi->rdc.pos != pos))
+ goto unlock;
+
+ addr = kmap_local_page(page);
+ if (!offset) {
+ clear_page(addr);
+ SetPageUptodate(page);
+ }
+ memcpy(addr + offset, dirent, reclen);
+ kunmap_local(addr);
+ fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen;
+ fi->rdc.pos = dirent->off;
+unlock:
+ spin_unlock(&fi->rdc.lock);
+ unlock_page(page);
+ put_page(page);
+}
+
+static void fuse_readdir_cache_end(struct file *file, loff_t pos)
+{
+ struct fuse_inode *fi = get_fuse_inode(file_inode(file));
+ loff_t end;
+
+ spin_lock(&fi->rdc.lock);
+ /* does cache end position match current position? */
+ if (fi->rdc.pos != pos) {
+ spin_unlock(&fi->rdc.lock);
+ return;
+ }
+
+ fi->rdc.cached = true;
+ end = ALIGN(fi->rdc.size, PAGE_SIZE);
+ spin_unlock(&fi->rdc.lock);
+
+ /* truncate unused tail of cache */
+ truncate_inode_pages(file->f_mapping, end);
+}
+
+static bool fuse_emit(struct file *file, struct dir_context *ctx,
+ struct fuse_dirent *dirent)
+{
+ struct fuse_file *ff = file->private_data;
+
+ if (ff->open_flags & FOPEN_CACHE_DIR)
+ fuse_add_dirent_to_cache(file, dirent, ctx->pos);
+
+ return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino,
+ dirent->type);
+}
+
+static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
+ struct dir_context *ctx)
+{
+ while (nbytes >= FUSE_NAME_OFFSET) {
+ struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
+ size_t reclen = FUSE_DIRENT_SIZE(dirent);
+ if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+ return -EIO;
+ if (reclen > nbytes)
+ break;
+ if (memchr(dirent->name, '/', dirent->namelen) != NULL)
+ return -EIO;
+
+ if (!fuse_emit(file, ctx, dirent))
+ break;
+
+ buf += reclen;
+ nbytes -= reclen;
+ ctx->pos = dirent->off;
+ }
+
+ return 0;
+}
+
+static int fuse_direntplus_link(struct file *file,
+ struct fuse_direntplus *direntplus,
+ u64 attr_version)
+{
+ struct fuse_entry_out *o = &direntplus->entry_out;
+ struct fuse_dirent *dirent = &direntplus->dirent;
+ struct dentry *parent = file->f_path.dentry;
+ struct qstr name = QSTR_INIT(dirent->name, dirent->namelen);
+ struct dentry *dentry;
+ struct dentry *alias;
+ struct inode *dir = d_inode(parent);
+ struct fuse_conn *fc;
+ struct inode *inode;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+
+ if (!o->nodeid) {
+ /*
+ * Unlike in the case of fuse_lookup, zero nodeid does not mean
+ * ENOENT. Instead, it only means the userspace filesystem did
+ * not want to return attributes/handle for this entry.
+ *
+ * So do nothing.
+ */
+ return 0;
+ }
+
+ if (name.name[0] == '.') {
+ /*
+ * We could potentially refresh the attributes of the directory
+ * and its parent?
+ */
+ if (name.len == 1)
+ return 0;
+ if (name.name[1] == '.' && name.len == 2)
+ return 0;
+ }
+
+ if (invalid_nodeid(o->nodeid))
+ return -EIO;
+ if (fuse_invalid_attr(&o->attr))
+ return -EIO;
+
+ fc = get_fuse_conn(dir);
+
+ name.hash = full_name_hash(parent, name.name, name.len);
+ dentry = d_lookup(parent, &name);
+ if (!dentry) {
+retry:
+ dentry = d_alloc_parallel(parent, &name, &wq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ }
+ if (!d_in_lookup(dentry)) {
+ struct fuse_inode *fi;
+ inode = d_inode(dentry);
+ if (inode && get_node_id(inode) != o->nodeid)
+ inode = NULL;
+ if (!inode ||
+ fuse_stale_inode(inode, o->generation, &o->attr)) {
+ if (inode)
+ fuse_make_bad(inode);
+ d_invalidate(dentry);
+ dput(dentry);
+ goto retry;
+ }
+ if (fuse_is_bad(inode)) {
+ dput(dentry);
+ return -EIO;
+ }
+
+ fi = get_fuse_inode(inode);
+ spin_lock(&fi->lock);
+ fi->nlookup++;
+ spin_unlock(&fi->lock);
+
+ forget_all_cached_acls(inode);
+ fuse_change_attributes(inode, &o->attr,
+ entry_attr_timeout(o),
+ attr_version);
+ /*
+ * The other branch comes via fuse_iget()
+ * which bumps nlookup inside
+ */
+ } else {
+ inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
+ &o->attr, entry_attr_timeout(o),
+ attr_version);
+ if (!inode)
+ inode = ERR_PTR(-ENOMEM);
+
+ alias = d_splice_alias(inode, dentry);
+ d_lookup_done(dentry);
+ if (alias) {
+ dput(dentry);
+ dentry = alias;
+ }
+ if (IS_ERR(dentry)) {
+ if (!IS_ERR(inode)) {
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fi->lock);
+ fi->nlookup--;
+ spin_unlock(&fi->lock);
+ }
+ return PTR_ERR(dentry);
+ }
+ }
+ if (fc->readdirplus_auto)
+ set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
+ fuse_change_entry_timeout(dentry, o);
+
+ dput(dentry);
+ return 0;
+}
+
+static void fuse_force_forget(struct file *file, u64 nodeid)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_forget_in inarg;
+ FUSE_ARGS(args);
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.nlookup = 1;
+ args.opcode = FUSE_FORGET;
+ args.nodeid = nodeid;
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.force = true;
+ args.noreply = true;
+
+ fuse_simple_request(fm, &args);
+ /* ignore errors */
+}
+
+static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
+ struct dir_context *ctx, u64 attr_version)
+{
+ struct fuse_direntplus *direntplus;
+ struct fuse_dirent *dirent;
+ size_t reclen;
+ int over = 0;
+ int ret;
+
+ while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) {
+ direntplus = (struct fuse_direntplus *) buf;
+ dirent = &direntplus->dirent;
+ reclen = FUSE_DIRENTPLUS_SIZE(direntplus);
+
+ if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+ return -EIO;
+ if (reclen > nbytes)
+ break;
+ if (memchr(dirent->name, '/', dirent->namelen) != NULL)
+ return -EIO;
+
+ if (!over) {
+ /* We fill entries into dstbuf only as much as
+ it can hold. But we still continue iterating
+ over remaining entries to link them. If not,
+ we need to send a FORGET for each of those
+ which we did not link.
+ */
+ over = !fuse_emit(file, ctx, dirent);
+ if (!over)
+ ctx->pos = dirent->off;
+ }
+
+ buf += reclen;
+ nbytes -= reclen;
+
+ ret = fuse_direntplus_link(file, direntplus, attr_version);
+ if (ret)
+ fuse_force_forget(file, direntplus->entry_out.nodeid);
+ }
+
+ return 0;
+}
+
+static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
+{
+ int plus;
+ ssize_t res;
+ struct page *page;
+ struct inode *inode = file_inode(file);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_io_args ia = {};
+ struct fuse_args_pages *ap = &ia.ap;
+ struct fuse_page_desc desc = { .length = PAGE_SIZE };
+ u64 attr_version = 0;
+ bool locked;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ plus = fuse_use_readdirplus(inode, ctx);
+ ap->args.out_pages = true;
+ ap->num_pages = 1;
+ ap->pages = &page;
+ ap->descs = &desc;
+ if (plus) {
+ attr_version = fuse_get_attr_version(fm->fc);
+ fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+ FUSE_READDIRPLUS);
+ } else {
+ fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+ FUSE_READDIR);
+ }
+ locked = fuse_lock_inode(inode);
+ res = fuse_simple_request(fm, &ap->args);
+ fuse_unlock_inode(inode, locked);
+ if (res >= 0) {
+ if (!res) {
+ struct fuse_file *ff = file->private_data;
+
+ if (ff->open_flags & FOPEN_CACHE_DIR)
+ fuse_readdir_cache_end(file, ctx->pos);
+ } else if (plus) {
+ res = parse_dirplusfile(page_address(page), res,
+ file, ctx, attr_version);
+ } else {
+ res = parse_dirfile(page_address(page), res, file,
+ ctx);
+ }
+ }
+
+ __free_page(page);
+ fuse_invalidate_atime(inode);
+ return res;
+}
+
+enum fuse_parse_result {
+ FOUND_ERR = -1,
+ FOUND_NONE = 0,
+ FOUND_SOME,
+ FOUND_ALL,
+};
+
+static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff,
+ void *addr, unsigned int size,
+ struct dir_context *ctx)
+{
+ unsigned int offset = ff->readdir.cache_off & ~PAGE_MASK;
+ enum fuse_parse_result res = FOUND_NONE;
+
+ WARN_ON(offset >= size);
+
+ for (;;) {
+ struct fuse_dirent *dirent = addr + offset;
+ unsigned int nbytes = size - offset;
+ size_t reclen;
+
+ if (nbytes < FUSE_NAME_OFFSET || !dirent->namelen)
+ break;
+
+ reclen = FUSE_DIRENT_SIZE(dirent); /* derefs ->namelen */
+
+ if (WARN_ON(dirent->namelen > FUSE_NAME_MAX))
+ return FOUND_ERR;
+ if (WARN_ON(reclen > nbytes))
+ return FOUND_ERR;
+ if (WARN_ON(memchr(dirent->name, '/', dirent->namelen) != NULL))
+ return FOUND_ERR;
+
+ if (ff->readdir.pos == ctx->pos) {
+ res = FOUND_SOME;
+ if (!dir_emit(ctx, dirent->name, dirent->namelen,
+ dirent->ino, dirent->type))
+ return FOUND_ALL;
+ ctx->pos = dirent->off;
+ }
+ ff->readdir.pos = dirent->off;
+ ff->readdir.cache_off += reclen;
+
+ offset += reclen;
+ }
+
+ return res;
+}
+
+static void fuse_rdc_reset(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ fi->rdc.cached = false;
+ fi->rdc.version++;
+ fi->rdc.size = 0;
+ fi->rdc.pos = 0;
+}
+
+#define UNCACHED 1
+
+static int fuse_readdir_cached(struct file *file, struct dir_context *ctx)
+{
+ struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ enum fuse_parse_result res;
+ pgoff_t index;
+ unsigned int size;
+ struct page *page;
+ void *addr;
+
+ /* Seeked? If so, reset the cache stream */
+ if (ff->readdir.pos != ctx->pos) {
+ ff->readdir.pos = 0;
+ ff->readdir.cache_off = 0;
+ }
+
+ /*
+ * We're just about to start reading into the cache or reading the
+ * cache; both cases require an up-to-date mtime value.
+ */
+ if (!ctx->pos && fc->auto_inval_data) {
+ int err = fuse_update_attributes(inode, file, STATX_MTIME);
+
+ if (err)
+ return err;
+ }
+
+retry:
+ spin_lock(&fi->rdc.lock);
+retry_locked:
+ if (!fi->rdc.cached) {
+ /* Starting cache? Set cache mtime. */
+ if (!ctx->pos && !fi->rdc.size) {
+ fi->rdc.mtime = inode->i_mtime;
+ fi->rdc.iversion = inode_query_iversion(inode);
+ }
+ spin_unlock(&fi->rdc.lock);
+ return UNCACHED;
+ }
+ /*
+ * When at the beginning of the directory (i.e. just after opendir(3) or
+ * rewinddir(3)), then need to check whether directory contents have
+ * changed, and reset the cache if so.
+ */
+ if (!ctx->pos) {
+ if (inode_peek_iversion(inode) != fi->rdc.iversion ||
+ !timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) {
+ fuse_rdc_reset(inode);
+ goto retry_locked;
+ }
+ }
+
+ /*
+ * If cache version changed since the last getdents() call, then reset
+ * the cache stream.
+ */
+ if (ff->readdir.version != fi->rdc.version) {
+ ff->readdir.pos = 0;
+ ff->readdir.cache_off = 0;
+ }
+ /*
+ * If at the beginning of the cache, than reset version to
+ * current.
+ */
+ if (ff->readdir.pos == 0)
+ ff->readdir.version = fi->rdc.version;
+
+ WARN_ON(fi->rdc.size < ff->readdir.cache_off);
+
+ index = ff->readdir.cache_off >> PAGE_SHIFT;
+
+ if (index == (fi->rdc.size >> PAGE_SHIFT))
+ size = fi->rdc.size & ~PAGE_MASK;
+ else
+ size = PAGE_SIZE;
+ spin_unlock(&fi->rdc.lock);
+
+ /* EOF? */
+ if ((ff->readdir.cache_off & ~PAGE_MASK) == size)
+ return 0;
+
+ page = find_get_page_flags(file->f_mapping, index,
+ FGP_ACCESSED | FGP_LOCK);
+ /* Page gone missing, then re-added to cache, but not initialized? */
+ if (page && !PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ page = NULL;
+ }
+ spin_lock(&fi->rdc.lock);
+ if (!page) {
+ /*
+ * Uh-oh: page gone missing, cache is useless
+ */
+ if (fi->rdc.version == ff->readdir.version)
+ fuse_rdc_reset(inode);
+ goto retry_locked;
+ }
+
+ /* Make sure it's still the same version after getting the page. */
+ if (ff->readdir.version != fi->rdc.version) {
+ spin_unlock(&fi->rdc.lock);
+ unlock_page(page);
+ put_page(page);
+ goto retry;
+ }
+ spin_unlock(&fi->rdc.lock);
+
+ /*
+ * Contents of the page are now protected against changing by holding
+ * the page lock.
+ */
+ addr = kmap(page);
+ res = fuse_parse_cache(ff, addr, size, ctx);
+ kunmap(page);
+ unlock_page(page);
+ put_page(page);
+
+ if (res == FOUND_ERR)
+ return -EIO;
+
+ if (res == FOUND_ALL)
+ return 0;
+
+ if (size == PAGE_SIZE) {
+ /* We hit end of page: skip to next page. */
+ ff->readdir.cache_off = ALIGN(ff->readdir.cache_off, PAGE_SIZE);
+ goto retry;
+ }
+
+ /*
+ * End of cache reached. If found position, then we are done, otherwise
+ * need to fall back to uncached, since the position we were looking for
+ * wasn't in the cache.
+ */
+ return res == FOUND_SOME ? 0 : UNCACHED;
+}
+
+int fuse_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
+ int err;
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ mutex_lock(&ff->readdir.lock);
+
+ err = UNCACHED;
+ if (ff->open_flags & FOPEN_CACHE_DIR)
+ err = fuse_readdir_cached(file, ctx);
+ if (err == UNCACHED)
+ err = fuse_readdir_uncached(file, ctx);
+
+ mutex_unlock(&ff->readdir.lock);
+
+ return err;
+}
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
new file mode 100644
index 000000000..4d8d4f16c
--- /dev/null
+++ b/fs/fuse/virtio_fs.c
@@ -0,0 +1,1541 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * virtio-fs: Virtio Filesystem
+ * Copyright (C) 2018 Red Hat, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/dax.h>
+#include <linux/pci.h>
+#include <linux/pfn_t.h>
+#include <linux/memremap.h>
+#include <linux/module.h>
+#include <linux/virtio.h>
+#include <linux/virtio_fs.h>
+#include <linux/delay.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/highmem.h>
+#include <linux/uio.h>
+#include "fuse_i.h"
+
+/* Used to help calculate the FUSE connection's max_pages limit for a request's
+ * size. Parts of the struct fuse_req are sliced into scattergather lists in
+ * addition to the pages used, so this can help account for that overhead.
+ */
+#define FUSE_HEADER_OVERHEAD 4
+
+/* List of virtio-fs device instances and a lock for the list. Also provides
+ * mutual exclusion in device removal and mounting path
+ */
+static DEFINE_MUTEX(virtio_fs_mutex);
+static LIST_HEAD(virtio_fs_instances);
+
+enum {
+ VQ_HIPRIO,
+ VQ_REQUEST
+};
+
+#define VQ_NAME_LEN 24
+
+/* Per-virtqueue state */
+struct virtio_fs_vq {
+ spinlock_t lock;
+ struct virtqueue *vq; /* protected by ->lock */
+ struct work_struct done_work;
+ struct list_head queued_reqs;
+ struct list_head end_reqs; /* End these requests */
+ struct delayed_work dispatch_work;
+ struct fuse_dev *fud;
+ bool connected;
+ long in_flight;
+ struct completion in_flight_zero; /* No inflight requests */
+ char name[VQ_NAME_LEN];
+} ____cacheline_aligned_in_smp;
+
+/* A virtio-fs device instance */
+struct virtio_fs {
+ struct kref refcount;
+ struct list_head list; /* on virtio_fs_instances */
+ char *tag;
+ struct virtio_fs_vq *vqs;
+ unsigned int nvqs; /* number of virtqueues */
+ unsigned int num_request_queues; /* number of request queues */
+ struct dax_device *dax_dev;
+
+ /* DAX memory window where file contents are mapped */
+ void *window_kaddr;
+ phys_addr_t window_phys_addr;
+ size_t window_len;
+};
+
+struct virtio_fs_forget_req {
+ struct fuse_in_header ih;
+ struct fuse_forget_in arg;
+};
+
+struct virtio_fs_forget {
+ /* This request can be temporarily queued on virt queue */
+ struct list_head list;
+ struct virtio_fs_forget_req req;
+};
+
+struct virtio_fs_req_work {
+ struct fuse_req *req;
+ struct virtio_fs_vq *fsvq;
+ struct work_struct done_work;
+};
+
+static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
+ struct fuse_req *req, bool in_flight);
+
+static const struct constant_table dax_param_enums[] = {
+ {"always", FUSE_DAX_ALWAYS },
+ {"never", FUSE_DAX_NEVER },
+ {"inode", FUSE_DAX_INODE_USER },
+ {}
+};
+
+enum {
+ OPT_DAX,
+ OPT_DAX_ENUM,
+};
+
+static const struct fs_parameter_spec virtio_fs_parameters[] = {
+ fsparam_flag("dax", OPT_DAX),
+ fsparam_enum("dax", OPT_DAX_ENUM, dax_param_enums),
+ {}
+};
+
+static int virtio_fs_parse_param(struct fs_context *fsc,
+ struct fs_parameter *param)
+{
+ struct fs_parse_result result;
+ struct fuse_fs_context *ctx = fsc->fs_private;
+ int opt;
+
+ opt = fs_parse(fsc, virtio_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case OPT_DAX:
+ ctx->dax_mode = FUSE_DAX_ALWAYS;
+ break;
+ case OPT_DAX_ENUM:
+ ctx->dax_mode = result.uint_32;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void virtio_fs_free_fsc(struct fs_context *fsc)
+{
+ struct fuse_fs_context *ctx = fsc->fs_private;
+
+ kfree(ctx);
+}
+
+static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
+{
+ struct virtio_fs *fs = vq->vdev->priv;
+
+ return &fs->vqs[vq->index];
+}
+
+/* Should be called with fsvq->lock held. */
+static inline void inc_in_flight_req(struct virtio_fs_vq *fsvq)
+{
+ fsvq->in_flight++;
+}
+
+/* Should be called with fsvq->lock held. */
+static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq)
+{
+ WARN_ON(fsvq->in_flight <= 0);
+ fsvq->in_flight--;
+ if (!fsvq->in_flight)
+ complete(&fsvq->in_flight_zero);
+}
+
+static void release_virtio_fs_obj(struct kref *ref)
+{
+ struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount);
+
+ kfree(vfs->vqs);
+ kfree(vfs);
+}
+
+/* Make sure virtiofs_mutex is held */
+static void virtio_fs_put(struct virtio_fs *fs)
+{
+ kref_put(&fs->refcount, release_virtio_fs_obj);
+}
+
+static void virtio_fs_fiq_release(struct fuse_iqueue *fiq)
+{
+ struct virtio_fs *vfs = fiq->priv;
+
+ mutex_lock(&virtio_fs_mutex);
+ virtio_fs_put(vfs);
+ mutex_unlock(&virtio_fs_mutex);
+}
+
+static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq)
+{
+ WARN_ON(fsvq->in_flight < 0);
+
+ /* Wait for in flight requests to finish.*/
+ spin_lock(&fsvq->lock);
+ if (fsvq->in_flight) {
+ /* We are holding virtio_fs_mutex. There should not be any
+ * waiters waiting for completion.
+ */
+ reinit_completion(&fsvq->in_flight_zero);
+ spin_unlock(&fsvq->lock);
+ wait_for_completion(&fsvq->in_flight_zero);
+ } else {
+ spin_unlock(&fsvq->lock);
+ }
+
+ flush_work(&fsvq->done_work);
+ flush_delayed_work(&fsvq->dispatch_work);
+}
+
+static void virtio_fs_drain_all_queues_locked(struct virtio_fs *fs)
+{
+ struct virtio_fs_vq *fsvq;
+ int i;
+
+ for (i = 0; i < fs->nvqs; i++) {
+ fsvq = &fs->vqs[i];
+ virtio_fs_drain_queue(fsvq);
+ }
+}
+
+static void virtio_fs_drain_all_queues(struct virtio_fs *fs)
+{
+ /* Provides mutual exclusion between ->remove and ->kill_sb
+ * paths. We don't want both of these draining queue at the
+ * same time. Current completion logic reinits completion
+ * and that means there should not be any other thread
+ * doing reinit or waiting for completion already.
+ */
+ mutex_lock(&virtio_fs_mutex);
+ virtio_fs_drain_all_queues_locked(fs);
+ mutex_unlock(&virtio_fs_mutex);
+}
+
+static void virtio_fs_start_all_queues(struct virtio_fs *fs)
+{
+ struct virtio_fs_vq *fsvq;
+ int i;
+
+ for (i = 0; i < fs->nvqs; i++) {
+ fsvq = &fs->vqs[i];
+ spin_lock(&fsvq->lock);
+ fsvq->connected = true;
+ spin_unlock(&fsvq->lock);
+ }
+}
+
+/* Add a new instance to the list or return -EEXIST if tag name exists*/
+static int virtio_fs_add_instance(struct virtio_fs *fs)
+{
+ struct virtio_fs *fs2;
+ bool duplicate = false;
+
+ mutex_lock(&virtio_fs_mutex);
+
+ list_for_each_entry(fs2, &virtio_fs_instances, list) {
+ if (strcmp(fs->tag, fs2->tag) == 0)
+ duplicate = true;
+ }
+
+ if (!duplicate)
+ list_add_tail(&fs->list, &virtio_fs_instances);
+
+ mutex_unlock(&virtio_fs_mutex);
+
+ if (duplicate)
+ return -EEXIST;
+ return 0;
+}
+
+/* Return the virtio_fs with a given tag, or NULL */
+static struct virtio_fs *virtio_fs_find_instance(const char *tag)
+{
+ struct virtio_fs *fs;
+
+ mutex_lock(&virtio_fs_mutex);
+
+ list_for_each_entry(fs, &virtio_fs_instances, list) {
+ if (strcmp(fs->tag, tag) == 0) {
+ kref_get(&fs->refcount);
+ goto found;
+ }
+ }
+
+ fs = NULL; /* not found */
+
+found:
+ mutex_unlock(&virtio_fs_mutex);
+
+ return fs;
+}
+
+static void virtio_fs_free_devs(struct virtio_fs *fs)
+{
+ unsigned int i;
+
+ for (i = 0; i < fs->nvqs; i++) {
+ struct virtio_fs_vq *fsvq = &fs->vqs[i];
+
+ if (!fsvq->fud)
+ continue;
+
+ fuse_dev_free(fsvq->fud);
+ fsvq->fud = NULL;
+ }
+}
+
+/* Read filesystem name from virtio config into fs->tag (must kfree()). */
+static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+ char tag_buf[sizeof_field(struct virtio_fs_config, tag)];
+ char *end;
+ size_t len;
+
+ virtio_cread_bytes(vdev, offsetof(struct virtio_fs_config, tag),
+ &tag_buf, sizeof(tag_buf));
+ end = memchr(tag_buf, '\0', sizeof(tag_buf));
+ if (end == tag_buf)
+ return -EINVAL; /* empty tag */
+ if (!end)
+ end = &tag_buf[sizeof(tag_buf)];
+
+ len = end - tag_buf;
+ fs->tag = devm_kmalloc(&vdev->dev, len + 1, GFP_KERNEL);
+ if (!fs->tag)
+ return -ENOMEM;
+ memcpy(fs->tag, tag_buf, len);
+ fs->tag[len] = '\0';
+ return 0;
+}
+
+/* Work function for hiprio completion */
+static void virtio_fs_hiprio_done_work(struct work_struct *work)
+{
+ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+ done_work);
+ struct virtqueue *vq = fsvq->vq;
+
+ /* Free completed FUSE_FORGET requests */
+ spin_lock(&fsvq->lock);
+ do {
+ unsigned int len;
+ void *req;
+
+ virtqueue_disable_cb(vq);
+
+ while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
+ kfree(req);
+ dec_in_flight_req(fsvq);
+ }
+ } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
+ spin_unlock(&fsvq->lock);
+}
+
+static void virtio_fs_request_dispatch_work(struct work_struct *work)
+{
+ struct fuse_req *req;
+ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+ dispatch_work.work);
+ int ret;
+
+ pr_debug("virtio-fs: worker %s called.\n", __func__);
+ while (1) {
+ spin_lock(&fsvq->lock);
+ req = list_first_entry_or_null(&fsvq->end_reqs, struct fuse_req,
+ list);
+ if (!req) {
+ spin_unlock(&fsvq->lock);
+ break;
+ }
+
+ list_del_init(&req->list);
+ spin_unlock(&fsvq->lock);
+ fuse_request_end(req);
+ }
+
+ /* Dispatch pending requests */
+ while (1) {
+ spin_lock(&fsvq->lock);
+ req = list_first_entry_or_null(&fsvq->queued_reqs,
+ struct fuse_req, list);
+ if (!req) {
+ spin_unlock(&fsvq->lock);
+ return;
+ }
+ list_del_init(&req->list);
+ spin_unlock(&fsvq->lock);
+
+ ret = virtio_fs_enqueue_req(fsvq, req, true);
+ if (ret < 0) {
+ if (ret == -ENOMEM || ret == -ENOSPC) {
+ spin_lock(&fsvq->lock);
+ list_add_tail(&req->list, &fsvq->queued_reqs);
+ schedule_delayed_work(&fsvq->dispatch_work,
+ msecs_to_jiffies(1));
+ spin_unlock(&fsvq->lock);
+ return;
+ }
+ req->out.h.error = ret;
+ spin_lock(&fsvq->lock);
+ dec_in_flight_req(fsvq);
+ spin_unlock(&fsvq->lock);
+ pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n",
+ ret);
+ fuse_request_end(req);
+ }
+ }
+}
+
+/*
+ * Returns 1 if queue is full and sender should wait a bit before sending
+ * next request, 0 otherwise.
+ */
+static int send_forget_request(struct virtio_fs_vq *fsvq,
+ struct virtio_fs_forget *forget,
+ bool in_flight)
+{
+ struct scatterlist sg;
+ struct virtqueue *vq;
+ int ret = 0;
+ bool notify;
+ struct virtio_fs_forget_req *req = &forget->req;
+
+ spin_lock(&fsvq->lock);
+ if (!fsvq->connected) {
+ if (in_flight)
+ dec_in_flight_req(fsvq);
+ kfree(forget);
+ goto out;
+ }
+
+ sg_init_one(&sg, req, sizeof(*req));
+ vq = fsvq->vq;
+ dev_dbg(&vq->vdev->dev, "%s\n", __func__);
+
+ ret = virtqueue_add_outbuf(vq, &sg, 1, forget, GFP_ATOMIC);
+ if (ret < 0) {
+ if (ret == -ENOMEM || ret == -ENOSPC) {
+ pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n",
+ ret);
+ list_add_tail(&forget->list, &fsvq->queued_reqs);
+ schedule_delayed_work(&fsvq->dispatch_work,
+ msecs_to_jiffies(1));
+ if (!in_flight)
+ inc_in_flight_req(fsvq);
+ /* Queue is full */
+ ret = 1;
+ } else {
+ pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
+ ret);
+ kfree(forget);
+ if (in_flight)
+ dec_in_flight_req(fsvq);
+ }
+ goto out;
+ }
+
+ if (!in_flight)
+ inc_in_flight_req(fsvq);
+ notify = virtqueue_kick_prepare(vq);
+ spin_unlock(&fsvq->lock);
+
+ if (notify)
+ virtqueue_notify(vq);
+ return ret;
+out:
+ spin_unlock(&fsvq->lock);
+ return ret;
+}
+
+static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
+{
+ struct virtio_fs_forget *forget;
+ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+ dispatch_work.work);
+ pr_debug("virtio-fs: worker %s called.\n", __func__);
+ while (1) {
+ spin_lock(&fsvq->lock);
+ forget = list_first_entry_or_null(&fsvq->queued_reqs,
+ struct virtio_fs_forget, list);
+ if (!forget) {
+ spin_unlock(&fsvq->lock);
+ return;
+ }
+
+ list_del(&forget->list);
+ spin_unlock(&fsvq->lock);
+ if (send_forget_request(fsvq, forget, true))
+ return;
+ }
+}
+
+/* Allocate and copy args into req->argbuf */
+static int copy_args_to_argbuf(struct fuse_req *req)
+{
+ struct fuse_args *args = req->args;
+ unsigned int offset = 0;
+ unsigned int num_in;
+ unsigned int num_out;
+ unsigned int len;
+ unsigned int i;
+
+ num_in = args->in_numargs - args->in_pages;
+ num_out = args->out_numargs - args->out_pages;
+ len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) +
+ fuse_len_args(num_out, args->out_args);
+
+ req->argbuf = kmalloc(len, GFP_ATOMIC);
+ if (!req->argbuf)
+ return -ENOMEM;
+
+ for (i = 0; i < num_in; i++) {
+ memcpy(req->argbuf + offset,
+ args->in_args[i].value,
+ args->in_args[i].size);
+ offset += args->in_args[i].size;
+ }
+
+ return 0;
+}
+
+/* Copy args out of and free req->argbuf */
+static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req)
+{
+ unsigned int remaining;
+ unsigned int offset;
+ unsigned int num_in;
+ unsigned int num_out;
+ unsigned int i;
+
+ remaining = req->out.h.len - sizeof(req->out.h);
+ num_in = args->in_numargs - args->in_pages;
+ num_out = args->out_numargs - args->out_pages;
+ offset = fuse_len_args(num_in, (struct fuse_arg *)args->in_args);
+
+ for (i = 0; i < num_out; i++) {
+ unsigned int argsize = args->out_args[i].size;
+
+ if (args->out_argvar &&
+ i == args->out_numargs - 1 &&
+ argsize > remaining) {
+ argsize = remaining;
+ }
+
+ memcpy(args->out_args[i].value, req->argbuf + offset, argsize);
+ offset += argsize;
+
+ if (i != args->out_numargs - 1)
+ remaining -= argsize;
+ }
+
+ /* Store the actual size of the variable-length arg */
+ if (args->out_argvar)
+ args->out_args[args->out_numargs - 1].size = remaining;
+
+ kfree(req->argbuf);
+ req->argbuf = NULL;
+}
+
+/* Work function for request completion */
+static void virtio_fs_request_complete(struct fuse_req *req,
+ struct virtio_fs_vq *fsvq)
+{
+ struct fuse_pqueue *fpq = &fsvq->fud->pq;
+ struct fuse_args *args;
+ struct fuse_args_pages *ap;
+ unsigned int len, i, thislen;
+ struct page *page;
+
+ /*
+ * TODO verify that server properly follows FUSE protocol
+ * (oh.uniq, oh.len)
+ */
+ args = req->args;
+ copy_args_from_argbuf(args, req);
+
+ if (args->out_pages && args->page_zeroing) {
+ len = args->out_args[args->out_numargs - 1].size;
+ ap = container_of(args, typeof(*ap), args);
+ for (i = 0; i < ap->num_pages; i++) {
+ thislen = ap->descs[i].length;
+ if (len < thislen) {
+ WARN_ON(ap->descs[i].offset);
+ page = ap->pages[i];
+ zero_user_segment(page, len, thislen);
+ len = 0;
+ } else {
+ len -= thislen;
+ }
+ }
+ }
+
+ spin_lock(&fpq->lock);
+ clear_bit(FR_SENT, &req->flags);
+ spin_unlock(&fpq->lock);
+
+ fuse_request_end(req);
+ spin_lock(&fsvq->lock);
+ dec_in_flight_req(fsvq);
+ spin_unlock(&fsvq->lock);
+}
+
+static void virtio_fs_complete_req_work(struct work_struct *work)
+{
+ struct virtio_fs_req_work *w =
+ container_of(work, typeof(*w), done_work);
+
+ virtio_fs_request_complete(w->req, w->fsvq);
+ kfree(w);
+}
+
+static void virtio_fs_requests_done_work(struct work_struct *work)
+{
+ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+ done_work);
+ struct fuse_pqueue *fpq = &fsvq->fud->pq;
+ struct virtqueue *vq = fsvq->vq;
+ struct fuse_req *req;
+ struct fuse_req *next;
+ unsigned int len;
+ LIST_HEAD(reqs);
+
+ /* Collect completed requests off the virtqueue */
+ spin_lock(&fsvq->lock);
+ do {
+ virtqueue_disable_cb(vq);
+
+ while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
+ spin_lock(&fpq->lock);
+ list_move_tail(&req->list, &reqs);
+ spin_unlock(&fpq->lock);
+ }
+ } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
+ spin_unlock(&fsvq->lock);
+
+ /* End requests */
+ list_for_each_entry_safe(req, next, &reqs, list) {
+ list_del_init(&req->list);
+
+ /* blocking async request completes in a worker context */
+ if (req->args->may_block) {
+ struct virtio_fs_req_work *w;
+
+ w = kzalloc(sizeof(*w), GFP_NOFS | __GFP_NOFAIL);
+ INIT_WORK(&w->done_work, virtio_fs_complete_req_work);
+ w->fsvq = fsvq;
+ w->req = req;
+ schedule_work(&w->done_work);
+ } else {
+ virtio_fs_request_complete(req, fsvq);
+ }
+ }
+}
+
+/* Virtqueue interrupt handler */
+static void virtio_fs_vq_done(struct virtqueue *vq)
+{
+ struct virtio_fs_vq *fsvq = vq_to_fsvq(vq);
+
+ dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name);
+
+ schedule_work(&fsvq->done_work);
+}
+
+static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name,
+ int vq_type)
+{
+ strscpy(fsvq->name, name, VQ_NAME_LEN);
+ spin_lock_init(&fsvq->lock);
+ INIT_LIST_HEAD(&fsvq->queued_reqs);
+ INIT_LIST_HEAD(&fsvq->end_reqs);
+ init_completion(&fsvq->in_flight_zero);
+
+ if (vq_type == VQ_REQUEST) {
+ INIT_WORK(&fsvq->done_work, virtio_fs_requests_done_work);
+ INIT_DELAYED_WORK(&fsvq->dispatch_work,
+ virtio_fs_request_dispatch_work);
+ } else {
+ INIT_WORK(&fsvq->done_work, virtio_fs_hiprio_done_work);
+ INIT_DELAYED_WORK(&fsvq->dispatch_work,
+ virtio_fs_hiprio_dispatch_work);
+ }
+}
+
+/* Initialize virtqueues */
+static int virtio_fs_setup_vqs(struct virtio_device *vdev,
+ struct virtio_fs *fs)
+{
+ struct virtqueue **vqs;
+ vq_callback_t **callbacks;
+ const char **names;
+ unsigned int i;
+ int ret = 0;
+
+ virtio_cread_le(vdev, struct virtio_fs_config, num_request_queues,
+ &fs->num_request_queues);
+ if (fs->num_request_queues == 0)
+ return -EINVAL;
+
+ fs->nvqs = VQ_REQUEST + fs->num_request_queues;
+ fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL);
+ if (!fs->vqs)
+ return -ENOMEM;
+
+ vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL);
+ callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]),
+ GFP_KERNEL);
+ names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL);
+ if (!vqs || !callbacks || !names) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* Initialize the hiprio/forget request virtqueue */
+ callbacks[VQ_HIPRIO] = virtio_fs_vq_done;
+ virtio_fs_init_vq(&fs->vqs[VQ_HIPRIO], "hiprio", VQ_HIPRIO);
+ names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
+
+ /* Initialize the requests virtqueues */
+ for (i = VQ_REQUEST; i < fs->nvqs; i++) {
+ char vq_name[VQ_NAME_LEN];
+
+ snprintf(vq_name, VQ_NAME_LEN, "requests.%u", i - VQ_REQUEST);
+ virtio_fs_init_vq(&fs->vqs[i], vq_name, VQ_REQUEST);
+ callbacks[i] = virtio_fs_vq_done;
+ names[i] = fs->vqs[i].name;
+ }
+
+ ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL);
+ if (ret < 0)
+ goto out;
+
+ for (i = 0; i < fs->nvqs; i++)
+ fs->vqs[i].vq = vqs[i];
+
+ virtio_fs_start_all_queues(fs);
+out:
+ kfree(names);
+ kfree(callbacks);
+ kfree(vqs);
+ if (ret)
+ kfree(fs->vqs);
+ return ret;
+}
+
+/* Free virtqueues (device must already be reset) */
+static void virtio_fs_cleanup_vqs(struct virtio_device *vdev)
+{
+ vdev->config->del_vqs(vdev);
+}
+
+/* Map a window offset to a page frame number. The window offset will have
+ * been produced by .iomap_begin(), which maps a file offset to a window
+ * offset.
+ */
+static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
+ long nr_pages, enum dax_access_mode mode,
+ void **kaddr, pfn_t *pfn)
+{
+ struct virtio_fs *fs = dax_get_private(dax_dev);
+ phys_addr_t offset = PFN_PHYS(pgoff);
+ size_t max_nr_pages = fs->window_len / PAGE_SIZE - pgoff;
+
+ if (kaddr)
+ *kaddr = fs->window_kaddr + offset;
+ if (pfn)
+ *pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
+ PFN_DEV | PFN_MAP);
+ return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
+}
+
+static int virtio_fs_zero_page_range(struct dax_device *dax_dev,
+ pgoff_t pgoff, size_t nr_pages)
+{
+ long rc;
+ void *kaddr;
+
+ rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, &kaddr,
+ NULL);
+ if (rc < 0)
+ return rc;
+ memset(kaddr, 0, nr_pages << PAGE_SHIFT);
+ dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT);
+ return 0;
+}
+
+static const struct dax_operations virtio_fs_dax_ops = {
+ .direct_access = virtio_fs_direct_access,
+ .zero_page_range = virtio_fs_zero_page_range,
+};
+
+static void virtio_fs_cleanup_dax(void *data)
+{
+ struct dax_device *dax_dev = data;
+
+ kill_dax(dax_dev);
+ put_dax(dax_dev);
+}
+
+static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+ struct virtio_shm_region cache_reg;
+ struct dev_pagemap *pgmap;
+ bool have_cache;
+
+ if (!IS_ENABLED(CONFIG_FUSE_DAX))
+ return 0;
+
+ /* Get cache region */
+ have_cache = virtio_get_shm_region(vdev, &cache_reg,
+ (u8)VIRTIO_FS_SHMCAP_ID_CACHE);
+ if (!have_cache) {
+ dev_notice(&vdev->dev, "%s: No cache capability\n", __func__);
+ return 0;
+ }
+
+ if (!devm_request_mem_region(&vdev->dev, cache_reg.addr, cache_reg.len,
+ dev_name(&vdev->dev))) {
+ dev_warn(&vdev->dev, "could not reserve region addr=0x%llx len=0x%llx\n",
+ cache_reg.addr, cache_reg.len);
+ return -EBUSY;
+ }
+
+ dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", cache_reg.len,
+ cache_reg.addr);
+
+ pgmap = devm_kzalloc(&vdev->dev, sizeof(*pgmap), GFP_KERNEL);
+ if (!pgmap)
+ return -ENOMEM;
+
+ pgmap->type = MEMORY_DEVICE_FS_DAX;
+
+ /* Ideally we would directly use the PCI BAR resource but
+ * devm_memremap_pages() wants its own copy in pgmap. So
+ * initialize a struct resource from scratch (only the start
+ * and end fields will be used).
+ */
+ pgmap->range = (struct range) {
+ .start = (phys_addr_t) cache_reg.addr,
+ .end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1,
+ };
+ pgmap->nr_range = 1;
+
+ fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap);
+ if (IS_ERR(fs->window_kaddr))
+ return PTR_ERR(fs->window_kaddr);
+
+ fs->window_phys_addr = (phys_addr_t) cache_reg.addr;
+ fs->window_len = (phys_addr_t) cache_reg.len;
+
+ dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n",
+ __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len);
+
+ fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops);
+ if (IS_ERR(fs->dax_dev))
+ return PTR_ERR(fs->dax_dev);
+
+ return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax,
+ fs->dax_dev);
+}
+
+static int virtio_fs_probe(struct virtio_device *vdev)
+{
+ struct virtio_fs *fs;
+ int ret;
+
+ fs = kzalloc(sizeof(*fs), GFP_KERNEL);
+ if (!fs)
+ return -ENOMEM;
+ kref_init(&fs->refcount);
+ vdev->priv = fs;
+
+ ret = virtio_fs_read_tag(vdev, fs);
+ if (ret < 0)
+ goto out;
+
+ ret = virtio_fs_setup_vqs(vdev, fs);
+ if (ret < 0)
+ goto out;
+
+ /* TODO vq affinity */
+
+ ret = virtio_fs_setup_dax(vdev, fs);
+ if (ret < 0)
+ goto out_vqs;
+
+ /* Bring the device online in case the filesystem is mounted and
+ * requests need to be sent before we return.
+ */
+ virtio_device_ready(vdev);
+
+ ret = virtio_fs_add_instance(fs);
+ if (ret < 0)
+ goto out_vqs;
+
+ return 0;
+
+out_vqs:
+ virtio_reset_device(vdev);
+ virtio_fs_cleanup_vqs(vdev);
+ kfree(fs->vqs);
+
+out:
+ vdev->priv = NULL;
+ kfree(fs);
+ return ret;
+}
+
+static void virtio_fs_stop_all_queues(struct virtio_fs *fs)
+{
+ struct virtio_fs_vq *fsvq;
+ int i;
+
+ for (i = 0; i < fs->nvqs; i++) {
+ fsvq = &fs->vqs[i];
+ spin_lock(&fsvq->lock);
+ fsvq->connected = false;
+ spin_unlock(&fsvq->lock);
+ }
+}
+
+static void virtio_fs_remove(struct virtio_device *vdev)
+{
+ struct virtio_fs *fs = vdev->priv;
+
+ mutex_lock(&virtio_fs_mutex);
+ /* This device is going away. No one should get new reference */
+ list_del_init(&fs->list);
+ virtio_fs_stop_all_queues(fs);
+ virtio_fs_drain_all_queues_locked(fs);
+ virtio_reset_device(vdev);
+ virtio_fs_cleanup_vqs(vdev);
+
+ vdev->priv = NULL;
+ /* Put device reference on virtio_fs object */
+ virtio_fs_put(fs);
+ mutex_unlock(&virtio_fs_mutex);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int virtio_fs_freeze(struct virtio_device *vdev)
+{
+ /* TODO need to save state here */
+ pr_warn("virtio-fs: suspend/resume not yet supported\n");
+ return -EOPNOTSUPP;
+}
+
+static int virtio_fs_restore(struct virtio_device *vdev)
+{
+ /* TODO need to restore state here */
+ return 0;
+}
+#endif /* CONFIG_PM_SLEEP */
+
+static const struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID },
+ {},
+};
+
+static const unsigned int feature_table[] = {};
+
+static struct virtio_driver virtio_fs_driver = {
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .id_table = id_table,
+ .feature_table = feature_table,
+ .feature_table_size = ARRAY_SIZE(feature_table),
+ .probe = virtio_fs_probe,
+ .remove = virtio_fs_remove,
+#ifdef CONFIG_PM_SLEEP
+ .freeze = virtio_fs_freeze,
+ .restore = virtio_fs_restore,
+#endif
+};
+
+static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->lock)
+{
+ struct fuse_forget_link *link;
+ struct virtio_fs_forget *forget;
+ struct virtio_fs_forget_req *req;
+ struct virtio_fs *fs;
+ struct virtio_fs_vq *fsvq;
+ u64 unique;
+
+ link = fuse_dequeue_forget(fiq, 1, NULL);
+ unique = fuse_get_unique(fiq);
+
+ fs = fiq->priv;
+ fsvq = &fs->vqs[VQ_HIPRIO];
+ spin_unlock(&fiq->lock);
+
+ /* Allocate a buffer for the request */
+ forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL);
+ req = &forget->req;
+
+ req->ih = (struct fuse_in_header){
+ .opcode = FUSE_FORGET,
+ .nodeid = link->forget_one.nodeid,
+ .unique = unique,
+ .len = sizeof(*req),
+ };
+ req->arg = (struct fuse_forget_in){
+ .nlookup = link->forget_one.nlookup,
+ };
+
+ send_forget_request(fsvq, forget, false);
+ kfree(link);
+}
+
+static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->lock)
+{
+ /*
+ * TODO interrupts.
+ *
+ * Normal fs operations on a local filesystems aren't interruptible.
+ * Exceptions are blocking lock operations; for example fcntl(F_SETLKW)
+ * with shared lock between host and guest.
+ */
+ spin_unlock(&fiq->lock);
+}
+
+/* Count number of scatter-gather elements required */
+static unsigned int sg_count_fuse_pages(struct fuse_page_desc *page_descs,
+ unsigned int num_pages,
+ unsigned int total_len)
+{
+ unsigned int i;
+ unsigned int this_len;
+
+ for (i = 0; i < num_pages && total_len; i++) {
+ this_len = min(page_descs[i].length, total_len);
+ total_len -= this_len;
+ }
+
+ return i;
+}
+
+/* Return the number of scatter-gather list elements required */
+static unsigned int sg_count_fuse_req(struct fuse_req *req)
+{
+ struct fuse_args *args = req->args;
+ struct fuse_args_pages *ap = container_of(args, typeof(*ap), args);
+ unsigned int size, total_sgs = 1 /* fuse_in_header */;
+
+ if (args->in_numargs - args->in_pages)
+ total_sgs += 1;
+
+ if (args->in_pages) {
+ size = args->in_args[args->in_numargs - 1].size;
+ total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages,
+ size);
+ }
+
+ if (!test_bit(FR_ISREPLY, &req->flags))
+ return total_sgs;
+
+ total_sgs += 1 /* fuse_out_header */;
+
+ if (args->out_numargs - args->out_pages)
+ total_sgs += 1;
+
+ if (args->out_pages) {
+ size = args->out_args[args->out_numargs - 1].size;
+ total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages,
+ size);
+ }
+
+ return total_sgs;
+}
+
+/* Add pages to scatter-gather list and return number of elements used */
+static unsigned int sg_init_fuse_pages(struct scatterlist *sg,
+ struct page **pages,
+ struct fuse_page_desc *page_descs,
+ unsigned int num_pages,
+ unsigned int total_len)
+{
+ unsigned int i;
+ unsigned int this_len;
+
+ for (i = 0; i < num_pages && total_len; i++) {
+ sg_init_table(&sg[i], 1);
+ this_len = min(page_descs[i].length, total_len);
+ sg_set_page(&sg[i], pages[i], this_len, page_descs[i].offset);
+ total_len -= this_len;
+ }
+
+ return i;
+}
+
+/* Add args to scatter-gather list and return number of elements used */
+static unsigned int sg_init_fuse_args(struct scatterlist *sg,
+ struct fuse_req *req,
+ struct fuse_arg *args,
+ unsigned int numargs,
+ bool argpages,
+ void *argbuf,
+ unsigned int *len_used)
+{
+ struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args);
+ unsigned int total_sgs = 0;
+ unsigned int len;
+
+ len = fuse_len_args(numargs - argpages, args);
+ if (len)
+ sg_init_one(&sg[total_sgs++], argbuf, len);
+
+ if (argpages)
+ total_sgs += sg_init_fuse_pages(&sg[total_sgs],
+ ap->pages, ap->descs,
+ ap->num_pages,
+ args[numargs - 1].size);
+
+ if (len_used)
+ *len_used = len;
+
+ return total_sgs;
+}
+
+/* Add a request to a virtqueue and kick the device */
+static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
+ struct fuse_req *req, bool in_flight)
+{
+ /* requests need at least 4 elements */
+ struct scatterlist *stack_sgs[6];
+ struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)];
+ struct scatterlist **sgs = stack_sgs;
+ struct scatterlist *sg = stack_sg;
+ struct virtqueue *vq;
+ struct fuse_args *args = req->args;
+ unsigned int argbuf_used = 0;
+ unsigned int out_sgs = 0;
+ unsigned int in_sgs = 0;
+ unsigned int total_sgs;
+ unsigned int i;
+ int ret;
+ bool notify;
+ struct fuse_pqueue *fpq;
+
+ /* Does the sglist fit on the stack? */
+ total_sgs = sg_count_fuse_req(req);
+ if (total_sgs > ARRAY_SIZE(stack_sgs)) {
+ sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC);
+ sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC);
+ if (!sgs || !sg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* Use a bounce buffer since stack args cannot be mapped */
+ ret = copy_args_to_argbuf(req);
+ if (ret < 0)
+ goto out;
+
+ /* Request elements */
+ sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h));
+ out_sgs += sg_init_fuse_args(&sg[out_sgs], req,
+ (struct fuse_arg *)args->in_args,
+ args->in_numargs, args->in_pages,
+ req->argbuf, &argbuf_used);
+
+ /* Reply elements */
+ if (test_bit(FR_ISREPLY, &req->flags)) {
+ sg_init_one(&sg[out_sgs + in_sgs++],
+ &req->out.h, sizeof(req->out.h));
+ in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req,
+ args->out_args, args->out_numargs,
+ args->out_pages,
+ req->argbuf + argbuf_used, NULL);
+ }
+
+ WARN_ON(out_sgs + in_sgs != total_sgs);
+
+ for (i = 0; i < total_sgs; i++)
+ sgs[i] = &sg[i];
+
+ spin_lock(&fsvq->lock);
+
+ if (!fsvq->connected) {
+ spin_unlock(&fsvq->lock);
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ vq = fsvq->vq;
+ ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC);
+ if (ret < 0) {
+ spin_unlock(&fsvq->lock);
+ goto out;
+ }
+
+ /* Request successfully sent. */
+ fpq = &fsvq->fud->pq;
+ spin_lock(&fpq->lock);
+ list_add_tail(&req->list, fpq->processing);
+ spin_unlock(&fpq->lock);
+ set_bit(FR_SENT, &req->flags);
+ /* matches barrier in request_wait_answer() */
+ smp_mb__after_atomic();
+
+ if (!in_flight)
+ inc_in_flight_req(fsvq);
+ notify = virtqueue_kick_prepare(vq);
+
+ spin_unlock(&fsvq->lock);
+
+ if (notify)
+ virtqueue_notify(vq);
+
+out:
+ if (ret < 0 && req->argbuf) {
+ kfree(req->argbuf);
+ req->argbuf = NULL;
+ }
+ if (sgs != stack_sgs) {
+ kfree(sgs);
+ kfree(sg);
+ }
+
+ return ret;
+}
+
+static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->lock)
+{
+ unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
+ struct virtio_fs *fs;
+ struct fuse_req *req;
+ struct virtio_fs_vq *fsvq;
+ int ret;
+
+ WARN_ON(list_empty(&fiq->pending));
+ req = list_last_entry(&fiq->pending, struct fuse_req, list);
+ clear_bit(FR_PENDING, &req->flags);
+ list_del_init(&req->list);
+ WARN_ON(!list_empty(&fiq->pending));
+ spin_unlock(&fiq->lock);
+
+ fs = fiq->priv;
+
+ pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n",
+ __func__, req->in.h.opcode, req->in.h.unique,
+ req->in.h.nodeid, req->in.h.len,
+ fuse_len_args(req->args->out_numargs, req->args->out_args));
+
+ fsvq = &fs->vqs[queue_id];
+ ret = virtio_fs_enqueue_req(fsvq, req, false);
+ if (ret < 0) {
+ if (ret == -ENOMEM || ret == -ENOSPC) {
+ /*
+ * Virtqueue full. Retry submission from worker
+ * context as we might be holding fc->bg_lock.
+ */
+ spin_lock(&fsvq->lock);
+ list_add_tail(&req->list, &fsvq->queued_reqs);
+ inc_in_flight_req(fsvq);
+ schedule_delayed_work(&fsvq->dispatch_work,
+ msecs_to_jiffies(1));
+ spin_unlock(&fsvq->lock);
+ return;
+ }
+ req->out.h.error = ret;
+ pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret);
+
+ /* Can't end request in submission context. Use a worker */
+ spin_lock(&fsvq->lock);
+ list_add_tail(&req->list, &fsvq->end_reqs);
+ schedule_delayed_work(&fsvq->dispatch_work, 0);
+ spin_unlock(&fsvq->lock);
+ return;
+ }
+}
+
+static const struct fuse_iqueue_ops virtio_fs_fiq_ops = {
+ .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock,
+ .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock,
+ .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock,
+ .release = virtio_fs_fiq_release,
+};
+
+static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx)
+{
+ ctx->rootmode = S_IFDIR;
+ ctx->default_permissions = 1;
+ ctx->allow_other = 1;
+ ctx->max_read = UINT_MAX;
+ ctx->blksize = 512;
+ ctx->destroy = true;
+ ctx->no_control = true;
+ ctx->no_force_umount = true;
+}
+
+static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ struct fuse_conn *fc = fm->fc;
+ struct virtio_fs *fs = fc->iq.priv;
+ struct fuse_fs_context *ctx = fsc->fs_private;
+ unsigned int i;
+ int err;
+
+ virtio_fs_ctx_set_defaults(ctx);
+ mutex_lock(&virtio_fs_mutex);
+
+ /* After holding mutex, make sure virtiofs device is still there.
+ * Though we are holding a reference to it, drive ->remove might
+ * still have cleaned up virtual queues. In that case bail out.
+ */
+ err = -EINVAL;
+ if (list_empty(&fs->list)) {
+ pr_info("virtio-fs: tag <%s> not found\n", fs->tag);
+ goto err;
+ }
+
+ err = -ENOMEM;
+ /* Allocate fuse_dev for hiprio and notification queues */
+ for (i = 0; i < fs->nvqs; i++) {
+ struct virtio_fs_vq *fsvq = &fs->vqs[i];
+
+ fsvq->fud = fuse_dev_alloc();
+ if (!fsvq->fud)
+ goto err_free_fuse_devs;
+ }
+
+ /* virtiofs allocates and installs its own fuse devices */
+ ctx->fudptr = NULL;
+ if (ctx->dax_mode != FUSE_DAX_NEVER) {
+ if (ctx->dax_mode == FUSE_DAX_ALWAYS && !fs->dax_dev) {
+ err = -EINVAL;
+ pr_err("virtio-fs: dax can't be enabled as filesystem"
+ " device does not support it.\n");
+ goto err_free_fuse_devs;
+ }
+ ctx->dax_dev = fs->dax_dev;
+ }
+ err = fuse_fill_super_common(sb, ctx);
+ if (err < 0)
+ goto err_free_fuse_devs;
+
+ for (i = 0; i < fs->nvqs; i++) {
+ struct virtio_fs_vq *fsvq = &fs->vqs[i];
+
+ fuse_dev_install(fsvq->fud, fc);
+ }
+
+ /* Previous unmount will stop all queues. Start these again */
+ virtio_fs_start_all_queues(fs);
+ fuse_send_init(fm);
+ mutex_unlock(&virtio_fs_mutex);
+ return 0;
+
+err_free_fuse_devs:
+ virtio_fs_free_devs(fs);
+err:
+ mutex_unlock(&virtio_fs_mutex);
+ return err;
+}
+
+static void virtio_fs_conn_destroy(struct fuse_mount *fm)
+{
+ struct fuse_conn *fc = fm->fc;
+ struct virtio_fs *vfs = fc->iq.priv;
+ struct virtio_fs_vq *fsvq = &vfs->vqs[VQ_HIPRIO];
+
+ /* Stop dax worker. Soon evict_inodes() will be called which
+ * will free all memory ranges belonging to all inodes.
+ */
+ if (IS_ENABLED(CONFIG_FUSE_DAX))
+ fuse_dax_cancel_work(fc);
+
+ /* Stop forget queue. Soon destroy will be sent */
+ spin_lock(&fsvq->lock);
+ fsvq->connected = false;
+ spin_unlock(&fsvq->lock);
+ virtio_fs_drain_all_queues(vfs);
+
+ fuse_conn_destroy(fm);
+
+ /* fuse_conn_destroy() must have sent destroy. Stop all queues
+ * and drain one more time and free fuse devices. Freeing fuse
+ * devices will drop their reference on fuse_conn and that in
+ * turn will drop its reference on virtio_fs object.
+ */
+ virtio_fs_stop_all_queues(vfs);
+ virtio_fs_drain_all_queues(vfs);
+ virtio_fs_free_devs(vfs);
+}
+
+static void virtio_kill_sb(struct super_block *sb)
+{
+ struct fuse_mount *fm = get_fuse_mount_super(sb);
+ bool last;
+
+ /* If mount failed, we can still be called without any fc */
+ if (sb->s_root) {
+ last = fuse_mount_remove(fm);
+ if (last)
+ virtio_fs_conn_destroy(fm);
+ }
+ kill_anon_super(sb);
+ fuse_mount_destroy(fm);
+}
+
+static int virtio_fs_test_super(struct super_block *sb,
+ struct fs_context *fsc)
+{
+ struct fuse_mount *fsc_fm = fsc->s_fs_info;
+ struct fuse_mount *sb_fm = get_fuse_mount_super(sb);
+
+ return fsc_fm->fc->iq.priv == sb_fm->fc->iq.priv;
+}
+
+static int virtio_fs_get_tree(struct fs_context *fsc)
+{
+ struct virtio_fs *fs;
+ struct super_block *sb;
+ struct fuse_conn *fc = NULL;
+ struct fuse_mount *fm;
+ unsigned int virtqueue_size;
+ int err = -EIO;
+
+ /* This gets a reference on virtio_fs object. This ptr gets installed
+ * in fc->iq->priv. Once fuse_conn is going away, it calls ->put()
+ * to drop the reference to this object.
+ */
+ fs = virtio_fs_find_instance(fsc->source);
+ if (!fs) {
+ pr_info("virtio-fs: tag <%s> not found\n", fsc->source);
+ return -EINVAL;
+ }
+
+ virtqueue_size = virtqueue_get_vring_size(fs->vqs[VQ_REQUEST].vq);
+ if (WARN_ON(virtqueue_size <= FUSE_HEADER_OVERHEAD))
+ goto out_err;
+
+ err = -ENOMEM;
+ fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL);
+ if (!fc)
+ goto out_err;
+
+ fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL);
+ if (!fm)
+ goto out_err;
+
+ fuse_conn_init(fc, fm, fsc->user_ns, &virtio_fs_fiq_ops, fs);
+ fc->release = fuse_free_conn;
+ fc->delete_stale = true;
+ fc->auto_submounts = true;
+ fc->sync_fs = true;
+
+ /* Tell FUSE to split requests that exceed the virtqueue's size */
+ fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit,
+ virtqueue_size - FUSE_HEADER_OVERHEAD);
+
+ fsc->s_fs_info = fm;
+ sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc);
+ if (fsc->s_fs_info)
+ fuse_mount_destroy(fm);
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
+
+ if (!sb->s_root) {
+ err = virtio_fs_fill_super(sb, fsc);
+ if (err) {
+ deactivate_locked_super(sb);
+ return err;
+ }
+
+ sb->s_flags |= SB_ACTIVE;
+ }
+
+ WARN_ON(fsc->root);
+ fsc->root = dget(sb->s_root);
+ return 0;
+
+out_err:
+ kfree(fc);
+ mutex_lock(&virtio_fs_mutex);
+ virtio_fs_put(fs);
+ mutex_unlock(&virtio_fs_mutex);
+ return err;
+}
+
+static const struct fs_context_operations virtio_fs_context_ops = {
+ .free = virtio_fs_free_fsc,
+ .parse_param = virtio_fs_parse_param,
+ .get_tree = virtio_fs_get_tree,
+};
+
+static int virtio_fs_init_fs_context(struct fs_context *fsc)
+{
+ struct fuse_fs_context *ctx;
+
+ if (fsc->purpose == FS_CONTEXT_FOR_SUBMOUNT)
+ return fuse_init_fs_context_submount(fsc);
+
+ ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ fsc->fs_private = ctx;
+ fsc->ops = &virtio_fs_context_ops;
+ return 0;
+}
+
+static struct file_system_type virtio_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "virtiofs",
+ .init_fs_context = virtio_fs_init_fs_context,
+ .kill_sb = virtio_kill_sb,
+};
+
+static int __init virtio_fs_init(void)
+{
+ int ret;
+
+ ret = register_virtio_driver(&virtio_fs_driver);
+ if (ret < 0)
+ return ret;
+
+ ret = register_filesystem(&virtio_fs_type);
+ if (ret < 0) {
+ unregister_virtio_driver(&virtio_fs_driver);
+ return ret;
+ }
+
+ return 0;
+}
+module_init(virtio_fs_init);
+
+static void __exit virtio_fs_exit(void)
+{
+ unregister_filesystem(&virtio_fs_type);
+ unregister_virtio_driver(&virtio_fs_driver);
+}
+module_exit(virtio_fs_exit);
+
+MODULE_AUTHOR("Stefan Hajnoczi <stefanha@redhat.com>");
+MODULE_DESCRIPTION("Virtio Filesystem");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_FS(KBUILD_MODNAME);
+MODULE_DEVICE_TABLE(virtio, id_table);
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
new file mode 100644
index 000000000..0d3e7177f
--- /dev/null
+++ b/fs/fuse/xattr.c
@@ -0,0 +1,266 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2016 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU GPL.
+ * See the file COPYING.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+
+int fuse_setxattr(struct inode *inode, const char *name, const void *value,
+ size_t size, int flags, unsigned int extra_flags)
+{
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ FUSE_ARGS(args);
+ struct fuse_setxattr_in inarg;
+ int err;
+
+ if (fm->fc->no_setxattr)
+ return -EOPNOTSUPP;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.size = size;
+ inarg.flags = flags;
+ inarg.setxattr_flags = extra_flags;
+
+ args.opcode = FUSE_SETXATTR;
+ args.nodeid = get_node_id(inode);
+ args.in_numargs = 3;
+ args.in_args[0].size = fm->fc->setxattr_ext ?
+ sizeof(inarg) : FUSE_COMPAT_SETXATTR_IN_SIZE;
+ args.in_args[0].value = &inarg;
+ args.in_args[1].size = strlen(name) + 1;
+ args.in_args[1].value = name;
+ args.in_args[2].size = size;
+ args.in_args[2].value = value;
+ err = fuse_simple_request(fm, &args);
+ if (err == -ENOSYS) {
+ fm->fc->no_setxattr = 1;
+ err = -EOPNOTSUPP;
+ }
+ if (!err)
+ fuse_update_ctime(inode);
+
+ return err;
+}
+
+ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
+ size_t size)
+{
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ FUSE_ARGS(args);
+ struct fuse_getxattr_in inarg;
+ struct fuse_getxattr_out outarg;
+ ssize_t ret;
+
+ if (fm->fc->no_getxattr)
+ return -EOPNOTSUPP;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.size = size;
+ args.opcode = FUSE_GETXATTR;
+ args.nodeid = get_node_id(inode);
+ args.in_numargs = 2;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.in_args[1].size = strlen(name) + 1;
+ args.in_args[1].value = name;
+ /* This is really two different operations rolled into one */
+ args.out_numargs = 1;
+ if (size) {
+ args.out_argvar = true;
+ args.out_args[0].size = size;
+ args.out_args[0].value = value;
+ } else {
+ args.out_args[0].size = sizeof(outarg);
+ args.out_args[0].value = &outarg;
+ }
+ ret = fuse_simple_request(fm, &args);
+ if (!ret && !size)
+ ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX);
+ if (ret == -ENOSYS) {
+ fm->fc->no_getxattr = 1;
+ ret = -EOPNOTSUPP;
+ }
+ return ret;
+}
+
+static int fuse_verify_xattr_list(char *list, size_t size)
+{
+ size_t origsize = size;
+
+ while (size) {
+ size_t thislen = strnlen(list, size);
+
+ if (!thislen || thislen == size)
+ return -EIO;
+
+ size -= thislen + 1;
+ list += thislen + 1;
+ }
+
+ return origsize;
+}
+
+ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
+{
+ struct inode *inode = d_inode(entry);
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ FUSE_ARGS(args);
+ struct fuse_getxattr_in inarg;
+ struct fuse_getxattr_out outarg;
+ ssize_t ret;
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ if (!fuse_allow_current_process(fm->fc))
+ return -EACCES;
+
+ if (fm->fc->no_listxattr)
+ return -EOPNOTSUPP;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.size = size;
+ args.opcode = FUSE_LISTXATTR;
+ args.nodeid = get_node_id(inode);
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ /* This is really two different operations rolled into one */
+ args.out_numargs = 1;
+ if (size) {
+ args.out_argvar = true;
+ args.out_args[0].size = size;
+ args.out_args[0].value = list;
+ } else {
+ args.out_args[0].size = sizeof(outarg);
+ args.out_args[0].value = &outarg;
+ }
+ ret = fuse_simple_request(fm, &args);
+ if (!ret && !size)
+ ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX);
+ if (ret > 0 && size)
+ ret = fuse_verify_xattr_list(list, ret);
+ if (ret == -ENOSYS) {
+ fm->fc->no_listxattr = 1;
+ ret = -EOPNOTSUPP;
+ }
+ return ret;
+}
+
+int fuse_removexattr(struct inode *inode, const char *name)
+{
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ FUSE_ARGS(args);
+ int err;
+
+ if (fm->fc->no_removexattr)
+ return -EOPNOTSUPP;
+
+ args.opcode = FUSE_REMOVEXATTR;
+ args.nodeid = get_node_id(inode);
+ args.in_numargs = 1;
+ args.in_args[0].size = strlen(name) + 1;
+ args.in_args[0].value = name;
+ err = fuse_simple_request(fm, &args);
+ if (err == -ENOSYS) {
+ fm->fc->no_removexattr = 1;
+ err = -EOPNOTSUPP;
+ }
+ if (!err)
+ fuse_update_ctime(inode);
+
+ return err;
+}
+
+static int fuse_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *value, size_t size)
+{
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ return fuse_getxattr(inode, name, value, size);
+}
+
+static int fuse_xattr_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value, size_t size,
+ int flags)
+{
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ if (!value)
+ return fuse_removexattr(inode, name);
+
+ return fuse_setxattr(inode, name, value, size, flags, 0);
+}
+
+static bool no_xattr_list(struct dentry *dentry)
+{
+ return false;
+}
+
+static int no_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *value, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+
+static int no_xattr_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct inode *nodee,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static const struct xattr_handler fuse_xattr_handler = {
+ .prefix = "",
+ .get = fuse_xattr_get,
+ .set = fuse_xattr_set,
+};
+
+const struct xattr_handler *fuse_xattr_handlers[] = {
+ &fuse_xattr_handler,
+ NULL
+};
+
+const struct xattr_handler *fuse_acl_xattr_handlers[] = {
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+ &fuse_xattr_handler,
+ NULL
+};
+
+static const struct xattr_handler fuse_no_acl_access_xattr_handler = {
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
+ .flags = ACL_TYPE_ACCESS,
+ .list = no_xattr_list,
+ .get = no_xattr_get,
+ .set = no_xattr_set,
+};
+
+static const struct xattr_handler fuse_no_acl_default_xattr_handler = {
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
+ .flags = ACL_TYPE_ACCESS,
+ .list = no_xattr_list,
+ .get = no_xattr_get,
+ .set = no_xattr_set,
+};
+
+const struct xattr_handler *fuse_no_acl_xattr_handlers[] = {
+ &fuse_no_acl_access_xattr_handler,
+ &fuse_no_acl_default_xattr_handler,
+ &fuse_xattr_handler,
+ NULL
+};