summaryrefslogtreecommitdiffstats
path: root/fs/nfs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:27:49 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:27:49 +0000
commitace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
treeb2d64bc10158fdd5497876388cd68142ca374ed3 /fs/nfs
parentInitial commit. (diff)
downloadlinux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz
linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip
Adding upstream version 6.6.15.upstream/6.6.15
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/nfs')
-rw-r--r--fs/nfs/Kconfig214
-rw-r--r--fs/nfs/Makefile37
-rw-r--r--fs/nfs/blocklayout/Makefile7
-rw-r--r--fs/nfs/blocklayout/blocklayout.c1030
-rw-r--r--fs/nfs/blocklayout/blocklayout.h197
-rw-r--r--fs/nfs/blocklayout/dev.c530
-rw-r--r--fs/nfs/blocklayout/extent_tree.c647
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c287
-rw-r--r--fs/nfs/cache_lib.c158
-rw-r--r--fs/nfs/cache_lib.h32
-rw-r--r--fs/nfs/callback.c414
-rw-r--r--fs/nfs/callback.h223
-rw-r--r--fs/nfs/callback_proc.c739
-rw-r--r--fs/nfs/callback_xdr.c1096
-rw-r--r--fs/nfs/client.c1424
-rw-r--r--fs/nfs/delegation.c1479
-rw-r--r--fs/nfs/delegation.h91
-rw-r--r--fs/nfs/dir.c3324
-rw-r--r--fs/nfs/direct.c1058
-rw-r--r--fs/nfs/dns_resolve.c480
-rw-r--r--fs/nfs/dns_resolve.h37
-rw-r--r--fs/nfs/export.c158
-rw-r--r--fs/nfs/file.c904
-rw-r--r--fs/nfs/filelayout/Makefile6
-rw-r--r--fs/nfs/filelayout/filelayout.c1162
-rw-r--r--fs/nfs/filelayout/filelayout.h118
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c303
-rw-r--r--fs/nfs/flexfilelayout/Makefile6
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c2623
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h226
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c621
-rw-r--r--fs/nfs/fs_context.c1670
-rw-r--r--fs/nfs/fscache.c393
-rw-r--r--fs/nfs/fscache.h202
-rw-r--r--fs/nfs/getroot.c164
-rw-r--r--fs/nfs/inode.c2540
-rw-r--r--fs/nfs/internal.h954
-rw-r--r--fs/nfs/io.c148
-rw-r--r--fs/nfs/iostat.h60
-rw-r--r--fs/nfs/mount_clnt.c544
-rw-r--r--fs/nfs/namespace.c370
-rw-r--r--fs/nfs/netns.h44
-rw-r--r--fs/nfs/nfs.h30
-rw-r--r--fs/nfs/nfs2super.c32
-rw-r--r--fs/nfs/nfs2xdr.c1156
-rw-r--r--fs/nfs/nfs3_fs.h37
-rw-r--r--fs/nfs/nfs3acl.c343
-rw-r--r--fs/nfs/nfs3client.c131
-rw-r--r--fs/nfs/nfs3proc.c1068
-rw-r--r--fs/nfs/nfs3super.c33
-rw-r--r--fs/nfs/nfs3xdr.c2581
-rw-r--r--fs/nfs/nfs42.h65
-rw-r--r--fs/nfs/nfs42proc.c1461
-rw-r--r--fs/nfs/nfs42xattr.c1066
-rw-r--r--fs/nfs/nfs42xdr.c1672
-rw-r--r--fs/nfs/nfs4_fs.h674
-rw-r--r--fs/nfs/nfs4client.c1384
-rw-r--r--fs/nfs/nfs4file.c469
-rw-r--r--fs/nfs/nfs4getroot.c41
-rw-r--r--fs/nfs/nfs4idmap.c806
-rw-r--r--fs/nfs/nfs4idmap.h68
-rw-r--r--fs/nfs/nfs4namespace.c577
-rw-r--r--fs/nfs/nfs4proc.c10762
-rw-r--r--fs/nfs/nfs4renewd.c151
-rw-r--r--fs/nfs/nfs4session.c657
-rw-r--r--fs/nfs/nfs4session.h185
-rw-r--r--fs/nfs/nfs4state.c2786
-rw-r--r--fs/nfs/nfs4super.c314
-rw-r--r--fs/nfs/nfs4sysctl.c53
-rw-r--r--fs/nfs/nfs4trace.c31
-rw-r--r--fs/nfs/nfs4trace.h2559
-rw-r--r--fs/nfs/nfs4xdr.c7721
-rw-r--r--fs/nfs/nfsroot.c316
-rw-r--r--fs/nfs/nfstrace.c15
-rw-r--r--fs/nfs/nfstrace.h1738
-rw-r--r--fs/nfs/pagelist.c1588
-rw-r--r--fs/nfs/pnfs.c3358
-rw-r--r--fs/nfs/pnfs.h932
-rw-r--r--fs/nfs/pnfs_dev.c384
-rw-r--r--fs/nfs/pnfs_nfs.c1209
-rw-r--r--fs/nfs/proc.c763
-rw-r--r--fs/nfs/read.c458
-rw-r--r--fs/nfs/super.c1428
-rw-r--r--fs/nfs/symlink.c85
-rw-r--r--fs/nfs/sysctl.c47
-rw-r--r--fs/nfs/sysfs.c361
-rw-r--r--fs/nfs/sysfs.h33
-rw-r--r--fs/nfs/unlink.c531
-rw-r--r--fs/nfs/write.c2217
89 files changed, 81096 insertions, 0 deletions
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
new file mode 100644
index 0000000000..7df2503cef
--- /dev/null
+++ b/fs/nfs/Kconfig
@@ -0,0 +1,214 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config NFS_FS
+ tristate "NFS client support"
+ depends on INET && FILE_LOCKING && MULTIUSER
+ select LOCKD
+ select SUNRPC
+ select NFS_ACL_SUPPORT if NFS_V3_ACL
+ help
+ Choose Y here if you want to access files residing on other
+ computers using Sun's Network File System protocol. To compile
+ this file system support as a module, choose M here: the module
+ will be called nfs.
+
+ To mount file systems exported by NFS servers, you also need to
+ install the user space mount.nfs command which can be found in
+ the Linux nfs-utils package, available from http://linux-nfs.org/.
+ Information about using the mount command is available in the
+ mount(8) man page. More detail about the Linux NFS client
+ implementation is available via the nfs(5) man page.
+
+ Below you can choose which versions of the NFS protocol are
+ available in the kernel to mount NFS servers. Support for NFS
+ version 2 (RFC 1094) is always available when NFS_FS is selected.
+
+ To configure a system which mounts its root file system via NFS
+ at boot time, say Y here, select "Kernel level IP
+ autoconfiguration" in the NETWORK menu, and select "Root file
+ system on NFS" below. You cannot compile this file system as a
+ module in this case.
+
+ If unsure, say N.
+
+config NFS_V2
+ tristate "NFS client support for NFS version 2"
+ depends on NFS_FS
+ default y
+ help
+ This option enables support for version 2 of the NFS protocol
+ (RFC 1094) in the kernel's NFS client.
+
+ If unsure, say Y.
+
+config NFS_V3
+ tristate "NFS client support for NFS version 3"
+ depends on NFS_FS
+ default y
+ help
+ This option enables support for version 3 of the NFS protocol
+ (RFC 1813) in the kernel's NFS client.
+
+ If unsure, say Y.
+
+config NFS_V3_ACL
+ bool "NFS client support for the NFSv3 ACL protocol extension"
+ depends on NFS_V3
+ help
+ Some NFS servers support an auxiliary NFSv3 ACL protocol that
+ Sun added to Solaris but never became an official part of the
+ NFS version 3 protocol. This protocol extension allows
+ applications on NFS clients to manipulate POSIX Access Control
+ Lists on files residing on NFS servers. NFS servers enforce
+ ACLs on local files whether this protocol is available or not.
+
+ Choose Y here if your NFS server supports the Solaris NFSv3 ACL
+ protocol extension and you want your NFS client to allow
+ applications to access and modify ACLs on files on the server.
+
+ Most NFS servers don't support the Solaris NFSv3 ACL protocol
+ extension. You can choose N here or specify the "noacl" mount
+ option to prevent your NFS client from trying to use the NFSv3
+ ACL protocol.
+
+ If unsure, say N.
+
+config NFS_V4
+ tristate "NFS client support for NFS version 4"
+ depends on NFS_FS
+ select KEYS
+ help
+ This option enables support for version 4 of the NFS protocol
+ (RFC 3530) in the kernel's NFS client.
+
+ To mount NFS servers using NFSv4, you also need to install user
+ space programs which can be found in the Linux nfs-utils package,
+ available from http://linux-nfs.org/.
+
+ If unsure, say Y.
+
+config NFS_SWAP
+ bool "Provide swap over NFS support"
+ default n
+ depends on NFS_FS && SWAP
+ select SUNRPC_SWAP
+ help
+ This option enables swapon to work on files located on NFS mounts.
+
+config NFS_V4_1
+ bool "NFS client support for NFSv4.1"
+ depends on NFS_V4
+ select SUNRPC_BACKCHANNEL
+ help
+ This option enables support for minor version 1 of the NFSv4 protocol
+ (RFC 5661) in the kernel's NFS client.
+
+ If unsure, say N.
+
+config NFS_V4_2
+ bool "NFS client support for NFSv4.2"
+ depends on NFS_V4_1
+ help
+ This option enables support for minor version 2 of the NFSv4 protocol
+ in the kernel's NFS client.
+
+ If unsure, say N.
+
+config PNFS_FILE_LAYOUT
+ tristate
+ depends on NFS_V4_1
+ default NFS_V4
+
+config PNFS_BLOCK
+ tristate
+ depends on NFS_V4_1 && BLK_DEV_DM
+ default NFS_V4
+
+config PNFS_FLEXFILE_LAYOUT
+ tristate
+ depends on NFS_V4_1 && NFS_V3
+ default NFS_V4
+
+config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
+ string "NFSv4.1 Implementation ID Domain"
+ depends on NFS_V4_1
+ default "kernel.org"
+ help
+ This option defines the domain portion of the implementation ID that
+ may be sent in the NFS exchange_id operation. The value must be in
+ the format of a DNS domain name and should be set to the DNS domain
+ name of the distribution.
+ If the NFS client is unchanged from the upstream kernel, this
+ option should be set to the default "kernel.org".
+
+config NFS_V4_1_MIGRATION
+ bool "NFSv4.1 client support for migration"
+ depends on NFS_V4_1
+ default n
+ help
+ This option makes the NFS client advertise to NFSv4.1 servers that
+ it can support NFSv4 migration.
+
+ The NFSv4.1 pieces of the Linux NFSv4 migration implementation are
+ still experimental. If you are not an NFSv4 developer, say N here.
+
+config NFS_V4_SECURITY_LABEL
+ bool
+ depends on NFS_V4_2 && SECURITY
+ default y
+
+config ROOT_NFS
+ bool "Root file system on NFS"
+ depends on NFS_FS=y && IP_PNP
+ help
+ If you want your system to mount its root file system via NFS,
+ choose Y here. This is common practice for managing systems
+ without local permanent storage. For details, read
+ <file:Documentation/admin-guide/nfs/nfsroot.rst>.
+
+ Most people say N here.
+
+config NFS_FSCACHE
+ bool "Provide NFS client caching support"
+ depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
+ select NETFS_SUPPORT
+ help
+ Say Y here if you want NFS data to be cached locally on disc through
+ the general filesystem cache manager
+
+config NFS_USE_LEGACY_DNS
+ bool "Use the legacy NFS DNS resolver"
+ depends on NFS_V4
+ help
+ The kernel now provides a method for translating a host name into an
+ IP address. Select Y here if you would rather use your own DNS
+ resolver script.
+
+ If unsure, say N
+
+config NFS_USE_KERNEL_DNS
+ bool
+ depends on NFS_V4 && !NFS_USE_LEGACY_DNS
+ select DNS_RESOLVER
+ default y
+
+config NFS_DEBUG
+ bool
+ depends on NFS_FS && SUNRPC_DEBUG
+ select CRC32
+ default y
+
+config NFS_DISABLE_UDP_SUPPORT
+ bool "NFS: Disable NFS UDP protocol support"
+ depends on NFS_FS
+ default y
+ help
+ Choose Y here to disable the use of NFS over UDP. NFS over UDP
+ on modern networks (1Gb+) can lead to data corruption caused by
+ fragmentation during high loads.
+
+config NFS_V4_2_READ_PLUS
+ bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation"
+ depends on NFS_V4_2
+ default y
+ help
+ Choose Y here to enable use of the NFS v4.2 READ_PLUS operation.
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
new file mode 100644
index 0000000000..5f6db37f46
--- /dev/null
+++ b/fs/nfs/Makefile
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the Linux nfs filesystem routines.
+#
+
+obj-$(CONFIG_NFS_FS) += nfs.o
+
+CFLAGS_nfstrace.o += -I$(src)
+nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
+ io.o direct.o pagelist.o read.o symlink.o unlink.o \
+ write.o namespace.o mount_clnt.o nfstrace.o \
+ export.o sysfs.o fs_context.o
+nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
+nfs-$(CONFIG_SYSCTL) += sysctl.o
+nfs-$(CONFIG_NFS_FSCACHE) += fscache.o
+
+obj-$(CONFIG_NFS_V2) += nfsv2.o
+nfsv2-y := nfs2super.o proc.o nfs2xdr.o
+
+obj-$(CONFIG_NFS_V3) += nfsv3.o
+nfsv3-y := nfs3super.o nfs3client.o nfs3proc.o nfs3xdr.o
+nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
+
+obj-$(CONFIG_NFS_V4) += nfsv4.o
+CFLAGS_nfs4trace.o += -I$(src)
+nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
+ delegation.o nfs4idmap.o callback.o callback_xdr.o callback_proc.o \
+ nfs4namespace.o nfs4getroot.o nfs4client.o nfs4session.o \
+ dns_resolve.o nfs4trace.o
+nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
+nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
+nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o
+nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o nfs42xattr.o
+
+obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
+obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
+obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
new file mode 100644
index 0000000000..7668a1bfb5
--- /dev/null
+++ b/fs/nfs/blocklayout/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the pNFS block layout driver kernel module
+#
+obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
+
+blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
new file mode 100644
index 0000000000..6be13e0ec1
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -0,0 +1,1030 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayout.c
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@citi.umich.edu>
+ * Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/bio.h> /* struct bio */
+#include <linux/prefetch.h>
+#include <linux/pagevec.h>
+
+#include "../pnfs.h"
+#include "../nfs4session.h"
+#include "../internal.h"
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
+MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
+
+static bool is_hole(struct pnfs_block_extent *be)
+{
+ switch (be->be_state) {
+ case PNFS_BLOCK_NONE_DATA:
+ return true;
+ case PNFS_BLOCK_INVALID_DATA:
+ return be->be_tag ? false : true;
+ default:
+ return false;
+ }
+}
+
+/* The data we are handed might be spread across several bios. We need
+ * to track when the last one is finished.
+ */
+struct parallel_io {
+ struct kref refcnt;
+ void (*pnfs_callback) (void *data);
+ void *data;
+};
+
+static inline struct parallel_io *alloc_parallel(void *data)
+{
+ struct parallel_io *rv;
+
+ rv = kmalloc(sizeof(*rv), GFP_NOFS);
+ if (rv) {
+ rv->data = data;
+ kref_init(&rv->refcnt);
+ }
+ return rv;
+}
+
+static inline void get_parallel(struct parallel_io *p)
+{
+ kref_get(&p->refcnt);
+}
+
+static void destroy_parallel(struct kref *kref)
+{
+ struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
+
+ dprintk("%s enter\n", __func__);
+ p->pnfs_callback(p->data);
+ kfree(p);
+}
+
+static inline void put_parallel(struct parallel_io *p)
+{
+ kref_put(&p->refcnt, destroy_parallel);
+}
+
+static struct bio *
+bl_submit_bio(struct bio *bio)
+{
+ if (bio) {
+ get_parallel(bio->bi_private);
+ dprintk("%s submitting %s bio %u@%llu\n", __func__,
+ bio_op(bio) == READ ? "read" : "write",
+ bio->bi_iter.bi_size,
+ (unsigned long long)bio->bi_iter.bi_sector);
+ submit_bio(bio);
+ }
+ return NULL;
+}
+
+static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map)
+{
+ return offset >= map->start && offset < map->start + map->len;
+}
+
+static struct bio *
+do_add_page_to_bio(struct bio *bio, int npg, enum req_op op, sector_t isect,
+ struct page *page, struct pnfs_block_dev_map *map,
+ struct pnfs_block_extent *be, bio_end_io_t end_io,
+ struct parallel_io *par, unsigned int offset, int *len)
+{
+ struct pnfs_block_dev *dev =
+ container_of(be->be_device, struct pnfs_block_dev, node);
+ u64 disk_addr, end;
+
+ dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
+ npg, (__force u32)op, (unsigned long long)isect, offset, *len);
+
+ /* translate to device offset */
+ isect += be->be_v_offset;
+ isect -= be->be_f_offset;
+
+ /* translate to physical disk offset */
+ disk_addr = (u64)isect << SECTOR_SHIFT;
+ if (!offset_in_map(disk_addr, map)) {
+ if (!dev->map(dev, disk_addr, map) || !offset_in_map(disk_addr, map))
+ return ERR_PTR(-EIO);
+ bio = bl_submit_bio(bio);
+ }
+ disk_addr += map->disk_offset;
+ disk_addr -= map->start;
+
+ /* limit length to what the device mapping allows */
+ end = disk_addr + *len;
+ if (end >= map->start + map->len)
+ *len = map->start + map->len - disk_addr;
+
+retry:
+ if (!bio) {
+ bio = bio_alloc(map->bdev, bio_max_segs(npg), op, GFP_NOIO);
+ bio->bi_iter.bi_sector = disk_addr >> SECTOR_SHIFT;
+ bio->bi_end_io = end_io;
+ bio->bi_private = par;
+ }
+ if (bio_add_page(bio, page, *len, offset) < *len) {
+ bio = bl_submit_bio(bio);
+ goto retry;
+ }
+ return bio;
+}
+
+static void bl_mark_devices_unavailable(struct nfs_pgio_header *header, bool rw)
+{
+ struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+ size_t bytes_left = header->args.count;
+ sector_t isect, extent_length = 0;
+ struct pnfs_block_extent be;
+
+ isect = header->args.offset >> SECTOR_SHIFT;
+ bytes_left += header->args.offset - (isect << SECTOR_SHIFT);
+
+ while (bytes_left > 0) {
+ if (!ext_tree_lookup(bl, isect, &be, rw))
+ return;
+ extent_length = be.be_length - (isect - be.be_f_offset);
+ nfs4_mark_deviceid_unavailable(be.be_device);
+ isect += extent_length;
+ if (bytes_left > extent_length << SECTOR_SHIFT)
+ bytes_left -= extent_length << SECTOR_SHIFT;
+ else
+ bytes_left = 0;
+ }
+}
+
+static void bl_end_io_read(struct bio *bio)
+{
+ struct parallel_io *par = bio->bi_private;
+
+ if (bio->bi_status) {
+ struct nfs_pgio_header *header = par->data;
+
+ if (!header->pnfs_error)
+ header->pnfs_error = -EIO;
+ pnfs_set_lo_fail(header->lseg);
+ bl_mark_devices_unavailable(header, false);
+ }
+
+ bio_put(bio);
+ put_parallel(par);
+}
+
+static void bl_read_cleanup(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_pgio_header *hdr;
+ dprintk("%s enter\n", __func__);
+ task = container_of(work, struct rpc_task, u.tk_work);
+ hdr = container_of(task, struct nfs_pgio_header, task);
+ pnfs_ld_read_done(hdr);
+}
+
+static void
+bl_end_par_io_read(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ hdr->task.tk_status = hdr->pnfs_error;
+ INIT_WORK(&hdr->task.u.tk_work, bl_read_cleanup);
+ schedule_work(&hdr->task.u.tk_work);
+}
+
+static enum pnfs_try_status
+bl_read_pagelist(struct nfs_pgio_header *header)
+{
+ struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+ struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
+ struct bio *bio = NULL;
+ struct pnfs_block_extent be;
+ sector_t isect, extent_length = 0;
+ struct parallel_io *par;
+ loff_t f_offset = header->args.offset;
+ size_t bytes_left = header->args.count;
+ unsigned int pg_offset = header->args.pgbase, pg_len;
+ struct page **pages = header->args.pages;
+ int pg_index = header->args.pgbase >> PAGE_SHIFT;
+ const bool is_dio = (header->dreq != NULL);
+ struct blk_plug plug;
+ int i;
+
+ dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
+ header->page_array.npages, f_offset,
+ (unsigned int)header->args.count);
+
+ par = alloc_parallel(header);
+ if (!par)
+ return PNFS_NOT_ATTEMPTED;
+ par->pnfs_callback = bl_end_par_io_read;
+
+ blk_start_plug(&plug);
+
+ isect = (sector_t) (f_offset >> SECTOR_SHIFT);
+ /* Code assumes extents are page-aligned */
+ for (i = pg_index; i < header->page_array.npages; i++) {
+ if (extent_length <= 0) {
+ /* We've used up the previous extent */
+ bio = bl_submit_bio(bio);
+
+ /* Get the next one */
+ if (!ext_tree_lookup(bl, isect, &be, false)) {
+ header->pnfs_error = -EIO;
+ goto out;
+ }
+ extent_length = be.be_length - (isect - be.be_f_offset);
+ }
+
+ if (is_dio) {
+ if (pg_offset + bytes_left > PAGE_SIZE)
+ pg_len = PAGE_SIZE - pg_offset;
+ else
+ pg_len = bytes_left;
+ } else {
+ BUG_ON(pg_offset != 0);
+ pg_len = PAGE_SIZE;
+ }
+
+ if (is_hole(&be)) {
+ bio = bl_submit_bio(bio);
+ /* Fill hole w/ zeroes w/o accessing device */
+ dprintk("%s Zeroing page for hole\n", __func__);
+ zero_user_segment(pages[i], pg_offset, pg_len);
+
+ /* invalidate map */
+ map.start = NFS4_MAX_UINT64;
+ } else {
+ bio = do_add_page_to_bio(bio,
+ header->page_array.npages - i,
+ REQ_OP_READ,
+ isect, pages[i], &map, &be,
+ bl_end_io_read, par,
+ pg_offset, &pg_len);
+ if (IS_ERR(bio)) {
+ header->pnfs_error = PTR_ERR(bio);
+ bio = NULL;
+ goto out;
+ }
+ }
+ isect += (pg_len >> SECTOR_SHIFT);
+ extent_length -= (pg_len >> SECTOR_SHIFT);
+ f_offset += pg_len;
+ bytes_left -= pg_len;
+ pg_offset = 0;
+ }
+ if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
+ header->res.eof = 1;
+ header->res.count = header->inode->i_size - header->args.offset;
+ } else {
+ header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
+ }
+out:
+ bl_submit_bio(bio);
+ blk_finish_plug(&plug);
+ put_parallel(par);
+ return PNFS_ATTEMPTED;
+}
+
+static void bl_end_io_write(struct bio *bio)
+{
+ struct parallel_io *par = bio->bi_private;
+ struct nfs_pgio_header *header = par->data;
+
+ if (bio->bi_status) {
+ if (!header->pnfs_error)
+ header->pnfs_error = -EIO;
+ pnfs_set_lo_fail(header->lseg);
+ bl_mark_devices_unavailable(header, true);
+ }
+ bio_put(bio);
+ put_parallel(par);
+}
+
+/* Function scheduled for call during bl_end_par_io_write,
+ * it marks sectors as written and extends the commitlist.
+ */
+static void bl_write_cleanup(struct work_struct *work)
+{
+ struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
+ struct nfs_pgio_header *hdr =
+ container_of(task, struct nfs_pgio_header, task);
+
+ dprintk("%s enter\n", __func__);
+
+ if (likely(!hdr->pnfs_error)) {
+ struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
+ u64 start = hdr->args.offset & (loff_t)PAGE_MASK;
+ u64 end = (hdr->args.offset + hdr->args.count +
+ PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
+ u64 lwb = hdr->args.offset + hdr->args.count;
+
+ ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
+ (end - start) >> SECTOR_SHIFT, lwb);
+ }
+
+ pnfs_ld_write_done(hdr);
+}
+
+/* Called when last of bios associated with a bl_write_pagelist call finishes */
+static void bl_end_par_io_write(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ hdr->task.tk_status = hdr->pnfs_error;
+ hdr->verf.committed = NFS_FILE_SYNC;
+ INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
+ schedule_work(&hdr->task.u.tk_work);
+}
+
+static enum pnfs_try_status
+bl_write_pagelist(struct nfs_pgio_header *header, int sync)
+{
+ struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+ struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
+ struct bio *bio = NULL;
+ struct pnfs_block_extent be;
+ sector_t isect, extent_length = 0;
+ struct parallel_io *par = NULL;
+ loff_t offset = header->args.offset;
+ size_t count = header->args.count;
+ struct page **pages = header->args.pages;
+ int pg_index = header->args.pgbase >> PAGE_SHIFT;
+ unsigned int pg_len;
+ struct blk_plug plug;
+ int i;
+
+ dprintk("%s enter, %zu@%lld\n", __func__, count, offset);
+
+ /* At this point, header->page_aray is a (sequential) list of nfs_pages.
+ * We want to write each, and if there is an error set pnfs_error
+ * to have it redone using nfs.
+ */
+ par = alloc_parallel(header);
+ if (!par)
+ return PNFS_NOT_ATTEMPTED;
+ par->pnfs_callback = bl_end_par_io_write;
+
+ blk_start_plug(&plug);
+
+ /* we always write out the whole page */
+ offset = offset & (loff_t)PAGE_MASK;
+ isect = offset >> SECTOR_SHIFT;
+
+ for (i = pg_index; i < header->page_array.npages; i++) {
+ if (extent_length <= 0) {
+ /* We've used up the previous extent */
+ bio = bl_submit_bio(bio);
+ /* Get the next one */
+ if (!ext_tree_lookup(bl, isect, &be, true)) {
+ header->pnfs_error = -EINVAL;
+ goto out;
+ }
+
+ extent_length = be.be_length - (isect - be.be_f_offset);
+ }
+
+ pg_len = PAGE_SIZE;
+ bio = do_add_page_to_bio(bio, header->page_array.npages - i,
+ REQ_OP_WRITE, isect, pages[i], &map,
+ &be, bl_end_io_write, par, 0, &pg_len);
+ if (IS_ERR(bio)) {
+ header->pnfs_error = PTR_ERR(bio);
+ bio = NULL;
+ goto out;
+ }
+
+ offset += pg_len;
+ count -= pg_len;
+ isect += (pg_len >> SECTOR_SHIFT);
+ extent_length -= (pg_len >> SECTOR_SHIFT);
+ }
+
+ header->res.count = header->args.count;
+out:
+ bl_submit_bio(bio);
+ blk_finish_plug(&plug);
+ put_parallel(par);
+ return PNFS_ATTEMPTED;
+}
+
+static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+ int err;
+
+ dprintk("%s enter\n", __func__);
+
+ err = ext_tree_remove(bl, true, 0, LLONG_MAX);
+ WARN_ON(err);
+
+ kfree_rcu(bl, bl_layout.plh_rcu);
+}
+
+static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
+ gfp_t gfp_flags, bool is_scsi_layout)
+{
+ struct pnfs_block_layout *bl;
+
+ dprintk("%s enter\n", __func__);
+ bl = kzalloc(sizeof(*bl), gfp_flags);
+ if (!bl)
+ return NULL;
+
+ bl->bl_ext_rw = RB_ROOT;
+ bl->bl_ext_ro = RB_ROOT;
+ spin_lock_init(&bl->bl_ext_lock);
+
+ bl->bl_scsi_layout = is_scsi_layout;
+ return &bl->bl_layout;
+}
+
+static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
+ gfp_t gfp_flags)
+{
+ return __bl_alloc_layout_hdr(inode, gfp_flags, false);
+}
+
+static struct pnfs_layout_hdr *sl_alloc_layout_hdr(struct inode *inode,
+ gfp_t gfp_flags)
+{
+ return __bl_alloc_layout_hdr(inode, gfp_flags, true);
+}
+
+static void bl_free_lseg(struct pnfs_layout_segment *lseg)
+{
+ dprintk("%s enter\n", __func__);
+ kfree(lseg);
+}
+
+/* Tracks info needed to ensure extents in layout obey constraints of spec */
+struct layout_verification {
+ u32 mode; /* R or RW */
+ u64 start; /* Expected start of next non-COW extent */
+ u64 inval; /* Start of INVAL coverage */
+ u64 cowread; /* End of COW read coverage */
+};
+
+/* Verify the extent meets the layout requirements of the pnfs-block draft,
+ * section 2.3.1.
+ */
+static int verify_extent(struct pnfs_block_extent *be,
+ struct layout_verification *lv)
+{
+ if (lv->mode == IOMODE_READ) {
+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
+ be->be_state == PNFS_BLOCK_INVALID_DATA)
+ return -EIO;
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ lv->start += be->be_length;
+ return 0;
+ }
+ /* lv->mode == IOMODE_RW */
+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ if (lv->cowread > lv->start)
+ return -EIO;
+ lv->start += be->be_length;
+ lv->inval = lv->start;
+ return 0;
+ } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ lv->start += be->be_length;
+ return 0;
+ } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
+ if (be->be_f_offset > lv->start)
+ return -EIO;
+ if (be->be_f_offset < lv->inval)
+ return -EIO;
+ if (be->be_f_offset < lv->cowread)
+ return -EIO;
+ /* It looks like you might want to min this with lv->start,
+ * but you really don't.
+ */
+ lv->inval = lv->inval + be->be_length;
+ lv->cowread = be->be_f_offset + be->be_length;
+ return 0;
+ } else
+ return -EIO;
+}
+
+static int decode_sector_number(__be32 **rp, sector_t *sp)
+{
+ uint64_t s;
+
+ *rp = xdr_decode_hyper(*rp, &s);
+ if (s & 0x1ff) {
+ printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
+ return -1;
+ }
+ *sp = s >> SECTOR_SHIFT;
+ return 0;
+}
+
+static struct nfs4_deviceid_node *
+bl_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, const struct cred *cred,
+ gfp_t gfp_mask)
+{
+ struct nfs4_deviceid_node *node;
+ unsigned long start, end;
+
+retry:
+ node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
+ if (!node)
+ return ERR_PTR(-ENODEV);
+
+ if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
+ return node;
+
+ end = jiffies;
+ start = end - PNFS_DEVICE_RETRY_TIMEOUT;
+ if (!time_in_range(node->timestamp_unavailable, start, end)) {
+ nfs4_delete_deviceid(node->ld, node->nfs_client, id);
+ goto retry;
+ }
+
+ nfs4_put_deviceid_node(node);
+ return ERR_PTR(-ENODEV);
+}
+
+static int
+bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
+ struct layout_verification *lv, struct list_head *extents,
+ gfp_t gfp_mask)
+{
+ struct pnfs_block_extent *be;
+ struct nfs4_deviceid id;
+ int error;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE);
+ if (!p)
+ return -EIO;
+
+ be = kzalloc(sizeof(*be), GFP_NOFS);
+ if (!be)
+ return -ENOMEM;
+
+ memcpy(&id, p, NFS4_DEVICEID4_SIZE);
+ p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+ be->be_device = bl_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
+ lo->plh_lc_cred, gfp_mask);
+ if (IS_ERR(be->be_device)) {
+ error = PTR_ERR(be->be_device);
+ goto out_free_be;
+ }
+
+ /*
+ * The next three values are read in as bytes, but stored in the
+ * extent structure in 512-byte granularity.
+ */
+ error = -EIO;
+ if (decode_sector_number(&p, &be->be_f_offset) < 0)
+ goto out_put_deviceid;
+ if (decode_sector_number(&p, &be->be_length) < 0)
+ goto out_put_deviceid;
+ if (decode_sector_number(&p, &be->be_v_offset) < 0)
+ goto out_put_deviceid;
+ be->be_state = be32_to_cpup(p++);
+
+ error = verify_extent(be, lv);
+ if (error) {
+ dprintk("%s: extent verification failed\n", __func__);
+ goto out_put_deviceid;
+ }
+
+ list_add_tail(&be->be_list, extents);
+ return 0;
+
+out_put_deviceid:
+ nfs4_put_deviceid_node(be->be_device);
+out_free_be:
+ kfree(be);
+ return error;
+}
+
+static struct pnfs_layout_segment *
+bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
+ gfp_t gfp_mask)
+{
+ struct layout_verification lv = {
+ .mode = lgr->range.iomode,
+ .start = lgr->range.offset >> SECTOR_SHIFT,
+ .inval = lgr->range.offset >> SECTOR_SHIFT,
+ .cowread = lgr->range.offset >> SECTOR_SHIFT,
+ };
+ struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+ struct pnfs_layout_segment *lseg;
+ struct xdr_buf buf;
+ struct xdr_stream xdr;
+ struct page *scratch;
+ int status, i;
+ uint32_t count;
+ __be32 *p;
+ LIST_HEAD(extents);
+
+ dprintk("---> %s\n", __func__);
+
+ lseg = kzalloc(sizeof(*lseg), gfp_mask);
+ if (!lseg)
+ return ERR_PTR(-ENOMEM);
+
+ status = -ENOMEM;
+ scratch = alloc_page(gfp_mask);
+ if (!scratch)
+ goto out;
+
+ xdr_init_decode_pages(&xdr, &buf,
+ lgr->layoutp->pages, lgr->layoutp->len);
+ xdr_set_scratch_page(&xdr, scratch);
+
+ status = -EIO;
+ p = xdr_inline_decode(&xdr, 4);
+ if (unlikely(!p))
+ goto out_free_scratch;
+
+ count = be32_to_cpup(p++);
+ dprintk("%s: number of extents %d\n", __func__, count);
+
+ /*
+ * Decode individual extents, putting them in temporary staging area
+ * until whole layout is decoded to make error recovery easier.
+ */
+ for (i = 0; i < count; i++) {
+ status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask);
+ if (status)
+ goto process_extents;
+ }
+
+ if (lgr->range.offset + lgr->range.length !=
+ lv.start << SECTOR_SHIFT) {
+ dprintk("%s Final length mismatch\n", __func__);
+ status = -EIO;
+ goto process_extents;
+ }
+
+ if (lv.start < lv.cowread) {
+ dprintk("%s Final uncovered COW extent\n", __func__);
+ status = -EIO;
+ }
+
+process_extents:
+ while (!list_empty(&extents)) {
+ struct pnfs_block_extent *be =
+ list_first_entry(&extents, struct pnfs_block_extent,
+ be_list);
+ list_del(&be->be_list);
+
+ if (!status)
+ status = ext_tree_insert(bl, be);
+
+ if (status) {
+ nfs4_put_deviceid_node(be->be_device);
+ kfree(be);
+ }
+ }
+
+out_free_scratch:
+ __free_page(scratch);
+out:
+ dprintk("%s returns %d\n", __func__, status);
+ switch (status) {
+ case -ENODEV:
+ /* Our extent block devices are unavailable */
+ set_bit(NFS_LSEG_UNAVAILABLE, &lseg->pls_flags);
+ fallthrough;
+ case 0:
+ return lseg;
+ default:
+ kfree(lseg);
+ return ERR_PTR(status);
+ }
+}
+
+static void
+bl_return_range(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range)
+{
+ struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+ sector_t offset = range->offset >> SECTOR_SHIFT, end;
+
+ if (range->offset % 8) {
+ dprintk("%s: offset %lld not block size aligned\n",
+ __func__, range->offset);
+ return;
+ }
+
+ if (range->length != NFS4_MAX_UINT64) {
+ if (range->length % 8) {
+ dprintk("%s: length %lld not block size aligned\n",
+ __func__, range->length);
+ return;
+ }
+
+ end = offset + (range->length >> SECTOR_SHIFT);
+ } else {
+ end = round_down(NFS4_MAX_UINT64, PAGE_SIZE);
+ }
+
+ ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
+}
+
+static int
+bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
+{
+ return ext_tree_prepare_commit(arg);
+}
+
+static void
+bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
+{
+ ext_tree_mark_committed(&lcdata->args, lcdata->res.status);
+}
+
+static int
+bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
+{
+ dprintk("%s enter\n", __func__);
+
+ if (server->pnfs_blksize == 0) {
+ dprintk("%s Server did not return blksize\n", __func__);
+ return -EINVAL;
+ }
+ if (server->pnfs_blksize > PAGE_SIZE) {
+ printk(KERN_ERR "%s: pNFS blksize %d not supported.\n",
+ __func__, server->pnfs_blksize);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static bool
+is_aligned_req(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req, unsigned int alignment, bool is_write)
+{
+ /*
+ * Always accept buffered writes, higher layers take care of the
+ * right alignment.
+ */
+ if (pgio->pg_dreq == NULL)
+ return true;
+
+ if (!IS_ALIGNED(req->wb_offset, alignment))
+ return false;
+
+ if (IS_ALIGNED(req->wb_bytes, alignment))
+ return true;
+
+ if (is_write &&
+ (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode))) {
+ /*
+ * If the write goes up to the inode size, just write
+ * the full page. Data past the inode size is
+ * guaranteed to be zeroed by the higher level client
+ * code, and this behaviour is mandated by RFC 5663
+ * section 2.3.2.
+ */
+ return true;
+ }
+
+ return false;
+}
+
+static void
+bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+ if (!is_aligned_req(pgio, req, SECTOR_SIZE, false)) {
+ nfs_pageio_reset_read_mds(pgio);
+ return;
+ }
+
+ pnfs_generic_pg_init_read(pgio, req);
+
+ if (pgio->pg_lseg &&
+ test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) {
+ pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg);
+ pnfs_set_lo_fail(pgio->pg_lseg);
+ nfs_pageio_reset_read_mds(pgio);
+ }
+}
+
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
+bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+ struct nfs_page *req)
+{
+ if (!is_aligned_req(pgio, req, SECTOR_SIZE, false))
+ return 0;
+ return pnfs_generic_pg_test(pgio, prev, req);
+}
+
+/*
+ * Return the number of contiguous bytes for a given inode
+ * starting at page frame idx.
+ */
+static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t end;
+
+ /* Optimize common case that writes from 0 to end of file */
+ end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (end != inode->i_mapping->nrpages) {
+ rcu_read_lock();
+ end = page_cache_next_miss(mapping, idx + 1, ULONG_MAX);
+ rcu_read_unlock();
+ }
+
+ if (!end)
+ return i_size_read(inode) - (idx << PAGE_SHIFT);
+ else
+ return (end - idx) << PAGE_SHIFT;
+}
+
+static void
+bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+ u64 wb_size;
+
+ if (!is_aligned_req(pgio, req, PAGE_SIZE, true)) {
+ nfs_pageio_reset_write_mds(pgio);
+ return;
+ }
+
+ if (pgio->pg_dreq == NULL)
+ wb_size = pnfs_num_cont_bytes(pgio->pg_inode, req->wb_index);
+ else
+ wb_size = nfs_dreq_bytes_left(pgio->pg_dreq, req_offset(req));
+
+ pnfs_generic_pg_init_write(pgio, req, wb_size);
+
+ if (pgio->pg_lseg &&
+ test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) {
+
+ pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg);
+ pnfs_set_lo_fail(pgio->pg_lseg);
+ nfs_pageio_reset_write_mds(pgio);
+ }
+}
+
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
+bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+ struct nfs_page *req)
+{
+ if (!is_aligned_req(pgio, req, PAGE_SIZE, true))
+ return 0;
+ return pnfs_generic_pg_test(pgio, prev, req);
+}
+
+static const struct nfs_pageio_ops bl_pg_read_ops = {
+ .pg_init = bl_pg_init_read,
+ .pg_test = bl_pg_test_read,
+ .pg_doio = pnfs_generic_pg_readpages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+static const struct nfs_pageio_ops bl_pg_write_ops = {
+ .pg_init = bl_pg_init_write,
+ .pg_test = bl_pg_test_write,
+ .pg_doio = pnfs_generic_pg_writepages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+static struct pnfs_layoutdriver_type blocklayout_type = {
+ .id = LAYOUT_BLOCK_VOLUME,
+ .name = "LAYOUT_BLOCK_VOLUME",
+ .owner = THIS_MODULE,
+ .flags = PNFS_LAYOUTRET_ON_SETATTR |
+ PNFS_LAYOUTRET_ON_ERROR |
+ PNFS_READ_WHOLE_PAGE,
+ .read_pagelist = bl_read_pagelist,
+ .write_pagelist = bl_write_pagelist,
+ .alloc_layout_hdr = bl_alloc_layout_hdr,
+ .free_layout_hdr = bl_free_layout_hdr,
+ .alloc_lseg = bl_alloc_lseg,
+ .free_lseg = bl_free_lseg,
+ .return_range = bl_return_range,
+ .prepare_layoutcommit = bl_prepare_layoutcommit,
+ .cleanup_layoutcommit = bl_cleanup_layoutcommit,
+ .set_layoutdriver = bl_set_layoutdriver,
+ .alloc_deviceid_node = bl_alloc_deviceid_node,
+ .free_deviceid_node = bl_free_deviceid_node,
+ .pg_read_ops = &bl_pg_read_ops,
+ .pg_write_ops = &bl_pg_write_ops,
+ .sync = pnfs_generic_sync,
+};
+
+static struct pnfs_layoutdriver_type scsilayout_type = {
+ .id = LAYOUT_SCSI,
+ .name = "LAYOUT_SCSI",
+ .owner = THIS_MODULE,
+ .flags = PNFS_LAYOUTRET_ON_SETATTR |
+ PNFS_LAYOUTRET_ON_ERROR |
+ PNFS_READ_WHOLE_PAGE,
+ .read_pagelist = bl_read_pagelist,
+ .write_pagelist = bl_write_pagelist,
+ .alloc_layout_hdr = sl_alloc_layout_hdr,
+ .free_layout_hdr = bl_free_layout_hdr,
+ .alloc_lseg = bl_alloc_lseg,
+ .free_lseg = bl_free_lseg,
+ .return_range = bl_return_range,
+ .prepare_layoutcommit = bl_prepare_layoutcommit,
+ .cleanup_layoutcommit = bl_cleanup_layoutcommit,
+ .set_layoutdriver = bl_set_layoutdriver,
+ .alloc_deviceid_node = bl_alloc_deviceid_node,
+ .free_deviceid_node = bl_free_deviceid_node,
+ .pg_read_ops = &bl_pg_read_ops,
+ .pg_write_ops = &bl_pg_write_ops,
+ .sync = pnfs_generic_sync,
+};
+
+
+static int __init nfs4blocklayout_init(void)
+{
+ int ret;
+
+ dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
+
+ ret = bl_init_pipefs();
+ if (ret)
+ goto out;
+
+ ret = pnfs_register_layoutdriver(&blocklayout_type);
+ if (ret)
+ goto out_cleanup_pipe;
+
+ ret = pnfs_register_layoutdriver(&scsilayout_type);
+ if (ret)
+ goto out_unregister_block;
+ return 0;
+
+out_unregister_block:
+ pnfs_unregister_layoutdriver(&blocklayout_type);
+out_cleanup_pipe:
+ bl_cleanup_pipefs();
+out:
+ return ret;
+}
+
+static void __exit nfs4blocklayout_exit(void)
+{
+ dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
+ __func__);
+
+ pnfs_unregister_layoutdriver(&scsilayout_type);
+ pnfs_unregister_layoutdriver(&blocklayout_type);
+ bl_cleanup_pipefs();
+}
+
+MODULE_ALIAS("nfs-layouttype4-3");
+MODULE_ALIAS("nfs-layouttype4-5");
+
+module_init(nfs4blocklayout_init);
+module_exit(nfs4blocklayout_exit);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
new file mode 100644
index 0000000000..716bc75e9e
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -0,0 +1,197 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayout.h
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@citi.umich.edu>
+ * Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
+#define FS_NFS_NFS4BLOCKLAYOUT_H
+
+#include <linux/device-mapper.h>
+#include <linux/nfs_fs.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
+#include "../nfs4_fs.h"
+#include "../pnfs.h"
+#include "../netns.h"
+
+#define PAGE_CACHE_SECTORS (PAGE_SIZE >> SECTOR_SHIFT)
+#define PAGE_CACHE_SECTOR_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
+
+struct pnfs_block_dev;
+
+#define PNFS_BLOCK_MAX_UUIDS 4
+#define PNFS_BLOCK_MAX_DEVICES 64
+
+/*
+ * Random upper cap for the uuid length to avoid unbounded allocation.
+ * Not actually limited by the protocol.
+ */
+#define PNFS_BLOCK_UUID_LEN 128
+
+struct pnfs_block_volume {
+ enum pnfs_block_volume_type type;
+ union {
+ struct {
+ int len;
+ int nr_sigs;
+ struct {
+ u64 offset;
+ u32 sig_len;
+ u8 sig[PNFS_BLOCK_UUID_LEN];
+ } sigs[PNFS_BLOCK_MAX_UUIDS];
+ } simple;
+ struct {
+ u64 start;
+ u64 len;
+ u32 volume;
+ } slice;
+ struct {
+ u32 volumes_count;
+ u32 volumes[PNFS_BLOCK_MAX_DEVICES];
+ } concat;
+ struct {
+ u64 chunk_size;
+ u32 volumes_count;
+ u32 volumes[PNFS_BLOCK_MAX_DEVICES];
+ } stripe;
+ struct {
+ enum scsi_code_set code_set;
+ enum scsi_designator_type designator_type;
+ int designator_len;
+ u8 designator[256];
+ u64 pr_key;
+ } scsi;
+ };
+};
+
+struct pnfs_block_dev_map {
+ u64 start;
+ u64 len;
+ u64 disk_offset;
+ struct block_device *bdev;
+};
+
+struct pnfs_block_dev {
+ struct nfs4_deviceid_node node;
+
+ u64 start;
+ u64 len;
+
+ u32 nr_children;
+ struct pnfs_block_dev *children;
+ u64 chunk_size;
+
+ struct block_device *bdev;
+ u64 disk_offset;
+
+ u64 pr_key;
+ bool pr_registered;
+
+ bool (*map)(struct pnfs_block_dev *dev, u64 offset,
+ struct pnfs_block_dev_map *map);
+};
+
+/* sector_t fields are all in 512-byte sectors */
+struct pnfs_block_extent {
+ union {
+ struct rb_node be_node;
+ struct list_head be_list;
+ };
+ struct nfs4_deviceid_node *be_device;
+ sector_t be_f_offset; /* the starting offset in the file */
+ sector_t be_length; /* the size of the extent */
+ sector_t be_v_offset; /* the starting offset in the volume */
+ enum pnfs_block_extent_state be_state; /* the state of this extent */
+#define EXTENT_WRITTEN 1
+#define EXTENT_COMMITTING 2
+ unsigned int be_tag;
+};
+
+struct pnfs_block_layout {
+ struct pnfs_layout_hdr bl_layout;
+ struct rb_root bl_ext_rw;
+ struct rb_root bl_ext_ro;
+ spinlock_t bl_ext_lock; /* Protects list manipulation */
+ bool bl_scsi_layout;
+ u64 bl_lwb;
+};
+
+static inline struct pnfs_block_layout *
+BLK_LO2EXT(struct pnfs_layout_hdr *lo)
+{
+ return container_of(lo, struct pnfs_block_layout, bl_layout);
+}
+
+static inline struct pnfs_block_layout *
+BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
+{
+ return BLK_LO2EXT(lseg->pls_layout);
+}
+
+struct bl_pipe_msg {
+ struct rpc_pipe_msg msg;
+ wait_queue_head_t *bl_wq;
+};
+
+struct bl_msg_hdr {
+ u8 type;
+ u16 totallen; /* length of entire message, including hdr itself */
+};
+
+#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */
+#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/
+#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */
+#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
+#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
+
+/* dev.c */
+struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_mask);
+void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
+
+/* extent_tree.c */
+int ext_tree_insert(struct pnfs_block_layout *bl,
+ struct pnfs_block_extent *new);
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
+ sector_t end);
+int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+ sector_t len, u64 lwb);
+bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+ struct pnfs_block_extent *ret, bool rw);
+int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
+void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
+
+/* rpc_pipefs.c */
+dev_t bl_resolve_deviceid(struct nfs_server *server,
+ struct pnfs_block_volume *b, gfp_t gfp_mask);
+int __init bl_init_pipefs(void);
+void bl_cleanup_pipefs(void);
+
+#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
new file mode 100644
index 0000000000..65cbb5607a
--- /dev/null
+++ b/fs/nfs/blocklayout/dev.c
@@ -0,0 +1,530 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014-2016 Christoph Hellwig.
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/blkdev.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_xdr.h>
+#include <linux/pr.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+static void
+bl_free_device(struct pnfs_block_dev *dev)
+{
+ if (dev->nr_children) {
+ int i;
+
+ for (i = 0; i < dev->nr_children; i++)
+ bl_free_device(&dev->children[i]);
+ kfree(dev->children);
+ } else {
+ if (dev->pr_registered) {
+ const struct pr_ops *ops =
+ dev->bdev->bd_disk->fops->pr_ops;
+ int error;
+
+ error = ops->pr_register(dev->bdev, dev->pr_key, 0,
+ false);
+ if (error)
+ pr_err("failed to unregister PR key.\n");
+ }
+
+ if (dev->bdev)
+ blkdev_put(dev->bdev, NULL);
+ }
+}
+
+void
+bl_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+ struct pnfs_block_dev *dev =
+ container_of(d, struct pnfs_block_dev, node);
+
+ bl_free_device(dev);
+ kfree_rcu(dev, node.rcu);
+}
+
+static int
+nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
+{
+ __be32 *p;
+ int i;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (!p)
+ return -EIO;
+ b->type = be32_to_cpup(p++);
+
+ switch (b->type) {
+ case PNFS_BLOCK_VOLUME_SIMPLE:
+ p = xdr_inline_decode(xdr, 4);
+ if (!p)
+ return -EIO;
+ b->simple.nr_sigs = be32_to_cpup(p++);
+ if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) {
+ dprintk("Bad signature count: %d\n", b->simple.nr_sigs);
+ return -EIO;
+ }
+
+ b->simple.len = 4 + 4;
+ for (i = 0; i < b->simple.nr_sigs; i++) {
+ p = xdr_inline_decode(xdr, 8 + 4);
+ if (!p)
+ return -EIO;
+ p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
+ b->simple.sigs[i].sig_len = be32_to_cpup(p++);
+ if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) {
+ pr_info("signature too long: %d\n",
+ b->simple.sigs[i].sig_len);
+ return -EIO;
+ }
+
+ p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
+ if (!p)
+ return -EIO;
+ memcpy(&b->simple.sigs[i].sig, p,
+ b->simple.sigs[i].sig_len);
+
+ b->simple.len += 8 + 4 + \
+ (XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2);
+ }
+ break;
+ case PNFS_BLOCK_VOLUME_SLICE:
+ p = xdr_inline_decode(xdr, 8 + 8 + 4);
+ if (!p)
+ return -EIO;
+ p = xdr_decode_hyper(p, &b->slice.start);
+ p = xdr_decode_hyper(p, &b->slice.len);
+ b->slice.volume = be32_to_cpup(p++);
+ break;
+ case PNFS_BLOCK_VOLUME_CONCAT:
+ p = xdr_inline_decode(xdr, 4);
+ if (!p)
+ return -EIO;
+
+ b->concat.volumes_count = be32_to_cpup(p++);
+ if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
+ dprintk("Too many volumes: %d\n", b->concat.volumes_count);
+ return -EIO;
+ }
+
+ p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
+ if (!p)
+ return -EIO;
+ for (i = 0; i < b->concat.volumes_count; i++)
+ b->concat.volumes[i] = be32_to_cpup(p++);
+ break;
+ case PNFS_BLOCK_VOLUME_STRIPE:
+ p = xdr_inline_decode(xdr, 8 + 4);
+ if (!p)
+ return -EIO;
+
+ p = xdr_decode_hyper(p, &b->stripe.chunk_size);
+ b->stripe.volumes_count = be32_to_cpup(p++);
+ if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
+ dprintk("Too many volumes: %d\n", b->stripe.volumes_count);
+ return -EIO;
+ }
+
+ p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
+ if (!p)
+ return -EIO;
+ for (i = 0; i < b->stripe.volumes_count; i++)
+ b->stripe.volumes[i] = be32_to_cpup(p++);
+ break;
+ case PNFS_BLOCK_VOLUME_SCSI:
+ p = xdr_inline_decode(xdr, 4 + 4 + 4);
+ if (!p)
+ return -EIO;
+ b->scsi.code_set = be32_to_cpup(p++);
+ b->scsi.designator_type = be32_to_cpup(p++);
+ b->scsi.designator_len = be32_to_cpup(p++);
+ p = xdr_inline_decode(xdr, b->scsi.designator_len);
+ if (!p)
+ return -EIO;
+ if (b->scsi.designator_len > 256)
+ return -EIO;
+ memcpy(&b->scsi.designator, p, b->scsi.designator_len);
+ p = xdr_inline_decode(xdr, 8);
+ if (!p)
+ return -EIO;
+ p = xdr_decode_hyper(p, &b->scsi.pr_key);
+ break;
+ default:
+ dprintk("unknown volume type!\n");
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
+ struct pnfs_block_dev_map *map)
+{
+ map->start = dev->start;
+ map->len = dev->len;
+ map->disk_offset = dev->disk_offset;
+ map->bdev = dev->bdev;
+ return true;
+}
+
+static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
+ struct pnfs_block_dev_map *map)
+{
+ int i;
+
+ for (i = 0; i < dev->nr_children; i++) {
+ struct pnfs_block_dev *child = &dev->children[i];
+
+ if (child->start > offset ||
+ child->start + child->len <= offset)
+ continue;
+
+ child->map(child, offset - child->start, map);
+ return true;
+ }
+
+ dprintk("%s: ran off loop!\n", __func__);
+ return false;
+}
+
+static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
+ struct pnfs_block_dev_map *map)
+{
+ struct pnfs_block_dev *child;
+ u64 chunk;
+ u32 chunk_idx;
+ u64 disk_offset;
+
+ chunk = div_u64(offset, dev->chunk_size);
+ div_u64_rem(chunk, dev->nr_children, &chunk_idx);
+
+ if (chunk_idx >= dev->nr_children) {
+ dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
+ __func__, chunk_idx, offset, dev->chunk_size);
+ /* error, should not happen */
+ return false;
+ }
+
+ /* truncate offset to the beginning of the stripe */
+ offset = chunk * dev->chunk_size;
+
+ /* disk offset of the stripe */
+ disk_offset = div_u64(offset, dev->nr_children);
+
+ child = &dev->children[chunk_idx];
+ child->map(child, disk_offset, map);
+
+ map->start += offset;
+ map->disk_offset += disk_offset;
+ map->len = dev->chunk_size;
+ return true;
+}
+
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
+
+
+static int
+bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+ struct pnfs_block_volume *v = &volumes[idx];
+ struct block_device *bdev;
+ dev_t dev;
+
+ dev = bl_resolve_deviceid(server, v, gfp_mask);
+ if (!dev)
+ return -EIO;
+
+ bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL,
+ NULL);
+ if (IS_ERR(bdev)) {
+ printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
+ MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
+ return PTR_ERR(bdev);
+ }
+ d->bdev = bdev;
+
+
+ d->len = bdev_nr_bytes(d->bdev);
+ d->map = bl_map_simple;
+
+ printk(KERN_INFO "pNFS: using block device %s\n",
+ d->bdev->bd_disk->disk_name);
+ return 0;
+}
+
+static bool
+bl_validate_designator(struct pnfs_block_volume *v)
+{
+ switch (v->scsi.designator_type) {
+ case PS_DESIGNATOR_EUI64:
+ if (v->scsi.code_set != PS_CODE_SET_BINARY)
+ return false;
+
+ if (v->scsi.designator_len != 8 &&
+ v->scsi.designator_len != 10 &&
+ v->scsi.designator_len != 16)
+ return false;
+
+ return true;
+ case PS_DESIGNATOR_NAA:
+ if (v->scsi.code_set != PS_CODE_SET_BINARY)
+ return false;
+
+ if (v->scsi.designator_len != 8 &&
+ v->scsi.designator_len != 16)
+ return false;
+
+ return true;
+ case PS_DESIGNATOR_T10:
+ case PS_DESIGNATOR_NAME:
+ pr_err("pNFS: unsupported designator "
+ "(code set %d, type %d, len %d.\n",
+ v->scsi.code_set,
+ v->scsi.designator_type,
+ v->scsi.designator_len);
+ return false;
+ default:
+ pr_err("pNFS: invalid designator "
+ "(code set %d, type %d, len %d.\n",
+ v->scsi.code_set,
+ v->scsi.designator_type,
+ v->scsi.designator_len);
+ return false;
+ }
+}
+
+static struct block_device *
+bl_open_path(struct pnfs_block_volume *v, const char *prefix)
+{
+ struct block_device *bdev;
+ const char *devname;
+
+ devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN",
+ prefix, v->scsi.designator_len, v->scsi.designator);
+ if (!devname)
+ return ERR_PTR(-ENOMEM);
+
+ bdev = blkdev_get_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL,
+ NULL);
+ if (IS_ERR(bdev)) {
+ pr_warn("pNFS: failed to open device %s (%ld)\n",
+ devname, PTR_ERR(bdev));
+ }
+
+ kfree(devname);
+ return bdev;
+}
+
+static int
+bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+ struct pnfs_block_volume *v = &volumes[idx];
+ struct block_device *bdev;
+ const struct pr_ops *ops;
+ int error;
+
+ if (!bl_validate_designator(v))
+ return -EINVAL;
+
+ /*
+ * Try to open the RH/Fedora specific dm-mpath udev path first, as the
+ * wwn- links will only point to the first discovered SCSI device there.
+ * On other distributions like Debian, the default SCSI by-id path will
+ * point to the dm-multipath device if one exists.
+ */
+ bdev = bl_open_path(v, "dm-uuid-mpath-0x");
+ if (IS_ERR(bdev))
+ bdev = bl_open_path(v, "wwn-0x");
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+ d->bdev = bdev;
+
+ d->len = bdev_nr_bytes(d->bdev);
+ d->map = bl_map_simple;
+ d->pr_key = v->scsi.pr_key;
+
+ pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
+ d->bdev->bd_disk->disk_name, d->pr_key);
+
+ ops = d->bdev->bd_disk->fops->pr_ops;
+ if (!ops) {
+ pr_err("pNFS: block device %s does not support reservations.",
+ d->bdev->bd_disk->disk_name);
+ error = -EINVAL;
+ goto out_blkdev_put;
+ }
+
+ error = ops->pr_register(d->bdev, 0, d->pr_key, true);
+ if (error) {
+ pr_err("pNFS: failed to register key for block device %s.",
+ d->bdev->bd_disk->disk_name);
+ goto out_blkdev_put;
+ }
+
+ d->pr_registered = true;
+ return 0;
+
+out_blkdev_put:
+ blkdev_put(d->bdev, NULL);
+ return error;
+}
+
+static int
+bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+ struct pnfs_block_volume *v = &volumes[idx];
+ int ret;
+
+ ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask);
+ if (ret)
+ return ret;
+
+ d->disk_offset = v->slice.start;
+ d->len = v->slice.len;
+ return 0;
+}
+
+static int
+bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+ struct pnfs_block_volume *v = &volumes[idx];
+ u64 len = 0;
+ int ret, i;
+
+ d->children = kcalloc(v->concat.volumes_count,
+ sizeof(struct pnfs_block_dev), gfp_mask);
+ if (!d->children)
+ return -ENOMEM;
+
+ for (i = 0; i < v->concat.volumes_count; i++) {
+ ret = bl_parse_deviceid(server, &d->children[i],
+ volumes, v->concat.volumes[i], gfp_mask);
+ if (ret)
+ return ret;
+
+ d->nr_children++;
+ d->children[i].start += len;
+ len += d->children[i].len;
+ }
+
+ d->len = len;
+ d->map = bl_map_concat;
+ return 0;
+}
+
+static int
+bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+ struct pnfs_block_volume *v = &volumes[idx];
+ u64 len = 0;
+ int ret, i;
+
+ d->children = kcalloc(v->stripe.volumes_count,
+ sizeof(struct pnfs_block_dev), gfp_mask);
+ if (!d->children)
+ return -ENOMEM;
+
+ for (i = 0; i < v->stripe.volumes_count; i++) {
+ ret = bl_parse_deviceid(server, &d->children[i],
+ volumes, v->stripe.volumes[i], gfp_mask);
+ if (ret)
+ return ret;
+
+ d->nr_children++;
+ len += d->children[i].len;
+ }
+
+ d->len = len;
+ d->chunk_size = v->stripe.chunk_size;
+ d->map = bl_map_stripe;
+ return 0;
+}
+
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+ switch (volumes[idx].type) {
+ case PNFS_BLOCK_VOLUME_SIMPLE:
+ return bl_parse_simple(server, d, volumes, idx, gfp_mask);
+ case PNFS_BLOCK_VOLUME_SLICE:
+ return bl_parse_slice(server, d, volumes, idx, gfp_mask);
+ case PNFS_BLOCK_VOLUME_CONCAT:
+ return bl_parse_concat(server, d, volumes, idx, gfp_mask);
+ case PNFS_BLOCK_VOLUME_STRIPE:
+ return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
+ case PNFS_BLOCK_VOLUME_SCSI:
+ return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
+ default:
+ dprintk("unsupported volume type: %d\n", volumes[idx].type);
+ return -EIO;
+ }
+}
+
+struct nfs4_deviceid_node *
+bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_mask)
+{
+ struct nfs4_deviceid_node *node = NULL;
+ struct pnfs_block_volume *volumes;
+ struct pnfs_block_dev *top;
+ struct xdr_stream xdr;
+ struct xdr_buf buf;
+ struct page *scratch;
+ int nr_volumes, ret, i;
+ __be32 *p;
+
+ scratch = alloc_page(gfp_mask);
+ if (!scratch)
+ goto out;
+
+ xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
+ xdr_set_scratch_page(&xdr, scratch);
+
+ p = xdr_inline_decode(&xdr, sizeof(__be32));
+ if (!p)
+ goto out_free_scratch;
+ nr_volumes = be32_to_cpup(p++);
+
+ volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
+ gfp_mask);
+ if (!volumes)
+ goto out_free_scratch;
+
+ for (i = 0; i < nr_volumes; i++) {
+ ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
+ if (ret < 0)
+ goto out_free_volumes;
+ }
+
+ top = kzalloc(sizeof(*top), gfp_mask);
+ if (!top)
+ goto out_free_volumes;
+
+ ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
+
+ node = &top->node;
+ nfs4_init_deviceid_node(node, server, &pdev->dev_id);
+ if (ret)
+ nfs4_mark_deviceid_unavailable(node);
+
+out_free_volumes:
+ kfree(volumes);
+out_free_scratch:
+ __free_page(scratch);
+out:
+ return node;
+}
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
new file mode 100644
index 0000000000..8f7cff7a42
--- /dev/null
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -0,0 +1,647 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014-2016 Christoph Hellwig.
+ */
+
+#include <linux/vmalloc.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+static inline struct pnfs_block_extent *
+ext_node(struct rb_node *node)
+{
+ return rb_entry(node, struct pnfs_block_extent, be_node);
+}
+
+static struct pnfs_block_extent *
+ext_tree_first(struct rb_root *root)
+{
+ struct rb_node *node = rb_first(root);
+ return node ? ext_node(node) : NULL;
+}
+
+static struct pnfs_block_extent *
+ext_tree_prev(struct pnfs_block_extent *be)
+{
+ struct rb_node *node = rb_prev(&be->be_node);
+ return node ? ext_node(node) : NULL;
+}
+
+static struct pnfs_block_extent *
+ext_tree_next(struct pnfs_block_extent *be)
+{
+ struct rb_node *node = rb_next(&be->be_node);
+ return node ? ext_node(node) : NULL;
+}
+
+static inline sector_t
+ext_f_end(struct pnfs_block_extent *be)
+{
+ return be->be_f_offset + be->be_length;
+}
+
+static struct pnfs_block_extent *
+__ext_tree_search(struct rb_root *root, sector_t start)
+{
+ struct rb_node *node = root->rb_node;
+ struct pnfs_block_extent *be = NULL;
+
+ while (node) {
+ be = ext_node(node);
+ if (start < be->be_f_offset)
+ node = node->rb_left;
+ else if (start >= ext_f_end(be))
+ node = node->rb_right;
+ else
+ return be;
+ }
+
+ if (be) {
+ if (start < be->be_f_offset)
+ return be;
+
+ if (start >= ext_f_end(be))
+ return ext_tree_next(be);
+ }
+
+ return NULL;
+}
+
+static bool
+ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
+{
+ if (be1->be_state != be2->be_state)
+ return false;
+ if (be1->be_device != be2->be_device)
+ return false;
+
+ if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
+ return false;
+
+ if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
+ (be1->be_v_offset + be1->be_length != be2->be_v_offset))
+ return false;
+
+ if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
+ be1->be_tag != be2->be_tag)
+ return false;
+
+ return true;
+}
+
+static struct pnfs_block_extent *
+ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
+{
+ struct pnfs_block_extent *left = ext_tree_prev(be);
+
+ if (left && ext_can_merge(left, be)) {
+ left->be_length += be->be_length;
+ rb_erase(&be->be_node, root);
+ nfs4_put_deviceid_node(be->be_device);
+ kfree(be);
+ return left;
+ }
+
+ return be;
+}
+
+static struct pnfs_block_extent *
+ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
+{
+ struct pnfs_block_extent *right = ext_tree_next(be);
+
+ if (right && ext_can_merge(be, right)) {
+ be->be_length += right->be_length;
+ rb_erase(&right->be_node, root);
+ nfs4_put_deviceid_node(right->be_device);
+ kfree(right);
+ }
+
+ return be;
+}
+
+static void __ext_put_deviceids(struct list_head *head)
+{
+ struct pnfs_block_extent *be, *tmp;
+
+ list_for_each_entry_safe(be, tmp, head, be_list) {
+ nfs4_put_deviceid_node(be->be_device);
+ kfree(be);
+ }
+}
+
+static void
+__ext_tree_insert(struct rb_root *root,
+ struct pnfs_block_extent *new, bool merge_ok)
+{
+ struct rb_node **p = &root->rb_node, *parent = NULL;
+ struct pnfs_block_extent *be;
+
+ while (*p) {
+ parent = *p;
+ be = ext_node(parent);
+
+ if (new->be_f_offset < be->be_f_offset) {
+ if (merge_ok && ext_can_merge(new, be)) {
+ be->be_f_offset = new->be_f_offset;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA)
+ be->be_v_offset = new->be_v_offset;
+ be->be_length += new->be_length;
+ be = ext_try_to_merge_left(root, be);
+ goto free_new;
+ }
+ p = &(*p)->rb_left;
+ } else if (new->be_f_offset >= ext_f_end(be)) {
+ if (merge_ok && ext_can_merge(be, new)) {
+ be->be_length += new->be_length;
+ be = ext_try_to_merge_right(root, be);
+ goto free_new;
+ }
+ p = &(*p)->rb_right;
+ } else {
+ BUG();
+ }
+ }
+
+ rb_link_node(&new->be_node, parent, p);
+ rb_insert_color(&new->be_node, root);
+ return;
+free_new:
+ nfs4_put_deviceid_node(new->be_device);
+ kfree(new);
+}
+
+static int
+__ext_tree_remove(struct rb_root *root,
+ sector_t start, sector_t end, struct list_head *tmp)
+{
+ struct pnfs_block_extent *be;
+ sector_t len1 = 0, len2 = 0;
+ sector_t orig_v_offset;
+ sector_t orig_len;
+
+ be = __ext_tree_search(root, start);
+ if (!be)
+ return 0;
+ if (be->be_f_offset >= end)
+ return 0;
+
+ orig_v_offset = be->be_v_offset;
+ orig_len = be->be_length;
+
+ if (start > be->be_f_offset)
+ len1 = start - be->be_f_offset;
+ if (ext_f_end(be) > end)
+ len2 = ext_f_end(be) - end;
+
+ if (len2 > 0) {
+ if (len1 > 0) {
+ struct pnfs_block_extent *new;
+
+ new = kzalloc(sizeof(*new), GFP_ATOMIC);
+ if (!new)
+ return -ENOMEM;
+
+ be->be_length = len1;
+
+ new->be_f_offset = end;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+ new->be_v_offset =
+ orig_v_offset + orig_len - len2;
+ }
+ new->be_length = len2;
+ new->be_state = be->be_state;
+ new->be_tag = be->be_tag;
+ new->be_device = nfs4_get_deviceid(be->be_device);
+
+ __ext_tree_insert(root, new, true);
+ } else {
+ be->be_f_offset = end;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+ be->be_v_offset =
+ orig_v_offset + orig_len - len2;
+ }
+ be->be_length = len2;
+ }
+ } else {
+ if (len1 > 0) {
+ be->be_length = len1;
+ be = ext_tree_next(be);
+ }
+
+ while (be && ext_f_end(be) <= end) {
+ struct pnfs_block_extent *next = ext_tree_next(be);
+
+ rb_erase(&be->be_node, root);
+ list_add_tail(&be->be_list, tmp);
+ be = next;
+ }
+
+ if (be && be->be_f_offset < end) {
+ len1 = ext_f_end(be) - end;
+ be->be_f_offset = end;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA)
+ be->be_v_offset += be->be_length - len1;
+ be->be_length = len1;
+ }
+ }
+
+ return 0;
+}
+
+int
+ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
+{
+ struct pnfs_block_extent *be;
+ struct rb_root *root;
+ int err = 0;
+
+ switch (new->be_state) {
+ case PNFS_BLOCK_READWRITE_DATA:
+ case PNFS_BLOCK_INVALID_DATA:
+ root = &bl->bl_ext_rw;
+ break;
+ case PNFS_BLOCK_READ_DATA:
+ case PNFS_BLOCK_NONE_DATA:
+ root = &bl->bl_ext_ro;
+ break;
+ default:
+ dprintk("invalid extent type\n");
+ return -EINVAL;
+ }
+
+ spin_lock(&bl->bl_ext_lock);
+retry:
+ be = __ext_tree_search(root, new->be_f_offset);
+ if (!be || be->be_f_offset >= ext_f_end(new)) {
+ __ext_tree_insert(root, new, true);
+ } else if (new->be_f_offset >= be->be_f_offset) {
+ if (ext_f_end(new) <= ext_f_end(be)) {
+ nfs4_put_deviceid_node(new->be_device);
+ kfree(new);
+ } else {
+ sector_t new_len = ext_f_end(new) - ext_f_end(be);
+ sector_t diff = new->be_length - new_len;
+
+ new->be_f_offset += diff;
+ new->be_v_offset += diff;
+ new->be_length = new_len;
+ goto retry;
+ }
+ } else if (ext_f_end(new) <= ext_f_end(be)) {
+ new->be_length = be->be_f_offset - new->be_f_offset;
+ __ext_tree_insert(root, new, true);
+ } else {
+ struct pnfs_block_extent *split;
+ sector_t new_len = ext_f_end(new) - ext_f_end(be);
+ sector_t diff = new->be_length - new_len;
+
+ split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
+ if (!split) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ split->be_length = be->be_f_offset - split->be_f_offset;
+ split->be_device = nfs4_get_deviceid(new->be_device);
+ __ext_tree_insert(root, split, true);
+
+ new->be_f_offset += diff;
+ new->be_v_offset += diff;
+ new->be_length = new_len;
+ goto retry;
+ }
+out:
+ spin_unlock(&bl->bl_ext_lock);
+ return err;
+}
+
+static bool
+__ext_tree_lookup(struct rb_root *root, sector_t isect,
+ struct pnfs_block_extent *ret)
+{
+ struct rb_node *node;
+ struct pnfs_block_extent *be;
+
+ node = root->rb_node;
+ while (node) {
+ be = ext_node(node);
+ if (isect < be->be_f_offset)
+ node = node->rb_left;
+ else if (isect >= ext_f_end(be))
+ node = node->rb_right;
+ else {
+ *ret = *be;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool
+ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+ struct pnfs_block_extent *ret, bool rw)
+{
+ bool found = false;
+
+ spin_lock(&bl->bl_ext_lock);
+ if (!rw)
+ found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
+ if (!found)
+ found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
+ spin_unlock(&bl->bl_ext_lock);
+
+ return found;
+}
+
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
+ sector_t start, sector_t end)
+{
+ int err, err2;
+ LIST_HEAD(tmp);
+
+ spin_lock(&bl->bl_ext_lock);
+ err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
+ if (rw) {
+ err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
+ if (!err)
+ err = err2;
+ }
+ spin_unlock(&bl->bl_ext_lock);
+
+ __ext_put_deviceids(&tmp);
+ return err;
+}
+
+static int
+ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
+ sector_t split)
+{
+ struct pnfs_block_extent *new;
+ sector_t orig_len = be->be_length;
+
+ new = kzalloc(sizeof(*new), GFP_ATOMIC);
+ if (!new)
+ return -ENOMEM;
+
+ be->be_length = split - be->be_f_offset;
+
+ new->be_f_offset = split;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA)
+ new->be_v_offset = be->be_v_offset + be->be_length;
+ new->be_length = orig_len - be->be_length;
+ new->be_state = be->be_state;
+ new->be_tag = be->be_tag;
+ new->be_device = nfs4_get_deviceid(be->be_device);
+
+ __ext_tree_insert(root, new, false);
+ return 0;
+}
+
+int
+ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+ sector_t len, u64 lwb)
+{
+ struct rb_root *root = &bl->bl_ext_rw;
+ sector_t end = start + len;
+ struct pnfs_block_extent *be;
+ int err = 0;
+ LIST_HEAD(tmp);
+
+ spin_lock(&bl->bl_ext_lock);
+ /*
+ * First remove all COW extents or holes from written to range.
+ */
+ err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
+ if (err)
+ goto out;
+
+ /*
+ * Then mark all invalid extents in the range as written to.
+ */
+ for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
+ if (be->be_f_offset >= end)
+ break;
+
+ if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
+ continue;
+
+ if (be->be_f_offset < start) {
+ struct pnfs_block_extent *left = ext_tree_prev(be);
+
+ if (left && ext_can_merge(left, be)) {
+ sector_t diff = start - be->be_f_offset;
+
+ left->be_length += diff;
+
+ be->be_f_offset += diff;
+ be->be_v_offset += diff;
+ be->be_length -= diff;
+ } else {
+ err = ext_tree_split(root, be, start);
+ if (err)
+ goto out;
+ }
+ }
+
+ if (ext_f_end(be) > end) {
+ struct pnfs_block_extent *right = ext_tree_next(be);
+
+ if (right && ext_can_merge(be, right)) {
+ sector_t diff = end - be->be_f_offset;
+
+ be->be_length -= diff;
+
+ right->be_f_offset -= diff;
+ right->be_v_offset -= diff;
+ right->be_length += diff;
+ } else {
+ err = ext_tree_split(root, be, end);
+ if (err)
+ goto out;
+ }
+ }
+
+ if (be->be_f_offset >= start && ext_f_end(be) <= end) {
+ be->be_tag = EXTENT_WRITTEN;
+ be = ext_try_to_merge_left(root, be);
+ be = ext_try_to_merge_right(root, be);
+ }
+ }
+out:
+ if (bl->bl_lwb < lwb)
+ bl->bl_lwb = lwb;
+ spin_unlock(&bl->bl_ext_lock);
+
+ __ext_put_deviceids(&tmp);
+ return err;
+}
+
+static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
+{
+ if (bl->bl_scsi_layout)
+ return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
+ else
+ return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
+}
+
+static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
+ size_t buffer_size)
+{
+ if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
+ int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
+
+ for (i = 0; i < nr_pages; i++)
+ put_page(arg->layoutupdate_pages[i]);
+ vfree(arg->start_p);
+ kfree(arg->layoutupdate_pages);
+ } else {
+ put_page(arg->layoutupdate_page);
+ }
+}
+
+static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
+{
+ p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
+ NFS4_DEVICEID4_SIZE);
+ p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+ p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+ p = xdr_encode_hyper(p, 0LL);
+ *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
+ return p;
+}
+
+static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
+{
+ p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+ return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+}
+
+static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
+ size_t buffer_size, size_t *count, __u64 *lastbyte)
+{
+ struct pnfs_block_extent *be;
+ int ret = 0;
+
+ spin_lock(&bl->bl_ext_lock);
+ for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
+ if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+ be->be_tag != EXTENT_WRITTEN)
+ continue;
+
+ (*count)++;
+ if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
+ /* keep counting.. */
+ ret = -ENOSPC;
+ continue;
+ }
+
+ if (bl->bl_scsi_layout)
+ p = encode_scsi_range(be, p);
+ else
+ p = encode_block_extent(be, p);
+ be->be_tag = EXTENT_COMMITTING;
+ }
+ *lastbyte = bl->bl_lwb - 1;
+ bl->bl_lwb = 0;
+ spin_unlock(&bl->bl_ext_lock);
+
+ return ret;
+}
+
+int
+ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
+{
+ struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
+ size_t count = 0, buffer_size = PAGE_SIZE;
+ __be32 *start_p;
+ int ret;
+
+ dprintk("%s enter\n", __func__);
+
+ arg->layoutupdate_page = alloc_page(GFP_NOFS);
+ if (!arg->layoutupdate_page)
+ return -ENOMEM;
+ start_p = page_address(arg->layoutupdate_page);
+ arg->layoutupdate_pages = &arg->layoutupdate_page;
+
+retry:
+ ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
+ if (unlikely(ret)) {
+ ext_tree_free_commitdata(arg, buffer_size);
+
+ buffer_size = ext_tree_layoutupdate_size(bl, count);
+ count = 0;
+
+ arg->layoutupdate_pages =
+ kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
+ sizeof(struct page *), GFP_NOFS);
+ if (!arg->layoutupdate_pages)
+ return -ENOMEM;
+
+ start_p = __vmalloc(buffer_size, GFP_NOFS);
+ if (!start_p) {
+ kfree(arg->layoutupdate_pages);
+ return -ENOMEM;
+ }
+
+ goto retry;
+ }
+
+ *start_p = cpu_to_be32(count);
+ arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
+
+ if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
+ void *p = start_p, *end = p + arg->layoutupdate_len;
+ struct page *page = NULL;
+ int i = 0;
+
+ arg->start_p = start_p;
+ for ( ; p < end; p += PAGE_SIZE) {
+ page = vmalloc_to_page(p);
+ arg->layoutupdate_pages[i++] = page;
+ get_page(page);
+ }
+ }
+
+ dprintk("%s found %zu ranges\n", __func__, count);
+ return 0;
+}
+
+void
+ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
+{
+ struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
+ struct rb_root *root = &bl->bl_ext_rw;
+ struct pnfs_block_extent *be;
+
+ dprintk("%s status %d\n", __func__, status);
+
+ ext_tree_free_commitdata(arg, arg->layoutupdate_len);
+
+ spin_lock(&bl->bl_ext_lock);
+ for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
+ if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+ be->be_tag != EXTENT_COMMITTING)
+ continue;
+
+ if (status) {
+ /*
+ * Mark as written and try again.
+ *
+ * XXX: some real error handling here wouldn't hurt..
+ */
+ be->be_tag = EXTENT_WRITTEN;
+ } else {
+ be->be_state = PNFS_BLOCK_READWRITE_DATA;
+ be->be_tag = 0;
+ }
+
+ be = ext_try_to_merge_left(root, be);
+ be = ext_try_to_merge_right(root, be);
+ }
+ spin_unlock(&bl->bl_ext_lock);
+}
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
new file mode 100644
index 0000000000..6c977288cc
--- /dev/null
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2006,2007 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@citi.umich.edu>
+ * Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/module.h>
+#include <linux/blkdev.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+static void
+nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b)
+{
+ int i;
+
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(b->type);
+ *p++ = cpu_to_be32(b->simple.nr_sigs);
+ for (i = 0; i < b->simple.nr_sigs; i++) {
+ p = xdr_encode_hyper(p, b->simple.sigs[i].offset);
+ p = xdr_encode_opaque(p, b->simple.sigs[i].sig,
+ b->simple.sigs[i].sig_len);
+ }
+}
+
+dev_t
+bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
+ gfp_t gfp_mask)
+{
+ struct net *net = server->nfs_client->cl_net;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct bl_dev_msg *reply = &nn->bl_mount_reply;
+ struct bl_pipe_msg bl_pipe_msg;
+ struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
+ struct bl_msg_hdr *bl_msg;
+ DECLARE_WAITQUEUE(wq, current);
+ dev_t dev = 0;
+ int rc;
+
+ dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+
+ mutex_lock(&nn->bl_mutex);
+ bl_pipe_msg.bl_wq = &nn->bl_wq;
+
+ b->simple.len += 4; /* single volume */
+ if (b->simple.len > PAGE_SIZE)
+ goto out_unlock;
+
+ memset(msg, 0, sizeof(*msg));
+ msg->len = sizeof(*bl_msg) + b->simple.len;
+ msg->data = kzalloc(msg->len, gfp_mask);
+ if (!msg->data)
+ goto out_free_data;
+
+ bl_msg = msg->data;
+ bl_msg->type = BL_DEVICE_MOUNT;
+ bl_msg->totallen = b->simple.len;
+ nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
+
+ dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
+ add_wait_queue(&nn->bl_wq, &wq);
+ rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
+ if (rc < 0) {
+ remove_wait_queue(&nn->bl_wq, &wq);
+ goto out_free_data;
+ }
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule();
+ remove_wait_queue(&nn->bl_wq, &wq);
+
+ if (reply->status != BL_DEVICE_REQUEST_PROC) {
+ printk(KERN_WARNING "%s failed to decode device: %d\n",
+ __func__, reply->status);
+ goto out_free_data;
+ }
+
+ dev = MKDEV(reply->major, reply->minor);
+out_free_data:
+ kfree(msg->data);
+out_unlock:
+ mutex_unlock(&nn->bl_mutex);
+ return dev;
+}
+
+static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
+ size_t mlen)
+{
+ struct nfs_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
+ nfs_net_id);
+
+ if (mlen != sizeof (struct bl_dev_msg))
+ return -EINVAL;
+
+ if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
+ return -EFAULT;
+
+ wake_up(&nn->bl_wq);
+
+ return mlen;
+}
+
+static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+ struct bl_pipe_msg *bl_pipe_msg =
+ container_of(msg, struct bl_pipe_msg, msg);
+
+ if (msg->errno >= 0)
+ return;
+ wake_up(bl_pipe_msg->bl_wq);
+}
+
+static const struct rpc_pipe_ops bl_upcall_ops = {
+ .upcall = rpc_pipe_generic_upcall,
+ .downcall = bl_pipe_downcall,
+ .destroy_msg = bl_pipe_destroy_msg,
+};
+
+static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
+ struct rpc_pipe *pipe)
+{
+ struct dentry *dir, *dentry;
+
+ dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
+ if (dir == NULL)
+ return ERR_PTR(-ENOENT);
+ dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
+ dput(dir);
+ return dentry;
+}
+
+static void nfs4blocklayout_unregister_sb(struct super_block *sb,
+ struct rpc_pipe *pipe)
+{
+ if (pipe->dentry)
+ rpc_unlink(pipe->dentry);
+}
+
+static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct super_block *sb = ptr;
+ struct net *net = sb->s_fs_info;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct dentry *dentry;
+ int ret = 0;
+
+ if (!try_module_get(THIS_MODULE))
+ return 0;
+
+ if (nn->bl_device_pipe == NULL) {
+ module_put(THIS_MODULE);
+ return 0;
+ }
+
+ switch (event) {
+ case RPC_PIPEFS_MOUNT:
+ dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
+ if (IS_ERR(dentry)) {
+ ret = PTR_ERR(dentry);
+ break;
+ }
+ nn->bl_device_pipe->dentry = dentry;
+ break;
+ case RPC_PIPEFS_UMOUNT:
+ if (nn->bl_device_pipe->dentry)
+ nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
+ break;
+ default:
+ ret = -ENOTSUPP;
+ break;
+ }
+ module_put(THIS_MODULE);
+ return ret;
+}
+
+static struct notifier_block nfs4blocklayout_block = {
+ .notifier_call = rpc_pipefs_event,
+};
+
+static struct dentry *nfs4blocklayout_register_net(struct net *net,
+ struct rpc_pipe *pipe)
+{
+ struct super_block *pipefs_sb;
+ struct dentry *dentry;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (!pipefs_sb)
+ return NULL;
+ dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
+ rpc_put_sb_net(net);
+ return dentry;
+}
+
+static void nfs4blocklayout_unregister_net(struct net *net,
+ struct rpc_pipe *pipe)
+{
+ struct super_block *pipefs_sb;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (pipefs_sb) {
+ nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
+ rpc_put_sb_net(net);
+ }
+}
+
+static int nfs4blocklayout_net_init(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct dentry *dentry;
+
+ mutex_init(&nn->bl_mutex);
+ init_waitqueue_head(&nn->bl_wq);
+ nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
+ if (IS_ERR(nn->bl_device_pipe))
+ return PTR_ERR(nn->bl_device_pipe);
+ dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
+ if (IS_ERR(dentry)) {
+ rpc_destroy_pipe_data(nn->bl_device_pipe);
+ return PTR_ERR(dentry);
+ }
+ nn->bl_device_pipe->dentry = dentry;
+ return 0;
+}
+
+static void nfs4blocklayout_net_exit(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
+ rpc_destroy_pipe_data(nn->bl_device_pipe);
+ nn->bl_device_pipe = NULL;
+}
+
+static struct pernet_operations nfs4blocklayout_net_ops = {
+ .init = nfs4blocklayout_net_init,
+ .exit = nfs4blocklayout_net_exit,
+};
+
+int __init bl_init_pipefs(void)
+{
+ int ret;
+
+ ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
+ if (ret)
+ goto out;
+ ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
+ if (ret)
+ goto out_unregister_notifier;
+ return 0;
+
+out_unregister_notifier:
+ rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+out:
+ return ret;
+}
+
+void bl_cleanup_pipefs(void)
+{
+ rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+ unregister_pernet_subsys(&nfs4blocklayout_net_ops);
+}
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
new file mode 100644
index 0000000000..ef67295684
--- /dev/null
+++ b/fs/nfs/cache_lib.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/cache_lib.c
+ *
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <net/net_namespace.h>
+
+#include "cache_lib.h"
+
+#define NFS_CACHE_UPCALL_PATHLEN 256
+#define NFS_CACHE_UPCALL_TIMEOUT 15
+
+static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] =
+ "/sbin/nfs_cache_getent";
+static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT;
+
+module_param_string(cache_getent, nfs_cache_getent_prog,
+ sizeof(nfs_cache_getent_prog), 0600);
+MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program");
+module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600);
+MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which "
+ "the cache upcall is assumed to have failed");
+
+int nfs_cache_upcall(struct cache_detail *cd, char *entry_name)
+{
+ static char *envp[] = { "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ NULL
+ };
+ char *argv[] = {
+ nfs_cache_getent_prog,
+ cd->name,
+ entry_name,
+ NULL
+ };
+ int ret = -EACCES;
+
+ if (nfs_cache_getent_prog[0] == '\0')
+ goto out;
+ ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ /*
+ * Disable the upcall mechanism if we're getting an ENOENT or
+ * EACCES error. The admin can re-enable it on the fly by using
+ * sysfs to set the 'cache_getent' parameter once the problem
+ * has been fixed.
+ */
+ if (ret == -ENOENT || ret == -EACCES)
+ nfs_cache_getent_prog[0] = '\0';
+out:
+ return ret > 0 ? 0 : ret;
+}
+
+/*
+ * Deferred request handling
+ */
+void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq)
+{
+ if (refcount_dec_and_test(&dreq->count))
+ kfree(dreq);
+}
+
+static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany)
+{
+ struct nfs_cache_defer_req *dreq;
+
+ dreq = container_of(d, struct nfs_cache_defer_req, deferred_req);
+
+ complete(&dreq->completion);
+ nfs_cache_defer_req_put(dreq);
+}
+
+static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req)
+{
+ struct nfs_cache_defer_req *dreq;
+
+ dreq = container_of(req, struct nfs_cache_defer_req, req);
+ dreq->deferred_req.revisit = nfs_dns_cache_revisit;
+ refcount_inc(&dreq->count);
+
+ return &dreq->deferred_req;
+}
+
+struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void)
+{
+ struct nfs_cache_defer_req *dreq;
+
+ dreq = kzalloc(sizeof(*dreq), GFP_KERNEL);
+ if (dreq) {
+ init_completion(&dreq->completion);
+ refcount_set(&dreq->count, 1);
+ dreq->req.defer = nfs_dns_cache_defer;
+ }
+ return dreq;
+}
+
+int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
+{
+ if (wait_for_completion_timeout(&dreq->completion,
+ nfs_cache_getent_timeout * HZ) == 0)
+ return -ETIMEDOUT;
+ return 0;
+}
+
+int nfs_cache_register_sb(struct super_block *sb, struct cache_detail *cd)
+{
+ int ret;
+ struct dentry *dir;
+
+ dir = rpc_d_lookup_sb(sb, "cache");
+ ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd);
+ dput(dir);
+ return ret;
+}
+
+int nfs_cache_register_net(struct net *net, struct cache_detail *cd)
+{
+ struct super_block *pipefs_sb;
+ int ret = 0;
+
+ sunrpc_init_cache_detail(cd);
+ pipefs_sb = rpc_get_sb_net(net);
+ if (pipefs_sb) {
+ ret = nfs_cache_register_sb(pipefs_sb, cd);
+ rpc_put_sb_net(net);
+ if (ret)
+ sunrpc_destroy_cache_detail(cd);
+ }
+ return ret;
+}
+
+void nfs_cache_unregister_sb(struct super_block *sb, struct cache_detail *cd)
+{
+ sunrpc_cache_unregister_pipefs(cd);
+}
+
+void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd)
+{
+ struct super_block *pipefs_sb;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (pipefs_sb) {
+ nfs_cache_unregister_sb(pipefs_sb, cd);
+ rpc_put_sb_net(net);
+ }
+ sunrpc_destroy_cache_detail(cd);
+}
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
new file mode 100644
index 0000000000..220ee409ab
--- /dev/null
+++ b/fs/nfs/cache_lib.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+
+#include <linux/completion.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/atomic.h>
+
+/*
+ * Deferred request handling
+ */
+struct nfs_cache_defer_req {
+ struct cache_req req;
+ struct cache_deferred_req deferred_req;
+ struct completion completion;
+ refcount_t count;
+};
+
+extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name);
+extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
+extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
+extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
+
+extern int nfs_cache_register_net(struct net *net, struct cache_detail *cd);
+extern void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd);
+extern int nfs_cache_register_sb(struct super_block *sb,
+ struct cache_detail *cd);
+extern void nfs_cache_unregister_sb(struct super_block *sb,
+ struct cache_detail *cd);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
new file mode 100644
index 0000000000..466ebf1d41
--- /dev/null
+++ b/fs/nfs/callback.c
@@ -0,0 +1,414 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/callback.c
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFSv4 callback handling
+ */
+
+#include <linux/completion.h>
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/sched/signal.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/nfs_fs.h>
+#include <linux/errno.h>
+#include <linux/mutex.h>
+#include <linux/freezer.h>
+#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/bc_xprt.h>
+
+#include <net/inet_sock.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "internal.h"
+#include "netns.h"
+
+#define NFSDBG_FACILITY NFSDBG_CALLBACK
+
+struct nfs_callback_data {
+ unsigned int users;
+ struct svc_serv *serv;
+};
+
+static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
+static DEFINE_MUTEX(nfs_callback_mutex);
+static struct svc_program nfs4_callback_program;
+
+static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net)
+{
+ const struct cred *cred = current_cred();
+ int ret;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ ret = svc_xprt_create(serv, "tcp", net, PF_INET,
+ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
+ cred);
+ if (ret <= 0)
+ goto out_err;
+ nn->nfs_callback_tcpport = ret;
+ dprintk("NFS: Callback listener port = %u (af %u, net %x)\n",
+ nn->nfs_callback_tcpport, PF_INET, net->ns.inum);
+
+ ret = svc_xprt_create(serv, "tcp", net, PF_INET6,
+ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
+ cred);
+ if (ret > 0) {
+ nn->nfs_callback_tcpport6 = ret;
+ dprintk("NFS: Callback listener port = %u (af %u, net %x)\n",
+ nn->nfs_callback_tcpport6, PF_INET6, net->ns.inum);
+ } else if (ret != -EAFNOSUPPORT)
+ goto out_err;
+ return 0;
+
+out_err:
+ return (ret) ? ret : -ENOMEM;
+}
+
+/*
+ * This is the NFSv4 callback kernel thread.
+ */
+static int
+nfs4_callback_svc(void *vrqstp)
+{
+ struct svc_rqst *rqstp = vrqstp;
+
+ set_freezable();
+
+ while (!kthread_freezable_should_stop(NULL))
+ svc_recv(rqstp);
+
+ svc_exit_thread(rqstp);
+ return 0;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * The callback service for NFSv4.1 callbacks
+ */
+static int
+nfs41_callback_svc(void *vrqstp)
+{
+ struct svc_rqst *rqstp = vrqstp;
+ struct svc_serv *serv = rqstp->rq_server;
+ struct rpc_rqst *req;
+ int error;
+ DEFINE_WAIT(wq);
+
+ set_freezable();
+
+ while (!kthread_freezable_should_stop(NULL)) {
+ prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_IDLE);
+ spin_lock_bh(&serv->sv_cb_lock);
+ if (!list_empty(&serv->sv_cb_list)) {
+ req = list_first_entry(&serv->sv_cb_list,
+ struct rpc_rqst, rq_bc_list);
+ list_del(&req->rq_bc_list);
+ spin_unlock_bh(&serv->sv_cb_lock);
+ finish_wait(&serv->sv_cb_waitq, &wq);
+ dprintk("Invoking bc_svc_process()\n");
+ error = bc_svc_process(serv, req, rqstp);
+ dprintk("bc_svc_process() returned w/ error code= %d\n",
+ error);
+ } else {
+ spin_unlock_bh(&serv->sv_cb_lock);
+ if (!kthread_should_stop())
+ schedule();
+ finish_wait(&serv->sv_cb_waitq, &wq);
+ }
+ }
+
+ svc_exit_thread(rqstp);
+ return 0;
+}
+
+static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
+ struct svc_serv *serv)
+{
+ if (minorversion)
+ /*
+ * Save the svc_serv in the transport so that it can
+ * be referenced when the session backchannel is initialized
+ */
+ xprt->bc_serv = serv;
+}
+#else
+static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
+ struct svc_serv *serv)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
+ struct svc_serv *serv)
+{
+ int nrservs = nfs_callback_nr_threads;
+ int ret;
+
+ nfs_callback_bc_serv(minorversion, xprt, serv);
+
+ if (nrservs < NFS4_MIN_NR_CALLBACK_THREADS)
+ nrservs = NFS4_MIN_NR_CALLBACK_THREADS;
+
+ if (serv->sv_nrthreads == nrservs)
+ return 0;
+
+ ret = svc_set_num_threads(serv, NULL, nrservs);
+ if (ret) {
+ svc_set_num_threads(serv, NULL, 0);
+ return ret;
+ }
+ dprintk("nfs_callback_up: service started\n");
+ return 0;
+}
+
+static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ if (--nn->cb_users[minorversion])
+ return;
+
+ dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum);
+ svc_xprt_destroy_all(serv, net);
+}
+
+static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
+ struct net *net, struct rpc_xprt *xprt)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ int ret;
+
+ if (nn->cb_users[minorversion]++)
+ return 0;
+
+ dprintk("NFS: create per-net callback data; net=%x\n", net->ns.inum);
+
+ ret = svc_bind(serv, net);
+ if (ret < 0) {
+ printk(KERN_WARNING "NFS: bind callback service failed\n");
+ goto err_bind;
+ }
+
+ ret = 0;
+ if (!IS_ENABLED(CONFIG_NFS_V4_1) || minorversion == 0)
+ ret = nfs4_callback_up_net(serv, net);
+ else if (xprt->ops->bc_setup)
+ set_bc_enabled(serv);
+ else
+ ret = -EPROTONOSUPPORT;
+
+ if (ret < 0) {
+ printk(KERN_ERR "NFS: callback service start failed\n");
+ goto err_socks;
+ }
+ return 0;
+
+err_socks:
+ svc_rpcb_cleanup(serv, net);
+err_bind:
+ nn->cb_users[minorversion]--;
+ dprintk("NFS: Couldn't create callback socket: err = %d; "
+ "net = %x\n", ret, net->ns.inum);
+ return ret;
+}
+
+static struct svc_serv *nfs_callback_create_svc(int minorversion)
+{
+ struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+ int (*threadfn)(void *data);
+ struct svc_serv *serv;
+
+ /*
+ * Check whether we're already up and running.
+ */
+ if (cb_info->serv)
+ return svc_get(cb_info->serv);
+
+ /*
+ * Sanity check: if there's no task,
+ * we should be the first user ...
+ */
+ if (cb_info->users)
+ printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
+ cb_info->users);
+
+ threadfn = nfs4_callback_svc;
+#if defined(CONFIG_NFS_V4_1)
+ if (minorversion)
+ threadfn = nfs41_callback_svc;
+#else
+ if (minorversion)
+ return ERR_PTR(-ENOTSUPP);
+#endif
+ serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
+ threadfn);
+ if (!serv) {
+ printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
+ return ERR_PTR(-ENOMEM);
+ }
+ cb_info->serv = serv;
+ /* As there is only one thread we need to over-ride the
+ * default maximum of 80 connections
+ */
+ serv->sv_maxconn = 1024;
+ dprintk("nfs_callback_create_svc: service created\n");
+ return serv;
+}
+
+/*
+ * Bring up the callback thread if it is not already up.
+ */
+int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
+{
+ struct svc_serv *serv;
+ struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+ int ret;
+ struct net *net = xprt->xprt_net;
+
+ mutex_lock(&nfs_callback_mutex);
+
+ serv = nfs_callback_create_svc(minorversion);
+ if (IS_ERR(serv)) {
+ ret = PTR_ERR(serv);
+ goto err_create;
+ }
+
+ ret = nfs_callback_up_net(minorversion, serv, net, xprt);
+ if (ret < 0)
+ goto err_net;
+
+ ret = nfs_callback_start_svc(minorversion, xprt, serv);
+ if (ret < 0)
+ goto err_start;
+
+ cb_info->users++;
+err_net:
+ if (!cb_info->users)
+ cb_info->serv = NULL;
+ svc_put(serv);
+err_create:
+ mutex_unlock(&nfs_callback_mutex);
+ return ret;
+
+err_start:
+ nfs_callback_down_net(minorversion, serv, net);
+ dprintk("NFS: Couldn't create server thread; err = %d\n", ret);
+ goto err_net;
+}
+
+/*
+ * Kill the callback thread if it's no longer being used.
+ */
+void nfs_callback_down(int minorversion, struct net *net)
+{
+ struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+ struct svc_serv *serv;
+
+ mutex_lock(&nfs_callback_mutex);
+ serv = cb_info->serv;
+ nfs_callback_down_net(minorversion, serv, net);
+ cb_info->users--;
+ if (cb_info->users == 0) {
+ svc_get(serv);
+ svc_set_num_threads(serv, NULL, 0);
+ svc_put(serv);
+ dprintk("nfs_callback_down: service destroyed\n");
+ cb_info->serv = NULL;
+ }
+ mutex_unlock(&nfs_callback_mutex);
+}
+
+/* Boolean check of RPC_AUTH_GSS principal */
+int
+check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
+{
+ char *p = rqstp->rq_cred.cr_principal;
+
+ if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
+ return 1;
+
+ /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */
+ if (clp->cl_minorversion != 0)
+ return 0;
+ /*
+ * It might just be a normal user principal, in which case
+ * userspace won't bother to tell us the name at all.
+ */
+ if (p == NULL)
+ return 0;
+
+ /*
+ * Did we get the acceptor from userland during the SETCLIENID
+ * negotiation?
+ */
+ if (clp->cl_acceptor)
+ return !strcmp(p, clp->cl_acceptor);
+
+ /*
+ * Otherwise try to verify it using the cl_hostname. Note that this
+ * doesn't work if a non-canonical hostname was used in the devname.
+ */
+
+ /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
+
+ if (memcmp(p, "nfs@", 4) != 0)
+ return 0;
+ p += 4;
+ if (strcmp(p, clp->cl_hostname) != 0)
+ return 0;
+ return 1;
+}
+
+/*
+ * pg_authenticate method for nfsv4 callback threads.
+ *
+ * The authflavor has been negotiated, so an incorrect flavor is a server
+ * bug. Deny packets with incorrect authflavor.
+ *
+ * All other checking done after NFS decoding where the nfs_client can be
+ * found in nfs4_callback_compound
+ */
+static enum svc_auth_status nfs_callback_authenticate(struct svc_rqst *rqstp)
+{
+ rqstp->rq_auth_stat = rpc_autherr_badcred;
+
+ switch (rqstp->rq_authop->flavour) {
+ case RPC_AUTH_NULL:
+ if (rqstp->rq_proc != CB_NULL)
+ return SVC_DENIED;
+ break;
+ case RPC_AUTH_GSS:
+ /* No RPC_AUTH_GSS support yet in NFSv4.1 */
+ if (svc_is_backchannel(rqstp))
+ return SVC_DENIED;
+ }
+
+ rqstp->rq_auth_stat = rpc_auth_ok;
+ return SVC_OK;
+}
+
+/*
+ * Define NFS4 callback program
+ */
+static const struct svc_version *nfs4_callback_version[] = {
+ [1] = &nfs4_callback_version1,
+ [4] = &nfs4_callback_version4,
+};
+
+static struct svc_stat nfs4_callback_stats;
+
+static struct svc_program nfs4_callback_program = {
+ .pg_prog = NFS4_CALLBACK, /* RPC service number */
+ .pg_nvers = ARRAY_SIZE(nfs4_callback_version), /* Number of entries */
+ .pg_vers = nfs4_callback_version, /* version table */
+ .pg_name = "NFSv4 callback", /* service name */
+ .pg_class = "nfs", /* authentication class */
+ .pg_stats = &nfs4_callback_stats,
+ .pg_authenticate = nfs_callback_authenticate,
+ .pg_init_request = svc_generic_init_request,
+ .pg_rpcbind_set = svc_generic_rpcbind_set,
+};
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
new file mode 100644
index 0000000000..ccd4f245ca
--- /dev/null
+++ b/fs/nfs/callback.h
@@ -0,0 +1,223 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * linux/fs/nfs/callback.h
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFSv4 callback definitions
+ */
+#ifndef __LINUX_FS_NFS_CALLBACK_H
+#define __LINUX_FS_NFS_CALLBACK_H
+#include <linux/sunrpc/svc.h>
+
+#define NFS4_CALLBACK 0x40000000
+#define NFS4_CALLBACK_XDRSIZE 2048
+#define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE)
+
+enum nfs4_callback_procnum {
+ CB_NULL = 0,
+ CB_COMPOUND = 1,
+};
+
+enum nfs4_callback_opnum {
+ OP_CB_GETATTR = 3,
+ OP_CB_RECALL = 4,
+/* Callback operations new to NFSv4.1 */
+ OP_CB_LAYOUTRECALL = 5,
+ OP_CB_NOTIFY = 6,
+ OP_CB_PUSH_DELEG = 7,
+ OP_CB_RECALL_ANY = 8,
+ OP_CB_RECALLABLE_OBJ_AVAIL = 9,
+ OP_CB_RECALL_SLOT = 10,
+ OP_CB_SEQUENCE = 11,
+ OP_CB_WANTS_CANCELLED = 12,
+ OP_CB_NOTIFY_LOCK = 13,
+ OP_CB_NOTIFY_DEVICEID = 14,
+/* Callback operations new to NFSv4.2 */
+ OP_CB_OFFLOAD = 15,
+ OP_CB_ILLEGAL = 10044,
+};
+
+struct nfs4_slot;
+struct cb_process_state {
+ __be32 drc_status;
+ struct nfs_client *clp;
+ struct nfs4_slot *slot;
+ u32 minorversion;
+ struct net *net;
+};
+
+struct cb_compound_hdr_arg {
+ unsigned int taglen;
+ const char *tag;
+ unsigned int minorversion;
+ unsigned int cb_ident; /* v4.0 callback identifier */
+ unsigned nops;
+};
+
+struct cb_compound_hdr_res {
+ __be32 *status;
+ unsigned int taglen;
+ const char *tag;
+ __be32 *nops;
+};
+
+struct cb_getattrargs {
+ struct nfs_fh fh;
+ uint32_t bitmap[2];
+};
+
+struct cb_getattrres {
+ __be32 status;
+ uint32_t bitmap[2];
+ uint64_t size;
+ uint64_t change_attr;
+ struct timespec64 ctime;
+ struct timespec64 mtime;
+};
+
+struct cb_recallargs {
+ struct nfs_fh fh;
+ nfs4_stateid stateid;
+ uint32_t truncate;
+};
+
+#if defined(CONFIG_NFS_V4_1)
+
+struct referring_call {
+ uint32_t rc_sequenceid;
+ uint32_t rc_slotid;
+};
+
+struct referring_call_list {
+ struct nfs4_sessionid rcl_sessionid;
+ uint32_t rcl_nrefcalls;
+ struct referring_call *rcl_refcalls;
+};
+
+struct cb_sequenceargs {
+ struct sockaddr *csa_addr;
+ struct nfs4_sessionid csa_sessionid;
+ uint32_t csa_sequenceid;
+ uint32_t csa_slotid;
+ uint32_t csa_highestslotid;
+ uint32_t csa_cachethis;
+ uint32_t csa_nrclists;
+ struct referring_call_list *csa_rclists;
+};
+
+struct cb_sequenceres {
+ __be32 csr_status;
+ struct nfs4_sessionid csr_sessionid;
+ uint32_t csr_sequenceid;
+ uint32_t csr_slotid;
+ uint32_t csr_highestslotid;
+ uint32_t csr_target_highestslotid;
+};
+
+extern __be32 nfs4_callback_sequence(void *argp, void *resp,
+ struct cb_process_state *cps);
+
+#define RCA4_TYPE_MASK_RDATA_DLG 0
+#define RCA4_TYPE_MASK_WDATA_DLG 1
+#define RCA4_TYPE_MASK_DIR_DLG 2
+#define RCA4_TYPE_MASK_FILE_LAYOUT 3
+#define RCA4_TYPE_MASK_BLK_LAYOUT 4
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
+#define PNFS_FF_RCA4_TYPE_MASK_READ 16
+#define PNFS_FF_RCA4_TYPE_MASK_RW 17
+#define RCA4_TYPE_MASK_ALL 0x3f31f
+
+struct cb_recallanyargs {
+ uint32_t craa_objs_to_keep;
+ uint32_t craa_type_mask;
+};
+
+extern __be32 nfs4_callback_recallany(void *argp, void *resp,
+ struct cb_process_state *cps);
+
+struct cb_recallslotargs {
+ uint32_t crsa_target_highest_slotid;
+};
+extern __be32 nfs4_callback_recallslot(void *argp, void *resp,
+ struct cb_process_state *cps);
+
+struct cb_layoutrecallargs {
+ uint32_t cbl_recall_type;
+ uint32_t cbl_layout_type;
+ uint32_t cbl_layoutchanged;
+ union {
+ struct {
+ struct nfs_fh cbl_fh;
+ struct pnfs_layout_range cbl_range;
+ nfs4_stateid cbl_stateid;
+ };
+ struct nfs_fsid cbl_fsid;
+ };
+};
+
+extern __be32 nfs4_callback_layoutrecall(void *argp, void *resp,
+ struct cb_process_state *cps);
+
+struct cb_devicenotifyitem {
+ uint32_t cbd_notify_type;
+ uint32_t cbd_layout_type;
+ struct nfs4_deviceid cbd_dev_id;
+ uint32_t cbd_immediate;
+};
+
+struct cb_devicenotifyargs {
+ uint32_t ndevs;
+ struct cb_devicenotifyitem *devs;
+};
+
+extern __be32 nfs4_callback_devicenotify(void *argp, void *resp,
+ struct cb_process_state *cps);
+
+struct cb_notify_lock_args {
+ struct nfs_fh cbnl_fh;
+ struct nfs_lowner cbnl_owner;
+ bool cbnl_valid;
+};
+
+extern __be32 nfs4_callback_notify_lock(void *argp, void *resp,
+ struct cb_process_state *cps);
+#endif /* CONFIG_NFS_V4_1 */
+#ifdef CONFIG_NFS_V4_2
+struct cb_offloadargs {
+ struct nfs_fh coa_fh;
+ nfs4_stateid coa_stateid;
+ uint32_t error;
+ uint64_t wr_count;
+ struct nfs_writeverf wr_writeverf;
+};
+
+extern __be32 nfs4_callback_offload(void *args, void *dummy,
+ struct cb_process_state *cps);
+#endif /* CONFIG_NFS_V4_2 */
+extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
+extern __be32 nfs4_callback_getattr(void *argp, void *resp,
+ struct cb_process_state *cps);
+extern __be32 nfs4_callback_recall(void *argp, void *resp,
+ struct cb_process_state *cps);
+#if IS_ENABLED(CONFIG_NFS_V4)
+extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
+extern void nfs_callback_down(int minorversion, struct net *net);
+#endif /* CONFIG_NFS_V4 */
+/*
+ * nfs41: Callbacks are expected to not cause substantial latency,
+ * so we limit their concurrency to 1 by setting up the maximum number
+ * of slots for the backchannel.
+ */
+#define NFS41_BC_MIN_CALLBACKS 1
+#define NFS41_BC_MAX_CALLBACKS 1
+
+#define NFS4_MIN_NR_CALLBACK_THREADS 1
+
+extern unsigned int nfs_callback_set_tcpport;
+extern unsigned short nfs_callback_nr_threads;
+
+#endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
new file mode 100644
index 0000000000..6bed1394d7
--- /dev/null
+++ b/fs/nfs/callback_proc.c
@@ -0,0 +1,739 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/callback_proc.c
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFSv4 callback procedures
+ */
+
+#include <linux/errno.h>
+#include <linux/math.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>
+#include <linux/types.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "internal.h"
+#include "pnfs.h"
+#include "nfs4session.h"
+#include "nfs4trace.h"
+
+#define NFSDBG_FACILITY NFSDBG_CALLBACK
+
+__be32 nfs4_callback_getattr(void *argp, void *resp,
+ struct cb_process_state *cps)
+{
+ struct cb_getattrargs *args = argp;
+ struct cb_getattrres *res = resp;
+ struct nfs_delegation *delegation;
+ struct inode *inode;
+
+ res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+ if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
+ goto out;
+
+ res->bitmap[0] = res->bitmap[1] = 0;
+ res->status = htonl(NFS4ERR_BADHANDLE);
+
+ dprintk_rcu("NFS: GETATTR callback request from %s\n",
+ rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+ inode = nfs_delegation_find_inode(cps->clp, &args->fh);
+ if (IS_ERR(inode)) {
+ if (inode == ERR_PTR(-EAGAIN))
+ res->status = htonl(NFS4ERR_DELAY);
+ trace_nfs4_cb_getattr(cps->clp, &args->fh, NULL,
+ -ntohl(res->status));
+ goto out;
+ }
+ rcu_read_lock();
+ delegation = nfs4_get_valid_delegation(inode);
+ if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0)
+ goto out_iput;
+ res->size = i_size_read(inode);
+ res->change_attr = delegation->change_attr;
+ if (nfs_have_writebacks(inode))
+ res->change_attr++;
+ res->ctime = inode_get_ctime(inode);
+ res->mtime = inode->i_mtime;
+ res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) &
+ args->bitmap[0];
+ res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) &
+ args->bitmap[1];
+ res->status = 0;
+out_iput:
+ rcu_read_unlock();
+ trace_nfs4_cb_getattr(cps->clp, &args->fh, inode, -ntohl(res->status));
+ nfs_iput_and_deactive(inode);
+out:
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
+ return res->status;
+}
+
+__be32 nfs4_callback_recall(void *argp, void *resp,
+ struct cb_process_state *cps)
+{
+ struct cb_recallargs *args = argp;
+ struct inode *inode;
+ __be32 res;
+
+ res = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+ if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
+ goto out;
+
+ dprintk_rcu("NFS: RECALL callback request from %s\n",
+ rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+ res = htonl(NFS4ERR_BADHANDLE);
+ inode = nfs_delegation_find_inode(cps->clp, &args->fh);
+ if (IS_ERR(inode)) {
+ if (inode == ERR_PTR(-EAGAIN))
+ res = htonl(NFS4ERR_DELAY);
+ trace_nfs4_cb_recall(cps->clp, &args->fh, NULL,
+ &args->stateid, -ntohl(res));
+ goto out;
+ }
+ /* Set up a helper thread to actually return the delegation */
+ switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
+ case 0:
+ res = 0;
+ break;
+ case -ENOENT:
+ res = htonl(NFS4ERR_BAD_STATEID);
+ break;
+ default:
+ res = htonl(NFS4ERR_RESOURCE);
+ }
+ trace_nfs4_cb_recall(cps->clp, &args->fh, inode,
+ &args->stateid, -ntohl(res));
+ nfs_iput_and_deactive(inode);
+out:
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
+ return res;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+/*
+ * Lookup a layout inode by stateid
+ *
+ * Note: returns a refcount on the inode and superblock
+ */
+static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp,
+ const nfs4_stateid *stateid)
+ __must_hold(RCU)
+{
+ struct nfs_server *server;
+ struct inode *inode;
+ struct pnfs_layout_hdr *lo;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
+ if (!pnfs_layout_is_valid(lo))
+ continue;
+ if (!nfs4_stateid_match_other(stateid, &lo->plh_stateid))
+ continue;
+ if (nfs_sb_active(server->super))
+ inode = igrab(lo->plh_inode);
+ else
+ inode = ERR_PTR(-EAGAIN);
+ rcu_read_unlock();
+ if (inode)
+ return inode;
+ nfs_sb_deactive(server->super);
+ return ERR_PTR(-EAGAIN);
+ }
+ }
+ rcu_read_unlock();
+ return ERR_PTR(-ENOENT);
+}
+
+/*
+ * Lookup a layout inode by filehandle.
+ *
+ * Note: returns a refcount on the inode and superblock
+ *
+ */
+static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp,
+ const struct nfs_fh *fh)
+{
+ struct nfs_server *server;
+ struct nfs_inode *nfsi;
+ struct inode *inode;
+ struct pnfs_layout_hdr *lo;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
+ nfsi = NFS_I(lo->plh_inode);
+ if (nfs_compare_fh(fh, &nfsi->fh))
+ continue;
+ if (nfsi->layout != lo)
+ continue;
+ if (nfs_sb_active(server->super))
+ inode = igrab(lo->plh_inode);
+ else
+ inode = ERR_PTR(-EAGAIN);
+ rcu_read_unlock();
+ if (inode)
+ return inode;
+ nfs_sb_deactive(server->super);
+ return ERR_PTR(-EAGAIN);
+ }
+ }
+ rcu_read_unlock();
+ return ERR_PTR(-ENOENT);
+}
+
+static struct inode *nfs_layout_find_inode(struct nfs_client *clp,
+ const struct nfs_fh *fh,
+ const nfs4_stateid *stateid)
+{
+ struct inode *inode;
+
+ inode = nfs_layout_find_inode_by_stateid(clp, stateid);
+ if (inode == ERR_PTR(-ENOENT))
+ inode = nfs_layout_find_inode_by_fh(clp, fh);
+ return inode;
+}
+
+/*
+ * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
+ */
+static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *new)
+{
+ u32 oldseq, newseq;
+
+ /* Is the stateid not initialised? */
+ if (!pnfs_layout_is_valid(lo))
+ return NFS4ERR_NOMATCHING_LAYOUT;
+
+ /* Mismatched stateid? */
+ if (!nfs4_stateid_match_other(&lo->plh_stateid, new))
+ return NFS4ERR_BAD_STATEID;
+
+ newseq = be32_to_cpu(new->seqid);
+ /* Are we already in a layout recall situation? */
+ if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
+ lo->plh_return_seq != 0) {
+ if (newseq < lo->plh_return_seq)
+ return NFS4ERR_OLD_STATEID;
+ if (newseq > lo->plh_return_seq)
+ return NFS4ERR_DELAY;
+ goto out;
+ }
+
+ /* Check that the stateid matches what we think it should be. */
+ oldseq = be32_to_cpu(lo->plh_stateid.seqid);
+ if (newseq > oldseq + 1)
+ return NFS4ERR_DELAY;
+ /* Crazy server! */
+ if (newseq <= oldseq)
+ return NFS4ERR_OLD_STATEID;
+out:
+ return NFS_OK;
+}
+
+static u32 initiate_file_draining(struct nfs_client *clp,
+ struct cb_layoutrecallargs *args)
+{
+ struct inode *ino;
+ struct pnfs_layout_hdr *lo;
+ u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
+ LIST_HEAD(free_me_list);
+
+ ino = nfs_layout_find_inode(clp, &args->cbl_fh, &args->cbl_stateid);
+ if (IS_ERR(ino)) {
+ if (ino == ERR_PTR(-EAGAIN))
+ rv = NFS4ERR_DELAY;
+ goto out_noput;
+ }
+
+ pnfs_layoutcommit_inode(ino, false);
+
+
+ spin_lock(&ino->i_lock);
+ lo = NFS_I(ino)->layout;
+ if (!lo) {
+ spin_unlock(&ino->i_lock);
+ goto out;
+ }
+ pnfs_get_layout_hdr(lo);
+ rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid);
+ if (rv != NFS_OK)
+ goto unlock;
+
+ /*
+ * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return)
+ */
+ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ rv = NFS4ERR_DELAY;
+ goto unlock;
+ }
+
+ pnfs_set_layout_stateid(lo, &args->cbl_stateid, NULL, true);
+ switch (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
+ &args->cbl_range,
+ be32_to_cpu(args->cbl_stateid.seqid))) {
+ case 0:
+ case -EBUSY:
+ /* There are layout segments that need to be returned */
+ rv = NFS4_OK;
+ break;
+ case -ENOENT:
+ set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
+ /* Embrace your forgetfulness! */
+ rv = NFS4ERR_NOMATCHING_LAYOUT;
+
+ if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+ NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
+ &args->cbl_range);
+ }
+ }
+unlock:
+ spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&free_me_list);
+ /* Free all lsegs that are attached to commit buckets */
+ nfs_commit_inode(ino, 0);
+ pnfs_put_layout_hdr(lo);
+out:
+ nfs_iput_and_deactive(ino);
+out_noput:
+ trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino,
+ &args->cbl_stateid, -rv);
+ return rv;
+}
+
+static u32 initiate_bulk_draining(struct nfs_client *clp,
+ struct cb_layoutrecallargs *args)
+{
+ int stat;
+
+ if (args->cbl_recall_type == RETURN_FSID)
+ stat = pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true);
+ else
+ stat = pnfs_destroy_layouts_byclid(clp, true);
+ if (stat != 0)
+ return NFS4ERR_DELAY;
+ return NFS4ERR_NOMATCHING_LAYOUT;
+}
+
+static u32 do_callback_layoutrecall(struct nfs_client *clp,
+ struct cb_layoutrecallargs *args)
+{
+ if (args->cbl_recall_type == RETURN_FILE)
+ return initiate_file_draining(clp, args);
+ return initiate_bulk_draining(clp, args);
+}
+
+__be32 nfs4_callback_layoutrecall(void *argp, void *resp,
+ struct cb_process_state *cps)
+{
+ struct cb_layoutrecallargs *args = argp;
+ u32 res = NFS4ERR_OP_NOT_IN_SESSION;
+
+ if (cps->clp)
+ res = do_callback_layoutrecall(cps->clp, args);
+ return cpu_to_be32(res);
+}
+
+static void pnfs_recall_all_layouts(struct nfs_client *clp)
+{
+ struct cb_layoutrecallargs args;
+
+ /* Pretend we got a CB_LAYOUTRECALL(ALL) */
+ memset(&args, 0, sizeof(args));
+ args.cbl_recall_type = RETURN_ALL;
+ /* FIXME we ignore errors, what should we do? */
+ do_callback_layoutrecall(clp, &args);
+}
+
+__be32 nfs4_callback_devicenotify(void *argp, void *resp,
+ struct cb_process_state *cps)
+{
+ struct cb_devicenotifyargs *args = argp;
+ const struct pnfs_layoutdriver_type *ld = NULL;
+ uint32_t i;
+ __be32 res = 0;
+
+ if (!cps->clp) {
+ res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
+ goto out;
+ }
+
+ for (i = 0; i < args->ndevs; i++) {
+ struct cb_devicenotifyitem *dev = &args->devs[i];
+
+ if (!ld || ld->id != dev->cbd_layout_type) {
+ pnfs_put_layoutdriver(ld);
+ ld = pnfs_find_layoutdriver(dev->cbd_layout_type);
+ if (!ld)
+ continue;
+ }
+ nfs4_delete_deviceid(ld, cps->clp, &dev->cbd_dev_id);
+ }
+ pnfs_put_layoutdriver(ld);
+out:
+ kfree(args->devs);
+ return res;
+}
+
+/*
+ * Validate the sequenceID sent by the server.
+ * Return success if the sequenceID is one more than what we last saw on
+ * this slot, accounting for wraparound. Increments the slot's sequence.
+ *
+ * We don't yet implement a duplicate request cache, instead we set the
+ * back channel ca_maxresponsesize_cached to zero. This is OK for now
+ * since we only currently implement idempotent callbacks anyway.
+ *
+ * We have a single slot backchannel at this time, so we don't bother
+ * checking the used_slots bit array on the table. The lower layer guarantees
+ * a single outstanding callback request at a time.
+ */
+static __be32
+validate_seqid(const struct nfs4_slot_table *tbl, const struct nfs4_slot *slot,
+ const struct cb_sequenceargs * args)
+{
+ __be32 ret;
+
+ ret = cpu_to_be32(NFS4ERR_BADSLOT);
+ if (args->csa_slotid > tbl->server_highest_slotid)
+ goto out_err;
+
+ /* Replay */
+ if (args->csa_sequenceid == slot->seq_nr) {
+ ret = cpu_to_be32(NFS4ERR_DELAY);
+ if (nfs4_test_locked_slot(tbl, slot->slot_nr))
+ goto out_err;
+
+ /* Signal process_op to set this error on next op */
+ ret = cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP);
+ if (args->csa_cachethis == 0)
+ goto out_err;
+
+ /* Liar! We never allowed you to set csa_cachethis != 0 */
+ ret = cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY);
+ goto out_err;
+ }
+
+ /* Note: wraparound relies on seq_nr being of type u32 */
+ /* Misordered request */
+ ret = cpu_to_be32(NFS4ERR_SEQ_MISORDERED);
+ if (args->csa_sequenceid != slot->seq_nr + 1)
+ goto out_err;
+
+ return cpu_to_be32(NFS4_OK);
+
+out_err:
+ trace_nfs4_cb_seqid_err(args, ret);
+ return ret;
+}
+
+/*
+ * For each referring call triple, check the session's slot table for
+ * a match. If the slot is in use and the sequence numbers match, the
+ * client is still waiting for a response to the original request.
+ */
+static int referring_call_exists(struct nfs_client *clp,
+ uint32_t nrclists,
+ struct referring_call_list *rclists,
+ spinlock_t *lock)
+ __releases(lock)
+ __acquires(lock)
+{
+ int status = 0;
+ int i, j;
+ struct nfs4_session *session;
+ struct nfs4_slot_table *tbl;
+ struct referring_call_list *rclist;
+ struct referring_call *ref;
+
+ /*
+ * XXX When client trunking is implemented, this becomes
+ * a session lookup from within the loop
+ */
+ session = clp->cl_session;
+ tbl = &session->fc_slot_table;
+
+ for (i = 0; i < nrclists; i++) {
+ rclist = &rclists[i];
+ if (memcmp(session->sess_id.data,
+ rclist->rcl_sessionid.data,
+ NFS4_MAX_SESSIONID_LEN) != 0)
+ continue;
+
+ for (j = 0; j < rclist->rcl_nrefcalls; j++) {
+ ref = &rclist->rcl_refcalls[j];
+ spin_unlock(lock);
+ status = nfs4_slot_wait_on_seqid(tbl, ref->rc_slotid,
+ ref->rc_sequenceid, HZ >> 1) < 0;
+ spin_lock(lock);
+ if (status)
+ goto out;
+ }
+ }
+
+out:
+ return status;
+}
+
+__be32 nfs4_callback_sequence(void *argp, void *resp,
+ struct cb_process_state *cps)
+{
+ struct cb_sequenceargs *args = argp;
+ struct cb_sequenceres *res = resp;
+ struct nfs4_slot_table *tbl;
+ struct nfs4_slot *slot;
+ struct nfs_client *clp;
+ int i;
+ __be32 status = htonl(NFS4ERR_BADSESSION);
+
+ clp = nfs4_find_client_sessionid(cps->net, args->csa_addr,
+ &args->csa_sessionid, cps->minorversion);
+ if (clp == NULL)
+ goto out;
+
+ if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
+ goto out;
+
+ tbl = &clp->cl_session->bc_slot_table;
+
+ /* Set up res before grabbing the spinlock */
+ memcpy(&res->csr_sessionid, &args->csa_sessionid,
+ sizeof(res->csr_sessionid));
+ res->csr_sequenceid = args->csa_sequenceid;
+ res->csr_slotid = args->csa_slotid;
+
+ spin_lock(&tbl->slot_tbl_lock);
+ /* state manager is resetting the session */
+ if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
+ status = htonl(NFS4ERR_DELAY);
+ /* Return NFS4ERR_BADSESSION if we're draining the session
+ * in order to reset it.
+ */
+ if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
+ status = htonl(NFS4ERR_BADSESSION);
+ goto out_unlock;
+ }
+
+ status = htonl(NFS4ERR_BADSLOT);
+ slot = nfs4_lookup_slot(tbl, args->csa_slotid);
+ if (IS_ERR(slot))
+ goto out_unlock;
+
+ res->csr_highestslotid = tbl->server_highest_slotid;
+ res->csr_target_highestslotid = tbl->target_highest_slotid;
+
+ status = validate_seqid(tbl, slot, args);
+ if (status)
+ goto out_unlock;
+ if (!nfs4_try_to_lock_slot(tbl, slot)) {
+ status = htonl(NFS4ERR_DELAY);
+ goto out_unlock;
+ }
+ cps->slot = slot;
+
+ /* The ca_maxresponsesize_cached is 0 with no DRC */
+ if (args->csa_cachethis != 0) {
+ status = htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+ goto out_unlock;
+ }
+
+ /*
+ * Check for pending referring calls. If a match is found, a
+ * related callback was received before the response to the original
+ * call.
+ */
+ if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists,
+ &tbl->slot_tbl_lock) < 0) {
+ status = htonl(NFS4ERR_DELAY);
+ goto out_unlock;
+ }
+
+ /*
+ * RFC5661 20.9.3
+ * If CB_SEQUENCE returns an error, then the state of the slot
+ * (sequence ID, cached reply) MUST NOT change.
+ */
+ slot->seq_nr = args->csa_sequenceid;
+out_unlock:
+ spin_unlock(&tbl->slot_tbl_lock);
+
+out:
+ cps->clp = clp; /* put in nfs4_callback_compound */
+ for (i = 0; i < args->csa_nrclists; i++)
+ kfree(args->csa_rclists[i].rcl_refcalls);
+ kfree(args->csa_rclists);
+
+ if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
+ cps->drc_status = status;
+ status = 0;
+ } else
+ res->csr_status = status;
+
+ trace_nfs4_cb_sequence(args, res, status);
+ return status;
+}
+
+static bool
+validate_bitmap_values(unsigned int mask)
+{
+ return (mask & ~RCA4_TYPE_MASK_ALL) == 0;
+}
+
+__be32 nfs4_callback_recallany(void *argp, void *resp,
+ struct cb_process_state *cps)
+{
+ struct cb_recallanyargs *args = argp;
+ __be32 status;
+ fmode_t flags = 0;
+ bool schedule_manager = false;
+
+ status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
+ if (!cps->clp) /* set in cb_sequence */
+ goto out;
+
+ dprintk_rcu("NFS: RECALL_ANY callback request from %s\n",
+ rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+ status = cpu_to_be32(NFS4ERR_INVAL);
+ if (!validate_bitmap_values(args->craa_type_mask))
+ goto out;
+
+ status = cpu_to_be32(NFS4_OK);
+ if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_RDATA_DLG))
+ flags = FMODE_READ;
+ if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_WDATA_DLG))
+ flags |= FMODE_WRITE;
+ if (flags)
+ nfs_expire_unused_delegation_types(cps->clp, flags);
+
+ if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT))
+ pnfs_recall_all_layouts(cps->clp);
+
+ if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_READ)) {
+ set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &cps->clp->cl_state);
+ schedule_manager = true;
+ }
+ if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_RW)) {
+ set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &cps->clp->cl_state);
+ schedule_manager = true;
+ }
+ if (schedule_manager)
+ nfs4_schedule_state_manager(cps->clp);
+
+out:
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+ return status;
+}
+
+/* Reduce the fore channel's max_slots to the target value */
+__be32 nfs4_callback_recallslot(void *argp, void *resp,
+ struct cb_process_state *cps)
+{
+ struct cb_recallslotargs *args = argp;
+ struct nfs4_slot_table *fc_tbl;
+ __be32 status;
+
+ status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+ if (!cps->clp) /* set in cb_sequence */
+ goto out;
+
+ dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target highest slotid %u\n",
+ rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
+ args->crsa_target_highest_slotid);
+
+ fc_tbl = &cps->clp->cl_session->fc_slot_table;
+
+ status = htonl(NFS4_OK);
+
+ nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid);
+ nfs41_notify_server(cps->clp);
+out:
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+ return status;
+}
+
+__be32 nfs4_callback_notify_lock(void *argp, void *resp,
+ struct cb_process_state *cps)
+{
+ struct cb_notify_lock_args *args = argp;
+
+ if (!cps->clp) /* set in cb_sequence */
+ return htonl(NFS4ERR_OP_NOT_IN_SESSION);
+
+ dprintk_rcu("NFS: CB_NOTIFY_LOCK request from %s\n",
+ rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+ /* Don't wake anybody if the string looked bogus */
+ if (args->cbnl_valid)
+ __wake_up(&cps->clp->cl_lock_waitq, TASK_NORMAL, 0, args);
+
+ return htonl(NFS4_OK);
+}
+#endif /* CONFIG_NFS_V4_1 */
+#ifdef CONFIG_NFS_V4_2
+static void nfs4_copy_cb_args(struct nfs4_copy_state *cp_state,
+ struct cb_offloadargs *args)
+{
+ cp_state->count = args->wr_count;
+ cp_state->error = args->error;
+ if (!args->error) {
+ cp_state->verf.committed = args->wr_writeverf.committed;
+ memcpy(&cp_state->verf.verifier.data[0],
+ &args->wr_writeverf.verifier.data[0],
+ NFS4_VERIFIER_SIZE);
+ }
+}
+
+__be32 nfs4_callback_offload(void *data, void *dummy,
+ struct cb_process_state *cps)
+{
+ struct cb_offloadargs *args = data;
+ struct nfs_server *server;
+ struct nfs4_copy_state *copy, *tmp_copy;
+ bool found = false;
+
+ copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL);
+ if (!copy)
+ return htonl(NFS4ERR_SERVERFAULT);
+
+ spin_lock(&cps->clp->cl_lock);
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &cps->clp->cl_superblocks,
+ client_link) {
+ list_for_each_entry(tmp_copy, &server->ss_copies, copies) {
+ if (memcmp(args->coa_stateid.other,
+ tmp_copy->stateid.other,
+ sizeof(args->coa_stateid.other)))
+ continue;
+ nfs4_copy_cb_args(tmp_copy, args);
+ complete(&tmp_copy->completion);
+ found = true;
+ goto out;
+ }
+ }
+out:
+ rcu_read_unlock();
+ if (!found) {
+ memcpy(&copy->stateid, &args->coa_stateid, NFS4_STATEID_SIZE);
+ nfs4_copy_cb_args(copy, args);
+ list_add_tail(&copy->copies, &cps->clp->pending_cb_stateids);
+ } else
+ kfree(copy);
+ spin_unlock(&cps->clp->cl_lock);
+
+ trace_nfs4_cb_offload(&args->coa_fh, &args->coa_stateid,
+ args->wr_count, args->error,
+ args->wr_writeverf.committed);
+ return 0;
+}
+#endif /* CONFIG_NFS_V4_2 */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
new file mode 100644
index 0000000000..321af81c45
--- /dev/null
+++ b/fs/nfs/callback_xdr.c
@@ -0,0 +1,1096 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/callback_xdr.c
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFSv4 callback encode/decode procedures
+ */
+#include <linux/kernel.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/ratelimit.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/bc_xprt.h>
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "internal.h"
+#include "nfs4session.h"
+#include "nfs4trace.h"
+
+#define CB_OP_TAGLEN_MAXSZ (512)
+#define CB_OP_HDR_RES_MAXSZ (2 * 4) // opcode, status
+#define CB_OP_GETATTR_BITMAP_MAXSZ (4 * 4) // bitmap length, 3 bitmaps
+#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
+ CB_OP_GETATTR_BITMAP_MAXSZ + \
+ /* change, size, ctime, mtime */\
+ (2 + 2 + 3 + 3) * 4)
+#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+
+#if defined(CONFIG_NFS_V4_1)
+#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
+ NFS4_MAX_SESSIONID_LEN + \
+ (1 + 3) * 4) // seqid, 3 slotids
+#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_NOTIFY_LOCK_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#endif /* CONFIG_NFS_V4_1 */
+#ifdef CONFIG_NFS_V4_2
+#define CB_OP_OFFLOAD_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#endif /* CONFIG_NFS_V4_2 */
+
+#define NFSDBG_FACILITY NFSDBG_CALLBACK
+
+/* Internal error code */
+#define NFS4ERR_RESOURCE_HDR 11050
+
+struct callback_op {
+ __be32 (*process_op)(void *, void *, struct cb_process_state *);
+ __be32 (*decode_args)(struct svc_rqst *, struct xdr_stream *, void *);
+ __be32 (*encode_res)(struct svc_rqst *, struct xdr_stream *,
+ const void *);
+ long res_maxsize;
+};
+
+static struct callback_op callback_ops[];
+
+static __be32 nfs4_callback_null(struct svc_rqst *rqstp)
+{
+ return htonl(NFS4_OK);
+}
+
+/*
+ * svc_process_common() looks for an XDR encoder to know when
+ * not to drop a Reply.
+ */
+static bool nfs4_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+{
+ return true;
+}
+
+static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len,
+ const char **str, size_t maxlen)
+{
+ ssize_t err;
+
+ err = xdr_stream_decode_opaque_inline(xdr, (void **)str, maxlen);
+ if (err < 0)
+ return cpu_to_be32(NFS4ERR_RESOURCE);
+ *len = err;
+ return 0;
+}
+
+static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE);
+ fh->size = ntohl(*p);
+ if (fh->size > NFS4_FHSIZE)
+ return htonl(NFS4ERR_BADHANDLE);
+ p = xdr_inline_decode(xdr, fh->size);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE);
+ memcpy(&fh->data[0], p, fh->size);
+ memset(&fh->data[fh->size], 0, sizeof(fh->data) - fh->size);
+ return 0;
+}
+
+static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
+{
+ __be32 *p;
+ unsigned int attrlen;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE);
+ attrlen = ntohl(*p);
+ p = xdr_inline_decode(xdr, attrlen << 2);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE);
+ if (likely(attrlen > 0))
+ bitmap[0] = ntohl(*p++);
+ if (attrlen > 1)
+ bitmap[1] = ntohl(*p);
+ return 0;
+}
+
+static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE);
+ memcpy(stateid->data, p, NFS4_STATEID_SIZE);
+ return 0;
+}
+
+static __be32 decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_DELEGATION_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
+
+static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
+{
+ __be32 *p;
+ __be32 status;
+
+ status = decode_string(xdr, &hdr->taglen, &hdr->tag, CB_OP_TAGLEN_MAXSZ);
+ if (unlikely(status != 0))
+ return status;
+ p = xdr_inline_decode(xdr, 12);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE);
+ hdr->minorversion = ntohl(*p++);
+ /* Check for minor version support */
+ if (hdr->minorversion <= NFS4_MAX_MINOR_VERSION) {
+ hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 and v4.2 */
+ } else {
+ pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "
+ "illegal minor version %u!\n",
+ __func__, hdr->minorversion);
+ return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+ }
+ hdr->nops = ntohl(*p);
+ return 0;
+}
+
+static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
+{
+ __be32 *p;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE_HDR);
+ *op = ntohl(*p);
+ return 0;
+}
+
+static __be32 decode_getattr_args(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr, void *argp)
+{
+ struct cb_getattrargs *args = argp;
+ __be32 status;
+
+ status = decode_fh(xdr, &args->fh);
+ if (unlikely(status != 0))
+ return status;
+ return decode_bitmap(xdr, args->bitmap);
+}
+
+static __be32 decode_recall_args(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr, void *argp)
+{
+ struct cb_recallargs *args = argp;
+ __be32 *p;
+ __be32 status;
+
+ status = decode_delegation_stateid(xdr, &args->stateid);
+ if (unlikely(status != 0))
+ return status;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE);
+ args->truncate = ntohl(*p);
+ return decode_fh(xdr, &args->fh);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+static __be32 decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_LAYOUT_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
+
+static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr, void *argp)
+{
+ struct cb_layoutrecallargs *args = argp;
+ __be32 *p;
+ __be32 status = 0;
+ uint32_t iomode;
+
+ p = xdr_inline_decode(xdr, 4 * sizeof(uint32_t));
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_BADXDR);
+
+ args->cbl_layout_type = ntohl(*p++);
+ /* Depite the spec's xdr, iomode really belongs in the FILE switch,
+ * as it is unusable and ignored with the other types.
+ */
+ iomode = ntohl(*p++);
+ args->cbl_layoutchanged = ntohl(*p++);
+ args->cbl_recall_type = ntohl(*p++);
+
+ if (args->cbl_recall_type == RETURN_FILE) {
+ args->cbl_range.iomode = iomode;
+ status = decode_fh(xdr, &args->cbl_fh);
+ if (unlikely(status != 0))
+ return status;
+
+ p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t));
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_BADXDR);
+ p = xdr_decode_hyper(p, &args->cbl_range.offset);
+ p = xdr_decode_hyper(p, &args->cbl_range.length);
+ return decode_layout_stateid(xdr, &args->cbl_stateid);
+ } else if (args->cbl_recall_type == RETURN_FSID) {
+ p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t));
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_BADXDR);
+ p = xdr_decode_hyper(p, &args->cbl_fsid.major);
+ p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
+ } else if (args->cbl_recall_type != RETURN_ALL)
+ return htonl(NFS4ERR_BADXDR);
+ return 0;
+}
+
+static
+__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *argp)
+{
+ struct cb_devicenotifyargs *args = argp;
+ uint32_t tmp, n, i;
+ __be32 *p;
+ __be32 status = 0;
+
+ /* Num of device notifications */
+ p = xdr_inline_decode(xdr, sizeof(uint32_t));
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto out;
+ }
+ n = ntohl(*p++);
+ if (n == 0)
+ goto out;
+
+ args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL);
+ if (!args->devs) {
+ status = htonl(NFS4ERR_DELAY);
+ goto out;
+ }
+
+ /* Decode each dev notification */
+ for (i = 0; i < n; i++) {
+ struct cb_devicenotifyitem *dev = &args->devs[i];
+
+ p = xdr_inline_decode(xdr, (4 * sizeof(uint32_t)) +
+ NFS4_DEVICEID4_SIZE);
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto err;
+ }
+
+ tmp = ntohl(*p++); /* bitmap size */
+ if (tmp != 1) {
+ status = htonl(NFS4ERR_INVAL);
+ goto err;
+ }
+ dev->cbd_notify_type = ntohl(*p++);
+ if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
+ dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
+ status = htonl(NFS4ERR_INVAL);
+ goto err;
+ }
+
+ tmp = ntohl(*p++); /* opaque size */
+ if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
+ (tmp != NFS4_DEVICEID4_SIZE + 8)) ||
+ ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
+ (tmp != NFS4_DEVICEID4_SIZE + 4))) {
+ status = htonl(NFS4ERR_INVAL);
+ goto err;
+ }
+ dev->cbd_layout_type = ntohl(*p++);
+ memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
+ p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+ if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) {
+ p = xdr_inline_decode(xdr, sizeof(uint32_t));
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto err;
+ }
+ dev->cbd_immediate = ntohl(*p++);
+ } else {
+ dev->cbd_immediate = 0;
+ }
+
+ dprintk("%s: type %d layout 0x%x immediate %d\n",
+ __func__, dev->cbd_notify_type, dev->cbd_layout_type,
+ dev->cbd_immediate);
+ }
+ args->ndevs = n;
+ dprintk("%s: ndevs %d\n", __func__, args->ndevs);
+ return 0;
+err:
+ kfree(args->devs);
+out:
+ args->devs = NULL;
+ args->ndevs = 0;
+ dprintk("%s: status %d ndevs %d\n",
+ __func__, ntohl(status), args->ndevs);
+ return status;
+}
+
+static __be32 decode_sessionid(struct xdr_stream *xdr,
+ struct nfs4_sessionid *sid)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE);
+
+ memcpy(sid->data, p, NFS4_MAX_SESSIONID_LEN);
+ return 0;
+}
+
+static __be32 decode_rc_list(struct xdr_stream *xdr,
+ struct referring_call_list *rc_list)
+{
+ __be32 *p;
+ int i;
+ __be32 status;
+
+ status = decode_sessionid(xdr, &rc_list->rcl_sessionid);
+ if (status)
+ goto out;
+
+ status = htonl(NFS4ERR_RESOURCE);
+ p = xdr_inline_decode(xdr, sizeof(uint32_t));
+ if (unlikely(p == NULL))
+ goto out;
+
+ rc_list->rcl_nrefcalls = ntohl(*p++);
+ if (rc_list->rcl_nrefcalls) {
+ p = xdr_inline_decode(xdr,
+ rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t));
+ if (unlikely(p == NULL))
+ goto out;
+ rc_list->rcl_refcalls = kmalloc_array(rc_list->rcl_nrefcalls,
+ sizeof(*rc_list->rcl_refcalls),
+ GFP_KERNEL);
+ if (unlikely(rc_list->rcl_refcalls == NULL))
+ goto out;
+ for (i = 0; i < rc_list->rcl_nrefcalls; i++) {
+ rc_list->rcl_refcalls[i].rc_sequenceid = ntohl(*p++);
+ rc_list->rcl_refcalls[i].rc_slotid = ntohl(*p++);
+ }
+ }
+ status = 0;
+
+out:
+ return status;
+}
+
+static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *argp)
+{
+ struct cb_sequenceargs *args = argp;
+ __be32 *p;
+ int i;
+ __be32 status;
+
+ status = decode_sessionid(xdr, &args->csa_sessionid);
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 5 * sizeof(uint32_t));
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE);
+
+ args->csa_addr = svc_addr(rqstp);
+ args->csa_sequenceid = ntohl(*p++);
+ args->csa_slotid = ntohl(*p++);
+ args->csa_highestslotid = ntohl(*p++);
+ args->csa_cachethis = ntohl(*p++);
+ args->csa_nrclists = ntohl(*p++);
+ args->csa_rclists = NULL;
+ if (args->csa_nrclists) {
+ args->csa_rclists = kmalloc_array(args->csa_nrclists,
+ sizeof(*args->csa_rclists),
+ GFP_KERNEL);
+ if (unlikely(args->csa_rclists == NULL))
+ return htonl(NFS4ERR_RESOURCE);
+
+ for (i = 0; i < args->csa_nrclists; i++) {
+ status = decode_rc_list(xdr, &args->csa_rclists[i]);
+ if (status) {
+ args->csa_nrclists = i;
+ goto out_free;
+ }
+ }
+ }
+ return 0;
+
+out_free:
+ for (i = 0; i < args->csa_nrclists; i++)
+ kfree(args->csa_rclists[i].rcl_refcalls);
+ kfree(args->csa_rclists);
+ return status;
+}
+
+static __be32 decode_recallany_args(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *argp)
+{
+ struct cb_recallanyargs *args = argp;
+ uint32_t bitmap[2];
+ __be32 *p, status;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_BADXDR);
+ args->craa_objs_to_keep = ntohl(*p++);
+ status = decode_bitmap(xdr, bitmap);
+ if (unlikely(status))
+ return status;
+ args->craa_type_mask = bitmap[0];
+
+ return 0;
+}
+
+static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *argp)
+{
+ struct cb_recallslotargs *args = argp;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_BADXDR);
+ args->crsa_target_highest_slotid = ntohl(*p++);
+ return 0;
+}
+
+static __be32 decode_lockowner(struct xdr_stream *xdr, struct cb_notify_lock_args *args)
+{
+ __be32 *p;
+ unsigned int len;
+
+ p = xdr_inline_decode(xdr, 12);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_BADXDR);
+
+ p = xdr_decode_hyper(p, &args->cbnl_owner.clientid);
+ len = be32_to_cpu(*p);
+
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_BADXDR);
+
+ /* Only try to decode if the length is right */
+ if (len == 20) {
+ p += 2; /* skip "lock id:" */
+ args->cbnl_owner.s_dev = be32_to_cpu(*p++);
+ xdr_decode_hyper(p, &args->cbnl_owner.id);
+ args->cbnl_valid = true;
+ } else {
+ args->cbnl_owner.s_dev = 0;
+ args->cbnl_owner.id = 0;
+ args->cbnl_valid = false;
+ }
+ return 0;
+}
+
+static __be32 decode_notify_lock_args(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr, void *argp)
+{
+ struct cb_notify_lock_args *args = argp;
+ __be32 status;
+
+ status = decode_fh(xdr, &args->cbnl_fh);
+ if (unlikely(status != 0))
+ return status;
+ return decode_lockowner(xdr, args);
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+#ifdef CONFIG_NFS_V4_2
+static __be32 decode_write_response(struct xdr_stream *xdr,
+ struct cb_offloadargs *args)
+{
+ __be32 *p;
+
+ /* skip the always zero field */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out;
+ p++;
+
+ /* decode count, stable_how, verifier */
+ p = xdr_inline_decode(xdr, 8 + 4);
+ if (unlikely(!p))
+ goto out;
+ p = xdr_decode_hyper(p, &args->wr_count);
+ args->wr_writeverf.committed = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, NFS4_VERIFIER_SIZE);
+ if (likely(p)) {
+ memcpy(&args->wr_writeverf.verifier.data[0], p,
+ NFS4_VERIFIER_SIZE);
+ return 0;
+ }
+out:
+ return htonl(NFS4ERR_RESOURCE);
+}
+
+static __be32 decode_offload_args(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct cb_offloadargs *args = data;
+ __be32 *p;
+ __be32 status;
+
+ /* decode fh */
+ status = decode_fh(xdr, &args->coa_fh);
+ if (unlikely(status != 0))
+ return status;
+
+ /* decode stateid */
+ status = decode_stateid(xdr, &args->coa_stateid);
+ if (unlikely(status != 0))
+ return status;
+
+ /* decode status */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out;
+ args->error = ntohl(*p++);
+ if (!args->error) {
+ status = decode_write_response(xdr, args);
+ if (unlikely(status != 0))
+ return status;
+ } else {
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out;
+ p = xdr_decode_hyper(p, &args->wr_count);
+ }
+ return 0;
+out:
+ return htonl(NFS4ERR_RESOURCE);
+}
+#endif /* CONFIG_NFS_V4_2 */
+static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
+{
+ if (unlikely(xdr_stream_encode_opaque(xdr, str, len) < 0))
+ return cpu_to_be32(NFS4ERR_RESOURCE);
+ return 0;
+}
+
+static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, size_t sz)
+{
+ if (xdr_stream_encode_uint32_array(xdr, bitmap, sz) < 0)
+ return cpu_to_be32(NFS4ERR_RESOURCE);
+ return 0;
+}
+
+static __be32 encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change)
+{
+ __be32 *p;
+
+ if (!(bitmap[0] & FATTR4_WORD0_CHANGE))
+ return 0;
+ p = xdr_reserve_space(xdr, 8);
+ if (unlikely(!p))
+ return htonl(NFS4ERR_RESOURCE);
+ p = xdr_encode_hyper(p, change);
+ return 0;
+}
+
+static __be32 encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size)
+{
+ __be32 *p;
+
+ if (!(bitmap[0] & FATTR4_WORD0_SIZE))
+ return 0;
+ p = xdr_reserve_space(xdr, 8);
+ if (unlikely(!p))
+ return htonl(NFS4ERR_RESOURCE);
+ p = xdr_encode_hyper(p, size);
+ return 0;
+}
+
+static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec64 *time)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 12);
+ if (unlikely(!p))
+ return htonl(NFS4ERR_RESOURCE);
+ p = xdr_encode_hyper(p, time->tv_sec);
+ *p = htonl(time->tv_nsec);
+ return 0;
+}
+
+static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec64 *time)
+{
+ if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA))
+ return 0;
+ return encode_attr_time(xdr,time);
+}
+
+static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec64 *time)
+{
+ if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY))
+ return 0;
+ return encode_attr_time(xdr,time);
+}
+
+static __be32 encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr)
+{
+ __be32 status;
+
+ hdr->status = xdr_reserve_space(xdr, 4);
+ if (unlikely(hdr->status == NULL))
+ return htonl(NFS4ERR_RESOURCE);
+ status = encode_string(xdr, hdr->taglen, hdr->tag);
+ if (unlikely(status != 0))
+ return status;
+ hdr->nops = xdr_reserve_space(xdr, 4);
+ if (unlikely(hdr->nops == NULL))
+ return htonl(NFS4ERR_RESOURCE);
+ return 0;
+}
+
+static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 8);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE_HDR);
+ *p++ = htonl(op);
+ *p = res;
+ return 0;
+}
+
+static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr,
+ const void *resp)
+{
+ const struct cb_getattrres *res = resp;
+ __be32 *savep = NULL;
+ __be32 status = res->status;
+
+ if (unlikely(status != 0))
+ goto out;
+ status = encode_attr_bitmap(xdr, res->bitmap, ARRAY_SIZE(res->bitmap));
+ if (unlikely(status != 0))
+ goto out;
+ status = cpu_to_be32(NFS4ERR_RESOURCE);
+ savep = xdr_reserve_space(xdr, sizeof(*savep));
+ if (unlikely(!savep))
+ goto out;
+ status = encode_attr_change(xdr, res->bitmap, res->change_attr);
+ if (unlikely(status != 0))
+ goto out;
+ status = encode_attr_size(xdr, res->bitmap, res->size);
+ if (unlikely(status != 0))
+ goto out;
+ status = encode_attr_ctime(xdr, res->bitmap, &res->ctime);
+ if (unlikely(status != 0))
+ goto out;
+ status = encode_attr_mtime(xdr, res->bitmap, &res->mtime);
+ *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1)));
+out:
+ return status;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+static __be32 encode_sessionid(struct xdr_stream *xdr,
+ const struct nfs4_sessionid *sid)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE);
+
+ memcpy(p, sid, NFS4_MAX_SESSIONID_LEN);
+ return 0;
+}
+
+static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ const void *resp)
+{
+ const struct cb_sequenceres *res = resp;
+ __be32 *p;
+ __be32 status = res->csr_status;
+
+ if (unlikely(status != 0))
+ return status;
+
+ status = encode_sessionid(xdr, &res->csr_sessionid);
+ if (status)
+ return status;
+
+ p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t));
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE);
+
+ *p++ = htonl(res->csr_sequenceid);
+ *p++ = htonl(res->csr_slotid);
+ *p++ = htonl(res->csr_highestslotid);
+ *p++ = htonl(res->csr_target_highestslotid);
+ return 0;
+}
+
+static __be32
+preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+ if (op_nr == OP_CB_SEQUENCE) {
+ if (nop != 0)
+ return htonl(NFS4ERR_SEQUENCE_POS);
+ } else {
+ if (nop == 0)
+ return htonl(NFS4ERR_OP_NOT_IN_SESSION);
+ }
+
+ switch (op_nr) {
+ case OP_CB_GETATTR:
+ case OP_CB_RECALL:
+ case OP_CB_SEQUENCE:
+ case OP_CB_RECALL_ANY:
+ case OP_CB_RECALL_SLOT:
+ case OP_CB_LAYOUTRECALL:
+ case OP_CB_NOTIFY_DEVICEID:
+ case OP_CB_NOTIFY_LOCK:
+ *op = &callback_ops[op_nr];
+ break;
+
+ case OP_CB_NOTIFY:
+ case OP_CB_PUSH_DELEG:
+ case OP_CB_RECALLABLE_OBJ_AVAIL:
+ case OP_CB_WANTS_CANCELLED:
+ return htonl(NFS4ERR_NOTSUPP);
+
+ default:
+ return htonl(NFS4ERR_OP_ILLEGAL);
+ }
+
+ return htonl(NFS_OK);
+}
+
+static void nfs4_callback_free_slot(struct nfs4_session *session,
+ struct nfs4_slot *slot)
+{
+ struct nfs4_slot_table *tbl = &session->bc_slot_table;
+
+ spin_lock(&tbl->slot_tbl_lock);
+ /*
+ * Let the state manager know callback processing done.
+ * A single slot, so highest used slotid is either 0 or -1
+ */
+ nfs4_free_slot(tbl, slot);
+ spin_unlock(&tbl->slot_tbl_lock);
+}
+
+static void nfs4_cb_free_slot(struct cb_process_state *cps)
+{
+ if (cps->slot) {
+ nfs4_callback_free_slot(cps->clp->cl_session, cps->slot);
+ cps->slot = NULL;
+ }
+}
+
+#else /* CONFIG_NFS_V4_1 */
+
+static __be32
+preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+ return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+}
+
+static void nfs4_cb_free_slot(struct cb_process_state *cps)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+#ifdef CONFIG_NFS_V4_2
+static __be32
+preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+ __be32 status = preprocess_nfs41_op(nop, op_nr, op);
+ if (status != htonl(NFS4ERR_OP_ILLEGAL))
+ return status;
+
+ if (op_nr == OP_CB_OFFLOAD) {
+ *op = &callback_ops[op_nr];
+ return htonl(NFS_OK);
+ } else
+ return htonl(NFS4ERR_NOTSUPP);
+ return htonl(NFS4ERR_OP_ILLEGAL);
+}
+#else /* CONFIG_NFS_V4_2 */
+static __be32
+preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+ return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+static __be32
+preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
+{
+ switch (op_nr) {
+ case OP_CB_GETATTR:
+ case OP_CB_RECALL:
+ *op = &callback_ops[op_nr];
+ break;
+ default:
+ return htonl(NFS4ERR_OP_ILLEGAL);
+ }
+
+ return htonl(NFS_OK);
+}
+
+static __be32 process_op(int nop, struct svc_rqst *rqstp,
+ struct cb_process_state *cps)
+{
+ struct xdr_stream *xdr_out = &rqstp->rq_res_stream;
+ struct callback_op *op = &callback_ops[0];
+ unsigned int op_nr;
+ __be32 status;
+ long maxlen;
+ __be32 res;
+
+ status = decode_op_hdr(&rqstp->rq_arg_stream, &op_nr);
+ if (unlikely(status))
+ return status;
+
+ switch (cps->minorversion) {
+ case 0:
+ status = preprocess_nfs4_op(op_nr, &op);
+ break;
+ case 1:
+ status = preprocess_nfs41_op(nop, op_nr, &op);
+ break;
+ case 2:
+ status = preprocess_nfs42_op(nop, op_nr, &op);
+ break;
+ default:
+ status = htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+ }
+
+ if (status == htonl(NFS4ERR_OP_ILLEGAL))
+ op_nr = OP_CB_ILLEGAL;
+ if (status)
+ goto encode_hdr;
+
+ if (cps->drc_status) {
+ status = cps->drc_status;
+ goto encode_hdr;
+ }
+
+ maxlen = xdr_out->end - xdr_out->p;
+ if (maxlen > 0 && maxlen < PAGE_SIZE) {
+ status = op->decode_args(rqstp, &rqstp->rq_arg_stream,
+ rqstp->rq_argp);
+ if (likely(status == 0))
+ status = op->process_op(rqstp->rq_argp, rqstp->rq_resp,
+ cps);
+ } else
+ status = htonl(NFS4ERR_RESOURCE);
+
+encode_hdr:
+ res = encode_op_hdr(xdr_out, op_nr, status);
+ if (unlikely(res))
+ return res;
+ if (op->encode_res != NULL && status == 0)
+ status = op->encode_res(rqstp, xdr_out, rqstp->rq_resp);
+ return status;
+}
+
+/*
+ * Decode, process and encode a COMPOUND
+ */
+static __be32 nfs4_callback_compound(struct svc_rqst *rqstp)
+{
+ struct cb_compound_hdr_arg hdr_arg = { 0 };
+ struct cb_compound_hdr_res hdr_res = { NULL };
+ struct cb_process_state cps = {
+ .drc_status = 0,
+ .clp = NULL,
+ .net = SVC_NET(rqstp),
+ };
+ unsigned int nops = 0;
+ __be32 status;
+
+ status = decode_compound_hdr_arg(&rqstp->rq_arg_stream, &hdr_arg);
+ if (status == htonl(NFS4ERR_RESOURCE))
+ return rpc_garbage_args;
+
+ if (hdr_arg.minorversion == 0) {
+ cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident);
+ if (!cps.clp) {
+ trace_nfs_cb_no_clp(rqstp->rq_xid, hdr_arg.cb_ident);
+ goto out_invalidcred;
+ }
+ if (!check_gss_callback_principal(cps.clp, rqstp)) {
+ trace_nfs_cb_badprinc(rqstp->rq_xid, hdr_arg.cb_ident);
+ nfs_put_client(cps.clp);
+ goto out_invalidcred;
+ }
+ }
+
+ cps.minorversion = hdr_arg.minorversion;
+ hdr_res.taglen = hdr_arg.taglen;
+ hdr_res.tag = hdr_arg.tag;
+ if (encode_compound_hdr_res(&rqstp->rq_res_stream, &hdr_res) != 0) {
+ if (cps.clp)
+ nfs_put_client(cps.clp);
+ return rpc_system_err;
+ }
+ while (status == 0 && nops != hdr_arg.nops) {
+ status = process_op(nops, rqstp, &cps);
+ nops++;
+ }
+
+ /* Buffer overflow in decode_ops_hdr or encode_ops_hdr. Return
+ * resource error in cb_compound status without returning op */
+ if (unlikely(status == htonl(NFS4ERR_RESOURCE_HDR))) {
+ status = htonl(NFS4ERR_RESOURCE);
+ nops--;
+ }
+
+ *hdr_res.status = status;
+ *hdr_res.nops = htonl(nops);
+ nfs4_cb_free_slot(&cps);
+ nfs_put_client(cps.clp);
+ return rpc_success;
+
+out_invalidcred:
+ pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n");
+ rqstp->rq_auth_stat = rpc_autherr_badcred;
+ return rpc_success;
+}
+
+static int
+nfs_callback_dispatch(struct svc_rqst *rqstp)
+{
+ const struct svc_procedure *procp = rqstp->rq_procinfo;
+
+ *rqstp->rq_accept_statp = procp->pc_func(rqstp);
+ return 1;
+}
+
+/*
+ * Define NFS4 callback COMPOUND ops.
+ */
+static struct callback_op callback_ops[] = {
+ [0] = {
+ .res_maxsize = CB_OP_HDR_RES_MAXSZ,
+ },
+ [OP_CB_GETATTR] = {
+ .process_op = nfs4_callback_getattr,
+ .decode_args = decode_getattr_args,
+ .encode_res = encode_getattr_res,
+ .res_maxsize = CB_OP_GETATTR_RES_MAXSZ,
+ },
+ [OP_CB_RECALL] = {
+ .process_op = nfs4_callback_recall,
+ .decode_args = decode_recall_args,
+ .res_maxsize = CB_OP_RECALL_RES_MAXSZ,
+ },
+#if defined(CONFIG_NFS_V4_1)
+ [OP_CB_LAYOUTRECALL] = {
+ .process_op = nfs4_callback_layoutrecall,
+ .decode_args = decode_layoutrecall_args,
+ .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
+ },
+ [OP_CB_NOTIFY_DEVICEID] = {
+ .process_op = nfs4_callback_devicenotify,
+ .decode_args = decode_devicenotify_args,
+ .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ,
+ },
+ [OP_CB_SEQUENCE] = {
+ .process_op = nfs4_callback_sequence,
+ .decode_args = decode_cb_sequence_args,
+ .encode_res = encode_cb_sequence_res,
+ .res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ,
+ },
+ [OP_CB_RECALL_ANY] = {
+ .process_op = nfs4_callback_recallany,
+ .decode_args = decode_recallany_args,
+ .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ,
+ },
+ [OP_CB_RECALL_SLOT] = {
+ .process_op = nfs4_callback_recallslot,
+ .decode_args = decode_recallslot_args,
+ .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ,
+ },
+ [OP_CB_NOTIFY_LOCK] = {
+ .process_op = nfs4_callback_notify_lock,
+ .decode_args = decode_notify_lock_args,
+ .res_maxsize = CB_OP_NOTIFY_LOCK_RES_MAXSZ,
+ },
+#endif /* CONFIG_NFS_V4_1 */
+#ifdef CONFIG_NFS_V4_2
+ [OP_CB_OFFLOAD] = {
+ .process_op = nfs4_callback_offload,
+ .decode_args = decode_offload_args,
+ .res_maxsize = CB_OP_OFFLOAD_RES_MAXSZ,
+ },
+#endif /* CONFIG_NFS_V4_2 */
+};
+
+/*
+ * Define NFS4 callback procedures
+ */
+static const struct svc_procedure nfs4_callback_procedures1[] = {
+ [CB_NULL] = {
+ .pc_func = nfs4_callback_null,
+ .pc_encode = nfs4_encode_void,
+ .pc_xdrressize = 1,
+ .pc_name = "NULL",
+ },
+ [CB_COMPOUND] = {
+ .pc_func = nfs4_callback_compound,
+ .pc_encode = nfs4_encode_void,
+ .pc_argsize = 256,
+ .pc_argzero = 256,
+ .pc_ressize = 256,
+ .pc_xdrressize = NFS4_CALLBACK_BUFSIZE,
+ .pc_name = "COMPOUND",
+ }
+};
+
+static DEFINE_PER_CPU_ALIGNED(unsigned long,
+ nfs4_callback_count1[ARRAY_SIZE(nfs4_callback_procedures1)]);
+const struct svc_version nfs4_callback_version1 = {
+ .vs_vers = 1,
+ .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1),
+ .vs_proc = nfs4_callback_procedures1,
+ .vs_count = nfs4_callback_count1,
+ .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
+ .vs_dispatch = nfs_callback_dispatch,
+ .vs_hidden = true,
+ .vs_need_cong_ctrl = true,
+};
+
+static DEFINE_PER_CPU_ALIGNED(unsigned long,
+ nfs4_callback_count4[ARRAY_SIZE(nfs4_callback_procedures1)]);
+const struct svc_version nfs4_callback_version4 = {
+ .vs_vers = 4,
+ .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1),
+ .vs_proc = nfs4_callback_procedures1,
+ .vs_count = nfs4_callback_count4,
+ .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
+ .vs_dispatch = nfs_callback_dispatch,
+ .vs_hidden = true,
+ .vs_need_cong_ctrl = true,
+};
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
new file mode 100644
index 0000000000..44eca51b28
--- /dev/null
+++ b/fs/nfs/client.c
@@ -0,0 +1,1424 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* client.c: NFS client sharing and management code
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/xprtsock.h>
+#include <linux/sunrpc/xprtrdma.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <net/ipv6.h>
+#include <linux/nfs_xdr.h>
+#include <linux/sunrpc/bc_xprt.h>
+#include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
+
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+#include "fscache.h"
+#include "pnfs.h"
+#include "nfs.h"
+#include "netns.h"
+#include "sysfs.h"
+#include "nfs42.h"
+
+#define NFSDBG_FACILITY NFSDBG_CLIENT
+
+static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
+static DEFINE_SPINLOCK(nfs_version_lock);
+static DEFINE_MUTEX(nfs_version_mutex);
+static LIST_HEAD(nfs_versions);
+
+/*
+ * RPC cruft for NFS
+ */
+static const struct rpc_version *nfs_version[5] = {
+ [2] = NULL,
+ [3] = NULL,
+ [4] = NULL,
+};
+
+const struct rpc_program nfs_program = {
+ .name = "nfs",
+ .number = NFS_PROGRAM,
+ .nrvers = ARRAY_SIZE(nfs_version),
+ .version = nfs_version,
+ .stats = &nfs_rpcstat,
+ .pipe_dir_name = NFS_PIPE_DIRNAME,
+};
+
+struct rpc_stat nfs_rpcstat = {
+ .program = &nfs_program
+};
+
+static struct nfs_subversion *find_nfs_version(unsigned int version)
+{
+ struct nfs_subversion *nfs;
+ spin_lock(&nfs_version_lock);
+
+ list_for_each_entry(nfs, &nfs_versions, list) {
+ if (nfs->rpc_ops->version == version) {
+ spin_unlock(&nfs_version_lock);
+ return nfs;
+ }
+ }
+
+ spin_unlock(&nfs_version_lock);
+ return ERR_PTR(-EPROTONOSUPPORT);
+}
+
+struct nfs_subversion *get_nfs_version(unsigned int version)
+{
+ struct nfs_subversion *nfs = find_nfs_version(version);
+
+ if (IS_ERR(nfs)) {
+ mutex_lock(&nfs_version_mutex);
+ request_module("nfsv%d", version);
+ nfs = find_nfs_version(version);
+ mutex_unlock(&nfs_version_mutex);
+ }
+
+ if (!IS_ERR(nfs) && !try_module_get(nfs->owner))
+ return ERR_PTR(-EAGAIN);
+ return nfs;
+}
+
+void put_nfs_version(struct nfs_subversion *nfs)
+{
+ module_put(nfs->owner);
+}
+
+void register_nfs_version(struct nfs_subversion *nfs)
+{
+ spin_lock(&nfs_version_lock);
+
+ list_add(&nfs->list, &nfs_versions);
+ nfs_version[nfs->rpc_ops->version] = nfs->rpc_vers;
+
+ spin_unlock(&nfs_version_lock);
+}
+EXPORT_SYMBOL_GPL(register_nfs_version);
+
+void unregister_nfs_version(struct nfs_subversion *nfs)
+{
+ spin_lock(&nfs_version_lock);
+
+ nfs_version[nfs->rpc_ops->version] = NULL;
+ list_del(&nfs->list);
+
+ spin_unlock(&nfs_version_lock);
+}
+EXPORT_SYMBOL_GPL(unregister_nfs_version);
+
+/*
+ * Allocate a shared client record
+ *
+ * Since these are allocated/deallocated very rarely, we don't
+ * bother putting them in a slab cache...
+ */
+struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
+{
+ struct nfs_client *clp;
+ int err = -ENOMEM;
+
+ if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
+ goto error_0;
+
+ clp->cl_minorversion = cl_init->minorversion;
+ clp->cl_nfs_mod = cl_init->nfs_mod;
+ if (!try_module_get(clp->cl_nfs_mod->owner))
+ goto error_dealloc;
+
+ clp->rpc_ops = clp->cl_nfs_mod->rpc_ops;
+
+ refcount_set(&clp->cl_count, 1);
+ clp->cl_cons_state = NFS_CS_INITING;
+
+ memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen);
+ clp->cl_addrlen = cl_init->addrlen;
+
+ if (cl_init->hostname) {
+ err = -ENOMEM;
+ clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL);
+ if (!clp->cl_hostname)
+ goto error_cleanup;
+ }
+
+ INIT_LIST_HEAD(&clp->cl_superblocks);
+ clp->cl_rpcclient = ERR_PTR(-EINVAL);
+
+ clp->cl_flags = cl_init->init_flags;
+ clp->cl_proto = cl_init->proto;
+ clp->cl_nconnect = cl_init->nconnect;
+ clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1;
+ clp->cl_net = get_net(cl_init->net);
+
+ clp->cl_principal = "*";
+ clp->cl_xprtsec = cl_init->xprtsec;
+ return clp;
+
+error_cleanup:
+ put_nfs_version(clp->cl_nfs_mod);
+error_dealloc:
+ kfree(clp);
+error_0:
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_client);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+static void nfs_cleanup_cb_ident_idr(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ idr_destroy(&nn->cb_ident_idr);
+}
+
+/* nfs_client_lock held */
+static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
+{
+ struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+
+ if (clp->cl_cb_ident)
+ idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident);
+}
+
+static void pnfs_init_server(struct nfs_server *server)
+{
+ rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
+}
+
+#else
+static void nfs_cleanup_cb_ident_idr(struct net *net)
+{
+}
+
+static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
+{
+}
+
+static void pnfs_init_server(struct nfs_server *server)
+{
+}
+
+#endif /* CONFIG_NFS_V4 */
+
+/*
+ * Destroy a shared client record
+ */
+void nfs_free_client(struct nfs_client *clp)
+{
+ /* -EIO all pending I/O */
+ if (!IS_ERR(clp->cl_rpcclient))
+ rpc_shutdown_client(clp->cl_rpcclient);
+
+ put_net(clp->cl_net);
+ put_nfs_version(clp->cl_nfs_mod);
+ kfree(clp->cl_hostname);
+ kfree(clp->cl_acceptor);
+ kfree(clp);
+}
+EXPORT_SYMBOL_GPL(nfs_free_client);
+
+/*
+ * Release a reference to a shared client record
+ */
+void nfs_put_client(struct nfs_client *clp)
+{
+ struct nfs_net *nn;
+
+ if (!clp)
+ return;
+
+ nn = net_generic(clp->cl_net, nfs_net_id);
+
+ if (refcount_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) {
+ list_del(&clp->cl_share_link);
+ nfs_cb_idr_remove_locked(clp);
+ spin_unlock(&nn->nfs_client_lock);
+
+ WARN_ON_ONCE(!list_empty(&clp->cl_superblocks));
+
+ clp->rpc_ops->free_client(clp);
+ }
+}
+EXPORT_SYMBOL_GPL(nfs_put_client);
+
+/*
+ * Find an nfs_client on the list that matches the initialisation data
+ * that is supplied.
+ */
+static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data)
+{
+ struct nfs_client *clp;
+ const struct sockaddr *sap = (struct sockaddr *)data->addr;
+ struct nfs_net *nn = net_generic(data->net, nfs_net_id);
+ int error;
+
+again:
+ list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
+ const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
+ /* Don't match clients that failed to initialise properly */
+ if (clp->cl_cons_state < 0)
+ continue;
+
+ /* If a client is still initializing then we need to wait */
+ if (clp->cl_cons_state > NFS_CS_READY) {
+ refcount_inc(&clp->cl_count);
+ spin_unlock(&nn->nfs_client_lock);
+ error = nfs_wait_client_init_complete(clp);
+ nfs_put_client(clp);
+ spin_lock(&nn->nfs_client_lock);
+ if (error < 0)
+ return ERR_PTR(error);
+ goto again;
+ }
+
+ /* Different NFS versions cannot share the same nfs_client */
+ if (clp->rpc_ops != data->nfs_mod->rpc_ops)
+ continue;
+
+ if (clp->cl_proto != data->proto)
+ continue;
+ /* Match nfsv4 minorversion */
+ if (clp->cl_minorversion != data->minorversion)
+ continue;
+
+ /* Match request for a dedicated DS */
+ if (test_bit(NFS_CS_DS, &data->init_flags) !=
+ test_bit(NFS_CS_DS, &clp->cl_flags))
+ continue;
+
+ /* Match the full socket address */
+ if (!rpc_cmp_addr_port(sap, clap))
+ /* Match all xprt_switch full socket addresses */
+ if (IS_ERR(clp->cl_rpcclient) ||
+ !rpc_clnt_xprt_switch_has_addr(clp->cl_rpcclient,
+ sap))
+ continue;
+
+ /* Match the xprt security policy */
+ if (clp->cl_xprtsec.policy != data->xprtsec.policy)
+ continue;
+
+ refcount_inc(&clp->cl_count);
+ return clp;
+ }
+ return NULL;
+}
+
+/*
+ * Return true if @clp is done initializing, false if still working on it.
+ *
+ * Use nfs_client_init_status to check if it was successful.
+ */
+bool nfs_client_init_is_complete(const struct nfs_client *clp)
+{
+ return clp->cl_cons_state <= NFS_CS_READY;
+}
+EXPORT_SYMBOL_GPL(nfs_client_init_is_complete);
+
+/*
+ * Return 0 if @clp was successfully initialized, -errno otherwise.
+ *
+ * This must be called *after* nfs_client_init_is_complete() returns true,
+ * otherwise it will pop WARN_ON_ONCE and return -EINVAL
+ */
+int nfs_client_init_status(const struct nfs_client *clp)
+{
+ /* called without checking nfs_client_init_is_complete */
+ if (clp->cl_cons_state > NFS_CS_READY) {
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+ return clp->cl_cons_state;
+}
+EXPORT_SYMBOL_GPL(nfs_client_init_status);
+
+int nfs_wait_client_init_complete(const struct nfs_client *clp)
+{
+ return wait_event_killable(nfs_client_active_wq,
+ nfs_client_init_is_complete(clp));
+}
+EXPORT_SYMBOL_GPL(nfs_wait_client_init_complete);
+
+/*
+ * Found an existing client. Make sure it's ready before returning.
+ */
+static struct nfs_client *
+nfs_found_client(const struct nfs_client_initdata *cl_init,
+ struct nfs_client *clp)
+{
+ int error;
+
+ error = nfs_wait_client_init_complete(clp);
+ if (error < 0) {
+ nfs_put_client(clp);
+ return ERR_PTR(-ERESTARTSYS);
+ }
+
+ if (clp->cl_cons_state < NFS_CS_READY) {
+ error = clp->cl_cons_state;
+ nfs_put_client(clp);
+ return ERR_PTR(error);
+ }
+
+ smp_rmb();
+ return clp;
+}
+
+/*
+ * Look up a client by IP address and protocol version
+ * - creates a new record if one doesn't yet exist
+ */
+struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
+{
+ struct nfs_client *clp, *new = NULL;
+ struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
+ const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops;
+
+ if (cl_init->hostname == NULL) {
+ WARN_ON(1);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* see if the client already exists */
+ do {
+ spin_lock(&nn->nfs_client_lock);
+
+ clp = nfs_match_client(cl_init);
+ if (clp) {
+ spin_unlock(&nn->nfs_client_lock);
+ if (new)
+ new->rpc_ops->free_client(new);
+ if (IS_ERR(clp))
+ return clp;
+ return nfs_found_client(cl_init, clp);
+ }
+ if (new) {
+ list_add_tail(&new->cl_share_link,
+ &nn->nfs_client_list);
+ spin_unlock(&nn->nfs_client_lock);
+ return rpc_ops->init_client(new, cl_init);
+ }
+
+ spin_unlock(&nn->nfs_client_lock);
+
+ new = rpc_ops->alloc_client(cl_init);
+ } while (!IS_ERR(new));
+
+ return new;
+}
+EXPORT_SYMBOL_GPL(nfs_get_client);
+
+/*
+ * Mark a server as ready or failed
+ */
+void nfs_mark_client_ready(struct nfs_client *clp, int state)
+{
+ smp_wmb();
+ clp->cl_cons_state = state;
+ wake_up_all(&nfs_client_active_wq);
+}
+EXPORT_SYMBOL_GPL(nfs_mark_client_ready);
+
+/*
+ * Initialise the timeout values for a connection
+ */
+void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
+ int timeo, int retrans)
+{
+ to->to_initval = timeo * HZ / 10;
+ to->to_retries = retrans;
+
+ switch (proto) {
+ case XPRT_TRANSPORT_TCP:
+ case XPRT_TRANSPORT_TCP_TLS:
+ case XPRT_TRANSPORT_RDMA:
+ if (retrans == NFS_UNSPEC_RETRANS)
+ to->to_retries = NFS_DEF_TCP_RETRANS;
+ if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0)
+ to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
+ if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
+ to->to_initval = NFS_MAX_TCP_TIMEOUT;
+ to->to_increment = to->to_initval;
+ to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
+ if (to->to_maxval > NFS_MAX_TCP_TIMEOUT)
+ to->to_maxval = NFS_MAX_TCP_TIMEOUT;
+ if (to->to_maxval < to->to_initval)
+ to->to_maxval = to->to_initval;
+ to->to_exponential = 0;
+ break;
+ case XPRT_TRANSPORT_UDP:
+ if (retrans == NFS_UNSPEC_RETRANS)
+ to->to_retries = NFS_DEF_UDP_RETRANS;
+ if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0)
+ to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10;
+ if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
+ to->to_initval = NFS_MAX_UDP_TIMEOUT;
+ to->to_maxval = NFS_MAX_UDP_TIMEOUT;
+ to->to_exponential = 1;
+ break;
+ default:
+ BUG();
+ }
+}
+EXPORT_SYMBOL_GPL(nfs_init_timeout_values);
+
+/*
+ * Create an RPC client handle
+ */
+int nfs_create_rpc_client(struct nfs_client *clp,
+ const struct nfs_client_initdata *cl_init,
+ rpc_authflavor_t flavor)
+{
+ struct rpc_clnt *clnt = NULL;
+ struct rpc_create_args args = {
+ .net = clp->cl_net,
+ .protocol = clp->cl_proto,
+ .nconnect = clp->cl_nconnect,
+ .address = (struct sockaddr *)&clp->cl_addr,
+ .addrsize = clp->cl_addrlen,
+ .timeout = cl_init->timeparms,
+ .servername = clp->cl_hostname,
+ .nodename = cl_init->nodename,
+ .program = &nfs_program,
+ .version = clp->rpc_ops->version,
+ .authflavor = flavor,
+ .cred = cl_init->cred,
+ .xprtsec = cl_init->xprtsec,
+ .connect_timeout = cl_init->connect_timeout,
+ .reconnect_timeout = cl_init->reconnect_timeout,
+ };
+
+ if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags))
+ args.flags |= RPC_CLNT_CREATE_DISCRTRY;
+ if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags))
+ args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT;
+ if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags))
+ args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+ if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags))
+ args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS;
+ if (test_bit(NFS_CS_NOPING, &clp->cl_flags))
+ args.flags |= RPC_CLNT_CREATE_NOPING;
+ if (test_bit(NFS_CS_REUSEPORT, &clp->cl_flags))
+ args.flags |= RPC_CLNT_CREATE_REUSEPORT;
+
+ if (!IS_ERR(clp->cl_rpcclient))
+ return 0;
+
+ clnt = rpc_create(&args);
+ if (IS_ERR(clnt)) {
+ dprintk("%s: cannot create RPC client. Error = %ld\n",
+ __func__, PTR_ERR(clnt));
+ return PTR_ERR(clnt);
+ }
+
+ clnt->cl_principal = clp->cl_principal;
+ clp->cl_rpcclient = clnt;
+ clnt->cl_max_connect = clp->cl_max_connect;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_create_rpc_client);
+
+/*
+ * Version 2 or 3 client destruction
+ */
+static void nfs_destroy_server(struct nfs_server *server)
+{
+ if (server->nlm_host)
+ nlmclnt_done(server->nlm_host);
+}
+
+/*
+ * Version 2 or 3 lockd setup
+ */
+static int nfs_start_lockd(struct nfs_server *server)
+{
+ struct nlm_host *host;
+ struct nfs_client *clp = server->nfs_client;
+ struct nlmclnt_initdata nlm_init = {
+ .hostname = clp->cl_hostname,
+ .address = (struct sockaddr *)&clp->cl_addr,
+ .addrlen = clp->cl_addrlen,
+ .nfs_version = clp->rpc_ops->version,
+ .noresvport = server->flags & NFS_MOUNT_NORESVPORT ?
+ 1 : 0,
+ .net = clp->cl_net,
+ .nlmclnt_ops = clp->cl_nfs_mod->rpc_ops->nlmclnt_ops,
+ .cred = server->cred,
+ };
+
+ if (nlm_init.nfs_version > 3)
+ return 0;
+ if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) &&
+ (server->flags & NFS_MOUNT_LOCAL_FCNTL))
+ return 0;
+
+ switch (clp->cl_proto) {
+ default:
+ nlm_init.protocol = IPPROTO_TCP;
+ break;
+#ifndef CONFIG_NFS_DISABLE_UDP_SUPPORT
+ case XPRT_TRANSPORT_UDP:
+ nlm_init.protocol = IPPROTO_UDP;
+#endif
+ }
+
+ host = nlmclnt_init(&nlm_init);
+ if (IS_ERR(host))
+ return PTR_ERR(host);
+
+ server->nlm_host = host;
+ server->destroy = nfs_destroy_server;
+ nfs_sysfs_link_rpc_client(server, nlmclnt_rpc_clnt(host), NULL);
+ return 0;
+}
+
+/*
+ * Create a general RPC client
+ */
+int nfs_init_server_rpcclient(struct nfs_server *server,
+ const struct rpc_timeout *timeo,
+ rpc_authflavor_t pseudoflavour)
+{
+ struct nfs_client *clp = server->nfs_client;
+
+ server->client = rpc_clone_client_set_auth(clp->cl_rpcclient,
+ pseudoflavour);
+ if (IS_ERR(server->client)) {
+ dprintk("%s: couldn't create rpc_client!\n", __func__);
+ return PTR_ERR(server->client);
+ }
+
+ memcpy(&server->client->cl_timeout_default,
+ timeo,
+ sizeof(server->client->cl_timeout_default));
+ server->client->cl_timeout = &server->client->cl_timeout_default;
+ server->client->cl_softrtry = 0;
+ if (server->flags & NFS_MOUNT_SOFTERR)
+ server->client->cl_softerr = 1;
+ if (server->flags & NFS_MOUNT_SOFT)
+ server->client->cl_softrtry = 1;
+
+ nfs_sysfs_link_rpc_client(server, server->client, NULL);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient);
+
+/**
+ * nfs_init_client - Initialise an NFS2 or NFS3 client
+ *
+ * @clp: nfs_client to initialise
+ * @cl_init: Initialisation parameters
+ *
+ * Returns pointer to an NFS client, or an ERR_PTR value.
+ */
+struct nfs_client *nfs_init_client(struct nfs_client *clp,
+ const struct nfs_client_initdata *cl_init)
+{
+ int error;
+
+ /* the client is already initialised */
+ if (clp->cl_cons_state == NFS_CS_READY)
+ return clp;
+
+ /*
+ * Create a client RPC handle for doing FSSTAT with UNIX auth only
+ * - RFC 2623, sec 2.3.2
+ */
+ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
+ nfs_mark_client_ready(clp, error == 0 ? NFS_CS_READY : error);
+ if (error < 0) {
+ nfs_put_client(clp);
+ clp = ERR_PTR(error);
+ }
+ return clp;
+}
+EXPORT_SYMBOL_GPL(nfs_init_client);
+
+/*
+ * Create a version 2 or 3 client
+ */
+static int nfs_init_server(struct nfs_server *server,
+ const struct fs_context *fc)
+{
+ const struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct rpc_timeout timeparms;
+ struct nfs_client_initdata cl_init = {
+ .hostname = ctx->nfs_server.hostname,
+ .addr = &ctx->nfs_server._address,
+ .addrlen = ctx->nfs_server.addrlen,
+ .nfs_mod = ctx->nfs_mod,
+ .proto = ctx->nfs_server.protocol,
+ .net = fc->net_ns,
+ .timeparms = &timeparms,
+ .cred = server->cred,
+ .nconnect = ctx->nfs_server.nconnect,
+ .init_flags = (1UL << NFS_CS_REUSEPORT),
+ .xprtsec = ctx->xprtsec,
+ };
+ struct nfs_client *clp;
+ int error;
+
+ nfs_init_timeout_values(&timeparms, ctx->nfs_server.protocol,
+ ctx->timeo, ctx->retrans);
+ if (ctx->flags & NFS_MOUNT_NORESVPORT)
+ set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+
+ /* Allocate or find a client reference we can use */
+ clp = nfs_get_client(&cl_init);
+ if (IS_ERR(clp))
+ return PTR_ERR(clp);
+
+ server->nfs_client = clp;
+ nfs_sysfs_add_server(server);
+ nfs_sysfs_link_rpc_client(server, clp->cl_rpcclient, "_state");
+
+ /* Initialise the client representation from the mount data */
+ server->flags = ctx->flags;
+ server->options = ctx->options;
+ server->caps |= NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS;
+
+ switch (clp->rpc_ops->version) {
+ case 2:
+ server->fattr_valid = NFS_ATTR_FATTR_V2;
+ break;
+ case 3:
+ server->fattr_valid = NFS_ATTR_FATTR_V3;
+ break;
+ default:
+ server->fattr_valid = NFS_ATTR_FATTR_V4;
+ }
+
+ if (ctx->rsize)
+ server->rsize = nfs_io_size(ctx->rsize, clp->cl_proto);
+ if (ctx->wsize)
+ server->wsize = nfs_io_size(ctx->wsize, clp->cl_proto);
+
+ server->acregmin = ctx->acregmin * HZ;
+ server->acregmax = ctx->acregmax * HZ;
+ server->acdirmin = ctx->acdirmin * HZ;
+ server->acdirmax = ctx->acdirmax * HZ;
+
+ /* Start lockd here, before we might error out */
+ error = nfs_start_lockd(server);
+ if (error < 0)
+ goto error;
+
+ server->port = ctx->nfs_server.port;
+ server->auth_info = ctx->auth_info;
+
+ error = nfs_init_server_rpcclient(server, &timeparms,
+ ctx->selected_flavor);
+ if (error < 0)
+ goto error;
+
+ /* Preserve the values of mount_server-related mount options */
+ if (ctx->mount_server.addrlen) {
+ memcpy(&server->mountd_address, &ctx->mount_server.address,
+ ctx->mount_server.addrlen);
+ server->mountd_addrlen = ctx->mount_server.addrlen;
+ }
+ server->mountd_version = ctx->mount_server.version;
+ server->mountd_port = ctx->mount_server.port;
+ server->mountd_protocol = ctx->mount_server.protocol;
+
+ server->namelen = ctx->namlen;
+ return 0;
+
+error:
+ server->nfs_client = NULL;
+ nfs_put_client(clp);
+ return error;
+}
+
+/*
+ * Load up the server record from information gained in an fsinfo record
+ */
+static void nfs_server_set_fsinfo(struct nfs_server *server,
+ struct nfs_fsinfo *fsinfo)
+{
+ struct nfs_client *clp = server->nfs_client;
+ unsigned long max_rpc_payload, raw_max_rpc_payload;
+
+ /* Work out a lot of parameters */
+ if (server->rsize == 0)
+ server->rsize = nfs_io_size(fsinfo->rtpref, clp->cl_proto);
+ if (server->wsize == 0)
+ server->wsize = nfs_io_size(fsinfo->wtpref, clp->cl_proto);
+
+ if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax)
+ server->rsize = nfs_io_size(fsinfo->rtmax, clp->cl_proto);
+ if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax)
+ server->wsize = nfs_io_size(fsinfo->wtmax, clp->cl_proto);
+
+ raw_max_rpc_payload = rpc_max_payload(server->client);
+ max_rpc_payload = nfs_block_size(raw_max_rpc_payload, NULL);
+
+ if (server->rsize > max_rpc_payload)
+ server->rsize = max_rpc_payload;
+ if (server->rsize > NFS_MAX_FILE_IO_SIZE)
+ server->rsize = NFS_MAX_FILE_IO_SIZE;
+ server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ if (server->wsize > max_rpc_payload)
+ server->wsize = max_rpc_payload;
+ if (server->wsize > NFS_MAX_FILE_IO_SIZE)
+ server->wsize = NFS_MAX_FILE_IO_SIZE;
+ server->wpages = (server->wsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
+
+ server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
+ if (server->dtsize > NFS_MAX_FILE_IO_SIZE)
+ server->dtsize = NFS_MAX_FILE_IO_SIZE;
+ if (server->dtsize > server->rsize)
+ server->dtsize = server->rsize;
+
+ if (server->flags & NFS_MOUNT_NOAC) {
+ server->acregmin = server->acregmax = 0;
+ server->acdirmin = server->acdirmax = 0;
+ }
+
+ server->maxfilesize = fsinfo->maxfilesize;
+
+ server->time_delta = fsinfo->time_delta;
+ server->change_attr_type = fsinfo->change_attr_type;
+
+ server->clone_blksize = fsinfo->clone_blksize;
+ /* We're airborne Set socket buffersize */
+ rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
+
+#ifdef CONFIG_NFS_V4_2
+ /*
+ * Defaults until limited by the session parameters.
+ */
+ server->gxasize = min_t(unsigned int, raw_max_rpc_payload,
+ XATTR_SIZE_MAX);
+ server->sxasize = min_t(unsigned int, raw_max_rpc_payload,
+ XATTR_SIZE_MAX);
+ server->lxasize = min_t(unsigned int, raw_max_rpc_payload,
+ nfs42_listxattr_xdrsize(XATTR_LIST_MAX));
+
+ if (fsinfo->xattr_support)
+ server->caps |= NFS_CAP_XATTR;
+#endif
+}
+
+/*
+ * Probe filesystem information, including the FSID on v2/v3
+ */
+static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr)
+{
+ struct nfs_fsinfo fsinfo;
+ struct nfs_client *clp = server->nfs_client;
+ int error;
+
+ if (clp->rpc_ops->set_capabilities != NULL) {
+ error = clp->rpc_ops->set_capabilities(server, mntfh);
+ if (error < 0)
+ return error;
+ }
+
+ fsinfo.fattr = fattr;
+ fsinfo.nlayouttypes = 0;
+ memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype));
+ error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
+ if (error < 0)
+ return error;
+
+ nfs_server_set_fsinfo(server, &fsinfo);
+
+ /* Get some general file system info */
+ if (server->namelen == 0) {
+ struct nfs_pathconf pathinfo;
+
+ pathinfo.fattr = fattr;
+ nfs_fattr_init(fattr);
+
+ if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0)
+ server->namelen = pathinfo.max_namelen;
+ }
+
+ if (clp->rpc_ops->discover_trunking != NULL &&
+ (server->caps & NFS_CAP_FS_LOCATIONS &&
+ (server->flags & NFS_MOUNT_TRUNK_DISCOVERY))) {
+ error = clp->rpc_ops->discover_trunking(server, mntfh);
+ if (error < 0)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Grab the destination's particulars, including lease expiry time.
+ *
+ * Returns zero if probe succeeded and retrieved FSID matches the FSID
+ * we have cached.
+ */
+int nfs_probe_server(struct nfs_server *server, struct nfs_fh *mntfh)
+{
+ struct nfs_fattr *fattr;
+ int error;
+
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ return -ENOMEM;
+
+ /* Sanity: the probe won't work if the destination server
+ * does not recognize the migrated FH. */
+ error = nfs_probe_fsinfo(server, mntfh, fattr);
+
+ nfs_free_fattr(fattr);
+ return error;
+}
+EXPORT_SYMBOL_GPL(nfs_probe_server);
+
+/*
+ * Copy useful information when duplicating a server record
+ */
+void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source)
+{
+ target->flags = source->flags;
+ target->rsize = source->rsize;
+ target->wsize = source->wsize;
+ target->acregmin = source->acregmin;
+ target->acregmax = source->acregmax;
+ target->acdirmin = source->acdirmin;
+ target->acdirmax = source->acdirmax;
+ target->caps = source->caps;
+ target->options = source->options;
+ target->auth_info = source->auth_info;
+ target->port = source->port;
+}
+EXPORT_SYMBOL_GPL(nfs_server_copy_userdata);
+
+void nfs_server_insert_lists(struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+
+ spin_lock(&nn->nfs_client_lock);
+ list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
+ list_add_tail(&server->master_link, &nn->nfs_volume_list);
+ clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
+ spin_unlock(&nn->nfs_client_lock);
+
+}
+EXPORT_SYMBOL_GPL(nfs_server_insert_lists);
+
+void nfs_server_remove_lists(struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs_net *nn;
+
+ if (clp == NULL)
+ return;
+ nn = net_generic(clp->cl_net, nfs_net_id);
+ spin_lock(&nn->nfs_client_lock);
+ list_del_rcu(&server->client_link);
+ if (list_empty(&clp->cl_superblocks))
+ set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
+ list_del(&server->master_link);
+ spin_unlock(&nn->nfs_client_lock);
+
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(nfs_server_remove_lists);
+
+static DEFINE_IDA(s_sysfs_ids);
+
+/*
+ * Allocate and initialise a server record
+ */
+struct nfs_server *nfs_alloc_server(void)
+{
+ struct nfs_server *server;
+
+ server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
+ if (!server)
+ return NULL;
+
+ server->s_sysfs_id = ida_alloc(&s_sysfs_ids, GFP_KERNEL);
+ if (server->s_sysfs_id < 0) {
+ kfree(server);
+ return NULL;
+ }
+
+ server->client = server->client_acl = ERR_PTR(-EINVAL);
+
+ /* Zero out the NFS state stuff */
+ INIT_LIST_HEAD(&server->client_link);
+ INIT_LIST_HEAD(&server->master_link);
+ INIT_LIST_HEAD(&server->delegations);
+ INIT_LIST_HEAD(&server->layouts);
+ INIT_LIST_HEAD(&server->state_owners_lru);
+ INIT_LIST_HEAD(&server->ss_copies);
+
+ atomic_set(&server->active, 0);
+
+ server->io_stats = nfs_alloc_iostats();
+ if (!server->io_stats) {
+ kfree(server);
+ return NULL;
+ }
+
+ server->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+
+ ida_init(&server->openowner_id);
+ ida_init(&server->lockowner_id);
+ pnfs_init_server(server);
+ rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC");
+
+ return server;
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_server);
+
+/*
+ * Free up a server record
+ */
+void nfs_free_server(struct nfs_server *server)
+{
+ nfs_server_remove_lists(server);
+
+ if (server->destroy != NULL)
+ server->destroy(server);
+
+ if (!IS_ERR(server->client_acl))
+ rpc_shutdown_client(server->client_acl);
+ if (!IS_ERR(server->client))
+ rpc_shutdown_client(server->client);
+
+ nfs_put_client(server->nfs_client);
+
+ if (server->kobj.state_initialized) {
+ nfs_sysfs_remove_server(server);
+ kobject_put(&server->kobj);
+ }
+ ida_free(&s_sysfs_ids, server->s_sysfs_id);
+
+ ida_destroy(&server->lockowner_id);
+ ida_destroy(&server->openowner_id);
+ nfs_free_iostats(server->io_stats);
+ put_cred(server->cred);
+ kfree(server);
+ nfs_release_automount_timer();
+}
+EXPORT_SYMBOL_GPL(nfs_free_server);
+
+/*
+ * Create a version 2 or 3 volume record
+ * - keyed on server and FSID
+ */
+struct nfs_server *nfs_create_server(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct nfs_server *server;
+ struct nfs_fattr *fattr;
+ int error;
+
+ server = nfs_alloc_server();
+ if (!server)
+ return ERR_PTR(-ENOMEM);
+
+ server->cred = get_cred(fc->cred);
+
+ error = -ENOMEM;
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ goto error;
+
+ /* Get a client representation */
+ error = nfs_init_server(server, fc);
+ if (error < 0)
+ goto error;
+
+ /* Probe the root fh to retrieve its FSID */
+ error = nfs_probe_fsinfo(server, ctx->mntfh, fattr);
+ if (error < 0)
+ goto error;
+ if (server->nfs_client->rpc_ops->version == 3) {
+ if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
+ server->namelen = NFS3_MAXNAMLEN;
+ if (!(ctx->flags & NFS_MOUNT_NORDIRPLUS))
+ server->caps |= NFS_CAP_READDIRPLUS;
+ } else {
+ if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
+ server->namelen = NFS2_MAXNAMLEN;
+ }
+
+ if (!(fattr->valid & NFS_ATTR_FATTR)) {
+ error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh,
+ fattr, NULL);
+ if (error < 0) {
+ dprintk("nfs_create_server: getattr error = %d\n", -error);
+ goto error;
+ }
+ }
+ memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
+
+ dprintk("Server FSID: %llx:%llx\n",
+ (unsigned long long) server->fsid.major,
+ (unsigned long long) server->fsid.minor);
+
+ nfs_server_insert_lists(server);
+ server->mount_time = jiffies;
+ nfs_free_fattr(fattr);
+ return server;
+
+error:
+ nfs_free_fattr(fattr);
+ nfs_free_server(server);
+ return ERR_PTR(error);
+}
+EXPORT_SYMBOL_GPL(nfs_create_server);
+
+/*
+ * Clone an NFS2, NFS3 or NFS4 server record
+ */
+struct nfs_server *nfs_clone_server(struct nfs_server *source,
+ struct nfs_fh *fh,
+ struct nfs_fattr *fattr,
+ rpc_authflavor_t flavor)
+{
+ struct nfs_server *server;
+ int error;
+
+ server = nfs_alloc_server();
+ if (!server)
+ return ERR_PTR(-ENOMEM);
+
+ server->cred = get_cred(source->cred);
+
+ /* Copy data from the source */
+ server->nfs_client = source->nfs_client;
+ server->destroy = source->destroy;
+ refcount_inc(&server->nfs_client->cl_count);
+ nfs_server_copy_userdata(server, source);
+
+ server->fsid = fattr->fsid;
+
+ nfs_sysfs_add_server(server);
+
+ nfs_sysfs_link_rpc_client(server,
+ server->nfs_client->cl_rpcclient, "_state");
+
+ error = nfs_init_server_rpcclient(server,
+ source->client->cl_timeout,
+ flavor);
+ if (error < 0)
+ goto out_free_server;
+
+ /* probe the filesystem info for this server filesystem */
+ error = nfs_probe_server(server, fh);
+ if (error < 0)
+ goto out_free_server;
+
+ if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
+ server->namelen = NFS4_MAXNAMLEN;
+
+ error = nfs_start_lockd(server);
+ if (error < 0)
+ goto out_free_server;
+
+ nfs_server_insert_lists(server);
+ server->mount_time = jiffies;
+
+ return server;
+
+out_free_server:
+ nfs_free_server(server);
+ return ERR_PTR(error);
+}
+EXPORT_SYMBOL_GPL(nfs_clone_server);
+
+void nfs_clients_init(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ INIT_LIST_HEAD(&nn->nfs_client_list);
+ INIT_LIST_HEAD(&nn->nfs_volume_list);
+#if IS_ENABLED(CONFIG_NFS_V4)
+ idr_init(&nn->cb_ident_idr);
+#endif
+ spin_lock_init(&nn->nfs_client_lock);
+ nn->boot_time = ktime_get_real();
+
+ nfs_netns_sysfs_setup(nn, net);
+}
+
+void nfs_clients_exit(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ nfs_netns_sysfs_destroy(nn);
+ nfs_cleanup_cb_ident_idr(net);
+ WARN_ON_ONCE(!list_empty(&nn->nfs_client_list));
+ WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list));
+}
+
+#ifdef CONFIG_PROC_FS
+static void *nfs_server_list_start(struct seq_file *p, loff_t *pos);
+static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
+static void nfs_server_list_stop(struct seq_file *p, void *v);
+static int nfs_server_list_show(struct seq_file *m, void *v);
+
+static const struct seq_operations nfs_server_list_ops = {
+ .start = nfs_server_list_start,
+ .next = nfs_server_list_next,
+ .stop = nfs_server_list_stop,
+ .show = nfs_server_list_show,
+};
+
+static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos);
+static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos);
+static void nfs_volume_list_stop(struct seq_file *p, void *v);
+static int nfs_volume_list_show(struct seq_file *m, void *v);
+
+static const struct seq_operations nfs_volume_list_ops = {
+ .start = nfs_volume_list_start,
+ .next = nfs_volume_list_next,
+ .stop = nfs_volume_list_stop,
+ .show = nfs_volume_list_show,
+};
+
+/*
+ * set up the iterator to start reading from the server list and return the first item
+ */
+static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
+ __acquires(&nn->nfs_client_lock)
+{
+ struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
+
+ /* lock the list against modification */
+ spin_lock(&nn->nfs_client_lock);
+ return seq_list_start_head(&nn->nfs_client_list, *_pos);
+}
+
+/*
+ * move to next server
+ */
+static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
+{
+ struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
+
+ return seq_list_next(v, &nn->nfs_client_list, pos);
+}
+
+/*
+ * clean up after reading from the transports list
+ */
+static void nfs_server_list_stop(struct seq_file *p, void *v)
+ __releases(&nn->nfs_client_lock)
+{
+ struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
+
+ spin_unlock(&nn->nfs_client_lock);
+}
+
+/*
+ * display a header line followed by a load of call lines
+ */
+static int nfs_server_list_show(struct seq_file *m, void *v)
+{
+ struct nfs_client *clp;
+ struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
+
+ /* display header on line 1 */
+ if (v == &nn->nfs_client_list) {
+ seq_puts(m, "NV SERVER PORT USE HOSTNAME\n");
+ return 0;
+ }
+
+ /* display one transport per line on subsequent lines */
+ clp = list_entry(v, struct nfs_client, cl_share_link);
+
+ /* Check if the client is initialized */
+ if (clp->cl_cons_state != NFS_CS_READY)
+ return 0;
+
+ rcu_read_lock();
+ seq_printf(m, "v%u %s %s %3d %s\n",
+ clp->rpc_ops->version,
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
+ refcount_read(&clp->cl_count),
+ clp->cl_hostname);
+ rcu_read_unlock();
+
+ return 0;
+}
+
+/*
+ * set up the iterator to start reading from the volume list and return the first item
+ */
+static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
+ __acquires(&nn->nfs_client_lock)
+{
+ struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
+
+ /* lock the list against modification */
+ spin_lock(&nn->nfs_client_lock);
+ return seq_list_start_head(&nn->nfs_volume_list, *_pos);
+}
+
+/*
+ * move to next volume
+ */
+static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
+{
+ struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
+
+ return seq_list_next(v, &nn->nfs_volume_list, pos);
+}
+
+/*
+ * clean up after reading from the transports list
+ */
+static void nfs_volume_list_stop(struct seq_file *p, void *v)
+ __releases(&nn->nfs_client_lock)
+{
+ struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
+
+ spin_unlock(&nn->nfs_client_lock);
+}
+
+/*
+ * display a header line followed by a load of call lines
+ */
+static int nfs_volume_list_show(struct seq_file *m, void *v)
+{
+ struct nfs_server *server;
+ struct nfs_client *clp;
+ char dev[13]; // 8 for 2^24, 1 for ':', 3 for 2^8, 1 for '\0'
+ char fsid[34]; // 2 * 16 for %llx, 1 for ':', 1 for '\0'
+ struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
+
+ /* display header on line 1 */
+ if (v == &nn->nfs_volume_list) {
+ seq_puts(m, "NV SERVER PORT DEV FSID"
+ " FSC\n");
+ return 0;
+ }
+ /* display one transport per line on subsequent lines */
+ server = list_entry(v, struct nfs_server, master_link);
+ clp = server->nfs_client;
+
+ snprintf(dev, sizeof(dev), "%u:%u",
+ MAJOR(server->s_dev), MINOR(server->s_dev));
+
+ snprintf(fsid, sizeof(fsid), "%llx:%llx",
+ (unsigned long long) server->fsid.major,
+ (unsigned long long) server->fsid.minor);
+
+ rcu_read_lock();
+ seq_printf(m, "v%u %s %s %-12s %-33s %s\n",
+ clp->rpc_ops->version,
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
+ dev,
+ fsid,
+ nfs_server_fscache_state(server));
+ rcu_read_unlock();
+
+ return 0;
+}
+
+int nfs_fs_proc_net_init(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct proc_dir_entry *p;
+
+ nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net);
+ if (!nn->proc_nfsfs)
+ goto error_0;
+
+ /* a file of servers with which we're dealing */
+ p = proc_create_net("servers", S_IFREG|S_IRUGO, nn->proc_nfsfs,
+ &nfs_server_list_ops, sizeof(struct seq_net_private));
+ if (!p)
+ goto error_1;
+
+ /* a file of volumes that we have mounted */
+ p = proc_create_net("volumes", S_IFREG|S_IRUGO, nn->proc_nfsfs,
+ &nfs_volume_list_ops, sizeof(struct seq_net_private));
+ if (!p)
+ goto error_1;
+ return 0;
+
+error_1:
+ remove_proc_subtree("nfsfs", net->proc_net);
+error_0:
+ return -ENOMEM;
+}
+
+void nfs_fs_proc_net_exit(struct net *net)
+{
+ remove_proc_subtree("nfsfs", net->proc_net);
+}
+
+/*
+ * initialise the /proc/fs/nfsfs/ directory
+ */
+int __init nfs_fs_proc_init(void)
+{
+ if (!proc_mkdir("fs/nfsfs", NULL))
+ goto error_0;
+
+ /* a file of servers with which we're dealing */
+ if (!proc_symlink("fs/nfsfs/servers", NULL, "../../net/nfsfs/servers"))
+ goto error_1;
+
+ /* a file of volumes that we have mounted */
+ if (!proc_symlink("fs/nfsfs/volumes", NULL, "../../net/nfsfs/volumes"))
+ goto error_1;
+
+ return 0;
+error_1:
+ remove_proc_subtree("fs/nfsfs", NULL);
+error_0:
+ return -ENOMEM;
+}
+
+/*
+ * clean up the /proc/fs/nfsfs/ directory
+ */
+void nfs_fs_proc_exit(void)
+{
+ remove_proc_subtree("fs/nfsfs", NULL);
+ ida_destroy(&s_sysfs_ids);
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
new file mode 100644
index 0000000000..cf73655810
--- /dev/null
+++ b/fs/nfs/delegation.c
@@ -0,0 +1,1479 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/delegation.c
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFS file delegation management
+ *
+ */
+#include <linux/completion.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/iversion.h>
+
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_xdr.h>
+
+#include "nfs4_fs.h"
+#include "nfs4session.h"
+#include "delegation.h"
+#include "internal.h"
+#include "nfs4trace.h"
+
+#define NFS_DEFAULT_DELEGATION_WATERMARK (5000U)
+
+static atomic_long_t nfs_active_delegations;
+static unsigned nfs_delegation_watermark = NFS_DEFAULT_DELEGATION_WATERMARK;
+
+static void __nfs_free_delegation(struct nfs_delegation *delegation)
+{
+ put_cred(delegation->cred);
+ delegation->cred = NULL;
+ kfree_rcu(delegation, rcu);
+}
+
+static void nfs_mark_delegation_revoked(struct nfs_delegation *delegation)
+{
+ if (!test_and_set_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+ delegation->stateid.type = NFS4_INVALID_STATEID_TYPE;
+ atomic_long_dec(&nfs_active_delegations);
+ if (!test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+ nfs_clear_verifier_delegated(delegation->inode);
+ }
+}
+
+static struct nfs_delegation *nfs_get_delegation(struct nfs_delegation *delegation)
+{
+ refcount_inc(&delegation->refcount);
+ return delegation;
+}
+
+static void nfs_put_delegation(struct nfs_delegation *delegation)
+{
+ if (refcount_dec_and_test(&delegation->refcount))
+ __nfs_free_delegation(delegation);
+}
+
+static void nfs_free_delegation(struct nfs_delegation *delegation)
+{
+ nfs_mark_delegation_revoked(delegation);
+ nfs_put_delegation(delegation);
+}
+
+/**
+ * nfs_mark_delegation_referenced - set delegation's REFERENCED flag
+ * @delegation: delegation to process
+ *
+ */
+void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
+{
+ set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
+}
+
+static void nfs_mark_return_delegation(struct nfs_server *server,
+ struct nfs_delegation *delegation)
+{
+ set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+ set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+}
+
+static bool
+nfs4_is_valid_delegation(const struct nfs_delegation *delegation,
+ fmode_t flags)
+{
+ if (delegation != NULL && (delegation->type & flags) == flags &&
+ !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) &&
+ !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+ return true;
+ return false;
+}
+
+struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (nfs4_is_valid_delegation(delegation, 0))
+ return delegation;
+ return NULL;
+}
+
+static int
+nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
+{
+ struct nfs_delegation *delegation;
+ int ret = 0;
+
+ flags &= FMODE_READ|FMODE_WRITE;
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (nfs4_is_valid_delegation(delegation, flags)) {
+ if (mark)
+ nfs_mark_delegation_referenced(delegation);
+ ret = 1;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+/**
+ * nfs4_have_delegation - check if inode has a delegation, mark it
+ * NFS_DELEGATION_REFERENCED if there is one.
+ * @inode: inode to check
+ * @flags: delegation types to check for
+ *
+ * Returns one if inode has the indicated delegation, otherwise zero.
+ */
+int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+{
+ return nfs4_do_check_delegation(inode, flags, true);
+}
+
+/*
+ * nfs4_check_delegation - check if inode has a delegation, do not mark
+ * NFS_DELEGATION_REFERENCED if it has one.
+ */
+int nfs4_check_delegation(struct inode *inode, fmode_t flags)
+{
+ return nfs4_do_check_delegation(inode, flags, false);
+}
+
+static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_stateid *stateid)
+{
+ struct inode *inode = state->inode;
+ struct file_lock *fl;
+ struct file_lock_context *flctx = locks_inode_context(inode);
+ struct list_head *list;
+ int status = 0;
+
+ if (flctx == NULL)
+ goto out;
+
+ list = &flctx->flc_posix;
+ spin_lock(&flctx->flc_lock);
+restart:
+ list_for_each_entry(fl, list, fl_list) {
+ if (nfs_file_open_context(fl->fl_file)->state != state)
+ continue;
+ spin_unlock(&flctx->flc_lock);
+ status = nfs4_lock_delegation_recall(fl, state, stateid);
+ if (status < 0)
+ goto out;
+ spin_lock(&flctx->flc_lock);
+ }
+ if (list == &flctx->flc_posix) {
+ list = &flctx->flc_flock;
+ goto restart;
+ }
+ spin_unlock(&flctx->flc_lock);
+out:
+ return status;
+}
+
+static int nfs_delegation_claim_opens(struct inode *inode,
+ const nfs4_stateid *stateid, fmode_t type)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_open_context *ctx;
+ struct nfs4_state_owner *sp;
+ struct nfs4_state *state;
+ unsigned int seq;
+ int err;
+
+again:
+ rcu_read_lock();
+ list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
+ state = ctx->state;
+ if (state == NULL)
+ continue;
+ if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
+ continue;
+ if (!nfs4_valid_open_stateid(state))
+ continue;
+ if (!nfs4_stateid_match(&state->stateid, stateid))
+ continue;
+ if (!get_nfs_open_context(ctx))
+ continue;
+ rcu_read_unlock();
+ sp = state->owner;
+ /* Block nfs4_proc_unlck */
+ mutex_lock(&sp->so_delegreturn_mutex);
+ seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+ err = nfs4_open_delegation_recall(ctx, state, stateid);
+ if (!err)
+ err = nfs_delegation_claim_locks(state, stateid);
+ if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+ err = -EAGAIN;
+ mutex_unlock(&sp->so_delegreturn_mutex);
+ put_nfs_open_context(ctx);
+ if (err != 0)
+ return err;
+ goto again;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+/**
+ * nfs_inode_reclaim_delegation - process a delegation reclaim request
+ * @inode: inode to process
+ * @cred: credential to use for request
+ * @type: delegation type
+ * @stateid: delegation stateid
+ * @pagemod_limit: write delegation "space_limit"
+ *
+ */
+void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
+ fmode_t type, const nfs4_stateid *stateid,
+ unsigned long pagemod_limit)
+{
+ struct nfs_delegation *delegation;
+ const struct cred *oldcred = NULL;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation != NULL) {
+ spin_lock(&delegation->lock);
+ nfs4_stateid_copy(&delegation->stateid, stateid);
+ delegation->type = type;
+ delegation->pagemod_limit = pagemod_limit;
+ oldcred = delegation->cred;
+ delegation->cred = get_cred(cred);
+ clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+ if (test_and_clear_bit(NFS_DELEGATION_REVOKED,
+ &delegation->flags))
+ atomic_long_inc(&nfs_active_delegations);
+ spin_unlock(&delegation->lock);
+ rcu_read_unlock();
+ put_cred(oldcred);
+ trace_nfs4_reclaim_delegation(inode, type);
+ } else {
+ rcu_read_unlock();
+ nfs_inode_set_delegation(inode, cred, type, stateid,
+ pagemod_limit);
+ }
+}
+
+static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
+{
+ const struct cred *cred;
+ int res = 0;
+
+ if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+ spin_lock(&delegation->lock);
+ cred = get_cred(delegation->cred);
+ spin_unlock(&delegation->lock);
+ res = nfs4_proc_delegreturn(inode, cred,
+ &delegation->stateid,
+ issync);
+ put_cred(cred);
+ }
+ return res;
+}
+
+static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation)
+{
+ struct inode *inode = NULL;
+
+ spin_lock(&delegation->lock);
+ if (delegation->inode != NULL)
+ inode = igrab(delegation->inode);
+ if (!inode)
+ set_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags);
+ spin_unlock(&delegation->lock);
+ return inode;
+}
+
+static struct nfs_delegation *
+nfs_start_delegation_return_locked(struct nfs_inode *nfsi)
+{
+ struct nfs_delegation *ret = NULL;
+ struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
+
+ if (delegation == NULL)
+ goto out;
+ spin_lock(&delegation->lock);
+ if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+ clear_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags);
+ /* Refcount matched in nfs_end_delegation_return() */
+ ret = nfs_get_delegation(delegation);
+ }
+ spin_unlock(&delegation->lock);
+ if (ret)
+ nfs_clear_verifier_delegated(&nfsi->vfs_inode);
+out:
+ return ret;
+}
+
+static struct nfs_delegation *
+nfs_start_delegation_return(struct nfs_inode *nfsi)
+{
+ struct nfs_delegation *delegation;
+
+ rcu_read_lock();
+ delegation = nfs_start_delegation_return_locked(nfsi);
+ rcu_read_unlock();
+ return delegation;
+}
+
+static void nfs_abort_delegation_return(struct nfs_delegation *delegation,
+ struct nfs_client *clp, int err)
+{
+
+ spin_lock(&delegation->lock);
+ clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
+ if (err == -EAGAIN) {
+ set_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags);
+ set_bit(NFS4CLNT_DELEGRETURN_DELAYED, &clp->cl_state);
+ }
+ spin_unlock(&delegation->lock);
+}
+
+static struct nfs_delegation *
+nfs_detach_delegation_locked(struct nfs_inode *nfsi,
+ struct nfs_delegation *delegation,
+ struct nfs_client *clp)
+{
+ struct nfs_delegation *deleg_cur =
+ rcu_dereference_protected(nfsi->delegation,
+ lockdep_is_held(&clp->cl_lock));
+
+ if (deleg_cur == NULL || delegation != deleg_cur)
+ return NULL;
+
+ spin_lock(&delegation->lock);
+ if (!delegation->inode) {
+ spin_unlock(&delegation->lock);
+ return NULL;
+ }
+ list_del_rcu(&delegation->super_list);
+ delegation->inode = NULL;
+ rcu_assign_pointer(nfsi->delegation, NULL);
+ spin_unlock(&delegation->lock);
+ return delegation;
+}
+
+static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi,
+ struct nfs_delegation *delegation,
+ struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+
+ spin_lock(&clp->cl_lock);
+ delegation = nfs_detach_delegation_locked(nfsi, delegation, clp);
+ spin_unlock(&clp->cl_lock);
+ return delegation;
+}
+
+static struct nfs_delegation *
+nfs_inode_detach_delegation(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_delegation *delegation;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(nfsi->delegation);
+ if (delegation != NULL)
+ delegation = nfs_detach_delegation(nfsi, delegation, server);
+ rcu_read_unlock();
+ return delegation;
+}
+
+static void
+nfs_update_delegation_cred(struct nfs_delegation *delegation,
+ const struct cred *cred)
+{
+ const struct cred *old;
+
+ if (cred_fscmp(delegation->cred, cred) != 0) {
+ old = xchg(&delegation->cred, get_cred(cred));
+ put_cred(old);
+ }
+}
+
+static void
+nfs_update_inplace_delegation(struct nfs_delegation *delegation,
+ const struct nfs_delegation *update)
+{
+ if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) {
+ delegation->stateid.seqid = update->stateid.seqid;
+ smp_wmb();
+ delegation->type = update->type;
+ delegation->pagemod_limit = update->pagemod_limit;
+ if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+ delegation->change_attr = update->change_attr;
+ nfs_update_delegation_cred(delegation, update->cred);
+ /* smp_mb__before_atomic() is implicit due to xchg() */
+ clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
+ atomic_long_inc(&nfs_active_delegations);
+ }
+ }
+}
+
+/**
+ * nfs_inode_set_delegation - set up a delegation on an inode
+ * @inode: inode to which delegation applies
+ * @cred: cred to use for subsequent delegation processing
+ * @type: delegation type
+ * @stateid: delegation stateid
+ * @pagemod_limit: write delegation "space_limit"
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
+ fmode_t type,
+ const nfs4_stateid *stateid,
+ unsigned long pagemod_limit)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_delegation *delegation, *old_delegation;
+ struct nfs_delegation *freeme = NULL;
+ int status = 0;
+
+ delegation = kmalloc(sizeof(*delegation), GFP_KERNEL_ACCOUNT);
+ if (delegation == NULL)
+ return -ENOMEM;
+ nfs4_stateid_copy(&delegation->stateid, stateid);
+ refcount_set(&delegation->refcount, 1);
+ delegation->type = type;
+ delegation->pagemod_limit = pagemod_limit;
+ delegation->change_attr = inode_peek_iversion_raw(inode);
+ delegation->cred = get_cred(cred);
+ delegation->inode = inode;
+ delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
+ spin_lock_init(&delegation->lock);
+
+ spin_lock(&clp->cl_lock);
+ old_delegation = rcu_dereference_protected(nfsi->delegation,
+ lockdep_is_held(&clp->cl_lock));
+ if (old_delegation == NULL)
+ goto add_new;
+ /* Is this an update of the existing delegation? */
+ if (nfs4_stateid_match_other(&old_delegation->stateid,
+ &delegation->stateid)) {
+ spin_lock(&old_delegation->lock);
+ nfs_update_inplace_delegation(old_delegation,
+ delegation);
+ spin_unlock(&old_delegation->lock);
+ goto out;
+ }
+ if (!test_bit(NFS_DELEGATION_REVOKED, &old_delegation->flags)) {
+ /*
+ * Deal with broken servers that hand out two
+ * delegations for the same file.
+ * Allow for upgrades to a WRITE delegation, but
+ * nothing else.
+ */
+ dfprintk(FILE, "%s: server %s handed out "
+ "a duplicate delegation!\n",
+ __func__, clp->cl_hostname);
+ if (delegation->type == old_delegation->type ||
+ !(delegation->type & FMODE_WRITE)) {
+ freeme = delegation;
+ delegation = NULL;
+ goto out;
+ }
+ if (test_and_set_bit(NFS_DELEGATION_RETURNING,
+ &old_delegation->flags))
+ goto out;
+ }
+ freeme = nfs_detach_delegation_locked(nfsi, old_delegation, clp);
+ if (freeme == NULL)
+ goto out;
+add_new:
+ /*
+ * If we didn't revalidate the change attribute before setting
+ * the delegation, then pre-emptively ask for a full attribute
+ * cache revalidation.
+ */
+ spin_lock(&inode->i_lock);
+ if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_CHANGE)
+ nfs_set_cache_invalid(inode,
+ NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME |
+ NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE |
+ NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK |
+ NFS_INO_INVALID_OTHER | NFS_INO_INVALID_DATA |
+ NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL |
+ NFS_INO_INVALID_XATTR);
+ spin_unlock(&inode->i_lock);
+
+ list_add_tail_rcu(&delegation->super_list, &server->delegations);
+ rcu_assign_pointer(nfsi->delegation, delegation);
+ delegation = NULL;
+
+ atomic_long_inc(&nfs_active_delegations);
+
+ trace_nfs4_set_delegation(inode, type);
+out:
+ spin_unlock(&clp->cl_lock);
+ if (delegation != NULL)
+ __nfs_free_delegation(delegation);
+ if (freeme != NULL) {
+ nfs_do_return_delegation(inode, freeme, 0);
+ nfs_free_delegation(freeme);
+ }
+ return status;
+}
+
+/*
+ * Basic procedure for returning a delegation to the server
+ */
+static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync)
+{
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+ unsigned int mode = O_WRONLY | O_RDWR;
+ int err = 0;
+
+ if (delegation == NULL)
+ return 0;
+
+ if (!issync)
+ mode |= O_NONBLOCK;
+ /* Recall of any remaining application leases */
+ err = break_lease(inode, mode);
+
+ while (err == 0) {
+ if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ break;
+ err = nfs_delegation_claim_opens(inode, &delegation->stateid,
+ delegation->type);
+ if (!issync || err != -EAGAIN)
+ break;
+ /*
+ * Guard against state recovery
+ */
+ err = nfs4_wait_clnt_recover(clp);
+ }
+
+ if (err) {
+ nfs_abort_delegation_return(delegation, clp, err);
+ goto out;
+ }
+
+ err = nfs_do_return_delegation(inode, delegation, issync);
+out:
+ /* Refcount matched in nfs_start_delegation_return_locked() */
+ nfs_put_delegation(delegation);
+ return err;
+}
+
+static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
+{
+ bool ret = false;
+
+ if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
+ ret = true;
+ else if (test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags)) {
+ struct inode *inode;
+
+ spin_lock(&delegation->lock);
+ inode = delegation->inode;
+ if (inode && list_empty(&NFS_I(inode)->open_files))
+ ret = true;
+ spin_unlock(&delegation->lock);
+ }
+ if (ret)
+ clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
+ if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags) ||
+ test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ ret = false;
+
+ return ret;
+}
+
+static int nfs_server_return_marked_delegations(struct nfs_server *server,
+ void __always_unused *data)
+{
+ struct nfs_delegation *delegation;
+ struct nfs_delegation *prev;
+ struct inode *inode;
+ struct inode *place_holder = NULL;
+ struct nfs_delegation *place_holder_deleg = NULL;
+ int err = 0;
+
+restart:
+ /*
+ * To avoid quadratic looping we hold a reference
+ * to an inode place_holder. Each time we restart, we
+ * list delegation in the server from the delegations
+ * of that inode.
+ * prev is an RCU-protected pointer to a delegation which
+ * wasn't marked for return and might be a good choice for
+ * the next place_holder.
+ */
+ prev = NULL;
+ delegation = NULL;
+ rcu_read_lock();
+ if (place_holder)
+ delegation = rcu_dereference(NFS_I(place_holder)->delegation);
+ if (!delegation || delegation != place_holder_deleg)
+ delegation = list_entry_rcu(server->delegations.next,
+ struct nfs_delegation, super_list);
+ list_for_each_entry_from_rcu(delegation, &server->delegations, super_list) {
+ struct inode *to_put = NULL;
+
+ if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags))
+ continue;
+ if (!nfs_delegation_need_return(delegation)) {
+ if (nfs4_is_valid_delegation(delegation, 0))
+ prev = delegation;
+ continue;
+ }
+
+ if (prev) {
+ struct inode *tmp = nfs_delegation_grab_inode(prev);
+ if (tmp) {
+ to_put = place_holder;
+ place_holder = tmp;
+ place_holder_deleg = prev;
+ }
+ }
+
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL) {
+ rcu_read_unlock();
+ iput(to_put);
+ goto restart;
+ }
+ delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+ rcu_read_unlock();
+
+ iput(to_put);
+
+ err = nfs_end_delegation_return(inode, delegation, 0);
+ iput(inode);
+ cond_resched();
+ if (!err)
+ goto restart;
+ set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+ goto out;
+ }
+ rcu_read_unlock();
+out:
+ iput(place_holder);
+ return err;
+}
+
+static bool nfs_server_clear_delayed_delegations(struct nfs_server *server)
+{
+ struct nfs_delegation *d;
+ bool ret = false;
+
+ list_for_each_entry_rcu (d, &server->delegations, super_list) {
+ if (!test_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags))
+ continue;
+ nfs_mark_return_delegation(server, d);
+ clear_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags);
+ ret = true;
+ }
+ return ret;
+}
+
+static bool nfs_client_clear_delayed_delegations(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+ bool ret = false;
+
+ if (!test_and_clear_bit(NFS4CLNT_DELEGRETURN_DELAYED, &clp->cl_state))
+ goto out;
+ rcu_read_lock();
+ list_for_each_entry_rcu (server, &clp->cl_superblocks, client_link) {
+ if (nfs_server_clear_delayed_delegations(server))
+ ret = true;
+ }
+ rcu_read_unlock();
+out:
+ return ret;
+}
+
+/**
+ * nfs_client_return_marked_delegations - return previously marked delegations
+ * @clp: nfs_client to process
+ *
+ * Note that this function is designed to be called by the state
+ * manager thread. For this reason, it cannot flush the dirty data,
+ * since that could deadlock in case of a state recovery error.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs_client_return_marked_delegations(struct nfs_client *clp)
+{
+ int err = nfs_client_for_each_server(
+ clp, nfs_server_return_marked_delegations, NULL);
+ if (err)
+ return err;
+ /* If a return was delayed, sleep to prevent hard looping */
+ if (nfs_client_clear_delayed_delegations(clp))
+ ssleep(1);
+ return 0;
+}
+
+/**
+ * nfs_inode_evict_delegation - return delegation, don't reclaim opens
+ * @inode: inode to process
+ *
+ * Does not protect against delegation reclaims, therefore really only safe
+ * to be called from nfs4_clear_inode(). Guaranteed to always free
+ * the delegation structure.
+ */
+void nfs_inode_evict_delegation(struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+
+ delegation = nfs_inode_detach_delegation(inode);
+ if (delegation != NULL) {
+ set_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
+ set_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags);
+ nfs_do_return_delegation(inode, delegation, 1);
+ nfs_free_delegation(delegation);
+ }
+}
+
+/**
+ * nfs4_inode_return_delegation - synchronously return a delegation
+ * @inode: inode to process
+ *
+ * This routine will always flush any dirty data to disk on the
+ * assumption that if we need to return the delegation, then
+ * we should stop caching.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs4_inode_return_delegation(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_delegation *delegation;
+
+ delegation = nfs_start_delegation_return(nfsi);
+ if (delegation != NULL) {
+ /* Synchronous recall of any application leases */
+ break_lease(inode, O_WRONLY | O_RDWR);
+ if (S_ISREG(inode->i_mode))
+ nfs_wb_all(inode);
+ return nfs_end_delegation_return(inode, delegation, 1);
+ }
+ return 0;
+}
+
+/**
+ * nfs4_inode_return_delegation_on_close - asynchronously return a delegation
+ * @inode: inode to process
+ *
+ * This routine is called on file close in order to determine if the
+ * inode delegation needs to be returned immediately.
+ */
+void nfs4_inode_return_delegation_on_close(struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+ struct nfs_delegation *ret = NULL;
+
+ if (!inode)
+ return;
+ rcu_read_lock();
+ delegation = nfs4_get_valid_delegation(inode);
+ if (!delegation)
+ goto out;
+ if (test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) ||
+ atomic_long_read(&nfs_active_delegations) >= nfs_delegation_watermark) {
+ spin_lock(&delegation->lock);
+ if (delegation->inode &&
+ list_empty(&NFS_I(inode)->open_files) &&
+ !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+ clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
+ /* Refcount matched in nfs_end_delegation_return() */
+ ret = nfs_get_delegation(delegation);
+ }
+ spin_unlock(&delegation->lock);
+ if (ret)
+ nfs_clear_verifier_delegated(inode);
+ }
+out:
+ rcu_read_unlock();
+ nfs_end_delegation_return(inode, ret, 0);
+}
+
+/**
+ * nfs4_inode_make_writeable
+ * @inode: pointer to inode
+ *
+ * Make the inode writeable by returning the delegation if necessary
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs4_inode_make_writeable(struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+
+ rcu_read_lock();
+ delegation = nfs4_get_valid_delegation(inode);
+ if (delegation == NULL ||
+ (nfs4_has_session(NFS_SERVER(inode)->nfs_client) &&
+ (delegation->type & FMODE_WRITE))) {
+ rcu_read_unlock();
+ return 0;
+ }
+ rcu_read_unlock();
+ return nfs4_inode_return_delegation(inode);
+}
+
+static void nfs_mark_return_if_closed_delegation(struct nfs_server *server,
+ struct nfs_delegation *delegation)
+{
+ set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
+ set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+}
+
+static bool nfs_server_mark_return_all_delegations(struct nfs_server *server)
+{
+ struct nfs_delegation *delegation;
+ bool ret = false;
+
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ nfs_mark_return_delegation(server, delegation);
+ ret = true;
+ }
+ return ret;
+}
+
+static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs_server_mark_return_all_delegations(server);
+ rcu_read_unlock();
+}
+
+static void nfs_delegation_run_state_manager(struct nfs_client *clp)
+{
+ if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
+ nfs4_schedule_state_manager(clp);
+}
+
+/**
+ * nfs_expire_all_delegations
+ * @clp: client to process
+ *
+ */
+void nfs_expire_all_delegations(struct nfs_client *clp)
+{
+ nfs_client_mark_return_all_delegations(clp);
+ nfs_delegation_run_state_manager(clp);
+}
+
+/**
+ * nfs_server_return_all_delegations - return delegations for one superblock
+ * @server: pointer to nfs_server to process
+ *
+ */
+void nfs_server_return_all_delegations(struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+ bool need_wait;
+
+ if (clp == NULL)
+ return;
+
+ rcu_read_lock();
+ need_wait = nfs_server_mark_return_all_delegations(server);
+ rcu_read_unlock();
+
+ if (need_wait) {
+ nfs4_schedule_state_manager(clp);
+ nfs4_wait_clnt_recover(clp);
+ }
+}
+
+static void nfs_mark_return_unused_delegation_types(struct nfs_server *server,
+ fmode_t flags)
+{
+ struct nfs_delegation *delegation;
+
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
+ continue;
+ if (delegation->type & flags)
+ nfs_mark_return_if_closed_delegation(server, delegation);
+ }
+}
+
+static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *clp,
+ fmode_t flags)
+{
+ struct nfs_server *server;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs_mark_return_unused_delegation_types(server, flags);
+ rcu_read_unlock();
+}
+
+static void nfs_revoke_delegation(struct inode *inode,
+ const nfs4_stateid *stateid)
+{
+ struct nfs_delegation *delegation;
+ nfs4_stateid tmp;
+ bool ret = false;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation == NULL)
+ goto out;
+ if (stateid == NULL) {
+ nfs4_stateid_copy(&tmp, &delegation->stateid);
+ stateid = &tmp;
+ } else {
+ if (!nfs4_stateid_match_other(stateid, &delegation->stateid))
+ goto out;
+ spin_lock(&delegation->lock);
+ if (stateid->seqid) {
+ if (nfs4_stateid_is_newer(&delegation->stateid, stateid)) {
+ spin_unlock(&delegation->lock);
+ goto out;
+ }
+ delegation->stateid.seqid = stateid->seqid;
+ }
+ spin_unlock(&delegation->lock);
+ }
+ nfs_mark_delegation_revoked(delegation);
+ ret = true;
+out:
+ rcu_read_unlock();
+ if (ret)
+ nfs_inode_find_state_and_recover(inode, stateid);
+}
+
+void nfs_remove_bad_delegation(struct inode *inode,
+ const nfs4_stateid *stateid)
+{
+ nfs_revoke_delegation(inode, stateid);
+}
+EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation);
+
+void nfs_delegation_mark_returned(struct inode *inode,
+ const nfs4_stateid *stateid)
+{
+ struct nfs_delegation *delegation;
+
+ if (!inode)
+ return;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (!delegation)
+ goto out_rcu_unlock;
+
+ spin_lock(&delegation->lock);
+ if (!nfs4_stateid_match_other(stateid, &delegation->stateid))
+ goto out_spin_unlock;
+ if (stateid->seqid) {
+ /* If delegation->stateid is newer, dont mark as returned */
+ if (nfs4_stateid_is_newer(&delegation->stateid, stateid))
+ goto out_clear_returning;
+ if (delegation->stateid.seqid != stateid->seqid)
+ delegation->stateid.seqid = stateid->seqid;
+ }
+
+ nfs_mark_delegation_revoked(delegation);
+
+out_clear_returning:
+ clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
+out_spin_unlock:
+ spin_unlock(&delegation->lock);
+out_rcu_unlock:
+ rcu_read_unlock();
+
+ nfs_inode_find_state_and_recover(inode, stateid);
+}
+
+/**
+ * nfs_expire_unused_delegation_types
+ * @clp: client to process
+ * @flags: delegation types to expire
+ *
+ */
+void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags)
+{
+ nfs_client_mark_return_unused_delegation_types(clp, flags);
+ nfs_delegation_run_state_manager(clp);
+}
+
+static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
+{
+ struct nfs_delegation *delegation;
+
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
+ continue;
+ nfs_mark_return_if_closed_delegation(server, delegation);
+ }
+}
+
+/**
+ * nfs_expire_unreferenced_delegations - Eliminate unused delegations
+ * @clp: nfs_client to process
+ *
+ */
+void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs_mark_return_unreferenced_delegations(server);
+ rcu_read_unlock();
+
+ nfs_delegation_run_state_manager(clp);
+}
+
+/**
+ * nfs_async_inode_return_delegation - asynchronously return a delegation
+ * @inode: inode to process
+ * @stateid: state ID information
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs_async_inode_return_delegation(struct inode *inode,
+ const nfs4_stateid *stateid)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs_delegation *delegation;
+
+ rcu_read_lock();
+ delegation = nfs4_get_valid_delegation(inode);
+ if (delegation == NULL)
+ goto out_enoent;
+ if (stateid != NULL &&
+ !clp->cl_mvops->match_stateid(&delegation->stateid, stateid))
+ goto out_enoent;
+ nfs_mark_return_delegation(server, delegation);
+ rcu_read_unlock();
+
+ /* If there are any application leases or delegations, recall them */
+ break_lease(inode, O_WRONLY | O_RDWR | O_NONBLOCK);
+
+ nfs_delegation_run_state_manager(clp);
+ return 0;
+out_enoent:
+ rcu_read_unlock();
+ return -ENOENT;
+}
+
+static struct inode *
+nfs_delegation_find_inode_server(struct nfs_server *server,
+ const struct nfs_fh *fhandle)
+{
+ struct nfs_delegation *delegation;
+ struct super_block *freeme = NULL;
+ struct inode *res = NULL;
+
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ spin_lock(&delegation->lock);
+ if (delegation->inode != NULL &&
+ !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) &&
+ nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
+ if (nfs_sb_active(server->super)) {
+ freeme = server->super;
+ res = igrab(delegation->inode);
+ }
+ spin_unlock(&delegation->lock);
+ if (res != NULL)
+ return res;
+ if (freeme) {
+ rcu_read_unlock();
+ nfs_sb_deactive(freeme);
+ rcu_read_lock();
+ }
+ return ERR_PTR(-EAGAIN);
+ }
+ spin_unlock(&delegation->lock);
+ }
+ return ERR_PTR(-ENOENT);
+}
+
+/**
+ * nfs_delegation_find_inode - retrieve the inode associated with a delegation
+ * @clp: client state handle
+ * @fhandle: filehandle from a delegation recall
+ *
+ * Returns pointer to inode matching "fhandle," or NULL if a matching inode
+ * cannot be found.
+ */
+struct inode *nfs_delegation_find_inode(struct nfs_client *clp,
+ const struct nfs_fh *fhandle)
+{
+ struct nfs_server *server;
+ struct inode *res;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ res = nfs_delegation_find_inode_server(server, fhandle);
+ if (res != ERR_PTR(-ENOENT)) {
+ rcu_read_unlock();
+ return res;
+ }
+ }
+ rcu_read_unlock();
+ return ERR_PTR(-ENOENT);
+}
+
+static void nfs_delegation_mark_reclaim_server(struct nfs_server *server)
+{
+ struct nfs_delegation *delegation;
+
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ /*
+ * If the delegation may have been admin revoked, then we
+ * cannot reclaim it.
+ */
+ if (test_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags))
+ continue;
+ set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+ }
+}
+
+/**
+ * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed
+ * @clp: nfs_client to process
+ *
+ */
+void nfs_delegation_mark_reclaim(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs_delegation_mark_reclaim_server(server);
+ rcu_read_unlock();
+}
+
+static int nfs_server_reap_unclaimed_delegations(struct nfs_server *server,
+ void __always_unused *data)
+{
+ struct nfs_delegation *delegation;
+ struct inode *inode;
+restart:
+ rcu_read_lock();
+restart_locked:
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ if (test_bit(NFS_DELEGATION_INODE_FREEING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_NEED_RECLAIM,
+ &delegation->flags) == 0)
+ continue;
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL)
+ goto restart_locked;
+ delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+ rcu_read_unlock();
+ if (delegation != NULL) {
+ if (nfs_detach_delegation(NFS_I(inode), delegation,
+ server) != NULL)
+ nfs_free_delegation(delegation);
+ /* Match nfs_start_delegation_return_locked */
+ nfs_put_delegation(delegation);
+ }
+ iput(inode);
+ cond_resched();
+ goto restart;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+/**
+ * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
+ * @clp: nfs_client to process
+ *
+ */
+void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
+{
+ nfs_client_for_each_server(clp, nfs_server_reap_unclaimed_delegations,
+ NULL);
+}
+
+static inline bool nfs4_server_rebooted(const struct nfs_client *clp)
+{
+ return (clp->cl_state & (BIT(NFS4CLNT_CHECK_LEASE) |
+ BIT(NFS4CLNT_LEASE_EXPIRED) |
+ BIT(NFS4CLNT_SESSION_RESET))) != 0;
+}
+
+static void nfs_mark_test_expired_delegation(struct nfs_server *server,
+ struct nfs_delegation *delegation)
+{
+ if (delegation->stateid.type == NFS4_INVALID_STATEID_TYPE)
+ return;
+ clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+ set_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+ set_bit(NFS4CLNT_DELEGATION_EXPIRED, &server->nfs_client->cl_state);
+}
+
+static void nfs_inode_mark_test_expired_delegation(struct nfs_server *server,
+ struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation)
+ nfs_mark_test_expired_delegation(server, delegation);
+ rcu_read_unlock();
+
+}
+
+static void nfs_delegation_mark_test_expired_server(struct nfs_server *server)
+{
+ struct nfs_delegation *delegation;
+
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list)
+ nfs_mark_test_expired_delegation(server, delegation);
+}
+
+/**
+ * nfs_mark_test_expired_all_delegations - mark all delegations for testing
+ * @clp: nfs_client to process
+ *
+ * Iterates through all the delegations associated with this server and
+ * marks them as needing to be checked for validity.
+ */
+void nfs_mark_test_expired_all_delegations(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs_delegation_mark_test_expired_server(server);
+ rcu_read_unlock();
+}
+
+/**
+ * nfs_test_expired_all_delegations - test all delegations for a client
+ * @clp: nfs_client to process
+ *
+ * Helper for handling "recallable state revoked" status from server.
+ */
+void nfs_test_expired_all_delegations(struct nfs_client *clp)
+{
+ nfs_mark_test_expired_all_delegations(clp);
+ nfs4_schedule_state_manager(clp);
+}
+
+static void
+nfs_delegation_test_free_expired(struct inode *inode,
+ nfs4_stateid *stateid,
+ const struct cred *cred)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops;
+ int status;
+
+ if (!cred)
+ return;
+ status = ops->test_and_free_expired(server, stateid, cred);
+ if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID)
+ nfs_remove_bad_delegation(inode, stateid);
+}
+
+static int nfs_server_reap_expired_delegations(struct nfs_server *server,
+ void __always_unused *data)
+{
+ struct nfs_delegation *delegation;
+ struct inode *inode;
+ const struct cred *cred;
+ nfs4_stateid stateid;
+restart:
+ rcu_read_lock();
+restart_locked:
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ if (test_bit(NFS_DELEGATION_INODE_FREEING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_TEST_EXPIRED,
+ &delegation->flags) == 0)
+ continue;
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL)
+ goto restart_locked;
+ spin_lock(&delegation->lock);
+ cred = get_cred_rcu(delegation->cred);
+ nfs4_stateid_copy(&stateid, &delegation->stateid);
+ spin_unlock(&delegation->lock);
+ clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+ rcu_read_unlock();
+ nfs_delegation_test_free_expired(inode, &stateid, cred);
+ put_cred(cred);
+ if (!nfs4_server_rebooted(server->nfs_client)) {
+ iput(inode);
+ cond_resched();
+ goto restart;
+ }
+ nfs_inode_mark_test_expired_delegation(server,inode);
+ iput(inode);
+ return -EAGAIN;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+/**
+ * nfs_reap_expired_delegations - reap expired delegations
+ * @clp: nfs_client to process
+ *
+ * Iterates through all the delegations associated with this server and
+ * checks if they have may have been revoked. This function is usually
+ * expected to be called in cases where the server may have lost its
+ * lease.
+ */
+void nfs_reap_expired_delegations(struct nfs_client *clp)
+{
+ nfs_client_for_each_server(clp, nfs_server_reap_expired_delegations,
+ NULL);
+}
+
+void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
+ const nfs4_stateid *stateid)
+{
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+ struct nfs_delegation *delegation;
+ bool found = false;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation &&
+ nfs4_stateid_match_or_older(&delegation->stateid, stateid) &&
+ !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+ nfs_mark_test_expired_delegation(NFS_SERVER(inode), delegation);
+ found = true;
+ }
+ rcu_read_unlock();
+ if (found)
+ nfs4_schedule_state_manager(clp);
+}
+
+/**
+ * nfs_delegations_present - check for existence of delegations
+ * @clp: client state handle
+ *
+ * Returns one if there are any nfs_delegation structures attached
+ * to this nfs_client.
+ */
+int nfs_delegations_present(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+ int ret = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ if (!list_empty(&server->delegations)) {
+ ret = 1;
+ break;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * nfs4_refresh_delegation_stateid - Update delegation stateid seqid
+ * @dst: stateid to refresh
+ * @inode: inode to check
+ *
+ * Returns "true" and updates "dst->seqid" * if inode had a delegation
+ * that matches our delegation stateid. Otherwise "false" is returned.
+ */
+bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+ bool ret = false;
+ if (!inode)
+ goto out;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation != NULL &&
+ nfs4_stateid_match_other(dst, &delegation->stateid) &&
+ nfs4_stateid_is_newer(&delegation->stateid, dst) &&
+ !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+ dst->seqid = delegation->stateid.seqid;
+ ret = true;
+ }
+ rcu_read_unlock();
+out:
+ return ret;
+}
+
+/**
+ * nfs4_copy_delegation_stateid - Copy inode's state ID information
+ * @inode: inode to check
+ * @flags: delegation type requirement
+ * @dst: stateid data structure to fill in
+ * @cred: optional argument to retrieve credential
+ *
+ * Returns "true" and fills in "dst->data" * if inode had a delegation,
+ * otherwise "false" is returned.
+ */
+bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
+ nfs4_stateid *dst, const struct cred **cred)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_delegation *delegation;
+ bool ret = false;
+
+ flags &= FMODE_READ|FMODE_WRITE;
+ rcu_read_lock();
+ delegation = rcu_dereference(nfsi->delegation);
+ if (!delegation)
+ goto out;
+ spin_lock(&delegation->lock);
+ ret = nfs4_is_valid_delegation(delegation, flags);
+ if (ret) {
+ nfs4_stateid_copy(dst, &delegation->stateid);
+ nfs_mark_delegation_referenced(delegation);
+ if (cred)
+ *cred = get_cred(delegation->cred);
+ }
+ spin_unlock(&delegation->lock);
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * nfs4_delegation_flush_on_close - Check if we must flush file on close
+ * @inode: inode to check
+ *
+ * This function checks the number of outstanding writes to the file
+ * against the delegation 'space_limit' field to see if
+ * the spec requires us to flush the file on close.
+ */
+bool nfs4_delegation_flush_on_close(const struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_delegation *delegation;
+ bool ret = true;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(nfsi->delegation);
+ if (delegation == NULL || !(delegation->type & FMODE_WRITE))
+ goto out;
+ if (atomic_long_read(&nfsi->nrequests) < delegation->pagemod_limit)
+ ret = false;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+module_param_named(delegation_watermark, nfs_delegation_watermark, uint, 0644);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
new file mode 100644
index 0000000000..1c378992b7
--- /dev/null
+++ b/fs/nfs/delegation.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * linux/fs/nfs/delegation.h
+ *
+ * Copyright (c) Trond Myklebust
+ *
+ * Definitions pertaining to NFS delegated files
+ */
+#ifndef FS_NFS_DELEGATION_H
+#define FS_NFS_DELEGATION_H
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+/*
+ * NFSv4 delegation
+ */
+struct nfs_delegation {
+ struct list_head super_list;
+ const struct cred *cred;
+ struct inode *inode;
+ nfs4_stateid stateid;
+ fmode_t type;
+ unsigned long pagemod_limit;
+ __u64 change_attr;
+ unsigned long flags;
+ refcount_t refcount;
+ spinlock_t lock;
+ struct rcu_head rcu;
+};
+
+enum {
+ NFS_DELEGATION_NEED_RECLAIM = 0,
+ NFS_DELEGATION_RETURN,
+ NFS_DELEGATION_RETURN_IF_CLOSED,
+ NFS_DELEGATION_REFERENCED,
+ NFS_DELEGATION_RETURNING,
+ NFS_DELEGATION_REVOKED,
+ NFS_DELEGATION_TEST_EXPIRED,
+ NFS_DELEGATION_INODE_FREEING,
+ NFS_DELEGATION_RETURN_DELAYED,
+};
+
+int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
+ fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit);
+void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
+ fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit);
+int nfs4_inode_return_delegation(struct inode *inode);
+void nfs4_inode_return_delegation_on_close(struct inode *inode);
+int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
+void nfs_inode_evict_delegation(struct inode *inode);
+
+struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
+void nfs_server_return_all_delegations(struct nfs_server *);
+void nfs_expire_all_delegations(struct nfs_client *clp);
+void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags);
+void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
+int nfs_client_return_marked_delegations(struct nfs_client *clp);
+int nfs_delegations_present(struct nfs_client *clp);
+void nfs_remove_bad_delegation(struct inode *inode, const nfs4_stateid *stateid);
+void nfs_delegation_mark_returned(struct inode *inode, const nfs4_stateid *stateid);
+
+void nfs_delegation_mark_reclaim(struct nfs_client *clp);
+void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
+
+void nfs_mark_test_expired_all_delegations(struct nfs_client *clp);
+void nfs_test_expired_all_delegations(struct nfs_client *clp);
+void nfs_reap_expired_delegations(struct nfs_client *clp);
+
+/* NFSv4 delegation-related procedures */
+int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync);
+int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
+int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
+bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, const struct cred **cred);
+bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
+
+struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode);
+void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
+int nfs4_have_delegation(struct inode *inode, fmode_t flags);
+int nfs4_check_delegation(struct inode *inode, fmode_t flags);
+bool nfs4_delegation_flush_on_close(const struct inode *inode);
+void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
+ const nfs4_stateid *stateid);
+int nfs4_inode_make_writeable(struct inode *inode);
+
+#endif
+
+static inline int nfs_have_delegated_attributes(struct inode *inode)
+{
+ return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ);
+}
+
+#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
new file mode 100644
index 0000000000..9fc5061d51
--- /dev/null
+++ b/fs/nfs/dir.c
@@ -0,0 +1,3324 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/dir.c
+ *
+ * Copyright (C) 1992 Rick Sladkey
+ *
+ * nfs directory handling functions
+ *
+ * 10 Apr 1996 Added silly rename for unlink --okir
+ * 28 Sep 1996 Improved directory cache --okir
+ * 23 Aug 1997 Claus Heine claus@momo.math.rwth-aachen.de
+ * Re-implemented silly rename for unlink, newly implemented
+ * silly rename for nfs_rename() following the suggestions
+ * of Olaf Kirch (okir) found in this file.
+ * Following Linus comments on my original hack, this version
+ * depends only on the dcache stuff and doesn't touch the inode
+ * layer (iput() and friends).
+ * 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM
+ */
+
+#include <linux/compat.h>
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/swap.h>
+#include <linux/sched.h>
+#include <linux/kmemleak.h>
+#include <linux/xattr.h>
+#include <linux/hash.h>
+
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+#include "fscache.h"
+
+#include "nfstrace.h"
+
+/* #define NFS_DEBUG_VERBOSE 1 */
+
+static int nfs_opendir(struct inode *, struct file *);
+static int nfs_closedir(struct inode *, struct file *);
+static int nfs_readdir(struct file *, struct dir_context *);
+static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
+static loff_t nfs_llseek_dir(struct file *, loff_t, int);
+static void nfs_readdir_clear_array(struct folio *);
+
+const struct file_operations nfs_dir_operations = {
+ .llseek = nfs_llseek_dir,
+ .read = generic_read_dir,
+ .iterate_shared = nfs_readdir,
+ .open = nfs_opendir,
+ .release = nfs_closedir,
+ .fsync = nfs_fsync_dir,
+};
+
+const struct address_space_operations nfs_dir_aops = {
+ .free_folio = nfs_readdir_clear_array,
+};
+
+#define NFS_INIT_DTSIZE PAGE_SIZE
+
+static struct nfs_open_dir_context *
+alloc_nfs_open_dir_context(struct inode *dir)
+{
+ struct nfs_inode *nfsi = NFS_I(dir);
+ struct nfs_open_dir_context *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
+ if (ctx != NULL) {
+ ctx->attr_gencount = nfsi->attr_gencount;
+ ctx->dtsize = NFS_INIT_DTSIZE;
+ spin_lock(&dir->i_lock);
+ if (list_empty(&nfsi->open_files) &&
+ (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
+ nfs_set_cache_invalid(dir,
+ NFS_INO_INVALID_DATA |
+ NFS_INO_REVAL_FORCED);
+ list_add_tail_rcu(&ctx->list, &nfsi->open_files);
+ memcpy(ctx->verf, nfsi->cookieverf, sizeof(ctx->verf));
+ spin_unlock(&dir->i_lock);
+ return ctx;
+ }
+ return ERR_PTR(-ENOMEM);
+}
+
+static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx)
+{
+ spin_lock(&dir->i_lock);
+ list_del_rcu(&ctx->list);
+ spin_unlock(&dir->i_lock);
+ kfree_rcu(ctx, rcu_head);
+}
+
+/*
+ * Open file
+ */
+static int
+nfs_opendir(struct inode *inode, struct file *filp)
+{
+ int res = 0;
+ struct nfs_open_dir_context *ctx;
+
+ dfprintk(FILE, "NFS: open dir(%pD2)\n", filp);
+
+ nfs_inc_stats(inode, NFSIOS_VFSOPEN);
+
+ ctx = alloc_nfs_open_dir_context(inode);
+ if (IS_ERR(ctx)) {
+ res = PTR_ERR(ctx);
+ goto out;
+ }
+ filp->private_data = ctx;
+out:
+ return res;
+}
+
+static int
+nfs_closedir(struct inode *inode, struct file *filp)
+{
+ put_nfs_open_dir_context(file_inode(filp), filp->private_data);
+ return 0;
+}
+
+struct nfs_cache_array_entry {
+ u64 cookie;
+ u64 ino;
+ const char *name;
+ unsigned int name_len;
+ unsigned char d_type;
+};
+
+struct nfs_cache_array {
+ u64 change_attr;
+ u64 last_cookie;
+ unsigned int size;
+ unsigned char folio_full : 1,
+ folio_is_eof : 1,
+ cookies_are_ordered : 1;
+ struct nfs_cache_array_entry array[];
+};
+
+struct nfs_readdir_descriptor {
+ struct file *file;
+ struct folio *folio;
+ struct dir_context *ctx;
+ pgoff_t folio_index;
+ pgoff_t folio_index_max;
+ u64 dir_cookie;
+ u64 last_cookie;
+ loff_t current_index;
+
+ __be32 verf[NFS_DIR_VERIFIER_SIZE];
+ unsigned long dir_verifier;
+ unsigned long timestamp;
+ unsigned long gencount;
+ unsigned long attr_gencount;
+ unsigned int cache_entry_index;
+ unsigned int buffer_fills;
+ unsigned int dtsize;
+ bool clear_cache;
+ bool plus;
+ bool eob;
+ bool eof;
+};
+
+static void nfs_set_dtsize(struct nfs_readdir_descriptor *desc, unsigned int sz)
+{
+ struct nfs_server *server = NFS_SERVER(file_inode(desc->file));
+ unsigned int maxsize = server->dtsize;
+
+ if (sz > maxsize)
+ sz = maxsize;
+ if (sz < NFS_MIN_FILE_IO_SIZE)
+ sz = NFS_MIN_FILE_IO_SIZE;
+ desc->dtsize = sz;
+}
+
+static void nfs_shrink_dtsize(struct nfs_readdir_descriptor *desc)
+{
+ nfs_set_dtsize(desc, desc->dtsize >> 1);
+}
+
+static void nfs_grow_dtsize(struct nfs_readdir_descriptor *desc)
+{
+ nfs_set_dtsize(desc, desc->dtsize << 1);
+}
+
+static void nfs_readdir_folio_init_array(struct folio *folio, u64 last_cookie,
+ u64 change_attr)
+{
+ struct nfs_cache_array *array;
+
+ array = kmap_local_folio(folio, 0);
+ array->change_attr = change_attr;
+ array->last_cookie = last_cookie;
+ array->size = 0;
+ array->folio_full = 0;
+ array->folio_is_eof = 0;
+ array->cookies_are_ordered = 1;
+ kunmap_local(array);
+}
+
+/*
+ * we are freeing strings created by nfs_add_to_readdir_array()
+ */
+static void nfs_readdir_clear_array(struct folio *folio)
+{
+ struct nfs_cache_array *array;
+ unsigned int i;
+
+ array = kmap_local_folio(folio, 0);
+ for (i = 0; i < array->size; i++)
+ kfree(array->array[i].name);
+ array->size = 0;
+ kunmap_local(array);
+}
+
+static void nfs_readdir_folio_reinit_array(struct folio *folio, u64 last_cookie,
+ u64 change_attr)
+{
+ nfs_readdir_clear_array(folio);
+ nfs_readdir_folio_init_array(folio, last_cookie, change_attr);
+}
+
+static struct folio *
+nfs_readdir_folio_array_alloc(u64 last_cookie, gfp_t gfp_flags)
+{
+ struct folio *folio = folio_alloc(gfp_flags, 0);
+ if (folio)
+ nfs_readdir_folio_init_array(folio, last_cookie, 0);
+ return folio;
+}
+
+static void nfs_readdir_folio_array_free(struct folio *folio)
+{
+ if (folio) {
+ nfs_readdir_clear_array(folio);
+ folio_put(folio);
+ }
+}
+
+static u64 nfs_readdir_array_index_cookie(struct nfs_cache_array *array)
+{
+ return array->size == 0 ? array->last_cookie : array->array[0].cookie;
+}
+
+static void nfs_readdir_array_set_eof(struct nfs_cache_array *array)
+{
+ array->folio_is_eof = 1;
+ array->folio_full = 1;
+}
+
+static bool nfs_readdir_array_is_full(struct nfs_cache_array *array)
+{
+ return array->folio_full;
+}
+
+/*
+ * the caller is responsible for freeing qstr.name
+ * when called by nfs_readdir_add_to_array, the strings will be freed in
+ * nfs_clear_readdir_array()
+ */
+static const char *nfs_readdir_copy_name(const char *name, unsigned int len)
+{
+ const char *ret = kmemdup_nul(name, len, GFP_KERNEL);
+
+ /*
+ * Avoid a kmemleak false positive. The pointer to the name is stored
+ * in a page cache page which kmemleak does not scan.
+ */
+ if (ret != NULL)
+ kmemleak_not_leak(ret);
+ return ret;
+}
+
+static size_t nfs_readdir_array_maxentries(void)
+{
+ return (PAGE_SIZE - sizeof(struct nfs_cache_array)) /
+ sizeof(struct nfs_cache_array_entry);
+}
+
+/*
+ * Check that the next array entry lies entirely within the page bounds
+ */
+static int nfs_readdir_array_can_expand(struct nfs_cache_array *array)
+{
+ if (array->folio_full)
+ return -ENOSPC;
+ if (array->size == nfs_readdir_array_maxentries()) {
+ array->folio_full = 1;
+ return -ENOSPC;
+ }
+ return 0;
+}
+
+static int nfs_readdir_folio_array_append(struct folio *folio,
+ const struct nfs_entry *entry,
+ u64 *cookie)
+{
+ struct nfs_cache_array *array;
+ struct nfs_cache_array_entry *cache_entry;
+ const char *name;
+ int ret = -ENOMEM;
+
+ name = nfs_readdir_copy_name(entry->name, entry->len);
+
+ array = kmap_local_folio(folio, 0);
+ if (!name)
+ goto out;
+ ret = nfs_readdir_array_can_expand(array);
+ if (ret) {
+ kfree(name);
+ goto out;
+ }
+
+ cache_entry = &array->array[array->size];
+ cache_entry->cookie = array->last_cookie;
+ cache_entry->ino = entry->ino;
+ cache_entry->d_type = entry->d_type;
+ cache_entry->name_len = entry->len;
+ cache_entry->name = name;
+ array->last_cookie = entry->cookie;
+ if (array->last_cookie <= cache_entry->cookie)
+ array->cookies_are_ordered = 0;
+ array->size++;
+ if (entry->eof != 0)
+ nfs_readdir_array_set_eof(array);
+out:
+ *cookie = array->last_cookie;
+ kunmap_local(array);
+ return ret;
+}
+
+#define NFS_READDIR_COOKIE_MASK (U32_MAX >> 14)
+/*
+ * Hash algorithm allowing content addressible access to sequences
+ * of directory cookies. Content is addressed by the value of the
+ * cookie index of the first readdir entry in a page.
+ *
+ * We select only the first 18 bits to avoid issues with excessive
+ * memory use for the page cache XArray. 18 bits should allow the caching
+ * of 262144 pages of sequences of readdir entries. Since each page holds
+ * 127 readdir entries for a typical 64-bit system, that works out to a
+ * cache of ~ 33 million entries per directory.
+ */
+static pgoff_t nfs_readdir_folio_cookie_hash(u64 cookie)
+{
+ if (cookie == 0)
+ return 0;
+ return hash_64(cookie, 18);
+}
+
+static bool nfs_readdir_folio_validate(struct folio *folio, u64 last_cookie,
+ u64 change_attr)
+{
+ struct nfs_cache_array *array = kmap_local_folio(folio, 0);
+ int ret = true;
+
+ if (array->change_attr != change_attr)
+ ret = false;
+ if (nfs_readdir_array_index_cookie(array) != last_cookie)
+ ret = false;
+ kunmap_local(array);
+ return ret;
+}
+
+static void nfs_readdir_folio_unlock_and_put(struct folio *folio)
+{
+ folio_unlock(folio);
+ folio_put(folio);
+}
+
+static void nfs_readdir_folio_init_and_validate(struct folio *folio, u64 cookie,
+ u64 change_attr)
+{
+ if (folio_test_uptodate(folio)) {
+ if (nfs_readdir_folio_validate(folio, cookie, change_attr))
+ return;
+ nfs_readdir_clear_array(folio);
+ }
+ nfs_readdir_folio_init_array(folio, cookie, change_attr);
+ folio_mark_uptodate(folio);
+}
+
+static struct folio *nfs_readdir_folio_get_locked(struct address_space *mapping,
+ u64 cookie, u64 change_attr)
+{
+ pgoff_t index = nfs_readdir_folio_cookie_hash(cookie);
+ struct folio *folio;
+
+ folio = filemap_grab_folio(mapping, index);
+ if (IS_ERR(folio))
+ return NULL;
+ nfs_readdir_folio_init_and_validate(folio, cookie, change_attr);
+ return folio;
+}
+
+static u64 nfs_readdir_folio_last_cookie(struct folio *folio)
+{
+ struct nfs_cache_array *array;
+ u64 ret;
+
+ array = kmap_local_folio(folio, 0);
+ ret = array->last_cookie;
+ kunmap_local(array);
+ return ret;
+}
+
+static bool nfs_readdir_folio_needs_filling(struct folio *folio)
+{
+ struct nfs_cache_array *array;
+ bool ret;
+
+ array = kmap_local_folio(folio, 0);
+ ret = !nfs_readdir_array_is_full(array);
+ kunmap_local(array);
+ return ret;
+}
+
+static void nfs_readdir_folio_set_eof(struct folio *folio)
+{
+ struct nfs_cache_array *array;
+
+ array = kmap_local_folio(folio, 0);
+ nfs_readdir_array_set_eof(array);
+ kunmap_local(array);
+}
+
+static struct folio *nfs_readdir_folio_get_next(struct address_space *mapping,
+ u64 cookie, u64 change_attr)
+{
+ pgoff_t index = nfs_readdir_folio_cookie_hash(cookie);
+ struct folio *folio;
+
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ return NULL;
+ nfs_readdir_folio_init_and_validate(folio, cookie, change_attr);
+ if (nfs_readdir_folio_last_cookie(folio) != cookie)
+ nfs_readdir_folio_reinit_array(folio, cookie, change_attr);
+ return folio;
+}
+
+static inline
+int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+ return in_compat_syscall();
+#else
+ return (BITS_PER_LONG == 32);
+#endif
+}
+
+static
+bool nfs_readdir_use_cookie(const struct file *filp)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return false;
+ return true;
+}
+
+static void nfs_readdir_seek_next_array(struct nfs_cache_array *array,
+ struct nfs_readdir_descriptor *desc)
+{
+ if (array->folio_full) {
+ desc->last_cookie = array->last_cookie;
+ desc->current_index += array->size;
+ desc->cache_entry_index = 0;
+ desc->folio_index++;
+ } else
+ desc->last_cookie = nfs_readdir_array_index_cookie(array);
+}
+
+static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc)
+{
+ desc->current_index = 0;
+ desc->last_cookie = 0;
+ desc->folio_index = 0;
+}
+
+static int nfs_readdir_search_for_pos(struct nfs_cache_array *array,
+ struct nfs_readdir_descriptor *desc)
+{
+ loff_t diff = desc->ctx->pos - desc->current_index;
+ unsigned int index;
+
+ if (diff < 0)
+ goto out_eof;
+ if (diff >= array->size) {
+ if (array->folio_is_eof)
+ goto out_eof;
+ nfs_readdir_seek_next_array(array, desc);
+ return -EAGAIN;
+ }
+
+ index = (unsigned int)diff;
+ desc->dir_cookie = array->array[index].cookie;
+ desc->cache_entry_index = index;
+ return 0;
+out_eof:
+ desc->eof = true;
+ return -EBADCOOKIE;
+}
+
+static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array,
+ u64 cookie)
+{
+ if (!array->cookies_are_ordered)
+ return true;
+ /* Optimisation for monotonically increasing cookies */
+ if (cookie >= array->last_cookie)
+ return false;
+ if (array->size && cookie < array->array[0].cookie)
+ return false;
+ return true;
+}
+
+static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
+ struct nfs_readdir_descriptor *desc)
+{
+ unsigned int i;
+ int status = -EAGAIN;
+
+ if (!nfs_readdir_array_cookie_in_range(array, desc->dir_cookie))
+ goto check_eof;
+
+ for (i = 0; i < array->size; i++) {
+ if (array->array[i].cookie == desc->dir_cookie) {
+ if (nfs_readdir_use_cookie(desc->file))
+ desc->ctx->pos = desc->dir_cookie;
+ else
+ desc->ctx->pos = desc->current_index + i;
+ desc->cache_entry_index = i;
+ return 0;
+ }
+ }
+check_eof:
+ if (array->folio_is_eof) {
+ status = -EBADCOOKIE;
+ if (desc->dir_cookie == array->last_cookie)
+ desc->eof = true;
+ } else
+ nfs_readdir_seek_next_array(array, desc);
+ return status;
+}
+
+static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc)
+{
+ struct nfs_cache_array *array;
+ int status;
+
+ array = kmap_local_folio(desc->folio, 0);
+
+ if (desc->dir_cookie == 0)
+ status = nfs_readdir_search_for_pos(array, desc);
+ else
+ status = nfs_readdir_search_for_cookie(array, desc);
+
+ kunmap_local(array);
+ return status;
+}
+
+/* Fill a page with xdr information before transferring to the cache page */
+static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc,
+ __be32 *verf, u64 cookie,
+ struct page **pages, size_t bufsize,
+ __be32 *verf_res)
+{
+ struct inode *inode = file_inode(desc->file);
+ struct nfs_readdir_arg arg = {
+ .dentry = file_dentry(desc->file),
+ .cred = desc->file->f_cred,
+ .verf = verf,
+ .cookie = cookie,
+ .pages = pages,
+ .page_len = bufsize,
+ .plus = desc->plus,
+ };
+ struct nfs_readdir_res res = {
+ .verf = verf_res,
+ };
+ unsigned long timestamp, gencount;
+ int error;
+
+ again:
+ timestamp = jiffies;
+ gencount = nfs_inc_attr_generation_counter();
+ desc->dir_verifier = nfs_save_change_attribute(inode);
+ error = NFS_PROTO(inode)->readdir(&arg, &res);
+ if (error < 0) {
+ /* We requested READDIRPLUS, but the server doesn't grok it */
+ if (error == -ENOTSUPP && desc->plus) {
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
+ desc->plus = arg.plus = false;
+ goto again;
+ }
+ goto error;
+ }
+ desc->timestamp = timestamp;
+ desc->gencount = gencount;
+error:
+ return error;
+}
+
+static int xdr_decode(struct nfs_readdir_descriptor *desc,
+ struct nfs_entry *entry, struct xdr_stream *xdr)
+{
+ struct inode *inode = file_inode(desc->file);
+ int error;
+
+ error = NFS_PROTO(inode)->decode_dirent(xdr, entry, desc->plus);
+ if (error)
+ return error;
+ entry->fattr->time_start = desc->timestamp;
+ entry->fattr->gencount = desc->gencount;
+ return 0;
+}
+
+/* Match file and dirent using either filehandle or fileid
+ * Note: caller is responsible for checking the fsid
+ */
+static
+int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
+{
+ struct inode *inode;
+ struct nfs_inode *nfsi;
+
+ if (d_really_is_negative(dentry))
+ return 0;
+
+ inode = d_inode(dentry);
+ if (is_bad_inode(inode) || NFS_STALE(inode))
+ return 0;
+
+ nfsi = NFS_I(inode);
+ if (entry->fattr->fileid != nfsi->fileid)
+ return 0;
+ if (entry->fh->size && nfs_compare_fh(entry->fh, &nfsi->fh) != 0)
+ return 0;
+ return 1;
+}
+
+#define NFS_READDIR_CACHE_USAGE_THRESHOLD (8UL)
+
+static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx,
+ unsigned int cache_hits,
+ unsigned int cache_misses)
+{
+ if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
+ return false;
+ if (ctx->pos == 0 ||
+ cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD)
+ return true;
+ return false;
+}
+
+/*
+ * This function is called by the getattr code to request the
+ * use of readdirplus to accelerate any future lookups in the same
+ * directory.
+ */
+void nfs_readdir_record_entry_cache_hit(struct inode *dir)
+{
+ struct nfs_inode *nfsi = NFS_I(dir);
+ struct nfs_open_dir_context *ctx;
+
+ if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
+ S_ISDIR(dir->i_mode)) {
+ rcu_read_lock();
+ list_for_each_entry_rcu (ctx, &nfsi->open_files, list)
+ atomic_inc(&ctx->cache_hits);
+ rcu_read_unlock();
+ }
+}
+
+/*
+ * This function is mainly for use by nfs_getattr().
+ *
+ * If this is an 'ls -l', we want to force use of readdirplus.
+ */
+void nfs_readdir_record_entry_cache_miss(struct inode *dir)
+{
+ struct nfs_inode *nfsi = NFS_I(dir);
+ struct nfs_open_dir_context *ctx;
+
+ if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
+ S_ISDIR(dir->i_mode)) {
+ rcu_read_lock();
+ list_for_each_entry_rcu (ctx, &nfsi->open_files, list)
+ atomic_inc(&ctx->cache_misses);
+ rcu_read_unlock();
+ }
+}
+
+static void nfs_lookup_advise_force_readdirplus(struct inode *dir,
+ unsigned int flags)
+{
+ if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE))
+ return;
+ if (flags & (LOOKUP_EXCL | LOOKUP_PARENT | LOOKUP_REVAL))
+ return;
+ nfs_readdir_record_entry_cache_miss(dir);
+}
+
+static
+void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry,
+ unsigned long dir_verifier)
+{
+ struct qstr filename = QSTR_INIT(entry->name, entry->len);
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ struct dentry *dentry;
+ struct dentry *alias;
+ struct inode *inode;
+ int status;
+
+ if (!(entry->fattr->valid & NFS_ATTR_FATTR_FILEID))
+ return;
+ if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID))
+ return;
+ if (filename.len == 0)
+ return;
+ /* Validate that the name doesn't contain any illegal '\0' */
+ if (strnlen(filename.name, filename.len) != filename.len)
+ return;
+ /* ...or '/' */
+ if (strnchr(filename.name, filename.len, '/'))
+ return;
+ if (filename.name[0] == '.') {
+ if (filename.len == 1)
+ return;
+ if (filename.len == 2 && filename.name[1] == '.')
+ return;
+ }
+ filename.hash = full_name_hash(parent, filename.name, filename.len);
+
+ dentry = d_lookup(parent, &filename);
+again:
+ if (!dentry) {
+ dentry = d_alloc_parallel(parent, &filename, &wq);
+ if (IS_ERR(dentry))
+ return;
+ }
+ if (!d_in_lookup(dentry)) {
+ /* Is there a mountpoint here? If so, just exit */
+ if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid,
+ &entry->fattr->fsid))
+ goto out;
+ if (nfs_same_file(dentry, entry)) {
+ if (!entry->fh->size)
+ goto out;
+ nfs_set_verifier(dentry, dir_verifier);
+ status = nfs_refresh_inode(d_inode(dentry), entry->fattr);
+ if (!status)
+ nfs_setsecurity(d_inode(dentry), entry->fattr);
+ trace_nfs_readdir_lookup_revalidate(d_inode(parent),
+ dentry, 0, status);
+ goto out;
+ } else {
+ trace_nfs_readdir_lookup_revalidate_failed(
+ d_inode(parent), dentry, 0);
+ d_invalidate(dentry);
+ dput(dentry);
+ dentry = NULL;
+ goto again;
+ }
+ }
+ if (!entry->fh->size) {
+ d_lookup_done(dentry);
+ goto out;
+ }
+
+ inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
+ alias = d_splice_alias(inode, dentry);
+ d_lookup_done(dentry);
+ if (alias) {
+ if (IS_ERR(alias))
+ goto out;
+ dput(dentry);
+ dentry = alias;
+ }
+ nfs_set_verifier(dentry, dir_verifier);
+ trace_nfs_readdir_lookup(d_inode(parent), dentry, 0);
+out:
+ dput(dentry);
+}
+
+static int nfs_readdir_entry_decode(struct nfs_readdir_descriptor *desc,
+ struct nfs_entry *entry,
+ struct xdr_stream *stream)
+{
+ int ret;
+
+ if (entry->fattr->label)
+ entry->fattr->label->len = NFS4_MAXLABELLEN;
+ ret = xdr_decode(desc, entry, stream);
+ if (ret || !desc->plus)
+ return ret;
+ nfs_prime_dcache(file_dentry(desc->file), entry, desc->dir_verifier);
+ return 0;
+}
+
+/* Perform conversion from xdr to cache array */
+static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc,
+ struct nfs_entry *entry,
+ struct page **xdr_pages, unsigned int buflen,
+ struct folio **arrays, size_t narrays,
+ u64 change_attr)
+{
+ struct address_space *mapping = desc->file->f_mapping;
+ struct folio *new, *folio = *arrays;
+ struct xdr_stream stream;
+ struct page *scratch;
+ struct xdr_buf buf;
+ u64 cookie;
+ int status;
+
+ scratch = alloc_page(GFP_KERNEL);
+ if (scratch == NULL)
+ return -ENOMEM;
+
+ xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
+ xdr_set_scratch_page(&stream, scratch);
+
+ do {
+ status = nfs_readdir_entry_decode(desc, entry, &stream);
+ if (status != 0)
+ break;
+
+ status = nfs_readdir_folio_array_append(folio, entry, &cookie);
+ if (status != -ENOSPC)
+ continue;
+
+ if (folio->mapping != mapping) {
+ if (!--narrays)
+ break;
+ new = nfs_readdir_folio_array_alloc(cookie, GFP_KERNEL);
+ if (!new)
+ break;
+ arrays++;
+ *arrays = folio = new;
+ } else {
+ new = nfs_readdir_folio_get_next(mapping, cookie,
+ change_attr);
+ if (!new)
+ break;
+ if (folio != *arrays)
+ nfs_readdir_folio_unlock_and_put(folio);
+ folio = new;
+ }
+ desc->folio_index_max++;
+ status = nfs_readdir_folio_array_append(folio, entry, &cookie);
+ } while (!status && !entry->eof);
+
+ switch (status) {
+ case -EBADCOOKIE:
+ if (!entry->eof)
+ break;
+ nfs_readdir_folio_set_eof(folio);
+ fallthrough;
+ case -EAGAIN:
+ status = 0;
+ break;
+ case -ENOSPC:
+ status = 0;
+ if (!desc->plus)
+ break;
+ while (!nfs_readdir_entry_decode(desc, entry, &stream))
+ ;
+ }
+
+ if (folio != *arrays)
+ nfs_readdir_folio_unlock_and_put(folio);
+
+ put_page(scratch);
+ return status;
+}
+
+static void nfs_readdir_free_pages(struct page **pages, size_t npages)
+{
+ while (npages--)
+ put_page(pages[npages]);
+ kfree(pages);
+}
+
+/*
+ * nfs_readdir_alloc_pages() will allocate pages that must be freed with a call
+ * to nfs_readdir_free_pages()
+ */
+static struct page **nfs_readdir_alloc_pages(size_t npages)
+{
+ struct page **pages;
+ size_t i;
+
+ pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ return NULL;
+ for (i = 0; i < npages; i++) {
+ struct page *page = alloc_page(GFP_KERNEL);
+ if (page == NULL)
+ goto out_freepages;
+ pages[i] = page;
+ }
+ return pages;
+
+out_freepages:
+ nfs_readdir_free_pages(pages, i);
+ return NULL;
+}
+
+static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
+ __be32 *verf_arg, __be32 *verf_res,
+ struct folio **arrays, size_t narrays)
+{
+ u64 change_attr;
+ struct page **pages;
+ struct folio *folio = *arrays;
+ struct nfs_entry *entry;
+ size_t array_size;
+ struct inode *inode = file_inode(desc->file);
+ unsigned int dtsize = desc->dtsize;
+ unsigned int pglen;
+ int status = -ENOMEM;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+ entry->cookie = nfs_readdir_folio_last_cookie(folio);
+ entry->fh = nfs_alloc_fhandle();
+ entry->fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
+ entry->server = NFS_SERVER(inode);
+ if (entry->fh == NULL || entry->fattr == NULL)
+ goto out;
+
+ array_size = (dtsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ pages = nfs_readdir_alloc_pages(array_size);
+ if (!pages)
+ goto out;
+
+ change_attr = inode_peek_iversion_raw(inode);
+ status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, pages,
+ dtsize, verf_res);
+ if (status < 0)
+ goto free_pages;
+
+ pglen = status;
+ if (pglen != 0)
+ status = nfs_readdir_folio_filler(desc, entry, pages, pglen,
+ arrays, narrays, change_attr);
+ else
+ nfs_readdir_folio_set_eof(folio);
+ desc->buffer_fills++;
+
+free_pages:
+ nfs_readdir_free_pages(pages, array_size);
+out:
+ nfs_free_fattr(entry->fattr);
+ nfs_free_fhandle(entry->fh);
+ kfree(entry);
+ return status;
+}
+
+static void nfs_readdir_folio_put(struct nfs_readdir_descriptor *desc)
+{
+ folio_put(desc->folio);
+ desc->folio = NULL;
+}
+
+static void
+nfs_readdir_folio_unlock_and_put_cached(struct nfs_readdir_descriptor *desc)
+{
+ folio_unlock(desc->folio);
+ nfs_readdir_folio_put(desc);
+}
+
+static struct folio *
+nfs_readdir_folio_get_cached(struct nfs_readdir_descriptor *desc)
+{
+ struct address_space *mapping = desc->file->f_mapping;
+ u64 change_attr = inode_peek_iversion_raw(mapping->host);
+ u64 cookie = desc->last_cookie;
+ struct folio *folio;
+
+ folio = nfs_readdir_folio_get_locked(mapping, cookie, change_attr);
+ if (!folio)
+ return NULL;
+ if (desc->clear_cache && !nfs_readdir_folio_needs_filling(folio))
+ nfs_readdir_folio_reinit_array(folio, cookie, change_attr);
+ return folio;
+}
+
+/*
+ * Returns 0 if desc->dir_cookie was found on page desc->page_index
+ * and locks the page to prevent removal from the page cache.
+ */
+static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
+{
+ struct inode *inode = file_inode(desc->file);
+ struct nfs_inode *nfsi = NFS_I(inode);
+ __be32 verf[NFS_DIR_VERIFIER_SIZE];
+ int res;
+
+ desc->folio = nfs_readdir_folio_get_cached(desc);
+ if (!desc->folio)
+ return -ENOMEM;
+ if (nfs_readdir_folio_needs_filling(desc->folio)) {
+ /* Grow the dtsize if we had to go back for more pages */
+ if (desc->folio_index == desc->folio_index_max)
+ nfs_grow_dtsize(desc);
+ desc->folio_index_max = desc->folio_index;
+ trace_nfs_readdir_cache_fill(desc->file, nfsi->cookieverf,
+ desc->last_cookie,
+ desc->folio->index, desc->dtsize);
+ res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf,
+ &desc->folio, 1);
+ if (res < 0) {
+ nfs_readdir_folio_unlock_and_put_cached(desc);
+ trace_nfs_readdir_cache_fill_done(inode, res);
+ if (res == -EBADCOOKIE || res == -ENOTSYNC) {
+ invalidate_inode_pages2(desc->file->f_mapping);
+ nfs_readdir_rewind_search(desc);
+ trace_nfs_readdir_invalidate_cache_range(
+ inode, 0, MAX_LFS_FILESIZE);
+ return -EAGAIN;
+ }
+ return res;
+ }
+ /*
+ * Set the cookie verifier if the page cache was empty
+ */
+ if (desc->last_cookie == 0 &&
+ memcmp(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf))) {
+ memcpy(nfsi->cookieverf, verf,
+ sizeof(nfsi->cookieverf));
+ invalidate_inode_pages2_range(desc->file->f_mapping, 1,
+ -1);
+ trace_nfs_readdir_invalidate_cache_range(
+ inode, 1, MAX_LFS_FILESIZE);
+ }
+ desc->clear_cache = false;
+ }
+ res = nfs_readdir_search_array(desc);
+ if (res == 0)
+ return 0;
+ nfs_readdir_folio_unlock_and_put_cached(desc);
+ return res;
+}
+
+/* Search for desc->dir_cookie from the beginning of the page cache */
+static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
+{
+ int res;
+
+ do {
+ res = find_and_lock_cache_page(desc);
+ } while (res == -EAGAIN);
+ return res;
+}
+
+#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL)
+
+/*
+ * Once we've found the start of the dirent within a page: fill 'er up...
+ */
+static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
+ const __be32 *verf)
+{
+ struct file *file = desc->file;
+ struct nfs_cache_array *array;
+ unsigned int i;
+ bool first_emit = !desc->dir_cookie;
+
+ array = kmap_local_folio(desc->folio, 0);
+ for (i = desc->cache_entry_index; i < array->size; i++) {
+ struct nfs_cache_array_entry *ent;
+
+ /*
+ * nfs_readdir_handle_cache_misses return force clear at
+ * (cache_misses > NFS_READDIR_CACHE_MISS_THRESHOLD) for
+ * readdir heuristic, NFS_READDIR_CACHE_MISS_THRESHOLD + 1
+ * entries need be emitted here.
+ */
+ if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 2) {
+ desc->eob = true;
+ break;
+ }
+
+ ent = &array->array[i];
+ if (!dir_emit(desc->ctx, ent->name, ent->name_len,
+ nfs_compat_user_ino64(ent->ino), ent->d_type)) {
+ desc->eob = true;
+ break;
+ }
+ memcpy(desc->verf, verf, sizeof(desc->verf));
+ if (i == array->size - 1) {
+ desc->dir_cookie = array->last_cookie;
+ nfs_readdir_seek_next_array(array, desc);
+ } else {
+ desc->dir_cookie = array->array[i + 1].cookie;
+ desc->last_cookie = array->array[0].cookie;
+ }
+ if (nfs_readdir_use_cookie(file))
+ desc->ctx->pos = desc->dir_cookie;
+ else
+ desc->ctx->pos++;
+ }
+ if (array->folio_is_eof)
+ desc->eof = !desc->eob;
+
+ kunmap_local(array);
+ dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n",
+ (unsigned long long)desc->dir_cookie);
+}
+
+/*
+ * If we cannot find a cookie in our cache, we suspect that this is
+ * because it points to a deleted file, so we ask the server to return
+ * whatever it thinks is the next entry. We then feed this to filldir.
+ * If all goes well, we should then be able to find our way round the
+ * cache on the next call to readdir_search_pagecache();
+ *
+ * NOTE: we cannot add the anonymous page to the pagecache because
+ * the data it contains might not be page aligned. Besides,
+ * we should already have a complete representation of the
+ * directory in the page cache by the time we get here.
+ */
+static int uncached_readdir(struct nfs_readdir_descriptor *desc)
+{
+ struct folio **arrays;
+ size_t i, sz = 512;
+ __be32 verf[NFS_DIR_VERIFIER_SIZE];
+ int status = -ENOMEM;
+
+ dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %llu\n",
+ (unsigned long long)desc->dir_cookie);
+
+ arrays = kcalloc(sz, sizeof(*arrays), GFP_KERNEL);
+ if (!arrays)
+ goto out;
+ arrays[0] = nfs_readdir_folio_array_alloc(desc->dir_cookie, GFP_KERNEL);
+ if (!arrays[0])
+ goto out;
+
+ desc->folio_index = 0;
+ desc->cache_entry_index = 0;
+ desc->last_cookie = desc->dir_cookie;
+ desc->folio_index_max = 0;
+
+ trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie,
+ -1, desc->dtsize);
+
+ status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz);
+ if (status < 0) {
+ trace_nfs_readdir_uncached_done(file_inode(desc->file), status);
+ goto out_free;
+ }
+
+ for (i = 0; !desc->eob && i < sz && arrays[i]; i++) {
+ desc->folio = arrays[i];
+ nfs_do_filldir(desc, verf);
+ }
+ desc->folio = NULL;
+
+ /*
+ * Grow the dtsize if we have to go back for more pages,
+ * or shrink it if we're reading too many.
+ */
+ if (!desc->eof) {
+ if (!desc->eob)
+ nfs_grow_dtsize(desc);
+ else if (desc->buffer_fills == 1 &&
+ i < (desc->folio_index_max >> 1))
+ nfs_shrink_dtsize(desc);
+ }
+out_free:
+ for (i = 0; i < sz && arrays[i]; i++)
+ nfs_readdir_folio_array_free(arrays[i]);
+out:
+ if (!nfs_readdir_use_cookie(desc->file))
+ nfs_readdir_rewind_search(desc);
+ desc->folio_index_max = -1;
+ kfree(arrays);
+ dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
+ return status;
+}
+
+static bool nfs_readdir_handle_cache_misses(struct inode *inode,
+ struct nfs_readdir_descriptor *desc,
+ unsigned int cache_misses,
+ bool force_clear)
+{
+ if (desc->ctx->pos == 0 || !desc->plus)
+ return false;
+ if (cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD && !force_clear)
+ return false;
+ trace_nfs_readdir_force_readdirplus(inode);
+ return true;
+}
+
+/* The file offset position represents the dirent entry number. A
+ last cookie cache takes care of the common case of reading the
+ whole directory.
+ */
+static int nfs_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct dentry *dentry = file_dentry(file);
+ struct inode *inode = d_inode(dentry);
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_open_dir_context *dir_ctx = file->private_data;
+ struct nfs_readdir_descriptor *desc;
+ unsigned int cache_hits, cache_misses;
+ bool force_clear;
+ int res;
+
+ dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
+ file, (long long)ctx->pos);
+ nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
+
+ /*
+ * ctx->pos points to the dirent entry number.
+ * *desc->dir_cookie has the cookie for the next entry. We have
+ * to either find the entry with the appropriate number or
+ * revalidate the cookie.
+ */
+ nfs_revalidate_mapping(inode, file->f_mapping);
+
+ res = -ENOMEM;
+ desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+ if (!desc)
+ goto out;
+ desc->file = file;
+ desc->ctx = ctx;
+ desc->folio_index_max = -1;
+
+ spin_lock(&file->f_lock);
+ desc->dir_cookie = dir_ctx->dir_cookie;
+ desc->folio_index = dir_ctx->page_index;
+ desc->last_cookie = dir_ctx->last_cookie;
+ desc->attr_gencount = dir_ctx->attr_gencount;
+ desc->eof = dir_ctx->eof;
+ nfs_set_dtsize(desc, dir_ctx->dtsize);
+ memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf));
+ cache_hits = atomic_xchg(&dir_ctx->cache_hits, 0);
+ cache_misses = atomic_xchg(&dir_ctx->cache_misses, 0);
+ force_clear = dir_ctx->force_clear;
+ spin_unlock(&file->f_lock);
+
+ if (desc->eof) {
+ res = 0;
+ goto out_free;
+ }
+
+ desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses);
+ force_clear = nfs_readdir_handle_cache_misses(inode, desc, cache_misses,
+ force_clear);
+ desc->clear_cache = force_clear;
+
+ do {
+ res = readdir_search_pagecache(desc);
+
+ if (res == -EBADCOOKIE) {
+ res = 0;
+ /* This means either end of directory */
+ if (desc->dir_cookie && !desc->eof) {
+ /* Or that the server has 'lost' a cookie */
+ res = uncached_readdir(desc);
+ if (res == 0)
+ continue;
+ if (res == -EBADCOOKIE || res == -ENOTSYNC)
+ res = 0;
+ }
+ break;
+ }
+ if (res == -ETOOSMALL && desc->plus) {
+ nfs_zap_caches(inode);
+ desc->plus = false;
+ desc->eof = false;
+ continue;
+ }
+ if (res < 0)
+ break;
+
+ nfs_do_filldir(desc, nfsi->cookieverf);
+ nfs_readdir_folio_unlock_and_put_cached(desc);
+ if (desc->folio_index == desc->folio_index_max)
+ desc->clear_cache = force_clear;
+ } while (!desc->eob && !desc->eof);
+
+ spin_lock(&file->f_lock);
+ dir_ctx->dir_cookie = desc->dir_cookie;
+ dir_ctx->last_cookie = desc->last_cookie;
+ dir_ctx->attr_gencount = desc->attr_gencount;
+ dir_ctx->page_index = desc->folio_index;
+ dir_ctx->force_clear = force_clear;
+ dir_ctx->eof = desc->eof;
+ dir_ctx->dtsize = desc->dtsize;
+ memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf));
+ spin_unlock(&file->f_lock);
+out_free:
+ kfree(desc);
+
+out:
+ dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res);
+ return res;
+}
+
+static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
+{
+ struct nfs_open_dir_context *dir_ctx = filp->private_data;
+
+ dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n",
+ filp, offset, whence);
+
+ switch (whence) {
+ default:
+ return -EINVAL;
+ case SEEK_SET:
+ if (offset < 0)
+ return -EINVAL;
+ spin_lock(&filp->f_lock);
+ break;
+ case SEEK_CUR:
+ if (offset == 0)
+ return filp->f_pos;
+ spin_lock(&filp->f_lock);
+ offset += filp->f_pos;
+ if (offset < 0) {
+ spin_unlock(&filp->f_lock);
+ return -EINVAL;
+ }
+ }
+ if (offset != filp->f_pos) {
+ filp->f_pos = offset;
+ dir_ctx->page_index = 0;
+ if (!nfs_readdir_use_cookie(filp)) {
+ dir_ctx->dir_cookie = 0;
+ dir_ctx->last_cookie = 0;
+ } else {
+ dir_ctx->dir_cookie = offset;
+ dir_ctx->last_cookie = offset;
+ }
+ dir_ctx->eof = false;
+ }
+ spin_unlock(&filp->f_lock);
+ return offset;
+}
+
+/*
+ * All directory operations under NFS are synchronous, so fsync()
+ * is a dummy operation.
+ */
+static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
+ int datasync)
+{
+ dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync);
+
+ nfs_inc_stats(file_inode(filp), NFSIOS_VFSFSYNC);
+ return 0;
+}
+
+/**
+ * nfs_force_lookup_revalidate - Mark the directory as having changed
+ * @dir: pointer to directory inode
+ *
+ * This forces the revalidation code in nfs_lookup_revalidate() to do a
+ * full lookup on all child dentries of 'dir' whenever a change occurs
+ * on the server that might have invalidated our dcache.
+ *
+ * Note that we reserve bit '0' as a tag to let us know when a dentry
+ * was revalidated while holding a delegation on its inode.
+ *
+ * The caller should be holding dir->i_lock
+ */
+void nfs_force_lookup_revalidate(struct inode *dir)
+{
+ NFS_I(dir)->cache_change_attribute += 2;
+}
+EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
+
+/**
+ * nfs_verify_change_attribute - Detects NFS remote directory changes
+ * @dir: pointer to parent directory inode
+ * @verf: previously saved change attribute
+ *
+ * Return "false" if the verifiers doesn't match the change attribute.
+ * This would usually indicate that the directory contents have changed on
+ * the server, and that any dentries need revalidating.
+ */
+static bool nfs_verify_change_attribute(struct inode *dir, unsigned long verf)
+{
+ return (verf & ~1UL) == nfs_save_change_attribute(dir);
+}
+
+static void nfs_set_verifier_delegated(unsigned long *verf)
+{
+ *verf |= 1UL;
+}
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+static void nfs_unset_verifier_delegated(unsigned long *verf)
+{
+ *verf &= ~1UL;
+}
+#endif /* IS_ENABLED(CONFIG_NFS_V4) */
+
+static bool nfs_test_verifier_delegated(unsigned long verf)
+{
+ return verf & 1;
+}
+
+static bool nfs_verifier_is_delegated(struct dentry *dentry)
+{
+ return nfs_test_verifier_delegated(dentry->d_time);
+}
+
+static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf)
+{
+ struct inode *inode = d_inode(dentry);
+ struct inode *dir = d_inode(dentry->d_parent);
+
+ if (!nfs_verify_change_attribute(dir, verf))
+ return;
+ if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ nfs_set_verifier_delegated(&verf);
+ dentry->d_time = verf;
+}
+
+/**
+ * nfs_set_verifier - save a parent directory verifier in the dentry
+ * @dentry: pointer to dentry
+ * @verf: verifier to save
+ *
+ * Saves the parent directory verifier in @dentry. If the inode has
+ * a delegation, we also tag the dentry as having been revalidated
+ * while holding a delegation so that we know we don't have to
+ * look it up again after a directory change.
+ */
+void nfs_set_verifier(struct dentry *dentry, unsigned long verf)
+{
+
+ spin_lock(&dentry->d_lock);
+ nfs_set_verifier_locked(dentry, verf);
+ spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_set_verifier);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+/**
+ * nfs_clear_verifier_delegated - clear the dir verifier delegation tag
+ * @inode: pointer to inode
+ *
+ * Iterates through the dentries in the inode alias list and clears
+ * the tag used to indicate that the dentry has been revalidated
+ * while holding a delegation.
+ * This function is intended for use when the delegation is being
+ * returned or revoked.
+ */
+void nfs_clear_verifier_delegated(struct inode *inode)
+{
+ struct dentry *alias;
+
+ if (!inode)
+ return;
+ spin_lock(&inode->i_lock);
+ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+ spin_lock(&alias->d_lock);
+ nfs_unset_verifier_delegated(&alias->d_time);
+ spin_unlock(&alias->d_lock);
+ }
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_clear_verifier_delegated);
+#endif /* IS_ENABLED(CONFIG_NFS_V4) */
+
+static int nfs_dentry_verify_change(struct inode *dir, struct dentry *dentry)
+{
+ if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE) &&
+ d_really_is_negative(dentry))
+ return dentry->d_time == inode_peek_iversion_raw(dir);
+ return nfs_verify_change_attribute(dir, dentry->d_time);
+}
+
+/*
+ * A check for whether or not the parent directory has changed.
+ * In the case it has, we assume that the dentries are untrustworthy
+ * and may need to be looked up again.
+ * If rcu_walk prevents us from performing a full check, return 0.
+ */
+static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
+ int rcu_walk)
+{
+ if (IS_ROOT(dentry))
+ return 1;
+ if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
+ return 0;
+ if (!nfs_dentry_verify_change(dir, dentry))
+ return 0;
+ /* Revalidate nfsi->cache_change_attribute before we declare a match */
+ if (nfs_mapping_need_revalidate_inode(dir)) {
+ if (rcu_walk)
+ return 0;
+ if (__nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
+ return 0;
+ }
+ if (!nfs_dentry_verify_change(dir, dentry))
+ return 0;
+ return 1;
+}
+
+/*
+ * Use intent information to check whether or not we're going to do
+ * an O_EXCL create using this path component.
+ */
+static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags)
+{
+ if (NFS_PROTO(dir)->version == 2)
+ return 0;
+ return flags & LOOKUP_EXCL;
+}
+
+/*
+ * Inode and filehandle revalidation for lookups.
+ *
+ * We force revalidation in the cases where the VFS sets LOOKUP_REVAL,
+ * or if the intent information indicates that we're about to open this
+ * particular file and the "nocto" mount flag is not set.
+ *
+ */
+static
+int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ int ret;
+
+ if (IS_AUTOMOUNT(inode))
+ return 0;
+
+ if (flags & LOOKUP_OPEN) {
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ /* A NFSv4 OPEN will revalidate later */
+ if (server->caps & NFS_CAP_ATOMIC_OPEN)
+ goto out;
+ fallthrough;
+ case S_IFDIR:
+ if (server->flags & NFS_MOUNT_NOCTO)
+ break;
+ /* NFS close-to-open cache consistency validation */
+ goto out_force;
+ }
+ }
+
+ /* VFS wants an on-the-wire revalidation */
+ if (flags & LOOKUP_REVAL)
+ goto out_force;
+out:
+ if (inode->i_nlink > 0 ||
+ (inode->i_nlink == 0 &&
+ test_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(inode)->flags)))
+ return 0;
+ else
+ return -ESTALE;
+out_force:
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ ret = __nfs_revalidate_inode(server, inode);
+ if (ret != 0)
+ return ret;
+ goto out;
+}
+
+static void nfs_mark_dir_for_revalidate(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE);
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * We judge how long we want to trust negative
+ * dentries by looking at the parent inode mtime.
+ *
+ * If parent mtime has changed, we revalidate, else we wait for a
+ * period corresponding to the parent's attribute cache timeout value.
+ *
+ * If LOOKUP_RCU prevents us from performing a full check, return 1
+ * suggesting a reval is needed.
+ *
+ * Note that when creating a new file, or looking up a rename target,
+ * then it shouldn't be necessary to revalidate a negative dentry.
+ */
+static inline
+int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+ return 0;
+ if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
+ return 1;
+ /* Case insensitive server? Revalidate negative dentries */
+ if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE))
+ return 1;
+ return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU);
+}
+
+static int
+nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
+ struct inode *inode, int error)
+{
+ switch (error) {
+ case 1:
+ break;
+ case 0:
+ /*
+ * We can't d_drop the root of a disconnected tree:
+ * its d_hash is on the s_anon list and d_drop() would hide
+ * it from shrink_dcache_for_unmount(), leading to busy
+ * inodes on unmount and further oopses.
+ */
+ if (inode && IS_ROOT(dentry))
+ error = 1;
+ break;
+ }
+ trace_nfs_lookup_revalidate_exit(dir, dentry, 0, error);
+ return error;
+}
+
+static int
+nfs_lookup_revalidate_negative(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ int ret = 1;
+ if (nfs_neg_need_reval(dir, dentry, flags)) {
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ ret = 0;
+ }
+ return nfs_lookup_revalidate_done(dir, dentry, NULL, ret);
+}
+
+static int
+nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry,
+ struct inode *inode)
+{
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
+}
+
+static int nfs_lookup_revalidate_dentry(struct inode *dir,
+ struct dentry *dentry,
+ struct inode *inode, unsigned int flags)
+{
+ struct nfs_fh *fhandle;
+ struct nfs_fattr *fattr;
+ unsigned long dir_verifier;
+ int ret;
+
+ trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
+
+ ret = -ENOMEM;
+ fhandle = nfs_alloc_fhandle();
+ fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
+ if (fhandle == NULL || fattr == NULL)
+ goto out;
+
+ dir_verifier = nfs_save_change_attribute(dir);
+ ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr);
+ if (ret < 0) {
+ switch (ret) {
+ case -ESTALE:
+ case -ENOENT:
+ ret = 0;
+ break;
+ case -ETIMEDOUT:
+ if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)
+ ret = 1;
+ }
+ goto out;
+ }
+
+ /* Request help from readdirplus */
+ nfs_lookup_advise_force_readdirplus(dir, flags);
+
+ ret = 0;
+ if (nfs_compare_fh(NFS_FH(inode), fhandle))
+ goto out;
+ if (nfs_refresh_inode(inode, fattr) < 0)
+ goto out;
+
+ nfs_setsecurity(inode, fattr);
+ nfs_set_verifier(dentry, dir_verifier);
+
+ ret = 1;
+out:
+ nfs_free_fattr(fattr);
+ nfs_free_fhandle(fhandle);
+
+ /*
+ * If the lookup failed despite the dentry change attribute being
+ * a match, then we should revalidate the directory cache.
+ */
+ if (!ret && nfs_dentry_verify_change(dir, dentry))
+ nfs_mark_dir_for_revalidate(dir);
+ return nfs_lookup_revalidate_done(dir, dentry, inode, ret);
+}
+
+/*
+ * This is called every time the dcache has a lookup hit,
+ * and we should check whether we can really trust that
+ * lookup.
+ *
+ * NOTE! The hit can be a negative hit too, don't assume
+ * we have an inode!
+ *
+ * If the parent directory is seen to have changed, we throw out the
+ * cached dentry and do a new lookup.
+ */
+static int
+nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct inode *inode;
+ int error;
+
+ nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
+ inode = d_inode(dentry);
+
+ if (!inode)
+ return nfs_lookup_revalidate_negative(dir, dentry, flags);
+
+ if (is_bad_inode(inode)) {
+ dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
+ __func__, dentry);
+ goto out_bad;
+ }
+
+ if ((flags & LOOKUP_RENAME_TARGET) && d_count(dentry) < 2 &&
+ nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE))
+ goto out_bad;
+
+ if (nfs_verifier_is_delegated(dentry))
+ return nfs_lookup_revalidate_delegated(dir, dentry, inode);
+
+ /* Force a full look up iff the parent directory has changed */
+ if (!(flags & (LOOKUP_EXCL | LOOKUP_REVAL)) &&
+ nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
+ error = nfs_lookup_verify_inode(inode, flags);
+ if (error) {
+ if (error == -ESTALE)
+ nfs_mark_dir_for_revalidate(dir);
+ goto out_bad;
+ }
+ goto out_valid;
+ }
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ if (NFS_STALE(inode))
+ goto out_bad;
+
+ return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags);
+out_valid:
+ return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
+out_bad:
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ return nfs_lookup_revalidate_done(dir, dentry, inode, 0);
+}
+
+static int
+__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
+ int (*reval)(struct inode *, struct dentry *, unsigned int))
+{
+ struct dentry *parent;
+ struct inode *dir;
+ int ret;
+
+ if (flags & LOOKUP_RCU) {
+ if (dentry->d_fsdata == NFS_FSDATA_BLOCKED)
+ return -ECHILD;
+ parent = READ_ONCE(dentry->d_parent);
+ dir = d_inode_rcu(parent);
+ if (!dir)
+ return -ECHILD;
+ ret = reval(dir, dentry, flags);
+ if (parent != READ_ONCE(dentry->d_parent))
+ return -ECHILD;
+ } else {
+ /* Wait for unlink to complete */
+ wait_var_event(&dentry->d_fsdata,
+ dentry->d_fsdata != NFS_FSDATA_BLOCKED);
+ parent = dget_parent(dentry);
+ ret = reval(d_inode(parent), dentry, flags);
+ dput(parent);
+ }
+ return ret;
+}
+
+static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate);
+}
+
+/*
+ * A weaker form of d_revalidate for revalidating just the d_inode(dentry)
+ * when we don't really care about the dentry name. This is called when a
+ * pathwalk ends on a dentry that was not found via a normal lookup in the
+ * parent dir (e.g.: ".", "..", procfs symlinks or mountpoint traversals).
+ *
+ * In this situation, we just want to verify that the inode itself is OK
+ * since the dentry might have changed on the server.
+ */
+static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct inode *inode = d_inode(dentry);
+ int error = 0;
+
+ /*
+ * I believe we can only get a negative dentry here in the case of a
+ * procfs-style symlink. Just assume it's correct for now, but we may
+ * eventually need to do something more here.
+ */
+ if (!inode) {
+ dfprintk(LOOKUPCACHE, "%s: %pd2 has negative inode\n",
+ __func__, dentry);
+ return 1;
+ }
+
+ if (is_bad_inode(inode)) {
+ dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
+ __func__, dentry);
+ return 0;
+ }
+
+ error = nfs_lookup_verify_inode(inode, flags);
+ dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n",
+ __func__, inode->i_ino, error ? "invalid" : "valid");
+ return !error;
+}
+
+/*
+ * This is called from dput() when d_count is going to 0.
+ */
+static int nfs_dentry_delete(const struct dentry *dentry)
+{
+ dfprintk(VFS, "NFS: dentry_delete(%pd2, %x)\n",
+ dentry, dentry->d_flags);
+
+ /* Unhash any dentry with a stale inode */
+ if (d_really_is_positive(dentry) && NFS_STALE(d_inode(dentry)))
+ return 1;
+
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+ /* Unhash it, so that ->d_iput() would be called */
+ return 1;
+ }
+ if (!(dentry->d_sb->s_flags & SB_ACTIVE)) {
+ /* Unhash it, so that ancestors of killed async unlink
+ * files will be cleaned up during umount */
+ return 1;
+ }
+ return 0;
+
+}
+
+/* Ensure that we revalidate inode->i_nlink */
+static void nfs_drop_nlink(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ /* drop the inode if we're reasonably sure this is the last link */
+ if (inode->i_nlink > 0)
+ drop_nlink(inode);
+ NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter();
+ nfs_set_cache_invalid(
+ inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME |
+ NFS_INO_INVALID_NLINK);
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Called when the dentry loses inode.
+ * We use it to clean up silly-renamed files.
+ */
+static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
+{
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+ nfs_complete_unlink(dentry, inode);
+ nfs_drop_nlink(inode);
+ }
+ iput(inode);
+}
+
+static void nfs_d_release(struct dentry *dentry)
+{
+ /* free cached devname value, if it survived that far */
+ if (unlikely(dentry->d_fsdata)) {
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+ WARN_ON(1);
+ else
+ kfree(dentry->d_fsdata);
+ }
+}
+
+const struct dentry_operations nfs_dentry_operations = {
+ .d_revalidate = nfs_lookup_revalidate,
+ .d_weak_revalidate = nfs_weak_revalidate,
+ .d_delete = nfs_dentry_delete,
+ .d_iput = nfs_dentry_iput,
+ .d_automount = nfs_d_automount,
+ .d_release = nfs_d_release,
+};
+EXPORT_SYMBOL_GPL(nfs_dentry_operations);
+
+struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
+{
+ struct dentry *res;
+ struct inode *inode = NULL;
+ struct nfs_fh *fhandle = NULL;
+ struct nfs_fattr *fattr = NULL;
+ unsigned long dir_verifier;
+ int error;
+
+ dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry);
+ nfs_inc_stats(dir, NFSIOS_VFSLOOKUP);
+
+ if (unlikely(dentry->d_name.len > NFS_SERVER(dir)->namelen))
+ return ERR_PTR(-ENAMETOOLONG);
+
+ /*
+ * If we're doing an exclusive create, optimize away the lookup
+ * but don't hash the dentry.
+ */
+ if (nfs_is_exclusive_create(dir, flags) || flags & LOOKUP_RENAME_TARGET)
+ return NULL;
+
+ res = ERR_PTR(-ENOMEM);
+ fhandle = nfs_alloc_fhandle();
+ fattr = nfs_alloc_fattr_with_label(NFS_SERVER(dir));
+ if (fhandle == NULL || fattr == NULL)
+ goto out;
+
+ dir_verifier = nfs_save_change_attribute(dir);
+ trace_nfs_lookup_enter(dir, dentry, flags);
+ error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr);
+ if (error == -ENOENT) {
+ if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE))
+ dir_verifier = inode_peek_iversion_raw(dir);
+ goto no_entry;
+ }
+ if (error < 0) {
+ res = ERR_PTR(error);
+ goto out;
+ }
+ inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
+ res = ERR_CAST(inode);
+ if (IS_ERR(res))
+ goto out;
+
+ /* Notify readdir to use READDIRPLUS */
+ nfs_lookup_advise_force_readdirplus(dir, flags);
+
+no_entry:
+ res = d_splice_alias(inode, dentry);
+ if (res != NULL) {
+ if (IS_ERR(res))
+ goto out;
+ dentry = res;
+ }
+ nfs_set_verifier(dentry, dir_verifier);
+out:
+ trace_nfs_lookup_exit(dir, dentry, flags, PTR_ERR_OR_ZERO(res));
+ nfs_free_fattr(fattr);
+ nfs_free_fhandle(fhandle);
+ return res;
+}
+EXPORT_SYMBOL_GPL(nfs_lookup);
+
+void nfs_d_prune_case_insensitive_aliases(struct inode *inode)
+{
+ /* Case insensitive server? Revalidate dentries */
+ if (inode && nfs_server_capable(inode, NFS_CAP_CASE_INSENSITIVE))
+ d_prune_aliases(inode);
+}
+EXPORT_SYMBOL_GPL(nfs_d_prune_case_insensitive_aliases);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+static int nfs4_lookup_revalidate(struct dentry *, unsigned int);
+
+const struct dentry_operations nfs4_dentry_operations = {
+ .d_revalidate = nfs4_lookup_revalidate,
+ .d_weak_revalidate = nfs_weak_revalidate,
+ .d_delete = nfs_dentry_delete,
+ .d_iput = nfs_dentry_iput,
+ .d_automount = nfs_d_automount,
+ .d_release = nfs_d_release,
+};
+EXPORT_SYMBOL_GPL(nfs4_dentry_operations);
+
+static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags, struct file *filp)
+{
+ return alloc_nfs_open_context(dentry, flags_to_mode(open_flags), filp);
+}
+
+static int do_open(struct inode *inode, struct file *filp)
+{
+ nfs_fscache_open_file(inode, filp);
+ return 0;
+}
+
+static int nfs_finish_open(struct nfs_open_context *ctx,
+ struct dentry *dentry,
+ struct file *file, unsigned open_flags)
+{
+ int err;
+
+ err = finish_open(file, dentry, do_open);
+ if (err)
+ goto out;
+ if (S_ISREG(file_inode(file)->i_mode))
+ nfs_file_set_open_context(file, ctx);
+ else
+ err = -EOPENSTALE;
+out:
+ return err;
+}
+
+int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned open_flags,
+ umode_t mode)
+{
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ struct nfs_open_context *ctx;
+ struct dentry *res;
+ struct iattr attr = { .ia_valid = ATTR_OPEN };
+ struct inode *inode;
+ unsigned int lookup_flags = 0;
+ unsigned long dir_verifier;
+ bool switched = false;
+ int created = 0;
+ int err;
+
+ /* Expect a negative dentry */
+ BUG_ON(d_inode(dentry));
+
+ dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n",
+ dir->i_sb->s_id, dir->i_ino, dentry);
+
+ err = nfs_check_flags(open_flags);
+ if (err)
+ return err;
+
+ /* NFS only supports OPEN on regular files */
+ if ((open_flags & O_DIRECTORY)) {
+ if (!d_in_lookup(dentry)) {
+ /*
+ * Hashed negative dentry with O_DIRECTORY: dentry was
+ * revalidated and is fine, no need to perform lookup
+ * again
+ */
+ return -ENOENT;
+ }
+ lookup_flags = LOOKUP_OPEN|LOOKUP_DIRECTORY;
+ goto no_open;
+ }
+
+ if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
+ return -ENAMETOOLONG;
+
+ if (open_flags & O_CREAT) {
+ struct nfs_server *server = NFS_SERVER(dir);
+
+ if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
+ mode &= ~current_umask();
+
+ attr.ia_valid |= ATTR_MODE;
+ attr.ia_mode = mode;
+ }
+ if (open_flags & O_TRUNC) {
+ attr.ia_valid |= ATTR_SIZE;
+ attr.ia_size = 0;
+ }
+
+ if (!(open_flags & O_CREAT) && !d_in_lookup(dentry)) {
+ d_drop(dentry);
+ switched = true;
+ dentry = d_alloc_parallel(dentry->d_parent,
+ &dentry->d_name, &wq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ if (unlikely(!d_in_lookup(dentry)))
+ return finish_no_open(file, dentry);
+ }
+
+ ctx = create_nfs_open_context(dentry, open_flags, file);
+ err = PTR_ERR(ctx);
+ if (IS_ERR(ctx))
+ goto out;
+
+ trace_nfs_atomic_open_enter(dir, ctx, open_flags);
+ inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, &created);
+ if (created)
+ file->f_mode |= FMODE_CREATED;
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
+ put_nfs_open_context(ctx);
+ d_drop(dentry);
+ switch (err) {
+ case -ENOENT:
+ d_splice_alias(NULL, dentry);
+ if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE))
+ dir_verifier = inode_peek_iversion_raw(dir);
+ else
+ dir_verifier = nfs_save_change_attribute(dir);
+ nfs_set_verifier(dentry, dir_verifier);
+ break;
+ case -EISDIR:
+ case -ENOTDIR:
+ goto no_open;
+ case -ELOOP:
+ if (!(open_flags & O_NOFOLLOW))
+ goto no_open;
+ break;
+ /* case -EINVAL: */
+ default:
+ break;
+ }
+ goto out;
+ }
+ file->f_mode |= FMODE_CAN_ODIRECT;
+
+ err = nfs_finish_open(ctx, ctx->dentry, file, open_flags);
+ trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
+ put_nfs_open_context(ctx);
+out:
+ if (unlikely(switched)) {
+ d_lookup_done(dentry);
+ dput(dentry);
+ }
+ return err;
+
+no_open:
+ res = nfs_lookup(dir, dentry, lookup_flags);
+ if (!res) {
+ inode = d_inode(dentry);
+ if ((lookup_flags & LOOKUP_DIRECTORY) && inode &&
+ !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)))
+ res = ERR_PTR(-ENOTDIR);
+ else if (inode && S_ISREG(inode->i_mode))
+ res = ERR_PTR(-EOPENSTALE);
+ } else if (!IS_ERR(res)) {
+ inode = d_inode(res);
+ if ((lookup_flags & LOOKUP_DIRECTORY) && inode &&
+ !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) {
+ dput(res);
+ res = ERR_PTR(-ENOTDIR);
+ } else if (inode && S_ISREG(inode->i_mode)) {
+ dput(res);
+ res = ERR_PTR(-EOPENSTALE);
+ }
+ }
+ if (switched) {
+ d_lookup_done(dentry);
+ if (!res)
+ res = dentry;
+ else
+ dput(dentry);
+ }
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+ return finish_no_open(file, res);
+}
+EXPORT_SYMBOL_GPL(nfs_atomic_open);
+
+static int
+nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct inode *inode;
+
+ if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
+ goto full_reval;
+ if (d_mountpoint(dentry))
+ goto full_reval;
+
+ inode = d_inode(dentry);
+
+ /* We can't create new files in nfs_open_revalidate(), so we
+ * optimize away revalidation of negative dentries.
+ */
+ if (inode == NULL)
+ goto full_reval;
+
+ if (nfs_verifier_is_delegated(dentry))
+ return nfs_lookup_revalidate_delegated(dir, dentry, inode);
+
+ /* NFS only supports OPEN on regular files */
+ if (!S_ISREG(inode->i_mode))
+ goto full_reval;
+
+ /* We cannot do exclusive creation on a positive dentry */
+ if (flags & (LOOKUP_EXCL | LOOKUP_REVAL))
+ goto reval_dentry;
+
+ /* Check if the directory changed */
+ if (!nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU))
+ goto reval_dentry;
+
+ /* Let f_op->open() actually open (and revalidate) the file */
+ return 1;
+reval_dentry:
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags);
+
+full_reval:
+ return nfs_do_lookup_revalidate(dir, dentry, flags);
+}
+
+static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return __nfs_lookup_revalidate(dentry, flags,
+ nfs4_do_lookup_revalidate);
+}
+
+#endif /* CONFIG_NFSV4 */
+
+struct dentry *
+nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr)
+{
+ struct dentry *parent = dget_parent(dentry);
+ struct inode *dir = d_inode(parent);
+ struct inode *inode;
+ struct dentry *d;
+ int error;
+
+ d_drop(dentry);
+
+ if (fhandle->size == 0) {
+ error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr);
+ if (error)
+ goto out_error;
+ }
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ if (!(fattr->valid & NFS_ATTR_FATTR)) {
+ struct nfs_server *server = NFS_SB(dentry->d_sb);
+ error = server->nfs_client->rpc_ops->getattr(server, fhandle,
+ fattr, NULL);
+ if (error < 0)
+ goto out_error;
+ }
+ inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
+ d = d_splice_alias(inode, dentry);
+out:
+ dput(parent);
+ return d;
+out_error:
+ d = ERR_PTR(error);
+ goto out;
+}
+EXPORT_SYMBOL_GPL(nfs_add_or_obtain);
+
+/*
+ * Code common to create, mkdir, and mknod.
+ */
+int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr)
+{
+ struct dentry *d;
+
+ d = nfs_add_or_obtain(dentry, fhandle, fattr);
+ if (IS_ERR(d))
+ return PTR_ERR(d);
+
+ /* Callers don't care */
+ dput(d);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_instantiate);
+
+/*
+ * Following a failed create operation, we drop the dentry rather
+ * than retain a negative dentry. This avoids a problem in the event
+ * that the operation succeeded on the server, but an error in the
+ * reply path made it appear to have failed.
+ */
+int nfs_create(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
+{
+ struct iattr attr;
+ int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT;
+ int error;
+
+ dfprintk(VFS, "NFS: create(%s/%lu), %pd\n",
+ dir->i_sb->s_id, dir->i_ino, dentry);
+
+ attr.ia_mode = mode;
+ attr.ia_valid = ATTR_MODE;
+
+ trace_nfs_create_enter(dir, dentry, open_flags);
+ error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags);
+ trace_nfs_create_exit(dir, dentry, open_flags, error);
+ if (error != 0)
+ goto out_err;
+ return 0;
+out_err:
+ d_drop(dentry);
+ return error;
+}
+EXPORT_SYMBOL_GPL(nfs_create);
+
+/*
+ * See comments for nfs_proc_create regarding failed operations.
+ */
+int
+nfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t rdev)
+{
+ struct iattr attr;
+ int status;
+
+ dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n",
+ dir->i_sb->s_id, dir->i_ino, dentry);
+
+ attr.ia_mode = mode;
+ attr.ia_valid = ATTR_MODE;
+
+ trace_nfs_mknod_enter(dir, dentry);
+ status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
+ trace_nfs_mknod_exit(dir, dentry, status);
+ if (status != 0)
+ goto out_err;
+ return 0;
+out_err:
+ d_drop(dentry);
+ return status;
+}
+EXPORT_SYMBOL_GPL(nfs_mknod);
+
+/*
+ * See comments for nfs_proc_create regarding failed operations.
+ */
+int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
+{
+ struct iattr attr;
+ int error;
+
+ dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n",
+ dir->i_sb->s_id, dir->i_ino, dentry);
+
+ attr.ia_valid = ATTR_MODE;
+ attr.ia_mode = mode | S_IFDIR;
+
+ trace_nfs_mkdir_enter(dir, dentry);
+ error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
+ trace_nfs_mkdir_exit(dir, dentry, error);
+ if (error != 0)
+ goto out_err;
+ return 0;
+out_err:
+ d_drop(dentry);
+ return error;
+}
+EXPORT_SYMBOL_GPL(nfs_mkdir);
+
+static void nfs_dentry_handle_enoent(struct dentry *dentry)
+{
+ if (simple_positive(dentry))
+ d_delete(dentry);
+}
+
+static void nfs_dentry_remove_handle_error(struct inode *dir,
+ struct dentry *dentry, int error)
+{
+ switch (error) {
+ case -ENOENT:
+ if (d_really_is_positive(dentry))
+ d_delete(dentry);
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ break;
+ case 0:
+ nfs_d_prune_case_insensitive_aliases(d_inode(dentry));
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ }
+}
+
+int nfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ int error;
+
+ dfprintk(VFS, "NFS: rmdir(%s/%lu), %pd\n",
+ dir->i_sb->s_id, dir->i_ino, dentry);
+
+ trace_nfs_rmdir_enter(dir, dentry);
+ if (d_really_is_positive(dentry)) {
+ down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
+ error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
+ /* Ensure the VFS deletes this inode */
+ switch (error) {
+ case 0:
+ clear_nlink(d_inode(dentry));
+ break;
+ case -ENOENT:
+ nfs_dentry_handle_enoent(dentry);
+ }
+ up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
+ } else
+ error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
+ nfs_dentry_remove_handle_error(dir, dentry, error);
+ trace_nfs_rmdir_exit(dir, dentry, error);
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(nfs_rmdir);
+
+/*
+ * Remove a file after making sure there are no pending writes,
+ * and after checking that the file has only one user.
+ *
+ * We invalidate the attribute cache and free the inode prior to the operation
+ * to avoid possible races if the server reuses the inode.
+ */
+static int nfs_safe_remove(struct dentry *dentry)
+{
+ struct inode *dir = d_inode(dentry->d_parent);
+ struct inode *inode = d_inode(dentry);
+ int error = -EBUSY;
+
+ dfprintk(VFS, "NFS: safe_remove(%pd2)\n", dentry);
+
+ /* If the dentry was sillyrenamed, we simply call d_delete() */
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+ error = 0;
+ goto out;
+ }
+
+ trace_nfs_remove_enter(dir, dentry);
+ if (inode != NULL) {
+ error = NFS_PROTO(dir)->remove(dir, dentry);
+ if (error == 0)
+ nfs_drop_nlink(inode);
+ } else
+ error = NFS_PROTO(dir)->remove(dir, dentry);
+ if (error == -ENOENT)
+ nfs_dentry_handle_enoent(dentry);
+ trace_nfs_remove_exit(dir, dentry, error);
+out:
+ return error;
+}
+
+/* We do silly rename. In case sillyrename() returns -EBUSY, the inode
+ * belongs to an active ".nfs..." file and we return -EBUSY.
+ *
+ * If sillyrename() returns 0, we do nothing, otherwise we unlink.
+ */
+int nfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ int error;
+
+ dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id,
+ dir->i_ino, dentry);
+
+ trace_nfs_unlink_enter(dir, dentry);
+ spin_lock(&dentry->d_lock);
+ if (d_count(dentry) > 1 && !test_bit(NFS_INO_PRESERVE_UNLINKED,
+ &NFS_I(d_inode(dentry))->flags)) {
+ spin_unlock(&dentry->d_lock);
+ /* Start asynchronous writeout of the inode */
+ write_inode_now(d_inode(dentry), 0);
+ error = nfs_sillyrename(dir, dentry);
+ goto out;
+ }
+ /* We must prevent any concurrent open until the unlink
+ * completes. ->d_revalidate will wait for ->d_fsdata
+ * to clear. We set it here to ensure no lookup succeeds until
+ * the unlink is complete on the server.
+ */
+ error = -ETXTBSY;
+ if (WARN_ON(dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
+ WARN_ON(dentry->d_fsdata == NFS_FSDATA_BLOCKED)) {
+ spin_unlock(&dentry->d_lock);
+ goto out;
+ }
+ /* old devname */
+ kfree(dentry->d_fsdata);
+ dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+
+ spin_unlock(&dentry->d_lock);
+ error = nfs_safe_remove(dentry);
+ nfs_dentry_remove_handle_error(dir, dentry, error);
+ dentry->d_fsdata = NULL;
+ wake_up_var(&dentry->d_fsdata);
+out:
+ trace_nfs_unlink_exit(dir, dentry, error);
+ return error;
+}
+EXPORT_SYMBOL_GPL(nfs_unlink);
+
+/*
+ * To create a symbolic link, most file systems instantiate a new inode,
+ * add a page to it containing the path, then write it out to the disk
+ * using prepare_write/commit_write.
+ *
+ * Unfortunately the NFS client can't create the in-core inode first
+ * because it needs a file handle to create an in-core inode (see
+ * fs/nfs/inode.c:nfs_fhget). We only have a file handle *after* the
+ * symlink request has completed on the server.
+ *
+ * So instead we allocate a raw page, copy the symname into it, then do
+ * the SYMLINK request with the page as the buffer. If it succeeds, we
+ * now have a new file handle and can instantiate an in-core NFS inode
+ * and move the raw page into its mapping.
+ */
+int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, const char *symname)
+{
+ struct page *page;
+ char *kaddr;
+ struct iattr attr;
+ unsigned int pathlen = strlen(symname);
+ int error;
+
+ dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id,
+ dir->i_ino, dentry, symname);
+
+ if (pathlen > PAGE_SIZE)
+ return -ENAMETOOLONG;
+
+ attr.ia_mode = S_IFLNK | S_IRWXUGO;
+ attr.ia_valid = ATTR_MODE;
+
+ page = alloc_page(GFP_USER);
+ if (!page)
+ return -ENOMEM;
+
+ kaddr = page_address(page);
+ memcpy(kaddr, symname, pathlen);
+ if (pathlen < PAGE_SIZE)
+ memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
+
+ trace_nfs_symlink_enter(dir, dentry);
+ error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
+ trace_nfs_symlink_exit(dir, dentry, error);
+ if (error != 0) {
+ dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n",
+ dir->i_sb->s_id, dir->i_ino,
+ dentry, symname, error);
+ d_drop(dentry);
+ __free_page(page);
+ return error;
+ }
+
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+
+ /*
+ * No big deal if we can't add this page to the page cache here.
+ * READLINK will get the missing page from the server if needed.
+ */
+ if (!add_to_page_cache_lru(page, d_inode(dentry)->i_mapping, 0,
+ GFP_KERNEL)) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ /*
+ * add_to_page_cache_lru() grabs an extra page refcount.
+ * Drop it here to avoid leaking this page later.
+ */
+ put_page(page);
+ } else
+ __free_page(page);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_symlink);
+
+int
+nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(old_dentry);
+ int error;
+
+ dfprintk(VFS, "NFS: link(%pd2 -> %pd2)\n",
+ old_dentry, dentry);
+
+ trace_nfs_link_enter(inode, dir, dentry);
+ d_drop(dentry);
+ if (S_ISREG(inode->i_mode))
+ nfs_sync_inode(inode);
+ error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
+ if (error == 0) {
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ ihold(inode);
+ d_add(dentry, inode);
+ }
+ trace_nfs_link_exit(inode, dir, dentry, error);
+ return error;
+}
+EXPORT_SYMBOL_GPL(nfs_link);
+
+static void
+nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data)
+{
+ struct dentry *new_dentry = data->new_dentry;
+
+ new_dentry->d_fsdata = NULL;
+ wake_up_var(&new_dentry->d_fsdata);
+}
+
+/*
+ * RENAME
+ * FIXME: Some nfsds, like the Linux user space nfsd, may generate a
+ * different file handle for the same inode after a rename (e.g. when
+ * moving to a different directory). A fail-safe method to do so would
+ * be to look up old_dir/old_name, create a link to new_dir/new_name and
+ * rename the old file using the sillyrename stuff. This way, the original
+ * file in old_dir will go away when the last process iput()s the inode.
+ *
+ * FIXED.
+ *
+ * It actually works quite well. One needs to have the possibility for
+ * at least one ".nfs..." file in each directory the file ever gets
+ * moved or linked to which happens automagically with the new
+ * implementation that only depends on the dcache stuff instead of
+ * using the inode layer
+ *
+ * Unfortunately, things are a little more complicated than indicated
+ * above. For a cross-directory move, we want to make sure we can get
+ * rid of the old inode after the operation. This means there must be
+ * no pending writes (if it's a file), and the use count must be 1.
+ * If these conditions are met, we can drop the dentries before doing
+ * the rename.
+ */
+int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
+{
+ struct inode *old_inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
+ struct dentry *dentry = NULL;
+ struct rpc_task *task;
+ bool must_unblock = false;
+ int error = -EBUSY;
+
+ if (flags)
+ return -EINVAL;
+
+ dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n",
+ old_dentry, new_dentry,
+ d_count(new_dentry));
+
+ trace_nfs_rename_enter(old_dir, old_dentry, new_dir, new_dentry);
+ /*
+ * For non-directories, check whether the target is busy and if so,
+ * make a copy of the dentry and then do a silly-rename. If the
+ * silly-rename succeeds, the copied dentry is hashed and becomes
+ * the new target.
+ */
+ if (new_inode && !S_ISDIR(new_inode->i_mode)) {
+ /* We must prevent any concurrent open until the unlink
+ * completes. ->d_revalidate will wait for ->d_fsdata
+ * to clear. We set it here to ensure no lookup succeeds until
+ * the unlink is complete on the server.
+ */
+ error = -ETXTBSY;
+ if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
+ WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED))
+ goto out;
+ if (new_dentry->d_fsdata) {
+ /* old devname */
+ kfree(new_dentry->d_fsdata);
+ new_dentry->d_fsdata = NULL;
+ }
+
+ spin_lock(&new_dentry->d_lock);
+ if (d_count(new_dentry) > 2) {
+ int err;
+
+ spin_unlock(&new_dentry->d_lock);
+
+ /* copy the target dentry's name */
+ dentry = d_alloc(new_dentry->d_parent,
+ &new_dentry->d_name);
+ if (!dentry)
+ goto out;
+
+ /* silly-rename the existing target ... */
+ err = nfs_sillyrename(new_dir, new_dentry);
+ if (err)
+ goto out;
+
+ new_dentry = dentry;
+ new_inode = NULL;
+ } else {
+ new_dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+ must_unblock = true;
+ spin_unlock(&new_dentry->d_lock);
+ }
+
+ }
+
+ if (S_ISREG(old_inode->i_mode))
+ nfs_sync_inode(old_inode);
+ task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,
+ must_unblock ? nfs_unblock_rename : NULL);
+ if (IS_ERR(task)) {
+ error = PTR_ERR(task);
+ goto out;
+ }
+
+ error = rpc_wait_for_completion_task(task);
+ if (error != 0) {
+ ((struct nfs_renamedata *)task->tk_calldata)->cancelled = 1;
+ /* Paired with the atomic_dec_and_test() barrier in rpc_do_put_task() */
+ smp_wmb();
+ } else
+ error = task->tk_status;
+ rpc_put_task(task);
+ /* Ensure the inode attributes are revalidated */
+ if (error == 0) {
+ spin_lock(&old_inode->i_lock);
+ NFS_I(old_inode)->attr_gencount = nfs_inc_attr_generation_counter();
+ nfs_set_cache_invalid(old_inode, NFS_INO_INVALID_CHANGE |
+ NFS_INO_INVALID_CTIME |
+ NFS_INO_REVAL_FORCED);
+ spin_unlock(&old_inode->i_lock);
+ }
+out:
+ trace_nfs_rename_exit(old_dir, old_dentry,
+ new_dir, new_dentry, error);
+ if (!error) {
+ if (new_inode != NULL)
+ nfs_drop_nlink(new_inode);
+ /*
+ * The d_move() should be here instead of in an async RPC completion
+ * handler because we need the proper locks to move the dentry. If
+ * we're interrupted by a signal, the async RPC completion handler
+ * should mark the directories for revalidation.
+ */
+ d_move(old_dentry, new_dentry);
+ nfs_set_verifier(old_dentry,
+ nfs_save_change_attribute(new_dir));
+ } else if (error == -ENOENT)
+ nfs_dentry_handle_enoent(old_dentry);
+
+ /* new dentry created? */
+ if (dentry)
+ dput(dentry);
+ return error;
+}
+EXPORT_SYMBOL_GPL(nfs_rename);
+
+static DEFINE_SPINLOCK(nfs_access_lru_lock);
+static LIST_HEAD(nfs_access_lru_list);
+static atomic_long_t nfs_access_nr_entries;
+
+static unsigned long nfs_access_max_cachesize = 4*1024*1024;
+module_param(nfs_access_max_cachesize, ulong, 0644);
+MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length");
+
+static void nfs_access_free_entry(struct nfs_access_entry *entry)
+{
+ put_group_info(entry->group_info);
+ kfree_rcu(entry, rcu_head);
+ smp_mb__before_atomic();
+ atomic_long_dec(&nfs_access_nr_entries);
+ smp_mb__after_atomic();
+}
+
+static void nfs_access_free_list(struct list_head *head)
+{
+ struct nfs_access_entry *cache;
+
+ while (!list_empty(head)) {
+ cache = list_entry(head->next, struct nfs_access_entry, lru);
+ list_del(&cache->lru);
+ nfs_access_free_entry(cache);
+ }
+}
+
+static unsigned long
+nfs_do_access_cache_scan(unsigned int nr_to_scan)
+{
+ LIST_HEAD(head);
+ struct nfs_inode *nfsi, *next;
+ struct nfs_access_entry *cache;
+ long freed = 0;
+
+ spin_lock(&nfs_access_lru_lock);
+ list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
+ struct inode *inode;
+
+ if (nr_to_scan-- == 0)
+ break;
+ inode = &nfsi->vfs_inode;
+ spin_lock(&inode->i_lock);
+ if (list_empty(&nfsi->access_cache_entry_lru))
+ goto remove_lru_entry;
+ cache = list_entry(nfsi->access_cache_entry_lru.next,
+ struct nfs_access_entry, lru);
+ list_move(&cache->lru, &head);
+ rb_erase(&cache->rb_node, &nfsi->access_cache);
+ freed++;
+ if (!list_empty(&nfsi->access_cache_entry_lru))
+ list_move_tail(&nfsi->access_cache_inode_lru,
+ &nfs_access_lru_list);
+ else {
+remove_lru_entry:
+ list_del_init(&nfsi->access_cache_inode_lru);
+ smp_mb__before_atomic();
+ clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
+ smp_mb__after_atomic();
+ }
+ spin_unlock(&inode->i_lock);
+ }
+ spin_unlock(&nfs_access_lru_lock);
+ nfs_access_free_list(&head);
+ return freed;
+}
+
+unsigned long
+nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int nr_to_scan = sc->nr_to_scan;
+ gfp_t gfp_mask = sc->gfp_mask;
+
+ if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+ return SHRINK_STOP;
+ return nfs_do_access_cache_scan(nr_to_scan);
+}
+
+
+unsigned long
+nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries));
+}
+
+static void
+nfs_access_cache_enforce_limit(void)
+{
+ long nr_entries = atomic_long_read(&nfs_access_nr_entries);
+ unsigned long diff;
+ unsigned int nr_to_scan;
+
+ if (nr_entries < 0 || nr_entries <= nfs_access_max_cachesize)
+ return;
+ nr_to_scan = 100;
+ diff = nr_entries - nfs_access_max_cachesize;
+ if (diff < nr_to_scan)
+ nr_to_scan = diff;
+ nfs_do_access_cache_scan(nr_to_scan);
+}
+
+static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
+{
+ struct rb_root *root_node = &nfsi->access_cache;
+ struct rb_node *n;
+ struct nfs_access_entry *entry;
+
+ /* Unhook entries from the cache */
+ while ((n = rb_first(root_node)) != NULL) {
+ entry = rb_entry(n, struct nfs_access_entry, rb_node);
+ rb_erase(n, root_node);
+ list_move(&entry->lru, head);
+ }
+ nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
+}
+
+void nfs_access_zap_cache(struct inode *inode)
+{
+ LIST_HEAD(head);
+
+ if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0)
+ return;
+ /* Remove from global LRU init */
+ spin_lock(&nfs_access_lru_lock);
+ if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
+ list_del_init(&NFS_I(inode)->access_cache_inode_lru);
+
+ spin_lock(&inode->i_lock);
+ __nfs_access_zap_cache(NFS_I(inode), &head);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&nfs_access_lru_lock);
+ nfs_access_free_list(&head);
+}
+EXPORT_SYMBOL_GPL(nfs_access_zap_cache);
+
+static int access_cmp(const struct cred *a, const struct nfs_access_entry *b)
+{
+ struct group_info *ga, *gb;
+ int g;
+
+ if (uid_lt(a->fsuid, b->fsuid))
+ return -1;
+ if (uid_gt(a->fsuid, b->fsuid))
+ return 1;
+
+ if (gid_lt(a->fsgid, b->fsgid))
+ return -1;
+ if (gid_gt(a->fsgid, b->fsgid))
+ return 1;
+
+ ga = a->group_info;
+ gb = b->group_info;
+ if (ga == gb)
+ return 0;
+ if (ga == NULL)
+ return -1;
+ if (gb == NULL)
+ return 1;
+ if (ga->ngroups < gb->ngroups)
+ return -1;
+ if (ga->ngroups > gb->ngroups)
+ return 1;
+
+ for (g = 0; g < ga->ngroups; g++) {
+ if (gid_lt(ga->gid[g], gb->gid[g]))
+ return -1;
+ if (gid_gt(ga->gid[g], gb->gid[g]))
+ return 1;
+ }
+ return 0;
+}
+
+static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, const struct cred *cred)
+{
+ struct rb_node *n = NFS_I(inode)->access_cache.rb_node;
+
+ while (n != NULL) {
+ struct nfs_access_entry *entry =
+ rb_entry(n, struct nfs_access_entry, rb_node);
+ int cmp = access_cmp(cred, entry);
+
+ if (cmp < 0)
+ n = n->rb_left;
+ else if (cmp > 0)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static u64 nfs_access_login_time(const struct task_struct *task,
+ const struct cred *cred)
+{
+ const struct task_struct *parent;
+ const struct cred *pcred;
+ u64 ret;
+
+ rcu_read_lock();
+ for (;;) {
+ parent = rcu_dereference(task->real_parent);
+ pcred = __task_cred(parent);
+ if (parent == task || cred_fscmp(pcred, cred) != 0)
+ break;
+ task = parent;
+ }
+ ret = task->start_time;
+ rcu_read_unlock();
+ return ret;
+}
+
+static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ u64 login_time = nfs_access_login_time(current, cred);
+ struct nfs_access_entry *cache;
+ bool retry = true;
+ int err;
+
+ spin_lock(&inode->i_lock);
+ for(;;) {
+ if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+ goto out_zap;
+ cache = nfs_access_search_rbtree(inode, cred);
+ err = -ENOENT;
+ if (cache == NULL)
+ goto out;
+ /* Found an entry, is our attribute cache valid? */
+ if (!nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS))
+ break;
+ if (!retry)
+ break;
+ err = -ECHILD;
+ if (!may_block)
+ goto out;
+ spin_unlock(&inode->i_lock);
+ err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (err)
+ return err;
+ spin_lock(&inode->i_lock);
+ retry = false;
+ }
+ err = -ENOENT;
+ if ((s64)(login_time - cache->timestamp) > 0)
+ goto out;
+ *mask = cache->mask;
+ list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru);
+ err = 0;
+out:
+ spin_unlock(&inode->i_lock);
+ return err;
+out_zap:
+ spin_unlock(&inode->i_lock);
+ nfs_access_zap_cache(inode);
+ return -ENOENT;
+}
+
+static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cred, u32 *mask)
+{
+ /* Only check the most recently returned cache entry,
+ * but do it without locking.
+ */
+ struct nfs_inode *nfsi = NFS_I(inode);
+ u64 login_time = nfs_access_login_time(current, cred);
+ struct nfs_access_entry *cache;
+ int err = -ECHILD;
+ struct list_head *lh;
+
+ rcu_read_lock();
+ if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+ goto out;
+ lh = rcu_dereference(list_tail_rcu(&nfsi->access_cache_entry_lru));
+ cache = list_entry(lh, struct nfs_access_entry, lru);
+ if (lh == &nfsi->access_cache_entry_lru ||
+ access_cmp(cred, cache) != 0)
+ cache = NULL;
+ if (cache == NULL)
+ goto out;
+ if ((s64)(login_time - cache->timestamp) > 0)
+ goto out;
+ if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS))
+ goto out;
+ *mask = cache->mask;
+ err = 0;
+out:
+ rcu_read_unlock();
+ return err;
+}
+
+int nfs_access_get_cached(struct inode *inode, const struct cred *cred,
+ u32 *mask, bool may_block)
+{
+ int status;
+
+ status = nfs_access_get_cached_rcu(inode, cred, mask);
+ if (status != 0)
+ status = nfs_access_get_cached_locked(inode, cred, mask,
+ may_block);
+
+ return status;
+}
+EXPORT_SYMBOL_GPL(nfs_access_get_cached);
+
+static void nfs_access_add_rbtree(struct inode *inode,
+ struct nfs_access_entry *set,
+ const struct cred *cred)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct rb_root *root_node = &nfsi->access_cache;
+ struct rb_node **p = &root_node->rb_node;
+ struct rb_node *parent = NULL;
+ struct nfs_access_entry *entry;
+ int cmp;
+
+ spin_lock(&inode->i_lock);
+ while (*p != NULL) {
+ parent = *p;
+ entry = rb_entry(parent, struct nfs_access_entry, rb_node);
+ cmp = access_cmp(cred, entry);
+
+ if (cmp < 0)
+ p = &parent->rb_left;
+ else if (cmp > 0)
+ p = &parent->rb_right;
+ else
+ goto found;
+ }
+ rb_link_node(&set->rb_node, parent, p);
+ rb_insert_color(&set->rb_node, root_node);
+ list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
+ spin_unlock(&inode->i_lock);
+ return;
+found:
+ rb_replace_node(parent, &set->rb_node, root_node);
+ list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
+ list_del(&entry->lru);
+ spin_unlock(&inode->i_lock);
+ nfs_access_free_entry(entry);
+}
+
+void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set,
+ const struct cred *cred)
+{
+ struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
+ if (cache == NULL)
+ return;
+ RB_CLEAR_NODE(&cache->rb_node);
+ cache->fsuid = cred->fsuid;
+ cache->fsgid = cred->fsgid;
+ cache->group_info = get_group_info(cred->group_info);
+ cache->mask = set->mask;
+ cache->timestamp = ktime_get_ns();
+
+ /* The above field assignments must be visible
+ * before this item appears on the lru. We cannot easily
+ * use rcu_assign_pointer, so just force the memory barrier.
+ */
+ smp_wmb();
+ nfs_access_add_rbtree(inode, cache, cred);
+
+ /* Update accounting */
+ smp_mb__before_atomic();
+ atomic_long_inc(&nfs_access_nr_entries);
+ smp_mb__after_atomic();
+
+ /* Add inode to global LRU list */
+ if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
+ spin_lock(&nfs_access_lru_lock);
+ if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
+ list_add_tail(&NFS_I(inode)->access_cache_inode_lru,
+ &nfs_access_lru_list);
+ spin_unlock(&nfs_access_lru_lock);
+ }
+ nfs_access_cache_enforce_limit();
+}
+EXPORT_SYMBOL_GPL(nfs_access_add_cache);
+
+#define NFS_MAY_READ (NFS_ACCESS_READ)
+#define NFS_MAY_WRITE (NFS_ACCESS_MODIFY | \
+ NFS_ACCESS_EXTEND | \
+ NFS_ACCESS_DELETE)
+#define NFS_FILE_MAY_WRITE (NFS_ACCESS_MODIFY | \
+ NFS_ACCESS_EXTEND)
+#define NFS_DIR_MAY_WRITE NFS_MAY_WRITE
+#define NFS_MAY_LOOKUP (NFS_ACCESS_LOOKUP)
+#define NFS_MAY_EXECUTE (NFS_ACCESS_EXECUTE)
+static int
+nfs_access_calc_mask(u32 access_result, umode_t umode)
+{
+ int mask = 0;
+
+ if (access_result & NFS_MAY_READ)
+ mask |= MAY_READ;
+ if (S_ISDIR(umode)) {
+ if ((access_result & NFS_DIR_MAY_WRITE) == NFS_DIR_MAY_WRITE)
+ mask |= MAY_WRITE;
+ if ((access_result & NFS_MAY_LOOKUP) == NFS_MAY_LOOKUP)
+ mask |= MAY_EXEC;
+ } else if (S_ISREG(umode)) {
+ if ((access_result & NFS_FILE_MAY_WRITE) == NFS_FILE_MAY_WRITE)
+ mask |= MAY_WRITE;
+ if ((access_result & NFS_MAY_EXECUTE) == NFS_MAY_EXECUTE)
+ mask |= MAY_EXEC;
+ } else if (access_result & NFS_MAY_WRITE)
+ mask |= MAY_WRITE;
+ return mask;
+}
+
+void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result)
+{
+ entry->mask = access_result;
+}
+EXPORT_SYMBOL_GPL(nfs_access_set_mask);
+
+static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask)
+{
+ struct nfs_access_entry cache;
+ bool may_block = (mask & MAY_NOT_BLOCK) == 0;
+ int cache_mask = -1;
+ int status;
+
+ trace_nfs_access_enter(inode);
+
+ status = nfs_access_get_cached(inode, cred, &cache.mask, may_block);
+ if (status == 0)
+ goto out_cached;
+
+ status = -ECHILD;
+ if (!may_block)
+ goto out;
+
+ /*
+ * Determine which access bits we want to ask for...
+ */
+ cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND |
+ nfs_access_xattr_mask(NFS_SERVER(inode));
+ if (S_ISDIR(inode->i_mode))
+ cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP;
+ else
+ cache.mask |= NFS_ACCESS_EXECUTE;
+ status = NFS_PROTO(inode)->access(inode, &cache, cred);
+ if (status != 0) {
+ if (status == -ESTALE) {
+ if (!S_ISDIR(inode->i_mode))
+ nfs_set_inode_stale(inode);
+ else
+ nfs_zap_caches(inode);
+ }
+ goto out;
+ }
+ nfs_access_add_cache(inode, &cache, cred);
+out_cached:
+ cache_mask = nfs_access_calc_mask(cache.mask, inode->i_mode);
+ if ((mask & ~cache_mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0)
+ status = -EACCES;
+out:
+ trace_nfs_access_exit(inode, mask, cache_mask, status);
+ return status;
+}
+
+static int nfs_open_permission_mask(int openflags)
+{
+ int mask = 0;
+
+ if (openflags & __FMODE_EXEC) {
+ /* ONLY check exec rights */
+ mask = MAY_EXEC;
+ } else {
+ if ((openflags & O_ACCMODE) != O_WRONLY)
+ mask |= MAY_READ;
+ if ((openflags & O_ACCMODE) != O_RDONLY)
+ mask |= MAY_WRITE;
+ }
+
+ return mask;
+}
+
+int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags)
+{
+ return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
+}
+EXPORT_SYMBOL_GPL(nfs_may_open);
+
+static int nfs_execute_ok(struct inode *inode, int mask)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ int ret = 0;
+
+ if (S_ISDIR(inode->i_mode))
+ return 0;
+ if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_MODE)) {
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+ ret = __nfs_revalidate_inode(server, inode);
+ }
+ if (ret == 0 && !execute_ok(inode))
+ ret = -EACCES;
+ return ret;
+}
+
+int nfs_permission(struct mnt_idmap *idmap,
+ struct inode *inode,
+ int mask)
+{
+ const struct cred *cred = current_cred();
+ int res = 0;
+
+ nfs_inc_stats(inode, NFSIOS_VFSACCESS);
+
+ if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
+ goto out;
+ /* Is this sys_access() ? */
+ if (mask & (MAY_ACCESS | MAY_CHDIR))
+ goto force_lookup;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFLNK:
+ goto out;
+ case S_IFREG:
+ if ((mask & MAY_OPEN) &&
+ nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN))
+ return 0;
+ break;
+ case S_IFDIR:
+ /*
+ * Optimize away all write operations, since the server
+ * will check permissions when we perform the op.
+ */
+ if ((mask & MAY_WRITE) && !(mask & MAY_READ))
+ goto out;
+ }
+
+force_lookup:
+ if (!NFS_PROTO(inode)->access)
+ goto out_notsup;
+
+ res = nfs_do_access(inode, cred, mask);
+out:
+ if (!res && (mask & MAY_EXEC))
+ res = nfs_execute_ok(inode, mask);
+
+ dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n",
+ inode->i_sb->s_id, inode->i_ino, mask, res);
+ return res;
+out_notsup:
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+
+ res = nfs_revalidate_inode(inode, NFS_INO_INVALID_MODE |
+ NFS_INO_INVALID_OTHER);
+ if (res == 0)
+ res = generic_permission(&nop_mnt_idmap, inode, mask);
+ goto out;
+}
+EXPORT_SYMBOL_GPL(nfs_permission);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
new file mode 100644
index 0000000000..5918c67dae
--- /dev/null
+++ b/fs/nfs/direct.c
@@ -0,0 +1,1058 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/direct.c
+ *
+ * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
+ *
+ * High-performance uncached I/O for the Linux NFS client
+ *
+ * There are important applications whose performance or correctness
+ * depends on uncached access to file data. Database clusters
+ * (multiple copies of the same instance running on separate hosts)
+ * implement their own cache coherency protocol that subsumes file
+ * system cache protocols. Applications that process datasets
+ * considerably larger than the client's memory do not always benefit
+ * from a local cache. A streaming video server, for instance, has no
+ * need to cache the contents of a file.
+ *
+ * When an application requests uncached I/O, all read and write requests
+ * are made directly to the server; data stored or fetched via these
+ * requests is not cached in the Linux page cache. The client does not
+ * correct unaligned requests from applications. All requested bytes are
+ * held on permanent storage before a direct write system call returns to
+ * an application.
+ *
+ * Solaris implements an uncached I/O facility called directio() that
+ * is used for backups and sequential I/O to very large files. Solaris
+ * also supports uncaching whole NFS partitions with "-o forcedirectio,"
+ * an undocumented mount option.
+ *
+ * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
+ * help from Andrew Morton.
+ *
+ * 18 Dec 2001 Initial implementation for 2.4 --cel
+ * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy
+ * 08 Jun 2003 Port to 2.5 APIs --cel
+ * 31 Mar 2004 Handle direct I/O without VFS support --cel
+ * 15 Sep 2004 Parallel async reads --cel
+ * 04 May 2005 support O_DIRECT with aio --cel
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/kref.h>
+#include <linux/slab.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/module.h>
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/sunrpc/clnt.h>
+
+#include <linux/uaccess.h>
+#include <linux/atomic.h>
+
+#include "internal.h"
+#include "iostat.h"
+#include "pnfs.h"
+#include "fscache.h"
+#include "nfstrace.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+static struct kmem_cache *nfs_direct_cachep;
+
+static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
+static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq);
+static void nfs_direct_write_schedule_work(struct work_struct *work);
+
+static inline void get_dreq(struct nfs_direct_req *dreq)
+{
+ atomic_inc(&dreq->io_count);
+}
+
+static inline int put_dreq(struct nfs_direct_req *dreq)
+{
+ return atomic_dec_and_test(&dreq->io_count);
+}
+
+static void
+nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
+ const struct nfs_pgio_header *hdr,
+ ssize_t dreq_len)
+{
+ if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) ||
+ test_bit(NFS_IOHDR_EOF, &hdr->flags)))
+ return;
+ if (dreq->max_count >= dreq_len) {
+ dreq->max_count = dreq_len;
+ if (dreq->count > dreq_len)
+ dreq->count = dreq_len;
+ }
+
+ if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error)
+ dreq->error = hdr->error;
+}
+
+static void
+nfs_direct_count_bytes(struct nfs_direct_req *dreq,
+ const struct nfs_pgio_header *hdr)
+{
+ loff_t hdr_end = hdr->io_start + hdr->good_bytes;
+ ssize_t dreq_len = 0;
+
+ if (hdr_end > dreq->io_start)
+ dreq_len = hdr_end - dreq->io_start;
+
+ nfs_direct_handle_truncated(dreq, hdr, dreq_len);
+
+ if (dreq_len > dreq->max_count)
+ dreq_len = dreq->max_count;
+
+ if (dreq->count < dreq_len)
+ dreq->count = dreq_len;
+}
+
+static void nfs_direct_truncate_request(struct nfs_direct_req *dreq,
+ struct nfs_page *req)
+{
+ loff_t offs = req_offset(req);
+ size_t req_start = (size_t)(offs - dreq->io_start);
+
+ if (req_start < dreq->max_count)
+ dreq->max_count = req_start;
+ if (req_start < dreq->count)
+ dreq->count = req_start;
+}
+
+/**
+ * nfs_swap_rw - NFS address space operation for swap I/O
+ * @iocb: target I/O control block
+ * @iter: I/O buffer
+ *
+ * Perform IO to the swap-file. This is much like direct IO.
+ */
+int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
+{
+ ssize_t ret;
+
+ VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
+
+ if (iov_iter_rw(iter) == READ)
+ ret = nfs_file_direct_read(iocb, iter, true);
+ else
+ ret = nfs_file_direct_write(iocb, iter, true);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
+{
+ unsigned int i;
+ for (i = 0; i < npages; i++)
+ put_page(pages[i]);
+}
+
+void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
+ struct nfs_direct_req *dreq)
+{
+ cinfo->inode = dreq->inode;
+ cinfo->mds = &dreq->mds_cinfo;
+ cinfo->ds = &dreq->ds_cinfo;
+ cinfo->dreq = dreq;
+ cinfo->completion_ops = &nfs_direct_commit_completion_ops;
+}
+
+static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
+{
+ struct nfs_direct_req *dreq;
+
+ dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
+ if (!dreq)
+ return NULL;
+
+ kref_init(&dreq->kref);
+ kref_get(&dreq->kref);
+ init_completion(&dreq->completion);
+ INIT_LIST_HEAD(&dreq->mds_cinfo.list);
+ pnfs_init_ds_commit_info(&dreq->ds_cinfo);
+ INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
+ spin_lock_init(&dreq->lock);
+
+ return dreq;
+}
+
+static void nfs_direct_req_free(struct kref *kref)
+{
+ struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
+
+ pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode);
+ if (dreq->l_ctx != NULL)
+ nfs_put_lock_context(dreq->l_ctx);
+ if (dreq->ctx != NULL)
+ put_nfs_open_context(dreq->ctx);
+ kmem_cache_free(nfs_direct_cachep, dreq);
+}
+
+static void nfs_direct_req_release(struct nfs_direct_req *dreq)
+{
+ kref_put(&dreq->kref, nfs_direct_req_free);
+}
+
+ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset)
+{
+ loff_t start = offset - dreq->io_start;
+ return dreq->max_count - start;
+}
+EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);
+
+/*
+ * Collects and returns the final error value/byte-count.
+ */
+static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
+{
+ ssize_t result = -EIOCBQUEUED;
+
+ /* Async requests don't wait here */
+ if (dreq->iocb)
+ goto out;
+
+ result = wait_for_completion_killable(&dreq->completion);
+
+ if (!result) {
+ result = dreq->count;
+ WARN_ON_ONCE(dreq->count < 0);
+ }
+ if (!result)
+ result = dreq->error;
+
+out:
+ return (ssize_t) result;
+}
+
+/*
+ * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust
+ * the iocb is still valid here if this is a synchronous request.
+ */
+static void nfs_direct_complete(struct nfs_direct_req *dreq)
+{
+ struct inode *inode = dreq->inode;
+
+ inode_dio_end(inode);
+
+ if (dreq->iocb) {
+ long res = (long) dreq->error;
+ if (dreq->count != 0) {
+ res = (long) dreq->count;
+ WARN_ON_ONCE(dreq->count < 0);
+ }
+ dreq->iocb->ki_complete(dreq->iocb, res);
+ }
+
+ complete(&dreq->completion);
+
+ nfs_direct_req_release(dreq);
+}
+
+static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
+{
+ unsigned long bytes = 0;
+ struct nfs_direct_req *dreq = hdr->dreq;
+
+ spin_lock(&dreq->lock);
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ spin_unlock(&dreq->lock);
+ goto out_put;
+ }
+
+ nfs_direct_count_bytes(dreq, hdr);
+ spin_unlock(&dreq->lock);
+
+ while (!list_empty(&hdr->pages)) {
+ struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+ struct page *page = req->wb_page;
+
+ if (!PageCompound(page) && bytes < hdr->good_bytes &&
+ (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY))
+ set_page_dirty(page);
+ bytes += req->wb_bytes;
+ nfs_list_remove_request(req);
+ nfs_release_request(req);
+ }
+out_put:
+ if (put_dreq(dreq))
+ nfs_direct_complete(dreq);
+ hdr->release(hdr);
+}
+
+static void nfs_read_sync_pgio_error(struct list_head *head, int error)
+{
+ struct nfs_page *req;
+
+ while (!list_empty(head)) {
+ req = nfs_list_entry(head->next);
+ nfs_list_remove_request(req);
+ nfs_release_request(req);
+ }
+}
+
+static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
+{
+ get_dreq(hdr->dreq);
+}
+
+static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
+ .error_cleanup = nfs_read_sync_pgio_error,
+ .init_hdr = nfs_direct_pgio_init,
+ .completion = nfs_direct_read_completion,
+};
+
+/*
+ * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
+ * operation. If nfs_readdata_alloc() or get_user_pages() fails,
+ * bail and stop sending more reads. Read length accounting is
+ * handled automatically by nfs_direct_read_result(). Otherwise, if
+ * no requests have been sent, just return an error.
+ */
+
+static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
+ struct iov_iter *iter,
+ loff_t pos)
+{
+ struct nfs_pageio_descriptor desc;
+ struct inode *inode = dreq->inode;
+ ssize_t result = -EINVAL;
+ size_t requested_bytes = 0;
+ size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);
+
+ nfs_pageio_init_read(&desc, dreq->inode, false,
+ &nfs_direct_read_completion_ops);
+ get_dreq(dreq);
+ desc.pg_dreq = dreq;
+ inode_dio_begin(inode);
+
+ while (iov_iter_count(iter)) {
+ struct page **pagevec;
+ size_t bytes;
+ size_t pgbase;
+ unsigned npages, i;
+
+ result = iov_iter_get_pages_alloc2(iter, &pagevec,
+ rsize, &pgbase);
+ if (result < 0)
+ break;
+
+ bytes = result;
+ npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
+ for (i = 0; i < npages; i++) {
+ struct nfs_page *req;
+ unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
+ /* XXX do we need to do the eof zeroing found in async_filler? */
+ req = nfs_page_create_from_page(dreq->ctx, pagevec[i],
+ pgbase, pos, req_len);
+ if (IS_ERR(req)) {
+ result = PTR_ERR(req);
+ break;
+ }
+ if (!nfs_pageio_add_request(&desc, req)) {
+ result = desc.pg_error;
+ nfs_release_request(req);
+ break;
+ }
+ pgbase = 0;
+ bytes -= req_len;
+ requested_bytes += req_len;
+ pos += req_len;
+ dreq->bytes_left -= req_len;
+ }
+ nfs_direct_release_pages(pagevec, npages);
+ kvfree(pagevec);
+ if (result < 0)
+ break;
+ }
+
+ nfs_pageio_complete(&desc);
+
+ /*
+ * If no bytes were started, return the error, and let the
+ * generic layer handle the completion.
+ */
+ if (requested_bytes == 0) {
+ inode_dio_end(inode);
+ nfs_direct_req_release(dreq);
+ return result < 0 ? result : -EIO;
+ }
+
+ if (put_dreq(dreq))
+ nfs_direct_complete(dreq);
+ return requested_bytes;
+}
+
+/**
+ * nfs_file_direct_read - file direct read operation for NFS files
+ * @iocb: target I/O control block
+ * @iter: vector of user buffers into which to read data
+ * @swap: flag indicating this is swap IO, not O_DIRECT IO
+ *
+ * We use this function for direct reads instead of calling
+ * generic_file_aio_read() in order to avoid gfar's check to see if
+ * the request starts before the end of the file. For that check
+ * to work, we must generate a GETATTR before each direct read, and
+ * even then there is a window between the GETATTR and the subsequent
+ * READ where the file size could change. Our preference is simply
+ * to do all reads the application wants, and the server will take
+ * care of managing the end of file boundary.
+ *
+ * This function also eliminates unnecessarily updating the file's
+ * atime locally, as the NFS server sets the file's atime, and this
+ * client must read the updated atime from the server back into its
+ * cache.
+ */
+ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
+ bool swap)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct nfs_direct_req *dreq;
+ struct nfs_lock_context *l_ctx;
+ ssize_t result, requested;
+ size_t count = iov_iter_count(iter);
+ nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
+
+ dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
+ file, count, (long long) iocb->ki_pos);
+
+ result = 0;
+ if (!count)
+ goto out;
+
+ task_io_account_read(count);
+
+ result = -ENOMEM;
+ dreq = nfs_direct_req_alloc();
+ if (dreq == NULL)
+ goto out;
+
+ dreq->inode = inode;
+ dreq->bytes_left = dreq->max_count = count;
+ dreq->io_start = iocb->ki_pos;
+ dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+ l_ctx = nfs_get_lock_context(dreq->ctx);
+ if (IS_ERR(l_ctx)) {
+ result = PTR_ERR(l_ctx);
+ nfs_direct_req_release(dreq);
+ goto out_release;
+ }
+ dreq->l_ctx = l_ctx;
+ if (!is_sync_kiocb(iocb))
+ dreq->iocb = iocb;
+
+ if (user_backed_iter(iter))
+ dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
+
+ if (!swap)
+ nfs_start_io_direct(inode);
+
+ NFS_I(inode)->read_io += count;
+ requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
+
+ if (!swap)
+ nfs_end_io_direct(inode);
+
+ if (requested > 0) {
+ result = nfs_direct_wait(dreq);
+ if (result > 0) {
+ requested -= result;
+ iocb->ki_pos += result;
+ }
+ iov_iter_revert(iter, requested);
+ } else {
+ result = requested;
+ }
+
+out_release:
+ nfs_direct_req_release(dreq);
+out:
+ return result;
+}
+
+static void nfs_direct_add_page_head(struct list_head *list,
+ struct nfs_page *req)
+{
+ struct nfs_page *head = req->wb_head;
+
+ if (!list_empty(&head->wb_list) || !nfs_lock_request(head))
+ return;
+ if (!list_empty(&head->wb_list)) {
+ nfs_unlock_request(head);
+ return;
+ }
+ list_add(&head->wb_list, list);
+ kref_get(&head->wb_kref);
+ kref_get(&head->wb_kref);
+}
+
+static void nfs_direct_join_group(struct list_head *list,
+ struct nfs_commit_info *cinfo,
+ struct inode *inode)
+{
+ struct nfs_page *req, *subreq;
+
+ list_for_each_entry(req, list, wb_list) {
+ if (req->wb_head != req) {
+ nfs_direct_add_page_head(&req->wb_list, req);
+ continue;
+ }
+ subreq = req->wb_this_page;
+ if (subreq == req)
+ continue;
+ do {
+ /*
+ * Remove subrequests from this list before freeing
+ * them in the call to nfs_join_page_group().
+ */
+ if (!list_empty(&subreq->wb_list)) {
+ nfs_list_remove_request(subreq);
+ nfs_release_request(subreq);
+ }
+ } while ((subreq = subreq->wb_this_page) != req);
+ nfs_join_page_group(req, cinfo, inode);
+ }
+}
+
+static void
+nfs_direct_write_scan_commit_list(struct inode *inode,
+ struct list_head *list,
+ struct nfs_commit_info *cinfo)
+{
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+ pnfs_recover_commit_reqs(list, cinfo);
+ nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+}
+
+static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
+{
+ struct nfs_pageio_descriptor desc;
+ struct nfs_page *req;
+ LIST_HEAD(reqs);
+ struct nfs_commit_info cinfo;
+
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
+ nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
+
+ nfs_direct_join_group(&reqs, &cinfo, dreq->inode);
+
+ nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
+ get_dreq(dreq);
+
+ nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
+ &nfs_direct_write_completion_ops);
+ desc.pg_dreq = dreq;
+
+ while (!list_empty(&reqs)) {
+ req = nfs_list_entry(reqs.next);
+ /* Bump the transmission count */
+ req->wb_nio++;
+ if (!nfs_pageio_add_request(&desc, req)) {
+ spin_lock(&dreq->lock);
+ if (dreq->error < 0) {
+ desc.pg_error = dreq->error;
+ } else if (desc.pg_error != -EAGAIN) {
+ dreq->flags = 0;
+ if (!desc.pg_error)
+ desc.pg_error = -EIO;
+ dreq->error = desc.pg_error;
+ } else
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_unlock(&dreq->lock);
+ break;
+ }
+ nfs_release_request(req);
+ }
+ nfs_pageio_complete(&desc);
+
+ while (!list_empty(&reqs)) {
+ req = nfs_list_entry(reqs.next);
+ nfs_list_remove_request(req);
+ nfs_unlock_and_release_request(req);
+ if (desc.pg_error == -EAGAIN) {
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ } else {
+ spin_lock(&dreq->lock);
+ nfs_direct_truncate_request(dreq, req);
+ spin_unlock(&dreq->lock);
+ nfs_release_request(req);
+ }
+ }
+
+ if (put_dreq(dreq))
+ nfs_direct_write_complete(dreq);
+}
+
+static void nfs_direct_commit_complete(struct nfs_commit_data *data)
+{
+ const struct nfs_writeverf *verf = data->res.verf;
+ struct nfs_direct_req *dreq = data->dreq;
+ struct nfs_commit_info cinfo;
+ struct nfs_page *req;
+ int status = data->task.tk_status;
+
+ trace_nfs_direct_commit_complete(dreq);
+
+ if (status < 0) {
+ /* Errors in commit are fatal */
+ dreq->error = status;
+ dreq->flags = NFS_ODIRECT_DONE;
+ } else {
+ status = dreq->error;
+ }
+
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
+
+ while (!list_empty(&data->pages)) {
+ req = nfs_list_entry(data->pages.next);
+ nfs_list_remove_request(req);
+ if (status < 0) {
+ spin_lock(&dreq->lock);
+ nfs_direct_truncate_request(dreq, req);
+ spin_unlock(&dreq->lock);
+ nfs_release_request(req);
+ } else if (!nfs_write_match_verf(verf, req)) {
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ /*
+ * Despite the reboot, the write was successful,
+ * so reset wb_nio.
+ */
+ req->wb_nio = 0;
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ } else
+ nfs_release_request(req);
+ nfs_unlock_and_release_request(req);
+ }
+
+ if (nfs_commit_end(cinfo.mds))
+ nfs_direct_write_complete(dreq);
+}
+
+static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
+ struct nfs_page *req)
+{
+ struct nfs_direct_req *dreq = cinfo->dreq;
+
+ trace_nfs_direct_resched_write(dreq);
+
+ spin_lock(&dreq->lock);
+ if (dreq->flags != NFS_ODIRECT_DONE)
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_unlock(&dreq->lock);
+ nfs_mark_request_commit(req, NULL, cinfo, 0);
+}
+
+static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
+ .completion = nfs_direct_commit_complete,
+ .resched_write = nfs_direct_resched_write,
+};
+
+static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
+{
+ int res;
+ struct nfs_commit_info cinfo;
+ LIST_HEAD(mds_list);
+
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
+ nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
+ res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
+ if (res < 0) /* res == -ENOMEM */
+ nfs_direct_write_reschedule(dreq);
+}
+
+static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
+{
+ struct nfs_commit_info cinfo;
+ struct nfs_page *req;
+ LIST_HEAD(reqs);
+
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
+ nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
+
+ while (!list_empty(&reqs)) {
+ req = nfs_list_entry(reqs.next);
+ nfs_list_remove_request(req);
+ nfs_direct_truncate_request(dreq, req);
+ nfs_release_request(req);
+ nfs_unlock_and_release_request(req);
+ }
+}
+
+static void nfs_direct_write_schedule_work(struct work_struct *work)
+{
+ struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
+ int flags = dreq->flags;
+
+ dreq->flags = 0;
+ switch (flags) {
+ case NFS_ODIRECT_DO_COMMIT:
+ nfs_direct_commit_schedule(dreq);
+ break;
+ case NFS_ODIRECT_RESCHED_WRITES:
+ nfs_direct_write_reschedule(dreq);
+ break;
+ default:
+ nfs_direct_write_clear_reqs(dreq);
+ nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
+ nfs_direct_complete(dreq);
+ }
+}
+
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq)
+{
+ trace_nfs_direct_write_complete(dreq);
+ queue_work(nfsiod_workqueue, &dreq->work); /* Calls nfs_direct_write_schedule_work */
+}
+
+static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
+{
+ struct nfs_direct_req *dreq = hdr->dreq;
+ struct nfs_commit_info cinfo;
+ struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+ int flags = NFS_ODIRECT_DONE;
+
+ trace_nfs_direct_write_completion(dreq);
+
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
+
+ spin_lock(&dreq->lock);
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ spin_unlock(&dreq->lock);
+ goto out_put;
+ }
+
+ nfs_direct_count_bytes(dreq, hdr);
+ if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) &&
+ !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
+ if (!dreq->flags)
+ dreq->flags = NFS_ODIRECT_DO_COMMIT;
+ flags = dreq->flags;
+ }
+ spin_unlock(&dreq->lock);
+
+ while (!list_empty(&hdr->pages)) {
+
+ req = nfs_list_entry(hdr->pages.next);
+ nfs_list_remove_request(req);
+ if (flags == NFS_ODIRECT_DO_COMMIT) {
+ kref_get(&req->wb_kref);
+ memcpy(&req->wb_verf, &hdr->verf.verifier,
+ sizeof(req->wb_verf));
+ nfs_mark_request_commit(req, hdr->lseg, &cinfo,
+ hdr->ds_commit_idx);
+ } else if (flags == NFS_ODIRECT_RESCHED_WRITES) {
+ kref_get(&req->wb_kref);
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ }
+ nfs_unlock_and_release_request(req);
+ }
+
+out_put:
+ if (put_dreq(dreq))
+ nfs_direct_write_complete(dreq);
+ hdr->release(hdr);
+}
+
+static void nfs_write_sync_pgio_error(struct list_head *head, int error)
+{
+ struct nfs_page *req;
+
+ while (!list_empty(head)) {
+ req = nfs_list_entry(head->next);
+ nfs_list_remove_request(req);
+ nfs_unlock_and_release_request(req);
+ }
+}
+
+static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
+{
+ struct nfs_direct_req *dreq = hdr->dreq;
+ struct nfs_page *req;
+ struct nfs_commit_info cinfo;
+
+ trace_nfs_direct_write_reschedule_io(dreq);
+
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
+ spin_lock(&dreq->lock);
+ if (dreq->error == 0)
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ set_bit(NFS_IOHDR_REDO, &hdr->flags);
+ spin_unlock(&dreq->lock);
+ while (!list_empty(&hdr->pages)) {
+ req = nfs_list_entry(hdr->pages.next);
+ nfs_list_remove_request(req);
+ nfs_unlock_request(req);
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ }
+}
+
+static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
+ .error_cleanup = nfs_write_sync_pgio_error,
+ .init_hdr = nfs_direct_pgio_init,
+ .completion = nfs_direct_write_completion,
+ .reschedule_io = nfs_direct_write_reschedule_io,
+};
+
+
+/*
+ * NB: Return the value of the first error return code. Subsequent
+ * errors after the first one are ignored.
+ */
+/*
+ * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
+ * operation. If nfs_writedata_alloc() or get_user_pages() fails,
+ * bail and stop sending more writes. Write length accounting is
+ * handled automatically by nfs_direct_write_result(). Otherwise, if
+ * no requests have been sent, just return an error.
+ */
+static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
+ struct iov_iter *iter,
+ loff_t pos, int ioflags)
+{
+ struct nfs_pageio_descriptor desc;
+ struct inode *inode = dreq->inode;
+ struct nfs_commit_info cinfo;
+ ssize_t result = 0;
+ size_t requested_bytes = 0;
+ size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
+ bool defer = false;
+
+ trace_nfs_direct_write_schedule_iovec(dreq);
+
+ nfs_pageio_init_write(&desc, inode, ioflags, false,
+ &nfs_direct_write_completion_ops);
+ desc.pg_dreq = dreq;
+ get_dreq(dreq);
+ inode_dio_begin(inode);
+
+ NFS_I(inode)->write_io += iov_iter_count(iter);
+ while (iov_iter_count(iter)) {
+ struct page **pagevec;
+ size_t bytes;
+ size_t pgbase;
+ unsigned npages, i;
+
+ result = iov_iter_get_pages_alloc2(iter, &pagevec,
+ wsize, &pgbase);
+ if (result < 0)
+ break;
+
+ bytes = result;
+ npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
+ for (i = 0; i < npages; i++) {
+ struct nfs_page *req;
+ unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
+
+ req = nfs_page_create_from_page(dreq->ctx, pagevec[i],
+ pgbase, pos, req_len);
+ if (IS_ERR(req)) {
+ result = PTR_ERR(req);
+ break;
+ }
+
+ if (desc.pg_error < 0) {
+ nfs_free_request(req);
+ result = desc.pg_error;
+ break;
+ }
+
+ pgbase = 0;
+ bytes -= req_len;
+ requested_bytes += req_len;
+ pos += req_len;
+ dreq->bytes_left -= req_len;
+
+ if (defer) {
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ continue;
+ }
+
+ nfs_lock_request(req);
+ if (nfs_pageio_add_request(&desc, req))
+ continue;
+
+ /* Exit on hard errors */
+ if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) {
+ result = desc.pg_error;
+ nfs_unlock_and_release_request(req);
+ break;
+ }
+
+ /* If the error is soft, defer remaining requests */
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
+ spin_lock(&dreq->lock);
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_unlock(&dreq->lock);
+ nfs_unlock_request(req);
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ desc.pg_error = 0;
+ defer = true;
+ }
+ nfs_direct_release_pages(pagevec, npages);
+ kvfree(pagevec);
+ if (result < 0)
+ break;
+ }
+ nfs_pageio_complete(&desc);
+
+ /*
+ * If no bytes were started, return the error, and let the
+ * generic layer handle the completion.
+ */
+ if (requested_bytes == 0) {
+ inode_dio_end(inode);
+ nfs_direct_req_release(dreq);
+ return result < 0 ? result : -EIO;
+ }
+
+ if (put_dreq(dreq))
+ nfs_direct_write_complete(dreq);
+ return requested_bytes;
+}
+
+/**
+ * nfs_file_direct_write - file direct write operation for NFS files
+ * @iocb: target I/O control block
+ * @iter: vector of user buffers from which to write data
+ * @swap: flag indicating this is swap IO, not O_DIRECT IO
+ *
+ * We use this function for direct writes instead of calling
+ * generic_file_aio_write() in order to avoid taking the inode
+ * semaphore and updating the i_size. The NFS server will set
+ * the new i_size and this client must read the updated size
+ * back into its cache. We let the server do generic write
+ * parameter checking and report problems.
+ *
+ * We eliminate local atime updates, see direct read above.
+ *
+ * We avoid unnecessary page cache invalidations for normal cached
+ * readers of this file.
+ *
+ * Note that O_APPEND is not supported for NFS direct writes, as there
+ * is no atomic O_APPEND write facility in the NFS protocol.
+ */
+ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
+ bool swap)
+{
+ ssize_t result, requested;
+ size_t count;
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct nfs_direct_req *dreq;
+ struct nfs_lock_context *l_ctx;
+ loff_t pos, end;
+
+ dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
+ file, iov_iter_count(iter), (long long) iocb->ki_pos);
+
+ if (swap)
+ /* bypass generic checks */
+ result = iov_iter_count(iter);
+ else
+ result = generic_write_checks(iocb, iter);
+ if (result <= 0)
+ return result;
+ count = result;
+ nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
+
+ pos = iocb->ki_pos;
+ end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;
+
+ task_io_account_write(count);
+
+ result = -ENOMEM;
+ dreq = nfs_direct_req_alloc();
+ if (!dreq)
+ goto out;
+
+ dreq->inode = inode;
+ dreq->bytes_left = dreq->max_count = count;
+ dreq->io_start = pos;
+ dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+ l_ctx = nfs_get_lock_context(dreq->ctx);
+ if (IS_ERR(l_ctx)) {
+ result = PTR_ERR(l_ctx);
+ nfs_direct_req_release(dreq);
+ goto out_release;
+ }
+ dreq->l_ctx = l_ctx;
+ if (!is_sync_kiocb(iocb))
+ dreq->iocb = iocb;
+ pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);
+
+ if (swap) {
+ requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
+ FLUSH_STABLE);
+ } else {
+ nfs_start_io_direct(inode);
+
+ requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
+ FLUSH_COND_STABLE);
+
+ if (mapping->nrpages) {
+ invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_SHIFT, end);
+ }
+
+ nfs_end_io_direct(inode);
+ }
+
+ if (requested > 0) {
+ result = nfs_direct_wait(dreq);
+ if (result > 0) {
+ requested -= result;
+ iocb->ki_pos = pos + result;
+ /* XXX: should check the generic_write_sync retval */
+ generic_write_sync(iocb, result);
+ }
+ iov_iter_revert(iter, requested);
+ } else {
+ result = requested;
+ }
+ nfs_fscache_invalidate(inode, FSCACHE_INVAL_DIO_WRITE);
+out_release:
+ nfs_direct_req_release(dreq);
+out:
+ return result;
+}
+
+/**
+ * nfs_init_directcache - create a slab cache for nfs_direct_req structures
+ *
+ */
+int __init nfs_init_directcache(void)
+{
+ nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
+ sizeof(struct nfs_direct_req),
+ 0, (SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD),
+ NULL);
+ if (nfs_direct_cachep == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/**
+ * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
+ *
+ */
+void nfs_destroy_directcache(void)
+{
+ kmem_cache_destroy(nfs_direct_cachep);
+}
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
new file mode 100644
index 0000000000..714975e5c0
--- /dev/null
+++ b/fs/nfs/dns_resolve.c
@@ -0,0 +1,480 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/dns_resolve.c
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ * Resolves DNS hostnames into valid ip addresses
+ */
+
+#include <linux/module.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
+
+#include "dns_resolve.h"
+
+#ifdef CONFIG_NFS_USE_KERNEL_DNS
+
+#include <linux/dns_resolver.h>
+
+ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
+ struct sockaddr_storage *ss, size_t salen)
+{
+ struct sockaddr *sa = (struct sockaddr *)ss;
+ ssize_t ret;
+ char *ip_addr = NULL;
+ int ip_len;
+
+ ip_len = dns_query(net, NULL, name, namelen, NULL, &ip_addr, NULL,
+ false);
+ if (ip_len > 0)
+ ret = rpc_pton(net, ip_addr, ip_len, sa, salen);
+ else
+ ret = -ESRCH;
+ kfree(ip_addr);
+ return ret;
+}
+
+#else
+
+#include <linux/hash.h>
+#include <linux/string.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/seq_file.h>
+#include <linux/inet.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/svcauth.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/nfs_fs.h>
+
+#include "nfs4_fs.h"
+#include "cache_lib.h"
+#include "netns.h"
+
+#define NFS_DNS_HASHBITS 4
+#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS)
+
+struct nfs_dns_ent {
+ struct cache_head h;
+
+ char *hostname;
+ size_t namelen;
+
+ struct sockaddr_storage addr;
+ size_t addrlen;
+ struct rcu_head rcu_head;
+};
+
+
+static void nfs_dns_ent_update(struct cache_head *cnew,
+ struct cache_head *ckey)
+{
+ struct nfs_dns_ent *new;
+ struct nfs_dns_ent *key;
+
+ new = container_of(cnew, struct nfs_dns_ent, h);
+ key = container_of(ckey, struct nfs_dns_ent, h);
+
+ memcpy(&new->addr, &key->addr, key->addrlen);
+ new->addrlen = key->addrlen;
+}
+
+static void nfs_dns_ent_init(struct cache_head *cnew,
+ struct cache_head *ckey)
+{
+ struct nfs_dns_ent *new;
+ struct nfs_dns_ent *key;
+
+ new = container_of(cnew, struct nfs_dns_ent, h);
+ key = container_of(ckey, struct nfs_dns_ent, h);
+
+ kfree(new->hostname);
+ new->hostname = kmemdup_nul(key->hostname, key->namelen, GFP_KERNEL);
+ if (new->hostname) {
+ new->namelen = key->namelen;
+ nfs_dns_ent_update(cnew, ckey);
+ } else {
+ new->namelen = 0;
+ new->addrlen = 0;
+ }
+}
+
+static void nfs_dns_ent_free_rcu(struct rcu_head *head)
+{
+ struct nfs_dns_ent *item;
+
+ item = container_of(head, struct nfs_dns_ent, rcu_head);
+ kfree(item->hostname);
+ kfree(item);
+}
+
+static void nfs_dns_ent_put(struct kref *ref)
+{
+ struct nfs_dns_ent *item;
+
+ item = container_of(ref, struct nfs_dns_ent, h.ref);
+ call_rcu(&item->rcu_head, nfs_dns_ent_free_rcu);
+}
+
+static struct cache_head *nfs_dns_ent_alloc(void)
+{
+ struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
+
+ if (item != NULL) {
+ item->hostname = NULL;
+ item->namelen = 0;
+ item->addrlen = 0;
+ return &item->h;
+ }
+ return NULL;
+};
+
+static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key)
+{
+ return hash_str(key->hostname, NFS_DNS_HASHBITS);
+}
+
+static void nfs_dns_request(struct cache_detail *cd,
+ struct cache_head *ch,
+ char **bpp, int *blen)
+{
+ struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+
+ qword_add(bpp, blen, key->hostname);
+ (*bpp)[-1] = '\n';
+}
+
+static int nfs_dns_upcall(struct cache_detail *cd,
+ struct cache_head *ch)
+{
+ struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+
+ if (test_and_set_bit(CACHE_PENDING, &ch->flags))
+ return 0;
+ if (!nfs_cache_upcall(cd, key->hostname))
+ return 0;
+ clear_bit(CACHE_PENDING, &ch->flags);
+ return sunrpc_cache_pipe_upcall_timeout(cd, ch);
+}
+
+static int nfs_dns_match(struct cache_head *ca,
+ struct cache_head *cb)
+{
+ struct nfs_dns_ent *a;
+ struct nfs_dns_ent *b;
+
+ a = container_of(ca, struct nfs_dns_ent, h);
+ b = container_of(cb, struct nfs_dns_ent, h);
+
+ if (a->namelen == 0 || a->namelen != b->namelen)
+ return 0;
+ return memcmp(a->hostname, b->hostname, a->namelen) == 0;
+}
+
+static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
+ struct cache_head *h)
+{
+ struct nfs_dns_ent *item;
+ long ttl;
+
+ if (h == NULL) {
+ seq_puts(m, "# ip address hostname ttl\n");
+ return 0;
+ }
+ item = container_of(h, struct nfs_dns_ent, h);
+ ttl = item->h.expiry_time - seconds_since_boot();
+ if (ttl < 0)
+ ttl = 0;
+
+ if (!test_bit(CACHE_NEGATIVE, &h->flags)) {
+ char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1];
+
+ rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf));
+ seq_printf(m, "%15s ", buf);
+ } else
+ seq_puts(m, "<none> ");
+ seq_printf(m, "%15s %ld\n", item->hostname, ttl);
+ return 0;
+}
+
+static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
+ struct nfs_dns_ent *key)
+{
+ struct cache_head *ch;
+
+ ch = sunrpc_cache_lookup_rcu(cd,
+ &key->h,
+ nfs_dns_hash(key));
+ if (!ch)
+ return NULL;
+ return container_of(ch, struct nfs_dns_ent, h);
+}
+
+static struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd,
+ struct nfs_dns_ent *new,
+ struct nfs_dns_ent *key)
+{
+ struct cache_head *ch;
+
+ ch = sunrpc_cache_update(cd,
+ &new->h, &key->h,
+ nfs_dns_hash(key));
+ if (!ch)
+ return NULL;
+ return container_of(ch, struct nfs_dns_ent, h);
+}
+
+static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
+{
+ char buf1[NFS_DNS_HOSTNAME_MAXLEN+1];
+ struct nfs_dns_ent key, *item;
+ unsigned int ttl;
+ ssize_t len;
+ int ret = -EINVAL;
+
+ if (buf[buflen-1] != '\n')
+ goto out;
+ buf[buflen-1] = '\0';
+
+ len = qword_get(&buf, buf1, sizeof(buf1));
+ if (len <= 0)
+ goto out;
+ key.addrlen = rpc_pton(cd->net, buf1, len,
+ (struct sockaddr *)&key.addr,
+ sizeof(key.addr));
+
+ len = qword_get(&buf, buf1, sizeof(buf1));
+ if (len <= 0)
+ goto out;
+
+ key.hostname = buf1;
+ key.namelen = len;
+ memset(&key.h, 0, sizeof(key.h));
+
+ if (get_uint(&buf, &ttl) < 0)
+ goto out;
+ if (ttl == 0)
+ goto out;
+ key.h.expiry_time = ttl + seconds_since_boot();
+
+ ret = -ENOMEM;
+ item = nfs_dns_lookup(cd, &key);
+ if (item == NULL)
+ goto out;
+
+ if (key.addrlen == 0)
+ set_bit(CACHE_NEGATIVE, &key.h.flags);
+
+ item = nfs_dns_update(cd, &key, item);
+ if (item == NULL)
+ goto out;
+
+ ret = 0;
+ cache_put(&item->h, cd);
+out:
+ return ret;
+}
+
+static int do_cache_lookup(struct cache_detail *cd,
+ struct nfs_dns_ent *key,
+ struct nfs_dns_ent **item,
+ struct nfs_cache_defer_req *dreq)
+{
+ int ret = -ENOMEM;
+
+ *item = nfs_dns_lookup(cd, key);
+ if (*item) {
+ ret = cache_check(cd, &(*item)->h, &dreq->req);
+ if (ret)
+ *item = NULL;
+ }
+ return ret;
+}
+
+static int do_cache_lookup_nowait(struct cache_detail *cd,
+ struct nfs_dns_ent *key,
+ struct nfs_dns_ent **item)
+{
+ int ret = -ENOMEM;
+
+ *item = nfs_dns_lookup(cd, key);
+ if (!*item)
+ goto out_err;
+ ret = -ETIMEDOUT;
+ if (!test_bit(CACHE_VALID, &(*item)->h.flags)
+ || (*item)->h.expiry_time < seconds_since_boot()
+ || cd->flush_time > (*item)->h.last_refresh)
+ goto out_put;
+ ret = -ENOENT;
+ if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
+ goto out_put;
+ return 0;
+out_put:
+ cache_put(&(*item)->h, cd);
+out_err:
+ *item = NULL;
+ return ret;
+}
+
+static int do_cache_lookup_wait(struct cache_detail *cd,
+ struct nfs_dns_ent *key,
+ struct nfs_dns_ent **item)
+{
+ struct nfs_cache_defer_req *dreq;
+ int ret = -ENOMEM;
+
+ dreq = nfs_cache_defer_req_alloc();
+ if (!dreq)
+ goto out;
+ ret = do_cache_lookup(cd, key, item, dreq);
+ if (ret == -EAGAIN) {
+ ret = nfs_cache_wait_for_upcall(dreq);
+ if (!ret)
+ ret = do_cache_lookup_nowait(cd, key, item);
+ }
+ nfs_cache_defer_req_put(dreq);
+out:
+ return ret;
+}
+
+ssize_t nfs_dns_resolve_name(struct net *net, char *name,
+ size_t namelen, struct sockaddr_storage *ss, size_t salen)
+{
+ struct nfs_dns_ent key = {
+ .hostname = name,
+ .namelen = namelen,
+ };
+ struct nfs_dns_ent *item = NULL;
+ ssize_t ret;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ ret = do_cache_lookup_wait(nn->nfs_dns_resolve, &key, &item);
+ if (ret == 0) {
+ if (salen >= item->addrlen) {
+ memcpy(ss, &item->addr, item->addrlen);
+ ret = item->addrlen;
+ } else
+ ret = -EOVERFLOW;
+ cache_put(&item->h, nn->nfs_dns_resolve);
+ } else if (ret == -ENOENT)
+ ret = -ESRCH;
+ return ret;
+}
+
+static struct cache_detail nfs_dns_resolve_template = {
+ .owner = THIS_MODULE,
+ .hash_size = NFS_DNS_HASHTBL_SIZE,
+ .name = "dns_resolve",
+ .cache_put = nfs_dns_ent_put,
+ .cache_upcall = nfs_dns_upcall,
+ .cache_request = nfs_dns_request,
+ .cache_parse = nfs_dns_parse,
+ .cache_show = nfs_dns_show,
+ .match = nfs_dns_match,
+ .init = nfs_dns_ent_init,
+ .update = nfs_dns_ent_update,
+ .alloc = nfs_dns_ent_alloc,
+};
+
+
+int nfs_dns_resolver_cache_init(struct net *net)
+{
+ int err;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ nn->nfs_dns_resolve = cache_create_net(&nfs_dns_resolve_template, net);
+ if (IS_ERR(nn->nfs_dns_resolve))
+ return PTR_ERR(nn->nfs_dns_resolve);
+
+ err = nfs_cache_register_net(net, nn->nfs_dns_resolve);
+ if (err)
+ goto err_reg;
+ return 0;
+
+err_reg:
+ cache_destroy_net(nn->nfs_dns_resolve, net);
+ return err;
+}
+
+void nfs_dns_resolver_cache_destroy(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ nfs_cache_unregister_net(net, nn->nfs_dns_resolve);
+ cache_destroy_net(nn->nfs_dns_resolve, net);
+}
+
+static int nfs4_dns_net_init(struct net *net)
+{
+ return nfs_dns_resolver_cache_init(net);
+}
+
+static void nfs4_dns_net_exit(struct net *net)
+{
+ nfs_dns_resolver_cache_destroy(net);
+}
+
+static struct pernet_operations nfs4_dns_resolver_ops = {
+ .init = nfs4_dns_net_init,
+ .exit = nfs4_dns_net_exit,
+};
+
+static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct super_block *sb = ptr;
+ struct net *net = sb->s_fs_info;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct cache_detail *cd = nn->nfs_dns_resolve;
+ int ret = 0;
+
+ if (cd == NULL)
+ return 0;
+
+ if (!try_module_get(THIS_MODULE))
+ return 0;
+
+ switch (event) {
+ case RPC_PIPEFS_MOUNT:
+ ret = nfs_cache_register_sb(sb, cd);
+ break;
+ case RPC_PIPEFS_UMOUNT:
+ nfs_cache_unregister_sb(sb, cd);
+ break;
+ default:
+ ret = -ENOTSUPP;
+ break;
+ }
+ module_put(THIS_MODULE);
+ return ret;
+}
+
+static struct notifier_block nfs_dns_resolver_block = {
+ .notifier_call = rpc_pipefs_event,
+};
+
+int nfs_dns_resolver_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&nfs4_dns_resolver_ops);
+ if (err < 0)
+ goto out;
+ err = rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
+ if (err < 0)
+ goto out1;
+ return 0;
+out1:
+ unregister_pernet_subsys(&nfs4_dns_resolver_ops);
+out:
+ return err;
+}
+
+void nfs_dns_resolver_destroy(void)
+{
+ rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block);
+ unregister_pernet_subsys(&nfs4_dns_resolver_ops);
+}
+#endif
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
new file mode 100644
index 0000000000..fe3b172c4d
--- /dev/null
+++ b/fs/nfs/dns_resolve.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Resolve DNS hostnames into valid ip addresses
+ */
+#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H
+#define __LINUX_FS_NFS_DNS_RESOLVE_H
+
+#define NFS_DNS_HOSTNAME_MAXLEN (128)
+
+
+#ifdef CONFIG_NFS_USE_KERNEL_DNS
+static inline int nfs_dns_resolver_init(void)
+{
+ return 0;
+}
+
+static inline void nfs_dns_resolver_destroy(void)
+{}
+
+static inline int nfs_dns_resolver_cache_init(struct net *net)
+{
+ return 0;
+}
+
+static inline void nfs_dns_resolver_cache_destroy(struct net *net)
+{}
+#else
+extern int nfs_dns_resolver_init(void);
+extern void nfs_dns_resolver_destroy(void);
+extern int nfs_dns_resolver_cache_init(struct net *net);
+extern void nfs_dns_resolver_cache_destroy(struct net *net);
+#endif
+
+extern ssize_t nfs_dns_resolve_name(struct net *net, char *name,
+ size_t namelen, struct sockaddr_storage *sa, size_t salen);
+
+#endif
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
new file mode 100644
index 0000000000..be686b8e0c
--- /dev/null
+++ b/fs/nfs/export.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2015, Primary Data, Inc. All rights reserved.
+ *
+ * Tao Peng <bergwolf@primarydata.com>
+ */
+#include <linux/dcache.h>
+#include <linux/exportfs.h>
+#include <linux/nfs.h>
+#include <linux/nfs_fs.h>
+
+#include "internal.h"
+#include "nfstrace.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+enum {
+ FILEID_HIGH_OFF = 0, /* inode fileid high */
+ FILEID_LOW_OFF, /* inode fileid low */
+ FILE_I_TYPE_OFF, /* inode type */
+ EMBED_FH_OFF /* embeded server fh */
+};
+
+
+static struct nfs_fh *nfs_exp_embedfh(__u32 *p)
+{
+ return (struct nfs_fh *)(p + EMBED_FH_OFF);
+}
+
+/*
+ * Let's break subtree checking for now... otherwise we'll have to embed parent fh
+ * but there might not be enough space.
+ */
+static int
+nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent)
+{
+ struct nfs_fh *server_fh = NFS_FH(inode);
+ struct nfs_fh *clnt_fh = nfs_exp_embedfh(p);
+ size_t fh_size = offsetof(struct nfs_fh, data) + server_fh->size;
+ int len = EMBED_FH_OFF + XDR_QUADLEN(fh_size);
+
+ dprintk("%s: max fh len %d inode %p parent %p",
+ __func__, *max_len, inode, parent);
+
+ if (*max_len < len) {
+ dprintk("%s: fh len %d too small, required %d\n",
+ __func__, *max_len, len);
+ *max_len = len;
+ return FILEID_INVALID;
+ }
+
+ p[FILEID_HIGH_OFF] = NFS_FILEID(inode) >> 32;
+ p[FILEID_LOW_OFF] = NFS_FILEID(inode);
+ p[FILE_I_TYPE_OFF] = inode->i_mode & S_IFMT;
+ p[len - 1] = 0; /* Padding */
+ nfs_copy_fh(clnt_fh, server_fh);
+ *max_len = len;
+ dprintk("%s: result fh fileid %llu mode %u size %d\n",
+ __func__, NFS_FILEID(inode), inode->i_mode, *max_len);
+ return *max_len;
+}
+
+static struct dentry *
+nfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ struct nfs_fattr *fattr = NULL;
+ struct nfs_fh *server_fh = nfs_exp_embedfh(fid->raw);
+ size_t fh_size = offsetof(struct nfs_fh, data) + server_fh->size;
+ const struct nfs_rpc_ops *rpc_ops;
+ struct dentry *dentry;
+ struct inode *inode;
+ int len = EMBED_FH_OFF + XDR_QUADLEN(fh_size);
+ u32 *p = fid->raw;
+ int ret;
+
+ /* NULL translates to ESTALE */
+ if (fh_len < len || fh_type != len)
+ return NULL;
+
+ fattr = nfs_alloc_fattr_with_label(NFS_SB(sb));
+ if (fattr == NULL) {
+ dentry = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ fattr->fileid = ((u64)p[FILEID_HIGH_OFF] << 32) + p[FILEID_LOW_OFF];
+ fattr->mode = p[FILE_I_TYPE_OFF];
+ fattr->valid |= NFS_ATTR_FATTR_FILEID | NFS_ATTR_FATTR_TYPE;
+
+ dprintk("%s: fileid %llu mode %d\n", __func__, fattr->fileid, fattr->mode);
+
+ inode = nfs_ilookup(sb, fattr, server_fh);
+ if (inode)
+ goto out_found;
+
+ rpc_ops = NFS_SB(sb)->nfs_client->rpc_ops;
+ ret = rpc_ops->getattr(NFS_SB(sb), server_fh, fattr, NULL);
+ if (ret) {
+ dprintk("%s: getattr failed %d\n", __func__, ret);
+ trace_nfs_fh_to_dentry(sb, server_fh, fattr->fileid, ret);
+ dentry = ERR_PTR(ret);
+ goto out_free_fattr;
+ }
+
+ inode = nfs_fhget(sb, server_fh, fattr);
+
+out_found:
+ dentry = d_obtain_alias(inode);
+out_free_fattr:
+ nfs_free_fattr(fattr);
+out:
+ return dentry;
+}
+
+static struct dentry *
+nfs_get_parent(struct dentry *dentry)
+{
+ int ret;
+ struct inode *inode = d_inode(dentry), *pinode;
+ struct super_block *sb = inode->i_sb;
+ struct nfs_server *server = NFS_SB(sb);
+ struct nfs_fattr *fattr = NULL;
+ struct dentry *parent;
+ struct nfs_rpc_ops const *ops = server->nfs_client->rpc_ops;
+ struct nfs_fh fh;
+
+ if (!ops->lookupp)
+ return ERR_PTR(-EACCES);
+
+ fattr = nfs_alloc_fattr_with_label(server);
+ if (fattr == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ ret = ops->lookupp(inode, &fh, fattr);
+ if (ret) {
+ parent = ERR_PTR(ret);
+ goto out;
+ }
+
+ pinode = nfs_fhget(sb, &fh, fattr);
+ parent = d_obtain_alias(pinode);
+out:
+ nfs_free_fattr(fattr);
+ return parent;
+}
+
+const struct export_operations nfs_export_ops = {
+ .encode_fh = nfs_encode_fh,
+ .fh_to_dentry = nfs_fh_to_dentry,
+ .get_parent = nfs_get_parent,
+ .flags = EXPORT_OP_NOWCC |
+ EXPORT_OP_NOSUBTREECHK |
+ EXPORT_OP_CLOSE_BEFORE_UNLINK |
+ EXPORT_OP_REMOTE_FS |
+ EXPORT_OP_NOATOMIC_ATTR |
+ EXPORT_OP_FLUSH_ON_CLOSE,
+};
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
new file mode 100644
index 0000000000..3f97688104
--- /dev/null
+++ b/fs/nfs/file.c
@@ -0,0 +1,904 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/file.c
+ *
+ * Copyright (C) 1992 Rick Sladkey
+ *
+ * Changes Copyright (C) 1994 by Florian La Roche
+ * - Do not copy data too often around in the kernel.
+ * - In nfs_file_read the return value of kmalloc wasn't checked.
+ * - Put in a better version of read look-ahead buffering. Original idea
+ * and implementation by Wai S Kok elekokws@ee.nus.sg.
+ *
+ * Expire cache on write to a file by Wai S Kok (Oct 1994).
+ *
+ * Total rewrite of read side for new NFS buffer cache.. Linus.
+ *
+ * nfs regular file handling functions
+ */
+
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/gfp.h>
+#include <linux/swap.h>
+
+#include <linux/uaccess.h>
+#include <linux/filelock.h>
+
+#include "delegation.h"
+#include "internal.h"
+#include "iostat.h"
+#include "fscache.h"
+#include "pnfs.h"
+
+#include "nfstrace.h"
+
+#define NFSDBG_FACILITY NFSDBG_FILE
+
+static const struct vm_operations_struct nfs_file_vm_ops;
+
+int nfs_check_flags(int flags)
+{
+ if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT))
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_check_flags);
+
+/*
+ * Open file
+ */
+static int
+nfs_file_open(struct inode *inode, struct file *filp)
+{
+ int res;
+
+ dprintk("NFS: open file(%pD2)\n", filp);
+
+ nfs_inc_stats(inode, NFSIOS_VFSOPEN);
+ res = nfs_check_flags(filp->f_flags);
+ if (res)
+ return res;
+
+ res = nfs_open(inode, filp);
+ if (res == 0)
+ filp->f_mode |= FMODE_CAN_ODIRECT;
+ return res;
+}
+
+int
+nfs_file_release(struct inode *inode, struct file *filp)
+{
+ dprintk("NFS: release(%pD2)\n", filp);
+
+ nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
+ nfs_file_clear_open_context(filp);
+ nfs_fscache_release_file(inode, filp);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_file_release);
+
+/**
+ * nfs_revalidate_file_size - Revalidate the file size
+ * @inode: pointer to inode struct
+ * @filp: pointer to struct file
+ *
+ * Revalidates the file length. This is basically a wrapper around
+ * nfs_revalidate_inode() that takes into account the fact that we may
+ * have cached writes (in which case we don't care about the server's
+ * idea of what the file length is), or O_DIRECT (in which case we
+ * shouldn't trust the cache).
+ */
+static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+
+ if (filp->f_flags & O_DIRECT)
+ goto force_reval;
+ if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_SIZE))
+ goto force_reval;
+ return 0;
+force_reval:
+ return __nfs_revalidate_inode(server, inode);
+}
+
+loff_t nfs_file_llseek(struct file *filp, loff_t offset, int whence)
+{
+ dprintk("NFS: llseek file(%pD2, %lld, %d)\n",
+ filp, offset, whence);
+
+ /*
+ * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
+ * the cached file length
+ */
+ if (whence != SEEK_SET && whence != SEEK_CUR) {
+ struct inode *inode = filp->f_mapping->host;
+
+ int retval = nfs_revalidate_file_size(inode, filp);
+ if (retval < 0)
+ return (loff_t)retval;
+ }
+
+ return generic_file_llseek(filp, offset, whence);
+}
+EXPORT_SYMBOL_GPL(nfs_file_llseek);
+
+/*
+ * Flush all dirty pages, and check for write errors.
+ */
+static int
+nfs_file_flush(struct file *file, fl_owner_t id)
+{
+ struct inode *inode = file_inode(file);
+ errseq_t since;
+
+ dprintk("NFS: flush(%pD2)\n", file);
+
+ nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
+ if ((file->f_mode & FMODE_WRITE) == 0)
+ return 0;
+
+ /* Flush writes to the server and return any errors */
+ since = filemap_sample_wb_err(file->f_mapping);
+ nfs_wb_all(inode);
+ return filemap_check_wb_err(file->f_mapping, since);
+}
+
+ssize_t
+nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t result;
+
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return nfs_file_direct_read(iocb, to, false);
+
+ dprintk("NFS: read(%pD2, %zu@%lu)\n",
+ iocb->ki_filp,
+ iov_iter_count(to), (unsigned long) iocb->ki_pos);
+
+ nfs_start_io_read(inode);
+ result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
+ if (!result) {
+ result = generic_file_read_iter(iocb, to);
+ if (result > 0)
+ nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
+ }
+ nfs_end_io_read(inode);
+ return result;
+}
+EXPORT_SYMBOL_GPL(nfs_file_read);
+
+ssize_t
+nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ struct inode *inode = file_inode(in);
+ ssize_t result;
+
+ dprintk("NFS: splice_read(%pD2, %zu@%llu)\n", in, len, *ppos);
+
+ nfs_start_io_read(inode);
+ result = nfs_revalidate_mapping(inode, in->f_mapping);
+ if (!result) {
+ result = filemap_splice_read(in, ppos, pipe, len, flags);
+ if (result > 0)
+ nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
+ }
+ nfs_end_io_read(inode);
+ return result;
+}
+EXPORT_SYMBOL_GPL(nfs_file_splice_read);
+
+int
+nfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct inode *inode = file_inode(file);
+ int status;
+
+ dprintk("NFS: mmap(%pD2)\n", file);
+
+ /* Note: generic_file_mmap() returns ENOSYS on nommu systems
+ * so we call that before revalidating the mapping
+ */
+ status = generic_file_mmap(file, vma);
+ if (!status) {
+ vma->vm_ops = &nfs_file_vm_ops;
+ status = nfs_revalidate_mapping(inode, file->f_mapping);
+ }
+ return status;
+}
+EXPORT_SYMBOL_GPL(nfs_file_mmap);
+
+/*
+ * Flush any dirty pages for this process, and check for write errors.
+ * The return status from this call provides a reliable indication of
+ * whether any write errors occurred for this process.
+ */
+static int
+nfs_file_fsync_commit(struct file *file, int datasync)
+{
+ struct inode *inode = file_inode(file);
+ int ret, ret2;
+
+ dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
+
+ nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
+ ret = nfs_commit_inode(inode, FLUSH_SYNC);
+ ret2 = file_check_and_advance_wb_err(file);
+ if (ret2 < 0)
+ return ret2;
+ return ret;
+}
+
+int
+nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ struct inode *inode = file_inode(file);
+ struct nfs_inode *nfsi = NFS_I(inode);
+ long save_nredirtied = atomic_long_read(&nfsi->redirtied_pages);
+ long nredirtied;
+ int ret;
+
+ trace_nfs_fsync_enter(inode);
+
+ for (;;) {
+ ret = file_write_and_wait_range(file, start, end);
+ if (ret != 0)
+ break;
+ ret = nfs_file_fsync_commit(file, datasync);
+ if (ret != 0)
+ break;
+ ret = pnfs_sync_inode(inode, !!datasync);
+ if (ret != 0)
+ break;
+ nredirtied = atomic_long_read(&nfsi->redirtied_pages);
+ if (nredirtied == save_nredirtied)
+ break;
+ save_nredirtied = nredirtied;
+ }
+
+ trace_nfs_fsync_exit(inode, ret);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_file_fsync);
+
+/*
+ * Decide whether a read/modify/write cycle may be more efficient
+ * then a modify/write/read cycle when writing to a page in the
+ * page cache.
+ *
+ * Some pNFS layout drivers can only read/write at a certain block
+ * granularity like all block devices and therefore we must perform
+ * read/modify/write whenever a page hasn't read yet and the data
+ * to be written there is not aligned to a block boundary and/or
+ * smaller than the block size.
+ *
+ * The modify/write/read cycle may occur if a page is read before
+ * being completely filled by the writer. In this situation, the
+ * page must be completely written to stable storage on the server
+ * before it can be refilled by reading in the page from the server.
+ * This can lead to expensive, small, FILE_SYNC mode writes being
+ * done.
+ *
+ * It may be more efficient to read the page first if the file is
+ * open for reading in addition to writing, the page is not marked
+ * as Uptodate, it is not dirty or waiting to be committed,
+ * indicating that it was previously allocated and then modified,
+ * that there were valid bytes of data in that range of the file,
+ * and that the new data won't completely replace the old data in
+ * that range of the file.
+ */
+static bool nfs_folio_is_full_write(struct folio *folio, loff_t pos,
+ unsigned int len)
+{
+ unsigned int pglen = nfs_folio_length(folio);
+ unsigned int offset = offset_in_folio(folio, pos);
+ unsigned int end = offset + len;
+
+ return !pglen || (end >= pglen && !offset);
+}
+
+static bool nfs_want_read_modify_write(struct file *file, struct folio *folio,
+ loff_t pos, unsigned int len)
+{
+ /*
+ * Up-to-date pages, those with ongoing or full-page write
+ * don't need read/modify/write
+ */
+ if (folio_test_uptodate(folio) || folio_test_private(folio) ||
+ nfs_folio_is_full_write(folio, pos, len))
+ return false;
+
+ if (pnfs_ld_read_whole_page(file_inode(file)))
+ return true;
+ /* Open for reading too? */
+ if (file->f_mode & FMODE_READ)
+ return true;
+ return false;
+}
+
+/*
+ * This does the "real" work of the write. We must allocate and lock the
+ * page to be sent back to the generic routine, which then copies the
+ * data from user space.
+ *
+ * If the writer ends up delaying the write, the writer needs to
+ * increment the page use counts until he is done with the page.
+ */
+static int nfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, struct page **pagep,
+ void **fsdata)
+{
+ struct folio *folio;
+ int once_thru = 0;
+ int ret;
+
+ dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n",
+ file, mapping->host->i_ino, len, (long long) pos);
+
+start:
+ folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ *pagep = &folio->page;
+
+ ret = nfs_flush_incompatible(file, folio);
+ if (ret) {
+ folio_unlock(folio);
+ folio_put(folio);
+ } else if (!once_thru &&
+ nfs_want_read_modify_write(file, folio, pos, len)) {
+ once_thru = 1;
+ ret = nfs_read_folio(file, folio);
+ folio_put(folio);
+ if (!ret)
+ goto start;
+ }
+ return ret;
+}
+
+static int nfs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct nfs_open_context *ctx = nfs_file_open_context(file);
+ struct folio *folio = page_folio(page);
+ unsigned offset = offset_in_folio(folio, pos);
+ int status;
+
+ dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n",
+ file, mapping->host->i_ino, len, (long long) pos);
+
+ /*
+ * Zero any uninitialised parts of the page, and then mark the page
+ * as up to date if it turns out that we're extending the file.
+ */
+ if (!folio_test_uptodate(folio)) {
+ size_t fsize = folio_size(folio);
+ unsigned pglen = nfs_folio_length(folio);
+ unsigned end = offset + copied;
+
+ if (pglen == 0) {
+ folio_zero_segments(folio, 0, offset, end, fsize);
+ folio_mark_uptodate(folio);
+ } else if (end >= pglen) {
+ folio_zero_segment(folio, end, fsize);
+ if (offset == 0)
+ folio_mark_uptodate(folio);
+ } else
+ folio_zero_segment(folio, pglen, fsize);
+ }
+
+ status = nfs_update_folio(file, folio, offset, copied);
+
+ folio_unlock(folio);
+ folio_put(folio);
+
+ if (status < 0)
+ return status;
+ NFS_I(mapping->host)->write_io += copied;
+
+ if (nfs_ctx_key_to_expire(ctx, mapping->host))
+ nfs_wb_all(mapping->host);
+
+ return copied;
+}
+
+/*
+ * Partially or wholly invalidate a page
+ * - Release the private state associated with a page if undergoing complete
+ * page invalidation
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ */
+static void nfs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
+{
+ struct inode *inode = folio_file_mapping(folio)->host;
+ dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n",
+ folio->index, offset, length);
+
+ if (offset != 0 || length < folio_size(folio))
+ return;
+ /* Cancel any unstarted writes on this page */
+ nfs_wb_folio_cancel(inode, folio);
+ folio_wait_fscache(folio);
+ trace_nfs_invalidate_folio(inode, folio);
+}
+
+/*
+ * Attempt to release the private state associated with a folio
+ * - Called if either private or fscache flags are set on the folio
+ * - Caller holds folio lock
+ * - Return true (may release folio) or false (may not)
+ */
+static bool nfs_release_folio(struct folio *folio, gfp_t gfp)
+{
+ dfprintk(PAGECACHE, "NFS: release_folio(%p)\n", folio);
+
+ /* If the private flag is set, then the folio is not freeable */
+ if (folio_test_private(folio)) {
+ if ((current_gfp_context(gfp) & GFP_KERNEL) != GFP_KERNEL ||
+ current_is_kswapd())
+ return false;
+ if (nfs_wb_folio(folio_file_mapping(folio)->host, folio) < 0)
+ return false;
+ }
+ return nfs_fscache_release_folio(folio, gfp);
+}
+
+static void nfs_check_dirty_writeback(struct folio *folio,
+ bool *dirty, bool *writeback)
+{
+ struct nfs_inode *nfsi;
+ struct address_space *mapping = folio->mapping;
+
+ /*
+ * Check if an unstable folio is currently being committed and
+ * if so, have the VM treat it as if the folio is under writeback
+ * so it will not block due to folios that will shortly be freeable.
+ */
+ nfsi = NFS_I(mapping->host);
+ if (atomic_read(&nfsi->commit_info.rpcs_out)) {
+ *writeback = true;
+ return;
+ }
+
+ /*
+ * If the private flag is set, then the folio is not freeable
+ * and as the inode is not being committed, it's not going to
+ * be cleaned in the near future so treat it as dirty
+ */
+ if (folio_test_private(folio))
+ *dirty = true;
+}
+
+/*
+ * Attempt to clear the private state associated with a page when an error
+ * occurs that requires the cached contents of an inode to be written back or
+ * destroyed
+ * - Called if either PG_private or fscache is set on the page
+ * - Caller holds page lock
+ * - Return 0 if successful, -error otherwise
+ */
+static int nfs_launder_folio(struct folio *folio)
+{
+ struct inode *inode = folio->mapping->host;
+ int ret;
+
+ dfprintk(PAGECACHE, "NFS: launder_folio(%ld, %llu)\n",
+ inode->i_ino, folio_pos(folio));
+
+ folio_wait_fscache(folio);
+ ret = nfs_wb_folio(inode, folio);
+ trace_nfs_launder_folio_done(inode, folio, ret);
+ return ret;
+}
+
+static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
+ sector_t *span)
+{
+ unsigned long blocks;
+ long long isize;
+ int ret;
+ struct inode *inode = file_inode(file);
+ struct rpc_clnt *clnt = NFS_CLIENT(inode);
+ struct nfs_client *cl = NFS_SERVER(inode)->nfs_client;
+
+ spin_lock(&inode->i_lock);
+ blocks = inode->i_blocks;
+ isize = inode->i_size;
+ spin_unlock(&inode->i_lock);
+ if (blocks*512 < isize) {
+ pr_warn("swap activate: swapfile has holes\n");
+ return -EINVAL;
+ }
+
+ ret = rpc_clnt_swap_activate(clnt);
+ if (ret)
+ return ret;
+ ret = add_swap_extent(sis, 0, sis->max, 0);
+ if (ret < 0) {
+ rpc_clnt_swap_deactivate(clnt);
+ return ret;
+ }
+
+ *span = sis->pages;
+
+ if (cl->rpc_ops->enable_swap)
+ cl->rpc_ops->enable_swap(inode);
+
+ sis->flags |= SWP_FS_OPS;
+ return ret;
+}
+
+static void nfs_swap_deactivate(struct file *file)
+{
+ struct inode *inode = file_inode(file);
+ struct rpc_clnt *clnt = NFS_CLIENT(inode);
+ struct nfs_client *cl = NFS_SERVER(inode)->nfs_client;
+
+ rpc_clnt_swap_deactivate(clnt);
+ if (cl->rpc_ops->disable_swap)
+ cl->rpc_ops->disable_swap(file_inode(file));
+}
+
+const struct address_space_operations nfs_file_aops = {
+ .read_folio = nfs_read_folio,
+ .readahead = nfs_readahead,
+ .dirty_folio = filemap_dirty_folio,
+ .writepage = nfs_writepage,
+ .writepages = nfs_writepages,
+ .write_begin = nfs_write_begin,
+ .write_end = nfs_write_end,
+ .invalidate_folio = nfs_invalidate_folio,
+ .release_folio = nfs_release_folio,
+ .migrate_folio = nfs_migrate_folio,
+ .launder_folio = nfs_launder_folio,
+ .is_dirty_writeback = nfs_check_dirty_writeback,
+ .error_remove_page = generic_error_remove_page,
+ .swap_activate = nfs_swap_activate,
+ .swap_deactivate = nfs_swap_deactivate,
+ .swap_rw = nfs_swap_rw,
+};
+
+/*
+ * Notification that a PTE pointing to an NFS page is about to be made
+ * writable, implying that someone is about to modify the page through a
+ * shared-writable mapping
+ */
+static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf)
+{
+ struct file *filp = vmf->vma->vm_file;
+ struct inode *inode = file_inode(filp);
+ unsigned pagelen;
+ vm_fault_t ret = VM_FAULT_NOPAGE;
+ struct address_space *mapping;
+ struct folio *folio = page_folio(vmf->page);
+
+ dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n",
+ filp, filp->f_mapping->host->i_ino,
+ (long long)folio_file_pos(folio));
+
+ sb_start_pagefault(inode->i_sb);
+
+ /* make sure the cache has finished storing the page */
+ if (folio_test_fscache(folio) &&
+ folio_wait_fscache_killable(folio) < 0) {
+ ret = VM_FAULT_RETRY;
+ goto out;
+ }
+
+ wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING,
+ nfs_wait_bit_killable,
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
+
+ folio_lock(folio);
+ mapping = folio_file_mapping(folio);
+ if (mapping != inode->i_mapping)
+ goto out_unlock;
+
+ folio_wait_writeback(folio);
+
+ pagelen = nfs_folio_length(folio);
+ if (pagelen == 0)
+ goto out_unlock;
+
+ ret = VM_FAULT_LOCKED;
+ if (nfs_flush_incompatible(filp, folio) == 0 &&
+ nfs_update_folio(filp, folio, 0, pagelen) == 0)
+ goto out;
+
+ ret = VM_FAULT_SIGBUS;
+out_unlock:
+ folio_unlock(folio);
+out:
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+
+static const struct vm_operations_struct nfs_file_vm_ops = {
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = nfs_vm_page_mkwrite,
+};
+
+ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ unsigned int mntflags = NFS_SERVER(inode)->flags;
+ ssize_t result, written;
+ errseq_t since;
+ int error;
+
+ result = nfs_key_timeout_notify(file, inode);
+ if (result)
+ return result;
+
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return nfs_file_direct_write(iocb, from, false);
+
+ dprintk("NFS: write(%pD2, %zu@%Ld)\n",
+ file, iov_iter_count(from), (long long) iocb->ki_pos);
+
+ if (IS_SWAPFILE(inode))
+ goto out_swapfile;
+ /*
+ * O_APPEND implies that we must revalidate the file length.
+ */
+ if (iocb->ki_flags & IOCB_APPEND || iocb->ki_pos > i_size_read(inode)) {
+ result = nfs_revalidate_file_size(inode, file);
+ if (result)
+ return result;
+ }
+
+ nfs_clear_invalid_mapping(file->f_mapping);
+
+ since = filemap_sample_wb_err(file->f_mapping);
+ nfs_start_io_write(inode);
+ result = generic_write_checks(iocb, from);
+ if (result > 0)
+ result = generic_perform_write(iocb, from);
+ nfs_end_io_write(inode);
+ if (result <= 0)
+ goto out;
+
+ written = result;
+ nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
+
+ if (mntflags & NFS_MOUNT_WRITE_EAGER) {
+ result = filemap_fdatawrite_range(file->f_mapping,
+ iocb->ki_pos - written,
+ iocb->ki_pos - 1);
+ if (result < 0)
+ goto out;
+ }
+ if (mntflags & NFS_MOUNT_WRITE_WAIT) {
+ filemap_fdatawait_range(file->f_mapping,
+ iocb->ki_pos - written,
+ iocb->ki_pos - 1);
+ }
+ result = generic_write_sync(iocb, written);
+ if (result < 0)
+ return result;
+
+out:
+ /* Return error values */
+ error = filemap_check_wb_err(file->f_mapping, since);
+ switch (error) {
+ default:
+ break;
+ case -EDQUOT:
+ case -EFBIG:
+ case -ENOSPC:
+ nfs_wb_all(inode);
+ error = file_check_and_advance_wb_err(file);
+ if (error < 0)
+ result = error;
+ }
+ return result;
+
+out_swapfile:
+ printk(KERN_INFO "NFS: attempt to write to active swap file!\n");
+ return -ETXTBSY;
+}
+EXPORT_SYMBOL_GPL(nfs_file_write);
+
+static int
+do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
+{
+ struct inode *inode = filp->f_mapping->host;
+ int status = 0;
+ unsigned int saved_type = fl->fl_type;
+
+ /* Try local locking first */
+ posix_test_lock(filp, fl);
+ if (fl->fl_type != F_UNLCK) {
+ /* found a conflict */
+ goto out;
+ }
+ fl->fl_type = saved_type;
+
+ if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ goto out_noconflict;
+
+ if (is_local)
+ goto out_noconflict;
+
+ status = NFS_PROTO(inode)->lock(filp, cmd, fl);
+out:
+ return status;
+out_noconflict:
+ fl->fl_type = F_UNLCK;
+ goto out;
+}
+
+static int
+do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
+{
+ struct inode *inode = filp->f_mapping->host;
+ struct nfs_lock_context *l_ctx;
+ int status;
+
+ /*
+ * Flush all pending writes before doing anything
+ * with locks..
+ */
+ nfs_wb_all(inode);
+
+ l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
+ if (!IS_ERR(l_ctx)) {
+ status = nfs_iocounter_wait(l_ctx);
+ nfs_put_lock_context(l_ctx);
+ /* NOTE: special case
+ * If we're signalled while cleaning up locks on process exit, we
+ * still need to complete the unlock.
+ */
+ if (status < 0 && !(fl->fl_flags & FL_CLOSE))
+ return status;
+ }
+
+ /*
+ * Use local locking if mounted with "-onolock" or with appropriate
+ * "-olocal_lock="
+ */
+ if (!is_local)
+ status = NFS_PROTO(inode)->lock(filp, cmd, fl);
+ else
+ status = locks_lock_file_wait(filp, fl);
+ return status;
+}
+
+static int
+do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
+{
+ struct inode *inode = filp->f_mapping->host;
+ int status;
+
+ /*
+ * Flush all pending writes before doing anything
+ * with locks..
+ */
+ status = nfs_sync_mapping(filp->f_mapping);
+ if (status != 0)
+ goto out;
+
+ /*
+ * Use local locking if mounted with "-onolock" or with appropriate
+ * "-olocal_lock="
+ */
+ if (!is_local)
+ status = NFS_PROTO(inode)->lock(filp, cmd, fl);
+ else
+ status = locks_lock_file_wait(filp, fl);
+ if (status < 0)
+ goto out;
+
+ /*
+ * Invalidate cache to prevent missing any changes. If
+ * the file is mapped, clear the page cache as well so
+ * those mappings will be loaded.
+ *
+ * This makes locking act as a cache coherency point.
+ */
+ nfs_sync_mapping(filp->f_mapping);
+ if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) {
+ nfs_zap_caches(inode);
+ if (mapping_mapped(filp->f_mapping))
+ nfs_revalidate_mapping(inode, filp->f_mapping);
+ }
+out:
+ return status;
+}
+
+/*
+ * Lock a (portion of) a file
+ */
+int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+ struct inode *inode = filp->f_mapping->host;
+ int ret = -ENOLCK;
+ int is_local = 0;
+
+ dprintk("NFS: lock(%pD2, t=%x, fl=%x, r=%lld:%lld)\n",
+ filp, fl->fl_type, fl->fl_flags,
+ (long long)fl->fl_start, (long long)fl->fl_end);
+
+ nfs_inc_stats(inode, NFSIOS_VFSLOCK);
+
+ if (fl->fl_flags & FL_RECLAIM)
+ return -ENOGRACE;
+
+ if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
+ is_local = 1;
+
+ if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
+ ret = NFS_PROTO(inode)->lock_check_bounds(fl);
+ if (ret < 0)
+ goto out_err;
+ }
+
+ if (IS_GETLK(cmd))
+ ret = do_getlk(filp, cmd, fl, is_local);
+ else if (fl->fl_type == F_UNLCK)
+ ret = do_unlk(filp, cmd, fl, is_local);
+ else
+ ret = do_setlk(filp, cmd, fl, is_local);
+out_err:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_lock);
+
+/*
+ * Lock a (portion of) a file
+ */
+int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
+{
+ struct inode *inode = filp->f_mapping->host;
+ int is_local = 0;
+
+ dprintk("NFS: flock(%pD2, t=%x, fl=%x)\n",
+ filp, fl->fl_type, fl->fl_flags);
+
+ if (!(fl->fl_flags & FL_FLOCK))
+ return -ENOLCK;
+
+ if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
+ is_local = 1;
+
+ /* We're simulating flock() locks using posix locks on the server */
+ if (fl->fl_type == F_UNLCK)
+ return do_unlk(filp, cmd, fl, is_local);
+ return do_setlk(filp, cmd, fl, is_local);
+}
+EXPORT_SYMBOL_GPL(nfs_flock);
+
+const struct file_operations nfs_file_operations = {
+ .llseek = nfs_file_llseek,
+ .read_iter = nfs_file_read,
+ .write_iter = nfs_file_write,
+ .mmap = nfs_file_mmap,
+ .open = nfs_file_open,
+ .flush = nfs_file_flush,
+ .release = nfs_file_release,
+ .fsync = nfs_file_fsync,
+ .lock = nfs_lock,
+ .flock = nfs_flock,
+ .splice_read = nfs_file_splice_read,
+ .splice_write = iter_file_splice_write,
+ .check_flags = nfs_check_flags,
+ .setlease = simple_nosetlease,
+};
+EXPORT_SYMBOL_GPL(nfs_file_operations);
diff --git a/fs/nfs/filelayout/Makefile b/fs/nfs/filelayout/Makefile
new file mode 100644
index 0000000000..de056312d3
--- /dev/null
+++ b/fs/nfs/filelayout/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the pNFS Files Layout Driver kernel module
+#
+obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
+nfs_layout_nfsv41_files-y := filelayout.o filelayoutdev.o
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
new file mode 100644
index 0000000000..ce8f8934bc
--- /dev/null
+++ b/fs/nfs/filelayout/filelayout.c
@@ -0,0 +1,1162 @@
+/*
+ * Module for the pnfs nfs4 file layout driver.
+ * Defines all I/O and Policy interface operations, plus code
+ * to register itself with the pNFS client.
+ *
+ * Copyright (c) 2002
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
+ * Dean Hildebrand <dhildebz@umich.edu>
+ *
+ * Permission is granted to use, copy, create derivative works, and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the University of Michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. If
+ * the above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * This software is provided as is, without representation or warranty
+ * of any kind either express or implied, including without limitation
+ * the implied warranties of merchantability, fitness for a particular
+ * purpose, or noninfringement. The Regents of the University of
+ * Michigan shall not be liable for any damages, including special,
+ * indirect, incidental, or consequential damages, with respect to any
+ * claim arising out of or in connection with the use of the software,
+ * even if it has been or is hereafter advised of the possibility of
+ * such damages.
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+
+#include <linux/sunrpc/metrics.h>
+
+#include "../nfs4session.h"
+#include "../internal.h"
+#include "../delegation.h"
+#include "filelayout.h"
+#include "../nfs4trace.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
+MODULE_DESCRIPTION("The NFSv4 file layout driver");
+
+#define FILELAYOUT_POLL_RETRY_MAX (15*HZ)
+static const struct pnfs_commit_ops filelayout_commit_ops;
+
+static loff_t
+filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
+ loff_t offset)
+{
+ u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
+ u64 stripe_no;
+ u32 rem;
+
+ offset -= flseg->pattern_offset;
+ stripe_no = div_u64(offset, stripe_width);
+ div_u64_rem(offset, flseg->stripe_unit, &rem);
+
+ return stripe_no * flseg->stripe_unit + rem;
+}
+
+/* This function is used by the layout driver to calculate the
+ * offset of the file on the dserver based on whether the
+ * layout type is STRIPE_DENSE or STRIPE_SPARSE
+ */
+static loff_t
+filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
+ switch (flseg->stripe_type) {
+ case STRIPE_SPARSE:
+ return offset;
+
+ case STRIPE_DENSE:
+ return filelayout_get_dense_offset(flseg, offset);
+ }
+
+ BUG();
+}
+
+static void filelayout_reset_write(struct nfs_pgio_header *hdr)
+{
+ struct rpc_task *task = &hdr->task;
+
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ dprintk("%s Reset task %5u for i/o through MDS "
+ "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+ hdr->task.tk_pid,
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
+
+ task->tk_status = pnfs_write_done_resend_to_mds(hdr);
+ }
+}
+
+static void filelayout_reset_read(struct nfs_pgio_header *hdr)
+{
+ struct rpc_task *task = &hdr->task;
+
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ dprintk("%s Reset task %5u for i/o through MDS "
+ "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+ hdr->task.tk_pid,
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
+
+ task->tk_status = pnfs_read_done_resend_to_mds(hdr);
+ }
+}
+
+static int filelayout_async_handle_error(struct rpc_task *task,
+ struct nfs4_state *state,
+ struct nfs_client *clp,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_layout_hdr *lo = lseg->pls_layout;
+ struct inode *inode = lo->plh_inode;
+ struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
+ struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
+
+ if (task->tk_status >= 0)
+ return 0;
+
+ switch (task->tk_status) {
+ /* DS session errors */
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case -NFS4ERR_SEQ_FALSE_RETRY:
+ case -NFS4ERR_SEQ_MISORDERED:
+ dprintk("%s ERROR %d, Reset session. Exchangeid "
+ "flags 0x%x\n", __func__, task->tk_status,
+ clp->cl_exchange_flags);
+ nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
+ break;
+ case -NFS4ERR_DELAY:
+ case -NFS4ERR_GRACE:
+ rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
+ break;
+ case -NFS4ERR_RETRY_UNCACHED_REP:
+ break;
+ /* Invalidate Layout errors */
+ case -NFS4ERR_ACCESS:
+ case -NFS4ERR_PNFS_NO_LAYOUT:
+ case -ESTALE: /* mapped NFS4ERR_STALE */
+ case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */
+ case -EISDIR: /* mapped NFS4ERR_ISDIR */
+ case -NFS4ERR_FHEXPIRED:
+ case -NFS4ERR_WRONG_TYPE:
+ dprintk("%s Invalid layout error %d\n", __func__,
+ task->tk_status);
+ /*
+ * Destroy layout so new i/o will get a new layout.
+ * Layout will not be destroyed until all current lseg
+ * references are put. Mark layout as invalid to resend failed
+ * i/o and all i/o waiting on the slot table to the MDS until
+ * layout is destroyed and a new valid layout is obtained.
+ */
+ pnfs_destroy_layout(NFS_I(inode));
+ rpc_wake_up(&tbl->slot_tbl_waitq);
+ goto reset;
+ /* RPC connection errors */
+ case -ECONNREFUSED:
+ case -EHOSTDOWN:
+ case -EHOSTUNREACH:
+ case -ENETUNREACH:
+ case -EIO:
+ case -ETIMEDOUT:
+ case -EPIPE:
+ case -EPROTO:
+ case -ENODEV:
+ dprintk("%s DS connection error %d\n", __func__,
+ task->tk_status);
+ nfs4_mark_deviceid_unavailable(devid);
+ pnfs_error_mark_layout_for_return(inode, lseg);
+ pnfs_set_lo_fail(lseg);
+ rpc_wake_up(&tbl->slot_tbl_waitq);
+ fallthrough;
+ default:
+reset:
+ dprintk("%s Retry through MDS. Error %d\n", __func__,
+ task->tk_status);
+ return -NFS4ERR_RESET_TO_MDS;
+ }
+ task->tk_status = 0;
+ return -EAGAIN;
+}
+
+/* NFS_PROTO call done callback routines */
+
+static int filelayout_read_done_cb(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ int err;
+
+ trace_nfs4_pnfs_read(hdr, task->tk_status);
+ err = filelayout_async_handle_error(task, hdr->args.context->state,
+ hdr->ds_clp, hdr->lseg);
+
+ switch (err) {
+ case -NFS4ERR_RESET_TO_MDS:
+ filelayout_reset_read(hdr);
+ return task->tk_status;
+ case -EAGAIN:
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+/*
+ * We reference the rpc_cred of the first WRITE that triggers the need for
+ * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
+ * rfc5661 is not clear about which credential should be used.
+ */
+static void
+filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
+{
+ loff_t end_offs = 0;
+
+ if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
+ hdr->res.verf->committed == NFS_FILE_SYNC)
+ return;
+ if (hdr->res.verf->committed == NFS_DATA_SYNC)
+ end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
+
+ /* Note: if the write is unstable, don't set end_offs until commit */
+ pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
+ dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
+ (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
+}
+
+bool
+filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node)
+{
+ return filelayout_test_devid_invalid(node) ||
+ nfs4_test_deviceid_unavailable(node);
+}
+
+static bool
+filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_deviceid_node *node = FILELAYOUT_DEVID_NODE(lseg);
+
+ return filelayout_test_devid_unavailable(node);
+}
+
+/*
+ * Call ops for the async read/write cases
+ * In the case of dense layouts, the offset needs to be reset to its
+ * original value.
+ */
+static void filelayout_read_prepare(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
+ rpc_exit(task, -EIO);
+ return;
+ }
+ if (filelayout_reset_to_mds(hdr->lseg)) {
+ dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
+ filelayout_reset_read(hdr);
+ rpc_exit(task, 0);
+ return;
+ }
+ hdr->pgio_done_cb = filelayout_read_done_cb;
+
+ if (nfs4_setup_sequence(hdr->ds_clp,
+ &hdr->args.seq_args,
+ &hdr->res.seq_res,
+ task))
+ return;
+ if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+ hdr->args.lock_context, FMODE_READ) == -EIO)
+ rpc_exit(task, -EIO); /* lost lock, terminate I/O */
+}
+
+static void filelayout_read_call_done(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
+ task->tk_status == 0) {
+ nfs41_sequence_done(task, &hdr->res.seq_res);
+ return;
+ }
+
+ /* Note this may cause RPC to be resent */
+ hdr->mds_ops->rpc_call_done(task, data);
+}
+
+static void filelayout_read_count_stats(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
+}
+
+static int filelayout_write_done_cb(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ int err;
+
+ trace_nfs4_pnfs_write(hdr, task->tk_status);
+ err = filelayout_async_handle_error(task, hdr->args.context->state,
+ hdr->ds_clp, hdr->lseg);
+
+ switch (err) {
+ case -NFS4ERR_RESET_TO_MDS:
+ filelayout_reset_write(hdr);
+ return task->tk_status;
+ case -EAGAIN:
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
+ }
+
+ filelayout_set_layoutcommit(hdr);
+
+ /* zero out the fattr */
+ hdr->fattr.valid = 0;
+ if (task->tk_status >= 0)
+ nfs_writeback_update_inode(hdr);
+
+ return 0;
+}
+
+static int filelayout_commit_done_cb(struct rpc_task *task,
+ struct nfs_commit_data *data)
+{
+ int err;
+
+ trace_nfs4_pnfs_commit_ds(data, task->tk_status);
+ err = filelayout_async_handle_error(task, NULL, data->ds_clp,
+ data->lseg);
+
+ switch (err) {
+ case -NFS4ERR_RESET_TO_MDS:
+ pnfs_generic_prepare_to_resend_writes(data);
+ return -EAGAIN;
+ case -EAGAIN:
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
+ }
+
+ pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
+
+ return 0;
+}
+
+static void filelayout_write_prepare(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
+ rpc_exit(task, -EIO);
+ return;
+ }
+ if (filelayout_reset_to_mds(hdr->lseg)) {
+ dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
+ filelayout_reset_write(hdr);
+ rpc_exit(task, 0);
+ return;
+ }
+ if (nfs4_setup_sequence(hdr->ds_clp,
+ &hdr->args.seq_args,
+ &hdr->res.seq_res,
+ task))
+ return;
+ if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+ hdr->args.lock_context, FMODE_WRITE) == -EIO)
+ rpc_exit(task, -EIO); /* lost lock, terminate I/O */
+}
+
+static void filelayout_write_call_done(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
+ task->tk_status == 0) {
+ nfs41_sequence_done(task, &hdr->res.seq_res);
+ return;
+ }
+
+ /* Note this may cause RPC to be resent */
+ hdr->mds_ops->rpc_call_done(task, data);
+}
+
+static void filelayout_write_count_stats(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
+}
+
+static void filelayout_commit_prepare(struct rpc_task *task, void *data)
+{
+ struct nfs_commit_data *wdata = data;
+
+ nfs4_setup_sequence(wdata->ds_clp,
+ &wdata->args.seq_args,
+ &wdata->res.seq_res,
+ task);
+}
+
+static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
+{
+ struct nfs_commit_data *cdata = data;
+
+ rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics);
+}
+
+static const struct rpc_call_ops filelayout_read_call_ops = {
+ .rpc_call_prepare = filelayout_read_prepare,
+ .rpc_call_done = filelayout_read_call_done,
+ .rpc_count_stats = filelayout_read_count_stats,
+ .rpc_release = pnfs_generic_rw_release,
+};
+
+static const struct rpc_call_ops filelayout_write_call_ops = {
+ .rpc_call_prepare = filelayout_write_prepare,
+ .rpc_call_done = filelayout_write_call_done,
+ .rpc_count_stats = filelayout_write_count_stats,
+ .rpc_release = pnfs_generic_rw_release,
+};
+
+static const struct rpc_call_ops filelayout_commit_call_ops = {
+ .rpc_call_prepare = filelayout_commit_prepare,
+ .rpc_call_done = pnfs_generic_write_commit_done,
+ .rpc_count_stats = filelayout_commit_count_stats,
+ .rpc_release = pnfs_generic_commit_release,
+};
+
+static enum pnfs_try_status
+filelayout_read_pagelist(struct nfs_pgio_header *hdr)
+{
+ struct pnfs_layout_segment *lseg = hdr->lseg;
+ struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
+ loff_t offset = hdr->args.offset;
+ u32 j, idx;
+ struct nfs_fh *fh;
+
+ dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
+ __func__, hdr->inode->i_ino,
+ hdr->args.pgbase, (size_t)hdr->args.count, offset);
+
+ /* Retrieve the correct rpc_client for the byte range */
+ j = nfs4_fl_calc_j_index(lseg, offset);
+ idx = nfs4_fl_calc_ds_index(lseg, j);
+ ds = nfs4_fl_prepare_ds(lseg, idx);
+ if (!ds)
+ return PNFS_NOT_ATTEMPTED;
+
+ ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode);
+ if (IS_ERR(ds_clnt))
+ return PNFS_NOT_ATTEMPTED;
+
+ dprintk("%s USE DS: %s cl_count %d\n", __func__,
+ ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count));
+
+ /* No multipath support. Use first DS */
+ refcount_inc(&ds->ds_clp->cl_count);
+ hdr->ds_clp = ds->ds_clp;
+ hdr->ds_commit_idx = idx;
+ fh = nfs4_fl_select_ds_fh(lseg, j);
+ if (fh)
+ hdr->args.fh = fh;
+
+ hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
+ hdr->mds_offset = offset;
+
+ /* Perform an asynchronous read to ds */
+ nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
+ NFS_PROTO(hdr->inode), &filelayout_read_call_ops,
+ 0, RPC_TASK_SOFTCONN);
+ return PNFS_ATTEMPTED;
+}
+
+/* Perform async writes. */
+static enum pnfs_try_status
+filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
+{
+ struct pnfs_layout_segment *lseg = hdr->lseg;
+ struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
+ loff_t offset = hdr->args.offset;
+ u32 j, idx;
+ struct nfs_fh *fh;
+
+ /* Retrieve the correct rpc_client for the byte range */
+ j = nfs4_fl_calc_j_index(lseg, offset);
+ idx = nfs4_fl_calc_ds_index(lseg, j);
+ ds = nfs4_fl_prepare_ds(lseg, idx);
+ if (!ds)
+ return PNFS_NOT_ATTEMPTED;
+
+ ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode);
+ if (IS_ERR(ds_clnt))
+ return PNFS_NOT_ATTEMPTED;
+
+ dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d\n",
+ __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
+ offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count));
+
+ hdr->pgio_done_cb = filelayout_write_done_cb;
+ refcount_inc(&ds->ds_clp->cl_count);
+ hdr->ds_clp = ds->ds_clp;
+ hdr->ds_commit_idx = idx;
+ fh = nfs4_fl_select_ds_fh(lseg, j);
+ if (fh)
+ hdr->args.fh = fh;
+ hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
+
+ /* Perform an asynchronous write */
+ nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
+ NFS_PROTO(hdr->inode), &filelayout_write_call_ops,
+ sync, RPC_TASK_SOFTCONN);
+ return PNFS_ATTEMPTED;
+}
+
+static int
+filelayout_check_deviceid(struct pnfs_layout_hdr *lo,
+ struct nfs4_filelayout_segment *fl,
+ gfp_t gfp_flags)
+{
+ struct nfs4_deviceid_node *d;
+ struct nfs4_file_layout_dsaddr *dsaddr;
+ int status = -EINVAL;
+
+ /* Is the deviceid already set? If so, we're good. */
+ if (fl->dsaddr != NULL)
+ return 0;
+
+ /* find and reference the deviceid */
+ d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &fl->deviceid,
+ lo->plh_lc_cred, gfp_flags);
+ if (d == NULL)
+ goto out;
+
+ dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
+ /* Found deviceid is unavailable */
+ if (filelayout_test_devid_unavailable(&dsaddr->id_node))
+ goto out_put;
+
+ if (fl->first_stripe_index >= dsaddr->stripe_count) {
+ dprintk("%s Bad first_stripe_index %u\n",
+ __func__, fl->first_stripe_index);
+ goto out_put;
+ }
+
+ if ((fl->stripe_type == STRIPE_SPARSE &&
+ fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) ||
+ (fl->stripe_type == STRIPE_DENSE &&
+ fl->num_fh != dsaddr->stripe_count)) {
+ dprintk("%s num_fh %u not valid for given packing\n",
+ __func__, fl->num_fh);
+ goto out_put;
+ }
+ status = 0;
+
+ /*
+ * Atomic compare and xchange to ensure we don't scribble
+ * over a non-NULL pointer.
+ */
+ if (cmpxchg(&fl->dsaddr, NULL, dsaddr) != NULL)
+ goto out_put;
+out:
+ return status;
+out_put:
+ nfs4_fl_put_deviceid(dsaddr);
+ goto out;
+}
+
+/*
+ * filelayout_check_layout()
+ *
+ * Make sure layout segment parameters are sane WRT the device.
+ * At this point no generic layer initialization of the lseg has occurred,
+ * and nothing has been added to the layout_hdr cache.
+ *
+ */
+static int
+filelayout_check_layout(struct pnfs_layout_hdr *lo,
+ struct nfs4_filelayout_segment *fl,
+ struct nfs4_layoutget_res *lgr,
+ gfp_t gfp_flags)
+{
+ int status = -EINVAL;
+
+ dprintk("--> %s\n", __func__);
+
+ /* FIXME: remove this check when layout segment support is added */
+ if (lgr->range.offset != 0 ||
+ lgr->range.length != NFS4_MAX_UINT64) {
+ dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
+ __func__);
+ goto out;
+ }
+
+ if (fl->pattern_offset > lgr->range.offset) {
+ dprintk("%s pattern_offset %lld too large\n",
+ __func__, fl->pattern_offset);
+ goto out;
+ }
+
+ if (!fl->stripe_unit) {
+ dprintk("%s Invalid stripe unit (%u)\n",
+ __func__, fl->stripe_unit);
+ goto out;
+ }
+
+ status = 0;
+out:
+ dprintk("--> %s returns %d\n", __func__, status);
+ return status;
+}
+
+static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
+{
+ int i;
+
+ if (fl->fh_array) {
+ for (i = 0; i < fl->num_fh; i++) {
+ if (!fl->fh_array[i])
+ break;
+ kfree(fl->fh_array[i]);
+ }
+ kfree(fl->fh_array);
+ }
+ kfree(fl);
+}
+
+static int
+filelayout_decode_layout(struct pnfs_layout_hdr *flo,
+ struct nfs4_filelayout_segment *fl,
+ struct nfs4_layoutget_res *lgr,
+ gfp_t gfp_flags)
+{
+ struct xdr_stream stream;
+ struct xdr_buf buf;
+ struct page *scratch;
+ __be32 *p;
+ uint32_t nfl_util;
+ int i;
+
+ dprintk("%s: set_layout_map Begin\n", __func__);
+
+ scratch = alloc_page(gfp_flags);
+ if (!scratch)
+ return -ENOMEM;
+
+ xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
+ xdr_set_scratch_page(&stream, scratch);
+
+ /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
+ * num_fh (4) */
+ p = xdr_inline_decode(&stream, NFS4_DEVICEID4_SIZE + 20);
+ if (unlikely(!p))
+ goto out_err;
+
+ memcpy(&fl->deviceid, p, sizeof(fl->deviceid));
+ p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+ nfs4_print_deviceid(&fl->deviceid);
+
+ nfl_util = be32_to_cpup(p++);
+ if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
+ fl->commit_through_mds = 1;
+ if (nfl_util & NFL4_UFLG_DENSE)
+ fl->stripe_type = STRIPE_DENSE;
+ else
+ fl->stripe_type = STRIPE_SPARSE;
+ fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
+
+ fl->first_stripe_index = be32_to_cpup(p++);
+ p = xdr_decode_hyper(p, &fl->pattern_offset);
+ fl->num_fh = be32_to_cpup(p++);
+
+ dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
+ __func__, nfl_util, fl->num_fh, fl->first_stripe_index,
+ fl->pattern_offset);
+
+ /* Note that a zero value for num_fh is legal for STRIPE_SPARSE.
+ * Futher checking is done in filelayout_check_layout */
+ if (fl->num_fh >
+ max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT))
+ goto out_err;
+
+ if (fl->num_fh > 0) {
+ fl->fh_array = kcalloc(fl->num_fh, sizeof(fl->fh_array[0]),
+ gfp_flags);
+ if (!fl->fh_array)
+ goto out_err;
+ }
+
+ for (i = 0; i < fl->num_fh; i++) {
+ /* Do we want to use a mempool here? */
+ fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), gfp_flags);
+ if (!fl->fh_array[i])
+ goto out_err;
+
+ p = xdr_inline_decode(&stream, 4);
+ if (unlikely(!p))
+ goto out_err;
+ fl->fh_array[i]->size = be32_to_cpup(p++);
+ if (fl->fh_array[i]->size > NFS_MAXFHSIZE) {
+ printk(KERN_ERR "NFS: Too big fh %d received %d\n",
+ i, fl->fh_array[i]->size);
+ goto out_err;
+ }
+
+ p = xdr_inline_decode(&stream, fl->fh_array[i]->size);
+ if (unlikely(!p))
+ goto out_err;
+ memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
+ dprintk("DEBUG: %s: fh len %d\n", __func__,
+ fl->fh_array[i]->size);
+ }
+
+ __free_page(scratch);
+ return 0;
+
+out_err:
+ __free_page(scratch);
+ return -EIO;
+}
+
+static void
+filelayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+
+ dprintk("--> %s\n", __func__);
+ if (fl->dsaddr != NULL)
+ nfs4_fl_put_deviceid(fl->dsaddr);
+ /* This assumes a single RW lseg */
+ if (lseg->pls_range.iomode == IOMODE_RW) {
+ struct nfs4_filelayout *flo;
+ struct inode *inode;
+
+ flo = FILELAYOUT_FROM_HDR(lseg->pls_layout);
+ inode = flo->generic_hdr.plh_inode;
+ spin_lock(&inode->i_lock);
+ pnfs_generic_ds_cinfo_release_lseg(&flo->commit_info, lseg);
+ spin_unlock(&inode->i_lock);
+ }
+ _filelayout_free_lseg(fl);
+}
+
+static struct pnfs_layout_segment *
+filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
+ struct nfs4_layoutget_res *lgr,
+ gfp_t gfp_flags)
+{
+ struct nfs4_filelayout_segment *fl;
+ int rc;
+
+ dprintk("--> %s\n", __func__);
+ fl = kzalloc(sizeof(*fl), gfp_flags);
+ if (!fl)
+ return NULL;
+
+ rc = filelayout_decode_layout(layoutid, fl, lgr, gfp_flags);
+ if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, gfp_flags)) {
+ _filelayout_free_lseg(fl);
+ return NULL;
+ }
+ return &fl->generic_hdr;
+}
+
+static bool
+filelayout_lseg_is_striped(const struct nfs4_filelayout_segment *flseg)
+{
+ return flseg->num_fh > 1;
+}
+
+/*
+ * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
+ *
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
+filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+ struct nfs_page *req)
+{
+ unsigned int size;
+ u64 p_stripe, r_stripe;
+ u32 stripe_offset;
+ u64 segment_offset = pgio->pg_lseg->pls_range.offset;
+ u32 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
+
+ /* calls nfs_generic_pg_test */
+ size = pnfs_generic_pg_test(pgio, prev, req);
+ if (!size)
+ return 0;
+ else if (!filelayout_lseg_is_striped(FILELAYOUT_LSEG(pgio->pg_lseg)))
+ return size;
+
+ /* see if req and prev are in the same stripe */
+ if (prev) {
+ p_stripe = (u64)req_offset(prev) - segment_offset;
+ r_stripe = (u64)req_offset(req) - segment_offset;
+ do_div(p_stripe, stripe_unit);
+ do_div(r_stripe, stripe_unit);
+
+ if (p_stripe != r_stripe)
+ return 0;
+ }
+
+ /* calculate remaining bytes in the current stripe */
+ div_u64_rem((u64)req_offset(req) - segment_offset,
+ stripe_unit,
+ &stripe_offset);
+ WARN_ON_ONCE(stripe_offset > stripe_unit);
+ if (stripe_offset >= stripe_unit)
+ return 0;
+ return min(stripe_unit - (unsigned int)stripe_offset, size);
+}
+
+static struct pnfs_layout_segment *
+fl_pnfs_update_layout(struct inode *ino,
+ struct nfs_open_context *ctx,
+ loff_t pos,
+ u64 count,
+ enum pnfs_iomode iomode,
+ bool strict_iomode,
+ gfp_t gfp_flags)
+{
+ struct pnfs_layout_segment *lseg = NULL;
+ struct pnfs_layout_hdr *lo;
+ struct nfs4_filelayout_segment *fl;
+ int status;
+
+ lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode,
+ gfp_flags);
+ if (IS_ERR(lseg)) {
+ /* Fall back to MDS on recoverable errors */
+ if (!nfs_error_is_fatal_on_server(PTR_ERR(lseg)))
+ lseg = NULL;
+ goto out;
+ } else if (!lseg)
+ goto out;
+
+ lo = NFS_I(ino)->layout;
+ fl = FILELAYOUT_LSEG(lseg);
+
+ status = filelayout_check_deviceid(lo, fl, gfp_flags);
+ if (status) {
+ pnfs_error_mark_layout_for_return(ino, lseg);
+ pnfs_set_lo_fail(lseg);
+ pnfs_put_lseg(lseg);
+ lseg = NULL;
+ }
+out:
+ return lseg;
+}
+
+static void
+filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ pnfs_generic_pg_check_layout(pgio);
+ if (!pgio->pg_lseg) {
+ pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode,
+ nfs_req_openctx(req),
+ 0,
+ NFS4_MAX_UINT64,
+ IOMODE_READ,
+ false,
+ GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
+ /* If no lseg, fall back to read through mds */
+ if (pgio->pg_lseg == NULL)
+ nfs_pageio_reset_read_mds(pgio);
+}
+
+static void
+filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ pnfs_generic_pg_check_layout(pgio);
+ if (!pgio->pg_lseg) {
+ pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode,
+ nfs_req_openctx(req),
+ 0,
+ NFS4_MAX_UINT64,
+ IOMODE_RW,
+ false,
+ GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
+
+ /* If no lseg, fall back to write through mds */
+ if (pgio->pg_lseg == NULL)
+ nfs_pageio_reset_write_mds(pgio);
+}
+
+static const struct nfs_pageio_ops filelayout_pg_read_ops = {
+ .pg_init = filelayout_pg_init_read,
+ .pg_test = filelayout_pg_test,
+ .pg_doio = pnfs_generic_pg_readpages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+static const struct nfs_pageio_ops filelayout_pg_write_ops = {
+ .pg_init = filelayout_pg_init_write,
+ .pg_test = filelayout_pg_test,
+ .pg_doio = pnfs_generic_pg_writepages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
+{
+ if (fl->stripe_type == STRIPE_SPARSE)
+ return nfs4_fl_calc_ds_index(&fl->generic_hdr, j);
+ else
+ return j;
+}
+
+static void
+filelayout_mark_request_commit(struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx)
+
+{
+ struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+ u32 i, j;
+
+ if (fl->commit_through_mds) {
+ nfs_request_add_commit_list(req, cinfo);
+ } else {
+ /* Note that we are calling nfs4_fl_calc_j_index on each page
+ * that ends up being committed to a data server. An attractive
+ * alternative is to add a field to nfs_write_data and nfs_page
+ * to store the value calculated in filelayout_write_pagelist
+ * and just use that here.
+ */
+ j = nfs4_fl_calc_j_index(lseg, req_offset(req));
+ i = select_bucket_index(fl, j);
+ pnfs_layout_mark_request_commit(req, lseg, cinfo, i);
+ }
+}
+
+static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+{
+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
+ if (flseg->stripe_type == STRIPE_SPARSE)
+ return i;
+ else
+ return nfs4_fl_calc_ds_index(lseg, i);
+}
+
+static struct nfs_fh *
+select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+{
+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
+ if (flseg->stripe_type == STRIPE_SPARSE) {
+ if (flseg->num_fh == 1)
+ i = 0;
+ else if (flseg->num_fh == 0)
+ /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
+ return NULL;
+ }
+ return flseg->fh_array[i];
+}
+
+static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
+{
+ struct pnfs_layout_segment *lseg = data->lseg;
+ struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
+ u32 idx;
+ struct nfs_fh *fh;
+
+ idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
+ ds = nfs4_fl_prepare_ds(lseg, idx);
+ if (!ds)
+ goto out_err;
+
+ ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, data->inode);
+ if (IS_ERR(ds_clnt))
+ goto out_err;
+
+ dprintk("%s ino %lu, how %d cl_count %d\n", __func__,
+ data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count));
+ data->commit_done_cb = filelayout_commit_done_cb;
+ refcount_inc(&ds->ds_clp->cl_count);
+ data->ds_clp = ds->ds_clp;
+ fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
+ if (fh)
+ data->args.fh = fh;
+ return nfs_initiate_commit(ds_clnt, data, NFS_PROTO(data->inode),
+ &filelayout_commit_call_ops, how,
+ RPC_TASK_SOFTCONN);
+out_err:
+ pnfs_generic_prepare_to_resend_writes(data);
+ pnfs_generic_commit_release(data);
+ return -EAGAIN;
+}
+
+static int
+filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
+ int how, struct nfs_commit_info *cinfo)
+{
+ return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
+ filelayout_initiate_commit);
+}
+
+static struct nfs4_deviceid_node *
+filelayout_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_flags)
+{
+ struct nfs4_file_layout_dsaddr *dsaddr;
+
+ dsaddr = nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags);
+ if (!dsaddr)
+ return NULL;
+ return &dsaddr->id_node;
+}
+
+static void
+filelayout_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+ nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node));
+}
+
+static struct pnfs_layout_hdr *
+filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
+{
+ struct nfs4_filelayout *flo;
+
+ flo = kzalloc(sizeof(*flo), gfp_flags);
+ if (flo == NULL)
+ return NULL;
+ pnfs_init_ds_commit_info(&flo->commit_info);
+ flo->commit_info.ops = &filelayout_commit_ops;
+ return &flo->generic_hdr;
+}
+
+static void
+filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ kfree_rcu(FILELAYOUT_FROM_HDR(lo), generic_hdr.plh_rcu);
+}
+
+static struct pnfs_ds_commit_info *
+filelayout_get_ds_info(struct inode *inode)
+{
+ struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
+
+ if (layout == NULL)
+ return NULL;
+ else
+ return &FILELAYOUT_FROM_HDR(layout)->commit_info;
+}
+
+static void
+filelayout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+ struct inode *inode = lseg->pls_layout->plh_inode;
+ struct pnfs_commit_array *array, *new;
+ unsigned int size = (fl->stripe_type == STRIPE_SPARSE) ?
+ fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
+
+ new = pnfs_alloc_commit_array(size, nfs_io_gfp_mask());
+ if (new) {
+ spin_lock(&inode->i_lock);
+ array = pnfs_add_commit_array(fl_cinfo, new, lseg);
+ spin_unlock(&inode->i_lock);
+ if (array != new)
+ pnfs_free_commit_array(new);
+ }
+}
+
+static void
+filelayout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+ struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ pnfs_generic_ds_cinfo_destroy(fl_cinfo);
+ spin_unlock(&inode->i_lock);
+}
+
+static const struct pnfs_commit_ops filelayout_commit_ops = {
+ .setup_ds_info = filelayout_setup_ds_info,
+ .release_ds_info = filelayout_release_ds_info,
+ .mark_request_commit = filelayout_mark_request_commit,
+ .clear_request_commit = pnfs_generic_clear_request_commit,
+ .scan_commit_lists = pnfs_generic_scan_commit_lists,
+ .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
+ .search_commit_reqs = pnfs_generic_search_commit_reqs,
+ .commit_pagelist = filelayout_commit_pagelist,
+};
+
+static struct pnfs_layoutdriver_type filelayout_type = {
+ .id = LAYOUT_NFSV4_1_FILES,
+ .name = "LAYOUT_NFSV4_1_FILES",
+ .owner = THIS_MODULE,
+ .flags = PNFS_LAYOUTGET_ON_OPEN,
+ .max_layoutget_response = 4096, /* 1 page or so... */
+ .alloc_layout_hdr = filelayout_alloc_layout_hdr,
+ .free_layout_hdr = filelayout_free_layout_hdr,
+ .alloc_lseg = filelayout_alloc_lseg,
+ .free_lseg = filelayout_free_lseg,
+ .pg_read_ops = &filelayout_pg_read_ops,
+ .pg_write_ops = &filelayout_pg_write_ops,
+ .get_ds_info = &filelayout_get_ds_info,
+ .read_pagelist = filelayout_read_pagelist,
+ .write_pagelist = filelayout_write_pagelist,
+ .alloc_deviceid_node = filelayout_alloc_deviceid_node,
+ .free_deviceid_node = filelayout_free_deviceid_node,
+ .sync = pnfs_nfs_generic_sync,
+};
+
+static int __init nfs4filelayout_init(void)
+{
+ printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
+ __func__);
+ return pnfs_register_layoutdriver(&filelayout_type);
+}
+
+static void __exit nfs4filelayout_exit(void)
+{
+ printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
+ __func__);
+ pnfs_unregister_layoutdriver(&filelayout_type);
+}
+
+MODULE_ALIAS("nfs-layouttype4-1");
+
+module_init(nfs4filelayout_init);
+module_exit(nfs4filelayout_exit);
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
new file mode 100644
index 0000000000..aed0748fd6
--- /dev/null
+++ b/fs/nfs/filelayout/filelayout.h
@@ -0,0 +1,118 @@
+/*
+ * NFSv4 file layout driver data structures.
+ *
+ * Copyright (c) 2002
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
+ * Dean Hildebrand <dhildebz@umich.edu>
+ *
+ * Permission is granted to use, copy, create derivative works, and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the University of Michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. If
+ * the above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * This software is provided as is, without representation or warranty
+ * of any kind either express or implied, including without limitation
+ * the implied warranties of merchantability, fitness for a particular
+ * purpose, or noninfringement. The Regents of the University of
+ * Michigan shall not be liable for any damages, including special,
+ * indirect, incidental, or consequential damages, with respect to any
+ * claim arising out of or in connection with the use of the software,
+ * even if it has been or is hereafter advised of the possibility of
+ * such damages.
+ */
+
+#ifndef FS_NFS_NFS4FILELAYOUT_H
+#define FS_NFS_NFS4FILELAYOUT_H
+
+#include "../pnfs.h"
+
+/*
+ * Field testing shows we need to support up to 4096 stripe indices.
+ * We store each index as a u8 (u32 on the wire) to keep the memory footprint
+ * reasonable. This in turn means we support a maximum of 256
+ * RFC 5661 multipath_list4 structures.
+ */
+#define NFS4_PNFS_MAX_STRIPE_CNT 4096
+#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */
+
+enum stripetype4 {
+ STRIPE_SPARSE = 1,
+ STRIPE_DENSE = 2
+};
+
+struct nfs4_file_layout_dsaddr {
+ struct nfs4_deviceid_node id_node;
+ u32 stripe_count;
+ u8 *stripe_indices;
+ u32 ds_num;
+ struct nfs4_pnfs_ds *ds_list[];
+};
+
+struct nfs4_filelayout_segment {
+ struct pnfs_layout_segment generic_hdr;
+ u32 stripe_type;
+ u32 commit_through_mds;
+ u32 stripe_unit;
+ u32 first_stripe_index;
+ u64 pattern_offset;
+ struct nfs4_deviceid deviceid;
+ struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
+ unsigned int num_fh;
+ struct nfs_fh **fh_array;
+};
+
+struct nfs4_filelayout {
+ struct pnfs_layout_hdr generic_hdr;
+ struct pnfs_ds_commit_info commit_info;
+};
+
+static inline struct nfs4_filelayout *
+FILELAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo)
+{
+ return container_of(lo, struct nfs4_filelayout, generic_hdr);
+}
+
+static inline struct nfs4_filelayout_segment *
+FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
+{
+ return container_of(lseg,
+ struct nfs4_filelayout_segment,
+ generic_hdr);
+}
+
+static inline struct nfs4_deviceid_node *
+FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg)
+{
+ return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node;
+}
+
+static inline bool
+filelayout_test_devid_invalid(struct nfs4_deviceid_node *node)
+{
+ return test_bit(NFS_DEVICEID_INVALID, &node->flags);
+}
+
+extern bool
+filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node);
+
+extern struct nfs_fh *
+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
+
+u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
+u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
+struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
+ u32 ds_idx);
+
+extern struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_flags);
+extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
+extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
+
+#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
new file mode 100644
index 0000000000..acf4b88889
--- /dev/null
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -0,0 +1,303 @@
+/*
+ * Device operations for the pnfs nfs4 file layout driver.
+ *
+ * Copyright (c) 2002
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
+ * Dean Hildebrand <dhildebz@umich.edu>
+ * Garth Goodson <Garth.Goodson@netapp.com>
+ *
+ * Permission is granted to use, copy, create derivative works, and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the University of Michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. If
+ * the above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * This software is provided as is, without representation or warranty
+ * of any kind either express or implied, including without limitation
+ * the implied warranties of merchantability, fitness for a particular
+ * purpose, or noninfringement. The Regents of the University of
+ * Michigan shall not be liable for any damages, including special,
+ * indirect, incidental, or consequential damages, with respect to any
+ * claim arising out of or in connection with the use of the software,
+ * even if it has been or is hereafter advised of the possibility of
+ * such damages.
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+
+#include "../internal.h"
+#include "../nfs4session.h"
+#include "filelayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
+static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
+
+void
+nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+ struct nfs4_pnfs_ds *ds;
+ int i;
+
+ nfs4_print_deviceid(&dsaddr->id_node.deviceid);
+
+ for (i = 0; i < dsaddr->ds_num; i++) {
+ ds = dsaddr->ds_list[i];
+ if (ds != NULL)
+ nfs4_pnfs_ds_put(ds);
+ }
+ kfree(dsaddr->stripe_indices);
+ kfree_rcu(dsaddr, id_node.rcu);
+}
+
+/* Decode opaque device data and return the result */
+struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags)
+{
+ int i;
+ u32 cnt, num;
+ u8 *indexp;
+ __be32 *p;
+ u8 *stripe_indices;
+ u8 max_stripe_index;
+ struct nfs4_file_layout_dsaddr *dsaddr = NULL;
+ struct xdr_stream stream;
+ struct xdr_buf buf;
+ struct page *scratch;
+ struct list_head dsaddrs;
+ struct nfs4_pnfs_ds_addr *da;
+
+ /* set up xdr stream */
+ scratch = alloc_page(gfp_flags);
+ if (!scratch)
+ goto out_err;
+
+ xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
+ xdr_set_scratch_page(&stream, scratch);
+
+ /* Get the stripe count (number of stripe index) */
+ p = xdr_inline_decode(&stream, 4);
+ if (unlikely(!p))
+ goto out_err_free_scratch;
+
+ cnt = be32_to_cpup(p);
+ dprintk("%s stripe count %d\n", __func__, cnt);
+ if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
+ printk(KERN_WARNING "NFS: %s: stripe count %d greater than "
+ "supported maximum %d\n", __func__,
+ cnt, NFS4_PNFS_MAX_STRIPE_CNT);
+ goto out_err_free_scratch;
+ }
+
+ /* read stripe indices */
+ stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags);
+ if (!stripe_indices)
+ goto out_err_free_scratch;
+
+ p = xdr_inline_decode(&stream, cnt << 2);
+ if (unlikely(!p))
+ goto out_err_free_stripe_indices;
+
+ indexp = &stripe_indices[0];
+ max_stripe_index = 0;
+ for (i = 0; i < cnt; i++) {
+ *indexp = be32_to_cpup(p++);
+ max_stripe_index = max(max_stripe_index, *indexp);
+ indexp++;
+ }
+
+ /* Check the multipath list count */
+ p = xdr_inline_decode(&stream, 4);
+ if (unlikely(!p))
+ goto out_err_free_stripe_indices;
+
+ num = be32_to_cpup(p);
+ dprintk("%s ds_num %u\n", __func__, num);
+ if (num > NFS4_PNFS_MAX_MULTI_CNT) {
+ printk(KERN_WARNING "NFS: %s: multipath count %d greater than "
+ "supported maximum %d\n", __func__,
+ num, NFS4_PNFS_MAX_MULTI_CNT);
+ goto out_err_free_stripe_indices;
+ }
+
+ /* validate stripe indices are all < num */
+ if (max_stripe_index >= num) {
+ printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n",
+ __func__, max_stripe_index, num);
+ goto out_err_free_stripe_indices;
+ }
+
+ dsaddr = kzalloc(struct_size(dsaddr, ds_list, num), gfp_flags);
+ if (!dsaddr)
+ goto out_err_free_stripe_indices;
+
+ dsaddr->stripe_count = cnt;
+ dsaddr->stripe_indices = stripe_indices;
+ stripe_indices = NULL;
+ dsaddr->ds_num = num;
+ nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);
+
+ INIT_LIST_HEAD(&dsaddrs);
+
+ for (i = 0; i < dsaddr->ds_num; i++) {
+ int j;
+ u32 mp_count;
+
+ p = xdr_inline_decode(&stream, 4);
+ if (unlikely(!p))
+ goto out_err_free_deviceid;
+
+ mp_count = be32_to_cpup(p); /* multipath count */
+ for (j = 0; j < mp_count; j++) {
+ da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
+ &stream, gfp_flags);
+ if (da)
+ list_add_tail(&da->da_node, &dsaddrs);
+ }
+ if (list_empty(&dsaddrs)) {
+ dprintk("%s: no suitable DS addresses found\n",
+ __func__);
+ goto out_err_free_deviceid;
+ }
+
+ dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
+ if (!dsaddr->ds_list[i])
+ goto out_err_drain_dsaddrs;
+
+ /* If DS was already in cache, free ds addrs */
+ while (!list_empty(&dsaddrs)) {
+ da = list_first_entry(&dsaddrs,
+ struct nfs4_pnfs_ds_addr,
+ da_node);
+ list_del_init(&da->da_node);
+ kfree(da->da_remotestr);
+ kfree(da);
+ }
+ }
+
+ __free_page(scratch);
+ return dsaddr;
+
+out_err_drain_dsaddrs:
+ while (!list_empty(&dsaddrs)) {
+ da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
+ da_node);
+ list_del_init(&da->da_node);
+ kfree(da->da_remotestr);
+ kfree(da);
+ }
+out_err_free_deviceid:
+ nfs4_fl_free_deviceid(dsaddr);
+ /* stripe_indicies was part of dsaddr */
+ goto out_err_free_scratch;
+out_err_free_stripe_indices:
+ kfree(stripe_indices);
+out_err_free_scratch:
+ __free_page(scratch);
+out_err:
+ dprintk("%s ERROR: returning NULL\n", __func__);
+ return NULL;
+}
+
+void
+nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+ nfs4_put_deviceid_node(&dsaddr->id_node);
+}
+
+/*
+ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
+ * Then: ((res + fsi) % dsaddr->stripe_count)
+ */
+u32
+nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+ u64 tmp;
+
+ tmp = offset - flseg->pattern_offset;
+ do_div(tmp, flseg->stripe_unit);
+ tmp += flseg->first_stripe_index;
+ return do_div(tmp, flseg->dsaddr->stripe_count);
+}
+
+u32
+nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
+{
+ return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
+}
+
+struct nfs_fh *
+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
+{
+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+ u32 i;
+
+ if (flseg->stripe_type == STRIPE_SPARSE) {
+ if (flseg->num_fh == 1)
+ i = 0;
+ else if (flseg->num_fh == 0)
+ /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
+ return NULL;
+ else
+ i = nfs4_fl_calc_ds_index(lseg, j);
+ } else
+ i = j;
+ return flseg->fh_array[i];
+}
+
+/* Upon return, either ds is connected, or ds is NULL */
+struct nfs4_pnfs_ds *
+nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
+{
+ struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
+ struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
+ struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
+ struct nfs4_pnfs_ds *ret = ds;
+ struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
+ int status;
+
+ if (ds == NULL) {
+ printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
+ __func__, ds_idx);
+ pnfs_generic_mark_devid_invalid(devid);
+ goto out;
+ }
+ smp_rmb();
+ if (ds->ds_clp)
+ goto out_test_devid;
+
+ status = nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
+ dataserver_retrans, 4,
+ s->nfs_client->cl_minorversion);
+ if (status) {
+ nfs4_mark_deviceid_unavailable(devid);
+ ret = NULL;
+ goto out;
+ }
+
+out_test_devid:
+ if (ret->ds_clp == NULL ||
+ filelayout_test_devid_unavailable(devid))
+ ret = NULL;
+out:
+ return ret;
+}
+
+module_param(dataserver_retrans, uint, 0644);
+MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
+ "retries a request before it attempts further "
+ " recovery action.");
+module_param(dataserver_timeo, uint, 0644);
+MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
+ "NFSv4.1 client waits for a response from a "
+ " data server before it retries an NFS request.");
diff --git a/fs/nfs/flexfilelayout/Makefile b/fs/nfs/flexfilelayout/Makefile
new file mode 100644
index 0000000000..49f03422b6
--- /dev/null
+++ b/fs/nfs/flexfilelayout/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the pNFS Flexfile Layout Driver kernel module
+#
+obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += nfs_layout_flexfiles.o
+nfs_layout_flexfiles-y := flexfilelayout.o flexfilelayoutdev.o
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
new file mode 100644
index 0000000000..ef817a0475
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -0,0 +1,2623 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Module for pnfs flexfile layout driver.
+ *
+ * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
+ *
+ * Tao Peng <bergwolf@primarydata.com>
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs_page.h>
+#include <linux/module.h>
+#include <linux/sched/mm.h>
+
+#include <linux/sunrpc/metrics.h>
+
+#include "flexfilelayout.h"
+#include "../nfs4session.h"
+#include "../nfs4idmap.h"
+#include "../internal.h"
+#include "../delegation.h"
+#include "../nfs4trace.h"
+#include "../iostat.h"
+#include "../nfs.h"
+#include "../nfs42.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ)
+#define FF_LAYOUTRETURN_MAXERR 20
+
+enum nfs4_ff_op_type {
+ NFS4_FF_OP_LAYOUTSTATS,
+ NFS4_FF_OP_LAYOUTRETURN,
+};
+
+static unsigned short io_maxretrans;
+
+static const struct pnfs_commit_ops ff_layout_commit_ops;
+static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr);
+static int
+ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
+ struct nfs42_layoutstat_devinfo *devinfo,
+ int dev_limit, enum nfs4_ff_op_type type);
+static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
+ const struct nfs42_layoutstat_devinfo *devinfo,
+ struct nfs4_ff_layout_mirror *mirror);
+
+static struct pnfs_layout_hdr *
+ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
+{
+ struct nfs4_flexfile_layout *ffl;
+
+ ffl = kzalloc(sizeof(*ffl), gfp_flags);
+ if (ffl) {
+ pnfs_init_ds_commit_info(&ffl->commit_info);
+ INIT_LIST_HEAD(&ffl->error_list);
+ INIT_LIST_HEAD(&ffl->mirrors);
+ ffl->last_report_time = ktime_get();
+ ffl->commit_info.ops = &ff_layout_commit_ops;
+ return &ffl->generic_hdr;
+ } else
+ return NULL;
+}
+
+static void
+ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(lo);
+ struct nfs4_ff_layout_ds_err *err, *n;
+
+ list_for_each_entry_safe(err, n, &ffl->error_list, list) {
+ list_del(&err->list);
+ kfree(err);
+ }
+ kfree_rcu(ffl, generic_hdr.plh_rcu);
+}
+
+static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
+ if (unlikely(p == NULL))
+ return -ENOBUFS;
+ stateid->type = NFS4_PNFS_DS_STATEID_TYPE;
+ memcpy(stateid->data, p, NFS4_STATEID_SIZE);
+ dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
+ p[0], p[1], p[2], p[3]);
+ return 0;
+}
+
+static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ memcpy(devid, p, NFS4_DEVICEID4_SIZE);
+ nfs4_print_deviceid(devid);
+ return 0;
+}
+
+static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ fh->size = be32_to_cpup(p++);
+ if (fh->size > NFS_MAXFHSIZE) {
+ printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
+ fh->size);
+ return -EOVERFLOW;
+ }
+ /* fh.data */
+ p = xdr_inline_decode(xdr, fh->size);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ memcpy(&fh->data, p, fh->size);
+ dprintk("%s: fh len %d\n", __func__, fh->size);
+
+ return 0;
+}
+
+/*
+ * Currently only stringified uids and gids are accepted.
+ * I.e., kerberos is not supported to the DSes, so no pricipals.
+ *
+ * That means that one common function will suffice, but when
+ * principals are added, this should be split to accomodate
+ * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
+ */
+static int
+decode_name(struct xdr_stream *xdr, u32 *id)
+{
+ __be32 *p;
+ int len;
+
+ /* opaque_length(4)*/
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ len = be32_to_cpup(p++);
+ if (len < 0)
+ return -EINVAL;
+
+ dprintk("%s: len %u\n", __func__, len);
+
+ /* opaque body */
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return -ENOBUFS;
+
+ if (!nfs_map_string_to_numeric((char *)p, len, id))
+ return -EINVAL;
+
+ return 0;
+}
+
+static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
+ const struct nfs4_ff_layout_mirror *m2)
+{
+ int i, j;
+
+ if (m1->fh_versions_cnt != m2->fh_versions_cnt)
+ return false;
+ for (i = 0; i < m1->fh_versions_cnt; i++) {
+ bool found_fh = false;
+ for (j = 0; j < m2->fh_versions_cnt; j++) {
+ if (nfs_compare_fh(&m1->fh_versions[i],
+ &m2->fh_versions[j]) == 0) {
+ found_fh = true;
+ break;
+ }
+ }
+ if (!found_fh)
+ return false;
+ }
+ return true;
+}
+
+static struct nfs4_ff_layout_mirror *
+ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
+ struct nfs4_ff_layout_mirror *mirror)
+{
+ struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
+ struct nfs4_ff_layout_mirror *pos;
+ struct inode *inode = lo->plh_inode;
+
+ spin_lock(&inode->i_lock);
+ list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
+ if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0)
+ continue;
+ if (!ff_mirror_match_fh(mirror, pos))
+ continue;
+ if (refcount_inc_not_zero(&pos->ref)) {
+ spin_unlock(&inode->i_lock);
+ return pos;
+ }
+ }
+ list_add(&mirror->mirrors, &ff_layout->mirrors);
+ mirror->layout = lo;
+ spin_unlock(&inode->i_lock);
+ return mirror;
+}
+
+static void
+ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+ struct inode *inode;
+ if (mirror->layout == NULL)
+ return;
+ inode = mirror->layout->plh_inode;
+ spin_lock(&inode->i_lock);
+ list_del(&mirror->mirrors);
+ spin_unlock(&inode->i_lock);
+ mirror->layout = NULL;
+}
+
+static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+
+ mirror = kzalloc(sizeof(*mirror), gfp_flags);
+ if (mirror != NULL) {
+ spin_lock_init(&mirror->lock);
+ refcount_set(&mirror->ref, 1);
+ INIT_LIST_HEAD(&mirror->mirrors);
+ }
+ return mirror;
+}
+
+static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+ const struct cred *cred;
+
+ ff_layout_remove_mirror(mirror);
+ kfree(mirror->fh_versions);
+ cred = rcu_access_pointer(mirror->ro_cred);
+ put_cred(cred);
+ cred = rcu_access_pointer(mirror->rw_cred);
+ put_cred(cred);
+ nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
+ kfree(mirror);
+}
+
+static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+ if (mirror != NULL && refcount_dec_and_test(&mirror->ref))
+ ff_layout_free_mirror(mirror);
+}
+
+static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
+{
+ u32 i;
+
+ for (i = 0; i < fls->mirror_array_cnt; i++)
+ ff_layout_put_mirror(fls->mirror_array[i]);
+}
+
+static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
+{
+ if (fls) {
+ ff_layout_free_mirror_array(fls);
+ kfree(fls);
+ }
+}
+
+static bool
+ff_lseg_match_mirrors(struct pnfs_layout_segment *l1,
+ struct pnfs_layout_segment *l2)
+{
+ const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1);
+ const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l1);
+ u32 i;
+
+ if (fl1->mirror_array_cnt != fl2->mirror_array_cnt)
+ return false;
+ for (i = 0; i < fl1->mirror_array_cnt; i++) {
+ if (fl1->mirror_array[i] != fl2->mirror_array[i])
+ return false;
+ }
+ return true;
+}
+
+static bool
+ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
+{
+ u64 end1, end2;
+
+ if (l1->iomode != l2->iomode)
+ return l1->iomode != IOMODE_READ;
+ end1 = pnfs_calc_offset_end(l1->offset, l1->length);
+ end2 = pnfs_calc_offset_end(l2->offset, l2->length);
+ if (end1 < l2->offset)
+ return false;
+ if (end2 < l1->offset)
+ return true;
+ return l2->offset <= l1->offset;
+}
+
+static bool
+ff_lseg_merge(struct pnfs_layout_segment *new,
+ struct pnfs_layout_segment *old)
+{
+ u64 new_end, old_end;
+
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
+ return false;
+ if (new->pls_range.iomode != old->pls_range.iomode)
+ return false;
+ old_end = pnfs_calc_offset_end(old->pls_range.offset,
+ old->pls_range.length);
+ if (old_end < new->pls_range.offset)
+ return false;
+ new_end = pnfs_calc_offset_end(new->pls_range.offset,
+ new->pls_range.length);
+ if (new_end < old->pls_range.offset)
+ return false;
+ if (!ff_lseg_match_mirrors(new, old))
+ return false;
+
+ /* Mergeable: copy info from 'old' to 'new' */
+ if (new_end < old_end)
+ new_end = old_end;
+ if (new->pls_range.offset < old->pls_range.offset)
+ new->pls_range.offset = old->pls_range.offset;
+ new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset,
+ new_end);
+ if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
+ set_bit(NFS_LSEG_ROC, &new->pls_flags);
+ return true;
+}
+
+static void
+ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ struct list_head *free_me)
+{
+ pnfs_generic_layout_insert_lseg(lo, lseg,
+ ff_lseg_range_is_after,
+ ff_lseg_merge,
+ free_me);
+}
+
+static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
+{
+ int i, j;
+
+ for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
+ for (j = i + 1; j < fls->mirror_array_cnt; j++)
+ if (fls->mirror_array[i]->efficiency <
+ fls->mirror_array[j]->efficiency)
+ swap(fls->mirror_array[i],
+ fls->mirror_array[j]);
+ }
+}
+
+static struct pnfs_layout_segment *
+ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
+ struct nfs4_layoutget_res *lgr,
+ gfp_t gfp_flags)
+{
+ struct pnfs_layout_segment *ret;
+ struct nfs4_ff_layout_segment *fls = NULL;
+ struct xdr_stream stream;
+ struct xdr_buf buf;
+ struct page *scratch;
+ u64 stripe_unit;
+ u32 mirror_array_cnt;
+ __be32 *p;
+ int i, rc;
+
+ dprintk("--> %s\n", __func__);
+ scratch = alloc_page(gfp_flags);
+ if (!scratch)
+ return ERR_PTR(-ENOMEM);
+
+ xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
+ lgr->layoutp->len);
+ xdr_set_scratch_page(&stream, scratch);
+
+ /* stripe unit and mirror_array_cnt */
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 8 + 4);
+ if (!p)
+ goto out_err_free;
+
+ p = xdr_decode_hyper(p, &stripe_unit);
+ mirror_array_cnt = be32_to_cpup(p++);
+ dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__,
+ stripe_unit, mirror_array_cnt);
+
+ if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT ||
+ mirror_array_cnt == 0)
+ goto out_err_free;
+
+ rc = -ENOMEM;
+ fls = kzalloc(struct_size(fls, mirror_array, mirror_array_cnt),
+ gfp_flags);
+ if (!fls)
+ goto out_err_free;
+
+ fls->mirror_array_cnt = mirror_array_cnt;
+ fls->stripe_unit = stripe_unit;
+
+ for (i = 0; i < fls->mirror_array_cnt; i++) {
+ struct nfs4_ff_layout_mirror *mirror;
+ struct cred *kcred;
+ const struct cred __rcu *cred;
+ kuid_t uid;
+ kgid_t gid;
+ u32 ds_count, fh_count, id;
+ int j;
+
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ ds_count = be32_to_cpup(p);
+
+ /* FIXME: allow for striping? */
+ if (ds_count != 1)
+ goto out_err_free;
+
+ fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
+ if (fls->mirror_array[i] == NULL) {
+ rc = -ENOMEM;
+ goto out_err_free;
+ }
+
+ fls->mirror_array[i]->ds_count = ds_count;
+
+ /* deviceid */
+ rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid);
+ if (rc)
+ goto out_err_free;
+
+ /* efficiency */
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ fls->mirror_array[i]->efficiency = be32_to_cpup(p);
+
+ /* stateid */
+ rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid);
+ if (rc)
+ goto out_err_free;
+
+ /* fh */
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ fh_count = be32_to_cpup(p);
+
+ fls->mirror_array[i]->fh_versions =
+ kcalloc(fh_count, sizeof(struct nfs_fh),
+ gfp_flags);
+ if (fls->mirror_array[i]->fh_versions == NULL) {
+ rc = -ENOMEM;
+ goto out_err_free;
+ }
+
+ for (j = 0; j < fh_count; j++) {
+ rc = decode_nfs_fh(&stream,
+ &fls->mirror_array[i]->fh_versions[j]);
+ if (rc)
+ goto out_err_free;
+ }
+
+ fls->mirror_array[i]->fh_versions_cnt = fh_count;
+
+ /* user */
+ rc = decode_name(&stream, &id);
+ if (rc)
+ goto out_err_free;
+
+ uid = make_kuid(&init_user_ns, id);
+
+ /* group */
+ rc = decode_name(&stream, &id);
+ if (rc)
+ goto out_err_free;
+
+ gid = make_kgid(&init_user_ns, id);
+
+ if (gfp_flags & __GFP_FS)
+ kcred = prepare_kernel_cred(&init_task);
+ else {
+ unsigned int nofs_flags = memalloc_nofs_save();
+ kcred = prepare_kernel_cred(&init_task);
+ memalloc_nofs_restore(nofs_flags);
+ }
+ rc = -ENOMEM;
+ if (!kcred)
+ goto out_err_free;
+ kcred->fsuid = uid;
+ kcred->fsgid = gid;
+ cred = RCU_INITIALIZER(kcred);
+
+ if (lgr->range.iomode == IOMODE_READ)
+ rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
+ else
+ rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
+
+ mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
+ if (mirror != fls->mirror_array[i]) {
+ /* swap cred ptrs so free_mirror will clean up old */
+ if (lgr->range.iomode == IOMODE_READ) {
+ cred = xchg(&mirror->ro_cred, cred);
+ rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
+ } else {
+ cred = xchg(&mirror->rw_cred, cred);
+ rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
+ }
+ ff_layout_free_mirror(fls->mirror_array[i]);
+ fls->mirror_array[i] = mirror;
+ }
+
+ dprintk("%s: iomode %s uid %u gid %u\n", __func__,
+ lgr->range.iomode == IOMODE_READ ? "READ" : "RW",
+ from_kuid(&init_user_ns, uid),
+ from_kgid(&init_user_ns, gid));
+ }
+
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_sort_mirrors;
+ fls->flags = be32_to_cpup(p);
+
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_sort_mirrors;
+ for (i=0; i < fls->mirror_array_cnt; i++)
+ fls->mirror_array[i]->report_interval = be32_to_cpup(p);
+
+out_sort_mirrors:
+ ff_layout_sort_mirrors(fls);
+ ret = &fls->generic_hdr;
+ dprintk("<-- %s (success)\n", __func__);
+out_free_page:
+ __free_page(scratch);
+ return ret;
+out_err_free:
+ _ff_layout_free_lseg(fls);
+ ret = ERR_PTR(rc);
+ dprintk("<-- %s (%d)\n", __func__, rc);
+ goto out_free_page;
+}
+
+static void
+ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+
+ dprintk("--> %s\n", __func__);
+
+ if (lseg->pls_range.iomode == IOMODE_RW) {
+ struct nfs4_flexfile_layout *ffl;
+ struct inode *inode;
+
+ ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
+ inode = ffl->generic_hdr.plh_inode;
+ spin_lock(&inode->i_lock);
+ pnfs_generic_ds_cinfo_release_lseg(&ffl->commit_info, lseg);
+ spin_unlock(&inode->i_lock);
+ }
+ _ff_layout_free_lseg(fls);
+}
+
+static void
+nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
+{
+ /* first IO request? */
+ if (atomic_inc_return(&timer->n_ops) == 1) {
+ timer->start_time = now;
+ }
+}
+
+static ktime_t
+nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
+{
+ ktime_t start;
+
+ if (atomic_dec_return(&timer->n_ops) < 0)
+ WARN_ON_ONCE(1);
+
+ start = timer->start_time;
+ timer->start_time = now;
+ return ktime_sub(now, start);
+}
+
+static bool
+nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
+ struct nfs4_ff_layoutstat *layoutstat,
+ ktime_t now)
+{
+ s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
+ struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout);
+
+ nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
+ if (!mirror->start_time)
+ mirror->start_time = now;
+ if (mirror->report_interval != 0)
+ report_interval = (s64)mirror->report_interval * 1000LL;
+ else if (layoutstats_timer != 0)
+ report_interval = (s64)layoutstats_timer * 1000LL;
+ if (ktime_to_ms(ktime_sub(now, ffl->last_report_time)) >=
+ report_interval) {
+ ffl->last_report_time = now;
+ return true;
+ }
+
+ return false;
+}
+
+static void
+nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat,
+ __u64 requested)
+{
+ struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+
+ iostat->ops_requested++;
+ iostat->bytes_requested += requested;
+}
+
+static void
+nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
+ __u64 requested,
+ __u64 completed,
+ ktime_t time_completed,
+ ktime_t time_started)
+{
+ struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+ ktime_t completion_time = ktime_sub(time_completed, time_started);
+ ktime_t timer;
+
+ iostat->ops_completed++;
+ iostat->bytes_completed += completed;
+ iostat->bytes_not_delivered += requested - completed;
+
+ timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer, time_completed);
+ iostat->total_busy_time =
+ ktime_add(iostat->total_busy_time, timer);
+ iostat->aggregate_completion_time =
+ ktime_add(iostat->aggregate_completion_time,
+ completion_time);
+}
+
+static void
+nfs4_ff_layout_stat_io_start_read(struct inode *inode,
+ struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested, ktime_t now)
+{
+ bool report;
+
+ spin_lock(&mirror->lock);
+ report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
+ nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
+ set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
+ spin_unlock(&mirror->lock);
+
+ if (report)
+ pnfs_report_layoutstat(inode, nfs_io_gfp_mask());
+}
+
+static void
+nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
+ struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested,
+ __u64 completed)
+{
+ spin_lock(&mirror->lock);
+ nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
+ requested, completed,
+ ktime_get(), task->tk_start);
+ set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
+ spin_unlock(&mirror->lock);
+}
+
+static void
+nfs4_ff_layout_stat_io_start_write(struct inode *inode,
+ struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested, ktime_t now)
+{
+ bool report;
+
+ spin_lock(&mirror->lock);
+ report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now);
+ nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
+ set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
+ spin_unlock(&mirror->lock);
+
+ if (report)
+ pnfs_report_layoutstat(inode, nfs_io_gfp_mask());
+}
+
+static void
+nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
+ struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested,
+ __u64 completed,
+ enum nfs3_stable_how committed)
+{
+ if (committed == NFS_UNSTABLE)
+ requested = completed = 0;
+
+ spin_lock(&mirror->lock);
+ nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
+ requested, completed, ktime_get(), task->tk_start);
+ set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
+ spin_unlock(&mirror->lock);
+}
+
+static void
+ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx)
+{
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+ if (devid)
+ nfs4_mark_deviceid_unavailable(devid);
+}
+
+static void
+ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx)
+{
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+ if (devid)
+ nfs4_mark_deviceid_available(devid);
+}
+
+static struct nfs4_pnfs_ds *
+ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
+ u32 start_idx, u32 *best_idx,
+ bool check_device)
+{
+ struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_pnfs_ds *ds;
+ u32 idx;
+
+ /* mirrors are initially sorted by efficiency */
+ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
+ if (!ds)
+ continue;
+
+ if (check_device &&
+ nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node))
+ continue;
+
+ *best_idx = idx;
+ return ds;
+ }
+
+ return NULL;
+}
+
+static struct nfs4_pnfs_ds *
+ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
+ u32 start_idx, u32 *best_idx)
+{
+ return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
+}
+
+static struct nfs4_pnfs_ds *
+ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
+ u32 start_idx, u32 *best_idx)
+{
+ return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
+}
+
+static struct nfs4_pnfs_ds *
+ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
+ u32 start_idx, u32 *best_idx)
+{
+ struct nfs4_pnfs_ds *ds;
+
+ ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
+ if (ds)
+ return ds;
+ return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
+}
+
+static struct nfs4_pnfs_ds *
+ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
+ u32 *best_idx)
+{
+ struct pnfs_layout_segment *lseg = pgio->pg_lseg;
+ struct nfs4_pnfs_ds *ds;
+
+ ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx,
+ best_idx);
+ if (ds || !pgio->pg_mirror_idx)
+ return ds;
+ return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx);
+}
+
+static void
+ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req,
+ bool strict_iomode)
+{
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg =
+ pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+ req_offset(req), req->wb_bytes, IOMODE_READ,
+ strict_iomode, nfs_io_gfp_mask());
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ }
+}
+
+static void
+ff_layout_pg_check_layout(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ pnfs_generic_pg_check_layout(pgio);
+ pnfs_generic_pg_check_range(pgio, req);
+}
+
+static void
+ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ struct nfs_pgio_mirror *pgm;
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_pnfs_ds *ds;
+ u32 ds_idx;
+
+retry:
+ ff_layout_pg_check_layout(pgio, req);
+ /* Use full layout for now */
+ if (!pgio->pg_lseg) {
+ ff_layout_pg_get_read(pgio, req, false);
+ if (!pgio->pg_lseg)
+ goto out_nolseg;
+ }
+ if (ff_layout_avoid_read_on_rw(pgio->pg_lseg)) {
+ ff_layout_pg_get_read(pgio, req, true);
+ if (!pgio->pg_lseg)
+ goto out_nolseg;
+ }
+
+ ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
+ if (!ds) {
+ if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
+ goto out_mds;
+ pnfs_generic_pg_cleanup(pgio);
+ /* Sleep for 1 second before retrying */
+ ssleep(1);
+ goto retry;
+ }
+
+ mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
+ pgm = &pgio->pg_mirrors[0];
+ pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
+
+ pgio->pg_mirror_idx = ds_idx;
+
+ if (NFS_SERVER(pgio->pg_inode)->flags &
+ (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
+ pgio->pg_maxretrans = io_maxretrans;
+ return;
+out_nolseg:
+ if (pgio->pg_error < 0)
+ return;
+out_mds:
+ trace_pnfs_mds_fallback_pg_init_read(pgio->pg_inode,
+ 0, NFS4_MAX_UINT64, IOMODE_READ,
+ NFS_I(pgio->pg_inode)->layout,
+ pgio->pg_lseg);
+ pgio->pg_maxretrans = 0;
+ nfs_pageio_reset_read_mds(pgio);
+}
+
+static void
+ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs_pgio_mirror *pgm;
+ struct nfs4_pnfs_ds *ds;
+ u32 i;
+
+retry:
+ ff_layout_pg_check_layout(pgio, req);
+ if (!pgio->pg_lseg) {
+ pgio->pg_lseg =
+ pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+ req_offset(req), req->wb_bytes,
+ IOMODE_RW, false, nfs_io_gfp_mask());
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
+ /* If no lseg, fall back to write through mds */
+ if (pgio->pg_lseg == NULL)
+ goto out_mds;
+
+ /* Use a direct mapping of ds_idx to pgio mirror_idx */
+ if (pgio->pg_mirror_count != FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg))
+ goto out_eagain;
+
+ for (i = 0; i < pgio->pg_mirror_count; i++) {
+ mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
+ ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true);
+ if (!ds) {
+ if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
+ goto out_mds;
+ pnfs_generic_pg_cleanup(pgio);
+ /* Sleep for 1 second before retrying */
+ ssleep(1);
+ goto retry;
+ }
+ pgm = &pgio->pg_mirrors[i];
+ pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
+ }
+
+ if (NFS_SERVER(pgio->pg_inode)->flags &
+ (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
+ pgio->pg_maxretrans = io_maxretrans;
+ return;
+out_eagain:
+ pnfs_generic_pg_cleanup(pgio);
+ pgio->pg_error = -EAGAIN;
+ return;
+out_mds:
+ trace_pnfs_mds_fallback_pg_init_write(pgio->pg_inode,
+ 0, NFS4_MAX_UINT64, IOMODE_RW,
+ NFS_I(pgio->pg_inode)->layout,
+ pgio->pg_lseg);
+ pgio->pg_maxretrans = 0;
+ nfs_pageio_reset_write_mds(pgio);
+ pgio->pg_error = -EAGAIN;
+}
+
+static unsigned int
+ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ if (!pgio->pg_lseg) {
+ pgio->pg_lseg =
+ pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+ req_offset(req), req->wb_bytes,
+ IOMODE_RW, false, nfs_io_gfp_mask());
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ goto out;
+ }
+ }
+ if (pgio->pg_lseg)
+ return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
+
+ trace_pnfs_mds_fallback_pg_get_mirror_count(pgio->pg_inode,
+ 0, NFS4_MAX_UINT64, IOMODE_RW,
+ NFS_I(pgio->pg_inode)->layout,
+ pgio->pg_lseg);
+ /* no lseg means that pnfs is not in use, so no mirroring here */
+ nfs_pageio_reset_write_mds(pgio);
+out:
+ return 1;
+}
+
+static u32
+ff_layout_pg_set_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx)
+{
+ u32 old = desc->pg_mirror_idx;
+
+ desc->pg_mirror_idx = idx;
+ return old;
+}
+
+static struct nfs_pgio_mirror *
+ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx)
+{
+ return &desc->pg_mirrors[idx];
+}
+
+static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
+ .pg_init = ff_layout_pg_init_read,
+ .pg_test = pnfs_generic_pg_test,
+ .pg_doio = pnfs_generic_pg_readpages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
+ .pg_init = ff_layout_pg_init_write,
+ .pg_test = pnfs_generic_pg_test,
+ .pg_doio = pnfs_generic_pg_writepages,
+ .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
+ .pg_get_mirror = ff_layout_pg_get_mirror_write,
+ .pg_set_mirror = ff_layout_pg_set_mirror_write,
+};
+
+static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
+{
+ struct rpc_task *task = &hdr->task;
+
+ pnfs_layoutcommit_inode(hdr->inode, false);
+
+ if (retry_pnfs) {
+ dprintk("%s Reset task %5u for i/o through pNFS "
+ "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+ hdr->task.tk_pid,
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
+
+ hdr->completion_ops->reschedule_io(hdr);
+ return;
+ }
+
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ dprintk("%s Reset task %5u for i/o through MDS "
+ "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+ hdr->task.tk_pid,
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
+
+ trace_pnfs_mds_fallback_write_done(hdr->inode,
+ hdr->args.offset, hdr->args.count,
+ IOMODE_RW, NFS_I(hdr->inode)->layout,
+ hdr->lseg);
+ task->tk_status = pnfs_write_done_resend_to_mds(hdr);
+ }
+}
+
+static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
+{
+ u32 idx = hdr->pgio_mirror_idx + 1;
+ u32 new_idx = 0;
+
+ if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx))
+ ff_layout_send_layouterror(hdr->lseg);
+ else
+ pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
+ pnfs_read_resend_pnfs(hdr, new_idx);
+}
+
+static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
+{
+ struct rpc_task *task = &hdr->task;
+
+ pnfs_layoutcommit_inode(hdr->inode, false);
+ pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
+
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ dprintk("%s Reset task %5u for i/o through MDS "
+ "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+ hdr->task.tk_pid,
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
+
+ trace_pnfs_mds_fallback_read_done(hdr->inode,
+ hdr->args.offset, hdr->args.count,
+ IOMODE_READ, NFS_I(hdr->inode)->layout,
+ hdr->lseg);
+ task->tk_status = pnfs_read_done_resend_to_mds(hdr);
+ }
+}
+
+static int ff_layout_async_handle_error_v4(struct rpc_task *task,
+ struct nfs4_state *state,
+ struct nfs_client *clp,
+ struct pnfs_layout_segment *lseg,
+ u32 idx)
+{
+ struct pnfs_layout_hdr *lo = lseg->pls_layout;
+ struct inode *inode = lo->plh_inode;
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
+
+ switch (task->tk_status) {
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case -NFS4ERR_SEQ_FALSE_RETRY:
+ case -NFS4ERR_SEQ_MISORDERED:
+ dprintk("%s ERROR %d, Reset session. Exchangeid "
+ "flags 0x%x\n", __func__, task->tk_status,
+ clp->cl_exchange_flags);
+ nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
+ break;
+ case -NFS4ERR_DELAY:
+ case -NFS4ERR_GRACE:
+ rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
+ break;
+ case -NFS4ERR_RETRY_UNCACHED_REP:
+ break;
+ /* Invalidate Layout errors */
+ case -NFS4ERR_PNFS_NO_LAYOUT:
+ case -ESTALE: /* mapped NFS4ERR_STALE */
+ case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */
+ case -EISDIR: /* mapped NFS4ERR_ISDIR */
+ case -NFS4ERR_FHEXPIRED:
+ case -NFS4ERR_WRONG_TYPE:
+ dprintk("%s Invalid layout error %d\n", __func__,
+ task->tk_status);
+ /*
+ * Destroy layout so new i/o will get a new layout.
+ * Layout will not be destroyed until all current lseg
+ * references are put. Mark layout as invalid to resend failed
+ * i/o and all i/o waiting on the slot table to the MDS until
+ * layout is destroyed and a new valid layout is obtained.
+ */
+ pnfs_destroy_layout(NFS_I(inode));
+ rpc_wake_up(&tbl->slot_tbl_waitq);
+ goto reset;
+ /* RPC connection errors */
+ case -ECONNREFUSED:
+ case -EHOSTDOWN:
+ case -EHOSTUNREACH:
+ case -ENETUNREACH:
+ case -EIO:
+ case -ETIMEDOUT:
+ case -EPIPE:
+ case -EPROTO:
+ case -ENODEV:
+ dprintk("%s DS connection error %d\n", __func__,
+ task->tk_status);
+ nfs4_delete_deviceid(devid->ld, devid->nfs_client,
+ &devid->deviceid);
+ rpc_wake_up(&tbl->slot_tbl_waitq);
+ fallthrough;
+ default:
+ if (ff_layout_avoid_mds_available_ds(lseg))
+ return -NFS4ERR_RESET_TO_PNFS;
+reset:
+ dprintk("%s Retry through MDS. Error %d\n", __func__,
+ task->tk_status);
+ return -NFS4ERR_RESET_TO_MDS;
+ }
+ task->tk_status = 0;
+ return -EAGAIN;
+}
+
+/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
+static int ff_layout_async_handle_error_v3(struct rpc_task *task,
+ struct pnfs_layout_segment *lseg,
+ u32 idx)
+{
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+ switch (task->tk_status) {
+ /* File access problems. Don't mark the device as unavailable */
+ case -EACCES:
+ case -ESTALE:
+ case -EISDIR:
+ case -EBADHANDLE:
+ case -ELOOP:
+ case -ENOSPC:
+ break;
+ case -EJUKEBOX:
+ nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+ goto out_retry;
+ default:
+ dprintk("%s DS connection error %d\n", __func__,
+ task->tk_status);
+ nfs4_delete_deviceid(devid->ld, devid->nfs_client,
+ &devid->deviceid);
+ }
+ /* FIXME: Need to prevent infinite looping here. */
+ return -NFS4ERR_RESET_TO_PNFS;
+out_retry:
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
+ rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
+ return -EAGAIN;
+}
+
+static int ff_layout_async_handle_error(struct rpc_task *task,
+ struct nfs4_state *state,
+ struct nfs_client *clp,
+ struct pnfs_layout_segment *lseg,
+ u32 idx)
+{
+ int vers = clp->cl_nfs_mod->rpc_vers->number;
+
+ if (task->tk_status >= 0) {
+ ff_layout_mark_ds_reachable(lseg, idx);
+ return 0;
+ }
+
+ /* Handle the case of an invalid layout segment */
+ if (!pnfs_is_valid_lseg(lseg))
+ return -NFS4ERR_RESET_TO_PNFS;
+
+ switch (vers) {
+ case 3:
+ return ff_layout_async_handle_error_v3(task, lseg, idx);
+ case 4:
+ return ff_layout_async_handle_error_v4(task, state, clp,
+ lseg, idx);
+ default:
+ /* should never happen */
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+}
+
+static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
+ u32 idx, u64 offset, u64 length,
+ u32 *op_status, int opnum, int error)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+ u32 status = *op_status;
+ int err;
+
+ if (status == 0) {
+ switch (error) {
+ case -ETIMEDOUT:
+ case -EPFNOSUPPORT:
+ case -EPROTONOSUPPORT:
+ case -EOPNOTSUPP:
+ case -EINVAL:
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -EHOSTDOWN:
+ case -EHOSTUNREACH:
+ case -ENETUNREACH:
+ case -EADDRINUSE:
+ case -ENOBUFS:
+ case -EPIPE:
+ case -EPERM:
+ case -EPROTO:
+ case -ENODEV:
+ *op_status = status = NFS4ERR_NXIO;
+ break;
+ case -EACCES:
+ *op_status = status = NFS4ERR_ACCESS;
+ break;
+ default:
+ return;
+ }
+ }
+
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
+ mirror, offset, length, status, opnum,
+ nfs_io_gfp_mask());
+
+ switch (status) {
+ case NFS4ERR_DELAY:
+ case NFS4ERR_GRACE:
+ break;
+ case NFS4ERR_NXIO:
+ ff_layout_mark_ds_unreachable(lseg, idx);
+ /*
+ * Don't return the layout if this is a read and we still
+ * have layouts to try
+ */
+ if (opnum == OP_READ)
+ break;
+ fallthrough;
+ default:
+ pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+ lseg);
+ }
+
+ dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
+}
+
+/* NFS_PROTO call done callback routines */
+static int ff_layout_read_done_cb(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ int err;
+
+ if (task->tk_status < 0) {
+ ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
+ hdr->args.offset, hdr->args.count,
+ &hdr->res.op_status, OP_READ,
+ task->tk_status);
+ trace_ff_layout_read_error(hdr);
+ }
+
+ err = ff_layout_async_handle_error(task, hdr->args.context->state,
+ hdr->ds_clp, hdr->lseg,
+ hdr->pgio_mirror_idx);
+
+ trace_nfs4_pnfs_read(hdr, err);
+ clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
+ clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
+ switch (err) {
+ case -NFS4ERR_RESET_TO_PNFS:
+ set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
+ return task->tk_status;
+ case -NFS4ERR_RESET_TO_MDS:
+ set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
+ return task->tk_status;
+ case -EAGAIN:
+ goto out_eagain;
+ }
+
+ return 0;
+out_eagain:
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
+}
+
+static bool
+ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
+{
+ return !(FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_LAYOUTCOMMIT);
+}
+
+/*
+ * We reference the rpc_cred of the first WRITE that triggers the need for
+ * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
+ * rfc5661 is not clear about which credential should be used.
+ *
+ * Flexlayout client should treat DS replied FILE_SYNC as DATA_SYNC, so
+ * to follow http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751
+ * we always send layoutcommit after DS writes.
+ */
+static void
+ff_layout_set_layoutcommit(struct inode *inode,
+ struct pnfs_layout_segment *lseg,
+ loff_t end_offset)
+{
+ if (!ff_layout_need_layoutcommit(lseg))
+ return;
+
+ pnfs_set_layoutcommit(inode, lseg, end_offset);
+ dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino,
+ (unsigned long long) NFS_I(inode)->layout->plh_lwb);
+}
+
+static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
+ nfs4_ff_layout_stat_io_start_read(hdr->inode,
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count,
+ task->tk_start);
+}
+
+static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
+ nfs4_ff_layout_stat_io_end_read(task,
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count,
+ hdr->res.count);
+ set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags);
+}
+
+static int ff_layout_read_prepare_common(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
+ rpc_exit(task, -EIO);
+ return -EIO;
+ }
+
+ if (!pnfs_is_valid_lseg(hdr->lseg)) {
+ rpc_exit(task, -EAGAIN);
+ return -EAGAIN;
+ }
+
+ ff_layout_read_record_layoutstats_start(task, hdr);
+ return 0;
+}
+
+/*
+ * Call ops for the async read/write cases
+ * In the case of dense layouts, the offset needs to be reset to its
+ * original value.
+ */
+static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (ff_layout_read_prepare_common(task, hdr))
+ return;
+
+ rpc_call_start(task);
+}
+
+static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (nfs4_setup_sequence(hdr->ds_clp,
+ &hdr->args.seq_args,
+ &hdr->res.seq_res,
+ task))
+ return;
+
+ ff_layout_read_prepare_common(task, hdr);
+}
+
+static void ff_layout_read_call_done(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
+ task->tk_status == 0) {
+ nfs4_sequence_done(task, &hdr->res.seq_res);
+ return;
+ }
+
+ /* Note this may cause RPC to be resent */
+ hdr->mds_ops->rpc_call_done(task, hdr);
+}
+
+static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ ff_layout_read_record_layoutstats_done(task, hdr);
+ rpc_count_iostats_metrics(task,
+ &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
+}
+
+static void ff_layout_read_release(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
+ if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags))
+ ff_layout_resend_pnfs_read(hdr);
+ else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
+ ff_layout_reset_read(hdr);
+ pnfs_generic_rw_release(data);
+}
+
+
+static int ff_layout_write_done_cb(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ loff_t end_offs = 0;
+ int err;
+
+ if (task->tk_status < 0) {
+ ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
+ hdr->args.offset, hdr->args.count,
+ &hdr->res.op_status, OP_WRITE,
+ task->tk_status);
+ trace_ff_layout_write_error(hdr);
+ }
+
+ err = ff_layout_async_handle_error(task, hdr->args.context->state,
+ hdr->ds_clp, hdr->lseg,
+ hdr->pgio_mirror_idx);
+
+ trace_nfs4_pnfs_write(hdr, err);
+ clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
+ clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
+ switch (err) {
+ case -NFS4ERR_RESET_TO_PNFS:
+ set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
+ return task->tk_status;
+ case -NFS4ERR_RESET_TO_MDS:
+ set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
+ return task->tk_status;
+ case -EAGAIN:
+ return -EAGAIN;
+ }
+
+ if (hdr->res.verf->committed == NFS_FILE_SYNC ||
+ hdr->res.verf->committed == NFS_DATA_SYNC)
+ end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
+
+ /* Note: if the write is unstable, don't set end_offs until commit */
+ ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
+
+ /* zero out fattr since we don't care DS attr at all */
+ hdr->fattr.valid = 0;
+ if (task->tk_status >= 0)
+ nfs_writeback_update_inode(hdr);
+
+ return 0;
+}
+
+static int ff_layout_commit_done_cb(struct rpc_task *task,
+ struct nfs_commit_data *data)
+{
+ int err;
+
+ if (task->tk_status < 0) {
+ ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
+ data->args.offset, data->args.count,
+ &data->res.op_status, OP_COMMIT,
+ task->tk_status);
+ trace_ff_layout_commit_error(data);
+ }
+
+ err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
+ data->lseg, data->ds_commit_index);
+
+ trace_nfs4_pnfs_commit_ds(data, err);
+ switch (err) {
+ case -NFS4ERR_RESET_TO_PNFS:
+ pnfs_generic_prepare_to_resend_writes(data);
+ return -EAGAIN;
+ case -NFS4ERR_RESET_TO_MDS:
+ pnfs_generic_prepare_to_resend_writes(data);
+ return -EAGAIN;
+ case -EAGAIN:
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
+ }
+
+ ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);
+
+ return 0;
+}
+
+static void ff_layout_write_record_layoutstats_start(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
+ nfs4_ff_layout_stat_io_start_write(hdr->inode,
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count,
+ task->tk_start);
+}
+
+static void ff_layout_write_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
+ nfs4_ff_layout_stat_io_end_write(task,
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count, hdr->res.count,
+ hdr->res.verf->committed);
+ set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags);
+}
+
+static int ff_layout_write_prepare_common(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
+ rpc_exit(task, -EIO);
+ return -EIO;
+ }
+
+ if (!pnfs_is_valid_lseg(hdr->lseg)) {
+ rpc_exit(task, -EAGAIN);
+ return -EAGAIN;
+ }
+
+ ff_layout_write_record_layoutstats_start(task, hdr);
+ return 0;
+}
+
+static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (ff_layout_write_prepare_common(task, hdr))
+ return;
+
+ rpc_call_start(task);
+}
+
+static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (nfs4_setup_sequence(hdr->ds_clp,
+ &hdr->args.seq_args,
+ &hdr->res.seq_res,
+ task))
+ return;
+
+ ff_layout_write_prepare_common(task, hdr);
+}
+
+static void ff_layout_write_call_done(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
+ task->tk_status == 0) {
+ nfs4_sequence_done(task, &hdr->res.seq_res);
+ return;
+ }
+
+ /* Note this may cause RPC to be resent */
+ hdr->mds_ops->rpc_call_done(task, hdr);
+}
+
+static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ ff_layout_write_record_layoutstats_done(task, hdr);
+ rpc_count_iostats_metrics(task,
+ &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
+}
+
+static void ff_layout_write_release(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ ff_layout_write_record_layoutstats_done(&hdr->task, hdr);
+ if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) {
+ ff_layout_send_layouterror(hdr->lseg);
+ ff_layout_reset_write(hdr, true);
+ } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
+ ff_layout_reset_write(hdr, false);
+ pnfs_generic_rw_release(data);
+}
+
+static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task,
+ struct nfs_commit_data *cdata)
+{
+ if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags))
+ return;
+ nfs4_ff_layout_stat_io_start_write(cdata->inode,
+ FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+ 0, task->tk_start);
+}
+
+static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_commit_data *cdata)
+{
+ struct nfs_page *req;
+ __u64 count = 0;
+
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags))
+ return;
+
+ if (task->tk_status == 0) {
+ list_for_each_entry(req, &cdata->pages, wb_list)
+ count += req->wb_bytes;
+ }
+ nfs4_ff_layout_stat_io_end_write(task,
+ FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+ count, count, NFS_FILE_SYNC);
+ set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags);
+}
+
+static int ff_layout_commit_prepare_common(struct rpc_task *task,
+ struct nfs_commit_data *cdata)
+{
+ if (!pnfs_is_valid_lseg(cdata->lseg)) {
+ rpc_exit(task, -EAGAIN);
+ return -EAGAIN;
+ }
+
+ ff_layout_commit_record_layoutstats_start(task, cdata);
+ return 0;
+}
+
+static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
+{
+ if (ff_layout_commit_prepare_common(task, data))
+ return;
+
+ rpc_call_start(task);
+}
+
+static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
+{
+ struct nfs_commit_data *wdata = data;
+
+ if (nfs4_setup_sequence(wdata->ds_clp,
+ &wdata->args.seq_args,
+ &wdata->res.seq_res,
+ task))
+ return;
+ ff_layout_commit_prepare_common(task, data);
+}
+
+static void ff_layout_commit_done(struct rpc_task *task, void *data)
+{
+ pnfs_generic_write_commit_done(task, data);
+}
+
+static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
+{
+ struct nfs_commit_data *cdata = data;
+
+ ff_layout_commit_record_layoutstats_done(task, cdata);
+ rpc_count_iostats_metrics(task,
+ &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
+}
+
+static void ff_layout_commit_release(void *data)
+{
+ struct nfs_commit_data *cdata = data;
+
+ ff_layout_commit_record_layoutstats_done(&cdata->task, cdata);
+ pnfs_generic_commit_release(data);
+}
+
+static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
+ .rpc_call_prepare = ff_layout_read_prepare_v3,
+ .rpc_call_done = ff_layout_read_call_done,
+ .rpc_count_stats = ff_layout_read_count_stats,
+ .rpc_release = ff_layout_read_release,
+};
+
+static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
+ .rpc_call_prepare = ff_layout_read_prepare_v4,
+ .rpc_call_done = ff_layout_read_call_done,
+ .rpc_count_stats = ff_layout_read_count_stats,
+ .rpc_release = ff_layout_read_release,
+};
+
+static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
+ .rpc_call_prepare = ff_layout_write_prepare_v3,
+ .rpc_call_done = ff_layout_write_call_done,
+ .rpc_count_stats = ff_layout_write_count_stats,
+ .rpc_release = ff_layout_write_release,
+};
+
+static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
+ .rpc_call_prepare = ff_layout_write_prepare_v4,
+ .rpc_call_done = ff_layout_write_call_done,
+ .rpc_count_stats = ff_layout_write_count_stats,
+ .rpc_release = ff_layout_write_release,
+};
+
+static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
+ .rpc_call_prepare = ff_layout_commit_prepare_v3,
+ .rpc_call_done = ff_layout_commit_done,
+ .rpc_count_stats = ff_layout_commit_count_stats,
+ .rpc_release = ff_layout_commit_release,
+};
+
+static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
+ .rpc_call_prepare = ff_layout_commit_prepare_v4,
+ .rpc_call_done = ff_layout_commit_done,
+ .rpc_count_stats = ff_layout_commit_count_stats,
+ .rpc_release = ff_layout_commit_release,
+};
+
+static enum pnfs_try_status
+ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
+{
+ struct pnfs_layout_segment *lseg = hdr->lseg;
+ struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
+ struct nfs4_ff_layout_mirror *mirror;
+ const struct cred *ds_cred;
+ loff_t offset = hdr->args.offset;
+ u32 idx = hdr->pgio_mirror_idx;
+ int vers;
+ struct nfs_fh *fh;
+
+ dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
+ __func__, hdr->inode->i_ino,
+ hdr->args.pgbase, (size_t)hdr->args.count, offset);
+
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
+ if (!ds)
+ goto out_failed;
+
+ ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
+ hdr->inode);
+ if (IS_ERR(ds_clnt))
+ goto out_failed;
+
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
+ if (!ds_cred)
+ goto out_failed;
+
+ vers = nfs4_ff_layout_ds_version(mirror);
+
+ dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
+ ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers);
+
+ hdr->pgio_done_cb = ff_layout_read_done_cb;
+ refcount_inc(&ds->ds_clp->cl_count);
+ hdr->ds_clp = ds->ds_clp;
+ fh = nfs4_ff_layout_select_ds_fh(mirror);
+ if (fh)
+ hdr->args.fh = fh;
+
+ nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
+
+ /*
+ * Note that if we ever decide to split across DSes,
+ * then we may need to handle dense-like offsets.
+ */
+ hdr->args.offset = offset;
+ hdr->mds_offset = offset;
+
+ /* Perform an asynchronous read to ds */
+ nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
+ vers == 3 ? &ff_layout_read_call_ops_v3 :
+ &ff_layout_read_call_ops_v4,
+ 0, RPC_TASK_SOFTCONN);
+ put_cred(ds_cred);
+ return PNFS_ATTEMPTED;
+
+out_failed:
+ if (ff_layout_avoid_mds_available_ds(lseg))
+ return PNFS_TRY_AGAIN;
+ trace_pnfs_mds_fallback_read_pagelist(hdr->inode,
+ hdr->args.offset, hdr->args.count,
+ IOMODE_READ, NFS_I(hdr->inode)->layout, lseg);
+ return PNFS_NOT_ATTEMPTED;
+}
+
+/* Perform async writes. */
+static enum pnfs_try_status
+ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
+{
+ struct pnfs_layout_segment *lseg = hdr->lseg;
+ struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
+ struct nfs4_ff_layout_mirror *mirror;
+ const struct cred *ds_cred;
+ loff_t offset = hdr->args.offset;
+ int vers;
+ struct nfs_fh *fh;
+ u32 idx = hdr->pgio_mirror_idx;
+
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
+ if (!ds)
+ goto out_failed;
+
+ ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
+ hdr->inode);
+ if (IS_ERR(ds_clnt))
+ goto out_failed;
+
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
+ if (!ds_cred)
+ goto out_failed;
+
+ vers = nfs4_ff_layout_ds_version(mirror);
+
+ dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n",
+ __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
+ offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count),
+ vers);
+
+ hdr->pgio_done_cb = ff_layout_write_done_cb;
+ refcount_inc(&ds->ds_clp->cl_count);
+ hdr->ds_clp = ds->ds_clp;
+ hdr->ds_commit_idx = idx;
+ fh = nfs4_ff_layout_select_ds_fh(mirror);
+ if (fh)
+ hdr->args.fh = fh;
+
+ nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
+
+ /*
+ * Note that if we ever decide to split across DSes,
+ * then we may need to handle dense-like offsets.
+ */
+ hdr->args.offset = offset;
+
+ /* Perform an asynchronous write */
+ nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
+ vers == 3 ? &ff_layout_write_call_ops_v3 :
+ &ff_layout_write_call_ops_v4,
+ sync, RPC_TASK_SOFTCONN);
+ put_cred(ds_cred);
+ return PNFS_ATTEMPTED;
+
+out_failed:
+ if (ff_layout_avoid_mds_available_ds(lseg))
+ return PNFS_TRY_AGAIN;
+ trace_pnfs_mds_fallback_write_pagelist(hdr->inode,
+ hdr->args.offset, hdr->args.count,
+ IOMODE_RW, NFS_I(hdr->inode)->layout, lseg);
+ return PNFS_NOT_ATTEMPTED;
+}
+
+static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+{
+ return i;
+}
+
+static struct nfs_fh *
+select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
+
+ /* FIXME: Assume that there is only one NFS version available
+ * for the DS.
+ */
+ return &flseg->mirror_array[i]->fh_versions[0];
+}
+
+static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
+{
+ struct pnfs_layout_segment *lseg = data->lseg;
+ struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
+ struct nfs4_ff_layout_mirror *mirror;
+ const struct cred *ds_cred;
+ u32 idx;
+ int vers, ret;
+ struct nfs_fh *fh;
+
+ if (!lseg || !(pnfs_is_valid_lseg(lseg) ||
+ test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)))
+ goto out_err;
+
+ idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
+ if (!ds)
+ goto out_err;
+
+ ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
+ data->inode);
+ if (IS_ERR(ds_clnt))
+ goto out_err;
+
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred);
+ if (!ds_cred)
+ goto out_err;
+
+ vers = nfs4_ff_layout_ds_version(mirror);
+
+ dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
+ data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count),
+ vers);
+ data->commit_done_cb = ff_layout_commit_done_cb;
+ data->cred = ds_cred;
+ refcount_inc(&ds->ds_clp->cl_count);
+ data->ds_clp = ds->ds_clp;
+ fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
+ if (fh)
+ data->args.fh = fh;
+
+ ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
+ vers == 3 ? &ff_layout_commit_call_ops_v3 :
+ &ff_layout_commit_call_ops_v4,
+ how, RPC_TASK_SOFTCONN);
+ put_cred(ds_cred);
+ return ret;
+out_err:
+ pnfs_generic_prepare_to_resend_writes(data);
+ pnfs_generic_commit_release(data);
+ return -EAGAIN;
+}
+
+static int
+ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
+ int how, struct nfs_commit_info *cinfo)
+{
+ return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
+ ff_layout_initiate_commit);
+}
+
+static bool ff_layout_match_rw(const struct rpc_task *task,
+ const struct nfs_pgio_header *hdr,
+ const struct pnfs_layout_segment *lseg)
+{
+ return hdr->lseg == lseg;
+}
+
+static bool ff_layout_match_commit(const struct rpc_task *task,
+ const struct nfs_commit_data *cdata,
+ const struct pnfs_layout_segment *lseg)
+{
+ return cdata->lseg == lseg;
+}
+
+static bool ff_layout_match_io(const struct rpc_task *task, const void *data)
+{
+ const struct rpc_call_ops *ops = task->tk_ops;
+
+ if (ops == &ff_layout_read_call_ops_v3 ||
+ ops == &ff_layout_read_call_ops_v4 ||
+ ops == &ff_layout_write_call_ops_v3 ||
+ ops == &ff_layout_write_call_ops_v4)
+ return ff_layout_match_rw(task, task->tk_calldata, data);
+ if (ops == &ff_layout_commit_call_ops_v3 ||
+ ops == &ff_layout_commit_call_ops_v4)
+ return ff_layout_match_commit(task, task->tk_calldata, data);
+ return false;
+}
+
+static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_ff_layout_ds *mirror_ds;
+ struct nfs4_pnfs_ds *ds;
+ struct nfs_client *ds_clp;
+ struct rpc_clnt *clnt;
+ u32 idx;
+
+ for (idx = 0; idx < flseg->mirror_array_cnt; idx++) {
+ mirror = flseg->mirror_array[idx];
+ mirror_ds = mirror->mirror_ds;
+ if (!mirror_ds)
+ continue;
+ ds = mirror->mirror_ds->ds;
+ if (!ds)
+ continue;
+ ds_clp = ds->ds_clp;
+ if (!ds_clp)
+ continue;
+ clnt = ds_clp->cl_rpcclient;
+ if (!clnt)
+ continue;
+ if (!rpc_cancel_tasks(clnt, -EAGAIN, ff_layout_match_io, lseg))
+ continue;
+ rpc_clnt_disconnect(clnt);
+ }
+}
+
+static struct pnfs_ds_commit_info *
+ff_layout_get_ds_info(struct inode *inode)
+{
+ struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
+
+ if (layout == NULL)
+ return NULL;
+
+ return &FF_LAYOUT_FROM_HDR(layout)->commit_info;
+}
+
+static void
+ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
+ struct inode *inode = lseg->pls_layout->plh_inode;
+ struct pnfs_commit_array *array, *new;
+
+ new = pnfs_alloc_commit_array(flseg->mirror_array_cnt,
+ nfs_io_gfp_mask());
+ if (new) {
+ spin_lock(&inode->i_lock);
+ array = pnfs_add_commit_array(fl_cinfo, new, lseg);
+ spin_unlock(&inode->i_lock);
+ if (array != new)
+ pnfs_free_commit_array(new);
+ }
+}
+
+static void
+ff_layout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+ struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ pnfs_generic_ds_cinfo_destroy(fl_cinfo);
+ spin_unlock(&inode->i_lock);
+}
+
+static void
+ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+ nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
+ id_node));
+}
+
+static int ff_layout_encode_ioerr(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ const struct nfs4_flexfile_layoutreturn_args *ff_args)
+{
+ __be32 *start;
+
+ start = xdr_reserve_space(xdr, 4);
+ if (unlikely(!start))
+ return -E2BIG;
+
+ *start = cpu_to_be32(ff_args->num_errors);
+ /* This assume we always return _ALL_ layouts */
+ return ff_layout_encode_ds_ioerr(xdr, &ff_args->errors);
+}
+
+static void
+encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len)
+{
+ WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr, buf, len) < 0);
+}
+
+static void
+ff_layout_encode_ff_iostat_head(struct xdr_stream *xdr,
+ const nfs4_stateid *stateid,
+ const struct nfs42_layoutstat_devinfo *devinfo)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 8 + 8);
+ p = xdr_encode_hyper(p, devinfo->offset);
+ p = xdr_encode_hyper(p, devinfo->length);
+ encode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE);
+ p = xdr_reserve_space(xdr, 4*8);
+ p = xdr_encode_hyper(p, devinfo->read_count);
+ p = xdr_encode_hyper(p, devinfo->read_bytes);
+ p = xdr_encode_hyper(p, devinfo->write_count);
+ p = xdr_encode_hyper(p, devinfo->write_bytes);
+ encode_opaque_fixed(xdr, devinfo->dev_id.data, NFS4_DEVICEID4_SIZE);
+}
+
+static void
+ff_layout_encode_ff_iostat(struct xdr_stream *xdr,
+ const nfs4_stateid *stateid,
+ const struct nfs42_layoutstat_devinfo *devinfo)
+{
+ ff_layout_encode_ff_iostat_head(xdr, stateid, devinfo);
+ ff_layout_encode_ff_layoutupdate(xdr, devinfo,
+ devinfo->ld_private.data);
+}
+
+/* report nothing for now */
+static void ff_layout_encode_iostats_array(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ struct nfs4_flexfile_layoutreturn_args *ff_args)
+{
+ __be32 *p;
+ int i;
+
+ p = xdr_reserve_space(xdr, 4);
+ *p = cpu_to_be32(ff_args->num_dev);
+ for (i = 0; i < ff_args->num_dev; i++)
+ ff_layout_encode_ff_iostat(xdr,
+ &args->layout->plh_stateid,
+ &ff_args->devinfo[i]);
+}
+
+static void
+ff_layout_free_iostats_array(struct nfs42_layoutstat_devinfo *devinfo,
+ unsigned int num_entries)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_entries; i++) {
+ if (!devinfo[i].ld_private.ops)
+ continue;
+ if (!devinfo[i].ld_private.ops->free)
+ continue;
+ devinfo[i].ld_private.ops->free(&devinfo[i].ld_private);
+ }
+}
+
+static struct nfs4_deviceid_node *
+ff_layout_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_flags)
+{
+ struct nfs4_ff_layout_ds *dsaddr;
+
+ dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags);
+ if (!dsaddr)
+ return NULL;
+ return &dsaddr->id_node;
+}
+
+static void
+ff_layout_encode_layoutreturn(struct xdr_stream *xdr,
+ const void *voidargs,
+ const struct nfs4_xdr_opaque_data *ff_opaque)
+{
+ const struct nfs4_layoutreturn_args *args = voidargs;
+ struct nfs4_flexfile_layoutreturn_args *ff_args = ff_opaque->data;
+ struct xdr_buf tmp_buf = {
+ .head = {
+ [0] = {
+ .iov_base = page_address(ff_args->pages[0]),
+ },
+ },
+ .buflen = PAGE_SIZE,
+ };
+ struct xdr_stream tmp_xdr;
+ __be32 *start;
+
+ dprintk("%s: Begin\n", __func__);
+
+ xdr_init_encode(&tmp_xdr, &tmp_buf, NULL, NULL);
+
+ ff_layout_encode_ioerr(&tmp_xdr, args, ff_args);
+ ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args);
+
+ start = xdr_reserve_space(xdr, 4);
+ *start = cpu_to_be32(tmp_buf.len);
+ xdr_write_pages(xdr, ff_args->pages, 0, tmp_buf.len);
+
+ dprintk("%s: Return\n", __func__);
+}
+
+static void
+ff_layout_free_layoutreturn(struct nfs4_xdr_opaque_data *args)
+{
+ struct nfs4_flexfile_layoutreturn_args *ff_args;
+
+ if (!args->data)
+ return;
+ ff_args = args->data;
+ args->data = NULL;
+
+ ff_layout_free_ds_ioerr(&ff_args->errors);
+ ff_layout_free_iostats_array(ff_args->devinfo, ff_args->num_dev);
+
+ put_page(ff_args->pages[0]);
+ kfree(ff_args);
+}
+
+static const struct nfs4_xdr_opaque_ops layoutreturn_ops = {
+ .encode = ff_layout_encode_layoutreturn,
+ .free = ff_layout_free_layoutreturn,
+};
+
+static int
+ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args)
+{
+ struct nfs4_flexfile_layoutreturn_args *ff_args;
+ struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(args->layout);
+
+ ff_args = kmalloc(sizeof(*ff_args), nfs_io_gfp_mask());
+ if (!ff_args)
+ goto out_nomem;
+ ff_args->pages[0] = alloc_page(nfs_io_gfp_mask());
+ if (!ff_args->pages[0])
+ goto out_nomem_free;
+
+ INIT_LIST_HEAD(&ff_args->errors);
+ ff_args->num_errors = ff_layout_fetch_ds_ioerr(args->layout,
+ &args->range, &ff_args->errors,
+ FF_LAYOUTRETURN_MAXERR);
+
+ spin_lock(&args->inode->i_lock);
+ ff_args->num_dev = ff_layout_mirror_prepare_stats(
+ &ff_layout->generic_hdr, &ff_args->devinfo[0],
+ ARRAY_SIZE(ff_args->devinfo), NFS4_FF_OP_LAYOUTRETURN);
+ spin_unlock(&args->inode->i_lock);
+
+ args->ld_private->ops = &layoutreturn_ops;
+ args->ld_private->data = ff_args;
+ return 0;
+out_nomem_free:
+ kfree(ff_args);
+out_nomem:
+ return -ENOMEM;
+}
+
+#ifdef CONFIG_NFS_V4_2
+void
+ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_layout_hdr *lo = lseg->pls_layout;
+ struct nfs42_layout_error *errors;
+ LIST_HEAD(head);
+
+ if (!nfs_server_capable(lo->plh_inode, NFS_CAP_LAYOUTERROR))
+ return;
+ ff_layout_fetch_ds_ioerr(lo, &lseg->pls_range, &head, -1);
+ if (list_empty(&head))
+ return;
+
+ errors = kmalloc_array(NFS42_LAYOUTERROR_MAX, sizeof(*errors),
+ nfs_io_gfp_mask());
+ if (errors != NULL) {
+ const struct nfs4_ff_layout_ds_err *pos;
+ size_t n = 0;
+
+ list_for_each_entry(pos, &head, list) {
+ errors[n].offset = pos->offset;
+ errors[n].length = pos->length;
+ nfs4_stateid_copy(&errors[n].stateid, &pos->stateid);
+ errors[n].errors[0].dev_id = pos->deviceid;
+ errors[n].errors[0].status = pos->status;
+ errors[n].errors[0].opnum = pos->opnum;
+ n++;
+ if (!list_is_last(&pos->list, &head) &&
+ n < NFS42_LAYOUTERROR_MAX)
+ continue;
+ if (nfs42_proc_layouterror(lseg, errors, n) < 0)
+ break;
+ n = 0;
+ }
+ kfree(errors);
+ }
+ ff_layout_free_ds_ioerr(&head);
+}
+#else
+void
+ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
+{
+}
+#endif
+
+static int
+ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
+{
+ const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+
+ return snprintf(buf, buflen, "%pI4", &sin->sin_addr);
+}
+
+static size_t
+ff_layout_ntop6_noscopeid(const struct sockaddr *sap, char *buf,
+ const int buflen)
+{
+ const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+ const struct in6_addr *addr = &sin6->sin6_addr;
+
+ /*
+ * RFC 4291, Section 2.2.2
+ *
+ * Shorthanded ANY address
+ */
+ if (ipv6_addr_any(addr))
+ return snprintf(buf, buflen, "::");
+
+ /*
+ * RFC 4291, Section 2.2.2
+ *
+ * Shorthanded loopback address
+ */
+ if (ipv6_addr_loopback(addr))
+ return snprintf(buf, buflen, "::1");
+
+ /*
+ * RFC 4291, Section 2.2.3
+ *
+ * Special presentation address format for mapped v4
+ * addresses.
+ */
+ if (ipv6_addr_v4mapped(addr))
+ return snprintf(buf, buflen, "::ffff:%pI4",
+ &addr->s6_addr32[3]);
+
+ /*
+ * RFC 4291, Section 2.2.1
+ */
+ return snprintf(buf, buflen, "%pI6c", addr);
+}
+
+/* Derived from rpc_sockaddr2uaddr */
+static void
+ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da)
+{
+ struct sockaddr *sap = (struct sockaddr *)&da->da_addr;
+ char portbuf[RPCBIND_MAXUADDRPLEN];
+ char addrbuf[RPCBIND_MAXUADDRLEN];
+ unsigned short port;
+ int len, netid_len;
+ __be32 *p;
+
+ switch (sap->sa_family) {
+ case AF_INET:
+ if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0)
+ return;
+ port = ntohs(((struct sockaddr_in *)sap)->sin_port);
+ break;
+ case AF_INET6:
+ if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0)
+ return;
+ port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff);
+ len = strlcat(addrbuf, portbuf, sizeof(addrbuf));
+
+ netid_len = strlen(da->da_netid);
+ p = xdr_reserve_space(xdr, 4 + netid_len);
+ xdr_encode_opaque(p, da->da_netid, netid_len);
+
+ p = xdr_reserve_space(xdr, 4 + len);
+ xdr_encode_opaque(p, addrbuf, len);
+}
+
+static void
+ff_layout_encode_nfstime(struct xdr_stream *xdr,
+ ktime_t t)
+{
+ struct timespec64 ts;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 12);
+ ts = ktime_to_timespec64(t);
+ p = xdr_encode_hyper(p, ts.tv_sec);
+ *p++ = cpu_to_be32(ts.tv_nsec);
+}
+
+static void
+ff_layout_encode_io_latency(struct xdr_stream *xdr,
+ struct nfs4_ff_io_stat *stat)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 5 * 8);
+ p = xdr_encode_hyper(p, stat->ops_requested);
+ p = xdr_encode_hyper(p, stat->bytes_requested);
+ p = xdr_encode_hyper(p, stat->ops_completed);
+ p = xdr_encode_hyper(p, stat->bytes_completed);
+ p = xdr_encode_hyper(p, stat->bytes_not_delivered);
+ ff_layout_encode_nfstime(xdr, stat->total_busy_time);
+ ff_layout_encode_nfstime(xdr, stat->aggregate_completion_time);
+}
+
+static void
+ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
+ const struct nfs42_layoutstat_devinfo *devinfo,
+ struct nfs4_ff_layout_mirror *mirror)
+{
+ struct nfs4_pnfs_ds_addr *da;
+ struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
+ struct nfs_fh *fh = &mirror->fh_versions[0];
+ __be32 *p;
+
+ da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
+ dprintk("%s: DS %s: encoding address %s\n",
+ __func__, ds->ds_remotestr, da->da_remotestr);
+ /* netaddr4 */
+ ff_layout_encode_netaddr(xdr, da);
+ /* nfs_fh4 */
+ p = xdr_reserve_space(xdr, 4 + fh->size);
+ xdr_encode_opaque(p, fh->data, fh->size);
+ /* ff_io_latency4 read */
+ spin_lock(&mirror->lock);
+ ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat);
+ /* ff_io_latency4 write */
+ ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat);
+ spin_unlock(&mirror->lock);
+ /* nfstime4 */
+ ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time));
+ /* bool */
+ p = xdr_reserve_space(xdr, 4);
+ *p = cpu_to_be32(false);
+}
+
+static void
+ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args,
+ const struct nfs4_xdr_opaque_data *opaque)
+{
+ struct nfs42_layoutstat_devinfo *devinfo = container_of(opaque,
+ struct nfs42_layoutstat_devinfo, ld_private);
+ __be32 *start;
+
+ /* layoutupdate length */
+ start = xdr_reserve_space(xdr, 4);
+ ff_layout_encode_ff_layoutupdate(xdr, devinfo, opaque->data);
+
+ *start = cpu_to_be32((xdr->p - start - 1) * 4);
+}
+
+static void
+ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque)
+{
+ struct nfs4_ff_layout_mirror *mirror = opaque->data;
+
+ ff_layout_put_mirror(mirror);
+}
+
+static const struct nfs4_xdr_opaque_ops layoutstat_ops = {
+ .encode = ff_layout_encode_layoutstats,
+ .free = ff_layout_free_layoutstats,
+};
+
+static int
+ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
+ struct nfs42_layoutstat_devinfo *devinfo,
+ int dev_limit, enum nfs4_ff_op_type type)
+{
+ struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_deviceid_node *dev;
+ int i = 0;
+
+ list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
+ if (i >= dev_limit)
+ break;
+ if (IS_ERR_OR_NULL(mirror->mirror_ds))
+ continue;
+ if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL,
+ &mirror->flags) &&
+ type != NFS4_FF_OP_LAYOUTRETURN)
+ continue;
+ /* mirror refcount put in cleanup_layoutstats */
+ if (!refcount_inc_not_zero(&mirror->ref))
+ continue;
+ dev = &mirror->mirror_ds->id_node;
+ memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
+ devinfo->offset = 0;
+ devinfo->length = NFS4_MAX_UINT64;
+ spin_lock(&mirror->lock);
+ devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
+ devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
+ devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
+ devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
+ spin_unlock(&mirror->lock);
+ devinfo->layout_type = LAYOUT_FLEX_FILES;
+ devinfo->ld_private.ops = &layoutstat_ops;
+ devinfo->ld_private.data = mirror;
+
+ devinfo++;
+ i++;
+ }
+ return i;
+}
+
+static int ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
+{
+ struct pnfs_layout_hdr *lo;
+ struct nfs4_flexfile_layout *ff_layout;
+ const int dev_count = PNFS_LAYOUTSTATS_MAXDEV;
+
+ /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
+ args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo),
+ nfs_io_gfp_mask());
+ if (!args->devinfo)
+ return -ENOMEM;
+
+ spin_lock(&args->inode->i_lock);
+ lo = NFS_I(args->inode)->layout;
+ if (lo && pnfs_layout_is_valid(lo)) {
+ ff_layout = FF_LAYOUT_FROM_HDR(lo);
+ args->num_dev = ff_layout_mirror_prepare_stats(
+ &ff_layout->generic_hdr, &args->devinfo[0], dev_count,
+ NFS4_FF_OP_LAYOUTSTATS);
+ } else
+ args->num_dev = 0;
+ spin_unlock(&args->inode->i_lock);
+ if (!args->num_dev) {
+ kfree(args->devinfo);
+ args->devinfo = NULL;
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+static int
+ff_layout_set_layoutdriver(struct nfs_server *server,
+ const struct nfs_fh *dummy)
+{
+#if IS_ENABLED(CONFIG_NFS_V4_2)
+ server->caps |= NFS_CAP_LAYOUTSTATS;
+#endif
+ return 0;
+}
+
+static const struct pnfs_commit_ops ff_layout_commit_ops = {
+ .setup_ds_info = ff_layout_setup_ds_info,
+ .release_ds_info = ff_layout_release_ds_info,
+ .mark_request_commit = pnfs_layout_mark_request_commit,
+ .clear_request_commit = pnfs_generic_clear_request_commit,
+ .scan_commit_lists = pnfs_generic_scan_commit_lists,
+ .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
+ .commit_pagelist = ff_layout_commit_pagelist,
+};
+
+static struct pnfs_layoutdriver_type flexfilelayout_type = {
+ .id = LAYOUT_FLEX_FILES,
+ .name = "LAYOUT_FLEX_FILES",
+ .owner = THIS_MODULE,
+ .flags = PNFS_LAYOUTGET_ON_OPEN,
+ .max_layoutget_response = 4096, /* 1 page or so... */
+ .set_layoutdriver = ff_layout_set_layoutdriver,
+ .alloc_layout_hdr = ff_layout_alloc_layout_hdr,
+ .free_layout_hdr = ff_layout_free_layout_hdr,
+ .alloc_lseg = ff_layout_alloc_lseg,
+ .free_lseg = ff_layout_free_lseg,
+ .add_lseg = ff_layout_add_lseg,
+ .pg_read_ops = &ff_layout_pg_read_ops,
+ .pg_write_ops = &ff_layout_pg_write_ops,
+ .get_ds_info = ff_layout_get_ds_info,
+ .free_deviceid_node = ff_layout_free_deviceid_node,
+ .read_pagelist = ff_layout_read_pagelist,
+ .write_pagelist = ff_layout_write_pagelist,
+ .alloc_deviceid_node = ff_layout_alloc_deviceid_node,
+ .prepare_layoutreturn = ff_layout_prepare_layoutreturn,
+ .sync = pnfs_nfs_generic_sync,
+ .prepare_layoutstats = ff_layout_prepare_layoutstats,
+ .cancel_io = ff_layout_cancel_io,
+};
+
+static int __init nfs4flexfilelayout_init(void)
+{
+ printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
+ __func__);
+ return pnfs_register_layoutdriver(&flexfilelayout_type);
+}
+
+static void __exit nfs4flexfilelayout_exit(void)
+{
+ printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
+ __func__);
+ pnfs_unregister_layoutdriver(&flexfilelayout_type);
+}
+
+MODULE_ALIAS("nfs-layouttype4-4");
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("The NFSv4 flexfile layout driver");
+
+module_init(nfs4flexfilelayout_init);
+module_exit(nfs4flexfilelayout_exit);
+
+module_param(io_maxretrans, ushort, 0644);
+MODULE_PARM_DESC(io_maxretrans, "The number of times the NFSv4.1 client "
+ "retries an I/O request before returning an error. ");
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
new file mode 100644
index 0000000000..354a031c69
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -0,0 +1,226 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NFSv4 flexfile layout driver data structures.
+ *
+ * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
+ *
+ * Tao Peng <bergwolf@primarydata.com>
+ */
+
+#ifndef FS_NFS_NFS4FLEXFILELAYOUT_H
+#define FS_NFS_NFS4FLEXFILELAYOUT_H
+
+#define FF_FLAGS_NO_LAYOUTCOMMIT 1
+#define FF_FLAGS_NO_IO_THRU_MDS 2
+#define FF_FLAGS_NO_READ_IO 4
+
+#include <linux/refcount.h>
+#include "../pnfs.h"
+
+/* XXX: Let's filter out insanely large mirror count for now to avoid oom
+ * due to network error etc. */
+#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
+
+/* LAYOUTSTATS report interval in ms */
+#define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L)
+#define FF_LAYOUTSTATS_MAXDEV 4
+
+struct nfs4_ff_ds_version {
+ u32 version;
+ u32 minor_version;
+ u32 rsize;
+ u32 wsize;
+ bool tightly_coupled;
+};
+
+/* chained in global deviceid hlist */
+struct nfs4_ff_layout_ds {
+ struct nfs4_deviceid_node id_node;
+ u32 ds_versions_cnt;
+ struct nfs4_ff_ds_version *ds_versions;
+ struct nfs4_pnfs_ds *ds;
+};
+
+struct nfs4_ff_layout_ds_err {
+ struct list_head list; /* linked in mirror error_list */
+ u64 offset;
+ u64 length;
+ int status;
+ enum nfs_opnum4 opnum;
+ nfs4_stateid stateid;
+ struct nfs4_deviceid deviceid;
+};
+
+struct nfs4_ff_io_stat {
+ __u64 ops_requested;
+ __u64 bytes_requested;
+ __u64 ops_completed;
+ __u64 bytes_completed;
+ __u64 bytes_not_delivered;
+ ktime_t total_busy_time;
+ ktime_t aggregate_completion_time;
+};
+
+struct nfs4_ff_busy_timer {
+ ktime_t start_time;
+ atomic_t n_ops;
+};
+
+struct nfs4_ff_layoutstat {
+ struct nfs4_ff_io_stat io_stat;
+ struct nfs4_ff_busy_timer busy_timer;
+};
+
+struct nfs4_ff_layout_mirror {
+ struct pnfs_layout_hdr *layout;
+ struct list_head mirrors;
+ u32 ds_count;
+ u32 efficiency;
+ struct nfs4_deviceid devid;
+ struct nfs4_ff_layout_ds *mirror_ds;
+ u32 fh_versions_cnt;
+ struct nfs_fh *fh_versions;
+ nfs4_stateid stateid;
+ const struct cred __rcu *ro_cred;
+ const struct cred __rcu *rw_cred;
+ refcount_t ref;
+ spinlock_t lock;
+ unsigned long flags;
+ struct nfs4_ff_layoutstat read_stat;
+ struct nfs4_ff_layoutstat write_stat;
+ ktime_t start_time;
+ u32 report_interval;
+};
+
+#define NFS4_FF_MIRROR_STAT_AVAIL (0)
+
+struct nfs4_ff_layout_segment {
+ struct pnfs_layout_segment generic_hdr;
+ u64 stripe_unit;
+ u32 flags;
+ u32 mirror_array_cnt;
+ struct nfs4_ff_layout_mirror *mirror_array[];
+};
+
+struct nfs4_flexfile_layout {
+ struct pnfs_layout_hdr generic_hdr;
+ struct pnfs_ds_commit_info commit_info;
+ struct list_head mirrors;
+ struct list_head error_list; /* nfs4_ff_layout_ds_err */
+ ktime_t last_report_time; /* Layoutstat report times */
+};
+
+struct nfs4_flexfile_layoutreturn_args {
+ struct list_head errors;
+ struct nfs42_layoutstat_devinfo devinfo[FF_LAYOUTSTATS_MAXDEV];
+ unsigned int num_errors;
+ unsigned int num_dev;
+ struct page *pages[1];
+};
+
+static inline struct nfs4_flexfile_layout *
+FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo)
+{
+ return container_of(lo, struct nfs4_flexfile_layout, generic_hdr);
+}
+
+static inline struct nfs4_ff_layout_segment *
+FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg)
+{
+ return container_of(lseg,
+ struct nfs4_ff_layout_segment,
+ generic_hdr);
+}
+
+static inline struct nfs4_ff_layout_ds *
+FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node)
+{
+ return container_of(node, struct nfs4_ff_layout_ds, id_node);
+}
+
+static inline struct nfs4_ff_layout_mirror *
+FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx)
+{
+ struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+
+ if (idx < fls->mirror_array_cnt)
+ return fls->mirror_array[idx];
+ return NULL;
+}
+
+static inline struct nfs4_deviceid_node *
+FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
+{
+ struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx);
+
+ if (mirror != NULL) {
+ struct nfs4_ff_layout_ds *mirror_ds = mirror->mirror_ds;
+
+ if (!IS_ERR_OR_NULL(mirror_ds))
+ return &mirror_ds->id_node;
+ }
+ return NULL;
+}
+
+static inline u32
+FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg)
+{
+ return FF_LAYOUT_LSEG(lseg)->mirror_array_cnt;
+}
+
+static inline bool
+ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg)
+{
+ return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_IO_THRU_MDS;
+}
+
+static inline bool
+ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg)
+{
+ return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO;
+}
+
+static inline int
+nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror)
+{
+ return mirror->mirror_ds->ds_versions[0].version;
+}
+
+struct nfs4_ff_layout_ds *
+nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags);
+void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
+void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
+int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
+ struct nfs4_ff_layout_mirror *mirror, u64 offset,
+ u64 length, int status, enum nfs_opnum4 opnum,
+ gfp_t gfp_flags);
+void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg);
+int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head);
+void ff_layout_free_ds_ioerr(struct list_head *head);
+unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range,
+ struct list_head *head,
+ unsigned int maxnum);
+struct nfs_fh *
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror);
+void
+nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
+ nfs4_stateid *stateid);
+
+struct nfs4_pnfs_ds *
+nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
+ struct nfs4_ff_layout_mirror *mirror,
+ bool fail_return);
+
+struct rpc_clnt *
+nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
+ struct nfs_client *ds_clp,
+ struct inode *inode);
+const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
+ const struct pnfs_layout_range *range,
+ const struct cred *mdscred);
+bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg);
+bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg);
+
+#endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
new file mode 100644
index 0000000000..e028f5a0ef
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -0,0 +1,621 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device operations for the pnfs nfs4 file layout driver.
+ *
+ * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
+ *
+ * Tao Peng <bergwolf@primarydata.com>
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/sunrpc/addr.h>
+
+#include "../internal.h"
+#include "../nfs4session.h"
+#include "flexfilelayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+static unsigned int dataserver_timeo = NFS_DEF_TCP_TIMEO;
+static unsigned int dataserver_retrans;
+
+static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
+
+void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
+{
+ if (!IS_ERR_OR_NULL(mirror_ds))
+ nfs4_put_deviceid_node(&mirror_ds->id_node);
+}
+
+void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
+{
+ nfs4_print_deviceid(&mirror_ds->id_node.deviceid);
+ nfs4_pnfs_ds_put(mirror_ds->ds);
+ kfree(mirror_ds->ds_versions);
+ kfree_rcu(mirror_ds, id_node.rcu);
+}
+
+/* Decode opaque device data and construct new_ds using it */
+struct nfs4_ff_layout_ds *
+nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags)
+{
+ struct xdr_stream stream;
+ struct xdr_buf buf;
+ struct page *scratch;
+ struct list_head dsaddrs;
+ struct nfs4_pnfs_ds_addr *da;
+ struct nfs4_ff_layout_ds *new_ds = NULL;
+ struct nfs4_ff_ds_version *ds_versions = NULL;
+ u32 mp_count;
+ u32 version_count;
+ __be32 *p;
+ int i, ret = -ENOMEM;
+
+ /* set up xdr stream */
+ scratch = alloc_page(gfp_flags);
+ if (!scratch)
+ goto out_err;
+
+ new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags);
+ if (!new_ds)
+ goto out_scratch;
+
+ nfs4_init_deviceid_node(&new_ds->id_node,
+ server,
+ &pdev->dev_id);
+ INIT_LIST_HEAD(&dsaddrs);
+
+ xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
+ xdr_set_scratch_page(&stream, scratch);
+
+ /* multipath count */
+ p = xdr_inline_decode(&stream, 4);
+ if (unlikely(!p))
+ goto out_err_drain_dsaddrs;
+ mp_count = be32_to_cpup(p);
+ dprintk("%s: multipath ds count %d\n", __func__, mp_count);
+
+ for (i = 0; i < mp_count; i++) {
+ /* multipath ds */
+ da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
+ &stream, gfp_flags);
+ if (da)
+ list_add_tail(&da->da_node, &dsaddrs);
+ }
+ if (list_empty(&dsaddrs)) {
+ dprintk("%s: no suitable DS addresses found\n",
+ __func__);
+ ret = -ENOMEDIUM;
+ goto out_err_drain_dsaddrs;
+ }
+
+ /* version count */
+ p = xdr_inline_decode(&stream, 4);
+ if (unlikely(!p))
+ goto out_err_drain_dsaddrs;
+ version_count = be32_to_cpup(p);
+ dprintk("%s: version count %d\n", __func__, version_count);
+
+ ds_versions = kcalloc(version_count,
+ sizeof(struct nfs4_ff_ds_version),
+ gfp_flags);
+ if (!ds_versions)
+ goto out_scratch;
+
+ for (i = 0; i < version_count; i++) {
+ /* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) +
+ * tightly_coupled(4) */
+ p = xdr_inline_decode(&stream, 20);
+ if (unlikely(!p))
+ goto out_err_drain_dsaddrs;
+ ds_versions[i].version = be32_to_cpup(p++);
+ ds_versions[i].minor_version = be32_to_cpup(p++);
+ ds_versions[i].rsize = nfs_io_size(be32_to_cpup(p++),
+ server->nfs_client->cl_proto);
+ ds_versions[i].wsize = nfs_io_size(be32_to_cpup(p++),
+ server->nfs_client->cl_proto);
+ ds_versions[i].tightly_coupled = be32_to_cpup(p);
+
+ if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE)
+ ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE;
+ if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE)
+ ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE;
+
+ /*
+ * check for valid major/minor combination.
+ * currently we support dataserver which talk:
+ * v3, v4.0, v4.1, v4.2
+ */
+ if (!((ds_versions[i].version == 3 && ds_versions[i].minor_version == 0) ||
+ (ds_versions[i].version == 4 && ds_versions[i].minor_version < 3))) {
+ dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__,
+ i, ds_versions[i].version,
+ ds_versions[i].minor_version);
+ ret = -EPROTONOSUPPORT;
+ goto out_err_drain_dsaddrs;
+ }
+
+ dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n",
+ __func__, i, ds_versions[i].version,
+ ds_versions[i].minor_version,
+ ds_versions[i].rsize,
+ ds_versions[i].wsize,
+ ds_versions[i].tightly_coupled);
+ }
+
+ new_ds->ds_versions = ds_versions;
+ new_ds->ds_versions_cnt = version_count;
+
+ new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
+ if (!new_ds->ds)
+ goto out_err_drain_dsaddrs;
+
+ /* If DS was already in cache, free ds addrs */
+ while (!list_empty(&dsaddrs)) {
+ da = list_first_entry(&dsaddrs,
+ struct nfs4_pnfs_ds_addr,
+ da_node);
+ list_del_init(&da->da_node);
+ kfree(da->da_remotestr);
+ kfree(da);
+ }
+
+ __free_page(scratch);
+ return new_ds;
+
+out_err_drain_dsaddrs:
+ while (!list_empty(&dsaddrs)) {
+ da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
+ da_node);
+ list_del_init(&da->da_node);
+ kfree(da->da_remotestr);
+ kfree(da);
+ }
+
+ kfree(ds_versions);
+out_scratch:
+ __free_page(scratch);
+out_err:
+ kfree(new_ds);
+
+ dprintk("%s ERROR: returning %d\n", __func__, ret);
+ return NULL;
+}
+
+static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
+ u64 offset, u64 length)
+{
+ u64 end;
+
+ end = max_t(u64, pnfs_end_offset(err->offset, err->length),
+ pnfs_end_offset(offset, length));
+ err->offset = min_t(u64, err->offset, offset);
+ err->length = end - err->offset;
+}
+
+static int
+ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1,
+ const struct nfs4_ff_layout_ds_err *e2)
+{
+ int ret;
+
+ if (e1->opnum != e2->opnum)
+ return e1->opnum < e2->opnum ? -1 : 1;
+ if (e1->status != e2->status)
+ return e1->status < e2->status ? -1 : 1;
+ ret = memcmp(e1->stateid.data, e2->stateid.data,
+ sizeof(e1->stateid.data));
+ if (ret != 0)
+ return ret;
+ ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid));
+ if (ret != 0)
+ return ret;
+ if (pnfs_end_offset(e1->offset, e1->length) < e2->offset)
+ return -1;
+ if (e1->offset > pnfs_end_offset(e2->offset, e2->length))
+ return 1;
+ /* If ranges overlap or are contiguous, they are the same */
+ return 0;
+}
+
+static void
+ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
+ struct nfs4_ff_layout_ds_err *dserr)
+{
+ struct nfs4_ff_layout_ds_err *err, *tmp;
+ struct list_head *head = &flo->error_list;
+ int match;
+
+ /* Do insertion sort w/ merges */
+ list_for_each_entry_safe(err, tmp, &flo->error_list, list) {
+ match = ff_ds_error_match(err, dserr);
+ if (match < 0)
+ continue;
+ if (match > 0) {
+ /* Add entry "dserr" _before_ entry "err" */
+ head = &err->list;
+ break;
+ }
+ /* Entries match, so merge "err" into "dserr" */
+ extend_ds_error(dserr, err->offset, err->length);
+ list_replace(&err->list, &dserr->list);
+ kfree(err);
+ return;
+ }
+
+ list_add_tail(&dserr->list, head);
+}
+
+int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
+ struct nfs4_ff_layout_mirror *mirror, u64 offset,
+ u64 length, int status, enum nfs_opnum4 opnum,
+ gfp_t gfp_flags)
+{
+ struct nfs4_ff_layout_ds_err *dserr;
+
+ if (status == 0)
+ return 0;
+
+ if (IS_ERR_OR_NULL(mirror->mirror_ds))
+ return -EINVAL;
+
+ dserr = kmalloc(sizeof(*dserr), gfp_flags);
+ if (!dserr)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&dserr->list);
+ dserr->offset = offset;
+ dserr->length = length;
+ dserr->status = status;
+ dserr->opnum = opnum;
+ nfs4_stateid_copy(&dserr->stateid, &mirror->stateid);
+ memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid,
+ NFS4_DEVICEID4_SIZE);
+
+ spin_lock(&flo->generic_hdr.plh_inode->i_lock);
+ ff_layout_add_ds_error_locked(flo, dserr);
+ spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
+ return 0;
+}
+
+static const struct cred *
+ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
+{
+ const struct cred *cred, __rcu **pcred;
+
+ if (iomode == IOMODE_READ)
+ pcred = &mirror->ro_cred;
+ else
+ pcred = &mirror->rw_cred;
+
+ rcu_read_lock();
+ do {
+ cred = rcu_dereference(*pcred);
+ if (!cred)
+ break;
+
+ cred = get_cred_rcu(cred);
+ } while(!cred);
+ rcu_read_unlock();
+ return cred;
+}
+
+struct nfs_fh *
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror)
+{
+ /* FIXME: For now assume there is only 1 version available for the DS */
+ return &mirror->fh_versions[0];
+}
+
+void
+nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
+ nfs4_stateid *stateid)
+{
+ if (nfs4_ff_layout_ds_version(mirror) == 4)
+ nfs4_stateid_copy(stateid, &mirror->stateid);
+}
+
+static bool
+ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo,
+ struct nfs4_ff_layout_mirror *mirror)
+{
+ if (mirror == NULL)
+ goto outerr;
+ if (mirror->mirror_ds == NULL) {
+ struct nfs4_deviceid_node *node;
+ struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV);
+
+ node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode),
+ &mirror->devid, lo->plh_lc_cred,
+ GFP_KERNEL);
+ if (node)
+ mirror_ds = FF_LAYOUT_MIRROR_DS(node);
+
+ /* check for race with another call to this function */
+ if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) &&
+ mirror_ds != ERR_PTR(-ENODEV))
+ nfs4_put_deviceid_node(node);
+ }
+
+ if (IS_ERR(mirror->mirror_ds))
+ goto outerr;
+
+ return true;
+outerr:
+ return false;
+}
+
+/**
+ * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
+ * @lseg: the layout segment we're operating on
+ * @mirror: layout mirror describing the DS to use
+ * @fail_return: return layout on connect failure?
+ *
+ * Try to prepare a DS connection to accept an RPC call. This involves
+ * selecting a mirror to use and connecting the client to it if it's not
+ * already connected.
+ *
+ * Since we only need a single functioning mirror to satisfy a read, we don't
+ * want to return the layout if there is one. For writes though, any down
+ * mirror should result in a LAYOUTRETURN. @fail_return is how we distinguish
+ * between the two cases.
+ *
+ * Returns a pointer to a connected DS object on success or NULL on failure.
+ */
+struct nfs4_pnfs_ds *
+nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
+ struct nfs4_ff_layout_mirror *mirror,
+ bool fail_return)
+{
+ struct nfs4_pnfs_ds *ds = NULL;
+ struct inode *ino = lseg->pls_layout->plh_inode;
+ struct nfs_server *s = NFS_SERVER(ino);
+ unsigned int max_payload;
+ int status;
+
+ if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror))
+ goto noconnect;
+
+ ds = mirror->mirror_ds->ds;
+ if (READ_ONCE(ds->ds_clp))
+ goto out;
+ /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
+ smp_rmb();
+
+ /* FIXME: For now we assume the server sent only one version of NFS
+ * to use for the DS.
+ */
+ status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node,
+ dataserver_timeo, dataserver_retrans,
+ mirror->mirror_ds->ds_versions[0].version,
+ mirror->mirror_ds->ds_versions[0].minor_version);
+
+ /* connect success, check rsize/wsize limit */
+ if (!status) {
+ max_payload =
+ nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
+ NULL);
+ if (mirror->mirror_ds->ds_versions[0].rsize > max_payload)
+ mirror->mirror_ds->ds_versions[0].rsize = max_payload;
+ if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
+ mirror->mirror_ds->ds_versions[0].wsize = max_payload;
+ goto out;
+ }
+noconnect:
+ ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
+ mirror, lseg->pls_range.offset,
+ lseg->pls_range.length, NFS4ERR_NXIO,
+ OP_ILLEGAL, GFP_NOIO);
+ ff_layout_send_layouterror(lseg);
+ if (fail_return || !ff_layout_has_available_ds(lseg))
+ pnfs_error_mark_layout_for_return(ino, lseg);
+ ds = NULL;
+out:
+ return ds;
+}
+
+const struct cred *
+ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
+ const struct pnfs_layout_range *range,
+ const struct cred *mdscred)
+{
+ const struct cred *cred;
+
+ if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) {
+ cred = ff_layout_get_mirror_cred(mirror, range->iomode);
+ if (!cred)
+ cred = get_cred(mdscred);
+ } else {
+ cred = get_cred(mdscred);
+ }
+ return cred;
+}
+
+/**
+ * nfs4_ff_find_or_create_ds_client - Find or create a DS rpc client
+ * @mirror: pointer to the mirror
+ * @ds_clp: nfs_client for the DS
+ * @inode: pointer to inode
+ *
+ * Find or create a DS rpc client with th MDS server rpc client auth flavor
+ * in the nfs_client cl_ds_clients list.
+ */
+struct rpc_clnt *
+nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
+ struct nfs_client *ds_clp, struct inode *inode)
+{
+ switch (mirror->mirror_ds->ds_versions[0].version) {
+ case 3:
+ /* For NFSv3 DS, flavor is set when creating DS connections */
+ return ds_clp->cl_rpcclient;
+ case 4:
+ return nfs4_find_or_create_ds_client(ds_clp, inode);
+ default:
+ BUG();
+ }
+}
+
+void ff_layout_free_ds_ioerr(struct list_head *head)
+{
+ struct nfs4_ff_layout_ds_err *err;
+
+ while (!list_empty(head)) {
+ err = list_first_entry(head,
+ struct nfs4_ff_layout_ds_err,
+ list);
+ list_del(&err->list);
+ kfree(err);
+ }
+}
+
+/* called with inode i_lock held */
+int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head)
+{
+ struct nfs4_ff_layout_ds_err *err;
+ __be32 *p;
+
+ list_for_each_entry(err, head, list) {
+ /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
+ * + array length + deviceid(NFS4_DEVICEID4_SIZE)
+ * + status(4) + opnum(4)
+ */
+ p = xdr_reserve_space(xdr,
+ 28 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ p = xdr_encode_hyper(p, err->offset);
+ p = xdr_encode_hyper(p, err->length);
+ p = xdr_encode_opaque_fixed(p, &err->stateid,
+ NFS4_STATEID_SIZE);
+ /* Encode 1 error */
+ *p++ = cpu_to_be32(1);
+ p = xdr_encode_opaque_fixed(p, &err->deviceid,
+ NFS4_DEVICEID4_SIZE);
+ *p++ = cpu_to_be32(err->status);
+ *p++ = cpu_to_be32(err->opnum);
+ dprintk("%s: offset %llu length %llu status %d op %d\n",
+ __func__, err->offset, err->length, err->status,
+ err->opnum);
+ }
+
+ return 0;
+}
+
+static
+unsigned int do_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range,
+ struct list_head *head,
+ unsigned int maxnum)
+{
+ struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo);
+ struct inode *inode = lo->plh_inode;
+ struct nfs4_ff_layout_ds_err *err, *n;
+ unsigned int ret = 0;
+
+ spin_lock(&inode->i_lock);
+ list_for_each_entry_safe(err, n, &flo->error_list, list) {
+ if (!pnfs_is_range_intersecting(err->offset,
+ pnfs_end_offset(err->offset, err->length),
+ range->offset,
+ pnfs_end_offset(range->offset, range->length)))
+ continue;
+ if (!maxnum)
+ break;
+ list_move(&err->list, head);
+ maxnum--;
+ ret++;
+ }
+ spin_unlock(&inode->i_lock);
+ return ret;
+}
+
+unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range,
+ struct list_head *head,
+ unsigned int maxnum)
+{
+ unsigned int ret;
+
+ ret = do_layout_fetch_ds_ioerr(lo, range, head, maxnum);
+ /* If we're over the max, discard all remaining entries */
+ if (ret == maxnum) {
+ LIST_HEAD(discard);
+ do_layout_fetch_ds_ioerr(lo, range, &discard, -1);
+ ff_layout_free_ds_ioerr(&discard);
+ }
+ return ret;
+}
+
+static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_deviceid_node *devid;
+ u32 idx;
+
+ for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ if (mirror) {
+ if (!mirror->mirror_ds)
+ return true;
+ if (IS_ERR(mirror->mirror_ds))
+ continue;
+ devid = &mirror->mirror_ds->id_node;
+ if (!nfs4_test_deviceid_unavailable(devid))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_deviceid_node *devid;
+ u32 idx;
+
+ for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ if (!mirror || IS_ERR(mirror->mirror_ds))
+ return false;
+ if (!mirror->mirror_ds)
+ continue;
+ devid = &mirror->mirror_ds->id_node;
+ if (nfs4_test_deviceid_unavailable(devid))
+ return false;
+ }
+
+ return FF_LAYOUT_MIRROR_COUNT(lseg) != 0;
+}
+
+static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+ if (lseg->pls_range.iomode == IOMODE_READ)
+ return ff_read_layout_has_available_ds(lseg);
+ /* Note: RW layout needs all mirrors available */
+ return ff_rw_layout_has_available_ds(lseg);
+}
+
+bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg)
+{
+ return ff_layout_no_fallback_to_mds(lseg) ||
+ ff_layout_has_available_ds(lseg);
+}
+
+bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg)
+{
+ return lseg->pls_range.iomode == IOMODE_RW &&
+ ff_layout_no_read_on_rw(lseg);
+}
+
+module_param(dataserver_retrans, uint, 0644);
+MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
+ "retries a request before it attempts further "
+ " recovery action.");
+module_param(dataserver_timeo, uint, 0644);
+MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
+ "NFSv4.1 client waits for a response from a "
+ " data server before it retries an NFS request.");
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
new file mode 100644
index 0000000000..853e8d609b
--- /dev/null
+++ b/fs/nfs/fs_context.c
@@ -0,0 +1,1670 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/fs_context.c
+ *
+ * Copyright (C) 1992 Rick Sladkey
+ * Conversion to new mount api Copyright (C) David Howells
+ *
+ * NFS mount handling.
+ *
+ * Split from fs/nfs/super.c by David Howells <dhowells@redhat.com>
+ */
+
+#include <linux/compat.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+
+#include <net/handshake.h>
+
+#include "nfs.h"
+#include "internal.h"
+
+#include "nfstrace.h"
+
+#define NFSDBG_FACILITY NFSDBG_MOUNT
+
+#if IS_ENABLED(CONFIG_NFS_V3)
+#define NFS_DEFAULT_VERSION 3
+#else
+#define NFS_DEFAULT_VERSION 2
+#endif
+
+#define NFS_MAX_CONNECTIONS 16
+
+enum nfs_param {
+ Opt_ac,
+ Opt_acdirmax,
+ Opt_acdirmin,
+ Opt_acl,
+ Opt_acregmax,
+ Opt_acregmin,
+ Opt_actimeo,
+ Opt_addr,
+ Opt_bg,
+ Opt_bsize,
+ Opt_clientaddr,
+ Opt_cto,
+ Opt_fg,
+ Opt_fscache,
+ Opt_fscache_flag,
+ Opt_hard,
+ Opt_intr,
+ Opt_local_lock,
+ Opt_lock,
+ Opt_lookupcache,
+ Opt_migration,
+ Opt_minorversion,
+ Opt_mountaddr,
+ Opt_mounthost,
+ Opt_mountport,
+ Opt_mountproto,
+ Opt_mountvers,
+ Opt_namelen,
+ Opt_nconnect,
+ Opt_max_connect,
+ Opt_port,
+ Opt_posix,
+ Opt_proto,
+ Opt_rdirplus,
+ Opt_rdma,
+ Opt_resvport,
+ Opt_retrans,
+ Opt_retry,
+ Opt_rsize,
+ Opt_sec,
+ Opt_sharecache,
+ Opt_sloppy,
+ Opt_soft,
+ Opt_softerr,
+ Opt_softreval,
+ Opt_source,
+ Opt_tcp,
+ Opt_timeo,
+ Opt_trunkdiscovery,
+ Opt_udp,
+ Opt_v,
+ Opt_vers,
+ Opt_wsize,
+ Opt_write,
+ Opt_xprtsec,
+};
+
+enum {
+ Opt_local_lock_all,
+ Opt_local_lock_flock,
+ Opt_local_lock_none,
+ Opt_local_lock_posix,
+};
+
+static const struct constant_table nfs_param_enums_local_lock[] = {
+ { "all", Opt_local_lock_all },
+ { "flock", Opt_local_lock_flock },
+ { "posix", Opt_local_lock_posix },
+ { "none", Opt_local_lock_none },
+ {}
+};
+
+enum {
+ Opt_lookupcache_all,
+ Opt_lookupcache_none,
+ Opt_lookupcache_positive,
+};
+
+static const struct constant_table nfs_param_enums_lookupcache[] = {
+ { "all", Opt_lookupcache_all },
+ { "none", Opt_lookupcache_none },
+ { "pos", Opt_lookupcache_positive },
+ { "positive", Opt_lookupcache_positive },
+ {}
+};
+
+enum {
+ Opt_write_lazy,
+ Opt_write_eager,
+ Opt_write_wait,
+};
+
+static const struct constant_table nfs_param_enums_write[] = {
+ { "lazy", Opt_write_lazy },
+ { "eager", Opt_write_eager },
+ { "wait", Opt_write_wait },
+ {}
+};
+
+static const struct fs_parameter_spec nfs_fs_parameters[] = {
+ fsparam_flag_no("ac", Opt_ac),
+ fsparam_u32 ("acdirmax", Opt_acdirmax),
+ fsparam_u32 ("acdirmin", Opt_acdirmin),
+ fsparam_flag_no("acl", Opt_acl),
+ fsparam_u32 ("acregmax", Opt_acregmax),
+ fsparam_u32 ("acregmin", Opt_acregmin),
+ fsparam_u32 ("actimeo", Opt_actimeo),
+ fsparam_string("addr", Opt_addr),
+ fsparam_flag ("bg", Opt_bg),
+ fsparam_u32 ("bsize", Opt_bsize),
+ fsparam_string("clientaddr", Opt_clientaddr),
+ fsparam_flag_no("cto", Opt_cto),
+ fsparam_flag ("fg", Opt_fg),
+ fsparam_flag_no("fsc", Opt_fscache_flag),
+ fsparam_string("fsc", Opt_fscache),
+ fsparam_flag ("hard", Opt_hard),
+ __fsparam(NULL, "intr", Opt_intr,
+ fs_param_neg_with_no|fs_param_deprecated, NULL),
+ fsparam_enum ("local_lock", Opt_local_lock, nfs_param_enums_local_lock),
+ fsparam_flag_no("lock", Opt_lock),
+ fsparam_enum ("lookupcache", Opt_lookupcache, nfs_param_enums_lookupcache),
+ fsparam_flag_no("migration", Opt_migration),
+ fsparam_u32 ("minorversion", Opt_minorversion),
+ fsparam_string("mountaddr", Opt_mountaddr),
+ fsparam_string("mounthost", Opt_mounthost),
+ fsparam_u32 ("mountport", Opt_mountport),
+ fsparam_string("mountproto", Opt_mountproto),
+ fsparam_u32 ("mountvers", Opt_mountvers),
+ fsparam_u32 ("namlen", Opt_namelen),
+ fsparam_u32 ("nconnect", Opt_nconnect),
+ fsparam_u32 ("max_connect", Opt_max_connect),
+ fsparam_string("nfsvers", Opt_vers),
+ fsparam_u32 ("port", Opt_port),
+ fsparam_flag_no("posix", Opt_posix),
+ fsparam_string("proto", Opt_proto),
+ fsparam_flag_no("rdirplus", Opt_rdirplus),
+ fsparam_flag ("rdma", Opt_rdma),
+ fsparam_flag_no("resvport", Opt_resvport),
+ fsparam_u32 ("retrans", Opt_retrans),
+ fsparam_string("retry", Opt_retry),
+ fsparam_u32 ("rsize", Opt_rsize),
+ fsparam_string("sec", Opt_sec),
+ fsparam_flag_no("sharecache", Opt_sharecache),
+ fsparam_flag ("sloppy", Opt_sloppy),
+ fsparam_flag ("soft", Opt_soft),
+ fsparam_flag ("softerr", Opt_softerr),
+ fsparam_flag ("softreval", Opt_softreval),
+ fsparam_string("source", Opt_source),
+ fsparam_flag ("tcp", Opt_tcp),
+ fsparam_u32 ("timeo", Opt_timeo),
+ fsparam_flag_no("trunkdiscovery", Opt_trunkdiscovery),
+ fsparam_flag ("udp", Opt_udp),
+ fsparam_flag ("v2", Opt_v),
+ fsparam_flag ("v3", Opt_v),
+ fsparam_flag ("v4", Opt_v),
+ fsparam_flag ("v4.0", Opt_v),
+ fsparam_flag ("v4.1", Opt_v),
+ fsparam_flag ("v4.2", Opt_v),
+ fsparam_string("vers", Opt_vers),
+ fsparam_enum ("write", Opt_write, nfs_param_enums_write),
+ fsparam_u32 ("wsize", Opt_wsize),
+ fsparam_string("xprtsec", Opt_xprtsec),
+ {}
+};
+
+enum {
+ Opt_vers_2,
+ Opt_vers_3,
+ Opt_vers_4,
+ Opt_vers_4_0,
+ Opt_vers_4_1,
+ Opt_vers_4_2,
+};
+
+static const struct constant_table nfs_vers_tokens[] = {
+ { "2", Opt_vers_2 },
+ { "3", Opt_vers_3 },
+ { "4", Opt_vers_4 },
+ { "4.0", Opt_vers_4_0 },
+ { "4.1", Opt_vers_4_1 },
+ { "4.2", Opt_vers_4_2 },
+ {}
+};
+
+enum {
+ Opt_xprt_rdma,
+ Opt_xprt_rdma6,
+ Opt_xprt_tcp,
+ Opt_xprt_tcp6,
+ Opt_xprt_udp,
+ Opt_xprt_udp6,
+ nr__Opt_xprt
+};
+
+static const struct constant_table nfs_xprt_protocol_tokens[] = {
+ { "rdma", Opt_xprt_rdma },
+ { "rdma6", Opt_xprt_rdma6 },
+ { "tcp", Opt_xprt_tcp },
+ { "tcp6", Opt_xprt_tcp6 },
+ { "udp", Opt_xprt_udp },
+ { "udp6", Opt_xprt_udp6 },
+ {}
+};
+
+enum {
+ Opt_sec_krb5,
+ Opt_sec_krb5i,
+ Opt_sec_krb5p,
+ Opt_sec_lkey,
+ Opt_sec_lkeyi,
+ Opt_sec_lkeyp,
+ Opt_sec_none,
+ Opt_sec_spkm,
+ Opt_sec_spkmi,
+ Opt_sec_spkmp,
+ Opt_sec_sys,
+ nr__Opt_sec
+};
+
+static const struct constant_table nfs_secflavor_tokens[] = {
+ { "krb5", Opt_sec_krb5 },
+ { "krb5i", Opt_sec_krb5i },
+ { "krb5p", Opt_sec_krb5p },
+ { "lkey", Opt_sec_lkey },
+ { "lkeyi", Opt_sec_lkeyi },
+ { "lkeyp", Opt_sec_lkeyp },
+ { "none", Opt_sec_none },
+ { "null", Opt_sec_none },
+ { "spkm3", Opt_sec_spkm },
+ { "spkm3i", Opt_sec_spkmi },
+ { "spkm3p", Opt_sec_spkmp },
+ { "sys", Opt_sec_sys },
+ {}
+};
+
+enum {
+ Opt_xprtsec_none,
+ Opt_xprtsec_tls,
+ Opt_xprtsec_mtls,
+ nr__Opt_xprtsec
+};
+
+static const struct constant_table nfs_xprtsec_policies[] = {
+ { "none", Opt_xprtsec_none },
+ { "tls", Opt_xprtsec_tls },
+ { "mtls", Opt_xprtsec_mtls },
+ {}
+};
+
+/*
+ * Sanity-check a server address provided by the mount command.
+ *
+ * Address family must be initialized, and address must not be
+ * the ANY address for that family.
+ */
+static int nfs_verify_server_address(struct sockaddr_storage *addr)
+{
+ switch (addr->ss_family) {
+ case AF_INET: {
+ struct sockaddr_in *sa = (struct sockaddr_in *)addr;
+ return sa->sin_addr.s_addr != htonl(INADDR_ANY);
+ }
+ case AF_INET6: {
+ struct in6_addr *sa = &((struct sockaddr_in6 *)addr)->sin6_addr;
+ return !ipv6_addr_any(sa);
+ }
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_NFS_DISABLE_UDP_SUPPORT
+static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx)
+{
+ return true;
+}
+#else
+static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx)
+{
+ if (ctx->version == 4)
+ return true;
+ return false;
+}
+#endif
+
+/*
+ * Sanity check the NFS transport protocol.
+ */
+static int nfs_validate_transport_protocol(struct fs_context *fc,
+ struct nfs_fs_context *ctx)
+{
+ switch (ctx->nfs_server.protocol) {
+ case XPRT_TRANSPORT_UDP:
+ if (nfs_server_transport_udp_invalid(ctx))
+ goto out_invalid_transport_udp;
+ break;
+ case XPRT_TRANSPORT_TCP:
+ case XPRT_TRANSPORT_RDMA:
+ break;
+ default:
+ ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+ }
+
+ if (ctx->xprtsec.policy != RPC_XPRTSEC_NONE)
+ switch (ctx->nfs_server.protocol) {
+ case XPRT_TRANSPORT_TCP:
+ ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP_TLS;
+ break;
+ default:
+ goto out_invalid_xprtsec_policy;
+ }
+
+ return 0;
+out_invalid_transport_udp:
+ return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
+out_invalid_xprtsec_policy:
+ return nfs_invalf(fc, "NFS: Transport does not support xprtsec");
+}
+
+/*
+ * For text based NFSv2/v3 mounts, the mount protocol transport default
+ * settings should depend upon the specified NFS transport.
+ */
+static void nfs_set_mount_transport_protocol(struct nfs_fs_context *ctx)
+{
+ if (ctx->mount_server.protocol == XPRT_TRANSPORT_UDP ||
+ ctx->mount_server.protocol == XPRT_TRANSPORT_TCP)
+ return;
+ switch (ctx->nfs_server.protocol) {
+ case XPRT_TRANSPORT_UDP:
+ ctx->mount_server.protocol = XPRT_TRANSPORT_UDP;
+ break;
+ case XPRT_TRANSPORT_TCP:
+ case XPRT_TRANSPORT_RDMA:
+ ctx->mount_server.protocol = XPRT_TRANSPORT_TCP;
+ }
+}
+
+/*
+ * Add 'flavor' to 'auth_info' if not already present.
+ * Returns true if 'flavor' ends up in the list, false otherwise
+ */
+static int nfs_auth_info_add(struct fs_context *fc,
+ struct nfs_auth_info *auth_info,
+ rpc_authflavor_t flavor)
+{
+ unsigned int i;
+ unsigned int max_flavor_len = ARRAY_SIZE(auth_info->flavors);
+
+ /* make sure this flavor isn't already in the list */
+ for (i = 0; i < auth_info->flavor_len; i++) {
+ if (flavor == auth_info->flavors[i])
+ return 0;
+ }
+
+ if (auth_info->flavor_len + 1 >= max_flavor_len)
+ return nfs_invalf(fc, "NFS: too many sec= flavors");
+
+ auth_info->flavors[auth_info->flavor_len++] = flavor;
+ return 0;
+}
+
+/*
+ * Parse the value of the 'sec=' option.
+ */
+static int nfs_parse_security_flavors(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ rpc_authflavor_t pseudoflavor;
+ char *string = param->string, *p;
+ int ret;
+
+ trace_nfs_mount_assign(param->key, string);
+
+ while ((p = strsep(&string, ":")) != NULL) {
+ if (!*p)
+ continue;
+ switch (lookup_constant(nfs_secflavor_tokens, p, -1)) {
+ case Opt_sec_none:
+ pseudoflavor = RPC_AUTH_NULL;
+ break;
+ case Opt_sec_sys:
+ pseudoflavor = RPC_AUTH_UNIX;
+ break;
+ case Opt_sec_krb5:
+ pseudoflavor = RPC_AUTH_GSS_KRB5;
+ break;
+ case Opt_sec_krb5i:
+ pseudoflavor = RPC_AUTH_GSS_KRB5I;
+ break;
+ case Opt_sec_krb5p:
+ pseudoflavor = RPC_AUTH_GSS_KRB5P;
+ break;
+ case Opt_sec_lkey:
+ pseudoflavor = RPC_AUTH_GSS_LKEY;
+ break;
+ case Opt_sec_lkeyi:
+ pseudoflavor = RPC_AUTH_GSS_LKEYI;
+ break;
+ case Opt_sec_lkeyp:
+ pseudoflavor = RPC_AUTH_GSS_LKEYP;
+ break;
+ case Opt_sec_spkm:
+ pseudoflavor = RPC_AUTH_GSS_SPKM;
+ break;
+ case Opt_sec_spkmi:
+ pseudoflavor = RPC_AUTH_GSS_SPKMI;
+ break;
+ case Opt_sec_spkmp:
+ pseudoflavor = RPC_AUTH_GSS_SPKMP;
+ break;
+ default:
+ return nfs_invalf(fc, "NFS: sec=%s option not recognized", p);
+ }
+
+ ret = nfs_auth_info_add(fc, &ctx->auth_info, pseudoflavor);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int nfs_parse_xprtsec_policy(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+
+ trace_nfs_mount_assign(param->key, param->string);
+
+ switch (lookup_constant(nfs_xprtsec_policies, param->string, -1)) {
+ case Opt_xprtsec_none:
+ ctx->xprtsec.policy = RPC_XPRTSEC_NONE;
+ break;
+ case Opt_xprtsec_tls:
+ ctx->xprtsec.policy = RPC_XPRTSEC_TLS_ANON;
+ break;
+ case Opt_xprtsec_mtls:
+ ctx->xprtsec.policy = RPC_XPRTSEC_TLS_X509;
+ break;
+ default:
+ return nfs_invalf(fc, "NFS: Unrecognized transport security policy");
+ }
+ return 0;
+}
+
+static int nfs_parse_version_string(struct fs_context *fc,
+ const char *string)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+
+ ctx->flags &= ~NFS_MOUNT_VER3;
+ switch (lookup_constant(nfs_vers_tokens, string, -1)) {
+ case Opt_vers_2:
+ ctx->version = 2;
+ break;
+ case Opt_vers_3:
+ ctx->flags |= NFS_MOUNT_VER3;
+ ctx->version = 3;
+ break;
+ case Opt_vers_4:
+ /* Backward compatibility option. In future,
+ * the mount program should always supply
+ * a NFSv4 minor version number.
+ */
+ ctx->version = 4;
+ break;
+ case Opt_vers_4_0:
+ ctx->version = 4;
+ ctx->minorversion = 0;
+ break;
+ case Opt_vers_4_1:
+ ctx->version = 4;
+ ctx->minorversion = 1;
+ break;
+ case Opt_vers_4_2:
+ ctx->version = 4;
+ ctx->minorversion = 2;
+ break;
+ default:
+ return nfs_invalf(fc, "NFS: Unsupported NFS version");
+ }
+ return 0;
+}
+
+/*
+ * Parse a single mount parameter.
+ */
+static int nfs_fs_context_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct fs_parse_result result;
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ unsigned short protofamily, mountfamily;
+ unsigned int len;
+ int ret, opt;
+
+ trace_nfs_mount_option(param);
+
+ opt = fs_parse(fc, nfs_fs_parameters, param, &result);
+ if (opt < 0)
+ return (opt == -ENOPARAM && ctx->sloppy) ? 1 : opt;
+
+ if (fc->security)
+ ctx->has_sec_mnt_opts = 1;
+
+ switch (opt) {
+ case Opt_source:
+ if (fc->source)
+ return nfs_invalf(fc, "NFS: Multiple sources not supported");
+ fc->source = param->string;
+ param->string = NULL;
+ break;
+
+ /*
+ * boolean options: foo/nofoo
+ */
+ case Opt_soft:
+ ctx->flags |= NFS_MOUNT_SOFT;
+ ctx->flags &= ~NFS_MOUNT_SOFTERR;
+ break;
+ case Opt_softerr:
+ ctx->flags |= NFS_MOUNT_SOFTERR | NFS_MOUNT_SOFTREVAL;
+ ctx->flags &= ~NFS_MOUNT_SOFT;
+ break;
+ case Opt_hard:
+ ctx->flags &= ~(NFS_MOUNT_SOFT |
+ NFS_MOUNT_SOFTERR |
+ NFS_MOUNT_SOFTREVAL);
+ break;
+ case Opt_softreval:
+ if (result.negated)
+ ctx->flags &= ~NFS_MOUNT_SOFTREVAL;
+ else
+ ctx->flags |= NFS_MOUNT_SOFTREVAL;
+ break;
+ case Opt_posix:
+ if (result.negated)
+ ctx->flags &= ~NFS_MOUNT_POSIX;
+ else
+ ctx->flags |= NFS_MOUNT_POSIX;
+ break;
+ case Opt_cto:
+ if (result.negated)
+ ctx->flags |= NFS_MOUNT_NOCTO;
+ else
+ ctx->flags &= ~NFS_MOUNT_NOCTO;
+ break;
+ case Opt_trunkdiscovery:
+ if (result.negated)
+ ctx->flags &= ~NFS_MOUNT_TRUNK_DISCOVERY;
+ else
+ ctx->flags |= NFS_MOUNT_TRUNK_DISCOVERY;
+ break;
+ case Opt_ac:
+ if (result.negated)
+ ctx->flags |= NFS_MOUNT_NOAC;
+ else
+ ctx->flags &= ~NFS_MOUNT_NOAC;
+ break;
+ case Opt_lock:
+ if (result.negated) {
+ ctx->flags |= NFS_MOUNT_NONLM;
+ ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL);
+ } else {
+ ctx->flags &= ~NFS_MOUNT_NONLM;
+ ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL);
+ }
+ break;
+ case Opt_udp:
+ ctx->flags &= ~NFS_MOUNT_TCP;
+ ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP;
+ break;
+ case Opt_tcp:
+ case Opt_rdma:
+ ctx->flags |= NFS_MOUNT_TCP; /* for side protocols */
+ ret = xprt_find_transport_ident(param->key);
+ if (ret < 0)
+ goto out_bad_transport;
+ ctx->nfs_server.protocol = ret;
+ break;
+ case Opt_acl:
+ if (result.negated)
+ ctx->flags |= NFS_MOUNT_NOACL;
+ else
+ ctx->flags &= ~NFS_MOUNT_NOACL;
+ break;
+ case Opt_rdirplus:
+ if (result.negated)
+ ctx->flags |= NFS_MOUNT_NORDIRPLUS;
+ else
+ ctx->flags &= ~NFS_MOUNT_NORDIRPLUS;
+ break;
+ case Opt_sharecache:
+ if (result.negated)
+ ctx->flags |= NFS_MOUNT_UNSHARED;
+ else
+ ctx->flags &= ~NFS_MOUNT_UNSHARED;
+ break;
+ case Opt_resvport:
+ if (result.negated)
+ ctx->flags |= NFS_MOUNT_NORESVPORT;
+ else
+ ctx->flags &= ~NFS_MOUNT_NORESVPORT;
+ break;
+ case Opt_fscache_flag:
+ if (result.negated)
+ ctx->options &= ~NFS_OPTION_FSCACHE;
+ else
+ ctx->options |= NFS_OPTION_FSCACHE;
+ kfree(ctx->fscache_uniq);
+ ctx->fscache_uniq = NULL;
+ break;
+ case Opt_fscache:
+ ctx->options |= NFS_OPTION_FSCACHE;
+ kfree(ctx->fscache_uniq);
+ ctx->fscache_uniq = param->string;
+ param->string = NULL;
+ break;
+ case Opt_migration:
+ if (result.negated)
+ ctx->options &= ~NFS_OPTION_MIGRATION;
+ else
+ ctx->options |= NFS_OPTION_MIGRATION;
+ break;
+
+ /*
+ * options that take numeric values
+ */
+ case Opt_port:
+ if (result.uint_32 > USHRT_MAX)
+ goto out_of_bounds;
+ ctx->nfs_server.port = result.uint_32;
+ break;
+ case Opt_rsize:
+ ctx->rsize = result.uint_32;
+ break;
+ case Opt_wsize:
+ ctx->wsize = result.uint_32;
+ break;
+ case Opt_bsize:
+ ctx->bsize = result.uint_32;
+ break;
+ case Opt_timeo:
+ if (result.uint_32 < 1 || result.uint_32 > INT_MAX)
+ goto out_of_bounds;
+ ctx->timeo = result.uint_32;
+ break;
+ case Opt_retrans:
+ if (result.uint_32 > INT_MAX)
+ goto out_of_bounds;
+ ctx->retrans = result.uint_32;
+ break;
+ case Opt_acregmin:
+ ctx->acregmin = result.uint_32;
+ break;
+ case Opt_acregmax:
+ ctx->acregmax = result.uint_32;
+ break;
+ case Opt_acdirmin:
+ ctx->acdirmin = result.uint_32;
+ break;
+ case Opt_acdirmax:
+ ctx->acdirmax = result.uint_32;
+ break;
+ case Opt_actimeo:
+ ctx->acregmin = result.uint_32;
+ ctx->acregmax = result.uint_32;
+ ctx->acdirmin = result.uint_32;
+ ctx->acdirmax = result.uint_32;
+ break;
+ case Opt_namelen:
+ ctx->namlen = result.uint_32;
+ break;
+ case Opt_mountport:
+ if (result.uint_32 > USHRT_MAX)
+ goto out_of_bounds;
+ ctx->mount_server.port = result.uint_32;
+ break;
+ case Opt_mountvers:
+ if (result.uint_32 < NFS_MNT_VERSION ||
+ result.uint_32 > NFS_MNT3_VERSION)
+ goto out_of_bounds;
+ ctx->mount_server.version = result.uint_32;
+ break;
+ case Opt_minorversion:
+ if (result.uint_32 > NFS4_MAX_MINOR_VERSION)
+ goto out_of_bounds;
+ ctx->minorversion = result.uint_32;
+ break;
+
+ /*
+ * options that take text values
+ */
+ case Opt_v:
+ ret = nfs_parse_version_string(fc, param->key + 1);
+ if (ret < 0)
+ return ret;
+ break;
+ case Opt_vers:
+ if (!param->string)
+ goto out_invalid_value;
+ trace_nfs_mount_assign(param->key, param->string);
+ ret = nfs_parse_version_string(fc, param->string);
+ if (ret < 0)
+ return ret;
+ break;
+ case Opt_sec:
+ ret = nfs_parse_security_flavors(fc, param);
+ if (ret < 0)
+ return ret;
+ break;
+ case Opt_xprtsec:
+ ret = nfs_parse_xprtsec_policy(fc, param);
+ if (ret < 0)
+ return ret;
+ break;
+
+ case Opt_proto:
+ if (!param->string)
+ goto out_invalid_value;
+ trace_nfs_mount_assign(param->key, param->string);
+ protofamily = AF_INET;
+ switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) {
+ case Opt_xprt_udp6:
+ protofamily = AF_INET6;
+ fallthrough;
+ case Opt_xprt_udp:
+ ctx->flags &= ~NFS_MOUNT_TCP;
+ ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP;
+ break;
+ case Opt_xprt_tcp6:
+ protofamily = AF_INET6;
+ fallthrough;
+ case Opt_xprt_tcp:
+ ctx->flags |= NFS_MOUNT_TCP;
+ ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+ break;
+ case Opt_xprt_rdma6:
+ protofamily = AF_INET6;
+ fallthrough;
+ case Opt_xprt_rdma:
+ /* vector side protocols to TCP */
+ ctx->flags |= NFS_MOUNT_TCP;
+ ret = xprt_find_transport_ident(param->string);
+ if (ret < 0)
+ goto out_bad_transport;
+ ctx->nfs_server.protocol = ret;
+ break;
+ default:
+ goto out_bad_transport;
+ }
+
+ ctx->protofamily = protofamily;
+ break;
+
+ case Opt_mountproto:
+ if (!param->string)
+ goto out_invalid_value;
+ trace_nfs_mount_assign(param->key, param->string);
+ mountfamily = AF_INET;
+ switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) {
+ case Opt_xprt_udp6:
+ mountfamily = AF_INET6;
+ fallthrough;
+ case Opt_xprt_udp:
+ ctx->mount_server.protocol = XPRT_TRANSPORT_UDP;
+ break;
+ case Opt_xprt_tcp6:
+ mountfamily = AF_INET6;
+ fallthrough;
+ case Opt_xprt_tcp:
+ ctx->mount_server.protocol = XPRT_TRANSPORT_TCP;
+ break;
+ case Opt_xprt_rdma: /* not used for side protocols */
+ default:
+ goto out_bad_transport;
+ }
+ ctx->mountfamily = mountfamily;
+ break;
+
+ case Opt_addr:
+ trace_nfs_mount_assign(param->key, param->string);
+ len = rpc_pton(fc->net_ns, param->string, param->size,
+ &ctx->nfs_server.address,
+ sizeof(ctx->nfs_server._address));
+ if (len == 0)
+ goto out_invalid_address;
+ ctx->nfs_server.addrlen = len;
+ break;
+ case Opt_clientaddr:
+ trace_nfs_mount_assign(param->key, param->string);
+ kfree(ctx->client_address);
+ ctx->client_address = param->string;
+ param->string = NULL;
+ break;
+ case Opt_mounthost:
+ trace_nfs_mount_assign(param->key, param->string);
+ kfree(ctx->mount_server.hostname);
+ ctx->mount_server.hostname = param->string;
+ param->string = NULL;
+ break;
+ case Opt_mountaddr:
+ trace_nfs_mount_assign(param->key, param->string);
+ len = rpc_pton(fc->net_ns, param->string, param->size,
+ &ctx->mount_server.address,
+ sizeof(ctx->mount_server._address));
+ if (len == 0)
+ goto out_invalid_address;
+ ctx->mount_server.addrlen = len;
+ break;
+ case Opt_nconnect:
+ trace_nfs_mount_assign(param->key, param->string);
+ if (result.uint_32 < 1 || result.uint_32 > NFS_MAX_CONNECTIONS)
+ goto out_of_bounds;
+ ctx->nfs_server.nconnect = result.uint_32;
+ break;
+ case Opt_max_connect:
+ trace_nfs_mount_assign(param->key, param->string);
+ if (result.uint_32 < 1 || result.uint_32 > NFS_MAX_TRANSPORTS)
+ goto out_of_bounds;
+ ctx->nfs_server.max_connect = result.uint_32;
+ break;
+ case Opt_lookupcache:
+ trace_nfs_mount_assign(param->key, param->string);
+ switch (result.uint_32) {
+ case Opt_lookupcache_all:
+ ctx->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE);
+ break;
+ case Opt_lookupcache_positive:
+ ctx->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE;
+ ctx->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG;
+ break;
+ case Opt_lookupcache_none:
+ ctx->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE;
+ break;
+ default:
+ goto out_invalid_value;
+ }
+ break;
+ case Opt_local_lock:
+ trace_nfs_mount_assign(param->key, param->string);
+ switch (result.uint_32) {
+ case Opt_local_lock_all:
+ ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK |
+ NFS_MOUNT_LOCAL_FCNTL);
+ break;
+ case Opt_local_lock_flock:
+ ctx->flags |= NFS_MOUNT_LOCAL_FLOCK;
+ break;
+ case Opt_local_lock_posix:
+ ctx->flags |= NFS_MOUNT_LOCAL_FCNTL;
+ break;
+ case Opt_local_lock_none:
+ ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
+ NFS_MOUNT_LOCAL_FCNTL);
+ break;
+ default:
+ goto out_invalid_value;
+ }
+ break;
+ case Opt_write:
+ trace_nfs_mount_assign(param->key, param->string);
+ switch (result.uint_32) {
+ case Opt_write_lazy:
+ ctx->flags &=
+ ~(NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT);
+ break;
+ case Opt_write_eager:
+ ctx->flags |= NFS_MOUNT_WRITE_EAGER;
+ ctx->flags &= ~NFS_MOUNT_WRITE_WAIT;
+ break;
+ case Opt_write_wait:
+ ctx->flags |=
+ NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT;
+ break;
+ default:
+ goto out_invalid_value;
+ }
+ break;
+
+ /*
+ * Special options
+ */
+ case Opt_sloppy:
+ ctx->sloppy = true;
+ break;
+ }
+
+ return 0;
+
+out_invalid_value:
+ return nfs_invalf(fc, "NFS: Bad mount option value specified");
+out_invalid_address:
+ return nfs_invalf(fc, "NFS: Bad IP address specified");
+out_of_bounds:
+ return nfs_invalf(fc, "NFS: Value for '%s' out of range", param->key);
+out_bad_transport:
+ return nfs_invalf(fc, "NFS: Unrecognized transport protocol");
+}
+
+/*
+ * Split fc->source into "hostname:export_path".
+ *
+ * The leftmost colon demarks the split between the server's hostname
+ * and the export path. If the hostname starts with a left square
+ * bracket, then it may contain colons.
+ *
+ * Note: caller frees hostname and export path, even on error.
+ */
+static int nfs_parse_source(struct fs_context *fc,
+ size_t maxnamlen, size_t maxpathlen)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ const char *dev_name = fc->source;
+ size_t len;
+ const char *end;
+
+ if (unlikely(!dev_name || !*dev_name))
+ return -EINVAL;
+
+ /* Is the host name protected with square brakcets? */
+ if (*dev_name == '[') {
+ end = strchr(++dev_name, ']');
+ if (end == NULL || end[1] != ':')
+ goto out_bad_devname;
+
+ len = end - dev_name;
+ end++;
+ } else {
+ const char *comma;
+
+ end = strchr(dev_name, ':');
+ if (end == NULL)
+ goto out_bad_devname;
+ len = end - dev_name;
+
+ /* kill possible hostname list: not supported */
+ comma = memchr(dev_name, ',', len);
+ if (comma)
+ len = comma - dev_name;
+ }
+
+ if (len > maxnamlen)
+ goto out_hostname;
+
+ kfree(ctx->nfs_server.hostname);
+
+ /* N.B. caller will free nfs_server.hostname in all cases */
+ ctx->nfs_server.hostname = kmemdup_nul(dev_name, len, GFP_KERNEL);
+ if (!ctx->nfs_server.hostname)
+ goto out_nomem;
+ len = strlen(++end);
+ if (len > maxpathlen)
+ goto out_path;
+ ctx->nfs_server.export_path = kmemdup_nul(end, len, GFP_KERNEL);
+ if (!ctx->nfs_server.export_path)
+ goto out_nomem;
+
+ trace_nfs_mount_path(ctx->nfs_server.export_path);
+ return 0;
+
+out_bad_devname:
+ return nfs_invalf(fc, "NFS: device name not in host:path format");
+out_nomem:
+ nfs_errorf(fc, "NFS: not enough memory to parse device name");
+ return -ENOMEM;
+out_hostname:
+ nfs_errorf(fc, "NFS: server hostname too long");
+ return -ENAMETOOLONG;
+out_path:
+ nfs_errorf(fc, "NFS: export pathname too long");
+ return -ENAMETOOLONG;
+}
+
+static inline bool is_remount_fc(struct fs_context *fc)
+{
+ return fc->root != NULL;
+}
+
+/*
+ * Parse monolithic NFS2/NFS3 mount data
+ * - fills in the mount root filehandle
+ *
+ * For option strings, user space handles the following behaviors:
+ *
+ * + DNS: mapping server host name to IP address ("addr=" option)
+ *
+ * + failure mode: how to behave if a mount request can't be handled
+ * immediately ("fg/bg" option)
+ *
+ * + retry: how often to retry a mount request ("retry=" option)
+ *
+ * + breaking back: trying proto=udp after proto=tcp, v2 after v3,
+ * mountproto=tcp after mountproto=udp, and so on
+ */
+static int nfs23_parse_monolithic(struct fs_context *fc,
+ struct nfs_mount_data *data)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct nfs_fh *mntfh = ctx->mntfh;
+ struct sockaddr_storage *sap = &ctx->nfs_server._address;
+ int extra_flags = NFS_MOUNT_LEGACY_INTERFACE;
+ int ret;
+
+ if (data == NULL)
+ goto out_no_data;
+
+ ctx->version = NFS_DEFAULT_VERSION;
+ switch (data->version) {
+ case 1:
+ data->namlen = 0;
+ fallthrough;
+ case 2:
+ data->bsize = 0;
+ fallthrough;
+ case 3:
+ if (data->flags & NFS_MOUNT_VER3)
+ goto out_no_v3;
+ data->root.size = NFS2_FHSIZE;
+ memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
+ /* Turn off security negotiation */
+ extra_flags |= NFS_MOUNT_SECFLAVOUR;
+ fallthrough;
+ case 4:
+ if (data->flags & NFS_MOUNT_SECFLAVOUR)
+ goto out_no_sec;
+ fallthrough;
+ case 5:
+ memset(data->context, 0, sizeof(data->context));
+ fallthrough;
+ case 6:
+ if (data->flags & NFS_MOUNT_VER3) {
+ if (data->root.size > NFS3_FHSIZE || data->root.size == 0)
+ goto out_invalid_fh;
+ mntfh->size = data->root.size;
+ ctx->version = 3;
+ } else {
+ mntfh->size = NFS2_FHSIZE;
+ ctx->version = 2;
+ }
+
+
+ memcpy(mntfh->data, data->root.data, mntfh->size);
+ if (mntfh->size < sizeof(mntfh->data))
+ memset(mntfh->data + mntfh->size, 0,
+ sizeof(mntfh->data) - mntfh->size);
+
+ /*
+ * for proto == XPRT_TRANSPORT_UDP, which is what uses
+ * to_exponential, implying shift: limit the shift value
+ * to BITS_PER_LONG (majortimeo is unsigned long)
+ */
+ if (!(data->flags & NFS_MOUNT_TCP)) /* this will be UDP */
+ if (data->retrans >= 64) /* shift value is too large */
+ goto out_invalid_data;
+
+ /*
+ * Translate to nfs_fs_context, which nfs_fill_super
+ * can deal with.
+ */
+ ctx->flags = data->flags & NFS_MOUNT_FLAGMASK;
+ ctx->flags |= extra_flags;
+ ctx->rsize = data->rsize;
+ ctx->wsize = data->wsize;
+ ctx->timeo = data->timeo;
+ ctx->retrans = data->retrans;
+ ctx->acregmin = data->acregmin;
+ ctx->acregmax = data->acregmax;
+ ctx->acdirmin = data->acdirmin;
+ ctx->acdirmax = data->acdirmax;
+ ctx->need_mount = false;
+
+ memcpy(sap, &data->addr, sizeof(data->addr));
+ ctx->nfs_server.addrlen = sizeof(data->addr);
+ ctx->nfs_server.port = ntohs(data->addr.sin_port);
+ if (sap->ss_family != AF_INET ||
+ !nfs_verify_server_address(sap))
+ goto out_no_address;
+
+ if (!(data->flags & NFS_MOUNT_TCP))
+ ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP;
+ /* N.B. caller will free nfs_server.hostname in all cases */
+ ctx->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
+ if (!ctx->nfs_server.hostname)
+ goto out_nomem;
+
+ ctx->namlen = data->namlen;
+ ctx->bsize = data->bsize;
+
+ if (data->flags & NFS_MOUNT_SECFLAVOUR)
+ ctx->selected_flavor = data->pseudoflavor;
+ else
+ ctx->selected_flavor = RPC_AUTH_UNIX;
+
+ if (!(data->flags & NFS_MOUNT_NONLM))
+ ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK|
+ NFS_MOUNT_LOCAL_FCNTL);
+ else
+ ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK|
+ NFS_MOUNT_LOCAL_FCNTL);
+
+ /*
+ * The legacy version 6 binary mount data from userspace has a
+ * field used only to transport selinux information into the
+ * kernel. To continue to support that functionality we
+ * have a touch of selinux knowledge here in the NFS code. The
+ * userspace code converted context=blah to just blah so we are
+ * converting back to the full string selinux understands.
+ */
+ if (data->context[0]){
+#ifdef CONFIG_SECURITY_SELINUX
+ int ret;
+
+ data->context[NFS_MAX_CONTEXT_LEN] = '\0';
+ ret = vfs_parse_fs_string(fc, "context",
+ data->context, strlen(data->context));
+ if (ret < 0)
+ return ret;
+#else
+ return -EINVAL;
+#endif
+ }
+
+ break;
+ default:
+ goto generic;
+ }
+
+ ret = nfs_validate_transport_protocol(fc, ctx);
+ if (ret)
+ return ret;
+
+ ctx->skip_reconfig_option_check = true;
+ return 0;
+
+generic:
+ return generic_parse_monolithic(fc, data);
+
+out_no_data:
+ if (is_remount_fc(fc)) {
+ ctx->skip_reconfig_option_check = true;
+ return 0;
+ }
+ return nfs_invalf(fc, "NFS: mount program didn't pass any mount data");
+
+out_no_v3:
+ return nfs_invalf(fc, "NFS: nfs_mount_data version does not support v3");
+
+out_no_sec:
+ return nfs_invalf(fc, "NFS: nfs_mount_data version supports only AUTH_SYS");
+
+out_nomem:
+ return -ENOMEM;
+
+out_no_address:
+ return nfs_invalf(fc, "NFS: mount program didn't pass remote address");
+
+out_invalid_fh:
+ return nfs_invalf(fc, "NFS: invalid root filehandle");
+
+out_invalid_data:
+ return nfs_invalf(fc, "NFS: invalid binary mount data");
+}
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+struct compat_nfs_string {
+ compat_uint_t len;
+ compat_uptr_t data;
+};
+
+static inline void compat_nfs_string(struct nfs_string *dst,
+ struct compat_nfs_string *src)
+{
+ dst->data = compat_ptr(src->data);
+ dst->len = src->len;
+}
+
+struct compat_nfs4_mount_data_v1 {
+ compat_int_t version;
+ compat_int_t flags;
+ compat_int_t rsize;
+ compat_int_t wsize;
+ compat_int_t timeo;
+ compat_int_t retrans;
+ compat_int_t acregmin;
+ compat_int_t acregmax;
+ compat_int_t acdirmin;
+ compat_int_t acdirmax;
+ struct compat_nfs_string client_addr;
+ struct compat_nfs_string mnt_path;
+ struct compat_nfs_string hostname;
+ compat_uint_t host_addrlen;
+ compat_uptr_t host_addr;
+ compat_int_t proto;
+ compat_int_t auth_flavourlen;
+ compat_uptr_t auth_flavours;
+};
+
+static void nfs4_compat_mount_data_conv(struct nfs4_mount_data *data)
+{
+ struct compat_nfs4_mount_data_v1 *compat =
+ (struct compat_nfs4_mount_data_v1 *)data;
+
+ /* copy the fields backwards */
+ data->auth_flavours = compat_ptr(compat->auth_flavours);
+ data->auth_flavourlen = compat->auth_flavourlen;
+ data->proto = compat->proto;
+ data->host_addr = compat_ptr(compat->host_addr);
+ data->host_addrlen = compat->host_addrlen;
+ compat_nfs_string(&data->hostname, &compat->hostname);
+ compat_nfs_string(&data->mnt_path, &compat->mnt_path);
+ compat_nfs_string(&data->client_addr, &compat->client_addr);
+ data->acdirmax = compat->acdirmax;
+ data->acdirmin = compat->acdirmin;
+ data->acregmax = compat->acregmax;
+ data->acregmin = compat->acregmin;
+ data->retrans = compat->retrans;
+ data->timeo = compat->timeo;
+ data->wsize = compat->wsize;
+ data->rsize = compat->rsize;
+ data->flags = compat->flags;
+ data->version = compat->version;
+}
+
+/*
+ * Validate NFSv4 mount options
+ */
+static int nfs4_parse_monolithic(struct fs_context *fc,
+ struct nfs4_mount_data *data)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct sockaddr_storage *sap = &ctx->nfs_server._address;
+ int ret;
+ char *c;
+
+ if (!data) {
+ if (is_remount_fc(fc))
+ goto done;
+ return nfs_invalf(fc,
+ "NFS4: mount program didn't pass any mount data");
+ }
+
+ ctx->version = 4;
+
+ if (data->version != 1)
+ return generic_parse_monolithic(fc, data);
+
+ if (in_compat_syscall())
+ nfs4_compat_mount_data_conv(data);
+
+ if (data->host_addrlen > sizeof(ctx->nfs_server.address))
+ goto out_no_address;
+ if (data->host_addrlen == 0)
+ goto out_no_address;
+ ctx->nfs_server.addrlen = data->host_addrlen;
+ if (copy_from_user(sap, data->host_addr, data->host_addrlen))
+ return -EFAULT;
+ if (!nfs_verify_server_address(sap))
+ goto out_no_address;
+ ctx->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);
+
+ if (data->auth_flavourlen) {
+ rpc_authflavor_t pseudoflavor;
+
+ if (data->auth_flavourlen > 1)
+ goto out_inval_auth;
+ if (copy_from_user(&pseudoflavor, data->auth_flavours,
+ sizeof(pseudoflavor)))
+ return -EFAULT;
+ ctx->selected_flavor = pseudoflavor;
+ } else {
+ ctx->selected_flavor = RPC_AUTH_UNIX;
+ }
+
+ c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+ ctx->nfs_server.hostname = c;
+
+ c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+ ctx->nfs_server.export_path = c;
+ trace_nfs_mount_path(c);
+
+ c = strndup_user(data->client_addr.data, 16);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+ ctx->client_address = c;
+
+ /*
+ * Translate to nfs_fs_context, which nfs_fill_super
+ * can deal with.
+ */
+
+ ctx->flags = data->flags & NFS4_MOUNT_FLAGMASK;
+ ctx->rsize = data->rsize;
+ ctx->wsize = data->wsize;
+ ctx->timeo = data->timeo;
+ ctx->retrans = data->retrans;
+ ctx->acregmin = data->acregmin;
+ ctx->acregmax = data->acregmax;
+ ctx->acdirmin = data->acdirmin;
+ ctx->acdirmax = data->acdirmax;
+ ctx->nfs_server.protocol = data->proto;
+ ret = nfs_validate_transport_protocol(fc, ctx);
+ if (ret)
+ return ret;
+done:
+ ctx->skip_reconfig_option_check = true;
+ return 0;
+
+out_inval_auth:
+ return nfs_invalf(fc, "NFS4: Invalid number of RPC auth flavours %d",
+ data->auth_flavourlen);
+
+out_no_address:
+ return nfs_invalf(fc, "NFS4: mount program didn't pass remote address");
+}
+#endif
+
+/*
+ * Parse a monolithic block of data from sys_mount().
+ */
+static int nfs_fs_context_parse_monolithic(struct fs_context *fc,
+ void *data)
+{
+ if (fc->fs_type == &nfs_fs_type)
+ return nfs23_parse_monolithic(fc, data);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+ if (fc->fs_type == &nfs4_fs_type)
+ return nfs4_parse_monolithic(fc, data);
+#endif
+
+ return nfs_invalf(fc, "NFS: Unsupported monolithic data version");
+}
+
+/*
+ * Validate the preparsed information in the config.
+ */
+static int nfs_fs_context_validate(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct nfs_subversion *nfs_mod;
+ struct sockaddr_storage *sap = &ctx->nfs_server._address;
+ int max_namelen = PAGE_SIZE;
+ int max_pathlen = NFS_MAXPATHLEN;
+ int port = 0;
+ int ret;
+
+ if (!fc->source)
+ goto out_no_device_name;
+
+ /* Check for sanity first. */
+ if (ctx->minorversion && ctx->version != 4)
+ goto out_minorversion_mismatch;
+
+ if (ctx->options & NFS_OPTION_MIGRATION &&
+ (ctx->version != 4 || ctx->minorversion != 0))
+ goto out_migration_misuse;
+
+ /* Verify that any proto=/mountproto= options match the address
+ * families in the addr=/mountaddr= options.
+ */
+ if (ctx->protofamily != AF_UNSPEC &&
+ ctx->protofamily != ctx->nfs_server.address.sa_family)
+ goto out_proto_mismatch;
+
+ if (ctx->mountfamily != AF_UNSPEC) {
+ if (ctx->mount_server.addrlen) {
+ if (ctx->mountfamily != ctx->mount_server.address.sa_family)
+ goto out_mountproto_mismatch;
+ } else {
+ if (ctx->mountfamily != ctx->nfs_server.address.sa_family)
+ goto out_mountproto_mismatch;
+ }
+ }
+
+ if (!nfs_verify_server_address(sap))
+ goto out_no_address;
+
+ ret = nfs_validate_transport_protocol(fc, ctx);
+ if (ret)
+ return ret;
+
+ if (ctx->version == 4) {
+ if (IS_ENABLED(CONFIG_NFS_V4)) {
+ if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA)
+ port = NFS_RDMA_PORT;
+ else
+ port = NFS_PORT;
+ max_namelen = NFS4_MAXNAMLEN;
+ max_pathlen = NFS4_MAXPATHLEN;
+ ctx->flags &= ~(NFS_MOUNT_NONLM | NFS_MOUNT_NOACL |
+ NFS_MOUNT_VER3 | NFS_MOUNT_LOCAL_FLOCK |
+ NFS_MOUNT_LOCAL_FCNTL);
+ } else {
+ goto out_v4_not_compiled;
+ }
+ } else {
+ nfs_set_mount_transport_protocol(ctx);
+ if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA)
+ port = NFS_RDMA_PORT;
+ }
+
+ nfs_set_port(sap, &ctx->nfs_server.port, port);
+
+ ret = nfs_parse_source(fc, max_namelen, max_pathlen);
+ if (ret < 0)
+ return ret;
+
+ /* Load the NFS protocol module if we haven't done so yet */
+ if (!ctx->nfs_mod) {
+ nfs_mod = get_nfs_version(ctx->version);
+ if (IS_ERR(nfs_mod)) {
+ ret = PTR_ERR(nfs_mod);
+ goto out_version_unavailable;
+ }
+ ctx->nfs_mod = nfs_mod;
+ }
+
+ /* Ensure the filesystem context has the correct fs_type */
+ if (fc->fs_type != ctx->nfs_mod->nfs_fs) {
+ module_put(fc->fs_type->owner);
+ __module_get(ctx->nfs_mod->nfs_fs->owner);
+ fc->fs_type = ctx->nfs_mod->nfs_fs;
+ }
+ return 0;
+
+out_no_device_name:
+ return nfs_invalf(fc, "NFS: Device name not specified");
+out_v4_not_compiled:
+ nfs_errorf(fc, "NFS: NFSv4 is not compiled into kernel");
+ return -EPROTONOSUPPORT;
+out_no_address:
+ return nfs_invalf(fc, "NFS: mount program didn't pass remote address");
+out_mountproto_mismatch:
+ return nfs_invalf(fc, "NFS: Mount server address does not match mountproto= option");
+out_proto_mismatch:
+ return nfs_invalf(fc, "NFS: Server address does not match proto= option");
+out_minorversion_mismatch:
+ return nfs_invalf(fc, "NFS: Mount option vers=%u does not support minorversion=%u",
+ ctx->version, ctx->minorversion);
+out_migration_misuse:
+ return nfs_invalf(fc, "NFS: 'Migration' not supported for this NFS version");
+out_version_unavailable:
+ nfs_errorf(fc, "NFS: Version unavailable");
+ return ret;
+}
+
+/*
+ * Create an NFS superblock by the appropriate method.
+ */
+static int nfs_get_tree(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ int err = nfs_fs_context_validate(fc);
+
+ if (err)
+ return err;
+ if (!ctx->internal)
+ return ctx->nfs_mod->rpc_ops->try_get_tree(fc);
+ else
+ return nfs_get_tree_common(fc);
+}
+
+/*
+ * Handle duplication of a configuration. The caller copied *src into *sc, but
+ * it can't deal with resource pointers in the filesystem context, so we have
+ * to do that. We need to clear pointers, copy data or get extra refs as
+ * appropriate.
+ */
+static int nfs_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
+{
+ struct nfs_fs_context *src = nfs_fc2context(src_fc), *ctx;
+
+ ctx = kmemdup(src, sizeof(struct nfs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->mntfh = nfs_alloc_fhandle();
+ if (!ctx->mntfh) {
+ kfree(ctx);
+ return -ENOMEM;
+ }
+ nfs_copy_fh(ctx->mntfh, src->mntfh);
+
+ __module_get(ctx->nfs_mod->owner);
+ ctx->client_address = NULL;
+ ctx->mount_server.hostname = NULL;
+ ctx->nfs_server.export_path = NULL;
+ ctx->nfs_server.hostname = NULL;
+ ctx->fscache_uniq = NULL;
+ ctx->clone_data.fattr = NULL;
+ fc->fs_private = ctx;
+ return 0;
+}
+
+static void nfs_fs_context_free(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+
+ if (ctx) {
+ if (ctx->server)
+ nfs_free_server(ctx->server);
+ if (ctx->nfs_mod)
+ put_nfs_version(ctx->nfs_mod);
+ kfree(ctx->client_address);
+ kfree(ctx->mount_server.hostname);
+ kfree(ctx->nfs_server.export_path);
+ kfree(ctx->nfs_server.hostname);
+ kfree(ctx->fscache_uniq);
+ nfs_free_fhandle(ctx->mntfh);
+ nfs_free_fattr(ctx->clone_data.fattr);
+ kfree(ctx);
+ }
+}
+
+static const struct fs_context_operations nfs_fs_context_ops = {
+ .free = nfs_fs_context_free,
+ .dup = nfs_fs_context_dup,
+ .parse_param = nfs_fs_context_parse_param,
+ .parse_monolithic = nfs_fs_context_parse_monolithic,
+ .get_tree = nfs_get_tree,
+ .reconfigure = nfs_reconfigure,
+};
+
+/*
+ * Prepare superblock configuration. We use the namespaces attached to the
+ * context. This may be the current process's namespaces, or it may be a
+ * container's namespaces.
+ */
+static int nfs_init_fs_context(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct nfs_fs_context), GFP_KERNEL);
+ if (unlikely(!ctx))
+ return -ENOMEM;
+
+ ctx->mntfh = nfs_alloc_fhandle();
+ if (unlikely(!ctx->mntfh)) {
+ kfree(ctx);
+ return -ENOMEM;
+ }
+
+ ctx->protofamily = AF_UNSPEC;
+ ctx->mountfamily = AF_UNSPEC;
+ ctx->mount_server.port = NFS_UNSPEC_PORT;
+
+ if (fc->root) {
+ /* reconfigure, start with the current config */
+ struct nfs_server *nfss = fc->root->d_sb->s_fs_info;
+ struct net *net = nfss->nfs_client->cl_net;
+
+ ctx->flags = nfss->flags;
+ ctx->rsize = nfss->rsize;
+ ctx->wsize = nfss->wsize;
+ ctx->retrans = nfss->client->cl_timeout->to_retries;
+ ctx->selected_flavor = nfss->client->cl_auth->au_flavor;
+ ctx->acregmin = nfss->acregmin / HZ;
+ ctx->acregmax = nfss->acregmax / HZ;
+ ctx->acdirmin = nfss->acdirmin / HZ;
+ ctx->acdirmax = nfss->acdirmax / HZ;
+ ctx->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
+ ctx->nfs_server.port = nfss->port;
+ ctx->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
+ ctx->version = nfss->nfs_client->rpc_ops->version;
+ ctx->minorversion = nfss->nfs_client->cl_minorversion;
+
+ memcpy(&ctx->nfs_server._address, &nfss->nfs_client->cl_addr,
+ ctx->nfs_server.addrlen);
+
+ if (fc->net_ns != net) {
+ put_net(fc->net_ns);
+ fc->net_ns = get_net(net);
+ }
+
+ ctx->nfs_mod = nfss->nfs_client->cl_nfs_mod;
+ __module_get(ctx->nfs_mod->owner);
+ } else {
+ /* defaults */
+ ctx->timeo = NFS_UNSPEC_TIMEO;
+ ctx->retrans = NFS_UNSPEC_RETRANS;
+ ctx->acregmin = NFS_DEF_ACREGMIN;
+ ctx->acregmax = NFS_DEF_ACREGMAX;
+ ctx->acdirmin = NFS_DEF_ACDIRMIN;
+ ctx->acdirmax = NFS_DEF_ACDIRMAX;
+ ctx->nfs_server.port = NFS_UNSPEC_PORT;
+ ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+ ctx->selected_flavor = RPC_AUTH_MAXFLAVOR;
+ ctx->minorversion = 0;
+ ctx->need_mount = true;
+ ctx->xprtsec.policy = RPC_XPRTSEC_NONE;
+ ctx->xprtsec.cert_serial = TLS_NO_CERT;
+ ctx->xprtsec.privkey_serial = TLS_NO_PRIVKEY;
+
+ fc->s_iflags |= SB_I_STABLE_WRITES;
+ }
+ fc->fs_private = ctx;
+ fc->ops = &nfs_fs_context_ops;
+ return 0;
+}
+
+struct file_system_type nfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "nfs",
+ .init_fs_context = nfs_init_fs_context,
+ .parameters = nfs_fs_parameters,
+ .kill_sb = nfs_kill_super,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
+};
+MODULE_ALIAS_FS("nfs");
+EXPORT_SYMBOL_GPL(nfs_fs_type);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+struct file_system_type nfs4_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "nfs4",
+ .init_fs_context = nfs_init_fs_context,
+ .parameters = nfs_fs_parameters,
+ .kill_sb = nfs_kill_super,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
+};
+MODULE_ALIAS_FS("nfs4");
+MODULE_ALIAS("nfs4");
+EXPORT_SYMBOL_GPL(nfs4_fs_type);
+#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
new file mode 100644
index 0000000000..b05717fe0d
--- /dev/null
+++ b/fs/nfs/fscache.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* NFS filesystem cache interface
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/in6.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/iversion.h>
+#include <linux/xarray.h>
+#include <linux/fscache.h>
+#include <linux/netfs.h>
+
+#include "internal.h"
+#include "iostat.h"
+#include "fscache.h"
+#include "nfstrace.h"
+
+#define NFS_MAX_KEY_LEN 1000
+
+static bool nfs_append_int(char *key, int *_len, unsigned long long x)
+{
+ if (*_len > NFS_MAX_KEY_LEN)
+ return false;
+ if (x == 0)
+ key[(*_len)++] = ',';
+ else
+ *_len += sprintf(key + *_len, ",%llx", x);
+ return true;
+}
+
+/*
+ * Get the per-client index cookie for an NFS client if the appropriate mount
+ * flag was set
+ * - We always try and get an index cookie for the client, but get filehandle
+ * cookies on a per-superblock basis, depending on the mount flags
+ */
+static bool nfs_fscache_get_client_key(struct nfs_client *clp,
+ char *key, int *_len)
+{
+ const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
+ const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
+
+ *_len += snprintf(key + *_len, NFS_MAX_KEY_LEN - *_len,
+ ",%u.%u,%x",
+ clp->rpc_ops->version,
+ clp->cl_minorversion,
+ clp->cl_addr.ss_family);
+
+ switch (clp->cl_addr.ss_family) {
+ case AF_INET:
+ if (!nfs_append_int(key, _len, sin->sin_port) ||
+ !nfs_append_int(key, _len, sin->sin_addr.s_addr))
+ return false;
+ return true;
+
+ case AF_INET6:
+ if (!nfs_append_int(key, _len, sin6->sin6_port) ||
+ !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[0]) ||
+ !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[1]) ||
+ !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[2]) ||
+ !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[3]))
+ return false;
+ return true;
+
+ default:
+ printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
+ clp->cl_addr.ss_family);
+ return false;
+ }
+}
+
+/*
+ * Get the cache cookie for an NFS superblock.
+ *
+ * The default uniquifier is just an empty string, but it may be overridden
+ * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent
+ * superblock across an automount point of some nature.
+ */
+int nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int ulen)
+{
+ struct fscache_volume *vcookie;
+ struct nfs_server *nfss = NFS_SB(sb);
+ unsigned int len = 3;
+ char *key;
+
+ if (uniq) {
+ nfss->fscache_uniq = kmemdup_nul(uniq, ulen, GFP_KERNEL);
+ if (!nfss->fscache_uniq)
+ return -ENOMEM;
+ }
+
+ key = kmalloc(NFS_MAX_KEY_LEN + 24, GFP_KERNEL);
+ if (!key)
+ return -ENOMEM;
+
+ memcpy(key, "nfs", 3);
+ if (!nfs_fscache_get_client_key(nfss->nfs_client, key, &len) ||
+ !nfs_append_int(key, &len, nfss->fsid.major) ||
+ !nfs_append_int(key, &len, nfss->fsid.minor) ||
+ !nfs_append_int(key, &len, sb->s_flags & NFS_SB_MASK) ||
+ !nfs_append_int(key, &len, nfss->flags) ||
+ !nfs_append_int(key, &len, nfss->rsize) ||
+ !nfs_append_int(key, &len, nfss->wsize) ||
+ !nfs_append_int(key, &len, nfss->acregmin) ||
+ !nfs_append_int(key, &len, nfss->acregmax) ||
+ !nfs_append_int(key, &len, nfss->acdirmin) ||
+ !nfs_append_int(key, &len, nfss->acdirmax) ||
+ !nfs_append_int(key, &len, nfss->client->cl_auth->au_flavor))
+ goto out;
+
+ if (ulen > 0) {
+ if (ulen > NFS_MAX_KEY_LEN - len)
+ goto out;
+ key[len++] = ',';
+ memcpy(key + len, uniq, ulen);
+ len += ulen;
+ }
+ key[len] = 0;
+
+ /* create a cache index for looking up filehandles */
+ vcookie = fscache_acquire_volume(key,
+ NULL, /* preferred_cache */
+ NULL, 0 /* coherency_data */);
+ if (IS_ERR(vcookie)) {
+ if (vcookie != ERR_PTR(-EBUSY)) {
+ kfree(key);
+ return PTR_ERR(vcookie);
+ }
+ pr_err("NFS: Cache volume key already in use (%s)\n", key);
+ vcookie = NULL;
+ }
+ nfss->fscache = vcookie;
+
+out:
+ kfree(key);
+ return 0;
+}
+
+/*
+ * release a per-superblock cookie
+ */
+void nfs_fscache_release_super_cookie(struct super_block *sb)
+{
+ struct nfs_server *nfss = NFS_SB(sb);
+
+ fscache_relinquish_volume(nfss->fscache, NULL, false);
+ nfss->fscache = NULL;
+ kfree(nfss->fscache_uniq);
+}
+
+/*
+ * Initialise the per-inode cache cookie pointer for an NFS inode.
+ */
+void nfs_fscache_init_inode(struct inode *inode)
+{
+ struct nfs_fscache_inode_auxdata auxdata;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ netfs_inode(inode)->cache = NULL;
+ if (!(nfss->fscache && S_ISREG(inode->i_mode)))
+ return;
+
+ nfs_fscache_update_auxdata(&auxdata, inode);
+
+ netfs_inode(inode)->cache = fscache_acquire_cookie(
+ nfss->fscache,
+ 0,
+ nfsi->fh.data, /* index_key */
+ nfsi->fh.size,
+ &auxdata, /* aux_data */
+ sizeof(auxdata),
+ i_size_read(inode));
+
+ if (netfs_inode(inode)->cache)
+ mapping_set_release_always(inode->i_mapping);
+}
+
+/*
+ * Release a per-inode cookie.
+ */
+void nfs_fscache_clear_inode(struct inode *inode)
+{
+ fscache_relinquish_cookie(netfs_i_cookie(netfs_inode(inode)), false);
+ netfs_inode(inode)->cache = NULL;
+}
+
+/*
+ * Enable or disable caching for a file that is being opened as appropriate.
+ * The cookie is allocated when the inode is initialised, but is not enabled at
+ * that time. Enablement is deferred to file-open time to avoid stat() and
+ * access() thrashing the cache.
+ *
+ * For now, with NFS, only regular files that are open read-only will be able
+ * to use the cache.
+ *
+ * We enable the cache for an inode if we open it read-only and it isn't
+ * currently open for writing. We disable the cache if the inode is open
+ * write-only.
+ *
+ * The caller uses the file struct to pin i_writecount on the inode before
+ * calling us when a file is opened for writing, so we can make use of that.
+ *
+ * Note that this may be invoked multiple times in parallel by parallel
+ * nfs_open() functions.
+ */
+void nfs_fscache_open_file(struct inode *inode, struct file *filp)
+{
+ struct nfs_fscache_inode_auxdata auxdata;
+ struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
+ bool open_for_write = inode_is_open_for_write(inode);
+
+ if (!fscache_cookie_valid(cookie))
+ return;
+
+ fscache_use_cookie(cookie, open_for_write);
+ if (open_for_write) {
+ nfs_fscache_update_auxdata(&auxdata, inode);
+ fscache_invalidate(cookie, &auxdata, i_size_read(inode),
+ FSCACHE_INVAL_DIO_WRITE);
+ }
+}
+EXPORT_SYMBOL_GPL(nfs_fscache_open_file);
+
+void nfs_fscache_release_file(struct inode *inode, struct file *filp)
+{
+ struct nfs_fscache_inode_auxdata auxdata;
+ struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
+ loff_t i_size = i_size_read(inode);
+
+ nfs_fscache_update_auxdata(&auxdata, inode);
+ fscache_unuse_cookie(cookie, &auxdata, &i_size);
+}
+
+int nfs_netfs_read_folio(struct file *file, struct folio *folio)
+{
+ if (!netfs_inode(folio_inode(folio))->cache)
+ return -ENOBUFS;
+
+ return netfs_read_folio(file, folio);
+}
+
+int nfs_netfs_readahead(struct readahead_control *ractl)
+{
+ struct inode *inode = ractl->mapping->host;
+
+ if (!netfs_inode(inode)->cache)
+ return -ENOBUFS;
+
+ netfs_readahead(ractl);
+ return 0;
+}
+
+static atomic_t nfs_netfs_debug_id;
+static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *file)
+{
+ rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file));
+ rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id);
+
+ return 0;
+}
+
+static void nfs_netfs_free_request(struct netfs_io_request *rreq)
+{
+ put_nfs_open_context(rreq->netfs_priv);
+}
+
+static inline int nfs_netfs_begin_cache_operation(struct netfs_io_request *rreq)
+{
+ return fscache_begin_read_operation(&rreq->cache_resources,
+ netfs_i_cookie(netfs_inode(rreq->inode)));
+}
+
+static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq)
+{
+ struct nfs_netfs_io_data *netfs;
+
+ netfs = kzalloc(sizeof(*netfs), GFP_KERNEL_ACCOUNT);
+ if (!netfs)
+ return NULL;
+ netfs->sreq = sreq;
+ refcount_set(&netfs->refcount, 1);
+ return netfs;
+}
+
+static bool nfs_netfs_clamp_length(struct netfs_io_subrequest *sreq)
+{
+ size_t rsize = NFS_SB(sreq->rreq->inode->i_sb)->rsize;
+
+ sreq->len = min(sreq->len, rsize);
+ return true;
+}
+
+static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)
+{
+ struct nfs_netfs_io_data *netfs;
+ struct nfs_pageio_descriptor pgio;
+ struct inode *inode = sreq->rreq->inode;
+ struct nfs_open_context *ctx = sreq->rreq->netfs_priv;
+ struct page *page;
+ int err;
+ pgoff_t start = (sreq->start + sreq->transferred) >> PAGE_SHIFT;
+ pgoff_t last = ((sreq->start + sreq->len -
+ sreq->transferred - 1) >> PAGE_SHIFT);
+ XA_STATE(xas, &sreq->rreq->mapping->i_pages, start);
+
+ nfs_pageio_init_read(&pgio, inode, false,
+ &nfs_async_read_completion_ops);
+
+ netfs = nfs_netfs_alloc(sreq);
+ if (!netfs)
+ return netfs_subreq_terminated(sreq, -ENOMEM, false);
+
+ pgio.pg_netfs = netfs; /* used in completion */
+
+ xas_lock(&xas);
+ xas_for_each(&xas, page, last) {
+ /* nfs_read_add_folio() may schedule() due to pNFS layout and other RPCs */
+ xas_pause(&xas);
+ xas_unlock(&xas);
+ err = nfs_read_add_folio(&pgio, ctx, page_folio(page));
+ if (err < 0) {
+ netfs->error = err;
+ goto out;
+ }
+ xas_lock(&xas);
+ }
+ xas_unlock(&xas);
+out:
+ nfs_pageio_complete_read(&pgio);
+ nfs_netfs_put(netfs);
+}
+
+void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr)
+{
+ struct nfs_netfs_io_data *netfs = hdr->netfs;
+
+ if (!netfs)
+ return;
+
+ nfs_netfs_get(netfs);
+}
+
+int nfs_netfs_folio_unlock(struct folio *folio)
+{
+ struct inode *inode = folio_file_mapping(folio)->host;
+
+ /*
+ * If fscache is enabled, netfs will unlock pages.
+ */
+ if (netfs_inode(inode)->cache)
+ return 0;
+
+ return 1;
+}
+
+void nfs_netfs_read_completion(struct nfs_pgio_header *hdr)
+{
+ struct nfs_netfs_io_data *netfs = hdr->netfs;
+ struct netfs_io_subrequest *sreq;
+
+ if (!netfs)
+ return;
+
+ sreq = netfs->sreq;
+ if (test_bit(NFS_IOHDR_EOF, &hdr->flags))
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags);
+
+ if (hdr->error)
+ netfs->error = hdr->error;
+ else
+ atomic64_add(hdr->res.count, &netfs->transferred);
+
+ nfs_netfs_put(netfs);
+ hdr->netfs = NULL;
+}
+
+const struct netfs_request_ops nfs_netfs_ops = {
+ .init_request = nfs_netfs_init_request,
+ .free_request = nfs_netfs_free_request,
+ .begin_cache_operation = nfs_netfs_begin_cache_operation,
+ .issue_read = nfs_netfs_issue_read,
+ .clamp_length = nfs_netfs_clamp_length
+};
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
new file mode 100644
index 0000000000..2dc6445449
--- /dev/null
+++ b/fs/nfs/fscache.h
@@ -0,0 +1,202 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* NFS filesystem cache interface definitions
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#ifndef _NFS_FSCACHE_H
+#define _NFS_FSCACHE_H
+
+#include <linux/swap.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/fscache.h>
+#include <linux/iversion.h>
+
+#ifdef CONFIG_NFS_FSCACHE
+
+/*
+ * Definition of the auxiliary data attached to NFS inode storage objects
+ * within the cache.
+ *
+ * The contents of this struct are recorded in the on-disk local cache in the
+ * auxiliary data attached to the data storage object backing an inode. This
+ * permits coherency to be managed when a new inode binds to an already extant
+ * cache object.
+ */
+struct nfs_fscache_inode_auxdata {
+ s64 mtime_sec;
+ s64 mtime_nsec;
+ s64 ctime_sec;
+ s64 ctime_nsec;
+ u64 change_attr;
+};
+
+struct nfs_netfs_io_data {
+ /*
+ * NFS may split a netfs_io_subrequest into multiple RPCs, each
+ * with their own read completion. In netfs, we can only call
+ * netfs_subreq_terminated() once for each subrequest. Use the
+ * refcount here to double as a marker of the last RPC completion,
+ * and only call netfs via netfs_subreq_terminated() once.
+ */
+ refcount_t refcount;
+ struct netfs_io_subrequest *sreq;
+
+ /*
+ * Final disposition of the netfs_io_subrequest, sent in
+ * netfs_subreq_terminated()
+ */
+ atomic64_t transferred;
+ int error;
+};
+
+static inline void nfs_netfs_get(struct nfs_netfs_io_data *netfs)
+{
+ refcount_inc(&netfs->refcount);
+}
+
+static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
+{
+ ssize_t final_len;
+
+ /* Only the last RPC completion should call netfs_subreq_terminated() */
+ if (!refcount_dec_and_test(&netfs->refcount))
+ return;
+
+ /*
+ * The NFS pageio interface may read a complete page, even when netfs
+ * only asked for a partial page. Specifically, this may be seen when
+ * one thread is truncating a file while another one is reading the last
+ * page of the file.
+ * Correct the final length here to be no larger than the netfs subrequest
+ * length, and thus avoid netfs's "Subreq overread" warning message.
+ */
+ final_len = min_t(s64, netfs->sreq->len, atomic64_read(&netfs->transferred));
+ netfs_subreq_terminated(netfs->sreq, netfs->error ?: final_len, false);
+ kfree(netfs);
+}
+static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi)
+{
+ netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops);
+}
+extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr);
+extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr);
+extern int nfs_netfs_folio_unlock(struct folio *folio);
+
+/*
+ * fscache.c
+ */
+extern int nfs_fscache_get_super_cookie(struct super_block *, const char *, int);
+extern void nfs_fscache_release_super_cookie(struct super_block *);
+
+extern void nfs_fscache_init_inode(struct inode *);
+extern void nfs_fscache_clear_inode(struct inode *);
+extern void nfs_fscache_open_file(struct inode *, struct file *);
+extern void nfs_fscache_release_file(struct inode *, struct file *);
+extern int nfs_netfs_readahead(struct readahead_control *ractl);
+extern int nfs_netfs_read_folio(struct file *file, struct folio *folio);
+
+static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
+{
+ if (folio_test_fscache(folio)) {
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
+ return false;
+ folio_wait_fscache(folio);
+ }
+ fscache_note_page_release(netfs_i_cookie(netfs_inode(folio->mapping->host)));
+ return true;
+}
+
+static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata,
+ struct inode *inode)
+{
+ memset(auxdata, 0, sizeof(*auxdata));
+ auxdata->mtime_sec = inode->i_mtime.tv_sec;
+ auxdata->mtime_nsec = inode->i_mtime.tv_nsec;
+ auxdata->ctime_sec = inode_get_ctime(inode).tv_sec;
+ auxdata->ctime_nsec = inode_get_ctime(inode).tv_nsec;
+
+ if (NFS_SERVER(inode)->nfs_client->rpc_ops->version == 4)
+ auxdata->change_attr = inode_peek_iversion_raw(inode);
+}
+
+/*
+ * Invalidate the contents of fscache for this inode. This will not sleep.
+ */
+static inline void nfs_fscache_invalidate(struct inode *inode, int flags)
+{
+ struct nfs_fscache_inode_auxdata auxdata;
+ struct fscache_cookie *cookie = netfs_i_cookie(&NFS_I(inode)->netfs);
+
+ nfs_fscache_update_auxdata(&auxdata, inode);
+ fscache_invalidate(cookie, &auxdata, i_size_read(inode), flags);
+}
+
+/*
+ * indicate the client caching state as readable text
+ */
+static inline const char *nfs_server_fscache_state(struct nfs_server *server)
+{
+ if (server->fscache)
+ return "yes";
+ return "no ";
+}
+
+static inline void nfs_netfs_set_pgio_header(struct nfs_pgio_header *hdr,
+ struct nfs_pageio_descriptor *desc)
+{
+ hdr->netfs = desc->pg_netfs;
+}
+static inline void nfs_netfs_set_pageio_descriptor(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr)
+{
+ desc->pg_netfs = hdr->netfs;
+}
+static inline void nfs_netfs_reset_pageio_descriptor(struct nfs_pageio_descriptor *desc)
+{
+ desc->pg_netfs = NULL;
+}
+#else /* CONFIG_NFS_FSCACHE */
+static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
+static inline void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr) {}
+static inline void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) {}
+static inline int nfs_netfs_folio_unlock(struct folio *folio)
+{
+ return 1;
+}
+static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
+
+static inline void nfs_fscache_init_inode(struct inode *inode) {}
+static inline void nfs_fscache_clear_inode(struct inode *inode) {}
+static inline void nfs_fscache_open_file(struct inode *inode,
+ struct file *filp) {}
+static inline void nfs_fscache_release_file(struct inode *inode, struct file *file) {}
+static inline int nfs_netfs_readahead(struct readahead_control *ractl)
+{
+ return -ENOBUFS;
+}
+static inline int nfs_netfs_read_folio(struct file *file, struct folio *folio)
+{
+ return -ENOBUFS;
+}
+
+static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
+{
+ return true; /* may release folio */
+}
+static inline void nfs_fscache_invalidate(struct inode *inode, int flags) {}
+
+static inline const char *nfs_server_fscache_state(struct nfs_server *server)
+{
+ return "no ";
+}
+static inline void nfs_netfs_set_pgio_header(struct nfs_pgio_header *hdr,
+ struct nfs_pageio_descriptor *desc) {}
+static inline void nfs_netfs_set_pageio_descriptor(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr) {}
+static inline void nfs_netfs_reset_pageio_descriptor(struct nfs_pageio_descriptor *desc) {}
+#endif /* CONFIG_NFS_FSCACHE */
+#endif /* _NFS_FSCACHE_H */
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
new file mode 100644
index 0000000000..11ff2b2e06
--- /dev/null
+++ b/fs/nfs/getroot.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* getroot.c: get the root dentry for an NFS mount
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_CLIENT
+
+/*
+ * Set the superblock root dentry.
+ * Note that this function frees the inode in case of error.
+ */
+static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *inode)
+{
+ /* The mntroot acts as the dummy root dentry for this superblock */
+ if (sb->s_root == NULL) {
+ sb->s_root = d_make_root(inode);
+ if (sb->s_root == NULL)
+ return -ENOMEM;
+ ihold(inode);
+ /*
+ * Ensure that this dentry is invisible to d_find_alias().
+ * Otherwise, it may be spliced into the tree by
+ * d_splice_alias if a parent directory from the same
+ * filesystem gets mounted at a later time.
+ * This again causes shrink_dcache_for_umount_subtree() to
+ * Oops, since the test for IS_ROOT() will fail.
+ */
+ spin_lock(&d_inode(sb->s_root)->i_lock);
+ spin_lock(&sb->s_root->d_lock);
+ hlist_del_init(&sb->s_root->d_u.d_alias);
+ spin_unlock(&sb->s_root->d_lock);
+ spin_unlock(&d_inode(sb->s_root)->i_lock);
+ }
+ return 0;
+}
+
+/*
+ * get an NFS2/NFS3 root dentry from the root filehandle
+ */
+int nfs_get_root(struct super_block *s, struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct nfs_server *server = NFS_SB(s), *clone_server;
+ struct nfs_fsinfo fsinfo;
+ struct dentry *root;
+ struct inode *inode;
+ char *name;
+ int error = -ENOMEM;
+ unsigned long kflags = 0, kflags_out = 0;
+
+ name = kstrdup(fc->source, GFP_KERNEL);
+ if (!name)
+ goto out;
+
+ /* get the actual root for this mount */
+ fsinfo.fattr = nfs_alloc_fattr_with_label(server);
+ if (fsinfo.fattr == NULL)
+ goto out_name;
+
+ error = server->nfs_client->rpc_ops->getroot(server, ctx->mntfh, &fsinfo);
+ if (error < 0) {
+ dprintk("nfs_get_root: getattr error = %d\n", -error);
+ nfs_errorf(fc, "NFS: Couldn't getattr on root");
+ goto out_fattr;
+ }
+
+ inode = nfs_fhget(s, ctx->mntfh, fsinfo.fattr);
+ if (IS_ERR(inode)) {
+ dprintk("nfs_get_root: get root inode failed\n");
+ error = PTR_ERR(inode);
+ nfs_errorf(fc, "NFS: Couldn't get root inode");
+ goto out_fattr;
+ }
+
+ error = nfs_superblock_set_dummy_root(s, inode);
+ if (error != 0)
+ goto out_fattr;
+
+ /* root dentries normally start off anonymous and get spliced in later
+ * if the dentry tree reaches them; however if the dentry already
+ * exists, we'll pick it up at this point and use it as the root
+ */
+ root = d_obtain_root(inode);
+ if (IS_ERR(root)) {
+ dprintk("nfs_get_root: get root dentry failed\n");
+ error = PTR_ERR(root);
+ nfs_errorf(fc, "NFS: Couldn't get root dentry");
+ goto out_fattr;
+ }
+
+ security_d_instantiate(root, inode);
+ spin_lock(&root->d_lock);
+ if (IS_ROOT(root) && !root->d_fsdata &&
+ !(root->d_flags & DCACHE_NFSFS_RENAMED)) {
+ root->d_fsdata = name;
+ name = NULL;
+ }
+ spin_unlock(&root->d_lock);
+ fc->root = root;
+ if (server->caps & NFS_CAP_SECURITY_LABEL)
+ kflags |= SECURITY_LSM_NATIVE_LABELS;
+ if (ctx->clone_data.sb) {
+ if (d_inode(fc->root)->i_fop != &nfs_dir_operations) {
+ error = -ESTALE;
+ goto error_splat_root;
+ }
+ /* clone lsm security options from the parent to the new sb */
+ error = security_sb_clone_mnt_opts(ctx->clone_data.sb,
+ s, kflags, &kflags_out);
+ if (error)
+ goto error_splat_root;
+ clone_server = NFS_SB(ctx->clone_data.sb);
+ server->has_sec_mnt_opts = clone_server->has_sec_mnt_opts;
+ } else {
+ error = security_sb_set_mnt_opts(s, fc->security,
+ kflags, &kflags_out);
+ }
+ if (error)
+ goto error_splat_root;
+ if (server->caps & NFS_CAP_SECURITY_LABEL &&
+ !(kflags_out & SECURITY_LSM_NATIVE_LABELS))
+ server->caps &= ~NFS_CAP_SECURITY_LABEL;
+
+ nfs_setsecurity(inode, fsinfo.fattr);
+ error = 0;
+
+out_fattr:
+ nfs_free_fattr(fsinfo.fattr);
+out_name:
+ kfree(name);
+out:
+ return error;
+error_splat_root:
+ dput(fc->root);
+ fc->root = NULL;
+ goto out_fattr;
+}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
new file mode 100644
index 0000000000..e21c073158
--- /dev/null
+++ b/fs/nfs/inode.c
@@ -0,0 +1,2540 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/inode.c
+ *
+ * Copyright (C) 1992 Rick Sladkey
+ *
+ * nfs inode and superblock handling functions
+ *
+ * Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some
+ * experimental NFS changes. Modularisation taken straight from SYS5 fs.
+ *
+ * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
+ * J.S.Peatfield@damtp.cam.ac.uk
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched/signal.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/nfs_xdr.h>
+#include <linux/slab.h>
+#include <linux/compat.h>
+#include <linux/freezer.h>
+#include <linux/uaccess.h>
+#include <linux/iversion.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+#include "fscache.h"
+#include "pnfs.h"
+#include "nfs.h"
+#include "netns.h"
+#include "sysfs.h"
+
+#include "nfstrace.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1
+
+/* Default is to see 64-bit inode numbers */
+static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
+
+static int nfs_update_inode(struct inode *, struct nfs_fattr *);
+
+static struct kmem_cache * nfs_inode_cachep;
+
+static inline unsigned long
+nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
+{
+ return nfs_fileid_to_ino_t(fattr->fileid);
+}
+
+int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
+{
+ schedule();
+ if (signal_pending_state(mode, current))
+ return -ERESTARTSYS;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
+
+/**
+ * nfs_compat_user_ino64 - returns the user-visible inode number
+ * @fileid: 64-bit fileid
+ *
+ * This function returns a 32-bit inode number if the boot parameter
+ * nfs.enable_ino64 is zero.
+ */
+u64 nfs_compat_user_ino64(u64 fileid)
+{
+#ifdef CONFIG_COMPAT
+ compat_ulong_t ino;
+#else
+ unsigned long ino;
+#endif
+
+ if (enable_ino64)
+ return fileid;
+ ino = fileid;
+ if (sizeof(ino) < sizeof(fileid))
+ ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8;
+ return ino;
+}
+
+int nfs_drop_inode(struct inode *inode)
+{
+ return NFS_STALE(inode) || generic_drop_inode(inode);
+}
+EXPORT_SYMBOL_GPL(nfs_drop_inode);
+
+void nfs_clear_inode(struct inode *inode)
+{
+ /*
+ * The following should never happen...
+ */
+ WARN_ON_ONCE(nfs_have_writebacks(inode));
+ WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files));
+ nfs_zap_acl_cache(inode);
+ nfs_access_zap_cache(inode);
+ nfs_fscache_clear_inode(inode);
+}
+EXPORT_SYMBOL_GPL(nfs_clear_inode);
+
+void nfs_evict_inode(struct inode *inode)
+{
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+ nfs_clear_inode(inode);
+}
+
+int nfs_sync_inode(struct inode *inode)
+{
+ inode_dio_wait(inode);
+ return nfs_wb_all(inode);
+}
+EXPORT_SYMBOL_GPL(nfs_sync_inode);
+
+/**
+ * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
+ * @mapping: pointer to struct address_space
+ */
+int nfs_sync_mapping(struct address_space *mapping)
+{
+ int ret = 0;
+
+ if (mapping->nrpages != 0) {
+ unmap_mapping_range(mapping, 0, 0, 0);
+ ret = nfs_wb_all(mapping->host);
+ }
+ return ret;
+}
+
+static int nfs_attribute_timeout(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
+}
+
+static bool nfs_check_cache_flags_invalid(struct inode *inode,
+ unsigned long flags)
+{
+ unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
+
+ return (cache_validity & flags) != 0;
+}
+
+bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags)
+{
+ if (nfs_check_cache_flags_invalid(inode, flags))
+ return true;
+ return nfs_attribute_cache_expired(inode);
+}
+EXPORT_SYMBOL_GPL(nfs_check_cache_invalid);
+
+#ifdef CONFIG_NFS_V4_2
+static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi)
+{
+ return nfsi->xattr_cache != NULL;
+}
+#else
+static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi)
+{
+ return false;
+}
+#endif
+
+void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ bool have_delegation = NFS_PROTO(inode)->have_delegation(inode, FMODE_READ);
+
+ if (have_delegation) {
+ if (!(flags & NFS_INO_REVAL_FORCED))
+ flags &= ~(NFS_INO_INVALID_MODE |
+ NFS_INO_INVALID_OTHER |
+ NFS_INO_INVALID_XATTR);
+ flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE);
+ }
+
+ if (!nfs_has_xattr_cache(nfsi))
+ flags &= ~NFS_INO_INVALID_XATTR;
+ if (flags & NFS_INO_INVALID_DATA)
+ nfs_fscache_invalidate(inode, 0);
+ flags &= ~NFS_INO_REVAL_FORCED;
+
+ nfsi->cache_validity |= flags;
+
+ if (inode->i_mapping->nrpages == 0) {
+ nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+ nfs_ooo_clear(nfsi);
+ } else if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
+ nfs_ooo_clear(nfsi);
+ }
+ trace_nfs_set_cache_invalid(inode, 0);
+}
+EXPORT_SYMBOL_GPL(nfs_set_cache_invalid);
+
+/*
+ * Invalidate the local caches
+ */
+static void nfs_zap_caches_locked(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ int mode = inode->i_mode;
+
+ nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
+
+ nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
+ nfsi->attrtimeo_timestamp = jiffies;
+
+ if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR |
+ NFS_INO_INVALID_DATA |
+ NFS_INO_INVALID_ACCESS |
+ NFS_INO_INVALID_ACL |
+ NFS_INO_INVALID_XATTR);
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR |
+ NFS_INO_INVALID_ACCESS |
+ NFS_INO_INVALID_ACL |
+ NFS_INO_INVALID_XATTR);
+ nfs_zap_label_cache_locked(nfsi);
+}
+
+void nfs_zap_caches(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ nfs_zap_caches_locked(inode);
+ spin_unlock(&inode->i_lock);
+}
+
+void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
+{
+ if (mapping->nrpages != 0) {
+ spin_lock(&inode->i_lock);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
+ spin_unlock(&inode->i_lock);
+ }
+}
+
+void nfs_zap_acl_cache(struct inode *inode)
+{
+ void (*clear_acl_cache)(struct inode *);
+
+ clear_acl_cache = NFS_PROTO(inode)->clear_acl_cache;
+ if (clear_acl_cache != NULL)
+ clear_acl_cache(inode);
+ spin_lock(&inode->i_lock);
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL;
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_zap_acl_cache);
+
+void nfs_invalidate_atime(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_invalidate_atime);
+
+/*
+ * Invalidate, but do not unhash, the inode.
+ * NB: must be called with inode->i_lock held!
+ */
+static void nfs_set_inode_stale_locked(struct inode *inode)
+{
+ set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+ nfs_zap_caches_locked(inode);
+ trace_nfs_set_inode_stale(inode);
+}
+
+void nfs_set_inode_stale(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ nfs_set_inode_stale_locked(inode);
+ spin_unlock(&inode->i_lock);
+}
+
+struct nfs_find_desc {
+ struct nfs_fh *fh;
+ struct nfs_fattr *fattr;
+};
+
+/*
+ * In NFSv3 we can have 64bit inode numbers. In order to support
+ * this, and re-exported directories (also seen in NFSv2)
+ * we are forced to allow 2 different inodes to have the same
+ * i_ino.
+ */
+static int
+nfs_find_actor(struct inode *inode, void *opaque)
+{
+ struct nfs_find_desc *desc = opaque;
+ struct nfs_fh *fh = desc->fh;
+ struct nfs_fattr *fattr = desc->fattr;
+
+ if (NFS_FILEID(inode) != fattr->fileid)
+ return 0;
+ if (inode_wrong_type(inode, fattr->mode))
+ return 0;
+ if (nfs_compare_fh(NFS_FH(inode), fh))
+ return 0;
+ if (is_bad_inode(inode) || NFS_STALE(inode))
+ return 0;
+ return 1;
+}
+
+static int
+nfs_init_locked(struct inode *inode, void *opaque)
+{
+ struct nfs_find_desc *desc = opaque;
+ struct nfs_fattr *fattr = desc->fattr;
+
+ set_nfs_fileid(inode, fattr->fileid);
+ inode->i_mode = fattr->mode;
+ nfs_copy_fh(NFS_FH(inode), desc->fh);
+ return 0;
+}
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static void nfs_clear_label_invalid(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_LABEL;
+ spin_unlock(&inode->i_lock);
+}
+
+void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr)
+{
+ int error;
+
+ if (fattr->label == NULL)
+ return;
+
+ if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {
+ error = security_inode_notifysecctx(inode, fattr->label->label,
+ fattr->label->len);
+ if (error)
+ printk(KERN_ERR "%s() %s %d "
+ "security_inode_notifysecctx() %d\n",
+ __func__,
+ (char *)fattr->label->label,
+ fattr->label->len, error);
+ nfs_clear_label_invalid(inode);
+ }
+}
+
+struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
+{
+ struct nfs4_label *label;
+
+ if (!(server->caps & NFS_CAP_SECURITY_LABEL))
+ return NULL;
+
+ label = kzalloc(sizeof(struct nfs4_label), flags);
+ if (label == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ label->label = kzalloc(NFS4_MAXLABELLEN, flags);
+ if (label->label == NULL) {
+ kfree(label);
+ return ERR_PTR(-ENOMEM);
+ }
+ label->len = NFS4_MAXLABELLEN;
+
+ return label;
+}
+EXPORT_SYMBOL_GPL(nfs4_label_alloc);
+#else
+void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr)
+{
+}
+#endif
+EXPORT_SYMBOL_GPL(nfs_setsecurity);
+
+/* Search for inode identified by fh, fileid and i_mode in inode cache. */
+struct inode *
+nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh)
+{
+ struct nfs_find_desc desc = {
+ .fh = fh,
+ .fattr = fattr,
+ };
+ struct inode *inode;
+ unsigned long hash;
+
+ if (!(fattr->valid & NFS_ATTR_FATTR_FILEID) ||
+ !(fattr->valid & NFS_ATTR_FATTR_TYPE))
+ return NULL;
+
+ hash = nfs_fattr_to_ino_t(fattr);
+ inode = ilookup5(sb, hash, nfs_find_actor, &desc);
+
+ dprintk("%s: returning %p\n", __func__, inode);
+ return inode;
+}
+
+static void nfs_inode_init_regular(struct nfs_inode *nfsi)
+{
+ atomic_long_set(&nfsi->nrequests, 0);
+ atomic_long_set(&nfsi->redirtied_pages, 0);
+ INIT_LIST_HEAD(&nfsi->commit_info.list);
+ atomic_long_set(&nfsi->commit_info.ncommit, 0);
+ atomic_set(&nfsi->commit_info.rpcs_out, 0);
+ mutex_init(&nfsi->commit_mutex);
+}
+
+static void nfs_inode_init_dir(struct nfs_inode *nfsi)
+{
+ nfsi->cache_change_attribute = 0;
+ memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+ init_rwsem(&nfsi->rmdir_sem);
+}
+
+/*
+ * This is our front-end to iget that looks up inodes by file handle
+ * instead of inode number.
+ */
+struct inode *
+nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
+{
+ struct nfs_find_desc desc = {
+ .fh = fh,
+ .fattr = fattr
+ };
+ struct inode *inode = ERR_PTR(-ENOENT);
+ u64 fattr_supported = NFS_SB(sb)->fattr_valid;
+ unsigned long hash;
+
+ nfs_attr_check_mountpoint(sb, fattr);
+
+ if (nfs_attr_use_mounted_on_fileid(fattr))
+ fattr->fileid = fattr->mounted_on_fileid;
+ else if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
+ goto out_no_inode;
+ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
+ goto out_no_inode;
+
+ hash = nfs_fattr_to_ino_t(fattr);
+
+ inode = iget5_locked(sb, hash, nfs_find_actor, nfs_init_locked, &desc);
+ if (inode == NULL) {
+ inode = ERR_PTR(-ENOMEM);
+ goto out_no_inode;
+ }
+
+ if (inode->i_state & I_NEW) {
+ struct nfs_inode *nfsi = NFS_I(inode);
+ unsigned long now = jiffies;
+
+ /* We set i_ino for the few things that still rely on it,
+ * such as stat(2) */
+ inode->i_ino = hash;
+
+ /* We can't support update_atime(), since the server will reset it */
+ inode->i_flags |= S_NOATIME|S_NOCMTIME;
+ inode->i_mode = fattr->mode;
+ nfsi->cache_validity = 0;
+ if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
+ && (fattr_supported & NFS_ATTR_FATTR_MODE))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
+ /* Why so? Because we want revalidate for devices/FIFOs, and
+ * that's precisely what we have in nfs_file_inode_operations.
+ */
+ inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops;
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
+ inode->i_data.a_ops = &nfs_file_aops;
+ nfs_inode_init_regular(nfsi);
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
+ inode->i_fop = &nfs_dir_operations;
+ inode->i_data.a_ops = &nfs_dir_aops;
+ nfs_inode_init_dir(nfsi);
+ /* Deal with crossing mountpoints */
+ if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT ||
+ fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
+ if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
+ inode->i_op = &nfs_referral_inode_operations;
+ else
+ inode->i_op = &nfs_mountpoint_inode_operations;
+ inode->i_fop = NULL;
+ inode->i_flags |= S_AUTOMOUNT;
+ }
+ } else if (S_ISLNK(inode->i_mode)) {
+ inode->i_op = &nfs_symlink_inode_operations;
+ inode_nohighmem(inode);
+ } else
+ init_special_inode(inode, inode->i_mode, fattr->rdev);
+
+ memset(&inode->i_atime, 0, sizeof(inode->i_atime));
+ memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
+ inode_set_ctime(inode, 0, 0);
+ inode_set_iversion_raw(inode, 0);
+ inode->i_size = 0;
+ clear_nlink(inode);
+ inode->i_uid = make_kuid(&init_user_ns, -2);
+ inode->i_gid = make_kgid(&init_user_ns, -2);
+ inode->i_blocks = 0;
+ nfsi->write_io = 0;
+ nfsi->read_io = 0;
+
+ nfsi->read_cache_jiffies = fattr->time_start;
+ nfsi->attr_gencount = fattr->gencount;
+ if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+ inode->i_atime = fattr->atime;
+ else if (fattr_supported & NFS_ATTR_FATTR_ATIME)
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
+ if (fattr->valid & NFS_ATTR_FATTR_MTIME)
+ inode->i_mtime = fattr->mtime;
+ else if (fattr_supported & NFS_ATTR_FATTR_MTIME)
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
+ if (fattr->valid & NFS_ATTR_FATTR_CTIME)
+ inode_set_ctime_to_ts(inode, fattr->ctime);
+ else if (fattr_supported & NFS_ATTR_FATTR_CTIME)
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME);
+ if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
+ inode_set_iversion_raw(inode, fattr->change_attr);
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE);
+ if (fattr->valid & NFS_ATTR_FATTR_SIZE)
+ inode->i_size = nfs_size_to_loff_t(fattr->size);
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_SIZE);
+ if (fattr->valid & NFS_ATTR_FATTR_NLINK)
+ set_nlink(inode, fattr->nlink);
+ else if (fattr_supported & NFS_ATTR_FATTR_NLINK)
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK);
+ if (fattr->valid & NFS_ATTR_FATTR_OWNER)
+ inode->i_uid = fattr->uid;
+ else if (fattr_supported & NFS_ATTR_FATTR_OWNER)
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+ if (fattr->valid & NFS_ATTR_FATTR_GROUP)
+ inode->i_gid = fattr->gid;
+ else if (fattr_supported & NFS_ATTR_FATTR_GROUP)
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+ if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+ inode->i_blocks = fattr->du.nfs2.blocks;
+ else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED &&
+ fattr->size != 0)
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS);
+ if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
+ /*
+ * report the blocks in 512byte units
+ */
+ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
+ } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED &&
+ fattr->size != 0)
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS);
+
+ nfs_setsecurity(inode, fattr);
+
+ nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
+ nfsi->attrtimeo_timestamp = now;
+ nfsi->access_cache = RB_ROOT;
+
+ nfs_fscache_init_inode(inode);
+
+ unlock_new_inode(inode);
+ } else {
+ int err = nfs_refresh_inode(inode, fattr);
+ if (err < 0) {
+ iput(inode);
+ inode = ERR_PTR(err);
+ goto out_no_inode;
+ }
+ }
+ dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n",
+ inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(inode),
+ nfs_display_fhandle_hash(fh),
+ atomic_read(&inode->i_count));
+
+out:
+ return inode;
+
+out_no_inode:
+ dprintk("nfs_fhget: iget failed with error %ld\n", PTR_ERR(inode));
+ goto out;
+}
+EXPORT_SYMBOL_GPL(nfs_fhget);
+
+#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN)
+
+int
+nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ struct inode *inode = d_inode(dentry);
+ struct nfs_fattr *fattr;
+ int error = 0;
+
+ nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
+
+ /* skip mode change if it's just for clearing setuid/setgid */
+ if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
+ attr->ia_valid &= ~ATTR_MODE;
+
+ if (attr->ia_valid & ATTR_SIZE) {
+ BUG_ON(!S_ISREG(inode->i_mode));
+
+ error = inode_newsize_ok(inode, attr->ia_size);
+ if (error)
+ return error;
+
+ if (attr->ia_size == i_size_read(inode))
+ attr->ia_valid &= ~ATTR_SIZE;
+ }
+
+ /* Optimization: if the end result is no change, don't RPC */
+ if (((attr->ia_valid & NFS_VALID_ATTRS) & ~(ATTR_FILE|ATTR_OPEN)) == 0)
+ return 0;
+
+ trace_nfs_setattr_enter(inode);
+
+ /* Write all dirty data */
+ if (S_ISREG(inode->i_mode))
+ nfs_sync_inode(inode);
+
+ fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
+ if (fattr == NULL) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
+ if (error == 0)
+ error = nfs_refresh_inode(inode, fattr);
+ nfs_free_fattr(fattr);
+out:
+ trace_nfs_setattr_exit(inode, error);
+ return error;
+}
+EXPORT_SYMBOL_GPL(nfs_setattr);
+
+/**
+ * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
+ *
+ * This is a copy of the common vmtruncate, but with the locking
+ * corrected to take into account the fact that NFS requires
+ * inode->i_size to be updated under the inode->i_lock.
+ * Note: must be called with inode->i_lock held!
+ */
+static int nfs_vmtruncate(struct inode * inode, loff_t offset)
+{
+ int err;
+
+ err = inode_newsize_ok(inode, offset);
+ if (err)
+ goto out;
+
+ trace_nfs_size_truncate(inode, offset);
+ i_size_write(inode, offset);
+ /* Optimisation */
+ if (offset == 0) {
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
+ nfs_ooo_clear(NFS_I(inode));
+ }
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
+
+ spin_unlock(&inode->i_lock);
+ truncate_pagecache(inode, offset);
+ spin_lock(&inode->i_lock);
+out:
+ return err;
+}
+
+/**
+ * nfs_setattr_update_inode - Update inode metadata after a setattr call.
+ * @inode: pointer to struct inode
+ * @attr: pointer to struct iattr
+ * @fattr: pointer to struct nfs_fattr
+ *
+ * Note: we do this in the *proc.c in order to ensure that
+ * it works for things like exclusive creates too.
+ */
+void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
+ struct nfs_fattr *fattr)
+{
+ /* Barrier: bump the attribute generation count. */
+ nfs_fattr_set_barrier(fattr);
+
+ spin_lock(&inode->i_lock);
+ NFS_I(inode)->attr_gencount = fattr->gencount;
+ if ((attr->ia_valid & ATTR_SIZE) != 0) {
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME |
+ NFS_INO_INVALID_BLOCKS);
+ nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
+ nfs_vmtruncate(inode, attr->ia_size);
+ }
+ if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_CTIME;
+ if ((attr->ia_valid & ATTR_KILL_SUID) != 0 &&
+ inode->i_mode & S_ISUID)
+ inode->i_mode &= ~S_ISUID;
+ if (setattr_should_drop_sgid(&nop_mnt_idmap, inode))
+ inode->i_mode &= ~S_ISGID;
+ if ((attr->ia_valid & ATTR_MODE) != 0) {
+ int mode = attr->ia_mode & S_IALLUGO;
+ mode |= inode->i_mode & ~S_IALLUGO;
+ inode->i_mode = mode;
+ }
+ if ((attr->ia_valid & ATTR_UID) != 0)
+ inode->i_uid = attr->ia_uid;
+ if ((attr->ia_valid & ATTR_GID) != 0)
+ inode->i_gid = attr->ia_gid;
+ if (fattr->valid & NFS_ATTR_FATTR_CTIME)
+ inode_set_ctime_to_ts(inode, fattr->ctime);
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE
+ | NFS_INO_INVALID_CTIME);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL);
+ }
+ if (attr->ia_valid & (ATTR_ATIME_SET|ATTR_ATIME)) {
+ NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_ATIME
+ | NFS_INO_INVALID_CTIME);
+ if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+ inode->i_atime = fattr->atime;
+ else if (attr->ia_valid & ATTR_ATIME_SET)
+ inode->i_atime = attr->ia_atime;
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
+
+ if (fattr->valid & NFS_ATTR_FATTR_CTIME)
+ inode_set_ctime_to_ts(inode, fattr->ctime);
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE
+ | NFS_INO_INVALID_CTIME);
+ }
+ if (attr->ia_valid & (ATTR_MTIME_SET|ATTR_MTIME)) {
+ NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_MTIME
+ | NFS_INO_INVALID_CTIME);
+ if (fattr->valid & NFS_ATTR_FATTR_MTIME)
+ inode->i_mtime = fattr->mtime;
+ else if (attr->ia_valid & ATTR_MTIME_SET)
+ inode->i_mtime = attr->ia_mtime;
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
+
+ if (fattr->valid & NFS_ATTR_FATTR_CTIME)
+ inode_set_ctime_to_ts(inode, fattr->ctime);
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE
+ | NFS_INO_INVALID_CTIME);
+ }
+ if (fattr->valid)
+ nfs_update_inode(inode, fattr);
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
+
+/*
+ * Don't request help from readdirplus if the file is being written to,
+ * or if attribute caching is turned off
+ */
+static bool nfs_getattr_readdirplus_enable(const struct inode *inode)
+{
+ return nfs_server_capable(inode, NFS_CAP_READDIRPLUS) &&
+ !nfs_have_writebacks(inode) && NFS_MAXATTRTIMEO(inode) > 5 * HZ;
+}
+
+static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry)
+{
+ if (!IS_ROOT(dentry)) {
+ struct dentry *parent = dget_parent(dentry);
+ nfs_readdir_record_entry_cache_miss(d_inode(parent));
+ dput(parent);
+ }
+}
+
+static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry)
+{
+ if (!IS_ROOT(dentry)) {
+ struct dentry *parent = dget_parent(dentry);
+ nfs_readdir_record_entry_cache_hit(d_inode(parent));
+ dput(parent);
+ }
+}
+
+static u32 nfs_get_valid_attrmask(struct inode *inode)
+{
+ unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
+ u32 reply_mask = STATX_INO | STATX_TYPE;
+
+ if (!(cache_validity & NFS_INO_INVALID_ATIME))
+ reply_mask |= STATX_ATIME;
+ if (!(cache_validity & NFS_INO_INVALID_CTIME))
+ reply_mask |= STATX_CTIME;
+ if (!(cache_validity & NFS_INO_INVALID_MTIME))
+ reply_mask |= STATX_MTIME;
+ if (!(cache_validity & NFS_INO_INVALID_SIZE))
+ reply_mask |= STATX_SIZE;
+ if (!(cache_validity & NFS_INO_INVALID_NLINK))
+ reply_mask |= STATX_NLINK;
+ if (!(cache_validity & NFS_INO_INVALID_MODE))
+ reply_mask |= STATX_MODE;
+ if (!(cache_validity & NFS_INO_INVALID_OTHER))
+ reply_mask |= STATX_UID | STATX_GID;
+ if (!(cache_validity & NFS_INO_INVALID_BLOCKS))
+ reply_mask |= STATX_BLOCKS;
+ if (!(cache_validity & NFS_INO_INVALID_CHANGE))
+ reply_mask |= STATX_CHANGE_COOKIE;
+ return reply_mask;
+}
+
+int nfs_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct nfs_server *server = NFS_SERVER(inode);
+ unsigned long cache_validity;
+ int err = 0;
+ bool force_sync = query_flags & AT_STATX_FORCE_SYNC;
+ bool do_update = false;
+ bool readdirplus_enabled = nfs_getattr_readdirplus_enable(inode);
+
+ trace_nfs_getattr_enter(inode);
+
+ request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID |
+ STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME |
+ STATX_INO | STATX_SIZE | STATX_BLOCKS |
+ STATX_CHANGE_COOKIE;
+
+ if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) {
+ if (readdirplus_enabled)
+ nfs_readdirplus_parent_cache_hit(path->dentry);
+ goto out_no_revalidate;
+ }
+
+ /* Flush out writes to the server in order to update c/mtime/version. */
+ if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_CHANGE_COOKIE)) &&
+ S_ISREG(inode->i_mode))
+ filemap_write_and_wait(inode->i_mapping);
+
+ /*
+ * We may force a getattr if the user cares about atime.
+ *
+ * Note that we only have to check the vfsmount flags here:
+ * - NFS always sets S_NOATIME by so checking it would give a
+ * bogus result
+ * - NFS never sets SB_NOATIME or SB_NODIRATIME so there is
+ * no point in checking those.
+ */
+ if ((path->mnt->mnt_flags & MNT_NOATIME) ||
+ ((path->mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
+ request_mask &= ~STATX_ATIME;
+
+ /* Is the user requesting attributes that might need revalidation? */
+ if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME|
+ STATX_MTIME|STATX_UID|STATX_GID|
+ STATX_SIZE|STATX_BLOCKS|
+ STATX_CHANGE_COOKIE)))
+ goto out_no_revalidate;
+
+ /* Check whether the cached attributes are stale */
+ do_update |= force_sync || nfs_attribute_cache_expired(inode);
+ cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
+ do_update |= cache_validity & NFS_INO_INVALID_CHANGE;
+ if (request_mask & STATX_ATIME)
+ do_update |= cache_validity & NFS_INO_INVALID_ATIME;
+ if (request_mask & STATX_CTIME)
+ do_update |= cache_validity & NFS_INO_INVALID_CTIME;
+ if (request_mask & STATX_MTIME)
+ do_update |= cache_validity & NFS_INO_INVALID_MTIME;
+ if (request_mask & STATX_SIZE)
+ do_update |= cache_validity & NFS_INO_INVALID_SIZE;
+ if (request_mask & STATX_NLINK)
+ do_update |= cache_validity & NFS_INO_INVALID_NLINK;
+ if (request_mask & STATX_MODE)
+ do_update |= cache_validity & NFS_INO_INVALID_MODE;
+ if (request_mask & (STATX_UID | STATX_GID))
+ do_update |= cache_validity & NFS_INO_INVALID_OTHER;
+ if (request_mask & STATX_BLOCKS)
+ do_update |= cache_validity & NFS_INO_INVALID_BLOCKS;
+
+ if (do_update) {
+ if (readdirplus_enabled)
+ nfs_readdirplus_parent_cache_miss(path->dentry);
+ err = __nfs_revalidate_inode(server, inode);
+ if (err)
+ goto out;
+ } else if (readdirplus_enabled)
+ nfs_readdirplus_parent_cache_hit(path->dentry);
+out_no_revalidate:
+ /* Only return attributes that were revalidated. */
+ stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask;
+
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
+ stat->change_cookie = inode_peek_iversion_raw(inode);
+ stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC;
+ if (server->change_attr_type != NFS4_CHANGE_TYPE_IS_UNDEFINED)
+ stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC;
+ if (S_ISDIR(inode->i_mode))
+ stat->blksize = NFS_SERVER(inode)->dtsize;
+out:
+ trace_nfs_getattr_exit(inode, err);
+ return err;
+}
+EXPORT_SYMBOL_GPL(nfs_getattr);
+
+static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
+{
+ refcount_set(&l_ctx->count, 1);
+ l_ctx->lockowner = current->files;
+ INIT_LIST_HEAD(&l_ctx->list);
+ atomic_set(&l_ctx->io_count, 0);
+}
+
+static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
+{
+ struct nfs_lock_context *pos;
+
+ list_for_each_entry_rcu(pos, &ctx->lock_context.list, list) {
+ if (pos->lockowner != current->files)
+ continue;
+ if (refcount_inc_not_zero(&pos->count))
+ return pos;
+ }
+ return NULL;
+}
+
+struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
+{
+ struct nfs_lock_context *res, *new = NULL;
+ struct inode *inode = d_inode(ctx->dentry);
+
+ rcu_read_lock();
+ res = __nfs_find_lock_context(ctx);
+ rcu_read_unlock();
+ if (res == NULL) {
+ new = kmalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ nfs_init_lock_context(new);
+ spin_lock(&inode->i_lock);
+ res = __nfs_find_lock_context(ctx);
+ if (res == NULL) {
+ new->open_context = get_nfs_open_context(ctx);
+ if (new->open_context) {
+ list_add_tail_rcu(&new->list,
+ &ctx->lock_context.list);
+ res = new;
+ new = NULL;
+ } else
+ res = ERR_PTR(-EBADF);
+ }
+ spin_unlock(&inode->i_lock);
+ kfree(new);
+ }
+ return res;
+}
+EXPORT_SYMBOL_GPL(nfs_get_lock_context);
+
+void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
+{
+ struct nfs_open_context *ctx = l_ctx->open_context;
+ struct inode *inode = d_inode(ctx->dentry);
+
+ if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock))
+ return;
+ list_del_rcu(&l_ctx->list);
+ spin_unlock(&inode->i_lock);
+ put_nfs_open_context(ctx);
+ kfree_rcu(l_ctx, rcu_head);
+}
+EXPORT_SYMBOL_GPL(nfs_put_lock_context);
+
+/**
+ * nfs_close_context - Common close_context() routine NFSv2/v3
+ * @ctx: pointer to context
+ * @is_sync: is this a synchronous close
+ *
+ * Ensure that the attributes are up to date if we're mounted
+ * with close-to-open semantics and we have cached data that will
+ * need to be revalidated on open.
+ */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+ struct nfs_inode *nfsi;
+ struct inode *inode;
+
+ if (!(ctx->mode & FMODE_WRITE))
+ return;
+ if (!is_sync)
+ return;
+ inode = d_inode(ctx->dentry);
+ if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ return;
+ nfsi = NFS_I(inode);
+ if (inode->i_mapping->nrpages == 0)
+ return;
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ return;
+ if (!list_empty(&nfsi->open_files))
+ return;
+ if (NFS_SERVER(inode)->flags & NFS_MOUNT_NOCTO)
+ return;
+ nfs_revalidate_inode(inode,
+ NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE);
+}
+EXPORT_SYMBOL_GPL(nfs_close_context);
+
+struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
+ fmode_t f_mode,
+ struct file *filp)
+{
+ struct nfs_open_context *ctx;
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+ nfs_sb_active(dentry->d_sb);
+ ctx->dentry = dget(dentry);
+ if (filp)
+ ctx->cred = get_cred(filp->f_cred);
+ else
+ ctx->cred = get_current_cred();
+ rcu_assign_pointer(ctx->ll_cred, NULL);
+ ctx->state = NULL;
+ ctx->mode = f_mode;
+ ctx->flags = 0;
+ ctx->error = 0;
+ ctx->flock_owner = (fl_owner_t)filp;
+ nfs_init_lock_context(&ctx->lock_context);
+ ctx->lock_context.open_context = ctx;
+ INIT_LIST_HEAD(&ctx->list);
+ ctx->mdsthreshold = NULL;
+ return ctx;
+}
+EXPORT_SYMBOL_GPL(alloc_nfs_open_context);
+
+struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
+{
+ if (ctx != NULL && refcount_inc_not_zero(&ctx->lock_context.count))
+ return ctx;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(get_nfs_open_context);
+
+static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
+{
+ struct inode *inode = d_inode(ctx->dentry);
+ struct super_block *sb = ctx->dentry->d_sb;
+
+ if (!refcount_dec_and_test(&ctx->lock_context.count))
+ return;
+ if (!list_empty(&ctx->list)) {
+ spin_lock(&inode->i_lock);
+ list_del_rcu(&ctx->list);
+ spin_unlock(&inode->i_lock);
+ }
+ if (inode != NULL)
+ NFS_PROTO(inode)->close_context(ctx, is_sync);
+ put_cred(ctx->cred);
+ dput(ctx->dentry);
+ nfs_sb_deactive(sb);
+ put_rpccred(rcu_dereference_protected(ctx->ll_cred, 1));
+ kfree(ctx->mdsthreshold);
+ kfree_rcu(ctx, rcu_head);
+}
+
+void put_nfs_open_context(struct nfs_open_context *ctx)
+{
+ __put_nfs_open_context(ctx, 0);
+}
+EXPORT_SYMBOL_GPL(put_nfs_open_context);
+
+static void put_nfs_open_context_sync(struct nfs_open_context *ctx)
+{
+ __put_nfs_open_context(ctx, 1);
+}
+
+/*
+ * Ensure that mmap has a recent RPC credential for use when writing out
+ * shared pages
+ */
+void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
+{
+ struct inode *inode = d_inode(ctx->dentry);
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ spin_lock(&inode->i_lock);
+ if (list_empty(&nfsi->open_files) &&
+ nfs_ooo_test(nfsi))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA |
+ NFS_INO_REVAL_FORCED);
+ list_add_tail_rcu(&ctx->list, &nfsi->open_files);
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
+
+void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+{
+ filp->private_data = get_nfs_open_context(ctx);
+ set_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags);
+ if (list_empty(&ctx->list))
+ nfs_inode_attach_open_context(ctx);
+}
+EXPORT_SYMBOL_GPL(nfs_file_set_open_context);
+
+/*
+ * Given an inode, search for an open context with the desired characteristics
+ */
+struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct cred *cred, fmode_t mode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_open_context *pos, *ctx = NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pos, &nfsi->open_files, list) {
+ if (cred != NULL && cred_fscmp(pos->cred, cred) != 0)
+ continue;
+ if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
+ continue;
+ if (!test_bit(NFS_CONTEXT_FILE_OPEN, &pos->flags))
+ continue;
+ ctx = get_nfs_open_context(pos);
+ if (ctx)
+ break;
+ }
+ rcu_read_unlock();
+ return ctx;
+}
+
+void nfs_file_clear_open_context(struct file *filp)
+{
+ struct nfs_open_context *ctx = nfs_file_open_context(filp);
+
+ if (ctx) {
+ struct inode *inode = d_inode(ctx->dentry);
+
+ clear_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags);
+ /*
+ * We fatal error on write before. Try to writeback
+ * every page again.
+ */
+ if (ctx->error < 0)
+ invalidate_inode_pages2(inode->i_mapping);
+ filp->private_data = NULL;
+ put_nfs_open_context_sync(ctx);
+ }
+}
+
+/*
+ * These allocate and release file read/write context information.
+ */
+int nfs_open(struct inode *inode, struct file *filp)
+{
+ struct nfs_open_context *ctx;
+
+ ctx = alloc_nfs_open_context(file_dentry(filp),
+ flags_to_mode(filp->f_flags), filp);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ nfs_file_set_open_context(filp, ctx);
+ put_nfs_open_context(ctx);
+ nfs_fscache_open_file(inode, filp);
+ return 0;
+}
+
+/*
+ * This function is called whenever some part of NFS notices that
+ * the cached attributes have to be refreshed.
+ */
+int
+__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
+{
+ int status = -ESTALE;
+ struct nfs_fattr *fattr = NULL;
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n",
+ inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode));
+
+ trace_nfs_revalidate_inode_enter(inode);
+
+ if (is_bad_inode(inode))
+ goto out;
+ if (NFS_STALE(inode))
+ goto out;
+
+ /* pNFS: Attributes aren't updated until we layoutcommit */
+ if (S_ISREG(inode->i_mode)) {
+ status = pnfs_sync_inode(inode, false);
+ if (status)
+ goto out;
+ }
+
+ status = -ENOMEM;
+ fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
+ if (fattr == NULL)
+ goto out;
+
+ nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
+
+ status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, inode);
+ if (status != 0) {
+ dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n",
+ inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(inode), status);
+ switch (status) {
+ case -ETIMEDOUT:
+ /* A soft timeout occurred. Use cached information? */
+ if (server->flags & NFS_MOUNT_SOFTREVAL)
+ status = 0;
+ break;
+ case -ESTALE:
+ if (!S_ISDIR(inode->i_mode))
+ nfs_set_inode_stale(inode);
+ else
+ nfs_zap_caches(inode);
+ }
+ goto out;
+ }
+
+ status = nfs_refresh_inode(inode, fattr);
+ if (status) {
+ dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n",
+ inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(inode), status);
+ goto out;
+ }
+
+ if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
+ nfs_zap_acl_cache(inode);
+
+ nfs_setsecurity(inode, fattr);
+
+ dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n",
+ inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(inode));
+
+out:
+ nfs_free_fattr(fattr);
+ trace_nfs_revalidate_inode_exit(inode, status);
+ return status;
+}
+
+int nfs_attribute_cache_expired(struct inode *inode)
+{
+ if (nfs_have_delegated_attributes(inode))
+ return 0;
+ return nfs_attribute_timeout(inode);
+}
+
+/**
+ * nfs_revalidate_inode - Revalidate the inode attributes
+ * @inode: pointer to inode struct
+ * @flags: cache flags to check
+ *
+ * Updates inode attribute information by retrieving the data from the server.
+ */
+int nfs_revalidate_inode(struct inode *inode, unsigned long flags)
+{
+ if (!nfs_check_cache_invalid(inode, flags))
+ return NFS_STALE(inode) ? -ESTALE : 0;
+ return __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+}
+EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
+
+static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
+{
+ int ret;
+
+ nfs_fscache_invalidate(inode, 0);
+ if (mapping->nrpages != 0) {
+ if (S_ISREG(inode->i_mode)) {
+ ret = nfs_sync_mapping(mapping);
+ if (ret < 0)
+ return ret;
+ }
+ ret = invalidate_inode_pages2(mapping);
+ if (ret < 0)
+ return ret;
+ }
+ nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
+
+ dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n",
+ inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(inode));
+ return 0;
+}
+
+/**
+ * nfs_clear_invalid_mapping - Conditionally clear a mapping
+ * @mapping: pointer to mapping
+ *
+ * If the NFS_INO_INVALID_DATA inode flag is set, clear the mapping.
+ */
+int nfs_clear_invalid_mapping(struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ unsigned long *bitlock = &nfsi->flags;
+ int ret = 0;
+
+ /*
+ * We must clear NFS_INO_INVALID_DATA first to ensure that
+ * invalidations that come in while we're shooting down the mappings
+ * are respected. But, that leaves a race window where one revalidator
+ * can clear the flag, and then another checks it before the mapping
+ * gets invalidated. Fix that by serializing access to this part of
+ * the function.
+ *
+ * At the same time, we need to allow other tasks to see whether we
+ * might be in the middle of invalidating the pages, so we only set
+ * the bit lock here if it looks like we're going to be doing that.
+ */
+ for (;;) {
+ ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING,
+ nfs_wait_bit_killable,
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
+ if (ret)
+ goto out;
+ spin_lock(&inode->i_lock);
+ if (test_bit(NFS_INO_INVALIDATING, bitlock)) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ break;
+ spin_unlock(&inode->i_lock);
+ goto out;
+ }
+
+ set_bit(NFS_INO_INVALIDATING, bitlock);
+ smp_wmb();
+ nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+ nfs_ooo_clear(nfsi);
+ spin_unlock(&inode->i_lock);
+ trace_nfs_invalidate_mapping_enter(inode);
+ ret = nfs_invalidate_mapping(inode, mapping);
+ trace_nfs_invalidate_mapping_exit(inode, ret);
+
+ clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
+ smp_mb__after_atomic();
+ wake_up_bit(bitlock, NFS_INO_INVALIDATING);
+out:
+ return ret;
+}
+
+bool nfs_mapping_need_revalidate_inode(struct inode *inode)
+{
+ return nfs_check_cache_invalid(inode, NFS_INO_INVALID_CHANGE) ||
+ NFS_STALE(inode);
+}
+
+int nfs_revalidate_mapping_rcu(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ unsigned long *bitlock = &nfsi->flags;
+ int ret = 0;
+
+ if (IS_SWAPFILE(inode))
+ goto out;
+ if (nfs_mapping_need_revalidate_inode(inode)) {
+ ret = -ECHILD;
+ goto out;
+ }
+ spin_lock(&inode->i_lock);
+ if (test_bit(NFS_INO_INVALIDATING, bitlock) ||
+ (nfsi->cache_validity & NFS_INO_INVALID_DATA))
+ ret = -ECHILD;
+ spin_unlock(&inode->i_lock);
+out:
+ return ret;
+}
+
+/**
+ * nfs_revalidate_mapping - Revalidate the pagecache
+ * @inode: pointer to host inode
+ * @mapping: pointer to mapping
+ */
+int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+{
+ /* swapfiles are not supposed to be shared. */
+ if (IS_SWAPFILE(inode))
+ return 0;
+
+ if (nfs_mapping_need_revalidate_inode(inode)) {
+ int ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (ret < 0)
+ return ret;
+ }
+
+ return nfs_clear_invalid_mapping(mapping);
+}
+
+static bool nfs_file_has_writers(struct nfs_inode *nfsi)
+{
+ struct inode *inode = &nfsi->vfs_inode;
+
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ if (list_empty(&nfsi->open_files))
+ return false;
+ return inode_is_open_for_write(inode);
+}
+
+static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi)
+{
+ return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi);
+}
+
+static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+ struct timespec64 ts;
+
+ if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
+ && (fattr->valid & NFS_ATTR_FATTR_CHANGE)
+ && inode_eq_iversion_raw(inode, fattr->pre_change_attr)) {
+ inode_set_iversion_raw(inode, fattr->change_attr);
+ if (S_ISDIR(inode->i_mode))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
+ else if (nfs_server_capable(inode, NFS_CAP_XATTR))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR);
+ }
+ /* If we have atomic WCC data, we may update some attributes */
+ ts = inode_get_ctime(inode);
+ if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
+ && (fattr->valid & NFS_ATTR_FATTR_CTIME)
+ && timespec64_equal(&ts, &fattr->pre_ctime)) {
+ inode_set_ctime_to_ts(inode, fattr->ctime);
+ }
+
+ ts = inode->i_mtime;
+ if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
+ && (fattr->valid & NFS_ATTR_FATTR_MTIME)
+ && timespec64_equal(&ts, &fattr->pre_mtime)) {
+ inode->i_mtime = fattr->mtime;
+ }
+ if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
+ && (fattr->valid & NFS_ATTR_FATTR_SIZE)
+ && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
+ && !nfs_have_writebacks(inode)) {
+ trace_nfs_size_wcc(inode, fattr->size);
+ i_size_write(inode, nfs_size_to_loff_t(fattr->size));
+ }
+}
+
+/**
+ * nfs_check_inode_attributes - verify consistency of the inode attribute cache
+ * @inode: pointer to inode
+ * @fattr: updated attributes
+ *
+ * Verifies the attribute cache. If we have just changed the attributes,
+ * so that fattr carries weak cache consistency data, then it may
+ * also update the ctime/mtime/change_attribute.
+ */
+static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fattr)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ loff_t cur_size, new_isize;
+ unsigned long invalid = 0;
+ struct timespec64 ts;
+
+ if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ return 0;
+
+ if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) {
+ /* Only a mounted-on-fileid? Just exit */
+ if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
+ return 0;
+ /* Has the inode gone and changed behind our back? */
+ } else if (nfsi->fileid != fattr->fileid) {
+ /* Is this perhaps the mounted-on fileid? */
+ if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) &&
+ nfsi->fileid == fattr->mounted_on_fileid)
+ return 0;
+ return -ESTALE;
+ }
+ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode))
+ return -ESTALE;
+
+
+ if (!nfs_file_has_buffered_writers(nfsi)) {
+ /* Verify a few of the more important attributes */
+ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr))
+ invalid |= NFS_INO_INVALID_CHANGE;
+
+ ts = inode->i_mtime;
+ if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime))
+ invalid |= NFS_INO_INVALID_MTIME;
+
+ ts = inode_get_ctime(inode);
+ if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec64_equal(&ts, &fattr->ctime))
+ invalid |= NFS_INO_INVALID_CTIME;
+
+ if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+ cur_size = i_size_read(inode);
+ new_isize = nfs_size_to_loff_t(fattr->size);
+ if (cur_size != new_isize)
+ invalid |= NFS_INO_INVALID_SIZE;
+ }
+ }
+
+ /* Have any file permissions changed? */
+ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
+ invalid |= NFS_INO_INVALID_MODE;
+ if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid))
+ invalid |= NFS_INO_INVALID_OTHER;
+ if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid))
+ invalid |= NFS_INO_INVALID_OTHER;
+
+ /* Has the link count changed? */
+ if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
+ invalid |= NFS_INO_INVALID_NLINK;
+
+ ts = inode->i_atime;
+ if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime))
+ invalid |= NFS_INO_INVALID_ATIME;
+
+ if (invalid != 0)
+ nfs_set_cache_invalid(inode, invalid);
+
+ nfsi->read_cache_jiffies = fattr->time_start;
+ return 0;
+}
+
+static atomic_long_t nfs_attr_generation_counter;
+
+static unsigned long nfs_read_attr_generation_counter(void)
+{
+ return atomic_long_read(&nfs_attr_generation_counter);
+}
+
+unsigned long nfs_inc_attr_generation_counter(void)
+{
+ return atomic_long_inc_return(&nfs_attr_generation_counter);
+}
+EXPORT_SYMBOL_GPL(nfs_inc_attr_generation_counter);
+
+void nfs_fattr_init(struct nfs_fattr *fattr)
+{
+ fattr->valid = 0;
+ fattr->time_start = jiffies;
+ fattr->gencount = nfs_inc_attr_generation_counter();
+ fattr->owner_name = NULL;
+ fattr->group_name = NULL;
+}
+EXPORT_SYMBOL_GPL(nfs_fattr_init);
+
+/**
+ * nfs_fattr_set_barrier
+ * @fattr: attributes
+ *
+ * Used to set a barrier after an attribute was updated. This
+ * barrier ensures that older attributes from RPC calls that may
+ * have raced with our update cannot clobber these new values.
+ * Note that you are still responsible for ensuring that other
+ * operations which change the attribute on the server do not
+ * collide.
+ */
+void nfs_fattr_set_barrier(struct nfs_fattr *fattr)
+{
+ fattr->gencount = nfs_inc_attr_generation_counter();
+}
+
+struct nfs_fattr *nfs_alloc_fattr(void)
+{
+ struct nfs_fattr *fattr;
+
+ fattr = kmalloc(sizeof(*fattr), GFP_KERNEL);
+ if (fattr != NULL) {
+ nfs_fattr_init(fattr);
+ fattr->label = NULL;
+ }
+ return fattr;
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_fattr);
+
+struct nfs_fattr *nfs_alloc_fattr_with_label(struct nfs_server *server)
+{
+ struct nfs_fattr *fattr = nfs_alloc_fattr();
+
+ if (!fattr)
+ return NULL;
+
+ fattr->label = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(fattr->label)) {
+ kfree(fattr);
+ return NULL;
+ }
+
+ return fattr;
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_fattr_with_label);
+
+struct nfs_fh *nfs_alloc_fhandle(void)
+{
+ struct nfs_fh *fh;
+
+ fh = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
+ if (fh != NULL)
+ fh->size = 0;
+ return fh;
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_fhandle);
+
+#ifdef NFS_DEBUG
+/*
+ * _nfs_display_fhandle_hash - calculate the crc32 hash for the filehandle
+ * in the same way that wireshark does
+ *
+ * @fh: file handle
+ *
+ * For debugging only.
+ */
+u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh)
+{
+ /* wireshark uses 32-bit AUTODIN crc and does a bitwise
+ * not on the result */
+ return nfs_fhandle_hash(fh);
+}
+EXPORT_SYMBOL_GPL(_nfs_display_fhandle_hash);
+
+/*
+ * _nfs_display_fhandle - display an NFS file handle on the console
+ *
+ * @fh: file handle to display
+ * @caption: display caption
+ *
+ * For debugging only.
+ */
+void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption)
+{
+ unsigned short i;
+
+ if (fh == NULL || fh->size == 0) {
+ printk(KERN_DEFAULT "%s at %p is empty\n", caption, fh);
+ return;
+ }
+
+ printk(KERN_DEFAULT "%s at %p is %u bytes, crc: 0x%08x:\n",
+ caption, fh, fh->size, _nfs_display_fhandle_hash(fh));
+ for (i = 0; i < fh->size; i += 16) {
+ __be32 *pos = (__be32 *)&fh->data[i];
+
+ switch ((fh->size - i - 1) >> 2) {
+ case 0:
+ printk(KERN_DEFAULT " %08x\n",
+ be32_to_cpup(pos));
+ break;
+ case 1:
+ printk(KERN_DEFAULT " %08x %08x\n",
+ be32_to_cpup(pos), be32_to_cpup(pos + 1));
+ break;
+ case 2:
+ printk(KERN_DEFAULT " %08x %08x %08x\n",
+ be32_to_cpup(pos), be32_to_cpup(pos + 1),
+ be32_to_cpup(pos + 2));
+ break;
+ default:
+ printk(KERN_DEFAULT " %08x %08x %08x %08x\n",
+ be32_to_cpup(pos), be32_to_cpup(pos + 1),
+ be32_to_cpup(pos + 2), be32_to_cpup(pos + 3));
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(_nfs_display_fhandle);
+#endif
+
+/**
+ * nfs_inode_attrs_cmp_generic - compare attributes
+ * @fattr: attributes
+ * @inode: pointer to inode
+ *
+ * Attempt to divine whether or not an RPC call reply carrying stale
+ * attributes got scheduled after another call carrying updated ones.
+ * Note also the check for wraparound of 'attr_gencount'
+ *
+ * The function returns '1' if it thinks the attributes in @fattr are
+ * more recent than the ones cached in @inode. Otherwise it returns
+ * the value '0'.
+ */
+static int nfs_inode_attrs_cmp_generic(const struct nfs_fattr *fattr,
+ const struct inode *inode)
+{
+ unsigned long attr_gencount = NFS_I(inode)->attr_gencount;
+
+ return (long)(fattr->gencount - attr_gencount) > 0 ||
+ (long)(attr_gencount - nfs_read_attr_generation_counter()) > 0;
+}
+
+/**
+ * nfs_inode_attrs_cmp_monotonic - compare attributes
+ * @fattr: attributes
+ * @inode: pointer to inode
+ *
+ * Attempt to divine whether or not an RPC call reply carrying stale
+ * attributes got scheduled after another call carrying updated ones.
+ *
+ * We assume that the server observes monotonic semantics for
+ * the change attribute, so a larger value means that the attributes in
+ * @fattr are more recent, in which case the function returns the
+ * value '1'.
+ * A return value of '0' indicates no measurable change
+ * A return value of '-1' means that the attributes in @inode are
+ * more recent.
+ */
+static int nfs_inode_attrs_cmp_monotonic(const struct nfs_fattr *fattr,
+ const struct inode *inode)
+{
+ s64 diff = fattr->change_attr - inode_peek_iversion_raw(inode);
+ if (diff > 0)
+ return 1;
+ return diff == 0 ? 0 : -1;
+}
+
+/**
+ * nfs_inode_attrs_cmp_strict_monotonic - compare attributes
+ * @fattr: attributes
+ * @inode: pointer to inode
+ *
+ * Attempt to divine whether or not an RPC call reply carrying stale
+ * attributes got scheduled after another call carrying updated ones.
+ *
+ * We assume that the server observes strictly monotonic semantics for
+ * the change attribute, so a larger value means that the attributes in
+ * @fattr are more recent, in which case the function returns the
+ * value '1'.
+ * A return value of '-1' means that the attributes in @inode are
+ * more recent or unchanged.
+ */
+static int nfs_inode_attrs_cmp_strict_monotonic(const struct nfs_fattr *fattr,
+ const struct inode *inode)
+{
+ return nfs_inode_attrs_cmp_monotonic(fattr, inode) > 0 ? 1 : -1;
+}
+
+/**
+ * nfs_inode_attrs_cmp - compare attributes
+ * @fattr: attributes
+ * @inode: pointer to inode
+ *
+ * This function returns '1' if it thinks the attributes in @fattr are
+ * more recent than the ones cached in @inode. It returns '-1' if
+ * the attributes in @inode are more recent than the ones in @fattr,
+ * and it returns 0 if not sure.
+ */
+static int nfs_inode_attrs_cmp(const struct nfs_fattr *fattr,
+ const struct inode *inode)
+{
+ if (nfs_inode_attrs_cmp_generic(fattr, inode) > 0)
+ return 1;
+ switch (NFS_SERVER(inode)->change_attr_type) {
+ case NFS4_CHANGE_TYPE_IS_UNDEFINED:
+ break;
+ case NFS4_CHANGE_TYPE_IS_TIME_METADATA:
+ if (!(fattr->valid & NFS_ATTR_FATTR_CHANGE))
+ break;
+ return nfs_inode_attrs_cmp_monotonic(fattr, inode);
+ default:
+ if (!(fattr->valid & NFS_ATTR_FATTR_CHANGE))
+ break;
+ return nfs_inode_attrs_cmp_strict_monotonic(fattr, inode);
+ }
+ return 0;
+}
+
+/**
+ * nfs_inode_finish_partial_attr_update - complete a previous inode update
+ * @fattr: attributes
+ * @inode: pointer to inode
+ *
+ * Returns '1' if the last attribute update left the inode cached
+ * attributes in a partially unrevalidated state, and @fattr
+ * matches the change attribute of that partial update.
+ * Otherwise returns '0'.
+ */
+static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr,
+ const struct inode *inode)
+{
+ const unsigned long check_valid =
+ NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME |
+ NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE |
+ NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_OTHER |
+ NFS_INO_INVALID_NLINK;
+ unsigned long cache_validity = NFS_I(inode)->cache_validity;
+ enum nfs4_change_attr_type ctype = NFS_SERVER(inode)->change_attr_type;
+
+ if (ctype != NFS4_CHANGE_TYPE_IS_UNDEFINED &&
+ !(cache_validity & NFS_INO_INVALID_CHANGE) &&
+ (cache_validity & check_valid) != 0 &&
+ (fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
+ nfs_inode_attrs_cmp_monotonic(fattr, inode) == 0)
+ return 1;
+ return 0;
+}
+
+static void nfs_ooo_merge(struct nfs_inode *nfsi,
+ u64 start, u64 end)
+{
+ int i, cnt;
+
+ if (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)
+ /* No point merging anything */
+ return;
+
+ if (!nfsi->ooo) {
+ nfsi->ooo = kmalloc(sizeof(*nfsi->ooo), GFP_ATOMIC);
+ if (!nfsi->ooo) {
+ nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
+ return;
+ }
+ nfsi->ooo->cnt = 0;
+ }
+
+ /* add this range, merging if possible */
+ cnt = nfsi->ooo->cnt;
+ for (i = 0; i < cnt; i++) {
+ if (end == nfsi->ooo->gap[i].start)
+ end = nfsi->ooo->gap[i].end;
+ else if (start == nfsi->ooo->gap[i].end)
+ start = nfsi->ooo->gap[i].start;
+ else
+ continue;
+ /* Remove 'i' from table and loop to insert the new range */
+ cnt -= 1;
+ nfsi->ooo->gap[i] = nfsi->ooo->gap[cnt];
+ i = -1;
+ }
+ if (start != end) {
+ if (cnt >= ARRAY_SIZE(nfsi->ooo->gap)) {
+ nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
+ kfree(nfsi->ooo);
+ nfsi->ooo = NULL;
+ return;
+ }
+ nfsi->ooo->gap[cnt].start = start;
+ nfsi->ooo->gap[cnt].end = end;
+ cnt += 1;
+ }
+ nfsi->ooo->cnt = cnt;
+}
+
+static void nfs_ooo_record(struct nfs_inode *nfsi,
+ struct nfs_fattr *fattr)
+{
+ /* This reply was out-of-order, so record in the
+ * pre/post change id, possibly cancelling
+ * gaps created when iversion was jumpped forward.
+ */
+ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) &&
+ (fattr->valid & NFS_ATTR_FATTR_PRECHANGE))
+ nfs_ooo_merge(nfsi,
+ fattr->change_attr,
+ fattr->pre_change_attr);
+}
+
+static int nfs_refresh_inode_locked(struct inode *inode,
+ struct nfs_fattr *fattr)
+{
+ int attr_cmp = nfs_inode_attrs_cmp(fattr, inode);
+ int ret = 0;
+
+ trace_nfs_refresh_inode_enter(inode);
+
+ if (attr_cmp > 0 || nfs_inode_finish_partial_attr_update(fattr, inode))
+ ret = nfs_update_inode(inode, fattr);
+ else {
+ nfs_ooo_record(NFS_I(inode), fattr);
+
+ if (attr_cmp == 0)
+ ret = nfs_check_inode_attributes(inode, fattr);
+ }
+
+ trace_nfs_refresh_inode_exit(inode, ret);
+ return ret;
+}
+
+/**
+ * nfs_refresh_inode - try to update the inode attribute cache
+ * @inode: pointer to inode
+ * @fattr: updated attributes
+ *
+ * Check that an RPC call that returned attributes has not overlapped with
+ * other recent updates of the inode metadata, then decide whether it is
+ * safe to do a full update of the inode attributes, or whether just to
+ * call nfs_check_inode_attributes.
+ */
+int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+ int status;
+
+ if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+ return 0;
+ spin_lock(&inode->i_lock);
+ status = nfs_refresh_inode_locked(inode, fattr);
+ spin_unlock(&inode->i_lock);
+
+ return status;
+}
+EXPORT_SYMBOL_GPL(nfs_refresh_inode);
+
+static int nfs_post_op_update_inode_locked(struct inode *inode,
+ struct nfs_fattr *fattr, unsigned int invalid)
+{
+ if (S_ISDIR(inode->i_mode))
+ invalid |= NFS_INO_INVALID_DATA;
+ nfs_set_cache_invalid(inode, invalid);
+ if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+ return 0;
+ return nfs_refresh_inode_locked(inode, fattr);
+}
+
+/**
+ * nfs_post_op_update_inode - try to update the inode attribute cache
+ * @inode: pointer to inode
+ * @fattr: updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it.
+ *
+ * NB: if the server didn't return any post op attributes, this
+ * function will force the retrieval of attributes before the next
+ * NFS request. Thus it should be used only for operations that
+ * are expected to change one or more attributes, to avoid
+ * unnecessary NFS requests and trips through nfs_update_inode().
+ */
+int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+ int status;
+
+ spin_lock(&inode->i_lock);
+ nfs_fattr_set_barrier(fattr);
+ status = nfs_post_op_update_inode_locked(inode, fattr,
+ NFS_INO_INVALID_CHANGE
+ | NFS_INO_INVALID_CTIME
+ | NFS_INO_REVAL_FORCED);
+ spin_unlock(&inode->i_lock);
+
+ return status;
+}
+EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
+
+/**
+ * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache
+ * @inode: pointer to inode
+ * @fattr: updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it. Fake up
+ * weak cache consistency data, if none exist.
+ *
+ * This function is mainly designed to be used by the ->write_done() functions.
+ */
+int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr)
+{
+ int attr_cmp = nfs_inode_attrs_cmp(fattr, inode);
+ int status;
+
+ /* Don't do a WCC update if these attributes are already stale */
+ if (attr_cmp < 0)
+ return 0;
+ if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !attr_cmp) {
+ /* Record the pre/post change info before clearing PRECHANGE */
+ nfs_ooo_record(NFS_I(inode), fattr);
+ fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
+ | NFS_ATTR_FATTR_PRESIZE
+ | NFS_ATTR_FATTR_PREMTIME
+ | NFS_ATTR_FATTR_PRECTIME);
+ goto out_noforce;
+ }
+ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
+ (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
+ fattr->pre_change_attr = inode_peek_iversion_raw(inode);
+ fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
+ }
+ if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
+ (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) {
+ fattr->pre_ctime = inode_get_ctime(inode);
+ fattr->valid |= NFS_ATTR_FATTR_PRECTIME;
+ }
+ if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
+ (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
+ fattr->pre_mtime = inode->i_mtime;
+ fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
+ }
+ if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
+ (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) {
+ fattr->pre_size = i_size_read(inode);
+ fattr->valid |= NFS_ATTR_FATTR_PRESIZE;
+ }
+out_noforce:
+ status = nfs_post_op_update_inode_locked(inode, fattr,
+ NFS_INO_INVALID_CHANGE
+ | NFS_INO_INVALID_CTIME
+ | NFS_INO_INVALID_MTIME
+ | NFS_INO_INVALID_BLOCKS);
+ return status;
+}
+
+/**
+ * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
+ * @inode: pointer to inode
+ * @fattr: updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it. Fake up
+ * weak cache consistency data, if none exist.
+ *
+ * This function is mainly designed to be used by the ->write_done() functions.
+ */
+int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
+{
+ int status;
+
+ spin_lock(&inode->i_lock);
+ nfs_fattr_set_barrier(fattr);
+ status = nfs_post_op_update_inode_force_wcc_locked(inode, fattr);
+ spin_unlock(&inode->i_lock);
+ return status;
+}
+EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc);
+
+
+/*
+ * Many nfs protocol calls return the new file attributes after
+ * an operation. Here we update the inode to reflect the state
+ * of the server's inode.
+ *
+ * This is a bit tricky because we have to make sure all dirty pages
+ * have been sent off to the server before calling invalidate_inode_pages.
+ * To make sure no other process adds more write requests while we try
+ * our best to flush them, we make them sleep during the attribute refresh.
+ *
+ * A very similar scenario holds for the dir cache.
+ */
+static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_inode *nfsi = NFS_I(inode);
+ loff_t cur_isize, new_isize;
+ u64 fattr_supported = server->fattr_valid;
+ unsigned long invalid = 0;
+ unsigned long now = jiffies;
+ unsigned long save_cache_validity;
+ bool have_writers = nfs_file_has_buffered_writers(nfsi);
+ bool cache_revalidated = true;
+ bool attr_changed = false;
+ bool have_delegation;
+
+ dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
+ __func__, inode->i_sb->s_id, inode->i_ino,
+ nfs_display_fhandle_hash(NFS_FH(inode)),
+ atomic_read(&inode->i_count), fattr->valid);
+
+ if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) {
+ /* Only a mounted-on-fileid? Just exit */
+ if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
+ return 0;
+ /* Has the inode gone and changed behind our back? */
+ } else if (nfsi->fileid != fattr->fileid) {
+ /* Is this perhaps the mounted-on fileid? */
+ if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) &&
+ nfsi->fileid == fattr->mounted_on_fileid)
+ return 0;
+ printk(KERN_ERR "NFS: server %s error: fileid changed\n"
+ "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n",
+ NFS_SERVER(inode)->nfs_client->cl_hostname,
+ inode->i_sb->s_id, (long long)nfsi->fileid,
+ (long long)fattr->fileid);
+ goto out_err;
+ }
+
+ /*
+ * Make sure the inode's type hasn't changed.
+ */
+ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode)) {
+ /*
+ * Big trouble! The inode has become a different object.
+ */
+ printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n",
+ __func__, inode->i_ino, inode->i_mode, fattr->mode);
+ goto out_err;
+ }
+
+ /* Update the fsid? */
+ if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
+ !nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
+ !IS_AUTOMOUNT(inode))
+ server->fsid = fattr->fsid;
+
+ /* Save the delegation state before clearing cache_validity */
+ have_delegation = nfs_have_delegated_attributes(inode);
+
+ /*
+ * Update the read time so we don't revalidate too often.
+ */
+ nfsi->read_cache_jiffies = fattr->time_start;
+
+ save_cache_validity = nfsi->cache_validity;
+ nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ATIME
+ | NFS_INO_REVAL_FORCED
+ | NFS_INO_INVALID_BLOCKS);
+
+ /* Do atomic weak cache consistency updates */
+ nfs_wcc_update_inode(inode, fattr);
+
+ if (pnfs_layoutcommit_outstanding(inode)) {
+ nfsi->cache_validity |=
+ save_cache_validity &
+ (NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME |
+ NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE |
+ NFS_INO_INVALID_BLOCKS);
+ cache_revalidated = false;
+ }
+
+ /* More cache consistency checks */
+ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
+ if (!have_writers && nfsi->ooo && nfsi->ooo->cnt == 1 &&
+ nfsi->ooo->gap[0].end == inode_peek_iversion_raw(inode)) {
+ /* There is one remaining gap that hasn't been
+ * merged into iversion - do that now.
+ */
+ inode_set_iversion_raw(inode, nfsi->ooo->gap[0].start);
+ kfree(nfsi->ooo);
+ nfsi->ooo = NULL;
+ }
+ if (!inode_eq_iversion_raw(inode, fattr->change_attr)) {
+ /* Could it be a race with writeback? */
+ if (!(have_writers || have_delegation)) {
+ invalid |= NFS_INO_INVALID_DATA
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_XATTR;
+ /* Force revalidate of all attributes */
+ save_cache_validity |= NFS_INO_INVALID_CTIME
+ | NFS_INO_INVALID_MTIME
+ | NFS_INO_INVALID_SIZE
+ | NFS_INO_INVALID_BLOCKS
+ | NFS_INO_INVALID_NLINK
+ | NFS_INO_INVALID_MODE
+ | NFS_INO_INVALID_OTHER;
+ if (S_ISDIR(inode->i_mode))
+ nfs_force_lookup_revalidate(inode);
+ attr_changed = true;
+ dprintk("NFS: change_attr change on server for file %s/%ld\n",
+ inode->i_sb->s_id,
+ inode->i_ino);
+ } else if (!have_delegation) {
+ nfs_ooo_record(nfsi, fattr);
+ nfs_ooo_merge(nfsi, inode_peek_iversion_raw(inode),
+ fattr->change_attr);
+ }
+ inode_set_iversion_raw(inode, fattr->change_attr);
+ }
+ } else {
+ nfsi->cache_validity |=
+ save_cache_validity & NFS_INO_INVALID_CHANGE;
+ if (!have_delegation ||
+ (nfsi->cache_validity & NFS_INO_INVALID_CHANGE) != 0)
+ cache_revalidated = false;
+ }
+
+ if (fattr->valid & NFS_ATTR_FATTR_MTIME)
+ inode->i_mtime = fattr->mtime;
+ else if (fattr_supported & NFS_ATTR_FATTR_MTIME)
+ nfsi->cache_validity |=
+ save_cache_validity & NFS_INO_INVALID_MTIME;
+
+ if (fattr->valid & NFS_ATTR_FATTR_CTIME)
+ inode_set_ctime_to_ts(inode, fattr->ctime);
+ else if (fattr_supported & NFS_ATTR_FATTR_CTIME)
+ nfsi->cache_validity |=
+ save_cache_validity & NFS_INO_INVALID_CTIME;
+
+ /* Check if our cached file size is stale */
+ if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+ new_isize = nfs_size_to_loff_t(fattr->size);
+ cur_isize = i_size_read(inode);
+ if (new_isize != cur_isize && !have_delegation) {
+ /* Do we perhaps have any outstanding writes, or has
+ * the file grown beyond our last write? */
+ if (!nfs_have_writebacks(inode) || new_isize > cur_isize) {
+ trace_nfs_size_update(inode, new_isize);
+ i_size_write(inode, new_isize);
+ if (!have_writers)
+ invalid |= NFS_INO_INVALID_DATA;
+ }
+ }
+ if (new_isize == 0 &&
+ !(fattr->valid & (NFS_ATTR_FATTR_SPACE_USED |
+ NFS_ATTR_FATTR_BLOCKS_USED))) {
+ fattr->du.nfs3.used = 0;
+ fattr->valid |= NFS_ATTR_FATTR_SPACE_USED;
+ }
+ } else
+ nfsi->cache_validity |=
+ save_cache_validity & NFS_INO_INVALID_SIZE;
+
+ if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+ inode->i_atime = fattr->atime;
+ else if (fattr_supported & NFS_ATTR_FATTR_ATIME)
+ nfsi->cache_validity |=
+ save_cache_validity & NFS_INO_INVALID_ATIME;
+
+ if (fattr->valid & NFS_ATTR_FATTR_MODE) {
+ if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
+ umode_t newmode = inode->i_mode & S_IFMT;
+ newmode |= fattr->mode & S_IALLUGO;
+ inode->i_mode = newmode;
+ invalid |= NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
+ }
+ } else if (fattr_supported & NFS_ATTR_FATTR_MODE)
+ nfsi->cache_validity |=
+ save_cache_validity & NFS_INO_INVALID_MODE;
+
+ if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
+ if (!uid_eq(inode->i_uid, fattr->uid)) {
+ invalid |= NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
+ inode->i_uid = fattr->uid;
+ }
+ } else if (fattr_supported & NFS_ATTR_FATTR_OWNER)
+ nfsi->cache_validity |=
+ save_cache_validity & NFS_INO_INVALID_OTHER;
+
+ if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
+ if (!gid_eq(inode->i_gid, fattr->gid)) {
+ invalid |= NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
+ inode->i_gid = fattr->gid;
+ }
+ } else if (fattr_supported & NFS_ATTR_FATTR_GROUP)
+ nfsi->cache_validity |=
+ save_cache_validity & NFS_INO_INVALID_OTHER;
+
+ if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
+ if (inode->i_nlink != fattr->nlink)
+ set_nlink(inode, fattr->nlink);
+ } else if (fattr_supported & NFS_ATTR_FATTR_NLINK)
+ nfsi->cache_validity |=
+ save_cache_validity & NFS_INO_INVALID_NLINK;
+
+ if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
+ /*
+ * report the blocks in 512byte units
+ */
+ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
+ } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED)
+ nfsi->cache_validity |=
+ save_cache_validity & NFS_INO_INVALID_BLOCKS;
+
+ if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+ inode->i_blocks = fattr->du.nfs2.blocks;
+ else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED)
+ nfsi->cache_validity |=
+ save_cache_validity & NFS_INO_INVALID_BLOCKS;
+
+ /* Update attrtimeo value if we're out of the unstable period */
+ if (attr_changed) {
+ nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
+ nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
+ nfsi->attrtimeo_timestamp = now;
+ /* Set barrier to be more recent than all outstanding updates */
+ nfsi->attr_gencount = nfs_inc_attr_generation_counter();
+ } else {
+ if (cache_revalidated) {
+ if (!time_in_range_open(now, nfsi->attrtimeo_timestamp,
+ nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
+ nfsi->attrtimeo <<= 1;
+ if (nfsi->attrtimeo > NFS_MAXATTRTIMEO(inode))
+ nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+ }
+ nfsi->attrtimeo_timestamp = now;
+ }
+ /* Set the barrier to be more recent than this fattr */
+ if ((long)(fattr->gencount - nfsi->attr_gencount) > 0)
+ nfsi->attr_gencount = fattr->gencount;
+ }
+
+ /* Don't invalidate the data if we were to blame */
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
+ || S_ISLNK(inode->i_mode)))
+ invalid &= ~NFS_INO_INVALID_DATA;
+ nfs_set_cache_invalid(inode, invalid);
+
+ return 0;
+ out_err:
+ /*
+ * No need to worry about unhashing the dentry, as the
+ * lookup validation will know that the inode is bad.
+ * (But we fall through to invalidate the caches.)
+ */
+ nfs_set_inode_stale_locked(inode);
+ return -ESTALE;
+}
+
+struct inode *nfs_alloc_inode(struct super_block *sb)
+{
+ struct nfs_inode *nfsi;
+ nfsi = alloc_inode_sb(sb, nfs_inode_cachep, GFP_KERNEL);
+ if (!nfsi)
+ return NULL;
+ nfsi->flags = 0UL;
+ nfsi->cache_validity = 0UL;
+ nfsi->ooo = NULL;
+#if IS_ENABLED(CONFIG_NFS_V4)
+ nfsi->nfs4_acl = NULL;
+#endif /* CONFIG_NFS_V4 */
+#ifdef CONFIG_NFS_V4_2
+ nfsi->xattr_cache = NULL;
+#endif
+ nfs_netfs_inode_init(nfsi);
+
+ return &nfsi->vfs_inode;
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_inode);
+
+void nfs_free_inode(struct inode *inode)
+{
+ kfree(NFS_I(inode)->ooo);
+ kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
+}
+EXPORT_SYMBOL_GPL(nfs_free_inode);
+
+static inline void nfs4_init_once(struct nfs_inode *nfsi)
+{
+#if IS_ENABLED(CONFIG_NFS_V4)
+ INIT_LIST_HEAD(&nfsi->open_states);
+ nfsi->delegation = NULL;
+ init_rwsem(&nfsi->rwsem);
+ nfsi->layout = NULL;
+#endif
+}
+
+static void init_once(void *foo)
+{
+ struct nfs_inode *nfsi = foo;
+
+ inode_init_once(&nfsi->vfs_inode);
+ INIT_LIST_HEAD(&nfsi->open_files);
+ INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
+ INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
+ nfs4_init_once(nfsi);
+}
+
+static int __init nfs_init_inodecache(void)
+{
+ nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
+ sizeof(struct nfs_inode),
+ 0, (SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ init_once);
+ if (nfs_inode_cachep == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void nfs_destroy_inodecache(void)
+{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
+ kmem_cache_destroy(nfs_inode_cachep);
+}
+
+struct workqueue_struct *nfsiod_workqueue;
+EXPORT_SYMBOL_GPL(nfsiod_workqueue);
+
+/*
+ * start up the nfsiod workqueue
+ */
+static int nfsiod_start(void)
+{
+ struct workqueue_struct *wq;
+ dprintk("RPC: creating workqueue nfsiod\n");
+ wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+ if (wq == NULL)
+ return -ENOMEM;
+ nfsiod_workqueue = wq;
+ return 0;
+}
+
+/*
+ * Destroy the nfsiod workqueue
+ */
+static void nfsiod_stop(void)
+{
+ struct workqueue_struct *wq;
+
+ wq = nfsiod_workqueue;
+ if (wq == NULL)
+ return;
+ nfsiod_workqueue = NULL;
+ destroy_workqueue(wq);
+}
+
+unsigned int nfs_net_id;
+EXPORT_SYMBOL_GPL(nfs_net_id);
+
+static int nfs_net_init(struct net *net)
+{
+ nfs_clients_init(net);
+ return nfs_fs_proc_net_init(net);
+}
+
+static void nfs_net_exit(struct net *net)
+{
+ nfs_fs_proc_net_exit(net);
+ nfs_clients_exit(net);
+}
+
+static struct pernet_operations nfs_net_ops = {
+ .init = nfs_net_init,
+ .exit = nfs_net_exit,
+ .id = &nfs_net_id,
+ .size = sizeof(struct nfs_net),
+};
+
+/*
+ * Initialize NFS
+ */
+static int __init init_nfs_fs(void)
+{
+ int err;
+
+ err = nfs_sysfs_init();
+ if (err < 0)
+ goto out10;
+
+ err = register_pernet_subsys(&nfs_net_ops);
+ if (err < 0)
+ goto out9;
+
+ err = nfsiod_start();
+ if (err)
+ goto out7;
+
+ err = nfs_fs_proc_init();
+ if (err)
+ goto out6;
+
+ err = nfs_init_nfspagecache();
+ if (err)
+ goto out5;
+
+ err = nfs_init_inodecache();
+ if (err)
+ goto out4;
+
+ err = nfs_init_readpagecache();
+ if (err)
+ goto out3;
+
+ err = nfs_init_writepagecache();
+ if (err)
+ goto out2;
+
+ err = nfs_init_directcache();
+ if (err)
+ goto out1;
+
+ rpc_proc_register(&init_net, &nfs_rpcstat);
+
+ err = register_nfs_fs();
+ if (err)
+ goto out0;
+
+ return 0;
+out0:
+ rpc_proc_unregister(&init_net, "nfs");
+ nfs_destroy_directcache();
+out1:
+ nfs_destroy_writepagecache();
+out2:
+ nfs_destroy_readpagecache();
+out3:
+ nfs_destroy_inodecache();
+out4:
+ nfs_destroy_nfspagecache();
+out5:
+ nfs_fs_proc_exit();
+out6:
+ nfsiod_stop();
+out7:
+ unregister_pernet_subsys(&nfs_net_ops);
+out9:
+ nfs_sysfs_exit();
+out10:
+ return err;
+}
+
+static void __exit exit_nfs_fs(void)
+{
+ nfs_destroy_directcache();
+ nfs_destroy_writepagecache();
+ nfs_destroy_readpagecache();
+ nfs_destroy_inodecache();
+ nfs_destroy_nfspagecache();
+ unregister_pernet_subsys(&nfs_net_ops);
+ rpc_proc_unregister(&init_net, "nfs");
+ unregister_nfs_fs();
+ nfs_fs_proc_exit();
+ nfsiod_stop();
+ nfs_sysfs_exit();
+}
+
+/* Not quite true; I just maintain it */
+MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
+MODULE_LICENSE("GPL");
+module_param(enable_ino64, bool, 0644);
+
+module_init(init_nfs_fs)
+module_exit(exit_nfs_fs)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
new file mode 100644
index 0000000000..b1fa81c9df
--- /dev/null
+++ b/fs/nfs/internal.h
@@ -0,0 +1,954 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NFS internal definitions
+ */
+
+#include "nfs4_fs.h"
+#include <linux/fs_context.h>
+#include <linux/security.h>
+#include <linux/crc32.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/nfs_page.h>
+#include <linux/wait_bit.h>
+
+#define NFS_SB_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS)
+
+extern const struct export_operations nfs_export_ops;
+
+struct nfs_string;
+struct nfs_pageio_descriptor;
+
+static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)
+{
+ if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid))
+ fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT;
+}
+
+static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr)
+{
+ if (((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) == 0) ||
+ (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) &&
+ ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0)))
+ return 0;
+ return 1;
+}
+
+static inline bool nfs_lookup_is_soft_revalidate(const struct dentry *dentry)
+{
+ if (!(NFS_SB(dentry->d_sb)->flags & NFS_MOUNT_SOFTREVAL))
+ return false;
+ if (!d_is_positive(dentry) || !NFS_FH(d_inode(dentry))->size)
+ return false;
+ return true;
+}
+
+static inline fmode_t flags_to_mode(int flags)
+{
+ fmode_t res = (__force fmode_t)flags & FMODE_EXEC;
+ if ((flags & O_ACCMODE) != O_WRONLY)
+ res |= FMODE_READ;
+ if ((flags & O_ACCMODE) != O_RDONLY)
+ res |= FMODE_WRITE;
+ return res;
+}
+
+/*
+ * Note: RFC 1813 doesn't limit the number of auth flavors that
+ * a server can return, so make something up.
+ */
+#define NFS_MAX_SECFLAVORS (12)
+
+/*
+ * Value used if the user did not specify a port value.
+ */
+#define NFS_UNSPEC_PORT (-1)
+
+#define NFS_UNSPEC_RETRANS (UINT_MAX)
+#define NFS_UNSPEC_TIMEO (UINT_MAX)
+
+struct nfs_client_initdata {
+ unsigned long init_flags;
+ const char *hostname; /* Hostname of the server */
+ const struct sockaddr_storage *addr; /* Address of the server */
+ const char *nodename; /* Hostname of the client */
+ const char *ip_addr; /* IP address of the client */
+ size_t addrlen;
+ struct nfs_subversion *nfs_mod;
+ int proto;
+ u32 minorversion;
+ unsigned int nconnect;
+ unsigned int max_connect;
+ struct net *net;
+ const struct rpc_timeout *timeparms;
+ const struct cred *cred;
+ struct xprtsec_parms xprtsec;
+ unsigned long connect_timeout;
+ unsigned long reconnect_timeout;
+};
+
+/*
+ * In-kernel mount arguments
+ */
+struct nfs_fs_context {
+ bool internal;
+ bool skip_reconfig_option_check;
+ bool need_mount;
+ bool sloppy;
+ unsigned int flags; /* NFS{,4}_MOUNT_* flags */
+ unsigned int rsize, wsize;
+ unsigned int timeo, retrans;
+ unsigned int acregmin, acregmax;
+ unsigned int acdirmin, acdirmax;
+ unsigned int namlen;
+ unsigned int options;
+ unsigned int bsize;
+ struct nfs_auth_info auth_info;
+ rpc_authflavor_t selected_flavor;
+ struct xprtsec_parms xprtsec;
+ char *client_address;
+ unsigned int version;
+ unsigned int minorversion;
+ char *fscache_uniq;
+ unsigned short protofamily;
+ unsigned short mountfamily;
+ bool has_sec_mnt_opts;
+
+ struct {
+ union {
+ struct sockaddr address;
+ struct sockaddr_storage _address;
+ };
+ size_t addrlen;
+ char *hostname;
+ u32 version;
+ int port;
+ unsigned short protocol;
+ } mount_server;
+
+ struct {
+ union {
+ struct sockaddr address;
+ struct sockaddr_storage _address;
+ };
+ size_t addrlen;
+ char *hostname;
+ char *export_path;
+ int port;
+ unsigned short protocol;
+ unsigned short nconnect;
+ unsigned short max_connect;
+ unsigned short export_path_len;
+ } nfs_server;
+
+ struct nfs_fh *mntfh;
+ struct nfs_server *server;
+ struct nfs_subversion *nfs_mod;
+
+ /* Information for a cloned mount. */
+ struct nfs_clone_mount {
+ struct super_block *sb;
+ struct dentry *dentry;
+ struct nfs_fattr *fattr;
+ unsigned int inherited_bsize;
+ } clone_data;
+};
+
+#define nfs_errorf(fc, fmt, ...) ((fc)->log.log ? \
+ errorf(fc, fmt, ## __VA_ARGS__) : \
+ ({ dprintk(fmt "\n", ## __VA_ARGS__); }))
+
+#define nfs_ferrorf(fc, fac, fmt, ...) ((fc)->log.log ? \
+ errorf(fc, fmt, ## __VA_ARGS__) : \
+ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); }))
+
+#define nfs_invalf(fc, fmt, ...) ((fc)->log.log ? \
+ invalf(fc, fmt, ## __VA_ARGS__) : \
+ ({ dprintk(fmt "\n", ## __VA_ARGS__); -EINVAL; }))
+
+#define nfs_finvalf(fc, fac, fmt, ...) ((fc)->log.log ? \
+ invalf(fc, fmt, ## __VA_ARGS__) : \
+ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); -EINVAL; }))
+
+#define nfs_warnf(fc, fmt, ...) ((fc)->log.log ? \
+ warnf(fc, fmt, ## __VA_ARGS__) : \
+ ({ dprintk(fmt "\n", ## __VA_ARGS__); }))
+
+#define nfs_fwarnf(fc, fac, fmt, ...) ((fc)->log.log ? \
+ warnf(fc, fmt, ## __VA_ARGS__) : \
+ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); }))
+
+static inline struct nfs_fs_context *nfs_fc2context(const struct fs_context *fc)
+{
+ return fc->fs_private;
+}
+
+/* mount_clnt.c */
+struct nfs_mount_request {
+ struct sockaddr_storage *sap;
+ size_t salen;
+ char *hostname;
+ char *dirpath;
+ u32 version;
+ unsigned short protocol;
+ struct nfs_fh *fh;
+ int noresvport;
+ unsigned int *auth_flav_len;
+ rpc_authflavor_t *auth_flavs;
+ struct net *net;
+};
+
+extern int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans);
+extern void nfs_umount(const struct nfs_mount_request *info);
+
+/* client.c */
+extern const struct rpc_program nfs_program;
+extern void nfs_clients_init(struct net *net);
+extern void nfs_clients_exit(struct net *net);
+extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *);
+int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t);
+struct nfs_client *nfs_get_client(const struct nfs_client_initdata *);
+int nfs_probe_server(struct nfs_server *, struct nfs_fh *);
+void nfs_server_insert_lists(struct nfs_server *);
+void nfs_server_remove_lists(struct nfs_server *);
+void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans);
+int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t,
+ rpc_authflavor_t);
+struct nfs_server *nfs_alloc_server(void);
+void nfs_server_copy_userdata(struct nfs_server *, struct nfs_server *);
+
+extern void nfs_put_client(struct nfs_client *);
+extern void nfs_free_client(struct nfs_client *);
+extern struct nfs_client *nfs4_find_client_ident(struct net *, int);
+extern struct nfs_client *
+nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
+ struct nfs4_sessionid *, u32);
+extern struct nfs_server *nfs_create_server(struct fs_context *);
+extern void nfs4_server_set_init_caps(struct nfs_server *);
+extern struct nfs_server *nfs4_create_server(struct fs_context *);
+extern struct nfs_server *nfs4_create_referral_server(struct fs_context *);
+extern int nfs4_update_server(struct nfs_server *server, const char *hostname,
+ struct sockaddr_storage *sap, size_t salen,
+ struct net *net);
+extern void nfs_free_server(struct nfs_server *server);
+extern struct nfs_server *nfs_clone_server(struct nfs_server *,
+ struct nfs_fh *,
+ struct nfs_fattr *,
+ rpc_authflavor_t);
+extern bool nfs_client_init_is_complete(const struct nfs_client *clp);
+extern int nfs_client_init_status(const struct nfs_client *clp);
+extern int nfs_wait_client_init_complete(const struct nfs_client *clp);
+extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
+extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
+ const struct sockaddr_storage *ds_addr,
+ int ds_addrlen, int ds_proto,
+ unsigned int ds_timeo,
+ unsigned int ds_retrans,
+ u32 minor_version);
+extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
+ struct inode *);
+extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
+ const struct sockaddr_storage *ds_addr, int ds_addrlen,
+ int ds_proto, unsigned int ds_timeo,
+ unsigned int ds_retrans);
+#ifdef CONFIG_PROC_FS
+extern int __init nfs_fs_proc_init(void);
+extern void nfs_fs_proc_exit(void);
+extern int nfs_fs_proc_net_init(struct net *net);
+extern void nfs_fs_proc_net_exit(struct net *net);
+#else
+static inline int nfs_fs_proc_net_init(struct net *net)
+{
+ return 0;
+}
+static inline void nfs_fs_proc_net_exit(struct net *net)
+{
+}
+static inline int nfs_fs_proc_init(void)
+{
+ return 0;
+}
+static inline void nfs_fs_proc_exit(void)
+{
+}
+#endif
+
+/* callback_xdr.c */
+extern const struct svc_version nfs4_callback_version1;
+extern const struct svc_version nfs4_callback_version4;
+
+/* fs_context.c */
+extern struct file_system_type nfs_fs_type;
+
+/* pagelist.c */
+extern int __init nfs_init_nfspagecache(void);
+extern void nfs_destroy_nfspagecache(void);
+extern int __init nfs_init_readpagecache(void);
+extern void nfs_destroy_readpagecache(void);
+extern int __init nfs_init_writepagecache(void);
+extern void nfs_destroy_writepagecache(void);
+
+extern int __init nfs_init_directcache(void);
+extern void nfs_destroy_directcache(void);
+extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr,
+ void (*release)(struct nfs_pgio_header *hdr));
+void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
+int nfs_iocounter_wait(struct nfs_lock_context *l_ctx);
+
+extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
+struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
+void nfs_pgio_header_free(struct nfs_pgio_header *);
+int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
+ const struct cred *cred, const struct nfs_rpc_ops *rpc_ops,
+ const struct rpc_call_ops *call_ops, int how, int flags);
+void nfs_free_request(struct nfs_page *req);
+struct nfs_pgio_mirror *
+nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
+
+static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1,
+ const struct nfs_open_context *ctx2)
+{
+ return cred_fscmp(ctx1->cred, ctx2->cred) == 0 && ctx1->state == ctx2->state;
+}
+
+/* nfs2xdr.c */
+extern const struct rpc_procinfo nfs_procedures[];
+extern int nfs2_decode_dirent(struct xdr_stream *,
+ struct nfs_entry *, bool);
+
+/* nfs3xdr.c */
+extern const struct rpc_procinfo nfs3_procedures[];
+extern int nfs3_decode_dirent(struct xdr_stream *,
+ struct nfs_entry *, bool);
+
+/* nfs4xdr.c */
+#if IS_ENABLED(CONFIG_NFS_V4)
+extern int nfs4_decode_dirent(struct xdr_stream *,
+ struct nfs_entry *, bool);
+#endif
+#ifdef CONFIG_NFS_V4_1
+extern const u32 nfs41_maxread_overhead;
+extern const u32 nfs41_maxwrite_overhead;
+extern const u32 nfs41_maxgetdevinfo_overhead;
+#endif
+
+/* nfs4proc.c */
+#if IS_ENABLED(CONFIG_NFS_V4)
+extern const struct rpc_procinfo nfs4_procedures[];
+#endif
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags);
+static inline struct nfs4_label *
+nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
+{
+ if (!dst || !src)
+ return NULL;
+
+ if (src->len > NFS4_MAXLABELLEN)
+ return NULL;
+
+ dst->lfs = src->lfs;
+ dst->pi = src->pi;
+ dst->len = src->len;
+ memcpy(dst->label, src->label, src->len);
+
+ return dst;
+}
+
+static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
+{
+ if (nfs_server_capable(&nfsi->vfs_inode, NFS_CAP_SECURITY_LABEL))
+ nfsi->cache_validity |= NFS_INO_INVALID_LABEL;
+}
+#else
+static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; }
+static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
+{
+}
+static inline struct nfs4_label *
+nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
+{
+ return NULL;
+}
+#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
+
+/* proc.c */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
+extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
+ const struct nfs_client_initdata *);
+
+/* dir.c */
+extern void nfs_readdir_record_entry_cache_hit(struct inode *dir);
+extern void nfs_readdir_record_entry_cache_miss(struct inode *dir);
+extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc);
+extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc);
+struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int);
+void nfs_d_prune_case_insensitive_aliases(struct inode *inode);
+int nfs_create(struct mnt_idmap *, struct inode *, struct dentry *,
+ umode_t, bool);
+int nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *,
+ umode_t);
+int nfs_rmdir(struct inode *, struct dentry *);
+int nfs_unlink(struct inode *, struct dentry *);
+int nfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *,
+ const char *);
+int nfs_link(struct dentry *, struct inode *, struct dentry *);
+int nfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t,
+ dev_t);
+int nfs_rename(struct mnt_idmap *, struct inode *, struct dentry *,
+ struct inode *, struct dentry *, unsigned int);
+
+#ifdef CONFIG_NFS_V4_2
+static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server)
+{
+ if (!(server->caps & NFS_CAP_XATTR))
+ return 0;
+ return NFS4_ACCESS_XAREAD | NFS4_ACCESS_XAWRITE | NFS4_ACCESS_XALIST;
+}
+#else
+static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server)
+{
+ return 0;
+}
+#endif
+
+/* file.c */
+int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
+loff_t nfs_file_llseek(struct file *, loff_t, int);
+ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
+ssize_t nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags);
+int nfs_file_mmap(struct file *, struct vm_area_struct *);
+ssize_t nfs_file_write(struct kiocb *, struct iov_iter *);
+int nfs_file_release(struct inode *, struct file *);
+int nfs_lock(struct file *, int, struct file_lock *);
+int nfs_flock(struct file *, int, struct file_lock *);
+int nfs_check_flags(int);
+
+/* inode.c */
+extern struct workqueue_struct *nfsiod_workqueue;
+extern struct inode *nfs_alloc_inode(struct super_block *sb);
+extern void nfs_free_inode(struct inode *);
+extern int nfs_write_inode(struct inode *, struct writeback_control *);
+extern int nfs_drop_inode(struct inode *);
+extern void nfs_clear_inode(struct inode *);
+extern void nfs_evict_inode(struct inode *);
+extern void nfs_zap_acl_cache(struct inode *inode);
+extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags);
+extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
+extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
+
+/* super.c */
+extern const struct super_operations nfs_sops;
+bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t);
+int nfs_try_get_tree(struct fs_context *);
+int nfs_get_tree_common(struct fs_context *);
+void nfs_kill_super(struct super_block *);
+
+extern struct rpc_stat nfs_rpcstat;
+
+extern int __init register_nfs_fs(void);
+extern void __exit unregister_nfs_fs(void);
+extern bool nfs_sb_active(struct super_block *sb);
+extern void nfs_sb_deactive(struct super_block *sb);
+extern int nfs_client_for_each_server(struct nfs_client *clp,
+ int (*fn)(struct nfs_server *, void *),
+ void *data);
+#ifdef CONFIG_NFS_FSCACHE
+extern const struct netfs_request_ops nfs_netfs_ops;
+#endif
+
+/* io.c */
+extern void nfs_start_io_read(struct inode *inode);
+extern void nfs_end_io_read(struct inode *inode);
+extern void nfs_start_io_write(struct inode *inode);
+extern void nfs_end_io_write(struct inode *inode);
+extern void nfs_start_io_direct(struct inode *inode);
+extern void nfs_end_io_direct(struct inode *inode);
+
+static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi)
+{
+ return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0;
+}
+
+/* namespace.c */
+#define NFS_PATH_CANONICAL 1
+extern char *nfs_path(char **p, struct dentry *dentry,
+ char *buffer, ssize_t buflen, unsigned flags);
+extern struct vfsmount *nfs_d_automount(struct path *path);
+int nfs_submount(struct fs_context *, struct nfs_server *);
+int nfs_do_submount(struct fs_context *);
+
+/* getroot.c */
+extern int nfs_get_root(struct super_block *s, struct fs_context *fc);
+#if IS_ENABLED(CONFIG_NFS_V4)
+extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool);
+#endif
+
+struct nfs_pgio_completion_ops;
+/* read.c */
+extern const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
+extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode, bool force_mds,
+ const struct nfs_pgio_completion_ops *compl_ops);
+extern bool nfs_read_alloc_scratch(struct nfs_pgio_header *hdr, size_t size);
+extern int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio,
+ struct nfs_open_context *ctx,
+ struct folio *folio);
+extern void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio);
+extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
+extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
+
+/* super.c */
+void nfs_umount_begin(struct super_block *);
+int nfs_statfs(struct dentry *, struct kstatfs *);
+int nfs_show_options(struct seq_file *, struct dentry *);
+int nfs_show_devname(struct seq_file *, struct dentry *);
+int nfs_show_path(struct seq_file *, struct dentry *);
+int nfs_show_stats(struct seq_file *, struct dentry *);
+int nfs_reconfigure(struct fs_context *);
+
+/* write.c */
+extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode, int ioflags, bool force_mds,
+ const struct nfs_pgio_completion_ops *compl_ops);
+extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
+extern void nfs_commit_free(struct nfs_commit_data *p);
+extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
+extern int nfs_initiate_commit(struct rpc_clnt *clnt,
+ struct nfs_commit_data *data,
+ const struct nfs_rpc_ops *nfs_ops,
+ const struct rpc_call_ops *call_ops,
+ int how, int flags);
+extern void nfs_init_commit(struct nfs_commit_data *data,
+ struct list_head *head,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo);
+int nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
+ struct nfs_commit_info *cinfo, int max);
+unsigned long nfs_reqs_to_commit(struct nfs_commit_info *);
+int nfs_scan_commit(struct inode *inode, struct list_head *dst,
+ struct nfs_commit_info *cinfo);
+void nfs_mark_request_commit(struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
+int nfs_write_need_commit(struct nfs_pgio_header *);
+void nfs_writeback_update_inode(struct nfs_pgio_header *hdr);
+int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
+ int how, struct nfs_commit_info *cinfo);
+void nfs_retry_commit(struct list_head *page_list,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
+void nfs_commitdata_release(struct nfs_commit_data *data);
+void nfs_request_add_commit_list(struct nfs_page *req,
+ struct nfs_commit_info *cinfo);
+void nfs_request_add_commit_list_locked(struct nfs_page *req,
+ struct list_head *dst,
+ struct nfs_commit_info *cinfo);
+void nfs_request_remove_commit_list(struct nfs_page *req,
+ struct nfs_commit_info *cinfo);
+void nfs_init_cinfo(struct nfs_commit_info *cinfo,
+ struct inode *inode,
+ struct nfs_direct_req *dreq);
+int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode);
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
+
+int nfs_filemap_write_and_wait_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend);
+
+#ifdef CONFIG_NFS_V4_1
+static inline void
+pnfs_bucket_clear_pnfs_ds_commit_verifiers(struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets)
+{
+ unsigned int i;
+
+ for (i = 0; i < nbuckets; i++)
+ buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
+}
+static inline
+void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
+{
+ struct pnfs_commit_array *array;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &cinfo->commits, cinfo_list)
+ pnfs_bucket_clear_pnfs_ds_commit_verifiers(array->buckets,
+ array->nbuckets);
+ rcu_read_unlock();
+}
+#else
+static inline
+void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
+{
+}
+#endif
+
+#ifdef CONFIG_MIGRATION
+int nfs_migrate_folio(struct address_space *, struct folio *dst,
+ struct folio *src, enum migrate_mode);
+#else
+#define nfs_migrate_folio NULL
+#endif
+
+static inline int
+nfs_write_verifier_cmp(const struct nfs_write_verifier *v1,
+ const struct nfs_write_verifier *v2)
+{
+ return memcmp(v1->data, v2->data, sizeof(v1->data));
+}
+
+static inline bool
+nfs_write_match_verf(const struct nfs_writeverf *verf,
+ struct nfs_page *req)
+{
+ return verf->committed > NFS_UNSTABLE &&
+ !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier);
+}
+
+static inline gfp_t nfs_io_gfp_mask(void)
+{
+ if (current->flags & PF_WQ_WORKER)
+ return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+ return GFP_KERNEL;
+}
+
+/*
+ * Special version of should_remove_suid() that ignores capabilities.
+ */
+static inline int nfs_should_remove_suid(const struct inode *inode)
+{
+ umode_t mode = inode->i_mode;
+ int kill = 0;
+
+ /* suid always must be killed */
+ if (unlikely(mode & S_ISUID))
+ kill = ATTR_KILL_SUID;
+
+ /*
+ * sgid without any exec bits is just a mandatory locking mark; leave
+ * it alone. If some exec bits are set, it's a real sgid; kill it.
+ */
+ if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+ kill |= ATTR_KILL_SGID;
+
+ if (unlikely(kill && S_ISREG(mode)))
+ return kill;
+
+ return 0;
+}
+
+/* unlink.c */
+extern struct rpc_task *
+nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
+ struct dentry *old_dentry, struct dentry *new_dentry,
+ void (*complete)(struct rpc_task *, struct nfs_renamedata *));
+extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);
+
+/* direct.c */
+void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
+ struct nfs_direct_req *dreq);
+extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset);
+
+/* nfs4proc.c */
+extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
+ const struct nfs_client_initdata *);
+extern int nfs40_walk_client_list(struct nfs_client *clp,
+ struct nfs_client **result,
+ const struct cred *cred);
+extern int nfs41_walk_client_list(struct nfs_client *clp,
+ struct nfs_client **result,
+ const struct cred *cred);
+extern void nfs4_test_session_trunk(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ void *data);
+
+static inline struct inode *nfs_igrab_and_active(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (sb && nfs_sb_active(sb)) {
+ if (igrab(inode))
+ return inode;
+ nfs_sb_deactive(sb);
+ }
+ return NULL;
+}
+
+static inline void nfs_iput_and_deactive(struct inode *inode)
+{
+ if (inode != NULL) {
+ struct super_block *sb = inode->i_sb;
+
+ iput(inode);
+ nfs_sb_deactive(sb);
+ }
+}
+
+/*
+ * Determine the device name as a string
+ */
+static inline char *nfs_devname(struct dentry *dentry,
+ char *buffer, ssize_t buflen)
+{
+ char *dummy;
+ return nfs_path(&dummy, dentry, buffer, buflen, NFS_PATH_CANONICAL);
+}
+
+/*
+ * Determine the actual block size (and log2 thereof)
+ */
+static inline
+unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
+{
+ /* make sure blocksize is a power of two */
+ if ((bsize & (bsize - 1)) || nrbitsp) {
+ unsigned char nrbits;
+
+ for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--)
+ ;
+ bsize = 1 << nrbits;
+ if (nrbitsp)
+ *nrbitsp = nrbits;
+ }
+
+ return bsize;
+}
+
+/*
+ * Calculate the number of 512byte blocks used.
+ */
+static inline blkcnt_t nfs_calc_block_size(u64 tsize)
+{
+ blkcnt_t used = (tsize + 511) >> 9;
+ return (used > ULONG_MAX) ? ULONG_MAX : used;
+}
+
+/*
+ * Compute and set NFS server blocksize
+ */
+static inline
+unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
+{
+ if (bsize < NFS_MIN_FILE_IO_SIZE)
+ bsize = NFS_DEF_FILE_IO_SIZE;
+ else if (bsize >= NFS_MAX_FILE_IO_SIZE)
+ bsize = NFS_MAX_FILE_IO_SIZE;
+
+ return nfs_block_bits(bsize, nrbitsp);
+}
+
+/*
+ * Compute and set NFS server rsize / wsize
+ */
+static inline
+unsigned long nfs_io_size(unsigned long iosize, enum xprt_transports proto)
+{
+ if (iosize < NFS_MIN_FILE_IO_SIZE)
+ iosize = NFS_DEF_FILE_IO_SIZE;
+ else if (iosize >= NFS_MAX_FILE_IO_SIZE)
+ iosize = NFS_MAX_FILE_IO_SIZE;
+
+ if (proto == XPRT_TRANSPORT_UDP || iosize < PAGE_SIZE)
+ return nfs_block_bits(iosize, NULL);
+ return iosize & PAGE_MASK;
+}
+
+/*
+ * Determine the maximum file size for a superblock
+ */
+static inline
+void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
+{
+ sb->s_maxbytes = (loff_t)maxfilesize;
+ if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0)
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+}
+
+/*
+ * Record the page as unstable (an extra writeback period) and mark its
+ * inode as dirty.
+ */
+static inline void nfs_folio_mark_unstable(struct folio *folio,
+ struct nfs_commit_info *cinfo)
+{
+ if (folio && !cinfo->dreq) {
+ struct inode *inode = folio_file_mapping(folio)->host;
+ long nr = folio_nr_pages(folio);
+
+ /* This page is really still in write-back - just that the
+ * writeback is happening on the server now.
+ */
+ node_stat_mod_folio(folio, NR_WRITEBACK, nr);
+ wb_stat_mod(&inode_to_bdi(inode)->wb, WB_WRITEBACK, nr);
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ }
+}
+
+/*
+ * Determine the number of bytes of data the page contains
+ */
+static inline
+unsigned int nfs_page_length(struct page *page)
+{
+ loff_t i_size = i_size_read(page_file_mapping(page)->host);
+
+ if (i_size > 0) {
+ pgoff_t index = page_index(page);
+ pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT;
+ if (index < end_index)
+ return PAGE_SIZE;
+ if (index == end_index)
+ return ((i_size - 1) & ~PAGE_MASK) + 1;
+ }
+ return 0;
+}
+
+/*
+ * Determine the number of bytes of data the page contains
+ */
+static inline size_t nfs_folio_length(struct folio *folio)
+{
+ loff_t i_size = i_size_read(folio_file_mapping(folio)->host);
+
+ if (i_size > 0) {
+ pgoff_t index = folio_index(folio) >> folio_order(folio);
+ pgoff_t end_index = (i_size - 1) >> folio_shift(folio);
+ if (index < end_index)
+ return folio_size(folio);
+ if (index == end_index)
+ return offset_in_folio(folio, i_size - 1) + 1;
+ }
+ return 0;
+}
+
+/*
+ * Convert a umode to a dirent->d_type
+ */
+static inline
+unsigned char nfs_umode_to_dtype(umode_t mode)
+{
+ return (mode >> 12) & 15;
+}
+
+/*
+ * Determine the number of pages in an array of length 'len' and
+ * with a base offset of 'base'
+ */
+static inline unsigned int nfs_page_array_len(unsigned int base, size_t len)
+{
+ return ((unsigned long)len + (unsigned long)base + PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
+}
+
+/*
+ * Convert a struct timespec64 into a 64-bit change attribute
+ *
+ * This does approximately the same thing as timespec64_to_ns(),
+ * but for calculation efficiency, we multiply the seconds by
+ * 1024*1024*1024.
+ */
+static inline
+u64 nfs_timespec_to_change_attr(const struct timespec64 *ts)
+{
+ return ((u64)ts->tv_sec << 30) + ts->tv_nsec;
+}
+
+#ifdef CONFIG_CRC32
+static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
+{
+ return ~crc32_le(0xFFFFFFFF, &stateid->other[0],
+ NFS4_STATEID_OTHER_SIZE);
+}
+#else
+static inline u32 nfs_stateid_hash(nfs4_stateid *stateid)
+{
+ return 0;
+}
+#endif
+
+static inline bool nfs_error_is_fatal(int err)
+{
+ switch (err) {
+ case -ERESTARTSYS:
+ case -EINTR:
+ case -EACCES:
+ case -EDQUOT:
+ case -EFBIG:
+ case -EIO:
+ case -ENOSPC:
+ case -EROFS:
+ case -ESTALE:
+ case -E2BIG:
+ case -ENOMEM:
+ case -ETIMEDOUT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool nfs_error_is_fatal_on_server(int err)
+{
+ switch (err) {
+ case 0:
+ case -ERESTARTSYS:
+ case -EINTR:
+ case -ENOMEM:
+ return false;
+ }
+ return nfs_error_is_fatal(err);
+}
+
+/*
+ * Select between a default port value and a user-specified port value.
+ * If a zero value is set, then autobind will be used.
+ */
+static inline void nfs_set_port(struct sockaddr_storage *sap, int *port,
+ const unsigned short default_port)
+{
+ if (*port == NFS_UNSPEC_PORT)
+ *port = default_port;
+
+ rpc_set_port((struct sockaddr *)sap, *port);
+}
+
+struct nfs_direct_req {
+ struct kref kref; /* release manager */
+
+ /* I/O parameters */
+ struct nfs_open_context *ctx; /* file open context info */
+ struct nfs_lock_context *l_ctx; /* Lock context info */
+ struct kiocb * iocb; /* controlling i/o request */
+ struct inode * inode; /* target file of i/o */
+
+ /* completion state */
+ atomic_t io_count; /* i/os we're waiting for */
+ spinlock_t lock; /* protect completion state */
+
+ loff_t io_start; /* Start offset for I/O */
+ ssize_t count, /* bytes actually processed */
+ max_count, /* max expected count */
+ bytes_left, /* bytes left to be sent */
+ error; /* any reported error */
+ struct completion completion; /* wait for i/o completion */
+
+ /* commit state */
+ struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */
+ struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */
+ struct work_struct work;
+ int flags;
+ /* for write */
+#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */
+#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */
+ /* for read */
+#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */
+#define NFS_ODIRECT_DONE INT_MAX /* write verification failed */
+};
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
new file mode 100644
index 0000000000..b5551ed8f6
--- /dev/null
+++ b/fs/nfs/io.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2016 Trond Myklebust
+ *
+ * I/O and data path helper functionality.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/rwsem.h>
+#include <linux/fs.h>
+#include <linux/nfs_fs.h>
+
+#include "internal.h"
+
+/* Call with exclusively locked inode->i_rwsem */
+static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
+{
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
+ clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
+ inode_dio_wait(inode);
+ }
+}
+
+/**
+ * nfs_start_io_read - declare the file is being used for buffered reads
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that buffered read operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas direct I/O
+ * operations need to wait to grab an exclusive lock in order to set
+ * NFS_INO_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
+ */
+void
+nfs_start_io_read(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ /* Be an optimist! */
+ down_read(&inode->i_rwsem);
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0)
+ return;
+ up_read(&inode->i_rwsem);
+ /* Slow path.... */
+ down_write(&inode->i_rwsem);
+ nfs_block_o_direct(nfsi, inode);
+ downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * nfs_end_io_read - declare that the buffered read operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_read(struct inode *inode)
+{
+ up_read(&inode->i_rwsem);
+}
+
+/**
+ * nfs_start_io_write - declare the file is being used for buffered writes
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ */
+void
+nfs_start_io_write(struct inode *inode)
+{
+ down_write(&inode->i_rwsem);
+ nfs_block_o_direct(NFS_I(inode), inode);
+}
+
+/**
+ * nfs_end_io_write - declare that the buffered write operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered write operation is done, and release the
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_write(struct inode *inode)
+{
+ up_write(&inode->i_rwsem);
+}
+
+/* Call with exclusively locked inode->i_rwsem */
+static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode)
+{
+ if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
+ set_bit(NFS_INO_ODIRECT, &nfsi->flags);
+ nfs_sync_mapping(inode->i_mapping);
+ }
+}
+
+/**
+ * nfs_start_io_direct - declare the file is being used for direct i/o
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is about to start, and ensure
+ * that we block all buffered I/O.
+ * On exit, the function ensures that the NFS_INO_ODIRECT flag is set,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that direct I/O operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas buffered I/O
+ * operations need to wait to grab an exclusive lock in order to clear
+ * NFS_INO_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
+ */
+void
+nfs_start_io_direct(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ /* Be an optimist! */
+ down_read(&inode->i_rwsem);
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0)
+ return;
+ up_read(&inode->i_rwsem);
+ /* Slow path.... */
+ down_write(&inode->i_rwsem);
+ nfs_block_buffered(nfsi, inode);
+ downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * nfs_end_io_direct - declare that the direct i/o operation is done
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_direct(struct inode *inode)
+{
+ up_read(&inode->i_rwsem);
+}
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
new file mode 100644
index 0000000000..5aa776b5a3
--- /dev/null
+++ b/fs/nfs/iostat.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * linux/fs/nfs/iostat.h
+ *
+ * Declarations for NFS client per-mount statistics
+ *
+ * Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com>
+ *
+ */
+
+#ifndef _NFS_IOSTAT
+#define _NFS_IOSTAT
+
+#include <linux/percpu.h>
+#include <linux/cache.h>
+#include <linux/nfs_iostat.h>
+
+struct nfs_iostats {
+ unsigned long long bytes[__NFSIOS_BYTESMAX];
+ unsigned long events[__NFSIOS_COUNTSMAX];
+} ____cacheline_aligned;
+
+static inline void nfs_inc_server_stats(const struct nfs_server *server,
+ enum nfs_stat_eventcounters stat)
+{
+ this_cpu_inc(server->io_stats->events[stat]);
+}
+
+static inline void nfs_inc_stats(const struct inode *inode,
+ enum nfs_stat_eventcounters stat)
+{
+ nfs_inc_server_stats(NFS_SERVER(inode), stat);
+}
+
+static inline void nfs_add_server_stats(const struct nfs_server *server,
+ enum nfs_stat_bytecounters stat,
+ long addend)
+{
+ this_cpu_add(server->io_stats->bytes[stat], addend);
+}
+
+static inline void nfs_add_stats(const struct inode *inode,
+ enum nfs_stat_bytecounters stat,
+ long addend)
+{
+ nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
+}
+
+static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
+{
+ return alloc_percpu(struct nfs_iostats);
+}
+
+static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats)
+{
+ if (stats != NULL)
+ free_percpu(stats);
+}
+
+#endif /* _NFS_IOSTAT */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
new file mode 100644
index 0000000000..68e76b6263
--- /dev/null
+++ b/fs/nfs/mount_clnt.c
@@ -0,0 +1,544 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * In-kernel MOUNT protocol client
+ *
+ * Copyright (C) 1997, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/uio.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/nfs_fs.h>
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_MOUNT
+
+/*
+ * Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4
+ */
+#define MNTPATHLEN (1024)
+
+/*
+ * XDR data type sizes
+ */
+#define encode_dirpath_sz (1 + XDR_QUADLEN(MNTPATHLEN))
+#define MNT_status_sz (1)
+#define MNT_fhandle_sz XDR_QUADLEN(NFS2_FHSIZE)
+#define MNT_fhandlev3_sz XDR_QUADLEN(NFS3_FHSIZE)
+#define MNT_authflav3_sz (1 + NFS_MAX_SECFLAVORS)
+
+/*
+ * XDR argument and result sizes
+ */
+#define MNT_enc_dirpath_sz encode_dirpath_sz
+#define MNT_dec_mountres_sz (MNT_status_sz + MNT_fhandle_sz)
+#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandlev3_sz + \
+ MNT_authflav3_sz)
+
+/*
+ * Defined by RFC 1094, section A.5
+ */
+enum {
+ MOUNTPROC_NULL = 0,
+ MOUNTPROC_MNT = 1,
+ MOUNTPROC_DUMP = 2,
+ MOUNTPROC_UMNT = 3,
+ MOUNTPROC_UMNTALL = 4,
+ MOUNTPROC_EXPORT = 5,
+};
+
+/*
+ * Defined by RFC 1813, section 5.2
+ */
+enum {
+ MOUNTPROC3_NULL = 0,
+ MOUNTPROC3_MNT = 1,
+ MOUNTPROC3_DUMP = 2,
+ MOUNTPROC3_UMNT = 3,
+ MOUNTPROC3_UMNTALL = 4,
+ MOUNTPROC3_EXPORT = 5,
+};
+
+static const struct rpc_program mnt_program;
+
+/*
+ * Defined by OpenGroup XNFS Version 3W, chapter 8
+ */
+enum mountstat {
+ MNT_OK = 0,
+ MNT_EPERM = 1,
+ MNT_ENOENT = 2,
+ MNT_EACCES = 13,
+ MNT_EINVAL = 22,
+};
+
+static struct {
+ u32 status;
+ int errno;
+} mnt_errtbl[] = {
+ { .status = MNT_OK, .errno = 0, },
+ { .status = MNT_EPERM, .errno = -EPERM, },
+ { .status = MNT_ENOENT, .errno = -ENOENT, },
+ { .status = MNT_EACCES, .errno = -EACCES, },
+ { .status = MNT_EINVAL, .errno = -EINVAL, },
+};
+
+/*
+ * Defined by RFC 1813, section 5.1.5
+ */
+enum mountstat3 {
+ MNT3_OK = 0, /* no error */
+ MNT3ERR_PERM = 1, /* Not owner */
+ MNT3ERR_NOENT = 2, /* No such file or directory */
+ MNT3ERR_IO = 5, /* I/O error */
+ MNT3ERR_ACCES = 13, /* Permission denied */
+ MNT3ERR_NOTDIR = 20, /* Not a directory */
+ MNT3ERR_INVAL = 22, /* Invalid argument */
+ MNT3ERR_NAMETOOLONG = 63, /* Filename too long */
+ MNT3ERR_NOTSUPP = 10004, /* Operation not supported */
+ MNT3ERR_SERVERFAULT = 10006, /* A failure on the server */
+};
+
+static struct {
+ u32 status;
+ int errno;
+} mnt3_errtbl[] = {
+ { .status = MNT3_OK, .errno = 0, },
+ { .status = MNT3ERR_PERM, .errno = -EPERM, },
+ { .status = MNT3ERR_NOENT, .errno = -ENOENT, },
+ { .status = MNT3ERR_IO, .errno = -EIO, },
+ { .status = MNT3ERR_ACCES, .errno = -EACCES, },
+ { .status = MNT3ERR_NOTDIR, .errno = -ENOTDIR, },
+ { .status = MNT3ERR_INVAL, .errno = -EINVAL, },
+ { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, },
+ { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, },
+ { .status = MNT3ERR_SERVERFAULT, .errno = -EREMOTEIO, },
+};
+
+struct mountres {
+ int errno;
+ struct nfs_fh *fh;
+ unsigned int *auth_count;
+ rpc_authflavor_t *auth_flavors;
+};
+
+struct mnt_fhstatus {
+ u32 status;
+ struct nfs_fh *fh;
+};
+
+/**
+ * nfs_mount - Obtain an NFS file handle for the given host and path
+ * @info: pointer to mount request arguments
+ * @timeo: deciseconds the mount waits for a response before it retries
+ * @retrans: number of times the mount retries a request
+ *
+ * Uses timeout parameters specified by caller. On successful return, the
+ * auth_flavs list and auth_flav_len will be populated with the list from the
+ * server or a faked-up list if the server didn't provide one.
+ */
+int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans)
+{
+ struct rpc_timeout mnt_timeout;
+ struct mountres result = {
+ .fh = info->fh,
+ .auth_count = info->auth_flav_len,
+ .auth_flavors = info->auth_flavs,
+ };
+ struct rpc_message msg = {
+ .rpc_argp = info->dirpath,
+ .rpc_resp = &result,
+ };
+ struct rpc_create_args args = {
+ .net = info->net,
+ .protocol = info->protocol,
+ .address = (struct sockaddr *)info->sap,
+ .addrsize = info->salen,
+ .timeout = &mnt_timeout,
+ .servername = info->hostname,
+ .program = &mnt_program,
+ .version = info->version,
+ .authflavor = RPC_AUTH_UNIX,
+ .cred = current_cred(),
+ };
+ struct rpc_clnt *mnt_clnt;
+ int status;
+
+ dprintk("NFS: sending MNT request for %s:%s\n",
+ (info->hostname ? info->hostname : "server"),
+ info->dirpath);
+
+ if (strlen(info->dirpath) > MNTPATHLEN)
+ return -ENAMETOOLONG;
+
+ if (info->noresvport)
+ args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+
+ nfs_init_timeout_values(&mnt_timeout, info->protocol, timeo, retrans);
+ mnt_clnt = rpc_create(&args);
+ if (IS_ERR(mnt_clnt))
+ goto out_clnt_err;
+
+ if (info->version == NFS_MNT3_VERSION)
+ msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT];
+ else
+ msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT];
+
+ status = rpc_call_sync(mnt_clnt, &msg, RPC_TASK_SOFT|RPC_TASK_TIMEOUT);
+ rpc_shutdown_client(mnt_clnt);
+
+ if (status < 0)
+ goto out_call_err;
+ if (result.errno != 0)
+ goto out_mnt_err;
+
+ dprintk("NFS: MNT request succeeded\n");
+ status = 0;
+
+ /*
+ * If the server didn't provide a flavor list, allow the
+ * client to try any flavor.
+ */
+ if (info->version != NFS_MNT3_VERSION || *info->auth_flav_len == 0) {
+ dprintk("NFS: Faking up auth_flavs list\n");
+ info->auth_flavs[0] = RPC_AUTH_NULL;
+ *info->auth_flav_len = 1;
+ }
+out:
+ return status;
+
+out_clnt_err:
+ status = PTR_ERR(mnt_clnt);
+ dprintk("NFS: failed to create MNT RPC client, status=%d\n", status);
+ goto out;
+
+out_call_err:
+ dprintk("NFS: MNT request failed, status=%d\n", status);
+ goto out;
+
+out_mnt_err:
+ dprintk("NFS: MNT server returned result %d\n", result.errno);
+ status = result.errno;
+ goto out;
+}
+
+/**
+ * nfs_umount - Notify a server that we have unmounted this export
+ * @info: pointer to umount request arguments
+ *
+ * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always
+ * use UDP.
+ */
+void nfs_umount(const struct nfs_mount_request *info)
+{
+ static const struct rpc_timeout nfs_umnt_timeout = {
+ .to_initval = 1 * HZ,
+ .to_maxval = 3 * HZ,
+ .to_retries = 2,
+ };
+ struct rpc_create_args args = {
+ .net = info->net,
+ .protocol = IPPROTO_UDP,
+ .address = (struct sockaddr *)info->sap,
+ .addrsize = info->salen,
+ .timeout = &nfs_umnt_timeout,
+ .servername = info->hostname,
+ .program = &mnt_program,
+ .version = info->version,
+ .authflavor = RPC_AUTH_UNIX,
+ .flags = RPC_CLNT_CREATE_NOPING,
+ .cred = current_cred(),
+ };
+ struct rpc_message msg = {
+ .rpc_argp = info->dirpath,
+ };
+ struct rpc_clnt *clnt;
+ int status;
+
+ if (strlen(info->dirpath) > MNTPATHLEN)
+ return;
+
+ if (info->noresvport)
+ args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+
+ clnt = rpc_create(&args);
+ if (IS_ERR(clnt))
+ goto out_clnt_err;
+
+ dprintk("NFS: sending UMNT request for %s:%s\n",
+ (info->hostname ? info->hostname : "server"), info->dirpath);
+
+ if (info->version == NFS_MNT3_VERSION)
+ msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT];
+ else
+ msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT];
+
+ status = rpc_call_sync(clnt, &msg, 0);
+ rpc_shutdown_client(clnt);
+
+ if (unlikely(status < 0))
+ goto out_call_err;
+
+ return;
+
+out_clnt_err:
+ dprintk("NFS: failed to create UMNT RPC client, status=%ld\n",
+ PTR_ERR(clnt));
+ return;
+
+out_call_err:
+ dprintk("NFS: UMNT request failed, status=%d\n", status);
+}
+
+/*
+ * XDR encode/decode functions for MOUNT
+ */
+
+static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
+{
+ const u32 pathname_len = strlen(pathname);
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4 + pathname_len);
+ xdr_encode_opaque(p, pathname, pathname_len);
+}
+
+static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *dirpath)
+{
+ encode_mntdirpath(xdr, dirpath);
+}
+
+/*
+ * RFC 1094: "A non-zero status indicates some sort of error. In this
+ * case, the status is a UNIX error number." This can be problematic
+ * if the server and client use different errno values for the same
+ * error.
+ *
+ * However, the OpenGroup XNFS spec provides a simple mapping that is
+ * independent of local errno values on the server and the client.
+ */
+static int decode_status(struct xdr_stream *xdr, struct mountres *res)
+{
+ unsigned int i;
+ u32 status;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -EIO;
+ status = be32_to_cpup(p);
+
+ for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
+ if (mnt_errtbl[i].status == status) {
+ res->errno = mnt_errtbl[i].errno;
+ return 0;
+ }
+ }
+
+ dprintk("NFS: unrecognized MNT status code: %u\n", status);
+ res->errno = -EACCES;
+ return 0;
+}
+
+static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res)
+{
+ struct nfs_fh *fh = res->fh;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS2_FHSIZE);
+ if (unlikely(p == NULL))
+ return -EIO;
+
+ fh->size = NFS2_FHSIZE;
+ memcpy(fh->data, p, NFS2_FHSIZE);
+ return 0;
+}
+
+static int mnt_xdr_dec_mountres(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct mountres *res = data;
+ int status;
+
+ status = decode_status(xdr, res);
+ if (unlikely(status != 0 || res->errno != 0))
+ return status;
+ return decode_fhandle(xdr, res);
+}
+
+static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
+{
+ unsigned int i;
+ u32 status;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -EIO;
+ status = be32_to_cpup(p);
+
+ for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
+ if (mnt3_errtbl[i].status == status) {
+ res->errno = mnt3_errtbl[i].errno;
+ return 0;
+ }
+ }
+
+ dprintk("NFS: unrecognized MNT3 status code: %u\n", status);
+ res->errno = -EACCES;
+ return 0;
+}
+
+static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res)
+{
+ struct nfs_fh *fh = res->fh;
+ u32 size;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -EIO;
+
+ size = be32_to_cpup(p);
+ if (size > NFS3_FHSIZE || size == 0)
+ return -EIO;
+
+ p = xdr_inline_decode(xdr, size);
+ if (unlikely(p == NULL))
+ return -EIO;
+
+ fh->size = size;
+ memcpy(fh->data, p, size);
+ return 0;
+}
+
+static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
+{
+ rpc_authflavor_t *flavors = res->auth_flavors;
+ unsigned int *count = res->auth_count;
+ u32 entries, i;
+ __be32 *p;
+
+ if (*count == 0)
+ return 0;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -EIO;
+ entries = be32_to_cpup(p);
+ dprintk("NFS: received %u auth flavors\n", entries);
+ if (entries > NFS_MAX_SECFLAVORS)
+ entries = NFS_MAX_SECFLAVORS;
+
+ p = xdr_inline_decode(xdr, 4 * entries);
+ if (unlikely(p == NULL))
+ return -EIO;
+
+ if (entries > *count)
+ entries = *count;
+
+ for (i = 0; i < entries; i++) {
+ flavors[i] = be32_to_cpup(p++);
+ dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]);
+ }
+ *count = i;
+
+ return 0;
+}
+
+static int mnt_xdr_dec_mountres3(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct mountres *res = data;
+ int status;
+
+ status = decode_fhs_status(xdr, res);
+ if (unlikely(status != 0 || res->errno != 0))
+ return status;
+ status = decode_fhandle3(xdr, res);
+ if (unlikely(status != 0)) {
+ res->errno = -EBADHANDLE;
+ return 0;
+ }
+ return decode_auth_flavors(xdr, res);
+}
+
+static const struct rpc_procinfo mnt_procedures[] = {
+ [MOUNTPROC_MNT] = {
+ .p_proc = MOUNTPROC_MNT,
+ .p_encode = mnt_xdr_enc_dirpath,
+ .p_decode = mnt_xdr_dec_mountres,
+ .p_arglen = MNT_enc_dirpath_sz,
+ .p_replen = MNT_dec_mountres_sz,
+ .p_statidx = MOUNTPROC_MNT,
+ .p_name = "MOUNT",
+ },
+ [MOUNTPROC_UMNT] = {
+ .p_proc = MOUNTPROC_UMNT,
+ .p_encode = mnt_xdr_enc_dirpath,
+ .p_arglen = MNT_enc_dirpath_sz,
+ .p_statidx = MOUNTPROC_UMNT,
+ .p_name = "UMOUNT",
+ },
+};
+
+static const struct rpc_procinfo mnt3_procedures[] = {
+ [MOUNTPROC3_MNT] = {
+ .p_proc = MOUNTPROC3_MNT,
+ .p_encode = mnt_xdr_enc_dirpath,
+ .p_decode = mnt_xdr_dec_mountres3,
+ .p_arglen = MNT_enc_dirpath_sz,
+ .p_replen = MNT_dec_mountres3_sz,
+ .p_statidx = MOUNTPROC3_MNT,
+ .p_name = "MOUNT",
+ },
+ [MOUNTPROC3_UMNT] = {
+ .p_proc = MOUNTPROC3_UMNT,
+ .p_encode = mnt_xdr_enc_dirpath,
+ .p_arglen = MNT_enc_dirpath_sz,
+ .p_statidx = MOUNTPROC3_UMNT,
+ .p_name = "UMOUNT",
+ },
+};
+
+static unsigned int mnt_counts[ARRAY_SIZE(mnt_procedures)];
+static const struct rpc_version mnt_version1 = {
+ .number = 1,
+ .nrprocs = ARRAY_SIZE(mnt_procedures),
+ .procs = mnt_procedures,
+ .counts = mnt_counts,
+};
+
+static unsigned int mnt3_counts[ARRAY_SIZE(mnt3_procedures)];
+static const struct rpc_version mnt_version3 = {
+ .number = 3,
+ .nrprocs = ARRAY_SIZE(mnt3_procedures),
+ .procs = mnt3_procedures,
+ .counts = mnt3_counts,
+};
+
+static const struct rpc_version *mnt_version[] = {
+ NULL,
+ &mnt_version1,
+ NULL,
+ &mnt_version3,
+};
+
+static struct rpc_stat mnt_stats;
+
+static const struct rpc_program mnt_program = {
+ .name = "mount",
+ .number = NFS_MNT_PROGRAM,
+ .nrvers = ARRAY_SIZE(mnt_version),
+ .version = mnt_version,
+ .stats = &mnt_stats,
+};
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
new file mode 100644
index 0000000000..e7494cdd95
--- /dev/null
+++ b/fs/nfs/namespace.c
@@ -0,0 +1,370 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/namespace.c
+ *
+ * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
+ * - Modified by David Howells <dhowells@redhat.com>
+ *
+ * NFS namespace
+ */
+
+#include <linux/module.h>
+#include <linux/dcache.h>
+#include <linux/gfp.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/nfs_fs.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/vfs.h>
+#include <linux/sunrpc/gss_api.h>
+#include "internal.h"
+#include "nfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+static void nfs_expire_automounts(struct work_struct *work);
+
+static LIST_HEAD(nfs_automount_list);
+static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts);
+int nfs_mountpoint_expiry_timeout = 500 * HZ;
+
+/*
+ * nfs_path - reconstruct the path given an arbitrary dentry
+ * @base - used to return pointer to the end of devname part of path
+ * @dentry_in - pointer to dentry
+ * @buffer - result buffer
+ * @buflen_in - length of buffer
+ * @flags - options (see below)
+ *
+ * Helper function for constructing the server pathname
+ * by arbitrary hashed dentry.
+ *
+ * This is mainly for use in figuring out the path on the
+ * server side when automounting on top of an existing partition
+ * and in generating /proc/mounts and friends.
+ *
+ * Supported flags:
+ * NFS_PATH_CANONICAL: ensure there is exactly one slash after
+ * the original device (export) name
+ * (if unset, the original name is returned verbatim)
+ */
+char *nfs_path(char **p, struct dentry *dentry_in, char *buffer,
+ ssize_t buflen_in, unsigned flags)
+{
+ char *end;
+ int namelen;
+ unsigned seq;
+ const char *base;
+ struct dentry *dentry;
+ ssize_t buflen;
+
+rename_retry:
+ buflen = buflen_in;
+ dentry = dentry_in;
+ end = buffer+buflen;
+ *--end = '\0';
+ buflen--;
+
+ seq = read_seqbegin(&rename_lock);
+ rcu_read_lock();
+ while (1) {
+ spin_lock(&dentry->d_lock);
+ if (IS_ROOT(dentry))
+ break;
+ namelen = dentry->d_name.len;
+ buflen -= namelen + 1;
+ if (buflen < 0)
+ goto Elong_unlock;
+ end -= namelen;
+ memcpy(end, dentry->d_name.name, namelen);
+ *--end = '/';
+ spin_unlock(&dentry->d_lock);
+ dentry = dentry->d_parent;
+ }
+ if (read_seqretry(&rename_lock, seq)) {
+ spin_unlock(&dentry->d_lock);
+ rcu_read_unlock();
+ goto rename_retry;
+ }
+ if ((flags & NFS_PATH_CANONICAL) && *end != '/') {
+ if (--buflen < 0) {
+ spin_unlock(&dentry->d_lock);
+ rcu_read_unlock();
+ goto Elong;
+ }
+ *--end = '/';
+ }
+ *p = end;
+ base = dentry->d_fsdata;
+ if (!base) {
+ spin_unlock(&dentry->d_lock);
+ rcu_read_unlock();
+ WARN_ON(1);
+ return end;
+ }
+ namelen = strlen(base);
+ if (*end == '/') {
+ /* Strip off excess slashes in base string */
+ while (namelen > 0 && base[namelen - 1] == '/')
+ namelen--;
+ }
+ buflen -= namelen;
+ if (buflen < 0) {
+ spin_unlock(&dentry->d_lock);
+ rcu_read_unlock();
+ goto Elong;
+ }
+ end -= namelen;
+ memcpy(end, base, namelen);
+ spin_unlock(&dentry->d_lock);
+ rcu_read_unlock();
+ return end;
+Elong_unlock:
+ spin_unlock(&dentry->d_lock);
+ rcu_read_unlock();
+ if (read_seqretry(&rename_lock, seq))
+ goto rename_retry;
+Elong:
+ return ERR_PTR(-ENAMETOOLONG);
+}
+EXPORT_SYMBOL_GPL(nfs_path);
+
+/*
+ * nfs_d_automount - Handle crossing a mountpoint on the server
+ * @path - The mountpoint
+ *
+ * When we encounter a mountpoint on the server, we want to set up
+ * a mountpoint on the client too, to prevent inode numbers from
+ * colliding, and to allow "df" to work properly.
+ * On NFSv4, we also want to allow for the fact that different
+ * filesystems may be migrated to different servers in a failover
+ * situation, and that different filesystems may want to use
+ * different security flavours.
+ */
+struct vfsmount *nfs_d_automount(struct path *path)
+{
+ struct nfs_fs_context *ctx;
+ struct fs_context *fc;
+ struct vfsmount *mnt = ERR_PTR(-ENOMEM);
+ struct nfs_server *server = NFS_SB(path->dentry->d_sb);
+ struct nfs_client *client = server->nfs_client;
+ int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout);
+ int ret;
+
+ if (IS_ROOT(path->dentry))
+ return ERR_PTR(-ESTALE);
+
+ /* Open a new filesystem context, transferring parameters from the
+ * parent superblock, including the network namespace.
+ */
+ fc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry);
+ if (IS_ERR(fc))
+ return ERR_CAST(fc);
+
+ ctx = nfs_fc2context(fc);
+ ctx->clone_data.dentry = path->dentry;
+ ctx->clone_data.sb = path->dentry->d_sb;
+ ctx->clone_data.fattr = nfs_alloc_fattr();
+ if (!ctx->clone_data.fattr)
+ goto out_fc;
+
+ if (fc->net_ns != client->cl_net) {
+ put_net(fc->net_ns);
+ fc->net_ns = get_net(client->cl_net);
+ }
+
+ /* for submounts we want the same server; referrals will reassign */
+ memcpy(&ctx->nfs_server._address, &client->cl_addr, client->cl_addrlen);
+ ctx->nfs_server.addrlen = client->cl_addrlen;
+ ctx->nfs_server.port = server->port;
+
+ ctx->version = client->rpc_ops->version;
+ ctx->minorversion = client->cl_minorversion;
+ ctx->nfs_mod = client->cl_nfs_mod;
+ __module_get(ctx->nfs_mod->owner);
+
+ ret = client->rpc_ops->submount(fc, server);
+ if (ret < 0) {
+ mnt = ERR_PTR(ret);
+ goto out_fc;
+ }
+
+ up_write(&fc->root->d_sb->s_umount);
+ mnt = vfs_create_mount(fc);
+ if (IS_ERR(mnt))
+ goto out_fc;
+
+ mntget(mnt); /* prevent immediate expiration */
+ if (timeout <= 0)
+ goto out_fc;
+
+ mnt_set_expiry(mnt, &nfs_automount_list);
+ schedule_delayed_work(&nfs_automount_task, timeout);
+
+out_fc:
+ put_fs_context(fc);
+ return mnt;
+}
+
+static int
+nfs_namespace_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ if (NFS_FH(d_inode(path->dentry))->size != 0)
+ return nfs_getattr(idmap, path, stat, request_mask,
+ query_flags);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry),
+ stat);
+ return 0;
+}
+
+static int
+nfs_namespace_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ if (NFS_FH(d_inode(dentry))->size != 0)
+ return nfs_setattr(idmap, dentry, attr);
+ return -EACCES;
+}
+
+const struct inode_operations nfs_mountpoint_inode_operations = {
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+};
+
+const struct inode_operations nfs_referral_inode_operations = {
+ .getattr = nfs_namespace_getattr,
+ .setattr = nfs_namespace_setattr,
+};
+
+static void nfs_expire_automounts(struct work_struct *work)
+{
+ struct list_head *list = &nfs_automount_list;
+ int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout);
+
+ mark_mounts_for_expiry(list);
+ if (!list_empty(list) && timeout > 0)
+ schedule_delayed_work(&nfs_automount_task, timeout);
+}
+
+void nfs_release_automount_timer(void)
+{
+ if (list_empty(&nfs_automount_list))
+ cancel_delayed_work(&nfs_automount_task);
+}
+
+/**
+ * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
+ * @fc: pointer to struct nfs_fs_context
+ *
+ */
+int nfs_do_submount(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct dentry *dentry = ctx->clone_data.dentry;
+ struct nfs_server *server;
+ char *buffer, *p;
+ int ret;
+
+ /* create a new volume representation */
+ server = ctx->nfs_mod->rpc_ops->clone_server(NFS_SB(ctx->clone_data.sb),
+ ctx->mntfh,
+ ctx->clone_data.fattr,
+ ctx->selected_flavor);
+
+ if (IS_ERR(server))
+ return PTR_ERR(server);
+
+ ctx->server = server;
+
+ buffer = kmalloc(4096, GFP_USER);
+ if (!buffer)
+ return -ENOMEM;
+
+ ctx->internal = true;
+ ctx->clone_data.inherited_bsize = ctx->clone_data.sb->s_blocksize_bits;
+
+ p = nfs_devname(dentry, buffer, 4096);
+ if (IS_ERR(p)) {
+ nfs_errorf(fc, "NFS: Couldn't determine submount pathname");
+ ret = PTR_ERR(p);
+ } else {
+ ret = vfs_parse_fs_string(fc, "source", p, buffer + 4096 - p);
+ if (!ret)
+ ret = vfs_get_tree(fc);
+ }
+ kfree(buffer);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_do_submount);
+
+int nfs_submount(struct fs_context *fc, struct nfs_server *server)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct dentry *dentry = ctx->clone_data.dentry;
+ struct dentry *parent = dget_parent(dentry);
+ int err;
+
+ /* Look it up again to get its attributes */
+ err = server->nfs_client->rpc_ops->lookup(d_inode(parent), dentry,
+ ctx->mntfh, ctx->clone_data.fattr);
+ dput(parent);
+ if (err != 0)
+ return err;
+
+ ctx->selected_flavor = server->client->cl_auth->au_flavor;
+ return nfs_do_submount(fc);
+}
+EXPORT_SYMBOL_GPL(nfs_submount);
+
+static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp)
+{
+ long num;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+ ret = kstrtol(val, 0, &num);
+ if (ret)
+ return -EINVAL;
+ if (num > 0) {
+ if (num >= INT_MAX / HZ)
+ num = INT_MAX;
+ else
+ num *= HZ;
+ *((int *)kp->arg) = num;
+ if (!list_empty(&nfs_automount_list))
+ mod_delayed_work(system_wq, &nfs_automount_task, num);
+ } else {
+ *((int *)kp->arg) = -1*HZ;
+ cancel_delayed_work(&nfs_automount_task);
+ }
+ return 0;
+}
+
+static int param_get_nfs_timeout(char *buffer, const struct kernel_param *kp)
+{
+ long num = *((int *)kp->arg);
+
+ if (num > 0) {
+ if (num >= INT_MAX - (HZ - 1))
+ num = INT_MAX / HZ;
+ else
+ num = (num + (HZ - 1)) / HZ;
+ } else
+ num = -1;
+ return sysfs_emit(buffer, "%li\n", num);
+}
+
+static const struct kernel_param_ops param_ops_nfs_timeout = {
+ .set = param_set_nfs_timeout,
+ .get = param_get_nfs_timeout,
+};
+#define param_check_nfs_timeout(name, p) __param_check(name, p, int)
+
+module_param(nfs_mountpoint_expiry_timeout, nfs_timeout, 0644);
+MODULE_PARM_DESC(nfs_mountpoint_expiry_timeout,
+ "Set the NFS automounted mountpoint timeout value (seconds)."
+ "Values <= 0 turn expiration off.");
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
new file mode 100644
index 0000000000..c8374f74dc
--- /dev/null
+++ b/fs/nfs/netns.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NFS-private data for each "struct net". Accessed with net_generic().
+ */
+
+#ifndef __NFS_NETNS_H__
+#define __NFS_NETNS_H__
+
+#include <linux/nfs4.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+struct bl_dev_msg {
+ int32_t status;
+ uint32_t major, minor;
+};
+
+struct nfs_netns_client;
+
+struct nfs_net {
+ struct cache_detail *nfs_dns_resolve;
+ struct rpc_pipe *bl_device_pipe;
+ struct bl_dev_msg bl_mount_reply;
+ wait_queue_head_t bl_wq;
+ struct mutex bl_mutex;
+ struct list_head nfs_client_list;
+ struct list_head nfs_volume_list;
+#if IS_ENABLED(CONFIG_NFS_V4)
+ struct idr cb_ident_idr; /* Protected by nfs_client_lock */
+ unsigned short nfs_callback_tcpport;
+ unsigned short nfs_callback_tcpport6;
+ int cb_users[NFS4_MAX_MINOR_VERSION + 1];
+#endif
+ struct nfs_netns_client *nfs_client;
+ spinlock_t nfs_client_lock;
+ ktime_t boot_time;
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *proc_nfsfs;
+#endif
+};
+
+extern unsigned int nfs_net_id;
+
+#endif
diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h
new file mode 100644
index 0000000000..5ba00610ae
--- /dev/null
+++ b/fs/nfs/nfs.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2012 Netapp, Inc. All rights reserved.
+ *
+ * Function and structures exported by the NFS module
+ * for use by NFS version-specific modules.
+ */
+#ifndef __LINUX_INTERNAL_NFS_H
+#define __LINUX_INTERNAL_NFS_H
+
+#include <linux/fs.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/nfs_xdr.h>
+
+struct nfs_subversion {
+ struct module *owner; /* THIS_MODULE pointer */
+ struct file_system_type *nfs_fs; /* NFS filesystem type */
+ const struct rpc_version *rpc_vers; /* NFS version information */
+ const struct nfs_rpc_ops *rpc_ops; /* NFS operations */
+ const struct super_operations *sops; /* NFS Super operations */
+ const struct xattr_handler **xattr; /* NFS xattr handlers */
+ struct list_head list; /* List of NFS versions */
+};
+
+struct nfs_subversion *get_nfs_version(unsigned int);
+void put_nfs_version(struct nfs_subversion *);
+void register_nfs_version(struct nfs_subversion *);
+void unregister_nfs_version(struct nfs_subversion *);
+
+#endif /* __LINUX_INTERNAL_NFS_H */
diff --git a/fs/nfs/nfs2super.c b/fs/nfs/nfs2super.c
new file mode 100644
index 0000000000..467f21ee6a
--- /dev/null
+++ b/fs/nfs/nfs2super.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2012 Netapp, Inc. All rights reserved.
+ */
+#include <linux/module.h>
+#include <linux/nfs_fs.h>
+#include "internal.h"
+#include "nfs.h"
+
+static struct nfs_subversion nfs_v2 = {
+ .owner = THIS_MODULE,
+ .nfs_fs = &nfs_fs_type,
+ .rpc_vers = &nfs_version2,
+ .rpc_ops = &nfs_v2_clientops,
+ .sops = &nfs_sops,
+};
+
+static int __init init_nfs_v2(void)
+{
+ register_nfs_version(&nfs_v2);
+ return 0;
+}
+
+static void __exit exit_nfs_v2(void)
+{
+ unregister_nfs_version(&nfs_v2);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_nfs_v2);
+module_exit(exit_nfs_v2);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
new file mode 100644
index 0000000000..c190938142
--- /dev/null
+++ b/fs/nfs/nfs2xdr.c
@@ -0,0 +1,1156 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/nfs2xdr.c
+ *
+ * XDR functions to encode/decode NFS RPC arguments and results.
+ *
+ * Copyright (C) 1992, 1993, 1994 Rick Sladkey
+ * Copyright (C) 1996 Olaf Kirch
+ * 04 Aug 1998 Ion Badulescu <ionut@cs.columbia.edu>
+ * FIFO's need special handling in NFSv2
+ */
+
+#include <linux/param.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs2.h>
+#include <linux/nfs_fs.h>
+#include "nfstrace.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_XDR
+
+/* Mapping from NFS error code to "errno" error code. */
+#define errno_NFSERR_IO EIO
+
+/*
+ * Declare the space requirements for NFS arguments and replies as
+ * number of 32bit-words
+ */
+#define NFS_pagepad_sz (1) /* Page padding */
+#define NFS_fhandle_sz (8)
+#define NFS_sattr_sz (8)
+#define NFS_filename_sz (1+(NFS2_MAXNAMLEN>>2))
+#define NFS_path_sz (1+(NFS2_MAXPATHLEN>>2))
+#define NFS_fattr_sz (17)
+#define NFS_info_sz (5)
+#define NFS_entry_sz (NFS_filename_sz+3)
+
+#define NFS_diropargs_sz (NFS_fhandle_sz+NFS_filename_sz)
+#define NFS_removeargs_sz (NFS_fhandle_sz+NFS_filename_sz)
+#define NFS_sattrargs_sz (NFS_fhandle_sz+NFS_sattr_sz)
+#define NFS_readlinkargs_sz (NFS_fhandle_sz)
+#define NFS_readargs_sz (NFS_fhandle_sz+3)
+#define NFS_writeargs_sz (NFS_fhandle_sz+4)
+#define NFS_createargs_sz (NFS_diropargs_sz+NFS_sattr_sz)
+#define NFS_renameargs_sz (NFS_diropargs_sz+NFS_diropargs_sz)
+#define NFS_linkargs_sz (NFS_fhandle_sz+NFS_diropargs_sz)
+#define NFS_symlinkargs_sz (NFS_diropargs_sz+1+NFS_sattr_sz)
+#define NFS_readdirargs_sz (NFS_fhandle_sz+2)
+
+#define NFS_attrstat_sz (1+NFS_fattr_sz)
+#define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz)
+#define NFS_readlinkres_sz (2+NFS_pagepad_sz)
+#define NFS_readres_sz (1+NFS_fattr_sz+1+NFS_pagepad_sz)
+#define NFS_writeres_sz (NFS_attrstat_sz)
+#define NFS_stat_sz (1)
+#define NFS_readdirres_sz (1+NFS_pagepad_sz)
+#define NFS_statfsres_sz (1+NFS_info_sz)
+
+static int nfs_stat_to_errno(enum nfs_stat);
+
+/*
+ * Encode/decode NFSv2 basic data types
+ *
+ * Basic NFSv2 data types are defined in section 2.3 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions. For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+
+static struct user_namespace *rpc_userns(const struct rpc_clnt *clnt)
+{
+ if (clnt && clnt->cl_cred)
+ return clnt->cl_cred->user_ns;
+ return &init_user_ns;
+}
+
+static struct user_namespace *rpc_rqst_userns(const struct rpc_rqst *rqstp)
+{
+ if (rqstp->rq_task)
+ return rpc_userns(rqstp->rq_task->tk_client);
+ return &init_user_ns;
+}
+
+/*
+ * typedef opaque nfsdata<>;
+ */
+static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result)
+{
+ u32 recvd, count;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ count = be32_to_cpup(p);
+ recvd = xdr_read_pages(xdr, count);
+ if (unlikely(count > recvd))
+ goto out_cheating;
+out:
+ result->eof = 0; /* NFSv2 does not pass EOF flag on the wire. */
+ result->count = count;
+ return count;
+out_cheating:
+ dprintk("NFS: server cheating in read result: "
+ "count %u > recvd %u\n", count, recvd);
+ count = recvd;
+ goto out;
+}
+
+/*
+ * enum stat {
+ * NFS_OK = 0,
+ * NFSERR_PERM = 1,
+ * NFSERR_NOENT = 2,
+ * NFSERR_IO = 5,
+ * NFSERR_NXIO = 6,
+ * NFSERR_ACCES = 13,
+ * NFSERR_EXIST = 17,
+ * NFSERR_NODEV = 19,
+ * NFSERR_NOTDIR = 20,
+ * NFSERR_ISDIR = 21,
+ * NFSERR_FBIG = 27,
+ * NFSERR_NOSPC = 28,
+ * NFSERR_ROFS = 30,
+ * NFSERR_NAMETOOLONG = 63,
+ * NFSERR_NOTEMPTY = 66,
+ * NFSERR_DQUOT = 69,
+ * NFSERR_STALE = 70,
+ * NFSERR_WFLUSH = 99
+ * };
+ */
+static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ if (unlikely(*p != cpu_to_be32(NFS_OK)))
+ goto out_status;
+ *status = 0;
+ return 0;
+out_status:
+ *status = be32_to_cpup(p);
+ trace_nfs_xdr_status(xdr, (int)*status);
+ return 0;
+}
+
+/*
+ * 2.3.2. ftype
+ *
+ * enum ftype {
+ * NFNON = 0,
+ * NFREG = 1,
+ * NFDIR = 2,
+ * NFBLK = 3,
+ * NFCHR = 4,
+ * NFLNK = 5
+ * };
+ *
+ */
+static __be32 *xdr_decode_ftype(__be32 *p, u32 *type)
+{
+ *type = be32_to_cpup(p++);
+ if (unlikely(*type > NF2FIFO))
+ *type = NFBAD;
+ return p;
+}
+
+/*
+ * 2.3.3. fhandle
+ *
+ * typedef opaque fhandle[FHSIZE];
+ */
+static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, NFS2_FHSIZE);
+ memcpy(p, fh->data, NFS2_FHSIZE);
+}
+
+static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS2_FHSIZE);
+ if (unlikely(!p))
+ return -EIO;
+ fh->size = NFS2_FHSIZE;
+ memcpy(fh->data, p, NFS2_FHSIZE);
+ return 0;
+}
+
+/*
+ * 2.3.4. timeval
+ *
+ * struct timeval {
+ * unsigned int seconds;
+ * unsigned int useconds;
+ * };
+ */
+static __be32 *xdr_encode_time(__be32 *p, const struct timespec64 *timep)
+{
+ *p++ = cpu_to_be32((u32)timep->tv_sec);
+ if (timep->tv_nsec != 0)
+ *p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC);
+ else
+ *p++ = cpu_to_be32(0);
+ return p;
+}
+
+/*
+ * Passing the invalid value useconds=1000000 is a Sun convention for
+ * "set to current server time". It's needed to make permissions checks
+ * for the "touch" program across v2 mounts to Solaris and Irix servers
+ * work correctly. See description of sattr in section 6.1 of "NFS
+ * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5.
+ */
+static __be32 *xdr_encode_current_server_time(__be32 *p,
+ const struct timespec64 *timep)
+{
+ *p++ = cpu_to_be32(timep->tv_sec);
+ *p++ = cpu_to_be32(1000000);
+ return p;
+}
+
+static __be32 *xdr_decode_time(__be32 *p, struct timespec64 *timep)
+{
+ timep->tv_sec = be32_to_cpup(p++);
+ timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC;
+ return p;
+}
+
+/*
+ * 2.3.5. fattr
+ *
+ * struct fattr {
+ * ftype type;
+ * unsigned int mode;
+ * unsigned int nlink;
+ * unsigned int uid;
+ * unsigned int gid;
+ * unsigned int size;
+ * unsigned int blocksize;
+ * unsigned int rdev;
+ * unsigned int blocks;
+ * unsigned int fsid;
+ * unsigned int fileid;
+ * timeval atime;
+ * timeval mtime;
+ * timeval ctime;
+ * };
+ *
+ */
+static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+ struct user_namespace *userns)
+{
+ u32 rdev, type;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS_fattr_sz << 2);
+ if (unlikely(!p))
+ return -EIO;
+
+ fattr->valid |= NFS_ATTR_FATTR_V2;
+
+ p = xdr_decode_ftype(p, &type);
+
+ fattr->mode = be32_to_cpup(p++);
+ fattr->nlink = be32_to_cpup(p++);
+ fattr->uid = make_kuid(userns, be32_to_cpup(p++));
+ if (!uid_valid(fattr->uid))
+ goto out_uid;
+ fattr->gid = make_kgid(userns, be32_to_cpup(p++));
+ if (!gid_valid(fattr->gid))
+ goto out_gid;
+
+ fattr->size = be32_to_cpup(p++);
+ fattr->du.nfs2.blocksize = be32_to_cpup(p++);
+
+ rdev = be32_to_cpup(p++);
+ fattr->rdev = new_decode_dev(rdev);
+ if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) {
+ fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
+ fattr->rdev = 0;
+ }
+
+ fattr->du.nfs2.blocks = be32_to_cpup(p++);
+ fattr->fsid.major = be32_to_cpup(p++);
+ fattr->fsid.minor = 0;
+ fattr->fileid = be32_to_cpup(p++);
+
+ p = xdr_decode_time(p, &fattr->atime);
+ p = xdr_decode_time(p, &fattr->mtime);
+ xdr_decode_time(p, &fattr->ctime);
+ fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime);
+
+ return 0;
+out_uid:
+ dprintk("NFS: returned invalid uid\n");
+ return -EINVAL;
+out_gid:
+ dprintk("NFS: returned invalid gid\n");
+ return -EINVAL;
+}
+
+/*
+ * 2.3.6. sattr
+ *
+ * struct sattr {
+ * unsigned int mode;
+ * unsigned int uid;
+ * unsigned int gid;
+ * unsigned int size;
+ * timeval atime;
+ * timeval mtime;
+ * };
+ */
+
+#define NFS2_SATTR_NOT_SET (0xffffffff)
+
+static __be32 *xdr_time_not_set(__be32 *p)
+{
+ *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+ *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+ return p;
+}
+
+static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr,
+ struct user_namespace *userns)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, NFS_sattr_sz << 2);
+
+ if (attr->ia_valid & ATTR_MODE)
+ *p++ = cpu_to_be32(attr->ia_mode);
+ else
+ *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+ if (attr->ia_valid & ATTR_UID)
+ *p++ = cpu_to_be32(from_kuid_munged(userns, attr->ia_uid));
+ else
+ *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+ if (attr->ia_valid & ATTR_GID)
+ *p++ = cpu_to_be32(from_kgid_munged(userns, attr->ia_gid));
+ else
+ *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+ if (attr->ia_valid & ATTR_SIZE)
+ *p++ = cpu_to_be32((u32)attr->ia_size);
+ else
+ *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+
+ if (attr->ia_valid & ATTR_ATIME_SET)
+ p = xdr_encode_time(p, &attr->ia_atime);
+ else if (attr->ia_valid & ATTR_ATIME)
+ p = xdr_encode_current_server_time(p, &attr->ia_atime);
+ else
+ p = xdr_time_not_set(p);
+ if (attr->ia_valid & ATTR_MTIME_SET)
+ xdr_encode_time(p, &attr->ia_mtime);
+ else if (attr->ia_valid & ATTR_MTIME)
+ xdr_encode_current_server_time(p, &attr->ia_mtime);
+ else
+ xdr_time_not_set(p);
+}
+
+/*
+ * 2.3.7. filename
+ *
+ * typedef string filename<MAXNAMLEN>;
+ */
+static void encode_filename(struct xdr_stream *xdr,
+ const char *name, u32 length)
+{
+ __be32 *p;
+
+ WARN_ON_ONCE(length > NFS2_MAXNAMLEN);
+ p = xdr_reserve_space(xdr, 4 + length);
+ xdr_encode_opaque(p, name, length);
+}
+
+static int decode_filename_inline(struct xdr_stream *xdr,
+ const char **name, u32 *length)
+{
+ __be32 *p;
+ u32 count;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ count = be32_to_cpup(p);
+ if (count > NFS3_MAXNAMLEN)
+ goto out_nametoolong;
+ p = xdr_inline_decode(xdr, count);
+ if (unlikely(!p))
+ return -EIO;
+ *name = (const char *)p;
+ *length = count;
+ return 0;
+out_nametoolong:
+ dprintk("NFS: returned filename too long: %u\n", count);
+ return -ENAMETOOLONG;
+}
+
+/*
+ * 2.3.8. path
+ *
+ * typedef string path<MAXPATHLEN>;
+ */
+static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4);
+ *p = cpu_to_be32(length);
+ xdr_write_pages(xdr, pages, 0, length);
+}
+
+static int decode_path(struct xdr_stream *xdr)
+{
+ u32 length, recvd;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ length = be32_to_cpup(p);
+ if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN))
+ goto out_size;
+ recvd = xdr_read_pages(xdr, length);
+ if (unlikely(length > recvd))
+ goto out_cheating;
+ xdr_terminate_string(xdr->buf, length);
+ return 0;
+out_size:
+ dprintk("NFS: returned pathname too long: %u\n", length);
+ return -ENAMETOOLONG;
+out_cheating:
+ dprintk("NFS: server cheating in pathname result: "
+ "length %u > received %u\n", length, recvd);
+ return -EIO;
+}
+
+/*
+ * 2.3.9. attrstat
+ *
+ * union attrstat switch (stat status) {
+ * case NFS_OK:
+ * fattr attributes;
+ * default:
+ * void;
+ * };
+ */
+static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result,
+ __u32 *op_status,
+ struct user_namespace *userns)
+{
+ enum nfs_stat status;
+ int error;
+
+ error = decode_stat(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ if (op_status)
+ *op_status = status;
+ if (status != NFS_OK)
+ goto out_default;
+ error = decode_fattr(xdr, result, userns);
+out:
+ return error;
+out_default:
+ return nfs_stat_to_errno(status);
+}
+
+/*
+ * 2.3.10. diropargs
+ *
+ * struct diropargs {
+ * fhandle dir;
+ * filename name;
+ * };
+ */
+static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh,
+ const char *name, u32 length)
+{
+ encode_fhandle(xdr, fh);
+ encode_filename(xdr, name, length);
+}
+
+/*
+ * 2.3.11. diropres
+ *
+ * union diropres switch (stat status) {
+ * case NFS_OK:
+ * struct {
+ * fhandle file;
+ * fattr attributes;
+ * } diropok;
+ * default:
+ * void;
+ * };
+ */
+static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result,
+ struct user_namespace *userns)
+{
+ int error;
+
+ error = decode_fhandle(xdr, result->fh);
+ if (unlikely(error))
+ goto out;
+ error = decode_fattr(xdr, result->fattr, userns);
+out:
+ return error;
+}
+
+static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result,
+ struct user_namespace *userns)
+{
+ enum nfs_stat status;
+ int error;
+
+ error = decode_stat(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ if (status != NFS_OK)
+ goto out_default;
+ error = decode_diropok(xdr, result, userns);
+out:
+ return error;
+out_default:
+ return nfs_stat_to_errno(status);
+}
+
+
+/*
+ * NFSv2 XDR encode functions
+ *
+ * NFSv2 argument types are defined in section 2.2 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
+ */
+
+static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_fh *fh = data;
+
+ encode_fhandle(xdr, fh);
+}
+
+/*
+ * 2.2.3. sattrargs
+ *
+ * struct sattrargs {
+ * fhandle file;
+ * sattr attributes;
+ * };
+ */
+static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_sattrargs *args = data;
+
+ encode_fhandle(xdr, args->fh);
+ encode_sattr(xdr, args->sattr, rpc_rqst_userns(req));
+}
+
+static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_diropargs *args = data;
+
+ encode_diropargs(xdr, args->fh, args->name, args->len);
+}
+
+static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_readlinkargs *args = data;
+
+ encode_fhandle(xdr, args->fh);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->pglen,
+ NFS_readlinkres_sz - NFS_pagepad_sz);
+}
+
+/*
+ * 2.2.7. readargs
+ *
+ * struct readargs {
+ * fhandle file;
+ * unsigned offset;
+ * unsigned count;
+ * unsigned totalcount;
+ * };
+ */
+static void encode_readargs(struct xdr_stream *xdr,
+ const struct nfs_pgio_args *args)
+{
+ u32 offset = args->offset;
+ u32 count = args->count;
+ __be32 *p;
+
+ encode_fhandle(xdr, args->fh);
+
+ p = xdr_reserve_space(xdr, 4 + 4 + 4);
+ *p++ = cpu_to_be32(offset);
+ *p++ = cpu_to_be32(count);
+ *p = cpu_to_be32(count);
+}
+
+static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_pgio_args *args = data;
+
+ encode_readargs(xdr, args);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->count,
+ NFS_readres_sz - NFS_pagepad_sz);
+ req->rq_rcv_buf.flags |= XDRBUF_READ;
+}
+
+/*
+ * 2.2.9. writeargs
+ *
+ * struct writeargs {
+ * fhandle file;
+ * unsigned beginoffset;
+ * unsigned offset;
+ * unsigned totalcount;
+ * nfsdata data;
+ * };
+ */
+static void encode_writeargs(struct xdr_stream *xdr,
+ const struct nfs_pgio_args *args)
+{
+ u32 offset = args->offset;
+ u32 count = args->count;
+ __be32 *p;
+
+ encode_fhandle(xdr, args->fh);
+
+ p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
+ *p++ = cpu_to_be32(offset);
+ *p++ = cpu_to_be32(offset);
+ *p++ = cpu_to_be32(count);
+
+ /* nfsdata */
+ *p = cpu_to_be32(count);
+ xdr_write_pages(xdr, args->pages, args->pgbase, count);
+}
+
+static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_pgio_args *args = data;
+
+ encode_writeargs(xdr, args);
+ xdr->buf->flags |= XDRBUF_WRITE;
+}
+
+/*
+ * 2.2.10. createargs
+ *
+ * struct createargs {
+ * diropargs where;
+ * sattr attributes;
+ * };
+ */
+static void nfs2_xdr_enc_createargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_createargs *args = data;
+
+ encode_diropargs(xdr, args->fh, args->name, args->len);
+ encode_sattr(xdr, args->sattr, rpc_rqst_userns(req));
+}
+
+static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_removeargs *args = data;
+
+ encode_diropargs(xdr, args->fh, args->name.name, args->name.len);
+}
+
+/*
+ * 2.2.12. renameargs
+ *
+ * struct renameargs {
+ * diropargs from;
+ * diropargs to;
+ * };
+ */
+static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_renameargs *args = data;
+ const struct qstr *old = args->old_name;
+ const struct qstr *new = args->new_name;
+
+ encode_diropargs(xdr, args->old_dir, old->name, old->len);
+ encode_diropargs(xdr, args->new_dir, new->name, new->len);
+}
+
+/*
+ * 2.2.13. linkargs
+ *
+ * struct linkargs {
+ * fhandle from;
+ * diropargs to;
+ * };
+ */
+static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_linkargs *args = data;
+
+ encode_fhandle(xdr, args->fromfh);
+ encode_diropargs(xdr, args->tofh, args->toname, args->tolen);
+}
+
+/*
+ * 2.2.14. symlinkargs
+ *
+ * struct symlinkargs {
+ * diropargs from;
+ * path to;
+ * sattr attributes;
+ * };
+ */
+static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_symlinkargs *args = data;
+
+ encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen);
+ encode_path(xdr, args->pages, args->pathlen);
+ encode_sattr(xdr, args->sattr, rpc_rqst_userns(req));
+}
+
+/*
+ * 2.2.17. readdirargs
+ *
+ * struct readdirargs {
+ * fhandle dir;
+ * nfscookie cookie;
+ * unsigned count;
+ * };
+ */
+static void encode_readdirargs(struct xdr_stream *xdr,
+ const struct nfs_readdirargs *args)
+{
+ __be32 *p;
+
+ encode_fhandle(xdr, args->fh);
+
+ p = xdr_reserve_space(xdr, 4 + 4);
+ *p++ = cpu_to_be32(args->cookie);
+ *p = cpu_to_be32(args->count);
+}
+
+static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_readdirargs *args = data;
+
+ encode_readdirargs(xdr, args);
+ rpc_prepare_reply_pages(req, args->pages, 0, args->count,
+ NFS_readdirres_sz - NFS_pagepad_sz);
+}
+
+/*
+ * NFSv2 XDR decode functions
+ *
+ * NFSv2 result types are defined in section 2.2 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
+ */
+
+static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *__unused)
+{
+ enum nfs_stat status;
+ int error;
+
+ error = decode_stat(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ if (status != NFS_OK)
+ goto out_default;
+out:
+ return error;
+out_default:
+ return nfs_stat_to_errno(status);
+}
+
+static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *result)
+{
+ return decode_attrstat(xdr, result, NULL, rpc_rqst_userns(req));
+}
+
+static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *result)
+{
+ return decode_diropres(xdr, result, rpc_rqst_userns(req));
+}
+
+/*
+ * 2.2.6. readlinkres
+ *
+ * union readlinkres switch (stat status) {
+ * case NFS_OK:
+ * path data;
+ * default:
+ * void;
+ * };
+ */
+static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req,
+ struct xdr_stream *xdr, void *__unused)
+{
+ enum nfs_stat status;
+ int error;
+
+ error = decode_stat(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ if (status != NFS_OK)
+ goto out_default;
+ error = decode_path(xdr);
+out:
+ return error;
+out_default:
+ return nfs_stat_to_errno(status);
+}
+
+/*
+ * 2.2.7. readres
+ *
+ * union readres switch (stat status) {
+ * case NFS_OK:
+ * fattr attributes;
+ * nfsdata data;
+ * default:
+ * void;
+ * };
+ */
+static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_pgio_res *result = data;
+ enum nfs_stat status;
+ int error;
+
+ error = decode_stat(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ result->op_status = status;
+ if (status != NFS_OK)
+ goto out_default;
+ error = decode_fattr(xdr, result->fattr, rpc_rqst_userns(req));
+ if (unlikely(error))
+ goto out;
+ error = decode_nfsdata(xdr, result);
+out:
+ return error;
+out_default:
+ return nfs_stat_to_errno(status);
+}
+
+static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_pgio_res *result = data;
+
+ /* All NFSv2 writes are "file sync" writes */
+ result->verf->committed = NFS_FILE_SYNC;
+ return decode_attrstat(xdr, result->fattr, &result->op_status,
+ rpc_rqst_userns(req));
+}
+
+/**
+ * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in
+ * the local page cache.
+ * @xdr: XDR stream where entry resides
+ * @entry: buffer to fill in with entry data
+ * @plus: boolean indicating whether this should be a readdirplus entry
+ *
+ * Returns zero if successful, otherwise a negative errno value is
+ * returned.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ *
+ * 2.2.17. entry
+ *
+ * struct entry {
+ * unsigned fileid;
+ * filename name;
+ * nfscookie cookie;
+ * entry *nextentry;
+ * };
+ */
+int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+ bool plus)
+{
+ __be32 *p;
+ int error;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EAGAIN;
+ if (*p++ == xdr_zero) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EAGAIN;
+ if (*p++ == xdr_zero)
+ return -EAGAIN;
+ entry->eof = 1;
+ return -EBADCOOKIE;
+ }
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EAGAIN;
+ entry->ino = be32_to_cpup(p);
+
+ error = decode_filename_inline(xdr, &entry->name, &entry->len);
+ if (unlikely(error))
+ return error == -ENAMETOOLONG ? -ENAMETOOLONG : -EAGAIN;
+
+ /*
+ * The type (size and byte order) of nfscookie isn't defined in
+ * RFC 1094. This implementation assumes that it's an XDR uint32.
+ */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EAGAIN;
+ entry->cookie = be32_to_cpup(p);
+
+ entry->d_type = DT_UNKNOWN;
+
+ return 0;
+}
+
+/*
+ * 2.2.17. readdirres
+ *
+ * union readdirres switch (stat status) {
+ * case NFS_OK:
+ * struct {
+ * entry *entries;
+ * bool eof;
+ * } readdirok;
+ * default:
+ * void;
+ * };
+ *
+ * Read the directory contents into the page cache, but don't
+ * touch them. The actual decoding is done by nfs2_decode_dirent()
+ * during subsequent nfs_readdir() calls.
+ */
+static int decode_readdirok(struct xdr_stream *xdr)
+{
+ return xdr_read_pages(xdr, xdr->buf->page_len);
+}
+
+static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req,
+ struct xdr_stream *xdr, void *__unused)
+{
+ enum nfs_stat status;
+ int error;
+
+ error = decode_stat(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ if (status != NFS_OK)
+ goto out_default;
+ error = decode_readdirok(xdr);
+out:
+ return error;
+out_default:
+ return nfs_stat_to_errno(status);
+}
+
+/*
+ * 2.2.18. statfsres
+ *
+ * union statfsres (stat status) {
+ * case NFS_OK:
+ * struct {
+ * unsigned tsize;
+ * unsigned bsize;
+ * unsigned blocks;
+ * unsigned bfree;
+ * unsigned bavail;
+ * } info;
+ * default:
+ * void;
+ * };
+ */
+static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS_info_sz << 2);
+ if (unlikely(!p))
+ return -EIO;
+ result->tsize = be32_to_cpup(p++);
+ result->bsize = be32_to_cpup(p++);
+ result->blocks = be32_to_cpup(p++);
+ result->bfree = be32_to_cpup(p++);
+ result->bavail = be32_to_cpup(p);
+ return 0;
+}
+
+static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *result)
+{
+ enum nfs_stat status;
+ int error;
+
+ error = decode_stat(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ if (status != NFS_OK)
+ goto out_default;
+ error = decode_info(xdr, result);
+out:
+ return error;
+out_default:
+ return nfs_stat_to_errno(status);
+}
+
+
+/*
+ * We need to translate between nfs status return values and
+ * the local errno values which may not be the same.
+ */
+static const struct {
+ int stat;
+ int errno;
+} nfs_errtbl[] = {
+ { NFS_OK, 0 },
+ { NFSERR_PERM, -EPERM },
+ { NFSERR_NOENT, -ENOENT },
+ { NFSERR_IO, -errno_NFSERR_IO},
+ { NFSERR_NXIO, -ENXIO },
+/* { NFSERR_EAGAIN, -EAGAIN }, */
+ { NFSERR_ACCES, -EACCES },
+ { NFSERR_EXIST, -EEXIST },
+ { NFSERR_XDEV, -EXDEV },
+ { NFSERR_NODEV, -ENODEV },
+ { NFSERR_NOTDIR, -ENOTDIR },
+ { NFSERR_ISDIR, -EISDIR },
+ { NFSERR_INVAL, -EINVAL },
+ { NFSERR_FBIG, -EFBIG },
+ { NFSERR_NOSPC, -ENOSPC },
+ { NFSERR_ROFS, -EROFS },
+ { NFSERR_MLINK, -EMLINK },
+ { NFSERR_NAMETOOLONG, -ENAMETOOLONG },
+ { NFSERR_NOTEMPTY, -ENOTEMPTY },
+ { NFSERR_DQUOT, -EDQUOT },
+ { NFSERR_STALE, -ESTALE },
+ { NFSERR_REMOTE, -EREMOTE },
+#ifdef EWFLUSH
+ { NFSERR_WFLUSH, -EWFLUSH },
+#endif
+ { NFSERR_BADHANDLE, -EBADHANDLE },
+ { NFSERR_NOT_SYNC, -ENOTSYNC },
+ { NFSERR_BAD_COOKIE, -EBADCOOKIE },
+ { NFSERR_NOTSUPP, -ENOTSUPP },
+ { NFSERR_TOOSMALL, -ETOOSMALL },
+ { NFSERR_SERVERFAULT, -EREMOTEIO },
+ { NFSERR_BADTYPE, -EBADTYPE },
+ { NFSERR_JUKEBOX, -EJUKEBOX },
+ { -1, -EIO }
+};
+
+/**
+ * nfs_stat_to_errno - convert an NFS status code to a local errno
+ * @status: NFS status code to convert
+ *
+ * Returns a local errno value, or -EIO if the NFS status code is
+ * not recognized. This function is used jointly by NFSv2 and NFSv3.
+ */
+static int nfs_stat_to_errno(enum nfs_stat status)
+{
+ int i;
+
+ for (i = 0; nfs_errtbl[i].stat != -1; i++) {
+ if (nfs_errtbl[i].stat == (int)status)
+ return nfs_errtbl[i].errno;
+ }
+ dprintk("NFS: Unrecognized nfs status value: %u\n", status);
+ return nfs_errtbl[i].errno;
+}
+
+#define PROC(proc, argtype, restype, timer) \
+[NFSPROC_##proc] = { \
+ .p_proc = NFSPROC_##proc, \
+ .p_encode = nfs2_xdr_enc_##argtype, \
+ .p_decode = nfs2_xdr_dec_##restype, \
+ .p_arglen = NFS_##argtype##_sz, \
+ .p_replen = NFS_##restype##_sz, \
+ .p_timer = timer, \
+ .p_statidx = NFSPROC_##proc, \
+ .p_name = #proc, \
+ }
+const struct rpc_procinfo nfs_procedures[] = {
+ PROC(GETATTR, fhandle, attrstat, 1),
+ PROC(SETATTR, sattrargs, attrstat, 0),
+ PROC(LOOKUP, diropargs, diropres, 2),
+ PROC(READLINK, readlinkargs, readlinkres, 3),
+ PROC(READ, readargs, readres, 3),
+ PROC(WRITE, writeargs, writeres, 4),
+ PROC(CREATE, createargs, diropres, 0),
+ PROC(REMOVE, removeargs, stat, 0),
+ PROC(RENAME, renameargs, stat, 0),
+ PROC(LINK, linkargs, stat, 0),
+ PROC(SYMLINK, symlinkargs, stat, 0),
+ PROC(MKDIR, createargs, diropres, 0),
+ PROC(RMDIR, diropargs, stat, 0),
+ PROC(READDIR, readdirargs, readdirres, 3),
+ PROC(STATFS, fhandle, statfsres, 0),
+};
+
+static unsigned int nfs_version2_counts[ARRAY_SIZE(nfs_procedures)];
+const struct rpc_version nfs_version2 = {
+ .number = 2,
+ .nrprocs = ARRAY_SIZE(nfs_procedures),
+ .procs = nfs_procedures,
+ .counts = nfs_version2_counts,
+};
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
new file mode 100644
index 0000000000..b333ea119e
--- /dev/null
+++ b/fs/nfs/nfs3_fs.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2014 Anna Schumaker.
+ *
+ * NFSv3-specific filesystem definitions and declarations
+ */
+#ifndef __LINUX_FS_NFS_NFS3_FS_H
+#define __LINUX_FS_NFS_NFS3_FS_H
+
+/*
+ * nfs3acl.c
+ */
+#ifdef CONFIG_NFS_V3_ACL
+extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu);
+extern int nfs3_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct posix_acl *acl, int type);
+extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+ struct posix_acl *dfacl);
+extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
+#else
+static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+ struct posix_acl *dfacl)
+{
+ return 0;
+}
+#define nfs3_listxattr NULL
+#endif /* CONFIG_NFS_V3_ACL */
+
+/* nfs3client.c */
+struct nfs_server *nfs3_create_server(struct fs_context *);
+struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
+ struct nfs_fattr *, rpc_authflavor_t);
+
+/* nfs3super.c */
+extern struct nfs_subversion nfs_v3;
+
+#endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
new file mode 100644
index 0000000000..18d8f6529f
--- /dev/null
+++ b/fs/nfs/nfs3acl.c
@@ -0,0 +1,343 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs_fs.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/nfsacl.h>
+
+#include "internal.h"
+#include "nfs3_fs.h"
+
+#define NFSDBG_FACILITY NFSDBG_PROC
+
+/*
+ * nfs3_prepare_get_acl, nfs3_complete_get_acl, nfs3_abort_get_acl: Helpers for
+ * caching get_acl results in a race-free way. See fs/posix_acl.c:get_acl()
+ * for explanations.
+ */
+static void nfs3_prepare_get_acl(struct posix_acl **p)
+{
+ struct posix_acl *sentinel = uncached_acl_sentinel(current);
+
+ /* If the ACL isn't being read yet, set our sentinel. */
+ cmpxchg(p, ACL_NOT_CACHED, sentinel);
+}
+
+static void nfs3_complete_get_acl(struct posix_acl **p, struct posix_acl *acl)
+{
+ struct posix_acl *sentinel = uncached_acl_sentinel(current);
+
+ /* Only cache the ACL if our sentinel is still in place. */
+ posix_acl_dup(acl);
+ if (cmpxchg(p, sentinel, acl) != sentinel)
+ posix_acl_release(acl);
+}
+
+static void nfs3_abort_get_acl(struct posix_acl **p)
+{
+ struct posix_acl *sentinel = uncached_acl_sentinel(current);
+
+ /* Remove our sentinel upon failure. */
+ cmpxchg(p, sentinel, ACL_NOT_CACHED);
+}
+
+struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct page *pages[NFSACL_MAXPAGES] = { };
+ struct nfs3_getaclargs args = {
+ .fh = NFS_FH(inode),
+ /* The xdr layer may allocate pages here. */
+ .pages = pages,
+ };
+ struct nfs3_getaclres res = {
+ NULL,
+ };
+ struct rpc_message msg = {
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ int status, count;
+
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
+ if (!nfs_server_capable(inode, NFS_CAP_ACLS))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ status = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
+ if (status < 0)
+ return ERR_PTR(status);
+
+ /*
+ * Only get the access acl when explicitly requested: We don't
+ * need it for access decisions, and only some applications use
+ * it. Applications which request the access acl first are not
+ * penalized from this optimization.
+ */
+ if (type == ACL_TYPE_ACCESS)
+ args.mask |= NFS_ACLCNT|NFS_ACL;
+ if (S_ISDIR(inode->i_mode))
+ args.mask |= NFS_DFACLCNT|NFS_DFACL;
+ if (args.mask == 0)
+ return NULL;
+
+ dprintk("NFS call getacl\n");
+ msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
+ res.fattr = nfs_alloc_fattr();
+ if (res.fattr == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ if (args.mask & NFS_ACL)
+ nfs3_prepare_get_acl(&inode->i_acl);
+ if (args.mask & NFS_DFACL)
+ nfs3_prepare_get_acl(&inode->i_default_acl);
+
+ status = rpc_call_sync(server->client_acl, &msg, 0);
+ dprintk("NFS reply getacl: %d\n", status);
+
+ /* pages may have been allocated at the xdr layer. */
+ for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++)
+ __free_page(args.pages[count]);
+
+ switch (status) {
+ case 0:
+ status = nfs_refresh_inode(inode, res.fattr);
+ break;
+ case -EPFNOSUPPORT:
+ case -EPROTONOSUPPORT:
+ dprintk("NFS_V3_ACL extension not supported; disabling\n");
+ server->caps &= ~NFS_CAP_ACLS;
+ fallthrough;
+ case -ENOTSUPP:
+ status = -EOPNOTSUPP;
+ goto getout;
+ default:
+ goto getout;
+ }
+ if ((args.mask & res.mask) != args.mask) {
+ status = -EIO;
+ goto getout;
+ }
+
+ if (res.acl_access != NULL) {
+ if ((posix_acl_equiv_mode(res.acl_access, NULL) == 0) ||
+ res.acl_access->a_count == 0) {
+ posix_acl_release(res.acl_access);
+ res.acl_access = NULL;
+ }
+ }
+
+ if (res.mask & NFS_ACL)
+ nfs3_complete_get_acl(&inode->i_acl, res.acl_access);
+ else
+ forget_cached_acl(inode, ACL_TYPE_ACCESS);
+
+ if (res.mask & NFS_DFACL)
+ nfs3_complete_get_acl(&inode->i_default_acl, res.acl_default);
+ else
+ forget_cached_acl(inode, ACL_TYPE_DEFAULT);
+
+ nfs_free_fattr(res.fattr);
+ if (type == ACL_TYPE_ACCESS) {
+ posix_acl_release(res.acl_default);
+ return res.acl_access;
+ } else {
+ posix_acl_release(res.acl_access);
+ return res.acl_default;
+ }
+
+getout:
+ nfs3_abort_get_acl(&inode->i_acl);
+ nfs3_abort_get_acl(&inode->i_default_acl);
+ posix_acl_release(res.acl_access);
+ posix_acl_release(res.acl_default);
+ nfs_free_fattr(res.fattr);
+ return ERR_PTR(status);
+}
+
+static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+ struct posix_acl *dfacl)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_fattr *fattr;
+ struct page *pages[NFSACL_MAXPAGES];
+ struct nfs3_setaclargs args = {
+ .inode = inode,
+ .mask = NFS_ACL,
+ .acl_access = acl,
+ .pages = pages,
+ };
+ struct rpc_message msg = {
+ .rpc_argp = &args,
+ .rpc_resp = &fattr,
+ };
+ int status = 0;
+
+ if (acl == NULL && (!S_ISDIR(inode->i_mode) || dfacl == NULL))
+ goto out;
+
+ status = -EOPNOTSUPP;
+ if (!nfs_server_capable(inode, NFS_CAP_ACLS))
+ goto out;
+
+ /* We are doing this here because XDR marshalling does not
+ * return any results, it BUGs. */
+ status = -ENOSPC;
+ if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES)
+ goto out;
+ if (dfacl != NULL && dfacl->a_count > NFS_ACL_MAX_ENTRIES)
+ goto out;
+ if (S_ISDIR(inode->i_mode)) {
+ args.mask |= NFS_DFACL;
+ args.acl_default = dfacl;
+ args.len = nfsacl_size(acl, dfacl);
+ } else
+ args.len = nfsacl_size(acl, NULL);
+
+ if (args.len > NFS_ACL_INLINE_BUFSIZE) {
+ unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT);
+
+ status = -ENOMEM;
+ do {
+ args.pages[args.npages] = alloc_page(GFP_KERNEL);
+ if (args.pages[args.npages] == NULL)
+ goto out_freepages;
+ args.npages++;
+ } while (args.npages < npages);
+ }
+
+ dprintk("NFS call setacl\n");
+ status = -ENOMEM;
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ goto out_freepages;
+
+ msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
+ msg.rpc_resp = fattr;
+ status = rpc_call_sync(server->client_acl, &msg, 0);
+ nfs_access_zap_cache(inode);
+ nfs_zap_acl_cache(inode);
+ dprintk("NFS reply setacl: %d\n", status);
+
+ switch (status) {
+ case 0:
+ status = nfs_refresh_inode(inode, fattr);
+ break;
+ case -EPFNOSUPPORT:
+ case -EPROTONOSUPPORT:
+ dprintk("NFS_V3_ACL SETACL RPC not supported"
+ "(will not retry)\n");
+ server->caps &= ~NFS_CAP_ACLS;
+ fallthrough;
+ case -ENOTSUPP:
+ status = -EOPNOTSUPP;
+ }
+ nfs_free_fattr(fattr);
+out_freepages:
+ while (args.npages != 0) {
+ args.npages--;
+ __free_page(args.pages[args.npages]);
+ }
+out:
+ return status;
+}
+
+int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+ struct posix_acl *dfacl)
+{
+ int ret;
+ ret = __nfs3_proc_setacls(inode, acl, dfacl);
+ return (ret == -EOPNOTSUPP) ? 0 : ret;
+
+}
+
+int nfs3_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct posix_acl *acl, int type)
+{
+ struct posix_acl *orig = acl, *dfacl = NULL, *alloc;
+ struct inode *inode = d_inode(dentry);
+ int status;
+
+ if (S_ISDIR(inode->i_mode)) {
+ switch(type) {
+ case ACL_TYPE_ACCESS:
+ alloc = get_inode_acl(inode, ACL_TYPE_DEFAULT);
+ if (IS_ERR(alloc))
+ goto fail;
+ dfacl = alloc;
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ alloc = get_inode_acl(inode, ACL_TYPE_ACCESS);
+ if (IS_ERR(alloc))
+ goto fail;
+ dfacl = acl;
+ acl = alloc;
+ break;
+ }
+ }
+
+ if (acl == NULL) {
+ alloc = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+ if (IS_ERR(alloc))
+ goto fail;
+ acl = alloc;
+ }
+ status = __nfs3_proc_setacls(inode, acl, dfacl);
+out:
+ if (acl != orig)
+ posix_acl_release(acl);
+ if (dfacl != orig)
+ posix_acl_release(dfacl);
+ return status;
+
+fail:
+ status = PTR_ERR(alloc);
+ goto out;
+}
+
+static int
+nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data,
+ size_t size, ssize_t *result)
+{
+ struct posix_acl *acl;
+ char *p = data + *result;
+
+ acl = get_inode_acl(inode, type);
+ if (IS_ERR_OR_NULL(acl))
+ return 0;
+
+ posix_acl_release(acl);
+
+ *result += strlen(name);
+ *result += 1;
+ if (!size)
+ return 0;
+ if (*result > size)
+ return -ERANGE;
+
+ strcpy(p, name);
+ return 0;
+}
+
+ssize_t
+nfs3_listxattr(struct dentry *dentry, char *data, size_t size)
+{
+ struct inode *inode = d_inode(dentry);
+ ssize_t result = 0;
+ int error;
+
+ error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS,
+ XATTR_NAME_POSIX_ACL_ACCESS, data, size, &result);
+ if (error)
+ return error;
+
+ error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT,
+ XATTR_NAME_POSIX_ACL_DEFAULT, data, size, &result);
+ if (error)
+ return error;
+ return result;
+}
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
new file mode 100644
index 0000000000..674c012868
--- /dev/null
+++ b/fs/nfs/nfs3client.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/sunrpc/addr.h>
+#include "internal.h"
+#include "nfs3_fs.h"
+#include "netns.h"
+#include "sysfs.h"
+
+#ifdef CONFIG_NFS_V3_ACL
+static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
+static const struct rpc_version *nfsacl_version[] = {
+ [3] = &nfsacl_version3,
+};
+
+const struct rpc_program nfsacl_program = {
+ .name = "nfsacl",
+ .number = NFS_ACL_PROGRAM,
+ .nrvers = ARRAY_SIZE(nfsacl_version),
+ .version = nfsacl_version,
+ .stats = &nfsacl_rpcstat,
+};
+
+/*
+ * Initialise an NFSv3 ACL client connection
+ */
+static void nfs_init_server_aclclient(struct nfs_server *server)
+{
+ if (server->flags & NFS_MOUNT_NOACL)
+ goto out_noacl;
+
+ server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3);
+ if (IS_ERR(server->client_acl))
+ goto out_noacl;
+
+ nfs_sysfs_link_rpc_client(server, server->client_acl, NULL);
+
+ /* No errors! Assume that Sun nfsacls are supported */
+ server->caps |= NFS_CAP_ACLS;
+ return;
+
+out_noacl:
+ server->caps &= ~NFS_CAP_ACLS;
+}
+#else
+static inline void nfs_init_server_aclclient(struct nfs_server *server)
+{
+ server->flags &= ~NFS_MOUNT_NOACL;
+ server->caps &= ~NFS_CAP_ACLS;
+}
+#endif
+
+struct nfs_server *nfs3_create_server(struct fs_context *fc)
+{
+ struct nfs_server *server = nfs_create_server(fc);
+
+ /* Create a client RPC handle for the NFS v3 ACL management interface */
+ if (!IS_ERR(server))
+ nfs_init_server_aclclient(server);
+ return server;
+}
+
+struct nfs_server *nfs3_clone_server(struct nfs_server *source,
+ struct nfs_fh *fh,
+ struct nfs_fattr *fattr,
+ rpc_authflavor_t flavor)
+{
+ struct nfs_server *server = nfs_clone_server(source, fh, fattr, flavor);
+ if (!IS_ERR(server) && !IS_ERR(source->client_acl))
+ nfs_init_server_aclclient(server);
+ return server;
+}
+
+/*
+ * Set up a pNFS Data Server client over NFSv3.
+ *
+ * Return any existing nfs_client that matches server address,port,version
+ * and minorversion.
+ *
+ * For a new nfs_client, use a soft mount (default), a low retrans and a
+ * low timeout interval so that if a connection is lost, we retry through
+ * the MDS.
+ */
+struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
+ const struct sockaddr_storage *ds_addr, int ds_addrlen,
+ int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans)
+{
+ struct rpc_timeout ds_timeout;
+ unsigned long connect_timeout = ds_timeo * (ds_retrans + 1) * HZ / 10;
+ struct nfs_client *mds_clp = mds_srv->nfs_client;
+ struct nfs_client_initdata cl_init = {
+ .addr = ds_addr,
+ .addrlen = ds_addrlen,
+ .nodename = mds_clp->cl_rpcclient->cl_nodename,
+ .ip_addr = mds_clp->cl_ipaddr,
+ .nfs_mod = &nfs_v3,
+ .proto = ds_proto,
+ .net = mds_clp->cl_net,
+ .timeparms = &ds_timeout,
+ .cred = mds_srv->cred,
+ .xprtsec = mds_clp->cl_xprtsec,
+ .connect_timeout = connect_timeout,
+ .reconnect_timeout = connect_timeout,
+ };
+ struct nfs_client *clp;
+ char buf[INET6_ADDRSTRLEN + 1];
+
+ /* fake a hostname because lockd wants it */
+ if (rpc_ntop((struct sockaddr *)ds_addr, buf, sizeof(buf)) <= 0)
+ return ERR_PTR(-EINVAL);
+ cl_init.hostname = buf;
+
+ switch (ds_proto) {
+ case XPRT_TRANSPORT_TCP:
+ case XPRT_TRANSPORT_TCP_TLS:
+ if (mds_clp->cl_nconnect > 1)
+ cl_init.nconnect = mds_clp->cl_nconnect;
+ }
+
+ if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
+ __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+
+ __set_bit(NFS_CS_DS, &cl_init.init_flags);
+
+ /* Use the MDS nfs_client cl_ipaddr. */
+ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
+ clp = nfs_get_client(&cl_init);
+
+ return clp;
+}
+EXPORT_SYMBOL_GPL(nfs3_set_ds_client);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
new file mode 100644
index 0000000000..4bf208a0a8
--- /dev/null
+++ b/fs/nfs/nfs3proc.c
@@ -0,0 +1,1068 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/nfs3proc.c
+ *
+ * Client-side NFSv3 procedures stubs.
+ *
+ * Copyright (C) 1997, Olaf Kirch
+ */
+
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/slab.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/lockd/bind.h>
+#include <linux/nfs_mount.h>
+#include <linux/freezer.h>
+#include <linux/xattr.h>
+
+#include "iostat.h"
+#include "internal.h"
+#include "nfs3_fs.h"
+
+#define NFSDBG_FACILITY NFSDBG_PROC
+
+/* A wrapper to handle the EJUKEBOX error messages */
+static int
+nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
+{
+ int res;
+ do {
+ res = rpc_call_sync(clnt, msg, flags);
+ if (res != -EJUKEBOX)
+ break;
+ __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
+ schedule_timeout(NFS_JUKEBOX_RETRY_TIME);
+ res = -ERESTARTSYS;
+ } while (!fatal_signal_pending(current));
+ return res;
+}
+
+#define rpc_call_sync(clnt, msg, flags) nfs3_rpc_wrapper(clnt, msg, flags)
+
+static int
+nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode)
+{
+ if (task->tk_status != -EJUKEBOX)
+ return 0;
+ nfs_inc_stats(inode, NFSIOS_DELAY);
+ task->tk_status = 0;
+ rpc_restart_call(task);
+ rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
+ return 1;
+}
+
+static int
+do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs3_procedures[NFS3PROC_FSINFO],
+ .rpc_argp = fhandle,
+ .rpc_resp = info,
+ };
+ int status;
+
+ dprintk("%s: call fsinfo\n", __func__);
+ nfs_fattr_init(info->fattr);
+ status = rpc_call_sync(client, &msg, 0);
+ dprintk("%s: reply fsinfo: %d\n", __func__, status);
+ if (status == 0 && !(info->fattr->valid & NFS_ATTR_FATTR)) {
+ msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
+ msg.rpc_resp = info->fattr;
+ status = rpc_call_sync(client, &msg, 0);
+ dprintk("%s: reply getattr: %d\n", __func__, status);
+ }
+ return status;
+}
+
+/*
+ * Bare-bones access to getattr: this is for nfs_get_root/nfs_get_sb
+ */
+static int
+nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info)
+{
+ int status;
+
+ status = do_proc_get_root(server->client, fhandle, info);
+ if (status && server->nfs_client->cl_rpcclient != server->client)
+ status = do_proc_get_root(server->nfs_client->cl_rpcclient, fhandle, info);
+ return status;
+}
+
+/*
+ * One function for each procedure in the NFS protocol.
+ */
+static int
+nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct inode *inode)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR],
+ .rpc_argp = fhandle,
+ .rpc_resp = fattr,
+ };
+ int status;
+ unsigned short task_flags = 0;
+
+ /* Is this is an attribute revalidation, subject to softreval? */
+ if (inode && (server->flags & NFS_MOUNT_SOFTREVAL))
+ task_flags |= RPC_TASK_TIMEOUT;
+
+ dprintk("NFS call getattr\n");
+ nfs_fattr_init(fattr);
+ status = rpc_call_sync(server->client, &msg, task_flags);
+ dprintk("NFS reply getattr: %d\n", status);
+ return status;
+}
+
+static int
+nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
+ struct iattr *sattr)
+{
+ struct inode *inode = d_inode(dentry);
+ struct nfs3_sattrargs arg = {
+ .fh = NFS_FH(inode),
+ .sattr = sattr,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs3_procedures[NFS3PROC_SETATTR],
+ .rpc_argp = &arg,
+ .rpc_resp = fattr,
+ };
+ int status;
+
+ dprintk("NFS call setattr\n");
+ if (sattr->ia_valid & ATTR_FILE)
+ msg.rpc_cred = nfs_file_cred(sattr->ia_file);
+ nfs_fattr_init(fattr);
+ status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+ if (status == 0) {
+ nfs_setattr_update_inode(inode, sattr, fattr);
+ if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+ nfs_zap_acl_cache(inode);
+ }
+ dprintk("NFS reply setattr: %d\n", status);
+ return status;
+}
+
+static int
+__nfs3_proc_lookup(struct inode *dir, const char *name, size_t len,
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+ unsigned short task_flags)
+{
+ struct nfs3_diropargs arg = {
+ .fh = NFS_FH(dir),
+ .name = name,
+ .len = len
+ };
+ struct nfs3_diropres res = {
+ .fh = fhandle,
+ .fattr = fattr
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs3_procedures[NFS3PROC_LOOKUP],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ int status;
+
+ res.dir_attr = nfs_alloc_fattr();
+ if (res.dir_attr == NULL)
+ return -ENOMEM;
+
+ nfs_fattr_init(fattr);
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, task_flags);
+ nfs_refresh_inode(dir, res.dir_attr);
+ if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) {
+ msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
+ msg.rpc_argp = fhandle;
+ msg.rpc_resp = fattr;
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, task_flags);
+ }
+ nfs_free_fattr(res.dir_attr);
+ dprintk("NFS reply lookup: %d\n", status);
+ return status;
+}
+
+static int
+nfs3_proc_lookup(struct inode *dir, struct dentry *dentry,
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+{
+ unsigned short task_flags = 0;
+
+ /* Is this is an attribute revalidation, subject to softreval? */
+ if (nfs_lookup_is_soft_revalidate(dentry))
+ task_flags |= RPC_TASK_TIMEOUT;
+
+ dprintk("NFS call lookup %pd2\n", dentry);
+ return __nfs3_proc_lookup(dir, dentry->d_name.name,
+ dentry->d_name.len, fhandle, fattr,
+ task_flags);
+}
+
+static int nfs3_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr)
+{
+ const char dotdot[] = "..";
+ const size_t len = strlen(dotdot);
+ unsigned short task_flags = 0;
+
+ if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)
+ task_flags |= RPC_TASK_TIMEOUT;
+
+ return __nfs3_proc_lookup(inode, dotdot, len, fhandle, fattr,
+ task_flags);
+}
+
+static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry,
+ const struct cred *cred)
+{
+ struct nfs3_accessargs arg = {
+ .fh = NFS_FH(inode),
+ .access = entry->mask,
+ };
+ struct nfs3_accessres res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
+ };
+ int status = -ENOMEM;
+
+ dprintk("NFS call access\n");
+ res.fattr = nfs_alloc_fattr();
+ if (res.fattr == NULL)
+ goto out;
+
+ status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+ nfs_refresh_inode(inode, res.fattr);
+ if (status == 0)
+ nfs_access_set_mask(entry, res.access);
+ nfs_free_fattr(res.fattr);
+out:
+ dprintk("NFS reply access: %d\n", status);
+ return status;
+}
+
+static int nfs3_proc_readlink(struct inode *inode, struct page *page,
+ unsigned int pgbase, unsigned int pglen)
+{
+ struct nfs_fattr *fattr;
+ struct nfs3_readlinkargs args = {
+ .fh = NFS_FH(inode),
+ .pgbase = pgbase,
+ .pglen = pglen,
+ .pages = &page
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK],
+ .rpc_argp = &args,
+ };
+ int status = -ENOMEM;
+
+ dprintk("NFS call readlink\n");
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ goto out;
+ msg.rpc_resp = fattr;
+
+ status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+ nfs_refresh_inode(inode, fattr);
+ nfs_free_fattr(fattr);
+out:
+ dprintk("NFS reply readlink: %d\n", status);
+ return status;
+}
+
+struct nfs3_createdata {
+ struct rpc_message msg;
+ union {
+ struct nfs3_createargs create;
+ struct nfs3_mkdirargs mkdir;
+ struct nfs3_symlinkargs symlink;
+ struct nfs3_mknodargs mknod;
+ } arg;
+ struct nfs3_diropres res;
+ struct nfs_fh fh;
+ struct nfs_fattr fattr;
+ struct nfs_fattr dir_attr;
+};
+
+static struct nfs3_createdata *nfs3_alloc_createdata(void)
+{
+ struct nfs3_createdata *data;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (data != NULL) {
+ data->msg.rpc_argp = &data->arg;
+ data->msg.rpc_resp = &data->res;
+ data->res.fh = &data->fh;
+ data->res.fattr = &data->fattr;
+ data->res.dir_attr = &data->dir_attr;
+ nfs_fattr_init(data->res.fattr);
+ nfs_fattr_init(data->res.dir_attr);
+ }
+ return data;
+}
+
+static struct dentry *
+nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
+{
+ int status;
+
+ status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
+ nfs_post_op_update_inode(dir, data->res.dir_attr);
+ if (status != 0)
+ return ERR_PTR(status);
+
+ return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr);
+}
+
+static void nfs3_free_createdata(struct nfs3_createdata *data)
+{
+ kfree(data);
+}
+
+/*
+ * Create a regular file.
+ */
+static int
+nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+ int flags)
+{
+ struct posix_acl *default_acl, *acl;
+ struct nfs3_createdata *data;
+ struct dentry *d_alias;
+ int status = -ENOMEM;
+
+ dprintk("NFS call create %pd\n", dentry);
+
+ data = nfs3_alloc_createdata();
+ if (data == NULL)
+ goto out;
+
+ data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_CREATE];
+ data->arg.create.fh = NFS_FH(dir);
+ data->arg.create.name = dentry->d_name.name;
+ data->arg.create.len = dentry->d_name.len;
+ data->arg.create.sattr = sattr;
+
+ data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
+ if (flags & O_EXCL) {
+ data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE;
+ data->arg.create.verifier[0] = cpu_to_be32(jiffies);
+ data->arg.create.verifier[1] = cpu_to_be32(current->pid);
+ }
+
+ status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
+ if (status)
+ goto out;
+
+ for (;;) {
+ d_alias = nfs3_do_create(dir, dentry, data);
+ status = PTR_ERR_OR_ZERO(d_alias);
+
+ if (status != -ENOTSUPP)
+ break;
+ /* If the server doesn't support the exclusive creation
+ * semantics, try again with simple 'guarded' mode. */
+ switch (data->arg.create.createmode) {
+ case NFS3_CREATE_EXCLUSIVE:
+ data->arg.create.createmode = NFS3_CREATE_GUARDED;
+ break;
+
+ case NFS3_CREATE_GUARDED:
+ data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
+ break;
+
+ case NFS3_CREATE_UNCHECKED:
+ goto out_release_acls;
+ }
+ nfs_fattr_init(data->res.dir_attr);
+ nfs_fattr_init(data->res.fattr);
+ }
+
+ if (status != 0)
+ goto out_release_acls;
+
+ if (d_alias)
+ dentry = d_alias;
+
+ /* When we created the file with exclusive semantics, make
+ * sure we set the attributes afterwards. */
+ if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) {
+ dprintk("NFS call setattr (post-create)\n");
+
+ if (!(sattr->ia_valid & ATTR_ATIME_SET))
+ sattr->ia_valid |= ATTR_ATIME;
+ if (!(sattr->ia_valid & ATTR_MTIME_SET))
+ sattr->ia_valid |= ATTR_MTIME;
+
+ /* Note: we could use a guarded setattr here, but I'm
+ * not sure this buys us anything (and I'd have
+ * to revamp the NFSv3 XDR code) */
+ status = nfs3_proc_setattr(dentry, data->res.fattr, sattr);
+ nfs_post_op_update_inode(d_inode(dentry), data->res.fattr);
+ dprintk("NFS reply setattr (post-create): %d\n", status);
+ if (status != 0)
+ goto out_dput;
+ }
+
+ status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
+
+out_dput:
+ dput(d_alias);
+out_release_acls:
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
+out:
+ nfs3_free_createdata(data);
+ dprintk("NFS reply create: %d\n", status);
+ return status;
+}
+
+static int
+nfs3_proc_remove(struct inode *dir, struct dentry *dentry)
+{
+ struct nfs_removeargs arg = {
+ .fh = NFS_FH(dir),
+ .name = dentry->d_name,
+ };
+ struct nfs_removeres res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ int status = -ENOMEM;
+
+ dprintk("NFS call remove %pd2\n", dentry);
+ res.dir_attr = nfs_alloc_fattr();
+ if (res.dir_attr == NULL)
+ goto out;
+
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+ nfs_post_op_update_inode(dir, res.dir_attr);
+ nfs_free_fattr(res.dir_attr);
+out:
+ dprintk("NFS reply remove: %d\n", status);
+ return status;
+}
+
+static void
+nfs3_proc_unlink_setup(struct rpc_message *msg,
+ struct dentry *dentry,
+ struct inode *inode)
+{
+ msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE];
+}
+
+static void nfs3_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
+{
+ rpc_call_start(task);
+}
+
+static int
+nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
+{
+ struct nfs_removeres *res;
+ if (nfs3_async_handle_jukebox(task, dir))
+ return 0;
+ res = task->tk_msg.rpc_resp;
+ nfs_post_op_update_inode(dir, res->dir_attr);
+ return 1;
+}
+
+static void
+nfs3_proc_rename_setup(struct rpc_message *msg,
+ struct dentry *old_dentry,
+ struct dentry *new_dentry)
+{
+ msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME];
+}
+
+static void nfs3_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
+{
+ rpc_call_start(task);
+}
+
+static int
+nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+ struct inode *new_dir)
+{
+ struct nfs_renameres *res;
+
+ if (nfs3_async_handle_jukebox(task, old_dir))
+ return 0;
+ res = task->tk_msg.rpc_resp;
+
+ nfs_post_op_update_inode(old_dir, res->old_fattr);
+ nfs_post_op_update_inode(new_dir, res->new_fattr);
+ return 1;
+}
+
+static int
+nfs3_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
+{
+ struct nfs3_linkargs arg = {
+ .fromfh = NFS_FH(inode),
+ .tofh = NFS_FH(dir),
+ .toname = name->name,
+ .tolen = name->len
+ };
+ struct nfs3_linkres res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs3_procedures[NFS3PROC_LINK],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ int status = -ENOMEM;
+
+ dprintk("NFS call link %s\n", name->name);
+ res.fattr = nfs_alloc_fattr();
+ res.dir_attr = nfs_alloc_fattr();
+ if (res.fattr == NULL || res.dir_attr == NULL)
+ goto out;
+
+ status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+ nfs_post_op_update_inode(dir, res.dir_attr);
+ nfs_post_op_update_inode(inode, res.fattr);
+out:
+ nfs_free_fattr(res.dir_attr);
+ nfs_free_fattr(res.fattr);
+ dprintk("NFS reply link: %d\n", status);
+ return status;
+}
+
+static int
+nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
+ unsigned int len, struct iattr *sattr)
+{
+ struct nfs3_createdata *data;
+ struct dentry *d_alias;
+ int status = -ENOMEM;
+
+ if (len > NFS3_MAXPATHLEN)
+ return -ENAMETOOLONG;
+
+ dprintk("NFS call symlink %pd\n", dentry);
+
+ data = nfs3_alloc_createdata();
+ if (data == NULL)
+ goto out;
+ data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK];
+ data->arg.symlink.fromfh = NFS_FH(dir);
+ data->arg.symlink.fromname = dentry->d_name.name;
+ data->arg.symlink.fromlen = dentry->d_name.len;
+ data->arg.symlink.pages = &page;
+ data->arg.symlink.pathlen = len;
+ data->arg.symlink.sattr = sattr;
+
+ d_alias = nfs3_do_create(dir, dentry, data);
+ status = PTR_ERR_OR_ZERO(d_alias);
+
+ if (status == 0)
+ dput(d_alias);
+
+ nfs3_free_createdata(data);
+out:
+ dprintk("NFS reply symlink: %d\n", status);
+ return status;
+}
+
+static int
+nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
+{
+ struct posix_acl *default_acl, *acl;
+ struct nfs3_createdata *data;
+ struct dentry *d_alias;
+ int status = -ENOMEM;
+
+ dprintk("NFS call mkdir %pd\n", dentry);
+
+ data = nfs3_alloc_createdata();
+ if (data == NULL)
+ goto out;
+
+ status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
+ if (status)
+ goto out;
+
+ data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR];
+ data->arg.mkdir.fh = NFS_FH(dir);
+ data->arg.mkdir.name = dentry->d_name.name;
+ data->arg.mkdir.len = dentry->d_name.len;
+ data->arg.mkdir.sattr = sattr;
+
+ d_alias = nfs3_do_create(dir, dentry, data);
+ status = PTR_ERR_OR_ZERO(d_alias);
+
+ if (status != 0)
+ goto out_release_acls;
+
+ if (d_alias)
+ dentry = d_alias;
+
+ status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
+
+ dput(d_alias);
+out_release_acls:
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
+out:
+ nfs3_free_createdata(data);
+ dprintk("NFS reply mkdir: %d\n", status);
+ return status;
+}
+
+static int
+nfs3_proc_rmdir(struct inode *dir, const struct qstr *name)
+{
+ struct nfs_fattr *dir_attr;
+ struct nfs3_diropargs arg = {
+ .fh = NFS_FH(dir),
+ .name = name->name,
+ .len = name->len
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR],
+ .rpc_argp = &arg,
+ };
+ int status = -ENOMEM;
+
+ dprintk("NFS call rmdir %s\n", name->name);
+ dir_attr = nfs_alloc_fattr();
+ if (dir_attr == NULL)
+ goto out;
+
+ msg.rpc_resp = dir_attr;
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+ nfs_post_op_update_inode(dir, dir_attr);
+ nfs_free_fattr(dir_attr);
+out:
+ dprintk("NFS reply rmdir: %d\n", status);
+ return status;
+}
+
+/*
+ * The READDIR implementation is somewhat hackish - we pass the user buffer
+ * to the encode function, which installs it in the receive iovec.
+ * The decode function itself doesn't perform any decoding, it just makes
+ * sure the reply is syntactically correct.
+ *
+ * Also note that this implementation handles both plain readdir and
+ * readdirplus.
+ */
+static int nfs3_proc_readdir(struct nfs_readdir_arg *nr_arg,
+ struct nfs_readdir_res *nr_res)
+{
+ struct inode *dir = d_inode(nr_arg->dentry);
+ struct nfs3_readdirargs arg = {
+ .fh = NFS_FH(dir),
+ .cookie = nr_arg->cookie,
+ .plus = nr_arg->plus,
+ .count = nr_arg->page_len,
+ .pages = nr_arg->pages
+ };
+ struct nfs3_readdirres res = {
+ .verf = nr_res->verf,
+ .plus = nr_arg->plus,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs3_procedures[NFS3PROC_READDIR],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ .rpc_cred = nr_arg->cred,
+ };
+ int status = -ENOMEM;
+
+ if (nr_arg->plus)
+ msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS];
+ if (arg.cookie)
+ memcpy(arg.verf, nr_arg->verf, sizeof(arg.verf));
+
+ dprintk("NFS call readdir%s %llu\n", nr_arg->plus ? "plus" : "",
+ (unsigned long long)nr_arg->cookie);
+
+ res.dir_attr = nfs_alloc_fattr();
+ if (res.dir_attr == NULL)
+ goto out;
+
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+
+ nfs_invalidate_atime(dir);
+ nfs_refresh_inode(dir, res.dir_attr);
+
+ nfs_free_fattr(res.dir_attr);
+out:
+ dprintk("NFS reply readdir%s: %d\n", nr_arg->plus ? "plus" : "",
+ status);
+ return status;
+}
+
+static int
+nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+ dev_t rdev)
+{
+ struct posix_acl *default_acl, *acl;
+ struct nfs3_createdata *data;
+ struct dentry *d_alias;
+ int status = -ENOMEM;
+
+ dprintk("NFS call mknod %pd %u:%u\n", dentry,
+ MAJOR(rdev), MINOR(rdev));
+
+ data = nfs3_alloc_createdata();
+ if (data == NULL)
+ goto out;
+
+ status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
+ if (status)
+ goto out;
+
+ data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD];
+ data->arg.mknod.fh = NFS_FH(dir);
+ data->arg.mknod.name = dentry->d_name.name;
+ data->arg.mknod.len = dentry->d_name.len;
+ data->arg.mknod.sattr = sattr;
+ data->arg.mknod.rdev = rdev;
+
+ switch (sattr->ia_mode & S_IFMT) {
+ case S_IFBLK:
+ data->arg.mknod.type = NF3BLK;
+ break;
+ case S_IFCHR:
+ data->arg.mknod.type = NF3CHR;
+ break;
+ case S_IFIFO:
+ data->arg.mknod.type = NF3FIFO;
+ break;
+ case S_IFSOCK:
+ data->arg.mknod.type = NF3SOCK;
+ break;
+ default:
+ status = -EINVAL;
+ goto out_release_acls;
+ }
+
+ d_alias = nfs3_do_create(dir, dentry, data);
+ status = PTR_ERR_OR_ZERO(d_alias);
+ if (status != 0)
+ goto out_release_acls;
+
+ if (d_alias)
+ dentry = d_alias;
+
+ status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
+
+ dput(d_alias);
+out_release_acls:
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
+out:
+ nfs3_free_createdata(data);
+ dprintk("NFS reply mknod: %d\n", status);
+ return status;
+}
+
+static int
+nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsstat *stat)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs3_procedures[NFS3PROC_FSSTAT],
+ .rpc_argp = fhandle,
+ .rpc_resp = stat,
+ };
+ int status;
+
+ dprintk("NFS call fsstat\n");
+ nfs_fattr_init(stat->fattr);
+ status = rpc_call_sync(server->client, &msg, 0);
+ dprintk("NFS reply fsstat: %d\n", status);
+ return status;
+}
+
+static int
+do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs3_procedures[NFS3PROC_FSINFO],
+ .rpc_argp = fhandle,
+ .rpc_resp = info,
+ };
+ int status;
+
+ dprintk("NFS call fsinfo\n");
+ nfs_fattr_init(info->fattr);
+ status = rpc_call_sync(client, &msg, 0);
+ dprintk("NFS reply fsinfo: %d\n", status);
+ return status;
+}
+
+/*
+ * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via
+ * nfs_create_server
+ */
+static int
+nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info)
+{
+ int status;
+
+ status = do_proc_fsinfo(server->client, fhandle, info);
+ if (status && server->nfs_client->cl_rpcclient != server->client)
+ status = do_proc_fsinfo(server->nfs_client->cl_rpcclient, fhandle, info);
+ return status;
+}
+
+static int
+nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_pathconf *info)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs3_procedures[NFS3PROC_PATHCONF],
+ .rpc_argp = fhandle,
+ .rpc_resp = info,
+ };
+ int status;
+
+ dprintk("NFS call pathconf\n");
+ nfs_fattr_init(info->fattr);
+ status = rpc_call_sync(server->client, &msg, 0);
+ dprintk("NFS reply pathconf: %d\n", status);
+ return status;
+}
+
+static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+ struct inode *inode = hdr->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+
+ if (hdr->pgio_done_cb != NULL)
+ return hdr->pgio_done_cb(task, hdr);
+
+ if (nfs3_async_handle_jukebox(task, inode))
+ return -EAGAIN;
+
+ if (task->tk_status >= 0 && !server->read_hdrsize)
+ cmpxchg(&server->read_hdrsize, 0, hdr->res.replen);
+
+ nfs_invalidate_atime(inode);
+ nfs_refresh_inode(inode, &hdr->fattr);
+ return 0;
+}
+
+static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
+{
+ msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
+ hdr->args.replen = NFS_SERVER(hdr->inode)->read_hdrsize;
+}
+
+static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ rpc_call_start(task);
+ return 0;
+}
+
+static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+ struct inode *inode = hdr->inode;
+
+ if (hdr->pgio_done_cb != NULL)
+ return hdr->pgio_done_cb(task, hdr);
+
+ if (nfs3_async_handle_jukebox(task, inode))
+ return -EAGAIN;
+ if (task->tk_status >= 0)
+ nfs_writeback_update_inode(hdr);
+ return 0;
+}
+
+static void nfs3_proc_write_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg,
+ struct rpc_clnt **clnt)
+{
+ msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
+}
+
+static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
+{
+ rpc_call_start(task);
+}
+
+static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data)
+{
+ if (data->commit_done_cb != NULL)
+ return data->commit_done_cb(task, data);
+
+ if (nfs3_async_handle_jukebox(task, data->inode))
+ return -EAGAIN;
+ nfs_refresh_inode(data->inode, data->res.fattr);
+ return 0;
+}
+
+static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg,
+ struct rpc_clnt **clnt)
+{
+ msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT];
+}
+
+static void nfs3_nlm_alloc_call(void *data)
+{
+ struct nfs_lock_context *l_ctx = data;
+ if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) {
+ get_nfs_open_context(l_ctx->open_context);
+ nfs_get_lock_context(l_ctx->open_context);
+ }
+}
+
+static bool nfs3_nlm_unlock_prepare(struct rpc_task *task, void *data)
+{
+ struct nfs_lock_context *l_ctx = data;
+ if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags))
+ return nfs_async_iocounter_wait(task, l_ctx);
+ return false;
+
+}
+
+static void nfs3_nlm_release_call(void *data)
+{
+ struct nfs_lock_context *l_ctx = data;
+ struct nfs_open_context *ctx;
+ if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) {
+ ctx = l_ctx->open_context;
+ nfs_put_lock_context(l_ctx);
+ put_nfs_open_context(ctx);
+ }
+}
+
+static const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = {
+ .nlmclnt_alloc_call = nfs3_nlm_alloc_call,
+ .nlmclnt_unlock_prepare = nfs3_nlm_unlock_prepare,
+ .nlmclnt_release_call = nfs3_nlm_release_call,
+};
+
+static int
+nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+ struct inode *inode = file_inode(filp);
+ struct nfs_lock_context *l_ctx = NULL;
+ struct nfs_open_context *ctx = nfs_file_open_context(filp);
+ int status;
+
+ if (fl->fl_flags & FL_CLOSE) {
+ l_ctx = nfs_get_lock_context(ctx);
+ if (IS_ERR(l_ctx))
+ l_ctx = NULL;
+ else
+ set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags);
+ }
+
+ status = nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, l_ctx);
+
+ if (l_ctx)
+ nfs_put_lock_context(l_ctx);
+
+ return status;
+}
+
+static int nfs3_have_delegation(struct inode *inode, fmode_t flags)
+{
+ return 0;
+}
+
+static const struct inode_operations nfs3_dir_inode_operations = {
+ .create = nfs_create,
+ .lookup = nfs_lookup,
+ .link = nfs_link,
+ .unlink = nfs_unlink,
+ .symlink = nfs_symlink,
+ .mkdir = nfs_mkdir,
+ .rmdir = nfs_rmdir,
+ .mknod = nfs_mknod,
+ .rename = nfs_rename,
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+#ifdef CONFIG_NFS_V3_ACL
+ .listxattr = nfs3_listxattr,
+ .get_inode_acl = nfs3_get_acl,
+ .set_acl = nfs3_set_acl,
+#endif
+};
+
+static const struct inode_operations nfs3_file_inode_operations = {
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+#ifdef CONFIG_NFS_V3_ACL
+ .listxattr = nfs3_listxattr,
+ .get_inode_acl = nfs3_get_acl,
+ .set_acl = nfs3_set_acl,
+#endif
+};
+
+const struct nfs_rpc_ops nfs_v3_clientops = {
+ .version = 3, /* protocol version */
+ .dentry_ops = &nfs_dentry_operations,
+ .dir_inode_ops = &nfs3_dir_inode_operations,
+ .file_inode_ops = &nfs3_file_inode_operations,
+ .file_ops = &nfs_file_operations,
+ .nlmclnt_ops = &nlmclnt_fl_close_lock_ops,
+ .getroot = nfs3_proc_get_root,
+ .submount = nfs_submount,
+ .try_get_tree = nfs_try_get_tree,
+ .getattr = nfs3_proc_getattr,
+ .setattr = nfs3_proc_setattr,
+ .lookup = nfs3_proc_lookup,
+ .lookupp = nfs3_proc_lookupp,
+ .access = nfs3_proc_access,
+ .readlink = nfs3_proc_readlink,
+ .create = nfs3_proc_create,
+ .remove = nfs3_proc_remove,
+ .unlink_setup = nfs3_proc_unlink_setup,
+ .unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare,
+ .unlink_done = nfs3_proc_unlink_done,
+ .rename_setup = nfs3_proc_rename_setup,
+ .rename_rpc_prepare = nfs3_proc_rename_rpc_prepare,
+ .rename_done = nfs3_proc_rename_done,
+ .link = nfs3_proc_link,
+ .symlink = nfs3_proc_symlink,
+ .mkdir = nfs3_proc_mkdir,
+ .rmdir = nfs3_proc_rmdir,
+ .readdir = nfs3_proc_readdir,
+ .mknod = nfs3_proc_mknod,
+ .statfs = nfs3_proc_statfs,
+ .fsinfo = nfs3_proc_fsinfo,
+ .pathconf = nfs3_proc_pathconf,
+ .decode_dirent = nfs3_decode_dirent,
+ .pgio_rpc_prepare = nfs3_proc_pgio_rpc_prepare,
+ .read_setup = nfs3_proc_read_setup,
+ .read_done = nfs3_read_done,
+ .write_setup = nfs3_proc_write_setup,
+ .write_done = nfs3_write_done,
+ .commit_setup = nfs3_proc_commit_setup,
+ .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare,
+ .commit_done = nfs3_commit_done,
+ .lock = nfs3_proc_lock,
+ .clear_acl_cache = forget_all_cached_acls,
+ .close_context = nfs_close_context,
+ .have_delegation = nfs3_have_delegation,
+ .alloc_client = nfs_alloc_client,
+ .init_client = nfs_init_client,
+ .free_client = nfs_free_client,
+ .create_server = nfs3_create_server,
+ .clone_server = nfs3_clone_server,
+};
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
new file mode 100644
index 0000000000..8a9be9e47f
--- /dev/null
+++ b/fs/nfs/nfs3super.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2012 Netapp, Inc. All rights reserved.
+ */
+#include <linux/module.h>
+#include <linux/nfs_fs.h>
+#include "internal.h"
+#include "nfs3_fs.h"
+#include "nfs.h"
+
+struct nfs_subversion nfs_v3 = {
+ .owner = THIS_MODULE,
+ .nfs_fs = &nfs_fs_type,
+ .rpc_vers = &nfs_version3,
+ .rpc_ops = &nfs_v3_clientops,
+ .sops = &nfs_sops,
+};
+
+static int __init init_nfs_v3(void)
+{
+ register_nfs_version(&nfs_v3);
+ return 0;
+}
+
+static void __exit exit_nfs_v3(void)
+{
+ unregister_nfs_version(&nfs_v3);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_nfs_v3);
+module_exit(exit_nfs_v3);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
new file mode 100644
index 0000000000..60f032be80
--- /dev/null
+++ b/fs/nfs/nfs3xdr.c
@@ -0,0 +1,2581 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/nfs3xdr.c
+ *
+ * XDR functions to encode/decode NFSv3 RPC arguments and results.
+ *
+ * Copyright (C) 1996, 1997 Olaf Kirch
+ */
+
+#include <linux/param.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/kdev_t.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfsacl.h>
+#include "nfstrace.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_XDR
+
+/* Mapping from NFS error code to "errno" error code. */
+#define errno_NFSERR_IO EIO
+
+/*
+ * Declare the space requirements for NFS arguments and replies as
+ * number of 32bit-words
+ */
+#define NFS3_pagepad_sz (1) /* Page padding */
+#define NFS3_fhandle_sz (1+16)
+#define NFS3_fh_sz (NFS3_fhandle_sz) /* shorthand */
+#define NFS3_post_op_fh_sz (1+NFS3_fh_sz)
+#define NFS3_sattr_sz (15)
+#define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2))
+#define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2))
+#define NFS3_fattr_sz (21)
+#define NFS3_cookieverf_sz (NFS3_COOKIEVERFSIZE>>2)
+#define NFS3_wcc_attr_sz (6)
+#define NFS3_pre_op_attr_sz (1+NFS3_wcc_attr_sz)
+#define NFS3_post_op_attr_sz (1+NFS3_fattr_sz)
+#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz)
+#define NFS3_diropargs_sz (NFS3_fh_sz+NFS3_filename_sz)
+
+#define NFS3_getattrargs_sz (NFS3_fh_sz)
+#define NFS3_setattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3)
+#define NFS3_lookupargs_sz (NFS3_fh_sz+NFS3_filename_sz)
+#define NFS3_accessargs_sz (NFS3_fh_sz+1)
+#define NFS3_readlinkargs_sz (NFS3_fh_sz)
+#define NFS3_readargs_sz (NFS3_fh_sz+3)
+#define NFS3_writeargs_sz (NFS3_fh_sz+5)
+#define NFS3_createargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz)
+#define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz)
+#define NFS3_symlinkargs_sz (NFS3_diropargs_sz+1+NFS3_sattr_sz)
+#define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz)
+#define NFS3_removeargs_sz (NFS3_fh_sz+NFS3_filename_sz)
+#define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz)
+#define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz)
+#define NFS3_readdirargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+3)
+#define NFS3_readdirplusargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+4)
+#define NFS3_commitargs_sz (NFS3_fh_sz+3)
+
+#define NFS3_getattrres_sz (1+NFS3_fattr_sz)
+#define NFS3_setattrres_sz (1+NFS3_wcc_data_sz)
+#define NFS3_removeres_sz (NFS3_setattrres_sz)
+#define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
+#define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1)
+#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+NFS3_pagepad_sz)
+#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+NFS3_pagepad_sz)
+#define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4)
+#define NFS3_createres_sz (1+NFS3_post_op_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
+#define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz))
+#define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
+#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+NFS3_pagepad_sz)
+#define NFS3_fsstatres_sz (1+NFS3_post_op_attr_sz+13)
+#define NFS3_fsinfores_sz (1+NFS3_post_op_attr_sz+12)
+#define NFS3_pathconfres_sz (1+NFS3_post_op_attr_sz+6)
+#define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2)
+
+#define ACL3_getaclargs_sz (NFS3_fh_sz+1)
+#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \
+ XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
+#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \
+ XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+\
+ NFS3_pagepad_sz)
+#define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz)
+
+static int nfs3_stat_to_errno(enum nfs_stat);
+
+/*
+ * Map file type to S_IFMT bits
+ */
+static const umode_t nfs_type2fmt[] = {
+ [NF3BAD] = 0,
+ [NF3REG] = S_IFREG,
+ [NF3DIR] = S_IFDIR,
+ [NF3BLK] = S_IFBLK,
+ [NF3CHR] = S_IFCHR,
+ [NF3LNK] = S_IFLNK,
+ [NF3SOCK] = S_IFSOCK,
+ [NF3FIFO] = S_IFIFO,
+};
+
+static struct user_namespace *rpc_userns(const struct rpc_clnt *clnt)
+{
+ if (clnt && clnt->cl_cred)
+ return clnt->cl_cred->user_ns;
+ return &init_user_ns;
+}
+
+static struct user_namespace *rpc_rqst_userns(const struct rpc_rqst *rqstp)
+{
+ if (rqstp->rq_task)
+ return rpc_userns(rqstp->rq_task->tk_client);
+ return &init_user_ns;
+}
+
+/*
+ * Encode/decode NFSv3 basic data types
+ *
+ * Basic NFSv3 data types are defined in section 2.5 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions. For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+
+static void encode_uint32(struct xdr_stream *xdr, u32 value)
+{
+ __be32 *p = xdr_reserve_space(xdr, 4);
+ *p = cpu_to_be32(value);
+}
+
+static int decode_uint32(struct xdr_stream *xdr, u32 *value)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *value = be32_to_cpup(p);
+ return 0;
+}
+
+static int decode_uint64(struct xdr_stream *xdr, u64 *value)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_hyper(p, value);
+ return 0;
+}
+
+/*
+ * fileid3
+ *
+ * typedef uint64 fileid3;
+ */
+static __be32 *xdr_decode_fileid3(__be32 *p, u64 *fileid)
+{
+ return xdr_decode_hyper(p, fileid);
+}
+
+static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid)
+{
+ return decode_uint64(xdr, fileid);
+}
+
+/*
+ * filename3
+ *
+ * typedef string filename3<>;
+ */
+static void encode_filename3(struct xdr_stream *xdr,
+ const char *name, u32 length)
+{
+ __be32 *p;
+
+ WARN_ON_ONCE(length > NFS3_MAXNAMLEN);
+ p = xdr_reserve_space(xdr, 4 + length);
+ xdr_encode_opaque(p, name, length);
+}
+
+static int decode_inline_filename3(struct xdr_stream *xdr,
+ const char **name, u32 *length)
+{
+ __be32 *p;
+ u32 count;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ count = be32_to_cpup(p);
+ if (count > NFS3_MAXNAMLEN)
+ goto out_nametoolong;
+ p = xdr_inline_decode(xdr, count);
+ if (unlikely(!p))
+ return -EIO;
+ *name = (const char *)p;
+ *length = count;
+ return 0;
+
+out_nametoolong:
+ dprintk("NFS: returned filename too long: %u\n", count);
+ return -ENAMETOOLONG;
+}
+
+/*
+ * nfspath3
+ *
+ * typedef string nfspath3<>;
+ */
+static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages,
+ const u32 length)
+{
+ encode_uint32(xdr, length);
+ xdr_write_pages(xdr, pages, 0, length);
+}
+
+static int decode_nfspath3(struct xdr_stream *xdr)
+{
+ u32 recvd, count;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ count = be32_to_cpup(p);
+ if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN))
+ goto out_nametoolong;
+ recvd = xdr_read_pages(xdr, count);
+ if (unlikely(count > recvd))
+ goto out_cheating;
+ xdr_terminate_string(xdr->buf, count);
+ return 0;
+
+out_nametoolong:
+ dprintk("NFS: returned pathname too long: %u\n", count);
+ return -ENAMETOOLONG;
+out_cheating:
+ dprintk("NFS: server cheating in pathname result: "
+ "count %u > recvd %u\n", count, recvd);
+ return -EIO;
+}
+
+/*
+ * cookie3
+ *
+ * typedef uint64 cookie3
+ */
+static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie)
+{
+ return xdr_encode_hyper(p, cookie);
+}
+
+static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie)
+{
+ return decode_uint64(xdr, cookie);
+}
+
+/*
+ * cookieverf3
+ *
+ * typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE];
+ */
+static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier)
+{
+ memcpy(p, verifier, NFS3_COOKIEVERFSIZE);
+ return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE);
+}
+
+static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
+ if (unlikely(!p))
+ return -EIO;
+ memcpy(verifier, p, NFS3_COOKIEVERFSIZE);
+ return 0;
+}
+
+/*
+ * createverf3
+ *
+ * typedef opaque createverf3[NFS3_CREATEVERFSIZE];
+ */
+static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE);
+ memcpy(p, verifier, NFS3_CREATEVERFSIZE);
+}
+
+static int decode_writeverf3(struct xdr_stream *xdr, struct nfs_write_verifier *verifier)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE);
+ if (unlikely(!p))
+ return -EIO;
+ memcpy(verifier->data, p, NFS3_WRITEVERFSIZE);
+ return 0;
+}
+
+/*
+ * size3
+ *
+ * typedef uint64 size3;
+ */
+static __be32 *xdr_decode_size3(__be32 *p, u64 *size)
+{
+ return xdr_decode_hyper(p, size);
+}
+
+/*
+ * nfsstat3
+ *
+ * enum nfsstat3 {
+ * NFS3_OK = 0,
+ * ...
+ * }
+ */
+#define NFS3_OK NFS_OK
+
+static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ if (unlikely(*p != cpu_to_be32(NFS3_OK)))
+ goto out_status;
+ *status = 0;
+ return 0;
+out_status:
+ *status = be32_to_cpup(p);
+ trace_nfs_xdr_status(xdr, (int)*status);
+ return 0;
+}
+
+/*
+ * ftype3
+ *
+ * enum ftype3 {
+ * NF3REG = 1,
+ * NF3DIR = 2,
+ * NF3BLK = 3,
+ * NF3CHR = 4,
+ * NF3LNK = 5,
+ * NF3SOCK = 6,
+ * NF3FIFO = 7
+ * };
+ */
+static void encode_ftype3(struct xdr_stream *xdr, const u32 type)
+{
+ encode_uint32(xdr, type);
+}
+
+static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode)
+{
+ u32 type;
+
+ type = be32_to_cpup(p++);
+ if (type > NF3FIFO)
+ type = NF3NON;
+ *mode = nfs_type2fmt[type];
+ return p;
+}
+
+/*
+ * specdata3
+ *
+ * struct specdata3 {
+ * uint32 specdata1;
+ * uint32 specdata2;
+ * };
+ */
+static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 8);
+ *p++ = cpu_to_be32(MAJOR(rdev));
+ *p = cpu_to_be32(MINOR(rdev));
+}
+
+static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev)
+{
+ unsigned int major, minor;
+
+ major = be32_to_cpup(p++);
+ minor = be32_to_cpup(p++);
+ *rdev = MKDEV(major, minor);
+ if (MAJOR(*rdev) != major || MINOR(*rdev) != minor)
+ *rdev = 0;
+ return p;
+}
+
+/*
+ * nfs_fh3
+ *
+ * struct nfs_fh3 {
+ * opaque data<NFS3_FHSIZE>;
+ * };
+ */
+static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+ __be32 *p;
+
+ WARN_ON_ONCE(fh->size > NFS3_FHSIZE);
+ p = xdr_reserve_space(xdr, 4 + fh->size);
+ xdr_encode_opaque(p, fh->data, fh->size);
+}
+
+static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+ u32 length;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ length = be32_to_cpup(p++);
+ if (unlikely(length > NFS3_FHSIZE || length == 0))
+ goto out_toobig;
+ p = xdr_inline_decode(xdr, length);
+ if (unlikely(!p))
+ return -EIO;
+ fh->size = length;
+ memcpy(fh->data, p, length);
+ return 0;
+out_toobig:
+ trace_nfs_xdr_bad_filehandle(xdr, NFSERR_BADHANDLE);
+ return -E2BIG;
+}
+
+static void zero_nfs_fh3(struct nfs_fh *fh)
+{
+ memset(fh, 0, sizeof(*fh));
+}
+
+/*
+ * nfstime3
+ *
+ * struct nfstime3 {
+ * uint32 seconds;
+ * uint32 nseconds;
+ * };
+ */
+static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec64 *timep)
+{
+ *p++ = cpu_to_be32((u32)timep->tv_sec);
+ *p++ = cpu_to_be32(timep->tv_nsec);
+ return p;
+}
+
+static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec64 *timep)
+{
+ timep->tv_sec = be32_to_cpup(p++);
+ timep->tv_nsec = be32_to_cpup(p++);
+ return p;
+}
+
+/*
+ * sattr3
+ *
+ * enum time_how {
+ * DONT_CHANGE = 0,
+ * SET_TO_SERVER_TIME = 1,
+ * SET_TO_CLIENT_TIME = 2
+ * };
+ *
+ * union set_mode3 switch (bool set_it) {
+ * case TRUE:
+ * mode3 mode;
+ * default:
+ * void;
+ * };
+ *
+ * union set_uid3 switch (bool set_it) {
+ * case TRUE:
+ * uid3 uid;
+ * default:
+ * void;
+ * };
+ *
+ * union set_gid3 switch (bool set_it) {
+ * case TRUE:
+ * gid3 gid;
+ * default:
+ * void;
+ * };
+ *
+ * union set_size3 switch (bool set_it) {
+ * case TRUE:
+ * size3 size;
+ * default:
+ * void;
+ * };
+ *
+ * union set_atime switch (time_how set_it) {
+ * case SET_TO_CLIENT_TIME:
+ * nfstime3 atime;
+ * default:
+ * void;
+ * };
+ *
+ * union set_mtime switch (time_how set_it) {
+ * case SET_TO_CLIENT_TIME:
+ * nfstime3 mtime;
+ * default:
+ * void;
+ * };
+ *
+ * struct sattr3 {
+ * set_mode3 mode;
+ * set_uid3 uid;
+ * set_gid3 gid;
+ * set_size3 size;
+ * set_atime atime;
+ * set_mtime mtime;
+ * };
+ */
+static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr,
+ struct user_namespace *userns)
+{
+ u32 nbytes;
+ __be32 *p;
+
+ /*
+ * In order to make only a single xdr_reserve_space() call,
+ * pre-compute the total number of bytes to be reserved.
+ * Six boolean values, one for each set_foo field, are always
+ * present in the encoded result, so start there.
+ */
+ nbytes = 6 * 4;
+ if (attr->ia_valid & ATTR_MODE)
+ nbytes += 4;
+ if (attr->ia_valid & ATTR_UID)
+ nbytes += 4;
+ if (attr->ia_valid & ATTR_GID)
+ nbytes += 4;
+ if (attr->ia_valid & ATTR_SIZE)
+ nbytes += 8;
+ if (attr->ia_valid & ATTR_ATIME_SET)
+ nbytes += 8;
+ if (attr->ia_valid & ATTR_MTIME_SET)
+ nbytes += 8;
+ p = xdr_reserve_space(xdr, nbytes);
+
+ if (attr->ia_valid & ATTR_MODE) {
+ *p++ = xdr_one;
+ *p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO);
+ } else
+ *p++ = xdr_zero;
+
+ if (attr->ia_valid & ATTR_UID) {
+ *p++ = xdr_one;
+ *p++ = cpu_to_be32(from_kuid_munged(userns, attr->ia_uid));
+ } else
+ *p++ = xdr_zero;
+
+ if (attr->ia_valid & ATTR_GID) {
+ *p++ = xdr_one;
+ *p++ = cpu_to_be32(from_kgid_munged(userns, attr->ia_gid));
+ } else
+ *p++ = xdr_zero;
+
+ if (attr->ia_valid & ATTR_SIZE) {
+ *p++ = xdr_one;
+ p = xdr_encode_hyper(p, (u64)attr->ia_size);
+ } else
+ *p++ = xdr_zero;
+
+ if (attr->ia_valid & ATTR_ATIME_SET) {
+ *p++ = xdr_two;
+ p = xdr_encode_nfstime3(p, &attr->ia_atime);
+ } else if (attr->ia_valid & ATTR_ATIME) {
+ *p++ = xdr_one;
+ } else
+ *p++ = xdr_zero;
+
+ if (attr->ia_valid & ATTR_MTIME_SET) {
+ *p++ = xdr_two;
+ xdr_encode_nfstime3(p, &attr->ia_mtime);
+ } else if (attr->ia_valid & ATTR_MTIME) {
+ *p = xdr_one;
+ } else
+ *p = xdr_zero;
+}
+
+/*
+ * fattr3
+ *
+ * struct fattr3 {
+ * ftype3 type;
+ * mode3 mode;
+ * uint32 nlink;
+ * uid3 uid;
+ * gid3 gid;
+ * size3 size;
+ * size3 used;
+ * specdata3 rdev;
+ * uint64 fsid;
+ * fileid3 fileid;
+ * nfstime3 atime;
+ * nfstime3 mtime;
+ * nfstime3 ctime;
+ * };
+ */
+static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+ struct user_namespace *userns)
+{
+ umode_t fmode;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2);
+ if (unlikely(!p))
+ return -EIO;
+
+ p = xdr_decode_ftype3(p, &fmode);
+
+ fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode;
+ fattr->nlink = be32_to_cpup(p++);
+ fattr->uid = make_kuid(userns, be32_to_cpup(p++));
+ if (!uid_valid(fattr->uid))
+ goto out_uid;
+ fattr->gid = make_kgid(userns, be32_to_cpup(p++));
+ if (!gid_valid(fattr->gid))
+ goto out_gid;
+
+ p = xdr_decode_size3(p, &fattr->size);
+ p = xdr_decode_size3(p, &fattr->du.nfs3.used);
+ p = xdr_decode_specdata3(p, &fattr->rdev);
+
+ p = xdr_decode_hyper(p, &fattr->fsid.major);
+ fattr->fsid.minor = 0;
+
+ p = xdr_decode_fileid3(p, &fattr->fileid);
+ p = xdr_decode_nfstime3(p, &fattr->atime);
+ p = xdr_decode_nfstime3(p, &fattr->mtime);
+ xdr_decode_nfstime3(p, &fattr->ctime);
+ fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime);
+
+ fattr->valid |= NFS_ATTR_FATTR_V3;
+ return 0;
+out_uid:
+ dprintk("NFS: returned invalid uid\n");
+ return -EINVAL;
+out_gid:
+ dprintk("NFS: returned invalid gid\n");
+ return -EINVAL;
+}
+
+/*
+ * post_op_attr
+ *
+ * union post_op_attr switch (bool attributes_follow) {
+ * case TRUE:
+ * fattr3 attributes;
+ * case FALSE:
+ * void;
+ * };
+ */
+static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+ struct user_namespace *userns)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ if (*p != xdr_zero)
+ return decode_fattr3(xdr, fattr, userns);
+ return 0;
+}
+
+/*
+ * wcc_attr
+ * struct wcc_attr {
+ * size3 size;
+ * nfstime3 mtime;
+ * nfstime3 ctime;
+ * };
+ */
+static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2);
+ if (unlikely(!p))
+ return -EIO;
+
+ fattr->valid |= NFS_ATTR_FATTR_PRESIZE
+ | NFS_ATTR_FATTR_PRECHANGE
+ | NFS_ATTR_FATTR_PREMTIME
+ | NFS_ATTR_FATTR_PRECTIME;
+
+ p = xdr_decode_size3(p, &fattr->pre_size);
+ p = xdr_decode_nfstime3(p, &fattr->pre_mtime);
+ xdr_decode_nfstime3(p, &fattr->pre_ctime);
+ fattr->pre_change_attr = nfs_timespec_to_change_attr(&fattr->pre_ctime);
+
+ return 0;
+}
+
+/*
+ * pre_op_attr
+ * union pre_op_attr switch (bool attributes_follow) {
+ * case TRUE:
+ * wcc_attr attributes;
+ * case FALSE:
+ * void;
+ * };
+ *
+ * wcc_data
+ *
+ * struct wcc_data {
+ * pre_op_attr before;
+ * post_op_attr after;
+ * };
+ */
+static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ if (*p != xdr_zero)
+ return decode_wcc_attr(xdr, fattr);
+ return 0;
+}
+
+static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+ struct user_namespace *userns)
+{
+ int error;
+
+ error = decode_pre_op_attr(xdr, fattr);
+ if (unlikely(error))
+ goto out;
+ error = decode_post_op_attr(xdr, fattr, userns);
+out:
+ return error;
+}
+
+/*
+ * post_op_fh3
+ *
+ * union post_op_fh3 switch (bool handle_follows) {
+ * case TRUE:
+ * nfs_fh3 handle;
+ * case FALSE:
+ * void;
+ * };
+ */
+static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+ __be32 *p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ if (*p != xdr_zero)
+ return decode_nfs_fh3(xdr, fh);
+ zero_nfs_fh3(fh);
+ return 0;
+}
+
+/*
+ * diropargs3
+ *
+ * struct diropargs3 {
+ * nfs_fh3 dir;
+ * filename3 name;
+ * };
+ */
+static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh,
+ const char *name, u32 length)
+{
+ encode_nfs_fh3(xdr, fh);
+ encode_filename3(xdr, name, length);
+}
+
+
+/*
+ * NFSv3 XDR encode functions
+ *
+ * NFSv3 argument types are defined in section 3.3 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
+ */
+
+/*
+ * 3.3.1 GETATTR3args
+ *
+ * struct GETATTR3args {
+ * nfs_fh3 object;
+ * };
+ */
+static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_fh *fh = data;
+
+ encode_nfs_fh3(xdr, fh);
+}
+
+/*
+ * 3.3.2 SETATTR3args
+ *
+ * union sattrguard3 switch (bool check) {
+ * case TRUE:
+ * nfstime3 obj_ctime;
+ * case FALSE:
+ * void;
+ * };
+ *
+ * struct SETATTR3args {
+ * nfs_fh3 object;
+ * sattr3 new_attributes;
+ * sattrguard3 guard;
+ * };
+ */
+static void encode_sattrguard3(struct xdr_stream *xdr,
+ const struct nfs3_sattrargs *args)
+{
+ __be32 *p;
+
+ if (args->guard) {
+ p = xdr_reserve_space(xdr, 4 + 8);
+ *p++ = xdr_one;
+ xdr_encode_nfstime3(p, &args->guardtime);
+ } else {
+ p = xdr_reserve_space(xdr, 4);
+ *p = xdr_zero;
+ }
+}
+
+static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_sattrargs *args = data;
+ encode_nfs_fh3(xdr, args->fh);
+ encode_sattr3(xdr, args->sattr, rpc_rqst_userns(req));
+ encode_sattrguard3(xdr, args);
+}
+
+/*
+ * 3.3.3 LOOKUP3args
+ *
+ * struct LOOKUP3args {
+ * diropargs3 what;
+ * };
+ */
+static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_diropargs *args = data;
+
+ encode_diropargs3(xdr, args->fh, args->name, args->len);
+}
+
+/*
+ * 3.3.4 ACCESS3args
+ *
+ * struct ACCESS3args {
+ * nfs_fh3 object;
+ * uint32 access;
+ * };
+ */
+static void encode_access3args(struct xdr_stream *xdr,
+ const struct nfs3_accessargs *args)
+{
+ encode_nfs_fh3(xdr, args->fh);
+ encode_uint32(xdr, args->access);
+}
+
+static void nfs3_xdr_enc_access3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_accessargs *args = data;
+
+ encode_access3args(xdr, args);
+}
+
+/*
+ * 3.3.5 READLINK3args
+ *
+ * struct READLINK3args {
+ * nfs_fh3 symlink;
+ * };
+ */
+static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_readlinkargs *args = data;
+
+ encode_nfs_fh3(xdr, args->fh);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->pglen,
+ NFS3_readlinkres_sz - NFS3_pagepad_sz);
+}
+
+/*
+ * 3.3.6 READ3args
+ *
+ * struct READ3args {
+ * nfs_fh3 file;
+ * offset3 offset;
+ * count3 count;
+ * };
+ */
+static void encode_read3args(struct xdr_stream *xdr,
+ const struct nfs_pgio_args *args)
+{
+ __be32 *p;
+
+ encode_nfs_fh3(xdr, args->fh);
+
+ p = xdr_reserve_space(xdr, 8 + 4);
+ p = xdr_encode_hyper(p, args->offset);
+ *p = cpu_to_be32(args->count);
+}
+
+static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_pgio_args *args = data;
+ unsigned int replen = args->replen ? args->replen :
+ NFS3_readres_sz - NFS3_pagepad_sz;
+
+ encode_read3args(xdr, args);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, replen);
+ req->rq_rcv_buf.flags |= XDRBUF_READ;
+}
+
+/*
+ * 3.3.7 WRITE3args
+ *
+ * enum stable_how {
+ * UNSTABLE = 0,
+ * DATA_SYNC = 1,
+ * FILE_SYNC = 2
+ * };
+ *
+ * struct WRITE3args {
+ * nfs_fh3 file;
+ * offset3 offset;
+ * count3 count;
+ * stable_how stable;
+ * opaque data<>;
+ * };
+ */
+static void encode_write3args(struct xdr_stream *xdr,
+ const struct nfs_pgio_args *args)
+{
+ __be32 *p;
+
+ encode_nfs_fh3(xdr, args->fh);
+
+ p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4);
+ p = xdr_encode_hyper(p, args->offset);
+ *p++ = cpu_to_be32(args->count);
+ *p++ = cpu_to_be32(args->stable);
+ *p = cpu_to_be32(args->count);
+ xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
+}
+
+static void nfs3_xdr_enc_write3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_pgio_args *args = data;
+
+ encode_write3args(xdr, args);
+ xdr->buf->flags |= XDRBUF_WRITE;
+}
+
+/*
+ * 3.3.8 CREATE3args
+ *
+ * enum createmode3 {
+ * UNCHECKED = 0,
+ * GUARDED = 1,
+ * EXCLUSIVE = 2
+ * };
+ *
+ * union createhow3 switch (createmode3 mode) {
+ * case UNCHECKED:
+ * case GUARDED:
+ * sattr3 obj_attributes;
+ * case EXCLUSIVE:
+ * createverf3 verf;
+ * };
+ *
+ * struct CREATE3args {
+ * diropargs3 where;
+ * createhow3 how;
+ * };
+ */
+static void encode_createhow3(struct xdr_stream *xdr,
+ const struct nfs3_createargs *args,
+ struct user_namespace *userns)
+{
+ encode_uint32(xdr, args->createmode);
+ switch (args->createmode) {
+ case NFS3_CREATE_UNCHECKED:
+ case NFS3_CREATE_GUARDED:
+ encode_sattr3(xdr, args->sattr, userns);
+ break;
+ case NFS3_CREATE_EXCLUSIVE:
+ encode_createverf3(xdr, args->verifier);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void nfs3_xdr_enc_create3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_createargs *args = data;
+
+ encode_diropargs3(xdr, args->fh, args->name, args->len);
+ encode_createhow3(xdr, args, rpc_rqst_userns(req));
+}
+
+/*
+ * 3.3.9 MKDIR3args
+ *
+ * struct MKDIR3args {
+ * diropargs3 where;
+ * sattr3 attributes;
+ * };
+ */
+static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_mkdirargs *args = data;
+
+ encode_diropargs3(xdr, args->fh, args->name, args->len);
+ encode_sattr3(xdr, args->sattr, rpc_rqst_userns(req));
+}
+
+/*
+ * 3.3.10 SYMLINK3args
+ *
+ * struct symlinkdata3 {
+ * sattr3 symlink_attributes;
+ * nfspath3 symlink_data;
+ * };
+ *
+ * struct SYMLINK3args {
+ * diropargs3 where;
+ * symlinkdata3 symlink;
+ * };
+ */
+static void encode_symlinkdata3(struct xdr_stream *xdr,
+ const void *data,
+ struct user_namespace *userns)
+{
+ const struct nfs3_symlinkargs *args = data;
+
+ encode_sattr3(xdr, args->sattr, userns);
+ encode_nfspath3(xdr, args->pages, args->pathlen);
+}
+
+static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_symlinkargs *args = data;
+
+ encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
+ encode_symlinkdata3(xdr, args, rpc_rqst_userns(req));
+ xdr->buf->flags |= XDRBUF_WRITE;
+}
+
+/*
+ * 3.3.11 MKNOD3args
+ *
+ * struct devicedata3 {
+ * sattr3 dev_attributes;
+ * specdata3 spec;
+ * };
+ *
+ * union mknoddata3 switch (ftype3 type) {
+ * case NF3CHR:
+ * case NF3BLK:
+ * devicedata3 device;
+ * case NF3SOCK:
+ * case NF3FIFO:
+ * sattr3 pipe_attributes;
+ * default:
+ * void;
+ * };
+ *
+ * struct MKNOD3args {
+ * diropargs3 where;
+ * mknoddata3 what;
+ * };
+ */
+static void encode_devicedata3(struct xdr_stream *xdr,
+ const struct nfs3_mknodargs *args,
+ struct user_namespace *userns)
+{
+ encode_sattr3(xdr, args->sattr, userns);
+ encode_specdata3(xdr, args->rdev);
+}
+
+static void encode_mknoddata3(struct xdr_stream *xdr,
+ const struct nfs3_mknodargs *args,
+ struct user_namespace *userns)
+{
+ encode_ftype3(xdr, args->type);
+ switch (args->type) {
+ case NF3CHR:
+ case NF3BLK:
+ encode_devicedata3(xdr, args, userns);
+ break;
+ case NF3SOCK:
+ case NF3FIFO:
+ encode_sattr3(xdr, args->sattr, userns);
+ break;
+ case NF3REG:
+ case NF3DIR:
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_mknodargs *args = data;
+
+ encode_diropargs3(xdr, args->fh, args->name, args->len);
+ encode_mknoddata3(xdr, args, rpc_rqst_userns(req));
+}
+
+/*
+ * 3.3.12 REMOVE3args
+ *
+ * struct REMOVE3args {
+ * diropargs3 object;
+ * };
+ */
+static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_removeargs *args = data;
+
+ encode_diropargs3(xdr, args->fh, args->name.name, args->name.len);
+}
+
+/*
+ * 3.3.14 RENAME3args
+ *
+ * struct RENAME3args {
+ * diropargs3 from;
+ * diropargs3 to;
+ * };
+ */
+static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_renameargs *args = data;
+ const struct qstr *old = args->old_name;
+ const struct qstr *new = args->new_name;
+
+ encode_diropargs3(xdr, args->old_dir, old->name, old->len);
+ encode_diropargs3(xdr, args->new_dir, new->name, new->len);
+}
+
+/*
+ * 3.3.15 LINK3args
+ *
+ * struct LINK3args {
+ * nfs_fh3 file;
+ * diropargs3 link;
+ * };
+ */
+static void nfs3_xdr_enc_link3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_linkargs *args = data;
+
+ encode_nfs_fh3(xdr, args->fromfh);
+ encode_diropargs3(xdr, args->tofh, args->toname, args->tolen);
+}
+
+/*
+ * 3.3.16 READDIR3args
+ *
+ * struct READDIR3args {
+ * nfs_fh3 dir;
+ * cookie3 cookie;
+ * cookieverf3 cookieverf;
+ * count3 count;
+ * };
+ */
+static void encode_readdir3args(struct xdr_stream *xdr,
+ const struct nfs3_readdirargs *args)
+{
+ __be32 *p;
+
+ encode_nfs_fh3(xdr, args->fh);
+
+ p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4);
+ p = xdr_encode_cookie3(p, args->cookie);
+ p = xdr_encode_cookieverf3(p, args->verf);
+ *p = cpu_to_be32(args->count);
+}
+
+static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_readdirargs *args = data;
+
+ encode_readdir3args(xdr, args);
+ rpc_prepare_reply_pages(req, args->pages, 0, args->count,
+ NFS3_readdirres_sz - NFS3_pagepad_sz);
+}
+
+/*
+ * 3.3.17 READDIRPLUS3args
+ *
+ * struct READDIRPLUS3args {
+ * nfs_fh3 dir;
+ * cookie3 cookie;
+ * cookieverf3 cookieverf;
+ * count3 dircount;
+ * count3 maxcount;
+ * };
+ */
+static void encode_readdirplus3args(struct xdr_stream *xdr,
+ const struct nfs3_readdirargs *args)
+{
+ uint32_t dircount = args->count;
+ uint32_t maxcount = args->count;
+ __be32 *p;
+
+ encode_nfs_fh3(xdr, args->fh);
+
+ p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4);
+ p = xdr_encode_cookie3(p, args->cookie);
+ p = xdr_encode_cookieverf3(p, args->verf);
+
+ /*
+ * readdirplus: need dircount + buffer size.
+ * We just make sure we make dircount big enough
+ */
+ *p++ = cpu_to_be32(dircount);
+ *p = cpu_to_be32(maxcount);
+}
+
+static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_readdirargs *args = data;
+
+ encode_readdirplus3args(xdr, args);
+ rpc_prepare_reply_pages(req, args->pages, 0, args->count,
+ NFS3_readdirres_sz - NFS3_pagepad_sz);
+}
+
+/*
+ * 3.3.21 COMMIT3args
+ *
+ * struct COMMIT3args {
+ * nfs_fh3 file;
+ * offset3 offset;
+ * count3 count;
+ * };
+ */
+static void encode_commit3args(struct xdr_stream *xdr,
+ const struct nfs_commitargs *args)
+{
+ __be32 *p;
+
+ encode_nfs_fh3(xdr, args->fh);
+
+ p = xdr_reserve_space(xdr, 8 + 4);
+ p = xdr_encode_hyper(p, args->offset);
+ *p = cpu_to_be32(args->count);
+}
+
+static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_commitargs *args = data;
+
+ encode_commit3args(xdr, args);
+}
+
+#ifdef CONFIG_NFS_V3_ACL
+
+static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_getaclargs *args = data;
+
+ encode_nfs_fh3(xdr, args->fh);
+ encode_uint32(xdr, args->mask);
+ if (args->mask & (NFS_ACL | NFS_DFACL)) {
+ rpc_prepare_reply_pages(req, args->pages, 0,
+ NFSACL_MAXPAGES << PAGE_SHIFT,
+ ACL3_getaclres_sz - NFS3_pagepad_sz);
+ req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
+ }
+}
+
+static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_setaclargs *args = data;
+ unsigned int base;
+ int error;
+
+ encode_nfs_fh3(xdr, NFS_FH(args->inode));
+ encode_uint32(xdr, args->mask);
+
+ base = req->rq_slen;
+ if (args->npages != 0)
+ xdr_write_pages(xdr, args->pages, 0, args->len);
+ else
+ xdr_reserve_space(xdr, args->len);
+
+ error = nfsacl_encode(xdr->buf, base, args->inode,
+ (args->mask & NFS_ACL) ?
+ args->acl_access : NULL, 1, 0);
+ /* FIXME: this is just broken */
+ BUG_ON(error < 0);
+ error = nfsacl_encode(xdr->buf, base + error, args->inode,
+ (args->mask & NFS_DFACL) ?
+ args->acl_default : NULL, 1,
+ NFS_ACL_DEFAULT);
+ BUG_ON(error < 0);
+}
+
+#endif /* CONFIG_NFS_V3_ACL */
+
+/*
+ * NFSv3 XDR decode functions
+ *
+ * NFSv3 result types are defined in section 3.3 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
+ */
+
+/*
+ * 3.3.1 GETATTR3res
+ *
+ * struct GETATTR3resok {
+ * fattr3 obj_attributes;
+ * };
+ *
+ * union GETATTR3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * GETATTR3resok resok;
+ * default:
+ * void;
+ * };
+ */
+static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *result)
+{
+ enum nfs_stat status;
+ int error;
+
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ if (status != NFS3_OK)
+ goto out_default;
+ error = decode_fattr3(xdr, result, rpc_rqst_userns(req));
+out:
+ return error;
+out_default:
+ return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.2 SETATTR3res
+ *
+ * struct SETATTR3resok {
+ * wcc_data obj_wcc;
+ * };
+ *
+ * struct SETATTR3resfail {
+ * wcc_data obj_wcc;
+ * };
+ *
+ * union SETATTR3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * SETATTR3resok resok;
+ * default:
+ * SETATTR3resfail resfail;
+ * };
+ */
+static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *result)
+{
+ enum nfs_stat status;
+ int error;
+
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ error = decode_wcc_data(xdr, result, rpc_rqst_userns(req));
+ if (unlikely(error))
+ goto out;
+ if (status != NFS3_OK)
+ goto out_status;
+out:
+ return error;
+out_status:
+ return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.3 LOOKUP3res
+ *
+ * struct LOOKUP3resok {
+ * nfs_fh3 object;
+ * post_op_attr obj_attributes;
+ * post_op_attr dir_attributes;
+ * };
+ *
+ * struct LOOKUP3resfail {
+ * post_op_attr dir_attributes;
+ * };
+ *
+ * union LOOKUP3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * LOOKUP3resok resok;
+ * default:
+ * LOOKUP3resfail resfail;
+ * };
+ */
+static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct user_namespace *userns = rpc_rqst_userns(req);
+ struct nfs3_diropres *result = data;
+ enum nfs_stat status;
+ int error;
+
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ if (status != NFS3_OK)
+ goto out_default;
+ error = decode_nfs_fh3(xdr, result->fh);
+ if (unlikely(error))
+ goto out;
+ error = decode_post_op_attr(xdr, result->fattr, userns);
+ if (unlikely(error))
+ goto out;
+ error = decode_post_op_attr(xdr, result->dir_attr, userns);
+out:
+ return error;
+out_default:
+ error = decode_post_op_attr(xdr, result->dir_attr, userns);
+ if (unlikely(error))
+ goto out;
+ return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.4 ACCESS3res
+ *
+ * struct ACCESS3resok {
+ * post_op_attr obj_attributes;
+ * uint32 access;
+ * };
+ *
+ * struct ACCESS3resfail {
+ * post_op_attr obj_attributes;
+ * };
+ *
+ * union ACCESS3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * ACCESS3resok resok;
+ * default:
+ * ACCESS3resfail resfail;
+ * };
+ */
+static int nfs3_xdr_dec_access3res(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs3_accessres *result = data;
+ enum nfs_stat status;
+ int error;
+
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req));
+ if (unlikely(error))
+ goto out;
+ if (status != NFS3_OK)
+ goto out_default;
+ error = decode_uint32(xdr, &result->access);
+out:
+ return error;
+out_default:
+ return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.5 READLINK3res
+ *
+ * struct READLINK3resok {
+ * post_op_attr symlink_attributes;
+ * nfspath3 data;
+ * };
+ *
+ * struct READLINK3resfail {
+ * post_op_attr symlink_attributes;
+ * };
+ *
+ * union READLINK3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * READLINK3resok resok;
+ * default:
+ * READLINK3resfail resfail;
+ * };
+ */
+static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *result)
+{
+ enum nfs_stat status;
+ int error;
+
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ error = decode_post_op_attr(xdr, result, rpc_rqst_userns(req));
+ if (unlikely(error))
+ goto out;
+ if (status != NFS3_OK)
+ goto out_default;
+ error = decode_nfspath3(xdr);
+out:
+ return error;
+out_default:
+ return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.6 READ3res
+ *
+ * struct READ3resok {
+ * post_op_attr file_attributes;
+ * count3 count;
+ * bool eof;
+ * opaque data<>;
+ * };
+ *
+ * struct READ3resfail {
+ * post_op_attr file_attributes;
+ * };
+ *
+ * union READ3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * READ3resok resok;
+ * default:
+ * READ3resfail resfail;
+ * };
+ */
+static int decode_read3resok(struct xdr_stream *xdr,
+ struct nfs_pgio_res *result)
+{
+ u32 eof, count, ocount, recvd;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4 + 4 + 4);
+ if (unlikely(!p))
+ return -EIO;
+ count = be32_to_cpup(p++);
+ eof = be32_to_cpup(p++);
+ ocount = be32_to_cpup(p++);
+ if (unlikely(ocount != count))
+ goto out_mismatch;
+ recvd = xdr_read_pages(xdr, count);
+ if (unlikely(count > recvd))
+ goto out_cheating;
+out:
+ result->eof = eof;
+ result->count = count;
+ return count;
+out_mismatch:
+ dprintk("NFS: READ count doesn't match length of opaque: "
+ "count %u != ocount %u\n", count, ocount);
+ return -EIO;
+out_cheating:
+ dprintk("NFS: server cheating in read result: "
+ "count %u > recvd %u\n", count, recvd);
+ count = recvd;
+ eof = 0;
+ goto out;
+}
+
+static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_pgio_res *result = data;
+ unsigned int pos;
+ enum nfs_stat status;
+ int error;
+
+ pos = xdr_stream_pos(xdr);
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req));
+ if (unlikely(error))
+ goto out;
+ result->op_status = status;
+ if (status != NFS3_OK)
+ goto out_status;
+ result->replen = 3 + ((xdr_stream_pos(xdr) - pos) >> 2);
+ error = decode_read3resok(xdr, result);
+out:
+ return error;
+out_status:
+ return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.7 WRITE3res
+ *
+ * enum stable_how {
+ * UNSTABLE = 0,
+ * DATA_SYNC = 1,
+ * FILE_SYNC = 2
+ * };
+ *
+ * struct WRITE3resok {
+ * wcc_data file_wcc;
+ * count3 count;
+ * stable_how committed;
+ * writeverf3 verf;
+ * };
+ *
+ * struct WRITE3resfail {
+ * wcc_data file_wcc;
+ * };
+ *
+ * union WRITE3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * WRITE3resok resok;
+ * default:
+ * WRITE3resfail resfail;
+ * };
+ */
+static int decode_write3resok(struct xdr_stream *xdr,
+ struct nfs_pgio_res *result)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4 + 4);
+ if (unlikely(!p))
+ return -EIO;
+ result->count = be32_to_cpup(p++);
+ result->verf->committed = be32_to_cpup(p++);
+ if (unlikely(result->verf->committed > NFS_FILE_SYNC))
+ goto out_badvalue;
+ if (decode_writeverf3(xdr, &result->verf->verifier))
+ return -EIO;
+ return result->count;
+out_badvalue:
+ dprintk("NFS: bad stable_how value: %u\n", result->verf->committed);
+ return -EIO;
+}
+
+static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_pgio_res *result = data;
+ enum nfs_stat status;
+ int error;
+
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ error = decode_wcc_data(xdr, result->fattr, rpc_rqst_userns(req));
+ if (unlikely(error))
+ goto out;
+ result->op_status = status;
+ if (status != NFS3_OK)
+ goto out_status;
+ error = decode_write3resok(xdr, result);
+out:
+ return error;
+out_status:
+ return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.8 CREATE3res
+ *
+ * struct CREATE3resok {
+ * post_op_fh3 obj;
+ * post_op_attr obj_attributes;
+ * wcc_data dir_wcc;
+ * };
+ *
+ * struct CREATE3resfail {
+ * wcc_data dir_wcc;
+ * };
+ *
+ * union CREATE3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * CREATE3resok resok;
+ * default:
+ * CREATE3resfail resfail;
+ * };
+ */
+static int decode_create3resok(struct xdr_stream *xdr,
+ struct nfs3_diropres *result,
+ struct user_namespace *userns)
+{
+ int error;
+
+ error = decode_post_op_fh3(xdr, result->fh);
+ if (unlikely(error))
+ goto out;
+ error = decode_post_op_attr(xdr, result->fattr, userns);
+ if (unlikely(error))
+ goto out;
+ /* The server isn't required to return a file handle.
+ * If it didn't, force the client to perform a LOOKUP
+ * to determine the correct file handle and attribute
+ * values for the new object. */
+ if (result->fh->size == 0)
+ result->fattr->valid = 0;
+ error = decode_wcc_data(xdr, result->dir_attr, userns);
+out:
+ return error;
+}
+
+static int nfs3_xdr_dec_create3res(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct user_namespace *userns = rpc_rqst_userns(req);
+ struct nfs3_diropres *result = data;
+ enum nfs_stat status;
+ int error;
+
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ if (status != NFS3_OK)
+ goto out_default;
+ error = decode_create3resok(xdr, result, userns);
+out:
+ return error;
+out_default:
+ error = decode_wcc_data(xdr, result->dir_attr, userns);
+ if (unlikely(error))
+ goto out;
+ return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.12 REMOVE3res
+ *
+ * struct REMOVE3resok {
+ * wcc_data dir_wcc;
+ * };
+ *
+ * struct REMOVE3resfail {
+ * wcc_data dir_wcc;
+ * };
+ *
+ * union REMOVE3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * REMOVE3resok resok;
+ * default:
+ * REMOVE3resfail resfail;
+ * };
+ */
+static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_removeres *result = data;
+ enum nfs_stat status;
+ int error;
+
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ error = decode_wcc_data(xdr, result->dir_attr, rpc_rqst_userns(req));
+ if (unlikely(error))
+ goto out;
+ if (status != NFS3_OK)
+ goto out_status;
+out:
+ return error;
+out_status:
+ return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.14 RENAME3res
+ *
+ * struct RENAME3resok {
+ * wcc_data fromdir_wcc;
+ * wcc_data todir_wcc;
+ * };
+ *
+ * struct RENAME3resfail {
+ * wcc_data fromdir_wcc;
+ * wcc_data todir_wcc;
+ * };
+ *
+ * union RENAME3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * RENAME3resok resok;
+ * default:
+ * RENAME3resfail resfail;
+ * };
+ */
+static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct user_namespace *userns = rpc_rqst_userns(req);
+ struct nfs_renameres *result = data;
+ enum nfs_stat status;
+ int error;
+
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ error = decode_wcc_data(xdr, result->old_fattr, userns);
+ if (unlikely(error))
+ goto out;
+ error = decode_wcc_data(xdr, result->new_fattr, userns);
+ if (unlikely(error))
+ goto out;
+ if (status != NFS3_OK)
+ goto out_status;
+out:
+ return error;
+out_status:
+ return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.15 LINK3res
+ *
+ * struct LINK3resok {
+ * post_op_attr file_attributes;
+ * wcc_data linkdir_wcc;
+ * };
+ *
+ * struct LINK3resfail {
+ * post_op_attr file_attributes;
+ * wcc_data linkdir_wcc;
+ * };
+ *
+ * union LINK3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * LINK3resok resok;
+ * default:
+ * LINK3resfail resfail;
+ * };
+ */
+static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *data)
+{
+ struct user_namespace *userns = rpc_rqst_userns(req);
+ struct nfs3_linkres *result = data;
+ enum nfs_stat status;
+ int error;
+
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ error = decode_post_op_attr(xdr, result->fattr, userns);
+ if (unlikely(error))
+ goto out;
+ error = decode_wcc_data(xdr, result->dir_attr, userns);
+ if (unlikely(error))
+ goto out;
+ if (status != NFS3_OK)
+ goto out_status;
+out:
+ return error;
+out_status:
+ return nfs3_stat_to_errno(status);
+}
+
+/**
+ * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in
+ * the local page cache
+ * @xdr: XDR stream where entry resides
+ * @entry: buffer to fill in with entry data
+ * @plus: boolean indicating whether this should be a readdirplus entry
+ *
+ * Returns zero if successful, otherwise a negative errno value is
+ * returned.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ *
+ * 3.3.16 entry3
+ *
+ * struct entry3 {
+ * fileid3 fileid;
+ * filename3 name;
+ * cookie3 cookie;
+ * fhandle3 filehandle;
+ * post_op_attr3 attributes;
+ * entry3 *nextentry;
+ * };
+ *
+ * 3.3.17 entryplus3
+ * struct entryplus3 {
+ * fileid3 fileid;
+ * filename3 name;
+ * cookie3 cookie;
+ * post_op_attr name_attributes;
+ * post_op_fh3 name_handle;
+ * entryplus3 *nextentry;
+ * };
+ */
+int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+ bool plus)
+{
+ struct user_namespace *userns = rpc_userns(entry->server->client);
+ __be32 *p;
+ int error;
+ u64 new_cookie;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EAGAIN;
+ if (*p == xdr_zero) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EAGAIN;
+ if (*p == xdr_zero)
+ return -EAGAIN;
+ entry->eof = 1;
+ return -EBADCOOKIE;
+ }
+
+ error = decode_fileid3(xdr, &entry->ino);
+ if (unlikely(error))
+ return -EAGAIN;
+
+ error = decode_inline_filename3(xdr, &entry->name, &entry->len);
+ if (unlikely(error))
+ return error == -ENAMETOOLONG ? -ENAMETOOLONG : -EAGAIN;
+
+ error = decode_cookie3(xdr, &new_cookie);
+ if (unlikely(error))
+ return -EAGAIN;
+
+ entry->d_type = DT_UNKNOWN;
+
+ if (plus) {
+ entry->fattr->valid = 0;
+ error = decode_post_op_attr(xdr, entry->fattr, userns);
+ if (unlikely(error))
+ return -EAGAIN;
+ if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
+ entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
+
+ if (entry->fattr->fileid != entry->ino) {
+ entry->fattr->mounted_on_fileid = entry->ino;
+ entry->fattr->valid |= NFS_ATTR_FATTR_MOUNTED_ON_FILEID;
+ }
+
+ /* In fact, a post_op_fh3: */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EAGAIN;
+ if (*p != xdr_zero) {
+ error = decode_nfs_fh3(xdr, entry->fh);
+ if (unlikely(error))
+ return -EAGAIN;
+ } else
+ zero_nfs_fh3(entry->fh);
+ }
+
+ entry->cookie = new_cookie;
+
+ return 0;
+}
+
+/*
+ * 3.3.16 READDIR3res
+ *
+ * struct dirlist3 {
+ * entry3 *entries;
+ * bool eof;
+ * };
+ *
+ * struct READDIR3resok {
+ * post_op_attr dir_attributes;
+ * cookieverf3 cookieverf;
+ * dirlist3 reply;
+ * };
+ *
+ * struct READDIR3resfail {
+ * post_op_attr dir_attributes;
+ * };
+ *
+ * union READDIR3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * READDIR3resok resok;
+ * default:
+ * READDIR3resfail resfail;
+ * };
+ *
+ * Read the directory contents into the page cache, but otherwise
+ * don't touch them. The actual decoding is done by nfs3_decode_entry()
+ * during subsequent nfs_readdir() calls.
+ */
+static int decode_dirlist3(struct xdr_stream *xdr)
+{
+ return xdr_read_pages(xdr, xdr->buf->page_len);
+}
+
+static int decode_readdir3resok(struct xdr_stream *xdr,
+ struct nfs3_readdirres *result,
+ struct user_namespace *userns)
+{
+ int error;
+
+ error = decode_post_op_attr(xdr, result->dir_attr, userns);
+ if (unlikely(error))
+ goto out;
+ /* XXX: do we need to check if result->verf != NULL ? */
+ error = decode_cookieverf3(xdr, result->verf);
+ if (unlikely(error))
+ goto out;
+ error = decode_dirlist3(xdr);
+out:
+ return error;
+}
+
+static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs3_readdirres *result = data;
+ enum nfs_stat status;
+ int error;
+
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ if (status != NFS3_OK)
+ goto out_default;
+ error = decode_readdir3resok(xdr, result, rpc_rqst_userns(req));
+out:
+ return error;
+out_default:
+ error = decode_post_op_attr(xdr, result->dir_attr, rpc_rqst_userns(req));
+ if (unlikely(error))
+ goto out;
+ return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.18 FSSTAT3res
+ *
+ * struct FSSTAT3resok {
+ * post_op_attr obj_attributes;
+ * size3 tbytes;
+ * size3 fbytes;
+ * size3 abytes;
+ * size3 tfiles;
+ * size3 ffiles;
+ * size3 afiles;
+ * uint32 invarsec;
+ * };
+ *
+ * struct FSSTAT3resfail {
+ * post_op_attr obj_attributes;
+ * };
+ *
+ * union FSSTAT3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * FSSTAT3resok resok;
+ * default:
+ * FSSTAT3resfail resfail;
+ * };
+ */
+static int decode_fsstat3resok(struct xdr_stream *xdr,
+ struct nfs_fsstat *result)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 8 * 6 + 4);
+ if (unlikely(!p))
+ return -EIO;
+ p = xdr_decode_size3(p, &result->tbytes);
+ p = xdr_decode_size3(p, &result->fbytes);
+ p = xdr_decode_size3(p, &result->abytes);
+ p = xdr_decode_size3(p, &result->tfiles);
+ p = xdr_decode_size3(p, &result->ffiles);
+ xdr_decode_size3(p, &result->afiles);
+ /* ignore invarsec */
+ return 0;
+}
+
+static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_fsstat *result = data;
+ enum nfs_stat status;
+ int error;
+
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req));
+ if (unlikely(error))
+ goto out;
+ if (status != NFS3_OK)
+ goto out_status;
+ error = decode_fsstat3resok(xdr, result);
+out:
+ return error;
+out_status:
+ return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.19 FSINFO3res
+ *
+ * struct FSINFO3resok {
+ * post_op_attr obj_attributes;
+ * uint32 rtmax;
+ * uint32 rtpref;
+ * uint32 rtmult;
+ * uint32 wtmax;
+ * uint32 wtpref;
+ * uint32 wtmult;
+ * uint32 dtpref;
+ * size3 maxfilesize;
+ * nfstime3 time_delta;
+ * uint32 properties;
+ * };
+ *
+ * struct FSINFO3resfail {
+ * post_op_attr obj_attributes;
+ * };
+ *
+ * union FSINFO3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * FSINFO3resok resok;
+ * default:
+ * FSINFO3resfail resfail;
+ * };
+ */
+static int decode_fsinfo3resok(struct xdr_stream *xdr,
+ struct nfs_fsinfo *result)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4);
+ if (unlikely(!p))
+ return -EIO;
+ result->rtmax = be32_to_cpup(p++);
+ result->rtpref = be32_to_cpup(p++);
+ result->rtmult = be32_to_cpup(p++);
+ result->wtmax = be32_to_cpup(p++);
+ result->wtpref = be32_to_cpup(p++);
+ result->wtmult = be32_to_cpup(p++);
+ result->dtpref = be32_to_cpup(p++);
+ p = xdr_decode_size3(p, &result->maxfilesize);
+ xdr_decode_nfstime3(p, &result->time_delta);
+
+ /* ignore properties */
+ result->lease_time = 0;
+ result->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+ result->xattr_support = 0;
+ return 0;
+}
+
+static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_fsinfo *result = data;
+ enum nfs_stat status;
+ int error;
+
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req));
+ if (unlikely(error))
+ goto out;
+ if (status != NFS3_OK)
+ goto out_status;
+ error = decode_fsinfo3resok(xdr, result);
+out:
+ return error;
+out_status:
+ return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.20 PATHCONF3res
+ *
+ * struct PATHCONF3resok {
+ * post_op_attr obj_attributes;
+ * uint32 linkmax;
+ * uint32 name_max;
+ * bool no_trunc;
+ * bool chown_restricted;
+ * bool case_insensitive;
+ * bool case_preserving;
+ * };
+ *
+ * struct PATHCONF3resfail {
+ * post_op_attr obj_attributes;
+ * };
+ *
+ * union PATHCONF3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * PATHCONF3resok resok;
+ * default:
+ * PATHCONF3resfail resfail;
+ * };
+ */
+static int decode_pathconf3resok(struct xdr_stream *xdr,
+ struct nfs_pathconf *result)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4 * 6);
+ if (unlikely(!p))
+ return -EIO;
+ result->max_link = be32_to_cpup(p++);
+ result->max_namelen = be32_to_cpup(p);
+ /* ignore remaining fields */
+ return 0;
+}
+
+static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_pathconf *result = data;
+ enum nfs_stat status;
+ int error;
+
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req));
+ if (unlikely(error))
+ goto out;
+ if (status != NFS3_OK)
+ goto out_status;
+ error = decode_pathconf3resok(xdr, result);
+out:
+ return error;
+out_status:
+ return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.21 COMMIT3res
+ *
+ * struct COMMIT3resok {
+ * wcc_data file_wcc;
+ * writeverf3 verf;
+ * };
+ *
+ * struct COMMIT3resfail {
+ * wcc_data file_wcc;
+ * };
+ *
+ * union COMMIT3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * COMMIT3resok resok;
+ * default:
+ * COMMIT3resfail resfail;
+ * };
+ */
+static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_commitres *result = data;
+ struct nfs_writeverf *verf = result->verf;
+ enum nfs_stat status;
+ int error;
+
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ error = decode_wcc_data(xdr, result->fattr, rpc_rqst_userns(req));
+ if (unlikely(error))
+ goto out;
+ result->op_status = status;
+ if (status != NFS3_OK)
+ goto out_status;
+ error = decode_writeverf3(xdr, &verf->verifier);
+ if (!error)
+ verf->committed = NFS_FILE_SYNC;
+out:
+ return error;
+out_status:
+ return nfs3_stat_to_errno(status);
+}
+
+#ifdef CONFIG_NFS_V3_ACL
+
+static inline int decode_getacl3resok(struct xdr_stream *xdr,
+ struct nfs3_getaclres *result,
+ struct user_namespace *userns)
+{
+ struct posix_acl **acl;
+ unsigned int *aclcnt;
+ size_t hdrlen;
+ int error;
+
+ error = decode_post_op_attr(xdr, result->fattr, userns);
+ if (unlikely(error))
+ goto out;
+ error = decode_uint32(xdr, &result->mask);
+ if (unlikely(error))
+ goto out;
+ error = -EINVAL;
+ if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
+ goto out;
+
+ hdrlen = xdr_stream_pos(xdr);
+
+ acl = NULL;
+ if (result->mask & NFS_ACL)
+ acl = &result->acl_access;
+ aclcnt = NULL;
+ if (result->mask & NFS_ACLCNT)
+ aclcnt = &result->acl_access_count;
+ error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl);
+ if (unlikely(error <= 0))
+ goto out;
+
+ acl = NULL;
+ if (result->mask & NFS_DFACL)
+ acl = &result->acl_default;
+ aclcnt = NULL;
+ if (result->mask & NFS_DFACLCNT)
+ aclcnt = &result->acl_default_count;
+ error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl);
+ if (unlikely(error <= 0))
+ return error;
+ error = 0;
+out:
+ return error;
+}
+
+static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *result)
+{
+ enum nfs_stat status;
+ int error;
+
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ if (status != NFS3_OK)
+ goto out_default;
+ error = decode_getacl3resok(xdr, result, rpc_rqst_userns(req));
+out:
+ return error;
+out_default:
+ return nfs3_stat_to_errno(status);
+}
+
+static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *result)
+{
+ enum nfs_stat status;
+ int error;
+
+ error = decode_nfsstat3(xdr, &status);
+ if (unlikely(error))
+ goto out;
+ if (status != NFS3_OK)
+ goto out_default;
+ error = decode_post_op_attr(xdr, result, rpc_rqst_userns(req));
+out:
+ return error;
+out_default:
+ return nfs3_stat_to_errno(status);
+}
+
+#endif /* CONFIG_NFS_V3_ACL */
+
+
+/*
+ * We need to translate between nfs status return values and
+ * the local errno values which may not be the same.
+ */
+static const struct {
+ int stat;
+ int errno;
+} nfs_errtbl[] = {
+ { NFS_OK, 0 },
+ { NFSERR_PERM, -EPERM },
+ { NFSERR_NOENT, -ENOENT },
+ { NFSERR_IO, -errno_NFSERR_IO},
+ { NFSERR_NXIO, -ENXIO },
+/* { NFSERR_EAGAIN, -EAGAIN }, */
+ { NFSERR_ACCES, -EACCES },
+ { NFSERR_EXIST, -EEXIST },
+ { NFSERR_XDEV, -EXDEV },
+ { NFSERR_NODEV, -ENODEV },
+ { NFSERR_NOTDIR, -ENOTDIR },
+ { NFSERR_ISDIR, -EISDIR },
+ { NFSERR_INVAL, -EINVAL },
+ { NFSERR_FBIG, -EFBIG },
+ { NFSERR_NOSPC, -ENOSPC },
+ { NFSERR_ROFS, -EROFS },
+ { NFSERR_MLINK, -EMLINK },
+ { NFSERR_NAMETOOLONG, -ENAMETOOLONG },
+ { NFSERR_NOTEMPTY, -ENOTEMPTY },
+ { NFSERR_DQUOT, -EDQUOT },
+ { NFSERR_STALE, -ESTALE },
+ { NFSERR_REMOTE, -EREMOTE },
+#ifdef EWFLUSH
+ { NFSERR_WFLUSH, -EWFLUSH },
+#endif
+ { NFSERR_BADHANDLE, -EBADHANDLE },
+ { NFSERR_NOT_SYNC, -ENOTSYNC },
+ { NFSERR_BAD_COOKIE, -EBADCOOKIE },
+ { NFSERR_NOTSUPP, -ENOTSUPP },
+ { NFSERR_TOOSMALL, -ETOOSMALL },
+ { NFSERR_SERVERFAULT, -EREMOTEIO },
+ { NFSERR_BADTYPE, -EBADTYPE },
+ { NFSERR_JUKEBOX, -EJUKEBOX },
+ { -1, -EIO }
+};
+
+/**
+ * nfs3_stat_to_errno - convert an NFS status code to a local errno
+ * @status: NFS status code to convert
+ *
+ * Returns a local errno value, or -EIO if the NFS status code is
+ * not recognized. This function is used jointly by NFSv2 and NFSv3.
+ */
+static int nfs3_stat_to_errno(enum nfs_stat status)
+{
+ int i;
+
+ for (i = 0; nfs_errtbl[i].stat != -1; i++) {
+ if (nfs_errtbl[i].stat == (int)status)
+ return nfs_errtbl[i].errno;
+ }
+ dprintk("NFS: Unrecognized nfs status value: %u\n", status);
+ return nfs_errtbl[i].errno;
+}
+
+
+#define PROC(proc, argtype, restype, timer) \
+[NFS3PROC_##proc] = { \
+ .p_proc = NFS3PROC_##proc, \
+ .p_encode = nfs3_xdr_enc_##argtype##3args, \
+ .p_decode = nfs3_xdr_dec_##restype##3res, \
+ .p_arglen = NFS3_##argtype##args_sz, \
+ .p_replen = NFS3_##restype##res_sz, \
+ .p_timer = timer, \
+ .p_statidx = NFS3PROC_##proc, \
+ .p_name = #proc, \
+ }
+
+const struct rpc_procinfo nfs3_procedures[] = {
+ PROC(GETATTR, getattr, getattr, 1),
+ PROC(SETATTR, setattr, setattr, 0),
+ PROC(LOOKUP, lookup, lookup, 2),
+ PROC(ACCESS, access, access, 1),
+ PROC(READLINK, readlink, readlink, 3),
+ PROC(READ, read, read, 3),
+ PROC(WRITE, write, write, 4),
+ PROC(CREATE, create, create, 0),
+ PROC(MKDIR, mkdir, create, 0),
+ PROC(SYMLINK, symlink, create, 0),
+ PROC(MKNOD, mknod, create, 0),
+ PROC(REMOVE, remove, remove, 0),
+ PROC(RMDIR, lookup, setattr, 0),
+ PROC(RENAME, rename, rename, 0),
+ PROC(LINK, link, link, 0),
+ PROC(READDIR, readdir, readdir, 3),
+ PROC(READDIRPLUS, readdirplus, readdir, 3),
+ PROC(FSSTAT, getattr, fsstat, 0),
+ PROC(FSINFO, getattr, fsinfo, 0),
+ PROC(PATHCONF, getattr, pathconf, 0),
+ PROC(COMMIT, commit, commit, 5),
+};
+
+static unsigned int nfs_version3_counts[ARRAY_SIZE(nfs3_procedures)];
+const struct rpc_version nfs_version3 = {
+ .number = 3,
+ .nrprocs = ARRAY_SIZE(nfs3_procedures),
+ .procs = nfs3_procedures,
+ .counts = nfs_version3_counts,
+};
+
+#ifdef CONFIG_NFS_V3_ACL
+static const struct rpc_procinfo nfs3_acl_procedures[] = {
+ [ACLPROC3_GETACL] = {
+ .p_proc = ACLPROC3_GETACL,
+ .p_encode = nfs3_xdr_enc_getacl3args,
+ .p_decode = nfs3_xdr_dec_getacl3res,
+ .p_arglen = ACL3_getaclargs_sz,
+ .p_replen = ACL3_getaclres_sz,
+ .p_timer = 1,
+ .p_name = "GETACL",
+ },
+ [ACLPROC3_SETACL] = {
+ .p_proc = ACLPROC3_SETACL,
+ .p_encode = nfs3_xdr_enc_setacl3args,
+ .p_decode = nfs3_xdr_dec_setacl3res,
+ .p_arglen = ACL3_setaclargs_sz,
+ .p_replen = ACL3_setaclres_sz,
+ .p_timer = 0,
+ .p_name = "SETACL",
+ },
+};
+
+static unsigned int nfs3_acl_counts[ARRAY_SIZE(nfs3_acl_procedures)];
+const struct rpc_version nfsacl_version3 = {
+ .number = 3,
+ .nrprocs = ARRAY_SIZE(nfs3_acl_procedures),
+ .procs = nfs3_acl_procedures,
+ .counts = nfs3_acl_counts,
+};
+#endif /* CONFIG_NFS_V3_ACL */
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
new file mode 100644
index 0000000000..b59876b01a
--- /dev/null
+++ b/fs/nfs/nfs42.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
+ */
+
+#ifndef __LINUX_FS_NFS_NFS4_2_H
+#define __LINUX_FS_NFS_NFS4_2_H
+
+#include <linux/xattr.h>
+
+/*
+ * FIXME: four LAYOUTSTATS calls per compound at most! Do we need to support
+ * more? Need to consider not to pre-alloc too much for a compound.
+ */
+#define PNFS_LAYOUTSTATS_MAXDEV (4)
+#define READ_PLUS_SCRATCH_SIZE (16)
+
+/* nfs4.2proc.c */
+#ifdef CONFIG_NFS_V4_2
+int nfs42_proc_allocate(struct file *, loff_t, loff_t);
+ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t,
+ struct nl4_server *, nfs4_stateid *, bool);
+int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
+loff_t nfs42_proc_llseek(struct file *, loff_t, int);
+int nfs42_proc_layoutstats_generic(struct nfs_server *,
+ struct nfs42_layoutstat_data *);
+int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t);
+int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
+ const struct nfs42_layout_error *errors,
+ size_t n);
+int nfs42_proc_copy_notify(struct file *, struct file *,
+ struct nfs42_copy_notify_res *);
+static inline bool nfs42_files_from_same_server(struct file *in,
+ struct file *out)
+{
+ struct nfs_client *c_in = (NFS_SERVER(file_inode(in)))->nfs_client;
+ struct nfs_client *c_out = (NFS_SERVER(file_inode(out)))->nfs_client;
+
+ return nfs4_check_serverowner_major_id(c_in->cl_serverowner,
+ c_out->cl_serverowner);
+}
+
+ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name,
+ void *buf, size_t buflen);
+int nfs42_proc_setxattr(struct inode *inode, const char *name,
+ const void *buf, size_t buflen, int flags);
+ssize_t nfs42_proc_listxattrs(struct inode *inode, void *buf,
+ size_t buflen, u64 *cookiep, bool *eofp);
+int nfs42_proc_removexattr(struct inode *inode, const char *name);
+
+/*
+ * Maximum XDR buffer size needed for a listxattr buffer of buflen size.
+ *
+ * The upper boundary is a buffer with all 1-byte sized attribute names.
+ * They would be 7 bytes long in the eventual buffer ("user.x\0"), and
+ * 8 bytes long XDR-encoded.
+ *
+ * Include the trailing eof word as well.
+ */
+static inline u32 nfs42_listxattr_xdrsize(u32 buflen)
+{
+ return ((buflen / (XATTR_USER_PREFIX_LEN + 2)) * 8) + 4;
+}
+#endif /* CONFIG_NFS_V4_2 */
+#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
new file mode 100644
index 0000000000..28704f9246
--- /dev/null
+++ b/fs/nfs/nfs42proc.c
@@ -0,0 +1,1461 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
+ */
+#include <linux/fs.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_xdr.h>
+#include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
+#include "nfs42.h"
+#include "iostat.h"
+#include "pnfs.h"
+#include "nfs4session.h"
+#include "internal.h"
+#include "delegation.h"
+#include "nfs4trace.h"
+
+#define NFSDBG_FACILITY NFSDBG_PROC
+static int nfs42_do_offload_cancel_async(struct file *dst, nfs4_stateid *std);
+
+static void nfs42_set_netaddr(struct file *filep, struct nfs42_netaddr *naddr)
+{
+ struct nfs_client *clp = (NFS_SERVER(file_inode(filep)))->nfs_client;
+ unsigned short port = 2049;
+
+ rcu_read_lock();
+ naddr->netid_len = scnprintf(naddr->netid,
+ sizeof(naddr->netid), "%s",
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_NETID));
+ naddr->addr_len = scnprintf(naddr->addr,
+ sizeof(naddr->addr),
+ "%s.%u.%u",
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR),
+ port >> 8, port & 255);
+ rcu_read_unlock();
+}
+
+static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
+ struct nfs_lock_context *lock, loff_t offset, loff_t len)
+{
+ struct inode *inode = file_inode(filep);
+ struct nfs_server *server = NFS_SERVER(inode);
+ u32 bitmask[NFS_BITMASK_SZ];
+ struct nfs42_falloc_args args = {
+ .falloc_fh = NFS_FH(inode),
+ .falloc_offset = offset,
+ .falloc_length = len,
+ .falloc_bitmask = bitmask,
+ };
+ struct nfs42_falloc_res res = {
+ .falloc_server = server,
+ };
+ int status;
+
+ msg->rpc_argp = &args;
+ msg->rpc_resp = &res;
+
+ status = nfs4_set_rw_stateid(&args.falloc_stateid, lock->open_context,
+ lock, FMODE_WRITE);
+ if (status) {
+ if (status == -EAGAIN)
+ status = -NFS4ERR_BAD_STATEID;
+ return status;
+ }
+
+ nfs4_bitmask_set(bitmask, server->cache_consistency_bitmask, inode,
+ NFS_INO_INVALID_BLOCKS);
+
+ res.falloc_fattr = nfs_alloc_fattr();
+ if (!res.falloc_fattr)
+ return -ENOMEM;
+
+ status = nfs4_call_sync(server->client, server, msg,
+ &args.seq_args, &res.seq_res, 0);
+ if (status == 0) {
+ if (nfs_should_remove_suid(inode)) {
+ spin_lock(&inode->i_lock);
+ nfs_set_cache_invalid(inode,
+ NFS_INO_REVAL_FORCED | NFS_INO_INVALID_MODE);
+ spin_unlock(&inode->i_lock);
+ }
+ status = nfs_post_op_update_inode_force_wcc(inode,
+ res.falloc_fattr);
+ }
+ if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE])
+ trace_nfs4_fallocate(inode, &args, status);
+ else
+ trace_nfs4_deallocate(inode, &args, status);
+ kfree(res.falloc_fattr);
+ return status;
+}
+
+static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
+ loff_t offset, loff_t len)
+{
+ struct inode *inode = file_inode(filep);
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs4_exception exception = { };
+ struct nfs_lock_context *lock;
+ int err;
+
+ lock = nfs_get_lock_context(nfs_file_open_context(filep));
+ if (IS_ERR(lock))
+ return PTR_ERR(lock);
+
+ exception.inode = inode;
+ exception.state = lock->open_context->state;
+
+ err = nfs_sync_inode(inode);
+ if (err)
+ goto out;
+
+ do {
+ err = _nfs42_proc_fallocate(msg, filep, lock, offset, len);
+ if (err == -ENOTSUPP) {
+ err = -EOPNOTSUPP;
+ break;
+ }
+ err = nfs4_handle_exception(server, err, &exception);
+ } while (exception.retry);
+out:
+ nfs_put_lock_context(lock);
+ return err;
+}
+
+int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE],
+ };
+ struct inode *inode = file_inode(filep);
+ int err;
+
+ if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
+ return -EOPNOTSUPP;
+
+ inode_lock(inode);
+
+ err = nfs42_proc_fallocate(&msg, filep, offset, len);
+ if (err == -EOPNOTSUPP)
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE;
+
+ inode_unlock(inode);
+ return err;
+}
+
+int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DEALLOCATE],
+ };
+ struct inode *inode = file_inode(filep);
+ int err;
+
+ if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE))
+ return -EOPNOTSUPP;
+
+ inode_lock(inode);
+
+ err = nfs42_proc_fallocate(&msg, filep, offset, len);
+ if (err == 0)
+ truncate_pagecache_range(inode, offset, (offset + len) -1);
+ if (err == -EOPNOTSUPP)
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
+
+ inode_unlock(inode);
+ return err;
+}
+
+static int handle_async_copy(struct nfs42_copy_res *res,
+ struct nfs_server *dst_server,
+ struct nfs_server *src_server,
+ struct file *src,
+ struct file *dst,
+ nfs4_stateid *src_stateid,
+ bool *restart)
+{
+ struct nfs4_copy_state *copy, *tmp_copy = NULL, *iter;
+ int status = NFS4_OK;
+ struct nfs_open_context *dst_ctx = nfs_file_open_context(dst);
+ struct nfs_open_context *src_ctx = nfs_file_open_context(src);
+
+ copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL);
+ if (!copy)
+ return -ENOMEM;
+
+ spin_lock(&dst_server->nfs_client->cl_lock);
+ list_for_each_entry(iter,
+ &dst_server->nfs_client->pending_cb_stateids,
+ copies) {
+ if (memcmp(&res->write_res.stateid, &iter->stateid,
+ NFS4_STATEID_SIZE))
+ continue;
+ tmp_copy = iter;
+ list_del(&iter->copies);
+ break;
+ }
+ if (tmp_copy) {
+ spin_unlock(&dst_server->nfs_client->cl_lock);
+ kfree(copy);
+ copy = tmp_copy;
+ goto out;
+ }
+
+ memcpy(&copy->stateid, &res->write_res.stateid, NFS4_STATEID_SIZE);
+ init_completion(&copy->completion);
+ copy->parent_dst_state = dst_ctx->state;
+ copy->parent_src_state = src_ctx->state;
+
+ list_add_tail(&copy->copies, &dst_server->ss_copies);
+ spin_unlock(&dst_server->nfs_client->cl_lock);
+
+ if (dst_server != src_server) {
+ spin_lock(&src_server->nfs_client->cl_lock);
+ list_add_tail(&copy->src_copies, &src_server->ss_copies);
+ spin_unlock(&src_server->nfs_client->cl_lock);
+ }
+
+ status = wait_for_completion_interruptible(&copy->completion);
+ spin_lock(&dst_server->nfs_client->cl_lock);
+ list_del_init(&copy->copies);
+ spin_unlock(&dst_server->nfs_client->cl_lock);
+ if (dst_server != src_server) {
+ spin_lock(&src_server->nfs_client->cl_lock);
+ list_del_init(&copy->src_copies);
+ spin_unlock(&src_server->nfs_client->cl_lock);
+ }
+ if (status == -ERESTARTSYS) {
+ goto out_cancel;
+ } else if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) {
+ status = -EAGAIN;
+ *restart = true;
+ goto out_cancel;
+ }
+out:
+ res->write_res.count = copy->count;
+ memcpy(&res->write_res.verifier, &copy->verf, sizeof(copy->verf));
+ status = -copy->error;
+
+out_free:
+ kfree(copy);
+ return status;
+out_cancel:
+ nfs42_do_offload_cancel_async(dst, &copy->stateid);
+ if (!nfs42_files_from_same_server(src, dst))
+ nfs42_do_offload_cancel_async(src, src_stateid);
+ goto out_free;
+}
+
+static int process_copy_commit(struct file *dst, loff_t pos_dst,
+ struct nfs42_copy_res *res)
+{
+ struct nfs_commitres cres;
+ int status = -ENOMEM;
+
+ cres.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_KERNEL);
+ if (!cres.verf)
+ goto out;
+
+ status = nfs4_proc_commit(dst, pos_dst, res->write_res.count, &cres);
+ if (status)
+ goto out_free;
+ if (nfs_write_verifier_cmp(&res->write_res.verifier.verifier,
+ &cres.verf->verifier)) {
+ dprintk("commit verf differs from copy verf\n");
+ status = -EAGAIN;
+ }
+out_free:
+ kfree(cres.verf);
+out:
+ return status;
+}
+
+/**
+ * nfs42_copy_dest_done - perform inode cache updates after clone/copy offload
+ * @inode: pointer to destination inode
+ * @pos: destination offset
+ * @len: copy length
+ *
+ * Punch a hole in the inode page cache, so that the NFS client will
+ * know to retrieve new data.
+ * Update the file size if necessary, and then mark the inode as having
+ * invalid cached values for change attribute, ctime, mtime and space used.
+ */
+static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len)
+{
+ loff_t newsize = pos + len;
+ loff_t end = newsize - 1;
+
+ WARN_ON_ONCE(invalidate_inode_pages2_range(inode->i_mapping,
+ pos >> PAGE_SHIFT, end >> PAGE_SHIFT));
+
+ spin_lock(&inode->i_lock);
+ if (newsize > i_size_read(inode))
+ i_size_write(inode, newsize);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE |
+ NFS_INO_INVALID_CTIME |
+ NFS_INO_INVALID_MTIME |
+ NFS_INO_INVALID_BLOCKS);
+ spin_unlock(&inode->i_lock);
+}
+
+static ssize_t _nfs42_proc_copy(struct file *src,
+ struct nfs_lock_context *src_lock,
+ struct file *dst,
+ struct nfs_lock_context *dst_lock,
+ struct nfs42_copy_args *args,
+ struct nfs42_copy_res *res,
+ struct nl4_server *nss,
+ nfs4_stateid *cnr_stateid,
+ bool *restart)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY],
+ .rpc_argp = args,
+ .rpc_resp = res,
+ };
+ struct inode *dst_inode = file_inode(dst);
+ struct inode *src_inode = file_inode(src);
+ struct nfs_server *dst_server = NFS_SERVER(dst_inode);
+ struct nfs_server *src_server = NFS_SERVER(src_inode);
+ loff_t pos_src = args->src_pos;
+ loff_t pos_dst = args->dst_pos;
+ size_t count = args->count;
+ ssize_t status;
+
+ if (nss) {
+ args->cp_src = nss;
+ nfs4_stateid_copy(&args->src_stateid, cnr_stateid);
+ } else {
+ status = nfs4_set_rw_stateid(&args->src_stateid,
+ src_lock->open_context, src_lock, FMODE_READ);
+ if (status) {
+ if (status == -EAGAIN)
+ status = -NFS4ERR_BAD_STATEID;
+ return status;
+ }
+ }
+ status = nfs_filemap_write_and_wait_range(src->f_mapping,
+ pos_src, pos_src + (loff_t)count - 1);
+ if (status)
+ return status;
+
+ status = nfs4_set_rw_stateid(&args->dst_stateid, dst_lock->open_context,
+ dst_lock, FMODE_WRITE);
+ if (status) {
+ if (status == -EAGAIN)
+ status = -NFS4ERR_BAD_STATEID;
+ return status;
+ }
+
+ status = nfs_sync_inode(dst_inode);
+ if (status)
+ return status;
+
+ res->commit_res.verf = NULL;
+ if (args->sync) {
+ res->commit_res.verf =
+ kzalloc(sizeof(struct nfs_writeverf), GFP_KERNEL);
+ if (!res->commit_res.verf)
+ return -ENOMEM;
+ }
+ set_bit(NFS_CLNT_SRC_SSC_COPY_STATE,
+ &src_lock->open_context->state->flags);
+ set_bit(NFS_CLNT_DST_SSC_COPY_STATE,
+ &dst_lock->open_context->state->flags);
+
+ status = nfs4_call_sync(dst_server->client, dst_server, &msg,
+ &args->seq_args, &res->seq_res, 0);
+ trace_nfs4_copy(src_inode, dst_inode, args, res, nss, status);
+ if (status == -ENOTSUPP)
+ dst_server->caps &= ~NFS_CAP_COPY;
+ if (status)
+ goto out;
+
+ if (args->sync &&
+ nfs_write_verifier_cmp(&res->write_res.verifier.verifier,
+ &res->commit_res.verf->verifier)) {
+ status = -EAGAIN;
+ goto out;
+ }
+
+ if (!res->synchronous) {
+ status = handle_async_copy(res, dst_server, src_server, src,
+ dst, &args->src_stateid, restart);
+ if (status)
+ goto out;
+ }
+
+ if ((!res->synchronous || !args->sync) &&
+ res->write_res.verifier.committed != NFS_FILE_SYNC) {
+ status = process_copy_commit(dst, pos_dst, res);
+ if (status)
+ goto out;
+ }
+
+ nfs42_copy_dest_done(dst_inode, pos_dst, res->write_res.count);
+ nfs_invalidate_atime(src_inode);
+ status = res->write_res.count;
+out:
+ if (args->sync)
+ kfree(res->commit_res.verf);
+ return status;
+}
+
+ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
+ struct file *dst, loff_t pos_dst, size_t count,
+ struct nl4_server *nss,
+ nfs4_stateid *cnr_stateid, bool sync)
+{
+ struct nfs_server *server = NFS_SERVER(file_inode(dst));
+ struct nfs_lock_context *src_lock;
+ struct nfs_lock_context *dst_lock;
+ struct nfs42_copy_args args = {
+ .src_fh = NFS_FH(file_inode(src)),
+ .src_pos = pos_src,
+ .dst_fh = NFS_FH(file_inode(dst)),
+ .dst_pos = pos_dst,
+ .count = count,
+ .sync = sync,
+ };
+ struct nfs42_copy_res res;
+ struct nfs4_exception src_exception = {
+ .inode = file_inode(src),
+ .stateid = &args.src_stateid,
+ };
+ struct nfs4_exception dst_exception = {
+ .inode = file_inode(dst),
+ .stateid = &args.dst_stateid,
+ };
+ ssize_t err, err2;
+ bool restart = false;
+
+ src_lock = nfs_get_lock_context(nfs_file_open_context(src));
+ if (IS_ERR(src_lock))
+ return PTR_ERR(src_lock);
+
+ src_exception.state = src_lock->open_context->state;
+
+ dst_lock = nfs_get_lock_context(nfs_file_open_context(dst));
+ if (IS_ERR(dst_lock)) {
+ err = PTR_ERR(dst_lock);
+ goto out_put_src_lock;
+ }
+
+ dst_exception.state = dst_lock->open_context->state;
+
+ do {
+ inode_lock(file_inode(dst));
+ err = _nfs42_proc_copy(src, src_lock,
+ dst, dst_lock,
+ &args, &res,
+ nss, cnr_stateid, &restart);
+ inode_unlock(file_inode(dst));
+
+ if (err >= 0)
+ break;
+ if ((err == -ENOTSUPP ||
+ err == -NFS4ERR_OFFLOAD_DENIED) &&
+ nfs42_files_from_same_server(src, dst)) {
+ err = -EOPNOTSUPP;
+ break;
+ } else if (err == -EAGAIN) {
+ if (!restart) {
+ dst_exception.retry = 1;
+ continue;
+ }
+ break;
+ } else if (err == -NFS4ERR_OFFLOAD_NO_REQS &&
+ args.sync != res.synchronous) {
+ args.sync = res.synchronous;
+ dst_exception.retry = 1;
+ continue;
+ } else if ((err == -ESTALE ||
+ err == -NFS4ERR_OFFLOAD_DENIED ||
+ err == -ENOTSUPP) &&
+ !nfs42_files_from_same_server(src, dst)) {
+ nfs42_do_offload_cancel_async(src, &args.src_stateid);
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ err2 = nfs4_handle_exception(server, err, &src_exception);
+ err = nfs4_handle_exception(server, err, &dst_exception);
+ if (!err)
+ err = err2;
+ } while (src_exception.retry || dst_exception.retry);
+
+ nfs_put_lock_context(dst_lock);
+out_put_src_lock:
+ nfs_put_lock_context(src_lock);
+ return err;
+}
+
+struct nfs42_offloadcancel_data {
+ struct nfs_server *seq_server;
+ struct nfs42_offload_status_args args;
+ struct nfs42_offload_status_res res;
+};
+
+static void nfs42_offload_cancel_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs42_offloadcancel_data *data = calldata;
+
+ nfs4_setup_sequence(data->seq_server->nfs_client,
+ &data->args.osa_seq_args,
+ &data->res.osr_seq_res, task);
+}
+
+static void nfs42_offload_cancel_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs42_offloadcancel_data *data = calldata;
+
+ trace_nfs4_offload_cancel(&data->args, task->tk_status);
+ nfs41_sequence_done(task, &data->res.osr_seq_res);
+ if (task->tk_status &&
+ nfs4_async_handle_error(task, data->seq_server, NULL,
+ NULL) == -EAGAIN)
+ rpc_restart_call_prepare(task);
+}
+
+static void nfs42_free_offloadcancel_data(void *data)
+{
+ kfree(data);
+}
+
+static const struct rpc_call_ops nfs42_offload_cancel_ops = {
+ .rpc_call_prepare = nfs42_offload_cancel_prepare,
+ .rpc_call_done = nfs42_offload_cancel_done,
+ .rpc_release = nfs42_free_offloadcancel_data,
+};
+
+static int nfs42_do_offload_cancel_async(struct file *dst,
+ nfs4_stateid *stateid)
+{
+ struct nfs_server *dst_server = NFS_SERVER(file_inode(dst));
+ struct nfs42_offloadcancel_data *data = NULL;
+ struct nfs_open_context *ctx = nfs_file_open_context(dst);
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OFFLOAD_CANCEL],
+ .rpc_cred = ctx->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = dst_server->client,
+ .rpc_message = &msg,
+ .callback_ops = &nfs42_offload_cancel_ops,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC,
+ };
+ int status;
+
+ if (!(dst_server->caps & NFS_CAP_OFFLOAD_CANCEL))
+ return -EOPNOTSUPP;
+
+ data = kzalloc(sizeof(struct nfs42_offloadcancel_data), GFP_KERNEL);
+ if (data == NULL)
+ return -ENOMEM;
+
+ data->seq_server = dst_server;
+ data->args.osa_src_fh = NFS_FH(file_inode(dst));
+ memcpy(&data->args.osa_stateid, stateid,
+ sizeof(data->args.osa_stateid));
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ task_setup_data.callback_data = data;
+ nfs4_init_sequence(&data->args.osa_seq_args, &data->res.osr_seq_res,
+ 1, 0);
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ status = rpc_wait_for_completion_task(task);
+ if (status == -ENOTSUPP)
+ dst_server->caps &= ~NFS_CAP_OFFLOAD_CANCEL;
+ rpc_put_task(task);
+ return status;
+}
+
+static int _nfs42_proc_copy_notify(struct file *src, struct file *dst,
+ struct nfs42_copy_notify_args *args,
+ struct nfs42_copy_notify_res *res)
+{
+ struct nfs_server *src_server = NFS_SERVER(file_inode(src));
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY_NOTIFY],
+ .rpc_argp = args,
+ .rpc_resp = res,
+ };
+ int status;
+ struct nfs_open_context *ctx;
+ struct nfs_lock_context *l_ctx;
+
+ ctx = get_nfs_open_context(nfs_file_open_context(src));
+ l_ctx = nfs_get_lock_context(ctx);
+ if (IS_ERR(l_ctx)) {
+ status = PTR_ERR(l_ctx);
+ goto out;
+ }
+
+ status = nfs4_set_rw_stateid(&args->cna_src_stateid, ctx, l_ctx,
+ FMODE_READ);
+ nfs_put_lock_context(l_ctx);
+ if (status) {
+ if (status == -EAGAIN)
+ status = -NFS4ERR_BAD_STATEID;
+ goto out;
+ }
+
+ status = nfs4_call_sync(src_server->client, src_server, &msg,
+ &args->cna_seq_args, &res->cnr_seq_res, 0);
+ trace_nfs4_copy_notify(file_inode(src), args, res, status);
+ if (status == -ENOTSUPP)
+ src_server->caps &= ~NFS_CAP_COPY_NOTIFY;
+
+out:
+ put_nfs_open_context(nfs_file_open_context(src));
+ return status;
+}
+
+int nfs42_proc_copy_notify(struct file *src, struct file *dst,
+ struct nfs42_copy_notify_res *res)
+{
+ struct nfs_server *src_server = NFS_SERVER(file_inode(src));
+ struct nfs42_copy_notify_args *args;
+ struct nfs4_exception exception = {
+ .inode = file_inode(src),
+ };
+ int status;
+
+ if (!(src_server->caps & NFS_CAP_COPY_NOTIFY))
+ return -EOPNOTSUPP;
+
+ args = kzalloc(sizeof(struct nfs42_copy_notify_args), GFP_KERNEL);
+ if (args == NULL)
+ return -ENOMEM;
+
+ args->cna_src_fh = NFS_FH(file_inode(src)),
+ args->cna_dst.nl4_type = NL4_NETADDR;
+ nfs42_set_netaddr(dst, &args->cna_dst.u.nl4_addr);
+ exception.stateid = &args->cna_src_stateid;
+
+ do {
+ status = _nfs42_proc_copy_notify(src, dst, args, res);
+ if (status == -ENOTSUPP) {
+ status = -EOPNOTSUPP;
+ goto out;
+ }
+ status = nfs4_handle_exception(src_server, status, &exception);
+ } while (exception.retry);
+
+out:
+ kfree(args);
+ return status;
+}
+
+static loff_t _nfs42_proc_llseek(struct file *filep,
+ struct nfs_lock_context *lock, loff_t offset, int whence)
+{
+ struct inode *inode = file_inode(filep);
+ struct nfs42_seek_args args = {
+ .sa_fh = NFS_FH(inode),
+ .sa_offset = offset,
+ .sa_what = (whence == SEEK_HOLE) ?
+ NFS4_CONTENT_HOLE : NFS4_CONTENT_DATA,
+ };
+ struct nfs42_seek_res res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEEK],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ struct nfs_server *server = NFS_SERVER(inode);
+ int status;
+
+ if (!nfs_server_capable(inode, NFS_CAP_SEEK))
+ return -ENOTSUPP;
+
+ status = nfs4_set_rw_stateid(&args.sa_stateid, lock->open_context,
+ lock, FMODE_READ);
+ if (status) {
+ if (status == -EAGAIN)
+ status = -NFS4ERR_BAD_STATEID;
+ return status;
+ }
+
+ status = nfs_filemap_write_and_wait_range(inode->i_mapping,
+ offset, LLONG_MAX);
+ if (status)
+ return status;
+
+ status = nfs4_call_sync(server->client, server, &msg,
+ &args.seq_args, &res.seq_res, 0);
+ trace_nfs4_llseek(inode, &args, &res, status);
+ if (status == -ENOTSUPP)
+ server->caps &= ~NFS_CAP_SEEK;
+ if (status)
+ return status;
+
+ if (whence == SEEK_DATA && res.sr_eof)
+ return -NFS4ERR_NXIO;
+ else
+ return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
+}
+
+loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
+{
+ struct nfs_server *server = NFS_SERVER(file_inode(filep));
+ struct nfs4_exception exception = { };
+ struct nfs_lock_context *lock;
+ loff_t err;
+
+ lock = nfs_get_lock_context(nfs_file_open_context(filep));
+ if (IS_ERR(lock))
+ return PTR_ERR(lock);
+
+ exception.inode = file_inode(filep);
+ exception.state = lock->open_context->state;
+
+ do {
+ err = _nfs42_proc_llseek(filep, lock, offset, whence);
+ if (err >= 0)
+ break;
+ if (err == -ENOTSUPP) {
+ err = -EOPNOTSUPP;
+ break;
+ }
+ err = nfs4_handle_exception(server, err, &exception);
+ } while (exception.retry);
+
+ nfs_put_lock_context(lock);
+ return err;
+}
+
+
+static void
+nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs42_layoutstat_data *data = calldata;
+ struct inode *inode = data->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct pnfs_layout_hdr *lo;
+
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (!pnfs_layout_is_valid(lo)) {
+ spin_unlock(&inode->i_lock);
+ rpc_exit(task, 0);
+ return;
+ }
+ nfs4_stateid_copy(&data->args.stateid, &lo->plh_stateid);
+ spin_unlock(&inode->i_lock);
+ nfs4_setup_sequence(server->nfs_client, &data->args.seq_args,
+ &data->res.seq_res, task);
+}
+
+static void
+nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs42_layoutstat_data *data = calldata;
+ struct inode *inode = data->inode;
+ struct pnfs_layout_hdr *lo;
+
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return;
+
+ switch (task->tk_status) {
+ case 0:
+ return;
+ case -NFS4ERR_BADHANDLE:
+ case -ESTALE:
+ pnfs_destroy_layout(NFS_I(inode));
+ break;
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_BAD_STATEID:
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match(&data->args.stateid,
+ &lo->plh_stateid)) {
+ LIST_HEAD(head);
+
+ /*
+ * Mark the bad layout state as invalid, then retry
+ * with the current stateid.
+ */
+ pnfs_mark_layout_stateid_invalid(lo, &head);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&head);
+ nfs_commit_inode(inode, 0);
+ } else
+ spin_unlock(&inode->i_lock);
+ break;
+ case -NFS4ERR_OLD_STATEID:
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match_other(&data->args.stateid,
+ &lo->plh_stateid)) {
+ /* Do we need to delay before resending? */
+ if (!nfs4_stateid_is_newer(&lo->plh_stateid,
+ &data->args.stateid))
+ rpc_delay(task, HZ);
+ rpc_restart_call_prepare(task);
+ }
+ spin_unlock(&inode->i_lock);
+ break;
+ case -ENOTSUPP:
+ case -EOPNOTSUPP:
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
+ }
+
+ trace_nfs4_layoutstats(inode, &data->args.stateid, task->tk_status);
+}
+
+static void
+nfs42_layoutstat_release(void *calldata)
+{
+ struct nfs42_layoutstat_data *data = calldata;
+ struct nfs42_layoutstat_devinfo *devinfo = data->args.devinfo;
+ int i;
+
+ for (i = 0; i < data->args.num_dev; i++) {
+ if (devinfo[i].ld_private.ops && devinfo[i].ld_private.ops->free)
+ devinfo[i].ld_private.ops->free(&devinfo[i].ld_private);
+ }
+
+ pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout);
+ smp_mb__before_atomic();
+ clear_bit(NFS_INO_LAYOUTSTATS, &NFS_I(data->args.inode)->flags);
+ smp_mb__after_atomic();
+ nfs_iput_and_deactive(data->inode);
+ kfree(data->args.devinfo);
+ kfree(data);
+}
+
+static const struct rpc_call_ops nfs42_layoutstat_ops = {
+ .rpc_call_prepare = nfs42_layoutstat_prepare,
+ .rpc_call_done = nfs42_layoutstat_done,
+ .rpc_release = nfs42_layoutstat_release,
+};
+
+int nfs42_proc_layoutstats_generic(struct nfs_server *server,
+ struct nfs42_layoutstat_data *data)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTSTATS],
+ .rpc_argp = &data->args,
+ .rpc_resp = &data->res,
+ };
+ struct rpc_task_setup task_setup = {
+ .rpc_client = server->client,
+ .rpc_message = &msg,
+ .callback_ops = &nfs42_layoutstat_ops,
+ .callback_data = data,
+ .flags = RPC_TASK_ASYNC,
+ };
+ struct rpc_task *task;
+
+ data->inode = nfs_igrab_and_active(data->args.inode);
+ if (!data->inode) {
+ nfs42_layoutstat_release(data);
+ return -EAGAIN;
+ }
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0);
+ task = rpc_run_task(&task_setup);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ rpc_put_task(task);
+ return 0;
+}
+
+static struct nfs42_layouterror_data *
+nfs42_alloc_layouterror_data(struct pnfs_layout_segment *lseg, gfp_t gfp_flags)
+{
+ struct nfs42_layouterror_data *data;
+ struct inode *inode = lseg->pls_layout->plh_inode;
+
+ data = kzalloc(sizeof(*data), gfp_flags);
+ if (data) {
+ data->args.inode = data->inode = nfs_igrab_and_active(inode);
+ if (data->inode) {
+ data->lseg = pnfs_get_lseg(lseg);
+ if (data->lseg)
+ return data;
+ nfs_iput_and_deactive(data->inode);
+ }
+ kfree(data);
+ }
+ return NULL;
+}
+
+static void
+nfs42_free_layouterror_data(struct nfs42_layouterror_data *data)
+{
+ pnfs_put_lseg(data->lseg);
+ nfs_iput_and_deactive(data->inode);
+ kfree(data);
+}
+
+static void
+nfs42_layouterror_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs42_layouterror_data *data = calldata;
+ struct inode *inode = data->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct pnfs_layout_hdr *lo = data->lseg->pls_layout;
+ unsigned i;
+
+ spin_lock(&inode->i_lock);
+ if (!pnfs_layout_is_valid(lo)) {
+ spin_unlock(&inode->i_lock);
+ rpc_exit(task, 0);
+ return;
+ }
+ for (i = 0; i < data->args.num_errors; i++)
+ nfs4_stateid_copy(&data->args.errors[i].stateid,
+ &lo->plh_stateid);
+ spin_unlock(&inode->i_lock);
+ nfs4_setup_sequence(server->nfs_client, &data->args.seq_args,
+ &data->res.seq_res, task);
+}
+
+static void
+nfs42_layouterror_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs42_layouterror_data *data = calldata;
+ struct inode *inode = data->inode;
+ struct pnfs_layout_hdr *lo = data->lseg->pls_layout;
+
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return;
+
+ switch (task->tk_status) {
+ case 0:
+ return;
+ case -NFS4ERR_BADHANDLE:
+ case -ESTALE:
+ pnfs_destroy_layout(NFS_I(inode));
+ break;
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_BAD_STATEID:
+ spin_lock(&inode->i_lock);
+ if (pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match(&data->args.errors[0].stateid,
+ &lo->plh_stateid)) {
+ LIST_HEAD(head);
+
+ /*
+ * Mark the bad layout state as invalid, then retry
+ * with the current stateid.
+ */
+ pnfs_mark_layout_stateid_invalid(lo, &head);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&head);
+ nfs_commit_inode(inode, 0);
+ } else
+ spin_unlock(&inode->i_lock);
+ break;
+ case -NFS4ERR_OLD_STATEID:
+ spin_lock(&inode->i_lock);
+ if (pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match_other(&data->args.errors[0].stateid,
+ &lo->plh_stateid)) {
+ /* Do we need to delay before resending? */
+ if (!nfs4_stateid_is_newer(&lo->plh_stateid,
+ &data->args.errors[0].stateid))
+ rpc_delay(task, HZ);
+ rpc_restart_call_prepare(task);
+ }
+ spin_unlock(&inode->i_lock);
+ break;
+ case -ENOTSUPP:
+ case -EOPNOTSUPP:
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTERROR;
+ }
+
+ trace_nfs4_layouterror(inode, &data->args.errors[0].stateid,
+ task->tk_status);
+}
+
+static void
+nfs42_layouterror_release(void *calldata)
+{
+ struct nfs42_layouterror_data *data = calldata;
+
+ nfs42_free_layouterror_data(data);
+}
+
+static const struct rpc_call_ops nfs42_layouterror_ops = {
+ .rpc_call_prepare = nfs42_layouterror_prepare,
+ .rpc_call_done = nfs42_layouterror_done,
+ .rpc_release = nfs42_layouterror_release,
+};
+
+int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
+ const struct nfs42_layout_error *errors, size_t n)
+{
+ struct inode *inode = lseg->pls_layout->plh_inode;
+ struct nfs42_layouterror_data *data;
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTERROR],
+ };
+ struct rpc_task_setup task_setup = {
+ .rpc_message = &msg,
+ .callback_ops = &nfs42_layouterror_ops,
+ .flags = RPC_TASK_ASYNC,
+ };
+ unsigned int i;
+
+ if (!nfs_server_capable(inode, NFS_CAP_LAYOUTERROR))
+ return -EOPNOTSUPP;
+ if (n > NFS42_LAYOUTERROR_MAX)
+ return -EINVAL;
+ data = nfs42_alloc_layouterror_data(lseg, nfs_io_gfp_mask());
+ if (!data)
+ return -ENOMEM;
+ for (i = 0; i < n; i++) {
+ data->args.errors[i] = errors[i];
+ data->args.num_errors++;
+ data->res.num_errors++;
+ }
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ task_setup.callback_data = data;
+ task_setup.rpc_client = NFS_SERVER(inode)->client;
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0);
+ task = rpc_run_task(&task_setup);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ rpc_put_task(task);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs42_proc_layouterror);
+
+static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
+ struct file *dst_f, struct nfs_lock_context *src_lock,
+ struct nfs_lock_context *dst_lock, loff_t src_offset,
+ loff_t dst_offset, loff_t count)
+{
+ struct inode *src_inode = file_inode(src_f);
+ struct inode *dst_inode = file_inode(dst_f);
+ struct nfs_server *server = NFS_SERVER(dst_inode);
+ __u32 dst_bitmask[NFS_BITMASK_SZ];
+ struct nfs42_clone_args args = {
+ .src_fh = NFS_FH(src_inode),
+ .dst_fh = NFS_FH(dst_inode),
+ .src_offset = src_offset,
+ .dst_offset = dst_offset,
+ .count = count,
+ .dst_bitmask = dst_bitmask,
+ };
+ struct nfs42_clone_res res = {
+ .server = server,
+ };
+ int status;
+
+ msg->rpc_argp = &args;
+ msg->rpc_resp = &res;
+
+ status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context,
+ src_lock, FMODE_READ);
+ if (status) {
+ if (status == -EAGAIN)
+ status = -NFS4ERR_BAD_STATEID;
+ return status;
+ }
+ status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
+ dst_lock, FMODE_WRITE);
+ if (status) {
+ if (status == -EAGAIN)
+ status = -NFS4ERR_BAD_STATEID;
+ return status;
+ }
+
+ res.dst_fattr = nfs_alloc_fattr();
+ if (!res.dst_fattr)
+ return -ENOMEM;
+
+ nfs4_bitmask_set(dst_bitmask, server->cache_consistency_bitmask,
+ dst_inode, NFS_INO_INVALID_BLOCKS);
+
+ status = nfs4_call_sync(server->client, server, msg,
+ &args.seq_args, &res.seq_res, 0);
+ trace_nfs4_clone(src_inode, dst_inode, &args, status);
+ if (status == 0) {
+ /* a zero-length count means clone to EOF in src */
+ if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE)
+ count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset;
+ nfs42_copy_dest_done(dst_inode, dst_offset, count);
+ status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
+ }
+
+ kfree(res.dst_fattr);
+ return status;
+}
+
+int nfs42_proc_clone(struct file *src_f, struct file *dst_f,
+ loff_t src_offset, loff_t dst_offset, loff_t count)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLONE],
+ };
+ struct inode *inode = file_inode(src_f);
+ struct nfs_server *server = NFS_SERVER(file_inode(src_f));
+ struct nfs_lock_context *src_lock;
+ struct nfs_lock_context *dst_lock;
+ struct nfs4_exception src_exception = { };
+ struct nfs4_exception dst_exception = { };
+ int err, err2;
+
+ if (!nfs_server_capable(inode, NFS_CAP_CLONE))
+ return -EOPNOTSUPP;
+
+ src_lock = nfs_get_lock_context(nfs_file_open_context(src_f));
+ if (IS_ERR(src_lock))
+ return PTR_ERR(src_lock);
+
+ src_exception.inode = file_inode(src_f);
+ src_exception.state = src_lock->open_context->state;
+
+ dst_lock = nfs_get_lock_context(nfs_file_open_context(dst_f));
+ if (IS_ERR(dst_lock)) {
+ err = PTR_ERR(dst_lock);
+ goto out_put_src_lock;
+ }
+
+ dst_exception.inode = file_inode(dst_f);
+ dst_exception.state = dst_lock->open_context->state;
+
+ do {
+ err = _nfs42_proc_clone(&msg, src_f, dst_f, src_lock, dst_lock,
+ src_offset, dst_offset, count);
+ if (err == -ENOTSUPP || err == -EOPNOTSUPP) {
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE;
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ err2 = nfs4_handle_exception(server, err, &src_exception);
+ err = nfs4_handle_exception(server, err, &dst_exception);
+ if (!err)
+ err = err2;
+ } while (src_exception.retry || dst_exception.retry);
+
+ nfs_put_lock_context(dst_lock);
+out_put_src_lock:
+ nfs_put_lock_context(src_lock);
+ return err;
+}
+
+#define NFS4XATTR_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE)
+
+static int _nfs42_proc_removexattr(struct inode *inode, const char *name)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs42_removexattrargs args = {
+ .fh = NFS_FH(inode),
+ .xattr_name = name,
+ };
+ struct nfs42_removexattrres res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVEXATTR],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ int ret;
+ unsigned long timestamp = jiffies;
+
+ ret = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
+ &res.seq_res, 1);
+ trace_nfs4_removexattr(inode, name, ret);
+ if (!ret)
+ nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0);
+
+ return ret;
+}
+
+static int _nfs42_proc_setxattr(struct inode *inode, const char *name,
+ const void *buf, size_t buflen, int flags)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ __u32 bitmask[NFS_BITMASK_SZ];
+ struct page *pages[NFS4XATTR_MAXPAGES];
+ struct nfs42_setxattrargs arg = {
+ .fh = NFS_FH(inode),
+ .bitmask = bitmask,
+ .xattr_pages = pages,
+ .xattr_len = buflen,
+ .xattr_name = name,
+ .xattr_flags = flags,
+ };
+ struct nfs42_setxattrres res = {
+ .server = server,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETXATTR],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ int ret, np;
+ unsigned long timestamp = jiffies;
+
+ if (buflen > server->sxasize)
+ return -ERANGE;
+
+ res.fattr = nfs_alloc_fattr();
+ if (!res.fattr)
+ return -ENOMEM;
+
+ if (buflen > 0) {
+ np = nfs4_buf_to_pages_noslab(buf, buflen, arg.xattr_pages);
+ if (np < 0) {
+ ret = np;
+ goto out;
+ }
+ } else
+ np = 0;
+
+ nfs4_bitmask_set(bitmask, server->cache_consistency_bitmask,
+ inode, NFS_INO_INVALID_CHANGE);
+
+ ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
+ &res.seq_res, 1);
+ trace_nfs4_setxattr(inode, name, ret);
+
+ for (; np > 0; np--)
+ put_page(pages[np - 1]);
+
+ if (!ret) {
+ nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0);
+ ret = nfs_post_op_update_inode(inode, res.fattr);
+ }
+
+out:
+ kfree(res.fattr);
+ return ret;
+}
+
+static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name,
+ void *buf, size_t buflen, struct page **pages,
+ size_t plen)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs42_getxattrargs arg = {
+ .fh = NFS_FH(inode),
+ .xattr_name = name,
+ };
+ struct nfs42_getxattrres res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETXATTR],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ ssize_t ret;
+
+ arg.xattr_len = plen;
+ arg.xattr_pages = pages;
+
+ ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
+ &res.seq_res, 0);
+ trace_nfs4_getxattr(inode, name, ret);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Normally, the caching is done one layer up, but for successful
+ * RPCS, always cache the result here, even if the caller was
+ * just querying the length, or if the reply was too big for
+ * the caller. This avoids a second RPC in the case of the
+ * common query-alloc-retrieve cycle for xattrs.
+ *
+ * Note that xattr_len is always capped to XATTR_SIZE_MAX.
+ */
+
+ nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len);
+
+ if (buflen) {
+ if (res.xattr_len > buflen)
+ return -ERANGE;
+ _copy_from_pages(buf, pages, 0, res.xattr_len);
+ }
+
+ return res.xattr_len;
+}
+
+static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf,
+ size_t buflen, u64 *cookiep, bool *eofp)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct page **pages;
+ struct nfs42_listxattrsargs arg = {
+ .fh = NFS_FH(inode),
+ .cookie = *cookiep,
+ };
+ struct nfs42_listxattrsres res = {
+ .eof = false,
+ .xattr_buf = buf,
+ .xattr_len = buflen,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LISTXATTRS],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ u32 xdrlen;
+ int ret, np, i;
+
+
+ ret = -ENOMEM;
+ res.scratch = alloc_page(GFP_KERNEL);
+ if (!res.scratch)
+ goto out;
+
+ xdrlen = nfs42_listxattr_xdrsize(buflen);
+ if (xdrlen > server->lxasize)
+ xdrlen = server->lxasize;
+ np = xdrlen / PAGE_SIZE + 1;
+
+ pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ goto out_free_scratch;
+ for (i = 0; i < np; i++) {
+ pages[i] = alloc_page(GFP_KERNEL);
+ if (!pages[i])
+ goto out_free_pages;
+ }
+
+ arg.xattr_pages = pages;
+ arg.count = xdrlen;
+
+ ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
+ &res.seq_res, 0);
+ trace_nfs4_listxattr(inode, ret);
+
+ if (ret >= 0) {
+ ret = res.copied;
+ *cookiep = res.cookie;
+ *eofp = res.eof;
+ }
+
+out_free_pages:
+ while (--np >= 0) {
+ if (pages[np])
+ __free_page(pages[np]);
+ }
+ kfree(pages);
+out_free_scratch:
+ __free_page(res.scratch);
+out:
+ return ret;
+
+}
+
+ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name,
+ void *buf, size_t buflen)
+{
+ struct nfs4_exception exception = { };
+ ssize_t err, np, i;
+ struct page **pages;
+
+ np = nfs_page_array_len(0, buflen ?: XATTR_SIZE_MAX);
+ pages = kmalloc_array(np, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ for (i = 0; i < np; i++) {
+ pages[i] = alloc_page(GFP_KERNEL);
+ if (!pages[i]) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /*
+ * The GETXATTR op has no length field in the call, and the
+ * xattr data is at the end of the reply.
+ *
+ * There is no downside in using the page-aligned length. It will
+ * allow receiving and caching xattrs that are too large for the
+ * caller but still fit in the page-rounded value.
+ */
+ do {
+ err = _nfs42_proc_getxattr(inode, name, buf, buflen,
+ pages, np * PAGE_SIZE);
+ if (err >= 0)
+ break;
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+
+out:
+ while (--i >= 0)
+ __free_page(pages[i]);
+ kfree(pages);
+
+ return err;
+}
+
+int nfs42_proc_setxattr(struct inode *inode, const char *name,
+ const void *buf, size_t buflen, int flags)
+{
+ struct nfs4_exception exception = { };
+ int err;
+
+ do {
+ err = _nfs42_proc_setxattr(inode, name, buf, buflen, flags);
+ if (!err)
+ break;
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+
+ return err;
+}
+
+ssize_t nfs42_proc_listxattrs(struct inode *inode, void *buf,
+ size_t buflen, u64 *cookiep, bool *eofp)
+{
+ struct nfs4_exception exception = { };
+ ssize_t err;
+
+ do {
+ err = _nfs42_proc_listxattrs(inode, buf, buflen,
+ cookiep, eofp);
+ if (err >= 0)
+ break;
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+
+ return err;
+}
+
+int nfs42_proc_removexattr(struct inode *inode, const char *name)
+{
+ struct nfs4_exception exception = { };
+ int err;
+
+ do {
+ err = _nfs42_proc_removexattr(inode, name);
+ if (!err)
+ break;
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+
+ return err;
+}
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
new file mode 100644
index 0000000000..911f634ba3
--- /dev/null
+++ b/fs/nfs/nfs42xattr.c
@@ -0,0 +1,1066 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2019, 2020 Amazon.com, Inc. or its affiliates. All rights reserved.
+ *
+ * User extended attribute client side cache functions.
+ *
+ * Author: Frank van der Linden <fllinden@amazon.com>
+ */
+#include <linux/errno.h>
+#include <linux/nfs_fs.h>
+#include <linux/hashtable.h>
+#include <linux/refcount.h>
+#include <uapi/linux/xattr.h>
+
+#include "nfs4_fs.h"
+#include "internal.h"
+
+/*
+ * User extended attributes client side caching is implemented by having
+ * a cache structure attached to NFS inodes. This structure is allocated
+ * when needed, and freed when the cache is zapped.
+ *
+ * The cache structure contains as hash table of entries, and a pointer
+ * to a special-cased entry for the listxattr cache.
+ *
+ * Accessing and allocating / freeing the caches is done via reference
+ * counting. The cache entries use a similar refcounting scheme.
+ *
+ * This makes freeing a cache, both from the shrinker and from the
+ * zap cache path, easy. It also means that, in current use cases,
+ * the large majority of inodes will not waste any memory, as they
+ * will never have any user extended attributes assigned to them.
+ *
+ * Attribute entries are hashed in to a simple hash table. They are
+ * also part of an LRU.
+ *
+ * There are three shrinkers.
+ *
+ * Two shrinkers deal with the cache entries themselves: one for
+ * large entries (> PAGE_SIZE), and one for smaller entries. The
+ * shrinker for the larger entries works more aggressively than
+ * those for the smaller entries.
+ *
+ * The other shrinker frees the cache structures themselves.
+ */
+
+/*
+ * 64 buckets is a good default. There is likely no reasonable
+ * workload that uses more than even 64 user extended attributes.
+ * You can certainly add a lot more - but you get what you ask for
+ * in those circumstances.
+ */
+#define NFS4_XATTR_HASH_SIZE 64
+
+#define NFSDBG_FACILITY NFSDBG_XATTRCACHE
+
+struct nfs4_xattr_cache;
+struct nfs4_xattr_entry;
+
+struct nfs4_xattr_bucket {
+ spinlock_t lock;
+ struct hlist_head hlist;
+ struct nfs4_xattr_cache *cache;
+ bool draining;
+};
+
+struct nfs4_xattr_cache {
+ struct kref ref;
+ struct nfs4_xattr_bucket buckets[NFS4_XATTR_HASH_SIZE];
+ struct list_head lru;
+ struct list_head dispose;
+ atomic_long_t nent;
+ spinlock_t listxattr_lock;
+ struct inode *inode;
+ struct nfs4_xattr_entry *listxattr;
+};
+
+struct nfs4_xattr_entry {
+ struct kref ref;
+ struct hlist_node hnode;
+ struct list_head lru;
+ struct list_head dispose;
+ char *xattr_name;
+ void *xattr_value;
+ size_t xattr_size;
+ struct nfs4_xattr_bucket *bucket;
+ uint32_t flags;
+};
+
+#define NFS4_XATTR_ENTRY_EXTVAL 0x0001
+
+/*
+ * LRU list of NFS inodes that have xattr caches.
+ */
+static struct list_lru nfs4_xattr_cache_lru;
+static struct list_lru nfs4_xattr_entry_lru;
+static struct list_lru nfs4_xattr_large_entry_lru;
+
+static struct kmem_cache *nfs4_xattr_cache_cachep;
+
+/*
+ * Hashing helper functions.
+ */
+static void
+nfs4_xattr_hash_init(struct nfs4_xattr_cache *cache)
+{
+ unsigned int i;
+
+ for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) {
+ INIT_HLIST_HEAD(&cache->buckets[i].hlist);
+ spin_lock_init(&cache->buckets[i].lock);
+ cache->buckets[i].cache = cache;
+ cache->buckets[i].draining = false;
+ }
+}
+
+/*
+ * Locking order:
+ * 1. inode i_lock or bucket lock
+ * 2. list_lru lock (taken by list_lru_* functions)
+ */
+
+/*
+ * Wrapper functions to add a cache entry to the right LRU.
+ */
+static bool
+nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry)
+{
+ struct list_lru *lru;
+
+ lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ?
+ &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
+
+ return list_lru_add(lru, &entry->lru);
+}
+
+static bool
+nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry)
+{
+ struct list_lru *lru;
+
+ lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ?
+ &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
+
+ return list_lru_del(lru, &entry->lru);
+}
+
+/*
+ * This function allocates cache entries. They are the normal
+ * extended attribute name/value pairs, but may also be a listxattr
+ * cache. Those allocations use the same entry so that they can be
+ * treated as one by the memory shrinker.
+ *
+ * xattr cache entries are allocated together with names. If the
+ * value fits in to one page with the entry structure and the name,
+ * it will also be part of the same allocation (kmalloc). This is
+ * expected to be the vast majority of cases. Larger allocations
+ * have a value pointer that is allocated separately by kvmalloc.
+ *
+ * Parameters:
+ *
+ * @name: Name of the extended attribute. NULL for listxattr cache
+ * entry.
+ * @value: Value of attribute, or listxattr cache. NULL if the
+ * value is to be copied from pages instead.
+ * @pages: Pages to copy the value from, if not NULL. Passed in to
+ * make it easier to copy the value after an RPC, even if
+ * the value will not be passed up to application (e.g.
+ * for a 'query' getxattr with NULL buffer).
+ * @len: Length of the value. Can be 0 for zero-length attributes.
+ * @value and @pages will be NULL if @len is 0.
+ */
+static struct nfs4_xattr_entry *
+nfs4_xattr_alloc_entry(const char *name, const void *value,
+ struct page **pages, size_t len)
+{
+ struct nfs4_xattr_entry *entry;
+ void *valp;
+ char *namep;
+ size_t alloclen, slen;
+ char *buf;
+ uint32_t flags;
+
+ BUILD_BUG_ON(sizeof(struct nfs4_xattr_entry) +
+ XATTR_NAME_MAX + 1 > PAGE_SIZE);
+
+ alloclen = sizeof(struct nfs4_xattr_entry);
+ if (name != NULL) {
+ slen = strlen(name) + 1;
+ alloclen += slen;
+ } else
+ slen = 0;
+
+ if (alloclen + len <= PAGE_SIZE) {
+ alloclen += len;
+ flags = 0;
+ } else {
+ flags = NFS4_XATTR_ENTRY_EXTVAL;
+ }
+
+ buf = kmalloc(alloclen, GFP_KERNEL);
+ if (buf == NULL)
+ return NULL;
+ entry = (struct nfs4_xattr_entry *)buf;
+
+ if (name != NULL) {
+ namep = buf + sizeof(struct nfs4_xattr_entry);
+ memcpy(namep, name, slen);
+ } else {
+ namep = NULL;
+ }
+
+
+ if (flags & NFS4_XATTR_ENTRY_EXTVAL) {
+ valp = kvmalloc(len, GFP_KERNEL);
+ if (valp == NULL) {
+ kfree(buf);
+ return NULL;
+ }
+ } else if (len != 0) {
+ valp = buf + sizeof(struct nfs4_xattr_entry) + slen;
+ } else
+ valp = NULL;
+
+ if (valp != NULL) {
+ if (value != NULL)
+ memcpy(valp, value, len);
+ else
+ _copy_from_pages(valp, pages, 0, len);
+ }
+
+ entry->flags = flags;
+ entry->xattr_value = valp;
+ kref_init(&entry->ref);
+ entry->xattr_name = namep;
+ entry->xattr_size = len;
+ entry->bucket = NULL;
+ INIT_LIST_HEAD(&entry->lru);
+ INIT_LIST_HEAD(&entry->dispose);
+ INIT_HLIST_NODE(&entry->hnode);
+
+ return entry;
+}
+
+static void
+nfs4_xattr_free_entry(struct nfs4_xattr_entry *entry)
+{
+ if (entry->flags & NFS4_XATTR_ENTRY_EXTVAL)
+ kvfree(entry->xattr_value);
+ kfree(entry);
+}
+
+static void
+nfs4_xattr_free_entry_cb(struct kref *kref)
+{
+ struct nfs4_xattr_entry *entry;
+
+ entry = container_of(kref, struct nfs4_xattr_entry, ref);
+
+ if (WARN_ON(!list_empty(&entry->lru)))
+ return;
+
+ nfs4_xattr_free_entry(entry);
+}
+
+static void
+nfs4_xattr_free_cache_cb(struct kref *kref)
+{
+ struct nfs4_xattr_cache *cache;
+ int i;
+
+ cache = container_of(kref, struct nfs4_xattr_cache, ref);
+
+ for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) {
+ if (WARN_ON(!hlist_empty(&cache->buckets[i].hlist)))
+ return;
+ cache->buckets[i].draining = false;
+ }
+
+ cache->listxattr = NULL;
+
+ kmem_cache_free(nfs4_xattr_cache_cachep, cache);
+
+}
+
+static struct nfs4_xattr_cache *
+nfs4_xattr_alloc_cache(void)
+{
+ struct nfs4_xattr_cache *cache;
+
+ cache = kmem_cache_alloc(nfs4_xattr_cache_cachep, GFP_KERNEL);
+ if (cache == NULL)
+ return NULL;
+
+ kref_init(&cache->ref);
+ atomic_long_set(&cache->nent, 0);
+
+ return cache;
+}
+
+/*
+ * Set the listxattr cache, which is a special-cased cache entry.
+ * The special value ERR_PTR(-ESTALE) is used to indicate that
+ * the cache is being drained - this prevents a new listxattr
+ * cache from being added to what is now a stale cache.
+ */
+static int
+nfs4_xattr_set_listcache(struct nfs4_xattr_cache *cache,
+ struct nfs4_xattr_entry *new)
+{
+ struct nfs4_xattr_entry *old;
+ int ret = 1;
+
+ spin_lock(&cache->listxattr_lock);
+
+ old = cache->listxattr;
+
+ if (old == ERR_PTR(-ESTALE)) {
+ ret = 0;
+ goto out;
+ }
+
+ cache->listxattr = new;
+ if (new != NULL && new != ERR_PTR(-ESTALE))
+ nfs4_xattr_entry_lru_add(new);
+
+ if (old != NULL) {
+ nfs4_xattr_entry_lru_del(old);
+ kref_put(&old->ref, nfs4_xattr_free_entry_cb);
+ }
+out:
+ spin_unlock(&cache->listxattr_lock);
+
+ return ret;
+}
+
+/*
+ * Unlink a cache from its parent inode, clearing out an invalid
+ * cache. Must be called with i_lock held.
+ */
+static struct nfs4_xattr_cache *
+nfs4_xattr_cache_unlink(struct inode *inode)
+{
+ struct nfs_inode *nfsi;
+ struct nfs4_xattr_cache *oldcache;
+
+ nfsi = NFS_I(inode);
+
+ oldcache = nfsi->xattr_cache;
+ if (oldcache != NULL) {
+ list_lru_del(&nfs4_xattr_cache_lru, &oldcache->lru);
+ oldcache->inode = NULL;
+ }
+ nfsi->xattr_cache = NULL;
+ nfsi->cache_validity &= ~NFS_INO_INVALID_XATTR;
+
+ return oldcache;
+
+}
+
+/*
+ * Discard a cache. Called by get_cache() if there was an old,
+ * invalid cache. Can also be called from a shrinker callback.
+ *
+ * The cache is dead, it has already been unlinked from its inode,
+ * and no longer appears on the cache LRU list.
+ *
+ * Mark all buckets as draining, so that no new entries are added. This
+ * could still happen in the unlikely, but possible case that another
+ * thread had grabbed a reference before it was unlinked from the inode,
+ * and is still holding it for an add operation.
+ *
+ * Remove all entries from the LRU lists, so that there is no longer
+ * any way to 'find' this cache. Then, remove the entries from the hash
+ * table.
+ *
+ * At that point, the cache will remain empty and can be freed when the final
+ * reference drops, which is very likely the kref_put at the end of
+ * this function, or the one called immediately afterwards in the
+ * shrinker callback.
+ */
+static void
+nfs4_xattr_discard_cache(struct nfs4_xattr_cache *cache)
+{
+ unsigned int i;
+ struct nfs4_xattr_entry *entry;
+ struct nfs4_xattr_bucket *bucket;
+ struct hlist_node *n;
+
+ nfs4_xattr_set_listcache(cache, ERR_PTR(-ESTALE));
+
+ for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) {
+ bucket = &cache->buckets[i];
+
+ spin_lock(&bucket->lock);
+ bucket->draining = true;
+ hlist_for_each_entry_safe(entry, n, &bucket->hlist, hnode) {
+ nfs4_xattr_entry_lru_del(entry);
+ hlist_del_init(&entry->hnode);
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+ }
+ spin_unlock(&bucket->lock);
+ }
+
+ atomic_long_set(&cache->nent, 0);
+
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+/*
+ * Get a referenced copy of the cache structure. Avoid doing allocs
+ * while holding i_lock. Which means that we do some optimistic allocation,
+ * and might have to free the result in rare cases.
+ *
+ * This function only checks the NFS_INO_INVALID_XATTR cache validity bit
+ * and acts accordingly, replacing the cache when needed. For the read case
+ * (!add), this means that the caller must make sure that the cache
+ * is valid before caling this function. getxattr and listxattr call
+ * revalidate_inode to do this. The attribute cache timeout (for the
+ * non-delegated case) is expected to be dealt with in the revalidate
+ * call.
+ */
+
+static struct nfs4_xattr_cache *
+nfs4_xattr_get_cache(struct inode *inode, int add)
+{
+ struct nfs_inode *nfsi;
+ struct nfs4_xattr_cache *cache, *oldcache, *newcache;
+
+ nfsi = NFS_I(inode);
+
+ cache = oldcache = NULL;
+
+ spin_lock(&inode->i_lock);
+
+ if (nfsi->cache_validity & NFS_INO_INVALID_XATTR)
+ oldcache = nfs4_xattr_cache_unlink(inode);
+ else
+ cache = nfsi->xattr_cache;
+
+ if (cache != NULL)
+ kref_get(&cache->ref);
+
+ spin_unlock(&inode->i_lock);
+
+ if (add && cache == NULL) {
+ newcache = NULL;
+
+ cache = nfs4_xattr_alloc_cache();
+ if (cache == NULL)
+ goto out;
+
+ spin_lock(&inode->i_lock);
+ if (nfsi->cache_validity & NFS_INO_INVALID_XATTR) {
+ /*
+ * The cache was invalidated again. Give up,
+ * since what we want to enter is now likely
+ * outdated anyway.
+ */
+ spin_unlock(&inode->i_lock);
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+ cache = NULL;
+ goto out;
+ }
+
+ /*
+ * Check if someone beat us to it.
+ */
+ if (nfsi->xattr_cache != NULL) {
+ newcache = nfsi->xattr_cache;
+ kref_get(&newcache->ref);
+ } else {
+ kref_get(&cache->ref);
+ nfsi->xattr_cache = cache;
+ cache->inode = inode;
+ list_lru_add(&nfs4_xattr_cache_lru, &cache->lru);
+ }
+
+ spin_unlock(&inode->i_lock);
+
+ /*
+ * If there was a race, throw away the cache we just
+ * allocated, and use the new one allocated by someone
+ * else.
+ */
+ if (newcache != NULL) {
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+ cache = newcache;
+ }
+ }
+
+out:
+ /*
+ * Discard the now orphaned old cache.
+ */
+ if (oldcache != NULL)
+ nfs4_xattr_discard_cache(oldcache);
+
+ return cache;
+}
+
+static inline struct nfs4_xattr_bucket *
+nfs4_xattr_hash_bucket(struct nfs4_xattr_cache *cache, const char *name)
+{
+ return &cache->buckets[jhash(name, strlen(name), 0) &
+ (ARRAY_SIZE(cache->buckets) - 1)];
+}
+
+static struct nfs4_xattr_entry *
+nfs4_xattr_get_entry(struct nfs4_xattr_bucket *bucket, const char *name)
+{
+ struct nfs4_xattr_entry *entry;
+
+ entry = NULL;
+
+ hlist_for_each_entry(entry, &bucket->hlist, hnode) {
+ if (!strcmp(entry->xattr_name, name))
+ break;
+ }
+
+ return entry;
+}
+
+static int
+nfs4_xattr_hash_add(struct nfs4_xattr_cache *cache,
+ struct nfs4_xattr_entry *entry)
+{
+ struct nfs4_xattr_bucket *bucket;
+ struct nfs4_xattr_entry *oldentry = NULL;
+ int ret = 1;
+
+ bucket = nfs4_xattr_hash_bucket(cache, entry->xattr_name);
+ entry->bucket = bucket;
+
+ spin_lock(&bucket->lock);
+
+ if (bucket->draining) {
+ ret = 0;
+ goto out;
+ }
+
+ oldentry = nfs4_xattr_get_entry(bucket, entry->xattr_name);
+ if (oldentry != NULL) {
+ hlist_del_init(&oldentry->hnode);
+ nfs4_xattr_entry_lru_del(oldentry);
+ } else {
+ atomic_long_inc(&cache->nent);
+ }
+
+ hlist_add_head(&entry->hnode, &bucket->hlist);
+ nfs4_xattr_entry_lru_add(entry);
+
+out:
+ spin_unlock(&bucket->lock);
+
+ if (oldentry != NULL)
+ kref_put(&oldentry->ref, nfs4_xattr_free_entry_cb);
+
+ return ret;
+}
+
+static void
+nfs4_xattr_hash_remove(struct nfs4_xattr_cache *cache, const char *name)
+{
+ struct nfs4_xattr_bucket *bucket;
+ struct nfs4_xattr_entry *entry;
+
+ bucket = nfs4_xattr_hash_bucket(cache, name);
+
+ spin_lock(&bucket->lock);
+
+ entry = nfs4_xattr_get_entry(bucket, name);
+ if (entry != NULL) {
+ hlist_del_init(&entry->hnode);
+ nfs4_xattr_entry_lru_del(entry);
+ atomic_long_dec(&cache->nent);
+ }
+
+ spin_unlock(&bucket->lock);
+
+ if (entry != NULL)
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+}
+
+static struct nfs4_xattr_entry *
+nfs4_xattr_hash_find(struct nfs4_xattr_cache *cache, const char *name)
+{
+ struct nfs4_xattr_bucket *bucket;
+ struct nfs4_xattr_entry *entry;
+
+ bucket = nfs4_xattr_hash_bucket(cache, name);
+
+ spin_lock(&bucket->lock);
+
+ entry = nfs4_xattr_get_entry(bucket, name);
+ if (entry != NULL)
+ kref_get(&entry->ref);
+
+ spin_unlock(&bucket->lock);
+
+ return entry;
+}
+
+/*
+ * Entry point to retrieve an entry from the cache.
+ */
+ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name, char *buf,
+ ssize_t buflen)
+{
+ struct nfs4_xattr_cache *cache;
+ struct nfs4_xattr_entry *entry;
+ ssize_t ret;
+
+ cache = nfs4_xattr_get_cache(inode, 0);
+ if (cache == NULL)
+ return -ENOENT;
+
+ ret = 0;
+ entry = nfs4_xattr_hash_find(cache, name);
+
+ if (entry != NULL) {
+ dprintk("%s: cache hit '%s', len %lu\n", __func__,
+ entry->xattr_name, (unsigned long)entry->xattr_size);
+ if (buflen == 0) {
+ /* Length probe only */
+ ret = entry->xattr_size;
+ } else if (buflen < entry->xattr_size)
+ ret = -ERANGE;
+ else {
+ memcpy(buf, entry->xattr_value, entry->xattr_size);
+ ret = entry->xattr_size;
+ }
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+ } else {
+ dprintk("%s: cache miss '%s'\n", __func__, name);
+ ret = -ENOENT;
+ }
+
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+
+ return ret;
+}
+
+/*
+ * Retrieve a cached list of xattrs from the cache.
+ */
+ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf, ssize_t buflen)
+{
+ struct nfs4_xattr_cache *cache;
+ struct nfs4_xattr_entry *entry;
+ ssize_t ret;
+
+ cache = nfs4_xattr_get_cache(inode, 0);
+ if (cache == NULL)
+ return -ENOENT;
+
+ spin_lock(&cache->listxattr_lock);
+
+ entry = cache->listxattr;
+
+ if (entry != NULL && entry != ERR_PTR(-ESTALE)) {
+ if (buflen == 0) {
+ /* Length probe only */
+ ret = entry->xattr_size;
+ } else if (entry->xattr_size > buflen)
+ ret = -ERANGE;
+ else {
+ memcpy(buf, entry->xattr_value, entry->xattr_size);
+ ret = entry->xattr_size;
+ }
+ } else {
+ ret = -ENOENT;
+ }
+
+ spin_unlock(&cache->listxattr_lock);
+
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+
+ return ret;
+}
+
+/*
+ * Add an xattr to the cache.
+ *
+ * This also invalidates the xattr list cache.
+ */
+void nfs4_xattr_cache_add(struct inode *inode, const char *name,
+ const char *buf, struct page **pages, ssize_t buflen)
+{
+ struct nfs4_xattr_cache *cache;
+ struct nfs4_xattr_entry *entry;
+
+ dprintk("%s: add '%s' len %lu\n", __func__,
+ name, (unsigned long)buflen);
+
+ cache = nfs4_xattr_get_cache(inode, 1);
+ if (cache == NULL)
+ return;
+
+ entry = nfs4_xattr_alloc_entry(name, buf, pages, buflen);
+ if (entry == NULL)
+ goto out;
+
+ (void)nfs4_xattr_set_listcache(cache, NULL);
+
+ if (!nfs4_xattr_hash_add(cache, entry))
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+
+out:
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+
+/*
+ * Remove an xattr from the cache.
+ *
+ * This also invalidates the xattr list cache.
+ */
+void nfs4_xattr_cache_remove(struct inode *inode, const char *name)
+{
+ struct nfs4_xattr_cache *cache;
+
+ dprintk("%s: remove '%s'\n", __func__, name);
+
+ cache = nfs4_xattr_get_cache(inode, 0);
+ if (cache == NULL)
+ return;
+
+ (void)nfs4_xattr_set_listcache(cache, NULL);
+ nfs4_xattr_hash_remove(cache, name);
+
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+/*
+ * Cache listxattr output, replacing any possible old one.
+ */
+void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf,
+ ssize_t buflen)
+{
+ struct nfs4_xattr_cache *cache;
+ struct nfs4_xattr_entry *entry;
+
+ cache = nfs4_xattr_get_cache(inode, 1);
+ if (cache == NULL)
+ return;
+
+ entry = nfs4_xattr_alloc_entry(NULL, buf, NULL, buflen);
+ if (entry == NULL)
+ goto out;
+
+ /*
+ * This is just there to be able to get to bucket->cache,
+ * which is obviously the same for all buckets, so just
+ * use bucket 0.
+ */
+ entry->bucket = &cache->buckets[0];
+
+ if (!nfs4_xattr_set_listcache(cache, entry))
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+
+out:
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+/*
+ * Zap the entire cache. Called when an inode is evicted.
+ */
+void nfs4_xattr_cache_zap(struct inode *inode)
+{
+ struct nfs4_xattr_cache *oldcache;
+
+ spin_lock(&inode->i_lock);
+ oldcache = nfs4_xattr_cache_unlink(inode);
+ spin_unlock(&inode->i_lock);
+
+ if (oldcache)
+ nfs4_xattr_discard_cache(oldcache);
+}
+
+/*
+ * The entry LRU is shrunk more aggressively than the cache LRU,
+ * by settings @seeks to 1.
+ *
+ * Cache structures are freed only when they've become empty, after
+ * pruning all but one entry.
+ */
+
+static unsigned long nfs4_xattr_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc);
+static unsigned long nfs4_xattr_entry_count(struct shrinker *shrink,
+ struct shrink_control *sc);
+static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc);
+static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink,
+ struct shrink_control *sc);
+
+static struct shrinker nfs4_xattr_cache_shrinker = {
+ .count_objects = nfs4_xattr_cache_count,
+ .scan_objects = nfs4_xattr_cache_scan,
+ .seeks = DEFAULT_SEEKS,
+ .flags = SHRINKER_MEMCG_AWARE,
+};
+
+static struct shrinker nfs4_xattr_entry_shrinker = {
+ .count_objects = nfs4_xattr_entry_count,
+ .scan_objects = nfs4_xattr_entry_scan,
+ .seeks = DEFAULT_SEEKS,
+ .batch = 512,
+ .flags = SHRINKER_MEMCG_AWARE,
+};
+
+static struct shrinker nfs4_xattr_large_entry_shrinker = {
+ .count_objects = nfs4_xattr_entry_count,
+ .scan_objects = nfs4_xattr_entry_scan,
+ .seeks = 1,
+ .batch = 512,
+ .flags = SHRINKER_MEMCG_AWARE,
+};
+
+static enum lru_status
+cache_lru_isolate(struct list_head *item,
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+ struct list_head *dispose = arg;
+ struct inode *inode;
+ struct nfs4_xattr_cache *cache = container_of(item,
+ struct nfs4_xattr_cache, lru);
+
+ if (atomic_long_read(&cache->nent) > 1)
+ return LRU_SKIP;
+
+ /*
+ * If a cache structure is on the LRU list, we know that
+ * its inode is valid. Try to lock it to break the link.
+ * Since we're inverting the lock order here, only try.
+ */
+ inode = cache->inode;
+
+ if (!spin_trylock(&inode->i_lock))
+ return LRU_SKIP;
+
+ kref_get(&cache->ref);
+
+ cache->inode = NULL;
+ NFS_I(inode)->xattr_cache = NULL;
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_XATTR;
+ list_lru_isolate(lru, &cache->lru);
+
+ spin_unlock(&inode->i_lock);
+
+ list_add_tail(&cache->dispose, dispose);
+ return LRU_REMOVED;
+}
+
+static unsigned long
+nfs4_xattr_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ LIST_HEAD(dispose);
+ unsigned long freed;
+ struct nfs4_xattr_cache *cache;
+
+ freed = list_lru_shrink_walk(&nfs4_xattr_cache_lru, sc,
+ cache_lru_isolate, &dispose);
+ while (!list_empty(&dispose)) {
+ cache = list_first_entry(&dispose, struct nfs4_xattr_cache,
+ dispose);
+ list_del_init(&cache->dispose);
+ nfs4_xattr_discard_cache(cache);
+ kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+ }
+
+ return freed;
+}
+
+
+static unsigned long
+nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ unsigned long count;
+
+ count = list_lru_shrink_count(&nfs4_xattr_cache_lru, sc);
+ return vfs_pressure_ratio(count);
+}
+
+static enum lru_status
+entry_lru_isolate(struct list_head *item,
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+ struct list_head *dispose = arg;
+ struct nfs4_xattr_bucket *bucket;
+ struct nfs4_xattr_cache *cache;
+ struct nfs4_xattr_entry *entry = container_of(item,
+ struct nfs4_xattr_entry, lru);
+
+ bucket = entry->bucket;
+ cache = bucket->cache;
+
+ /*
+ * Unhook the entry from its parent (either a cache bucket
+ * or a cache structure if it's a listxattr buf), so that
+ * it's no longer found. Then add it to the isolate list,
+ * to be freed later.
+ *
+ * In both cases, we're reverting lock order, so use
+ * trylock and skip the entry if we can't get the lock.
+ */
+ if (entry->xattr_name != NULL) {
+ /* Regular cache entry */
+ if (!spin_trylock(&bucket->lock))
+ return LRU_SKIP;
+
+ kref_get(&entry->ref);
+
+ hlist_del_init(&entry->hnode);
+ atomic_long_dec(&cache->nent);
+ list_lru_isolate(lru, &entry->lru);
+
+ spin_unlock(&bucket->lock);
+ } else {
+ /* Listxattr cache entry */
+ if (!spin_trylock(&cache->listxattr_lock))
+ return LRU_SKIP;
+
+ kref_get(&entry->ref);
+
+ cache->listxattr = NULL;
+ list_lru_isolate(lru, &entry->lru);
+
+ spin_unlock(&cache->listxattr_lock);
+ }
+
+ list_add_tail(&entry->dispose, dispose);
+ return LRU_REMOVED;
+}
+
+static unsigned long
+nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ LIST_HEAD(dispose);
+ unsigned long freed;
+ struct nfs4_xattr_entry *entry;
+ struct list_lru *lru;
+
+ lru = (shrink == &nfs4_xattr_large_entry_shrinker) ?
+ &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
+
+ freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose);
+
+ while (!list_empty(&dispose)) {
+ entry = list_first_entry(&dispose, struct nfs4_xattr_entry,
+ dispose);
+ list_del_init(&entry->dispose);
+
+ /*
+ * Drop two references: the one that we just grabbed
+ * in entry_lru_isolate, and the one that was set
+ * when the entry was first allocated.
+ */
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+ kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+ }
+
+ return freed;
+}
+
+static unsigned long
+nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ unsigned long count;
+ struct list_lru *lru;
+
+ lru = (shrink == &nfs4_xattr_large_entry_shrinker) ?
+ &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
+
+ count = list_lru_shrink_count(lru, sc);
+ return vfs_pressure_ratio(count);
+}
+
+
+static void nfs4_xattr_cache_init_once(void *p)
+{
+ struct nfs4_xattr_cache *cache = p;
+
+ spin_lock_init(&cache->listxattr_lock);
+ atomic_long_set(&cache->nent, 0);
+ nfs4_xattr_hash_init(cache);
+ cache->listxattr = NULL;
+ INIT_LIST_HEAD(&cache->lru);
+ INIT_LIST_HEAD(&cache->dispose);
+}
+
+static int nfs4_xattr_shrinker_init(struct shrinker *shrinker,
+ struct list_lru *lru, const char *name)
+{
+ int ret = 0;
+
+ ret = register_shrinker(shrinker, name);
+ if (ret)
+ return ret;
+
+ ret = list_lru_init_memcg(lru, shrinker);
+ if (ret)
+ unregister_shrinker(shrinker);
+
+ return ret;
+}
+
+static void nfs4_xattr_shrinker_destroy(struct shrinker *shrinker,
+ struct list_lru *lru)
+{
+ unregister_shrinker(shrinker);
+ list_lru_destroy(lru);
+}
+
+int __init nfs4_xattr_cache_init(void)
+{
+ int ret = 0;
+
+ nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache",
+ sizeof(struct nfs4_xattr_cache), 0,
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+ nfs4_xattr_cache_init_once);
+ if (nfs4_xattr_cache_cachep == NULL)
+ return -ENOMEM;
+
+ ret = nfs4_xattr_shrinker_init(&nfs4_xattr_cache_shrinker,
+ &nfs4_xattr_cache_lru,
+ "nfs-xattr_cache");
+ if (ret)
+ goto out1;
+
+ ret = nfs4_xattr_shrinker_init(&nfs4_xattr_entry_shrinker,
+ &nfs4_xattr_entry_lru,
+ "nfs-xattr_entry");
+ if (ret)
+ goto out2;
+
+ ret = nfs4_xattr_shrinker_init(&nfs4_xattr_large_entry_shrinker,
+ &nfs4_xattr_large_entry_lru,
+ "nfs-xattr_large_entry");
+ if (!ret)
+ return 0;
+
+ nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker,
+ &nfs4_xattr_entry_lru);
+out2:
+ nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker,
+ &nfs4_xattr_cache_lru);
+out1:
+ kmem_cache_destroy(nfs4_xattr_cache_cachep);
+
+ return ret;
+}
+
+void nfs4_xattr_cache_exit(void)
+{
+ nfs4_xattr_shrinker_destroy(&nfs4_xattr_large_entry_shrinker,
+ &nfs4_xattr_large_entry_lru);
+ nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker,
+ &nfs4_xattr_entry_lru);
+ nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker,
+ &nfs4_xattr_cache_lru);
+ kmem_cache_destroy(nfs4_xattr_cache_cachep);
+}
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
new file mode 100644
index 0000000000..9e3ae53e22
--- /dev/null
+++ b/fs/nfs/nfs42xdr.c
@@ -0,0 +1,1672 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
+ */
+#ifndef __LINUX_FS_NFS_NFS4_2XDR_H
+#define __LINUX_FS_NFS_NFS4_2XDR_H
+
+#include "nfs42.h"
+
+/* Not limited by NFS itself, limited by the generic xattr code */
+#define nfs4_xattr_name_maxsz XDR_QUADLEN(XATTR_NAME_MAX)
+
+#define encode_fallocate_maxsz (encode_stateid_maxsz + \
+ 2 /* offset */ + \
+ 2 /* length */)
+#define NFS42_WRITE_RES_SIZE (1 /* wr_callback_id size */ +\
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ 2 /* wr_count */ + \
+ 1 /* wr_committed */ + \
+ XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define encode_allocate_maxsz (op_encode_hdr_maxsz + \
+ encode_fallocate_maxsz)
+#define decode_allocate_maxsz (op_decode_hdr_maxsz)
+#define encode_copy_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ 2 + 2 + 2 + 1 + 1 + 1 +\
+ 1 + /* One cnr_source_server */\
+ 1 + /* nl4_type */ \
+ 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT))
+#define decode_copy_maxsz (op_decode_hdr_maxsz + \
+ NFS42_WRITE_RES_SIZE + \
+ 1 /* cr_consecutive */ + \
+ 1 /* cr_synchronous */)
+#define encode_offload_cancel_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define decode_offload_cancel_maxsz (op_decode_hdr_maxsz)
+#define encode_copy_notify_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ 1 + /* nl4_type */ \
+ 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT))
+#define decode_copy_notify_maxsz (op_decode_hdr_maxsz + \
+ 3 + /* cnr_lease_time */\
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ 1 + /* Support 1 cnr_source_server */\
+ 1 + /* nl4_type */ \
+ 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT))
+#define encode_deallocate_maxsz (op_encode_hdr_maxsz + \
+ encode_fallocate_maxsz)
+#define decode_deallocate_maxsz (op_decode_hdr_maxsz)
+#define encode_read_plus_maxsz (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + 3)
+#define NFS42_READ_PLUS_DATA_SEGMENT_SIZE \
+ (1 /* data_content4 */ + \
+ 2 /* data_info4.di_offset */ + \
+ 1 /* data_info4.di_length */)
+#define NFS42_READ_PLUS_HOLE_SEGMENT_SIZE \
+ (1 /* data_content4 */ + \
+ 2 /* data_info4.di_offset */ + \
+ 2 /* data_info4.di_length */)
+#define READ_PLUS_SEGMENT_SIZE_DIFF (NFS42_READ_PLUS_HOLE_SEGMENT_SIZE - \
+ NFS42_READ_PLUS_DATA_SEGMENT_SIZE)
+#define decode_read_plus_maxsz (op_decode_hdr_maxsz + \
+ 1 /* rpr_eof */ + \
+ 1 /* rpr_contents count */ + \
+ NFS42_READ_PLUS_HOLE_SEGMENT_SIZE)
+#define encode_seek_maxsz (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + \
+ 2 /* offset */ + \
+ 1 /* whence */)
+#define decode_seek_maxsz (op_decode_hdr_maxsz + \
+ 1 /* eof */ + \
+ 1 /* whence */ + \
+ 2 /* offset */ + \
+ 2 /* length */)
+#define encode_io_info_maxsz 4
+#define encode_layoutstats_maxsz (op_decode_hdr_maxsz + \
+ 2 /* offset */ + \
+ 2 /* length */ + \
+ encode_stateid_maxsz + \
+ encode_io_info_maxsz + \
+ encode_io_info_maxsz + \
+ 1 /* opaque devaddr4 length */ + \
+ XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
+#define decode_layoutstats_maxsz (op_decode_hdr_maxsz)
+#define encode_device_error_maxsz (XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
+ 1 /* status */ + 1 /* opnum */)
+#define encode_layouterror_maxsz (op_decode_hdr_maxsz + \
+ 2 /* offset */ + \
+ 2 /* length */ + \
+ encode_stateid_maxsz + \
+ 1 /* Array size */ + \
+ encode_device_error_maxsz)
+#define decode_layouterror_maxsz (op_decode_hdr_maxsz)
+#define encode_clone_maxsz (encode_stateid_maxsz + \
+ encode_stateid_maxsz + \
+ 2 /* src offset */ + \
+ 2 /* dst offset */ + \
+ 2 /* count */)
+#define decode_clone_maxsz (op_decode_hdr_maxsz)
+#define encode_getxattr_maxsz (op_encode_hdr_maxsz + 1 + \
+ nfs4_xattr_name_maxsz)
+#define decode_getxattr_maxsz (op_decode_hdr_maxsz + 1 + pagepad_maxsz)
+#define encode_setxattr_maxsz (op_encode_hdr_maxsz + \
+ 1 + nfs4_xattr_name_maxsz + 1)
+#define decode_setxattr_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz)
+#define encode_listxattrs_maxsz (op_encode_hdr_maxsz + 2 + 1)
+#define decode_listxattrs_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1 + 1)
+#define encode_removexattr_maxsz (op_encode_hdr_maxsz + 1 + \
+ nfs4_xattr_name_maxsz)
+#define decode_removexattr_maxsz (op_decode_hdr_maxsz + \
+ decode_change_info_maxsz)
+
+#define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_allocate_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_allocate_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_copy_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_savefh_maxsz + \
+ encode_putfh_maxsz + \
+ encode_copy_maxsz + \
+ encode_commit_maxsz)
+#define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_savefh_maxsz + \
+ decode_putfh_maxsz + \
+ decode_copy_maxsz + \
+ decode_commit_maxsz)
+#define NFS4_enc_offload_cancel_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_offload_cancel_maxsz)
+#define NFS4_dec_offload_cancel_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_offload_cancel_maxsz)
+#define NFS4_enc_copy_notify_sz (compound_encode_hdr_maxsz + \
+ encode_putfh_maxsz + \
+ encode_copy_notify_maxsz)
+#define NFS4_dec_copy_notify_sz (compound_decode_hdr_maxsz + \
+ decode_putfh_maxsz + \
+ decode_copy_notify_maxsz)
+#define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_deallocate_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_deallocate_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_deallocate_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_read_plus_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_read_plus_maxsz)
+#define NFS4_dec_read_plus_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_read_plus_maxsz)
+#define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_seek_maxsz)
+#define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_seek_maxsz)
+#define NFS4_enc_layoutstats_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ PNFS_LAYOUTSTATS_MAXDEV * encode_layoutstats_maxsz)
+#define NFS4_dec_layoutstats_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
+#define NFS4_enc_layouterror_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ NFS42_LAYOUTERROR_MAX * \
+ encode_layouterror_maxsz)
+#define NFS4_dec_layouterror_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ NFS42_LAYOUTERROR_MAX * \
+ decode_layouterror_maxsz)
+#define NFS4_enc_clone_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_savefh_maxsz + \
+ encode_putfh_maxsz + \
+ encode_clone_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_clone_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_savefh_maxsz + \
+ decode_putfh_maxsz + \
+ decode_clone_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_getxattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_getxattr_maxsz)
+#define NFS4_dec_getxattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_getxattr_maxsz)
+#define NFS4_enc_setxattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_setxattr_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_setxattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_setxattr_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_listxattrs_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_listxattrs_maxsz)
+#define NFS4_dec_listxattrs_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_listxattrs_maxsz)
+#define NFS4_enc_removexattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_removexattr_maxsz)
+#define NFS4_dec_removexattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_removexattr_maxsz)
+
+/*
+ * These values specify the maximum amount of data that is not
+ * associated with the extended attribute name or extended
+ * attribute list in the SETXATTR, GETXATTR and LISTXATTR
+ * respectively.
+ */
+const u32 nfs42_maxsetxattr_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_encode_hdr_maxsz +
+ encode_sequence_maxsz +
+ encode_putfh_maxsz + 1 +
+ nfs4_xattr_name_maxsz)
+ * XDR_UNIT);
+
+const u32 nfs42_maxgetxattr_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_decode_hdr_maxsz +
+ decode_sequence_maxsz +
+ decode_putfh_maxsz + 1) * XDR_UNIT);
+
+const u32 nfs42_maxlistxattrs_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_decode_hdr_maxsz +
+ decode_sequence_maxsz +
+ decode_putfh_maxsz + 3) * XDR_UNIT);
+
+static void encode_fallocate(struct xdr_stream *xdr,
+ const struct nfs42_falloc_args *args)
+{
+ encode_nfs4_stateid(xdr, &args->falloc_stateid);
+ encode_uint64(xdr, args->falloc_offset);
+ encode_uint64(xdr, args->falloc_length);
+}
+
+static void encode_allocate(struct xdr_stream *xdr,
+ const struct nfs42_falloc_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_ALLOCATE, decode_allocate_maxsz, hdr);
+ encode_fallocate(xdr, args);
+}
+
+static void encode_nl4_server(struct xdr_stream *xdr,
+ const struct nl4_server *ns)
+{
+ encode_uint32(xdr, ns->nl4_type);
+ switch (ns->nl4_type) {
+ case NL4_NAME:
+ case NL4_URL:
+ encode_string(xdr, ns->u.nl4_str_sz, ns->u.nl4_str);
+ break;
+ case NL4_NETADDR:
+ encode_string(xdr, ns->u.nl4_addr.netid_len,
+ ns->u.nl4_addr.netid);
+ encode_string(xdr, ns->u.nl4_addr.addr_len,
+ ns->u.nl4_addr.addr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+}
+
+static void encode_copy(struct xdr_stream *xdr,
+ const struct nfs42_copy_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_COPY, decode_copy_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->src_stateid);
+ encode_nfs4_stateid(xdr, &args->dst_stateid);
+
+ encode_uint64(xdr, args->src_pos);
+ encode_uint64(xdr, args->dst_pos);
+ encode_uint64(xdr, args->count);
+
+ encode_uint32(xdr, 1); /* consecutive = true */
+ encode_uint32(xdr, args->sync);
+ if (args->cp_src == NULL) { /* intra-ssc */
+ encode_uint32(xdr, 0); /* no src server list */
+ return;
+ }
+ encode_uint32(xdr, 1); /* supporting 1 server */
+ encode_nl4_server(xdr, args->cp_src);
+}
+
+static void encode_copy_commit(struct xdr_stream *xdr,
+ const struct nfs42_copy_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr);
+ p = reserve_space(xdr, 12);
+ p = xdr_encode_hyper(p, args->dst_pos);
+ *p = cpu_to_be32(args->count);
+}
+
+static void encode_offload_cancel(struct xdr_stream *xdr,
+ const struct nfs42_offload_status_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_OFFLOAD_CANCEL, decode_offload_cancel_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->osa_stateid);
+}
+
+static void encode_copy_notify(struct xdr_stream *xdr,
+ const struct nfs42_copy_notify_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_COPY_NOTIFY, decode_copy_notify_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->cna_src_stateid);
+ encode_nl4_server(xdr, &args->cna_dst);
+}
+
+static void encode_deallocate(struct xdr_stream *xdr,
+ const struct nfs42_falloc_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_DEALLOCATE, decode_deallocate_maxsz, hdr);
+ encode_fallocate(xdr, args);
+}
+
+static void encode_read_plus(struct xdr_stream *xdr,
+ const struct nfs_pgio_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_READ_PLUS, decode_read_plus_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->stateid);
+ encode_uint64(xdr, args->offset);
+ encode_uint32(xdr, args->count);
+}
+
+static void encode_seek(struct xdr_stream *xdr,
+ const struct nfs42_seek_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_SEEK, decode_seek_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->sa_stateid);
+ encode_uint64(xdr, args->sa_offset);
+ encode_uint32(xdr, args->sa_what);
+}
+
+static void encode_layoutstats(struct xdr_stream *xdr,
+ const struct nfs42_layoutstat_args *args,
+ struct nfs42_layoutstat_devinfo *devinfo,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_LAYOUTSTATS, decode_layoutstats_maxsz, hdr);
+ p = reserve_space(xdr, 8 + 8);
+ p = xdr_encode_hyper(p, devinfo->offset);
+ p = xdr_encode_hyper(p, devinfo->length);
+ encode_nfs4_stateid(xdr, &args->stateid);
+ p = reserve_space(xdr, 4*8 + NFS4_DEVICEID4_SIZE + 4);
+ p = xdr_encode_hyper(p, devinfo->read_count);
+ p = xdr_encode_hyper(p, devinfo->read_bytes);
+ p = xdr_encode_hyper(p, devinfo->write_count);
+ p = xdr_encode_hyper(p, devinfo->write_bytes);
+ p = xdr_encode_opaque_fixed(p, devinfo->dev_id.data,
+ NFS4_DEVICEID4_SIZE);
+ /* Encode layoutupdate4 */
+ *p++ = cpu_to_be32(devinfo->layout_type);
+ if (devinfo->ld_private.ops)
+ devinfo->ld_private.ops->encode(xdr, args,
+ &devinfo->ld_private);
+ else
+ encode_uint32(xdr, 0);
+}
+
+static void encode_clone(struct xdr_stream *xdr,
+ const struct nfs42_clone_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_CLONE, decode_clone_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->src_stateid);
+ encode_nfs4_stateid(xdr, &args->dst_stateid);
+ p = reserve_space(xdr, 3*8);
+ p = xdr_encode_hyper(p, args->src_offset);
+ p = xdr_encode_hyper(p, args->dst_offset);
+ xdr_encode_hyper(p, args->count);
+}
+
+static void encode_device_error(struct xdr_stream *xdr,
+ const struct nfs42_device_error *error)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 2*4);
+ p = xdr_encode_opaque_fixed(p, error->dev_id.data,
+ NFS4_DEVICEID4_SIZE);
+ *p++ = cpu_to_be32(error->status);
+ *p = cpu_to_be32(error->opnum);
+}
+
+static void encode_layouterror(struct xdr_stream *xdr,
+ const struct nfs42_layout_error *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_LAYOUTERROR, decode_layouterror_maxsz, hdr);
+ p = reserve_space(xdr, 8 + 8);
+ p = xdr_encode_hyper(p, args->offset);
+ p = xdr_encode_hyper(p, args->length);
+ encode_nfs4_stateid(xdr, &args->stateid);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(1);
+ encode_device_error(xdr, &args->errors[0]);
+}
+
+static void encode_setxattr(struct xdr_stream *xdr,
+ const struct nfs42_setxattrargs *arg,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ BUILD_BUG_ON(XATTR_CREATE != SETXATTR4_CREATE);
+ BUILD_BUG_ON(XATTR_REPLACE != SETXATTR4_REPLACE);
+
+ encode_op_hdr(xdr, OP_SETXATTR, decode_setxattr_maxsz, hdr);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(arg->xattr_flags);
+ encode_string(xdr, strlen(arg->xattr_name), arg->xattr_name);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(arg->xattr_len);
+ if (arg->xattr_len)
+ xdr_write_pages(xdr, arg->xattr_pages, 0, arg->xattr_len);
+}
+
+static void encode_getxattr(struct xdr_stream *xdr, const char *name,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_GETXATTR, decode_getxattr_maxsz, hdr);
+ encode_string(xdr, strlen(name), name);
+}
+
+static void encode_removexattr(struct xdr_stream *xdr, const char *name,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_REMOVEXATTR, decode_removexattr_maxsz, hdr);
+ encode_string(xdr, strlen(name), name);
+}
+
+static void encode_listxattrs(struct xdr_stream *xdr,
+ const struct nfs42_listxattrsargs *arg,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_LISTXATTRS, decode_listxattrs_maxsz, hdr);
+
+ p = reserve_space(xdr, 12);
+ if (unlikely(!p))
+ return;
+
+ p = xdr_encode_hyper(p, arg->cookie);
+ /*
+ * RFC 8276 says to specify the full max length of the LISTXATTRS
+ * XDR reply. Count is set to the XDR length of the names array
+ * plus the EOF marker. So, add the cookie and the names count.
+ */
+ *p = cpu_to_be32(arg->count + 8 + 4);
+}
+
+/*
+ * Encode ALLOCATE request
+ */
+static void nfs4_xdr_enc_allocate(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_falloc_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->falloc_fh, &hdr);
+ encode_allocate(xdr, args, &hdr);
+ encode_getfattr(xdr, args->falloc_bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode COPY request
+ */
+static void nfs4_xdr_enc_copy(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_copy_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->src_fh, &hdr);
+ encode_savefh(xdr, &hdr);
+ encode_putfh(xdr, args->dst_fh, &hdr);
+ encode_copy(xdr, args, &hdr);
+ if (args->sync)
+ encode_copy_commit(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode OFFLOAD_CANEL request
+ */
+static void nfs4_xdr_enc_offload_cancel(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_offload_status_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->osa_seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->osa_seq_args, &hdr);
+ encode_putfh(xdr, args->osa_src_fh, &hdr);
+ encode_offload_cancel(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode COPY_NOTIFY request
+ */
+static void nfs4_xdr_enc_copy_notify(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_copy_notify_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->cna_seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->cna_seq_args, &hdr);
+ encode_putfh(xdr, args->cna_src_fh, &hdr);
+ encode_copy_notify(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode DEALLOCATE request
+ */
+static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_falloc_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->falloc_fh, &hdr);
+ encode_deallocate(xdr, args, &hdr);
+ encode_getfattr(xdr, args->falloc_bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode READ_PLUS request
+ */
+static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_pgio_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_read_plus(xdr, args, &hdr);
+
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->count,
+ hdr.replen - READ_PLUS_SEGMENT_SIZE_DIFF);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode SEEK request
+ */
+static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_seek_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->sa_fh, &hdr);
+ encode_seek(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode LAYOUTSTATS request
+ */
+static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_layoutstat_args *args = data;
+ int i;
+
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ WARN_ON(args->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
+ for (i = 0; i < args->num_dev; i++)
+ encode_layoutstats(xdr, args, &args->devinfo[i], &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode CLONE request
+ */
+static void nfs4_xdr_enc_clone(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_clone_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->src_fh, &hdr);
+ encode_savefh(xdr, &hdr);
+ encode_putfh(xdr, args->dst_fh, &hdr);
+ encode_clone(xdr, args, &hdr);
+ encode_getfattr(xdr, args->dst_bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode LAYOUTERROR request
+ */
+static void nfs4_xdr_enc_layouterror(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_layouterror_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+ int i;
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+ for (i = 0; i < args->num_errors; i++)
+ encode_layouterror(xdr, &args->errors[i], &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode SETXATTR request
+ */
+static void nfs4_xdr_enc_setxattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_setxattrargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_setxattr(xdr, args, &hdr);
+ encode_getfattr(xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode GETXATTR request
+ */
+static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_getxattrargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+ uint32_t replen;
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ replen = hdr.replen + op_decode_hdr_maxsz + 1;
+ encode_getxattr(xdr, args->xattr_name, &hdr);
+
+ rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->xattr_len,
+ replen);
+
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode LISTXATTR request
+ */
+static void nfs4_xdr_enc_listxattrs(struct rpc_rqst *req,
+ struct xdr_stream *xdr, const void *data)
+{
+ const struct nfs42_listxattrsargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+ uint32_t replen;
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ replen = hdr.replen + op_decode_hdr_maxsz + 2 + 1;
+ encode_listxattrs(xdr, args, &hdr);
+
+ rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->count, replen);
+
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode REMOVEXATTR request
+ */
+static void nfs4_xdr_enc_removexattr(struct rpc_rqst *req,
+ struct xdr_stream *xdr, const void *data)
+{
+ const struct nfs42_removexattrargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_removexattr(xdr, args->xattr_name, &hdr);
+ encode_nops(&hdr);
+}
+
+static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
+{
+ return decode_op_hdr(xdr, OP_ALLOCATE);
+}
+
+static int decode_write_response(struct xdr_stream *xdr,
+ struct nfs42_write_res *res)
+{
+ __be32 *p;
+ int status, count;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ count = be32_to_cpup(p);
+ if (count > 1)
+ return -EREMOTEIO;
+ else if (count == 1) {
+ status = decode_opaque_fixed(xdr, &res->stateid,
+ NFS4_STATEID_SIZE);
+ if (unlikely(status))
+ return -EIO;
+ }
+ p = xdr_inline_decode(xdr, 8 + 4);
+ if (unlikely(!p))
+ return -EIO;
+ p = xdr_decode_hyper(p, &res->count);
+ res->verifier.committed = be32_to_cpup(p);
+ return decode_verifier(xdr, &res->verifier.verifier);
+}
+
+static int decode_nl4_server(struct xdr_stream *xdr, struct nl4_server *ns)
+{
+ struct nfs42_netaddr *naddr;
+ uint32_t dummy;
+ char *dummy_str;
+ __be32 *p;
+ int status;
+
+ /* nl_type */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ ns->nl4_type = be32_to_cpup(p);
+ switch (ns->nl4_type) {
+ case NL4_NAME:
+ case NL4_URL:
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
+ if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
+ return -EIO;
+ memcpy(&ns->u.nl4_str, dummy_str, dummy);
+ ns->u.nl4_str_sz = dummy;
+ break;
+ case NL4_NETADDR:
+ naddr = &ns->u.nl4_addr;
+
+ /* netid string */
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
+ if (unlikely(dummy > RPCBIND_MAXNETIDLEN))
+ return -EIO;
+ naddr->netid_len = dummy;
+ memcpy(naddr->netid, dummy_str, naddr->netid_len);
+
+ /* uaddr string */
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
+ if (unlikely(dummy > RPCBIND_MAXUADDRLEN))
+ return -EIO;
+ naddr->addr_len = dummy;
+ memcpy(naddr->addr, dummy_str, naddr->addr_len);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int decode_copy_requirements(struct xdr_stream *xdr,
+ struct nfs42_copy_res *res) {
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4 + 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ res->consecutive = be32_to_cpup(p++);
+ res->synchronous = be32_to_cpup(p++);
+ return 0;
+}
+
+static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_COPY);
+ if (status == NFS4ERR_OFFLOAD_NO_REQS) {
+ status = decode_copy_requirements(xdr, res);
+ if (status)
+ return status;
+ return NFS4ERR_OFFLOAD_NO_REQS;
+ } else if (status)
+ return status;
+
+ status = decode_write_response(xdr, &res->write_res);
+ if (status)
+ return status;
+
+ return decode_copy_requirements(xdr, res);
+}
+
+static int decode_offload_cancel(struct xdr_stream *xdr,
+ struct nfs42_offload_status_res *res)
+{
+ return decode_op_hdr(xdr, OP_OFFLOAD_CANCEL);
+}
+
+static int decode_copy_notify(struct xdr_stream *xdr,
+ struct nfs42_copy_notify_res *res)
+{
+ __be32 *p;
+ int status, count;
+
+ status = decode_op_hdr(xdr, OP_COPY_NOTIFY);
+ if (status)
+ return status;
+ /* cnr_lease_time */
+ p = xdr_inline_decode(xdr, 12);
+ if (unlikely(!p))
+ return -EIO;
+ p = xdr_decode_hyper(p, &res->cnr_lease_time.seconds);
+ res->cnr_lease_time.nseconds = be32_to_cpup(p);
+
+ status = decode_opaque_fixed(xdr, &res->cnr_stateid, NFS4_STATEID_SIZE);
+ if (unlikely(status))
+ return -EIO;
+
+ /* number of source addresses */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ count = be32_to_cpup(p);
+ if (count > 1)
+ pr_warn("NFS: %s: nsvr %d > Supported. Use first servers\n",
+ __func__, count);
+
+ status = decode_nl4_server(xdr, &res->cnr_src);
+ if (unlikely(status))
+ return -EIO;
+ return 0;
+}
+
+static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
+{
+ return decode_op_hdr(xdr, OP_DEALLOCATE);
+}
+
+struct read_plus_segment {
+ enum data_content4 type;
+ uint64_t offset;
+ union {
+ struct {
+ uint64_t length;
+ } hole;
+
+ struct {
+ uint32_t length;
+ unsigned int from;
+ } data;
+ };
+};
+
+static inline uint64_t read_plus_segment_length(struct read_plus_segment *seg)
+{
+ return seg->type == NFS4_CONTENT_DATA ? seg->data.length : seg->hole.length;
+}
+
+static int decode_read_plus_segment(struct xdr_stream *xdr,
+ struct read_plus_segment *seg)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (!p)
+ return -EIO;
+ seg->type = be32_to_cpup(p++);
+
+ p = xdr_inline_decode(xdr, seg->type == NFS4_CONTENT_DATA ? 12 : 16);
+ if (!p)
+ return -EIO;
+ p = xdr_decode_hyper(p, &seg->offset);
+
+ if (seg->type == NFS4_CONTENT_DATA) {
+ struct xdr_buf buf;
+ uint32_t len = be32_to_cpup(p);
+
+ seg->data.length = len;
+ seg->data.from = xdr_stream_pos(xdr);
+
+ if (!xdr_stream_subsegment(xdr, &buf, xdr_align_size(len)))
+ return -EIO;
+ } else if (seg->type == NFS4_CONTENT_HOLE) {
+ xdr_decode_hyper(p, &seg->hole.length);
+ } else
+ return -EINVAL;
+ return 0;
+}
+
+static int process_read_plus_segment(struct xdr_stream *xdr,
+ struct nfs_pgio_args *args,
+ struct nfs_pgio_res *res,
+ struct read_plus_segment *seg)
+{
+ unsigned long offset = seg->offset;
+ unsigned long length = read_plus_segment_length(seg);
+ unsigned int bufpos;
+
+ if (offset + length < args->offset)
+ return 0;
+ else if (offset > args->offset + args->count) {
+ res->eof = 0;
+ return 0;
+ } else if (offset < args->offset) {
+ length -= (args->offset - offset);
+ offset = args->offset;
+ } else if (offset + length > args->offset + args->count) {
+ length = (args->offset + args->count) - offset;
+ res->eof = 0;
+ }
+
+ bufpos = xdr->buf->head[0].iov_len + (offset - args->offset);
+ if (seg->type == NFS4_CONTENT_HOLE)
+ return xdr_stream_zero(xdr, bufpos, length);
+ else
+ return xdr_stream_move_subsegment(xdr, seg->data.from, bufpos, length);
+}
+
+static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
+{
+ struct nfs_pgio_header *hdr =
+ container_of(res, struct nfs_pgio_header, res);
+ struct nfs_pgio_args *args = &hdr->args;
+ uint32_t segments;
+ struct read_plus_segment *segs;
+ int status, i;
+ __be32 *p;
+
+ status = decode_op_hdr(xdr, OP_READ_PLUS);
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 4 + 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ res->count = 0;
+ res->eof = be32_to_cpup(p++);
+ segments = be32_to_cpup(p++);
+ if (segments == 0)
+ return 0;
+
+ segs = kmalloc_array(segments, sizeof(*segs), GFP_KERNEL);
+ if (!segs)
+ return -ENOMEM;
+
+ for (i = 0; i < segments; i++) {
+ status = decode_read_plus_segment(xdr, &segs[i]);
+ if (status < 0)
+ goto out;
+ }
+
+ xdr_set_pagelen(xdr, xdr_align_size(args->count));
+ for (i = segments; i > 0; i--)
+ res->count += process_read_plus_segment(xdr, args, res, &segs[i-1]);
+ status = 0;
+
+out:
+ kfree(segs);
+ return status;
+}
+
+static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
+{
+ int status;
+ __be32 *p;
+
+ status = decode_op_hdr(xdr, OP_SEEK);
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 4 + 8);
+ if (unlikely(!p))
+ return -EIO;
+
+ res->sr_eof = be32_to_cpup(p++);
+ p = xdr_decode_hyper(p, &res->sr_offset);
+ return 0;
+}
+
+static int decode_layoutstats(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_LAYOUTSTATS);
+}
+
+static int decode_clone(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_CLONE);
+}
+
+static int decode_layouterror(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_LAYOUTERROR);
+}
+
+static int decode_setxattr(struct xdr_stream *xdr,
+ struct nfs4_change_info *cinfo)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_SETXATTR);
+ if (status)
+ goto out;
+ status = decode_change_info(xdr, cinfo);
+out:
+ return status;
+}
+
+static int decode_getxattr(struct xdr_stream *xdr,
+ struct nfs42_getxattrres *res,
+ struct rpc_rqst *req)
+{
+ int status;
+ __be32 *p;
+ u32 len, rdlen;
+
+ status = decode_op_hdr(xdr, OP_GETXATTR);
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ len = be32_to_cpup(p);
+
+ /*
+ * Only check against the page length here. The actual
+ * requested length may be smaller, but that is only
+ * checked against after possibly caching a valid reply.
+ */
+ if (len > req->rq_rcv_buf.page_len)
+ return -ERANGE;
+
+ res->xattr_len = len;
+
+ if (len > 0) {
+ rdlen = xdr_read_pages(xdr, len);
+ if (rdlen < len)
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int decode_removexattr(struct xdr_stream *xdr,
+ struct nfs4_change_info *cinfo)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_REMOVEXATTR);
+ if (status)
+ goto out;
+
+ status = decode_change_info(xdr, cinfo);
+out:
+ return status;
+}
+
+static int decode_listxattrs(struct xdr_stream *xdr,
+ struct nfs42_listxattrsres *res)
+{
+ int status;
+ __be32 *p;
+ u32 count, len, ulen;
+ size_t left, copied;
+ char *buf;
+
+ status = decode_op_hdr(xdr, OP_LISTXATTRS);
+ if (status) {
+ /*
+ * Special case: for LISTXATTRS, NFS4ERR_TOOSMALL
+ * should be translated to ERANGE.
+ */
+ if (status == -ETOOSMALL)
+ status = -ERANGE;
+ /*
+ * Special case: for LISTXATTRS, NFS4ERR_NOXATTR
+ * should be translated to success with zero-length reply.
+ */
+ if (status == -ENODATA) {
+ res->eof = true;
+ status = 0;
+ }
+ goto out;
+ }
+
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+
+ xdr_decode_hyper(p, &res->cookie);
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ left = res->xattr_len;
+ buf = res->xattr_buf;
+
+ count = be32_to_cpup(p);
+ copied = 0;
+
+ /*
+ * We have asked for enough room to encode the maximum number
+ * of possible attribute names, so everything should fit.
+ *
+ * But, don't rely on that assumption. Just decode entries
+ * until they don't fit anymore, just in case the server did
+ * something odd.
+ */
+ while (count--) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ len = be32_to_cpup(p);
+ if (len > (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN)) {
+ status = -ERANGE;
+ goto out;
+ }
+
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return -EIO;
+
+ ulen = len + XATTR_USER_PREFIX_LEN + 1;
+ if (buf) {
+ if (ulen > left) {
+ status = -ERANGE;
+ goto out;
+ }
+
+ memcpy(buf, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+ memcpy(buf + XATTR_USER_PREFIX_LEN, p, len);
+
+ buf[ulen - 1] = 0;
+ buf += ulen;
+ left -= ulen;
+ }
+ copied += ulen;
+ }
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ res->eof = be32_to_cpup(p);
+ res->copied = copied;
+
+out:
+ if (status == -ERANGE && res->xattr_len == XATTR_LIST_MAX)
+ status = -E2BIG;
+
+ return status;
+}
+
+/*
+ * Decode ALLOCATE request
+ */
+static int nfs4_xdr_dec_allocate(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_falloc_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_allocate(xdr, res);
+ if (status)
+ goto out;
+ decode_getfattr(xdr, res->falloc_fattr, res->falloc_server);
+out:
+ return status;
+}
+
+/*
+ * Decode COPY response
+ */
+static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_copy_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_savefh(xdr);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_copy(xdr, res);
+ if (status)
+ goto out;
+ if (res->commit_res.verf)
+ status = decode_commit(xdr, &res->commit_res);
+out:
+ return status;
+}
+
+/*
+ * Decode OFFLOAD_CANCEL response
+ */
+static int nfs4_xdr_dec_offload_cancel(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_offload_status_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->osr_seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_offload_cancel(xdr, res);
+
+out:
+ return status;
+}
+
+/*
+ * Decode COPY_NOTIFY response
+ */
+static int nfs4_xdr_dec_copy_notify(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_copy_notify_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->cnr_seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_copy_notify(xdr, res);
+
+out:
+ return status;
+}
+
+/*
+ * Decode DEALLOCATE request
+ */
+static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_falloc_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_deallocate(xdr, res);
+ if (status)
+ goto out;
+ decode_getfattr(xdr, res->falloc_fattr, res->falloc_server);
+out:
+ return status;
+}
+
+/*
+ * Decode READ_PLUS request
+ */
+static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_pgio_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ xdr_set_scratch_buffer(xdr, res->scratch, READ_PLUS_SCRATCH_SIZE);
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_read_plus(xdr, res);
+ if (!status)
+ status = res->count;
+out:
+ return status;
+}
+
+/*
+ * Decode SEEK request
+ */
+static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_seek_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_seek(xdr, res);
+out:
+ return status;
+}
+
+/*
+ * Decode LAYOUTSTATS request
+ */
+static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_layoutstat_res *res = data;
+ struct compound_hdr hdr;
+ int status, i;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
+ for (i = 0; i < res->num_dev; i++) {
+ status = decode_layoutstats(xdr);
+ if (status)
+ goto out;
+ }
+out:
+ res->rpc_status = status;
+ return status;
+}
+
+/*
+ * Decode CLONE request
+ */
+static int nfs4_xdr_dec_clone(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_clone_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_savefh(xdr);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_clone(xdr);
+ if (status)
+ goto out;
+ decode_getfattr(xdr, res->dst_fattr, res->server);
+out:
+ res->rpc_status = status;
+ return status;
+}
+
+/*
+ * Decode LAYOUTERROR request
+ */
+static int nfs4_xdr_dec_layouterror(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_layouterror_res *res = data;
+ struct compound_hdr hdr;
+ int status, i;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+
+ for (i = 0; i < res->num_errors && status == 0; i++)
+ status = decode_layouterror(xdr);
+out:
+ res->rpc_status = status;
+ return status;
+}
+
+/*
+ * Decode SETXATTR request
+ */
+static int nfs4_xdr_dec_setxattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_setxattrres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, req);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_setxattr(xdr, &res->cinfo);
+ if (status)
+ goto out;
+ status = decode_getfattr(xdr, res->fattr, res->server);
+out:
+ return status;
+}
+
+/*
+ * Decode GETXATTR request
+ */
+static int nfs4_xdr_dec_getxattr(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr, void *data)
+{
+ struct nfs42_getxattrres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_getxattr(xdr, res, rqstp);
+out:
+ return status;
+}
+
+/*
+ * Decode LISTXATTR request
+ */
+static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr, void *data)
+{
+ struct nfs42_listxattrsres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ xdr_set_scratch_page(xdr, res->scratch);
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_listxattrs(xdr, res);
+out:
+ return status;
+}
+
+/*
+ * Decode REMOVEXATTR request
+ */
+static int nfs4_xdr_dec_removexattr(struct rpc_rqst *req,
+ struct xdr_stream *xdr, void *data)
+{
+ struct nfs42_removexattrres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, req);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+
+ status = decode_removexattr(xdr, &res->cinfo);
+out:
+ return status;
+}
+#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
new file mode 100644
index 0000000000..47c5c1f86d
--- /dev/null
+++ b/fs/nfs/nfs4_fs.h
@@ -0,0 +1,674 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * linux/fs/nfs/nfs4_fs.h
+ *
+ * Copyright (C) 2005 Trond Myklebust
+ *
+ * NFSv4-specific filesystem definitions and declarations
+ */
+
+#ifndef __LINUX_FS_NFS_NFS4_FS_H
+#define __LINUX_FS_NFS_NFS4_FS_H
+
+#if defined(CONFIG_NFS_V4_2)
+#define NFS4_MAX_MINOR_VERSION 2
+#elif defined(CONFIG_NFS_V4_1)
+#define NFS4_MAX_MINOR_VERSION 1
+#else
+#define NFS4_MAX_MINOR_VERSION 0
+#endif
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+
+#define NFS4_MAX_LOOP_ON_RECOVER (10)
+
+#include <linux/seqlock.h>
+#include <linux/filelock.h>
+
+struct idmap;
+
+enum nfs4_client_state {
+ NFS4CLNT_MANAGER_RUNNING = 0,
+ NFS4CLNT_CHECK_LEASE,
+ NFS4CLNT_LEASE_EXPIRED,
+ NFS4CLNT_RECLAIM_REBOOT,
+ NFS4CLNT_RECLAIM_NOGRACE,
+ NFS4CLNT_DELEGRETURN,
+ NFS4CLNT_SESSION_RESET,
+ NFS4CLNT_LEASE_CONFIRM,
+ NFS4CLNT_SERVER_SCOPE_MISMATCH,
+ NFS4CLNT_PURGE_STATE,
+ NFS4CLNT_BIND_CONN_TO_SESSION,
+ NFS4CLNT_MOVED,
+ NFS4CLNT_LEASE_MOVED,
+ NFS4CLNT_DELEGATION_EXPIRED,
+ NFS4CLNT_RUN_MANAGER,
+ NFS4CLNT_MANAGER_AVAILABLE,
+ NFS4CLNT_RECALL_RUNNING,
+ NFS4CLNT_RECALL_ANY_LAYOUT_READ,
+ NFS4CLNT_RECALL_ANY_LAYOUT_RW,
+ NFS4CLNT_DELEGRETURN_DELAYED,
+};
+
+#define NFS4_RENEW_TIMEOUT 0x01
+#define NFS4_RENEW_DELEGATION_CB 0x02
+
+struct nfs_seqid_counter;
+struct nfs4_minor_version_ops {
+ u32 minor_version;
+ unsigned init_caps;
+
+ int (*init_client)(struct nfs_client *);
+ void (*shutdown_client)(struct nfs_client *);
+ bool (*match_stateid)(const nfs4_stateid *,
+ const nfs4_stateid *);
+ int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
+ struct nfs_fsinfo *);
+ void (*free_lock_state)(struct nfs_server *,
+ struct nfs4_lock_state *);
+ int (*test_and_free_expired)(struct nfs_server *,
+ nfs4_stateid *, const struct cred *);
+ struct nfs_seqid *
+ (*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+ void (*session_trunk)(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt, void *data);
+ const struct rpc_call_ops *call_sync_ops;
+ const struct nfs4_state_recovery_ops *reboot_recovery_ops;
+ const struct nfs4_state_recovery_ops *nograce_recovery_ops;
+ const struct nfs4_state_maintenance_ops *state_renewal_ops;
+ const struct nfs4_mig_recovery_ops *mig_recovery_ops;
+};
+
+#define NFS_SEQID_CONFIRMED 1
+struct nfs_seqid_counter {
+ ktime_t create_time;
+ int owner_id;
+ int flags;
+ u32 counter;
+ spinlock_t lock; /* Protects the list */
+ struct list_head list; /* Defines sequence of RPC calls */
+ struct rpc_wait_queue wait; /* RPC call delay queue */
+};
+
+struct nfs_seqid {
+ struct nfs_seqid_counter *sequence;
+ struct list_head list;
+ struct rpc_task *task;
+};
+
+static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status)
+{
+ if (seqid_mutating_err(-status))
+ seqid->flags |= NFS_SEQID_CONFIRMED;
+}
+
+/*
+ * NFS4 state_owners and lock_owners are simply labels for ordered
+ * sequences of RPC calls. Their sole purpose is to provide once-only
+ * semantics by allowing the server to identify replayed requests.
+ */
+struct nfs4_state_owner {
+ struct nfs_server *so_server;
+ struct list_head so_lru;
+ unsigned long so_expires;
+ struct rb_node so_server_node;
+
+ const struct cred *so_cred; /* Associated cred */
+
+ spinlock_t so_lock;
+ atomic_t so_count;
+ unsigned long so_flags;
+ struct list_head so_states;
+ struct nfs_seqid_counter so_seqid;
+ seqcount_spinlock_t so_reclaim_seqcount;
+ struct mutex so_delegreturn_mutex;
+};
+
+enum {
+ NFS_OWNER_RECLAIM_REBOOT,
+ NFS_OWNER_RECLAIM_NOGRACE
+};
+
+#define NFS_LOCK_NEW 0
+#define NFS_LOCK_RECLAIM 1
+#define NFS_LOCK_EXPIRED 2
+
+/*
+ * struct nfs4_state maintains the client-side state for a given
+ * (state_owner,inode) tuple (OPEN) or state_owner (LOCK).
+ *
+ * OPEN:
+ * In order to know when to OPEN_DOWNGRADE or CLOSE the state on the server,
+ * we need to know how many files are open for reading or writing on a
+ * given inode. This information too is stored here.
+ *
+ * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
+ */
+
+struct nfs4_lock_state {
+ struct list_head ls_locks; /* Other lock stateids */
+ struct nfs4_state * ls_state; /* Pointer to open state */
+#define NFS_LOCK_INITIALIZED 0
+#define NFS_LOCK_LOST 1
+#define NFS_LOCK_UNLOCKING 2
+ unsigned long ls_flags;
+ struct nfs_seqid_counter ls_seqid;
+ nfs4_stateid ls_stateid;
+ refcount_t ls_count;
+ fl_owner_t ls_owner;
+};
+
+/* bits for nfs4_state->flags */
+enum {
+ LK_STATE_IN_USE,
+ NFS_DELEGATED_STATE, /* Current stateid is delegation */
+ NFS_OPEN_STATE, /* OPEN stateid is set */
+ NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */
+ NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */
+ NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */
+ NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */
+ NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */
+ NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */
+ NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */
+ NFS_STATE_MAY_NOTIFY_LOCK, /* server may CB_NOTIFY_LOCK */
+ NFS_STATE_CHANGE_WAIT, /* A state changing operation is outstanding */
+ NFS_CLNT_DST_SSC_COPY_STATE, /* dst server open state on client*/
+ NFS_CLNT_SRC_SSC_COPY_STATE, /* src server open state on client*/
+ NFS_SRV_SSC_COPY_STATE, /* ssc state on the dst server */
+};
+
+struct nfs4_state {
+ struct list_head open_states; /* List of states for the same state_owner */
+ struct list_head inode_states; /* List of states for the same inode */
+ struct list_head lock_states; /* List of subservient lock stateids */
+
+ struct nfs4_state_owner *owner; /* Pointer to the open owner */
+ struct inode *inode; /* Pointer to the inode */
+
+ unsigned long flags; /* Do we hold any locks? */
+ spinlock_t state_lock; /* Protects the lock_states list */
+
+ seqlock_t seqlock; /* Protects the stateid/open_stateid */
+ nfs4_stateid stateid; /* Current stateid: may be delegation */
+ nfs4_stateid open_stateid; /* OPEN stateid */
+
+ /* The following 3 fields are protected by owner->so_lock */
+ unsigned int n_rdonly; /* Number of read-only references */
+ unsigned int n_wronly; /* Number of write-only references */
+ unsigned int n_rdwr; /* Number of read/write references */
+ fmode_t state; /* State on the server (R,W, or RW) */
+ refcount_t count;
+
+ wait_queue_head_t waitq;
+ struct rcu_head rcu_head;
+};
+
+
+struct nfs4_exception {
+ struct nfs4_state *state;
+ struct inode *inode;
+ nfs4_stateid *stateid;
+ long timeout;
+ unsigned char task_is_privileged : 1;
+ unsigned char delay : 1,
+ recovering : 1,
+ retry : 1;
+ bool interruptible;
+};
+
+struct nfs4_state_recovery_ops {
+ int owner_flag_bit;
+ int state_flag_bit;
+ int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
+ int (*recover_lock)(struct nfs4_state *, struct file_lock *);
+ int (*establish_clid)(struct nfs_client *, const struct cred *);
+ int (*reclaim_complete)(struct nfs_client *, const struct cred *);
+ int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
+ const struct cred *);
+};
+
+struct nfs4_opendata {
+ struct kref kref;
+ struct nfs_openargs o_arg;
+ struct nfs_openres o_res;
+ struct nfs_open_confirmargs c_arg;
+ struct nfs_open_confirmres c_res;
+ struct nfs4_string owner_name;
+ struct nfs4_string group_name;
+ struct nfs4_label *a_label;
+ struct nfs_fattr f_attr;
+ struct dentry *dir;
+ struct dentry *dentry;
+ struct nfs4_state_owner *owner;
+ struct nfs4_state *state;
+ struct iattr attrs;
+ struct nfs4_layoutget *lgp;
+ unsigned long timestamp;
+ bool rpc_done;
+ bool file_created;
+ bool is_recover;
+ bool cancelled;
+ int rpc_status;
+};
+
+struct nfs4_add_xprt_data {
+ struct nfs_client *clp;
+ const struct cred *cred;
+};
+
+struct nfs4_state_maintenance_ops {
+ int (*sched_state_renewal)(struct nfs_client *, const struct cred *, unsigned);
+ const struct cred * (*get_state_renewal_cred)(struct nfs_client *);
+ int (*renew_lease)(struct nfs_client *, const struct cred *);
+};
+
+struct nfs4_mig_recovery_ops {
+ int (*get_locations)(struct nfs_server *, struct nfs_fh *,
+ struct nfs4_fs_locations *, struct page *, const struct cred *);
+ int (*fsid_present)(struct inode *, const struct cred *);
+};
+
+extern const struct dentry_operations nfs4_dentry_operations;
+
+/* dir.c */
+int nfs_atomic_open(struct inode *, struct dentry *, struct file *,
+ unsigned, umode_t);
+
+/* fs_context.c */
+extern struct file_system_type nfs4_fs_type;
+
+/* nfs4namespace.c */
+struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *,
+ const struct qstr *);
+int nfs4_submount(struct fs_context *, struct nfs_server *);
+int nfs4_replace_transport(struct nfs_server *server,
+ const struct nfs4_fs_locations *locations);
+size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr_storage *ss,
+ size_t salen, struct net *net, int port);
+/* nfs4proc.c */
+extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *);
+extern int nfs4_async_handle_error(struct rpc_task *task,
+ struct nfs_server *server,
+ struct nfs4_state *state, long *timeout);
+extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *,
+ struct rpc_message *, struct nfs4_sequence_args *,
+ struct nfs4_sequence_res *, int);
+extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int, int);
+extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, const struct cred *, struct nfs4_setclientid_res *);
+extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, const struct cred *);
+extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
+extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, const struct cred *cred);
+extern int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred);
+extern int nfs4_destroy_clientid(struct nfs_client *clp);
+extern int nfs4_init_clientid(struct nfs_client *, const struct cred *);
+extern int nfs41_init_clientid(struct nfs_client *, const struct cred *);
+extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait);
+extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
+extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *,
+ struct nfs4_fs_locations *, struct page *);
+extern int nfs4_proc_get_locations(struct nfs_server *, struct nfs_fh *,
+ struct nfs4_fs_locations *,
+ struct page *page, const struct cred *);
+extern int nfs4_proc_fsid_present(struct inode *, const struct cred *);
+extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *,
+ struct dentry *,
+ struct nfs_fh *,
+ struct nfs_fattr *);
+extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
+extern const struct xattr_handler *nfs4_xattr_handlers[];
+extern int nfs4_set_rw_stateid(nfs4_stateid *stateid,
+ const struct nfs_open_context *ctx,
+ const struct nfs_lock_context *l_ctx,
+ fmode_t fmode);
+extern void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[],
+ struct inode *inode, unsigned long cache_validity);
+extern int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct inode *inode);
+extern int update_open_stateid(struct nfs4_state *state,
+ const nfs4_stateid *open_stateid,
+ const nfs4_stateid *deleg_stateid,
+ fmode_t fmode);
+extern int nfs4_proc_setlease(struct file *file, int arg,
+ struct file_lock **lease, void **priv);
+extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
+ struct nfs_fsinfo *fsinfo);
+extern void nfs4_update_changeattr(struct inode *dir,
+ struct nfs4_change_info *cinfo,
+ unsigned long timestamp,
+ unsigned long cache_validity);
+extern int nfs4_buf_to_pages_noslab(const void *buf, size_t buflen,
+ struct page **pages);
+
+#if defined(CONFIG_NFS_V4_1)
+extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *);
+extern int nfs4_proc_create_session(struct nfs_client *, const struct cred *);
+extern int nfs4_proc_destroy_session(struct nfs4_session *, const struct cred *);
+extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
+ bool sync);
+extern int nfs4_detect_session_trunking(struct nfs_client *clp,
+ struct nfs41_exchange_id_res *res, struct rpc_xprt *xprt);
+
+static inline bool
+is_ds_only_client(struct nfs_client *clp)
+{
+ return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) ==
+ EXCHGID4_FLAG_USE_PNFS_DS;
+}
+
+static inline bool
+is_ds_client(struct nfs_client *clp)
+{
+ return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS;
+}
+
+static inline bool
+_nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
+ struct rpc_clnt **clntp, struct rpc_message *msg)
+{
+ rpc_authflavor_t flavor;
+
+ if (sp4_mode == NFS_SP4_MACH_CRED_CLEANUP ||
+ sp4_mode == NFS_SP4_MACH_CRED_PNFS_CLEANUP) {
+ /* Using machine creds for cleanup operations
+ * is only relevent if the client credentials
+ * might expire. So don't bother for
+ * RPC_AUTH_UNIX. If file was only exported to
+ * sec=sys, the PUTFH would fail anyway.
+ */
+ if ((*clntp)->cl_auth->au_flavor == RPC_AUTH_UNIX)
+ return false;
+ }
+ if (test_bit(sp4_mode, &clp->cl_sp4_flags)) {
+ msg->rpc_cred = rpc_machine_cred();
+
+ flavor = clp->cl_rpcclient->cl_auth->au_flavor;
+ WARN_ON_ONCE(flavor != RPC_AUTH_GSS_KRB5I &&
+ flavor != RPC_AUTH_GSS_KRB5P);
+ *clntp = clp->cl_rpcclient;
+
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Function responsible for determining if an rpc_message should use the
+ * machine cred under SP4_MACH_CRED and if so switching the credential and
+ * authflavor (using the nfs_client's rpc_clnt which will be krb5i/p).
+ * Should be called before rpc_call_sync/rpc_call_async.
+ */
+static inline void
+nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
+ struct rpc_clnt **clntp, struct rpc_message *msg)
+{
+ _nfs4_state_protect(clp, sp4_mode, clntp, msg);
+}
+
+/*
+ * Special wrapper to nfs4_state_protect for write.
+ * If WRITE can use machine cred but COMMIT cannot, make sure all writes
+ * that use machine cred use NFS_FILE_SYNC.
+ */
+static inline void
+nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
+ struct rpc_message *msg, struct nfs_pgio_header *hdr)
+{
+ if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) &&
+ !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags))
+ hdr->args.stable = NFS_FILE_SYNC;
+}
+#else /* CONFIG_NFS_v4_1 */
+static inline bool
+is_ds_only_client(struct nfs_client *clp)
+{
+ return false;
+}
+
+static inline bool
+is_ds_client(struct nfs_client *clp)
+{
+ return false;
+}
+
+static inline void
+nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags,
+ struct rpc_clnt **clntp, struct rpc_message *msg)
+{
+}
+
+static inline void
+nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
+ struct rpc_message *msg, struct nfs_pgio_header *hdr)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
+
+extern const u32 nfs4_fattr_bitmap[3];
+extern const u32 nfs4_statfs_bitmap[3];
+extern const u32 nfs4_pathconf_bitmap[3];
+extern const u32 nfs4_fsinfo_bitmap[3];
+extern const u32 nfs4_fs_locations_bitmap[3];
+
+void nfs40_shutdown_client(struct nfs_client *);
+void nfs41_shutdown_client(struct nfs_client *);
+int nfs40_init_client(struct nfs_client *);
+int nfs41_init_client(struct nfs_client *);
+void nfs4_free_client(struct nfs_client *);
+
+struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *);
+
+/* nfs4renewd.c */
+extern void nfs4_schedule_state_renewal(struct nfs_client *);
+extern void nfs4_kill_renewd(struct nfs_client *);
+extern void nfs4_renew_state(struct work_struct *);
+extern void nfs4_set_lease_period(struct nfs_client *clp, unsigned long lease);
+
+
+/* nfs4state.c */
+extern const nfs4_stateid current_stateid;
+
+const struct cred *nfs4_get_clid_cred(struct nfs_client *clp);
+const struct cred *nfs4_get_machine_cred(struct nfs_client *clp);
+const struct cred *nfs4_get_renew_cred(struct nfs_client *clp);
+int nfs4_discover_server_trunking(struct nfs_client *clp,
+ struct nfs_client **);
+int nfs40_discover_server_trunking(struct nfs_client *clp,
+ struct nfs_client **, const struct cred *);
+#if defined(CONFIG_NFS_V4_1)
+int nfs41_discover_server_trunking(struct nfs_client *clp,
+ struct nfs_client **, const struct cred *);
+extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
+extern void nfs41_notify_server(struct nfs_client *);
+bool nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1,
+ struct nfs41_server_owner *o2);
+#else
+static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, const struct cred *, gfp_t);
+extern void nfs4_put_state_owner(struct nfs4_state_owner *);
+extern void nfs4_purge_state_owners(struct nfs_server *, struct list_head *);
+extern void nfs4_free_state_owners(struct list_head *head);
+extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
+extern void nfs4_put_open_state(struct nfs4_state *);
+extern void nfs4_close_state(struct nfs4_state *, fmode_t);
+extern void nfs4_close_sync(struct nfs4_state *, fmode_t);
+extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
+extern void nfs_inode_find_state_and_recover(struct inode *inode,
+ const nfs4_stateid *stateid);
+extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *, struct nfs4_state *);
+extern void nfs4_schedule_lease_recovery(struct nfs_client *);
+extern int nfs4_wait_clnt_recover(struct nfs_client *clp);
+extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);
+extern void nfs4_schedule_state_manager(struct nfs_client *);
+extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
+extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
+extern int nfs4_schedule_migration_recovery(const struct nfs_server *);
+extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *);
+extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags, bool);
+extern void nfs41_handle_server_scope(struct nfs_client *,
+ struct nfs41_server_scope **);
+extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
+extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
+extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t,
+ const struct nfs_lock_context *, nfs4_stateid *,
+ const struct cred **);
+extern bool nfs4_copy_open_stateid(nfs4_stateid *dst,
+ struct nfs4_state *state);
+
+extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
+extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
+extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
+extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
+extern void nfs_release_seqid(struct nfs_seqid *seqid);
+extern void nfs_free_seqid(struct nfs_seqid *seqid);
+extern int nfs4_setup_sequence(struct nfs_client *client,
+ struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ struct rpc_task *task);
+extern int nfs4_sequence_done(struct rpc_task *task,
+ struct nfs4_sequence_res *res);
+
+extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp);
+extern int nfs4_proc_commit(struct file *dst, __u64 offset, __u32 count, struct nfs_commitres *res);
+extern const nfs4_stateid zero_stateid;
+extern const nfs4_stateid invalid_stateid;
+
+/* nfs4super.c */
+struct nfs_mount_info;
+extern struct nfs_subversion nfs_v4;
+extern bool nfs4_disable_idmapping;
+extern unsigned short max_session_slots;
+extern unsigned short max_session_cb_slots;
+extern unsigned short send_implementation_id;
+extern bool recover_lost_locks;
+
+#define NFS4_CLIENT_ID_UNIQ_LEN (64)
+extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN];
+
+extern int nfs4_try_get_tree(struct fs_context *);
+extern int nfs4_get_referral_tree(struct fs_context *);
+
+/* nfs4sysctl.c */
+#ifdef CONFIG_SYSCTL
+int nfs4_register_sysctl(void);
+void nfs4_unregister_sysctl(void);
+#else
+static inline int nfs4_register_sysctl(void)
+{
+ return 0;
+}
+
+static inline void nfs4_unregister_sysctl(void)
+{
+}
+#endif
+
+/* nfs4xdr.c */
+extern const struct rpc_procinfo nfs4_procedures[];
+
+#ifdef CONFIG_NFS_V4_2
+extern const u32 nfs42_maxsetxattr_overhead;
+extern const u32 nfs42_maxgetxattr_overhead;
+extern const u32 nfs42_maxlistxattrs_overhead;
+#endif
+
+struct nfs4_mount_data;
+
+/* callback_xdr.c */
+extern const struct svc_version nfs4_callback_version1;
+extern const struct svc_version nfs4_callback_version4;
+
+static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
+{
+ memcpy(dst->data, src->data, sizeof(dst->data));
+ dst->type = src->type;
+}
+
+static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src)
+{
+ if (dst->type != src->type)
+ return false;
+ return memcmp(dst->data, src->data, sizeof(dst->data)) == 0;
+}
+
+static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src)
+{
+ return memcmp(dst->other, src->other, NFS4_STATEID_OTHER_SIZE) == 0;
+}
+
+static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stateid *s2)
+{
+ return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0;
+}
+
+static inline bool nfs4_stateid_is_next(const nfs4_stateid *s1, const nfs4_stateid *s2)
+{
+ u32 seq1 = be32_to_cpu(s1->seqid);
+ u32 seq2 = be32_to_cpu(s2->seqid);
+
+ return seq2 == seq1 + 1U || (seq2 == 1U && seq1 == 0xffffffffU);
+}
+
+static inline bool nfs4_stateid_match_or_older(const nfs4_stateid *dst, const nfs4_stateid *src)
+{
+ return nfs4_stateid_match_other(dst, src) &&
+ !(src->seqid && nfs4_stateid_is_newer(dst, src));
+}
+
+static inline void nfs4_stateid_seqid_inc(nfs4_stateid *s1)
+{
+ u32 seqid = be32_to_cpu(s1->seqid);
+
+ if (++seqid == 0)
+ ++seqid;
+ s1->seqid = cpu_to_be32(seqid);
+}
+
+static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)
+{
+ return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0;
+}
+
+static inline bool nfs4_state_match_open_stateid_other(const struct nfs4_state *state,
+ const nfs4_stateid *stateid)
+{
+ return test_bit(NFS_OPEN_STATE, &state->flags) &&
+ nfs4_stateid_match_other(&state->open_stateid, stateid);
+}
+
+/* nfs42xattr.c */
+#ifdef CONFIG_NFS_V4_2
+extern int __init nfs4_xattr_cache_init(void);
+extern void nfs4_xattr_cache_exit(void);
+extern void nfs4_xattr_cache_add(struct inode *inode, const char *name,
+ const char *buf, struct page **pages,
+ ssize_t buflen);
+extern void nfs4_xattr_cache_remove(struct inode *inode, const char *name);
+extern ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name,
+ char *buf, ssize_t buflen);
+extern void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf,
+ ssize_t buflen);
+extern ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf,
+ ssize_t buflen);
+extern void nfs4_xattr_cache_zap(struct inode *inode);
+#else
+static inline void nfs4_xattr_cache_zap(struct inode *inode)
+{
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+#else /* CONFIG_NFS_V4 */
+
+#define nfs4_close_state(a, b) do { } while (0)
+#define nfs4_close_sync(a, b) do { } while (0)
+#define nfs4_state_protect(a, b, c, d) do { } while (0)
+#define nfs4_state_protect_write(a, b, c, d) do { } while (0)
+
+
+#endif /* CONFIG_NFS_V4 */
+#endif /* __LINUX_FS_NFS_NFS4_FS.H */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
new file mode 100644
index 0000000000..11e3a28559
--- /dev/null
+++ b/fs/nfs/nfs4client.c
@@ -0,0 +1,1384 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+#include <linux/module.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/auth.h>
+#include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/bc_xprt.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include "internal.h"
+#include "callback.h"
+#include "delegation.h"
+#include "nfs4session.h"
+#include "nfs4idmap.h"
+#include "pnfs.h"
+#include "netns.h"
+#include "sysfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_CLIENT
+
+/*
+ * Get a unique NFSv4.0 callback identifier which will be used
+ * by the V4.0 callback service to lookup the nfs_client struct
+ */
+static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
+{
+ int ret = 0;
+ struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+
+ if (clp->rpc_ops->version != 4 || minorversion != 0)
+ return ret;
+ idr_preload(GFP_KERNEL);
+ spin_lock(&nn->nfs_client_lock);
+ ret = idr_alloc(&nn->cb_ident_idr, clp, 1, 0, GFP_NOWAIT);
+ if (ret >= 0)
+ clp->cl_cb_ident = ret;
+ spin_unlock(&nn->nfs_client_lock);
+ idr_preload_end();
+ return ret < 0 ? ret : 0;
+}
+
+#ifdef CONFIG_NFS_V4_1
+/*
+ * Per auth flavor data server rpc clients
+ */
+struct nfs4_ds_server {
+ struct list_head list; /* ds_clp->cl_ds_clients */
+ struct rpc_clnt *rpc_clnt;
+};
+
+/**
+ * nfs4_find_ds_client - Common lookup case for DS I/O
+ * @ds_clp: pointer to the DS's nfs_client
+ * @flavor: rpc auth flavour to match
+ */
+static struct nfs4_ds_server *
+nfs4_find_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor)
+{
+ struct nfs4_ds_server *dss;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(dss, &ds_clp->cl_ds_clients, list) {
+ if (dss->rpc_clnt->cl_auth->au_flavor != flavor)
+ continue;
+ goto out;
+ }
+ dss = NULL;
+out:
+ rcu_read_unlock();
+ return dss;
+}
+
+static struct nfs4_ds_server *
+nfs4_add_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor,
+ struct nfs4_ds_server *new)
+{
+ struct nfs4_ds_server *dss;
+
+ spin_lock(&ds_clp->cl_lock);
+ list_for_each_entry(dss, &ds_clp->cl_ds_clients, list) {
+ if (dss->rpc_clnt->cl_auth->au_flavor != flavor)
+ continue;
+ goto out;
+ }
+ if (new)
+ list_add_rcu(&new->list, &ds_clp->cl_ds_clients);
+ dss = new;
+out:
+ spin_unlock(&ds_clp->cl_lock); /* need some lock to protect list */
+ return dss;
+}
+
+static struct nfs4_ds_server *
+nfs4_alloc_ds_server(struct nfs_client *ds_clp, rpc_authflavor_t flavor)
+{
+ struct nfs4_ds_server *dss;
+
+ dss = kmalloc(sizeof(*dss), GFP_NOFS);
+ if (dss == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ dss->rpc_clnt = rpc_clone_client_set_auth(ds_clp->cl_rpcclient, flavor);
+ if (IS_ERR(dss->rpc_clnt)) {
+ int err = PTR_ERR(dss->rpc_clnt);
+ kfree (dss);
+ return ERR_PTR(err);
+ }
+ INIT_LIST_HEAD(&dss->list);
+
+ return dss;
+}
+
+static void
+nfs4_free_ds_server(struct nfs4_ds_server *dss)
+{
+ rpc_release_client(dss->rpc_clnt);
+ kfree(dss);
+}
+
+/**
+ * nfs4_find_or_create_ds_client - Find or create a DS rpc client
+ * @ds_clp: pointer to the DS's nfs_client
+ * @inode: pointer to the inode
+ *
+ * Find or create a DS rpc client with th MDS server rpc client auth flavor
+ * in the nfs_client cl_ds_clients list.
+ */
+struct rpc_clnt *
+nfs4_find_or_create_ds_client(struct nfs_client *ds_clp, struct inode *inode)
+{
+ struct nfs4_ds_server *dss, *new;
+ rpc_authflavor_t flavor = NFS_SERVER(inode)->client->cl_auth->au_flavor;
+
+ dss = nfs4_find_ds_client(ds_clp, flavor);
+ if (dss != NULL)
+ goto out;
+ new = nfs4_alloc_ds_server(ds_clp, flavor);
+ if (IS_ERR(new))
+ return ERR_CAST(new);
+ dss = nfs4_add_ds_client(ds_clp, flavor, new);
+ if (dss != new)
+ nfs4_free_ds_server(new);
+out:
+ return dss->rpc_clnt;
+}
+EXPORT_SYMBOL_GPL(nfs4_find_or_create_ds_client);
+
+static void
+nfs4_shutdown_ds_clients(struct nfs_client *clp)
+{
+ struct nfs4_ds_server *dss;
+
+ while (!list_empty(&clp->cl_ds_clients)) {
+ dss = list_entry(clp->cl_ds_clients.next,
+ struct nfs4_ds_server, list);
+ list_del(&dss->list);
+ rpc_shutdown_client(dss->rpc_clnt);
+ kfree (dss);
+ }
+}
+
+static void
+nfs4_cleanup_callback(struct nfs_client *clp)
+{
+ struct nfs4_copy_state *cp_state;
+
+ while (!list_empty(&clp->pending_cb_stateids)) {
+ cp_state = list_entry(clp->pending_cb_stateids.next,
+ struct nfs4_copy_state, copies);
+ list_del(&cp_state->copies);
+ kfree(cp_state);
+ }
+}
+
+void nfs41_shutdown_client(struct nfs_client *clp)
+{
+ if (nfs4_has_session(clp)) {
+ nfs4_cleanup_callback(clp);
+ nfs4_shutdown_ds_clients(clp);
+ nfs4_destroy_session(clp->cl_session);
+ nfs4_destroy_clientid(clp);
+ }
+
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+void nfs40_shutdown_client(struct nfs_client *clp)
+{
+ if (clp->cl_slot_tbl) {
+ nfs4_shutdown_slot_table(clp->cl_slot_tbl);
+ kfree(clp->cl_slot_tbl);
+ }
+}
+
+struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
+{
+ char buf[INET6_ADDRSTRLEN + 1];
+ const char *ip_addr = cl_init->ip_addr;
+ struct nfs_client *clp = nfs_alloc_client(cl_init);
+ int err;
+
+ if (IS_ERR(clp))
+ return clp;
+
+ err = nfs_get_cb_ident_idr(clp, cl_init->minorversion);
+ if (err)
+ goto error;
+
+ if (cl_init->minorversion > NFS4_MAX_MINOR_VERSION) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ spin_lock_init(&clp->cl_lock);
+ INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
+ INIT_LIST_HEAD(&clp->cl_ds_clients);
+ rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
+ clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
+ clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
+ clp->cl_mig_gen = 1;
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+ init_waitqueue_head(&clp->cl_lock_waitq);
+#endif
+ INIT_LIST_HEAD(&clp->pending_cb_stateids);
+
+ if (cl_init->minorversion != 0)
+ __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags);
+ __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
+ __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
+
+ if (test_bit(NFS_CS_DS, &cl_init->init_flags))
+ __set_bit(NFS_CS_DS, &clp->cl_flags);
+ /*
+ * Set up the connection to the server before we add add to the
+ * global list.
+ */
+ err = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I);
+ if (err == -EINVAL)
+ err = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
+ if (err < 0)
+ goto error;
+
+ /* If no clientaddr= option was specified, find a usable cb address */
+ if (ip_addr == NULL) {
+ struct sockaddr_storage cb_addr;
+ struct sockaddr *sap = (struct sockaddr *)&cb_addr;
+
+ err = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr));
+ if (err < 0)
+ goto error;
+ err = rpc_ntop(sap, buf, sizeof(buf));
+ if (err < 0)
+ goto error;
+ ip_addr = (const char *)buf;
+ }
+ strscpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
+
+ err = nfs_idmap_new(clp);
+ if (err < 0) {
+ dprintk("%s: failed to create idmapper. Error = %d\n",
+ __func__, err);
+ goto error;
+ }
+ __set_bit(NFS_CS_IDMAP, &clp->cl_res_state);
+ return clp;
+
+error:
+ nfs_free_client(clp);
+ return ERR_PTR(err);
+}
+
+/*
+ * Destroy the NFS4 callback service
+ */
+static void nfs4_destroy_callback(struct nfs_client *clp)
+{
+ if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
+ nfs_callback_down(clp->cl_mvops->minor_version, clp->cl_net);
+}
+
+static void nfs4_shutdown_client(struct nfs_client *clp)
+{
+ if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
+ nfs4_kill_renewd(clp);
+ clp->cl_mvops->shutdown_client(clp);
+ nfs4_destroy_callback(clp);
+ if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
+ nfs_idmap_delete(clp);
+
+ rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
+ kfree(clp->cl_serverowner);
+ kfree(clp->cl_serverscope);
+ kfree(clp->cl_implid);
+ kfree(clp->cl_owner_id);
+}
+
+void nfs4_free_client(struct nfs_client *clp)
+{
+ nfs4_shutdown_client(clp);
+ nfs_free_client(clp);
+}
+
+/*
+ * Initialize the NFS4 callback service
+ */
+static int nfs4_init_callback(struct nfs_client *clp)
+{
+ struct rpc_xprt *xprt;
+ int error;
+
+ xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt);
+
+ if (nfs4_has_session(clp)) {
+ error = xprt_setup_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
+ if (error < 0)
+ return error;
+ }
+
+ error = nfs_callback_up(clp->cl_mvops->minor_version, xprt);
+ if (error < 0) {
+ dprintk("%s: failed to start callback. Error = %d\n",
+ __func__, error);
+ return error;
+ }
+ __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
+
+ return 0;
+}
+
+/**
+ * nfs40_init_client - nfs_client initialization tasks for NFSv4.0
+ * @clp: nfs_client to initialize
+ *
+ * Returns zero on success, or a negative errno if some error occurred.
+ */
+int nfs40_init_client(struct nfs_client *clp)
+{
+ struct nfs4_slot_table *tbl;
+ int ret;
+
+ tbl = kzalloc(sizeof(*tbl), GFP_NOFS);
+ if (tbl == NULL)
+ return -ENOMEM;
+
+ ret = nfs4_setup_slot_table(tbl, NFS4_MAX_SLOT_TABLE,
+ "NFSv4.0 transport Slot table");
+ if (ret) {
+ nfs4_shutdown_slot_table(tbl);
+ kfree(tbl);
+ return ret;
+ }
+
+ clp->cl_slot_tbl = tbl;
+ return 0;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+/**
+ * nfs41_init_client - nfs_client initialization tasks for NFSv4.1+
+ * @clp: nfs_client to initialize
+ *
+ * Returns zero on success, or a negative errno if some error occurred.
+ */
+int nfs41_init_client(struct nfs_client *clp)
+{
+ struct nfs4_session *session = NULL;
+
+ /*
+ * Create the session and mark it expired.
+ * When a SEQUENCE operation encounters the expired session
+ * it will do session recovery to initialize it.
+ */
+ session = nfs4_alloc_session(clp);
+ if (!session)
+ return -ENOMEM;
+
+ clp->cl_session = session;
+
+ /*
+ * The create session reply races with the server back
+ * channel probe. Mark the client NFS_CS_SESSION_INITING
+ * so that the client back channel can find the
+ * nfs_client struct
+ */
+ nfs_mark_client_ready(clp, NFS_CS_SESSION_INITING);
+ return 0;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Initialize the minor version specific parts of an NFS4 client record
+ */
+static int nfs4_init_client_minor_version(struct nfs_client *clp)
+{
+ int ret;
+
+ ret = clp->cl_mvops->init_client(clp);
+ if (ret)
+ return ret;
+ return nfs4_init_callback(clp);
+}
+
+static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old)
+{
+ struct sockaddr_storage clp_addr, old_addr;
+ struct sockaddr *clp_sap = (struct sockaddr *)&clp_addr;
+ struct sockaddr *old_sap = (struct sockaddr *)&old_addr;
+ size_t clp_salen;
+ struct xprt_create xprt_args = {
+ .ident = old->cl_proto,
+ .net = old->cl_net,
+ .servername = old->cl_hostname,
+ };
+ int max_connect = test_bit(NFS_CS_PNFS, &clp->cl_flags) ?
+ clp->cl_max_connect : old->cl_max_connect;
+
+ if (clp->cl_proto != old->cl_proto)
+ return;
+ clp_salen = rpc_peeraddr(clp->cl_rpcclient, clp_sap, sizeof(clp_addr));
+ rpc_peeraddr(old->cl_rpcclient, old_sap, sizeof(old_addr));
+
+ if (clp_addr.ss_family != old_addr.ss_family)
+ return;
+
+ xprt_args.dstaddr = clp_sap;
+ xprt_args.addrlen = clp_salen;
+
+ rpc_clnt_add_xprt(old->cl_rpcclient, &xprt_args,
+ rpc_clnt_test_and_add_xprt, &max_connect);
+}
+
+/**
+ * nfs4_init_client - Initialise an NFS4 client record
+ *
+ * @clp: nfs_client to initialise
+ * @cl_init: pointer to nfs_client_initdata
+ *
+ * Returns pointer to an NFS client, or an ERR_PTR value.
+ */
+struct nfs_client *nfs4_init_client(struct nfs_client *clp,
+ const struct nfs_client_initdata *cl_init)
+{
+ struct nfs_client *old;
+ int error;
+
+ if (clp->cl_cons_state == NFS_CS_READY)
+ /* the client is initialised already */
+ return clp;
+
+ error = nfs4_init_client_minor_version(clp);
+ if (error < 0)
+ goto error;
+
+ error = nfs4_discover_server_trunking(clp, &old);
+ if (error < 0)
+ goto error;
+
+ if (clp != old) {
+ clp->cl_preserve_clid = true;
+ /*
+ * Mark the client as having failed initialization so other
+ * processes walking the nfs_client_list in nfs_match_client()
+ * won't try to use it.
+ */
+ nfs_mark_client_ready(clp, -EPERM);
+ if (old->cl_mvops->session_trunk)
+ nfs4_add_trunk(clp, old);
+ }
+ clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags);
+ nfs_put_client(clp);
+ return old;
+
+error:
+ nfs_mark_client_ready(clp, error);
+ nfs_put_client(clp);
+ return ERR_PTR(error);
+}
+
+/*
+ * SETCLIENTID just did a callback update with the callback ident in
+ * "drop," but server trunking discovery claims "drop" and "keep" are
+ * actually the same server. Swap the callback IDs so that "keep"
+ * will continue to use the callback ident the server now knows about,
+ * and so that "keep"'s original callback ident is destroyed when
+ * "drop" is freed.
+ */
+static void nfs4_swap_callback_idents(struct nfs_client *keep,
+ struct nfs_client *drop)
+{
+ struct nfs_net *nn = net_generic(keep->cl_net, nfs_net_id);
+ unsigned int save = keep->cl_cb_ident;
+
+ if (keep->cl_cb_ident == drop->cl_cb_ident)
+ return;
+
+ dprintk("%s: keeping callback ident %u and dropping ident %u\n",
+ __func__, keep->cl_cb_ident, drop->cl_cb_ident);
+
+ spin_lock(&nn->nfs_client_lock);
+
+ idr_replace(&nn->cb_ident_idr, keep, drop->cl_cb_ident);
+ keep->cl_cb_ident = drop->cl_cb_ident;
+
+ idr_replace(&nn->cb_ident_idr, drop, save);
+ drop->cl_cb_ident = save;
+
+ spin_unlock(&nn->nfs_client_lock);
+}
+
+static bool nfs4_match_client_owner_id(const struct nfs_client *clp1,
+ const struct nfs_client *clp2)
+{
+ if (clp1->cl_owner_id == NULL || clp2->cl_owner_id == NULL)
+ return true;
+ return strcmp(clp1->cl_owner_id, clp2->cl_owner_id) == 0;
+}
+
+static bool nfs4_same_verifier(nfs4_verifier *v1, nfs4_verifier *v2)
+{
+ return memcmp(v1->data, v2->data, sizeof(v1->data)) == 0;
+}
+
+static int nfs4_match_client(struct nfs_client *pos, struct nfs_client *new,
+ struct nfs_client **prev, struct nfs_net *nn)
+{
+ int status;
+
+ if (pos->rpc_ops != new->rpc_ops)
+ return 1;
+
+ if (pos->cl_minorversion != new->cl_minorversion)
+ return 1;
+
+ /* If "pos" isn't marked ready, we can't trust the
+ * remaining fields in "pos", especially the client
+ * ID and serverowner fields. Wait for CREATE_SESSION
+ * to finish. */
+ if (pos->cl_cons_state > NFS_CS_READY) {
+ refcount_inc(&pos->cl_count);
+ spin_unlock(&nn->nfs_client_lock);
+
+ nfs_put_client(*prev);
+ *prev = pos;
+
+ status = nfs_wait_client_init_complete(pos);
+ spin_lock(&nn->nfs_client_lock);
+
+ if (status < 0)
+ return status;
+ }
+
+ if (pos->cl_cons_state != NFS_CS_READY)
+ return 1;
+
+ if (pos->cl_clientid != new->cl_clientid)
+ return 1;
+
+ /* NFSv4.1 always uses the uniform string, however someone
+ * might switch the uniquifier string on us.
+ */
+ if (!nfs4_match_client_owner_id(pos, new))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * nfs40_walk_client_list - Find server that recognizes a client ID
+ *
+ * @new: nfs_client with client ID to test
+ * @result: OUT: found nfs_client, or new
+ * @cred: credential to use for trunking test
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status.
+ * If zero is returned, an nfs_client pointer is planted in "result."
+ *
+ * NB: nfs40_walk_client_list() relies on the new nfs_client being
+ * the last nfs_client on the list.
+ */
+int nfs40_walk_client_list(struct nfs_client *new,
+ struct nfs_client **result,
+ const struct cred *cred)
+{
+ struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
+ struct nfs_client *pos, *prev = NULL;
+ struct nfs4_setclientid_res clid = {
+ .clientid = new->cl_clientid,
+ .confirm = new->cl_confirm,
+ };
+ int status = -NFS4ERR_STALE_CLIENTID;
+
+ spin_lock(&nn->nfs_client_lock);
+ list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+ if (pos == new)
+ goto found;
+
+ status = nfs4_match_client(pos, new, &prev, nn);
+ if (status < 0)
+ goto out_unlock;
+ if (status != 0)
+ continue;
+ /*
+ * We just sent a new SETCLIENTID, which should have
+ * caused the server to return a new cl_confirm. So if
+ * cl_confirm is the same, then this is a different
+ * server that just returned the same cl_confirm by
+ * coincidence:
+ */
+ if ((new != pos) && nfs4_same_verifier(&pos->cl_confirm,
+ &new->cl_confirm))
+ continue;
+ /*
+ * But if the cl_confirm's are different, then the only
+ * way that a SETCLIENTID_CONFIRM to pos can succeed is
+ * if new and pos point to the same server:
+ */
+found:
+ refcount_inc(&pos->cl_count);
+ spin_unlock(&nn->nfs_client_lock);
+
+ nfs_put_client(prev);
+ prev = pos;
+
+ status = nfs4_proc_setclientid_confirm(pos, &clid, cred);
+ switch (status) {
+ case -NFS4ERR_STALE_CLIENTID:
+ break;
+ case 0:
+ nfs4_swap_callback_idents(pos, new);
+ pos->cl_confirm = new->cl_confirm;
+ nfs_mark_client_ready(pos, NFS_CS_READY);
+
+ prev = NULL;
+ *result = pos;
+ goto out;
+ case -ERESTARTSYS:
+ case -ETIMEDOUT:
+ /* The callback path may have been inadvertently
+ * changed. Schedule recovery!
+ */
+ nfs4_schedule_path_down_recovery(pos);
+ goto out;
+ default:
+ goto out;
+ }
+
+ spin_lock(&nn->nfs_client_lock);
+ }
+out_unlock:
+ spin_unlock(&nn->nfs_client_lock);
+
+ /* No match found. The server lost our clientid */
+out:
+ nfs_put_client(prev);
+ return status;
+}
+
+#ifdef CONFIG_NFS_V4_1
+/*
+ * Returns true if the server major ids match
+ */
+bool
+nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1,
+ struct nfs41_server_owner *o2)
+{
+ if (o1->major_id_sz != o2->major_id_sz)
+ return false;
+ return memcmp(o1->major_id, o2->major_id, o1->major_id_sz) == 0;
+}
+
+/*
+ * Returns true if the server scopes match
+ */
+static bool
+nfs4_check_server_scope(struct nfs41_server_scope *s1,
+ struct nfs41_server_scope *s2)
+{
+ if (s1->server_scope_sz != s2->server_scope_sz)
+ return false;
+ return memcmp(s1->server_scope, s2->server_scope,
+ s1->server_scope_sz) == 0;
+}
+
+/**
+ * nfs4_detect_session_trunking - Checks for session trunking.
+ * @clp: original mount nfs_client
+ * @res: result structure from an exchange_id using the original mount
+ * nfs_client with a new multi_addr transport
+ * @xprt: pointer to the transport to add.
+ *
+ * Called after a successful EXCHANGE_ID on a multi-addr connection.
+ * Upon success, add the transport.
+ *
+ * Returns zero on success, otherwise -EINVAL
+ *
+ * Note: since the exchange_id for the new multi_addr transport uses the
+ * same nfs_client from the original mount, the cl_owner_id is reused,
+ * so eir_clientowner is the same.
+ */
+int nfs4_detect_session_trunking(struct nfs_client *clp,
+ struct nfs41_exchange_id_res *res,
+ struct rpc_xprt *xprt)
+{
+ /* Check eir_clientid */
+ if (clp->cl_clientid != res->clientid)
+ goto out_err;
+
+ /* Check eir_server_owner so_major_id */
+ if (!nfs4_check_serverowner_major_id(clp->cl_serverowner,
+ res->server_owner))
+ goto out_err;
+
+ /* Check eir_server_owner so_minor_id */
+ if (clp->cl_serverowner->minor_id != res->server_owner->minor_id)
+ goto out_err;
+
+ /* Check eir_server_scope */
+ if (!nfs4_check_server_scope(clp->cl_serverscope, res->server_scope))
+ goto out_err;
+
+ pr_info("NFS: %s: Session trunking succeeded for %s\n",
+ clp->cl_hostname,
+ xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+ return 0;
+out_err:
+ pr_info("NFS: %s: Session trunking failed for %s\n", clp->cl_hostname,
+ xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+ return -EINVAL;
+}
+
+/**
+ * nfs41_walk_client_list - Find nfs_client that matches a client/server owner
+ *
+ * @new: nfs_client with client ID to test
+ * @result: OUT: found nfs_client, or new
+ * @cred: credential to use for trunking test
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status.
+ * If zero is returned, an nfs_client pointer is planted in "result."
+ *
+ * NB: nfs41_walk_client_list() relies on the new nfs_client being
+ * the last nfs_client on the list.
+ */
+int nfs41_walk_client_list(struct nfs_client *new,
+ struct nfs_client **result,
+ const struct cred *cred)
+{
+ struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
+ struct nfs_client *pos, *prev = NULL;
+ int status = -NFS4ERR_STALE_CLIENTID;
+
+ spin_lock(&nn->nfs_client_lock);
+ list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+ if (pos == new)
+ goto found;
+
+ status = nfs4_match_client(pos, new, &prev, nn);
+ if (status < 0)
+ goto out;
+ if (status != 0)
+ continue;
+
+ /*
+ * Note that session trunking is just a special subcase of
+ * client id trunking. In either case, we want to fall back
+ * to using the existing nfs_client.
+ */
+ if (!nfs4_check_serverowner_major_id(pos->cl_serverowner,
+ new->cl_serverowner))
+ continue;
+
+found:
+ refcount_inc(&pos->cl_count);
+ *result = pos;
+ status = 0;
+ break;
+ }
+
+out:
+ spin_unlock(&nn->nfs_client_lock);
+ nfs_put_client(prev);
+ return status;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+static void nfs4_destroy_server(struct nfs_server *server)
+{
+ LIST_HEAD(freeme);
+
+ nfs_server_return_all_delegations(server);
+ unset_pnfs_layoutdriver(server);
+ nfs4_purge_state_owners(server, &freeme);
+ nfs4_free_state_owners(&freeme);
+}
+
+/*
+ * NFSv4.0 callback thread helper
+ *
+ * Find a client by callback identifier
+ */
+struct nfs_client *
+nfs4_find_client_ident(struct net *net, int cb_ident)
+{
+ struct nfs_client *clp;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ spin_lock(&nn->nfs_client_lock);
+ clp = idr_find(&nn->cb_ident_idr, cb_ident);
+ if (clp)
+ refcount_inc(&clp->cl_count);
+ spin_unlock(&nn->nfs_client_lock);
+ return clp;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/* Common match routine for v4.0 and v4.1 callback services */
+static bool nfs4_cb_match_client(const struct sockaddr *addr,
+ struct nfs_client *clp, u32 minorversion)
+{
+ struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
+
+ /* Don't match clients that failed to initialise */
+ if (!(clp->cl_cons_state == NFS_CS_READY ||
+ clp->cl_cons_state == NFS_CS_SESSION_INITING))
+ return false;
+
+ smp_rmb();
+
+ /* Match the version and minorversion */
+ if (clp->rpc_ops->version != 4 ||
+ clp->cl_minorversion != minorversion)
+ return false;
+
+ /* Match only the IP address, not the port number */
+ return rpc_cmp_addr(addr, clap);
+}
+
+/*
+ * NFSv4.1 callback thread helper
+ * For CB_COMPOUND calls, find a client by IP address, protocol version,
+ * minorversion, and sessionID
+ *
+ * Returns NULL if no such client
+ */
+struct nfs_client *
+nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
+ struct nfs4_sessionid *sid, u32 minorversion)
+{
+ struct nfs_client *clp;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ spin_lock(&nn->nfs_client_lock);
+ list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
+ if (!nfs4_cb_match_client(addr, clp, minorversion))
+ continue;
+
+ if (!nfs4_has_session(clp))
+ continue;
+
+ /* Match sessionid*/
+ if (memcmp(clp->cl_session->sess_id.data,
+ sid->data, NFS4_MAX_SESSIONID_LEN) != 0)
+ continue;
+
+ refcount_inc(&clp->cl_count);
+ spin_unlock(&nn->nfs_client_lock);
+ return clp;
+ }
+ spin_unlock(&nn->nfs_client_lock);
+ return NULL;
+}
+
+#else /* CONFIG_NFS_V4_1 */
+
+struct nfs_client *
+nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
+ struct nfs4_sessionid *sid, u32 minorversion)
+{
+ return NULL;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Set up an NFS4 client
+ */
+static int nfs4_set_client(struct nfs_server *server,
+ const char *hostname,
+ const struct sockaddr_storage *addr,
+ const size_t addrlen,
+ const char *ip_addr,
+ int proto, const struct rpc_timeout *timeparms,
+ u32 minorversion, unsigned int nconnect,
+ unsigned int max_connect,
+ struct net *net,
+ struct xprtsec_parms *xprtsec)
+{
+ struct nfs_client_initdata cl_init = {
+ .hostname = hostname,
+ .addr = addr,
+ .addrlen = addrlen,
+ .ip_addr = ip_addr,
+ .nfs_mod = &nfs_v4,
+ .proto = proto,
+ .minorversion = minorversion,
+ .net = net,
+ .timeparms = timeparms,
+ .cred = server->cred,
+ .xprtsec = *xprtsec,
+ };
+ struct nfs_client *clp;
+
+ if (minorversion == 0)
+ __set_bit(NFS_CS_REUSEPORT, &cl_init.init_flags);
+ else
+ cl_init.max_connect = max_connect;
+ switch (proto) {
+ case XPRT_TRANSPORT_TCP:
+ case XPRT_TRANSPORT_TCP_TLS:
+ cl_init.nconnect = nconnect;
+ }
+
+ if (server->flags & NFS_MOUNT_NORESVPORT)
+ __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+ if (server->options & NFS_OPTION_MIGRATION)
+ __set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
+ if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status))
+ __set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags);
+ server->port = rpc_get_port((struct sockaddr *)addr);
+
+ /* Allocate or find a client reference we can use */
+ clp = nfs_get_client(&cl_init);
+ if (IS_ERR(clp))
+ return PTR_ERR(clp);
+
+ if (server->nfs_client == clp) {
+ nfs_put_client(clp);
+ return -ELOOP;
+ }
+
+ /*
+ * Query for the lease time on clientid setup or renewal
+ *
+ * Note that this will be set on nfs_clients that were created
+ * only for the DS role and did not set this bit, but now will
+ * serve a dual role.
+ */
+ set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state);
+
+ server->nfs_client = clp;
+ nfs_sysfs_add_server(server);
+ nfs_sysfs_link_rpc_client(server, clp->cl_rpcclient, "_state");
+
+ return 0;
+}
+
+/*
+ * Set up a pNFS Data Server client.
+ *
+ * Return any existing nfs_client that matches server address,port,version
+ * and minorversion.
+ *
+ * For a new nfs_client, use a soft mount (default), a low retrans and a
+ * low timeout interval so that if a connection is lost, we retry through
+ * the MDS.
+ */
+struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
+ const struct sockaddr_storage *ds_addr, int ds_addrlen,
+ int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
+ u32 minor_version)
+{
+ struct rpc_timeout ds_timeout;
+ struct nfs_client *mds_clp = mds_srv->nfs_client;
+ struct nfs_client_initdata cl_init = {
+ .addr = ds_addr,
+ .addrlen = ds_addrlen,
+ .nodename = mds_clp->cl_rpcclient->cl_nodename,
+ .ip_addr = mds_clp->cl_ipaddr,
+ .nfs_mod = &nfs_v4,
+ .proto = ds_proto,
+ .minorversion = minor_version,
+ .net = mds_clp->cl_net,
+ .timeparms = &ds_timeout,
+ .cred = mds_srv->cred,
+ .xprtsec = mds_srv->nfs_client->cl_xprtsec,
+ };
+ char buf[INET6_ADDRSTRLEN + 1];
+
+ if (rpc_ntop((struct sockaddr *)ds_addr, buf, sizeof(buf)) <= 0)
+ return ERR_PTR(-EINVAL);
+ cl_init.hostname = buf;
+
+ switch (ds_proto) {
+ case XPRT_TRANSPORT_TCP:
+ case XPRT_TRANSPORT_TCP_TLS:
+ if (mds_clp->cl_nconnect > 1) {
+ cl_init.nconnect = mds_clp->cl_nconnect;
+ cl_init.max_connect = NFS_MAX_TRANSPORTS;
+ }
+ }
+
+ if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
+ __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+
+ __set_bit(NFS_CS_DS, &cl_init.init_flags);
+ __set_bit(NFS_CS_PNFS, &cl_init.init_flags);
+ cl_init.max_connect = NFS_MAX_TRANSPORTS;
+ /*
+ * Set an authflavor equual to the MDS value. Use the MDS nfs_client
+ * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
+ * (section 13.1 RFC 5661).
+ */
+ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
+ return nfs_get_client(&cl_init);
+}
+EXPORT_SYMBOL_GPL(nfs4_set_ds_client);
+
+/*
+ * Session has been established, and the client marked ready.
+ * Limit the mount rsize, wsize and dtsize using negotiated fore
+ * channel attributes.
+ */
+static void nfs4_session_limit_rwsize(struct nfs_server *server)
+{
+#ifdef CONFIG_NFS_V4_1
+ struct nfs4_session *sess;
+ u32 server_resp_sz;
+ u32 server_rqst_sz;
+
+ if (!nfs4_has_session(server->nfs_client))
+ return;
+ sess = server->nfs_client->cl_session;
+ server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead;
+ server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead;
+
+ if (server->dtsize > server_resp_sz)
+ server->dtsize = server_resp_sz;
+ if (server->rsize > server_resp_sz)
+ server->rsize = server_resp_sz;
+ if (server->wsize > server_rqst_sz)
+ server->wsize = server_rqst_sz;
+#endif /* CONFIG_NFS_V4_1 */
+}
+
+/*
+ * Limit xattr sizes using the channel attributes.
+ */
+static void nfs4_session_limit_xasize(struct nfs_server *server)
+{
+#ifdef CONFIG_NFS_V4_2
+ struct nfs4_session *sess;
+ u32 server_gxa_sz;
+ u32 server_sxa_sz;
+ u32 server_lxa_sz;
+
+ if (!nfs4_has_session(server->nfs_client))
+ return;
+
+ sess = server->nfs_client->cl_session;
+
+ server_gxa_sz = sess->fc_attrs.max_resp_sz - nfs42_maxgetxattr_overhead;
+ server_sxa_sz = sess->fc_attrs.max_rqst_sz - nfs42_maxsetxattr_overhead;
+ server_lxa_sz = sess->fc_attrs.max_resp_sz -
+ nfs42_maxlistxattrs_overhead;
+
+ if (server->gxasize > server_gxa_sz)
+ server->gxasize = server_gxa_sz;
+ if (server->sxasize > server_sxa_sz)
+ server->sxasize = server_sxa_sz;
+ if (server->lxasize > server_lxa_sz)
+ server->lxasize = server_lxa_sz;
+#endif
+}
+
+void nfs4_server_set_init_caps(struct nfs_server *server)
+{
+ /* Set the basic capabilities */
+ server->caps |= server->nfs_client->cl_mvops->init_caps;
+ if (server->flags & NFS_MOUNT_NORDIRPLUS)
+ server->caps &= ~NFS_CAP_READDIRPLUS;
+ if (server->nfs_client->cl_proto == XPRT_TRANSPORT_RDMA)
+ server->caps &= ~NFS_CAP_READ_PLUS;
+
+ /*
+ * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
+ * authentication.
+ */
+ if (nfs4_disable_idmapping &&
+ server->client->cl_auth->au_flavor == RPC_AUTH_UNIX)
+ server->caps |= NFS_CAP_UIDGID_NOMAP;
+}
+
+static int nfs4_server_common_setup(struct nfs_server *server,
+ struct nfs_fh *mntfh, bool auth_probe)
+{
+ int error;
+
+ /* data servers support only a subset of NFSv4.1 */
+ if (is_ds_only_client(server->nfs_client))
+ return -EPROTONOSUPPORT;
+
+ /* We must ensure the session is initialised first */
+ error = nfs4_init_session(server->nfs_client);
+ if (error < 0)
+ goto out;
+
+ nfs4_server_set_init_caps(server);
+
+ /* Probe the root fh to retrieve its FSID and filehandle */
+ error = nfs4_get_rootfh(server, mntfh, auth_probe);
+ if (error < 0)
+ goto out;
+
+ dprintk("Server FSID: %llx:%llx\n",
+ (unsigned long long) server->fsid.major,
+ (unsigned long long) server->fsid.minor);
+ nfs_display_fhandle(mntfh, "Pseudo-fs root FH");
+
+ error = nfs_probe_server(server, mntfh);
+ if (error < 0)
+ goto out;
+
+ nfs4_session_limit_rwsize(server);
+ nfs4_session_limit_xasize(server);
+
+ if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
+ server->namelen = NFS4_MAXNAMLEN;
+
+ nfs_server_insert_lists(server);
+ server->mount_time = jiffies;
+ server->destroy = nfs4_destroy_server;
+out:
+ return error;
+}
+
+/*
+ * Create a version 4 volume record
+ */
+static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct rpc_timeout timeparms;
+ int error;
+
+ nfs_init_timeout_values(&timeparms, ctx->nfs_server.protocol,
+ ctx->timeo, ctx->retrans);
+
+ /* Initialise the client representation from the mount data */
+ server->flags = ctx->flags;
+ server->options = ctx->options;
+ server->auth_info = ctx->auth_info;
+
+ /* Use the first specified auth flavor. If this flavor isn't
+ * allowed by the server, use the SECINFO path to try the
+ * other specified flavors */
+ if (ctx->auth_info.flavor_len >= 1)
+ ctx->selected_flavor = ctx->auth_info.flavors[0];
+ else
+ ctx->selected_flavor = RPC_AUTH_UNIX;
+
+ /* Get a client record */
+ error = nfs4_set_client(server,
+ ctx->nfs_server.hostname,
+ &ctx->nfs_server._address,
+ ctx->nfs_server.addrlen,
+ ctx->client_address,
+ ctx->nfs_server.protocol,
+ &timeparms,
+ ctx->minorversion,
+ ctx->nfs_server.nconnect,
+ ctx->nfs_server.max_connect,
+ fc->net_ns,
+ &ctx->xprtsec);
+ if (error < 0)
+ return error;
+
+ if (ctx->rsize)
+ server->rsize = nfs_io_size(ctx->rsize, server->nfs_client->cl_proto);
+ if (ctx->wsize)
+ server->wsize = nfs_io_size(ctx->wsize, server->nfs_client->cl_proto);
+
+ server->acregmin = ctx->acregmin * HZ;
+ server->acregmax = ctx->acregmax * HZ;
+ server->acdirmin = ctx->acdirmin * HZ;
+ server->acdirmax = ctx->acdirmax * HZ;
+ server->port = ctx->nfs_server.port;
+
+ return nfs_init_server_rpcclient(server, &timeparms,
+ ctx->selected_flavor);
+}
+
+/*
+ * Create a version 4 volume record
+ * - keyed on server and FSID
+ */
+struct nfs_server *nfs4_create_server(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct nfs_server *server;
+ bool auth_probe;
+ int error;
+
+ server = nfs_alloc_server();
+ if (!server)
+ return ERR_PTR(-ENOMEM);
+
+ server->cred = get_cred(fc->cred);
+
+ auth_probe = ctx->auth_info.flavor_len < 1;
+
+ /* set up the general RPC client */
+ error = nfs4_init_server(server, fc);
+ if (error < 0)
+ goto error;
+
+ error = nfs4_server_common_setup(server, ctx->mntfh, auth_probe);
+ if (error < 0)
+ goto error;
+
+ return server;
+
+error:
+ nfs_free_server(server);
+ return ERR_PTR(error);
+}
+
+/*
+ * Create an NFS4 referral server record
+ */
+struct nfs_server *nfs4_create_referral_server(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct nfs_client *parent_client;
+ struct nfs_server *server, *parent_server;
+ int proto, error;
+ bool auth_probe;
+
+ server = nfs_alloc_server();
+ if (!server)
+ return ERR_PTR(-ENOMEM);
+
+ parent_server = NFS_SB(ctx->clone_data.sb);
+ parent_client = parent_server->nfs_client;
+
+ server->cred = get_cred(parent_server->cred);
+
+ /* Initialise the client representation from the parent server */
+ nfs_server_copy_userdata(server, parent_server);
+
+ /* Get a client representation */
+#if IS_ENABLED(CONFIG_SUNRPC_XPRT_RDMA)
+ rpc_set_port(&ctx->nfs_server.address, NFS_RDMA_PORT);
+ error = nfs4_set_client(server,
+ ctx->nfs_server.hostname,
+ &ctx->nfs_server._address,
+ ctx->nfs_server.addrlen,
+ parent_client->cl_ipaddr,
+ XPRT_TRANSPORT_RDMA,
+ parent_server->client->cl_timeout,
+ parent_client->cl_mvops->minor_version,
+ parent_client->cl_nconnect,
+ parent_client->cl_max_connect,
+ parent_client->cl_net,
+ &parent_client->cl_xprtsec);
+ if (!error)
+ goto init_server;
+#endif /* IS_ENABLED(CONFIG_SUNRPC_XPRT_RDMA) */
+
+ proto = XPRT_TRANSPORT_TCP;
+ if (parent_client->cl_xprtsec.policy != RPC_XPRTSEC_NONE)
+ proto = XPRT_TRANSPORT_TCP_TLS;
+ rpc_set_port(&ctx->nfs_server.address, NFS_PORT);
+ error = nfs4_set_client(server,
+ ctx->nfs_server.hostname,
+ &ctx->nfs_server._address,
+ ctx->nfs_server.addrlen,
+ parent_client->cl_ipaddr,
+ proto,
+ parent_server->client->cl_timeout,
+ parent_client->cl_mvops->minor_version,
+ parent_client->cl_nconnect,
+ parent_client->cl_max_connect,
+ parent_client->cl_net,
+ &parent_client->cl_xprtsec);
+ if (error < 0)
+ goto error;
+
+#if IS_ENABLED(CONFIG_SUNRPC_XPRT_RDMA)
+init_server:
+#endif
+ error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout,
+ ctx->selected_flavor);
+ if (error < 0)
+ goto error;
+
+ auth_probe = parent_server->auth_info.flavor_len < 1;
+
+ error = nfs4_server_common_setup(server, ctx->mntfh, auth_probe);
+ if (error < 0)
+ goto error;
+
+ return server;
+
+error:
+ nfs_free_server(server);
+ return ERR_PTR(error);
+}
+
+/**
+ * nfs4_update_server - Move an nfs_server to a different nfs_client
+ *
+ * @server: represents FSID to be moved
+ * @hostname: new end-point's hostname
+ * @sap: new end-point's socket address
+ * @salen: size of "sap"
+ * @net: net namespace
+ *
+ * The nfs_server must be quiescent before this function is invoked.
+ * Either its session is drained (NFSv4.1+), or its transport is
+ * plugged and drained (NFSv4.0).
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs4_update_server(struct nfs_server *server, const char *hostname,
+ struct sockaddr_storage *sap, size_t salen, struct net *net)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct rpc_clnt *clnt = server->client;
+ struct xprt_create xargs = {
+ .ident = clp->cl_proto,
+ .net = net,
+ .dstaddr = (struct sockaddr *)sap,
+ .addrlen = salen,
+ .servername = hostname,
+ /* cel: bleh. We might need to pass TLS parameters here */
+ };
+ char buf[INET6_ADDRSTRLEN + 1];
+ struct sockaddr_storage address;
+ struct sockaddr *localaddr = (struct sockaddr *)&address;
+ int error;
+
+ error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout);
+ if (error != 0)
+ return error;
+
+ error = rpc_localaddr(clnt, localaddr, sizeof(address));
+ if (error != 0)
+ return error;
+
+ if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0)
+ return -EAFNOSUPPORT;
+
+ nfs_server_remove_lists(server);
+ set_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status);
+ error = nfs4_set_client(server, hostname, sap, salen, buf,
+ clp->cl_proto, clnt->cl_timeout,
+ clp->cl_minorversion,
+ clp->cl_nconnect, clp->cl_max_connect,
+ net, &clp->cl_xprtsec);
+ clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status);
+ if (error != 0) {
+ nfs_server_insert_lists(server);
+ return error;
+ }
+ nfs_put_client(clp);
+
+ if (server->nfs_client->cl_hostname == NULL) {
+ server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL);
+ if (server->nfs_client->cl_hostname == NULL)
+ return -ENOMEM;
+ }
+ nfs_server_insert_lists(server);
+
+ return nfs_probe_server(server, NFS_FH(d_inode(server->super->s_root)));
+}
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
new file mode 100644
index 0000000000..02788c3c85
--- /dev/null
+++ b/fs/nfs/nfs4file.c
@@ -0,0 +1,469 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/file.c
+ *
+ * Copyright (C) 1992 Rick Sladkey
+ */
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/falloc.h>
+#include <linux/mount.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_ssc.h>
+#include "delegation.h"
+#include "internal.h"
+#include "iostat.h"
+#include "fscache.h"
+#include "pnfs.h"
+
+#include "nfstrace.h"
+
+#ifdef CONFIG_NFS_V4_2
+#include "nfs42.h"
+#endif
+
+#define NFSDBG_FACILITY NFSDBG_FILE
+
+static int
+nfs4_file_open(struct inode *inode, struct file *filp)
+{
+ struct nfs_open_context *ctx;
+ struct dentry *dentry = file_dentry(filp);
+ struct dentry *parent = NULL;
+ struct inode *dir;
+ unsigned openflags = filp->f_flags;
+ struct iattr attr;
+ int err;
+
+ /*
+ * If no cached dentry exists or if it's negative, NFSv4 handled the
+ * opens in ->lookup() or ->create().
+ *
+ * We only get this far for a cached positive dentry. We skipped
+ * revalidation, so handle it here by dropping the dentry and returning
+ * -EOPENSTALE. The VFS will retry the lookup/create/open.
+ */
+
+ dprintk("NFS: open file(%pd2)\n", dentry);
+
+ err = nfs_check_flags(openflags);
+ if (err)
+ return err;
+
+ /* We can't create new files here */
+ openflags &= ~(O_CREAT|O_EXCL);
+
+ parent = dget_parent(dentry);
+ dir = d_inode(parent);
+
+ ctx = alloc_nfs_open_context(file_dentry(filp),
+ flags_to_mode(openflags), filp);
+ err = PTR_ERR(ctx);
+ if (IS_ERR(ctx))
+ goto out;
+
+ attr.ia_valid = ATTR_OPEN;
+ if (openflags & O_TRUNC) {
+ attr.ia_valid |= ATTR_SIZE;
+ attr.ia_size = 0;
+ filemap_write_and_wait(inode->i_mapping);
+ }
+
+ inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ switch (err) {
+ default:
+ goto out_put_ctx;
+ case -ENOENT:
+ case -ESTALE:
+ case -EISDIR:
+ case -ENOTDIR:
+ case -ELOOP:
+ goto out_drop;
+ }
+ }
+ if (inode != d_inode(dentry))
+ goto out_drop;
+
+ nfs_file_set_open_context(filp, ctx);
+ nfs_fscache_open_file(inode, filp);
+ err = 0;
+ filp->f_mode |= FMODE_CAN_ODIRECT;
+
+out_put_ctx:
+ put_nfs_open_context(ctx);
+out:
+ dput(parent);
+ return err;
+
+out_drop:
+ d_drop(dentry);
+ err = -EOPENSTALE;
+ goto out_put_ctx;
+}
+
+/*
+ * Flush all dirty pages, and check for write errors.
+ */
+static int
+nfs4_file_flush(struct file *file, fl_owner_t id)
+{
+ struct inode *inode = file_inode(file);
+ errseq_t since;
+
+ dprintk("NFS: flush(%pD2)\n", file);
+
+ nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
+ if ((file->f_mode & FMODE_WRITE) == 0)
+ return 0;
+
+ /*
+ * If we're holding a write delegation, then check if we're required
+ * to flush the i/o on close. If not, then just start the i/o now.
+ */
+ if (!nfs4_delegation_flush_on_close(inode))
+ return filemap_fdatawrite(file->f_mapping);
+
+ /* Flush writes to the server and return any errors */
+ since = filemap_sample_wb_err(file->f_mapping);
+ nfs_wb_all(inode);
+ return filemap_check_wb_err(file->f_mapping, since);
+}
+
+#ifdef CONFIG_NFS_V4_2
+static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t count, unsigned int flags)
+{
+ struct nfs42_copy_notify_res *cn_resp = NULL;
+ struct nl4_server *nss = NULL;
+ nfs4_stateid *cnrs = NULL;
+ ssize_t ret;
+ bool sync = false;
+
+ /* Only offload copy if superblock is the same */
+ if (file_in->f_op != &nfs4_file_operations)
+ return -EXDEV;
+ if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY) ||
+ !nfs_server_capable(file_inode(file_in), NFS_CAP_COPY))
+ return -EOPNOTSUPP;
+ if (file_inode(file_in) == file_inode(file_out))
+ return -EOPNOTSUPP;
+ /* if the copy size if smaller than 2 RPC payloads, make it
+ * synchronous
+ */
+ if (count <= 2 * NFS_SERVER(file_inode(file_in))->rsize)
+ sync = true;
+retry:
+ if (!nfs42_files_from_same_server(file_in, file_out)) {
+ /*
+ * for inter copy, if copy size is too small
+ * then fallback to generic copy.
+ */
+ if (sync)
+ return -EOPNOTSUPP;
+ cn_resp = kzalloc(sizeof(struct nfs42_copy_notify_res),
+ GFP_KERNEL);
+ if (unlikely(cn_resp == NULL))
+ return -ENOMEM;
+
+ ret = nfs42_proc_copy_notify(file_in, file_out, cn_resp);
+ if (ret) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ nss = &cn_resp->cnr_src;
+ cnrs = &cn_resp->cnr_stateid;
+ }
+ ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count,
+ nss, cnrs, sync);
+out:
+ kfree(cn_resp);
+
+ if (ret == -EAGAIN)
+ goto retry;
+ return ret;
+}
+
+static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t count, unsigned int flags)
+{
+ ssize_t ret;
+
+ ret = __nfs4_copy_file_range(file_in, pos_in, file_out, pos_out, count,
+ flags);
+ if (ret == -EOPNOTSUPP || ret == -EXDEV)
+ ret = generic_copy_file_range(file_in, pos_in, file_out,
+ pos_out, count, flags);
+ return ret;
+}
+
+static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
+{
+ loff_t ret;
+
+ switch (whence) {
+ case SEEK_HOLE:
+ case SEEK_DATA:
+ ret = nfs42_proc_llseek(filep, offset, whence);
+ if (ret != -EOPNOTSUPP)
+ return ret;
+ fallthrough;
+ default:
+ return nfs_file_llseek(filep, offset, whence);
+ }
+}
+
+static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t len)
+{
+ struct inode *inode = file_inode(filep);
+ long ret;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if ((mode != 0) && (mode != (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)))
+ return -EOPNOTSUPP;
+
+ ret = inode_newsize_ok(inode, offset + len);
+ if (ret < 0)
+ return ret;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return nfs42_proc_deallocate(filep, offset, len);
+ return nfs42_proc_allocate(filep, offset, len);
+}
+
+static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off, loff_t count,
+ unsigned int remap_flags)
+{
+ struct inode *dst_inode = file_inode(dst_file);
+ struct nfs_server *server = NFS_SERVER(dst_inode);
+ struct inode *src_inode = file_inode(src_file);
+ unsigned int bs = server->clone_blksize;
+ bool same_inode = false;
+ int ret;
+
+ /* NFS does not support deduplication. */
+ if (remap_flags & REMAP_FILE_DEDUP)
+ return -EOPNOTSUPP;
+
+ if (remap_flags & ~REMAP_FILE_ADVISORY)
+ return -EINVAL;
+
+ if (IS_SWAPFILE(dst_inode) || IS_SWAPFILE(src_inode))
+ return -ETXTBSY;
+
+ /* check alignment w.r.t. clone_blksize */
+ ret = -EINVAL;
+ if (bs) {
+ if (!IS_ALIGNED(src_off, bs) || !IS_ALIGNED(dst_off, bs))
+ goto out;
+ if (!IS_ALIGNED(count, bs) && i_size_read(src_inode) != (src_off + count))
+ goto out;
+ }
+
+ if (src_inode == dst_inode)
+ same_inode = true;
+
+ /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
+ if (same_inode) {
+ inode_lock(src_inode);
+ } else if (dst_inode < src_inode) {
+ inode_lock_nested(dst_inode, I_MUTEX_PARENT);
+ inode_lock_nested(src_inode, I_MUTEX_CHILD);
+ } else {
+ inode_lock_nested(src_inode, I_MUTEX_PARENT);
+ inode_lock_nested(dst_inode, I_MUTEX_CHILD);
+ }
+
+ /* flush all pending writes on both src and dst so that server
+ * has the latest data */
+ ret = nfs_sync_inode(src_inode);
+ if (ret)
+ goto out_unlock;
+ ret = nfs_sync_inode(dst_inode);
+ if (ret)
+ goto out_unlock;
+
+ ret = nfs42_proc_clone(src_file, dst_file, src_off, dst_off, count);
+
+ /* truncate inode page cache of the dst range so that future reads can fetch
+ * new data from server */
+ if (!ret)
+ truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1);
+
+out_unlock:
+ if (same_inode) {
+ inode_unlock(src_inode);
+ } else if (dst_inode < src_inode) {
+ inode_unlock(src_inode);
+ inode_unlock(dst_inode);
+ } else {
+ inode_unlock(dst_inode);
+ inode_unlock(src_inode);
+ }
+out:
+ return ret < 0 ? ret : count;
+}
+
+static int read_name_gen = 1;
+#define SSC_READ_NAME_BODY "ssc_read_%d"
+
+static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
+ struct nfs_fh *src_fh, nfs4_stateid *stateid)
+{
+ struct nfs_fattr *fattr = nfs_alloc_fattr();
+ struct file *filep, *res;
+ struct nfs_server *server;
+ struct inode *r_ino = NULL;
+ struct nfs_open_context *ctx;
+ struct nfs4_state_owner *sp;
+ char *read_name = NULL;
+ int len, status = 0;
+
+ server = NFS_SB(ss_mnt->mnt_sb);
+
+ if (!fattr)
+ return ERR_PTR(-ENOMEM);
+
+ status = nfs4_proc_getattr(server, src_fh, fattr, NULL);
+ if (status < 0) {
+ res = ERR_PTR(status);
+ goto out;
+ }
+
+ if (!S_ISREG(fattr->mode)) {
+ res = ERR_PTR(-EBADF);
+ goto out;
+ }
+
+ res = ERR_PTR(-ENOMEM);
+ len = strlen(SSC_READ_NAME_BODY) + 16;
+ read_name = kzalloc(len, GFP_KERNEL);
+ if (read_name == NULL)
+ goto out;
+ snprintf(read_name, len, SSC_READ_NAME_BODY, read_name_gen++);
+
+ r_ino = nfs_fhget(ss_mnt->mnt_sb, src_fh, fattr);
+ if (IS_ERR(r_ino)) {
+ res = ERR_CAST(r_ino);
+ goto out_free_name;
+ }
+
+ filep = alloc_file_pseudo(r_ino, ss_mnt, read_name, O_RDONLY,
+ r_ino->i_fop);
+ if (IS_ERR(filep)) {
+ res = ERR_CAST(filep);
+ iput(r_ino);
+ goto out_free_name;
+ }
+
+ ctx = alloc_nfs_open_context(filep->f_path.dentry,
+ flags_to_mode(filep->f_flags), filep);
+ if (IS_ERR(ctx)) {
+ res = ERR_CAST(ctx);
+ goto out_filep;
+ }
+
+ res = ERR_PTR(-EINVAL);
+ sp = nfs4_get_state_owner(server, ctx->cred, GFP_KERNEL);
+ if (sp == NULL)
+ goto out_ctx;
+
+ ctx->state = nfs4_get_open_state(r_ino, sp);
+ if (ctx->state == NULL)
+ goto out_stateowner;
+
+ set_bit(NFS_SRV_SSC_COPY_STATE, &ctx->state->flags);
+ memcpy(&ctx->state->open_stateid.other, &stateid->other,
+ NFS4_STATEID_OTHER_SIZE);
+ update_open_stateid(ctx->state, stateid, NULL, filep->f_mode);
+ set_bit(NFS_OPEN_STATE, &ctx->state->flags);
+
+ nfs_file_set_open_context(filep, ctx);
+ put_nfs_open_context(ctx);
+
+ file_ra_state_init(&filep->f_ra, filep->f_mapping->host->i_mapping);
+ res = filep;
+out_free_name:
+ kfree(read_name);
+out:
+ nfs_free_fattr(fattr);
+ return res;
+out_stateowner:
+ nfs4_put_state_owner(sp);
+out_ctx:
+ put_nfs_open_context(ctx);
+out_filep:
+ fput(filep);
+ goto out_free_name;
+}
+
+static void __nfs42_ssc_close(struct file *filep)
+{
+ struct nfs_open_context *ctx = nfs_file_open_context(filep);
+
+ ctx->state->flags = 0;
+}
+
+static const struct nfs4_ssc_client_ops nfs4_ssc_clnt_ops_tbl = {
+ .sco_open = __nfs42_ssc_open,
+ .sco_close = __nfs42_ssc_close,
+};
+
+/**
+ * nfs42_ssc_register_ops - Wrapper to register NFS_V4 ops in nfs_common
+ *
+ * Return values:
+ * None
+ */
+void nfs42_ssc_register_ops(void)
+{
+ nfs42_ssc_register(&nfs4_ssc_clnt_ops_tbl);
+}
+
+/**
+ * nfs42_ssc_unregister_ops - wrapper to un-register NFS_V4 ops in nfs_common
+ *
+ * Return values:
+ * None.
+ */
+void nfs42_ssc_unregister_ops(void)
+{
+ nfs42_ssc_unregister(&nfs4_ssc_clnt_ops_tbl);
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+static int nfs4_setlease(struct file *file, int arg, struct file_lock **lease,
+ void **priv)
+{
+ return nfs4_proc_setlease(file, arg, lease, priv);
+}
+
+const struct file_operations nfs4_file_operations = {
+ .read_iter = nfs_file_read,
+ .write_iter = nfs_file_write,
+ .mmap = nfs_file_mmap,
+ .open = nfs4_file_open,
+ .flush = nfs4_file_flush,
+ .release = nfs_file_release,
+ .fsync = nfs_file_fsync,
+ .lock = nfs_lock,
+ .flock = nfs_flock,
+ .splice_read = nfs_file_splice_read,
+ .splice_write = iter_file_splice_write,
+ .check_flags = nfs_check_flags,
+ .setlease = nfs4_setlease,
+#ifdef CONFIG_NFS_V4_2
+ .copy_file_range = nfs4_copy_file_range,
+ .llseek = nfs4_file_llseek,
+ .fallocate = nfs42_fallocate,
+ .remap_file_range = nfs42_remap_file_range,
+#else
+ .llseek = nfs_file_llseek,
+#endif
+};
diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c
new file mode 100644
index 0000000000..1a69479a3a
--- /dev/null
+++ b/fs/nfs/nfs4getroot.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+* Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+* Written by David Howells (dhowells@redhat.com)
+*/
+
+#include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_CLIENT
+
+int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_probe)
+{
+ struct nfs_fsinfo fsinfo;
+ int ret = -ENOMEM;
+
+ fsinfo.fattr = nfs_alloc_fattr();
+ if (fsinfo.fattr == NULL)
+ goto out;
+
+ /* Start by getting the root filehandle from the server */
+ ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo, auth_probe);
+ if (ret < 0) {
+ dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret);
+ goto out;
+ }
+
+ if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE)
+ || !S_ISDIR(fsinfo.fattr->mode)) {
+ printk(KERN_ERR "nfs4_get_rootfh:"
+ " getroot encountered non-directory\n");
+ ret = -ENOTDIR;
+ goto out;
+ }
+
+ memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
+out:
+ nfs_free_fattr(fsinfo.fattr);
+ return ret;
+}
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
new file mode 100644
index 0000000000..25a7c771cf
--- /dev/null
+++ b/fs/nfs/nfs4idmap.c
@@ -0,0 +1,806 @@
+/*
+ * fs/nfs/idmap.c
+ *
+ * UID and GID to name mapping for clients.
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/types.h>
+#include <linux/parser.h>
+#include <linux/fs.h>
+#include <net/net_namespace.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <keys/user-type.h>
+#include <keys/request_key_auth-type.h>
+#include <linux/module.h>
+#include <linux/user_namespace.h>
+
+#include "internal.h"
+#include "netns.h"
+#include "nfs4idmap.h"
+#include "nfs4trace.h"
+
+#define NFS_UINT_MAXLEN 11
+
+static const struct cred *id_resolver_cache;
+static struct key_type key_type_id_resolver_legacy;
+
+struct idmap_legacy_upcalldata {
+ struct rpc_pipe_msg pipe_msg;
+ struct idmap_msg idmap_msg;
+ struct key *authkey;
+ struct idmap *idmap;
+};
+
+struct idmap {
+ struct rpc_pipe_dir_object idmap_pdo;
+ struct rpc_pipe *idmap_pipe;
+ struct idmap_legacy_upcalldata *idmap_upcall_data;
+ struct mutex idmap_mutex;
+ struct user_namespace *user_ns;
+};
+
+static struct user_namespace *idmap_userns(const struct idmap *idmap)
+{
+ if (idmap && idmap->user_ns)
+ return idmap->user_ns;
+ return &init_user_ns;
+}
+
+/**
+ * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
+ * @fattr: fully initialised struct nfs_fattr
+ * @owner_name: owner name string cache
+ * @group_name: group name string cache
+ */
+void nfs_fattr_init_names(struct nfs_fattr *fattr,
+ struct nfs4_string *owner_name,
+ struct nfs4_string *group_name)
+{
+ fattr->owner_name = owner_name;
+ fattr->group_name = group_name;
+}
+
+static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr)
+{
+ fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME;
+ kfree(fattr->owner_name->data);
+}
+
+static void nfs_fattr_free_group_name(struct nfs_fattr *fattr)
+{
+ fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME;
+ kfree(fattr->group_name->data);
+}
+
+static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+ struct nfs4_string *owner = fattr->owner_name;
+ kuid_t uid;
+
+ if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME))
+ return false;
+ if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) {
+ fattr->uid = uid;
+ fattr->valid |= NFS_ATTR_FATTR_OWNER;
+ }
+ return true;
+}
+
+static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+ struct nfs4_string *group = fattr->group_name;
+ kgid_t gid;
+
+ if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME))
+ return false;
+ if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) {
+ fattr->gid = gid;
+ fattr->valid |= NFS_ATTR_FATTR_GROUP;
+ }
+ return true;
+}
+
+/**
+ * nfs_fattr_free_names - free up the NFSv4 owner and group strings
+ * @fattr: a fully initialised nfs_fattr structure
+ */
+void nfs_fattr_free_names(struct nfs_fattr *fattr)
+{
+ if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)
+ nfs_fattr_free_owner_name(fattr);
+ if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)
+ nfs_fattr_free_group_name(fattr);
+}
+
+/**
+ * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free
+ * @server: pointer to the filesystem nfs_server structure
+ * @fattr: a fully initialised nfs_fattr structure
+ *
+ * This helper maps the cached NFSv4 owner/group strings in fattr into
+ * their numeric uid/gid equivalents, and then frees the cached strings.
+ */
+void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+ if (nfs_fattr_map_owner_name(server, fattr))
+ nfs_fattr_free_owner_name(fattr);
+ if (nfs_fattr_map_group_name(server, fattr))
+ nfs_fattr_free_group_name(fattr);
+}
+
+int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
+{
+ unsigned long val;
+ char buf[16];
+
+ if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf))
+ return 0;
+ memcpy(buf, name, namelen);
+ buf[namelen] = '\0';
+ if (kstrtoul(buf, 0, &val) != 0)
+ return 0;
+ *res = val;
+ return 1;
+}
+EXPORT_SYMBOL_GPL(nfs_map_string_to_numeric);
+
+static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
+{
+ return snprintf(buf, buflen, "%u", id);
+}
+
+static struct key_type key_type_id_resolver = {
+ .name = "id_resolver",
+ .preparse = user_preparse,
+ .free_preparse = user_free_preparse,
+ .instantiate = generic_key_instantiate,
+ .revoke = user_revoke,
+ .destroy = user_destroy,
+ .describe = user_describe,
+ .read = user_read,
+};
+
+int nfs_idmap_init(void)
+{
+ struct cred *cred;
+ struct key *keyring;
+ int ret = 0;
+
+ printk(KERN_NOTICE "NFS: Registering the %s key type\n",
+ key_type_id_resolver.name);
+
+ cred = prepare_kernel_cred(&init_task);
+ if (!cred)
+ return -ENOMEM;
+
+ keyring = keyring_alloc(".id_resolver",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+ if (IS_ERR(keyring)) {
+ ret = PTR_ERR(keyring);
+ goto failed_put_cred;
+ }
+
+ ret = register_key_type(&key_type_id_resolver);
+ if (ret < 0)
+ goto failed_put_key;
+
+ ret = register_key_type(&key_type_id_resolver_legacy);
+ if (ret < 0)
+ goto failed_reg_legacy;
+
+ set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags);
+ cred->thread_keyring = keyring;
+ cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+ id_resolver_cache = cred;
+ return 0;
+
+failed_reg_legacy:
+ unregister_key_type(&key_type_id_resolver);
+failed_put_key:
+ key_put(keyring);
+failed_put_cred:
+ put_cred(cred);
+ return ret;
+}
+
+void nfs_idmap_quit(void)
+{
+ key_revoke(id_resolver_cache->thread_keyring);
+ unregister_key_type(&key_type_id_resolver);
+ unregister_key_type(&key_type_id_resolver_legacy);
+ put_cred(id_resolver_cache);
+}
+
+/*
+ * Assemble the description to pass to request_key()
+ * This function will allocate a new string and update dest to point
+ * at it. The caller is responsible for freeing dest.
+ *
+ * On error 0 is returned. Otherwise, the length of dest is returned.
+ */
+static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
+ const char *type, size_t typelen, char **desc)
+{
+ char *cp;
+ size_t desclen = typelen + namelen + 2;
+
+ *desc = kmalloc(desclen, GFP_KERNEL);
+ if (!*desc)
+ return -ENOMEM;
+
+ cp = *desc;
+ memcpy(cp, type, typelen);
+ cp += typelen;
+ *cp++ = ':';
+
+ memcpy(cp, name, namelen);
+ cp += namelen;
+ *cp = '\0';
+ return desclen;
+}
+
+static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
+ const char *type, struct idmap *idmap)
+{
+ char *desc;
+ struct key *rkey = ERR_PTR(-EAGAIN);
+ ssize_t ret;
+
+ ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ if (!idmap->user_ns || idmap->user_ns == &init_user_ns)
+ rkey = request_key(&key_type_id_resolver, desc, "");
+ if (IS_ERR(rkey)) {
+ mutex_lock(&idmap->idmap_mutex);
+ rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
+ desc, NULL, "", 0, idmap);
+ mutex_unlock(&idmap->idmap_mutex);
+ }
+ if (!IS_ERR(rkey))
+ set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags);
+
+ kfree(desc);
+ return rkey;
+}
+
+static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
+ const char *type, void *data,
+ size_t data_size, struct idmap *idmap)
+{
+ const struct cred *saved_cred;
+ struct key *rkey;
+ const struct user_key_payload *payload;
+ ssize_t ret;
+
+ saved_cred = override_creds(id_resolver_cache);
+ rkey = nfs_idmap_request_key(name, namelen, type, idmap);
+ revert_creds(saved_cred);
+
+ if (IS_ERR(rkey)) {
+ ret = PTR_ERR(rkey);
+ goto out;
+ }
+
+ rcu_read_lock();
+ rkey->perm |= KEY_USR_VIEW;
+
+ ret = key_validate(rkey);
+ if (ret < 0)
+ goto out_up;
+
+ payload = user_key_payload_rcu(rkey);
+ if (IS_ERR_OR_NULL(payload)) {
+ ret = PTR_ERR(payload);
+ goto out_up;
+ }
+
+ ret = payload->datalen;
+ if (ret > 0 && ret <= data_size)
+ memcpy(data, payload->data, ret);
+ else
+ ret = -EINVAL;
+
+out_up:
+ rcu_read_unlock();
+ key_put(rkey);
+out:
+ return ret;
+}
+
+/* ID -> Name */
+static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf,
+ size_t buflen, struct idmap *idmap)
+{
+ char id_str[NFS_UINT_MAXLEN];
+ int id_len;
+ ssize_t ret;
+
+ id_len = nfs_map_numeric_to_string(id, id_str, sizeof(id_str));
+ ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap);
+ if (ret < 0)
+ return -EINVAL;
+ return ret;
+}
+
+/* Name -> ID */
+static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *type,
+ __u32 *id, struct idmap *idmap)
+{
+ char id_str[NFS_UINT_MAXLEN];
+ long id_long;
+ ssize_t data_size;
+ int ret = 0;
+
+ data_size = nfs_idmap_get_key(name, namelen, type, id_str, NFS_UINT_MAXLEN, idmap);
+ if (data_size <= 0) {
+ ret = -EINVAL;
+ } else {
+ ret = kstrtol(id_str, 10, &id_long);
+ if (!ret)
+ *id = (__u32)id_long;
+ }
+ return ret;
+}
+
+/* idmap classic begins here */
+
+enum {
+ Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err
+};
+
+static const match_table_t nfs_idmap_tokens = {
+ { Opt_find_uid, "uid:%s" },
+ { Opt_find_gid, "gid:%s" },
+ { Opt_find_user, "user:%s" },
+ { Opt_find_group, "group:%s" },
+ { Opt_find_err, NULL }
+};
+
+static int nfs_idmap_legacy_upcall(struct key *, void *);
+static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
+ size_t);
+static void idmap_release_pipe(struct inode *);
+static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
+
+static const struct rpc_pipe_ops idmap_upcall_ops = {
+ .upcall = rpc_pipe_generic_upcall,
+ .downcall = idmap_pipe_downcall,
+ .release_pipe = idmap_release_pipe,
+ .destroy_msg = idmap_pipe_destroy_msg,
+};
+
+static struct key_type key_type_id_resolver_legacy = {
+ .name = "id_legacy",
+ .preparse = user_preparse,
+ .free_preparse = user_free_preparse,
+ .instantiate = generic_key_instantiate,
+ .revoke = user_revoke,
+ .destroy = user_destroy,
+ .describe = user_describe,
+ .read = user_read,
+ .request_key = nfs_idmap_legacy_upcall,
+};
+
+static void nfs_idmap_pipe_destroy(struct dentry *dir,
+ struct rpc_pipe_dir_object *pdo)
+{
+ struct idmap *idmap = pdo->pdo_data;
+ struct rpc_pipe *pipe = idmap->idmap_pipe;
+
+ if (pipe->dentry) {
+ rpc_unlink(pipe->dentry);
+ pipe->dentry = NULL;
+ }
+}
+
+static int nfs_idmap_pipe_create(struct dentry *dir,
+ struct rpc_pipe_dir_object *pdo)
+{
+ struct idmap *idmap = pdo->pdo_data;
+ struct rpc_pipe *pipe = idmap->idmap_pipe;
+ struct dentry *dentry;
+
+ dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ pipe->dentry = dentry;
+ return 0;
+}
+
+static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = {
+ .create = nfs_idmap_pipe_create,
+ .destroy = nfs_idmap_pipe_destroy,
+};
+
+int
+nfs_idmap_new(struct nfs_client *clp)
+{
+ struct idmap *idmap;
+ struct rpc_pipe *pipe;
+ int error;
+
+ idmap = kzalloc(sizeof(*idmap), GFP_KERNEL);
+ if (idmap == NULL)
+ return -ENOMEM;
+
+ mutex_init(&idmap->idmap_mutex);
+ idmap->user_ns = get_user_ns(clp->cl_rpcclient->cl_cred->user_ns);
+
+ rpc_init_pipe_dir_object(&idmap->idmap_pdo,
+ &nfs_idmap_pipe_dir_object_ops,
+ idmap);
+
+ pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0);
+ if (IS_ERR(pipe)) {
+ error = PTR_ERR(pipe);
+ goto err;
+ }
+ idmap->idmap_pipe = pipe;
+
+ error = rpc_add_pipe_dir_object(clp->cl_net,
+ &clp->cl_rpcclient->cl_pipedir_objects,
+ &idmap->idmap_pdo);
+ if (error)
+ goto err_destroy_pipe;
+
+ clp->cl_idmap = idmap;
+ return 0;
+err_destroy_pipe:
+ rpc_destroy_pipe_data(idmap->idmap_pipe);
+err:
+ put_user_ns(idmap->user_ns);
+ kfree(idmap);
+ return error;
+}
+
+void
+nfs_idmap_delete(struct nfs_client *clp)
+{
+ struct idmap *idmap = clp->cl_idmap;
+
+ if (!idmap)
+ return;
+ clp->cl_idmap = NULL;
+ rpc_remove_pipe_dir_object(clp->cl_net,
+ &clp->cl_rpcclient->cl_pipedir_objects,
+ &idmap->idmap_pdo);
+ rpc_destroy_pipe_data(idmap->idmap_pipe);
+ put_user_ns(idmap->user_ns);
+ kfree(idmap);
+}
+
+static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
+ struct idmap_msg *im,
+ struct rpc_pipe_msg *msg)
+{
+ substring_t substr;
+ int token, ret;
+
+ im->im_type = IDMAP_TYPE_GROUP;
+ token = match_token(desc, nfs_idmap_tokens, &substr);
+
+ switch (token) {
+ case Opt_find_uid:
+ im->im_type = IDMAP_TYPE_USER;
+ fallthrough;
+ case Opt_find_gid:
+ im->im_conv = IDMAP_CONV_NAMETOID;
+ ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ);
+ break;
+
+ case Opt_find_user:
+ im->im_type = IDMAP_TYPE_USER;
+ fallthrough;
+ case Opt_find_group:
+ im->im_conv = IDMAP_CONV_IDTONAME;
+ ret = match_int(&substr, &im->im_id);
+ if (ret)
+ goto out;
+ break;
+
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ msg->data = im;
+ msg->len = sizeof(struct idmap_msg);
+
+out:
+ return ret;
+}
+
+static bool
+nfs_idmap_prepare_pipe_upcall(struct idmap *idmap,
+ struct idmap_legacy_upcalldata *data)
+{
+ if (idmap->idmap_upcall_data != NULL) {
+ WARN_ON_ONCE(1);
+ return false;
+ }
+ idmap->idmap_upcall_data = data;
+ return true;
+}
+
+static void nfs_idmap_complete_pipe_upcall(struct idmap_legacy_upcalldata *data,
+ int ret)
+{
+ complete_request_key(data->authkey, ret);
+ key_put(data->authkey);
+ kfree(data);
+}
+
+static void nfs_idmap_abort_pipe_upcall(struct idmap *idmap,
+ struct idmap_legacy_upcalldata *data,
+ int ret)
+{
+ if (cmpxchg(&idmap->idmap_upcall_data, data, NULL) == data)
+ nfs_idmap_complete_pipe_upcall(data, ret);
+}
+
+static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux)
+{
+ struct idmap_legacy_upcalldata *data;
+ struct request_key_auth *rka = get_request_key_auth(authkey);
+ struct rpc_pipe_msg *msg;
+ struct idmap_msg *im;
+ struct idmap *idmap = aux;
+ struct key *key = rka->target_key;
+ int ret = -ENOKEY;
+
+ if (!aux)
+ goto out1;
+
+ /* msg and im are freed in idmap_pipe_destroy_msg */
+ ret = -ENOMEM;
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ goto out1;
+
+ msg = &data->pipe_msg;
+ im = &data->idmap_msg;
+ data->idmap = idmap;
+ data->authkey = key_get(authkey);
+
+ ret = nfs_idmap_prepare_message(key->description, idmap, im, msg);
+ if (ret < 0)
+ goto out2;
+
+ ret = -EAGAIN;
+ if (!nfs_idmap_prepare_pipe_upcall(idmap, data))
+ goto out2;
+
+ ret = rpc_queue_upcall(idmap->idmap_pipe, msg);
+ if (ret < 0)
+ nfs_idmap_abort_pipe_upcall(idmap, data, ret);
+
+ return ret;
+out2:
+ kfree(data);
+out1:
+ complete_request_key(authkey, ret);
+ return ret;
+}
+
+static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen)
+{
+ return key_instantiate_and_link(key, data, datalen,
+ id_resolver_cache->thread_keyring,
+ authkey);
+}
+
+static int nfs_idmap_read_and_verify_message(struct idmap_msg *im,
+ struct idmap_msg *upcall,
+ struct key *key, struct key *authkey)
+{
+ char id_str[NFS_UINT_MAXLEN];
+ size_t len;
+ int ret = -ENOKEY;
+
+ /* ret = -ENOKEY */
+ if (upcall->im_type != im->im_type || upcall->im_conv != im->im_conv)
+ goto out;
+ switch (im->im_conv) {
+ case IDMAP_CONV_NAMETOID:
+ if (strcmp(upcall->im_name, im->im_name) != 0)
+ break;
+ /* Note: here we store the NUL terminator too */
+ len = 1 + nfs_map_numeric_to_string(im->im_id, id_str,
+ sizeof(id_str));
+ ret = nfs_idmap_instantiate(key, authkey, id_str, len);
+ break;
+ case IDMAP_CONV_IDTONAME:
+ if (upcall->im_id != im->im_id)
+ break;
+ len = strlen(im->im_name);
+ ret = nfs_idmap_instantiate(key, authkey, im->im_name, len);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+out:
+ return ret;
+}
+
+static ssize_t
+idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
+{
+ struct request_key_auth *rka;
+ struct rpc_inode *rpci = RPC_I(file_inode(filp));
+ struct idmap *idmap = (struct idmap *)rpci->private;
+ struct idmap_legacy_upcalldata *data;
+ struct key *authkey;
+ struct idmap_msg im;
+ size_t namelen_in;
+ int ret = -ENOKEY;
+
+ /* If instantiation is successful, anyone waiting for key construction
+ * will have been woken up and someone else may now have used
+ * idmap_key_cons - so after this point we may no longer touch it.
+ */
+ data = xchg(&idmap->idmap_upcall_data, NULL);
+ if (data == NULL)
+ goto out_noupcall;
+
+ authkey = data->authkey;
+ rka = get_request_key_auth(authkey);
+
+ if (mlen != sizeof(im)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ if (copy_from_user(&im, src, mlen) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (!(im.im_status & IDMAP_STATUS_SUCCESS)) {
+ ret = -ENOKEY;
+ goto out;
+ }
+
+ namelen_in = strnlen(im.im_name, IDMAP_NAMESZ);
+ if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = nfs_idmap_read_and_verify_message(&im, &data->idmap_msg,
+ rka->target_key, authkey);
+ if (ret >= 0) {
+ key_set_timeout(rka->target_key, nfs_idmap_cache_timeout);
+ ret = mlen;
+ }
+
+out:
+ nfs_idmap_complete_pipe_upcall(data, ret);
+out_noupcall:
+ return ret;
+}
+
+static void
+idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+ struct idmap_legacy_upcalldata *data = container_of(msg,
+ struct idmap_legacy_upcalldata,
+ pipe_msg);
+ struct idmap *idmap = data->idmap;
+
+ if (msg->errno)
+ nfs_idmap_abort_pipe_upcall(idmap, data, msg->errno);
+}
+
+static void
+idmap_release_pipe(struct inode *inode)
+{
+ struct rpc_inode *rpci = RPC_I(inode);
+ struct idmap *idmap = (struct idmap *)rpci->private;
+ struct idmap_legacy_upcalldata *data;
+
+ data = xchg(&idmap->idmap_upcall_data, NULL);
+ if (data)
+ nfs_idmap_complete_pipe_upcall(data, -EPIPE);
+}
+
+int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid)
+{
+ struct idmap *idmap = server->nfs_client->cl_idmap;
+ __u32 id = -1;
+ int ret = 0;
+
+ if (!nfs_map_string_to_numeric(name, namelen, &id))
+ ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap);
+ if (ret == 0) {
+ *uid = make_kuid(idmap_userns(idmap), id);
+ if (!uid_valid(*uid))
+ ret = -ERANGE;
+ }
+ trace_nfs4_map_name_to_uid(name, namelen, id, ret);
+ return ret;
+}
+
+int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, kgid_t *gid)
+{
+ struct idmap *idmap = server->nfs_client->cl_idmap;
+ __u32 id = -1;
+ int ret = 0;
+
+ if (!nfs_map_string_to_numeric(name, namelen, &id))
+ ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap);
+ if (ret == 0) {
+ *gid = make_kgid(idmap_userns(idmap), id);
+ if (!gid_valid(*gid))
+ ret = -ERANGE;
+ }
+ trace_nfs4_map_group_to_gid(name, namelen, id, ret);
+ return ret;
+}
+
+int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, size_t buflen)
+{
+ struct idmap *idmap = server->nfs_client->cl_idmap;
+ int ret = -EINVAL;
+ __u32 id;
+
+ id = from_kuid_munged(idmap_userns(idmap), uid);
+ if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
+ ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap);
+ if (ret < 0)
+ ret = nfs_map_numeric_to_string(id, buf, buflen);
+ trace_nfs4_map_uid_to_name(buf, ret, id, ret);
+ return ret;
+}
+int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen)
+{
+ struct idmap *idmap = server->nfs_client->cl_idmap;
+ int ret = -EINVAL;
+ __u32 id;
+
+ id = from_kgid_munged(idmap_userns(idmap), gid);
+ if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
+ ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap);
+ if (ret < 0)
+ ret = nfs_map_numeric_to_string(id, buf, buflen);
+ trace_nfs4_map_gid_to_group(buf, ret, id, ret);
+ return ret;
+}
diff --git a/fs/nfs/nfs4idmap.h b/fs/nfs/nfs4idmap.h
new file mode 100644
index 0000000000..de44d7330a
--- /dev/null
+++ b/fs/nfs/nfs4idmap.h
@@ -0,0 +1,68 @@
+/*
+ * fs/nfs/nfs4idmap.h
+ *
+ * UID and GID to name mapping for clients.
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef NFS_IDMAP_H
+#define NFS_IDMAP_H
+
+#include <linux/uidgid.h>
+#include <uapi/linux/nfs_idmap.h>
+
+
+/* Forward declaration to make this header independent of others */
+struct nfs_client;
+struct nfs_server;
+struct nfs_fattr;
+struct nfs4_string;
+
+int nfs_idmap_init(void);
+void nfs_idmap_quit(void);
+int nfs_idmap_new(struct nfs_client *);
+void nfs_idmap_delete(struct nfs_client *);
+
+void nfs_fattr_init_names(struct nfs_fattr *fattr,
+ struct nfs4_string *owner_name,
+ struct nfs4_string *group_name);
+void nfs_fattr_free_names(struct nfs_fattr *);
+void nfs_fattr_map_and_free_names(struct nfs_server *, struct nfs_fattr *);
+
+int nfs_map_name_to_uid(const struct nfs_server *, const char *, size_t, kuid_t *);
+int nfs_map_group_to_gid(const struct nfs_server *, const char *, size_t, kgid_t *);
+int nfs_map_uid_to_name(const struct nfs_server *, kuid_t, char *, size_t);
+int nfs_map_gid_to_group(const struct nfs_server *, kgid_t, char *, size_t);
+
+int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res);
+
+extern unsigned int nfs_idmap_cache_timeout;
+#endif /* NFS_IDMAP_H */
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
new file mode 100644
index 0000000000..9a98595bb1
--- /dev/null
+++ b/fs/nfs/nfs4namespace.c
@@ -0,0 +1,577 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/nfs4namespace.c
+ *
+ * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
+ * - Modified by David Howells <dhowells@redhat.com>
+ *
+ * NFSv4 namespace
+ */
+
+#include <linux/module.h>
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include "internal.h"
+#include "nfs4_fs.h"
+#include "nfs.h"
+#include "dns_resolve.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+/*
+ * Work out the length that an NFSv4 path would render to as a standard posix
+ * path, with a leading slash but no terminating slash.
+ */
+static ssize_t nfs4_pathname_len(const struct nfs4_pathname *pathname)
+{
+ ssize_t len = 0;
+ int i;
+
+ for (i = 0; i < pathname->ncomponents; i++) {
+ const struct nfs4_string *component = &pathname->components[i];
+
+ if (component->len > NAME_MAX)
+ goto too_long;
+ len += 1 + component->len; /* Adding "/foo" */
+ if (len > PATH_MAX)
+ goto too_long;
+ }
+ return len;
+
+too_long:
+ return -ENAMETOOLONG;
+}
+
+/*
+ * Convert the NFSv4 pathname components into a standard posix path.
+ */
+static char *nfs4_pathname_string(const struct nfs4_pathname *pathname,
+ unsigned short *_len)
+{
+ ssize_t len;
+ char *buf, *p;
+ int i;
+
+ len = nfs4_pathname_len(pathname);
+ if (len < 0)
+ return ERR_PTR(len);
+ *_len = len;
+
+ p = buf = kmalloc(len + 1, GFP_KERNEL);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < pathname->ncomponents; i++) {
+ const struct nfs4_string *component = &pathname->components[i];
+
+ *p++ = '/';
+ memcpy(p, component->data, component->len);
+ p += component->len;
+ }
+
+ *p = 0;
+ return buf;
+}
+
+/*
+ * return the path component of "<server>:<path>"
+ * nfspath - the "<server>:<path>" string
+ * end - one past the last char that could contain "<server>:"
+ * returns NULL on failure
+ */
+static char *nfs_path_component(const char *nfspath, const char *end)
+{
+ char *p;
+
+ if (*nfspath == '[') {
+ /* parse [] escaped IPv6 addrs */
+ p = strchr(nfspath, ']');
+ if (p != NULL && ++p < end && *p == ':')
+ return p + 1;
+ } else {
+ /* otherwise split on first colon */
+ p = strchr(nfspath, ':');
+ if (p != NULL && p < end)
+ return p + 1;
+ }
+ return NULL;
+}
+
+/*
+ * Determine the mount path as a string
+ */
+static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen)
+{
+ char *limit;
+ char *path = nfs_path(&limit, dentry, buffer, buflen,
+ NFS_PATH_CANONICAL);
+ if (!IS_ERR(path)) {
+ char *path_component = nfs_path_component(path, limit);
+ if (path_component)
+ return path_component;
+ }
+ return path;
+}
+
+/*
+ * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we
+ * believe to be the server path to this dentry
+ */
+static int nfs4_validate_fspath(struct dentry *dentry,
+ const struct nfs4_fs_locations *locations,
+ struct nfs_fs_context *ctx)
+{
+ const char *path;
+ char *fs_path;
+ unsigned short len;
+ char *buf;
+ int n;
+
+ buf = kmalloc(4096, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ path = nfs4_path(dentry, buf, 4096);
+ if (IS_ERR(path)) {
+ kfree(buf);
+ return PTR_ERR(path);
+ }
+
+ fs_path = nfs4_pathname_string(&locations->fs_path, &len);
+ if (IS_ERR(fs_path)) {
+ kfree(buf);
+ return PTR_ERR(fs_path);
+ }
+
+ n = strncmp(path, fs_path, len);
+ kfree(buf);
+ kfree(fs_path);
+ if (n != 0) {
+ dprintk("%s: path %s does not begin with fsroot %s\n",
+ __func__, path, ctx->nfs_server.export_path);
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr_storage *ss,
+ size_t salen, struct net *net, int port)
+{
+ struct sockaddr *sa = (struct sockaddr *)ss;
+ ssize_t ret;
+
+ ret = rpc_pton(net, string, len, sa, salen);
+ if (ret == 0) {
+ ret = rpc_uaddr2sockaddr(net, string, len, sa, salen);
+ if (ret == 0) {
+ ret = nfs_dns_resolve_name(net, string, len, ss, salen);
+ if (ret < 0)
+ ret = 0;
+ }
+ } else if (port) {
+ rpc_set_port(sa, port);
+ }
+ return ret;
+}
+
+/**
+ * nfs_find_best_sec - Find a security mechanism supported locally
+ * @clnt: pointer to rpc_clnt
+ * @server: NFS server struct
+ * @flavors: List of security tuples returned by SECINFO procedure
+ *
+ * Return an rpc client that uses the first security mechanism in
+ * "flavors" that is locally supported. The "flavors" array
+ * is searched in the order returned from the server, per RFC 3530
+ * recommendation and each flavor is checked for membership in the
+ * sec= mount option list if it exists.
+ *
+ * Return -EPERM if no matching flavor is found in the array.
+ *
+ * Please call rpc_shutdown_client() when you are done with this rpc client.
+ *
+ */
+static struct rpc_clnt *nfs_find_best_sec(struct rpc_clnt *clnt,
+ struct nfs_server *server,
+ struct nfs4_secinfo_flavors *flavors)
+{
+ rpc_authflavor_t pflavor;
+ struct nfs4_secinfo4 *secinfo;
+ unsigned int i;
+
+ for (i = 0; i < flavors->num_flavors; i++) {
+ secinfo = &flavors->flavors[i];
+
+ switch (secinfo->flavor) {
+ case RPC_AUTH_NULL:
+ case RPC_AUTH_UNIX:
+ case RPC_AUTH_GSS:
+ pflavor = rpcauth_get_pseudoflavor(secinfo->flavor,
+ &secinfo->flavor_info);
+ /* does the pseudoflavor match a sec= mount opt? */
+ if (pflavor != RPC_AUTH_MAXFLAVOR &&
+ nfs_auth_info_match(&server->auth_info, pflavor)) {
+ struct rpc_clnt *new;
+ struct rpc_cred *cred;
+
+ /* Cloning creates an rpc_auth for the flavor */
+ new = rpc_clone_client_set_auth(clnt, pflavor);
+ if (IS_ERR(new))
+ continue;
+ /**
+ * Check that the user actually can use the
+ * flavor. This is mostly for RPC_AUTH_GSS
+ * where cr_init obtains a gss context
+ */
+ cred = rpcauth_lookupcred(new->cl_auth, 0);
+ if (IS_ERR(cred)) {
+ rpc_shutdown_client(new);
+ continue;
+ }
+ put_rpccred(cred);
+ return new;
+ }
+ }
+ }
+ return ERR_PTR(-EPERM);
+}
+
+/**
+ * nfs4_negotiate_security - in response to an NFS4ERR_WRONGSEC on lookup,
+ * return an rpc_clnt that uses the best available security flavor with
+ * respect to the secinfo flavor list and the sec= mount options.
+ *
+ * @clnt: RPC client to clone
+ * @inode: directory inode
+ * @name: lookup name
+ *
+ * Please call rpc_shutdown_client() when you are done with this rpc client.
+ */
+struct rpc_clnt *
+nfs4_negotiate_security(struct rpc_clnt *clnt, struct inode *inode,
+ const struct qstr *name)
+{
+ struct page *page;
+ struct nfs4_secinfo_flavors *flavors;
+ struct rpc_clnt *new;
+ int err;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ flavors = page_address(page);
+
+ err = nfs4_proc_secinfo(inode, name, flavors);
+ if (err < 0) {
+ new = ERR_PTR(err);
+ goto out;
+ }
+
+ new = nfs_find_best_sec(clnt, NFS_SERVER(inode), flavors);
+
+out:
+ put_page(page);
+ return new;
+}
+
+static int try_location(struct fs_context *fc,
+ const struct nfs4_fs_location *location)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ unsigned int len, s;
+ char *export_path, *source, *p;
+ int ret = -ENOENT;
+
+ /* Allocate a buffer big enough to hold any of the hostnames plus a
+ * terminating char and also a buffer big enough to hold the hostname
+ * plus a colon plus the path.
+ */
+ len = 0;
+ for (s = 0; s < location->nservers; s++) {
+ const struct nfs4_string *buf = &location->servers[s];
+ if (buf->len > len)
+ len = buf->len;
+ }
+
+ kfree(ctx->nfs_server.hostname);
+ ctx->nfs_server.hostname = kmalloc(len + 1, GFP_KERNEL);
+ if (!ctx->nfs_server.hostname)
+ return -ENOMEM;
+
+ export_path = nfs4_pathname_string(&location->rootpath,
+ &ctx->nfs_server.export_path_len);
+ if (IS_ERR(export_path))
+ return PTR_ERR(export_path);
+
+ kfree(ctx->nfs_server.export_path);
+ ctx->nfs_server.export_path = export_path;
+
+ source = kmalloc(len + 1 + ctx->nfs_server.export_path_len + 1,
+ GFP_KERNEL);
+ if (!source)
+ return -ENOMEM;
+
+ kfree(fc->source);
+ fc->source = source;
+ for (s = 0; s < location->nservers; s++) {
+ const struct nfs4_string *buf = &location->servers[s];
+
+ if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
+ continue;
+
+ ctx->nfs_server.addrlen =
+ nfs_parse_server_name(buf->data, buf->len,
+ &ctx->nfs_server._address,
+ sizeof(ctx->nfs_server._address),
+ fc->net_ns, 0);
+ if (ctx->nfs_server.addrlen == 0)
+ continue;
+
+ rpc_set_port(&ctx->nfs_server.address, NFS_PORT);
+
+ memcpy(ctx->nfs_server.hostname, buf->data, buf->len);
+ ctx->nfs_server.hostname[buf->len] = '\0';
+
+ p = source;
+ memcpy(p, buf->data, buf->len);
+ p += buf->len;
+ *p++ = ':';
+ memcpy(p, ctx->nfs_server.export_path, ctx->nfs_server.export_path_len);
+ p += ctx->nfs_server.export_path_len;
+ *p = 0;
+
+ ret = nfs4_get_referral_tree(fc);
+ if (ret == 0)
+ return 0;
+ }
+
+ return ret;
+}
+
+/**
+ * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
+ * @fc: pointer to struct nfs_fs_context
+ * @locations: array of NFSv4 server location information
+ *
+ */
+static int nfs_follow_referral(struct fs_context *fc,
+ const struct nfs4_fs_locations *locations)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ int loc, error;
+
+ if (locations == NULL || locations->nlocations <= 0)
+ return -ENOENT;
+
+ dprintk("%s: referral at %pd2\n", __func__, ctx->clone_data.dentry);
+
+ /* Ensure fs path is a prefix of current dentry path */
+ error = nfs4_validate_fspath(ctx->clone_data.dentry, locations, ctx);
+ if (error < 0)
+ return error;
+
+ error = -ENOENT;
+ for (loc = 0; loc < locations->nlocations; loc++) {
+ const struct nfs4_fs_location *location = &locations->locations[loc];
+
+ if (location == NULL || location->nservers <= 0 ||
+ location->rootpath.ncomponents == 0)
+ continue;
+
+ error = try_location(fc, location);
+ if (error == 0)
+ return 0;
+ }
+
+ return error;
+}
+
+/*
+ * nfs_do_refmount - handle crossing a referral on server
+ * @dentry - dentry of referral
+ *
+ */
+static int nfs_do_refmount(struct fs_context *fc, struct rpc_clnt *client)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct dentry *dentry, *parent;
+ struct nfs4_fs_locations *fs_locations = NULL;
+ struct page *page;
+ int err = -ENOMEM;
+
+ /* BUG_ON(IS_ROOT(dentry)); */
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+ if (!fs_locations)
+ goto out_free;
+ fs_locations->fattr = nfs_alloc_fattr();
+ if (!fs_locations->fattr)
+ goto out_free_2;
+
+ /* Get locations */
+ dentry = ctx->clone_data.dentry;
+ parent = dget_parent(dentry);
+ dprintk("%s: getting locations for %pd2\n",
+ __func__, dentry);
+
+ err = nfs4_proc_fs_locations(client, d_inode(parent), &dentry->d_name, fs_locations, page);
+ dput(parent);
+ if (err != 0)
+ goto out_free_3;
+
+ err = -ENOENT;
+ if (fs_locations->nlocations <= 0 ||
+ fs_locations->fs_path.ncomponents <= 0)
+ goto out_free_3;
+
+ err = nfs_follow_referral(fc, fs_locations);
+out_free_3:
+ kfree(fs_locations->fattr);
+out_free_2:
+ kfree(fs_locations);
+out_free:
+ __free_page(page);
+ return err;
+}
+
+int nfs4_submount(struct fs_context *fc, struct nfs_server *server)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct dentry *dentry = ctx->clone_data.dentry;
+ struct dentry *parent = dget_parent(dentry);
+ struct inode *dir = d_inode(parent);
+ struct rpc_clnt *client;
+ int ret;
+
+ /* Look it up again to get its attributes and sec flavor */
+ client = nfs4_proc_lookup_mountpoint(dir, dentry, ctx->mntfh,
+ ctx->clone_data.fattr);
+ dput(parent);
+ if (IS_ERR(client))
+ return PTR_ERR(client);
+
+ ctx->selected_flavor = client->cl_auth->au_flavor;
+ if (ctx->clone_data.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
+ ret = nfs_do_refmount(fc, client);
+ } else {
+ ret = nfs_do_submount(fc);
+ }
+
+ rpc_shutdown_client(client);
+ return ret;
+}
+
+/*
+ * Try one location from the fs_locations array.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+static int nfs4_try_replacing_one_location(struct nfs_server *server,
+ char *page, char *page2,
+ const struct nfs4_fs_location *location)
+{
+ struct net *net = rpc_net_ns(server->client);
+ struct sockaddr_storage *sap;
+ unsigned int s;
+ size_t salen;
+ int error;
+
+ sap = kmalloc(sizeof(*sap), GFP_KERNEL);
+ if (sap == NULL)
+ return -ENOMEM;
+
+ error = -ENOENT;
+ for (s = 0; s < location->nservers; s++) {
+ const struct nfs4_string *buf = &location->servers[s];
+ char *hostname;
+
+ if (buf->len <= 0 || buf->len > PAGE_SIZE)
+ continue;
+
+ if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len) != NULL)
+ continue;
+
+ salen = nfs_parse_server_name(buf->data, buf->len,
+ sap, sizeof(*sap), net, 0);
+ if (salen == 0)
+ continue;
+ rpc_set_port((struct sockaddr *)sap, NFS_PORT);
+
+ error = -ENOMEM;
+ hostname = kmemdup_nul(buf->data, buf->len, GFP_KERNEL);
+ if (hostname == NULL)
+ break;
+
+ error = nfs4_update_server(server, hostname, sap, salen, net);
+ kfree(hostname);
+ if (error == 0)
+ break;
+ }
+
+ kfree(sap);
+ return error;
+}
+
+/**
+ * nfs4_replace_transport - set up transport to destination server
+ *
+ * @server: export being migrated
+ * @locations: fs_locations array
+ *
+ * Returns zero on success, or a negative errno value.
+ *
+ * The client tries all the entries in the "locations" array, in the
+ * order returned by the server, until one works or the end of the
+ * array is reached.
+ */
+int nfs4_replace_transport(struct nfs_server *server,
+ const struct nfs4_fs_locations *locations)
+{
+ char *page = NULL, *page2 = NULL;
+ int loc, error;
+
+ error = -ENOENT;
+ if (locations == NULL || locations->nlocations <= 0)
+ goto out;
+
+ error = -ENOMEM;
+ page = (char *) __get_free_page(GFP_USER);
+ if (!page)
+ goto out;
+ page2 = (char *) __get_free_page(GFP_USER);
+ if (!page2)
+ goto out;
+
+ for (loc = 0; loc < locations->nlocations; loc++) {
+ const struct nfs4_fs_location *location =
+ &locations->locations[loc];
+
+ if (location == NULL || location->nservers <= 0 ||
+ location->rootpath.ncomponents == 0)
+ continue;
+
+ error = nfs4_try_replacing_one_location(server, page,
+ page2, location);
+ if (error == 0)
+ break;
+ }
+
+out:
+ free_page((unsigned long)page);
+ free_page((unsigned long)page2);
+ return error;
+}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
new file mode 100644
index 0000000000..e8b52e3690
--- /dev/null
+++ b/fs/nfs/nfs4proc.c
@@ -0,0 +1,10762 @@
+/*
+ * fs/nfs/nfs4proc.c
+ *
+ * Client-side procedure declarations for NFSv4.
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Kendrick Smith <kmsmith@umich.edu>
+ * Andy Adamson <andros@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/ratelimit.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/nfs_mount.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/xattr.h>
+#include <linux/utsname.h>
+#include <linux/freezer.h>
+#include <linux/iversion.h>
+
+#include "nfs4_fs.h"
+#include "delegation.h"
+#include "internal.h"
+#include "iostat.h"
+#include "callback.h"
+#include "pnfs.h"
+#include "netns.h"
+#include "sysfs.h"
+#include "nfs4idmap.h"
+#include "nfs4session.h"
+#include "fscache.h"
+#include "nfs42.h"
+
+#include "nfs4trace.h"
+
+#define NFSDBG_FACILITY NFSDBG_PROC
+
+#define NFS4_BITMASK_SZ 3
+
+#define NFS4_POLL_RETRY_MIN (HZ/10)
+#define NFS4_POLL_RETRY_MAX (15*HZ)
+
+/* file attributes which can be mapped to nfs attributes */
+#define NFS4_VALID_ATTRS (ATTR_MODE \
+ | ATTR_UID \
+ | ATTR_GID \
+ | ATTR_SIZE \
+ | ATTR_ATIME \
+ | ATTR_MTIME \
+ | ATTR_CTIME \
+ | ATTR_ATIME_SET \
+ | ATTR_MTIME_SET)
+
+struct nfs4_opendata;
+static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
+static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
+static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct inode *inode);
+static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
+ struct nfs_fattr *fattr, struct iattr *sattr,
+ struct nfs_open_context *ctx, struct nfs4_label *ilabel);
+#ifdef CONFIG_NFS_V4_1
+static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
+ const struct cred *cred,
+ struct nfs4_slot *slot,
+ bool is_privileged);
+static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
+ const struct cred *);
+static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
+ const struct cred *, bool);
+#endif
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static inline struct nfs4_label *
+nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr, struct nfs4_label *label)
+{
+ int err;
+
+ if (label == NULL)
+ return NULL;
+
+ if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
+ return NULL;
+
+ label->lfs = 0;
+ label->pi = 0;
+ label->len = 0;
+ label->label = NULL;
+
+ err = security_dentry_init_security(dentry, sattr->ia_mode,
+ &dentry->d_name, NULL,
+ (void **)&label->label, &label->len);
+ if (err == 0)
+ return label;
+
+ return NULL;
+}
+static inline void
+nfs4_label_release_security(struct nfs4_label *label)
+{
+ if (label)
+ security_release_secctx(label->label, label->len);
+}
+static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
+{
+ if (label)
+ return server->attr_bitmask;
+
+ return server->attr_bitmask_nl;
+}
+#else
+static inline struct nfs4_label *
+nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr, struct nfs4_label *l)
+{ return NULL; }
+static inline void
+nfs4_label_release_security(struct nfs4_label *label)
+{ return; }
+static inline u32 *
+nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
+{ return server->attr_bitmask; }
+#endif
+
+/* Prevent leaks of NFSv4 errors into userland */
+static int nfs4_map_errors(int err)
+{
+ if (err >= -1000)
+ return err;
+ switch (err) {
+ case -NFS4ERR_RESOURCE:
+ case -NFS4ERR_LAYOUTTRYLATER:
+ case -NFS4ERR_RECALLCONFLICT:
+ case -NFS4ERR_RETURNCONFLICT:
+ return -EREMOTEIO;
+ case -NFS4ERR_WRONGSEC:
+ case -NFS4ERR_WRONG_CRED:
+ return -EPERM;
+ case -NFS4ERR_BADOWNER:
+ case -NFS4ERR_BADNAME:
+ return -EINVAL;
+ case -NFS4ERR_SHARE_DENIED:
+ return -EACCES;
+ case -NFS4ERR_MINOR_VERS_MISMATCH:
+ return -EPROTONOSUPPORT;
+ case -NFS4ERR_FILE_OPEN:
+ return -EBUSY;
+ case -NFS4ERR_NOT_SAME:
+ return -ENOTSYNC;
+ default:
+ dprintk("%s could not handle NFSv4 error %d\n",
+ __func__, -err);
+ break;
+ }
+ return -EIO;
+}
+
+/*
+ * This is our standard bitmap for GETATTR requests.
+ */
+const u32 nfs4_fattr_bitmap[3] = {
+ FATTR4_WORD0_TYPE
+ | FATTR4_WORD0_CHANGE
+ | FATTR4_WORD0_SIZE
+ | FATTR4_WORD0_FSID
+ | FATTR4_WORD0_FILEID,
+ FATTR4_WORD1_MODE
+ | FATTR4_WORD1_NUMLINKS
+ | FATTR4_WORD1_OWNER
+ | FATTR4_WORD1_OWNER_GROUP
+ | FATTR4_WORD1_RAWDEV
+ | FATTR4_WORD1_SPACE_USED
+ | FATTR4_WORD1_TIME_ACCESS
+ | FATTR4_WORD1_TIME_METADATA
+ | FATTR4_WORD1_TIME_MODIFY
+ | FATTR4_WORD1_MOUNTED_ON_FILEID,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ FATTR4_WORD2_SECURITY_LABEL
+#endif
+};
+
+static const u32 nfs4_pnfs_open_bitmap[3] = {
+ FATTR4_WORD0_TYPE
+ | FATTR4_WORD0_CHANGE
+ | FATTR4_WORD0_SIZE
+ | FATTR4_WORD0_FSID
+ | FATTR4_WORD0_FILEID,
+ FATTR4_WORD1_MODE
+ | FATTR4_WORD1_NUMLINKS
+ | FATTR4_WORD1_OWNER
+ | FATTR4_WORD1_OWNER_GROUP
+ | FATTR4_WORD1_RAWDEV
+ | FATTR4_WORD1_SPACE_USED
+ | FATTR4_WORD1_TIME_ACCESS
+ | FATTR4_WORD1_TIME_METADATA
+ | FATTR4_WORD1_TIME_MODIFY,
+ FATTR4_WORD2_MDSTHRESHOLD
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ | FATTR4_WORD2_SECURITY_LABEL
+#endif
+};
+
+static const u32 nfs4_open_noattr_bitmap[3] = {
+ FATTR4_WORD0_TYPE
+ | FATTR4_WORD0_FILEID,
+};
+
+const u32 nfs4_statfs_bitmap[3] = {
+ FATTR4_WORD0_FILES_AVAIL
+ | FATTR4_WORD0_FILES_FREE
+ | FATTR4_WORD0_FILES_TOTAL,
+ FATTR4_WORD1_SPACE_AVAIL
+ | FATTR4_WORD1_SPACE_FREE
+ | FATTR4_WORD1_SPACE_TOTAL
+};
+
+const u32 nfs4_pathconf_bitmap[3] = {
+ FATTR4_WORD0_MAXLINK
+ | FATTR4_WORD0_MAXNAME,
+ 0
+};
+
+const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
+ | FATTR4_WORD0_MAXREAD
+ | FATTR4_WORD0_MAXWRITE
+ | FATTR4_WORD0_LEASE_TIME,
+ FATTR4_WORD1_TIME_DELTA
+ | FATTR4_WORD1_FS_LAYOUT_TYPES,
+ FATTR4_WORD2_LAYOUT_BLKSIZE
+ | FATTR4_WORD2_CLONE_BLKSIZE
+ | FATTR4_WORD2_CHANGE_ATTR_TYPE
+ | FATTR4_WORD2_XATTR_SUPPORT
+};
+
+const u32 nfs4_fs_locations_bitmap[3] = {
+ FATTR4_WORD0_CHANGE
+ | FATTR4_WORD0_SIZE
+ | FATTR4_WORD0_FSID
+ | FATTR4_WORD0_FILEID
+ | FATTR4_WORD0_FS_LOCATIONS,
+ FATTR4_WORD1_OWNER
+ | FATTR4_WORD1_OWNER_GROUP
+ | FATTR4_WORD1_RAWDEV
+ | FATTR4_WORD1_SPACE_USED
+ | FATTR4_WORD1_TIME_ACCESS
+ | FATTR4_WORD1_TIME_METADATA
+ | FATTR4_WORD1_TIME_MODIFY
+ | FATTR4_WORD1_MOUNTED_ON_FILEID,
+};
+
+static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src,
+ struct inode *inode, unsigned long flags)
+{
+ unsigned long cache_validity;
+
+ memcpy(dst, src, NFS4_BITMASK_SZ*sizeof(*dst));
+ if (!inode || !nfs4_have_delegation(inode, FMODE_READ))
+ return;
+
+ cache_validity = READ_ONCE(NFS_I(inode)->cache_validity) | flags;
+
+ /* Remove the attributes over which we have full control */
+ dst[1] &= ~FATTR4_WORD1_RAWDEV;
+ if (!(cache_validity & NFS_INO_INVALID_SIZE))
+ dst[0] &= ~FATTR4_WORD0_SIZE;
+
+ if (!(cache_validity & NFS_INO_INVALID_CHANGE))
+ dst[0] &= ~FATTR4_WORD0_CHANGE;
+
+ if (!(cache_validity & NFS_INO_INVALID_MODE))
+ dst[1] &= ~FATTR4_WORD1_MODE;
+ if (!(cache_validity & NFS_INO_INVALID_OTHER))
+ dst[1] &= ~(FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP);
+}
+
+static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
+ struct nfs4_readdir_arg *readdir)
+{
+ unsigned int attrs = FATTR4_WORD0_FILEID | FATTR4_WORD0_TYPE;
+ __be32 *start, *p;
+
+ if (cookie > 2) {
+ readdir->cookie = cookie;
+ memcpy(&readdir->verifier, verifier, sizeof(readdir->verifier));
+ return;
+ }
+
+ readdir->cookie = 0;
+ memset(&readdir->verifier, 0, sizeof(readdir->verifier));
+ if (cookie == 2)
+ return;
+
+ /*
+ * NFSv4 servers do not return entries for '.' and '..'
+ * Therefore, we fake these entries here. We let '.'
+ * have cookie 0 and '..' have cookie 1. Note that
+ * when talking to the server, we always send cookie 0
+ * instead of 1 or 2.
+ */
+ start = p = kmap_atomic(*readdir->pages);
+
+ if (cookie == 0) {
+ *p++ = xdr_one; /* next */
+ *p++ = xdr_zero; /* cookie, first word */
+ *p++ = xdr_one; /* cookie, second word */
+ *p++ = xdr_one; /* entry len */
+ memcpy(p, ".\0\0\0", 4); /* entry */
+ p++;
+ *p++ = xdr_one; /* bitmap length */
+ *p++ = htonl(attrs); /* bitmap */
+ *p++ = htonl(12); /* attribute buffer length */
+ *p++ = htonl(NF4DIR);
+ p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry)));
+ }
+
+ *p++ = xdr_one; /* next */
+ *p++ = xdr_zero; /* cookie, first word */
+ *p++ = xdr_two; /* cookie, second word */
+ *p++ = xdr_two; /* entry len */
+ memcpy(p, "..\0\0", 4); /* entry */
+ p++;
+ *p++ = xdr_one; /* bitmap length */
+ *p++ = htonl(attrs); /* bitmap */
+ *p++ = htonl(12); /* attribute buffer length */
+ *p++ = htonl(NF4DIR);
+ p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent)));
+
+ readdir->pgbase = (char *)p - (char *)start;
+ readdir->count -= readdir->pgbase;
+ kunmap_atomic(start);
+}
+
+static void nfs4_fattr_set_prechange(struct nfs_fattr *fattr, u64 version)
+{
+ if (!(fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) {
+ fattr->pre_change_attr = version;
+ fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
+ }
+}
+
+static void nfs4_test_and_free_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ const struct cred *cred)
+{
+ const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops;
+
+ ops->test_and_free_expired(server, stateid, cred);
+}
+
+static void __nfs4_free_revoked_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ const struct cred *cred)
+{
+ stateid->type = NFS4_REVOKED_STATEID_TYPE;
+ nfs4_test_and_free_stateid(server, stateid, cred);
+}
+
+static void nfs4_free_revoked_stateid(struct nfs_server *server,
+ const nfs4_stateid *stateid,
+ const struct cred *cred)
+{
+ nfs4_stateid tmp;
+
+ nfs4_stateid_copy(&tmp, stateid);
+ __nfs4_free_revoked_stateid(server, &tmp, cred);
+}
+
+static long nfs4_update_delay(long *timeout)
+{
+ long ret;
+ if (!timeout)
+ return NFS4_POLL_RETRY_MAX;
+ if (*timeout <= 0)
+ *timeout = NFS4_POLL_RETRY_MIN;
+ if (*timeout > NFS4_POLL_RETRY_MAX)
+ *timeout = NFS4_POLL_RETRY_MAX;
+ ret = *timeout;
+ *timeout <<= 1;
+ return ret;
+}
+
+static int nfs4_delay_killable(long *timeout)
+{
+ might_sleep();
+
+ __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
+ schedule_timeout(nfs4_update_delay(timeout));
+ if (!__fatal_signal_pending(current))
+ return 0;
+ return -EINTR;
+}
+
+static int nfs4_delay_interruptible(long *timeout)
+{
+ might_sleep();
+
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE);
+ schedule_timeout(nfs4_update_delay(timeout));
+ if (!signal_pending(current))
+ return 0;
+ return __fatal_signal_pending(current) ? -EINTR :-ERESTARTSYS;
+}
+
+static int nfs4_delay(long *timeout, bool interruptible)
+{
+ if (interruptible)
+ return nfs4_delay_interruptible(timeout);
+ return nfs4_delay_killable(timeout);
+}
+
+static const nfs4_stateid *
+nfs4_recoverable_stateid(const nfs4_stateid *stateid)
+{
+ if (!stateid)
+ return NULL;
+ switch (stateid->type) {
+ case NFS4_OPEN_STATEID_TYPE:
+ case NFS4_LOCK_STATEID_TYPE:
+ case NFS4_DELEGATION_STATEID_TYPE:
+ return stateid;
+ default:
+ break;
+ }
+ return NULL;
+}
+
+/* This is the error handling routine for processes that are allowed
+ * to sleep.
+ */
+static int nfs4_do_handle_exception(struct nfs_server *server,
+ int errorcode, struct nfs4_exception *exception)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state *state = exception->state;
+ const nfs4_stateid *stateid;
+ struct inode *inode = exception->inode;
+ int ret = errorcode;
+
+ exception->delay = 0;
+ exception->recovering = 0;
+ exception->retry = 0;
+
+ stateid = nfs4_recoverable_stateid(exception->stateid);
+ if (stateid == NULL && state != NULL)
+ stateid = nfs4_recoverable_stateid(&state->stateid);
+
+ switch(errorcode) {
+ case 0:
+ return 0;
+ case -NFS4ERR_BADHANDLE:
+ case -ESTALE:
+ if (inode != NULL && S_ISREG(inode->i_mode))
+ pnfs_destroy_layout(NFS_I(inode));
+ break;
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_PARTNER_NO_AUTH:
+ if (inode != NULL && stateid != NULL) {
+ nfs_inode_find_state_and_recover(inode,
+ stateid);
+ goto wait_on_recovery;
+ }
+ fallthrough;
+ case -NFS4ERR_OPENMODE:
+ if (inode) {
+ int err;
+
+ err = nfs_async_inode_return_delegation(inode,
+ stateid);
+ if (err == 0)
+ goto wait_on_recovery;
+ if (stateid != NULL && stateid->type == NFS4_DELEGATION_STATEID_TYPE) {
+ exception->retry = 1;
+ break;
+ }
+ }
+ if (state == NULL)
+ break;
+ ret = nfs4_schedule_stateid_recovery(server, state);
+ if (ret < 0)
+ break;
+ goto wait_on_recovery;
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_STALE_CLIENTID:
+ nfs4_schedule_lease_recovery(clp);
+ goto wait_on_recovery;
+ case -NFS4ERR_MOVED:
+ ret = nfs4_schedule_migration_recovery(server);
+ if (ret < 0)
+ break;
+ goto wait_on_recovery;
+ case -NFS4ERR_LEASE_MOVED:
+ nfs4_schedule_lease_moved_recovery(clp);
+ goto wait_on_recovery;
+#if defined(CONFIG_NFS_V4_1)
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_SEQ_FALSE_RETRY:
+ case -NFS4ERR_SEQ_MISORDERED:
+ /* Handled in nfs41_sequence_process() */
+ goto wait_on_recovery;
+#endif /* defined(CONFIG_NFS_V4_1) */
+ case -NFS4ERR_FILE_OPEN:
+ if (exception->timeout > HZ) {
+ /* We have retried a decent amount, time to
+ * fail
+ */
+ ret = -EBUSY;
+ break;
+ }
+ fallthrough;
+ case -NFS4ERR_DELAY:
+ nfs_inc_server_stats(server, NFSIOS_DELAY);
+ fallthrough;
+ case -NFS4ERR_GRACE:
+ case -NFS4ERR_LAYOUTTRYLATER:
+ case -NFS4ERR_RECALLCONFLICT:
+ case -NFS4ERR_RETURNCONFLICT:
+ exception->delay = 1;
+ return 0;
+
+ case -NFS4ERR_RETRY_UNCACHED_REP:
+ case -NFS4ERR_OLD_STATEID:
+ exception->retry = 1;
+ break;
+ case -NFS4ERR_BADOWNER:
+ /* The following works around a Linux server bug! */
+ case -NFS4ERR_BADNAME:
+ if (server->caps & NFS_CAP_UIDGID_NOMAP) {
+ server->caps &= ~NFS_CAP_UIDGID_NOMAP;
+ exception->retry = 1;
+ printk(KERN_WARNING "NFS: v4 server %s "
+ "does not accept raw "
+ "uid/gids. "
+ "Reenabling the idmapper.\n",
+ server->nfs_client->cl_hostname);
+ }
+ }
+ /* We failed to handle the error */
+ return nfs4_map_errors(ret);
+wait_on_recovery:
+ exception->recovering = 1;
+ return 0;
+}
+
+/* This is the error handling routine for processes that are allowed
+ * to sleep.
+ */
+int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+{
+ struct nfs_client *clp = server->nfs_client;
+ int ret;
+
+ ret = nfs4_do_handle_exception(server, errorcode, exception);
+ if (exception->delay) {
+ ret = nfs4_delay(&exception->timeout,
+ exception->interruptible);
+ goto out_retry;
+ }
+ if (exception->recovering) {
+ if (exception->task_is_privileged)
+ return -EDEADLOCK;
+ ret = nfs4_wait_clnt_recover(clp);
+ if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+ return -EIO;
+ goto out_retry;
+ }
+ return ret;
+out_retry:
+ if (ret == 0)
+ exception->retry = 1;
+ return ret;
+}
+
+static int
+nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server,
+ int errorcode, struct nfs4_exception *exception)
+{
+ struct nfs_client *clp = server->nfs_client;
+ int ret;
+
+ ret = nfs4_do_handle_exception(server, errorcode, exception);
+ if (exception->delay) {
+ rpc_delay(task, nfs4_update_delay(&exception->timeout));
+ goto out_retry;
+ }
+ if (exception->recovering) {
+ if (exception->task_is_privileged)
+ return -EDEADLOCK;
+ rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
+ if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
+ rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
+ goto out_retry;
+ }
+ if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+ ret = -EIO;
+ return ret;
+out_retry:
+ if (ret == 0) {
+ exception->retry = 1;
+ /*
+ * For NFS4ERR_MOVED, the client transport will need to
+ * be recomputed after migration recovery has completed.
+ */
+ if (errorcode == -NFS4ERR_MOVED)
+ rpc_task_release_transport(task);
+ }
+ return ret;
+}
+
+int
+nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server,
+ struct nfs4_state *state, long *timeout)
+{
+ struct nfs4_exception exception = {
+ .state = state,
+ };
+
+ if (task->tk_status >= 0)
+ return 0;
+ if (timeout)
+ exception.timeout = *timeout;
+ task->tk_status = nfs4_async_handle_exception(task, server,
+ task->tk_status,
+ &exception);
+ if (exception.delay && timeout)
+ *timeout = exception.timeout;
+ if (exception.retry)
+ return -EAGAIN;
+ return 0;
+}
+
+/*
+ * Return 'true' if 'clp' is using an rpc_client that is integrity protected
+ * or 'false' otherwise.
+ */
+static bool _nfs4_is_integrity_protected(struct nfs_client *clp)
+{
+ rpc_authflavor_t flavor = clp->cl_rpcclient->cl_auth->au_flavor;
+ return (flavor == RPC_AUTH_GSS_KRB5I) || (flavor == RPC_AUTH_GSS_KRB5P);
+}
+
+static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp)
+{
+ spin_lock(&clp->cl_lock);
+ if (time_before(clp->cl_last_renewal,timestamp))
+ clp->cl_last_renewal = timestamp;
+ spin_unlock(&clp->cl_lock);
+}
+
+static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
+{
+ struct nfs_client *clp = server->nfs_client;
+
+ if (!nfs4_has_session(clp))
+ do_renew_lease(clp, timestamp);
+}
+
+struct nfs4_call_sync_data {
+ const struct nfs_server *seq_server;
+ struct nfs4_sequence_args *seq_args;
+ struct nfs4_sequence_res *seq_res;
+};
+
+void nfs4_init_sequence(struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res, int cache_reply,
+ int privileged)
+{
+ args->sa_slot = NULL;
+ args->sa_cache_this = cache_reply;
+ args->sa_privileged = privileged;
+
+ res->sr_slot = NULL;
+}
+
+static void nfs40_sequence_free_slot(struct nfs4_sequence_res *res)
+{
+ struct nfs4_slot *slot = res->sr_slot;
+ struct nfs4_slot_table *tbl;
+
+ tbl = slot->table;
+ spin_lock(&tbl->slot_tbl_lock);
+ if (!nfs41_wake_and_assign_slot(tbl, slot))
+ nfs4_free_slot(tbl, slot);
+ spin_unlock(&tbl->slot_tbl_lock);
+
+ res->sr_slot = NULL;
+}
+
+static int nfs40_sequence_done(struct rpc_task *task,
+ struct nfs4_sequence_res *res)
+{
+ if (res->sr_slot != NULL)
+ nfs40_sequence_free_slot(res);
+ return 1;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+static void nfs41_release_slot(struct nfs4_slot *slot)
+{
+ struct nfs4_session *session;
+ struct nfs4_slot_table *tbl;
+ bool send_new_highest_used_slotid = false;
+
+ if (!slot)
+ return;
+ tbl = slot->table;
+ session = tbl->session;
+
+ /* Bump the slot sequence number */
+ if (slot->seq_done)
+ slot->seq_nr++;
+ slot->seq_done = 0;
+
+ spin_lock(&tbl->slot_tbl_lock);
+ /* Be nice to the server: try to ensure that the last transmitted
+ * value for highest_user_slotid <= target_highest_slotid
+ */
+ if (tbl->highest_used_slotid > tbl->target_highest_slotid)
+ send_new_highest_used_slotid = true;
+
+ if (nfs41_wake_and_assign_slot(tbl, slot)) {
+ send_new_highest_used_slotid = false;
+ goto out_unlock;
+ }
+ nfs4_free_slot(tbl, slot);
+
+ if (tbl->highest_used_slotid != NFS4_NO_SLOT)
+ send_new_highest_used_slotid = false;
+out_unlock:
+ spin_unlock(&tbl->slot_tbl_lock);
+ if (send_new_highest_used_slotid)
+ nfs41_notify_server(session->clp);
+ if (waitqueue_active(&tbl->slot_waitq))
+ wake_up_all(&tbl->slot_waitq);
+}
+
+static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
+{
+ nfs41_release_slot(res->sr_slot);
+ res->sr_slot = NULL;
+}
+
+static void nfs4_slot_sequence_record_sent(struct nfs4_slot *slot,
+ u32 seqnr)
+{
+ if ((s32)(seqnr - slot->seq_nr_highest_sent) > 0)
+ slot->seq_nr_highest_sent = seqnr;
+}
+static void nfs4_slot_sequence_acked(struct nfs4_slot *slot, u32 seqnr)
+{
+ nfs4_slot_sequence_record_sent(slot, seqnr);
+ slot->seq_nr_last_acked = seqnr;
+}
+
+static void nfs4_probe_sequence(struct nfs_client *client, const struct cred *cred,
+ struct nfs4_slot *slot)
+{
+ struct rpc_task *task = _nfs41_proc_sequence(client, cred, slot, true);
+ if (!IS_ERR(task))
+ rpc_put_task_async(task);
+}
+
+static int nfs41_sequence_process(struct rpc_task *task,
+ struct nfs4_sequence_res *res)
+{
+ struct nfs4_session *session;
+ struct nfs4_slot *slot = res->sr_slot;
+ struct nfs_client *clp;
+ int status;
+ int ret = 1;
+
+ if (slot == NULL)
+ goto out_noaction;
+ /* don't increment the sequence number if the task wasn't sent */
+ if (!RPC_WAS_SENT(task) || slot->seq_done)
+ goto out;
+
+ session = slot->table->session;
+ clp = session->clp;
+
+ trace_nfs4_sequence_done(session, res);
+
+ status = res->sr_status;
+ if (task->tk_status == -NFS4ERR_DEADSESSION)
+ status = -NFS4ERR_DEADSESSION;
+
+ /* Check the SEQUENCE operation status */
+ switch (status) {
+ case 0:
+ /* Mark this sequence number as having been acked */
+ nfs4_slot_sequence_acked(slot, slot->seq_nr);
+ /* Update the slot's sequence and clientid lease timer */
+ slot->seq_done = 1;
+ do_renew_lease(clp, res->sr_timestamp);
+ /* Check sequence flags */
+ nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags,
+ !!slot->privileged);
+ nfs41_update_target_slotid(slot->table, slot, res);
+ break;
+ case 1:
+ /*
+ * sr_status remains 1 if an RPC level error occurred.
+ * The server may or may not have processed the sequence
+ * operation..
+ */
+ nfs4_slot_sequence_record_sent(slot, slot->seq_nr);
+ slot->seq_done = 1;
+ goto out;
+ case -NFS4ERR_DELAY:
+ /* The server detected a resend of the RPC call and
+ * returned NFS4ERR_DELAY as per Section 2.10.6.2
+ * of RFC5661.
+ */
+ dprintk("%s: slot=%u seq=%u: Operation in progress\n",
+ __func__,
+ slot->slot_nr,
+ slot->seq_nr);
+ goto out_retry;
+ case -NFS4ERR_RETRY_UNCACHED_REP:
+ case -NFS4ERR_SEQ_FALSE_RETRY:
+ /*
+ * The server thinks we tried to replay a request.
+ * Retry the call after bumping the sequence ID.
+ */
+ nfs4_slot_sequence_acked(slot, slot->seq_nr);
+ goto retry_new_seq;
+ case -NFS4ERR_BADSLOT:
+ /*
+ * The slot id we used was probably retired. Try again
+ * using a different slot id.
+ */
+ if (slot->slot_nr < slot->table->target_highest_slotid)
+ goto session_recover;
+ goto retry_nowait;
+ case -NFS4ERR_SEQ_MISORDERED:
+ nfs4_slot_sequence_record_sent(slot, slot->seq_nr);
+ /*
+ * Were one or more calls using this slot interrupted?
+ * If the server never received the request, then our
+ * transmitted slot sequence number may be too high. However,
+ * if the server did receive the request then it might
+ * accidentally give us a reply with a mismatched operation.
+ * We can sort this out by sending a lone sequence operation
+ * to the server on the same slot.
+ */
+ if ((s32)(slot->seq_nr - slot->seq_nr_last_acked) > 1) {
+ slot->seq_nr--;
+ if (task->tk_msg.rpc_proc != &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE]) {
+ nfs4_probe_sequence(clp, task->tk_msg.rpc_cred, slot);
+ res->sr_slot = NULL;
+ }
+ goto retry_nowait;
+ }
+ /*
+ * RFC5661:
+ * A retry might be sent while the original request is
+ * still in progress on the replier. The replier SHOULD
+ * deal with the issue by returning NFS4ERR_DELAY as the
+ * reply to SEQUENCE or CB_SEQUENCE operation, but
+ * implementations MAY return NFS4ERR_SEQ_MISORDERED.
+ *
+ * Restart the search after a delay.
+ */
+ slot->seq_nr = slot->seq_nr_highest_sent;
+ goto out_retry;
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ goto session_recover;
+ default:
+ /* Just update the slot sequence no. */
+ slot->seq_done = 1;
+ }
+out:
+ /* The session may be reset by one of the error handlers. */
+ dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
+out_noaction:
+ return ret;
+session_recover:
+ set_bit(NFS4_SLOT_TBL_DRAINING, &session->fc_slot_table.slot_tbl_state);
+ nfs4_schedule_session_recovery(session, status);
+ dprintk("%s ERROR: %d Reset session\n", __func__, status);
+ nfs41_sequence_free_slot(res);
+ goto out;
+retry_new_seq:
+ ++slot->seq_nr;
+retry_nowait:
+ if (rpc_restart_call_prepare(task)) {
+ nfs41_sequence_free_slot(res);
+ task->tk_status = 0;
+ ret = 0;
+ }
+ goto out;
+out_retry:
+ if (!rpc_restart_call(task))
+ goto out;
+ rpc_delay(task, NFS4_POLL_RETRY_MAX);
+ return 0;
+}
+
+int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+ if (!nfs41_sequence_process(task, res))
+ return 0;
+ if (res->sr_slot != NULL)
+ nfs41_sequence_free_slot(res);
+ return 1;
+
+}
+EXPORT_SYMBOL_GPL(nfs41_sequence_done);
+
+static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+ if (res->sr_slot == NULL)
+ return 1;
+ if (res->sr_slot->table->session != NULL)
+ return nfs41_sequence_process(task, res);
+ return nfs40_sequence_done(task, res);
+}
+
+static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res)
+{
+ if (res->sr_slot != NULL) {
+ if (res->sr_slot->table->session != NULL)
+ nfs41_sequence_free_slot(res);
+ else
+ nfs40_sequence_free_slot(res);
+ }
+}
+
+int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+ if (res->sr_slot == NULL)
+ return 1;
+ if (!res->sr_slot->table->session)
+ return nfs40_sequence_done(task, res);
+ return nfs41_sequence_done(task, res);
+}
+EXPORT_SYMBOL_GPL(nfs4_sequence_done);
+
+static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_call_sync_data *data = calldata;
+
+ dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
+
+ nfs4_setup_sequence(data->seq_server->nfs_client,
+ data->seq_args, data->seq_res, task);
+}
+
+static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_call_sync_data *data = calldata;
+
+ nfs41_sequence_done(task, data->seq_res);
+}
+
+static const struct rpc_call_ops nfs41_call_sync_ops = {
+ .rpc_call_prepare = nfs41_call_sync_prepare,
+ .rpc_call_done = nfs41_call_sync_done,
+};
+
+#else /* !CONFIG_NFS_V4_1 */
+
+static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+ return nfs40_sequence_done(task, res);
+}
+
+static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res)
+{
+ if (res->sr_slot != NULL)
+ nfs40_sequence_free_slot(res);
+}
+
+int nfs4_sequence_done(struct rpc_task *task,
+ struct nfs4_sequence_res *res)
+{
+ return nfs40_sequence_done(task, res);
+}
+EXPORT_SYMBOL_GPL(nfs4_sequence_done);
+
+#endif /* !CONFIG_NFS_V4_1 */
+
+static void nfs41_sequence_res_init(struct nfs4_sequence_res *res)
+{
+ res->sr_timestamp = jiffies;
+ res->sr_status_flags = 0;
+ res->sr_status = 1;
+}
+
+static
+void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ struct nfs4_slot *slot)
+{
+ if (!slot)
+ return;
+ slot->privileged = args->sa_privileged ? 1 : 0;
+ args->sa_slot = slot;
+
+ res->sr_slot = slot;
+}
+
+int nfs4_setup_sequence(struct nfs_client *client,
+ struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ struct rpc_task *task)
+{
+ struct nfs4_session *session = nfs4_get_session(client);
+ struct nfs4_slot_table *tbl = client->cl_slot_tbl;
+ struct nfs4_slot *slot;
+
+ /* slot already allocated? */
+ if (res->sr_slot != NULL)
+ goto out_start;
+
+ if (session)
+ tbl = &session->fc_slot_table;
+
+ spin_lock(&tbl->slot_tbl_lock);
+ /* The state manager will wait until the slot table is empty */
+ if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
+ goto out_sleep;
+
+ slot = nfs4_alloc_slot(tbl);
+ if (IS_ERR(slot)) {
+ if (slot == ERR_PTR(-ENOMEM))
+ goto out_sleep_timeout;
+ goto out_sleep;
+ }
+ spin_unlock(&tbl->slot_tbl_lock);
+
+ nfs4_sequence_attach_slot(args, res, slot);
+
+ trace_nfs4_setup_sequence(session, args);
+out_start:
+ nfs41_sequence_res_init(res);
+ rpc_call_start(task);
+ return 0;
+out_sleep_timeout:
+ /* Try again in 1/4 second */
+ if (args->sa_privileged)
+ rpc_sleep_on_priority_timeout(&tbl->slot_tbl_waitq, task,
+ jiffies + (HZ >> 2), RPC_PRIORITY_PRIVILEGED);
+ else
+ rpc_sleep_on_timeout(&tbl->slot_tbl_waitq, task,
+ NULL, jiffies + (HZ >> 2));
+ spin_unlock(&tbl->slot_tbl_lock);
+ return -EAGAIN;
+out_sleep:
+ if (args->sa_privileged)
+ rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task,
+ RPC_PRIORITY_PRIVILEGED);
+ else
+ rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
+ spin_unlock(&tbl->slot_tbl_lock);
+ return -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(nfs4_setup_sequence);
+
+static void nfs40_call_sync_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_call_sync_data *data = calldata;
+ nfs4_setup_sequence(data->seq_server->nfs_client,
+ data->seq_args, data->seq_res, task);
+}
+
+static void nfs40_call_sync_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_call_sync_data *data = calldata;
+ nfs4_sequence_done(task, data->seq_res);
+}
+
+static const struct rpc_call_ops nfs40_call_sync_ops = {
+ .rpc_call_prepare = nfs40_call_sync_prepare,
+ .rpc_call_done = nfs40_call_sync_done,
+};
+
+static int nfs4_call_sync_custom(struct rpc_task_setup *task_setup)
+{
+ int ret;
+ struct rpc_task *task;
+
+ task = rpc_run_task(task_setup);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ ret = task->tk_status;
+ rpc_put_task(task);
+ return ret;
+}
+
+static int nfs4_do_call_sync(struct rpc_clnt *clnt,
+ struct nfs_server *server,
+ struct rpc_message *msg,
+ struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ unsigned short task_flags)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_call_sync_data data = {
+ .seq_server = server,
+ .seq_args = args,
+ .seq_res = res,
+ };
+ struct rpc_task_setup task_setup = {
+ .rpc_client = clnt,
+ .rpc_message = msg,
+ .callback_ops = clp->cl_mvops->call_sync_ops,
+ .callback_data = &data,
+ .flags = task_flags,
+ };
+
+ return nfs4_call_sync_custom(&task_setup);
+}
+
+static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
+ struct nfs_server *server,
+ struct rpc_message *msg,
+ struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res)
+{
+ unsigned short task_flags = 0;
+
+ if (server->caps & NFS_CAP_MOVEABLE)
+ task_flags = RPC_TASK_MOVEABLE;
+ return nfs4_do_call_sync(clnt, server, msg, args, res, task_flags);
+}
+
+
+int nfs4_call_sync(struct rpc_clnt *clnt,
+ struct nfs_server *server,
+ struct rpc_message *msg,
+ struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ int cache_reply)
+{
+ nfs4_init_sequence(args, res, cache_reply, 0);
+ return nfs4_call_sync_sequence(clnt, server, msg, args, res);
+}
+
+static void
+nfs4_inc_nlink_locked(struct inode *inode)
+{
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE |
+ NFS_INO_INVALID_CTIME |
+ NFS_INO_INVALID_NLINK);
+ inc_nlink(inode);
+}
+
+static void
+nfs4_inc_nlink(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ nfs4_inc_nlink_locked(inode);
+ spin_unlock(&inode->i_lock);
+}
+
+static void
+nfs4_dec_nlink_locked(struct inode *inode)
+{
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE |
+ NFS_INO_INVALID_CTIME |
+ NFS_INO_INVALID_NLINK);
+ drop_nlink(inode);
+}
+
+static void
+nfs4_update_changeattr_locked(struct inode *inode,
+ struct nfs4_change_info *cinfo,
+ unsigned long timestamp, unsigned long cache_validity)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ u64 change_attr = inode_peek_iversion_raw(inode);
+
+ cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME;
+ if (S_ISDIR(inode->i_mode))
+ cache_validity |= NFS_INO_INVALID_DATA;
+
+ switch (NFS_SERVER(inode)->change_attr_type) {
+ case NFS4_CHANGE_TYPE_IS_UNDEFINED:
+ if (cinfo->after == change_attr)
+ goto out;
+ break;
+ default:
+ if ((s64)(change_attr - cinfo->after) >= 0)
+ goto out;
+ }
+
+ inode_set_iversion_raw(inode, cinfo->after);
+ if (!cinfo->atomic || cinfo->before != change_attr) {
+ if (S_ISDIR(inode->i_mode))
+ nfs_force_lookup_revalidate(inode);
+
+ if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ cache_validity |=
+ NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL |
+ NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER |
+ NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK |
+ NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR;
+ nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
+ }
+ nfsi->attrtimeo_timestamp = jiffies;
+ nfsi->read_cache_jiffies = timestamp;
+ nfsi->attr_gencount = nfs_inc_attr_generation_counter();
+ nfsi->cache_validity &= ~NFS_INO_INVALID_CHANGE;
+out:
+ nfs_set_cache_invalid(inode, cache_validity);
+}
+
+void
+nfs4_update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo,
+ unsigned long timestamp, unsigned long cache_validity)
+{
+ spin_lock(&dir->i_lock);
+ nfs4_update_changeattr_locked(dir, cinfo, timestamp, cache_validity);
+ spin_unlock(&dir->i_lock);
+}
+
+struct nfs4_open_createattrs {
+ struct nfs4_label *label;
+ struct iattr *sattr;
+ const __u32 verf[2];
+};
+
+static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server,
+ int err, struct nfs4_exception *exception)
+{
+ if (err != -EINVAL)
+ return false;
+ if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1))
+ return false;
+ server->caps &= ~NFS_CAP_ATOMIC_OPEN_V1;
+ exception->retry = 1;
+ return true;
+}
+
+static fmode_t _nfs4_ctx_to_accessmode(const struct nfs_open_context *ctx)
+{
+ return ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC);
+}
+
+static fmode_t _nfs4_ctx_to_openmode(const struct nfs_open_context *ctx)
+{
+ fmode_t ret = ctx->mode & (FMODE_READ|FMODE_WRITE);
+
+ return (ctx->mode & FMODE_EXEC) ? FMODE_READ | ret : ret;
+}
+
+static u32
+nfs4_map_atomic_open_share(struct nfs_server *server,
+ fmode_t fmode, int openflags)
+{
+ u32 res = 0;
+
+ switch (fmode & (FMODE_READ | FMODE_WRITE)) {
+ case FMODE_READ:
+ res = NFS4_SHARE_ACCESS_READ;
+ break;
+ case FMODE_WRITE:
+ res = NFS4_SHARE_ACCESS_WRITE;
+ break;
+ case FMODE_READ|FMODE_WRITE:
+ res = NFS4_SHARE_ACCESS_BOTH;
+ }
+ if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1))
+ goto out;
+ /* Want no delegation if we're using O_DIRECT */
+ if (openflags & O_DIRECT)
+ res |= NFS4_SHARE_WANT_NO_DELEG;
+out:
+ return res;
+}
+
+static enum open_claim_type4
+nfs4_map_atomic_open_claim(struct nfs_server *server,
+ enum open_claim_type4 claim)
+{
+ if (server->caps & NFS_CAP_ATOMIC_OPEN_V1)
+ return claim;
+ switch (claim) {
+ default:
+ return claim;
+ case NFS4_OPEN_CLAIM_FH:
+ return NFS4_OPEN_CLAIM_NULL;
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ return NFS4_OPEN_CLAIM_DELEGATE_CUR;
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+ return NFS4_OPEN_CLAIM_DELEGATE_PREV;
+ }
+}
+
+static void nfs4_init_opendata_res(struct nfs4_opendata *p)
+{
+ p->o_res.f_attr = &p->f_attr;
+ p->o_res.seqid = p->o_arg.seqid;
+ p->c_res.seqid = p->c_arg.seqid;
+ p->o_res.server = p->o_arg.server;
+ p->o_res.access_request = p->o_arg.access;
+ nfs_fattr_init(&p->f_attr);
+ nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name);
+}
+
+static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
+ struct nfs4_state_owner *sp, fmode_t fmode, int flags,
+ const struct nfs4_open_createattrs *c,
+ enum open_claim_type4 claim,
+ gfp_t gfp_mask)
+{
+ struct dentry *parent = dget_parent(dentry);
+ struct inode *dir = d_inode(parent);
+ struct nfs_server *server = NFS_SERVER(dir);
+ struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+ struct nfs4_label *label = (c != NULL) ? c->label : NULL;
+ struct nfs4_opendata *p;
+
+ p = kzalloc(sizeof(*p), gfp_mask);
+ if (p == NULL)
+ goto err;
+
+ p->f_attr.label = nfs4_label_alloc(server, gfp_mask);
+ if (IS_ERR(p->f_attr.label))
+ goto err_free_p;
+
+ p->a_label = nfs4_label_alloc(server, gfp_mask);
+ if (IS_ERR(p->a_label))
+ goto err_free_f;
+
+ alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
+ p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask);
+ if (IS_ERR(p->o_arg.seqid))
+ goto err_free_label;
+ nfs_sb_active(dentry->d_sb);
+ p->dentry = dget(dentry);
+ p->dir = parent;
+ p->owner = sp;
+ atomic_inc(&sp->so_count);
+ p->o_arg.open_flags = flags;
+ p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
+ p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
+ p->o_arg.share_access = nfs4_map_atomic_open_share(server,
+ fmode, flags);
+ if (flags & O_CREAT) {
+ p->o_arg.umask = current_umask();
+ p->o_arg.label = nfs4_label_copy(p->a_label, label);
+ if (c->sattr != NULL && c->sattr->ia_valid != 0) {
+ p->o_arg.u.attrs = &p->attrs;
+ memcpy(&p->attrs, c->sattr, sizeof(p->attrs));
+
+ memcpy(p->o_arg.u.verifier.data, c->verf,
+ sizeof(p->o_arg.u.verifier.data));
+ }
+ }
+ /* ask server to check for all possible rights as results
+ * are cached */
+ switch (p->o_arg.claim) {
+ default:
+ break;
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_FH:
+ p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY |
+ NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE |
+ NFS4_ACCESS_EXECUTE |
+ nfs_access_xattr_mask(server);
+ }
+ p->o_arg.clientid = server->nfs_client->cl_clientid;
+ p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time);
+ p->o_arg.id.uniquifier = sp->so_seqid.owner_id;
+ p->o_arg.name = &dentry->d_name;
+ p->o_arg.server = server;
+ p->o_arg.bitmask = nfs4_bitmask(server, label);
+ p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0];
+ switch (p->o_arg.claim) {
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+ p->o_arg.fh = NFS_FH(dir);
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ case NFS4_OPEN_CLAIM_FH:
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+ p->o_arg.fh = NFS_FH(d_inode(dentry));
+ }
+ p->c_arg.fh = &p->o_res.fh;
+ p->c_arg.stateid = &p->o_res.stateid;
+ p->c_arg.seqid = p->o_arg.seqid;
+ nfs4_init_opendata_res(p);
+ kref_init(&p->kref);
+ return p;
+
+err_free_label:
+ nfs4_label_free(p->a_label);
+err_free_f:
+ nfs4_label_free(p->f_attr.label);
+err_free_p:
+ kfree(p);
+err:
+ dput(parent);
+ return NULL;
+}
+
+static void nfs4_opendata_free(struct kref *kref)
+{
+ struct nfs4_opendata *p = container_of(kref,
+ struct nfs4_opendata, kref);
+ struct super_block *sb = p->dentry->d_sb;
+
+ nfs4_lgopen_release(p->lgp);
+ nfs_free_seqid(p->o_arg.seqid);
+ nfs4_sequence_free_slot(&p->o_res.seq_res);
+ if (p->state != NULL)
+ nfs4_put_open_state(p->state);
+ nfs4_put_state_owner(p->owner);
+
+ nfs4_label_free(p->a_label);
+ nfs4_label_free(p->f_attr.label);
+
+ dput(p->dir);
+ dput(p->dentry);
+ nfs_sb_deactive(sb);
+ nfs_fattr_free_names(&p->f_attr);
+ kfree(p->f_attr.mdsthreshold);
+ kfree(p);
+}
+
+static void nfs4_opendata_put(struct nfs4_opendata *p)
+{
+ if (p != NULL)
+ kref_put(&p->kref, nfs4_opendata_free);
+}
+
+static bool nfs4_mode_match_open_stateid(struct nfs4_state *state,
+ fmode_t fmode)
+{
+ switch(fmode & (FMODE_READ|FMODE_WRITE)) {
+ case FMODE_READ|FMODE_WRITE:
+ return state->n_rdwr != 0;
+ case FMODE_WRITE:
+ return state->n_wronly != 0;
+ case FMODE_READ:
+ return state->n_rdonly != 0;
+ }
+ WARN_ON_ONCE(1);
+ return false;
+}
+
+static int can_open_cached(struct nfs4_state *state, fmode_t mode,
+ int open_mode, enum open_claim_type4 claim)
+{
+ int ret = 0;
+
+ if (open_mode & (O_EXCL|O_TRUNC))
+ goto out;
+ switch (claim) {
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_FH:
+ goto out;
+ default:
+ break;
+ }
+ switch (mode & (FMODE_READ|FMODE_WRITE)) {
+ case FMODE_READ:
+ ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0
+ && state->n_rdonly != 0;
+ break;
+ case FMODE_WRITE:
+ ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0
+ && state->n_wronly != 0;
+ break;
+ case FMODE_READ|FMODE_WRITE:
+ ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0
+ && state->n_rdwr != 0;
+ }
+out:
+ return ret;
+}
+
+static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode,
+ enum open_claim_type4 claim)
+{
+ if (delegation == NULL)
+ return 0;
+ if ((delegation->type & fmode) != fmode)
+ return 0;
+ switch (claim) {
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_FH:
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ if (!test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
+ break;
+ fallthrough;
+ default:
+ return 0;
+ }
+ nfs_mark_delegation_referenced(delegation);
+ return 1;
+}
+
+static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
+{
+ switch (fmode) {
+ case FMODE_WRITE:
+ state->n_wronly++;
+ break;
+ case FMODE_READ:
+ state->n_rdonly++;
+ break;
+ case FMODE_READ|FMODE_WRITE:
+ state->n_rdwr++;
+ }
+ nfs4_state_set_mode_locked(state, state->state | fmode);
+}
+
+#ifdef CONFIG_NFS_V4_1
+static bool nfs_open_stateid_recover_openmode(struct nfs4_state *state)
+{
+ if (state->n_rdonly && !test_bit(NFS_O_RDONLY_STATE, &state->flags))
+ return true;
+ if (state->n_wronly && !test_bit(NFS_O_WRONLY_STATE, &state->flags))
+ return true;
+ if (state->n_rdwr && !test_bit(NFS_O_RDWR_STATE, &state->flags))
+ return true;
+ return false;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+static void nfs_state_log_update_open_stateid(struct nfs4_state *state)
+{
+ if (test_and_clear_bit(NFS_STATE_CHANGE_WAIT, &state->flags))
+ wake_up_all(&state->waitq);
+}
+
+static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
+{
+ struct nfs_client *clp = state->owner->so_server->nfs_client;
+ bool need_recover = false;
+
+ if (test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags) && state->n_rdonly)
+ need_recover = true;
+ if (test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags) && state->n_wronly)
+ need_recover = true;
+ if (test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags) && state->n_rdwr)
+ need_recover = true;
+ if (need_recover)
+ nfs4_state_mark_reclaim_nograce(clp, state);
+}
+
+/*
+ * Check for whether or not the caller may update the open stateid
+ * to the value passed in by stateid.
+ *
+ * Note: This function relies heavily on the server implementing
+ * RFC7530 Section 9.1.4.2, and RFC5661 Section 8.2.2
+ * correctly.
+ * i.e. The stateid seqids have to be initialised to 1, and
+ * are then incremented on every state transition.
+ */
+static bool nfs_stateid_is_sequential(struct nfs4_state *state,
+ const nfs4_stateid *stateid)
+{
+ if (test_bit(NFS_OPEN_STATE, &state->flags)) {
+ /* The common case - we're updating to a new sequence number */
+ if (nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+ if (nfs4_stateid_is_next(&state->open_stateid, stateid))
+ return true;
+ return false;
+ }
+ /* The server returned a new stateid */
+ }
+ /* This is the first OPEN in this generation */
+ if (stateid->seqid == cpu_to_be32(1))
+ return true;
+ return false;
+}
+
+static void nfs_resync_open_stateid_locked(struct nfs4_state *state)
+{
+ if (!(state->n_wronly || state->n_rdonly || state->n_rdwr))
+ return;
+ if (state->n_wronly)
+ set_bit(NFS_O_WRONLY_STATE, &state->flags);
+ if (state->n_rdonly)
+ set_bit(NFS_O_RDONLY_STATE, &state->flags);
+ if (state->n_rdwr)
+ set_bit(NFS_O_RDWR_STATE, &state->flags);
+ set_bit(NFS_OPEN_STATE, &state->flags);
+}
+
+static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
+ nfs4_stateid *stateid, fmode_t fmode)
+{
+ clear_bit(NFS_O_RDWR_STATE, &state->flags);
+ switch (fmode & (FMODE_READ|FMODE_WRITE)) {
+ case FMODE_WRITE:
+ clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+ break;
+ case FMODE_READ:
+ clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+ break;
+ case 0:
+ clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+ clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+ clear_bit(NFS_OPEN_STATE, &state->flags);
+ }
+ if (stateid == NULL)
+ return;
+ /* Handle OPEN+OPEN_DOWNGRADE races */
+ if (nfs4_stateid_match_other(stateid, &state->open_stateid) &&
+ !nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
+ nfs_resync_open_stateid_locked(state);
+ goto out;
+ }
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
+ nfs4_stateid_copy(&state->stateid, stateid);
+ nfs4_stateid_copy(&state->open_stateid, stateid);
+ trace_nfs4_open_stateid_update(state->inode, stateid, 0);
+out:
+ nfs_state_log_update_open_stateid(state);
+}
+
+static void nfs_clear_open_stateid(struct nfs4_state *state,
+ nfs4_stateid *arg_stateid,
+ nfs4_stateid *stateid, fmode_t fmode)
+{
+ write_seqlock(&state->seqlock);
+ /* Ignore, if the CLOSE argment doesn't match the current stateid */
+ if (nfs4_state_match_open_stateid_other(state, arg_stateid))
+ nfs_clear_open_stateid_locked(state, stateid, fmode);
+ write_sequnlock(&state->seqlock);
+ if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
+ nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
+}
+
+static void nfs_set_open_stateid_locked(struct nfs4_state *state,
+ const nfs4_stateid *stateid, nfs4_stateid *freeme)
+ __must_hold(&state->owner->so_lock)
+ __must_hold(&state->seqlock)
+ __must_hold(RCU)
+
+{
+ DEFINE_WAIT(wait);
+ int status = 0;
+ for (;;) {
+
+ if (nfs_stateid_is_sequential(state, stateid))
+ break;
+
+ if (status)
+ break;
+ /* Rely on seqids for serialisation with NFSv4.0 */
+ if (!nfs4_has_session(NFS_SERVER(state->inode)->nfs_client))
+ break;
+
+ set_bit(NFS_STATE_CHANGE_WAIT, &state->flags);
+ prepare_to_wait(&state->waitq, &wait, TASK_KILLABLE);
+ /*
+ * Ensure we process the state changes in the same order
+ * in which the server processed them by delaying the
+ * update of the stateid until we are in sequence.
+ */
+ write_sequnlock(&state->seqlock);
+ spin_unlock(&state->owner->so_lock);
+ rcu_read_unlock();
+ trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0);
+
+ if (!fatal_signal_pending(current)) {
+ if (schedule_timeout(5*HZ) == 0)
+ status = -EAGAIN;
+ else
+ status = 0;
+ } else
+ status = -EINTR;
+ finish_wait(&state->waitq, &wait);
+ rcu_read_lock();
+ spin_lock(&state->owner->so_lock);
+ write_seqlock(&state->seqlock);
+ }
+
+ if (test_bit(NFS_OPEN_STATE, &state->flags) &&
+ !nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+ nfs4_stateid_copy(freeme, &state->open_stateid);
+ nfs_test_and_clear_all_open_stateid(state);
+ }
+
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
+ nfs4_stateid_copy(&state->stateid, stateid);
+ nfs4_stateid_copy(&state->open_stateid, stateid);
+ trace_nfs4_open_stateid_update(state->inode, stateid, status);
+ nfs_state_log_update_open_stateid(state);
+}
+
+static void nfs_state_set_open_stateid(struct nfs4_state *state,
+ const nfs4_stateid *open_stateid,
+ fmode_t fmode,
+ nfs4_stateid *freeme)
+{
+ /*
+ * Protect the call to nfs4_state_set_mode_locked and
+ * serialise the stateid update
+ */
+ write_seqlock(&state->seqlock);
+ nfs_set_open_stateid_locked(state, open_stateid, freeme);
+ switch (fmode) {
+ case FMODE_READ:
+ set_bit(NFS_O_RDONLY_STATE, &state->flags);
+ break;
+ case FMODE_WRITE:
+ set_bit(NFS_O_WRONLY_STATE, &state->flags);
+ break;
+ case FMODE_READ|FMODE_WRITE:
+ set_bit(NFS_O_RDWR_STATE, &state->flags);
+ }
+ set_bit(NFS_OPEN_STATE, &state->flags);
+ write_sequnlock(&state->seqlock);
+}
+
+static void nfs_state_clear_open_state_flags(struct nfs4_state *state)
+{
+ clear_bit(NFS_O_RDWR_STATE, &state->flags);
+ clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+ clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+ clear_bit(NFS_OPEN_STATE, &state->flags);
+}
+
+static void nfs_state_set_delegation(struct nfs4_state *state,
+ const nfs4_stateid *deleg_stateid,
+ fmode_t fmode)
+{
+ /*
+ * Protect the call to nfs4_state_set_mode_locked and
+ * serialise the stateid update
+ */
+ write_seqlock(&state->seqlock);
+ nfs4_stateid_copy(&state->stateid, deleg_stateid);
+ set_bit(NFS_DELEGATED_STATE, &state->flags);
+ write_sequnlock(&state->seqlock);
+}
+
+static void nfs_state_clear_delegation(struct nfs4_state *state)
+{
+ write_seqlock(&state->seqlock);
+ nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ write_sequnlock(&state->seqlock);
+}
+
+int update_open_stateid(struct nfs4_state *state,
+ const nfs4_stateid *open_stateid,
+ const nfs4_stateid *delegation,
+ fmode_t fmode)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs_inode *nfsi = NFS_I(state->inode);
+ struct nfs_delegation *deleg_cur;
+ nfs4_stateid freeme = { };
+ int ret = 0;
+
+ fmode &= (FMODE_READ|FMODE_WRITE);
+
+ rcu_read_lock();
+ spin_lock(&state->owner->so_lock);
+ if (open_stateid != NULL) {
+ nfs_state_set_open_stateid(state, open_stateid, fmode, &freeme);
+ ret = 1;
+ }
+
+ deleg_cur = nfs4_get_valid_delegation(state->inode);
+ if (deleg_cur == NULL)
+ goto no_delegation;
+
+ spin_lock(&deleg_cur->lock);
+ if (rcu_dereference(nfsi->delegation) != deleg_cur ||
+ test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) ||
+ (deleg_cur->type & fmode) != fmode)
+ goto no_delegation_unlock;
+
+ if (delegation == NULL)
+ delegation = &deleg_cur->stateid;
+ else if (!nfs4_stateid_match_other(&deleg_cur->stateid, delegation))
+ goto no_delegation_unlock;
+
+ nfs_mark_delegation_referenced(deleg_cur);
+ nfs_state_set_delegation(state, &deleg_cur->stateid, fmode);
+ ret = 1;
+no_delegation_unlock:
+ spin_unlock(&deleg_cur->lock);
+no_delegation:
+ if (ret)
+ update_open_stateflags(state, fmode);
+ spin_unlock(&state->owner->so_lock);
+ rcu_read_unlock();
+
+ if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
+ nfs4_schedule_state_manager(clp);
+ if (freeme.type != 0)
+ nfs4_test_and_free_stateid(server, &freeme,
+ state->owner->so_cred);
+
+ return ret;
+}
+
+static bool nfs4_update_lock_stateid(struct nfs4_lock_state *lsp,
+ const nfs4_stateid *stateid)
+{
+ struct nfs4_state *state = lsp->ls_state;
+ bool ret = false;
+
+ spin_lock(&state->state_lock);
+ if (!nfs4_stateid_match_other(stateid, &lsp->ls_stateid))
+ goto out_noupdate;
+ if (!nfs4_stateid_is_newer(stateid, &lsp->ls_stateid))
+ goto out_noupdate;
+ nfs4_stateid_copy(&lsp->ls_stateid, stateid);
+ ret = true;
+out_noupdate:
+ spin_unlock(&state->state_lock);
+ return ret;
+}
+
+static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode)
+{
+ struct nfs_delegation *delegation;
+
+ fmode &= FMODE_READ|FMODE_WRITE;
+ rcu_read_lock();
+ delegation = nfs4_get_valid_delegation(inode);
+ if (delegation == NULL || (delegation->type & fmode) == fmode) {
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+ nfs4_inode_return_delegation(inode);
+}
+
+static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
+{
+ struct nfs4_state *state = opendata->state;
+ struct nfs_delegation *delegation;
+ int open_mode = opendata->o_arg.open_flags;
+ fmode_t fmode = opendata->o_arg.fmode;
+ enum open_claim_type4 claim = opendata->o_arg.claim;
+ nfs4_stateid stateid;
+ int ret = -EAGAIN;
+
+ for (;;) {
+ spin_lock(&state->owner->so_lock);
+ if (can_open_cached(state, fmode, open_mode, claim)) {
+ update_open_stateflags(state, fmode);
+ spin_unlock(&state->owner->so_lock);
+ goto out_return_state;
+ }
+ spin_unlock(&state->owner->so_lock);
+ rcu_read_lock();
+ delegation = nfs4_get_valid_delegation(state->inode);
+ if (!can_open_delegated(delegation, fmode, claim)) {
+ rcu_read_unlock();
+ break;
+ }
+ /* Save the delegation */
+ nfs4_stateid_copy(&stateid, &delegation->stateid);
+ rcu_read_unlock();
+ nfs_release_seqid(opendata->o_arg.seqid);
+ if (!opendata->is_recover) {
+ ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
+ if (ret != 0)
+ goto out;
+ }
+ ret = -EAGAIN;
+
+ /* Try to update the stateid using the delegation */
+ if (update_open_stateid(state, NULL, &stateid, fmode))
+ goto out_return_state;
+ }
+out:
+ return ERR_PTR(ret);
+out_return_state:
+ refcount_inc(&state->count);
+ return state;
+}
+
+static void
+nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state)
+{
+ struct nfs_client *clp = NFS_SERVER(state->inode)->nfs_client;
+ struct nfs_delegation *delegation;
+ int delegation_flags = 0;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+ if (delegation)
+ delegation_flags = delegation->flags;
+ rcu_read_unlock();
+ switch (data->o_arg.claim) {
+ default:
+ break;
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ pr_err_ratelimited("NFS: Broken NFSv4 server %s is "
+ "returning a delegation for "
+ "OPEN(CLAIM_DELEGATE_CUR)\n",
+ clp->cl_hostname);
+ return;
+ }
+ if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
+ nfs_inode_set_delegation(state->inode,
+ data->owner->so_cred,
+ data->o_res.delegation_type,
+ &data->o_res.delegation,
+ data->o_res.pagemod_limit);
+ else
+ nfs_inode_reclaim_delegation(state->inode,
+ data->owner->so_cred,
+ data->o_res.delegation_type,
+ &data->o_res.delegation,
+ data->o_res.pagemod_limit);
+
+ if (data->o_res.do_recall)
+ nfs_async_inode_return_delegation(state->inode,
+ &data->o_res.delegation);
+}
+
+/*
+ * Check the inode attributes against the CLAIM_PREVIOUS returned attributes
+ * and update the nfs4_state.
+ */
+static struct nfs4_state *
+_nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
+{
+ struct inode *inode = data->state->inode;
+ struct nfs4_state *state = data->state;
+ int ret;
+
+ if (!data->rpc_done) {
+ if (data->rpc_status)
+ return ERR_PTR(data->rpc_status);
+ return nfs4_try_open_cached(data);
+ }
+
+ ret = nfs_refresh_inode(inode, &data->f_attr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (data->o_res.delegation_type != 0)
+ nfs4_opendata_check_deleg(data, state);
+
+ if (!update_open_stateid(state, &data->o_res.stateid,
+ NULL, data->o_arg.fmode))
+ return ERR_PTR(-EAGAIN);
+ refcount_inc(&state->count);
+
+ return state;
+}
+
+static struct inode *
+nfs4_opendata_get_inode(struct nfs4_opendata *data)
+{
+ struct inode *inode;
+
+ switch (data->o_arg.claim) {
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+ if (!(data->f_attr.valid & NFS_ATTR_FATTR))
+ return ERR_PTR(-EAGAIN);
+ inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh,
+ &data->f_attr);
+ break;
+ default:
+ inode = d_inode(data->dentry);
+ ihold(inode);
+ nfs_refresh_inode(inode, &data->f_attr);
+ }
+ return inode;
+}
+
+static struct nfs4_state *
+nfs4_opendata_find_nfs4_state(struct nfs4_opendata *data)
+{
+ struct nfs4_state *state;
+ struct inode *inode;
+
+ inode = nfs4_opendata_get_inode(data);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+ if (data->state != NULL && data->state->inode == inode) {
+ state = data->state;
+ refcount_inc(&state->count);
+ } else
+ state = nfs4_get_open_state(inode, data->owner);
+ iput(inode);
+ if (state == NULL)
+ state = ERR_PTR(-ENOMEM);
+ return state;
+}
+
+static struct nfs4_state *
+_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
+{
+ struct nfs4_state *state;
+
+ if (!data->rpc_done) {
+ state = nfs4_try_open_cached(data);
+ trace_nfs4_cached_open(data->state);
+ goto out;
+ }
+
+ state = nfs4_opendata_find_nfs4_state(data);
+ if (IS_ERR(state))
+ goto out;
+
+ if (data->o_res.delegation_type != 0)
+ nfs4_opendata_check_deleg(data, state);
+ if (!update_open_stateid(state, &data->o_res.stateid,
+ NULL, data->o_arg.fmode)) {
+ nfs4_put_open_state(state);
+ state = ERR_PTR(-EAGAIN);
+ }
+out:
+ nfs_release_seqid(data->o_arg.seqid);
+ return state;
+}
+
+static struct nfs4_state *
+nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
+{
+ struct nfs4_state *ret;
+
+ if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS)
+ ret =_nfs4_opendata_reclaim_to_nfs4_state(data);
+ else
+ ret = _nfs4_opendata_to_nfs4_state(data);
+ nfs4_sequence_free_slot(&data->o_res.seq_res);
+ return ret;
+}
+
+static struct nfs_open_context *
+nfs4_state_find_open_context_mode(struct nfs4_state *state, fmode_t mode)
+{
+ struct nfs_inode *nfsi = NFS_I(state->inode);
+ struct nfs_open_context *ctx;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
+ if (ctx->state != state)
+ continue;
+ if ((ctx->mode & mode) != mode)
+ continue;
+ if (!get_nfs_open_context(ctx))
+ continue;
+ rcu_read_unlock();
+ return ctx;
+ }
+ rcu_read_unlock();
+ return ERR_PTR(-ENOENT);
+}
+
+static struct nfs_open_context *
+nfs4_state_find_open_context(struct nfs4_state *state)
+{
+ struct nfs_open_context *ctx;
+
+ ctx = nfs4_state_find_open_context_mode(state, FMODE_READ|FMODE_WRITE);
+ if (!IS_ERR(ctx))
+ return ctx;
+ ctx = nfs4_state_find_open_context_mode(state, FMODE_WRITE);
+ if (!IS_ERR(ctx))
+ return ctx;
+ return nfs4_state_find_open_context_mode(state, FMODE_READ);
+}
+
+static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx,
+ struct nfs4_state *state, enum open_claim_type4 claim)
+{
+ struct nfs4_opendata *opendata;
+
+ opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0,
+ NULL, claim, GFP_NOFS);
+ if (opendata == NULL)
+ return ERR_PTR(-ENOMEM);
+ opendata->state = state;
+ refcount_inc(&state->count);
+ return opendata;
+}
+
+static int nfs4_open_recover_helper(struct nfs4_opendata *opendata,
+ fmode_t fmode)
+{
+ struct nfs4_state *newstate;
+ struct nfs_server *server = NFS_SB(opendata->dentry->d_sb);
+ int openflags = opendata->o_arg.open_flags;
+ int ret;
+
+ if (!nfs4_mode_match_open_stateid(opendata->state, fmode))
+ return 0;
+ opendata->o_arg.fmode = fmode;
+ opendata->o_arg.share_access =
+ nfs4_map_atomic_open_share(server, fmode, openflags);
+ memset(&opendata->o_res, 0, sizeof(opendata->o_res));
+ memset(&opendata->c_res, 0, sizeof(opendata->c_res));
+ nfs4_init_opendata_res(opendata);
+ ret = _nfs4_recover_proc_open(opendata);
+ if (ret != 0)
+ return ret;
+ newstate = nfs4_opendata_to_nfs4_state(opendata);
+ if (IS_ERR(newstate))
+ return PTR_ERR(newstate);
+ if (newstate != opendata->state)
+ ret = -ESTALE;
+ nfs4_close_state(newstate, fmode);
+ return ret;
+}
+
+static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *state)
+{
+ int ret;
+
+ /* memory barrier prior to reading state->n_* */
+ smp_rmb();
+ ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE);
+ if (ret != 0)
+ return ret;
+ ret = nfs4_open_recover_helper(opendata, FMODE_WRITE);
+ if (ret != 0)
+ return ret;
+ ret = nfs4_open_recover_helper(opendata, FMODE_READ);
+ if (ret != 0)
+ return ret;
+ /*
+ * We may have performed cached opens for all three recoveries.
+ * Check if we need to update the current stateid.
+ */
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 &&
+ !nfs4_stateid_match(&state->stateid, &state->open_stateid)) {
+ write_seqlock(&state->seqlock);
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
+ nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+ write_sequnlock(&state->seqlock);
+ }
+ return 0;
+}
+
+/*
+ * OPEN_RECLAIM:
+ * reclaim state on the server after a reboot.
+ */
+static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state)
+{
+ struct nfs_delegation *delegation;
+ struct nfs4_opendata *opendata;
+ fmode_t delegation_type = 0;
+ int status;
+
+ opendata = nfs4_open_recoverdata_alloc(ctx, state,
+ NFS4_OPEN_CLAIM_PREVIOUS);
+ if (IS_ERR(opendata))
+ return PTR_ERR(opendata);
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+ if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0)
+ delegation_type = delegation->type;
+ rcu_read_unlock();
+ opendata->o_arg.u.delegation_type = delegation_type;
+ status = nfs4_open_recover(opendata, state);
+ nfs4_opendata_put(opendata);
+ return status;
+}
+
+static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs4_exception exception = { };
+ int err;
+ do {
+ err = _nfs4_do_open_reclaim(ctx, state);
+ trace_nfs4_open_reclaim(ctx, 0, err);
+ if (nfs4_clear_cap_atomic_open_v1(server, err, &exception))
+ continue;
+ if (err != -NFS4ERR_DELAY)
+ break;
+ nfs4_handle_exception(server, err, &exception);
+ } while (exception.retry);
+ return err;
+}
+
+static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+ struct nfs_open_context *ctx;
+ int ret;
+
+ ctx = nfs4_state_find_open_context(state);
+ if (IS_ERR(ctx))
+ return -EAGAIN;
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ nfs_state_clear_open_state_flags(state);
+ ret = nfs4_do_open_reclaim(ctx, state);
+ put_nfs_open_context(ctx);
+ return ret;
+}
+
+static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, struct file_lock *fl, int err)
+{
+ switch (err) {
+ default:
+ printk(KERN_ERR "NFS: %s: unhandled error "
+ "%d.\n", __func__, err);
+ fallthrough;
+ case 0:
+ case -ENOENT:
+ case -EAGAIN:
+ case -ESTALE:
+ case -ETIMEDOUT:
+ break;
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case -NFS4ERR_DEADSESSION:
+ return -EAGAIN;
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_STALE_STATEID:
+ /* Don't recall a delegation if it was lost */
+ nfs4_schedule_lease_recovery(server->nfs_client);
+ return -EAGAIN;
+ case -NFS4ERR_MOVED:
+ nfs4_schedule_migration_recovery(server);
+ return -EAGAIN;
+ case -NFS4ERR_LEASE_MOVED:
+ nfs4_schedule_lease_moved_recovery(server->nfs_client);
+ return -EAGAIN;
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_OPENMODE:
+ nfs_inode_find_state_and_recover(state->inode,
+ stateid);
+ nfs4_schedule_stateid_recovery(server, state);
+ return -EAGAIN;
+ case -NFS4ERR_DELAY:
+ case -NFS4ERR_GRACE:
+ ssleep(1);
+ return -EAGAIN;
+ case -ENOMEM:
+ case -NFS4ERR_DENIED:
+ if (fl) {
+ struct nfs4_lock_state *lsp = fl->fl_u.nfs4_fl.owner;
+ if (lsp)
+ set_bit(NFS_LOCK_LOST, &lsp->ls_flags);
+ }
+ return 0;
+ }
+ return err;
+}
+
+int nfs4_open_delegation_recall(struct nfs_open_context *ctx,
+ struct nfs4_state *state, const nfs4_stateid *stateid)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs4_opendata *opendata;
+ int err = 0;
+
+ opendata = nfs4_open_recoverdata_alloc(ctx, state,
+ NFS4_OPEN_CLAIM_DELEG_CUR_FH);
+ if (IS_ERR(opendata))
+ return PTR_ERR(opendata);
+ nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid);
+ if (!test_bit(NFS_O_RDWR_STATE, &state->flags)) {
+ err = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE);
+ if (err)
+ goto out;
+ }
+ if (!test_bit(NFS_O_WRONLY_STATE, &state->flags)) {
+ err = nfs4_open_recover_helper(opendata, FMODE_WRITE);
+ if (err)
+ goto out;
+ }
+ if (!test_bit(NFS_O_RDONLY_STATE, &state->flags)) {
+ err = nfs4_open_recover_helper(opendata, FMODE_READ);
+ if (err)
+ goto out;
+ }
+ nfs_state_clear_delegation(state);
+out:
+ nfs4_opendata_put(opendata);
+ return nfs4_handle_delegation_recall_error(server, state, stateid, NULL, err);
+}
+
+static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_opendata *data = calldata;
+
+ nfs4_setup_sequence(data->o_arg.server->nfs_client,
+ &data->c_arg.seq_args, &data->c_res.seq_res, task);
+}
+
+static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_opendata *data = calldata;
+
+ nfs40_sequence_done(task, &data->c_res.seq_res);
+
+ data->rpc_status = task->tk_status;
+ if (data->rpc_status == 0) {
+ nfs4_stateid_copy(&data->o_res.stateid, &data->c_res.stateid);
+ nfs_confirm_seqid(&data->owner->so_seqid, 0);
+ renew_lease(data->o_res.server, data->timestamp);
+ data->rpc_done = true;
+ }
+}
+
+static void nfs4_open_confirm_release(void *calldata)
+{
+ struct nfs4_opendata *data = calldata;
+ struct nfs4_state *state = NULL;
+
+ /* If this request hasn't been cancelled, do nothing */
+ if (!data->cancelled)
+ goto out_free;
+ /* In case of error, no cleanup! */
+ if (!data->rpc_done)
+ goto out_free;
+ state = nfs4_opendata_to_nfs4_state(data);
+ if (!IS_ERR(state))
+ nfs4_close_state(state, data->o_arg.fmode);
+out_free:
+ nfs4_opendata_put(data);
+}
+
+static const struct rpc_call_ops nfs4_open_confirm_ops = {
+ .rpc_call_prepare = nfs4_open_confirm_prepare,
+ .rpc_call_done = nfs4_open_confirm_done,
+ .rpc_release = nfs4_open_confirm_release,
+};
+
+/*
+ * Note: On error, nfs4_proc_open_confirm will free the struct nfs4_opendata
+ */
+static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
+{
+ struct nfs_server *server = NFS_SERVER(d_inode(data->dir));
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
+ .rpc_argp = &data->c_arg,
+ .rpc_resp = &data->c_res,
+ .rpc_cred = data->owner->so_cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = server->client,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_open_confirm_ops,
+ .callback_data = data,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
+ };
+ int status;
+
+ nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1,
+ data->is_recover);
+ kref_get(&data->kref);
+ data->rpc_done = false;
+ data->rpc_status = 0;
+ data->timestamp = jiffies;
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ status = rpc_wait_for_completion_task(task);
+ if (status != 0) {
+ data->cancelled = true;
+ smp_wmb();
+ } else
+ status = data->rpc_status;
+ rpc_put_task(task);
+ return status;
+}
+
+static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_opendata *data = calldata;
+ struct nfs4_state_owner *sp = data->owner;
+ struct nfs_client *clp = sp->so_server->nfs_client;
+ enum open_claim_type4 claim = data->o_arg.claim;
+
+ if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
+ goto out_wait;
+ /*
+ * Check if we still need to send an OPEN call, or if we can use
+ * a delegation instead.
+ */
+ if (data->state != NULL) {
+ struct nfs_delegation *delegation;
+
+ if (can_open_cached(data->state, data->o_arg.fmode,
+ data->o_arg.open_flags, claim))
+ goto out_no_action;
+ rcu_read_lock();
+ delegation = nfs4_get_valid_delegation(data->state->inode);
+ if (can_open_delegated(delegation, data->o_arg.fmode, claim))
+ goto unlock_no_action;
+ rcu_read_unlock();
+ }
+ /* Update client id. */
+ data->o_arg.clientid = clp->cl_clientid;
+ switch (claim) {
+ default:
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+ data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0];
+ fallthrough;
+ case NFS4_OPEN_CLAIM_FH:
+ task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
+ }
+ data->timestamp = jiffies;
+ if (nfs4_setup_sequence(data->o_arg.server->nfs_client,
+ &data->o_arg.seq_args,
+ &data->o_res.seq_res,
+ task) != 0)
+ nfs_release_seqid(data->o_arg.seqid);
+
+ /* Set the create mode (note dependency on the session type) */
+ data->o_arg.createmode = NFS4_CREATE_UNCHECKED;
+ if (data->o_arg.open_flags & O_EXCL) {
+ data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE4_1;
+ if (clp->cl_mvops->minor_version == 0) {
+ data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE;
+ /* don't put an ACCESS op in OPEN compound if O_EXCL,
+ * because ACCESS will return permission denied for
+ * all bits until close */
+ data->o_res.access_request = data->o_arg.access = 0;
+ } else if (nfs4_has_persistent_session(clp))
+ data->o_arg.createmode = NFS4_CREATE_GUARDED;
+ }
+ return;
+unlock_no_action:
+ trace_nfs4_cached_open(data->state);
+ rcu_read_unlock();
+out_no_action:
+ task->tk_action = NULL;
+out_wait:
+ nfs4_sequence_done(task, &data->o_res.seq_res);
+}
+
+static void nfs4_open_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_opendata *data = calldata;
+
+ data->rpc_status = task->tk_status;
+
+ if (!nfs4_sequence_process(task, &data->o_res.seq_res))
+ return;
+
+ if (task->tk_status == 0) {
+ if (data->o_res.f_attr->valid & NFS_ATTR_FATTR_TYPE) {
+ switch (data->o_res.f_attr->mode & S_IFMT) {
+ case S_IFREG:
+ break;
+ case S_IFLNK:
+ data->rpc_status = -ELOOP;
+ break;
+ case S_IFDIR:
+ data->rpc_status = -EISDIR;
+ break;
+ default:
+ data->rpc_status = -ENOTDIR;
+ }
+ }
+ renew_lease(data->o_res.server, data->timestamp);
+ if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM))
+ nfs_confirm_seqid(&data->owner->so_seqid, 0);
+ }
+ data->rpc_done = true;
+}
+
+static void nfs4_open_release(void *calldata)
+{
+ struct nfs4_opendata *data = calldata;
+ struct nfs4_state *state = NULL;
+
+ /* If this request hasn't been cancelled, do nothing */
+ if (!data->cancelled)
+ goto out_free;
+ /* In case of error, no cleanup! */
+ if (data->rpc_status != 0 || !data->rpc_done)
+ goto out_free;
+ /* In case we need an open_confirm, no cleanup! */
+ if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)
+ goto out_free;
+ state = nfs4_opendata_to_nfs4_state(data);
+ if (!IS_ERR(state))
+ nfs4_close_state(state, data->o_arg.fmode);
+out_free:
+ nfs4_opendata_put(data);
+}
+
+static const struct rpc_call_ops nfs4_open_ops = {
+ .rpc_call_prepare = nfs4_open_prepare,
+ .rpc_call_done = nfs4_open_done,
+ .rpc_release = nfs4_open_release,
+};
+
+static int nfs4_run_open_task(struct nfs4_opendata *data,
+ struct nfs_open_context *ctx)
+{
+ struct inode *dir = d_inode(data->dir);
+ struct nfs_server *server = NFS_SERVER(dir);
+ struct nfs_openargs *o_arg = &data->o_arg;
+ struct nfs_openres *o_res = &data->o_res;
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN],
+ .rpc_argp = o_arg,
+ .rpc_resp = o_res,
+ .rpc_cred = data->owner->so_cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = server->client,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_open_ops,
+ .callback_data = data,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
+ };
+ int status;
+
+ if (nfs_server_capable(dir, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
+ kref_get(&data->kref);
+ data->rpc_done = false;
+ data->rpc_status = 0;
+ data->cancelled = false;
+ data->is_recover = false;
+ if (!ctx) {
+ nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1, 1);
+ data->is_recover = true;
+ task_setup_data.flags |= RPC_TASK_TIMEOUT;
+ } else {
+ nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1, 0);
+ pnfs_lgopen_prepare(data, ctx);
+ }
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ status = rpc_wait_for_completion_task(task);
+ if (status != 0) {
+ data->cancelled = true;
+ smp_wmb();
+ } else
+ status = data->rpc_status;
+ rpc_put_task(task);
+
+ return status;
+}
+
+static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
+{
+ struct inode *dir = d_inode(data->dir);
+ struct nfs_openres *o_res = &data->o_res;
+ int status;
+
+ status = nfs4_run_open_task(data, NULL);
+ if (status != 0 || !data->rpc_done)
+ return status;
+
+ nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr);
+
+ if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM)
+ status = _nfs4_proc_open_confirm(data);
+
+ return status;
+}
+
+/*
+ * Additional permission checks in order to distinguish between an
+ * open for read, and an open for execute. This works around the
+ * fact that NFSv4 OPEN treats read and execute permissions as being
+ * the same.
+ * Note that in the non-execute case, we want to turn off permission
+ * checking if we just created a new file (POSIX open() semantics).
+ */
+static int nfs4_opendata_access(const struct cred *cred,
+ struct nfs4_opendata *opendata,
+ struct nfs4_state *state, fmode_t fmode)
+{
+ struct nfs_access_entry cache;
+ u32 mask, flags;
+
+ /* access call failed or for some reason the server doesn't
+ * support any access modes -- defer access call until later */
+ if (opendata->o_res.access_supported == 0)
+ return 0;
+
+ mask = 0;
+ if (fmode & FMODE_EXEC) {
+ /* ONLY check for exec rights */
+ if (S_ISDIR(state->inode->i_mode))
+ mask = NFS4_ACCESS_LOOKUP;
+ else
+ mask = NFS4_ACCESS_EXECUTE;
+ } else if ((fmode & FMODE_READ) && !opendata->file_created)
+ mask = NFS4_ACCESS_READ;
+
+ nfs_access_set_mask(&cache, opendata->o_res.access_result);
+ nfs_access_add_cache(state->inode, &cache, cred);
+
+ flags = NFS4_ACCESS_READ | NFS4_ACCESS_EXECUTE | NFS4_ACCESS_LOOKUP;
+ if ((mask & ~cache.mask & flags) == 0)
+ return 0;
+
+ return -EACCES;
+}
+
+/*
+ * Note: On error, nfs4_proc_open will free the struct nfs4_opendata
+ */
+static int _nfs4_proc_open(struct nfs4_opendata *data,
+ struct nfs_open_context *ctx)
+{
+ struct inode *dir = d_inode(data->dir);
+ struct nfs_server *server = NFS_SERVER(dir);
+ struct nfs_openargs *o_arg = &data->o_arg;
+ struct nfs_openres *o_res = &data->o_res;
+ int status;
+
+ status = nfs4_run_open_task(data, ctx);
+ if (!data->rpc_done)
+ return status;
+ if (status != 0) {
+ if (status == -NFS4ERR_BADNAME &&
+ !(o_arg->open_flags & O_CREAT))
+ return -ENOENT;
+ return status;
+ }
+
+ nfs_fattr_map_and_free_names(server, &data->f_attr);
+
+ if (o_arg->open_flags & O_CREAT) {
+ if (o_arg->open_flags & O_EXCL)
+ data->file_created = true;
+ else if (o_res->cinfo.before != o_res->cinfo.after)
+ data->file_created = true;
+ if (data->file_created ||
+ inode_peek_iversion_raw(dir) != o_res->cinfo.after)
+ nfs4_update_changeattr(dir, &o_res->cinfo,
+ o_res->f_attr->time_start,
+ NFS_INO_INVALID_DATA);
+ }
+ if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
+ server->caps &= ~NFS_CAP_POSIX_LOCK;
+ if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
+ status = _nfs4_proc_open_confirm(data);
+ if (status != 0)
+ return status;
+ }
+ if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) {
+ struct nfs_fh *fh = &o_res->fh;
+
+ nfs4_sequence_free_slot(&o_res->seq_res);
+ if (o_arg->claim == NFS4_OPEN_CLAIM_FH)
+ fh = NFS_FH(d_inode(data->dentry));
+ nfs4_proc_getattr(server, fh, o_res->f_attr, NULL);
+ }
+ return 0;
+}
+
+/*
+ * OPEN_EXPIRED:
+ * reclaim state on the server after a network partition.
+ * Assumes caller holds the appropriate lock
+ */
+static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state)
+{
+ struct nfs4_opendata *opendata;
+ int ret;
+
+ opendata = nfs4_open_recoverdata_alloc(ctx, state, NFS4_OPEN_CLAIM_FH);
+ if (IS_ERR(opendata))
+ return PTR_ERR(opendata);
+ /*
+ * We're not recovering a delegation, so ask for no delegation.
+ * Otherwise the recovery thread could deadlock with an outstanding
+ * delegation return.
+ */
+ opendata->o_arg.open_flags = O_DIRECT;
+ ret = nfs4_open_recover(opendata, state);
+ if (ret == -ESTALE)
+ d_drop(ctx->dentry);
+ nfs4_opendata_put(opendata);
+ return ret;
+}
+
+static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs4_exception exception = { };
+ int err;
+
+ do {
+ err = _nfs4_open_expired(ctx, state);
+ trace_nfs4_open_expired(ctx, 0, err);
+ if (nfs4_clear_cap_atomic_open_v1(server, err, &exception))
+ continue;
+ switch (err) {
+ default:
+ goto out;
+ case -NFS4ERR_GRACE:
+ case -NFS4ERR_DELAY:
+ nfs4_handle_exception(server, err, &exception);
+ err = 0;
+ }
+ } while (exception.retry);
+out:
+ return err;
+}
+
+static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+ struct nfs_open_context *ctx;
+ int ret;
+
+ ctx = nfs4_state_find_open_context(state);
+ if (IS_ERR(ctx))
+ return -EAGAIN;
+ ret = nfs4_do_open_expired(ctx, state);
+ put_nfs_open_context(ctx);
+ return ret;
+}
+
+static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state,
+ const nfs4_stateid *stateid)
+{
+ nfs_remove_bad_delegation(state->inode, stateid);
+ nfs_state_clear_delegation(state);
+}
+
+static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
+{
+ if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL)
+ nfs_finish_clear_delegation_stateid(state, NULL);
+}
+
+static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+ /* NFSv4.0 doesn't allow for delegation recovery on open expire */
+ nfs40_clear_delegation_stateid(state);
+ nfs_state_clear_open_state_flags(state);
+ return nfs4_open_expired(sp, state);
+}
+
+static int nfs40_test_and_free_expired_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ const struct cred *cred)
+{
+ return -NFS4ERR_BAD_STATEID;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+static int nfs41_test_and_free_expired_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ const struct cred *cred)
+{
+ int status;
+
+ switch (stateid->type) {
+ default:
+ break;
+ case NFS4_INVALID_STATEID_TYPE:
+ case NFS4_SPECIAL_STATEID_TYPE:
+ return -NFS4ERR_BAD_STATEID;
+ case NFS4_REVOKED_STATEID_TYPE:
+ goto out_free;
+ }
+
+ status = nfs41_test_stateid(server, stateid, cred);
+ switch (status) {
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_DELEG_REVOKED:
+ break;
+ default:
+ return status;
+ }
+out_free:
+ /* Ack the revoked state to the server */
+ nfs41_free_stateid(server, stateid, cred, true);
+ return -NFS4ERR_EXPIRED;
+}
+
+static int nfs41_check_delegation_stateid(struct nfs4_state *state)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ nfs4_stateid stateid;
+ struct nfs_delegation *delegation;
+ const struct cred *cred = NULL;
+ int status, ret = NFS_OK;
+
+ /* Get the delegation credential for use by test/free_stateid */
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+ if (delegation == NULL) {
+ rcu_read_unlock();
+ nfs_state_clear_delegation(state);
+ return NFS_OK;
+ }
+
+ spin_lock(&delegation->lock);
+ nfs4_stateid_copy(&stateid, &delegation->stateid);
+
+ if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED,
+ &delegation->flags)) {
+ spin_unlock(&delegation->lock);
+ rcu_read_unlock();
+ return NFS_OK;
+ }
+
+ if (delegation->cred)
+ cred = get_cred(delegation->cred);
+ spin_unlock(&delegation->lock);
+ rcu_read_unlock();
+ status = nfs41_test_and_free_expired_stateid(server, &stateid, cred);
+ trace_nfs4_test_delegation_stateid(state, NULL, status);
+ if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID)
+ nfs_finish_clear_delegation_stateid(state, &stateid);
+ else
+ ret = status;
+
+ put_cred(cred);
+ return ret;
+}
+
+static void nfs41_delegation_recover_stateid(struct nfs4_state *state)
+{
+ nfs4_stateid tmp;
+
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags) &&
+ nfs4_copy_delegation_stateid(state->inode, state->state,
+ &tmp, NULL) &&
+ nfs4_stateid_match_other(&state->stateid, &tmp))
+ nfs_state_set_delegation(state, &tmp, state->state);
+ else
+ nfs_state_clear_delegation(state);
+}
+
+/**
+ * nfs41_check_expired_locks - possibly free a lock stateid
+ *
+ * @state: NFSv4 state for an inode
+ *
+ * Returns NFS_OK if recovery for this stateid is now finished.
+ * Otherwise a negative NFS4ERR value is returned.
+ */
+static int nfs41_check_expired_locks(struct nfs4_state *state)
+{
+ int status, ret = NFS_OK;
+ struct nfs4_lock_state *lsp, *prev = NULL;
+ struct nfs_server *server = NFS_SERVER(state->inode);
+
+ if (!test_bit(LK_STATE_IN_USE, &state->flags))
+ goto out;
+
+ spin_lock(&state->state_lock);
+ list_for_each_entry(lsp, &state->lock_states, ls_locks) {
+ if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
+ const struct cred *cred = lsp->ls_state->owner->so_cred;
+
+ refcount_inc(&lsp->ls_count);
+ spin_unlock(&state->state_lock);
+
+ nfs4_put_lock_state(prev);
+ prev = lsp;
+
+ status = nfs41_test_and_free_expired_stateid(server,
+ &lsp->ls_stateid,
+ cred);
+ trace_nfs4_test_lock_stateid(state, lsp, status);
+ if (status == -NFS4ERR_EXPIRED ||
+ status == -NFS4ERR_BAD_STATEID) {
+ clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
+ lsp->ls_stateid.type = NFS4_INVALID_STATEID_TYPE;
+ if (!recover_lost_locks)
+ set_bit(NFS_LOCK_LOST, &lsp->ls_flags);
+ } else if (status != NFS_OK) {
+ ret = status;
+ nfs4_put_lock_state(prev);
+ goto out;
+ }
+ spin_lock(&state->state_lock);
+ }
+ }
+ spin_unlock(&state->state_lock);
+ nfs4_put_lock_state(prev);
+out:
+ return ret;
+}
+
+/**
+ * nfs41_check_open_stateid - possibly free an open stateid
+ *
+ * @state: NFSv4 state for an inode
+ *
+ * Returns NFS_OK if recovery for this stateid is now finished.
+ * Otherwise a negative NFS4ERR value is returned.
+ */
+static int nfs41_check_open_stateid(struct nfs4_state *state)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ nfs4_stateid *stateid = &state->open_stateid;
+ const struct cred *cred = state->owner->so_cred;
+ int status;
+
+ if (test_bit(NFS_OPEN_STATE, &state->flags) == 0)
+ return -NFS4ERR_BAD_STATEID;
+ status = nfs41_test_and_free_expired_stateid(server, stateid, cred);
+ trace_nfs4_test_open_stateid(state, NULL, status);
+ if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) {
+ nfs_state_clear_open_state_flags(state);
+ stateid->type = NFS4_INVALID_STATEID_TYPE;
+ return status;
+ }
+ if (nfs_open_stateid_recover_openmode(state))
+ return -NFS4ERR_OPENMODE;
+ return NFS_OK;
+}
+
+static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+ int status;
+
+ status = nfs41_check_delegation_stateid(state);
+ if (status != NFS_OK)
+ return status;
+ nfs41_delegation_recover_stateid(state);
+
+ status = nfs41_check_expired_locks(state);
+ if (status != NFS_OK)
+ return status;
+ status = nfs41_check_open_stateid(state);
+ if (status != NFS_OK)
+ status = nfs4_open_expired(sp, state);
+ return status;
+}
+#endif
+
+/*
+ * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-*
+ * fields corresponding to attributes that were used to store the verifier.
+ * Make sure we clobber those fields in the later setattr call
+ */
+static unsigned nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
+ struct iattr *sattr, struct nfs4_label **label)
+{
+ const __u32 *bitmask = opendata->o_arg.server->exclcreat_bitmask;
+ __u32 attrset[3];
+ unsigned ret;
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(attrset); i++) {
+ attrset[i] = opendata->o_res.attrset[i];
+ if (opendata->o_arg.createmode == NFS4_CREATE_EXCLUSIVE4_1)
+ attrset[i] &= ~bitmask[i];
+ }
+
+ ret = (opendata->o_arg.createmode == NFS4_CREATE_EXCLUSIVE) ?
+ sattr->ia_valid : 0;
+
+ if ((attrset[1] & (FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET))) {
+ if (sattr->ia_valid & ATTR_ATIME_SET)
+ ret |= ATTR_ATIME_SET;
+ else
+ ret |= ATTR_ATIME;
+ }
+
+ if ((attrset[1] & (FATTR4_WORD1_TIME_MODIFY|FATTR4_WORD1_TIME_MODIFY_SET))) {
+ if (sattr->ia_valid & ATTR_MTIME_SET)
+ ret |= ATTR_MTIME_SET;
+ else
+ ret |= ATTR_MTIME;
+ }
+
+ if (!(attrset[2] & FATTR4_WORD2_SECURITY_LABEL))
+ *label = NULL;
+ return ret;
+}
+
+static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
+ struct nfs_open_context *ctx)
+{
+ struct nfs4_state_owner *sp = opendata->owner;
+ struct nfs_server *server = sp->so_server;
+ struct dentry *dentry;
+ struct nfs4_state *state;
+ fmode_t acc_mode = _nfs4_ctx_to_accessmode(ctx);
+ struct inode *dir = d_inode(opendata->dir);
+ unsigned long dir_verifier;
+ unsigned int seq;
+ int ret;
+
+ seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+ dir_verifier = nfs_save_change_attribute(dir);
+
+ ret = _nfs4_proc_open(opendata, ctx);
+ if (ret != 0)
+ goto out;
+
+ state = _nfs4_opendata_to_nfs4_state(opendata);
+ ret = PTR_ERR(state);
+ if (IS_ERR(state))
+ goto out;
+ ctx->state = state;
+ if (server->caps & NFS_CAP_POSIX_LOCK)
+ set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
+ if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK)
+ set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags);
+ if (opendata->o_res.rflags & NFS4_OPEN_RESULT_PRESERVE_UNLINKED)
+ set_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(state->inode)->flags);
+
+ dentry = opendata->dentry;
+ if (d_really_is_negative(dentry)) {
+ struct dentry *alias;
+ d_drop(dentry);
+ alias = d_exact_alias(dentry, state->inode);
+ if (!alias)
+ alias = d_splice_alias(igrab(state->inode), dentry);
+ /* d_splice_alias() can't fail here - it's a non-directory */
+ if (alias) {
+ dput(ctx->dentry);
+ ctx->dentry = dentry = alias;
+ }
+ }
+
+ switch(opendata->o_arg.claim) {
+ default:
+ break;
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+ if (!opendata->rpc_done)
+ break;
+ if (opendata->o_res.delegation_type != 0)
+ dir_verifier = nfs_save_change_attribute(dir);
+ nfs_set_verifier(dentry, dir_verifier);
+ }
+
+ /* Parse layoutget results before we check for access */
+ pnfs_parse_lgopen(state->inode, opendata->lgp, ctx);
+
+ ret = nfs4_opendata_access(sp->so_cred, opendata, state, acc_mode);
+ if (ret != 0)
+ goto out;
+
+ if (d_inode(dentry) == state->inode) {
+ nfs_inode_attach_open_context(ctx);
+ if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+ nfs4_schedule_stateid_recovery(server, state);
+ }
+
+out:
+ if (!opendata->cancelled) {
+ if (opendata->lgp) {
+ nfs4_lgopen_release(opendata->lgp);
+ opendata->lgp = NULL;
+ }
+ nfs4_sequence_free_slot(&opendata->o_res.seq_res);
+ }
+ return ret;
+}
+
+/*
+ * Returns a referenced nfs4_state
+ */
+static int _nfs4_do_open(struct inode *dir,
+ struct nfs_open_context *ctx,
+ int flags,
+ const struct nfs4_open_createattrs *c,
+ int *opened)
+{
+ struct nfs4_state_owner *sp;
+ struct nfs4_state *state = NULL;
+ struct nfs_server *server = NFS_SERVER(dir);
+ struct nfs4_opendata *opendata;
+ struct dentry *dentry = ctx->dentry;
+ const struct cred *cred = ctx->cred;
+ struct nfs4_threshold **ctx_th = &ctx->mdsthreshold;
+ fmode_t fmode = _nfs4_ctx_to_openmode(ctx);
+ enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
+ struct iattr *sattr = c->sattr;
+ struct nfs4_label *label = c->label;
+ int status;
+
+ /* Protect against reboot recovery conflicts */
+ status = -ENOMEM;
+ sp = nfs4_get_state_owner(server, cred, GFP_KERNEL);
+ if (sp == NULL) {
+ dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n");
+ goto out_err;
+ }
+ status = nfs4_client_recover_expired_lease(server->nfs_client);
+ if (status != 0)
+ goto err_put_state_owner;
+ if (d_really_is_positive(dentry))
+ nfs4_return_incompatible_delegation(d_inode(dentry), fmode);
+ status = -ENOMEM;
+ if (d_really_is_positive(dentry))
+ claim = NFS4_OPEN_CLAIM_FH;
+ opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags,
+ c, claim, GFP_KERNEL);
+ if (opendata == NULL)
+ goto err_put_state_owner;
+
+ if (server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
+ if (!opendata->f_attr.mdsthreshold) {
+ opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
+ if (!opendata->f_attr.mdsthreshold)
+ goto err_opendata_put;
+ }
+ opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
+ }
+ if (d_really_is_positive(dentry))
+ opendata->state = nfs4_get_open_state(d_inode(dentry), sp);
+
+ status = _nfs4_open_and_get_state(opendata, ctx);
+ if (status != 0)
+ goto err_opendata_put;
+ state = ctx->state;
+
+ if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
+ (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
+ unsigned attrs = nfs4_exclusive_attrset(opendata, sattr, &label);
+ /*
+ * send create attributes which was not set by open
+ * with an extra setattr.
+ */
+ if (attrs || label) {
+ unsigned ia_old = sattr->ia_valid;
+
+ sattr->ia_valid = attrs;
+ nfs_fattr_init(opendata->o_res.f_attr);
+ status = nfs4_do_setattr(state->inode, cred,
+ opendata->o_res.f_attr, sattr,
+ ctx, label);
+ if (status == 0) {
+ nfs_setattr_update_inode(state->inode, sattr,
+ opendata->o_res.f_attr);
+ nfs_setsecurity(state->inode, opendata->o_res.f_attr);
+ }
+ sattr->ia_valid = ia_old;
+ }
+ }
+ if (opened && opendata->file_created)
+ *opened = 1;
+
+ if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {
+ *ctx_th = opendata->f_attr.mdsthreshold;
+ opendata->f_attr.mdsthreshold = NULL;
+ }
+
+ nfs4_opendata_put(opendata);
+ nfs4_put_state_owner(sp);
+ return 0;
+err_opendata_put:
+ nfs4_opendata_put(opendata);
+err_put_state_owner:
+ nfs4_put_state_owner(sp);
+out_err:
+ return status;
+}
+
+
+static struct nfs4_state *nfs4_do_open(struct inode *dir,
+ struct nfs_open_context *ctx,
+ int flags,
+ struct iattr *sattr,
+ struct nfs4_label *label,
+ int *opened)
+{
+ struct nfs_server *server = NFS_SERVER(dir);
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ struct nfs4_state *res;
+ struct nfs4_open_createattrs c = {
+ .label = label,
+ .sattr = sattr,
+ .verf = {
+ [0] = (__u32)jiffies,
+ [1] = (__u32)current->pid,
+ },
+ };
+ int status;
+
+ do {
+ status = _nfs4_do_open(dir, ctx, flags, &c, opened);
+ res = ctx->state;
+ trace_nfs4_open_file(ctx, flags, status);
+ if (status == 0)
+ break;
+ /* NOTE: BAD_SEQID means the server and client disagree about the
+ * book-keeping w.r.t. state-changing operations
+ * (OPEN/CLOSE/LOCK/LOCKU...)
+ * It is actually a sign of a bug on the client or on the server.
+ *
+ * If we receive a BAD_SEQID error in the particular case of
+ * doing an OPEN, we assume that nfs_increment_open_seqid() will
+ * have unhashed the old state_owner for us, and that we can
+ * therefore safely retry using a new one. We should still warn
+ * the user though...
+ */
+ if (status == -NFS4ERR_BAD_SEQID) {
+ pr_warn_ratelimited("NFS: v4 server %s "
+ " returned a bad sequence-id error!\n",
+ NFS_SERVER(dir)->nfs_client->cl_hostname);
+ exception.retry = 1;
+ continue;
+ }
+ /*
+ * BAD_STATEID on OPEN means that the server cancelled our
+ * state before it received the OPEN_CONFIRM.
+ * Recover by retrying the request as per the discussion
+ * on Page 181 of RFC3530.
+ */
+ if (status == -NFS4ERR_BAD_STATEID) {
+ exception.retry = 1;
+ continue;
+ }
+ if (status == -NFS4ERR_EXPIRED) {
+ nfs4_schedule_lease_recovery(server->nfs_client);
+ exception.retry = 1;
+ continue;
+ }
+ if (status == -EAGAIN) {
+ /* We must have found a delegation */
+ exception.retry = 1;
+ continue;
+ }
+ if (nfs4_clear_cap_atomic_open_v1(server, status, &exception))
+ continue;
+ res = ERR_PTR(nfs4_handle_exception(server,
+ status, &exception));
+ } while (exception.retry);
+ return res;
+}
+
+static int _nfs4_do_setattr(struct inode *inode,
+ struct nfs_setattrargs *arg,
+ struct nfs_setattrres *res,
+ const struct cred *cred,
+ struct nfs_open_context *ctx)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+ .rpc_argp = arg,
+ .rpc_resp = res,
+ .rpc_cred = cred,
+ };
+ const struct cred *delegation_cred = NULL;
+ unsigned long timestamp = jiffies;
+ bool truncate;
+ int status;
+
+ nfs_fattr_init(res->fattr);
+
+ /* Servers should only apply open mode checks for file size changes */
+ truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false;
+ if (!truncate) {
+ nfs4_inode_make_writeable(inode);
+ goto zero_stateid;
+ }
+
+ if (nfs4_copy_delegation_stateid(inode, FMODE_WRITE, &arg->stateid, &delegation_cred)) {
+ /* Use that stateid */
+ } else if (ctx != NULL && ctx->state) {
+ struct nfs_lock_context *l_ctx;
+ if (!nfs4_valid_open_stateid(ctx->state))
+ return -EBADF;
+ l_ctx = nfs_get_lock_context(ctx);
+ if (IS_ERR(l_ctx))
+ return PTR_ERR(l_ctx);
+ status = nfs4_select_rw_stateid(ctx->state, FMODE_WRITE, l_ctx,
+ &arg->stateid, &delegation_cred);
+ nfs_put_lock_context(l_ctx);
+ if (status == -EIO)
+ return -EBADF;
+ else if (status == -EAGAIN)
+ goto zero_stateid;
+ } else {
+zero_stateid:
+ nfs4_stateid_copy(&arg->stateid, &zero_stateid);
+ }
+ if (delegation_cred)
+ msg.rpc_cred = delegation_cred;
+
+ status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1);
+
+ put_cred(delegation_cred);
+ if (status == 0 && ctx != NULL)
+ renew_lease(server, timestamp);
+ trace_nfs4_setattr(inode, &arg->stateid, status);
+ return status;
+}
+
+static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
+ struct nfs_fattr *fattr, struct iattr *sattr,
+ struct nfs_open_context *ctx, struct nfs4_label *ilabel)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ __u32 bitmask[NFS4_BITMASK_SZ];
+ struct nfs4_state *state = ctx ? ctx->state : NULL;
+ struct nfs_setattrargs arg = {
+ .fh = NFS_FH(inode),
+ .iap = sattr,
+ .server = server,
+ .bitmask = bitmask,
+ .label = ilabel,
+ };
+ struct nfs_setattrres res = {
+ .fattr = fattr,
+ .server = server,
+ };
+ struct nfs4_exception exception = {
+ .state = state,
+ .inode = inode,
+ .stateid = &arg.stateid,
+ };
+ unsigned long adjust_flags = NFS_INO_INVALID_CHANGE;
+ int err;
+
+ if (sattr->ia_valid & (ATTR_MODE | ATTR_KILL_SUID | ATTR_KILL_SGID))
+ adjust_flags |= NFS_INO_INVALID_MODE;
+ if (sattr->ia_valid & (ATTR_UID | ATTR_GID))
+ adjust_flags |= NFS_INO_INVALID_OTHER;
+
+ do {
+ nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, fattr->label),
+ inode, adjust_flags);
+
+ err = _nfs4_do_setattr(inode, &arg, &res, cred, ctx);
+ switch (err) {
+ case -NFS4ERR_OPENMODE:
+ if (!(sattr->ia_valid & ATTR_SIZE)) {
+ pr_warn_once("NFSv4: server %s is incorrectly "
+ "applying open mode checks to "
+ "a SETATTR that is not "
+ "changing file size.\n",
+ server->nfs_client->cl_hostname);
+ }
+ if (state && !(state->state & FMODE_WRITE)) {
+ err = -EBADF;
+ if (sattr->ia_valid & ATTR_OPEN)
+ err = -EACCES;
+ goto out;
+ }
+ }
+ err = nfs4_handle_exception(server, err, &exception);
+ } while (exception.retry);
+out:
+ return err;
+}
+
+static bool
+nfs4_wait_on_layoutreturn(struct inode *inode, struct rpc_task *task)
+{
+ if (inode == NULL || !nfs_have_layout(inode))
+ return false;
+
+ return pnfs_wait_on_layoutreturn(inode, task);
+}
+
+/*
+ * Update the seqid of an open stateid
+ */
+static void nfs4_sync_open_stateid(nfs4_stateid *dst,
+ struct nfs4_state *state)
+{
+ __be32 seqid_open;
+ u32 dst_seqid;
+ int seq;
+
+ for (;;) {
+ if (!nfs4_valid_open_stateid(state))
+ break;
+ seq = read_seqbegin(&state->seqlock);
+ if (!nfs4_state_match_open_stateid_other(state, dst)) {
+ nfs4_stateid_copy(dst, &state->open_stateid);
+ if (read_seqretry(&state->seqlock, seq))
+ continue;
+ break;
+ }
+ seqid_open = state->open_stateid.seqid;
+ if (read_seqretry(&state->seqlock, seq))
+ continue;
+
+ dst_seqid = be32_to_cpu(dst->seqid);
+ if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) < 0)
+ dst->seqid = seqid_open;
+ break;
+ }
+}
+
+/*
+ * Update the seqid of an open stateid after receiving
+ * NFS4ERR_OLD_STATEID
+ */
+static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst,
+ struct nfs4_state *state)
+{
+ __be32 seqid_open;
+ u32 dst_seqid;
+ bool ret;
+ int seq, status = -EAGAIN;
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ ret = false;
+ if (!nfs4_valid_open_stateid(state))
+ break;
+ seq = read_seqbegin(&state->seqlock);
+ if (!nfs4_state_match_open_stateid_other(state, dst)) {
+ if (read_seqretry(&state->seqlock, seq))
+ continue;
+ break;
+ }
+
+ write_seqlock(&state->seqlock);
+ seqid_open = state->open_stateid.seqid;
+
+ dst_seqid = be32_to_cpu(dst->seqid);
+
+ /* Did another OPEN bump the state's seqid? try again: */
+ if ((s32)(be32_to_cpu(seqid_open) - dst_seqid) > 0) {
+ dst->seqid = seqid_open;
+ write_sequnlock(&state->seqlock);
+ ret = true;
+ break;
+ }
+
+ /* server says we're behind but we haven't seen the update yet */
+ set_bit(NFS_STATE_CHANGE_WAIT, &state->flags);
+ prepare_to_wait(&state->waitq, &wait, TASK_KILLABLE);
+ write_sequnlock(&state->seqlock);
+ trace_nfs4_close_stateid_update_wait(state->inode, dst, 0);
+
+ if (fatal_signal_pending(current))
+ status = -EINTR;
+ else
+ if (schedule_timeout(5*HZ) != 0)
+ status = 0;
+
+ finish_wait(&state->waitq, &wait);
+
+ if (!status)
+ continue;
+ if (status == -EINTR)
+ break;
+
+ /* we slept the whole 5 seconds, we must have lost a seqid */
+ dst->seqid = cpu_to_be32(dst_seqid + 1);
+ ret = true;
+ break;
+ }
+
+ return ret;
+}
+
+struct nfs4_closedata {
+ struct inode *inode;
+ struct nfs4_state *state;
+ struct nfs_closeargs arg;
+ struct nfs_closeres res;
+ struct {
+ struct nfs4_layoutreturn_args arg;
+ struct nfs4_layoutreturn_res res;
+ struct nfs4_xdr_opaque_data ld_private;
+ u32 roc_barrier;
+ bool roc;
+ } lr;
+ struct nfs_fattr fattr;
+ unsigned long timestamp;
+};
+
+static void nfs4_free_closedata(void *data)
+{
+ struct nfs4_closedata *calldata = data;
+ struct nfs4_state_owner *sp = calldata->state->owner;
+ struct super_block *sb = calldata->state->inode->i_sb;
+
+ if (calldata->lr.roc)
+ pnfs_roc_release(&calldata->lr.arg, &calldata->lr.res,
+ calldata->res.lr_ret);
+ nfs4_put_open_state(calldata->state);
+ nfs_free_seqid(calldata->arg.seqid);
+ nfs4_put_state_owner(sp);
+ nfs_sb_deactive(sb);
+ kfree(calldata);
+}
+
+static void nfs4_close_done(struct rpc_task *task, void *data)
+{
+ struct nfs4_closedata *calldata = data;
+ struct nfs4_state *state = calldata->state;
+ struct nfs_server *server = NFS_SERVER(calldata->inode);
+ nfs4_stateid *res_stateid = NULL;
+ struct nfs4_exception exception = {
+ .state = state,
+ .inode = calldata->inode,
+ .stateid = &calldata->arg.stateid,
+ };
+
+ if (!nfs4_sequence_done(task, &calldata->res.seq_res))
+ return;
+ trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status);
+
+ /* Handle Layoutreturn errors */
+ if (pnfs_roc_done(task, &calldata->arg.lr_args, &calldata->res.lr_res,
+ &calldata->res.lr_ret) == -EAGAIN)
+ goto out_restart;
+
+ /* hmm. we are done with the inode, and in the process of freeing
+ * the state_owner. we keep this around to process errors
+ */
+ switch (task->tk_status) {
+ case 0:
+ res_stateid = &calldata->res.stateid;
+ renew_lease(server, calldata->timestamp);
+ break;
+ case -NFS4ERR_ACCESS:
+ if (calldata->arg.bitmask != NULL) {
+ calldata->arg.bitmask = NULL;
+ calldata->res.fattr = NULL;
+ goto out_restart;
+
+ }
+ break;
+ case -NFS4ERR_OLD_STATEID:
+ /* Did we race with OPEN? */
+ if (nfs4_refresh_open_old_stateid(&calldata->arg.stateid,
+ state))
+ goto out_restart;
+ goto out_release;
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_EXPIRED:
+ nfs4_free_revoked_stateid(server,
+ &calldata->arg.stateid,
+ task->tk_msg.rpc_cred);
+ fallthrough;
+ case -NFS4ERR_BAD_STATEID:
+ if (calldata->arg.fmode == 0)
+ break;
+ fallthrough;
+ default:
+ task->tk_status = nfs4_async_handle_exception(task,
+ server, task->tk_status, &exception);
+ if (exception.retry)
+ goto out_restart;
+ }
+ nfs_clear_open_stateid(state, &calldata->arg.stateid,
+ res_stateid, calldata->arg.fmode);
+out_release:
+ task->tk_status = 0;
+ nfs_release_seqid(calldata->arg.seqid);
+ nfs_refresh_inode(calldata->inode, &calldata->fattr);
+ dprintk("%s: ret = %d\n", __func__, task->tk_status);
+ return;
+out_restart:
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
+ goto out_release;
+}
+
+static void nfs4_close_prepare(struct rpc_task *task, void *data)
+{
+ struct nfs4_closedata *calldata = data;
+ struct nfs4_state *state = calldata->state;
+ struct inode *inode = calldata->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct pnfs_layout_hdr *lo;
+ bool is_rdonly, is_wronly, is_rdwr;
+ int call_close = 0;
+
+ if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
+ goto out_wait;
+
+ task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
+ spin_lock(&state->owner->so_lock);
+ is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
+ is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
+ is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
+ /* Calculate the change in open mode */
+ calldata->arg.fmode = 0;
+ if (state->n_rdwr == 0) {
+ if (state->n_rdonly == 0)
+ call_close |= is_rdonly;
+ else if (is_rdonly)
+ calldata->arg.fmode |= FMODE_READ;
+ if (state->n_wronly == 0)
+ call_close |= is_wronly;
+ else if (is_wronly)
+ calldata->arg.fmode |= FMODE_WRITE;
+ if (calldata->arg.fmode != (FMODE_READ|FMODE_WRITE))
+ call_close |= is_rdwr;
+ } else if (is_rdwr)
+ calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
+
+ nfs4_sync_open_stateid(&calldata->arg.stateid, state);
+ if (!nfs4_valid_open_stateid(state))
+ call_close = 0;
+ spin_unlock(&state->owner->so_lock);
+
+ if (!call_close) {
+ /* Note: exit _without_ calling nfs4_close_done */
+ goto out_no_action;
+ }
+
+ if (!calldata->lr.roc && nfs4_wait_on_layoutreturn(inode, task)) {
+ nfs_release_seqid(calldata->arg.seqid);
+ goto out_wait;
+ }
+
+ lo = calldata->arg.lr_args ? calldata->arg.lr_args->layout : NULL;
+ if (lo && !pnfs_layout_is_valid(lo)) {
+ calldata->arg.lr_args = NULL;
+ calldata->res.lr_res = NULL;
+ }
+
+ if (calldata->arg.fmode == 0)
+ task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
+
+ if (calldata->arg.fmode == 0 || calldata->arg.fmode == FMODE_READ) {
+ /* Close-to-open cache consistency revalidation */
+ if (!nfs4_have_delegation(inode, FMODE_READ)) {
+ nfs4_bitmask_set(calldata->arg.bitmask_store,
+ server->cache_consistency_bitmask,
+ inode, 0);
+ calldata->arg.bitmask = calldata->arg.bitmask_store;
+ } else
+ calldata->arg.bitmask = NULL;
+ }
+
+ calldata->arg.share_access =
+ nfs4_map_atomic_open_share(NFS_SERVER(inode),
+ calldata->arg.fmode, 0);
+
+ if (calldata->res.fattr == NULL)
+ calldata->arg.bitmask = NULL;
+ else if (calldata->arg.bitmask == NULL)
+ calldata->res.fattr = NULL;
+ calldata->timestamp = jiffies;
+ if (nfs4_setup_sequence(NFS_SERVER(inode)->nfs_client,
+ &calldata->arg.seq_args,
+ &calldata->res.seq_res,
+ task) != 0)
+ nfs_release_seqid(calldata->arg.seqid);
+ return;
+out_no_action:
+ task->tk_action = NULL;
+out_wait:
+ nfs4_sequence_done(task, &calldata->res.seq_res);
+}
+
+static const struct rpc_call_ops nfs4_close_ops = {
+ .rpc_call_prepare = nfs4_close_prepare,
+ .rpc_call_done = nfs4_close_done,
+ .rpc_release = nfs4_free_closedata,
+};
+
+/*
+ * It is possible for data to be read/written from a mem-mapped file
+ * after the sys_close call (which hits the vfs layer as a flush).
+ * This means that we can't safely call nfsv4 close on a file until
+ * the inode is cleared. This in turn means that we are not good
+ * NFSv4 citizens - we do not indicate to the server to update the file's
+ * share state even when we are done with one of the three share
+ * stateid's in the inode.
+ *
+ * NOTE: Caller must be holding the sp->so_owner semaphore!
+ */
+int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+ struct nfs4_closedata *calldata;
+ struct nfs4_state_owner *sp = state->owner;
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE],
+ .rpc_cred = state->owner->so_cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = server->client,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_close_ops,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
+ };
+ int status = -ENOMEM;
+
+ if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
+ nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP,
+ &task_setup_data.rpc_client, &msg);
+
+ calldata = kzalloc(sizeof(*calldata), gfp_mask);
+ if (calldata == NULL)
+ goto out;
+ nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1, 0);
+ calldata->inode = state->inode;
+ calldata->state = state;
+ calldata->arg.fh = NFS_FH(state->inode);
+ if (!nfs4_copy_open_stateid(&calldata->arg.stateid, state))
+ goto out_free_calldata;
+ /* Serialization for the sequence id */
+ alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
+ calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask);
+ if (IS_ERR(calldata->arg.seqid))
+ goto out_free_calldata;
+ nfs_fattr_init(&calldata->fattr);
+ calldata->arg.fmode = 0;
+ calldata->lr.arg.ld_private = &calldata->lr.ld_private;
+ calldata->res.fattr = &calldata->fattr;
+ calldata->res.seqid = calldata->arg.seqid;
+ calldata->res.server = server;
+ calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ calldata->lr.roc = pnfs_roc(state->inode,
+ &calldata->lr.arg, &calldata->lr.res, msg.rpc_cred);
+ if (calldata->lr.roc) {
+ calldata->arg.lr_args = &calldata->lr.arg;
+ calldata->res.lr_res = &calldata->lr.res;
+ }
+ nfs_sb_active(calldata->inode->i_sb);
+
+ msg.rpc_argp = &calldata->arg;
+ msg.rpc_resp = &calldata->res;
+ task_setup_data.callback_data = calldata;
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ status = 0;
+ if (wait)
+ status = rpc_wait_for_completion_task(task);
+ rpc_put_task(task);
+ return status;
+out_free_calldata:
+ kfree(calldata);
+out:
+ nfs4_put_open_state(state);
+ nfs4_put_state_owner(sp);
+ return status;
+}
+
+static struct inode *
+nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx,
+ int open_flags, struct iattr *attr, int *opened)
+{
+ struct nfs4_state *state;
+ struct nfs4_label l, *label;
+
+ label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
+
+ /* Protect against concurrent sillydeletes */
+ state = nfs4_do_open(dir, ctx, open_flags, attr, label, opened);
+
+ nfs4_label_release_security(label);
+
+ if (IS_ERR(state))
+ return ERR_CAST(state);
+ return state->inode;
+}
+
+static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+ if (ctx->state == NULL)
+ return;
+ if (is_sync)
+ nfs4_close_sync(ctx->state, _nfs4_ctx_to_openmode(ctx));
+ else
+ nfs4_close_state(ctx->state, _nfs4_ctx_to_openmode(ctx));
+}
+
+#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL)
+#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL)
+#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_XATTR_SUPPORT - 1UL)
+
+static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
+{
+ u32 bitmask[3] = {}, minorversion = server->nfs_client->cl_minorversion;
+ struct nfs4_server_caps_arg args = {
+ .fhandle = fhandle,
+ .bitmask = bitmask,
+ };
+ struct nfs4_server_caps_res res = {};
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SERVER_CAPS],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ int status;
+ int i;
+
+ bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS |
+ FATTR4_WORD0_FH_EXPIRE_TYPE |
+ FATTR4_WORD0_LINK_SUPPORT |
+ FATTR4_WORD0_SYMLINK_SUPPORT |
+ FATTR4_WORD0_ACLSUPPORT |
+ FATTR4_WORD0_CASE_INSENSITIVE |
+ FATTR4_WORD0_CASE_PRESERVING;
+ if (minorversion)
+ bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+
+ status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+ if (status == 0) {
+ /* Sanity check the server answers */
+ switch (minorversion) {
+ case 0:
+ res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK;
+ res.attr_bitmask[2] = 0;
+ break;
+ case 1:
+ res.attr_bitmask[2] &= FATTR4_WORD2_NFS41_MASK;
+ break;
+ case 2:
+ res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK;
+ }
+ memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
+ server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS |
+ NFS_CAP_SYMLINKS| NFS_CAP_SECURITY_LABEL);
+ server->fattr_valid = NFS_ATTR_FATTR_V4;
+ if (res.attr_bitmask[0] & FATTR4_WORD0_ACL &&
+ res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
+ server->caps |= NFS_CAP_ACLS;
+ if (res.has_links != 0)
+ server->caps |= NFS_CAP_HARDLINKS;
+ if (res.has_symlinks != 0)
+ server->caps |= NFS_CAP_SYMLINKS;
+ if (res.case_insensitive)
+ server->caps |= NFS_CAP_CASE_INSENSITIVE;
+ if (res.case_preserving)
+ server->caps |= NFS_CAP_CASE_PRESERVING;
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
+ server->caps |= NFS_CAP_SECURITY_LABEL;
+#endif
+ if (res.attr_bitmask[0] & FATTR4_WORD0_FS_LOCATIONS)
+ server->caps |= NFS_CAP_FS_LOCATIONS;
+ if (!(res.attr_bitmask[0] & FATTR4_WORD0_FILEID))
+ server->fattr_valid &= ~NFS_ATTR_FATTR_FILEID;
+ if (!(res.attr_bitmask[1] & FATTR4_WORD1_MODE))
+ server->fattr_valid &= ~NFS_ATTR_FATTR_MODE;
+ if (!(res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS))
+ server->fattr_valid &= ~NFS_ATTR_FATTR_NLINK;
+ if (!(res.attr_bitmask[1] & FATTR4_WORD1_OWNER))
+ server->fattr_valid &= ~(NFS_ATTR_FATTR_OWNER |
+ NFS_ATTR_FATTR_OWNER_NAME);
+ if (!(res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP))
+ server->fattr_valid &= ~(NFS_ATTR_FATTR_GROUP |
+ NFS_ATTR_FATTR_GROUP_NAME);
+ if (!(res.attr_bitmask[1] & FATTR4_WORD1_SPACE_USED))
+ server->fattr_valid &= ~NFS_ATTR_FATTR_SPACE_USED;
+ if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS))
+ server->fattr_valid &= ~NFS_ATTR_FATTR_ATIME;
+ if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA))
+ server->fattr_valid &= ~NFS_ATTR_FATTR_CTIME;
+ if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY))
+ server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME;
+ memcpy(server->attr_bitmask_nl, res.attr_bitmask,
+ sizeof(server->attr_bitmask));
+ server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+
+ memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
+ server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
+ server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+ server->cache_consistency_bitmask[2] = 0;
+
+ /* Avoid a regression due to buggy server */
+ for (i = 0; i < ARRAY_SIZE(res.exclcreat_bitmask); i++)
+ res.exclcreat_bitmask[i] &= res.attr_bitmask[i];
+ memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask,
+ sizeof(server->exclcreat_bitmask));
+
+ server->acl_bitmask = res.acl_bitmask;
+ server->fh_expire_type = res.fh_expire_type;
+ }
+
+ return status;
+}
+
+int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+
+ nfs4_server_set_init_caps(server);
+ do {
+ err = nfs4_handle_exception(server,
+ _nfs4_server_capabilities(server, fhandle),
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+static void test_fs_location_for_trunking(struct nfs4_fs_location *location,
+ struct nfs_client *clp,
+ struct nfs_server *server)
+{
+ int i;
+
+ for (i = 0; i < location->nservers; i++) {
+ struct nfs4_string *srv_loc = &location->servers[i];
+ struct sockaddr_storage addr;
+ size_t addrlen;
+ struct xprt_create xprt_args = {
+ .ident = 0,
+ .net = clp->cl_net,
+ };
+ struct nfs4_add_xprt_data xprtdata = {
+ .clp = clp,
+ };
+ struct rpc_add_xprt_test rpcdata = {
+ .add_xprt_test = clp->cl_mvops->session_trunk,
+ .data = &xprtdata,
+ };
+ char *servername = NULL;
+
+ if (!srv_loc->len)
+ continue;
+
+ addrlen = nfs_parse_server_name(srv_loc->data, srv_loc->len,
+ &addr, sizeof(addr),
+ clp->cl_net, server->port);
+ if (!addrlen)
+ return;
+ xprt_args.dstaddr = (struct sockaddr *)&addr;
+ xprt_args.addrlen = addrlen;
+ servername = kmalloc(srv_loc->len + 1, GFP_KERNEL);
+ if (!servername)
+ return;
+ memcpy(servername, srv_loc->data, srv_loc->len);
+ servername[srv_loc->len] = '\0';
+ xprt_args.servername = servername;
+
+ xprtdata.cred = nfs4_get_clid_cred(clp);
+ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
+ rpc_clnt_setup_test_and_add_xprt,
+ &rpcdata);
+ if (xprtdata.cred)
+ put_cred(xprtdata.cred);
+ kfree(servername);
+ }
+}
+
+static int _nfs4_discover_trunking(struct nfs_server *server,
+ struct nfs_fh *fhandle)
+{
+ struct nfs4_fs_locations *locations = NULL;
+ struct page *page;
+ const struct cred *cred;
+ struct nfs_client *clp = server->nfs_client;
+ const struct nfs4_state_maintenance_ops *ops =
+ clp->cl_mvops->state_renewal_ops;
+ int status = -ENOMEM, i;
+
+ cred = ops->get_state_renewal_cred(clp);
+ if (cred == NULL) {
+ cred = nfs4_get_clid_cred(clp);
+ if (cred == NULL)
+ return -ENOKEY;
+ }
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ goto out_put_cred;
+ locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+ if (!locations)
+ goto out_free;
+ locations->fattr = nfs_alloc_fattr();
+ if (!locations->fattr)
+ goto out_free_2;
+
+ status = nfs4_proc_get_locations(server, fhandle, locations, page,
+ cred);
+ if (status)
+ goto out_free_3;
+
+ for (i = 0; i < locations->nlocations; i++)
+ test_fs_location_for_trunking(&locations->locations[i], clp,
+ server);
+out_free_3:
+ kfree(locations->fattr);
+out_free_2:
+ kfree(locations);
+out_free:
+ __free_page(page);
+out_put_cred:
+ put_cred(cred);
+ return status;
+}
+
+static int nfs4_discover_trunking(struct nfs_server *server,
+ struct nfs_fh *fhandle)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ struct nfs_client *clp = server->nfs_client;
+ int err = 0;
+
+ if (!nfs4_has_session(clp))
+ goto out;
+ do {
+ err = nfs4_handle_exception(server,
+ _nfs4_discover_trunking(server, fhandle),
+ &exception);
+ } while (exception.retry);
+out:
+ return err;
+}
+
+static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info)
+{
+ u32 bitmask[3];
+ struct nfs4_lookup_root_arg args = {
+ .bitmask = bitmask,
+ };
+ struct nfs4_lookup_res res = {
+ .server = server,
+ .fattr = info->fattr,
+ .fh = fhandle,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP_ROOT],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+
+ bitmask[0] = nfs4_fattr_bitmap[0];
+ bitmask[1] = nfs4_fattr_bitmap[1];
+ /*
+ * Process the label in the upcoming getfattr
+ */
+ bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL;
+
+ nfs_fattr_init(info->fattr);
+ return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+}
+
+static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+ do {
+ err = _nfs4_lookup_root(server, fhandle, info);
+ trace_nfs4_lookup_root(server, fhandle, info->fattr, err);
+ switch (err) {
+ case 0:
+ case -NFS4ERR_WRONGSEC:
+ goto out;
+ default:
+ err = nfs4_handle_exception(server, err, &exception);
+ }
+ } while (exception.retry);
+out:
+ return err;
+}
+
+static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info, rpc_authflavor_t flavor)
+{
+ struct rpc_auth_create_args auth_args = {
+ .pseudoflavor = flavor,
+ };
+ struct rpc_auth *auth;
+
+ auth = rpcauth_create(&auth_args, server->client);
+ if (IS_ERR(auth))
+ return -EACCES;
+ return nfs4_lookup_root(server, fhandle, info);
+}
+
+/*
+ * Retry pseudoroot lookup with various security flavors. We do this when:
+ *
+ * NFSv4.0: the PUTROOTFH operation returns NFS4ERR_WRONGSEC
+ * NFSv4.1: the server does not support the SECINFO_NO_NAME operation
+ *
+ * Returns zero on success, or a negative NFS4ERR value, or a
+ * negative errno value.
+ */
+static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info)
+{
+ /* Per 3530bis 15.33.5 */
+ static const rpc_authflavor_t flav_array[] = {
+ RPC_AUTH_GSS_KRB5P,
+ RPC_AUTH_GSS_KRB5I,
+ RPC_AUTH_GSS_KRB5,
+ RPC_AUTH_UNIX, /* courtesy */
+ RPC_AUTH_NULL,
+ };
+ int status = -EPERM;
+ size_t i;
+
+ if (server->auth_info.flavor_len > 0) {
+ /* try each flavor specified by user */
+ for (i = 0; i < server->auth_info.flavor_len; i++) {
+ status = nfs4_lookup_root_sec(server, fhandle, info,
+ server->auth_info.flavors[i]);
+ if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
+ continue;
+ break;
+ }
+ } else {
+ /* no flavors specified by user, try default list */
+ for (i = 0; i < ARRAY_SIZE(flav_array); i++) {
+ status = nfs4_lookup_root_sec(server, fhandle, info,
+ flav_array[i]);
+ if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
+ continue;
+ break;
+ }
+ }
+
+ /*
+ * -EACCES could mean that the user doesn't have correct permissions
+ * to access the mount. It could also mean that we tried to mount
+ * with a gss auth flavor, but rpc.gssd isn't running. Either way,
+ * existing mount programs don't handle -EACCES very well so it should
+ * be mapped to -EPERM instead.
+ */
+ if (status == -EACCES)
+ status = -EPERM;
+ return status;
+}
+
+/**
+ * nfs4_proc_get_rootfh - get file handle for server's pseudoroot
+ * @server: initialized nfs_server handle
+ * @fhandle: we fill in the pseudo-fs root file handle
+ * @info: we fill in an FSINFO struct
+ * @auth_probe: probe the auth flavours
+ *
+ * Returns zero on success, or a negative errno.
+ */
+int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info,
+ bool auth_probe)
+{
+ int status = 0;
+
+ if (!auth_probe)
+ status = nfs4_lookup_root(server, fhandle, info);
+
+ if (auth_probe || status == NFS4ERR_WRONGSEC)
+ status = server->nfs_client->cl_mvops->find_root_sec(server,
+ fhandle, info);
+
+ if (status == 0)
+ status = nfs4_server_capabilities(server, fhandle);
+ if (status == 0)
+ status = nfs4_do_fsinfo(server, fhandle, info);
+
+ return nfs4_map_errors(status);
+}
+
+static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
+ struct nfs_fsinfo *info)
+{
+ int error;
+ struct nfs_fattr *fattr = info->fattr;
+
+ error = nfs4_server_capabilities(server, mntfh);
+ if (error < 0) {
+ dprintk("nfs4_get_root: getcaps error = %d\n", -error);
+ return error;
+ }
+
+ error = nfs4_proc_getattr(server, mntfh, fattr, NULL);
+ if (error < 0) {
+ dprintk("nfs4_get_root: getattr error = %d\n", -error);
+ goto out;
+ }
+
+ if (fattr->valid & NFS_ATTR_FATTR_FSID &&
+ !nfs_fsid_equal(&server->fsid, &fattr->fsid))
+ memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
+
+out:
+ return error;
+}
+
+/*
+ * Get locations and (maybe) other attributes of a referral.
+ * Note that we'll actually follow the referral later when
+ * we detect fsid mismatch in inode revalidation
+ */
+static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir,
+ const struct qstr *name, struct nfs_fattr *fattr,
+ struct nfs_fh *fhandle)
+{
+ int status = -ENOMEM;
+ struct page *page = NULL;
+ struct nfs4_fs_locations *locations = NULL;
+
+ page = alloc_page(GFP_KERNEL);
+ if (page == NULL)
+ goto out;
+ locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+ if (locations == NULL)
+ goto out;
+
+ locations->fattr = fattr;
+
+ status = nfs4_proc_fs_locations(client, dir, name, locations, page);
+ if (status != 0)
+ goto out;
+
+ /*
+ * If the fsid didn't change, this is a migration event, not a
+ * referral. Cause us to drop into the exception handler, which
+ * will kick off migration recovery.
+ */
+ if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &fattr->fsid)) {
+ dprintk("%s: server did not return a different fsid for"
+ " a referral at %s\n", __func__, name->name);
+ status = -NFS4ERR_MOVED;
+ goto out;
+ }
+ /* Fixup attributes for the nfs_lookup() call to nfs_fhget() */
+ nfs_fixup_referral_attributes(fattr);
+ memset(fhandle, 0, sizeof(struct nfs_fh));
+out:
+ if (page)
+ __free_page(page);
+ kfree(locations);
+ return status;
+}
+
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct inode *inode)
+{
+ __u32 bitmask[NFS4_BITMASK_SZ];
+ struct nfs4_getattr_arg args = {
+ .fh = fhandle,
+ .bitmask = bitmask,
+ };
+ struct nfs4_getattr_res res = {
+ .fattr = fattr,
+ .server = server,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ unsigned short task_flags = 0;
+
+ if (nfs4_has_session(server->nfs_client))
+ task_flags = RPC_TASK_MOVEABLE;
+
+ /* Is this is an attribute revalidation, subject to softreval? */
+ if (inode && (server->flags & NFS_MOUNT_SOFTREVAL))
+ task_flags |= RPC_TASK_TIMEOUT;
+
+ nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, fattr->label), inode, 0);
+ nfs_fattr_init(fattr);
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
+ return nfs4_do_call_sync(server->client, server, &msg,
+ &args.seq_args, &res.seq_res, task_flags);
+}
+
+int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct inode *inode)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+ do {
+ err = _nfs4_proc_getattr(server, fhandle, fattr, inode);
+ trace_nfs4_getattr(server, fhandle, fattr, err);
+ err = nfs4_handle_exception(server, err,
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+/*
+ * The file is not closed if it is opened due to the a request to change
+ * the size of the file. The open call will not be needed once the
+ * VFS layer lookup-intents are implemented.
+ *
+ * Close is called when the inode is destroyed.
+ * If we haven't opened the file for O_WRONLY, we
+ * need to in the size_change case to obtain a stateid.
+ *
+ * Got race?
+ * Because OPEN is always done by name in nfsv4, it is
+ * possible that we opened a different file by the same
+ * name. We can recognize this race condition, but we
+ * can't do anything about it besides returning an error.
+ *
+ * This will be fixed with VFS changes (lookup-intent).
+ */
+static int
+nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
+ struct iattr *sattr)
+{
+ struct inode *inode = d_inode(dentry);
+ const struct cred *cred = NULL;
+ struct nfs_open_context *ctx = NULL;
+ int status;
+
+ if (pnfs_ld_layoutret_on_setattr(inode) &&
+ sattr->ia_valid & ATTR_SIZE &&
+ sattr->ia_size < i_size_read(inode))
+ pnfs_commit_and_return_layout(inode);
+
+ nfs_fattr_init(fattr);
+
+ /* Deal with open(O_TRUNC) */
+ if (sattr->ia_valid & ATTR_OPEN)
+ sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME);
+
+ /* Optimization: if the end result is no change, don't RPC */
+ if ((sattr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)
+ return 0;
+
+ /* Search for an existing open(O_WRITE) file */
+ if (sattr->ia_valid & ATTR_FILE) {
+
+ ctx = nfs_file_open_context(sattr->ia_file);
+ if (ctx)
+ cred = ctx->cred;
+ }
+
+ /* Return any delegations if we're going to change ACLs */
+ if ((sattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
+ nfs4_inode_make_writeable(inode);
+
+ status = nfs4_do_setattr(inode, cred, fattr, sattr, ctx, NULL);
+ if (status == 0) {
+ nfs_setattr_update_inode(inode, sattr, fattr);
+ nfs_setsecurity(inode, fattr);
+ }
+ return status;
+}
+
+static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
+ struct dentry *dentry, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr)
+{
+ struct nfs_server *server = NFS_SERVER(dir);
+ int status;
+ struct nfs4_lookup_arg args = {
+ .bitmask = server->attr_bitmask,
+ .dir_fh = NFS_FH(dir),
+ .name = &dentry->d_name,
+ };
+ struct nfs4_lookup_res res = {
+ .server = server,
+ .fattr = fattr,
+ .fh = fhandle,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ unsigned short task_flags = 0;
+
+ if (nfs_server_capable(dir, NFS_CAP_MOVEABLE))
+ task_flags = RPC_TASK_MOVEABLE;
+
+ /* Is this is an attribute revalidation, subject to softreval? */
+ if (nfs_lookup_is_soft_revalidate(dentry))
+ task_flags |= RPC_TASK_TIMEOUT;
+
+ args.bitmask = nfs4_bitmask(server, fattr->label);
+
+ nfs_fattr_init(fattr);
+
+ dprintk("NFS call lookup %pd2\n", dentry);
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
+ status = nfs4_do_call_sync(clnt, server, &msg,
+ &args.seq_args, &res.seq_res, task_flags);
+ dprintk("NFS reply lookup: %d\n", status);
+ return status;
+}
+
+static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr)
+{
+ fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
+ NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_MOUNTPOINT;
+ fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
+ fattr->nlink = 2;
+}
+
+static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
+ struct dentry *dentry, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ struct rpc_clnt *client = *clnt;
+ const struct qstr *name = &dentry->d_name;
+ int err;
+ do {
+ err = _nfs4_proc_lookup(client, dir, dentry, fhandle, fattr);
+ trace_nfs4_lookup(dir, name, err);
+ switch (err) {
+ case -NFS4ERR_BADNAME:
+ err = -ENOENT;
+ goto out;
+ case -NFS4ERR_MOVED:
+ err = nfs4_get_referral(client, dir, name, fattr, fhandle);
+ if (err == -NFS4ERR_MOVED)
+ err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception);
+ goto out;
+ case -NFS4ERR_WRONGSEC:
+ err = -EPERM;
+ if (client != *clnt)
+ goto out;
+ client = nfs4_negotiate_security(client, dir, name);
+ if (IS_ERR(client))
+ return PTR_ERR(client);
+
+ exception.retry = 1;
+ break;
+ default:
+ err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception);
+ }
+ } while (exception.retry);
+
+out:
+ if (err == 0)
+ *clnt = client;
+ else if (client != *clnt)
+ rpc_shutdown_client(client);
+
+ return err;
+}
+
+static int nfs4_proc_lookup(struct inode *dir, struct dentry *dentry,
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+{
+ int status;
+ struct rpc_clnt *client = NFS_CLIENT(dir);
+
+ status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr);
+ if (client != NFS_CLIENT(dir)) {
+ rpc_shutdown_client(client);
+ nfs_fixup_secinfo_attributes(fattr);
+ }
+ return status;
+}
+
+struct rpc_clnt *
+nfs4_proc_lookup_mountpoint(struct inode *dir, struct dentry *dentry,
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+{
+ struct rpc_clnt *client = NFS_CLIENT(dir);
+ int status;
+
+ status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr);
+ if (status < 0)
+ return ERR_PTR(status);
+ return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client;
+}
+
+static int _nfs4_proc_lookupp(struct inode *inode,
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+{
+ struct rpc_clnt *clnt = NFS_CLIENT(inode);
+ struct nfs_server *server = NFS_SERVER(inode);
+ int status;
+ struct nfs4_lookupp_arg args = {
+ .bitmask = server->attr_bitmask,
+ .fh = NFS_FH(inode),
+ };
+ struct nfs4_lookupp_res res = {
+ .server = server,
+ .fattr = fattr,
+ .fh = fhandle,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUPP],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ unsigned short task_flags = 0;
+
+ if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)
+ task_flags |= RPC_TASK_TIMEOUT;
+
+ args.bitmask = nfs4_bitmask(server, fattr->label);
+
+ nfs_fattr_init(fattr);
+
+ dprintk("NFS call lookupp ino=0x%lx\n", inode->i_ino);
+ status = nfs4_call_sync(clnt, server, &msg, &args.seq_args,
+ &res.seq_res, task_flags);
+ dprintk("NFS reply lookupp: %d\n", status);
+ return status;
+}
+
+static int nfs4_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+ do {
+ err = _nfs4_proc_lookupp(inode, fhandle, fattr);
+ trace_nfs4_lookupp(inode, err);
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry,
+ const struct cred *cred)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs4_accessargs args = {
+ .fh = NFS_FH(inode),
+ .access = entry->mask,
+ };
+ struct nfs4_accessres res = {
+ .server = server,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
+ };
+ int status = 0;
+
+ if (!nfs4_have_delegation(inode, FMODE_READ)) {
+ res.fattr = nfs_alloc_fattr();
+ if (res.fattr == NULL)
+ return -ENOMEM;
+ args.bitmask = server->cache_consistency_bitmask;
+ }
+ status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+ if (!status) {
+ nfs_access_set_mask(entry, res.access);
+ if (res.fattr)
+ nfs_refresh_inode(inode, res.fattr);
+ }
+ nfs_free_fattr(res.fattr);
+ return status;
+}
+
+static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry,
+ const struct cred *cred)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+ do {
+ err = _nfs4_proc_access(inode, entry, cred);
+ trace_nfs4_access(inode, err);
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+/*
+ * TODO: For the time being, we don't try to get any attributes
+ * along with any of the zero-copy operations READ, READDIR,
+ * READLINK, WRITE.
+ *
+ * In the case of the first three, we want to put the GETATTR
+ * after the read-type operation -- this is because it is hard
+ * to predict the length of a GETATTR response in v4, and thus
+ * align the READ data correctly. This means that the GETATTR
+ * may end up partially falling into the page cache, and we should
+ * shift it into the 'tail' of the xdr_buf before processing.
+ * To do this efficiently, we need to know the total length
+ * of data received, which doesn't seem to be available outside
+ * of the RPC layer.
+ *
+ * In the case of WRITE, we also want to put the GETATTR after
+ * the operation -- in this case because we want to make sure
+ * we get the post-operation mtime and size.
+ *
+ * Both of these changes to the XDR layer would in fact be quite
+ * minor, but I decided to leave them for a subsequent patch.
+ */
+static int _nfs4_proc_readlink(struct inode *inode, struct page *page,
+ unsigned int pgbase, unsigned int pglen)
+{
+ struct nfs4_readlink args = {
+ .fh = NFS_FH(inode),
+ .pgbase = pgbase,
+ .pglen = pglen,
+ .pages = &page,
+ };
+ struct nfs4_readlink_res res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READLINK],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+
+ return nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0);
+}
+
+static int nfs4_proc_readlink(struct inode *inode, struct page *page,
+ unsigned int pgbase, unsigned int pglen)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+ do {
+ err = _nfs4_proc_readlink(inode, page, pgbase, pglen);
+ trace_nfs4_readlink(inode, err);
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+/*
+ * This is just for mknod. open(O_CREAT) will always do ->open_context().
+ */
+static int
+nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+ int flags)
+{
+ struct nfs_server *server = NFS_SERVER(dir);
+ struct nfs4_label l, *ilabel;
+ struct nfs_open_context *ctx;
+ struct nfs4_state *state;
+ int status = 0;
+
+ ctx = alloc_nfs_open_context(dentry, FMODE_READ, NULL);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
+
+ if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
+ sattr->ia_mode &= ~current_umask();
+ state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL);
+ if (IS_ERR(state)) {
+ status = PTR_ERR(state);
+ goto out;
+ }
+out:
+ nfs4_label_release_security(ilabel);
+ put_nfs_open_context(ctx);
+ return status;
+}
+
+static int
+_nfs4_proc_remove(struct inode *dir, const struct qstr *name, u32 ftype)
+{
+ struct nfs_server *server = NFS_SERVER(dir);
+ struct nfs_removeargs args = {
+ .fh = NFS_FH(dir),
+ .name = *name,
+ };
+ struct nfs_removeres res = {
+ .server = server,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ unsigned long timestamp = jiffies;
+ int status;
+
+ status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
+ if (status == 0) {
+ spin_lock(&dir->i_lock);
+ /* Removing a directory decrements nlink in the parent */
+ if (ftype == NF4DIR && dir->i_nlink > 2)
+ nfs4_dec_nlink_locked(dir);
+ nfs4_update_changeattr_locked(dir, &res.cinfo, timestamp,
+ NFS_INO_INVALID_DATA);
+ spin_unlock(&dir->i_lock);
+ }
+ return status;
+}
+
+static int nfs4_proc_remove(struct inode *dir, struct dentry *dentry)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ struct inode *inode = d_inode(dentry);
+ int err;
+
+ if (inode) {
+ if (inode->i_nlink == 1)
+ nfs4_inode_return_delegation(inode);
+ else
+ nfs4_inode_make_writeable(inode);
+ }
+ do {
+ err = _nfs4_proc_remove(dir, &dentry->d_name, NF4REG);
+ trace_nfs4_remove(dir, &dentry->d_name, err);
+ err = nfs4_handle_exception(NFS_SERVER(dir), err,
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+static int nfs4_proc_rmdir(struct inode *dir, const struct qstr *name)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+
+ do {
+ err = _nfs4_proc_remove(dir, name, NF4DIR);
+ trace_nfs4_remove(dir, name, err);
+ err = nfs4_handle_exception(NFS_SERVER(dir), err,
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+static void nfs4_proc_unlink_setup(struct rpc_message *msg,
+ struct dentry *dentry,
+ struct inode *inode)
+{
+ struct nfs_removeargs *args = msg->rpc_argp;
+ struct nfs_removeres *res = msg->rpc_resp;
+
+ res->server = NFS_SB(dentry->d_sb);
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
+ nfs4_init_sequence(&args->seq_args, &res->seq_res, 1, 0);
+
+ nfs_fattr_init(res->dir_attr);
+
+ if (inode) {
+ nfs4_inode_return_delegation(inode);
+ nfs_d_prune_case_insensitive_aliases(inode);
+ }
+}
+
+static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
+{
+ nfs4_setup_sequence(NFS_SB(data->dentry->d_sb)->nfs_client,
+ &data->args.seq_args,
+ &data->res.seq_res,
+ task);
+}
+
+static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
+{
+ struct nfs_unlinkdata *data = task->tk_calldata;
+ struct nfs_removeres *res = &data->res;
+
+ if (!nfs4_sequence_done(task, &res->seq_res))
+ return 0;
+ if (nfs4_async_handle_error(task, res->server, NULL,
+ &data->timeout) == -EAGAIN)
+ return 0;
+ if (task->tk_status == 0)
+ nfs4_update_changeattr(dir, &res->cinfo,
+ res->dir_attr->time_start,
+ NFS_INO_INVALID_DATA);
+ return 1;
+}
+
+static void nfs4_proc_rename_setup(struct rpc_message *msg,
+ struct dentry *old_dentry,
+ struct dentry *new_dentry)
+{
+ struct nfs_renameargs *arg = msg->rpc_argp;
+ struct nfs_renameres *res = msg->rpc_resp;
+ struct inode *old_inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
+
+ if (old_inode)
+ nfs4_inode_make_writeable(old_inode);
+ if (new_inode)
+ nfs4_inode_return_delegation(new_inode);
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
+ res->server = NFS_SB(old_dentry->d_sb);
+ nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1, 0);
+}
+
+static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
+{
+ nfs4_setup_sequence(NFS_SERVER(data->old_dir)->nfs_client,
+ &data->args.seq_args,
+ &data->res.seq_res,
+ task);
+}
+
+static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+ struct inode *new_dir)
+{
+ struct nfs_renamedata *data = task->tk_calldata;
+ struct nfs_renameres *res = &data->res;
+
+ if (!nfs4_sequence_done(task, &res->seq_res))
+ return 0;
+ if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN)
+ return 0;
+
+ if (task->tk_status == 0) {
+ nfs_d_prune_case_insensitive_aliases(d_inode(data->old_dentry));
+ if (new_dir != old_dir) {
+ /* Note: If we moved a directory, nlink will change */
+ nfs4_update_changeattr(old_dir, &res->old_cinfo,
+ res->old_fattr->time_start,
+ NFS_INO_INVALID_NLINK |
+ NFS_INO_INVALID_DATA);
+ nfs4_update_changeattr(new_dir, &res->new_cinfo,
+ res->new_fattr->time_start,
+ NFS_INO_INVALID_NLINK |
+ NFS_INO_INVALID_DATA);
+ } else
+ nfs4_update_changeattr(old_dir, &res->old_cinfo,
+ res->old_fattr->time_start,
+ NFS_INO_INVALID_DATA);
+ }
+ return 1;
+}
+
+static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ __u32 bitmask[NFS4_BITMASK_SZ];
+ struct nfs4_link_arg arg = {
+ .fh = NFS_FH(inode),
+ .dir_fh = NFS_FH(dir),
+ .name = name,
+ .bitmask = bitmask,
+ };
+ struct nfs4_link_res res = {
+ .server = server,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ int status = -ENOMEM;
+
+ res.fattr = nfs_alloc_fattr_with_label(server);
+ if (res.fattr == NULL)
+ goto out;
+
+ nfs4_inode_make_writeable(inode);
+ nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, res.fattr->label), inode,
+ NFS_INO_INVALID_CHANGE);
+ status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+ if (!status) {
+ nfs4_update_changeattr(dir, &res.cinfo, res.fattr->time_start,
+ NFS_INO_INVALID_DATA);
+ nfs4_inc_nlink(inode);
+ status = nfs_post_op_update_inode(inode, res.fattr);
+ if (!status)
+ nfs_setsecurity(inode, res.fattr);
+ }
+
+out:
+ nfs_free_fattr(res.fattr);
+ return status;
+}
+
+static int nfs4_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+ do {
+ err = nfs4_handle_exception(NFS_SERVER(inode),
+ _nfs4_proc_link(inode, dir, name),
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+struct nfs4_createdata {
+ struct rpc_message msg;
+ struct nfs4_create_arg arg;
+ struct nfs4_create_res res;
+ struct nfs_fh fh;
+ struct nfs_fattr fattr;
+};
+
+static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
+ const struct qstr *name, struct iattr *sattr, u32 ftype)
+{
+ struct nfs4_createdata *data;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (data != NULL) {
+ struct nfs_server *server = NFS_SERVER(dir);
+
+ data->fattr.label = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(data->fattr.label))
+ goto out_free;
+
+ data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
+ data->msg.rpc_argp = &data->arg;
+ data->msg.rpc_resp = &data->res;
+ data->arg.dir_fh = NFS_FH(dir);
+ data->arg.server = server;
+ data->arg.name = name;
+ data->arg.attrs = sattr;
+ data->arg.ftype = ftype;
+ data->arg.bitmask = nfs4_bitmask(server, data->fattr.label);
+ data->arg.umask = current_umask();
+ data->res.server = server;
+ data->res.fh = &data->fh;
+ data->res.fattr = &data->fattr;
+ nfs_fattr_init(data->res.fattr);
+ }
+ return data;
+out_free:
+ kfree(data);
+ return NULL;
+}
+
+static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
+{
+ int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg,
+ &data->arg.seq_args, &data->res.seq_res, 1);
+ if (status == 0) {
+ spin_lock(&dir->i_lock);
+ /* Creating a directory bumps nlink in the parent */
+ if (data->arg.ftype == NF4DIR)
+ nfs4_inc_nlink_locked(dir);
+ nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo,
+ data->res.fattr->time_start,
+ NFS_INO_INVALID_DATA);
+ spin_unlock(&dir->i_lock);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ }
+ return status;
+}
+
+static void nfs4_free_createdata(struct nfs4_createdata *data)
+{
+ nfs4_label_free(data->fattr.label);
+ kfree(data);
+}
+
+static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
+ struct page *page, unsigned int len, struct iattr *sattr,
+ struct nfs4_label *label)
+{
+ struct nfs4_createdata *data;
+ int status = -ENAMETOOLONG;
+
+ if (len > NFS4_MAXPATHLEN)
+ goto out;
+
+ status = -ENOMEM;
+ data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4LNK);
+ if (data == NULL)
+ goto out;
+
+ data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
+ data->arg.u.symlink.pages = &page;
+ data->arg.u.symlink.len = len;
+ data->arg.label = label;
+
+ status = nfs4_do_create(dir, dentry, data);
+
+ nfs4_free_createdata(data);
+out:
+ return status;
+}
+
+static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
+ struct page *page, unsigned int len, struct iattr *sattr)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ struct nfs4_label l, *label;
+ int err;
+
+ label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
+ do {
+ err = _nfs4_proc_symlink(dir, dentry, page, len, sattr, label);
+ trace_nfs4_symlink(dir, &dentry->d_name, err);
+ err = nfs4_handle_exception(NFS_SERVER(dir), err,
+ &exception);
+ } while (exception.retry);
+
+ nfs4_label_release_security(label);
+ return err;
+}
+
+static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr, struct nfs4_label *label)
+{
+ struct nfs4_createdata *data;
+ int status = -ENOMEM;
+
+ data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR);
+ if (data == NULL)
+ goto out;
+
+ data->arg.label = label;
+ status = nfs4_do_create(dir, dentry, data);
+
+ nfs4_free_createdata(data);
+out:
+ return status;
+}
+
+static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr)
+{
+ struct nfs_server *server = NFS_SERVER(dir);
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ struct nfs4_label l, *label;
+ int err;
+
+ label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
+ if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
+ sattr->ia_mode &= ~current_umask();
+ do {
+ err = _nfs4_proc_mkdir(dir, dentry, sattr, label);
+ trace_nfs4_mkdir(dir, &dentry->d_name, err);
+ err = nfs4_handle_exception(NFS_SERVER(dir), err,
+ &exception);
+ } while (exception.retry);
+ nfs4_label_release_security(label);
+
+ return err;
+}
+
+static int _nfs4_proc_readdir(struct nfs_readdir_arg *nr_arg,
+ struct nfs_readdir_res *nr_res)
+{
+ struct inode *dir = d_inode(nr_arg->dentry);
+ struct nfs_server *server = NFS_SERVER(dir);
+ struct nfs4_readdir_arg args = {
+ .fh = NFS_FH(dir),
+ .pages = nr_arg->pages,
+ .pgbase = 0,
+ .count = nr_arg->page_len,
+ .plus = nr_arg->plus,
+ };
+ struct nfs4_readdir_res res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READDIR],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ .rpc_cred = nr_arg->cred,
+ };
+ int status;
+
+ dprintk("%s: dentry = %pd2, cookie = %llu\n", __func__,
+ nr_arg->dentry, (unsigned long long)nr_arg->cookie);
+ if (!(server->caps & NFS_CAP_SECURITY_LABEL))
+ args.bitmask = server->attr_bitmask_nl;
+ else
+ args.bitmask = server->attr_bitmask;
+
+ nfs4_setup_readdir(nr_arg->cookie, nr_arg->verf, nr_arg->dentry, &args);
+ res.pgbase = args.pgbase;
+ status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
+ &res.seq_res, 0);
+ if (status >= 0) {
+ memcpy(nr_res->verf, res.verifier.data, NFS4_VERIFIER_SIZE);
+ status += args.pgbase;
+ }
+
+ nfs_invalidate_atime(dir);
+
+ dprintk("%s: returns %d\n", __func__, status);
+ return status;
+}
+
+static int nfs4_proc_readdir(struct nfs_readdir_arg *arg,
+ struct nfs_readdir_res *res)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+ do {
+ err = _nfs4_proc_readdir(arg, res);
+ trace_nfs4_readdir(d_inode(arg->dentry), err);
+ err = nfs4_handle_exception(NFS_SERVER(d_inode(arg->dentry)),
+ err, &exception);
+ } while (exception.retry);
+ return err;
+}
+
+static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr, struct nfs4_label *label, dev_t rdev)
+{
+ struct nfs4_createdata *data;
+ int mode = sattr->ia_mode;
+ int status = -ENOMEM;
+
+ data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK);
+ if (data == NULL)
+ goto out;
+
+ if (S_ISFIFO(mode))
+ data->arg.ftype = NF4FIFO;
+ else if (S_ISBLK(mode)) {
+ data->arg.ftype = NF4BLK;
+ data->arg.u.device.specdata1 = MAJOR(rdev);
+ data->arg.u.device.specdata2 = MINOR(rdev);
+ }
+ else if (S_ISCHR(mode)) {
+ data->arg.ftype = NF4CHR;
+ data->arg.u.device.specdata1 = MAJOR(rdev);
+ data->arg.u.device.specdata2 = MINOR(rdev);
+ } else if (!S_ISSOCK(mode)) {
+ status = -EINVAL;
+ goto out_free;
+ }
+
+ data->arg.label = label;
+ status = nfs4_do_create(dir, dentry, data);
+out_free:
+ nfs4_free_createdata(data);
+out:
+ return status;
+}
+
+static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr, dev_t rdev)
+{
+ struct nfs_server *server = NFS_SERVER(dir);
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ struct nfs4_label l, *label;
+ int err;
+
+ label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
+ if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
+ sattr->ia_mode &= ~current_umask();
+ do {
+ err = _nfs4_proc_mknod(dir, dentry, sattr, label, rdev);
+ trace_nfs4_mknod(dir, &dentry->d_name, err);
+ err = nfs4_handle_exception(NFS_SERVER(dir), err,
+ &exception);
+ } while (exception.retry);
+
+ nfs4_label_release_security(label);
+
+ return err;
+}
+
+static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsstat *fsstat)
+{
+ struct nfs4_statfs_arg args = {
+ .fh = fhandle,
+ .bitmask = server->attr_bitmask,
+ };
+ struct nfs4_statfs_res res = {
+ .fsstat = fsstat,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_STATFS],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+
+ nfs_fattr_init(fsstat->fattr);
+ return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+}
+
+static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+ do {
+ err = nfs4_handle_exception(server,
+ _nfs4_proc_statfs(server, fhandle, fsstat),
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *fsinfo)
+{
+ struct nfs4_fsinfo_arg args = {
+ .fh = fhandle,
+ .bitmask = server->attr_bitmask,
+ };
+ struct nfs4_fsinfo_res res = {
+ .fsinfo = fsinfo,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSINFO],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+
+ return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+}
+
+static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+
+ do {
+ err = _nfs4_do_fsinfo(server, fhandle, fsinfo);
+ trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err);
+ if (err == 0) {
+ nfs4_set_lease_period(server->nfs_client, fsinfo->lease_time * HZ);
+ break;
+ }
+ err = nfs4_handle_exception(server, err, &exception);
+ } while (exception.retry);
+ return err;
+}
+
+static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
+{
+ int error;
+
+ nfs_fattr_init(fsinfo->fattr);
+ error = nfs4_do_fsinfo(server, fhandle, fsinfo);
+ if (error == 0) {
+ /* block layout checks this! */
+ server->pnfs_blksize = fsinfo->blksize;
+ set_pnfs_layoutdriver(server, fhandle, fsinfo);
+ }
+
+ return error;
+}
+
+static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_pathconf *pathconf)
+{
+ struct nfs4_pathconf_arg args = {
+ .fh = fhandle,
+ .bitmask = server->attr_bitmask,
+ };
+ struct nfs4_pathconf_res res = {
+ .pathconf = pathconf,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PATHCONF],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+
+ /* None of the pathconf attributes are mandatory to implement */
+ if ((args.bitmask[0] & nfs4_pathconf_bitmap[0]) == 0) {
+ memset(pathconf, 0, sizeof(*pathconf));
+ return 0;
+ }
+
+ nfs_fattr_init(pathconf->fattr);
+ return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+}
+
+static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_pathconf *pathconf)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+
+ do {
+ err = nfs4_handle_exception(server,
+ _nfs4_proc_pathconf(server, fhandle, pathconf),
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+int nfs4_set_rw_stateid(nfs4_stateid *stateid,
+ const struct nfs_open_context *ctx,
+ const struct nfs_lock_context *l_ctx,
+ fmode_t fmode)
+{
+ return nfs4_select_rw_stateid(ctx->state, fmode, l_ctx, stateid, NULL);
+}
+EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid);
+
+static bool nfs4_stateid_is_current(nfs4_stateid *stateid,
+ const struct nfs_open_context *ctx,
+ const struct nfs_lock_context *l_ctx,
+ fmode_t fmode)
+{
+ nfs4_stateid _current_stateid;
+
+ /* If the current stateid represents a lost lock, then exit */
+ if (nfs4_set_rw_stateid(&_current_stateid, ctx, l_ctx, fmode) == -EIO)
+ return true;
+ return nfs4_stateid_match(stateid, &_current_stateid);
+}
+
+static bool nfs4_error_stateid_expired(int err)
+{
+ switch (err) {
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ case -NFS4ERR_OPENMODE:
+ case -NFS4ERR_EXPIRED:
+ return true;
+ }
+ return false;
+}
+
+static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+ struct nfs_server *server = NFS_SERVER(hdr->inode);
+
+ trace_nfs4_read(hdr, task->tk_status);
+ if (task->tk_status < 0) {
+ struct nfs4_exception exception = {
+ .inode = hdr->inode,
+ .state = hdr->args.context->state,
+ .stateid = &hdr->args.stateid,
+ };
+ task->tk_status = nfs4_async_handle_exception(task,
+ server, task->tk_status, &exception);
+ if (exception.retry) {
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
+ }
+ }
+
+ if (task->tk_status > 0)
+ renew_lease(server, hdr->timestamp);
+ return 0;
+}
+
+static bool nfs4_read_stateid_changed(struct rpc_task *task,
+ struct nfs_pgio_args *args)
+{
+
+ if (!nfs4_error_stateid_expired(task->tk_status) ||
+ nfs4_stateid_is_current(&args->stateid,
+ args->context,
+ args->lock_context,
+ FMODE_READ))
+ return false;
+ rpc_restart_call_prepare(task);
+ return true;
+}
+
+static bool nfs4_read_plus_not_supported(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ struct nfs_server *server = NFS_SERVER(hdr->inode);
+ struct rpc_message *msg = &task->tk_msg;
+
+ if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS] &&
+ server->caps & NFS_CAP_READ_PLUS && task->tk_status == -ENOTSUPP) {
+ server->caps &= ~NFS_CAP_READ_PLUS;
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+ rpc_restart_call_prepare(task);
+ return true;
+ }
+ return false;
+}
+
+static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+ if (!nfs4_sequence_done(task, &hdr->res.seq_res))
+ return -EAGAIN;
+ if (nfs4_read_stateid_changed(task, &hdr->args))
+ return -EAGAIN;
+ if (nfs4_read_plus_not_supported(task, hdr))
+ return -EAGAIN;
+ if (task->tk_status > 0)
+ nfs_invalidate_atime(hdr->inode);
+ return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
+ nfs4_read_done_cb(task, hdr);
+}
+
+#if defined CONFIG_NFS_V4_2 && defined CONFIG_NFS_V4_2_READ_PLUS
+static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
+{
+ /* Note: We don't use READ_PLUS with pNFS yet */
+ if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp) {
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS];
+ return nfs_read_alloc_scratch(hdr, READ_PLUS_SCRATCH_SIZE);
+ }
+ return false;
+}
+#else
+static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
+{
+ return false;
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
+{
+ hdr->timestamp = jiffies;
+ if (!hdr->pgio_done_cb)
+ hdr->pgio_done_cb = nfs4_read_done_cb;
+ if (!nfs42_read_plus_support(hdr, msg))
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+ nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
+}
+
+static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (nfs4_setup_sequence(NFS_SERVER(hdr->inode)->nfs_client,
+ &hdr->args.seq_args,
+ &hdr->res.seq_res,
+ task))
+ return 0;
+ if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+ hdr->args.lock_context,
+ hdr->rw_mode) == -EIO)
+ return -EIO;
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags)))
+ return -EIO;
+ return 0;
+}
+
+static int nfs4_write_done_cb(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ struct inode *inode = hdr->inode;
+
+ trace_nfs4_write(hdr, task->tk_status);
+ if (task->tk_status < 0) {
+ struct nfs4_exception exception = {
+ .inode = hdr->inode,
+ .state = hdr->args.context->state,
+ .stateid = &hdr->args.stateid,
+ };
+ task->tk_status = nfs4_async_handle_exception(task,
+ NFS_SERVER(inode), task->tk_status,
+ &exception);
+ if (exception.retry) {
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
+ }
+ }
+ if (task->tk_status >= 0) {
+ renew_lease(NFS_SERVER(inode), hdr->timestamp);
+ nfs_writeback_update_inode(hdr);
+ }
+ return 0;
+}
+
+static bool nfs4_write_stateid_changed(struct rpc_task *task,
+ struct nfs_pgio_args *args)
+{
+
+ if (!nfs4_error_stateid_expired(task->tk_status) ||
+ nfs4_stateid_is_current(&args->stateid,
+ args->context,
+ args->lock_context,
+ FMODE_WRITE))
+ return false;
+ rpc_restart_call_prepare(task);
+ return true;
+}
+
+static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+ if (!nfs4_sequence_done(task, &hdr->res.seq_res))
+ return -EAGAIN;
+ if (nfs4_write_stateid_changed(task, &hdr->args))
+ return -EAGAIN;
+ return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
+ nfs4_write_done_cb(task, hdr);
+}
+
+static
+bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
+{
+ /* Don't request attributes for pNFS or O_DIRECT writes */
+ if (hdr->ds_clp != NULL || hdr->dreq != NULL)
+ return false;
+ /* Otherwise, request attributes if and only if we don't hold
+ * a delegation
+ */
+ return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
+}
+
+void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[],
+ struct inode *inode, unsigned long cache_validity)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ unsigned int i;
+
+ memcpy(bitmask, src, sizeof(*bitmask) * NFS4_BITMASK_SZ);
+ cache_validity |= READ_ONCE(NFS_I(inode)->cache_validity);
+
+ if (cache_validity & NFS_INO_INVALID_CHANGE)
+ bitmask[0] |= FATTR4_WORD0_CHANGE;
+ if (cache_validity & NFS_INO_INVALID_ATIME)
+ bitmask[1] |= FATTR4_WORD1_TIME_ACCESS;
+ if (cache_validity & NFS_INO_INVALID_MODE)
+ bitmask[1] |= FATTR4_WORD1_MODE;
+ if (cache_validity & NFS_INO_INVALID_OTHER)
+ bitmask[1] |= FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP;
+ if (cache_validity & NFS_INO_INVALID_NLINK)
+ bitmask[1] |= FATTR4_WORD1_NUMLINKS;
+ if (cache_validity & NFS_INO_INVALID_CTIME)
+ bitmask[1] |= FATTR4_WORD1_TIME_METADATA;
+ if (cache_validity & NFS_INO_INVALID_MTIME)
+ bitmask[1] |= FATTR4_WORD1_TIME_MODIFY;
+ if (cache_validity & NFS_INO_INVALID_BLOCKS)
+ bitmask[1] |= FATTR4_WORD1_SPACE_USED;
+
+ if (cache_validity & NFS_INO_INVALID_SIZE)
+ bitmask[0] |= FATTR4_WORD0_SIZE;
+
+ for (i = 0; i < NFS4_BITMASK_SZ; i++)
+ bitmask[i] &= server->attr_bitmask[i];
+}
+
+static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg,
+ struct rpc_clnt **clnt)
+{
+ struct nfs_server *server = NFS_SERVER(hdr->inode);
+
+ if (!nfs4_write_need_cache_consistency_data(hdr)) {
+ hdr->args.bitmask = NULL;
+ hdr->res.fattr = NULL;
+ } else {
+ nfs4_bitmask_set(hdr->args.bitmask_store,
+ server->cache_consistency_bitmask,
+ hdr->inode, NFS_INO_INVALID_BLOCKS);
+ hdr->args.bitmask = hdr->args.bitmask_store;
+ }
+
+ if (!hdr->pgio_done_cb)
+ hdr->pgio_done_cb = nfs4_write_done_cb;
+ hdr->res.server = server;
+ hdr->timestamp = jiffies;
+
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
+ nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
+ nfs4_state_protect_write(hdr->ds_clp ? hdr->ds_clp : server->nfs_client, clnt, msg, hdr);
+}
+
+static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
+{
+ nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client,
+ &data->args.seq_args,
+ &data->res.seq_res,
+ task);
+}
+
+static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data)
+{
+ struct inode *inode = data->inode;
+
+ trace_nfs4_commit(data, task->tk_status);
+ if (nfs4_async_handle_error(task, NFS_SERVER(inode),
+ NULL, NULL) == -EAGAIN) {
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static int nfs4_commit_done(struct rpc_task *task, struct nfs_commit_data *data)
+{
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return -EAGAIN;
+ return data->commit_done_cb(task, data);
+}
+
+static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg,
+ struct rpc_clnt **clnt)
+{
+ struct nfs_server *server = NFS_SERVER(data->inode);
+
+ if (data->commit_done_cb == NULL)
+ data->commit_done_cb = nfs4_commit_done_cb;
+ data->res.server = server;
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0);
+ nfs4_state_protect(data->ds_clp ? data->ds_clp : server->nfs_client,
+ NFS_SP4_MACH_CRED_COMMIT, clnt, msg);
+}
+
+static int _nfs4_proc_commit(struct file *dst, struct nfs_commitargs *args,
+ struct nfs_commitres *res)
+{
+ struct inode *dst_inode = file_inode(dst);
+ struct nfs_server *server = NFS_SERVER(dst_inode);
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT],
+ .rpc_argp = args,
+ .rpc_resp = res,
+ };
+
+ args->fh = NFS_FH(dst_inode);
+ return nfs4_call_sync(server->client, server, &msg,
+ &args->seq_args, &res->seq_res, 1);
+}
+
+int nfs4_proc_commit(struct file *dst, __u64 offset, __u32 count, struct nfs_commitres *res)
+{
+ struct nfs_commitargs args = {
+ .offset = offset,
+ .count = count,
+ };
+ struct nfs_server *dst_server = NFS_SERVER(file_inode(dst));
+ struct nfs4_exception exception = { };
+ int status;
+
+ do {
+ status = _nfs4_proc_commit(dst, &args, res);
+ status = nfs4_handle_exception(dst_server, status, &exception);
+ } while (exception.retry);
+
+ return status;
+}
+
+struct nfs4_renewdata {
+ struct nfs_client *client;
+ unsigned long timestamp;
+};
+
+/*
+ * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
+ * standalone procedure for queueing an asynchronous RENEW.
+ */
+static void nfs4_renew_release(void *calldata)
+{
+ struct nfs4_renewdata *data = calldata;
+ struct nfs_client *clp = data->client;
+
+ if (refcount_read(&clp->cl_count) > 1)
+ nfs4_schedule_state_renewal(clp);
+ nfs_put_client(clp);
+ kfree(data);
+}
+
+static void nfs4_renew_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_renewdata *data = calldata;
+ struct nfs_client *clp = data->client;
+ unsigned long timestamp = data->timestamp;
+
+ trace_nfs4_renew_async(clp, task->tk_status);
+ switch (task->tk_status) {
+ case 0:
+ break;
+ case -NFS4ERR_LEASE_MOVED:
+ nfs4_schedule_lease_moved_recovery(clp);
+ break;
+ default:
+ /* Unless we're shutting down, schedule state recovery! */
+ if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0)
+ return;
+ if (task->tk_status != NFS4ERR_CB_PATH_DOWN) {
+ nfs4_schedule_lease_recovery(clp);
+ return;
+ }
+ nfs4_schedule_path_down_recovery(clp);
+ }
+ do_renew_lease(clp, timestamp);
+}
+
+static const struct rpc_call_ops nfs4_renew_ops = {
+ .rpc_call_done = nfs4_renew_done,
+ .rpc_release = nfs4_renew_release,
+};
+
+static int nfs4_proc_async_renew(struct nfs_client *clp, const struct cred *cred, unsigned renew_flags)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
+ .rpc_argp = clp,
+ .rpc_cred = cred,
+ };
+ struct nfs4_renewdata *data;
+
+ if (renew_flags == 0)
+ return 0;
+ if (!refcount_inc_not_zero(&clp->cl_count))
+ return -EIO;
+ data = kmalloc(sizeof(*data), GFP_NOFS);
+ if (data == NULL) {
+ nfs_put_client(clp);
+ return -ENOMEM;
+ }
+ data->client = clp;
+ data->timestamp = jiffies;
+ return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT,
+ &nfs4_renew_ops, data);
+}
+
+static int nfs4_proc_renew(struct nfs_client *clp, const struct cred *cred)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
+ .rpc_argp = clp,
+ .rpc_cred = cred,
+ };
+ unsigned long now = jiffies;
+ int status;
+
+ status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+ if (status < 0)
+ return status;
+ do_renew_lease(clp, now);
+ return 0;
+}
+
+static bool nfs4_server_supports_acls(const struct nfs_server *server,
+ enum nfs4_acl_type type)
+{
+ switch (type) {
+ default:
+ return server->attr_bitmask[0] & FATTR4_WORD0_ACL;
+ case NFS4ACL_DACL:
+ return server->attr_bitmask[1] & FATTR4_WORD1_DACL;
+ case NFS4ACL_SACL:
+ return server->attr_bitmask[1] & FATTR4_WORD1_SACL;
+ }
+}
+
+/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that
+ * it's OK to put sizeof(void) * (XATTR_SIZE_MAX/PAGE_SIZE) bytes on
+ * the stack.
+ */
+#define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE)
+
+int nfs4_buf_to_pages_noslab(const void *buf, size_t buflen,
+ struct page **pages)
+{
+ struct page *newpage, **spages;
+ int rc = 0;
+ size_t len;
+ spages = pages;
+
+ do {
+ len = min_t(size_t, PAGE_SIZE, buflen);
+ newpage = alloc_page(GFP_KERNEL);
+
+ if (newpage == NULL)
+ goto unwind;
+ memcpy(page_address(newpage), buf, len);
+ buf += len;
+ buflen -= len;
+ *pages++ = newpage;
+ rc++;
+ } while (buflen != 0);
+
+ return rc;
+
+unwind:
+ for(; rc > 0; rc--)
+ __free_page(spages[rc-1]);
+ return -ENOMEM;
+}
+
+struct nfs4_cached_acl {
+ enum nfs4_acl_type type;
+ int cached;
+ size_t len;
+ char data[];
+};
+
+static void nfs4_set_cached_acl(struct inode *inode, struct nfs4_cached_acl *acl)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ spin_lock(&inode->i_lock);
+ kfree(nfsi->nfs4_acl);
+ nfsi->nfs4_acl = acl;
+ spin_unlock(&inode->i_lock);
+}
+
+static void nfs4_zap_acl_attr(struct inode *inode)
+{
+ nfs4_set_cached_acl(inode, NULL);
+}
+
+static ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf,
+ size_t buflen, enum nfs4_acl_type type)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs4_cached_acl *acl;
+ int ret = -ENOENT;
+
+ spin_lock(&inode->i_lock);
+ acl = nfsi->nfs4_acl;
+ if (acl == NULL)
+ goto out;
+ if (acl->type != type)
+ goto out;
+ if (buf == NULL) /* user is just asking for length */
+ goto out_len;
+ if (acl->cached == 0)
+ goto out;
+ ret = -ERANGE; /* see getxattr(2) man page */
+ if (acl->len > buflen)
+ goto out;
+ memcpy(buf, acl->data, acl->len);
+out_len:
+ ret = acl->len;
+out:
+ spin_unlock(&inode->i_lock);
+ return ret;
+}
+
+static void nfs4_write_cached_acl(struct inode *inode, struct page **pages,
+ size_t pgbase, size_t acl_len,
+ enum nfs4_acl_type type)
+{
+ struct nfs4_cached_acl *acl;
+ size_t buflen = sizeof(*acl) + acl_len;
+
+ if (buflen <= PAGE_SIZE) {
+ acl = kmalloc(buflen, GFP_KERNEL);
+ if (acl == NULL)
+ goto out;
+ acl->cached = 1;
+ _copy_from_pages(acl->data, pages, pgbase, acl_len);
+ } else {
+ acl = kmalloc(sizeof(*acl), GFP_KERNEL);
+ if (acl == NULL)
+ goto out;
+ acl->cached = 0;
+ }
+ acl->type = type;
+ acl->len = acl_len;
+out:
+ nfs4_set_cached_acl(inode, acl);
+}
+
+/*
+ * The getxattr API returns the required buffer length when called with a
+ * NULL buf. The NFSv4 acl tool then calls getxattr again after allocating
+ * the required buf. On a NULL buf, we send a page of data to the server
+ * guessing that the ACL request can be serviced by a page. If so, we cache
+ * up to the page of ACL data, and the 2nd call to getxattr is serviced by
+ * the cache. If not so, we throw away the page, and cache the required
+ * length. The next getxattr call will then produce another round trip to
+ * the server, this time with the input buf of the required size.
+ */
+static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf,
+ size_t buflen, enum nfs4_acl_type type)
+{
+ struct page **pages;
+ struct nfs_getaclargs args = {
+ .fh = NFS_FH(inode),
+ .acl_type = type,
+ .acl_len = buflen,
+ };
+ struct nfs_getaclres res = {
+ .acl_type = type,
+ .acl_len = buflen,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ unsigned int npages;
+ int ret = -ENOMEM, i;
+ struct nfs_server *server = NFS_SERVER(inode);
+
+ if (buflen == 0)
+ buflen = server->rsize;
+
+ npages = DIV_ROUND_UP(buflen, PAGE_SIZE) + 1;
+ pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ args.acl_pages = pages;
+
+ for (i = 0; i < npages; i++) {
+ pages[i] = alloc_page(GFP_KERNEL);
+ if (!pages[i])
+ goto out_free;
+ }
+
+ /* for decoding across pages */
+ res.acl_scratch = alloc_page(GFP_KERNEL);
+ if (!res.acl_scratch)
+ goto out_free;
+
+ args.acl_len = npages * PAGE_SIZE;
+
+ dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n",
+ __func__, buf, buflen, npages, args.acl_len);
+ ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode),
+ &msg, &args.seq_args, &res.seq_res, 0);
+ if (ret)
+ goto out_free;
+
+ /* Handle the case where the passed-in buffer is too short */
+ if (res.acl_flags & NFS4_ACL_TRUNC) {
+ /* Did the user only issue a request for the acl length? */
+ if (buf == NULL)
+ goto out_ok;
+ ret = -ERANGE;
+ goto out_free;
+ }
+ nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len,
+ type);
+ if (buf) {
+ if (res.acl_len > buflen) {
+ ret = -ERANGE;
+ goto out_free;
+ }
+ _copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len);
+ }
+out_ok:
+ ret = res.acl_len;
+out_free:
+ while (--i >= 0)
+ __free_page(pages[i]);
+ if (res.acl_scratch)
+ __free_page(res.acl_scratch);
+ kfree(pages);
+ return ret;
+}
+
+static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf,
+ size_t buflen, enum nfs4_acl_type type)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ ssize_t ret;
+ do {
+ ret = __nfs4_get_acl_uncached(inode, buf, buflen, type);
+ trace_nfs4_get_acl(inode, ret);
+ if (ret >= 0)
+ break;
+ ret = nfs4_handle_exception(NFS_SERVER(inode), ret, &exception);
+ } while (exception.retry);
+ return ret;
+}
+
+static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen,
+ enum nfs4_acl_type type)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ int ret;
+
+ if (!nfs4_server_supports_acls(server, type))
+ return -EOPNOTSUPP;
+ ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
+ if (ret < 0)
+ return ret;
+ if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+ nfs_zap_acl_cache(inode);
+ ret = nfs4_read_cached_acl(inode, buf, buflen, type);
+ if (ret != -ENOENT)
+ /* -ENOENT is returned if there is no ACL or if there is an ACL
+ * but no cached acl data, just the acl length */
+ return ret;
+ return nfs4_get_acl_uncached(inode, buf, buflen, type);
+}
+
+static int __nfs4_proc_set_acl(struct inode *inode, const void *buf,
+ size_t buflen, enum nfs4_acl_type type)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct page *pages[NFS4ACL_MAXPAGES];
+ struct nfs_setaclargs arg = {
+ .fh = NFS_FH(inode),
+ .acl_type = type,
+ .acl_len = buflen,
+ .acl_pages = pages,
+ };
+ struct nfs_setaclres res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETACL],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE);
+ int ret, i;
+
+ /* You can't remove system.nfs4_acl: */
+ if (buflen == 0)
+ return -EINVAL;
+ if (!nfs4_server_supports_acls(server, type))
+ return -EOPNOTSUPP;
+ if (npages > ARRAY_SIZE(pages))
+ return -ERANGE;
+ i = nfs4_buf_to_pages_noslab(buf, buflen, arg.acl_pages);
+ if (i < 0)
+ return i;
+ nfs4_inode_make_writeable(inode);
+ ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+
+ /*
+ * Free each page after tx, so the only ref left is
+ * held by the network stack
+ */
+ for (; i > 0; i--)
+ put_page(pages[i-1]);
+
+ /*
+ * Acl update can result in inode attribute update.
+ * so mark the attribute cache invalid.
+ */
+ spin_lock(&inode->i_lock);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE |
+ NFS_INO_INVALID_CTIME |
+ NFS_INO_REVAL_FORCED);
+ spin_unlock(&inode->i_lock);
+ nfs_access_zap_cache(inode);
+ nfs_zap_acl_cache(inode);
+ return ret;
+}
+
+static int nfs4_proc_set_acl(struct inode *inode, const void *buf,
+ size_t buflen, enum nfs4_acl_type type)
+{
+ struct nfs4_exception exception = { };
+ int err;
+ do {
+ err = __nfs4_proc_set_acl(inode, buf, buflen, type);
+ trace_nfs4_set_acl(inode, err);
+ if (err == -NFS4ERR_BADOWNER || err == -NFS4ERR_BADNAME) {
+ /*
+ * no need to retry since the kernel
+ * isn't involved in encoding the ACEs.
+ */
+ err = -EINVAL;
+ break;
+ }
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static int _nfs4_get_security_label(struct inode *inode, void *buf,
+ size_t buflen)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs4_label label = {0, 0, buflen, buf};
+
+ u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+ struct nfs_fattr fattr = {
+ .label = &label,
+ };
+ struct nfs4_getattr_arg arg = {
+ .fh = NFS_FH(inode),
+ .bitmask = bitmask,
+ };
+ struct nfs4_getattr_res res = {
+ .fattr = &fattr,
+ .server = server,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ int ret;
+
+ nfs_fattr_init(&fattr);
+
+ ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0);
+ if (ret)
+ return ret;
+ if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
+ return -ENOENT;
+ return label.len;
+}
+
+static int nfs4_get_security_label(struct inode *inode, void *buf,
+ size_t buflen)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+
+ if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+ return -EOPNOTSUPP;
+
+ do {
+ err = _nfs4_get_security_label(inode, buf, buflen);
+ trace_nfs4_get_security_label(inode, err);
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+static int _nfs4_do_set_security_label(struct inode *inode,
+ struct nfs4_label *ilabel,
+ struct nfs_fattr *fattr)
+{
+
+ struct iattr sattr = {0};
+ struct nfs_server *server = NFS_SERVER(inode);
+ const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+ struct nfs_setattrargs arg = {
+ .fh = NFS_FH(inode),
+ .iap = &sattr,
+ .server = server,
+ .bitmask = bitmask,
+ .label = ilabel,
+ };
+ struct nfs_setattrres res = {
+ .fattr = fattr,
+ .server = server,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ int status;
+
+ nfs4_stateid_copy(&arg.stateid, &zero_stateid);
+
+ status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+ if (status)
+ dprintk("%s failed: %d\n", __func__, status);
+
+ return status;
+}
+
+static int nfs4_do_set_security_label(struct inode *inode,
+ struct nfs4_label *ilabel,
+ struct nfs_fattr *fattr)
+{
+ struct nfs4_exception exception = { };
+ int err;
+
+ do {
+ err = _nfs4_do_set_security_label(inode, ilabel, fattr);
+ trace_nfs4_set_security_label(inode, err);
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+static int
+nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen)
+{
+ struct nfs4_label ilabel = {0, 0, buflen, (char *)buf };
+ struct nfs_fattr *fattr;
+ int status;
+
+ if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+ return -EOPNOTSUPP;
+
+ fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
+ if (fattr == NULL)
+ return -ENOMEM;
+
+ status = nfs4_do_set_security_label(inode, &ilabel, fattr);
+ if (status == 0)
+ nfs_setsecurity(inode, fattr);
+
+ return status;
+}
+#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
+
+
+static void nfs4_init_boot_verifier(const struct nfs_client *clp,
+ nfs4_verifier *bootverf)
+{
+ __be32 verf[2];
+
+ if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
+ /* An impossible timestamp guarantees this value
+ * will never match a generated boot time. */
+ verf[0] = cpu_to_be32(U32_MAX);
+ verf[1] = cpu_to_be32(U32_MAX);
+ } else {
+ struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+ u64 ns = ktime_to_ns(nn->boot_time);
+
+ verf[0] = cpu_to_be32(ns >> 32);
+ verf[1] = cpu_to_be32(ns);
+ }
+ memcpy(bootverf->data, verf, sizeof(bootverf->data));
+}
+
+static size_t
+nfs4_get_uniquifier(struct nfs_client *clp, char *buf, size_t buflen)
+{
+ struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+ struct nfs_netns_client *nn_clp = nn->nfs_client;
+ const char *id;
+
+ buf[0] = '\0';
+
+ if (nn_clp) {
+ rcu_read_lock();
+ id = rcu_dereference(nn_clp->identifier);
+ if (id)
+ strscpy(buf, id, buflen);
+ rcu_read_unlock();
+ }
+
+ if (nfs4_client_id_uniquifier[0] != '\0' && buf[0] == '\0')
+ strscpy(buf, nfs4_client_id_uniquifier, buflen);
+
+ return strlen(buf);
+}
+
+static int
+nfs4_init_nonuniform_client_string(struct nfs_client *clp)
+{
+ char buf[NFS4_CLIENT_ID_UNIQ_LEN];
+ size_t buflen;
+ size_t len;
+ char *str;
+
+ if (clp->cl_owner_id != NULL)
+ return 0;
+
+ rcu_read_lock();
+ len = 14 +
+ strlen(clp->cl_rpcclient->cl_nodename) +
+ 1 +
+ strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
+ 1;
+ rcu_read_unlock();
+
+ buflen = nfs4_get_uniquifier(clp, buf, sizeof(buf));
+ if (buflen)
+ len += buflen + 1;
+
+ if (len > NFS4_OPAQUE_LIMIT + 1)
+ return -EINVAL;
+
+ /*
+ * Since this string is allocated at mount time, and held until the
+ * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+ * about a memory-reclaim deadlock.
+ */
+ str = kmalloc(len, GFP_KERNEL);
+ if (!str)
+ return -ENOMEM;
+
+ rcu_read_lock();
+ if (buflen)
+ scnprintf(str, len, "Linux NFSv4.0 %s/%s/%s",
+ clp->cl_rpcclient->cl_nodename, buf,
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR));
+ else
+ scnprintf(str, len, "Linux NFSv4.0 %s/%s",
+ clp->cl_rpcclient->cl_nodename,
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR));
+ rcu_read_unlock();
+
+ clp->cl_owner_id = str;
+ return 0;
+}
+
+static int
+nfs4_init_uniform_client_string(struct nfs_client *clp)
+{
+ char buf[NFS4_CLIENT_ID_UNIQ_LEN];
+ size_t buflen;
+ size_t len;
+ char *str;
+
+ if (clp->cl_owner_id != NULL)
+ return 0;
+
+ len = 10 + 10 + 1 + 10 + 1 +
+ strlen(clp->cl_rpcclient->cl_nodename) + 1;
+
+ buflen = nfs4_get_uniquifier(clp, buf, sizeof(buf));
+ if (buflen)
+ len += buflen + 1;
+
+ if (len > NFS4_OPAQUE_LIMIT + 1)
+ return -EINVAL;
+
+ /*
+ * Since this string is allocated at mount time, and held until the
+ * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+ * about a memory-reclaim deadlock.
+ */
+ str = kmalloc(len, GFP_KERNEL);
+ if (!str)
+ return -ENOMEM;
+
+ if (buflen)
+ scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
+ clp->rpc_ops->version, clp->cl_minorversion,
+ buf, clp->cl_rpcclient->cl_nodename);
+ else
+ scnprintf(str, len, "Linux NFSv%u.%u %s",
+ clp->rpc_ops->version, clp->cl_minorversion,
+ clp->cl_rpcclient->cl_nodename);
+ clp->cl_owner_id = str;
+ return 0;
+}
+
+/*
+ * nfs4_callback_up_net() starts only "tcp" and "tcp6" callback
+ * services. Advertise one based on the address family of the
+ * clientaddr.
+ */
+static unsigned int
+nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len)
+{
+ if (strchr(clp->cl_ipaddr, ':') != NULL)
+ return scnprintf(buf, len, "tcp6");
+ else
+ return scnprintf(buf, len, "tcp");
+}
+
+static void nfs4_setclientid_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_setclientid *sc = calldata;
+
+ if (task->tk_status == 0)
+ sc->sc_cred = get_rpccred(task->tk_rqstp->rq_cred);
+}
+
+static const struct rpc_call_ops nfs4_setclientid_ops = {
+ .rpc_call_done = nfs4_setclientid_done,
+};
+
+/**
+ * nfs4_proc_setclientid - Negotiate client ID
+ * @clp: state data structure
+ * @program: RPC program for NFSv4 callback service
+ * @port: IP port number for NFS4 callback service
+ * @cred: credential to use for this call
+ * @res: where to place the result
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status code.
+ */
+int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
+ unsigned short port, const struct cred *cred,
+ struct nfs4_setclientid_res *res)
+{
+ nfs4_verifier sc_verifier;
+ struct nfs4_setclientid setclientid = {
+ .sc_verifier = &sc_verifier,
+ .sc_prog = program,
+ .sc_clnt = clp,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
+ .rpc_argp = &setclientid,
+ .rpc_resp = res,
+ .rpc_cred = cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clp->cl_rpcclient,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_setclientid_ops,
+ .callback_data = &setclientid,
+ .flags = RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN,
+ };
+ unsigned long now = jiffies;
+ int status;
+
+ /* nfs_client_id4 */
+ nfs4_init_boot_verifier(clp, &sc_verifier);
+
+ if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags))
+ status = nfs4_init_uniform_client_string(clp);
+ else
+ status = nfs4_init_nonuniform_client_string(clp);
+
+ if (status)
+ goto out;
+
+ /* cb_client4 */
+ setclientid.sc_netid_len =
+ nfs4_init_callback_netid(clp,
+ setclientid.sc_netid,
+ sizeof(setclientid.sc_netid));
+ setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,
+ sizeof(setclientid.sc_uaddr), "%s.%u.%u",
+ clp->cl_ipaddr, port >> 8, port & 255);
+
+ dprintk("NFS call setclientid auth=%s, '%s'\n",
+ clp->cl_rpcclient->cl_auth->au_ops->au_name,
+ clp->cl_owner_id);
+
+ status = nfs4_call_sync_custom(&task_setup_data);
+ if (setclientid.sc_cred) {
+ kfree(clp->cl_acceptor);
+ clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
+ put_rpccred(setclientid.sc_cred);
+ }
+
+ if (status == 0)
+ do_renew_lease(clp, now);
+out:
+ trace_nfs4_setclientid(clp, status);
+ dprintk("NFS reply setclientid: %d\n", status);
+ return status;
+}
+
+/**
+ * nfs4_proc_setclientid_confirm - Confirm client ID
+ * @clp: state data structure
+ * @arg: result of a previous SETCLIENTID
+ * @cred: credential to use for this call
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status code.
+ */
+int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
+ struct nfs4_setclientid_res *arg,
+ const struct cred *cred)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM],
+ .rpc_argp = arg,
+ .rpc_cred = cred,
+ };
+ int status;
+
+ dprintk("NFS call setclientid_confirm auth=%s, (client ID %llx)\n",
+ clp->cl_rpcclient->cl_auth->au_ops->au_name,
+ clp->cl_clientid);
+ status = rpc_call_sync(clp->cl_rpcclient, &msg,
+ RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
+ trace_nfs4_setclientid_confirm(clp, status);
+ dprintk("NFS reply setclientid_confirm: %d\n", status);
+ return status;
+}
+
+struct nfs4_delegreturndata {
+ struct nfs4_delegreturnargs args;
+ struct nfs4_delegreturnres res;
+ struct nfs_fh fh;
+ nfs4_stateid stateid;
+ unsigned long timestamp;
+ struct {
+ struct nfs4_layoutreturn_args arg;
+ struct nfs4_layoutreturn_res res;
+ struct nfs4_xdr_opaque_data ld_private;
+ u32 roc_barrier;
+ bool roc;
+ } lr;
+ struct nfs_fattr fattr;
+ int rpc_status;
+ struct inode *inode;
+};
+
+static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_delegreturndata *data = calldata;
+ struct nfs4_exception exception = {
+ .inode = data->inode,
+ .stateid = &data->stateid,
+ .task_is_privileged = data->args.seq_args.sa_privileged,
+ };
+
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return;
+
+ trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status);
+
+ /* Handle Layoutreturn errors */
+ if (pnfs_roc_done(task, &data->args.lr_args, &data->res.lr_res,
+ &data->res.lr_ret) == -EAGAIN)
+ goto out_restart;
+
+ switch (task->tk_status) {
+ case 0:
+ renew_lease(data->res.server, data->timestamp);
+ break;
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ nfs4_free_revoked_stateid(data->res.server,
+ data->args.stateid,
+ task->tk_msg.rpc_cred);
+ fallthrough;
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_STALE_STATEID:
+ case -ETIMEDOUT:
+ task->tk_status = 0;
+ break;
+ case -NFS4ERR_OLD_STATEID:
+ if (!nfs4_refresh_delegation_stateid(&data->stateid, data->inode))
+ nfs4_stateid_seqid_inc(&data->stateid);
+ if (data->args.bitmask) {
+ data->args.bitmask = NULL;
+ data->res.fattr = NULL;
+ }
+ goto out_restart;
+ case -NFS4ERR_ACCESS:
+ if (data->args.bitmask) {
+ data->args.bitmask = NULL;
+ data->res.fattr = NULL;
+ goto out_restart;
+ }
+ fallthrough;
+ default:
+ task->tk_status = nfs4_async_handle_exception(task,
+ data->res.server, task->tk_status,
+ &exception);
+ if (exception.retry)
+ goto out_restart;
+ }
+ nfs_delegation_mark_returned(data->inode, data->args.stateid);
+ data->rpc_status = task->tk_status;
+ return;
+out_restart:
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
+}
+
+static void nfs4_delegreturn_release(void *calldata)
+{
+ struct nfs4_delegreturndata *data = calldata;
+ struct inode *inode = data->inode;
+
+ if (data->lr.roc)
+ pnfs_roc_release(&data->lr.arg, &data->lr.res,
+ data->res.lr_ret);
+ if (inode) {
+ nfs4_fattr_set_prechange(&data->fattr,
+ inode_peek_iversion_raw(inode));
+ nfs_refresh_inode(inode, &data->fattr);
+ nfs_iput_and_deactive(inode);
+ }
+ kfree(calldata);
+}
+
+static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
+{
+ struct nfs4_delegreturndata *d_data;
+ struct pnfs_layout_hdr *lo;
+
+ d_data = data;
+
+ if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) {
+ nfs4_sequence_done(task, &d_data->res.seq_res);
+ return;
+ }
+
+ lo = d_data->args.lr_args ? d_data->args.lr_args->layout : NULL;
+ if (lo && !pnfs_layout_is_valid(lo)) {
+ d_data->args.lr_args = NULL;
+ d_data->res.lr_res = NULL;
+ }
+
+ nfs4_setup_sequence(d_data->res.server->nfs_client,
+ &d_data->args.seq_args,
+ &d_data->res.seq_res,
+ task);
+}
+
+static const struct rpc_call_ops nfs4_delegreturn_ops = {
+ .rpc_call_prepare = nfs4_delegreturn_prepare,
+ .rpc_call_done = nfs4_delegreturn_done,
+ .rpc_release = nfs4_delegreturn_release,
+};
+
+static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync)
+{
+ struct nfs4_delegreturndata *data;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN],
+ .rpc_cred = cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = server->client,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_delegreturn_ops,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
+ };
+ int status = 0;
+
+ if (nfs_server_capable(inode, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (data == NULL)
+ return -ENOMEM;
+
+ nfs4_state_protect(server->nfs_client,
+ NFS_SP4_MACH_CRED_CLEANUP,
+ &task_setup_data.rpc_client, &msg);
+
+ data->args.fhandle = &data->fh;
+ data->args.stateid = &data->stateid;
+ nfs4_bitmask_set(data->args.bitmask_store,
+ server->cache_consistency_bitmask, inode, 0);
+ data->args.bitmask = data->args.bitmask_store;
+ nfs_copy_fh(&data->fh, NFS_FH(inode));
+ nfs4_stateid_copy(&data->stateid, stateid);
+ data->res.fattr = &data->fattr;
+ data->res.server = server;
+ data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ data->lr.arg.ld_private = &data->lr.ld_private;
+ nfs_fattr_init(data->res.fattr);
+ data->timestamp = jiffies;
+ data->rpc_status = 0;
+ data->inode = nfs_igrab_and_active(inode);
+ if (data->inode || issync) {
+ data->lr.roc = pnfs_roc(inode, &data->lr.arg, &data->lr.res,
+ cred);
+ if (data->lr.roc) {
+ data->args.lr_args = &data->lr.arg;
+ data->res.lr_res = &data->lr.res;
+ }
+ }
+
+ if (!data->inode)
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1,
+ 1);
+ else
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1,
+ 0);
+ task_setup_data.callback_data = data;
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ if (!issync)
+ goto out;
+ status = rpc_wait_for_completion_task(task);
+ if (status != 0)
+ goto out;
+ status = data->rpc_status;
+out:
+ rpc_put_task(task);
+ return status;
+}
+
+int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs4_exception exception = { };
+ int err;
+ do {
+ err = _nfs4_proc_delegreturn(inode, cred, stateid, issync);
+ trace_nfs4_delegreturn(inode, stateid, err);
+ switch (err) {
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_EXPIRED:
+ case 0:
+ return 0;
+ }
+ err = nfs4_handle_exception(server, err, &exception);
+ } while (exception.retry);
+ return err;
+}
+
+static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+ struct inode *inode = state->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs_lockt_args arg = {
+ .fh = NFS_FH(inode),
+ .fl = request,
+ };
+ struct nfs_lockt_res res = {
+ .denied = request,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKT],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ .rpc_cred = state->owner->so_cred,
+ };
+ struct nfs4_lock_state *lsp;
+ int status;
+
+ arg.lock_owner.clientid = clp->cl_clientid;
+ status = nfs4_set_lock_state(state, request);
+ if (status != 0)
+ goto out;
+ lsp = request->fl_u.nfs4_fl.owner;
+ arg.lock_owner.id = lsp->ls_seqid.owner_id;
+ arg.lock_owner.s_dev = server->s_dev;
+ status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+ switch (status) {
+ case 0:
+ request->fl_type = F_UNLCK;
+ break;
+ case -NFS4ERR_DENIED:
+ status = 0;
+ }
+ request->fl_ops->fl_release_private(request);
+ request->fl_ops = NULL;
+out:
+ return status;
+}
+
+static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+
+ do {
+ err = _nfs4_proc_getlk(state, cmd, request);
+ trace_nfs4_get_lock(request, state, cmd, err);
+ err = nfs4_handle_exception(NFS_SERVER(state->inode), err,
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+/*
+ * Update the seqid of a lock stateid after receiving
+ * NFS4ERR_OLD_STATEID
+ */
+static bool nfs4_refresh_lock_old_stateid(nfs4_stateid *dst,
+ struct nfs4_lock_state *lsp)
+{
+ struct nfs4_state *state = lsp->ls_state;
+ bool ret = false;
+
+ spin_lock(&state->state_lock);
+ if (!nfs4_stateid_match_other(dst, &lsp->ls_stateid))
+ goto out;
+ if (!nfs4_stateid_is_newer(&lsp->ls_stateid, dst))
+ nfs4_stateid_seqid_inc(dst);
+ else
+ dst->seqid = lsp->ls_stateid.seqid;
+ ret = true;
+out:
+ spin_unlock(&state->state_lock);
+ return ret;
+}
+
+static bool nfs4_sync_lock_stateid(nfs4_stateid *dst,
+ struct nfs4_lock_state *lsp)
+{
+ struct nfs4_state *state = lsp->ls_state;
+ bool ret;
+
+ spin_lock(&state->state_lock);
+ ret = !nfs4_stateid_match_other(dst, &lsp->ls_stateid);
+ nfs4_stateid_copy(dst, &lsp->ls_stateid);
+ spin_unlock(&state->state_lock);
+ return ret;
+}
+
+struct nfs4_unlockdata {
+ struct nfs_locku_args arg;
+ struct nfs_locku_res res;
+ struct nfs4_lock_state *lsp;
+ struct nfs_open_context *ctx;
+ struct nfs_lock_context *l_ctx;
+ struct file_lock fl;
+ struct nfs_server *server;
+ unsigned long timestamp;
+};
+
+static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
+ struct nfs_open_context *ctx,
+ struct nfs4_lock_state *lsp,
+ struct nfs_seqid *seqid)
+{
+ struct nfs4_unlockdata *p;
+ struct nfs4_state *state = lsp->ls_state;
+ struct inode *inode = state->inode;
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (p == NULL)
+ return NULL;
+ p->arg.fh = NFS_FH(inode);
+ p->arg.fl = &p->fl;
+ p->arg.seqid = seqid;
+ p->res.seqid = seqid;
+ p->lsp = lsp;
+ /* Ensure we don't close file until we're done freeing locks! */
+ p->ctx = get_nfs_open_context(ctx);
+ p->l_ctx = nfs_get_lock_context(ctx);
+ locks_init_lock(&p->fl);
+ locks_copy_lock(&p->fl, fl);
+ p->server = NFS_SERVER(inode);
+ spin_lock(&state->state_lock);
+ nfs4_stateid_copy(&p->arg.stateid, &lsp->ls_stateid);
+ spin_unlock(&state->state_lock);
+ return p;
+}
+
+static void nfs4_locku_release_calldata(void *data)
+{
+ struct nfs4_unlockdata *calldata = data;
+ nfs_free_seqid(calldata->arg.seqid);
+ nfs4_put_lock_state(calldata->lsp);
+ nfs_put_lock_context(calldata->l_ctx);
+ put_nfs_open_context(calldata->ctx);
+ kfree(calldata);
+}
+
+static void nfs4_locku_done(struct rpc_task *task, void *data)
+{
+ struct nfs4_unlockdata *calldata = data;
+ struct nfs4_exception exception = {
+ .inode = calldata->lsp->ls_state->inode,
+ .stateid = &calldata->arg.stateid,
+ };
+
+ if (!nfs4_sequence_done(task, &calldata->res.seq_res))
+ return;
+ switch (task->tk_status) {
+ case 0:
+ renew_lease(calldata->server, calldata->timestamp);
+ locks_lock_inode_wait(calldata->lsp->ls_state->inode, &calldata->fl);
+ if (nfs4_update_lock_stateid(calldata->lsp,
+ &calldata->res.stateid))
+ break;
+ fallthrough;
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ nfs4_free_revoked_stateid(calldata->server,
+ &calldata->arg.stateid,
+ task->tk_msg.rpc_cred);
+ fallthrough;
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_STALE_STATEID:
+ if (nfs4_sync_lock_stateid(&calldata->arg.stateid,
+ calldata->lsp))
+ rpc_restart_call_prepare(task);
+ break;
+ case -NFS4ERR_OLD_STATEID:
+ if (nfs4_refresh_lock_old_stateid(&calldata->arg.stateid,
+ calldata->lsp))
+ rpc_restart_call_prepare(task);
+ break;
+ default:
+ task->tk_status = nfs4_async_handle_exception(task,
+ calldata->server, task->tk_status,
+ &exception);
+ if (exception.retry)
+ rpc_restart_call_prepare(task);
+ }
+ nfs_release_seqid(calldata->arg.seqid);
+}
+
+static void nfs4_locku_prepare(struct rpc_task *task, void *data)
+{
+ struct nfs4_unlockdata *calldata = data;
+
+ if (test_bit(NFS_CONTEXT_UNLOCK, &calldata->l_ctx->open_context->flags) &&
+ nfs_async_iocounter_wait(task, calldata->l_ctx))
+ return;
+
+ if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
+ goto out_wait;
+ if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
+ /* Note: exit _without_ running nfs4_locku_done */
+ goto out_no_action;
+ }
+ calldata->timestamp = jiffies;
+ if (nfs4_setup_sequence(calldata->server->nfs_client,
+ &calldata->arg.seq_args,
+ &calldata->res.seq_res,
+ task) != 0)
+ nfs_release_seqid(calldata->arg.seqid);
+ return;
+out_no_action:
+ task->tk_action = NULL;
+out_wait:
+ nfs4_sequence_done(task, &calldata->res.seq_res);
+}
+
+static const struct rpc_call_ops nfs4_locku_ops = {
+ .rpc_call_prepare = nfs4_locku_prepare,
+ .rpc_call_done = nfs4_locku_done,
+ .rpc_release = nfs4_locku_release_calldata,
+};
+
+static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
+ struct nfs_open_context *ctx,
+ struct nfs4_lock_state *lsp,
+ struct nfs_seqid *seqid)
+{
+ struct nfs4_unlockdata *data;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU],
+ .rpc_cred = ctx->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = NFS_CLIENT(lsp->ls_state->inode),
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_locku_ops,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC,
+ };
+
+ if (nfs_server_capable(lsp->ls_state->inode, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
+ nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client,
+ NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg);
+
+ /* Ensure this is an unlock - when canceling a lock, the
+ * canceled lock is passed in, and it won't be an unlock.
+ */
+ fl->fl_type = F_UNLCK;
+ if (fl->fl_flags & FL_CLOSE)
+ set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags);
+
+ data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid);
+ if (data == NULL) {
+ nfs_free_seqid(seqid);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1, 0);
+ msg.rpc_argp = &data->arg;
+ msg.rpc_resp = &data->res;
+ task_setup_data.callback_data = data;
+ return rpc_run_task(&task_setup_data);
+}
+
+static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+ struct inode *inode = state->inode;
+ struct nfs4_state_owner *sp = state->owner;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_seqid *seqid;
+ struct nfs4_lock_state *lsp;
+ struct rpc_task *task;
+ struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+ int status = 0;
+ unsigned char fl_flags = request->fl_flags;
+
+ status = nfs4_set_lock_state(state, request);
+ /* Unlock _before_ we do the RPC call */
+ request->fl_flags |= FL_EXISTS;
+ /* Exclude nfs_delegation_claim_locks() */
+ mutex_lock(&sp->so_delegreturn_mutex);
+ /* Exclude nfs4_reclaim_open_stateid() - note nesting! */
+ down_read(&nfsi->rwsem);
+ if (locks_lock_inode_wait(inode, request) == -ENOENT) {
+ up_read(&nfsi->rwsem);
+ mutex_unlock(&sp->so_delegreturn_mutex);
+ goto out;
+ }
+ lsp = request->fl_u.nfs4_fl.owner;
+ set_bit(NFS_LOCK_UNLOCKING, &lsp->ls_flags);
+ up_read(&nfsi->rwsem);
+ mutex_unlock(&sp->so_delegreturn_mutex);
+ if (status != 0)
+ goto out;
+ /* Is this a delegated lock? */
+ if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0)
+ goto out;
+ alloc_seqid = NFS_SERVER(inode)->nfs_client->cl_mvops->alloc_seqid;
+ seqid = alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
+ status = -ENOMEM;
+ if (IS_ERR(seqid))
+ goto out;
+ task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid);
+ status = PTR_ERR(task);
+ if (IS_ERR(task))
+ goto out;
+ status = rpc_wait_for_completion_task(task);
+ rpc_put_task(task);
+out:
+ request->fl_flags = fl_flags;
+ trace_nfs4_unlock(request, state, F_SETLK, status);
+ return status;
+}
+
+struct nfs4_lockdata {
+ struct nfs_lock_args arg;
+ struct nfs_lock_res res;
+ struct nfs4_lock_state *lsp;
+ struct nfs_open_context *ctx;
+ struct file_lock fl;
+ unsigned long timestamp;
+ int rpc_status;
+ int cancelled;
+ struct nfs_server *server;
+};
+
+static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
+ struct nfs_open_context *ctx, struct nfs4_lock_state *lsp,
+ gfp_t gfp_mask)
+{
+ struct nfs4_lockdata *p;
+ struct inode *inode = lsp->ls_state->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+
+ p = kzalloc(sizeof(*p), gfp_mask);
+ if (p == NULL)
+ return NULL;
+
+ p->arg.fh = NFS_FH(inode);
+ p->arg.fl = &p->fl;
+ p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask);
+ if (IS_ERR(p->arg.open_seqid))
+ goto out_free;
+ alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
+ p->arg.lock_seqid = alloc_seqid(&lsp->ls_seqid, gfp_mask);
+ if (IS_ERR(p->arg.lock_seqid))
+ goto out_free_seqid;
+ p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
+ p->arg.lock_owner.id = lsp->ls_seqid.owner_id;
+ p->arg.lock_owner.s_dev = server->s_dev;
+ p->res.lock_seqid = p->arg.lock_seqid;
+ p->lsp = lsp;
+ p->server = server;
+ p->ctx = get_nfs_open_context(ctx);
+ locks_init_lock(&p->fl);
+ locks_copy_lock(&p->fl, fl);
+ return p;
+out_free_seqid:
+ nfs_free_seqid(p->arg.open_seqid);
+out_free:
+ kfree(p);
+ return NULL;
+}
+
+static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_lockdata *data = calldata;
+ struct nfs4_state *state = data->lsp->ls_state;
+
+ if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
+ goto out_wait;
+ /* Do we need to do an open_to_lock_owner? */
+ if (!test_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags)) {
+ if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) {
+ goto out_release_lock_seqid;
+ }
+ nfs4_stateid_copy(&data->arg.open_stateid,
+ &state->open_stateid);
+ data->arg.new_lock_owner = 1;
+ data->res.open_seqid = data->arg.open_seqid;
+ } else {
+ data->arg.new_lock_owner = 0;
+ nfs4_stateid_copy(&data->arg.lock_stateid,
+ &data->lsp->ls_stateid);
+ }
+ if (!nfs4_valid_open_stateid(state)) {
+ data->rpc_status = -EBADF;
+ task->tk_action = NULL;
+ goto out_release_open_seqid;
+ }
+ data->timestamp = jiffies;
+ if (nfs4_setup_sequence(data->server->nfs_client,
+ &data->arg.seq_args,
+ &data->res.seq_res,
+ task) == 0)
+ return;
+out_release_open_seqid:
+ nfs_release_seqid(data->arg.open_seqid);
+out_release_lock_seqid:
+ nfs_release_seqid(data->arg.lock_seqid);
+out_wait:
+ nfs4_sequence_done(task, &data->res.seq_res);
+ dprintk("%s: ret = %d\n", __func__, data->rpc_status);
+}
+
+static void nfs4_lock_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_lockdata *data = calldata;
+ struct nfs4_lock_state *lsp = data->lsp;
+
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return;
+
+ data->rpc_status = task->tk_status;
+ switch (task->tk_status) {
+ case 0:
+ renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)),
+ data->timestamp);
+ if (data->arg.new_lock && !data->cancelled) {
+ data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
+ if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0)
+ goto out_restart;
+ }
+ if (data->arg.new_lock_owner != 0) {
+ nfs_confirm_seqid(&lsp->ls_seqid, 0);
+ nfs4_stateid_copy(&lsp->ls_stateid, &data->res.stateid);
+ set_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
+ } else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid))
+ goto out_restart;
+ break;
+ case -NFS4ERR_OLD_STATEID:
+ if (data->arg.new_lock_owner != 0 &&
+ nfs4_refresh_open_old_stateid(&data->arg.open_stateid,
+ lsp->ls_state))
+ goto out_restart;
+ if (nfs4_refresh_lock_old_stateid(&data->arg.lock_stateid, lsp))
+ goto out_restart;
+ fallthrough;
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_EXPIRED:
+ if (data->arg.new_lock_owner != 0) {
+ if (!nfs4_stateid_match(&data->arg.open_stateid,
+ &lsp->ls_state->open_stateid))
+ goto out_restart;
+ } else if (!nfs4_stateid_match(&data->arg.lock_stateid,
+ &lsp->ls_stateid))
+ goto out_restart;
+ }
+out_done:
+ dprintk("%s: ret = %d!\n", __func__, data->rpc_status);
+ return;
+out_restart:
+ if (!data->cancelled)
+ rpc_restart_call_prepare(task);
+ goto out_done;
+}
+
+static void nfs4_lock_release(void *calldata)
+{
+ struct nfs4_lockdata *data = calldata;
+
+ nfs_free_seqid(data->arg.open_seqid);
+ if (data->cancelled && data->rpc_status == 0) {
+ struct rpc_task *task;
+ task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,
+ data->arg.lock_seqid);
+ if (!IS_ERR(task))
+ rpc_put_task_async(task);
+ dprintk("%s: cancelling lock!\n", __func__);
+ } else
+ nfs_free_seqid(data->arg.lock_seqid);
+ nfs4_put_lock_state(data->lsp);
+ put_nfs_open_context(data->ctx);
+ kfree(data);
+}
+
+static const struct rpc_call_ops nfs4_lock_ops = {
+ .rpc_call_prepare = nfs4_lock_prepare,
+ .rpc_call_done = nfs4_lock_done,
+ .rpc_release = nfs4_lock_release,
+};
+
+static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
+{
+ switch (error) {
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_BAD_STATEID:
+ lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+ if (new_lock_owner != 0 ||
+ test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0)
+ nfs4_schedule_stateid_recovery(server, lsp->ls_state);
+ break;
+ case -NFS4ERR_STALE_STATEID:
+ lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+ nfs4_schedule_lease_recovery(server->nfs_client);
+ }
+}
+
+static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int recovery_type)
+{
+ struct nfs4_lockdata *data;
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK],
+ .rpc_cred = state->owner->so_cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = NFS_CLIENT(state->inode),
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_lock_ops,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
+ };
+ int ret;
+
+ if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
+ data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
+ fl->fl_u.nfs4_fl.owner, GFP_KERNEL);
+ if (data == NULL)
+ return -ENOMEM;
+ if (IS_SETLKW(cmd))
+ data->arg.block = 1;
+ nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1,
+ recovery_type > NFS_LOCK_NEW);
+ msg.rpc_argp = &data->arg;
+ msg.rpc_resp = &data->res;
+ task_setup_data.callback_data = data;
+ if (recovery_type > NFS_LOCK_NEW) {
+ if (recovery_type == NFS_LOCK_RECLAIM)
+ data->arg.reclaim = NFS_LOCK_RECLAIM;
+ } else
+ data->arg.new_lock = 1;
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ ret = rpc_wait_for_completion_task(task);
+ if (ret == 0) {
+ ret = data->rpc_status;
+ if (ret)
+ nfs4_handle_setlk_error(data->server, data->lsp,
+ data->arg.new_lock_owner, ret);
+ } else
+ data->cancelled = true;
+ trace_nfs4_set_lock(fl, state, &data->res.stateid, cmd, ret);
+ rpc_put_task(task);
+ dprintk("%s: ret = %d\n", __func__, ret);
+ return ret;
+}
+
+static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs4_exception exception = {
+ .inode = state->inode,
+ };
+ int err;
+
+ do {
+ /* Cache the lock if possible... */
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
+ return 0;
+ err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
+ if (err != -NFS4ERR_DELAY)
+ break;
+ nfs4_handle_exception(server, err, &exception);
+ } while (exception.retry);
+ return err;
+}
+
+static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs4_exception exception = {
+ .inode = state->inode,
+ };
+ int err;
+
+ err = nfs4_set_lock_state(state, request);
+ if (err != 0)
+ return err;
+ if (!recover_lost_locks) {
+ set_bit(NFS_LOCK_LOST, &request->fl_u.nfs4_fl.owner->ls_flags);
+ return 0;
+ }
+ do {
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
+ return 0;
+ err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED);
+ switch (err) {
+ default:
+ goto out;
+ case -NFS4ERR_GRACE:
+ case -NFS4ERR_DELAY:
+ nfs4_handle_exception(server, err, &exception);
+ err = 0;
+ }
+ } while (exception.retry);
+out:
+ return err;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request)
+{
+ struct nfs4_lock_state *lsp;
+ int status;
+
+ status = nfs4_set_lock_state(state, request);
+ if (status != 0)
+ return status;
+ lsp = request->fl_u.nfs4_fl.owner;
+ if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) ||
+ test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
+ return 0;
+ return nfs4_lock_expired(state, request);
+}
+#endif
+
+static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+ struct nfs_inode *nfsi = NFS_I(state->inode);
+ struct nfs4_state_owner *sp = state->owner;
+ unsigned char fl_flags = request->fl_flags;
+ int status;
+
+ request->fl_flags |= FL_ACCESS;
+ status = locks_lock_inode_wait(state->inode, request);
+ if (status < 0)
+ goto out;
+ mutex_lock(&sp->so_delegreturn_mutex);
+ down_read(&nfsi->rwsem);
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
+ /* Yes: cache locks! */
+ /* ...but avoid races with delegation recall... */
+ request->fl_flags = fl_flags & ~FL_SLEEP;
+ status = locks_lock_inode_wait(state->inode, request);
+ up_read(&nfsi->rwsem);
+ mutex_unlock(&sp->so_delegreturn_mutex);
+ goto out;
+ }
+ up_read(&nfsi->rwsem);
+ mutex_unlock(&sp->so_delegreturn_mutex);
+ status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
+out:
+ request->fl_flags = fl_flags;
+ return status;
+}
+
+static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+ struct nfs4_exception exception = {
+ .state = state,
+ .inode = state->inode,
+ .interruptible = true,
+ };
+ int err;
+
+ do {
+ err = _nfs4_proc_setlk(state, cmd, request);
+ if (err == -NFS4ERR_DENIED)
+ err = -EAGAIN;
+ err = nfs4_handle_exception(NFS_SERVER(state->inode),
+ err, &exception);
+ } while (exception.retry);
+ return err;
+}
+
+#define NFS4_LOCK_MINTIMEOUT (1 * HZ)
+#define NFS4_LOCK_MAXTIMEOUT (30 * HZ)
+
+static int
+nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd,
+ struct file_lock *request)
+{
+ int status = -ERESTARTSYS;
+ unsigned long timeout = NFS4_LOCK_MINTIMEOUT;
+
+ while(!signalled()) {
+ status = nfs4_proc_setlk(state, cmd, request);
+ if ((status != -EAGAIN) || IS_SETLK(cmd))
+ break;
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ schedule_timeout(timeout);
+ timeout *= 2;
+ timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout);
+ status = -ERESTARTSYS;
+ }
+ return status;
+}
+
+#ifdef CONFIG_NFS_V4_1
+struct nfs4_lock_waiter {
+ struct inode *inode;
+ struct nfs_lowner owner;
+ wait_queue_entry_t wait;
+};
+
+static int
+nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key)
+{
+ struct nfs4_lock_waiter *waiter =
+ container_of(wait, struct nfs4_lock_waiter, wait);
+
+ /* NULL key means to wake up everyone */
+ if (key) {
+ struct cb_notify_lock_args *cbnl = key;
+ struct nfs_lowner *lowner = &cbnl->cbnl_owner,
+ *wowner = &waiter->owner;
+
+ /* Only wake if the callback was for the same owner. */
+ if (lowner->id != wowner->id || lowner->s_dev != wowner->s_dev)
+ return 0;
+
+ /* Make sure it's for the right inode */
+ if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh))
+ return 0;
+ }
+
+ return woken_wake_function(wait, mode, flags, key);
+}
+
+static int
+nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+ struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner;
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs_client *clp = server->nfs_client;
+ wait_queue_head_t *q = &clp->cl_lock_waitq;
+ struct nfs4_lock_waiter waiter = {
+ .inode = state->inode,
+ .owner = { .clientid = clp->cl_clientid,
+ .id = lsp->ls_seqid.owner_id,
+ .s_dev = server->s_dev },
+ };
+ int status;
+
+ /* Don't bother with waitqueue if we don't expect a callback */
+ if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags))
+ return nfs4_retry_setlk_simple(state, cmd, request);
+
+ init_wait(&waiter.wait);
+ waiter.wait.func = nfs4_wake_lock_waiter;
+ add_wait_queue(q, &waiter.wait);
+
+ do {
+ status = nfs4_proc_setlk(state, cmd, request);
+ if (status != -EAGAIN || IS_SETLK(cmd))
+ break;
+
+ status = -ERESTARTSYS;
+ wait_woken(&waiter.wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE,
+ NFS4_LOCK_MAXTIMEOUT);
+ } while (!signalled());
+
+ remove_wait_queue(q, &waiter.wait);
+
+ return status;
+}
+#else /* !CONFIG_NFS_V4_1 */
+static inline int
+nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+ return nfs4_retry_setlk_simple(state, cmd, request);
+}
+#endif
+
+static int
+nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
+{
+ struct nfs_open_context *ctx;
+ struct nfs4_state *state;
+ int status;
+
+ /* verify open state */
+ ctx = nfs_file_open_context(filp);
+ state = ctx->state;
+
+ if (IS_GETLK(cmd)) {
+ if (state != NULL)
+ return nfs4_proc_getlk(state, F_GETLK, request);
+ return 0;
+ }
+
+ if (!(IS_SETLK(cmd) || IS_SETLKW(cmd)))
+ return -EINVAL;
+
+ if (request->fl_type == F_UNLCK) {
+ if (state != NULL)
+ return nfs4_proc_unlck(state, cmd, request);
+ return 0;
+ }
+
+ if (state == NULL)
+ return -ENOLCK;
+
+ if ((request->fl_flags & FL_POSIX) &&
+ !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
+ return -ENOLCK;
+
+ /*
+ * Don't rely on the VFS having checked the file open mode,
+ * since it won't do this for flock() locks.
+ */
+ switch (request->fl_type) {
+ case F_RDLCK:
+ if (!(filp->f_mode & FMODE_READ))
+ return -EBADF;
+ break;
+ case F_WRLCK:
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+ }
+
+ status = nfs4_set_lock_state(state, request);
+ if (status != 0)
+ return status;
+
+ return nfs4_retry_setlk(state, cmd, request);
+}
+
+static int nfs4_delete_lease(struct file *file, void **priv)
+{
+ return generic_setlease(file, F_UNLCK, NULL, priv);
+}
+
+static int nfs4_add_lease(struct file *file, int arg, struct file_lock **lease,
+ void **priv)
+{
+ struct inode *inode = file_inode(file);
+ fmode_t type = arg == F_RDLCK ? FMODE_READ : FMODE_WRITE;
+ int ret;
+
+ /* No delegation, no lease */
+ if (!nfs4_have_delegation(inode, type))
+ return -EAGAIN;
+ ret = generic_setlease(file, arg, lease, priv);
+ if (ret || nfs4_have_delegation(inode, type))
+ return ret;
+ /* We raced with a delegation return */
+ nfs4_delete_lease(file, priv);
+ return -EAGAIN;
+}
+
+int nfs4_proc_setlease(struct file *file, int arg, struct file_lock **lease,
+ void **priv)
+{
+ switch (arg) {
+ case F_RDLCK:
+ case F_WRLCK:
+ return nfs4_add_lease(file, arg, lease, priv);
+ case F_UNLCK:
+ return nfs4_delete_lease(file, priv);
+ default:
+ return -EINVAL;
+ }
+}
+
+int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ int err;
+
+ err = nfs4_set_lock_state(state, fl);
+ if (err != 0)
+ return err;
+ do {
+ err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
+ if (err != -NFS4ERR_DELAY)
+ break;
+ ssleep(1);
+ } while (err == -NFS4ERR_DELAY);
+ return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err);
+}
+
+struct nfs_release_lockowner_data {
+ struct nfs4_lock_state *lsp;
+ struct nfs_server *server;
+ struct nfs_release_lockowner_args args;
+ struct nfs_release_lockowner_res res;
+ unsigned long timestamp;
+};
+
+static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs_release_lockowner_data *data = calldata;
+ struct nfs_server *server = data->server;
+ nfs4_setup_sequence(server->nfs_client, &data->args.seq_args,
+ &data->res.seq_res, task);
+ data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
+ data->timestamp = jiffies;
+}
+
+static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs_release_lockowner_data *data = calldata;
+ struct nfs_server *server = data->server;
+
+ nfs40_sequence_done(task, &data->res.seq_res);
+
+ switch (task->tk_status) {
+ case 0:
+ renew_lease(server, data->timestamp);
+ break;
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_EXPIRED:
+ nfs4_schedule_lease_recovery(server->nfs_client);
+ break;
+ case -NFS4ERR_LEASE_MOVED:
+ case -NFS4ERR_DELAY:
+ if (nfs4_async_handle_error(task, server,
+ NULL, NULL) == -EAGAIN)
+ rpc_restart_call_prepare(task);
+ }
+}
+
+static void nfs4_release_lockowner_release(void *calldata)
+{
+ struct nfs_release_lockowner_data *data = calldata;
+ nfs4_free_lock_state(data->server, data->lsp);
+ kfree(calldata);
+}
+
+static const struct rpc_call_ops nfs4_release_lockowner_ops = {
+ .rpc_call_prepare = nfs4_release_lockowner_prepare,
+ .rpc_call_done = nfs4_release_lockowner_done,
+ .rpc_release = nfs4_release_lockowner_release,
+};
+
+static void
+nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
+{
+ struct nfs_release_lockowner_data *data;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],
+ };
+
+ if (server->nfs_client->cl_mvops->minor_version != 0)
+ return;
+
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return;
+ data->lsp = lsp;
+ data->server = server;
+ data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
+ data->args.lock_owner.id = lsp->ls_seqid.owner_id;
+ data->args.lock_owner.s_dev = server->s_dev;
+
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0);
+ rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
+}
+
+#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
+
+static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
+{
+ return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_ACL);
+}
+
+static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, void *buf, size_t buflen)
+{
+ return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_ACL);
+}
+
+static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
+{
+ return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_ACL);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+#define XATTR_NAME_NFSV4_DACL "system.nfs4_dacl"
+
+static int nfs4_xattr_set_nfs4_dacl(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
+{
+ return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_DACL);
+}
+
+static int nfs4_xattr_get_nfs4_dacl(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, void *buf, size_t buflen)
+{
+ return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_DACL);
+}
+
+static bool nfs4_xattr_list_nfs4_dacl(struct dentry *dentry)
+{
+ return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_DACL);
+}
+
+#define XATTR_NAME_NFSV4_SACL "system.nfs4_sacl"
+
+static int nfs4_xattr_set_nfs4_sacl(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
+{
+ return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_SACL);
+}
+
+static int nfs4_xattr_get_nfs4_sacl(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, void *buf, size_t buflen)
+{
+ return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_SACL);
+}
+
+static bool nfs4_xattr_list_nfs4_sacl(struct dentry *dentry)
+{
+ return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_SACL);
+}
+
+#endif
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+
+static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
+{
+ if (security_ismaclabel(key))
+ return nfs4_set_security_label(inode, buf, buflen);
+
+ return -EOPNOTSUPP;
+}
+
+static int nfs4_xattr_get_nfs4_label(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, void *buf, size_t buflen)
+{
+ if (security_ismaclabel(key))
+ return nfs4_get_security_label(inode, buf, buflen);
+ return -EOPNOTSUPP;
+}
+
+static ssize_t
+nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
+{
+ int len = 0;
+
+ if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) {
+ len = security_inode_listsecurity(inode, list, list_len);
+ if (len >= 0 && list_len && len > list_len)
+ return -ERANGE;
+ }
+ return len;
+}
+
+static const struct xattr_handler nfs4_xattr_nfs4_label_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = nfs4_xattr_get_nfs4_label,
+ .set = nfs4_xattr_set_nfs4_label,
+};
+
+#else
+
+static ssize_t
+nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
+{
+ return 0;
+}
+
+#endif
+
+#ifdef CONFIG_NFS_V4_2
+static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
+{
+ u32 mask;
+ int ret;
+
+ if (!nfs_server_capable(inode, NFS_CAP_XATTR))
+ return -EOPNOTSUPP;
+
+ /*
+ * There is no mapping from the MAY_* flags to the NFS_ACCESS_XA*
+ * flags right now. Handling of xattr operations use the normal
+ * file read/write permissions.
+ *
+ * Just in case the server has other ideas (which RFC 8276 allows),
+ * do a cached access check for the XA* flags to possibly avoid
+ * doing an RPC and getting EACCES back.
+ */
+ if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) {
+ if (!(mask & NFS_ACCESS_XAWRITE))
+ return -EACCES;
+ }
+
+ if (buf == NULL) {
+ ret = nfs42_proc_removexattr(inode, key);
+ if (!ret)
+ nfs4_xattr_cache_remove(inode, key);
+ } else {
+ ret = nfs42_proc_setxattr(inode, key, buf, buflen, flags);
+ if (!ret)
+ nfs4_xattr_cache_add(inode, key, buf, NULL, buflen);
+ }
+
+ return ret;
+}
+
+static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, void *buf, size_t buflen)
+{
+ u32 mask;
+ ssize_t ret;
+
+ if (!nfs_server_capable(inode, NFS_CAP_XATTR))
+ return -EOPNOTSUPP;
+
+ if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) {
+ if (!(mask & NFS_ACCESS_XAREAD))
+ return -EACCES;
+ }
+
+ ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
+ if (ret)
+ return ret;
+
+ ret = nfs4_xattr_cache_get(inode, key, buf, buflen);
+ if (ret >= 0 || (ret < 0 && ret != -ENOENT))
+ return ret;
+
+ ret = nfs42_proc_getxattr(inode, key, buf, buflen);
+
+ return ret;
+}
+
+static ssize_t
+nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len)
+{
+ u64 cookie;
+ bool eof;
+ ssize_t ret, size;
+ char *buf;
+ size_t buflen;
+ u32 mask;
+
+ if (!nfs_server_capable(inode, NFS_CAP_XATTR))
+ return 0;
+
+ if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) {
+ if (!(mask & NFS_ACCESS_XALIST))
+ return 0;
+ }
+
+ ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
+ if (ret)
+ return ret;
+
+ ret = nfs4_xattr_cache_list(inode, list, list_len);
+ if (ret >= 0 || (ret < 0 && ret != -ENOENT))
+ return ret;
+
+ cookie = 0;
+ eof = false;
+ buflen = list_len ? list_len : XATTR_LIST_MAX;
+ buf = list_len ? list : NULL;
+ size = 0;
+
+ while (!eof) {
+ ret = nfs42_proc_listxattrs(inode, buf, buflen,
+ &cookie, &eof);
+ if (ret < 0)
+ return ret;
+
+ if (list_len) {
+ buf += ret;
+ buflen -= ret;
+ }
+ size += ret;
+ }
+
+ if (list_len)
+ nfs4_xattr_cache_set_list(inode, list, size);
+
+ return size;
+}
+
+#else
+
+static ssize_t
+nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len)
+{
+ return 0;
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+/*
+ * nfs_fhget will use either the mounted_on_fileid or the fileid
+ */
+static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)
+{
+ if (!(((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) ||
+ (fattr->valid & NFS_ATTR_FATTR_FILEID)) &&
+ (fattr->valid & NFS_ATTR_FATTR_FSID) &&
+ (fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS)))
+ return;
+
+ fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
+ NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_V4_REFERRAL;
+ fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
+ fattr->nlink = 2;
+}
+
+static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
+ const struct qstr *name,
+ struct nfs4_fs_locations *fs_locations,
+ struct page *page)
+{
+ struct nfs_server *server = NFS_SERVER(dir);
+ u32 bitmask[3];
+ struct nfs4_fs_locations_arg args = {
+ .dir_fh = NFS_FH(dir),
+ .name = name,
+ .page = page,
+ .bitmask = bitmask,
+ };
+ struct nfs4_fs_locations_res res = {
+ .fs_locations = fs_locations,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ int status;
+
+ dprintk("%s: start\n", __func__);
+
+ bitmask[0] = nfs4_fattr_bitmap[0] | FATTR4_WORD0_FS_LOCATIONS;
+ bitmask[1] = nfs4_fattr_bitmap[1];
+
+ /* Ask for the fileid of the absent filesystem if mounted_on_fileid
+ * is not supported */
+ if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
+ bitmask[0] &= ~FATTR4_WORD0_FILEID;
+ else
+ bitmask[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+
+ nfs_fattr_init(fs_locations->fattr);
+ fs_locations->server = server;
+ fs_locations->nlocations = 0;
+ status = nfs4_call_sync(client, server, &msg, &args.seq_args, &res.seq_res, 0);
+ dprintk("%s: returned status = %d\n", __func__, status);
+ return status;
+}
+
+int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
+ const struct qstr *name,
+ struct nfs4_fs_locations *fs_locations,
+ struct page *page)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+ do {
+ err = _nfs4_proc_fs_locations(client, dir, name,
+ fs_locations, page);
+ trace_nfs4_get_fs_locations(dir, name, err);
+ err = nfs4_handle_exception(NFS_SERVER(dir), err,
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+/*
+ * This operation also signals the server that this client is
+ * performing migration recovery. The server can stop returning
+ * NFS4ERR_LEASE_MOVED to this client. A RENEW operation is
+ * appended to this compound to identify the client ID which is
+ * performing recovery.
+ */
+static int _nfs40_proc_get_locations(struct nfs_server *server,
+ struct nfs_fh *fhandle,
+ struct nfs4_fs_locations *locations,
+ struct page *page, const struct cred *cred)
+{
+ struct rpc_clnt *clnt = server->client;
+ u32 bitmask[2] = {
+ [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+ };
+ struct nfs4_fs_locations_arg args = {
+ .clientid = server->nfs_client->cl_clientid,
+ .fh = fhandle,
+ .page = page,
+ .bitmask = bitmask,
+ .migration = 1, /* skip LOOKUP */
+ .renew = 1, /* append RENEW */
+ };
+ struct nfs4_fs_locations_res res = {
+ .fs_locations = locations,
+ .migration = 1,
+ .renew = 1,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
+ };
+ unsigned long now = jiffies;
+ int status;
+
+ nfs_fattr_init(locations->fattr);
+ locations->server = server;
+ locations->nlocations = 0;
+
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1);
+ status = nfs4_call_sync_sequence(clnt, server, &msg,
+ &args.seq_args, &res.seq_res);
+ if (status)
+ return status;
+
+ renew_lease(server, now);
+ return 0;
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * This operation also signals the server that this client is
+ * performing migration recovery. The server can stop asserting
+ * SEQ4_STATUS_LEASE_MOVED for this client. The client ID
+ * performing this operation is identified in the SEQUENCE
+ * operation in this compound.
+ *
+ * When the client supports GETATTR(fs_locations_info), it can
+ * be plumbed in here.
+ */
+static int _nfs41_proc_get_locations(struct nfs_server *server,
+ struct nfs_fh *fhandle,
+ struct nfs4_fs_locations *locations,
+ struct page *page, const struct cred *cred)
+{
+ struct rpc_clnt *clnt = server->client;
+ u32 bitmask[2] = {
+ [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+ };
+ struct nfs4_fs_locations_arg args = {
+ .fh = fhandle,
+ .page = page,
+ .bitmask = bitmask,
+ .migration = 1, /* skip LOOKUP */
+ };
+ struct nfs4_fs_locations_res res = {
+ .fs_locations = locations,
+ .migration = 1,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
+ };
+ struct nfs4_call_sync_data data = {
+ .seq_server = server,
+ .seq_args = &args.seq_args,
+ .seq_res = &res.seq_res,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clnt,
+ .rpc_message = &msg,
+ .callback_ops = server->nfs_client->cl_mvops->call_sync_ops,
+ .callback_data = &data,
+ .flags = RPC_TASK_NO_ROUND_ROBIN,
+ };
+ int status;
+
+ nfs_fattr_init(locations->fattr);
+ locations->server = server;
+ locations->nlocations = 0;
+
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1);
+ status = nfs4_call_sync_custom(&task_setup_data);
+ if (status == NFS4_OK &&
+ res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED)
+ status = -NFS4ERR_LEASE_MOVED;
+ return status;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_proc_get_locations - discover locations for a migrated FSID
+ * @server: pointer to nfs_server to process
+ * @fhandle: pointer to the kernel NFS client file handle
+ * @locations: result of query
+ * @page: buffer
+ * @cred: credential to use for this operation
+ *
+ * Returns NFS4_OK on success, a negative NFS4ERR status code if the
+ * operation failed, or a negative errno if a local error occurred.
+ *
+ * On success, "locations" is filled in, but if the server has
+ * no locations information, NFS_ATTR_FATTR_V4_LOCATIONS is not
+ * asserted.
+ *
+ * -NFS4ERR_LEASE_MOVED is returned if the server still has leases
+ * from this client that require migration recovery.
+ */
+int nfs4_proc_get_locations(struct nfs_server *server,
+ struct nfs_fh *fhandle,
+ struct nfs4_fs_locations *locations,
+ struct page *page, const struct cred *cred)
+{
+ struct nfs_client *clp = server->nfs_client;
+ const struct nfs4_mig_recovery_ops *ops =
+ clp->cl_mvops->mig_recovery_ops;
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int status;
+
+ dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__,
+ (unsigned long long)server->fsid.major,
+ (unsigned long long)server->fsid.minor,
+ clp->cl_hostname);
+ nfs_display_fhandle(fhandle, __func__);
+
+ do {
+ status = ops->get_locations(server, fhandle, locations, page,
+ cred);
+ if (status != -NFS4ERR_DELAY)
+ break;
+ nfs4_handle_exception(server, status, &exception);
+ } while (exception.retry);
+ return status;
+}
+
+/*
+ * This operation also signals the server that this client is
+ * performing "lease moved" recovery. The server can stop
+ * returning NFS4ERR_LEASE_MOVED to this client. A RENEW operation
+ * is appended to this compound to identify the client ID which is
+ * performing recovery.
+ */
+static int _nfs40_proc_fsid_present(struct inode *inode, const struct cred *cred)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+ struct rpc_clnt *clnt = server->client;
+ struct nfs4_fsid_present_arg args = {
+ .fh = NFS_FH(inode),
+ .clientid = clp->cl_clientid,
+ .renew = 1, /* append RENEW */
+ };
+ struct nfs4_fsid_present_res res = {
+ .renew = 1,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
+ };
+ unsigned long now = jiffies;
+ int status;
+
+ res.fh = nfs_alloc_fhandle();
+ if (res.fh == NULL)
+ return -ENOMEM;
+
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1);
+ status = nfs4_call_sync_sequence(clnt, server, &msg,
+ &args.seq_args, &res.seq_res);
+ nfs_free_fhandle(res.fh);
+ if (status)
+ return status;
+
+ do_renew_lease(clp, now);
+ return 0;
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * This operation also signals the server that this client is
+ * performing "lease moved" recovery. The server can stop asserting
+ * SEQ4_STATUS_LEASE_MOVED for this client. The client ID performing
+ * this operation is identified in the SEQUENCE operation in this
+ * compound.
+ */
+static int _nfs41_proc_fsid_present(struct inode *inode, const struct cred *cred)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct rpc_clnt *clnt = server->client;
+ struct nfs4_fsid_present_arg args = {
+ .fh = NFS_FH(inode),
+ };
+ struct nfs4_fsid_present_res res = {
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
+ };
+ int status;
+
+ res.fh = nfs_alloc_fhandle();
+ if (res.fh == NULL)
+ return -ENOMEM;
+
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1);
+ status = nfs4_call_sync_sequence(clnt, server, &msg,
+ &args.seq_args, &res.seq_res);
+ nfs_free_fhandle(res.fh);
+ if (status == NFS4_OK &&
+ res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED)
+ status = -NFS4ERR_LEASE_MOVED;
+ return status;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_proc_fsid_present - Is this FSID present or absent on server?
+ * @inode: inode on FSID to check
+ * @cred: credential to use for this operation
+ *
+ * Server indicates whether the FSID is present, moved, or not
+ * recognized. This operation is necessary to clear a LEASE_MOVED
+ * condition for this client ID.
+ *
+ * Returns NFS4_OK if the FSID is present on this server,
+ * -NFS4ERR_MOVED if the FSID is no longer present, a negative
+ * NFS4ERR code if some error occurred on the server, or a
+ * negative errno if a local failure occurred.
+ */
+int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_client *clp = server->nfs_client;
+ const struct nfs4_mig_recovery_ops *ops =
+ clp->cl_mvops->mig_recovery_ops;
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int status;
+
+ dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__,
+ (unsigned long long)server->fsid.major,
+ (unsigned long long)server->fsid.minor,
+ clp->cl_hostname);
+ nfs_display_fhandle(NFS_FH(inode), __func__);
+
+ do {
+ status = ops->fsid_present(inode, cred);
+ if (status != -NFS4ERR_DELAY)
+ break;
+ nfs4_handle_exception(server, status, &exception);
+ } while (exception.retry);
+ return status;
+}
+
+/*
+ * If 'use_integrity' is true and the state managment nfs_client
+ * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient
+ * and the machine credential as per RFC3530bis and RFC5661 Security
+ * Considerations sections. Otherwise, just use the user cred with the
+ * filesystem's rpc_client.
+ */
+static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity)
+{
+ int status;
+ struct rpc_clnt *clnt = NFS_SERVER(dir)->client;
+ struct nfs_client *clp = NFS_SERVER(dir)->nfs_client;
+ struct nfs4_secinfo_arg args = {
+ .dir_fh = NFS_FH(dir),
+ .name = name,
+ };
+ struct nfs4_secinfo_res res = {
+ .flavors = flavors,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ struct nfs4_call_sync_data data = {
+ .seq_server = NFS_SERVER(dir),
+ .seq_args = &args.seq_args,
+ .seq_res = &res.seq_res,
+ };
+ struct rpc_task_setup task_setup = {
+ .rpc_client = clnt,
+ .rpc_message = &msg,
+ .callback_ops = clp->cl_mvops->call_sync_ops,
+ .callback_data = &data,
+ .flags = RPC_TASK_NO_ROUND_ROBIN,
+ };
+ const struct cred *cred = NULL;
+
+ if (use_integrity) {
+ clnt = clp->cl_rpcclient;
+ task_setup.rpc_client = clnt;
+
+ cred = nfs4_get_clid_cred(clp);
+ msg.rpc_cred = cred;
+ }
+
+ dprintk("NFS call secinfo %s\n", name->name);
+
+ nfs4_state_protect(clp, NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg);
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
+ status = nfs4_call_sync_custom(&task_setup);
+
+ dprintk("NFS reply secinfo: %d\n", status);
+
+ put_cred(cred);
+ return status;
+}
+
+int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name,
+ struct nfs4_secinfo_flavors *flavors)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+ do {
+ err = -NFS4ERR_WRONGSEC;
+
+ /* try to use integrity protection with machine cred */
+ if (_nfs4_is_integrity_protected(NFS_SERVER(dir)->nfs_client))
+ err = _nfs4_proc_secinfo(dir, name, flavors, true);
+
+ /*
+ * if unable to use integrity protection, or SECINFO with
+ * integrity protection returns NFS4ERR_WRONGSEC (which is
+ * disallowed by spec, but exists in deployed servers) use
+ * the current filesystem's rpc_client and the user cred.
+ */
+ if (err == -NFS4ERR_WRONGSEC)
+ err = _nfs4_proc_secinfo(dir, name, flavors, false);
+
+ trace_nfs4_secinfo(dir, name, err);
+ err = nfs4_handle_exception(NFS_SERVER(dir), err,
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+
+#ifdef CONFIG_NFS_V4_1
+/*
+ * Check the exchange flags returned by the server for invalid flags, having
+ * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or
+ * DS flags set.
+ */
+static int nfs4_check_cl_exchange_flags(u32 flags, u32 version)
+{
+ if (version >= 2 && (flags & ~EXCHGID4_2_FLAG_MASK_R))
+ goto out_inval;
+ else if (version < 2 && (flags & ~EXCHGID4_FLAG_MASK_R))
+ goto out_inval;
+ if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) &&
+ (flags & EXCHGID4_FLAG_USE_NON_PNFS))
+ goto out_inval;
+ if (!(flags & (EXCHGID4_FLAG_MASK_PNFS)))
+ goto out_inval;
+ return NFS_OK;
+out_inval:
+ return -NFS4ERR_INVAL;
+}
+
+static bool
+nfs41_same_server_scope(struct nfs41_server_scope *a,
+ struct nfs41_server_scope *b)
+{
+ if (a->server_scope_sz != b->server_scope_sz)
+ return false;
+ return memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0;
+}
+
+static void
+nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs41_bind_conn_to_session_args *args = task->tk_msg.rpc_argp;
+ struct nfs41_bind_conn_to_session_res *res = task->tk_msg.rpc_resp;
+ struct nfs_client *clp = args->client;
+
+ switch (task->tk_status) {
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_DEADSESSION:
+ nfs4_schedule_session_recovery(clp->cl_session,
+ task->tk_status);
+ return;
+ }
+ if (args->dir == NFS4_CDFC4_FORE_OR_BOTH &&
+ res->dir != NFS4_CDFS4_BOTH) {
+ rpc_task_close_connection(task);
+ if (args->retries++ < MAX_BIND_CONN_TO_SESSION_RETRIES)
+ rpc_restart_call(task);
+ }
+}
+
+static const struct rpc_call_ops nfs4_bind_one_conn_to_session_ops = {
+ .rpc_call_done = nfs4_bind_one_conn_to_session_done,
+};
+
+/*
+ * nfs4_proc_bind_one_conn_to_session()
+ *
+ * The 4.1 client currently uses the same TCP connection for the
+ * fore and backchannel.
+ */
+static
+int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ struct nfs_client *clp,
+ const struct cred *cred)
+{
+ int status;
+ struct nfs41_bind_conn_to_session_args args = {
+ .client = clp,
+ .dir = NFS4_CDFC4_FORE_OR_BOTH,
+ .retries = 0,
+ };
+ struct nfs41_bind_conn_to_session_res res;
+ struct rpc_message msg = {
+ .rpc_proc =
+ &nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clnt,
+ .rpc_xprt = xprt,
+ .callback_ops = &nfs4_bind_one_conn_to_session_ops,
+ .rpc_message = &msg,
+ .flags = RPC_TASK_TIMEOUT,
+ };
+ struct rpc_task *task;
+
+ nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id);
+ if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
+ args.dir = NFS4_CDFC4_FORE;
+
+ /* Do not set the backchannel flag unless this is clnt->cl_xprt */
+ if (xprt != rcu_access_pointer(clnt->cl_xprt))
+ args.dir = NFS4_CDFC4_FORE;
+
+ task = rpc_run_task(&task_setup_data);
+ if (!IS_ERR(task)) {
+ status = task->tk_status;
+ rpc_put_task(task);
+ } else
+ status = PTR_ERR(task);
+ trace_nfs4_bind_conn_to_session(clp, status);
+ if (status == 0) {
+ if (memcmp(res.sessionid.data,
+ clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) {
+ dprintk("NFS: %s: Session ID mismatch\n", __func__);
+ return -EIO;
+ }
+ if ((res.dir & args.dir) != res.dir || res.dir == 0) {
+ dprintk("NFS: %s: Unexpected direction from server\n",
+ __func__);
+ return -EIO;
+ }
+ if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) {
+ dprintk("NFS: %s: Server returned RDMA mode = true\n",
+ __func__);
+ return -EIO;
+ }
+ }
+
+ return status;
+}
+
+struct rpc_bind_conn_calldata {
+ struct nfs_client *clp;
+ const struct cred *cred;
+};
+
+static int
+nfs4_proc_bind_conn_to_session_callback(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ void *calldata)
+{
+ struct rpc_bind_conn_calldata *p = calldata;
+
+ return nfs4_proc_bind_one_conn_to_session(clnt, xprt, p->clp, p->cred);
+}
+
+int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, const struct cred *cred)
+{
+ struct rpc_bind_conn_calldata data = {
+ .clp = clp,
+ .cred = cred,
+ };
+ return rpc_clnt_iterate_for_each_xprt(clp->cl_rpcclient,
+ nfs4_proc_bind_conn_to_session_callback, &data);
+}
+
+/*
+ * Minimum set of SP4_MACH_CRED operations from RFC 5661 in the enforce map
+ * and operations we'd like to see to enable certain features in the allow map
+ */
+static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = {
+ .how = SP4_MACH_CRED,
+ .enforce.u.words = {
+ [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
+ 1 << (OP_EXCHANGE_ID - 32) |
+ 1 << (OP_CREATE_SESSION - 32) |
+ 1 << (OP_DESTROY_SESSION - 32) |
+ 1 << (OP_DESTROY_CLIENTID - 32)
+ },
+ .allow.u.words = {
+ [0] = 1 << (OP_CLOSE) |
+ 1 << (OP_OPEN_DOWNGRADE) |
+ 1 << (OP_LOCKU) |
+ 1 << (OP_DELEGRETURN) |
+ 1 << (OP_COMMIT),
+ [1] = 1 << (OP_SECINFO - 32) |
+ 1 << (OP_SECINFO_NO_NAME - 32) |
+ 1 << (OP_LAYOUTRETURN - 32) |
+ 1 << (OP_TEST_STATEID - 32) |
+ 1 << (OP_FREE_STATEID - 32) |
+ 1 << (OP_WRITE - 32)
+ }
+};
+
+/*
+ * Select the state protection mode for client `clp' given the server results
+ * from exchange_id in `sp'.
+ *
+ * Returns 0 on success, negative errno otherwise.
+ */
+static int nfs4_sp4_select_mode(struct nfs_client *clp,
+ struct nfs41_state_protection *sp)
+{
+ static const u32 supported_enforce[NFS4_OP_MAP_NUM_WORDS] = {
+ [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
+ 1 << (OP_EXCHANGE_ID - 32) |
+ 1 << (OP_CREATE_SESSION - 32) |
+ 1 << (OP_DESTROY_SESSION - 32) |
+ 1 << (OP_DESTROY_CLIENTID - 32)
+ };
+ unsigned long flags = 0;
+ unsigned int i;
+ int ret = 0;
+
+ if (sp->how == SP4_MACH_CRED) {
+ /* Print state protect result */
+ dfprintk(MOUNT, "Server SP4_MACH_CRED support:\n");
+ for (i = 0; i <= LAST_NFS4_OP; i++) {
+ if (test_bit(i, sp->enforce.u.longs))
+ dfprintk(MOUNT, " enforce op %d\n", i);
+ if (test_bit(i, sp->allow.u.longs))
+ dfprintk(MOUNT, " allow op %d\n", i);
+ }
+
+ /* make sure nothing is on enforce list that isn't supported */
+ for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) {
+ if (sp->enforce.u.words[i] & ~supported_enforce[i]) {
+ dfprintk(MOUNT, "sp4_mach_cred: disabled\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /*
+ * Minimal mode - state operations are allowed to use machine
+ * credential. Note this already happens by default, so the
+ * client doesn't have to do anything more than the negotiation.
+ *
+ * NOTE: we don't care if EXCHANGE_ID is in the list -
+ * we're already using the machine cred for exchange_id
+ * and will never use a different cred.
+ */
+ if (test_bit(OP_BIND_CONN_TO_SESSION, sp->enforce.u.longs) &&
+ test_bit(OP_CREATE_SESSION, sp->enforce.u.longs) &&
+ test_bit(OP_DESTROY_SESSION, sp->enforce.u.longs) &&
+ test_bit(OP_DESTROY_CLIENTID, sp->enforce.u.longs)) {
+ dfprintk(MOUNT, "sp4_mach_cred:\n");
+ dfprintk(MOUNT, " minimal mode enabled\n");
+ __set_bit(NFS_SP4_MACH_CRED_MINIMAL, &flags);
+ } else {
+ dfprintk(MOUNT, "sp4_mach_cred: disabled\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (test_bit(OP_CLOSE, sp->allow.u.longs) &&
+ test_bit(OP_OPEN_DOWNGRADE, sp->allow.u.longs) &&
+ test_bit(OP_DELEGRETURN, sp->allow.u.longs) &&
+ test_bit(OP_LOCKU, sp->allow.u.longs)) {
+ dfprintk(MOUNT, " cleanup mode enabled\n");
+ __set_bit(NFS_SP4_MACH_CRED_CLEANUP, &flags);
+ }
+
+ if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) {
+ dfprintk(MOUNT, " pnfs cleanup mode enabled\n");
+ __set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP, &flags);
+ }
+
+ if (test_bit(OP_SECINFO, sp->allow.u.longs) &&
+ test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) {
+ dfprintk(MOUNT, " secinfo mode enabled\n");
+ __set_bit(NFS_SP4_MACH_CRED_SECINFO, &flags);
+ }
+
+ if (test_bit(OP_TEST_STATEID, sp->allow.u.longs) &&
+ test_bit(OP_FREE_STATEID, sp->allow.u.longs)) {
+ dfprintk(MOUNT, " stateid mode enabled\n");
+ __set_bit(NFS_SP4_MACH_CRED_STATEID, &flags);
+ }
+
+ if (test_bit(OP_WRITE, sp->allow.u.longs)) {
+ dfprintk(MOUNT, " write mode enabled\n");
+ __set_bit(NFS_SP4_MACH_CRED_WRITE, &flags);
+ }
+
+ if (test_bit(OP_COMMIT, sp->allow.u.longs)) {
+ dfprintk(MOUNT, " commit mode enabled\n");
+ __set_bit(NFS_SP4_MACH_CRED_COMMIT, &flags);
+ }
+ }
+out:
+ clp->cl_sp4_flags = flags;
+ return ret;
+}
+
+struct nfs41_exchange_id_data {
+ struct nfs41_exchange_id_res res;
+ struct nfs41_exchange_id_args args;
+};
+
+static void nfs4_exchange_id_release(void *data)
+{
+ struct nfs41_exchange_id_data *cdata =
+ (struct nfs41_exchange_id_data *)data;
+
+ nfs_put_client(cdata->args.client);
+ kfree(cdata->res.impl_id);
+ kfree(cdata->res.server_scope);
+ kfree(cdata->res.server_owner);
+ kfree(cdata);
+}
+
+static const struct rpc_call_ops nfs4_exchange_id_call_ops = {
+ .rpc_release = nfs4_exchange_id_release,
+};
+
+/*
+ * _nfs4_proc_exchange_id()
+ *
+ * Wrapper for EXCHANGE_ID operation.
+ */
+static struct rpc_task *
+nfs4_run_exchange_id(struct nfs_client *clp, const struct cred *cred,
+ u32 sp4_how, struct rpc_xprt *xprt)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID],
+ .rpc_cred = cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clp->cl_rpcclient,
+ .callback_ops = &nfs4_exchange_id_call_ops,
+ .rpc_message = &msg,
+ .flags = RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN,
+ };
+ struct nfs41_exchange_id_data *calldata;
+ int status;
+
+ if (!refcount_inc_not_zero(&clp->cl_count))
+ return ERR_PTR(-EIO);
+
+ status = -ENOMEM;
+ calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
+ if (!calldata)
+ goto out;
+
+ nfs4_init_boot_verifier(clp, &calldata->args.verifier);
+
+ status = nfs4_init_uniform_client_string(clp);
+ if (status)
+ goto out_calldata;
+
+ calldata->res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
+ GFP_NOFS);
+ status = -ENOMEM;
+ if (unlikely(calldata->res.server_owner == NULL))
+ goto out_calldata;
+
+ calldata->res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
+ GFP_NOFS);
+ if (unlikely(calldata->res.server_scope == NULL))
+ goto out_server_owner;
+
+ calldata->res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS);
+ if (unlikely(calldata->res.impl_id == NULL))
+ goto out_server_scope;
+
+ switch (sp4_how) {
+ case SP4_NONE:
+ calldata->args.state_protect.how = SP4_NONE;
+ break;
+
+ case SP4_MACH_CRED:
+ calldata->args.state_protect = nfs4_sp4_mach_cred_request;
+ break;
+
+ default:
+ /* unsupported! */
+ WARN_ON_ONCE(1);
+ status = -EINVAL;
+ goto out_impl_id;
+ }
+ if (xprt) {
+ task_setup_data.rpc_xprt = xprt;
+ task_setup_data.flags |= RPC_TASK_SOFTCONN;
+ memcpy(calldata->args.verifier.data, clp->cl_confirm.data,
+ sizeof(calldata->args.verifier.data));
+ }
+ calldata->args.client = clp;
+ calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+ EXCHGID4_FLAG_BIND_PRINC_STATEID;
+#ifdef CONFIG_NFS_V4_1_MIGRATION
+ calldata->args.flags |= EXCHGID4_FLAG_SUPP_MOVED_MIGR;
+#endif
+ if (test_bit(NFS_CS_DS, &clp->cl_flags))
+ calldata->args.flags |= EXCHGID4_FLAG_USE_PNFS_DS;
+ msg.rpc_argp = &calldata->args;
+ msg.rpc_resp = &calldata->res;
+ task_setup_data.callback_data = calldata;
+
+ return rpc_run_task(&task_setup_data);
+
+out_impl_id:
+ kfree(calldata->res.impl_id);
+out_server_scope:
+ kfree(calldata->res.server_scope);
+out_server_owner:
+ kfree(calldata->res.server_owner);
+out_calldata:
+ kfree(calldata);
+out:
+ nfs_put_client(clp);
+ return ERR_PTR(status);
+}
+
+/*
+ * _nfs4_proc_exchange_id()
+ *
+ * Wrapper for EXCHANGE_ID operation.
+ */
+static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred,
+ u32 sp4_how)
+{
+ struct rpc_task *task;
+ struct nfs41_exchange_id_args *argp;
+ struct nfs41_exchange_id_res *resp;
+ unsigned long now = jiffies;
+ int status;
+
+ task = nfs4_run_exchange_id(clp, cred, sp4_how, NULL);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ argp = task->tk_msg.rpc_argp;
+ resp = task->tk_msg.rpc_resp;
+ status = task->tk_status;
+ if (status != 0)
+ goto out;
+
+ status = nfs4_check_cl_exchange_flags(resp->flags,
+ clp->cl_mvops->minor_version);
+ if (status != 0)
+ goto out;
+
+ status = nfs4_sp4_select_mode(clp, &resp->state_protect);
+ if (status != 0)
+ goto out;
+
+ do_renew_lease(clp, now);
+
+ clp->cl_clientid = resp->clientid;
+ clp->cl_exchange_flags = resp->flags;
+ clp->cl_seqid = resp->seqid;
+ /* Client ID is not confirmed */
+ if (!(resp->flags & EXCHGID4_FLAG_CONFIRMED_R))
+ clear_bit(NFS4_SESSION_ESTABLISHED,
+ &clp->cl_session->session_state);
+
+ if (clp->cl_serverscope != NULL &&
+ !nfs41_same_server_scope(clp->cl_serverscope,
+ resp->server_scope)) {
+ dprintk("%s: server_scope mismatch detected\n",
+ __func__);
+ set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
+ }
+
+ swap(clp->cl_serverowner, resp->server_owner);
+ swap(clp->cl_serverscope, resp->server_scope);
+ swap(clp->cl_implid, resp->impl_id);
+
+ /* Save the EXCHANGE_ID verifier session trunk tests */
+ memcpy(clp->cl_confirm.data, argp->verifier.data,
+ sizeof(clp->cl_confirm.data));
+out:
+ trace_nfs4_exchange_id(clp, status);
+ rpc_put_task(task);
+ return status;
+}
+
+/*
+ * nfs4_proc_exchange_id()
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status code.
+ *
+ * Since the clientid has expired, all compounds using sessions
+ * associated with the stale clientid will be returning
+ * NFS4ERR_BADSESSION in the sequence operation, and will therefore
+ * be in some phase of session reset.
+ *
+ * Will attempt to negotiate SP4_MACH_CRED if krb5i / krb5p auth is used.
+ */
+int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred)
+{
+ rpc_authflavor_t authflavor = clp->cl_rpcclient->cl_auth->au_flavor;
+ int status;
+
+ /* try SP4_MACH_CRED if krb5i/p */
+ if (authflavor == RPC_AUTH_GSS_KRB5I ||
+ authflavor == RPC_AUTH_GSS_KRB5P) {
+ status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED);
+ if (!status)
+ return 0;
+ }
+
+ /* try SP4_NONE */
+ return _nfs4_proc_exchange_id(clp, cred, SP4_NONE);
+}
+
+/**
+ * nfs4_test_session_trunk
+ *
+ * This is an add_xprt_test() test function called from
+ * rpc_clnt_setup_test_and_add_xprt.
+ *
+ * The rpc_xprt_switch is referrenced by rpc_clnt_setup_test_and_add_xprt
+ * and is dereferrenced in nfs4_exchange_id_release
+ *
+ * Upon success, add the new transport to the rpc_clnt
+ *
+ * @clnt: struct rpc_clnt to get new transport
+ * @xprt: the rpc_xprt to test
+ * @data: call data for _nfs4_proc_exchange_id.
+ */
+void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
+ void *data)
+{
+ struct nfs4_add_xprt_data *adata = data;
+ struct rpc_task *task;
+ int status;
+
+ u32 sp4_how;
+
+ dprintk("--> %s try %s\n", __func__,
+ xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+ sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED);
+
+try_again:
+ /* Test connection for session trunking. Async exchange_id call */
+ task = nfs4_run_exchange_id(adata->clp, adata->cred, sp4_how, xprt);
+ if (IS_ERR(task))
+ return;
+
+ status = task->tk_status;
+ if (status == 0)
+ status = nfs4_detect_session_trunking(adata->clp,
+ task->tk_msg.rpc_resp, xprt);
+
+ if (status == 0)
+ rpc_clnt_xprt_switch_add_xprt(clnt, xprt);
+ else if (status != -NFS4ERR_DELAY && rpc_clnt_xprt_switch_has_addr(clnt,
+ (struct sockaddr *)&xprt->addr))
+ rpc_clnt_xprt_switch_remove_xprt(clnt, xprt);
+
+ rpc_put_task(task);
+ if (status == -NFS4ERR_DELAY) {
+ ssleep(1);
+ goto try_again;
+ }
+}
+EXPORT_SYMBOL_GPL(nfs4_test_session_trunk);
+
+static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
+ const struct cred *cred)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_CLIENTID],
+ .rpc_argp = clp,
+ .rpc_cred = cred,
+ };
+ int status;
+
+ status = rpc_call_sync(clp->cl_rpcclient, &msg,
+ RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
+ trace_nfs4_destroy_clientid(clp, status);
+ if (status)
+ dprintk("NFS: Got error %d from the server %s on "
+ "DESTROY_CLIENTID.", status, clp->cl_hostname);
+ return status;
+}
+
+static int nfs4_proc_destroy_clientid(struct nfs_client *clp,
+ const struct cred *cred)
+{
+ unsigned int loop;
+ int ret;
+
+ for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
+ ret = _nfs4_proc_destroy_clientid(clp, cred);
+ switch (ret) {
+ case -NFS4ERR_DELAY:
+ case -NFS4ERR_CLIENTID_BUSY:
+ ssleep(1);
+ break;
+ default:
+ return ret;
+ }
+ }
+ return 0;
+}
+
+int nfs4_destroy_clientid(struct nfs_client *clp)
+{
+ const struct cred *cred;
+ int ret = 0;
+
+ if (clp->cl_mvops->minor_version < 1)
+ goto out;
+ if (clp->cl_exchange_flags == 0)
+ goto out;
+ if (clp->cl_preserve_clid)
+ goto out;
+ cred = nfs4_get_clid_cred(clp);
+ ret = nfs4_proc_destroy_clientid(clp, cred);
+ put_cred(cred);
+ switch (ret) {
+ case 0:
+ case -NFS4ERR_STALE_CLIENTID:
+ clp->cl_exchange_flags = 0;
+ }
+out:
+ return ret;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+struct nfs4_get_lease_time_data {
+ struct nfs4_get_lease_time_args *args;
+ struct nfs4_get_lease_time_res *res;
+ struct nfs_client *clp;
+};
+
+static void nfs4_get_lease_time_prepare(struct rpc_task *task,
+ void *calldata)
+{
+ struct nfs4_get_lease_time_data *data =
+ (struct nfs4_get_lease_time_data *)calldata;
+
+ /* just setup sequence, do not trigger session recovery
+ since we're invoked within one */
+ nfs4_setup_sequence(data->clp,
+ &data->args->la_seq_args,
+ &data->res->lr_seq_res,
+ task);
+}
+
+/*
+ * Called from nfs4_state_manager thread for session setup, so don't recover
+ * from sequence operation or clientid errors.
+ */
+static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_get_lease_time_data *data =
+ (struct nfs4_get_lease_time_data *)calldata;
+
+ if (!nfs4_sequence_done(task, &data->res->lr_seq_res))
+ return;
+ switch (task->tk_status) {
+ case -NFS4ERR_DELAY:
+ case -NFS4ERR_GRACE:
+ rpc_delay(task, NFS4_POLL_RETRY_MIN);
+ task->tk_status = 0;
+ fallthrough;
+ case -NFS4ERR_RETRY_UNCACHED_REP:
+ rpc_restart_call_prepare(task);
+ return;
+ }
+}
+
+static const struct rpc_call_ops nfs4_get_lease_time_ops = {
+ .rpc_call_prepare = nfs4_get_lease_time_prepare,
+ .rpc_call_done = nfs4_get_lease_time_done,
+};
+
+int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
+{
+ struct nfs4_get_lease_time_args args;
+ struct nfs4_get_lease_time_res res = {
+ .lr_fsinfo = fsinfo,
+ };
+ struct nfs4_get_lease_time_data data = {
+ .args = &args,
+ .res = &res,
+ .clp = clp,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GET_LEASE_TIME],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ struct rpc_task_setup task_setup = {
+ .rpc_client = clp->cl_rpcclient,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_get_lease_time_ops,
+ .callback_data = &data,
+ .flags = RPC_TASK_TIMEOUT,
+ };
+
+ nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0, 1);
+ return nfs4_call_sync_custom(&task_setup);
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * Initialize the values to be used by the client in CREATE_SESSION
+ * If nfs4_init_session set the fore channel request and response sizes,
+ * use them.
+ *
+ * Set the back channel max_resp_sz_cached to zero to force the client to
+ * always set csa_cachethis to FALSE because the current implementation
+ * of the back channel DRC only supports caching the CB_SEQUENCE operation.
+ */
+static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args,
+ struct rpc_clnt *clnt)
+{
+ unsigned int max_rqst_sz, max_resp_sz;
+ unsigned int max_bc_payload = rpc_max_bc_payload(clnt);
+ unsigned int max_bc_slots = rpc_num_bc_slots(clnt);
+
+ max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
+ max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
+
+ /* Fore channel attributes */
+ args->fc_attrs.max_rqst_sz = max_rqst_sz;
+ args->fc_attrs.max_resp_sz = max_resp_sz;
+ args->fc_attrs.max_ops = NFS4_MAX_OPS;
+ args->fc_attrs.max_reqs = max_session_slots;
+
+ dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "
+ "max_ops=%u max_reqs=%u\n",
+ __func__,
+ args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz,
+ args->fc_attrs.max_ops, args->fc_attrs.max_reqs);
+
+ /* Back channel attributes */
+ args->bc_attrs.max_rqst_sz = max_bc_payload;
+ args->bc_attrs.max_resp_sz = max_bc_payload;
+ args->bc_attrs.max_resp_sz_cached = 0;
+ args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
+ args->bc_attrs.max_reqs = max_t(unsigned short, max_session_cb_slots, 1);
+ if (args->bc_attrs.max_reqs > max_bc_slots)
+ args->bc_attrs.max_reqs = max_bc_slots;
+
+ dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
+ "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
+ __func__,
+ args->bc_attrs.max_rqst_sz, args->bc_attrs.max_resp_sz,
+ args->bc_attrs.max_resp_sz_cached, args->bc_attrs.max_ops,
+ args->bc_attrs.max_reqs);
+}
+
+static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args,
+ struct nfs41_create_session_res *res)
+{
+ struct nfs4_channel_attrs *sent = &args->fc_attrs;
+ struct nfs4_channel_attrs *rcvd = &res->fc_attrs;
+
+ if (rcvd->max_resp_sz > sent->max_resp_sz)
+ return -EINVAL;
+ /*
+ * Our requested max_ops is the minimum we need; we're not
+ * prepared to break up compounds into smaller pieces than that.
+ * So, no point even trying to continue if the server won't
+ * cooperate:
+ */
+ if (rcvd->max_ops < sent->max_ops)
+ return -EINVAL;
+ if (rcvd->max_reqs == 0)
+ return -EINVAL;
+ if (rcvd->max_reqs > NFS4_MAX_SLOT_TABLE)
+ rcvd->max_reqs = NFS4_MAX_SLOT_TABLE;
+ return 0;
+}
+
+static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args,
+ struct nfs41_create_session_res *res)
+{
+ struct nfs4_channel_attrs *sent = &args->bc_attrs;
+ struct nfs4_channel_attrs *rcvd = &res->bc_attrs;
+
+ if (!(res->flags & SESSION4_BACK_CHAN))
+ goto out;
+ if (rcvd->max_rqst_sz > sent->max_rqst_sz)
+ return -EINVAL;
+ if (rcvd->max_resp_sz < sent->max_resp_sz)
+ return -EINVAL;
+ if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
+ return -EINVAL;
+ if (rcvd->max_ops > sent->max_ops)
+ return -EINVAL;
+ if (rcvd->max_reqs > sent->max_reqs)
+ return -EINVAL;
+out:
+ return 0;
+}
+
+static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
+ struct nfs41_create_session_res *res)
+{
+ int ret;
+
+ ret = nfs4_verify_fore_channel_attrs(args, res);
+ if (ret)
+ return ret;
+ return nfs4_verify_back_channel_attrs(args, res);
+}
+
+static void nfs4_update_session(struct nfs4_session *session,
+ struct nfs41_create_session_res *res)
+{
+ nfs4_copy_sessionid(&session->sess_id, &res->sessionid);
+ /* Mark client id and session as being confirmed */
+ session->clp->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
+ set_bit(NFS4_SESSION_ESTABLISHED, &session->session_state);
+ session->flags = res->flags;
+ memcpy(&session->fc_attrs, &res->fc_attrs, sizeof(session->fc_attrs));
+ if (res->flags & SESSION4_BACK_CHAN)
+ memcpy(&session->bc_attrs, &res->bc_attrs,
+ sizeof(session->bc_attrs));
+}
+
+static int _nfs4_proc_create_session(struct nfs_client *clp,
+ const struct cred *cred)
+{
+ struct nfs4_session *session = clp->cl_session;
+ struct nfs41_create_session_args args = {
+ .client = clp,
+ .clientid = clp->cl_clientid,
+ .seqid = clp->cl_seqid,
+ .cb_program = NFS4_CALLBACK,
+ };
+ struct nfs41_create_session_res res;
+
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
+ };
+ int status;
+
+ nfs4_init_channel_attrs(&args, clp->cl_rpcclient);
+ args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
+
+ status = rpc_call_sync(session->clp->cl_rpcclient, &msg,
+ RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
+ trace_nfs4_create_session(clp, status);
+
+ switch (status) {
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_DELAY:
+ case -ETIMEDOUT:
+ case -EACCES:
+ case -EAGAIN:
+ goto out;
+ }
+
+ clp->cl_seqid++;
+ if (!status) {
+ /* Verify the session's negotiated channel_attrs values */
+ status = nfs4_verify_channel_attrs(&args, &res);
+ /* Increment the clientid slot sequence id */
+ if (status)
+ goto out;
+ nfs4_update_session(session, &res);
+ }
+out:
+ return status;
+}
+
+/*
+ * Issues a CREATE_SESSION operation to the server.
+ * It is the responsibility of the caller to verify the session is
+ * expired before calling this routine.
+ */
+int nfs4_proc_create_session(struct nfs_client *clp, const struct cred *cred)
+{
+ int status;
+ unsigned *ptr;
+ struct nfs4_session *session = clp->cl_session;
+ struct nfs4_add_xprt_data xprtdata = {
+ .clp = clp,
+ };
+ struct rpc_add_xprt_test rpcdata = {
+ .add_xprt_test = clp->cl_mvops->session_trunk,
+ .data = &xprtdata,
+ };
+
+ dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
+
+ status = _nfs4_proc_create_session(clp, cred);
+ if (status)
+ goto out;
+
+ /* Init or reset the session slot tables */
+ status = nfs4_setup_session_slot_tables(session);
+ dprintk("slot table setup returned %d\n", status);
+ if (status)
+ goto out;
+
+ ptr = (unsigned *)&session->sess_id.data[0];
+ dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__,
+ clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]);
+ rpc_clnt_probe_trunked_xprts(clp->cl_rpcclient, &rpcdata);
+out:
+ return status;
+}
+
+/*
+ * Issue the over-the-wire RPC DESTROY_SESSION.
+ * The caller must serialize access to this routine.
+ */
+int nfs4_proc_destroy_session(struct nfs4_session *session,
+ const struct cred *cred)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION],
+ .rpc_argp = session,
+ .rpc_cred = cred,
+ };
+ int status = 0;
+
+ /* session is still being setup */
+ if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state))
+ return 0;
+
+ status = rpc_call_sync(session->clp->cl_rpcclient, &msg,
+ RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
+ trace_nfs4_destroy_session(session->clp, status);
+
+ if (status)
+ dprintk("NFS: Got error %d from the server on DESTROY_SESSION. "
+ "Session has been destroyed regardless...\n", status);
+ rpc_clnt_manage_trunked_xprts(session->clp->cl_rpcclient);
+ return status;
+}
+
+/*
+ * Renew the cl_session lease.
+ */
+struct nfs4_sequence_data {
+ struct nfs_client *clp;
+ struct nfs4_sequence_args args;
+ struct nfs4_sequence_res res;
+};
+
+static void nfs41_sequence_release(void *data)
+{
+ struct nfs4_sequence_data *calldata = data;
+ struct nfs_client *clp = calldata->clp;
+
+ if (refcount_read(&clp->cl_count) > 1)
+ nfs4_schedule_state_renewal(clp);
+ nfs_put_client(clp);
+ kfree(calldata);
+}
+
+static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp)
+{
+ switch(task->tk_status) {
+ case -NFS4ERR_DELAY:
+ rpc_delay(task, NFS4_POLL_RETRY_MAX);
+ return -EAGAIN;
+ default:
+ nfs4_schedule_lease_recovery(clp);
+ }
+ return 0;
+}
+
+static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
+{
+ struct nfs4_sequence_data *calldata = data;
+ struct nfs_client *clp = calldata->clp;
+
+ if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp))
+ return;
+
+ trace_nfs4_sequence(clp, task->tk_status);
+ if (task->tk_status < 0 && !task->tk_client->cl_shutdown) {
+ dprintk("%s ERROR %d\n", __func__, task->tk_status);
+ if (refcount_read(&clp->cl_count) == 1)
+ return;
+
+ if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) {
+ rpc_restart_call_prepare(task);
+ return;
+ }
+ }
+ dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
+}
+
+static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
+{
+ struct nfs4_sequence_data *calldata = data;
+ struct nfs_client *clp = calldata->clp;
+ struct nfs4_sequence_args *args;
+ struct nfs4_sequence_res *res;
+
+ args = task->tk_msg.rpc_argp;
+ res = task->tk_msg.rpc_resp;
+
+ nfs4_setup_sequence(clp, args, res, task);
+}
+
+static const struct rpc_call_ops nfs41_sequence_ops = {
+ .rpc_call_done = nfs41_sequence_call_done,
+ .rpc_call_prepare = nfs41_sequence_prepare,
+ .rpc_release = nfs41_sequence_release,
+};
+
+static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
+ const struct cred *cred,
+ struct nfs4_slot *slot,
+ bool is_privileged)
+{
+ struct nfs4_sequence_data *calldata;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
+ .rpc_cred = cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clp->cl_rpcclient,
+ .rpc_message = &msg,
+ .callback_ops = &nfs41_sequence_ops,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE,
+ };
+ struct rpc_task *ret;
+
+ ret = ERR_PTR(-EIO);
+ if (!refcount_inc_not_zero(&clp->cl_count))
+ goto out_err;
+
+ ret = ERR_PTR(-ENOMEM);
+ calldata = kzalloc(sizeof(*calldata), GFP_KERNEL);
+ if (calldata == NULL)
+ goto out_put_clp;
+ nfs4_init_sequence(&calldata->args, &calldata->res, 0, is_privileged);
+ nfs4_sequence_attach_slot(&calldata->args, &calldata->res, slot);
+ msg.rpc_argp = &calldata->args;
+ msg.rpc_resp = &calldata->res;
+ calldata->clp = clp;
+ task_setup_data.callback_data = calldata;
+
+ ret = rpc_run_task(&task_setup_data);
+ if (IS_ERR(ret))
+ goto out_err;
+ return ret;
+out_put_clp:
+ nfs_put_client(clp);
+out_err:
+ nfs41_release_slot(slot);
+ return ret;
+}
+
+static int nfs41_proc_async_sequence(struct nfs_client *clp, const struct cred *cred, unsigned renew_flags)
+{
+ struct rpc_task *task;
+ int ret = 0;
+
+ if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
+ return -EAGAIN;
+ task = _nfs41_proc_sequence(clp, cred, NULL, false);
+ if (IS_ERR(task))
+ ret = PTR_ERR(task);
+ else
+ rpc_put_task_async(task);
+ dprintk("<-- %s status=%d\n", __func__, ret);
+ return ret;
+}
+
+static int nfs4_proc_sequence(struct nfs_client *clp, const struct cred *cred)
+{
+ struct rpc_task *task;
+ int ret;
+
+ task = _nfs41_proc_sequence(clp, cred, NULL, true);
+ if (IS_ERR(task)) {
+ ret = PTR_ERR(task);
+ goto out;
+ }
+ ret = rpc_wait_for_completion_task(task);
+ if (!ret)
+ ret = task->tk_status;
+ rpc_put_task(task);
+out:
+ dprintk("<-- %s status=%d\n", __func__, ret);
+ return ret;
+}
+
+struct nfs4_reclaim_complete_data {
+ struct nfs_client *clp;
+ struct nfs41_reclaim_complete_args arg;
+ struct nfs41_reclaim_complete_res res;
+};
+
+static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
+{
+ struct nfs4_reclaim_complete_data *calldata = data;
+
+ nfs4_setup_sequence(calldata->clp,
+ &calldata->arg.seq_args,
+ &calldata->res.seq_res,
+ task);
+}
+
+static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp)
+{
+ switch(task->tk_status) {
+ case 0:
+ wake_up_all(&clp->cl_lock_waitq);
+ fallthrough;
+ case -NFS4ERR_COMPLETE_ALREADY:
+ case -NFS4ERR_WRONG_CRED: /* What to do here? */
+ break;
+ case -NFS4ERR_DELAY:
+ rpc_delay(task, NFS4_POLL_RETRY_MAX);
+ fallthrough;
+ case -NFS4ERR_RETRY_UNCACHED_REP:
+ case -EACCES:
+ dprintk("%s: failed to reclaim complete error %d for server %s, retrying\n",
+ __func__, task->tk_status, clp->cl_hostname);
+ return -EAGAIN;
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ break;
+ default:
+ nfs4_schedule_lease_recovery(clp);
+ }
+ return 0;
+}
+
+static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
+{
+ struct nfs4_reclaim_complete_data *calldata = data;
+ struct nfs_client *clp = calldata->clp;
+ struct nfs4_sequence_res *res = &calldata->res.seq_res;
+
+ if (!nfs41_sequence_done(task, res))
+ return;
+
+ trace_nfs4_reclaim_complete(clp, task->tk_status);
+ if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) {
+ rpc_restart_call_prepare(task);
+ return;
+ }
+}
+
+static void nfs4_free_reclaim_complete_data(void *data)
+{
+ struct nfs4_reclaim_complete_data *calldata = data;
+
+ kfree(calldata);
+}
+
+static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = {
+ .rpc_call_prepare = nfs4_reclaim_complete_prepare,
+ .rpc_call_done = nfs4_reclaim_complete_done,
+ .rpc_release = nfs4_free_reclaim_complete_data,
+};
+
+/*
+ * Issue a global reclaim complete.
+ */
+static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
+ const struct cred *cred)
+{
+ struct nfs4_reclaim_complete_data *calldata;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
+ .rpc_cred = cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clp->cl_rpcclient,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_reclaim_complete_call_ops,
+ .flags = RPC_TASK_NO_ROUND_ROBIN,
+ };
+ int status = -ENOMEM;
+
+ calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
+ if (calldata == NULL)
+ goto out;
+ calldata->clp = clp;
+ calldata->arg.one_fs = 0;
+
+ nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0, 1);
+ msg.rpc_argp = &calldata->arg;
+ msg.rpc_resp = &calldata->res;
+ task_setup_data.callback_data = calldata;
+ status = nfs4_call_sync_custom(&task_setup_data);
+out:
+ dprintk("<-- %s status=%d\n", __func__, status);
+ return status;
+}
+
+static void
+nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_layoutget *lgp = calldata;
+ struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+
+ nfs4_setup_sequence(server->nfs_client, &lgp->args.seq_args,
+ &lgp->res.seq_res, task);
+}
+
+static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_layoutget *lgp = calldata;
+
+ nfs41_sequence_process(task, &lgp->res.seq_res);
+}
+
+static int
+nfs4_layoutget_handle_exception(struct rpc_task *task,
+ struct nfs4_layoutget *lgp, struct nfs4_exception *exception)
+{
+ struct inode *inode = lgp->args.inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct pnfs_layout_hdr *lo = lgp->lo;
+ int nfs4err = task->tk_status;
+ int err, status = 0;
+ LIST_HEAD(head);
+
+ dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
+
+ nfs4_sequence_free_slot(&lgp->res.seq_res);
+
+ switch (nfs4err) {
+ case 0:
+ goto out;
+
+ /*
+ * NFS4ERR_LAYOUTUNAVAILABLE means we are not supposed to use pnfs
+ * on the file. set tk_status to -ENODATA to tell upper layer to
+ * retry go inband.
+ */
+ case -NFS4ERR_LAYOUTUNAVAILABLE:
+ status = -ENODATA;
+ goto out;
+ /*
+ * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
+ * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
+ */
+ case -NFS4ERR_BADLAYOUT:
+ status = -EOVERFLOW;
+ goto out;
+ /*
+ * NFS4ERR_LAYOUTTRYLATER is a conflict with another client
+ * (or clients) writing to the same RAID stripe except when
+ * the minlength argument is 0 (see RFC5661 section 18.43.3).
+ *
+ * Treat it like we would RECALLCONFLICT -- we retry for a little
+ * while, and then eventually give up.
+ */
+ case -NFS4ERR_LAYOUTTRYLATER:
+ if (lgp->args.minlength == 0) {
+ status = -EOVERFLOW;
+ goto out;
+ }
+ status = -EBUSY;
+ break;
+ case -NFS4ERR_RECALLCONFLICT:
+ case -NFS4ERR_RETURNCONFLICT:
+ status = -ERECALLCONFLICT;
+ break;
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_BAD_STATEID:
+ exception->timeout = 0;
+ spin_lock(&inode->i_lock);
+ /* If the open stateid was bad, then recover it. */
+ if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
+ !nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) {
+ spin_unlock(&inode->i_lock);
+ exception->state = lgp->args.ctx->state;
+ exception->stateid = &lgp->args.stateid;
+ break;
+ }
+
+ /*
+ * Mark the bad layout state as invalid, then retry
+ */
+ pnfs_mark_layout_stateid_invalid(lo, &head);
+ spin_unlock(&inode->i_lock);
+ nfs_commit_inode(inode, 0);
+ pnfs_free_lseg_list(&head);
+ status = -EAGAIN;
+ goto out;
+ }
+
+ err = nfs4_handle_exception(server, nfs4err, exception);
+ if (!status) {
+ if (exception->retry)
+ status = -EAGAIN;
+ else
+ status = err;
+ }
+out:
+ return status;
+}
+
+size_t max_response_pages(struct nfs_server *server)
+{
+ u32 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+ return nfs_page_array_len(0, max_resp_sz);
+}
+
+static void nfs4_layoutget_release(void *calldata)
+{
+ struct nfs4_layoutget *lgp = calldata;
+
+ nfs4_sequence_free_slot(&lgp->res.seq_res);
+ pnfs_layoutget_free(lgp);
+}
+
+static const struct rpc_call_ops nfs4_layoutget_call_ops = {
+ .rpc_call_prepare = nfs4_layoutget_prepare,
+ .rpc_call_done = nfs4_layoutget_done,
+ .rpc_release = nfs4_layoutget_release,
+};
+
+struct pnfs_layout_segment *
+nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
+{
+ struct inode *inode = lgp->args.inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
+ .rpc_argp = &lgp->args,
+ .rpc_resp = &lgp->res,
+ .rpc_cred = lgp->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = server->client,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_layoutget_call_ops,
+ .callback_data = lgp,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF |
+ RPC_TASK_MOVEABLE,
+ };
+ struct pnfs_layout_segment *lseg = NULL;
+ struct nfs4_exception exception = {
+ .inode = inode,
+ .timeout = *timeout,
+ };
+ int status = 0;
+
+ nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0);
+
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return ERR_CAST(task);
+
+ status = rpc_wait_for_completion_task(task);
+ if (status != 0)
+ goto out;
+
+ if (task->tk_status < 0) {
+ status = nfs4_layoutget_handle_exception(task, lgp, &exception);
+ *timeout = exception.timeout;
+ } else if (lgp->res.layoutp->len == 0) {
+ status = -EAGAIN;
+ *timeout = nfs4_update_delay(&exception.timeout);
+ } else
+ lseg = pnfs_layout_process(lgp);
+out:
+ trace_nfs4_layoutget(lgp->args.ctx,
+ &lgp->args.range,
+ &lgp->res.range,
+ &lgp->res.stateid,
+ status);
+
+ rpc_put_task(task);
+ dprintk("<-- %s status=%d\n", __func__, status);
+ if (status)
+ return ERR_PTR(status);
+ return lseg;
+}
+
+static void
+nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_layoutreturn *lrp = calldata;
+
+ nfs4_setup_sequence(lrp->clp,
+ &lrp->args.seq_args,
+ &lrp->res.seq_res,
+ task);
+ if (!pnfs_layout_is_valid(lrp->args.layout))
+ rpc_exit(task, 0);
+}
+
+static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_layoutreturn *lrp = calldata;
+ struct nfs_server *server;
+
+ if (!nfs41_sequence_process(task, &lrp->res.seq_res))
+ return;
+
+ /*
+ * Was there an RPC level error? Assume the call succeeded,
+ * and that we need to release the layout
+ */
+ if (task->tk_rpc_status != 0 && RPC_WAS_SENT(task)) {
+ lrp->res.lrs_present = 0;
+ return;
+ }
+
+ server = NFS_SERVER(lrp->args.inode);
+ switch (task->tk_status) {
+ case -NFS4ERR_OLD_STATEID:
+ if (nfs4_layout_refresh_old_stateid(&lrp->args.stateid,
+ &lrp->args.range,
+ lrp->args.inode))
+ goto out_restart;
+ fallthrough;
+ default:
+ task->tk_status = 0;
+ fallthrough;
+ case 0:
+ break;
+ case -NFS4ERR_DELAY:
+ if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
+ break;
+ goto out_restart;
+ }
+ return;
+out_restart:
+ task->tk_status = 0;
+ nfs4_sequence_free_slot(&lrp->res.seq_res);
+ rpc_restart_call_prepare(task);
+}
+
+static void nfs4_layoutreturn_release(void *calldata)
+{
+ struct nfs4_layoutreturn *lrp = calldata;
+ struct pnfs_layout_hdr *lo = lrp->args.layout;
+
+ pnfs_layoutreturn_free_lsegs(lo, &lrp->args.stateid, &lrp->args.range,
+ lrp->res.lrs_present ? &lrp->res.stateid : NULL);
+ nfs4_sequence_free_slot(&lrp->res.seq_res);
+ if (lrp->ld_private.ops && lrp->ld_private.ops->free)
+ lrp->ld_private.ops->free(&lrp->ld_private);
+ pnfs_put_layout_hdr(lrp->args.layout);
+ nfs_iput_and_deactive(lrp->inode);
+ put_cred(lrp->cred);
+ kfree(calldata);
+}
+
+static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
+ .rpc_call_prepare = nfs4_layoutreturn_prepare,
+ .rpc_call_done = nfs4_layoutreturn_done,
+ .rpc_release = nfs4_layoutreturn_release,
+};
+
+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
+{
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
+ .rpc_argp = &lrp->args,
+ .rpc_resp = &lrp->res,
+ .rpc_cred = lrp->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = NFS_SERVER(lrp->args.inode)->client,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_layoutreturn_call_ops,
+ .callback_data = lrp,
+ .flags = RPC_TASK_MOVEABLE,
+ };
+ int status = 0;
+
+ nfs4_state_protect(NFS_SERVER(lrp->args.inode)->nfs_client,
+ NFS_SP4_MACH_CRED_PNFS_CLEANUP,
+ &task_setup_data.rpc_client, &msg);
+
+ lrp->inode = nfs_igrab_and_active(lrp->args.inode);
+ if (!sync) {
+ if (!lrp->inode) {
+ nfs4_layoutreturn_release(lrp);
+ return -EAGAIN;
+ }
+ task_setup_data.flags |= RPC_TASK_ASYNC;
+ }
+ if (!lrp->inode)
+ nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1,
+ 1);
+ else
+ nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1,
+ 0);
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ if (sync)
+ status = task->tk_status;
+ trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid, status);
+ dprintk("<-- %s status=%d\n", __func__, status);
+ rpc_put_task(task);
+ return status;
+}
+
+static int
+_nfs4_proc_getdeviceinfo(struct nfs_server *server,
+ struct pnfs_device *pdev,
+ const struct cred *cred)
+{
+ struct nfs4_getdeviceinfo_args args = {
+ .pdev = pdev,
+ .notify_types = NOTIFY_DEVICEID4_CHANGE |
+ NOTIFY_DEVICEID4_DELETE,
+ };
+ struct nfs4_getdeviceinfo_res res = {
+ .pdev = pdev,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
+ };
+ int status;
+
+ status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+ if (res.notification & ~args.notify_types)
+ dprintk("%s: unsupported notification\n", __func__);
+ if (res.notification != args.notify_types)
+ pdev->nocache = 1;
+
+ trace_nfs4_getdeviceinfo(server, &pdev->dev_id, status);
+
+ dprintk("<-- %s status=%d\n", __func__, status);
+
+ return status;
+}
+
+int nfs4_proc_getdeviceinfo(struct nfs_server *server,
+ struct pnfs_device *pdev,
+ const struct cred *cred)
+{
+ struct nfs4_exception exception = { };
+ int err;
+
+ do {
+ err = nfs4_handle_exception(server,
+ _nfs4_proc_getdeviceinfo(server, pdev, cred),
+ &exception);
+ } while (exception.retry);
+ return err;
+}
+EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
+
+static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_layoutcommit_data *data = calldata;
+ struct nfs_server *server = NFS_SERVER(data->args.inode);
+
+ nfs4_setup_sequence(server->nfs_client,
+ &data->args.seq_args,
+ &data->res.seq_res,
+ task);
+}
+
+static void
+nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_layoutcommit_data *data = calldata;
+ struct nfs_server *server = NFS_SERVER(data->args.inode);
+
+ if (!nfs41_sequence_done(task, &data->res.seq_res))
+ return;
+
+ switch (task->tk_status) { /* Just ignore these failures */
+ case -NFS4ERR_DELEG_REVOKED: /* layout was recalled */
+ case -NFS4ERR_BADIOMODE: /* no IOMODE_RW layout for range */
+ case -NFS4ERR_BADLAYOUT: /* no layout */
+ case -NFS4ERR_GRACE: /* loca_recalim always false */
+ task->tk_status = 0;
+ break;
+ case 0:
+ break;
+ default:
+ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
+ rpc_restart_call_prepare(task);
+ return;
+ }
+ }
+}
+
+static void nfs4_layoutcommit_release(void *calldata)
+{
+ struct nfs4_layoutcommit_data *data = calldata;
+
+ pnfs_cleanup_layoutcommit(data);
+ nfs_post_op_update_inode_force_wcc(data->args.inode,
+ data->res.fattr);
+ put_cred(data->cred);
+ nfs_iput_and_deactive(data->inode);
+ kfree(data);
+}
+
+static const struct rpc_call_ops nfs4_layoutcommit_ops = {
+ .rpc_call_prepare = nfs4_layoutcommit_prepare,
+ .rpc_call_done = nfs4_layoutcommit_done,
+ .rpc_release = nfs4_layoutcommit_release,
+};
+
+int
+nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT],
+ .rpc_argp = &data->args,
+ .rpc_resp = &data->res,
+ .rpc_cred = data->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .task = &data->task,
+ .rpc_client = NFS_CLIENT(data->args.inode),
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_layoutcommit_ops,
+ .callback_data = data,
+ .flags = RPC_TASK_MOVEABLE,
+ };
+ struct rpc_task *task;
+ int status = 0;
+
+ dprintk("NFS: initiating layoutcommit call. sync %d "
+ "lbw: %llu inode %lu\n", sync,
+ data->args.lastbytewritten,
+ data->args.inode->i_ino);
+
+ if (!sync) {
+ data->inode = nfs_igrab_and_active(data->args.inode);
+ if (data->inode == NULL) {
+ nfs4_layoutcommit_release(data);
+ return -EAGAIN;
+ }
+ task_setup_data.flags = RPC_TASK_ASYNC;
+ }
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0);
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ if (sync)
+ status = task->tk_status;
+ trace_nfs4_layoutcommit(data->args.inode, &data->args.stateid, status);
+ dprintk("%s: status %d\n", __func__, status);
+ rpc_put_task(task);
+ return status;
+}
+
+/*
+ * Use the state managment nfs_client cl_rpcclient, which uses krb5i (if
+ * possible) as per RFC3530bis and RFC5661 Security Considerations sections
+ */
+static int
+_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info,
+ struct nfs4_secinfo_flavors *flavors, bool use_integrity)
+{
+ struct nfs41_secinfo_no_name_args args = {
+ .style = SECINFO_STYLE_CURRENT_FH,
+ };
+ struct nfs4_secinfo_res res = {
+ .flavors = flavors,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO_NO_NAME],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ struct nfs4_call_sync_data data = {
+ .seq_server = server,
+ .seq_args = &args.seq_args,
+ .seq_res = &res.seq_res,
+ };
+ struct rpc_task_setup task_setup = {
+ .rpc_client = server->client,
+ .rpc_message = &msg,
+ .callback_ops = server->nfs_client->cl_mvops->call_sync_ops,
+ .callback_data = &data,
+ .flags = RPC_TASK_NO_ROUND_ROBIN,
+ };
+ const struct cred *cred = NULL;
+ int status;
+
+ if (use_integrity) {
+ task_setup.rpc_client = server->nfs_client->cl_rpcclient;
+
+ cred = nfs4_get_clid_cred(server->nfs_client);
+ msg.rpc_cred = cred;
+ }
+
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
+ status = nfs4_call_sync_custom(&task_setup);
+ dprintk("<-- %s status=%d\n", __func__, status);
+
+ put_cred(cred);
+
+ return status;
+}
+
+static int
+nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+ do {
+ /* first try using integrity protection */
+ err = -NFS4ERR_WRONGSEC;
+
+ /* try to use integrity protection with machine cred */
+ if (_nfs4_is_integrity_protected(server->nfs_client))
+ err = _nfs41_proc_secinfo_no_name(server, fhandle, info,
+ flavors, true);
+
+ /*
+ * if unable to use integrity protection, or SECINFO with
+ * integrity protection returns NFS4ERR_WRONGSEC (which is
+ * disallowed by spec, but exists in deployed servers) use
+ * the current filesystem's rpc_client and the user cred.
+ */
+ if (err == -NFS4ERR_WRONGSEC)
+ err = _nfs41_proc_secinfo_no_name(server, fhandle, info,
+ flavors, false);
+
+ switch (err) {
+ case 0:
+ case -NFS4ERR_WRONGSEC:
+ case -ENOTSUPP:
+ goto out;
+ default:
+ err = nfs4_handle_exception(server, err, &exception);
+ }
+ } while (exception.retry);
+out:
+ return err;
+}
+
+static int
+nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info)
+{
+ int err;
+ struct page *page;
+ rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
+ struct nfs4_secinfo_flavors *flavors;
+ struct nfs4_secinfo4 *secinfo;
+ int i;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ flavors = page_address(page);
+ err = nfs41_proc_secinfo_no_name(server, fhandle, info, flavors);
+
+ /*
+ * Fall back on "guess and check" method if
+ * the server doesn't support SECINFO_NO_NAME
+ */
+ if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) {
+ err = nfs4_find_root_sec(server, fhandle, info);
+ goto out_freepage;
+ }
+ if (err)
+ goto out_freepage;
+
+ for (i = 0; i < flavors->num_flavors; i++) {
+ secinfo = &flavors->flavors[i];
+
+ switch (secinfo->flavor) {
+ case RPC_AUTH_NULL:
+ case RPC_AUTH_UNIX:
+ case RPC_AUTH_GSS:
+ flavor = rpcauth_get_pseudoflavor(secinfo->flavor,
+ &secinfo->flavor_info);
+ break;
+ default:
+ flavor = RPC_AUTH_MAXFLAVOR;
+ break;
+ }
+
+ if (!nfs_auth_info_match(&server->auth_info, flavor))
+ flavor = RPC_AUTH_MAXFLAVOR;
+
+ if (flavor != RPC_AUTH_MAXFLAVOR) {
+ err = nfs4_lookup_root_sec(server, fhandle,
+ info, flavor);
+ if (!err)
+ break;
+ }
+ }
+
+ if (flavor == RPC_AUTH_MAXFLAVOR)
+ err = -EPERM;
+
+out_freepage:
+ put_page(page);
+ if (err == -EACCES)
+ return -EPERM;
+out:
+ return err;
+}
+
+static int _nfs41_test_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ const struct cred *cred)
+{
+ int status;
+ struct nfs41_test_stateid_args args = {
+ .stateid = stateid,
+ };
+ struct nfs41_test_stateid_res res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
+ };
+ struct rpc_clnt *rpc_client = server->client;
+
+ nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID,
+ &rpc_client, &msg);
+
+ dprintk("NFS call test_stateid %p\n", stateid);
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1);
+ status = nfs4_call_sync_sequence(rpc_client, server, &msg,
+ &args.seq_args, &res.seq_res);
+ if (status != NFS_OK) {
+ dprintk("NFS reply test_stateid: failed, %d\n", status);
+ return status;
+ }
+ dprintk("NFS reply test_stateid: succeeded, %d\n", -res.status);
+ return -res.status;
+}
+
+static void nfs4_handle_delay_or_session_error(struct nfs_server *server,
+ int err, struct nfs4_exception *exception)
+{
+ exception->retry = 0;
+ switch(err) {
+ case -NFS4ERR_DELAY:
+ case -NFS4ERR_RETRY_UNCACHED_REP:
+ nfs4_handle_exception(server, err, exception);
+ break;
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case -NFS4ERR_DEADSESSION:
+ nfs4_do_handle_exception(server, err, exception);
+ }
+}
+
+/**
+ * nfs41_test_stateid - perform a TEST_STATEID operation
+ *
+ * @server: server / transport on which to perform the operation
+ * @stateid: state ID to test
+ * @cred: credential
+ *
+ * Returns NFS_OK if the server recognizes that "stateid" is valid.
+ * Otherwise a negative NFS4ERR value is returned if the operation
+ * failed or the state ID is not currently valid.
+ */
+static int nfs41_test_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ const struct cred *cred)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ int err;
+ do {
+ err = _nfs41_test_stateid(server, stateid, cred);
+ nfs4_handle_delay_or_session_error(server, err, &exception);
+ } while (exception.retry);
+ return err;
+}
+
+struct nfs_free_stateid_data {
+ struct nfs_server *server;
+ struct nfs41_free_stateid_args args;
+ struct nfs41_free_stateid_res res;
+};
+
+static void nfs41_free_stateid_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs_free_stateid_data *data = calldata;
+ nfs4_setup_sequence(data->server->nfs_client,
+ &data->args.seq_args,
+ &data->res.seq_res,
+ task);
+}
+
+static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs_free_stateid_data *data = calldata;
+
+ nfs41_sequence_done(task, &data->res.seq_res);
+
+ switch (task->tk_status) {
+ case -NFS4ERR_DELAY:
+ if (nfs4_async_handle_error(task, data->server, NULL, NULL) == -EAGAIN)
+ rpc_restart_call_prepare(task);
+ }
+}
+
+static void nfs41_free_stateid_release(void *calldata)
+{
+ struct nfs_free_stateid_data *data = calldata;
+ struct nfs_client *clp = data->server->nfs_client;
+
+ nfs_put_client(clp);
+ kfree(calldata);
+}
+
+static const struct rpc_call_ops nfs41_free_stateid_ops = {
+ .rpc_call_prepare = nfs41_free_stateid_prepare,
+ .rpc_call_done = nfs41_free_stateid_done,
+ .rpc_release = nfs41_free_stateid_release,
+};
+
+/**
+ * nfs41_free_stateid - perform a FREE_STATEID operation
+ *
+ * @server: server / transport on which to perform the operation
+ * @stateid: state ID to release
+ * @cred: credential
+ * @privileged: set to true if this call needs to be privileged
+ *
+ * Note: this function is always asynchronous.
+ */
+static int nfs41_free_stateid(struct nfs_server *server,
+ const nfs4_stateid *stateid,
+ const struct cred *cred,
+ bool privileged)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID],
+ .rpc_cred = cred,
+ };
+ struct rpc_task_setup task_setup = {
+ .rpc_client = server->client,
+ .rpc_message = &msg,
+ .callback_ops = &nfs41_free_stateid_ops,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE,
+ };
+ struct nfs_free_stateid_data *data;
+ struct rpc_task *task;
+ struct nfs_client *clp = server->nfs_client;
+
+ if (!refcount_inc_not_zero(&clp->cl_count))
+ return -EIO;
+
+ nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID,
+ &task_setup.rpc_client, &msg);
+
+ dprintk("NFS call free_stateid %p\n", stateid);
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ data->server = server;
+ nfs4_stateid_copy(&data->args.stateid, stateid);
+
+ task_setup.callback_data = data;
+
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, privileged);
+ task = rpc_run_task(&task_setup);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ rpc_put_task(task);
+ return 0;
+}
+
+static void
+nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
+{
+ const struct cred *cred = lsp->ls_state->owner->so_cred;
+
+ nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
+ nfs4_free_lock_state(server, lsp);
+}
+
+static bool nfs41_match_stateid(const nfs4_stateid *s1,
+ const nfs4_stateid *s2)
+{
+ if (s1->type != s2->type)
+ return false;
+
+ if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0)
+ return false;
+
+ if (s1->seqid == s2->seqid)
+ return true;
+
+ return s1->seqid == 0 || s2->seqid == 0;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+static bool nfs4_match_stateid(const nfs4_stateid *s1,
+ const nfs4_stateid *s2)
+{
+ return nfs4_stateid_match(s1, s2);
+}
+
+
+static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
+ .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
+ .state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
+ .recover_open = nfs4_open_reclaim,
+ .recover_lock = nfs4_lock_reclaim,
+ .establish_clid = nfs4_init_clientid,
+ .detect_trunking = nfs40_discover_server_trunking,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
+ .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
+ .state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
+ .recover_open = nfs4_open_reclaim,
+ .recover_lock = nfs4_lock_reclaim,
+ .establish_clid = nfs41_init_clientid,
+ .reclaim_complete = nfs41_proc_reclaim_complete,
+ .detect_trunking = nfs41_discover_server_trunking,
+};
+#endif /* CONFIG_NFS_V4_1 */
+
+static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
+ .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
+ .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
+ .recover_open = nfs40_open_expired,
+ .recover_lock = nfs4_lock_expired,
+ .establish_clid = nfs4_init_clientid,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+static const struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
+ .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
+ .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
+ .recover_open = nfs41_open_expired,
+ .recover_lock = nfs41_lock_expired,
+ .establish_clid = nfs41_init_clientid,
+};
+#endif /* CONFIG_NFS_V4_1 */
+
+static const struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = {
+ .sched_state_renewal = nfs4_proc_async_renew,
+ .get_state_renewal_cred = nfs4_get_renew_cred,
+ .renew_lease = nfs4_proc_renew,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
+ .sched_state_renewal = nfs41_proc_async_sequence,
+ .get_state_renewal_cred = nfs4_get_machine_cred,
+ .renew_lease = nfs4_proc_sequence,
+};
+#endif
+
+static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = {
+ .get_locations = _nfs40_proc_get_locations,
+ .fsid_present = _nfs40_proc_fsid_present,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = {
+ .get_locations = _nfs41_proc_get_locations,
+ .fsid_present = _nfs41_proc_fsid_present,
+};
+#endif /* CONFIG_NFS_V4_1 */
+
+static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
+ .minor_version = 0,
+ .init_caps = NFS_CAP_READDIRPLUS
+ | NFS_CAP_ATOMIC_OPEN
+ | NFS_CAP_POSIX_LOCK,
+ .init_client = nfs40_init_client,
+ .shutdown_client = nfs40_shutdown_client,
+ .match_stateid = nfs4_match_stateid,
+ .find_root_sec = nfs4_find_root_sec,
+ .free_lock_state = nfs4_release_lockowner,
+ .test_and_free_expired = nfs40_test_and_free_expired_stateid,
+ .alloc_seqid = nfs_alloc_seqid,
+ .call_sync_ops = &nfs40_call_sync_ops,
+ .reboot_recovery_ops = &nfs40_reboot_recovery_ops,
+ .nograce_recovery_ops = &nfs40_nograce_recovery_ops,
+ .state_renewal_ops = &nfs40_state_renewal_ops,
+ .mig_recovery_ops = &nfs40_mig_recovery_ops,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+static struct nfs_seqid *
+nfs_alloc_no_seqid(struct nfs_seqid_counter *arg1, gfp_t arg2)
+{
+ return NULL;
+}
+
+static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
+ .minor_version = 1,
+ .init_caps = NFS_CAP_READDIRPLUS
+ | NFS_CAP_ATOMIC_OPEN
+ | NFS_CAP_POSIX_LOCK
+ | NFS_CAP_STATEID_NFSV41
+ | NFS_CAP_ATOMIC_OPEN_V1
+ | NFS_CAP_LGOPEN
+ | NFS_CAP_MOVEABLE,
+ .init_client = nfs41_init_client,
+ .shutdown_client = nfs41_shutdown_client,
+ .match_stateid = nfs41_match_stateid,
+ .find_root_sec = nfs41_find_root_sec,
+ .free_lock_state = nfs41_free_lock_state,
+ .test_and_free_expired = nfs41_test_and_free_expired_stateid,
+ .alloc_seqid = nfs_alloc_no_seqid,
+ .session_trunk = nfs4_test_session_trunk,
+ .call_sync_ops = &nfs41_call_sync_ops,
+ .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
+ .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
+ .state_renewal_ops = &nfs41_state_renewal_ops,
+ .mig_recovery_ops = &nfs41_mig_recovery_ops,
+};
+#endif
+
+#if defined(CONFIG_NFS_V4_2)
+static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
+ .minor_version = 2,
+ .init_caps = NFS_CAP_READDIRPLUS
+ | NFS_CAP_ATOMIC_OPEN
+ | NFS_CAP_POSIX_LOCK
+ | NFS_CAP_STATEID_NFSV41
+ | NFS_CAP_ATOMIC_OPEN_V1
+ | NFS_CAP_LGOPEN
+ | NFS_CAP_ALLOCATE
+ | NFS_CAP_COPY
+ | NFS_CAP_OFFLOAD_CANCEL
+ | NFS_CAP_COPY_NOTIFY
+ | NFS_CAP_DEALLOCATE
+ | NFS_CAP_SEEK
+ | NFS_CAP_LAYOUTSTATS
+ | NFS_CAP_CLONE
+ | NFS_CAP_LAYOUTERROR
+ | NFS_CAP_READ_PLUS
+ | NFS_CAP_MOVEABLE,
+ .init_client = nfs41_init_client,
+ .shutdown_client = nfs41_shutdown_client,
+ .match_stateid = nfs41_match_stateid,
+ .find_root_sec = nfs41_find_root_sec,
+ .free_lock_state = nfs41_free_lock_state,
+ .call_sync_ops = &nfs41_call_sync_ops,
+ .test_and_free_expired = nfs41_test_and_free_expired_stateid,
+ .alloc_seqid = nfs_alloc_no_seqid,
+ .session_trunk = nfs4_test_session_trunk,
+ .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
+ .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
+ .state_renewal_ops = &nfs41_state_renewal_ops,
+ .mig_recovery_ops = &nfs41_mig_recovery_ops,
+};
+#endif
+
+const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
+ [0] = &nfs_v4_0_minor_ops,
+#if defined(CONFIG_NFS_V4_1)
+ [1] = &nfs_v4_1_minor_ops,
+#endif
+#if defined(CONFIG_NFS_V4_2)
+ [2] = &nfs_v4_2_minor_ops,
+#endif
+};
+
+static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+ ssize_t error, error2, error3;
+
+ error = generic_listxattr(dentry, list, size);
+ if (error < 0)
+ return error;
+ if (list) {
+ list += error;
+ size -= error;
+ }
+
+ error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size);
+ if (error2 < 0)
+ return error2;
+
+ if (list) {
+ list += error2;
+ size -= error2;
+ }
+
+ error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, size);
+ if (error3 < 0)
+ return error3;
+
+ return error + error2 + error3;
+}
+
+static void nfs4_enable_swap(struct inode *inode)
+{
+ /* The state manager thread must always be running.
+ * It will notice the client is a swapper, and stay put.
+ */
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+
+ nfs4_schedule_state_manager(clp);
+}
+
+static void nfs4_disable_swap(struct inode *inode)
+{
+ /* The state manager thread will now exit once it is
+ * woken.
+ */
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+
+ set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+ wake_up_var(&clp->cl_state);
+}
+
+static const struct inode_operations nfs4_dir_inode_operations = {
+ .create = nfs_create,
+ .lookup = nfs_lookup,
+ .atomic_open = nfs_atomic_open,
+ .link = nfs_link,
+ .unlink = nfs_unlink,
+ .symlink = nfs_symlink,
+ .mkdir = nfs_mkdir,
+ .rmdir = nfs_rmdir,
+ .mknod = nfs_mknod,
+ .rename = nfs_rename,
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+ .listxattr = nfs4_listxattr,
+};
+
+static const struct inode_operations nfs4_file_inode_operations = {
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+ .listxattr = nfs4_listxattr,
+};
+
+const struct nfs_rpc_ops nfs_v4_clientops = {
+ .version = 4, /* protocol version */
+ .dentry_ops = &nfs4_dentry_operations,
+ .dir_inode_ops = &nfs4_dir_inode_operations,
+ .file_inode_ops = &nfs4_file_inode_operations,
+ .file_ops = &nfs4_file_operations,
+ .getroot = nfs4_proc_get_root,
+ .submount = nfs4_submount,
+ .try_get_tree = nfs4_try_get_tree,
+ .getattr = nfs4_proc_getattr,
+ .setattr = nfs4_proc_setattr,
+ .lookup = nfs4_proc_lookup,
+ .lookupp = nfs4_proc_lookupp,
+ .access = nfs4_proc_access,
+ .readlink = nfs4_proc_readlink,
+ .create = nfs4_proc_create,
+ .remove = nfs4_proc_remove,
+ .unlink_setup = nfs4_proc_unlink_setup,
+ .unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare,
+ .unlink_done = nfs4_proc_unlink_done,
+ .rename_setup = nfs4_proc_rename_setup,
+ .rename_rpc_prepare = nfs4_proc_rename_rpc_prepare,
+ .rename_done = nfs4_proc_rename_done,
+ .link = nfs4_proc_link,
+ .symlink = nfs4_proc_symlink,
+ .mkdir = nfs4_proc_mkdir,
+ .rmdir = nfs4_proc_rmdir,
+ .readdir = nfs4_proc_readdir,
+ .mknod = nfs4_proc_mknod,
+ .statfs = nfs4_proc_statfs,
+ .fsinfo = nfs4_proc_fsinfo,
+ .pathconf = nfs4_proc_pathconf,
+ .set_capabilities = nfs4_server_capabilities,
+ .decode_dirent = nfs4_decode_dirent,
+ .pgio_rpc_prepare = nfs4_proc_pgio_rpc_prepare,
+ .read_setup = nfs4_proc_read_setup,
+ .read_done = nfs4_read_done,
+ .write_setup = nfs4_proc_write_setup,
+ .write_done = nfs4_write_done,
+ .commit_setup = nfs4_proc_commit_setup,
+ .commit_rpc_prepare = nfs4_proc_commit_rpc_prepare,
+ .commit_done = nfs4_commit_done,
+ .lock = nfs4_proc_lock,
+ .clear_acl_cache = nfs4_zap_acl_attr,
+ .close_context = nfs4_close_context,
+ .open_context = nfs4_atomic_open,
+ .have_delegation = nfs4_have_delegation,
+ .alloc_client = nfs4_alloc_client,
+ .init_client = nfs4_init_client,
+ .free_client = nfs4_free_client,
+ .create_server = nfs4_create_server,
+ .clone_server = nfs_clone_server,
+ .discover_trunking = nfs4_discover_trunking,
+ .enable_swap = nfs4_enable_swap,
+ .disable_swap = nfs4_disable_swap,
+};
+
+static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
+ .name = XATTR_NAME_NFSV4_ACL,
+ .list = nfs4_xattr_list_nfs4_acl,
+ .get = nfs4_xattr_get_nfs4_acl,
+ .set = nfs4_xattr_set_nfs4_acl,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+static const struct xattr_handler nfs4_xattr_nfs4_dacl_handler = {
+ .name = XATTR_NAME_NFSV4_DACL,
+ .list = nfs4_xattr_list_nfs4_dacl,
+ .get = nfs4_xattr_get_nfs4_dacl,
+ .set = nfs4_xattr_set_nfs4_dacl,
+};
+
+static const struct xattr_handler nfs4_xattr_nfs4_sacl_handler = {
+ .name = XATTR_NAME_NFSV4_SACL,
+ .list = nfs4_xattr_list_nfs4_sacl,
+ .get = nfs4_xattr_get_nfs4_sacl,
+ .set = nfs4_xattr_set_nfs4_sacl,
+};
+#endif
+
+#ifdef CONFIG_NFS_V4_2
+static const struct xattr_handler nfs4_xattr_nfs4_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = nfs4_xattr_get_nfs4_user,
+ .set = nfs4_xattr_set_nfs4_user,
+};
+#endif
+
+const struct xattr_handler *nfs4_xattr_handlers[] = {
+ &nfs4_xattr_nfs4_acl_handler,
+#if defined(CONFIG_NFS_V4_1)
+ &nfs4_xattr_nfs4_dacl_handler,
+ &nfs4_xattr_nfs4_sacl_handler,
+#endif
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ &nfs4_xattr_nfs4_label_handler,
+#endif
+#ifdef CONFIG_NFS_V4_2
+ &nfs4_xattr_nfs4_user_handler,
+#endif
+ NULL
+};
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
new file mode 100644
index 0000000000..db3811af07
--- /dev/null
+++ b/fs/nfs/nfs4renewd.c
@@ -0,0 +1,151 @@
+/*
+ * fs/nfs/nfs4renewd.c
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Kendrick Smith <kmsmith@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Implementation of the NFSv4 "renew daemon", which wakes up periodically to
+ * send a RENEW, to keep state alive on the server. The daemon is implemented
+ * as an rpc_task, not a real kernel thread, so it always runs in rpciod's
+ * context. There is one renewd per nfs_server.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/clnt.h>
+
+#include <linux/nfs.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
+#include "delegation.h"
+
+#define NFSDBG_FACILITY NFSDBG_STATE
+
+void
+nfs4_renew_state(struct work_struct *work)
+{
+ const struct nfs4_state_maintenance_ops *ops;
+ struct nfs_client *clp =
+ container_of(work, struct nfs_client, cl_renewd.work);
+ const struct cred *cred;
+ long lease;
+ unsigned long last, now;
+ unsigned renew_flags = 0;
+
+ ops = clp->cl_mvops->state_renewal_ops;
+ dprintk("%s: start\n", __func__);
+
+ if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state))
+ goto out;
+
+ lease = clp->cl_lease_time;
+ last = clp->cl_last_renewal;
+ now = jiffies;
+ /* Are we close to a lease timeout? */
+ if (time_after(now, last + lease/3))
+ renew_flags |= NFS4_RENEW_TIMEOUT;
+ if (nfs_delegations_present(clp))
+ renew_flags |= NFS4_RENEW_DELEGATION_CB;
+
+ if (renew_flags != 0) {
+ cred = ops->get_state_renewal_cred(clp);
+ if (cred == NULL) {
+ if (!(renew_flags & NFS4_RENEW_DELEGATION_CB)) {
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ goto out;
+ }
+ nfs_expire_all_delegations(clp);
+ } else {
+ int ret;
+
+ /* Queue an asynchronous RENEW. */
+ ret = ops->sched_state_renewal(clp, cred, renew_flags);
+ put_cred(cred);
+ switch (ret) {
+ default:
+ goto out_exp;
+ case -EAGAIN:
+ case -ENOMEM:
+ break;
+ }
+ }
+ } else {
+ dprintk("%s: failed to call renewd. Reason: lease not expired \n",
+ __func__);
+ }
+ nfs4_schedule_state_renewal(clp);
+out_exp:
+ nfs_expire_unreferenced_delegations(clp);
+out:
+ dprintk("%s: done\n", __func__);
+}
+
+void
+nfs4_schedule_state_renewal(struct nfs_client *clp)
+{
+ long timeout;
+
+ spin_lock(&clp->cl_lock);
+ timeout = (2 * clp->cl_lease_time) / 3 + (long)clp->cl_last_renewal
+ - (long)jiffies;
+ if (timeout < 5 * HZ)
+ timeout = 5 * HZ;
+ dprintk("%s: requeueing work. Lease period = %ld\n",
+ __func__, (timeout + HZ - 1) / HZ);
+ mod_delayed_work(system_wq, &clp->cl_renewd, timeout);
+ set_bit(NFS_CS_RENEWD, &clp->cl_res_state);
+ spin_unlock(&clp->cl_lock);
+}
+
+void
+nfs4_kill_renewd(struct nfs_client *clp)
+{
+ cancel_delayed_work_sync(&clp->cl_renewd);
+}
+
+/**
+ * nfs4_set_lease_period - Sets the lease period on a nfs_client
+ *
+ * @clp: pointer to nfs_client
+ * @lease: new value for lease period
+ */
+void nfs4_set_lease_period(struct nfs_client *clp,
+ unsigned long lease)
+{
+ spin_lock(&clp->cl_lock);
+ clp->cl_lease_time = lease;
+ spin_unlock(&clp->cl_lock);
+
+ /* Cap maximum reconnect timeout at 1/2 lease period */
+ rpc_set_connect_timeout(clp->cl_rpcclient, lease, lease >> 1);
+}
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
new file mode 100644
index 0000000000..5db460476b
--- /dev/null
+++ b/fs/nfs/nfs4session.c
@@ -0,0 +1,657 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * fs/nfs/nfs4session.c
+ *
+ * Copyright (c) 2012 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/bc_xprt.h>
+#include <linux/nfs.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/module.h>
+
+#include "nfs4_fs.h"
+#include "internal.h"
+#include "nfs4session.h"
+#include "callback.h"
+
+#define NFSDBG_FACILITY NFSDBG_STATE
+
+static void nfs4_init_slot_table(struct nfs4_slot_table *tbl, const char *queue)
+{
+ tbl->highest_used_slotid = NFS4_NO_SLOT;
+ spin_lock_init(&tbl->slot_tbl_lock);
+ rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue);
+ init_waitqueue_head(&tbl->slot_waitq);
+ init_completion(&tbl->complete);
+}
+
+/*
+ * nfs4_shrink_slot_table - free retired slots from the slot table
+ */
+static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize)
+{
+ struct nfs4_slot **p;
+ if (newsize >= tbl->max_slots)
+ return;
+
+ p = &tbl->slots;
+ while (newsize--)
+ p = &(*p)->next;
+ while (*p) {
+ struct nfs4_slot *slot = *p;
+
+ *p = slot->next;
+ kfree(slot);
+ tbl->max_slots--;
+ }
+}
+
+/**
+ * nfs4_slot_tbl_drain_complete - wake waiters when drain is complete
+ * @tbl: controlling slot table
+ *
+ */
+void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl)
+{
+ if (nfs4_slot_tbl_draining(tbl))
+ complete(&tbl->complete);
+}
+
+/*
+ * nfs4_free_slot - free a slot and efficiently update slot table.
+ *
+ * freeing a slot is trivially done by clearing its respective bit
+ * in the bitmap.
+ * If the freed slotid equals highest_used_slotid we want to update it
+ * so that the server would be able to size down the slot table if needed,
+ * otherwise we know that the highest_used_slotid is still in use.
+ * When updating highest_used_slotid there may be "holes" in the bitmap
+ * so we need to scan down from highest_used_slotid to 0 looking for the now
+ * highest slotid in use.
+ * If none found, highest_used_slotid is set to NFS4_NO_SLOT.
+ *
+ * Must be called while holding tbl->slot_tbl_lock
+ */
+void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
+{
+ u32 slotid = slot->slot_nr;
+
+ /* clear used bit in bitmap */
+ __clear_bit(slotid, tbl->used_slots);
+
+ /* update highest_used_slotid when it is freed */
+ if (slotid == tbl->highest_used_slotid) {
+ u32 new_max = find_last_bit(tbl->used_slots, slotid);
+ if (new_max < slotid)
+ tbl->highest_used_slotid = new_max;
+ else {
+ tbl->highest_used_slotid = NFS4_NO_SLOT;
+ nfs4_slot_tbl_drain_complete(tbl);
+ }
+ }
+ dprintk("%s: slotid %u highest_used_slotid %u\n", __func__,
+ slotid, tbl->highest_used_slotid);
+}
+
+static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl,
+ u32 slotid, u32 seq_init, gfp_t gfp_mask)
+{
+ struct nfs4_slot *slot;
+
+ slot = kzalloc(sizeof(*slot), gfp_mask);
+ if (slot) {
+ slot->table = tbl;
+ slot->slot_nr = slotid;
+ slot->seq_nr = seq_init;
+ slot->seq_nr_highest_sent = seq_init;
+ slot->seq_nr_last_acked = seq_init - 1;
+ }
+ return slot;
+}
+
+static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl,
+ u32 slotid, u32 seq_init, gfp_t gfp_mask)
+{
+ struct nfs4_slot **p, *slot;
+
+ p = &tbl->slots;
+ for (;;) {
+ if (*p == NULL) {
+ *p = nfs4_new_slot(tbl, tbl->max_slots,
+ seq_init, gfp_mask);
+ if (*p == NULL)
+ break;
+ tbl->max_slots++;
+ }
+ slot = *p;
+ if (slot->slot_nr == slotid)
+ return slot;
+ p = &slot->next;
+ }
+ return ERR_PTR(-ENOMEM);
+}
+
+static void nfs4_lock_slot(struct nfs4_slot_table *tbl,
+ struct nfs4_slot *slot)
+{
+ u32 slotid = slot->slot_nr;
+
+ __set_bit(slotid, tbl->used_slots);
+ if (slotid > tbl->highest_used_slotid ||
+ tbl->highest_used_slotid == NFS4_NO_SLOT)
+ tbl->highest_used_slotid = slotid;
+ slot->generation = tbl->generation;
+}
+
+/*
+ * nfs4_try_to_lock_slot - Given a slot try to allocate it
+ *
+ * Note: must be called with the slot_tbl_lock held.
+ */
+bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
+{
+ if (nfs4_test_locked_slot(tbl, slot->slot_nr))
+ return false;
+ nfs4_lock_slot(tbl, slot);
+ return true;
+}
+
+/*
+ * nfs4_lookup_slot - Find a slot but don't allocate it
+ *
+ * Note: must be called with the slot_tbl_lock held.
+ */
+struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid)
+{
+ if (slotid <= tbl->max_slotid)
+ return nfs4_find_or_create_slot(tbl, slotid, 0, GFP_NOWAIT);
+ return ERR_PTR(-E2BIG);
+}
+
+static int nfs4_slot_get_seqid(struct nfs4_slot_table *tbl, u32 slotid,
+ u32 *seq_nr)
+ __must_hold(&tbl->slot_tbl_lock)
+{
+ struct nfs4_slot *slot;
+ int ret;
+
+ slot = nfs4_lookup_slot(tbl, slotid);
+ ret = PTR_ERR_OR_ZERO(slot);
+ if (!ret)
+ *seq_nr = slot->seq_nr;
+
+ return ret;
+}
+
+/*
+ * nfs4_slot_seqid_in_use - test if a slot sequence id is still in use
+ *
+ * Given a slot table, slot id and sequence number, determine if the
+ * RPC call in question is still in flight. This function is mainly
+ * intended for use by the callback channel.
+ */
+static bool nfs4_slot_seqid_in_use(struct nfs4_slot_table *tbl,
+ u32 slotid, u32 seq_nr)
+{
+ u32 cur_seq = 0;
+ bool ret = false;
+
+ spin_lock(&tbl->slot_tbl_lock);
+ if (nfs4_slot_get_seqid(tbl, slotid, &cur_seq) == 0 &&
+ cur_seq == seq_nr && test_bit(slotid, tbl->used_slots))
+ ret = true;
+ spin_unlock(&tbl->slot_tbl_lock);
+ return ret;
+}
+
+/*
+ * nfs4_slot_wait_on_seqid - wait until a slot sequence id is complete
+ *
+ * Given a slot table, slot id and sequence number, wait until the
+ * corresponding RPC call completes. This function is mainly
+ * intended for use by the callback channel.
+ */
+int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl,
+ u32 slotid, u32 seq_nr,
+ unsigned long timeout)
+{
+ if (wait_event_timeout(tbl->slot_waitq,
+ !nfs4_slot_seqid_in_use(tbl, slotid, seq_nr),
+ timeout) == 0)
+ return -ETIMEDOUT;
+ return 0;
+}
+
+/*
+ * nfs4_alloc_slot - efficiently look for a free slot
+ *
+ * nfs4_alloc_slot looks for an unset bit in the used_slots bitmap.
+ * If found, we mark the slot as used, update the highest_used_slotid,
+ * and respectively set up the sequence operation args.
+ *
+ * Note: must be called with under the slot_tbl_lock.
+ */
+struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl)
+{
+ struct nfs4_slot *ret = ERR_PTR(-EBUSY);
+ u32 slotid;
+
+ dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n",
+ __func__, tbl->used_slots[0], tbl->highest_used_slotid,
+ tbl->max_slotid + 1);
+ slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1);
+ if (slotid <= tbl->max_slotid) {
+ ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
+ if (!IS_ERR(ret))
+ nfs4_lock_slot(tbl, ret);
+ }
+ dprintk("<-- %s used_slots=%04lx highest_used=%u slotid=%u\n",
+ __func__, tbl->used_slots[0], tbl->highest_used_slotid,
+ !IS_ERR(ret) ? ret->slot_nr : NFS4_NO_SLOT);
+ return ret;
+}
+
+static int nfs4_grow_slot_table(struct nfs4_slot_table *tbl,
+ u32 max_reqs, u32 ivalue)
+{
+ if (max_reqs <= tbl->max_slots)
+ return 0;
+ if (!IS_ERR(nfs4_find_or_create_slot(tbl, max_reqs - 1, ivalue, GFP_NOFS)))
+ return 0;
+ return -ENOMEM;
+}
+
+static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl,
+ u32 server_highest_slotid,
+ u32 ivalue)
+{
+ struct nfs4_slot **p;
+
+ nfs4_shrink_slot_table(tbl, server_highest_slotid + 1);
+ p = &tbl->slots;
+ while (*p) {
+ (*p)->seq_nr = ivalue;
+ (*p)->seq_nr_highest_sent = ivalue;
+ (*p)->seq_nr_last_acked = ivalue - 1;
+ p = &(*p)->next;
+ }
+ tbl->highest_used_slotid = NFS4_NO_SLOT;
+ tbl->target_highest_slotid = server_highest_slotid;
+ tbl->server_highest_slotid = server_highest_slotid;
+ tbl->d_target_highest_slotid = 0;
+ tbl->d2_target_highest_slotid = 0;
+ tbl->max_slotid = server_highest_slotid;
+}
+
+/*
+ * (re)Initialise a slot table
+ */
+static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl,
+ u32 max_reqs, u32 ivalue)
+{
+ int ret;
+
+ dprintk("--> %s: max_reqs=%u, tbl->max_slots %u\n", __func__,
+ max_reqs, tbl->max_slots);
+
+ if (max_reqs > NFS4_MAX_SLOT_TABLE)
+ max_reqs = NFS4_MAX_SLOT_TABLE;
+
+ ret = nfs4_grow_slot_table(tbl, max_reqs, ivalue);
+ if (ret)
+ goto out;
+
+ spin_lock(&tbl->slot_tbl_lock);
+ nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue);
+ spin_unlock(&tbl->slot_tbl_lock);
+
+ dprintk("%s: tbl=%p slots=%p max_slots=%u\n", __func__,
+ tbl, tbl->slots, tbl->max_slots);
+out:
+ dprintk("<-- %s: return %d\n", __func__, ret);
+ return ret;
+}
+
+/*
+ * nfs4_release_slot_table - release all slot table entries
+ */
+static void nfs4_release_slot_table(struct nfs4_slot_table *tbl)
+{
+ nfs4_shrink_slot_table(tbl, 0);
+}
+
+/**
+ * nfs4_shutdown_slot_table - release resources attached to a slot table
+ * @tbl: slot table to shut down
+ *
+ */
+void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl)
+{
+ nfs4_release_slot_table(tbl);
+ rpc_destroy_wait_queue(&tbl->slot_tbl_waitq);
+}
+
+/**
+ * nfs4_setup_slot_table - prepare a stand-alone slot table for use
+ * @tbl: slot table to set up
+ * @max_reqs: maximum number of requests allowed
+ * @queue: name to give RPC wait queue
+ *
+ * Returns zero on success, or a negative errno.
+ */
+int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, unsigned int max_reqs,
+ const char *queue)
+{
+ nfs4_init_slot_table(tbl, queue);
+ return nfs4_realloc_slot_table(tbl, max_reqs, 0);
+}
+
+static bool nfs41_assign_slot(struct rpc_task *task, void *pslot)
+{
+ struct nfs4_sequence_args *args = task->tk_msg.rpc_argp;
+ struct nfs4_sequence_res *res = task->tk_msg.rpc_resp;
+ struct nfs4_slot *slot = pslot;
+ struct nfs4_slot_table *tbl = slot->table;
+
+ if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
+ return false;
+ slot->generation = tbl->generation;
+ args->sa_slot = slot;
+ res->sr_timestamp = jiffies;
+ res->sr_slot = slot;
+ res->sr_status_flags = 0;
+ res->sr_status = 1;
+ return true;
+}
+
+static bool __nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
+ struct nfs4_slot *slot)
+{
+ if (rpc_wake_up_first(&tbl->slot_tbl_waitq, nfs41_assign_slot, slot))
+ return true;
+ return false;
+}
+
+bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
+ struct nfs4_slot *slot)
+{
+ if (slot->slot_nr > tbl->max_slotid)
+ return false;
+ return __nfs41_wake_and_assign_slot(tbl, slot);
+}
+
+static bool nfs41_try_wake_next_slot_table_entry(struct nfs4_slot_table *tbl)
+{
+ struct nfs4_slot *slot = nfs4_alloc_slot(tbl);
+ if (!IS_ERR(slot)) {
+ bool ret = __nfs41_wake_and_assign_slot(tbl, slot);
+ if (ret)
+ return ret;
+ nfs4_free_slot(tbl, slot);
+ }
+ return false;
+}
+
+void nfs41_wake_slot_table(struct nfs4_slot_table *tbl)
+{
+ for (;;) {
+ if (!nfs41_try_wake_next_slot_table_entry(tbl))
+ break;
+ }
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+static void nfs41_set_max_slotid_locked(struct nfs4_slot_table *tbl,
+ u32 target_highest_slotid)
+{
+ u32 max_slotid;
+
+ max_slotid = min(NFS4_MAX_SLOT_TABLE - 1, target_highest_slotid);
+ if (max_slotid > tbl->server_highest_slotid)
+ max_slotid = tbl->server_highest_slotid;
+ if (max_slotid > tbl->target_highest_slotid)
+ max_slotid = tbl->target_highest_slotid;
+ tbl->max_slotid = max_slotid;
+ nfs41_wake_slot_table(tbl);
+}
+
+/* Update the client's idea of target_highest_slotid */
+static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl,
+ u32 target_highest_slotid)
+{
+ if (tbl->target_highest_slotid == target_highest_slotid)
+ return;
+ tbl->target_highest_slotid = target_highest_slotid;
+ tbl->generation++;
+}
+
+void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
+ u32 target_highest_slotid)
+{
+ spin_lock(&tbl->slot_tbl_lock);
+ nfs41_set_target_slotid_locked(tbl, target_highest_slotid);
+ tbl->d_target_highest_slotid = 0;
+ tbl->d2_target_highest_slotid = 0;
+ nfs41_set_max_slotid_locked(tbl, target_highest_slotid);
+ spin_unlock(&tbl->slot_tbl_lock);
+}
+
+static void nfs41_set_server_slotid_locked(struct nfs4_slot_table *tbl,
+ u32 highest_slotid)
+{
+ if (tbl->server_highest_slotid == highest_slotid)
+ return;
+ if (tbl->highest_used_slotid > highest_slotid)
+ return;
+ /* Deallocate slots */
+ nfs4_shrink_slot_table(tbl, highest_slotid + 1);
+ tbl->server_highest_slotid = highest_slotid;
+}
+
+static s32 nfs41_derivative_target_slotid(s32 s1, s32 s2)
+{
+ s1 -= s2;
+ if (s1 == 0)
+ return 0;
+ if (s1 < 0)
+ return (s1 - 1) >> 1;
+ return (s1 + 1) >> 1;
+}
+
+static int nfs41_sign_s32(s32 s1)
+{
+ if (s1 > 0)
+ return 1;
+ if (s1 < 0)
+ return -1;
+ return 0;
+}
+
+static bool nfs41_same_sign_or_zero_s32(s32 s1, s32 s2)
+{
+ if (!s1 || !s2)
+ return true;
+ return nfs41_sign_s32(s1) == nfs41_sign_s32(s2);
+}
+
+/* Try to eliminate outliers by checking for sharp changes in the
+ * derivatives and second derivatives
+ */
+static bool nfs41_is_outlier_target_slotid(struct nfs4_slot_table *tbl,
+ u32 new_target)
+{
+ s32 d_target, d2_target;
+ bool ret = true;
+
+ d_target = nfs41_derivative_target_slotid(new_target,
+ tbl->target_highest_slotid);
+ d2_target = nfs41_derivative_target_slotid(d_target,
+ tbl->d_target_highest_slotid);
+ /* Is first derivative same sign? */
+ if (nfs41_same_sign_or_zero_s32(d_target, tbl->d_target_highest_slotid))
+ ret = false;
+ /* Is second derivative same sign? */
+ if (nfs41_same_sign_or_zero_s32(d2_target, tbl->d2_target_highest_slotid))
+ ret = false;
+ tbl->d_target_highest_slotid = d_target;
+ tbl->d2_target_highest_slotid = d2_target;
+ return ret;
+}
+
+void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
+ struct nfs4_slot *slot,
+ struct nfs4_sequence_res *res)
+{
+ u32 target_highest_slotid = min(res->sr_target_highest_slotid,
+ NFS4_MAX_SLOTID);
+ u32 highest_slotid = min(res->sr_highest_slotid, NFS4_MAX_SLOTID);
+
+ spin_lock(&tbl->slot_tbl_lock);
+ if (!nfs41_is_outlier_target_slotid(tbl, target_highest_slotid))
+ nfs41_set_target_slotid_locked(tbl, target_highest_slotid);
+ if (tbl->generation == slot->generation)
+ nfs41_set_server_slotid_locked(tbl, highest_slotid);
+ nfs41_set_max_slotid_locked(tbl, target_highest_slotid);
+ spin_unlock(&tbl->slot_tbl_lock);
+}
+
+static void nfs4_release_session_slot_tables(struct nfs4_session *session)
+{
+ nfs4_release_slot_table(&session->fc_slot_table);
+ nfs4_release_slot_table(&session->bc_slot_table);
+}
+
+/*
+ * Initialize or reset the forechannel and backchannel tables
+ */
+int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
+{
+ struct nfs4_slot_table *tbl;
+ int status;
+
+ dprintk("--> %s\n", __func__);
+ /* Fore channel */
+ tbl = &ses->fc_slot_table;
+ tbl->session = ses;
+ status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
+ if (status || !(ses->flags & SESSION4_BACK_CHAN)) /* -ENOMEM */
+ return status;
+ /* Back channel */
+ tbl = &ses->bc_slot_table;
+ tbl->session = ses;
+ status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
+ if (status && tbl->slots == NULL)
+ /* Fore and back channel share a connection so get
+ * both slot tables or neither */
+ nfs4_release_session_slot_tables(ses);
+ return status;
+}
+
+struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
+{
+ struct nfs4_session *session;
+
+ session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
+ if (!session)
+ return NULL;
+
+ nfs4_init_slot_table(&session->fc_slot_table, "ForeChannel Slot table");
+ nfs4_init_slot_table(&session->bc_slot_table, "BackChannel Slot table");
+ session->session_state = 1<<NFS4_SESSION_INITING;
+
+ session->clp = clp;
+ return session;
+}
+
+static void nfs4_destroy_session_slot_tables(struct nfs4_session *session)
+{
+ nfs4_shutdown_slot_table(&session->fc_slot_table);
+ nfs4_shutdown_slot_table(&session->bc_slot_table);
+}
+
+void nfs4_destroy_session(struct nfs4_session *session)
+{
+ struct rpc_xprt *xprt;
+ const struct cred *cred;
+
+ cred = nfs4_get_clid_cred(session->clp);
+ nfs4_proc_destroy_session(session, cred);
+ put_cred(cred);
+
+ rcu_read_lock();
+ xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt);
+ rcu_read_unlock();
+ dprintk("%s Destroy backchannel for xprt %p\n",
+ __func__, xprt);
+ xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
+ nfs4_destroy_session_slot_tables(session);
+ kfree(session);
+}
+
+/*
+ * With sessions, the client is not marked ready until after a
+ * successful EXCHANGE_ID and CREATE_SESSION.
+ *
+ * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate
+ * other versions of NFS can be tried.
+ */
+static int nfs41_check_session_ready(struct nfs_client *clp)
+{
+ int ret;
+
+ if (clp->cl_cons_state == NFS_CS_SESSION_INITING) {
+ ret = nfs4_client_recover_expired_lease(clp);
+ if (ret)
+ return ret;
+ }
+ if (clp->cl_cons_state < NFS_CS_READY)
+ return -EPROTONOSUPPORT;
+ smp_rmb();
+ return 0;
+}
+
+int nfs4_init_session(struct nfs_client *clp)
+{
+ if (!nfs4_has_session(clp))
+ return 0;
+
+ clear_bit(NFS4_SESSION_INITING, &clp->cl_session->session_state);
+ return nfs41_check_session_ready(clp);
+}
+
+int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time)
+{
+ struct nfs4_session *session = clp->cl_session;
+ int ret;
+
+ spin_lock(&clp->cl_lock);
+ if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
+ /*
+ * Do not set NFS_CS_CHECK_LEASE_TIME instead set the
+ * DS lease to be equal to the MDS lease.
+ */
+ clp->cl_lease_time = lease_time;
+ clp->cl_last_renewal = jiffies;
+ }
+ spin_unlock(&clp->cl_lock);
+
+ ret = nfs41_check_session_ready(clp);
+ if (ret)
+ return ret;
+ /* Test for the DS role */
+ if (!is_ds_client(clp))
+ return -ENODEV;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
+
+#endif /* defined(CONFIG_NFS_V4_1) */
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
new file mode 100644
index 0000000000..351616c61d
--- /dev/null
+++ b/fs/nfs/nfs4session.h
@@ -0,0 +1,185 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * fs/nfs/nfs4session.h
+ *
+ * Copyright (c) 2012 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ */
+#ifndef __LINUX_FS_NFS_NFS4SESSION_H
+#define __LINUX_FS_NFS_NFS4SESSION_H
+
+/* maximum number of slots to use */
+#define NFS4_DEF_SLOT_TABLE_SIZE (64U)
+#define NFS4_DEF_CB_SLOT_TABLE_SIZE (16U)
+#define NFS4_MAX_SLOT_TABLE (1024U)
+#define NFS4_MAX_SLOTID (NFS4_MAX_SLOT_TABLE - 1U)
+#define NFS4_NO_SLOT ((u32)-1)
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+
+/* Sessions slot seqid */
+struct nfs4_slot {
+ struct nfs4_slot_table *table;
+ struct nfs4_slot *next;
+ unsigned long generation;
+ u32 slot_nr;
+ u32 seq_nr;
+ u32 seq_nr_last_acked;
+ u32 seq_nr_highest_sent;
+ unsigned int privileged : 1,
+ seq_done : 1;
+};
+
+/* Sessions */
+enum nfs4_slot_tbl_state {
+ NFS4_SLOT_TBL_DRAINING,
+};
+
+#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, BITS_PER_LONG)
+struct nfs4_slot_table {
+ struct nfs4_session *session; /* Parent session */
+ struct nfs4_slot *slots; /* seqid per slot */
+ unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */
+ spinlock_t slot_tbl_lock;
+ struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */
+ wait_queue_head_t slot_waitq; /* Completion wait on slot */
+ u32 max_slots; /* # slots in table */
+ u32 max_slotid; /* Max allowed slotid value */
+ u32 highest_used_slotid; /* sent to server on each SEQ.
+ * op for dynamic resizing */
+ u32 target_highest_slotid; /* Server max_slot target */
+ u32 server_highest_slotid; /* Server highest slotid */
+ s32 d_target_highest_slotid; /* Derivative */
+ s32 d2_target_highest_slotid; /* 2nd derivative */
+ unsigned long generation; /* Generation counter for
+ target_highest_slotid */
+ struct completion complete;
+ unsigned long slot_tbl_state;
+};
+
+/*
+ * Session related parameters
+ */
+struct nfs4_session {
+ struct nfs4_sessionid sess_id;
+ u32 flags;
+ unsigned long session_state;
+ u32 hash_alg;
+ u32 ssv_len;
+
+ /* The fore and back channel */
+ struct nfs4_channel_attrs fc_attrs;
+ struct nfs4_slot_table fc_slot_table;
+ struct nfs4_channel_attrs bc_attrs;
+ struct nfs4_slot_table bc_slot_table;
+ struct nfs_client *clp;
+};
+
+enum nfs4_session_state {
+ NFS4_SESSION_INITING,
+ NFS4_SESSION_ESTABLISHED,
+};
+
+extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
+ unsigned int max_reqs, const char *queue);
+extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl);
+extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
+extern struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid);
+extern int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl,
+ u32 slotid, u32 seq_nr,
+ unsigned long timeout);
+extern bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
+extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
+extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
+bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
+ struct nfs4_slot *slot);
+void nfs41_wake_slot_table(struct nfs4_slot_table *tbl);
+
+static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl)
+{
+ return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
+}
+
+static inline bool nfs4_test_locked_slot(const struct nfs4_slot_table *tbl,
+ u32 slotid)
+{
+ return !!test_bit(slotid, tbl->used_slots);
+}
+
+static inline struct nfs4_session *nfs4_get_session(const struct nfs_client *clp)
+{
+ return clp->cl_session;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
+ u32 target_highest_slotid);
+extern void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
+ struct nfs4_slot *slot,
+ struct nfs4_sequence_res *res);
+
+extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses);
+
+extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
+extern void nfs4_destroy_session(struct nfs4_session *session);
+extern int nfs4_init_session(struct nfs_client *clp);
+extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
+
+/*
+ * Determine if sessions are in use.
+ */
+static inline int nfs4_has_session(const struct nfs_client *clp)
+{
+ if (clp->cl_session)
+ return 1;
+ return 0;
+}
+
+static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
+{
+ if (nfs4_has_session(clp))
+ return (clp->cl_session->flags & SESSION4_PERSIST);
+ return 0;
+}
+
+static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst,
+ const struct nfs4_sessionid *src)
+{
+ memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN);
+}
+
+#ifdef CONFIG_CRC32
+/*
+ * nfs_session_id_hash - calculate the crc32 hash for the session id
+ * @session - pointer to session
+ */
+#define nfs_session_id_hash(sess_id) \
+ (~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data)))
+#else
+#define nfs_session_id_hash(session) (0)
+#endif
+#else /* defined(CONFIG_NFS_V4_1) */
+
+static inline int nfs4_init_session(struct nfs_client *clp)
+{
+ return 0;
+}
+
+/*
+ * Determine if sessions are in use.
+ */
+static inline int nfs4_has_session(const struct nfs_client *clp)
+{
+ return 0;
+}
+
+static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
+{
+ return 0;
+}
+
+#define nfs_session_id_hash(session) (0)
+
+#endif /* defined(CONFIG_NFS_V4_1) */
+#endif /* IS_ENABLED(CONFIG_NFS_V4) */
+#endif /* __LINUX_FS_NFS_NFS4SESSION_H */
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
new file mode 100644
index 0000000000..9a5d911a7e
--- /dev/null
+++ b/fs/nfs/nfs4state.c
@@ -0,0 +1,2786 @@
+/*
+ * fs/nfs/nfs4state.c
+ *
+ * Client-side XDR for NFSv4.
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Kendrick Smith <kmsmith@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Implementation of the NFSv4 state model. For the time being,
+ * this is minimal, but will be made much more complex in a
+ * subsequent patch.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/nfs_fs.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/ratelimit.h>
+#include <linux/workqueue.h>
+#include <linux/bitops.h>
+#include <linux/jiffies.h>
+#include <linux/sched/mm.h>
+
+#include <linux/sunrpc/clnt.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "internal.h"
+#include "nfs4idmap.h"
+#include "nfs4session.h"
+#include "pnfs.h"
+#include "netns.h"
+#include "nfs4trace.h"
+
+#define NFSDBG_FACILITY NFSDBG_STATE
+
+#define OPENOWNER_POOL_SIZE 8
+
+static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp);
+
+const nfs4_stateid zero_stateid = {
+ { .data = { 0 } },
+ .type = NFS4_SPECIAL_STATEID_TYPE,
+};
+const nfs4_stateid invalid_stateid = {
+ {
+ /* Funky initialiser keeps older gcc versions happy */
+ .data = { 0xff, 0xff, 0xff, 0xff, 0 },
+ },
+ .type = NFS4_INVALID_STATEID_TYPE,
+};
+
+const nfs4_stateid current_stateid = {
+ {
+ /* Funky initialiser keeps older gcc versions happy */
+ .data = { 0x0, 0x0, 0x0, 0x1, 0 },
+ },
+ .type = NFS4_SPECIAL_STATEID_TYPE,
+};
+
+static DEFINE_MUTEX(nfs_clid_init_mutex);
+
+static int nfs4_setup_state_renewal(struct nfs_client *clp)
+{
+ int status;
+ struct nfs_fsinfo fsinfo;
+
+ if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
+ nfs4_schedule_state_renewal(clp);
+ return 0;
+ }
+
+ status = nfs4_proc_get_lease_time(clp, &fsinfo);
+ if (status == 0) {
+ nfs4_set_lease_period(clp, fsinfo.lease_time * HZ);
+ nfs4_schedule_state_renewal(clp);
+ }
+
+ return status;
+}
+
+int nfs4_init_clientid(struct nfs_client *clp, const struct cred *cred)
+{
+ struct nfs4_setclientid_res clid = {
+ .clientid = clp->cl_clientid,
+ .confirm = clp->cl_confirm,
+ };
+ unsigned short port;
+ int status;
+ struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+
+ if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state))
+ goto do_confirm;
+ port = nn->nfs_callback_tcpport;
+ if (clp->cl_addr.ss_family == AF_INET6)
+ port = nn->nfs_callback_tcpport6;
+
+ status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
+ if (status != 0)
+ goto out;
+ clp->cl_clientid = clid.clientid;
+ clp->cl_confirm = clid.confirm;
+ set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+do_confirm:
+ status = nfs4_proc_setclientid_confirm(clp, &clid, cred);
+ if (status != 0)
+ goto out;
+ clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+ nfs4_setup_state_renewal(clp);
+out:
+ return status;
+}
+
+/**
+ * nfs40_discover_server_trunking - Detect server IP address trunking (mv0)
+ *
+ * @clp: nfs_client under test
+ * @result: OUT: found nfs_client, or clp
+ * @cred: credential to use for trunking test
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status.
+ * If zero is returned, an nfs_client pointer is planted in
+ * "result".
+ *
+ * Note: The returned client may not yet be marked ready.
+ */
+int nfs40_discover_server_trunking(struct nfs_client *clp,
+ struct nfs_client **result,
+ const struct cred *cred)
+{
+ struct nfs4_setclientid_res clid = {
+ .clientid = clp->cl_clientid,
+ .confirm = clp->cl_confirm,
+ };
+ struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+ unsigned short port;
+ int status;
+
+ port = nn->nfs_callback_tcpport;
+ if (clp->cl_addr.ss_family == AF_INET6)
+ port = nn->nfs_callback_tcpport6;
+
+ status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
+ if (status != 0)
+ goto out;
+ clp->cl_clientid = clid.clientid;
+ clp->cl_confirm = clid.confirm;
+
+ status = nfs40_walk_client_list(clp, result, cred);
+ if (status == 0) {
+ /* Sustain the lease, even if it's empty. If the clientid4
+ * goes stale it's of no use for trunking discovery. */
+ nfs4_schedule_state_renewal(*result);
+
+ /* If the client state need to recover, do it. */
+ if (clp->cl_state)
+ nfs4_schedule_state_manager(clp);
+ }
+out:
+ return status;
+}
+
+const struct cred *nfs4_get_machine_cred(struct nfs_client *clp)
+{
+ return get_cred(rpc_machine_cred());
+}
+
+static void nfs4_root_machine_cred(struct nfs_client *clp)
+{
+
+ /* Force root creds instead of machine */
+ clp->cl_principal = NULL;
+ clp->cl_rpcclient->cl_principal = NULL;
+}
+
+static const struct cred *
+nfs4_get_renew_cred_server_locked(struct nfs_server *server)
+{
+ const struct cred *cred = NULL;
+ struct nfs4_state_owner *sp;
+ struct rb_node *pos;
+
+ for (pos = rb_first(&server->state_owners);
+ pos != NULL;
+ pos = rb_next(pos)) {
+ sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
+ if (list_empty(&sp->so_states))
+ continue;
+ cred = get_cred(sp->so_cred);
+ break;
+ }
+ return cred;
+}
+
+/**
+ * nfs4_get_renew_cred - Acquire credential for a renew operation
+ * @clp: client state handle
+ *
+ * Returns an rpc_cred with reference count bumped, or NULL.
+ * Caller must hold clp->cl_lock.
+ */
+const struct cred *nfs4_get_renew_cred(struct nfs_client *clp)
+{
+ const struct cred *cred = NULL;
+ struct nfs_server *server;
+
+ /* Use machine credentials if available */
+ cred = nfs4_get_machine_cred(clp);
+ if (cred != NULL)
+ goto out;
+
+ spin_lock(&clp->cl_lock);
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ cred = nfs4_get_renew_cred_server_locked(server);
+ if (cred != NULL)
+ break;
+ }
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+
+out:
+ return cred;
+}
+
+static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl)
+{
+ if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
+ spin_lock(&tbl->slot_tbl_lock);
+ nfs41_wake_slot_table(tbl);
+ spin_unlock(&tbl->slot_tbl_lock);
+ }
+}
+
+static void nfs4_end_drain_session(struct nfs_client *clp)
+{
+ struct nfs4_session *ses = clp->cl_session;
+
+ if (clp->cl_slot_tbl) {
+ nfs4_end_drain_slot_table(clp->cl_slot_tbl);
+ return;
+ }
+
+ if (ses != NULL) {
+ nfs4_end_drain_slot_table(&ses->bc_slot_table);
+ nfs4_end_drain_slot_table(&ses->fc_slot_table);
+ }
+}
+
+static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl)
+{
+ set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
+ spin_lock(&tbl->slot_tbl_lock);
+ if (tbl->highest_used_slotid != NFS4_NO_SLOT) {
+ reinit_completion(&tbl->complete);
+ spin_unlock(&tbl->slot_tbl_lock);
+ return wait_for_completion_interruptible(&tbl->complete);
+ }
+ spin_unlock(&tbl->slot_tbl_lock);
+ return 0;
+}
+
+static int nfs4_begin_drain_session(struct nfs_client *clp)
+{
+ struct nfs4_session *ses = clp->cl_session;
+ int ret;
+
+ if (clp->cl_slot_tbl)
+ return nfs4_drain_slot_tbl(clp->cl_slot_tbl);
+
+ /* back channel */
+ ret = nfs4_drain_slot_tbl(&ses->bc_slot_table);
+ if (ret)
+ return ret;
+ /* fore channel */
+ return nfs4_drain_slot_tbl(&ses->fc_slot_table);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+static void nfs41_finish_session_reset(struct nfs_client *clp)
+{
+ clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+ clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+ /* create_session negotiated new slot table */
+ clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+ nfs4_setup_state_renewal(clp);
+}
+
+int nfs41_init_clientid(struct nfs_client *clp, const struct cred *cred)
+{
+ int status;
+
+ if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state))
+ goto do_confirm;
+ status = nfs4_proc_exchange_id(clp, cred);
+ if (status != 0)
+ goto out;
+ set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+do_confirm:
+ status = nfs4_proc_create_session(clp, cred);
+ if (status != 0)
+ goto out;
+ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R))
+ nfs4_state_start_reclaim_reboot(clp);
+ nfs41_finish_session_reset(clp);
+ nfs_mark_client_ready(clp, NFS_CS_READY);
+out:
+ return status;
+}
+
+/**
+ * nfs41_discover_server_trunking - Detect server IP address trunking (mv1)
+ *
+ * @clp: nfs_client under test
+ * @result: OUT: found nfs_client, or clp
+ * @cred: credential to use for trunking test
+ *
+ * Returns NFS4_OK, a negative errno, or a negative NFS4ERR status.
+ * If NFS4_OK is returned, an nfs_client pointer is planted in
+ * "result".
+ *
+ * Note: The returned client may not yet be marked ready.
+ */
+int nfs41_discover_server_trunking(struct nfs_client *clp,
+ struct nfs_client **result,
+ const struct cred *cred)
+{
+ int status;
+
+ status = nfs4_proc_exchange_id(clp, cred);
+ if (status != NFS4_OK)
+ return status;
+
+ status = nfs41_walk_client_list(clp, result, cred);
+ if (status < 0)
+ return status;
+ if (clp != *result)
+ return 0;
+
+ /*
+ * Purge state if the client id was established in a prior
+ * instance and the client id could not have arrived on the
+ * server via Transparent State Migration.
+ */
+ if (clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R) {
+ if (!test_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags))
+ set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
+ else
+ set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+ }
+ nfs4_schedule_state_manager(clp);
+ status = nfs_wait_client_init_complete(clp);
+ if (status < 0)
+ nfs_put_client(clp);
+ return status;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_get_clid_cred - Acquire credential for a setclientid operation
+ * @clp: client state handle
+ *
+ * Returns a cred with reference count bumped, or NULL.
+ */
+const struct cred *nfs4_get_clid_cred(struct nfs_client *clp)
+{
+ const struct cred *cred;
+
+ cred = nfs4_get_machine_cred(clp);
+ return cred;
+}
+
+static struct nfs4_state_owner *
+nfs4_find_state_owner_locked(struct nfs_server *server, const struct cred *cred)
+{
+ struct rb_node **p = &server->state_owners.rb_node,
+ *parent = NULL;
+ struct nfs4_state_owner *sp;
+ int cmp;
+
+ while (*p != NULL) {
+ parent = *p;
+ sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
+ cmp = cred_fscmp(cred, sp->so_cred);
+
+ if (cmp < 0)
+ p = &parent->rb_left;
+ else if (cmp > 0)
+ p = &parent->rb_right;
+ else {
+ if (!list_empty(&sp->so_lru))
+ list_del_init(&sp->so_lru);
+ atomic_inc(&sp->so_count);
+ return sp;
+ }
+ }
+ return NULL;
+}
+
+static struct nfs4_state_owner *
+nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
+{
+ struct nfs_server *server = new->so_server;
+ struct rb_node **p = &server->state_owners.rb_node,
+ *parent = NULL;
+ struct nfs4_state_owner *sp;
+ int cmp;
+
+ while (*p != NULL) {
+ parent = *p;
+ sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
+ cmp = cred_fscmp(new->so_cred, sp->so_cred);
+
+ if (cmp < 0)
+ p = &parent->rb_left;
+ else if (cmp > 0)
+ p = &parent->rb_right;
+ else {
+ if (!list_empty(&sp->so_lru))
+ list_del_init(&sp->so_lru);
+ atomic_inc(&sp->so_count);
+ return sp;
+ }
+ }
+ rb_link_node(&new->so_server_node, parent, p);
+ rb_insert_color(&new->so_server_node, &server->state_owners);
+ return new;
+}
+
+static void
+nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
+{
+ struct nfs_server *server = sp->so_server;
+
+ if (!RB_EMPTY_NODE(&sp->so_server_node))
+ rb_erase(&sp->so_server_node, &server->state_owners);
+}
+
+static void
+nfs4_init_seqid_counter(struct nfs_seqid_counter *sc)
+{
+ sc->create_time = ktime_get();
+ sc->flags = 0;
+ sc->counter = 0;
+ spin_lock_init(&sc->lock);
+ INIT_LIST_HEAD(&sc->list);
+ rpc_init_wait_queue(&sc->wait, "Seqid_waitqueue");
+}
+
+static void
+nfs4_destroy_seqid_counter(struct nfs_seqid_counter *sc)
+{
+ rpc_destroy_wait_queue(&sc->wait);
+}
+
+/*
+ * nfs4_alloc_state_owner(): this is called on the OPEN or CREATE path to
+ * create a new state_owner.
+ *
+ */
+static struct nfs4_state_owner *
+nfs4_alloc_state_owner(struct nfs_server *server,
+ const struct cred *cred,
+ gfp_t gfp_flags)
+{
+ struct nfs4_state_owner *sp;
+
+ sp = kzalloc(sizeof(*sp), gfp_flags);
+ if (!sp)
+ return NULL;
+ sp->so_seqid.owner_id = ida_alloc(&server->openowner_id, gfp_flags);
+ if (sp->so_seqid.owner_id < 0) {
+ kfree(sp);
+ return NULL;
+ }
+ sp->so_server = server;
+ sp->so_cred = get_cred(cred);
+ spin_lock_init(&sp->so_lock);
+ INIT_LIST_HEAD(&sp->so_states);
+ nfs4_init_seqid_counter(&sp->so_seqid);
+ atomic_set(&sp->so_count, 1);
+ INIT_LIST_HEAD(&sp->so_lru);
+ seqcount_spinlock_init(&sp->so_reclaim_seqcount, &sp->so_lock);
+ mutex_init(&sp->so_delegreturn_mutex);
+ return sp;
+}
+
+static void
+nfs4_reset_state_owner(struct nfs4_state_owner *sp)
+{
+ /* This state_owner is no longer usable, but must
+ * remain in place so that state recovery can find it
+ * and the opens associated with it.
+ * It may also be used for new 'open' request to
+ * return a delegation to the server.
+ * So update the 'create_time' so that it looks like
+ * a new state_owner. This will cause the server to
+ * request an OPEN_CONFIRM to start a new sequence.
+ */
+ sp->so_seqid.create_time = ktime_get();
+}
+
+static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
+{
+ nfs4_destroy_seqid_counter(&sp->so_seqid);
+ put_cred(sp->so_cred);
+ ida_free(&sp->so_server->openowner_id, sp->so_seqid.owner_id);
+ kfree(sp);
+}
+
+static void nfs4_gc_state_owners(struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state_owner *sp, *tmp;
+ unsigned long time_min, time_max;
+ LIST_HEAD(doomed);
+
+ spin_lock(&clp->cl_lock);
+ time_max = jiffies;
+ time_min = (long)time_max - (long)clp->cl_lease_time;
+ list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
+ /* NB: LRU is sorted so that oldest is at the head */
+ if (time_in_range(sp->so_expires, time_min, time_max))
+ break;
+ list_move(&sp->so_lru, &doomed);
+ nfs4_remove_state_owner_locked(sp);
+ }
+ spin_unlock(&clp->cl_lock);
+
+ list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
+ list_del(&sp->so_lru);
+ nfs4_free_state_owner(sp);
+ }
+}
+
+/**
+ * nfs4_get_state_owner - Look up a state owner given a credential
+ * @server: nfs_server to search
+ * @cred: RPC credential to match
+ * @gfp_flags: allocation mode
+ *
+ * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
+ */
+struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
+ const struct cred *cred,
+ gfp_t gfp_flags)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state_owner *sp, *new;
+
+ spin_lock(&clp->cl_lock);
+ sp = nfs4_find_state_owner_locked(server, cred);
+ spin_unlock(&clp->cl_lock);
+ if (sp != NULL)
+ goto out;
+ new = nfs4_alloc_state_owner(server, cred, gfp_flags);
+ if (new == NULL)
+ goto out;
+ spin_lock(&clp->cl_lock);
+ sp = nfs4_insert_state_owner_locked(new);
+ spin_unlock(&clp->cl_lock);
+ if (sp != new)
+ nfs4_free_state_owner(new);
+out:
+ nfs4_gc_state_owners(server);
+ return sp;
+}
+
+/**
+ * nfs4_put_state_owner - Release a nfs4_state_owner
+ * @sp: state owner data to release
+ *
+ * Note that we keep released state owners on an LRU
+ * list.
+ * This caches valid state owners so that they can be
+ * reused, to avoid the OPEN_CONFIRM on minor version 0.
+ * It also pins the uniquifier of dropped state owners for
+ * a while, to ensure that those state owner names are
+ * never reused.
+ */
+void nfs4_put_state_owner(struct nfs4_state_owner *sp)
+{
+ struct nfs_server *server = sp->so_server;
+ struct nfs_client *clp = server->nfs_client;
+
+ if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
+ return;
+
+ sp->so_expires = jiffies;
+ list_add_tail(&sp->so_lru, &server->state_owners_lru);
+ spin_unlock(&clp->cl_lock);
+}
+
+/**
+ * nfs4_purge_state_owners - Release all cached state owners
+ * @server: nfs_server with cached state owners to release
+ * @head: resulting list of state owners
+ *
+ * Called at umount time. Remaining state owners will be on
+ * the LRU with ref count of zero.
+ * Note that the state owners are not freed, but are added
+ * to the list @head, which can later be used as an argument
+ * to nfs4_free_state_owners.
+ */
+void nfs4_purge_state_owners(struct nfs_server *server, struct list_head *head)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state_owner *sp, *tmp;
+
+ spin_lock(&clp->cl_lock);
+ list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
+ list_move(&sp->so_lru, head);
+ nfs4_remove_state_owner_locked(sp);
+ }
+ spin_unlock(&clp->cl_lock);
+}
+
+/**
+ * nfs4_free_state_owners - Release all cached state owners
+ * @head: resulting list of state owners
+ *
+ * Frees a list of state owners that was generated by
+ * nfs4_purge_state_owners
+ */
+void nfs4_free_state_owners(struct list_head *head)
+{
+ struct nfs4_state_owner *sp, *tmp;
+
+ list_for_each_entry_safe(sp, tmp, head, so_lru) {
+ list_del(&sp->so_lru);
+ nfs4_free_state_owner(sp);
+ }
+}
+
+static struct nfs4_state *
+nfs4_alloc_open_state(void)
+{
+ struct nfs4_state *state;
+
+ state = kzalloc(sizeof(*state), GFP_KERNEL_ACCOUNT);
+ if (!state)
+ return NULL;
+ refcount_set(&state->count, 1);
+ INIT_LIST_HEAD(&state->lock_states);
+ spin_lock_init(&state->state_lock);
+ seqlock_init(&state->seqlock);
+ init_waitqueue_head(&state->waitq);
+ return state;
+}
+
+void
+nfs4_state_set_mode_locked(struct nfs4_state *state, fmode_t fmode)
+{
+ if (state->state == fmode)
+ return;
+ /* NB! List reordering - see the reclaim code for why. */
+ if ((fmode & FMODE_WRITE) != (state->state & FMODE_WRITE)) {
+ if (fmode & FMODE_WRITE)
+ list_move(&state->open_states, &state->owner->so_states);
+ else
+ list_move_tail(&state->open_states, &state->owner->so_states);
+ }
+ state->state = fmode;
+}
+
+static struct nfs4_state *
+__nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs4_state *state;
+
+ list_for_each_entry_rcu(state, &nfsi->open_states, inode_states) {
+ if (state->owner != owner)
+ continue;
+ if (!nfs4_valid_open_stateid(state))
+ continue;
+ if (refcount_inc_not_zero(&state->count))
+ return state;
+ }
+ return NULL;
+}
+
+static void
+nfs4_free_open_state(struct nfs4_state *state)
+{
+ kfree_rcu(state, rcu_head);
+}
+
+struct nfs4_state *
+nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner)
+{
+ struct nfs4_state *state, *new;
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ rcu_read_lock();
+ state = __nfs4_find_state_byowner(inode, owner);
+ rcu_read_unlock();
+ if (state)
+ goto out;
+ new = nfs4_alloc_open_state();
+ spin_lock(&owner->so_lock);
+ spin_lock(&inode->i_lock);
+ state = __nfs4_find_state_byowner(inode, owner);
+ if (state == NULL && new != NULL) {
+ state = new;
+ state->owner = owner;
+ atomic_inc(&owner->so_count);
+ ihold(inode);
+ state->inode = inode;
+ list_add_rcu(&state->inode_states, &nfsi->open_states);
+ spin_unlock(&inode->i_lock);
+ /* Note: The reclaim code dictates that we add stateless
+ * and read-only stateids to the end of the list */
+ list_add_tail(&state->open_states, &owner->so_states);
+ spin_unlock(&owner->so_lock);
+ } else {
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&owner->so_lock);
+ if (new)
+ nfs4_free_open_state(new);
+ }
+out:
+ return state;
+}
+
+void nfs4_put_open_state(struct nfs4_state *state)
+{
+ struct inode *inode = state->inode;
+ struct nfs4_state_owner *owner = state->owner;
+
+ if (!refcount_dec_and_lock(&state->count, &owner->so_lock))
+ return;
+ spin_lock(&inode->i_lock);
+ list_del_rcu(&state->inode_states);
+ list_del(&state->open_states);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&owner->so_lock);
+ nfs4_inode_return_delegation_on_close(inode);
+ iput(inode);
+ nfs4_free_open_state(state);
+ nfs4_put_state_owner(owner);
+}
+
+/*
+ * Close the current file.
+ */
+static void __nfs4_close(struct nfs4_state *state,
+ fmode_t fmode, gfp_t gfp_mask, int wait)
+{
+ struct nfs4_state_owner *owner = state->owner;
+ int call_close = 0;
+ fmode_t newstate;
+
+ atomic_inc(&owner->so_count);
+ /* Protect against nfs4_find_state() */
+ spin_lock(&owner->so_lock);
+ switch (fmode & (FMODE_READ | FMODE_WRITE)) {
+ case FMODE_READ:
+ state->n_rdonly--;
+ break;
+ case FMODE_WRITE:
+ state->n_wronly--;
+ break;
+ case FMODE_READ|FMODE_WRITE:
+ state->n_rdwr--;
+ }
+ newstate = FMODE_READ|FMODE_WRITE;
+ if (state->n_rdwr == 0) {
+ if (state->n_rdonly == 0) {
+ newstate &= ~FMODE_READ;
+ call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags);
+ call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
+ }
+ if (state->n_wronly == 0) {
+ newstate &= ~FMODE_WRITE;
+ call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags);
+ call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
+ }
+ if (newstate == 0)
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ }
+ nfs4_state_set_mode_locked(state, newstate);
+ spin_unlock(&owner->so_lock);
+
+ if (!call_close) {
+ nfs4_put_open_state(state);
+ nfs4_put_state_owner(owner);
+ } else
+ nfs4_do_close(state, gfp_mask, wait);
+}
+
+void nfs4_close_state(struct nfs4_state *state, fmode_t fmode)
+{
+ __nfs4_close(state, fmode, GFP_KERNEL, 0);
+}
+
+void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
+{
+ __nfs4_close(state, fmode, GFP_KERNEL, 1);
+}
+
+/*
+ * Search the state->lock_states for an existing lock_owner
+ * that is compatible with either of the given owners.
+ * If the second is non-zero, then the first refers to a Posix-lock
+ * owner (current->files) and the second refers to a flock/OFD
+ * owner (struct file*). In that case, prefer a match for the first
+ * owner.
+ * If both sorts of locks are held on the one file we cannot know
+ * which stateid was intended to be used, so a "correct" choice cannot
+ * be made. Failing that, a "consistent" choice is preferable. The
+ * consistent choice we make is to prefer the first owner, that of a
+ * Posix lock.
+ */
+static struct nfs4_lock_state *
+__nfs4_find_lock_state(struct nfs4_state *state,
+ fl_owner_t fl_owner, fl_owner_t fl_owner2)
+{
+ struct nfs4_lock_state *pos, *ret = NULL;
+ list_for_each_entry(pos, &state->lock_states, ls_locks) {
+ if (pos->ls_owner == fl_owner) {
+ ret = pos;
+ break;
+ }
+ if (pos->ls_owner == fl_owner2)
+ ret = pos;
+ }
+ if (ret)
+ refcount_inc(&ret->ls_count);
+ return ret;
+}
+
+/*
+ * Return a compatible lock_state. If no initialized lock_state structure
+ * exists, return an uninitialized one.
+ *
+ */
+static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+{
+ struct nfs4_lock_state *lsp;
+ struct nfs_server *server = state->owner->so_server;
+
+ lsp = kzalloc(sizeof(*lsp), GFP_KERNEL_ACCOUNT);
+ if (lsp == NULL)
+ return NULL;
+ nfs4_init_seqid_counter(&lsp->ls_seqid);
+ refcount_set(&lsp->ls_count, 1);
+ lsp->ls_state = state;
+ lsp->ls_owner = fl_owner;
+ lsp->ls_seqid.owner_id = ida_alloc(&server->lockowner_id, GFP_KERNEL_ACCOUNT);
+ if (lsp->ls_seqid.owner_id < 0)
+ goto out_free;
+ INIT_LIST_HEAD(&lsp->ls_locks);
+ return lsp;
+out_free:
+ kfree(lsp);
+ return NULL;
+}
+
+void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
+{
+ ida_free(&server->lockowner_id, lsp->ls_seqid.owner_id);
+ nfs4_destroy_seqid_counter(&lsp->ls_seqid);
+ kfree(lsp);
+}
+
+/*
+ * Return a compatible lock_state. If no initialized lock_state structure
+ * exists, return an uninitialized one.
+ *
+ */
+static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
+{
+ struct nfs4_lock_state *lsp, *new = NULL;
+
+ for(;;) {
+ spin_lock(&state->state_lock);
+ lsp = __nfs4_find_lock_state(state, owner, NULL);
+ if (lsp != NULL)
+ break;
+ if (new != NULL) {
+ list_add(&new->ls_locks, &state->lock_states);
+ set_bit(LK_STATE_IN_USE, &state->flags);
+ lsp = new;
+ new = NULL;
+ break;
+ }
+ spin_unlock(&state->state_lock);
+ new = nfs4_alloc_lock_state(state, owner);
+ if (new == NULL)
+ return NULL;
+ }
+ spin_unlock(&state->state_lock);
+ if (new != NULL)
+ nfs4_free_lock_state(state->owner->so_server, new);
+ return lsp;
+}
+
+/*
+ * Release reference to lock_state, and free it if we see that
+ * it is no longer in use
+ */
+void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
+{
+ struct nfs_server *server;
+ struct nfs4_state *state;
+
+ if (lsp == NULL)
+ return;
+ state = lsp->ls_state;
+ if (!refcount_dec_and_lock(&lsp->ls_count, &state->state_lock))
+ return;
+ list_del(&lsp->ls_locks);
+ if (list_empty(&state->lock_states))
+ clear_bit(LK_STATE_IN_USE, &state->flags);
+ spin_unlock(&state->state_lock);
+ server = state->owner->so_server;
+ if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
+ struct nfs_client *clp = server->nfs_client;
+
+ clp->cl_mvops->free_lock_state(server, lsp);
+ } else
+ nfs4_free_lock_state(server, lsp);
+}
+
+static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
+{
+ struct nfs4_lock_state *lsp = src->fl_u.nfs4_fl.owner;
+
+ dst->fl_u.nfs4_fl.owner = lsp;
+ refcount_inc(&lsp->ls_count);
+}
+
+static void nfs4_fl_release_lock(struct file_lock *fl)
+{
+ nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner);
+}
+
+static const struct file_lock_operations nfs4_fl_lock_ops = {
+ .fl_copy_lock = nfs4_fl_copy_lock,
+ .fl_release_private = nfs4_fl_release_lock,
+};
+
+int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
+{
+ struct nfs4_lock_state *lsp;
+
+ if (fl->fl_ops != NULL)
+ return 0;
+ lsp = nfs4_get_lock_state(state, fl->fl_owner);
+ if (lsp == NULL)
+ return -ENOMEM;
+ fl->fl_u.nfs4_fl.owner = lsp;
+ fl->fl_ops = &nfs4_fl_lock_ops;
+ return 0;
+}
+
+static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
+ struct nfs4_state *state,
+ const struct nfs_lock_context *l_ctx)
+{
+ struct nfs4_lock_state *lsp;
+ fl_owner_t fl_owner, fl_flock_owner;
+ int ret = -ENOENT;
+
+ if (l_ctx == NULL)
+ goto out;
+
+ if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
+ goto out;
+
+ fl_owner = l_ctx->lockowner;
+ fl_flock_owner = l_ctx->open_context->flock_owner;
+
+ spin_lock(&state->state_lock);
+ lsp = __nfs4_find_lock_state(state, fl_owner, fl_flock_owner);
+ if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
+ ret = -EIO;
+ else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
+ nfs4_stateid_copy(dst, &lsp->ls_stateid);
+ ret = 0;
+ }
+ spin_unlock(&state->state_lock);
+ nfs4_put_lock_state(lsp);
+out:
+ return ret;
+}
+
+bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
+{
+ bool ret;
+ const nfs4_stateid *src;
+ int seq;
+
+ do {
+ ret = false;
+ src = &zero_stateid;
+ seq = read_seqbegin(&state->seqlock);
+ if (test_bit(NFS_OPEN_STATE, &state->flags)) {
+ src = &state->open_stateid;
+ ret = true;
+ }
+ nfs4_stateid_copy(dst, src);
+ } while (read_seqretry(&state->seqlock, seq));
+ return ret;
+}
+
+/*
+ * Byte-range lock aware utility to initialize the stateid of read/write
+ * requests.
+ */
+int nfs4_select_rw_stateid(struct nfs4_state *state,
+ fmode_t fmode, const struct nfs_lock_context *l_ctx,
+ nfs4_stateid *dst, const struct cred **cred)
+{
+ int ret;
+
+ if (!nfs4_valid_open_stateid(state))
+ return -EIO;
+ if (cred != NULL)
+ *cred = NULL;
+ ret = nfs4_copy_lock_stateid(dst, state, l_ctx);
+ if (ret == -EIO)
+ /* A lost lock - don't even consider delegations */
+ goto out;
+ /* returns true if delegation stateid found and copied */
+ if (nfs4_copy_delegation_stateid(state->inode, fmode, dst, cred)) {
+ ret = 0;
+ goto out;
+ }
+ if (ret != -ENOENT)
+ /* nfs4_copy_delegation_stateid() didn't over-write
+ * dst, so it still has the lock stateid which we now
+ * choose to use.
+ */
+ goto out;
+ ret = nfs4_copy_open_stateid(dst, state) ? 0 : -EAGAIN;
+out:
+ if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41))
+ dst->seqid = 0;
+ return ret;
+}
+
+struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask)
+{
+ struct nfs_seqid *new;
+
+ new = kmalloc(sizeof(*new), gfp_mask);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ new->sequence = counter;
+ INIT_LIST_HEAD(&new->list);
+ new->task = NULL;
+ return new;
+}
+
+void nfs_release_seqid(struct nfs_seqid *seqid)
+{
+ struct nfs_seqid_counter *sequence;
+
+ if (seqid == NULL || list_empty(&seqid->list))
+ return;
+ sequence = seqid->sequence;
+ spin_lock(&sequence->lock);
+ list_del_init(&seqid->list);
+ if (!list_empty(&sequence->list)) {
+ struct nfs_seqid *next;
+
+ next = list_first_entry(&sequence->list,
+ struct nfs_seqid, list);
+ rpc_wake_up_queued_task(&sequence->wait, next->task);
+ }
+ spin_unlock(&sequence->lock);
+}
+
+void nfs_free_seqid(struct nfs_seqid *seqid)
+{
+ nfs_release_seqid(seqid);
+ kfree(seqid);
+}
+
+/*
+ * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or
+ * failed with a seqid incrementing error -
+ * see comments nfs4.h:seqid_mutating_error()
+ */
+static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
+{
+ switch (status) {
+ case 0:
+ break;
+ case -NFS4ERR_BAD_SEQID:
+ if (seqid->sequence->flags & NFS_SEQID_CONFIRMED)
+ return;
+ pr_warn_ratelimited("NFS: v4 server returned a bad"
+ " sequence-id error on an"
+ " unconfirmed sequence %p!\n",
+ seqid->sequence);
+ return;
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_BADXDR:
+ case -NFS4ERR_RESOURCE:
+ case -NFS4ERR_NOFILEHANDLE:
+ case -NFS4ERR_MOVED:
+ /* Non-seqid mutating errors */
+ return;
+ }
+ /*
+ * Note: no locking needed as we are guaranteed to be first
+ * on the sequence list
+ */
+ seqid->sequence->counter++;
+}
+
+void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
+{
+ struct nfs4_state_owner *sp;
+
+ if (seqid == NULL)
+ return;
+
+ sp = container_of(seqid->sequence, struct nfs4_state_owner, so_seqid);
+ if (status == -NFS4ERR_BAD_SEQID)
+ nfs4_reset_state_owner(sp);
+ if (!nfs4_has_session(sp->so_server->nfs_client))
+ nfs_increment_seqid(status, seqid);
+}
+
+/*
+ * Increment the seqid if the LOCK/LOCKU succeeded, or
+ * failed with a seqid incrementing error -
+ * see comments nfs4.h:seqid_mutating_error()
+ */
+void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
+{
+ if (seqid != NULL)
+ nfs_increment_seqid(status, seqid);
+}
+
+int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
+{
+ struct nfs_seqid_counter *sequence;
+ int status = 0;
+
+ if (seqid == NULL)
+ goto out;
+ sequence = seqid->sequence;
+ spin_lock(&sequence->lock);
+ seqid->task = task;
+ if (list_empty(&seqid->list))
+ list_add_tail(&seqid->list, &sequence->list);
+ if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid)
+ goto unlock;
+ rpc_sleep_on(&sequence->wait, task, NULL);
+ status = -EAGAIN;
+unlock:
+ spin_unlock(&sequence->lock);
+out:
+ return status;
+}
+
+static int nfs4_run_state_manager(void *);
+
+static void nfs4_clear_state_manager_bit(struct nfs_client *clp)
+{
+ clear_and_wake_up_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
+ rpc_wake_up(&clp->cl_rpcwaitq);
+}
+
+/*
+ * Schedule the nfs_client asynchronous state management routine
+ */
+void nfs4_schedule_state_manager(struct nfs_client *clp)
+{
+ struct task_struct *task;
+ char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];
+ struct rpc_clnt *clnt = clp->cl_rpcclient;
+ bool swapon = false;
+
+ if (clnt->cl_shutdown)
+ return;
+
+ set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+
+ if (atomic_read(&clnt->cl_swapper)) {
+ swapon = !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE,
+ &clp->cl_state);
+ if (!swapon) {
+ wake_up_var(&clp->cl_state);
+ return;
+ }
+ }
+
+ if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
+ return;
+
+ __module_get(THIS_MODULE);
+ refcount_inc(&clp->cl_count);
+
+ /* The rcu_read_lock() is not strictly necessary, as the state
+ * manager is the only thread that ever changes the rpc_xprt
+ * after it's initialized. At this point, we're single threaded. */
+ rcu_read_lock();
+ snprintf(buf, sizeof(buf), "%s-manager",
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+ rcu_read_unlock();
+ task = kthread_run(nfs4_run_state_manager, clp, "%s", buf);
+ if (IS_ERR(task)) {
+ printk(KERN_ERR "%s: kthread_run: %ld\n",
+ __func__, PTR_ERR(task));
+ if (!nfs_client_init_is_complete(clp))
+ nfs_mark_client_ready(clp, PTR_ERR(task));
+ if (swapon)
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+ nfs4_clear_state_manager_bit(clp);
+ nfs_put_client(clp);
+ module_put(THIS_MODULE);
+ }
+}
+
+/*
+ * Schedule a lease recovery attempt
+ */
+void nfs4_schedule_lease_recovery(struct nfs_client *clp)
+{
+ if (!clp)
+ return;
+ if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+ set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+ dprintk("%s: scheduling lease recovery for server %s\n", __func__,
+ clp->cl_hostname);
+ nfs4_schedule_state_manager(clp);
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery);
+
+/**
+ * nfs4_schedule_migration_recovery - trigger migration recovery
+ *
+ * @server: FSID that is migrating
+ *
+ * Returns zero if recovery has started, otherwise a negative NFS4ERR
+ * value is returned.
+ */
+int nfs4_schedule_migration_recovery(const struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+
+ if (server->fh_expire_type != NFS4_FH_PERSISTENT) {
+ pr_err("NFS: volatile file handles not supported (server %s)\n",
+ clp->cl_hostname);
+ return -NFS4ERR_IO;
+ }
+
+ if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+ return -NFS4ERR_IO;
+
+ dprintk("%s: scheduling migration recovery for (%llx:%llx) on %s\n",
+ __func__,
+ (unsigned long long)server->fsid.major,
+ (unsigned long long)server->fsid.minor,
+ clp->cl_hostname);
+
+ set_bit(NFS_MIG_IN_TRANSITION,
+ &((struct nfs_server *)server)->mig_status);
+ set_bit(NFS4CLNT_MOVED, &clp->cl_state);
+
+ nfs4_schedule_state_manager(clp);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_migration_recovery);
+
+/**
+ * nfs4_schedule_lease_moved_recovery - start lease-moved recovery
+ *
+ * @clp: server to check for moved leases
+ *
+ */
+void nfs4_schedule_lease_moved_recovery(struct nfs_client *clp)
+{
+ dprintk("%s: scheduling lease-moved recovery for client ID %llx on %s\n",
+ __func__, clp->cl_clientid, clp->cl_hostname);
+
+ set_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state);
+ nfs4_schedule_state_manager(clp);
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_lease_moved_recovery);
+
+int nfs4_wait_clnt_recover(struct nfs_client *clp)
+{
+ int res;
+
+ might_sleep();
+
+ refcount_inc(&clp->cl_count);
+ res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
+ nfs_wait_bit_killable,
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
+ if (res)
+ goto out;
+ if (clp->cl_cons_state < 0)
+ res = clp->cl_cons_state;
+out:
+ nfs_put_client(clp);
+ return res;
+}
+
+int nfs4_client_recover_expired_lease(struct nfs_client *clp)
+{
+ unsigned int loop;
+ int ret;
+
+ for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
+ ret = nfs4_wait_clnt_recover(clp);
+ if (ret != 0)
+ break;
+ if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
+ !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
+ break;
+ nfs4_schedule_state_manager(clp);
+ ret = -EIO;
+ }
+ return ret;
+}
+
+/*
+ * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
+ * @clp: client to process
+ *
+ * Set the NFS4CLNT_LEASE_EXPIRED state in order to force a
+ * resend of the SETCLIENTID and hence re-establish the
+ * callback channel. Then return all existing delegations.
+ */
+static void nfs40_handle_cb_pathdown(struct nfs_client *clp)
+{
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ nfs_expire_all_delegations(clp);
+ dprintk("%s: handling CB_PATHDOWN recovery for server %s\n", __func__,
+ clp->cl_hostname);
+}
+
+void nfs4_schedule_path_down_recovery(struct nfs_client *clp)
+{
+ nfs40_handle_cb_pathdown(clp);
+ nfs4_schedule_state_manager(clp);
+}
+
+static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
+{
+
+ if (!nfs4_valid_open_stateid(state))
+ return 0;
+ set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+ /* Don't recover state that expired before the reboot */
+ if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) {
+ clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+ return 0;
+ }
+ set_bit(NFS_OWNER_RECLAIM_REBOOT, &state->owner->so_flags);
+ set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+ return 1;
+}
+
+int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
+{
+ if (!nfs4_valid_open_stateid(state))
+ return 0;
+ set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
+ clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+ set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags);
+ set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
+ return 1;
+}
+
+int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state)
+{
+ struct nfs_client *clp = server->nfs_client;
+
+ if (!nfs4_state_mark_reclaim_nograce(clp, state))
+ return -EBADF;
+ nfs_inode_find_delegation_state_and_recover(state->inode,
+ &state->stateid);
+ dprintk("%s: scheduling stateid recovery for server %s\n", __func__,
+ clp->cl_hostname);
+ nfs4_schedule_state_manager(clp);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery);
+
+static struct nfs4_lock_state *
+nfs_state_find_lock_state_by_stateid(struct nfs4_state *state,
+ const nfs4_stateid *stateid)
+{
+ struct nfs4_lock_state *pos;
+
+ list_for_each_entry(pos, &state->lock_states, ls_locks) {
+ if (!test_bit(NFS_LOCK_INITIALIZED, &pos->ls_flags))
+ continue;
+ if (nfs4_stateid_match_or_older(&pos->ls_stateid, stateid))
+ return pos;
+ }
+ return NULL;
+}
+
+static bool nfs_state_lock_state_matches_stateid(struct nfs4_state *state,
+ const nfs4_stateid *stateid)
+{
+ bool found = false;
+
+ if (test_bit(LK_STATE_IN_USE, &state->flags)) {
+ spin_lock(&state->state_lock);
+ if (nfs_state_find_lock_state_by_stateid(state, stateid))
+ found = true;
+ spin_unlock(&state->state_lock);
+ }
+ return found;
+}
+
+void nfs_inode_find_state_and_recover(struct inode *inode,
+ const nfs4_stateid *stateid)
+{
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_open_context *ctx;
+ struct nfs4_state *state;
+ bool found = false;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
+ state = ctx->state;
+ if (state == NULL)
+ continue;
+ if (nfs4_stateid_match_or_older(&state->stateid, stateid) &&
+ nfs4_state_mark_reclaim_nograce(clp, state)) {
+ found = true;
+ continue;
+ }
+ if (test_bit(NFS_OPEN_STATE, &state->flags) &&
+ nfs4_stateid_match_or_older(&state->open_stateid, stateid) &&
+ nfs4_state_mark_reclaim_nograce(clp, state)) {
+ found = true;
+ continue;
+ }
+ if (nfs_state_lock_state_matches_stateid(state, stateid) &&
+ nfs4_state_mark_reclaim_nograce(clp, state))
+ found = true;
+ }
+ rcu_read_unlock();
+
+ nfs_inode_find_delegation_state_and_recover(inode, stateid);
+ if (found)
+ nfs4_schedule_state_manager(clp);
+}
+
+static void nfs4_state_mark_open_context_bad(struct nfs4_state *state, int err)
+{
+ struct inode *inode = state->inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_open_context *ctx;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
+ if (ctx->state != state)
+ continue;
+ set_bit(NFS_CONTEXT_BAD, &ctx->flags);
+ pr_warn("NFSv4: state recovery failed for open file %pd2, "
+ "error = %d\n", ctx->dentry, err);
+ }
+ rcu_read_unlock();
+}
+
+static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error)
+{
+ set_bit(NFS_STATE_RECOVERY_FAILED, &state->flags);
+ nfs4_state_mark_open_context_bad(state, error);
+}
+
+
+static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
+{
+ struct inode *inode = state->inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct file_lock *fl;
+ struct nfs4_lock_state *lsp;
+ int status = 0;
+ struct file_lock_context *flctx = locks_inode_context(inode);
+ struct list_head *list;
+
+ if (flctx == NULL)
+ return 0;
+
+ list = &flctx->flc_posix;
+
+ /* Guard against delegation returns and new lock/unlock calls */
+ down_write(&nfsi->rwsem);
+ spin_lock(&flctx->flc_lock);
+restart:
+ list_for_each_entry(fl, list, fl_list) {
+ if (nfs_file_open_context(fl->fl_file)->state != state)
+ continue;
+ spin_unlock(&flctx->flc_lock);
+ status = ops->recover_lock(state, fl);
+ switch (status) {
+ case 0:
+ break;
+ case -ETIMEDOUT:
+ case -ESTALE:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_NO_GRACE:
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ goto out;
+ default:
+ pr_err("NFS: %s: unhandled error %d\n",
+ __func__, status);
+ fallthrough;
+ case -ENOMEM:
+ case -NFS4ERR_DENIED:
+ case -NFS4ERR_RECLAIM_BAD:
+ case -NFS4ERR_RECLAIM_CONFLICT:
+ lsp = fl->fl_u.nfs4_fl.owner;
+ if (lsp)
+ set_bit(NFS_LOCK_LOST, &lsp->ls_flags);
+ status = 0;
+ }
+ spin_lock(&flctx->flc_lock);
+ }
+ if (list == &flctx->flc_posix) {
+ list = &flctx->flc_flock;
+ goto restart;
+ }
+ spin_unlock(&flctx->flc_lock);
+out:
+ up_write(&nfsi->rwsem);
+ return status;
+}
+
+#ifdef CONFIG_NFS_V4_2
+static void nfs42_complete_copies(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+ struct nfs4_copy_state *copy;
+
+ if (!test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags) &&
+ !test_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags))
+ return;
+
+ spin_lock(&sp->so_server->nfs_client->cl_lock);
+ list_for_each_entry(copy, &sp->so_server->ss_copies, copies) {
+ if ((test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags) &&
+ !nfs4_stateid_match_other(&state->stateid,
+ &copy->parent_dst_state->stateid)))
+ continue;
+ copy->flags = 1;
+ if (test_and_clear_bit(NFS_CLNT_DST_SSC_COPY_STATE,
+ &state->flags)) {
+ clear_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags);
+ complete(&copy->completion);
+ }
+ }
+ list_for_each_entry(copy, &sp->so_server->ss_copies, src_copies) {
+ if ((test_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags) &&
+ !nfs4_stateid_match_other(&state->stateid,
+ &copy->parent_src_state->stateid)))
+ continue;
+ copy->flags = 1;
+ if (test_and_clear_bit(NFS_CLNT_DST_SSC_COPY_STATE,
+ &state->flags))
+ complete(&copy->completion);
+ }
+ spin_unlock(&sp->so_server->nfs_client->cl_lock);
+}
+#else /* !CONFIG_NFS_V4_2 */
+static inline void nfs42_complete_copies(struct nfs4_state_owner *sp,
+ struct nfs4_state *state)
+{
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_state *state,
+ const struct nfs4_state_recovery_ops *ops,
+ int *lost_locks)
+{
+ struct nfs4_lock_state *lock;
+ int status;
+
+ status = ops->recover_open(sp, state);
+ if (status < 0)
+ return status;
+
+ status = nfs4_reclaim_locks(state, ops);
+ if (status < 0)
+ return status;
+
+ if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) {
+ spin_lock(&state->state_lock);
+ list_for_each_entry(lock, &state->lock_states, ls_locks) {
+ trace_nfs4_state_lock_reclaim(state, lock);
+ if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags) &&
+ !test_bit(NFS_LOCK_UNLOCKING, &lock->ls_flags))
+ *lost_locks += 1;
+ }
+ spin_unlock(&state->state_lock);
+ }
+
+ nfs42_complete_copies(sp, state);
+ clear_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
+ return status;
+}
+
+static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp,
+ const struct nfs4_state_recovery_ops *ops,
+ int *lost_locks)
+{
+ struct nfs4_state *state;
+ unsigned int loop = 0;
+ int status = 0;
+#ifdef CONFIG_NFS_V4_2
+ bool found_ssc_copy_state = false;
+#endif /* CONFIG_NFS_V4_2 */
+
+ /* Note: we rely on the sp->so_states list being ordered
+ * so that we always reclaim open(O_RDWR) and/or open(O_WRITE)
+ * states first.
+ * This is needed to ensure that the server won't give us any
+ * read delegations that we have to return if, say, we are
+ * recovering after a network partition or a reboot from a
+ * server that doesn't support a grace period.
+ */
+ spin_lock(&sp->so_lock);
+ raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
+restart:
+ list_for_each_entry(state, &sp->so_states, open_states) {
+ if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
+ continue;
+ if (!nfs4_valid_open_stateid(state))
+ continue;
+ if (state->state == 0)
+ continue;
+#ifdef CONFIG_NFS_V4_2
+ if (test_bit(NFS_SRV_SSC_COPY_STATE, &state->flags)) {
+ nfs4_state_mark_recovery_failed(state, -EIO);
+ found_ssc_copy_state = true;
+ continue;
+ }
+#endif /* CONFIG_NFS_V4_2 */
+ refcount_inc(&state->count);
+ spin_unlock(&sp->so_lock);
+ status = __nfs4_reclaim_open_state(sp, state, ops, lost_locks);
+
+ switch (status) {
+ default:
+ if (status >= 0) {
+ loop = 0;
+ break;
+ }
+ printk(KERN_ERR "NFS: %s: unhandled error %d\n", __func__, status);
+ fallthrough;
+ case -ENOENT:
+ case -ENOMEM:
+ case -EACCES:
+ case -EROFS:
+ case -EIO:
+ case -ESTALE:
+ /* Open state on this file cannot be recovered */
+ nfs4_state_mark_recovery_failed(state, status);
+ break;
+ case -EAGAIN:
+ ssleep(1);
+ if (loop++ < 10) {
+ set_bit(ops->state_flag_bit, &state->flags);
+ break;
+ }
+ fallthrough;
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_RECLAIM_BAD:
+ case -NFS4ERR_RECLAIM_CONFLICT:
+ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
+ break;
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_NO_GRACE:
+ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
+ fallthrough;
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case -ETIMEDOUT:
+ goto out_err;
+ }
+ nfs4_put_open_state(state);
+ spin_lock(&sp->so_lock);
+ goto restart;
+ }
+ raw_write_seqcount_end(&sp->so_reclaim_seqcount);
+ spin_unlock(&sp->so_lock);
+#ifdef CONFIG_NFS_V4_2
+ if (found_ssc_copy_state)
+ return -EIO;
+#endif /* CONFIG_NFS_V4_2 */
+ return 0;
+out_err:
+ nfs4_put_open_state(state);
+ spin_lock(&sp->so_lock);
+ raw_write_seqcount_end(&sp->so_reclaim_seqcount);
+ spin_unlock(&sp->so_lock);
+ return status;
+}
+
+static void nfs4_clear_open_state(struct nfs4_state *state)
+{
+ struct nfs4_lock_state *lock;
+
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+ clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+ clear_bit(NFS_O_RDWR_STATE, &state->flags);
+ spin_lock(&state->state_lock);
+ list_for_each_entry(lock, &state->lock_states, ls_locks) {
+ lock->ls_seqid.flags = 0;
+ clear_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags);
+ }
+ spin_unlock(&state->state_lock);
+}
+
+static void nfs4_reset_seqids(struct nfs_server *server,
+ int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state_owner *sp;
+ struct rb_node *pos;
+ struct nfs4_state *state;
+
+ spin_lock(&clp->cl_lock);
+ for (pos = rb_first(&server->state_owners);
+ pos != NULL;
+ pos = rb_next(pos)) {
+ sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
+ sp->so_seqid.flags = 0;
+ spin_lock(&sp->so_lock);
+ list_for_each_entry(state, &sp->so_states, open_states) {
+ if (mark_reclaim(clp, state))
+ nfs4_clear_open_state(state);
+ }
+ spin_unlock(&sp->so_lock);
+ }
+ spin_unlock(&clp->cl_lock);
+}
+
+static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp,
+ int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
+{
+ struct nfs_server *server;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs4_reset_seqids(server, mark_reclaim);
+ rcu_read_unlock();
+}
+
+static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
+{
+ set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+ /* Mark all delegations for reclaim */
+ nfs_delegation_mark_reclaim(clp);
+ nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot);
+}
+
+static int nfs4_reclaim_complete(struct nfs_client *clp,
+ const struct nfs4_state_recovery_ops *ops,
+ const struct cred *cred)
+{
+ /* Notify the server we're done reclaiming our state */
+ if (ops->reclaim_complete)
+ return ops->reclaim_complete(clp, cred);
+ return 0;
+}
+
+static void nfs4_clear_reclaim_server(struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state_owner *sp;
+ struct rb_node *pos;
+ struct nfs4_state *state;
+
+ spin_lock(&clp->cl_lock);
+ for (pos = rb_first(&server->state_owners);
+ pos != NULL;
+ pos = rb_next(pos)) {
+ sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
+ spin_lock(&sp->so_lock);
+ list_for_each_entry(state, &sp->so_states, open_states) {
+ if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT,
+ &state->flags))
+ continue;
+ nfs4_state_mark_reclaim_nograce(clp, state);
+ }
+ spin_unlock(&sp->so_lock);
+ }
+ spin_unlock(&clp->cl_lock);
+}
+
+static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+
+ if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+ return 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs4_clear_reclaim_server(server);
+ rcu_read_unlock();
+
+ nfs_delegation_reap_unclaimed(clp);
+ return 1;
+}
+
+static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
+{
+ const struct nfs4_state_recovery_ops *ops;
+ const struct cred *cred;
+ int err;
+
+ if (!nfs4_state_clear_reclaim_reboot(clp))
+ return;
+ ops = clp->cl_mvops->reboot_recovery_ops;
+ cred = nfs4_get_clid_cred(clp);
+ err = nfs4_reclaim_complete(clp, ops, cred);
+ put_cred(cred);
+ if (err == -NFS4ERR_CONN_NOT_BOUND_TO_SESSION)
+ set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+}
+
+static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
+{
+ nfs_mark_test_expired_all_delegations(clp);
+ nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
+}
+
+static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
+{
+ switch (error) {
+ case 0:
+ break;
+ case -NFS4ERR_CB_PATH_DOWN:
+ nfs40_handle_cb_pathdown(clp);
+ break;
+ case -NFS4ERR_NO_GRACE:
+ nfs4_state_end_reclaim_reboot(clp);
+ break;
+ case -NFS4ERR_STALE_CLIENTID:
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ nfs4_state_start_reclaim_reboot(clp);
+ break;
+ case -NFS4ERR_EXPIRED:
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ nfs4_state_start_reclaim_nograce(clp);
+ break;
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_SEQ_FALSE_RETRY:
+ case -NFS4ERR_SEQ_MISORDERED:
+ set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+ /* Zero session reset errors */
+ break;
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+ break;
+ default:
+ dprintk("%s: failed to handle error %d for server %s\n",
+ __func__, error, clp->cl_hostname);
+ return error;
+ }
+ dprintk("%s: handled error %d for server %s\n", __func__, error,
+ clp->cl_hostname);
+ return 0;
+}
+
+static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
+{
+ struct nfs4_state_owner *sp;
+ struct nfs_server *server;
+ struct rb_node *pos;
+ LIST_HEAD(freeme);
+ int status = 0;
+ int lost_locks = 0;
+
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ nfs4_purge_state_owners(server, &freeme);
+ spin_lock(&clp->cl_lock);
+ for (pos = rb_first(&server->state_owners);
+ pos != NULL;
+ pos = rb_next(pos)) {
+ sp = rb_entry(pos,
+ struct nfs4_state_owner, so_server_node);
+ if (!test_and_clear_bit(ops->owner_flag_bit,
+ &sp->so_flags))
+ continue;
+ if (!atomic_inc_not_zero(&sp->so_count))
+ continue;
+ spin_unlock(&clp->cl_lock);
+ rcu_read_unlock();
+
+ status = nfs4_reclaim_open_state(sp, ops, &lost_locks);
+ if (status < 0) {
+ if (lost_locks)
+ pr_warn("NFS: %s: lost %d locks\n",
+ clp->cl_hostname, lost_locks);
+ set_bit(ops->owner_flag_bit, &sp->so_flags);
+ nfs4_put_state_owner(sp);
+ status = nfs4_recovery_handle_error(clp, status);
+ return (status != 0) ? status : -EAGAIN;
+ }
+
+ nfs4_put_state_owner(sp);
+ goto restart;
+ }
+ spin_unlock(&clp->cl_lock);
+ }
+ rcu_read_unlock();
+ nfs4_free_state_owners(&freeme);
+ if (lost_locks)
+ pr_warn("NFS: %s: lost %d locks\n",
+ clp->cl_hostname, lost_locks);
+ return 0;
+}
+
+static int nfs4_check_lease(struct nfs_client *clp)
+{
+ const struct cred *cred;
+ const struct nfs4_state_maintenance_ops *ops =
+ clp->cl_mvops->state_renewal_ops;
+ int status;
+
+ /* Is the client already known to have an expired lease? */
+ if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+ return 0;
+ cred = ops->get_state_renewal_cred(clp);
+ if (cred == NULL) {
+ cred = nfs4_get_clid_cred(clp);
+ status = -ENOKEY;
+ if (cred == NULL)
+ goto out;
+ }
+ status = ops->renew_lease(clp, cred);
+ put_cred(cred);
+ if (status == -ETIMEDOUT) {
+ set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+ return 0;
+ }
+out:
+ return nfs4_recovery_handle_error(clp, status);
+}
+
+/* Set NFS4CLNT_LEASE_EXPIRED and reclaim reboot state for all v4.0 errors
+ * and for recoverable errors on EXCHANGE_ID for v4.1
+ */
+static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
+{
+ switch (status) {
+ case -NFS4ERR_SEQ_MISORDERED:
+ if (test_and_set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state))
+ return -ESERVERFAULT;
+ /* Lease confirmation error: retry after purging the lease */
+ ssleep(1);
+ clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+ break;
+ case -NFS4ERR_STALE_CLIENTID:
+ clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+ nfs4_state_start_reclaim_reboot(clp);
+ break;
+ case -NFS4ERR_CLID_INUSE:
+ pr_err("NFS: Server %s reports our clientid is in use\n",
+ clp->cl_hostname);
+ nfs_mark_client_ready(clp, -EPERM);
+ clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+ return -EPERM;
+ case -EACCES:
+ case -NFS4ERR_DELAY:
+ case -EAGAIN:
+ ssleep(1);
+ break;
+
+ case -NFS4ERR_MINOR_VERS_MISMATCH:
+ if (clp->cl_cons_state == NFS_CS_SESSION_INITING)
+ nfs_mark_client_ready(clp, -EPROTONOSUPPORT);
+ dprintk("%s: exit with error %d for server %s\n",
+ __func__, -EPROTONOSUPPORT, clp->cl_hostname);
+ return -EPROTONOSUPPORT;
+ case -ENOSPC:
+ if (clp->cl_cons_state == NFS_CS_SESSION_INITING)
+ nfs_mark_client_ready(clp, -EIO);
+ return -EIO;
+ case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
+ * in nfs4_exchange_id */
+ default:
+ dprintk("%s: exit with error %d for server %s\n", __func__,
+ status, clp->cl_hostname);
+ return status;
+ }
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ dprintk("%s: handled error %d for server %s\n", __func__, status,
+ clp->cl_hostname);
+ return 0;
+}
+
+static int nfs4_establish_lease(struct nfs_client *clp)
+{
+ const struct cred *cred;
+ const struct nfs4_state_recovery_ops *ops =
+ clp->cl_mvops->reboot_recovery_ops;
+ int status;
+
+ status = nfs4_begin_drain_session(clp);
+ if (status != 0)
+ return status;
+ cred = nfs4_get_clid_cred(clp);
+ if (cred == NULL)
+ return -ENOENT;
+ status = ops->establish_clid(clp, cred);
+ put_cred(cred);
+ if (status != 0)
+ return status;
+ pnfs_destroy_all_layouts(clp);
+ return 0;
+}
+
+/*
+ * Returns zero or a negative errno. NFS4ERR values are converted
+ * to local errno values.
+ */
+static int nfs4_reclaim_lease(struct nfs_client *clp)
+{
+ int status;
+
+ status = nfs4_establish_lease(clp);
+ if (status < 0)
+ return nfs4_handle_reclaim_lease_error(clp, status);
+ if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state))
+ nfs4_state_start_reclaim_nograce(clp);
+ if (!test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state))
+ set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+ clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+ clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ return 0;
+}
+
+static int nfs4_purge_lease(struct nfs_client *clp)
+{
+ int status;
+
+ status = nfs4_establish_lease(clp);
+ if (status < 0)
+ return nfs4_handle_reclaim_lease_error(clp, status);
+ clear_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ nfs4_state_start_reclaim_nograce(clp);
+ return 0;
+}
+
+/*
+ * Try remote migration of one FSID from a source server to a
+ * destination server. The source server provides a list of
+ * potential destinations.
+ *
+ * Returns zero or a negative NFS4ERR status code.
+ */
+static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_fs_locations *locations = NULL;
+ struct inode *inode;
+ struct page *page;
+ int status, result;
+
+ dprintk("--> %s: FSID %llx:%llx on \"%s\"\n", __func__,
+ (unsigned long long)server->fsid.major,
+ (unsigned long long)server->fsid.minor,
+ clp->cl_hostname);
+
+ result = 0;
+ page = alloc_page(GFP_KERNEL);
+ locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+ if (page == NULL || locations == NULL) {
+ dprintk("<-- %s: no memory\n", __func__);
+ goto out;
+ }
+ locations->fattr = nfs_alloc_fattr();
+ if (locations->fattr == NULL) {
+ dprintk("<-- %s: no memory\n", __func__);
+ goto out;
+ }
+
+ inode = d_inode(server->super->s_root);
+ result = nfs4_proc_get_locations(server, NFS_FH(inode), locations,
+ page, cred);
+ if (result) {
+ dprintk("<-- %s: failed to retrieve fs_locations: %d\n",
+ __func__, result);
+ goto out;
+ }
+
+ result = -NFS4ERR_NXIO;
+ if (!locations->nlocations)
+ goto out;
+
+ if (!(locations->fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS)) {
+ dprintk("<-- %s: No fs_locations data, migration skipped\n",
+ __func__);
+ goto out;
+ }
+
+ status = nfs4_begin_drain_session(clp);
+ if (status != 0) {
+ result = status;
+ goto out;
+ }
+
+ status = nfs4_replace_transport(server, locations);
+ if (status != 0) {
+ dprintk("<-- %s: failed to replace transport: %d\n",
+ __func__, status);
+ goto out;
+ }
+
+ result = 0;
+ dprintk("<-- %s: migration succeeded\n", __func__);
+
+out:
+ if (page != NULL)
+ __free_page(page);
+ if (locations != NULL)
+ kfree(locations->fattr);
+ kfree(locations);
+ if (result) {
+ pr_err("NFS: migration recovery failed (server %s)\n",
+ clp->cl_hostname);
+ set_bit(NFS_MIG_FAILED, &server->mig_status);
+ }
+ return result;
+}
+
+/*
+ * Returns zero or a negative NFS4ERR status code.
+ */
+static int nfs4_handle_migration(struct nfs_client *clp)
+{
+ const struct nfs4_state_maintenance_ops *ops =
+ clp->cl_mvops->state_renewal_ops;
+ struct nfs_server *server;
+ const struct cred *cred;
+
+ dprintk("%s: migration reported on \"%s\"\n", __func__,
+ clp->cl_hostname);
+
+ cred = ops->get_state_renewal_cred(clp);
+ if (cred == NULL)
+ return -NFS4ERR_NOENT;
+
+ clp->cl_mig_gen++;
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ int status;
+
+ if (server->mig_gen == clp->cl_mig_gen)
+ continue;
+ server->mig_gen = clp->cl_mig_gen;
+
+ if (!test_and_clear_bit(NFS_MIG_IN_TRANSITION,
+ &server->mig_status))
+ continue;
+
+ rcu_read_unlock();
+ status = nfs4_try_migration(server, cred);
+ if (status < 0) {
+ put_cred(cred);
+ return status;
+ }
+ goto restart;
+ }
+ rcu_read_unlock();
+ put_cred(cred);
+ return 0;
+}
+
+/*
+ * Test each nfs_server on the clp's cl_superblocks list to see
+ * if it's moved to another server. Stop when the server no longer
+ * returns NFS4ERR_LEASE_MOVED.
+ */
+static int nfs4_handle_lease_moved(struct nfs_client *clp)
+{
+ const struct nfs4_state_maintenance_ops *ops =
+ clp->cl_mvops->state_renewal_ops;
+ struct nfs_server *server;
+ const struct cred *cred;
+
+ dprintk("%s: lease moved reported on \"%s\"\n", __func__,
+ clp->cl_hostname);
+
+ cred = ops->get_state_renewal_cred(clp);
+ if (cred == NULL)
+ return -NFS4ERR_NOENT;
+
+ clp->cl_mig_gen++;
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ struct inode *inode;
+ int status;
+
+ if (server->mig_gen == clp->cl_mig_gen)
+ continue;
+ server->mig_gen = clp->cl_mig_gen;
+
+ rcu_read_unlock();
+
+ inode = d_inode(server->super->s_root);
+ status = nfs4_proc_fsid_present(inode, cred);
+ if (status != -NFS4ERR_MOVED)
+ goto restart; /* wasn't this one */
+ if (nfs4_try_migration(server, cred) == -NFS4ERR_LEASE_MOVED)
+ goto restart; /* there are more */
+ goto out;
+ }
+ rcu_read_unlock();
+
+out:
+ put_cred(cred);
+ return 0;
+}
+
+/**
+ * nfs4_discover_server_trunking - Detect server IP address trunking
+ *
+ * @clp: nfs_client under test
+ * @result: OUT: found nfs_client, or clp
+ *
+ * Returns zero or a negative errno. If zero is returned,
+ * an nfs_client pointer is planted in "result".
+ *
+ * Note: since we are invoked in process context, and
+ * not from inside the state manager, we cannot use
+ * nfs4_handle_reclaim_lease_error().
+ */
+int nfs4_discover_server_trunking(struct nfs_client *clp,
+ struct nfs_client **result)
+{
+ const struct nfs4_state_recovery_ops *ops =
+ clp->cl_mvops->reboot_recovery_ops;
+ struct rpc_clnt *clnt;
+ const struct cred *cred;
+ int i, status;
+
+ dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname);
+
+ clnt = clp->cl_rpcclient;
+ i = 0;
+
+ mutex_lock(&nfs_clid_init_mutex);
+again:
+ status = -ENOENT;
+ cred = nfs4_get_clid_cred(clp);
+ if (cred == NULL)
+ goto out_unlock;
+
+ status = ops->detect_trunking(clp, result, cred);
+ put_cred(cred);
+ switch (status) {
+ case 0:
+ case -EINTR:
+ case -ERESTARTSYS:
+ break;
+ case -ETIMEDOUT:
+ if (clnt->cl_softrtry)
+ break;
+ fallthrough;
+ case -NFS4ERR_DELAY:
+ case -EAGAIN:
+ ssleep(1);
+ fallthrough;
+ case -NFS4ERR_STALE_CLIENTID:
+ dprintk("NFS: %s after status %d, retrying\n",
+ __func__, status);
+ goto again;
+ case -EACCES:
+ if (i++ == 0) {
+ nfs4_root_machine_cred(clp);
+ goto again;
+ }
+ if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX)
+ break;
+ fallthrough;
+ case -NFS4ERR_CLID_INUSE:
+ case -NFS4ERR_WRONGSEC:
+ /* No point in retrying if we already used RPC_AUTH_UNIX */
+ if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX) {
+ status = -EPERM;
+ break;
+ }
+ clnt = rpc_clone_client_set_auth(clnt, RPC_AUTH_UNIX);
+ if (IS_ERR(clnt)) {
+ status = PTR_ERR(clnt);
+ break;
+ }
+ /* Note: this is safe because we haven't yet marked the
+ * client as ready, so we are the only user of
+ * clp->cl_rpcclient
+ */
+ clnt = xchg(&clp->cl_rpcclient, clnt);
+ rpc_shutdown_client(clnt);
+ clnt = clp->cl_rpcclient;
+ goto again;
+
+ case -NFS4ERR_MINOR_VERS_MISMATCH:
+ status = -EPROTONOSUPPORT;
+ break;
+
+ case -EKEYEXPIRED:
+ case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
+ * in nfs4_exchange_id */
+ status = -EKEYEXPIRED;
+ break;
+ default:
+ pr_warn("NFS: %s unhandled error %d. Exiting with error EIO\n",
+ __func__, status);
+ status = -EIO;
+ }
+
+out_unlock:
+ mutex_unlock(&nfs_clid_init_mutex);
+ dprintk("NFS: %s: status = %d\n", __func__, status);
+ return status;
+}
+
+#ifdef CONFIG_NFS_V4_1
+void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
+{
+ struct nfs_client *clp = session->clp;
+
+ switch (err) {
+ default:
+ set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+ break;
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+ }
+ nfs4_schedule_state_manager(clp);
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
+
+void nfs41_notify_server(struct nfs_client *clp)
+{
+ /* Use CHECK_LEASE to ping the server with a SEQUENCE */
+ set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+ nfs4_schedule_state_manager(clp);
+}
+
+static void nfs4_reset_all_state(struct nfs_client *clp)
+{
+ if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
+ set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
+ clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+ nfs4_state_start_reclaim_nograce(clp);
+ dprintk("%s: scheduling reset of all state for server %s!\n",
+ __func__, clp->cl_hostname);
+ nfs4_schedule_state_manager(clp);
+ }
+}
+
+static void nfs41_handle_server_reboot(struct nfs_client *clp)
+{
+ if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
+ nfs4_state_start_reclaim_reboot(clp);
+ dprintk("%s: server %s rebooted!\n", __func__,
+ clp->cl_hostname);
+ nfs4_schedule_state_manager(clp);
+ }
+}
+
+static void nfs41_handle_all_state_revoked(struct nfs_client *clp)
+{
+ nfs4_reset_all_state(clp);
+ dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname);
+}
+
+static void nfs41_handle_some_state_revoked(struct nfs_client *clp)
+{
+ nfs4_state_start_reclaim_nograce(clp);
+ nfs4_schedule_state_manager(clp);
+
+ dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname);
+}
+
+static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp)
+{
+ /* FIXME: For now, we destroy all layouts. */
+ pnfs_destroy_all_layouts(clp);
+ nfs_test_expired_all_delegations(clp);
+ dprintk("%s: Recallable state revoked on server %s!\n", __func__,
+ clp->cl_hostname);
+}
+
+static void nfs41_handle_backchannel_fault(struct nfs_client *clp)
+{
+ set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+ nfs4_schedule_state_manager(clp);
+
+ dprintk("%s: server %s declared a backchannel fault\n", __func__,
+ clp->cl_hostname);
+}
+
+static void nfs41_handle_cb_path_down(struct nfs_client *clp)
+{
+ if (test_and_set_bit(NFS4CLNT_BIND_CONN_TO_SESSION,
+ &clp->cl_state) == 0)
+ nfs4_schedule_state_manager(clp);
+}
+
+void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags,
+ bool recovery)
+{
+ if (!flags)
+ return;
+
+ dprintk("%s: \"%s\" (client ID %llx) flags=0x%08x\n",
+ __func__, clp->cl_hostname, clp->cl_clientid, flags);
+ /*
+ * If we're called from the state manager thread, then assume we're
+ * already handling the RECLAIM_NEEDED and/or STATE_REVOKED.
+ * Those flags are expected to remain set until we're done
+ * recovering (see RFC5661, section 18.46.3).
+ */
+ if (recovery)
+ goto out_recovery;
+
+ if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
+ nfs41_handle_server_reboot(clp);
+ if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED))
+ nfs41_handle_all_state_revoked(clp);
+ if (flags & (SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
+ SEQ4_STATUS_ADMIN_STATE_REVOKED))
+ nfs41_handle_some_state_revoked(clp);
+ if (flags & SEQ4_STATUS_LEASE_MOVED)
+ nfs4_schedule_lease_moved_recovery(clp);
+ if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
+ nfs41_handle_recallable_state_revoked(clp);
+out_recovery:
+ if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT)
+ nfs41_handle_backchannel_fault(clp);
+ else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
+ SEQ4_STATUS_CB_PATH_DOWN_SESSION))
+ nfs41_handle_cb_path_down(clp);
+}
+
+static int nfs4_reset_session(struct nfs_client *clp)
+{
+ const struct cred *cred;
+ int status;
+
+ if (!nfs4_has_session(clp))
+ return 0;
+ status = nfs4_begin_drain_session(clp);
+ if (status != 0)
+ return status;
+ cred = nfs4_get_clid_cred(clp);
+ status = nfs4_proc_destroy_session(clp->cl_session, cred);
+ switch (status) {
+ case 0:
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_DEADSESSION:
+ break;
+ case -NFS4ERR_BACK_CHAN_BUSY:
+ case -NFS4ERR_DELAY:
+ set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+ status = 0;
+ ssleep(1);
+ goto out;
+ default:
+ status = nfs4_recovery_handle_error(clp, status);
+ goto out;
+ }
+
+ memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN);
+ status = nfs4_proc_create_session(clp, cred);
+ if (status) {
+ dprintk("%s: session reset failed with status %d for server %s!\n",
+ __func__, status, clp->cl_hostname);
+ status = nfs4_handle_reclaim_lease_error(clp, status);
+ goto out;
+ }
+ nfs41_finish_session_reset(clp);
+ dprintk("%s: session reset was successful for server %s!\n",
+ __func__, clp->cl_hostname);
+out:
+ put_cred(cred);
+ return status;
+}
+
+static int nfs4_bind_conn_to_session(struct nfs_client *clp)
+{
+ const struct cred *cred;
+ int ret;
+
+ if (!nfs4_has_session(clp))
+ return 0;
+ ret = nfs4_begin_drain_session(clp);
+ if (ret != 0)
+ return ret;
+ cred = nfs4_get_clid_cred(clp);
+ ret = nfs4_proc_bind_conn_to_session(clp, cred);
+ put_cred(cred);
+ clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+ switch (ret) {
+ case 0:
+ dprintk("%s: bind_conn_to_session was successful for server %s!\n",
+ __func__, clp->cl_hostname);
+ break;
+ case -NFS4ERR_DELAY:
+ ssleep(1);
+ set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+ break;
+ default:
+ return nfs4_recovery_handle_error(clp, ret);
+ }
+ return 0;
+}
+
+static void nfs4_layoutreturn_any_run(struct nfs_client *clp)
+{
+ int iomode = 0;
+
+ if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &clp->cl_state))
+ iomode += IOMODE_READ;
+ if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &clp->cl_state))
+ iomode += IOMODE_RW;
+ /* Note: IOMODE_READ + IOMODE_RW == IOMODE_ANY */
+ if (iomode) {
+ pnfs_layout_return_unused_byclid(clp, iomode);
+ set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+ }
+}
+#else /* CONFIG_NFS_V4_1 */
+static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
+
+static int nfs4_bind_conn_to_session(struct nfs_client *clp)
+{
+ return 0;
+}
+
+static void nfs4_layoutreturn_any_run(struct nfs_client *clp)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+static void nfs4_state_manager(struct nfs_client *clp)
+{
+ unsigned int memflags;
+ int status = 0;
+ const char *section = "", *section_sep = "";
+
+ /*
+ * State recovery can deadlock if the direct reclaim code tries
+ * start NFS writeback. So ensure memory allocations are all
+ * GFP_NOFS.
+ */
+ memflags = memalloc_nofs_save();
+
+ /* Ensure exclusive access to NFSv4 state */
+ do {
+ trace_nfs4_state_mgr(clp);
+ clear_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+ if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
+ section = "purge state";
+ status = nfs4_purge_lease(clp);
+ if (status < 0)
+ goto out_error;
+ continue;
+ }
+
+ if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) {
+ section = "lease expired";
+ /* We're going to have to re-establish a clientid */
+ status = nfs4_reclaim_lease(clp);
+ if (status < 0)
+ goto out_error;
+ continue;
+ }
+
+ /* Initialize or reset the session */
+ if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) {
+ section = "reset session";
+ status = nfs4_reset_session(clp);
+ if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+ continue;
+ if (status < 0)
+ goto out_error;
+ }
+
+ /* Send BIND_CONN_TO_SESSION */
+ if (test_and_clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION,
+ &clp->cl_state)) {
+ section = "bind conn to session";
+ status = nfs4_bind_conn_to_session(clp);
+ if (status < 0)
+ goto out_error;
+ continue;
+ }
+
+ if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
+ section = "check lease";
+ status = nfs4_check_lease(clp);
+ if (status < 0)
+ goto out_error;
+ continue;
+ }
+
+ if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
+ section = "migration";
+ status = nfs4_handle_migration(clp);
+ if (status < 0)
+ goto out_error;
+ }
+
+ if (test_and_clear_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state)) {
+ section = "lease moved";
+ status = nfs4_handle_lease_moved(clp);
+ if (status < 0)
+ goto out_error;
+ }
+
+ /* First recover reboot state... */
+ if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
+ section = "reclaim reboot";
+ status = nfs4_do_reclaim(clp,
+ clp->cl_mvops->reboot_recovery_ops);
+ if (status == -EAGAIN)
+ continue;
+ if (status < 0)
+ goto out_error;
+ nfs4_state_end_reclaim_reboot(clp);
+ continue;
+ }
+
+ /* Detect expired delegations... */
+ if (test_and_clear_bit(NFS4CLNT_DELEGATION_EXPIRED, &clp->cl_state)) {
+ section = "detect expired delegations";
+ nfs_reap_expired_delegations(clp);
+ continue;
+ }
+
+ /* Now recover expired state... */
+ if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
+ section = "reclaim nograce";
+ status = nfs4_do_reclaim(clp,
+ clp->cl_mvops->nograce_recovery_ops);
+ if (status == -EAGAIN)
+ continue;
+ if (status < 0)
+ goto out_error;
+ clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
+ }
+
+ memalloc_nofs_restore(memflags);
+ nfs4_end_drain_session(clp);
+ nfs4_clear_state_manager_bit(clp);
+
+ if (test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
+ !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING,
+ &clp->cl_state)) {
+ memflags = memalloc_nofs_save();
+ continue;
+ }
+
+ if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) {
+ if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
+ nfs_client_return_marked_delegations(clp);
+ set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+ }
+ nfs4_layoutreturn_any_run(clp);
+ clear_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state);
+ }
+
+ return;
+
+ } while (refcount_read(&clp->cl_count) > 1 && !signalled());
+ goto out_drain;
+
+out_error:
+ if (strlen(section))
+ section_sep = ": ";
+ trace_nfs4_state_mgr_failed(clp, section, status);
+ pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s"
+ " with error %d\n", section_sep, section,
+ clp->cl_hostname, -status);
+ ssleep(1);
+out_drain:
+ memalloc_nofs_restore(memflags);
+ nfs4_end_drain_session(clp);
+ nfs4_clear_state_manager_bit(clp);
+}
+
+static int nfs4_run_state_manager(void *ptr)
+{
+ struct nfs_client *clp = ptr;
+ struct rpc_clnt *cl = clp->cl_rpcclient;
+
+ while (cl != cl->cl_parent)
+ cl = cl->cl_parent;
+
+ allow_signal(SIGKILL);
+again:
+ nfs4_state_manager(clp);
+
+ if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) &&
+ !test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) {
+ wait_var_event_interruptible(&clp->cl_state,
+ test_bit(NFS4CLNT_RUN_MANAGER,
+ &clp->cl_state));
+ if (!atomic_read(&cl->cl_swapper))
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+ if (refcount_read(&clp->cl_count) > 1 && !signalled() &&
+ !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state))
+ goto again;
+ /* Either no longer a swapper, or were signalled */
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+ }
+
+ if (refcount_read(&clp->cl_count) > 1 && !signalled() &&
+ test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
+ !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state))
+ goto again;
+
+ nfs_put_client(clp);
+ module_put_and_kthread_exit(0);
+ return 0;
+}
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
new file mode 100644
index 0000000000..d09bcfd7db
--- /dev/null
+++ b/fs/nfs/nfs4super.c
@@ -0,0 +1,314 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2012 Bryan Schumaker <bjschuma@netapp.com>
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_ssc.h>
+#include "delegation.h"
+#include "internal.h"
+#include "nfs4_fs.h"
+#include "nfs4idmap.h"
+#include "dns_resolve.h"
+#include "pnfs.h"
+#include "nfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc);
+static void nfs4_evict_inode(struct inode *inode);
+
+static const struct super_operations nfs4_sops = {
+ .alloc_inode = nfs_alloc_inode,
+ .free_inode = nfs_free_inode,
+ .write_inode = nfs4_write_inode,
+ .drop_inode = nfs_drop_inode,
+ .statfs = nfs_statfs,
+ .evict_inode = nfs4_evict_inode,
+ .umount_begin = nfs_umount_begin,
+ .show_options = nfs_show_options,
+ .show_devname = nfs_show_devname,
+ .show_path = nfs_show_path,
+ .show_stats = nfs_show_stats,
+};
+
+struct nfs_subversion nfs_v4 = {
+ .owner = THIS_MODULE,
+ .nfs_fs = &nfs4_fs_type,
+ .rpc_vers = &nfs_version4,
+ .rpc_ops = &nfs_v4_clientops,
+ .sops = &nfs4_sops,
+ .xattr = nfs4_xattr_handlers,
+};
+
+static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ int ret = nfs_write_inode(inode, wbc);
+
+ if (ret == 0)
+ ret = pnfs_layoutcommit_inode(inode,
+ wbc->sync_mode == WB_SYNC_ALL);
+ return ret;
+}
+
+/*
+ * Clean out any remaining NFSv4 state that might be left over due
+ * to open() calls that passed nfs_atomic_lookup, but failed to call
+ * nfs_open().
+ */
+static void nfs4_evict_inode(struct inode *inode)
+{
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+ /* If we are holding a delegation, return and free it */
+ nfs_inode_evict_delegation(inode);
+ /* Note that above delegreturn would trigger pnfs return-on-close */
+ pnfs_return_layout(inode);
+ pnfs_destroy_layout_final(NFS_I(inode));
+ /* First call standard NFS clear_inode() code */
+ nfs_clear_inode(inode);
+ nfs4_xattr_cache_zap(inode);
+}
+
+struct nfs_referral_count {
+ struct list_head list;
+ const struct task_struct *task;
+ unsigned int referral_count;
+};
+
+static LIST_HEAD(nfs_referral_count_list);
+static DEFINE_SPINLOCK(nfs_referral_count_list_lock);
+
+static struct nfs_referral_count *nfs_find_referral_count(void)
+{
+ struct nfs_referral_count *p;
+
+ list_for_each_entry(p, &nfs_referral_count_list, list) {
+ if (p->task == current)
+ return p;
+ }
+ return NULL;
+}
+
+#define NFS_MAX_NESTED_REFERRALS 2
+
+static int nfs_referral_loop_protect(void)
+{
+ struct nfs_referral_count *p, *new;
+ int ret = -ENOMEM;
+
+ new = kmalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ goto out;
+ new->task = current;
+ new->referral_count = 1;
+
+ ret = 0;
+ spin_lock(&nfs_referral_count_list_lock);
+ p = nfs_find_referral_count();
+ if (p != NULL) {
+ if (p->referral_count >= NFS_MAX_NESTED_REFERRALS)
+ ret = -ELOOP;
+ else
+ p->referral_count++;
+ } else {
+ list_add(&new->list, &nfs_referral_count_list);
+ new = NULL;
+ }
+ spin_unlock(&nfs_referral_count_list_lock);
+ kfree(new);
+out:
+ return ret;
+}
+
+static void nfs_referral_loop_unprotect(void)
+{
+ struct nfs_referral_count *p;
+
+ spin_lock(&nfs_referral_count_list_lock);
+ p = nfs_find_referral_count();
+ p->referral_count--;
+ if (p->referral_count == 0)
+ list_del(&p->list);
+ else
+ p = NULL;
+ spin_unlock(&nfs_referral_count_list_lock);
+ kfree(p);
+}
+
+static int do_nfs4_mount(struct nfs_server *server,
+ struct fs_context *fc,
+ const char *hostname,
+ const char *export_path)
+{
+ struct nfs_fs_context *root_ctx;
+ struct fs_context *root_fc;
+ struct vfsmount *root_mnt;
+ struct dentry *dentry;
+ size_t len;
+ int ret;
+
+ struct fs_parameter param = {
+ .key = "source",
+ .type = fs_value_is_string,
+ .dirfd = -1,
+ };
+
+ if (IS_ERR(server))
+ return PTR_ERR(server);
+
+ root_fc = vfs_dup_fs_context(fc);
+ if (IS_ERR(root_fc)) {
+ nfs_free_server(server);
+ return PTR_ERR(root_fc);
+ }
+ kfree(root_fc->source);
+ root_fc->source = NULL;
+
+ root_ctx = nfs_fc2context(root_fc);
+ root_ctx->internal = true;
+ root_ctx->server = server;
+ /* We leave export_path unset as it's not used to find the root. */
+
+ len = strlen(hostname) + 5;
+ param.string = kmalloc(len, GFP_KERNEL);
+ if (param.string == NULL) {
+ put_fs_context(root_fc);
+ return -ENOMEM;
+ }
+
+ /* Does hostname needs to be enclosed in brackets? */
+ if (strchr(hostname, ':'))
+ param.size = snprintf(param.string, len, "[%s]:/", hostname);
+ else
+ param.size = snprintf(param.string, len, "%s:/", hostname);
+ ret = vfs_parse_fs_param(root_fc, &param);
+ kfree(param.string);
+ if (ret < 0) {
+ put_fs_context(root_fc);
+ return ret;
+ }
+ root_mnt = fc_mount(root_fc);
+ put_fs_context(root_fc);
+
+ if (IS_ERR(root_mnt))
+ return PTR_ERR(root_mnt);
+
+ ret = nfs_referral_loop_protect();
+ if (ret) {
+ mntput(root_mnt);
+ return ret;
+ }
+
+ dentry = mount_subtree(root_mnt, export_path);
+ nfs_referral_loop_unprotect();
+
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ fc->root = dentry;
+ return 0;
+}
+
+int nfs4_try_get_tree(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ int err;
+
+ dfprintk(MOUNT, "--> nfs4_try_get_tree()\n");
+
+ /* We create a mount for the server's root, walk to the requested
+ * location and then create another mount for that.
+ */
+ err= do_nfs4_mount(nfs4_create_server(fc),
+ fc, ctx->nfs_server.hostname,
+ ctx->nfs_server.export_path);
+ if (err) {
+ nfs_ferrorf(fc, MOUNT, "NFS4: Couldn't follow remote path");
+ dfprintk(MOUNT, "<-- nfs4_try_get_tree() = %d [error]\n", err);
+ } else {
+ dfprintk(MOUNT, "<-- nfs4_try_get_tree() = 0\n");
+ }
+ return err;
+}
+
+/*
+ * Create an NFS4 server record on referral traversal
+ */
+int nfs4_get_referral_tree(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ int err;
+
+ dprintk("--> nfs4_referral_mount()\n");
+
+ /* create a new volume representation */
+ err = do_nfs4_mount(nfs4_create_referral_server(fc),
+ fc, ctx->nfs_server.hostname,
+ ctx->nfs_server.export_path);
+ if (err) {
+ nfs_ferrorf(fc, MOUNT, "NFS4: Couldn't follow remote path");
+ dfprintk(MOUNT, "<-- nfs4_get_referral_tree() = %d [error]\n", err);
+ } else {
+ dfprintk(MOUNT, "<-- nfs4_get_referral_tree() = 0\n");
+ }
+ return err;
+}
+
+static int __init init_nfs_v4(void)
+{
+ int err;
+
+ err = nfs_dns_resolver_init();
+ if (err)
+ goto out;
+
+ err = nfs_idmap_init();
+ if (err)
+ goto out1;
+
+#ifdef CONFIG_NFS_V4_2
+ err = nfs4_xattr_cache_init();
+ if (err)
+ goto out2;
+#endif
+
+ err = nfs4_register_sysctl();
+ if (err)
+ goto out2;
+
+#ifdef CONFIG_NFS_V4_2
+ nfs42_ssc_register_ops();
+#endif
+ register_nfs_version(&nfs_v4);
+ return 0;
+out2:
+ nfs_idmap_quit();
+out1:
+ nfs_dns_resolver_destroy();
+out:
+ return err;
+}
+
+static void __exit exit_nfs_v4(void)
+{
+ /* Not called in the _init(), conditionally loaded */
+ nfs4_pnfs_v3_ds_connect_unload();
+
+ unregister_nfs_version(&nfs_v4);
+#ifdef CONFIG_NFS_V4_2
+ nfs4_xattr_cache_exit();
+ nfs42_ssc_unregister_ops();
+#endif
+ nfs4_unregister_sysctl();
+ nfs_idmap_quit();
+ nfs_dns_resolver_destroy();
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_nfs_v4);
+module_exit(exit_nfs_v4);
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
new file mode 100644
index 0000000000..e776200e9a
--- /dev/null
+++ b/fs/nfs/nfs4sysctl.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/nfs4sysctl.c
+ *
+ * Sysctl interface to NFS v4 parameters
+ *
+ * Copyright (c) 2006 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/sysctl.h>
+#include <linux/nfs_fs.h>
+
+#include "nfs4_fs.h"
+#include "nfs4idmap.h"
+#include "callback.h"
+
+static const int nfs_set_port_min;
+static const int nfs_set_port_max = 65535;
+static struct ctl_table_header *nfs4_callback_sysctl_table;
+
+static struct ctl_table nfs4_cb_sysctls[] = {
+ {
+ .procname = "nfs_callback_tcpport",
+ .data = &nfs_callback_set_tcpport,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (int *)&nfs_set_port_min,
+ .extra2 = (int *)&nfs_set_port_max,
+ },
+ {
+ .procname = "idmap_cache_timeout",
+ .data = &nfs_idmap_cache_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+int nfs4_register_sysctl(void)
+{
+ nfs4_callback_sysctl_table = register_sysctl("fs/nfs",
+ nfs4_cb_sysctls);
+ if (nfs4_callback_sysctl_table == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+void nfs4_unregister_sysctl(void)
+{
+ unregister_sysctl_table(nfs4_callback_sysctl_table);
+ nfs4_callback_sysctl_table = NULL;
+}
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
new file mode 100644
index 0000000000..d9ac556beb
--- /dev/null
+++ b/fs/nfs/nfs4trace.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
+#include "internal.h"
+#include "nfs4session.h"
+#include "callback.h"
+#include "pnfs.h"
+
+#define CREATE_TRACE_POINTS
+#include "nfs4trace.h"
+
+#ifdef CONFIG_NFS_V4_1
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_read);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_commit_ds);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_init_read);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_init_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_get_mirror_count);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_done);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_done);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_pagelist);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_pagelist);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_read_error);
+EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_write_error);
+EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_commit_error);
+#endif
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
new file mode 100644
index 0000000000..d27919d724
--- /dev/null
+++ b/fs/nfs/nfs4trace.h
@@ -0,0 +1,2559 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nfs4
+
+#if !defined(_TRACE_NFS4_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NFS4_H
+
+#include <linux/tracepoint.h>
+#include <trace/misc/sunrpc.h>
+
+#include <trace/misc/fs.h>
+#include <trace/misc/nfs.h>
+
+#define show_nfs_fattr_flags(valid) \
+ __print_flags((unsigned long)valid, "|", \
+ { NFS_ATTR_FATTR_TYPE, "TYPE" }, \
+ { NFS_ATTR_FATTR_MODE, "MODE" }, \
+ { NFS_ATTR_FATTR_NLINK, "NLINK" }, \
+ { NFS_ATTR_FATTR_OWNER, "OWNER" }, \
+ { NFS_ATTR_FATTR_GROUP, "GROUP" }, \
+ { NFS_ATTR_FATTR_RDEV, "RDEV" }, \
+ { NFS_ATTR_FATTR_SIZE, "SIZE" }, \
+ { NFS_ATTR_FATTR_FSID, "FSID" }, \
+ { NFS_ATTR_FATTR_FILEID, "FILEID" }, \
+ { NFS_ATTR_FATTR_ATIME, "ATIME" }, \
+ { NFS_ATTR_FATTR_MTIME, "MTIME" }, \
+ { NFS_ATTR_FATTR_CTIME, "CTIME" }, \
+ { NFS_ATTR_FATTR_CHANGE, "CHANGE" }, \
+ { NFS_ATTR_FATTR_OWNER_NAME, "OWNER_NAME" }, \
+ { NFS_ATTR_FATTR_GROUP_NAME, "GROUP_NAME" })
+
+DECLARE_EVENT_CLASS(nfs4_clientid_event,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ int error
+ ),
+
+ TP_ARGS(clp, error),
+
+ TP_STRUCT__entry(
+ __string(dstaddr, clp->cl_hostname)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error < 0 ? -error : 0;
+ __assign_str(dstaddr, clp->cl_hostname);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) dstaddr=%s",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ __get_str(dstaddr)
+ )
+);
+#define DEFINE_NFS4_CLIENTID_EVENT(name) \
+ DEFINE_EVENT(nfs4_clientid_event, name, \
+ TP_PROTO( \
+ const struct nfs_client *clp, \
+ int error \
+ ), \
+ TP_ARGS(clp, error))
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid_confirm);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew_async);
+#ifdef CONFIG_NFS_V4_1
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_exchange_id);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_create_session);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_session);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_clientid);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete);
+
+TRACE_EVENT(nfs4_sequence_done,
+ TP_PROTO(
+ const struct nfs4_session *session,
+ const struct nfs4_sequence_res *res
+ ),
+ TP_ARGS(session, res),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, session)
+ __field(unsigned int, slot_nr)
+ __field(unsigned int, seq_nr)
+ __field(unsigned int, highest_slotid)
+ __field(unsigned int, target_highest_slotid)
+ __field(unsigned long, status_flags)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ const struct nfs4_slot *sr_slot = res->sr_slot;
+ __entry->session = nfs_session_id_hash(&session->sess_id);
+ __entry->slot_nr = sr_slot->slot_nr;
+ __entry->seq_nr = sr_slot->seq_nr;
+ __entry->highest_slotid = res->sr_highest_slotid;
+ __entry->target_highest_slotid =
+ res->sr_target_highest_slotid;
+ __entry->status_flags = res->sr_status_flags;
+ __entry->error = res->sr_status < 0 ?
+ -res->sr_status : 0;
+ ),
+ TP_printk(
+ "error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u "
+ "highest_slotid=%u target_highest_slotid=%u "
+ "status_flags=0x%lx (%s)",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ __entry->session,
+ __entry->slot_nr,
+ __entry->seq_nr,
+ __entry->highest_slotid,
+ __entry->target_highest_slotid,
+ __entry->status_flags,
+ show_nfs4_seq4_status(__entry->status_flags)
+ )
+);
+
+struct cb_sequenceargs;
+struct cb_sequenceres;
+
+TRACE_EVENT(nfs4_cb_sequence,
+ TP_PROTO(
+ const struct cb_sequenceargs *args,
+ const struct cb_sequenceres *res,
+ __be32 status
+ ),
+ TP_ARGS(args, res, status),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, session)
+ __field(unsigned int, slot_nr)
+ __field(unsigned int, seq_nr)
+ __field(unsigned int, highest_slotid)
+ __field(unsigned int, cachethis)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ __entry->session = nfs_session_id_hash(&args->csa_sessionid);
+ __entry->slot_nr = args->csa_slotid;
+ __entry->seq_nr = args->csa_sequenceid;
+ __entry->highest_slotid = args->csa_highestslotid;
+ __entry->cachethis = args->csa_cachethis;
+ __entry->error = be32_to_cpu(status);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u "
+ "highest_slotid=%u",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ __entry->session,
+ __entry->slot_nr,
+ __entry->seq_nr,
+ __entry->highest_slotid
+ )
+);
+
+TRACE_EVENT(nfs4_cb_seqid_err,
+ TP_PROTO(
+ const struct cb_sequenceargs *args,
+ __be32 status
+ ),
+ TP_ARGS(args, status),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, session)
+ __field(unsigned int, slot_nr)
+ __field(unsigned int, seq_nr)
+ __field(unsigned int, highest_slotid)
+ __field(unsigned int, cachethis)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ __entry->session = nfs_session_id_hash(&args->csa_sessionid);
+ __entry->slot_nr = args->csa_slotid;
+ __entry->seq_nr = args->csa_sequenceid;
+ __entry->highest_slotid = args->csa_highestslotid;
+ __entry->cachethis = args->csa_cachethis;
+ __entry->error = be32_to_cpu(status);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u "
+ "highest_slotid=%u",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ __entry->session,
+ __entry->slot_nr,
+ __entry->seq_nr,
+ __entry->highest_slotid
+ )
+);
+
+TRACE_EVENT(nfs4_cb_offload,
+ TP_PROTO(
+ const struct nfs_fh *cb_fh,
+ const nfs4_stateid *cb_stateid,
+ uint64_t cb_count,
+ int cb_error,
+ int cb_how_stable
+ ),
+
+ TP_ARGS(cb_fh, cb_stateid, cb_count, cb_error,
+ cb_how_stable),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, fhandle)
+ __field(loff_t, cb_count)
+ __field(int, cb_how)
+ __field(int, cb_stateid_seq)
+ __field(u32, cb_stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->error = cb_error < 0 ? -cb_error : 0;
+ __entry->fhandle = nfs_fhandle_hash(cb_fh);
+ __entry->cb_stateid_seq =
+ be32_to_cpu(cb_stateid->seqid);
+ __entry->cb_stateid_hash =
+ nfs_stateid_hash(cb_stateid);
+ __entry->cb_count = cb_count;
+ __entry->cb_how = cb_how_stable;
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fhandle=0x%08x cb_stateid=%d:0x%08x "
+ "cb_count=%llu cb_how=%s",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ __entry->fhandle,
+ __entry->cb_stateid_seq, __entry->cb_stateid_hash,
+ __entry->cb_count,
+ show_nfs_stable_how(__entry->cb_how)
+ )
+);
+#endif /* CONFIG_NFS_V4_1 */
+
+TRACE_EVENT(nfs4_setup_sequence,
+ TP_PROTO(
+ const struct nfs4_session *session,
+ const struct nfs4_sequence_args *args
+ ),
+ TP_ARGS(session, args),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, session)
+ __field(unsigned int, slot_nr)
+ __field(unsigned int, seq_nr)
+ __field(unsigned int, highest_used_slotid)
+ ),
+
+ TP_fast_assign(
+ const struct nfs4_slot *sa_slot = args->sa_slot;
+ __entry->session = session ? nfs_session_id_hash(&session->sess_id) : 0;
+ __entry->slot_nr = sa_slot->slot_nr;
+ __entry->seq_nr = sa_slot->seq_nr;
+ __entry->highest_used_slotid =
+ sa_slot->table->highest_used_slotid;
+ ),
+ TP_printk(
+ "session=0x%08x slot_nr=%u seq_nr=%u "
+ "highest_used_slotid=%u",
+ __entry->session,
+ __entry->slot_nr,
+ __entry->seq_nr,
+ __entry->highest_used_slotid
+ )
+);
+
+TRACE_DEFINE_ENUM(NFS4CLNT_MANAGER_RUNNING);
+TRACE_DEFINE_ENUM(NFS4CLNT_CHECK_LEASE);
+TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_EXPIRED);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECLAIM_REBOOT);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECLAIM_NOGRACE);
+TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN);
+TRACE_DEFINE_ENUM(NFS4CLNT_SESSION_RESET);
+TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_CONFIRM);
+TRACE_DEFINE_ENUM(NFS4CLNT_SERVER_SCOPE_MISMATCH);
+TRACE_DEFINE_ENUM(NFS4CLNT_PURGE_STATE);
+TRACE_DEFINE_ENUM(NFS4CLNT_BIND_CONN_TO_SESSION);
+TRACE_DEFINE_ENUM(NFS4CLNT_MOVED);
+TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_MOVED);
+TRACE_DEFINE_ENUM(NFS4CLNT_DELEGATION_EXPIRED);
+TRACE_DEFINE_ENUM(NFS4CLNT_RUN_MANAGER);
+TRACE_DEFINE_ENUM(NFS4CLNT_MANAGER_AVAILABLE);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_RUNNING);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_READ);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_RW);
+TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_DELAYED);
+
+#define show_nfs4_clp_state(state) \
+ __print_flags(state, "|", \
+ { BIT(NFS4CLNT_MANAGER_RUNNING), "MANAGER_RUNNING" }, \
+ { BIT(NFS4CLNT_CHECK_LEASE), "CHECK_LEASE" }, \
+ { BIT(NFS4CLNT_LEASE_EXPIRED), "LEASE_EXPIRED" }, \
+ { BIT(NFS4CLNT_RECLAIM_REBOOT), "RECLAIM_REBOOT" }, \
+ { BIT(NFS4CLNT_RECLAIM_NOGRACE), "RECLAIM_NOGRACE" }, \
+ { BIT(NFS4CLNT_DELEGRETURN), "DELEGRETURN" }, \
+ { BIT(NFS4CLNT_SESSION_RESET), "SESSION_RESET" }, \
+ { BIT(NFS4CLNT_LEASE_CONFIRM), "LEASE_CONFIRM" }, \
+ { BIT(NFS4CLNT_SERVER_SCOPE_MISMATCH), "SERVER_SCOPE_MISMATCH" }, \
+ { BIT(NFS4CLNT_PURGE_STATE), "PURGE_STATE" }, \
+ { BIT(NFS4CLNT_BIND_CONN_TO_SESSION), "BIND_CONN_TO_SESSION" }, \
+ { BIT(NFS4CLNT_MOVED), "MOVED" }, \
+ { BIT(NFS4CLNT_LEASE_MOVED), "LEASE_MOVED" }, \
+ { BIT(NFS4CLNT_DELEGATION_EXPIRED), "DELEGATION_EXPIRED" }, \
+ { BIT(NFS4CLNT_RUN_MANAGER), "RUN_MANAGER" }, \
+ { BIT(NFS4CLNT_MANAGER_AVAILABLE), "MANAGER_AVAILABLE" }, \
+ { BIT(NFS4CLNT_RECALL_RUNNING), "RECALL_RUNNING" }, \
+ { BIT(NFS4CLNT_RECALL_ANY_LAYOUT_READ), "RECALL_ANY_LAYOUT_READ" }, \
+ { BIT(NFS4CLNT_RECALL_ANY_LAYOUT_RW), "RECALL_ANY_LAYOUT_RW" }, \
+ { BIT(NFS4CLNT_DELEGRETURN_DELAYED), "DELERETURN_DELAYED" })
+
+TRACE_EVENT(nfs4_state_mgr,
+ TP_PROTO(
+ const struct nfs_client *clp
+ ),
+
+ TP_ARGS(clp),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, state)
+ __string(hostname, clp->cl_hostname)
+ ),
+
+ TP_fast_assign(
+ __entry->state = clp->cl_state;
+ __assign_str(hostname, clp->cl_hostname);
+ ),
+
+ TP_printk(
+ "hostname=%s clp state=%s", __get_str(hostname),
+ show_nfs4_clp_state(__entry->state)
+ )
+)
+
+TRACE_EVENT(nfs4_state_mgr_failed,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const char *section,
+ int status
+ ),
+
+ TP_ARGS(clp, section, status),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(unsigned long, state)
+ __string(hostname, clp->cl_hostname)
+ __string(section, section)
+ ),
+
+ TP_fast_assign(
+ __entry->error = status < 0 ? -status : 0;
+ __entry->state = clp->cl_state;
+ __assign_str(hostname, clp->cl_hostname);
+ __assign_str(section, section);
+ ),
+
+ TP_printk(
+ "hostname=%s clp state=%s error=%ld (%s) section=%s",
+ __get_str(hostname),
+ show_nfs4_clp_state(__entry->state), -__entry->error,
+ show_nfs4_status(__entry->error), __get_str(section)
+
+ )
+)
+
+TRACE_EVENT(nfs4_xdr_bad_operation,
+ TP_PROTO(
+ const struct xdr_stream *xdr,
+ u32 op,
+ u32 expected
+ ),
+
+ TP_ARGS(xdr, op, expected),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, task_id)
+ __field(unsigned int, client_id)
+ __field(u32, xid)
+ __field(u32, op)
+ __field(u32, expected)
+ ),
+
+ TP_fast_assign(
+ const struct rpc_rqst *rqstp = xdr->rqst;
+ const struct rpc_task *task = rqstp->rq_task;
+
+ __entry->task_id = task->tk_pid;
+ __entry->client_id = task->tk_client->cl_clid;
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->op = op;
+ __entry->expected = expected;
+ ),
+
+ TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
+ " xid=0x%08x operation=%u, expected=%u",
+ __entry->task_id, __entry->client_id, __entry->xid,
+ __entry->op, __entry->expected
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs4_xdr_event,
+ TP_PROTO(
+ const struct xdr_stream *xdr,
+ u32 op,
+ u32 error
+ ),
+
+ TP_ARGS(xdr, op, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, task_id)
+ __field(unsigned int, client_id)
+ __field(u32, xid)
+ __field(u32, op)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ const struct rpc_rqst *rqstp = xdr->rqst;
+ const struct rpc_task *task = rqstp->rq_task;
+
+ __entry->task_id = task->tk_pid;
+ __entry->client_id = task->tk_client->cl_clid;
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->op = op;
+ __entry->error = error;
+ ),
+
+ TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
+ " xid=0x%08x error=%ld (%s) operation=%u",
+ __entry->task_id, __entry->client_id, __entry->xid,
+ -__entry->error, show_nfs4_status(__entry->error),
+ __entry->op
+ )
+);
+#define DEFINE_NFS4_XDR_EVENT(name) \
+ DEFINE_EVENT(nfs4_xdr_event, name, \
+ TP_PROTO( \
+ const struct xdr_stream *xdr, \
+ u32 op, \
+ u32 error \
+ ), \
+ TP_ARGS(xdr, op, error))
+DEFINE_NFS4_XDR_EVENT(nfs4_xdr_status);
+DEFINE_NFS4_XDR_EVENT(nfs4_xdr_bad_filehandle);
+
+DECLARE_EVENT_CLASS(nfs4_cb_error_class,
+ TP_PROTO(
+ __be32 xid,
+ u32 cb_ident
+ ),
+
+ TP_ARGS(xid, cb_ident),
+
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(u32, cbident)
+ ),
+
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(xid);
+ __entry->cbident = cb_ident;
+ ),
+
+ TP_printk(
+ "xid=0x%08x cb_ident=0x%08x",
+ __entry->xid, __entry->cbident
+ )
+);
+
+#define DEFINE_CB_ERROR_EVENT(name) \
+ DEFINE_EVENT(nfs4_cb_error_class, nfs_cb_##name, \
+ TP_PROTO( \
+ __be32 xid, \
+ u32 cb_ident \
+ ), \
+ TP_ARGS(xid, cb_ident))
+
+DEFINE_CB_ERROR_EVENT(no_clp);
+DEFINE_CB_ERROR_EVENT(badprinc);
+
+DECLARE_EVENT_CLASS(nfs4_open_event,
+ TP_PROTO(
+ const struct nfs_open_context *ctx,
+ int flags,
+ int error
+ ),
+
+ TP_ARGS(ctx, flags, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(unsigned long, flags)
+ __field(unsigned long, fmode)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, dir)
+ __string(name, ctx->dentry->d_name.name)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, openstateid_seq)
+ __field(u32, openstateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct nfs4_state *state = ctx->state;
+ const struct inode *inode = NULL;
+
+ __entry->error = -error;
+ __entry->flags = flags;
+ __entry->fmode = (__force unsigned long)ctx->mode;
+ __entry->dev = ctx->dentry->d_sb->s_dev;
+ if (!IS_ERR_OR_NULL(state)) {
+ inode = state->inode;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ __entry->openstateid_seq =
+ be32_to_cpu(state->open_stateid.seqid);
+ __entry->openstateid_hash =
+ nfs_stateid_hash(&state->open_stateid);
+ } else {
+ __entry->stateid_seq = 0;
+ __entry->stateid_hash = 0;
+ __entry->openstateid_seq = 0;
+ __entry->openstateid_hash = 0;
+ }
+ if (inode != NULL) {
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ } else {
+ __entry->fileid = 0;
+ __entry->fhandle = 0;
+ }
+ __entry->dir = NFS_FILEID(d_inode(ctx->dentry->d_parent));
+ __assign_str(name, ctx->dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) flags=%lu (%s) fmode=%s "
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "name=%02x:%02x:%llu/%s stateid=%d:0x%08x "
+ "openstateid=%d:0x%08x",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ __entry->flags,
+ show_fs_fcntl_open_flags(__entry->flags),
+ show_fs_fmode_flags(__entry->fmode),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name),
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->openstateid_seq, __entry->openstateid_hash
+ )
+);
+
+#define DEFINE_NFS4_OPEN_EVENT(name) \
+ DEFINE_EVENT(nfs4_open_event, name, \
+ TP_PROTO( \
+ const struct nfs_open_context *ctx, \
+ int flags, \
+ int error \
+ ), \
+ TP_ARGS(ctx, flags, error))
+DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim);
+DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired);
+DEFINE_NFS4_OPEN_EVENT(nfs4_open_file);
+
+TRACE_EVENT(nfs4_cached_open,
+ TP_PROTO(
+ const struct nfs4_state *state
+ ),
+ TP_ARGS(state),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned int, fmode)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->fmode = (__force unsigned int)state->state;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ ),
+
+ TP_printk(
+ "fmode=%s fileid=%02x:%02x:%llu "
+ "fhandle=0x%08x stateid=%d:0x%08x",
+ __entry->fmode ? show_fs_fmode_flags(__entry->fmode) :
+ "closed",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
+TRACE_EVENT(nfs4_close,
+ TP_PROTO(
+ const struct nfs4_state *state,
+ const struct nfs_closeargs *args,
+ const struct nfs_closeres *res,
+ int error
+ ),
+
+ TP_ARGS(state, args, res, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned int, fmode)
+ __field(unsigned long, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->fmode = (__force unsigned int)state->state;
+ __entry->error = error < 0 ? -error : 0;
+ __entry->stateid_seq =
+ be32_to_cpu(args->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fmode=%s fileid=%02x:%02x:%llu "
+ "fhandle=0x%08x openstateid=%d:0x%08x",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ __entry->fmode ? show_fs_fmode_flags(__entry->fmode) :
+ "closed",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs4_lock_event,
+ TP_PROTO(
+ const struct file_lock *request,
+ const struct nfs4_state *state,
+ int cmd,
+ int error
+ ),
+
+ TP_ARGS(request, state, cmd, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(unsigned long, cmd)
+ __field(unsigned long, type)
+ __field(loff_t, start)
+ __field(loff_t, end)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->error = error < 0 ? -error : 0;
+ __entry->cmd = cmd;
+ __entry->type = request->fl_type;
+ __entry->start = request->fl_start;
+ __entry->end = request->fl_end;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) cmd=%s:%s range=%lld:%lld "
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ show_fs_fcntl_cmd(__entry->cmd),
+ show_fs_fcntl_lock_type(__entry->type),
+ (long long)__entry->start,
+ (long long)__entry->end,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
+#define DEFINE_NFS4_LOCK_EVENT(name) \
+ DEFINE_EVENT(nfs4_lock_event, name, \
+ TP_PROTO( \
+ const struct file_lock *request, \
+ const struct nfs4_state *state, \
+ int cmd, \
+ int error \
+ ), \
+ TP_ARGS(request, state, cmd, error))
+DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock);
+DEFINE_NFS4_LOCK_EVENT(nfs4_unlock);
+
+TRACE_EVENT(nfs4_set_lock,
+ TP_PROTO(
+ const struct file_lock *request,
+ const struct nfs4_state *state,
+ const nfs4_stateid *lockstateid,
+ int cmd,
+ int error
+ ),
+
+ TP_ARGS(request, state, lockstateid, cmd, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(unsigned long, cmd)
+ __field(unsigned long, type)
+ __field(loff_t, start)
+ __field(loff_t, end)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, lockstateid_seq)
+ __field(u32, lockstateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->error = error < 0 ? -error : 0;
+ __entry->cmd = cmd;
+ __entry->type = request->fl_type;
+ __entry->start = request->fl_start;
+ __entry->end = request->fl_end;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ __entry->lockstateid_seq =
+ be32_to_cpu(lockstateid->seqid);
+ __entry->lockstateid_hash =
+ nfs_stateid_hash(lockstateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) cmd=%s:%s range=%lld:%lld "
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x lockstateid=%d:0x%08x",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ show_fs_fcntl_cmd(__entry->cmd),
+ show_fs_fcntl_lock_type(__entry->type),
+ (long long)__entry->start,
+ (long long)__entry->end,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->lockstateid_seq, __entry->lockstateid_hash
+ )
+);
+
+TRACE_DEFINE_ENUM(LK_STATE_IN_USE);
+TRACE_DEFINE_ENUM(NFS_DELEGATED_STATE);
+TRACE_DEFINE_ENUM(NFS_OPEN_STATE);
+TRACE_DEFINE_ENUM(NFS_O_RDONLY_STATE);
+TRACE_DEFINE_ENUM(NFS_O_WRONLY_STATE);
+TRACE_DEFINE_ENUM(NFS_O_RDWR_STATE);
+TRACE_DEFINE_ENUM(NFS_STATE_RECLAIM_REBOOT);
+TRACE_DEFINE_ENUM(NFS_STATE_RECLAIM_NOGRACE);
+TRACE_DEFINE_ENUM(NFS_STATE_POSIX_LOCKS);
+TRACE_DEFINE_ENUM(NFS_STATE_RECOVERY_FAILED);
+TRACE_DEFINE_ENUM(NFS_STATE_MAY_NOTIFY_LOCK);
+TRACE_DEFINE_ENUM(NFS_STATE_CHANGE_WAIT);
+TRACE_DEFINE_ENUM(NFS_CLNT_DST_SSC_COPY_STATE);
+TRACE_DEFINE_ENUM(NFS_CLNT_SRC_SSC_COPY_STATE);
+TRACE_DEFINE_ENUM(NFS_SRV_SSC_COPY_STATE);
+
+#define show_nfs4_state_flags(flags) \
+ __print_flags(flags, "|", \
+ { LK_STATE_IN_USE, "IN_USE" }, \
+ { NFS_DELEGATED_STATE, "DELEGATED" }, \
+ { NFS_OPEN_STATE, "OPEN" }, \
+ { NFS_O_RDONLY_STATE, "O_RDONLY" }, \
+ { NFS_O_WRONLY_STATE, "O_WRONLY" }, \
+ { NFS_O_RDWR_STATE, "O_RDWR" }, \
+ { NFS_STATE_RECLAIM_REBOOT, "RECLAIM_REBOOT" }, \
+ { NFS_STATE_RECLAIM_NOGRACE, "RECLAIM_NOGRACE" }, \
+ { NFS_STATE_POSIX_LOCKS, "POSIX_LOCKS" }, \
+ { NFS_STATE_RECOVERY_FAILED, "RECOVERY_FAILED" }, \
+ { NFS_STATE_MAY_NOTIFY_LOCK, "MAY_NOTIFY_LOCK" }, \
+ { NFS_STATE_CHANGE_WAIT, "CHANGE_WAIT" }, \
+ { NFS_CLNT_DST_SSC_COPY_STATE, "CLNT_DST_SSC_COPY" }, \
+ { NFS_CLNT_SRC_SSC_COPY_STATE, "CLNT_SRC_SSC_COPY" }, \
+ { NFS_SRV_SSC_COPY_STATE, "SRV_SSC_COPY" })
+
+#define show_nfs4_lock_flags(flags) \
+ __print_flags(flags, "|", \
+ { BIT(NFS_LOCK_INITIALIZED), "INITIALIZED" }, \
+ { BIT(NFS_LOCK_LOST), "LOST" })
+
+TRACE_EVENT(nfs4_state_lock_reclaim,
+ TP_PROTO(
+ const struct nfs4_state *state,
+ const struct nfs4_lock_state *lock
+ ),
+
+ TP_ARGS(state, lock),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned long, state_flags)
+ __field(unsigned long, lock_flags)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->state_flags = state->flags;
+ __entry->lock_flags = lock->ls_flags;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x state_flags=%s lock_flags=%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid, __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ show_nfs4_state_flags(__entry->state_flags),
+ show_nfs4_lock_flags(__entry->lock_flags)
+ )
+)
+
+DECLARE_EVENT_CLASS(nfs4_set_delegation_event,
+ TP_PROTO(
+ const struct inode *inode,
+ fmode_t fmode
+ ),
+
+ TP_ARGS(inode, fmode),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned int, fmode)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->fmode = (__force unsigned int)fmode;
+ ),
+
+ TP_printk(
+ "fmode=%s fileid=%02x:%02x:%llu fhandle=0x%08x",
+ show_fs_fmode_flags(__entry->fmode),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle
+ )
+);
+#define DEFINE_NFS4_SET_DELEGATION_EVENT(name) \
+ DEFINE_EVENT(nfs4_set_delegation_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ fmode_t fmode \
+ ), \
+ TP_ARGS(inode, fmode))
+DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_set_delegation);
+DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_reclaim_delegation);
+
+TRACE_EVENT(nfs4_delegreturn_exit,
+ TP_PROTO(
+ const struct nfs4_delegreturnargs *args,
+ const struct nfs4_delegreturnres *res,
+ int error
+ ),
+
+ TP_ARGS(args, res, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(unsigned long, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = res->server->s_dev;
+ __entry->fhandle = nfs_fhandle_hash(args->fhandle);
+ __entry->error = error < 0 ? -error : 0;
+ __entry->stateid_seq =
+ be32_to_cpu(args->stateid->seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(args->stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) dev=%02x:%02x fhandle=0x%08x "
+ "stateid=%d:0x%08x",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
+#ifdef CONFIG_NFS_V4_1
+DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
+ TP_PROTO(
+ const struct nfs4_state *state,
+ const struct nfs4_lock_state *lsp,
+ int error
+ ),
+
+ TP_ARGS(state, lsp, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->error = error < 0 ? -error : 0;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
+#define DEFINE_NFS4_TEST_STATEID_EVENT(name) \
+ DEFINE_EVENT(nfs4_test_stateid_event, name, \
+ TP_PROTO( \
+ const struct nfs4_state *state, \
+ const struct nfs4_lock_state *lsp, \
+ int error \
+ ), \
+ TP_ARGS(state, lsp, error))
+DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_delegation_stateid);
+DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_open_stateid);
+DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_lock_stateid);
+#endif /* CONFIG_NFS_V4_1 */
+
+DECLARE_EVENT_CLASS(nfs4_lookup_event,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct qstr *name,
+ int error
+ ),
+
+ TP_ARGS(dir, name, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, error)
+ __field(u64, dir)
+ __string(name, name->name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->error = -error;
+ __assign_str(name, name->name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) name=%02x:%02x:%llu/%s",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+#define DEFINE_NFS4_LOOKUP_EVENT(name) \
+ DEFINE_EVENT(nfs4_lookup_event, name, \
+ TP_PROTO( \
+ const struct inode *dir, \
+ const struct qstr *name, \
+ int error \
+ ), \
+ TP_ARGS(dir, name, error))
+
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_lookup);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_symlink);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_mkdir);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_mknod);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_remove);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_get_fs_locations);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_secinfo);
+
+TRACE_EVENT(nfs4_lookupp,
+ TP_PROTO(
+ const struct inode *inode,
+ int error
+ ),
+
+ TP_ARGS(inode, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, ino)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = NFS_FILEID(inode);
+ __entry->error = error < 0 ? -error : 0;
+ ),
+
+ TP_printk(
+ "error=%ld (%s) inode=%02x:%02x:%llu",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->ino
+ )
+);
+
+TRACE_EVENT(nfs4_rename,
+ TP_PROTO(
+ const struct inode *olddir,
+ const struct qstr *oldname,
+ const struct inode *newdir,
+ const struct qstr *newname,
+ int error
+ ),
+
+ TP_ARGS(olddir, oldname, newdir, newname, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, error)
+ __field(u64, olddir)
+ __string(oldname, oldname->name)
+ __field(u64, newdir)
+ __string(newname, newname->name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = olddir->i_sb->s_dev;
+ __entry->olddir = NFS_FILEID(olddir);
+ __entry->newdir = NFS_FILEID(newdir);
+ __entry->error = error < 0 ? -error : 0;
+ __assign_str(oldname, oldname->name);
+ __assign_str(newname, newname->name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) oldname=%02x:%02x:%llu/%s "
+ "newname=%02x:%02x:%llu/%s",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->olddir,
+ __get_str(oldname),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->newdir,
+ __get_str(newname)
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs4_inode_event,
+ TP_PROTO(
+ const struct inode *inode,
+ int error
+ ),
+
+ TP_ARGS(inode, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->error = error < 0 ? -error : 0;
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle
+ )
+);
+
+#define DEFINE_NFS4_INODE_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ int error \
+ ), \
+ TP_ARGS(inode, error))
+
+DEFINE_NFS4_INODE_EVENT(nfs4_access);
+DEFINE_NFS4_INODE_EVENT(nfs4_readlink);
+DEFINE_NFS4_INODE_EVENT(nfs4_readdir);
+DEFINE_NFS4_INODE_EVENT(nfs4_get_acl);
+DEFINE_NFS4_INODE_EVENT(nfs4_set_acl);
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label);
+DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label);
+#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
+
+DECLARE_EVENT_CLASS(nfs4_inode_stateid_event,
+ TP_PROTO(
+ const struct inode *inode,
+ const nfs4_stateid *stateid,
+ int error
+ ),
+
+ TP_ARGS(inode, stateid, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned long, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->error = error < 0 ? -error : 0;
+ __entry->stateid_seq =
+ be32_to_cpu(stateid->seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
+#define DEFINE_NFS4_INODE_STATEID_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_stateid_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ const nfs4_stateid *stateid, \
+ int error \
+ ), \
+ TP_ARGS(inode, stateid, error))
+
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update_wait);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_close_stateid_update_wait);
+
+DECLARE_EVENT_CLASS(nfs4_getattr_event,
+ TP_PROTO(
+ const struct nfs_server *server,
+ const struct nfs_fh *fhandle,
+ const struct nfs_fattr *fattr,
+ int error
+ ),
+
+ TP_ARGS(server, fhandle, fattr, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned int, valid)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = server->s_dev;
+ __entry->valid = fattr->valid;
+ __entry->fhandle = nfs_fhandle_hash(fhandle);
+ __entry->fileid = (fattr->valid & NFS_ATTR_FATTR_FILEID) ? fattr->fileid : 0;
+ __entry->error = error < 0 ? -error : 0;
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "valid=%s",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ show_nfs_fattr_flags(__entry->valid)
+ )
+);
+
+#define DEFINE_NFS4_GETATTR_EVENT(name) \
+ DEFINE_EVENT(nfs4_getattr_event, name, \
+ TP_PROTO( \
+ const struct nfs_server *server, \
+ const struct nfs_fh *fhandle, \
+ const struct nfs_fattr *fattr, \
+ int error \
+ ), \
+ TP_ARGS(server, fhandle, fattr, error))
+DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr);
+DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root);
+DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo);
+
+DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const struct nfs_fh *fhandle,
+ const struct inode *inode,
+ int error
+ ),
+
+ TP_ARGS(clp, fhandle, inode, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __string(dstaddr, clp ? clp->cl_hostname : "unknown")
+ ),
+
+ TP_fast_assign(
+ __entry->error = error < 0 ? -error : 0;
+ __entry->fhandle = nfs_fhandle_hash(fhandle);
+ if (!IS_ERR_OR_NULL(inode)) {
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ } else {
+ __entry->fileid = 0;
+ __entry->dev = 0;
+ }
+ __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown");
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "dstaddr=%s",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __get_str(dstaddr)
+ )
+);
+
+#define DEFINE_NFS4_INODE_CALLBACK_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_callback_event, name, \
+ TP_PROTO( \
+ const struct nfs_client *clp, \
+ const struct nfs_fh *fhandle, \
+ const struct inode *inode, \
+ int error \
+ ), \
+ TP_ARGS(clp, fhandle, inode, error))
+DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr);
+
+DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const struct nfs_fh *fhandle,
+ const struct inode *inode,
+ const nfs4_stateid *stateid,
+ int error
+ ),
+
+ TP_ARGS(clp, fhandle, inode, stateid, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __string(dstaddr, clp ? clp->cl_hostname : "unknown")
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error < 0 ? -error : 0;
+ __entry->fhandle = nfs_fhandle_hash(fhandle);
+ if (!IS_ERR_OR_NULL(inode)) {
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ } else {
+ __entry->fileid = 0;
+ __entry->dev = 0;
+ }
+ __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown");
+ __entry->stateid_seq =
+ be32_to_cpu(stateid->seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x dstaddr=%s",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __get_str(dstaddr)
+ )
+);
+
+#define DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_stateid_callback_event, name, \
+ TP_PROTO( \
+ const struct nfs_client *clp, \
+ const struct nfs_fh *fhandle, \
+ const struct inode *inode, \
+ const nfs4_stateid *stateid, \
+ int error \
+ ), \
+ TP_ARGS(clp, fhandle, inode, stateid, error))
+DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall);
+DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file);
+
+DECLARE_EVENT_CLASS(nfs4_idmap_event,
+ TP_PROTO(
+ const char *name,
+ int len,
+ u32 id,
+ int error
+ ),
+
+ TP_ARGS(name, len, id, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, id)
+ __dynamic_array(char, name, len > 0 ? len + 1 : 1)
+ ),
+
+ TP_fast_assign(
+ if (len < 0)
+ len = 0;
+ __entry->error = error < 0 ? error : 0;
+ __entry->id = id;
+ memcpy(__get_str(name), name, len);
+ __get_str(name)[len] = 0;
+ ),
+
+ TP_printk(
+ "error=%ld (%s) id=%u name=%s",
+ -__entry->error, show_nfs4_status(__entry->error),
+ __entry->id,
+ __get_str(name)
+ )
+);
+#define DEFINE_NFS4_IDMAP_EVENT(name) \
+ DEFINE_EVENT(nfs4_idmap_event, name, \
+ TP_PROTO( \
+ const char *name, \
+ int len, \
+ u32 id, \
+ int error \
+ ), \
+ TP_ARGS(name, len, id, error))
+DEFINE_NFS4_IDMAP_EVENT(nfs4_map_name_to_uid);
+DEFINE_NFS4_IDMAP_EVENT(nfs4_map_group_to_gid);
+DEFINE_NFS4_IDMAP_EVENT(nfs4_map_uid_to_name);
+DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);
+
+#ifdef CONFIG_NFS_V4_1
+#define NFS4_LSEG_LAYOUT_STATEID_HASH(lseg) \
+ (lseg ? nfs_stateid_hash(&lseg->pls_layout->plh_stateid) : 0)
+#else
+#define NFS4_LSEG_LAYOUT_STATEID_HASH(lseg) (0)
+#endif
+
+DECLARE_EVENT_CLASS(nfs4_read_event,
+ TP_PROTO(
+ const struct nfs_pgio_header *hdr,
+ int error
+ ),
+
+ TP_ARGS(hdr, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, arg_count)
+ __field(u32, res_count)
+ __field(unsigned long, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh;
+ const struct nfs4_state *state =
+ hdr->args.context->state;
+ const struct pnfs_layout_segment *lseg = hdr->lseg;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->offset = hdr->args.offset;
+ __entry->arg_count = hdr->args.count;
+ __entry->res_count = hdr->res.count;
+ __entry->error = error < 0 ? -error : 0;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ __entry->layoutstateid_seq = lseg ? lseg->pls_seq : 0;
+ __entry->layoutstateid_hash =
+ NFS4_LSEG_LAYOUT_STATEID_HASH(lseg);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u stateid=%d:0x%08x "
+ "layoutstateid=%d:0x%08x",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset,
+ __entry->arg_count, __entry->res_count,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
+ )
+);
+#define DEFINE_NFS4_READ_EVENT(name) \
+ DEFINE_EVENT(nfs4_read_event, name, \
+ TP_PROTO( \
+ const struct nfs_pgio_header *hdr, \
+ int error \
+ ), \
+ TP_ARGS(hdr, error))
+DEFINE_NFS4_READ_EVENT(nfs4_read);
+#ifdef CONFIG_NFS_V4_1
+DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
+#endif /* CONFIG_NFS_V4_1 */
+
+DECLARE_EVENT_CLASS(nfs4_write_event,
+ TP_PROTO(
+ const struct nfs_pgio_header *hdr,
+ int error
+ ),
+
+ TP_ARGS(hdr, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, arg_count)
+ __field(u32, res_count)
+ __field(unsigned long, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh;
+ const struct nfs4_state *state =
+ hdr->args.context->state;
+ const struct pnfs_layout_segment *lseg = hdr->lseg;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->offset = hdr->args.offset;
+ __entry->arg_count = hdr->args.count;
+ __entry->res_count = hdr->res.count;
+ __entry->error = error < 0 ? -error : 0;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ __entry->layoutstateid_seq = lseg ? lseg->pls_seq : 0;
+ __entry->layoutstateid_hash =
+ NFS4_LSEG_LAYOUT_STATEID_HASH(lseg);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u stateid=%d:0x%08x "
+ "layoutstateid=%d:0x%08x",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset,
+ __entry->arg_count, __entry->res_count,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
+ )
+);
+
+#define DEFINE_NFS4_WRITE_EVENT(name) \
+ DEFINE_EVENT(nfs4_write_event, name, \
+ TP_PROTO( \
+ const struct nfs_pgio_header *hdr, \
+ int error \
+ ), \
+ TP_ARGS(hdr, error))
+DEFINE_NFS4_WRITE_EVENT(nfs4_write);
+#ifdef CONFIG_NFS_V4_1
+DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write);
+#endif /* CONFIG_NFS_V4_1 */
+
+DECLARE_EVENT_CLASS(nfs4_commit_event,
+ TP_PROTO(
+ const struct nfs_commit_data *data,
+ int error
+ ),
+
+ TP_ARGS(data, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned long, error)
+ __field(loff_t, offset)
+ __field(u32, count)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = data->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = data->args.fh ?
+ data->args.fh : &nfsi->fh;
+ const struct pnfs_layout_segment *lseg = data->lseg;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->offset = data->args.offset;
+ __entry->count = data->args.count;
+ __entry->error = error < 0 ? -error : 0;
+ __entry->layoutstateid_seq = lseg ? lseg->pls_seq : 0;
+ __entry->layoutstateid_hash =
+ NFS4_LSEG_LAYOUT_STATEID_HASH(lseg);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u layoutstateid=%d:0x%08x",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset,
+ __entry->count,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
+ )
+);
+#define DEFINE_NFS4_COMMIT_EVENT(name) \
+ DEFINE_EVENT(nfs4_commit_event, name, \
+ TP_PROTO( \
+ const struct nfs_commit_data *data, \
+ int error \
+ ), \
+ TP_ARGS(data, error))
+DEFINE_NFS4_COMMIT_EVENT(nfs4_commit);
+#ifdef CONFIG_NFS_V4_1
+DEFINE_NFS4_COMMIT_EVENT(nfs4_pnfs_commit_ds);
+
+TRACE_EVENT(nfs4_layoutget,
+ TP_PROTO(
+ const struct nfs_open_context *ctx,
+ const struct pnfs_layout_range *args,
+ const struct pnfs_layout_range *res,
+ const nfs4_stateid *layout_stateid,
+ int error
+ ),
+
+ TP_ARGS(ctx, args, res, layout_stateid, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u32, iomode)
+ __field(u64, offset)
+ __field(u64, count)
+ __field(unsigned long, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = d_inode(ctx->dentry);
+ const struct nfs4_state *state = ctx->state;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->iomode = args->iomode;
+ __entry->offset = args->offset;
+ __entry->count = args->length;
+ __entry->error = error < 0 ? -error : 0;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ if (!error) {
+ __entry->layoutstateid_seq =
+ be32_to_cpu(layout_stateid->seqid);
+ __entry->layoutstateid_hash =
+ nfs_stateid_hash(layout_stateid);
+ } else {
+ __entry->layoutstateid_seq = 0;
+ __entry->layoutstateid_hash = 0;
+ }
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "iomode=%s offset=%llu count=%llu stateid=%d:0x%08x "
+ "layoutstateid=%d:0x%08x",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ show_pnfs_layout_iomode(__entry->iomode),
+ (unsigned long long)__entry->offset,
+ (unsigned long long)__entry->count,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
+ )
+);
+
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutcommit);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn_on_close);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layouterror);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutstats);
+
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_UNKNOWN);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_NO_PNFS);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_MDSTHRESH);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_NOMEM);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_BULK_RECALL);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_FOUND_CACHED);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RETURN);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_BLOCKED);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_INVALID_OPEN);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RETRY);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_EXIT);
+
+#define show_pnfs_update_layout_reason(reason) \
+ __print_symbolic(reason, \
+ { PNFS_UPDATE_LAYOUT_UNKNOWN, "unknown" }, \
+ { PNFS_UPDATE_LAYOUT_NO_PNFS, "no pnfs" }, \
+ { PNFS_UPDATE_LAYOUT_RD_ZEROLEN, "read+zerolen" }, \
+ { PNFS_UPDATE_LAYOUT_MDSTHRESH, "mdsthresh" }, \
+ { PNFS_UPDATE_LAYOUT_NOMEM, "nomem" }, \
+ { PNFS_UPDATE_LAYOUT_BULK_RECALL, "bulk recall" }, \
+ { PNFS_UPDATE_LAYOUT_IO_TEST_FAIL, "io test fail" }, \
+ { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \
+ { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \
+ { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \
+ { PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" }, \
+ { PNFS_UPDATE_LAYOUT_RETRY, "retrying" }, \
+ { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }, \
+ { PNFS_UPDATE_LAYOUT_EXIT, "exit" })
+
+TRACE_EVENT(pnfs_update_layout,
+ TP_PROTO(struct inode *inode,
+ loff_t pos,
+ u64 count,
+ enum pnfs_iomode iomode,
+ struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ enum pnfs_update_layout_reason reason
+ ),
+ TP_ARGS(inode, pos, count, iomode, lo, lseg, reason),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u32, fhandle)
+ __field(loff_t, pos)
+ __field(u64, count)
+ __field(enum pnfs_iomode, iomode)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
+ __field(long, lseg)
+ __field(enum pnfs_update_layout_reason, reason)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->pos = pos;
+ __entry->count = count;
+ __entry->iomode = iomode;
+ __entry->reason = reason;
+ if (lo != NULL && pnfs_layout_is_valid(lo)) {
+ __entry->layoutstateid_seq =
+ be32_to_cpu(lo->plh_stateid.seqid);
+ __entry->layoutstateid_hash =
+ nfs_stateid_hash(&lo->plh_stateid);
+ } else {
+ __entry->layoutstateid_seq = 0;
+ __entry->layoutstateid_hash = 0;
+ }
+ __entry->lseg = (long)lseg;
+ ),
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "iomode=%s pos=%llu count=%llu "
+ "layoutstateid=%d:0x%08x lseg=0x%lx (%s)",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ show_pnfs_layout_iomode(__entry->iomode),
+ (unsigned long long)__entry->pos,
+ (unsigned long long)__entry->count,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash,
+ __entry->lseg,
+ show_pnfs_update_layout_reason(__entry->reason)
+ )
+);
+
+DECLARE_EVENT_CLASS(pnfs_layout_event,
+ TP_PROTO(struct inode *inode,
+ loff_t pos,
+ u64 count,
+ enum pnfs_iomode iomode,
+ struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg
+ ),
+ TP_ARGS(inode, pos, count, iomode, lo, lseg),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u32, fhandle)
+ __field(loff_t, pos)
+ __field(u64, count)
+ __field(enum pnfs_iomode, iomode)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
+ __field(long, lseg)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->pos = pos;
+ __entry->count = count;
+ __entry->iomode = iomode;
+ if (lo != NULL && pnfs_layout_is_valid(lo)) {
+ __entry->layoutstateid_seq =
+ be32_to_cpu(lo->plh_stateid.seqid);
+ __entry->layoutstateid_hash =
+ nfs_stateid_hash(&lo->plh_stateid);
+ } else {
+ __entry->layoutstateid_seq = 0;
+ __entry->layoutstateid_hash = 0;
+ }
+ __entry->lseg = (long)lseg;
+ ),
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "iomode=%s pos=%llu count=%llu "
+ "layoutstateid=%d:0x%08x lseg=0x%lx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ show_pnfs_layout_iomode(__entry->iomode),
+ (unsigned long long)__entry->pos,
+ (unsigned long long)__entry->count,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash,
+ __entry->lseg
+ )
+);
+
+#define DEFINE_PNFS_LAYOUT_EVENT(name) \
+ DEFINE_EVENT(pnfs_layout_event, name, \
+ TP_PROTO(struct inode *inode, \
+ loff_t pos, \
+ u64 count, \
+ enum pnfs_iomode iomode, \
+ struct pnfs_layout_hdr *lo, \
+ struct pnfs_layout_segment *lseg \
+ ), \
+ TP_ARGS(inode, pos, count, iomode, lo, lseg))
+
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_init_read);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_init_write);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_get_mirror_count);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_done);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_done);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_pagelist);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_pagelist);
+
+DECLARE_EVENT_CLASS(nfs4_deviceid_event,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const struct nfs4_deviceid *deviceid
+ ),
+
+ TP_ARGS(clp, deviceid),
+
+ TP_STRUCT__entry(
+ __string(dstaddr, clp->cl_hostname)
+ __array(unsigned char, deviceid, NFS4_DEVICEID4_SIZE)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dstaddr, clp->cl_hostname);
+ memcpy(__entry->deviceid, deviceid->data,
+ NFS4_DEVICEID4_SIZE);
+ ),
+
+ TP_printk(
+ "deviceid=%s, dstaddr=%s",
+ __print_hex(__entry->deviceid, NFS4_DEVICEID4_SIZE),
+ __get_str(dstaddr)
+ )
+);
+#define DEFINE_PNFS_DEVICEID_EVENT(name) \
+ DEFINE_EVENT(nfs4_deviceid_event, name, \
+ TP_PROTO(const struct nfs_client *clp, \
+ const struct nfs4_deviceid *deviceid \
+ ), \
+ TP_ARGS(clp, deviceid))
+DEFINE_PNFS_DEVICEID_EVENT(nfs4_deviceid_free);
+
+DECLARE_EVENT_CLASS(nfs4_deviceid_status,
+ TP_PROTO(
+ const struct nfs_server *server,
+ const struct nfs4_deviceid *deviceid,
+ int status
+ ),
+
+ TP_ARGS(server, deviceid, status),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, status)
+ __string(dstaddr, server->nfs_client->cl_hostname)
+ __array(unsigned char, deviceid, NFS4_DEVICEID4_SIZE)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = server->s_dev;
+ __entry->status = status;
+ __assign_str(dstaddr, server->nfs_client->cl_hostname);
+ memcpy(__entry->deviceid, deviceid->data,
+ NFS4_DEVICEID4_SIZE);
+ ),
+
+ TP_printk(
+ "dev=%02x:%02x: deviceid=%s, dstaddr=%s, status=%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_hex(__entry->deviceid, NFS4_DEVICEID4_SIZE),
+ __get_str(dstaddr),
+ __entry->status
+ )
+);
+#define DEFINE_PNFS_DEVICEID_STATUS(name) \
+ DEFINE_EVENT(nfs4_deviceid_status, name, \
+ TP_PROTO(const struct nfs_server *server, \
+ const struct nfs4_deviceid *deviceid, \
+ int status \
+ ), \
+ TP_ARGS(server, deviceid, status))
+DEFINE_PNFS_DEVICEID_STATUS(nfs4_getdeviceinfo);
+DEFINE_PNFS_DEVICEID_STATUS(nfs4_find_deviceid);
+
+DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
+ TP_PROTO(
+ const struct nfs_pgio_header *hdr
+ ),
+
+ TP_ARGS(hdr),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, count)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __string(dstaddr, hdr->ds_clp ?
+ rpc_peeraddr2str(hdr->ds_clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown")
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+
+ __entry->error = hdr->res.op_status;
+ __entry->fhandle = nfs_fhandle_hash(hdr->args.fh);
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->offset = hdr->args.offset;
+ __entry->count = hdr->args.count;
+ __entry->stateid_seq =
+ be32_to_cpu(hdr->args.stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&hdr->args.stateid);
+ __assign_str(dstaddr, hdr->ds_clp ?
+ rpc_peeraddr2str(hdr->ds_clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown");
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->offset, __entry->count,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __get_str(dstaddr)
+ )
+);
+
+#define DEFINE_NFS4_FLEXFILES_IO_EVENT(name) \
+ DEFINE_EVENT(nfs4_flexfiles_io_event, name, \
+ TP_PROTO( \
+ const struct nfs_pgio_header *hdr \
+ ), \
+ TP_ARGS(hdr))
+DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_read_error);
+DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_write_error);
+
+TRACE_EVENT(ff_layout_commit_error,
+ TP_PROTO(
+ const struct nfs_commit_data *data
+ ),
+
+ TP_ARGS(data),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, count)
+ __string(dstaddr, data->ds_clp ?
+ rpc_peeraddr2str(data->ds_clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown")
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = data->inode;
+
+ __entry->error = data->res.op_status;
+ __entry->fhandle = nfs_fhandle_hash(data->args.fh);
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->offset = data->args.offset;
+ __entry->count = data->args.count;
+ __assign_str(dstaddr, data->ds_clp ?
+ rpc_peeraddr2str(data->ds_clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown");
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%llu count=%u dstaddr=%s",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->offset, __entry->count,
+ __get_str(dstaddr)
+ )
+);
+
+#ifdef CONFIG_NFS_V4_2
+TRACE_DEFINE_ENUM(NFS4_CONTENT_DATA);
+TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE);
+
+#define show_llseek_mode(what) \
+ __print_symbolic(what, \
+ { NFS4_CONTENT_DATA, "DATA" }, \
+ { NFS4_CONTENT_HOLE, "HOLE" })
+
+TRACE_EVENT(nfs4_llseek,
+ TP_PROTO(
+ const struct inode *inode,
+ const struct nfs42_seek_args *args,
+ const struct nfs42_seek_res *res,
+ int error
+ ),
+
+ TP_ARGS(inode, args, res, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, fhandle)
+ __field(u32, fileid)
+ __field(dev_t, dev)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(loff_t, offset_s)
+ __field(u32, what)
+ __field(loff_t, offset_r)
+ __field(u32, eof)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = args->sa_fh;
+
+ __entry->fileid = nfsi->fileid;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->offset_s = args->sa_offset;
+ __entry->stateid_seq =
+ be32_to_cpu(args->sa_stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->sa_stateid);
+ __entry->what = args->sa_what;
+ if (error) {
+ __entry->error = -error;
+ __entry->offset_r = 0;
+ __entry->eof = 0;
+ } else {
+ __entry->error = 0;
+ __entry->offset_r = res->sr_offset;
+ __entry->eof = res->sr_eof;
+ }
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x offset_s=%llu what=%s "
+ "offset_r=%llu eof=%u",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->offset_s,
+ show_llseek_mode(__entry->what),
+ __entry->offset_r,
+ __entry->eof
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs4_sparse_event,
+ TP_PROTO(
+ const struct inode *inode,
+ const struct nfs42_falloc_args *args,
+ int error
+ ),
+
+ TP_ARGS(inode, args, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(loff_t, offset)
+ __field(loff_t, len)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error < 0 ? -error : 0;
+ __entry->offset = args->falloc_offset;
+ __entry->len = args->falloc_length;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(args->falloc_stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->falloc_stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x offset=%llu len=%llu",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ (long long)__entry->offset,
+ (long long)__entry->len
+ )
+);
+#define DEFINE_NFS4_SPARSE_EVENT(name) \
+ DEFINE_EVENT(nfs4_sparse_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ const struct nfs42_falloc_args *args, \
+ int error \
+ ), \
+ TP_ARGS(inode, args, error))
+DEFINE_NFS4_SPARSE_EVENT(nfs4_fallocate);
+DEFINE_NFS4_SPARSE_EVENT(nfs4_deallocate);
+
+TRACE_EVENT(nfs4_copy,
+ TP_PROTO(
+ const struct inode *src_inode,
+ const struct inode *dst_inode,
+ const struct nfs42_copy_args *args,
+ const struct nfs42_copy_res *res,
+ const struct nl4_server *nss,
+ int error
+ ),
+
+ TP_ARGS(src_inode, dst_inode, args, res, nss, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, src_fhandle)
+ __field(u32, src_fileid)
+ __field(u32, dst_fhandle)
+ __field(u32, dst_fileid)
+ __field(dev_t, src_dev)
+ __field(dev_t, dst_dev)
+ __field(int, src_stateid_seq)
+ __field(u32, src_stateid_hash)
+ __field(int, dst_stateid_seq)
+ __field(u32, dst_stateid_hash)
+ __field(loff_t, src_offset)
+ __field(loff_t, dst_offset)
+ __field(bool, sync)
+ __field(loff_t, len)
+ __field(int, res_stateid_seq)
+ __field(u32, res_stateid_hash)
+ __field(loff_t, res_count)
+ __field(bool, res_sync)
+ __field(bool, res_cons)
+ __field(bool, intra)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *src_nfsi = NFS_I(src_inode);
+ const struct nfs_inode *dst_nfsi = NFS_I(dst_inode);
+
+ __entry->src_fileid = src_nfsi->fileid;
+ __entry->src_dev = src_inode->i_sb->s_dev;
+ __entry->src_fhandle = nfs_fhandle_hash(args->src_fh);
+ __entry->src_offset = args->src_pos;
+ __entry->dst_fileid = dst_nfsi->fileid;
+ __entry->dst_dev = dst_inode->i_sb->s_dev;
+ __entry->dst_fhandle = nfs_fhandle_hash(args->dst_fh);
+ __entry->dst_offset = args->dst_pos;
+ __entry->len = args->count;
+ __entry->sync = args->sync;
+ __entry->src_stateid_seq =
+ be32_to_cpu(args->src_stateid.seqid);
+ __entry->src_stateid_hash =
+ nfs_stateid_hash(&args->src_stateid);
+ __entry->dst_stateid_seq =
+ be32_to_cpu(args->dst_stateid.seqid);
+ __entry->dst_stateid_hash =
+ nfs_stateid_hash(&args->dst_stateid);
+ __entry->intra = nss ? 0 : 1;
+ if (error) {
+ __entry->error = -error;
+ __entry->res_stateid_seq = 0;
+ __entry->res_stateid_hash = 0;
+ __entry->res_count = 0;
+ __entry->res_sync = 0;
+ __entry->res_cons = 0;
+ } else {
+ __entry->error = 0;
+ __entry->res_stateid_seq =
+ be32_to_cpu(res->write_res.stateid.seqid);
+ __entry->res_stateid_hash =
+ nfs_stateid_hash(&res->write_res.stateid);
+ __entry->res_count = res->write_res.count;
+ __entry->res_sync = res->synchronous;
+ __entry->res_cons = res->consecutive;
+ }
+ ),
+
+ TP_printk(
+ "error=%ld (%s) intra=%d src_fileid=%02x:%02x:%llu "
+ "src_fhandle=0x%08x dst_fileid=%02x:%02x:%llu "
+ "dst_fhandle=0x%08x src_stateid=%d:0x%08x "
+ "dst_stateid=%d:0x%08x src_offset=%llu dst_offset=%llu "
+ "len=%llu sync=%d cb_stateid=%d:0x%08x res_sync=%d "
+ "res_cons=%d res_count=%llu",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ __entry->intra,
+ MAJOR(__entry->src_dev), MINOR(__entry->src_dev),
+ (unsigned long long)__entry->src_fileid,
+ __entry->src_fhandle,
+ MAJOR(__entry->dst_dev), MINOR(__entry->dst_dev),
+ (unsigned long long)__entry->dst_fileid,
+ __entry->dst_fhandle,
+ __entry->src_stateid_seq, __entry->src_stateid_hash,
+ __entry->dst_stateid_seq, __entry->dst_stateid_hash,
+ __entry->src_offset,
+ __entry->dst_offset,
+ __entry->len,
+ __entry->sync,
+ __entry->res_stateid_seq, __entry->res_stateid_hash,
+ __entry->res_sync,
+ __entry->res_cons,
+ __entry->res_count
+ )
+);
+
+TRACE_EVENT(nfs4_clone,
+ TP_PROTO(
+ const struct inode *src_inode,
+ const struct inode *dst_inode,
+ const struct nfs42_clone_args *args,
+ int error
+ ),
+
+ TP_ARGS(src_inode, dst_inode, args, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, src_fhandle)
+ __field(u32, src_fileid)
+ __field(u32, dst_fhandle)
+ __field(u32, dst_fileid)
+ __field(dev_t, src_dev)
+ __field(dev_t, dst_dev)
+ __field(loff_t, src_offset)
+ __field(loff_t, dst_offset)
+ __field(int, src_stateid_seq)
+ __field(u32, src_stateid_hash)
+ __field(int, dst_stateid_seq)
+ __field(u32, dst_stateid_hash)
+ __field(loff_t, len)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *src_nfsi = NFS_I(src_inode);
+ const struct nfs_inode *dst_nfsi = NFS_I(dst_inode);
+
+ __entry->src_fileid = src_nfsi->fileid;
+ __entry->src_dev = src_inode->i_sb->s_dev;
+ __entry->src_fhandle = nfs_fhandle_hash(args->src_fh);
+ __entry->src_offset = args->src_offset;
+ __entry->dst_fileid = dst_nfsi->fileid;
+ __entry->dst_dev = dst_inode->i_sb->s_dev;
+ __entry->dst_fhandle = nfs_fhandle_hash(args->dst_fh);
+ __entry->dst_offset = args->dst_offset;
+ __entry->len = args->count;
+ __entry->error = error < 0 ? -error : 0;
+ __entry->src_stateid_seq =
+ be32_to_cpu(args->src_stateid.seqid);
+ __entry->src_stateid_hash =
+ nfs_stateid_hash(&args->src_stateid);
+ __entry->dst_stateid_seq =
+ be32_to_cpu(args->dst_stateid.seqid);
+ __entry->dst_stateid_hash =
+ nfs_stateid_hash(&args->dst_stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) src_fileid=%02x:%02x:%llu "
+ "src_fhandle=0x%08x dst_fileid=%02x:%02x:%llu "
+ "dst_fhandle=0x%08x src_stateid=%d:0x%08x "
+ "dst_stateid=%d:0x%08x src_offset=%llu "
+ "dst_offset=%llu len=%llu",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->src_dev), MINOR(__entry->src_dev),
+ (unsigned long long)__entry->src_fileid,
+ __entry->src_fhandle,
+ MAJOR(__entry->dst_dev), MINOR(__entry->dst_dev),
+ (unsigned long long)__entry->dst_fileid,
+ __entry->dst_fhandle,
+ __entry->src_stateid_seq, __entry->src_stateid_hash,
+ __entry->dst_stateid_seq, __entry->dst_stateid_hash,
+ __entry->src_offset,
+ __entry->dst_offset,
+ __entry->len
+ )
+);
+
+TRACE_EVENT(nfs4_copy_notify,
+ TP_PROTO(
+ const struct inode *inode,
+ const struct nfs42_copy_notify_args *args,
+ const struct nfs42_copy_notify_res *res,
+ int error
+ ),
+
+ TP_ARGS(inode, args, res, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, fhandle)
+ __field(u32, fileid)
+ __field(dev_t, dev)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, res_stateid_seq)
+ __field(u32, res_stateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->fileid = nfsi->fileid;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fhandle = nfs_fhandle_hash(args->cna_src_fh);
+ __entry->stateid_seq =
+ be32_to_cpu(args->cna_src_stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->cna_src_stateid);
+ if (error) {
+ __entry->error = -error;
+ __entry->res_stateid_seq = 0;
+ __entry->res_stateid_hash = 0;
+ } else {
+ __entry->error = 0;
+ __entry->res_stateid_seq =
+ be32_to_cpu(res->cnr_stateid.seqid);
+ __entry->res_stateid_hash =
+ nfs_stateid_hash(&res->cnr_stateid);
+ }
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x res_stateid=%d:0x%08x",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->res_stateid_seq, __entry->res_stateid_hash
+ )
+);
+
+TRACE_EVENT(nfs4_offload_cancel,
+ TP_PROTO(
+ const struct nfs42_offload_status_args *args,
+ int error
+ ),
+
+ TP_ARGS(args, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, fhandle)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->fhandle = nfs_fhandle_hash(args->osa_src_fh);
+ __entry->error = error < 0 ? -error : 0;
+ __entry->stateid_seq =
+ be32_to_cpu(args->osa_stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->osa_stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fhandle=0x%08x stateid=%d:0x%08x",
+ -__entry->error,
+ show_nfs4_status(__entry->error),
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs4_xattr_event,
+ TP_PROTO(
+ const struct inode *inode,
+ const char *name,
+ int error
+ ),
+
+ TP_ARGS(inode, name, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __string(name, name)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error < 0 ? -error : 0;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __assign_str(name, name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "name=%s",
+ -__entry->error, show_nfs4_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __get_str(name)
+ )
+);
+#define DEFINE_NFS4_XATTR_EVENT(name) \
+ DEFINE_EVENT(nfs4_xattr_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ const char *name, \
+ int error \
+ ), \
+ TP_ARGS(inode, name, error))
+DEFINE_NFS4_XATTR_EVENT(nfs4_getxattr);
+DEFINE_NFS4_XATTR_EVENT(nfs4_setxattr);
+DEFINE_NFS4_XATTR_EVENT(nfs4_removexattr);
+
+DEFINE_NFS4_INODE_EVENT(nfs4_listxattr);
+#endif /* CONFIG_NFS_V4_2 */
+
+#endif /* CONFIG_NFS_V4_1 */
+
+#endif /* _TRACE_NFS4_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE nfs4trace
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
new file mode 100644
index 0000000000..deec76cf5a
--- /dev/null
+++ b/fs/nfs/nfs4xdr.c
@@ -0,0 +1,7721 @@
+/*
+ * fs/nfs/nfs4xdr.c
+ *
+ * Client-side XDR for NFSv4.
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Kendrick Smith <kmsmith@umich.edu>
+ * Andy Adamson <andros@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/param.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/kdev_t.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/msg_prot.h>
+#include <linux/sunrpc/gss_api.h>
+#include <linux/nfs.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+
+#include "nfs4_fs.h"
+#include "nfs4trace.h"
+#include "internal.h"
+#include "nfs4idmap.h"
+#include "nfs4session.h"
+#include "pnfs.h"
+#include "netns.h"
+
+#define NFSDBG_FACILITY NFSDBG_XDR
+
+/* Mapping from NFS error code to "errno" error code. */
+#define errno_NFSERR_IO EIO
+
+struct compound_hdr;
+static int nfs4_stat_to_errno(int);
+static void encode_layoutget(struct xdr_stream *xdr,
+ const struct nfs4_layoutget_args *args,
+ struct compound_hdr *hdr);
+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
+ struct nfs4_layoutget_res *res);
+
+/* NFSv4 COMPOUND tags are only wanted for debugging purposes */
+#ifdef DEBUG
+#define NFS4_MAXTAGLEN 20
+#else
+#define NFS4_MAXTAGLEN 0
+#endif
+
+/* lock,open owner id:
+ * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
+ */
+#define pagepad_maxsz (1)
+#define open_owner_id_maxsz (1 + 2 + 1 + 1 + 2)
+#define lock_owner_id_maxsz (1 + 1 + 4)
+#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
+#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
+#define op_encode_hdr_maxsz (1)
+#define op_decode_hdr_maxsz (2)
+#define encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define decode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define decode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define encode_putfh_maxsz (op_encode_hdr_maxsz + 1 + \
+ (NFS4_FHSIZE >> 2))
+#define decode_putfh_maxsz (op_decode_hdr_maxsz)
+#define encode_putrootfh_maxsz (op_encode_hdr_maxsz)
+#define decode_putrootfh_maxsz (op_decode_hdr_maxsz)
+#define encode_getfh_maxsz (op_encode_hdr_maxsz)
+#define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \
+ ((3+NFS4_FHSIZE) >> 2))
+#define nfs4_fattr_bitmap_maxsz 4
+#define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
+#define nfstime4_maxsz (3)
+#define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2))
+#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
+#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */
+#define nfs4_label_maxsz (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN))
+#else
+#define nfs4_label_maxsz 0
+#endif
+/* We support only one layout type per file system */
+#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
+/* This is based on getfattr, which uses the most attributes: */
+#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
+ 3*nfstime4_maxsz + \
+ nfs4_owner_maxsz + \
+ nfs4_group_maxsz + nfs4_label_maxsz + \
+ decode_mdsthreshold_maxsz))
+#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \
+ nfs4_fattr_value_maxsz)
+#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
+#define encode_attrs_maxsz (nfs4_fattr_bitmap_maxsz + \
+ 1 + 2 + 1 + \
+ nfs4_owner_maxsz + \
+ nfs4_group_maxsz + \
+ nfs4_label_maxsz + \
+ 1 + nfstime4_maxsz + \
+ 1 + nfstime4_maxsz)
+#define encode_savefh_maxsz (op_encode_hdr_maxsz)
+#define decode_savefh_maxsz (op_decode_hdr_maxsz)
+#define encode_restorefh_maxsz (op_encode_hdr_maxsz)
+#define decode_restorefh_maxsz (op_decode_hdr_maxsz)
+#define encode_fsinfo_maxsz (encode_getattr_maxsz)
+/* The 5 accounts for the PNFS attributes, and assumes that at most three
+ * layout types will be returned.
+ */
+#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \
+ nfs4_fattr_bitmap_maxsz + 1 + \
+ 1 /* lease time */ + \
+ 2 /* max filesize */ + \
+ 2 /* max read */ + \
+ 2 /* max write */ + \
+ nfstime4_maxsz /* time delta */ + \
+ 5 /* fs layout types */ + \
+ 1 /* layout blksize */ + \
+ 1 /* clone blksize */ + \
+ 1 /* change attr type */ + \
+ 1 /* xattr support */)
+#define encode_renew_maxsz (op_encode_hdr_maxsz + 3)
+#define decode_renew_maxsz (op_decode_hdr_maxsz)
+#define encode_setclientid_maxsz \
+ (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
+ /* client name */ \
+ 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+ 1 /* sc_prog */ + \
+ 1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
+ 1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \
+ 1) /* sc_cb_ident */
+#define decode_setclientid_maxsz \
+ (op_decode_hdr_maxsz + \
+ 2 /* clientid */ + \
+ XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
+ 1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
+ 1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN))
+#define encode_setclientid_confirm_maxsz \
+ (op_encode_hdr_maxsz + \
+ 3 + (NFS4_VERIFIER_SIZE >> 2))
+#define decode_setclientid_confirm_maxsz \
+ (op_decode_hdr_maxsz)
+#define encode_lookup_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz)
+#define decode_lookup_maxsz (op_decode_hdr_maxsz)
+#define encode_lookupp_maxsz (op_encode_hdr_maxsz)
+#define decode_lookupp_maxsz (op_decode_hdr_maxsz)
+#define encode_share_access_maxsz \
+ (2)
+#define encode_createmode_maxsz (1 + encode_attrs_maxsz + encode_verifier_maxsz)
+#define encode_opentype_maxsz (1 + encode_createmode_maxsz)
+#define encode_claim_null_maxsz (1 + nfs4_name_maxsz)
+#define encode_open_maxsz (op_encode_hdr_maxsz + \
+ 2 + encode_share_access_maxsz + 2 + \
+ open_owner_id_maxsz + \
+ encode_opentype_maxsz + \
+ encode_claim_null_maxsz)
+#define decode_space_limit_maxsz (3)
+#define decode_ace_maxsz (3 + nfs4_owner_maxsz)
+#define decode_delegation_maxsz (1 + decode_stateid_maxsz + 1 + \
+ decode_space_limit_maxsz + \
+ decode_ace_maxsz)
+#define decode_change_info_maxsz (5)
+#define decode_open_maxsz (op_decode_hdr_maxsz + \
+ decode_stateid_maxsz + \
+ decode_change_info_maxsz + 1 + \
+ nfs4_fattr_bitmap_maxsz + \
+ decode_delegation_maxsz)
+#define encode_open_confirm_maxsz \
+ (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + 1)
+#define decode_open_confirm_maxsz \
+ (op_decode_hdr_maxsz + \
+ decode_stateid_maxsz)
+#define encode_open_downgrade_maxsz \
+ (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + 1 + \
+ encode_share_access_maxsz)
+#define decode_open_downgrade_maxsz \
+ (op_decode_hdr_maxsz + \
+ decode_stateid_maxsz)
+#define encode_close_maxsz (op_encode_hdr_maxsz + \
+ 1 + encode_stateid_maxsz)
+#define decode_close_maxsz (op_decode_hdr_maxsz + \
+ decode_stateid_maxsz)
+#define encode_setattr_maxsz (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + \
+ encode_attrs_maxsz)
+#define decode_setattr_maxsz (op_decode_hdr_maxsz + \
+ nfs4_fattr_bitmap_maxsz)
+#define encode_read_maxsz (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + 3)
+#define decode_read_maxsz (op_decode_hdr_maxsz + 2 + pagepad_maxsz)
+#define encode_readdir_maxsz (op_encode_hdr_maxsz + \
+ 2 + encode_verifier_maxsz + 5 + \
+ nfs4_label_maxsz)
+#define decode_readdir_maxsz (op_decode_hdr_maxsz + \
+ decode_verifier_maxsz + pagepad_maxsz)
+#define encode_readlink_maxsz (op_encode_hdr_maxsz)
+#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1 + pagepad_maxsz)
+#define encode_write_maxsz (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + 4)
+#define decode_write_maxsz (op_decode_hdr_maxsz + \
+ 2 + decode_verifier_maxsz)
+#define encode_commit_maxsz (op_encode_hdr_maxsz + 3)
+#define decode_commit_maxsz (op_decode_hdr_maxsz + \
+ decode_verifier_maxsz)
+#define encode_remove_maxsz (op_encode_hdr_maxsz + \
+ nfs4_name_maxsz)
+#define decode_remove_maxsz (op_decode_hdr_maxsz + \
+ decode_change_info_maxsz)
+#define encode_rename_maxsz (op_encode_hdr_maxsz + \
+ 2 * nfs4_name_maxsz)
+#define decode_rename_maxsz (op_decode_hdr_maxsz + \
+ decode_change_info_maxsz + \
+ decode_change_info_maxsz)
+#define encode_link_maxsz (op_encode_hdr_maxsz + \
+ nfs4_name_maxsz)
+#define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz)
+#define encode_lockowner_maxsz (7)
+#define encode_lock_maxsz (op_encode_hdr_maxsz + \
+ 7 + \
+ 1 + encode_stateid_maxsz + 1 + \
+ encode_lockowner_maxsz)
+#define decode_lock_denied_maxsz \
+ (8 + decode_lockowner_maxsz)
+#define decode_lock_maxsz (op_decode_hdr_maxsz + \
+ decode_lock_denied_maxsz)
+#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \
+ encode_lockowner_maxsz)
+#define decode_lockt_maxsz (op_decode_hdr_maxsz + \
+ decode_lock_denied_maxsz)
+#define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \
+ encode_stateid_maxsz + \
+ 4)
+#define decode_locku_maxsz (op_decode_hdr_maxsz + \
+ decode_stateid_maxsz)
+#define encode_release_lockowner_maxsz \
+ (op_encode_hdr_maxsz + \
+ encode_lockowner_maxsz)
+#define decode_release_lockowner_maxsz \
+ (op_decode_hdr_maxsz)
+#define encode_access_maxsz (op_encode_hdr_maxsz + 1)
+#define decode_access_maxsz (op_decode_hdr_maxsz + 2)
+#define encode_symlink_maxsz (op_encode_hdr_maxsz + \
+ 1 + nfs4_name_maxsz + \
+ 1 + \
+ nfs4_fattr_maxsz)
+#define decode_symlink_maxsz (op_decode_hdr_maxsz + 8)
+#define encode_create_maxsz (op_encode_hdr_maxsz + \
+ 1 + 2 + nfs4_name_maxsz + \
+ encode_attrs_maxsz)
+#define decode_create_maxsz (op_decode_hdr_maxsz + \
+ decode_change_info_maxsz + \
+ nfs4_fattr_bitmap_maxsz)
+#define encode_statfs_maxsz (encode_getattr_maxsz)
+#define decode_statfs_maxsz (decode_getattr_maxsz)
+#define encode_delegreturn_maxsz (op_encode_hdr_maxsz + 4)
+#define decode_delegreturn_maxsz (op_decode_hdr_maxsz)
+#define encode_getacl_maxsz (encode_getattr_maxsz)
+#define decode_getacl_maxsz (op_decode_hdr_maxsz + \
+ nfs4_fattr_bitmap_maxsz + 1 + pagepad_maxsz)
+#define encode_setacl_maxsz (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + 3)
+#define decode_setacl_maxsz (decode_setattr_maxsz)
+#define encode_fs_locations_maxsz \
+ (encode_getattr_maxsz)
+#define decode_fs_locations_maxsz \
+ (pagepad_maxsz)
+#define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz)
+#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4))
+
+#if defined(CONFIG_NFS_V4_1)
+#define NFS4_MAX_MACHINE_NAME_LEN (64)
+#define IMPL_NAME_LIMIT (sizeof(utsname()->sysname) + sizeof(utsname()->release) + \
+ sizeof(utsname()->version) + sizeof(utsname()->machine) + 8)
+
+#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \
+ encode_verifier_maxsz + \
+ 1 /* co_ownerid.len */ + \
+ /* eia_clientowner */ \
+ 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+ 1 /* flags */ + \
+ 1 /* spa_how */ + \
+ /* max is SP4_MACH_CRED (for now) */ + \
+ 1 + NFS4_OP_MAP_NUM_WORDS + \
+ 1 + NFS4_OP_MAP_NUM_WORDS + \
+ 1 /* implementation id array of size 1 */ + \
+ 1 /* nii_domain */ + \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+ 1 /* nii_name */ + \
+ XDR_QUADLEN(IMPL_NAME_LIMIT) + \
+ 3 /* nii_date */)
+#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \
+ 2 /* eir_clientid */ + \
+ 1 /* eir_sequenceid */ + \
+ 1 /* eir_flags */ + \
+ 1 /* spr_how */ + \
+ /* max is SP4_MACH_CRED (for now) */ + \
+ 1 + NFS4_OP_MAP_NUM_WORDS + \
+ 1 + NFS4_OP_MAP_NUM_WORDS + \
+ 2 /* eir_server_owner.so_minor_id */ + \
+ /* eir_server_owner.so_major_id<> */ \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
+ /* eir_server_scope<> */ \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
+ 1 /* eir_server_impl_id array length */ + \
+ 1 /* nii_domain */ + \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+ 1 /* nii_name */ + \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+ 3 /* nii_date */)
+#define encode_channel_attrs_maxsz (6 + 1 /* ca_rdma_ird.len (0) */)
+#define decode_channel_attrs_maxsz (6 + \
+ 1 /* ca_rdma_ird.len */ + \
+ 1 /* ca_rdma_ird */)
+#define encode_create_session_maxsz (op_encode_hdr_maxsz + \
+ 2 /* csa_clientid */ + \
+ 1 /* csa_sequence */ + \
+ 1 /* csa_flags */ + \
+ encode_channel_attrs_maxsz + \
+ encode_channel_attrs_maxsz + \
+ 1 /* csa_cb_program */ + \
+ 1 /* csa_sec_parms.len (1) */ + \
+ 1 /* cb_secflavor (AUTH_SYS) */ + \
+ 1 /* stamp */ + \
+ 1 /* machinename.len */ + \
+ XDR_QUADLEN(NFS4_MAX_MACHINE_NAME_LEN) + \
+ 1 /* uid */ + \
+ 1 /* gid */ + \
+ 1 /* gids.len (0) */)
+#define decode_create_session_maxsz (op_decode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+ 1 /* csr_sequence */ + \
+ 1 /* csr_flags */ + \
+ decode_channel_attrs_maxsz + \
+ decode_channel_attrs_maxsz)
+#define encode_bind_conn_to_session_maxsz (op_encode_hdr_maxsz + \
+ /* bctsa_sessid */ \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+ 1 /* bctsa_dir */ + \
+ 1 /* bctsa_use_conn_in_rdma_mode */)
+#define decode_bind_conn_to_session_maxsz (op_decode_hdr_maxsz + \
+ /* bctsr_sessid */ \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+ 1 /* bctsr_dir */ + \
+ 1 /* bctsr_use_conn_in_rdma_mode */)
+#define encode_destroy_session_maxsz (op_encode_hdr_maxsz + 4)
+#define decode_destroy_session_maxsz (op_decode_hdr_maxsz)
+#define encode_destroy_clientid_maxsz (op_encode_hdr_maxsz + 2)
+#define decode_destroy_clientid_maxsz (op_decode_hdr_maxsz)
+#define encode_sequence_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4)
+#define decode_sequence_maxsz (op_decode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
+#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
+#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
+ 1 /* layout type */ + \
+ 1 /* maxcount */ + \
+ 1 /* bitmap size */ + \
+ 1 /* notification bitmap length */ + \
+ 1 /* notification bitmap, word 0 */)
+#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
+ 1 /* layout type */ + \
+ 1 /* opaque devaddr4 length */ + \
+ /* devaddr4 payload is read into page */ \
+ 1 /* notification bitmap length */ + \
+ 1 /* notification bitmap, word 0 */ + \
+ pagepad_maxsz /* possible XDR padding */)
+#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
+ encode_stateid_maxsz)
+#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
+ decode_stateid_maxsz + \
+ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \
+ pagepad_maxsz)
+#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \
+ 2 /* offset */ + \
+ 2 /* length */ + \
+ 1 /* reclaim */ + \
+ encode_stateid_maxsz + \
+ 1 /* new offset (true) */ + \
+ 2 /* last byte written */ + \
+ 1 /* nt_timechanged (false) */ + \
+ 1 /* layoutupdate4 layout type */ + \
+ 1 /* layoutupdate4 opaqueue len */)
+ /* the actual content of layoutupdate4 should
+ be allocated by drivers and spliced in
+ using xdr_write_pages */
+#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
+#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + \
+ 1 + \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT))
+#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
+ 1 + decode_stateid_maxsz)
+#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1)
+#define decode_secinfo_no_name_maxsz decode_secinfo_maxsz
+#define encode_test_stateid_maxsz (op_encode_hdr_maxsz + 2 + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define decode_test_stateid_maxsz (op_decode_hdr_maxsz + 2 + 1)
+#define encode_free_stateid_maxsz (op_encode_hdr_maxsz + 1 + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define decode_free_stateid_maxsz (op_decode_hdr_maxsz)
+#else /* CONFIG_NFS_V4_1 */
+#define encode_sequence_maxsz 0
+#define decode_sequence_maxsz 0
+#define encode_layoutreturn_maxsz 0
+#define decode_layoutreturn_maxsz 0
+#define encode_layoutget_maxsz 0
+#define decode_layoutget_maxsz 0
+#endif /* CONFIG_NFS_V4_1 */
+
+#define NFS4_enc_compound_sz (1024) /* XXX: large enough? */
+#define NFS4_dec_compound_sz (1024) /* XXX: large enough? */
+#define NFS4_enc_read_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_read_maxsz)
+#define NFS4_dec_read_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_read_maxsz)
+#define NFS4_enc_readlink_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_readlink_maxsz)
+#define NFS4_dec_readlink_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_readlink_maxsz)
+#define NFS4_enc_readdir_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_readdir_maxsz)
+#define NFS4_dec_readdir_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_readdir_maxsz)
+#define NFS4_enc_write_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_write_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_write_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_write_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_commit_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_commit_maxsz)
+#define NFS4_dec_commit_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_commit_maxsz)
+#define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_open_maxsz + \
+ encode_access_maxsz + \
+ encode_getfh_maxsz + \
+ encode_getattr_maxsz + \
+ encode_layoutget_maxsz)
+#define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_open_maxsz + \
+ decode_access_maxsz + \
+ decode_getfh_maxsz + \
+ decode_getattr_maxsz + \
+ decode_layoutget_maxsz)
+#define NFS4_enc_open_confirm_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_putfh_maxsz + \
+ encode_open_confirm_maxsz)
+#define NFS4_dec_open_confirm_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_putfh_maxsz + \
+ decode_open_confirm_maxsz)
+#define NFS4_enc_open_noattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_open_maxsz + \
+ encode_access_maxsz + \
+ encode_getattr_maxsz + \
+ encode_layoutget_maxsz)
+#define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_open_maxsz + \
+ decode_access_maxsz + \
+ decode_getattr_maxsz + \
+ decode_layoutget_maxsz)
+#define NFS4_enc_open_downgrade_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_layoutreturn_maxsz + \
+ encode_open_downgrade_maxsz)
+#define NFS4_dec_open_downgrade_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutreturn_maxsz + \
+ decode_open_downgrade_maxsz)
+#define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_layoutreturn_maxsz + \
+ encode_close_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutreturn_maxsz + \
+ decode_close_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_setattr_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_setattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_setattr_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_fsinfo_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_fsinfo_maxsz)
+#define NFS4_dec_fsinfo_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_fsinfo_maxsz)
+#define NFS4_enc_renew_sz (compound_encode_hdr_maxsz + \
+ encode_renew_maxsz)
+#define NFS4_dec_renew_sz (compound_decode_hdr_maxsz + \
+ decode_renew_maxsz)
+#define NFS4_enc_setclientid_sz (compound_encode_hdr_maxsz + \
+ encode_setclientid_maxsz)
+#define NFS4_dec_setclientid_sz (compound_decode_hdr_maxsz + \
+ decode_setclientid_maxsz)
+#define NFS4_enc_setclientid_confirm_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_setclientid_confirm_maxsz)
+#define NFS4_dec_setclientid_confirm_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_setclientid_confirm_maxsz)
+#define NFS4_enc_lock_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_lock_maxsz)
+#define NFS4_dec_lock_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_lock_maxsz)
+#define NFS4_enc_lockt_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_lockt_maxsz)
+#define NFS4_dec_lockt_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_lockt_maxsz)
+#define NFS4_enc_locku_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_locku_maxsz)
+#define NFS4_dec_locku_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_locku_maxsz)
+#define NFS4_enc_release_lockowner_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_lockowner_maxsz)
+#define NFS4_dec_release_lockowner_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_lockowner_maxsz)
+#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_access_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_access_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_access_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_getattr_maxsz + \
+ encode_renew_maxsz)
+#define NFS4_dec_getattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_getattr_maxsz + \
+ decode_renew_maxsz)
+#define NFS4_enc_lookup_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_lookup_maxsz + \
+ encode_getattr_maxsz + \
+ encode_getfh_maxsz)
+#define NFS4_dec_lookup_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_lookup_maxsz + \
+ decode_getattr_maxsz + \
+ decode_getfh_maxsz)
+#define NFS4_enc_lookupp_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_lookupp_maxsz + \
+ encode_getattr_maxsz + \
+ encode_getfh_maxsz)
+#define NFS4_dec_lookupp_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_lookupp_maxsz + \
+ decode_getattr_maxsz + \
+ decode_getfh_maxsz)
+#define NFS4_enc_lookup_root_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putrootfh_maxsz + \
+ encode_getattr_maxsz + \
+ encode_getfh_maxsz)
+#define NFS4_dec_lookup_root_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putrootfh_maxsz + \
+ decode_getattr_maxsz + \
+ decode_getfh_maxsz)
+#define NFS4_enc_remove_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_remove_maxsz)
+#define NFS4_dec_remove_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_remove_maxsz)
+#define NFS4_enc_rename_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_savefh_maxsz + \
+ encode_putfh_maxsz + \
+ encode_rename_maxsz)
+#define NFS4_dec_rename_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_savefh_maxsz + \
+ decode_putfh_maxsz + \
+ decode_rename_maxsz)
+#define NFS4_enc_link_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_savefh_maxsz + \
+ encode_putfh_maxsz + \
+ encode_link_maxsz + \
+ encode_restorefh_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_link_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_savefh_maxsz + \
+ decode_putfh_maxsz + \
+ decode_link_maxsz + \
+ decode_restorefh_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_symlink_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_symlink_maxsz + \
+ encode_getattr_maxsz + \
+ encode_getfh_maxsz)
+#define NFS4_dec_symlink_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_symlink_maxsz + \
+ decode_getattr_maxsz + \
+ decode_getfh_maxsz)
+#define NFS4_enc_create_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_create_maxsz + \
+ encode_getfh_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_create_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_create_maxsz + \
+ decode_getfh_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_pathconf_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_pathconf_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_statfs_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_statfs_maxsz)
+#define NFS4_dec_statfs_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_statfs_maxsz)
+#define NFS4_enc_server_caps_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_server_caps_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_layoutreturn_maxsz + \
+ encode_delegreturn_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutreturn_maxsz + \
+ decode_delegreturn_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_getacl_maxsz)
+#define NFS4_dec_getacl_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_getacl_maxsz)
+#define NFS4_enc_setacl_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_setacl_maxsz)
+#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_setacl_maxsz)
+#define NFS4_enc_fs_locations_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_lookup_maxsz + \
+ encode_fs_locations_maxsz + \
+ encode_renew_maxsz)
+#define NFS4_dec_fs_locations_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_lookup_maxsz + \
+ decode_fs_locations_maxsz + \
+ decode_renew_maxsz)
+#define NFS4_enc_secinfo_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_secinfo_maxsz)
+#define NFS4_dec_secinfo_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_secinfo_maxsz)
+#define NFS4_enc_fsid_present_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_getfh_maxsz + \
+ encode_renew_maxsz)
+#define NFS4_dec_fsid_present_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_getfh_maxsz + \
+ decode_renew_maxsz)
+#if defined(CONFIG_NFS_V4_1)
+#define NFS4_enc_bind_conn_to_session_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_bind_conn_to_session_maxsz)
+#define NFS4_dec_bind_conn_to_session_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_bind_conn_to_session_maxsz)
+#define NFS4_enc_exchange_id_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_exchange_id_maxsz)
+#define NFS4_dec_exchange_id_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_exchange_id_maxsz)
+#define NFS4_enc_create_session_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_create_session_maxsz)
+#define NFS4_dec_create_session_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_create_session_maxsz)
+#define NFS4_enc_destroy_session_sz (compound_encode_hdr_maxsz + \
+ encode_destroy_session_maxsz)
+#define NFS4_dec_destroy_session_sz (compound_decode_hdr_maxsz + \
+ decode_destroy_session_maxsz)
+#define NFS4_enc_destroy_clientid_sz (compound_encode_hdr_maxsz + \
+ encode_destroy_clientid_maxsz)
+#define NFS4_dec_destroy_clientid_sz (compound_decode_hdr_maxsz + \
+ decode_destroy_clientid_maxsz)
+#define NFS4_enc_sequence_sz \
+ (compound_decode_hdr_maxsz + \
+ encode_sequence_maxsz)
+#define NFS4_dec_sequence_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz)
+#endif
+#define NFS4_enc_get_lease_time_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putrootfh_maxsz + \
+ encode_fsinfo_maxsz)
+#define NFS4_dec_get_lease_time_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putrootfh_maxsz + \
+ decode_fsinfo_maxsz)
+#if defined(CONFIG_NFS_V4_1)
+#define NFS4_enc_reclaim_complete_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_reclaim_complete_maxsz)
+#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_reclaim_complete_maxsz)
+#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz +\
+ encode_getdeviceinfo_maxsz)
+#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_getdeviceinfo_maxsz)
+#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_layoutget_maxsz)
+#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutget_maxsz)
+#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz +\
+ encode_putfh_maxsz + \
+ encode_layoutcommit_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutcommit_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_layoutreturn_maxsz)
+#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutreturn_maxsz)
+#define NFS4_enc_secinfo_no_name_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putrootfh_maxsz +\
+ encode_secinfo_no_name_maxsz)
+#define NFS4_dec_secinfo_no_name_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putrootfh_maxsz + \
+ decode_secinfo_no_name_maxsz)
+#define NFS4_enc_test_stateid_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_test_stateid_maxsz)
+#define NFS4_dec_test_stateid_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_test_stateid_maxsz)
+#define NFS4_enc_free_stateid_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_free_stateid_maxsz)
+#define NFS4_dec_free_stateid_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_free_stateid_maxsz)
+
+const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_encode_hdr_maxsz +
+ encode_sequence_maxsz +
+ encode_putfh_maxsz +
+ encode_getattr_maxsz) *
+ XDR_UNIT);
+
+const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_decode_hdr_maxsz +
+ decode_sequence_maxsz +
+ decode_putfh_maxsz) *
+ XDR_UNIT);
+
+const u32 nfs41_maxgetdevinfo_overhead = ((RPC_MAX_REPHEADER_WITH_AUTH +
+ compound_decode_hdr_maxsz +
+ decode_sequence_maxsz) *
+ XDR_UNIT);
+EXPORT_SYMBOL_GPL(nfs41_maxgetdevinfo_overhead);
+#endif /* CONFIG_NFS_V4_1 */
+
+static const umode_t nfs_type2fmt[] = {
+ [NF4BAD] = 0,
+ [NF4REG] = S_IFREG,
+ [NF4DIR] = S_IFDIR,
+ [NF4BLK] = S_IFBLK,
+ [NF4CHR] = S_IFCHR,
+ [NF4LNK] = S_IFLNK,
+ [NF4SOCK] = S_IFSOCK,
+ [NF4FIFO] = S_IFIFO,
+ [NF4ATTRDIR] = 0,
+ [NF4NAMEDATTR] = 0,
+};
+
+struct compound_hdr {
+ int32_t status;
+ uint32_t nops;
+ __be32 * nops_p;
+ uint32_t taglen;
+ char * tag;
+ uint32_t replen; /* expected reply words */
+ u32 minorversion;
+};
+
+static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes)
+{
+ __be32 *p = xdr_reserve_space(xdr, nbytes);
+ BUG_ON(!p);
+ return p;
+}
+
+static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len)
+{
+ WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr, buf, len) < 0);
+}
+
+static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
+{
+ WARN_ON_ONCE(xdr_stream_encode_opaque(xdr, str, len) < 0);
+}
+
+static void encode_uint32(struct xdr_stream *xdr, u32 n)
+{
+ WARN_ON_ONCE(xdr_stream_encode_u32(xdr, n) < 0);
+}
+
+static void encode_uint64(struct xdr_stream *xdr, u64 n)
+{
+ WARN_ON_ONCE(xdr_stream_encode_u64(xdr, n) < 0);
+}
+
+static ssize_t xdr_encode_bitmap4(struct xdr_stream *xdr,
+ const __u32 *bitmap, size_t len)
+{
+ ssize_t ret;
+
+ /* Trim empty words */
+ while (len > 0 && bitmap[len-1] == 0)
+ len--;
+ ret = xdr_stream_encode_uint32_array(xdr, bitmap, len);
+ if (WARN_ON_ONCE(ret < 0))
+ return ret;
+ return len;
+}
+
+static size_t mask_bitmap4(const __u32 *bitmap, const __u32 *mask,
+ __u32 *res, size_t len)
+{
+ size_t i;
+ __u32 tmp;
+
+ while (len > 0 && (bitmap[len-1] == 0 || mask[len-1] == 0))
+ len--;
+ for (i = len; i-- > 0;) {
+ tmp = bitmap[i] & mask[i];
+ res[i] = tmp;
+ }
+ return len;
+}
+
+static void encode_nfs4_seqid(struct xdr_stream *xdr,
+ const struct nfs_seqid *seqid)
+{
+ if (seqid != NULL)
+ encode_uint32(xdr, seqid->sequence->counter);
+ else
+ encode_uint32(xdr, 0);
+}
+
+static void encode_compound_hdr(struct xdr_stream *xdr,
+ struct rpc_rqst *req,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ /* initialize running count of expected bytes in reply.
+ * NOTE: the replied tag SHOULD be the same is the one sent,
+ * but this is not required as a MUST for the server to do so. */
+ hdr->replen = 3 + hdr->taglen;
+
+ WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN);
+ encode_string(xdr, hdr->taglen, hdr->tag);
+ p = reserve_space(xdr, 8);
+ *p++ = cpu_to_be32(hdr->minorversion);
+ hdr->nops_p = p;
+ *p = cpu_to_be32(hdr->nops);
+}
+
+static void encode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 op,
+ uint32_t replen,
+ struct compound_hdr *hdr)
+{
+ encode_uint32(xdr, op);
+ hdr->nops++;
+ hdr->replen += replen;
+}
+
+static void encode_nops(struct compound_hdr *hdr)
+{
+ WARN_ON_ONCE(hdr->nops > NFS4_MAX_OPS);
+ *hdr->nops_p = htonl(hdr->nops);
+}
+
+static void encode_nfs4_stateid(struct xdr_stream *xdr, const nfs4_stateid *stateid)
+{
+ encode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
+}
+
+static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
+{
+ encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);
+}
+
+static __be32 *
+xdr_encode_nfstime4(__be32 *p, const struct timespec64 *t)
+{
+ p = xdr_encode_hyper(p, t->tv_sec);
+ *p++ = cpu_to_be32(t->tv_nsec);
+ return p;
+}
+
+static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
+ const struct nfs4_label *label,
+ const umode_t *umask,
+ const struct nfs_server *server,
+ const uint32_t attrmask[])
+{
+ char owner_name[IDMAP_NAMESZ];
+ char owner_group[IDMAP_NAMESZ];
+ int owner_namelen = 0;
+ int owner_grouplen = 0;
+ __be32 *p;
+ uint32_t len = 0;
+ uint32_t bmval[3] = { 0 };
+
+ /*
+ * We reserve enough space to write the entire attribute buffer at once.
+ */
+ if ((iap->ia_valid & ATTR_SIZE) && (attrmask[0] & FATTR4_WORD0_SIZE)) {
+ bmval[0] |= FATTR4_WORD0_SIZE;
+ len += 8;
+ }
+ if (iap->ia_valid & ATTR_MODE) {
+ if (umask && (attrmask[2] & FATTR4_WORD2_MODE_UMASK)) {
+ bmval[2] |= FATTR4_WORD2_MODE_UMASK;
+ len += 8;
+ } else if (attrmask[1] & FATTR4_WORD1_MODE) {
+ bmval[1] |= FATTR4_WORD1_MODE;
+ len += 4;
+ }
+ }
+ if ((iap->ia_valid & ATTR_UID) && (attrmask[1] & FATTR4_WORD1_OWNER)) {
+ owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
+ if (owner_namelen < 0) {
+ dprintk("nfs: couldn't resolve uid %d to string\n",
+ from_kuid(&init_user_ns, iap->ia_uid));
+ /* XXX */
+ strcpy(owner_name, "nobody");
+ owner_namelen = sizeof("nobody") - 1;
+ /* goto out; */
+ }
+ bmval[1] |= FATTR4_WORD1_OWNER;
+ len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
+ }
+ if ((iap->ia_valid & ATTR_GID) &&
+ (attrmask[1] & FATTR4_WORD1_OWNER_GROUP)) {
+ owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);
+ if (owner_grouplen < 0) {
+ dprintk("nfs: couldn't resolve gid %d to string\n",
+ from_kgid(&init_user_ns, iap->ia_gid));
+ strcpy(owner_group, "nobody");
+ owner_grouplen = sizeof("nobody") - 1;
+ /* goto out; */
+ }
+ bmval[1] |= FATTR4_WORD1_OWNER_GROUP;
+ len += 4 + (XDR_QUADLEN(owner_grouplen) << 2);
+ }
+ if (attrmask[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
+ if (iap->ia_valid & ATTR_ATIME_SET) {
+ bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
+ len += 4 + (nfstime4_maxsz << 2);
+ } else if (iap->ia_valid & ATTR_ATIME) {
+ bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
+ len += 4;
+ }
+ }
+ if (attrmask[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
+ if (iap->ia_valid & ATTR_MTIME_SET) {
+ bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
+ len += 4 + (nfstime4_maxsz << 2);
+ } else if (iap->ia_valid & ATTR_MTIME) {
+ bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
+ len += 4;
+ }
+ }
+
+ if (label && (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL)) {
+ len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
+ bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
+ }
+
+ xdr_encode_bitmap4(xdr, bmval, ARRAY_SIZE(bmval));
+ xdr_stream_encode_opaque_inline(xdr, (void **)&p, len);
+
+ if (bmval[0] & FATTR4_WORD0_SIZE)
+ p = xdr_encode_hyper(p, iap->ia_size);
+ if (bmval[1] & FATTR4_WORD1_MODE)
+ *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
+ if (bmval[1] & FATTR4_WORD1_OWNER)
+ p = xdr_encode_opaque(p, owner_name, owner_namelen);
+ if (bmval[1] & FATTR4_WORD1_OWNER_GROUP)
+ p = xdr_encode_opaque(p, owner_group, owner_grouplen);
+ if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
+ if (iap->ia_valid & ATTR_ATIME_SET) {
+ *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+ p = xdr_encode_nfstime4(p, &iap->ia_atime);
+ } else
+ *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
+ }
+ if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
+ if (iap->ia_valid & ATTR_MTIME_SET) {
+ *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+ p = xdr_encode_nfstime4(p, &iap->ia_mtime);
+ } else
+ *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
+ }
+ if (label && (bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) {
+ *p++ = cpu_to_be32(label->lfs);
+ *p++ = cpu_to_be32(label->pi);
+ *p++ = cpu_to_be32(label->len);
+ p = xdr_encode_opaque_fixed(p, label->label, label->len);
+ }
+ if (bmval[2] & FATTR4_WORD2_MODE_UMASK) {
+ *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
+ *p++ = cpu_to_be32(*umask);
+ }
+
+/* out: */
+}
+
+static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_ACCESS, decode_access_maxsz, hdr);
+ encode_uint32(xdr, access);
+}
+
+static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr);
+ encode_nfs4_seqid(xdr, arg->seqid);
+ encode_nfs4_stateid(xdr, &arg->stateid);
+}
+
+static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr);
+ p = reserve_space(xdr, 12);
+ p = xdr_encode_hyper(p, args->offset);
+ *p = cpu_to_be32(args->count);
+}
+
+static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_CREATE, decode_create_maxsz, hdr);
+ encode_uint32(xdr, create->ftype);
+
+ switch (create->ftype) {
+ case NF4LNK:
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(create->u.symlink.len);
+ xdr_write_pages(xdr, create->u.symlink.pages, 0,
+ create->u.symlink.len);
+ xdr->buf->flags |= XDRBUF_WRITE;
+ break;
+
+ case NF4BLK: case NF4CHR:
+ p = reserve_space(xdr, 8);
+ *p++ = cpu_to_be32(create->u.device.specdata1);
+ *p = cpu_to_be32(create->u.device.specdata2);
+ break;
+
+ default:
+ break;
+ }
+
+ encode_string(xdr, create->name->len, create->name->name);
+ encode_attrs(xdr, create->attrs, create->label, &create->umask,
+ create->server, create->server->attr_bitmask);
+}
+
+static void encode_getattr(struct xdr_stream *xdr,
+ const __u32 *bitmap, const __u32 *mask, size_t len,
+ struct compound_hdr *hdr)
+{
+ __u32 masked_bitmap[nfs4_fattr_bitmap_maxsz];
+
+ encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);
+ if (mask) {
+ if (WARN_ON_ONCE(len > ARRAY_SIZE(masked_bitmap)))
+ len = ARRAY_SIZE(masked_bitmap);
+ len = mask_bitmap4(bitmap, mask, masked_bitmap, len);
+ bitmap = masked_bitmap;
+ }
+ xdr_encode_bitmap4(xdr, bitmap, len);
+}
+
+static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+{
+ encode_getattr(xdr, nfs4_fattr_bitmap, bitmask,
+ ARRAY_SIZE(nfs4_fattr_bitmap), hdr);
+}
+
+static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask,
+ const u32 *open_bitmap,
+ struct compound_hdr *hdr)
+{
+ encode_getattr(xdr, open_bitmap, bitmask, 3, hdr);
+}
+
+static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+{
+ encode_getattr(xdr, nfs4_fsinfo_bitmap, bitmask,
+ ARRAY_SIZE(nfs4_fsinfo_bitmap), hdr);
+}
+
+static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+{
+ encode_getattr(xdr, nfs4_fs_locations_bitmap, bitmask,
+ ARRAY_SIZE(nfs4_fs_locations_bitmap), hdr);
+}
+
+static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_GETFH, decode_getfh_maxsz, hdr);
+}
+
+static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_LINK, decode_link_maxsz, hdr);
+ encode_string(xdr, name->len, name->name);
+}
+
+static inline int nfs4_lock_type(struct file_lock *fl, int block)
+{
+ if (fl->fl_type == F_RDLCK)
+ return block ? NFS4_READW_LT : NFS4_READ_LT;
+ return block ? NFS4_WRITEW_LT : NFS4_WRITE_LT;
+}
+
+static inline uint64_t nfs4_lock_length(struct file_lock *fl)
+{
+ if (fl->fl_end == OFFSET_MAX)
+ return ~(uint64_t)0;
+ return fl->fl_end - fl->fl_start + 1;
+}
+
+static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, 32);
+ p = xdr_encode_hyper(p, lowner->clientid);
+ *p++ = cpu_to_be32(20);
+ p = xdr_encode_opaque_fixed(p, "lock id:", 8);
+ *p++ = cpu_to_be32(lowner->s_dev);
+ xdr_encode_hyper(p, lowner->id);
+}
+
+/*
+ * opcode,type,reclaim,offset,length,new_lock_owner = 32
+ * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40
+ */
+static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args, struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_LOCK, decode_lock_maxsz, hdr);
+ p = reserve_space(xdr, 28);
+ *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
+ *p++ = cpu_to_be32(args->reclaim);
+ p = xdr_encode_hyper(p, args->fl->fl_start);
+ p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+ *p = cpu_to_be32(args->new_lock_owner);
+ if (args->new_lock_owner){
+ encode_nfs4_seqid(xdr, args->open_seqid);
+ encode_nfs4_stateid(xdr, &args->open_stateid);
+ encode_nfs4_seqid(xdr, args->lock_seqid);
+ encode_lockowner(xdr, &args->lock_owner);
+ }
+ else {
+ encode_nfs4_stateid(xdr, &args->lock_stateid);
+ encode_nfs4_seqid(xdr, args->lock_seqid);
+ }
+}
+
+static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_LOCKT, decode_lockt_maxsz, hdr);
+ p = reserve_space(xdr, 20);
+ *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
+ p = xdr_encode_hyper(p, args->fl->fl_start);
+ p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+ encode_lockowner(xdr, &args->lock_owner);
+}
+
+static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr);
+ encode_uint32(xdr, nfs4_lock_type(args->fl, 0));
+ encode_nfs4_seqid(xdr, args->seqid);
+ encode_nfs4_stateid(xdr, &args->stateid);
+ p = reserve_space(xdr, 16);
+ p = xdr_encode_hyper(p, args->fl->fl_start);
+ xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+}
+
+static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_RELEASE_LOCKOWNER, decode_release_lockowner_maxsz, hdr);
+ encode_lockowner(xdr, lowner);
+}
+
+static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_LOOKUP, decode_lookup_maxsz, hdr);
+ encode_string(xdr, name->len, name->name);
+}
+
+static void encode_lookupp(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_LOOKUPP, decode_lookupp_maxsz, hdr);
+}
+
+static void encode_share_access(struct xdr_stream *xdr, u32 share_access)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, 8);
+ *p++ = cpu_to_be32(share_access);
+ *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */
+}
+
+static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
+{
+ __be32 *p;
+ /*
+ * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
+ * owner 4 = 32
+ */
+ encode_nfs4_seqid(xdr, arg->seqid);
+ encode_share_access(xdr, arg->share_access);
+ p = reserve_space(xdr, 36);
+ p = xdr_encode_hyper(p, arg->clientid);
+ *p++ = cpu_to_be32(24);
+ p = xdr_encode_opaque_fixed(p, "open id:", 8);
+ *p++ = cpu_to_be32(arg->server->s_dev);
+ *p++ = cpu_to_be32(arg->id.uniquifier);
+ xdr_encode_hyper(p, arg->id.create_time);
+}
+
+static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, 4);
+ switch(arg->createmode) {
+ case NFS4_CREATE_UNCHECKED:
+ *p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
+ encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask,
+ arg->server, arg->server->attr_bitmask);
+ break;
+ case NFS4_CREATE_GUARDED:
+ *p = cpu_to_be32(NFS4_CREATE_GUARDED);
+ encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask,
+ arg->server, arg->server->attr_bitmask);
+ break;
+ case NFS4_CREATE_EXCLUSIVE:
+ *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
+ encode_nfs4_verifier(xdr, &arg->u.verifier);
+ break;
+ case NFS4_CREATE_EXCLUSIVE4_1:
+ *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
+ encode_nfs4_verifier(xdr, &arg->u.verifier);
+ encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask,
+ arg->server, arg->server->exclcreat_bitmask);
+ }
+}
+
+static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *arg)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, 4);
+ switch (arg->open_flags & O_CREAT) {
+ case 0:
+ *p = cpu_to_be32(NFS4_OPEN_NOCREATE);
+ break;
+ default:
+ *p = cpu_to_be32(NFS4_OPEN_CREATE);
+ encode_createmode(xdr, arg);
+ }
+}
+
+static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delegation_type)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, 4);
+ switch (delegation_type) {
+ case 0:
+ *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
+ break;
+ case FMODE_READ:
+ *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
+ break;
+ case FMODE_WRITE|FMODE_READ:
+ *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *name)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
+ encode_string(xdr, name->len, name->name);
+}
+
+static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
+ encode_delegation_type(xdr, type);
+}
+
+static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struct qstr *name, const nfs4_stateid *stateid)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
+ encode_nfs4_stateid(xdr, stateid);
+ encode_string(xdr, name->len, name->name);
+}
+
+static inline void encode_claim_fh(struct xdr_stream *xdr)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(NFS4_OPEN_CLAIM_FH);
+}
+
+static inline void encode_claim_delegate_cur_fh(struct xdr_stream *xdr, const nfs4_stateid *stateid)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEG_CUR_FH);
+ encode_nfs4_stateid(xdr, stateid);
+}
+
+static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_OPEN, decode_open_maxsz, hdr);
+ encode_openhdr(xdr, arg);
+ encode_opentype(xdr, arg);
+ switch (arg->claim) {
+ case NFS4_OPEN_CLAIM_NULL:
+ encode_claim_null(xdr, arg->name);
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ encode_claim_previous(xdr, arg->u.delegation_type);
+ break;
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation);
+ break;
+ case NFS4_OPEN_CLAIM_FH:
+ encode_claim_fh(xdr);
+ break;
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ encode_claim_delegate_cur_fh(xdr, &arg->u.delegation);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_OPEN_CONFIRM, decode_open_confirm_maxsz, hdr);
+ encode_nfs4_stateid(xdr, arg->stateid);
+ encode_nfs4_seqid(xdr, arg->seqid);
+}
+
+static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &arg->stateid);
+ encode_nfs4_seqid(xdr, arg->seqid);
+ encode_share_access(xdr, arg->share_access);
+}
+
+static void
+encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_PUTFH, decode_putfh_maxsz, hdr);
+ encode_string(xdr, fh->size, fh->data);
+}
+
+static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr);
+}
+
+static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_READ, decode_read_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->stateid);
+
+ p = reserve_space(xdr, 12);
+ p = xdr_encode_hyper(p, args->offset);
+ *p = cpu_to_be32(args->count);
+}
+
+static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
+{
+ uint32_t attrs[3] = {
+ FATTR4_WORD0_RDATTR_ERROR,
+ FATTR4_WORD1_MOUNTED_ON_FILEID,
+ };
+ uint32_t dircount = readdir->count;
+ uint32_t maxcount = readdir->count;
+ __be32 *p, verf[2];
+ uint32_t attrlen = 0;
+ unsigned int i;
+
+ if (readdir->plus) {
+ attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
+ FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE|FATTR4_WORD0_FILEID;
+ attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER|
+ FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
+ FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
+ FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+ attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
+ }
+ /* Use mounted_on_fileid only if the server supports it */
+ if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID))
+ attrs[0] |= FATTR4_WORD0_FILEID;
+ for (i = 0; i < ARRAY_SIZE(attrs); i++) {
+ attrs[i] &= readdir->bitmask[i];
+ if (attrs[i] != 0)
+ attrlen = i+1;
+ }
+
+ encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
+ encode_uint64(xdr, readdir->cookie);
+ encode_nfs4_verifier(xdr, &readdir->verifier);
+ p = reserve_space(xdr, 12 + (attrlen << 2));
+ *p++ = cpu_to_be32(dircount);
+ *p++ = cpu_to_be32(maxcount);
+ *p++ = cpu_to_be32(attrlen);
+ for (i = 0; i < attrlen; i++)
+ *p++ = cpu_to_be32(attrs[i]);
+ memcpy(verf, readdir->verifier.data, sizeof(verf));
+
+ dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n",
+ __func__,
+ (unsigned long long)readdir->cookie,
+ verf[0], verf[1],
+ attrs[0] & readdir->bitmask[0],
+ attrs[1] & readdir->bitmask[1],
+ attrs[2] & readdir->bitmask[2]);
+}
+
+static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_READLINK, decode_readlink_maxsz, hdr);
+}
+
+static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_REMOVE, decode_remove_maxsz, hdr);
+ encode_string(xdr, name->len, name->name);
+}
+
+static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_RENAME, decode_rename_maxsz, hdr);
+ encode_string(xdr, oldname->len, oldname->name);
+ encode_string(xdr, newname->len, newname->name);
+}
+
+static void encode_renew(struct xdr_stream *xdr, clientid4 clid,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_RENEW, decode_renew_maxsz, hdr);
+ encode_uint64(xdr, clid);
+}
+
+static void
+encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr);
+}
+
+static void nfs4_acltype_to_bitmap(enum nfs4_acl_type type, __u32 bitmap[2])
+{
+ switch (type) {
+ default:
+ bitmap[0] = FATTR4_WORD0_ACL;
+ bitmap[1] = 0;
+ break;
+ case NFS4ACL_DACL:
+ bitmap[0] = 0;
+ bitmap[1] = FATTR4_WORD1_DACL;
+ break;
+ case NFS4ACL_SACL:
+ bitmap[0] = 0;
+ bitmap[1] = FATTR4_WORD1_SACL;
+ }
+}
+
+static void encode_setacl(struct xdr_stream *xdr,
+ const struct nfs_setaclargs *arg,
+ struct compound_hdr *hdr)
+{
+ __u32 bitmap[2];
+
+ nfs4_acltype_to_bitmap(arg->acl_type, bitmap);
+
+ encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &zero_stateid);
+ xdr_encode_bitmap4(xdr, bitmap, ARRAY_SIZE(bitmap));
+ encode_uint32(xdr, arg->acl_len);
+ xdr_write_pages(xdr, arg->acl_pages, 0, arg->acl_len);
+}
+
+static void
+encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_SAVEFH, decode_savefh_maxsz, hdr);
+}
+
+static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &arg->stateid);
+ encode_attrs(xdr, arg->iap, arg->label, NULL, server,
+ server->attr_bitmask);
+}
+
+static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr);
+ encode_nfs4_verifier(xdr, setclientid->sc_verifier);
+
+ encode_string(xdr, strlen(setclientid->sc_clnt->cl_owner_id),
+ setclientid->sc_clnt->cl_owner_id);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(setclientid->sc_prog);
+ encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
+ encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(setclientid->sc_clnt->cl_cb_ident);
+}
+
+static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM,
+ decode_setclientid_confirm_maxsz, hdr);
+ encode_uint64(xdr, arg->clientid);
+ encode_nfs4_verifier(xdr, &arg->confirm);
+}
+
+static void encode_write(struct xdr_stream *xdr, const struct nfs_pgio_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_WRITE, decode_write_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->stateid);
+
+ p = reserve_space(xdr, 16);
+ p = xdr_encode_hyper(p, args->offset);
+ *p++ = cpu_to_be32(args->stable);
+ *p = cpu_to_be32(args->count);
+
+ xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
+}
+
+static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_DELEGRETURN, decode_delegreturn_maxsz, hdr);
+ encode_nfs4_stateid(xdr, stateid);
+}
+
+static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_SECINFO, decode_secinfo_maxsz, hdr);
+ encode_string(xdr, name->len, name->name);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/* NFSv4.1 operations */
+static void encode_bind_conn_to_session(struct xdr_stream *xdr,
+ const struct nfs41_bind_conn_to_session_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION,
+ decode_bind_conn_to_session_maxsz, hdr);
+ encode_opaque_fixed(xdr, args->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+ p = xdr_reserve_space(xdr, 8);
+ *p++ = cpu_to_be32(args->dir);
+ *p = (args->use_conn_in_rdma_mode) ? cpu_to_be32(1) : cpu_to_be32(0);
+}
+
+static void encode_op_map(struct xdr_stream *xdr, const struct nfs4_op_map *op_map)
+{
+ unsigned int i;
+ encode_uint32(xdr, NFS4_OP_MAP_NUM_WORDS);
+ for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++)
+ encode_uint32(xdr, op_map->u.words[i]);
+}
+
+static void encode_exchange_id(struct xdr_stream *xdr,
+ const struct nfs41_exchange_id_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+ char impl_name[IMPL_NAME_LIMIT];
+ int len = 0;
+
+ encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr);
+ encode_nfs4_verifier(xdr, &args->verifier);
+
+ encode_string(xdr, strlen(args->client->cl_owner_id),
+ args->client->cl_owner_id);
+
+ encode_uint32(xdr, args->flags);
+ encode_uint32(xdr, args->state_protect.how);
+
+ switch (args->state_protect.how) {
+ case SP4_NONE:
+ break;
+ case SP4_MACH_CRED:
+ encode_op_map(xdr, &args->state_protect.enforce);
+ encode_op_map(xdr, &args->state_protect.allow);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ if (send_implementation_id &&
+ sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 &&
+ sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN)
+ <= sizeof(impl_name) + 1)
+ len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s",
+ utsname()->sysname, utsname()->release,
+ utsname()->version, utsname()->machine);
+
+ if (len > 0) {
+ encode_uint32(xdr, 1); /* implementation id array length=1 */
+
+ encode_string(xdr,
+ sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - 1,
+ CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN);
+ encode_string(xdr, len, impl_name);
+ /* just send zeros for nii_date - the date is in nii_name */
+ p = reserve_space(xdr, 12);
+ p = xdr_encode_hyper(p, 0);
+ *p = cpu_to_be32(0);
+ } else
+ encode_uint32(xdr, 0); /* implementation id array length=0 */
+}
+
+static void encode_create_session(struct xdr_stream *xdr,
+ const struct nfs41_create_session_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+ struct nfs_client *clp = args->client;
+ struct rpc_clnt *clnt = clp->cl_rpcclient;
+ struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+ u32 max_resp_sz_cached;
+
+ /*
+ * Assumes OPEN is the biggest non-idempotent compound.
+ * 2 is the verifier.
+ */
+ max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + 2)
+ * XDR_UNIT + RPC_MAX_AUTH_SIZE;
+
+ encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr);
+ p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12);
+ p = xdr_encode_hyper(p, args->clientid);
+ *p++ = cpu_to_be32(args->seqid); /*Sequence id */
+ *p++ = cpu_to_be32(args->flags); /*flags */
+
+ /* Fore Channel */
+ *p++ = cpu_to_be32(0); /* header padding size */
+ *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */
+ *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */
+ *p++ = cpu_to_be32(max_resp_sz_cached); /* Max resp sz cached */
+ *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */
+ *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */
+ *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
+
+ /* Back Channel */
+ *p++ = cpu_to_be32(0); /* header padding size */
+ *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */
+ *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */
+ *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */
+ *p++ = cpu_to_be32(args->bc_attrs.max_ops); /* max operations */
+ *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */
+ *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
+
+ *p++ = cpu_to_be32(args->cb_program); /* cb_program */
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */
+
+ /* authsys_parms rfc1831 */
+ *p++ = cpu_to_be32(ktime_to_ns(nn->boot_time)); /* stamp */
+ p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
+ *p++ = cpu_to_be32(0); /* UID */
+ *p++ = cpu_to_be32(0); /* GID */
+ *p = cpu_to_be32(0); /* No more gids */
+}
+
+static void encode_destroy_session(struct xdr_stream *xdr,
+ const struct nfs4_session *session,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_DESTROY_SESSION, decode_destroy_session_maxsz, hdr);
+ encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+}
+
+static void encode_destroy_clientid(struct xdr_stream *xdr,
+ uint64_t clientid,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_DESTROY_CLIENTID, decode_destroy_clientid_maxsz, hdr);
+ encode_uint64(xdr, clientid);
+}
+
+static void encode_reclaim_complete(struct xdr_stream *xdr,
+ const struct nfs41_reclaim_complete_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_RECLAIM_COMPLETE, decode_reclaim_complete_maxsz, hdr);
+ encode_uint32(xdr, args->one_fs);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+static void encode_sequence(struct xdr_stream *xdr,
+ const struct nfs4_sequence_args *args,
+ struct compound_hdr *hdr)
+{
+#if defined(CONFIG_NFS_V4_1)
+ struct nfs4_session *session;
+ struct nfs4_slot_table *tp;
+ struct nfs4_slot *slot = args->sa_slot;
+ __be32 *p;
+
+ tp = slot->table;
+ session = tp->session;
+ if (!session)
+ return;
+
+ encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr);
+
+ /*
+ * Sessionid + seqid + slotid + max slotid + cache_this
+ */
+ dprintk("%s: sessionid=%u:%u:%u:%u seqid=%d slotid=%d "
+ "max_slotid=%d cache_this=%d\n",
+ __func__,
+ ((u32 *)session->sess_id.data)[0],
+ ((u32 *)session->sess_id.data)[1],
+ ((u32 *)session->sess_id.data)[2],
+ ((u32 *)session->sess_id.data)[3],
+ slot->seq_nr, slot->slot_nr,
+ tp->highest_used_slotid, args->sa_cache_this);
+ p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16);
+ p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+ *p++ = cpu_to_be32(slot->seq_nr);
+ *p++ = cpu_to_be32(slot->slot_nr);
+ *p++ = cpu_to_be32(tp->highest_used_slotid);
+ *p = cpu_to_be32(args->sa_cache_this);
+#endif /* CONFIG_NFS_V4_1 */
+}
+
+#ifdef CONFIG_NFS_V4_1
+static void
+encode_getdeviceinfo(struct xdr_stream *xdr,
+ const struct nfs4_getdeviceinfo_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr);
+ p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4);
+ p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
+ NFS4_DEVICEID4_SIZE);
+ *p++ = cpu_to_be32(args->pdev->layout_type);
+ *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */
+
+ p = reserve_space(xdr, 4 + 4);
+ *p++ = cpu_to_be32(1); /* bitmap length */
+ *p++ = cpu_to_be32(args->notify_types);
+}
+
+static void
+encode_layoutget(struct xdr_stream *xdr,
+ const struct nfs4_layoutget_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_LAYOUTGET, decode_layoutget_maxsz, hdr);
+ p = reserve_space(xdr, 36);
+ *p++ = cpu_to_be32(0); /* Signal layout available */
+ *p++ = cpu_to_be32(args->type);
+ *p++ = cpu_to_be32(args->range.iomode);
+ p = xdr_encode_hyper(p, args->range.offset);
+ p = xdr_encode_hyper(p, args->range.length);
+ p = xdr_encode_hyper(p, args->minlength);
+ encode_nfs4_stateid(xdr, &args->stateid);
+ encode_uint32(xdr, args->maxcount);
+
+ dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
+ __func__,
+ args->type,
+ args->range.iomode,
+ (unsigned long)args->range.offset,
+ (unsigned long)args->range.length,
+ args->maxcount);
+}
+
+static int
+encode_layoutcommit(struct xdr_stream *xdr,
+ struct inode *inode,
+ const struct nfs4_layoutcommit_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
+ NFS_SERVER(args->inode)->pnfs_curr_ld->id);
+
+ encode_op_hdr(xdr, OP_LAYOUTCOMMIT, decode_layoutcommit_maxsz, hdr);
+ p = reserve_space(xdr, 20);
+ /* Only whole file layouts */
+ p = xdr_encode_hyper(p, 0); /* offset */
+ p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */
+ *p = cpu_to_be32(0); /* reclaim */
+ encode_nfs4_stateid(xdr, &args->stateid);
+ if (args->lastbytewritten != U64_MAX) {
+ p = reserve_space(xdr, 20);
+ *p++ = cpu_to_be32(1); /* newoffset = TRUE */
+ p = xdr_encode_hyper(p, args->lastbytewritten);
+ } else {
+ p = reserve_space(xdr, 12);
+ *p++ = cpu_to_be32(0); /* newoffset = FALSE */
+ }
+ *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
+ *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
+
+ encode_uint32(xdr, args->layoutupdate_len);
+ if (args->layoutupdate_pages)
+ xdr_write_pages(xdr, args->layoutupdate_pages, 0,
+ args->layoutupdate_len);
+
+ return 0;
+}
+
+static void
+encode_layoutreturn(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr);
+ p = reserve_space(xdr, 16);
+ *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */
+ *p++ = cpu_to_be32(args->layout_type);
+ *p++ = cpu_to_be32(args->range.iomode);
+ *p = cpu_to_be32(RETURN_FILE);
+ p = reserve_space(xdr, 16);
+ p = xdr_encode_hyper(p, args->range.offset);
+ p = xdr_encode_hyper(p, args->range.length);
+ spin_lock(&args->inode->i_lock);
+ encode_nfs4_stateid(xdr, &args->stateid);
+ spin_unlock(&args->inode->i_lock);
+ if (args->ld_private->ops && args->ld_private->ops->encode)
+ args->ld_private->ops->encode(xdr, args, args->ld_private);
+ else
+ encode_uint32(xdr, 0);
+}
+
+static int
+encode_secinfo_no_name(struct xdr_stream *xdr,
+ const struct nfs41_secinfo_no_name_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_SECINFO_NO_NAME, decode_secinfo_no_name_maxsz, hdr);
+ encode_uint32(xdr, args->style);
+ return 0;
+}
+
+static void encode_test_stateid(struct xdr_stream *xdr,
+ const struct nfs41_test_stateid_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_TEST_STATEID, decode_test_stateid_maxsz, hdr);
+ encode_uint32(xdr, 1);
+ encode_nfs4_stateid(xdr, args->stateid);
+}
+
+static void encode_free_stateid(struct xdr_stream *xdr,
+ const struct nfs41_free_stateid_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->stateid);
+}
+#else
+static inline void
+encode_layoutreturn(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ struct compound_hdr *hdr)
+{
+}
+
+static void
+encode_layoutget(struct xdr_stream *xdr,
+ const struct nfs4_layoutget_args *args,
+ struct compound_hdr *hdr)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * END OF "GENERIC" ENCODE ROUTINES.
+ */
+
+static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
+{
+#if defined(CONFIG_NFS_V4_1)
+ struct nfs4_session *session = args->sa_slot->table->session;
+ if (session)
+ return session->clp->cl_mvops->minor_version;
+#endif /* CONFIG_NFS_V4_1 */
+ return 0;
+}
+
+/*
+ * Encode an ACCESS request
+ */
+static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_accessargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_access(xdr, args->access, &hdr);
+ if (args->bitmask)
+ encode_getfattr(xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode LOOKUP request
+ */
+static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_lookup_arg *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->dir_fh, &hdr);
+ encode_lookup(xdr, args->name, &hdr);
+ encode_getfh(xdr, &hdr);
+ encode_getfattr(xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode LOOKUPP request
+ */
+static void nfs4_xdr_enc_lookupp(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_lookupp_arg *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_lookupp(xdr, &hdr);
+ encode_getfh(xdr, &hdr);
+ encode_getfattr(xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode LOOKUP_ROOT request
+ */
+static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_lookup_root_arg *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putrootfh(xdr, &hdr);
+ encode_getfh(xdr, &hdr);
+ encode_getfattr(xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode REMOVE request
+ */
+static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_removeargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_remove(xdr, &args->name, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode RENAME request
+ */
+static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_renameargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->old_dir, &hdr);
+ encode_savefh(xdr, &hdr);
+ encode_putfh(xdr, args->new_dir, &hdr);
+ encode_rename(xdr, args->old_name, args->new_name, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode LINK request
+ */
+static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_link_arg *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_savefh(xdr, &hdr);
+ encode_putfh(xdr, args->dir_fh, &hdr);
+ encode_link(xdr, args->name, &hdr);
+ encode_restorefh(xdr, &hdr);
+ encode_getfattr(xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode CREATE request
+ */
+static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_create_arg *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->dir_fh, &hdr);
+ encode_create(xdr, args, &hdr);
+ encode_getfh(xdr, &hdr);
+ encode_getfattr(xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode SYMLINK request
+ */
+static void nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_create_arg *args = data;
+
+ nfs4_xdr_enc_create(req, xdr, args);
+}
+
+/*
+ * Encode GETATTR request
+ */
+static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_getattr_arg *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_getfattr(xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode a CLOSE request
+ */
+static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_closeargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ if (args->lr_args)
+ encode_layoutreturn(xdr, args->lr_args, &hdr);
+ if (args->bitmask != NULL)
+ encode_getfattr(xdr, args->bitmask, &hdr);
+ encode_close(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode an OPEN request
+ */
+static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_openargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_open(xdr, args, &hdr);
+ encode_getfh(xdr, &hdr);
+ if (args->access)
+ encode_access(xdr, args->access, &hdr);
+ encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
+ if (args->lg_args) {
+ encode_layoutget(xdr, args->lg_args, &hdr);
+ rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0,
+ args->lg_args->layout.pglen,
+ hdr.replen - pagepad_maxsz);
+ }
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode an OPEN_CONFIRM request
+ */
+static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_open_confirmargs *args = data;
+ struct compound_hdr hdr = {
+ .nops = 0,
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_open_confirm(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode an OPEN request with no attributes.
+ */
+static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_openargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_open(xdr, args, &hdr);
+ if (args->access)
+ encode_access(xdr, args->access, &hdr);
+ encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);
+ if (args->lg_args) {
+ encode_layoutget(xdr, args->lg_args, &hdr);
+ rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0,
+ args->lg_args->layout.pglen,
+ hdr.replen - pagepad_maxsz);
+ }
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode an OPEN_DOWNGRADE request
+ */
+static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_closeargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ if (args->lr_args)
+ encode_layoutreturn(xdr, args->lr_args, &hdr);
+ encode_open_downgrade(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode a LOCK request
+ */
+static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_lock_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_lock(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode a LOCKT request
+ */
+static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_lockt_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_lockt(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode a LOCKU request
+ */
+static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_locku_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_locku(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_release_lockowner_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = 0,
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_release_lockowner(xdr, &args->lock_owner, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode a READLINK request
+ */
+static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_readlink *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_readlink(xdr, args, req, &hdr);
+
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->pglen, hdr.replen - pagepad_maxsz);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode a READDIR request
+ */
+static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_readdir_arg *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_readdir(xdr, args, req, &hdr);
+
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, hdr.replen - pagepad_maxsz);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode a READ request
+ */
+static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_pgio_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_read(xdr, args, &hdr);
+
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, hdr.replen - pagepad_maxsz);
+ req->rq_rcv_buf.flags |= XDRBUF_READ;
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode an SETATTR request
+ */
+static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_setattrargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_setattr(xdr, args, args->server, &hdr);
+ encode_getfattr(xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode a GETACL request
+ */
+static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_getaclargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+ __u32 nfs4_acl_bitmap[2];
+ uint32_t replen;
+
+ nfs4_acltype_to_bitmap(args->acl_type, nfs4_acl_bitmap);
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ replen = hdr.replen + op_decode_hdr_maxsz;
+ encode_getattr(xdr, nfs4_acl_bitmap, NULL,
+ ARRAY_SIZE(nfs4_acl_bitmap), &hdr);
+
+ rpc_prepare_reply_pages(req, args->acl_pages, 0,
+ args->acl_len, replen);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode a WRITE request
+ */
+static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_pgio_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_write(xdr, args, &hdr);
+ req->rq_snd_buf.flags |= XDRBUF_WRITE;
+ if (args->bitmask)
+ encode_getfattr(xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * a COMMIT request
+ */
+static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_commitargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_commit(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * FSINFO request
+ */
+static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_fsinfo_arg *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_fsinfo(xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * a PATHCONF request
+ */
+static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_pathconf_arg *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_getattr(xdr, nfs4_pathconf_bitmap, args->bitmask,
+ ARRAY_SIZE(nfs4_pathconf_bitmap), &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * a STATFS request
+ */
+static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_statfs_arg *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_getattr(xdr, nfs4_statfs_bitmap, args->bitmask,
+ ARRAY_SIZE(nfs4_statfs_bitmap), &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * GETATTR_BITMAP request
+ */
+static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_server_caps_arg *args = data;
+ const u32 *bitmask = args->bitmask;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fhandle, &hdr);
+ encode_getattr(xdr, bitmask, NULL, 3, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * a RENEW request
+ */
+static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+
+{
+ const struct nfs_client *clp = data;
+ struct compound_hdr hdr = {
+ .nops = 0,
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_renew(xdr, clp->cl_clientid, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * a SETCLIENTID request
+ */
+static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_setclientid *sc = data;
+ struct compound_hdr hdr = {
+ .nops = 0,
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_setclientid(xdr, sc, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * a SETCLIENTID_CONFIRM request
+ */
+static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_setclientid_res *arg = data;
+ struct compound_hdr hdr = {
+ .nops = 0,
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_setclientid_confirm(xdr, arg, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * DELEGRETURN request
+ */
+static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_delegreturnargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fhandle, &hdr);
+ if (args->lr_args)
+ encode_layoutreturn(xdr, args->lr_args, &hdr);
+ if (args->bitmask)
+ encode_getfattr(xdr, args->bitmask, &hdr);
+ encode_delegreturn(xdr, args->stateid, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode FS_LOCATIONS request
+ */
+static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_fs_locations_arg *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+ uint32_t replen;
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ if (args->migration) {
+ encode_putfh(xdr, args->fh, &hdr);
+ replen = hdr.replen;
+ encode_fs_locations(xdr, args->bitmask, &hdr);
+ if (args->renew)
+ encode_renew(xdr, args->clientid, &hdr);
+ } else {
+ encode_putfh(xdr, args->dir_fh, &hdr);
+ encode_lookup(xdr, args->name, &hdr);
+ replen = hdr.replen;
+ encode_fs_locations(xdr, args->bitmask, &hdr);
+ }
+
+ rpc_prepare_reply_pages(req, (struct page **)&args->page, 0,
+ PAGE_SIZE, replen);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode SECINFO request
+ */
+static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_secinfo_arg *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->dir_fh, &hdr);
+ encode_secinfo(xdr, args->name, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode FSID_PRESENT request
+ */
+static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_fsid_present_arg *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_getfh(xdr, &hdr);
+ if (args->renew)
+ encode_renew(xdr, args->clientid, &hdr);
+ encode_nops(&hdr);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * BIND_CONN_TO_SESSION request
+ */
+static void nfs4_xdr_enc_bind_conn_to_session(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs41_bind_conn_to_session_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = args->client->cl_mvops->minor_version,
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_bind_conn_to_session(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * EXCHANGE_ID request
+ */
+static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs41_exchange_id_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = args->client->cl_mvops->minor_version,
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_exchange_id(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * a CREATE_SESSION request
+ */
+static void nfs4_xdr_enc_create_session(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs41_create_session_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = args->client->cl_mvops->minor_version,
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_create_session(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * a DESTROY_SESSION request
+ */
+static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_session *session = data;
+ struct compound_hdr hdr = {
+ .minorversion = session->clp->cl_mvops->minor_version,
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_destroy_session(xdr, session, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * a DESTROY_CLIENTID request
+ */
+static void nfs4_xdr_enc_destroy_clientid(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_client *clp = data;
+ struct compound_hdr hdr = {
+ .minorversion = clp->cl_mvops->minor_version,
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_destroy_clientid(xdr, clp->cl_clientid, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * a SEQUENCE request
+ */
+static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_sequence_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+#endif
+
+/*
+ * a GET_LEASE_TIME request
+ */
+static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_get_lease_time_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
+ };
+ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->la_seq_args, &hdr);
+ encode_putrootfh(xdr, &hdr);
+ encode_fsinfo(xdr, lease_bitmap, &hdr);
+ encode_nops(&hdr);
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * a RECLAIM_COMPLETE request
+ */
+static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs41_reclaim_complete_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args)
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_reclaim_complete(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode GETDEVICEINFO request
+ */
+static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_getdeviceinfo_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+ uint32_t replen;
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+
+ replen = hdr.replen + op_decode_hdr_maxsz + 2;
+
+ encode_getdeviceinfo(xdr, args, &hdr);
+
+ /* set up reply kvec. device_addr4 opaque data is read into the
+ * pages */
+ rpc_prepare_reply_pages(req, args->pdev->pages, args->pdev->pgbase,
+ args->pdev->pglen, replen);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode LAYOUTGET request
+ */
+static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_layoutget_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+ encode_layoutget(xdr, args, &hdr);
+
+ rpc_prepare_reply_pages(req, args->layout.pages, 0,
+ args->layout.pglen, hdr.replen - pagepad_maxsz);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode LAYOUTCOMMIT request
+ */
+static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *priv)
+{
+ const struct nfs4_layoutcommit_args *args = priv;
+ struct nfs4_layoutcommit_data *data =
+ container_of(args, struct nfs4_layoutcommit_data, args);
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+ encode_layoutcommit(xdr, data->args.inode, args, &hdr);
+ encode_getfattr(xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode LAYOUTRETURN request
+ */
+static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs4_layoutreturn_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+ encode_layoutreturn(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode SECINFO_NO_NAME request
+ */
+static void nfs4_xdr_enc_secinfo_no_name(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs41_secinfo_no_name_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putrootfh(xdr, &hdr);
+ encode_secinfo_no_name(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode TEST_STATEID request
+ */
+static void nfs4_xdr_enc_test_stateid(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs41_test_stateid_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_test_stateid(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode FREE_STATEID request
+ */
+static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs41_free_stateid_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_free_stateid(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
+{
+ ssize_t ret = xdr_stream_decode_opaque_inline(xdr, (void **)string,
+ NFS4_OPAQUE_LIMIT);
+ if (unlikely(ret < 0))
+ return -EIO;
+ *len = ret;
+ return 0;
+}
+
+static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+ ssize_t ret;
+ void *ptr;
+ u32 tmp;
+
+ if (xdr_stream_decode_u32(xdr, &tmp) < 0)
+ return -EIO;
+ hdr->status = tmp;
+
+ ret = xdr_stream_decode_opaque_inline(xdr, &ptr, NFS4_OPAQUE_LIMIT);
+ if (ret < 0)
+ return -EIO;
+ hdr->taglen = ret;
+ hdr->tag = ptr;
+
+ if (xdr_stream_decode_u32(xdr, &tmp) < 0)
+ return -EIO;
+ hdr->nops = tmp;
+ if (unlikely(hdr->nops < 1))
+ return nfs4_stat_to_errno(hdr->status);
+ return 0;
+}
+
+static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected,
+ int *nfs_retval)
+{
+ __be32 *p;
+ uint32_t opnum;
+ int32_t nfserr;
+
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ opnum = be32_to_cpup(p++);
+ if (unlikely(opnum != expected))
+ goto out_bad_operation;
+ if (unlikely(*p != cpu_to_be32(NFS_OK)))
+ goto out_status;
+ *nfs_retval = 0;
+ return true;
+out_status:
+ nfserr = be32_to_cpup(p);
+ trace_nfs4_xdr_status(xdr, opnum, nfserr);
+ *nfs_retval = nfs4_stat_to_errno(nfserr);
+ return true;
+out_bad_operation:
+ trace_nfs4_xdr_bad_operation(xdr, opnum, expected);
+ *nfs_retval = -EREMOTEIO;
+ return false;
+out_overflow:
+ *nfs_retval = -EIO;
+ return false;
+}
+
+static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
+{
+ int retval;
+
+ __decode_op_hdr(xdr, expected, &retval);
+ return retval;
+}
+
+/* Dummy routine */
+static int decode_ace(struct xdr_stream *xdr, void *ace)
+{
+ __be32 *p;
+ unsigned int strlen;
+ char *str;
+
+ p = xdr_inline_decode(xdr, 12);
+ if (unlikely(!p))
+ return -EIO;
+ return decode_opaque_inline(xdr, &strlen, &str);
+}
+
+static ssize_t
+decode_bitmap4(struct xdr_stream *xdr, uint32_t *bitmap, size_t sz)
+{
+ ssize_t ret;
+
+ ret = xdr_stream_decode_uint32_array(xdr, bitmap, sz);
+ if (likely(ret >= 0))
+ return ret;
+ if (ret != -EMSGSIZE)
+ return -EIO;
+ return sz;
+}
+
+static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
+{
+ ssize_t ret;
+ ret = decode_bitmap4(xdr, bitmap, 3);
+ return ret < 0 ? ret : 0;
+}
+
+static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigned int *savep)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *attrlen = be32_to_cpup(p);
+ *savep = xdr_stream_pos(xdr);
+ return 0;
+}
+
+static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
+{
+ if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) {
+ int ret;
+ ret = decode_attr_bitmap(xdr, bitmask);
+ if (unlikely(ret < 0))
+ return ret;
+ bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
+ } else
+ bitmask[0] = bitmask[1] = bitmask[2] = 0;
+ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
+ bitmask[0], bitmask[1], bitmask[2]);
+ return 0;
+}
+
+static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type)
+{
+ __be32 *p;
+ int ret = 0;
+
+ *type = 0;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *type = be32_to_cpup(p);
+ if (*type < NF4REG || *type > NF4NAMEDATTR) {
+ dprintk("%s: bad type %d\n", __func__, *type);
+ return -EIO;
+ }
+ bitmap[0] &= ~FATTR4_WORD0_TYPE;
+ ret = NFS_ATTR_FATTR_TYPE;
+ }
+ dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
+ return ret;
+}
+
+static int decode_attr_fh_expire_type(struct xdr_stream *xdr,
+ uint32_t *bitmap, uint32_t *type)
+{
+ __be32 *p;
+
+ *type = 0;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_FH_EXPIRE_TYPE - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *type = be32_to_cpup(p);
+ bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE;
+ }
+ dprintk("%s: expire type=0x%x\n", __func__, *type);
+ return 0;
+}
+
+static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
+{
+ __be32 *p;
+ int ret = 0;
+
+ *change = 0;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) {
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_hyper(p, change);
+ bitmap[0] &= ~FATTR4_WORD0_CHANGE;
+ ret = NFS_ATTR_FATTR_CHANGE;
+ }
+ dprintk("%s: change attribute=%Lu\n", __func__,
+ (unsigned long long)*change);
+ return ret;
+}
+
+static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
+{
+ __be32 *p;
+ int ret = 0;
+
+ *size = 0;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_hyper(p, size);
+ bitmap[0] &= ~FATTR4_WORD0_SIZE;
+ ret = NFS_ATTR_FATTR_SIZE;
+ }
+ dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
+ return ret;
+}
+
+static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+ __be32 *p;
+
+ *res = 0;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *res = be32_to_cpup(p);
+ bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
+ }
+ dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
+ return 0;
+}
+
+static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+ __be32 *p;
+
+ *res = 0;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *res = be32_to_cpup(p);
+ bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
+ }
+ dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
+ return 0;
+}
+
+static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
+{
+ __be32 *p;
+ int ret = 0;
+
+ fsid->major = 0;
+ fsid->minor = 0;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_FSID)) {
+ p = xdr_inline_decode(xdr, 16);
+ if (unlikely(!p))
+ return -EIO;
+ p = xdr_decode_hyper(p, &fsid->major);
+ xdr_decode_hyper(p, &fsid->minor);
+ bitmap[0] &= ~FATTR4_WORD0_FSID;
+ ret = NFS_ATTR_FATTR_FSID;
+ }
+ dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__,
+ (unsigned long long)fsid->major,
+ (unsigned long long)fsid->minor);
+ return ret;
+}
+
+static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+ __be32 *p;
+
+ *res = 60;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *res = be32_to_cpup(p);
+ bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
+ }
+ dprintk("%s: lease time=%u\n", __func__, (unsigned int)*res);
+ return 0;
+}
+
+static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res)
+{
+ __be32 *p;
+
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
+ *res = -be32_to_cpup(p);
+ }
+ return 0;
+}
+
+static int decode_attr_exclcreat_supported(struct xdr_stream *xdr,
+ uint32_t *bitmap, uint32_t *bitmask)
+{
+ if (likely(bitmap[2] & FATTR4_WORD2_SUPPATTR_EXCLCREAT)) {
+ int ret;
+ ret = decode_attr_bitmap(xdr, bitmask);
+ if (unlikely(ret < 0))
+ return ret;
+ bitmap[2] &= ~FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+ } else
+ bitmask[0] = bitmask[1] = bitmask[2] = 0;
+ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
+ bitmask[0], bitmask[1], bitmask[2]);
+ return 0;
+}
+
+static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
+{
+ __be32 *p;
+ u32 len;
+
+ if (fh != NULL)
+ memset(fh, 0, sizeof(*fh));
+
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ len = be32_to_cpup(p);
+ if (len > NFS4_FHSIZE || len == 0) {
+ trace_nfs4_xdr_bad_filehandle(xdr, OP_READDIR,
+ NFS4ERR_BADHANDLE);
+ return -EREMOTEIO;
+ }
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return -EIO;
+ if (fh != NULL) {
+ memcpy(fh->data, p, len);
+ fh->size = len;
+ }
+ bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
+ }
+ return 0;
+}
+
+static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+ __be32 *p;
+
+ *res = 0;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *res = be32_to_cpup(p);
+ bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
+ }
+ dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
+ return 0;
+}
+
+static int decode_attr_case_insensitive(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+ __be32 *p;
+
+ *res = 0;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_CASE_INSENSITIVE - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_CASE_INSENSITIVE)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *res = be32_to_cpup(p);
+ bitmap[0] &= ~FATTR4_WORD0_CASE_INSENSITIVE;
+ }
+ dprintk("%s: case_insensitive=%s\n", __func__, *res == 0 ? "false" : "true");
+ return 0;
+}
+
+static int decode_attr_case_preserving(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+ __be32 *p;
+
+ *res = 0;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_CASE_PRESERVING - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_CASE_PRESERVING)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *res = be32_to_cpup(p);
+ bitmap[0] &= ~FATTR4_WORD0_CASE_PRESERVING;
+ }
+ dprintk("%s: case_preserving=%s\n", __func__, *res == 0 ? "false" : "true");
+ return 0;
+}
+
+static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
+{
+ __be32 *p;
+ int ret = 0;
+
+ *fileid = 0;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) {
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_hyper(p, fileid);
+ bitmap[0] &= ~FATTR4_WORD0_FILEID;
+ ret = NFS_ATTR_FATTR_FILEID;
+ }
+ dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
+ return ret;
+}
+
+static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
+{
+ __be32 *p;
+ int ret = 0;
+
+ *fileid = 0;
+ if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
+ return -EIO;
+ if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_hyper(p, fileid);
+ bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+ ret = NFS_ATTR_FATTR_MOUNTED_ON_FILEID;
+ }
+ dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
+ return ret;
+}
+
+static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+ __be32 *p;
+ int status = 0;
+
+ *res = 0;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) {
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_hyper(p, res);
+ bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
+ }
+ dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
+ return status;
+}
+
+static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+ __be32 *p;
+ int status = 0;
+
+ *res = 0;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) {
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_hyper(p, res);
+ bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
+ }
+ dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
+ return status;
+}
+
+static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+ __be32 *p;
+ int status = 0;
+
+ *res = 0;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) {
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_hyper(p, res);
+ bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
+ }
+ dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
+ return status;
+}
+
+static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
+{
+ u32 n;
+ __be32 *p;
+ int status = 0;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ n = be32_to_cpup(p);
+ if (n == 0)
+ goto root_path;
+ dprintk("pathname4: ");
+ if (n > NFS4_PATHNAME_MAXCOMPONENTS) {
+ dprintk("cannot parse %d components in path\n", n);
+ goto out_eio;
+ }
+ for (path->ncomponents = 0; path->ncomponents < n; path->ncomponents++) {
+ struct nfs4_string *component = &path->components[path->ncomponents];
+ status = decode_opaque_inline(xdr, &component->len, &component->data);
+ if (unlikely(status != 0))
+ goto out_eio;
+ ifdebug (XDR)
+ pr_cont("%s%.*s ",
+ (path->ncomponents != n ? "/ " : ""),
+ component->len, component->data);
+ }
+out:
+ return status;
+root_path:
+/* a root pathname is sent as a zero component4 */
+ path->ncomponents = 1;
+ path->components[0].len=0;
+ path->components[0].data=NULL;
+ dprintk("pathname4: /\n");
+ goto out;
+out_eio:
+ dprintk(" status %d", status);
+ status = -EIO;
+ goto out;
+}
+
+static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
+{
+ int n;
+ __be32 *p;
+ int status = -EIO;
+
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U)))
+ goto out;
+ status = 0;
+ if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
+ goto out;
+ bitmap[0] &= ~FATTR4_WORD0_FS_LOCATIONS;
+ status = -EIO;
+ /* Ignore borken servers that return unrequested attrs */
+ if (unlikely(res == NULL))
+ goto out;
+ dprintk("%s: fsroot:\n", __func__);
+ status = decode_pathname(xdr, &res->fs_path);
+ if (unlikely(status != 0))
+ goto out;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_eio;
+ n = be32_to_cpup(p);
+ for (res->nlocations = 0; res->nlocations < n; res->nlocations++) {
+ u32 m;
+ struct nfs4_fs_location *loc;
+
+ if (res->nlocations == NFS4_FS_LOCATIONS_MAXENTRIES)
+ break;
+ loc = &res->locations[res->nlocations];
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_eio;
+ m = be32_to_cpup(p);
+
+ dprintk("%s: servers:\n", __func__);
+ for (loc->nservers = 0; loc->nservers < m; loc->nservers++) {
+ struct nfs4_string *server;
+
+ if (loc->nservers == NFS4_FS_LOCATION_MAXSERVERS) {
+ unsigned int i;
+ dprintk("%s: using first %u of %u servers "
+ "returned for location %u\n",
+ __func__,
+ NFS4_FS_LOCATION_MAXSERVERS,
+ m, res->nlocations);
+ for (i = loc->nservers; i < m; i++) {
+ unsigned int len;
+ char *data;
+ status = decode_opaque_inline(xdr, &len, &data);
+ if (unlikely(status != 0))
+ goto out_eio;
+ }
+ break;
+ }
+ server = &loc->servers[loc->nservers];
+ status = decode_opaque_inline(xdr, &server->len, &server->data);
+ if (unlikely(status != 0))
+ goto out_eio;
+ dprintk("%s ", server->data);
+ }
+ status = decode_pathname(xdr, &loc->rootpath);
+ if (unlikely(status != 0))
+ goto out_eio;
+ }
+ if (res->nlocations != 0)
+ status = NFS_ATTR_FATTR_V4_LOCATIONS;
+out:
+ dprintk("%s: fs_locations done, error = %d\n", __func__, status);
+ return status;
+out_eio:
+ status = -EIO;
+ goto out;
+}
+
+static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+ __be32 *p;
+ int status = 0;
+
+ *res = 0;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) {
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_hyper(p, res);
+ bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
+ }
+ dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
+ return status;
+}
+
+static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
+{
+ __be32 *p;
+ int status = 0;
+
+ *maxlink = 1;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *maxlink = be32_to_cpup(p);
+ bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
+ }
+ dprintk("%s: maxlink=%u\n", __func__, *maxlink);
+ return status;
+}
+
+static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
+{
+ __be32 *p;
+ int status = 0;
+
+ *maxname = 1024;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *maxname = be32_to_cpup(p);
+ bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
+ }
+ dprintk("%s: maxname=%u\n", __func__, *maxname);
+ return status;
+}
+
+static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+ __be32 *p;
+ int status = 0;
+
+ *res = 1024;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXREAD - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) {
+ uint64_t maxread;
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_hyper(p, &maxread);
+ if (maxread > 0x7FFFFFFF)
+ maxread = 0x7FFFFFFF;
+ *res = (uint32_t)maxread;
+ bitmap[0] &= ~FATTR4_WORD0_MAXREAD;
+ }
+ dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res);
+ return status;
+}
+
+static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+ __be32 *p;
+ int status = 0;
+
+ *res = 1024;
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXWRITE - 1U)))
+ return -EIO;
+ if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) {
+ uint64_t maxwrite;
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_hyper(p, &maxwrite);
+ if (maxwrite > 0x7FFFFFFF)
+ maxwrite = 0x7FFFFFFF;
+ *res = (uint32_t)maxwrite;
+ bitmap[0] &= ~FATTR4_WORD0_MAXWRITE;
+ }
+ dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res);
+ return status;
+}
+
+static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
+{
+ uint32_t tmp;
+ __be32 *p;
+ int ret = 0;
+
+ *mode = 0;
+ if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
+ return -EIO;
+ if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ tmp = be32_to_cpup(p);
+ *mode = tmp & ~S_IFMT;
+ bitmap[1] &= ~FATTR4_WORD1_MODE;
+ ret = NFS_ATTR_FATTR_MODE;
+ }
+ dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
+ return ret;
+}
+
+static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
+{
+ __be32 *p;
+ int ret = 0;
+
+ *nlink = 1;
+ if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
+ return -EIO;
+ if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *nlink = be32_to_cpup(p);
+ bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
+ ret = NFS_ATTR_FATTR_NLINK;
+ }
+ dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
+ return ret;
+}
+
+static ssize_t decode_nfs4_string(struct xdr_stream *xdr,
+ struct nfs4_string *name, gfp_t gfp_flags)
+{
+ ssize_t ret;
+
+ ret = xdr_stream_decode_string_dup(xdr, &name->data,
+ XDR_MAX_NETOBJ, gfp_flags);
+ name->len = 0;
+ if (ret > 0)
+ name->len = ret;
+ return ret;
+}
+
+static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
+ const struct nfs_server *server, kuid_t *uid,
+ struct nfs4_string *owner_name)
+{
+ ssize_t len;
+ char *p;
+
+ *uid = make_kuid(&init_user_ns, -2);
+ if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
+ return -EIO;
+ if (!(bitmap[1] & FATTR4_WORD1_OWNER))
+ return 0;
+ bitmap[1] &= ~FATTR4_WORD1_OWNER;
+
+ if (owner_name != NULL) {
+ len = decode_nfs4_string(xdr, owner_name, GFP_NOIO);
+ if (len <= 0)
+ goto out;
+ dprintk("%s: name=%s\n", __func__, owner_name->data);
+ return NFS_ATTR_FATTR_OWNER_NAME;
+ } else {
+ len = xdr_stream_decode_opaque_inline(xdr, (void **)&p,
+ XDR_MAX_NETOBJ);
+ if (len <= 0 || nfs_map_name_to_uid(server, p, len, uid) != 0)
+ goto out;
+ dprintk("%s: uid=%d\n", __func__, (int)from_kuid(&init_user_ns, *uid));
+ return NFS_ATTR_FATTR_OWNER;
+ }
+out:
+ if (len == -EBADMSG)
+ return -EIO;
+ return 0;
+}
+
+static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
+ const struct nfs_server *server, kgid_t *gid,
+ struct nfs4_string *group_name)
+{
+ ssize_t len;
+ char *p;
+
+ *gid = make_kgid(&init_user_ns, -2);
+ if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
+ return -EIO;
+ if (!(bitmap[1] & FATTR4_WORD1_OWNER_GROUP))
+ return 0;
+ bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
+
+ if (group_name != NULL) {
+ len = decode_nfs4_string(xdr, group_name, GFP_NOIO);
+ if (len <= 0)
+ goto out;
+ dprintk("%s: name=%s\n", __func__, group_name->data);
+ return NFS_ATTR_FATTR_GROUP_NAME;
+ } else {
+ len = xdr_stream_decode_opaque_inline(xdr, (void **)&p,
+ XDR_MAX_NETOBJ);
+ if (len <= 0 || nfs_map_group_to_gid(server, p, len, gid) != 0)
+ goto out;
+ dprintk("%s: gid=%d\n", __func__, (int)from_kgid(&init_user_ns, *gid));
+ return NFS_ATTR_FATTR_GROUP;
+ }
+out:
+ if (len == -EBADMSG)
+ return -EIO;
+ return 0;
+}
+
+static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
+{
+ uint32_t major = 0, minor = 0;
+ __be32 *p;
+ int ret = 0;
+
+ *rdev = MKDEV(0,0);
+ if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U)))
+ return -EIO;
+ if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) {
+ dev_t tmp;
+
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ major = be32_to_cpup(p++);
+ minor = be32_to_cpup(p);
+ tmp = MKDEV(major, minor);
+ if (MAJOR(tmp) == major && MINOR(tmp) == minor)
+ *rdev = tmp;
+ bitmap[1] &= ~ FATTR4_WORD1_RAWDEV;
+ ret = NFS_ATTR_FATTR_RDEV;
+ }
+ dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
+ return ret;
+}
+
+static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+ __be32 *p;
+ int status = 0;
+
+ *res = 0;
+ if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U)))
+ return -EIO;
+ if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) {
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_hyper(p, res);
+ bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
+ }
+ dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
+ return status;
+}
+
+static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+ __be32 *p;
+ int status = 0;
+
+ *res = 0;
+ if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U)))
+ return -EIO;
+ if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) {
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_hyper(p, res);
+ bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
+ }
+ dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
+ return status;
+}
+
+static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+ __be32 *p;
+ int status = 0;
+
+ *res = 0;
+ if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U)))
+ return -EIO;
+ if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) {
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_hyper(p, res);
+ bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
+ }
+ dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
+ return status;
+}
+
+static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
+{
+ __be32 *p;
+ int ret = 0;
+
+ *used = 0;
+ if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
+ return -EIO;
+ if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) {
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_hyper(p, used);
+ bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
+ ret = NFS_ATTR_FATTR_SPACE_USED;
+ }
+ dprintk("%s: space used=%Lu\n", __func__,
+ (unsigned long long)*used);
+ return ret;
+}
+
+static __be32 *
+xdr_decode_nfstime4(__be32 *p, struct timespec64 *t)
+{
+ __u64 sec;
+
+ p = xdr_decode_hyper(p, &sec);
+ t-> tv_sec = sec;
+ t->tv_nsec = be32_to_cpup(p++);
+ return p;
+}
+
+static int decode_attr_time(struct xdr_stream *xdr, struct timespec64 *time)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, nfstime4_maxsz << 2);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_nfstime4(p, time);
+ return 0;
+}
+
+static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time)
+{
+ int status = 0;
+
+ time->tv_sec = 0;
+ time->tv_nsec = 0;
+ if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_ACCESS - 1U)))
+ return -EIO;
+ if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) {
+ status = decode_attr_time(xdr, time);
+ if (status == 0)
+ status = NFS_ATTR_FATTR_ATIME;
+ bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS;
+ }
+ dprintk("%s: atime=%lld\n", __func__, time->tv_sec);
+ return status;
+}
+
+static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time)
+{
+ int status = 0;
+
+ time->tv_sec = 0;
+ time->tv_nsec = 0;
+ if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_METADATA - 1U)))
+ return -EIO;
+ if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) {
+ status = decode_attr_time(xdr, time);
+ if (status == 0)
+ status = NFS_ATTR_FATTR_CTIME;
+ bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA;
+ }
+ dprintk("%s: ctime=%lld\n", __func__, time->tv_sec);
+ return status;
+}
+
+static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
+ struct timespec64 *time)
+{
+ int status = 0;
+
+ time->tv_sec = 0;
+ time->tv_nsec = 0;
+ if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U)))
+ return -EIO;
+ if (likely(bitmap[1] & FATTR4_WORD1_TIME_DELTA)) {
+ status = decode_attr_time(xdr, time);
+ bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA;
+ }
+ dprintk("%s: time_delta=%lld %ld\n", __func__, time->tv_sec,
+ time->tv_nsec);
+ return status;
+}
+
+static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
+ struct nfs4_label *label)
+{
+ uint32_t pi = 0;
+ uint32_t lfs = 0;
+ __u32 len;
+ __be32 *p;
+ int status = 0;
+
+ if (unlikely(bitmap[2] & (FATTR4_WORD2_SECURITY_LABEL - 1U)))
+ return -EIO;
+ if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ lfs = be32_to_cpup(p++);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ pi = be32_to_cpup(p++);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ len = be32_to_cpup(p++);
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return -EIO;
+ bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ if (len < NFS4_MAXLABELLEN) {
+ if (label && label->len) {
+ if (label->len < len)
+ return -ERANGE;
+ memcpy(label->label, p, len);
+ label->len = len;
+ label->pi = pi;
+ label->lfs = lfs;
+ status = NFS_ATTR_FATTR_V4_SECURITY_LABEL;
+ }
+ } else
+ printk(KERN_WARNING "%s: label too long (%u)!\n",
+ __func__, len);
+ if (label && label->label)
+ dprintk("%s: label=%.*s, len=%d, PI=%d, LFS=%d\n",
+ __func__, label->len, (char *)label->label,
+ label->len, label->pi, label->lfs);
+ }
+ return status;
+}
+
+static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time)
+{
+ int status = 0;
+
+ time->tv_sec = 0;
+ time->tv_nsec = 0;
+ if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_MODIFY - 1U)))
+ return -EIO;
+ if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) {
+ status = decode_attr_time(xdr, time);
+ if (status == 0)
+ status = NFS_ATTR_FATTR_MTIME;
+ bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY;
+ }
+ dprintk("%s: mtime=%lld\n", __func__, time->tv_sec);
+ return status;
+}
+
+static int decode_attr_xattrsupport(struct xdr_stream *xdr, uint32_t *bitmap,
+ uint32_t *res)
+{
+ __be32 *p;
+
+ *res = 0;
+ if (unlikely(bitmap[2] & (FATTR4_WORD2_XATTR_SUPPORT - 1U)))
+ return -EIO;
+ if (likely(bitmap[2] & FATTR4_WORD2_XATTR_SUPPORT)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *res = be32_to_cpup(p);
+ bitmap[2] &= ~FATTR4_WORD2_XATTR_SUPPORT;
+ }
+ dprintk("%s: XATTR support=%s\n", __func__,
+ *res == 0 ? "false" : "true");
+ return 0;
+}
+
+static int verify_attr_len(struct xdr_stream *xdr, unsigned int savep, uint32_t attrlen)
+{
+ unsigned int attrwords = XDR_QUADLEN(attrlen);
+ unsigned int nwords = (xdr_stream_pos(xdr) - savep) >> 2;
+
+ if (unlikely(attrwords != nwords)) {
+ dprintk("%s: server returned incorrect attribute length: "
+ "%u %c %u\n",
+ __func__,
+ attrwords << 2,
+ (attrwords < nwords) ? '<' : '>',
+ nwords << 2);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 20);
+ if (unlikely(!p))
+ return -EIO;
+ cinfo->atomic = be32_to_cpup(p++);
+ p = xdr_decode_hyper(p, &cinfo->before);
+ xdr_decode_hyper(p, &cinfo->after);
+ return 0;
+}
+
+static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access)
+{
+ __be32 *p;
+ uint32_t supp, acc;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_ACCESS);
+ if (status)
+ return status;
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ supp = be32_to_cpup(p++);
+ acc = be32_to_cpup(p);
+ *supported = supp;
+ *access = acc;
+ return 0;
+}
+
+static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
+{
+ ssize_t ret = xdr_stream_decode_opaque_fixed(xdr, buf, len);
+ if (unlikely(ret < 0))
+ return -EIO;
+ return 0;
+}
+
+static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
+}
+
+static int decode_open_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_OPEN_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
+
+static int decode_lock_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_LOCK_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
+
+static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_DELEGATION_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
+
+static int decode_invalid_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ nfs4_stateid dummy;
+
+ nfs4_stateid_copy(stateid, &invalid_stateid);
+ return decode_stateid(xdr, &dummy);
+}
+
+static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_CLOSE);
+ if (status != -EIO)
+ nfs_increment_open_seqid(status, res->seqid);
+ if (!status)
+ status = decode_invalid_stateid(xdr, &res->stateid);
+ return status;
+}
+
+static int decode_verifier(struct xdr_stream *xdr, void *verifier)
+{
+ return decode_opaque_fixed(xdr, verifier, NFS4_VERIFIER_SIZE);
+}
+
+static int decode_write_verifier(struct xdr_stream *xdr, struct nfs_write_verifier *verifier)
+{
+ return decode_opaque_fixed(xdr, verifier->data, NFS4_VERIFIER_SIZE);
+}
+
+static int decode_commit(struct xdr_stream *xdr, struct nfs_commitres *res)
+{
+ struct nfs_writeverf *verf = res->verf;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_COMMIT);
+ if (!status)
+ status = decode_write_verifier(xdr, &verf->verifier);
+ if (!status)
+ verf->committed = NFS_FILE_SYNC;
+ return status;
+}
+
+static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
+{
+ __be32 *p;
+ uint32_t bmlen;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_CREATE);
+ if (status)
+ return status;
+ if ((status = decode_change_info(xdr, cinfo)))
+ return status;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ bmlen = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, bmlen << 2);
+ if (likely(p))
+ return 0;
+ return -EIO;
+}
+
+static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
+{
+ unsigned int savep;
+ uint32_t attrlen, bitmap[3] = {0};
+ int status;
+
+ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_supported(xdr, bitmap, res->attr_bitmask)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_fh_expire_type(xdr, bitmap,
+ &res->fh_expire_type)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_link_support(xdr, bitmap, &res->has_links)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_symlink_support(xdr, bitmap, &res->has_symlinks)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_case_insensitive(xdr, bitmap, &res->case_insensitive)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_case_preserving(xdr, bitmap, &res->case_preserving)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_exclcreat_supported(xdr, bitmap,
+ res->exclcreat_bitmask)) != 0)
+ goto xdr_error;
+ status = verify_attr_len(xdr, savep, attrlen);
+xdr_error:
+ dprintk("%s: xdr returned %d!\n", __func__, -status);
+ return status;
+}
+
+static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
+{
+ unsigned int savep;
+ uint32_t attrlen, bitmap[3] = {0};
+ int status;
+
+ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+ goto xdr_error;
+
+ if ((status = decode_attr_files_avail(xdr, bitmap, &fsstat->afiles)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_files_free(xdr, bitmap, &fsstat->ffiles)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_files_total(xdr, bitmap, &fsstat->tfiles)) != 0)
+ goto xdr_error;
+
+ status = -EIO;
+ if (unlikely(bitmap[0]))
+ goto xdr_error;
+
+ if ((status = decode_attr_space_avail(xdr, bitmap, &fsstat->abytes)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_space_free(xdr, bitmap, &fsstat->fbytes)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_space_total(xdr, bitmap, &fsstat->tbytes)) != 0)
+ goto xdr_error;
+
+ status = verify_attr_len(xdr, savep, attrlen);
+xdr_error:
+ dprintk("%s: xdr returned %d!\n", __func__, -status);
+ return status;
+}
+
+static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
+{
+ unsigned int savep;
+ uint32_t attrlen, bitmap[3] = {0};
+ int status;
+
+ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+ goto xdr_error;
+
+ if ((status = decode_attr_maxlink(xdr, bitmap, &pathconf->max_link)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_maxname(xdr, bitmap, &pathconf->max_namelen)) != 0)
+ goto xdr_error;
+
+ status = verify_attr_len(xdr, savep, attrlen);
+xdr_error:
+ dprintk("%s: xdr returned %d!\n", __func__, -status);
+ return status;
+}
+
+static int decode_threshold_hint(struct xdr_stream *xdr,
+ uint32_t *bitmap,
+ uint64_t *res,
+ uint32_t hint_bit)
+{
+ __be32 *p;
+
+ *res = 0;
+ if (likely(bitmap[0] & hint_bit)) {
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_hyper(p, res);
+ }
+ return 0;
+}
+
+static int decode_first_threshold_item4(struct xdr_stream *xdr,
+ struct nfs4_threshold *res)
+{
+ __be32 *p;
+ unsigned int savep;
+ uint32_t bitmap[3] = {0,}, attrlen;
+ int status;
+
+ /* layout type */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ res->l_type = be32_to_cpup(p);
+
+ /* thi_hintset bitmap */
+ status = decode_attr_bitmap(xdr, bitmap);
+ if (status < 0)
+ goto xdr_error;
+
+ /* thi_hintlist length */
+ status = decode_attr_length(xdr, &attrlen, &savep);
+ if (status < 0)
+ goto xdr_error;
+ /* thi_hintlist */
+ status = decode_threshold_hint(xdr, bitmap, &res->rd_sz, THRESHOLD_RD);
+ if (status < 0)
+ goto xdr_error;
+ status = decode_threshold_hint(xdr, bitmap, &res->wr_sz, THRESHOLD_WR);
+ if (status < 0)
+ goto xdr_error;
+ status = decode_threshold_hint(xdr, bitmap, &res->rd_io_sz,
+ THRESHOLD_RD_IO);
+ if (status < 0)
+ goto xdr_error;
+ status = decode_threshold_hint(xdr, bitmap, &res->wr_io_sz,
+ THRESHOLD_WR_IO);
+ if (status < 0)
+ goto xdr_error;
+
+ status = verify_attr_len(xdr, savep, attrlen);
+ res->bm = bitmap[0];
+
+ dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
+ __func__, res->bm, res->rd_sz, res->wr_sz, res->rd_io_sz,
+ res->wr_io_sz);
+xdr_error:
+ dprintk("%s ret=%d!\n", __func__, status);
+ return status;
+}
+
+/*
+ * Thresholds on pNFS direct I/O vrs MDS I/O
+ */
+static int decode_attr_mdsthreshold(struct xdr_stream *xdr,
+ uint32_t *bitmap,
+ struct nfs4_threshold *res)
+{
+ __be32 *p;
+ int status = 0;
+ uint32_t num;
+
+ if (unlikely(bitmap[2] & (FATTR4_WORD2_MDSTHRESHOLD - 1U)))
+ return -EIO;
+ if (bitmap[2] & FATTR4_WORD2_MDSTHRESHOLD) {
+ /* Did the server return an unrequested attribute? */
+ if (unlikely(res == NULL))
+ return -EREMOTEIO;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ num = be32_to_cpup(p);
+ if (num == 0)
+ return 0;
+ if (num > 1)
+ printk(KERN_INFO "%s: Warning: Multiple pNFS layout "
+ "drivers per filesystem not supported\n",
+ __func__);
+
+ status = decode_first_threshold_item4(xdr, res);
+ bitmap[2] &= ~FATTR4_WORD2_MDSTHRESHOLD;
+ }
+ return status;
+}
+
+static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
+ struct nfs_fattr *fattr, struct nfs_fh *fh,
+ struct nfs4_fs_locations *fs_loc, const struct nfs_server *server)
+{
+ int status;
+ umode_t fmode = 0;
+ uint32_t type;
+ int32_t err;
+
+ status = decode_attr_type(xdr, bitmap, &type);
+ if (status < 0)
+ goto xdr_error;
+ fattr->mode = 0;
+ if (status != 0) {
+ fattr->mode |= nfs_type2fmt[type];
+ fattr->valid |= status;
+ }
+
+ status = decode_attr_change(xdr, bitmap, &fattr->change_attr);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+
+ status = decode_attr_size(xdr, bitmap, &fattr->size);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+
+ status = decode_attr_fsid(xdr, bitmap, &fattr->fsid);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+
+ err = 0;
+ status = decode_attr_error(xdr, bitmap, &err);
+ if (status < 0)
+ goto xdr_error;
+
+ status = decode_attr_filehandle(xdr, bitmap, fh);
+ if (status < 0)
+ goto xdr_error;
+
+ status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+
+ status = decode_attr_fs_locations(xdr, bitmap, fs_loc);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+
+ status = -EIO;
+ if (unlikely(bitmap[0]))
+ goto xdr_error;
+
+ status = decode_attr_mode(xdr, bitmap, &fmode);
+ if (status < 0)
+ goto xdr_error;
+ if (status != 0) {
+ fattr->mode |= fmode;
+ fattr->valid |= status;
+ }
+
+ status = decode_attr_nlink(xdr, bitmap, &fattr->nlink);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+
+ status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, fattr->owner_name);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+
+ status = decode_attr_group(xdr, bitmap, server, &fattr->gid, fattr->group_name);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+
+ status = decode_attr_rdev(xdr, bitmap, &fattr->rdev);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+
+ status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+
+ status = decode_attr_time_access(xdr, bitmap, &fattr->atime);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+
+ status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+
+ status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+
+ status = decode_attr_mounted_on_fileid(xdr, bitmap, &fattr->mounted_on_fileid);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+
+ status = -EIO;
+ if (unlikely(bitmap[1]))
+ goto xdr_error;
+
+ status = decode_attr_mdsthreshold(xdr, bitmap, fattr->mdsthreshold);
+ if (status < 0)
+ goto xdr_error;
+
+ status = decode_attr_security_label(xdr, bitmap, fattr->label);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+
+xdr_error:
+ dprintk("%s: xdr returned %d\n", __func__, -status);
+ return status;
+}
+
+static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+ struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc,
+ const struct nfs_server *server)
+{
+ unsigned int savep;
+ uint32_t attrlen,
+ bitmap[3] = {0};
+ int status;
+
+ status = decode_op_hdr(xdr, OP_GETATTR);
+ if (status < 0)
+ goto xdr_error;
+
+ status = decode_attr_bitmap(xdr, bitmap);
+ if (status < 0)
+ goto xdr_error;
+
+ status = decode_attr_length(xdr, &attrlen, &savep);
+ if (status < 0)
+ goto xdr_error;
+
+ status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server);
+ if (status < 0)
+ goto xdr_error;
+
+ status = verify_attr_len(xdr, savep, attrlen);
+xdr_error:
+ dprintk("%s: xdr returned %d\n", __func__, -status);
+ return status;
+}
+
+static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+ const struct nfs_server *server)
+{
+ return decode_getfattr_generic(xdr, fattr, NULL, NULL, server);
+}
+
+/*
+ * Decode potentially multiple layout types.
+ */
+static int decode_pnfs_layout_types(struct xdr_stream *xdr,
+ struct nfs_fsinfo *fsinfo)
+{
+ __be32 *p;
+ uint32_t i;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ fsinfo->nlayouttypes = be32_to_cpup(p);
+
+ /* pNFS is not supported by the underlying file system */
+ if (fsinfo->nlayouttypes == 0)
+ return 0;
+
+ /* Decode and set first layout type, move xdr->p past unused types */
+ p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ /* If we get too many, then just cap it at the max */
+ if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) {
+ printk(KERN_INFO "NFS: %s: Warning: Too many (%u) pNFS layout types\n",
+ __func__, fsinfo->nlayouttypes);
+ fsinfo->nlayouttypes = NFS_MAX_LAYOUT_TYPES;
+ }
+
+ for(i = 0; i < fsinfo->nlayouttypes; ++i)
+ fsinfo->layouttype[i] = be32_to_cpup(p++);
+ return 0;
+}
+
+/*
+ * The type of file system exported.
+ * Note we must ensure that layouttype is set in any non-error case.
+ */
+static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
+ struct nfs_fsinfo *fsinfo)
+{
+ int status = 0;
+
+ dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
+ if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
+ return -EIO;
+ if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) {
+ status = decode_pnfs_layout_types(xdr, fsinfo);
+ bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
+ }
+ return status;
+}
+
+/*
+ * The prefered block size for layout directed io
+ */
+static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
+ uint32_t *res)
+{
+ __be32 *p;
+
+ dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+ *res = 0;
+ if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *res = be32_to_cpup(p);
+ bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
+ }
+ return 0;
+}
+
+/*
+ * The granularity of a CLONE operation.
+ */
+static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
+ uint32_t *res)
+{
+ __be32 *p;
+
+ dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+ *res = 0;
+ if (bitmap[2] & FATTR4_WORD2_CLONE_BLKSIZE) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *res = be32_to_cpup(p);
+ bitmap[2] &= ~FATTR4_WORD2_CLONE_BLKSIZE;
+ }
+ return 0;
+}
+
+static int decode_attr_change_attr_type(struct xdr_stream *xdr,
+ uint32_t *bitmap,
+ enum nfs4_change_attr_type *res)
+{
+ u32 tmp = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+
+ dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+ if (bitmap[2] & FATTR4_WORD2_CHANGE_ATTR_TYPE) {
+ if (xdr_stream_decode_u32(xdr, &tmp))
+ return -EIO;
+ bitmap[2] &= ~FATTR4_WORD2_CHANGE_ATTR_TYPE;
+ }
+
+ switch(tmp) {
+ case NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR:
+ case NFS4_CHANGE_TYPE_IS_VERSION_COUNTER:
+ case NFS4_CHANGE_TYPE_IS_VERSION_COUNTER_NOPNFS:
+ case NFS4_CHANGE_TYPE_IS_TIME_METADATA:
+ *res = tmp;
+ break;
+ default:
+ *res = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+ }
+ return 0;
+}
+
+static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
+{
+ unsigned int savep;
+ uint32_t attrlen, bitmap[3];
+ int status;
+
+ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+ goto xdr_error;
+
+ fsinfo->rtmult = fsinfo->wtmult = 512; /* ??? */
+
+ if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_maxfilesize(xdr, bitmap, &fsinfo->maxfilesize)) != 0)
+ goto xdr_error;
+ if ((status = decode_attr_maxread(xdr, bitmap, &fsinfo->rtmax)) != 0)
+ goto xdr_error;
+ fsinfo->rtpref = fsinfo->dtpref = fsinfo->rtmax;
+ if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
+ goto xdr_error;
+ fsinfo->wtpref = fsinfo->wtmax;
+
+ status = -EIO;
+ if (unlikely(bitmap[0]))
+ goto xdr_error;
+
+ status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
+ if (status != 0)
+ goto xdr_error;
+ status = decode_attr_pnfstype(xdr, bitmap, fsinfo);
+ if (status != 0)
+ goto xdr_error;
+
+ status = -EIO;
+ if (unlikely(bitmap[1]))
+ goto xdr_error;
+
+ status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
+ if (status)
+ goto xdr_error;
+ status = decode_attr_clone_blksize(xdr, bitmap, &fsinfo->clone_blksize);
+ if (status)
+ goto xdr_error;
+
+ status = decode_attr_change_attr_type(xdr, bitmap,
+ &fsinfo->change_attr_type);
+ if (status)
+ goto xdr_error;
+
+ status = decode_attr_xattrsupport(xdr, bitmap,
+ &fsinfo->xattr_support);
+ if (status)
+ goto xdr_error;
+
+ status = verify_attr_len(xdr, savep, attrlen);
+xdr_error:
+ dprintk("%s: xdr returned %d!\n", __func__, -status);
+ return status;
+}
+
+static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+ __be32 *p;
+ uint32_t len;
+ int status;
+
+ /* Zero handle first to allow comparisons */
+ memset(fh, 0, sizeof(*fh));
+
+ status = decode_op_hdr(xdr, OP_GETFH);
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ len = be32_to_cpup(p);
+ if (len > NFS4_FHSIZE || len == 0) {
+ trace_nfs4_xdr_bad_filehandle(xdr, OP_GETFH, NFS4ERR_BADHANDLE);
+ return -EREMOTEIO;
+ }
+ fh->size = len;
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return -EIO;
+ memcpy(fh->data, p, len);
+ return 0;
+}
+
+static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_LINK);
+ if (status)
+ return status;
+ return decode_change_info(xdr, cinfo);
+}
+
+/*
+ * We create the owner, so we know a proper owner.id length is 4.
+ */
+static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
+{
+ uint64_t offset, length, clientid;
+ __be32 *p;
+ uint32_t namelen, type;
+
+ p = xdr_inline_decode(xdr, 32); /* read 32 bytes */
+ if (unlikely(!p))
+ return -EIO;
+ p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */
+ p = xdr_decode_hyper(p, &length);
+ type = be32_to_cpup(p++); /* 4 byte read */
+ if (fl != NULL) { /* manipulate file lock */
+ fl->fl_start = (loff_t)offset;
+ fl->fl_end = fl->fl_start + (loff_t)length - 1;
+ if (length == ~(uint64_t)0)
+ fl->fl_end = OFFSET_MAX;
+ fl->fl_type = F_WRLCK;
+ if (type & 1)
+ fl->fl_type = F_RDLCK;
+ fl->fl_pid = 0;
+ }
+ p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
+ namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */
+ p = xdr_inline_decode(xdr, namelen); /* variable size field */
+ if (likely(!p))
+ return -EIO;
+ return -NFS4ERR_DENIED;
+}
+
+static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_LOCK);
+ if (status == -EIO)
+ goto out;
+ if (status == 0) {
+ status = decode_lock_stateid(xdr, &res->stateid);
+ if (unlikely(status))
+ goto out;
+ } else if (status == -NFS4ERR_DENIED)
+ status = decode_lock_denied(xdr, NULL);
+ if (res->open_seqid != NULL)
+ nfs_increment_open_seqid(status, res->open_seqid);
+ nfs_increment_lock_seqid(status, res->lock_seqid);
+out:
+ return status;
+}
+
+static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res)
+{
+ int status;
+ status = decode_op_hdr(xdr, OP_LOCKT);
+ if (status == -NFS4ERR_DENIED)
+ return decode_lock_denied(xdr, res->denied);
+ return status;
+}
+
+static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_LOCKU);
+ if (status != -EIO)
+ nfs_increment_lock_seqid(status, res->seqid);
+ if (status == 0)
+ status = decode_lock_stateid(xdr, &res->stateid);
+ return status;
+}
+
+static int decode_release_lockowner(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER);
+}
+
+static int decode_lookup(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_LOOKUP);
+}
+
+static int decode_lookupp(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_LOOKUPP);
+}
+
+/* This is too sick! */
+static int decode_space_limit(struct xdr_stream *xdr,
+ unsigned long *pagemod_limit)
+{
+ __be32 *p;
+ uint32_t limit_type, nblocks, blocksize;
+ u64 maxsize = 0;
+
+ p = xdr_inline_decode(xdr, 12);
+ if (unlikely(!p))
+ return -EIO;
+ limit_type = be32_to_cpup(p++);
+ switch (limit_type) {
+ case NFS4_LIMIT_SIZE:
+ xdr_decode_hyper(p, &maxsize);
+ break;
+ case NFS4_LIMIT_BLOCKS:
+ nblocks = be32_to_cpup(p++);
+ blocksize = be32_to_cpup(p);
+ maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
+ }
+ maxsize >>= PAGE_SHIFT;
+ *pagemod_limit = min_t(u64, maxsize, ULONG_MAX);
+ return 0;
+}
+
+static int decode_rw_delegation(struct xdr_stream *xdr,
+ uint32_t delegation_type,
+ struct nfs_openres *res)
+{
+ __be32 *p;
+ int status;
+
+ status = decode_delegation_stateid(xdr, &res->delegation);
+ if (unlikely(status))
+ return status;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ res->do_recall = be32_to_cpup(p);
+
+ switch (delegation_type) {
+ case NFS4_OPEN_DELEGATE_READ:
+ res->delegation_type = FMODE_READ;
+ break;
+ case NFS4_OPEN_DELEGATE_WRITE:
+ res->delegation_type = FMODE_WRITE|FMODE_READ;
+ if (decode_space_limit(xdr, &res->pagemod_limit) < 0)
+ return -EIO;
+ }
+ return decode_ace(xdr, NULL);
+}
+
+static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
+{
+ __be32 *p;
+ uint32_t why_no_delegation;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ why_no_delegation = be32_to_cpup(p);
+ switch (why_no_delegation) {
+ case WND4_CONTENTION:
+ case WND4_RESOURCE:
+ xdr_inline_decode(xdr, 4);
+ /* Ignore for now */
+ }
+ return 0;
+}
+
+static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
+{
+ __be32 *p;
+ uint32_t delegation_type;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ delegation_type = be32_to_cpup(p);
+ res->delegation_type = 0;
+ switch (delegation_type) {
+ case NFS4_OPEN_DELEGATE_NONE:
+ return 0;
+ case NFS4_OPEN_DELEGATE_READ:
+ case NFS4_OPEN_DELEGATE_WRITE:
+ return decode_rw_delegation(xdr, delegation_type, res);
+ case NFS4_OPEN_DELEGATE_NONE_EXT:
+ return decode_no_delegation(xdr, res);
+ }
+ return -EIO;
+}
+
+static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
+{
+ __be32 *p;
+ uint32_t savewords, bmlen, i;
+ int status;
+
+ if (!__decode_op_hdr(xdr, OP_OPEN, &status))
+ return status;
+ nfs_increment_open_seqid(status, res->seqid);
+ if (status)
+ return status;
+ status = decode_open_stateid(xdr, &res->stateid);
+ if (unlikely(status))
+ return status;
+
+ decode_change_info(xdr, &res->cinfo);
+
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ res->rflags = be32_to_cpup(p++);
+ bmlen = be32_to_cpup(p);
+ if (bmlen > 10)
+ goto xdr_error;
+
+ p = xdr_inline_decode(xdr, bmlen << 2);
+ if (unlikely(!p))
+ return -EIO;
+ savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
+ for (i = 0; i < savewords; ++i)
+ res->attrset[i] = be32_to_cpup(p++);
+ for (; i < NFS4_BITMAP_SIZE; i++)
+ res->attrset[i] = 0;
+
+ return decode_delegation(xdr, res);
+xdr_error:
+ dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen);
+ return -EIO;
+}
+
+static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
+ if (status != -EIO)
+ nfs_increment_open_seqid(status, res->seqid);
+ if (!status)
+ status = decode_open_stateid(xdr, &res->stateid);
+ return status;
+}
+
+static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
+ if (status != -EIO)
+ nfs_increment_open_seqid(status, res->seqid);
+ if (!status)
+ status = decode_open_stateid(xdr, &res->stateid);
+ return status;
+}
+
+static int decode_putfh(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_PUTFH);
+}
+
+static int decode_putrootfh(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_PUTROOTFH);
+}
+
+static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req,
+ struct nfs_pgio_res *res)
+{
+ __be32 *p;
+ uint32_t count, eof, recvd;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_READ);
+ if (status)
+ return status;
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ eof = be32_to_cpup(p++);
+ count = be32_to_cpup(p);
+ recvd = xdr_read_pages(xdr, count);
+ if (count > recvd) {
+ dprintk("NFS: server cheating in read reply: "
+ "count %u > recvd %u\n", count, recvd);
+ count = recvd;
+ eof = 0;
+ }
+ res->eof = eof;
+ res->count = count;
+ return 0;
+}
+
+static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
+{
+ int status;
+ __be32 verf[2];
+
+ status = decode_op_hdr(xdr, OP_READDIR);
+ if (!status)
+ status = decode_verifier(xdr, readdir->verifier.data);
+ if (unlikely(status))
+ return status;
+ memcpy(verf, readdir->verifier.data, sizeof(verf));
+ dprintk("%s: verifier = %08x:%08x\n",
+ __func__, verf[0], verf[1]);
+ return xdr_read_pages(xdr, xdr->buf->page_len);
+}
+
+static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
+{
+ struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
+ u32 len, recvd;
+ __be32 *p;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_READLINK);
+ if (status)
+ return status;
+
+ /* Convert length of symlink */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ len = be32_to_cpup(p);
+ if (len >= rcvbuf->page_len || len <= 0) {
+ dprintk("nfs: server returned giant symlink!\n");
+ return -ENAMETOOLONG;
+ }
+ recvd = xdr_read_pages(xdr, len);
+ if (recvd < len) {
+ dprintk("NFS: server cheating in readlink reply: "
+ "count %u > recvd %u\n", len, recvd);
+ return -EIO;
+ }
+ /*
+ * The XDR encode routine has set things up so that
+ * the link text will be copied directly into the
+ * buffer. We just have to do overflow-checking,
+ * and null-terminate the text (the VFS expects
+ * null-termination).
+ */
+ xdr_terminate_string(rcvbuf, len);
+ return 0;
+}
+
+static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_REMOVE);
+ if (status)
+ goto out;
+ status = decode_change_info(xdr, cinfo);
+out:
+ return status;
+}
+
+static int decode_rename(struct xdr_stream *xdr, struct nfs4_change_info *old_cinfo,
+ struct nfs4_change_info *new_cinfo)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_RENAME);
+ if (status)
+ goto out;
+ if ((status = decode_change_info(xdr, old_cinfo)))
+ goto out;
+ status = decode_change_info(xdr, new_cinfo);
+out:
+ return status;
+}
+
+static int decode_renew(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_RENEW);
+}
+
+static int
+decode_restorefh(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_RESTOREFH);
+}
+
+static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
+ struct nfs_getaclres *res, enum nfs4_acl_type type)
+{
+ unsigned int savep;
+ uint32_t attrlen,
+ bitmap[3] = {0};
+ int status;
+
+ res->acl_len = 0;
+ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+ goto out;
+
+ xdr_enter_page(xdr, xdr->buf->page_len);
+
+ if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+ goto out;
+ if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+ goto out;
+
+ switch (type) {
+ default:
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U)))
+ return -EIO;
+ if (!(bitmap[0] & FATTR4_WORD0_ACL))
+ return -EOPNOTSUPP;
+ break;
+ case NFS4ACL_DACL:
+ if (unlikely(bitmap[0] || bitmap[1] & (FATTR4_WORD1_DACL - 1U)))
+ return -EIO;
+ if (!(bitmap[1] & FATTR4_WORD1_DACL))
+ return -EOPNOTSUPP;
+ break;
+ case NFS4ACL_SACL:
+ if (unlikely(bitmap[0] || bitmap[1] & (FATTR4_WORD1_SACL - 1U)))
+ return -EIO;
+ if (!(bitmap[1] & FATTR4_WORD1_SACL))
+ return -EOPNOTSUPP;
+ }
+
+ /* The bitmap (xdr len + bitmaps) and the attr xdr len words
+ * are stored with the acl data to handle the problem of
+ * variable length bitmaps.*/
+ res->acl_data_offset = xdr_page_pos(xdr);
+ res->acl_len = attrlen;
+
+ /* Check for receive buffer overflow */
+ if (res->acl_len > xdr_stream_remaining(xdr) ||
+ res->acl_len + res->acl_data_offset > xdr->buf->page_len) {
+ res->acl_flags |= NFS4_ACL_TRUNC;
+ dprintk("NFS: acl reply: attrlen %u > page_len %zu\n",
+ attrlen, xdr_stream_remaining(xdr));
+ }
+out:
+ return status;
+}
+
+static int
+decode_savefh(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_SAVEFH);
+}
+
+static int decode_setattr(struct xdr_stream *xdr)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_SETATTR);
+ if (status)
+ return status;
+ if (decode_bitmap4(xdr, NULL, 0) >= 0)
+ return 0;
+ return -EIO;
+}
+
+static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res)
+{
+ __be32 *p;
+ uint32_t opnum;
+ int32_t nfserr;
+
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ opnum = be32_to_cpup(p++);
+ if (opnum != OP_SETCLIENTID) {
+ dprintk("nfs: decode_setclientid: Server returned operation"
+ " %d\n", opnum);
+ return -EIO;
+ }
+ nfserr = be32_to_cpup(p);
+ if (nfserr == NFS_OK) {
+ p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
+ if (unlikely(!p))
+ return -EIO;
+ p = xdr_decode_hyper(p, &res->clientid);
+ memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE);
+ } else if (nfserr == NFSERR_CLID_INUSE) {
+ uint32_t len;
+
+ /* skip netid string */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ len = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return -EIO;
+
+ /* skip uaddr string */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ len = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return -EIO;
+ return -NFSERR_CLID_INUSE;
+ } else
+ return nfs4_stat_to_errno(nfserr);
+
+ return 0;
+}
+
+static int decode_setclientid_confirm(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM);
+}
+
+static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res)
+{
+ __be32 *p;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_WRITE);
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ res->count = be32_to_cpup(p++);
+ res->verf->committed = be32_to_cpup(p++);
+ return decode_write_verifier(xdr, &res->verf->verifier);
+}
+
+static int decode_delegreturn(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_DELEGRETURN);
+}
+
+static int decode_secinfo_gss(struct xdr_stream *xdr,
+ struct nfs4_secinfo4 *flavor)
+{
+ u32 oid_len;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ oid_len = be32_to_cpup(p);
+ if (oid_len > GSS_OID_MAX_LEN)
+ return -EINVAL;
+
+ p = xdr_inline_decode(xdr, oid_len);
+ if (unlikely(!p))
+ return -EIO;
+ memcpy(flavor->flavor_info.oid.data, p, oid_len);
+ flavor->flavor_info.oid.len = oid_len;
+
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ flavor->flavor_info.qop = be32_to_cpup(p++);
+ flavor->flavor_info.service = be32_to_cpup(p);
+
+ return 0;
+}
+
+static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
+{
+ struct nfs4_secinfo4 *sec_flavor;
+ unsigned int i, num_flavors;
+ int status;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ res->flavors->num_flavors = 0;
+ num_flavors = be32_to_cpup(p);
+
+ for (i = 0; i < num_flavors; i++) {
+ sec_flavor = &res->flavors->flavors[i];
+ if ((char *)&sec_flavor[1] - (char *)res->flavors > PAGE_SIZE)
+ break;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ sec_flavor->flavor = be32_to_cpup(p);
+
+ if (sec_flavor->flavor == RPC_AUTH_GSS) {
+ status = decode_secinfo_gss(xdr, sec_flavor);
+ if (status)
+ goto out;
+ }
+ res->flavors->num_flavors++;
+ }
+
+ status = 0;
+out:
+ return status;
+}
+
+static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
+{
+ int status = decode_op_hdr(xdr, OP_SECINFO);
+ if (status)
+ return status;
+ return decode_secinfo_common(xdr, res);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+static int decode_secinfo_no_name(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
+{
+ int status = decode_op_hdr(xdr, OP_SECINFO_NO_NAME);
+ if (status)
+ return status;
+ return decode_secinfo_common(xdr, res);
+}
+
+static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map)
+{
+ if (xdr_stream_decode_uint32_array(xdr, op_map->u.words,
+ ARRAY_SIZE(op_map->u.words)) < 0)
+ return -EIO;
+ return 0;
+}
+
+static int decode_exchange_id(struct xdr_stream *xdr,
+ struct nfs41_exchange_id_res *res)
+{
+ __be32 *p;
+ uint32_t dummy;
+ char *dummy_str;
+ int status;
+ uint32_t impl_id_count;
+
+ status = decode_op_hdr(xdr, OP_EXCHANGE_ID);
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ xdr_decode_hyper(p, &res->clientid);
+ p = xdr_inline_decode(xdr, 12);
+ if (unlikely(!p))
+ return -EIO;
+ res->seqid = be32_to_cpup(p++);
+ res->flags = be32_to_cpup(p++);
+
+ res->state_protect.how = be32_to_cpup(p);
+ switch (res->state_protect.how) {
+ case SP4_NONE:
+ break;
+ case SP4_MACH_CRED:
+ status = decode_op_map(xdr, &res->state_protect.enforce);
+ if (status)
+ return status;
+ status = decode_op_map(xdr, &res->state_protect.allow);
+ if (status)
+ return status;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EIO;
+ }
+
+ /* server_owner4.so_minor_id */
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ p = xdr_decode_hyper(p, &res->server_owner->minor_id);
+
+ /* server_owner4.so_major_id */
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
+ memcpy(res->server_owner->major_id, dummy_str, dummy);
+ res->server_owner->major_id_sz = dummy;
+
+ /* server_scope4 */
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
+ memcpy(res->server_scope->server_scope, dummy_str, dummy);
+ res->server_scope->server_scope_sz = dummy;
+
+ /* Implementation Id */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ impl_id_count = be32_to_cpup(p++);
+
+ if (impl_id_count) {
+ /* nii_domain */
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
+ memcpy(res->impl_id->domain, dummy_str, dummy);
+
+ /* nii_name */
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
+ memcpy(res->impl_id->name, dummy_str, dummy);
+
+ /* nii_date */
+ p = xdr_inline_decode(xdr, 12);
+ if (unlikely(!p))
+ return -EIO;
+ p = xdr_decode_hyper(p, &res->impl_id->date.seconds);
+ res->impl_id->date.nseconds = be32_to_cpup(p);
+
+ /* if there's more than one entry, ignore the rest */
+ }
+ return 0;
+}
+
+static int decode_chan_attrs(struct xdr_stream *xdr,
+ struct nfs4_channel_attrs *attrs)
+{
+ __be32 *p;
+ u32 nr_attrs, val;
+
+ p = xdr_inline_decode(xdr, 28);
+ if (unlikely(!p))
+ return -EIO;
+ val = be32_to_cpup(p++); /* headerpadsz */
+ if (val)
+ return -EINVAL; /* no support for header padding yet */
+ attrs->max_rqst_sz = be32_to_cpup(p++);
+ attrs->max_resp_sz = be32_to_cpup(p++);
+ attrs->max_resp_sz_cached = be32_to_cpup(p++);
+ attrs->max_ops = be32_to_cpup(p++);
+ attrs->max_reqs = be32_to_cpup(p++);
+ nr_attrs = be32_to_cpup(p);
+ if (unlikely(nr_attrs > 1)) {
+ printk(KERN_WARNING "NFS: %s: Invalid rdma channel attrs "
+ "count %u\n", __func__, nr_attrs);
+ return -EINVAL;
+ }
+ if (nr_attrs == 1) {
+ p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */
+ if (unlikely(!p))
+ return -EIO;
+ }
+ return 0;
+}
+
+static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
+{
+ return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN);
+}
+
+static int decode_bind_conn_to_session(struct xdr_stream *xdr,
+ struct nfs41_bind_conn_to_session_res *res)
+{
+ __be32 *p;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION);
+ if (!status)
+ status = decode_sessionid(xdr, &res->sessionid);
+ if (unlikely(status))
+ return status;
+
+ /* dir flags, rdma mode bool */
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+
+ res->dir = be32_to_cpup(p++);
+ if (res->dir == 0 || res->dir > NFS4_CDFS4_BOTH)
+ return -EIO;
+ if (be32_to_cpup(p) == 0)
+ res->use_conn_in_rdma_mode = false;
+ else
+ res->use_conn_in_rdma_mode = true;
+
+ return 0;
+}
+
+static int decode_create_session(struct xdr_stream *xdr,
+ struct nfs41_create_session_res *res)
+{
+ __be32 *p;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_CREATE_SESSION);
+ if (!status)
+ status = decode_sessionid(xdr, &res->sessionid);
+ if (unlikely(status))
+ return status;
+
+ /* seqid, flags */
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ res->seqid = be32_to_cpup(p++);
+ res->flags = be32_to_cpup(p);
+
+ /* Channel attributes */
+ status = decode_chan_attrs(xdr, &res->fc_attrs);
+ if (!status)
+ status = decode_chan_attrs(xdr, &res->bc_attrs);
+ return status;
+}
+
+static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
+{
+ return decode_op_hdr(xdr, OP_DESTROY_SESSION);
+}
+
+static int decode_destroy_clientid(struct xdr_stream *xdr, void *dummy)
+{
+ return decode_op_hdr(xdr, OP_DESTROY_CLIENTID);
+}
+
+static int decode_reclaim_complete(struct xdr_stream *xdr, void *dummy)
+{
+ return decode_op_hdr(xdr, OP_RECLAIM_COMPLETE);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+static int decode_sequence(struct xdr_stream *xdr,
+ struct nfs4_sequence_res *res,
+ struct rpc_rqst *rqstp)
+{
+#if defined(CONFIG_NFS_V4_1)
+ struct nfs4_session *session;
+ struct nfs4_sessionid id;
+ u32 dummy;
+ int status;
+ __be32 *p;
+
+ if (res->sr_slot == NULL)
+ return 0;
+ if (!res->sr_slot->table->session)
+ return 0;
+
+ status = decode_op_hdr(xdr, OP_SEQUENCE);
+ if (!status)
+ status = decode_sessionid(xdr, &id);
+ if (unlikely(status))
+ goto out_err;
+
+ /*
+ * If the server returns different values for sessionID, slotID or
+ * sequence number, the server is looney tunes.
+ */
+ status = -EREMOTEIO;
+ session = res->sr_slot->table->session;
+
+ if (memcmp(id.data, session->sess_id.data,
+ NFS4_MAX_SESSIONID_LEN)) {
+ dprintk("%s Invalid session id\n", __func__);
+ goto out_err;
+ }
+
+ p = xdr_inline_decode(xdr, 20);
+ if (unlikely(!p))
+ goto out_overflow;
+
+ /* seqid */
+ dummy = be32_to_cpup(p++);
+ if (dummy != res->sr_slot->seq_nr) {
+ dprintk("%s Invalid sequence number\n", __func__);
+ goto out_err;
+ }
+ /* slot id */
+ dummy = be32_to_cpup(p++);
+ if (dummy != res->sr_slot->slot_nr) {
+ dprintk("%s Invalid slot id\n", __func__);
+ goto out_err;
+ }
+ /* highest slot id */
+ res->sr_highest_slotid = be32_to_cpup(p++);
+ /* target highest slot id */
+ res->sr_target_highest_slotid = be32_to_cpup(p++);
+ /* result flags */
+ res->sr_status_flags = be32_to_cpup(p);
+ status = 0;
+out_err:
+ res->sr_status = status;
+ return status;
+out_overflow:
+ status = -EIO;
+ goto out_err;
+#else /* CONFIG_NFS_V4_1 */
+ return 0;
+#endif /* CONFIG_NFS_V4_1 */
+}
+
+#if defined(CONFIG_NFS_V4_1)
+static int decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ stateid->type = NFS4_LAYOUT_STATEID_TYPE;
+ return decode_stateid(xdr, stateid);
+}
+
+static int decode_getdeviceinfo(struct xdr_stream *xdr,
+ struct nfs4_getdeviceinfo_res *res)
+{
+ struct pnfs_device *pdev = res->pdev;
+ __be32 *p;
+ uint32_t len, type;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_GETDEVICEINFO);
+ if (status) {
+ if (status == -ETOOSMALL) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ pdev->mincount = be32_to_cpup(p);
+ dprintk("%s: Min count too small. mincnt = %u\n",
+ __func__, pdev->mincount);
+ }
+ return status;
+ }
+
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ type = be32_to_cpup(p++);
+ if (type != pdev->layout_type) {
+ dprintk("%s: layout mismatch req: %u pdev: %u\n",
+ __func__, pdev->layout_type, type);
+ return -EINVAL;
+ }
+ /*
+ * Get the length of the opaque device_addr4. xdr_read_pages places
+ * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
+ * and places the remaining xdr data in xdr_buf->tail
+ */
+ pdev->mincount = be32_to_cpup(p);
+ if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount)
+ return -EIO;
+
+ /* Parse notification bitmap, verifying that it is zero. */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ len = be32_to_cpup(p);
+ if (len) {
+ uint32_t i;
+
+ p = xdr_inline_decode(xdr, 4 * len);
+ if (unlikely(!p))
+ return -EIO;
+
+ res->notification = be32_to_cpup(p++);
+ for (i = 1; i < len; i++) {
+ if (be32_to_cpup(p++)) {
+ dprintk("%s: unsupported notification\n",
+ __func__);
+ return -EIO;
+ }
+ }
+ }
+ return 0;
+}
+
+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
+ struct nfs4_layoutget_res *res)
+{
+ __be32 *p;
+ int status;
+ u32 layout_count;
+ u32 recvd;
+
+ status = decode_op_hdr(xdr, OP_LAYOUTGET);
+ if (status)
+ goto out;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ res->return_on_close = be32_to_cpup(p);
+ decode_layout_stateid(xdr, &res->stateid);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ layout_count = be32_to_cpup(p);
+ if (!layout_count) {
+ dprintk("%s: server responded with empty layout array\n",
+ __func__);
+ status = -EINVAL;
+ goto out;
+ }
+
+ p = xdr_inline_decode(xdr, 28);
+ if (unlikely(!p))
+ goto out_overflow;
+ p = xdr_decode_hyper(p, &res->range.offset);
+ p = xdr_decode_hyper(p, &res->range.length);
+ res->range.iomode = be32_to_cpup(p++);
+ res->type = be32_to_cpup(p++);
+ res->layoutp->len = be32_to_cpup(p);
+
+ dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
+ __func__,
+ (unsigned long)res->range.offset,
+ (unsigned long)res->range.length,
+ res->range.iomode,
+ res->type,
+ res->layoutp->len);
+
+ recvd = xdr_read_pages(xdr, res->layoutp->len);
+ if (res->layoutp->len > recvd) {
+ dprintk("NFS: server cheating in layoutget reply: "
+ "layout len %u > recvd %u\n",
+ res->layoutp->len, recvd);
+ status = -EINVAL;
+ goto out;
+ }
+
+ if (layout_count > 1) {
+ /* We only handle a length one array at the moment. Any
+ * further entries are just ignored. Note that this means
+ * the client may see a response that is less than the
+ * minimum it requested.
+ */
+ dprintk("%s: server responded with %d layouts, dropping tail\n",
+ __func__, layout_count);
+ }
+
+out:
+ res->status = status;
+ return status;
+out_overflow:
+ status = -EIO;
+ goto out;
+}
+
+static int decode_layoutreturn(struct xdr_stream *xdr,
+ struct nfs4_layoutreturn_res *res)
+{
+ __be32 *p;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
+ if (status)
+ return status;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ res->lrs_present = be32_to_cpup(p);
+ if (res->lrs_present)
+ status = decode_layout_stateid(xdr, &res->stateid);
+ else
+ nfs4_stateid_copy(&res->stateid, &invalid_stateid);
+ return status;
+}
+
+static int decode_layoutcommit(struct xdr_stream *xdr,
+ struct rpc_rqst *req,
+ struct nfs4_layoutcommit_res *res)
+{
+ __be32 *p;
+ __u32 sizechanged;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
+ res->status = status;
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ sizechanged = be32_to_cpup(p);
+
+ if (sizechanged) {
+ /* throw away new size */
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+ }
+ return 0;
+}
+
+static int decode_test_stateid(struct xdr_stream *xdr,
+ struct nfs41_test_stateid_res *res)
+{
+ __be32 *p;
+ int status;
+ int num_res;
+
+ status = decode_op_hdr(xdr, OP_TEST_STATEID);
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ num_res = be32_to_cpup(p++);
+ if (num_res != 1)
+ return -EIO;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ res->status = be32_to_cpup(p++);
+
+ return status;
+}
+
+static int decode_free_stateid(struct xdr_stream *xdr,
+ struct nfs41_free_stateid_res *res)
+{
+ res->status = decode_op_hdr(xdr, OP_FREE_STATEID);
+ return res->status;
+}
+#else
+static inline
+int decode_layoutreturn(struct xdr_stream *xdr,
+ struct nfs4_layoutreturn_res *res)
+{
+ return 0;
+}
+
+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
+ struct nfs4_layoutget_res *res)
+{
+ return 0;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * END OF "GENERIC" DECODE ROUTINES.
+ */
+
+/*
+ * Decode OPEN_DOWNGRADE response
+ */
+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_closeres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ if (res->lr_res) {
+ status = decode_layoutreturn(xdr, res->lr_res);
+ res->lr_ret = status;
+ if (status)
+ goto out;
+ }
+ status = decode_open_downgrade(xdr, res);
+out:
+ return status;
+}
+
+/*
+ * Decode ACCESS response
+ */
+static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_accessres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status != 0)
+ goto out;
+ status = decode_access(xdr, &res->supported, &res->access);
+ if (status != 0)
+ goto out;
+ if (res->fattr)
+ decode_getfattr(xdr, res->fattr, res->server);
+out:
+ return status;
+}
+
+/*
+ * Decode LOOKUP response
+ */
+static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_lookup_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_lookup(xdr);
+ if (status)
+ goto out;
+ status = decode_getfh(xdr, res->fh);
+ if (status)
+ goto out;
+ status = decode_getfattr(xdr, res->fattr, res->server);
+out:
+ return status;
+}
+
+/*
+ * Decode LOOKUPP response
+ */
+static int nfs4_xdr_dec_lookupp(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_lookupp_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_lookupp(xdr);
+ if (status)
+ goto out;
+ status = decode_getfh(xdr, res->fh);
+ if (status)
+ goto out;
+ status = decode_getfattr(xdr, res->fattr, res->server);
+out:
+ return status;
+}
+
+/*
+ * Decode LOOKUP_ROOT response
+ */
+static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_lookup_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putrootfh(xdr);
+ if (status)
+ goto out;
+ status = decode_getfh(xdr, res->fh);
+ if (status == 0)
+ status = decode_getfattr(xdr, res->fattr, res->server);
+out:
+ return status;
+}
+
+/*
+ * Decode REMOVE response
+ */
+static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_removeres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_remove(xdr, &res->cinfo);
+out:
+ return status;
+}
+
+/*
+ * Decode RENAME response
+ */
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_renameres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_savefh(xdr);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo);
+out:
+ return status;
+}
+
+/*
+ * Decode LINK response
+ */
+static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_link_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_savefh(xdr);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_link(xdr, &res->cinfo);
+ if (status)
+ goto out;
+ /*
+ * Note order: OP_LINK leaves the directory as the current
+ * filehandle.
+ */
+ status = decode_restorefh(xdr);
+ if (status)
+ goto out;
+ decode_getfattr(xdr, res->fattr, res->server);
+out:
+ return status;
+}
+
+/*
+ * Decode CREATE response
+ */
+static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_create_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_create(xdr, &res->dir_cinfo);
+ if (status)
+ goto out;
+ status = decode_getfh(xdr, res->fh);
+ if (status)
+ goto out;
+ decode_getfattr(xdr, res->fattr, res->server);
+out:
+ return status;
+}
+
+/*
+ * Decode SYMLINK response
+ */
+static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *res)
+{
+ return nfs4_xdr_dec_create(rqstp, xdr, res);
+}
+
+/*
+ * Decode GETATTR response
+ */
+static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_getattr_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_getfattr(xdr, res->fattr, res->server);
+out:
+ return status;
+}
+
+/*
+ * Encode an SETACL request
+ */
+static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_setaclargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_setacl(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Decode SETACL response
+ */
+static int
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_setaclres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_setattr(xdr);
+out:
+ return status;
+}
+
+/*
+ * Decode GETACL response
+ */
+static int
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_getaclres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ if (res->acl_scratch != NULL)
+ xdr_set_scratch_page(xdr, res->acl_scratch);
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_getacl(xdr, rqstp, res, res->acl_type);
+
+out:
+ return status;
+}
+
+/*
+ * Decode CLOSE response
+ */
+static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_closeres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ if (res->lr_res) {
+ status = decode_layoutreturn(xdr, res->lr_res);
+ res->lr_ret = status;
+ if (status)
+ goto out;
+ }
+ if (res->fattr != NULL) {
+ status = decode_getfattr(xdr, res->fattr, res->server);
+ if (status != 0)
+ goto out;
+ }
+ status = decode_close(xdr, res);
+out:
+ return status;
+}
+
+/*
+ * Decode OPEN response
+ */
+static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_openres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_open(xdr, res);
+ if (status)
+ goto out;
+ status = decode_getfh(xdr, &res->fh);
+ if (status)
+ goto out;
+ if (res->access_request)
+ decode_access(xdr, &res->access_supported, &res->access_result);
+ decode_getfattr(xdr, res->f_attr, res->server);
+ if (res->lg_res)
+ decode_layoutget(xdr, rqstp, res->lg_res);
+out:
+ return status;
+}
+
+/*
+ * Decode OPEN_CONFIRM response
+ */
+static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_open_confirmres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_open_confirm(xdr, res);
+out:
+ return status;
+}
+
+/*
+ * Decode OPEN response
+ */
+static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_openres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_open(xdr, res);
+ if (status)
+ goto out;
+ if (res->access_request)
+ decode_access(xdr, &res->access_supported, &res->access_result);
+ decode_getfattr(xdr, res->f_attr, res->server);
+ if (res->lg_res)
+ decode_layoutget(xdr, rqstp, res->lg_res);
+out:
+ return status;
+}
+
+/*
+ * Decode SETATTR response
+ */
+static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_setattrres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_setattr(xdr);
+ if (status)
+ goto out;
+ decode_getfattr(xdr, res->fattr, res->server);
+out:
+ return status;
+}
+
+/*
+ * Decode LOCK response
+ */
+static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_lock_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_lock(xdr, res);
+out:
+ return status;
+}
+
+/*
+ * Decode LOCKT response
+ */
+static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_lockt_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_lockt(xdr, res);
+out:
+ return status;
+}
+
+/*
+ * Decode LOCKU response
+ */
+static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_locku_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_locku(xdr, res);
+out:
+ return status;
+}
+
+static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr, void *dummy)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (!status)
+ status = decode_release_lockowner(xdr);
+ return status;
+}
+
+/*
+ * Decode READLINK response
+ */
+static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_readlink_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_readlink(xdr, rqstp);
+out:
+ return status;
+}
+
+/*
+ * Decode READDIR response
+ */
+static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_readdir_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_readdir(xdr, rqstp, res);
+out:
+ return status;
+}
+
+/*
+ * Decode Read response
+ */
+static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_pgio_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ res->op_status = hdr.status;
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_read(xdr, rqstp, res);
+ if (!status)
+ status = res->count;
+out:
+ return status;
+}
+
+/*
+ * Decode WRITE response
+ */
+static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_pgio_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ res->op_status = hdr.status;
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_write(xdr, res);
+ if (status)
+ goto out;
+ if (res->fattr)
+ decode_getfattr(xdr, res->fattr, res->server);
+ if (!status)
+ status = res->count;
+out:
+ return status;
+}
+
+/*
+ * Decode COMMIT response
+ */
+static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs_commitres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ res->op_status = hdr.status;
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_commit(xdr, res);
+out:
+ return status;
+}
+
+/*
+ * Decode FSINFO response
+ */
+static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_fsinfo_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (!status)
+ status = decode_sequence(xdr, &res->seq_res, req);
+ if (!status)
+ status = decode_putfh(xdr);
+ if (!status)
+ status = decode_fsinfo(xdr, res->fsinfo);
+ return status;
+}
+
+/*
+ * Decode PATHCONF response
+ */
+static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_pathconf_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (!status)
+ status = decode_sequence(xdr, &res->seq_res, req);
+ if (!status)
+ status = decode_putfh(xdr);
+ if (!status)
+ status = decode_pathconf(xdr, res->pathconf);
+ return status;
+}
+
+/*
+ * Decode STATFS response
+ */
+static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_statfs_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (!status)
+ status = decode_sequence(xdr, &res->seq_res, req);
+ if (!status)
+ status = decode_putfh(xdr);
+ if (!status)
+ status = decode_statfs(xdr, res->fsstat);
+ return status;
+}
+
+/*
+ * Decode GETATTR_BITMAP response
+ */
+static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_server_caps_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, req);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_server_caps(xdr, res);
+out:
+ return status;
+}
+
+/*
+ * Decode RENEW response
+ */
+static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ void *__unused)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (!status)
+ status = decode_renew(xdr);
+ return status;
+}
+
+/*
+ * Decode SETCLIENTID response
+ */
+static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_setclientid_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (!status)
+ status = decode_setclientid(xdr, res);
+ return status;
+}
+
+/*
+ * Decode SETCLIENTID_CONFIRM response
+ */
+static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (!status)
+ status = decode_setclientid_confirm(xdr);
+ return status;
+}
+
+/*
+ * Decode DELEGRETURN response
+ */
+static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_delegreturnres *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status != 0)
+ goto out;
+ if (res->lr_res) {
+ status = decode_layoutreturn(xdr, res->lr_res);
+ res->lr_ret = status;
+ if (status)
+ goto out;
+ }
+ if (res->fattr) {
+ status = decode_getfattr(xdr, res->fattr, res->server);
+ if (status != 0)
+ goto out;
+ }
+ status = decode_delegreturn(xdr);
+out:
+ return status;
+}
+
+/*
+ * Decode FS_LOCATIONS response
+ */
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_fs_locations_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, req);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ if (res->migration) {
+ xdr_enter_page(xdr, PAGE_SIZE);
+ status = decode_getfattr_generic(xdr,
+ res->fs_locations->fattr,
+ NULL, res->fs_locations,
+ res->fs_locations->server);
+ if (status)
+ goto out;
+ if (res->renew)
+ status = decode_renew(xdr);
+ } else {
+ status = decode_lookup(xdr);
+ if (status)
+ goto out;
+ xdr_enter_page(xdr, PAGE_SIZE);
+ status = decode_getfattr_generic(xdr,
+ res->fs_locations->fattr,
+ NULL, res->fs_locations,
+ res->fs_locations->server);
+ }
+out:
+ return status;
+}
+
+/*
+ * Decode SECINFO response
+ */
+static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_secinfo_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_secinfo(xdr, res);
+out:
+ return status;
+}
+
+/*
+ * Decode FSID_PRESENT response
+ */
+static int nfs4_xdr_dec_fsid_present(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_fsid_present_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_getfh(xdr, res->fh);
+ if (status)
+ goto out;
+ if (res->renew)
+ status = decode_renew(xdr);
+out:
+ return status;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Decode BIND_CONN_TO_SESSION response
+ */
+static int nfs4_xdr_dec_bind_conn_to_session(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (!status)
+ status = decode_bind_conn_to_session(xdr, res);
+ return status;
+}
+
+/*
+ * Decode EXCHANGE_ID response
+ */
+static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (!status)
+ status = decode_exchange_id(xdr, res);
+ return status;
+}
+
+/*
+ * Decode CREATE_SESSION response
+ */
+static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (!status)
+ status = decode_create_session(xdr, res);
+ return status;
+}
+
+/*
+ * Decode DESTROY_SESSION response
+ */
+static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (!status)
+ status = decode_destroy_session(xdr, res);
+ return status;
+}
+
+/*
+ * Decode DESTROY_CLIENTID response
+ */
+static int nfs4_xdr_dec_destroy_clientid(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (!status)
+ status = decode_destroy_clientid(xdr, res);
+ return status;
+}
+
+/*
+ * Decode SEQUENCE response
+ */
+static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (!status)
+ status = decode_sequence(xdr, res, rqstp);
+ return status;
+}
+
+#endif
+
+/*
+ * Decode GET_LEASE_TIME response
+ */
+static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_get_lease_time_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (!status)
+ status = decode_sequence(xdr, &res->lr_seq_res, rqstp);
+ if (!status)
+ status = decode_putrootfh(xdr);
+ if (!status)
+ status = decode_fsinfo(xdr, res->lr_fsinfo);
+ return status;
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * Decode RECLAIM_COMPLETE response
+ */
+static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs41_reclaim_complete_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (!status)
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (!status)
+ status = decode_reclaim_complete(xdr, NULL);
+ return status;
+}
+
+/*
+ * Decode GETDEVINFO response
+ */
+static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_getdeviceinfo_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status != 0)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status != 0)
+ goto out;
+ status = decode_getdeviceinfo(xdr, res);
+out:
+ return status;
+}
+
+/*
+ * Decode LAYOUTGET response
+ */
+static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_layoutget_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_layoutget(xdr, rqstp, res);
+out:
+ return status;
+}
+
+/*
+ * Decode LAYOUTRETURN response
+ */
+static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_layoutreturn_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_layoutreturn(xdr, res);
+out:
+ return status;
+}
+
+/*
+ * Decode LAYOUTCOMMIT response
+ */
+static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_layoutcommit_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_layoutcommit(xdr, rqstp, res);
+ if (status)
+ goto out;
+ decode_getfattr(xdr, res->fattr, res->server);
+out:
+ return status;
+}
+
+/*
+ * Decode SECINFO_NO_NAME response
+ */
+static int nfs4_xdr_dec_secinfo_no_name(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs4_secinfo_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putrootfh(xdr);
+ if (status)
+ goto out;
+ status = decode_secinfo_no_name(xdr, res);
+out:
+ return status;
+}
+
+/*
+ * Decode TEST_STATEID response
+ */
+static int nfs4_xdr_dec_test_stateid(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs41_test_stateid_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_test_stateid(xdr, res);
+out:
+ return status;
+}
+
+/*
+ * Decode FREE_STATEID response
+ */
+static int nfs4_xdr_dec_free_stateid(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs41_free_stateid_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_free_stateid(xdr, res);
+out:
+ return status;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in
+ * the local page cache.
+ * @xdr: XDR stream where entry resides
+ * @entry: buffer to fill in with entry data
+ * @plus: boolean indicating whether this should be a readdirplus entry
+ *
+ * Returns zero if successful, otherwise a negative errno value is
+ * returned.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ */
+int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+ bool plus)
+{
+ unsigned int savep;
+ uint32_t bitmap[3] = {0};
+ uint32_t len;
+ uint64_t new_cookie;
+ __be32 *p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EAGAIN;
+ if (*p == xdr_zero) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EAGAIN;
+ if (*p == xdr_zero)
+ return -EAGAIN;
+ entry->eof = 1;
+ return -EBADCOOKIE;
+ }
+
+ p = xdr_inline_decode(xdr, 12);
+ if (unlikely(!p))
+ return -EAGAIN;
+ p = xdr_decode_hyper(p, &new_cookie);
+ entry->len = be32_to_cpup(p);
+
+ p = xdr_inline_decode(xdr, entry->len);
+ if (unlikely(!p))
+ return -EAGAIN;
+ entry->name = (const char *) p;
+
+ /*
+ * In case the server doesn't return an inode number,
+ * we fake one here. (We don't use inode number 0,
+ * since glibc seems to choke on it...)
+ */
+ entry->ino = 1;
+ entry->fattr->valid = 0;
+
+ if (decode_attr_bitmap(xdr, bitmap) < 0)
+ return -EAGAIN;
+
+ if (decode_attr_length(xdr, &len, &savep) < 0)
+ return -EAGAIN;
+
+ if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
+ NULL, entry->server) < 0)
+ return -EAGAIN;
+ if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
+ entry->ino = entry->fattr->mounted_on_fileid;
+ else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
+ entry->ino = entry->fattr->fileid;
+
+ entry->d_type = DT_UNKNOWN;
+ if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
+ entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
+
+ entry->cookie = new_cookie;
+
+ return 0;
+}
+
+/*
+ * We need to translate between nfs status return values and
+ * the local errno values which may not be the same.
+ */
+static struct {
+ int stat;
+ int errno;
+} nfs_errtbl[] = {
+ { NFS4_OK, 0 },
+ { NFS4ERR_PERM, -EPERM },
+ { NFS4ERR_NOENT, -ENOENT },
+ { NFS4ERR_IO, -errno_NFSERR_IO},
+ { NFS4ERR_NXIO, -ENXIO },
+ { NFS4ERR_ACCESS, -EACCES },
+ { NFS4ERR_EXIST, -EEXIST },
+ { NFS4ERR_XDEV, -EXDEV },
+ { NFS4ERR_NOTDIR, -ENOTDIR },
+ { NFS4ERR_ISDIR, -EISDIR },
+ { NFS4ERR_INVAL, -EINVAL },
+ { NFS4ERR_FBIG, -EFBIG },
+ { NFS4ERR_NOSPC, -ENOSPC },
+ { NFS4ERR_ROFS, -EROFS },
+ { NFS4ERR_MLINK, -EMLINK },
+ { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG },
+ { NFS4ERR_NOTEMPTY, -ENOTEMPTY },
+ { NFS4ERR_DQUOT, -EDQUOT },
+ { NFS4ERR_STALE, -ESTALE },
+ { NFS4ERR_BADHANDLE, -EBADHANDLE },
+ { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
+ { NFS4ERR_NOTSUPP, -ENOTSUPP },
+ { NFS4ERR_TOOSMALL, -ETOOSMALL },
+ { NFS4ERR_SERVERFAULT, -EREMOTEIO },
+ { NFS4ERR_BADTYPE, -EBADTYPE },
+ { NFS4ERR_LOCKED, -EAGAIN },
+ { NFS4ERR_SYMLINK, -ELOOP },
+ { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
+ { NFS4ERR_DEADLOCK, -EDEADLK },
+ { NFS4ERR_NOXATTR, -ENODATA },
+ { NFS4ERR_XATTR2BIG, -E2BIG },
+ { -1, -EIO }
+};
+
+/*
+ * Convert an NFS error code to a local one.
+ * This one is used jointly by NFSv2 and NFSv3.
+ */
+static int
+nfs4_stat_to_errno(int stat)
+{
+ int i;
+ for (i = 0; nfs_errtbl[i].stat != -1; i++) {
+ if (nfs_errtbl[i].stat == stat)
+ return nfs_errtbl[i].errno;
+ }
+ if (stat <= 10000 || stat > 10100) {
+ /* The server is looney tunes. */
+ return -EREMOTEIO;
+ }
+ /* If we cannot translate the error, the recovery routines should
+ * handle it.
+ * Note: remaining NFSv4 error codes have values > 10000, so should
+ * not conflict with native Linux error codes.
+ */
+ return -stat;
+}
+
+#ifdef CONFIG_NFS_V4_2
+#include "nfs42xdr.c"
+#endif /* CONFIG_NFS_V4_2 */
+
+#define PROC(proc, argtype, restype) \
+[NFSPROC4_CLNT_##proc] = { \
+ .p_proc = NFSPROC4_COMPOUND, \
+ .p_encode = nfs4_xdr_##argtype, \
+ .p_decode = nfs4_xdr_##restype, \
+ .p_arglen = NFS4_##argtype##_sz, \
+ .p_replen = NFS4_##restype##_sz, \
+ .p_statidx = NFSPROC4_CLNT_##proc, \
+ .p_name = #proc, \
+}
+
+#define STUB(proc) \
+[NFSPROC4_CLNT_##proc] = { \
+ .p_name = #proc, \
+}
+
+#if defined(CONFIG_NFS_V4_1)
+#define PROC41(proc, argtype, restype) \
+ PROC(proc, argtype, restype)
+#else
+#define PROC41(proc, argtype, restype) \
+ STUB(proc)
+#endif
+
+#if defined(CONFIG_NFS_V4_2)
+#define PROC42(proc, argtype, restype) \
+ PROC(proc, argtype, restype)
+#else
+#define PROC42(proc, argtype, restype) \
+ STUB(proc)
+#endif
+
+const struct rpc_procinfo nfs4_procedures[] = {
+ PROC(READ, enc_read, dec_read),
+ PROC(WRITE, enc_write, dec_write),
+ PROC(COMMIT, enc_commit, dec_commit),
+ PROC(OPEN, enc_open, dec_open),
+ PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm),
+ PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr),
+ PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade),
+ PROC(CLOSE, enc_close, dec_close),
+ PROC(SETATTR, enc_setattr, dec_setattr),
+ PROC(FSINFO, enc_fsinfo, dec_fsinfo),
+ PROC(RENEW, enc_renew, dec_renew),
+ PROC(SETCLIENTID, enc_setclientid, dec_setclientid),
+ PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm),
+ PROC(LOCK, enc_lock, dec_lock),
+ PROC(LOCKT, enc_lockt, dec_lockt),
+ PROC(LOCKU, enc_locku, dec_locku),
+ PROC(ACCESS, enc_access, dec_access),
+ PROC(GETATTR, enc_getattr, dec_getattr),
+ PROC(LOOKUP, enc_lookup, dec_lookup),
+ PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root),
+ PROC(REMOVE, enc_remove, dec_remove),
+ PROC(RENAME, enc_rename, dec_rename),
+ PROC(LINK, enc_link, dec_link),
+ PROC(SYMLINK, enc_symlink, dec_symlink),
+ PROC(CREATE, enc_create, dec_create),
+ PROC(PATHCONF, enc_pathconf, dec_pathconf),
+ PROC(STATFS, enc_statfs, dec_statfs),
+ PROC(READLINK, enc_readlink, dec_readlink),
+ PROC(READDIR, enc_readdir, dec_readdir),
+ PROC(SERVER_CAPS, enc_server_caps, dec_server_caps),
+ PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn),
+ PROC(GETACL, enc_getacl, dec_getacl),
+ PROC(SETACL, enc_setacl, dec_setacl),
+ PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
+ PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
+ PROC(SECINFO, enc_secinfo, dec_secinfo),
+ PROC(FSID_PRESENT, enc_fsid_present, dec_fsid_present),
+ PROC41(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
+ PROC41(CREATE_SESSION, enc_create_session, dec_create_session),
+ PROC41(DESTROY_SESSION, enc_destroy_session, dec_destroy_session),
+ PROC41(SEQUENCE, enc_sequence, dec_sequence),
+ PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
+ PROC41(RECLAIM_COMPLETE,enc_reclaim_complete, dec_reclaim_complete),
+ PROC41(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
+ PROC41(LAYOUTGET, enc_layoutget, dec_layoutget),
+ PROC41(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
+ PROC41(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
+ PROC41(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name),
+ PROC41(TEST_STATEID, enc_test_stateid, dec_test_stateid),
+ PROC41(FREE_STATEID, enc_free_stateid, dec_free_stateid),
+ STUB(GETDEVICELIST),
+ PROC41(BIND_CONN_TO_SESSION,
+ enc_bind_conn_to_session, dec_bind_conn_to_session),
+ PROC41(DESTROY_CLIENTID,enc_destroy_clientid, dec_destroy_clientid),
+ PROC42(SEEK, enc_seek, dec_seek),
+ PROC42(ALLOCATE, enc_allocate, dec_allocate),
+ PROC42(DEALLOCATE, enc_deallocate, dec_deallocate),
+ PROC42(LAYOUTSTATS, enc_layoutstats, dec_layoutstats),
+ PROC42(CLONE, enc_clone, dec_clone),
+ PROC42(COPY, enc_copy, dec_copy),
+ PROC42(OFFLOAD_CANCEL, enc_offload_cancel, dec_offload_cancel),
+ PROC42(COPY_NOTIFY, enc_copy_notify, dec_copy_notify),
+ PROC(LOOKUPP, enc_lookupp, dec_lookupp),
+ PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror),
+ PROC42(GETXATTR, enc_getxattr, dec_getxattr),
+ PROC42(SETXATTR, enc_setxattr, dec_setxattr),
+ PROC42(LISTXATTRS, enc_listxattrs, dec_listxattrs),
+ PROC42(REMOVEXATTR, enc_removexattr, dec_removexattr),
+ PROC42(READ_PLUS, enc_read_plus, dec_read_plus),
+};
+
+static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
+const struct rpc_version nfs_version4 = {
+ .number = 4,
+ .nrprocs = ARRAY_SIZE(nfs4_procedures),
+ .procs = nfs4_procedures,
+ .counts = nfs_version4_counts,
+};
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
new file mode 100644
index 0000000000..7600100ba2
--- /dev/null
+++ b/fs/nfs/nfsroot.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1995, 1996 Gero Kuhlmann <gero@gkminix.han.de>
+ *
+ * Allow an NFS filesystem to be mounted as root. The way this works is:
+ * (1) Use the IP autoconfig mechanism to set local IP addresses and routes.
+ * (2) Construct the device string and the options string using DHCP
+ * option 17 and/or kernel command line options.
+ * (3) When mount_root() sets up the root file system, pass these strings
+ * to the NFS client's regular mount interface via sys_mount().
+ *
+ *
+ * Changes:
+ *
+ * Alan Cox : Removed get_address name clash with FPU.
+ * Alan Cox : Reformatted a bit.
+ * Gero Kuhlmann : Code cleanup
+ * Michael Rausch : Fixed recognition of an incoming RARP answer.
+ * Martin Mares : (2.0) Auto-configuration via BOOTP supported.
+ * Martin Mares : Manual selection of interface & BOOTP/RARP.
+ * Martin Mares : Using network routes instead of host routes,
+ * allowing the default configuration to be used
+ * for normal operation of the host.
+ * Martin Mares : Randomized timer with exponential backoff
+ * installed to minimize network congestion.
+ * Martin Mares : Code cleanup.
+ * Martin Mares : (2.1) BOOTP and RARP made configuration options.
+ * Martin Mares : Server hostname generation fixed.
+ * Gerd Knorr : Fixed wired inode handling
+ * Martin Mares : (2.2) "0.0.0.0" addresses from command line ignored.
+ * Martin Mares : RARP replies not tested for server address.
+ * Gero Kuhlmann : (2.3) Some bug fixes and code cleanup again (please
+ * send me your new patches _before_ bothering
+ * Linus so that I don' always have to cleanup
+ * _afterwards_ - thanks)
+ * Gero Kuhlmann : Last changes of Martin Mares undone.
+ * Gero Kuhlmann : RARP replies are tested for specified server
+ * again. However, it's now possible to have
+ * different RARP and NFS servers.
+ * Gero Kuhlmann : "0.0.0.0" addresses from command line are
+ * now mapped to INADDR_NONE.
+ * Gero Kuhlmann : Fixed a bug which prevented BOOTP path name
+ * from being used (thanks to Leo Spiekman)
+ * Andy Walker : Allow to specify the NFS server in nfs_root
+ * without giving a path name
+ * Swen Thümmler : Allow to specify the NFS options in nfs_root
+ * without giving a path name. Fix BOOTP request
+ * for domainname (domainname is NIS domain, not
+ * DNS domain!). Skip dummy devices for BOOTP.
+ * Jacek Zapala : Fixed a bug which prevented server-ip address
+ * from nfsroot parameter from being used.
+ * Olaf Kirch : Adapted to new NFS code.
+ * Jakub Jelinek : Free used code segment.
+ * Marko Kohtala : Fixed some bugs.
+ * Martin Mares : Debug message cleanup
+ * Martin Mares : Changed to use the new generic IP layer autoconfig
+ * code. BOOTP and RARP moved there.
+ * Martin Mares : Default path now contains host name instead of
+ * host IP address (but host name defaults to IP
+ * address anyway).
+ * Martin Mares : Use root_server_addr appropriately during setup.
+ * Martin Mares : Rewrote parameter parsing, now hopefully giving
+ * correct overriding.
+ * Trond Myklebust : Add in preliminary support for NFSv3 and TCP.
+ * Fix bug in root_nfs_addr(). nfs_data.namlen
+ * is NOT for the length of the hostname.
+ * Hua Qin : Support for mounting root file system via
+ * NFS over TCP.
+ * Fabian Frederick: Option parser rebuilt (using parser lib)
+ * Chuck Lever : Use super.c's text-based mount option parsing
+ * Chuck Lever : Add "nfsrootdebug".
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/nfs.h>
+#include <linux/nfs_fs.h>
+#include <linux/utsname.h>
+#include <linux/root_dev.h>
+#include <net/ipconfig.h>
+
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_ROOT
+
+/* Default path we try to mount. "%s" gets replaced by our IP address */
+#define NFS_ROOT "/tftpboot/%s"
+
+/* Default NFSROOT mount options. */
+#if defined(CONFIG_NFS_V2)
+#define NFS_DEF_OPTIONS "vers=2,tcp,rsize=4096,wsize=4096"
+#elif defined(CONFIG_NFS_V3)
+#define NFS_DEF_OPTIONS "vers=3,tcp,rsize=4096,wsize=4096"
+#else
+#define NFS_DEF_OPTIONS "vers=4,tcp,rsize=4096,wsize=4096"
+#endif
+
+/* Parameters passed from the kernel command line */
+static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = "";
+
+/* Text-based mount options passed to super.c */
+static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS;
+
+/* Address of NFS server */
+static __be32 servaddr __initdata = htonl(INADDR_NONE);
+
+/* Name of directory to mount */
+static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
+
+/* server:export path string passed to super.c */
+static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
+
+#ifdef NFS_DEBUG
+/*
+ * When the "nfsrootdebug" kernel command line option is specified,
+ * enable debugging messages for NFSROOT.
+ */
+static int __init nfs_root_debug(char *__unused)
+{
+ nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT;
+ return 1;
+}
+
+__setup("nfsrootdebug", nfs_root_debug);
+#endif
+
+/*
+ * Parse NFS server and directory information passed on the kernel
+ * command line.
+ *
+ * nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
+ *
+ * If there is a "%s" token in the <root-dir> string, it is replaced
+ * by the ASCII-representation of the client's IP address.
+ */
+static int __init nfs_root_setup(char *line)
+{
+ ROOT_DEV = Root_NFS;
+
+ if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) {
+ strscpy(nfs_root_parms, line, sizeof(nfs_root_parms));
+ } else {
+ size_t n = strlen(line) + sizeof(NFS_ROOT) - 1;
+ if (n >= sizeof(nfs_root_parms))
+ line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0';
+ sprintf(nfs_root_parms, NFS_ROOT, line);
+ }
+
+ /*
+ * Extract the IP address of the NFS server containing our
+ * root file system, if one was specified.
+ *
+ * Note: root_nfs_parse_addr() removes the server-ip from
+ * nfs_root_parms, if it exists.
+ */
+ root_server_addr = root_nfs_parse_addr(nfs_root_parms);
+
+ return 1;
+}
+
+__setup("nfsroot=", nfs_root_setup);
+
+static int __init root_nfs_copy(char *dest, const char *src,
+ const size_t destlen)
+{
+ if (strscpy(dest, src, destlen) == -E2BIG)
+ return -1;
+ return 0;
+}
+
+static int __init root_nfs_cat(char *dest, const char *src,
+ const size_t destlen)
+{
+ size_t len = strlen(dest);
+
+ if (len && dest[len - 1] != ',')
+ if (strlcat(dest, ",", destlen) > destlen)
+ return -1;
+
+ if (strlcat(dest, src, destlen) > destlen)
+ return -1;
+ return 0;
+}
+
+/*
+ * Parse out root export path and mount options from
+ * passed-in string @incoming.
+ *
+ * Copy the export path into @exppath.
+ */
+static int __init root_nfs_parse_options(char *incoming, char *exppath,
+ const size_t exppathlen)
+{
+ char *p;
+
+ /*
+ * Set the NFS remote path
+ */
+ p = strsep(&incoming, ",");
+ if (*p != '\0' && strcmp(p, "default") != 0)
+ if (root_nfs_copy(exppath, p, exppathlen))
+ return -1;
+
+ /*
+ * @incoming now points to the rest of the string; if it
+ * contains something, append it to our root options buffer
+ */
+ if (incoming != NULL && *incoming != '\0')
+ if (root_nfs_cat(nfs_root_options, incoming,
+ sizeof(nfs_root_options)))
+ return -1;
+ return 0;
+}
+
+/*
+ * Decode the export directory path name and NFS options from
+ * the kernel command line. This has to be done late in order to
+ * use a dynamically acquired client IP address for the remote
+ * root directory path.
+ *
+ * Returns zero if successful; otherwise -1 is returned.
+ */
+static int __init root_nfs_data(char *cmdline)
+{
+ char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
+ int len, retval = -1;
+ char *tmp = NULL;
+ const size_t tmplen = sizeof(nfs_export_path);
+
+ tmp = kzalloc(tmplen, GFP_KERNEL);
+ if (tmp == NULL)
+ goto out_nomem;
+ strcpy(tmp, NFS_ROOT);
+
+ if (root_server_path[0] != '\0') {
+ dprintk("Root-NFS: DHCPv4 option 17: %s\n",
+ root_server_path);
+ if (root_nfs_parse_options(root_server_path, tmp, tmplen))
+ goto out_optionstoolong;
+ }
+
+ if (cmdline[0] != '\0') {
+ dprintk("Root-NFS: nfsroot=%s\n", cmdline);
+ if (root_nfs_parse_options(cmdline, tmp, tmplen))
+ goto out_optionstoolong;
+ }
+
+ /*
+ * Append mandatory options for nfsroot so they override
+ * what has come before
+ */
+ snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4",
+ &servaddr);
+ if (root_nfs_cat(nfs_root_options, mand_options,
+ sizeof(nfs_root_options)))
+ goto out_optionstoolong;
+
+ /*
+ * Set up nfs_root_device. For NFS mounts, this looks like
+ *
+ * server:/path
+ *
+ * At this point, utsname()->nodename contains our local
+ * IP address or hostname, set by ipconfig. If "%s" exists
+ * in tmp, substitute the nodename, then shovel the whole
+ * mess into nfs_root_device.
+ */
+ len = snprintf(nfs_export_path, sizeof(nfs_export_path),
+ tmp, utsname()->nodename);
+ if (len >= (int)sizeof(nfs_export_path))
+ goto out_devnametoolong;
+ len = snprintf(nfs_root_device, sizeof(nfs_root_device),
+ "%pI4:%s", &servaddr, nfs_export_path);
+ if (len >= (int)sizeof(nfs_root_device))
+ goto out_devnametoolong;
+
+ retval = 0;
+
+out:
+ kfree(tmp);
+ return retval;
+out_nomem:
+ printk(KERN_ERR "Root-NFS: could not allocate memory\n");
+ goto out;
+out_optionstoolong:
+ printk(KERN_ERR "Root-NFS: mount options string too long\n");
+ goto out;
+out_devnametoolong:
+ printk(KERN_ERR "Root-NFS: root device name too long.\n");
+ goto out;
+}
+
+/**
+ * nfs_root_data - Return prepared 'data' for NFSROOT mount
+ * @root_device: OUT: address of string containing NFSROOT device
+ * @root_data: OUT: address of string containing NFSROOT mount options
+ *
+ * Returns zero and sets @root_device and @root_data if successful,
+ * otherwise -1 is returned.
+ */
+int __init nfs_root_data(char **root_device, char **root_data)
+{
+ servaddr = root_server_addr;
+ if (servaddr == htonl(INADDR_NONE)) {
+ printk(KERN_ERR "Root-NFS: no NFS server address\n");
+ return -1;
+ }
+
+ if (root_nfs_data(nfs_root_parms) < 0)
+ return -1;
+
+ *root_device = nfs_root_device;
+ *root_data = nfs_root_options;
+ return 0;
+}
diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c
new file mode 100644
index 0000000000..5d1bfccbb4
--- /dev/null
+++ b/fs/nfs/nfstrace.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/nfs_fs.h>
+#include <linux/namei.h>
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+#include "nfstrace.h"
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_status);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_bad_filehandle);
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
new file mode 100644
index 0000000000..4e90ca5311
--- /dev/null
+++ b/fs/nfs/nfstrace.h
@@ -0,0 +1,1738 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nfs
+
+#if !defined(_TRACE_NFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NFS_H
+
+#include <linux/tracepoint.h>
+#include <linux/iversion.h>
+
+#include <trace/misc/fs.h>
+#include <trace/misc/nfs.h>
+#include <trace/misc/sunrpc.h>
+
+#define nfs_show_cache_validity(v) \
+ __print_flags(v, "|", \
+ { NFS_INO_INVALID_DATA, "INVALID_DATA" }, \
+ { NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \
+ { NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \
+ { NFS_INO_INVALID_ACL, "INVALID_ACL" }, \
+ { NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \
+ { NFS_INO_INVALID_LABEL, "INVALID_LABEL" }, \
+ { NFS_INO_INVALID_CHANGE, "INVALID_CHANGE" }, \
+ { NFS_INO_INVALID_CTIME, "INVALID_CTIME" }, \
+ { NFS_INO_INVALID_MTIME, "INVALID_MTIME" }, \
+ { NFS_INO_INVALID_SIZE, "INVALID_SIZE" }, \
+ { NFS_INO_INVALID_OTHER, "INVALID_OTHER" }, \
+ { NFS_INO_DATA_INVAL_DEFER, "DATA_INVAL_DEFER" }, \
+ { NFS_INO_INVALID_BLOCKS, "INVALID_BLOCKS" }, \
+ { NFS_INO_INVALID_XATTR, "INVALID_XATTR" }, \
+ { NFS_INO_INVALID_NLINK, "INVALID_NLINK" }, \
+ { NFS_INO_INVALID_MODE, "INVALID_MODE" })
+
+#define nfs_show_nfsi_flags(v) \
+ __print_flags(v, "|", \
+ { BIT(NFS_INO_STALE), "STALE" }, \
+ { BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \
+ { BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \
+ { BIT(NFS_INO_LAYOUTCOMMIT), "NEED_LAYOUTCOMMIT" }, \
+ { BIT(NFS_INO_LAYOUTCOMMITTING), "LAYOUTCOMMIT" }, \
+ { BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \
+ { BIT(NFS_INO_ODIRECT), "ODIRECT" })
+
+DECLARE_EVENT_CLASS(nfs_inode_event,
+ TP_PROTO(
+ const struct inode *inode
+ ),
+
+ TP_ARGS(inode),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(inode);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu ",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (unsigned long long)__entry->version
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs_inode_event_done,
+ TP_PROTO(
+ const struct inode *inode,
+ int error
+ ),
+
+ TP_ARGS(inode, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(unsigned char, type)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, size)
+ __field(unsigned long, nfsi_flags)
+ __field(unsigned long, cache_validity)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ __entry->error = error < 0 ? -error : 0;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->type = nfs_umode_to_dtype(inode->i_mode);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->size = i_size_read(inode);
+ __entry->nfsi_flags = nfsi->flags;
+ __entry->cache_validity = nfsi->cache_validity;
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "type=%u (%s) version=%llu size=%lld "
+ "cache_validity=0x%lx (%s) nfs_flags=0x%lx (%s)",
+ -__entry->error, show_nfs_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->type,
+ show_fs_dirent_type(__entry->type),
+ (unsigned long long)__entry->version,
+ (long long)__entry->size,
+ __entry->cache_validity,
+ nfs_show_cache_validity(__entry->cache_validity),
+ __entry->nfsi_flags,
+ nfs_show_nfsi_flags(__entry->nfsi_flags)
+ )
+);
+
+#define DEFINE_NFS_INODE_EVENT(name) \
+ DEFINE_EVENT(nfs_inode_event, name, \
+ TP_PROTO( \
+ const struct inode *inode \
+ ), \
+ TP_ARGS(inode))
+#define DEFINE_NFS_INODE_EVENT_DONE(name) \
+ DEFINE_EVENT(nfs_inode_event_done, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ int error \
+ ), \
+ TP_ARGS(inode, error))
+DEFINE_NFS_INODE_EVENT(nfs_set_inode_stale);
+DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit);
+DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_revalidate_inode_exit);
+DEFINE_NFS_INODE_EVENT(nfs_invalidate_mapping_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_invalidate_mapping_exit);
+DEFINE_NFS_INODE_EVENT(nfs_getattr_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_getattr_exit);
+DEFINE_NFS_INODE_EVENT(nfs_setattr_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_setattr_exit);
+DEFINE_NFS_INODE_EVENT(nfs_writeback_inode_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_inode_exit);
+DEFINE_NFS_INODE_EVENT(nfs_fsync_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit);
+DEFINE_NFS_INODE_EVENT(nfs_access_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_set_cache_invalid);
+DEFINE_NFS_INODE_EVENT(nfs_readdir_force_readdirplus);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_cache_fill_done);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_uncached_done);
+
+TRACE_EVENT(nfs_access_exit,
+ TP_PROTO(
+ const struct inode *inode,
+ unsigned int mask,
+ unsigned int permitted,
+ int error
+ ),
+
+ TP_ARGS(inode, mask, permitted, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(unsigned char, type)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, size)
+ __field(unsigned long, nfsi_flags)
+ __field(unsigned long, cache_validity)
+ __field(unsigned int, mask)
+ __field(unsigned int, permitted)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ __entry->error = error < 0 ? -error : 0;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->type = nfs_umode_to_dtype(inode->i_mode);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->size = i_size_read(inode);
+ __entry->nfsi_flags = nfsi->flags;
+ __entry->cache_validity = nfsi->cache_validity;
+ __entry->mask = mask;
+ __entry->permitted = permitted;
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "type=%u (%s) version=%llu size=%lld "
+ "cache_validity=0x%lx (%s) nfs_flags=0x%lx (%s) "
+ "mask=0x%x permitted=0x%x",
+ -__entry->error, show_nfs_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->type,
+ show_fs_dirent_type(__entry->type),
+ (unsigned long long)__entry->version,
+ (long long)__entry->size,
+ __entry->cache_validity,
+ nfs_show_cache_validity(__entry->cache_validity),
+ __entry->nfsi_flags,
+ nfs_show_nfsi_flags(__entry->nfsi_flags),
+ __entry->mask, __entry->permitted
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs_update_size_class,
+ TP_PROTO(
+ const struct inode *inode,
+ loff_t new_size
+ ),
+
+ TP_ARGS(inode, new_size),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, cur_size)
+ __field(loff_t, new_size)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->fileid = nfsi->fileid;
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->cur_size = i_size_read(inode);
+ __entry->new_size = new_size;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu cursize=%lld newsize=%lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->cur_size, __entry->new_size
+ )
+);
+
+#define DEFINE_NFS_UPDATE_SIZE_EVENT(name) \
+ DEFINE_EVENT(nfs_update_size_class, nfs_size_##name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ loff_t new_size \
+ ), \
+ TP_ARGS(inode, new_size))
+
+DEFINE_NFS_UPDATE_SIZE_EVENT(truncate);
+DEFINE_NFS_UPDATE_SIZE_EVENT(wcc);
+DEFINE_NFS_UPDATE_SIZE_EVENT(update);
+DEFINE_NFS_UPDATE_SIZE_EVENT(grow);
+
+DECLARE_EVENT_CLASS(nfs_inode_range_event,
+ TP_PROTO(
+ const struct inode *inode,
+ loff_t range_start,
+ loff_t range_end
+ ),
+
+ TP_ARGS(inode, range_start, range_end),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, range_start)
+ __field(loff_t, range_end)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->fileid = nfsi->fileid;
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->range_start = range_start;
+ __entry->range_end = range_end;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu "
+ "range=[%lld, %lld]",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->range_start, __entry->range_end
+ )
+);
+
+#define DEFINE_NFS_INODE_RANGE_EVENT(name) \
+ DEFINE_EVENT(nfs_inode_range_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ loff_t range_start, \
+ loff_t range_end \
+ ), \
+ TP_ARGS(inode, range_start, range_end))
+
+DEFINE_NFS_INODE_RANGE_EVENT(nfs_readdir_invalidate_cache_range);
+
+DECLARE_EVENT_CLASS(nfs_readdir_event,
+ TP_PROTO(
+ const struct file *file,
+ const __be32 *verifier,
+ u64 cookie,
+ pgoff_t page_index,
+ unsigned int dtsize
+ ),
+
+ TP_ARGS(file, verifier, cookie, page_index, dtsize),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __array(char, verifier, NFS4_VERIFIER_SIZE)
+ __field(u64, cookie)
+ __field(pgoff_t, index)
+ __field(unsigned int, dtsize)
+ ),
+
+ TP_fast_assign(
+ const struct inode *dir = file_inode(file);
+ const struct nfs_inode *nfsi = NFS_I(dir);
+
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(dir);
+ if (cookie != 0)
+ memcpy(__entry->verifier, verifier,
+ NFS4_VERIFIER_SIZE);
+ else
+ memset(__entry->verifier, 0,
+ NFS4_VERIFIER_SIZE);
+ __entry->cookie = cookie;
+ __entry->index = page_index;
+ __entry->dtsize = dtsize;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu "
+ "cookie=%s:0x%llx cache_index=%lu dtsize=%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid, __entry->fhandle,
+ __entry->version, show_nfs4_verifier(__entry->verifier),
+ (unsigned long long)__entry->cookie, __entry->index,
+ __entry->dtsize
+ )
+);
+
+#define DEFINE_NFS_READDIR_EVENT(name) \
+ DEFINE_EVENT(nfs_readdir_event, name, \
+ TP_PROTO( \
+ const struct file *file, \
+ const __be32 *verifier, \
+ u64 cookie, \
+ pgoff_t page_index, \
+ unsigned int dtsize \
+ ), \
+ TP_ARGS(file, verifier, cookie, page_index, dtsize))
+
+DEFINE_NFS_READDIR_EVENT(nfs_readdir_cache_fill);
+DEFINE_NFS_READDIR_EVENT(nfs_readdir_uncached);
+
+DECLARE_EVENT_CLASS(nfs_lookup_event,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry,
+ unsigned int flags
+ ),
+
+ TP_ARGS(dir, dentry, flags),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, flags)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->flags = flags;
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
+ __entry->flags,
+ show_fs_lookup_flags(__entry->flags),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+#define DEFINE_NFS_LOOKUP_EVENT(name) \
+ DEFINE_EVENT(nfs_lookup_event, name, \
+ TP_PROTO( \
+ const struct inode *dir, \
+ const struct dentry *dentry, \
+ unsigned int flags \
+ ), \
+ TP_ARGS(dir, dentry, flags))
+
+DECLARE_EVENT_CLASS(nfs_lookup_event_done,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry,
+ unsigned int flags,
+ int error
+ ),
+
+ TP_ARGS(dir, dentry, flags, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(unsigned long, flags)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->error = error < 0 ? -error : 0;
+ __entry->flags = flags;
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
+ -__entry->error, show_nfs_status(__entry->error),
+ __entry->flags,
+ show_fs_lookup_flags(__entry->flags),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+#define DEFINE_NFS_LOOKUP_EVENT_DONE(name) \
+ DEFINE_EVENT(nfs_lookup_event_done, name, \
+ TP_PROTO( \
+ const struct inode *dir, \
+ const struct dentry *dentry, \
+ unsigned int flags, \
+ int error \
+ ), \
+ TP_ARGS(dir, dentry, flags, error))
+
+DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_enter);
+DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit);
+DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter);
+DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit);
+DEFINE_NFS_LOOKUP_EVENT(nfs_readdir_lookup);
+DEFINE_NFS_LOOKUP_EVENT(nfs_readdir_lookup_revalidate_failed);
+DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_readdir_lookup_revalidate);
+
+TRACE_EVENT(nfs_atomic_open_enter,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct nfs_open_context *ctx,
+ unsigned int flags
+ ),
+
+ TP_ARGS(dir, ctx, flags),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, flags)
+ __field(unsigned long, fmode)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, ctx->dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->flags = flags;
+ __entry->fmode = (__force unsigned long)ctx->mode;
+ __assign_str(name, ctx->dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "flags=0x%lx (%s) fmode=%s name=%02x:%02x:%llu/%s",
+ __entry->flags,
+ show_fs_fcntl_open_flags(__entry->flags),
+ show_fs_fmode_flags(__entry->fmode),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+TRACE_EVENT(nfs_atomic_open_exit,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct nfs_open_context *ctx,
+ unsigned int flags,
+ int error
+ ),
+
+ TP_ARGS(dir, ctx, flags, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(unsigned long, flags)
+ __field(unsigned long, fmode)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, ctx->dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->error = -error;
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->flags = flags;
+ __entry->fmode = (__force unsigned long)ctx->mode;
+ __assign_str(name, ctx->dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) flags=0x%lx (%s) fmode=%s "
+ "name=%02x:%02x:%llu/%s",
+ -__entry->error, show_nfs_status(__entry->error),
+ __entry->flags,
+ show_fs_fcntl_open_flags(__entry->flags),
+ show_fs_fmode_flags(__entry->fmode),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+TRACE_EVENT(nfs_create_enter,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry,
+ unsigned int flags
+ ),
+
+ TP_ARGS(dir, dentry, flags),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, flags)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->flags = flags;
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
+ __entry->flags,
+ show_fs_fcntl_open_flags(__entry->flags),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+TRACE_EVENT(nfs_create_exit,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry,
+ unsigned int flags,
+ int error
+ ),
+
+ TP_ARGS(dir, dentry, flags, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(unsigned long, flags)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->error = -error;
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->flags = flags;
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
+ -__entry->error, show_nfs_status(__entry->error),
+ __entry->flags,
+ show_fs_fcntl_open_flags(__entry->flags),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs_directory_event,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry
+ ),
+
+ TP_ARGS(dir, dentry),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "name=%02x:%02x:%llu/%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+#define DEFINE_NFS_DIRECTORY_EVENT(name) \
+ DEFINE_EVENT(nfs_directory_event, name, \
+ TP_PROTO( \
+ const struct inode *dir, \
+ const struct dentry *dentry \
+ ), \
+ TP_ARGS(dir, dentry))
+
+DECLARE_EVENT_CLASS(nfs_directory_event_done,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry,
+ int error
+ ),
+
+ TP_ARGS(dir, dentry, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->error = error < 0 ? -error : 0;
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) name=%02x:%02x:%llu/%s",
+ -__entry->error, show_nfs_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+#define DEFINE_NFS_DIRECTORY_EVENT_DONE(name) \
+ DEFINE_EVENT(nfs_directory_event_done, name, \
+ TP_PROTO( \
+ const struct inode *dir, \
+ const struct dentry *dentry, \
+ int error \
+ ), \
+ TP_ARGS(dir, dentry, error))
+
+DEFINE_NFS_DIRECTORY_EVENT(nfs_mknod_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mknod_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_mkdir_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mkdir_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_rmdir_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_rmdir_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_remove_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_remove_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_unlink_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_unlink_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_symlink_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_symlink_exit);
+
+TRACE_EVENT(nfs_link_enter,
+ TP_PROTO(
+ const struct inode *inode,
+ const struct inode *dir,
+ const struct dentry *dentry
+ ),
+
+ TP_ARGS(inode, dir, dentry),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dir = NFS_FILEID(dir);
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->fileid,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+TRACE_EVENT(nfs_link_exit,
+ TP_PROTO(
+ const struct inode *inode,
+ const struct inode *dir,
+ const struct dentry *dentry,
+ int error
+ ),
+
+ TP_ARGS(inode, dir, dentry, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dir = NFS_FILEID(dir);
+ __entry->error = error < 0 ? -error : 0;
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s",
+ -__entry->error, show_nfs_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->fileid,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs_rename_event,
+ TP_PROTO(
+ const struct inode *old_dir,
+ const struct dentry *old_dentry,
+ const struct inode *new_dir,
+ const struct dentry *new_dentry
+ ),
+
+ TP_ARGS(old_dir, old_dentry, new_dir, new_dentry),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, old_dir)
+ __field(u64, new_dir)
+ __string(old_name, old_dentry->d_name.name)
+ __string(new_name, new_dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = old_dir->i_sb->s_dev;
+ __entry->old_dir = NFS_FILEID(old_dir);
+ __entry->new_dir = NFS_FILEID(new_dir);
+ __assign_str(old_name, old_dentry->d_name.name);
+ __assign_str(new_name, new_dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "old_name=%02x:%02x:%llu/%s new_name=%02x:%02x:%llu/%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->old_dir,
+ __get_str(old_name),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->new_dir,
+ __get_str(new_name)
+ )
+);
+#define DEFINE_NFS_RENAME_EVENT(name) \
+ DEFINE_EVENT(nfs_rename_event, name, \
+ TP_PROTO( \
+ const struct inode *old_dir, \
+ const struct dentry *old_dentry, \
+ const struct inode *new_dir, \
+ const struct dentry *new_dentry \
+ ), \
+ TP_ARGS(old_dir, old_dentry, new_dir, new_dentry))
+
+DECLARE_EVENT_CLASS(nfs_rename_event_done,
+ TP_PROTO(
+ const struct inode *old_dir,
+ const struct dentry *old_dentry,
+ const struct inode *new_dir,
+ const struct dentry *new_dentry,
+ int error
+ ),
+
+ TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, error)
+ __field(u64, old_dir)
+ __string(old_name, old_dentry->d_name.name)
+ __field(u64, new_dir)
+ __string(new_name, new_dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = old_dir->i_sb->s_dev;
+ __entry->error = -error;
+ __entry->old_dir = NFS_FILEID(old_dir);
+ __entry->new_dir = NFS_FILEID(new_dir);
+ __assign_str(old_name, old_dentry->d_name.name);
+ __assign_str(new_name, new_dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) old_name=%02x:%02x:%llu/%s "
+ "new_name=%02x:%02x:%llu/%s",
+ -__entry->error, show_nfs_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->old_dir,
+ __get_str(old_name),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->new_dir,
+ __get_str(new_name)
+ )
+);
+#define DEFINE_NFS_RENAME_EVENT_DONE(name) \
+ DEFINE_EVENT(nfs_rename_event_done, name, \
+ TP_PROTO( \
+ const struct inode *old_dir, \
+ const struct dentry *old_dentry, \
+ const struct inode *new_dir, \
+ const struct dentry *new_dentry, \
+ int error \
+ ), \
+ TP_ARGS(old_dir, old_dentry, new_dir, \
+ new_dentry, error))
+
+DEFINE_NFS_RENAME_EVENT(nfs_rename_enter);
+DEFINE_NFS_RENAME_EVENT_DONE(nfs_rename_exit);
+
+DEFINE_NFS_RENAME_EVENT_DONE(nfs_sillyrename_rename);
+
+TRACE_EVENT(nfs_sillyrename_unlink,
+ TP_PROTO(
+ const struct nfs_unlinkdata *data,
+ int error
+ ),
+
+ TP_ARGS(data, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, error)
+ __field(u64, dir)
+ __dynamic_array(char, name, data->args.name.len + 1)
+ ),
+
+ TP_fast_assign(
+ struct inode *dir = d_inode(data->dentry->d_parent);
+ size_t len = data->args.name.len;
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->error = -error;
+ memcpy(__get_str(name),
+ data->args.name.name, len);
+ __get_str(name)[len] = 0;
+ ),
+
+ TP_printk(
+ "error=%ld (%s) name=%02x:%02x:%llu/%s",
+ -__entry->error, show_nfs_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs_folio_event,
+ TP_PROTO(
+ const struct inode *inode,
+ struct folio *folio
+ ),
+
+ TP_ARGS(inode, folio),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, offset)
+ __field(u32, count)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->offset = folio_file_pos(folio);
+ __entry->count = nfs_folio_length(folio);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu "
+ "offset=%lld count=%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->offset, __entry->count
+ )
+);
+
+#define DEFINE_NFS_FOLIO_EVENT(name) \
+ DEFINE_EVENT(nfs_folio_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ struct folio *folio \
+ ), \
+ TP_ARGS(inode, folio))
+
+DECLARE_EVENT_CLASS(nfs_folio_event_done,
+ TP_PROTO(
+ const struct inode *inode,
+ struct folio *folio,
+ int ret
+ ),
+
+ TP_ARGS(inode, folio, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(int, ret)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, offset)
+ __field(u32, count)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->offset = folio_file_pos(folio);
+ __entry->count = nfs_folio_length(folio);
+ __entry->ret = ret;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu "
+ "offset=%lld count=%u ret=%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->offset, __entry->count, __entry->ret
+ )
+);
+
+#define DEFINE_NFS_FOLIO_EVENT_DONE(name) \
+ DEFINE_EVENT(nfs_folio_event_done, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ struct folio *folio, \
+ int ret \
+ ), \
+ TP_ARGS(inode, folio, ret))
+
+DEFINE_NFS_FOLIO_EVENT(nfs_aop_readpage);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_aop_readpage_done);
+
+DEFINE_NFS_FOLIO_EVENT(nfs_writeback_folio);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_done);
+
+DEFINE_NFS_FOLIO_EVENT(nfs_invalidate_folio);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_launder_folio_done);
+
+TRACE_EVENT(nfs_aop_readahead,
+ TP_PROTO(
+ const struct inode *inode,
+ loff_t pos,
+ unsigned int nr_pages
+ ),
+
+ TP_ARGS(inode, pos, nr_pages),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, offset)
+ __field(unsigned int, nr_pages)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->offset = pos;
+ __entry->nr_pages = nr_pages;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld nr_pages=%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->offset, __entry->nr_pages
+ )
+);
+
+TRACE_EVENT(nfs_aop_readahead_done,
+ TP_PROTO(
+ const struct inode *inode,
+ unsigned int nr_pages,
+ int ret
+ ),
+
+ TP_ARGS(inode, nr_pages, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(int, ret)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, offset)
+ __field(unsigned int, nr_pages)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->nr_pages = nr_pages;
+ __entry->ret = ret;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu nr_pages=%u ret=%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->version,
+ __entry->nr_pages, __entry->ret
+ )
+);
+
+TRACE_EVENT(nfs_initiate_read,
+ TP_PROTO(
+ const struct nfs_pgio_header *hdr
+ ),
+
+ TP_ARGS(hdr),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, count)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh;
+
+ __entry->offset = hdr->args.offset;
+ __entry->count = hdr->args.count;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset, __entry->count
+ )
+);
+
+TRACE_EVENT(nfs_readpage_done,
+ TP_PROTO(
+ const struct rpc_task *task,
+ const struct nfs_pgio_header *hdr
+ ),
+
+ TP_ARGS(task, hdr),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, arg_count)
+ __field(u32, res_count)
+ __field(bool, eof)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh;
+
+ __entry->error = task->tk_status;
+ __entry->offset = hdr->args.offset;
+ __entry->arg_count = hdr->args.count;
+ __entry->res_count = hdr->res.count;
+ __entry->eof = hdr->res.eof;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u%s", __entry->error,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset, __entry->arg_count,
+ __entry->res_count, __entry->eof ? " eof" : ""
+ )
+);
+
+TRACE_EVENT(nfs_readpage_short,
+ TP_PROTO(
+ const struct rpc_task *task,
+ const struct nfs_pgio_header *hdr
+ ),
+
+ TP_ARGS(task, hdr),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, arg_count)
+ __field(u32, res_count)
+ __field(bool, eof)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh;
+
+ __entry->error = task->tk_status;
+ __entry->offset = hdr->args.offset;
+ __entry->arg_count = hdr->args.count;
+ __entry->res_count = hdr->res.count;
+ __entry->eof = hdr->res.eof;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u%s", __entry->error,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset, __entry->arg_count,
+ __entry->res_count, __entry->eof ? " eof" : ""
+ )
+);
+
+
+TRACE_EVENT(nfs_pgio_error,
+ TP_PROTO(
+ const struct nfs_pgio_header *hdr,
+ int error,
+ loff_t pos
+ ),
+
+ TP_ARGS(hdr, error, pos),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, arg_count)
+ __field(u32, res_count)
+ __field(loff_t, pos)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh;
+
+ __entry->error = error;
+ __entry->offset = hdr->args.offset;
+ __entry->arg_count = hdr->args.count;
+ __entry->res_count = hdr->res.count;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk("error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u pos=%llu", __entry->error,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid, __entry->fhandle,
+ (long long)__entry->offset, __entry->arg_count, __entry->res_count,
+ __entry->pos
+ )
+);
+
+TRACE_EVENT(nfs_initiate_write,
+ TP_PROTO(
+ const struct nfs_pgio_header *hdr
+ ),
+
+ TP_ARGS(hdr),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, count)
+ __field(unsigned long, stable)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh;
+
+ __entry->offset = hdr->args.offset;
+ __entry->count = hdr->args.count;
+ __entry->stable = hdr->args.stable;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u stable=%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset, __entry->count,
+ show_nfs_stable_how(__entry->stable)
+ )
+);
+
+TRACE_EVENT(nfs_writeback_done,
+ TP_PROTO(
+ const struct rpc_task *task,
+ const struct nfs_pgio_header *hdr
+ ),
+
+ TP_ARGS(task, hdr),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, arg_count)
+ __field(u32, res_count)
+ __field(int, error)
+ __field(unsigned long, stable)
+ __array(char, verifier, NFS4_VERIFIER_SIZE)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh;
+ const struct nfs_writeverf *verf = hdr->res.verf;
+
+ __entry->error = task->tk_status;
+ __entry->offset = hdr->args.offset;
+ __entry->arg_count = hdr->args.count;
+ __entry->res_count = hdr->res.count;
+ __entry->stable = verf->committed;
+ memcpy(__entry->verifier,
+ &verf->verifier,
+ NFS4_VERIFIER_SIZE);
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u stable=%s "
+ "verifier=%s", __entry->error,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset, __entry->arg_count,
+ __entry->res_count,
+ show_nfs_stable_how(__entry->stable),
+ show_nfs4_verifier(__entry->verifier)
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs_page_error_class,
+ TP_PROTO(
+ const struct inode *inode,
+ const struct nfs_page *req,
+ int error
+ ),
+
+ TP_ARGS(inode, req, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(unsigned int, count)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->offset = req_offset(req);
+ __entry->count = req->wb_bytes;
+ __entry->error = error;
+ ),
+
+ TP_printk(
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u", __entry->error,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->offset,
+ __entry->count
+ )
+);
+
+#define DEFINE_NFS_PAGEERR_EVENT(name) \
+ DEFINE_EVENT(nfs_page_error_class, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ const struct nfs_page *req, \
+ int error \
+ ), \
+ TP_ARGS(inode, req, error))
+
+DEFINE_NFS_PAGEERR_EVENT(nfs_write_error);
+DEFINE_NFS_PAGEERR_EVENT(nfs_comp_error);
+DEFINE_NFS_PAGEERR_EVENT(nfs_commit_error);
+
+TRACE_EVENT(nfs_initiate_commit,
+ TP_PROTO(
+ const struct nfs_commit_data *data
+ ),
+
+ TP_ARGS(data),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, count)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = data->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = data->args.fh ?
+ data->args.fh : &nfsi->fh;
+
+ __entry->offset = data->args.offset;
+ __entry->count = data->args.count;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset, __entry->count
+ )
+);
+
+TRACE_EVENT(nfs_commit_done,
+ TP_PROTO(
+ const struct rpc_task *task,
+ const struct nfs_commit_data *data
+ ),
+
+ TP_ARGS(task, data),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(int, error)
+ __field(unsigned long, stable)
+ __array(char, verifier, NFS4_VERIFIER_SIZE)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = data->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = data->args.fh ?
+ data->args.fh : &nfsi->fh;
+ const struct nfs_writeverf *verf = data->res.verf;
+
+ __entry->error = task->tk_status;
+ __entry->offset = data->args.offset;
+ __entry->stable = verf->committed;
+ memcpy(__entry->verifier,
+ &verf->verifier,
+ NFS4_VERIFIER_SIZE);
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld stable=%s verifier=%s", __entry->error,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset,
+ show_nfs_stable_how(__entry->stable),
+ show_nfs4_verifier(__entry->verifier)
+ )
+);
+
+#define nfs_show_direct_req_flags(v) \
+ __print_flags(v, "|", \
+ { NFS_ODIRECT_DO_COMMIT, "DO_COMMIT" }, \
+ { NFS_ODIRECT_RESCHED_WRITES, "RESCHED_WRITES" }, \
+ { NFS_ODIRECT_SHOULD_DIRTY, "SHOULD DIRTY" }, \
+ { NFS_ODIRECT_DONE, "DONE" } )
+
+DECLARE_EVENT_CLASS(nfs_direct_req_class,
+ TP_PROTO(
+ const struct nfs_direct_req *dreq
+ ),
+
+ TP_ARGS(dreq),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u32, fhandle)
+ __field(loff_t, offset)
+ __field(ssize_t, count)
+ __field(ssize_t, bytes_left)
+ __field(ssize_t, error)
+ __field(int, flags)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = dreq->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = &nfsi->fh;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->offset = dreq->io_start;
+ __entry->count = dreq->count;
+ __entry->bytes_left = dreq->bytes_left;
+ __entry->error = dreq->error;
+ __entry->flags = dreq->flags;
+ ),
+
+ TP_printk(
+ "error=%zd fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%zd bytes_left=%zd flags=%s",
+ __entry->error, MAJOR(__entry->dev),
+ MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->offset,
+ __entry->count, __entry->bytes_left,
+ nfs_show_direct_req_flags(__entry->flags)
+ )
+);
+
+#define DEFINE_NFS_DIRECT_REQ_EVENT(name) \
+ DEFINE_EVENT(nfs_direct_req_class, name, \
+ TP_PROTO( \
+ const struct nfs_direct_req *dreq \
+ ), \
+ TP_ARGS(dreq))
+
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_commit_complete);
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_resched_write);
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_complete);
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_completion);
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_schedule_iovec);
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_reschedule_io);
+
+TRACE_EVENT(nfs_fh_to_dentry,
+ TP_PROTO(
+ const struct super_block *sb,
+ const struct nfs_fh *fh,
+ u64 fileid,
+ int error
+ ),
+
+ TP_ARGS(sb, fh, fileid, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error;
+ __entry->dev = sb->s_dev;
+ __entry->fileid = fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x ",
+ __entry->error,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle
+ )
+);
+
+TRACE_EVENT(nfs_mount_assign,
+ TP_PROTO(
+ const char *option,
+ const char *value
+ ),
+
+ TP_ARGS(option, value),
+
+ TP_STRUCT__entry(
+ __string(option, option)
+ __string(value, value)
+ ),
+
+ TP_fast_assign(
+ __assign_str(option, option);
+ __assign_str(value, value);
+ ),
+
+ TP_printk("option %s=%s",
+ __get_str(option), __get_str(value)
+ )
+);
+
+TRACE_EVENT(nfs_mount_option,
+ TP_PROTO(
+ const struct fs_parameter *param
+ ),
+
+ TP_ARGS(param),
+
+ TP_STRUCT__entry(
+ __string(option, param->key)
+ ),
+
+ TP_fast_assign(
+ __assign_str(option, param->key);
+ ),
+
+ TP_printk("option %s", __get_str(option))
+);
+
+TRACE_EVENT(nfs_mount_path,
+ TP_PROTO(
+ const char *path
+ ),
+
+ TP_ARGS(path),
+
+ TP_STRUCT__entry(
+ __string(path, path)
+ ),
+
+ TP_fast_assign(
+ __assign_str(path, path);
+ ),
+
+ TP_printk("path='%s'", __get_str(path))
+);
+
+DECLARE_EVENT_CLASS(nfs_xdr_event,
+ TP_PROTO(
+ const struct xdr_stream *xdr,
+ int error
+ ),
+
+ TP_ARGS(xdr, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, task_id)
+ __field(unsigned int, client_id)
+ __field(u32, xid)
+ __field(int, version)
+ __field(unsigned long, error)
+ __string(program,
+ xdr->rqst->rq_task->tk_client->cl_program->name)
+ __string(procedure,
+ xdr->rqst->rq_task->tk_msg.rpc_proc->p_name)
+ ),
+
+ TP_fast_assign(
+ const struct rpc_rqst *rqstp = xdr->rqst;
+ const struct rpc_task *task = rqstp->rq_task;
+
+ __entry->task_id = task->tk_pid;
+ __entry->client_id = task->tk_client->cl_clid;
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->version = task->tk_client->cl_vers;
+ __entry->error = error;
+ __assign_str(program,
+ task->tk_client->cl_program->name);
+ __assign_str(procedure, task->tk_msg.rpc_proc->p_name);
+ ),
+
+ TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
+ " xid=0x%08x %sv%d %s error=%ld (%s)",
+ __entry->task_id, __entry->client_id, __entry->xid,
+ __get_str(program), __entry->version,
+ __get_str(procedure), -__entry->error,
+ show_nfs_status(__entry->error)
+ )
+);
+#define DEFINE_NFS_XDR_EVENT(name) \
+ DEFINE_EVENT(nfs_xdr_event, name, \
+ TP_PROTO( \
+ const struct xdr_stream *xdr, \
+ int error \
+ ), \
+ TP_ARGS(xdr, error))
+DEFINE_NFS_XDR_EVENT(nfs_xdr_status);
+DEFINE_NFS_XDR_EVENT(nfs_xdr_bad_filehandle);
+
+#endif /* _TRACE_NFS_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE nfstrace
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
new file mode 100644
index 0000000000..6efb5068c1
--- /dev/null
+++ b/fs/nfs/pagelist.c
@@ -0,0 +1,1588 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/pagelist.c
+ *
+ * A set of helper functions for managing NFS read and write requests.
+ * The main purpose of these routines is to provide support for the
+ * coalescing of several requests into a single RPC call.
+ *
+ * Copyright 2000, 2001 (c) Trond Myklebust <trond.myklebust@fys.uio.no>
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/sched.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/nfs_mount.h>
+#include <linux/export.h>
+#include <linux/filelock.h>
+
+#include "internal.h"
+#include "pnfs.h"
+#include "nfstrace.h"
+#include "fscache.h"
+
+#define NFSDBG_FACILITY NFSDBG_PAGECACHE
+
+static struct kmem_cache *nfs_page_cachep;
+static const struct rpc_call_ops nfs_pgio_common_ops;
+
+struct nfs_page_iter_page {
+ const struct nfs_page *req;
+ size_t count;
+};
+
+static void nfs_page_iter_page_init(struct nfs_page_iter_page *i,
+ const struct nfs_page *req)
+{
+ i->req = req;
+ i->count = 0;
+}
+
+static void nfs_page_iter_page_advance(struct nfs_page_iter_page *i, size_t sz)
+{
+ const struct nfs_page *req = i->req;
+ size_t tmp = i->count + sz;
+
+ i->count = (tmp < req->wb_bytes) ? tmp : req->wb_bytes;
+}
+
+static struct page *nfs_page_iter_page_get(struct nfs_page_iter_page *i)
+{
+ const struct nfs_page *req = i->req;
+ struct page *page;
+
+ if (i->count != req->wb_bytes) {
+ size_t base = i->count + req->wb_pgbase;
+ size_t len = PAGE_SIZE - offset_in_page(base);
+
+ page = nfs_page_to_page(req, base);
+ nfs_page_iter_page_advance(i, len);
+ return page;
+ }
+ return NULL;
+}
+
+static struct nfs_pgio_mirror *
+nfs_pgio_get_mirror(struct nfs_pageio_descriptor *desc, u32 idx)
+{
+ if (desc->pg_ops->pg_get_mirror)
+ return desc->pg_ops->pg_get_mirror(desc, idx);
+ return &desc->pg_mirrors[0];
+}
+
+struct nfs_pgio_mirror *
+nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc)
+{
+ return nfs_pgio_get_mirror(desc, desc->pg_mirror_idx);
+}
+EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror);
+
+static u32
+nfs_pgio_set_current_mirror(struct nfs_pageio_descriptor *desc, u32 idx)
+{
+ if (desc->pg_ops->pg_set_mirror)
+ return desc->pg_ops->pg_set_mirror(desc, idx);
+ return desc->pg_mirror_idx;
+}
+
+void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr,
+ void (*release)(struct nfs_pgio_header *hdr))
+{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+
+ hdr->req = nfs_list_entry(mirror->pg_list.next);
+ hdr->inode = desc->pg_inode;
+ hdr->cred = nfs_req_openctx(hdr->req)->cred;
+ hdr->io_start = req_offset(hdr->req);
+ hdr->good_bytes = mirror->pg_count;
+ hdr->io_completion = desc->pg_io_completion;
+ hdr->dreq = desc->pg_dreq;
+ nfs_netfs_set_pgio_header(hdr, desc);
+ hdr->release = release;
+ hdr->completion_ops = desc->pg_completion_ops;
+ if (hdr->completion_ops->init_hdr)
+ hdr->completion_ops->init_hdr(hdr);
+
+ hdr->pgio_mirror_idx = desc->pg_mirror_idx;
+}
+EXPORT_SYMBOL_GPL(nfs_pgheader_init);
+
+void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
+{
+ unsigned int new = pos - hdr->io_start;
+
+ trace_nfs_pgio_error(hdr, error, pos);
+ if (hdr->good_bytes > new) {
+ hdr->good_bytes = new;
+ clear_bit(NFS_IOHDR_EOF, &hdr->flags);
+ if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags))
+ hdr->error = error;
+ }
+}
+
+static inline struct nfs_page *nfs_page_alloc(void)
+{
+ struct nfs_page *p =
+ kmem_cache_zalloc(nfs_page_cachep, nfs_io_gfp_mask());
+ if (p)
+ INIT_LIST_HEAD(&p->wb_list);
+ return p;
+}
+
+static inline void
+nfs_page_free(struct nfs_page *p)
+{
+ kmem_cache_free(nfs_page_cachep, p);
+}
+
+/**
+ * nfs_iocounter_wait - wait for i/o to complete
+ * @l_ctx: nfs_lock_context with io_counter to use
+ *
+ * returns -ERESTARTSYS if interrupted by a fatal signal.
+ * Otherwise returns 0 once the io_count hits 0.
+ */
+int
+nfs_iocounter_wait(struct nfs_lock_context *l_ctx)
+{
+ return wait_var_event_killable(&l_ctx->io_count,
+ !atomic_read(&l_ctx->io_count));
+}
+
+/**
+ * nfs_async_iocounter_wait - wait on a rpc_waitqueue for I/O
+ * to complete
+ * @task: the rpc_task that should wait
+ * @l_ctx: nfs_lock_context with io_counter to check
+ *
+ * Returns true if there is outstanding I/O to wait on and the
+ * task has been put to sleep.
+ */
+bool
+nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx)
+{
+ struct inode *inode = d_inode(l_ctx->open_context->dentry);
+ bool ret = false;
+
+ if (atomic_read(&l_ctx->io_count) > 0) {
+ rpc_sleep_on(&NFS_SERVER(inode)->uoc_rpcwaitq, task, NULL);
+ ret = true;
+ }
+
+ if (atomic_read(&l_ctx->io_count) == 0) {
+ rpc_wake_up_queued_task(&NFS_SERVER(inode)->uoc_rpcwaitq, task);
+ ret = false;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait);
+
+/*
+ * nfs_page_lock_head_request - page lock the head of the page group
+ * @req: any member of the page group
+ */
+struct nfs_page *
+nfs_page_group_lock_head(struct nfs_page *req)
+{
+ struct nfs_page *head = req->wb_head;
+
+ while (!nfs_lock_request(head)) {
+ int ret = nfs_wait_on_request(head);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ }
+ if (head != req)
+ kref_get(&head->wb_kref);
+ return head;
+}
+
+/*
+ * nfs_unroll_locks - unlock all newly locked reqs and wait on @req
+ * @head: head request of page group, must be holding head lock
+ * @req: request that couldn't lock and needs to wait on the req bit lock
+ *
+ * This is a helper function for nfs_lock_and_join_requests
+ * returns 0 on success, < 0 on error.
+ */
+static void
+nfs_unroll_locks(struct nfs_page *head, struct nfs_page *req)
+{
+ struct nfs_page *tmp;
+
+ /* relinquish all the locks successfully grabbed this run */
+ for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) {
+ if (!kref_read(&tmp->wb_kref))
+ continue;
+ nfs_unlock_and_release_request(tmp);
+ }
+}
+
+/*
+ * nfs_page_group_lock_subreq - try to lock a subrequest
+ * @head: head request of page group
+ * @subreq: request to lock
+ *
+ * This is a helper function for nfs_lock_and_join_requests which
+ * must be called with the head request and page group both locked.
+ * On error, it returns with the page group unlocked.
+ */
+static int
+nfs_page_group_lock_subreq(struct nfs_page *head, struct nfs_page *subreq)
+{
+ int ret;
+
+ if (!kref_get_unless_zero(&subreq->wb_kref))
+ return 0;
+ while (!nfs_lock_request(subreq)) {
+ nfs_page_group_unlock(head);
+ ret = nfs_wait_on_request(subreq);
+ if (!ret)
+ ret = nfs_page_group_lock(head);
+ if (ret < 0) {
+ nfs_unroll_locks(head, subreq);
+ nfs_release_request(subreq);
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/*
+ * nfs_page_group_lock_subrequests - try to lock the subrequests
+ * @head: head request of page group
+ *
+ * This is a helper function for nfs_lock_and_join_requests which
+ * must be called with the head request locked.
+ */
+int nfs_page_group_lock_subrequests(struct nfs_page *head)
+{
+ struct nfs_page *subreq;
+ int ret;
+
+ ret = nfs_page_group_lock(head);
+ if (ret < 0)
+ return ret;
+ /* lock each request in the page group */
+ for (subreq = head->wb_this_page; subreq != head;
+ subreq = subreq->wb_this_page) {
+ ret = nfs_page_group_lock_subreq(head, subreq);
+ if (ret < 0)
+ return ret;
+ }
+ nfs_page_group_unlock(head);
+ return 0;
+}
+
+/*
+ * nfs_page_set_headlock - set the request PG_HEADLOCK
+ * @req: request that is to be locked
+ *
+ * this lock must be held when modifying req->wb_head
+ *
+ * return 0 on success, < 0 on error
+ */
+int
+nfs_page_set_headlock(struct nfs_page *req)
+{
+ if (!test_and_set_bit(PG_HEADLOCK, &req->wb_flags))
+ return 0;
+
+ set_bit(PG_CONTENDED1, &req->wb_flags);
+ smp_mb__after_atomic();
+ return wait_on_bit_lock(&req->wb_flags, PG_HEADLOCK,
+ TASK_UNINTERRUPTIBLE);
+}
+
+/*
+ * nfs_page_clear_headlock - clear the request PG_HEADLOCK
+ * @req: request that is to be locked
+ */
+void
+nfs_page_clear_headlock(struct nfs_page *req)
+{
+ clear_bit_unlock(PG_HEADLOCK, &req->wb_flags);
+ smp_mb__after_atomic();
+ if (!test_bit(PG_CONTENDED1, &req->wb_flags))
+ return;
+ wake_up_bit(&req->wb_flags, PG_HEADLOCK);
+}
+
+/*
+ * nfs_page_group_lock - lock the head of the page group
+ * @req: request in group that is to be locked
+ *
+ * this lock must be held when traversing or modifying the page
+ * group list
+ *
+ * return 0 on success, < 0 on error
+ */
+int
+nfs_page_group_lock(struct nfs_page *req)
+{
+ int ret;
+
+ ret = nfs_page_set_headlock(req);
+ if (ret || req->wb_head == req)
+ return ret;
+ return nfs_page_set_headlock(req->wb_head);
+}
+
+/*
+ * nfs_page_group_unlock - unlock the head of the page group
+ * @req: request in group that is to be unlocked
+ */
+void
+nfs_page_group_unlock(struct nfs_page *req)
+{
+ if (req != req->wb_head)
+ nfs_page_clear_headlock(req->wb_head);
+ nfs_page_clear_headlock(req);
+}
+
+/*
+ * nfs_page_group_sync_on_bit_locked
+ *
+ * must be called with page group lock held
+ */
+static bool
+nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
+{
+ struct nfs_page *head = req->wb_head;
+ struct nfs_page *tmp;
+
+ WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
+ WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
+
+ tmp = req->wb_this_page;
+ while (tmp != req) {
+ if (!test_bit(bit, &tmp->wb_flags))
+ return false;
+ tmp = tmp->wb_this_page;
+ }
+
+ /* true! reset all bits */
+ tmp = req;
+ do {
+ clear_bit(bit, &tmp->wb_flags);
+ tmp = tmp->wb_this_page;
+ } while (tmp != req);
+
+ return true;
+}
+
+/*
+ * nfs_page_group_sync_on_bit - set bit on current request, but only
+ * return true if the bit is set for all requests in page group
+ * @req - request in page group
+ * @bit - PG_* bit that is used to sync page group
+ */
+bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
+{
+ bool ret;
+
+ nfs_page_group_lock(req);
+ ret = nfs_page_group_sync_on_bit_locked(req, bit);
+ nfs_page_group_unlock(req);
+
+ return ret;
+}
+
+/*
+ * nfs_page_group_init - Initialize the page group linkage for @req
+ * @req - a new nfs request
+ * @prev - the previous request in page group, or NULL if @req is the first
+ * or only request in the group (the head).
+ */
+static inline void
+nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
+{
+ struct inode *inode;
+ WARN_ON_ONCE(prev == req);
+
+ if (!prev) {
+ /* a head request */
+ req->wb_head = req;
+ req->wb_this_page = req;
+ } else {
+ /* a subrequest */
+ WARN_ON_ONCE(prev->wb_this_page != prev->wb_head);
+ WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags));
+ req->wb_head = prev->wb_head;
+ req->wb_this_page = prev->wb_this_page;
+ prev->wb_this_page = req;
+
+ /* All subrequests take a ref on the head request until
+ * nfs_page_group_destroy is called */
+ kref_get(&req->wb_head->wb_kref);
+
+ /* grab extra ref and bump the request count if head request
+ * has extra ref from the write/commit path to handle handoff
+ * between write and commit lists. */
+ if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) {
+ inode = nfs_page_to_inode(req);
+ set_bit(PG_INODE_REF, &req->wb_flags);
+ kref_get(&req->wb_kref);
+ atomic_long_inc(&NFS_I(inode)->nrequests);
+ }
+ }
+}
+
+/*
+ * nfs_page_group_destroy - sync the destruction of page groups
+ * @req - request that no longer needs the page group
+ *
+ * releases the page group reference from each member once all
+ * members have called this function.
+ */
+static void
+nfs_page_group_destroy(struct kref *kref)
+{
+ struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
+ struct nfs_page *head = req->wb_head;
+ struct nfs_page *tmp, *next;
+
+ if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
+ goto out;
+
+ tmp = req;
+ do {
+ next = tmp->wb_this_page;
+ /* unlink and free */
+ tmp->wb_this_page = tmp;
+ tmp->wb_head = tmp;
+ nfs_free_request(tmp);
+ tmp = next;
+ } while (tmp != req);
+out:
+ /* subrequests must release the ref on the head request */
+ if (head != req)
+ nfs_release_request(head);
+}
+
+static struct nfs_page *nfs_page_create(struct nfs_lock_context *l_ctx,
+ unsigned int pgbase, pgoff_t index,
+ unsigned int offset, unsigned int count)
+{
+ struct nfs_page *req;
+ struct nfs_open_context *ctx = l_ctx->open_context;
+
+ if (test_bit(NFS_CONTEXT_BAD, &ctx->flags))
+ return ERR_PTR(-EBADF);
+ /* try to allocate the request struct */
+ req = nfs_page_alloc();
+ if (req == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ req->wb_lock_context = l_ctx;
+ refcount_inc(&l_ctx->count);
+ atomic_inc(&l_ctx->io_count);
+
+ /* Initialize the request struct. Initially, we assume a
+ * long write-back delay. This will be adjusted in
+ * update_nfs_request below if the region is not locked. */
+ req->wb_pgbase = pgbase;
+ req->wb_index = index;
+ req->wb_offset = offset;
+ req->wb_bytes = count;
+ kref_init(&req->wb_kref);
+ req->wb_nio = 0;
+ return req;
+}
+
+static void nfs_page_assign_folio(struct nfs_page *req, struct folio *folio)
+{
+ if (folio != NULL) {
+ req->wb_folio = folio;
+ folio_get(folio);
+ set_bit(PG_FOLIO, &req->wb_flags);
+ }
+}
+
+static void nfs_page_assign_page(struct nfs_page *req, struct page *page)
+{
+ if (page != NULL) {
+ req->wb_page = page;
+ get_page(page);
+ }
+}
+
+/**
+ * nfs_page_create_from_page - Create an NFS read/write request.
+ * @ctx: open context to use
+ * @page: page to write
+ * @pgbase: starting offset within the page for the write
+ * @offset: file offset for the write
+ * @count: number of bytes to read/write
+ *
+ * The page must be locked by the caller. This makes sure we never
+ * create two different requests for the same page.
+ * User should ensure it is safe to sleep in this function.
+ */
+struct nfs_page *nfs_page_create_from_page(struct nfs_open_context *ctx,
+ struct page *page,
+ unsigned int pgbase, loff_t offset,
+ unsigned int count)
+{
+ struct nfs_lock_context *l_ctx = nfs_get_lock_context(ctx);
+ struct nfs_page *ret;
+
+ if (IS_ERR(l_ctx))
+ return ERR_CAST(l_ctx);
+ ret = nfs_page_create(l_ctx, pgbase, offset >> PAGE_SHIFT,
+ offset_in_page(offset), count);
+ if (!IS_ERR(ret)) {
+ nfs_page_assign_page(ret, page);
+ nfs_page_group_init(ret, NULL);
+ }
+ nfs_put_lock_context(l_ctx);
+ return ret;
+}
+
+/**
+ * nfs_page_create_from_folio - Create an NFS read/write request.
+ * @ctx: open context to use
+ * @folio: folio to write
+ * @offset: starting offset within the folio for the write
+ * @count: number of bytes to read/write
+ *
+ * The page must be locked by the caller. This makes sure we never
+ * create two different requests for the same page.
+ * User should ensure it is safe to sleep in this function.
+ */
+struct nfs_page *nfs_page_create_from_folio(struct nfs_open_context *ctx,
+ struct folio *folio,
+ unsigned int offset,
+ unsigned int count)
+{
+ struct nfs_lock_context *l_ctx = nfs_get_lock_context(ctx);
+ struct nfs_page *ret;
+
+ if (IS_ERR(l_ctx))
+ return ERR_CAST(l_ctx);
+ ret = nfs_page_create(l_ctx, offset, folio_index(folio), offset, count);
+ if (!IS_ERR(ret)) {
+ nfs_page_assign_folio(ret, folio);
+ nfs_page_group_init(ret, NULL);
+ }
+ nfs_put_lock_context(l_ctx);
+ return ret;
+}
+
+static struct nfs_page *
+nfs_create_subreq(struct nfs_page *req,
+ unsigned int pgbase,
+ unsigned int offset,
+ unsigned int count)
+{
+ struct nfs_page *last;
+ struct nfs_page *ret;
+ struct folio *folio = nfs_page_to_folio(req);
+ struct page *page = nfs_page_to_page(req, pgbase);
+
+ ret = nfs_page_create(req->wb_lock_context, pgbase, req->wb_index,
+ offset, count);
+ if (!IS_ERR(ret)) {
+ if (folio)
+ nfs_page_assign_folio(ret, folio);
+ else
+ nfs_page_assign_page(ret, page);
+ /* find the last request */
+ for (last = req->wb_head;
+ last->wb_this_page != req->wb_head;
+ last = last->wb_this_page)
+ ;
+
+ nfs_lock_request(ret);
+ nfs_page_group_init(ret, last);
+ ret->wb_nio = req->wb_nio;
+ }
+ return ret;
+}
+
+/**
+ * nfs_unlock_request - Unlock request and wake up sleepers.
+ * @req: pointer to request
+ */
+void nfs_unlock_request(struct nfs_page *req)
+{
+ clear_bit_unlock(PG_BUSY, &req->wb_flags);
+ smp_mb__after_atomic();
+ if (!test_bit(PG_CONTENDED2, &req->wb_flags))
+ return;
+ wake_up_bit(&req->wb_flags, PG_BUSY);
+}
+
+/**
+ * nfs_unlock_and_release_request - Unlock request and release the nfs_page
+ * @req: pointer to request
+ */
+void nfs_unlock_and_release_request(struct nfs_page *req)
+{
+ nfs_unlock_request(req);
+ nfs_release_request(req);
+}
+
+/*
+ * nfs_clear_request - Free up all resources allocated to the request
+ * @req:
+ *
+ * Release page and open context resources associated with a read/write
+ * request after it has completed.
+ */
+static void nfs_clear_request(struct nfs_page *req)
+{
+ struct folio *folio = nfs_page_to_folio(req);
+ struct page *page = req->wb_page;
+ struct nfs_lock_context *l_ctx = req->wb_lock_context;
+ struct nfs_open_context *ctx;
+
+ if (folio != NULL) {
+ folio_put(folio);
+ req->wb_folio = NULL;
+ clear_bit(PG_FOLIO, &req->wb_flags);
+ } else if (page != NULL) {
+ put_page(page);
+ req->wb_page = NULL;
+ }
+ if (l_ctx != NULL) {
+ if (atomic_dec_and_test(&l_ctx->io_count)) {
+ wake_up_var(&l_ctx->io_count);
+ ctx = l_ctx->open_context;
+ if (test_bit(NFS_CONTEXT_UNLOCK, &ctx->flags))
+ rpc_wake_up(&NFS_SERVER(d_inode(ctx->dentry))->uoc_rpcwaitq);
+ }
+ nfs_put_lock_context(l_ctx);
+ req->wb_lock_context = NULL;
+ }
+}
+
+/**
+ * nfs_free_request - Release the count on an NFS read/write request
+ * @req: request to release
+ *
+ * Note: Should never be called with the spinlock held!
+ */
+void nfs_free_request(struct nfs_page *req)
+{
+ WARN_ON_ONCE(req->wb_this_page != req);
+
+ /* extra debug: make sure no sync bits are still set */
+ WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags));
+
+ /* Release struct file and open context */
+ nfs_clear_request(req);
+ nfs_page_free(req);
+}
+
+void nfs_release_request(struct nfs_page *req)
+{
+ kref_put(&req->wb_kref, nfs_page_group_destroy);
+}
+EXPORT_SYMBOL_GPL(nfs_release_request);
+
+/**
+ * nfs_wait_on_request - Wait for a request to complete.
+ * @req: request to wait upon.
+ *
+ * Interruptible by fatal signals only.
+ * The user is responsible for holding a count on the request.
+ */
+int
+nfs_wait_on_request(struct nfs_page *req)
+{
+ if (!test_bit(PG_BUSY, &req->wb_flags))
+ return 0;
+ set_bit(PG_CONTENDED2, &req->wb_flags);
+ smp_mb__after_atomic();
+ return wait_on_bit_io(&req->wb_flags, PG_BUSY,
+ TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL_GPL(nfs_wait_on_request);
+
+/*
+ * nfs_generic_pg_test - determine if requests can be coalesced
+ * @desc: pointer to descriptor
+ * @prev: previous request in desc, or NULL
+ * @req: this request
+ *
+ * Returns zero if @req cannot be coalesced into @desc, otherwise it returns
+ * the size of the request.
+ */
+size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *prev, struct nfs_page *req)
+{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+
+ if (mirror->pg_count > mirror->pg_bsize) {
+ /* should never happen */
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ /*
+ * Limit the request size so that we can still allocate a page array
+ * for it without upsetting the slab allocator.
+ */
+ if (((mirror->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
+ sizeof(struct page *) > PAGE_SIZE)
+ return 0;
+
+ return min(mirror->pg_bsize - mirror->pg_count, (size_t)req->wb_bytes);
+}
+EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
+
+struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops)
+{
+ struct nfs_pgio_header *hdr = ops->rw_alloc_header();
+
+ if (hdr) {
+ INIT_LIST_HEAD(&hdr->pages);
+ hdr->rw_ops = ops;
+ }
+ return hdr;
+}
+EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc);
+
+/**
+ * nfs_pgio_data_destroy - make @hdr suitable for reuse
+ *
+ * Frees memory and releases refs from nfs_generic_pgio, so that it may
+ * be called again.
+ *
+ * @hdr: A header that has had nfs_generic_pgio called
+ */
+static void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr)
+{
+ if (hdr->args.context)
+ put_nfs_open_context(hdr->args.context);
+ if (hdr->page_array.pagevec != hdr->page_array.page_array)
+ kfree(hdr->page_array.pagevec);
+}
+
+/*
+ * nfs_pgio_header_free - Free a read or write header
+ * @hdr: The header to free
+ */
+void nfs_pgio_header_free(struct nfs_pgio_header *hdr)
+{
+ nfs_pgio_data_destroy(hdr);
+ hdr->rw_ops->rw_free_header(hdr);
+}
+EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
+
+/**
+ * nfs_pgio_rpcsetup - Set up arguments for a pageio call
+ * @hdr: The pageio hdr
+ * @pgbase: base
+ * @count: Number of bytes to read
+ * @how: How to commit data (writes only)
+ * @cinfo: Commit information for the call (writes only)
+ */
+static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, unsigned int pgbase,
+ unsigned int count, int how,
+ struct nfs_commit_info *cinfo)
+{
+ struct nfs_page *req = hdr->req;
+
+ /* Set up the RPC argument and reply structs
+ * NB: take care not to mess about with hdr->commit et al. */
+
+ hdr->args.fh = NFS_FH(hdr->inode);
+ hdr->args.offset = req_offset(req);
+ /* pnfs_set_layoutcommit needs this */
+ hdr->mds_offset = hdr->args.offset;
+ hdr->args.pgbase = pgbase;
+ hdr->args.pages = hdr->page_array.pagevec;
+ hdr->args.count = count;
+ hdr->args.context = get_nfs_open_context(nfs_req_openctx(req));
+ hdr->args.lock_context = req->wb_lock_context;
+ hdr->args.stable = NFS_UNSTABLE;
+ switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
+ case 0:
+ break;
+ case FLUSH_COND_STABLE:
+ if (nfs_reqs_to_commit(cinfo))
+ break;
+ fallthrough;
+ default:
+ hdr->args.stable = NFS_FILE_SYNC;
+ }
+
+ hdr->res.fattr = &hdr->fattr;
+ hdr->res.count = 0;
+ hdr->res.eof = 0;
+ hdr->res.verf = &hdr->verf;
+ nfs_fattr_init(&hdr->fattr);
+}
+
+/**
+ * nfs_pgio_prepare - Prepare pageio hdr to go over the wire
+ * @task: The current task
+ * @calldata: pageio header to prepare
+ */
+static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs_pgio_header *hdr = calldata;
+ int err;
+ err = NFS_PROTO(hdr->inode)->pgio_rpc_prepare(task, hdr);
+ if (err)
+ rpc_exit(task, err);
+}
+
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
+ const struct cred *cred, const struct nfs_rpc_ops *rpc_ops,
+ const struct rpc_call_ops *call_ops, int how, int flags)
+{
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_argp = &hdr->args,
+ .rpc_resp = &hdr->res,
+ .rpc_cred = cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clnt,
+ .task = &hdr->task,
+ .rpc_message = &msg,
+ .callback_ops = call_ops,
+ .callback_data = hdr,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC | flags,
+ };
+
+ if (nfs_server_capable(hdr->inode, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
+ hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
+
+ dprintk("NFS: initiated pgio call "
+ "(req %s/%llu, %u bytes @ offset %llu)\n",
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
+
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ rpc_put_task(task);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
+
+/**
+ * nfs_pgio_error - Clean up from a pageio error
+ * @hdr: pageio header
+ */
+static void nfs_pgio_error(struct nfs_pgio_header *hdr)
+{
+ set_bit(NFS_IOHDR_REDO, &hdr->flags);
+ hdr->completion_ops->completion(hdr);
+}
+
+/**
+ * nfs_pgio_release - Release pageio data
+ * @calldata: The pageio header to release
+ */
+static void nfs_pgio_release(void *calldata)
+{
+ struct nfs_pgio_header *hdr = calldata;
+ hdr->completion_ops->completion(hdr);
+}
+
+static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror,
+ unsigned int bsize)
+{
+ INIT_LIST_HEAD(&mirror->pg_list);
+ mirror->pg_bytes_written = 0;
+ mirror->pg_count = 0;
+ mirror->pg_bsize = bsize;
+ mirror->pg_base = 0;
+ mirror->pg_recoalesce = 0;
+}
+
+/**
+ * nfs_pageio_init - initialise a page io descriptor
+ * @desc: pointer to descriptor
+ * @inode: pointer to inode
+ * @pg_ops: pointer to pageio operations
+ * @compl_ops: pointer to pageio completion operations
+ * @rw_ops: pointer to nfs read/write operations
+ * @bsize: io block size
+ * @io_flags: extra parameters for the io function
+ */
+void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
+ struct inode *inode,
+ const struct nfs_pageio_ops *pg_ops,
+ const struct nfs_pgio_completion_ops *compl_ops,
+ const struct nfs_rw_ops *rw_ops,
+ size_t bsize,
+ int io_flags)
+{
+ desc->pg_moreio = 0;
+ desc->pg_inode = inode;
+ desc->pg_ops = pg_ops;
+ desc->pg_completion_ops = compl_ops;
+ desc->pg_rw_ops = rw_ops;
+ desc->pg_ioflags = io_flags;
+ desc->pg_error = 0;
+ desc->pg_lseg = NULL;
+ desc->pg_io_completion = NULL;
+ desc->pg_dreq = NULL;
+ nfs_netfs_reset_pageio_descriptor(desc);
+ desc->pg_bsize = bsize;
+
+ desc->pg_mirror_count = 1;
+ desc->pg_mirror_idx = 0;
+
+ desc->pg_mirrors_dynamic = NULL;
+ desc->pg_mirrors = desc->pg_mirrors_static;
+ nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize);
+ desc->pg_maxretrans = 0;
+}
+
+/**
+ * nfs_pgio_result - Basic pageio error handling
+ * @task: The task that ran
+ * @calldata: Pageio header to check
+ */
+static void nfs_pgio_result(struct rpc_task *task, void *calldata)
+{
+ struct nfs_pgio_header *hdr = calldata;
+ struct inode *inode = hdr->inode;
+
+ if (hdr->rw_ops->rw_done(task, hdr, inode) != 0)
+ return;
+ if (task->tk_status < 0)
+ nfs_set_pgio_error(hdr, task->tk_status, hdr->args.offset);
+ else
+ hdr->rw_ops->rw_result(task, hdr);
+}
+
+/*
+ * Create an RPC task for the given read or write request and kick it.
+ * The page must have been locked by the caller.
+ *
+ * It may happen that the page we're passed is not marked dirty.
+ * This is the case if nfs_updatepage detects a conflicting request
+ * that has been written but not committed.
+ */
+int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr)
+{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+ struct nfs_page *req;
+ struct page **pages,
+ *last_page;
+ struct list_head *head = &mirror->pg_list;
+ struct nfs_commit_info cinfo;
+ struct nfs_page_array *pg_array = &hdr->page_array;
+ unsigned int pagecount, pageused;
+ unsigned int pg_base = offset_in_page(mirror->pg_base);
+ gfp_t gfp_flags = nfs_io_gfp_mask();
+
+ pagecount = nfs_page_array_len(pg_base, mirror->pg_count);
+ pg_array->npages = pagecount;
+
+ if (pagecount <= ARRAY_SIZE(pg_array->page_array))
+ pg_array->pagevec = pg_array->page_array;
+ else {
+ pg_array->pagevec = kcalloc(pagecount, sizeof(struct page *), gfp_flags);
+ if (!pg_array->pagevec) {
+ pg_array->npages = 0;
+ nfs_pgio_error(hdr);
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
+ }
+ }
+
+ nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
+ pages = hdr->page_array.pagevec;
+ last_page = NULL;
+ pageused = 0;
+ while (!list_empty(head)) {
+ struct nfs_page_iter_page i;
+ struct page *page;
+
+ req = nfs_list_entry(head->next);
+ nfs_list_move_request(req, &hdr->pages);
+
+ if (req->wb_pgbase == 0)
+ last_page = NULL;
+
+ nfs_page_iter_page_init(&i, req);
+ while ((page = nfs_page_iter_page_get(&i)) != NULL) {
+ if (last_page != page) {
+ pageused++;
+ if (pageused > pagecount)
+ goto full;
+ *pages++ = last_page = page;
+ }
+ }
+ }
+full:
+ if (WARN_ON_ONCE(pageused != pagecount)) {
+ nfs_pgio_error(hdr);
+ desc->pg_error = -EINVAL;
+ return desc->pg_error;
+ }
+
+ if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
+ (desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
+ desc->pg_ioflags &= ~FLUSH_COND_STABLE;
+
+ /* Set up the argument struct */
+ nfs_pgio_rpcsetup(hdr, pg_base, mirror->pg_count, desc->pg_ioflags,
+ &cinfo);
+ desc->pg_rpc_callops = &nfs_pgio_common_ops;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_generic_pgio);
+
+static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
+{
+ struct nfs_pgio_header *hdr;
+ int ret;
+ unsigned short task_flags = 0;
+
+ hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+ if (!hdr) {
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
+ }
+ nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
+ ret = nfs_generic_pgio(desc, hdr);
+ if (ret == 0) {
+ if (NFS_SERVER(hdr->inode)->nfs_client->cl_minorversion)
+ task_flags = RPC_TASK_MOVEABLE;
+ ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
+ hdr,
+ hdr->cred,
+ NFS_PROTO(hdr->inode),
+ desc->pg_rpc_callops,
+ desc->pg_ioflags,
+ RPC_TASK_CRED_NOREF | task_flags);
+ }
+ return ret;
+}
+
+static struct nfs_pgio_mirror *
+nfs_pageio_alloc_mirrors(struct nfs_pageio_descriptor *desc,
+ unsigned int mirror_count)
+{
+ struct nfs_pgio_mirror *ret;
+ unsigned int i;
+
+ kfree(desc->pg_mirrors_dynamic);
+ desc->pg_mirrors_dynamic = NULL;
+ if (mirror_count == 1)
+ return desc->pg_mirrors_static;
+ ret = kmalloc_array(mirror_count, sizeof(*ret), nfs_io_gfp_mask());
+ if (ret != NULL) {
+ for (i = 0; i < mirror_count; i++)
+ nfs_pageio_mirror_init(&ret[i], desc->pg_bsize);
+ desc->pg_mirrors_dynamic = ret;
+ }
+ return ret;
+}
+
+/*
+ * nfs_pageio_setup_mirroring - determine if mirroring is to be used
+ * by calling the pg_get_mirror_count op
+ */
+static void nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ unsigned int mirror_count = 1;
+
+ if (pgio->pg_ops->pg_get_mirror_count)
+ mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
+ if (mirror_count == pgio->pg_mirror_count || pgio->pg_error < 0)
+ return;
+
+ if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) {
+ pgio->pg_error = -EINVAL;
+ return;
+ }
+
+ pgio->pg_mirrors = nfs_pageio_alloc_mirrors(pgio, mirror_count);
+ if (pgio->pg_mirrors == NULL) {
+ pgio->pg_error = -ENOMEM;
+ pgio->pg_mirrors = pgio->pg_mirrors_static;
+ mirror_count = 1;
+ }
+ pgio->pg_mirror_count = mirror_count;
+}
+
+static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
+{
+ pgio->pg_mirror_count = 1;
+ pgio->pg_mirror_idx = 0;
+ pgio->pg_mirrors = pgio->pg_mirrors_static;
+ kfree(pgio->pg_mirrors_dynamic);
+ pgio->pg_mirrors_dynamic = NULL;
+}
+
+static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
+ const struct nfs_lock_context *l2)
+{
+ return l1->lockowner == l2->lockowner;
+}
+
+static bool nfs_page_is_contiguous(const struct nfs_page *prev,
+ const struct nfs_page *req)
+{
+ size_t prev_end = prev->wb_pgbase + prev->wb_bytes;
+
+ if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
+ return false;
+ if (req->wb_pgbase == 0)
+ return prev_end == nfs_page_max_length(prev);
+ if (req->wb_pgbase == prev_end) {
+ struct folio *folio = nfs_page_to_folio(req);
+ if (folio)
+ return folio == nfs_page_to_folio(prev);
+ return req->wb_page == prev->wb_page;
+ }
+ return false;
+}
+
+/**
+ * nfs_coalesce_size - test two requests for compatibility
+ * @prev: pointer to nfs_page
+ * @req: pointer to nfs_page
+ * @pgio: pointer to nfs_pagio_descriptor
+ *
+ * The nfs_page structures 'prev' and 'req' are compared to ensure that the
+ * page data area they describe is contiguous, and that their RPC
+ * credentials, NFSv4 open state, and lockowners are the same.
+ *
+ * Returns size of the request that can be coalesced
+ */
+static unsigned int nfs_coalesce_size(struct nfs_page *prev,
+ struct nfs_page *req,
+ struct nfs_pageio_descriptor *pgio)
+{
+ struct file_lock_context *flctx;
+
+ if (prev) {
+ if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev)))
+ return 0;
+ flctx = locks_inode_context(d_inode(nfs_req_openctx(req)->dentry));
+ if (flctx != NULL &&
+ !(list_empty_careful(&flctx->flc_posix) &&
+ list_empty_careful(&flctx->flc_flock)) &&
+ !nfs_match_lock_context(req->wb_lock_context,
+ prev->wb_lock_context))
+ return 0;
+ if (!nfs_page_is_contiguous(prev, req))
+ return 0;
+ }
+ return pgio->pg_ops->pg_test(pgio, prev, req);
+}
+
+/**
+ * nfs_pageio_do_add_request - Attempt to coalesce a request into a page list.
+ * @desc: destination io descriptor
+ * @req: request
+ *
+ * If the request 'req' was successfully coalesced into the existing list
+ * of pages 'desc', it returns the size of req.
+ */
+static unsigned int
+nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *req)
+{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+ struct nfs_page *prev = NULL;
+ unsigned int size;
+
+ if (list_empty(&mirror->pg_list)) {
+ if (desc->pg_ops->pg_init)
+ desc->pg_ops->pg_init(desc, req);
+ if (desc->pg_error < 0)
+ return 0;
+ mirror->pg_base = req->wb_pgbase;
+ mirror->pg_count = 0;
+ mirror->pg_recoalesce = 0;
+ } else
+ prev = nfs_list_entry(mirror->pg_list.prev);
+
+ if (desc->pg_maxretrans && req->wb_nio > desc->pg_maxretrans) {
+ if (NFS_SERVER(desc->pg_inode)->flags & NFS_MOUNT_SOFTERR)
+ desc->pg_error = -ETIMEDOUT;
+ else
+ desc->pg_error = -EIO;
+ return 0;
+ }
+
+ size = nfs_coalesce_size(prev, req, desc);
+ if (size < req->wb_bytes)
+ return size;
+ nfs_list_move_request(req, &mirror->pg_list);
+ mirror->pg_count += req->wb_bytes;
+ return req->wb_bytes;
+}
+
+/*
+ * Helper for nfs_pageio_add_request and nfs_pageio_complete
+ */
+static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
+{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+ if (!list_empty(&mirror->pg_list)) {
+ int error = desc->pg_ops->pg_doio(desc);
+ if (error < 0)
+ desc->pg_error = error;
+ if (list_empty(&mirror->pg_list))
+ mirror->pg_bytes_written += mirror->pg_count;
+ }
+}
+
+static void
+nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *req)
+{
+ LIST_HEAD(head);
+
+ nfs_list_move_request(req, &head);
+ desc->pg_completion_ops->error_cleanup(&head, desc->pg_error);
+}
+
+/**
+ * __nfs_pageio_add_request - Attempt to coalesce a request into a page list.
+ * @desc: destination io descriptor
+ * @req: request
+ *
+ * This may split a request into subrequests which are all part of the
+ * same page group. If so, it will submit @req as the last one, to ensure
+ * the pointer to @req is still valid in case of failure.
+ *
+ * Returns true if the request 'req' was successfully coalesced into the
+ * existing list of pages 'desc'.
+ */
+static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *req)
+{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+ struct nfs_page *subreq;
+ unsigned int size, subreq_size;
+
+ nfs_page_group_lock(req);
+
+ subreq = req;
+ subreq_size = subreq->wb_bytes;
+ for(;;) {
+ size = nfs_pageio_do_add_request(desc, subreq);
+ if (size == subreq_size) {
+ /* We successfully submitted a request */
+ if (subreq == req)
+ break;
+ req->wb_pgbase += size;
+ req->wb_bytes -= size;
+ req->wb_offset += size;
+ subreq_size = req->wb_bytes;
+ subreq = req;
+ continue;
+ }
+ if (WARN_ON_ONCE(subreq != req)) {
+ nfs_page_group_unlock(req);
+ nfs_pageio_cleanup_request(desc, subreq);
+ subreq = req;
+ subreq_size = req->wb_bytes;
+ nfs_page_group_lock(req);
+ }
+ if (!size) {
+ /* Can't coalesce any more, so do I/O */
+ nfs_page_group_unlock(req);
+ desc->pg_moreio = 1;
+ nfs_pageio_doio(desc);
+ if (desc->pg_error < 0 || mirror->pg_recoalesce)
+ return 0;
+ /* retry add_request for this subreq */
+ nfs_page_group_lock(req);
+ continue;
+ }
+ subreq = nfs_create_subreq(req, req->wb_pgbase,
+ req->wb_offset, size);
+ if (IS_ERR(subreq))
+ goto err_ptr;
+ subreq_size = size;
+ }
+
+ nfs_page_group_unlock(req);
+ return 1;
+err_ptr:
+ desc->pg_error = PTR_ERR(subreq);
+ nfs_page_group_unlock(req);
+ return 0;
+}
+
+static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
+{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+ LIST_HEAD(head);
+
+ do {
+ list_splice_init(&mirror->pg_list, &head);
+ mirror->pg_recoalesce = 0;
+
+ while (!list_empty(&head)) {
+ struct nfs_page *req;
+
+ req = list_first_entry(&head, struct nfs_page, wb_list);
+ if (__nfs_pageio_add_request(desc, req))
+ continue;
+ if (desc->pg_error < 0) {
+ list_splice_tail(&head, &mirror->pg_list);
+ mirror->pg_recoalesce = 1;
+ return 0;
+ }
+ break;
+ }
+ } while (mirror->pg_recoalesce);
+ return 1;
+}
+
+static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *req)
+{
+ int ret;
+
+ do {
+ ret = __nfs_pageio_add_request(desc, req);
+ if (ret)
+ break;
+ if (desc->pg_error < 0)
+ break;
+ ret = nfs_do_recoalesce(desc);
+ } while (ret);
+
+ return ret;
+}
+
+static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc)
+{
+ u32 midx;
+ struct nfs_pgio_mirror *mirror;
+
+ if (!desc->pg_error)
+ return;
+
+ for (midx = 0; midx < desc->pg_mirror_count; midx++) {
+ mirror = nfs_pgio_get_mirror(desc, midx);
+ desc->pg_completion_ops->error_cleanup(&mirror->pg_list,
+ desc->pg_error);
+ }
+}
+
+int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *req)
+{
+ u32 midx;
+ unsigned int pgbase, offset, bytes;
+ struct nfs_page *dupreq;
+
+ pgbase = req->wb_pgbase;
+ offset = req->wb_offset;
+ bytes = req->wb_bytes;
+
+ nfs_pageio_setup_mirroring(desc, req);
+ if (desc->pg_error < 0)
+ goto out_failed;
+
+ /* Create the mirror instances first, and fire them off */
+ for (midx = 1; midx < desc->pg_mirror_count; midx++) {
+ nfs_page_group_lock(req);
+
+ dupreq = nfs_create_subreq(req,
+ pgbase, offset, bytes);
+
+ nfs_page_group_unlock(req);
+ if (IS_ERR(dupreq)) {
+ desc->pg_error = PTR_ERR(dupreq);
+ goto out_failed;
+ }
+
+ nfs_pgio_set_current_mirror(desc, midx);
+ if (!nfs_pageio_add_request_mirror(desc, dupreq))
+ goto out_cleanup_subreq;
+ }
+
+ nfs_pgio_set_current_mirror(desc, 0);
+ if (!nfs_pageio_add_request_mirror(desc, req))
+ goto out_failed;
+
+ return 1;
+
+out_cleanup_subreq:
+ nfs_pageio_cleanup_request(desc, dupreq);
+out_failed:
+ nfs_pageio_error_cleanup(desc);
+ return 0;
+}
+
+/*
+ * nfs_pageio_complete_mirror - Complete I/O on the current mirror of an
+ * nfs_pageio_descriptor
+ * @desc: pointer to io descriptor
+ * @mirror_idx: pointer to mirror index
+ */
+static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
+ u32 mirror_idx)
+{
+ struct nfs_pgio_mirror *mirror;
+ u32 restore_idx;
+
+ restore_idx = nfs_pgio_set_current_mirror(desc, mirror_idx);
+ mirror = nfs_pgio_current_mirror(desc);
+
+ for (;;) {
+ nfs_pageio_doio(desc);
+ if (desc->pg_error < 0 || !mirror->pg_recoalesce)
+ break;
+ if (!nfs_do_recoalesce(desc))
+ break;
+ }
+ nfs_pgio_set_current_mirror(desc, restore_idx);
+}
+
+/*
+ * nfs_pageio_resend - Transfer requests to new descriptor and resend
+ * @hdr - the pgio header to move request from
+ * @desc - the pageio descriptor to add requests to
+ *
+ * Try to move each request (nfs_page) from @hdr to @desc then attempt
+ * to send them.
+ *
+ * Returns 0 on success and < 0 on error.
+ */
+int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr)
+{
+ LIST_HEAD(pages);
+
+ desc->pg_io_completion = hdr->io_completion;
+ desc->pg_dreq = hdr->dreq;
+ nfs_netfs_set_pageio_descriptor(desc, hdr);
+ list_splice_init(&hdr->pages, &pages);
+ while (!list_empty(&pages)) {
+ struct nfs_page *req = nfs_list_entry(pages.next);
+
+ if (!nfs_pageio_add_request(desc, req))
+ break;
+ }
+ nfs_pageio_complete(desc);
+ if (!list_empty(&pages)) {
+ int err = desc->pg_error < 0 ? desc->pg_error : -EIO;
+ hdr->completion_ops->error_cleanup(&pages, err);
+ nfs_set_pgio_error(hdr, err, hdr->io_start);
+ return err;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_resend);
+
+/**
+ * nfs_pageio_complete - Complete I/O then cleanup an nfs_pageio_descriptor
+ * @desc: pointer to io descriptor
+ */
+void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
+{
+ u32 midx;
+
+ for (midx = 0; midx < desc->pg_mirror_count; midx++)
+ nfs_pageio_complete_mirror(desc, midx);
+
+ if (desc->pg_error < 0)
+ nfs_pageio_error_cleanup(desc);
+ if (desc->pg_ops->pg_cleanup)
+ desc->pg_ops->pg_cleanup(desc);
+ nfs_pageio_cleanup_mirroring(desc);
+}
+
+/**
+ * nfs_pageio_cond_complete - Conditional I/O completion
+ * @desc: pointer to io descriptor
+ * @index: page index
+ *
+ * It is important to ensure that processes don't try to take locks
+ * on non-contiguous ranges of pages as that might deadlock. This
+ * function should be called before attempting to wait on a locked
+ * nfs_page. It will complete the I/O if the page index 'index'
+ * is not contiguous with the existing list of pages in 'desc'.
+ */
+void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
+{
+ struct nfs_pgio_mirror *mirror;
+ struct nfs_page *prev;
+ struct folio *folio;
+ u32 midx;
+
+ for (midx = 0; midx < desc->pg_mirror_count; midx++) {
+ mirror = nfs_pgio_get_mirror(desc, midx);
+ if (!list_empty(&mirror->pg_list)) {
+ prev = nfs_list_entry(mirror->pg_list.prev);
+ folio = nfs_page_to_folio(prev);
+ if (folio) {
+ if (index == folio_next_index(folio))
+ continue;
+ } else if (index == prev->wb_index + 1)
+ continue;
+ nfs_pageio_complete(desc);
+ break;
+ }
+ }
+}
+
+/*
+ * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
+ */
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
+{
+ nfs_pageio_complete(pgio);
+}
+
+int __init nfs_init_nfspagecache(void)
+{
+ nfs_page_cachep = kmem_cache_create("nfs_page",
+ sizeof(struct nfs_page),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (nfs_page_cachep == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void nfs_destroy_nfspagecache(void)
+{
+ kmem_cache_destroy(nfs_page_cachep);
+}
+
+static const struct rpc_call_ops nfs_pgio_common_ops = {
+ .rpc_call_prepare = nfs_pgio_prepare,
+ .rpc_call_done = nfs_pgio_result,
+ .rpc_release = nfs_pgio_release,
+};
+
+const struct nfs_pageio_ops nfs_pgio_rw_ops = {
+ .pg_test = nfs_generic_pg_test,
+ .pg_doio = nfs_generic_pg_pgios,
+};
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
new file mode 100644
index 0000000000..9084f156d6
--- /dev/null
+++ b/fs/nfs/pnfs.c
@@ -0,0 +1,3358 @@
+/*
+ * pNFS functions to call and manage layout drivers.
+ *
+ * Copyright (c) 2002 [year of first publication]
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
+ * Dean Hildebrand <dhildebz@umich.edu>
+ *
+ * Permission is granted to use, copy, create derivative works, and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the University of Michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. If
+ * the above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * This software is provided as is, without representation or warranty
+ * of any kind either express or implied, including without limitation
+ * the implied warranties of merchantability, fitness for a particular
+ * purpose, or noninfringement. The Regents of the University of
+ * Michigan shall not be liable for any damages, including special,
+ * indirect, incidental, or consequential damages, with respect to any
+ * claim arising out of or in connection with the use of the software,
+ * even if it has been or is hereafter advised of the possibility of
+ * such damages.
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/module.h>
+#include <linux/sort.h>
+#include "internal.h"
+#include "pnfs.h"
+#include "iostat.h"
+#include "nfs4trace.h"
+#include "delegation.h"
+#include "nfs42.h"
+#include "nfs4_fs.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS
+#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
+
+/* Locking:
+ *
+ * pnfs_spinlock:
+ * protects pnfs_modules_tbl.
+ */
+static DEFINE_SPINLOCK(pnfs_spinlock);
+
+/*
+ * pnfs_modules_tbl holds all pnfs modules
+ */
+static LIST_HEAD(pnfs_modules_tbl);
+
+static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);
+static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
+ struct list_head *free_me,
+ const struct pnfs_layout_range *range,
+ u32 seq);
+static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
+ struct list_head *tmp_list);
+
+/* Return the registered pnfs layout driver module matching given id */
+static struct pnfs_layoutdriver_type *
+find_pnfs_driver_locked(u32 id)
+{
+ struct pnfs_layoutdriver_type *local;
+
+ list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
+ if (local->id == id)
+ goto out;
+ local = NULL;
+out:
+ dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
+ return local;
+}
+
+static struct pnfs_layoutdriver_type *
+find_pnfs_driver(u32 id)
+{
+ struct pnfs_layoutdriver_type *local;
+
+ spin_lock(&pnfs_spinlock);
+ local = find_pnfs_driver_locked(id);
+ if (local != NULL && !try_module_get(local->owner)) {
+ dprintk("%s: Could not grab reference on module\n", __func__);
+ local = NULL;
+ }
+ spin_unlock(&pnfs_spinlock);
+ return local;
+}
+
+const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id)
+{
+ return find_pnfs_driver(id);
+}
+
+void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld)
+{
+ if (ld)
+ module_put(ld->owner);
+}
+
+void
+unset_pnfs_layoutdriver(struct nfs_server *nfss)
+{
+ if (nfss->pnfs_curr_ld) {
+ if (nfss->pnfs_curr_ld->clear_layoutdriver)
+ nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
+ /* Decrement the MDS count. Purge the deviceid cache if zero */
+ if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count))
+ nfs4_deviceid_purge_client(nfss->nfs_client);
+ module_put(nfss->pnfs_curr_ld->owner);
+ }
+ nfss->pnfs_curr_ld = NULL;
+}
+
+/*
+ * When the server sends a list of layout types, we choose one in the order
+ * given in the list below.
+ *
+ * FIXME: should this list be configurable in some fashion? module param?
+ * mount option? something else?
+ */
+static const u32 ld_prefs[] = {
+ LAYOUT_SCSI,
+ LAYOUT_BLOCK_VOLUME,
+ LAYOUT_OSD2_OBJECTS,
+ LAYOUT_FLEX_FILES,
+ LAYOUT_NFSV4_1_FILES,
+ 0
+};
+
+static int
+ld_cmp(const void *e1, const void *e2)
+{
+ u32 ld1 = *((u32 *)e1);
+ u32 ld2 = *((u32 *)e2);
+ int i;
+
+ for (i = 0; ld_prefs[i] != 0; i++) {
+ if (ld1 == ld_prefs[i])
+ return -1;
+
+ if (ld2 == ld_prefs[i])
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Try to set the server's pnfs module to the pnfs layout type specified by id.
+ * Currently only one pNFS layout driver per filesystem is supported.
+ *
+ * @ids array of layout types supported by MDS.
+ */
+void
+set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
+ struct nfs_fsinfo *fsinfo)
+{
+ struct pnfs_layoutdriver_type *ld_type = NULL;
+ u32 id;
+ int i;
+
+ if (fsinfo->nlayouttypes == 0)
+ goto out_no_driver;
+ if (!(server->nfs_client->cl_exchange_flags &
+ (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
+ printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n",
+ __func__, server->nfs_client->cl_exchange_flags);
+ goto out_no_driver;
+ }
+
+ sort(fsinfo->layouttype, fsinfo->nlayouttypes,
+ sizeof(*fsinfo->layouttype), ld_cmp, NULL);
+
+ for (i = 0; i < fsinfo->nlayouttypes; i++) {
+ id = fsinfo->layouttype[i];
+ ld_type = find_pnfs_driver(id);
+ if (!ld_type) {
+ request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX,
+ id);
+ ld_type = find_pnfs_driver(id);
+ }
+ if (ld_type)
+ break;
+ }
+
+ if (!ld_type) {
+ dprintk("%s: No pNFS module found!\n", __func__);
+ goto out_no_driver;
+ }
+
+ server->pnfs_curr_ld = ld_type;
+ if (ld_type->set_layoutdriver
+ && ld_type->set_layoutdriver(server, mntfh)) {
+ printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
+ "driver %u.\n", __func__, id);
+ module_put(ld_type->owner);
+ goto out_no_driver;
+ }
+ /* Bump the MDS count */
+ atomic_inc(&server->nfs_client->cl_mds_count);
+
+ dprintk("%s: pNFS module for %u set\n", __func__, id);
+ return;
+
+out_no_driver:
+ dprintk("%s: Using NFSv4 I/O\n", __func__);
+ server->pnfs_curr_ld = NULL;
+}
+
+int
+pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
+{
+ int status = -EINVAL;
+ struct pnfs_layoutdriver_type *tmp;
+
+ if (ld_type->id == 0) {
+ printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
+ return status;
+ }
+ if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
+ printk(KERN_ERR "NFS: %s Layout driver must provide "
+ "alloc_lseg and free_lseg.\n", __func__);
+ return status;
+ }
+
+ spin_lock(&pnfs_spinlock);
+ tmp = find_pnfs_driver_locked(ld_type->id);
+ if (!tmp) {
+ list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
+ status = 0;
+ dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
+ ld_type->name);
+ } else {
+ printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",
+ __func__, ld_type->id);
+ }
+ spin_unlock(&pnfs_spinlock);
+
+ return status;
+}
+EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
+
+void
+pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
+{
+ dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
+ spin_lock(&pnfs_spinlock);
+ list_del(&ld_type->pnfs_tblid);
+ spin_unlock(&pnfs_spinlock);
+}
+EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
+
+/*
+ * pNFS client layout cache
+ */
+
+/* Need to hold i_lock if caller does not already hold reference */
+void
+pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ refcount_inc(&lo->plh_refcount);
+}
+
+static struct pnfs_layout_hdr *
+pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
+{
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
+ return ld->alloc_layout_hdr(ino, gfp_flags);
+}
+
+static void
+pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct nfs_server *server = NFS_SERVER(lo->plh_inode);
+ struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
+
+ if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
+ struct nfs_client *clp = server->nfs_client;
+
+ spin_lock(&clp->cl_lock);
+ list_del_rcu(&lo->plh_layouts);
+ spin_unlock(&clp->cl_lock);
+ }
+ put_cred(lo->plh_lc_cred);
+ return ld->free_layout_hdr(lo);
+}
+
+static void
+pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
+ dprintk("%s: freeing layout cache %p\n", __func__, lo);
+ nfsi->layout = NULL;
+ /* Reset MDS Threshold I/O counters */
+ nfsi->write_io = 0;
+ nfsi->read_io = 0;
+}
+
+void
+pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct inode *inode;
+ unsigned long i_state;
+
+ if (!lo)
+ return;
+ inode = lo->plh_inode;
+ pnfs_layoutreturn_before_put_layout_hdr(lo);
+
+ if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
+ if (!list_empty(&lo->plh_segs))
+ WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
+ pnfs_detach_layout_hdr(lo);
+ i_state = inode->i_state;
+ spin_unlock(&inode->i_lock);
+ pnfs_free_layout_hdr(lo);
+ /* Notify pnfs_destroy_layout_final() that we're done */
+ if (i_state & (I_FREEING | I_CLEAR))
+ wake_up_var(lo);
+ }
+}
+
+static struct inode *
+pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct inode *inode = igrab(lo->plh_inode);
+ if (inode)
+ return inode;
+ set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
+ return NULL;
+}
+
+/*
+ * Compare 2 layout stateid sequence ids, to see which is newer,
+ * taking into account wraparound issues.
+ */
+static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
+{
+ return (s32)(s1 - s2) > 0;
+}
+
+static void pnfs_barrier_update(struct pnfs_layout_hdr *lo, u32 newseq)
+{
+ if (pnfs_seqid_is_newer(newseq, lo->plh_barrier) || !lo->plh_barrier)
+ lo->plh_barrier = newseq;
+}
+
+static void
+pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
+ u32 seq)
+{
+ if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
+ iomode = IOMODE_ANY;
+ lo->plh_return_iomode = iomode;
+ set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+ /*
+ * We must set lo->plh_return_seq to avoid livelocks with
+ * pnfs_layout_need_return()
+ */
+ if (seq == 0)
+ seq = be32_to_cpu(lo->plh_stateid.seqid);
+ if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq))
+ lo->plh_return_seq = seq;
+ pnfs_barrier_update(lo, seq);
+}
+
+static void
+pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
+{
+ struct pnfs_layout_segment *lseg;
+ lo->plh_return_iomode = 0;
+ lo->plh_return_seq = 0;
+ clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+ list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
+ if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ continue;
+ pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
+ }
+}
+
+static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
+{
+ clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
+ clear_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
+ rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
+}
+
+static void
+pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg,
+ struct list_head *free_me)
+{
+ clear_bit(NFS_LSEG_ROC, &lseg->pls_flags);
+ clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
+ if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags))
+ pnfs_lseg_dec_and_remove_zero(lseg, free_me);
+ if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
+ pnfs_lseg_dec_and_remove_zero(lseg, free_me);
+}
+
+/*
+ * Update the seqid of a layout stateid after receiving
+ * NFS4ERR_OLD_STATEID
+ */
+bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
+ struct pnfs_layout_range *dst_range,
+ struct inode *inode)
+{
+ struct pnfs_layout_hdr *lo;
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+ bool ret = false;
+ LIST_HEAD(head);
+ int err;
+
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (lo && pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match_other(dst, &lo->plh_stateid)) {
+ /* Is our call using the most recent seqid? If so, bump it */
+ if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) {
+ nfs4_stateid_seqid_inc(dst);
+ ret = true;
+ goto out;
+ }
+ /* Try to update the seqid to the most recent */
+ err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0);
+ if (err != -EBUSY) {
+ dst->seqid = lo->plh_stateid.seqid;
+ *dst_range = range;
+ ret = true;
+ }
+ }
+out:
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&head);
+ return ret;
+}
+
+/*
+ * Mark a pnfs_layout_hdr and all associated layout segments as invalid
+ *
+ * In order to continue using the pnfs_layout_hdr, a full recovery
+ * is required.
+ * Note that caller must hold inode->i_lock.
+ */
+int
+pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
+ struct list_head *lseg_list)
+{
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+ struct pnfs_layout_segment *lseg, *next;
+
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
+ pnfs_clear_lseg_state(lseg, lseg_list);
+ pnfs_clear_layoutreturn_info(lo);
+ pnfs_free_returned_lsegs(lo, lseg_list, &range, 0);
+ set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
+ if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
+ !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
+ pnfs_clear_layoutreturn_waitbit(lo);
+ return !list_empty(&lo->plh_segs);
+}
+
+static int
+pnfs_iomode_to_fail_bit(u32 iomode)
+{
+ return iomode == IOMODE_RW ?
+ NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
+}
+
+static void
+pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
+{
+ lo->plh_retry_timestamp = jiffies;
+ if (!test_and_set_bit(fail_bit, &lo->plh_flags))
+ refcount_inc(&lo->plh_refcount);
+}
+
+static void
+pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
+{
+ if (test_and_clear_bit(fail_bit, &lo->plh_flags))
+ refcount_dec(&lo->plh_refcount);
+}
+
+static void
+pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
+{
+ struct inode *inode = lo->plh_inode;
+ struct pnfs_layout_range range = {
+ .iomode = iomode,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+ LIST_HEAD(head);
+
+ spin_lock(&inode->i_lock);
+ pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+ pnfs_mark_matching_lsegs_return(lo, &head, &range, 0);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&head);
+ dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
+ iomode == IOMODE_RW ? "RW" : "READ");
+}
+
+static bool
+pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
+{
+ unsigned long start, end;
+ int fail_bit = pnfs_iomode_to_fail_bit(iomode);
+
+ if (test_bit(fail_bit, &lo->plh_flags) == 0)
+ return false;
+ end = jiffies;
+ start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
+ if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
+ /* It is time to retry the failed layoutgets */
+ pnfs_layout_clear_fail_bit(lo, fail_bit);
+ return false;
+ }
+ return true;
+}
+
+static void
+pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
+ const struct pnfs_layout_range *range,
+ const nfs4_stateid *stateid)
+{
+ INIT_LIST_HEAD(&lseg->pls_list);
+ INIT_LIST_HEAD(&lseg->pls_lc_list);
+ INIT_LIST_HEAD(&lseg->pls_commits);
+ refcount_set(&lseg->pls_refcount, 1);
+ set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
+ lseg->pls_layout = lo;
+ lseg->pls_range = *range;
+ lseg->pls_seq = be32_to_cpu(stateid->seqid);
+}
+
+static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
+{
+ if (lseg != NULL) {
+ struct inode *inode = lseg->pls_layout->plh_inode;
+ NFS_SERVER(inode)->pnfs_curr_ld->free_lseg(lseg);
+ }
+}
+
+static void
+pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg)
+{
+ WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+ list_del_init(&lseg->pls_list);
+ /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
+ refcount_dec(&lo->plh_refcount);
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ return;
+ if (list_empty(&lo->plh_segs) &&
+ !test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
+ !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
+ if (atomic_read(&lo->plh_outstanding) == 0)
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+ }
+}
+
+static bool
+pnfs_cache_lseg_for_layoutreturn(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg)
+{
+ if (test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
+ pnfs_layout_is_valid(lo)) {
+ pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
+ list_move_tail(&lseg->pls_list, &lo->plh_return_segs);
+ return true;
+ }
+ return false;
+}
+
+void
+pnfs_put_lseg(struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_layout_hdr *lo;
+ struct inode *inode;
+
+ if (!lseg)
+ return;
+
+ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
+ refcount_read(&lseg->pls_refcount),
+ test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+
+ lo = lseg->pls_layout;
+ inode = lo->plh_inode;
+
+ if (refcount_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
+ pnfs_get_layout_hdr(lo);
+ pnfs_layout_remove_lseg(lo, lseg);
+ if (pnfs_cache_lseg_for_layoutreturn(lo, lseg))
+ lseg = NULL;
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg(lseg);
+ pnfs_put_layout_hdr(lo);
+ }
+}
+EXPORT_SYMBOL_GPL(pnfs_put_lseg);
+
+/*
+ * is l2 fully contained in l1?
+ * start1 end1
+ * [----------------------------------)
+ * start2 end2
+ * [----------------)
+ */
+static bool
+pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
+{
+ u64 start1 = l1->offset;
+ u64 end1 = pnfs_end_offset(start1, l1->length);
+ u64 start2 = l2->offset;
+ u64 end2 = pnfs_end_offset(start2, l2->length);
+
+ return (start1 <= start2) && (end1 >= end2);
+}
+
+static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
+ struct list_head *tmp_list)
+{
+ if (!refcount_dec_and_test(&lseg->pls_refcount))
+ return false;
+ pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
+ list_add(&lseg->pls_list, tmp_list);
+ return true;
+}
+
+/* Returns 1 if lseg is removed from list, 0 otherwise */
+static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
+ struct list_head *tmp_list)
+{
+ int rv = 0;
+
+ if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
+ /* Remove the reference keeping the lseg in the
+ * list. It will now be removed when all
+ * outstanding io is finished.
+ */
+ dprintk("%s: lseg %p ref %d\n", __func__, lseg,
+ refcount_read(&lseg->pls_refcount));
+ if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list))
+ rv = 1;
+ }
+ return rv;
+}
+
+static bool
+pnfs_should_free_range(const struct pnfs_layout_range *lseg_range,
+ const struct pnfs_layout_range *recall_range)
+{
+ return (recall_range->iomode == IOMODE_ANY ||
+ lseg_range->iomode == recall_range->iomode) &&
+ pnfs_lseg_range_intersecting(lseg_range, recall_range);
+}
+
+static bool
+pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg,
+ const struct pnfs_layout_range *recall_range,
+ u32 seq)
+{
+ if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq))
+ return false;
+ if (recall_range == NULL)
+ return true;
+ return pnfs_should_free_range(&lseg->pls_range, recall_range);
+}
+
+/**
+ * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
+ * @lo: layout header containing the lsegs
+ * @tmp_list: list head where doomed lsegs should go
+ * @recall_range: optional recall range argument to match (may be NULL)
+ * @seq: only invalidate lsegs obtained prior to this sequence (may be 0)
+ *
+ * Walk the list of lsegs in the layout header, and tear down any that should
+ * be destroyed. If "recall_range" is specified then the segment must match
+ * that range. If "seq" is non-zero, then only match segments that were handed
+ * out at or before that sequence.
+ *
+ * Returns number of matching invalid lsegs remaining in list after scanning
+ * it and purging them.
+ */
+int
+pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+ struct list_head *tmp_list,
+ const struct pnfs_layout_range *recall_range,
+ u32 seq)
+{
+ struct pnfs_layout_segment *lseg, *next;
+ struct nfs_server *server = NFS_SERVER(lo->plh_inode);
+ int remaining = 0;
+
+ dprintk("%s:Begin lo %p\n", __func__, lo);
+
+ if (list_empty(&lo->plh_segs))
+ return 0;
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
+ if (pnfs_match_lseg_recall(lseg, recall_range, seq)) {
+ dprintk("%s: freeing lseg %p iomode %d seq %u "
+ "offset %llu length %llu\n", __func__,
+ lseg, lseg->pls_range.iomode, lseg->pls_seq,
+ lseg->pls_range.offset, lseg->pls_range.length);
+ if (mark_lseg_invalid(lseg, tmp_list))
+ continue;
+ remaining++;
+ pnfs_lseg_cancel_io(server, lseg);
+ }
+ dprintk("%s:Return %i\n", __func__, remaining);
+ return remaining;
+}
+
+static void
+pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
+ struct list_head *free_me,
+ const struct pnfs_layout_range *range,
+ u32 seq)
+{
+ struct pnfs_layout_segment *lseg, *next;
+
+ list_for_each_entry_safe(lseg, next, &lo->plh_return_segs, pls_list) {
+ if (pnfs_match_lseg_recall(lseg, range, seq))
+ list_move_tail(&lseg->pls_list, free_me);
+ }
+}
+
+/* note free_me must contain lsegs from a single layout_hdr */
+void
+pnfs_free_lseg_list(struct list_head *free_me)
+{
+ struct pnfs_layout_segment *lseg, *tmp;
+
+ if (list_empty(free_me))
+ return;
+
+ list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
+ list_del(&lseg->pls_list);
+ pnfs_free_lseg(lseg);
+ }
+}
+
+static struct pnfs_layout_hdr *__pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+ struct pnfs_layout_hdr *lo;
+ LIST_HEAD(tmp_list);
+
+ spin_lock(&nfsi->vfs_inode.i_lock);
+ lo = nfsi->layout;
+ if (lo) {
+ pnfs_get_layout_hdr(lo);
+ pnfs_mark_layout_stateid_invalid(lo, &tmp_list);
+ pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
+ pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
+ spin_unlock(&nfsi->vfs_inode.i_lock);
+ pnfs_free_lseg_list(&tmp_list);
+ nfs_commit_inode(&nfsi->vfs_inode, 0);
+ pnfs_put_layout_hdr(lo);
+ } else
+ spin_unlock(&nfsi->vfs_inode.i_lock);
+ return lo;
+}
+
+void pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+ __pnfs_destroy_layout(nfsi);
+}
+EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
+
+static bool pnfs_layout_removed(struct nfs_inode *nfsi,
+ struct pnfs_layout_hdr *lo)
+{
+ bool ret;
+
+ spin_lock(&nfsi->vfs_inode.i_lock);
+ ret = nfsi->layout != lo;
+ spin_unlock(&nfsi->vfs_inode.i_lock);
+ return ret;
+}
+
+void pnfs_destroy_layout_final(struct nfs_inode *nfsi)
+{
+ struct pnfs_layout_hdr *lo = __pnfs_destroy_layout(nfsi);
+
+ if (lo)
+ wait_var_event(lo, pnfs_layout_removed(nfsi, lo));
+}
+
+static bool
+pnfs_layout_add_bulk_destroy_list(struct inode *inode,
+ struct list_head *layout_list)
+{
+ struct pnfs_layout_hdr *lo;
+ bool ret = false;
+
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) {
+ pnfs_get_layout_hdr(lo);
+ list_add(&lo->plh_bulk_destroy, layout_list);
+ ret = true;
+ }
+ spin_unlock(&inode->i_lock);
+ return ret;
+}
+
+/* Caller must hold rcu_read_lock and clp->cl_lock */
+static int
+pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
+ struct nfs_server *server,
+ struct list_head *layout_list)
+ __must_hold(&clp->cl_lock)
+ __must_hold(RCU)
+{
+ struct pnfs_layout_hdr *lo, *next;
+ struct inode *inode;
+
+ list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
+ if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
+ test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) ||
+ !list_empty(&lo->plh_bulk_destroy))
+ continue;
+ /* If the sb is being destroyed, just bail */
+ if (!nfs_sb_active(server->super))
+ break;
+ inode = pnfs_grab_inode_layout_hdr(lo);
+ if (inode != NULL) {
+ if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags))
+ list_del_rcu(&lo->plh_layouts);
+ if (pnfs_layout_add_bulk_destroy_list(inode,
+ layout_list))
+ continue;
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+ iput(inode);
+ } else {
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+ }
+ nfs_sb_deactive(server->super);
+ spin_lock(&clp->cl_lock);
+ rcu_read_lock();
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static int
+pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
+ bool is_bulk_recall)
+{
+ struct pnfs_layout_hdr *lo;
+ struct inode *inode;
+ LIST_HEAD(lseg_list);
+ int ret = 0;
+
+ while (!list_empty(layout_list)) {
+ lo = list_entry(layout_list->next, struct pnfs_layout_hdr,
+ plh_bulk_destroy);
+ dprintk("%s freeing layout for inode %lu\n", __func__,
+ lo->plh_inode->i_ino);
+ inode = lo->plh_inode;
+
+ pnfs_layoutcommit_inode(inode, false);
+
+ spin_lock(&inode->i_lock);
+ list_del_init(&lo->plh_bulk_destroy);
+ if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) {
+ if (is_bulk_recall)
+ set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+ ret = -EAGAIN;
+ }
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&lseg_list);
+ /* Free all lsegs that are attached to commit buckets */
+ nfs_commit_inode(inode, 0);
+ pnfs_put_layout_hdr(lo);
+ nfs_iput_and_deactive(inode);
+ }
+ return ret;
+}
+
+int
+pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
+ struct nfs_fsid *fsid,
+ bool is_recall)
+{
+ struct nfs_server *server;
+ LIST_HEAD(layout_list);
+
+ spin_lock(&clp->cl_lock);
+ rcu_read_lock();
+restart:
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0)
+ continue;
+ if (pnfs_layout_bulk_destroy_byserver_locked(clp,
+ server,
+ &layout_list) != 0)
+ goto restart;
+ }
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+
+ if (list_empty(&layout_list))
+ return 0;
+ return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
+}
+
+int
+pnfs_destroy_layouts_byclid(struct nfs_client *clp,
+ bool is_recall)
+{
+ struct nfs_server *server;
+ LIST_HEAD(layout_list);
+
+ spin_lock(&clp->cl_lock);
+ rcu_read_lock();
+restart:
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ if (pnfs_layout_bulk_destroy_byserver_locked(clp,
+ server,
+ &layout_list) != 0)
+ goto restart;
+ }
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+
+ if (list_empty(&layout_list))
+ return 0;
+ return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
+}
+
+/*
+ * Called by the state manager to remove all layouts established under an
+ * expired lease.
+ */
+void
+pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+ nfs4_deviceid_mark_client_invalid(clp);
+ nfs4_deviceid_purge_client(clp);
+
+ pnfs_destroy_layouts_byclid(clp, false);
+}
+
+static void
+pnfs_set_layout_cred(struct pnfs_layout_hdr *lo, const struct cred *cred)
+{
+ const struct cred *old;
+
+ if (cred && cred_fscmp(lo->plh_lc_cred, cred) != 0) {
+ old = xchg(&lo->plh_lc_cred, get_cred(cred));
+ put_cred(old);
+ }
+}
+
+/* update lo->plh_stateid with new if is more recent */
+void
+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
+ const struct cred *cred, bool update_barrier)
+{
+ u32 oldseq = be32_to_cpu(lo->plh_stateid.seqid);
+ u32 newseq = be32_to_cpu(new->seqid);
+
+ if (!pnfs_layout_is_valid(lo)) {
+ pnfs_set_layout_cred(lo, cred);
+ nfs4_stateid_copy(&lo->plh_stateid, new);
+ lo->plh_barrier = newseq;
+ pnfs_clear_layoutreturn_info(lo);
+ clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ return;
+ }
+
+ if (pnfs_seqid_is_newer(newseq, oldseq))
+ nfs4_stateid_copy(&lo->plh_stateid, new);
+
+ if (update_barrier) {
+ pnfs_barrier_update(lo, newseq);
+ return;
+ }
+ /*
+ * Because of wraparound, we want to keep the barrier
+ * "close" to the current seqids. We really only want to
+ * get here from a layoutget call.
+ */
+ if (atomic_read(&lo->plh_outstanding) == 1)
+ pnfs_barrier_update(lo, be32_to_cpu(lo->plh_stateid.seqid));
+}
+
+static bool
+pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *stateid)
+{
+ u32 seqid = be32_to_cpu(stateid->seqid);
+
+ return lo->plh_barrier && pnfs_seqid_is_newer(lo->plh_barrier, seqid);
+}
+
+/* lget is set to 1 if called from inside send_layoutget call chain */
+static bool
+pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
+{
+ return lo->plh_block_lgets ||
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+}
+
+static struct nfs_server *
+pnfs_find_server(struct inode *inode, struct nfs_open_context *ctx)
+{
+ struct nfs_server *server;
+
+ if (inode) {
+ server = NFS_SERVER(inode);
+ } else {
+ struct dentry *parent_dir = dget_parent(ctx->dentry);
+ server = NFS_SERVER(parent_dir->d_inode);
+ dput(parent_dir);
+ }
+ return server;
+}
+
+static void nfs4_free_pages(struct page **pages, size_t size)
+{
+ int i;
+
+ if (!pages)
+ return;
+
+ for (i = 0; i < size; i++) {
+ if (!pages[i])
+ break;
+ __free_page(pages[i]);
+ }
+ kfree(pages);
+}
+
+static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags)
+{
+ struct page **pages;
+ int i;
+
+ pages = kmalloc_array(size, sizeof(struct page *), gfp_flags);
+ if (!pages) {
+ dprintk("%s: can't alloc array of %zu pages\n", __func__, size);
+ return NULL;
+ }
+
+ for (i = 0; i < size; i++) {
+ pages[i] = alloc_page(gfp_flags);
+ if (!pages[i]) {
+ dprintk("%s: failed to allocate page\n", __func__);
+ nfs4_free_pages(pages, i);
+ return NULL;
+ }
+ }
+
+ return pages;
+}
+
+static struct nfs4_layoutget *
+pnfs_alloc_init_layoutget_args(struct inode *ino,
+ struct nfs_open_context *ctx,
+ const nfs4_stateid *stateid,
+ const struct pnfs_layout_range *range,
+ gfp_t gfp_flags)
+{
+ struct nfs_server *server = pnfs_find_server(ino, ctx);
+ size_t max_reply_sz = server->pnfs_curr_ld->max_layoutget_response;
+ size_t max_pages = max_response_pages(server);
+ struct nfs4_layoutget *lgp;
+
+ dprintk("--> %s\n", __func__);
+
+ lgp = kzalloc(sizeof(*lgp), gfp_flags);
+ if (lgp == NULL)
+ return NULL;
+
+ if (max_reply_sz) {
+ size_t npages = (max_reply_sz + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (npages < max_pages)
+ max_pages = npages;
+ }
+
+ lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags);
+ if (!lgp->args.layout.pages) {
+ kfree(lgp);
+ return NULL;
+ }
+ lgp->args.layout.pglen = max_pages * PAGE_SIZE;
+ lgp->res.layoutp = &lgp->args.layout;
+
+ /* Don't confuse uninitialised result and success */
+ lgp->res.status = -NFS4ERR_DELAY;
+
+ lgp->args.minlength = PAGE_SIZE;
+ if (lgp->args.minlength > range->length)
+ lgp->args.minlength = range->length;
+ if (ino) {
+ loff_t i_size = i_size_read(ino);
+
+ if (range->iomode == IOMODE_READ) {
+ if (range->offset >= i_size)
+ lgp->args.minlength = 0;
+ else if (i_size - range->offset < lgp->args.minlength)
+ lgp->args.minlength = i_size - range->offset;
+ }
+ }
+ lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+ pnfs_copy_range(&lgp->args.range, range);
+ lgp->args.type = server->pnfs_curr_ld->id;
+ lgp->args.inode = ino;
+ lgp->args.ctx = get_nfs_open_context(ctx);
+ nfs4_stateid_copy(&lgp->args.stateid, stateid);
+ lgp->gfp_flags = gfp_flags;
+ lgp->cred = ctx->cred;
+ return lgp;
+}
+
+void pnfs_layoutget_free(struct nfs4_layoutget *lgp)
+{
+ size_t max_pages = lgp->args.layout.pglen / PAGE_SIZE;
+
+ nfs4_free_pages(lgp->args.layout.pages, max_pages);
+ pnfs_put_layout_hdr(lgp->lo);
+ put_nfs_open_context(lgp->args.ctx);
+ kfree(lgp);
+}
+
+static void pnfs_clear_layoutcommit(struct inode *inode,
+ struct list_head *head)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct pnfs_layout_segment *lseg, *tmp;
+
+ if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+ return;
+ list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) {
+ if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
+ continue;
+ pnfs_lseg_dec_and_remove_zero(lseg, head);
+ }
+}
+
+void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *arg_stateid,
+ const struct pnfs_layout_range *range,
+ const nfs4_stateid *stateid)
+{
+ struct inode *inode = lo->plh_inode;
+ LIST_HEAD(freeme);
+
+ spin_lock(&inode->i_lock);
+ if (!pnfs_layout_is_valid(lo) ||
+ !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid))
+ goto out_unlock;
+ if (stateid) {
+ u32 seq = be32_to_cpu(arg_stateid->seqid);
+
+ pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);
+ pnfs_free_returned_lsegs(lo, &freeme, range, seq);
+ pnfs_set_layout_stateid(lo, stateid, NULL, true);
+ } else
+ pnfs_mark_layout_stateid_invalid(lo, &freeme);
+out_unlock:
+ pnfs_clear_layoutreturn_waitbit(lo);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&freeme);
+
+}
+
+static bool
+pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
+ nfs4_stateid *stateid,
+ const struct cred **cred,
+ enum pnfs_iomode *iomode)
+{
+ /* Serialise LAYOUTGET/LAYOUTRETURN */
+ if (atomic_read(&lo->plh_outstanding) != 0)
+ return false;
+ if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
+ return false;
+ set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
+ pnfs_get_layout_hdr(lo);
+ nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ *cred = get_cred(lo->plh_lc_cred);
+ if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
+ if (lo->plh_return_seq != 0)
+ stateid->seqid = cpu_to_be32(lo->plh_return_seq);
+ if (iomode != NULL)
+ *iomode = lo->plh_return_iomode;
+ pnfs_clear_layoutreturn_info(lo);
+ } else if (iomode != NULL)
+ *iomode = IOMODE_ANY;
+ pnfs_barrier_update(lo, be32_to_cpu(stateid->seqid));
+ return true;
+}
+
+static void
+pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args,
+ struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *stateid,
+ enum pnfs_iomode iomode)
+{
+ struct inode *inode = lo->plh_inode;
+
+ args->layout_type = NFS_SERVER(inode)->pnfs_curr_ld->id;
+ args->inode = inode;
+ args->range.iomode = iomode;
+ args->range.offset = 0;
+ args->range.length = NFS4_MAX_UINT64;
+ args->layout = lo;
+ nfs4_stateid_copy(&args->stateid, stateid);
+}
+
+static int
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *stateid,
+ const struct cred **pcred,
+ enum pnfs_iomode iomode,
+ bool sync)
+{
+ struct inode *ino = lo->plh_inode;
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
+ struct nfs4_layoutreturn *lrp;
+ const struct cred *cred = *pcred;
+ int status = 0;
+
+ *pcred = NULL;
+ lrp = kzalloc(sizeof(*lrp), nfs_io_gfp_mask());
+ if (unlikely(lrp == NULL)) {
+ status = -ENOMEM;
+ spin_lock(&ino->i_lock);
+ pnfs_clear_layoutreturn_waitbit(lo);
+ spin_unlock(&ino->i_lock);
+ put_cred(cred);
+ pnfs_put_layout_hdr(lo);
+ goto out;
+ }
+
+ pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode);
+ lrp->args.ld_private = &lrp->ld_private;
+ lrp->clp = NFS_SERVER(ino)->nfs_client;
+ lrp->cred = cred;
+ if (ld->prepare_layoutreturn)
+ ld->prepare_layoutreturn(&lrp->args);
+
+ status = nfs4_proc_layoutreturn(lrp, sync);
+out:
+ dprintk("<-- %s status: %d\n", __func__, status);
+ return status;
+}
+
+static bool
+pnfs_layout_segments_returnable(struct pnfs_layout_hdr *lo,
+ enum pnfs_iomode iomode,
+ u32 seq)
+{
+ struct pnfs_layout_range recall_range = {
+ .length = NFS4_MAX_UINT64,
+ .iomode = iomode,
+ };
+ return pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
+ &recall_range, seq) != -EBUSY;
+}
+
+/* Return true if layoutreturn is needed */
+static bool
+pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
+{
+ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ return false;
+ return pnfs_layout_segments_returnable(lo, lo->plh_return_iomode,
+ lo->plh_return_seq);
+}
+
+static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct inode *inode= lo->plh_inode;
+
+ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ return;
+ spin_lock(&inode->i_lock);
+ if (pnfs_layout_need_return(lo)) {
+ const struct cred *cred;
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode;
+ bool send;
+
+ send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
+ spin_unlock(&inode->i_lock);
+ if (send) {
+ /* Send an async layoutreturn so we dont deadlock */
+ pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
+ }
+ } else
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
+ * when the layout segment list is empty.
+ *
+ * Note that a pnfs_layout_hdr can exist with an empty layout segment
+ * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the
+ * deviceid is marked invalid.
+ */
+int
+_pnfs_return_layout(struct inode *ino)
+{
+ struct pnfs_layout_hdr *lo = NULL;
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+ LIST_HEAD(tmp_list);
+ const struct cred *cred;
+ nfs4_stateid stateid;
+ int status = 0;
+ bool send, valid_layout;
+
+ dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);
+
+ spin_lock(&ino->i_lock);
+ lo = nfsi->layout;
+ if (!lo) {
+ spin_unlock(&ino->i_lock);
+ dprintk("NFS: %s no layout to return\n", __func__);
+ goto out;
+ }
+ /* Reference matched in nfs4_layoutreturn_release */
+ pnfs_get_layout_hdr(lo);
+ /* Is there an outstanding layoutreturn ? */
+ if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
+ spin_unlock(&ino->i_lock);
+ if (wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
+ TASK_UNINTERRUPTIBLE))
+ goto out_put_layout_hdr;
+ spin_lock(&ino->i_lock);
+ }
+ valid_layout = pnfs_layout_is_valid(lo);
+ pnfs_clear_layoutcommit(ino, &tmp_list);
+ pnfs_mark_matching_lsegs_return(lo, &tmp_list, &range, 0);
+
+ if (NFS_SERVER(ino)->pnfs_curr_ld->return_range)
+ NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
+
+ /* Don't send a LAYOUTRETURN if list was initially empty */
+ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) ||
+ !valid_layout) {
+ spin_unlock(&ino->i_lock);
+ dprintk("NFS: %s no layout segments to return\n", __func__);
+ goto out_wait_layoutreturn;
+ }
+
+ send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL);
+ spin_unlock(&ino->i_lock);
+ if (send)
+ status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true);
+out_wait_layoutreturn:
+ wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE);
+out_put_layout_hdr:
+ pnfs_free_lseg_list(&tmp_list);
+ pnfs_put_layout_hdr(lo);
+out:
+ dprintk("<-- %s status: %d\n", __func__, status);
+ return status;
+}
+
+int
+pnfs_commit_and_return_layout(struct inode *inode)
+{
+ struct pnfs_layout_hdr *lo;
+ int ret;
+
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (lo == NULL) {
+ spin_unlock(&inode->i_lock);
+ return 0;
+ }
+ pnfs_get_layout_hdr(lo);
+ /* Block new layoutgets and read/write to ds */
+ lo->plh_block_lgets++;
+ spin_unlock(&inode->i_lock);
+ filemap_fdatawait(inode->i_mapping);
+ ret = pnfs_layoutcommit_inode(inode, true);
+ if (ret == 0)
+ ret = _pnfs_return_layout(inode);
+ spin_lock(&inode->i_lock);
+ lo->plh_block_lgets--;
+ spin_unlock(&inode->i_lock);
+ pnfs_put_layout_hdr(lo);
+ return ret;
+}
+
+bool pnfs_roc(struct inode *ino,
+ struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ const struct cred *cred)
+{
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct nfs_open_context *ctx;
+ struct nfs4_state *state;
+ struct pnfs_layout_hdr *lo;
+ struct pnfs_layout_segment *lseg, *next;
+ const struct cred *lc_cred;
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode = 0;
+ bool layoutreturn = false, roc = false;
+ bool skip_read = false;
+
+ if (!nfs_have_layout(ino))
+ return false;
+retry:
+ rcu_read_lock();
+ spin_lock(&ino->i_lock);
+ lo = nfsi->layout;
+ if (!lo || !pnfs_layout_is_valid(lo) ||
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ lo = NULL;
+ goto out_noroc;
+ }
+ pnfs_get_layout_hdr(lo);
+ if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
+ spin_unlock(&ino->i_lock);
+ rcu_read_unlock();
+ wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
+ TASK_UNINTERRUPTIBLE);
+ pnfs_put_layout_hdr(lo);
+ goto retry;
+ }
+
+ /* no roc if we hold a delegation */
+ if (nfs4_check_delegation(ino, FMODE_READ)) {
+ if (nfs4_check_delegation(ino, FMODE_WRITE))
+ goto out_noroc;
+ skip_read = true;
+ }
+
+ list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
+ state = ctx->state;
+ if (state == NULL)
+ continue;
+ /* Don't return layout if there is open file state */
+ if (state->state & FMODE_WRITE)
+ goto out_noroc;
+ if (state->state & FMODE_READ)
+ skip_read = true;
+ }
+
+
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) {
+ if (skip_read && lseg->pls_range.iomode == IOMODE_READ)
+ continue;
+ /* If we are sending layoutreturn, invalidate all valid lsegs */
+ if (!test_and_clear_bit(NFS_LSEG_ROC, &lseg->pls_flags))
+ continue;
+ /*
+ * Note: mark lseg for return so pnfs_layout_remove_lseg
+ * doesn't invalidate the layout for us.
+ */
+ set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
+ if (!mark_lseg_invalid(lseg, &lo->plh_return_segs))
+ continue;
+ pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
+ }
+
+ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ goto out_noroc;
+
+ /* ROC in two conditions:
+ * 1. there are ROC lsegs
+ * 2. we don't send layoutreturn
+ */
+ /* lo ref dropped in pnfs_roc_release() */
+ layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode);
+ /* If the creds don't match, we can't compound the layoutreturn */
+ if (!layoutreturn || cred_fscmp(cred, lc_cred) != 0)
+ goto out_noroc;
+
+ roc = layoutreturn;
+ pnfs_init_layoutreturn_args(args, lo, &stateid, iomode);
+ res->lrs_present = 0;
+ layoutreturn = false;
+ put_cred(lc_cred);
+
+out_noroc:
+ spin_unlock(&ino->i_lock);
+ rcu_read_unlock();
+ pnfs_layoutcommit_inode(ino, true);
+ if (roc) {
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
+ if (ld->prepare_layoutreturn)
+ ld->prepare_layoutreturn(args);
+ pnfs_put_layout_hdr(lo);
+ return true;
+ }
+ if (layoutreturn)
+ pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, true);
+ pnfs_put_layout_hdr(lo);
+ return false;
+}
+
+int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp,
+ struct nfs4_layoutreturn_res **respp, int *ret)
+{
+ struct nfs4_layoutreturn_args *arg = *argpp;
+ int retval = -EAGAIN;
+
+ if (!arg)
+ return 0;
+ /* Handle Layoutreturn errors */
+ switch (*ret) {
+ case 0:
+ retval = 0;
+ break;
+ case -NFS4ERR_NOMATCHING_LAYOUT:
+ /* Was there an RPC level error? If not, retry */
+ if (task->tk_rpc_status == 0)
+ break;
+ /* If the call was not sent, let caller handle it */
+ if (!RPC_WAS_SENT(task))
+ return 0;
+ /*
+ * Otherwise, assume the call succeeded and
+ * that we need to release the layout
+ */
+ *ret = 0;
+ (*respp)->lrs_present = 0;
+ retval = 0;
+ break;
+ case -NFS4ERR_DELAY:
+ /* Let the caller handle the retry */
+ *ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ return 0;
+ case -NFS4ERR_OLD_STATEID:
+ if (!nfs4_layout_refresh_old_stateid(&arg->stateid,
+ &arg->range, arg->inode))
+ break;
+ *ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ return -EAGAIN;
+ }
+ *argpp = NULL;
+ *respp = NULL;
+ return retval;
+}
+
+void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ int ret)
+{
+ struct pnfs_layout_hdr *lo = args->layout;
+ struct inode *inode = args->inode;
+ const nfs4_stateid *res_stateid = NULL;
+ struct nfs4_xdr_opaque_data *ld_private = args->ld_private;
+
+ switch (ret) {
+ case -NFS4ERR_NOMATCHING_LAYOUT:
+ spin_lock(&inode->i_lock);
+ if (pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match_other(&args->stateid, &lo->plh_stateid))
+ pnfs_set_plh_return_info(lo, args->range.iomode, 0);
+ pnfs_clear_layoutreturn_waitbit(lo);
+ spin_unlock(&inode->i_lock);
+ break;
+ case 0:
+ if (res->lrs_present)
+ res_stateid = &res->stateid;
+ fallthrough;
+ default:
+ pnfs_layoutreturn_free_lsegs(lo, &args->stateid, &args->range,
+ res_stateid);
+ }
+ trace_nfs4_layoutreturn_on_close(args->inode, &args->stateid, ret);
+ if (ld_private && ld_private->ops && ld_private->ops->free)
+ ld_private->ops->free(ld_private);
+ pnfs_put_layout_hdr(lo);
+}
+
+bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
+{
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct pnfs_layout_hdr *lo;
+ bool sleep = false;
+
+ /* we might not have grabbed lo reference. so need to check under
+ * i_lock */
+ spin_lock(&ino->i_lock);
+ lo = nfsi->layout;
+ if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
+ rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
+ sleep = true;
+ }
+ spin_unlock(&ino->i_lock);
+ return sleep;
+}
+
+/*
+ * Compare two layout segments for sorting into layout cache.
+ * We want to preferentially return RW over RO layouts, so ensure those
+ * are seen first.
+ */
+static s64
+pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
+{
+ s64 d;
+
+ /* high offset > low offset */
+ d = l1->offset - l2->offset;
+ if (d)
+ return d;
+
+ /* short length > long length */
+ d = l2->length - l1->length;
+ if (d)
+ return d;
+
+ /* read > read/write */
+ return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
+}
+
+static bool
+pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
+{
+ return pnfs_lseg_range_cmp(l1, l2) > 0;
+}
+
+static bool
+pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
+ struct pnfs_layout_segment *old)
+{
+ return false;
+}
+
+void
+pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ bool (*is_after)(const struct pnfs_layout_range *,
+ const struct pnfs_layout_range *),
+ bool (*do_merge)(struct pnfs_layout_segment *,
+ struct pnfs_layout_segment *),
+ struct list_head *free_me)
+{
+ struct pnfs_layout_segment *lp, *tmp;
+
+ dprintk("%s:Begin\n", __func__);
+
+ list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
+ if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
+ continue;
+ if (do_merge(lseg, lp)) {
+ mark_lseg_invalid(lp, free_me);
+ continue;
+ }
+ if (is_after(&lseg->pls_range, &lp->pls_range))
+ continue;
+ list_add_tail(&lseg->pls_list, &lp->pls_list);
+ dprintk("%s: inserted lseg %p "
+ "iomode %d offset %llu length %llu before "
+ "lp %p iomode %d offset %llu length %llu\n",
+ __func__, lseg, lseg->pls_range.iomode,
+ lseg->pls_range.offset, lseg->pls_range.length,
+ lp, lp->pls_range.iomode, lp->pls_range.offset,
+ lp->pls_range.length);
+ goto out;
+ }
+ list_add_tail(&lseg->pls_list, &lo->plh_segs);
+ dprintk("%s: inserted lseg %p "
+ "iomode %d offset %llu length %llu at tail\n",
+ __func__, lseg, lseg->pls_range.iomode,
+ lseg->pls_range.offset, lseg->pls_range.length);
+out:
+ pnfs_get_layout_hdr(lo);
+
+ dprintk("%s:Return\n", __func__);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);
+
+static void
+pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ struct list_head *free_me)
+{
+ struct inode *inode = lo->plh_inode;
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+ if (ld->add_lseg != NULL)
+ ld->add_lseg(lo, lseg, free_me);
+ else
+ pnfs_generic_layout_insert_lseg(lo, lseg,
+ pnfs_lseg_range_is_after,
+ pnfs_lseg_no_merge,
+ free_me);
+}
+
+static struct pnfs_layout_hdr *
+alloc_init_layout_hdr(struct inode *ino,
+ struct nfs_open_context *ctx,
+ gfp_t gfp_flags)
+{
+ struct pnfs_layout_hdr *lo;
+
+ lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
+ if (!lo)
+ return NULL;
+ refcount_set(&lo->plh_refcount, 1);
+ INIT_LIST_HEAD(&lo->plh_layouts);
+ INIT_LIST_HEAD(&lo->plh_segs);
+ INIT_LIST_HEAD(&lo->plh_return_segs);
+ INIT_LIST_HEAD(&lo->plh_bulk_destroy);
+ lo->plh_inode = ino;
+ lo->plh_lc_cred = get_cred(ctx->cred);
+ lo->plh_flags |= 1 << NFS_LAYOUT_INVALID_STID;
+ return lo;
+}
+
+static struct pnfs_layout_hdr *
+pnfs_find_alloc_layout(struct inode *ino,
+ struct nfs_open_context *ctx,
+ gfp_t gfp_flags)
+ __releases(&ino->i_lock)
+ __acquires(&ino->i_lock)
+{
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct pnfs_layout_hdr *new = NULL;
+
+ dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
+
+ if (nfsi->layout != NULL)
+ goto out_existing;
+ spin_unlock(&ino->i_lock);
+ new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
+ spin_lock(&ino->i_lock);
+
+ if (likely(nfsi->layout == NULL)) { /* Won the race? */
+ nfsi->layout = new;
+ return new;
+ } else if (new != NULL)
+ pnfs_free_layout_hdr(new);
+out_existing:
+ pnfs_get_layout_hdr(nfsi->layout);
+ return nfsi->layout;
+}
+
+/*
+ * iomode matching rules:
+ * iomode lseg strict match
+ * iomode
+ * ----- ----- ------ -----
+ * ANY READ N/A true
+ * ANY RW N/A true
+ * RW READ N/A false
+ * RW RW N/A true
+ * READ READ N/A true
+ * READ RW true false
+ * READ RW false true
+ */
+static bool
+pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
+ const struct pnfs_layout_range *range,
+ bool strict_iomode)
+{
+ struct pnfs_layout_range range1;
+
+ if ((range->iomode == IOMODE_RW &&
+ ls_range->iomode != IOMODE_RW) ||
+ (range->iomode != ls_range->iomode &&
+ strict_iomode) ||
+ !pnfs_lseg_range_intersecting(ls_range, range))
+ return false;
+
+ /* range1 covers only the first byte in the range */
+ range1 = *range;
+ range1.length = 1;
+ return pnfs_lseg_range_contained(ls_range, &range1);
+}
+
+/*
+ * lookup range in layout
+ */
+static struct pnfs_layout_segment *
+pnfs_find_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range,
+ bool strict_iomode)
+{
+ struct pnfs_layout_segment *lseg, *ret = NULL;
+
+ dprintk("%s:Begin\n", __func__);
+
+ list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
+ if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
+ pnfs_lseg_range_match(&lseg->pls_range, range,
+ strict_iomode)) {
+ ret = pnfs_get_lseg(lseg);
+ break;
+ }
+ }
+
+ dprintk("%s:Return lseg %p ref %d\n",
+ __func__, ret, ret ? refcount_read(&ret->pls_refcount) : 0);
+ return ret;
+}
+
+/*
+ * Use mdsthreshold hints set at each OPEN to determine if I/O should go
+ * to the MDS or over pNFS
+ *
+ * The nfs_inode read_io and write_io fields are cumulative counters reset
+ * when there are no layout segments. Note that in pnfs_update_layout iomode
+ * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
+ * WRITE request.
+ *
+ * A return of true means use MDS I/O.
+ *
+ * From rfc 5661:
+ * If a file's size is smaller than the file size threshold, data accesses
+ * SHOULD be sent to the metadata server. If an I/O request has a length that
+ * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
+ * server. If both file size and I/O size are provided, the client SHOULD
+ * reach or exceed both thresholds before sending its read or write
+ * requests to the data server.
+ */
+static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
+ struct inode *ino, int iomode)
+{
+ struct nfs4_threshold *t = ctx->mdsthreshold;
+ struct nfs_inode *nfsi = NFS_I(ino);
+ loff_t fsize = i_size_read(ino);
+ bool size = false, size_set = false, io = false, io_set = false, ret = false;
+
+ if (t == NULL)
+ return ret;
+
+ dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
+ __func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);
+
+ switch (iomode) {
+ case IOMODE_READ:
+ if (t->bm & THRESHOLD_RD) {
+ dprintk("%s fsize %llu\n", __func__, fsize);
+ size_set = true;
+ if (fsize < t->rd_sz)
+ size = true;
+ }
+ if (t->bm & THRESHOLD_RD_IO) {
+ dprintk("%s nfsi->read_io %llu\n", __func__,
+ nfsi->read_io);
+ io_set = true;
+ if (nfsi->read_io < t->rd_io_sz)
+ io = true;
+ }
+ break;
+ case IOMODE_RW:
+ if (t->bm & THRESHOLD_WR) {
+ dprintk("%s fsize %llu\n", __func__, fsize);
+ size_set = true;
+ if (fsize < t->wr_sz)
+ size = true;
+ }
+ if (t->bm & THRESHOLD_WR_IO) {
+ dprintk("%s nfsi->write_io %llu\n", __func__,
+ nfsi->write_io);
+ io_set = true;
+ if (nfsi->write_io < t->wr_io_sz)
+ io = true;
+ }
+ break;
+ }
+ if (size_set && io_set) {
+ if (size && io)
+ ret = true;
+ } else if (size || io)
+ ret = true;
+
+ dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
+ return ret;
+}
+
+static int pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
+{
+ /*
+ * send layoutcommit as it can hold up layoutreturn due to lseg
+ * reference
+ */
+ pnfs_layoutcommit_inode(lo->plh_inode, false);
+ return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
+ nfs_wait_bit_killable,
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
+}
+
+static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
+{
+ atomic_inc(&lo->plh_outstanding);
+}
+
+static void nfs_layoutget_end(struct pnfs_layout_hdr *lo)
+{
+ if (atomic_dec_and_test(&lo->plh_outstanding) &&
+ test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags))
+ wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN);
+}
+
+static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo)
+{
+ return test_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags);
+}
+
+static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
+{
+ unsigned long *bitlock = &lo->plh_flags;
+
+ clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
+ smp_mb__after_atomic();
+ wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
+}
+
+static void _add_to_server_list(struct pnfs_layout_hdr *lo,
+ struct nfs_server *server)
+{
+ if (!test_and_set_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
+ struct nfs_client *clp = server->nfs_client;
+
+ /* The lo must be on the clp list if there is any
+ * chance of a CB_LAYOUTRECALL(FILE) coming in.
+ */
+ spin_lock(&clp->cl_lock);
+ list_add_tail_rcu(&lo->plh_layouts, &server->layouts);
+ spin_unlock(&clp->cl_lock);
+ }
+}
+
+/*
+ * Layout segment is retreived from the server if not cached.
+ * The appropriate layout segment is referenced and returned to the caller.
+ */
+struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino,
+ struct nfs_open_context *ctx,
+ loff_t pos,
+ u64 count,
+ enum pnfs_iomode iomode,
+ bool strict_iomode,
+ gfp_t gfp_flags)
+{
+ struct pnfs_layout_range arg = {
+ .iomode = iomode,
+ .offset = pos,
+ .length = count,
+ };
+ unsigned pg_offset;
+ struct nfs_server *server = NFS_SERVER(ino);
+ struct nfs_client *clp = server->nfs_client;
+ struct pnfs_layout_hdr *lo = NULL;
+ struct pnfs_layout_segment *lseg = NULL;
+ struct nfs4_layoutget *lgp;
+ nfs4_stateid stateid;
+ long timeout = 0;
+ unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
+ bool first;
+
+ if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_NO_PNFS);
+ goto out;
+ }
+
+ if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_MDSTHRESH);
+ goto out;
+ }
+
+lookup_again:
+ lseg = ERR_PTR(nfs4_client_recover_expired_lease(clp));
+ if (IS_ERR(lseg))
+ goto out;
+ first = false;
+ spin_lock(&ino->i_lock);
+ lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
+ if (lo == NULL) {
+ spin_unlock(&ino->i_lock);
+ lseg = ERR_PTR(-ENOMEM);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_NOMEM);
+ goto out;
+ }
+
+ /* Do we even need to bother with this? */
+ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_BULK_RECALL);
+ dprintk("%s matches recall, use MDS\n", __func__);
+ goto out_unlock;
+ }
+
+ /* if LAYOUTGET already failed once we don't try again */
+ if (pnfs_layout_io_test_failed(lo, iomode)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
+ goto out_unlock;
+ }
+
+ /*
+ * If the layout segment list is empty, but there are outstanding
+ * layoutget calls, then they might be subject to a layoutrecall.
+ */
+ if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) &&
+ atomic_read(&lo->plh_outstanding) != 0) {
+ spin_unlock(&ino->i_lock);
+ lseg = ERR_PTR(wait_on_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN,
+ TASK_KILLABLE));
+ if (IS_ERR(lseg))
+ goto out_put_layout_hdr;
+ pnfs_put_layout_hdr(lo);
+ goto lookup_again;
+ }
+
+ /*
+ * Because we free lsegs when sending LAYOUTRETURN, we need to wait
+ * for LAYOUTRETURN.
+ */
+ if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
+ spin_unlock(&ino->i_lock);
+ dprintk("%s wait for layoutreturn\n", __func__);
+ lseg = ERR_PTR(pnfs_prepare_to_retry_layoutget(lo));
+ if (!IS_ERR(lseg)) {
+ pnfs_put_layout_hdr(lo);
+ dprintk("%s retrying\n", __func__);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ lseg,
+ PNFS_UPDATE_LAYOUT_RETRY);
+ goto lookup_again;
+ }
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_RETURN);
+ goto out_put_layout_hdr;
+ }
+
+ lseg = pnfs_find_lseg(lo, &arg, strict_iomode);
+ if (lseg) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_FOUND_CACHED);
+ goto out_unlock;
+ }
+
+ /*
+ * Choose a stateid for the LAYOUTGET. If we don't have a layout
+ * stateid, or it has been invalidated, then we must use the open
+ * stateid.
+ */
+ if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
+ int status;
+
+ /*
+ * The first layoutget for the file. Need to serialize per
+ * RFC 5661 Errata 3208.
+ */
+ if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
+ &lo->plh_flags)) {
+ spin_unlock(&ino->i_lock);
+ lseg = ERR_PTR(wait_on_bit(&lo->plh_flags,
+ NFS_LAYOUT_FIRST_LAYOUTGET,
+ TASK_KILLABLE));
+ if (IS_ERR(lseg))
+ goto out_put_layout_hdr;
+ pnfs_put_layout_hdr(lo);
+ dprintk("%s retrying\n", __func__);
+ goto lookup_again;
+ }
+
+ spin_unlock(&ino->i_lock);
+ first = true;
+ status = nfs4_select_rw_stateid(ctx->state,
+ iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ,
+ NULL, &stateid, NULL);
+ if (status != 0) {
+ lseg = ERR_PTR(status);
+ trace_pnfs_update_layout(ino, pos, count,
+ iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_INVALID_OPEN);
+ nfs4_schedule_stateid_recovery(server, ctx->state);
+ pnfs_clear_first_layoutget(lo);
+ pnfs_put_layout_hdr(lo);
+ goto lookup_again;
+ }
+ spin_lock(&ino->i_lock);
+ } else {
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+ }
+
+ if (pnfs_layoutgets_blocked(lo)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_BLOCKED);
+ goto out_unlock;
+ }
+ nfs_layoutget_begin(lo);
+ spin_unlock(&ino->i_lock);
+
+ _add_to_server_list(lo, server);
+
+ pg_offset = arg.offset & ~PAGE_MASK;
+ if (pg_offset) {
+ arg.offset -= pg_offset;
+ arg.length += pg_offset;
+ }
+ if (arg.length != NFS4_MAX_UINT64)
+ arg.length = PAGE_ALIGN(arg.length);
+
+ lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags);
+ if (!lgp) {
+ lseg = ERR_PTR(-ENOMEM);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL,
+ PNFS_UPDATE_LAYOUT_NOMEM);
+ nfs_layoutget_end(lo);
+ goto out_put_layout_hdr;
+ }
+
+ lgp->lo = lo;
+ pnfs_get_layout_hdr(lo);
+
+ lseg = nfs4_proc_layoutget(lgp, &timeout);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
+ nfs_layoutget_end(lo);
+ if (IS_ERR(lseg)) {
+ switch(PTR_ERR(lseg)) {
+ case -EBUSY:
+ if (time_after(jiffies, giveup))
+ lseg = NULL;
+ break;
+ case -ERECALLCONFLICT:
+ case -EAGAIN:
+ break;
+ case -ENODATA:
+ /* The server returned NFS4ERR_LAYOUTUNAVAILABLE */
+ pnfs_layout_set_fail_bit(
+ lo, pnfs_iomode_to_fail_bit(iomode));
+ lseg = NULL;
+ goto out_put_layout_hdr;
+ default:
+ if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
+ pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+ lseg = NULL;
+ }
+ goto out_put_layout_hdr;
+ }
+ if (lseg) {
+ if (first)
+ pnfs_clear_first_layoutget(lo);
+ trace_pnfs_update_layout(ino, pos, count,
+ iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
+ pnfs_put_layout_hdr(lo);
+ goto lookup_again;
+ }
+ } else {
+ pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+ }
+
+out_put_layout_hdr:
+ if (first)
+ pnfs_clear_first_layoutget(lo);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_EXIT);
+ pnfs_put_layout_hdr(lo);
+out:
+ dprintk("%s: inode %s/%llu pNFS layout segment %s for "
+ "(%s, offset: %llu, length: %llu)\n",
+ __func__, ino->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(ino),
+ IS_ERR_OR_NULL(lseg) ? "not found" : "found",
+ iomode==IOMODE_RW ? "read/write" : "read-only",
+ (unsigned long long)pos,
+ (unsigned long long)count);
+ return lseg;
+out_unlock:
+ spin_unlock(&ino->i_lock);
+ goto out_put_layout_hdr;
+}
+EXPORT_SYMBOL_GPL(pnfs_update_layout);
+
+static bool
+pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
+{
+ switch (range->iomode) {
+ case IOMODE_READ:
+ case IOMODE_RW:
+ break;
+ default:
+ return false;
+ }
+ if (range->offset == NFS4_MAX_UINT64)
+ return false;
+ if (range->length == 0)
+ return false;
+ if (range->length != NFS4_MAX_UINT64 &&
+ range->length > NFS4_MAX_UINT64 - range->offset)
+ return false;
+ return true;
+}
+
+static struct pnfs_layout_hdr *
+_pnfs_grab_empty_layout(struct inode *ino, struct nfs_open_context *ctx)
+{
+ struct pnfs_layout_hdr *lo;
+
+ spin_lock(&ino->i_lock);
+ lo = pnfs_find_alloc_layout(ino, ctx, nfs_io_gfp_mask());
+ if (!lo)
+ goto out_unlock;
+ if (!test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags))
+ goto out_unlock;
+ if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+ goto out_unlock;
+ if (pnfs_layoutgets_blocked(lo))
+ goto out_unlock;
+ if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags))
+ goto out_unlock;
+ nfs_layoutget_begin(lo);
+ spin_unlock(&ino->i_lock);
+ _add_to_server_list(lo, NFS_SERVER(ino));
+ return lo;
+
+out_unlock:
+ spin_unlock(&ino->i_lock);
+ pnfs_put_layout_hdr(lo);
+ return NULL;
+}
+
+static void _lgopen_prepare_attached(struct nfs4_opendata *data,
+ struct nfs_open_context *ctx)
+{
+ struct inode *ino = data->dentry->d_inode;
+ struct pnfs_layout_range rng = {
+ .iomode = (data->o_arg.fmode & FMODE_WRITE) ?
+ IOMODE_RW: IOMODE_READ,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+ struct nfs4_layoutget *lgp;
+ struct pnfs_layout_hdr *lo;
+
+ /* Heuristic: don't send layoutget if we have cached data */
+ if (rng.iomode == IOMODE_READ &&
+ (i_size_read(ino) == 0 || ino->i_mapping->nrpages != 0))
+ return;
+
+ lo = _pnfs_grab_empty_layout(ino, ctx);
+ if (!lo)
+ return;
+ lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, &rng,
+ nfs_io_gfp_mask());
+ if (!lgp) {
+ pnfs_clear_first_layoutget(lo);
+ nfs_layoutget_end(lo);
+ pnfs_put_layout_hdr(lo);
+ return;
+ }
+ lgp->lo = lo;
+ data->lgp = lgp;
+ data->o_arg.lg_args = &lgp->args;
+ data->o_res.lg_res = &lgp->res;
+}
+
+static void _lgopen_prepare_floating(struct nfs4_opendata *data,
+ struct nfs_open_context *ctx)
+{
+ struct inode *ino = data->dentry->d_inode;
+ struct pnfs_layout_range rng = {
+ .iomode = (data->o_arg.fmode & FMODE_WRITE) ?
+ IOMODE_RW: IOMODE_READ,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+ struct nfs4_layoutget *lgp;
+
+ lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, &rng,
+ nfs_io_gfp_mask());
+ if (!lgp)
+ return;
+ data->lgp = lgp;
+ data->o_arg.lg_args = &lgp->args;
+ data->o_res.lg_res = &lgp->res;
+}
+
+void pnfs_lgopen_prepare(struct nfs4_opendata *data,
+ struct nfs_open_context *ctx)
+{
+ struct nfs_server *server = NFS_SERVER(data->dir->d_inode);
+
+ if (!(pnfs_enabled_sb(server) &&
+ server->pnfs_curr_ld->flags & PNFS_LAYOUTGET_ON_OPEN))
+ return;
+ /* Could check on max_ops, but currently hardcoded high enough */
+ if (!nfs_server_capable(data->dir->d_inode, NFS_CAP_LGOPEN))
+ return;
+ if (data->lgp)
+ return;
+ if (data->state)
+ _lgopen_prepare_attached(data, ctx);
+ else
+ _lgopen_prepare_floating(data, ctx);
+}
+
+void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp,
+ struct nfs_open_context *ctx)
+{
+ struct pnfs_layout_hdr *lo;
+ struct pnfs_layout_segment *lseg;
+ struct nfs_server *srv = NFS_SERVER(ino);
+ u32 iomode;
+
+ if (!lgp)
+ return;
+ dprintk("%s: entered with status %i\n", __func__, lgp->res.status);
+ if (lgp->res.status) {
+ switch (lgp->res.status) {
+ default:
+ break;
+ /*
+ * Halt lgopen attempts if the server doesn't recognise
+ * the "current stateid" value, the layout type, or the
+ * layoutget operation as being valid.
+ * Also if it complains about too many ops in the compound
+ * or of the request/reply being too big.
+ */
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_NOTSUPP:
+ case -NFS4ERR_REP_TOO_BIG:
+ case -NFS4ERR_REP_TOO_BIG_TO_CACHE:
+ case -NFS4ERR_REQ_TOO_BIG:
+ case -NFS4ERR_TOO_MANY_OPS:
+ case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
+ srv->caps &= ~NFS_CAP_LGOPEN;
+ }
+ return;
+ }
+ if (!lgp->lo) {
+ lo = _pnfs_grab_empty_layout(ino, ctx);
+ if (!lo)
+ return;
+ lgp->lo = lo;
+ } else
+ lo = lgp->lo;
+
+ lseg = pnfs_layout_process(lgp);
+ if (!IS_ERR(lseg)) {
+ iomode = lgp->args.range.iomode;
+ pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+ pnfs_put_lseg(lseg);
+ }
+}
+
+void nfs4_lgopen_release(struct nfs4_layoutget *lgp)
+{
+ if (lgp != NULL) {
+ if (lgp->lo) {
+ pnfs_clear_first_layoutget(lgp->lo);
+ nfs_layoutget_end(lgp->lo);
+ }
+ pnfs_layoutget_free(lgp);
+ }
+}
+
+struct pnfs_layout_segment *
+pnfs_layout_process(struct nfs4_layoutget *lgp)
+{
+ struct pnfs_layout_hdr *lo = lgp->lo;
+ struct nfs4_layoutget_res *res = &lgp->res;
+ struct pnfs_layout_segment *lseg;
+ struct inode *ino = lo->plh_inode;
+ LIST_HEAD(free_me);
+
+ if (!pnfs_sanity_check_layout_range(&res->range))
+ return ERR_PTR(-EINVAL);
+
+ /* Inject layout blob into I/O device driver */
+ lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
+ if (IS_ERR_OR_NULL(lseg)) {
+ if (!lseg)
+ lseg = ERR_PTR(-ENOMEM);
+
+ dprintk("%s: Could not allocate layout: error %ld\n",
+ __func__, PTR_ERR(lseg));
+ return lseg;
+ }
+
+ pnfs_init_lseg(lo, lseg, &res->range, &res->stateid);
+
+ spin_lock(&ino->i_lock);
+ if (pnfs_layoutgets_blocked(lo)) {
+ dprintk("%s forget reply due to state\n", __func__);
+ goto out_forget;
+ }
+
+ if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) &&
+ !pnfs_is_first_layoutget(lo))
+ goto out_forget;
+
+ if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
+ /* existing state ID, make sure the sequence number matches. */
+ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
+ if (!pnfs_layout_is_valid(lo))
+ lo->plh_barrier = 0;
+ dprintk("%s forget reply due to sequence\n", __func__);
+ goto out_forget;
+ }
+ pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, false);
+ } else if (pnfs_layout_is_valid(lo)) {
+ /*
+ * We got an entirely new state ID. Mark all segments for the
+ * inode invalid, and retry the layoutget
+ */
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .length = NFS4_MAX_UINT64,
+ };
+ pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0);
+ goto out_forget;
+ } else {
+ /* We have a completely new layout */
+ pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true);
+ }
+
+ pnfs_get_lseg(lseg);
+ pnfs_layout_insert_lseg(lo, lseg, &free_me);
+
+
+ if (res->return_on_close)
+ set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
+
+ spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&free_me);
+ return lseg;
+
+out_forget:
+ spin_unlock(&ino->i_lock);
+ lseg->pls_layout = lo;
+ NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+ return ERR_PTR(-EAGAIN);
+}
+
+/**
+ * pnfs_mark_matching_lsegs_return - Free or return matching layout segments
+ * @lo: pointer to layout header
+ * @tmp_list: list header to be used with pnfs_free_lseg_list()
+ * @return_range: describe layout segment ranges to be returned
+ * @seq: stateid seqid to match
+ *
+ * This function is mainly intended for use by layoutrecall. It attempts
+ * to free the layout segment immediately, or else to mark it for return
+ * as soon as its reference count drops to zero.
+ *
+ * Returns
+ * - 0: a layoutreturn needs to be scheduled.
+ * - EBUSY: there are layout segment that are still in use.
+ * - ENOENT: there are no layout segments that need to be returned.
+ */
+int
+pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
+ struct list_head *tmp_list,
+ const struct pnfs_layout_range *return_range,
+ u32 seq)
+{
+ struct pnfs_layout_segment *lseg, *next;
+ struct nfs_server *server = NFS_SERVER(lo->plh_inode);
+ int remaining = 0;
+
+ dprintk("%s:Begin lo %p\n", __func__, lo);
+
+ assert_spin_locked(&lo->plh_inode->i_lock);
+
+ if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ tmp_list = &lo->plh_return_segs;
+
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
+ if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
+ dprintk("%s: marking lseg %p iomode %d "
+ "offset %llu length %llu\n", __func__,
+ lseg, lseg->pls_range.iomode,
+ lseg->pls_range.offset,
+ lseg->pls_range.length);
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ tmp_list = &lo->plh_return_segs;
+ if (mark_lseg_invalid(lseg, tmp_list))
+ continue;
+ remaining++;
+ set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
+ pnfs_lseg_cancel_io(server, lseg);
+ }
+
+ if (remaining) {
+ pnfs_set_plh_return_info(lo, return_range->iomode, seq);
+ return -EBUSY;
+ }
+
+ if (!list_empty(&lo->plh_return_segs)) {
+ pnfs_set_plh_return_info(lo, return_range->iomode, seq);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static void
+pnfs_mark_layout_for_return(struct inode *inode,
+ const struct pnfs_layout_range *range)
+{
+ struct pnfs_layout_hdr *lo;
+ bool return_now = false;
+
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (!pnfs_layout_is_valid(lo)) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+ pnfs_set_plh_return_info(lo, range->iomode, 0);
+ /*
+ * mark all matching lsegs so that we are sure to have no live
+ * segments at hand when sending layoutreturn. See pnfs_put_lseg()
+ * for how it works.
+ */
+ if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, range, 0) != -EBUSY) {
+ const struct cred *cred;
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode;
+
+ return_now = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
+ spin_unlock(&inode->i_lock);
+ if (return_now)
+ pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
+ } else {
+ spin_unlock(&inode->i_lock);
+ nfs_commit_inode(inode, 0);
+ }
+}
+
+void pnfs_error_mark_layout_for_return(struct inode *inode,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_layout_range range = {
+ .iomode = lseg->pls_range.iomode,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ pnfs_mark_layout_for_return(inode, &range);
+}
+EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
+
+static bool
+pnfs_layout_can_be_returned(struct pnfs_layout_hdr *lo)
+{
+ return pnfs_layout_is_valid(lo) &&
+ !test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) &&
+ !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
+}
+
+static struct pnfs_layout_segment *
+pnfs_find_first_lseg(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range,
+ enum pnfs_iomode iomode)
+{
+ struct pnfs_layout_segment *lseg;
+
+ list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
+ if (!test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
+ continue;
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ continue;
+ if (lseg->pls_range.iomode != iomode && iomode != IOMODE_ANY)
+ continue;
+ if (pnfs_lseg_range_intersecting(&lseg->pls_range, range))
+ return lseg;
+ }
+ return NULL;
+}
+
+/* Find open file states whose mode matches that of the range */
+static bool
+pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range)
+{
+ struct list_head *head;
+ struct nfs_open_context *ctx;
+ fmode_t mode = 0;
+
+ if (!pnfs_layout_can_be_returned(lo) ||
+ !pnfs_find_first_lseg(lo, range, range->iomode))
+ return false;
+
+ head = &NFS_I(lo->plh_inode)->open_files;
+ list_for_each_entry_rcu(ctx, head, list) {
+ if (ctx->state)
+ mode |= ctx->state->state & (FMODE_READ|FMODE_WRITE);
+ }
+
+ switch (range->iomode) {
+ default:
+ break;
+ case IOMODE_READ:
+ mode &= ~FMODE_WRITE;
+ break;
+ case IOMODE_RW:
+ if (pnfs_find_first_lseg(lo, range, IOMODE_READ))
+ mode &= ~FMODE_READ;
+ }
+ return mode == 0;
+}
+
+static int pnfs_layout_return_unused_byserver(struct nfs_server *server,
+ void *data)
+{
+ const struct pnfs_layout_range *range = data;
+ const struct cred *cred;
+ struct pnfs_layout_hdr *lo;
+ struct inode *inode;
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode;
+
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
+ inode = lo->plh_inode;
+ if (!inode || !pnfs_layout_can_be_returned(lo) ||
+ test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ continue;
+ spin_lock(&inode->i_lock);
+ if (!lo->plh_inode ||
+ !pnfs_should_return_unused_layout(lo, range)) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ pnfs_get_layout_hdr(lo);
+ pnfs_set_plh_return_info(lo, range->iomode, 0);
+ if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
+ range, 0) != 0 ||
+ !pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode)) {
+ spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
+ pnfs_put_layout_hdr(lo);
+ cond_resched();
+ goto restart;
+ }
+ spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
+ pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
+ pnfs_put_layout_hdr(lo);
+ cond_resched();
+ goto restart;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+void
+pnfs_layout_return_unused_byclid(struct nfs_client *clp,
+ enum pnfs_iomode iomode)
+{
+ struct pnfs_layout_range range = {
+ .iomode = iomode,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ nfs_client_for_each_server(clp, pnfs_layout_return_unused_byserver,
+ &range);
+}
+
+void
+pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio)
+{
+ if (pgio->pg_lseg == NULL ||
+ test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags))
+ return;
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout);
+
+/*
+ * Check for any intersection between the request and the pgio->pg_lseg,
+ * and if none, put this pgio->pg_lseg away.
+ */
+void
+pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+ if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) {
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range);
+
+void
+pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+ u64 rd_size;
+
+ pnfs_generic_pg_check_layout(pgio);
+ pnfs_generic_pg_check_range(pgio, req);
+ if (pgio->pg_lseg == NULL) {
+ if (pgio->pg_dreq == NULL)
+ rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
+ else
+ rd_size = nfs_dreq_bytes_left(pgio->pg_dreq,
+ req_offset(req));
+
+ pgio->pg_lseg =
+ pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+ req_offset(req), rd_size,
+ IOMODE_READ, false,
+ nfs_io_gfp_mask());
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
+ /* If no lseg, fall back to read through mds */
+ if (pgio->pg_lseg == NULL)
+ nfs_pageio_reset_read_mds(pgio);
+
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
+
+void
+pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req, u64 wb_size)
+{
+ pnfs_generic_pg_check_layout(pgio);
+ pnfs_generic_pg_check_range(pgio, req);
+ if (pgio->pg_lseg == NULL) {
+ pgio->pg_lseg =
+ pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+ req_offset(req), wb_size, IOMODE_RW,
+ false, nfs_io_gfp_mask());
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
+ /* If no lseg, fall back to write through mds */
+ if (pgio->pg_lseg == NULL)
+ nfs_pageio_reset_write_mds(pgio);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
+
+void
+pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
+{
+ if (desc->pg_lseg) {
+ pnfs_put_lseg(desc->pg_lseg);
+ desc->pg_lseg = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
+
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+size_t
+pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *prev, struct nfs_page *req)
+{
+ unsigned int size;
+ u64 seg_end, req_start, seg_left;
+
+ size = nfs_generic_pg_test(pgio, prev, req);
+ if (!size)
+ return 0;
+
+ /*
+ * 'size' contains the number of bytes left in the current page (up
+ * to the original size asked for in @req->wb_bytes).
+ *
+ * Calculate how many bytes are left in the layout segment
+ * and if there are less bytes than 'size', return that instead.
+ *
+ * Please also note that 'end_offset' is actually the offset of the
+ * first byte that lies outside the pnfs_layout_range. FIXME?
+ *
+ */
+ if (pgio->pg_lseg) {
+ seg_end = pnfs_end_offset(pgio->pg_lseg->pls_range.offset,
+ pgio->pg_lseg->pls_range.length);
+ req_start = req_offset(req);
+
+ /* start of request is past the last byte of this segment */
+ if (req_start >= seg_end)
+ return 0;
+
+ /* adjust 'size' iff there are fewer bytes left in the
+ * segment than what nfs_generic_pg_test returned */
+ seg_left = seg_end - req_start;
+ if (seg_left < size)
+ size = (unsigned int)seg_left;
+ }
+
+ return size;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
+
+int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
+{
+ struct nfs_pageio_descriptor pgio;
+
+ /* Resend all requests through the MDS */
+ nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
+ hdr->completion_ops);
+ return nfs_pageio_resend(&pgio, hdr);
+}
+EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
+
+static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
+{
+
+ dprintk("pnfs write error = %d\n", hdr->pnfs_error);
+ if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
+ PNFS_LAYOUTRET_ON_ERROR) {
+ pnfs_return_layout(hdr->inode);
+ }
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
+ hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
+}
+
+/*
+ * Called by non rpc-based layout drivers
+ */
+void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
+{
+ if (likely(!hdr->pnfs_error)) {
+ pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
+ hdr->mds_offset + hdr->res.count);
+ hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
+ }
+ trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
+ if (unlikely(hdr->pnfs_error))
+ pnfs_ld_handle_write_error(hdr);
+ hdr->mds_ops->rpc_release(hdr);
+}
+EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
+
+static void
+pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr)
+{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ list_splice_tail_init(&hdr->pages, &mirror->pg_list);
+ nfs_pageio_reset_write_mds(desc);
+ mirror->pg_recoalesce = 1;
+ }
+ hdr->completion_ops->completion(hdr);
+}
+
+static enum pnfs_try_status
+pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
+ const struct rpc_call_ops *call_ops,
+ struct pnfs_layout_segment *lseg,
+ int how)
+{
+ struct inode *inode = hdr->inode;
+ enum pnfs_try_status trypnfs;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+
+ hdr->mds_ops = call_ops;
+
+ dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
+ inode->i_ino, hdr->args.count, hdr->args.offset, how);
+ trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
+ if (trypnfs != PNFS_NOT_ATTEMPTED)
+ nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
+ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
+ return trypnfs;
+}
+
+static void
+pnfs_do_write(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr, int how)
+{
+ const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
+ struct pnfs_layout_segment *lseg = desc->pg_lseg;
+ enum pnfs_try_status trypnfs;
+
+ trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
+ switch (trypnfs) {
+ case PNFS_NOT_ATTEMPTED:
+ pnfs_write_through_mds(desc, hdr);
+ break;
+ case PNFS_ATTEMPTED:
+ break;
+ case PNFS_TRY_AGAIN:
+ /* cleanup hdr and prepare to redo pnfs */
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+ list_splice_init(&hdr->pages, &mirror->pg_list);
+ mirror->pg_recoalesce = 1;
+ }
+ hdr->mds_ops->rpc_release(hdr);
+ }
+}
+
+static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
+{
+ pnfs_put_lseg(hdr->lseg);
+ nfs_pgio_header_free(hdr);
+}
+
+int
+pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
+{
+ struct nfs_pgio_header *hdr;
+ int ret;
+
+ hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+ if (!hdr) {
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
+ }
+ nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
+
+ hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
+ ret = nfs_generic_pgio(desc, hdr);
+ if (!ret)
+ pnfs_do_write(desc, hdr, desc->pg_ioflags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
+
+int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
+{
+ struct nfs_pageio_descriptor pgio;
+
+ /* Resend all requests through the MDS */
+ nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
+ return nfs_pageio_resend(&pgio, hdr);
+}
+EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
+
+static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
+{
+ dprintk("pnfs read error = %d\n", hdr->pnfs_error);
+ if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
+ PNFS_LAYOUTRET_ON_ERROR) {
+ pnfs_return_layout(hdr->inode);
+ }
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
+ hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
+}
+
+/*
+ * Called by non rpc-based layout drivers
+ */
+void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
+{
+ if (likely(!hdr->pnfs_error))
+ hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
+ trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
+ if (unlikely(hdr->pnfs_error))
+ pnfs_ld_handle_read_error(hdr);
+ hdr->mds_ops->rpc_release(hdr);
+}
+EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
+
+static void
+pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr)
+{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ list_splice_tail_init(&hdr->pages, &mirror->pg_list);
+ nfs_pageio_reset_read_mds(desc);
+ mirror->pg_recoalesce = 1;
+ }
+ hdr->completion_ops->completion(hdr);
+}
+
+/*
+ * Call the appropriate parallel I/O subsystem read function.
+ */
+static enum pnfs_try_status
+pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
+ const struct rpc_call_ops *call_ops,
+ struct pnfs_layout_segment *lseg)
+{
+ struct inode *inode = hdr->inode;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+ enum pnfs_try_status trypnfs;
+
+ hdr->mds_ops = call_ops;
+
+ dprintk("%s: Reading ino:%lu %u@%llu\n",
+ __func__, inode->i_ino, hdr->args.count, hdr->args.offset);
+
+ trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
+ if (trypnfs != PNFS_NOT_ATTEMPTED)
+ nfs_inc_stats(inode, NFSIOS_PNFS_READ);
+ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
+ return trypnfs;
+}
+
+/* Resend all requests through pnfs. */
+void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr,
+ unsigned int mirror_idx)
+{
+ struct nfs_pageio_descriptor pgio;
+
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ /* Prevent deadlocks with layoutreturn! */
+ pnfs_put_lseg(hdr->lseg);
+ hdr->lseg = NULL;
+
+ nfs_pageio_init_read(&pgio, hdr->inode, false,
+ hdr->completion_ops);
+ pgio.pg_mirror_idx = mirror_idx;
+ hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr);
+ }
+}
+EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
+
+static void
+pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
+{
+ const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
+ struct pnfs_layout_segment *lseg = desc->pg_lseg;
+ enum pnfs_try_status trypnfs;
+
+ trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
+ switch (trypnfs) {
+ case PNFS_NOT_ATTEMPTED:
+ pnfs_read_through_mds(desc, hdr);
+ break;
+ case PNFS_ATTEMPTED:
+ break;
+ case PNFS_TRY_AGAIN:
+ /* cleanup hdr and prepare to redo pnfs */
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+ list_splice_init(&hdr->pages, &mirror->pg_list);
+ mirror->pg_recoalesce = 1;
+ }
+ hdr->mds_ops->rpc_release(hdr);
+ }
+}
+
+static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
+{
+ pnfs_put_lseg(hdr->lseg);
+ nfs_pgio_header_free(hdr);
+}
+
+int
+pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
+{
+ struct nfs_pgio_header *hdr;
+ int ret;
+
+ hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+ if (!hdr) {
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
+ }
+ nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
+ hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
+ ret = nfs_generic_pgio(desc, hdr);
+ if (!ret)
+ pnfs_do_read(desc, hdr);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
+
+static void pnfs_clear_layoutcommitting(struct inode *inode)
+{
+ unsigned long *bitlock = &NFS_I(inode)->flags;
+
+ clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
+ smp_mb__after_atomic();
+ wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
+}
+
+/*
+ * There can be multiple RW segments.
+ */
+static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
+{
+ struct pnfs_layout_segment *lseg;
+
+ list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
+ if (lseg->pls_range.iomode == IOMODE_RW &&
+ test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
+ list_add(&lseg->pls_lc_list, listp);
+ }
+}
+
+static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
+{
+ struct pnfs_layout_segment *lseg, *tmp;
+
+ /* Matched by references in pnfs_set_layoutcommit */
+ list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
+ list_del_init(&lseg->pls_lc_list);
+ pnfs_put_lseg(lseg);
+ }
+
+ pnfs_clear_layoutcommitting(inode);
+}
+
+void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
+{
+ pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
+}
+EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
+
+void
+pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg,
+ loff_t end_pos)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ bool mark_as_dirty = false;
+
+ spin_lock(&inode->i_lock);
+ if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
+ nfsi->layout->plh_lwb = end_pos;
+ mark_as_dirty = true;
+ dprintk("%s: Set layoutcommit for inode %lu ",
+ __func__, inode->i_ino);
+ } else if (end_pos > nfsi->layout->plh_lwb)
+ nfsi->layout->plh_lwb = end_pos;
+ if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) {
+ /* references matched in nfs4_layoutcommit_release */
+ pnfs_get_lseg(lseg);
+ }
+ spin_unlock(&inode->i_lock);
+ dprintk("%s: lseg %p end_pos %llu\n",
+ __func__, lseg, nfsi->layout->plh_lwb);
+
+ /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
+ * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
+ if (mark_as_dirty)
+ mark_inode_dirty_sync(inode);
+}
+EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
+
+void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
+{
+ struct nfs_server *nfss = NFS_SERVER(data->args.inode);
+
+ if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
+ nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
+ pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list);
+}
+
+/*
+ * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
+ * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
+ * data to disk to allow the server to recover the data if it crashes.
+ * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
+ * is off, and a COMMIT is sent to a data server, or
+ * if WRITEs to a data server return NFS_DATA_SYNC.
+ */
+int
+pnfs_layoutcommit_inode(struct inode *inode, bool sync)
+{
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ struct nfs4_layoutcommit_data *data;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ loff_t end_pos;
+ int status;
+
+ if (!pnfs_layoutcommit_outstanding(inode))
+ return 0;
+
+ dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
+
+ status = -EAGAIN;
+ if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
+ if (!sync)
+ goto out;
+ status = wait_on_bit_lock_action(&nfsi->flags,
+ NFS_INO_LAYOUTCOMMITTING,
+ nfs_wait_bit_killable,
+ TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
+ if (status)
+ goto out;
+ }
+
+ status = -ENOMEM;
+ /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
+ data = kzalloc(sizeof(*data), nfs_io_gfp_mask());
+ if (!data)
+ goto clear_layoutcommitting;
+
+ status = 0;
+ spin_lock(&inode->i_lock);
+ if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+ goto out_unlock;
+
+ INIT_LIST_HEAD(&data->lseg_list);
+ pnfs_list_write_lseg(inode, &data->lseg_list);
+
+ end_pos = nfsi->layout->plh_lwb;
+
+ nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
+ data->cred = get_cred(nfsi->layout->plh_lc_cred);
+ spin_unlock(&inode->i_lock);
+
+ data->args.inode = inode;
+ nfs_fattr_init(&data->fattr);
+ data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
+ data->res.fattr = &data->fattr;
+ if (end_pos != 0)
+ data->args.lastbytewritten = end_pos - 1;
+ else
+ data->args.lastbytewritten = U64_MAX;
+ data->res.server = NFS_SERVER(inode);
+
+ if (ld->prepare_layoutcommit) {
+ status = ld->prepare_layoutcommit(&data->args);
+ if (status) {
+ put_cred(data->cred);
+ spin_lock(&inode->i_lock);
+ set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
+ if (end_pos > nfsi->layout->plh_lwb)
+ nfsi->layout->plh_lwb = end_pos;
+ goto out_unlock;
+ }
+ }
+
+
+ status = nfs4_proc_layoutcommit(data, sync);
+out:
+ if (status)
+ mark_inode_dirty_sync(inode);
+ dprintk("<-- %s status %d\n", __func__, status);
+ return status;
+out_unlock:
+ spin_unlock(&inode->i_lock);
+ kfree(data);
+clear_layoutcommitting:
+ pnfs_clear_layoutcommitting(inode);
+ goto out;
+}
+EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
+
+int
+pnfs_generic_sync(struct inode *inode, bool datasync)
+{
+ return pnfs_layoutcommit_inode(inode, true);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_sync);
+
+struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
+{
+ struct nfs4_threshold *thp;
+
+ thp = kzalloc(sizeof(*thp), nfs_io_gfp_mask());
+ if (!thp) {
+ dprintk("%s mdsthreshold allocation failed\n", __func__);
+ return NULL;
+ }
+ return thp;
+}
+
+#if IS_ENABLED(CONFIG_NFS_V4_2)
+int
+pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
+{
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs42_layoutstat_data *data;
+ struct pnfs_layout_hdr *hdr;
+ int status = 0;
+
+ if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
+ goto out;
+
+ if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
+ goto out;
+
+ if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
+ goto out;
+
+ spin_lock(&inode->i_lock);
+ if (!NFS_I(inode)->layout) {
+ spin_unlock(&inode->i_lock);
+ goto out_clear_layoutstats;
+ }
+ hdr = NFS_I(inode)->layout;
+ pnfs_get_layout_hdr(hdr);
+ spin_unlock(&inode->i_lock);
+
+ data = kzalloc(sizeof(*data), gfp_flags);
+ if (!data) {
+ status = -ENOMEM;
+ goto out_put;
+ }
+
+ data->args.fh = NFS_FH(inode);
+ data->args.inode = inode;
+ status = ld->prepare_layoutstats(&data->args);
+ if (status)
+ goto out_free;
+
+ status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data);
+
+out:
+ dprintk("%s returns %d\n", __func__, status);
+ return status;
+
+out_free:
+ kfree(data);
+out_put:
+ pnfs_put_layout_hdr(hdr);
+out_clear_layoutstats:
+ smp_mb__before_atomic();
+ clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
+ smp_mb__after_atomic();
+ goto out;
+}
+EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
+#endif
+
+unsigned int layoutstats_timer;
+module_param(layoutstats_timer, uint, 0644);
+EXPORT_SYMBOL_GPL(layoutstats_timer);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
new file mode 100644
index 0000000000..d886c8226d
--- /dev/null
+++ b/fs/nfs/pnfs.h
@@ -0,0 +1,932 @@
+/*
+ * pNFS client data structures.
+ *
+ * Copyright (c) 2002
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
+ * Dean Hildebrand <dhildebz@umich.edu>
+ *
+ * Permission is granted to use, copy, create derivative works, and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the University of Michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. If
+ * the above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * This software is provided as is, without representation or warranty
+ * of any kind either express or implied, including without limitation
+ * the implied warranties of merchantability, fitness for a particular
+ * purpose, or noninfringement. The Regents of the University of
+ * Michigan shall not be liable for any damages, including special,
+ * indirect, incidental, or consequential damages, with respect to any
+ * claim arising out of or in connection with the use of the software,
+ * even if it has been or is hereafter advised of the possibility of
+ * such damages.
+ */
+
+#ifndef FS_NFS_PNFS_H
+#define FS_NFS_PNFS_H
+
+#include <linux/refcount.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/workqueue.h>
+
+struct nfs4_opendata;
+
+enum {
+ NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
+ NFS_LSEG_ROC, /* roc bit received from server */
+ NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */
+ NFS_LSEG_LAYOUTRETURN, /* layoutreturn bit set for layoutreturn */
+ NFS_LSEG_UNAVAILABLE, /* unavailable bit set for temporary problem */
+};
+
+/* Individual ip address */
+struct nfs4_pnfs_ds_addr {
+ struct sockaddr_storage da_addr;
+ size_t da_addrlen;
+ struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */
+ char *da_remotestr; /* human readable addr+port */
+ const char *da_netid;
+ int da_transport;
+};
+
+struct nfs4_pnfs_ds {
+ struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
+ char *ds_remotestr; /* comma sep list of addrs */
+ struct list_head ds_addrs;
+ struct nfs_client *ds_clp;
+ refcount_t ds_count;
+ unsigned long ds_state;
+#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
+};
+
+struct pnfs_layout_segment {
+ struct list_head pls_list;
+ struct list_head pls_lc_list;
+ struct list_head pls_commits;
+ struct pnfs_layout_range pls_range;
+ refcount_t pls_refcount;
+ u32 pls_seq;
+ unsigned long pls_flags;
+ struct pnfs_layout_hdr *pls_layout;
+};
+
+enum pnfs_try_status {
+ PNFS_ATTEMPTED = 0,
+ PNFS_NOT_ATTEMPTED = 1,
+ PNFS_TRY_AGAIN = 2,
+};
+
+#ifdef CONFIG_NFS_V4_1
+
+#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
+
+/*
+ * Default data server connection timeout and retrans vaules.
+ * Set by module parameters dataserver_timeo and dataserver_retrans.
+ */
+#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */
+#define NFS4_DEF_DS_RETRANS 5
+#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ)
+
+enum {
+ NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
+ NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
+ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
+ NFS_LAYOUT_RETURN, /* layoutreturn in progress */
+ NFS_LAYOUT_RETURN_LOCK, /* Serialise layoutreturn */
+ NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */
+ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
+ NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
+ NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */
+ NFS_LAYOUT_HASHED, /* The layout visible */
+ NFS_LAYOUT_DRAIN,
+};
+
+enum layoutdriver_policy_flags {
+ /* Should the pNFS client commit and return the layout upon truncate to
+ * a smaller size */
+ PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
+ PNFS_LAYOUTRET_ON_ERROR = 1 << 1,
+ PNFS_READ_WHOLE_PAGE = 1 << 2,
+ PNFS_LAYOUTGET_ON_OPEN = 1 << 3,
+};
+
+struct nfs4_deviceid_node;
+
+/* Per-layout driver specific registration structure */
+struct pnfs_layoutdriver_type {
+ struct list_head pnfs_tblid;
+ const u32 id;
+ const char *name;
+ struct module *owner;
+ unsigned flags;
+ unsigned max_deviceinfo_size;
+ unsigned max_layoutget_response;
+
+ int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
+ int (*clear_layoutdriver) (struct nfs_server *);
+
+ struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
+ void (*free_layout_hdr) (struct pnfs_layout_hdr *);
+
+ struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
+ void (*free_lseg) (struct pnfs_layout_segment *lseg);
+ void (*add_lseg) (struct pnfs_layout_hdr *layoutid,
+ struct pnfs_layout_segment *lseg,
+ struct list_head *free_me);
+
+ void (*return_range) (struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range);
+
+ /* test for nfs page cache coalescing */
+ const struct nfs_pageio_ops *pg_read_ops;
+ const struct nfs_pageio_ops *pg_write_ops;
+
+ struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode);
+
+ int (*sync)(struct inode *inode, bool datasync);
+
+ /*
+ * Return PNFS_ATTEMPTED to indicate the layout code has attempted
+ * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
+ */
+ enum pnfs_try_status (*read_pagelist)(struct nfs_pgio_header *);
+ enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
+
+ void (*free_deviceid_node) (struct nfs4_deviceid_node *);
+ struct nfs4_deviceid_node * (*alloc_deviceid_node)
+ (struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags);
+
+ int (*prepare_layoutreturn) (struct nfs4_layoutreturn_args *);
+
+ void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
+ int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args);
+ int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
+
+ void (*cancel_io)(struct pnfs_layout_segment *lseg);
+};
+
+struct pnfs_commit_ops {
+ void (*setup_ds_info)(struct pnfs_ds_commit_info *,
+ struct pnfs_layout_segment *);
+ void (*release_ds_info)(struct pnfs_ds_commit_info *,
+ struct inode *inode);
+ int (*commit_pagelist)(struct inode *inode,
+ struct list_head *mds_pages,
+ int how,
+ struct nfs_commit_info *cinfo);
+ void (*mark_request_commit) (struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
+ void (*clear_request_commit) (struct nfs_page *req,
+ struct nfs_commit_info *cinfo);
+ int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
+ int max);
+ void (*recover_commit_reqs) (struct list_head *list,
+ struct nfs_commit_info *cinfo);
+ struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
+ struct folio *folio);
+};
+
+struct pnfs_layout_hdr {
+ refcount_t plh_refcount;
+ atomic_t plh_outstanding; /* number of RPCs out */
+ struct list_head plh_layouts; /* other client layouts */
+ struct list_head plh_bulk_destroy;
+ struct list_head plh_segs; /* layout segments list */
+ struct list_head plh_return_segs; /* invalid layout segments */
+ unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
+ unsigned long plh_retry_timestamp;
+ unsigned long plh_flags;
+ nfs4_stateid plh_stateid;
+ u32 plh_barrier; /* ignore lower seqids */
+ u32 plh_return_seq;
+ enum pnfs_iomode plh_return_iomode;
+ loff_t plh_lwb; /* last write byte for layoutcommit */
+ const struct cred *plh_lc_cred; /* layoutcommit cred */
+ struct inode *plh_inode;
+ struct rcu_head plh_rcu;
+};
+
+struct pnfs_device {
+ struct nfs4_deviceid dev_id;
+ unsigned int layout_type;
+ unsigned int mincount;
+ unsigned int maxcount; /* gdia_maxcount */
+ struct page **pages;
+ unsigned int pgbase;
+ unsigned int pglen; /* reply buffer length */
+ unsigned char nocache : 1;/* May not be cached */
+};
+
+#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
+
+struct pnfs_devicelist {
+ unsigned int eof;
+ unsigned int num_devs;
+ struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
+};
+
+extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
+extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
+extern const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id);
+extern void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld);
+
+/* nfs4proc.c */
+extern size_t max_response_pages(struct nfs_server *server);
+extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
+ struct pnfs_device *dev,
+ const struct cred *cred);
+extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout);
+extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
+
+/* pnfs.c */
+void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
+void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
+
+void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *);
+void unset_pnfs_layoutdriver(struct nfs_server *);
+void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio);
+void pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req);
+void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
+int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
+void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req, u64 wb_size);
+void pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *);
+int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
+size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *prev, struct nfs_page *req);
+void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg);
+struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp);
+void pnfs_layoutget_free(struct nfs4_layoutget *lgp);
+void pnfs_free_lseg_list(struct list_head *tmp_list);
+void pnfs_destroy_layout(struct nfs_inode *);
+void pnfs_destroy_layout_final(struct nfs_inode *);
+void pnfs_destroy_all_layouts(struct nfs_client *);
+int pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
+ struct nfs_fsid *fsid,
+ bool is_recall);
+int pnfs_destroy_layouts_byclid(struct nfs_client *clp,
+ bool is_recall);
+bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
+ struct pnfs_layout_range *dst_range,
+ struct inode *inode);
+void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
+void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *new,
+ const struct cred *cred,
+ bool update_barrier);
+int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+ struct list_head *tmp_list,
+ const struct pnfs_layout_range *recall_range,
+ u32 seq);
+int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
+ struct list_head *tmp_list,
+ const struct pnfs_layout_range *recall_range,
+ u32 seq);
+int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
+ struct list_head *lseg_list);
+bool pnfs_roc(struct inode *ino,
+ struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ const struct cred *cred);
+int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp,
+ struct nfs4_layoutreturn_res **respp, int *ret);
+void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ int ret);
+bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task);
+void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t);
+void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
+int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
+int pnfs_generic_sync(struct inode *inode, bool datasync);
+int pnfs_nfs_generic_sync(struct inode *inode, bool datasync);
+int _pnfs_return_layout(struct inode *);
+int pnfs_commit_and_return_layout(struct inode *);
+void pnfs_ld_write_done(struct nfs_pgio_header *);
+void pnfs_ld_read_done(struct nfs_pgio_header *);
+void pnfs_read_resend_pnfs(struct nfs_pgio_header *, unsigned int mirror_idx);
+struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
+ struct nfs_open_context *ctx,
+ loff_t pos,
+ u64 count,
+ enum pnfs_iomode iomode,
+ bool strict_iomode,
+ gfp_t gfp_flags);
+void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *arg_stateid,
+ const struct pnfs_layout_range *range,
+ const nfs4_stateid *stateid);
+
+void pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ bool (*is_after)(const struct pnfs_layout_range *lseg_range,
+ const struct pnfs_layout_range *old),
+ bool (*do_merge)(struct pnfs_layout_segment *lseg,
+ struct pnfs_layout_segment *old),
+ struct list_head *free_me);
+
+void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
+int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
+int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
+struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
+void pnfs_error_mark_layout_for_return(struct inode *inode,
+ struct pnfs_layout_segment *lseg);
+void pnfs_layout_return_unused_byclid(struct nfs_client *clp,
+ enum pnfs_iomode iomode);
+
+/* nfs4_deviceid_flags */
+enum {
+ NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */
+ NFS_DEVICEID_UNAVAILABLE, /* device temporarily unavailable */
+ NFS_DEVICEID_NOCACHE, /* device may not be cached */
+};
+
+/* pnfs_dev.c */
+struct nfs4_deviceid_node {
+ struct hlist_node node;
+ struct hlist_node tmpnode;
+ const struct pnfs_layoutdriver_type *ld;
+ const struct nfs_client *nfs_client;
+ unsigned long flags;
+ unsigned long timestamp_unavailable;
+ struct nfs4_deviceid deviceid;
+ struct rcu_head rcu;
+ atomic_t ref;
+};
+
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, const struct cred *cred,
+ gfp_t gfp_mask);
+void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
+ const struct nfs4_deviceid *);
+bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
+void nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node);
+void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
+bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
+void nfs4_deviceid_purge_client(const struct nfs_client *);
+
+/* pnfs_nfs.c */
+struct pnfs_commit_array *pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags);
+void pnfs_free_commit_array(struct pnfs_commit_array *p);
+struct pnfs_commit_array *pnfs_add_commit_array(struct pnfs_ds_commit_info *,
+ struct pnfs_commit_array *,
+ struct pnfs_layout_segment *);
+
+void pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg);
+void pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo);
+
+void pnfs_generic_clear_request_commit(struct nfs_page *req,
+ struct nfs_commit_info *cinfo);
+void pnfs_generic_commit_release(void *calldata);
+void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data);
+void pnfs_generic_rw_release(void *data);
+void pnfs_generic_recover_commit_reqs(struct list_head *dst,
+ struct nfs_commit_info *cinfo);
+struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo,
+ struct folio *folio);
+int pnfs_generic_commit_pagelist(struct inode *inode,
+ struct list_head *mds_pages,
+ int how,
+ struct nfs_commit_info *cinfo,
+ int (*initiate_commit)(struct nfs_commit_data *data,
+ int how));
+int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max);
+void pnfs_generic_write_commit_done(struct rpc_task *task, void *data);
+void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds);
+struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs,
+ gfp_t gfp_flags);
+void nfs4_pnfs_v3_ds_connect_unload(void);
+int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
+ struct nfs4_deviceid_node *devid, unsigned int timeo,
+ unsigned int retrans, u32 version, u32 minor_version);
+struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net,
+ struct xdr_stream *xdr,
+ gfp_t gfp_flags);
+void pnfs_layout_mark_request_commit(struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
+void pnfs_lgopen_prepare(struct nfs4_opendata *data,
+ struct nfs_open_context *ctx);
+void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp,
+ struct nfs_open_context *ctx);
+void nfs4_lgopen_release(struct nfs4_layoutget *lgp);
+
+static inline bool nfs_have_layout(struct inode *inode)
+{
+ return NFS_I(inode)->layout != NULL;
+}
+
+static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo)
+{
+ return test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) == 0;
+}
+
+static inline struct nfs4_deviceid_node *
+nfs4_get_deviceid(struct nfs4_deviceid_node *d)
+{
+ atomic_inc(&d->ref);
+ return d;
+}
+
+static inline struct pnfs_layout_segment *
+pnfs_get_lseg(struct pnfs_layout_segment *lseg)
+{
+ if (lseg) {
+ refcount_inc(&lseg->pls_refcount);
+ smp_mb__after_atomic();
+ }
+ return lseg;
+}
+
+static inline bool
+pnfs_is_valid_lseg(struct pnfs_layout_segment *lseg)
+{
+ return test_bit(NFS_LSEG_VALID, &lseg->pls_flags) != 0;
+}
+
+/* Return true if a layout driver is being used for this mountpoint */
+static inline int pnfs_enabled_sb(struct nfs_server *nfss)
+{
+ return nfss->pnfs_curr_ld != NULL;
+}
+
+static inline int
+pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+ if (fl_cinfo == NULL || fl_cinfo->ncommitting == 0)
+ return PNFS_NOT_ATTEMPTED;
+ return fl_cinfo->ops->commit_pagelist(inode, mds_pages, how, cinfo);
+}
+
+static inline struct pnfs_ds_commit_info *
+pnfs_get_ds_info(struct inode *inode)
+{
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+ if (ld == NULL || ld->get_ds_info == NULL)
+ return NULL;
+ return ld->get_ds_info(inode);
+}
+
+static inline void
+pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+ struct pnfs_ds_commit_info *inode_cinfo = pnfs_get_ds_info(inode);
+ if (inode_cinfo != NULL)
+ fl_cinfo->ops = inode_cinfo->ops;
+}
+
+static inline void
+pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo)
+{
+ INIT_LIST_HEAD(&fl_cinfo->commits);
+ fl_cinfo->ops = NULL;
+}
+
+static inline void
+pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+ if (fl_cinfo->ops != NULL && fl_cinfo->ops->release_ds_info != NULL)
+ fl_cinfo->ops->release_ds_info(fl_cinfo, inode);
+}
+
+static inline void
+pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node)
+{
+ set_bit(NFS_DEVICEID_INVALID, &node->flags);
+}
+
+static inline bool
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo, u32 ds_commit_idx)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+ if (!lseg || !fl_cinfo->ops || !fl_cinfo->ops->mark_request_commit)
+ return false;
+ fl_cinfo->ops->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
+ return true;
+}
+
+static inline bool
+pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+ if (!fl_cinfo || !fl_cinfo->ops || !fl_cinfo->ops->clear_request_commit)
+ return false;
+ fl_cinfo->ops->clear_request_commit(req, cinfo);
+ return true;
+}
+
+static inline int
+pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
+ int max)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+ if (!fl_cinfo || fl_cinfo->nwritten == 0)
+ return 0;
+ return fl_cinfo->ops->scan_commit_lists(cinfo, max);
+}
+
+static inline void
+pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+ if (fl_cinfo && fl_cinfo->nwritten != 0)
+ fl_cinfo->ops->recover_commit_reqs(head, cinfo);
+}
+
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+ struct folio *folio)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+ if (!fl_cinfo->ops || !fl_cinfo->ops->search_commit_reqs)
+ return NULL;
+ return fl_cinfo->ops->search_commit_reqs(cinfo, folio);
+}
+
+/* Should the pNFS client commit and return the layout upon a setattr */
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+ if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+ return false;
+ return NFS_SERVER(inode)->pnfs_curr_ld->flags &
+ PNFS_LAYOUTRET_ON_SETATTR;
+}
+
+static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+ if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+ return false;
+ return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE;
+}
+
+static inline int
+pnfs_sync_inode(struct inode *inode, bool datasync)
+{
+ if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+ return 0;
+ return NFS_SERVER(inode)->pnfs_curr_ld->sync(inode, datasync);
+}
+
+static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ return test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags) != 0 ||
+ test_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags) != 0;
+}
+
+static inline int pnfs_return_layout(struct inode *ino)
+{
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct nfs_server *nfss = NFS_SERVER(ino);
+
+ if (pnfs_enabled_sb(nfss) && nfsi->layout) {
+ set_bit(NFS_LAYOUT_RETURN_REQUESTED, &nfsi->layout->plh_flags);
+ return _pnfs_return_layout(ino);
+ }
+
+ return 0;
+}
+
+static inline bool
+pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
+ struct nfs_server *nfss)
+{
+ return (dst && src && src->bm != 0 && nfss->pnfs_curr_ld &&
+ nfss->pnfs_curr_ld->id == src->l_type);
+}
+
+static inline u64
+pnfs_calc_offset_end(u64 offset, u64 len)
+{
+ if (len == NFS4_MAX_UINT64 || len >= NFS4_MAX_UINT64 - offset)
+ return NFS4_MAX_UINT64;
+ return offset + len - 1;
+}
+
+static inline u64
+pnfs_calc_offset_length(u64 offset, u64 end)
+{
+ if (end == NFS4_MAX_UINT64 || end <= offset)
+ return NFS4_MAX_UINT64;
+ return 1 + end - offset;
+}
+
+static inline void
+pnfs_copy_range(struct pnfs_layout_range *dst,
+ const struct pnfs_layout_range *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+
+static inline u64
+pnfs_end_offset(u64 start, u64 len)
+{
+ if (NFS4_MAX_UINT64 - start <= len)
+ return NFS4_MAX_UINT64;
+ return start + len;
+}
+
+/*
+ * Are 2 ranges intersecting?
+ * start1 end1
+ * [----------------------------------)
+ * start2 end2
+ * [----------------)
+ */
+static inline bool
+pnfs_is_range_intersecting(u64 start1, u64 end1, u64 start2, u64 end2)
+{
+ return (end1 == NFS4_MAX_UINT64 || start2 < end1) &&
+ (end2 == NFS4_MAX_UINT64 || start1 < end2);
+}
+
+static inline bool
+pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
+{
+ u64 end1 = pnfs_end_offset(l1->offset, l1->length);
+ u64 end2 = pnfs_end_offset(l2->offset, l2->length);
+
+ return pnfs_is_range_intersecting(l1->offset, end1, l2->offset, end2);
+}
+
+static inline bool
+pnfs_lseg_request_intersecting(struct pnfs_layout_segment *lseg, struct nfs_page *req)
+{
+ u64 seg_last = pnfs_end_offset(lseg->pls_range.offset, lseg->pls_range.length);
+ u64 req_last = req_offset(req) + req->wb_bytes;
+
+ return pnfs_is_range_intersecting(lseg->pls_range.offset, seg_last,
+ req_offset(req), req_last);
+}
+
+static inline void pnfs_lseg_cancel_io(struct nfs_server *server,
+ struct pnfs_layout_segment *lseg)
+{
+ if (server->pnfs_curr_ld->cancel_io)
+ server->pnfs_curr_ld->cancel_io(lseg);
+}
+
+extern unsigned int layoutstats_timer;
+
+#ifdef NFS_DEBUG
+void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
+#else
+static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
+{
+}
+
+#endif /* NFS_DEBUG */
+#else /* CONFIG_NFS_V4_1 */
+
+static inline bool nfs_have_layout(struct inode *inode)
+{
+ return false;
+}
+
+static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+}
+
+static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+}
+
+static inline void pnfs_destroy_layout_final(struct nfs_inode *nfsi)
+{
+}
+
+static inline struct pnfs_layout_segment *
+pnfs_get_lseg(struct pnfs_layout_segment *lseg)
+{
+ return NULL;
+}
+
+static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg)
+{
+}
+
+static inline int pnfs_return_layout(struct inode *ino)
+{
+ return 0;
+}
+
+static inline int pnfs_commit_and_return_layout(struct inode *inode)
+{
+ return 0;
+}
+
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+ return false;
+}
+
+static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+ return false;
+}
+
+static inline int
+pnfs_sync_inode(struct inode *inode, bool datasync)
+{
+ return 0;
+}
+
+static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+ return false;
+}
+
+
+static inline bool
+pnfs_roc(struct inode *ino,
+ struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ const struct cred *cred)
+{
+ return false;
+}
+
+static inline int
+pnfs_roc_done(struct rpc_task *task,
+ struct nfs4_layoutreturn_args **argpp,
+ struct nfs4_layoutreturn_res **respp,
+ int *ret)
+{
+ return 0;
+}
+
+static inline void
+pnfs_roc_release(struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ int ret)
+{
+}
+
+static inline bool
+pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
+{
+ return false;
+}
+
+static inline void set_pnfs_layoutdriver(struct nfs_server *s,
+ const struct nfs_fh *mntfh,
+ struct nfs_fsinfo *fsinfo)
+{
+}
+
+static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
+{
+}
+
+static inline int
+pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how,
+ struct nfs_commit_info *cinfo)
+{
+ return PNFS_NOT_ATTEMPTED;
+}
+
+static inline struct pnfs_ds_commit_info *
+pnfs_get_ds_info(struct inode *inode)
+{
+ return NULL;
+}
+
+static inline void
+pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+}
+
+static inline void
+pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo)
+{
+}
+
+static inline void
+pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+}
+
+static inline bool
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo, u32 ds_commit_idx)
+{
+ return false;
+}
+
+static inline bool
+pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo)
+{
+ return false;
+}
+
+static inline int
+pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
+ int max)
+{
+ return 0;
+}
+
+static inline void
+pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo)
+{
+}
+
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+ struct folio *folio)
+{
+ return NULL;
+}
+
+static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
+{
+ return 0;
+}
+
+static inline bool
+pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
+ struct nfs_server *nfss)
+{
+ return false;
+}
+
+static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
+{
+ return NULL;
+}
+
+static inline void nfs4_pnfs_v3_ds_connect_unload(void)
+{
+}
+
+static inline bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
+ struct pnfs_layout_range *dst_range,
+ struct inode *inode)
+{
+ return false;
+}
+
+static inline void pnfs_lgopen_prepare(struct nfs4_opendata *data,
+ struct nfs_open_context *ctx)
+{
+}
+
+static inline void pnfs_parse_lgopen(struct inode *ino,
+ struct nfs4_layoutget *lgp,
+ struct nfs_open_context *ctx)
+{
+}
+
+static inline void nfs4_lgopen_release(struct nfs4_layoutget *lgp)
+{
+}
+
+static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo)
+{
+ return false;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+#if IS_ENABLED(CONFIG_NFS_V4_2)
+int pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags);
+#else
+static inline int
+pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
+{
+ return 0;
+}
+#endif
+
+#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
new file mode 100644
index 0000000000..178001c901
--- /dev/null
+++ b/fs/nfs/pnfs_dev.c
@@ -0,0 +1,384 @@
+/*
+ * Device operations for the pnfs client.
+ *
+ * Copyright (c) 2002
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
+ * Dean Hildebrand <dhildebz@umich.edu>
+ * Garth Goodson <Garth.Goodson@netapp.com>
+ *
+ * Permission is granted to use, copy, create derivative works, and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the University of Michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. If
+ * the above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * This software is provided as is, without representation or warranty
+ * of any kind either express or implied, including without limitation
+ * the implied warranties of merchantability, fitness for a particular
+ * purpose, or noninfringement. The Regents of the University of
+ * Michigan shall not be liable for any damages, including special,
+ * indirect, incidental, or consequential damages, with respect to any
+ * claim arising out of or in connection with the use of the software,
+ * even if it has been or is hereafter advised of the possibility of
+ * such damages.
+ */
+
+#include <linux/export.h>
+#include <linux/nfs_fs.h>
+#include "nfs4session.h"
+#include "internal.h"
+#include "pnfs.h"
+
+#include "nfs4trace.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS
+
+/*
+ * Device ID RCU cache. A device ID is unique per server and layout type.
+ */
+#define NFS4_DEVICE_ID_HASH_BITS 5
+#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
+#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
+
+
+static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
+static DEFINE_SPINLOCK(nfs4_deviceid_lock);
+
+#ifdef NFS_DEBUG
+void
+nfs4_print_deviceid(const struct nfs4_deviceid *id)
+{
+ u32 *p = (u32 *)id;
+
+ dprintk("%s: device id= [%x%x%x%x]\n", __func__,
+ p[0], p[1], p[2], p[3]);
+}
+EXPORT_SYMBOL_GPL(nfs4_print_deviceid);
+#endif
+
+static inline u32
+nfs4_deviceid_hash(const struct nfs4_deviceid *id)
+{
+ unsigned char *cptr = (unsigned char *)id->data;
+ unsigned int nbytes = NFS4_DEVICEID4_SIZE;
+ u32 x = 0;
+
+ while (nbytes--) {
+ x *= 37;
+ x += *cptr++;
+ }
+ return x & NFS4_DEVICE_ID_HASH_MASK;
+}
+
+static struct nfs4_deviceid_node *
+_lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
+ const struct nfs_client *clp, const struct nfs4_deviceid *id,
+ long hash)
+{
+ struct nfs4_deviceid_node *d;
+
+ hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node)
+ if (d->ld == ld && d->nfs_client == clp &&
+ !memcmp(&d->deviceid, id, sizeof(*id))) {
+ if (atomic_read(&d->ref))
+ return d;
+ else
+ continue;
+ }
+ return NULL;
+}
+
+static struct nfs4_deviceid_node *
+nfs4_get_device_info(struct nfs_server *server,
+ const struct nfs4_deviceid *dev_id,
+ const struct cred *cred, gfp_t gfp_flags)
+{
+ struct nfs4_deviceid_node *d = NULL;
+ struct pnfs_device *pdev = NULL;
+ struct page **pages = NULL;
+ u32 max_resp_sz;
+ int max_pages;
+ int rc, i;
+
+ /*
+ * Use the session max response size as the basis for setting
+ * GETDEVICEINFO's maxcount
+ */
+ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+ if (server->pnfs_curr_ld->max_deviceinfo_size &&
+ server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz)
+ max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size;
+ max_pages = nfs_page_array_len(0, max_resp_sz);
+ dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
+ __func__, server, max_resp_sz, max_pages);
+
+ pdev = kzalloc(sizeof(*pdev), gfp_flags);
+ if (!pdev)
+ return NULL;
+
+ pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
+ if (!pages)
+ goto out_free_pdev;
+
+ for (i = 0; i < max_pages; i++) {
+ pages[i] = alloc_page(gfp_flags);
+ if (!pages[i])
+ goto out_free_pages;
+ }
+
+ memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
+ pdev->layout_type = server->pnfs_curr_ld->id;
+ pdev->pages = pages;
+ pdev->pgbase = 0;
+ pdev->pglen = max_resp_sz;
+ pdev->mincount = 0;
+ pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
+
+ rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
+ dprintk("%s getdevice info returns %d\n", __func__, rc);
+ if (rc)
+ goto out_free_pages;
+
+ /*
+ * Found new device, need to decode it and then add it to the
+ * list of known devices for this mountpoint.
+ */
+ d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
+ gfp_flags);
+ if (d && pdev->nocache)
+ set_bit(NFS_DEVICEID_NOCACHE, &d->flags);
+
+out_free_pages:
+ while (--i >= 0)
+ __free_page(pages[i]);
+ kfree(pages);
+out_free_pdev:
+ kfree(pdev);
+ dprintk("<-- %s d %p\n", __func__, d);
+ return d;
+}
+
+/*
+ * Lookup a deviceid in cache and get a reference count on it if found
+ *
+ * @clp nfs_client associated with deviceid
+ * @id deviceid to look up
+ */
+static struct nfs4_deviceid_node *
+__nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, long hash)
+{
+ struct nfs4_deviceid_node *d;
+
+ rcu_read_lock();
+ d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
+ hash);
+ if (d != NULL && !atomic_inc_not_zero(&d->ref))
+ d = NULL;
+ rcu_read_unlock();
+ return d;
+}
+
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, const struct cred *cred,
+ gfp_t gfp_mask)
+{
+ long hash = nfs4_deviceid_hash(id);
+ struct nfs4_deviceid_node *d, *new;
+
+ d = __nfs4_find_get_deviceid(server, id, hash);
+ if (d)
+ goto found;
+
+ new = nfs4_get_device_info(server, id, cred, gfp_mask);
+ if (!new) {
+ trace_nfs4_find_deviceid(server, id, -ENOENT);
+ return new;
+ }
+
+ spin_lock(&nfs4_deviceid_lock);
+ d = __nfs4_find_get_deviceid(server, id, hash);
+ if (d) {
+ spin_unlock(&nfs4_deviceid_lock);
+ server->pnfs_curr_ld->free_deviceid_node(new);
+ } else {
+ atomic_inc(&new->ref);
+ hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
+ spin_unlock(&nfs4_deviceid_lock);
+ d = new;
+ }
+found:
+ trace_nfs4_find_deviceid(server, id, 0);
+ return d;
+}
+EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
+
+/*
+ * Remove a deviceid from cache
+ *
+ * @clp nfs_client associated with deviceid
+ * @id the deviceid to unhash
+ *
+ * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise.
+ */
+void
+nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
+ const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+ struct nfs4_deviceid_node *d;
+
+ spin_lock(&nfs4_deviceid_lock);
+ rcu_read_lock();
+ d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
+ rcu_read_unlock();
+ if (!d) {
+ spin_unlock(&nfs4_deviceid_lock);
+ return;
+ }
+ hlist_del_init_rcu(&d->node);
+ clear_bit(NFS_DEVICEID_NOCACHE, &d->flags);
+ spin_unlock(&nfs4_deviceid_lock);
+
+ /* balance the initial ref set in pnfs_insert_deviceid */
+ nfs4_put_deviceid_node(d);
+}
+EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
+
+void
+nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server,
+ const struct nfs4_deviceid *id)
+{
+ INIT_HLIST_NODE(&d->node);
+ INIT_HLIST_NODE(&d->tmpnode);
+ d->ld = server->pnfs_curr_ld;
+ d->nfs_client = server->nfs_client;
+ d->flags = 0;
+ d->deviceid = *id;
+ atomic_set(&d->ref, 1);
+}
+EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
+
+/*
+ * Dereference a deviceid node and delete it when its reference count drops
+ * to zero.
+ *
+ * @d deviceid node to put
+ *
+ * return true iff the node was deleted
+ * Note that since the test for d->ref == 0 is sufficient to establish
+ * that the node is no longer hashed in the global device id cache.
+ */
+bool
+nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
+{
+ if (test_bit(NFS_DEVICEID_NOCACHE, &d->flags)) {
+ if (atomic_add_unless(&d->ref, -1, 2))
+ return false;
+ nfs4_delete_deviceid(d->ld, d->nfs_client, &d->deviceid);
+ }
+ if (!atomic_dec_and_test(&d->ref))
+ return false;
+ trace_nfs4_deviceid_free(d->nfs_client, &d->deviceid);
+ d->ld->free_deviceid_node(d);
+ return true;
+}
+EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
+
+void
+nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node)
+{
+ if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
+ clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+ smp_mb__after_atomic();
+ }
+}
+EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_available);
+
+void
+nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node)
+{
+ node->timestamp_unavailable = jiffies;
+ smp_mb__before_atomic();
+ set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+ smp_mb__after_atomic();
+}
+EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable);
+
+bool
+nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node)
+{
+ if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
+ unsigned long start, end;
+
+ end = jiffies;
+ start = end - PNFS_DEVICE_RETRY_TIMEOUT;
+ if (time_in_range(node->timestamp_unavailable, start, end))
+ return true;
+ clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+ smp_mb__after_atomic();
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(nfs4_test_deviceid_unavailable);
+
+static void
+_deviceid_purge_client(const struct nfs_client *clp, long hash)
+{
+ struct nfs4_deviceid_node *d;
+ HLIST_HEAD(tmp);
+
+ spin_lock(&nfs4_deviceid_lock);
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node)
+ if (d->nfs_client == clp && atomic_read(&d->ref)) {
+ hlist_del_init_rcu(&d->node);
+ hlist_add_head(&d->tmpnode, &tmp);
+ clear_bit(NFS_DEVICEID_NOCACHE, &d->flags);
+ }
+ rcu_read_unlock();
+ spin_unlock(&nfs4_deviceid_lock);
+
+ if (hlist_empty(&tmp))
+ return;
+
+ while (!hlist_empty(&tmp)) {
+ d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode);
+ hlist_del(&d->tmpnode);
+ nfs4_put_deviceid_node(d);
+ }
+}
+
+void
+nfs4_deviceid_purge_client(const struct nfs_client *clp)
+{
+ long h;
+
+ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS))
+ return;
+ for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++)
+ _deviceid_purge_client(clp, h);
+}
+
+/*
+ * Stop use of all deviceids associated with an nfs_client
+ */
+void
+nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
+{
+ struct nfs4_deviceid_node *d;
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i ++){
+ hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[i], node)
+ if (d->nfs_client == clp)
+ set_bit(NFS_DEVICEID_INVALID, &d->flags);
+ }
+ rcu_read_unlock();
+}
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
new file mode 100644
index 0000000000..afd23910f3
--- /dev/null
+++ b/fs/nfs/pnfs_nfs.c
@@ -0,0 +1,1209 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Common NFS I/O operations for the pnfs file based
+ * layout drivers.
+ *
+ * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
+ *
+ * Tom Haynes <loghyr@primarydata.com>
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/module.h>
+
+#include "nfs4session.h"
+#include "internal.h"
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS
+
+void pnfs_generic_rw_release(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ nfs_put_client(hdr->ds_clp);
+ hdr->mds_ops->rpc_release(data);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_rw_release);
+
+/* Fake up some data that will cause nfs_commit_release to retry the writes. */
+void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data)
+{
+ struct nfs_writeverf *verf = data->res.verf;
+
+ data->task.tk_status = 0;
+ memset(&verf->verifier, 0, sizeof(verf->verifier));
+ verf->committed = NFS_UNSTABLE;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_prepare_to_resend_writes);
+
+void pnfs_generic_write_commit_done(struct rpc_task *task, void *data)
+{
+ struct nfs_commit_data *wdata = data;
+
+ /* Note this may cause RPC to be resent */
+ wdata->mds_ops->rpc_call_done(task, data);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_write_commit_done);
+
+void pnfs_generic_commit_release(void *calldata)
+{
+ struct nfs_commit_data *data = calldata;
+
+ data->completion_ops->completion(data);
+ pnfs_put_lseg(data->lseg);
+ nfs_put_client(data->ds_clp);
+ nfs_commitdata_release(data);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
+
+static struct pnfs_layout_segment *
+pnfs_free_bucket_lseg(struct pnfs_commit_bucket *bucket)
+{
+ if (list_empty(&bucket->committing) && list_empty(&bucket->written)) {
+ struct pnfs_layout_segment *freeme = bucket->lseg;
+ bucket->lseg = NULL;
+ return freeme;
+ }
+ return NULL;
+}
+
+/* The generic layer is about to remove the req from the commit list.
+ * If this will make the bucket empty, it will need to put the lseg reference.
+ * Note this must be called holding nfsi->commit_mutex
+ */
+void
+pnfs_generic_clear_request_commit(struct nfs_page *req,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_commit_bucket *bucket = NULL;
+
+ if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
+ goto out;
+ cinfo->ds->nwritten--;
+ if (list_is_singular(&req->wb_list))
+ bucket = list_first_entry(&req->wb_list,
+ struct pnfs_commit_bucket, written);
+out:
+ nfs_request_remove_commit_list(req, cinfo);
+ if (bucket)
+ pnfs_put_lseg(pnfs_free_bucket_lseg(bucket));
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit);
+
+struct pnfs_commit_array *
+pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags)
+{
+ struct pnfs_commit_array *p;
+ struct pnfs_commit_bucket *b;
+
+ p = kmalloc(struct_size(p, buckets, n), gfp_flags);
+ if (!p)
+ return NULL;
+ p->nbuckets = n;
+ INIT_LIST_HEAD(&p->cinfo_list);
+ INIT_LIST_HEAD(&p->lseg_list);
+ p->lseg = NULL;
+ for (b = &p->buckets[0]; n != 0; b++, n--) {
+ INIT_LIST_HEAD(&b->written);
+ INIT_LIST_HEAD(&b->committing);
+ b->lseg = NULL;
+ b->direct_verf.committed = NFS_INVALID_STABLE_HOW;
+ }
+ return p;
+}
+EXPORT_SYMBOL_GPL(pnfs_alloc_commit_array);
+
+void
+pnfs_free_commit_array(struct pnfs_commit_array *p)
+{
+ kfree_rcu(p, rcu);
+}
+EXPORT_SYMBOL_GPL(pnfs_free_commit_array);
+
+static struct pnfs_commit_array *
+pnfs_find_commit_array_by_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array;
+
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ if (array->lseg == lseg)
+ return array;
+ }
+ return NULL;
+}
+
+struct pnfs_commit_array *
+pnfs_add_commit_array(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_commit_array *new,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array;
+
+ array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+ if (array)
+ return array;
+ new->lseg = lseg;
+ refcount_set(&new->refcount, 1);
+ list_add_rcu(&new->cinfo_list, &fl_cinfo->commits);
+ list_add(&new->lseg_list, &lseg->pls_commits);
+ return new;
+}
+EXPORT_SYMBOL_GPL(pnfs_add_commit_array);
+
+static struct pnfs_commit_array *
+pnfs_lookup_commit_array(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array;
+
+ rcu_read_lock();
+ array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+ if (!array) {
+ rcu_read_unlock();
+ fl_cinfo->ops->setup_ds_info(fl_cinfo, lseg);
+ rcu_read_lock();
+ array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+ }
+ rcu_read_unlock();
+ return array;
+}
+
+static void
+pnfs_release_commit_array_locked(struct pnfs_commit_array *array)
+{
+ list_del_rcu(&array->cinfo_list);
+ list_del(&array->lseg_list);
+ pnfs_free_commit_array(array);
+}
+
+static void
+pnfs_put_commit_array_locked(struct pnfs_commit_array *array)
+{
+ if (refcount_dec_and_test(&array->refcount))
+ pnfs_release_commit_array_locked(array);
+}
+
+static void
+pnfs_put_commit_array(struct pnfs_commit_array *array, struct inode *inode)
+{
+ if (refcount_dec_and_lock(&array->refcount, &inode->i_lock)) {
+ pnfs_release_commit_array_locked(array);
+ spin_unlock(&inode->i_lock);
+ }
+}
+
+static struct pnfs_commit_array *
+pnfs_get_commit_array(struct pnfs_commit_array *array)
+{
+ if (refcount_inc_not_zero(&array->refcount))
+ return array;
+ return NULL;
+}
+
+static void
+pnfs_remove_and_free_commit_array(struct pnfs_commit_array *array)
+{
+ array->lseg = NULL;
+ list_del_init(&array->lseg_list);
+ pnfs_put_commit_array_locked(array);
+}
+
+void
+pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array, *tmp;
+
+ list_for_each_entry_safe(array, tmp, &lseg->pls_commits, lseg_list)
+ pnfs_remove_and_free_commit_array(array);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_release_lseg);
+
+void
+pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo)
+{
+ struct pnfs_commit_array *array, *tmp;
+
+ list_for_each_entry_safe(array, tmp, &fl_cinfo->commits, cinfo_list)
+ pnfs_remove_and_free_commit_array(array);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_destroy);
+
+/*
+ * Locks the nfs_page requests for commit and moves them to
+ * @bucket->committing.
+ */
+static int
+pnfs_bucket_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
+ struct nfs_commit_info *cinfo,
+ int max)
+{
+ struct list_head *src = &bucket->written;
+ struct list_head *dst = &bucket->committing;
+ int ret;
+
+ lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
+ ret = nfs_scan_commit_list(src, dst, cinfo, max);
+ if (ret) {
+ cinfo->ds->nwritten -= ret;
+ cinfo->ds->ncommitting += ret;
+ }
+ return ret;
+}
+
+static int pnfs_bucket_scan_array(struct nfs_commit_info *cinfo,
+ struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ int max)
+{
+ unsigned int i;
+ int rv = 0, cnt;
+
+ for (i = 0; i < nbuckets && max != 0; i++) {
+ cnt = pnfs_bucket_scan_ds_commit_list(&buckets[i], cinfo, max);
+ rv += cnt;
+ max -= cnt;
+ }
+ return rv;
+}
+
+/* Move reqs from written to committing lists, returning count
+ * of number moved.
+ */
+int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_array *array;
+ int rv = 0, cnt;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ if (!array->lseg || !pnfs_get_commit_array(array))
+ continue;
+ rcu_read_unlock();
+ cnt = pnfs_bucket_scan_array(cinfo, array->buckets,
+ array->nbuckets, max);
+ rcu_read_lock();
+ pnfs_put_commit_array(array, cinfo->inode);
+ rv += cnt;
+ max -= cnt;
+ if (!max)
+ break;
+ }
+ rcu_read_unlock();
+ return rv;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists);
+
+static unsigned int
+pnfs_bucket_recover_commit_reqs(struct list_head *dst,
+ struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_commit_bucket *b;
+ struct pnfs_layout_segment *freeme;
+ unsigned int nwritten, ret = 0;
+ unsigned int i;
+
+restart:
+ for (i = 0, b = buckets; i < nbuckets; i++, b++) {
+ nwritten = nfs_scan_commit_list(&b->written, dst, cinfo, 0);
+ if (!nwritten)
+ continue;
+ ret += nwritten;
+ freeme = pnfs_free_bucket_lseg(b);
+ if (freeme) {
+ pnfs_put_lseg(freeme);
+ goto restart;
+ }
+ }
+ return ret;
+}
+
+/* Pull everything off the committing lists and dump into @dst. */
+void pnfs_generic_recover_commit_reqs(struct list_head *dst,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_array *array;
+ unsigned int nwritten;
+
+ lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ if (!array->lseg || !pnfs_get_commit_array(array))
+ continue;
+ rcu_read_unlock();
+ nwritten = pnfs_bucket_recover_commit_reqs(dst,
+ array->buckets,
+ array->nbuckets,
+ cinfo);
+ rcu_read_lock();
+ pnfs_put_commit_array(array, cinfo->inode);
+ fl_cinfo->nwritten -= nwritten;
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs);
+
+static struct nfs_page *
+pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets, struct folio *folio)
+{
+ struct nfs_page *req;
+ struct pnfs_commit_bucket *b;
+ unsigned int i;
+
+ /* Linearly search the commit lists for each bucket until a matching
+ * request is found */
+ for (i = 0, b = buckets; i < nbuckets; i++, b++) {
+ list_for_each_entry(req, &b->written, wb_list) {
+ if (nfs_page_to_folio(req) == folio)
+ return req->wb_head;
+ }
+ list_for_each_entry(req, &b->committing, wb_list) {
+ if (nfs_page_to_folio(req) == folio)
+ return req->wb_head;
+ }
+ }
+ return NULL;
+}
+
+/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head request
+ * for @folio
+ * @cinfo - commit info for current inode
+ * @folio - page to search for matching head request
+ *
+ * Return: the head request if one is found, otherwise %NULL.
+ */
+struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo,
+ struct folio *folio)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_array *array;
+ struct nfs_page *req;
+
+ list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) {
+ req = pnfs_bucket_search_commit_reqs(array->buckets,
+ array->nbuckets, folio);
+ if (req)
+ return req;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_search_commit_reqs);
+
+static struct pnfs_layout_segment *
+pnfs_bucket_get_committing(struct list_head *head,
+ struct pnfs_commit_bucket *bucket,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_layout_segment *lseg;
+ struct list_head *pos;
+
+ list_for_each(pos, &bucket->committing)
+ cinfo->ds->ncommitting--;
+ list_splice_init(&bucket->committing, head);
+ lseg = pnfs_free_bucket_lseg(bucket);
+ if (!lseg)
+ lseg = pnfs_get_lseg(bucket->lseg);
+ return lseg;
+}
+
+static struct nfs_commit_data *
+pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket,
+ struct nfs_commit_info *cinfo)
+{
+ struct nfs_commit_data *data = nfs_commitdata_alloc();
+
+ if (!data)
+ return NULL;
+ data->lseg = pnfs_bucket_get_committing(&data->pages, bucket, cinfo);
+ return data;
+}
+
+static void pnfs_generic_retry_commit(struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ struct nfs_commit_info *cinfo,
+ unsigned int idx)
+{
+ struct pnfs_commit_bucket *bucket;
+ struct pnfs_layout_segment *freeme;
+ LIST_HEAD(pages);
+
+ for (bucket = buckets; idx < nbuckets; bucket++, idx++) {
+ if (list_empty(&bucket->committing))
+ continue;
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+ freeme = pnfs_bucket_get_committing(&pages, bucket, cinfo);
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+ nfs_retry_commit(&pages, freeme, cinfo, idx);
+ pnfs_put_lseg(freeme);
+ }
+}
+
+static unsigned int
+pnfs_bucket_alloc_ds_commits(struct list_head *list,
+ struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_commit_bucket *bucket;
+ struct nfs_commit_data *data;
+ unsigned int i;
+ unsigned int nreq = 0;
+
+ for (i = 0, bucket = buckets; i < nbuckets; i++, bucket++) {
+ if (list_empty(&bucket->committing))
+ continue;
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+ if (!list_empty(&bucket->committing)) {
+ data = pnfs_bucket_fetch_commitdata(bucket, cinfo);
+ if (!data)
+ goto out_error;
+ data->ds_commit_index = i;
+ list_add_tail(&data->list, list);
+ nreq++;
+ }
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+ }
+ return nreq;
+out_error:
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+ /* Clean up on error */
+ pnfs_generic_retry_commit(buckets, nbuckets, cinfo, i);
+ return nreq;
+}
+
+static unsigned int
+pnfs_alloc_ds_commits_list(struct list_head *list,
+ struct pnfs_ds_commit_info *fl_cinfo,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_commit_array *array;
+ unsigned int ret = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ if (!array->lseg || !pnfs_get_commit_array(array))
+ continue;
+ rcu_read_unlock();
+ ret += pnfs_bucket_alloc_ds_commits(list, array->buckets,
+ array->nbuckets, cinfo);
+ rcu_read_lock();
+ pnfs_put_commit_array(array, cinfo->inode);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/* This follows nfs_commit_list pretty closely */
+int
+pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
+ int how, struct nfs_commit_info *cinfo,
+ int (*initiate_commit)(struct nfs_commit_data *data,
+ int how))
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct nfs_commit_data *data, *tmp;
+ LIST_HEAD(list);
+ unsigned int nreq = 0;
+
+ if (!list_empty(mds_pages)) {
+ data = nfs_commitdata_alloc();
+ if (!data) {
+ nfs_retry_commit(mds_pages, NULL, cinfo, -1);
+ return -ENOMEM;
+ }
+ data->ds_commit_index = -1;
+ list_splice_init(mds_pages, &data->pages);
+ list_add_tail(&data->list, &list);
+ nreq++;
+ }
+
+ nreq += pnfs_alloc_ds_commits_list(&list, fl_cinfo, cinfo);
+ if (nreq == 0)
+ goto out;
+
+ list_for_each_entry_safe(data, tmp, &list, list) {
+ list_del(&data->list);
+ if (data->ds_commit_index < 0) {
+ nfs_init_commit(data, NULL, NULL, cinfo);
+ nfs_initiate_commit(NFS_CLIENT(inode), data,
+ NFS_PROTO(data->inode),
+ data->mds_ops, how,
+ RPC_TASK_CRED_NOREF);
+ } else {
+ nfs_init_commit(data, NULL, data->lseg, cinfo);
+ initiate_commit(data, how);
+ }
+ }
+out:
+ return PNFS_ATTEMPTED;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist);
+
+/*
+ * Data server cache
+ *
+ * Data servers can be mapped to different device ids.
+ * nfs4_pnfs_ds reference counting
+ * - set to 1 on allocation
+ * - incremented when a device id maps a data server already in the cache.
+ * - decremented when deviceid is removed from the cache.
+ */
+static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
+static LIST_HEAD(nfs4_data_server_cache);
+
+/* Debug routines */
+static void
+print_ds(struct nfs4_pnfs_ds *ds)
+{
+ if (ds == NULL) {
+ printk(KERN_WARNING "%s NULL device\n", __func__);
+ return;
+ }
+ printk(KERN_WARNING " ds %s\n"
+ " ref count %d\n"
+ " client %p\n"
+ " cl_exchange_flags %x\n",
+ ds->ds_remotestr,
+ refcount_read(&ds->ds_count), ds->ds_clp,
+ ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
+}
+
+static bool
+same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
+{
+ struct sockaddr_in *a, *b;
+ struct sockaddr_in6 *a6, *b6;
+
+ if (addr1->sa_family != addr2->sa_family)
+ return false;
+
+ switch (addr1->sa_family) {
+ case AF_INET:
+ a = (struct sockaddr_in *)addr1;
+ b = (struct sockaddr_in *)addr2;
+
+ if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
+ a->sin_port == b->sin_port)
+ return true;
+ break;
+
+ case AF_INET6:
+ a6 = (struct sockaddr_in6 *)addr1;
+ b6 = (struct sockaddr_in6 *)addr2;
+
+ /* LINKLOCAL addresses must have matching scope_id */
+ if (ipv6_addr_src_scope(&a6->sin6_addr) ==
+ IPV6_ADDR_SCOPE_LINKLOCAL &&
+ a6->sin6_scope_id != b6->sin6_scope_id)
+ return false;
+
+ if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
+ a6->sin6_port == b6->sin6_port)
+ return true;
+ break;
+
+ default:
+ dprintk("%s: unhandled address family: %u\n",
+ __func__, addr1->sa_family);
+ return false;
+ }
+
+ return false;
+}
+
+/*
+ * Checks if 'dsaddrs1' contains a subset of 'dsaddrs2'. If it does,
+ * declare a match.
+ */
+static bool
+_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
+ const struct list_head *dsaddrs2)
+{
+ struct nfs4_pnfs_ds_addr *da1, *da2;
+ struct sockaddr *sa1, *sa2;
+ bool match = false;
+
+ list_for_each_entry(da1, dsaddrs1, da_node) {
+ sa1 = (struct sockaddr *)&da1->da_addr;
+ match = false;
+ list_for_each_entry(da2, dsaddrs2, da_node) {
+ sa2 = (struct sockaddr *)&da2->da_addr;
+ match = same_sockaddr(sa1, sa2);
+ if (match)
+ break;
+ }
+ if (!match)
+ break;
+ }
+ return match;
+}
+
+/*
+ * Lookup DS by addresses. nfs4_ds_cache_lock is held
+ */
+static struct nfs4_pnfs_ds *
+_data_server_lookup_locked(const struct list_head *dsaddrs)
+{
+ struct nfs4_pnfs_ds *ds;
+
+ list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
+ if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
+ return ds;
+ return NULL;
+}
+
+static struct nfs4_pnfs_ds_addr *nfs4_pnfs_ds_addr_alloc(gfp_t gfp_flags)
+{
+ struct nfs4_pnfs_ds_addr *da = kzalloc(sizeof(*da), gfp_flags);
+ if (da)
+ INIT_LIST_HEAD(&da->da_node);
+ return da;
+}
+
+static void nfs4_pnfs_ds_addr_free(struct nfs4_pnfs_ds_addr *da)
+{
+ kfree(da->da_remotestr);
+ kfree(da->da_netid);
+ kfree(da);
+}
+
+static void destroy_ds(struct nfs4_pnfs_ds *ds)
+{
+ struct nfs4_pnfs_ds_addr *da;
+
+ dprintk("--> %s\n", __func__);
+ ifdebug(FACILITY)
+ print_ds(ds);
+
+ nfs_put_client(ds->ds_clp);
+
+ while (!list_empty(&ds->ds_addrs)) {
+ da = list_first_entry(&ds->ds_addrs,
+ struct nfs4_pnfs_ds_addr,
+ da_node);
+ list_del_init(&da->da_node);
+ nfs4_pnfs_ds_addr_free(da);
+ }
+
+ kfree(ds->ds_remotestr);
+ kfree(ds);
+}
+
+void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds)
+{
+ if (refcount_dec_and_lock(&ds->ds_count,
+ &nfs4_ds_cache_lock)) {
+ list_del_init(&ds->ds_node);
+ spin_unlock(&nfs4_ds_cache_lock);
+ destroy_ds(ds);
+ }
+}
+EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_put);
+
+/*
+ * Create a string with a human readable address and port to avoid
+ * complicated setup around many dprinks.
+ */
+static char *
+nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
+{
+ struct nfs4_pnfs_ds_addr *da;
+ char *remotestr;
+ size_t len;
+ char *p;
+
+ len = 3; /* '{', '}' and eol */
+ list_for_each_entry(da, dsaddrs, da_node) {
+ len += strlen(da->da_remotestr) + 1; /* string plus comma */
+ }
+
+ remotestr = kzalloc(len, gfp_flags);
+ if (!remotestr)
+ return NULL;
+
+ p = remotestr;
+ *(p++) = '{';
+ len--;
+ list_for_each_entry(da, dsaddrs, da_node) {
+ size_t ll = strlen(da->da_remotestr);
+
+ if (ll > len)
+ goto out_err;
+
+ memcpy(p, da->da_remotestr, ll);
+ p += ll;
+ len -= ll;
+
+ if (len < 1)
+ goto out_err;
+ (*p++) = ',';
+ len--;
+ }
+ if (len < 2)
+ goto out_err;
+ *(p++) = '}';
+ *p = '\0';
+ return remotestr;
+out_err:
+ kfree(remotestr);
+ return NULL;
+}
+
+/*
+ * Given a list of multipath struct nfs4_pnfs_ds_addr, add it to ds cache if
+ * uncached and return cached struct nfs4_pnfs_ds.
+ */
+struct nfs4_pnfs_ds *
+nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
+{
+ struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
+ char *remotestr;
+
+ if (list_empty(dsaddrs)) {
+ dprintk("%s: no addresses defined\n", __func__);
+ goto out;
+ }
+
+ ds = kzalloc(sizeof(*ds), gfp_flags);
+ if (!ds)
+ goto out;
+
+ /* this is only used for debugging, so it's ok if its NULL */
+ remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
+
+ spin_lock(&nfs4_ds_cache_lock);
+ tmp_ds = _data_server_lookup_locked(dsaddrs);
+ if (tmp_ds == NULL) {
+ INIT_LIST_HEAD(&ds->ds_addrs);
+ list_splice_init(dsaddrs, &ds->ds_addrs);
+ ds->ds_remotestr = remotestr;
+ refcount_set(&ds->ds_count, 1);
+ INIT_LIST_HEAD(&ds->ds_node);
+ ds->ds_clp = NULL;
+ list_add(&ds->ds_node, &nfs4_data_server_cache);
+ dprintk("%s add new data server %s\n", __func__,
+ ds->ds_remotestr);
+ } else {
+ kfree(remotestr);
+ kfree(ds);
+ refcount_inc(&tmp_ds->ds_count);
+ dprintk("%s data server %s found, inc'ed ds_count to %d\n",
+ __func__, tmp_ds->ds_remotestr,
+ refcount_read(&tmp_ds->ds_count));
+ ds = tmp_ds;
+ }
+ spin_unlock(&nfs4_ds_cache_lock);
+out:
+ return ds;
+}
+EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add);
+
+static int nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
+{
+ might_sleep();
+ return wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, TASK_KILLABLE);
+}
+
+static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
+{
+ smp_mb__before_atomic();
+ clear_and_wake_up_bit(NFS4DS_CONNECTING, &ds->ds_state);
+}
+
+static struct nfs_client *(*get_v3_ds_connect)(
+ struct nfs_server *mds_srv,
+ const struct sockaddr_storage *ds_addr,
+ int ds_addrlen,
+ int ds_proto,
+ unsigned int ds_timeo,
+ unsigned int ds_retrans);
+
+static bool load_v3_ds_connect(void)
+{
+ if (!get_v3_ds_connect) {
+ get_v3_ds_connect = symbol_request(nfs3_set_ds_client);
+ WARN_ON_ONCE(!get_v3_ds_connect);
+ }
+
+ return(get_v3_ds_connect != NULL);
+}
+
+void nfs4_pnfs_v3_ds_connect_unload(void)
+{
+ if (get_v3_ds_connect) {
+ symbol_put(nfs3_set_ds_client);
+ get_v3_ds_connect = NULL;
+ }
+}
+
+static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
+ struct nfs4_pnfs_ds *ds,
+ unsigned int timeo,
+ unsigned int retrans)
+{
+ struct nfs_client *clp = ERR_PTR(-EIO);
+ struct nfs4_pnfs_ds_addr *da;
+ unsigned long connect_timeout = timeo * (retrans + 1) * HZ / 10;
+ int status = 0;
+
+ dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr);
+
+ if (!load_v3_ds_connect())
+ return -EPROTONOSUPPORT;
+
+ list_for_each_entry(da, &ds->ds_addrs, da_node) {
+ dprintk("%s: DS %s: trying address %s\n",
+ __func__, ds->ds_remotestr, da->da_remotestr);
+
+ if (!IS_ERR(clp)) {
+ struct xprt_create xprt_args = {
+ .ident = da->da_transport,
+ .net = clp->cl_net,
+ .dstaddr = (struct sockaddr *)&da->da_addr,
+ .addrlen = da->da_addrlen,
+ .servername = clp->cl_hostname,
+ .connect_timeout = connect_timeout,
+ .reconnect_timeout = connect_timeout,
+ };
+
+ if (da->da_transport != clp->cl_proto)
+ continue;
+ if (da->da_addr.ss_family != clp->cl_addr.ss_family)
+ continue;
+ /* Add this address as an alias */
+ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
+ rpc_clnt_test_and_add_xprt, NULL);
+ continue;
+ }
+ clp = get_v3_ds_connect(mds_srv,
+ &da->da_addr,
+ da->da_addrlen, da->da_transport,
+ timeo, retrans);
+ if (IS_ERR(clp))
+ continue;
+ clp->cl_rpcclient->cl_softerr = 0;
+ clp->cl_rpcclient->cl_softrtry = 0;
+ }
+
+ if (IS_ERR(clp)) {
+ status = PTR_ERR(clp);
+ goto out;
+ }
+
+ smp_wmb();
+ WRITE_ONCE(ds->ds_clp, clp);
+ dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
+out:
+ return status;
+}
+
+static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
+ struct nfs4_pnfs_ds *ds,
+ unsigned int timeo,
+ unsigned int retrans,
+ u32 minor_version)
+{
+ struct nfs_client *clp = ERR_PTR(-EIO);
+ struct nfs4_pnfs_ds_addr *da;
+ int status = 0;
+
+ dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr);
+
+ list_for_each_entry(da, &ds->ds_addrs, da_node) {
+ dprintk("%s: DS %s: trying address %s\n",
+ __func__, ds->ds_remotestr, da->da_remotestr);
+
+ if (!IS_ERR(clp) && clp->cl_mvops->session_trunk) {
+ struct xprt_create xprt_args = {
+ .ident = da->da_transport,
+ .net = clp->cl_net,
+ .dstaddr = (struct sockaddr *)&da->da_addr,
+ .addrlen = da->da_addrlen,
+ .servername = clp->cl_hostname,
+ };
+ struct nfs4_add_xprt_data xprtdata = {
+ .clp = clp,
+ };
+ struct rpc_add_xprt_test rpcdata = {
+ .add_xprt_test = clp->cl_mvops->session_trunk,
+ .data = &xprtdata,
+ };
+
+ if (da->da_transport != clp->cl_proto)
+ continue;
+ if (da->da_addr.ss_family != clp->cl_addr.ss_family)
+ continue;
+ /**
+ * Test this address for session trunking and
+ * add as an alias
+ */
+ xprtdata.cred = nfs4_get_clid_cred(clp);
+ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
+ rpc_clnt_setup_test_and_add_xprt,
+ &rpcdata);
+ if (xprtdata.cred)
+ put_cred(xprtdata.cred);
+ } else {
+ clp = nfs4_set_ds_client(mds_srv,
+ &da->da_addr,
+ da->da_addrlen,
+ da->da_transport, timeo,
+ retrans, minor_version);
+ if (IS_ERR(clp))
+ continue;
+
+ status = nfs4_init_ds_session(clp,
+ mds_srv->nfs_client->cl_lease_time);
+ if (status) {
+ nfs_put_client(clp);
+ clp = ERR_PTR(-EIO);
+ continue;
+ }
+
+ }
+ }
+
+ if (IS_ERR(clp)) {
+ status = PTR_ERR(clp);
+ goto out;
+ }
+
+ smp_wmb();
+ WRITE_ONCE(ds->ds_clp, clp);
+ dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
+out:
+ return status;
+}
+
+/*
+ * Create an rpc connection to the nfs4_pnfs_ds data server.
+ * Currently only supports IPv4 and IPv6 addresses.
+ * If connection fails, make devid unavailable and return a -errno.
+ */
+int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
+ struct nfs4_deviceid_node *devid, unsigned int timeo,
+ unsigned int retrans, u32 version, u32 minor_version)
+{
+ int err;
+
+ do {
+ err = nfs4_wait_ds_connect(ds);
+ if (err || ds->ds_clp)
+ goto out;
+ if (nfs4_test_deviceid_unavailable(devid))
+ return -ENODEV;
+ } while (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) != 0);
+
+ if (ds->ds_clp)
+ goto connect_done;
+
+ switch (version) {
+ case 3:
+ err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, retrans);
+ break;
+ case 4:
+ err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo, retrans,
+ minor_version);
+ break;
+ default:
+ dprintk("%s: unsupported DS version %d\n", __func__, version);
+ err = -EPROTONOSUPPORT;
+ }
+
+connect_done:
+ nfs4_clear_ds_conn_bit(ds);
+out:
+ /*
+ * At this point the ds->ds_clp should be ready, but it might have
+ * hit an error.
+ */
+ if (!err) {
+ if (!ds->ds_clp || !nfs_client_init_is_complete(ds->ds_clp)) {
+ WARN_ON_ONCE(ds->ds_clp ||
+ !nfs4_test_deviceid_unavailable(devid));
+ return -EINVAL;
+ }
+ err = nfs_client_init_status(ds->ds_clp);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect);
+
+/*
+ * Currently only supports ipv4, ipv6 and one multi-path address.
+ */
+struct nfs4_pnfs_ds_addr *
+nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags)
+{
+ struct nfs4_pnfs_ds_addr *da = NULL;
+ char *buf, *portstr;
+ __be16 port;
+ ssize_t nlen, rlen;
+ int tmp[2];
+ char *netid;
+ size_t len;
+ char *startsep = "";
+ char *endsep = "";
+
+
+ /* r_netid */
+ nlen = xdr_stream_decode_string_dup(xdr, &netid, XDR_MAX_NETOBJ,
+ gfp_flags);
+ if (unlikely(nlen < 0))
+ goto out_err;
+
+ /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
+ /* port is ".ABC.DEF", 8 chars max */
+ rlen = xdr_stream_decode_string_dup(xdr, &buf, INET6_ADDRSTRLEN +
+ IPV6_SCOPE_ID_LEN + 8, gfp_flags);
+ if (unlikely(rlen < 0))
+ goto out_free_netid;
+
+ /* replace port '.' with '-' */
+ portstr = strrchr(buf, '.');
+ if (!portstr) {
+ dprintk("%s: Failed finding expected dot in port\n",
+ __func__);
+ goto out_free_buf;
+ }
+ *portstr = '-';
+
+ /* find '.' between address and port */
+ portstr = strrchr(buf, '.');
+ if (!portstr) {
+ dprintk("%s: Failed finding expected dot between address and "
+ "port\n", __func__);
+ goto out_free_buf;
+ }
+ *portstr = '\0';
+
+ da = nfs4_pnfs_ds_addr_alloc(gfp_flags);
+ if (unlikely(!da))
+ goto out_free_buf;
+
+ if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
+ sizeof(da->da_addr))) {
+ dprintk("%s: error parsing address %s\n", __func__, buf);
+ goto out_free_da;
+ }
+
+ portstr++;
+ sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
+ port = htons((tmp[0] << 8) | (tmp[1]));
+
+ switch (da->da_addr.ss_family) {
+ case AF_INET:
+ ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
+ da->da_addrlen = sizeof(struct sockaddr_in);
+ break;
+
+ case AF_INET6:
+ ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
+ da->da_addrlen = sizeof(struct sockaddr_in6);
+ startsep = "[";
+ endsep = "]";
+ break;
+
+ default:
+ dprintk("%s: unsupported address family: %u\n",
+ __func__, da->da_addr.ss_family);
+ goto out_free_da;
+ }
+
+ da->da_transport = xprt_find_transport_ident(netid);
+ if (da->da_transport < 0) {
+ dprintk("%s: ERROR: unknown r_netid \"%s\"\n",
+ __func__, netid);
+ goto out_free_da;
+ }
+
+ da->da_netid = netid;
+
+ /* save human readable address */
+ len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
+ da->da_remotestr = kzalloc(len, gfp_flags);
+
+ /* NULL is ok, only used for dprintk */
+ if (da->da_remotestr)
+ snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
+ buf, endsep, ntohs(port));
+
+ dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
+ kfree(buf);
+ return da;
+
+out_free_da:
+ kfree(da);
+out_free_buf:
+ dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
+ kfree(buf);
+out_free_netid:
+ kfree(netid);
+out_err:
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nfs4_decode_mp_ds_addr);
+
+void
+pnfs_layout_mark_request_commit(struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx)
+{
+ struct list_head *list;
+ struct pnfs_commit_array *array;
+ struct pnfs_commit_bucket *bucket;
+
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+ array = pnfs_lookup_commit_array(cinfo->ds, lseg);
+ if (!array || !pnfs_is_valid_lseg(lseg))
+ goto out_resched;
+ bucket = &array->buckets[ds_commit_idx];
+ list = &bucket->written;
+ /* Non-empty buckets hold a reference on the lseg. That ref
+ * is normally transferred to the COMMIT call and released
+ * there. It could also be released if the last req is pulled
+ * off due to a rewrite, in which case it will be done in
+ * pnfs_common_clear_request_commit
+ */
+ if (!bucket->lseg)
+ bucket->lseg = pnfs_get_lseg(lseg);
+ set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+ cinfo->ds->nwritten++;
+
+ nfs_request_add_commit_list_locked(req, list, cinfo);
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+ nfs_folio_mark_unstable(nfs_page_to_folio(req), cinfo);
+ return;
+out_resched:
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+ cinfo->completion_ops->resched_write(cinfo, req);
+}
+EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
+
+int
+pnfs_nfs_generic_sync(struct inode *inode, bool datasync)
+{
+ int ret;
+
+ if (!pnfs_layoutcommit_outstanding(inode))
+ return 0;
+ ret = nfs_commit_inode(inode, FLUSH_SYNC);
+ if (ret < 0)
+ return ret;
+ if (datasync)
+ return 0;
+ return pnfs_layoutcommit_inode(inode, true);
+}
+EXPORT_SYMBOL_GPL(pnfs_nfs_generic_sync);
+
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
new file mode 100644
index 0000000000..e3570c656b
--- /dev/null
+++ b/fs/nfs/proc.c
@@ -0,0 +1,763 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/proc.c
+ *
+ * Copyright (C) 1992, 1993, 1994 Rick Sladkey
+ *
+ * OS-independent nfs remote procedure call functions
+ *
+ * Tuned by Alan Cox <A.Cox@swansea.ac.uk> for >3K buffers
+ * so at last we can have decent(ish) throughput off a
+ * Sun server.
+ *
+ * Coding optimized and cleaned up by Florian La Roche.
+ * Note: Error returns are optimized for NFS_OK, which isn't translated via
+ * nfs_stat_to_errno(), but happens to be already the right return code.
+ *
+ * Also, the code currently doesn't check the size of the packet, when
+ * it decodes the packet.
+ *
+ * Feel free to fix it and mail me the diffs if it worries you.
+ *
+ * Completely rewritten to support the new RPC call interface;
+ * rewrote and moved the entire XDR stuff to xdr.c
+ * --Olaf Kirch June 1996
+ *
+ * The code below initializes all auto variables explicitly, otherwise
+ * it will fail to work as a module (gcc generates a memset call for an
+ * incomplete struct).
+ */
+
+#include <linux/types.h>
+#include <linux/param.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/pagemap.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs2.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/lockd/bind.h>
+#include <linux/freezer.h>
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_PROC
+
+/*
+ * Bare-bones access to getattr: this is for nfs_read_super.
+ */
+static int
+nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info)
+{
+ struct nfs_fattr *fattr = info->fattr;
+ struct nfs2_fsstat fsinfo;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs_procedures[NFSPROC_GETATTR],
+ .rpc_argp = fhandle,
+ .rpc_resp = fattr,
+ };
+ int status;
+
+ dprintk("%s: call getattr\n", __func__);
+ nfs_fattr_init(fattr);
+ status = rpc_call_sync(server->client, &msg, 0);
+ /* Retry with default authentication if different */
+ if (status && server->nfs_client->cl_rpcclient != server->client)
+ status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
+ dprintk("%s: reply getattr: %d\n", __func__, status);
+ if (status)
+ return status;
+ dprintk("%s: call statfs\n", __func__);
+ msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS];
+ msg.rpc_resp = &fsinfo;
+ status = rpc_call_sync(server->client, &msg, 0);
+ /* Retry with default authentication if different */
+ if (status && server->nfs_client->cl_rpcclient != server->client)
+ status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
+ dprintk("%s: reply statfs: %d\n", __func__, status);
+ if (status)
+ return status;
+ info->rtmax = NFS_MAXDATA;
+ info->rtpref = fsinfo.tsize;
+ info->rtmult = fsinfo.bsize;
+ info->wtmax = NFS_MAXDATA;
+ info->wtpref = fsinfo.tsize;
+ info->wtmult = fsinfo.bsize;
+ info->dtpref = fsinfo.tsize;
+ info->maxfilesize = 0x7FFFFFFF;
+ info->lease_time = 0;
+ info->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+ info->xattr_support = 0;
+ return 0;
+}
+
+/*
+ * One function for each procedure in the NFS protocol.
+ */
+static int
+nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct inode *inode)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs_procedures[NFSPROC_GETATTR],
+ .rpc_argp = fhandle,
+ .rpc_resp = fattr,
+ };
+ int status;
+ unsigned short task_flags = 0;
+
+ /* Is this is an attribute revalidation, subject to softreval? */
+ if (inode && (server->flags & NFS_MOUNT_SOFTREVAL))
+ task_flags |= RPC_TASK_TIMEOUT;
+
+ dprintk("NFS call getattr\n");
+ nfs_fattr_init(fattr);
+ status = rpc_call_sync(server->client, &msg, task_flags);
+ dprintk("NFS reply getattr: %d\n", status);
+ return status;
+}
+
+static int
+nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
+ struct iattr *sattr)
+{
+ struct inode *inode = d_inode(dentry);
+ struct nfs_sattrargs arg = {
+ .fh = NFS_FH(inode),
+ .sattr = sattr
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs_procedures[NFSPROC_SETATTR],
+ .rpc_argp = &arg,
+ .rpc_resp = fattr,
+ };
+ int status;
+
+ /* Mask out the non-modebit related stuff from attr->ia_mode */
+ sattr->ia_mode &= S_IALLUGO;
+
+ dprintk("NFS call setattr\n");
+ if (sattr->ia_valid & ATTR_FILE)
+ msg.rpc_cred = nfs_file_cred(sattr->ia_file);
+ nfs_fattr_init(fattr);
+ status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+ if (status == 0)
+ nfs_setattr_update_inode(inode, sattr, fattr);
+ dprintk("NFS reply setattr: %d\n", status);
+ return status;
+}
+
+static int
+nfs_proc_lookup(struct inode *dir, struct dentry *dentry,
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+{
+ struct nfs_diropargs arg = {
+ .fh = NFS_FH(dir),
+ .name = dentry->d_name.name,
+ .len = dentry->d_name.len
+ };
+ struct nfs_diropok res = {
+ .fh = fhandle,
+ .fattr = fattr
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs_procedures[NFSPROC_LOOKUP],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ int status;
+ unsigned short task_flags = 0;
+
+ /* Is this is an attribute revalidation, subject to softreval? */
+ if (nfs_lookup_is_soft_revalidate(dentry))
+ task_flags |= RPC_TASK_TIMEOUT;
+
+ dprintk("NFS call lookup %pd2\n", dentry);
+ nfs_fattr_init(fattr);
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, task_flags);
+ dprintk("NFS reply lookup: %d\n", status);
+ return status;
+}
+
+static int nfs_proc_readlink(struct inode *inode, struct page *page,
+ unsigned int pgbase, unsigned int pglen)
+{
+ struct nfs_readlinkargs args = {
+ .fh = NFS_FH(inode),
+ .pgbase = pgbase,
+ .pglen = pglen,
+ .pages = &page
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs_procedures[NFSPROC_READLINK],
+ .rpc_argp = &args,
+ };
+ int status;
+
+ dprintk("NFS call readlink\n");
+ status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+ dprintk("NFS reply readlink: %d\n", status);
+ return status;
+}
+
+struct nfs_createdata {
+ struct nfs_createargs arg;
+ struct nfs_diropok res;
+ struct nfs_fh fhandle;
+ struct nfs_fattr fattr;
+};
+
+static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir,
+ struct dentry *dentry, struct iattr *sattr)
+{
+ struct nfs_createdata *data;
+
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
+
+ if (data != NULL) {
+ data->arg.fh = NFS_FH(dir);
+ data->arg.name = dentry->d_name.name;
+ data->arg.len = dentry->d_name.len;
+ data->arg.sattr = sattr;
+ nfs_fattr_init(&data->fattr);
+ data->fhandle.size = 0;
+ data->res.fh = &data->fhandle;
+ data->res.fattr = &data->fattr;
+ }
+ return data;
+};
+
+static void nfs_free_createdata(const struct nfs_createdata *data)
+{
+ kfree(data);
+}
+
+static int
+nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+ int flags)
+{
+ struct nfs_createdata *data;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs_procedures[NFSPROC_CREATE],
+ };
+ int status = -ENOMEM;
+
+ dprintk("NFS call create %pd\n", dentry);
+ data = nfs_alloc_createdata(dir, dentry, sattr);
+ if (data == NULL)
+ goto out;
+ msg.rpc_argp = &data->arg;
+ msg.rpc_resp = &data->res;
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+ nfs_mark_for_revalidate(dir);
+ if (status == 0)
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ nfs_free_createdata(data);
+out:
+ dprintk("NFS reply create: %d\n", status);
+ return status;
+}
+
+/*
+ * In NFSv2, mknod is grafted onto the create call.
+ */
+static int
+nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+ dev_t rdev)
+{
+ struct nfs_createdata *data;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs_procedures[NFSPROC_CREATE],
+ };
+ umode_t mode;
+ int status = -ENOMEM;
+
+ dprintk("NFS call mknod %pd\n", dentry);
+
+ mode = sattr->ia_mode;
+ if (S_ISFIFO(mode)) {
+ sattr->ia_mode = (mode & ~S_IFMT) | S_IFCHR;
+ sattr->ia_valid &= ~ATTR_SIZE;
+ } else if (S_ISCHR(mode) || S_ISBLK(mode)) {
+ sattr->ia_valid |= ATTR_SIZE;
+ sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */
+ }
+
+ data = nfs_alloc_createdata(dir, dentry, sattr);
+ if (data == NULL)
+ goto out;
+ msg.rpc_argp = &data->arg;
+ msg.rpc_resp = &data->res;
+
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+ nfs_mark_for_revalidate(dir);
+
+ if (status == -EINVAL && S_ISFIFO(mode)) {
+ sattr->ia_mode = mode;
+ nfs_fattr_init(data->res.fattr);
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+ }
+ if (status == 0)
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ nfs_free_createdata(data);
+out:
+ dprintk("NFS reply mknod: %d\n", status);
+ return status;
+}
+
+static int
+nfs_proc_remove(struct inode *dir, struct dentry *dentry)
+{
+ struct nfs_removeargs arg = {
+ .fh = NFS_FH(dir),
+ .name = dentry->d_name,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs_procedures[NFSPROC_REMOVE],
+ .rpc_argp = &arg,
+ };
+ int status;
+
+ dprintk("NFS call remove %pd2\n",dentry);
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+ nfs_mark_for_revalidate(dir);
+
+ dprintk("NFS reply remove: %d\n", status);
+ return status;
+}
+
+static void
+nfs_proc_unlink_setup(struct rpc_message *msg,
+ struct dentry *dentry,
+ struct inode *inode)
+{
+ msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE];
+}
+
+static void nfs_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
+{
+ rpc_call_start(task);
+}
+
+static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
+{
+ nfs_mark_for_revalidate(dir);
+ return 1;
+}
+
+static void
+nfs_proc_rename_setup(struct rpc_message *msg,
+ struct dentry *old_dentry,
+ struct dentry *new_dentry)
+{
+ msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME];
+}
+
+static void nfs_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
+{
+ rpc_call_start(task);
+}
+
+static int
+nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+ struct inode *new_dir)
+{
+ nfs_mark_for_revalidate(old_dir);
+ nfs_mark_for_revalidate(new_dir);
+ return 1;
+}
+
+static int
+nfs_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
+{
+ struct nfs_linkargs arg = {
+ .fromfh = NFS_FH(inode),
+ .tofh = NFS_FH(dir),
+ .toname = name->name,
+ .tolen = name->len
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs_procedures[NFSPROC_LINK],
+ .rpc_argp = &arg,
+ };
+ int status;
+
+ dprintk("NFS call link %s\n", name->name);
+ status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+ nfs_mark_for_revalidate(inode);
+ nfs_mark_for_revalidate(dir);
+ dprintk("NFS reply link: %d\n", status);
+ return status;
+}
+
+static int
+nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
+ unsigned int len, struct iattr *sattr)
+{
+ struct nfs_fh *fh;
+ struct nfs_fattr *fattr;
+ struct nfs_symlinkargs arg = {
+ .fromfh = NFS_FH(dir),
+ .fromname = dentry->d_name.name,
+ .fromlen = dentry->d_name.len,
+ .pages = &page,
+ .pathlen = len,
+ .sattr = sattr
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs_procedures[NFSPROC_SYMLINK],
+ .rpc_argp = &arg,
+ };
+ int status = -ENAMETOOLONG;
+
+ dprintk("NFS call symlink %pd\n", dentry);
+
+ if (len > NFS2_MAXPATHLEN)
+ goto out;
+
+ fh = nfs_alloc_fhandle();
+ fattr = nfs_alloc_fattr();
+ status = -ENOMEM;
+ if (fh == NULL || fattr == NULL)
+ goto out_free;
+
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+ nfs_mark_for_revalidate(dir);
+
+ /*
+ * V2 SYMLINK requests don't return any attributes. Setting the
+ * filehandle size to zero indicates to nfs_instantiate that it
+ * should fill in the data with a LOOKUP call on the wire.
+ */
+ if (status == 0)
+ status = nfs_instantiate(dentry, fh, fattr);
+
+out_free:
+ nfs_free_fattr(fattr);
+ nfs_free_fhandle(fh);
+out:
+ dprintk("NFS reply symlink: %d\n", status);
+ return status;
+}
+
+static int
+nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
+{
+ struct nfs_createdata *data;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs_procedures[NFSPROC_MKDIR],
+ };
+ int status = -ENOMEM;
+
+ dprintk("NFS call mkdir %pd\n", dentry);
+ data = nfs_alloc_createdata(dir, dentry, sattr);
+ if (data == NULL)
+ goto out;
+ msg.rpc_argp = &data->arg;
+ msg.rpc_resp = &data->res;
+
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+ nfs_mark_for_revalidate(dir);
+ if (status == 0)
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ nfs_free_createdata(data);
+out:
+ dprintk("NFS reply mkdir: %d\n", status);
+ return status;
+}
+
+static int
+nfs_proc_rmdir(struct inode *dir, const struct qstr *name)
+{
+ struct nfs_diropargs arg = {
+ .fh = NFS_FH(dir),
+ .name = name->name,
+ .len = name->len
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs_procedures[NFSPROC_RMDIR],
+ .rpc_argp = &arg,
+ };
+ int status;
+
+ dprintk("NFS call rmdir %s\n", name->name);
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+ nfs_mark_for_revalidate(dir);
+ dprintk("NFS reply rmdir: %d\n", status);
+ return status;
+}
+
+/*
+ * The READDIR implementation is somewhat hackish - we pass a temporary
+ * buffer to the encode function, which installs it in the receive
+ * the receive iovec. The decode function just parses the reply to make
+ * sure it is syntactically correct; the entries itself are decoded
+ * from nfs_readdir by calling the decode_entry function directly.
+ */
+static int nfs_proc_readdir(struct nfs_readdir_arg *nr_arg,
+ struct nfs_readdir_res *nr_res)
+{
+ struct inode *dir = d_inode(nr_arg->dentry);
+ struct nfs_readdirargs arg = {
+ .fh = NFS_FH(dir),
+ .cookie = nr_arg->cookie,
+ .count = nr_arg->page_len,
+ .pages = nr_arg->pages,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs_procedures[NFSPROC_READDIR],
+ .rpc_argp = &arg,
+ .rpc_cred = nr_arg->cred,
+ };
+ int status;
+
+ dprintk("NFS call readdir %llu\n", (unsigned long long)nr_arg->cookie);
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+ nr_res->verf[0] = nr_res->verf[1] = 0;
+
+ nfs_invalidate_atime(dir);
+
+ dprintk("NFS reply readdir: %d\n", status);
+ return status;
+}
+
+static int
+nfs_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsstat *stat)
+{
+ struct nfs2_fsstat fsinfo;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs_procedures[NFSPROC_STATFS],
+ .rpc_argp = fhandle,
+ .rpc_resp = &fsinfo,
+ };
+ int status;
+
+ dprintk("NFS call statfs\n");
+ nfs_fattr_init(stat->fattr);
+ status = rpc_call_sync(server->client, &msg, 0);
+ dprintk("NFS reply statfs: %d\n", status);
+ if (status)
+ goto out;
+ stat->tbytes = (u64)fsinfo.blocks * fsinfo.bsize;
+ stat->fbytes = (u64)fsinfo.bfree * fsinfo.bsize;
+ stat->abytes = (u64)fsinfo.bavail * fsinfo.bsize;
+ stat->tfiles = 0;
+ stat->ffiles = 0;
+ stat->afiles = 0;
+out:
+ return status;
+}
+
+static int
+nfs_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info)
+{
+ struct nfs2_fsstat fsinfo;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs_procedures[NFSPROC_STATFS],
+ .rpc_argp = fhandle,
+ .rpc_resp = &fsinfo,
+ };
+ int status;
+
+ dprintk("NFS call fsinfo\n");
+ nfs_fattr_init(info->fattr);
+ status = rpc_call_sync(server->client, &msg, 0);
+ dprintk("NFS reply fsinfo: %d\n", status);
+ if (status)
+ goto out;
+ info->rtmax = NFS_MAXDATA;
+ info->rtpref = fsinfo.tsize;
+ info->rtmult = fsinfo.bsize;
+ info->wtmax = NFS_MAXDATA;
+ info->wtpref = fsinfo.tsize;
+ info->wtmult = fsinfo.bsize;
+ info->dtpref = fsinfo.tsize;
+ info->maxfilesize = 0x7FFFFFFF;
+ info->lease_time = 0;
+out:
+ return status;
+}
+
+static int
+nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_pathconf *info)
+{
+ info->max_link = 0;
+ info->max_namelen = NFS2_MAXNAMLEN;
+ return 0;
+}
+
+static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+ struct inode *inode = hdr->inode;
+
+ nfs_invalidate_atime(inode);
+ if (task->tk_status >= 0) {
+ nfs_refresh_inode(inode, hdr->res.fattr);
+ /* Emulate the eof flag, which isn't normally needed in NFSv2
+ * as it is guaranteed to always return the file attributes
+ */
+ if ((hdr->res.count == 0 && hdr->args.count > 0) ||
+ hdr->args.offset + hdr->res.count >= hdr->res.fattr->size)
+ hdr->res.eof = 1;
+ }
+ return 0;
+}
+
+static void nfs_proc_read_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
+{
+ msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
+}
+
+static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ rpc_call_start(task);
+ return 0;
+}
+
+static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+ if (task->tk_status >= 0) {
+ hdr->res.count = hdr->args.count;
+ nfs_writeback_update_inode(hdr);
+ }
+ return 0;
+}
+
+static void nfs_proc_write_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg,
+ struct rpc_clnt **clnt)
+{
+ /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
+ hdr->args.stable = NFS_FILE_SYNC;
+ msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
+}
+
+static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
+{
+ BUG();
+}
+
+static void
+nfs_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg,
+ struct rpc_clnt **clnt)
+{
+ BUG();
+}
+
+static int
+nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+ struct inode *inode = file_inode(filp);
+
+ return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, NULL);
+}
+
+/* Helper functions for NFS lock bounds checking */
+#define NFS_LOCK32_OFFSET_MAX ((__s32)0x7fffffffUL)
+static int nfs_lock_check_bounds(const struct file_lock *fl)
+{
+ __s32 start, end;
+
+ start = (__s32)fl->fl_start;
+ if ((loff_t)start != fl->fl_start)
+ goto out_einval;
+
+ if (fl->fl_end != OFFSET_MAX) {
+ end = (__s32)fl->fl_end;
+ if ((loff_t)end != fl->fl_end)
+ goto out_einval;
+ } else
+ end = NFS_LOCK32_OFFSET_MAX;
+
+ if (start < 0 || start > end)
+ goto out_einval;
+ return 0;
+out_einval:
+ return -EINVAL;
+}
+
+static int nfs_have_delegation(struct inode *inode, fmode_t flags)
+{
+ return 0;
+}
+
+static const struct inode_operations nfs_dir_inode_operations = {
+ .create = nfs_create,
+ .lookup = nfs_lookup,
+ .link = nfs_link,
+ .unlink = nfs_unlink,
+ .symlink = nfs_symlink,
+ .mkdir = nfs_mkdir,
+ .rmdir = nfs_rmdir,
+ .mknod = nfs_mknod,
+ .rename = nfs_rename,
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+};
+
+static const struct inode_operations nfs_file_inode_operations = {
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+};
+
+const struct nfs_rpc_ops nfs_v2_clientops = {
+ .version = 2, /* protocol version */
+ .dentry_ops = &nfs_dentry_operations,
+ .dir_inode_ops = &nfs_dir_inode_operations,
+ .file_inode_ops = &nfs_file_inode_operations,
+ .file_ops = &nfs_file_operations,
+ .getroot = nfs_proc_get_root,
+ .submount = nfs_submount,
+ .try_get_tree = nfs_try_get_tree,
+ .getattr = nfs_proc_getattr,
+ .setattr = nfs_proc_setattr,
+ .lookup = nfs_proc_lookup,
+ .access = NULL, /* access */
+ .readlink = nfs_proc_readlink,
+ .create = nfs_proc_create,
+ .remove = nfs_proc_remove,
+ .unlink_setup = nfs_proc_unlink_setup,
+ .unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare,
+ .unlink_done = nfs_proc_unlink_done,
+ .rename_setup = nfs_proc_rename_setup,
+ .rename_rpc_prepare = nfs_proc_rename_rpc_prepare,
+ .rename_done = nfs_proc_rename_done,
+ .link = nfs_proc_link,
+ .symlink = nfs_proc_symlink,
+ .mkdir = nfs_proc_mkdir,
+ .rmdir = nfs_proc_rmdir,
+ .readdir = nfs_proc_readdir,
+ .mknod = nfs_proc_mknod,
+ .statfs = nfs_proc_statfs,
+ .fsinfo = nfs_proc_fsinfo,
+ .pathconf = nfs_proc_pathconf,
+ .decode_dirent = nfs2_decode_dirent,
+ .pgio_rpc_prepare = nfs_proc_pgio_rpc_prepare,
+ .read_setup = nfs_proc_read_setup,
+ .read_done = nfs_read_done,
+ .write_setup = nfs_proc_write_setup,
+ .write_done = nfs_write_done,
+ .commit_setup = nfs_proc_commit_setup,
+ .commit_rpc_prepare = nfs_proc_commit_rpc_prepare,
+ .lock = nfs_proc_lock,
+ .lock_check_bounds = nfs_lock_check_bounds,
+ .close_context = nfs_close_context,
+ .have_delegation = nfs_have_delegation,
+ .alloc_client = nfs_alloc_client,
+ .init_client = nfs_init_client,
+ .free_client = nfs_free_client,
+ .create_server = nfs_create_server,
+ .clone_server = nfs_clone_server,
+};
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
new file mode 100644
index 0000000000..7dc21a48e3
--- /dev/null
+++ b/fs/nfs/read.c
@@ -0,0 +1,458 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/read.c
+ *
+ * Block I/O for NFS
+ *
+ * Partial copy of Linus' read cache modifications to fs/nfs/file.c
+ * modified for async RPC by okir@monad.swb.de
+ */
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/pagemap.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/module.h>
+
+#include "nfs4_fs.h"
+#include "internal.h"
+#include "iostat.h"
+#include "fscache.h"
+#include "pnfs.h"
+#include "nfstrace.h"
+
+#define NFSDBG_FACILITY NFSDBG_PAGECACHE
+
+const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
+static const struct nfs_rw_ops nfs_rw_read_ops;
+
+static struct kmem_cache *nfs_rdata_cachep;
+
+static struct nfs_pgio_header *nfs_readhdr_alloc(void)
+{
+ struct nfs_pgio_header *p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
+
+ if (p)
+ p->rw_mode = FMODE_READ;
+ return p;
+}
+
+static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
+{
+ if (rhdr->res.scratch != NULL)
+ kfree(rhdr->res.scratch);
+ kmem_cache_free(nfs_rdata_cachep, rhdr);
+}
+
+static int nfs_return_empty_folio(struct folio *folio)
+{
+ folio_zero_segment(folio, 0, folio_size(folio));
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ return 0;
+}
+
+void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode, bool force_mds,
+ const struct nfs_pgio_completion_ops *compl_ops)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops;
+
+#ifdef CONFIG_NFS_V4_1
+ if (server->pnfs_curr_ld && !force_mds)
+ pg_ops = server->pnfs_curr_ld->pg_read_ops;
+#endif
+ nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops,
+ server->rsize, 0);
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
+
+void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio)
+{
+ struct nfs_pgio_mirror *pgm;
+ unsigned long npages;
+
+ nfs_pageio_complete(pgio);
+
+ /* It doesn't make sense to do mirrored reads! */
+ WARN_ON_ONCE(pgio->pg_mirror_count != 1);
+
+ pgm = &pgio->pg_mirrors[0];
+ NFS_I(pgio->pg_inode)->read_io += pgm->pg_bytes_written;
+ npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ nfs_add_stats(pgio->pg_inode, NFSIOS_READPAGES, npages);
+}
+
+
+void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
+{
+ struct nfs_pgio_mirror *mirror;
+
+ if (pgio->pg_ops && pgio->pg_ops->pg_cleanup)
+ pgio->pg_ops->pg_cleanup(pgio);
+
+ pgio->pg_ops = &nfs_pgio_rw_ops;
+
+ /* read path should never have more than one mirror */
+ WARN_ON_ONCE(pgio->pg_mirror_count != 1);
+
+ mirror = &pgio->pg_mirrors[0];
+ mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
+
+bool nfs_read_alloc_scratch(struct nfs_pgio_header *hdr, size_t size)
+{
+ WARN_ON(hdr->res.scratch != NULL);
+ hdr->res.scratch = kmalloc(size, GFP_KERNEL);
+ return hdr->res.scratch != NULL;
+}
+EXPORT_SYMBOL_GPL(nfs_read_alloc_scratch);
+
+static void nfs_readpage_release(struct nfs_page *req, int error)
+{
+ struct folio *folio = nfs_page_to_folio(req);
+
+ if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
+ folio_set_error(folio);
+ if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
+ if (nfs_netfs_folio_unlock(folio))
+ folio_unlock(folio);
+
+ nfs_release_request(req);
+}
+
+static void nfs_page_group_set_uptodate(struct nfs_page *req)
+{
+ if (nfs_page_group_sync_on_bit(req, PG_UPTODATE))
+ folio_mark_uptodate(nfs_page_to_folio(req));
+}
+
+static void nfs_read_completion(struct nfs_pgio_header *hdr)
+{
+ unsigned long bytes = 0;
+ int error;
+
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
+ goto out;
+ while (!list_empty(&hdr->pages)) {
+ struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+ struct folio *folio = nfs_page_to_folio(req);
+ unsigned long start = req->wb_pgbase;
+ unsigned long end = req->wb_pgbase + req->wb_bytes;
+
+ if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
+ /* note: regions of the page not covered by a
+ * request are zeroed in nfs_read_add_folio
+ */
+ if (bytes > hdr->good_bytes) {
+ /* nothing in this request was good, so zero
+ * the full extent of the request */
+ folio_zero_segment(folio, start, end);
+
+ } else if (hdr->good_bytes - bytes < req->wb_bytes) {
+ /* part of this request has good bytes, but
+ * not all. zero the bad bytes */
+ start += hdr->good_bytes - bytes;
+ WARN_ON(start < req->wb_pgbase);
+ folio_zero_segment(folio, start, end);
+ }
+ }
+ error = 0;
+ bytes += req->wb_bytes;
+ if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
+ if (bytes <= hdr->good_bytes)
+ nfs_page_group_set_uptodate(req);
+ else {
+ error = hdr->error;
+ xchg(&nfs_req_openctx(req)->error, error);
+ }
+ } else
+ nfs_page_group_set_uptodate(req);
+ nfs_list_remove_request(req);
+ nfs_readpage_release(req, error);
+ }
+ nfs_netfs_read_completion(hdr);
+
+out:
+ hdr->release(hdr);
+}
+
+static void nfs_initiate_read(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg,
+ const struct nfs_rpc_ops *rpc_ops,
+ struct rpc_task_setup *task_setup_data, int how)
+{
+ rpc_ops->read_setup(hdr, msg);
+ nfs_netfs_initiate_read(hdr);
+ trace_nfs_initiate_read(hdr);
+}
+
+static void
+nfs_async_read_error(struct list_head *head, int error)
+{
+ struct nfs_page *req;
+
+ while (!list_empty(head)) {
+ req = nfs_list_entry(head->next);
+ nfs_list_remove_request(req);
+ nfs_readpage_release(req, error);
+ }
+}
+
+const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
+ .error_cleanup = nfs_async_read_error,
+ .completion = nfs_read_completion,
+};
+
+/*
+ * This is the callback from RPC telling us whether a reply was
+ * received or some error occurred (timeout or socket shutdown).
+ */
+static int nfs_readpage_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr,
+ struct inode *inode)
+{
+ int status = NFS_PROTO(inode)->read_done(task, hdr);
+ if (status != 0)
+ return status;
+
+ nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, hdr->res.count);
+ trace_nfs_readpage_done(task, hdr);
+
+ if (task->tk_status == -ESTALE) {
+ nfs_set_inode_stale(inode);
+ nfs_mark_for_revalidate(inode);
+ }
+ return 0;
+}
+
+static void nfs_readpage_retry(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ struct nfs_pgio_args *argp = &hdr->args;
+ struct nfs_pgio_res *resp = &hdr->res;
+
+ /* This is a short read! */
+ nfs_inc_stats(hdr->inode, NFSIOS_SHORTREAD);
+ trace_nfs_readpage_short(task, hdr);
+
+ /* Has the server at least made some progress? */
+ if (resp->count == 0) {
+ nfs_set_pgio_error(hdr, -EIO, argp->offset);
+ return;
+ }
+
+ /* For non rpc-based layout drivers, retry-through-MDS */
+ if (!task->tk_ops) {
+ hdr->pnfs_error = -EAGAIN;
+ return;
+ }
+
+ /* Yes, so retry the read at the end of the hdr */
+ hdr->mds_offset += resp->count;
+ argp->offset += resp->count;
+ argp->pgbase += resp->count;
+ argp->count -= resp->count;
+ resp->count = 0;
+ resp->eof = 0;
+ rpc_restart_call_prepare(task);
+}
+
+static void nfs_readpage_result(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (hdr->res.eof) {
+ loff_t pos = hdr->args.offset + hdr->res.count;
+ unsigned int new = pos - hdr->io_start;
+
+ if (hdr->good_bytes > new) {
+ hdr->good_bytes = new;
+ set_bit(NFS_IOHDR_EOF, &hdr->flags);
+ clear_bit(NFS_IOHDR_ERROR, &hdr->flags);
+ }
+ } else if (hdr->res.count < hdr->args.count)
+ nfs_readpage_retry(task, hdr);
+}
+
+int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio,
+ struct nfs_open_context *ctx,
+ struct folio *folio)
+{
+ struct inode *inode = folio_file_mapping(folio)->host;
+ struct nfs_server *server = NFS_SERVER(inode);
+ size_t fsize = folio_size(folio);
+ unsigned int rsize = server->rsize;
+ struct nfs_page *new;
+ unsigned int len, aligned_len;
+ int error;
+
+ len = nfs_folio_length(folio);
+ if (len == 0)
+ return nfs_return_empty_folio(folio);
+
+ aligned_len = min_t(unsigned int, ALIGN(len, rsize), fsize);
+
+ new = nfs_page_create_from_folio(ctx, folio, 0, aligned_len);
+ if (IS_ERR(new)) {
+ error = PTR_ERR(new);
+ goto out;
+ }
+
+ if (len < fsize)
+ folio_zero_segment(folio, len, fsize);
+ if (!nfs_pageio_add_request(pgio, new)) {
+ nfs_list_remove_request(new);
+ error = pgio->pg_error;
+ nfs_readpage_release(new, error);
+ goto out;
+ }
+ return 0;
+out:
+ return error;
+}
+
+/*
+ * Read a page over NFS.
+ * We read the page synchronously in the following case:
+ * - The error flag is set for this page. This happens only when a
+ * previous async read operation failed.
+ */
+int nfs_read_folio(struct file *file, struct folio *folio)
+{
+ struct inode *inode = file_inode(file);
+ struct nfs_pageio_descriptor pgio;
+ struct nfs_open_context *ctx;
+ int ret;
+
+ trace_nfs_aop_readpage(inode, folio);
+ nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
+ task_io_account_read(folio_size(folio));
+
+ /*
+ * Try to flush any pending writes to the file..
+ *
+ * NOTE! Because we own the folio lock, there cannot
+ * be any new pending writes generated at this point
+ * for this folio (other folios can be written to).
+ */
+ ret = nfs_wb_folio(inode, folio);
+ if (ret)
+ goto out_unlock;
+ if (folio_test_uptodate(folio))
+ goto out_unlock;
+
+ ret = -ESTALE;
+ if (NFS_STALE(inode))
+ goto out_unlock;
+
+ ret = nfs_netfs_read_folio(file, folio);
+ if (!ret)
+ goto out;
+
+ ctx = get_nfs_open_context(nfs_file_open_context(file));
+
+ xchg(&ctx->error, 0);
+ nfs_pageio_init_read(&pgio, inode, false,
+ &nfs_async_read_completion_ops);
+
+ ret = nfs_read_add_folio(&pgio, ctx, folio);
+ if (ret)
+ goto out_put;
+
+ nfs_pageio_complete_read(&pgio);
+ ret = pgio.pg_error < 0 ? pgio.pg_error : 0;
+ if (!ret) {
+ ret = folio_wait_locked_killable(folio);
+ if (!folio_test_uptodate(folio) && !ret)
+ ret = xchg(&ctx->error, 0);
+ }
+out_put:
+ put_nfs_open_context(ctx);
+out:
+ trace_nfs_aop_readpage_done(inode, folio, ret);
+ return ret;
+out_unlock:
+ folio_unlock(folio);
+ goto out;
+}
+
+void nfs_readahead(struct readahead_control *ractl)
+{
+ struct nfs_pageio_descriptor pgio;
+ struct nfs_open_context *ctx;
+ unsigned int nr_pages = readahead_count(ractl);
+ struct file *file = ractl->file;
+ struct inode *inode = ractl->mapping->host;
+ struct folio *folio;
+ int ret;
+
+ trace_nfs_aop_readahead(inode, readahead_pos(ractl), nr_pages);
+ nfs_inc_stats(inode, NFSIOS_VFSREADPAGES);
+ task_io_account_read(readahead_length(ractl));
+
+ ret = -ESTALE;
+ if (NFS_STALE(inode))
+ goto out;
+
+ ret = nfs_netfs_readahead(ractl);
+ if (!ret)
+ goto out;
+
+ if (file == NULL) {
+ ret = -EBADF;
+ ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
+ if (ctx == NULL)
+ goto out;
+ } else
+ ctx = get_nfs_open_context(nfs_file_open_context(file));
+
+ nfs_pageio_init_read(&pgio, inode, false,
+ &nfs_async_read_completion_ops);
+
+ while ((folio = readahead_folio(ractl)) != NULL) {
+ ret = nfs_read_add_folio(&pgio, ctx, folio);
+ if (ret)
+ break;
+ }
+
+ nfs_pageio_complete_read(&pgio);
+
+ put_nfs_open_context(ctx);
+out:
+ trace_nfs_aop_readahead_done(inode, nr_pages, ret);
+}
+
+int __init nfs_init_readpagecache(void)
+{
+ nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
+ sizeof(struct nfs_pgio_header),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (nfs_rdata_cachep == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void nfs_destroy_readpagecache(void)
+{
+ kmem_cache_destroy(nfs_rdata_cachep);
+}
+
+static const struct nfs_rw_ops nfs_rw_read_ops = {
+ .rw_alloc_header = nfs_readhdr_alloc,
+ .rw_free_header = nfs_readhdr_free,
+ .rw_done = nfs_readpage_done,
+ .rw_result = nfs_readpage_result,
+ .rw_initiate = nfs_initiate_read,
+};
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
new file mode 100644
index 0000000000..0d6473cb00
--- /dev/null
+++ b/fs/nfs/super.c
@@ -0,0 +1,1428 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/super.c
+ *
+ * Copyright (C) 1992 Rick Sladkey
+ *
+ * nfs superblock handling functions
+ *
+ * Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some
+ * experimental NFS changes. Modularisation taken straight from SYS5 fs.
+ *
+ * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
+ * J.S.Peatfield@damtp.cam.ac.uk
+ *
+ * Split from inode.c by David Howells <dhowells@redhat.com>
+ *
+ * - superblocks are indexed on server only - all inodes, dentries, etc. associated with a
+ * particular server are held in the same superblock
+ * - NFS superblocks can have several effective roots to the dentry tree
+ * - directory type roots are spliced into the tree when a path from one root reaches the root
+ * of another (see nfs_lookup())
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/xprtsock.h>
+#include <linux/sunrpc/xprtrdma.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/slab.h>
+#include <net/ipv6.h>
+#include <linux/netdevice.h>
+#include <linux/nfs_xdr.h>
+#include <linux/magic.h>
+#include <linux/parser.h>
+#include <linux/nsproxy.h>
+#include <linux/rcupdate.h>
+
+#include <linux/uaccess.h>
+#include <linux/nfs_ssc.h>
+
+#include <uapi/linux/tls.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+#include "fscache.h"
+#include "nfs4session.h"
+#include "pnfs.h"
+#include "nfs.h"
+#include "netns.h"
+#include "sysfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+const struct super_operations nfs_sops = {
+ .alloc_inode = nfs_alloc_inode,
+ .free_inode = nfs_free_inode,
+ .write_inode = nfs_write_inode,
+ .drop_inode = nfs_drop_inode,
+ .statfs = nfs_statfs,
+ .evict_inode = nfs_evict_inode,
+ .umount_begin = nfs_umount_begin,
+ .show_options = nfs_show_options,
+ .show_devname = nfs_show_devname,
+ .show_path = nfs_show_path,
+ .show_stats = nfs_show_stats,
+};
+EXPORT_SYMBOL_GPL(nfs_sops);
+
+#ifdef CONFIG_NFS_V4_2
+static const struct nfs_ssc_client_ops nfs_ssc_clnt_ops_tbl = {
+ .sco_sb_deactive = nfs_sb_deactive,
+};
+#endif
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+static int __init register_nfs4_fs(void)
+{
+ return register_filesystem(&nfs4_fs_type);
+}
+
+static void unregister_nfs4_fs(void)
+{
+ unregister_filesystem(&nfs4_fs_type);
+}
+#else
+static int __init register_nfs4_fs(void)
+{
+ return 0;
+}
+
+static void unregister_nfs4_fs(void)
+{
+}
+#endif
+
+#ifdef CONFIG_NFS_V4_2
+static void nfs_ssc_register_ops(void)
+{
+ nfs_ssc_register(&nfs_ssc_clnt_ops_tbl);
+}
+
+static void nfs_ssc_unregister_ops(void)
+{
+ nfs_ssc_unregister(&nfs_ssc_clnt_ops_tbl);
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+static struct shrinker acl_shrinker = {
+ .count_objects = nfs_access_cache_count,
+ .scan_objects = nfs_access_cache_scan,
+ .seeks = DEFAULT_SEEKS,
+};
+
+/*
+ * Register the NFS filesystems
+ */
+int __init register_nfs_fs(void)
+{
+ int ret;
+
+ ret = register_filesystem(&nfs_fs_type);
+ if (ret < 0)
+ goto error_0;
+
+ ret = register_nfs4_fs();
+ if (ret < 0)
+ goto error_1;
+
+ ret = nfs_register_sysctl();
+ if (ret < 0)
+ goto error_2;
+ ret = register_shrinker(&acl_shrinker, "nfs-acl");
+ if (ret < 0)
+ goto error_3;
+#ifdef CONFIG_NFS_V4_2
+ nfs_ssc_register_ops();
+#endif
+ return 0;
+error_3:
+ nfs_unregister_sysctl();
+error_2:
+ unregister_nfs4_fs();
+error_1:
+ unregister_filesystem(&nfs_fs_type);
+error_0:
+ return ret;
+}
+
+/*
+ * Unregister the NFS filesystems
+ */
+void __exit unregister_nfs_fs(void)
+{
+ unregister_shrinker(&acl_shrinker);
+ nfs_unregister_sysctl();
+ unregister_nfs4_fs();
+#ifdef CONFIG_NFS_V4_2
+ nfs_ssc_unregister_ops();
+#endif
+ unregister_filesystem(&nfs_fs_type);
+}
+
+bool nfs_sb_active(struct super_block *sb)
+{
+ struct nfs_server *server = NFS_SB(sb);
+
+ if (!atomic_inc_not_zero(&sb->s_active))
+ return false;
+ if (atomic_inc_return(&server->active) != 1)
+ atomic_dec(&sb->s_active);
+ return true;
+}
+EXPORT_SYMBOL_GPL(nfs_sb_active);
+
+void nfs_sb_deactive(struct super_block *sb)
+{
+ struct nfs_server *server = NFS_SB(sb);
+
+ if (atomic_dec_and_test(&server->active))
+ deactivate_super(sb);
+}
+EXPORT_SYMBOL_GPL(nfs_sb_deactive);
+
+static int __nfs_list_for_each_server(struct list_head *head,
+ int (*fn)(struct nfs_server *, void *),
+ void *data)
+{
+ struct nfs_server *server, *last = NULL;
+ int ret = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, head, client_link) {
+ if (!(server->super && nfs_sb_active(server->super)))
+ continue;
+ rcu_read_unlock();
+ if (last)
+ nfs_sb_deactive(last->super);
+ last = server;
+ ret = fn(server, data);
+ if (ret)
+ goto out;
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+out:
+ if (last)
+ nfs_sb_deactive(last->super);
+ return ret;
+}
+
+int nfs_client_for_each_server(struct nfs_client *clp,
+ int (*fn)(struct nfs_server *, void *),
+ void *data)
+{
+ return __nfs_list_for_each_server(&clp->cl_superblocks, fn, data);
+}
+EXPORT_SYMBOL_GPL(nfs_client_for_each_server);
+
+/*
+ * Deliver file system statistics to userspace
+ */
+int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct nfs_server *server = NFS_SB(dentry->d_sb);
+ unsigned char blockbits;
+ unsigned long blockres;
+ struct nfs_fh *fh = NFS_FH(d_inode(dentry));
+ struct nfs_fsstat res;
+ int error = -ENOMEM;
+
+ res.fattr = nfs_alloc_fattr();
+ if (res.fattr == NULL)
+ goto out_err;
+
+ error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
+ if (unlikely(error == -ESTALE)) {
+ struct dentry *pd_dentry;
+
+ pd_dentry = dget_parent(dentry);
+ nfs_zap_caches(d_inode(pd_dentry));
+ dput(pd_dentry);
+ }
+ nfs_free_fattr(res.fattr);
+ if (error < 0)
+ goto out_err;
+
+ buf->f_type = NFS_SUPER_MAGIC;
+
+ /*
+ * Current versions of glibc do not correctly handle the
+ * case where f_frsize != f_bsize. Eventually we want to
+ * report the value of wtmult in this field.
+ */
+ buf->f_frsize = dentry->d_sb->s_blocksize;
+
+ /*
+ * On most *nix systems, f_blocks, f_bfree, and f_bavail
+ * are reported in units of f_frsize. Linux hasn't had
+ * an f_frsize field in its statfs struct until recently,
+ * thus historically Linux's sys_statfs reports these
+ * fields in units of f_bsize.
+ */
+ buf->f_bsize = dentry->d_sb->s_blocksize;
+ blockbits = dentry->d_sb->s_blocksize_bits;
+ blockres = (1 << blockbits) - 1;
+ buf->f_blocks = (res.tbytes + blockres) >> blockbits;
+ buf->f_bfree = (res.fbytes + blockres) >> blockbits;
+ buf->f_bavail = (res.abytes + blockres) >> blockbits;
+
+ buf->f_files = res.tfiles;
+ buf->f_ffree = res.afiles;
+
+ buf->f_namelen = server->namelen;
+
+ return 0;
+
+ out_err:
+ dprintk("%s: statfs error = %d\n", __func__, -error);
+ return error;
+}
+EXPORT_SYMBOL_GPL(nfs_statfs);
+
+/*
+ * Map the security flavour number to a name
+ */
+static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
+{
+ static const struct {
+ rpc_authflavor_t flavour;
+ const char *str;
+ } sec_flavours[NFS_AUTH_INFO_MAX_FLAVORS] = {
+ /* update NFS_AUTH_INFO_MAX_FLAVORS when this list changes! */
+ { RPC_AUTH_NULL, "null" },
+ { RPC_AUTH_UNIX, "sys" },
+ { RPC_AUTH_GSS_KRB5, "krb5" },
+ { RPC_AUTH_GSS_KRB5I, "krb5i" },
+ { RPC_AUTH_GSS_KRB5P, "krb5p" },
+ { RPC_AUTH_GSS_LKEY, "lkey" },
+ { RPC_AUTH_GSS_LKEYI, "lkeyi" },
+ { RPC_AUTH_GSS_LKEYP, "lkeyp" },
+ { RPC_AUTH_GSS_SPKM, "spkm" },
+ { RPC_AUTH_GSS_SPKMI, "spkmi" },
+ { RPC_AUTH_GSS_SPKMP, "spkmp" },
+ { UINT_MAX, "unknown" }
+ };
+ int i;
+
+ for (i = 0; sec_flavours[i].flavour != UINT_MAX; i++) {
+ if (sec_flavours[i].flavour == flavour)
+ break;
+ }
+ return sec_flavours[i].str;
+}
+
+static void nfs_show_mountd_netid(struct seq_file *m, struct nfs_server *nfss,
+ int showdefaults)
+{
+ struct sockaddr *sap = (struct sockaddr *) &nfss->mountd_address;
+ char *proto = NULL;
+
+ switch (sap->sa_family) {
+ case AF_INET:
+ switch (nfss->mountd_protocol) {
+ case IPPROTO_UDP:
+ proto = RPCBIND_NETID_UDP;
+ break;
+ case IPPROTO_TCP:
+ proto = RPCBIND_NETID_TCP;
+ break;
+ }
+ break;
+ case AF_INET6:
+ switch (nfss->mountd_protocol) {
+ case IPPROTO_UDP:
+ proto = RPCBIND_NETID_UDP6;
+ break;
+ case IPPROTO_TCP:
+ proto = RPCBIND_NETID_TCP6;
+ break;
+ }
+ break;
+ }
+ if (proto || showdefaults)
+ seq_printf(m, ",mountproto=%s", proto ?: "auto");
+}
+
+static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
+ int showdefaults)
+{
+ struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address;
+
+ if (nfss->flags & NFS_MOUNT_LEGACY_INTERFACE)
+ return;
+
+ switch (sap->sa_family) {
+ case AF_INET: {
+ struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+ seq_printf(m, ",mountaddr=%pI4", &sin->sin_addr.s_addr);
+ break;
+ }
+ case AF_INET6: {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+ seq_printf(m, ",mountaddr=%pI6c", &sin6->sin6_addr);
+ break;
+ }
+ default:
+ if (showdefaults)
+ seq_puts(m, ",mountaddr=unspecified");
+ }
+
+ if (nfss->mountd_version || showdefaults)
+ seq_printf(m, ",mountvers=%u", nfss->mountd_version);
+ if ((nfss->mountd_port &&
+ nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) ||
+ showdefaults)
+ seq_printf(m, ",mountport=%u", nfss->mountd_port);
+
+ nfs_show_mountd_netid(m, nfss, showdefaults);
+}
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
+ int showdefaults)
+{
+ struct nfs_client *clp = nfss->nfs_client;
+
+ seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
+}
+#else
+static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
+ int showdefaults)
+{
+}
+#endif
+
+static void nfs_show_nfs_version(struct seq_file *m,
+ unsigned int version,
+ unsigned int minorversion)
+{
+ seq_printf(m, ",vers=%u", version);
+ if (version == 4)
+ seq_printf(m, ".%u", minorversion);
+}
+
+/*
+ * Describe the mount options in force on this server representation
+ */
+static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
+ int showdefaults)
+{
+ static const struct proc_nfs_info {
+ int flag;
+ const char *str;
+ const char *nostr;
+ } nfs_info[] = {
+ { NFS_MOUNT_SOFT, ",soft", "" },
+ { NFS_MOUNT_SOFTERR, ",softerr", "" },
+ { NFS_MOUNT_SOFTREVAL, ",softreval", "" },
+ { NFS_MOUNT_POSIX, ",posix", "" },
+ { NFS_MOUNT_NOCTO, ",nocto", "" },
+ { NFS_MOUNT_NOAC, ",noac", "" },
+ { NFS_MOUNT_NONLM, ",nolock", "" },
+ { NFS_MOUNT_NOACL, ",noacl", "" },
+ { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
+ { NFS_MOUNT_UNSHARED, ",nosharecache", "" },
+ { NFS_MOUNT_NORESVPORT, ",noresvport", "" },
+ { 0, NULL, NULL }
+ };
+ const struct proc_nfs_info *nfs_infop;
+ struct nfs_client *clp = nfss->nfs_client;
+ u32 version = clp->rpc_ops->version;
+ int local_flock, local_fcntl;
+
+ nfs_show_nfs_version(m, version, clp->cl_minorversion);
+ seq_printf(m, ",rsize=%u", nfss->rsize);
+ seq_printf(m, ",wsize=%u", nfss->wsize);
+ if (nfss->bsize != 0)
+ seq_printf(m, ",bsize=%u", nfss->bsize);
+ seq_printf(m, ",namlen=%u", nfss->namelen);
+ if (nfss->acregmin != NFS_DEF_ACREGMIN*HZ || showdefaults)
+ seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ);
+ if (nfss->acregmax != NFS_DEF_ACREGMAX*HZ || showdefaults)
+ seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ);
+ if (nfss->acdirmin != NFS_DEF_ACDIRMIN*HZ || showdefaults)
+ seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ);
+ if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults)
+ seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ);
+ if (!(nfss->flags & (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)))
+ seq_puts(m, ",hard");
+ for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
+ if (nfss->flags & nfs_infop->flag)
+ seq_puts(m, nfs_infop->str);
+ else
+ seq_puts(m, nfs_infop->nostr);
+ }
+ rcu_read_lock();
+ seq_printf(m, ",proto=%s",
+ rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID));
+ rcu_read_unlock();
+ if (clp->cl_nconnect > 0)
+ seq_printf(m, ",nconnect=%u", clp->cl_nconnect);
+ if (version == 4) {
+ if (clp->cl_max_connect > 1)
+ seq_printf(m, ",max_connect=%u", clp->cl_max_connect);
+ if (nfss->port != NFS_PORT)
+ seq_printf(m, ",port=%u", nfss->port);
+ } else
+ if (nfss->port)
+ seq_printf(m, ",port=%u", nfss->port);
+
+ seq_printf(m, ",timeo=%lu", 10U * nfss->client->cl_timeout->to_initval / HZ);
+ seq_printf(m, ",retrans=%u", nfss->client->cl_timeout->to_retries);
+ seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
+ switch (clp->cl_xprtsec.policy) {
+ case RPC_XPRTSEC_TLS_ANON:
+ seq_puts(m, ",xprtsec=tls");
+ break;
+ case RPC_XPRTSEC_TLS_X509:
+ seq_puts(m, ",xprtsec=mtls");
+ break;
+ default:
+ break;
+ }
+
+ if (version != 4)
+ nfs_show_mountd_options(m, nfss, showdefaults);
+ else
+ nfs_show_nfsv4_options(m, nfss, showdefaults);
+
+ if (nfss->options & NFS_OPTION_FSCACHE)
+ seq_puts(m, ",fsc");
+
+ if (nfss->options & NFS_OPTION_MIGRATION)
+ seq_puts(m, ",migration");
+
+ if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) {
+ if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
+ seq_puts(m, ",lookupcache=none");
+ else
+ seq_puts(m, ",lookupcache=pos");
+ }
+
+ local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK;
+ local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL;
+
+ if (!local_flock && !local_fcntl)
+ seq_puts(m, ",local_lock=none");
+ else if (local_flock && local_fcntl)
+ seq_puts(m, ",local_lock=all");
+ else if (local_flock)
+ seq_puts(m, ",local_lock=flock");
+ else
+ seq_puts(m, ",local_lock=posix");
+
+ if (nfss->flags & NFS_MOUNT_WRITE_EAGER) {
+ if (nfss->flags & NFS_MOUNT_WRITE_WAIT)
+ seq_puts(m, ",write=wait");
+ else
+ seq_puts(m, ",write=eager");
+ }
+}
+
+/*
+ * Describe the mount options on this VFS mountpoint
+ */
+int nfs_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct nfs_server *nfss = NFS_SB(root->d_sb);
+
+ nfs_show_mount_options(m, nfss, 0);
+
+ rcu_read_lock();
+ seq_printf(m, ",addr=%s",
+ rpc_peeraddr2str(nfss->nfs_client->cl_rpcclient,
+ RPC_DISPLAY_ADDR));
+ rcu_read_unlock();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_show_options);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+static void show_lease(struct seq_file *m, struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+ unsigned long expire;
+
+ seq_printf(m, ",lease_time=%ld", clp->cl_lease_time / HZ);
+ expire = clp->cl_last_renewal + clp->cl_lease_time;
+ seq_printf(m, ",lease_expired=%ld",
+ time_after(expire, jiffies) ? 0 : (jiffies - expire) / HZ);
+}
+#ifdef CONFIG_NFS_V4_1
+static void show_sessions(struct seq_file *m, struct nfs_server *server)
+{
+ if (nfs4_has_session(server->nfs_client))
+ seq_puts(m, ",sessions");
+}
+#else
+static void show_sessions(struct seq_file *m, struct nfs_server *server) {}
+#endif
+#endif
+
+#ifdef CONFIG_NFS_V4_1
+static void show_pnfs(struct seq_file *m, struct nfs_server *server)
+{
+ seq_printf(m, ",pnfs=");
+ if (server->pnfs_curr_ld)
+ seq_printf(m, "%s", server->pnfs_curr_ld->name);
+ else
+ seq_printf(m, "not configured");
+}
+
+static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss)
+{
+ if (nfss->nfs_client && nfss->nfs_client->cl_implid) {
+ struct nfs41_impl_id *impl_id = nfss->nfs_client->cl_implid;
+ seq_printf(m, "\n\timpl_id:\tname='%s',domain='%s',"
+ "date='%llu,%u'",
+ impl_id->name, impl_id->domain,
+ impl_id->date.seconds, impl_id->date.nseconds);
+ }
+}
+#else
+#if IS_ENABLED(CONFIG_NFS_V4)
+static void show_pnfs(struct seq_file *m, struct nfs_server *server)
+{
+}
+#endif
+static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss)
+{
+}
+#endif
+
+int nfs_show_devname(struct seq_file *m, struct dentry *root)
+{
+ char *page = (char *) __get_free_page(GFP_KERNEL);
+ char *devname, *dummy;
+ int err = 0;
+ if (!page)
+ return -ENOMEM;
+ devname = nfs_path(&dummy, root, page, PAGE_SIZE, 0);
+ if (IS_ERR(devname))
+ err = PTR_ERR(devname);
+ else
+ seq_escape(m, devname, " \t\n\\");
+ free_page((unsigned long)page);
+ return err;
+}
+EXPORT_SYMBOL_GPL(nfs_show_devname);
+
+int nfs_show_path(struct seq_file *m, struct dentry *dentry)
+{
+ seq_puts(m, "/");
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_show_path);
+
+/*
+ * Present statistical information for this VFS mountpoint
+ */
+int nfs_show_stats(struct seq_file *m, struct dentry *root)
+{
+ int i, cpu;
+ struct nfs_server *nfss = NFS_SB(root->d_sb);
+ struct rpc_auth *auth = nfss->client->cl_auth;
+ struct nfs_iostats totals = { };
+
+ seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS);
+
+ /*
+ * Display all mount option settings
+ */
+ seq_puts(m, "\n\topts:\t");
+ seq_puts(m, sb_rdonly(root->d_sb) ? "ro" : "rw");
+ seq_puts(m, root->d_sb->s_flags & SB_SYNCHRONOUS ? ",sync" : "");
+ seq_puts(m, root->d_sb->s_flags & SB_NOATIME ? ",noatime" : "");
+ seq_puts(m, root->d_sb->s_flags & SB_NODIRATIME ? ",nodiratime" : "");
+ nfs_show_mount_options(m, nfss, 1);
+
+ seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
+
+ show_implementation_id(m, nfss);
+
+ seq_puts(m, "\n\tcaps:\t");
+ seq_printf(m, "caps=0x%x", nfss->caps);
+ seq_printf(m, ",wtmult=%u", nfss->wtmult);
+ seq_printf(m, ",dtsize=%u", nfss->dtsize);
+ seq_printf(m, ",bsize=%u", nfss->bsize);
+ seq_printf(m, ",namlen=%u", nfss->namelen);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+ if (nfss->nfs_client->rpc_ops->version == 4) {
+ seq_puts(m, "\n\tnfsv4:\t");
+ seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
+ seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
+ seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]);
+ seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
+ show_sessions(m, nfss);
+ show_pnfs(m, nfss);
+ show_lease(m, nfss);
+ }
+#endif
+
+ /*
+ * Display security flavor in effect for this mount
+ */
+ seq_printf(m, "\n\tsec:\tflavor=%u", auth->au_ops->au_flavor);
+ if (auth->au_flavor)
+ seq_printf(m, ",pseudoflavor=%u", auth->au_flavor);
+
+ /*
+ * Display superblock I/O counters
+ */
+ for_each_possible_cpu(cpu) {
+ struct nfs_iostats *stats;
+
+ preempt_disable();
+ stats = per_cpu_ptr(nfss->io_stats, cpu);
+
+ for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
+ totals.events[i] += stats->events[i];
+ for (i = 0; i < __NFSIOS_BYTESMAX; i++)
+ totals.bytes[i] += stats->bytes[i];
+
+ preempt_enable();
+ }
+
+ seq_puts(m, "\n\tevents:\t");
+ for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
+ seq_printf(m, "%lu ", totals.events[i]);
+ seq_puts(m, "\n\tbytes:\t");
+ for (i = 0; i < __NFSIOS_BYTESMAX; i++)
+ seq_printf(m, "%Lu ", totals.bytes[i]);
+ seq_putc(m, '\n');
+
+ rpc_clnt_show_stats(m, nfss->client);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_show_stats);
+
+/*
+ * Begin unmount by attempting to remove all automounted mountpoints we added
+ * in response to xdev traversals and referrals
+ */
+void nfs_umount_begin(struct super_block *sb)
+{
+ struct nfs_server *server;
+ struct rpc_clnt *rpc;
+
+ server = NFS_SB(sb);
+ /* -EIO all pending I/O */
+ rpc = server->client_acl;
+ if (!IS_ERR(rpc))
+ rpc_killall_tasks(rpc);
+ rpc = server->client;
+ if (!IS_ERR(rpc))
+ rpc_killall_tasks(rpc);
+}
+EXPORT_SYMBOL_GPL(nfs_umount_begin);
+
+/*
+ * Return true if 'match' is in auth_info or auth_info is empty.
+ * Return false otherwise.
+ */
+bool nfs_auth_info_match(const struct nfs_auth_info *auth_info,
+ rpc_authflavor_t match)
+{
+ int i;
+
+ if (!auth_info->flavor_len)
+ return true;
+
+ for (i = 0; i < auth_info->flavor_len; i++) {
+ if (auth_info->flavors[i] == match)
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(nfs_auth_info_match);
+
+/*
+ * Ensure that a specified authtype in ctx->auth_info is supported by
+ * the server. Returns 0 and sets ctx->selected_flavor if it's ok, and
+ * -EACCES if not.
+ */
+static int nfs_verify_authflavors(struct nfs_fs_context *ctx,
+ rpc_authflavor_t *server_authlist,
+ unsigned int count)
+{
+ rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
+ bool found_auth_null = false;
+ unsigned int i;
+
+ /*
+ * If the sec= mount option is used, the specified flavor or AUTH_NULL
+ * must be in the list returned by the server.
+ *
+ * AUTH_NULL has a special meaning when it's in the server list - it
+ * means that the server will ignore the rpc creds, so any flavor
+ * can be used but still use the sec= that was specified.
+ *
+ * Note also that the MNT procedure in MNTv1 does not return a list
+ * of supported security flavors. In this case, nfs_mount() fabricates
+ * a security flavor list containing just AUTH_NULL.
+ */
+ for (i = 0; i < count; i++) {
+ flavor = server_authlist[i];
+
+ if (nfs_auth_info_match(&ctx->auth_info, flavor))
+ goto out;
+
+ if (flavor == RPC_AUTH_NULL)
+ found_auth_null = true;
+ }
+
+ if (found_auth_null) {
+ flavor = ctx->auth_info.flavors[0];
+ goto out;
+ }
+
+ dfprintk(MOUNT,
+ "NFS: specified auth flavors not supported by server\n");
+ return -EACCES;
+
+out:
+ ctx->selected_flavor = flavor;
+ dfprintk(MOUNT, "NFS: using auth flavor %u\n", ctx->selected_flavor);
+ return 0;
+}
+
+/*
+ * Use the remote server's MOUNT service to request the NFS file handle
+ * corresponding to the provided path.
+ */
+static int nfs_request_mount(struct fs_context *fc,
+ struct nfs_fh *root_fh,
+ rpc_authflavor_t *server_authlist,
+ unsigned int *server_authlist_len)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct nfs_mount_request request = {
+ .sap = &ctx->mount_server._address,
+ .dirpath = ctx->nfs_server.export_path,
+ .protocol = ctx->mount_server.protocol,
+ .fh = root_fh,
+ .noresvport = ctx->flags & NFS_MOUNT_NORESVPORT,
+ .auth_flav_len = server_authlist_len,
+ .auth_flavs = server_authlist,
+ .net = fc->net_ns,
+ };
+ int status;
+
+ if (ctx->mount_server.version == 0) {
+ switch (ctx->version) {
+ default:
+ ctx->mount_server.version = NFS_MNT3_VERSION;
+ break;
+ case 2:
+ ctx->mount_server.version = NFS_MNT_VERSION;
+ }
+ }
+ request.version = ctx->mount_server.version;
+
+ if (ctx->mount_server.hostname)
+ request.hostname = ctx->mount_server.hostname;
+ else
+ request.hostname = ctx->nfs_server.hostname;
+
+ /*
+ * Construct the mount server's address.
+ */
+ if (ctx->mount_server.address.sa_family == AF_UNSPEC) {
+ memcpy(request.sap, &ctx->nfs_server._address,
+ ctx->nfs_server.addrlen);
+ ctx->mount_server.addrlen = ctx->nfs_server.addrlen;
+ }
+ request.salen = ctx->mount_server.addrlen;
+ nfs_set_port(request.sap, &ctx->mount_server.port, 0);
+
+ /*
+ * Now ask the mount server to map our export path
+ * to a file handle.
+ */
+ status = nfs_mount(&request, ctx->timeo, ctx->retrans);
+ if (status != 0) {
+ dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
+ request.hostname, status);
+ return status;
+ }
+
+ return 0;
+}
+
+static struct nfs_server *nfs_try_mount_request(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ int status;
+ unsigned int i;
+ bool tried_auth_unix = false;
+ bool auth_null_in_list = false;
+ struct nfs_server *server = ERR_PTR(-EACCES);
+ rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS];
+ unsigned int authlist_len = ARRAY_SIZE(authlist);
+
+ status = nfs_request_mount(fc, ctx->mntfh, authlist, &authlist_len);
+ if (status)
+ return ERR_PTR(status);
+
+ /*
+ * Was a sec= authflavor specified in the options? First, verify
+ * whether the server supports it, and then just try to use it if so.
+ */
+ if (ctx->auth_info.flavor_len > 0) {
+ status = nfs_verify_authflavors(ctx, authlist, authlist_len);
+ dfprintk(MOUNT, "NFS: using auth flavor %u\n",
+ ctx->selected_flavor);
+ if (status)
+ return ERR_PTR(status);
+ return ctx->nfs_mod->rpc_ops->create_server(fc);
+ }
+
+ /*
+ * No sec= option was provided. RFC 2623, section 2.7 suggests we
+ * SHOULD prefer the flavor listed first. However, some servers list
+ * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
+ */
+ for (i = 0; i < authlist_len; ++i) {
+ rpc_authflavor_t flavor;
+ struct rpcsec_gss_info info;
+
+ flavor = authlist[i];
+ switch (flavor) {
+ case RPC_AUTH_UNIX:
+ tried_auth_unix = true;
+ break;
+ case RPC_AUTH_NULL:
+ auth_null_in_list = true;
+ continue;
+ default:
+ if (rpcauth_get_gssinfo(flavor, &info) != 0)
+ continue;
+ break;
+ }
+ dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
+ ctx->selected_flavor = flavor;
+ server = ctx->nfs_mod->rpc_ops->create_server(fc);
+ if (!IS_ERR(server))
+ return server;
+ }
+
+ /*
+ * Nothing we tried so far worked. At this point, give up if we've
+ * already tried AUTH_UNIX or if the server's list doesn't contain
+ * AUTH_NULL
+ */
+ if (tried_auth_unix || !auth_null_in_list)
+ return server;
+
+ /* Last chance! Try AUTH_UNIX */
+ dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
+ ctx->selected_flavor = RPC_AUTH_UNIX;
+ return ctx->nfs_mod->rpc_ops->create_server(fc);
+}
+
+int nfs_try_get_tree(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+
+ if (ctx->need_mount)
+ ctx->server = nfs_try_mount_request(fc);
+ else
+ ctx->server = ctx->nfs_mod->rpc_ops->create_server(fc);
+
+ return nfs_get_tree_common(fc);
+}
+EXPORT_SYMBOL_GPL(nfs_try_get_tree);
+
+
+#define NFS_REMOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
+ | NFS_MOUNT_SECURE \
+ | NFS_MOUNT_TCP \
+ | NFS_MOUNT_VER3 \
+ | NFS_MOUNT_KERBEROS \
+ | NFS_MOUNT_NONLM \
+ | NFS_MOUNT_BROKEN_SUID \
+ | NFS_MOUNT_STRICTLOCK \
+ | NFS_MOUNT_LEGACY_INTERFACE)
+
+#define NFS_MOUNT_CMP_FLAGMASK (NFS_REMOUNT_CMP_FLAGMASK & \
+ ~(NFS_MOUNT_UNSHARED | NFS_MOUNT_NORESVPORT))
+
+static int
+nfs_compare_remount_data(struct nfs_server *nfss,
+ struct nfs_fs_context *ctx)
+{
+ if ((ctx->flags ^ nfss->flags) & NFS_REMOUNT_CMP_FLAGMASK ||
+ ctx->rsize != nfss->rsize ||
+ ctx->wsize != nfss->wsize ||
+ ctx->version != nfss->nfs_client->rpc_ops->version ||
+ ctx->minorversion != nfss->nfs_client->cl_minorversion ||
+ ctx->retrans != nfss->client->cl_timeout->to_retries ||
+ !nfs_auth_info_match(&ctx->auth_info, nfss->client->cl_auth->au_flavor) ||
+ ctx->acregmin != nfss->acregmin / HZ ||
+ ctx->acregmax != nfss->acregmax / HZ ||
+ ctx->acdirmin != nfss->acdirmin / HZ ||
+ ctx->acdirmax != nfss->acdirmax / HZ ||
+ ctx->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) ||
+ (ctx->options & NFS_OPTION_FSCACHE) != (nfss->options & NFS_OPTION_FSCACHE) ||
+ ctx->nfs_server.port != nfss->port ||
+ ctx->nfs_server.addrlen != nfss->nfs_client->cl_addrlen ||
+ !rpc_cmp_addr((struct sockaddr *)&ctx->nfs_server.address,
+ (struct sockaddr *)&nfss->nfs_client->cl_addr))
+ return -EINVAL;
+
+ return 0;
+}
+
+int nfs_reconfigure(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct super_block *sb = fc->root->d_sb;
+ struct nfs_server *nfss = sb->s_fs_info;
+ int ret;
+
+ sync_filesystem(sb);
+
+ /*
+ * Userspace mount programs that send binary options generally send
+ * them populated with default values. We have no way to know which
+ * ones were explicitly specified. Fall back to legacy behavior and
+ * just return success.
+ */
+ if (ctx->skip_reconfig_option_check)
+ return 0;
+
+ /*
+ * noac is a special case. It implies -o sync, but that's not
+ * necessarily reflected in the mtab options. reconfigure_super
+ * will clear SB_SYNCHRONOUS if -o sync wasn't specified in the
+ * remount options, so we have to explicitly reset it.
+ */
+ if (ctx->flags & NFS_MOUNT_NOAC) {
+ fc->sb_flags |= SB_SYNCHRONOUS;
+ fc->sb_flags_mask |= SB_SYNCHRONOUS;
+ }
+
+ /* compare new mount options with old ones */
+ ret = nfs_compare_remount_data(nfss, ctx);
+ if (ret)
+ return ret;
+
+ return nfs_probe_server(nfss, NFS_FH(d_inode(fc->root)));
+}
+EXPORT_SYMBOL_GPL(nfs_reconfigure);
+
+/*
+ * Finish setting up an NFS superblock
+ */
+static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx)
+{
+ struct nfs_server *server = NFS_SB(sb);
+
+ sb->s_blocksize_bits = 0;
+ sb->s_blocksize = 0;
+ sb->s_xattr = server->nfs_client->cl_nfs_mod->xattr;
+ sb->s_op = server->nfs_client->cl_nfs_mod->sops;
+ if (ctx->bsize)
+ sb->s_blocksize = nfs_block_size(ctx->bsize, &sb->s_blocksize_bits);
+
+ switch (server->nfs_client->rpc_ops->version) {
+ case 2:
+ sb->s_time_gran = 1000;
+ sb->s_time_min = 0;
+ sb->s_time_max = U32_MAX;
+ break;
+ case 3:
+ /*
+ * The VFS shouldn't apply the umask to mode bits.
+ * We will do so ourselves when necessary.
+ */
+ sb->s_flags |= SB_POSIXACL;
+ sb->s_time_gran = 1;
+ sb->s_time_min = 0;
+ sb->s_time_max = U32_MAX;
+ sb->s_export_op = &nfs_export_ops;
+ break;
+ case 4:
+ sb->s_flags |= SB_POSIXACL;
+ sb->s_time_gran = 1;
+ sb->s_time_min = S64_MIN;
+ sb->s_time_max = S64_MAX;
+ if (server->caps & NFS_CAP_ATOMIC_OPEN_V1)
+ sb->s_export_op = &nfs_export_ops;
+ break;
+ }
+
+ sb->s_magic = NFS_SUPER_MAGIC;
+
+ /* We probably want something more informative here */
+ snprintf(sb->s_id, sizeof(sb->s_id),
+ "%u:%u", MAJOR(sb->s_dev), MINOR(sb->s_dev));
+
+ if (sb->s_blocksize == 0)
+ sb->s_blocksize = nfs_block_bits(server->wsize,
+ &sb->s_blocksize_bits);
+
+ nfs_super_set_maxbytes(sb, server->maxfilesize);
+ nfs_sysfs_move_server_to_sb(sb);
+ server->has_sec_mnt_opts = ctx->has_sec_mnt_opts;
+}
+
+static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b,
+ const struct fs_context *fc)
+{
+ const struct nfs_server *a = s->s_fs_info;
+ const struct rpc_clnt *clnt_a = a->client;
+ const struct rpc_clnt *clnt_b = b->client;
+
+ if ((s->s_flags & NFS_SB_MASK) != (fc->sb_flags & NFS_SB_MASK))
+ goto Ebusy;
+ if (a->nfs_client != b->nfs_client)
+ goto Ebusy;
+ if ((a->flags ^ b->flags) & NFS_MOUNT_CMP_FLAGMASK)
+ goto Ebusy;
+ if (a->wsize != b->wsize)
+ goto Ebusy;
+ if (a->rsize != b->rsize)
+ goto Ebusy;
+ if (a->acregmin != b->acregmin)
+ goto Ebusy;
+ if (a->acregmax != b->acregmax)
+ goto Ebusy;
+ if (a->acdirmin != b->acdirmin)
+ goto Ebusy;
+ if (a->acdirmax != b->acdirmax)
+ goto Ebusy;
+ if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
+ goto Ebusy;
+ return 1;
+Ebusy:
+ return 0;
+}
+
+static int nfs_set_super(struct super_block *s, struct fs_context *fc)
+{
+ struct nfs_server *server = fc->s_fs_info;
+ int ret;
+
+ s->s_d_op = server->nfs_client->rpc_ops->dentry_ops;
+ ret = set_anon_super(s, server);
+ if (ret == 0)
+ server->s_dev = s->s_dev;
+ return ret;
+}
+
+static int nfs_compare_super_address(struct nfs_server *server1,
+ struct nfs_server *server2)
+{
+ struct sockaddr *sap1, *sap2;
+ struct rpc_xprt *xprt1 = server1->client->cl_xprt;
+ struct rpc_xprt *xprt2 = server2->client->cl_xprt;
+
+ if (!net_eq(xprt1->xprt_net, xprt2->xprt_net))
+ return 0;
+
+ sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr;
+ sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr;
+
+ if (sap1->sa_family != sap2->sa_family)
+ return 0;
+
+ switch (sap1->sa_family) {
+ case AF_INET: {
+ struct sockaddr_in *sin1 = (struct sockaddr_in *)sap1;
+ struct sockaddr_in *sin2 = (struct sockaddr_in *)sap2;
+ if (sin1->sin_addr.s_addr != sin2->sin_addr.s_addr)
+ return 0;
+ if (sin1->sin_port != sin2->sin_port)
+ return 0;
+ break;
+ }
+ case AF_INET6: {
+ struct sockaddr_in6 *sin1 = (struct sockaddr_in6 *)sap1;
+ struct sockaddr_in6 *sin2 = (struct sockaddr_in6 *)sap2;
+ if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
+ return 0;
+ if (sin1->sin6_port != sin2->sin6_port)
+ return 0;
+ break;
+ }
+ default:
+ return 0;
+ }
+
+ return 1;
+}
+
+static int nfs_compare_userns(const struct nfs_server *old,
+ const struct nfs_server *new)
+{
+ const struct user_namespace *oldns = &init_user_ns;
+ const struct user_namespace *newns = &init_user_ns;
+
+ if (old->client && old->client->cl_cred)
+ oldns = old->client->cl_cred->user_ns;
+ if (new->client && new->client->cl_cred)
+ newns = new->client->cl_cred->user_ns;
+ if (oldns != newns)
+ return 0;
+ return 1;
+}
+
+static int nfs_compare_super(struct super_block *sb, struct fs_context *fc)
+{
+ struct nfs_server *server = fc->s_fs_info, *old = NFS_SB(sb);
+
+ if (!nfs_compare_super_address(old, server))
+ return 0;
+ /* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */
+ if (old->flags & NFS_MOUNT_UNSHARED)
+ return 0;
+ if (memcmp(&old->fsid, &server->fsid, sizeof(old->fsid)) != 0)
+ return 0;
+ if (!nfs_compare_userns(old, server))
+ return 0;
+ if ((old->has_sec_mnt_opts || fc->security) &&
+ security_sb_mnt_opts_compat(sb, fc->security))
+ return 0;
+ return nfs_compare_mount_options(sb, server, fc);
+}
+
+#ifdef CONFIG_NFS_FSCACHE
+static int nfs_get_cache_cookie(struct super_block *sb,
+ struct nfs_fs_context *ctx)
+{
+ struct nfs_server *nfss = NFS_SB(sb);
+ char *uniq = NULL;
+ int ulen = 0;
+
+ nfss->fscache = NULL;
+
+ if (!ctx)
+ return 0;
+
+ if (ctx->clone_data.sb) {
+ struct nfs_server *mnt_s = NFS_SB(ctx->clone_data.sb);
+ if (!(mnt_s->options & NFS_OPTION_FSCACHE))
+ return 0;
+ if (mnt_s->fscache_uniq) {
+ uniq = mnt_s->fscache_uniq;
+ ulen = strlen(uniq);
+ }
+ } else {
+ if (!(ctx->options & NFS_OPTION_FSCACHE))
+ return 0;
+ if (ctx->fscache_uniq) {
+ uniq = ctx->fscache_uniq;
+ ulen = strlen(ctx->fscache_uniq);
+ }
+ }
+
+ return nfs_fscache_get_super_cookie(sb, uniq, ulen);
+}
+#else
+static int nfs_get_cache_cookie(struct super_block *sb,
+ struct nfs_fs_context *ctx)
+{
+ return 0;
+}
+#endif
+
+int nfs_get_tree_common(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct super_block *s;
+ int (*compare_super)(struct super_block *, struct fs_context *) = nfs_compare_super;
+ struct nfs_server *server = ctx->server;
+ int error;
+
+ ctx->server = NULL;
+ if (IS_ERR(server))
+ return PTR_ERR(server);
+
+ if (server->flags & NFS_MOUNT_UNSHARED)
+ compare_super = NULL;
+
+ /* -o noac implies -o sync */
+ if (server->flags & NFS_MOUNT_NOAC)
+ fc->sb_flags |= SB_SYNCHRONOUS;
+
+ if (ctx->clone_data.sb)
+ if (ctx->clone_data.sb->s_flags & SB_SYNCHRONOUS)
+ fc->sb_flags |= SB_SYNCHRONOUS;
+
+ /* Get a superblock - note that we may end up sharing one that already exists */
+ fc->s_fs_info = server;
+ s = sget_fc(fc, compare_super, nfs_set_super);
+ fc->s_fs_info = NULL;
+ if (IS_ERR(s)) {
+ error = PTR_ERR(s);
+ nfs_errorf(fc, "NFS: Couldn't get superblock");
+ goto out_err_nosb;
+ }
+
+ if (s->s_fs_info != server) {
+ nfs_free_server(server);
+ server = NULL;
+ } else {
+ error = super_setup_bdi_name(s, "%u:%u", MAJOR(server->s_dev),
+ MINOR(server->s_dev));
+ if (error)
+ goto error_splat_super;
+ s->s_bdi->io_pages = server->rpages;
+ server->super = s;
+ }
+
+ if (!s->s_root) {
+ unsigned bsize = ctx->clone_data.inherited_bsize;
+ /* initial superblock/root creation */
+ nfs_fill_super(s, ctx);
+ if (bsize) {
+ s->s_blocksize_bits = bsize;
+ s->s_blocksize = 1U << bsize;
+ }
+ error = nfs_get_cache_cookie(s, ctx);
+ if (error < 0)
+ goto error_splat_super;
+ }
+
+ error = nfs_get_root(s, fc);
+ if (error < 0) {
+ nfs_errorf(fc, "NFS: Couldn't get root dentry");
+ goto error_splat_super;
+ }
+
+ s->s_flags |= SB_ACTIVE;
+ error = 0;
+
+out:
+ return error;
+
+out_err_nosb:
+ nfs_free_server(server);
+ goto out;
+error_splat_super:
+ deactivate_locked_super(s);
+ goto out;
+}
+
+/*
+ * Destroy an NFS superblock
+ */
+void nfs_kill_super(struct super_block *s)
+{
+ struct nfs_server *server = NFS_SB(s);
+
+ nfs_sysfs_move_sb_to_server(server);
+ kill_anon_super(s);
+
+ nfs_fscache_release_super_cookie(s);
+
+ nfs_free_server(server);
+}
+EXPORT_SYMBOL_GPL(nfs_kill_super);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+
+/*
+ * NFS v4 module parameters need to stay in the
+ * NFS client for backwards compatibility
+ */
+unsigned int nfs_callback_set_tcpport;
+unsigned short nfs_callback_nr_threads;
+/* Default cache timeout is 10 minutes */
+unsigned int nfs_idmap_cache_timeout = 600;
+/* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */
+bool nfs4_disable_idmapping = true;
+unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE;
+unsigned short max_session_cb_slots = NFS4_DEF_CB_SLOT_TABLE_SIZE;
+unsigned short send_implementation_id = 1;
+char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
+bool recover_lost_locks = false;
+
+EXPORT_SYMBOL_GPL(nfs_callback_nr_threads);
+EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
+EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout);
+EXPORT_SYMBOL_GPL(nfs4_disable_idmapping);
+EXPORT_SYMBOL_GPL(max_session_slots);
+EXPORT_SYMBOL_GPL(max_session_cb_slots);
+EXPORT_SYMBOL_GPL(send_implementation_id);
+EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier);
+EXPORT_SYMBOL_GPL(recover_lost_locks);
+
+#define NFS_CALLBACK_MAXPORTNR (65535U)
+
+static int param_set_portnr(const char *val, const struct kernel_param *kp)
+{
+ unsigned long num;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+ ret = kstrtoul(val, 0, &num);
+ if (ret || num > NFS_CALLBACK_MAXPORTNR)
+ return -EINVAL;
+ *((unsigned int *)kp->arg) = num;
+ return 0;
+}
+static const struct kernel_param_ops param_ops_portnr = {
+ .set = param_set_portnr,
+ .get = param_get_uint,
+};
+#define param_check_portnr(name, p) __param_check(name, p, unsigned int)
+
+module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
+module_param_named(callback_nr_threads, nfs_callback_nr_threads, ushort, 0644);
+MODULE_PARM_DESC(callback_nr_threads, "Number of threads that will be "
+ "assigned to the NFSv4 callback channels.");
+module_param(nfs_idmap_cache_timeout, int, 0644);
+module_param(nfs4_disable_idmapping, bool, 0644);
+module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier,
+ NFS4_CLIENT_ID_UNIQ_LEN, 0600);
+MODULE_PARM_DESC(nfs4_disable_idmapping,
+ "Turn off NFSv4 idmapping when using 'sec=sys'");
+module_param(max_session_slots, ushort, 0644);
+MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
+ "requests the client will negotiate");
+module_param(max_session_cb_slots, ushort, 0644);
+MODULE_PARM_DESC(max_session_cb_slots, "Maximum number of parallel NFSv4.1 "
+ "callbacks the client will process for a given server");
+module_param(send_implementation_id, ushort, 0644);
+MODULE_PARM_DESC(send_implementation_id,
+ "Send implementation ID with NFSv4.1 exchange_id");
+MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string");
+
+module_param(recover_lost_locks, bool, 0644);
+MODULE_PARM_DESC(recover_lost_locks,
+ "If the server reports that a lock might be lost, "
+ "try to recover it risking data corruption.");
+
+
+#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
new file mode 100644
index 0000000000..0e27a2e4e6
--- /dev/null
+++ b/fs/nfs/symlink.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/symlink.c
+ *
+ * Copyright (C) 1992 Rick Sladkey
+ *
+ * Optimization changes Copyright (C) 1994 Florian La Roche
+ *
+ * Jun 7 1999, cache symlink lookups in the page cache. -DaveM
+ *
+ * nfs symlink handling code
+ */
+
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs2.h>
+#include <linux/nfs_fs.h>
+#include <linux/pagemap.h>
+#include <linux/stat.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+
+/* Symlink caching in the page cache is even more simplistic
+ * and straight-forward than readdir caching.
+ */
+
+static int nfs_symlink_filler(struct file *file, struct folio *folio)
+{
+ struct inode *inode = folio->mapping->host;
+ int error;
+
+ error = NFS_PROTO(inode)->readlink(inode, &folio->page, 0, PAGE_SIZE);
+ if (error < 0)
+ goto error;
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ return 0;
+
+error:
+ folio_set_error(folio);
+ folio_unlock(folio);
+ return -EIO;
+}
+
+static const char *nfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ struct page *page;
+ void *err;
+
+ if (!dentry) {
+ err = ERR_PTR(nfs_revalidate_mapping_rcu(inode));
+ if (err)
+ return err;
+ page = find_get_page(inode->i_mapping, 0);
+ if (!page)
+ return ERR_PTR(-ECHILD);
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-ECHILD);
+ }
+ } else {
+ err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
+ if (err)
+ return err;
+ page = read_cache_page(&inode->i_data, 0, nfs_symlink_filler,
+ NULL);
+ if (IS_ERR(page))
+ return ERR_CAST(page);
+ }
+ set_delayed_call(done, page_put_link, page);
+ return page_address(page);
+}
+
+/*
+ * symlinks can't do much...
+ */
+const struct inode_operations nfs_symlink_inode_operations = {
+ .get_link = nfs_get_link,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+};
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
new file mode 100644
index 0000000000..f39e2089bc
--- /dev/null
+++ b/fs/nfs/sysctl.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/sysctl.c
+ *
+ * Sysctl interface to NFS parameters
+ */
+#include <linux/types.h>
+#include <linux/linkage.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+#include <linux/nfs_fs.h>
+
+static struct ctl_table_header *nfs_callback_sysctl_table;
+
+static struct ctl_table nfs_cb_sysctls[] = {
+ {
+ .procname = "nfs_mountpoint_timeout",
+ .data = &nfs_mountpoint_expiry_timeout,
+ .maxlen = sizeof(nfs_mountpoint_expiry_timeout),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nfs_congestion_kb",
+ .data = &nfs_congestion_kb,
+ .maxlen = sizeof(nfs_congestion_kb),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+int nfs_register_sysctl(void)
+{
+ nfs_callback_sysctl_table = register_sysctl("fs/nfs", nfs_cb_sysctls);
+ if (nfs_callback_sysctl_table == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+void nfs_unregister_sysctl(void)
+{
+ unregister_sysctl_table(nfs_callback_sysctl_table);
+ nfs_callback_sysctl_table = NULL;
+}
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
new file mode 100644
index 0000000000..bf378ecd5d
--- /dev/null
+++ b/fs/nfs/sysfs.c
@@ -0,0 +1,361 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Hammerspace Inc
+ */
+
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/nfs_fs.h>
+#include <linux/rcupdate.h>
+#include <linux/lockd/lockd.h>
+
+#include "nfs4_fs.h"
+#include "netns.h"
+#include "sysfs.h"
+
+static struct kset *nfs_kset;
+
+static void nfs_kset_release(struct kobject *kobj)
+{
+ struct kset *kset = container_of(kobj, struct kset, kobj);
+ kfree(kset);
+}
+
+static const struct kobj_ns_type_operations *nfs_netns_object_child_ns_type(
+ const struct kobject *kobj)
+{
+ return &net_ns_type_operations;
+}
+
+static struct kobj_type nfs_kset_type = {
+ .release = nfs_kset_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .child_ns_type = nfs_netns_object_child_ns_type,
+};
+
+int nfs_sysfs_init(void)
+{
+ int ret;
+
+ nfs_kset = kzalloc(sizeof(*nfs_kset), GFP_KERNEL);
+ if (!nfs_kset)
+ return -ENOMEM;
+
+ ret = kobject_set_name(&nfs_kset->kobj, "nfs");
+ if (ret) {
+ kfree(nfs_kset);
+ return ret;
+ }
+
+ nfs_kset->kobj.parent = fs_kobj;
+ nfs_kset->kobj.ktype = &nfs_kset_type;
+ nfs_kset->kobj.kset = NULL;
+
+ ret = kset_register(nfs_kset);
+ if (ret) {
+ kfree(nfs_kset);
+ return ret;
+ }
+
+ return 0;
+}
+
+void nfs_sysfs_exit(void)
+{
+ kset_unregister(nfs_kset);
+}
+
+static ssize_t nfs_netns_identifier_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct nfs_netns_client *c = container_of(kobj,
+ struct nfs_netns_client,
+ kobject);
+ ssize_t ret;
+
+ rcu_read_lock();
+ ret = sysfs_emit(buf, "%s\n", rcu_dereference(c->identifier));
+ rcu_read_unlock();
+ return ret;
+}
+
+/* Strip trailing '\n' */
+static size_t nfs_string_strip(const char *c, size_t len)
+{
+ while (len > 0 && c[len-1] == '\n')
+ --len;
+ return len;
+}
+
+static ssize_t nfs_netns_identifier_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct nfs_netns_client *c = container_of(kobj,
+ struct nfs_netns_client,
+ kobject);
+ const char *old;
+ char *p;
+ size_t len;
+
+ len = nfs_string_strip(buf, min_t(size_t, count, CONTAINER_ID_MAXLEN));
+ if (!len)
+ return 0;
+ p = kmemdup_nul(buf, len, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ old = rcu_dereference_protected(xchg(&c->identifier, (char __rcu *)p), 1);
+ if (old) {
+ synchronize_rcu();
+ kfree(old);
+ }
+ return count;
+}
+
+static void nfs_netns_client_release(struct kobject *kobj)
+{
+ struct nfs_netns_client *c = container_of(kobj,
+ struct nfs_netns_client,
+ kobject);
+
+ kfree(rcu_dereference_raw(c->identifier));
+}
+
+static const void *nfs_netns_client_namespace(const struct kobject *kobj)
+{
+ return container_of(kobj, struct nfs_netns_client, kobject)->net;
+}
+
+static struct kobj_attribute nfs_netns_client_id = __ATTR(identifier,
+ 0644, nfs_netns_identifier_show, nfs_netns_identifier_store);
+
+static struct attribute *nfs_netns_client_attrs[] = {
+ &nfs_netns_client_id.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(nfs_netns_client);
+
+static struct kobj_type nfs_netns_client_type = {
+ .release = nfs_netns_client_release,
+ .default_groups = nfs_netns_client_groups,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .namespace = nfs_netns_client_namespace,
+};
+
+static void nfs_netns_object_release(struct kobject *kobj)
+{
+ struct nfs_netns_client *c = container_of(kobj,
+ struct nfs_netns_client,
+ nfs_net_kobj);
+ kfree(c);
+}
+
+static const void *nfs_netns_namespace(const struct kobject *kobj)
+{
+ return container_of(kobj, struct nfs_netns_client, nfs_net_kobj)->net;
+}
+
+static struct kobj_type nfs_netns_object_type = {
+ .release = nfs_netns_object_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .namespace = nfs_netns_namespace,
+};
+
+static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent,
+ struct net *net)
+{
+ struct nfs_netns_client *p;
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (p) {
+ p->net = net;
+ p->kobject.kset = nfs_kset;
+ p->nfs_net_kobj.kset = nfs_kset;
+
+ if (kobject_init_and_add(&p->nfs_net_kobj, &nfs_netns_object_type,
+ parent, "net") != 0) {
+ kobject_put(&p->nfs_net_kobj);
+ return NULL;
+ }
+
+ if (kobject_init_and_add(&p->kobject, &nfs_netns_client_type,
+ &p->nfs_net_kobj, "nfs_client") == 0)
+ return p;
+
+ kobject_put(&p->kobject);
+ }
+ return NULL;
+}
+
+void nfs_netns_sysfs_setup(struct nfs_net *netns, struct net *net)
+{
+ struct nfs_netns_client *clp;
+
+ clp = nfs_netns_client_alloc(&nfs_kset->kobj, net);
+ if (clp) {
+ netns->nfs_client = clp;
+ kobject_uevent(&clp->kobject, KOBJ_ADD);
+ }
+}
+
+void nfs_netns_sysfs_destroy(struct nfs_net *netns)
+{
+ struct nfs_netns_client *clp = netns->nfs_client;
+
+ if (clp) {
+ kobject_uevent(&clp->kobject, KOBJ_REMOVE);
+ kobject_del(&clp->kobject);
+ kobject_put(&clp->kobject);
+ kobject_del(&clp->nfs_net_kobj);
+ kobject_put(&clp->nfs_net_kobj);
+ netns->nfs_client = NULL;
+ }
+}
+
+static bool shutdown_match_client(const struct rpc_task *task, const void *data)
+{
+ return true;
+}
+
+static void shutdown_client(struct rpc_clnt *clnt)
+{
+ clnt->cl_shutdown = 1;
+ rpc_cancel_tasks(clnt, -EIO, shutdown_match_client, NULL);
+}
+
+static ssize_t
+shutdown_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct nfs_server *server = container_of(kobj, struct nfs_server, kobj);
+ bool shutdown = server->flags & NFS_MOUNT_SHUTDOWN;
+ return sysfs_emit(buf, "%d\n", shutdown);
+}
+
+static ssize_t
+shutdown_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct nfs_server *server;
+ int ret, val;
+
+ server = container_of(kobj, struct nfs_server, kobj);
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val != 1)
+ return -EINVAL;
+
+ /* already shut down? */
+ if (server->flags & NFS_MOUNT_SHUTDOWN)
+ goto out;
+
+ server->flags |= NFS_MOUNT_SHUTDOWN;
+ shutdown_client(server->client);
+ shutdown_client(server->nfs_client->cl_rpcclient);
+
+ if (!IS_ERR(server->client_acl))
+ shutdown_client(server->client_acl);
+
+ if (server->nlm_host)
+ shutdown_client(server->nlm_host->h_rpcclnt);
+out:
+ return count;
+}
+
+static struct kobj_attribute nfs_sysfs_attr_shutdown = __ATTR_RW(shutdown);
+
+#define RPC_CLIENT_NAME_SIZE 64
+
+void nfs_sysfs_link_rpc_client(struct nfs_server *server,
+ struct rpc_clnt *clnt, const char *uniq)
+{
+ char name[RPC_CLIENT_NAME_SIZE];
+ int ret;
+
+ strcpy(name, clnt->cl_program->name);
+ strcat(name, uniq ? uniq : "");
+ strcat(name, "_client");
+
+ ret = sysfs_create_link_nowarn(&server->kobj,
+ &clnt->cl_sysfs->kobject, name);
+ if (ret < 0)
+ pr_warn("NFS: can't create link to %s in sysfs (%d)\n",
+ name, ret);
+}
+EXPORT_SYMBOL_GPL(nfs_sysfs_link_rpc_client);
+
+static void nfs_sysfs_sb_release(struct kobject *kobj)
+{
+ /* no-op: why? see lib/kobject.c kobject_cleanup() */
+}
+
+static const void *nfs_netns_server_namespace(const struct kobject *kobj)
+{
+ return container_of(kobj, struct nfs_server, kobj)->nfs_client->cl_net;
+}
+
+static struct kobj_type nfs_sb_ktype = {
+ .release = nfs_sysfs_sb_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .namespace = nfs_netns_server_namespace,
+ .child_ns_type = nfs_netns_object_child_ns_type,
+};
+
+void nfs_sysfs_add_server(struct nfs_server *server)
+{
+ int ret;
+
+ ret = kobject_init_and_add(&server->kobj, &nfs_sb_ktype,
+ &nfs_kset->kobj, "server-%d", server->s_sysfs_id);
+ if (ret < 0) {
+ pr_warn("NFS: nfs sysfs add server-%d failed (%d)\n",
+ server->s_sysfs_id, ret);
+ return;
+ }
+ ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_shutdown.attr,
+ nfs_netns_server_namespace(&server->kobj));
+ if (ret < 0)
+ pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n",
+ server->s_sysfs_id, ret);
+}
+EXPORT_SYMBOL_GPL(nfs_sysfs_add_server);
+
+void nfs_sysfs_move_server_to_sb(struct super_block *s)
+{
+ struct nfs_server *server = s->s_fs_info;
+ int ret;
+
+ ret = kobject_rename(&server->kobj, s->s_id);
+ if (ret < 0)
+ pr_warn("NFS: rename sysfs %s failed (%d)\n",
+ server->kobj.name, ret);
+}
+
+void nfs_sysfs_move_sb_to_server(struct nfs_server *server)
+{
+ const char *s;
+ int ret = -ENOMEM;
+
+ s = kasprintf(GFP_KERNEL, "server-%d", server->s_sysfs_id);
+ if (s) {
+ ret = kobject_rename(&server->kobj, s);
+ kfree(s);
+ }
+ if (ret < 0)
+ pr_warn("NFS: rename sysfs %s failed (%d)\n",
+ server->kobj.name, ret);
+}
+
+/* unlink, not dec-ref */
+void nfs_sysfs_remove_server(struct nfs_server *server)
+{
+ kobject_del(&server->kobj);
+}
diff --git a/fs/nfs/sysfs.h b/fs/nfs/sysfs.h
new file mode 100644
index 0000000000..c5d1990cad
--- /dev/null
+++ b/fs/nfs/sysfs.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2019 Hammerspace Inc
+ */
+
+#ifndef __NFS_SYSFS_H
+#define __NFS_SYSFS_H
+
+#define CONTAINER_ID_MAXLEN (64)
+
+struct nfs_netns_client {
+ struct kobject kobject;
+ struct kobject nfs_net_kobj;
+ struct net *net;
+ const char __rcu *identifier;
+};
+
+extern struct kobject *nfs_net_kobj;
+
+extern int nfs_sysfs_init(void);
+extern void nfs_sysfs_exit(void);
+
+void nfs_netns_sysfs_setup(struct nfs_net *netns, struct net *net);
+void nfs_netns_sysfs_destroy(struct nfs_net *netns);
+
+void nfs_sysfs_link_rpc_client(struct nfs_server *server,
+ struct rpc_clnt *clnt, const char *sysfs_prefix);
+void nfs_sysfs_add_server(struct nfs_server *s);
+void nfs_sysfs_move_server_to_sb(struct super_block *s);
+void nfs_sysfs_move_sb_to_server(struct nfs_server *s);
+void nfs_sysfs_remove_server(struct nfs_server *s);
+
+#endif
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
new file mode 100644
index 0000000000..150a953a8b
--- /dev/null
+++ b/fs/nfs/unlink.c
@@ -0,0 +1,531 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/unlink.c
+ *
+ * nfs sillydelete handling
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/dcache.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs_fs.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/namei.h>
+#include <linux/fsnotify.h>
+
+#include "internal.h"
+#include "nfs4_fs.h"
+#include "iostat.h"
+#include "delegation.h"
+
+#include "nfstrace.h"
+
+/**
+ * nfs_free_unlinkdata - release data from a sillydelete operation.
+ * @data: pointer to unlink structure.
+ */
+static void
+nfs_free_unlinkdata(struct nfs_unlinkdata *data)
+{
+ put_cred(data->cred);
+ kfree(data->args.name.name);
+ kfree(data);
+}
+
+/**
+ * nfs_async_unlink_done - Sillydelete post-processing
+ * @task: rpc_task of the sillydelete
+ * @calldata: pointer to nfs_unlinkdata
+ *
+ * Do the directory attribute update.
+ */
+static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs_unlinkdata *data = calldata;
+ struct inode *dir = d_inode(data->dentry->d_parent);
+
+ trace_nfs_sillyrename_unlink(data, task->tk_status);
+ if (!NFS_PROTO(dir)->unlink_done(task, dir))
+ rpc_restart_call_prepare(task);
+}
+
+/**
+ * nfs_async_unlink_release - Release the sillydelete data.
+ * @calldata: struct nfs_unlinkdata to release
+ *
+ * We need to call nfs_put_unlinkdata as a 'tk_release' task since the
+ * rpc_task would be freed too.
+ */
+static void nfs_async_unlink_release(void *calldata)
+{
+ struct nfs_unlinkdata *data = calldata;
+ struct dentry *dentry = data->dentry;
+ struct super_block *sb = dentry->d_sb;
+
+ up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
+ d_lookup_done(dentry);
+ nfs_free_unlinkdata(data);
+ dput(dentry);
+ nfs_sb_deactive(sb);
+}
+
+static void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs_unlinkdata *data = calldata;
+ struct inode *dir = d_inode(data->dentry->d_parent);
+ NFS_PROTO(dir)->unlink_rpc_prepare(task, data);
+}
+
+static const struct rpc_call_ops nfs_unlink_ops = {
+ .rpc_call_done = nfs_async_unlink_done,
+ .rpc_release = nfs_async_unlink_release,
+ .rpc_call_prepare = nfs_unlink_prepare,
+};
+
+static void nfs_do_call_unlink(struct inode *inode, struct nfs_unlinkdata *data)
+{
+ struct rpc_message msg = {
+ .rpc_argp = &data->args,
+ .rpc_resp = &data->res,
+ .rpc_cred = data->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_message = &msg,
+ .callback_ops = &nfs_unlink_ops,
+ .callback_data = data,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
+ };
+ struct rpc_task *task;
+ struct inode *dir = d_inode(data->dentry->d_parent);
+
+ if (nfs_server_capable(inode, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
+ nfs_sb_active(dir->i_sb);
+ data->args.fh = NFS_FH(dir);
+ nfs_fattr_init(data->res.dir_attr);
+
+ NFS_PROTO(dir)->unlink_setup(&msg, data->dentry, inode);
+
+ task_setup_data.rpc_client = NFS_CLIENT(dir);
+ task = rpc_run_task(&task_setup_data);
+ if (!IS_ERR(task))
+ rpc_put_task_async(task);
+}
+
+static int nfs_call_unlink(struct dentry *dentry, struct inode *inode, struct nfs_unlinkdata *data)
+{
+ struct inode *dir = d_inode(dentry->d_parent);
+ struct dentry *alias;
+
+ down_read_non_owner(&NFS_I(dir)->rmdir_sem);
+ alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
+ if (IS_ERR(alias)) {
+ up_read_non_owner(&NFS_I(dir)->rmdir_sem);
+ return 0;
+ }
+ if (!d_in_lookup(alias)) {
+ int ret;
+ void *devname_garbage = NULL;
+
+ /*
+ * Hey, we raced with lookup... See if we need to transfer
+ * the sillyrename information to the aliased dentry.
+ */
+ spin_lock(&alias->d_lock);
+ if (d_really_is_positive(alias) &&
+ !nfs_compare_fh(NFS_FH(inode), NFS_FH(d_inode(alias))) &&
+ !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
+ devname_garbage = alias->d_fsdata;
+ alias->d_fsdata = data;
+ alias->d_flags |= DCACHE_NFSFS_RENAMED;
+ ret = 1;
+ } else
+ ret = 0;
+ spin_unlock(&alias->d_lock);
+ dput(alias);
+ up_read_non_owner(&NFS_I(dir)->rmdir_sem);
+ /*
+ * If we'd displaced old cached devname, free it. At that
+ * point dentry is definitely not a root, so we won't need
+ * that anymore.
+ */
+ kfree(devname_garbage);
+ return ret;
+ }
+ data->dentry = alias;
+ nfs_do_call_unlink(inode, data);
+ return 1;
+}
+
+/**
+ * nfs_async_unlink - asynchronous unlinking of a file
+ * @dentry: parent directory of dentry
+ * @name: name of dentry to unlink
+ */
+static int
+nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
+{
+ struct nfs_unlinkdata *data;
+ int status = -ENOMEM;
+ void *devname_garbage = NULL;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (data == NULL)
+ goto out;
+ data->args.name.name = kstrdup(name->name, GFP_KERNEL);
+ if (!data->args.name.name)
+ goto out_free;
+ data->args.name.len = name->len;
+
+ data->cred = get_current_cred();
+ data->res.dir_attr = &data->dir_attr;
+ init_waitqueue_head(&data->wq);
+
+ status = -EBUSY;
+ spin_lock(&dentry->d_lock);
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+ goto out_unlock;
+ dentry->d_flags |= DCACHE_NFSFS_RENAMED;
+ devname_garbage = dentry->d_fsdata;
+ dentry->d_fsdata = data;
+ spin_unlock(&dentry->d_lock);
+ /*
+ * If we'd displaced old cached devname, free it. At that
+ * point dentry is definitely not a root, so we won't need
+ * that anymore.
+ */
+ kfree(devname_garbage);
+ return 0;
+out_unlock:
+ spin_unlock(&dentry->d_lock);
+ put_cred(data->cred);
+ kfree(data->args.name.name);
+out_free:
+ kfree(data);
+out:
+ return status;
+}
+
+/**
+ * nfs_complete_unlink - Initialize completion of the sillydelete
+ * @dentry: dentry to delete
+ * @inode: inode
+ *
+ * Since we're most likely to be called by dentry_iput(), we
+ * only use the dentry to find the sillydelete. We then copy the name
+ * into the qstr.
+ */
+void
+nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
+{
+ struct nfs_unlinkdata *data;
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+ data = dentry->d_fsdata;
+ dentry->d_fsdata = NULL;
+ spin_unlock(&dentry->d_lock);
+
+ if (NFS_STALE(inode) || !nfs_call_unlink(dentry, inode, data))
+ nfs_free_unlinkdata(data);
+}
+
+/* Cancel a queued async unlink. Called when a sillyrename run fails. */
+static void
+nfs_cancel_async_unlink(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+ struct nfs_unlinkdata *data = dentry->d_fsdata;
+
+ dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+ dentry->d_fsdata = NULL;
+ spin_unlock(&dentry->d_lock);
+ nfs_free_unlinkdata(data);
+ return;
+ }
+ spin_unlock(&dentry->d_lock);
+}
+
+/**
+ * nfs_async_rename_done - Sillyrename post-processing
+ * @task: rpc_task of the sillyrename
+ * @calldata: nfs_renamedata for the sillyrename
+ *
+ * Do the directory attribute updates and the d_move
+ */
+static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs_renamedata *data = calldata;
+ struct inode *old_dir = data->old_dir;
+ struct inode *new_dir = data->new_dir;
+ struct dentry *old_dentry = data->old_dentry;
+
+ trace_nfs_sillyrename_rename(old_dir, old_dentry,
+ new_dir, data->new_dentry, task->tk_status);
+ if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
+ rpc_restart_call_prepare(task);
+ return;
+ }
+
+ if (data->complete)
+ data->complete(task, data);
+}
+
+/**
+ * nfs_async_rename_release - Release the sillyrename data.
+ * @calldata: the struct nfs_renamedata to be released
+ */
+static void nfs_async_rename_release(void *calldata)
+{
+ struct nfs_renamedata *data = calldata;
+ struct super_block *sb = data->old_dir->i_sb;
+
+ if (d_really_is_positive(data->old_dentry))
+ nfs_mark_for_revalidate(d_inode(data->old_dentry));
+
+ /* The result of the rename is unknown. Play it safe by
+ * forcing a new lookup */
+ if (data->cancelled) {
+ spin_lock(&data->old_dir->i_lock);
+ nfs_force_lookup_revalidate(data->old_dir);
+ spin_unlock(&data->old_dir->i_lock);
+ if (data->new_dir != data->old_dir) {
+ spin_lock(&data->new_dir->i_lock);
+ nfs_force_lookup_revalidate(data->new_dir);
+ spin_unlock(&data->new_dir->i_lock);
+ }
+ }
+
+ dput(data->old_dentry);
+ dput(data->new_dentry);
+ iput(data->old_dir);
+ iput(data->new_dir);
+ nfs_sb_deactive(sb);
+ put_cred(data->cred);
+ kfree(data);
+}
+
+static void nfs_rename_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs_renamedata *data = calldata;
+ NFS_PROTO(data->old_dir)->rename_rpc_prepare(task, data);
+}
+
+static const struct rpc_call_ops nfs_rename_ops = {
+ .rpc_call_done = nfs_async_rename_done,
+ .rpc_release = nfs_async_rename_release,
+ .rpc_call_prepare = nfs_rename_prepare,
+};
+
+/**
+ * nfs_async_rename - perform an asynchronous rename operation
+ * @old_dir: directory that currently holds the dentry to be renamed
+ * @new_dir: target directory for the rename
+ * @old_dentry: original dentry to be renamed
+ * @new_dentry: dentry to which the old_dentry should be renamed
+ * @complete: Function to run on successful completion
+ *
+ * It's expected that valid references to the dentries and inodes are held
+ */
+struct rpc_task *
+nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
+ struct dentry *old_dentry, struct dentry *new_dentry,
+ void (*complete)(struct rpc_task *, struct nfs_renamedata *))
+{
+ struct nfs_renamedata *data;
+ struct rpc_message msg = { };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_message = &msg,
+ .callback_ops = &nfs_rename_ops,
+ .workqueue = nfsiod_workqueue,
+ .rpc_client = NFS_CLIENT(old_dir),
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
+ };
+
+ if (nfs_server_capable(old_dir, NFS_CAP_MOVEABLE) &&
+ nfs_server_capable(new_dir, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (data == NULL)
+ return ERR_PTR(-ENOMEM);
+ task_setup_data.task = &data->task;
+ task_setup_data.callback_data = data;
+
+ data->cred = get_current_cred();
+
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ msg.rpc_cred = data->cred;
+
+ /* set up nfs_renamedata */
+ data->old_dir = old_dir;
+ ihold(old_dir);
+ data->new_dir = new_dir;
+ ihold(new_dir);
+ data->old_dentry = dget(old_dentry);
+ data->new_dentry = dget(new_dentry);
+ nfs_fattr_init(&data->old_fattr);
+ nfs_fattr_init(&data->new_fattr);
+ data->complete = complete;
+
+ /* set up nfs_renameargs */
+ data->args.old_dir = NFS_FH(old_dir);
+ data->args.old_name = &old_dentry->d_name;
+ data->args.new_dir = NFS_FH(new_dir);
+ data->args.new_name = &new_dentry->d_name;
+
+ /* set up nfs_renameres */
+ data->res.old_fattr = &data->old_fattr;
+ data->res.new_fattr = &data->new_fattr;
+
+ nfs_sb_active(old_dir->i_sb);
+
+ NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dentry, new_dentry);
+
+ return rpc_run_task(&task_setup_data);
+}
+
+/*
+ * Perform tasks needed when a sillyrename is done such as cancelling the
+ * queued async unlink if it failed.
+ */
+static void
+nfs_complete_sillyrename(struct rpc_task *task, struct nfs_renamedata *data)
+{
+ struct dentry *dentry = data->old_dentry;
+
+ if (task->tk_status != 0) {
+ nfs_cancel_async_unlink(dentry);
+ return;
+ }
+}
+
+#define SILLYNAME_PREFIX ".nfs"
+#define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1)
+#define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1)
+#define SILLYNAME_COUNTER_LEN ((unsigned)sizeof(unsigned int) << 1)
+#define SILLYNAME_LEN (SILLYNAME_PREFIX_LEN + \
+ SILLYNAME_FILEID_LEN + \
+ SILLYNAME_COUNTER_LEN)
+
+/**
+ * nfs_sillyrename - Perform a silly-rename of a dentry
+ * @dir: inode of directory that contains dentry
+ * @dentry: dentry to be sillyrenamed
+ *
+ * NFSv2/3 is stateless and the server doesn't know when the client is
+ * holding a file open. To prevent application problems when a file is
+ * unlinked while it's still open, the client performs a "silly-rename".
+ * That is, it renames the file to a hidden file in the same directory,
+ * and only performs the unlink once the last reference to it is put.
+ *
+ * The final cleanup is done during dentry_iput.
+ *
+ * (Note: NFSv4 is stateful, and has opens, so in theory an NFSv4 server
+ * could take responsibility for keeping open files referenced. The server
+ * would also need to ensure that opened-but-deleted files were kept over
+ * reboots. However, we may not assume a server does so. (RFC 5661
+ * does provide an OPEN4_RESULT_PRESERVE_UNLINKED flag that a server can
+ * use to advertise that it does this; some day we may take advantage of
+ * it.))
+ */
+int
+nfs_sillyrename(struct inode *dir, struct dentry *dentry)
+{
+ static unsigned int sillycounter;
+ unsigned char silly[SILLYNAME_LEN + 1];
+ unsigned long long fileid;
+ struct dentry *sdentry;
+ struct inode *inode = d_inode(dentry);
+ struct rpc_task *task;
+ int error = -EBUSY;
+
+ dfprintk(VFS, "NFS: silly-rename(%pd2, ct=%d)\n",
+ dentry, d_count(dentry));
+ nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
+
+ /*
+ * We don't allow a dentry to be silly-renamed twice.
+ */
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+ goto out;
+
+ fileid = NFS_FILEID(d_inode(dentry));
+
+ sdentry = NULL;
+ do {
+ int slen;
+ dput(sdentry);
+ sillycounter++;
+ slen = scnprintf(silly, sizeof(silly),
+ SILLYNAME_PREFIX "%0*llx%0*x",
+ SILLYNAME_FILEID_LEN, fileid,
+ SILLYNAME_COUNTER_LEN, sillycounter);
+
+ dfprintk(VFS, "NFS: trying to rename %pd to %s\n",
+ dentry, silly);
+
+ sdentry = lookup_one_len(silly, dentry->d_parent, slen);
+ /*
+ * N.B. Better to return EBUSY here ... it could be
+ * dangerous to delete the file while it's in use.
+ */
+ if (IS_ERR(sdentry))
+ goto out;
+ } while (d_inode(sdentry) != NULL); /* need negative lookup */
+
+ ihold(inode);
+
+ /* queue unlink first. Can't do this from rpc_release as it
+ * has to allocate memory
+ */
+ error = nfs_async_unlink(dentry, &sdentry->d_name);
+ if (error)
+ goto out_dput;
+
+ /* run the rename task, undo unlink if it fails */
+ task = nfs_async_rename(dir, dir, dentry, sdentry,
+ nfs_complete_sillyrename);
+ if (IS_ERR(task)) {
+ error = -EBUSY;
+ nfs_cancel_async_unlink(dentry);
+ goto out_dput;
+ }
+
+ /* wait for the RPC task to complete, unless a SIGKILL intervenes */
+ error = rpc_wait_for_completion_task(task);
+ if (error == 0)
+ error = task->tk_status;
+ switch (error) {
+ case 0:
+ /* The rename succeeded */
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ spin_lock(&inode->i_lock);
+ NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter();
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE |
+ NFS_INO_INVALID_CTIME |
+ NFS_INO_REVAL_FORCED);
+ spin_unlock(&inode->i_lock);
+ d_move(dentry, sdentry);
+ break;
+ case -ERESTARTSYS:
+ /* The result of the rename is unknown. Play it safe by
+ * forcing a new lookup */
+ d_drop(dentry);
+ d_drop(sdentry);
+ }
+ rpc_put_task(task);
+out_dput:
+ iput(inode);
+ dput(sdentry);
+out:
+ return error;
+}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
new file mode 100644
index 0000000000..9d82d50ce0
--- /dev/null
+++ b/fs/nfs/write.c
@@ -0,0 +1,2217 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/write.c
+ *
+ * Write file data over NFS.
+ *
+ * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/writeback.h>
+#include <linux/swap.h>
+#include <linux/migrate.h>
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs_page.h>
+#include <linux/backing-dev.h>
+#include <linux/export.h>
+#include <linux/freezer.h>
+#include <linux/wait.h>
+#include <linux/iversion.h>
+#include <linux/filelock.h>
+
+#include <linux/uaccess.h>
+#include <linux/sched/mm.h>
+
+#include "delegation.h"
+#include "internal.h"
+#include "iostat.h"
+#include "nfs4_fs.h"
+#include "fscache.h"
+#include "pnfs.h"
+
+#include "nfstrace.h"
+
+#define NFSDBG_FACILITY NFSDBG_PAGECACHE
+
+#define MIN_POOL_WRITE (32)
+#define MIN_POOL_COMMIT (4)
+
+struct nfs_io_completion {
+ void (*complete)(void *data);
+ void *data;
+ struct kref refcount;
+};
+
+/*
+ * Local function declarations
+ */
+static void nfs_redirty_request(struct nfs_page *req);
+static const struct rpc_call_ops nfs_commit_ops;
+static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
+static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
+static const struct nfs_rw_ops nfs_rw_write_ops;
+static void nfs_inode_remove_request(struct nfs_page *req);
+static void nfs_clear_request_commit(struct nfs_commit_info *cinfo,
+ struct nfs_page *req);
+static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
+ struct inode *inode);
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+ struct folio *folio);
+
+static struct kmem_cache *nfs_wdata_cachep;
+static mempool_t *nfs_wdata_mempool;
+static struct kmem_cache *nfs_cdata_cachep;
+static mempool_t *nfs_commit_mempool;
+
+struct nfs_commit_data *nfs_commitdata_alloc(void)
+{
+ struct nfs_commit_data *p;
+
+ p = kmem_cache_zalloc(nfs_cdata_cachep, nfs_io_gfp_mask());
+ if (!p) {
+ p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT);
+ if (!p)
+ return NULL;
+ memset(p, 0, sizeof(*p));
+ }
+ INIT_LIST_HEAD(&p->pages);
+ return p;
+}
+EXPORT_SYMBOL_GPL(nfs_commitdata_alloc);
+
+void nfs_commit_free(struct nfs_commit_data *p)
+{
+ mempool_free(p, nfs_commit_mempool);
+}
+EXPORT_SYMBOL_GPL(nfs_commit_free);
+
+static struct nfs_pgio_header *nfs_writehdr_alloc(void)
+{
+ struct nfs_pgio_header *p;
+
+ p = kmem_cache_zalloc(nfs_wdata_cachep, nfs_io_gfp_mask());
+ if (!p) {
+ p = mempool_alloc(nfs_wdata_mempool, GFP_NOWAIT);
+ if (!p)
+ return NULL;
+ memset(p, 0, sizeof(*p));
+ }
+ p->rw_mode = FMODE_WRITE;
+ return p;
+}
+
+static void nfs_writehdr_free(struct nfs_pgio_header *hdr)
+{
+ mempool_free(hdr, nfs_wdata_mempool);
+}
+
+static struct nfs_io_completion *nfs_io_completion_alloc(gfp_t gfp_flags)
+{
+ return kmalloc(sizeof(struct nfs_io_completion), gfp_flags);
+}
+
+static void nfs_io_completion_init(struct nfs_io_completion *ioc,
+ void (*complete)(void *), void *data)
+{
+ ioc->complete = complete;
+ ioc->data = data;
+ kref_init(&ioc->refcount);
+}
+
+static void nfs_io_completion_release(struct kref *kref)
+{
+ struct nfs_io_completion *ioc = container_of(kref,
+ struct nfs_io_completion, refcount);
+ ioc->complete(ioc->data);
+ kfree(ioc);
+}
+
+static void nfs_io_completion_get(struct nfs_io_completion *ioc)
+{
+ if (ioc != NULL)
+ kref_get(&ioc->refcount);
+}
+
+static void nfs_io_completion_put(struct nfs_io_completion *ioc)
+{
+ if (ioc != NULL)
+ kref_put(&ioc->refcount, nfs_io_completion_release);
+}
+
+static void
+nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode)
+{
+ if (!test_and_set_bit(PG_INODE_REF, &req->wb_flags)) {
+ kref_get(&req->wb_kref);
+ atomic_long_inc(&NFS_I(inode)->nrequests);
+ }
+}
+
+static int
+nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
+{
+ int ret;
+
+ if (!test_bit(PG_REMOVE, &req->wb_flags))
+ return 0;
+ ret = nfs_page_group_lock(req);
+ if (ret)
+ return ret;
+ if (test_and_clear_bit(PG_REMOVE, &req->wb_flags))
+ nfs_page_set_inode_ref(req, inode);
+ nfs_page_group_unlock(req);
+ return 0;
+}
+
+static struct nfs_page *nfs_folio_private_request(struct folio *folio)
+{
+ return folio_get_private(folio);
+}
+
+/**
+ * nfs_folio_find_private_request - find head request associated with a folio
+ * @folio: pointer to folio
+ *
+ * must be called while holding the inode lock.
+ *
+ * returns matching head request with reference held, or NULL if not found.
+ */
+static struct nfs_page *nfs_folio_find_private_request(struct folio *folio)
+{
+ struct address_space *mapping = folio_file_mapping(folio);
+ struct nfs_page *req;
+
+ if (!folio_test_private(folio))
+ return NULL;
+ spin_lock(&mapping->private_lock);
+ req = nfs_folio_private_request(folio);
+ if (req) {
+ WARN_ON_ONCE(req->wb_head != req);
+ kref_get(&req->wb_kref);
+ }
+ spin_unlock(&mapping->private_lock);
+ return req;
+}
+
+static struct nfs_page *nfs_folio_find_swap_request(struct folio *folio)
+{
+ struct inode *inode = folio_file_mapping(folio)->host;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_page *req = NULL;
+ if (!folio_test_swapcache(folio))
+ return NULL;
+ mutex_lock(&nfsi->commit_mutex);
+ if (folio_test_swapcache(folio)) {
+ req = nfs_page_search_commits_for_head_request_locked(nfsi,
+ folio);
+ if (req) {
+ WARN_ON_ONCE(req->wb_head != req);
+ kref_get(&req->wb_kref);
+ }
+ }
+ mutex_unlock(&nfsi->commit_mutex);
+ return req;
+}
+
+/**
+ * nfs_folio_find_head_request - find head request associated with a folio
+ * @folio: pointer to folio
+ *
+ * returns matching head request with reference held, or NULL if not found.
+ */
+static struct nfs_page *nfs_folio_find_head_request(struct folio *folio)
+{
+ struct nfs_page *req;
+
+ req = nfs_folio_find_private_request(folio);
+ if (!req)
+ req = nfs_folio_find_swap_request(folio);
+ return req;
+}
+
+static struct nfs_page *nfs_folio_find_and_lock_request(struct folio *folio)
+{
+ struct inode *inode = folio_file_mapping(folio)->host;
+ struct nfs_page *req, *head;
+ int ret;
+
+ for (;;) {
+ req = nfs_folio_find_head_request(folio);
+ if (!req)
+ return req;
+ head = nfs_page_group_lock_head(req);
+ if (head != req)
+ nfs_release_request(req);
+ if (IS_ERR(head))
+ return head;
+ ret = nfs_cancel_remove_inode(head, inode);
+ if (ret < 0) {
+ nfs_unlock_and_release_request(head);
+ return ERR_PTR(ret);
+ }
+ /* Ensure that nobody removed the request before we locked it */
+ if (head == nfs_folio_private_request(folio))
+ break;
+ if (folio_test_swapcache(folio))
+ break;
+ nfs_unlock_and_release_request(head);
+ }
+ return head;
+}
+
+/* Adjust the file length if we're writing beyond the end */
+static void nfs_grow_file(struct folio *folio, unsigned int offset,
+ unsigned int count)
+{
+ struct inode *inode = folio_file_mapping(folio)->host;
+ loff_t end, i_size;
+ pgoff_t end_index;
+
+ spin_lock(&inode->i_lock);
+ i_size = i_size_read(inode);
+ end_index = ((i_size - 1) >> folio_shift(folio)) << folio_order(folio);
+ if (i_size > 0 && folio_index(folio) < end_index)
+ goto out;
+ end = folio_file_pos(folio) + (loff_t)offset + (loff_t)count;
+ if (i_size >= end)
+ goto out;
+ trace_nfs_size_grow(inode, end);
+ i_size_write(inode, end);
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
+ nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
+out:
+ spin_unlock(&inode->i_lock);
+ nfs_fscache_invalidate(inode, 0);
+}
+
+/* A writeback failed: mark the page as bad, and invalidate the page cache */
+static void nfs_set_pageerror(struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+
+ nfs_zap_mapping(mapping->host, mapping);
+ /* Force file size revalidation */
+ spin_lock(&inode->i_lock);
+ nfs_set_cache_invalid(inode, NFS_INO_REVAL_FORCED |
+ NFS_INO_INVALID_CHANGE |
+ NFS_INO_INVALID_SIZE);
+ spin_unlock(&inode->i_lock);
+}
+
+static void nfs_mapping_set_error(struct folio *folio, int error)
+{
+ struct address_space *mapping = folio_file_mapping(folio);
+
+ folio_set_error(folio);
+ filemap_set_wb_err(mapping, error);
+ if (mapping->host)
+ errseq_set(&mapping->host->i_sb->s_wb_err,
+ error == -ENOSPC ? -ENOSPC : -EIO);
+ nfs_set_pageerror(mapping);
+}
+
+/*
+ * nfs_page_group_search_locked
+ * @head - head request of page group
+ * @page_offset - offset into page
+ *
+ * Search page group with head @head to find a request that contains the
+ * page offset @page_offset.
+ *
+ * Returns a pointer to the first matching nfs request, or NULL if no
+ * match is found.
+ *
+ * Must be called with the page group lock held
+ */
+static struct nfs_page *
+nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset)
+{
+ struct nfs_page *req;
+
+ req = head;
+ do {
+ if (page_offset >= req->wb_pgbase &&
+ page_offset < (req->wb_pgbase + req->wb_bytes))
+ return req;
+
+ req = req->wb_this_page;
+ } while (req != head);
+
+ return NULL;
+}
+
+/*
+ * nfs_page_group_covers_page
+ * @head - head request of page group
+ *
+ * Return true if the page group with head @head covers the whole page,
+ * returns false otherwise
+ */
+static bool nfs_page_group_covers_page(struct nfs_page *req)
+{
+ unsigned int len = nfs_folio_length(nfs_page_to_folio(req));
+ struct nfs_page *tmp;
+ unsigned int pos = 0;
+
+ nfs_page_group_lock(req);
+
+ for (;;) {
+ tmp = nfs_page_group_search_locked(req->wb_head, pos);
+ if (!tmp)
+ break;
+ pos = tmp->wb_pgbase + tmp->wb_bytes;
+ }
+
+ nfs_page_group_unlock(req);
+ return pos >= len;
+}
+
+/* We can set the PG_uptodate flag if we see that a write request
+ * covers the full page.
+ */
+static void nfs_mark_uptodate(struct nfs_page *req)
+{
+ struct folio *folio = nfs_page_to_folio(req);
+
+ if (folio_test_uptodate(folio))
+ return;
+ if (!nfs_page_group_covers_page(req))
+ return;
+ folio_mark_uptodate(folio);
+}
+
+static int wb_priority(struct writeback_control *wbc)
+{
+ int ret = 0;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ ret = FLUSH_COND_STABLE;
+ return ret;
+}
+
+/*
+ * NFS congestion control
+ */
+
+int nfs_congestion_kb;
+
+#define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10))
+#define NFS_CONGESTION_OFF_THRESH \
+ (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
+
+static void nfs_folio_set_writeback(struct folio *folio)
+{
+ struct nfs_server *nfss = NFS_SERVER(folio_file_mapping(folio)->host);
+
+ folio_start_writeback(folio);
+ if (atomic_long_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH)
+ nfss->write_congested = 1;
+}
+
+static void nfs_folio_end_writeback(struct folio *folio)
+{
+ struct nfs_server *nfss = NFS_SERVER(folio_file_mapping(folio)->host);
+
+ folio_end_writeback(folio);
+ if (atomic_long_dec_return(&nfss->writeback) <
+ NFS_CONGESTION_OFF_THRESH)
+ nfss->write_congested = 0;
+}
+
+static void nfs_page_end_writeback(struct nfs_page *req)
+{
+ if (nfs_page_group_sync_on_bit(req, PG_WB_END)) {
+ nfs_unlock_request(req);
+ nfs_folio_end_writeback(nfs_page_to_folio(req));
+ } else
+ nfs_unlock_request(req);
+}
+
+/*
+ * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests
+ *
+ * @destroy_list - request list (using wb_this_page) terminated by @old_head
+ * @old_head - the old head of the list
+ *
+ * All subrequests must be locked and removed from all lists, so at this point
+ * they are only "active" in this function, and possibly in nfs_wait_on_request
+ * with a reference held by some other context.
+ */
+static void
+nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
+ struct nfs_page *old_head,
+ struct inode *inode)
+{
+ while (destroy_list) {
+ struct nfs_page *subreq = destroy_list;
+
+ destroy_list = (subreq->wb_this_page == old_head) ?
+ NULL : subreq->wb_this_page;
+
+ /* Note: lock subreq in order to change subreq->wb_head */
+ nfs_page_set_headlock(subreq);
+ WARN_ON_ONCE(old_head != subreq->wb_head);
+
+ /* make sure old group is not used */
+ subreq->wb_this_page = subreq;
+ subreq->wb_head = subreq;
+
+ clear_bit(PG_REMOVE, &subreq->wb_flags);
+
+ /* Note: races with nfs_page_group_destroy() */
+ if (!kref_read(&subreq->wb_kref)) {
+ /* Check if we raced with nfs_page_group_destroy() */
+ if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) {
+ nfs_page_clear_headlock(subreq);
+ nfs_free_request(subreq);
+ } else
+ nfs_page_clear_headlock(subreq);
+ continue;
+ }
+ nfs_page_clear_headlock(subreq);
+
+ nfs_release_request(old_head);
+
+ if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) {
+ nfs_release_request(subreq);
+ atomic_long_dec(&NFS_I(inode)->nrequests);
+ }
+
+ /* subreq is now totally disconnected from page group or any
+ * write / commit lists. last chance to wake any waiters */
+ nfs_unlock_and_release_request(subreq);
+ }
+}
+
+/*
+ * nfs_join_page_group - destroy subrequests of the head req
+ * @head: the page used to lookup the "page group" of nfs_page structures
+ * @inode: Inode to which the request belongs.
+ *
+ * This function joins all sub requests to the head request by first
+ * locking all requests in the group, cancelling any pending operations
+ * and finally updating the head request to cover the whole range covered by
+ * the (former) group. All subrequests are removed from any write or commit
+ * lists, unlinked from the group and destroyed.
+ */
+void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo,
+ struct inode *inode)
+{
+ struct nfs_page *subreq;
+ struct nfs_page *destroy_list = NULL;
+ unsigned int pgbase, off, bytes;
+
+ pgbase = head->wb_pgbase;
+ bytes = head->wb_bytes;
+ off = head->wb_offset;
+ for (subreq = head->wb_this_page; subreq != head;
+ subreq = subreq->wb_this_page) {
+ /* Subrequests should always form a contiguous range */
+ if (pgbase > subreq->wb_pgbase) {
+ off -= pgbase - subreq->wb_pgbase;
+ bytes += pgbase - subreq->wb_pgbase;
+ pgbase = subreq->wb_pgbase;
+ }
+ bytes = max(subreq->wb_pgbase + subreq->wb_bytes
+ - pgbase, bytes);
+ }
+
+ /* Set the head request's range to cover the former page group */
+ head->wb_pgbase = pgbase;
+ head->wb_bytes = bytes;
+ head->wb_offset = off;
+
+ /* Now that all requests are locked, make sure they aren't on any list.
+ * Commit list removal accounting is done after locks are dropped */
+ subreq = head;
+ do {
+ nfs_clear_request_commit(cinfo, subreq);
+ subreq = subreq->wb_this_page;
+ } while (subreq != head);
+
+ /* unlink subrequests from head, destroy them later */
+ if (head->wb_this_page != head) {
+ /* destroy list will be terminated by head */
+ destroy_list = head->wb_this_page;
+ head->wb_this_page = head;
+ }
+
+ nfs_destroy_unlinked_subrequests(destroy_list, head, inode);
+}
+
+/*
+ * nfs_lock_and_join_requests - join all subreqs to the head req
+ * @folio: the folio used to lookup the "page group" of nfs_page structures
+ *
+ * This function joins all sub requests to the head request by first
+ * locking all requests in the group, cancelling any pending operations
+ * and finally updating the head request to cover the whole range covered by
+ * the (former) group. All subrequests are removed from any write or commit
+ * lists, unlinked from the group and destroyed.
+ *
+ * Returns a locked, referenced pointer to the head request - which after
+ * this call is guaranteed to be the only request associated with the page.
+ * Returns NULL if no requests are found for @folio, or a ERR_PTR if an
+ * error was encountered.
+ */
+static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
+{
+ struct inode *inode = folio_file_mapping(folio)->host;
+ struct nfs_page *head;
+ struct nfs_commit_info cinfo;
+ int ret;
+
+ nfs_init_cinfo_from_inode(&cinfo, inode);
+ /*
+ * A reference is taken only on the head request which acts as a
+ * reference to the whole page group - the group will not be destroyed
+ * until the head reference is released.
+ */
+ head = nfs_folio_find_and_lock_request(folio);
+ if (IS_ERR_OR_NULL(head))
+ return head;
+
+ /* lock each request in the page group */
+ ret = nfs_page_group_lock_subrequests(head);
+ if (ret < 0) {
+ nfs_unlock_and_release_request(head);
+ return ERR_PTR(ret);
+ }
+
+ nfs_join_page_group(head, &cinfo, inode);
+
+ return head;
+}
+
+static void nfs_write_error(struct nfs_page *req, int error)
+{
+ trace_nfs_write_error(nfs_page_to_inode(req), req, error);
+ nfs_mapping_set_error(nfs_page_to_folio(req), error);
+ nfs_inode_remove_request(req);
+ nfs_page_end_writeback(req);
+ nfs_release_request(req);
+}
+
+/*
+ * Find an associated nfs write request, and prepare to flush it out
+ * May return an error if the user signalled nfs_wait_on_request().
+ */
+static int nfs_page_async_flush(struct folio *folio,
+ struct writeback_control *wbc,
+ struct nfs_pageio_descriptor *pgio)
+{
+ struct nfs_page *req;
+ int ret = 0;
+
+ req = nfs_lock_and_join_requests(folio);
+ if (!req)
+ goto out;
+ ret = PTR_ERR(req);
+ if (IS_ERR(req))
+ goto out;
+
+ nfs_folio_set_writeback(folio);
+ WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
+
+ /* If there is a fatal error that covers this write, just exit */
+ ret = pgio->pg_error;
+ if (nfs_error_is_fatal_on_server(ret))
+ goto out_launder;
+
+ ret = 0;
+ if (!nfs_pageio_add_request(pgio, req)) {
+ ret = pgio->pg_error;
+ /*
+ * Remove the problematic req upon fatal errors on the server
+ */
+ if (nfs_error_is_fatal_on_server(ret))
+ goto out_launder;
+ if (wbc->sync_mode == WB_SYNC_NONE)
+ ret = AOP_WRITEPAGE_ACTIVATE;
+ folio_redirty_for_writepage(wbc, folio);
+ nfs_redirty_request(req);
+ pgio->pg_error = 0;
+ } else
+ nfs_add_stats(folio_file_mapping(folio)->host,
+ NFSIOS_WRITEPAGES, 1);
+out:
+ return ret;
+out_launder:
+ nfs_write_error(req, ret);
+ return 0;
+}
+
+static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc,
+ struct nfs_pageio_descriptor *pgio)
+{
+ nfs_pageio_cond_complete(pgio, folio_index(folio));
+ return nfs_page_async_flush(folio, wbc, pgio);
+}
+
+/*
+ * Write an mmapped page to the server.
+ */
+static int nfs_writepage_locked(struct folio *folio,
+ struct writeback_control *wbc)
+{
+ struct nfs_pageio_descriptor pgio;
+ struct inode *inode = folio_file_mapping(folio)->host;
+ int err;
+
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ NFS_SERVER(inode)->write_congested)
+ return AOP_WRITEPAGE_ACTIVATE;
+
+ nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
+ nfs_pageio_init_write(&pgio, inode, 0, false,
+ &nfs_async_write_completion_ops);
+ err = nfs_do_writepage(folio, wbc, &pgio);
+ pgio.pg_error = 0;
+ nfs_pageio_complete(&pgio);
+ return err;
+}
+
+int nfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct folio *folio = page_folio(page);
+ int ret;
+
+ ret = nfs_writepage_locked(folio, wbc);
+ if (ret != AOP_WRITEPAGE_ACTIVATE)
+ unlock_page(page);
+ return ret;
+}
+
+static int nfs_writepages_callback(struct folio *folio,
+ struct writeback_control *wbc, void *data)
+{
+ int ret;
+
+ ret = nfs_do_writepage(folio, wbc, data);
+ if (ret != AOP_WRITEPAGE_ACTIVATE)
+ folio_unlock(folio);
+ return ret;
+}
+
+static void nfs_io_completion_commit(void *inode)
+{
+ nfs_commit_inode(inode, 0);
+}
+
+int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct nfs_pageio_descriptor pgio;
+ struct nfs_io_completion *ioc = NULL;
+ unsigned int mntflags = NFS_SERVER(inode)->flags;
+ int priority = 0;
+ int err;
+
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ NFS_SERVER(inode)->write_congested)
+ return 0;
+
+ nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
+
+ if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate ||
+ wbc->for_background || wbc->for_sync || wbc->for_reclaim) {
+ ioc = nfs_io_completion_alloc(GFP_KERNEL);
+ if (ioc)
+ nfs_io_completion_init(ioc, nfs_io_completion_commit,
+ inode);
+ priority = wb_priority(wbc);
+ }
+
+ do {
+ nfs_pageio_init_write(&pgio, inode, priority, false,
+ &nfs_async_write_completion_ops);
+ pgio.pg_io_completion = ioc;
+ err = write_cache_pages(mapping, wbc, nfs_writepages_callback,
+ &pgio);
+ pgio.pg_error = 0;
+ nfs_pageio_complete(&pgio);
+ } while (err < 0 && !nfs_error_is_fatal(err));
+ nfs_io_completion_put(ioc);
+
+ if (err < 0)
+ goto out_err;
+ return 0;
+out_err:
+ return err;
+}
+
+/*
+ * Insert a write request into an inode
+ */
+static void nfs_inode_add_request(struct nfs_page *req)
+{
+ struct folio *folio = nfs_page_to_folio(req);
+ struct address_space *mapping = folio_file_mapping(folio);
+ struct nfs_inode *nfsi = NFS_I(mapping->host);
+
+ WARN_ON_ONCE(req->wb_this_page != req);
+
+ /* Lock the request! */
+ nfs_lock_request(req);
+
+ /*
+ * Swap-space should not get truncated. Hence no need to plug the race
+ * with invalidate/truncate.
+ */
+ spin_lock(&mapping->private_lock);
+ if (likely(!folio_test_swapcache(folio))) {
+ set_bit(PG_MAPPED, &req->wb_flags);
+ folio_set_private(folio);
+ folio->private = req;
+ }
+ spin_unlock(&mapping->private_lock);
+ atomic_long_inc(&nfsi->nrequests);
+ /* this a head request for a page group - mark it as having an
+ * extra reference so sub groups can follow suit.
+ * This flag also informs pgio layer when to bump nrequests when
+ * adding subrequests. */
+ WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags));
+ kref_get(&req->wb_kref);
+}
+
+/*
+ * Remove a write request from an inode
+ */
+static void nfs_inode_remove_request(struct nfs_page *req)
+{
+ struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req));
+
+ if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
+ struct folio *folio = nfs_page_to_folio(req->wb_head);
+ struct address_space *mapping = folio_file_mapping(folio);
+
+ spin_lock(&mapping->private_lock);
+ if (likely(folio && !folio_test_swapcache(folio))) {
+ folio->private = NULL;
+ folio_clear_private(folio);
+ clear_bit(PG_MAPPED, &req->wb_head->wb_flags);
+ }
+ spin_unlock(&mapping->private_lock);
+ }
+
+ if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
+ atomic_long_dec(&nfsi->nrequests);
+ nfs_release_request(req);
+ }
+}
+
+static void nfs_mark_request_dirty(struct nfs_page *req)
+{
+ struct folio *folio = nfs_page_to_folio(req);
+ if (folio)
+ filemap_dirty_folio(folio_mapping(folio), folio);
+}
+
+/*
+ * nfs_page_search_commits_for_head_request_locked
+ *
+ * Search through commit lists on @inode for the head request for @folio.
+ * Must be called while holding the inode (which is cinfo) lock.
+ *
+ * Returns the head request if found, or NULL if not found.
+ */
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+ struct folio *folio)
+{
+ struct nfs_page *freq, *t;
+ struct nfs_commit_info cinfo;
+ struct inode *inode = &nfsi->vfs_inode;
+
+ nfs_init_cinfo_from_inode(&cinfo, inode);
+
+ /* search through pnfs commit lists */
+ freq = pnfs_search_commit_reqs(inode, &cinfo, folio);
+ if (freq)
+ return freq->wb_head;
+
+ /* Linearly search the commit list for the correct request */
+ list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
+ if (nfs_page_to_folio(freq) == folio)
+ return freq->wb_head;
+ }
+
+ return NULL;
+}
+
+/**
+ * nfs_request_add_commit_list_locked - add request to a commit list
+ * @req: pointer to a struct nfs_page
+ * @dst: commit list head
+ * @cinfo: holds list lock and accounting info
+ *
+ * This sets the PG_CLEAN bit, updates the cinfo count of
+ * number of outstanding requests requiring a commit as well as
+ * the MM page stats.
+ *
+ * The caller must hold NFS_I(cinfo->inode)->commit_mutex, and the
+ * nfs_page lock.
+ */
+void
+nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
+ struct nfs_commit_info *cinfo)
+{
+ set_bit(PG_CLEAN, &req->wb_flags);
+ nfs_list_add_request(req, dst);
+ atomic_long_inc(&cinfo->mds->ncommit);
+}
+EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
+
+/**
+ * nfs_request_add_commit_list - add request to a commit list
+ * @req: pointer to a struct nfs_page
+ * @cinfo: holds list lock and accounting info
+ *
+ * This sets the PG_CLEAN bit, updates the cinfo count of
+ * number of outstanding requests requiring a commit as well as
+ * the MM page stats.
+ *
+ * The caller must _not_ hold the cinfo->lock, but must be
+ * holding the nfs_page lock.
+ */
+void
+nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo)
+{
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+ nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo);
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+ nfs_folio_mark_unstable(nfs_page_to_folio(req), cinfo);
+}
+EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
+
+/**
+ * nfs_request_remove_commit_list - Remove request from a commit list
+ * @req: pointer to a nfs_page
+ * @cinfo: holds list lock and accounting info
+ *
+ * This clears the PG_CLEAN bit, and updates the cinfo's count of
+ * number of outstanding requests requiring a commit
+ * It does not update the MM page stats.
+ *
+ * The caller _must_ hold the cinfo->lock and the nfs_page lock.
+ */
+void
+nfs_request_remove_commit_list(struct nfs_page *req,
+ struct nfs_commit_info *cinfo)
+{
+ if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags))
+ return;
+ nfs_list_remove_request(req);
+ atomic_long_dec(&cinfo->mds->ncommit);
+}
+EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list);
+
+static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
+ struct inode *inode)
+{
+ cinfo->inode = inode;
+ cinfo->mds = &NFS_I(inode)->commit_info;
+ cinfo->ds = pnfs_get_ds_info(inode);
+ cinfo->dreq = NULL;
+ cinfo->completion_ops = &nfs_commit_completion_ops;
+}
+
+void nfs_init_cinfo(struct nfs_commit_info *cinfo,
+ struct inode *inode,
+ struct nfs_direct_req *dreq)
+{
+ if (dreq)
+ nfs_init_cinfo_from_dreq(cinfo, dreq);
+ else
+ nfs_init_cinfo_from_inode(cinfo, inode);
+}
+EXPORT_SYMBOL_GPL(nfs_init_cinfo);
+
+/*
+ * Add a request to the inode's commit list.
+ */
+void
+nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo, u32 ds_commit_idx)
+{
+ if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx))
+ return;
+ nfs_request_add_commit_list(req, cinfo);
+}
+
+static void nfs_folio_clear_commit(struct folio *folio)
+{
+ if (folio) {
+ long nr = folio_nr_pages(folio);
+
+ node_stat_mod_folio(folio, NR_WRITEBACK, -nr);
+ wb_stat_mod(&inode_to_bdi(folio_file_mapping(folio)->host)->wb,
+ WB_WRITEBACK, -nr);
+ }
+}
+
+/* Called holding the request lock on @req */
+static void nfs_clear_request_commit(struct nfs_commit_info *cinfo,
+ struct nfs_page *req)
+{
+ if (test_bit(PG_CLEAN, &req->wb_flags)) {
+ struct nfs_open_context *ctx = nfs_req_openctx(req);
+ struct inode *inode = d_inode(ctx->dentry);
+
+ mutex_lock(&NFS_I(inode)->commit_mutex);
+ if (!pnfs_clear_request_commit(req, cinfo)) {
+ nfs_request_remove_commit_list(req, cinfo);
+ }
+ mutex_unlock(&NFS_I(inode)->commit_mutex);
+ nfs_folio_clear_commit(nfs_page_to_folio(req));
+ }
+}
+
+int nfs_write_need_commit(struct nfs_pgio_header *hdr)
+{
+ if (hdr->verf.committed == NFS_DATA_SYNC)
+ return hdr->lseg == NULL;
+ return hdr->verf.committed != NFS_FILE_SYNC;
+}
+
+static void nfs_async_write_init(struct nfs_pgio_header *hdr)
+{
+ nfs_io_completion_get(hdr->io_completion);
+}
+
+static void nfs_write_completion(struct nfs_pgio_header *hdr)
+{
+ struct nfs_commit_info cinfo;
+ unsigned long bytes = 0;
+
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
+ goto out;
+ nfs_init_cinfo_from_inode(&cinfo, hdr->inode);
+ while (!list_empty(&hdr->pages)) {
+ struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+
+ bytes += req->wb_bytes;
+ nfs_list_remove_request(req);
+ if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
+ (hdr->good_bytes < bytes)) {
+ trace_nfs_comp_error(hdr->inode, req, hdr->error);
+ nfs_mapping_set_error(nfs_page_to_folio(req),
+ hdr->error);
+ goto remove_req;
+ }
+ if (nfs_write_need_commit(hdr)) {
+ /* Reset wb_nio, since the write was successful. */
+ req->wb_nio = 0;
+ memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
+ nfs_mark_request_commit(req, hdr->lseg, &cinfo,
+ hdr->pgio_mirror_idx);
+ goto next;
+ }
+remove_req:
+ nfs_inode_remove_request(req);
+next:
+ nfs_page_end_writeback(req);
+ nfs_release_request(req);
+ }
+out:
+ nfs_io_completion_put(hdr->io_completion);
+ hdr->release(hdr);
+}
+
+unsigned long
+nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
+{
+ return atomic_long_read(&cinfo->mds->ncommit);
+}
+
+/* NFS_I(cinfo->inode)->commit_mutex held by caller */
+int
+nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
+ struct nfs_commit_info *cinfo, int max)
+{
+ struct nfs_page *req, *tmp;
+ int ret = 0;
+
+ list_for_each_entry_safe(req, tmp, src, wb_list) {
+ kref_get(&req->wb_kref);
+ if (!nfs_lock_request(req)) {
+ nfs_release_request(req);
+ continue;
+ }
+ nfs_request_remove_commit_list(req, cinfo);
+ clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+ nfs_list_add_request(req, dst);
+ ret++;
+ if ((ret == max) && !cinfo->dreq)
+ break;
+ cond_resched();
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_scan_commit_list);
+
+/*
+ * nfs_scan_commit - Scan an inode for commit requests
+ * @inode: NFS inode to scan
+ * @dst: mds destination list
+ * @cinfo: mds and ds lists of reqs ready to commit
+ *
+ * Moves requests from the inode's 'commit' request list.
+ * The requests are *not* checked to ensure that they form a contiguous set.
+ */
+int
+nfs_scan_commit(struct inode *inode, struct list_head *dst,
+ struct nfs_commit_info *cinfo)
+{
+ int ret = 0;
+
+ if (!atomic_long_read(&cinfo->mds->ncommit))
+ return 0;
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+ if (atomic_long_read(&cinfo->mds->ncommit) > 0) {
+ const int max = INT_MAX;
+
+ ret = nfs_scan_commit_list(&cinfo->mds->list, dst,
+ cinfo, max);
+ ret += pnfs_scan_commit_lists(inode, cinfo, max - ret);
+ }
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+ return ret;
+}
+
+/*
+ * Search for an existing write request, and attempt to update
+ * it to reflect a new dirty region on a given page.
+ *
+ * If the attempt fails, then the existing request is flushed out
+ * to disk.
+ */
+static struct nfs_page *nfs_try_to_update_request(struct folio *folio,
+ unsigned int offset,
+ unsigned int bytes)
+{
+ struct nfs_page *req;
+ unsigned int rqend;
+ unsigned int end;
+ int error;
+
+ end = offset + bytes;
+
+ req = nfs_lock_and_join_requests(folio);
+ if (IS_ERR_OR_NULL(req))
+ return req;
+
+ rqend = req->wb_offset + req->wb_bytes;
+ /*
+ * Tell the caller to flush out the request if
+ * the offsets are non-contiguous.
+ * Note: nfs_flush_incompatible() will already
+ * have flushed out requests having wrong owners.
+ */
+ if (offset > rqend || end < req->wb_offset)
+ goto out_flushme;
+
+ /* Okay, the request matches. Update the region */
+ if (offset < req->wb_offset) {
+ req->wb_offset = offset;
+ req->wb_pgbase = offset;
+ }
+ if (end > rqend)
+ req->wb_bytes = end - req->wb_offset;
+ else
+ req->wb_bytes = rqend - req->wb_offset;
+ req->wb_nio = 0;
+ return req;
+out_flushme:
+ /*
+ * Note: we mark the request dirty here because
+ * nfs_lock_and_join_requests() cannot preserve
+ * commit flags, so we have to replay the write.
+ */
+ nfs_mark_request_dirty(req);
+ nfs_unlock_and_release_request(req);
+ error = nfs_wb_folio(folio_file_mapping(folio)->host, folio);
+ return (error < 0) ? ERR_PTR(error) : NULL;
+}
+
+/*
+ * Try to update an existing write request, or create one if there is none.
+ *
+ * Note: Should always be called with the Page Lock held to prevent races
+ * if we have to add a new request. Also assumes that the caller has
+ * already called nfs_flush_incompatible() if necessary.
+ */
+static struct nfs_page *nfs_setup_write_request(struct nfs_open_context *ctx,
+ struct folio *folio,
+ unsigned int offset,
+ unsigned int bytes)
+{
+ struct nfs_page *req;
+
+ req = nfs_try_to_update_request(folio, offset, bytes);
+ if (req != NULL)
+ goto out;
+ req = nfs_page_create_from_folio(ctx, folio, offset, bytes);
+ if (IS_ERR(req))
+ goto out;
+ nfs_inode_add_request(req);
+out:
+ return req;
+}
+
+static int nfs_writepage_setup(struct nfs_open_context *ctx,
+ struct folio *folio, unsigned int offset,
+ unsigned int count)
+{
+ struct nfs_page *req;
+
+ req = nfs_setup_write_request(ctx, folio, offset, count);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ /* Update file length */
+ nfs_grow_file(folio, offset, count);
+ nfs_mark_uptodate(req);
+ nfs_mark_request_dirty(req);
+ nfs_unlock_and_release_request(req);
+ return 0;
+}
+
+int nfs_flush_incompatible(struct file *file, struct folio *folio)
+{
+ struct nfs_open_context *ctx = nfs_file_open_context(file);
+ struct nfs_lock_context *l_ctx;
+ struct file_lock_context *flctx = locks_inode_context(file_inode(file));
+ struct nfs_page *req;
+ int do_flush, status;
+ /*
+ * Look for a request corresponding to this page. If there
+ * is one, and it belongs to another file, we flush it out
+ * before we try to copy anything into the page. Do this
+ * due to the lack of an ACCESS-type call in NFSv2.
+ * Also do the same if we find a request from an existing
+ * dropped page.
+ */
+ do {
+ req = nfs_folio_find_head_request(folio);
+ if (req == NULL)
+ return 0;
+ l_ctx = req->wb_lock_context;
+ do_flush = nfs_page_to_folio(req) != folio ||
+ !nfs_match_open_context(nfs_req_openctx(req), ctx);
+ if (l_ctx && flctx &&
+ !(list_empty_careful(&flctx->flc_posix) &&
+ list_empty_careful(&flctx->flc_flock))) {
+ do_flush |= l_ctx->lockowner != current->files;
+ }
+ nfs_release_request(req);
+ if (!do_flush)
+ return 0;
+ status = nfs_wb_folio(folio_file_mapping(folio)->host, folio);
+ } while (status == 0);
+ return status;
+}
+
+/*
+ * Avoid buffered writes when a open context credential's key would
+ * expire soon.
+ *
+ * Returns -EACCES if the key will expire within RPC_KEY_EXPIRE_FAIL.
+ *
+ * Return 0 and set a credential flag which triggers the inode to flush
+ * and performs NFS_FILE_SYNC writes if the key will expired within
+ * RPC_KEY_EXPIRE_TIMEO.
+ */
+int
+nfs_key_timeout_notify(struct file *filp, struct inode *inode)
+{
+ struct nfs_open_context *ctx = nfs_file_open_context(filp);
+
+ if (nfs_ctx_key_to_expire(ctx, inode) &&
+ !rcu_access_pointer(ctx->ll_cred))
+ /* Already expired! */
+ return -EACCES;
+ return 0;
+}
+
+/*
+ * Test if the open context credential key is marked to expire soon.
+ */
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode)
+{
+ struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
+ struct rpc_cred *cred, *new, *old = NULL;
+ struct auth_cred acred = {
+ .cred = ctx->cred,
+ };
+ bool ret = false;
+
+ rcu_read_lock();
+ cred = rcu_dereference(ctx->ll_cred);
+ if (cred && !(cred->cr_ops->crkey_timeout &&
+ cred->cr_ops->crkey_timeout(cred)))
+ goto out;
+ rcu_read_unlock();
+
+ new = auth->au_ops->lookup_cred(auth, &acred, 0);
+ if (new == cred) {
+ put_rpccred(new);
+ return true;
+ }
+ if (IS_ERR_OR_NULL(new)) {
+ new = NULL;
+ ret = true;
+ } else if (new->cr_ops->crkey_timeout &&
+ new->cr_ops->crkey_timeout(new))
+ ret = true;
+
+ rcu_read_lock();
+ old = rcu_dereference_protected(xchg(&ctx->ll_cred,
+ RCU_INITIALIZER(new)), 1);
+out:
+ rcu_read_unlock();
+ put_rpccred(old);
+ return ret;
+}
+
+/*
+ * If the page cache is marked as unsafe or invalid, then we can't rely on
+ * the PageUptodate() flag. In this case, we will need to turn off
+ * write optimisations that depend on the page contents being correct.
+ */
+static bool nfs_folio_write_uptodate(struct folio *folio, unsigned int pagelen)
+{
+ struct inode *inode = folio_file_mapping(folio)->host;
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ if (nfs_have_delegated_attributes(inode))
+ goto out;
+ if (nfsi->cache_validity &
+ (NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE))
+ return false;
+ smp_rmb();
+ if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags) && pagelen != 0)
+ return false;
+out:
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA && pagelen != 0)
+ return false;
+ return folio_test_uptodate(folio) != 0;
+}
+
+static bool
+is_whole_file_wrlock(struct file_lock *fl)
+{
+ return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX &&
+ fl->fl_type == F_WRLCK;
+}
+
+/* If we know the page is up to date, and we're not using byte range locks (or
+ * if we have the whole file locked for writing), it may be more efficient to
+ * extend the write to cover the entire page in order to avoid fragmentation
+ * inefficiencies.
+ *
+ * If the file is opened for synchronous writes then we can just skip the rest
+ * of the checks.
+ */
+static int nfs_can_extend_write(struct file *file, struct folio *folio,
+ unsigned int pagelen)
+{
+ struct inode *inode = file_inode(file);
+ struct file_lock_context *flctx = locks_inode_context(inode);
+ struct file_lock *fl;
+ int ret;
+
+ if (file->f_flags & O_DSYNC)
+ return 0;
+ if (!nfs_folio_write_uptodate(folio, pagelen))
+ return 0;
+ if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
+ return 1;
+ if (!flctx || (list_empty_careful(&flctx->flc_flock) &&
+ list_empty_careful(&flctx->flc_posix)))
+ return 1;
+
+ /* Check to see if there are whole file write locks */
+ ret = 0;
+ spin_lock(&flctx->flc_lock);
+ if (!list_empty(&flctx->flc_posix)) {
+ fl = list_first_entry(&flctx->flc_posix, struct file_lock,
+ fl_list);
+ if (is_whole_file_wrlock(fl))
+ ret = 1;
+ } else if (!list_empty(&flctx->flc_flock)) {
+ fl = list_first_entry(&flctx->flc_flock, struct file_lock,
+ fl_list);
+ if (fl->fl_type == F_WRLCK)
+ ret = 1;
+ }
+ spin_unlock(&flctx->flc_lock);
+ return ret;
+}
+
+/*
+ * Update and possibly write a cached page of an NFS file.
+ *
+ * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad
+ * things with a page scheduled for an RPC call (e.g. invalidate it).
+ */
+int nfs_update_folio(struct file *file, struct folio *folio,
+ unsigned int offset, unsigned int count)
+{
+ struct nfs_open_context *ctx = nfs_file_open_context(file);
+ struct address_space *mapping = folio_file_mapping(folio);
+ struct inode *inode = mapping->host;
+ unsigned int pagelen = nfs_folio_length(folio);
+ int status = 0;
+
+ nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
+
+ dprintk("NFS: nfs_update_folio(%pD2 %d@%lld)\n", file, count,
+ (long long)(folio_file_pos(folio) + offset));
+
+ if (!count)
+ goto out;
+
+ if (nfs_can_extend_write(file, folio, pagelen)) {
+ count = max(count + offset, pagelen);
+ offset = 0;
+ }
+
+ status = nfs_writepage_setup(ctx, folio, offset, count);
+ if (status < 0)
+ nfs_set_pageerror(mapping);
+out:
+ dprintk("NFS: nfs_update_folio returns %d (isize %lld)\n",
+ status, (long long)i_size_read(inode));
+ return status;
+}
+
+static int flush_task_priority(int how)
+{
+ switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) {
+ case FLUSH_HIGHPRI:
+ return RPC_PRIORITY_HIGH;
+ case FLUSH_LOWPRI:
+ return RPC_PRIORITY_LOW;
+ }
+ return RPC_PRIORITY_NORMAL;
+}
+
+static void nfs_initiate_write(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg,
+ const struct nfs_rpc_ops *rpc_ops,
+ struct rpc_task_setup *task_setup_data, int how)
+{
+ int priority = flush_task_priority(how);
+
+ if (IS_SWAPFILE(hdr->inode))
+ task_setup_data->flags |= RPC_TASK_SWAPPER;
+ task_setup_data->priority = priority;
+ rpc_ops->write_setup(hdr, msg, &task_setup_data->rpc_client);
+ trace_nfs_initiate_write(hdr);
+}
+
+/* If a nfs_flush_* function fails, it should remove reqs from @head and
+ * call this on each, which will prepare them to be retried on next
+ * writeback using standard nfs.
+ */
+static void nfs_redirty_request(struct nfs_page *req)
+{
+ struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req));
+
+ /* Bump the transmission count */
+ req->wb_nio++;
+ nfs_mark_request_dirty(req);
+ atomic_long_inc(&nfsi->redirtied_pages);
+ nfs_page_end_writeback(req);
+ nfs_release_request(req);
+}
+
+static void nfs_async_write_error(struct list_head *head, int error)
+{
+ struct nfs_page *req;
+
+ while (!list_empty(head)) {
+ req = nfs_list_entry(head->next);
+ nfs_list_remove_request(req);
+ if (nfs_error_is_fatal_on_server(error))
+ nfs_write_error(req, error);
+ else
+ nfs_redirty_request(req);
+ }
+}
+
+static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr)
+{
+ nfs_async_write_error(&hdr->pages, 0);
+}
+
+static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
+ .init_hdr = nfs_async_write_init,
+ .error_cleanup = nfs_async_write_error,
+ .completion = nfs_write_completion,
+ .reschedule_io = nfs_async_write_reschedule_io,
+};
+
+void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode, int ioflags, bool force_mds,
+ const struct nfs_pgio_completion_ops *compl_ops)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops;
+
+#ifdef CONFIG_NFS_V4_1
+ if (server->pnfs_curr_ld && !force_mds)
+ pg_ops = server->pnfs_curr_ld->pg_write_ops;
+#endif
+ nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops,
+ server->wsize, ioflags);
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_init_write);
+
+void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
+{
+ struct nfs_pgio_mirror *mirror;
+
+ if (pgio->pg_ops && pgio->pg_ops->pg_cleanup)
+ pgio->pg_ops->pg_cleanup(pgio);
+
+ pgio->pg_ops = &nfs_pgio_rw_ops;
+
+ nfs_pageio_stop_mirroring(pgio);
+
+ mirror = &pgio->pg_mirrors[0];
+ mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
+
+
+void nfs_commit_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs_commit_data *data = calldata;
+
+ NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
+}
+
+static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr,
+ struct nfs_fattr *fattr)
+{
+ struct nfs_pgio_args *argp = &hdr->args;
+ struct nfs_pgio_res *resp = &hdr->res;
+ u64 size = argp->offset + resp->count;
+
+ if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
+ fattr->size = size;
+ if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) {
+ fattr->valid &= ~NFS_ATTR_FATTR_SIZE;
+ return;
+ }
+ if (size != fattr->size)
+ return;
+ /* Set attribute barrier */
+ nfs_fattr_set_barrier(fattr);
+ /* ...and update size */
+ fattr->valid |= NFS_ATTR_FATTR_SIZE;
+}
+
+void nfs_writeback_update_inode(struct nfs_pgio_header *hdr)
+{
+ struct nfs_fattr *fattr = &hdr->fattr;
+ struct inode *inode = hdr->inode;
+
+ spin_lock(&inode->i_lock);
+ nfs_writeback_check_extend(hdr, fattr);
+ nfs_post_op_update_inode_force_wcc_locked(inode, fattr);
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_writeback_update_inode);
+
+/*
+ * This function is called when the WRITE call is complete.
+ */
+static int nfs_writeback_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr,
+ struct inode *inode)
+{
+ int status;
+
+ /*
+ * ->write_done will attempt to use post-op attributes to detect
+ * conflicting writes by other clients. A strict interpretation
+ * of close-to-open would allow us to continue caching even if
+ * another writer had changed the file, but some applications
+ * depend on tighter cache coherency when writing.
+ */
+ status = NFS_PROTO(inode)->write_done(task, hdr);
+ if (status != 0)
+ return status;
+
+ nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
+ trace_nfs_writeback_done(task, hdr);
+
+ if (task->tk_status >= 0) {
+ enum nfs3_stable_how committed = hdr->res.verf->committed;
+
+ if (committed == NFS_UNSTABLE) {
+ /*
+ * We have some uncommitted data on the server at
+ * this point, so ensure that we keep track of that
+ * fact irrespective of what later writes do.
+ */
+ set_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags);
+ }
+
+ if (committed < hdr->args.stable) {
+ /* We tried a write call, but the server did not
+ * commit data to stable storage even though we
+ * requested it.
+ * Note: There is a known bug in Tru64 < 5.0 in which
+ * the server reports NFS_DATA_SYNC, but performs
+ * NFS_FILE_SYNC. We therefore implement this checking
+ * as a dprintk() in order to avoid filling syslog.
+ */
+ static unsigned long complain;
+
+ /* Note this will print the MDS for a DS write */
+ if (time_before(complain, jiffies)) {
+ dprintk("NFS: faulty NFS server %s:"
+ " (committed = %d) != (stable = %d)\n",
+ NFS_SERVER(inode)->nfs_client->cl_hostname,
+ committed, hdr->args.stable);
+ complain = jiffies + 300 * HZ;
+ }
+ }
+ }
+
+ /* Deal with the suid/sgid bit corner case */
+ if (nfs_should_remove_suid(inode)) {
+ spin_lock(&inode->i_lock);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
+ spin_unlock(&inode->i_lock);
+ }
+ return 0;
+}
+
+/*
+ * This function is called when the WRITE call is complete.
+ */
+static void nfs_writeback_result(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ struct nfs_pgio_args *argp = &hdr->args;
+ struct nfs_pgio_res *resp = &hdr->res;
+
+ if (resp->count < argp->count) {
+ static unsigned long complain;
+
+ /* This a short write! */
+ nfs_inc_stats(hdr->inode, NFSIOS_SHORTWRITE);
+
+ /* Has the server at least made some progress? */
+ if (resp->count == 0) {
+ if (time_before(complain, jiffies)) {
+ printk(KERN_WARNING
+ "NFS: Server wrote zero bytes, expected %u.\n",
+ argp->count);
+ complain = jiffies + 300 * HZ;
+ }
+ nfs_set_pgio_error(hdr, -EIO, argp->offset);
+ task->tk_status = -EIO;
+ return;
+ }
+
+ /* For non rpc-based layout drivers, retry-through-MDS */
+ if (!task->tk_ops) {
+ hdr->pnfs_error = -EAGAIN;
+ return;
+ }
+
+ /* Was this an NFSv2 write or an NFSv3 stable write? */
+ if (resp->verf->committed != NFS_UNSTABLE) {
+ /* Resend from where the server left off */
+ hdr->mds_offset += resp->count;
+ argp->offset += resp->count;
+ argp->pgbase += resp->count;
+ argp->count -= resp->count;
+ } else {
+ /* Resend as a stable write in order to avoid
+ * headaches in the case of a server crash.
+ */
+ argp->stable = NFS_FILE_SYNC;
+ }
+ resp->count = 0;
+ resp->verf->committed = 0;
+ rpc_restart_call_prepare(task);
+ }
+}
+
+static int wait_on_commit(struct nfs_mds_commit_info *cinfo)
+{
+ return wait_var_event_killable(&cinfo->rpcs_out,
+ !atomic_read(&cinfo->rpcs_out));
+}
+
+static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
+{
+ atomic_inc(&cinfo->rpcs_out);
+}
+
+bool nfs_commit_end(struct nfs_mds_commit_info *cinfo)
+{
+ if (atomic_dec_and_test(&cinfo->rpcs_out)) {
+ wake_up_var(&cinfo->rpcs_out);
+ return true;
+ }
+ return false;
+}
+
+void nfs_commitdata_release(struct nfs_commit_data *data)
+{
+ put_nfs_open_context(data->context);
+ nfs_commit_free(data);
+}
+EXPORT_SYMBOL_GPL(nfs_commitdata_release);
+
+int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
+ const struct nfs_rpc_ops *nfs_ops,
+ const struct rpc_call_ops *call_ops,
+ int how, int flags)
+{
+ struct rpc_task *task;
+ int priority = flush_task_priority(how);
+ struct rpc_message msg = {
+ .rpc_argp = &data->args,
+ .rpc_resp = &data->res,
+ .rpc_cred = data->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .task = &data->task,
+ .rpc_client = clnt,
+ .rpc_message = &msg,
+ .callback_ops = call_ops,
+ .callback_data = data,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC | flags,
+ .priority = priority,
+ };
+
+ if (nfs_server_capable(data->inode, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
+ /* Set up the initial task struct. */
+ nfs_ops->commit_setup(data, &msg, &task_setup_data.rpc_client);
+ trace_nfs_initiate_commit(data);
+
+ dprintk("NFS: initiated commit call\n");
+
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ if (how & FLUSH_SYNC)
+ rpc_wait_for_completion_task(task);
+ rpc_put_task(task);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_initiate_commit);
+
+static loff_t nfs_get_lwb(struct list_head *head)
+{
+ loff_t lwb = 0;
+ struct nfs_page *req;
+
+ list_for_each_entry(req, head, wb_list)
+ if (lwb < (req_offset(req) + req->wb_bytes))
+ lwb = req_offset(req) + req->wb_bytes;
+
+ return lwb;
+}
+
+/*
+ * Set up the argument/result storage required for the RPC call.
+ */
+void nfs_init_commit(struct nfs_commit_data *data,
+ struct list_head *head,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo)
+{
+ struct nfs_page *first;
+ struct nfs_open_context *ctx;
+ struct inode *inode;
+
+ /* Set up the RPC argument and reply structs
+ * NB: take care not to mess about with data->commit et al. */
+
+ if (head)
+ list_splice_init(head, &data->pages);
+
+ first = nfs_list_entry(data->pages.next);
+ ctx = nfs_req_openctx(first);
+ inode = d_inode(ctx->dentry);
+
+ data->inode = inode;
+ data->cred = ctx->cred;
+ data->lseg = lseg; /* reference transferred */
+ /* only set lwb for pnfs commit */
+ if (lseg)
+ data->lwb = nfs_get_lwb(&data->pages);
+ data->mds_ops = &nfs_commit_ops;
+ data->completion_ops = cinfo->completion_ops;
+ data->dreq = cinfo->dreq;
+
+ data->args.fh = NFS_FH(data->inode);
+ /* Note: we always request a commit of the entire inode */
+ data->args.offset = 0;
+ data->args.count = 0;
+ data->context = get_nfs_open_context(ctx);
+ data->res.fattr = &data->fattr;
+ data->res.verf = &data->verf;
+ nfs_fattr_init(&data->fattr);
+ nfs_commit_begin(cinfo->mds);
+}
+EXPORT_SYMBOL_GPL(nfs_init_commit);
+
+void nfs_retry_commit(struct list_head *page_list,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx)
+{
+ struct nfs_page *req;
+
+ while (!list_empty(page_list)) {
+ req = nfs_list_entry(page_list->next);
+ nfs_list_remove_request(req);
+ nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx);
+ nfs_folio_clear_commit(nfs_page_to_folio(req));
+ nfs_unlock_and_release_request(req);
+ }
+}
+EXPORT_SYMBOL_GPL(nfs_retry_commit);
+
+static void nfs_commit_resched_write(struct nfs_commit_info *cinfo,
+ struct nfs_page *req)
+{
+ struct folio *folio = nfs_page_to_folio(req);
+
+ filemap_dirty_folio(folio_mapping(folio), folio);
+}
+
+/*
+ * Commit dirty pages
+ */
+static int
+nfs_commit_list(struct inode *inode, struct list_head *head, int how,
+ struct nfs_commit_info *cinfo)
+{
+ struct nfs_commit_data *data;
+ unsigned short task_flags = 0;
+
+ /* another commit raced with us */
+ if (list_empty(head))
+ return 0;
+
+ data = nfs_commitdata_alloc();
+ if (!data) {
+ nfs_retry_commit(head, NULL, cinfo, -1);
+ return -ENOMEM;
+ }
+
+ /* Set up the argument struct */
+ nfs_init_commit(data, head, NULL, cinfo);
+ if (NFS_SERVER(inode)->nfs_client->cl_minorversion)
+ task_flags = RPC_TASK_MOVEABLE;
+ return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
+ data->mds_ops, how,
+ RPC_TASK_CRED_NOREF | task_flags);
+}
+
+/*
+ * COMMIT call returned
+ */
+static void nfs_commit_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs_commit_data *data = calldata;
+
+ /* Call the NFS version-specific code */
+ NFS_PROTO(data->inode)->commit_done(task, data);
+ trace_nfs_commit_done(task, data);
+}
+
+static void nfs_commit_release_pages(struct nfs_commit_data *data)
+{
+ const struct nfs_writeverf *verf = data->res.verf;
+ struct nfs_page *req;
+ int status = data->task.tk_status;
+ struct nfs_commit_info cinfo;
+ struct nfs_server *nfss;
+ struct folio *folio;
+
+ while (!list_empty(&data->pages)) {
+ req = nfs_list_entry(data->pages.next);
+ nfs_list_remove_request(req);
+ folio = nfs_page_to_folio(req);
+ nfs_folio_clear_commit(folio);
+
+ dprintk("NFS: commit (%s/%llu %d@%lld)",
+ nfs_req_openctx(req)->dentry->d_sb->s_id,
+ (unsigned long long)NFS_FILEID(d_inode(nfs_req_openctx(req)->dentry)),
+ req->wb_bytes,
+ (long long)req_offset(req));
+ if (status < 0) {
+ if (folio) {
+ trace_nfs_commit_error(data->inode, req,
+ status);
+ nfs_mapping_set_error(folio, status);
+ nfs_inode_remove_request(req);
+ }
+ dprintk_cont(", error = %d\n", status);
+ goto next;
+ }
+
+ /* Okay, COMMIT succeeded, apparently. Check the verifier
+ * returned by the server against all stored verfs. */
+ if (nfs_write_match_verf(verf, req)) {
+ /* We have a match */
+ if (folio)
+ nfs_inode_remove_request(req);
+ dprintk_cont(" OK\n");
+ goto next;
+ }
+ /* We have a mismatch. Write the page again */
+ dprintk_cont(" mismatch\n");
+ nfs_mark_request_dirty(req);
+ atomic_long_inc(&NFS_I(data->inode)->redirtied_pages);
+ next:
+ nfs_unlock_and_release_request(req);
+ /* Latency breaker */
+ cond_resched();
+ }
+ nfss = NFS_SERVER(data->inode);
+ if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
+ nfss->write_congested = 0;
+
+ nfs_init_cinfo(&cinfo, data->inode, data->dreq);
+ nfs_commit_end(cinfo.mds);
+}
+
+static void nfs_commit_release(void *calldata)
+{
+ struct nfs_commit_data *data = calldata;
+
+ data->completion_ops->completion(data);
+ nfs_commitdata_release(calldata);
+}
+
+static const struct rpc_call_ops nfs_commit_ops = {
+ .rpc_call_prepare = nfs_commit_prepare,
+ .rpc_call_done = nfs_commit_done,
+ .rpc_release = nfs_commit_release,
+};
+
+static const struct nfs_commit_completion_ops nfs_commit_completion_ops = {
+ .completion = nfs_commit_release_pages,
+ .resched_write = nfs_commit_resched_write,
+};
+
+int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
+ int how, struct nfs_commit_info *cinfo)
+{
+ int status;
+
+ status = pnfs_commit_list(inode, head, how, cinfo);
+ if (status == PNFS_NOT_ATTEMPTED)
+ status = nfs_commit_list(inode, head, how, cinfo);
+ return status;
+}
+
+static int __nfs_commit_inode(struct inode *inode, int how,
+ struct writeback_control *wbc)
+{
+ LIST_HEAD(head);
+ struct nfs_commit_info cinfo;
+ int may_wait = how & FLUSH_SYNC;
+ int ret, nscan;
+
+ how &= ~FLUSH_SYNC;
+ nfs_init_cinfo_from_inode(&cinfo, inode);
+ nfs_commit_begin(cinfo.mds);
+ for (;;) {
+ ret = nscan = nfs_scan_commit(inode, &head, &cinfo);
+ if (ret <= 0)
+ break;
+ ret = nfs_generic_commit_list(inode, &head, how, &cinfo);
+ if (ret < 0)
+ break;
+ ret = 0;
+ if (wbc && wbc->sync_mode == WB_SYNC_NONE) {
+ if (nscan < wbc->nr_to_write)
+ wbc->nr_to_write -= nscan;
+ else
+ wbc->nr_to_write = 0;
+ }
+ if (nscan < INT_MAX)
+ break;
+ cond_resched();
+ }
+ nfs_commit_end(cinfo.mds);
+ if (ret || !may_wait)
+ return ret;
+ return wait_on_commit(cinfo.mds);
+}
+
+int nfs_commit_inode(struct inode *inode, int how)
+{
+ return __nfs_commit_inode(inode, how, NULL);
+}
+EXPORT_SYMBOL_GPL(nfs_commit_inode);
+
+int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ int flags = FLUSH_SYNC;
+ int ret = 0;
+
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ /* no commits means nothing needs to be done */
+ if (!atomic_long_read(&nfsi->commit_info.ncommit))
+ goto check_requests_outstanding;
+
+ /* Don't commit yet if this is a non-blocking flush and there
+ * are a lot of outstanding writes for this mapping.
+ */
+ if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))
+ goto out_mark_dirty;
+
+ /* don't wait for the COMMIT response */
+ flags = 0;
+ }
+
+ ret = __nfs_commit_inode(inode, flags, wbc);
+ if (!ret) {
+ if (flags & FLUSH_SYNC)
+ return 0;
+ } else if (atomic_long_read(&nfsi->commit_info.ncommit))
+ goto out_mark_dirty;
+
+check_requests_outstanding:
+ if (!atomic_read(&nfsi->commit_info.rpcs_out))
+ return ret;
+out_mark_dirty:
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_write_inode);
+
+/*
+ * Wrapper for filemap_write_and_wait_range()
+ *
+ * Needed for pNFS in order to ensure data becomes visible to the
+ * client.
+ */
+int nfs_filemap_write_and_wait_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend)
+{
+ int ret;
+
+ ret = filemap_write_and_wait_range(mapping, lstart, lend);
+ if (ret == 0)
+ ret = pnfs_sync_inode(mapping->host, true);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_filemap_write_and_wait_range);
+
+/*
+ * flush the inode to disk.
+ */
+int nfs_wb_all(struct inode *inode)
+{
+ int ret;
+
+ trace_nfs_writeback_inode_enter(inode);
+
+ ret = filemap_write_and_wait(inode->i_mapping);
+ if (ret)
+ goto out;
+ ret = nfs_commit_inode(inode, FLUSH_SYNC);
+ if (ret < 0)
+ goto out;
+ pnfs_sync_inode(inode, true);
+ ret = 0;
+
+out:
+ trace_nfs_writeback_inode_exit(inode, ret);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_wb_all);
+
+int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio)
+{
+ struct nfs_page *req;
+ int ret = 0;
+
+ folio_wait_writeback(folio);
+
+ /* blocking call to cancel all requests and join to a single (head)
+ * request */
+ req = nfs_lock_and_join_requests(folio);
+
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ } else if (req) {
+ /* all requests from this folio have been cancelled by
+ * nfs_lock_and_join_requests, so just remove the head
+ * request from the inode / page_private pointer and
+ * release it */
+ nfs_inode_remove_request(req);
+ nfs_unlock_and_release_request(req);
+ }
+
+ return ret;
+}
+
+/**
+ * nfs_wb_folio - Write back all requests on one page
+ * @inode: pointer to page
+ * @folio: pointer to folio
+ *
+ * Assumes that the folio has been locked by the caller, and will
+ * not unlock it.
+ */
+int nfs_wb_folio(struct inode *inode, struct folio *folio)
+{
+ loff_t range_start = folio_file_pos(folio);
+ loff_t range_end = range_start + (loff_t)folio_size(folio) - 1;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = 0,
+ .range_start = range_start,
+ .range_end = range_end,
+ };
+ int ret;
+
+ trace_nfs_writeback_folio(inode, folio);
+
+ for (;;) {
+ folio_wait_writeback(folio);
+ if (folio_clear_dirty_for_io(folio)) {
+ ret = nfs_writepage_locked(folio, &wbc);
+ if (ret < 0)
+ goto out_error;
+ continue;
+ }
+ ret = 0;
+ if (!folio_test_private(folio))
+ break;
+ ret = nfs_commit_inode(inode, FLUSH_SYNC);
+ if (ret < 0)
+ goto out_error;
+ }
+out_error:
+ trace_nfs_writeback_folio_done(inode, folio, ret);
+ return ret;
+}
+
+#ifdef CONFIG_MIGRATION
+int nfs_migrate_folio(struct address_space *mapping, struct folio *dst,
+ struct folio *src, enum migrate_mode mode)
+{
+ /*
+ * If the private flag is set, the folio is currently associated with
+ * an in-progress read or write request. Don't try to migrate it.
+ *
+ * FIXME: we could do this in principle, but we'll need a way to ensure
+ * that we can safely release the inode reference while holding
+ * the folio lock.
+ */
+ if (folio_test_private(src))
+ return -EBUSY;
+
+ if (folio_test_fscache(src)) {
+ if (mode == MIGRATE_ASYNC)
+ return -EBUSY;
+ folio_wait_fscache(src);
+ }
+
+ return migrate_folio(mapping, dst, src, mode);
+}
+#endif
+
+int __init nfs_init_writepagecache(void)
+{
+ nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
+ sizeof(struct nfs_pgio_header),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (nfs_wdata_cachep == NULL)
+ return -ENOMEM;
+
+ nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE,
+ nfs_wdata_cachep);
+ if (nfs_wdata_mempool == NULL)
+ goto out_destroy_write_cache;
+
+ nfs_cdata_cachep = kmem_cache_create("nfs_commit_data",
+ sizeof(struct nfs_commit_data),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (nfs_cdata_cachep == NULL)
+ goto out_destroy_write_mempool;
+
+ nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
+ nfs_cdata_cachep);
+ if (nfs_commit_mempool == NULL)
+ goto out_destroy_commit_cache;
+
+ /*
+ * NFS congestion size, scale with available memory.
+ *
+ * 64MB: 8192k
+ * 128MB: 11585k
+ * 256MB: 16384k
+ * 512MB: 23170k
+ * 1GB: 32768k
+ * 2GB: 46340k
+ * 4GB: 65536k
+ * 8GB: 92681k
+ * 16GB: 131072k
+ *
+ * This allows larger machines to have larger/more transfers.
+ * Limit the default to 256M
+ */
+ nfs_congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10);
+ if (nfs_congestion_kb > 256*1024)
+ nfs_congestion_kb = 256*1024;
+
+ return 0;
+
+out_destroy_commit_cache:
+ kmem_cache_destroy(nfs_cdata_cachep);
+out_destroy_write_mempool:
+ mempool_destroy(nfs_wdata_mempool);
+out_destroy_write_cache:
+ kmem_cache_destroy(nfs_wdata_cachep);
+ return -ENOMEM;
+}
+
+void nfs_destroy_writepagecache(void)
+{
+ mempool_destroy(nfs_commit_mempool);
+ kmem_cache_destroy(nfs_cdata_cachep);
+ mempool_destroy(nfs_wdata_mempool);
+ kmem_cache_destroy(nfs_wdata_cachep);
+}
+
+static const struct nfs_rw_ops nfs_rw_write_ops = {
+ .rw_alloc_header = nfs_writehdr_alloc,
+ .rw_free_header = nfs_writehdr_free,
+ .rw_done = nfs_writeback_done,
+ .rw_result = nfs_writeback_result,
+ .rw_initiate = nfs_initiate_write,
+};