summaryrefslogtreecommitdiffstats
path: root/fs/nfs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 10:05:51 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 10:05:51 +0000
commit5d1646d90e1f2cceb9f0828f4b28318cd0ec7744 (patch)
treea94efe259b9009378be6d90eb30d2b019d95c194 /fs/nfs
parentInitial commit. (diff)
downloadlinux-5d1646d90e1f2cceb9f0828f4b28318cd0ec7744.tar.xz
linux-5d1646d90e1f2cceb9f0828f4b28318cd0ec7744.zip
Adding upstream version 5.10.209.upstream/5.10.209upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--fs/nfs/Kconfig216
-rw-r--r--fs/nfs/Makefile37
-rw-r--r--fs/nfs/blocklayout/Makefile7
-rw-r--r--fs/nfs/blocklayout/blocklayout.c1055
-rw-r--r--fs/nfs/blocklayout/blocklayout.h197
-rw-r--r--fs/nfs/blocklayout/dev.c548
-rw-r--r--fs/nfs/blocklayout/extent_tree.c647
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c288
-rw-r--r--fs/nfs/cache_lib.c158
-rw-r--r--fs/nfs/cache_lib.h32
-rw-r--r--fs/nfs/callback.c465
-rw-r--r--fs/nfs/callback.h223
-rw-r--r--fs/nfs/callback_proc.c731
-rw-r--r--fs/nfs/callback_xdr.c1091
-rw-r--r--fs/nfs/client.c1358
-rw-r--r--fs/nfs/delegation.c1455
-rw-r--r--fs/nfs/delegation.h92
-rw-r--r--fs/nfs/dir.c2820
-rw-r--r--fs/nfs/direct.c1034
-rw-r--r--fs/nfs/dns_resolve.c481
-rw-r--r--fs/nfs/dns_resolve.h37
-rw-r--r--fs/nfs/export.c174
-rw-r--r--fs/nfs/file.c874
-rw-r--r--fs/nfs/filelayout/Makefile6
-rw-r--r--fs/nfs/filelayout/filelayout.c1155
-rw-r--r--fs/nfs/filelayout/filelayout.h118
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c305
-rw-r--r--fs/nfs/flexfilelayout/Makefile6
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c2545
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h226
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c619
-rw-r--r--fs/nfs/fs_context.c1520
-rw-r--r--fs/nfs/fscache-index.c140
-rw-r--r--fs/nfs/fscache.c516
-rw-r--r--fs/nfs/fscache.h242
-rw-r--r--fs/nfs/getroot.c165
-rw-r--r--fs/nfs/inode.c2346
-rw-r--r--fs/nfs/internal.h852
-rw-r--r--fs/nfs/io.c148
-rw-r--r--fs/nfs/iostat.h77
-rw-r--r--fs/nfs/mount_clnt.c540
-rw-r--r--fs/nfs/namespace.c367
-rw-r--r--fs/nfs/netns.h44
-rw-r--r--fs/nfs/nfs.h30
-rw-r--r--fs/nfs/nfs2super.c32
-rw-r--r--fs/nfs/nfs2xdr.c1156
-rw-r--r--fs/nfs/nfs3_fs.h37
-rw-r--r--fs/nfs/nfs3acl.c344
-rw-r--r--fs/nfs/nfs3client.c119
-rw-r--r--fs/nfs/nfs3proc.c1044
-rw-r--r--fs/nfs/nfs3super.c36
-rw-r--r--fs/nfs/nfs3xdr.c2576
-rw-r--r--fs/nfs/nfs42.h64
-rw-r--r--fs/nfs/nfs42proc.c1373
-rw-r--r--fs/nfs/nfs42xattr.c1057
-rw-r--r--fs/nfs/nfs42xdr.c1592
-rw-r--r--fs/nfs/nfs4_fs.h671
-rw-r--r--fs/nfs/nfs4client.c1342
-rw-r--r--fs/nfs/nfs4file.c470
-rw-r--r--fs/nfs/nfs4getroot.c41
-rw-r--r--fs/nfs/nfs4idmap.c806
-rw-r--r--fs/nfs/nfs4idmap.h68
-rw-r--r--fs/nfs/nfs4namespace.c572
-rw-r--r--fs/nfs/nfs4proc.c10530
-rw-r--r--fs/nfs/nfs4renewd.c157
-rw-r--r--fs/nfs/nfs4session.c653
-rw-r--r--fs/nfs/nfs4session.h184
-rw-r--r--fs/nfs/nfs4state.c2768
-rw-r--r--fs/nfs/nfs4super.c314
-rw-r--r--fs/nfs/nfs4sysctl.c70
-rw-r--r--fs/nfs/nfs4trace.c31
-rw-r--r--fs/nfs/nfs4trace.h2311
-rw-r--r--fs/nfs/nfs4xdr.c7634
-rw-r--r--fs/nfs/nfsroot.c316
-rw-r--r--fs/nfs/nfstrace.c14
-rw-r--r--fs/nfs/nfstrace.h1444
-rw-r--r--fs/nfs/pagelist.c1470
-rw-r--r--fs/nfs/pnfs.c3360
-rw-r--r--fs/nfs/pnfs.h925
-rw-r--r--fs/nfs/pnfs_dev.c377
-rw-r--r--fs/nfs/pnfs_nfs.c1215
-rw-r--r--fs/nfs/proc.c763
-rw-r--r--fs/nfs/read.c486
-rw-r--r--fs/nfs/super.c1401
-rw-r--r--fs/nfs/symlink.c85
-rw-r--r--fs/nfs/sysctl.c65
-rw-r--r--fs/nfs/sysfs.c191
-rw-r--r--fs/nfs/sysfs.h25
-rw-r--r--fs/nfs/unlink.c521
-rw-r--r--fs/nfs/write.c2180
-rw-r--r--fs/nfs_common/Makefile10
-rw-r--r--fs/nfs_common/grace.c143
-rw-r--r--fs/nfs_common/nfs_ssc.c94
-rw-r--r--fs/nfs_common/nfsacl.c297
-rw-r--r--fs/nfsd/Kconfig159
-rw-r--r--fs/nfsd/Makefile24
-rw-r--r--fs/nfsd/acl.h51
-rw-r--r--fs/nfsd/auth.c94
-rw-r--r--fs/nfsd/auth.h17
-rw-r--r--fs/nfsd/blocklayout.c433
-rw-r--r--fs/nfsd/blocklayoutxdr.c235
-rw-r--r--fs/nfsd/blocklayoutxdr.h62
-rw-r--r--fs/nfsd/cache.h89
-rw-r--r--fs/nfsd/current_stateid.h41
-rw-r--r--fs/nfsd/export.c1329
-rw-r--r--fs/nfsd/export.h117
-rw-r--r--fs/nfsd/fault_inject.c142
-rw-r--r--fs/nfsd/filecache.c1092
-rw-r--r--fs/nfsd/filecache.h63
-rw-r--r--fs/nfsd/flexfilelayout.c135
-rw-r--r--fs/nfsd/flexfilelayoutxdr.c125
-rw-r--r--fs/nfsd/flexfilelayoutxdr.h50
-rw-r--r--fs/nfsd/idmap.h60
-rw-r--r--fs/nfsd/lockd.c78
-rw-r--r--fs/nfsd/netns.h187
-rw-r--r--fs/nfsd/nfs2acl.c438
-rw-r--r--fs/nfsd/nfs3acl.c286
-rw-r--r--fs/nfsd/nfs3proc.c935
-rw-r--r--fs/nfsd/nfs3xdr.c1171
-rw-r--r--fs/nfsd/nfs4acl.c884
-rw-r--r--fs/nfsd/nfs4callback.c1381
-rw-r--r--fs/nfsd/nfs4idmap.c686
-rw-r--r--fs/nfsd/nfs4layouts.c786
-rw-r--r--fs/nfsd/nfs4proc.c3329
-rw-r--r--fs/nfsd/nfs4recover.c2169
-rw-r--r--fs/nfsd/nfs4state.c7588
-rw-r--r--fs/nfsd/nfs4xdr.c5354
-rw-r--r--fs/nfsd/nfscache.c609
-rw-r--r--fs/nfsd/nfsctl.c1578
-rw-r--r--fs/nfsd/nfsd.h489
-rw-r--r--fs/nfsd/nfsfh.c713
-rw-r--r--fs/nfsd/nfsfh.h326
-rw-r--r--fs/nfsd/nfsproc.c856
-rw-r--r--fs/nfsd/nfssvc.c1135
-rw-r--r--fs/nfsd/nfsxdr.c616
-rw-r--r--fs/nfsd/pnfs.h100
-rw-r--r--fs/nfsd/state.h696
-rw-r--r--fs/nfsd/stats.c104
-rw-r--r--fs/nfsd/stats.h44
-rw-r--r--fs/nfsd/trace.c3
-rw-r--r--fs/nfsd/trace.h756
-rw-r--r--fs/nfsd/vfs.c2407
-rw-r--r--fs/nfsd/vfs.h169
-rw-r--r--fs/nfsd/xdr.h179
-rw-r--r--fs/nfsd/xdr3.h324
-rw-r--r--fs/nfsd/xdr4.h902
-rw-r--r--fs/nfsd/xdr4cb.h50
147 files changed, 121067 insertions, 0 deletions
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
new file mode 100644
index 000000000..14a72224b
--- /dev/null
+++ b/fs/nfs/Kconfig
@@ -0,0 +1,216 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config NFS_FS
+ tristate "NFS client support"
+ depends on INET && FILE_LOCKING && MULTIUSER
+ select LOCKD
+ select SUNRPC
+ select NFS_ACL_SUPPORT if NFS_V3_ACL
+ help
+ Choose Y here if you want to access files residing on other
+ computers using Sun's Network File System protocol. To compile
+ this file system support as a module, choose M here: the module
+ will be called nfs.
+
+ To mount file systems exported by NFS servers, you also need to
+ install the user space mount.nfs command which can be found in
+ the Linux nfs-utils package, available from http://linux-nfs.org/.
+ Information about using the mount command is available in the
+ mount(8) man page. More detail about the Linux NFS client
+ implementation is available via the nfs(5) man page.
+
+ Below you can choose which versions of the NFS protocol are
+ available in the kernel to mount NFS servers. Support for NFS
+ version 2 (RFC 1094) is always available when NFS_FS is selected.
+
+ To configure a system which mounts its root file system via NFS
+ at boot time, say Y here, select "Kernel level IP
+ autoconfiguration" in the NETWORK menu, and select "Root file
+ system on NFS" below. You cannot compile this file system as a
+ module in this case.
+
+ If unsure, say N.
+
+config NFS_V2
+ tristate "NFS client support for NFS version 2"
+ depends on NFS_FS
+ default y
+ help
+ This option enables support for version 2 of the NFS protocol
+ (RFC 1094) in the kernel's NFS client.
+
+ If unsure, say Y.
+
+config NFS_V3
+ tristate "NFS client support for NFS version 3"
+ depends on NFS_FS
+ default y
+ help
+ This option enables support for version 3 of the NFS protocol
+ (RFC 1813) in the kernel's NFS client.
+
+ If unsure, say Y.
+
+config NFS_V3_ACL
+ bool "NFS client support for the NFSv3 ACL protocol extension"
+ depends on NFS_V3
+ help
+ Some NFS servers support an auxiliary NFSv3 ACL protocol that
+ Sun added to Solaris but never became an official part of the
+ NFS version 3 protocol. This protocol extension allows
+ applications on NFS clients to manipulate POSIX Access Control
+ Lists on files residing on NFS servers. NFS servers enforce
+ ACLs on local files whether this protocol is available or not.
+
+ Choose Y here if your NFS server supports the Solaris NFSv3 ACL
+ protocol extension and you want your NFS client to allow
+ applications to access and modify ACLs on files on the server.
+
+ Most NFS servers don't support the Solaris NFSv3 ACL protocol
+ extension. You can choose N here or specify the "noacl" mount
+ option to prevent your NFS client from trying to use the NFSv3
+ ACL protocol.
+
+ If unsure, say N.
+
+config NFS_V4
+ tristate "NFS client support for NFS version 4"
+ depends on NFS_FS
+ select SUNRPC_GSS
+ select KEYS
+ help
+ This option enables support for version 4 of the NFS protocol
+ (RFC 3530) in the kernel's NFS client.
+
+ To mount NFS servers using NFSv4, you also need to install user
+ space programs which can be found in the Linux nfs-utils package,
+ available from http://linux-nfs.org/.
+
+ If unsure, say Y.
+
+config NFS_SWAP
+ bool "Provide swap over NFS support"
+ default n
+ depends on NFS_FS && SWAP
+ select SUNRPC_SWAP
+ help
+ This option enables swapon to work on files located on NFS mounts.
+
+config NFS_V4_1
+ bool "NFS client support for NFSv4.1"
+ depends on NFS_V4
+ select SUNRPC_BACKCHANNEL
+ help
+ This option enables support for minor version 1 of the NFSv4 protocol
+ (RFC 5661) in the kernel's NFS client.
+
+ If unsure, say N.
+
+config NFS_V4_2
+ bool "NFS client support for NFSv4.2"
+ depends on NFS_V4_1
+ help
+ This option enables support for minor version 2 of the NFSv4 protocol
+ in the kernel's NFS client.
+
+ If unsure, say N.
+
+config PNFS_FILE_LAYOUT
+ tristate
+ depends on NFS_V4_1
+ default NFS_V4
+
+config PNFS_BLOCK
+ tristate
+ depends on NFS_V4_1 && BLK_DEV_DM
+ default NFS_V4
+
+config PNFS_FLEXFILE_LAYOUT
+ tristate
+ depends on NFS_V4_1 && NFS_V3
+ default NFS_V4
+
+config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
+ string "NFSv4.1 Implementation ID Domain"
+ depends on NFS_V4_1
+ default "kernel.org"
+ help
+ This option defines the domain portion of the implementation ID that
+ may be sent in the NFS exchange_id operation. The value must be in
+ the format of a DNS domain name and should be set to the DNS domain
+ name of the distribution.
+ If the NFS client is unchanged from the upstream kernel, this
+ option should be set to the default "kernel.org".
+
+config NFS_V4_1_MIGRATION
+ bool "NFSv4.1 client support for migration"
+ depends on NFS_V4_1
+ default n
+ help
+ This option makes the NFS client advertise to NFSv4.1 servers that
+ it can support NFSv4 migration.
+
+ The NFSv4.1 pieces of the Linux NFSv4 migration implementation are
+ still experimental. If you are not an NFSv4 developer, say N here.
+
+config NFS_V4_SECURITY_LABEL
+ bool
+ depends on NFS_V4_2 && SECURITY
+ default y
+
+config ROOT_NFS
+ bool "Root file system on NFS"
+ depends on NFS_FS=y && IP_PNP
+ help
+ If you want your system to mount its root file system via NFS,
+ choose Y here. This is common practice for managing systems
+ without local permanent storage. For details, read
+ <file:Documentation/admin-guide/nfs/nfsroot.rst>.
+
+ Most people say N here.
+
+config NFS_FSCACHE
+ bool "Provide NFS client caching support"
+ depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
+ help
+ Say Y here if you want NFS data to be cached locally on disc through
+ the general filesystem cache manager.
+
+config NFS_USE_LEGACY_DNS
+ bool "Use the legacy NFS DNS resolver"
+ depends on NFS_V4
+ help
+ The kernel now provides a method for translating a host name into an
+ IP address. Select Y here if you would rather use your own DNS
+ resolver script.
+
+ If unsure, say N.
+
+config NFS_USE_KERNEL_DNS
+ bool
+ depends on NFS_V4 && !NFS_USE_LEGACY_DNS
+ select DNS_RESOLVER
+ default y
+
+config NFS_DEBUG
+ bool
+ depends on NFS_FS && SUNRPC_DEBUG
+ select CRC32
+ default y
+
+config NFS_DISABLE_UDP_SUPPORT
+ bool "NFS: Disable NFS UDP protocol support"
+ depends on NFS_FS
+ default y
+ help
+ Choose Y here to disable the use of NFS over UDP. NFS over UDP
+ on modern networks (1Gb+) can lead to data corruption caused by
+ fragmentation during high loads.
+
+config NFS_V4_2_READ_PLUS
+ bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation"
+ depends on NFS_V4_2
+ default n
+ help
+ This is intended for developers only. The READ_PLUS operation has
+ been shown to have issues under specific conditions and should not
+ be used in production.
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
new file mode 100644
index 000000000..22d11fdc6
--- /dev/null
+++ b/fs/nfs/Makefile
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the Linux nfs filesystem routines.
+#
+
+obj-$(CONFIG_NFS_FS) += nfs.o
+
+CFLAGS_nfstrace.o += -I$(src)
+nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
+ io.o direct.o pagelist.o read.o symlink.o unlink.o \
+ write.o namespace.o mount_clnt.o nfstrace.o \
+ export.o sysfs.o fs_context.o
+nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
+nfs-$(CONFIG_SYSCTL) += sysctl.o
+nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
+
+obj-$(CONFIG_NFS_V2) += nfsv2.o
+nfsv2-y := nfs2super.o proc.o nfs2xdr.o
+
+obj-$(CONFIG_NFS_V3) += nfsv3.o
+nfsv3-y := nfs3super.o nfs3client.o nfs3proc.o nfs3xdr.o
+nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
+
+obj-$(CONFIG_NFS_V4) += nfsv4.o
+CFLAGS_nfs4trace.o += -I$(src)
+nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
+ delegation.o nfs4idmap.o callback.o callback_xdr.o callback_proc.o \
+ nfs4namespace.o nfs4getroot.o nfs4client.o nfs4session.o \
+ dns_resolve.o nfs4trace.o
+nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
+nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
+nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o
+nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o nfs42xattr.o
+
+obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
+obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
+obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
new file mode 100644
index 000000000..7668a1bfb
--- /dev/null
+++ b/fs/nfs/blocklayout/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the pNFS block layout driver kernel module
+#
+obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
+
+blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
new file mode 100644
index 000000000..73000aa2d
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -0,0 +1,1055 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayout.c
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@citi.umich.edu>
+ * Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/bio.h> /* struct bio */
+#include <linux/prefetch.h>
+#include <linux/pagevec.h>
+
+#include "../pnfs.h"
+#include "../nfs4session.h"
+#include "../internal.h"
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
+MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
+
+/*
+ * Tell whether an extent has nothing to be read from the device.
+ *
+ * PNFS_BLOCK_NONE_DATA extents are always holes.  PNFS_BLOCK_INVALID_DATA
+ * extents are holes only while be_tag is zero.  Every other state is
+ * reported as not being a hole.
+ */
+static bool is_hole(struct pnfs_block_extent *be)
+{
+	if (be->be_state == PNFS_BLOCK_NONE_DATA)
+		return true;
+	if (be->be_state == PNFS_BLOCK_INVALID_DATA)
+		return !be->be_tag;
+	return false;
+}
+
+/* The data we are handed might be spread across several bios. We need
+ * to track when the last one is finished.
+ *
+ * The structure is reference counted: alloc_parallel() creates it with
+ * one reference, bl_submit_bio() takes one more per submitted bio, and
+ * dropping the last reference runs destroy_parallel().
+ */
+struct parallel_io {
+ /* Reference count; the release function is destroy_parallel(). */
+ struct kref refcnt;
+ /* Invoked with @data by destroy_parallel(). Not set by
+  * alloc_parallel(); the caller assigns it after allocation.
+  */
+ void (*pnfs_callback) (void *data);
+ /* Opaque cookie handed to pnfs_callback. */
+ void *data;
+};
+
+/*
+ * Allocate a parallel_io tracker holding a single reference.
+ *
+ * @data is stored as the callback cookie.  The pnfs_callback member is
+ * left uninitialised here and must be set by the caller.  Returns NULL
+ * if the allocation fails.
+ */
+static inline struct parallel_io *alloc_parallel(void *data)
+{
+	struct parallel_io *par = kmalloc(sizeof(*par), GFP_NOFS);
+
+	if (!par)
+		return NULL;
+
+	par->data = data;
+	kref_init(&par->refcnt);
+	return par;
+}
+
+/* Take an additional reference on @p. */
+static inline void get_parallel(struct parallel_io *p)
+{
+ kref_get(&p->refcnt);
+}
+
+/*
+ * kref release function for struct parallel_io: run the completion
+ * callback with its cookie, then free the tracker.
+ */
+static void destroy_parallel(struct kref *kref)
+{
+	struct parallel_io *par;
+
+	par = container_of(kref, struct parallel_io, refcnt);
+
+	dprintk("%s enter\n", __func__);
+	par->pnfs_callback(par->data);
+	kfree(par);
+}
+
+/* Drop one reference on @p; the last put runs destroy_parallel(). */
+static inline void put_parallel(struct parallel_io *p)
+{
+ kref_put(&p->refcnt, destroy_parallel);
+}
+
+/*
+ * Submit @bio, if there is one, after taking a parallel_io reference on
+ * behalf of it (bio->bi_private is the tracker).
+ *
+ * Always returns NULL so callers can write "bio = bl_submit_bio(bio)" to
+ * forget the bio they just handed off.
+ */
+static struct bio *
+bl_submit_bio(struct bio *bio)
+{
+	if (!bio)
+		return NULL;
+
+	get_parallel(bio->bi_private);
+	dprintk("%s submitting %s bio %u@%llu\n", __func__,
+			bio_op(bio) == READ ? "read" : "write",
+			bio->bi_iter.bi_size,
+			(unsigned long long)bio->bi_iter.bi_sector);
+	submit_bio(bio);
+
+	return NULL;
+}
+
+/*
+ * Allocate a bio with room for up to @npg pages (capped at BIO_MAX_PAGES)
+ * and point it at @disk_sector on @bdev, with @end_io as completion
+ * handler and @par as private data.
+ *
+ * If the first allocation fails and the task has PF_MEMALLOC set, the
+ * request is retried with the page count halved each time until it
+ * succeeds or the count reaches zero.  Returns NULL on failure.
+ */
+static struct bio *
+bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
+		bio_end_io_t end_io, struct parallel_io *par)
+{
+	struct bio *bio;
+
+	npg = min(npg, BIO_MAX_PAGES);
+	bio = bio_alloc(GFP_NOIO, npg);
+	if (!bio && (current->flags & PF_MEMALLOC)) {
+		for (npg /= 2; npg; npg /= 2) {
+			bio = bio_alloc(GFP_NOIO, npg);
+			if (bio)
+				break;
+		}
+	}
+
+	if (!bio)
+		return NULL;
+
+	bio->bi_iter.bi_sector = disk_sector;
+	bio_set_dev(bio, bdev);
+	bio->bi_end_io = end_io;
+	bio->bi_private = par;
+	return bio;
+}
+
+/* True when @offset lies in the half-open range [map->start, map->start + map->len). */
+static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map)
+{
+	if (offset < map->start)
+		return false;
+
+	return offset < map->start + map->len;
+}
+
+/*
+ * Add (part of) @page to the bio being built for extent @be.
+ *
+ * @bio:    bio under construction, or NULL to start a new one
+ * @npg:    page-count hint used when a new bio has to be allocated
+ * @rw:     operation set on a newly allocated bio
+ * @isect:  file sector of the data; translated here to a device offset
+ * @map:    cached device mapping; refreshed through dev->map() when the
+ *          translated address falls outside it
+ * @end_io: completion handler for newly allocated bios
+ * @par:    tracker stored as bi_private of newly allocated bios
+ * @offset: offset of the data within @page
+ * @len:    in: bytes to add; out: possibly shortened so the I/O does not
+ *          run past the end of the current device mapping
+ *
+ * Returns the bio to keep building on (the one passed in, or a fresh
+ * one if the previous bio had to be submitted), ERR_PTR(-EIO) if the
+ * address cannot be mapped, or ERR_PTR(-ENOMEM) if no bio could be
+ * allocated.
+ *
+ * NOTE(review): on the -EIO path a non-NULL @bio is neither submitted
+ * nor released here, and the callers in this file overwrite their
+ * pointer with NULL - confirm whether that bio is leaked.
+ */
+static struct bio *
+do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
+		struct page *page, struct pnfs_block_dev_map *map,
+		struct pnfs_block_extent *be, bio_end_io_t end_io,
+		struct parallel_io *par, unsigned int offset, int *len)
+{
+	struct pnfs_block_dev *dev =
+		container_of(be->be_device, struct pnfs_block_dev, node);
+	u64 disk_addr, end;
+
+	dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
+		npg, rw, (unsigned long long)isect, offset, *len);
+
+	/* translate to device offset */
+	isect += be->be_v_offset;
+	isect -= be->be_f_offset;
+
+	/* translate to physical disk offset */
+	disk_addr = (u64)isect << SECTOR_SHIFT;
+	if (!offset_in_map(disk_addr, map)) {
+		if (!dev->map(dev, disk_addr, map) || !offset_in_map(disk_addr, map))
+			return ERR_PTR(-EIO);
+		/* A new mapping may be a different device: flush what we have. */
+		bio = bl_submit_bio(bio);
+	}
+	disk_addr += map->disk_offset;
+	disk_addr -= map->start;
+
+	/*
+	 * Limit length to what the device mapping allows.  disk_addr is now
+	 * a physical disk offset, so the end of the mapping must be
+	 * expressed in the same address space (disk_offset + len) rather
+	 * than in volume space (start + len).
+	 */
+	end = disk_addr + *len;
+	if (end >= map->disk_offset + map->len)
+		*len = map->disk_offset + map->len - disk_addr;
+
+retry:
+	if (!bio) {
+		bio = bl_alloc_init_bio(npg, map->bdev,
+				disk_addr >> SECTOR_SHIFT, end_io, par);
+		if (!bio)
+			return ERR_PTR(-ENOMEM);
+		bio_set_op_attrs(bio, rw, 0);
+	}
+	/* If the bio is full, send it off and start another one. */
+	if (bio_add_page(bio, page, *len, offset) < *len) {
+		bio = bl_submit_bio(bio);
+		goto retry;
+	}
+	return bio;
+}
+
+/*
+ * Walk the extents covering the byte range of @header and mark the
+ * device behind each one as unavailable.
+ *
+ * @rw is passed through to ext_tree_lookup().  The walk stops early if
+ * a lookup fails.
+ */
+static void bl_mark_devices_unavailable(struct nfs_pgio_header *header, bool rw)
+{
+	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+	sector_t isect = header->args.offset >> SECTOR_SHIFT;
+	size_t bytes_left = header->args.count;
+	struct pnfs_block_extent be;
+
+	/* Account for the bytes between the sector boundary and the offset. */
+	bytes_left += header->args.offset - (isect << SECTOR_SHIFT);
+
+	while (bytes_left > 0) {
+		sector_t extent_length, extent_bytes;
+
+		if (!ext_tree_lookup(bl, isect, &be, rw))
+			return;
+
+		/* Sectors remaining in this extent starting at isect. */
+		extent_length = be.be_length - (isect - be.be_f_offset);
+		nfs4_mark_deviceid_unavailable(be.be_device);
+		isect += extent_length;
+
+		extent_bytes = extent_length << SECTOR_SHIFT;
+		if (bytes_left > extent_bytes)
+			bytes_left -= extent_bytes;
+		else
+			bytes_left = 0;
+	}
+}
+
+/*
+ * Completion handler for read bios.
+ *
+ * On error, record -EIO in the header (unless an error is already
+ * recorded), flag the layout segment as failed and mark the devices
+ * covering the request unavailable.  In all cases release the bio and
+ * drop the parallel_io reference taken when it was submitted.
+ */
+static void bl_end_io_read(struct bio *bio)
+{
+	struct parallel_io *par = bio->bi_private;
+	struct nfs_pgio_header *hdr = par->data;
+
+	if (bio->bi_status) {
+		if (!hdr->pnfs_error)
+			hdr->pnfs_error = -EIO;
+		pnfs_set_lo_fail(hdr->lseg);
+		bl_mark_devices_unavailable(hdr, false);
+	}
+
+	bio_put(bio);
+	put_parallel(par);
+}
+
+/*
+ * Work item queued by bl_end_par_io_read(): recover the pgio header
+ * from the embedded work struct and report read completion.
+ */
+static void bl_read_cleanup(struct work_struct *work)
+{
+	struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
+	struct nfs_pgio_header *hdr =
+		container_of(task, struct nfs_pgio_header, task);
+
+	dprintk("%s enter\n", __func__);
+	pnfs_ld_read_done(hdr);
+}
+
+/*
+ * parallel_io callback for reads, run by destroy_parallel() when the
+ * last reference is dropped. Copies the accumulated pnfs_error into the
+ * task status and defers the rest of completion to bl_read_cleanup()
+ * via schedule_work().
+ */
+static void
+bl_end_par_io_read(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ hdr->task.tk_status = hdr->pnfs_error;
+ INIT_WORK(&hdr->task.u.tk_work, bl_read_cleanup);
+ schedule_work(&hdr->task.u.tk_work);
+}
+
+/*
+ * Read the pages described by @header through the block layout.
+ *
+ * Each page is matched to the extent covering its file sector. Pages in
+ * a hole are zero-filled without touching the device; all others are
+ * added to bios that are submitted as they fill up or as the extent or
+ * device mapping changes.
+ *
+ * Returns PNFS_NOT_ATTEMPTED only if the parallel_io tracker cannot be
+ * allocated. Otherwise returns PNFS_ATTEMPTED; failures are recorded in
+ * header->pnfs_error and completion is reported through
+ * bl_end_par_io_read() once the last parallel_io reference is dropped.
+ */
+static enum pnfs_try_status
+bl_read_pagelist(struct nfs_pgio_header *header)
+{
+ struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+ /* start == NFS4_MAX_UINT64 means "no cached mapping yet" */
+ struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
+ struct bio *bio = NULL;
+ struct pnfs_block_extent be;
+ sector_t isect, extent_length = 0;
+ struct parallel_io *par;
+ loff_t f_offset = header->args.offset;
+ size_t bytes_left = header->args.count;
+ unsigned int pg_offset = header->args.pgbase, pg_len;
+ struct page **pages = header->args.pages;
+ int pg_index = header->args.pgbase >> PAGE_SHIFT;
+ const bool is_dio = (header->dreq != NULL);
+ struct blk_plug plug;
+ int i;
+
+ dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
+ header->page_array.npages, f_offset,
+ (unsigned int)header->args.count);
+
+ /* The tracker starts with one reference, dropped at "out" below. */
+ par = alloc_parallel(header);
+ if (!par)
+ return PNFS_NOT_ATTEMPTED;
+ par->pnfs_callback = bl_end_par_io_read;
+
+ blk_start_plug(&plug);
+
+ isect = (sector_t) (f_offset >> SECTOR_SHIFT);
+ /* Code assumes extents are page-aligned */
+ for (i = pg_index; i < header->page_array.npages; i++) {
+ if (extent_length <= 0) {
+ /* We've used up the previous extent */
+ bio = bl_submit_bio(bio);
+
+ /* Get the next one */
+ if (!ext_tree_lookup(bl, isect, &be, false)) {
+ header->pnfs_error = -EIO;
+ goto out;
+ }
+ /* Sectors left in this extent starting at isect. */
+ extent_length = be.be_length - (isect - be.be_f_offset);
+ }
+
+ if (is_dio) {
+ /* With a dreq, clip to the rest of the page or of the request. */
+ if (pg_offset + bytes_left > PAGE_SIZE)
+ pg_len = PAGE_SIZE - pg_offset;
+ else
+ pg_len = bytes_left;
+ } else {
+ /* Without a dreq, whole pages starting at offset 0 are required. */
+ BUG_ON(pg_offset != 0);
+ pg_len = PAGE_SIZE;
+ }
+
+ if (is_hole(&be)) {
+ bio = bl_submit_bio(bio);
+ /* Fill hole w/ zeroes w/o accessing device */
+ dprintk("%s Zeroing page for hole\n", __func__);
+ zero_user_segment(pages[i], pg_offset, pg_len);
+
+ /* invalidate map */
+ map.start = NFS4_MAX_UINT64;
+ } else {
+ /* May shorten pg_len and may return a different bio. */
+ bio = do_add_page_to_bio(bio,
+ header->page_array.npages - i,
+ READ,
+ isect, pages[i], &map, &be,
+ bl_end_io_read, par,
+ pg_offset, &pg_len);
+ if (IS_ERR(bio)) {
+ header->pnfs_error = PTR_ERR(bio);
+ bio = NULL;
+ goto out;
+ }
+ }
+ isect += (pg_len >> SECTOR_SHIFT);
+ extent_length -= (pg_len >> SECTOR_SHIFT);
+ f_offset += pg_len;
+ bytes_left -= pg_len;
+ pg_offset = 0;
+ }
+ /* Report EOF and trim the byte count if we reached or passed i_size. */
+ if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
+ header->res.eof = 1;
+ header->res.count = header->inode->i_size - header->args.offset;
+ } else {
+ header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
+ }
+out:
+ /* Flush any partially built bio, then drop our initial reference. */
+ bl_submit_bio(bio);
+ blk_finish_plug(&plug);
+ put_parallel(par);
+ return PNFS_ATTEMPTED;
+}
+
+/*
+ * Completion handler for write bios. On error, record -EIO in the
+ * header (unless an error is already recorded), flag the layout segment
+ * as failed and mark the devices covering the request unavailable. In
+ * all cases release the bio and drop the parallel_io reference that
+ * bl_submit_bio() took for it.
+ */
+static void bl_end_io_write(struct bio *bio)
+{
+ struct parallel_io *par = bio->bi_private;
+ struct nfs_pgio_header *header = par->data;
+
+ if (bio->bi_status) {
+ if (!header->pnfs_error)
+ header->pnfs_error = -EIO;
+ pnfs_set_lo_fail(header->lseg);
+ bl_mark_devices_unavailable(header, true);
+ }
+ bio_put(bio);
+ put_parallel(par);
+}
+
+/*
+ * Work item queued by bl_end_par_io_write().
+ *
+ * When the write finished without error, the page-aligned range that
+ * was written is marked as written in the extent tree, together with
+ * the last-written byte offset.  Write completion is then reported in
+ * every case.
+ */
+static void bl_write_cleanup(struct work_struct *work)
+{
+	struct rpc_task *task;
+	struct nfs_pgio_header *hdr;
+
+	task = container_of(work, struct rpc_task, u.tk_work);
+	hdr = container_of(task, struct nfs_pgio_header, task);
+
+	dprintk("%s enter\n", __func__);
+
+	if (likely(!hdr->pnfs_error)) {
+		struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
+		/* First byte past the data that was written. */
+		u64 lwb = hdr->args.offset + hdr->args.count;
+		/* Round the range outwards to page boundaries. */
+		u64 start = hdr->args.offset & (loff_t)PAGE_MASK;
+		u64 end = (lwb + PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
+
+		ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
+				(end - start) >> SECTOR_SHIFT, lwb);
+	}
+
+	pnfs_ld_write_done(hdr);
+}
+
+/* Called when last of bios associated with a bl_write_pagelist call finishes.
+ * Copies the accumulated pnfs_error into the task status, records the
+ * write as NFS_FILE_SYNC in the verifier and defers the remaining work
+ * to bl_write_cleanup() via schedule_work().
+ */
+static void bl_end_par_io_write(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ hdr->task.tk_status = hdr->pnfs_error;
+ hdr->verf.committed = NFS_FILE_SYNC;
+ INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
+ schedule_work(&hdr->task.u.tk_work);
+}
+
+/*
+ * Write the pages described by @header through the block layout.
+ *
+ * Whole pages are written: the starting offset is rounded down to a
+ * page boundary and every page is added with length PAGE_SIZE. @sync is
+ * not used by this function.
+ *
+ * Returns PNFS_NOT_ATTEMPTED only if the parallel_io tracker cannot be
+ * allocated. Otherwise returns PNFS_ATTEMPTED; failures are recorded in
+ * header->pnfs_error and completion is reported through
+ * bl_end_par_io_write() once the last parallel_io reference is dropped.
+ */
+static enum pnfs_try_status
+bl_write_pagelist(struct nfs_pgio_header *header, int sync)
+{
+ struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+ /* start == NFS4_MAX_UINT64 means "no cached mapping yet" */
+ struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
+ struct bio *bio = NULL;
+ struct pnfs_block_extent be;
+ sector_t isect, extent_length = 0;
+ struct parallel_io *par = NULL;
+ loff_t offset = header->args.offset;
+ size_t count = header->args.count;
+ struct page **pages = header->args.pages;
+ int pg_index = header->args.pgbase >> PAGE_SHIFT;
+ unsigned int pg_len;
+ struct blk_plug plug;
+ int i;
+
+ dprintk("%s enter, %zu@%lld\n", __func__, count, offset);
+
+ /* At this point, header->page_array is a (sequential) list of nfs_pages.
+ * We want to write each, and if there is an error set pnfs_error
+ * to have it redone using nfs.
+ */
+ /* The tracker starts with one reference, dropped at "out" below. */
+ par = alloc_parallel(header);
+ if (!par)
+ return PNFS_NOT_ATTEMPTED;
+ par->pnfs_callback = bl_end_par_io_write;
+
+ blk_start_plug(&plug);
+
+ /* we always write out the whole page */
+ offset = offset & (loff_t)PAGE_MASK;
+ isect = offset >> SECTOR_SHIFT;
+
+ for (i = pg_index; i < header->page_array.npages; i++) {
+ if (extent_length <= 0) {
+ /* We've used up the previous extent */
+ bio = bl_submit_bio(bio);
+ /* Get the next one */
+ if (!ext_tree_lookup(bl, isect, &be, true)) {
+ header->pnfs_error = -EINVAL;
+ goto out;
+ }
+
+ /* Sectors left in this extent starting at isect. */
+ extent_length = be.be_length - (isect - be.be_f_offset);
+ }
+
+ pg_len = PAGE_SIZE;
+ /* May shorten pg_len and may return a different bio. */
+ bio = do_add_page_to_bio(bio, header->page_array.npages - i,
+ WRITE, isect, pages[i], &map, &be,
+ bl_end_io_write, par,
+ 0, &pg_len);
+ if (IS_ERR(bio)) {
+ header->pnfs_error = PTR_ERR(bio);
+ bio = NULL;
+ goto out;
+ }
+
+ offset += pg_len;
+ count -= pg_len;
+ isect += (pg_len >> SECTOR_SHIFT);
+ extent_length -= (pg_len >> SECTOR_SHIFT);
+ }
+
+ /* Every page was queued: report the full requested byte count. */
+ header->res.count = header->args.count;
+out:
+ /* Flush any partially built bio, then drop our initial reference. */
+ bl_submit_bio(bio);
+ blk_finish_plug(&plug);
+ put_parallel(par);
+ return PNFS_ATTEMPTED;
+}
+
+/*
+ * Release a block layout header: remove all extents over the range
+ * 0..LLONG_MAX from the extent tree (warning if that reports an error),
+ * then free the containing pnfs_block_layout after an RCU grace period.
+ */
+static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct pnfs_block_layout *blk_lo = BLK_LO2EXT(lo);
+	int status;
+
+	dprintk("%s enter\n", __func__);
+
+	status = ext_tree_remove(blk_lo, true, 0, LLONG_MAX);
+	WARN_ON(status);
+
+	kfree_rcu(blk_lo, bl_layout.plh_rcu);
+}
+
+/*
+ * Allocate and initialise a zeroed pnfs_block_layout and return its
+ * embedded generic layout header, or NULL if the allocation fails.
+ *
+ * @inode is not used here.  @is_scsi_layout is recorded in
+ * bl_scsi_layout.
+ */
+static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
+		gfp_t gfp_flags, bool is_scsi_layout)
+{
+	struct pnfs_block_layout *blk_lo;
+
+	dprintk("%s enter\n", __func__);
+
+	blk_lo = kzalloc(sizeof(*blk_lo), gfp_flags);
+	if (!blk_lo)
+		return NULL;
+
+	/* Both extent trees start out empty. */
+	blk_lo->bl_ext_rw = RB_ROOT;
+	blk_lo->bl_ext_ro = RB_ROOT;
+	spin_lock_init(&blk_lo->bl_ext_lock);
+	blk_lo->bl_scsi_layout = is_scsi_layout;
+
+	return &blk_lo->bl_layout;
+}
+
+/* Allocate a layout header with bl_scsi_layout set to false. */
+static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
+ gfp_t gfp_flags)
+{
+ return __bl_alloc_layout_hdr(inode, gfp_flags, false);
+}
+
+/* Allocate a layout header with bl_scsi_layout set to true. */
+static struct pnfs_layout_hdr *sl_alloc_layout_hdr(struct inode *inode,
+ gfp_t gfp_flags)
+{
+ return __bl_alloc_layout_hdr(inode, gfp_flags, true);
+}
+
+/* Free a layout segment; the segment itself is the only memory released here. */
+static void bl_free_lseg(struct pnfs_layout_segment *lseg)
+{
+ dprintk("%s enter\n", __func__);
+ kfree(lseg);
+}
+
+/* Tracks info needed to ensure extents in layout obey constraints of spec.
+ *
+ * verify_extent() updates this state as each extent of a layout is
+ * checked. The positions are compared against be_f_offset, and
+ * bl_alloc_lseg() seeds all three from the layout range offset shifted
+ * right by SECTOR_SHIFT.
+ */
+struct layout_verification {
+ u32 mode; /* R or RW */
+ u64 start; /* Expected start of next non-COW extent */
+ u64 inval; /* Start of INVAL coverage */
+ u64 cowread; /* End of COW read coverage */
+};
+
+/*
+ * Check one extent against the layout requirements of the pnfs-block
+ * draft, section 2.3.1, advancing the running state in @lv.
+ *
+ * Returns 0 if the extent is acceptable at this point in the layout,
+ * -EIO otherwise.
+ */
+static int verify_extent(struct pnfs_block_extent *be,
+		struct layout_verification *lv)
+{
+	if (lv->mode == IOMODE_READ) {
+		/* A read layout may not carry RW or INVALID extents. */
+		if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
+		    be->be_state == PNFS_BLOCK_INVALID_DATA)
+			return -EIO;
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		return 0;
+	}
+
+	/* lv->mode == IOMODE_RW */
+	switch (be->be_state) {
+	case PNFS_BLOCK_READWRITE_DATA:
+		if (be->be_f_offset != lv->start || lv->cowread > lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		lv->inval = lv->start;
+		return 0;
+
+	case PNFS_BLOCK_INVALID_DATA:
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		return 0;
+
+	case PNFS_BLOCK_READ_DATA:
+		if (be->be_f_offset > lv->start ||
+		    be->be_f_offset < lv->inval ||
+		    be->be_f_offset < lv->cowread)
+			return -EIO;
+		/* It looks like you might want to min this with lv->start,
+		 * but you really don't.
+		 */
+		lv->inval += be->be_length;
+		lv->cowread = be->be_f_offset + be->be_length;
+		return 0;
+
+	default:
+		return -EIO;
+	}
+}
+
+/*
+ * Decode a 64-bit byte quantity from the XDR buffer at *rp and store it
+ * in *sp shifted right by SECTOR_SHIFT.
+ *
+ * *rp is advanced past the value in every case.  Returns 0 on success,
+ * or -1 (leaving *sp untouched) if any of the low nine bits are set,
+ * i.e. the value is not a multiple of 512.
+ */
+static int decode_sector_number(__be32 **rp, sector_t *sp)
+{
+	uint64_t bytes;
+
+	*rp = xdr_decode_hyper(*rp, &bytes);
+
+	if ((bytes & 0x1ff) != 0) {
+		printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
+		return -1;
+	}
+
+	*sp = bytes >> SECTOR_SHIFT;
+	return 0;
+}
+
+/*
+ * Look up (and take a reference on) the device node for @id.
+ *
+ * A node that is not flagged NFS_DEVICEID_UNAVAILABLE is returned
+ * directly.  For a flagged node: if it was marked unavailable within
+ * the last PNFS_DEVICE_RETRY_TIMEOUT jiffies, the reference is dropped
+ * and ERR_PTR(-ENODEV) is returned; otherwise the device id is deleted
+ * and the lookup is repeated.
+ *
+ * Returns the node, or ERR_PTR(-ENODEV) when the lookup fails or the
+ * device is still considered unavailable.
+ */
+static struct nfs4_deviceid_node *
+bl_find_get_deviceid(struct nfs_server *server,
+		const struct nfs4_deviceid *id, const struct cred *cred,
+		gfp_t gfp_mask)
+{
+	struct nfs4_deviceid_node *node;
+
+	for (;;) {
+		unsigned long window_start, now;
+
+		node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
+		if (!node)
+			return ERR_PTR(-ENODEV);
+
+		if (!test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags))
+			return node;
+
+		now = jiffies;
+		window_start = now - PNFS_DEVICE_RETRY_TIMEOUT;
+		if (time_in_range(node->timestamp_unavailable,
+				  window_start, now))
+			break;
+
+		/* The retry timeout has passed: delete the id and look again. */
+		nfs4_delete_deviceid(node->ld, node->nfs_client, id);
+	}
+
+	nfs4_put_deviceid_node(node);
+	return ERR_PTR(-ENODEV);
+}
+
+/*
+ * Decode one extent from @xdr, validate it and append it to @extents.
+ *
+ * @lo:       layout header; supplies the server and credential used for
+ *            the device id lookup
+ * @lv:       running verification state, updated by verify_extent()
+ * @extents:  staging list that receives the new extent on success
+ * @gfp_mask: allocation mode for the extent and for the device lookup
+ *
+ * The encoded extent is a device id followed by 28 bytes: three 64-bit
+ * values and one 32-bit state.
+ *
+ * Returns 0 on success.  On failure nothing is added to @extents and
+ * the result is -EIO (short buffer, unaligned value or failed
+ * verification), -ENOMEM, or the error from the device id lookup.
+ */
+static int
+bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
+		struct layout_verification *lv, struct list_head *extents,
+		gfp_t gfp_mask)
+{
+	struct pnfs_block_extent *be;
+	struct nfs4_deviceid id;
+	int error;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE);
+	if (!p)
+		return -EIO;
+
+	/*
+	 * Honour the caller's allocation mode instead of hard-coding
+	 * GFP_NOFS, matching the device lookup below.
+	 */
+	be = kzalloc(sizeof(*be), gfp_mask);
+	if (!be)
+		return -ENOMEM;
+
+	memcpy(&id, p, NFS4_DEVICEID4_SIZE);
+	p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+	be->be_device = bl_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
+						lo->plh_lc_cred, gfp_mask);
+	if (IS_ERR(be->be_device)) {
+		error = PTR_ERR(be->be_device);
+		goto out_free_be;
+	}
+
+	/*
+	 * The next three values are read in as bytes, but stored in the
+	 * extent structure in 512-byte granularity.
+	 */
+	error = -EIO;
+	if (decode_sector_number(&p, &be->be_f_offset) < 0)
+		goto out_put_deviceid;
+	if (decode_sector_number(&p, &be->be_length) < 0)
+		goto out_put_deviceid;
+	if (decode_sector_number(&p, &be->be_v_offset) < 0)
+		goto out_put_deviceid;
+	be->be_state = be32_to_cpup(p++);
+
+	error = verify_extent(be, lv);
+	if (error) {
+		dprintk("%s: extent verification failed\n", __func__);
+		goto out_put_deviceid;
+	}
+
+	list_add_tail(&be->be_list, extents);
+	return 0;
+
+out_put_deviceid:
+	nfs4_put_deviceid_node(be->be_device);
+out_free_be:
+	kfree(be);
+	return error;
+}
+
+/*
+ * Build a layout segment from a LAYOUTGET result.
+ *
+ * The layout body is an extent count followed by that many extents. Each
+ * extent is decoded by bl_alloc_extent() onto a local staging list; only
+ * once the whole body has been decoded and checked are the extents moved
+ * into the layout's extent tree with ext_tree_insert(). If anything fails,
+ * every staged extent that was not inserted is released again.
+ *
+ * Returns the new segment on success. When the failure is -ENODEV the
+ * segment is still returned, but flagged NFS_LSEG_UNAVAILABLE. Any other
+ * failure frees the segment and returns ERR_PTR(status).
+ */
+static struct pnfs_layout_segment *
+bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
+		gfp_t gfp_mask)
+{
+	struct layout_verification lv = {
+		.mode = lgr->range.iomode,
+		.start = lgr->range.offset >> SECTOR_SHIFT,
+		.inval = lgr->range.offset >> SECTOR_SHIFT,
+		.cowread = lgr->range.offset >> SECTOR_SHIFT,
+	};
+	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+	struct pnfs_layout_segment *lseg;
+	struct xdr_buf buf;
+	struct xdr_stream xdr;
+	struct page *scratch;
+	int status;
+	/* The wire count is unsigned, so the loop index is unsigned too. */
+	uint32_t count, i;
+	__be32 *p;
+	LIST_HEAD(extents);
+
+	dprintk("---> %s\n", __func__);
+
+	lseg = kzalloc(sizeof(*lseg), gfp_mask);
+	if (!lseg)
+		return ERR_PTR(-ENOMEM);
+
+	status = -ENOMEM;
+	scratch = alloc_page(gfp_mask);
+	if (!scratch)
+		goto out;
+
+	xdr_init_decode_pages(&xdr, &buf,
+			lgr->layoutp->pages, lgr->layoutp->len);
+	xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
+
+	status = -EIO;
+	p = xdr_inline_decode(&xdr, 4);
+	if (unlikely(!p))
+		goto out_free_scratch;
+
+	count = be32_to_cpup(p++);
+	dprintk("%s: number of extents %u\n", __func__, count);
+
+	/*
+	 * Decode individual extents, putting them in temporary staging area
+	 * until whole layout is decoded to make error recovery easier.
+	 */
+	for (i = 0; i < count; i++) {
+		status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask);
+		if (status)
+			goto process_extents;
+	}
+
+	/* The extents must end exactly where the granted range ends. */
+	if (lgr->range.offset + lgr->range.length !=
+			lv.start << SECTOR_SHIFT) {
+		dprintk("%s Final length mismatch\n", __func__);
+		status = -EIO;
+		goto process_extents;
+	}
+
+	if (lv.start < lv.cowread) {
+		dprintk("%s Final uncovered COW extent\n", __func__);
+		status = -EIO;
+	}
+
+process_extents:
+	/*
+	 * Drain the staging list. While status is 0 each extent is handed to
+	 * the extent tree; from the first failure on, the remaining extents
+	 * (including the one that failed to insert) are released here.
+	 */
+	while (!list_empty(&extents)) {
+		struct pnfs_block_extent *be =
+			list_first_entry(&extents, struct pnfs_block_extent,
+					 be_list);
+		list_del(&be->be_list);
+
+		if (!status)
+			status = ext_tree_insert(bl, be);
+
+		if (status) {
+			nfs4_put_deviceid_node(be->be_device);
+			kfree(be);
+		}
+	}
+
+out_free_scratch:
+	__free_page(scratch);
+out:
+	dprintk("%s returns %d\n", __func__, status);
+	switch (status) {
+	case -ENODEV:
+		/* Our extent block devices are unavailable */
+		set_bit(NFS_LSEG_UNAVAILABLE, &lseg->pls_flags);
+		fallthrough;
+	case 0:
+		return lseg;
+	default:
+		kfree(lseg);
+		return ERR_PTR(status);
+	}
+}
+
+/*
+ * Drop the extents covered by @range from the layout's extent tree.
+ *
+ * The range is given in bytes and converted to 512-byte sectors. Nothing is
+ * removed when the offset, or a finite length, is not a multiple of 8. A
+ * length of NFS4_MAX_UINT64 means "to the end".
+ */
+static void
+bl_return_range(struct pnfs_layout_hdr *lo,
+		struct pnfs_layout_range *range)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+	sector_t first = range->offset >> SECTOR_SHIFT;
+	sector_t last;
+
+	if (range->offset % 8) {
+		dprintk("%s: offset %lld not block size aligned\n",
+			__func__, range->offset);
+		return;
+	}
+
+	if (range->length == NFS4_MAX_UINT64) {
+		last = round_down(NFS4_MAX_UINT64, PAGE_SIZE);
+	} else if (range->length % 8) {
+		dprintk("%s: length %lld not block size aligned\n",
+			__func__, range->length);
+		return;
+	} else {
+		last = first + (range->length >> SECTOR_SHIFT);
+	}
+
+	ext_tree_remove(bl, range->iomode & IOMODE_RW, first, last);
+}
+
+/*
+ * Layout driver prepare_layoutcommit hook: forwards @arg unchanged to
+ * ext_tree_prepare_commit() and returns its result.
+ */
+static int
+bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
+{
+ return ext_tree_prepare_commit(arg);
+}
+
+/*
+ * Layout driver cleanup_layoutcommit hook: passes the commit arguments and
+ * the result status stored in @lcdata to ext_tree_mark_committed().
+ */
+static void
+bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
+{
+ ext_tree_mark_committed(&lcdata->args, lcdata->res.status);
+}
+
+/*
+ * Layout driver set_layoutdriver hook: accept the server only if it
+ * reported a pNFS block size that is non-zero and no larger than PAGE_SIZE.
+ *
+ * Returns 0 if the block size is usable, -EINVAL otherwise.
+ */
+static int
+bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
+{
+	int err = -EINVAL;
+
+	dprintk("%s enter\n", __func__);
+
+	if (server->pnfs_blksize == 0)
+		dprintk("%s Server did not return blksize\n", __func__);
+	else if (server->pnfs_blksize > PAGE_SIZE)
+		printk(KERN_ERR "%s: pNFS blksize %d not supported.\n",
+			__func__, server->pnfs_blksize);
+	else
+		err = 0;
+
+	return err;
+}
+
+/*
+ * Decide whether @req may go through the block layout path.
+ *
+ * Requests without a direct-I/O context (pgio->pg_dreq == NULL) are always
+ * accepted. Otherwise the request offset must be a multiple of @alignment,
+ * and so must its length - except for a write that ends exactly at the
+ * current inode size.
+ */
+static bool
+is_aligned_req(struct nfs_pageio_descriptor *pgio,
+		struct nfs_page *req, unsigned int alignment, bool is_write)
+{
+	/* Buffered I/O: higher layers take care of the right alignment. */
+	if (!pgio->pg_dreq)
+		return true;
+
+	if (!IS_ALIGNED(req->wb_offset, alignment))
+		return false;
+
+	if (IS_ALIGNED(req->wb_bytes, alignment))
+		return true;
+
+	/*
+	 * An unaligned tail is tolerated only for a write reaching the inode
+	 * size: the full page is written then. Data past the inode size is
+	 * guaranteed to be zeroed by the higher level client code, and this
+	 * behaviour is mandated by RFC 5663 section 2.3.2.
+	 */
+	return is_write &&
+		(req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode));
+}
+
+/*
+ * pg_init hook for reads. Requests that are not SECTOR_SIZE aligned, and
+ * requests whose layout segment turns out to be flagged
+ * NFS_LSEG_UNAVAILABLE, are redirected to the MDS; in the latter case the
+ * layout is also marked for return and flagged as failed.
+ */
+static void
+bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+	if (is_aligned_req(pgio, req, SECTOR_SIZE, false)) {
+		pnfs_generic_pg_init_read(pgio, req);
+
+		if (!pgio->pg_lseg)
+			return;
+		if (!test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags))
+			return;
+
+		pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg);
+		pnfs_set_lo_fail(pgio->pg_lseg);
+	}
+
+	nfs_pageio_reset_read_mds(pgio);
+}
+
+/*
+ * pg_test hook for reads: a request that fails the SECTOR_SIZE alignment
+ * check cannot be coalesced into @pgio (0 is returned); for any other
+ * request the answer of pnfs_generic_pg_test() is passed through, i.e. the
+ * number of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
+bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+		struct nfs_page *req)
+{
+	return is_aligned_req(pgio, req, SECTOR_SIZE, false) ?
+		pnfs_generic_pg_test(pgio, prev, req) : 0;
+}
+
+/*
+ * Return the number of contiguous bytes for a given inode
+ * starting at page frame idx.
+ *
+ * If the page cache holds exactly as many pages as the file size needs, the
+ * run is taken to extend to the last page of the file. Otherwise the page
+ * cache is searched for the first missing page after @idx. If that search
+ * yields 0, the distance from @idx to the inode size is returned instead.
+ */
+static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
+{
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t end;
+
+	/* Optimize common case that writes from 0 to end of file */
+	end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+	if (end != mapping->nrpages) {
+		rcu_read_lock();
+		end = page_cache_next_miss(mapping, idx + 1, ULONG_MAX);
+		rcu_read_unlock();
+	}
+
+	/*
+	 * Widen to u64 before shifting: the page index type can be narrower
+	 * than the u64 byte count returned here, and shifting in the narrow
+	 * type would drop the high bits for large file offsets.
+	 */
+	if (!end)
+		return i_size_read(inode) - ((u64)idx << PAGE_SHIFT);
+	else
+		return (u64)(end - idx) << PAGE_SHIFT;
+}
+
+/*
+ * pg_init hook for writes. Requests that fail the PAGE_SIZE alignment check
+ * go to the MDS. Otherwise the generic pNFS write setup is run with a size
+ * hint: the bytes left in the direct-I/O request if there is one, else the
+ * contiguous byte count starting at the request's page index. If the
+ * resulting layout segment is flagged NFS_LSEG_UNAVAILABLE, the layout is
+ * marked for return and failed, and the write falls back to the MDS.
+ */
+static void
+bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+	u64 wb_size;
+
+	if (!is_aligned_req(pgio, req, PAGE_SIZE, true)) {
+		nfs_pageio_reset_write_mds(pgio);
+		return;
+	}
+
+	if (pgio->pg_dreq != NULL)
+		wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+	else
+		wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
+					      req->wb_index);
+
+	pnfs_generic_pg_init_write(pgio, req, wb_size);
+
+	if (!pgio->pg_lseg)
+		return;
+	if (!test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags))
+		return;
+
+	pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg);
+	pnfs_set_lo_fail(pgio->pg_lseg);
+	nfs_pageio_reset_write_mds(pgio);
+}
+
+/*
+ * pg_test hook for writes: a request that fails the PAGE_SIZE alignment
+ * check cannot be coalesced into @pgio (0 is returned); for any other
+ * request the answer of pnfs_generic_pg_test() is passed through, i.e. the
+ * number of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
+bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+		struct nfs_page *req)
+{
+	return is_aligned_req(pgio, req, PAGE_SIZE, true) ?
+		pnfs_generic_pg_test(pgio, prev, req) : 0;
+}
+
+/*
+ * Page I/O operations for reads: driver-specific init/test hooks, generic
+ * pNFS helpers for issuing the I/O and for cleanup.
+ */
+static const struct nfs_pageio_ops bl_pg_read_ops = {
+ .pg_init = bl_pg_init_read,
+ .pg_test = bl_pg_test_read,
+ .pg_doio = pnfs_generic_pg_readpages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+/*
+ * Page I/O operations for writes: driver-specific init/test hooks, generic
+ * pNFS helpers for issuing the I/O and for cleanup.
+ */
+static const struct nfs_pageio_ops bl_pg_write_ops = {
+ .pg_init = bl_pg_init_write,
+ .pg_test = bl_pg_test_write,
+ .pg_doio = pnfs_generic_pg_writepages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+/*
+ * Layout driver description registered for LAYOUT_BLOCK_VOLUME. It shares
+ * every callback with scsilayout_type except alloc_layout_hdr.
+ */
+static struct pnfs_layoutdriver_type blocklayout_type = {
+ .id = LAYOUT_BLOCK_VOLUME,
+ .name = "LAYOUT_BLOCK_VOLUME",
+ .owner = THIS_MODULE,
+ .flags = PNFS_LAYOUTRET_ON_SETATTR |
+ PNFS_LAYOUTRET_ON_ERROR |
+ PNFS_READ_WHOLE_PAGE,
+ .read_pagelist = bl_read_pagelist,
+ .write_pagelist = bl_write_pagelist,
+ .alloc_layout_hdr = bl_alloc_layout_hdr,
+ .free_layout_hdr = bl_free_layout_hdr,
+ .alloc_lseg = bl_alloc_lseg,
+ .free_lseg = bl_free_lseg,
+ .return_range = bl_return_range,
+ .prepare_layoutcommit = bl_prepare_layoutcommit,
+ .cleanup_layoutcommit = bl_cleanup_layoutcommit,
+ .set_layoutdriver = bl_set_layoutdriver,
+ .alloc_deviceid_node = bl_alloc_deviceid_node,
+ .free_deviceid_node = bl_free_deviceid_node,
+ .pg_read_ops = &bl_pg_read_ops,
+ .pg_write_ops = &bl_pg_write_ops,
+ .sync = pnfs_generic_sync,
+};
+
+/*
+ * Layout driver description registered for LAYOUT_SCSI. Identical to
+ * blocklayout_type apart from id, name and the alloc_layout_hdr callback
+ * (sl_alloc_layout_hdr instead of bl_alloc_layout_hdr).
+ */
+static struct pnfs_layoutdriver_type scsilayout_type = {
+ .id = LAYOUT_SCSI,
+ .name = "LAYOUT_SCSI",
+ .owner = THIS_MODULE,
+ .flags = PNFS_LAYOUTRET_ON_SETATTR |
+ PNFS_LAYOUTRET_ON_ERROR |
+ PNFS_READ_WHOLE_PAGE,
+ .read_pagelist = bl_read_pagelist,
+ .write_pagelist = bl_write_pagelist,
+ .alloc_layout_hdr = sl_alloc_layout_hdr,
+ .free_layout_hdr = bl_free_layout_hdr,
+ .alloc_lseg = bl_alloc_lseg,
+ .free_lseg = bl_free_lseg,
+ .return_range = bl_return_range,
+ .prepare_layoutcommit = bl_prepare_layoutcommit,
+ .cleanup_layoutcommit = bl_cleanup_layoutcommit,
+ .set_layoutdriver = bl_set_layoutdriver,
+ .alloc_deviceid_node = bl_alloc_deviceid_node,
+ .free_deviceid_node = bl_free_deviceid_node,
+ .pg_read_ops = &bl_pg_read_ops,
+ .pg_write_ops = &bl_pg_write_ops,
+ .sync = pnfs_generic_sync,
+};
+
+
+/*
+ * Module init: set up the rpc_pipefs side, then register the block and
+ * SCSI layout drivers. Each failure undoes the steps already completed, in
+ * reverse order, and returns the failing step's error code.
+ */
+static int __init nfs4blocklayout_init(void)
+{
+ int ret;
+
+ dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
+
+ ret = bl_init_pipefs();
+ if (ret)
+ goto out;
+
+ ret = pnfs_register_layoutdriver(&blocklayout_type);
+ if (ret)
+ goto out_cleanup_pipe;
+
+ ret = pnfs_register_layoutdriver(&scsilayout_type);
+ if (ret)
+ goto out_unregister_block;
+ return 0;
+
+out_unregister_block:
+ pnfs_unregister_layoutdriver(&blocklayout_type);
+out_cleanup_pipe:
+ bl_cleanup_pipefs();
+out:
+ return ret;
+}
+
+/*
+ * Module exit: undo nfs4blocklayout_init() in reverse order - unregister
+ * the SCSI driver, then the block driver, then tear down the pipefs side.
+ */
+static void __exit nfs4blocklayout_exit(void)
+{
+ dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
+ __func__);
+
+ pnfs_unregister_layoutdriver(&scsilayout_type);
+ pnfs_unregister_layoutdriver(&blocklayout_type);
+ bl_cleanup_pipefs();
+}
+
+/*
+ * Module aliases, one per layout driver registered above.
+ * NOTE(review): the trailing numbers are presumably the layout type ids of
+ * LAYOUT_BLOCK_VOLUME and LAYOUT_SCSI - confirm against their definitions.
+ */
+MODULE_ALIAS("nfs-layouttype4-3");
+MODULE_ALIAS("nfs-layouttype4-5");
+
+module_init(nfs4blocklayout_init);
+module_exit(nfs4blocklayout_exit);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
new file mode 100644
index 000000000..716bc75e9
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -0,0 +1,197 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayout.h
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@citi.umich.edu>
+ * Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
+#define FS_NFS_NFS4BLOCKLAYOUT_H
+
+#include <linux/device-mapper.h>
+#include <linux/nfs_fs.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
+#include "../nfs4_fs.h"
+#include "../pnfs.h"
+#include "../netns.h"
+
+/* Number of 512-byte sectors in one page, and the matching shift. */
+#define PAGE_CACHE_SECTORS (PAGE_SIZE >> SECTOR_SHIFT)
+#define PAGE_CACHE_SECTOR_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
+/* Size of one sector in bytes. */
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
+
+struct pnfs_block_dev;
+
+/* Upper bounds enforced while decoding a volume description. */
+#define PNFS_BLOCK_MAX_UUIDS 4
+#define PNFS_BLOCK_MAX_DEVICES 64
+
+/*
+ * Random upper cap for the uuid length to avoid unbounded allocation.
+ * Not actually limited by the protocol.
+ */
+#define PNFS_BLOCK_UUID_LEN 128
+
+/*
+ * Decoded form of one volume entry of a device description, as filled in
+ * by nfs4_block_decode_volume(). @type selects which union member is valid.
+ * Fields named "volume"/"volumes" hold indices into the array of decoded
+ * volumes.
+ */
+struct pnfs_block_volume {
+ enum pnfs_block_volume_type type;
+ union {
+ /* PNFS_BLOCK_VOLUME_SIMPLE: identified by on-disk signatures */
+ struct {
+ int len; /* encoded size of this entry in bytes */
+ int nr_sigs; /* number of valid entries in sigs[] */
+ struct {
+ u64 offset;
+ u32 sig_len;
+ u8 sig[PNFS_BLOCK_UUID_LEN];
+ } sigs[PNFS_BLOCK_MAX_UUIDS];
+ } simple;
+ /* PNFS_BLOCK_VOLUME_SLICE: a window onto another volume */
+ struct {
+ u64 start;
+ u64 len;
+ u32 volume;
+ } slice;
+ /* PNFS_BLOCK_VOLUME_CONCAT: several volumes back to back */
+ struct {
+ u32 volumes_count;
+ u32 volumes[PNFS_BLOCK_MAX_DEVICES];
+ } concat;
+ /* PNFS_BLOCK_VOLUME_STRIPE: several volumes striped by chunk */
+ struct {
+ u64 chunk_size;
+ u32 volumes_count;
+ u32 volumes[PNFS_BLOCK_MAX_DEVICES];
+ } stripe;
+ /* PNFS_BLOCK_VOLUME_SCSI: identified by a SCSI designator */
+ struct {
+ enum scsi_code_set code_set;
+ enum scsi_designator_type designator_type;
+ int designator_len; /* valid bytes in designator[] */
+ u8 designator[256];
+ u64 pr_key; /* persistent reservation key */
+ } scsi;
+ };
+};
+
+/*
+ * Result of a pnfs_block_dev ->map() call: the block device that backs the
+ * queried offset, plus the start, length and disk offset reported for that
+ * mapping.
+ */
+struct pnfs_block_dev_map {
+ u64 start;
+ u64 len;
+ u64 disk_offset;
+ struct block_device *bdev;
+};
+
+/*
+ * One node of a parsed volume topology. A leaf (simple or SCSI volume)
+ * holds an open block device in @bdev; a concat or stripe node holds an
+ * array of @nr_children embedded child nodes in @children. The top-level
+ * node is the one whose @node is handed to the generic deviceid code.
+ */
+struct pnfs_block_dev {
+ struct nfs4_deviceid_node node;
+
+ u64 start; /* offset of this child within a concat parent */
+ u64 len;
+
+ u32 nr_children; /* children parsed successfully so far */
+ struct pnfs_block_dev *children;
+ u64 chunk_size; /* stripe nodes only */
+
+ struct block_device *bdev;
+ u64 disk_offset; /* set from the slice start for slices */
+
+ u64 pr_key; /* SCSI persistent reservation key */
+ bool pr_registered; /* pr_key was registered on bdev */
+
+ /* Translate @offset on this node into @map; false on failure. */
+ bool (*map)(struct pnfs_block_dev *dev, u64 offset,
+ struct pnfs_block_dev_map *map);
+};
+
+/*
+ * One extent of a block layout.
+ *
+ * sector_t fields are all in 512-byte sectors. The same storage links the
+ * extent either into a list (be_list, used while a layout is being
+ * decoded) or into an rb-tree (be_node).
+ */
+struct pnfs_block_extent {
+ union {
+ struct rb_node be_node;
+ struct list_head be_list;
+ };
+ struct nfs4_deviceid_node *be_device;
+ sector_t be_f_offset; /* the starting offset in the file */
+ sector_t be_length; /* the size of the extent */
+ sector_t be_v_offset; /* the starting offset in the volume */
+ enum pnfs_block_extent_state be_state; /* the state of this extent */
+/* NOTE(review): presumably the values stored in be_tag - confirm */
+#define EXTENT_WRITTEN 1
+#define EXTENT_COMMITTING 2
+ unsigned int be_tag;
+};
+
+/*
+ * Block layout driver's per-layout state, wrapping the generic layout
+ * header (see BLK_LO2EXT() for the reverse conversion).
+ */
+struct pnfs_block_layout {
+ struct pnfs_layout_hdr bl_layout;
+ struct rb_root bl_ext_rw; /* NOTE(review): presumably read-write extents */
+ struct rb_root bl_ext_ro; /* NOTE(review): presumably read-only extents */
+ spinlock_t bl_ext_lock; /* Protects list manipulation */
+ bool bl_scsi_layout;
+ u64 bl_lwb; /* NOTE(review): presumably "last write byte" - confirm */
+};
+
+/*
+ * Convert a generic layout header into the pnfs_block_layout that embeds
+ * it as bl_layout.
+ */
+static inline struct pnfs_block_layout *
+BLK_LO2EXT(struct pnfs_layout_hdr *lo)
+{
+ return container_of(lo, struct pnfs_block_layout, bl_layout);
+}
+
+/* Same as BLK_LO2EXT(), starting from a layout segment's pls_layout. */
+static inline struct pnfs_block_layout *
+BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
+{
+ return BLK_LO2EXT(lseg->pls_layout);
+}
+
+/* An rpc_pipefs message paired with a pointer to a wait queue head. */
+struct bl_pipe_msg {
+ struct rpc_pipe_msg msg;
+ wait_queue_head_t *bl_wq;
+};
+
+/* Header placed in front of every message exchanged over the pipe. */
+struct bl_msg_hdr {
+ u8 type;
+ u16 totallen; /* length of entire message, including hdr itself */
+};
+
+/*
+ * Constants used by the pipe message protocol.
+ * NOTE(review): the first pair and the REQUEST_* triple reuse the same
+ * numeric values, so they are presumably used in different fields -
+ * confirm against rpc_pipefs.c.
+ */
+#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */
+#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/
+#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */
+#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
+#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
+
+/* dev.c */
+/* Decode a device description into a pnfs_block_dev; NULL on failure. */
+struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_mask);
+/* Release a node created by bl_alloc_deviceid_node(). */
+void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
+
+/* extent_tree.c */
+int ext_tree_insert(struct pnfs_block_layout *bl,
+ struct pnfs_block_extent *new);
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
+ sector_t end);
+int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+ sector_t len, u64 lwb);
+bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+ struct pnfs_block_extent *ret, bool rw);
+int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
+void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
+
+/* rpc_pipefs.c */
+/* Resolve a simple volume to a dev_t; callers treat 0 as failure. */
+dev_t bl_resolve_deviceid(struct nfs_server *server,
+ struct pnfs_block_volume *b, gfp_t gfp_mask);
+int __init bl_init_pipefs(void);
+void bl_cleanup_pipefs(void);
+
+#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
new file mode 100644
index 000000000..6e3a14fdf
--- /dev/null
+++ b/fs/nfs/blocklayout/dev.c
@@ -0,0 +1,548 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014-2016 Christoph Hellwig.
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/blkdev.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_xdr.h>
+#include <linux/pr.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/*
+ * Release everything a device node owns, recursing into its children. The
+ * node structure itself is not freed.
+ *
+ * Interior nodes (concat/stripe) own the @children array; leaf nodes own
+ * an open block device and possibly a registered reservation key.
+ */
+static void
+bl_free_device(struct pnfs_block_dev *dev)
+{
+	/*
+	 * Test the array pointer rather than nr_children: when parsing of a
+	 * concat/stripe volume fails on its very first child, the array has
+	 * been allocated but nr_children is still 0, and the array would
+	 * otherwise never be freed.
+	 */
+	if (dev->children) {
+		u32 i;
+
+		for (i = 0; i < dev->nr_children; i++)
+			bl_free_device(&dev->children[i]);
+		kfree(dev->children);
+	} else {
+		if (dev->pr_registered) {
+			const struct pr_ops *ops =
+				dev->bdev->bd_disk->fops->pr_ops;
+			int error;
+
+			/* Replace our key with 0, i.e. unregister it. */
+			error = ops->pr_register(dev->bdev, dev->pr_key, 0,
+				false);
+			if (error)
+				pr_err("failed to unregister PR key.\n");
+		}
+
+		if (dev->bdev)
+			blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
+	}
+}
+
+/*
+ * free_deviceid_node hook: release the resources of the pnfs_block_dev that
+ * embeds @d, then free the structure itself through kfree_rcu().
+ */
+void
+bl_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+ struct pnfs_block_dev *dev =
+ container_of(d, struct pnfs_block_dev, node);
+
+ bl_free_device(dev);
+ kfree_rcu(dev, node.rcu);
+}
+
+/*
+ * Decode one volume entry from @xdr into @b.
+ *
+ * The entry starts with a 32-bit volume type, followed by a type-specific
+ * payload. All counts and lengths supplied by the server are checked
+ * against the fixed-size arrays in struct pnfs_block_volume before they are
+ * used.
+ *
+ * Returns 0 on success, -EIO if the data is truncated, a count or length is
+ * out of range, or the volume type is unknown.
+ */
+static int
+nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
+{
+	__be32 *p;
+	int i;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (!p)
+		return -EIO;
+	b->type = be32_to_cpup(p++);
+
+	switch (b->type) {
+	case PNFS_BLOCK_VOLUME_SIMPLE:
+		p = xdr_inline_decode(xdr, 4);
+		if (!p)
+			return -EIO;
+		b->simple.nr_sigs = be32_to_cpup(p++);
+		/* nr_sigs is a signed int: reject negative counts as well */
+		if (b->simple.nr_sigs <= 0 ||
+		    b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) {
+			dprintk("Bad signature count: %d\n", b->simple.nr_sigs);
+			return -EIO;
+		}
+
+		/* running total of the encoded size: type + count ... */
+		b->simple.len = 4 + 4;
+		for (i = 0; i < b->simple.nr_sigs; i++) {
+			p = xdr_inline_decode(xdr, 8 + 4);
+			if (!p)
+				return -EIO;
+			p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
+			b->simple.sigs[i].sig_len = be32_to_cpup(p++);
+			if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) {
+				pr_info("signature too long: %d\n",
+					b->simple.sigs[i].sig_len);
+				return -EIO;
+			}
+
+			p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
+			if (!p)
+				return -EIO;
+			memcpy(&b->simple.sigs[i].sig, p,
+				b->simple.sigs[i].sig_len);
+
+			/* ... plus offset, length and the padded signature */
+			b->simple.len += 8 + 4 + \
+				(XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2);
+		}
+		break;
+	case PNFS_BLOCK_VOLUME_SLICE:
+		p = xdr_inline_decode(xdr, 8 + 8 + 4);
+		if (!p)
+			return -EIO;
+		p = xdr_decode_hyper(p, &b->slice.start);
+		p = xdr_decode_hyper(p, &b->slice.len);
+		b->slice.volume = be32_to_cpup(p++);
+		break;
+	case PNFS_BLOCK_VOLUME_CONCAT:
+		p = xdr_inline_decode(xdr, 4);
+		if (!p)
+			return -EIO;
+
+		b->concat.volumes_count = be32_to_cpup(p++);
+		if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
+			dprintk("Too many volumes: %d\n", b->concat.volumes_count);
+			return -EIO;
+		}
+
+		p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
+		if (!p)
+			return -EIO;
+		for (i = 0; i < b->concat.volumes_count; i++)
+			b->concat.volumes[i] = be32_to_cpup(p++);
+		break;
+	case PNFS_BLOCK_VOLUME_STRIPE:
+		p = xdr_inline_decode(xdr, 8 + 4);
+		if (!p)
+			return -EIO;
+
+		p = xdr_decode_hyper(p, &b->stripe.chunk_size);
+		b->stripe.volumes_count = be32_to_cpup(p++);
+		if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
+			dprintk("Too many volumes: %d\n", b->stripe.volumes_count);
+			return -EIO;
+		}
+
+		p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
+		if (!p)
+			return -EIO;
+		for (i = 0; i < b->stripe.volumes_count; i++)
+			b->stripe.volumes[i] = be32_to_cpup(p++);
+		break;
+	case PNFS_BLOCK_VOLUME_SCSI:
+		p = xdr_inline_decode(xdr, 4 + 4 + 4);
+		if (!p)
+			return -EIO;
+		b->scsi.code_set = be32_to_cpup(p++);
+		b->scsi.designator_type = be32_to_cpup(p++);
+		b->scsi.designator_len = be32_to_cpup(p++);
+		/*
+		 * Validate the length before it is used at all. It is stored
+		 * in a signed int, so a large wire value turns negative and
+		 * would slip past a plain upper-bound test, only to become a
+		 * huge size when passed to the decoder and to memcpy().
+		 */
+		if (b->scsi.designator_len < 0 ||
+		    b->scsi.designator_len > (int)sizeof(b->scsi.designator))
+			return -EIO;
+		p = xdr_inline_decode(xdr, b->scsi.designator_len);
+		if (!p)
+			return -EIO;
+		memcpy(&b->scsi.designator, p, b->scsi.designator_len);
+		p = xdr_inline_decode(xdr, 8);
+		if (!p)
+			return -EIO;
+		p = xdr_decode_hyper(p, &b->scsi.pr_key);
+		break;
+	default:
+		dprintk("unknown volume type!\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * ->map() for leaf devices: the whole device is one mapping, so @offset is
+ * not consulted and the node's own start, length, disk offset and block
+ * device are copied into @map. Always returns true.
+ */
+static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
+ struct pnfs_block_dev_map *map)
+{
+ map->start = dev->start;
+ map->len = dev->len;
+ map->disk_offset = dev->disk_offset;
+ map->bdev = dev->bdev;
+ return true;
+}
+
+/*
+ * ->map() for concatenated volumes: find the child whose
+ * [start, start + len) range contains @offset and let it fill in @map,
+ * using the offset relative to that child's start.
+ *
+ * Returns true once a child has been asked (the child's own result is not
+ * looked at), false if no child covers @offset.
+ */
+static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	int idx;
+
+	for (idx = 0; idx < dev->nr_children; idx++) {
+		struct pnfs_block_dev *member = &dev->children[idx];
+
+		if (member->start <= offset &&
+		    member->start + member->len > offset) {
+			member->map(member, offset - member->start, map);
+			return true;
+		}
+	}
+
+	dprintk("%s: ran off loop!\n", __func__);
+	return false;
+}
+
+/*
+ * ->map() for striped volumes: work out which chunk @offset falls into,
+ * pick the child that holds that chunk (chunks are dealt out round-robin)
+ * and let it fill in @map, then adjust the result so that it describes
+ * exactly one chunk.
+ *
+ * Returns true on success, false if the computed child index is out of
+ * range.
+ *
+ * NOTE(review): both divisions use dev->chunk_size / dev->nr_children as
+ * divisor without a zero check here, and div_u64() takes a 32-bit divisor
+ * while chunk_size is a u64 - confirm these are validated before a stripe
+ * device can be mapped.
+ */
+static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
+ struct pnfs_block_dev_map *map)
+{
+ struct pnfs_block_dev *child;
+ u64 chunk;
+ u32 chunk_idx;
+ u64 disk_offset;
+
+ /* chunk = index of the chunk overall, chunk_idx = child holding it */
+ chunk = div_u64(offset, dev->chunk_size);
+ div_u64_rem(chunk, dev->nr_children, &chunk_idx);
+
+ if (chunk_idx >= dev->nr_children) {
+ dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
+ __func__, chunk_idx, offset, dev->chunk_size);
+ /* error, should not happen */
+ return false;
+ }
+
+ /* truncate offset to the beginning of the stripe */
+ offset = chunk * dev->chunk_size;
+
+ /* disk offset of the stripe */
+ disk_offset = div_u64(offset, dev->nr_children);
+
+ child = &dev->children[chunk_idx];
+ /* the child's return value is not checked */
+ child->map(child, disk_offset, map);
+
+ map->start += offset;
+ map->disk_offset += disk_offset;
+ map->len = dev->chunk_size;
+ return true;
+}
+
+/*
+ * Forward declaration: the slice, concat and stripe parsers below recurse
+ * through this dispatcher, which is defined after them.
+ */
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
+
+
+/*
+ * Set up @d as a leaf backed by the simple volume volumes[@idx]: resolve
+ * the volume to a device number, open that block device read/write and
+ * record its size.
+ *
+ * Returns 0 on success, -EIO if the volume cannot be resolved, or the error
+ * from opening the block device.
+ */
+static int
+bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct block_device *bdev;
+	dev_t devt;
+
+	devt = bl_resolve_deviceid(server, &volumes[idx], gfp_mask);
+	if (!devt)
+		return -EIO;
+
+	bdev = blkdev_get_by_dev(devt, FMODE_READ | FMODE_WRITE, NULL);
+	if (IS_ERR(bdev)) {
+		printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
+			MAJOR(devt), MINOR(devt), PTR_ERR(bdev));
+		return PTR_ERR(bdev);
+	}
+
+	d->bdev = bdev;
+	d->len = i_size_read(bdev->bd_inode);
+	d->map = bl_map_simple;
+
+	printk(KERN_INFO "pNFS: using block device %s\n",
+		bdev->bd_disk->disk_name);
+	return 0;
+}
+
+/*
+ * Check that the SCSI designator of @v is one this driver can use.
+ *
+ * Accepted are binary-coded EUI-64 designators of 8, 10 or 16 bytes and
+ * binary-coded NAA designators of 8 or 16 bytes. T10 and NAME designators
+ * are reported as unsupported, any other type as invalid.
+ */
+static bool
+bl_validate_designator(struct pnfs_block_volume *v)
+{
+	const int len = v->scsi.designator_len;
+
+	switch (v->scsi.designator_type) {
+	case PS_DESIGNATOR_EUI64:
+		return v->scsi.code_set == PS_CODE_SET_BINARY &&
+			(len == 8 || len == 10 || len == 16);
+	case PS_DESIGNATOR_NAA:
+		return v->scsi.code_set == PS_CODE_SET_BINARY &&
+			(len == 8 || len == 16);
+	case PS_DESIGNATOR_T10:
+	case PS_DESIGNATOR_NAME:
+		pr_err("pNFS: unsupported designator "
+			"(code set %d, type %d, len %d.\n",
+			v->scsi.code_set,
+			v->scsi.designator_type,
+			v->scsi.designator_len);
+		return false;
+	default:
+		pr_err("pNFS: invalid designator "
+			"(code set %d, type %d, len %d.\n",
+			v->scsi.code_set,
+			v->scsi.designator_type,
+			v->scsi.designator_len);
+		return false;
+	}
+}
+
+/*
+ * Open the block device through the udev by-id link for the WWN, i.e.
+ * /dev/disk/by-id/wwn-0x<designator in hex>. At least on Debian the udev
+ * by-id path will always point to the dm-multipath device if one exists.
+ *
+ * Returns the opened device or an ERR_PTR; a failed open is logged.
+ */
+static struct block_device *
+bl_open_udev_path(struct pnfs_block_volume *v)
+{
+	const char *path;
+	struct block_device *result;
+
+	path = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN",
+			v->scsi.designator_len, v->scsi.designator);
+	if (!path)
+		return ERR_PTR(-ENOMEM);
+
+	result = blkdev_get_by_path(path, FMODE_READ | FMODE_WRITE, NULL);
+	if (IS_ERR(result))
+		pr_warn("pNFS: failed to open device %s (%ld)\n",
+			path, PTR_ERR(result));
+
+	kfree(path);
+	return result;
+}
+
+/*
+ * Open the block device through the RH/Fedora specific dm-mpath udev link
+ * for this WWN (/dev/disk/by-id/dm-uuid-mpath-<type><designator in hex>),
+ * as the wwn- links will only point to the first discovered SCSI device
+ * there.
+ *
+ * Returns the opened device or an ERR_PTR; failures are not logged here.
+ */
+static struct block_device *
+bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v)
+{
+	const char *path;
+	struct block_device *result;
+
+	path = kasprintf(GFP_KERNEL,
+			"/dev/disk/by-id/dm-uuid-mpath-%d%*phN",
+			v->scsi.designator_type,
+			v->scsi.designator_len, v->scsi.designator);
+	if (!path)
+		return ERR_PTR(-ENOMEM);
+
+	result = blkdev_get_by_path(path, FMODE_READ | FMODE_WRITE, NULL);
+	kfree(path);
+	return result;
+}
+
+/*
+ * Set up @d as a leaf backed by the SCSI volume volumes[@idx]: validate the
+ * designator, open the matching block device (dm-mpath link first, plain
+ * wwn link as fallback) and register the volume's persistent reservation
+ * key on it.
+ *
+ * Returns 0 on success, -EINVAL if the designator is not usable or the
+ * device has no reservation support, or the error from opening the device
+ * or registering the key. On failure @d holds no block device reference.
+ */
+static int
+bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	struct block_device *bdev;
+	const struct pr_ops *ops;
+	int error;
+
+	if (!bl_validate_designator(v))
+		return -EINVAL;
+
+	bdev = bl_open_dm_mpath_udev_path(v);
+	if (IS_ERR(bdev))
+		bdev = bl_open_udev_path(v);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
+	d->bdev = bdev;
+
+	d->len = i_size_read(d->bdev->bd_inode);
+	d->map = bl_map_simple;
+	d->pr_key = v->scsi.pr_key;
+
+	pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
+		d->bdev->bd_disk->disk_name, d->pr_key);
+
+	ops = d->bdev->bd_disk->fops->pr_ops;
+	if (!ops) {
+		pr_err("pNFS: block device %s does not support reservations.",
+			d->bdev->bd_disk->disk_name);
+		error = -EINVAL;
+		goto out_blkdev_put;
+	}
+
+	error = ops->pr_register(d->bdev, 0, d->pr_key, true);
+	if (error) {
+		pr_err("pNFS: failed to register key for block device %s.",
+			d->bdev->bd_disk->disk_name);
+		goto out_blkdev_put;
+	}
+
+	d->pr_registered = true;
+	return 0;
+
+out_blkdev_put:
+	blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE);
+	/*
+	 * Forget the device we just released. The node outlives this
+	 * failure, and bl_free_device() puts any non-NULL bdev again when
+	 * the node is finally torn down.
+	 */
+	d->bdev = NULL;
+	return error;
+}
+
+/*
+ * Set up @d from the slice volume volumes[@idx]: first parse the volume the
+ * slice refers to into @d itself, then narrow @d to the slice by
+ * overriding its disk offset and length.
+ *
+ * Returns 0 on success or the error from parsing the underlying volume.
+ */
+static int
+bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *slice = &volumes[idx];
+	int err;
+
+	err = bl_parse_deviceid(server, d, volumes, slice->slice.volume,
+				gfp_mask);
+	if (!err) {
+		d->disk_offset = slice->slice.start;
+		d->len = slice->slice.len;
+	}
+
+	return err;
+}
+
+/*
+ * Set up @d from the concat volume volumes[@idx]: allocate one child node
+ * per referenced volume, parse each of them, and lay them out back to back
+ * (each child's start is advanced by the total length of its predecessors).
+ *
+ * Returns 0 on success, -ENOMEM if the child array cannot be allocated, or
+ * the error from parsing a child. On a child error the function returns
+ * immediately: the array and the children parsed so far stay attached to
+ * @d, with nr_children counting only the successfully parsed ones.
+ */
+static int
+bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+ struct pnfs_block_volume *v = &volumes[idx];
+ u64 len = 0;
+ int ret, i;
+
+ d->children = kcalloc(v->concat.volumes_count,
+ sizeof(struct pnfs_block_dev), gfp_mask);
+ if (!d->children)
+ return -ENOMEM;
+
+ for (i = 0; i < v->concat.volumes_count; i++) {
+ ret = bl_parse_deviceid(server, &d->children[i],
+ volumes, v->concat.volumes[i], gfp_mask);
+ if (ret)
+ return ret;
+
+ d->nr_children++;
+ /* len is the combined length of all earlier children */
+ d->children[i].start += len;
+ len += d->children[i].len;
+ }
+
+ d->len = len;
+ d->map = bl_map_concat;
+ return 0;
+}
+
+/*
+ * Set up @d from the stripe volume volumes[@idx]: allocate one child node
+ * per referenced volume, parse each of them, and record the chunk size.
+ * The stripe's length is the sum of its children's lengths.
+ *
+ * Returns 0 on success, -EIO if the stripe geometry cannot be mapped,
+ * -ENOMEM if the child array cannot be allocated, or the error from
+ * parsing a child. On a child error the array and the children parsed so
+ * far stay attached to @d, with nr_children counting only the successfully
+ * parsed ones.
+ */
+static int
+bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	u64 len = 0;
+	int ret, i;
+
+	/*
+	 * bl_map_stripe() divides by the chunk size and by the number of
+	 * children, and its div_u64() calls take a 32-bit divisor. Refuse a
+	 * geometry that would make either divisor zero or would not survive
+	 * the narrowing, instead of faulting later in the I/O path.
+	 */
+	if (!v->stripe.volumes_count || !v->stripe.chunk_size ||
+	    (u64)(u32)v->stripe.chunk_size != v->stripe.chunk_size) {
+		dprintk("%s: unusable stripe geometry\n", __func__);
+		return -EIO;
+	}
+
+	d->children = kcalloc(v->stripe.volumes_count,
+			sizeof(struct pnfs_block_dev), gfp_mask);
+	if (!d->children)
+		return -ENOMEM;
+
+	for (i = 0; i < v->stripe.volumes_count; i++) {
+		ret = bl_parse_deviceid(server, &d->children[i],
+				volumes, v->stripe.volumes[i], gfp_mask);
+		if (ret)
+			return ret;
+
+		d->nr_children++;
+		len += d->children[i].len;
+	}
+
+	d->len = len;
+	d->chunk_size = v->stripe.chunk_size;
+	d->map = bl_map_stripe;
+	return 0;
+}
+
+/*
+ * Set up @d from volumes[@idx] by dispatching on the volume's type to the
+ * matching parser. The slice, concat and stripe parsers call back into this
+ * function for the volumes they reference.
+ *
+ * Returns the chosen parser's result, or -EIO for an unsupported type.
+ */
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	int err;
+
+	switch (volumes[idx].type) {
+	case PNFS_BLOCK_VOLUME_SIMPLE:
+		err = bl_parse_simple(server, d, volumes, idx, gfp_mask);
+		break;
+	case PNFS_BLOCK_VOLUME_SLICE:
+		err = bl_parse_slice(server, d, volumes, idx, gfp_mask);
+		break;
+	case PNFS_BLOCK_VOLUME_CONCAT:
+		err = bl_parse_concat(server, d, volumes, idx, gfp_mask);
+		break;
+	case PNFS_BLOCK_VOLUME_STRIPE:
+		err = bl_parse_stripe(server, d, volumes, idx, gfp_mask);
+		break;
+	case PNFS_BLOCK_VOLUME_SCSI:
+		err = bl_parse_scsi(server, d, volumes, idx, gfp_mask);
+		break;
+	default:
+		dprintk("unsupported volume type: %d\n", volumes[idx].type);
+		err = -EIO;
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * alloc_deviceid_node hook: decode the device description in @pdev and
+ * build the matching device tree.
+ *
+ * The description is a volume count followed by that many volume entries;
+ * the last entry is the top of the topology and the starting point for
+ * bl_parse_deviceid().
+ *
+ * Returns NULL if an allocation fails or the description cannot be decoded.
+ * Otherwise returns the initialised node of the top-level device; if
+ * building the topology failed, that node is marked unavailable rather
+ * than discarded.
+ */
+struct nfs4_deviceid_node *
+bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+		gfp_t gfp_mask)
+{
+	struct nfs4_deviceid_node *node = NULL;
+	struct pnfs_block_volume *volumes;
+	struct pnfs_block_dev *top;
+	struct xdr_stream xdr;
+	struct xdr_buf buf;
+	struct page *scratch;
+	int nr_volumes, ret, i;
+	__be32 *p;
+
+	scratch = alloc_page(gfp_mask);
+	if (!scratch)
+		goto out;
+
+	xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
+	xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
+
+	p = xdr_inline_decode(&xdr, sizeof(__be32));
+	if (!p)
+		goto out_free_scratch;
+	nr_volumes = be32_to_cpup(p++);
+	/*
+	 * The topology is rooted at volumes[nr_volumes - 1], so an empty (or,
+	 * the count being a signed int, negative) list cannot be used and
+	 * must not reach the array index below.
+	 */
+	if (nr_volumes <= 0)
+		goto out_free_scratch;
+
+	volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
+			  gfp_mask);
+	if (!volumes)
+		goto out_free_scratch;
+
+	for (i = 0; i < nr_volumes; i++) {
+		ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
+		if (ret < 0)
+			goto out_free_volumes;
+	}
+
+	top = kzalloc(sizeof(*top), gfp_mask);
+	if (!top)
+		goto out_free_volumes;
+
+	ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
+
+	/* A parse failure still yields a node, flagged as unavailable. */
+	node = &top->node;
+	nfs4_init_deviceid_node(node, server, &pdev->dev_id);
+	if (ret)
+		nfs4_mark_deviceid_unavailable(node);
+
+out_free_volumes:
+	kfree(volumes);
+out_free_scratch:
+	__free_page(scratch);
+out:
+	return node;
+}
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
new file mode 100644
index 000000000..8f7cff7a4
--- /dev/null
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -0,0 +1,647 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014-2016 Christoph Hellwig.
+ */
+
+#include <linux/vmalloc.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+/*
+ * Convert an rb_node that is embedded in a struct pnfs_block_extent (as its
+ * be_node member) back into a pointer to the containing extent.
+ */
+static inline struct pnfs_block_extent *
+ext_node(struct rb_node *node)
+{
+ return rb_entry(node, struct pnfs_block_extent, be_node);
+}
+
+/*
+ * Return the first extent of @root in tree order, or NULL if the tree is
+ * empty.
+ */
+static struct pnfs_block_extent *
+ext_tree_first(struct rb_root *root)
+{
+	struct rb_node *first = rb_first(root);
+
+	if (!first)
+		return NULL;
+	return ext_node(first);
+}
+
+/*
+ * Return the extent that precedes @be in tree order, or NULL if @be is the
+ * first one.
+ */
+static struct pnfs_block_extent *
+ext_tree_prev(struct pnfs_block_extent *be)
+{
+	struct rb_node *prev = rb_prev(&be->be_node);
+
+	if (!prev)
+		return NULL;
+	return ext_node(prev);
+}
+
+/*
+ * Return the extent that follows @be in tree order, or NULL if @be is the
+ * last one.
+ */
+static struct pnfs_block_extent *
+ext_tree_next(struct pnfs_block_extent *be)
+{
+	struct rb_node *next = rb_next(&be->be_node);
+
+	if (!next)
+		return NULL;
+	return ext_node(next);
+}
+
+/*
+ * Return the first file sector past the end of @be, i.e. the exclusive end
+ * of the range [be_f_offset, be_f_offset + be_length).
+ */
+static inline sector_t
+ext_f_end(struct pnfs_block_extent *be)
+{
+	sector_t end = be->be_f_offset;
+
+	end += be->be_length;
+	return end;
+}
+/*
+ * Find the extent in @root that contains file sector @start.
+ *
+ * If no extent contains @start, the nearest extent that begins after @start
+ * is returned instead: the descent stops at a leaf position, and the last
+ * node visited is either that following extent itself or the extent just
+ * before it.  Returns NULL if the tree is empty or nothing lies at or after
+ * @start.
+ */
+static struct pnfs_block_extent *
+__ext_tree_search(struct rb_root *root, sector_t start)
+{
+ struct rb_node *node = root->rb_node;
+ struct pnfs_block_extent *be = NULL;
+
+ while (node) {
+ be = ext_node(node);
+ if (start < be->be_f_offset)
+ node = node->rb_left;
+ else if (start >= ext_f_end(be))
+ node = node->rb_right;
+ else
+ return be;
+ }
+
+ if (be) {
+ if (start < be->be_f_offset)
+ return be;
+
+ if (start >= ext_f_end(be))
+ return ext_tree_next(be);
+ }
+
+ return NULL;
+}
+
+static bool
+ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
+{
+	const sector_t len = be1->be_length;
+
+	/* Both extents must be of the same kind and live on the same device. */
+	if (be1->be_state != be2->be_state ||
+	    be1->be_device != be2->be_device)
+		return false;
+
+	/* @be2 has to start exactly where @be1 ends in the file... */
+	if (be1->be_f_offset + len != be2->be_f_offset)
+		return false;
+
+	/* ...and, unless these are holes, on the volume as well. */
+	if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
+	    be1->be_v_offset + len != be2->be_v_offset)
+		return false;
+
+	/* Invalid extents additionally need matching tags. */
+	return be1->be_state != PNFS_BLOCK_INVALID_DATA ||
+	       be1->be_tag == be2->be_tag;
+}
+/*
+ * Try to fold @be into the extent that precedes it in @root.
+ *
+ * If the two can be merged, the predecessor is grown by @be's length, @be is
+ * unlinked from the tree, its device reference is dropped and it is freed;
+ * the predecessor is returned.  Otherwise @be is returned unchanged.
+ */
+static struct pnfs_block_extent *
+ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
+{
+ struct pnfs_block_extent *left = ext_tree_prev(be);
+
+ if (left && ext_can_merge(left, be)) {
+ left->be_length += be->be_length;
+ rb_erase(&be->be_node, root);
+ nfs4_put_deviceid_node(be->be_device);
+ kfree(be);
+ return left;
+ }
+
+ return be;
+}
+/*
+ * Try to absorb the extent that follows @be in @root into @be.
+ *
+ * If the two can be merged, @be is grown by the successor's length and the
+ * successor is unlinked, has its device reference dropped and is freed.
+ * @be is returned in either case.
+ */
+static struct pnfs_block_extent *
+ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
+{
+ struct pnfs_block_extent *right = ext_tree_next(be);
+
+ if (right && ext_can_merge(be, right)) {
+ be->be_length += right->be_length;
+ rb_erase(&right->be_node, root);
+ nfs4_put_deviceid_node(right->be_device);
+ kfree(right);
+ }
+
+ return be;
+}
+/*
+ * Release every extent queued on @head through its be_list member: drop the
+ * extent's device reference and free the extent itself.  The callers in this
+ * file invoke it after bl_ext_lock has been released.
+ */
+static void __ext_put_deviceids(struct list_head *head)
+{
+ struct pnfs_block_extent *be, *tmp;
+
+ list_for_each_entry_safe(be, tmp, head, be_list) {
+ nfs4_put_deviceid_node(be->be_device);
+ kfree(be);
+ }
+}
+/*
+ * Link @new into @root, keyed by file offset.
+ *
+ * With @merge_ok set, each node visited on the way down is checked: if @new
+ * is directly mergeable with it, that node is extended to cover @new (and
+ * then merged further with its own neighbour where possible), and @new is
+ * released instead of being linked (device reference dropped, memory freed).
+ *
+ * @new must not start inside an existing extent; that case hits BUG().
+ * All callers in this file hold bl_ext_lock.
+ */
+static void
+__ext_tree_insert(struct rb_root *root,
+ struct pnfs_block_extent *new, bool merge_ok)
+{
+ struct rb_node **p = &root->rb_node, *parent = NULL;
+ struct pnfs_block_extent *be;
+
+ while (*p) {
+ parent = *p;
+ be = ext_node(parent);
+
+ if (new->be_f_offset < be->be_f_offset) {
+ if (merge_ok && ext_can_merge(new, be)) {
+ be->be_f_offset = new->be_f_offset;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA)
+ be->be_v_offset = new->be_v_offset;
+ be->be_length += new->be_length;
+ be = ext_try_to_merge_left(root, be);
+ goto free_new;
+ }
+ p = &(*p)->rb_left;
+ } else if (new->be_f_offset >= ext_f_end(be)) {
+ if (merge_ok && ext_can_merge(be, new)) {
+ be->be_length += new->be_length;
+ be = ext_try_to_merge_right(root, be);
+ goto free_new;
+ }
+ p = &(*p)->rb_right;
+ } else {
+ BUG();
+ }
+ }
+
+ rb_link_node(&new->be_node, parent, p);
+ rb_insert_color(&new->be_node, root);
+ return;
+free_new:
+ nfs4_put_deviceid_node(new->be_device);
+ kfree(new);
+}
+/*
+ * Remove the file sector range [@start, @end) from @root.
+ *
+ * len1 is the part of the first affected extent that lies before @start and
+ * has to be kept; len2 is the part of that same extent that lies past @end.
+ *
+ *  - len1 and len2 both non-zero: the range punches a hole into a single
+ *    extent.  The extent is cut down to len1 and a newly allocated extent
+ *    (taking its own device reference) is inserted for the tail.
+ *  - only len2 non-zero: the front of the extent is trimmed off.
+ *  - otherwise: the first extent is cut down to len1 if needed, every
+ *    following extent that ends at or before @end is unlinked and queued on
+ *    @tmp for the caller to release, and an extent straddling @end has its
+ *    front trimmed.
+ *
+ * Volume offsets are adjusted along with file offsets except for
+ * PNFS_BLOCK_NONE_DATA extents.
+ *
+ * Returns 0 on success (including when nothing overlaps) or -ENOMEM if the
+ * tail extent could not be allocated.
+ */
+static int
+__ext_tree_remove(struct rb_root *root,
+ sector_t start, sector_t end, struct list_head *tmp)
+{
+ struct pnfs_block_extent *be;
+ sector_t len1 = 0, len2 = 0;
+ sector_t orig_v_offset;
+ sector_t orig_len;
+
+ be = __ext_tree_search(root, start);
+ if (!be)
+ return 0;
+ if (be->be_f_offset >= end)
+ return 0;
+
+ orig_v_offset = be->be_v_offset;
+ orig_len = be->be_length;
+
+ if (start > be->be_f_offset)
+ len1 = start - be->be_f_offset;
+ if (ext_f_end(be) > end)
+ len2 = ext_f_end(be) - end;
+
+ if (len2 > 0) {
+ if (len1 > 0) {
+ struct pnfs_block_extent *new;
+
+ new = kzalloc(sizeof(*new), GFP_ATOMIC);
+ if (!new)
+ return -ENOMEM;
+
+ be->be_length = len1;
+
+ new->be_f_offset = end;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+ new->be_v_offset =
+ orig_v_offset + orig_len - len2;
+ }
+ new->be_length = len2;
+ new->be_state = be->be_state;
+ new->be_tag = be->be_tag;
+ new->be_device = nfs4_get_deviceid(be->be_device);
+
+ __ext_tree_insert(root, new, true);
+ } else {
+ be->be_f_offset = end;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA) {
+ be->be_v_offset =
+ orig_v_offset + orig_len - len2;
+ }
+ be->be_length = len2;
+ }
+ } else {
+ if (len1 > 0) {
+ be->be_length = len1;
+ be = ext_tree_next(be);
+ }
+
+ while (be && ext_f_end(be) <= end) {
+ struct pnfs_block_extent *next = ext_tree_next(be);
+
+ rb_erase(&be->be_node, root);
+ list_add_tail(&be->be_list, tmp);
+ be = next;
+ }
+
+ if (be && be->be_f_offset < end) {
+ len1 = ext_f_end(be) - end;
+ be->be_f_offset = end;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA)
+ be->be_v_offset += be->be_length - len1;
+ be->be_length = len1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Insert @new into the extent tree that matches its state: read-write and
+ * invalid extents go into bl_ext_rw, read-only extents and holes into
+ * bl_ext_ro.
+ *
+ * Ranges of @new that are already covered by existing extents are dropped;
+ * only the uncovered pieces are inserted (and merged with their neighbours
+ * where possible).  On success @new has either been linked into the tree or
+ * been freed.
+ *
+ * Returns 0 on success, -EINVAL for an unknown extent state, or -ENOMEM if
+ * @new had to be split and the allocation failed.  In the error cases @new
+ * is neither inserted nor freed here.
+ */
+int
+ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
+{
+	struct pnfs_block_extent *be;
+	struct rb_root *root;
+	int err = 0;
+
+	switch (new->be_state) {
+	case PNFS_BLOCK_READWRITE_DATA:
+	case PNFS_BLOCK_INVALID_DATA:
+		root = &bl->bl_ext_rw;
+		break;
+	case PNFS_BLOCK_READ_DATA:
+	case PNFS_BLOCK_NONE_DATA:
+		root = &bl->bl_ext_ro;
+		break;
+	default:
+		dprintk("invalid extent type\n");
+		return -EINVAL;
+	}
+
+	spin_lock(&bl->bl_ext_lock);
+retry:
+	be = __ext_tree_search(root, new->be_f_offset);
+	if (!be || be->be_f_offset >= ext_f_end(new)) {
+		/* No overlap at all. */
+		__ext_tree_insert(root, new, true);
+	} else if (new->be_f_offset >= be->be_f_offset) {
+		if (ext_f_end(new) <= ext_f_end(be)) {
+			/* @new is fully covered by an existing extent. */
+			nfs4_put_deviceid_node(new->be_device);
+			kfree(new);
+		} else {
+			/* Drop the covered front of @new and look again. */
+			sector_t new_len = ext_f_end(new) - ext_f_end(be);
+			sector_t diff = new->be_length - new_len;
+
+			new->be_f_offset += diff;
+			new->be_v_offset += diff;
+			new->be_length = new_len;
+			goto retry;
+		}
+	} else if (ext_f_end(new) <= ext_f_end(be)) {
+		/* Only the front of @new is uncovered. */
+		new->be_length = be->be_f_offset - new->be_f_offset;
+		__ext_tree_insert(root, new, true);
+	} else {
+		/*
+		 * @new spans @be: insert a copy for the part in front of @be,
+		 * then retry with the part behind it.
+		 */
+		struct pnfs_block_extent *split;
+		sector_t new_len = ext_f_end(new) - ext_f_end(be);
+		sector_t diff = new->be_length - new_len;
+
+		split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
+		if (!split) {
+			/* Allocation failure, same as the other helpers here. */
+			err = -ENOMEM;
+			goto out;
+		}
+
+		split->be_length = be->be_f_offset - split->be_f_offset;
+		split->be_device = nfs4_get_deviceid(new->be_device);
+		__ext_tree_insert(root, split, true);
+
+		new->be_f_offset += diff;
+		new->be_v_offset += diff;
+		new->be_length = new_len;
+		goto retry;
+	}
+out:
+	spin_unlock(&bl->bl_ext_lock);
+	return err;
+}
+
+static bool
+__ext_tree_lookup(struct rb_root *root, sector_t isect,
+		struct pnfs_block_extent *ret)
+{
+	struct rb_node *cur = root->rb_node;
+
+	/* Plain binary search; a hit is copied out by value. */
+	while (cur) {
+		struct pnfs_block_extent *ext = ext_node(cur);
+
+		if (isect < ext->be_f_offset) {
+			cur = cur->rb_left;
+			continue;
+		}
+		if (isect >= ext_f_end(ext)) {
+			cur = cur->rb_right;
+			continue;
+		}
+
+		*ret = *ext;
+		return true;
+	}
+
+	return false;
+}
+/*
+ * Look up the extent that covers file sector @isect and copy it into *@ret.
+ *
+ * For a read lookup (@rw false) the read-only tree is tried first and the
+ * read-write tree second; for a write lookup only the read-write tree is
+ * searched.  The search runs under bl_ext_lock.
+ *
+ * Returns true if an extent was found, false otherwise (*@ret is then left
+ * untouched).
+ */
+bool
+ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
+ struct pnfs_block_extent *ret, bool rw)
+{
+ bool found = false;
+
+ spin_lock(&bl->bl_ext_lock);
+ if (!rw)
+ found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
+ if (!found)
+ found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
+ spin_unlock(&bl->bl_ext_lock);
+
+ return found;
+}
+/*
+ * Remove the sector range [@start, @end) from the read-only extent tree and,
+ * if @rw is set, from the read-write tree as well.
+ *
+ * Extents that were removed completely are collected on a local list and
+ * released only after bl_ext_lock has been dropped.
+ *
+ * Returns 0 on success, otherwise the first error reported by
+ * __ext_tree_remove().
+ */
+int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
+ sector_t start, sector_t end)
+{
+ int err, err2;
+ LIST_HEAD(tmp);
+
+ spin_lock(&bl->bl_ext_lock);
+ err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
+ if (rw) {
+ err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
+ if (!err)
+ err = err2;
+ }
+ spin_unlock(&bl->bl_ext_lock);
+
+ __ext_put_deviceids(&tmp);
+ return err;
+}
+/*
+ * Split @be in two at file sector @split.
+ *
+ * @be keeps [be_f_offset, @split); a newly allocated extent receives the
+ * remainder with the same state, tag and device (taking its own device
+ * reference) and is inserted into @root with merging disabled, so the two
+ * halves stay separate.
+ *
+ * Returns 0 on success or -ENOMEM if the new extent cannot be allocated, in
+ * which case @be is left unmodified.
+ */
+static int
+ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
+ sector_t split)
+{
+ struct pnfs_block_extent *new;
+ sector_t orig_len = be->be_length;
+
+ new = kzalloc(sizeof(*new), GFP_ATOMIC);
+ if (!new)
+ return -ENOMEM;
+
+ be->be_length = split - be->be_f_offset;
+
+ new->be_f_offset = split;
+ if (be->be_state != PNFS_BLOCK_NONE_DATA)
+ new->be_v_offset = be->be_v_offset + be->be_length;
+ new->be_length = orig_len - be->be_length;
+ new->be_state = be->be_state;
+ new->be_tag = be->be_tag;
+ new->be_device = nfs4_get_deviceid(be->be_device);
+
+ __ext_tree_insert(root, new, false);
+ return 0;
+}
+/*
+ * Record that the sector range [@start, @start + @len) has been written.
+ *
+ * Everything overlapping the range is first removed from the read-only
+ * tree.  Then each untagged PNFS_BLOCK_INVALID_DATA extent of the read-write
+ * tree that overlaps the range is cut down to the range (by shifting the
+ * boundary into a mergeable neighbour or by splitting) and tagged
+ * EXTENT_WRITTEN.
+ *
+ * bl_lwb is raised to @lwb if that is larger; this happens on the error
+ * paths too, since they all leave through the out label.
+ *
+ * Returns 0 on success or the error from __ext_tree_remove() or
+ * ext_tree_split().
+ */
+int
+ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
+ sector_t len, u64 lwb)
+{
+ struct rb_root *root = &bl->bl_ext_rw;
+ sector_t end = start + len;
+ struct pnfs_block_extent *be;
+ int err = 0;
+ LIST_HEAD(tmp);
+
+ spin_lock(&bl->bl_ext_lock);
+ /*
+ * First remove all COW extents or holes from written to range.
+ */
+ err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
+ if (err)
+ goto out;
+
+ /*
+ * Then mark all invalid extents in the range as written to.
+ */
+ for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
+ if (be->be_f_offset >= end)
+ break;
+
+ if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
+ continue;
+
+ if (be->be_f_offset < start) {
+ struct pnfs_block_extent *left = ext_tree_prev(be);
+
+ if (left && ext_can_merge(left, be)) {
+ sector_t diff = start - be->be_f_offset;
+
+ left->be_length += diff;
+
+ be->be_f_offset += diff;
+ be->be_v_offset += diff;
+ be->be_length -= diff;
+ } else {
+ err = ext_tree_split(root, be, start);
+ if (err)
+ goto out;
+ }
+ }
+
+ if (ext_f_end(be) > end) {
+ struct pnfs_block_extent *right = ext_tree_next(be);
+
+ if (right && ext_can_merge(be, right)) {
+ sector_t diff = end - be->be_f_offset;
+
+ be->be_length -= diff;
+
+ right->be_f_offset -= diff;
+ right->be_v_offset -= diff;
+ right->be_length += diff;
+ } else {
+ err = ext_tree_split(root, be, end);
+ if (err)
+ goto out;
+ }
+ }
+
+ if (be->be_f_offset >= start && ext_f_end(be) <= end) {
+ be->be_tag = EXTENT_WRITTEN;
+ be = ext_try_to_merge_left(root, be);
+ be = ext_try_to_merge_right(root, be);
+ }
+ }
+out:
+ if (bl->bl_lwb < lwb)
+ bl->bl_lwb = lwb;
+ spin_unlock(&bl->bl_ext_lock);
+
+ __ext_put_deviceids(&tmp);
+ return err;
+}
+
+/*
+ * Size in bytes of a layoutupdate blob carrying @count entries: a 32-bit
+ * entry count followed by the entries, whose size depends on whether @bl is
+ * a SCSI layout or a block layout.
+ */
+static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
+{
+	size_t entry_size;
+
+	if (bl->bl_scsi_layout)
+		entry_size = PNFS_SCSI_RANGE_SIZE;
+	else
+		entry_size = PNFS_BLOCK_EXTENT_SIZE;
+
+	return sizeof(__be32) + entry_size * count;
+}
+/*
+ * Release the layoutupdate buffer attached to @arg.
+ *
+ * If layoutupdate_pages still points at the single embedded page, only that
+ * page is put.  Otherwise a reference is dropped on each of the
+ * DIV_ROUND_UP(@buffer_size, PAGE_SIZE) entries of the page array, the
+ * buffer at arg->start_p is vfree()d and the array itself is kfree()d; this
+ * branch therefore relies on both the array entries and arg->start_p having
+ * been filled in.
+ */
+static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
+ size_t buffer_size)
+{
+ if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
+ int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
+
+ for (i = 0; i < nr_pages; i++)
+ put_page(arg->layoutupdate_pages[i]);
+ vfree(arg->start_p);
+ kfree(arg->layoutupdate_pages);
+ } else {
+ put_page(arg->layoutupdate_page);
+ }
+}
+/*
+ * XDR-encode @be as one block layout extent at @p: the device id, the file
+ * offset and the length converted from sectors to bytes, a zero 64-bit
+ * field (presumably the storage offset -- confirm against the protocol
+ * definition) and the state, which is always written as
+ * PNFS_BLOCK_READWRITE_DATA.
+ *
+ * Returns the position just past the encoded extent.
+ */
+static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
+{
+ p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
+ NFS4_DEVICEID4_SIZE);
+ p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+ p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+ p = xdr_encode_hyper(p, 0LL);
+ *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
+ return p;
+}
+
+/*
+ * XDR-encode @be as one SCSI layout range at @p: file offset followed by
+ * length, both converted from sectors to bytes.  Returns the position just
+ * past the encoded range.
+ */
+static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
+{
+	__be32 *pos = p;
+
+	pos = xdr_encode_hyper(pos, be->be_f_offset << SECTOR_SHIFT);
+	pos = xdr_encode_hyper(pos, be->be_length << SECTOR_SHIFT);
+	return pos;
+}
+
+/*
+ * Encode every written-but-uncommitted extent of @bl into the buffer at @p.
+ *
+ * An extent qualifies if it is PNFS_BLOCK_INVALID_DATA and tagged
+ * EXTENT_WRITTEN.  *@count is incremented once per qualifying extent, so on
+ * return it tells the caller how many entries there are, whether or not
+ * they fit.
+ *
+ * If the entries do not fit into @buffer_size bytes, -ENOSPC is returned
+ * and nothing is modified: no extent is re-tagged and bl_lwb is kept.  The
+ * caller can therefore retry with a larger buffer and will see the same set
+ * of extents again.  Tagging extents EXTENT_COMMITTING before knowing that
+ * all of them fit would hide them from the retry, and resetting bl_lwb
+ * would make the retry report a bogus last byte.
+ *
+ * On success each encoded extent is tagged EXTENT_COMMITTING, *@lastbyte is
+ * set to bl_lwb - 1, bl_lwb is reset to 0 and 0 is returned.
+ */
+static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
+		size_t buffer_size, size_t *count, __u64 *lastbyte)
+{
+	struct pnfs_block_extent *be;
+	int ret = 0;
+
+	spin_lock(&bl->bl_ext_lock);
+
+	/* Pass 1: count only, so a too-small buffer leaves no side effects. */
+	for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
+		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+		    be->be_tag != EXTENT_WRITTEN)
+			continue;
+
+		(*count)++;
+	}
+
+	if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
+		ret = -ENOSPC;
+		goto out_unlock;
+	}
+
+	/* Pass 2: everything fits, encode and mark as being committed. */
+	for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
+		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+		    be->be_tag != EXTENT_WRITTEN)
+			continue;
+
+		if (bl->bl_scsi_layout)
+			p = encode_scsi_range(be, p);
+		else
+			p = encode_block_extent(be, p);
+		be->be_tag = EXTENT_COMMITTING;
+	}
+	*lastbyte = bl->bl_lwb - 1;
+	bl->bl_lwb = 0;
+
+out_unlock:
+	spin_unlock(&bl->bl_ext_lock);
+
+	return ret;
+}
+
+/*
+ * Build the layoutupdate payload for a LAYOUTCOMMIT.
+ *
+ * A single page is tried first.  If the extents do not fit, the buffer is
+ * replaced by a vmalloc'ed one sized for the number of extents reported by
+ * ext_tree_encode_commit(), and the encoding is retried; this repeats for
+ * as long as the encoder reports -ENOSPC.
+ *
+ * On success arg->layoutupdate_pages and arg->layoutupdate_len describe the
+ * encoded data (for the vmalloc case arg->start_p is set as well and each
+ * page of the array holds an extra reference), and 0 is returned.
+ * Returns -ENOMEM if a buffer cannot be allocated.
+ */
+int
+ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
+	size_t count = 0, buffer_size = PAGE_SIZE;
+	__be32 *start_p;
+	int ret;
+
+	dprintk("%s enter\n", __func__);
+
+	arg->layoutupdate_page = alloc_page(GFP_NOFS);
+	if (!arg->layoutupdate_page)
+		return -ENOMEM;
+	start_p = page_address(arg->layoutupdate_page);
+	arg->layoutupdate_pages = &arg->layoutupdate_page;
+
+retry:
+	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
+	if (unlikely(ret)) {
+		/*
+		 * Drop the buffer that turned out to be too small.  A
+		 * vmalloc'ed buffer from an earlier retry has not been
+		 * entered into layoutupdate_pages[] or arg->start_p yet, so
+		 * ext_tree_free_commitdata() must not be used on it: it
+		 * would put the still-NULL array entries and miss the
+		 * buffer itself.
+		 */
+		if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
+			vfree(start_p);
+			kfree(arg->layoutupdate_pages);
+		} else {
+			put_page(arg->layoutupdate_page);
+		}
+
+		buffer_size = ext_tree_layoutupdate_size(bl, count);
+		count = 0;
+
+		arg->layoutupdate_pages =
+			kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
+				sizeof(struct page *), GFP_NOFS);
+		if (!arg->layoutupdate_pages)
+			return -ENOMEM;
+
+		start_p = __vmalloc(buffer_size, GFP_NOFS);
+		if (!start_p) {
+			kfree(arg->layoutupdate_pages);
+			return -ENOMEM;
+		}
+
+		goto retry;
+	}
+
+	*start_p = cpu_to_be32(count);
+	arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
+
+	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
+		void *p = start_p, *end = p + arg->layoutupdate_len;
+		struct page *page = NULL;
+		int i = 0;
+
+		/* Publish the vmalloc buffer page by page, pinning each page. */
+		arg->start_p = start_p;
+		for ( ; p < end; p += PAGE_SIZE) {
+			page = vmalloc_to_page(p);
+			arg->layoutupdate_pages[i++] = page;
+			get_page(page);
+		}
+	}
+
+	dprintk("%s found %zu ranges\n", __func__, count);
+	return 0;
+}
+/*
+ * Finish a layoutcommit whose result is @status.
+ *
+ * The layoutupdate buffer is released, then every PNFS_BLOCK_INVALID_DATA
+ * extent in the read-write tree that is tagged EXTENT_COMMITTING is
+ * updated: on failure (@status non-zero) it is tagged EXTENT_WRITTEN again
+ * so that it is picked up by a later commit, on success it becomes a
+ * PNFS_BLOCK_READWRITE_DATA extent with the tag cleared.  In both cases the
+ * extent is then merged with its neighbours where possible.
+ */
+void
+ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
+{
+ struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
+ struct rb_root *root = &bl->bl_ext_rw;
+ struct pnfs_block_extent *be;
+
+ dprintk("%s status %d\n", __func__, status);
+
+ ext_tree_free_commitdata(arg, arg->layoutupdate_len);
+
+ spin_lock(&bl->bl_ext_lock);
+ for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
+ if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+ be->be_tag != EXTENT_COMMITTING)
+ continue;
+
+ if (status) {
+ /*
+ * Mark as written and try again.
+ *
+ * XXX: some real error handling here wouldn't hurt..
+ */
+ be->be_tag = EXTENT_WRITTEN;
+ } else {
+ be->be_state = PNFS_BLOCK_READWRITE_DATA;
+ be->be_tag = 0;
+ }
+
+ be = ext_try_to_merge_left(root, be);
+ be = ext_try_to_merge_right(root, be);
+ }
+ spin_unlock(&bl->bl_ext_lock);
+}
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
new file mode 100644
index 000000000..ef9db135c
--- /dev/null
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2006,2007 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@citi.umich.edu>
+ * Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/module.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+/*
+ * Encode the simple volume @b at @p for the user space daemon: a count of
+ * 1, the volume type, the number of signatures and then, per signature, its
+ * 64-bit offset followed by the signature bytes as a variable-length
+ * opaque.
+ *
+ * No bounds are checked here; the caller has to provide a large enough
+ * buffer.
+ */
+static void
+nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b)
+{
+ int i;
+
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(b->type);
+ *p++ = cpu_to_be32(b->simple.nr_sigs);
+ for (i = 0; i < b->simple.nr_sigs; i++) {
+ p = xdr_encode_hyper(p, b->simple.sigs[i].offset);
+ p = xdr_encode_opaque(p, b->simple.sigs[i].sig,
+ b->simple.sigs[i].sig_len);
+ }
+}
+/*
+ * Ask the user space daemon, through the per-net blocklayout pipe, which
+ * block device corresponds to the simple volume @b.
+ *
+ * The request consists of a struct bl_msg_hdr followed by the encoded
+ * volume.  Requests are serialised by nn->bl_mutex.  After queueing the
+ * upcall the task sleeps uninterruptibly on nn->bl_wq until it is woken by
+ * bl_pipe_downcall() or bl_pipe_destroy_msg(), then reads the answer from
+ * nn->bl_mount_reply.
+ *
+ * Note that b->simple.len is increased by 4 as a side effect, and that
+ * requests larger than PAGE_SIZE are refused.
+ *
+ * Returns the device number on success or 0 on any failure.
+ *
+ * NOTE(review): the task state is only set to TASK_UNINTERRUPTIBLE after
+ * rpc_queue_upcall() has returned -- confirm that a wake-up delivered
+ * before that point cannot be missed.
+ */
+dev_t
+bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
+ gfp_t gfp_mask)
+{
+ struct net *net = server->nfs_client->cl_net;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct bl_dev_msg *reply = &nn->bl_mount_reply;
+ struct bl_pipe_msg bl_pipe_msg;
+ struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
+ struct bl_msg_hdr *bl_msg;
+ DECLARE_WAITQUEUE(wq, current);
+ dev_t dev = 0;
+ int rc;
+
+ dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+
+ mutex_lock(&nn->bl_mutex);
+ bl_pipe_msg.bl_wq = &nn->bl_wq;
+
+ b->simple.len += 4; /* single volume */
+ if (b->simple.len > PAGE_SIZE)
+ goto out_unlock;
+
+ memset(msg, 0, sizeof(*msg));
+ msg->len = sizeof(*bl_msg) + b->simple.len;
+ msg->data = kzalloc(msg->len, gfp_mask);
+ if (!msg->data)
+ goto out_free_data;
+
+ bl_msg = msg->data;
+ bl_msg->type = BL_DEVICE_MOUNT;
+ bl_msg->totallen = b->simple.len;
+ nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
+
+ dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
+ add_wait_queue(&nn->bl_wq, &wq);
+ rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
+ if (rc < 0) {
+ remove_wait_queue(&nn->bl_wq, &wq);
+ goto out_free_data;
+ }
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule();
+ remove_wait_queue(&nn->bl_wq, &wq);
+
+ if (reply->status != BL_DEVICE_REQUEST_PROC) {
+ printk(KERN_WARNING "%s failed to decode device: %d\n",
+ __func__, reply->status);
+ goto out_free_data;
+ }
+
+ dev = MKDEV(reply->major, reply->minor);
+out_free_data:
+ kfree(msg->data);
+out_unlock:
+ mutex_unlock(&nn->bl_mutex);
+ return dev;
+}
+/*
+ * Handle a write by the user space daemon to the blocklayout pipe.
+ *
+ * The message must be exactly one struct bl_dev_msg.  It is copied into the
+ * per-net bl_mount_reply and the waiters on bl_wq are woken.
+ *
+ * Returns @mlen on success, -EINVAL for a message of the wrong size or
+ * -EFAULT if the user buffer cannot be read.
+ */
+static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
+ size_t mlen)
+{
+ struct nfs_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
+ nfs_net_id);
+
+ if (mlen != sizeof (struct bl_dev_msg))
+ return -EINVAL;
+
+ if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
+ return -EFAULT;
+
+ wake_up(&nn->bl_wq);
+
+ return mlen;
+}
+/*
+ * destroy_msg callback of the blocklayout pipe.  Nothing is done for a
+ * message whose errno is not negative; for a failed message the wait queue
+ * recorded in the enclosing bl_pipe_msg is woken.
+ */
+static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+ struct bl_pipe_msg *bl_pipe_msg =
+ container_of(msg, struct bl_pipe_msg, msg);
+
+ if (msg->errno >= 0)
+ return;
+ wake_up(bl_pipe_msg->bl_wq);
+}
+/*
+ * Operations of the blocklayout pipe: the generic upcall handler, plus the
+ * downcall and message destruction handlers defined in this file.
+ */
+static const struct rpc_pipe_ops bl_upcall_ops = {
+ .upcall = rpc_pipe_generic_upcall,
+ .downcall = bl_pipe_downcall,
+ .destroy_msg = bl_pipe_destroy_msg,
+};
+/*
+ * Create the "blocklayout" pipe file for @pipe below the NFS_PIPE_DIRNAME
+ * directory of the rpc_pipefs super block @sb.
+ *
+ * Returns the result of rpc_mkpipe_dentry(), or ERR_PTR(-ENOENT) if the
+ * directory does not exist.
+ */
+static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
+ struct rpc_pipe *pipe)
+{
+ struct dentry *dir, *dentry;
+
+ dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
+ if (dir == NULL)
+ return ERR_PTR(-ENOENT);
+ dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
+ dput(dir);
+ return dentry;
+}
+/*
+ * Remove the pipe file of @pipe, if it currently has one.  @sb is not used.
+ */
+static void nfs4blocklayout_unregister_sb(struct super_block *sb,
+ struct rpc_pipe *pipe)
+{
+ if (pipe->dentry)
+ rpc_unlink(pipe->dentry);
+}
+/*
+ * rpc_pipefs notifier callback; @ptr is the rpc_pipefs super block the
+ * event is about.
+ *
+ * A module reference is held while the event is handled; if it cannot be
+ * taken, or if the net namespace has no blocklayout pipe, the event is
+ * ignored and 0 is returned.  On RPC_PIPEFS_MOUNT the pipe file is created
+ * and remembered in the pipe, on RPC_PIPEFS_UMOUNT it is removed again.
+ *
+ * Returns 0 on success, the error from creating the pipe file, or
+ * -ENOTSUPP for any other event.
+ */
+static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct super_block *sb = ptr;
+ struct net *net = sb->s_fs_info;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct dentry *dentry;
+ int ret = 0;
+
+ if (!try_module_get(THIS_MODULE))
+ return 0;
+
+ if (nn->bl_device_pipe == NULL) {
+ module_put(THIS_MODULE);
+ return 0;
+ }
+
+ switch (event) {
+ case RPC_PIPEFS_MOUNT:
+ dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
+ if (IS_ERR(dentry)) {
+ ret = PTR_ERR(dentry);
+ break;
+ }
+ nn->bl_device_pipe->dentry = dentry;
+ break;
+ case RPC_PIPEFS_UMOUNT:
+ if (nn->bl_device_pipe->dentry)
+ nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
+ break;
+ default:
+ ret = -ENOTSUPP;
+ break;
+ }
+ module_put(THIS_MODULE);
+ return ret;
+}
+/*
+ * Notifier block that routes rpc_pipefs events to rpc_pipefs_event();
+ * registered in bl_init_pipefs() and unregistered in bl_cleanup_pipefs().
+ */
+static struct notifier_block nfs4blocklayout_block = {
+ .notifier_call = rpc_pipefs_event,
+};
+/*
+ * Create the blocklayout pipe file for @pipe in the rpc_pipefs instance of
+ * @net.
+ *
+ * Returns NULL if rpc_get_sb_net() finds no rpc_pipefs super block for
+ * @net, otherwise whatever nfs4blocklayout_register_sb() returns (a dentry
+ * or an ERR_PTR).
+ */
+static struct dentry *nfs4blocklayout_register_net(struct net *net,
+ struct rpc_pipe *pipe)
+{
+ struct super_block *pipefs_sb;
+ struct dentry *dentry;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (!pipefs_sb)
+ return NULL;
+ dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
+ rpc_put_sb_net(net);
+ return dentry;
+}
+/*
+ * Remove the blocklayout pipe file of @pipe from the rpc_pipefs instance of
+ * @net.  Does nothing if rpc_get_sb_net() finds no super block for @net.
+ */
+static void nfs4blocklayout_unregister_net(struct net *net,
+ struct rpc_pipe *pipe)
+{
+ struct super_block *pipefs_sb;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (pipefs_sb) {
+ nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
+ rpc_put_sb_net(net);
+ }
+}
+/*
+ * Per-net init: set up the upcall mutex and wait queue, allocate the
+ * blocklayout pipe and try to create its pipe file.
+ *
+ * The pipe's dentry is left NULL if nfs4blocklayout_register_net() returns
+ * NULL because @net has no rpc_pipefs super block; that is not treated as
+ * an error here.
+ *
+ * Returns 0 on success or a negative errno if the pipe or its pipe file
+ * cannot be created.
+ */
+static int nfs4blocklayout_net_init(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct dentry *dentry;
+
+ mutex_init(&nn->bl_mutex);
+ init_waitqueue_head(&nn->bl_wq);
+ nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
+ if (IS_ERR(nn->bl_device_pipe))
+ return PTR_ERR(nn->bl_device_pipe);
+ dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
+ if (IS_ERR(dentry)) {
+ rpc_destroy_pipe_data(nn->bl_device_pipe);
+ return PTR_ERR(dentry);
+ }
+ nn->bl_device_pipe->dentry = dentry;
+ return 0;
+}
+/*
+ * Per-net exit: remove the pipe file, destroy the pipe and clear
+ * nn->bl_device_pipe, which rpc_pipefs_event() checks for NULL before
+ * handling an event.
+ */
+static void nfs4blocklayout_net_exit(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
+ rpc_destroy_pipe_data(nn->bl_device_pipe);
+ nn->bl_device_pipe = NULL;
+}
+/*
+ * Per-network-namespace setup and teardown of the blocklayout pipe;
+ * registered in bl_init_pipefs().
+ */
+static struct pernet_operations nfs4blocklayout_net_ops = {
+ .init = nfs4blocklayout_net_init,
+ .exit = nfs4blocklayout_net_exit,
+};
+
+/*
+ * Register the rpc_pipefs notifier and the per-net operations of the
+ * blocklayout pipe.  If the second registration fails, the notifier is
+ * unregistered again.  Returns 0 on success or the error of the failing
+ * registration.
+ */
+int __init bl_init_pipefs(void)
+{
+	int err;
+
+	err = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
+	if (err)
+		return err;
+
+	err = register_pernet_subsys(&nfs4blocklayout_net_ops);
+	if (err)
+		rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+
+	return err;
+}
+/*
+ * Undo bl_init_pipefs(): unregister the rpc_pipefs notifier and the per-net
+ * operations.
+ */
+void bl_cleanup_pipefs(void)
+{
+ rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+ unregister_pernet_subsys(&nfs4blocklayout_net_ops);
+}
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
new file mode 100644
index 000000000..ef6729568
--- /dev/null
+++ b/fs/nfs/cache_lib.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/cache_lib.c
+ *
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <net/net_namespace.h>
+
+#include "cache_lib.h"
+/*
+ * Upcall helper configuration.  Both values are exposed as module
+ * parameters with mode 0600: "cache_getent" is the path of the helper
+ * program (an empty path disables the upcall, see nfs_cache_upcall()) and
+ * "cache_getent_timeout" is the time, in seconds, to wait for an answer.
+ */
+#define NFS_CACHE_UPCALL_PATHLEN 256
+#define NFS_CACHE_UPCALL_TIMEOUT 15
+
+static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] =
+ "/sbin/nfs_cache_getent";
+static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT;
+
+module_param_string(cache_getent, nfs_cache_getent_prog,
+ sizeof(nfs_cache_getent_prog), 0600);
+MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program");
+module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600);
+MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which "
+ "the cache upcall is assumed to have failed");
+/*
+ * Run the user space helper to look up @entry_name in cache @cd.
+ *
+ * The helper is invoked as "<cache_getent> <cd->name> <entry_name>" with a
+ * fixed minimal environment, using UMH_WAIT_EXEC.
+ *
+ * Returns 0 on success (a positive result from call_usermodehelper() is
+ * reported as 0) or a negative errno.  -EACCES is returned without running
+ * anything if the helper path is empty; the path is emptied by this
+ * function when the helper fails with -ENOENT or -EACCES.
+ */
+int nfs_cache_upcall(struct cache_detail *cd, char *entry_name)
+{
+ static char *envp[] = { "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ NULL
+ };
+ char *argv[] = {
+ nfs_cache_getent_prog,
+ cd->name,
+ entry_name,
+ NULL
+ };
+ int ret = -EACCES;
+
+ if (nfs_cache_getent_prog[0] == '\0')
+ goto out;
+ ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ /*
+ * Disable the upcall mechanism if we're getting an ENOENT or
+ * EACCES error. The admin can re-enable it on the fly by using
+ * sysfs to set the 'cache_getent' parameter once the problem
+ * has been fixed.
+ */
+ if (ret == -ENOENT || ret == -EACCES)
+ nfs_cache_getent_prog[0] = '\0';
+out:
+ return ret > 0 ? 0 : ret;
+}
+
+/*
+ * Deferred request handling
+ *
+ * nfs_cache_defer_req_put() drops one reference on @dreq and frees it when
+ * that was the last one.
+ */
+void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq)
+{
+ if (refcount_dec_and_test(&dreq->count))
+ kfree(dreq);
+}
+/*
+ * revisit callback installed by nfs_dns_cache_defer(): signal the
+ * completion of the enclosing request and drop the reference that was taken
+ * when the request was deferred.  @toomany is not used.
+ */
+static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany)
+{
+ struct nfs_cache_defer_req *dreq;
+
+ dreq = container_of(d, struct nfs_cache_defer_req, deferred_req);
+
+ complete(&dreq->completion);
+ nfs_cache_defer_req_put(dreq);
+}
+/*
+ * defer callback of the cache request embedded in a struct
+ * nfs_cache_defer_req: install nfs_dns_cache_revisit() as the revisit
+ * handler, take a reference on the enclosing request (dropped again by the
+ * revisit handler) and return its embedded deferred request.
+ */
+static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req)
+{
+ struct nfs_cache_defer_req *dreq;
+
+ dreq = container_of(req, struct nfs_cache_defer_req, req);
+ dreq->deferred_req.revisit = nfs_dns_cache_revisit;
+ refcount_inc(&dreq->count);
+
+ return &dreq->deferred_req;
+}
+
+/*
+ * Allocate a zeroed deferred request with an initialised completion, a
+ * reference count of one and nfs_dns_cache_defer() as its defer callback.
+ * Returns NULL if the allocation fails.
+ */
+struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void)
+{
+	struct nfs_cache_defer_req *req = kzalloc(sizeof(*req), GFP_KERNEL);
+
+	if (!req)
+		return NULL;
+
+	init_completion(&req->completion);
+	refcount_set(&req->count, 1);
+	req->req.defer = nfs_dns_cache_defer;
+	return req;
+}
+
+/*
+ * Wait for @dreq to be completed, for at most cache_getent_timeout seconds.
+ * Returns 0 if it completed in time and -ETIMEDOUT otherwise.
+ */
+int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
+{
+	unsigned long remaining;
+
+	remaining = wait_for_completion_timeout(&dreq->completion,
+			nfs_cache_getent_timeout * HZ);
+	if (remaining == 0)
+		return -ETIMEDOUT;
+	return 0;
+}
+
+/*
+ * Register the pipefs files of cache @cd below the "cache" directory of
+ * the rpc_pipefs super block @sb.
+ *
+ * Returns 0 on success, -ENOENT if the "cache" directory cannot be found,
+ * or the error from sunrpc_cache_register_pipefs().
+ */
+int nfs_cache_register_sb(struct super_block *sb, struct cache_detail *cd)
+{
+	int ret;
+	struct dentry *dir;
+
+	dir = rpc_d_lookup_sb(sb, "cache");
+	/* Do not hand a missing directory on as the parent dentry. */
+	if (dir == NULL)
+		return -ENOENT;
+	ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd);
+	dput(dir);
+	return ret;
+}
+/*
+ * Initialise cache @cd and, if @net currently has an rpc_pipefs super
+ * block, register the cache's pipefs files there.
+ *
+ * If that registration fails the cache detail is destroyed again and the
+ * error is returned.  Returns 0 on success, including when there is no
+ * rpc_pipefs super block and therefore nothing to register.
+ */
+int nfs_cache_register_net(struct net *net, struct cache_detail *cd)
+{
+ struct super_block *pipefs_sb;
+ int ret = 0;
+
+ sunrpc_init_cache_detail(cd);
+ pipefs_sb = rpc_get_sb_net(net);
+ if (pipefs_sb) {
+ ret = nfs_cache_register_sb(pipefs_sb, cd);
+ rpc_put_sb_net(net);
+ if (ret)
+ sunrpc_destroy_cache_detail(cd);
+ }
+ return ret;
+}
+/*
+ * Remove the pipefs files of cache @cd.  @sb is not used.
+ */
+void nfs_cache_unregister_sb(struct super_block *sb, struct cache_detail *cd)
+{
+ sunrpc_cache_unregister_pipefs(cd);
+}
+/*
+ * Counterpart of nfs_cache_register_net(): remove the pipefs files of @cd
+ * if @net currently has an rpc_pipefs super block, then destroy the cache
+ * detail in any case.
+ */
+void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd)
+{
+ struct super_block *pipefs_sb;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (pipefs_sb) {
+ nfs_cache_unregister_sb(pipefs_sb, cd);
+ rpc_put_sb_net(net);
+ }
+ sunrpc_destroy_cache_detail(cd);
+}
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
new file mode 100644
index 000000000..220ee409a
--- /dev/null
+++ b/fs/nfs/cache_lib.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+
+#include <linux/completion.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/atomic.h>
+
+/*
+ * Deferred request handling
+ *
+ * A reference-counted cache request together with the completion that a
+ * waiter blocks on; see nfs_cache_defer_req_alloc(),
+ * nfs_cache_wait_for_upcall() and nfs_cache_defer_req_put().
+ */
+struct nfs_cache_defer_req {
+ struct cache_req req; /* the cache request itself */
+ struct cache_deferred_req deferred_req; /* handed out when deferred */
+ struct completion completion; /* what a waiter blocks on */
+ refcount_t count; /* freed via nfs_cache_defer_req_put() */
+};
+
+extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name);
+extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
+extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
+extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
+
+extern int nfs_cache_register_net(struct net *net, struct cache_detail *cd);
+extern void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd);
+extern int nfs_cache_register_sb(struct super_block *sb,
+ struct cache_detail *cd);
+extern void nfs_cache_unregister_sb(struct super_block *sb,
+ struct cache_detail *cd);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
new file mode 100644
index 000000000..7817ad94a
--- /dev/null
+++ b/fs/nfs/callback.c
@@ -0,0 +1,465 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/callback.c
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFSv4 callback handling
+ */
+
+#include <linux/completion.h>
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/sched/signal.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/nfs_fs.h>
+#include <linux/errno.h>
+#include <linux/mutex.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/bc_xprt.h>
+
+#include <net/inet_sock.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "internal.h"
+#include "netns.h"
+
+#define NFSDBG_FACILITY NFSDBG_CALLBACK
+
+/*
+ * Per-minor-version callback service state (one slot of nfs_callback_info[]).
+ *
+ * @users: incremented by each successful nfs_callback_up() and decremented
+ *         by nfs_callback_down(); the service is shut down when it hits 0.
+ * @serv:  the svc_serv created by nfs_callback_create_svc(), or NULL when no
+ *         service exists for this minor version.
+ *
+ * Both fields are modified under nfs_callback_mutex.
+ */
+struct nfs_callback_data {
+ unsigned int users;
+ struct svc_serv *serv;
+};
+
+static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
+static DEFINE_MUTEX(nfs_callback_mutex);
+static struct svc_program nfs4_callback_program;
+
+/*
+ * Create the TCP callback listeners for @serv in namespace @net.
+ *
+ * An IPv4 listener is mandatory; its port is recorded in
+ * nn->nfs_callback_tcpport.  An IPv6 listener is attempted afterwards and
+ * its port stored in nn->nfs_callback_tcpport6; a failure with
+ * -EAFNOSUPPORT is tolerated, any other failure is reported.
+ *
+ * Returns 0 on success, otherwise the error from svc_create_xprt(), or
+ * -ENOMEM when svc_create_xprt() returned 0.
+ */
+static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	const struct cred *cred = current_cred();
+	int port;
+
+	port = svc_create_xprt(serv, "tcp", net, PF_INET,
+			       nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
+			       cred);
+	if (port <= 0)
+		return port ? port : -ENOMEM;
+	nn->nfs_callback_tcpport = port;
+	dprintk("NFS: Callback listener port = %u (af %u, net %x)\n",
+		nn->nfs_callback_tcpport, PF_INET, net->ns.inum);
+
+	port = svc_create_xprt(serv, "tcp", net, PF_INET6,
+			       nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
+			       cred);
+	if (port > 0) {
+		nn->nfs_callback_tcpport6 = port;
+		dprintk("NFS: Callback listener port = %u (af %u, net %x)\n",
+			nn->nfs_callback_tcpport6, PF_INET6, net->ns.inum);
+		return 0;
+	}
+	/* A missing IPv6 address family is not an error. */
+	if (port == -EAFNOSUPPORT)
+		return 0;
+	return port ? port : -ENOMEM;
+}
+
+/*
+ * Main loop of the NFSv4.0 callback kernel thread.
+ *
+ * @vrqstp is the thread's struct svc_rqst.  The thread keeps receiving and
+ * processing requests until kthread_freezable_should_stop() reports that it
+ * should stop, then releases the request structure and exits.
+ */
+static int
+nfs4_callback_svc(void *vrqstp)
+{
+	struct svc_rqst *rqstp = vrqstp;
+
+	set_freezable();
+
+	while (!kthread_freezable_should_stop(NULL)) {
+		int rc;
+
+		if (signal_pending(current))
+			flush_signals(current);
+
+		/* Listen for a request on the socket */
+		rc = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT);
+		/* -EAGAIN and -EINTR mean there is nothing to process yet. */
+		if (rc != -EAGAIN && rc != -EINTR)
+			svc_process(rqstp);
+	}
+	svc_exit_thread(rqstp);
+	module_put_and_exit(0);
+	return 0;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * The callback service for NFSv4.1 callbacks
+ *
+ * @vrqstp is the thread's struct svc_rqst.  Instead of receiving from a
+ * socket, the thread takes backchannel requests off serv->sv_cb_list and
+ * hands each one to bc_svc_process().  It sleeps on serv->sv_cb_waitq while
+ * the list is empty and runs until kthread_freezable_should_stop() reports
+ * that it should stop.
+ */
+static int
+nfs41_callback_svc(void *vrqstp)
+{
+ struct svc_rqst *rqstp = vrqstp;
+ struct svc_serv *serv = rqstp->rq_server;
+ struct rpc_rqst *req;
+ int error;
+ DEFINE_WAIT(wq);
+
+ set_freezable();
+
+ while (!kthread_freezable_should_stop(NULL)) {
+
+ if (signal_pending(current))
+ flush_signals(current);
+
+ /* Register on the wait queue before inspecting the list. */
+ prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
+ spin_lock_bh(&serv->sv_cb_lock);
+ if (!list_empty(&serv->sv_cb_list)) {
+ /* Dequeue the oldest request; process it without the lock held. */
+ req = list_first_entry(&serv->sv_cb_list,
+ struct rpc_rqst, rq_bc_list);
+ list_del(&req->rq_bc_list);
+ spin_unlock_bh(&serv->sv_cb_lock);
+ finish_wait(&serv->sv_cb_waitq, &wq);
+ dprintk("Invoking bc_svc_process()\n");
+ error = bc_svc_process(serv, req, rqstp);
+ dprintk("bc_svc_process() returned w/ error code= %d\n",
+ error);
+ } else {
+ spin_unlock_bh(&serv->sv_cb_lock);
+ /* Nothing queued: sleep unless we were asked to stop. */
+ if (!kthread_should_stop())
+ schedule();
+ finish_wait(&serv->sv_cb_waitq, &wq);
+ }
+ }
+ svc_exit_thread(rqstp);
+ module_put_and_exit(0);
+ return 0;
+}
+
+/*
+ * For a non-zero @minorversion, save the svc_serv in the transport so that
+ * it can be referenced when the session backchannel is initialized.
+ * For minor version 0 the transport is left untouched.
+ */
+static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
+					struct svc_serv *serv)
+{
+	if (minorversion == 0)
+		return;
+	xprt->bc_serv = serv;
+}
+#else
+/* Without CONFIG_NFS_V4_1 there is nothing to record: this is a no-op. */
+static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
+ struct svc_serv *serv)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Make sure @serv runs the configured number of callback threads.
+ *
+ * The target is nfs_callback_nr_threads, raised to
+ * NFS4_MIN_NR_CALLBACK_THREADS if it is smaller.  Nothing is done when
+ * serv->sv_nrthreads - 1 already equals the target.  If the thread setup
+ * fails, the thread count is set back to 0 and the error is returned.
+ *
+ * Returns 0 on success or the error from the svo_setup() method.
+ */
+static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
+				  struct svc_serv *serv)
+{
+	int wanted = nfs_callback_nr_threads;
+	int err;
+
+	nfs_callback_bc_serv(minorversion, xprt, serv);
+
+	if (wanted < NFS4_MIN_NR_CALLBACK_THREADS)
+		wanted = NFS4_MIN_NR_CALLBACK_THREADS;
+
+	if (serv->sv_nrthreads - 1 != wanted) {
+		err = serv->sv_ops->svo_setup(serv, NULL, wanted);
+		if (err) {
+			serv->sv_ops->svo_setup(serv, NULL, 0);
+			return err;
+		}
+		dprintk("nfs_callback_up: service started\n");
+	}
+	return 0;
+}
+
+/*
+ * Drop one per-namespace user of the callback service for @minorversion.
+ * When the last user in @net goes away, the service's per-net state is
+ * shut down with svc_shutdown_net().
+ */
+static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	nn->cb_users[minorversion]--;
+	if (nn->cb_users[minorversion] == 0) {
+		dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum);
+		svc_shutdown_net(serv, net);
+	}
+}
+
+/*
+ * Take a per-namespace reference on the callback service for @minorversion
+ * and, for the first user in @net, bring the service up in that namespace.
+ *
+ * For minor version 0 (or when CONFIG_NFS_V4_1 is disabled) the listeners
+ * are created with nfs4_callback_up_net().  Otherwise the transport must
+ * provide a bc_setup method, in which case the backchannel is enabled on
+ * @serv; if it does not, -EPROTONOSUPPORT is returned.
+ *
+ * Returns 0 on success.  On failure the cb_users count is restored and a
+ * negative errno is returned.
+ */
+static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
+ struct net *net, struct rpc_xprt *xprt)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ int ret;
+
+ /* Only the first user in this namespace performs the setup below. */
+ if (nn->cb_users[minorversion]++)
+ return 0;
+
+ dprintk("NFS: create per-net callback data; net=%x\n", net->ns.inum);
+
+ ret = svc_bind(serv, net);
+ if (ret < 0) {
+ printk(KERN_WARNING "NFS: bind callback service failed\n");
+ goto err_bind;
+ }
+
+ ret = 0;
+ if (!IS_ENABLED(CONFIG_NFS_V4_1) || minorversion == 0)
+ ret = nfs4_callback_up_net(serv, net);
+ else if (xprt->ops->bc_setup)
+ set_bc_enabled(serv);
+ else
+ ret = -EPROTONOSUPPORT;
+
+ if (ret < 0) {
+ printk(KERN_ERR "NFS: callback service start failed\n");
+ goto err_socks;
+ }
+ return 0;
+
+err_socks:
+ svc_rpcb_cleanup(serv, net);
+err_bind:
+ /* Undo the reference taken at the top of the function. */
+ nn->cb_users[minorversion]--;
+ dprintk("NFS: Couldn't create callback socket: err = %d; "
+ "net = %x\n", ret, net->ns.inum);
+ return ret;
+}
+
+/* Service operations for NFSv4.0 callbacks; threads run nfs4_callback_svc(). */
+static const struct svc_serv_ops nfs40_cb_sv_ops = {
+ .svo_function = nfs4_callback_svc,
+ .svo_enqueue_xprt = svc_xprt_do_enqueue,
+ .svo_setup = svc_set_num_threads_sync,
+ .svo_module = THIS_MODULE,
+};
+#if defined(CONFIG_NFS_V4_1)
+/* Service operations for NFSv4.1+ callbacks; threads run nfs41_callback_svc(). */
+static const struct svc_serv_ops nfs41_cb_sv_ops = {
+ .svo_function = nfs41_callback_svc,
+ .svo_enqueue_xprt = svc_xprt_do_enqueue,
+ .svo_setup = svc_set_num_threads_sync,
+ .svo_module = THIS_MODULE,
+};
+
+/*
+ * Operations table used by nfs_callback_create_svc(): entry 0 serves minor
+ * version 0, entry 1 serves every other minor version.
+ */
+static const struct svc_serv_ops *nfs4_cb_sv_ops[] = {
+ [0] = &nfs40_cb_sv_ops,
+ [1] = &nfs41_cb_sv_ops,
+};
+#else
+/*
+ * Without CONFIG_NFS_V4_1 only minor version 0 has operations; the NULL in
+ * entry 1 makes nfs_callback_create_svc() fail with -ENOTSUPP.
+ */
+static const struct svc_serv_ops *nfs4_cb_sv_ops[] = {
+ [0] = &nfs40_cb_sv_ops,
+ [1] = NULL,
+};
+#endif
+
+/*
+ * Return the callback svc_serv for @minorversion, creating it if needed.
+ *
+ * If a service already exists, an extra reference is taken on it, because
+ * the caller drops one with svc_destroy() afterwards.  Otherwise a new
+ * pooled service is created with the operations matching the minor version
+ * and remembered in nfs_callback_info[].
+ *
+ * Returns the service, ERR_PTR(-ENOTSUPP) when no operations exist for the
+ * minor version, or ERR_PTR(-ENOMEM) when the service cannot be created.
+ */
+static struct svc_serv *nfs_callback_create_svc(int minorversion)
+{
+	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+	const struct svc_serv_ops *ops;
+	struct svc_serv *new_serv;
+
+	/* Already up and running: just take another reference. */
+	if (cb_info->serv != NULL) {
+		svc_get(cb_info->serv);
+		return cb_info->serv;
+	}
+
+	/* Minor version 0 has its own ops; all others share entry 1. */
+	ops = nfs4_cb_sv_ops[minorversion == 0 ? 0 : 1];
+	if (!ops)
+		return ERR_PTR(-ENOTSUPP);
+
+	/* With no service in place we expect to be the first user. */
+	if (cb_info->users != 0)
+		printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
+			cb_info->users);
+
+	new_serv = svc_create_pooled(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, ops);
+	if (new_serv == NULL) {
+		printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	cb_info->serv = new_serv;
+	/*
+	 * As there is only one thread we need to over-ride the
+	 * default maximum of 80 connections
+	 */
+	new_serv->sv_maxconn = 1024;
+	dprintk("nfs_callback_create_svc: service created\n");
+	return new_serv;
+}
+
+/*
+ * Bring up the callback thread if it is not already up.
+ *
+ * @minorversion selects the nfs_callback_info[] slot; @xprt supplies the
+ * network namespace (xprt->xprt_net) and, for NFSv4.1+, the backchannel.
+ * The whole operation runs under nfs_callback_mutex.
+ *
+ * Returns 0 on success or a negative errno.
+ *
+ * Note on control flow: the success path deliberately falls through into
+ * the err_net label, so svc_destroy() is called on success as well as on
+ * failure (see the comment above the label).  The err_start handler sits
+ * after the return statement and jumps back to err_net once it has undone
+ * the per-net setup.
+ */
+int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
+{
+ struct svc_serv *serv;
+ struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+ int ret;
+ struct net *net = xprt->xprt_net;
+
+ mutex_lock(&nfs_callback_mutex);
+
+ serv = nfs_callback_create_svc(minorversion);
+ if (IS_ERR(serv)) {
+ ret = PTR_ERR(serv);
+ goto err_create;
+ }
+
+ ret = nfs_callback_up_net(minorversion, serv, net, xprt);
+ if (ret < 0)
+ goto err_net;
+
+ ret = nfs_callback_start_svc(minorversion, xprt, serv);
+ if (ret < 0)
+ goto err_start;
+
+ cb_info->users++;
+ /*
+ * svc_create creates the svc_serv with sv_nrthreads == 1, and then
+ * svc_prepare_thread increments that. So we need to call svc_destroy
+ * on both success and failure so that the refcount is 1 when the
+ * thread exits.
+ */
+err_net:
+ /* With no users left, forget the service before dropping our reference. */
+ if (!cb_info->users)
+ cb_info->serv = NULL;
+ svc_destroy(serv);
+err_create:
+ mutex_unlock(&nfs_callback_mutex);
+ return ret;
+
+err_start:
+ nfs_callback_down_net(minorversion, serv, net);
+ dprintk("NFS: Couldn't create server thread; err = %d\n", ret);
+ goto err_net;
+}
+
+/*
+ * Drop one user of the callback service for @minorversion in @net.
+ *
+ * The per-namespace reference is released first.  When the global user
+ * count for the minor version reaches zero, the service threads are stopped
+ * (thread count set to 0), the service is destroyed and the cached pointer
+ * is cleared.  Runs under nfs_callback_mutex.
+ */
+void nfs_callback_down(int minorversion, struct net *net)
+{
+	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+	struct svc_serv *serv;
+
+	mutex_lock(&nfs_callback_mutex);
+	serv = cb_info->serv;
+	nfs_callback_down_net(minorversion, serv, net);
+	if (--cb_info->users != 0)
+		goto unlock;
+
+	/* Last user: hold a reference across the thread shutdown. */
+	svc_get(serv);
+	serv->sv_ops->svo_setup(serv, NULL, 0);
+	svc_destroy(serv);
+	dprintk("nfs_callback_down: service destroyed\n");
+	cb_info->serv = NULL;
+unlock:
+	mutex_unlock(&nfs_callback_mutex);
+}
+
+/*
+ * Boolean check of RPC_AUTH_GSS principal
+ *
+ * @clp:   the client the callback was matched to
+ * @rqstp: the incoming callback request
+ *
+ * Returns 1 if the request is acceptable and 0 if it must be rejected:
+ *  - requests that do not use RPC_AUTH_GSS are always accepted;
+ *  - GSS requests on a minor version other than 0 are rejected;
+ *  - GSS requests without a principal name are rejected;
+ *  - otherwise the principal must equal clp->cl_acceptor when that is set,
+ *    or else be exactly "nfs@" followed by clp->cl_hostname.
+ */
+int
+check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
+{
+	char *p = rqstp->rq_cred.cr_principal;
+
+	if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
+		return 1;
+
+	/* No RPC_AUTH_GSS on NFSv4.1 back channel yet */
+	if (clp->cl_minorversion != 0)
+		return 0;
+	/*
+	 * It might just be a normal user principal, in which case
+	 * userspace won't bother to tell us the name at all.
+	 */
+	if (p == NULL)
+		return 0;
+
+	/*
+	 * Did we get the acceptor from userland during the SETCLIENTID
+	 * negotiation?
+	 */
+	if (clp->cl_acceptor)
+		return !strcmp(p, clp->cl_acceptor);
+
+	/*
+	 * Otherwise try to verify it using the cl_hostname. Note that this
+	 * doesn't work if a non-canonical hostname was used in the devname.
+	 */
+
+	/* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
+
+	/*
+	 * Use strncmp() rather than memcmp(): the principal is a
+	 * NUL-terminated string that may be shorter than four bytes, and
+	 * memcmp() would then read past its terminator.
+	 */
+	if (strncmp(p, "nfs@", 4) != 0)
+		return 0;
+	p += 4;
+	if (strcmp(p, clp->cl_hostname) != 0)
+		return 0;
+	return 1;
+}
+
+/*
+ * pg_authenticate method for nfsv4 callback threads.
+ *
+ * The authflavor has been negotiated, so an incorrect flavor is a server
+ * bug. Deny packets with incorrect authflavor:
+ *  - RPC_AUTH_NULL is only allowed for the CB_NULL procedure;
+ *  - RPC_AUTH_GSS is not allowed on the backchannel (no RPC_AUTH_GSS
+ *    support yet in NFSv4.1).
+ * Everything else is accepted here.
+ *
+ * All other checking done after NFS decoding where the nfs_client can be
+ * found in nfs4_callback_compound
+ */
+static int nfs_callback_authenticate(struct svc_rqst *rqstp)
+{
+	if (rqstp->rq_authop->flavour == RPC_AUTH_NULL) {
+		if (rqstp->rq_proc != CB_NULL)
+			return SVC_DENIED;
+	} else if (rqstp->rq_authop->flavour == RPC_AUTH_GSS) {
+		if (svc_is_backchannel(rqstp))
+			return SVC_DENIED;
+	}
+	return SVC_OK;
+}
+
+/*
+ * Define NFS4 callback program
+ *
+ * Version table, indexed by RPC version number: only versions 1 and 4 are
+ * populated, the remaining slots are left NULL.
+ */
+static const struct svc_version *nfs4_callback_version[] = {
+ [1] = &nfs4_callback_version1,
+ [4] = &nfs4_callback_version4,
+};
+
+/* Statistics block referenced by nfs4_callback_program.pg_stats. */
+static struct svc_stat nfs4_callback_stats;
+
+/*
+ * The RPC program description handed to svc_create_pooled() by
+ * nfs_callback_create_svc().  Requests are authenticated with
+ * nfs_callback_authenticate().
+ */
+static struct svc_program nfs4_callback_program = {
+ .pg_prog = NFS4_CALLBACK, /* RPC service number */
+ .pg_nvers = ARRAY_SIZE(nfs4_callback_version), /* Number of entries */
+ .pg_vers = nfs4_callback_version, /* version table */
+ .pg_name = "NFSv4 callback", /* service name */
+ .pg_class = "nfs", /* authentication class */
+ .pg_stats = &nfs4_callback_stats,
+ .pg_authenticate = nfs_callback_authenticate,
+ .pg_init_request = svc_generic_init_request,
+ .pg_rpcbind_set = svc_generic_rpcbind_set,
+};
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
new file mode 100644
index 000000000..ccd4f245c
--- /dev/null
+++ b/fs/nfs/callback.h
@@ -0,0 +1,223 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * linux/fs/nfs/callback.h
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFSv4 callback definitions
+ */
+#ifndef __LINUX_FS_NFS_CALLBACK_H
+#define __LINUX_FS_NFS_CALLBACK_H
+#include <linux/sunrpc/svc.h>
+
+/* RPC program number of the callback service. */
+#define NFS4_CALLBACK 0x40000000
+/* XDR size used to derive the callback buffer size below. */
+#define NFS4_CALLBACK_XDRSIZE 2048
+/* Buffer size passed to svc_create_pooled() for the callback service. */
+#define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE)
+
+/*
+ * RPC procedure numbers of the callback program.  CB_NULL is the only
+ * procedure accepted with RPC_AUTH_NULL credentials.
+ */
+enum nfs4_callback_procnum {
+ CB_NULL = 0,
+ CB_COMPOUND = 1,
+};
+
+/*
+ * Callback operation numbers.  The first two entries are followed by the
+ * groups introduced by later minor versions, as labelled below.
+ */
+enum nfs4_callback_opnum {
+ OP_CB_GETATTR = 3,
+ OP_CB_RECALL = 4,
+/* Callback operations new to NFSv4.1 */
+ OP_CB_LAYOUTRECALL = 5,
+ OP_CB_NOTIFY = 6,
+ OP_CB_PUSH_DELEG = 7,
+ OP_CB_RECALL_ANY = 8,
+ OP_CB_RECALLABLE_OBJ_AVAIL = 9,
+ OP_CB_RECALL_SLOT = 10,
+ OP_CB_SEQUENCE = 11,
+ OP_CB_WANTS_CANCELLED = 12,
+ OP_CB_NOTIFY_LOCK = 13,
+ OP_CB_NOTIFY_DEVICEID = 14,
+/* Callback operations new to NFSv4.2 */
+ OP_CB_OFFLOAD = 15,
+ OP_CB_ILLEGAL = 10044,
+};
+
+/*
+ * State shared by the operations of one callback compound.
+ *
+ * @drc_status:   set by nfs4_callback_sequence() to
+ *                NFS4ERR_RETRY_UNCACHED_REP when a replay is detected
+ * @clp:          the client; set by nfs4_callback_sequence() for v4.1 and
+ *                checked by every other operation handler
+ * @slot:         backchannel slot locked by nfs4_callback_sequence()
+ * @minorversion: minor version, used for the client lookup
+ * @net:          network namespace, used for the client lookup
+ */
+struct nfs4_slot;
+struct cb_process_state {
+ __be32 drc_status;
+ struct nfs_client *clp;
+ struct nfs4_slot *slot;
+ u32 minorversion;
+ struct net *net;
+};
+
+/*
+ * Header fields of a callback compound request.
+ * NOTE(review): presumably filled in by the XDR decoder in callback_xdr.c;
+ * confirm there.
+ */
+struct cb_compound_hdr_arg {
+ unsigned int taglen;
+ const char *tag;
+ unsigned int minorversion;
+ unsigned int cb_ident; /* v4.0 callback identifier */
+ unsigned nops;
+};
+
+/*
+ * Header fields of a callback compound reply.  @status and @nops are
+ * pointers to big-endian words rather than values.
+ * NOTE(review): presumably they point into the reply buffer so they can be
+ * patched after the operations have run; confirm in callback_xdr.c.
+ */
+struct cb_compound_hdr_res {
+ __be32 *status;
+ unsigned int taglen;
+ const char *tag;
+ __be32 *nops;
+};
+
+/*
+ * Arguments of CB_GETATTR: the file handle to look up and the requested
+ * attribute bitmap (two words), as consumed by nfs4_callback_getattr().
+ */
+struct cb_getattrargs {
+ struct nfs_fh fh;
+ uint32_t bitmap[2];
+};
+
+/*
+ * Result of CB_GETATTR, filled in by nfs4_callback_getattr().
+ *
+ * @status:      operation status (network byte order)
+ * @bitmap:      the requested bits restricted to the attributes supplied
+ *               here (change, size, time_metadata, time_modify)
+ * @size:        current inode size
+ * @change_attr: the delegation's change attribute, incremented by one when
+ *               the inode has writebacks outstanding
+ * @ctime:       inode change time
+ * @mtime:       inode modification time
+ */
+struct cb_getattrres {
+ __be32 status;
+ uint32_t bitmap[2];
+ uint64_t size;
+ uint64_t change_attr;
+ struct timespec64 ctime;
+ struct timespec64 mtime;
+};
+
+/*
+ * Arguments of CB_RECALL: nfs4_callback_recall() uses @fh to find the inode
+ * and @stateid to identify the delegation to return.  @truncate is not
+ * consulted by that handler.
+ */
+struct cb_recallargs {
+ struct nfs_fh fh;
+ nfs4_stateid stateid;
+ uint32_t truncate;
+};
+
+#if defined(CONFIG_NFS_V4_1)
+
+/*
+ * One referring call: a (sequence id, slot id) pair that
+ * referring_call_exists() checks against the forward-channel slot table.
+ */
+struct referring_call {
+ uint32_t rc_sequenceid;
+ uint32_t rc_slotid;
+};
+
+/*
+ * The referring calls belonging to one session.
+ *
+ * @rcl_sessionid: session the calls were issued on
+ * @rcl_nrefcalls: number of entries in @rcl_refcalls
+ * @rcl_refcalls:  array of calls; freed with kfree() by
+ *                 nfs4_callback_sequence()
+ */
+struct referring_call_list {
+ struct nfs4_sessionid rcl_sessionid;
+ uint32_t rcl_nrefcalls;
+ struct referring_call *rcl_refcalls;
+};
+
+/*
+ * Arguments of CB_SEQUENCE, consumed by nfs4_callback_sequence().
+ *
+ * @csa_addr and @csa_sessionid are used to look up the client.
+ * @csa_slotid and @csa_sequenceid are validated against the backchannel
+ * slot table.  @csa_cachethis must be 0, since no reply cache is kept.
+ * @csa_rclists is an array of @csa_nrclists referring call lists; the
+ * handler frees it, together with each list's rcl_refcalls.
+ */
+struct cb_sequenceargs {
+ struct sockaddr *csa_addr;
+ struct nfs4_sessionid csa_sessionid;
+ uint32_t csa_sequenceid;
+ uint32_t csa_slotid;
+ uint32_t csa_highestslotid;
+ uint32_t csa_cachethis;
+ uint32_t csa_nrclists;
+ struct referring_call_list *csa_rclists;
+};
+
+/*
+ * Result of CB_SEQUENCE, filled in by nfs4_callback_sequence().
+ *
+ * The session id, sequence id and slot id are echoed from the arguments;
+ * the two highest-slot fields are copied from the backchannel slot table.
+ * @csr_status receives the operation status, except when a replay is
+ * reported through cb_process_state.drc_status instead.
+ */
+struct cb_sequenceres {
+ __be32 csr_status;
+ struct nfs4_sessionid csr_sessionid;
+ uint32_t csr_sequenceid;
+ uint32_t csr_slotid;
+ uint32_t csr_highestslotid;
+ uint32_t csr_target_highestslotid;
+};
+
+extern __be32 nfs4_callback_sequence(void *argp, void *resp,
+ struct cb_process_state *cps);
+
+/*
+ * Bit numbers (not masks) within cb_recallanyargs.craa_type_mask; test them
+ * with BIT().  RCA4_TYPE_MASK_ALL is the mask with every bit number in the
+ * ranges named here set (0-4, 8-9, 12-15, 16-17) and is used to reject
+ * requests that carry unknown bits.
+ */
+#define RCA4_TYPE_MASK_RDATA_DLG 0
+#define RCA4_TYPE_MASK_WDATA_DLG 1
+#define RCA4_TYPE_MASK_DIR_DLG 2
+#define RCA4_TYPE_MASK_FILE_LAYOUT 3
+#define RCA4_TYPE_MASK_BLK_LAYOUT 4
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
+#define PNFS_FF_RCA4_TYPE_MASK_READ 16
+#define PNFS_FF_RCA4_TYPE_MASK_RW 17
+#define RCA4_TYPE_MASK_ALL 0x3f31f
+
+/*
+ * Arguments of CB_RECALL_ANY.  @craa_type_mask is a bitmap of the
+ * RCA4_TYPE_MASK_* bit numbers and is validated against
+ * RCA4_TYPE_MASK_ALL by nfs4_callback_recallany().
+ */
+struct cb_recallanyargs {
+ uint32_t craa_objs_to_keep;
+ uint32_t craa_type_mask;
+};
+
+extern __be32 nfs4_callback_recallany(void *argp, void *resp,
+ struct cb_process_state *cps);
+
+/* Arguments of CB_RECALL_SLOT, passed to nfs4_callback_recallslot(). */
+struct cb_recallslotargs {
+ uint32_t crsa_target_highest_slotid;
+};
+extern __be32 nfs4_callback_recallslot(void *argp, void *resp,
+ struct cb_process_state *cps);
+
+/*
+ * Arguments of CB_LAYOUTRECALL.
+ *
+ * @cbl_recall_type selects which union member is meaningful:
+ *  - RETURN_FILE: cbl_fh, cbl_range and cbl_stateid (file recall);
+ *  - RETURN_FSID: cbl_fsid (recall of all layouts on one filesystem);
+ *  - any other value: neither member is read (recall of all layouts of the
+ *    client).
+ */
+struct cb_layoutrecallargs {
+ uint32_t cbl_recall_type;
+ uint32_t cbl_layout_type;
+ uint32_t cbl_layoutchanged;
+ union {
+ struct {
+ struct nfs_fh cbl_fh;
+ struct pnfs_layout_range cbl_range;
+ nfs4_stateid cbl_stateid;
+ };
+ struct nfs_fsid cbl_fsid;
+ };
+};
+
+extern __be32 nfs4_callback_layoutrecall(void *argp, void *resp,
+ struct cb_process_state *cps);
+
+/*
+ * One device notification.  nfs4_callback_devicenotify() uses
+ * @cbd_layout_type to find the layout driver and @cbd_dev_id to select the
+ * device id to delete; the other fields are not consulted by that handler.
+ */
+struct cb_devicenotifyitem {
+ uint32_t cbd_notify_type;
+ uint32_t cbd_layout_type;
+ struct nfs4_deviceid cbd_dev_id;
+ uint32_t cbd_immediate;
+};
+
+/*
+ * Arguments of CB_NOTIFY_DEVICEID: @devs is an array of @ndevs items and is
+ * freed with kfree() by nfs4_callback_devicenotify().
+ */
+struct cb_devicenotifyargs {
+ uint32_t ndevs;
+ struct cb_devicenotifyitem *devs;
+};
+
+extern __be32 nfs4_callback_devicenotify(void *argp, void *resp,
+ struct cb_process_state *cps);
+
+/*
+ * Arguments of CB_NOTIFY_LOCK, passed to nfs4_callback_notify_lock().
+ * NOTE(review): the meaning of @cbnl_valid is not visible in this header;
+ * confirm against the decoder.
+ */
+struct cb_notify_lock_args {
+ struct nfs_fh cbnl_fh;
+ struct nfs_lowner cbnl_owner;
+ bool cbnl_valid;
+};
+
+extern __be32 nfs4_callback_notify_lock(void *argp, void *resp,
+ struct cb_process_state *cps);
+#endif /* CONFIG_NFS_V4_1 */
+#ifdef CONFIG_NFS_V4_2
+/*
+ * Arguments of CB_OFFLOAD (NFSv4.2), passed to nfs4_callback_offload():
+ * file handle, state id, an error code, a write count and a write verifier.
+ */
+struct cb_offloadargs {
+ struct nfs_fh coa_fh;
+ nfs4_stateid coa_stateid;
+ uint32_t error;
+ uint64_t wr_count;
+ struct nfs_writeverf wr_writeverf;
+};
+
+extern __be32 nfs4_callback_offload(void *args, void *dummy,
+ struct cb_process_state *cps);
+#endif /* CONFIG_NFS_V4_2 */
+extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
+extern __be32 nfs4_callback_getattr(void *argp, void *resp,
+ struct cb_process_state *cps);
+extern __be32 nfs4_callback_recall(void *argp, void *resp,
+ struct cb_process_state *cps);
+#if IS_ENABLED(CONFIG_NFS_V4)
+extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
+extern void nfs_callback_down(int minorversion, struct net *net);
+#endif /* CONFIG_NFS_V4 */
+/*
+ * nfs41: Callbacks are expected to not cause substantial latency,
+ * so we limit their concurrency to 1 by setting up the maximum number
+ * of slots for the backchannel.
+ */
+#define NFS41_BC_MIN_CALLBACKS 1
+#define NFS41_BC_MAX_CALLBACKS 1
+
+/* Lower bound applied to nfs_callback_nr_threads when starting the service. */
+#define NFS4_MIN_NR_CALLBACK_THREADS 1
+
+extern unsigned int nfs_callback_set_tcpport;
+extern unsigned short nfs_callback_nr_threads;
+
+#endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
new file mode 100644
index 000000000..bfdd21224
--- /dev/null
+++ b/fs/nfs/callback_proc.c
@@ -0,0 +1,731 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/callback_proc.c
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFSv4 callback procedures
+ */
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "internal.h"
+#include "pnfs.h"
+#include "nfs4session.h"
+#include "nfs4trace.h"
+
+#define NFSDBG_FACILITY NFSDBG_CALLBACK
+
+/*
+ * Handle CB_GETATTR: report the locally cached size, change attribute and
+ * times of a file for which this client holds a write delegation.
+ *
+ * @argp: struct cb_getattrargs (file handle and requested bitmap)
+ * @resp: struct cb_getattrres, filled in here
+ * @cps:  compound state; cps->clp must be set
+ *
+ * Returns res->status in network byte order:
+ *  - NFS4ERR_OP_NOT_IN_SESSION when no client is attached;
+ *  - NFS4ERR_DELAY when the inode lookup fails with -EAGAIN;
+ *  - NFS4ERR_BADHANDLE when the inode lookup fails otherwise, or when no
+ *    valid delegation with FMODE_WRITE is found;
+ *  - 0 on success.
+ */
+__be32 nfs4_callback_getattr(void *argp, void *resp,
+ struct cb_process_state *cps)
+{
+ struct cb_getattrargs *args = argp;
+ struct cb_getattrres *res = resp;
+ struct nfs_delegation *delegation;
+ struct inode *inode;
+
+ res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+ if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
+ goto out;
+
+ /* Default answer until a usable delegation has been found. */
+ res->bitmap[0] = res->bitmap[1] = 0;
+ res->status = htonl(NFS4ERR_BADHANDLE);
+
+ dprintk_rcu("NFS: GETATTR callback request from %s\n",
+ rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+ inode = nfs_delegation_find_inode(cps->clp, &args->fh);
+ if (IS_ERR(inode)) {
+ if (inode == ERR_PTR(-EAGAIN))
+ res->status = htonl(NFS4ERR_DELAY);
+ trace_nfs4_cb_getattr(cps->clp, &args->fh, NULL,
+ -ntohl(res->status));
+ goto out;
+ }
+ /* The delegation is only dereferenced inside the RCU read section. */
+ rcu_read_lock();
+ delegation = nfs4_get_valid_delegation(inode);
+ if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0)
+ goto out_iput;
+ res->size = i_size_read(inode);
+ res->change_attr = delegation->change_attr;
+ /* Outstanding writebacks bump the reported change attribute by one. */
+ if (nfs_have_writebacks(inode))
+ res->change_attr++;
+ res->ctime = inode->i_ctime;
+ res->mtime = inode->i_mtime;
+ /* Only report the attributes that were both requested and supplied. */
+ res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) &
+ args->bitmap[0];
+ res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) &
+ args->bitmap[1];
+ res->status = 0;
+out_iput:
+ rcu_read_unlock();
+ trace_nfs4_cb_getattr(cps->clp, &args->fh, inode, -ntohl(res->status));
+ nfs_iput_and_deactive(inode);
+out:
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
+ return res->status;
+}
+
+/*
+ * Handle CB_RECALL: start the asynchronous return of a delegation.
+ *
+ * @argp: struct cb_recallargs (file handle and delegation state id)
+ * @resp: unused
+ * @cps:  compound state; cps->clp must be set
+ *
+ * Returns, in network byte order:
+ *  - NFS4ERR_OP_NOT_IN_SESSION when no client is attached;
+ *  - NFS4ERR_DELAY when the inode lookup fails with -EAGAIN;
+ *  - NFS4ERR_BADHANDLE when the inode lookup fails otherwise;
+ *  - NFS4ERR_BAD_STATEID when the return request fails with -ENOENT;
+ *  - NFS4ERR_RESOURCE when it fails with any other error;
+ *  - 0 on success.
+ */
+__be32 nfs4_callback_recall(void *argp, void *resp,
+			    struct cb_process_state *cps)
+{
+	struct cb_recallargs *args = argp;
+	struct inode *inode;
+	__be32 status;
+	int err;
+
+	status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+	if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
+		goto done;
+
+	dprintk_rcu("NFS: RECALL callback request from %s\n",
+		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+	status = htonl(NFS4ERR_BADHANDLE);
+	inode = nfs_delegation_find_inode(cps->clp, &args->fh);
+	if (IS_ERR(inode)) {
+		if (inode == ERR_PTR(-EAGAIN))
+			status = htonl(NFS4ERR_DELAY);
+		trace_nfs4_cb_recall(cps->clp, &args->fh, NULL,
+				&args->stateid, -ntohl(status));
+		goto done;
+	}
+
+	/* Set up a helper thread to actually return the delegation */
+	err = nfs_async_inode_return_delegation(inode, &args->stateid);
+	if (err == 0)
+		status = 0;
+	else if (err == -ENOENT)
+		status = htonl(NFS4ERR_BAD_STATEID);
+	else
+		status = htonl(NFS4ERR_RESOURCE);
+
+	trace_nfs4_cb_recall(cps->clp, &args->fh, inode,
+			&args->stateid, -ntohl(status));
+	nfs_iput_and_deactive(inode);
+done:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+/*
+ * Lookup a layout inode by stateid
+ *
+ * Walks every layout of every superblock of @clp and stops at the first
+ * valid layout whose state id matches @stateid (ignoring the seqid part).
+ *
+ * Note: returns a refcount on the inode and superblock
+ *
+ * Returns the inode on success, ERR_PTR(-EAGAIN) when the superblock could
+ * not be activated or igrab() failed, and ERR_PTR(-ENOENT) when no layout
+ * matches.
+ *
+ * NOTE(review): the function is annotated __must_hold(RCU) although it
+ * takes and releases rcu_read_lock() itself.
+ */
+static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp,
+ const nfs4_stateid *stateid)
+ __must_hold(RCU)
+{
+ struct nfs_server *server;
+ struct inode *inode;
+ struct pnfs_layout_hdr *lo;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
+ if (!pnfs_layout_is_valid(lo))
+ continue;
+ if (!nfs4_stateid_match_other(stateid, &lo->plh_stateid))
+ continue;
+ if (nfs_sb_active(server->super))
+ inode = igrab(lo->plh_inode);
+ else
+ inode = ERR_PTR(-EAGAIN);
+ rcu_read_unlock();
+ /* Non-NULL covers both a grabbed inode and ERR_PTR(-EAGAIN). */
+ if (inode)
+ return inode;
+ /* igrab() failed: give back the superblock reference. */
+ nfs_sb_deactive(server->super);
+ return ERR_PTR(-EAGAIN);
+ }
+ }
+ rcu_read_unlock();
+ return ERR_PTR(-ENOENT);
+}
+
+/*
+ * Lookup a layout inode by filehandle.
+ *
+ * Walks every layout of every superblock of @clp and stops at the first
+ * layout whose inode has file handle @fh and still points back at that
+ * layout.
+ *
+ * Note: returns a refcount on the inode and superblock
+ *
+ * Returns the inode on success, ERR_PTR(-EAGAIN) when the superblock could
+ * not be activated or igrab() failed, and ERR_PTR(-ENOENT) when no layout
+ * matches.
+ */
+static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp,
+ const struct nfs_fh *fh)
+{
+ struct nfs_server *server;
+ struct nfs_inode *nfsi;
+ struct inode *inode;
+ struct pnfs_layout_hdr *lo;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
+ nfsi = NFS_I(lo->plh_inode);
+ /* A non-zero result means the file handles differ. */
+ if (nfs_compare_fh(fh, &nfsi->fh))
+ continue;
+ /* Skip layouts the inode no longer refers to. */
+ if (nfsi->layout != lo)
+ continue;
+ if (nfs_sb_active(server->super))
+ inode = igrab(lo->plh_inode);
+ else
+ inode = ERR_PTR(-EAGAIN);
+ rcu_read_unlock();
+ /* Non-NULL covers both a grabbed inode and ERR_PTR(-EAGAIN). */
+ if (inode)
+ return inode;
+ /* igrab() failed: give back the superblock reference. */
+ nfs_sb_deactive(server->super);
+ return ERR_PTR(-EAGAIN);
+ }
+ }
+ rcu_read_unlock();
+ return ERR_PTR(-ENOENT);
+}
+
+/*
+ * Find the inode a layout recall refers to.
+ *
+ * The lookup by @stateid is tried first; only when it reports -ENOENT is
+ * the lookup by file handle @fh attempted.  Any other result of the first
+ * lookup (an inode or a different error) is returned unchanged.
+ */
+static struct inode *nfs_layout_find_inode(struct nfs_client *clp,
+					    const struct nfs_fh *fh,
+					    const nfs4_stateid *stateid)
+{
+	struct inode *found = nfs_layout_find_inode_by_stateid(clp, stateid);
+
+	if (found != ERR_PTR(-ENOENT))
+		return found;
+	return nfs_layout_find_inode_by_fh(clp, fh);
+}
+
+/*
+ * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
+ *
+ * Compares the state id @new carried by a layout recall with the state of
+ * layout @lo and returns an NFSv4 status (host byte order):
+ *  - NFS4ERR_NOMATCHING_LAYOUT when the layout is not valid;
+ *  - NFS4ERR_BAD_STATEID when the state ids do not match;
+ *  - NFS4ERR_OLD_STATEID when the recall's seqid is behind;
+ *  - NFS4ERR_DELAY when the recall's seqid is too far ahead;
+ *  - NFS_OK otherwise.
+ */
+static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo,
+					const nfs4_stateid *new)
+{
+	u32 cur_seq, cb_seq;
+
+	/* Is the stateid not initialised? */
+	if (!pnfs_layout_is_valid(lo))
+		return NFS4ERR_NOMATCHING_LAYOUT;
+
+	/* Mismatched stateid? */
+	if (!nfs4_stateid_match_other(&lo->plh_stateid, new))
+		return NFS4ERR_BAD_STATEID;
+
+	cb_seq = be32_to_cpu(new->seqid);
+
+	/* Already in a layout recall situation: compare with the return seqid. */
+	if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
+	    lo->plh_return_seq != 0) {
+		if (cb_seq < lo->plh_return_seq)
+			return NFS4ERR_OLD_STATEID;
+		if (cb_seq > lo->plh_return_seq)
+			return NFS4ERR_DELAY;
+		return NFS_OK;
+	}
+
+	/* Otherwise the recall must carry exactly the next seqid. */
+	cur_seq = be32_to_cpu(lo->plh_stateid.seqid);
+	if (cb_seq > cur_seq + 1)
+		return NFS4ERR_DELAY;
+	/* Crazy server! */
+	if (cb_seq <= cur_seq)
+		return NFS4ERR_OLD_STATEID;
+	return NFS_OK;
+}
+
+/*
+ * Handle a layout recall for a single file (RETURN_FILE).
+ *
+ * Finds the inode named by the recall's state id or file handle, checks the
+ * recall's state id against the layout and marks the matching layout
+ * segments for return.
+ *
+ * Returns an NFSv4 status in host byte order:
+ *  - NFS4ERR_DELAY when the inode lookup fails with -EAGAIN or a bulk
+ *    recall is in progress on the layout;
+ *  - NFS4ERR_NOMATCHING_LAYOUT when the inode or its layout is missing, or
+ *    no segment matches the recalled range;
+ *  - the result of pnfs_check_callback_stateid() when that is not NFS_OK;
+ *  - NFS4_OK when there are segments that need to be returned.
+ */
+static u32 initiate_file_draining(struct nfs_client *clp,
+ struct cb_layoutrecallargs *args)
+{
+ struct inode *ino;
+ struct pnfs_layout_hdr *lo;
+ u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
+ LIST_HEAD(free_me_list);
+
+ ino = nfs_layout_find_inode(clp, &args->cbl_fh, &args->cbl_stateid);
+ if (IS_ERR(ino)) {
+ if (ino == ERR_PTR(-EAGAIN))
+ rv = NFS4ERR_DELAY;
+ goto out_noput;
+ }
+
+ pnfs_layoutcommit_inode(ino, false);
+
+
+ spin_lock(&ino->i_lock);
+ lo = NFS_I(ino)->layout;
+ if (!lo) {
+ spin_unlock(&ino->i_lock);
+ goto out;
+ }
+ /* Hold the layout header; the reference is dropped after unlocking. */
+ pnfs_get_layout_hdr(lo);
+ rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid);
+ if (rv != NFS_OK)
+ goto unlock;
+
+ /*
+ * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return)
+ */
+ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ rv = NFS4ERR_DELAY;
+ goto unlock;
+ }
+
+ pnfs_set_layout_stateid(lo, &args->cbl_stateid, NULL, true);
+ switch (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
+ &args->cbl_range,
+ be32_to_cpu(args->cbl_stateid.seqid))) {
+ case 0:
+ case -EBUSY:
+ /* There are layout segments that need to be returned */
+ rv = NFS4_OK;
+ break;
+ case -ENOENT:
+ set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
+ /* Embrace your forgetfulness! */
+ rv = NFS4ERR_NOMATCHING_LAYOUT;
+
+ /* Let the layout driver know about the recalled range, if it cares. */
+ if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+ NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
+ &args->cbl_range);
+ }
+ }
+unlock:
+ spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&free_me_list);
+ /* Free all lsegs that are attached to commit buckets */
+ nfs_commit_inode(ino, 0);
+ pnfs_put_layout_hdr(lo);
+out:
+ nfs_iput_and_deactive(ino);
+out_noput:
+ /* On the lookup-failure path ino is still the ERR_PTR value here. */
+ trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino,
+ &args->cbl_stateid, -rv);
+ return rv;
+}
+
+/*
+ * Handle a bulk layout recall.
+ *
+ * For RETURN_FSID the layouts of the filesystem named by args->cbl_fsid are
+ * destroyed; for any other recall type all layouts of @clp are destroyed.
+ *
+ * Returns NFS4ERR_DELAY when the destroy helper reports a non-zero result
+ * and NFS4ERR_NOMATCHING_LAYOUT otherwise (host byte order).
+ */
+static u32 initiate_bulk_draining(struct nfs_client *clp,
+				  struct cb_layoutrecallargs *args)
+{
+	int busy;
+
+	busy = (args->cbl_recall_type == RETURN_FSID) ?
+		pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true) :
+		pnfs_destroy_layouts_byclid(clp, true);
+
+	return busy != 0 ? NFS4ERR_DELAY : NFS4ERR_NOMATCHING_LAYOUT;
+}
+
+/*
+ * Dispatch a layout recall: RETURN_FILE recalls are handled per file, every
+ * other recall type is handled as a bulk recall.  Returns the handler's
+ * NFSv4 status in host byte order.
+ */
+static u32 do_callback_layoutrecall(struct nfs_client *clp,
+				    struct cb_layoutrecallargs *args)
+{
+	if (args->cbl_recall_type != RETURN_FILE)
+		return initiate_bulk_draining(clp, args);
+	return initiate_file_draining(clp, args);
+}
+
+/*
+ * Handle CB_LAYOUTRECALL.
+ *
+ * @argp: struct cb_layoutrecallargs
+ * @resp: unused
+ * @cps:  compound state; cps->clp must be set
+ *
+ * Returns NFS4ERR_OP_NOT_IN_SESSION when no client is attached, otherwise
+ * the status of do_callback_layoutrecall(), in network byte order.
+ */
+__be32 nfs4_callback_layoutrecall(void *argp, void *resp,
+				  struct cb_process_state *cps)
+{
+	struct cb_layoutrecallargs *args = argp;
+
+	if (!cps->clp)
+		return cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
+	return cpu_to_be32(do_callback_layoutrecall(cps->clp, args));
+}
+
+/*
+ * Recall every layout held by @clp by feeding a zeroed argument block with
+ * recall type RETURN_ALL to do_callback_layoutrecall().  The resulting
+ * status is discarded.
+ */
+static void pnfs_recall_all_layouts(struct nfs_client *clp)
+{
+ struct cb_layoutrecallargs args;
+
+ /* Pretend we got a CB_LAYOUTRECALL(ALL) */
+ memset(&args, 0, sizeof(args));
+ args.cbl_recall_type = RETURN_ALL;
+ /* FIXME we ignore errors, what should we do? */
+ do_callback_layoutrecall(clp, &args);
+}
+
+/*
+ * Handle CB_NOTIFY_DEVICEID: delete every device id named in the request.
+ *
+ * @argp: struct cb_devicenotifyargs; args->devs is freed here in all cases
+ * @resp: unused
+ * @cps:  compound state; cps->clp must be set
+ *
+ * The layout driver is looked up once per run of items with the same layout
+ * type; items whose layout type has no driver are skipped.
+ *
+ * Returns NFS4ERR_OP_NOT_IN_SESSION (network byte order) when no client is
+ * attached and 0 otherwise.
+ */
+__be32 nfs4_callback_devicenotify(void *argp, void *resp,
+				  struct cb_process_state *cps)
+{
+	struct cb_devicenotifyargs *args = argp;
+	const struct pnfs_layoutdriver_type *driver = NULL;
+	__be32 status = 0;
+	uint32_t idx;
+
+	if (!cps->clp) {
+		status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
+	} else {
+		for (idx = 0; idx < args->ndevs; idx++) {
+			struct cb_devicenotifyitem *item = &args->devs[idx];
+
+			/* Switch drivers only when the layout type changes. */
+			if (!driver || driver->id != item->cbd_layout_type) {
+				pnfs_put_layoutdriver(driver);
+				driver = pnfs_find_layoutdriver(item->cbd_layout_type);
+				if (!driver)
+					continue;
+			}
+			nfs4_delete_deviceid(driver, cps->clp, &item->cbd_dev_id);
+		}
+		pnfs_put_layoutdriver(driver);
+	}
+	kfree(args->devs);
+	return status;
+}
+
+/*
+ * Validate the sequenceID sent by the server.
+ * Return success if the sequenceID is one more than what we last saw on
+ * this slot, accounting for wraparound.  The slot itself is not modified
+ * here.
+ *
+ * We don't yet implement a duplicate request cache, instead we set the
+ * back channel ca_maxresponsesize_cached to zero. This is OK for now
+ * since we only currently implement idempotent callbacks anyway.
+ *
+ * We have a single slot backchannel at this time, so we don't bother
+ * checking the used_slots bit array on the table. The lower layer guarantees
+ * a single outstanding callback request at a time.
+ *
+ * Every error result is also reported through trace_nfs4_cb_seqid_err().
+ */
+static __be32
+validate_seqid(const struct nfs4_slot_table *tbl, const struct nfs4_slot *slot,
+		const struct cb_sequenceargs * args)
+{
+	__be32 err;
+
+	if (args->csa_slotid > tbl->server_highest_slotid) {
+		err = cpu_to_be32(NFS4ERR_BADSLOT);
+	} else if (args->csa_sequenceid == slot->seq_nr) {
+		/* Replay */
+		if (nfs4_test_locked_slot(tbl, slot->slot_nr))
+			err = cpu_to_be32(NFS4ERR_DELAY);
+		else if (args->csa_cachethis == 0)
+			/* Signal process_op to set this error on next op */
+			err = cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP);
+		else
+			/* Liar! We never allowed you to set csa_cachethis != 0 */
+			err = cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY);
+	} else if (args->csa_sequenceid == slot->seq_nr + 1) {
+		/* Note: wraparound relies on seq_nr being of type u32 */
+		return cpu_to_be32(NFS4_OK);
+	} else {
+		/* Misordered request */
+		err = cpu_to_be32(NFS4ERR_SEQ_MISORDERED);
+	}
+
+	trace_nfs4_cb_seqid_err(args, err);
+	return err;
+}
+
+/*
+ * For each referring call triple, check the session's slot table for
+ * a match. If the slot is in use and the sequence numbers match, the
+ * client is still waiting for a response to the original request.
+ *
+ * Only lists whose session id equals the client's own session id are
+ * examined.  For each of their calls the function waits (for up to HZ/2
+ * jiffies) on the forward-channel slot with nfs4_slot_wait_on_seqid().
+ *
+ * @lock is dropped around each wait and re-taken afterwards; it is held
+ * on entry and on return.
+ *
+ * Returns 1 as soon as one wait returns a negative value, 0 otherwise.
+ * The result is never negative.
+ */
+static int referring_call_exists(struct nfs_client *clp,
+ uint32_t nrclists,
+ struct referring_call_list *rclists,
+ spinlock_t *lock)
+ __releases(lock)
+ __acquires(lock)
+{
+ int status = 0;
+ int i, j;
+ struct nfs4_session *session;
+ struct nfs4_slot_table *tbl;
+ struct referring_call_list *rclist;
+ struct referring_call *ref;
+
+ /*
+ * XXX When client trunking is implemented, this becomes
+ * a session lookup from within the loop
+ */
+ session = clp->cl_session;
+ tbl = &session->fc_slot_table;
+
+ for (i = 0; i < nrclists; i++) {
+ rclist = &rclists[i];
+ /* Ignore referring calls made on some other session. */
+ if (memcmp(session->sess_id.data,
+ rclist->rcl_sessionid.data,
+ NFS4_MAX_SESSIONID_LEN) != 0)
+ continue;
+
+ for (j = 0; j < rclist->rcl_nrefcalls; j++) {
+ ref = &rclist->rcl_refcalls[j];
+ /* The wait is performed without the spinlock held. */
+ spin_unlock(lock);
+ status = nfs4_slot_wait_on_seqid(tbl, ref->rc_slotid,
+ ref->rc_sequenceid, HZ >> 1) < 0;
+ spin_lock(lock);
+ if (status)
+ goto out;
+ }
+ }
+
+out:
+ return status;
+}
+
+/*
+ * Handle CB_SEQUENCE: find the client for the session, validate the slot
+ * and sequence id, and lock the backchannel slot for this compound.
+ *
+ * @argp: struct cb_sequenceargs; the referring call lists it carries are
+ *        freed here on every path
+ * @resp: struct cb_sequenceres, filled in here
+ * @cps:  compound state; cps->clp is set to the client that was found (or
+ *        NULL) and cps->slot to the locked slot
+ *
+ * Returns, in network byte order:
+ *  - NFS4ERR_BADSESSION when no client matches, the session has no
+ *    backchannel, or the session is being reset;
+ *  - NFS4ERR_DELAY when the slot table is draining, the slot is busy, or a
+ *    referring call is still outstanding;
+ *  - NFS4ERR_BADSLOT when the slot lookup fails;
+ *  - the error from validate_seqid() for sequencing problems;
+ *  - NFS4ERR_REP_TOO_BIG_TO_CACHE when the server asks for reply caching;
+ *  - 0 on success, and also for a detected replay, in which case
+ *    NFS4ERR_RETRY_UNCACHED_REP is stored in cps->drc_status instead.
+ */
+__be32 nfs4_callback_sequence(void *argp, void *resp,
+			      struct cb_process_state *cps)
+{
+	struct cb_sequenceargs *args = argp;
+	struct cb_sequenceres *res = resp;
+	struct nfs4_slot_table *tbl;
+	struct nfs4_slot *slot;
+	struct nfs_client *clp;
+	uint32_t i;
+	__be32 status = htonl(NFS4ERR_BADSESSION);
+
+	clp = nfs4_find_client_sessionid(cps->net, args->csa_addr,
+					 &args->csa_sessionid, cps->minorversion);
+	if (clp == NULL)
+		goto out;
+
+	if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
+		goto out;
+
+	tbl = &clp->cl_session->bc_slot_table;
+
+	/* Set up res before grabbing the spinlock */
+	memcpy(&res->csr_sessionid, &args->csa_sessionid,
+	       sizeof(res->csr_sessionid));
+	res->csr_sequenceid = args->csa_sequenceid;
+	res->csr_slotid = args->csa_slotid;
+
+	spin_lock(&tbl->slot_tbl_lock);
+	/* state manager is resetting the session */
+	if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
+		status = htonl(NFS4ERR_DELAY);
+		/* Return NFS4ERR_BADSESSION if we're draining the session
+		 * in order to reset it.
+		 */
+		if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
+			status = htonl(NFS4ERR_BADSESSION);
+		goto out_unlock;
+	}
+
+	status = htonl(NFS4ERR_BADSLOT);
+	slot = nfs4_lookup_slot(tbl, args->csa_slotid);
+	if (IS_ERR(slot))
+		goto out_unlock;
+
+	res->csr_highestslotid = tbl->server_highest_slotid;
+	res->csr_target_highestslotid = tbl->target_highest_slotid;
+
+	status = validate_seqid(tbl, slot, args);
+	if (status)
+		goto out_unlock;
+	if (!nfs4_try_to_lock_slot(tbl, slot)) {
+		status = htonl(NFS4ERR_DELAY);
+		goto out_unlock;
+	}
+	cps->slot = slot;
+
+	/* The ca_maxresponsesize_cached is 0 with no DRC */
+	if (args->csa_cachethis != 0) {
+		status = htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+		goto out_unlock;
+	}
+
+	/*
+	 * Check for pending referring calls. If a match is found, a
+	 * related callback was received before the response to the original
+	 * call.
+	 *
+	 * referring_call_exists() reports a match with a non-zero (positive)
+	 * value and never returns a negative one, so test for non-zero.
+	 */
+	if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists,
+				  &tbl->slot_tbl_lock)) {
+		status = htonl(NFS4ERR_DELAY);
+		goto out_unlock;
+	}
+
+	/*
+	 * RFC5661 20.9.3
+	 * If CB_SEQUENCE returns an error, then the state of the slot
+	 * (sequence ID, cached reply) MUST NOT change.
+	 */
+	slot->seq_nr = args->csa_sequenceid;
+out_unlock:
+	spin_unlock(&tbl->slot_tbl_lock);
+
+out:
+	cps->clp = clp; /* put in nfs4_callback_compound */
+	/* The decoded referring call lists are no longer needed. */
+	for (i = 0; i < args->csa_nrclists; i++)
+		kfree(args->csa_rclists[i].rcl_refcalls);
+	kfree(args->csa_rclists);
+
+	/* A replay is not an error of this op; it is flagged for the next one. */
+	if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
+		cps->drc_status = status;
+		status = 0;
+	} else
+		res->csr_status = status;
+
+	trace_nfs4_cb_sequence(args, res, status);
+	return status;
+}
+
+/* Return true when @mask has no bits set outside RCA4_TYPE_MASK_ALL. */
+static bool
+validate_bitmap_values(unsigned int mask)
+{
+	unsigned int unknown = mask & ~RCA4_TYPE_MASK_ALL;
+
+	return !unknown;
+}
+
+/*
+ * Handle CB_RECALL_ANY: validate the type mask, then start returning each
+ * kind of recallable state that the mask selects.
+ *
+ * Returns NFS4ERR_OP_NOT_IN_SESSION when CB_SEQUENCE did not set up a
+ * client, NFS4ERR_INVAL for a mask with unknown bits, otherwise NFS4_OK.
+ */
+__be32 nfs4_callback_recallany(void *argp, void *resp,
+		struct cb_process_state *cps)
+{
+	struct cb_recallanyargs *args = argp;
+	struct nfs_client *clp = cps->clp;	/* set in cb_sequence */
+	fmode_t deleg_modes = 0;
+	bool kick_manager = false;
+	__be32 status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
+
+	if (!clp)
+		goto out;
+
+	dprintk_rcu("NFS: RECALL_ANY callback request from %s\n",
+		rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+	if (!validate_bitmap_values(args->craa_type_mask)) {
+		status = cpu_to_be32(NFS4ERR_INVAL);
+		goto out;
+	}
+	status = cpu_to_be32(NFS4_OK);
+
+	/* Delegations: gather the open modes and expire them in one call */
+	if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_RDATA_DLG))
+		deleg_modes |= FMODE_READ;
+	if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_WDATA_DLG))
+		deleg_modes |= FMODE_WRITE;
+	if (deleg_modes)
+		nfs_expire_unused_delegation_types(clp, deleg_modes);
+
+	if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT))
+		pnfs_recall_all_layouts(clp);
+
+	/* These two are only flagged here; the state manager acts on them */
+	if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_READ)) {
+		set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &clp->cl_state);
+		kick_manager = true;
+	}
+	if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_RW)) {
+		set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &clp->cl_state);
+		kick_manager = true;
+	}
+	if (kick_manager)
+		nfs4_schedule_state_manager(clp);
+
+out:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+}
+
+/*
+ * Handle CB_RECALL_SLOT: reduce the fore channel's max_slots to the
+ * target value supplied by the server, then notify the server.
+ *
+ * Returns NFS4ERR_OP_NOT_IN_SESSION when CB_SEQUENCE did not set up a
+ * client, otherwise NFS4_OK.
+ */
+__be32 nfs4_callback_recallslot(void *argp, void *resp,
+		struct cb_process_state *cps)
+{
+	struct cb_recallslotargs *args = argp;
+	struct nfs_client *clp = cps->clp;	/* set in cb_sequence */
+	__be32 status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+
+	if (clp) {
+		dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target highest slotid %u\n",
+			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
+			args->crsa_target_highest_slotid);
+
+		status = htonl(NFS4_OK);
+		nfs41_set_target_slotid(&clp->cl_session->fc_slot_table,
+				args->crsa_target_highest_slotid);
+		nfs41_notify_server(clp);
+	}
+
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+}
+
+/*
+ * Handle CB_NOTIFY_LOCK: wake the waiters on the client's lock wait queue,
+ * passing the decoded arguments as the wake-up key.
+ *
+ * Returns NFS4ERR_OP_NOT_IN_SESSION when CB_SEQUENCE did not set up a
+ * client, otherwise NFS4_OK (also when the lock owner was not usable).
+ */
+__be32 nfs4_callback_notify_lock(void *argp, void *resp,
+		struct cb_process_state *cps)
+{
+	struct cb_notify_lock_args *args = argp;
+	struct nfs_client *clp = cps->clp;	/* set in cb_sequence */
+
+	if (!clp)
+		return htonl(NFS4ERR_OP_NOT_IN_SESSION);
+
+	dprintk_rcu("NFS: CB_NOTIFY_LOCK request from %s\n",
+		rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+	/* Don't wake anybody if the string looked bogus */
+	if (args->cbnl_valid)
+		__wake_up(&clp->cl_lock_waitq, TASK_NORMAL, 0, args);
+
+	return htonl(NFS4_OK);
+}
+#endif /* CONFIG_NFS_V4_1 */
+#ifdef CONFIG_NFS_V4_2
+/*
+ * Copy the outcome of a CB_OFFLOAD into a copy state: the byte count and
+ * error always, the write verifier only when the copy succeeded.
+ */
+static void nfs4_copy_cb_args(struct nfs4_copy_state *cp_state,
+		struct cb_offloadargs *args)
+{
+	cp_state->count = args->wr_count;
+	cp_state->error = args->error;
+	if (args->error)
+		return;
+
+	cp_state->verf.committed = args->wr_writeverf.committed;
+	memcpy(&cp_state->verf.verifier.data[0],
+		&args->wr_writeverf.verifier.data[0],
+		NFS4_VERIFIER_SIZE);
+}
+
+/*
+ * Handle CB_OFFLOAD: deliver the result of an asynchronous copy.
+ *
+ * Searches the copies of every superblock of the client for one whose
+ * stateid matches. If found, the result is stored in it and its
+ * completion is signalled. Otherwise the result is queued on the
+ * client's pending_cb_stateids list.
+ *
+ * Returns NFS4ERR_OP_NOT_IN_SESSION when CB_SEQUENCE did not set up a
+ * client, NFS4ERR_SERVERFAULT on allocation failure, otherwise 0.
+ */
+__be32 nfs4_callback_offload(void *data, void *dummy,
+		struct cb_process_state *cps)
+{
+	struct cb_offloadargs *args = data;
+	struct nfs_server *server;
+	struct nfs4_copy_state *copy, *tmp_copy;
+	bool found = false;
+
+	/* Same guard as the other session-based handlers in this file */
+	if (!cps->clp) /* set in cb_sequence */
+		return htonl(NFS4ERR_OP_NOT_IN_SESSION);
+
+	/* Allocated up front so nothing has to allocate under cl_lock */
+	copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS);
+	if (!copy)
+		return htonl(NFS4ERR_SERVERFAULT);
+
+	spin_lock(&cps->clp->cl_lock);
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &cps->clp->cl_superblocks,
+				client_link) {
+		list_for_each_entry(tmp_copy, &server->ss_copies, copies) {
+			if (memcmp(args->coa_stateid.other,
+					tmp_copy->stateid.other,
+					sizeof(args->coa_stateid.other)))
+				continue;
+			nfs4_copy_cb_args(tmp_copy, args);
+			complete(&tmp_copy->completion);
+			found = true;
+			goto out;
+		}
+	}
+out:
+	rcu_read_unlock();
+	if (!found) {
+		/* No matching copy yet: park the result for later pickup */
+		memcpy(&copy->stateid, &args->coa_stateid, NFS4_STATEID_SIZE);
+		nfs4_copy_cb_args(copy, args);
+		list_add_tail(&copy->copies, &cps->clp->pending_cb_stateids);
+	} else
+		kfree(copy);
+	spin_unlock(&cps->clp->cl_lock);
+
+	return 0;
+}
+#endif /* CONFIG_NFS_V4_2 */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
new file mode 100644
index 000000000..ca8a4aa35
--- /dev/null
+++ b/fs/nfs/callback_xdr.c
@@ -0,0 +1,1091 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/callback_xdr.c
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFSv4 callback encode/decode procedures
+ */
+#include <linux/kernel.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/ratelimit.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/bc_xprt.h>
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "internal.h"
+#include "nfs4session.h"
+#include "nfs4trace.h"
+
+/*
+ * Size limits for callback COMPOUND processing. CB_OP_TAGLEN_MAXSZ bounds
+ * the tag accepted by decode_compound_hdr_arg(); the *_RES_MAXSZ values
+ * are the per-operation result sizes stored in callback_ops[].res_maxsize.
+ */
+#define CB_OP_TAGLEN_MAXSZ (512)
+#define CB_OP_HDR_RES_MAXSZ (2 * 4) // opcode, status
+#define CB_OP_GETATTR_BITMAP_MAXSZ (4 * 4) // bitmap length, 3 bitmaps
+#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
+ CB_OP_GETATTR_BITMAP_MAXSZ + \
+ /* change, size, ctime, mtime */\
+ (2 + 2 + 3 + 3) * 4)
+#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+
+#if defined(CONFIG_NFS_V4_1)
+#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
+ NFS4_MAX_SESSIONID_LEN + \
+ (1 + 3) * 4) // seqid, 3 slotids
+#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_NOTIFY_LOCK_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#endif /* CONFIG_NFS_V4_1 */
+#ifdef CONFIG_NFS_V4_2
+#define CB_OP_OFFLOAD_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#endif /* CONFIG_NFS_V4_2 */
+
+#define NFSDBG_FACILITY NFSDBG_CALLBACK
+
+/*
+ * Internal error code, never sent on the wire: returned by decode_op_hdr()
+ * and encode_op_hdr() when the XDR buffer has no room for an operation
+ * header. nfs4_callback_compound() turns it into NFS4ERR_RESOURCE and does
+ * not count the operation.
+ */
+#define NFS4ERR_RESOURCE_HDR 11050
+
+/*
+ * Per-operation dispatch entry; callback_ops[] is indexed by the CB
+ * operation number and consulted by process_op().
+ */
+struct callback_op {
+ /* Executes the operation: (decoded args, result buffer, compound state) */
+ __be32 (*process_op)(void *, void *, struct cb_process_state *);
+ /* Decodes the operation's arguments from the request stream */
+ __be32 (*decode_args)(struct svc_rqst *, struct xdr_stream *, void *);
+ /* Encodes the result; may be NULL when only the op header is sent */
+ __be32 (*encode_res)(struct svc_rqst *, struct xdr_stream *,
+ const void *);
+ /* Set from the CB_OP_*_RES_MAXSZ macros; not read in the code shown here */
+ long res_maxsize;
+};
+
+/* Forward declaration; the table is defined after the handlers it names */
+static struct callback_op callback_ops[];
+
+/* CB_NULL: nothing to do, always succeeds. */
+static __be32 nfs4_callback_null(struct svc_rqst *rqstp)
+{
+	return cpu_to_be32(NFS4_OK);
+}
+
+/* No arguments to decode: only check the argument size. */
+static int nfs4_decode_void(struct svc_rqst *rqstp, __be32 *p)
+{
+	return xdr_argsize_check(rqstp, p);
+}
+
+/* No result body to encode: only check the result size. */
+static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p)
+{
+	return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * Decode a counted opaque string of at most @maxlen bytes in place.
+ * On success *str points into the XDR buffer and *len holds its length.
+ * Returns 0, or NFS4ERR_RESOURCE if the string cannot be decoded.
+ */
+static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len,
+		const char **str, size_t maxlen)
+{
+	ssize_t nbytes = xdr_stream_decode_opaque_inline(xdr, (void **)str,
+							 maxlen);
+
+	if (nbytes < 0)
+		return cpu_to_be32(NFS4ERR_RESOURCE);
+	*len = nbytes;
+	return 0;
+}
+
+/*
+ * Decode a file handle: a 4-byte length followed by that many bytes.
+ * The unused tail of fh->data is zeroed.
+ * Returns 0, NFS4ERR_BADHANDLE when the length exceeds NFS4_FHSIZE, or
+ * NFS4ERR_RESOURCE when the stream is too short.
+ */
+static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *word;
+
+	word = xdr_inline_decode(xdr, 4);
+	if (unlikely(!word))
+		return htonl(NFS4ERR_RESOURCE);
+
+	fh->size = ntohl(*word);
+	if (fh->size > NFS4_FHSIZE)
+		return htonl(NFS4ERR_BADHANDLE);
+
+	word = xdr_inline_decode(xdr, fh->size);
+	if (unlikely(!word))
+		return htonl(NFS4ERR_RESOURCE);
+
+	memcpy(&fh->data[0], word, fh->size);
+	memset(&fh->data[fh->size], 0, sizeof(fh->data) - fh->size);
+	return 0;
+}
+
+/*
+ * Decode an attribute bitmap: a word count followed by that many 32-bit
+ * words. Only the first two words are stored in @bitmap, which must have
+ * room for two entries; words that are not present on the wire read as 0.
+ * Returns 0, or NFS4ERR_RESOURCE when the stream is too short or the word
+ * count is implausibly large.
+ */
+static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
+{
+	__be32 *p;
+	unsigned int attrlen;
+
+	/* Never hand stale or uninitialised words back to the caller */
+	bitmap[0] = 0;
+	bitmap[1] = 0;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	attrlen = ntohl(*p);
+	/*
+	 * "attrlen << 2" is evaluated in 32 bits: a huge count would wrap to
+	 * a small byte length and pass the bounds check below while the
+	 * words read afterwards were never checked.
+	 */
+	if (unlikely(attrlen > (UINT_MAX >> 2)))
+		return htonl(NFS4ERR_RESOURCE);
+	p = xdr_inline_decode(xdr, attrlen << 2);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	if (likely(attrlen > 0))
+		bitmap[0] = ntohl(*p++);
+	if (attrlen > 1)
+		bitmap[1] = ntohl(*p);
+	return 0;
+}
+
+/*
+ * Copy NFS4_STATEID_SIZE raw bytes from the stream into stateid->data.
+ * Returns 0, or NFS4ERR_RESOURCE when the stream is too short.
+ */
+static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	__be32 *raw = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
+
+	if (unlikely(!raw))
+		return htonl(NFS4ERR_RESOURCE);
+	memcpy(stateid->data, raw, NFS4_STATEID_SIZE);
+	return 0;
+}
+
+/* Decode a stateid and tag it as a delegation stateid. */
+static __be32 decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	__be32 status;
+
+	stateid->type = NFS4_DELEGATION_STATEID_TYPE;
+	status = decode_stateid(xdr, stateid);
+	return status;
+}
+
+/*
+ * Decode the CB_COMPOUND header: tag, minor version, callback ident and
+ * operation count.
+ * Returns 0, NFS4ERR_RESOURCE on a short or malformed header, or
+ * NFS4ERR_MINOR_VERS_MISMATCH for a minor version above
+ * NFS4_MAX_MINOR_VERSION (cb_ident and nops are then left untouched).
+ */
+static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
+{
+	__be32 *words;
+	__be32 status;
+
+	status = decode_string(xdr, &hdr->taglen, &hdr->tag, CB_OP_TAGLEN_MAXSZ);
+	if (unlikely(status != 0))
+		return status;
+
+	/* minorversion, cb_ident, nops */
+	words = xdr_inline_decode(xdr, 12);
+	if (unlikely(words == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+
+	hdr->minorversion = ntohl(words[0]);
+	/* Check for minor version support */
+	if (hdr->minorversion > NFS4_MAX_MINOR_VERSION) {
+		pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "
+			"illegal minor version %u!\n",
+			__func__, hdr->minorversion);
+		return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+	}
+	hdr->cb_ident = ntohl(words[1]); /* ignored by v4.1 and v4.2 */
+	hdr->nops = ntohl(words[2]);
+	return 0;
+}
+
+/*
+ * Read the 4-byte operation number that starts each operation.
+ * Returns 0, or the internal NFS4ERR_RESOURCE_HDR code when the stream is
+ * exhausted.
+ */
+static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
+{
+	__be32 *word = xdr_inline_decode(xdr, 4);
+
+	if (unlikely(!word))
+		return htonl(NFS4ERR_RESOURCE_HDR);
+	*op = ntohl(*word);
+	return 0;
+}
+
+/* Decode CB_GETATTR arguments: a file handle, then the attribute bitmap. */
+static __be32 decode_getattr_args(struct svc_rqst *rqstp,
+		struct xdr_stream *xdr, void *argp)
+{
+	struct cb_getattrargs *args = argp;
+	__be32 status = decode_fh(xdr, &args->fh);
+
+	if (likely(status == 0))
+		status = decode_bitmap(xdr, args->bitmap);
+	return status;
+}
+
+/*
+ * Decode CB_RECALL arguments: the delegation stateid, the truncate flag
+ * and the file handle, in that order.
+ */
+static __be32 decode_recall_args(struct svc_rqst *rqstp,
+		struct xdr_stream *xdr, void *argp)
+{
+	struct cb_recallargs *args = argp;
+	__be32 *word;
+	__be32 status;
+
+	status = decode_delegation_stateid(xdr, &args->stateid);
+	if (unlikely(status != 0))
+		return status;
+
+	word = xdr_inline_decode(xdr, 4);
+	if (unlikely(!word))
+		return htonl(NFS4ERR_RESOURCE);
+	args->truncate = ntohl(*word);
+
+	return decode_fh(xdr, &args->fh);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/* Decode a stateid and tag it as a layout stateid. */
+static __be32 decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	__be32 status;
+
+	stateid->type = NFS4_LAYOUT_STATEID_TYPE;
+	status = decode_stateid(xdr, stateid);
+	return status;
+}
+
+/*
+ * Decode CB_LAYOUTRECALL arguments. The fixed part holds the layout type,
+ * iomode, "changed" flag and recall type. What follows depends on the
+ * recall type: file handle, byte range and stateid for RETURN_FILE, an
+ * fsid for RETURN_FSID, nothing for RETURN_ALL.
+ * Returns 0, NFS4ERR_BADXDR on a short stream or unknown recall type, or
+ * the error of a nested decoder.
+ */
+static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
+		struct xdr_stream *xdr, void *argp)
+{
+	struct cb_layoutrecallargs *args = argp;
+	__be32 *p;
+	__be32 status;
+	uint32_t iomode;
+
+	p = xdr_inline_decode(xdr, 4 * sizeof(uint32_t));
+	if (unlikely(!p))
+		return htonl(NFS4ERR_BADXDR);
+
+	args->cbl_layout_type = ntohl(p[0]);
+	/* Despite the spec's xdr, iomode really belongs in the FILE switch,
+	 * as it is unusable and ignored with the other types.
+	 */
+	iomode = ntohl(p[1]);
+	args->cbl_layoutchanged = ntohl(p[2]);
+	args->cbl_recall_type = ntohl(p[3]);
+
+	if (args->cbl_recall_type == RETURN_FILE) {
+		args->cbl_range.iomode = iomode;
+		status = decode_fh(xdr, &args->cbl_fh);
+		if (unlikely(status != 0))
+			return status;
+
+		/* offset and length of the recalled range */
+		p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t));
+		if (unlikely(!p))
+			return htonl(NFS4ERR_BADXDR);
+		p = xdr_decode_hyper(p, &args->cbl_range.offset);
+		xdr_decode_hyper(p, &args->cbl_range.length);
+		return decode_layout_stateid(xdr, &args->cbl_stateid);
+	}
+
+	if (args->cbl_recall_type == RETURN_FSID) {
+		p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t));
+		if (unlikely(!p))
+			return htonl(NFS4ERR_BADXDR);
+		p = xdr_decode_hyper(p, &args->cbl_fsid.major);
+		xdr_decode_hyper(p, &args->cbl_fsid.minor);
+		return 0;
+	}
+
+	if (args->cbl_recall_type != RETURN_ALL)
+		return htonl(NFS4ERR_BADXDR);
+	return 0;
+}
+
+/*
+ * Decode CB_NOTIFY_DEVICEID arguments into a freshly allocated array of
+ * notifications (args->devs, args->ndevs).
+ *
+ * Each item is a one-word bitmap holding the notify type, an opaque
+ * length, the layout type and the device ID. A CHANGE notification
+ * carries one more word, the "immediate" flag.
+ *
+ * Returns 0 on success (also for an empty list, which leaves devs NULL).
+ * On any error the array is freed, devs/ndevs are reset, and
+ * NFS4ERR_BADXDR, NFS4ERR_INVAL or NFS4ERR_DELAY (allocation failure) is
+ * returned.
+ */
+static
+__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
+				struct xdr_stream *xdr,
+				void *argp)
+{
+	struct cb_devicenotifyargs *args = argp;
+	uint32_t tmp, n, i;
+	__be32 *p;
+	__be32 status = 0;
+
+	/* Num of device notifications */
+	p = xdr_inline_decode(xdr, sizeof(uint32_t));
+	if (unlikely(p == NULL)) {
+		status = htonl(NFS4ERR_BADXDR);
+		goto out;
+	}
+	n = ntohl(*p++);
+	if (n == 0)
+		goto out;
+
+	args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL);
+	if (!args->devs) {
+		status = htonl(NFS4ERR_DELAY);
+		goto out;
+	}
+
+	/* Decode each dev notification */
+	for (i = 0; i < n; i++) {
+		struct cb_devicenotifyitem *dev = &args->devs[i];
+
+		/* bitmap size, notify type, opaque size, layout type, devid */
+		p = xdr_inline_decode(xdr, (4 * sizeof(uint32_t)) +
+				      NFS4_DEVICEID4_SIZE);
+		if (unlikely(p == NULL)) {
+			status = htonl(NFS4ERR_BADXDR);
+			goto err;
+		}
+
+		tmp = ntohl(*p++);	/* bitmap size */
+		if (tmp != 1) {
+			status = htonl(NFS4ERR_INVAL);
+			goto err;
+		}
+		dev->cbd_notify_type = ntohl(*p++);
+		if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
+		    dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
+			status = htonl(NFS4ERR_INVAL);
+			goto err;
+		}
+
+		tmp = ntohl(*p++);	/* opaque size */
+		if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
+		     (tmp != NFS4_DEVICEID4_SIZE + 8)) ||
+		    ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
+		     (tmp != NFS4_DEVICEID4_SIZE + 4))) {
+			status = htonl(NFS4ERR_INVAL);
+			goto err;
+		}
+		dev->cbd_layout_type = ntohl(*p++);
+		memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
+		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+		/*
+		 * The trailing "immediate" word belongs to CHANGE
+		 * notifications, as the opaque size check above shows. Key
+		 * on the notify type, not the layout type, so the stream
+		 * stays in step for the next item.
+		 */
+		if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) {
+			p = xdr_inline_decode(xdr, sizeof(uint32_t));
+			if (unlikely(p == NULL)) {
+				status = htonl(NFS4ERR_BADXDR);
+				goto err;
+			}
+			dev->cbd_immediate = ntohl(*p++);
+		} else {
+			dev->cbd_immediate = 0;
+		}
+
+		dprintk("%s: type %d layout 0x%x immediate %d\n",
+			__func__, dev->cbd_notify_type, dev->cbd_layout_type,
+			dev->cbd_immediate);
+	}
+	args->ndevs = n;
+	dprintk("%s: ndevs %d\n", __func__, args->ndevs);
+	return 0;
+err:
+	kfree(args->devs);
+out:
+	args->devs = NULL;
+	args->ndevs = 0;
+	dprintk("%s: status %d ndevs %d\n",
+		__func__, ntohl(status), args->ndevs);
+	return status;
+}
+
+/*
+ * Copy NFS4_MAX_SESSIONID_LEN raw bytes from the stream into @sid.
+ * Returns 0, or NFS4ERR_RESOURCE when the stream is too short.
+ */
+static __be32 decode_sessionid(struct xdr_stream *xdr,
+		struct nfs4_sessionid *sid)
+{
+	__be32 *raw = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN);
+
+	if (unlikely(!raw))
+		return htonl(NFS4ERR_RESOURCE);
+	memcpy(sid->data, raw, NFS4_MAX_SESSIONID_LEN);
+	return 0;
+}
+
+/*
+ * Decode one referring call list: a session ID, a count, and that many
+ * (sequence ID, slot ID) pairs stored in a newly allocated array.
+ *
+ * rcl_refcalls is always left either NULL or pointing at an allocation,
+ * so the caller may kfree() it unconditionally, including for an empty
+ * list.
+ *
+ * Returns 0, or NFS4ERR_RESOURCE on a short stream or allocation failure.
+ */
+static __be32 decode_rc_list(struct xdr_stream *xdr,
+			       struct referring_call_list *rc_list)
+{
+	__be32 *p;
+	uint32_t i;
+	__be32 status;
+
+	/*
+	 * The containing array is not zeroed by the caller's allocation;
+	 * without this an empty list would leave a garbage pointer behind
+	 * that is later passed to kfree().
+	 */
+	rc_list->rcl_nrefcalls = 0;
+	rc_list->rcl_refcalls = NULL;
+
+	status = decode_sessionid(xdr, &rc_list->rcl_sessionid);
+	if (status)
+		goto out;
+
+	status = htonl(NFS4ERR_RESOURCE);
+	p = xdr_inline_decode(xdr, sizeof(uint32_t));
+	if (unlikely(p == NULL))
+		goto out;
+
+	rc_list->rcl_nrefcalls = ntohl(*p++);
+	if (rc_list->rcl_nrefcalls) {
+		/* Each referring call is two words: sequence ID and slot ID */
+		p = xdr_inline_decode(xdr,
+			     rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t));
+		if (unlikely(p == NULL))
+			goto out;
+		rc_list->rcl_refcalls = kmalloc_array(rc_list->rcl_nrefcalls,
+						sizeof(*rc_list->rcl_refcalls),
+						GFP_KERNEL);
+		if (unlikely(rc_list->rcl_refcalls == NULL))
+			goto out;
+		for (i = 0; i < rc_list->rcl_nrefcalls; i++) {
+			rc_list->rcl_refcalls[i].rc_sequenceid = ntohl(*p++);
+			rc_list->rcl_refcalls[i].rc_slotid = ntohl(*p++);
+		}
+	}
+	status = 0;
+
+out:
+	return status;
+}
+
+/*
+ * Decode CB_SEQUENCE arguments: session ID, five fixed words (sequence ID,
+ * slot ID, highest slot ID, cachethis, number of referring call lists)
+ * and then the referring call lists themselves.
+ *
+ * On success args->csa_rclists is NULL or an allocated array that the
+ * operation handler frees. On a list decode failure everything allocated
+ * here is freed and csa_nrclists is cut back to the number of lists that
+ * were decoded.
+ */
+static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *argp)
+{
+	struct cb_sequenceargs *args = argp;
+	__be32 *words;
+	int idx;
+	__be32 status;
+
+	status = decode_sessionid(xdr, &args->csa_sessionid);
+	if (status)
+		return status;
+
+	words = xdr_inline_decode(xdr, 5 * sizeof(uint32_t));
+	if (unlikely(words == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+
+	args->csa_addr = svc_addr(rqstp);
+	args->csa_sequenceid = ntohl(words[0]);
+	args->csa_slotid = ntohl(words[1]);
+	args->csa_highestslotid = ntohl(words[2]);
+	args->csa_cachethis = ntohl(words[3]);
+	args->csa_nrclists = ntohl(words[4]);
+	args->csa_rclists = NULL;
+	if (!args->csa_nrclists)
+		return 0;
+
+	args->csa_rclists = kmalloc_array(args->csa_nrclists,
+					  sizeof(*args->csa_rclists),
+					  GFP_KERNEL);
+	if (unlikely(args->csa_rclists == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+
+	for (idx = 0; idx < args->csa_nrclists; idx++) {
+		status = decode_rc_list(xdr, &args->csa_rclists[idx]);
+		if (status)
+			goto out_free;
+	}
+	return 0;
+
+out_free:
+	/* Only the lists before the failing one hold an allocation */
+	args->csa_nrclists = idx;
+	while (idx-- > 0)
+		kfree(args->csa_rclists[idx].rcl_refcalls);
+	kfree(args->csa_rclists);
+	return status;
+}
+
+/*
+ * Decode CB_RECALL_ANY arguments: the number of objects to keep, then a
+ * bitmap whose first word becomes the type mask. A bitmap with no words
+ * yields a type mask of 0.
+ * Returns 0, NFS4ERR_BADXDR on a short stream, or decode_bitmap()'s error.
+ */
+static __be32 decode_recallany_args(struct svc_rqst *rqstp,
+				      struct xdr_stream *xdr,
+				      void *argp)
+{
+	struct cb_recallanyargs *args = argp;
+	/*
+	 * Zeroed so that bitmap[0] is never read uninitialised when the
+	 * wire bitmap is empty.
+	 */
+	uint32_t bitmap[2] = { 0, 0 };
+	__be32 *p, status;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_BADXDR);
+	args->craa_objs_to_keep = ntohl(*p++);
+	status = decode_bitmap(xdr, bitmap);
+	if (unlikely(status))
+		return status;
+	args->craa_type_mask = bitmap[0];
+
+	return 0;
+}
+
+/*
+ * Decode CB_RECALL_SLOT arguments: a single word, the target highest
+ * slot ID. Returns 0, or NFS4ERR_BADXDR when the stream is too short.
+ */
+static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *argp)
+{
+	struct cb_recallslotargs *args = argp;
+	__be32 *word = xdr_inline_decode(xdr, 4);
+
+	if (unlikely(!word))
+		return htonl(NFS4ERR_BADXDR);
+	args->crsa_target_highest_slotid = ntohl(*word);
+	return 0;
+}
+
+/*
+ * Decode a lock owner: an 8-byte client ID followed by a counted opaque
+ * owner string. Only a 20-byte owner is parsed (8 bytes skipped, then a
+ * 4-byte s_dev and an 8-byte id); any other length marks the arguments as
+ * not valid and zeroes s_dev and id.
+ * Returns 0, or NFS4ERR_BADXDR when the stream is too short.
+ */
+static __be32 decode_lockowner(struct xdr_stream *xdr, struct cb_notify_lock_args *args)
+{
+	__be32 *p;
+	unsigned int owner_len;
+
+	/* client ID (8 bytes) + owner length (4 bytes) */
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		return htonl(NFS4ERR_BADXDR);
+
+	p = xdr_decode_hyper(p, &args->cbnl_owner.clientid);
+	owner_len = be32_to_cpu(*p);
+
+	p = xdr_inline_decode(xdr, owner_len);
+	if (unlikely(!p))
+		return htonl(NFS4ERR_BADXDR);
+
+	/* Only try to decode if the length is right */
+	if (owner_len != 20) {
+		args->cbnl_owner.s_dev = 0;
+		args->cbnl_owner.id = 0;
+		args->cbnl_valid = false;
+		return 0;
+	}
+
+	p += 2;	/* skip "lock id:" */
+	args->cbnl_owner.s_dev = be32_to_cpu(*p++);
+	xdr_decode_hyper(p, &args->cbnl_owner.id);
+	args->cbnl_valid = true;
+	return 0;
+}
+
+/* Decode CB_NOTIFY_LOCK arguments: a file handle, then the lock owner. */
+static __be32 decode_notify_lock_args(struct svc_rqst *rqstp,
+		struct xdr_stream *xdr, void *argp)
+{
+	struct cb_notify_lock_args *args = argp;
+	__be32 status = decode_fh(xdr, &args->cbnl_fh);
+
+	if (likely(status == 0))
+		status = decode_lockowner(xdr, args);
+	return status;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+#ifdef CONFIG_NFS_V4_2
+/*
+ * Decode the write response of a successful CB_OFFLOAD: a leading word
+ * that is ignored, the byte count, the stable_how value and the write
+ * verifier.
+ * Returns 0, or NFS4ERR_RESOURCE when the stream is too short.
+ */
+static __be32 decode_write_response(struct xdr_stream *xdr,
+		struct cb_offloadargs *args)
+{
+	__be32 *p;
+
+	/* skip the always zero field */
+	if (unlikely(xdr_inline_decode(xdr, 4) == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+
+	/* decode count, stable_how, verifier */
+	p = xdr_inline_decode(xdr, 8 + 4);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	p = xdr_decode_hyper(p, &args->wr_count);
+	args->wr_writeverf.committed = be32_to_cpup(p);
+
+	p = xdr_inline_decode(xdr, NFS4_VERIFIER_SIZE);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	memcpy(&args->wr_writeverf.verifier.data[0], p,
+		NFS4_VERIFIER_SIZE);
+	return 0;
+}
+
+/*
+ * Decode CB_OFFLOAD arguments: file handle, stateid and status word. A
+ * zero status is followed by a full write response; a non-zero status is
+ * followed only by the byte count.
+ * Returns 0, NFS4ERR_RESOURCE on a short stream, or the error of a nested
+ * decoder.
+ */
+static __be32 decode_offload_args(struct svc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *data)
+{
+	struct cb_offloadargs *args = data;
+	__be32 *p;
+	__be32 status;
+
+	/* decode fh */
+	status = decode_fh(xdr, &args->coa_fh);
+	if (unlikely(status != 0))
+		return status;
+
+	/* decode stateid */
+	status = decode_stateid(xdr, &args->coa_stateid);
+	if (unlikely(status != 0))
+		return status;
+
+	/* decode status */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	args->error = ntohl(*p);
+
+	if (!args->error)
+		return decode_write_response(xdr, args);
+
+	/* failed copy: only the count of bytes written follows */
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	xdr_decode_hyper(p, &args->wr_count);
+	return 0;
+}
+#endif /* CONFIG_NFS_V4_2 */
+/*
+ * Encode @len bytes of @str as a counted opaque.
+ * Returns 0, or NFS4ERR_RESOURCE when the encoder reports an error.
+ */
+static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
+{
+	ssize_t written = xdr_stream_encode_opaque(xdr, str, len);
+
+	if (unlikely(written < 0))
+		return cpu_to_be32(NFS4ERR_RESOURCE);
+	return 0;
+}
+
+/*
+ * Encode @sz bitmap words as a counted uint32 array.
+ * Returns 0, or NFS4ERR_RESOURCE when the encoder reports an error.
+ */
+static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, size_t sz)
+{
+	ssize_t written = xdr_stream_encode_uint32_array(xdr, bitmap, sz);
+
+	if (written < 0)
+		return cpu_to_be32(NFS4ERR_RESOURCE);
+	return 0;
+}
+
+/*
+ * Encode the change attribute if FATTR4_WORD0_CHANGE is set in the bitmap;
+ * otherwise emit nothing. Returns 0 or NFS4ERR_RESOURCE.
+ */
+static __be32 encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change)
+{
+	__be32 *slot;
+
+	if ((bitmap[0] & FATTR4_WORD0_CHANGE) == 0)
+		return 0;
+
+	slot = xdr_reserve_space(xdr, 8);
+	if (unlikely(slot == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	xdr_encode_hyper(slot, change);
+	return 0;
+}
+
+/*
+ * Encode the size attribute if FATTR4_WORD0_SIZE is set in the bitmap;
+ * otherwise emit nothing. Returns 0 or NFS4ERR_RESOURCE.
+ */
+static __be32 encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size)
+{
+	__be32 *slot;
+
+	if ((bitmap[0] & FATTR4_WORD0_SIZE) == 0)
+		return 0;
+
+	slot = xdr_reserve_space(xdr, 8);
+	if (unlikely(slot == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	xdr_encode_hyper(slot, size);
+	return 0;
+}
+
+/*
+ * Encode a timestamp as 12 bytes: 64-bit seconds, then 32-bit nanoseconds.
+ * Returns 0 or NFS4ERR_RESOURCE.
+ */
+static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec64 *time)
+{
+	__be32 *slot = xdr_reserve_space(xdr, 12);
+
+	if (unlikely(slot == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	slot = xdr_encode_hyper(slot, time->tv_sec);
+	*slot = htonl(time->tv_nsec);
+	return 0;
+}
+
+/* Encode the timestamp only when FATTR4_WORD1_TIME_METADATA is requested. */
+static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec64 *time)
+{
+	if (bitmap[1] & FATTR4_WORD1_TIME_METADATA)
+		return encode_attr_time(xdr, time);
+	return 0;
+}
+
+/* Encode the timestamp only when FATTR4_WORD1_TIME_MODIFY is requested. */
+static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec64 *time)
+{
+	if (bitmap[1] & FATTR4_WORD1_TIME_MODIFY)
+		return encode_attr_time(xdr, time);
+	return 0;
+}
+
+/*
+ * Lay out the CB_COMPOUND reply header: a status word, the tag and an
+ * operation count. The status and count words are only reserved here;
+ * their addresses are saved in @hdr so the caller can fill them in later.
+ * Returns 0 or NFS4ERR_RESOURCE.
+ */
+static __be32 encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr)
+{
+	__be32 err;
+
+	hdr->status = xdr_reserve_space(xdr, 4);
+	if (unlikely(!hdr->status))
+		return htonl(NFS4ERR_RESOURCE);
+
+	err = encode_string(xdr, hdr->taglen, hdr->tag);
+	if (unlikely(err != 0))
+		return err;
+
+	hdr->nops = xdr_reserve_space(xdr, 4);
+	if (unlikely(!hdr->nops))
+		return htonl(NFS4ERR_RESOURCE);
+	return 0;
+}
+
+/*
+ * Encode an operation result header: the opcode and its status (@res is
+ * already in network byte order).
+ * Returns 0, or the internal NFS4ERR_RESOURCE_HDR code when there is no
+ * room left.
+ */
+static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res)
+{
+	__be32 *slot = xdr_reserve_space(xdr, 8);
+
+	if (unlikely(!slot))
+		return htonl(NFS4ERR_RESOURCE_HDR);
+	slot[0] = htonl(op);
+	slot[1] = res;
+	return 0;
+}
+
+/*
+ * Encode the CB_GETATTR result: the attribute bitmap, a length word, and
+ * then the change, size, ctime and mtime attributes (each emitted only if
+ * its bit is set in the bitmap).
+ *
+ * Nothing is encoded when res->status is already non-zero. Returns
+ * res->status, the first encoding error, or 0.
+ */
+static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr,
+ const void *resp)
+{
+ const struct cb_getattrres *res = resp;
+ __be32 *savep = NULL;
+ __be32 status = res->status;
+
+ if (unlikely(status != 0))
+ goto out;
+ status = encode_attr_bitmap(xdr, res->bitmap, ARRAY_SIZE(res->bitmap));
+ if (unlikely(status != 0))
+ goto out;
+ status = cpu_to_be32(NFS4ERR_RESOURCE);
+ /* Reserve the attribute length word; it is back-patched below */
+ savep = xdr_reserve_space(xdr, sizeof(*savep));
+ if (unlikely(!savep))
+ goto out;
+ status = encode_attr_change(xdr, res->bitmap, res->change_attr);
+ if (unlikely(status != 0))
+ goto out;
+ status = encode_attr_size(xdr, res->bitmap, res->size);
+ if (unlikely(status != 0))
+ goto out;
+ status = encode_attr_ctime(xdr, res->bitmap, &res->ctime);
+ if (unlikely(status != 0))
+ goto out;
+ status = encode_attr_mtime(xdr, res->bitmap, &res->mtime);
+ /*
+ * Number of bytes encoded after the length word. This is written
+ * whether or not the mtime encode above succeeded.
+ */
+ *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1)));
+out:
+ return status;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+/*
+ * Append the NFS4_MAX_SESSIONID_LEN raw bytes of @sid to the stream.
+ * Returns 0 or NFS4ERR_RESOURCE.
+ */
+static __be32 encode_sessionid(struct xdr_stream *xdr,
+		const struct nfs4_sessionid *sid)
+{
+	__be32 *dst = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
+
+	if (unlikely(!dst))
+		return htonl(NFS4ERR_RESOURCE);
+	memcpy(dst, sid, NFS4_MAX_SESSIONID_LEN);
+	return 0;
+}
+
+/*
+ * Encode the CB_SEQUENCE result: session ID, sequence ID, slot ID,
+ * highest slot ID and target highest slot ID.
+ * Nothing is encoded when res->csr_status is non-zero; that status is
+ * returned as is. Otherwise returns 0 or NFS4ERR_RESOURCE.
+ */
+static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		const void *resp)
+{
+	const struct cb_sequenceres *res = resp;
+	__be32 *words;
+	__be32 status = res->csr_status;
+
+	if (unlikely(status != 0))
+		return status;
+
+	status = encode_sessionid(xdr, &res->csr_sessionid);
+	if (status)
+		return status;
+
+	words = xdr_reserve_space(xdr, 4 * sizeof(uint32_t));
+	if (unlikely(!words))
+		return htonl(NFS4ERR_RESOURCE);
+
+	words[0] = htonl(res->csr_sequenceid);
+	words[1] = htonl(res->csr_slotid);
+	words[2] = htonl(res->csr_highestslotid);
+	words[3] = htonl(res->csr_target_highestslotid);
+	return 0;
+}
+
+/*
+ * Select the handler for an NFSv4.1 callback operation.
+ *
+ * CB_SEQUENCE must be the first operation of the compound (@nop == 0)
+ * and no other operation may be first. On success *op points at the
+ * table entry and NFS_OK is returned; otherwise *op is left untouched and
+ * NFS4ERR_SEQUENCE_POS, NFS4ERR_OP_NOT_IN_SESSION, NFS4ERR_NOTSUPP or
+ * NFS4ERR_OP_ILLEGAL is returned.
+ */
+static __be32
+preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+	bool is_sequence = (op_nr == OP_CB_SEQUENCE);
+
+	if (is_sequence && nop != 0)
+		return htonl(NFS4ERR_SEQUENCE_POS);
+	if (!is_sequence && nop == 0)
+		return htonl(NFS4ERR_OP_NOT_IN_SESSION);
+
+	switch (op_nr) {
+	/* operations implemented by this client */
+	case OP_CB_GETATTR:
+	case OP_CB_RECALL:
+	case OP_CB_SEQUENCE:
+	case OP_CB_RECALL_ANY:
+	case OP_CB_RECALL_SLOT:
+	case OP_CB_LAYOUTRECALL:
+	case OP_CB_NOTIFY_DEVICEID:
+	case OP_CB_NOTIFY_LOCK:
+		*op = &callback_ops[op_nr];
+		return htonl(NFS_OK);
+
+	/* known operations without a handler */
+	case OP_CB_NOTIFY:
+	case OP_CB_PUSH_DELEG:
+	case OP_CB_RECALLABLE_OBJ_AVAIL:
+	case OP_CB_WANTS_CANCELLED:
+		return htonl(NFS4ERR_NOTSUPP);
+
+	default:
+		return htonl(NFS4ERR_OP_ILLEGAL);
+	}
+}
+
+/* Release a back channel slot under the slot table lock. */
+static void nfs4_callback_free_slot(struct nfs4_session *session,
+		struct nfs4_slot *slot)
+{
+	struct nfs4_slot_table *bc_tbl = &session->bc_slot_table;
+
+	spin_lock(&bc_tbl->slot_tbl_lock);
+	/*
+	 * Let the state manager know callback processing done.
+	 * A single slot, so highest used slotid is either 0 or -1
+	 */
+	nfs4_free_slot(bc_tbl, slot);
+	spin_unlock(&bc_tbl->slot_tbl_lock);
+}
+
+/* Release the slot held by this compound, if any, and forget it. */
+static void nfs4_cb_free_slot(struct cb_process_state *cps)
+{
+	if (!cps->slot)
+		return;
+
+	nfs4_callback_free_slot(cps->clp->cl_session, cps->slot);
+	cps->slot = NULL;
+}
+
+#else /* CONFIG_NFS_V4_1 */
+
+/* Without CONFIG_NFS_V4_1 every NFSv4.1 callback operation is refused. */
+static __be32
+preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+	return cpu_to_be32(NFS4ERR_MINOR_VERS_MISMATCH);
+}
+
+/* Without CONFIG_NFS_V4_1 there is never a slot to release. */
+static void nfs4_cb_free_slot(struct cb_process_state *cps)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+#ifdef CONFIG_NFS_V4_2
+/*
+ * Select the handler for an NFSv4.2 callback operation.
+ *
+ * Everything the NFSv4.1 selector accepts or rejects with a status other
+ * than NFS4ERR_OP_ILLEGAL is passed through. Of the remaining operations
+ * only CB_OFFLOAD has a handler; any other gets NFS4ERR_NOTSUPP.
+ */
+static __be32
+preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+	__be32 status = preprocess_nfs41_op(nop, op_nr, op);
+
+	if (status != htonl(NFS4ERR_OP_ILLEGAL))
+		return status;
+
+	if (op_nr != OP_CB_OFFLOAD)
+		return htonl(NFS4ERR_NOTSUPP);
+
+	*op = &callback_ops[op_nr];
+	return htonl(NFS_OK);
+}
+#else /* CONFIG_NFS_V4_2 */
+/* Without CONFIG_NFS_V4_2 every NFSv4.2 callback operation is refused. */
+static __be32
+preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+	return cpu_to_be32(NFS4ERR_MINOR_VERS_MISMATCH);
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+/*
+ * Select the handler for an NFSv4.0 callback operation: only CB_GETATTR
+ * and CB_RECALL are accepted, anything else is NFS4ERR_OP_ILLEGAL.
+ */
+static __be32
+preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
+{
+	if (op_nr != OP_CB_GETATTR && op_nr != OP_CB_RECALL)
+		return htonl(NFS4ERR_OP_ILLEGAL);
+
+	*op = &callback_ops[op_nr];
+	return htonl(NFS_OK);
+}
+
+/*
+ * Decode, execute and encode a single operation of a CB_COMPOUND.
+ *
+ * @nop is the zero-based position of the operation in the compound. The
+ * handler is chosen according to cps->minorversion. An operation header
+ * (opcode + status) is always encoded; the operation's result body is
+ * encoded only when the operation succeeded and the handler has an
+ * encode_res method.
+ *
+ * Returns the operation's status, or the error from decode_op_hdr() /
+ * encode_op_hdr() when the header itself could not be processed.
+ */
+static __be32 process_op(int nop, struct svc_rqst *rqstp,
+ struct xdr_stream *xdr_in, void *argp,
+ struct xdr_stream *xdr_out, void *resp,
+ struct cb_process_state *cps)
+{
+ /* Entry 0 has no methods; it is what remains selected on failure */
+ struct callback_op *op = &callback_ops[0];
+ unsigned int op_nr;
+ __be32 status;
+ long maxlen;
+ __be32 res;
+
+ status = decode_op_hdr(xdr_in, &op_nr);
+ if (unlikely(status))
+ return status;
+
+ switch (cps->minorversion) {
+ case 0:
+ status = preprocess_nfs4_op(op_nr, &op);
+ break;
+ case 1:
+ status = preprocess_nfs41_op(nop, op_nr, &op);
+ break;
+ case 2:
+ status = preprocess_nfs42_op(nop, op_nr, &op);
+ break;
+ default:
+ status = htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+ }
+
+ /* Illegal operations are reported under the OP_CB_ILLEGAL opcode */
+ if (status == htonl(NFS4ERR_OP_ILLEGAL))
+ op_nr = OP_CB_ILLEGAL;
+ if (status)
+ goto encode_hdr;
+
+ /* An error deferred by CB_SEQUENCE is reported by this operation */
+ if (cps->drc_status) {
+ status = cps->drc_status;
+ goto encode_hdr;
+ }
+
+ /* Only run the operation while the reply buffer still has room */
+ maxlen = xdr_out->end - xdr_out->p;
+ if (maxlen > 0 && maxlen < PAGE_SIZE) {
+ status = op->decode_args(rqstp, xdr_in, argp);
+ if (likely(status == 0))
+ status = op->process_op(argp, resp, cps);
+ } else
+ status = htonl(NFS4ERR_RESOURCE);
+
+encode_hdr:
+ res = encode_op_hdr(xdr_out, op_nr, status);
+ if (unlikely(res))
+ return res;
+ if (op->encode_res != NULL && status == 0)
+ status = op->encode_res(rqstp, xdr_out, resp);
+ return status;
+}
+
+/*
+ * Decode, process and encode a COMPOUND
+ */
+/*
+ * Service routine for CB_COMPOUND.
+ *
+ * Decodes the compound header, runs the operations one by one until one
+ * fails or all are done, then back-patches the overall status and the
+ * number of operations into the reply header.
+ *
+ * For minor version 0 the client is looked up from the callback ident and
+ * its GSS principal is checked here; for other minor versions cps.clp is
+ * filled in by the CB_SEQUENCE handler.
+ *
+ * Returns an RPC accept status: rpc_success, rpc_garbage_args when the
+ * header cannot be decoded, rpc_system_err when the reply header cannot
+ * be encoded, or the result of svc_return_autherr() for a bad credential.
+ */
+static __be32 nfs4_callback_compound(struct svc_rqst *rqstp)
+{
+ struct cb_compound_hdr_arg hdr_arg = { 0 };
+ struct cb_compound_hdr_res hdr_res = { NULL };
+ struct xdr_stream xdr_in, xdr_out;
+ __be32 *p, status;
+ struct cb_process_state cps = {
+ .drc_status = 0,
+ .clp = NULL,
+ .net = SVC_NET(rqstp),
+ };
+ unsigned int nops = 0;
+
+ xdr_init_decode(&xdr_in, &rqstp->rq_arg,
+ rqstp->rq_arg.head[0].iov_base, NULL);
+
+ /* Start encoding right after what is already in the reply head */
+ p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
+ xdr_init_encode(&xdr_out, &rqstp->rq_res, p, NULL);
+
+ status = decode_compound_hdr_arg(&xdr_in, &hdr_arg);
+ if (status == htonl(NFS4ERR_RESOURCE))
+ return rpc_garbage_args;
+
+ if (hdr_arg.minorversion == 0) {
+ cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident);
+ if (!cps.clp) {
+ trace_nfs_cb_no_clp(rqstp->rq_xid, hdr_arg.cb_ident);
+ goto out_invalidcred;
+ }
+ if (!check_gss_callback_principal(cps.clp, rqstp)) {
+ trace_nfs_cb_badprinc(rqstp->rq_xid, hdr_arg.cb_ident);
+ nfs_put_client(cps.clp);
+ goto out_invalidcred;
+ }
+ }
+
+ cps.minorversion = hdr_arg.minorversion;
+ /* The reply echoes the request's tag */
+ hdr_res.taglen = hdr_arg.taglen;
+ hdr_res.tag = hdr_arg.tag;
+ if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) {
+ if (cps.clp)
+ nfs_put_client(cps.clp);
+ return rpc_system_err;
+ }
+ /* Not entered at all if the header decode left a non-zero status */
+ while (status == 0 && nops != hdr_arg.nops) {
+ status = process_op(nops, rqstp, &xdr_in,
+ rqstp->rq_argp, &xdr_out, rqstp->rq_resp,
+ &cps);
+ nops++;
+ }
+
+ /* Buffer overflow in decode_ops_hdr or encode_ops_hdr. Return
+ * resource error in cb_compound status without returning op */
+ if (unlikely(status == htonl(NFS4ERR_RESOURCE_HDR))) {
+ status = htonl(NFS4ERR_RESOURCE);
+ nops--;
+ }
+
+ /* Fill in the two words reserved by encode_compound_hdr_res() */
+ *hdr_res.status = status;
+ *hdr_res.nops = htonl(nops);
+ nfs4_cb_free_slot(&cps);
+ /* NOTE(review): cps.clp may be NULL here - confirm nfs_put_client() accepts NULL */
+ nfs_put_client(cps.clp);
+ return rpc_success;
+
+out_invalidcred:
+ pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n");
+ return svc_return_autherr(rqstp, rpc_autherr_badcred);
+}
+
+/*
+ * Define NFS4 callback COMPOUND ops.
+ */
+/*
+ * Indexed by CB operation number. Entries without an .encode_res method
+ * reply with the operation header only.
+ */
+static struct callback_op callback_ops[] = {
+ /* Placeholder without methods; process_op() starts out pointing here */
+ [0] = {
+ .res_maxsize = CB_OP_HDR_RES_MAXSZ,
+ },
+ [OP_CB_GETATTR] = {
+ .process_op = nfs4_callback_getattr,
+ .decode_args = decode_getattr_args,
+ .encode_res = encode_getattr_res,
+ .res_maxsize = CB_OP_GETATTR_RES_MAXSZ,
+ },
+ [OP_CB_RECALL] = {
+ .process_op = nfs4_callback_recall,
+ .decode_args = decode_recall_args,
+ .res_maxsize = CB_OP_RECALL_RES_MAXSZ,
+ },
+#if defined(CONFIG_NFS_V4_1)
+ [OP_CB_LAYOUTRECALL] = {
+ .process_op = nfs4_callback_layoutrecall,
+ .decode_args = decode_layoutrecall_args,
+ .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
+ },
+ [OP_CB_NOTIFY_DEVICEID] = {
+ .process_op = nfs4_callback_devicenotify,
+ .decode_args = decode_devicenotify_args,
+ .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ,
+ },
+ [OP_CB_SEQUENCE] = {
+ .process_op = nfs4_callback_sequence,
+ .decode_args = decode_cb_sequence_args,
+ .encode_res = encode_cb_sequence_res,
+ .res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ,
+ },
+ [OP_CB_RECALL_ANY] = {
+ .process_op = nfs4_callback_recallany,
+ .decode_args = decode_recallany_args,
+ .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ,
+ },
+ [OP_CB_RECALL_SLOT] = {
+ .process_op = nfs4_callback_recallslot,
+ .decode_args = decode_recallslot_args,
+ .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ,
+ },
+ [OP_CB_NOTIFY_LOCK] = {
+ .process_op = nfs4_callback_notify_lock,
+ .decode_args = decode_notify_lock_args,
+ .res_maxsize = CB_OP_NOTIFY_LOCK_RES_MAXSZ,
+ },
+#endif /* CONFIG_NFS_V4_1 */
+#ifdef CONFIG_NFS_V4_2
+ [OP_CB_OFFLOAD] = {
+ .process_op = nfs4_callback_offload,
+ .decode_args = decode_offload_args,
+ .res_maxsize = CB_OP_OFFLOAD_RES_MAXSZ,
+ },
+#endif /* CONFIG_NFS_V4_2 */
+};
+
+/*
+ * Define NFS4 callback procedures
+ */
+/*
+ * RPC procedure table of the callback program: CB_NULL and CB_COMPOUND.
+ * CB_COMPOUND sets no .pc_decode; nfs4_callback_compound() decodes its
+ * arguments itself.
+ */
+static const struct svc_procedure nfs4_callback_procedures1[] = {
+ [CB_NULL] = {
+ .pc_func = nfs4_callback_null,
+ .pc_decode = nfs4_decode_void,
+ .pc_encode = nfs4_encode_void,
+ .pc_xdrressize = 1,
+ },
+ [CB_COMPOUND] = {
+ .pc_func = nfs4_callback_compound,
+ .pc_encode = nfs4_encode_void,
+ .pc_argsize = 256,
+ .pc_ressize = 256,
+ .pc_xdrressize = NFS4_CALLBACK_BUFSIZE,
+ }
+};
+
+/* Version 1 of the callback program, with its own per-procedure counter array */
+static unsigned int nfs4_callback_count1[ARRAY_SIZE(nfs4_callback_procedures1)];
+const struct svc_version nfs4_callback_version1 = {
+ .vs_vers = 1,
+ .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1),
+ .vs_proc = nfs4_callback_procedures1,
+ .vs_count = nfs4_callback_count1,
+ .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
+ .vs_dispatch = NULL,
+ .vs_hidden = true,
+ .vs_need_cong_ctrl = true,
+};
+
+/*
+ * Version 4 of the callback program: same procedure table as version 1,
+ * separate counter array.
+ */
+static unsigned int nfs4_callback_count4[ARRAY_SIZE(nfs4_callback_procedures1)];
+const struct svc_version nfs4_callback_version4 = {
+ .vs_vers = 4,
+ .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1),
+ .vs_proc = nfs4_callback_procedures1,
+ .vs_count = nfs4_callback_count4,
+ .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
+ .vs_dispatch = NULL,
+ .vs_hidden = true,
+ .vs_need_cong_ctrl = true,
+};
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
new file mode 100644
index 000000000..818ff8b1b
--- /dev/null
+++ b/fs/nfs/client.c
@@ -0,0 +1,1358 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* client.c: NFS client sharing and management code
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/xprtsock.h>
+#include <linux/sunrpc/xprtrdma.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <net/ipv6.h>
+#include <linux/nfs_xdr.h>
+#include <linux/sunrpc/bc_xprt.h>
+#include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
+
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+#include "fscache.h"
+#include "pnfs.h"
+#include "nfs.h"
+#include "netns.h"
+#include "sysfs.h"
+#include "nfs42.h"
+
+#define NFSDBG_FACILITY NFSDBG_CLIENT
+
+static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
+static DEFINE_SPINLOCK(nfs_version_lock);
+static DEFINE_MUTEX(nfs_version_mutex);
+static LIST_HEAD(nfs_versions);
+
+/*
+ * RPC cruft for NFS
+ *
+ * Table of RPC version descriptors indexed by NFS version number.  Every
+ * slot starts out NULL; register_nfs_version() and unregister_nfs_version()
+ * fill in and clear entries at run time.
+ */
+static const struct rpc_version *nfs_version[5] = {
+ [2] = NULL,
+ [3] = NULL,
+ [4] = NULL,
+};
+
+/*
+ * RPC program description for NFS.  Its version table is nfs_version[]
+ * and its statistics block is nfs_rpcstat (defined just below).
+ */
+const struct rpc_program nfs_program = {
+ .name = "nfs",
+ .number = NFS_PROGRAM,
+ .nrvers = ARRAY_SIZE(nfs_version),
+ .version = nfs_version,
+ .stats = &nfs_rpcstat,
+ .pipe_dir_name = NFS_PIPE_DIRNAME,
+};
+
+/* RPC statistics block for nfs_program; points back at the program. */
+struct rpc_stat nfs_rpcstat = {
+ .program = &nfs_program
+};
+
+/*
+ * Look up a registered NFS version by its version number.
+ *
+ * Walks nfs_versions under nfs_version_lock.  Returns the matching
+ * nfs_subversion, or ERR_PTR(-EPROTONOSUPPORT) when none is registered.
+ * No module reference is taken here.
+ */
+static struct nfs_subversion *find_nfs_version(unsigned int version)
+{
+	struct nfs_subversion *found = ERR_PTR(-EPROTONOSUPPORT);
+	struct nfs_subversion *pos;
+
+	spin_lock(&nfs_version_lock);
+	list_for_each_entry(pos, &nfs_versions, list) {
+		if (pos->rpc_ops->version != version)
+			continue;
+		found = pos;
+		break;
+	}
+	spin_unlock(&nfs_version_lock);
+
+	return found;
+}
+
+/*
+ * Find the NFS version module for @version and take a reference on it.
+ *
+ * If the version is not registered yet, try to load the "nfsv<N>" module
+ * (serialised by nfs_version_mutex) and look again.
+ *
+ * Returns the nfs_subversion with a module reference held (release it with
+ * put_nfs_version()), the ERR_PTR from find_nfs_version() if the version
+ * is still unknown, or ERR_PTR(-EAGAIN) if the module reference could not
+ * be taken.
+ */
+struct nfs_subversion *get_nfs_version(unsigned int version)
+{
+	struct nfs_subversion *nfs = find_nfs_version(version);
+
+	if (IS_ERR(nfs)) {
+		mutex_lock(&nfs_version_mutex);
+		/* @version is unsigned, so format it with %u rather than %d */
+		request_module("nfsv%u", version);
+		nfs = find_nfs_version(version);
+		mutex_unlock(&nfs_version_mutex);
+	}
+
+	if (!IS_ERR(nfs) && !try_module_get(nfs->owner))
+		return ERR_PTR(-EAGAIN);
+	return nfs;
+}
+
+/* Drop the module reference held on the module that owns @nfs. */
+void put_nfs_version(struct nfs_subversion *nfs)
+{
+ module_put(nfs->owner);
+}
+
+/*
+ * Make an NFS version available: add @nfs to the nfs_versions list and
+ * publish its rpc_vers in the nfs_version[] table, both under
+ * nfs_version_lock.
+ *
+ * NOTE(review): rpc_ops->version is used as an index into nfs_version[]
+ * (5 slots) without a bounds check - presumably only versions 2-4 are
+ * ever registered; confirm.
+ */
+void register_nfs_version(struct nfs_subversion *nfs)
+{
+ spin_lock(&nfs_version_lock);
+
+ list_add(&nfs->list, &nfs_versions);
+ nfs_version[nfs->rpc_ops->version] = nfs->rpc_vers;
+
+ spin_unlock(&nfs_version_lock);
+}
+EXPORT_SYMBOL_GPL(register_nfs_version);
+
+/*
+ * Undo register_nfs_version(): clear the nfs_version[] slot for @nfs and
+ * remove it from the nfs_versions list, both under nfs_version_lock.
+ */
+void unregister_nfs_version(struct nfs_subversion *nfs)
+{
+ spin_lock(&nfs_version_lock);
+
+ nfs_version[nfs->rpc_ops->version] = NULL;
+ list_del(&nfs->list);
+
+ spin_unlock(&nfs_version_lock);
+}
+EXPORT_SYMBOL_GPL(unregister_nfs_version);
+
+/*
+ * Allocate a shared client record
+ *
+ * Since these are allocated/deallocated very rarely, we don't
+ * bother putting them in a slab cache...
+ *
+ * Takes a reference on the NFS version module and on the network
+ * namespace named in @cl_init, and copies the server address and (if
+ * given) the hostname.  The new record starts with a reference count of 1,
+ * in state NFS_CS_INITING, and with cl_rpcclient set to ERR_PTR(-EINVAL).
+ *
+ * Returns the new client, or ERR_PTR(-ENOMEM) on any failure (including a
+ * failure to take the module reference).
+ */
+struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
+{
+ struct nfs_client *clp;
+ int err = -ENOMEM;
+
+ if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
+ goto error_0;
+
+ clp->cl_minorversion = cl_init->minorversion;
+ clp->cl_nfs_mod = cl_init->nfs_mod;
+ /* pin the version module for as long as this client exists */
+ if (!try_module_get(clp->cl_nfs_mod->owner))
+ goto error_dealloc;
+
+ clp->rpc_ops = clp->cl_nfs_mod->rpc_ops;
+
+ refcount_set(&clp->cl_count, 1);
+ clp->cl_cons_state = NFS_CS_INITING;
+
+ /*
+ * NOTE(review): addrlen is not checked against sizeof(clp->cl_addr)
+ * here - presumably callers have validated it; confirm.
+ */
+ memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen);
+ clp->cl_addrlen = cl_init->addrlen;
+
+ if (cl_init->hostname) {
+ err = -ENOMEM;
+ clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL);
+ if (!clp->cl_hostname)
+ goto error_cleanup;
+ }
+
+ INIT_LIST_HEAD(&clp->cl_superblocks);
+ /* no RPC client yet; users test this with IS_ERR() */
+ clp->cl_rpcclient = ERR_PTR(-EINVAL);
+
+ clp->cl_flags = cl_init->init_flags;
+ clp->cl_proto = cl_init->proto;
+ clp->cl_nconnect = cl_init->nconnect;
+ clp->cl_net = get_net(cl_init->net);
+
+ /* string literal: nfs_free_client() does not free cl_principal */
+ clp->cl_principal = "*";
+ nfs_fscache_get_client_cookie(clp);
+
+ return clp;
+
+error_cleanup:
+ /* drop the module reference taken above */
+ put_nfs_version(clp->cl_nfs_mod);
+error_dealloc:
+ kfree(clp);
+error_0:
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_client);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+/* Destroy the per-network-namespace callback identifier IDR. */
+static void nfs_cleanup_cb_ident_idr(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ idr_destroy(&nn->cb_ident_idr);
+}
+
+/*
+ * Remove @clp's callback identifier from the IDR, if it has one.
+ *
+ * nfs_client_lock held
+ */
+static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
+{
+ struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+
+ /* a zero cl_cb_ident means no identifier is recorded */
+ if (clp->cl_cb_ident)
+ idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident);
+}
+
+/* Initialise the RPC wait queue named "pNFS ROC" in @server. */
+static void pnfs_init_server(struct nfs_server *server)
+{
+ rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
+}
+
+#else
+/* Without CONFIG_NFS_V4 the three helpers above are empty stubs. */
+static void nfs_cleanup_cb_ident_idr(struct net *net)
+{
+}
+
+static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
+{
+}
+
+static void pnfs_init_server(struct nfs_server *server)
+{
+}
+
+#endif /* CONFIG_NFS_V4 */
+
+/*
+ * Destroy a shared client record
+ *
+ * Releases everything nfs_alloc_client() acquired: the fscache cookie,
+ * the RPC client (if one was created), the network namespace and module
+ * references, the hostname and acceptor strings, and the record itself.
+ */
+void nfs_free_client(struct nfs_client *clp)
+{
+ nfs_fscache_release_client_cookie(clp);
+
+ /* -EIO all pending I/O */
+ if (!IS_ERR(clp->cl_rpcclient))
+ rpc_shutdown_client(clp->cl_rpcclient);
+
+ put_net(clp->cl_net);
+ put_nfs_version(clp->cl_nfs_mod);
+ kfree(clp->cl_hostname);
+ kfree(clp->cl_acceptor);
+ kfree(clp);
+}
+EXPORT_SYMBOL_GPL(nfs_free_client);
+
+/*
+ * Release a reference to a shared client record
+ *
+ * A NULL @clp is accepted and ignored.  When the last reference goes
+ * away the client is unlinked from the shared client list and its
+ * callback identifier is removed while nfs_client_lock is held; the
+ * version-specific ->free_client() then runs after the lock is dropped.
+ */
+void nfs_put_client(struct nfs_client *clp)
+{
+ struct nfs_net *nn;
+
+ if (!clp)
+ return;
+
+ nn = net_generic(clp->cl_net, nfs_net_id);
+
+ /* true only when the count reached zero; the lock is then held */
+ if (refcount_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) {
+ list_del(&clp->cl_share_link);
+ nfs_cb_idr_remove_locked(clp);
+ spin_unlock(&nn->nfs_client_lock);
+
+ /* no server record should still be attached at this point */
+ WARN_ON_ONCE(!list_empty(&clp->cl_superblocks));
+
+ clp->rpc_ops->free_client(clp);
+ }
+}
+EXPORT_SYMBOL_GPL(nfs_put_client);
+
+/*
+ * Find an nfs_client on the list that matches the initialisation data
+ * that is supplied.
+ *
+ * The function unlocks and relocks nn->nfs_client_lock around the wait
+ * below, so it is entered and left with that lock held.
+ *
+ * Returns a matching client with an extra reference taken, NULL when
+ * nothing matches, or an ERR_PTR when waiting for another client's
+ * initialisation failed.
+ */
+static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data)
+{
+ struct nfs_client *clp;
+ const struct sockaddr *sap = data->addr;
+ struct nfs_net *nn = net_generic(data->net, nfs_net_id);
+ int error;
+
+again:
+ list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
+ const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
+ /* Don't match clients that failed to initialise properly */
+ if (clp->cl_cons_state < 0)
+ continue;
+
+ /* If a client is still initializing then we need to wait */
+ if (clp->cl_cons_state > NFS_CS_READY) {
+ /* hold a reference so clp survives while the lock is dropped */
+ refcount_inc(&clp->cl_count);
+ spin_unlock(&nn->nfs_client_lock);
+ error = nfs_wait_client_init_complete(clp);
+ nfs_put_client(clp);
+ spin_lock(&nn->nfs_client_lock);
+ if (error < 0)
+ return ERR_PTR(error);
+ /* the list may have changed while unlocked: rescan from the top */
+ goto again;
+ }
+
+ /* Different NFS versions cannot share the same nfs_client */
+ if (clp->rpc_ops != data->nfs_mod->rpc_ops)
+ continue;
+
+ if (clp->cl_proto != data->proto)
+ continue;
+ /* Match nfsv4 minorversion */
+ if (clp->cl_minorversion != data->minorversion)
+ continue;
+
+ /* Match request for a dedicated DS */
+ if (test_bit(NFS_CS_DS, &data->init_flags) !=
+ test_bit(NFS_CS_DS, &clp->cl_flags))
+ continue;
+
+ /* Match the full socket address */
+ if (!rpc_cmp_addr_port(sap, clap))
+ /* Match all xprt_switch full socket addresses */
+ if (IS_ERR(clp->cl_rpcclient) ||
+ !rpc_clnt_xprt_switch_has_addr(clp->cl_rpcclient,
+ sap))
+ continue;
+
+ /* match found: hand back a new reference */
+ refcount_inc(&clp->cl_count);
+ return clp;
+ }
+ return NULL;
+}
+
+/*
+ * Return true if @clp is done initializing, false if still working on it.
+ *
+ * "Done" covers both outcomes: cl_cons_state is NFS_CS_READY or below.
+ * Use nfs_client_init_status to check if it was successful.
+ */
+bool nfs_client_init_is_complete(const struct nfs_client *clp)
+{
+ return clp->cl_cons_state <= NFS_CS_READY;
+}
+EXPORT_SYMBOL_GPL(nfs_client_init_is_complete);
+
+/*
+ * Report the outcome of client initialisation: 0 if @clp was successfully
+ * initialized, -errno otherwise.
+ *
+ * Only meaningful once nfs_client_init_is_complete() has returned true.
+ * If the client is still initialising, this warns once and returns
+ * -EINVAL; otherwise the stored cl_cons_state is returned as is.
+ */
+int nfs_client_init_status(const struct nfs_client *clp)
+{
+	if (clp->cl_cons_state <= NFS_CS_READY)
+		return clp->cl_cons_state;
+
+	/* called without checking nfs_client_init_is_complete */
+	WARN_ON_ONCE(1);
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(nfs_client_init_status);
+
+/*
+ * Sleep on nfs_client_active_wq until @clp has finished initialising.
+ * Returns the result of wait_event_killable().
+ */
+int nfs_wait_client_init_complete(const struct nfs_client *clp)
+{
+ return wait_event_killable(nfs_client_active_wq,
+ nfs_client_init_is_complete(clp));
+}
+EXPORT_SYMBOL_GPL(nfs_wait_client_init_complete);
+
+/*
+ * Found an existing client.  Make sure it's ready before returning.
+ *
+ * The caller's reference on @clp is dropped on every failure path.
+ * Returns @clp once it is ready, ERR_PTR(-ERESTARTSYS) if the wait
+ * failed, or ERR_PTR() of the client's failed construction state.
+ * @cl_init is not used in this function.
+ */
+static struct nfs_client *
+nfs_found_client(const struct nfs_client_initdata *cl_init,
+ struct nfs_client *clp)
+{
+ int error;
+
+ error = nfs_wait_client_init_complete(clp);
+ if (error < 0) {
+ nfs_put_client(clp);
+ return ERR_PTR(-ERESTARTSYS);
+ }
+
+ /* a state below NFS_CS_READY is reported as the error code */
+ if (clp->cl_cons_state < NFS_CS_READY) {
+ error = clp->cl_cons_state;
+ nfs_put_client(clp);
+ return ERR_PTR(error);
+ }
+
+ /* NOTE(review): presumably pairs with smp_wmb() in nfs_mark_client_ready() */
+ smp_rmb();
+ return clp;
+}
+
+/*
+ * Look up a client by IP address and protocol version
+ * - creates a new record if one doesn't yet exist
+ *
+ * The loop runs at most twice in the normal case: the first pass finds no
+ * match and allocates a candidate record outside the lock; the second
+ * pass searches again and either discards the candidate in favour of a
+ * client that appeared meanwhile, or links the candidate into the list
+ * and initialises it through ->init_client().
+ *
+ * Returns a referenced client or an ERR_PTR.  A NULL hostname in @cl_init
+ * is rejected with a warning and -EINVAL.
+ */
+struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
+{
+ struct nfs_client *clp, *new = NULL;
+ struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
+ const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops;
+
+ if (cl_init->hostname == NULL) {
+ WARN_ON(1);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* see if the client already exists */
+ do {
+ spin_lock(&nn->nfs_client_lock);
+
+ clp = nfs_match_client(cl_init);
+ if (clp) {
+ spin_unlock(&nn->nfs_client_lock);
+ /* the candidate allocated on the previous pass is not needed */
+ if (new)
+ new->rpc_ops->free_client(new);
+ if (IS_ERR(clp))
+ return clp;
+ return nfs_found_client(cl_init, clp);
+ }
+ if (new) {
+ /* still no match: publish the candidate, then initialise it */
+ list_add_tail(&new->cl_share_link,
+ &nn->nfs_client_list);
+ spin_unlock(&nn->nfs_client_lock);
+ return rpc_ops->init_client(new, cl_init);
+ }
+
+ spin_unlock(&nn->nfs_client_lock);
+
+ /* allocate outside the spinlock, then search again */
+ new = rpc_ops->alloc_client(cl_init);
+ } while (!IS_ERR(new));
+
+ return new;
+}
+EXPORT_SYMBOL_GPL(nfs_get_client);
+
+/*
+ * Mark a server as ready or failed
+ *
+ * Stores @state in cl_cons_state and wakes everyone sleeping on
+ * nfs_client_active_wq.
+ */
+void nfs_mark_client_ready(struct nfs_client *clp, int state)
+{
+ /* NOTE(review): presumably pairs with smp_rmb() in nfs_found_client() */
+ smp_wmb();
+ clp->cl_cons_state = state;
+ wake_up_all(&nfs_client_active_wq);
+}
+EXPORT_SYMBOL_GPL(nfs_mark_client_ready);
+
+/*
+ * Initialise the timeout values for a connection
+ *
+ * @to:      timeout structure to fill in
+ * @proto:   transport (XPRT_TRANSPORT_TCP, _RDMA or, unless UDP support
+ *           is compiled out, _UDP); any other value hits BUG()
+ * @timeo:   initial timeout; scaled by HZ / 10, i.e. treated as tenths of
+ *           a second.  NFS_UNSPEC_TIMEO (or a result of 0) selects the
+ *           per-transport default.
+ * @retrans: retry count; NFS_UNSPEC_RETRANS selects the per-transport
+ *           default.
+ *
+ * TCP and RDMA grow the timeout by a fixed increment per retry, capped at
+ * NFS_MAX_TCP_TIMEOUT; UDP sets to_exponential and caps at
+ * NFS_MAX_UDP_TIMEOUT.
+ */
+void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
+ int timeo, int retrans)
+{
+ to->to_initval = timeo * HZ / 10;
+ to->to_retries = retrans;
+
+ switch (proto) {
+ case XPRT_TRANSPORT_TCP:
+ case XPRT_TRANSPORT_RDMA:
+ if (retrans == NFS_UNSPEC_RETRANS)
+ to->to_retries = NFS_DEF_TCP_RETRANS;
+ if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0)
+ to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
+ if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
+ to->to_initval = NFS_MAX_TCP_TIMEOUT;
+ to->to_increment = to->to_initval;
+ to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
+ if (to->to_maxval > NFS_MAX_TCP_TIMEOUT)
+ to->to_maxval = NFS_MAX_TCP_TIMEOUT;
+ /* keep to_maxval from ending up below to_initval */
+ if (to->to_maxval < to->to_initval)
+ to->to_maxval = to->to_initval;
+ to->to_exponential = 0;
+ break;
+#ifndef CONFIG_NFS_DISABLE_UDP_SUPPORT
+ case XPRT_TRANSPORT_UDP:
+ if (retrans == NFS_UNSPEC_RETRANS)
+ to->to_retries = NFS_DEF_UDP_RETRANS;
+ if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0)
+ to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10;
+ if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
+ to->to_initval = NFS_MAX_UDP_TIMEOUT;
+ to->to_maxval = NFS_MAX_UDP_TIMEOUT;
+ to->to_exponential = 1;
+ break;
+#endif
+ default:
+ BUG();
+ }
+}
+EXPORT_SYMBOL_GPL(nfs_init_timeout_values);
+
+/*
+ * Create an RPC client handle
+ *
+ * @clp:     client that will own the RPC handle
+ * @cl_init: supplies the timeout parameters, node name and credential
+ * @flavor:  RPC authentication flavour to create the handle with
+ *
+ * Translates the NFS_CS_* bits in clp->cl_flags into RPC_CLNT_CREATE_*
+ * flags.  If @clp already has a valid cl_rpcclient nothing is created.
+ *
+ * Returns 0 on success (or when a handle already exists), otherwise the
+ * negative error from rpc_create().
+ */
+int nfs_create_rpc_client(struct nfs_client *clp,
+ const struct nfs_client_initdata *cl_init,
+ rpc_authflavor_t flavor)
+{
+ struct rpc_clnt *clnt = NULL;
+ struct rpc_create_args args = {
+ .net = clp->cl_net,
+ .protocol = clp->cl_proto,
+ .nconnect = clp->cl_nconnect,
+ .address = (struct sockaddr *)&clp->cl_addr,
+ .addrsize = clp->cl_addrlen,
+ .timeout = cl_init->timeparms,
+ .servername = clp->cl_hostname,
+ .nodename = cl_init->nodename,
+ .program = &nfs_program,
+ .version = clp->rpc_ops->version,
+ .authflavor = flavor,
+ .cred = cl_init->cred,
+ };
+
+ if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags))
+ args.flags |= RPC_CLNT_CREATE_DISCRTRY;
+ if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags))
+ args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT;
+ if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags))
+ args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+ if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags))
+ args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS;
+ if (test_bit(NFS_CS_NOPING, &clp->cl_flags))
+ args.flags |= RPC_CLNT_CREATE_NOPING;
+ if (test_bit(NFS_CS_REUSEPORT, &clp->cl_flags))
+ args.flags |= RPC_CLNT_CREATE_REUSEPORT;
+
+ /* an RPC client is already attached: nothing to do */
+ if (!IS_ERR(clp->cl_rpcclient))
+ return 0;
+
+ clnt = rpc_create(&args);
+ if (IS_ERR(clnt)) {
+ dprintk("%s: cannot create RPC client. Error = %ld\n",
+ __func__, PTR_ERR(clnt));
+ return PTR_ERR(clnt);
+ }
+
+ clnt->cl_principal = clp->cl_principal;
+ clp->cl_rpcclient = clnt;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_create_rpc_client);
+
+/*
+ * Version 2 or 3 client destruction
+ *
+ * Releases the NLM host set up by nfs_start_lockd(), if there is one.
+ * Installed as server->destroy by nfs_start_lockd().
+ */
+static void nfs_destroy_server(struct nfs_server *server)
+{
+ if (server->nlm_host)
+ nlmclnt_done(server->nlm_host);
+}
+
+/*
+ * Version 2 or 3 lockd setup
+ *
+ * Does nothing (and returns 0) for NFS versions above 3, or when both
+ * NFS_MOUNT_LOCAL_FLOCK and NFS_MOUNT_LOCAL_FCNTL are set.  Otherwise an
+ * NLM host is initialised, stored in server->nlm_host, and
+ * nfs_destroy_server() is installed as the server's destroy hook.
+ *
+ * Returns 0 on success or the negative error from nlmclnt_init().
+ */
+static int nfs_start_lockd(struct nfs_server *server)
+{
+ struct nlm_host *host;
+ struct nfs_client *clp = server->nfs_client;
+ struct nlmclnt_initdata nlm_init = {
+ .hostname = clp->cl_hostname,
+ .address = (struct sockaddr *)&clp->cl_addr,
+ .addrlen = clp->cl_addrlen,
+ .nfs_version = clp->rpc_ops->version,
+ .noresvport = server->flags & NFS_MOUNT_NORESVPORT ?
+ 1 : 0,
+ .net = clp->cl_net,
+ .nlmclnt_ops = clp->cl_nfs_mod->rpc_ops->nlmclnt_ops,
+ .cred = current_cred(),
+ };
+
+ if (nlm_init.nfs_version > 3)
+ return 0;
+ if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) &&
+ (server->flags & NFS_MOUNT_LOCAL_FCNTL))
+ return 0;
+
+ /* NLM uses UDP only for a UDP mount; every other transport gets TCP */
+ switch (clp->cl_proto) {
+ default:
+ nlm_init.protocol = IPPROTO_TCP;
+ break;
+#ifndef CONFIG_NFS_DISABLE_UDP_SUPPORT
+ case XPRT_TRANSPORT_UDP:
+ /* last label in the switch, so the missing break is harmless */
+ nlm_init.protocol = IPPROTO_UDP;
+#endif
+ }
+
+ host = nlmclnt_init(&nlm_init);
+ if (IS_ERR(host))
+ return PTR_ERR(host);
+
+ server->nlm_host = host;
+ server->destroy = nfs_destroy_server;
+ return 0;
+}
+
+/*
+ * Create a general RPC client
+ *
+ * Clones the nfs_client's RPC handle with the @pseudoflavour auth flavour
+ * into server->client, copies *@timeo into the clone's default timeout and
+ * derives the soft-retry settings from the mount flags.
+ *
+ * Returns 0 on success or the negative error from the clone; in the
+ * error case server->client is left holding the ERR_PTR.
+ */
+int nfs_init_server_rpcclient(struct nfs_server *server,
+ const struct rpc_timeout *timeo,
+ rpc_authflavor_t pseudoflavour)
+{
+ struct nfs_client *clp = server->nfs_client;
+
+ server->client = rpc_clone_client_set_auth(clp->cl_rpcclient,
+ pseudoflavour);
+ if (IS_ERR(server->client)) {
+ dprintk("%s: couldn't create rpc_client!\n", __func__);
+ return PTR_ERR(server->client);
+ }
+
+ /* give the clone its own copy of the timeout and point cl_timeout at it */
+ memcpy(&server->client->cl_timeout_default,
+ timeo,
+ sizeof(server->client->cl_timeout_default));
+ server->client->cl_timeout = &server->client->cl_timeout_default;
+ server->client->cl_softrtry = 0;
+ if (server->flags & NFS_MOUNT_SOFTERR)
+ server->client->cl_softerr = 1;
+ if (server->flags & NFS_MOUNT_SOFT)
+ server->client->cl_softrtry = 1;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient);
+
+/**
+ * nfs_init_client - Initialise an NFS2 or NFS3 client
+ *
+ * @clp: nfs_client to initialise
+ * @cl_init: Initialisation parameters
+ *
+ * Creates the RPC handle and then publishes the outcome through
+ * nfs_mark_client_ready(), waking any waiters.  On failure the caller's
+ * reference on @clp is dropped.
+ *
+ * Returns pointer to an NFS client, or an ERR_PTR value.
+ */
+struct nfs_client *nfs_init_client(struct nfs_client *clp,
+ const struct nfs_client_initdata *cl_init)
+{
+ int error;
+
+ /* the client is already initialised */
+ if (clp->cl_cons_state == NFS_CS_READY)
+ return clp;
+
+ /*
+ * Create a client RPC handle for doing FSSTAT with UNIX auth only
+ * - RFC 2623, sec 2.3.2
+ */
+ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
+ /* record either NFS_CS_READY or the negative error as the final state */
+ nfs_mark_client_ready(clp, error == 0 ? NFS_CS_READY : error);
+ if (error < 0) {
+ nfs_put_client(clp);
+ clp = ERR_PTR(error);
+ }
+ return clp;
+}
+EXPORT_SYMBOL_GPL(nfs_init_client);
+
+/*
+ * Create a version 2 or 3 client
+ *
+ * Finds or creates the shared nfs_client described by the mount context
+ * in @fc, attaches it to @server, copies the mount options into @server,
+ * starts lockd and creates the server's own RPC client.
+ *
+ * Returns 0 on success.  On failure the nfs_client reference is dropped,
+ * server->nfs_client is reset to NULL and a negative error is returned.
+ */
+static int nfs_init_server(struct nfs_server *server,
+ const struct fs_context *fc)
+{
+ const struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct rpc_timeout timeparms;
+ struct nfs_client_initdata cl_init = {
+ .hostname = ctx->nfs_server.hostname,
+ .addr = (const struct sockaddr *)&ctx->nfs_server.address,
+ .addrlen = ctx->nfs_server.addrlen,
+ .nfs_mod = ctx->nfs_mod,
+ .proto = ctx->nfs_server.protocol,
+ .net = fc->net_ns,
+ .timeparms = &timeparms,
+ .cred = server->cred,
+ .nconnect = ctx->nfs_server.nconnect,
+ .init_flags = (1UL << NFS_CS_REUSEPORT),
+ };
+ struct nfs_client *clp;
+ int error;
+
+ /* timeparms is referenced by cl_init above, so fill it in first */
+ nfs_init_timeout_values(&timeparms, ctx->nfs_server.protocol,
+ ctx->timeo, ctx->retrans);
+ if (ctx->flags & NFS_MOUNT_NORESVPORT)
+ set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+
+ /* Allocate or find a client reference we can use */
+ clp = nfs_get_client(&cl_init);
+ if (IS_ERR(clp))
+ return PTR_ERR(clp);
+
+ server->nfs_client = clp;
+
+ /* Initialise the client representation from the mount data */
+ server->flags = ctx->flags;
+ server->options = ctx->options;
+ server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
+ NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
+ NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
+
+ /* a zero rsize/wsize is left for nfs_server_set_fsinfo() to fill in */
+ if (ctx->rsize)
+ server->rsize = nfs_block_size(ctx->rsize, NULL);
+ if (ctx->wsize)
+ server->wsize = nfs_block_size(ctx->wsize, NULL);
+
+ /* the context values are multiplied by HZ before being stored */
+ server->acregmin = ctx->acregmin * HZ;
+ server->acregmax = ctx->acregmax * HZ;
+ server->acdirmin = ctx->acdirmin * HZ;
+ server->acdirmax = ctx->acdirmax * HZ;
+
+ /* Start lockd here, before we might error out */
+ error = nfs_start_lockd(server);
+ if (error < 0)
+ goto error;
+
+ server->port = ctx->nfs_server.port;
+ server->auth_info = ctx->auth_info;
+
+ error = nfs_init_server_rpcclient(server, &timeparms,
+ ctx->selected_flavor);
+ if (error < 0)
+ goto error;
+
+ /* Preserve the values of mount_server-related mount options */
+ if (ctx->mount_server.addrlen) {
+ memcpy(&server->mountd_address, &ctx->mount_server.address,
+ ctx->mount_server.addrlen);
+ server->mountd_addrlen = ctx->mount_server.addrlen;
+ }
+ server->mountd_version = ctx->mount_server.version;
+ server->mountd_port = ctx->mount_server.port;
+ server->mountd_protocol = ctx->mount_server.protocol;
+
+ server->namelen = ctx->namlen;
+ return 0;
+
+error:
+ /* detach first so a later nfs_free_server() does not put clp again */
+ server->nfs_client = NULL;
+ nfs_put_client(clp);
+ return error;
+}
+
+/*
+ * Load up the server record from information gained in an fsinfo record
+ *
+ * Chooses the read, write and readdir transfer sizes from the server's
+ * preferred and maximum values, clamps them to the RPC payload limit and
+ * to NFS_MAX_FILE_IO_SIZE, and copies the remaining fsinfo values into
+ * @server.
+ */
+static void nfs_server_set_fsinfo(struct nfs_server *server,
+ struct nfs_fsinfo *fsinfo)
+{
+ unsigned long max_rpc_payload, raw_max_rpc_payload;
+
+ /* Work out a lot of parameters */
+ /* a zero size means none was set from the mount options */
+ if (server->rsize == 0)
+ server->rsize = nfs_block_size(fsinfo->rtpref, NULL);
+ if (server->wsize == 0)
+ server->wsize = nfs_block_size(fsinfo->wtpref, NULL);
+
+ /* the advertised maximum is applied only when it is at least 512 */
+ if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax)
+ server->rsize = nfs_block_size(fsinfo->rtmax, NULL);
+ if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax)
+ server->wsize = nfs_block_size(fsinfo->wtmax, NULL);
+
+ raw_max_rpc_payload = rpc_max_payload(server->client);
+ max_rpc_payload = nfs_block_size(raw_max_rpc_payload, NULL);
+
+ if (server->rsize > max_rpc_payload)
+ server->rsize = max_rpc_payload;
+ if (server->rsize > NFS_MAX_FILE_IO_SIZE)
+ server->rsize = NFS_MAX_FILE_IO_SIZE;
+ /* number of pages needed for rsize, rounded up */
+ server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ if (server->wsize > max_rpc_payload)
+ server->wsize = max_rpc_payload;
+ if (server->wsize > NFS_MAX_FILE_IO_SIZE)
+ server->wsize = NFS_MAX_FILE_IO_SIZE;
+ /* number of pages needed for wsize, rounded up */
+ server->wpages = (server->wsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
+
+ /* readdir size: capped by the readdir page limit and by rsize */
+ server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
+ if (server->dtsize > PAGE_SIZE * NFS_MAX_READDIR_PAGES)
+ server->dtsize = PAGE_SIZE * NFS_MAX_READDIR_PAGES;
+ if (server->dtsize > server->rsize)
+ server->dtsize = server->rsize;
+
+ /* NFS_MOUNT_NOAC: zero all four attribute cache timeouts */
+ if (server->flags & NFS_MOUNT_NOAC) {
+ server->acregmin = server->acregmax = 0;
+ server->acdirmin = server->acdirmax = 0;
+ }
+
+ server->maxfilesize = fsinfo->maxfilesize;
+
+ server->time_delta = fsinfo->time_delta;
+
+ server->clone_blksize = fsinfo->clone_blksize;
+ /* We're airborne. Set the socket buffer sizes. */
+ rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
+
+#ifdef CONFIG_NFS_V4_2
+ /*
+ * Defaults until limited by the session parameters.
+ */
+ server->gxasize = min_t(unsigned int, raw_max_rpc_payload,
+ XATTR_SIZE_MAX);
+ server->sxasize = min_t(unsigned int, raw_max_rpc_payload,
+ XATTR_SIZE_MAX);
+ server->lxasize = min_t(unsigned int, raw_max_rpc_payload,
+ nfs42_listxattr_xdrsize(XATTR_LIST_MAX));
+
+ if (fsinfo->xattr_support)
+ server->caps |= NFS_CAP_XATTR;
+#endif
+}
+
+/*
+ * Probe filesystem information, including the FSID on v2/v3
+ *
+ * @server: server record to fill in
+ * @mntfh:  file handle to probe
+ * @fattr:  attribute buffer handed to the ->fsinfo() and ->pathconf() calls
+ *
+ * Calls the version's optional ->set_capabilities() hook, then ->fsinfo(),
+ * applies the result with nfs_server_set_fsinfo() and finally, if no
+ * maximum name length is known yet, asks ->pathconf() for one (a failure
+ * there is ignored).
+ *
+ * Returns 0 on success or the negative error from ->set_capabilities() or
+ * ->fsinfo().
+ */
+int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr)
+{
+	struct nfs_fsinfo fsinfo;
+	struct nfs_client *clp = server->nfs_client;
+	int error;
+
+	if (clp->rpc_ops->set_capabilities != NULL) {
+		error = clp->rpc_ops->set_capabilities(server, mntfh);
+		if (error < 0)
+			return error;
+	}
+
+	/*
+	 * Start from an all-zero fsinfo.  nfs_server_set_fsinfo() reads
+	 * many fields unconditionally (clone_blksize, xattr_support, ...)
+	 * and nothing here guarantees that every protocol version's
+	 * ->fsinfo() fills in all of them; zeroing keeps any field it
+	 * leaves alone from being read as stack garbage.  This also covers
+	 * nlayouttypes and layouttype[].
+	 */
+	memset(&fsinfo, 0, sizeof(fsinfo));
+	fsinfo.fattr = fattr;
+	error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
+	if (error < 0)
+		return error;
+
+	nfs_server_set_fsinfo(server, &fsinfo);
+
+	/* Get some general file system info */
+	if (server->namelen == 0) {
+		struct nfs_pathconf pathinfo;
+
+		pathinfo.fattr = fattr;
+		nfs_fattr_init(fattr);
+
+		/* best effort: namelen simply stays 0 if pathconf fails */
+		if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0)
+			server->namelen = pathinfo.max_namelen;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_probe_fsinfo);
+
+/*
+ * Copy useful information when duplicating a server record
+ *
+ * Copies the flags, I/O sizes, attribute cache timeouts, capabilities,
+ * options, auth info and port from @source to @target.
+ */
+void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source)
+{
+ target->flags = source->flags;
+ target->rsize = source->rsize;
+ target->wsize = source->wsize;
+ target->acregmin = source->acregmin;
+ target->acregmax = source->acregmax;
+ target->acdirmin = source->acdirmin;
+ target->acdirmax = source->acdirmax;
+ target->caps = source->caps;
+ target->options = source->options;
+ target->auth_info = source->auth_info;
+ target->port = source->port;
+}
+EXPORT_SYMBOL_GPL(nfs_server_copy_userdata);
+
+/*
+ * Link @server into its client's cl_superblocks list and into the
+ * per-namespace volume list, and clear NFS_CS_STOP_RENEW on the client,
+ * all under nfs_client_lock.
+ */
+void nfs_server_insert_lists(struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+
+ spin_lock(&nn->nfs_client_lock);
+ list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
+ list_add_tail(&server->master_link, &nn->nfs_volume_list);
+ clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
+ spin_unlock(&nn->nfs_client_lock);
+
+}
+EXPORT_SYMBOL_GPL(nfs_server_insert_lists);
+
+/*
+ * Undo nfs_server_insert_lists().  Does nothing if @server has no
+ * nfs_client attached.  If this was the client's last server record,
+ * NFS_CS_STOP_RENEW is set on the client.
+ */
+void nfs_server_remove_lists(struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs_net *nn;
+
+ if (clp == NULL)
+ return;
+ nn = net_generic(clp->cl_net, nfs_net_id);
+ spin_lock(&nn->nfs_client_lock);
+ list_del_rcu(&server->client_link);
+ if (list_empty(&clp->cl_superblocks))
+ set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
+ list_del(&server->master_link);
+ spin_unlock(&nn->nfs_client_lock);
+
+ /* NOTE(review): presumably waits for RCU readers of cl_superblocks */
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(nfs_server_remove_lists);
+
+/*
+ * Allocate and initialise a server record
+ *
+ * Returns a zeroed record with its list heads, ID allocators, wait queues
+ * and I/O statistics set up, and with client / client_acl set to
+ * ERR_PTR(-EINVAL).  Returns NULL if either allocation fails.
+ */
+struct nfs_server *nfs_alloc_server(void)
+{
+ struct nfs_server *server;
+
+ server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
+ if (!server)
+ return NULL;
+
+ /* no RPC clients yet; nfs_free_server() tests these with IS_ERR() */
+ server->client = server->client_acl = ERR_PTR(-EINVAL);
+
+ /* Initialise the list heads of the NFS state stuff */
+ INIT_LIST_HEAD(&server->client_link);
+ INIT_LIST_HEAD(&server->master_link);
+ INIT_LIST_HEAD(&server->delegations);
+ INIT_LIST_HEAD(&server->layouts);
+ INIT_LIST_HEAD(&server->state_owners_lru);
+ INIT_LIST_HEAD(&server->ss_copies);
+
+ atomic_set(&server->active, 0);
+
+ server->io_stats = nfs_alloc_iostats();
+ if (!server->io_stats) {
+ kfree(server);
+ return NULL;
+ }
+
+ ida_init(&server->openowner_id);
+ ida_init(&server->lockowner_id);
+ pnfs_init_server(server);
+ rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC");
+
+ return server;
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_server);
+
+/*
+ * Free up a server record
+ *
+ * Unlinks the record, runs its destroy hook if one is set, shuts down its
+ * RPC clients, drops its nfs_client reference and credential, and frees
+ * everything nfs_alloc_server() set up.
+ */
+void nfs_free_server(struct nfs_server *server)
+{
+ nfs_server_remove_lists(server);
+
+ if (server->destroy != NULL)
+ server->destroy(server);
+
+ /* either pointer may still be the ERR_PTR set by nfs_alloc_server() */
+ if (!IS_ERR(server->client_acl))
+ rpc_shutdown_client(server->client_acl);
+ if (!IS_ERR(server->client))
+ rpc_shutdown_client(server->client);
+
+ /* nfs_put_client() accepts NULL */
+ nfs_put_client(server->nfs_client);
+
+ ida_destroy(&server->lockowner_id);
+ ida_destroy(&server->openowner_id);
+ nfs_free_iostats(server->io_stats);
+ put_cred(server->cred);
+ kfree(server);
+ nfs_release_automount_timer();
+}
+EXPORT_SYMBOL_GPL(nfs_free_server);
+
+/*
+ * Create a version 2 or 3 volume record
+ * - keyed on server and FSID
+ *
+ * Allocates a server record, binds it to a client with
+ * nfs_init_server(), probes the root file handle, fetches its attributes
+ * if the probe did not return them, records the FSID and links the record
+ * into the lists.
+ *
+ * Returns the new server or an ERR_PTR; on failure everything acquired
+ * here is released through nfs_free_server().
+ */
+struct nfs_server *nfs_create_server(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct nfs_server *server;
+ struct nfs_fattr *fattr;
+ int error;
+
+ server = nfs_alloc_server();
+ if (!server)
+ return ERR_PTR(-ENOMEM);
+
+ /* reference dropped by put_cred() in nfs_free_server() */
+ server->cred = get_cred(current_cred());
+
+ error = -ENOMEM;
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ goto error;
+
+ /* Get a client representation */
+ error = nfs_init_server(server, fc);
+ if (error < 0)
+ goto error;
+
+ /* Probe the root fh to retrieve its FSID */
+ error = nfs_probe_fsinfo(server, ctx->mntfh, fattr);
+ if (error < 0)
+ goto error;
+ /* clamp the name length to the protocol version's maximum */
+ if (server->nfs_client->rpc_ops->version == 3) {
+ if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
+ server->namelen = NFS3_MAXNAMLEN;
+ if (!(ctx->flags & NFS_MOUNT_NORDIRPLUS))
+ server->caps |= NFS_CAP_READDIRPLUS;
+ } else {
+ if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
+ server->namelen = NFS2_MAXNAMLEN;
+ }
+
+ /* the probe did not leave valid attributes behind: fetch them now */
+ if (!(fattr->valid & NFS_ATTR_FATTR)) {
+ error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh,
+ fattr, NULL, NULL);
+ if (error < 0) {
+ dprintk("nfs_create_server: getattr error = %d\n", -error);
+ goto error;
+ }
+ }
+ memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
+
+ dprintk("Server FSID: %llx:%llx\n",
+ (unsigned long long) server->fsid.major,
+ (unsigned long long) server->fsid.minor);
+
+ nfs_server_insert_lists(server);
+ server->mount_time = jiffies;
+ nfs_free_fattr(fattr);
+ return server;
+
+error:
+ /* fattr is NULL here if its allocation was what failed */
+ nfs_free_fattr(fattr);
+ nfs_free_server(server);
+ return ERR_PTR(error);
+}
+EXPORT_SYMBOL_GPL(nfs_create_server);
+
+/*
+ * Clone an NFS2, NFS3 or NFS4 server record
+ *
+ * @source: server record to copy settings from
+ * @fh:     file handle of the new filesystem, used for the fsinfo probe
+ * @fattr:  attributes of @fh; only its fsid is read here
+ * @flavor: auth flavour for the new record's RPC client
+ *
+ * The clone shares @source's nfs_client (taking its own reference) and
+ * credential, gets its own RPC client, and is linked into the lists.
+ *
+ * Returns the new server or an ERR_PTR.
+ */
+struct nfs_server *nfs_clone_server(struct nfs_server *source,
+ struct nfs_fh *fh,
+ struct nfs_fattr *fattr,
+ rpc_authflavor_t flavor)
+{
+ struct nfs_server *server;
+ struct nfs_fattr *fattr_fsinfo;
+ int error;
+
+ server = nfs_alloc_server();
+ if (!server)
+ return ERR_PTR(-ENOMEM);
+
+ server->cred = get_cred(source->cred);
+
+ error = -ENOMEM;
+ fattr_fsinfo = nfs_alloc_fattr();
+ if (fattr_fsinfo == NULL)
+ goto out_free_server;
+
+ /* Copy data from the source */
+ server->nfs_client = source->nfs_client;
+ server->destroy = source->destroy;
+ /* the clone holds its own reference on the shared nfs_client */
+ refcount_inc(&server->nfs_client->cl_count);
+ nfs_server_copy_userdata(server, source);
+
+ server->fsid = fattr->fsid;
+
+ error = nfs_init_server_rpcclient(server,
+ source->client->cl_timeout,
+ flavor);
+ if (error < 0)
+ goto out_free_server;
+
+ /* probe the filesystem info for this server filesystem */
+ error = nfs_probe_fsinfo(server, fh, fattr_fsinfo);
+ if (error < 0)
+ goto out_free_server;
+
+ if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
+ server->namelen = NFS4_MAXNAMLEN;
+
+ error = nfs_start_lockd(server);
+ if (error < 0)
+ goto out_free_server;
+
+ nfs_server_insert_lists(server);
+ server->mount_time = jiffies;
+
+ nfs_free_fattr(fattr_fsinfo);
+ return server;
+
+out_free_server:
+ /* fattr_fsinfo is NULL here if its allocation was what failed */
+ nfs_free_fattr(fattr_fsinfo);
+ nfs_free_server(server);
+ return ERR_PTR(error);
+}
+EXPORT_SYMBOL_GPL(nfs_clone_server);
+
+/*
+ * Set up the per-network-namespace NFS client state: the client and
+ * volume lists, the callback identifier IDR (NFSv4 builds only), the
+ * nfs_client_lock, the recorded boot time and the sysfs entries.
+ */
+void nfs_clients_init(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ INIT_LIST_HEAD(&nn->nfs_client_list);
+ INIT_LIST_HEAD(&nn->nfs_volume_list);
+#if IS_ENABLED(CONFIG_NFS_V4)
+ idr_init(&nn->cb_ident_idr);
+#endif
+ spin_lock_init(&nn->nfs_client_lock);
+ nn->boot_time = ktime_get_real();
+
+ nfs_netns_sysfs_setup(nn, net);
+}
+
+/*
+ * Tear down the per-network-namespace NFS client state.  Warns once if
+ * any client or volume is still on the lists.
+ */
+void nfs_clients_exit(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ nfs_netns_sysfs_destroy(nn);
+ nfs_cleanup_cb_ident_idr(net);
+ WARN_ON_ONCE(!list_empty(&nn->nfs_client_list));
+ WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list));
+}
+
+#ifdef CONFIG_PROC_FS
+/* seq_file iterator callbacks for the "servers" proc file */
+static void *nfs_server_list_start(struct seq_file *p, loff_t *pos);
+static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
+static void nfs_server_list_stop(struct seq_file *p, void *v);
+static int nfs_server_list_show(struct seq_file *m, void *v);
+
+static const struct seq_operations nfs_server_list_ops = {
+ .start = nfs_server_list_start,
+ .next = nfs_server_list_next,
+ .stop = nfs_server_list_stop,
+ .show = nfs_server_list_show,
+};
+
+/* seq_file iterator callbacks for the "volumes" proc file */
+static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos);
+static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos);
+static void nfs_volume_list_stop(struct seq_file *p, void *v);
+static int nfs_volume_list_show(struct seq_file *m, void *v);
+
+static const struct seq_operations nfs_volume_list_ops = {
+ .start = nfs_volume_list_start,
+ .next = nfs_volume_list_next,
+ .stop = nfs_volume_list_stop,
+ .show = nfs_volume_list_show,
+};
+
+/*
+ * set up the iterator to start reading from the server list and return the first item
+ *
+ * Takes nfs_client_lock; nfs_server_list_stop() releases it.
+ */
+static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
+ __acquires(&nn->nfs_client_lock)
+{
+ struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
+
+ /* lock the list against modification */
+ spin_lock(&nn->nfs_client_lock);
+ return seq_list_start_head(&nn->nfs_client_list, *_pos);
+}
+
+/*
+ * move to next server
+ *
+ * Advances the iterator along the namespace's nfs_client_list.
+ */
+static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
+{
+ struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
+
+ return seq_list_next(v, &nn->nfs_client_list, pos);
+}
+
+/*
+ * clean up after reading from the server list
+ *
+ * Releases the nfs_client_lock taken by nfs_server_list_start().
+ */
+static void nfs_server_list_stop(struct seq_file *p, void *v)
+ __releases(&nn->nfs_client_lock)
+{
+ struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
+
+ spin_unlock(&nn->nfs_client_lock);
+}
+
+/*
+ * display a header line followed by one line per client
+ *
+ * The list head itself stands for the header row.  Clients that are not
+ * in state NFS_CS_READY are skipped silently.
+ */
+static int nfs_server_list_show(struct seq_file *m, void *v)
+{
+ struct nfs_client *clp;
+ struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
+
+ /* display header on line 1 */
+ if (v == &nn->nfs_client_list) {
+ seq_puts(m, "NV SERVER PORT USE HOSTNAME\n");
+ return 0;
+ }
+
+ /* display one client per line on subsequent lines */
+ clp = list_entry(v, struct nfs_client, cl_share_link);
+
+ /* Check if the client is initialized */
+ if (clp->cl_cons_state != NFS_CS_READY)
+ return 0;
+
+ /* NOTE(review): presumably the peer address strings need RCU protection */
+ rcu_read_lock();
+ seq_printf(m, "v%u %s %s %3d %s\n",
+ clp->rpc_ops->version,
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
+ refcount_read(&clp->cl_count),
+ clp->cl_hostname);
+ rcu_read_unlock();
+
+ return 0;
+}
+
+/*
+ * set up the iterator to start reading from the volume list and return the first item
+ *
+ * Takes nfs_client_lock; nfs_volume_list_stop() releases it.
+ */
+static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
+ __acquires(&nn->nfs_client_lock)
+{
+ struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
+
+ /* lock the list against modification */
+ spin_lock(&nn->nfs_client_lock);
+ return seq_list_start_head(&nn->nfs_volume_list, *_pos);
+}
+
+/*
+ * move to next volume
+ *
+ * Advances the iterator along the namespace's nfs_volume_list.
+ */
+static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
+{
+ struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
+
+ return seq_list_next(v, &nn->nfs_volume_list, pos);
+}
+
+/*
+ * clean up after reading from the volume list
+ *
+ * Releases the nfs_client_lock taken by nfs_volume_list_start().
+ */
+static void nfs_volume_list_stop(struct seq_file *p, void *v)
+ __releases(&nn->nfs_client_lock)
+{
+ struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
+
+ spin_unlock(&nn->nfs_client_lock);
+}
+
+/*
+ * display a header line followed by one line per volume
+ *
+ * The list head itself stands for the header row.
+ */
+static int nfs_volume_list_show(struct seq_file *m, void *v)
+{
+ struct nfs_server *server;
+ struct nfs_client *clp;
+ /*
+ * NOTE(review): the sizing note on dev[] assumes particular digit
+ * counts for the major and minor numbers - confirm it against the
+ * actual dev_t bit split.  snprintf() below bounds the write either way.
+ */
+ char dev[13]; // 8 for 2^24, 1 for ':', 3 for 2^8, 1 for '\0'
+ char fsid[34]; // 2 * 16 for %llx, 1 for ':', 1 for '\0'
+ struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
+
+ /* display header on line 1 */
+ if (v == &nn->nfs_volume_list) {
+ seq_puts(m, "NV SERVER PORT DEV FSID"
+ " FSC\n");
+ return 0;
+ }
+ /* display one volume per line on subsequent lines */
+ server = list_entry(v, struct nfs_server, master_link);
+ clp = server->nfs_client;
+
+ snprintf(dev, sizeof(dev), "%u:%u",
+ MAJOR(server->s_dev), MINOR(server->s_dev));
+
+ snprintf(fsid, sizeof(fsid), "%llx:%llx",
+ (unsigned long long) server->fsid.major,
+ (unsigned long long) server->fsid.minor);
+
+ /* NOTE(review): presumably the peer address strings need RCU protection */
+ rcu_read_lock();
+ seq_printf(m, "v%u %s %s %-12s %-33s %s\n",
+ clp->rpc_ops->version,
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
+ dev,
+ fsid,
+ nfs_server_fscache_state(server));
+ rcu_read_unlock();
+
+ return 0;
+}
+
+/*
+ * Create the per-network-namespace "nfsfs" proc directory with its
+ * "servers" and "volumes" files.
+ *
+ * Returns 0 on success or -ENOMEM; on failure anything already created
+ * under "nfsfs" is removed again.
+ */
+int nfs_fs_proc_net_init(struct net *net)
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+ struct proc_dir_entry *p;
+
+ nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net);
+ if (!nn->proc_nfsfs)
+ goto error_0;
+
+ /* a file of servers with which we're dealing */
+ p = proc_create_net("servers", S_IFREG|S_IRUGO, nn->proc_nfsfs,
+ &nfs_server_list_ops, sizeof(struct seq_net_private));
+ if (!p)
+ goto error_1;
+
+ /* a file of volumes that we have mounted */
+ p = proc_create_net("volumes", S_IFREG|S_IRUGO, nn->proc_nfsfs,
+ &nfs_volume_list_ops, sizeof(struct seq_net_private));
+ if (!p)
+ goto error_1;
+ return 0;
+
+error_1:
+ remove_proc_subtree("nfsfs", net->proc_net);
+error_0:
+ return -ENOMEM;
+}
+
+/* Remove the per-network-namespace "nfsfs" proc directory and its files. */
+void nfs_fs_proc_net_exit(struct net *net)
+{
+ remove_proc_subtree("nfsfs", net->proc_net);
+}
+
+/*
+ * initialise the /proc/fs/nfsfs/ directory
+ *
+ * The "servers" and "volumes" entries here are symlinks to the files of
+ * the same name under ../../net/nfsfs/.
+ *
+ * Returns 0 on success or -ENOMEM; on failure the directory is removed
+ * again.
+ */
+int __init nfs_fs_proc_init(void)
+{
+ if (!proc_mkdir("fs/nfsfs", NULL))
+ goto error_0;
+
+ /* a file of servers with which we're dealing */
+ if (!proc_symlink("fs/nfsfs/servers", NULL, "../../net/nfsfs/servers"))
+ goto error_1;
+
+ /* a file of volumes that we have mounted */
+ if (!proc_symlink("fs/nfsfs/volumes", NULL, "../../net/nfsfs/volumes"))
+ goto error_1;
+
+ return 0;
+error_1:
+ remove_proc_subtree("fs/nfsfs", NULL);
+error_0:
+ return -ENOMEM;
+}
+
+/*
+ * clean up the /proc/fs/nfsfs/ directory
+ *
+ * Removes the directory together with everything below it.
+ */
+void nfs_fs_proc_exit(void)
+{
+ remove_proc_subtree("fs/nfsfs", NULL);
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
new file mode 100644
index 000000000..1eb6c7a14
--- /dev/null
+++ b/fs/nfs/delegation.c
@@ -0,0 +1,1455 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/delegation.c
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFS file delegation management
+ *
+ */
+#include <linux/completion.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/iversion.h>
+
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_xdr.h>
+
+#include "nfs4_fs.h"
+#include "nfs4session.h"
+#include "delegation.h"
+#include "internal.h"
+#include "nfs4trace.h"
+
+/* Initial value of nfs_delegation_watermark */
+#define NFS_DEFAULT_DELEGATION_WATERMARK (5000U)
+
+/*
+ * Number of delegations not currently marked NFS_DELEGATION_REVOKED:
+ * incremented when a delegation is installed or un-revoked, decremented
+ * when one is marked revoked.
+ */
+static atomic_long_t nfs_active_delegations;
+/*
+ * When nfs_active_delegations reaches this value, closing the last open
+ * file of a delegated inode starts a delegation return right away.
+ */
+static unsigned nfs_delegation_watermark = NFS_DEFAULT_DELEGATION_WATERMARK;
+
+/*
+ * Release a delegation structure: drop its credential reference and free
+ * the memory through kfree_rcu(), i.e. only after an RCU grace period.
+ */
+static void __nfs_free_delegation(struct nfs_delegation *delegation)
+{
+ put_cred(delegation->cred);
+ delegation->cred = NULL;
+ kfree_rcu(delegation, rcu);
+}
+
+/*
+ * Flag a delegation as revoked.  Only the first caller to set
+ * NFS_DELEGATION_REVOKED invalidates the stateid type and drops the
+ * active-delegation count; unless a return is already in progress it also
+ * clears the "verifier delegated" state of the inode.
+ */
+static void nfs_mark_delegation_revoked(struct nfs_delegation *delegation)
+{
+	/* already revoked: nothing left to account for */
+	if (test_and_set_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+		return;
+
+	delegation->stateid.type = NFS4_INVALID_STATEID_TYPE;
+	atomic_long_dec(&nfs_active_delegations);
+
+	if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+		return;
+	nfs_clear_verifier_delegated(delegation->inode);
+}
+
+/*
+ * Take an additional reference on @delegation and return it.
+ * Each reference taken here is dropped with nfs_put_delegation().
+ */
+static struct nfs_delegation *nfs_get_delegation(struct nfs_delegation *delegation)
+{
+ refcount_inc(&delegation->refcount);
+ return delegation;
+}
+
+/*
+ * Drop one reference on @delegation; the structure is freed when the last
+ * reference goes away.
+ */
+static void nfs_put_delegation(struct nfs_delegation *delegation)
+{
+ if (refcount_dec_and_test(&delegation->refcount))
+ __nfs_free_delegation(delegation);
+}
+
+/*
+ * Mark @delegation as revoked and drop one reference on it.
+ */
+static void nfs_free_delegation(struct nfs_delegation *delegation)
+{
+ nfs_mark_delegation_revoked(delegation);
+ nfs_put_delegation(delegation);
+}
+
+/**
+ * nfs_mark_delegation_referenced - set delegation's REFERENCED flag
+ * @delegation: delegation to process
+ *
+ * Delegations whose NFS_DELEGATION_REFERENCED flag is clear are the ones
+ * picked by nfs_expire_unreferenced_delegations() for return-if-closed.
+ */
+void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
+{
+ set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
+}
+
+/*
+ * Mark @delegation for return (NFS_DELEGATION_RETURN) and flag the owning
+ * client with NFS4CLNT_DELEGRETURN so that the return gets processed.
+ */
+static void nfs_mark_return_delegation(struct nfs_server *server,
+ struct nfs_delegation *delegation)
+{
+ set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+ set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+}
+
+/*
+ * A delegation is usable for @flags when it exists, its type covers every
+ * mode bit in @flags, and it is neither revoked nor in the process of
+ * being returned.
+ */
+static bool
+nfs4_is_valid_delegation(const struct nfs_delegation *delegation,
+		fmode_t flags)
+{
+	if (delegation == NULL)
+		return false;
+	if ((delegation->type & flags) != flags)
+		return false;
+	if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+		return false;
+	return !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
+}
+
+/*
+ * Return the inode's delegation if it is currently valid (any type),
+ * otherwise NULL.  The pointer is fetched with rcu_dereference() and no
+ * reference is taken, so this is to be called from within an RCU
+ * read-side critical section.
+ */
+struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode)
+{
+	struct nfs_delegation *deleg = rcu_dereference(NFS_I(inode)->delegation);
+
+	return nfs4_is_valid_delegation(deleg, 0) ? deleg : NULL;
+}
+
+/*
+ * Check whether @inode holds a valid delegation covering the read/write
+ * bits of @flags.  When @mark is true a matching delegation is also
+ * flagged as referenced.  Returns 1 if such a delegation exists, else 0.
+ */
+static int
+nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
+{
+	struct nfs_delegation *deleg;
+	int found = 0;
+
+	/* only the access mode bits are relevant to the delegation type */
+	flags &= FMODE_READ|FMODE_WRITE;
+
+	rcu_read_lock();
+	deleg = rcu_dereference(NFS_I(inode)->delegation);
+	if (!nfs4_is_valid_delegation(deleg, flags))
+		goto out_unlock;
+	found = 1;
+	if (mark)
+		nfs_mark_delegation_referenced(deleg);
+out_unlock:
+	rcu_read_unlock();
+	return found;
+}
+/**
+ * nfs4_have_delegation - check if inode has a delegation, mark it
+ * NFS_DELEGATION_REFERENCED if there is one.
+ * @inode: inode to check
+ * @flags: delegation types to check for
+ *
+ * Returns one if inode has the indicated delegation, otherwise zero.
+ */
+int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+{
+ return nfs4_do_check_delegation(inode, flags, true);
+}
+
+/*
+ * nfs4_check_delegation - check if inode has a delegation, do not mark
+ * NFS_DELEGATION_REFERENCED if it has one.
+ *
+ * Returns one if inode has the indicated delegation, otherwise zero.
+ */
+int nfs4_check_delegation(struct inode *inode, fmode_t flags)
+{
+ return nfs4_do_check_delegation(inode, flags, false);
+}
+
+/*
+ * Walk the inode's POSIX lock list and then its flock list and call
+ * nfs4_lock_delegation_recall() for every lock whose open context refers
+ * to @state.
+ *
+ * Returns 0 if there is no lock context or every call succeeded, otherwise
+ * the first negative status returned by nfs4_lock_delegation_recall().
+ */
+static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_stateid *stateid)
+{
+ struct inode *inode = state->inode;
+ struct file_lock *fl;
+ struct file_lock_context *flctx = inode->i_flctx;
+ struct list_head *list;
+ int status = 0;
+
+ if (flctx == NULL)
+ goto out;
+
+ list = &flctx->flc_posix;
+ spin_lock(&flctx->flc_lock);
+restart:
+ list_for_each_entry(fl, list, fl_list) {
+ if (nfs_file_open_context(fl->fl_file)->state != state)
+ continue;
+ /* flc_lock is dropped for the duration of the recall call */
+ spin_unlock(&flctx->flc_lock);
+ status = nfs4_lock_delegation_recall(fl, state, stateid);
+ /* on error we leave with flc_lock already released */
+ if (status < 0)
+ goto out;
+ spin_lock(&flctx->flc_lock);
+ }
+ /* second pass: same processing for the flock list */
+ if (list == &flctx->flc_posix) {
+ list = &flctx->flc_flock;
+ goto restart;
+ }
+ spin_unlock(&flctx->flc_lock);
+out:
+ return status;
+}
+
+/*
+ * For every open context of @inode whose state is flagged
+ * NFS_DELEGATED_STATE, has a valid open stateid and matches @stateid,
+ * recall the open and then the locks held under that state.
+ *
+ * The list scan is restarted from the top after each context has been
+ * processed, and ends once no matching context is left.
+ *
+ * Returns 0 on success, -EAGAIN if the owner's so_reclaim_seqcount changed
+ * while the recall was in progress, or the error returned by the recall
+ * helpers.  Note that @type is not used by this function.
+ */
+static int nfs_delegation_claim_opens(struct inode *inode,
+ const nfs4_stateid *stateid, fmode_t type)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_open_context *ctx;
+ struct nfs4_state_owner *sp;
+ struct nfs4_state *state;
+ unsigned int seq;
+ int err;
+
+again:
+ rcu_read_lock();
+ list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
+ state = ctx->state;
+ if (state == NULL)
+ continue;
+ if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
+ continue;
+ if (!nfs4_valid_open_stateid(state))
+ continue;
+ if (!nfs4_stateid_match(&state->stateid, stateid))
+ continue;
+ /* hold a reference on the context before leaving the RCU section */
+ if (!get_nfs_open_context(ctx))
+ continue;
+ rcu_read_unlock();
+ sp = state->owner;
+ /* Block nfs4_proc_unlck */
+ mutex_lock(&sp->so_delegreturn_mutex);
+ seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+ err = nfs4_open_delegation_recall(ctx, state, stateid);
+ if (!err)
+ err = nfs_delegation_claim_locks(state, stateid);
+ /* the seqcount moved while we were recalling: ask the caller to retry */
+ if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+ err = -EAGAIN;
+ mutex_unlock(&sp->so_delegreturn_mutex);
+ put_nfs_open_context(ctx);
+ if (err != 0)
+ return err;
+ goto again;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+/**
+ * nfs_inode_reclaim_delegation - process a delegation reclaim request
+ * @inode: inode to process
+ * @cred: credential to use for request
+ * @type: delegation type
+ * @stateid: delegation stateid
+ * @pagemod_limit: write delegation "space_limit"
+ *
+ * If the inode already has a delegation attached, it is updated in place
+ * with the new stateid, type, limit and credential; otherwise a new
+ * delegation is set up through nfs_inode_set_delegation().
+ */
+void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
+ fmode_t type, const nfs4_stateid *stateid,
+ unsigned long pagemod_limit)
+{
+ struct nfs_delegation *delegation;
+ const struct cred *oldcred = NULL;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation != NULL) {
+ spin_lock(&delegation->lock);
+ nfs4_stateid_copy(&delegation->stateid, stateid);
+ delegation->type = type;
+ delegation->pagemod_limit = pagemod_limit;
+ oldcred = delegation->cred;
+ delegation->cred = get_cred(cred);
+ clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+ /* a revoked delegation becomes active again: count it */
+ if (test_and_clear_bit(NFS_DELEGATION_REVOKED,
+ &delegation->flags))
+ atomic_long_inc(&nfs_active_delegations);
+ spin_unlock(&delegation->lock);
+ rcu_read_unlock();
+ /* the replaced credential is released outside the locks */
+ put_cred(oldcred);
+ trace_nfs4_reclaim_delegation(inode, type);
+ } else {
+ rcu_read_unlock();
+ nfs_inode_set_delegation(inode, cred, type, stateid,
+ pagemod_limit);
+ }
+}
+
+/*
+ * Send the delegreturn for @delegation unless it has been revoked.
+ *
+ * Returns 0 for a revoked delegation, otherwise the result of
+ * nfs4_proc_delegreturn().
+ */
+static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
+{
+	const struct cred *cred;
+	int status;
+
+	if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+		return 0;
+
+	/* take our own cred reference under the delegation lock */
+	spin_lock(&delegation->lock);
+	cred = get_cred(delegation->cred);
+	spin_unlock(&delegation->lock);
+
+	status = nfs4_proc_delegreturn(inode, cred, &delegation->stateid,
+				       issync);
+	put_cred(cred);
+	return status;
+}
+
+/*
+ * Try to take a reference on the inode the delegation is attached to.
+ *
+ * Returns the inode on success.  If the delegation has no inode, or
+ * igrab() fails, the delegation is flagged NFS_DELEGATION_INODE_FREEING
+ * and NULL is returned.
+ */
+static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation)
+{
+	struct inode *grabbed;
+
+	spin_lock(&delegation->lock);
+	grabbed = delegation->inode != NULL ? igrab(delegation->inode) : NULL;
+	if (grabbed == NULL)
+		set_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags);
+	spin_unlock(&delegation->lock);
+
+	return grabbed;
+}
+
+/*
+ * Begin returning the delegation attached to @nfsi.
+ *
+ * The delegation pointer is fetched with rcu_dereference(), so the caller
+ * is expected to be inside an RCU read-side critical section.
+ *
+ * Returns the delegation with an extra reference held if this caller was
+ * the one to set NFS_DELEGATION_RETURNING; returns NULL if there is no
+ * delegation or a return is already in progress.
+ */
+static struct nfs_delegation *
+nfs_start_delegation_return_locked(struct nfs_inode *nfsi)
+{
+ struct nfs_delegation *ret = NULL;
+ struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
+
+ if (delegation == NULL)
+ goto out;
+ spin_lock(&delegation->lock);
+ if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+ clear_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags);
+ /* Refcount matched in nfs_end_delegation_return() */
+ ret = nfs_get_delegation(delegation);
+ }
+ spin_unlock(&delegation->lock);
+ if (ret)
+ nfs_clear_verifier_delegated(&nfsi->vfs_inode);
+out:
+ return ret;
+}
+
+/*
+ * Same as nfs_start_delegation_return_locked(), but takes the RCU read
+ * lock around the call itself.
+ */
+static struct nfs_delegation *
+nfs_start_delegation_return(struct nfs_inode *nfsi)
+{
+ struct nfs_delegation *delegation;
+
+ rcu_read_lock();
+ delegation = nfs_start_delegation_return_locked(nfsi);
+ rcu_read_unlock();
+ return delegation;
+}
+
+/*
+ * Abandon a delegation return that was started: clears
+ * NFS_DELEGATION_RETURNING and, when @err is -EAGAIN, marks both the
+ * delegation and the client as having a delayed return pending.
+ */
+static void nfs_abort_delegation_return(struct nfs_delegation *delegation,
+ struct nfs_client *clp, int err)
+{
+
+ spin_lock(&delegation->lock);
+ clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
+ if (err == -EAGAIN) {
+ set_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags);
+ set_bit(NFS4CLNT_DELEGRETURN_DELAYED, &clp->cl_state);
+ }
+ spin_unlock(&delegation->lock);
+}
+
+/*
+ * Detach @delegation from @nfsi and from the server's delegation list.
+ * The caller must hold clp->cl_lock (checked through lockdep).
+ *
+ * Returns @delegation if it was the inode's current delegation and was
+ * still attached to an inode; returns NULL otherwise, in which case
+ * nothing is changed.
+ */
+static struct nfs_delegation *
+nfs_detach_delegation_locked(struct nfs_inode *nfsi,
+ struct nfs_delegation *delegation,
+ struct nfs_client *clp)
+{
+ struct nfs_delegation *deleg_cur =
+ rcu_dereference_protected(nfsi->delegation,
+ lockdep_is_held(&clp->cl_lock));
+
+ if (deleg_cur == NULL || delegation != deleg_cur)
+ return NULL;
+
+ spin_lock(&delegation->lock);
+ /* no inode attached any more: treat as already detached */
+ if (!delegation->inode) {
+ spin_unlock(&delegation->lock);
+ return NULL;
+ }
+ list_del_rcu(&delegation->super_list);
+ delegation->inode = NULL;
+ rcu_assign_pointer(nfsi->delegation, NULL);
+ spin_unlock(&delegation->lock);
+ return delegation;
+}
+
+/*
+ * Wrapper around nfs_detach_delegation_locked() that takes the client's
+ * cl_lock for the duration of the call.  Returns the detached delegation,
+ * or NULL if nothing was detached.
+ */
+static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi,
+ struct nfs_delegation *delegation,
+ struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+
+ spin_lock(&clp->cl_lock);
+ delegation = nfs_detach_delegation_locked(nfsi, delegation, clp);
+ spin_unlock(&clp->cl_lock);
+ return delegation;
+}
+
+/*
+ * Detach whatever delegation is currently attached to @inode.
+ * Returns the detached delegation, or NULL if there was none or it could
+ * not be detached.
+ */
+static struct nfs_delegation *
+nfs_inode_detach_delegation(struct inode *inode)
+{
+	struct nfs_server *srv = NFS_SERVER(inode);
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_delegation *deleg;
+
+	rcu_read_lock();
+	deleg = rcu_dereference(nfsi->delegation);
+	if (deleg)
+		deleg = nfs_detach_delegation(nfsi, deleg, srv);
+	rcu_read_unlock();
+
+	return deleg;
+}
+
+/*
+ * Replace the delegation's credential with @cred when cred_fscmp()
+ * reports that the two differ.  The new reference is swapped in with
+ * xchg() and the old one is dropped afterwards.
+ */
+static void
+nfs_update_delegation_cred(struct nfs_delegation *delegation,
+		const struct cred *cred)
+{
+	const struct cred *prev;
+
+	if (cred_fscmp(delegation->cred, cred) == 0)
+		return;
+
+	prev = xchg(&delegation->cred, get_cred(cred));
+	put_cred(prev);
+}
+
+/*
+ * Update an existing delegation from @update, but only if @update carries
+ * a newer stateid.  The seqid, type and pagemod limit are always copied;
+ * if the existing delegation was revoked, its change attribute and
+ * credential are refreshed as well and it is made active again.
+ */
+static void
+nfs_update_inplace_delegation(struct nfs_delegation *delegation,
+ const struct nfs_delegation *update)
+{
+ if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) {
+ delegation->stateid.seqid = update->stateid.seqid;
+ /* write barrier between the seqid store and the stores below */
+ smp_wmb();
+ delegation->type = update->type;
+ delegation->pagemod_limit = update->pagemod_limit;
+ if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+ delegation->change_attr = update->change_attr;
+ nfs_update_delegation_cred(delegation, update->cred);
+ /* smp_mb__before_atomic() is implicit due to xchg() */
+ clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
+ atomic_long_inc(&nfs_active_delegations);
+ }
+ }
+}
+
+/**
+ * nfs_inode_set_delegation - set up a delegation on an inode
+ * @inode: inode to which delegation applies
+ * @cred: cred to use for subsequent delegation processing
+ * @type: delegation type
+ * @stateid: delegation stateid
+ * @pagemod_limit: write delegation "space_limit"
+ *
+ * Allocates a new delegation and either installs it on the inode, folds
+ * it into the existing delegation (same stateid "other" field), or
+ * discards it (duplicate that is not an upgrade to a WRITE delegation).
+ *
+ * Returns zero on success, or a negative errno value. The only error
+ * produced here is -ENOMEM from the allocation.
+ */
+int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
+ fmode_t type,
+ const nfs4_stateid *stateid,
+ unsigned long pagemod_limit)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_delegation *delegation, *old_delegation;
+ struct nfs_delegation *freeme = NULL;
+ int status = 0;
+
+ delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
+ if (delegation == NULL)
+ return -ENOMEM;
+ /* kmalloc() does not zero the memory: initialise the fields by hand */
+ nfs4_stateid_copy(&delegation->stateid, stateid);
+ refcount_set(&delegation->refcount, 1);
+ delegation->type = type;
+ delegation->pagemod_limit = pagemod_limit;
+ delegation->change_attr = inode_peek_iversion_raw(inode);
+ delegation->cred = get_cred(cred);
+ delegation->inode = inode;
+ delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
+ spin_lock_init(&delegation->lock);
+
+ spin_lock(&clp->cl_lock);
+ old_delegation = rcu_dereference_protected(nfsi->delegation,
+ lockdep_is_held(&clp->cl_lock));
+ if (old_delegation == NULL)
+ goto add_new;
+ /* Is this an update of the existing delegation? */
+ if (nfs4_stateid_match_other(&old_delegation->stateid,
+ &delegation->stateid)) {
+ spin_lock(&old_delegation->lock);
+ nfs_update_inplace_delegation(old_delegation,
+ delegation);
+ spin_unlock(&old_delegation->lock);
+ /* the new structure is no longer needed; freed at "out" */
+ goto out;
+ }
+ if (!test_bit(NFS_DELEGATION_REVOKED, &old_delegation->flags)) {
+ /*
+ * Deal with broken servers that hand out two
+ * delegations for the same file.
+ * Allow for upgrades to a WRITE delegation, but
+ * nothing else.
+ */
+ dfprintk(FILE, "%s: server %s handed out "
+ "a duplicate delegation!\n",
+ __func__, clp->cl_hostname);
+ if (delegation->type == old_delegation->type ||
+ !(delegation->type & FMODE_WRITE)) {
+ /* not an upgrade: return the new delegation and drop it */
+ freeme = delegation;
+ delegation = NULL;
+ goto out;
+ }
+ /* someone else is already returning the old delegation */
+ if (test_and_set_bit(NFS_DELEGATION_RETURNING,
+ &old_delegation->flags))
+ goto out;
+ }
+ /* replace the old delegation; it is returned and freed at "out" */
+ freeme = nfs_detach_delegation_locked(nfsi, old_delegation, clp);
+ if (freeme == NULL)
+ goto out;
+add_new:
+ list_add_tail_rcu(&delegation->super_list, &server->delegations);
+ rcu_assign_pointer(nfsi->delegation, delegation);
+ /* ownership passed to the inode: do not free at "out" */
+ delegation = NULL;
+
+ atomic_long_inc(&nfs_active_delegations);
+
+ trace_nfs4_set_delegation(inode, type);
+
+ spin_lock(&inode->i_lock);
+ if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME))
+ NFS_I(inode)->cache_validity |= NFS_INO_REVAL_FORCED;
+ spin_unlock(&inode->i_lock);
+out:
+ spin_unlock(&clp->cl_lock);
+ if (delegation != NULL)
+ __nfs_free_delegation(delegation);
+ if (freeme != NULL) {
+ /*
+ * NOTE(review): when freeme is the freshly allocated duplicate
+ * it was never counted in nfs_active_delegations, yet
+ * nfs_free_delegation() marks it revoked, which decrements that
+ * counter - confirm the accounting is intended.
+ */
+ nfs_do_return_delegation(inode, freeme, 0);
+ nfs_free_delegation(freeme);
+ }
+ return status;
+}
+
+/*
+ * Basic procedure for returning a delegation to the server
+ *
+ * Reclaims the opens (and locks) that depend on the delegation and then
+ * sends the delegreturn.  A NULL @delegation is accepted and yields 0.
+ * When @issync is set, an -EAGAIN from the reclaim step is retried after
+ * waiting for client recovery.  If the reclaim step fails, the return is
+ * aborted instead.  In all cases the reference taken when the return was
+ * started is dropped before returning.
+ */
+static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync)
+{
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+ int err = 0;
+
+ if (delegation == NULL)
+ return 0;
+ do {
+ /* nothing to reclaim for a revoked delegation */
+ if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ break;
+ err = nfs_delegation_claim_opens(inode, &delegation->stateid,
+ delegation->type);
+ if (!issync || err != -EAGAIN)
+ break;
+ /*
+ * Guard against state recovery
+ */
+ err = nfs4_wait_clnt_recover(clp);
+ } while (err == 0);
+
+ if (err) {
+ nfs_abort_delegation_return(delegation, clp, err);
+ goto out;
+ }
+
+ err = nfs_do_return_delegation(inode, delegation, issync);
+out:
+ /* Refcount matched in nfs_start_delegation_return_locked() */
+ nfs_put_delegation(delegation);
+ return err;
+}
+
+/*
+ * Decide whether @delegation should be returned now.
+ *
+ * True if NFS_DELEGATION_RETURN was set (the flag is consumed here), or
+ * if NFS_DELEGATION_RETURN_IF_CLOSED is set and the inode has no open
+ * files left.  The answer is forced to false while a return is already in
+ * progress, has been delayed, or the delegation is revoked.
+ */
+static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
+{
+ bool ret = false;
+
+ if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
+ ret = true;
+ else if (test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags)) {
+ struct inode *inode;
+
+ spin_lock(&delegation->lock);
+ inode = delegation->inode;
+ if (inode && list_empty(&NFS_I(inode)->open_files))
+ ret = true;
+ spin_unlock(&delegation->lock);
+ }
+ /* once a return has been decided, the if-closed request is consumed */
+ if (ret)
+ clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
+ if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags) ||
+ test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ ret = false;
+
+ return ret;
+}
+
+/*
+ * Return every delegation on @server that nfs_delegation_need_return()
+ * selects.  @data is unused.
+ *
+ * The list is walked under the RCU read lock; each time a delegation has
+ * to be returned the lock is dropped, the return is performed, and the
+ * walk is restarted.
+ *
+ * Returns 0 once the walk completes without error.  On the first failing
+ * return, NFS4CLNT_DELEGRETURN is set again on the client and the error
+ * is returned.
+ */
+static int nfs_server_return_marked_delegations(struct nfs_server *server,
+		void __always_unused *data)
+{
+	struct nfs_delegation *delegation;
+	struct nfs_delegation *prev;
+	struct inode *inode;
+	struct inode *place_holder = NULL;
+	struct nfs_delegation *place_holder_deleg = NULL;
+	int err = 0;
+
+restart:
+	/*
+	 * To avoid quadratic looping we hold a reference
+	 * to an inode place_holder.  Each time we restart, we
+	 * resume the walk of the server's delegation list from the
+	 * delegation of that inode.
+	 * prev is an RCU-protected pointer to a delegation which
+	 * wasn't marked for return and might be a good choice for
+	 * the next place_holder.
+	 */
+	prev = NULL;
+	delegation = NULL;
+	rcu_read_lock();
+	if (place_holder)
+		delegation = rcu_dereference(NFS_I(place_holder)->delegation);
+	/* place holder no longer usable: start from the head of the list */
+	if (!delegation || delegation != place_holder_deleg)
+		delegation = list_entry_rcu(server->delegations.next,
+					    struct nfs_delegation, super_list);
+	list_for_each_entry_from_rcu(delegation, &server->delegations, super_list) {
+		struct inode *to_put = NULL;
+
+		if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags))
+			continue;
+		if (!nfs_delegation_need_return(delegation)) {
+			if (nfs4_is_valid_delegation(delegation, 0))
+				prev = delegation;
+			continue;
+		}
+
+		if (prev) {
+			struct inode *tmp = nfs_delegation_grab_inode(prev);
+			if (tmp) {
+				/* old place holder is released outside RCU */
+				to_put = place_holder;
+				place_holder = tmp;
+				place_holder_deleg = prev;
+			}
+		}
+
+		inode = nfs_delegation_grab_inode(delegation);
+		if (inode == NULL) {
+			rcu_read_unlock();
+			iput(to_put);
+			/*
+			 * Give other tasks a chance to run before rescanning:
+			 * a long run of delegations whose inodes are being
+			 * freed would otherwise restart back to back.
+			 */
+			cond_resched();
+			goto restart;
+		}
+		delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+		rcu_read_unlock();
+
+		iput(to_put);
+
+		err = nfs_end_delegation_return(inode, delegation, 0);
+		iput(inode);
+		cond_resched();
+		if (!err)
+			goto restart;
+		/* leave the client flagged so that the return is retried */
+		set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+		goto out;
+	}
+	rcu_read_unlock();
+out:
+	iput(place_holder);
+	return err;
+}
+
+/*
+ * Re-arm every delegation on @server whose return was delayed: mark it
+ * for return again and clear NFS_DELEGATION_RETURN_DELAYED.
+ * Returns true if at least one delegation was re-armed.
+ */
+static bool nfs_server_clear_delayed_delegations(struct nfs_server *server)
+{
+	struct nfs_delegation *deleg;
+	bool found = false;
+
+	list_for_each_entry_rcu (deleg, &server->delegations, super_list) {
+		if (test_bit(NFS_DELEGATION_RETURN_DELAYED, &deleg->flags)) {
+			nfs_mark_return_delegation(server, deleg);
+			clear_bit(NFS_DELEGATION_RETURN_DELAYED, &deleg->flags);
+			found = true;
+		}
+	}
+	return found;
+}
+
+/*
+ * If the client was flagged NFS4CLNT_DELEGRETURN_DELAYED, clear the flag
+ * and re-arm the delayed delegations of all of its superblocks.
+ * Returns true if any delegation was re-armed.
+ */
+static bool nfs_client_clear_delayed_delegations(struct nfs_client *clp)
+{
+	struct nfs_server *server;
+	bool found = false;
+
+	if (!test_and_clear_bit(NFS4CLNT_DELEGRETURN_DELAYED, &clp->cl_state))
+		return false;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu (server, &clp->cl_superblocks, client_link)
+		found |= nfs_server_clear_delayed_delegations(server);
+	rcu_read_unlock();
+
+	return found;
+}
+
+/**
+ * nfs_client_return_marked_delegations - return previously marked delegations
+ * @clp: nfs_client to process
+ *
+ * Note that this function is designed to be called by the state
+ * manager thread. For this reason, it cannot flush the dirty data,
+ * since that could deadlock in case of a state recovery error.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs_client_return_marked_delegations(struct nfs_client *clp)
+{
+ int err = nfs_client_for_each_server(
+ clp, nfs_server_return_marked_delegations, NULL);
+ if (err)
+ return err;
+ /* If a return was delayed, sleep to prevent hard looping */
+ if (nfs_client_clear_delayed_delegations(clp))
+ ssleep(1);
+ return 0;
+}
+
+/**
+ * nfs_inode_evict_delegation - return delegation, don't reclaim opens
+ * @inode: inode to process
+ *
+ * Does not protect against delegation reclaims, therefore really only safe
+ * to be called from nfs4_clear_inode(). Guaranteed to always free
+ * the delegation structure.
+ */
+void nfs_inode_evict_delegation(struct inode *inode)
+{
+	struct nfs_delegation *deleg = nfs_inode_detach_delegation(inode);
+
+	if (deleg == NULL)
+		return;
+
+	set_bit(NFS_DELEGATION_RETURNING, &deleg->flags);
+	set_bit(NFS_DELEGATION_INODE_FREEING, &deleg->flags);
+	/* synchronous return, then drop the detached structure */
+	nfs_do_return_delegation(inode, deleg, 1);
+	nfs_free_delegation(deleg);
+}
+
+/**
+ * nfs4_inode_return_delegation - synchronously return a delegation
+ * @inode: inode to process
+ *
+ * This routine will always flush any dirty data to disk on the
+ * assumption that if we need to return the delegation, then
+ * we should stop caching.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs4_inode_return_delegation(struct inode *inode)
+{
+	struct nfs_delegation *deleg;
+
+	nfs_wb_all(inode);
+
+	deleg = nfs_start_delegation_return(NFS_I(inode));
+	if (deleg == NULL)
+		return 0;
+	return nfs_end_delegation_return(inode, deleg, 1);
+}
+
+/**
+ * nfs4_inode_return_delegation_on_close - asynchronously return a delegation
+ * @inode: inode to process
+ *
+ * This routine is called on file close in order to determine if the
+ * inode delegation needs to be returned immediately.
+ *
+ * A return is started when the delegation is flagged
+ * NFS_DELEGATION_RETURN_IF_CLOSED or the number of active delegations has
+ * reached the watermark, provided the inode has no open files left and no
+ * return is already in progress.
+ */
+void nfs4_inode_return_delegation_on_close(struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+ struct nfs_delegation *ret = NULL;
+
+ if (!inode)
+ return;
+ rcu_read_lock();
+ delegation = nfs4_get_valid_delegation(inode);
+ if (!delegation)
+ goto out;
+ if (test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) ||
+ atomic_long_read(&nfs_active_delegations) >= nfs_delegation_watermark) {
+ spin_lock(&delegation->lock);
+ if (delegation->inode &&
+ list_empty(&NFS_I(inode)->open_files) &&
+ !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+ clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
+ /* Refcount matched in nfs_end_delegation_return() */
+ ret = nfs_get_delegation(delegation);
+ }
+ spin_unlock(&delegation->lock);
+ if (ret)
+ nfs_clear_verifier_delegated(inode);
+ }
+out:
+ rcu_read_unlock();
+ /* ret may be NULL here; nfs_end_delegation_return() accepts that */
+ nfs_end_delegation_return(inode, ret, 0);
+}
+
+/**
+ * nfs4_inode_make_writeable
+ * @inode: pointer to inode
+ *
+ * Make the inode writeable by returning the delegation if necessary.
+ * Nothing is returned when the inode has no valid delegation, or when the
+ * client uses sessions and the delegation already is a write delegation.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs4_inode_make_writeable(struct inode *inode)
+{
+	struct nfs_delegation *deleg;
+	bool need_return;
+
+	rcu_read_lock();
+	deleg = nfs4_get_valid_delegation(inode);
+	need_return = deleg != NULL &&
+		!(nfs4_has_session(NFS_SERVER(inode)->nfs_client) &&
+		  (deleg->type & FMODE_WRITE));
+	rcu_read_unlock();
+
+	if (!need_return)
+		return 0;
+	return nfs4_inode_return_delegation(inode);
+}
+
+/*
+ * Flag @delegation NFS_DELEGATION_RETURN_IF_CLOSED and flag the owning
+ * client with NFS4CLNT_DELEGRETURN.
+ */
+static void nfs_mark_return_if_closed_delegation(struct nfs_server *server,
+ struct nfs_delegation *delegation)
+{
+ set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
+ set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+}
+
+/*
+ * Mark every delegation on @server for return.  The list is walked with
+ * list_for_each_entry_rcu(); the callers in this file hold the RCU read
+ * lock around the call.
+ * Returns true if the server had at least one delegation.
+ */
+static bool nfs_server_mark_return_all_delegations(struct nfs_server *server)
+{
+ struct nfs_delegation *delegation;
+ bool ret = false;
+
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ nfs_mark_return_delegation(server, delegation);
+ ret = true;
+ }
+ return ret;
+}
+
+/*
+ * Mark for return every delegation of every superblock attached to @clp.
+ */
+static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs_server_mark_return_all_delegations(server);
+ rcu_read_unlock();
+}
+
+/*
+ * Schedule the state manager for @clp, but only if the client is flagged
+ * NFS4CLNT_DELEGRETURN.
+ */
+static void nfs_delegation_run_state_manager(struct nfs_client *clp)
+{
+ if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
+ nfs4_schedule_state_manager(clp);
+}
+
+/**
+ * nfs_expire_all_delegations
+ * @clp: client to process
+ *
+ * Marks all of the client's delegations for return and schedules the
+ * state manager if any delegation return is pending.
+ */
+void nfs_expire_all_delegations(struct nfs_client *clp)
+{
+ nfs_client_mark_return_all_delegations(clp);
+ nfs_delegation_run_state_manager(clp);
+}
+
+/**
+ * nfs_server_return_all_delegations - return delegations for one superblock
+ * @server: pointer to nfs_server to process
+ *
+ * Marks every delegation of @server for return; if there was at least one,
+ * schedules the state manager and waits for client recovery to finish.
+ */
+void nfs_server_return_all_delegations(struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+	bool marked;
+
+	if (!clp)
+		return;
+
+	rcu_read_lock();
+	marked = nfs_server_mark_return_all_delegations(server);
+	rcu_read_unlock();
+
+	if (!marked)
+		return;
+	nfs4_schedule_state_manager(clp);
+	nfs4_wait_clnt_recover(clp);
+}
+
+/*
+ * Flag return-if-closed on each delegation of @server whose type shares
+ * a mode bit with @flags.  Read+write delegations are left alone unless
+ * @flags includes FMODE_WRITE.
+ */
+static void nfs_mark_return_unused_delegation_types(struct nfs_server *server,
+		fmode_t flags)
+{
+	struct nfs_delegation *deleg;
+
+	list_for_each_entry_rcu(deleg, &server->delegations, super_list) {
+		bool is_rw = deleg->type == (FMODE_READ|FMODE_WRITE);
+
+		if (is_rw && !(flags & FMODE_WRITE))
+			continue;
+		if (!(deleg->type & flags))
+			continue;
+		nfs_mark_return_if_closed_delegation(server, deleg);
+	}
+}
+
+/*
+ * Apply nfs_mark_return_unused_delegation_types() to every superblock of
+ * @clp, under the RCU read lock.
+ */
+static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *clp,
+ fmode_t flags)
+{
+ struct nfs_server *server;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs_mark_return_unused_delegation_types(server, flags);
+ rcu_read_unlock();
+}
+
+/*
+ * Mark the inode's delegation as revoked and kick off state recovery for
+ * the affected stateid.
+ *
+ * With a NULL @stateid the delegation's current stateid is used.
+ * Otherwise the delegation is only revoked if @stateid refers to it (same
+ * "other" field) and the delegation's own stateid is not newer.
+ */
+static void nfs_revoke_delegation(struct inode *inode,
+ const nfs4_stateid *stateid)
+{
+ struct nfs_delegation *delegation;
+ nfs4_stateid tmp;
+ bool ret = false;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation == NULL)
+ goto out;
+ if (stateid == NULL) {
+ /* keep a private copy: it is used after leaving the RCU section */
+ nfs4_stateid_copy(&tmp, &delegation->stateid);
+ stateid = &tmp;
+ } else {
+ if (!nfs4_stateid_match_other(stateid, &delegation->stateid))
+ goto out;
+ spin_lock(&delegation->lock);
+ /* a zero seqid skips the ordering check */
+ if (stateid->seqid) {
+ if (nfs4_stateid_is_newer(&delegation->stateid, stateid)) {
+ spin_unlock(&delegation->lock);
+ goto out;
+ }
+ delegation->stateid.seqid = stateid->seqid;
+ }
+ spin_unlock(&delegation->lock);
+ }
+ nfs_mark_delegation_revoked(delegation);
+ ret = true;
+out:
+ rcu_read_unlock();
+ if (ret)
+ nfs_inode_find_state_and_recover(inode, stateid);
+}
+
+/*
+ * Exported entry point: revoke the delegation identified by @stateid on
+ * @inode (see nfs_revoke_delegation()).
+ */
+void nfs_remove_bad_delegation(struct inode *inode,
+ const nfs4_stateid *stateid)
+{
+ nfs_revoke_delegation(inode, stateid);
+}
+EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation);
+
+/*
+ * Record that the delegation identified by @stateid has been returned.
+ *
+ * If @stateid matches the inode's delegation and is not older than it,
+ * the delegation is marked revoked; in either of those cases
+ * NFS_DELEGATION_RETURNING is cleared.  State recovery for @stateid is
+ * requested whenever @inode is not NULL.
+ */
+void nfs_delegation_mark_returned(struct inode *inode,
+ const nfs4_stateid *stateid)
+{
+ struct nfs_delegation *delegation;
+
+ if (!inode)
+ return;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (!delegation)
+ goto out_rcu_unlock;
+
+ spin_lock(&delegation->lock);
+ if (!nfs4_stateid_match_other(stateid, &delegation->stateid))
+ goto out_spin_unlock;
+ if (stateid->seqid) {
+ /* If delegation->stateid is newer, don't mark as returned */
+ if (nfs4_stateid_is_newer(&delegation->stateid, stateid))
+ goto out_clear_returning;
+ if (delegation->stateid.seqid != stateid->seqid)
+ delegation->stateid.seqid = stateid->seqid;
+ }
+
+ nfs_mark_delegation_revoked(delegation);
+
+out_clear_returning:
+ clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
+out_spin_unlock:
+ spin_unlock(&delegation->lock);
+out_rcu_unlock:
+ rcu_read_unlock();
+
+ nfs_inode_find_state_and_recover(inode, stateid);
+}
+
+/**
+ * nfs_expire_unused_delegation_types
+ * @clp: client to process
+ * @flags: delegation types to expire
+ *
+ * Flags the matching delegations return-if-closed and schedules the state
+ * manager if a delegation return is pending.
+ */
+void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags)
+{
+ nfs_client_mark_return_unused_delegation_types(clp, flags);
+ nfs_delegation_run_state_manager(clp);
+}
+
+/*
+ * Walk the server's delegations: those that had NFS_DELEGATION_REFERENCED
+ * set merely lose the flag, all the others are flagged return-if-closed.
+ */
+static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
+{
+	struct nfs_delegation *deleg;
+
+	list_for_each_entry_rcu(deleg, &server->delegations, super_list) {
+		if (!test_and_clear_bit(NFS_DELEGATION_REFERENCED, &deleg->flags))
+			nfs_mark_return_if_closed_delegation(server, deleg);
+	}
+}
+
+/**
+ * nfs_expire_unreferenced_delegations - Eliminate unused delegations
+ * @clp: nfs_client to process
+ *
+ * Delegations that were not referenced since the previous pass are
+ * flagged return-if-closed; the state manager is then scheduled if a
+ * delegation return is pending.
+ */
+void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs_mark_return_unreferenced_delegations(server);
+ rcu_read_unlock();
+
+ nfs_delegation_run_state_manager(clp);
+}
+
+/**
+ * nfs_async_inode_return_delegation - asynchronously return a delegation
+ * @inode: inode to process
+ * @stateid: state ID information
+ *
+ * Marks the inode's delegation for return and schedules the state manager.
+ * When @stateid is not NULL it has to match the delegation's stateid.
+ *
+ * Returns zero on success, or -ENOENT if the inode has no valid
+ * delegation or @stateid does not match it.
+ */
+int nfs_async_inode_return_delegation(struct inode *inode,
+		const nfs4_stateid *stateid)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs_delegation *deleg;
+	bool marked = false;
+
+	rcu_read_lock();
+	deleg = nfs4_get_valid_delegation(inode);
+	if (deleg != NULL &&
+	    (stateid == NULL ||
+	     clp->cl_mvops->match_stateid(&deleg->stateid, stateid))) {
+		nfs_mark_return_delegation(server, deleg);
+		marked = true;
+	}
+	rcu_read_unlock();
+
+	if (!marked)
+		return -ENOENT;
+
+	nfs_delegation_run_state_manager(clp);
+	return 0;
+}
+
+/*
+ * Look through the delegations of @server for a non-revoked one whose
+ * inode has the file handle @fhandle.
+ *
+ * The list is walked with list_for_each_entry_rcu() and the RCU read lock
+ * is dropped and retaken on one path below, so the caller must hold the
+ * RCU read lock on entry.
+ *
+ * Returns the inode with a reference held if it could be grabbed (the
+ * superblock activated through nfs_sb_active() is then left active),
+ * ERR_PTR(-EAGAIN) if a matching delegation was found but the superblock
+ * or inode could not be pinned, or ERR_PTR(-ENOENT) if nothing matched.
+ */
+static struct inode *
+nfs_delegation_find_inode_server(struct nfs_server *server,
+ const struct nfs_fh *fhandle)
+{
+ struct nfs_delegation *delegation;
+ struct super_block *freeme = NULL;
+ struct inode *res = NULL;
+
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ spin_lock(&delegation->lock);
+ if (delegation->inode != NULL &&
+ !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) &&
+ nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
+ if (nfs_sb_active(server->super)) {
+ freeme = server->super;
+ res = igrab(delegation->inode);
+ }
+ spin_unlock(&delegation->lock);
+ if (res != NULL)
+ return res;
+ /* igrab() failed: undo nfs_sb_active() outside the RCU section */
+ if (freeme) {
+ rcu_read_unlock();
+ nfs_sb_deactive(freeme);
+ rcu_read_lock();
+ }
+ return ERR_PTR(-EAGAIN);
+ }
+ spin_unlock(&delegation->lock);
+ }
+ return ERR_PTR(-ENOENT);
+}
+
+/**
+ * nfs_delegation_find_inode - retrieve the inode associated with a delegation
+ * @clp: client state handle
+ * @fhandle: filehandle from a delegation recall
+ *
+ * Returns pointer to inode matching "fhandle," or ERR_PTR(-ENOENT) if a
+ * matching inode cannot be found. Any other result of the per-server
+ * lookup (such as ERR_PTR(-EAGAIN)) is passed through unchanged.
+ */
+struct inode *nfs_delegation_find_inode(struct nfs_client *clp,
+ const struct nfs_fh *fhandle)
+{
+ struct nfs_server *server;
+ struct inode *res;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ res = nfs_delegation_find_inode_server(server, fhandle);
+ /* stop at the first server that gives a definite answer */
+ if (res != ERR_PTR(-ENOENT)) {
+ rcu_read_unlock();
+ return res;
+ }
+ }
+ rcu_read_unlock();
+ return ERR_PTR(-ENOENT);
+}
+
+/*
+ * Flag every delegation on @server as needing reclaim, except those
+ * flagged NFS_DELEGATION_TEST_EXPIRED.
+ */
+static void nfs_delegation_mark_reclaim_server(struct nfs_server *server)
+{
+ struct nfs_delegation *delegation;
+
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ /*
+ * If the delegation may have been admin revoked, then we
+ * cannot reclaim it.
+ */
+ if (test_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags))
+ continue;
+ set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+ }
+}
+
+/**
+ * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed
+ * @clp: nfs_client to process
+ *
+ * Walks every superblock of @clp under the RCU read lock. Delegations
+ * flagged NFS_DELEGATION_TEST_EXPIRED are left unmarked.
+ */
+void nfs_delegation_mark_reclaim(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs_delegation_mark_reclaim_server(server);
+ rcu_read_unlock();
+}
+
+/*
+ * Detach and free every delegation on @server that is still flagged
+ * NFS_DELEGATION_NEED_RECLAIM, skipping those whose inode is being freed
+ * or whose return is already in progress.  @data is unused.
+ *
+ * The list is walked under the RCU read lock; after each delegation that
+ * has been processed the lock is dropped and the walk starts over.
+ * Always returns 0.
+ */
+static int nfs_server_reap_unclaimed_delegations(struct nfs_server *server,
+		void __always_unused *data)
+{
+	struct nfs_delegation *delegation;
+	struct inode *inode;
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+		if (test_bit(NFS_DELEGATION_INODE_FREEING,
+					&delegation->flags) ||
+		    test_bit(NFS_DELEGATION_RETURNING,
+					&delegation->flags) ||
+		    test_bit(NFS_DELEGATION_NEED_RECLAIM,
+					&delegation->flags) == 0)
+			continue;
+		inode = nfs_delegation_grab_inode(delegation);
+		/*
+		 * A failed grab has flagged the entry
+		 * NFS_DELEGATION_INODE_FREEING, so a rescan from the list
+		 * head would only skip it again.  Keep walking instead of
+		 * restarting while still holding the RCU read lock.
+		 */
+		if (inode == NULL)
+			continue;
+		delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+		rcu_read_unlock();
+		if (delegation != NULL) {
+			if (nfs_detach_delegation(NFS_I(inode), delegation,
+						server) != NULL)
+				nfs_free_delegation(delegation);
+			/* Match nfs_start_delegation_return_locked */
+			nfs_put_delegation(delegation);
+		}
+		iput(inode);
+		cond_resched();
+		goto restart;
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
+/**
+ * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
+ * @clp: nfs_client to process
+ *
+ * Runs nfs_server_reap_unclaimed_delegations() on each server of @clp via
+ * nfs_client_for_each_server(); the callback's data argument is NULL.
+ */
+void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
+{
+ nfs_client_for_each_server(clp, nfs_server_reap_unclaimed_delegations,
+ NULL);
+}
+
+/*
+ * Report whether any of the CHECK_LEASE, LEASE_EXPIRED or SESSION_RESET
+ * state bits is currently set in @clp->cl_state.
+ */
+static inline bool nfs4_server_rebooted(const struct nfs_client *clp)
+{
+	const unsigned long reboot_bits = BIT(NFS4CLNT_CHECK_LEASE) |
+					  BIT(NFS4CLNT_LEASE_EXPIRED) |
+					  BIT(NFS4CLNT_SESSION_RESET);
+
+	return (clp->cl_state & reboot_bits) != 0;
+}
+
+/*
+ * Move @delegation from "needs reclaim" to "needs an expiry test" and record
+ * on the owning client that such a delegation exists. Delegations whose
+ * stateid type is NFS4_INVALID_STATEID_TYPE are left alone.
+ */
+static void nfs_mark_test_expired_delegation(struct nfs_server *server,
+					     struct nfs_delegation *delegation)
+{
+	if (delegation->stateid.type != NFS4_INVALID_STATEID_TYPE) {
+		unsigned long *dflags = &delegation->flags;
+
+		clear_bit(NFS_DELEGATION_NEED_RECLAIM, dflags);
+		set_bit(NFS_DELEGATION_TEST_EXPIRED, dflags);
+		set_bit(NFS4CLNT_DELEGATION_EXPIRED,
+			&server->nfs_client->cl_state);
+	}
+}
+
+/*
+ * Mark the delegation currently attached to @inode, if there is one, as
+ * needing an expiry test. The pointer is read under the RCU read lock.
+ */
+static void nfs_inode_mark_test_expired_delegation(struct nfs_server *server,
+						   struct inode *inode)
+{
+	struct nfs_delegation *deleg;
+
+	rcu_read_lock();
+	deleg = rcu_dereference(NFS_I(inode)->delegation);
+	if (deleg != NULL)
+		nfs_mark_test_expired_delegation(server, deleg);
+	rcu_read_unlock();
+}
+
+/* Mark each delegation on @server's list as needing an expiry test. */
+static void nfs_delegation_mark_test_expired_server(struct nfs_server *server)
+{
+	struct nfs_delegation *deleg;
+
+	list_for_each_entry_rcu(deleg, &server->delegations, super_list) {
+		nfs_mark_test_expired_delegation(server, deleg);
+	}
+}
+
+/**
+ * nfs_mark_test_expired_all_delegations - mark all delegations for testing
+ * @clp: nfs_client to process
+ *
+ * Iterates, under the RCU read lock, through every server attached to
+ * this client and marks each of its delegations as needing to be checked
+ * for validity.
+ */
+void nfs_mark_test_expired_all_delegations(struct nfs_client *clp)
+{
+	struct nfs_server *srv;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(srv, &clp->cl_superblocks, client_link) {
+		nfs_delegation_mark_test_expired_server(srv);
+	}
+	rcu_read_unlock();
+}
+
+/**
+ * nfs_test_expired_all_delegations - test all delegations for a client
+ * @clp: nfs_client to process
+ *
+ * Helper for handling "recallable state revoked" status from server.
+ * Marks every delegation of @clp for testing, then schedules the state
+ * manager for @clp.
+ */
+void nfs_test_expired_all_delegations(struct nfs_client *clp)
+{
+ nfs_mark_test_expired_all_delegations(clp);
+ nfs4_schedule_state_manager(clp);
+}
+
+/*
+ * Ask the server about @stateid through the minor-version
+ * test_and_free_expired operation. If the answer is -NFS4ERR_EXPIRED or
+ * -NFS4ERR_BAD_STATEID, the delegation is removed from @inode as bad.
+ * Nothing is done when @cred is NULL.
+ */
+static void
+nfs_delegation_test_free_expired(struct inode *inode,
+		nfs4_stateid *stateid,
+		const struct cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	const struct nfs4_minor_version_ops *mvops = server->nfs_client->cl_mvops;
+
+	if (cred == NULL)
+		return;
+
+	switch (mvops->test_and_free_expired(server, stateid, cred)) {
+	case -NFS4ERR_EXPIRED:
+	case -NFS4ERR_BAD_STATEID:
+		nfs_remove_bad_delegation(inode, stateid);
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Per-server callback used by nfs_reap_expired_delegations(): for each
+ * delegation on @server flagged NFS_DELEGATION_TEST_EXPIRED, clear the flag
+ * and test its stateid via nfs_delegation_test_free_expired().
+ * @data is unused.
+ *
+ * Returns 0 once the list has been walked, or -EAGAIN if
+ * nfs4_server_rebooted() reports true after a test; in that case the
+ * delegation of the inode just tested is marked for testing again.
+ */
+static int nfs_server_reap_expired_delegations(struct nfs_server *server,
+ void __always_unused *data)
+{
+ struct nfs_delegation *delegation;
+ struct inode *inode;
+ const struct cred *cred;
+ nfs4_stateid stateid;
+restart:
+ rcu_read_lock();
+restart_locked:
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ /* Skip entries being freed or returned, and those not marked for test. */
+ if (test_bit(NFS_DELEGATION_INODE_FREEING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_TEST_EXPIRED,
+ &delegation->flags) == 0)
+ continue;
+ inode = nfs_delegation_grab_inode(delegation);
+ /* No inode could be grabbed: rescan while still holding the RCU lock. */
+ if (inode == NULL)
+ goto restart_locked;
+ /* Snapshot credential and stateid under the delegation lock. */
+ spin_lock(&delegation->lock);
+ cred = get_cred_rcu(delegation->cred);
+ nfs4_stateid_copy(&stateid, &delegation->stateid);
+ spin_unlock(&delegation->lock);
+ clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+ rcu_read_unlock();
+ /* From here on only the local copies (stateid, cred) are used. */
+ nfs_delegation_test_free_expired(inode, &stateid, cred);
+ put_cred(cred);
+ if (!nfs4_server_rebooted(server->nfs_client)) {
+ iput(inode);
+ cond_resched();
+ /* The RCU lock was dropped above, so the walk starts over. */
+ goto restart;
+ }
+ nfs_inode_mark_test_expired_delegation(server,inode);
+ iput(inode);
+ return -EAGAIN;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+/**
+ * nfs_reap_expired_delegations - reap expired delegations
+ * @clp: nfs_client to process
+ *
+ * Iterates through all the delegations associated with each server of this
+ * client and checks if they may have been revoked. This function is usually
+ * expected to be called in cases where the server may have lost its
+ * lease.
+ */
+void nfs_reap_expired_delegations(struct nfs_client *clp)
+{
+ nfs_client_for_each_server(clp, nfs_server_reap_expired_delegations,
+ NULL);
+}
+
+/*
+ * If @inode holds a non-revoked delegation for which
+ * nfs4_stateid_match_or_older() accepts @stateid, mark that delegation for
+ * an expiry test and schedule the client's state manager. The state manager
+ * is scheduled only after the RCU read lock has been released.
+ */
+void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
+		const nfs4_stateid *stateid)
+{
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+	struct nfs_delegation *deleg;
+	bool marked = false;
+
+	rcu_read_lock();
+	deleg = rcu_dereference(NFS_I(inode)->delegation);
+	if (deleg == NULL)
+		goto unlock;
+	if (!nfs4_stateid_match_or_older(&deleg->stateid, stateid))
+		goto unlock;
+	if (test_bit(NFS_DELEGATION_REVOKED, &deleg->flags))
+		goto unlock;
+	nfs_mark_test_expired_delegation(NFS_SERVER(inode), deleg);
+	marked = true;
+unlock:
+	rcu_read_unlock();
+	if (marked)
+		nfs4_schedule_state_manager(clp);
+}
+
+/**
+ * nfs_delegations_present - check for existence of delegations
+ * @clp: client state handle
+ *
+ * Returns one if at least one server attached to this nfs_client has a
+ * non-empty delegation list, and zero otherwise.
+ */
+int nfs_delegations_present(struct nfs_client *clp)
+{
+	struct nfs_server *srv;
+	int present = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(srv, &clp->cl_superblocks, client_link) {
+		if (list_empty(&srv->delegations))
+			continue;
+		/* One non-empty list is enough; stop looking. */
+		present = 1;
+		break;
+	}
+	rcu_read_unlock();
+	return present;
+}
+
+/**
+ * nfs4_refresh_delegation_stateid - Update delegation stateid seqid
+ * @dst: stateid to refresh
+ * @inode: inode to check
+ *
+ * Returns "true" and updates "dst->seqid" if inode had a non-revoked
+ * delegation whose stateid matches "dst" in its "other" field and is newer
+ * than "dst". Otherwise, including when @inode is NULL, "false" is returned.
+ */
+bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
+{
+	struct nfs_delegation *deleg;
+	bool refreshed = false;
+
+	if (inode == NULL)
+		return false;
+
+	rcu_read_lock();
+	deleg = rcu_dereference(NFS_I(inode)->delegation);
+	if (deleg == NULL)
+		goto unlock;
+	if (!nfs4_stateid_match_other(dst, &deleg->stateid))
+		goto unlock;
+	if (!nfs4_stateid_is_newer(&deleg->stateid, dst))
+		goto unlock;
+	if (test_bit(NFS_DELEGATION_REVOKED, &deleg->flags))
+		goto unlock;
+	dst->seqid = deleg->stateid.seqid;
+	refreshed = true;
+unlock:
+	rcu_read_unlock();
+	return refreshed;
+}
+
+/**
+ * nfs4_copy_delegation_stateid - Copy inode's state ID information
+ * @inode: inode to check
+ * @flags: delegation type requirement
+ * @dst: stateid data structure to fill in
+ * @cred: optional argument to retrieve credential; when non-NULL and a
+ *        valid delegation is found, it receives the result of get_cred()
+ *        on the delegation's credential
+ *
+ * Returns "true" and fills in "dst" if inode had a delegation that
+ * nfs4_is_valid_delegation() accepts for @flags, otherwise "false" is
+ * returned. On success the delegation is also marked as referenced.
+ */
+bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
+ nfs4_stateid *dst, const struct cred **cred)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_delegation *delegation;
+ bool ret = false;
+
+ /* Only the read/write mode bits take part in the validity check. */
+ flags &= FMODE_READ|FMODE_WRITE;
+ rcu_read_lock();
+ delegation = rcu_dereference(nfsi->delegation);
+ if (!delegation)
+ goto out;
+ /* Stateid and credential are read under the delegation spinlock. */
+ spin_lock(&delegation->lock);
+ ret = nfs4_is_valid_delegation(delegation, flags);
+ if (ret) {
+ nfs4_stateid_copy(dst, &delegation->stateid);
+ nfs_mark_delegation_referenced(delegation);
+ if (cred)
+ *cred = get_cred(delegation->cred);
+ }
+ spin_unlock(&delegation->lock);
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * nfs4_delegation_flush_on_close - Check if we must flush file on close
+ * @inode: inode to check
+ *
+ * Compares the inode's count of outstanding requests with the delegation's
+ * page modification limit. Returns "false" only when the inode holds a
+ * write delegation and the count is below that limit; in every other case
+ * "true" is returned and the file must be flushed on close.
+ */
+bool nfs4_delegation_flush_on_close(const struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_delegation *deleg;
+	bool must_flush = true;
+
+	rcu_read_lock();
+	deleg = rcu_dereference(nfsi->delegation);
+	if (deleg != NULL && (deleg->type & FMODE_WRITE) &&
+	    atomic_long_read(&nfsi->nrequests) < deleg->pagemod_limit)
+		must_flush = false;
+	rcu_read_unlock();
+	return must_flush;
+}
+
+/* Expose nfs_delegation_watermark as the "delegation_watermark" module parameter (uint, mode 0644). */
+module_param_named(delegation_watermark, nfs_delegation_watermark, uint, 0644);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
new file mode 100644
index 000000000..26f57a99d
--- /dev/null
+++ b/fs/nfs/delegation.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * linux/fs/nfs/delegation.h
+ *
+ * Copyright (c) Trond Myklebust
+ *
+ * Definitions pertaining to NFS delegated files
+ */
+#ifndef FS_NFS_DELEGATION_H
+#define FS_NFS_DELEGATION_H
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+/*
+ * NFSv4 delegation
+ *
+ * One instance describes a delegation held on an inode. Instances are linked
+ * on a server's delegation list through super_list.
+ */
+struct nfs_delegation {
+ struct list_head super_list; /* link in nfs_server->delegations */
+ const struct cred *cred; /* credential associated with the delegation */
+ struct inode *inode; /* inode the delegation belongs to */
+ nfs4_stateid stateid; /* delegation stateid */
+ fmode_t type; /* FMODE_* bits, e.g. FMODE_WRITE for a write delegation */
+ unsigned long pagemod_limit; /* compared with the inode's request count on close */
+ __u64 change_attr;
+ unsigned long flags; /* NFS_DELEGATION_* bit numbers, see enum below */
+ refcount_t refcount;
+ spinlock_t lock; /* held while reading cred and stateid */
+ struct rcu_head rcu; /* presumably for RCU-deferred freeing -- TODO confirm */
+};
+
+/*
+ * Bit numbers for nfs_delegation.flags; they are used with the atomic
+ * test_bit()/set_bit()/clear_bit() helpers.
+ */
+enum {
+ NFS_DELEGATION_NEED_RECLAIM = 0,
+ NFS_DELEGATION_RETURN,
+ NFS_DELEGATION_RETURN_IF_CLOSED,
+ NFS_DELEGATION_REFERENCED,
+ NFS_DELEGATION_RETURNING,
+ NFS_DELEGATION_REVOKED,
+ NFS_DELEGATION_TEST_EXPIRED,
+ NFS_DELEGATION_INODE_FREEING,
+ NFS_DELEGATION_RETURN_DELAYED,
+};
+
+int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
+ fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit);
+void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
+ fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit);
+int nfs4_inode_return_delegation(struct inode *inode);
+void nfs4_inode_return_delegation_on_close(struct inode *inode);
+int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
+void nfs_inode_evict_delegation(struct inode *inode);
+
+struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
+void nfs_server_return_all_delegations(struct nfs_server *);
+void nfs_expire_all_delegations(struct nfs_client *clp);
+void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags);
+void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
+int nfs_client_return_marked_delegations(struct nfs_client *clp);
+int nfs_delegations_present(struct nfs_client *clp);
+void nfs_remove_bad_delegation(struct inode *inode, const nfs4_stateid *stateid);
+void nfs_delegation_mark_returned(struct inode *inode, const nfs4_stateid *stateid);
+
+void nfs_delegation_mark_reclaim(struct nfs_client *clp);
+void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
+
+void nfs_mark_test_expired_all_delegations(struct nfs_client *clp);
+void nfs_test_expired_all_delegations(struct nfs_client *clp);
+void nfs_reap_expired_delegations(struct nfs_client *clp);
+
+/* NFSv4 delegation-related procedures */
+int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync);
+int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
+int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
+bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, const struct cred **cred);
+bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
+
+struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode);
+void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
+int nfs4_have_delegation(struct inode *inode, fmode_t flags);
+int nfs4_check_delegation(struct inode *inode, fmode_t flags);
+bool nfs4_delegation_flush_on_close(const struct inode *inode);
+void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
+ const nfs4_stateid *stateid);
+int nfs4_inode_make_writeable(struct inode *inode);
+
+#endif
+
+/*
+ * Returns 1 when the protocol reports a read delegation on @inode and the
+ * inode is not flagged NFS_INO_REVAL_FORCED; returns 0 otherwise.
+ */
+static inline int nfs_have_delegated_attributes(struct inode *inode)
+{
+	if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+		return 0;
+	return !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
+}
+
+#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
new file mode 100644
index 000000000..9f88ca7b2
--- /dev/null
+++ b/fs/nfs/dir.c
@@ -0,0 +1,2820 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/dir.c
+ *
+ * Copyright (C) 1992 Rick Sladkey
+ *
+ * nfs directory handling functions
+ *
+ * 10 Apr 1996 Added silly rename for unlink --okir
+ * 28 Sep 1996 Improved directory cache --okir
+ * 23 Aug 1997 Claus Heine claus@momo.math.rwth-aachen.de
+ * Re-implemented silly rename for unlink, newly implemented
+ * silly rename for nfs_rename() following the suggestions
+ * of Olaf Kirch (okir) found in this file.
+ * Following Linus comments on my original hack, this version
+ * depends only on the dcache stuff and doesn't touch the inode
+ * layer (iput() and friends).
+ * 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM
+ */
+
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/swap.h>
+#include <linux/sched.h>
+#include <linux/kmemleak.h>
+#include <linux/xattr.h>
+
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+#include "fscache.h"
+
+#include "nfstrace.h"
+
+/* #define NFS_DEBUG_VERBOSE 1 */
+
+static int nfs_opendir(struct inode *, struct file *);
+static int nfs_closedir(struct inode *, struct file *);
+static int nfs_readdir(struct file *, struct dir_context *);
+static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
+static loff_t nfs_llseek_dir(struct file *, loff_t, int);
+static void nfs_readdir_clear_array(struct page*);
+
+/*
+ * File operations table for directories: routes llseek, readdir
+ * (iterate_shared), open, release and fsync to the handlers declared above;
+ * .read uses generic_read_dir.
+ */
+const struct file_operations nfs_dir_operations = {
+ .llseek = nfs_llseek_dir,
+ .read = generic_read_dir,
+ .iterate_shared = nfs_readdir,
+ .open = nfs_opendir,
+ .release = nfs_closedir,
+ .fsync = nfs_fsync_dir,
+};
+
+/*
+ * Address space operations for directory page cache pages: .freepage
+ * releases the name strings stored in a readdir cache array page.
+ */
+const struct address_space_operations nfs_dir_aops = {
+ .freepage = nfs_readdir_clear_array,
+};
+
+/*
+ * Allocate and initialise the per-open directory context for @dir, taking a
+ * reference on @cred, and add it to the inode's list of open files.
+ *
+ * If this is the first open context on the inode and a data invalidation
+ * was deferred (NFS_INO_DATA_INVAL_DEFER), the cached data is flagged
+ * invalid with forced revalidation.
+ *
+ * Returns the new context, or ERR_PTR(-ENOMEM) if allocation fails.
+ */
+static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, const struct cred *cred)
+{
+	struct nfs_inode *nfsi = NFS_I(dir);
+	struct nfs_open_dir_context *ctx;
+
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (ctx == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	ctx->duped = 0;
+	ctx->attr_gencount = nfsi->attr_gencount;
+	ctx->dir_cookie = 0;
+	ctx->dup_cookie = 0;
+	ctx->cred = get_cred(cred);
+
+	/* The open_files list and cache_validity are updated under i_lock. */
+	spin_lock(&dir->i_lock);
+	if (list_empty(&nfsi->open_files) &&
+	    (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
+		nfsi->cache_validity |= NFS_INO_INVALID_DATA |
+			NFS_INO_REVAL_FORCED;
+	list_add(&ctx->list, &nfsi->open_files);
+	spin_unlock(&dir->i_lock);
+
+	return ctx;
+}
+
+/*
+ * Tear down an open directory context: unlink it from the inode's open_files
+ * list under i_lock, drop the credential reference and free the context.
+ */
+static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx)
+{
+ spin_lock(&dir->i_lock);
+ list_del(&ctx->list);
+ spin_unlock(&dir->i_lock);
+ put_cred(ctx->cred);
+ kfree(ctx);
+}
+
+/*
+ * Open a directory: count the open in the I/O statistics, allocate an open
+ * directory context for the current credentials and store it in
+ * filp->private_data. Returns 0, or the negative errno from the allocation.
+ */
+static int
+nfs_opendir(struct inode *inode, struct file *filp)
+{
+	struct nfs_open_dir_context *ctx;
+
+	dfprintk(FILE, "NFS: open dir(%pD2)\n", filp);
+
+	nfs_inc_stats(inode, NFSIOS_VFSOPEN);
+
+	ctx = alloc_nfs_open_dir_context(inode, current_cred());
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	filp->private_data = ctx;
+	return 0;
+}
+
+/*
+ * Release a directory file: free the open directory context stored in
+ * filp->private_data. Always returns 0.
+ */
+static int
+nfs_closedir(struct inode *inode, struct file *filp)
+{
+ put_nfs_open_dir_context(file_inode(filp), filp->private_data);
+ return 0;
+}
+
+/* One cached directory entry inside a readdir cache page. */
+struct nfs_cache_array_entry {
+ u64 cookie; /* filled from the decoded entry's prev_cookie */
+ u64 ino; /* inode number reported to dir_emit() */
+ struct qstr string; /* entry name; .name is a kmalloc'ed copy */
+ unsigned char d_type; /* type reported to dir_emit() */
+};
+
+/* Header of a readdir cache page, followed by the entries themselves. */
+struct nfs_cache_array {
+ int size; /* number of valid entries in array[] */
+ int eof_index; /* -1 until end of directory is seen, then set to size */
+ u64 last_cookie; /* cookie of the most recently added entry */
+ struct nfs_cache_array_entry array[];
+};
+
+/* State carried through one readdir call while searching and filling pages. */
+typedef struct {
+ struct file *file; /* directory file being read */
+ struct page *page; /* cache page currently held, if any */
+ struct dir_context *ctx; /* directory context; ->pos is read and updated */
+ unsigned long page_index; /* page cache index being searched */
+ u64 *dir_cookie; /* cookie being searched for; advanced while emitting */
+ u64 last_cookie; /* cookie passed to the next readdir request */
+ loff_t current_index; /* position of the first entry of the current page */
+ loff_t prev_index; /* position of the last cookie match */
+
+ unsigned long dir_verifier; /* change attribute saved before the readdir call */
+ unsigned long timestamp; /* jiffies sampled before the readdir call */
+ unsigned long gencount; /* attribute generation sampled before the call */
+ unsigned int cache_entry_index; /* index in the page of the first entry to emit */
+ bool plus; /* use readdirplus */
+ bool eof; /* set when the search or the emit loop must stop */
+} nfs_readdir_descriptor_t;
+
+/*
+ * Reset the cache array header at the start of @page: zero it and set
+ * eof_index to -1.
+ */
+static
+void nfs_readdir_init_array(struct page *page)
+{
+	struct nfs_cache_array *hdr = kmap_atomic(page);
+
+	memset(hdr, 0, sizeof(*hdr));
+	hdr->eof_index = -1;
+	kunmap_atomic(hdr);
+}
+
+/*
+ * Free the name strings stored by nfs_readdir_add_to_array() and mark the
+ * array as empty.
+ */
+static
+void nfs_readdir_clear_array(struct page *page)
+{
+	struct nfs_cache_array *array = kmap_atomic(page);
+	int idx;
+
+	for (idx = 0; idx < array->size; idx++) {
+		struct nfs_cache_array_entry *ent = &array->array[idx];
+
+		kfree(ent->string.name);
+	}
+	array->size = 0;
+	kunmap_atomic(array);
+}
+
+/*
+ * Fill @string with a NUL-terminated copy of @name (@len bytes) and its hash.
+ * Returns 0, or -ENOMEM if the copy cannot be allocated.
+ *
+ * the caller is responsible for freeing qstr.name
+ * when called by nfs_readdir_add_to_array, the strings will be freed in
+ * nfs_readdir_clear_array()
+ */
+static
+int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
+{
+ string->len = len;
+ string->name = kmemdup_nul(name, len, GFP_KERNEL);
+ if (string->name == NULL)
+ return -ENOMEM;
+ /*
+ * Avoid a kmemleak false positive. The pointer to the name is stored
+ * in a page cache page which kmemleak does not scan.
+ */
+ kmemleak_not_leak(string->name);
+ string->hash = full_name_hash(NULL, name, len);
+ return 0;
+}
+
+/*
+ * Append the decoded @entry to the cache array stored in @page.
+ *
+ * Returns 0 on success, -ENOSPC if one more entry would not fit inside the
+ * page, or the error from nfs_readdir_make_qstr(). On success the array's
+ * last_cookie and size are updated, and eof_index is set when the entry is
+ * flagged as the last one.
+ */
+static
+int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
+{
+ struct nfs_cache_array *array = kmap(page);
+ struct nfs_cache_array_entry *cache_entry;
+ int ret;
+
+ cache_entry = &array->array[array->size];
+
+ /* Check that this entry lies within the page bounds */
+ ret = -ENOSPC;
+ if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE)
+ goto out;
+
+ /* The slot stores the cookie that precedes this entry. */
+ cache_entry->cookie = entry->prev_cookie;
+ cache_entry->ino = entry->ino;
+ cache_entry->d_type = entry->d_type;
+ ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
+ if (ret)
+ goto out;
+ /* Only count the entry once its name has been copied. */
+ array->last_cookie = entry->cookie;
+ array->size++;
+ if (entry->eof != 0)
+ array->eof_index = array->size;
+out:
+ kunmap(page);
+ return ret;
+}
+
+/*
+ * Non-zero when the caller should be treated as using a 32-bit API: with
+ * CONFIG_COMPAT this is the result of in_compat_syscall(), otherwise it is
+ * whether the kernel itself has 32-bit longs.
+ */
+static inline
+int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+ return in_compat_syscall();
+#else
+ return (BITS_PER_LONG == 32);
+#endif
+}
+
+/*
+ * Decide whether the directory position exposed through @filp is the readdir
+ * cookie itself (true) or a plain entry index (false).
+ *
+ * FMODE_32BITHASH forces an index, FMODE_64BITHASH forces the cookie, and
+ * with neither flag set the choice follows is_32bit_api().
+ */
+static
+bool nfs_readdir_use_cookie(const struct file *filp)
+{
+	if (filp->f_mode & FMODE_32BITHASH)
+		return false;
+	if (filp->f_mode & FMODE_64BITHASH)
+		return true;
+	return !is_32bit_api();
+}
+
+/*
+ * Locate the entry at position desc->ctx->pos in @array, whose first entry
+ * sits at position desc->current_index.
+ *
+ * Returns 0 and sets *desc->dir_cookie and desc->cache_entry_index when the
+ * position falls inside this array; -EAGAIN when it lies beyond this array
+ * and the array does not hold the end of the directory; otherwise sets
+ * desc->eof and returns -EBADCOOKIE.
+ */
+static
+int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+{
+ loff_t diff = desc->ctx->pos - desc->current_index;
+ unsigned int index;
+
+ /* A position before the start of this page cannot be found here. */
+ if (diff < 0)
+ goto out_eof;
+ if (diff >= array->size) {
+ if (array->eof_index >= 0)
+ goto out_eof;
+ return -EAGAIN;
+ }
+
+ index = (unsigned int)diff;
+ *desc->dir_cookie = array->array[index].cookie;
+ desc->cache_entry_index = index;
+ return 0;
+out_eof:
+ desc->eof = true;
+ return -EBADCOOKIE;
+}
+
+/*
+ * Returns true when neither the attributes nor the data of the inode are
+ * flagged invalid and no invalidation is in progress.
+ */
+static bool
+nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi)
+{
+ if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
+ return false;
+ /* Read barrier between the cache_validity check and the flag test. */
+ smp_rmb();
+ return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
+}
+
+/*
+ * Search the mapped cache array for the entry whose cookie equals
+ * *desc->dir_cookie.
+ *
+ * On a match, desc->ctx->pos, desc->prev_index and desc->cache_entry_index
+ * are updated and 0 is returned. A match at a position lower than
+ * desc->prev_index is recorded in the open context as a possible duplicate
+ * cookie; meeting the same cookie again while ctx->duped is positive is
+ * reported as a readdir loop and fails with -ELOOP.
+ *
+ * Without a match, returns -EBADCOOKIE if the array holds the end of the
+ * directory (also setting desc->eof when the cookie equals
+ * array->last_cookie), and -EAGAIN otherwise.
+ */
+static
+int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+{
+	int i;
+	loff_t new_pos;
+	int status = -EAGAIN;
+
+	for (i = 0; i < array->size; i++) {
+		if (array->array[i].cookie == *desc->dir_cookie) {
+			struct nfs_inode *nfsi = NFS_I(file_inode(desc->file));
+			struct nfs_open_dir_context *ctx = desc->file->private_data;
+
+			new_pos = desc->current_index + i;
+			if (ctx->attr_gencount != nfsi->attr_gencount ||
+			    !nfs_readdir_inode_mapping_valid(nfsi)) {
+				/* The directory changed: forget loop tracking. */
+				ctx->duped = 0;
+				ctx->attr_gencount = nfsi->attr_gencount;
+			} else if (new_pos < desc->prev_index) {
+				if (ctx->duped > 0
+				    && ctx->dup_cookie == *desc->dir_cookie) {
+					if (printk_ratelimit()) {
+						/* Keep a space after "loop." so the sentences do not run together. */
+						pr_notice("NFS: directory %pD2 contains a readdir loop. "
+							"Please contact your server vendor. "
+							"The file: %.*s has duplicate cookie %llu\n",
+							desc->file, array->array[i].string.len,
+							array->array[i].string.name, *desc->dir_cookie);
+					}
+					status = -ELOOP;
+					goto out;
+				}
+				/* First backwards match: remember the cookie. */
+				ctx->dup_cookie = *desc->dir_cookie;
+				ctx->duped = -1;
+			}
+			if (nfs_readdir_use_cookie(desc->file))
+				desc->ctx->pos = *desc->dir_cookie;
+			else
+				desc->ctx->pos = new_pos;
+			desc->prev_index = new_pos;
+			desc->cache_entry_index = i;
+			return 0;
+		}
+	}
+	if (array->eof_index >= 0) {
+		status = -EBADCOOKIE;
+		if (*desc->dir_cookie == array->last_cookie)
+			desc->eof = true;
+	}
+out:
+	return status;
+}
+
+/*
+ * Search the cache page held in desc->page, by position when
+ * *desc->dir_cookie is 0 and by cookie otherwise.
+ *
+ * On -EAGAIN the descriptor is advanced to the next page: last_cookie is
+ * taken from this array, current_index grows by the number of entries and
+ * page_index is incremented. Returns the status of the search helper.
+ */
+static
+int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
+{
+ struct nfs_cache_array *array;
+ int status;
+
+ array = kmap(desc->page);
+
+ if (*desc->dir_cookie == 0)
+ status = nfs_readdir_search_for_pos(array, desc);
+ else
+ status = nfs_readdir_search_for_cookie(array, desc);
+
+ if (status == -EAGAIN) {
+ desc->last_cookie = array->last_cookie;
+ desc->current_index += array->size;
+ desc->page_index++;
+ }
+ kunmap(desc->page);
+ return status;
+}
+
+/*
+ * Fill a page with xdr information before transferring to the cache page
+ *
+ * Issues the protocol's readdir call starting at entry->cookie, using the
+ * credential of the open directory context. If readdirplus was requested and
+ * the call fails with -ENOTSUPP, the readdirplus capability is cleared and
+ * the call is retried without it.
+ *
+ * Returns the result of the readdir call; a negative value is an error. On
+ * success desc->timestamp and desc->gencount record the values sampled just
+ * before the call.
+ */
+static
+int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
+ struct nfs_entry *entry, struct file *file, struct inode *inode)
+{
+ struct nfs_open_dir_context *ctx = file->private_data;
+ const struct cred *cred = ctx->cred;
+ unsigned long timestamp, gencount;
+ int error;
+
+ again:
+ timestamp = jiffies;
+ gencount = nfs_inc_attr_generation_counter();
+ desc->dir_verifier = nfs_save_change_attribute(inode);
+ error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages,
+ NFS_SERVER(inode)->dtsize, desc->plus);
+ if (error < 0) {
+ /* We requested READDIRPLUS, but the server doesn't grok it */
+ if (error == -ENOTSUPP && desc->plus) {
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
+ clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
+ desc->plus = false;
+ goto again;
+ }
+ goto error;
+ }
+ desc->timestamp = timestamp;
+ desc->gencount = gencount;
+error:
+ return error;
+}
+
+/*
+ * Decode the next directory entry from @xdr into @entry with the protocol's
+ * decode_dirent operation, then stamp the entry's attributes with the
+ * timestamp and generation count saved in @desc.
+ * Returns 0, or the error from decode_dirent.
+ */
+static int xdr_decode(nfs_readdir_descriptor_t *desc,
+ struct nfs_entry *entry, struct xdr_stream *xdr)
+{
+ struct inode *inode = file_inode(desc->file);
+ int error;
+
+ error = NFS_PROTO(inode)->decode_dirent(xdr, entry, desc->plus);
+ if (error)
+ return error;
+ entry->fattr->time_start = desc->timestamp;
+ entry->fattr->gencount = desc->gencount;
+ return 0;
+}
+
+/*
+ * Decide whether @dentry refers to the same file as the directory @entry.
+ *
+ * A negative dentry, a bad inode or a stale inode never matches. Otherwise
+ * the fileids must be equal and, when the entry carries a filehandle, the
+ * filehandles must compare equal as well. Returns 1 on a match, else 0.
+ *
+ * Note: caller is responsible for checking the fsid
+ */
+static
+int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
+{
+	struct nfs_inode *nfsi;
+	struct inode *inode;
+
+	if (d_really_is_negative(dentry))
+		return 0;
+
+	inode = d_inode(dentry);
+	if (is_bad_inode(inode) || NFS_STALE(inode))
+		return 0;
+
+	nfsi = NFS_I(inode);
+	return entry->fattr->fileid == nfsi->fileid &&
+	       (!entry->fh->size ||
+		nfs_compare_fh(entry->fh, &nfsi->fh) == 0);
+}
+
+/*
+ * Decide whether this readdir should use readdirplus: the server must be
+ * capable of it, and either the advise flag was set on the directory (it is
+ * cleared by this test) or reading starts at position 0.
+ */
+static
+bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
+{
+	if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
+		return false;
+	return test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags) ||
+	       ctx->pos == 0;
+}
+
+/*
+ * Called by the lookup and getattr code to ask that future readdir calls on
+ * this directory use readdirplus, which speeds up later lookups in it.
+ * The hint is only recorded when the server supports readdirplus and the
+ * directory has at least one open context.
+ */
+void nfs_advise_use_readdirplus(struct inode *dir)
+{
+	struct nfs_inode *nfsi = NFS_I(dir);
+
+	if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
+		return;
+	if (list_empty(&nfsi->open_files))
+		return;
+	set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
+}
+
+/*
+ * Mainly for use by nfs_getattr().
+ *
+ * For an 'ls -l' style workload we want readdirplus to be used. When the
+ * server supports it and the directory has an open context, set the advise
+ * flag and drop the cached directory pages that follow the page recorded
+ * in nfsi->page_index.
+ */
+void nfs_force_use_readdirplus(struct inode *dir)
+{
+	struct nfs_inode *nfsi = NFS_I(dir);
+
+	if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
+		return;
+	if (list_empty(&nfsi->open_files))
+		return;
+
+	set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
+	invalidate_mapping_pages(dir->i_mapping,
+				 nfsi->page_index + 1, -1);
+}
+
+/*
+ * Use a decoded readdirplus @entry to create or refresh the dentry named
+ * entry->name under @parent, tagging it with @dir_verifier.
+ *
+ * Nothing is done unless the entry's attributes carry both a fileid and an
+ * fsid, and the name is non-empty, free of '\0' and '/', and is neither "."
+ * nor "..". Failures are silent: the function simply returns.
+ */
+static
+void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry,
+ unsigned long dir_verifier)
+{
+ struct qstr filename = QSTR_INIT(entry->name, entry->len);
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ struct dentry *dentry;
+ struct dentry *alias;
+ struct inode *inode;
+ int status;
+
+ if (!(entry->fattr->valid & NFS_ATTR_FATTR_FILEID))
+ return;
+ if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID))
+ return;
+ if (filename.len == 0)
+ return;
+ /* Validate that the name doesn't contain any illegal '\0' */
+ if (strnlen(filename.name, filename.len) != filename.len)
+ return;
+ /* ...or '/' */
+ if (strnchr(filename.name, filename.len, '/'))
+ return;
+ /* Skip "." and ".." */
+ if (filename.name[0] == '.') {
+ if (filename.len == 1)
+ return;
+ if (filename.len == 2 && filename.name[1] == '.')
+ return;
+ }
+ filename.hash = full_name_hash(parent, filename.name, filename.len);
+
+ dentry = d_lookup(parent, &filename);
+again:
+ /* No cached dentry: allocate one that is marked as being looked up. */
+ if (!dentry) {
+ dentry = d_alloc_parallel(parent, &filename, &wq);
+ if (IS_ERR(dentry))
+ return;
+ }
+ /* An already-instantiated dentry was found. */
+ if (!d_in_lookup(dentry)) {
+ /* Is there a mountpoint here? If so, just exit */
+ if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid,
+ &entry->fattr->fsid))
+ goto out;
+ if (nfs_same_file(dentry, entry)) {
+ /* Without a filehandle there is nothing to refresh with. */
+ if (!entry->fh->size)
+ goto out;
+ nfs_set_verifier(dentry, dir_verifier);
+ status = nfs_refresh_inode(d_inode(dentry), entry->fattr);
+ if (!status)
+ nfs_setsecurity(d_inode(dentry), entry->fattr, entry->label);
+ goto out;
+ } else {
+ /* The name now refers to a different file: drop and retry. */
+ d_invalidate(dentry);
+ dput(dentry);
+ dentry = NULL;
+ goto again;
+ }
+ }
+ /* New dentry but no filehandle: finish the lookup without an inode. */
+ if (!entry->fh->size) {
+ d_lookup_done(dentry);
+ goto out;
+ }
+
+ inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
+ alias = d_splice_alias(inode, dentry);
+ d_lookup_done(dentry);
+ /* If splicing returned another dentry, continue with that one. */
+ if (alias) {
+ if (IS_ERR(alias))
+ goto out;
+ dput(dentry);
+ dentry = alias;
+ }
+ nfs_set_verifier(dentry, dir_verifier);
+out:
+ dput(dentry);
+}
+
+/*
+ * Perform conversion from xdr to cache array
+ *
+ * Decodes entries from the @buflen bytes held in @xdr_pages and appends them
+ * to the cache array in @page; with readdirplus each entry also primes the
+ * dcache. Decoding stops at the first error, when an entry cannot be added
+ * to the array, or after the entry flagged as last.
+ *
+ * Returns 0 or a negative errno (-ENOMEM if the scratch page cannot be
+ * allocated). -EAGAIN from the decoder is turned into 0. When no entry was
+ * decoded, or the status is -EBADCOOKIE with entry->eof set, the array is
+ * marked as ending at its current size and 0 is returned.
+ */
+static
+int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
+ struct page **xdr_pages, struct page *page, unsigned int buflen)
+{
+ struct xdr_stream stream;
+ struct xdr_buf buf;
+ struct page *scratch;
+ struct nfs_cache_array *array;
+ unsigned int count = 0;
+ int status;
+
+ scratch = alloc_page(GFP_KERNEL);
+ if (scratch == NULL)
+ return -ENOMEM;
+
+ /* status is still unset here; count == 0 guards the test below. */
+ if (buflen == 0)
+ goto out_nopages;
+
+ xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
+ xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+ do {
+ /* Reset the label length before each decode. */
+ if (entry->label)
+ entry->label->len = NFS4_MAXLABELLEN;
+
+ status = xdr_decode(desc, entry, &stream);
+ if (status != 0) {
+ if (status == -EAGAIN)
+ status = 0;
+ break;
+ }
+
+ count++;
+
+ if (desc->plus)
+ nfs_prime_dcache(file_dentry(desc->file), entry,
+ desc->dir_verifier);
+
+ status = nfs_readdir_add_to_array(entry, page);
+ if (status != 0)
+ break;
+ } while (!entry->eof);
+
+out_nopages:
+ if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
+ array = kmap(page);
+ array->eof_index = array->size;
+ status = 0;
+ kunmap(page);
+ }
+
+ put_page(scratch);
+ return status;
+}
+
+/* Drop one reference on each of the first @npages entries of @pages. */
+static
+void nfs_readdir_free_pages(struct page **pages, unsigned int npages)
+{
+	unsigned int idx = 0;
+
+	while (idx < npages) {
+		put_page(pages[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Allocate @npages pages into @pages. On success the caller must release
+ * them with nfs_readdir_free_pages(). If an allocation fails, the pages
+ * obtained so far are released here and -ENOMEM is returned.
+ */
+static
+int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages)
+{
+	unsigned int done;
+
+	for (done = 0; done < npages; done++) {
+		struct page *fresh = alloc_page(GFP_KERNEL);
+
+		if (fresh == NULL) {
+			nfs_readdir_free_pages(pages, done);
+			return -ENOMEM;
+		}
+		pages[done] = fresh;
+	}
+	return 0;
+}
+
+/*
+ * Fill the cache array in @page with directory entries fetched from the
+ * server, starting at desc->last_cookie.
+ *
+ * Readdir requests are issued and decoded until the array records the end
+ * of the directory, the cache page is full (-ENOSPC from the page filler,
+ * reported as 0), or an error occurs. Returns 0 or a negative errno.
+ */
+static
+int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
+{
+ struct page *pages[NFS_MAX_READDIR_PAGES];
+ struct nfs_entry entry;
+ struct file *file = desc->file;
+ struct nfs_cache_array *array;
+ int status = -ENOMEM;
+ unsigned int array_size = ARRAY_SIZE(pages);
+
+ nfs_readdir_init_array(page);
+
+ entry.prev_cookie = 0;
+ entry.cookie = desc->last_cookie;
+ entry.eof = 0;
+ entry.fh = nfs_alloc_fhandle();
+ entry.fattr = nfs_alloc_fattr();
+ entry.server = NFS_SERVER(inode);
+ /* status is still -ENOMEM if either allocation failed. */
+ if (entry.fh == NULL || entry.fattr == NULL)
+ goto out;
+
+ entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
+ if (IS_ERR(entry.label)) {
+ status = PTR_ERR(entry.label);
+ goto out;
+ }
+
+ /* Keep the cache page mapped so eof_index can be polled in the loop. */
+ array = kmap(page);
+
+ status = nfs_readdir_alloc_pages(pages, array_size);
+ if (status < 0)
+ goto out_release_array;
+ do {
+ unsigned int pglen;
+ status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
+
+ if (status < 0)
+ break;
+ /* A non-negative result is used as the length of the reply data. */
+ pglen = status;
+ status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
+ if (status < 0) {
+ /* A full cache page is not an error. */
+ if (status == -ENOSPC)
+ status = 0;
+ break;
+ }
+ } while (array->eof_index < 0);
+
+ nfs_readdir_free_pages(pages, array_size);
+out_release_array:
+ kunmap(page);
+ nfs4_label_free(entry.label);
+out:
+ nfs_free_fattr(entry.fattr);
+ nfs_free_fhandle(entry.fh);
+ return status;
+}
+
+/*
+ * Now we cache directories properly, by converting xdr information
+ * to an array that can be used for lookups later. This results in
+ * fewer cache pages, since we can store more information on each page.
+ * We only need to convert from xdr once so future lookups are much simpler
+ *
+ * Page cache filler passed to read_cache_page(); @data is the readdir
+ * descriptor. On success the page is marked up to date, all following pages
+ * of the mapping are invalidated, and the page is unlocked. On failure the
+ * array contents are released, the page is unlocked and the error returned.
+ */
+static
+int nfs_readdir_filler(void *data, struct page* page)
+{
+ nfs_readdir_descriptor_t *desc = data;
+ struct inode *inode = file_inode(desc->file);
+ int ret;
+
+ ret = nfs_readdir_xdr_to_array(desc, page, inode);
+ if (ret < 0)
+ goto error;
+ SetPageUptodate(page);
+
+ if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
+ /* Should never happen */
+ nfs_zap_mapping(inode, inode->i_mapping);
+ }
+ unlock_page(page);
+ return 0;
+ error:
+ nfs_readdir_clear_array(page);
+ unlock_page(page);
+ return ret;
+}
+
+/* Drop the reference on desc->page and clear the pointer. */
+static
+void cache_page_release(nfs_readdir_descriptor_t *desc)
+{
+ put_page(desc->page);
+ desc->page = NULL;
+}
+
+/*
+ * Get the page at desc->page_index from the directory's mapping through
+ * read_cache_page(), with nfs_readdir_filler() as the filler.
+ * The result may be an error pointer; callers check it with IS_ERR().
+ */
+static
+struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
+{
+ return read_cache_page(desc->file->f_mapping, desc->page_index,
+ nfs_readdir_filler, desc);
+}
+
+/*
+ * Returns 0 if desc->dir_cookie was found on page desc->page_index
+ * and locks the page to prevent removal from the page cache.
+ *
+ * On success the page stays locked and referenced in desc->page and
+ * nfsi->page_index records its index. On any other result the page is
+ * unlocked (if it was locked) and released. -EAGAIN is returned when the
+ * page no longer belongs to a mapping or when the search asks for the next
+ * page.
+ */
+static
+int find_and_lock_cache_page(nfs_readdir_descriptor_t *desc)
+{
+ struct inode *inode = file_inode(desc->file);
+ struct nfs_inode *nfsi = NFS_I(inode);
+ int res;
+
+ desc->page = get_cache_page(desc);
+ if (IS_ERR(desc->page))
+ return PTR_ERR(desc->page);
+ res = lock_page_killable(desc->page);
+ if (res != 0)
+ goto error;
+ res = -EAGAIN;
+ /* A NULL mapping means the page is no longer in the page cache. */
+ if (desc->page->mapping != NULL) {
+ res = nfs_readdir_search_array(desc);
+ if (res == 0) {
+ nfsi->page_index = desc->page_index;
+ return 0;
+ }
+ }
+ unlock_page(desc->page);
+error:
+ cache_page_release(desc);
+ return res;
+}
+
+/*
+ * Search the page cache for desc->dir_cookie.
+ *
+ * When the search starts at page index 0, the descriptor's index and
+ * cookie tracking is reset first. The page lookup is repeated for as long
+ * as it reports -EAGAIN; any other result is returned to the caller.
+ */
+static inline
+int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
+{
+	if (desc->page_index == 0) {
+		desc->last_cookie = 0;
+		desc->prev_index = 0;
+		desc->current_index = 0;
+	}
+
+	for (;;) {
+		int res = find_and_lock_cache_page(desc);
+
+		if (res != -EAGAIN)
+			return res;
+	}
+}
+
+/*
+ * Once we've found the start of the dirent within a page: fill 'er up...
+ *
+ * Emits the entries of the array stored in desc->page, starting at
+ * desc->cache_entry_index, through dir_emit(). After each accepted entry
+ * *desc->dir_cookie and desc->ctx->pos are advanced.
+ *
+ * desc->eof is set when dir_emit() refuses an entry, or when the array
+ * has a non-negative eof_index.
+ *
+ * As written, 'res' is never modified after initialisation, so this
+ * function always returns 0.
+ */
+static
+int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
+{
+ struct file *file = desc->file;
+ int i = 0;
+ int res = 0;
+ struct nfs_cache_array *array = NULL;
+ struct nfs_open_dir_context *ctx = file->private_data;
+
+ array = kmap(desc->page);
+ for (i = desc->cache_entry_index; i < array->size; i++) {
+ struct nfs_cache_array_entry *ent;
+
+ ent = &array->array[i];
+ /* dir_emit() returned false: stop here and flag eof */
+ if (!dir_emit(desc->ctx, ent->string.name, ent->string.len,
+ nfs_compat_user_ino64(ent->ino), ent->d_type)) {
+ desc->eof = true;
+ break;
+ }
+ /*
+ * The resume cookie is that of the following entry, or the
+ * array's last_cookie once the final entry has been emitted.
+ */
+ if (i < (array->size-1))
+ *desc->dir_cookie = array->array[i+1].cookie;
+ else
+ *desc->dir_cookie = array->last_cookie;
+ /* The position is either the cookie itself or a running entry count */
+ if (nfs_readdir_use_cookie(file))
+ desc->ctx->pos = *desc->dir_cookie;
+ else
+ desc->ctx->pos++;
+ /* Any non-zero 'duped' value becomes 1 once an entry was emitted */
+ if (ctx->duped != 0)
+ ctx->duped = 1;
+ }
+ if (array->eof_index >= 0)
+ desc->eof = true;
+
+ kunmap(desc->page);
+ dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
+ (unsigned long long)*desc->dir_cookie, res);
+ return res;
+}
+
+/*
+ * If we cannot find a cookie in our cache, we suspect that this is
+ * because it points to a deleted file, so we ask the server to return
+ * whatever it thinks is the next entry. We then feed this to filldir.
+ * If all goes well, we should then be able to find our way round the
+ * cache on the next call to readdir_search_pagecache();
+ *
+ * NOTE: we cannot add the anonymous page to the pagecache because
+ * the data it contains might not be page aligned. Besides,
+ * we should already have a complete representation of the
+ * directory in the page cache by the time we get here.
+ *
+ * Returns -ENOMEM if no page could be allocated, a negative value from
+ * nfs_readdir_xdr_to_array(), or otherwise the result of nfs_do_filldir().
+ */
+static inline
+int uncached_readdir(nfs_readdir_descriptor_t *desc)
+{
+ struct page *page = NULL;
+ int status;
+ struct inode *inode = file_inode(desc->file);
+ struct nfs_open_dir_context *ctx = desc->file->private_data;
+
+ dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
+ (unsigned long long)*desc->dir_cookie);
+
+ /* Private page: it is never inserted into the page cache (see above) */
+ page = alloc_page(GFP_HIGHUSER);
+ if (!page) {
+ status = -ENOMEM;
+ goto out;
+ }
+
+ /* Point the descriptor at the private page and at the missing cookie */
+ desc->page_index = 0;
+ desc->last_cookie = *desc->dir_cookie;
+ desc->page = page;
+ ctx->duped = 0;
+
+ status = nfs_readdir_xdr_to_array(desc, page, inode);
+ if (status < 0)
+ goto out_release;
+
+ status = nfs_do_filldir(desc);
+
+ /* Reached on success as well as on failure of the xdr conversion */
+ out_release:
+ nfs_readdir_clear_array(desc->page);
+ cache_page_release(desc);
+ out:
+ dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
+ __func__, status);
+ return status;
+}
+
+/*
+ * Directory read entry point.
+ *
+ * The file offset position represents the dirent entry number. A
+ * last cookie cache takes care of the common case of reading the
+ * whole directory.
+ *
+ * Builds an on-stack readdir descriptor and then loops: find the page
+ * holding the current cookie, emit its entries, release the page, until
+ * desc->eof is set or an error occurs.
+ *
+ * Returns 0 or a negative error; positive intermediate results are
+ * converted to 0 before returning.
+ */
+static int nfs_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct dentry *dentry = file_dentry(file);
+ struct inode *inode = d_inode(dentry);
+ struct nfs_open_dir_context *dir_ctx = file->private_data;
+ nfs_readdir_descriptor_t my_desc = {
+ .file = file,
+ .ctx = ctx,
+ .dir_cookie = &dir_ctx->dir_cookie,
+ .plus = nfs_use_readdirplus(inode, ctx),
+ },
+ *desc = &my_desc;
+ int res = 0;
+
+ dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
+ file, (long long)ctx->pos);
+ nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
+
+ /*
+ * ctx->pos points to the dirent entry number.
+ * *desc->dir_cookie has the cookie for the next entry. We have
+ * to either find the entry with the appropriate number or
+ * revalidate the cookie.
+ */
+ if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
+ res = nfs_revalidate_mapping(inode, file->f_mapping);
+ if (res < 0)
+ goto out;
+
+ do {
+ res = readdir_search_pagecache(desc);
+
+ if (res == -EBADCOOKIE) {
+ res = 0;
+ /* This means either end of directory */
+ if (*desc->dir_cookie && !desc->eof) {
+ /* Or that the server has 'lost' a cookie */
+ res = uncached_readdir(desc);
+ /* On success, re-test eof and search the cache again */
+ if (res == 0)
+ continue;
+ }
+ break;
+ }
+ /*
+ * -ETOOSMALL while using READDIRPLUS: clear the RDPLUS advice,
+ * zap the caches and retry from page 0 with desc->plus off.
+ */
+ if (res == -ETOOSMALL && desc->plus) {
+ clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
+ nfs_zap_caches(inode);
+ desc->page_index = 0;
+ desc->plus = false;
+ desc->eof = false;
+ continue;
+ }
+ if (res < 0)
+ break;
+
+ /* Page found and locked: emit its entries, then unlock and release it */
+ res = nfs_do_filldir(desc);
+ unlock_page(desc->page);
+ cache_page_release(desc);
+ if (res < 0)
+ break;
+ } while (!desc->eof);
+out:
+ /* Positive values are not reported to the caller */
+ if (res > 0)
+ res = 0;
+ dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res);
+ return res;
+}
+
+/*
+ * llseek for NFS directories.
+ *
+ * Only SEEK_SET and SEEK_CUR are accepted; any other @whence, or a
+ * resulting negative offset, yields -EINVAL. SEEK_CUR with an offset of
+ * 0 simply reports the current position.
+ *
+ * When the position actually changes, the open-directory cookie is reset
+ * (to the new offset if nfs_readdir_use_cookie() says cookies are used as
+ * positions, otherwise to 0) and the 'duped' state is cleared.
+ *
+ * Returns the new offset on success.
+ */
+static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
+{
+ struct nfs_open_dir_context *dir_ctx = filp->private_data;
+
+ dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n",
+ filp, offset, whence);
+
+ /* Every path that leaves this switch without returning holds f_lock */
+ switch (whence) {
+ default:
+ return -EINVAL;
+ case SEEK_SET:
+ if (offset < 0)
+ return -EINVAL;
+ spin_lock(&filp->f_lock);
+ break;
+ case SEEK_CUR:
+ /* No movement requested: f_pos is read without taking f_lock */
+ if (offset == 0)
+ return filp->f_pos;
+ spin_lock(&filp->f_lock);
+ offset += filp->f_pos;
+ if (offset < 0) {
+ spin_unlock(&filp->f_lock);
+ return -EINVAL;
+ }
+ }
+ if (offset != filp->f_pos) {
+ filp->f_pos = offset;
+ if (nfs_readdir_use_cookie(filp))
+ dir_ctx->dir_cookie = offset;
+ else
+ dir_ctx->dir_cookie = 0;
+ dir_ctx->duped = 0;
+ }
+ spin_unlock(&filp->f_lock);
+ return offset;
+}
+
+/*
+ * All directory operations under NFS are synchronous, so fsync()
+ * has nothing to flush: it only logs the call and bumps the
+ * NFSIOS_VFSFSYNC statistic. Always returns 0.
+ */
+static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
+			 int datasync)
+{
+	struct inode *inode = file_inode(filp);
+
+	dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync);
+	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
+	return 0;
+}
+
+/**
+ * nfs_force_lookup_revalidate - Mark the directory as having changed
+ * @dir: pointer to directory inode
+ *
+ * Advances the directory's cache_change_attribute. This forces the
+ * revalidation code in nfs_lookup_revalidate() to do a full lookup on
+ * all child dentries of 'dir' whenever a change occurs on the server
+ * that might have invalidated our dcache.
+ *
+ * The attribute moves in steps of two because bit '0' is reserved as a
+ * tag that tells us when a dentry was revalidated while holding a
+ * delegation on its inode.
+ *
+ * The caller should be holding dir->i_lock
+ */
+void nfs_force_lookup_revalidate(struct inode *dir)
+{
+	struct nfs_inode *nfsi = NFS_I(dir);
+
+	nfsi->cache_change_attribute += 2;
+}
+EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
+
+/**
+ * nfs_verify_change_attribute - Detects NFS remote directory changes
+ * @dir: pointer to parent directory inode
+ * @verf: previously saved change attribute
+ *
+ * Return "false" if the verifier, with its tag bit (bit 0) masked off,
+ * does not match the directory's change attribute.
+ * This would usually indicate that the directory contents have changed on
+ * the server, and that any dentries need revalidating.
+ */
+static bool nfs_verify_change_attribute(struct inode *dir, unsigned long verf)
+{
+	unsigned long untagged = verf & ~1UL;
+
+	return untagged == nfs_save_change_attribute(dir);
+}
+
+/* Set the tag bit (bit 0) in the verifier pointed to by @verf. */
+static void nfs_set_verifier_delegated(unsigned long *verf)
+{
+	*verf = *verf | 1UL;
+}
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+/* Clear the tag bit (bit 0) in the verifier pointed to by @verf. */
+static void nfs_unset_verifier_delegated(unsigned long *verf)
+{
+	*verf = *verf & ~1UL;
+}
+#endif /* IS_ENABLED(CONFIG_NFS_V4) */
+
+/* Report whether the tag bit (bit 0) is set in @verf. */
+static bool nfs_test_verifier_delegated(unsigned long verf)
+{
+	return (verf & 1UL) != 0;
+}
+
+/* Report whether the verifier stored in dentry->d_time carries the tag bit. */
+static bool nfs_verifier_is_delegated(struct dentry *dentry)
+{
+	unsigned long verf = dentry->d_time;
+
+	return nfs_test_verifier_delegated(verf);
+}
+
+/*
+ * Store @verf in dentry->d_time as the dentry's verifier.
+ *
+ * Nothing is stored if @verf does not match the parent directory's
+ * current change attribute. If the dentry has an inode for which the
+ * protocol reports a read delegation, the tag bit is set in the stored
+ * value.
+ *
+ * The only caller in this part of the file, nfs_set_verifier(), invokes
+ * this with dentry->d_lock held.
+ */
+static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf)
+{
+ struct inode *inode = d_inode(dentry);
+ struct inode *dir = d_inode(dentry->d_parent);
+
+ /* Stale verifier: leave the dentry's d_time untouched */
+ if (!nfs_verify_change_attribute(dir, verf))
+ return;
+ if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ nfs_set_verifier_delegated(&verf);
+ dentry->d_time = verf;
+}
+
+/**
+ * nfs_set_verifier - save a parent directory verifier in the dentry
+ * @dentry: pointer to dentry
+ * @verf: verifier to save
+ *
+ * Saves the parent directory verifier in @dentry. If the inode has
+ * a delegation, we also tag the dentry as having been revalidated
+ * while holding a delegation so that we know we don't have to
+ * look it up again after a directory change.
+ *
+ * The update is performed under dentry->d_lock, which this function
+ * takes and releases itself.
+ */
+void nfs_set_verifier(struct dentry *dentry, unsigned long verf)
+{
+
+ spin_lock(&dentry->d_lock);
+ nfs_set_verifier_locked(dentry, verf);
+ spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_set_verifier);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+/**
+ * nfs_clear_verifier_delegated - clear the dir verifier delegation tag
+ * @inode: pointer to inode
+ *
+ * Iterates through the dentries in the inode alias list and clears
+ * the tag used to indicate that the dentry has been revalidated
+ * while holding a delegation.
+ * This function is intended for use when the delegation is being
+ * returned or revoked.
+ *
+ * A NULL @inode is accepted and ignored.
+ */
+void nfs_clear_verifier_delegated(struct inode *inode)
+{
+ struct dentry *alias;
+
+ if (!inode)
+ return;
+ /* i_lock is held across the alias walk; each d_lock nests inside it */
+ spin_lock(&inode->i_lock);
+ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+ spin_lock(&alias->d_lock);
+ nfs_unset_verifier_delegated(&alias->d_time);
+ spin_unlock(&alias->d_lock);
+ }
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_clear_verifier_delegated);
+#endif /* IS_ENABLED(CONFIG_NFS_V4) */
+
+/*
+ * A check for whether or not the parent directory has changed.
+ * In the case it has, we assume that the dentries are untrustworthy
+ * and may need to be looked up again.
+ * If rcu_walk prevents us from performing a full check, return 0.
+ *
+ * Returns 1 if the dentry's verifier still matches the directory (or the
+ * dentry is a root dentry), 0 otherwise. 0 is also returned when the
+ * mount disables the lookup cache, or when revalidating the directory
+ * inode fails.
+ */
+static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
+ int rcu_walk)
+{
+ if (IS_ROOT(dentry))
+ return 1;
+ if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
+ return 0;
+ if (!nfs_verify_change_attribute(dir, dentry->d_time))
+ return 0;
+ /* Revalidate nfsi->cache_change_attribute before we declare a match */
+ if (nfs_mapping_need_revalidate_inode(dir)) {
+ /* The revalidation below is not attempted in rcu-walk mode */
+ if (rcu_walk)
+ return 0;
+ if (__nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
+ return 0;
+ }
+ /* Check again: the revalidation may have changed the attribute */
+ if (!nfs_verify_change_attribute(dir, dentry->d_time))
+ return 0;
+ return 1;
+}
+
+/*
+ * Use intent information to check whether or not we're going to do
+ * an O_EXCL create using this path component.
+ *
+ * Always 0 for protocol version 2; otherwise the LOOKUP_EXCL bit of
+ * @flags (non-zero when set).
+ */
+static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags)
+{
+	return (NFS_PROTO(dir)->version == 2) ? 0 : (flags & LOOKUP_EXCL);
+}
+
+/*
+ * Inode and filehandle revalidation for lookups.
+ *
+ * We force revalidation in the cases where the VFS sets LOOKUP_REVAL,
+ * or if the intent information indicates that we're about to open this
+ * particular file and the "nocto" mount flag is not set.
+ *
+ * Returns 0 if the inode may be used, -ESTALE if its link count is zero,
+ * -ECHILD if a forced revalidation is required but LOOKUP_RCU is set, or
+ * the non-zero result of __nfs_revalidate_inode().
+ */
+static
+int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ int ret;
+
+ /* Automount points are accepted without any further checks */
+ if (IS_AUTOMOUNT(inode))
+ return 0;
+
+ if (flags & LOOKUP_OPEN) {
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ /* A NFSv4 OPEN will revalidate later */
+ if (server->caps & NFS_CAP_ATOMIC_OPEN)
+ goto out;
+ fallthrough;
+ case S_IFDIR:
+ /* "nocto": skip the forced check, LOOKUP_REVAL is still tested below */
+ if (server->flags & NFS_MOUNT_NOCTO)
+ break;
+ /* NFS close-to-open cache consistency validation */
+ goto out_force;
+ }
+ }
+
+ /* VFS wants an on-the-wire revalidation */
+ if (flags & LOOKUP_REVAL)
+ goto out_force;
+out:
+ return (inode->i_nlink == 0) ? -ESTALE : 0;
+out_force:
+ /* The revalidation below is not attempted under LOOKUP_RCU */
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ ret = __nfs_revalidate_inode(server, inode);
+ if (ret != 0)
+ return ret;
+ goto out;
+}
+
+/*
+ * Set NFS_INO_REVAL_PAGECACHE in the inode's cache_validity, under
+ * inode->i_lock.
+ */
+static void nfs_mark_dir_for_revalidate(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	NFS_I(inode)->cache_validity |= NFS_INO_REVAL_PAGECACHE;
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Decide whether a negative dentry has to be revalidated.
+ *
+ * When creating a new file, or looking up a rename target, it shouldn't
+ * be necessary to revalidate a negative dentry, so 0 is returned.
+ * Mounts with NFS_MOUNT_LOOKUP_CACHE_NONEG always ask for revalidation.
+ *
+ * Otherwise the answer is the inverse of nfs_check_verifier() on the
+ * parent directory. That helper returns 0 when LOOKUP_RCU prevents a
+ * full check, so in that case this returns 1, suggesting a reval is
+ * needed.
+ */
+static inline
+int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
+		       unsigned int flags)
+{
+	if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+		return 0;
+	if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
+		return 1;
+	if (nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU))
+		return 0;
+	return 1;
+}
+
+/*
+ * Common exit for the lookup revalidation paths: log the outcome and
+ * turn @error into the value to return.
+ *
+ * @error == 1 means valid and @error == 0 means invalid; any other value
+ * is passed through unchanged. An invalid result is overridden to 1 for
+ * a root dentry that has an inode.
+ */
+static int
+nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
+			   struct inode *inode, int error)
+{
+	if (error == 1) {
+		dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
+				__func__, dentry);
+		return 1;
+	}
+
+	if (error != 0) {
+		dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n",
+				__func__, dentry, error);
+		return error;
+	}
+
+	/*
+	 * We can't d_drop the root of a disconnected tree:
+	 * its d_hash is on the s_anon list and d_drop() would hide
+	 * it from shrink_dcache_for_unmount(), leading to busy
+	 * inodes on unmount and further oopses.
+	 */
+	if (inode && IS_ROOT(dentry))
+		return 1;
+	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
+			__func__, dentry);
+	return 0;
+}
+
+/*
+ * Revalidate a negative dentry.
+ *
+ * The dentry is reported valid unless nfs_neg_need_reval() asks for a
+ * revalidation; in that case it is reported invalid, or -ECHILD is
+ * returned when LOOKUP_RCU is set.
+ */
+static int
+nfs_lookup_revalidate_negative(struct inode *dir, struct dentry *dentry,
+			       unsigned int flags)
+{
+	if (!nfs_neg_need_reval(dir, dentry, flags))
+		return nfs_lookup_revalidate_done(dir, dentry, NULL, 1);
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+	return nfs_lookup_revalidate_done(dir, dentry, NULL, 0);
+}
+
+/*
+ * Revalidation path for a dentry whose verifier carries the delegation
+ * tag: refresh the verifier from the directory's current change
+ * attribute and report the dentry as valid.
+ */
+static int
+nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry,
+				struct inode *inode)
+{
+	unsigned long dir_verifier = nfs_save_change_attribute(dir);
+
+	nfs_set_verifier(dentry, dir_verifier);
+	return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
+}
+
+/*
+ * Revalidate a positive dentry by asking the server to look the name up
+ * again.
+ *
+ * The dentry is valid (1) if the lookup succeeds, the returned filehandle
+ * matches the one cached in @inode and the inode attributes could be
+ * refreshed. In that case the dentry's verifier is updated and readdirplus
+ * use is forced on the directory.
+ *
+ * The dentry is invalid (0) if the server reports -ESTALE or -ENOENT, if
+ * the filehandle differs, or if refreshing the inode fails. A lookup that
+ * times out on a mount with NFS_MOUNT_SOFTREVAL is treated as valid.
+ * Other lookup errors, and -ENOMEM from the allocations, are returned
+ * as-is.
+ *
+ * The result is passed through nfs_lookup_revalidate_done().
+ */
+static int
+nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
+			     struct inode *inode)
+{
+	struct nfs_fh *fhandle;
+	struct nfs_fattr *fattr;
+	struct nfs4_label *label;
+	unsigned long dir_verifier;
+	int ret;
+
+	ret = -ENOMEM;
+	fhandle = nfs_alloc_fhandle();
+	fattr = nfs_alloc_fattr();
+	label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+	if (IS_ERR(label)) {
+		/*
+		 * The common exit path frees the label unconditionally.
+		 * Never hand it an ERR_PTR: nfs_lookup() likewise skips
+		 * nfs4_label_free() when the label allocation failed.
+		 */
+		label = NULL;
+		goto out;
+	}
+	if (fhandle == NULL || fattr == NULL)
+		goto out;
+
+	/* Sample the verifier before the lookup goes on the wire */
+	dir_verifier = nfs_save_change_attribute(dir);
+	ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr, label);
+	if (ret < 0) {
+		switch (ret) {
+		case -ESTALE:
+		case -ENOENT:
+			ret = 0;
+			break;
+		case -ETIMEDOUT:
+			if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)
+				ret = 1;
+		}
+		goto out;
+	}
+	ret = 0;
+	/* A different filehandle means the name now refers to another object */
+	if (nfs_compare_fh(NFS_FH(inode), fhandle))
+		goto out;
+	if (nfs_refresh_inode(inode, fattr) < 0)
+		goto out;
+
+	nfs_setsecurity(inode, fattr, label);
+	nfs_set_verifier(dentry, dir_verifier);
+
+	/* set a readdirplus hint that we had a cache miss */
+	nfs_force_use_readdirplus(dir);
+	ret = 1;
+out:
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fhandle);
+	nfs4_label_free(label);
+
+	/*
+	 * If the lookup failed despite the dentry change attribute being
+	 * a match, then we should revalidate the directory cache.
+	 */
+	if (!ret && nfs_verify_change_attribute(dir, dentry->d_time))
+		nfs_mark_dir_for_revalidate(dir);
+	return nfs_lookup_revalidate_done(dir, dentry, inode, ret);
+}
+
+/*
+ * This is called every time the dcache has a lookup hit,
+ * and we should check whether we can really trust that
+ * lookup.
+ *
+ * NOTE! The hit can be a negative hit too, don't assume
+ * we have an inode!
+ *
+ * If the parent directory is seen to have changed, we throw out the
+ * cached dentry and do a new lookup.
+ *
+ * Returns 1 if the dentry is valid, 0 if it is not, -ECHILD when the
+ * decision cannot be made under LOOKUP_RCU, or another negative value
+ * passed up from the helpers.
+ */
+static int
+nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct inode *inode;
+ int error;
+
+ nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
+ inode = d_inode(dentry);
+
+ if (!inode)
+ return nfs_lookup_revalidate_negative(dir, dentry, flags);
+
+ if (is_bad_inode(inode)) {
+ dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
+ __func__, dentry);
+ goto out_bad;
+ }
+
+ /* A verifier tagged as delegated is trusted without a lookup */
+ if (nfs_verifier_is_delegated(dentry))
+ return nfs_lookup_revalidate_delegated(dir, dentry, inode);
+
+ /* Force a full look up iff the parent directory has changed */
+ if (!(flags & (LOOKUP_EXCL | LOOKUP_REVAL)) &&
+ nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
+ error = nfs_lookup_verify_inode(inode, flags);
+ if (error) {
+ if (error == -ESTALE)
+ nfs_mark_dir_for_revalidate(dir);
+ goto out_bad;
+ }
+ nfs_advise_use_readdirplus(dir);
+ goto out_valid;
+ }
+
+ /* Everything below this point is not attempted under LOOKUP_RCU */
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ if (NFS_STALE(inode))
+ goto out_bad;
+
+ trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
+ error = nfs_lookup_revalidate_dentry(dir, dentry, inode);
+ trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error);
+ return error;
+out_valid:
+ return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
+out_bad:
+ /* Under LOOKUP_RCU, -ECHILD is returned instead of "invalid" */
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ return nfs_lookup_revalidate_done(dir, dentry, inode, 0);
+}
+
+/*
+ * Run the revalidation callback @reval on @dentry, with the inode of the
+ * dentry's parent as the directory argument.
+ *
+ * Under LOOKUP_RCU the parent is sampled with READ_ONCE() and no
+ * reference is taken; -ECHILD is returned if the parent has no inode, or
+ * if dentry->d_parent changed while @reval was running.
+ *
+ * Otherwise a reference to the parent is held across the call via
+ * dget_parent()/dput().
+ */
+static int
+__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
+ int (*reval)(struct inode *, struct dentry *, unsigned int))
+{
+ struct dentry *parent;
+ struct inode *dir;
+ int ret;
+
+ if (flags & LOOKUP_RCU) {
+ parent = READ_ONCE(dentry->d_parent);
+ dir = d_inode_rcu(parent);
+ if (!dir)
+ return -ECHILD;
+ ret = reval(dir, dentry, flags);
+ /* The result is discarded if the parent changed during reval() */
+ if (parent != READ_ONCE(dentry->d_parent))
+ return -ECHILD;
+ } else {
+ parent = dget_parent(dentry);
+ ret = reval(d_inode(parent), dentry, flags);
+ dput(parent);
+ }
+ return ret;
+}
+
+/*
+ * ->d_revalidate entry used in nfs_dentry_operations: runs
+ * nfs_do_lookup_revalidate() against the dentry's parent directory.
+ */
+static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate);
+}
+
+/*
+ * A weaker form of d_revalidate for revalidating just the d_inode(dentry)
+ * when we don't really care about the dentry name. This is called when a
+ * pathwalk ends on a dentry that was not found via a normal lookup in the
+ * parent dir (e.g.: ".", "..", procfs symlinks or mountpoint traversals).
+ *
+ * In this situation, we just want to verify that the inode itself is OK
+ * since the dentry might have changed on the server.
+ *
+ * Returns 1 if the dentry is accepted and 0 if it is not. Any non-zero
+ * result from nfs_lookup_verify_inode() is reported as 0.
+ */
+static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct inode *inode = d_inode(dentry);
+ int error = 0;
+
+ /*
+ * I believe we can only get a negative dentry here in the case of a
+ * procfs-style symlink. Just assume it's correct for now, but we may
+ * eventually need to do something more here.
+ */
+ if (!inode) {
+ dfprintk(LOOKUPCACHE, "%s: %pd2 has negative inode\n",
+ __func__, dentry);
+ return 1;
+ }
+
+ if (is_bad_inode(inode)) {
+ dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
+ __func__, dentry);
+ return 0;
+ }
+
+ error = nfs_lookup_verify_inode(inode, flags);
+ dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n",
+ __func__, inode->i_ino, error ? "invalid" : "valid");
+ return !error;
+}
+
+/*
+ * This is called from dput() when d_count is going to 0.
+ *
+ * Returns 1 in the three cases the comments below describe as "unhash":
+ * the dentry has a stale inode, it was silly-renamed, or its superblock
+ * is no longer SB_ACTIVE. Returns 0 otherwise.
+ */
+static int nfs_dentry_delete(const struct dentry *dentry)
+{
+ dfprintk(VFS, "NFS: dentry_delete(%pd2, %x)\n",
+ dentry, dentry->d_flags);
+
+ /* Unhash any dentry with a stale inode */
+ if (d_really_is_positive(dentry) && NFS_STALE(d_inode(dentry)))
+ return 1;
+
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+ /* Unhash it, so that ->d_iput() would be called */
+ return 1;
+ }
+ if (!(dentry->d_sb->s_flags & SB_ACTIVE)) {
+ /* Unhash it, so that ancestors of killed async unlink
+ * files will be cleaned up during umount */
+ return 1;
+ }
+ return 0;
+
+}
+
+/*
+ * Ensure that we revalidate inode->i_nlink
+ *
+ * Under inode->i_lock: decrements the link count if it is still
+ * positive, assigns a fresh attribute generation count and marks the
+ * change, ctime and "other" attributes invalid, with forced revalidation.
+ */
+static void nfs_drop_nlink(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ /* drop the inode if we're reasonably sure this is the last link */
+ if (inode->i_nlink > 0)
+ drop_nlink(inode);
+ NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter();
+ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE
+ | NFS_INO_INVALID_CTIME
+ | NFS_INO_INVALID_OTHER
+ | NFS_INO_REVAL_FORCED;
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Called when the dentry loses inode.
+ * We use it to clean up silly-renamed files.
+ *
+ * Always ends by dropping the inode reference with iput().
+ */
+static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
+{
+ if (S_ISDIR(inode->i_mode))
+ /* drop any readdir cache as it could easily be old */
+ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
+
+ /* Silly-renamed dentry: complete the unlink and drop the link count */
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+ nfs_complete_unlink(dentry, inode);
+ nfs_drop_nlink(inode);
+ }
+ iput(inode);
+}
+
+/*
+ * ->d_release: free dentry->d_fsdata if it is still set.
+ *
+ * If the dentry is flagged DCACHE_NFSFS_RENAMED while d_fsdata is set,
+ * a warning is raised and nothing is freed.
+ */
+static void nfs_d_release(struct dentry *dentry)
+{
+ /* free cached devname value, if it survived that far */
+ if (unlikely(dentry->d_fsdata)) {
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+ WARN_ON(1);
+ else
+ kfree(dentry->d_fsdata);
+ }
+}
+
+/*
+ * Dentry operations table using the generic nfs_lookup_revalidate() as
+ * its ->d_revalidate method. Exported for use by other modules.
+ */
+const struct dentry_operations nfs_dentry_operations = {
+ .d_revalidate = nfs_lookup_revalidate,
+ .d_weak_revalidate = nfs_weak_revalidate,
+ .d_delete = nfs_dentry_delete,
+ .d_iput = nfs_dentry_iput,
+ .d_automount = nfs_d_automount,
+ .d_release = nfs_d_release,
+};
+EXPORT_SYMBOL_GPL(nfs_dentry_operations);
+
+/*
+ * Look up @dentry in directory @dir on the server.
+ *
+ * Returns NULL when @dentry itself was used (including the negative
+ * case, where the server answered -ENOENT), a different dentry if
+ * d_splice_alias() substituted one, or an ERR_PTR() on failure.
+ *
+ * For an exclusive create or a rename target the lookup is skipped
+ * entirely and NULL is returned without hashing the dentry.
+ */
+struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
+{
+ struct dentry *res;
+ struct inode *inode = NULL;
+ struct nfs_fh *fhandle = NULL;
+ struct nfs_fattr *fattr = NULL;
+ struct nfs4_label *label = NULL;
+ unsigned long dir_verifier;
+ int error;
+
+ dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry);
+ nfs_inc_stats(dir, NFSIOS_VFSLOOKUP);
+
+ if (unlikely(dentry->d_name.len > NFS_SERVER(dir)->namelen))
+ return ERR_PTR(-ENAMETOOLONG);
+
+ /*
+ * If we're doing an exclusive create, optimize away the lookup
+ * but don't hash the dentry.
+ */
+ if (nfs_is_exclusive_create(dir, flags) || flags & LOOKUP_RENAME_TARGET)
+ return NULL;
+
+ res = ERR_PTR(-ENOMEM);
+ fhandle = nfs_alloc_fhandle();
+ fattr = nfs_alloc_fattr();
+ if (fhandle == NULL || fattr == NULL)
+ goto out;
+
+ /* On failure jump to 'out', which does not call nfs4_label_free() */
+ label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT);
+ if (IS_ERR(label))
+ goto out;
+
+ /* Sample the verifier before the lookup goes on the wire */
+ dir_verifier = nfs_save_change_attribute(dir);
+ trace_nfs_lookup_enter(dir, dentry, flags);
+ error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr, label);
+ /* -ENOENT: continue at no_entry with inode still NULL */
+ if (error == -ENOENT)
+ goto no_entry;
+ if (error < 0) {
+ res = ERR_PTR(error);
+ goto out_label;
+ }
+ inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
+ res = ERR_CAST(inode);
+ if (IS_ERR(res))
+ goto out_label;
+
+ /* Notify readdir to use READDIRPLUS */
+ nfs_force_use_readdirplus(dir);
+
+no_entry:
+ res = d_splice_alias(inode, dentry);
+ if (res != NULL) {
+ if (IS_ERR(res))
+ goto out_label;
+ /* A different dentry was returned: the verifier goes on that one */
+ dentry = res;
+ }
+ nfs_set_verifier(dentry, dir_verifier);
+out_label:
+ trace_nfs_lookup_exit(dir, dentry, flags, error);
+ nfs4_label_free(label);
+out:
+ nfs_free_fattr(fattr);
+ nfs_free_fhandle(fhandle);
+ return res;
+}
+EXPORT_SYMBOL_GPL(nfs_lookup);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+static int nfs4_lookup_revalidate(struct dentry *, unsigned int);
+
+/*
+ * Dentry operations table for NFSv4. It differs from
+ * nfs_dentry_operations only in its ->d_revalidate method, which is
+ * nfs4_lookup_revalidate(). Exported for use by other modules.
+ */
+const struct dentry_operations nfs4_dentry_operations = {
+ .d_revalidate = nfs4_lookup_revalidate,
+ .d_weak_revalidate = nfs_weak_revalidate,
+ .d_delete = nfs_dentry_delete,
+ .d_iput = nfs_dentry_iput,
+ .d_automount = nfs_d_automount,
+ .d_release = nfs_d_release,
+};
+EXPORT_SYMBOL_GPL(nfs4_dentry_operations);
+
+/*
+ * Allocate an nfs_open_context for @dentry and @filp, with the mode
+ * derived from @open_flags by flags_to_mode(). The result of
+ * alloc_nfs_open_context() is returned unchanged; the caller in this file
+ * checks it with IS_ERR().
+ */
+static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags, struct file *filp)
+{
+ return alloc_nfs_open_context(dentry, flags_to_mode(open_flags), filp);
+}
+
+/*
+ * Open callback handed to finish_open(): calls nfs_fscache_open_file()
+ * for the file and always returns 0.
+ */
+static int do_open(struct inode *inode, struct file *filp)
+{
+ nfs_fscache_open_file(inode, filp);
+ return 0;
+}
+
+/*
+ * Complete an atomic open: call finish_open() and, if the opened object
+ * is a regular file, attach the open context @ctx to @file.
+ *
+ * Returns 0 on success, the error from finish_open(), or -EOPENSTALE if
+ * the file's dentry does not refer to a regular file.
+ */
+static int nfs_finish_open(struct nfs_open_context *ctx,
+			   struct dentry *dentry,
+			   struct file *file, unsigned open_flags)
+{
+	int err = finish_open(file, dentry, do_open);
+
+	if (err)
+		return err;
+	if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
+		return -EOPENSTALE;
+	nfs_file_set_open_context(file, ctx);
+	return 0;
+}
+
+/*
+ * Atomic lookup-and-open of @dentry in @dir.
+ *
+ * @dentry must be negative on entry (enforced with BUG_ON). The open is
+ * attempted through the protocol's ->open_context() operation. When the
+ * request is for a directory, or the server answers -EISDIR, -ENOTDIR or
+ * (without O_NOFOLLOW) -ELOOP, the function falls back to a plain
+ * nfs_lookup() followed by finish_no_open().
+ *
+ * Returns 0 or a negative error, or whatever finish_no_open() returns on
+ * the fallback path.
+ */
+int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned open_flags,
+ umode_t mode)
+{
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ struct nfs_open_context *ctx;
+ struct dentry *res;
+ struct iattr attr = { .ia_valid = ATTR_OPEN };
+ struct inode *inode;
+ unsigned int lookup_flags = 0;
+ bool switched = false;
+ int created = 0;
+ int err;
+
+ /* Expect a negative dentry */
+ BUG_ON(d_inode(dentry));
+
+ dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n",
+ dir->i_sb->s_id, dir->i_ino, dentry);
+
+ err = nfs_check_flags(open_flags);
+ if (err)
+ return err;
+
+ /* NFS only supports OPEN on regular files */
+ if ((open_flags & O_DIRECTORY)) {
+ if (!d_in_lookup(dentry)) {
+ /*
+ * Hashed negative dentry with O_DIRECTORY: dentry was
+ * revalidated and is fine, no need to perform lookup
+ * again
+ */
+ return -ENOENT;
+ }
+ lookup_flags = LOOKUP_OPEN|LOOKUP_DIRECTORY;
+ goto no_open;
+ }
+
+ if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
+ return -ENAMETOOLONG;
+
+ if (open_flags & O_CREAT) {
+ struct nfs_server *server = NFS_SERVER(dir);
+
+ /* Apply the umask here unless the server advertises MODE_UMASK */
+ if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
+ mode &= ~current_umask();
+
+ attr.ia_valid |= ATTR_MODE;
+ attr.ia_mode = mode;
+ }
+ if (open_flags & O_TRUNC) {
+ attr.ia_valid |= ATTR_SIZE;
+ attr.ia_size = 0;
+ }
+
+ /*
+ * No create and the dentry is not in-lookup: drop it and continue
+ * with a dentry obtained from d_alloc_parallel(). 'switched' records
+ * that this replacement must be finished and put before returning.
+ */
+ if (!(open_flags & O_CREAT) && !d_in_lookup(dentry)) {
+ d_drop(dentry);
+ switched = true;
+ dentry = d_alloc_parallel(dentry->d_parent,
+ &dentry->d_name, &wq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ /* The returned dentry is not in-lookup: hand it to finish_no_open() */
+ if (unlikely(!d_in_lookup(dentry)))
+ return finish_no_open(file, dentry);
+ }
+
+ ctx = create_nfs_open_context(dentry, open_flags, file);
+ err = PTR_ERR(ctx);
+ if (IS_ERR(ctx))
+ goto out;
+
+ trace_nfs_atomic_open_enter(dir, ctx, open_flags);
+ inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, &created);
+ if (created)
+ file->f_mode |= FMODE_CREATED;
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
+ put_nfs_open_context(ctx);
+ d_drop(dentry);
+ switch (err) {
+ case -ENOENT:
+ /* Instantiate the dentry as negative and record the dir verifier */
+ d_splice_alias(NULL, dentry);
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ break;
+ case -EISDIR:
+ case -ENOTDIR:
+ goto no_open;
+ case -ELOOP:
+ if (!(open_flags & O_NOFOLLOW))
+ goto no_open;
+ break;
+ /* case -EINVAL: */
+ default:
+ break;
+ }
+ goto out;
+ }
+
+ err = nfs_finish_open(ctx, ctx->dentry, file, open_flags);
+ trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
+ put_nfs_open_context(ctx);
+out:
+ if (unlikely(switched)) {
+ d_lookup_done(dentry);
+ dput(dentry);
+ }
+ return err;
+
+ /* Fallback: ordinary lookup, then let the VFS finish via finish_no_open() */
+no_open:
+ res = nfs_lookup(dir, dentry, lookup_flags);
+ if (!res) {
+ /* nfs_lookup() used 'dentry' itself: inspect its inode */
+ inode = d_inode(dentry);
+ if ((lookup_flags & LOOKUP_DIRECTORY) && inode &&
+ !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)))
+ res = ERR_PTR(-ENOTDIR);
+ else if (inode && S_ISREG(inode->i_mode))
+ res = ERR_PTR(-EOPENSTALE);
+ } else if (!IS_ERR(res)) {
+ /* nfs_lookup() returned another dentry: same checks, dput on rejection */
+ inode = d_inode(res);
+ if ((lookup_flags & LOOKUP_DIRECTORY) && inode &&
+ !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) {
+ dput(res);
+ res = ERR_PTR(-ENOTDIR);
+ } else if (inode && S_ISREG(inode->i_mode)) {
+ dput(res);
+ res = ERR_PTR(-EOPENSTALE);
+ }
+ }
+ if (switched) {
+ d_lookup_done(dentry);
+ /* The replacement dentry becomes the result, or is put if unused */
+ if (!res)
+ res = dentry;
+ else
+ dput(dentry);
+ }
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+ return finish_no_open(file, res);
+}
+EXPORT_SYMBOL_GPL(nfs_atomic_open);
+
+/*
+ * NFSv4 dentry revalidation.
+ *
+ * Only a LOOKUP_OPEN of a positive, non-mountpoint dentry for a regular
+ * file without LOOKUP_DIRECTORY gets special treatment here: if the
+ * directory is unchanged, 1 is returned and the actual revalidation is
+ * left to the open itself. Everything else is handed to
+ * nfs_do_lookup_revalidate().
+ */
+static int
+nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct inode *inode;
+
+ if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
+ goto full_reval;
+ if (d_mountpoint(dentry))
+ goto full_reval;
+
+ inode = d_inode(dentry);
+
+ /* We can't create new files in nfs_open_revalidate(), so we
+ * optimize away revalidation of negative dentries.
+ */
+ if (inode == NULL)
+ goto full_reval;
+
+ if (nfs_verifier_is_delegated(dentry))
+ return nfs_lookup_revalidate_delegated(dir, dentry, inode);
+
+ /* NFS only supports OPEN on regular files */
+ if (!S_ISREG(inode->i_mode))
+ goto full_reval;
+
+ /* We cannot do exclusive creation on a positive dentry */
+ if (flags & (LOOKUP_EXCL | LOOKUP_REVAL))
+ goto reval_dentry;
+
+ /* Check if the directory changed */
+ if (!nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU))
+ goto reval_dentry;
+
+ /* Let f_op->open() actually open (and revalidate) the file */
+ return 1;
+reval_dentry:
+ /* The lookup-based revalidation is not attempted under LOOKUP_RCU */
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ return nfs_lookup_revalidate_dentry(dir, dentry, inode);
+
+full_reval:
+ return nfs_do_lookup_revalidate(dir, dentry, flags);
+}
+
+/*
+ * ->d_revalidate entry used in nfs4_dentry_operations: runs
+ * nfs4_do_lookup_revalidate() against the dentry's parent directory.
+ */
+static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return __nfs_lookup_revalidate(dentry, flags,
+ nfs4_do_lookup_revalidate);
+}
+
+#endif /* IS_ENABLED(CONFIG_NFS_V4) */
+
+/*
+ * Attach an inode, built from @fhandle and @fattr, to @dentry.
+ *
+ * The dentry is dropped first. If @fhandle is empty, the filehandle is
+ * obtained with a lookup; if @fattr holds no valid attributes, they are
+ * fetched with a getattr. The inode from nfs_fhget() is then spliced
+ * into the dcache.
+ *
+ * Returns the result of d_splice_alias(), or an ERR_PTR() if the lookup
+ * or the getattr failed. The reference taken on the parent dentry is
+ * dropped on every path.
+ */
+struct dentry *
+nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr,
+ struct nfs4_label *label)
+{
+ struct dentry *parent = dget_parent(dentry);
+ struct inode *dir = d_inode(parent);
+ struct inode *inode;
+ struct dentry *d;
+ int error;
+
+ d_drop(dentry);
+
+ /* No filehandle supplied: look the name up to obtain one */
+ if (fhandle->size == 0) {
+ error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr, NULL);
+ if (error)
+ goto out_error;
+ }
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ /* No valid attributes supplied: fetch them */
+ if (!(fattr->valid & NFS_ATTR_FATTR)) {
+ struct nfs_server *server = NFS_SB(dentry->d_sb);
+ error = server->nfs_client->rpc_ops->getattr(server, fhandle,
+ fattr, NULL, NULL);
+ if (error < 0)
+ goto out_error;
+ }
+ /* The nfs_fhget() result is passed to d_splice_alias() without a check here */
+ inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
+ d = d_splice_alias(inode, dentry);
+out:
+ dput(parent);
+ return d;
+out_error:
+ d = ERR_PTR(error);
+ goto out;
+}
+EXPORT_SYMBOL_GPL(nfs_add_or_obtain);
+
+/*
+ * Code common to create, mkdir, and mknod.
+ *
+ * Wraps nfs_add_or_obtain() for callers that have no use for the
+ * resulting dentry: returns 0 on success, or the error encoded in the
+ * returned pointer.
+ */
+int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
+		    struct nfs_fattr *fattr,
+		    struct nfs4_label *label)
+{
+	struct dentry *d = nfs_add_or_obtain(dentry, fhandle, fattr, label);
+
+	if (!IS_ERR(d)) {
+		/* Callers don't care */
+		dput(d);
+		return 0;
+	}
+	return PTR_ERR(d);
+}
+EXPORT_SYMBOL_GPL(nfs_instantiate);
+
+/*
+ * Create a file through the protocol's ->create() operation.
+ *
+ * Following a failed create operation, we drop the dentry rather
+ * than retain a negative dentry. This avoids a problem in the event
+ * that the operation succeeded on the server, but an error in the
+ * reply path made it appear to have failed.
+ *
+ * Returns 0 on success, or the error from ->create().
+ */
+int nfs_create(struct inode *dir, struct dentry *dentry,
+		umode_t mode, bool excl)
+{
+	struct iattr attr;
+	int open_flags = O_CREAT;
+	int error;
+
+	if (excl)
+		open_flags |= O_EXCL;
+
+	dfprintk(VFS, "NFS: create(%s/%lu), %pd\n",
+			dir->i_sb->s_id, dir->i_ino, dentry);
+
+	attr.ia_valid = ATTR_MODE;
+	attr.ia_mode = mode;
+
+	trace_nfs_create_enter(dir, dentry, open_flags);
+	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags);
+	trace_nfs_create_exit(dir, dentry, open_flags, error);
+	if (error != 0)
+		d_drop(dentry);
+	return error;
+}
+EXPORT_SYMBOL_GPL(nfs_create);
+
+/*
+ * Create a device node through the protocol's ->mknod() operation.
+ *
+ * As in nfs_create(), the dentry is dropped when the operation fails.
+ * Returns 0 on success, or the error from ->mknod().
+ */
+int
+nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+{
+	struct iattr attr;
+	int status;
+
+	dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n",
+			dir->i_sb->s_id, dir->i_ino, dentry);
+
+	attr.ia_valid = ATTR_MODE;
+	attr.ia_mode = mode;
+
+	trace_nfs_mknod_enter(dir, dentry);
+	status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
+	trace_nfs_mknod_exit(dir, dentry, status);
+	if (status != 0)
+		d_drop(dentry);
+	return status;
+}
+EXPORT_SYMBOL_GPL(nfs_mknod);
+
+/*
+ * Create a directory through the protocol's ->mkdir() operation; S_IFDIR
+ * is or'ed into the requested mode.
+ *
+ * As in nfs_create(), the dentry is dropped when the operation fails.
+ * Returns 0 on success, or the error from ->mkdir().
+ */
+int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct iattr attr;
+	int error;
+
+	dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n",
+			dir->i_sb->s_id, dir->i_ino, dentry);
+
+	attr.ia_mode = mode | S_IFDIR;
+	attr.ia_valid = ATTR_MODE;
+
+	trace_nfs_mkdir_enter(dir, dentry);
+	error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
+	trace_nfs_mkdir_exit(dir, dentry, error);
+	if (error != 0)
+		d_drop(dentry);
+	return error;
+}
+EXPORT_SYMBOL_GPL(nfs_mkdir);
+
+/*
+ * Called when the server answered -ENOENT for @dentry: if the dentry is
+ * still positive (per simple_positive()), delete it from the dcache.
+ */
+static void nfs_dentry_handle_enoent(struct dentry *dentry)
+{
+	if (!simple_positive(dentry))
+		return;
+	d_delete(dentry);
+}
+
+/*
+ * Remove the directory named by @dentry from @dir through the protocol's
+ * ->rmdir operation and return that operation's status.
+ *
+ * For a positive dentry the call is made with the victim inode's
+ * rmdir_sem held for writing; on success the inode's link count is
+ * cleared, and on -ENOENT the dentry is handed to
+ * nfs_dentry_handle_enoent(). A negative dentry just gets the call.
+ */
+int nfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ int error;
+
+ dfprintk(VFS, "NFS: rmdir(%s/%lu), %pd\n",
+ dir->i_sb->s_id, dir->i_ino, dentry);
+
+ trace_nfs_rmdir_enter(dir, dentry);
+ if (d_really_is_positive(dentry)) {
+ down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
+ error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
+ /* Ensure the VFS deletes this inode */
+ switch (error) {
+ case 0:
+ clear_nlink(d_inode(dentry));
+ break;
+ case -ENOENT:
+ /* Already gone on the server: drop our cached view of it */
+ nfs_dentry_handle_enoent(dentry);
+ }
+ up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
+ } else
+ error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
+ trace_nfs_rmdir_exit(dir, dentry, error);
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(nfs_rmdir);
+
+/*
+ * Remove the file behind @dentry through the protocol's ->remove
+ * operation, using the dentry's parent as the directory.
+ *
+ * A dentry flagged DCACHE_NFSFS_RENAMED (sillyrenamed) is not removed
+ * here; 0 is returned straight away. Otherwise, on success the link
+ * count of a positive dentry's inode is dropped, and on -ENOENT the
+ * dentry is handed to nfs_dentry_handle_enoent().
+ *
+ * Returns 0 or the error from ->remove.
+ */
+static int nfs_safe_remove(struct dentry *dentry)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	struct inode *inode = d_inode(dentry);
+	int error;
+
+	dfprintk(VFS, "NFS: safe_remove(%pd2)\n", dentry);
+
+	/* Sillyrenamed dentries need no remove call from here */
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+		return 0;
+
+	trace_nfs_remove_enter(dir, dentry);
+	error = NFS_PROTO(dir)->remove(dir, dentry);
+	switch (error) {
+	case 0:
+		if (inode != NULL)
+			nfs_drop_nlink(inode);
+		break;
+	case -ENOENT:
+		nfs_dentry_handle_enoent(dentry);
+		break;
+	default:
+		break;
+	}
+	trace_nfs_remove_exit(dir, dentry, error);
+	return error;
+}
+
+/*
+ * Unlink @dentry from @dir.
+ *
+ * If the dentry has other users (d_count() > 1) we start writeback of
+ * the inode and silly-rename the file instead, returning whatever
+ * nfs_sillyrename() returns. In case sillyrename() returns -EBUSY, the
+ * inode belongs to an active ".nfs..." file and we return -EBUSY.
+ *
+ * Otherwise the dentry is unhashed and removed via nfs_safe_remove().
+ * On success or -ENOENT the dentry's verifier is refreshed from @dir;
+ * on any other error a dentry that we unhashed is rehashed.
+ */
+int nfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ int error;
+ int need_rehash = 0;
+
+ dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id,
+ dir->i_ino, dentry);
+
+ trace_nfs_unlink_enter(dir, dentry);
+ /* d_lock covers both the d_count() check and the unhash below */
+ spin_lock(&dentry->d_lock);
+ if (d_count(dentry) > 1) {
+ spin_unlock(&dentry->d_lock);
+ /* Start asynchronous writeout of the inode */
+ write_inode_now(d_inode(dentry), 0);
+ error = nfs_sillyrename(dir, dentry);
+ goto out;
+ }
+ if (!d_unhashed(dentry)) {
+ __d_drop(dentry);
+ /* Remember that we unhashed it so a failed remove can undo this */
+ need_rehash = 1;
+ }
+ spin_unlock(&dentry->d_lock);
+ error = nfs_safe_remove(dentry);
+ if (!error || error == -ENOENT) {
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ } else if (need_rehash)
+ d_rehash(dentry);
+out:
+ trace_nfs_unlink_exit(dir, dentry, error);
+ return error;
+}
+EXPORT_SYMBOL_GPL(nfs_unlink);
+
+/*
+ * To create a symbolic link, most file systems instantiate a new inode,
+ * add a page to it containing the path, then write it out to the disk
+ * using prepare_write/commit_write.
+ *
+ * Unfortunately the NFS client can't create the in-core inode first
+ * because it needs a file handle to create an in-core inode (see
+ * fs/nfs/inode.c:nfs_fhget). We only have a file handle *after* the
+ * symlink request has completed on the server.
+ *
+ * So instead we allocate a raw page, copy the symname into it, then do
+ * the SYMLINK request with the page as the buffer. If it succeeds, we
+ * now have a new file handle and can instantiate an in-core NFS inode
+ * and move the raw page into its mapping.
+ *
+ * Returns 0 on success, -ENAMETOOLONG if @symname is longer than
+ * PAGE_SIZE, -ENOMEM if no page could be allocated, or the error from
+ * the protocol's ->symlink operation (in which case the dentry is
+ * dropped).
+ */
+int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+ struct page *page;
+ char *kaddr;
+ struct iattr attr;
+ unsigned int pathlen = strlen(symname);
+ int error;
+
+ dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id,
+ dir->i_ino, dentry, symname);
+
+ /* The target path has to fit in the single page allocated below */
+ if (pathlen > PAGE_SIZE)
+ return -ENAMETOOLONG;
+
+ attr.ia_mode = S_IFLNK | S_IRWXUGO;
+ attr.ia_valid = ATTR_MODE;
+
+ page = alloc_page(GFP_USER);
+ if (!page)
+ return -ENOMEM;
+
+ /* Copy the path in and zero-fill whatever is left of the page */
+ kaddr = page_address(page);
+ memcpy(kaddr, symname, pathlen);
+ if (pathlen < PAGE_SIZE)
+ memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
+
+ trace_nfs_symlink_enter(dir, dentry);
+ error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
+ trace_nfs_symlink_exit(dir, dentry, error);
+ if (error != 0) {
+ dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n",
+ dir->i_sb->s_id, dir->i_ino,
+ dentry, symname, error);
+ d_drop(dentry);
+ __free_page(page);
+ return error;
+ }
+
+ /*
+ * No big deal if we can't add this page to the page cache here.
+ * READLINK will get the missing page from the server if needed.
+ */
+ if (!add_to_page_cache_lru(page, d_inode(dentry)->i_mapping, 0,
+ GFP_KERNEL)) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ /*
+ * add_to_page_cache_lru() grabs an extra page refcount.
+ * Drop it here to avoid leaking this page later.
+ */
+ put_page(page);
+ } else
+ __free_page(page);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_symlink);
+
+/*
+ * Create a hard link named @dentry in @dir that refers to the inode
+ * behind @old_dentry.
+ *
+ * The new dentry is unhashed first and, for regular files, the inode is
+ * synced before the protocol's ->link operation is called. On success
+ * an inode reference is taken and the dentry is added to the dcache
+ * with that inode.
+ *
+ * Returns 0 on success or the error from ->link.
+ */
+int
+nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+	struct inode *target = d_inode(old_dentry);
+	int status;
+
+	dfprintk(VFS, "NFS: link(%pd2 -> %pd2)\n",
+		old_dentry, dentry);
+
+	trace_nfs_link_enter(target, dir, dentry);
+	d_drop(dentry);
+	if (S_ISREG(target->i_mode))
+		nfs_sync_inode(target);
+	status = NFS_PROTO(dir)->link(target, dir, &dentry->d_name);
+	if (!status) {
+		/* Take a reference on the inode for the new dentry */
+		ihold(target);
+		d_add(dentry, target);
+	}
+	trace_nfs_link_exit(target, dir, dentry, status);
+	return status;
+}
+EXPORT_SYMBOL_GPL(nfs_link);
+
+/*
+ * RENAME
+ * FIXME: Some nfsds, like the Linux user space nfsd, may generate a
+ * different file handle for the same inode after a rename (e.g. when
+ * moving to a different directory). A fail-safe method to do so would
+ * be to look up old_dir/old_name, create a link to new_dir/new_name and
+ * rename the old file using the sillyrename stuff. This way, the original
+ * file in old_dir will go away when the last process iput()s the inode.
+ *
+ * FIXED.
+ *
+ * It actually works quite well. One needs to have the possibility for
+ * at least one ".nfs..." file in each directory the file ever gets
+ * moved or linked to which happens automagically with the new
+ * implementation that only depends on the dcache stuff instead of
+ * using the inode layer
+ *
+ * Unfortunately, things are a little more complicated than indicated
+ * above. For a cross-directory move, we want to make sure we can get
+ * rid of the old inode after the operation. This means there must be
+ * no pending writes (if it's a file), and the use count must be 1.
+ * If these conditions are met, we can drop the dentries before doing
+ * the rename.
+ *
+ * Only plain renames are supported: any non-zero @flags yields -EINVAL.
+ * Otherwise returns 0 on success or a negative errno; the initial
+ * -EBUSY is what is returned when preparing a busy target fails.
+ */
+int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ struct inode *old_inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
+ /* dentry: replacement target allocated below; rehash: dentry to re-hash at exit */
+ struct dentry *dentry = NULL, *rehash = NULL;
+ struct rpc_task *task;
+ int error = -EBUSY;
+
+ if (flags)
+ return -EINVAL;
+
+ dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n",
+ old_dentry, new_dentry,
+ d_count(new_dentry));
+
+ trace_nfs_rename_enter(old_dir, old_dentry, new_dir, new_dentry);
+ /*
+ * For non-directories, check whether the target is busy and if so,
+ * make a copy of the dentry and then do a silly-rename. If the
+ * silly-rename succeeds, the copied dentry is hashed and becomes
+ * the new target.
+ */
+ if (new_inode && !S_ISDIR(new_inode->i_mode)) {
+ /*
+ * To prevent any new references to the target during the
+ * rename, we unhash the dentry in advance.
+ */
+ if (!d_unhashed(new_dentry)) {
+ d_drop(new_dentry);
+ rehash = new_dentry;
+ }
+
+ if (d_count(new_dentry) > 2) {
+ int err;
+
+ /* copy the target dentry's name */
+ dentry = d_alloc(new_dentry->d_parent,
+ &new_dentry->d_name);
+ if (!dentry)
+ goto out;
+
+ /* silly-rename the existing target ... */
+ err = nfs_sillyrename(new_dir, new_dentry);
+ if (err)
+ goto out;
+
+ /* ... and rename onto the fresh, inode-less copy instead */
+ new_dentry = dentry;
+ rehash = NULL;
+ new_inode = NULL;
+ }
+ }
+
+ if (S_ISREG(old_inode->i_mode))
+ nfs_sync_inode(old_inode);
+ task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
+ if (IS_ERR(task)) {
+ error = PTR_ERR(task);
+ goto out;
+ }
+
+ error = rpc_wait_for_completion_task(task);
+ if (error != 0) {
+ /* The wait itself failed: flag the call data as cancelled */
+ ((struct nfs_renamedata *)task->tk_calldata)->cancelled = 1;
+ /* Paired with the atomic_dec_and_test() barrier in rpc_do_put_task() */
+ smp_wmb();
+ } else
+ error = task->tk_status;
+ rpc_put_task(task);
+ /* Ensure the inode attributes are revalidated */
+ if (error == 0) {
+ spin_lock(&old_inode->i_lock);
+ NFS_I(old_inode)->attr_gencount = nfs_inc_attr_generation_counter();
+ NFS_I(old_inode)->cache_validity |= NFS_INO_INVALID_CHANGE
+ | NFS_INO_INVALID_CTIME
+ | NFS_INO_REVAL_FORCED;
+ spin_unlock(&old_inode->i_lock);
+ }
+out:
+ if (rehash)
+ d_rehash(rehash);
+ trace_nfs_rename_exit(old_dir, old_dentry,
+ new_dir, new_dentry, error);
+ if (!error) {
+ /* The rename replaced an existing target: account for its lost link */
+ if (new_inode != NULL)
+ nfs_drop_nlink(new_inode);
+ /*
+ * The d_move() should be here instead of in an async RPC completion
+ * handler because we need the proper locks to move the dentry. If
+ * we're interrupted by a signal, the async RPC completion handler
+ * should mark the directories for revalidation.
+ */
+ d_move(old_dentry, new_dentry);
+ nfs_set_verifier(old_dentry,
+ nfs_save_change_attribute(new_dir));
+ } else if (error == -ENOENT)
+ nfs_dentry_handle_enoent(old_dentry);
+
+ /* new dentry created? */
+ if (dentry)
+ dput(dentry);
+ return error;
+}
+EXPORT_SYMBOL_GPL(nfs_rename);
+
+/* Lock held in this file around walks and updates of nfs_access_lru_list */
+static DEFINE_SPINLOCK(nfs_access_lru_lock);
+/* Global list of nfs_inodes, linked through access_cache_inode_lru */
+static LIST_HEAD(nfs_access_lru_list);
+/* Number of nfs_access_entry objects currently accounted to the cache */
+static atomic_long_t nfs_access_nr_entries;
+
+/*
+ * Entry count above which nfs_access_cache_enforce_limit() starts
+ * trimming the cache. Exposed as a module parameter with mode 0644.
+ */
+static unsigned long nfs_access_max_cachesize = 4*1024*1024;
+module_param(nfs_access_max_cachesize, ulong, 0644);
+MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length");
+
+/*
+ * Release one access cache entry: drop its credential reference, free
+ * the entry via kfree_rcu(), and decrement the global entry counter.
+ * The caller must already have unlinked the entry from the rbtree and
+ * LRU lists; nothing here does that.
+ */
+static void nfs_access_free_entry(struct nfs_access_entry *entry)
+{
+ put_cred(entry->cred);
+ /* Freed via kfree_rcu() rather than kfree(): nfs_access_get_cached_rcu() reads entries under rcu_read_lock() */
+ kfree_rcu(entry, rcu_head);
+ smp_mb__before_atomic();
+ atomic_long_dec(&nfs_access_nr_entries);
+ smp_mb__after_atomic();
+}
+
+/*
+ * Free every access cache entry linked (through ->lru) on @head,
+ * leaving the list empty.
+ */
+static void nfs_access_free_list(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct nfs_access_entry *victim =
+			list_first_entry(head, struct nfs_access_entry, lru);
+
+		list_del(&victim->lru);
+		nfs_access_free_entry(victim);
+	}
+}
+
+/*
+ * Walk up to @nr_to_scan inodes on the global access LRU and evict the
+ * first entry on each inode's own entry LRU.
+ *
+ * Evicted entries are collected on a private list under the locks and
+ * freed only after nfs_access_lru_lock has been dropped. An inode that
+ * still has entries is rotated to the tail of the global list; one with
+ * none left is taken off it and its NFS_INO_ACL_LRU_SET bit is cleared.
+ *
+ * Returns the number of entries evicted.
+ */
+static unsigned long
+nfs_do_access_cache_scan(unsigned int nr_to_scan)
+{
+ LIST_HEAD(head);
+ struct nfs_inode *nfsi, *next;
+ struct nfs_access_entry *cache;
+ long freed = 0;
+
+ spin_lock(&nfs_access_lru_lock);
+ list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
+ struct inode *inode;
+
+ if (nr_to_scan-- == 0)
+ break;
+ inode = &nfsi->vfs_inode;
+ /* i_lock nests inside nfs_access_lru_lock here */
+ spin_lock(&inode->i_lock);
+ if (list_empty(&nfsi->access_cache_entry_lru))
+ goto remove_lru_entry;
+ /* Evict the entry at the head of this inode's entry LRU */
+ cache = list_entry(nfsi->access_cache_entry_lru.next,
+ struct nfs_access_entry, lru);
+ list_move(&cache->lru, &head);
+ rb_erase(&cache->rb_node, &nfsi->access_cache);
+ freed++;
+ if (!list_empty(&nfsi->access_cache_entry_lru))
+ list_move_tail(&nfsi->access_cache_inode_lru,
+ &nfs_access_lru_list);
+ else {
+ /* Also reached by goto when the inode had no entries to begin with */
+remove_lru_entry:
+ list_del_init(&nfsi->access_cache_inode_lru);
+ smp_mb__before_atomic();
+ clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
+ smp_mb__after_atomic();
+ }
+ spin_unlock(&inode->i_lock);
+ }
+ spin_unlock(&nfs_access_lru_lock);
+ nfs_access_free_list(&head);
+ return freed;
+}
+
+/*
+ * Shrinker scan entry point (takes a shrinker and shrink_control).
+ * Refuses with SHRINK_STOP unless the allocation context carries every
+ * GFP_KERNEL bit; otherwise evicts via nfs_do_access_cache_scan() and
+ * returns its result.
+ */
+unsigned long
+nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	int nr_to_scan = sc->nr_to_scan;
+
+	if ((sc->gfp_mask & GFP_KERNEL) == GFP_KERNEL)
+		return nfs_do_access_cache_scan(nr_to_scan);
+	return SHRINK_STOP;
+}
+
+
+/*
+ * Shrinker count entry point (takes a shrinker and shrink_control):
+ * returns the current number of access cache entries scaled by
+ * vfs_pressure_ratio().
+ */
+unsigned long
+nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	long nr_entries = atomic_long_read(&nfs_access_nr_entries);
+
+	return vfs_pressure_ratio(nr_entries);
+}
+
+/*
+ * If the access cache holds more than nfs_access_max_cachesize entries,
+ * scan for eviction: the scan count is the excess, capped at 100 per
+ * call.
+ */
+static void
+nfs_access_cache_enforce_limit(void)
+{
+	long nr_entries = atomic_long_read(&nfs_access_nr_entries);
+	unsigned long excess;
+	unsigned int nr_to_scan;
+
+	if (nr_entries < 0 || nr_entries <= nfs_access_max_cachesize)
+		return;
+
+	excess = nr_entries - nfs_access_max_cachesize;
+	if (excess < 100)
+		nr_to_scan = excess;
+	else
+		nr_to_scan = 100;
+	nfs_do_access_cache_scan(nr_to_scan);
+}
+
+/*
+ * Empty @nfsi's access rbtree, moving every entry onto @head so the
+ * caller can free them, then clear NFS_INO_INVALID_ACCESS in
+ * cache_validity. Nothing is freed and no lock is taken here.
+ */
+static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
+{
+	struct rb_root *tree = &nfsi->access_cache;
+	struct rb_node *node;
+
+	/* Unhook entries from the cache, always taking the leftmost one */
+	for (node = rb_first(tree); node != NULL; node = rb_first(tree)) {
+		struct nfs_access_entry *victim =
+			rb_entry(node, struct nfs_access_entry, rb_node);
+
+		rb_erase(node, tree);
+		list_move(&victim->lru, head);
+	}
+	nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
+}
+
+/*
+ * Discard every cached access entry of @inode.
+ *
+ * Nothing is done if the inode is not flagged NFS_INO_ACL_LRU_SET.
+ * Otherwise the inode is taken off the global LRU and its entries are
+ * unhooked under nfs_access_lru_lock and i_lock, then freed after both
+ * locks have been released.
+ */
+void nfs_access_zap_cache(struct inode *inode)
+{
+ LIST_HEAD(head);
+
+ /* Unlocked fast path; the bit is re-tested under the lock below */
+ if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0)
+ return;
+ /* Remove the inode from the global LRU list */
+ spin_lock(&nfs_access_lru_lock);
+ if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
+ list_del_init(&NFS_I(inode)->access_cache_inode_lru);
+
+ spin_lock(&inode->i_lock);
+ __nfs_access_zap_cache(NFS_I(inode), &head);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&nfs_access_lru_lock);
+ nfs_access_free_list(&head);
+}
+EXPORT_SYMBOL_GPL(nfs_access_zap_cache);
+
+/*
+ * Look up the access cache entry for @cred in @inode's rbtree, which is
+ * ordered by cred_fscmp(). Returns the entry, or NULL if there is none.
+ * No locking is done here.
+ */
+static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, const struct cred *cred)
+{
+	struct rb_node *node = NFS_I(inode)->access_cache.rb_node;
+
+	while (node != NULL) {
+		struct nfs_access_entry *candidate =
+			rb_entry(node, struct nfs_access_entry, rb_node);
+		int order = cred_fscmp(cred, candidate->cred);
+
+		if (order == 0)
+			return candidate;
+		node = (order < 0) ? node->rb_left : node->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Look up the cached access mask for @cred on @inode under i_lock.
+ *
+ * On a hit *@mask is filled in, the entry is moved to the tail of the
+ * inode's entry LRU, and 0 is returned. If the entry is found but
+ * nfs_check_cache_invalid() reports the access cache invalid, the inode
+ * is revalidated once (only when @may_block) and the lookup is retried;
+ * after that one retry the entry found is used without checking
+ * nfs_check_cache_invalid() again.
+ *
+ * Returns 0 on a hit, -ENOENT if there is no entry (or the whole cache
+ * had to be zapped), -ECHILD if revalidation was needed but
+ * !@may_block, or the error from __nfs_revalidate_inode().
+ */
+static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_access_entry *cache;
+ bool retry = true;
+ int err;
+
+ spin_lock(&inode->i_lock);
+ for(;;) {
+ if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+ goto out_zap;
+ cache = nfs_access_search_rbtree(inode, cred);
+ err = -ENOENT;
+ if (cache == NULL)
+ goto out;
+ /* Found an entry, is our attribute cache valid? */
+ if (!nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS))
+ break;
+ /* Already revalidated once: use the entry as it is */
+ if (!retry)
+ break;
+ err = -ECHILD;
+ if (!may_block)
+ goto out;
+ /* Revalidation is done with i_lock dropped, hence the fresh lookup afterwards */
+ spin_unlock(&inode->i_lock);
+ err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (err)
+ return err;
+ spin_lock(&inode->i_lock);
+ retry = false;
+ }
+ *mask = cache->mask;
+ list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru);
+ err = 0;
+out:
+ spin_unlock(&inode->i_lock);
+ return err;
+out_zap:
+ spin_unlock(&inode->i_lock);
+ nfs_access_zap_cache(inode);
+ return -ENOENT;
+}
+
+/*
+ * Lockless fast path for the access cache, run under rcu_read_lock().
+ *
+ * Returns 0 and fills in *@mask if the entry at the tail of the inode's
+ * entry LRU matches @cred and the access cache is valid; otherwise
+ * returns -ECHILD.
+ */
+static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cred, u32 *mask)
+{
+ /* Only check the most recently returned cache entry,
+ * but do it without locking.
+ */
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_access_entry *cache;
+ int err = -ECHILD;
+ struct list_head *lh;
+
+ rcu_read_lock();
+ if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+ goto out;
+ lh = rcu_dereference(list_tail_rcu(&nfsi->access_cache_entry_lru));
+ cache = list_entry(lh, struct nfs_access_entry, lru);
+ /* lh pointing back at the list head means the LRU is empty */
+ if (lh == &nfsi->access_cache_entry_lru ||
+ cred_fscmp(cred, cache->cred) != 0)
+ cache = NULL;
+ if (cache == NULL)
+ goto out;
+ if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS))
+ goto out;
+ *mask = cache->mask;
+ err = 0;
+out:
+ rcu_read_unlock();
+ return err;
+}
+
+/*
+ * Fetch the cached access mask for @cred on @inode into *@mask.
+ *
+ * The lockless RCU lookup is tried first; if it does not succeed, the
+ * locked lookup is used, which may revalidate the inode when
+ * @may_block is true.
+ *
+ * Returns 0 on a cache hit, otherwise the locked lookup's error.
+ */
+int nfs_access_get_cached(struct inode *inode, const struct cred *cred,
+			  u32 *mask, bool may_block)
+{
+	int status = nfs_access_get_cached_rcu(inode, cred, mask);
+
+	if (status == 0)
+		return 0;
+	return nfs_access_get_cached_locked(inode, cred, mask, may_block);
+}
+EXPORT_SYMBOL_GPL(nfs_access_get_cached);
+
+/*
+ * Insert @set into @inode's access rbtree (ordered by cred_fscmp()) and
+ * at the tail of the inode's entry LRU, all under i_lock.
+ *
+ * If an entry with an equal credential already exists it is replaced
+ * in the tree, unlinked from the LRU, and freed after the lock has
+ * been dropped.
+ */
+static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct rb_root *root_node = &nfsi->access_cache;
+ struct rb_node **p = &root_node->rb_node;
+ struct rb_node *parent = NULL;
+ struct nfs_access_entry *entry;
+ int cmp;
+
+ spin_lock(&inode->i_lock);
+ /* Descend to the insertion point, stopping early on an equal credential */
+ while (*p != NULL) {
+ parent = *p;
+ entry = rb_entry(parent, struct nfs_access_entry, rb_node);
+ cmp = cred_fscmp(set->cred, entry->cred);
+
+ if (cmp < 0)
+ p = &parent->rb_left;
+ else if (cmp > 0)
+ p = &parent->rb_right;
+ else
+ goto found;
+ }
+ rb_link_node(&set->rb_node, parent, p);
+ rb_insert_color(&set->rb_node, root_node);
+ list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
+ spin_unlock(&inode->i_lock);
+ return;
+found:
+ /* Here parent is the node of the matching entry being replaced */
+ rb_replace_node(parent, &set->rb_node, root_node);
+ list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
+ list_del(&entry->lru);
+ spin_unlock(&inode->i_lock);
+ nfs_access_free_entry(entry);
+}
+
+/*
+ * Cache the access result described by @set for @inode.
+ *
+ * A new entry is allocated and filled from @set's credential (taking a
+ * reference) and mask, inserted into the inode's cache, and accounted
+ * in the global entry counter. The inode is put on the global LRU if it
+ * is not already there, and the cache size limit is then enforced.
+ *
+ * Caching is best effort: an allocation failure returns silently.
+ */
+void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
+{
+ struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
+ if (cache == NULL)
+ return;
+ RB_CLEAR_NODE(&cache->rb_node);
+ cache->cred = get_cred(set->cred);
+ cache->mask = set->mask;
+
+ /* The above field assignments must be visible
+ * before this item appears on the lru. We cannot easily
+ * use rcu_assign_pointer, so just force the memory barrier.
+ */
+ smp_wmb();
+ nfs_access_add_rbtree(inode, cache);
+
+ /* Update accounting */
+ smp_mb__before_atomic();
+ atomic_long_inc(&nfs_access_nr_entries);
+ smp_mb__after_atomic();
+
+ /* Add inode to global LRU list */
+ /* Unlocked test first; test_and_set_bit() under the lock decides */
+ if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
+ spin_lock(&nfs_access_lru_lock);
+ if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
+ list_add_tail(&NFS_I(inode)->access_cache_inode_lru,
+ &nfs_access_lru_list);
+ spin_unlock(&nfs_access_lru_lock);
+ }
+ nfs_access_cache_enforce_limit();
+}
+EXPORT_SYMBOL_GPL(nfs_access_add_cache);
+
+#define NFS_MAY_READ (NFS_ACCESS_READ)
+#define NFS_MAY_WRITE (NFS_ACCESS_MODIFY | \
+ NFS_ACCESS_EXTEND | \
+ NFS_ACCESS_DELETE)
+#define NFS_FILE_MAY_WRITE (NFS_ACCESS_MODIFY | \
+ NFS_ACCESS_EXTEND)
+#define NFS_DIR_MAY_WRITE NFS_MAY_WRITE
+#define NFS_MAY_LOOKUP (NFS_ACCESS_LOOKUP)
+#define NFS_MAY_EXECUTE (NFS_ACCESS_EXECUTE)
+/*
+ * Translate the NFS_ACCESS_* bits in @access_result into a MAY_READ /
+ * MAY_WRITE / MAY_EXEC mask for an object of type @umode.
+ *
+ * Directories and regular files need *all* bits of their respective
+ * write set to earn MAY_WRITE; any other file type earns it if *any*
+ * bit of NFS_MAY_WRITE is set. MAY_EXEC comes from LOOKUP on
+ * directories and EXECUTE on regular files, and is never granted for
+ * other types.
+ */
+static int
+nfs_access_calc_mask(u32 access_result, umode_t umode)
+{
+	int mask = (access_result & NFS_MAY_READ) ? MAY_READ : 0;
+
+	if (S_ISDIR(umode)) {
+		if ((access_result & NFS_DIR_MAY_WRITE) == NFS_DIR_MAY_WRITE)
+			mask |= MAY_WRITE;
+		if ((access_result & NFS_MAY_LOOKUP) == NFS_MAY_LOOKUP)
+			mask |= MAY_EXEC;
+		return mask;
+	}
+
+	if (S_ISREG(umode)) {
+		if ((access_result & NFS_FILE_MAY_WRITE) == NFS_FILE_MAY_WRITE)
+			mask |= MAY_WRITE;
+		if ((access_result & NFS_MAY_EXECUTE) == NFS_MAY_EXECUTE)
+			mask |= MAY_EXEC;
+		return mask;
+	}
+
+	if (access_result & NFS_MAY_WRITE)
+		mask |= MAY_WRITE;
+	return mask;
+}
+
+/*
+ * Store the raw access bits @access_result in @entry->mask; no
+ * translation is applied here.
+ */
+void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result)
+{
+	entry->mask = access_result;
+}
+EXPORT_SYMBOL_GPL(nfs_access_set_mask);
+
+/*
+ * Check whether @cred may access @inode in the ways requested by @mask.
+ *
+ * The access cache is consulted first. On a miss, and only if the
+ * caller may block (MAY_NOT_BLOCK clear), the server is asked through
+ * the protocol's ->access operation and the answer is cached.
+ *
+ * Returns 0 if all requested MAY_READ/MAY_WRITE/MAY_EXEC bits are
+ * granted, -EACCES if not, -ECHILD if the server would have to be
+ * asked but blocking is not allowed, or the error from ->access.
+ */
+static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask)
+{
+ struct nfs_access_entry cache;
+ bool may_block = (mask & MAY_NOT_BLOCK) == 0;
+ /* Stays -1 (traced as such) on paths that never compute a mask */
+ int cache_mask = -1;
+ int status;
+
+ trace_nfs_access_enter(inode);
+
+ status = nfs_access_get_cached(inode, cred, &cache.mask, may_block);
+ if (status == 0)
+ goto out_cached;
+
+ status = -ECHILD;
+ if (!may_block)
+ goto out;
+
+ /*
+ * Determine which access bits we want to ask for...
+ */
+ cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND;
+ if (nfs_server_capable(inode, NFS_CAP_XATTR)) {
+ cache.mask |= NFS_ACCESS_XAREAD | NFS_ACCESS_XAWRITE |
+ NFS_ACCESS_XALIST;
+ }
+ if (S_ISDIR(inode->i_mode))
+ cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP;
+ else
+ cache.mask |= NFS_ACCESS_EXECUTE;
+ cache.cred = cred;
+ status = NFS_PROTO(inode)->access(inode, &cache);
+ if (status != 0) {
+ /* Stale handle: mark a non-directory stale, zap a directory's caches */
+ if (status == -ESTALE) {
+ if (!S_ISDIR(inode->i_mode))
+ nfs_set_inode_stale(inode);
+ else
+ nfs_zap_caches(inode);
+ }
+ goto out;
+ }
+ nfs_access_add_cache(inode, &cache);
+out_cached:
+ cache_mask = nfs_access_calc_mask(cache.mask, inode->i_mode);
+ /* Any requested bit that was not granted means denial */
+ if ((mask & ~cache_mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0)
+ status = -EACCES;
+out:
+ trace_nfs_access_exit(inode, mask, cache_mask, status);
+ return status;
+}
+
+/*
+ * Derive the MAY_* permission mask implied by @openflags.
+ *
+ * __FMODE_EXEC yields MAY_EXEC alone. Otherwise MAY_READ is requested
+ * unless the access mode is O_WRONLY, and MAY_WRITE unless it is
+ * O_RDONLY.
+ */
+static int nfs_open_permission_mask(int openflags)
+{
+	int accmode;
+	int mask = 0;
+
+	/* ONLY check exec rights */
+	if (openflags & __FMODE_EXEC)
+		return MAY_EXEC;
+
+	accmode = openflags & O_ACCMODE;
+	if (accmode != O_WRONLY)
+		mask |= MAY_READ;
+	if (accmode != O_RDONLY)
+		mask |= MAY_WRITE;
+	return mask;
+}
+
+/*
+ * Check whether @cred may open @inode with @openflags: the flags are
+ * turned into a MAY_* mask and passed to nfs_do_access(), whose result
+ * is returned.
+ */
+int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags)
+{
+	int mask = nfs_open_permission_mask(openflags);
+
+	return nfs_do_access(inode, cred, mask);
+}
+EXPORT_SYMBOL_GPL(nfs_may_open);
+
+/*
+ * Extra check for MAY_EXEC on non-directories using execute_ok().
+ *
+ * If nfs_check_cache_invalid() reports NFS_INO_INVALID_OTHER, the
+ * inode is revalidated first; when that would block and @mask has
+ * MAY_NOT_BLOCK, -ECHILD is returned instead.
+ *
+ * Returns 0 for directories and for files that pass execute_ok(),
+ * -EACCES for files that do not, or the revalidation error.
+ */
+static int nfs_execute_ok(struct inode *inode, int mask)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+
+	if (S_ISDIR(inode->i_mode))
+		return 0;
+
+	if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_OTHER)) {
+		int err;
+
+		if (mask & MAY_NOT_BLOCK)
+			return -ECHILD;
+		err = __nfs_revalidate_inode(server, inode);
+		if (err != 0)
+			return err;
+	}
+
+	return execute_ok(inode) ? 0 : -EACCES;
+}
+
+/*
+ * Permission check for @inode against the current task's credentials.
+ *
+ * Several cases are answered without an access check: a mask with no
+ * MAY_READ/MAY_WRITE/MAY_EXEC bits, symlinks, opens of regular files on
+ * servers with NFS_CAP_ATOMIC_OPEN, and write-only checks on
+ * directories. MAY_ACCESS and MAY_CHDIR requests always go through the
+ * access check. If the protocol has no ->access operation, the inode is
+ * revalidated and generic_permission() is used instead.
+ *
+ * Whenever the result so far is 0 and MAY_EXEC was requested,
+ * nfs_execute_ok() gets the final say (except on the atomic-open
+ * early return).
+ *
+ * Returns 0 if access is allowed, otherwise a negative errno (-ECHILD
+ * when the check would have to block under MAY_NOT_BLOCK).
+ */
+int nfs_permission(struct inode *inode, int mask)
+{
+ const struct cred *cred = current_cred();
+ int res = 0;
+
+ nfs_inc_stats(inode, NFSIOS_VFSACCESS);
+
+ if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
+ goto out;
+ /* Is this sys_access() ? */
+ if (mask & (MAY_ACCESS | MAY_CHDIR))
+ goto force_lookup;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFLNK:
+ goto out;
+ case S_IFREG:
+ /* Skip the check here for atomic opens of regular files */
+ if ((mask & MAY_OPEN) &&
+ nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN))
+ return 0;
+ break;
+ case S_IFDIR:
+ /*
+ * Optimize away all write operations, since the server
+ * will check permissions when we perform the op.
+ */
+ if ((mask & MAY_WRITE) && !(mask & MAY_READ))
+ goto out;
+ }
+
+force_lookup:
+ if (!NFS_PROTO(inode)->access)
+ goto out_notsup;
+
+ res = nfs_do_access(inode, cred, mask);
+out:
+ if (!res && (mask & MAY_EXEC))
+ res = nfs_execute_ok(inode, mask);
+
+ dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n",
+ inode->i_sb->s_id, inode->i_ino, mask, res);
+ return res;
+out_notsup:
+ /* No ->access operation: fall back to a local check on revalidated attributes */
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+
+ res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (res == 0)
+ res = generic_permission(inode, mask);
+ goto out;
+}
+EXPORT_SYMBOL_GPL(nfs_permission);
+
+/*
+ * Local variables:
+ * version-control: t
+ * kept-new-versions: 5
+ * End:
+ */
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
new file mode 100644
index 000000000..5d86ffa72
--- /dev/null
+++ b/fs/nfs/direct.c
@@ -0,0 +1,1034 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/direct.c
+ *
+ * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
+ *
+ * High-performance uncached I/O for the Linux NFS client
+ *
+ * There are important applications whose performance or correctness
+ * depends on uncached access to file data. Database clusters
+ * (multiple copies of the same instance running on separate hosts)
+ * implement their own cache coherency protocol that subsumes file
+ * system cache protocols. Applications that process datasets
+ * considerably larger than the client's memory do not always benefit
+ * from a local cache. A streaming video server, for instance, has no
+ * need to cache the contents of a file.
+ *
+ * When an application requests uncached I/O, all read and write requests
+ * are made directly to the server; data stored or fetched via these
+ * requests is not cached in the Linux page cache. The client does not
+ * correct unaligned requests from applications. All requested bytes are
+ * held on permanent storage before a direct write system call returns to
+ * an application.
+ *
+ * Solaris implements an uncached I/O facility called directio() that
+ * is used for backups and sequential I/O to very large files. Solaris
+ * also supports uncaching whole NFS partitions with "-o forcedirectio,"
+ * an undocumented mount option.
+ *
+ * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
+ * help from Andrew Morton.
+ *
+ * 18 Dec 2001 Initial implementation for 2.4 --cel
+ * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy
+ * 08 Jun 2003 Port to 2.5 APIs --cel
+ * 31 Mar 2004 Handle direct I/O without VFS support --cel
+ * 15 Sep 2004 Parallel async reads --cel
+ * 04 May 2005 support O_DIRECT with aio --cel
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/kref.h>
+#include <linux/slab.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/module.h>
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/sunrpc/clnt.h>
+
+#include <linux/uaccess.h>
+#include <linux/atomic.h>
+
+#include "internal.h"
+#include "iostat.h"
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+/* Slab cache from which struct nfs_direct_req objects are allocated */
+static struct kmem_cache *nfs_direct_cachep;
+
+/*
+ * State of one direct I/O request. Lifetime is managed through @kref;
+ * outstanding I/O is counted in @io_count (see get_dreq()/put_dreq()).
+ */
+struct nfs_direct_req {
+ struct kref kref; /* release manager */
+
+ /* I/O parameters */
+ struct nfs_open_context *ctx; /* file open context info */
+ struct nfs_lock_context *l_ctx; /* Lock context info */
+ struct kiocb * iocb; /* controlling i/o request */
+ struct inode * inode; /* target file of i/o */
+
+ /* completion state */
+ atomic_t io_count; /* i/os we're waiting for */
+ spinlock_t lock; /* protect completion state */
+
+ loff_t io_start; /* Start offset for I/O */
+ ssize_t count, /* bytes actually processed */
+ max_count, /* max expected count */
+ bytes_left, /* bytes left to be sent */
+ error; /* any reported error */
+ struct completion completion; /* wait for i/o completion */
+
+ /* commit state */
+ struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */
+ struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */
+ struct work_struct work;
+ int flags;
+ /* for write */
+#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */
+#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */
+ /* for read */
+#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */
+#define NFS_ODIRECT_DONE INT_MAX /* NOTE(review): presumably "request finished" - confirm against users */
+};
+
+/* Forward declarations for write/commit-side symbols referenced before their definitions */
+static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
+static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq);
+static void nfs_direct_write_schedule_work(struct work_struct *work);
+
+/* Account for one more outstanding I/O on @dreq. */
+static inline void get_dreq(struct nfs_direct_req *dreq)
+{
+	atomic_inc(&dreq->io_count);
+}
+
+/*
+ * Drop one outstanding I/O from @dreq. Returns non-zero when this was
+ * the last one, i.e. io_count reached zero.
+ */
+static inline int put_dreq(struct nfs_direct_req *dreq)
+{
+	int was_last = atomic_dec_and_test(&dreq->io_count);
+
+	return was_last;
+}
+
+/*
+ * Shrink @dreq's accounting when @hdr ended early (error or EOF).
+ *
+ * Nothing happens unless @hdr carries NFS_IOHDR_ERROR or NFS_IOHDR_EOF,
+ * and unless @dreq_len does not exceed the current max_count. In that
+ * case max_count is lowered to @dreq_len, count is clamped to it, and
+ * the request error becomes hdr->error for an error or 0 for EOF.
+ */
+static void
+nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
+			    const struct nfs_pgio_header *hdr,
+			    ssize_t dreq_len)
+{
+	if (!test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
+	    !test_bit(NFS_IOHDR_EOF, &hdr->flags))
+		return;
+	if (dreq->max_count < dreq_len)
+		return;
+
+	dreq->max_count = dreq_len;
+	if (dreq->count > dreq_len)
+		dreq->count = dreq_len;
+
+	/* EOF without an error clears any outstanding error */
+	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
+		dreq->error = hdr->error;
+	else
+		dreq->error = 0;
+}
+
+/*
+ * Fold the good bytes reported by @hdr into @dreq->count.
+ *
+ * The length is measured from the start of the request to the end of
+ * @hdr's good bytes (0 if that end does not lie past the start),
+ * truncation is handled, and the length is capped at max_count. count
+ * is only ever raised by this function, never lowered.
+ */
+static void
+nfs_direct_count_bytes(struct nfs_direct_req *dreq,
+		       const struct nfs_pgio_header *hdr)
+{
+	loff_t hdr_end = hdr->io_start + hdr->good_bytes;
+	ssize_t dreq_len;
+
+	if (hdr_end > dreq->io_start)
+		dreq_len = hdr_end - dreq->io_start;
+	else
+		dreq_len = 0;
+
+	nfs_direct_handle_truncated(dreq, hdr, dreq_len);
+
+	if (dreq_len > dreq->max_count)
+		dreq_len = dreq->max_count;
+
+	if (dreq_len > dreq->count)
+		dreq->count = dreq_len;
+}
+
+/**
+ * nfs_direct_IO - NFS address space operation for direct I/O
+ * @iocb: target I/O control block
+ * @iter: I/O buffer
+ *
+ * Having this routine in the address space ops vector signals that the
+ * NFS client supports direct I/O. Ordinary direct reads and writes are
+ * diverted before the VFS gets to them, so in practice this method is
+ * only reached for swap. For a non-swapfile inode it returns 0 without
+ * doing any I/O.
+ */
+ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+
+	/* we only support swap file calling nfs_direct_IO */
+	if (!IS_SWAPFILE(inode))
+		return 0;
+
+	VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
+
+	if (iov_iter_rw(iter) != READ)
+		return nfs_file_direct_write(iocb, iter, true);
+	return nfs_file_direct_read(iocb, iter, true);
+}
+
+/* Drop one reference on each of the first @npages pages in @pages. */
+static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
+{
+	unsigned int idx = 0;
+
+	while (idx < npages) {
+		put_page(pages[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Fill @cinfo from @dreq: it points at the request's inode and its
+ * embedded MDS/DS commit info, records the request itself, and uses
+ * the direct I/O commit completion ops.
+ */
+void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
+			      struct nfs_direct_req *dreq)
+{
+	cinfo->dreq = dreq;
+	cinfo->inode = dreq->inode;
+	cinfo->mds = &dreq->mds_cinfo;
+	cinfo->ds = &dreq->ds_cinfo;
+	cinfo->completion_ops = &nfs_direct_commit_completion_ops;
+}
+
+/*
+ * Allocate a zeroed direct I/O request from nfs_direct_cachep and
+ * initialise its kref, completion, commit lists, work item and lock.
+ *
+ * The kref is initialised and then taken once more, so the request
+ * starts out with two references.
+ *
+ * Returns the new request, or NULL if the allocation failed.
+ */
+static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
+{
+	struct nfs_direct_req *dreq =
+		kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
+
+	if (dreq != NULL) {
+		kref_init(&dreq->kref);
+		kref_get(&dreq->kref);
+		init_completion(&dreq->completion);
+		INIT_LIST_HEAD(&dreq->mds_cinfo.list);
+		pnfs_init_ds_commit_info(&dreq->ds_cinfo);
+		INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
+		spin_lock_init(&dreq->lock);
+	}
+	return dreq;
+}
+
+/*
+ * kref release callback: release the DS commit info, drop the lock and
+ * open contexts if they are set, and return the request to its slab
+ * cache.
+ */
+static void nfs_direct_req_free(struct kref *kref)
+{
+	struct nfs_direct_req *dreq;
+
+	dreq = container_of(kref, struct nfs_direct_req, kref);
+
+	pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode);
+	if (dreq->l_ctx)
+		nfs_put_lock_context(dreq->l_ctx);
+	if (dreq->ctx)
+		put_nfs_open_context(dreq->ctx);
+	kmem_cache_free(nfs_direct_cachep, dreq);
+}
+
+/*
+ * Drop one reference on @dreq; nfs_direct_req_free() runs when the
+ * last reference goes away.
+ */
+static void nfs_direct_req_release(struct nfs_direct_req *dreq)
+{
+	kref_put(&dreq->kref, nfs_direct_req_free);
+}
+
+/* Return @dreq's bytes_left field (bytes still to be sent). */
+ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
+{
+	ssize_t remaining = dreq->bytes_left;
+
+	return remaining;
+}
+EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);
+
+/*
+ * Collect the final result of a direct I/O request.
+ *
+ * Requests with an iocb are asynchronous and are not waited for:
+ * -EIOCBQUEUED is returned at once. Otherwise this waits (killably) for
+ * completion and returns the byte count if it is non-zero, else the
+ * recorded error; if the wait itself is interrupted, its error is
+ * returned.
+ */
+static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
+{
+	ssize_t result;
+
+	/* Async requests don't wait here */
+	if (dreq->iocb)
+		return (ssize_t) -EIOCBQUEUED;
+
+	result = wait_for_completion_killable(&dreq->completion);
+	if (result == 0) {
+		result = dreq->count;
+		WARN_ON_ONCE(dreq->count < 0);
+	}
+	if (result == 0)
+		result = dreq->error;
+
+	return (ssize_t) result;
+}
+
+/*
+ * Finish a direct I/O request: end the inode's DIO section, report the
+ * result through the iocb if there is one (byte count if non-zero,
+ * else the error), wake any waiter, and drop a request reference.
+ *
+ * Synchronous I/O uses a stack-allocated iocb, so for a synchronous
+ * request the iocb cannot be trusted to still be valid here.
+ */
+static void nfs_direct_complete(struct nfs_direct_req *dreq)
+{
+	struct inode *inode = dreq->inode;
+
+	inode_dio_end(inode);
+
+	if (dreq->iocb) {
+		long res;
+
+		if (dreq->count != 0) {
+			res = (long) dreq->count;
+			WARN_ON_ONCE(dreq->count < 0);
+		} else {
+			res = (long) dreq->error;
+		}
+		dreq->iocb->ki_complete(dreq->iocb, res, 0);
+	}
+
+	complete(&dreq->completion);
+
+	nfs_direct_req_release(dreq);
+}
+
+/*
+ * Completion callback for a direct read header.
+ *
+ * Unless the header is flagged NFS_IOHDR_REDO, its good bytes are
+ * folded into the request's count under dreq->lock and every page
+ * request on the header is released. In all cases the I/O reference
+ * on the request is dropped (completing the request if it was the
+ * last) and the header is released.
+ */
+static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
+{
+ /* Running total of bytes covered by the page requests seen so far */
+ unsigned long bytes = 0;
+ struct nfs_direct_req *dreq = hdr->dreq;
+
+ spin_lock(&dreq->lock);
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ spin_unlock(&dreq->lock);
+ goto out_put;
+ }
+
+ nfs_direct_count_bytes(dreq, hdr);
+ spin_unlock(&dreq->lock);
+
+ while (!list_empty(&hdr->pages)) {
+ struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+ struct page *page = req->wb_page;
+
+ /* Dirty only non-compound pages that start within the good bytes, and only if the request asked for it */
+ if (!PageCompound(page) && bytes < hdr->good_bytes &&
+ (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY))
+ set_page_dirty(page);
+ bytes += req->wb_bytes;
+ nfs_list_remove_request(req);
+ nfs_release_request(req);
+ }
+out_put:
+ if (put_dreq(dreq))
+ nfs_direct_complete(dreq);
+ hdr->release(hdr);
+}
+
+/*
+ * ->error_cleanup callback for direct reads: empty @head, releasing each
+ * request found on it.  @error is not used.
+ */
+static void nfs_read_sync_pgio_error(struct list_head *head, int error)
+{
+	while (!list_empty(head)) {
+		struct nfs_page *victim = nfs_list_entry(head->next);
+
+		nfs_list_remove_request(victim);
+		nfs_release_request(victim);
+	}
+}
+
+/* ->init_hdr callback: take a get_dreq() count on the request behind @hdr. */
+static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
+{
+	struct nfs_direct_req *dreq = hdr->dreq;
+
+	get_dreq(dreq);
+}
+
+/* Page I/O callbacks installed on the descriptor used for direct reads. */
+static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
+	.init_hdr	= nfs_direct_pgio_init,
+	.completion	= nfs_direct_read_completion,
+	.error_cleanup	= nfs_read_sync_pgio_error,
+};
+
+/*
+ * Split the caller's buffer into chunks of at most rsize bytes (rsize is
+ * clamped to be at least PAGE_SIZE), pin the pages backing each chunk and
+ * queue one NFS READ request per page on the pageio descriptor.
+ * Scheduling stops at the first failure of iov_iter_get_pages_alloc(),
+ * nfs_create_request() or nfs_pageio_add_request().
+ *
+ * Returns the number of bytes that were queued.  If no bytes were
+ * queued, the request is torn down here and the error (or -EIO when no
+ * error was recorded) is returned instead.
+ */
+
+static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
+ struct iov_iter *iter,
+ loff_t pos)
+{
+ struct nfs_pageio_descriptor desc;
+ struct inode *inode = dreq->inode;
+ ssize_t result = -EINVAL;
+ size_t requested_bytes = 0;
+ size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);
+
+ nfs_pageio_init_read(&desc, dreq->inode, false,
+ &nfs_direct_read_completion_ops);
+ /* Extra count held while scheduling; dropped with put_dreq() below. */
+ get_dreq(dreq);
+ desc.pg_dreq = dreq;
+ inode_dio_begin(inode);
+
+ while (iov_iter_count(iter)) {
+ struct page **pagevec;
+ size_t bytes;
+ size_t pgbase;
+ unsigned npages, i;
+
+ result = iov_iter_get_pages_alloc(iter, &pagevec,
+ rsize, &pgbase);
+ if (result < 0)
+ break;
+
+ bytes = result;
+ iov_iter_advance(iter, bytes);
+ /* Pages spanned by [pgbase, pgbase + result), rounded up. */
+ npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
+ for (i = 0; i < npages; i++) {
+ struct nfs_page *req;
+ unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
+ /* XXX do we need to do the eof zeroing found in async_filler? */
+ req = nfs_create_request(dreq->ctx, pagevec[i],
+ pgbase, req_len);
+ if (IS_ERR(req)) {
+ result = PTR_ERR(req);
+ break;
+ }
+ /* Record the file position this request covers. */
+ req->wb_index = pos >> PAGE_SHIFT;
+ req->wb_offset = pos & ~PAGE_MASK;
+ if (!nfs_pageio_add_request(&desc, req)) {
+ result = desc.pg_error;
+ nfs_release_request(req);
+ break;
+ }
+ /* Only the first page of a chunk starts at a non-zero offset. */
+ pgbase = 0;
+ bytes -= req_len;
+ requested_bytes += req_len;
+ pos += req_len;
+ dreq->bytes_left -= req_len;
+ }
+ nfs_direct_release_pages(pagevec, npages);
+ kvfree(pagevec);
+ if (result < 0)
+ break;
+ }
+
+ nfs_pageio_complete(&desc);
+
+ /*
+ * If no bytes were started, return the error, and let the
+ * generic layer handle the completion.
+ */
+ if (requested_bytes == 0) {
+ inode_dio_end(inode);
+ nfs_direct_req_release(dreq);
+ return result < 0 ? result : -EIO;
+ }
+
+ if (put_dreq(dreq))
+ nfs_direct_complete(dreq);
+ return requested_bytes;
+}
+
+/**
+ * nfs_file_direct_read - file direct read operation for NFS files
+ * @iocb: target I/O control block
+ * @iter: vector of user buffers into which to read data
+ * @swap: flag indicating this is swap IO, not O_DIRECT IO
+ *
+ * We use this function for direct reads instead of calling
+ * generic_file_aio_read() in order to avoid gfar's check to see if
+ * the request starts before the end of the file. For that check
+ * to work, we must generate a GETATTR before each direct read, and
+ * even then there is a window between the GETATTR and the subsequent
+ * READ where the file size could change. Our preference is simply
+ * to do all reads the application wants, and the server will take
+ * care of managing the end of file boundary.
+ *
+ * This function also eliminates unnecessarily updating the file's
+ * atime locally, as the NFS server sets the file's atime, and this
+ * client must read the updated atime from the server back into its
+ * cache.
+ *
+ * Return: 0 for an empty @iter, the value of nfs_direct_wait() once
+ * reads were scheduled (-EIOCBQUEUED for an asynchronous @iocb), or a
+ * negative errno if allocation, lock-context lookup or scheduling failed.
+ */
+ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
+ bool swap)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct nfs_direct_req *dreq;
+ struct nfs_lock_context *l_ctx;
+ ssize_t result, requested;
+ size_t count = iov_iter_count(iter);
+ nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
+
+ dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
+ file, count, (long long) iocb->ki_pos);
+
+ result = 0;
+ if (!count)
+ goto out;
+
+ task_io_account_read(count);
+
+ result = -ENOMEM;
+ dreq = nfs_direct_req_alloc();
+ if (dreq == NULL)
+ goto out;
+
+ dreq->inode = inode;
+ dreq->bytes_left = dreq->max_count = count;
+ dreq->io_start = iocb->ki_pos;
+ dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+ l_ctx = nfs_get_lock_context(dreq->ctx);
+ if (IS_ERR(l_ctx)) {
+ result = PTR_ERR(l_ctx);
+ /*
+ * NOTE(review): dreq is released here and again at out_release;
+ * presumably nfs_direct_req_alloc() returns it holding two
+ * references - confirm against the allocator.
+ */
+ nfs_direct_req_release(dreq);
+ goto out_release;
+ }
+ dreq->l_ctx = l_ctx;
+ /* Only asynchronous requests remember the iocb. */
+ if (!is_sync_kiocb(iocb))
+ dreq->iocb = iocb;
+
+ if (iter_is_iovec(iter))
+ dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
+
+ if (!swap)
+ nfs_start_io_direct(inode);
+
+ NFS_I(inode)->read_io += count;
+ requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
+
+ if (!swap)
+ nfs_end_io_direct(inode);
+
+ if (requested > 0) {
+ result = nfs_direct_wait(dreq);
+ if (result > 0) {
+ requested -= result;
+ iocb->ki_pos += result;
+ }
+ /* Hand back the part of @iter that was scheduled but not read. */
+ iov_iter_revert(iter, requested);
+ } else {
+ result = requested;
+ }
+
+out_release:
+ nfs_direct_req_release(dreq);
+out:
+ return result;
+}
+
+/*
+ * Queue the head of @req's page group on @list.
+ *
+ * Nothing is done if the head is already on a list or cannot be locked
+ * with nfs_lock_request().  On success the head is left locked, added to
+ * @list, and two extra references are taken on it.
+ */
+static void nfs_direct_add_page_head(struct list_head *list,
+ struct nfs_page *req)
+{
+ struct nfs_page *head = req->wb_head;
+
+ if (!list_empty(&head->wb_list) || !nfs_lock_request(head))
+ return;
+ /* Re-check list membership now that the head is locked. */
+ if (!list_empty(&head->wb_list)) {
+ nfs_unlock_request(head);
+ return;
+ }
+ list_add(&head->wb_list, list);
+ kref_get(&head->wb_kref);
+ kref_get(&head->wb_kref);
+}
+
+/*
+ * Walk @list and collapse page groups onto their head requests.
+ *
+ * For an entry that is not its group's head, the head is queued directly
+ * after that entry (via nfs_direct_add_page_head()) so the walk reaches
+ * it next.  For a head with subrequests, subrequests still on a list are
+ * removed and released, then the group is joined with
+ * nfs_join_page_group().
+ */
+static void nfs_direct_join_group(struct list_head *list,
+ struct nfs_commit_info *cinfo,
+ struct inode *inode)
+{
+ struct nfs_page *req, *subreq;
+
+ list_for_each_entry(req, list, wb_list) {
+ if (req->wb_head != req) {
+ nfs_direct_add_page_head(&req->wb_list, req);
+ continue;
+ }
+ subreq = req->wb_this_page;
+ /* A head that points to itself has no subrequests. */
+ if (subreq == req)
+ continue;
+ do {
+ /*
+ * Remove subrequests from this list before freeing
+ * them in the call to nfs_join_page_group().
+ */
+ if (!list_empty(&subreq->wb_list)) {
+ nfs_list_remove_request(subreq);
+ nfs_release_request(subreq);
+ }
+ } while ((subreq = subreq->wb_this_page) != req);
+ nfs_join_page_group(req, cinfo, inode);
+ }
+}
+
+/*
+ * Move requests awaiting commit onto @list while holding the commit
+ * mutex of cinfo->inode.  Both the pNFS commit requests and the entries
+ * on the MDS commit list are collected.  @inode is not used.
+ */
+static void
+nfs_direct_write_scan_commit_list(struct inode *inode,
+				  struct list_head *list,
+				  struct nfs_commit_info *cinfo)
+{
+	struct mutex *commit_lock = &NFS_I(cinfo->inode)->commit_mutex;
+
+	mutex_lock(commit_lock);
+	pnfs_recover_commit_reqs(list, cinfo);
+	nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
+	mutex_unlock(commit_lock);
+}
+
+/*
+ * Resend every request currently waiting for commit as a FLUSH_STABLE
+ * write.
+ *
+ * The byte counters of @dreq are reset and max_count is recomputed from
+ * the collected requests.  Requests the pageio layer refuses are moved to
+ * a local "failed" list, an error is recorded on @dreq, and they are
+ * unlocked and released after the descriptor has been completed.
+ */
+static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
+{
+ struct nfs_pageio_descriptor desc;
+ struct nfs_page *req, *tmp;
+ LIST_HEAD(reqs);
+ struct nfs_commit_info cinfo;
+ LIST_HEAD(failed);
+
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
+ nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
+
+ nfs_direct_join_group(&reqs, &cinfo, dreq->inode);
+
+ /* Restart accounting: everything on @reqs is about to be rewritten. */
+ dreq->count = 0;
+ dreq->max_count = 0;
+ list_for_each_entry(req, &reqs, wb_list)
+ dreq->max_count += req->wb_bytes;
+ nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
+ get_dreq(dreq);
+
+ nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
+ &nfs_direct_write_completion_ops);
+ desc.pg_dreq = dreq;
+
+ list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
+ /* Bump the transmission count */
+ req->wb_nio++;
+ if (!nfs_pageio_add_request(&desc, req)) {
+ nfs_list_move_request(req, &failed);
+ spin_lock(&cinfo.inode->i_lock);
+ dreq->flags = 0;
+ /* Prefer the descriptor's error; fall back to -EIO. */
+ if (desc.pg_error < 0)
+ dreq->error = desc.pg_error;
+ else
+ dreq->error = -EIO;
+ spin_unlock(&cinfo.inode->i_lock);
+ }
+ nfs_release_request(req);
+ }
+ nfs_pageio_complete(&desc);
+
+ while (!list_empty(&failed)) {
+ req = nfs_list_entry(failed.next);
+ nfs_list_remove_request(req);
+ nfs_unlock_and_release_request(req);
+ }
+
+ if (put_dreq(dreq))
+ nfs_direct_write_complete(dreq);
+}
+
+/*
+ * ->completion callback for a COMMIT issued on behalf of a direct write.
+ *
+ * A failed COMMIT marks the whole request as done with that error and
+ * zeroes its byte counters.  On success, each page whose write verifier
+ * does not match the COMMIT verifier is put back on the commit list and
+ * the request is flagged NFS_ODIRECT_RESCHED_WRITES; all other pages are
+ * released.
+ */
+static void nfs_direct_commit_complete(struct nfs_commit_data *data)
+{
+ const struct nfs_writeverf *verf = data->res.verf;
+ struct nfs_direct_req *dreq = data->dreq;
+ struct nfs_commit_info cinfo;
+ struct nfs_page *req;
+ int status = data->task.tk_status;
+
+ if (status < 0) {
+ /* Errors in commit are fatal */
+ dreq->error = status;
+ dreq->max_count = 0;
+ dreq->count = 0;
+ dreq->flags = NFS_ODIRECT_DONE;
+ } else if (dreq->flags == NFS_ODIRECT_DONE)
+ /* Request already finished: reuse its recorded error. */
+ status = dreq->error;
+
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
+
+ while (!list_empty(&data->pages)) {
+ req = nfs_list_entry(data->pages.next);
+ nfs_list_remove_request(req);
+ if (status >= 0 && !nfs_write_match_verf(verf, req)) {
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ /*
+ * Despite the reboot, the write was successful,
+ * so reset wb_nio.
+ */
+ req->wb_nio = 0;
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ } else /* Error or match */
+ nfs_release_request(req);
+ /* Runs for every page, whichever branch was taken above. */
+ nfs_unlock_and_release_request(req);
+ }
+
+ if (nfs_commit_end(cinfo.mds))
+ nfs_direct_write_complete(dreq);
+}
+
+/*
+ * ->resched_write callback: flag the request behind @cinfo for a write
+ * reschedule (unless it is already NFS_ODIRECT_DONE) and put @req back on
+ * the commit list.
+ */
+static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
+		struct nfs_page *req)
+{
+	struct nfs_direct_req *dreq = cinfo->dreq;
+	spinlock_t *lock = &dreq->lock;
+
+	spin_lock(lock);
+	if (dreq->flags != NFS_ODIRECT_DONE)
+		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+	spin_unlock(lock);
+
+	nfs_mark_request_commit(req, NULL, cinfo, 0);
+}
+
+/* Commit callbacks used for direct writes. */
+static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
+	.resched_write	= nfs_direct_resched_write,
+	.completion	= nfs_direct_commit_complete,
+};
+
+/*
+ * Gather the requests awaiting commit for @dreq and send a COMMIT for
+ * them.  If the COMMIT cannot be issued, fall back to rewriting the data.
+ */
+static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
+{
+	struct nfs_commit_info cinfo;
+	LIST_HEAD(mds_list);
+
+	nfs_init_cinfo_from_dreq(&cinfo, dreq);
+	nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
+
+	/* A negative result here means -ENOMEM */
+	if (nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo) < 0)
+		nfs_direct_write_reschedule(dreq);
+}
+
+/*
+ * Throw away every request still waiting for commit on behalf of @dreq:
+ * each one is taken off the collected list, released, then unlocked and
+ * released again.
+ */
+static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
+{
+	struct nfs_commit_info cinfo;
+	LIST_HEAD(pending);
+
+	nfs_init_cinfo_from_dreq(&cinfo, dreq);
+	nfs_direct_write_scan_commit_list(dreq->inode, &pending, &cinfo);
+
+	while (!list_empty(&pending)) {
+		struct nfs_page *victim = nfs_list_entry(pending.next);
+
+		nfs_list_remove_request(victim);
+		nfs_release_request(victim);
+		nfs_unlock_and_release_request(victim);
+	}
+}
+
+/*
+ * Work item driving the tail end of a direct write.
+ *
+ * The flags of the request are sampled and cleared, then acted upon:
+ * issue a COMMIT, resend the writes, or - for any other value - drop the
+ * remaining requests, zap the inode's mapping and complete the request.
+ */
+static void nfs_direct_write_schedule_work(struct work_struct *work)
+{
+	struct nfs_direct_req *dreq;
+	int flags;
+
+	dreq = container_of(work, struct nfs_direct_req, work);
+	flags = dreq->flags;
+	dreq->flags = 0;
+
+	if (flags == NFS_ODIRECT_DO_COMMIT) {
+		nfs_direct_commit_schedule(dreq);
+	} else if (flags == NFS_ODIRECT_RESCHED_WRITES) {
+		nfs_direct_write_reschedule(dreq);
+	} else {
+		nfs_direct_write_clear_reqs(dreq);
+		nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
+		nfs_direct_complete(dreq);
+	}
+}
+
+/* Defer the next step of the direct write to nfsiod_workqueue. */
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq)
+{
+	struct work_struct *work = &dreq->work;
+
+	/* Calls nfs_direct_write_schedule_work */
+	queue_work(nfsiod_workqueue, work);
+}
+
+/*
+ * Completion callback for one page I/O header of a direct write.
+ *
+ * Unless the header is flagged NFS_IOHDR_REDO, the result is accounted
+ * under dreq->lock and each page on hdr->pages is either queued for
+ * COMMIT, queued for a rewrite, or simply unlocked and released,
+ * depending on the flags sampled under the lock.  One put_dreq() count is
+ * always dropped and the header is always released.
+ */
+static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
+{
+ struct nfs_direct_req *dreq = hdr->dreq;
+ struct nfs_commit_info cinfo;
+ struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+ int flags = NFS_ODIRECT_DONE;
+
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
+
+ spin_lock(&dreq->lock);
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ spin_unlock(&dreq->lock);
+ goto out_put;
+ }
+
+ nfs_direct_count_bytes(dreq, hdr);
+ if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) {
+ /* Request a COMMIT unless another state is already recorded. */
+ if (!dreq->flags)
+ dreq->flags = NFS_ODIRECT_DO_COMMIT;
+ flags = dreq->flags;
+ }
+ spin_unlock(&dreq->lock);
+
+ while (!list_empty(&hdr->pages)) {
+
+ req = nfs_list_entry(hdr->pages.next);
+ nfs_list_remove_request(req);
+ if (flags == NFS_ODIRECT_DO_COMMIT) {
+ /* Extra reference for the commit list; remember the verifier. */
+ kref_get(&req->wb_kref);
+ memcpy(&req->wb_verf, &hdr->verf.verifier,
+ sizeof(req->wb_verf));
+ nfs_mark_request_commit(req, hdr->lseg, &cinfo,
+ hdr->ds_commit_idx);
+ } else if (flags == NFS_ODIRECT_RESCHED_WRITES) {
+ kref_get(&req->wb_kref);
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ }
+ nfs_unlock_and_release_request(req);
+ }
+
+out_put:
+ if (put_dreq(dreq))
+ nfs_direct_write_complete(dreq);
+ hdr->release(hdr);
+}
+
+/*
+ * ->error_cleanup callback for direct writes: empty @head, unlocking and
+ * releasing each request found on it.  @error is not used.
+ */
+static void nfs_write_sync_pgio_error(struct list_head *head, int error)
+{
+	while (!list_empty(head)) {
+		struct nfs_page *victim = nfs_list_entry(head->next);
+
+		nfs_list_remove_request(victim);
+		nfs_unlock_and_release_request(victim);
+	}
+}
+
+/*
+ * ->reschedule_io callback for direct writes.
+ *
+ * If the request has no error recorded, flag it for a write reschedule
+ * and make @hdr look like a fully successful unstable write.
+ */
+static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
+{
+	struct nfs_direct_req *dreq = hdr->dreq;
+
+	spin_lock(&dreq->lock);
+	if (dreq->error != 0)
+		goto out_unlock;
+
+	dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+	/* fake unstable write to let common nfs resend pages */
+	hdr->verf.committed = NFS_UNSTABLE;
+	hdr->good_bytes = hdr->args.offset + hdr->args.count -
+		hdr->io_start;
+out_unlock:
+	spin_unlock(&dreq->lock);
+}
+
+/* Page I/O callbacks installed on the descriptor used for direct writes. */
+static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
+	.init_hdr	= nfs_direct_pgio_init,
+	.completion	= nfs_direct_write_completion,
+	.reschedule_io	= nfs_direct_write_reschedule_io,
+	.error_cleanup	= nfs_write_sync_pgio_error,
+};
+
+
+/*
+ * NB: Return the value of the first error return code. Subsequent
+ * errors after the first one are ignored.
+ */
+/*
+ * Split the caller's buffer into chunks of at most wsize bytes (wsize is
+ * clamped to be at least PAGE_SIZE), pin the pages backing each chunk and
+ * queue one locked NFS WRITE request per page on the pageio descriptor.
+ * Scheduling stops at the first failure of iov_iter_get_pages_alloc(),
+ * nfs_create_request() or nfs_pageio_add_request(), or as soon as the
+ * descriptor reports an error.
+ *
+ * Returns the number of bytes that were queued.  If no bytes were
+ * queued, the request is torn down here and the error (or -EIO when no
+ * error was recorded) is returned instead.
+ */
+static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
+ struct iov_iter *iter,
+ loff_t pos, int ioflags)
+{
+ struct nfs_pageio_descriptor desc;
+ struct inode *inode = dreq->inode;
+ ssize_t result = 0;
+ size_t requested_bytes = 0;
+ size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
+
+ nfs_pageio_init_write(&desc, inode, ioflags, false,
+ &nfs_direct_write_completion_ops);
+ desc.pg_dreq = dreq;
+ /* Extra count held while scheduling; dropped with put_dreq() below. */
+ get_dreq(dreq);
+ inode_dio_begin(inode);
+
+ NFS_I(inode)->write_io += iov_iter_count(iter);
+ while (iov_iter_count(iter)) {
+ struct page **pagevec;
+ size_t bytes;
+ size_t pgbase;
+ unsigned npages, i;
+
+ result = iov_iter_get_pages_alloc(iter, &pagevec,
+ wsize, &pgbase);
+ if (result < 0)
+ break;
+
+ bytes = result;
+ iov_iter_advance(iter, bytes);
+ /* Pages spanned by [pgbase, pgbase + result), rounded up. */
+ npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
+ for (i = 0; i < npages; i++) {
+ struct nfs_page *req;
+ unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
+
+ req = nfs_create_request(dreq->ctx, pagevec[i],
+ pgbase, req_len);
+ if (IS_ERR(req)) {
+ result = PTR_ERR(req);
+ break;
+ }
+
+ /* The descriptor already failed: do not queue more work. */
+ if (desc.pg_error < 0) {
+ nfs_free_request(req);
+ result = desc.pg_error;
+ break;
+ }
+
+ nfs_lock_request(req);
+ /* Record the file position this request covers. */
+ req->wb_index = pos >> PAGE_SHIFT;
+ req->wb_offset = pos & ~PAGE_MASK;
+ if (!nfs_pageio_add_request(&desc, req)) {
+ result = desc.pg_error;
+ nfs_unlock_and_release_request(req);
+ break;
+ }
+ /* Only the first page of a chunk starts at a non-zero offset. */
+ pgbase = 0;
+ bytes -= req_len;
+ requested_bytes += req_len;
+ pos += req_len;
+ dreq->bytes_left -= req_len;
+ }
+ nfs_direct_release_pages(pagevec, npages);
+ kvfree(pagevec);
+ if (result < 0)
+ break;
+ }
+ nfs_pageio_complete(&desc);
+
+ /*
+ * If no bytes were started, return the error, and let the
+ * generic layer handle the completion.
+ */
+ if (requested_bytes == 0) {
+ inode_dio_end(inode);
+ nfs_direct_req_release(dreq);
+ return result < 0 ? result : -EIO;
+ }
+
+ if (put_dreq(dreq))
+ nfs_direct_write_complete(dreq);
+ return requested_bytes;
+}
+
+/**
+ * nfs_file_direct_write - file direct write operation for NFS files
+ * @iocb: target I/O control block
+ * @iter: vector of user buffers from which to write data
+ * @swap: flag indicating this is swap IO, not O_DIRECT IO
+ *
+ * We use this function for direct writes instead of calling
+ * generic_file_aio_write() in order to avoid taking the inode
+ * semaphore and updating the i_size. The NFS server will set
+ * the new i_size and this client must read the updated size
+ * back into its cache. We let the server do generic write
+ * parameter checking and report problems.
+ *
+ * We eliminate local atime updates, see direct read above.
+ *
+ * We avoid unnecessary page cache invalidations for normal cached
+ * readers of this file.
+ *
+ * Note that O_APPEND is not supported for NFS direct writes, as there
+ * is no atomic O_APPEND write facility in the NFS protocol.
+ *
+ * Return: the non-positive result of the write checks if they reject
+ * the request, the value of nfs_direct_wait() once writes were scheduled
+ * (-EIOCBQUEUED for an asynchronous @iocb), or a negative errno if
+ * allocation, lock-context lookup or scheduling failed.
+ */
+ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
+ bool swap)
+{
+ ssize_t result, requested;
+ size_t count;
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct nfs_direct_req *dreq;
+ struct nfs_lock_context *l_ctx;
+ loff_t pos, end;
+
+ dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
+ file, iov_iter_count(iter), (long long) iocb->ki_pos);
+
+ if (swap)
+ /* bypass generic checks */
+ result = iov_iter_count(iter);
+ else
+ result = generic_write_checks(iocb, iter);
+ if (result <= 0)
+ return result;
+ count = result;
+ nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
+
+ pos = iocb->ki_pos;
+ /* Page index of the last byte to be written. */
+ end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;
+
+ task_io_account_write(count);
+
+ result = -ENOMEM;
+ dreq = nfs_direct_req_alloc();
+ if (!dreq)
+ goto out;
+
+ dreq->inode = inode;
+ dreq->bytes_left = dreq->max_count = count;
+ dreq->io_start = pos;
+ dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+ l_ctx = nfs_get_lock_context(dreq->ctx);
+ if (IS_ERR(l_ctx)) {
+ result = PTR_ERR(l_ctx);
+ /*
+ * NOTE(review): dreq is released here and again at out_release;
+ * presumably nfs_direct_req_alloc() returns it holding two
+ * references - confirm against the allocator.
+ */
+ nfs_direct_req_release(dreq);
+ goto out_release;
+ }
+ dreq->l_ctx = l_ctx;
+ /* Only asynchronous requests remember the iocb. */
+ if (!is_sync_kiocb(iocb))
+ dreq->iocb = iocb;
+ pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);
+
+ if (swap) {
+ requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
+ FLUSH_STABLE);
+ } else {
+ nfs_start_io_direct(inode);
+
+ requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
+ FLUSH_COND_STABLE);
+
+ /* Drop cached pages overlapping the written range, if any. */
+ if (mapping->nrpages) {
+ invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_SHIFT, end);
+ }
+
+ nfs_end_io_direct(inode);
+ }
+
+ if (requested > 0) {
+ result = nfs_direct_wait(dreq);
+ if (result > 0) {
+ requested -= result;
+ iocb->ki_pos = pos + result;
+ /* XXX: should check the generic_write_sync retval */
+ generic_write_sync(iocb, result);
+ }
+ /* Hand back the part of @iter that was scheduled but not written. */
+ iov_iter_revert(iter, requested);
+ } else {
+ result = requested;
+ }
+out_release:
+ nfs_direct_req_release(dreq);
+out:
+ return result;
+}
+
+/**
+ * nfs_init_directcache - create a slab cache for nfs_direct_req structures
+ *
+ * Returns 0 on success or -ENOMEM if the cache could not be created.
+ */
+int __init nfs_init_directcache(void)
+{
+	struct kmem_cache *cache;
+
+	cache = kmem_cache_create("nfs_direct_cache",
+				  sizeof(struct nfs_direct_req),
+				  0, (SLAB_RECLAIM_ACCOUNT|
+					SLAB_MEM_SPREAD),
+				  NULL);
+	nfs_direct_cachep = cache;
+
+	return cache == NULL ? -ENOMEM : 0;
+}
+
+/**
+ * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
+ *
+ * Hands nfs_direct_cachep to kmem_cache_destroy().
+ */
+void nfs_destroy_directcache(void)
+{
+ kmem_cache_destroy(nfs_direct_cachep);
+}
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
new file mode 100644
index 000000000..e87d500ad
--- /dev/null
+++ b/fs/nfs/dns_resolve.c
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/dns_resolve.c
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ * Resolves DNS hostnames into valid ip addresses
+ */
+
+#ifdef CONFIG_NFS_USE_KERNEL_DNS
+
+#include <linux/module.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/dns_resolver.h>
+#include "dns_resolve.h"
+
+/*
+ * Resolve @name through dns_query() and convert the textual answer into
+ * a socket address in @sa.
+ *
+ * Returns whatever rpc_pton() returns for the answer, or -ESRCH when the
+ * query produced no data.
+ */
+ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
+		struct sockaddr *sa, size_t salen)
+{
+	char *ip_addr = NULL;
+	ssize_t ret = -ESRCH;
+	int ip_len;
+
+	ip_len = dns_query(net, NULL, name, namelen, NULL, &ip_addr, NULL,
+			   false);
+	if (ip_len > 0)
+		ret = rpc_pton(net, ip_addr, ip_len, sa, salen);
+
+	kfree(ip_addr);
+	return ret;
+}
+
+#else
+
+#include <linux/module.h>
+#include <linux/hash.h>
+#include <linux/string.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/seq_file.h>
+#include <linux/inet.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/svcauth.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/nfs_fs.h>
+
+#include "nfs4_fs.h"
+#include "dns_resolve.h"
+#include "cache_lib.h"
+#include "netns.h"
+
+#define NFS_DNS_HASHBITS 4
+#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS)
+
+/* One entry of the hostname -> address cache. */
+struct nfs_dns_ent {
+ /* Embedded cache head; entries are recovered with container_of(). */
+ struct cache_head h;
+
+ /* Lookup key: hostname bytes and their length. */
+ char *hostname;
+ size_t namelen;
+
+ /* Cached value: socket address and the number of valid bytes in it. */
+ struct sockaddr_storage addr;
+ size_t addrlen;
+ /* Used to defer freeing of the entry via call_rcu(). */
+ struct rcu_head rcu_head;
+};
+
+
+/* ->update callback: copy the address (and its length) from @ckey to @cnew. */
+static void nfs_dns_ent_update(struct cache_head *cnew,
+		struct cache_head *ckey)
+{
+	struct nfs_dns_ent *dst = container_of(cnew, struct nfs_dns_ent, h);
+	struct nfs_dns_ent *src = container_of(ckey, struct nfs_dns_ent, h);
+
+	memcpy(&dst->addr, &src->addr, src->addrlen);
+	dst->addrlen = src->addrlen;
+}
+
+/*
+ * ->init callback: make @cnew a copy of @ckey.
+ *
+ * Any previous hostname of @cnew is freed and replaced by a
+ * NUL-terminated duplicate of the key's hostname.  If that duplication
+ * fails, the entry is left with zero name and address lengths.
+ */
+static void nfs_dns_ent_init(struct cache_head *cnew,
+		struct cache_head *ckey)
+{
+	struct nfs_dns_ent *dst = container_of(cnew, struct nfs_dns_ent, h);
+	struct nfs_dns_ent *src = container_of(ckey, struct nfs_dns_ent, h);
+
+	kfree(dst->hostname);
+	dst->hostname = kmemdup_nul(src->hostname, src->namelen, GFP_KERNEL);
+	if (dst->hostname == NULL) {
+		dst->namelen = 0;
+		dst->addrlen = 0;
+		return;
+	}
+
+	dst->namelen = src->namelen;
+	nfs_dns_ent_update(cnew, ckey);
+}
+
+/* RCU callback: free the entry that embeds @head, hostname first. */
+static void nfs_dns_ent_free_rcu(struct rcu_head *head)
+{
+	struct nfs_dns_ent *ent = container_of(head, struct nfs_dns_ent,
+					       rcu_head);
+
+	kfree(ent->hostname);
+	kfree(ent);
+}
+
+/* ->cache_put callback: schedule the entry owning @ref for freeing via RCU. */
+static void nfs_dns_ent_put(struct kref *ref)
+{
+	struct nfs_dns_ent *ent = container_of(ref, struct nfs_dns_ent, h.ref);
+
+	call_rcu(&ent->rcu_head, nfs_dns_ent_free_rcu);
+}
+
+/*
+ * ->alloc callback: allocate an empty cache entry.
+ *
+ * Only hostname, namelen and addrlen are initialised here; every other
+ * field is left exactly as kmalloc() returned it.
+ *
+ * Returns the embedded cache head, or NULL if the allocation failed.
+ */
+static struct cache_head *nfs_dns_ent_alloc(void)
+{
+	struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
+
+	if (item == NULL)
+		return NULL;
+
+	item->hostname = NULL;
+	item->namelen = 0;
+	item->addrlen = 0;
+	return &item->h;
+}
+
+/* Hash the hostname of @key into NFS_DNS_HASHBITS bits. */
+static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key)
+{
+	const char *name = key->hostname;
+
+	return hash_str(name, NFS_DNS_HASHBITS);
+}
+
+/*
+ * ->cache_request callback: write the upcall message for @ch into the
+ * buffer described by @bpp/@blen.  The message is the hostname followed
+ * by a newline.
+ */
+static void nfs_dns_request(struct cache_detail *cd,
+ struct cache_head *ch,
+ char **bpp, int *blen)
+{
+ struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+
+ qword_add(bpp, blen, key->hostname);
+ /* Overwrite the last byte written by qword_add() with a newline. */
+ (*bpp)[-1] = '\n';
+}
+
+/*
+ * ->cache_upcall callback: ask user space to resolve the entry's hostname.
+ *
+ * Returns 0 if an upcall is already pending for this entry or if
+ * nfs_cache_upcall() returned 0.  Otherwise the pending bit is cleared
+ * again and the result of sunrpc_cache_pipe_upcall_timeout() is returned.
+ */
+static int nfs_dns_upcall(struct cache_detail *cd,
+ struct cache_head *ch)
+{
+ struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+
+ /* Bit was already set: nothing more to do for this entry. */
+ if (test_and_set_bit(CACHE_PENDING, &ch->flags))
+ return 0;
+ if (!nfs_cache_upcall(cd, key->hostname))
+ return 0;
+ clear_bit(CACHE_PENDING, &ch->flags);
+ return sunrpc_cache_pipe_upcall_timeout(cd, ch);
+}
+
+/*
+ * ->match callback: two entries match when their hostnames have the
+ * same non-zero length and identical bytes.  Returns 1 on a match,
+ * 0 otherwise.
+ */
+static int nfs_dns_match(struct cache_head *ca,
+		struct cache_head *cb)
+{
+	struct nfs_dns_ent *a = container_of(ca, struct nfs_dns_ent, h);
+	struct nfs_dns_ent *b = container_of(cb, struct nfs_dns_ent, h);
+
+	if (a->namelen == 0)
+		return 0;
+	if (a->namelen != b->namelen)
+		return 0;
+	return memcmp(a->hostname, b->hostname, a->namelen) == 0;
+}
+
+/*
+ * ->cache_show callback: print one cache entry to @m, or the column
+ * header when @h is NULL.  Negative entries are shown as "<none>" and
+ * the TTL is clamped so that it never goes below zero.  Always returns 0.
+ */
+static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
+		struct cache_head *h)
+{
+	struct nfs_dns_ent *item;
+	long ttl;
+
+	if (h == NULL) {
+		seq_puts(m, "# ip address hostname ttl\n");
+		return 0;
+	}
+
+	item = container_of(h, struct nfs_dns_ent, h);
+	ttl = item->h.expiry_time - seconds_since_boot();
+	if (ttl < 0)
+		ttl = 0;
+
+	if (test_bit(CACHE_NEGATIVE, &h->flags)) {
+		seq_puts(m, "<none> ");
+	} else {
+		char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1];
+
+		rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf));
+		seq_printf(m, "%15s ", buf);
+	}
+	seq_printf(m, "%15s %ld\n", item->hostname, ttl);
+	return 0;
+}
+
+/*
+ * Look @key up in cache @cd via sunrpc_cache_lookup_rcu().
+ * Returns the matching entry, or NULL if the lookup returned nothing.
+ */
+static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
+		struct nfs_dns_ent *key)
+{
+	unsigned int hash = nfs_dns_hash(key);
+	struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &key->h, hash);
+
+	return ch ? container_of(ch, struct nfs_dns_ent, h) : NULL;
+}
+
+/*
+ * Pass @new and @key to sunrpc_cache_update() for cache @cd, hashing on
+ * @key.  Returns the resulting entry, or NULL if the update returned
+ * nothing.
+ */
+static struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd,
+		struct nfs_dns_ent *new,
+		struct nfs_dns_ent *key)
+{
+	unsigned int hash = nfs_dns_hash(key);
+	struct cache_head *ch = sunrpc_cache_update(cd, &new->h, &key->h, hash);
+
+	return ch ? container_of(ch, struct nfs_dns_ent, h) : NULL;
+}
+
+/*
+ * ->cache_parse callback: parse one downcall message and update the cache.
+ *
+ * The message is three whitespace-separated words terminated by a
+ * newline: an address, a hostname and a TTL in seconds.  An address that
+ * rpc_pton() cannot convert creates a negative entry.
+ *
+ * Returns 0 on success, -EINVAL for an empty or malformed message or a
+ * zero TTL, and -ENOMEM if the cache lookup or update fails.
+ */
+static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
+{
+	char buf1[NFS_DNS_HOSTNAME_MAXLEN+1];
+	struct nfs_dns_ent key, *item;
+	unsigned int ttl;
+	ssize_t len;
+	int ret = -EINVAL;
+
+	/* Reject empty input before looking at the last byte. */
+	if (buflen <= 0 || buf[buflen-1] != '\n')
+		goto out;
+	buf[buflen-1] = '\0';
+
+	/* First word: the address (may fail to parse -> addrlen == 0). */
+	len = qword_get(&buf, buf1, sizeof(buf1));
+	if (len <= 0)
+		goto out;
+	key.addrlen = rpc_pton(cd->net, buf1, len,
+			(struct sockaddr *)&key.addr,
+			sizeof(key.addr));
+
+	/* Second word: the hostname; buf1 is reused as its storage. */
+	len = qword_get(&buf, buf1, sizeof(buf1));
+	if (len <= 0)
+		goto out;
+
+	key.hostname = buf1;
+	key.namelen = len;
+	memset(&key.h, 0, sizeof(key.h));
+
+	/* Third word: the TTL, which must be non-zero. */
+	if (get_uint(&buf, &ttl) < 0)
+		goto out;
+	if (ttl == 0)
+		goto out;
+	key.h.expiry_time = ttl + seconds_since_boot();
+
+	ret = -ENOMEM;
+	item = nfs_dns_lookup(cd, &key);
+	if (item == NULL)
+		goto out;
+
+	if (key.addrlen == 0)
+		set_bit(CACHE_NEGATIVE, &key.h.flags);
+
+	item = nfs_dns_update(cd, &key, item);
+	if (item == NULL)
+		goto out;
+
+	ret = 0;
+	cache_put(&item->h, cd);
+out:
+	return ret;
+}
+
+/*
+ * Look @key up and validate the result with cache_check().
+ *
+ * On success *@item is the entry and 0 is returned.  Otherwise *@item is
+ * NULL and the return value is -ENOMEM (lookup failed) or the
+ * cache_check() error.
+ */
+static int do_cache_lookup(struct cache_detail *cd,
+		struct nfs_dns_ent *key,
+		struct nfs_dns_ent **item,
+		struct nfs_cache_defer_req *dreq)
+{
+	struct nfs_dns_ent *found = nfs_dns_lookup(cd, key);
+	int ret;
+
+	*item = found;
+	if (found == NULL)
+		return -ENOMEM;
+
+	ret = cache_check(cd, &found->h, &dreq->req);
+	if (ret)
+		*item = NULL;
+	return ret;
+}
+
+/*
+ * Look @key up without triggering or waiting for an upcall.
+ *
+ * Returns 0 with *@item set when a valid, unexpired, positive entry is
+ * found.  Otherwise *@item is NULL and the return value is -ENOMEM
+ * (lookup failed), -ETIMEDOUT (entry not valid, expired, or older than
+ * the cache's flush time) or -ENOENT (negative entry).
+ */
+static int do_cache_lookup_nowait(struct cache_detail *cd,
+ struct nfs_dns_ent *key,
+ struct nfs_dns_ent **item)
+{
+ int ret = -ENOMEM;
+
+ *item = nfs_dns_lookup(cd, key);
+ if (!*item)
+ goto out_err;
+ ret = -ETIMEDOUT;
+ if (!test_bit(CACHE_VALID, &(*item)->h.flags)
+ || (*item)->h.expiry_time < seconds_since_boot()
+ || cd->flush_time > (*item)->h.last_refresh)
+ goto out_put;
+ ret = -ENOENT;
+ if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
+ goto out_put;
+ return 0;
+out_put:
+ /* Drop the reference on the entry that is not being returned. */
+ cache_put(&(*item)->h, cd);
+out_err:
+ *item = NULL;
+ return ret;
+}
+
+/*
+ * Look @key up, waiting for an upcall to finish if the first attempt
+ * reports -EAGAIN.  After a successful wait the lookup is repeated with
+ * do_cache_lookup_nowait().
+ *
+ * Returns 0 with *@item set, or a negative errno.
+ */
+static int do_cache_lookup_wait(struct cache_detail *cd,
+		struct nfs_dns_ent *key,
+		struct nfs_dns_ent **item)
+{
+	struct nfs_cache_defer_req *dreq = nfs_cache_defer_req_alloc();
+	int ret;
+
+	if (dreq == NULL)
+		return -ENOMEM;
+
+	ret = do_cache_lookup(cd, key, item, dreq);
+	if (ret == -EAGAIN) {
+		ret = nfs_cache_wait_for_upcall(dreq);
+		if (ret == 0)
+			ret = do_cache_lookup_nowait(cd, key, item);
+	}
+
+	nfs_cache_defer_req_put(dreq);
+	return ret;
+}
+
+/*
+ * Resolve @name through the per-net DNS cache and copy the address into
+ * @sa.
+ *
+ * Returns the address length on success, -EOVERFLOW if @salen is too
+ * small for the cached address, -ESRCH if the lookup reported -ENOENT,
+ * or the lookup error otherwise.
+ */
+ssize_t nfs_dns_resolve_name(struct net *net, char *name,
+		size_t namelen, struct sockaddr *sa, size_t salen)
+{
+	struct nfs_dns_ent key = {
+		.hostname = name,
+		.namelen = namelen,
+	};
+	struct nfs_dns_ent *item = NULL;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	ssize_t ret;
+
+	ret = do_cache_lookup_wait(nn->nfs_dns_resolve, &key, &item);
+	if (ret == -ENOENT)
+		return -ESRCH;
+	if (ret != 0)
+		return ret;
+
+	if (salen < item->addrlen) {
+		ret = -EOVERFLOW;
+	} else {
+		memcpy(sa, &item->addr, item->addrlen);
+		ret = item->addrlen;
+	}
+	cache_put(&item->h, nn->nfs_dns_resolve);
+	return ret;
+}
+
+/* Template handed to cache_create_net() for each net namespace. */
+static struct cache_detail nfs_dns_resolve_template = {
+	.owner		= THIS_MODULE,
+	.name		= "dns_resolve",
+	.hash_size	= NFS_DNS_HASHTBL_SIZE,
+
+	/* entry life cycle */
+	.alloc		= nfs_dns_ent_alloc,
+	.init		= nfs_dns_ent_init,
+	.update		= nfs_dns_ent_update,
+	.match		= nfs_dns_match,
+	.cache_put	= nfs_dns_ent_put,
+
+	/* communication with user space */
+	.cache_upcall	= nfs_dns_upcall,
+	.cache_request	= nfs_dns_request,
+	.cache_parse	= nfs_dns_parse,
+	.cache_show	= nfs_dns_show,
+};
+
+
+/*
+ * Create and register the DNS cache of @net.
+ *
+ * Returns 0 on success or a negative errno.  If registration fails the
+ * cache that was just created is destroyed again.
+ */
+int nfs_dns_resolver_cache_init(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct cache_detail *cd;
+	int err;
+
+	cd = cache_create_net(&nfs_dns_resolve_template, net);
+	nn->nfs_dns_resolve = cd;
+	if (IS_ERR(cd))
+		return PTR_ERR(cd);
+
+	err = nfs_cache_register_net(net, cd);
+	if (err)
+		cache_destroy_net(cd, net);
+	return err;
+}
+
+/* Unregister and destroy the DNS cache of @net. */
+void nfs_dns_resolver_cache_destroy(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct cache_detail *cd = nn->nfs_dns_resolve;
+
+	nfs_cache_unregister_net(net, cd);
+	cache_destroy_net(cd, net);
+}
+
+/* pernet ->init hook: set up the DNS cache of @net. */
+static int nfs4_dns_net_init(struct net *net)
+{
+ return nfs_dns_resolver_cache_init(net);
+}
+
+/* pernet ->exit hook: tear down the DNS cache of @net. */
+static void nfs4_dns_net_exit(struct net *net)
+{
+ nfs_dns_resolver_cache_destroy(net);
+}
+
+/* Per-net hooks, registered by nfs_dns_resolver_init(). */
+static struct pernet_operations nfs4_dns_resolver_ops = {
+ .init = nfs4_dns_net_init,
+ .exit = nfs4_dns_net_exit,
+};
+
+/*
+ * rpc_pipefs notifier: (un)register the DNS cache with the pipefs
+ * superblock passed in @ptr when it is mounted or unmounted.
+ *
+ * Returns 0 when there is no cache for the net or the module reference
+ * cannot be taken, -ENOTSUPP for unknown events, and otherwise the
+ * result of the registration (0 for an unmount).
+ */
+static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
+		void *ptr)
+{
+	struct super_block *sb = ptr;
+	struct net *net = sb->s_fs_info;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct cache_detail *cd = nn->nfs_dns_resolve;
+	int ret = 0;
+
+	if (cd == NULL)
+		return 0;
+
+	if (!try_module_get(THIS_MODULE))
+		return 0;
+
+	if (event == RPC_PIPEFS_MOUNT)
+		ret = nfs_cache_register_sb(sb, cd);
+	else if (event == RPC_PIPEFS_UMOUNT)
+		nfs_cache_unregister_sb(sb, cd);
+	else
+		ret = -ENOTSUPP;
+
+	module_put(THIS_MODULE);
+	return ret;
+}
+
+/* Notifier block that routes rpc_pipefs events to rpc_pipefs_event(). */
+static struct notifier_block nfs_dns_resolver_block = {
+ .notifier_call = rpc_pipefs_event,
+};
+
+/*
+ * Register the per-net operations and the rpc_pipefs notifier of the DNS
+ * resolver.  Returns 0 on success or a negative errno; a failed notifier
+ * registration undoes the pernet registration.
+ */
+int nfs_dns_resolver_init(void)
+{
+	int err;
+
+	err = register_pernet_subsys(&nfs4_dns_resolver_ops);
+	if (err < 0)
+		return err;
+
+	err = rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
+	if (err < 0) {
+		unregister_pernet_subsys(&nfs4_dns_resolver_ops);
+		return err;
+	}
+	return 0;
+}
+
+/*
+ * Undo nfs_dns_resolver_init(): unregister the rpc_pipefs notifier first,
+ * then the per-net operations.
+ */
+void nfs_dns_resolver_destroy(void)
+{
+ rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block);
+ unregister_pernet_subsys(&nfs4_dns_resolver_ops);
+}
+#endif
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
new file mode 100644
index 000000000..576ff4b54
--- /dev/null
+++ b/fs/nfs/dns_resolve.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Resolve DNS hostnames into valid ip addresses
+ */
+#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H
+#define __LINUX_FS_NFS_DNS_RESOLVE_H
+
+#define NFS_DNS_HOSTNAME_MAXLEN (128)
+
+
+/*
+ * With CONFIG_NFS_USE_KERNEL_DNS the init/destroy hooks below are empty
+ * inline stubs; without it they are implemented out of line.
+ */
+#ifdef CONFIG_NFS_USE_KERNEL_DNS
+static inline int nfs_dns_resolver_init(void)
+{
+ return 0;
+}
+
+static inline void nfs_dns_resolver_destroy(void)
+{}
+
+static inline int nfs_dns_resolver_cache_init(struct net *net)
+{
+ return 0;
+}
+
+static inline void nfs_dns_resolver_cache_destroy(struct net *net)
+{}
+#else
+extern int nfs_dns_resolver_init(void);
+extern void nfs_dns_resolver_destroy(void);
+extern int nfs_dns_resolver_cache_init(struct net *net);
+extern void nfs_dns_resolver_cache_destroy(struct net *net);
+#endif
+
+/* Resolve @name into a socket address stored in @sa (at most @salen bytes). */
+extern ssize_t nfs_dns_resolve_name(struct net *net, char *name,
+ size_t namelen, struct sockaddr *sa, size_t salen);
+
+#endif
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
new file mode 100644
index 000000000..3430d6891
--- /dev/null
+++ b/fs/nfs/export.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2015, Primary Data, Inc. All rights reserved.
+ *
+ * Tao Peng <bergwolf@primarydata.com>
+ */
+#include <linux/dcache.h>
+#include <linux/exportfs.h>
+#include <linux/nfs.h>
+#include <linux/nfs_fs.h>
+
+#include "internal.h"
+#include "nfstrace.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+enum { /* 32-bit word offsets into the handle buffer filled by nfs_encode_fh() */
+ FILEID_HIGH_OFF = 0, /* upper 32 bits of the inode fileid */
+ FILEID_LOW_OFF, /* lower 32 bits of the inode fileid */
+ FILE_I_TYPE_OFF, /* inode type (i_mode & S_IFMT) */
+ EMBED_FH_OFF /* first word of the embedded server fh */
+};
+
+
+/* Locate the server file handle embedded EMBED_FH_OFF words into @p. */
+static struct nfs_fh *nfs_exp_embedfh(__u32 *p)
+{
+	return (struct nfs_fh *)&p[EMBED_FH_OFF];
+}
+
+/*
+ * Let's break subtree checking for now... otherwise we'll have to embed parent fh
+ * but there might not be enough space.
+ *
+ * Encode @inode into the handle buffer @p: the fileid split over two
+ * words, one word of file type, then a copy of the server file handle
+ * starting at EMBED_FH_OFF.  @parent is only logged, never encoded.
+ *
+ * On entry *max_len is the capacity of @p in 32-bit words; on return it
+ * holds the number of words the handle needs.  Returns that word count
+ * (used as the handle type), or FILEID_INVALID when the buffer is too
+ * small or @inode is an automount point.
+ */
+static int
+nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent)
+{
+	struct nfs_fh *server_fh = NFS_FH(inode);
+	struct nfs_fh *clnt_fh = nfs_exp_embedfh(p);
+	size_t fh_size = offsetof(struct nfs_fh, data) + server_fh->size;
+	int len = EMBED_FH_OFF + XDR_QUADLEN(fh_size);
+
+	/* terminate the message so it does not run into the next log line */
+	dprintk("%s: max fh len %d inode %p parent %p\n",
+		__func__, *max_len, inode, parent);
+
+	if (*max_len < len || IS_AUTOMOUNT(inode)) {
+		dprintk("%s: fh len %d too small, required %d\n",
+			__func__, *max_len, len);
+		/* tell the caller how many words would have been needed */
+		*max_len = len;
+		return FILEID_INVALID;
+	}
+
+	p[FILEID_HIGH_OFF] = NFS_FILEID(inode) >> 32;
+	p[FILEID_LOW_OFF] = NFS_FILEID(inode);
+	p[FILE_I_TYPE_OFF] = inode->i_mode & S_IFMT;
+	/* clear the last word first: the fh copy may not fill all of it */
+	p[len - 1] = 0; /* Padding */
+	nfs_copy_fh(clnt_fh, server_fh);
+	*max_len = len;
+	dprintk("%s: result fh fileid %llu mode %u size %d\n",
+		__func__, NFS_FILEID(inode), inode->i_mode, *max_len);
+	return *max_len;
+}
+
+/*
+ * Turn a handle produced by nfs_encode_fh() back into a dentry.
+ *
+ * @fid holds the raw handle words, @fh_len is the number of 32-bit words
+ * supplied and @fh_type must equal the encoded length.  The inode is
+ * first looked up with nfs_ilookup(); if that finds nothing, the
+ * attributes are fetched through the getattr rpc op and the inode is
+ * instantiated with nfs_fhget().
+ *
+ * Returns the dentry from d_obtain_alias(), NULL (which the caller
+ * treats as ESTALE) for a malformed handle, or an ERR_PTR on allocation
+ * or getattr failure.
+ */
+static struct dentry *
+nfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+		 int fh_len, int fh_type)
+{
+	struct nfs4_label *label = NULL;
+	struct nfs_fattr *fattr = NULL;
+	struct nfs_fh *server_fh = nfs_exp_embedfh(fid->raw);
+	size_t fh_size = offsetof(struct nfs_fh, data);
+	const struct nfs_rpc_ops *rpc_ops;
+	struct dentry *dentry;
+	struct inode *inode;
+	int len = EMBED_FH_OFF + XDR_QUADLEN(fh_size);
+	u32 *p = fid->raw;
+	int ret;
+
+	/*
+	 * The embedded size field is caller-supplied data: make sure the
+	 * buffer is long enough to contain it before reading it.
+	 */
+	if (fh_len < len)
+		return NULL;
+	/* a size beyond NFS_MAXFHSIZE cannot come from nfs_encode_fh() */
+	if (server_fh->size > NFS_MAXFHSIZE)
+		return NULL;
+
+	/* now the full encoded length can be computed safely */
+	fh_size += server_fh->size;
+	len = EMBED_FH_OFF + XDR_QUADLEN(fh_size);
+
+	/* NULL translates to ESTALE */
+	if (fh_len < len || fh_type != len)
+		return NULL;
+
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL) {
+		dentry = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	/* rebuild the lookup key from the words written by nfs_encode_fh() */
+	fattr->fileid = ((u64)p[FILEID_HIGH_OFF] << 32) + p[FILEID_LOW_OFF];
+	fattr->mode = p[FILE_I_TYPE_OFF];
+	fattr->valid |= NFS_ATTR_FATTR_FILEID | NFS_ATTR_FATTR_TYPE;
+
+	dprintk("%s: fileid %llu mode %d\n", __func__, fattr->fileid, fattr->mode);
+
+	inode = nfs_ilookup(sb, fattr, server_fh);
+	if (inode)
+		goto out_found;
+
+	label = nfs4_label_alloc(NFS_SB(sb), GFP_KERNEL);
+	if (IS_ERR(label)) {
+		dentry = ERR_CAST(label);
+		goto out_free_fattr;
+	}
+
+	rpc_ops = NFS_SB(sb)->nfs_client->rpc_ops;
+	ret = rpc_ops->getattr(NFS_SB(sb), server_fh, fattr, label, NULL);
+	if (ret) {
+		dprintk("%s: getattr failed %d\n", __func__, ret);
+		trace_nfs_fh_to_dentry(sb, server_fh, fattr->fileid, ret);
+		dentry = ERR_PTR(ret);
+		goto out_free_label;
+	}
+
+	inode = nfs_fhget(sb, server_fh, fattr, label);
+
+out_found:
+	dentry = d_obtain_alias(inode);
+
+out_free_label:
+	/* label is still NULL when the inode came from nfs_ilookup() */
+	nfs4_label_free(label);
+out_free_fattr:
+	nfs_free_fattr(fattr);
+out:
+	return dentry;
+}
+
+/*
+ * Find the parent of @dentry by issuing the protocol's lookupp
+ * operation and instantiating the inode it describes.
+ *
+ * Returns the dentry from d_obtain_alias(), or an ERR_PTR: -EACCES when
+ * the protocol has no lookupp op, -ENOMEM when the attribute buffer
+ * cannot be allocated, the label allocation error, or the lookupp
+ * status.
+ */
+static struct dentry *
+nfs_get_parent(struct dentry *dentry)
+{
+	struct inode *child = d_inode(dentry);
+	struct super_block *sb = child->i_sb;
+	struct nfs_server *server = NFS_SB(sb);
+	const struct nfs_rpc_ops *ops = server->nfs_client->rpc_ops;
+	struct nfs4_label *label;
+	struct nfs_fattr *fattr;
+	struct dentry *parent;
+	struct nfs_fh parent_fh;
+	int err;
+
+	if (ops->lookupp == NULL)
+		return ERR_PTR(-EACCES);
+
+	fattr = nfs_alloc_fattr();
+	if (!fattr)
+		return ERR_PTR(-ENOMEM);
+
+	label = nfs4_label_alloc(server, GFP_KERNEL);
+	if (IS_ERR(label)) {
+		parent = ERR_CAST(label);
+		goto free_fattr;
+	}
+
+	err = ops->lookupp(child, &parent_fh, fattr, label);
+	if (err == 0)
+		parent = d_obtain_alias(nfs_fhget(sb, &parent_fh, fattr, label));
+	else
+		parent = ERR_PTR(err);
+
+	nfs4_label_free(label);
+free_fattr:
+	nfs_free_fattr(fattr);
+	return parent;
+}
+
+const struct export_operations nfs_export_ops = { /* export hooks implemented in this file */
+ .encode_fh = nfs_encode_fh, /* packs fileid, type and server fh into the handle */
+ .fh_to_dentry = nfs_fh_to_dentry, /* turns such a handle back into a dentry */
+ .get_parent = nfs_get_parent, /* resolves the parent through the lookupp rpc op */
+};
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
new file mode 100644
index 000000000..7be1a7f7f
--- /dev/null
+++ b/fs/nfs/file.c
@@ -0,0 +1,874 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/file.c
+ *
+ * Copyright (C) 1992 Rick Sladkey
+ *
+ * Changes Copyright (C) 1994 by Florian La Roche
+ * - Do not copy data too often around in the kernel.
+ * - In nfs_file_read the return value of kmalloc wasn't checked.
+ * - Put in a better version of read look-ahead buffering. Original idea
+ * and implementation by Wai S Kok elekokws@ee.nus.sg.
+ *
+ * Expire cache on write to a file by Wai S Kok (Oct 1994).
+ *
+ * Total rewrite of read side for new NFS buffer cache.. Linus.
+ *
+ * nfs regular file handling functions
+ */
+
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/gfp.h>
+#include <linux/swap.h>
+
+#include <linux/uaccess.h>
+
+#include "delegation.h"
+#include "internal.h"
+#include "iostat.h"
+#include "fscache.h"
+#include "pnfs.h"
+
+#include "nfstrace.h"
+
+#define NFSDBG_FACILITY NFSDBG_FILE
+
+static const struct vm_operations_struct nfs_file_vm_ops;
+
+/* Hack for future NFS swap support */
+#ifndef IS_SWAPFILE
+# define IS_SWAPFILE(inode) (0)
+#endif
+
+/*
+ * Validate open flags: O_APPEND combined with O_DIRECT is refused with
+ * -EINVAL, every other combination yields 0.
+ */
+int nfs_check_flags(int flags)
+{
+	const int forbidden = O_APPEND | O_DIRECT;
+
+	return (flags & forbidden) == forbidden ? -EINVAL : 0;
+}
+EXPORT_SYMBOL_GPL(nfs_check_flags);
+
+/*
+ * Open a regular file: count the call, vet the open flags with
+ * nfs_check_flags() and, if they are acceptable, hand over to nfs_open().
+ * Returns whichever of the two reports a non-zero status, else 0.
+ */
+static int
+nfs_file_open(struct inode *inode, struct file *filp)
+{
+	int err;
+
+	dprintk("NFS: open file(%pD2)\n", filp);
+
+	nfs_inc_stats(inode, NFSIOS_VFSOPEN);
+	err = nfs_check_flags(filp->f_flags);
+	if (err == 0)
+		err = nfs_open(inode, filp);
+	return err;
+}
+
+int
+nfs_file_release(struct inode *inode, struct file *filp) /* .release hook of nfs_file_operations; always returns 0 */
+{
+ dprintk("NFS: release(%pD2)\n", filp);
+
+ nfs_inc_stats(inode, NFSIOS_VFSRELEASE); /* account the call in the NFSIOS_VFSRELEASE counter */
+ nfs_file_clear_open_context(filp);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_file_release);
+
+/**
+ * nfs_revalidate_file_size - Revalidate the file size
+ * @inode: pointer to inode struct
+ * @filp: pointer to struct file
+ *
+ * Wrapper around __nfs_revalidate_inode() that only goes to the server
+ * when the cached length cannot be trusted: either the file was opened
+ * with O_DIRECT, or nfs_check_cache_invalid() reports
+ * NFS_INO_REVAL_PAGECACHE.  In every other case 0 is returned without
+ * revalidating.
+ */
+static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	bool direct = (filp->f_flags & O_DIRECT) != 0;
+
+	if (!direct && !nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE))
+		return 0;
+	return __nfs_revalidate_inode(server, inode);
+}
+
+/*
+ * Seek within a file.  SEEK_SET and SEEK_CUR go straight to
+ * generic_file_llseek(); any other @whence first revalidates the cached
+ * file length and returns the (negative) revalidation error if it fails.
+ */
+loff_t nfs_file_llseek(struct file *filp, loff_t offset, int whence)
+{
+	dprintk("NFS: llseek file(%pD2, %lld, %d)\n",
+			filp, offset, whence);
+
+	switch (whence) {
+	case SEEK_SET:
+	case SEEK_CUR:
+		/* independent of the file length, nothing to refresh */
+		break;
+	default: {
+		/* SEEK_END, SEEK_DATA, SEEK_HOLE: the length must be current */
+		int err = nfs_revalidate_file_size(filp->f_mapping->host, filp);
+
+		if (err < 0)
+			return (loff_t)err;
+		break;
+	}
+	}
+
+	return generic_file_llseek(filp, offset, whence);
+}
+EXPORT_SYMBOL_GPL(nfs_file_llseek);
+
+/*
+ * Flush hook: for files opened for writing, push all dirty pages out
+ * with nfs_wb_all() and report any writeback error recorded on the
+ * mapping since the sample taken just before the flush.  Files that
+ * are not open for writing return 0 immediately.
+ */
+static int
+nfs_file_flush(struct file *file, fl_owner_t id)
+{
+	struct inode *inode = file_inode(file);
+	struct address_space *mapping = file->f_mapping;
+	errseq_t sampled;
+
+	dprintk("NFS: flush(%pD2)\n", file);
+
+	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
+	if (!(file->f_mode & FMODE_WRITE))
+		return 0;
+
+	/* sample first so only errors raised by this flush are reported */
+	sampled = filemap_sample_wb_err(mapping);
+	nfs_wb_all(inode);
+	return filemap_check_wb_err(mapping, sampled);
+}
+
+/*
+ * read_iter hook.  IOCB_DIRECT requests are forwarded to
+ * nfs_file_direct_read().  Otherwise the mapping is revalidated and the
+ * read is served by generic_file_read_iter() between
+ * nfs_start_io_read()/nfs_end_io_read().
+ *
+ * Returns the number of bytes read (also added to the
+ * NFSIOS_NORMALREADBYTES counter when positive), or the error from the
+ * revalidation or the generic read.
+ */
+ssize_t
+nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	ssize_t result;
+
+	if (iocb->ki_flags & IOCB_DIRECT)
+		return nfs_file_direct_read(iocb, to, false);
+
+	/*
+	 * Print the position as long long, like nfs_file_write() does: an
+	 * unsigned long cast would truncate ki_pos where long is 32 bits.
+	 */
+	dprintk("NFS: read(%pD2, %zu@%lld)\n",
+		iocb->ki_filp,
+		iov_iter_count(to), (long long) iocb->ki_pos);
+
+	nfs_start_io_read(inode);
+	result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
+	if (!result) {
+		result = generic_file_read_iter(iocb, to);
+		if (result > 0)
+			nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
+	}
+	nfs_end_io_read(inode);
+	return result;
+}
+EXPORT_SYMBOL_GPL(nfs_file_read);
+
+/*
+ * mmap hook: let generic_file_mmap() set the mapping up, then install
+ * the NFS vm operations and revalidate the page cache.  Returns the
+ * generic_file_mmap() error, or the revalidation status.
+ */
+int
+nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+	struct inode *inode = file_inode(file);
+	int err;
+
+	dprintk("NFS: mmap(%pD2)\n", file);
+
+	/* Note: generic_file_mmap() returns ENOSYS on nommu systems
+	 *       so we call that before revalidating the mapping
+	 */
+	err = generic_file_mmap(file, vma);
+	if (err)
+		return err;
+
+	vma->vm_ops = &nfs_file_vm_ops;
+	return nfs_revalidate_mapping(inode, file->f_mapping);
+}
+EXPORT_SYMBOL_GPL(nfs_file_mmap);
+
+/*
+ * Commit the inode synchronously and then collect any writeback error
+ * recorded against this open file.  A negative writeback error takes
+ * precedence over the commit status; otherwise the commit status is
+ * returned.
+ */
+static int
+nfs_file_fsync_commit(struct file *file, int datasync)
+{
+	struct inode *inode = file_inode(file);
+	int commit_status, wb_status;
+
+	dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
+
+	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
+	commit_status = nfs_commit_inode(inode, FLUSH_SYNC);
+	wb_status = file_check_and_advance_wb_err(file);
+
+	return wb_status < 0 ? wb_status : commit_status;
+}
+
+int
+nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) /* write back, commit and pNFS-sync; returns the first non-zero status, else 0 */
+{
+ struct nfs_open_context *ctx = nfs_file_open_context(file);
+ struct inode *inode = file_inode(file);
+ int ret;
+
+ trace_nfs_fsync_enter(inode);
+
+ for (;;) { /* another pass happens only when NFS_CONTEXT_RESEND_WRITES was set */
+ ret = file_write_and_wait_range(file, start, end);
+ if (ret != 0)
+ break;
+ ret = nfs_file_fsync_commit(file, datasync);
+ if (ret != 0)
+ break;
+ ret = pnfs_sync_inode(inode, !!datasync); /* !! folds datasync to 0 or 1 */
+ if (ret != 0)
+ break;
+ if (!test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags)) /* flag clear: done, ret is 0 here */
+ break;
+ /*
+ * If nfs_file_fsync_commit detected a server reboot, then
+ * resend all dirty pages that might have been covered by
+ * the NFS_CONTEXT_RESEND_WRITES flag.  The retry therefore
+ * covers the whole file rather than the caller's range.
+ */
+ start = 0;
+ end = LLONG_MAX;
+ }
+
+ trace_nfs_fsync_exit(inode, ret);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_file_fsync);
+
+/*
+ * Decide whether a read/modify/write cycle may be more efficient
+ * than a modify/write/read cycle when writing to a page in the
+ * page cache.
+ *
+ * Some pNFS layout drivers can only read/write at a certain block
+ * granularity like all block devices and therefore we must perform
+ * read/modify/write whenever a page hasn't been read yet and the data
+ * to be written there is not aligned to a block boundary and/or
+ * smaller than the block size.
+ *
+ * The modify/write/read cycle may occur if a page is read before
+ * being completely filled by the writer. In this situation, the
+ * page must be completely written to stable storage on the server
+ * before it can be refilled by reading in the page from the server.
+ * This can lead to expensive, small, FILE_SYNC mode writes being
+ * done.
+ *
+ * It may be more efficient to read the page first if the file is
+ * open for reading in addition to writing, the page is not marked
+ * as Uptodate, it is not dirty or waiting to be committed,
+ * indicating that it was previously allocated and then modified,
+ * that there were valid bytes of data in that range of the file,
+ * and that the new data won't completely replace the old data in
+ * that range of the file.
+ */
+/*
+ * Report whether a write of @len bytes at file position @pos replaces
+ * all of the data nfs_page_length() reports for @page: true when that
+ * length is zero, or when the write starts at the beginning of the page
+ * and reaches at least that length.
+ */
+static bool nfs_full_page_write(struct page *page, loff_t pos, unsigned int len)
+{
+	unsigned int valid = nfs_page_length(page);
+	unsigned int start = pos & (PAGE_SIZE - 1);
+
+	if (valid == 0)
+		return true;
+	return start == 0 && start + len >= valid;
+}
+
+/*
+ * Decide whether @page should be read in before a write of @len bytes
+ * at @pos is applied to it.
+ */
+static bool nfs_want_read_modify_write(struct file *file, struct page *page,
+			loff_t pos, unsigned int len)
+{
+	/* an up-to-date page or one with a write in flight needs no pre-read */
+	if (PageUptodate(page) || PagePrivate(page))
+		return false;
+	/* nor does a write that replaces the whole page */
+	if (nfs_full_page_write(page, pos, len))
+		return false;
+
+	/* layout driver wants whole pages, or the file is open for reading too */
+	return pnfs_ld_read_whole_page(file->f_mapping->host) ||
+	       (file->f_mode & FMODE_READ) != 0;
+}
+
+/*
+ * This does the "real" work of the write. We must allocate and lock the
+ * page to be sent back to the generic routine, which then copies the
+ * data from user space.
+ *
+ * If the writer ends up delaying the write, the writer needs to
+ * increment the page use counts until he is done with the page.
+ *
+ * The page obtained from grab_cache_page_write_begin() is stored in
+ * *pagep.  Returns 0 on success, -ENOMEM when no page could be grabbed,
+ * or the non-zero result of nfs_flush_incompatible() or nfs_readpage().
+ * At most one read/modify/write pre-read is attempted per call (guarded
+ * by once_thru); after a successful pre-read the page is grabbed again
+ * from the start label.
+ */
+static int nfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ int ret;
+ pgoff_t index = pos >> PAGE_SHIFT;
+ struct page *page;
+ int once_thru = 0;
+
+ dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n",
+ file, mapping->host->i_ino, len, (long long) pos);
+
+start:
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+ *pagep = page;
+
+ ret = nfs_flush_incompatible(file, page);
+ if (ret) {
+ unlock_page(page); /* error: hand the page back unlocked and unreferenced */
+ put_page(page);
+ } else if (!once_thru &&
+ nfs_want_read_modify_write(file, page, pos, len)) {
+ once_thru = 1;
+ ret = nfs_readpage(file, page);
+ put_page(page); /* drop our reference; the page is grabbed afresh on retry */
+ if (!ret)
+ goto start;
+ }
+ return ret;
+}
+
+static int nfs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata) /* returns copied, or the negative nfs_updatepage() status */
+{
+ unsigned offset = pos & (PAGE_SIZE - 1); /* byte offset of the write inside the page */
+ struct nfs_open_context *ctx = nfs_file_open_context(file);
+ int status;
+
+ dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n",
+ file, mapping->host->i_ino, len, (long long) pos);
+
+ /*
+ * Zero any uninitialised parts of the page, and then mark the page
+ * as up to date if it turns out that we're extending the file.
+ * Three cases, keyed on nfs_page_length(): no valid data at all,
+ * the copy reaching the end of the valid data, or the copy ending
+ * before it.
+ */
+ if (!PageUptodate(page)) {
+ unsigned pglen = nfs_page_length(page);
+ unsigned end = offset + copied;
+
+ if (pglen == 0) {
+ zero_user_segments(page, 0, offset,
+ end, PAGE_SIZE); /* zero both sides of the copied range */
+ SetPageUptodate(page);
+ } else if (end >= pglen) {
+ zero_user_segment(page, end, PAGE_SIZE);
+ if (offset == 0)
+ SetPageUptodate(page);
+ } else
+ zero_user_segment(page, pglen, PAGE_SIZE);
+ }
+
+ status = nfs_updatepage(file, page, offset, copied);
+
+ unlock_page(page); /* page is released whether or not the update succeeded */
+ put_page(page);
+
+ if (status < 0)
+ return status;
+ NFS_I(mapping->host)->write_io += copied;
+
+ if (nfs_ctx_key_to_expire(ctx, mapping->host))
+ nfs_wb_all(mapping->host); /* result ignored; copied is still returned */
+
+ return copied;
+}
+
+/*
+ * Partially or wholly invalidate a page
+ * - Only a whole-page invalidation (offset 0, length of at least
+ *   PAGE_SIZE) releases anything; partial requests return untouched
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ */
+static void nfs_invalidate_page(struct page *page, unsigned int offset,
+				unsigned int length)
+{
+	dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
+		 page, offset, length);
+
+	if (offset == 0 && length >= PAGE_SIZE) {
+		/* Cancel any unstarted writes on this page */
+		nfs_wb_page_cancel(page_file_mapping(page)->host, page);
+
+		nfs_fscache_invalidate_page(page, page->mapping->host);
+	}
+}
+
+/*
+ * Try to release the private state attached to a page
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ * - Returns 0 while PagePrivate() is set (the page is not freeable),
+ *   otherwise whatever nfs_fscache_release_page() decides
+ */
+static int nfs_release_page(struct page *page, gfp_t gfp)
+{
+	dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
+
+	return PagePrivate(page) ? 0 : nfs_fscache_release_page(page, gfp);
+}
+
+/*
+ * Tell the VM how to treat @page: sets *writeback while the inode has
+ * commit RPCs outstanding, otherwise sets *dirty if the page still
+ * carries private state.  Pages without a mapping and swap-cache pages
+ * leave both outputs untouched.
+ */
+static void nfs_check_dirty_writeback(struct page *page,
+				bool *dirty, bool *writeback)
+{
+	struct address_space *mapping = page_file_mapping(page);
+
+	if (mapping == NULL || PageSwapCache(page))
+		return;
+
+	if (atomic_read(&NFS_I(mapping->host)->commit_info.rpcs_out) != 0) {
+		/*
+		 * An unstable page is being committed: have the VM treat it
+		 * as under writeback so it will not block on pages that
+		 * will shortly be freeable.
+		 */
+		*writeback = true;
+	} else if (PagePrivate(page)) {
+		/*
+		 * Not freeable and no commit in progress, so it will not be
+		 * cleaned in the near future: treat it as dirty.
+		 */
+		*dirty = true;
+	}
+}
+
+/*
+ * Clear the private state attached to a page when an error requires the
+ * cached contents of an inode to be written back or destroyed
+ * - Called if either PG_private or fscache is set on the page
+ * - Caller holds page lock
+ * - Waits for any fscache write of the page, then returns the result of
+ *   nfs_wb_page(): 0 if successful, -error otherwise
+ */
+static int nfs_launder_page(struct page *page)
+{
+	struct inode *host = page_file_mapping(page)->host;
+
+	dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
+		host->i_ino, (long long)page_offset(page));
+
+	nfs_fscache_wait_on_page_write(NFS_I(host), page);
+	return nfs_wb_page(host, page);
+}
+
+/*
+ * swap_activate hook.  Rejects files whose allocated blocks do not cover
+ * their size (i.e. files with holes), reports the swap extent size
+ * through @span and switches the RPC client into swap mode.
+ *
+ * Returns 0 on success, -EINVAL for a sparse file, or the error from
+ * rpc_clnt_swap_activate().
+ */
+static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
+						sector_t *span)
+{
+	struct inode *inode = file_inode(file);
+	struct rpc_clnt *clnt = NFS_CLIENT(inode);
+	struct nfs_client *cl = NFS_SERVER(inode)->nfs_client;
+	u64 blocks;
+	loff_t isize;
+	int ret;
+
+	spin_lock(&inode->i_lock);
+	blocks = inode->i_blocks;
+	isize = inode->i_size;
+	spin_unlock(&inode->i_lock);
+	/*
+	 * i_blocks counts 512-byte units.  Do the multiplication in 64 bits:
+	 * with a 32-bit unsigned long it wraps for files of 4GiB and more
+	 * and a fully allocated swapfile is misreported as sparse.
+	 */
+	if (blocks * 512 < (u64)isize) {
+		pr_warn("swap activate: swapfile has holes\n");
+		return -EINVAL;
+	}
+
+	*span = sis->pages;
+
+	/*
+	 * Activate the transport first and only then call the protocol's
+	 * enable_swap hook, so that a failed activation does not leave the
+	 * hook enabled without a matching disable_swap call.
+	 */
+	ret = rpc_clnt_swap_activate(clnt);
+	if (ret)
+		return ret;
+
+	if (cl->rpc_ops->enable_swap)
+		cl->rpc_ops->enable_swap(inode);
+
+	return 0;
+}
+
+/*
+ * swap_deactivate hook: take the RPC client out of swap mode, then call
+ * the protocol's disable_swap operation if it provides one.
+ */
+static void nfs_swap_deactivate(struct file *file)
+{
+	struct inode *inode = file_inode(file);
+	struct nfs_client *cl = NFS_SERVER(inode)->nfs_client;
+
+	rpc_clnt_swap_deactivate(NFS_CLIENT(inode));
+
+	if (cl->rpc_ops->disable_swap != NULL)
+		cl->rpc_ops->disable_swap(inode);
+}
+
+const struct address_space_operations nfs_file_aops = { /* address_space hooks; the entries marked below are implemented in this file */
+ .readpage = nfs_readpage,
+ .readpages = nfs_readpages,
+ .set_page_dirty = __set_page_dirty_nobuffers,
+ .writepage = nfs_writepage,
+ .writepages = nfs_writepages,
+ .write_begin = nfs_write_begin, /* this file */
+ .write_end = nfs_write_end, /* this file */
+ .invalidatepage = nfs_invalidate_page, /* this file */
+ .releasepage = nfs_release_page, /* this file */
+ .direct_IO = nfs_direct_IO,
+#ifdef CONFIG_MIGRATION /* the migratepage hook exists only with CONFIG_MIGRATION */
+ .migratepage = nfs_migrate_page,
+#endif
+ .launder_page = nfs_launder_page, /* this file */
+ .is_dirty_writeback = nfs_check_dirty_writeback, /* this file */
+ .error_remove_page = generic_error_remove_page,
+ .swap_activate = nfs_swap_activate, /* this file */
+ .swap_deactivate = nfs_swap_deactivate, /* this file */
+};
+
+/*
+ * Notification that a PTE pointing to an NFS page is about to be made
+ * writable, implying that someone is about to modify the page through a
+ * shared-writable mapping
+ *
+ * Return values, as produced by the code below:
+ * - VM_FAULT_NOPAGE: the page no longer belongs to inode->i_mapping, or
+ *   nfs_page_length() is 0; the page is unlocked again.
+ * - VM_FAULT_LOCKED: nfs_flush_incompatible() and nfs_updatepage() both
+ *   returned 0; the page is left locked.
+ * - VM_FAULT_SIGBUS: either of those two calls failed; the page is
+ *   unlocked.
+ */
+static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct file *filp = vmf->vma->vm_file;
+ struct inode *inode = file_inode(filp);
+ unsigned pagelen;
+ vm_fault_t ret = VM_FAULT_NOPAGE;
+ struct address_space *mapping;
+
+ dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n",
+ filp, filp->f_mapping->host->i_ino,
+ (long long)page_offset(page));
+
+ sb_start_pagefault(inode->i_sb); /* paired with sb_end_pagefault() at the out label */
+
+ /* make sure the cache has finished storing the page */
+ nfs_fscache_wait_on_page_write(NFS_I(inode), page);
+
+ wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING,
+ nfs_wait_bit_killable, TASK_KILLABLE); /* return value is not checked */
+
+ lock_page(page);
+ mapping = page_file_mapping(page);
+ if (mapping != inode->i_mapping) /* page was detached from this inode while we waited */
+ goto out_unlock;
+
+ wait_on_page_writeback(page);
+
+ pagelen = nfs_page_length(page);
+ if (pagelen == 0)
+ goto out_unlock;
+
+ ret = VM_FAULT_LOCKED;
+ if (nfs_flush_incompatible(filp, page) == 0 &&
+ nfs_updatepage(filp, page, 0, pagelen) == 0)
+ goto out; /* success: skip unlock_page(), the page stays locked */
+
+ ret = VM_FAULT_SIGBUS;
+out_unlock:
+ unlock_page(page);
+out:
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+
+static const struct vm_operations_struct nfs_file_vm_ops = { /* installed on each vma by nfs_file_mmap() */
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = nfs_vm_page_mkwrite, /* the only NFS-specific hook in this table */
+};
+
+/*
+ * Returns 1 when the caller should flush and check for write errors:
+ * either @error is fatal on the server, or the key of the file's open
+ * context is about to expire.  Returns 0 otherwise.
+ */
+static int nfs_need_check_write(struct file *filp, struct inode *inode,
+				int error)
+{
+	struct nfs_open_context *ctx = nfs_file_open_context(filp);
+
+	return nfs_error_is_fatal_on_server(error) ||
+	       nfs_ctx_key_to_expire(ctx, inode);
+}
+
+ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) /* write_iter hook: returns bytes written or a negative error */
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ unsigned long written = 0;
+ ssize_t result;
+ errseq_t since;
+ int error;
+
+ result = nfs_key_timeout_notify(file, inode);
+ if (result)
+ return result;
+
+ if (iocb->ki_flags & IOCB_DIRECT) /* direct I/O is handled entirely by nfs_file_direct_write() */
+ return nfs_file_direct_write(iocb, from, false);
+
+ dprintk("NFS: write(%pD2, %zu@%Ld)\n",
+ file, iov_iter_count(from), (long long) iocb->ki_pos);
+
+ if (IS_SWAPFILE(inode)) /* refused with -ETXTBSY at out_swapfile */
+ goto out_swapfile;
+ /*
+ * O_APPEND implies that we must revalidate the file length.
+ */
+ if (iocb->ki_flags & IOCB_APPEND) {
+ result = nfs_revalidate_file_size(inode, file);
+ if (result)
+ goto out;
+ }
+ if (iocb->ki_pos > i_size_read(inode)) /* writing past the cached EOF */
+ nfs_revalidate_mapping(inode, file->f_mapping); /* result intentionally not checked here */
+
+ since = filemap_sample_wb_err(file->f_mapping); /* baseline for the error check after the write */
+ nfs_start_io_write(inode);
+ result = generic_write_checks(iocb, from);
+ if (result > 0) {
+ current->backing_dev_info = inode_to_bdi(inode);
+ result = generic_perform_write(file, from, iocb->ki_pos);
+ current->backing_dev_info = NULL;
+ }
+ nfs_end_io_write(inode);
+ if (result <= 0) /* nothing written, or an error: return it as is */
+ goto out;
+
+ written = result;
+ iocb->ki_pos += written;
+ result = generic_write_sync(iocb, written);
+ if (result < 0)
+ goto out;
+
+ /* Return error values */
+ error = filemap_check_wb_err(file->f_mapping, since);
+ if (nfs_need_check_write(file, inode, error)) {
+ int err = nfs_wb_all(inode);
+ if (err < 0)
+ result = err; /* a failed flush replaces the byte count */
+ }
+ nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
+out:
+ return result;
+
+out_swapfile:
+ printk(KERN_INFO "NFS: attempt to write to active swap file!\n");
+ return -ETXTBSY;
+}
+EXPORT_SYMBOL_GPL(nfs_file_write);
+
+/*
+ * Test for a conflicting lock.  A local conflict found by
+ * posix_test_lock() is reported directly in @fl.  Otherwise, when a read
+ * delegation is held or locking is local, @fl is marked F_UNLCK (no
+ * conflict); in the remaining case the protocol's lock operation is
+ * asked and its status returned.
+ */
+static int
+do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
+{
+	struct inode *inode = filp->f_mapping->host;
+	unsigned int requested = fl->fl_type;
+
+	/* Try local locking first */
+	posix_test_lock(filp, fl);
+	if (fl->fl_type != F_UNLCK)
+		return 0;	/* found a conflict */
+
+	/* posix_test_lock() overwrote the type; restore the request */
+	fl->fl_type = requested;
+
+	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) || is_local) {
+		fl->fl_type = F_UNLCK;
+		return 0;
+	}
+
+	return NFS_PROTO(inode)->lock(filp, cmd, fl);
+}
+
+/*
+ * Release a lock.  Pending writes are flushed and outstanding I/O on
+ * the lock context is waited for before the unlock is issued, either
+ * through the protocol's lock operation or locally.
+ */
+static int
+do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
+{
+	struct inode *inode = filp->f_mapping->host;
+	struct nfs_lock_context *l_ctx;
+
+	/*
+	 * Flush all pending writes before doing anything
+	 * with locks..
+	 */
+	nfs_wb_all(inode);
+
+	l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
+	if (!IS_ERR(l_ctx)) {
+		int wait_status = nfs_iocounter_wait(l_ctx);
+
+		nfs_put_lock_context(l_ctx);
+		/*  NOTE: special case
+		 * 	If we're signalled while cleaning up locks on process exit, we
+		 * 	still need to complete the unlock.
+		 */
+		if (wait_status < 0 && !(fl->fl_flags & FL_CLOSE))
+			return wait_status;
+	}
+
+	/*
+	 * Use local locking if mounted with "-onolock" or with appropriate
+	 * "-olocal_lock="
+	 */
+	if (is_local)
+		return locks_lock_file_wait(filp, fl);
+	return NFS_PROTO(inode)->lock(filp, cmd, fl);
+}
+
+/*
+ * Acquire a lock.  The mapping is synced first; a failure there, or a
+ * negative status from the lock request itself, is returned at once.
+ * After a successful lock the caches are zapped (unless a read
+ * delegation is held) so that the lock acts as a coherency point.
+ */
+static int
+do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
+{
+	struct inode *inode = filp->f_mapping->host;
+	int status;
+
+	/*
+	 * Flush all pending writes before doing anything
+	 * with locks..
+	 */
+	status = nfs_sync_mapping(filp->f_mapping);
+	if (status != 0)
+		return status;
+
+	/*
+	 * Use local locking if mounted with "-onolock" or with appropriate
+	 * "-olocal_lock="
+	 */
+	if (is_local)
+		status = locks_lock_file_wait(filp, fl);
+	else
+		status = NFS_PROTO(inode)->lock(filp, cmd, fl);
+	if (status < 0)
+		return status;
+
+	/*
+	 * Invalidate cache to prevent missing any changes.  If
+	 * the file is mapped, clear the page cache as well so
+	 * those mappings will be reloaded.
+	 *
+	 * This makes locking act as a cache coherency point.
+	 */
+	nfs_sync_mapping(filp->f_mapping);
+	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+		return status;
+
+	nfs_zap_caches(inode);
+	if (mapping_mapped(filp->f_mapping))
+		nfs_revalidate_mapping(inode, filp->f_mapping);
+	return status;
+}
+
+/*
+ * Lock a (portion of) a file
+ *
+ * Dispatches to do_getlk(), do_unlk() or do_setlk() depending on @cmd
+ * and the lock type.  Returns -ENOLCK for a mandatory-lock inode (unless
+ * unlocking), or the error from the protocol's bounds check.
+ */
+int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = filp->f_mapping->host;
+	int is_local;
+	int ret;
+
+	dprintk("NFS: lock(%pD2, t=%x, fl=%x, r=%lld:%lld)\n",
+			filp, fl->fl_type, fl->fl_flags,
+			(long long)fl->fl_start, (long long)fl->fl_end);
+
+	nfs_inc_stats(inode, NFSIOS_VFSLOCK);
+
+	/* No mandatory locks over NFS */
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+		return -ENOLCK;
+
+	is_local = (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) ? 1 : 0;
+
+	if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
+		ret = NFS_PROTO(inode)->lock_check_bounds(fl);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (IS_GETLK(cmd))
+		return do_getlk(filp, cmd, fl, is_local);
+	if (fl->fl_type == F_UNLCK)
+		return do_unlk(filp, cmd, fl, is_local);
+	return do_setlk(filp, cmd, fl, is_local);
+}
+EXPORT_SYMBOL_GPL(nfs_lock);
+
+/*
+ * Apply or release a flock()-style lock on a file
+ *
+ * Requests without FL_FLOCK get -ENOLCK and LOCK_MAND requests get
+ * -EINVAL; everything else is handed to do_unlk() or do_setlk().
+ */
+int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = filp->f_mapping->host;
+	int is_local;
+
+	dprintk("NFS: flock(%pD2, t=%x, fl=%x)\n",
+			filp, fl->fl_type, fl->fl_flags);
+
+	if ((fl->fl_flags & FL_FLOCK) == 0)
+		return -ENOLCK;
+
+	/*
+	 * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of
+	 * any standard. In principle we might be able to support LOCK_MAND
+	 * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the
+	 * NFS code is not set up for it.
+	 */
+	if (fl->fl_type & LOCK_MAND)
+		return -EINVAL;
+
+	is_local = (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) ? 1 : 0;
+
+	/* We're simulating flock() locks using posix locks on the server */
+	return fl->fl_type == F_UNLCK ? do_unlk(filp, cmd, fl, is_local)
+				      : do_setlk(filp, cmd, fl, is_local);
+}
+EXPORT_SYMBOL_GPL(nfs_flock);
+
+const struct file_operations nfs_file_operations = { /* file hooks; all nfs_* entries below are defined in this file */
+ .llseek = nfs_file_llseek,
+ .read_iter = nfs_file_read,
+ .write_iter = nfs_file_write,
+ .mmap = nfs_file_mmap,
+ .open = nfs_file_open,
+ .flush = nfs_file_flush,
+ .release = nfs_file_release,
+ .fsync = nfs_file_fsync,
+ .lock = nfs_lock, /* lock hook: dispatches to do_getlk/do_unlk/do_setlk */
+ .flock = nfs_flock, /* flock hook: handled via do_unlk()/do_setlk() */
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
+ .check_flags = nfs_check_flags, /* rejects O_APPEND together with O_DIRECT */
+ .setlease = simple_nosetlease,
+};
+EXPORT_SYMBOL_GPL(nfs_file_operations);
diff --git a/fs/nfs/filelayout/Makefile b/fs/nfs/filelayout/Makefile
new file mode 100644
index 000000000..de056312d
--- /dev/null
+++ b/fs/nfs/filelayout/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the pNFS Files Layout Driver kernel module
+#
+obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
+nfs_layout_nfsv41_files-y := filelayout.o filelayoutdev.o
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
new file mode 100644
index 000000000..deecfb50d
--- /dev/null
+++ b/fs/nfs/filelayout/filelayout.c
@@ -0,0 +1,1155 @@
+/*
+ * Module for the pnfs nfs4 file layout driver.
+ * Defines all I/O and Policy interface operations, plus code
+ * to register itself with the pNFS client.
+ *
+ * Copyright (c) 2002
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
+ * Dean Hildebrand <dhildebz@umich.edu>
+ *
+ * Permission is granted to use, copy, create derivative works, and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the University of Michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. If
+ * the above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * This software is provided as is, without representation or warranty
+ * of any kind either express or implied, including without limitation
+ * the implied warranties of merchantability, fitness for a particular
+ * purpose, or noninfringement. The Regents of the University of
+ * Michigan shall not be liable for any damages, including special,
+ * indirect, incidental, or consequential damages, with respect to any
+ * claim arising out of or in connection with the use of the software,
+ * even if it has been or is hereafter advised of the possibility of
+ * such damages.
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+
+#include <linux/sunrpc/metrics.h>
+
+#include "../nfs4session.h"
+#include "../internal.h"
+#include "../delegation.h"
+#include "filelayout.h"
+#include "../nfs4trace.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
+MODULE_DESCRIPTION("The NFSv4 file layout driver");
+
+#define FILELAYOUT_POLL_RETRY_MAX (15*HZ)
+static const struct pnfs_commit_ops filelayout_commit_ops;
+
+/*
+ * Translate a file offset into the offset to use on a data server when
+ * the layout is densely packed (STRIPE_DENSE).
+ *
+ * The offset is first made relative to the segment's pattern_offset.
+ * The result is the number of whole stripes (stripe_unit * stripe_count
+ * bytes each) preceding the offset, multiplied by stripe_unit, plus the
+ * position of the offset inside its stripe unit.
+ *
+ * The stripe width is computed in 64 bits.  stripe_unit and stripe_count
+ * come from the server, and a 32-bit product can wrap (for instance a
+ * 1 MiB unit with 4096 stripes wraps to zero), which would produce a
+ * wrong offset or a division by zero.
+ */
+static loff_t
+filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
+			    loff_t offset)
+{
+	u64 stripe_width = (u64)flseg->stripe_unit * flseg->dsaddr->stripe_count;
+	u64 stripe_no;
+	u32 rem;
+
+	offset -= flseg->pattern_offset;
+	/* 64-bit divisor, hence div64_u64() rather than div_u64() */
+	stripe_no = div64_u64(offset, stripe_width);
+	div_u64_rem(offset, flseg->stripe_unit, &rem);
+
+	return stripe_no * flseg->stripe_unit + rem;
+}
+
+/*
+ * Compute the offset to use on the data server for a given file offset.
+ * STRIPE_SPARSE layouts use the file offset unchanged; STRIPE_DENSE
+ * layouts are translated by filelayout_get_dense_offset().  Any other
+ * stripe type hits BUG().
+ */
+static loff_t
+filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
+	if (flseg->stripe_type == STRIPE_SPARSE)
+		return offset;
+
+	if (flseg->stripe_type == STRIPE_DENSE)
+		return filelayout_get_dense_offset(flseg, offset);
+
+	BUG();
+}
+
+/*
+ * Arrange for a write to be resent through the MDS.  NFS_IOHDR_REDO is
+ * set atomically first, so the resend is set up at most once per header;
+ * the task status becomes whatever pnfs_write_done_resend_to_mds() returns.
+ */
+static void filelayout_reset_write(struct nfs_pgio_header *hdr)
+{
+	struct rpc_task *task = &hdr->task;
+
+	/* Flag was already set: nothing more to do. */
+	if (test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
+		return;
+
+	dprintk("%s Reset task %5u for i/o through MDS "
+		"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+		task->tk_pid,
+		hdr->inode->i_sb->s_id,
+		(unsigned long long)NFS_FILEID(hdr->inode),
+		hdr->args.count,
+		(unsigned long long)hdr->args.offset);
+
+	task->tk_status = pnfs_write_done_resend_to_mds(hdr);
+}
+
+/*
+ * Arrange for a read to be resent through the MDS.  NFS_IOHDR_REDO is
+ * set atomically first, so the resend is set up at most once per header;
+ * the task status becomes whatever pnfs_read_done_resend_to_mds() returns.
+ */
+static void filelayout_reset_read(struct nfs_pgio_header *hdr)
+{
+	struct rpc_task *task = &hdr->task;
+
+	/* Flag was already set: nothing more to do. */
+	if (test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
+		return;
+
+	dprintk("%s Reset task %5u for i/o through MDS "
+		"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+		task->tk_pid,
+		hdr->inode->i_sb->s_id,
+		(unsigned long long)NFS_FILEID(hdr->inode),
+		hdr->args.count,
+		(unsigned long long)hdr->args.offset);
+
+	task->tk_status = pnfs_read_done_resend_to_mds(hdr);
+}
+
+/*
+ * Classify the status of a completed data-server RPC.
+ *
+ * Returns:
+ *   0                      - task->tk_status is not negative
+ *   -EAGAIN                - session recovery was scheduled, a delay was
+ *                            set up, or the error needs no action; the
+ *                            task status has been cleared to 0
+ *   -NFS4ERR_RESET_TO_MDS  - the I/O should be redone through the MDS
+ *
+ * @state is not used by this function.
+ */
+static int filelayout_async_handle_error(struct rpc_task *task,
+					 struct nfs4_state *state,
+					 struct nfs_client *clp,
+					 struct pnfs_layout_segment *lseg)
+{
+	struct pnfs_layout_hdr *lo = lseg->pls_layout;
+	struct inode *inode = lo->plh_inode;
+	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
+	struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
+
+	if (task->tk_status >= 0)
+		return 0;
+
+	switch (task->tk_status) {
+	/* DS session errors */
+	case -NFS4ERR_BADSESSION:
+	case -NFS4ERR_BADSLOT:
+	case -NFS4ERR_BAD_HIGH_SLOT:
+	case -NFS4ERR_DEADSESSION:
+	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+	case -NFS4ERR_SEQ_FALSE_RETRY:
+	case -NFS4ERR_SEQ_MISORDERED:
+		dprintk("%s ERROR %d, Reset session. Exchangeid "
+			"flags 0x%x\n", __func__, task->tk_status,
+			clp->cl_exchange_flags);
+		nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
+		break;
+
+	case -NFS4ERR_DELAY:
+	case -NFS4ERR_GRACE:
+		rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
+		break;
+
+	case -NFS4ERR_RETRY_UNCACHED_REP:
+		break;
+
+	/* Errors that invalidate the layout */
+	case -NFS4ERR_ACCESS:
+	case -NFS4ERR_PNFS_NO_LAYOUT:
+	case -ESTALE:		/* mapped NFS4ERR_STALE */
+	case -EBADHANDLE:	/* mapped NFS4ERR_BADHANDLE */
+	case -EISDIR:		/* mapped NFS4ERR_ISDIR */
+	case -NFS4ERR_FHEXPIRED:
+	case -NFS4ERR_WRONG_TYPE:
+		dprintk("%s Invalid layout error %d\n", __func__,
+			task->tk_status);
+		/*
+		 * Destroy the layout so that new i/o obtains a new one.
+		 * The layout is not actually destroyed until every current
+		 * lseg reference has been put.  Marking it invalid makes
+		 * the failed i/o, and all i/o waiting on the slot table,
+		 * go to the MDS until the layout is gone and a new valid
+		 * layout has been obtained.
+		 */
+		pnfs_destroy_layout(NFS_I(inode));
+		rpc_wake_up(&tbl->slot_tbl_waitq);
+		goto reset_to_mds;
+
+	/* RPC connection errors */
+	case -ECONNREFUSED:
+	case -EHOSTDOWN:
+	case -EHOSTUNREACH:
+	case -ENETUNREACH:
+	case -EIO:
+	case -ETIMEDOUT:
+	case -EPIPE:
+		dprintk("%s DS connection error %d\n", __func__,
+			task->tk_status);
+		nfs4_mark_deviceid_unavailable(devid);
+		pnfs_error_mark_layout_for_return(inode, lseg);
+		pnfs_set_lo_fail(lseg);
+		rpc_wake_up(&tbl->slot_tbl_waitq);
+		goto reset_to_mds;
+
+	/* Every other error is also retried through the MDS */
+	default:
+		goto reset_to_mds;
+	}
+
+	/* Only the "break" cases above get here: retry the same call. */
+	task->tk_status = 0;
+	return -EAGAIN;
+
+reset_to_mds:
+	dprintk("%s Retry through MDS. Error %d\n", __func__,
+		task->tk_status);
+	return -NFS4ERR_RESET_TO_MDS;
+}
+
+/* NFS_PROTO call done callback routines */
+
+/*
+ * Completion hook for a data-server READ.  The task status is run
+ * through filelayout_async_handle_error(): the read is then redirected
+ * to the MDS, restarted, or accepted (return 0).
+ */
+static int filelayout_read_done_cb(struct rpc_task *task,
+				   struct nfs_pgio_header *hdr)
+{
+	int err;
+
+	trace_nfs4_pnfs_read(hdr, task->tk_status);
+	err = filelayout_async_handle_error(task, hdr->args.context->state,
+					    hdr->ds_clp, hdr->lseg);
+
+	if (err == -NFS4ERR_RESET_TO_MDS) {
+		filelayout_reset_read(hdr);
+		return task->tk_status;
+	}
+
+	if (err == -EAGAIN) {
+		rpc_restart_call_prepare(task);
+		return -EAGAIN;
+	}
+
+	return 0;
+}
+
+/*
+ * Record that a LAYOUTCOMMIT is needed after a data-server WRITE.
+ *
+ * The rpc_cred of the first WRITE that triggers the need for a
+ * LAYOUTCOMMIT is referenced and used to send the layoutcommit compound;
+ * rfc5661 is not clear about which credential should be used.
+ *
+ * Nothing is recorded when the segment commits through the MDS or when
+ * the write was NFS_FILE_SYNC.
+ */
+static void
+filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
+{
+	loff_t end_offs = 0;
+
+	if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds)
+		return;
+
+	switch (hdr->res.verf->committed) {
+	case NFS_FILE_SYNC:
+		return;
+	case NFS_DATA_SYNC:
+		end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
+		break;
+	default:
+		/* unstable write: end_offs is not set until commit */
+		break;
+	}
+
+	pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
+	dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
+		(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
+}
+
+/*
+ * True when the device id node tests as invalid for the file layout
+ * driver or as unavailable according to nfs4_test_deviceid_unavailable().
+ */
+bool
+filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node)
+{
+	if (filelayout_test_devid_invalid(node))
+		return true;
+
+	return nfs4_test_deviceid_unavailable(node);
+}
+
+/*
+ * Decide whether i/o for this segment must go through the MDS: true when
+ * the segment's device id node is unavailable.
+ */
+static bool
+filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
+{
+	return filelayout_test_devid_unavailable(FILELAYOUT_DEVID_NODE(lseg));
+}
+
+/*
+ * Call ops for the async read/write cases
+ * In the case of dense layouts, the offset needs to be reset to its
+ * original value.
+ *
+ * rpc_call_prepare for a data-server READ: exits the task with -EIO on
+ * a bad open context, hands the read to the MDS when the device is
+ * unavailable, and otherwise sets up the session sequence and the
+ * stateid for the call.
+ */
+static void filelayout_read_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+	int status;
+
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
+		rpc_exit(task, -EIO);
+		return;
+	}
+
+	if (filelayout_reset_to_mds(hdr->lseg)) {
+		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
+		filelayout_reset_read(hdr);
+		rpc_exit(task, 0);
+		return;
+	}
+
+	hdr->pgio_done_cb = filelayout_read_done_cb;
+
+	status = nfs4_setup_sequence(hdr->ds_clp,
+				     &hdr->args.seq_args,
+				     &hdr->res.seq_res,
+				     task);
+	if (status != 0)
+		return;
+
+	status = nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+				     hdr->args.lock_context, FMODE_READ);
+	if (status == -EIO)
+		rpc_exit(task, -EIO); /* lost lock, terminate I/O */
+}
+
+/*
+ * rpc_call_done for a data-server READ.  A header already flagged
+ * NFS_IOHDR_REDO whose task status is 0 only finishes the session
+ * sequence; every other case is passed to the MDS call_done handler.
+ */
+static void filelayout_read_call_done(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
+
+	if (!test_bit(NFS_IOHDR_REDO, &hdr->flags) ||
+	    task->tk_status != 0) {
+		/* Note this may cause RPC to be resent */
+		hdr->mds_ops->rpc_call_done(task, data);
+		return;
+	}
+
+	nfs41_sequence_done(task, &hdr->res.seq_res);
+}
+
+/*
+ * rpc_count_stats for reads: account the task in the cl_metrics of the
+ * inode's nfs_server RPC client.
+ */
+static void filelayout_read_count_stats(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+	struct rpc_clnt *clnt = NFS_SERVER(hdr->inode)->client;
+
+	rpc_count_iostats(task, clnt->cl_metrics);
+}
+
+/*
+ * Completion hook for a data-server WRITE.  The task status is run
+ * through filelayout_async_handle_error(): the write is then redirected
+ * to the MDS, restarted, or accepted.  An accepted write records the
+ * layoutcommit state and, when the status is not negative, updates the
+ * inode.
+ */
+static int filelayout_write_done_cb(struct rpc_task *task,
+				    struct nfs_pgio_header *hdr)
+{
+	int err;
+
+	trace_nfs4_pnfs_write(hdr, task->tk_status);
+	err = filelayout_async_handle_error(task, hdr->args.context->state,
+					    hdr->ds_clp, hdr->lseg);
+
+	if (err == -NFS4ERR_RESET_TO_MDS) {
+		filelayout_reset_write(hdr);
+		return task->tk_status;
+	}
+
+	if (err == -EAGAIN) {
+		rpc_restart_call_prepare(task);
+		return -EAGAIN;
+	}
+
+	filelayout_set_layoutcommit(hdr);
+
+	/* zero out the fattr */
+	hdr->fattr.valid = 0;
+	if (task->tk_status >= 0)
+		nfs_writeback_update_inode(hdr);
+
+	return 0;
+}
+
+/*
+ * Completion hook for a data-server COMMIT.  Returns -EAGAIN when the
+ * commit has to be redone (after preparing the writes for resend, or
+ * after restarting the call); otherwise records the layoutcommit state
+ * and returns 0.
+ */
+static int filelayout_commit_done_cb(struct rpc_task *task,
+				     struct nfs_commit_data *data)
+{
+	int err;
+
+	trace_nfs4_pnfs_commit_ds(data, task->tk_status);
+	err = filelayout_async_handle_error(task, NULL, data->ds_clp,
+					    data->lseg);
+
+	if (err == -NFS4ERR_RESET_TO_MDS) {
+		pnfs_generic_prepare_to_resend_writes(data);
+		return -EAGAIN;
+	}
+
+	if (err == -EAGAIN) {
+		rpc_restart_call_prepare(task);
+		return -EAGAIN;
+	}
+
+	pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
+
+	return 0;
+}
+
+/*
+ * rpc_call_prepare for a data-server WRITE: exits the task with -EIO on
+ * a bad open context, hands the write to the MDS when the device is
+ * unavailable, and otherwise sets up the session sequence and the
+ * stateid for the call.
+ */
+static void filelayout_write_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+	int status;
+
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
+		rpc_exit(task, -EIO);
+		return;
+	}
+
+	if (filelayout_reset_to_mds(hdr->lseg)) {
+		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
+		filelayout_reset_write(hdr);
+		rpc_exit(task, 0);
+		return;
+	}
+
+	status = nfs4_setup_sequence(hdr->ds_clp,
+				     &hdr->args.seq_args,
+				     &hdr->res.seq_res,
+				     task);
+	if (status != 0)
+		return;
+
+	status = nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+				     hdr->args.lock_context, FMODE_WRITE);
+	if (status == -EIO)
+		rpc_exit(task, -EIO); /* lost lock, terminate I/O */
+}
+
+/*
+ * rpc_call_done for a data-server WRITE.  A header already flagged
+ * NFS_IOHDR_REDO whose task status is 0 only finishes the session
+ * sequence; every other case is passed to the MDS call_done handler.
+ */
+static void filelayout_write_call_done(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	if (!test_bit(NFS_IOHDR_REDO, &hdr->flags) ||
+	    task->tk_status != 0) {
+		/* Note this may cause RPC to be resent */
+		hdr->mds_ops->rpc_call_done(task, data);
+		return;
+	}
+
+	nfs41_sequence_done(task, &hdr->res.seq_res);
+}
+
+/*
+ * rpc_count_stats for writes: account the task in the cl_metrics of the
+ * inode's nfs_server RPC client.
+ */
+static void filelayout_write_count_stats(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+	struct rpc_clnt *clnt = NFS_SERVER(hdr->inode)->client;
+
+	rpc_count_iostats(task, clnt->cl_metrics);
+}
+
+/*
+ * rpc_call_prepare for a data-server COMMIT: set up the session sequence
+ * on the data server client.  The return value of nfs4_setup_sequence()
+ * is not examined here.
+ */
+static void filelayout_commit_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs_commit_data *cdata = data;
+
+	nfs4_setup_sequence(cdata->ds_clp, &cdata->args.seq_args,
+			    &cdata->res.seq_res, task);
+}
+
+/*
+ * rpc_count_stats for commits: account the task in the cl_metrics of the
+ * inode's nfs_server RPC client.
+ */
+static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
+{
+	struct nfs_commit_data *cdata = data;
+	struct rpc_clnt *clnt = NFS_SERVER(cdata->inode)->client;
+
+	rpc_count_iostats(task, clnt->cl_metrics);
+}
+
+/* RPC callbacks used for READs sent to a data server. */
+static const struct rpc_call_ops filelayout_read_call_ops = {
+	.rpc_call_prepare	= filelayout_read_prepare,
+	.rpc_call_done		= filelayout_read_call_done,
+	.rpc_release		= pnfs_generic_rw_release,
+	.rpc_count_stats	= filelayout_read_count_stats,
+};
+
+/* RPC callbacks used for WRITEs sent to a data server. */
+static const struct rpc_call_ops filelayout_write_call_ops = {
+	.rpc_call_prepare	= filelayout_write_prepare,
+	.rpc_call_done		= filelayout_write_call_done,
+	.rpc_release		= pnfs_generic_rw_release,
+	.rpc_count_stats	= filelayout_write_count_stats,
+};
+
+/* RPC callbacks used for COMMITs sent to a data server. */
+static const struct rpc_call_ops filelayout_commit_call_ops = {
+	.rpc_call_prepare	= filelayout_commit_prepare,
+	.rpc_call_done		= pnfs_generic_write_commit_done,
+	.rpc_release		= pnfs_generic_commit_release,
+	.rpc_count_stats	= filelayout_commit_count_stats,
+};
+
+/*
+ * Send a read to the data server that covers hdr->args.offset.
+ *
+ * Returns PNFS_NOT_ATTEMPTED when no data server or no RPC client for it
+ * could be obtained, PNFS_ATTEMPTED once the asynchronous read has been
+ * initiated.
+ */
+static enum pnfs_try_status
+filelayout_read_pagelist(struct nfs_pgio_header *hdr)
+{
+	struct pnfs_layout_segment *lseg = hdr->lseg;
+	loff_t offset = hdr->args.offset;
+	struct rpc_clnt *ds_clnt;
+	struct nfs4_pnfs_ds *ds;
+	struct nfs_fh *fh;
+	u32 stripe, ds_idx;
+
+	dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
+		__func__, hdr->inode->i_ino,
+		hdr->args.pgbase, (size_t)hdr->args.count, offset);
+
+	/* Retrieve the correct rpc_client for the byte range */
+	stripe = nfs4_fl_calc_j_index(lseg, offset);
+	ds_idx = nfs4_fl_calc_ds_index(lseg, stripe);
+
+	ds = nfs4_fl_prepare_ds(lseg, ds_idx);
+	if (ds == NULL)
+		return PNFS_NOT_ATTEMPTED;
+
+	ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode);
+	if (IS_ERR(ds_clnt))
+		return PNFS_NOT_ATTEMPTED;
+
+	dprintk("%s USE DS: %s cl_count %d\n", __func__,
+		ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count));
+
+	/* No multipath support. Use first DS */
+	refcount_inc(&ds->ds_clp->cl_count);
+	hdr->ds_clp = ds->ds_clp;
+	hdr->ds_commit_idx = ds_idx;
+
+	/* A NULL fh leaves the file handle already in hdr->args in place. */
+	fh = nfs4_fl_select_ds_fh(lseg, stripe);
+	if (fh != NULL)
+		hdr->args.fh = fh;
+
+	/* The DS sees the translated offset; keep the original one too. */
+	hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
+	hdr->mds_offset = offset;
+
+	/* Perform an asynchronous read to ds */
+	nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
+			  NFS_PROTO(hdr->inode), &filelayout_read_call_ops,
+			  0, RPC_TASK_SOFTCONN);
+	return PNFS_ATTEMPTED;
+}
+
+/*
+ * Perform async writes.
+ *
+ * Sends the write to the data server that covers hdr->args.offset.
+ * Returns PNFS_NOT_ATTEMPTED when no data server or no RPC client for it
+ * could be obtained, PNFS_ATTEMPTED once the write has been initiated.
+ */
+static enum pnfs_try_status
+filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
+{
+	struct pnfs_layout_segment *lseg = hdr->lseg;
+	loff_t offset = hdr->args.offset;
+	struct rpc_clnt *ds_clnt;
+	struct nfs4_pnfs_ds *ds;
+	struct nfs_fh *fh;
+	u32 stripe, ds_idx;
+
+	/* Retrieve the correct rpc_client for the byte range */
+	stripe = nfs4_fl_calc_j_index(lseg, offset);
+	ds_idx = nfs4_fl_calc_ds_index(lseg, stripe);
+
+	ds = nfs4_fl_prepare_ds(lseg, ds_idx);
+	if (ds == NULL)
+		return PNFS_NOT_ATTEMPTED;
+
+	ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode);
+	if (IS_ERR(ds_clnt))
+		return PNFS_NOT_ATTEMPTED;
+
+	dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d\n",
+		__func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
+		offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count));
+
+	hdr->pgio_done_cb = filelayout_write_done_cb;
+	refcount_inc(&ds->ds_clp->cl_count);
+	hdr->ds_clp = ds->ds_clp;
+	hdr->ds_commit_idx = ds_idx;
+
+	/* A NULL fh leaves the file handle already in hdr->args in place. */
+	fh = nfs4_fl_select_ds_fh(lseg, stripe);
+	if (fh != NULL)
+		hdr->args.fh = fh;
+
+	hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
+
+	/* Perform an asynchronous write */
+	nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
+			  NFS_PROTO(hdr->inode), &filelayout_write_call_ops,
+			  sync, RPC_TASK_SOFTCONN);
+	return PNFS_ATTEMPTED;
+}
+
+/*
+ * Make sure @fl has a device address attached, looking the deviceid up
+ * and validating the segment against it if it has not been set yet.
+ *
+ * Returns 0 when fl->dsaddr is set (by this call or by a concurrent
+ * one), or -EINVAL when the deviceid cannot be found, is unavailable, or
+ * does not match first_stripe_index / num_fh of the segment.
+ */
+static int
+filelayout_check_deviceid(struct pnfs_layout_hdr *lo,
+			  struct nfs4_filelayout_segment *fl,
+			  gfp_t gfp_flags)
+{
+	struct nfs4_file_layout_dsaddr *dsaddr;
+	struct nfs4_deviceid_node *d;
+	int status = -EINVAL;
+	bool bad_num_fh;
+
+	/* Is the deviceid already set? If so, we're good. */
+	if (fl->dsaddr != NULL)
+		return 0;
+
+	/* find and reference the deviceid */
+	d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &fl->deviceid,
+				   lo->plh_lc_cred, gfp_flags);
+	if (d == NULL)
+		return status;
+
+	dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
+
+	/* Found deviceid is unavailable */
+	if (filelayout_test_devid_unavailable(&dsaddr->id_node))
+		goto out_put;
+
+	if (fl->first_stripe_index >= dsaddr->stripe_count) {
+		dprintk("%s Bad first_stripe_index %u\n",
+			__func__, fl->first_stripe_index);
+		goto out_put;
+	}
+
+	/*
+	 * Sparse packing: more than one fh must match the number of data
+	 * servers.  Dense packing: the fh count must match the stripe count.
+	 */
+	if (fl->stripe_type == STRIPE_SPARSE)
+		bad_num_fh = fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num;
+	else if (fl->stripe_type == STRIPE_DENSE)
+		bad_num_fh = fl->num_fh != dsaddr->stripe_count;
+	else
+		bad_num_fh = false;
+
+	if (bad_num_fh) {
+		dprintk("%s num_fh %u not valid for given packing\n",
+			__func__, fl->num_fh);
+		goto out_put;
+	}
+
+	status = 0;
+
+	/*
+	 * Atomic compare and xchange to ensure we don't scribble
+	 * over a non-NULL pointer.  If the pointer was still NULL it now
+	 * holds our reference; otherwise the reference is dropped below.
+	 */
+	if (cmpxchg(&fl->dsaddr, NULL, dsaddr) == NULL)
+		return status;
+
+out_put:
+	nfs4_fl_put_deviceid(dsaddr);
+	return status;
+}
+
+/*
+ * filelayout_check_layout()
+ *
+ * Sanity-check the layout segment parameters.  At this point no generic
+ * layer initialization of the lseg has occurred, and nothing has been
+ * added to the layout_hdr cache.
+ *
+ * Returns 0 if the segment is acceptable, -EINVAL otherwise.
+ */
+static int
+filelayout_check_layout(struct pnfs_layout_hdr *lo,
+			struct nfs4_filelayout_segment *fl,
+			struct nfs4_layoutget_res *lgr,
+			gfp_t gfp_flags)
+{
+	int status = -EINVAL;
+
+	dprintk("--> %s\n", __func__);
+
+	/* FIXME: remove this check when layout segment support is added */
+	if (lgr->range.offset != 0 ||
+	    lgr->range.length != NFS4_MAX_UINT64) {
+		dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
+			__func__);
+	} else if (fl->pattern_offset > lgr->range.offset) {
+		dprintk("%s pattern_offset %lld too large\n",
+			__func__, fl->pattern_offset);
+	} else if (!fl->stripe_unit) {
+		dprintk("%s Invalid stripe unit (%u)\n",
+			__func__, fl->stripe_unit);
+	} else {
+		status = 0;
+	}
+
+	dprintk("--> %s returns %d\n", __func__, status);
+	return status;
+}
+
+/*
+ * Free a file layout segment together with its file handle array.  The
+ * array may be only partially filled: freeing of the entries stops at
+ * the first NULL slot.
+ */
+static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
+{
+	unsigned int idx;
+
+	if (fl->fh_array != NULL) {
+		for (idx = 0; idx < fl->num_fh && fl->fh_array[idx]; idx++)
+			kfree(fl->fh_array[idx]);
+		kfree(fl->fh_array);
+	}
+	kfree(fl);
+}
+
+/*
+ * Decode the file layout body of a LAYOUTGET result into @fl.
+ *
+ * The body consists of the deviceid, nfl_util (flag bits plus the stripe
+ * unit), first_stripe_index, pattern_offset, num_fh and then num_fh
+ * variable-length file handles.
+ *
+ * Returns 0 on success, -ENOMEM if the scratch page cannot be allocated,
+ * and -EIO for every later decode or allocation failure.  On failure
+ * fl->fh_array may be left partially filled; it is not freed here.
+ */
+static int
+filelayout_decode_layout(struct pnfs_layout_hdr *flo,
+			 struct nfs4_filelayout_segment *fl,
+			 struct nfs4_layoutget_res *lgr,
+			 gfp_t gfp_flags)
+{
+	struct xdr_stream stream;
+	struct xdr_buf buf;
+	struct page *scratch;
+	struct nfs_fh *fh;
+	__be32 *p;
+	uint32_t nfl_util;
+	int ret = -EIO;
+	int i;
+
+	dprintk("%s: set_layout_map Begin\n", __func__);
+
+	scratch = alloc_page(gfp_flags);
+	if (!scratch)
+		return -ENOMEM;
+
+	xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
+	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+	/*
+	 * Fixed-size part: the deviceid followed by 20 bytes, namely
+	 * nfl_util (4), first_stripe_index (4), pattern_offset (8) and
+	 * num_fh (4).
+	 */
+	p = xdr_inline_decode(&stream, NFS4_DEVICEID4_SIZE + 20);
+	if (unlikely(!p))
+		goto out;
+
+	memcpy(&fl->deviceid, p, sizeof(fl->deviceid));
+	p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+	nfs4_print_deviceid(&fl->deviceid);
+
+	/* nfl_util carries the flag bits and the stripe unit together */
+	nfl_util = be32_to_cpup(p++);
+	if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
+		fl->commit_through_mds = 1;
+	fl->stripe_type = (nfl_util & NFL4_UFLG_DENSE) ?
+				STRIPE_DENSE : STRIPE_SPARSE;
+	fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
+
+	fl->first_stripe_index = be32_to_cpup(p++);
+	p = xdr_decode_hyper(p, &fl->pattern_offset);
+	fl->num_fh = be32_to_cpup(p++);
+
+	dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
+		__func__, nfl_util, fl->num_fh, fl->first_stripe_index,
+		fl->pattern_offset);
+
+	/*
+	 * Note that a zero value for num_fh is legal for STRIPE_SPARSE.
+	 * Only the upper bound is enforced here; num_fh is checked against
+	 * the device in filelayout_check_deviceid().
+	 */
+	if (fl->num_fh >
+	    max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT))
+		goto out;
+
+	if (fl->num_fh > 0) {
+		fl->fh_array = kcalloc(fl->num_fh, sizeof(fl->fh_array[0]),
+				       gfp_flags);
+		if (!fl->fh_array)
+			goto out;
+	}
+
+	for (i = 0; i < fl->num_fh; i++) {
+		/* Do we want to use a mempool here? */
+		fh = kmalloc(sizeof(*fh), gfp_flags);
+		fl->fh_array[i] = fh;
+		if (!fh)
+			goto out;
+
+		/* 4-byte length, then the opaque handle itself */
+		p = xdr_inline_decode(&stream, 4);
+		if (unlikely(!p))
+			goto out;
+		fh->size = be32_to_cpup(p++);
+		if (fh->size > NFS_MAXFHSIZE) {
+			printk(KERN_ERR "NFS: Too big fh %d received %d\n",
+			       i, fh->size);
+			goto out;
+		}
+
+		p = xdr_inline_decode(&stream, fh->size);
+		if (unlikely(!p))
+			goto out;
+		memcpy(fh->data, p, fh->size);
+		dprintk("DEBUG: %s: fh len %d\n", __func__,
+			fh->size);
+	}
+
+	ret = 0;
+out:
+	__free_page(scratch);
+	return ret;
+}
+
+/*
+ * free_lseg hook: drop the device address reference if one is held,
+ * release the commit info tied to an IOMODE_RW segment (under the inode
+ * i_lock), then free the segment.
+ */
+static void
+filelayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+	struct nfs4_filelayout *flo;
+	struct inode *inode;
+
+	dprintk("--> %s\n", __func__);
+
+	if (fl->dsaddr)
+		nfs4_fl_put_deviceid(fl->dsaddr);
+
+	/* This assumes a single RW lseg */
+	if (lseg->pls_range.iomode == IOMODE_RW) {
+		flo = FILELAYOUT_FROM_HDR(lseg->pls_layout);
+		inode = flo->generic_hdr.plh_inode;
+
+		spin_lock(&inode->i_lock);
+		pnfs_generic_ds_cinfo_release_lseg(&flo->commit_info, lseg);
+		spin_unlock(&inode->i_lock);
+	}
+
+	_filelayout_free_lseg(fl);
+}
+
+/*
+ * alloc_lseg hook: allocate a zeroed file layout segment, decode the
+ * LAYOUTGET result into it and sanity-check it.  Returns the embedded
+ * generic segment, or NULL on allocation, decode or check failure.
+ */
+static struct pnfs_layout_segment *
+filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
+		      struct nfs4_layoutget_res *lgr,
+		      gfp_t gfp_flags)
+{
+	struct nfs4_filelayout_segment *fl;
+
+	dprintk("--> %s\n", __func__);
+
+	fl = kzalloc(sizeof(*fl), gfp_flags);
+	if (fl == NULL)
+		return NULL;
+
+	if (filelayout_decode_layout(layoutid, fl, lgr, gfp_flags) != 0)
+		goto out_free;
+	if (filelayout_check_layout(layoutid, fl, lgr, gfp_flags))
+		goto out_free;
+
+	return &fl->generic_hdr;
+
+out_free:
+	_filelayout_free_lseg(fl);
+	return NULL;
+}
+
+/* A segment counts as striped when it carries more than one file handle. */
+static bool
+filelayout_lseg_is_striped(const struct nfs4_filelayout_segment *flseg)
+{
+	if (flseg->num_fh <= 1)
+		return false;
+	return true;
+}
+
+/*
+ * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
+ *
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ *
+ * For a striped segment a request may only join @prev if both lie in the
+ * same stripe unit, and the result is capped at the bytes remaining in
+ * that stripe unit.
+ */
+static size_t
+filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+		   struct nfs_page *req)
+{
+	u64 segment_offset = pgio->pg_lseg->pls_range.offset;
+	u32 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
+	u64 p_stripe, r_stripe;
+	u32 stripe_offset;
+	unsigned int size;
+
+	/* calls nfs_generic_pg_test */
+	size = pnfs_generic_pg_test(pgio, prev, req);
+	if (!size)
+		return 0;
+	if (!filelayout_lseg_is_striped(FILELAYOUT_LSEG(pgio->pg_lseg)))
+		return size;
+
+	/* see if req and prev are in the same stripe */
+	if (prev) {
+		p_stripe = div_u64((u64)req_offset(prev) - segment_offset,
+				   stripe_unit);
+		r_stripe = div_u64((u64)req_offset(req) - segment_offset,
+				   stripe_unit);
+
+		if (p_stripe != r_stripe)
+			return 0;
+	}
+
+	/* calculate remaining bytes in the current stripe */
+	div_u64_rem((u64)req_offset(req) - segment_offset,
+		    stripe_unit,
+		    &stripe_offset);
+	WARN_ON_ONCE(stripe_offset > stripe_unit);
+	if (stripe_offset >= stripe_unit)
+		return 0;
+
+	return min(stripe_unit - (unsigned int)stripe_offset, size);
+}
+
+/*
+ * Obtain a layout segment through pnfs_update_layout() and make sure its
+ * deviceid has been resolved by filelayout_check_deviceid().
+ *
+ * A NULL or ERR_PTR result from pnfs_update_layout() is handed back
+ * unchanged.  When the deviceid check fails, the segment is marked for
+ * return and the layout flagged as failed before the reference is put,
+ * the same handling filelayout_async_handle_error() applies to DS
+ * connection errors; NULL is then returned so the caller falls back to
+ * the MDS.
+ */
+static struct pnfs_layout_segment *
+fl_pnfs_update_layout(struct inode *ino,
+		      struct nfs_open_context *ctx,
+		      loff_t pos,
+		      u64 count,
+		      enum pnfs_iomode iomode,
+		      bool strict_iomode,
+		      gfp_t gfp_flags)
+{
+	struct pnfs_layout_segment *lseg = NULL;
+	struct pnfs_layout_hdr *lo;
+	struct nfs4_filelayout_segment *fl;
+	int status;
+
+	lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode,
+				  gfp_flags);
+	if (IS_ERR_OR_NULL(lseg))
+		goto out;
+
+	lo = NFS_I(ino)->layout;
+	fl = FILELAYOUT_LSEG(lseg);
+
+	status = filelayout_check_deviceid(lo, fl, gfp_flags);
+	if (status) {
+		/* treat an unusable deviceid as a layout failure */
+		pnfs_error_mark_layout_for_return(ino, lseg);
+		pnfs_set_lo_fail(lseg);
+		pnfs_put_lseg(lseg);
+		lseg = NULL;
+	}
+out:
+	return lseg;
+}
+
+/*
+ * pg_init hook for reads.  If the descriptor has no layout segment after
+ * pnfs_generic_pg_check_layout(), a whole-file IOMODE_READ segment is
+ * requested.  An error is recorded in pgio->pg_error; if no segment is
+ * obtained the descriptor is reset to read through the MDS.
+ */
+static void
+filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
+			struct nfs_page *req)
+{
+	struct pnfs_layout_segment *lseg;
+
+	pnfs_generic_pg_check_layout(pgio);
+	if (pgio->pg_lseg)
+		return;
+
+	lseg = fl_pnfs_update_layout(pgio->pg_inode,
+				     nfs_req_openctx(req),
+				     0,
+				     NFS4_MAX_UINT64,
+				     IOMODE_READ,
+				     false,
+				     GFP_KERNEL);
+	if (IS_ERR(lseg)) {
+		pgio->pg_error = PTR_ERR(lseg);
+		pgio->pg_lseg = NULL;
+		return;
+	}
+	pgio->pg_lseg = lseg;
+
+	/* If no lseg, fall back to read through mds */
+	if (lseg == NULL)
+		nfs_pageio_reset_read_mds(pgio);
+}
+
+/*
+ * pg_init hook for writes.  If the descriptor has no layout segment
+ * after pnfs_generic_pg_check_layout(), a whole-file IOMODE_RW segment
+ * is requested.  An error is recorded in pgio->pg_error; if no segment
+ * is obtained the descriptor is reset to write through the MDS.
+ */
+static void
+filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
+			 struct nfs_page *req)
+{
+	struct pnfs_layout_segment *lseg;
+
+	pnfs_generic_pg_check_layout(pgio);
+	if (pgio->pg_lseg)
+		return;
+
+	lseg = fl_pnfs_update_layout(pgio->pg_inode,
+				     nfs_req_openctx(req),
+				     0,
+				     NFS4_MAX_UINT64,
+				     IOMODE_RW,
+				     false,
+				     GFP_NOFS);
+	if (IS_ERR(lseg)) {
+		pgio->pg_error = PTR_ERR(lseg);
+		pgio->pg_lseg = NULL;
+		return;
+	}
+	pgio->pg_lseg = lseg;
+
+	/* If no lseg, fall back to write through mds */
+	if (lseg == NULL)
+		nfs_pageio_reset_write_mds(pgio);
+}
+
+/* Page i/o descriptor operations for reads through this layout driver. */
+static const struct nfs_pageio_ops filelayout_pg_read_ops = {
+	.pg_init	= filelayout_pg_init_read,
+	.pg_test	= filelayout_pg_test,
+	.pg_doio	= pnfs_generic_pg_readpages,
+	.pg_cleanup	= pnfs_generic_pg_cleanup,
+};
+
+/* Page i/o descriptor operations for writes through this layout driver. */
+static const struct nfs_pageio_ops filelayout_pg_write_ops = {
+	.pg_init	= filelayout_pg_init_write,
+	.pg_test	= filelayout_pg_test,
+	.pg_doio	= pnfs_generic_pg_writepages,
+	.pg_cleanup	= pnfs_generic_pg_cleanup,
+};
+
+/*
+ * Pick the commit bucket for stripe index @j: dense layouts use @j
+ * itself, sparse layouts use the data server index computed from it.
+ */
+static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
+{
+	if (fl->stripe_type != STRIPE_SPARSE)
+		return j;
+
+	return nfs4_fl_calc_ds_index(&fl->generic_hdr, j);
+}
+
+/*
+ * mark_request_commit hook: queue @req for commit, on the MDS commit
+ * list when the segment commits through the MDS, otherwise in the
+ * data-server bucket selected from the request offset.  @ds_commit_idx
+ * is not used; the bucket is recomputed here.
+ */
+static void
+filelayout_mark_request_commit(struct nfs_page *req,
+			       struct pnfs_layout_segment *lseg,
+			       struct nfs_commit_info *cinfo,
+			       u32 ds_commit_idx)
+
+{
+	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+	u32 stripe, bucket;
+
+	if (fl->commit_through_mds) {
+		nfs_request_add_commit_list(req, cinfo);
+		return;
+	}
+
+	/*
+	 * Note that nfs4_fl_calc_j_index is called on each page that ends
+	 * up being committed to a data server.  An attractive alternative
+	 * is to add a field to nfs_write_data and nfs_page to store the
+	 * value calculated in filelayout_write_pagelist and just use that
+	 * here.
+	 */
+	stripe = nfs4_fl_calc_j_index(lseg, req_offset(req));
+	bucket = select_bucket_index(fl, stripe);
+	pnfs_layout_mark_request_commit(req, lseg, cinfo, bucket);
+}
+
+/*
+ * Turn commit bucket index @i back into a data server index: sparse
+ * layouts use @i as is, dense layouts map it through
+ * nfs4_fl_calc_ds_index().
+ */
+static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
+	if (flseg->stripe_type != STRIPE_SPARSE)
+		return nfs4_fl_calc_ds_index(lseg, i);
+
+	return i;
+}
+
+/*
+ * Select the file handle to use for a commit to bucket @i.  For sparse
+ * layouts a single fh is shared by every bucket, and no fh at all means
+ * NULL is returned so the caller keeps the one it already has.
+ */
+static struct nfs_fh *
+select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
+	if (flseg->stripe_type == STRIPE_SPARSE) {
+		switch (flseg->num_fh) {
+		case 0:
+			/* Use the MDS OPEN fh set in nfs_read_rpcsetup */
+			return NULL;
+		case 1:
+			i = 0;
+			break;
+		default:
+			break;
+		}
+	}
+
+	return flseg->fh_array[i];
+}
+
+/*
+ * Send a COMMIT to the data server behind data->ds_commit_index.
+ *
+ * Returns the result of nfs_initiate_commit(), or -EAGAIN after
+ * preparing the writes for resend and releasing @data when no data
+ * server or RPC client could be obtained.
+ */
+static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
+{
+	struct pnfs_layout_segment *lseg = data->lseg;
+	struct rpc_clnt *ds_clnt;
+	struct nfs4_pnfs_ds *ds;
+	struct nfs_fh *fh;
+	u32 ds_idx;
+
+	ds_idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
+	ds = nfs4_fl_prepare_ds(lseg, ds_idx);
+	if (ds == NULL)
+		goto out_err;
+
+	ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, data->inode);
+	if (IS_ERR(ds_clnt))
+		goto out_err;
+
+	dprintk("%s ino %lu, how %d cl_count %d\n", __func__,
+		data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count));
+
+	data->commit_done_cb = filelayout_commit_done_cb;
+	refcount_inc(&ds->ds_clp->cl_count);
+	data->ds_clp = ds->ds_clp;
+
+	/* A NULL fh leaves the file handle already in data->args in place. */
+	fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
+	if (fh != NULL)
+		data->args.fh = fh;
+
+	return nfs_initiate_commit(ds_clnt, data, NFS_PROTO(data->inode),
+				   &filelayout_commit_call_ops, how,
+				   RPC_TASK_SOFTCONN);
+
+out_err:
+	pnfs_generic_prepare_to_resend_writes(data);
+	pnfs_generic_commit_release(data);
+	return -EAGAIN;
+}
+
+/*
+ * commit_pagelist hook: defer to the generic pNFS implementation, with
+ * filelayout_initiate_commit() as the per-data-server commit initiator.
+ */
+static int
+filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
+			   int how, struct nfs_commit_info *cinfo)
+{
+	int ret;
+
+	ret = pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
+					   filelayout_initiate_commit);
+	return ret;
+}
+
+/*
+ * alloc_deviceid_node hook: build a file layout device address from
+ * @pdev and return its embedded generic node, or NULL on failure.
+ */
+static struct nfs4_deviceid_node *
+filelayout_alloc_deviceid_node(struct nfs_server *server,
+			       struct pnfs_device *pdev, gfp_t gfp_flags)
+{
+	struct nfs4_file_layout_dsaddr *dsaddr =
+		nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags);
+
+	return dsaddr ? &dsaddr->id_node : NULL;
+}
+
+/*
+ * free_deviceid_node hook: free the file layout device address that
+ * embeds the generic node @d.
+ */
+static void
+filelayout_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+	struct nfs4_file_layout_dsaddr *dsaddr;
+
+	dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
+	nfs4_fl_free_deviceid(dsaddr);
+}
+
+/*
+ * alloc_layout_hdr hook: allocate a zeroed file layout header, set up
+ * its data-server commit info and return the embedded generic header
+ * (NULL if the allocation fails).
+ */
+static struct pnfs_layout_hdr *
+filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
+{
+	struct nfs4_filelayout *flo = kzalloc(sizeof(*flo), gfp_flags);
+
+	if (!flo)
+		return NULL;
+
+	pnfs_init_ds_commit_info(&flo->commit_info);
+	flo->commit_info.ops = &filelayout_commit_ops;
+
+	return &flo->generic_hdr;
+}
+
+/*
+ * free_layout_hdr hook: free the file layout header through kfree_rcu(),
+ * using the plh_rcu head of the embedded generic header.
+ */
+static void
+filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct nfs4_filelayout *flo = FILELAYOUT_FROM_HDR(lo);
+
+	kfree_rcu(flo, generic_hdr.plh_rcu);
+}
+
+/*
+ * get_ds_info hook: return the data-server commit info of the inode's
+ * layout, or NULL when the inode has no layout.
+ */
+static struct pnfs_ds_commit_info *
+filelayout_get_ds_info(struct inode *inode)
+{
+	struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
+
+	if (layout != NULL)
+		return &FILELAYOUT_FROM_HDR(layout)->commit_info;
+
+	return NULL;
+}
+
+/*
+ * setup_ds_info hook: allocate a commit array for @lseg and add it to
+ * @fl_cinfo under the inode i_lock.  The array has one bucket per data
+ * server for sparse layouts and one per stripe for dense layouts.  If
+ * pnfs_add_commit_array() returns a different array, the new one is
+ * freed again.  An allocation failure is silently ignored.
+ */
+static void
+filelayout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+			 struct pnfs_layout_segment *lseg)
+{
+	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+	struct inode *inode = lseg->pls_layout->plh_inode;
+	struct pnfs_commit_array *array, *new;
+	unsigned int size;
+
+	if (fl->stripe_type == STRIPE_SPARSE)
+		size = fl->dsaddr->ds_num;
+	else
+		size = fl->dsaddr->stripe_count;
+
+	new = pnfs_alloc_commit_array(size, GFP_NOIO);
+	if (!new)
+		return;
+
+	spin_lock(&inode->i_lock);
+	array = pnfs_add_commit_array(fl_cinfo, new, lseg);
+	spin_unlock(&inode->i_lock);
+
+	if (array != new)
+		pnfs_free_commit_array(new);
+}
+
+/*
+ * release_ds_info hook: destroy the data-server commit info while
+ * holding the inode i_lock.
+ */
+static void
+filelayout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+			   struct inode *inode)
+{
+	spinlock_t *lock = &inode->i_lock;
+
+	spin_lock(lock);
+	pnfs_generic_ds_cinfo_destroy(fl_cinfo);
+	spin_unlock(lock);
+}
+
+/*
+ * Commit operations installed in the commit info of every file layout
+ * header (see filelayout_alloc_layout_hdr()).
+ */
+static const struct pnfs_commit_ops filelayout_commit_ops = {
+	/* implemented by this driver */
+	.setup_ds_info		= filelayout_setup_ds_info,
+	.release_ds_info	= filelayout_release_ds_info,
+	.mark_request_commit	= filelayout_mark_request_commit,
+	.commit_pagelist	= filelayout_commit_pagelist,
+
+	/* generic pNFS implementations */
+	.clear_request_commit	= pnfs_generic_clear_request_commit,
+	.scan_commit_lists	= pnfs_generic_scan_commit_lists,
+	.recover_commit_reqs	= pnfs_generic_recover_commit_reqs,
+	.search_commit_reqs	= pnfs_generic_search_commit_reqs,
+};
+
+/*
+ * Layout driver descriptor registered with the pNFS client at module
+ * init and unregistered at module exit.
+ */
+static struct pnfs_layoutdriver_type filelayout_type = {
+	.id			= LAYOUT_NFSV4_1_FILES,
+	.name			= "LAYOUT_NFSV4_1_FILES",
+	.owner			= THIS_MODULE,
+	.flags			= PNFS_LAYOUTGET_ON_OPEN,
+	.max_layoutget_response	= 4096, /* 1 page or so... */
+
+	/* layout header and segment lifetime */
+	.alloc_layout_hdr	= filelayout_alloc_layout_hdr,
+	.free_layout_hdr	= filelayout_free_layout_hdr,
+	.alloc_lseg		= filelayout_alloc_lseg,
+	.free_lseg		= filelayout_free_lseg,
+
+	/* device ids */
+	.alloc_deviceid_node	= filelayout_alloc_deviceid_node,
+	.free_deviceid_node	= filelayout_free_deviceid_node,
+
+	/* i/o and commit */
+	.pg_read_ops		= &filelayout_pg_read_ops,
+	.pg_write_ops		= &filelayout_pg_write_ops,
+	.read_pagelist		= filelayout_read_pagelist,
+	.write_pagelist		= filelayout_write_pagelist,
+	.get_ds_info		= filelayout_get_ds_info,
+	.sync			= pnfs_nfs_generic_sync,
+};
+
+/*
+ * Module init: log a message and register the layout driver.  Returns
+ * the result of pnfs_register_layoutdriver().
+ */
+static int __init nfs4filelayout_init(void)
+{
+	int ret;
+
+	printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
+	       __func__);
+	ret = pnfs_register_layoutdriver(&filelayout_type);
+	return ret;
+}
+
+/* Module exit: log a message and unregister the layout driver. */
+static void __exit nfs4filelayout_exit(void)
+{
+	printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", __func__);
+
+	pnfs_unregister_layoutdriver(&filelayout_type);
+}
+
+MODULE_ALIAS("nfs-layouttype4-1");
+
+module_init(nfs4filelayout_init);
+module_exit(nfs4filelayout_exit);
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
new file mode 100644
index 000000000..79323b5da
--- /dev/null
+++ b/fs/nfs/filelayout/filelayout.h
@@ -0,0 +1,118 @@
+/*
+ * NFSv4 file layout driver data structures.
+ *
+ * Copyright (c) 2002
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
+ * Dean Hildebrand <dhildebz@umich.edu>
+ *
+ * Permission is granted to use, copy, create derivative works, and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the University of Michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. If
+ * the above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * This software is provided as is, without representation or warranty
+ * of any kind either express or implied, including without limitation
+ * the implied warranties of merchantability, fitness for a particular
+ * purpose, or noninfringement. The Regents of the University of
+ * Michigan shall not be liable for any damages, including special,
+ * indirect, incidental, or consequential damages, with respect to any
+ * claim arising out of or in connection with the use of the software,
+ * even if it has been or is hereafter advised of the possibility of
+ * such damages.
+ */
+
+#ifndef FS_NFS_NFS4FILELAYOUT_H
+#define FS_NFS_NFS4FILELAYOUT_H
+
+#include "../pnfs.h"
+
+/*
+ * Field testing shows we need to support up to 4096 stripe indices.
+ * We store each index as a u8 (u32 on the wire) to keep the memory footprint
+ * reasonable. This in turn means we support a maximum of 256
+ * RFC 5661 multipath_list4 structures.
+ */
+#define NFS4_PNFS_MAX_STRIPE_CNT 4096
+#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */
+
+/*
+ * Stripe packing type of a file layout segment (compared against
+ * nfs4_filelayout_segment.stripe_type).
+ */
+enum stripetype4 {
+ STRIPE_SPARSE = 1,
+ STRIPE_DENSE = 2
+};
+
+/*
+ * Decoded device address of a file layout, as built by
+ * nfs4_fl_alloc_deviceid_node().
+ */
+struct nfs4_file_layout_dsaddr {
+ struct nfs4_deviceid_node id_node;
+ u32 stripe_count; /* number of entries in stripe_indices */
+ u8 *stripe_indices; /* per-stripe index into ds_list */
+ u32 ds_num; /* number of entries in ds_list */
+ struct nfs4_pnfs_ds *ds_list[1]; /* allocated with room for ds_num entries */
+};
+
+/*
+ * File layout specific layout segment; embeds the generic segment header
+ * so FILELAYOUT_LSEG() can recover it with container_of().
+ */
+struct nfs4_filelayout_segment {
+ struct pnfs_layout_segment generic_hdr;
+ u32 stripe_type; /* compared against enum stripetype4 values */
+ u32 commit_through_mds;
+ u32 stripe_unit; /* divisor used by nfs4_fl_calc_j_index() */
+ u32 first_stripe_index; /* added to the stripe number before the modulo */
+ u64 pattern_offset; /* subtracted from the file offset before striping */
+ struct nfs4_deviceid deviceid;
+ struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
+ unsigned int num_fh; /* presumably the number of entries in fh_array - confirm */
+ struct nfs_fh **fh_array;
+};
+
+/*
+ * File layout specific layout header; embeds the generic header so
+ * FILELAYOUT_FROM_HDR() can recover it with container_of().
+ */
+struct nfs4_filelayout {
+ struct pnfs_layout_hdr generic_hdr;
+ struct pnfs_ds_commit_info commit_info;
+};
+
+/* Map a generic layout header back to its enclosing nfs4_filelayout. */
+static inline struct nfs4_filelayout *
+FILELAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo)
+{
+ return container_of(lo, struct nfs4_filelayout, generic_hdr);
+}
+
+/* Map a generic layout segment back to its enclosing file layout segment. */
+static inline struct nfs4_filelayout_segment *
+FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
+{
+ return container_of(lseg,
+ struct nfs4_filelayout_segment,
+ generic_hdr);
+}
+
+/*
+ * Return the deviceid node embedded in the segment's dsaddr.  The dsaddr
+ * pointer is dereferenced only to take a member address; no NULL check
+ * is done here.
+ */
+static inline struct nfs4_deviceid_node *
+FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg)
+{
+ return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node;
+}
+
+/* True if the NFS_DEVICEID_INVALID flag is set on @node. */
+static inline bool
+filelayout_test_devid_invalid(struct nfs4_deviceid_node *node)
+{
+ return test_bit(NFS_DEVICEID_INVALID, &node->flags);
+}
+
+extern bool
+filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node);
+
+extern struct nfs_fh *
+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
+
+u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
+u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
+struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
+ u32 ds_idx);
+
+extern struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_flags);
+extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
+extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
+
+#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
new file mode 100644
index 000000000..d913e8188
--- /dev/null
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -0,0 +1,305 @@
+/*
+ * Device operations for the pnfs nfs4 file layout driver.
+ *
+ * Copyright (c) 2002
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
+ * Dean Hildebrand <dhildebz@umich.edu>
+ * Garth Goodson <Garth.Goodson@netapp.com>
+ *
+ * Permission is granted to use, copy, create derivative works, and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the University of Michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. If
+ * the above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * This software is provided as is, without representation or warranty
+ * of any kind either express or implied, including without limitation
+ * the implied warranties of merchantability, fitness for a particular
+ * purpose, or noninfringement. The Regents of the University of
+ * Michigan shall not be liable for any damages, including special,
+ * indirect, incidental, or consequential damages, with respect to any
+ * claim arising out of or in connection with the use of the software,
+ * even if it has been or is hereafter advised of the possibility of
+ * such damages.
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+
+#include "../internal.h"
+#include "../nfs4session.h"
+#include "filelayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
+static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
+
+/*
+ * Release a file-layout device address: drop the reference held on every
+ * populated data server slot, free the stripe index table, then free the
+ * dsaddr itself after an RCU grace period.
+ */
+void
+nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+	int slot;
+
+	nfs4_print_deviceid(&dsaddr->id_node.deviceid);
+
+	for (slot = 0; slot < dsaddr->ds_num; slot++) {
+		struct nfs4_pnfs_ds *server = dsaddr->ds_list[slot];
+
+		/* slots may be empty if decoding stopped part-way */
+		if (!server)
+			continue;
+		nfs4_pnfs_ds_put(server);
+	}
+	kfree(dsaddr->stripe_indices);
+	kfree_rcu(dsaddr, id_node.rcu);
+}
+
+/*
+ * Decode opaque device data and return the result.
+ *
+ * The XDR body in @pdev holds a stripe index table followed by a list of
+ * multipath data server address lists.  On success a new dsaddr is
+ * returned that owns the stripe index table and one data server per
+ * multipath list.
+ *
+ * Returns NULL if an allocation fails, the XDR data is short, a count
+ * exceeds the supported maximum, a stripe index is out of range, or no
+ * usable address is found for a data server.
+ */
+struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+		gfp_t gfp_flags)
+{
+	int i;
+	u32 cnt, num;
+	u8 *indexp;
+	__be32 *p;
+	u8 *stripe_indices;
+	u8 max_stripe_index;
+	struct nfs4_file_layout_dsaddr *dsaddr = NULL;
+	struct xdr_stream stream;
+	struct xdr_buf buf;
+	struct page *scratch;
+	struct list_head dsaddrs;
+	struct nfs4_pnfs_ds_addr *da;
+
+	/* set up xdr stream */
+	scratch = alloc_page(gfp_flags);
+	if (!scratch)
+		goto out_err;
+
+	xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
+	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+	/* Get the stripe count (number of stripe index) */
+	p = xdr_inline_decode(&stream, 4);
+	if (unlikely(!p))
+		goto out_err_free_scratch;
+
+	cnt = be32_to_cpup(p);
+	dprintk("%s stripe count %d\n", __func__, cnt);
+	if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
+		printk(KERN_WARNING "NFS: %s: stripe count %d greater than "
+		       "supported maximum %d\n", __func__,
+			cnt, NFS4_PNFS_MAX_STRIPE_CNT);
+		goto out_err_free_scratch;
+	}
+
+	/* read stripe indices */
+	stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags);
+	if (!stripe_indices)
+		goto out_err_free_scratch;
+
+	p = xdr_inline_decode(&stream, cnt << 2);
+	if (unlikely(!p))
+		goto out_err_free_stripe_indices;
+
+	indexp = &stripe_indices[0];
+	max_stripe_index = 0;
+	for (i = 0; i < cnt; i++) {
+		u32 idx = be32_to_cpup(p++);
+
+		/*
+		 * Indices travel as u32 but are stored as u8.  Reject any
+		 * value that would be truncated: it could otherwise alias a
+		 * valid index and slip past the "< num" check below.
+		 */
+		if (idx >= NFS4_PNFS_MAX_MULTI_CNT) {
+			printk(KERN_WARNING "NFS: %s: stripe index %u greater "
+			       "than supported maximum %d\n", __func__,
+				idx, NFS4_PNFS_MAX_MULTI_CNT - 1);
+			goto out_err_free_stripe_indices;
+		}
+		*indexp = idx;
+		max_stripe_index = max(max_stripe_index, *indexp);
+		indexp++;
+	}
+
+	/* Check the multipath list count */
+	p = xdr_inline_decode(&stream, 4);
+	if (unlikely(!p))
+		goto out_err_free_stripe_indices;
+
+	num = be32_to_cpup(p);
+	dprintk("%s ds_num %u\n", __func__, num);
+	if (num > NFS4_PNFS_MAX_MULTI_CNT) {
+		printk(KERN_WARNING "NFS: %s: multipath count %d greater than "
+			"supported maximum %d\n", __func__,
+			num, NFS4_PNFS_MAX_MULTI_CNT);
+		goto out_err_free_stripe_indices;
+	}
+
+	/* validate stripe indices are all < num (this also rejects num == 0) */
+	if (max_stripe_index >= num) {
+		printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n",
+			__func__, max_stripe_index, num);
+		goto out_err_free_stripe_indices;
+	}
+
+	/* ds_list[] is declared with one slot; allocate the other num - 1 */
+	dsaddr = kzalloc(sizeof(*dsaddr) +
+			(sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
+			gfp_flags);
+	if (!dsaddr)
+		goto out_err_free_stripe_indices;
+
+	dsaddr->stripe_count = cnt;
+	/* ownership of the index table moves to dsaddr */
+	dsaddr->stripe_indices = stripe_indices;
+	stripe_indices = NULL;
+	dsaddr->ds_num = num;
+	nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);
+
+	INIT_LIST_HEAD(&dsaddrs);
+
+	for (i = 0; i < dsaddr->ds_num; i++) {
+		int j;
+		u32 mp_count;
+
+		p = xdr_inline_decode(&stream, 4);
+		if (unlikely(!p))
+			goto out_err_free_deviceid;
+
+		mp_count = be32_to_cpup(p); /* multipath count */
+		for (j = 0; j < mp_count; j++) {
+			/* addresses that fail to decode are skipped */
+			da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
+						    &stream, gfp_flags);
+			if (da)
+				list_add_tail(&da->da_node, &dsaddrs);
+		}
+		if (list_empty(&dsaddrs)) {
+			dprintk("%s: no suitable DS addresses found\n",
+				__func__);
+			goto out_err_free_deviceid;
+		}
+
+		dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
+		if (!dsaddr->ds_list[i])
+			goto out_err_drain_dsaddrs;
+
+		/* If DS was already in cache, free ds addrs */
+		while (!list_empty(&dsaddrs)) {
+			da = list_first_entry(&dsaddrs,
+					      struct nfs4_pnfs_ds_addr,
+					      da_node);
+			list_del_init(&da->da_node);
+			kfree(da->da_remotestr);
+			kfree(da);
+		}
+	}
+
+	__free_page(scratch);
+	return dsaddr;
+
+out_err_drain_dsaddrs:
+	while (!list_empty(&dsaddrs)) {
+		da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
+				      da_node);
+		list_del_init(&da->da_node);
+		kfree(da->da_remotestr);
+		kfree(da);
+	}
+out_err_free_deviceid:
+	nfs4_fl_free_deviceid(dsaddr);
+	/* stripe_indices was part of dsaddr */
+	goto out_err_free_scratch;
+out_err_free_stripe_indices:
+	kfree(stripe_indices);
+out_err_free_scratch:
+	__free_page(scratch);
+out_err:
+	dprintk("%s ERROR: returning NULL\n", __func__);
+	return NULL;
+}
+
+/* Drop a reference on the deviceid node embedded in @dsaddr. */
+void
+nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+ nfs4_put_deviceid_node(&dsaddr->id_node);
+}
+
+/*
+ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
+ * Then: ((res + fsi) % dsaddr->stripe_count)
+ *
+ * do_div() divides its first argument in place and returns the remainder,
+ * so the first call leaves the stripe number in tmp and the second call
+ * yields the modulo that is returned.  Neither divisor is checked for
+ * zero here.
+ */
+u32
+nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+ u64 tmp;
+
+ tmp = offset - flseg->pattern_offset;
+ do_div(tmp, flseg->stripe_unit);
+ tmp += flseg->first_stripe_index;
+ return do_div(tmp, flseg->dsaddr->stripe_count);
+}
+
+/*
+ * Look up stripe index @j in the device's stripe index table.
+ * @j is not range-checked here.
+ */
+u32
+nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
+{
+ return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
+}
+
+/*
+ * Pick the filehandle to use for stripe index @j.
+ *
+ * For any stripe type other than STRIPE_SPARSE, fh_array is indexed by @j
+ * directly.  For sparse layouts the choice depends on how many filehandles
+ * the layout carries: none means "use the MDS filehandle" (NULL is
+ * returned), one means that single filehandle is shared, otherwise the
+ * data server index for @j selects the entry.
+ */
+struct nfs_fh *
+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
+{
+	struct nfs4_filelayout_segment *seg = FILELAYOUT_LSEG(lseg);
+	u32 fh_idx = j;
+
+	if (seg->stripe_type == STRIPE_SPARSE) {
+		switch (seg->num_fh) {
+		case 0:
+			/* Use the MDS OPEN fh set in nfs_read_rpcsetup */
+			return NULL;
+		case 1:
+			fh_idx = 0;
+			break;
+		default:
+			fh_idx = nfs4_fl_calc_ds_index(lseg, j);
+			break;
+		}
+	}
+	return seg->fh_array[fh_idx];
+}
+
+/*
+ * Upon return, either ds is connected, or ds is NULL
+ *
+ * Looks up data server slot @ds_idx of the segment's device and connects
+ * to it if it has no client yet.  @ds_idx is not range-checked here.
+ */
+struct nfs4_pnfs_ds *
+nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
+{
+ struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
+ struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
+ struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
+ struct nfs4_pnfs_ds *ret = ds;
+ struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
+ int status;
+
+ /* empty slot: invalidate the device and return NULL (ret == ds) */
+ if (ds == NULL) {
+ printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
+ __func__, ds_idx);
+ pnfs_generic_mark_devid_invalid(devid);
+ goto out;
+ }
+ /* NOTE(review): presumably pairs with a write barrier where ds_clp is set - confirm */
+ smp_rmb();
+ /* already has a client: skip the connect step */
+ if (ds->ds_clp)
+ goto out_test_devid;
+
+ status = nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
+ dataserver_retrans, 4,
+ s->nfs_client->cl_minorversion);
+ if (status) {
+ nfs4_mark_deviceid_unavailable(devid);
+ ret = NULL;
+ goto out;
+ }
+
+out_test_devid:
+ /* still no client, or device flagged unavailable: report failure */
+ if (ret->ds_clp == NULL ||
+ filelayout_test_devid_unavailable(devid))
+ ret = NULL;
+out:
+ return ret;
+}
+
+/*
+ * Module parameters (mode 0644) for the timeout and retransmit values
+ * that nfs4_fl_prepare_ds() passes to nfs4_pnfs_ds_connect().
+ */
+module_param(dataserver_retrans, uint, 0644);
+MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
+ "retries a request before it attempts further "
+ " recovery action.");
+module_param(dataserver_timeo, uint, 0644);
+MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
+ "NFSv4.1 client waits for a response from a "
+ " data server before it retries an NFS request.");
diff --git a/fs/nfs/flexfilelayout/Makefile b/fs/nfs/flexfilelayout/Makefile
new file mode 100644
index 000000000..49f03422b
--- /dev/null
+++ b/fs/nfs/flexfilelayout/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the pNFS Flexfile Layout Driver kernel module
+#
+obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += nfs_layout_flexfiles.o
+nfs_layout_flexfiles-y := flexfilelayout.o flexfilelayoutdev.o
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
new file mode 100644
index 000000000..e4f2820ba
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -0,0 +1,2545 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Module for pnfs flexfile layout driver.
+ *
+ * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
+ *
+ * Tao Peng <bergwolf@primarydata.com>
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs_page.h>
+#include <linux/module.h>
+#include <linux/sched/mm.h>
+
+#include <linux/sunrpc/metrics.h>
+
+#include "flexfilelayout.h"
+#include "../nfs4session.h"
+#include "../nfs4idmap.h"
+#include "../internal.h"
+#include "../delegation.h"
+#include "../nfs4trace.h"
+#include "../iostat.h"
+#include "../nfs.h"
+#include "../nfs42.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ)
+#define FF_LAYOUTRETURN_MAXERR 20
+
+static unsigned short io_maxretrans;
+
+static const struct pnfs_commit_ops ff_layout_commit_ops;
+static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr);
+static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
+ struct nfs42_layoutstat_devinfo *devinfo,
+ int dev_limit);
+static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
+ const struct nfs42_layoutstat_devinfo *devinfo,
+ struct nfs4_ff_layout_mirror *mirror);
+
+/*
+ * Allocate and initialise a flexfile layout header.
+ *
+ * Returns the embedded generic header, or NULL if the allocation fails.
+ */
+static struct pnfs_layout_hdr *
+ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
+{
+	struct nfs4_flexfile_layout *layout;
+
+	layout = kzalloc(sizeof(*layout), gfp_flags);
+	if (!layout)
+		return NULL;
+
+	pnfs_init_ds_commit_info(&layout->commit_info);
+	INIT_LIST_HEAD(&layout->error_list);
+	INIT_LIST_HEAD(&layout->mirrors);
+	layout->last_report_time = ktime_get();
+	layout->commit_info.ops = &ff_layout_commit_ops;
+
+	return &layout->generic_hdr;
+}
+
+/*
+ * Free a flexfile layout header: release every entry still queued on its
+ * error list, then free the header itself after an RCU grace period.
+ */
+static void
+ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(lo);
+ struct nfs4_ff_layout_ds_err *err, *n;
+
+ /* _safe iteration: entries are unlinked and freed while walking */
+ list_for_each_entry_safe(err, n, &ffl->error_list, list) {
+ list_del(&err->list);
+ kfree(err);
+ }
+ kfree_rcu(ffl, generic_hdr.plh_rcu);
+}
+
+/*
+ * Decode NFS4_STATEID_SIZE bytes from @xdr into @stateid and tag it as a
+ * pNFS data server stateid.
+ *
+ * Returns 0 on success or -ENOBUFS if the stream is too short.
+ */
+static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
+ if (unlikely(p == NULL))
+ return -ENOBUFS;
+ stateid->type = NFS4_PNFS_DS_STATEID_TYPE;
+ memcpy(stateid->data, p, NFS4_STATEID_SIZE);
+ dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
+ p[0], p[1], p[2], p[3]);
+ return 0;
+}
+
+/*
+ * Copy NFS4_DEVICEID4_SIZE bytes from @xdr into @devid.
+ *
+ * Returns 0 on success or -ENOBUFS if the stream is too short.
+ */
+static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ memcpy(devid, p, NFS4_DEVICEID4_SIZE);
+ nfs4_print_deviceid(devid);
+ return 0;
+}
+
+/*
+ * Decode a length-prefixed filehandle from @xdr into @fh.
+ *
+ * Returns 0 on success, -ENOBUFS if the stream is too short, or
+ * -EOVERFLOW if the advertised length exceeds NFS_MAXFHSIZE.
+ */
+static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ fh->size = be32_to_cpup(p++);
+ /* bound the length before it is used for the copy below */
+ if (fh->size > NFS_MAXFHSIZE) {
+ printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
+ fh->size);
+ return -EOVERFLOW;
+ }
+ /* fh.data */
+ p = xdr_inline_decode(xdr, fh->size);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ memcpy(&fh->data, p, fh->size);
+ dprintk("%s: fh len %d\n", __func__, fh->size);
+
+ return 0;
+}
+
+/*
+ * Currently only stringified uids and gids are accepted.
+ * I.e., kerberos is not supported to the DSes, so no principals.
+ *
+ * That means that one common function will suffice, but when
+ * principals are added, this should be split to accommodate
+ * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
+ *
+ * Returns 0 with the numeric value stored in @id, -ENOBUFS if the stream
+ * is too short, or -EINVAL if the length is negative or the string cannot
+ * be mapped to a number.
+ */
+static int
+decode_name(struct xdr_stream *xdr, u32 *id)
+{
+ __be32 *p;
+ int len;
+
+ /* opaque_length(4)*/
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ len = be32_to_cpup(p++);
+ /* len is signed, so wire values above INT_MAX show up as negative */
+ if (len < 0)
+ return -EINVAL;
+
+ dprintk("%s: len %u\n", __func__, len);
+
+ /* opaque body */
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return -ENOBUFS;
+
+ if (!nfs_map_string_to_numeric((char *)p, len, id))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Return true when both mirrors carry the same number of filehandle
+ * versions and every filehandle of @m1 has an equal counterpart somewhere
+ * in @m2 (order does not matter).
+ */
+static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
+		const struct nfs4_ff_layout_mirror *m2)
+{
+	int a, b;
+
+	if (m1->fh_versions_cnt != m2->fh_versions_cnt)
+		return false;
+
+	for (a = 0; a < m1->fh_versions_cnt; a++) {
+		for (b = 0; b < m2->fh_versions_cnt; b++) {
+			if (!nfs_compare_fh(&m1->fh_versions[a],
+					    &m2->fh_versions[b]))
+				break;
+		}
+		/* the scan ran off the end: nothing in m2 matched */
+		if (!(b < m2->fh_versions_cnt))
+			return false;
+	}
+	return true;
+}
+
+/*
+ * Attach @mirror to layout @lo, or reuse an equivalent one already there.
+ *
+ * Under the inode's i_lock, the layout's mirror list is searched for an
+ * entry with the same device id and the same filehandle set.  If one is
+ * found and a reference can be taken, it is returned and @mirror is left
+ * untouched (the caller compares the return value with @mirror).
+ * Otherwise @mirror is linked into the list and returned.
+ */
+static struct nfs4_ff_layout_mirror *
+ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
+ struct nfs4_ff_layout_mirror *mirror)
+{
+ struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
+ struct nfs4_ff_layout_mirror *pos;
+ struct inode *inode = lo->plh_inode;
+
+ spin_lock(&inode->i_lock);
+ list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
+ if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0)
+ continue;
+ if (!ff_mirror_match_fh(mirror, pos))
+ continue;
+ /* a zero refcount means the entry is going away: keep looking */
+ if (refcount_inc_not_zero(&pos->ref)) {
+ spin_unlock(&inode->i_lock);
+ return pos;
+ }
+ }
+ list_add(&mirror->mirrors, &ff_layout->mirrors);
+ mirror->layout = lo;
+ spin_unlock(&inode->i_lock);
+ return mirror;
+}
+
+/*
+ * Unlink @mirror from its layout's mirror list under the inode's i_lock.
+ * Does nothing if the mirror is not attached to a layout.
+ */
+static void
+ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+ struct inode *inode;
+ if (mirror->layout == NULL)
+ return;
+ inode = mirror->layout->plh_inode;
+ spin_lock(&inode->i_lock);
+ list_del(&mirror->mirrors);
+ spin_unlock(&inode->i_lock);
+ mirror->layout = NULL;
+}
+
+/*
+ * Allocate a zeroed mirror with its lock and list head initialised and a
+ * reference count of one.  Returns NULL if the allocation fails.
+ */
+static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
+{
+	struct nfs4_ff_layout_mirror *m = kzalloc(sizeof(*m), gfp_flags);
+
+	if (m == NULL)
+		return NULL;
+
+	spin_lock_init(&m->lock);
+	refcount_set(&m->ref, 1);
+	INIT_LIST_HEAD(&m->mirrors);
+	return m;
+}
+
+/*
+ * Tear down a mirror: unlink it from its layout, free the filehandle
+ * array, drop both credentials and the device reference, then free the
+ * mirror itself.
+ */
+static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+ const struct cred *cred;
+
+ ff_layout_remove_mirror(mirror);
+ kfree(mirror->fh_versions);
+ cred = rcu_access_pointer(mirror->ro_cred);
+ put_cred(cred);
+ cred = rcu_access_pointer(mirror->rw_cred);
+ put_cred(cred);
+ nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
+ kfree(mirror);
+}
+
+/*
+ * Drop one reference on @mirror and free it when that was the last one.
+ * A NULL @mirror is ignored.
+ */
+static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+	if (mirror == NULL)
+		return;
+	if (!refcount_dec_and_test(&mirror->ref))
+		return;
+	ff_layout_free_mirror(mirror);
+}
+
+/*
+ * Drop the segment's reference on each mirror in its array.  Slots that
+ * are still NULL are tolerated by ff_layout_put_mirror().
+ */
+static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
+{
+ u32 i;
+
+ for (i = 0; i < fls->mirror_array_cnt; i++)
+ ff_layout_put_mirror(fls->mirror_array[i]);
+}
+
+/*
+ * Release a flexfile layout segment together with its mirror references.
+ * A NULL @fls is ignored.
+ */
+static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
+{
+	if (!fls)
+		return;
+
+	ff_layout_free_mirror_array(fls);
+	kfree(fls);
+}
+
+/*
+ * Return true when segments @l1 and @l2 reference exactly the same
+ * mirrors, in the same order.  Mirrors are compared by pointer.
+ */
+static bool
+ff_lseg_match_mirrors(struct pnfs_layout_segment *l1,
+		struct pnfs_layout_segment *l2)
+{
+	const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1);
+	/* must be derived from l2; using l1 here compared l1 with itself */
+	const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2);
+	u32 i;
+
+	if (fl1->mirror_array_cnt != fl2->mirror_array_cnt)
+		return false;
+	for (i = 0; i < fl1->mirror_array_cnt; i++) {
+		if (fl1->mirror_array[i] != fl2->mirror_array[i])
+			return false;
+	}
+	return true;
+}
+
+/*
+ * Ordering predicate passed to pnfs_generic_layout_insert_lseg(): should
+ * range @l1 be placed after range @l2?
+ *
+ * Ranges with different iomodes are ordered by iomode alone (a non-READ
+ * @l1 sorts after).  Otherwise disjoint ranges are ordered by position,
+ * and overlapping ranges by start offset.
+ */
+static bool
+ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
+{
+ u64 end1, end2;
+
+ if (l1->iomode != l2->iomode)
+ return l1->iomode != IOMODE_READ;
+ end1 = pnfs_calc_offset_end(l1->offset, l1->length);
+ end2 = pnfs_calc_offset_end(l2->offset, l2->length);
+ /* l1 ends before l2 starts */
+ if (end1 < l2->offset)
+ return false;
+ /* l2 ends before l1 starts */
+ if (end2 < l1->offset)
+ return true;
+ return l2->offset <= l1->offset;
+}
+
+/*
+ * Merge callback passed to pnfs_generic_layout_insert_lseg(): try to fold
+ * the range of @old into @new.
+ *
+ * Returns false (no merge) if @old is marked for layoutreturn, the iomodes
+ * differ, the ranges leave a gap between them, or the mirror sets differ.
+ * On success @new's range is updated, NFS_LSEG_ROC is propagated from
+ * @old, and true is returned.
+ */
+static bool
+ff_lseg_merge(struct pnfs_layout_segment *new,
+ struct pnfs_layout_segment *old)
+{
+ u64 new_end, old_end;
+
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
+ return false;
+ if (new->pls_range.iomode != old->pls_range.iomode)
+ return false;
+ old_end = pnfs_calc_offset_end(old->pls_range.offset,
+ old->pls_range.length);
+ if (old_end < new->pls_range.offset)
+ return false;
+ new_end = pnfs_calc_offset_end(new->pls_range.offset,
+ new->pls_range.length);
+ if (new_end < old->pls_range.offset)
+ return false;
+ if (!ff_lseg_match_mirrors(new, old))
+ return false;
+
+ /* Mergeable: copy info from 'old' to 'new' */
+ if (new_end < old_end)
+ new_end = old_end;
+ /*
+ * NOTE(review): this keeps the larger of the two start offsets, so the
+ * merged range can start later than 'old' or 'new' did.  A union would
+ * keep the smaller one - confirm this is intended.
+ */
+ if (new->pls_range.offset < old->pls_range.offset)
+ new->pls_range.offset = old->pls_range.offset;
+ new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset,
+ new_end);
+ if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
+ set_bit(NFS_LSEG_ROC, &new->pls_flags);
+ return true;
+}
+
+/*
+ * Insert @lseg into layout @lo using the flexfile ordering and merge
+ * callbacks.  @free_me is passed through to the generic helper.
+ */
+static void
+ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ struct list_head *free_me)
+{
+ pnfs_generic_layout_insert_lseg(lo, lseg,
+ ff_lseg_range_is_after,
+ ff_lseg_merge,
+ free_me);
+}
+
+/*
+ * Sort the segment's mirror array in place by descending efficiency,
+ * using a simple O(n^2) exchange sort.
+ *
+ * The loop bound is mirror_array_cnt - 1; the only caller in view rejects
+ * a zero count before getting here.
+ */
+static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
+{
+ int i, j;
+
+ for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
+ for (j = i + 1; j < fls->mirror_array_cnt; j++)
+ if (fls->mirror_array[i]->efficiency <
+ fls->mirror_array[j]->efficiency)
+ swap(fls->mirror_array[i],
+ fls->mirror_array[j]);
+ }
+}
+
+/*
+ * Decode a flexfile LAYOUTGET result into a new layout segment.
+ *
+ * The XDR body holds the stripe unit and mirror count, then for each
+ * mirror: data server count (must be 1), device id, efficiency, stateid,
+ * a list of filehandles, and the user and group to use for I/O.  Two
+ * trailing words (flags and stats report interval) are read if present;
+ * if the stream ends before them the segment is still returned.
+ *
+ * Each decoded mirror is offered to ff_layout_add_mirror(); if the layout
+ * already has an equivalent mirror, that one is used and the new one is
+ * freed.
+ *
+ * Returns the embedded generic segment on success, or ERR_PTR(-ENOMEM),
+ * ERR_PTR(-EIO) or the error returned by one of the decode_* helpers.
+ */
+static struct pnfs_layout_segment *
+ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
+ struct nfs4_layoutget_res *lgr,
+ gfp_t gfp_flags)
+{
+ struct pnfs_layout_segment *ret;
+ struct nfs4_ff_layout_segment *fls = NULL;
+ struct xdr_stream stream;
+ struct xdr_buf buf;
+ struct page *scratch;
+ u64 stripe_unit;
+ u32 mirror_array_cnt;
+ __be32 *p;
+ int i, rc;
+
+ dprintk("--> %s\n", __func__);
+ scratch = alloc_page(gfp_flags);
+ if (!scratch)
+ return ERR_PTR(-ENOMEM);
+
+ xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
+ lgr->layoutp->len);
+ xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+ /* stripe unit and mirror_array_cnt */
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 8 + 4);
+ if (!p)
+ goto out_err_free;
+
+ p = xdr_decode_hyper(p, &stripe_unit);
+ mirror_array_cnt = be32_to_cpup(p++);
+ dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__,
+ stripe_unit, mirror_array_cnt);
+
+ /* bound the count before it sizes the allocation below */
+ if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT ||
+ mirror_array_cnt == 0)
+ goto out_err_free;
+
+ rc = -ENOMEM;
+ fls = kzalloc(struct_size(fls, mirror_array, mirror_array_cnt),
+ gfp_flags);
+ if (!fls)
+ goto out_err_free;
+
+ fls->mirror_array_cnt = mirror_array_cnt;
+ fls->stripe_unit = stripe_unit;
+
+ for (i = 0; i < fls->mirror_array_cnt; i++) {
+ struct nfs4_ff_layout_mirror *mirror;
+ struct cred *kcred;
+ const struct cred __rcu *cred;
+ kuid_t uid;
+ kgid_t gid;
+ u32 ds_count, fh_count, id;
+ int j;
+
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ ds_count = be32_to_cpup(p);
+
+ /* FIXME: allow for striping? */
+ if (ds_count != 1)
+ goto out_err_free;
+
+ fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
+ if (fls->mirror_array[i] == NULL) {
+ rc = -ENOMEM;
+ goto out_err_free;
+ }
+
+ fls->mirror_array[i]->ds_count = ds_count;
+
+ /* deviceid */
+ rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid);
+ if (rc)
+ goto out_err_free;
+
+ /* efficiency */
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ fls->mirror_array[i]->efficiency = be32_to_cpup(p);
+
+ /* stateid */
+ rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid);
+ if (rc)
+ goto out_err_free;
+
+ /* fh */
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ fh_count = be32_to_cpup(p);
+
+ /* NOTE(review): fh_count comes from the wire and is not bounded here */
+ fls->mirror_array[i]->fh_versions =
+ kcalloc(fh_count, sizeof(struct nfs_fh),
+ gfp_flags);
+ if (fls->mirror_array[i]->fh_versions == NULL) {
+ rc = -ENOMEM;
+ goto out_err_free;
+ }
+
+ for (j = 0; j < fh_count; j++) {
+ rc = decode_nfs_fh(&stream,
+ &fls->mirror_array[i]->fh_versions[j]);
+ if (rc)
+ goto out_err_free;
+ }
+
+ fls->mirror_array[i]->fh_versions_cnt = fh_count;
+
+ /* user */
+ rc = decode_name(&stream, &id);
+ if (rc)
+ goto out_err_free;
+
+ uid = make_kuid(&init_user_ns, id);
+
+ /* group */
+ rc = decode_name(&stream, &id);
+ if (rc)
+ goto out_err_free;
+
+ gid = make_kgid(&init_user_ns, id);
+
+ /* when the caller's gfp mask lacks __GFP_FS, allocate the cred in a NOFS scope */
+ if (gfp_flags & __GFP_FS)
+ kcred = prepare_kernel_cred(NULL);
+ else {
+ unsigned int nofs_flags = memalloc_nofs_save();
+ kcred = prepare_kernel_cred(NULL);
+ memalloc_nofs_restore(nofs_flags);
+ }
+ rc = -ENOMEM;
+ if (!kcred)
+ goto out_err_free;
+ kcred->fsuid = uid;
+ kcred->fsgid = gid;
+ cred = RCU_INITIALIZER(kcred);
+
+ /* READ layouts fill ro_cred, every other iomode fills rw_cred */
+ if (lgr->range.iomode == IOMODE_READ)
+ rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
+ else
+ rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
+
+ mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
+ if (mirror != fls->mirror_array[i]) {
+ /* swap cred ptrs so free_mirror will clean up old */
+ if (lgr->range.iomode == IOMODE_READ) {
+ cred = xchg(&mirror->ro_cred, cred);
+ rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
+ } else {
+ cred = xchg(&mirror->rw_cred, cred);
+ rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
+ }
+ ff_layout_free_mirror(fls->mirror_array[i]);
+ fls->mirror_array[i] = mirror;
+ }
+
+ dprintk("%s: iomode %s uid %u gid %u\n", __func__,
+ lgr->range.iomode == IOMODE_READ ? "READ" : "RW",
+ from_kuid(&init_user_ns, uid),
+ from_kgid(&init_user_ns, gid));
+ }
+
+ /* trailing fields are optional: a short stream is not an error here */
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_sort_mirrors;
+ fls->flags = be32_to_cpup(p);
+
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_sort_mirrors;
+ /* a single report interval is applied to every mirror */
+ for (i=0; i < fls->mirror_array_cnt; i++)
+ fls->mirror_array[i]->report_interval = be32_to_cpup(p);
+
+out_sort_mirrors:
+ ff_layout_sort_mirrors(fls);
+ ret = &fls->generic_hdr;
+ dprintk("<-- %s (success)\n", __func__);
+out_free_page:
+ __free_page(scratch);
+ return ret;
+out_err_free:
+ /* fls may be NULL or partially filled; the free helper copes with both */
+ _ff_layout_free_lseg(fls);
+ ret = ERR_PTR(rc);
+ dprintk("<-- %s (%d)\n", __func__, rc);
+ goto out_free_page;
+}
+
+/*
+ * Free a flexfile layout segment.  For IOMODE_RW segments the layout's
+ * commit info is first released for this segment under the inode's
+ * i_lock.
+ */
+static void
+ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+
+ dprintk("--> %s\n", __func__);
+
+ if (lseg->pls_range.iomode == IOMODE_RW) {
+ struct nfs4_flexfile_layout *ffl;
+ struct inode *inode;
+
+ ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
+ inode = ffl->generic_hdr.plh_inode;
+ spin_lock(&inode->i_lock);
+ pnfs_generic_ds_cinfo_release_lseg(&ffl->commit_info, lseg);
+ spin_unlock(&inode->i_lock);
+ }
+ _ff_layout_free_lseg(fls);
+}
+
+/*
+ * Count one more outstanding operation on @timer; when it is the only one
+ * outstanding, record @now as the start of the busy period.
+ */
+static void
+nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
+{
+ /* first IO request? */
+ if (atomic_inc_return(&timer->n_ops) == 1) {
+ timer->start_time = now;
+ }
+}
+
+/*
+ * Count one operation as finished and return the time elapsed since the
+ * timer's recorded start.  The start is then reset to @now, so each call
+ * reports only the interval since the previous start or end event.
+ * Warns once if the operation count goes negative.
+ */
+static ktime_t
+nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
+{
+ ktime_t start;
+
+ if (atomic_dec_return(&timer->n_ops) < 0)
+ WARN_ON_ONCE(1);
+
+ start = timer->start_time;
+ timer->start_time = now;
+ return ktime_sub(now, start);
+}
+
+/*
+ * Note the start of an I/O on @mirror and decide whether a layoutstats
+ * report is due.
+ *
+ * The report interval is the mirror's own report_interval if non-zero,
+ * else layoutstats_timer if non-zero (both scaled by 1000), else
+ * FF_LAYOUTSTATS_REPORT_INTERVAL.  It is compared with the milliseconds
+ * elapsed since the layout's last report.
+ *
+ * Returns true, and stamps the layout's last_report_time with @now, when
+ * the interval has elapsed.  The callers in view hold mirror->lock.
+ */
+static bool
+nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
+ struct nfs4_ff_layoutstat *layoutstat,
+ ktime_t now)
+{
+ s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
+ struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout);
+
+ nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
+ if (!mirror->start_time)
+ mirror->start_time = now;
+ if (mirror->report_interval != 0)
+ report_interval = (s64)mirror->report_interval * 1000LL;
+ else if (layoutstats_timer != 0)
+ report_interval = (s64)layoutstats_timer * 1000LL;
+ if (ktime_to_ms(ktime_sub(now, ffl->last_report_time)) >=
+ report_interval) {
+ ffl->last_report_time = now;
+ return true;
+ }
+
+ return false;
+}
+
+/* Account one requested operation of @requested bytes in @layoutstat. */
+static void
+nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat,
+ __u64 requested)
+{
+ struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+
+ iostat->ops_requested++;
+ iostat->bytes_requested += requested;
+}
+
+/*
+ * Account one completed operation in @layoutstat: bump the completed
+ * op and byte counters, record the shortfall (@requested - @completed)
+ * as bytes not delivered, and add the busy-timer interval and the
+ * operation's own duration to the running totals.
+ */
+static void
+nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
+ __u64 requested,
+ __u64 completed,
+ ktime_t time_completed,
+ ktime_t time_started)
+{
+ struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+ ktime_t completion_time = ktime_sub(time_completed, time_started);
+ ktime_t timer;
+
+ iostat->ops_completed++;
+ iostat->bytes_completed += completed;
+ iostat->bytes_not_delivered += requested - completed;
+
+ timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer, time_completed);
+ iostat->total_busy_time =
+ ktime_add(iostat->total_busy_time, timer);
+ iostat->aggregate_completion_time =
+ ktime_add(iostat->aggregate_completion_time,
+ completion_time);
+}
+
+/*
+ * Record the start of a read of @requested bytes against @mirror and, if
+ * a stats report is due, send one with GFP_KERNEL.  The report is sent
+ * after mirror->lock has been dropped.
+ */
+static void
+nfs4_ff_layout_stat_io_start_read(struct inode *inode,
+ struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested, ktime_t now)
+{
+ bool report;
+
+ spin_lock(&mirror->lock);
+ report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
+ nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
+ set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
+ spin_unlock(&mirror->lock);
+
+ if (report)
+ pnfs_report_layoutstat(inode, GFP_KERNEL);
+}
+
+/*
+ * Record the completion of a read against @mirror, timing it from the
+ * RPC task's tk_start to the current time.
+ */
+static void
+nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
+ struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested,
+ __u64 completed)
+{
+ spin_lock(&mirror->lock);
+ nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
+ requested, completed,
+ ktime_get(), task->tk_start);
+ set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
+ spin_unlock(&mirror->lock);
+}
+
+/*
+ * Record the start of a write of @requested bytes against @mirror and, if
+ * a stats report is due, send one with GFP_NOIO.  The report is sent
+ * after mirror->lock has been dropped.
+ */
+static void
+nfs4_ff_layout_stat_io_start_write(struct inode *inode,
+ struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested, ktime_t now)
+{
+ bool report;
+
+ spin_lock(&mirror->lock);
+ report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now);
+ nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
+ set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
+ spin_unlock(&mirror->lock);
+
+ if (report)
+ pnfs_report_layoutstat(inode, GFP_NOIO);
+}
+
+/*
+ * Record the completion of a write against @mirror, timing it from the
+ * RPC task's tk_start to the current time.  When @committed is
+ * NFS_UNSTABLE, no bytes are counted; only the operation count and the
+ * timers advance.
+ */
+static void
+nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
+ struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested,
+ __u64 completed,
+ enum nfs3_stable_how committed)
+{
+ if (committed == NFS_UNSTABLE)
+ requested = completed = 0;
+
+ spin_lock(&mirror->lock);
+ nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
+ requested, completed, ktime_get(), task->tk_start);
+ set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
+ spin_unlock(&mirror->lock);
+}
+
+/*
+ * Mark the device behind mirror @idx of @lseg as unavailable, if the
+ * mirror has a device node.
+ */
+static void
+ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx)
+{
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+ if (devid)
+ nfs4_mark_deviceid_unavailable(devid);
+}
+
+/*
+ * Mark the device id behind mirror @idx of @lseg as available.
+ * Nothing happens when FF_LAYOUT_DEVID_NODE() yields no device id node.
+ */
+static void
+ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx)
+{
+	struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+	if (!node)
+		return;
+	nfs4_mark_deviceid_available(node);
+}
+
+/*
+ * Scan the mirrors of @lseg from @start_idx upwards and return the first
+ * data server that nfs4_ff_layout_prepare_ds() hands back. When
+ * @check_device is true, mirrors whose device id tests as unavailable are
+ * skipped as well.
+ *
+ * On success the index of the chosen mirror is stored in *@best_idx. When no
+ * mirror qualifies, NULL is returned and *@best_idx is left untouched.
+ */
+static struct nfs4_pnfs_ds *
+ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
+ u32 start_idx, u32 *best_idx,
+ bool check_device)
+{
+ struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_pnfs_ds *ds;
+ bool fail_return = false;
+ u32 idx;
+
+ /* mirrors are initially sorted by efficiency */
+ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
+ /*
+ * fail_return can only become true for the last mirror, and only
+ * when device availability is not being checked.
+ */
+ if (idx+1 == fls->mirror_array_cnt)
+ fail_return = !check_device;
+
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, fail_return);
+ if (!ds)
+ continue;
+
+ if (check_device &&
+ nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node))
+ continue;
+
+ *best_idx = idx;
+ return ds;
+ }
+
+ return NULL;
+}
+
+/*
+ * Pick the first usable data server at or after @start_idx without testing
+ * whether its device id is marked unavailable (check_device == false).
+ */
+static struct nfs4_pnfs_ds *
+ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
+ u32 start_idx, u32 *best_idx)
+{
+ return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
+}
+
+/*
+ * Pick the first usable data server at or after @start_idx, skipping mirrors
+ * whose device id tests as unavailable (check_device == true).
+ */
+static struct nfs4_pnfs_ds *
+ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
+ u32 start_idx, u32 *best_idx)
+{
+ return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
+}
+
+/*
+ * Choose a data server for reading, starting at @start_idx: first try the
+ * mirrors whose device is not marked unavailable, and only if that yields
+ * nothing accept any mirror. Returns NULL when both passes fail.
+ */
+static struct nfs4_pnfs_ds *
+ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
+				  u32 start_idx, u32 *best_idx)
+{
+	struct nfs4_pnfs_ds *chosen;
+
+	chosen = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
+	if (!chosen)
+		chosen = ff_layout_choose_any_ds_for_read(lseg, start_idx,
+							  best_idx);
+	return chosen;
+}
+
+/*
+ * Find a data server for the read described by @pgio. The search starts at
+ * pgio->pg_mirror_idx; if nothing is found and that index was non-zero, the
+ * search is repeated from mirror 0. The chosen index lands in *@best_idx.
+ */
+static struct nfs4_pnfs_ds *
+ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
+			  u32 *best_idx)
+{
+	struct pnfs_layout_segment *lseg = pgio->pg_lseg;
+	struct nfs4_pnfs_ds *chosen;
+
+	chosen = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx,
+						   best_idx);
+	if (!chosen && pgio->pg_mirror_idx)
+		chosen = ff_layout_choose_best_ds_for_read(lseg, 0, best_idx);
+	return chosen;
+}
+
+/*
+ * Replace pgio->pg_lseg with a freshly looked-up IOMODE_READ layout segment
+ * covering @req. The previous segment reference is put first. If
+ * pnfs_update_layout() returns an error pointer, the error code is stored in
+ * pgio->pg_error and pgio->pg_lseg ends up NULL.
+ */
+static void
+ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
+		      struct nfs_page *req,
+		      bool strict_iomode)
+{
+	struct pnfs_layout_segment *new_lseg;
+
+	pnfs_put_lseg(pgio->pg_lseg);
+	new_lseg = pnfs_update_layout(pgio->pg_inode,
+				      nfs_req_openctx(req),
+				      req_offset(req),
+				      req->wb_bytes,
+				      IOMODE_READ,
+				      strict_iomode,
+				      GFP_KERNEL);
+	if (IS_ERR(new_lseg)) {
+		pgio->pg_error = PTR_ERR(new_lseg);
+		new_lseg = NULL;
+	}
+	pgio->pg_lseg = new_lseg;
+}
+
+/*
+ * Run the generic pNFS checks on the descriptor's current layout segment:
+ * first the layout check, then the range check against @req.
+ */
+static void
+ff_layout_pg_check_layout(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ pnfs_generic_pg_check_layout(pgio);
+ pnfs_generic_pg_check_range(pgio, req);
+}
+
+/*
+ * Prepare @pgio for reading @req through a data server.
+ *
+ * Makes sure pgio->pg_lseg holds a layout segment (fetching one if needed),
+ * picks a data server, sets the block size of pgio mirror 0 from that
+ * mirror's rsize and records the chosen mirror index in pgio->pg_mirror_idx.
+ *
+ * If no data server can be found, the read is redirected to the MDS unless
+ * ff_layout_no_fallback_to_mds() says otherwise; in that case the descriptor
+ * is cleaned up and the whole procedure is retried after a one second sleep.
+ * If no layout segment could be obtained and pgio->pg_error is negative, the
+ * function returns with the error left in place; otherwise it falls back to
+ * the MDS.
+ */
+static void
+ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ struct nfs_pgio_mirror *pgm;
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_pnfs_ds *ds;
+ u32 ds_idx;
+
+retry:
+ ff_layout_pg_check_layout(pgio, req);
+ /* Use full layout for now */
+ if (!pgio->pg_lseg) {
+ ff_layout_pg_get_read(pgio, req, false);
+ if (!pgio->pg_lseg)
+ goto out_nolseg;
+ }
+ /* Re-fetch the segment with strict_iomode set when the helper asks for it */
+ if (ff_layout_avoid_read_on_rw(pgio->pg_lseg)) {
+ ff_layout_pg_get_read(pgio, req, true);
+ if (!pgio->pg_lseg)
+ goto out_nolseg;
+ }
+
+ ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
+ if (!ds) {
+ if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
+ goto out_mds;
+ pnfs_generic_pg_cleanup(pgio);
+ /* Sleep for 1 second before retrying */
+ ssleep(1);
+ goto retry;
+ }
+
+ /* ds_idx was filled in by ff_layout_get_ds_for_read() on success */
+ mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
+ pgm = &pgio->pg_mirrors[0];
+ pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
+
+ pgio->pg_mirror_idx = ds_idx;
+
+ /* Soft and softerr mounts get the io_maxretrans retransmit limit */
+ if (NFS_SERVER(pgio->pg_inode)->flags &
+ (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
+ pgio->pg_maxretrans = io_maxretrans;
+ return;
+out_nolseg:
+ /* A recorded error is left for the caller; otherwise fall back to the MDS */
+ if (pgio->pg_error < 0)
+ return;
+out_mds:
+ trace_pnfs_mds_fallback_pg_init_read(pgio->pg_inode,
+ 0, NFS4_MAX_UINT64, IOMODE_READ,
+ NFS_I(pgio->pg_inode)->layout,
+ pgio->pg_lseg);
+ pgio->pg_maxretrans = 0;
+ nfs_pageio_reset_read_mds(pgio);
+}
+
+/*
+ * Prepare @pgio for writing @req to every mirror of the layout segment.
+ *
+ * Ensures pgio->pg_lseg holds an IOMODE_RW segment, requires the
+ * descriptor's mirror count to match the segment's mirror count, prepares a
+ * data server for each mirror and sets each pgio mirror's block size from
+ * that mirror's wsize.
+ *
+ * Outcomes visible here:
+ *  - pnfs_update_layout() error: stored in pgio->pg_error, function returns.
+ *  - mirror count mismatch: descriptor cleaned up, pg_error = -EAGAIN.
+ *  - no segment, or a data server could not be prepared and MDS fallback is
+ *    allowed: descriptor reset to write through the MDS, pg_error = -EAGAIN.
+ *  - a data server could not be prepared and MDS fallback is not allowed:
+ *    clean up, sleep one second and retry from the top.
+ */
+static void
+ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs_pgio_mirror *pgm;
+ struct nfs4_pnfs_ds *ds;
+ u32 i;
+
+retry:
+ ff_layout_pg_check_layout(pgio, req);
+ if (!pgio->pg_lseg) {
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ nfs_req_openctx(req),
+ req_offset(req),
+ req->wb_bytes,
+ IOMODE_RW,
+ false,
+ GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
+ /* If no lseg, fall back to write through mds */
+ if (pgio->pg_lseg == NULL)
+ goto out_mds;
+
+ /* Use a direct mapping of ds_idx to pgio mirror_idx */
+ if (pgio->pg_mirror_count != FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg))
+ goto out_eagain;
+
+ for (i = 0; i < pgio->pg_mirror_count; i++) {
+ mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
+ ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true);
+ if (!ds) {
+ if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
+ goto out_mds;
+ pnfs_generic_pg_cleanup(pgio);
+ /* Sleep for 1 second before retrying */
+ ssleep(1);
+ goto retry;
+ }
+ pgm = &pgio->pg_mirrors[i];
+ pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
+ }
+
+ /* Soft and softerr mounts get the io_maxretrans retransmit limit */
+ if (NFS_SERVER(pgio->pg_inode)->flags &
+ (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
+ pgio->pg_maxretrans = io_maxretrans;
+ return;
+out_eagain:
+ pnfs_generic_pg_cleanup(pgio);
+ pgio->pg_error = -EAGAIN;
+ return;
+out_mds:
+ trace_pnfs_mds_fallback_pg_init_write(pgio->pg_inode,
+ 0, NFS4_MAX_UINT64, IOMODE_RW,
+ NFS_I(pgio->pg_inode)->layout,
+ pgio->pg_lseg);
+ pgio->pg_maxretrans = 0;
+ nfs_pageio_reset_write_mds(pgio);
+ pgio->pg_error = -EAGAIN;
+}
+
+/*
+ * Return how many mirrors a write of @req must go to.
+ *
+ * Looks up an IOMODE_RW layout segment if the descriptor has none yet. With
+ * a segment, the segment's mirror count is returned. If the lookup returns
+ * an error pointer, the error is stored in pgio->pg_error and 1 is returned.
+ * If the lookup yields no segment at all, the descriptor is reset to write
+ * through the MDS and 1 is returned.
+ */
+static unsigned int
+ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ if (!pgio->pg_lseg) {
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ nfs_req_openctx(req),
+ req_offset(req),
+ req->wb_bytes,
+ IOMODE_RW,
+ false,
+ GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ goto out;
+ }
+ }
+ if (pgio->pg_lseg)
+ return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
+
+ trace_pnfs_mds_fallback_pg_get_mirror_count(pgio->pg_inode,
+ 0, NFS4_MAX_UINT64, IOMODE_RW,
+ NFS_I(pgio->pg_inode)->layout,
+ pgio->pg_lseg);
+ /* no lseg means that pnfs is not in use, so no mirroring here */
+ nfs_pageio_reset_write_mds(pgio);
+out:
+ return 1;
+}
+
+/*
+ * Switch the descriptor's current mirror index to @idx and hand back the
+ * index that was active before the switch.
+ */
+static u32
+ff_layout_pg_set_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx)
+{
+	const u32 previous = desc->pg_mirror_idx;
+
+	desc->pg_mirror_idx = idx;
+
+	return previous;
+}
+
+/*
+ * Return the address of entry @idx in the descriptor's pg_mirrors array.
+ * No bounds check is performed here.
+ */
+static struct nfs_pgio_mirror *
+ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx)
+{
+	struct nfs_pgio_mirror *entry = &desc->pg_mirrors[idx];
+
+	return entry;
+}
+
+/*
+ * Page I/O operations for reads: flexfiles-specific initialisation combined
+ * with the generic pNFS test, doio and cleanup helpers.
+ */
+static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
+ .pg_init = ff_layout_pg_init_read,
+ .pg_test = pnfs_generic_pg_test,
+ .pg_doio = pnfs_generic_pg_readpages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+/*
+ * Page I/O operations for writes. Besides init/test/doio/cleanup this table
+ * also supplies the mirror callbacks (count, get, set) defined in this file.
+ */
+static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
+ .pg_init = ff_layout_pg_init_write,
+ .pg_test = pnfs_generic_pg_test,
+ .pg_doio = pnfs_generic_pg_writepages,
+ .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
+ .pg_get_mirror = ff_layout_pg_get_mirror_write,
+ .pg_set_mirror = ff_layout_pg_set_mirror_write,
+};
+
+/*
+ * Redirect a failed write described by @hdr.
+ *
+ * pnfs_layoutcommit_inode() is called first in either case. With
+ * @retry_pnfs set, the I/O is handed to the completion ops' reschedule_io()
+ * callback. Otherwise, unless NFS_IOHDR_REDO was already set on the header,
+ * the write is resent to the MDS and the result of
+ * pnfs_write_done_resend_to_mds() becomes the task status.
+ */
+static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
+{
+ struct rpc_task *task = &hdr->task;
+
+ pnfs_layoutcommit_inode(hdr->inode, false);
+
+ if (retry_pnfs) {
+ dprintk("%s Reset task %5u for i/o through pNFS "
+ "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+ hdr->task.tk_pid,
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
+
+ hdr->completion_ops->reschedule_io(hdr);
+ return;
+ }
+
+ /* test_and_set_bit() ensures the MDS resend happens at most once per header */
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ dprintk("%s Reset task %5u for i/o through MDS "
+ "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+ hdr->task.tk_pid,
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
+
+ trace_pnfs_mds_fallback_write_done(hdr->inode,
+ hdr->args.offset, hdr->args.count,
+ IOMODE_RW, NFS_I(hdr->inode)->layout,
+ hdr->lseg);
+ task->tk_status = pnfs_write_done_resend_to_mds(hdr);
+ }
+}
+
+/*
+ * Resend a failed read through pNFS.
+ *
+ * Looks for any data server on a mirror after the one just used. If one
+ * exists, a layouterror is sent for the segment; if not, the layout is
+ * marked for return. Either way the read is resent via
+ * pnfs_read_resend_pnfs() with the index found (0 when none was found).
+ */
+static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
+{
+	u32 next_idx = 0;
+	struct nfs4_pnfs_ds *next_ds;
+
+	next_ds = ff_layout_choose_any_ds_for_read(hdr->lseg,
+						   hdr->pgio_mirror_idx + 1,
+						   &next_idx);
+	if (next_ds != NULL)
+		ff_layout_send_layouterror(hdr->lseg);
+	else
+		pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
+
+	pnfs_read_resend_pnfs(hdr, next_idx);
+}
+
+/*
+ * Redirect a failed read described by @hdr to the MDS.
+ *
+ * The inode's layout is committed and the segment is marked for return
+ * first. Unless NFS_IOHDR_REDO was already set on the header, the read is
+ * then resent to the MDS and the result of pnfs_read_done_resend_to_mds()
+ * becomes the task status.
+ */
+static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
+{
+ struct rpc_task *task = &hdr->task;
+
+ pnfs_layoutcommit_inode(hdr->inode, false);
+ pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
+
+ /* test_and_set_bit() ensures the MDS resend happens at most once per header */
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ dprintk("%s Reset task %5u for i/o through MDS "
+ "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+ hdr->task.tk_pid,
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
+
+ trace_pnfs_mds_fallback_read_done(hdr->inode,
+ hdr->args.offset, hdr->args.count,
+ IOMODE_READ, NFS_I(hdr->inode)->layout,
+ hdr->lseg);
+ task->tk_status = pnfs_read_done_resend_to_mds(hdr);
+ }
+}
+
+/*
+ * Classify a failed NFSv4 data-server call and tell the caller what to do.
+ *
+ * Return values:
+ *   -EAGAIN                  session errors (recovery is scheduled), DELAY /
+ *                            GRACE (task is delayed) and RETRY_UNCACHED_REP;
+ *                            task->tk_status is cleared to 0.
+ *   -NFS4ERR_RESET_TO_MDS    layout-invalidating errors (the layout is
+ *                            destroyed first), and all remaining errors when
+ *                            ff_layout_avoid_mds_available_ds() is false.
+ *   -NFS4ERR_RESET_TO_PNFS   connection errors and unknown errors when
+ *                            ff_layout_avoid_mds_available_ds() is true.
+ *
+ * @state is accepted for signature symmetry but not used in this function.
+ */
+static int ff_layout_async_handle_error_v4(struct rpc_task *task,
+					   struct nfs4_state *state,
+					   struct nfs_client *clp,
+					   struct pnfs_layout_segment *lseg,
+					   u32 idx)
+{
+	struct pnfs_layout_hdr *lo = lseg->pls_layout;
+	struct inode *inode = lo->plh_inode;
+	struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+	struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
+
+	switch (task->tk_status) {
+	case -NFS4ERR_BADSESSION:
+	case -NFS4ERR_BADSLOT:
+	case -NFS4ERR_BAD_HIGH_SLOT:
+	case -NFS4ERR_DEADSESSION:
+	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+	case -NFS4ERR_SEQ_FALSE_RETRY:
+	case -NFS4ERR_SEQ_MISORDERED:
+		dprintk("%s ERROR %d, Reset session. Exchangeid "
+			"flags 0x%x\n", __func__, task->tk_status,
+			clp->cl_exchange_flags);
+		nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
+		break;
+	case -NFS4ERR_DELAY:
+	case -NFS4ERR_GRACE:
+		rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
+		break;
+	case -NFS4ERR_RETRY_UNCACHED_REP:
+		break;
+	/* Invalidate Layout errors */
+	case -NFS4ERR_PNFS_NO_LAYOUT:
+	case -ESTALE:           /* mapped NFS4ERR_STALE */
+	case -EBADHANDLE:       /* mapped NFS4ERR_BADHANDLE */
+	case -EISDIR:           /* mapped NFS4ERR_ISDIR */
+	case -NFS4ERR_FHEXPIRED:
+	case -NFS4ERR_WRONG_TYPE:
+		dprintk("%s Invalid layout error %d\n", __func__,
+			task->tk_status);
+		/*
+		 * Destroy layout so new i/o will get a new layout.
+		 * Layout will not be destroyed until all current lseg
+		 * references are put. Mark layout as invalid to resend failed
+		 * i/o and all i/o waiting on the slot table to the MDS until
+		 * layout is destroyed and a new valid layout is obtained.
+		 */
+		pnfs_destroy_layout(NFS_I(inode));
+		rpc_wake_up(&tbl->slot_tbl_waitq);
+		goto reset;
+	/* RPC connection errors */
+	case -ECONNREFUSED:
+	case -EHOSTDOWN:
+	case -EHOSTUNREACH:
+	case -ENETUNREACH:
+	case -EIO:
+	case -ETIMEDOUT:
+	case -EPIPE:
+		dprintk("%s DS connection error %d\n", __func__,
+			task->tk_status);
+		/*
+		 * Other users of FF_LAYOUT_DEVID_NODE() in this file check its
+		 * result for NULL, so do the same here instead of
+		 * dereferencing it blindly.
+		 */
+		if (devid)
+			nfs4_delete_deviceid(devid->ld, devid->nfs_client,
+					     &devid->deviceid);
+		rpc_wake_up(&tbl->slot_tbl_waitq);
+		fallthrough;
+	default:
+		if (ff_layout_avoid_mds_available_ds(lseg))
+			return -NFS4ERR_RESET_TO_PNFS;
+reset:
+		dprintk("%s Retry through MDS. Error %d\n", __func__,
+			task->tk_status);
+		return -NFS4ERR_RESET_TO_MDS;
+	}
+	/* Only the "retry the same call" cases reach this point */
+	task->tk_status = 0;
+	return -EAGAIN;
+}
+
+/*
+ * Classify a failed NFSv3 data-server call.
+ *
+ * Retry all errors through either pNFS or MDS except for -EJUKEBOX:
+ *   -EJUKEBOX                the call is restarted after a delay and -EAGAIN
+ *                            is returned with task->tk_status cleared.
+ *   file access errors       -NFS4ERR_RESET_TO_PNFS, device id left alone.
+ *   anything else            the device id is deleted (when one exists) and
+ *                            -NFS4ERR_RESET_TO_PNFS is returned.
+ */
+static int ff_layout_async_handle_error_v3(struct rpc_task *task,
+					   struct pnfs_layout_segment *lseg,
+					   u32 idx)
+{
+	struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+	switch (task->tk_status) {
+	/* File access problems. Don't mark the device as unavailable */
+	case -EACCES:
+	case -ESTALE:
+	case -EISDIR:
+	case -EBADHANDLE:
+	case -ELOOP:
+	case -ENOSPC:
+		break;
+	case -EJUKEBOX:
+		nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+		goto out_retry;
+	default:
+		dprintk("%s DS connection error %d\n", __func__,
+			task->tk_status);
+		/*
+		 * Other users of FF_LAYOUT_DEVID_NODE() in this file check its
+		 * result for NULL, so do the same here instead of
+		 * dereferencing it blindly.
+		 */
+		if (devid)
+			nfs4_delete_deviceid(devid->ld, devid->nfs_client,
+					     &devid->deviceid);
+	}
+	/* FIXME: Need to prevent infinite looping here. */
+	return -NFS4ERR_RESET_TO_PNFS;
+out_retry:
+	task->tk_status = 0;
+	rpc_restart_call_prepare(task);
+	rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
+	return -EAGAIN;
+}
+
+/*
+ * Common entry point for data-server error handling.
+ *
+ * A non-negative task status marks the data server reachable and returns 0.
+ * An invalid layout segment yields -NFS4ERR_RESET_TO_PNFS. Otherwise the
+ * error is dispatched to the NFSv3 or NFSv4 handler according to the RPC
+ * version of @clp; any other version triggers a one-time warning and
+ * returns 0.
+ */
+static int ff_layout_async_handle_error(struct rpc_task *task,
+					struct nfs4_state *state,
+					struct nfs_client *clp,
+					struct pnfs_layout_segment *lseg,
+					u32 idx)
+{
+	int vers = clp->cl_nfs_mod->rpc_vers->number;
+
+	if (task->tk_status >= 0) {
+		ff_layout_mark_ds_reachable(lseg, idx);
+		return 0;
+	}
+
+	/* Handle the case of an invalid layout segment */
+	if (!pnfs_is_valid_lseg(lseg))
+		return -NFS4ERR_RESET_TO_PNFS;
+
+	if (vers == 3)
+		return ff_layout_async_handle_error_v3(task, lseg, idx);
+	if (vers == 4)
+		return ff_layout_async_handle_error_v4(task, state, clp,
+						       lseg, idx);
+
+	/* should never happen */
+	WARN_ON_ONCE(1);
+	return 0;
+}
+
+/*
+ * Record a data-server I/O error against mirror @idx of @lseg and react to
+ * it.
+ *
+ * When *@op_status is 0, the transport-level @error is translated: the
+ * listed connection-type errors become NFS4ERR_NXIO, -EACCES becomes
+ * NFS4ERR_ACCESS (both written back through @op_status), and any other
+ * error makes the function return without tracking anything.
+ *
+ * The status is then recorded via ff_layout_track_ds_error(). Afterwards:
+ *  - NFS4ERR_DELAY / NFS4ERR_GRACE: nothing further;
+ *  - NFS4ERR_NXIO: the data server is marked unreachable and, unless the
+ *    operation was a READ, the layout is marked for return;
+ *  - anything else: the layout is marked for return.
+ */
+static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
+ u32 idx, u64 offset, u64 length,
+ u32 *op_status, int opnum, int error)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+ u32 status = *op_status;
+ int err;
+
+ if (status == 0) {
+ switch (error) {
+ case -ETIMEDOUT:
+ case -EPFNOSUPPORT:
+ case -EPROTONOSUPPORT:
+ case -EOPNOTSUPP:
+ case -EINVAL:
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -EHOSTDOWN:
+ case -EHOSTUNREACH:
+ case -ENETUNREACH:
+ case -EADDRINUSE:
+ case -ENOBUFS:
+ case -EPIPE:
+ case -EPERM:
+ *op_status = status = NFS4ERR_NXIO;
+ break;
+ case -EACCES:
+ *op_status = status = NFS4ERR_ACCESS;
+ break;
+ default:
+ return;
+ }
+ }
+
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ /* err is only reported in the debug message at the end */
+ err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
+ mirror, offset, length, status, opnum,
+ GFP_NOIO);
+
+ switch (status) {
+ case NFS4ERR_DELAY:
+ case NFS4ERR_GRACE:
+ break;
+ case NFS4ERR_NXIO:
+ ff_layout_mark_ds_unreachable(lseg, idx);
+ /*
+ * Don't return the layout if this is a read and we still
+ * have layouts to try
+ */
+ if (opnum == OP_READ)
+ break;
+ fallthrough;
+ default:
+ pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+ lseg);
+ }
+
+ dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
+}
+
+/* NFS_PROTO call done callback routines */
+
+/*
+ * Completion callback for a data-server READ.
+ *
+ * A negative task status is recorded against the mirror first. The error
+ * handler's verdict then decides the outcome: the RESEND_PNFS or RESEND_MDS
+ * header flag is set (returning the task status), the call is restarted
+ * (returning -EAGAIN), or 0 is returned when nothing needs redoing.
+ */
+static int ff_layout_read_done_cb(struct rpc_task *task,
+				  struct nfs_pgio_header *hdr)
+{
+	int verdict;
+
+	if (task->tk_status < 0) {
+		ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
+					    hdr->args.offset, hdr->args.count,
+					    &hdr->res.op_status, OP_READ,
+					    task->tk_status);
+		trace_ff_layout_read_error(hdr);
+	}
+
+	verdict = ff_layout_async_handle_error(task, hdr->args.context->state,
+					       hdr->ds_clp, hdr->lseg,
+					       hdr->pgio_mirror_idx);
+
+	trace_nfs4_pnfs_read(hdr, verdict);
+
+	/* Start from a clean slate before possibly setting one resend flag */
+	clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
+	clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
+
+	switch (verdict) {
+	case -NFS4ERR_RESET_TO_PNFS:
+		set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
+		return task->tk_status;
+	case -NFS4ERR_RESET_TO_MDS:
+		set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
+		return task->tk_status;
+	case -EAGAIN:
+		rpc_restart_call_prepare(task);
+		return -EAGAIN;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * A layoutcommit is needed unless the segment carries the
+ * FF_FLAGS_NO_LAYOUTCOMMIT flag.
+ */
+static bool
+ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
+{
+	struct nfs4_ff_layout_segment *ff_seg = FF_LAYOUT_LSEG(lseg);
+
+	if (ff_seg->flags & FF_FLAGS_NO_LAYOUTCOMMIT)
+		return false;
+	return true;
+}
+
+/*
+ * Flag the inode as needing a LAYOUTCOMMIT up to @end_offset, unless the
+ * layout segment says that no layoutcommit is required.
+ *
+ * We reference the rpc_cred of the first WRITE that triggers the need for
+ * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
+ * rfc5661 is not clear about which credential should be used.
+ *
+ * Flexlayout client should treat DS replied FILE_SYNC as DATA_SYNC, so
+ * to follow http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751
+ * we always send layoutcommit after DS writes.
+ */
+static void
+ff_layout_set_layoutcommit(struct inode *inode,
+			   struct pnfs_layout_segment *lseg,
+			   loff_t end_offset)
+{
+	if (ff_layout_need_layoutcommit(lseg)) {
+		pnfs_set_layoutcommit(inode, lseg, end_offset);
+		dprintk("%s inode %lu pls_end_pos %llu\n", __func__,
+			inode->i_ino,
+			(unsigned long long) NFS_I(inode)->layout->plh_lwb);
+	}
+}
+
+/*
+ * Start read accounting for @hdr exactly once: NFS_IOHDR_STAT guards
+ * against a second start while accounting is already open.
+ */
+static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
+		struct nfs_pgio_header *hdr)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+
+	if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+		return;
+
+	mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx);
+	nfs4_ff_layout_stat_io_start_read(hdr->inode, mirror,
+					  hdr->args.count, task->tk_start);
+}
+
+/*
+ * Close read accounting for @hdr if it was open (NFS_IOHDR_STAT set), record
+ * the requested and returned byte counts, and set NFS_LSEG_LAYOUTRETURN on
+ * the layout segment.
+ */
+static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
+		struct nfs_pgio_header *hdr)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+
+	if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+		return;
+
+	mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx);
+	nfs4_ff_layout_stat_io_end_read(task, mirror,
+					hdr->args.count, hdr->res.count);
+	set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags);
+}
+
+/*
+ * Shared part of the v3/v4 read prepare callbacks.
+ *
+ * Returns -EIO (after exiting the task with that status) when the open
+ * context is flagged NFS_CONTEXT_BAD; otherwise starts read accounting and
+ * returns 0.
+ */
+static int ff_layout_read_prepare_common(struct rpc_task *task,
+					 struct nfs_pgio_header *hdr)
+{
+	const int failure = -EIO;
+
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
+		rpc_exit(task, failure);
+		return failure;
+	}
+
+	ff_layout_read_record_layoutstats_start(task, hdr);
+
+	return 0;
+}
+
+/*
+ * Call ops for the async read/write cases
+ * In the case of dense layouts, the offset needs to be reset to its
+ * original value.
+ */
+
+/* NFSv3 read prepare: start the call only if the common checks pass. */
+static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	if (ff_layout_read_prepare_common(task, hdr) == 0)
+		rpc_call_start(task);
+}
+
+/*
+ * NFSv4 read prepare: set up the session sequence on the data server's
+ * client first; the common checks run only when nfs4_setup_sequence()
+ * returns 0.
+ */
+static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+	int seq_status;
+
+	seq_status = nfs4_setup_sequence(hdr->ds_clp,
+					 &hdr->args.seq_args,
+					 &hdr->res.seq_res,
+					 task);
+	if (seq_status)
+		return;
+
+	ff_layout_read_prepare_common(task, hdr);
+}
+
+/*
+ * rpc_call_done for data-server reads.
+ *
+ * If the header is already flagged NFS_IOHDR_REDO and the task succeeded,
+ * only the sequence is completed. In every other case the MDS ops'
+ * rpc_call_done handler is invoked.
+ */
+static void ff_layout_read_call_done(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+	bool redo_succeeded;
+
+	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
+
+	redo_succeeded = test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
+			 task->tk_status == 0;
+	if (!redo_succeeded) {
+		/* Note this may cause RPC to be resent */
+		hdr->mds_ops->rpc_call_done(task, hdr);
+		return;
+	}
+
+	nfs4_sequence_done(task, &hdr->res.seq_res);
+}
+
+/*
+ * rpc_count_stats for data-server reads: close the layoutstats accounting
+ * for this header, then add the task's metrics to the inode client's
+ * NFSPROC4_CLNT_READ slot.
+ */
+static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ ff_layout_read_record_layoutstats_done(task, hdr);
+ rpc_count_iostats_metrics(task,
+ &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
+}
+
+/*
+ * rpc_release for data-server reads.
+ *
+ * Closes any still-open layoutstats accounting, then acts on the resend
+ * flag left by the done callback (pNFS resend takes precedence over MDS
+ * resend) before handing the header to the generic release.
+ */
+static void ff_layout_read_release(void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
+
+	if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) {
+		ff_layout_resend_pnfs_read(hdr);
+	} else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) {
+		ff_layout_reset_read(hdr);
+	}
+
+	pnfs_generic_rw_release(data);
+}
+
+
+/*
+ * Completion callback for a data-server WRITE.
+ *
+ * A negative task status is recorded against the mirror first. Depending on
+ * the error handler's verdict, the RESEND_PNFS or RESEND_MDS header flag is
+ * set (returning the task status) or -EAGAIN is passed straight back.
+ * Otherwise a layoutcommit is requested (with an end offset only for
+ * FILE_SYNC / DATA_SYNC replies), the returned attributes are discarded and
+ * the inode is updated if the task did not fail. Returns 0 in that case.
+ */
+static int ff_layout_write_done_cb(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ loff_t end_offs = 0;
+ int err;
+
+ if (task->tk_status < 0) {
+ ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
+ hdr->args.offset, hdr->args.count,
+ &hdr->res.op_status, OP_WRITE,
+ task->tk_status);
+ trace_ff_layout_write_error(hdr);
+ }
+
+ err = ff_layout_async_handle_error(task, hdr->args.context->state,
+ hdr->ds_clp, hdr->lseg,
+ hdr->pgio_mirror_idx);
+
+ trace_nfs4_pnfs_write(hdr, err);
+ /* Clear both resend flags before possibly setting one of them below */
+ clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
+ clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
+ switch (err) {
+ case -NFS4ERR_RESET_TO_PNFS:
+ set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
+ return task->tk_status;
+ case -NFS4ERR_RESET_TO_MDS:
+ set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
+ return task->tk_status;
+ case -EAGAIN:
+ return -EAGAIN;
+ }
+
+ if (hdr->res.verf->committed == NFS_FILE_SYNC ||
+ hdr->res.verf->committed == NFS_DATA_SYNC)
+ end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
+
+ /* Note: if the write is unstable, don't set end_offs until commit */
+ ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
+
+ /* zero out fattr since we don't care about the DS attributes at all */
+ hdr->fattr.valid = 0;
+ if (task->tk_status >= 0)
+ nfs_writeback_update_inode(hdr);
+
+ return 0;
+}
+
+/*
+ * Completion callback for a data-server COMMIT.
+ *
+ * A negative task status is recorded against the mirror first. If the error
+ * handler asks for a reset (to pNFS or to the MDS) the writes are prepared
+ * for resending; if it asks for a retry the call is restarted. Both paths
+ * return -EAGAIN. Otherwise a layoutcommit up to data->lwb is requested and
+ * 0 is returned.
+ */
+static int ff_layout_commit_done_cb(struct rpc_task *task,
+				    struct nfs_commit_data *data)
+{
+	int verdict;
+
+	if (task->tk_status < 0) {
+		ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
+					    data->args.offset, data->args.count,
+					    &data->res.op_status, OP_COMMIT,
+					    task->tk_status);
+		trace_ff_layout_commit_error(data);
+	}
+
+	verdict = ff_layout_async_handle_error(task, NULL, data->ds_clp,
+					       data->lseg,
+					       data->ds_commit_index);
+
+	trace_nfs4_pnfs_commit_ds(data, verdict);
+
+	switch (verdict) {
+	case -NFS4ERR_RESET_TO_PNFS:
+	case -NFS4ERR_RESET_TO_MDS:
+		pnfs_generic_prepare_to_resend_writes(data);
+		return -EAGAIN;
+	case -EAGAIN:
+		rpc_restart_call_prepare(task);
+		return -EAGAIN;
+	default:
+		break;
+	}
+
+	ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);
+
+	return 0;
+}
+
+/*
+ * Start write accounting for @hdr exactly once: NFS_IOHDR_STAT guards
+ * against a second start while accounting is already open.
+ */
+static void ff_layout_write_record_layoutstats_start(struct rpc_task *task,
+		struct nfs_pgio_header *hdr)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+
+	if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+		return;
+
+	mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx);
+	nfs4_ff_layout_stat_io_start_write(hdr->inode, mirror,
+					   hdr->args.count, task->tk_start);
+}
+
+/*
+ * Close write accounting for @hdr if it was open (NFS_IOHDR_STAT set),
+ * record the requested and written byte counts together with the commit
+ * level reported by the server, and set NFS_LSEG_LAYOUTRETURN on the layout
+ * segment.
+ */
+static void ff_layout_write_record_layoutstats_done(struct rpc_task *task,
+		struct nfs_pgio_header *hdr)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+
+	if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+		return;
+
+	mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx);
+	nfs4_ff_layout_stat_io_end_write(task, mirror,
+					 hdr->args.count, hdr->res.count,
+					 hdr->res.verf->committed);
+	set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags);
+}
+
+/*
+ * Shared part of the v3/v4 write prepare callbacks.
+ *
+ * Returns -EIO (after exiting the task with that status) when the open
+ * context is flagged NFS_CONTEXT_BAD; otherwise starts write accounting and
+ * returns 0.
+ */
+static int ff_layout_write_prepare_common(struct rpc_task *task,
+					  struct nfs_pgio_header *hdr)
+{
+	const int failure = -EIO;
+
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
+		rpc_exit(task, failure);
+		return failure;
+	}
+
+	ff_layout_write_record_layoutstats_start(task, hdr);
+
+	return 0;
+}
+
+/* NFSv3 write prepare: start the call only if the common checks pass. */
+static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	if (ff_layout_write_prepare_common(task, hdr) == 0)
+		rpc_call_start(task);
+}
+
+/*
+ * NFSv4 write prepare: set up the session sequence on the data server's
+ * client first; the common checks run only when nfs4_setup_sequence()
+ * returns 0.
+ */
+static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+	int seq_status;
+
+	seq_status = nfs4_setup_sequence(hdr->ds_clp,
+					 &hdr->args.seq_args,
+					 &hdr->res.seq_res,
+					 task);
+	if (seq_status)
+		return;
+
+	ff_layout_write_prepare_common(task, hdr);
+}
+
+/*
+ * rpc_call_done for data-server writes.
+ *
+ * If the header is already flagged NFS_IOHDR_REDO and the task succeeded,
+ * only the sequence is completed. In every other case the MDS ops'
+ * rpc_call_done handler is invoked.
+ */
+static void ff_layout_write_call_done(struct rpc_task *task, void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+	bool redo_succeeded;
+
+	redo_succeeded = test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
+			 task->tk_status == 0;
+	if (!redo_succeeded) {
+		/* Note this may cause RPC to be resent */
+		hdr->mds_ops->rpc_call_done(task, hdr);
+		return;
+	}
+
+	nfs4_sequence_done(task, &hdr->res.seq_res);
+}
+
+/*
+ * rpc_count_stats for data-server writes: close the layoutstats accounting
+ * for this header, then add the task's metrics to the inode client's
+ * NFSPROC4_CLNT_WRITE slot.
+ */
+static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ ff_layout_write_record_layoutstats_done(task, hdr);
+ rpc_count_iostats_metrics(task,
+ &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
+}
+
+/*
+ * rpc_release for data-server writes.
+ *
+ * Closes any still-open layoutstats accounting, then acts on the resend
+ * flag left by the done callback: a pNFS resend sends a layouterror and
+ * reschedules through pNFS, an MDS resend goes through the MDS. The header
+ * is finally handed to the generic release.
+ */
+static void ff_layout_write_release(void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+	bool via_pnfs;
+
+	ff_layout_write_record_layoutstats_done(&hdr->task, hdr);
+
+	via_pnfs = test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
+	if (via_pnfs) {
+		ff_layout_send_layouterror(hdr->lseg);
+		ff_layout_reset_write(hdr, true);
+	} else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) {
+		ff_layout_reset_write(hdr, false);
+	}
+
+	pnfs_generic_rw_release(data);
+}
+
+/*
+ * Start write accounting for a COMMIT exactly once (guarded by
+ * NFS_IOHDR_STAT in cdata->flags). A COMMIT requests zero bytes.
+ */
+static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task,
+		struct nfs_commit_data *cdata)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+
+	if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags))
+		return;
+
+	mirror = FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index);
+	nfs4_ff_layout_stat_io_start_write(cdata->inode, mirror,
+					   0, task->tk_start);
+}
+
+/*
+ * Close write accounting for a COMMIT if it was open.
+ *
+ * On success (task status 0) the byte counts of all requests on
+ * cdata->pages are summed and recorded as both requested and completed with
+ * NFS_FILE_SYNC; on failure zero bytes are recorded. NFS_LSEG_LAYOUTRETURN
+ * is then set on the layout segment.
+ */
+static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
+		struct nfs_commit_data *cdata)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+	struct nfs_page *page_req;
+	__u64 committed_bytes = 0;
+
+	if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags))
+		return;
+
+	if (task->tk_status == 0) {
+		list_for_each_entry(page_req, &cdata->pages, wb_list) {
+			committed_bytes += page_req->wb_bytes;
+		}
+	}
+
+	mirror = FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index);
+	nfs4_ff_layout_stat_io_end_write(task, mirror,
+					 committed_bytes, committed_bytes,
+					 NFS_FILE_SYNC);
+	set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags);
+}
+
+/* Shared part of the v3/v4 commit prepare callbacks: start accounting. */
+static void ff_layout_commit_prepare_common(struct rpc_task *task,
+ struct nfs_commit_data *cdata)
+{
+ ff_layout_commit_record_layoutstats_start(task, cdata);
+}
+
+/* NFSv3 commit prepare: start accounting, then start the call. */
+static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
+{
+ ff_layout_commit_prepare_common(task, data);
+ rpc_call_start(task);
+}
+
+/*
+ * NFSv4 commit prepare: set up the session sequence on the data server's
+ * client first; accounting starts only when nfs4_setup_sequence() returns 0.
+ */
+static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
+{
+	struct nfs_commit_data *cdata = data;
+	int seq_status;
+
+	seq_status = nfs4_setup_sequence(cdata->ds_clp,
+					 &cdata->args.seq_args,
+					 &cdata->res.seq_res,
+					 task);
+	if (seq_status)
+		return;
+
+	ff_layout_commit_prepare_common(task, data);
+}
+
+/* rpc_call_done for data-server commits: defer to the generic pNFS handler. */
+static void ff_layout_commit_done(struct rpc_task *task, void *data)
+{
+ pnfs_generic_write_commit_done(task, data);
+}
+
+/*
+ * rpc_count_stats for data-server commits: close the layoutstats accounting
+ * for this commit, then add the task's metrics to the inode client's
+ * NFSPROC4_CLNT_COMMIT slot.
+ */
+static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
+{
+ struct nfs_commit_data *cdata = data;
+
+ ff_layout_commit_record_layoutstats_done(task, cdata);
+ rpc_count_iostats_metrics(task,
+ &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
+}
+
+/*
+ * rpc_release for data-server commits: close any still-open accounting and
+ * hand the commit data to the generic pNFS release.
+ */
+static void ff_layout_commit_release(void *data)
+{
+ struct nfs_commit_data *cdata = data;
+
+ ff_layout_commit_record_layoutstats_done(&cdata->task, cdata);
+ pnfs_generic_commit_release(data);
+}
+
+/* RPC callbacks for reads sent to an NFSv3 data server. */
+static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
+ .rpc_call_prepare = ff_layout_read_prepare_v3,
+ .rpc_call_done = ff_layout_read_call_done,
+ .rpc_count_stats = ff_layout_read_count_stats,
+ .rpc_release = ff_layout_read_release,
+};
+
+/* RPC callbacks for reads sent to an NFSv4 data server (v4 prepare step). */
+static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
+ .rpc_call_prepare = ff_layout_read_prepare_v4,
+ .rpc_call_done = ff_layout_read_call_done,
+ .rpc_count_stats = ff_layout_read_count_stats,
+ .rpc_release = ff_layout_read_release,
+};
+
+/* RPC callbacks for writes sent to an NFSv3 data server. */
+static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
+ .rpc_call_prepare = ff_layout_write_prepare_v3,
+ .rpc_call_done = ff_layout_write_call_done,
+ .rpc_count_stats = ff_layout_write_count_stats,
+ .rpc_release = ff_layout_write_release,
+};
+
+/* RPC callbacks for writes sent to an NFSv4 data server (v4 prepare step). */
+static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
+ .rpc_call_prepare = ff_layout_write_prepare_v4,
+ .rpc_call_done = ff_layout_write_call_done,
+ .rpc_count_stats = ff_layout_write_count_stats,
+ .rpc_release = ff_layout_write_release,
+};
+
+/* RPC callbacks for commits sent to an NFSv3 data server. */
+static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
+ .rpc_call_prepare = ff_layout_commit_prepare_v3,
+ .rpc_call_done = ff_layout_commit_done,
+ .rpc_count_stats = ff_layout_commit_count_stats,
+ .rpc_release = ff_layout_commit_release,
+};
+
+/* RPC callbacks for commits sent to an NFSv4 data server (v4 prepare step). */
+static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
+ .rpc_call_prepare = ff_layout_commit_prepare_v4,
+ .rpc_call_done = ff_layout_commit_done,
+ .rpc_count_stats = ff_layout_commit_count_stats,
+ .rpc_release = ff_layout_commit_release,
+};
+
+/*
+ * Send the read described by @hdr to the data server of the mirror selected
+ * by hdr->pgio_mirror_idx.
+ *
+ * Returns PNFS_ATTEMPTED once the I/O has been handed to
+ * nfs_initiate_pgio(). If the data server, its RPC client or the credential
+ * cannot be obtained, returns PNFS_TRY_AGAIN when
+ * ff_layout_avoid_mds_available_ds() is true for the segment, and
+ * PNFS_NOT_ATTEMPTED otherwise.
+ */
+static enum pnfs_try_status
+ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
+{
+ struct pnfs_layout_segment *lseg = hdr->lseg;
+ struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
+ struct nfs4_ff_layout_mirror *mirror;
+ const struct cred *ds_cred;
+ loff_t offset = hdr->args.offset;
+ u32 idx = hdr->pgio_mirror_idx;
+ int vers;
+ struct nfs_fh *fh;
+
+ dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
+ __func__, hdr->inode->i_ino,
+ hdr->args.pgbase, (size_t)hdr->args.count, offset);
+
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
+ if (!ds)
+ goto out_failed;
+
+ ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
+ hdr->inode);
+ if (IS_ERR(ds_clnt))
+ goto out_failed;
+
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
+ if (!ds_cred)
+ goto out_failed;
+
+ vers = nfs4_ff_layout_ds_version(mirror);
+
+ dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
+ ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers);
+
+ hdr->pgio_done_cb = ff_layout_read_done_cb;
+ /* Take a reference on the DS client before publishing it in the header */
+ refcount_inc(&ds->ds_clp->cl_count);
+ hdr->ds_clp = ds->ds_clp;
+ /* Only override the file handle when the mirror supplies one */
+ fh = nfs4_ff_layout_select_ds_fh(mirror);
+ if (fh)
+ hdr->args.fh = fh;
+
+ nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
+
+ /*
+ * Note that if we ever decide to split across DSes,
+ * then we may need to handle dense-like offsets.
+ */
+ /* offset was read from hdr->args.offset above and has not been changed */
+ hdr->args.offset = offset;
+ hdr->mds_offset = offset;
+
+ /* Perform an asynchronous read to ds */
+ nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
+ vers == 3 ? &ff_layout_read_call_ops_v3 :
+ &ff_layout_read_call_ops_v4,
+ 0, RPC_TASK_SOFTCONN);
+ /* Release the cred obtained from ff_layout_get_ds_cred() */
+ put_cred(ds_cred);
+ return PNFS_ATTEMPTED;
+
+out_failed:
+ if (ff_layout_avoid_mds_available_ds(lseg))
+ return PNFS_TRY_AGAIN;
+ trace_pnfs_mds_fallback_read_pagelist(hdr->inode,
+ hdr->args.offset, hdr->args.count,
+ IOMODE_READ, NFS_I(hdr->inode)->layout, lseg);
+ return PNFS_NOT_ATTEMPTED;
+}
+
+/* Perform async writes. */
+/*
+ * Send the write described by @hdr to the data server of the mirror selected
+ * by hdr->pgio_mirror_idx; @sync is passed through to nfs_initiate_pgio().
+ *
+ * Returns PNFS_ATTEMPTED once the I/O has been handed to
+ * nfs_initiate_pgio(). If the data server, its RPC client or the credential
+ * cannot be obtained, returns PNFS_TRY_AGAIN when
+ * ff_layout_avoid_mds_available_ds() is true for the segment, and
+ * PNFS_NOT_ATTEMPTED otherwise.
+ */
+static enum pnfs_try_status
+ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
+{
+ struct pnfs_layout_segment *lseg = hdr->lseg;
+ struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
+ struct nfs4_ff_layout_mirror *mirror;
+ const struct cred *ds_cred;
+ loff_t offset = hdr->args.offset;
+ int vers;
+ struct nfs_fh *fh;
+ u32 idx = hdr->pgio_mirror_idx;
+
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
+ if (!ds)
+ goto out_failed;
+
+ ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
+ hdr->inode);
+ if (IS_ERR(ds_clnt))
+ goto out_failed;
+
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
+ if (!ds_cred)
+ goto out_failed;
+
+ vers = nfs4_ff_layout_ds_version(mirror);
+
+ dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n",
+ __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
+ offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count),
+ vers);
+
+ hdr->pgio_done_cb = ff_layout_write_done_cb;
+ /* Take a reference on the DS client before publishing it in the header */
+ refcount_inc(&ds->ds_clp->cl_count);
+ hdr->ds_clp = ds->ds_clp;
+ /* The commit index for this write is the mirror index it was sent to */
+ hdr->ds_commit_idx = idx;
+ /* Only override the file handle when the mirror supplies one */
+ fh = nfs4_ff_layout_select_ds_fh(mirror);
+ if (fh)
+ hdr->args.fh = fh;
+
+ nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
+
+ /*
+ * Note that if we ever decide to split across DSes,
+ * then we may need to handle dense-like offsets.
+ */
+ /* offset was read from hdr->args.offset above and has not been changed */
+ hdr->args.offset = offset;
+
+ /* Perform an asynchronous write */
+ nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
+ vers == 3 ? &ff_layout_write_call_ops_v3 :
+ &ff_layout_write_call_ops_v4,
+ sync, RPC_TASK_SOFTCONN);
+ /* Release the cred obtained from ff_layout_get_ds_cred() */
+ put_cred(ds_cred);
+ return PNFS_ATTEMPTED;
+
+out_failed:
+ if (ff_layout_avoid_mds_available_ds(lseg))
+ return PNFS_TRY_AGAIN;
+ trace_pnfs_mds_fallback_write_pagelist(hdr->inode,
+ hdr->args.offset, hdr->args.count,
+ IOMODE_RW, NFS_I(hdr->inode)->layout, lseg);
+ return PNFS_NOT_ATTEMPTED;
+}
+
+/*
+ * Map a commit index to a data-server (mirror) index. The mapping is the
+ * identity; @lseg is not consulted.
+ */
+static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+{
+ return i;
+}
+
+/*
+ * Return the file handle to use for a COMMIT to mirror @i of @lseg: the
+ * first entry of that mirror's fh_versions.
+ */
+static struct nfs_fh *
+select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+{
+	struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
+	struct nfs4_ff_layout_mirror *mirror = flseg->mirror_array[i];
+
+	/* FIXME: Assume that there is only one NFS version available
+	 * for the DS.
+	 */
+	return &mirror->fh_versions[0];
+}
+
+/*
+ * Send the COMMIT described by @data to the data server backing the mirror
+ * selected by data->ds_commit_index.
+ *
+ * Returns the result of nfs_initiate_commit() when the request was issued.
+ * If the layout segment is unusable, or the DS, its RPC client or its
+ * credential cannot be obtained, the writes are queued for resend, @data is
+ * released and -EAGAIN is returned.
+ */
+static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
+{
+ struct pnfs_layout_segment *lseg = data->lseg;
+ struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
+ struct nfs4_ff_layout_mirror *mirror;
+ const struct cred *ds_cred;
+ u32 idx;
+ int vers, ret;
+ struct nfs_fh *fh;
+
+ /* A segment that is being returned is still acceptable for COMMIT */
+ if (!lseg || !(pnfs_is_valid_lseg(lseg) ||
+ test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)))
+ goto out_err;
+
+ idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
+ if (!ds)
+ goto out_err;
+
+ ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
+ data->inode);
+ if (IS_ERR(ds_clnt))
+ goto out_err;
+
+ ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred);
+ if (!ds_cred)
+ goto out_err;
+
+ vers = nfs4_ff_layout_ds_version(mirror);
+
+ dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
+ data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count),
+ vers);
+ data->commit_done_cb = ff_layout_commit_done_cb;
+ data->cred = ds_cred;
+ /* data->ds_clp holds its own reference on the DS client */
+ refcount_inc(&ds->ds_clp->cl_count);
+ data->ds_clp = ds->ds_clp;
+ fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
+ if (fh)
+ data->args.fh = fh;
+
+ /* The call ops are chosen by the NFS major version of the DS */
+ ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
+ vers == 3 ? &ff_layout_commit_call_ops_v3 :
+ &ff_layout_commit_call_ops_v4,
+ how, RPC_TASK_SOFTCONN);
+ /* Drop our ds_cred reference (presumably taken by ff_layout_get_ds_cred()) */
+ put_cred(ds_cred);
+ return ret;
+out_err:
+ pnfs_generic_prepare_to_resend_writes(data);
+ pnfs_generic_commit_release(data);
+ return -EAGAIN;
+}
+
+/*
+ * .commit_pagelist hook: delegate to pnfs_generic_commit_pagelist(), using
+ * ff_layout_initiate_commit() to issue each per-DS COMMIT.
+ */
+static int
+ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
+ int how, struct nfs_commit_info *cinfo)
+{
+ return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
+ ff_layout_initiate_commit);
+}
+
+/*
+ * Return the per-layout DS commit info for @inode, or NULL when the inode
+ * currently has no layout header attached.
+ */
+static struct pnfs_ds_commit_info *
+ff_layout_get_ds_info(struct inode *inode)
+{
+	struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
+
+	return lo ? &FF_LAYOUT_FROM_HDR(lo)->commit_info : NULL;
+}
+
+/*
+ * Allocate a commit array with one slot per mirror of @lseg and try to
+ * attach it to @fl_cinfo under the inode's i_lock.  If
+ * pnfs_add_commit_array() hands back a different array, the freshly
+ * allocated one is freed again.  Allocation failure is silently ignored.
+ */
+static void
+ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+		struct pnfs_layout_segment *lseg)
+{
+	struct inode *inode = lseg->pls_layout->plh_inode;
+	struct pnfs_commit_array *fresh, *installed;
+
+	fresh = pnfs_alloc_commit_array(FF_LAYOUT_LSEG(lseg)->mirror_array_cnt,
+					GFP_NOIO);
+	if (!fresh)
+		return;
+
+	spin_lock(&inode->i_lock);
+	installed = pnfs_add_commit_array(fl_cinfo, fresh, lseg);
+	spin_unlock(&inode->i_lock);
+
+	if (installed != fresh)
+		pnfs_free_commit_array(fresh);
+}
+
+/*
+ * .release_ds_info hook: destroy the DS commit info in @fl_cinfo while
+ * holding @inode's i_lock.
+ */
+static void
+ff_layout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+ struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ pnfs_generic_ds_cinfo_destroy(fl_cinfo);
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * .free_deviceid_node hook: recover the flexfile device structure that
+ * embeds @d and free it.
+ */
+static void
+ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+	struct nfs4_ff_layout_ds *mirror_ds;
+
+	mirror_ds = container_of(d, struct nfs4_ff_layout_ds, id_node);
+	nfs4_ff_layout_free_deviceid(mirror_ds);
+}
+
+/*
+ * Encode the I/O error section of a flexfile LAYOUTRETURN body: a 32-bit
+ * error count followed by the error entries on ff_args->errors.
+ *
+ * Returns -E2BIG if the count cannot be reserved in @xdr, otherwise the
+ * result of ff_layout_encode_ds_ioerr().  @args is currently unused.
+ */
+static int ff_layout_encode_ioerr(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ const struct nfs4_flexfile_layoutreturn_args *ff_args)
+{
+ __be32 *start;
+
+ start = xdr_reserve_space(xdr, 4);
+ if (unlikely(!start))
+ return -E2BIG;
+
+ *start = cpu_to_be32(ff_args->num_errors);
+ /* This assumes we always return _ALL_ layouts */
+ return ff_layout_encode_ds_ioerr(xdr, &ff_args->errors);
+}
+
+/*
+ * Encode @len bytes from @buf as fixed-length opaque data.  A failure of
+ * xdr_stream_encode_opaque_fixed() is only reported through a one-time
+ * warning; it is not propagated to the caller.
+ */
+static void
+encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len)
+{
+ WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr, buf, len) < 0);
+}
+
+/*
+ * Encode the fixed leading part of one iostat entry, in wire order:
+ * offset, length, stateid, read count, read bytes, write count,
+ * write bytes, device id.
+ *
+ * NOTE(review): the pointers returned by xdr_reserve_space() are used
+ * without a NULL check here.
+ */
+static void
+ff_layout_encode_ff_iostat_head(struct xdr_stream *xdr,
+ const nfs4_stateid *stateid,
+ const struct nfs42_layoutstat_devinfo *devinfo)
+{
+ __be32 *p;
+
+ /* offset + length: two 64-bit values */
+ p = xdr_reserve_space(xdr, 8 + 8);
+ p = xdr_encode_hyper(p, devinfo->offset);
+ p = xdr_encode_hyper(p, devinfo->length);
+ encode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE);
+ /* four 64-bit counters */
+ p = xdr_reserve_space(xdr, 4*8);
+ p = xdr_encode_hyper(p, devinfo->read_count);
+ p = xdr_encode_hyper(p, devinfo->read_bytes);
+ p = xdr_encode_hyper(p, devinfo->write_count);
+ p = xdr_encode_hyper(p, devinfo->write_bytes);
+ encode_opaque_fixed(xdr, devinfo->dev_id.data, NFS4_DEVICEID4_SIZE);
+}
+
+/*
+ * Encode one complete iostat entry: the common head followed by the
+ * flexfile layoutupdate for the mirror stored in devinfo->ld_private.data.
+ */
+static void
+ff_layout_encode_ff_iostat(struct xdr_stream *xdr,
+ const nfs4_stateid *stateid,
+ const struct nfs42_layoutstat_devinfo *devinfo)
+{
+ ff_layout_encode_ff_iostat_head(xdr, stateid, devinfo);
+ ff_layout_encode_ff_layoutupdate(xdr, devinfo,
+ devinfo->ld_private.data);
+}
+
+/*
+ * Encode the iostats array of a flexfile LAYOUTRETURN body: a 32-bit entry
+ * count (ff_args->num_dev) followed by one iostat entry per device, each
+ * tagged with the layout stateid.
+ *
+ * NOTE(review): the xdr_reserve_space() result is dereferenced without a
+ * NULL check.
+ */
+static void ff_layout_encode_iostats_array(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ struct nfs4_flexfile_layoutreturn_args *ff_args)
+{
+ __be32 *p;
+ int i;
+
+ p = xdr_reserve_space(xdr, 4);
+ *p = cpu_to_be32(ff_args->num_dev);
+ for (i = 0; i < ff_args->num_dev; i++)
+ ff_layout_encode_ff_iostat(xdr,
+ &args->layout->plh_stateid,
+ &ff_args->devinfo[i]);
+}
+
+/*
+ * Run the ->free() callback of every entry in @devinfo[0..@num_entries)
+ * that has opaque ops with a free method; other entries are skipped.
+ */
+static void
+ff_layout_free_iostats_array(struct nfs42_layoutstat_devinfo *devinfo,
+		unsigned int num_entries)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < num_entries; idx++) {
+		struct nfs4_xdr_opaque_data *priv = &devinfo[idx].ld_private;
+
+		if (priv->ops && priv->ops->free)
+			priv->ops->free(priv);
+	}
+}
+
+/*
+ * .alloc_deviceid_node hook: build a flexfile device from @pdev and return
+ * its embedded generic deviceid node, or NULL on failure.
+ */
+static struct nfs4_deviceid_node *
+ff_layout_alloc_deviceid_node(struct nfs_server *server,
+			      struct pnfs_device *pdev, gfp_t gfp_flags)
+{
+	struct nfs4_ff_layout_ds *mirror_ds =
+		nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags);
+
+	return mirror_ds ? &mirror_ds->id_node : NULL;
+}
+
+/*
+ * .encode callback for the flexfile LAYOUTRETURN body.
+ *
+ * The I/O errors and iostats are first encoded into a temporary xdr_buf
+ * whose head is the single page ff_args->pages[0] (buflen PAGE_SIZE).  The
+ * resulting length is then written to @xdr as a 32-bit word and the page is
+ * attached to @xdr with xdr_write_pages().
+ *
+ * NOTE(review): the return value of ff_layout_encode_ioerr() is ignored and
+ * the xdr_reserve_space() result is dereferenced without a NULL check.
+ */
+static void
+ff_layout_encode_layoutreturn(struct xdr_stream *xdr,
+ const void *voidargs,
+ const struct nfs4_xdr_opaque_data *ff_opaque)
+{
+ const struct nfs4_layoutreturn_args *args = voidargs;
+ struct nfs4_flexfile_layoutreturn_args *ff_args = ff_opaque->data;
+ struct xdr_buf tmp_buf = {
+ .head = {
+ [0] = {
+ .iov_base = page_address(ff_args->pages[0]),
+ },
+ },
+ .buflen = PAGE_SIZE,
+ };
+ struct xdr_stream tmp_xdr;
+ __be32 *start;
+
+ dprintk("%s: Begin\n", __func__);
+
+ xdr_init_encode(&tmp_xdr, &tmp_buf, NULL, NULL);
+
+ ff_layout_encode_ioerr(&tmp_xdr, args, ff_args);
+ ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args);
+
+ /* length of the opaque body, then the body itself */
+ start = xdr_reserve_space(xdr, 4);
+ *start = cpu_to_be32(tmp_buf.len);
+ xdr_write_pages(xdr, ff_args->pages, 0, tmp_buf.len);
+
+ dprintk("%s: Return\n", __func__);
+}
+
+/*
+ * .free callback for the flexfile LAYOUTRETURN private data.  Detaches the
+ * argument structure from @args (so a second call is a no-op), then
+ * releases the collected I/O errors, the per-device stats, the scratch page
+ * and finally the structure itself.
+ */
+static void
+ff_layout_free_layoutreturn(struct nfs4_xdr_opaque_data *args)
+{
+	struct nfs4_flexfile_layoutreturn_args *ff_args = args->data;
+
+	if (ff_args == NULL)
+		return;
+	args->data = NULL;
+
+	ff_layout_free_ds_ioerr(&ff_args->errors);
+	ff_layout_free_iostats_array(ff_args->devinfo, ff_args->num_dev);
+
+	put_page(ff_args->pages[0]);
+	kfree(ff_args);
+}
+
+/*
+ * Opaque-data callbacks installed on args->ld_private by
+ * ff_layout_prepare_layoutreturn().
+ */
+static const struct nfs4_xdr_opaque_ops layoutreturn_ops = {
+ .encode = ff_layout_encode_layoutreturn,
+ .free = ff_layout_free_layoutreturn,
+};
+
+/*
+ * .prepare_layoutreturn hook: allocate the flexfile LAYOUTRETURN private
+ * data, move up to FF_LAYOUTRETURN_MAXERR recorded DS errors for the
+ * returned range onto it, snapshot the per-mirror statistics under i_lock
+ * and hook everything up to args->ld_private.
+ *
+ * Returns 0 on success or -ENOMEM if either allocation fails.
+ */
+static int
+ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args)
+{
+	struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(args->layout);
+	struct nfs4_flexfile_layoutreturn_args *ff_args;
+
+	ff_args = kmalloc(sizeof(*ff_args), GFP_KERNEL);
+	if (ff_args == NULL)
+		return -ENOMEM;
+
+	ff_args->pages[0] = alloc_page(GFP_KERNEL);
+	if (ff_args->pages[0] == NULL) {
+		kfree(ff_args);
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&ff_args->errors);
+	ff_args->num_errors = ff_layout_fetch_ds_ioerr(args->layout,
+			&args->range, &ff_args->errors,
+			FF_LAYOUTRETURN_MAXERR);
+
+	spin_lock(&args->inode->i_lock);
+	ff_args->num_dev = ff_layout_mirror_prepare_stats(
+			&ff_layout->generic_hdr,
+			&ff_args->devinfo[0], ARRAY_SIZE(ff_args->devinfo));
+	spin_unlock(&args->inode->i_lock);
+
+	args->ld_private->ops = &layoutreturn_ops;
+	args->ld_private->data = ff_args;
+	return 0;
+}
+
+#ifdef CONFIG_NFS_V4_2
+/*
+ * Report the DS I/O errors recorded for @lseg's range to the server.
+ *
+ * Nothing is done when the server lacks NFS_CAP_LAYOUTERROR or when no
+ * errors are pending.  Otherwise the errors are copied into a temporary
+ * array and handed to nfs42_proc_layouterror() in batches of at most
+ * NFS42_LAYOUTERROR_MAX entries; sending stops at the first batch for which
+ * that call returns a negative value.  The fetched error list is always
+ * freed before returning.
+ */
+void
+ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_layout_hdr *lo = lseg->pls_layout;
+ struct nfs42_layout_error *errors;
+ LIST_HEAD(head);
+
+ if (!nfs_server_capable(lo->plh_inode, NFS_CAP_LAYOUTERROR))
+ return;
+ ff_layout_fetch_ds_ioerr(lo, &lseg->pls_range, &head, -1);
+ if (list_empty(&head))
+ return;
+
+ errors = kmalloc_array(NFS42_LAYOUTERROR_MAX,
+ sizeof(*errors), GFP_NOFS);
+ if (errors != NULL) {
+ const struct nfs4_ff_layout_ds_err *pos;
+ size_t n = 0;
+
+ list_for_each_entry(pos, &head, list) {
+ errors[n].offset = pos->offset;
+ errors[n].length = pos->length;
+ nfs4_stateid_copy(&errors[n].stateid, &pos->stateid);
+ errors[n].errors[0].dev_id = pos->deviceid;
+ errors[n].errors[0].status = pos->status;
+ errors[n].errors[0].opnum = pos->opnum;
+ n++;
+ /* keep filling until the batch is full or the list ends */
+ if (!list_is_last(&pos->list, &head) &&
+ n < NFS42_LAYOUTERROR_MAX)
+ continue;
+ if (nfs42_proc_layouterror(lseg, errors, n) < 0)
+ break;
+ n = 0;
+ }
+ kfree(errors);
+ }
+ ff_layout_free_ds_ioerr(&head);
+}
+#else
+/* Without CONFIG_NFS_V4_2 there is nothing to send. */
+void
+ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
+{
+}
+#endif
+
+/*
+ * Format the IPv4 address in @sap into @buf (at most @buflen bytes) and
+ * return snprintf()'s result.
+ */
+static int
+ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
+{
+ const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+
+ return snprintf(buf, buflen, "%pI4", &sin->sin_addr);
+}
+
+/*
+ * Format the IPv6 address in @sap into @buf (at most @buflen bytes)
+ * without a scope id, using the shorthand forms for the unspecified,
+ * loopback and v4-mapped addresses.  Returns snprintf()'s result.
+ */
+static size_t
+ff_layout_ntop6_noscopeid(const struct sockaddr *sap, char *buf,
+ const int buflen)
+{
+ const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+ const struct in6_addr *addr = &sin6->sin6_addr;
+
+ /*
+ * RFC 4291, Section 2.2.2
+ *
+ * Shorthanded ANY address
+ */
+ if (ipv6_addr_any(addr))
+ return snprintf(buf, buflen, "::");
+
+ /*
+ * RFC 4291, Section 2.2.2
+ *
+ * Shorthanded loopback address
+ */
+ if (ipv6_addr_loopback(addr))
+ return snprintf(buf, buflen, "::1");
+
+ /*
+ * RFC 4291, Section 2.2.3
+ *
+ * Special presentation address format for mapped v4
+ * addresses.
+ */
+ if (ipv6_addr_v4mapped(addr))
+ return snprintf(buf, buflen, "::ffff:%pI4",
+ &addr->s6_addr32[3]);
+
+ /*
+ * RFC 4291, Section 2.2.1
+ */
+ return snprintf(buf, buflen, "%pI6c", addr);
+}
+
+/*
+ * Encode the address @da as two XDR opaques: the netid ("tcp" or "tcp6")
+ * and the address string with the port appended as ".<hi>.<lo>".
+ *
+ * Nothing is encoded when the address cannot be formatted or the address
+ * family is neither AF_INET nor AF_INET6.
+ *
+ * NOTE(review): the xdr_reserve_space() results are passed on without a
+ * NULL check.
+ *
+ * Derived from rpc_sockaddr2uaddr
+ */
+static void
+ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da)
+{
+ struct sockaddr *sap = (struct sockaddr *)&da->da_addr;
+ char portbuf[RPCBIND_MAXUADDRPLEN];
+ char addrbuf[RPCBIND_MAXUADDRLEN];
+ char *netid;
+ unsigned short port;
+ int len, netid_len;
+ __be32 *p;
+
+ switch (sap->sa_family) {
+ case AF_INET:
+ if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0)
+ return;
+ port = ntohs(((struct sockaddr_in *)sap)->sin_port);
+ netid = "tcp";
+ netid_len = 3;
+ break;
+ case AF_INET6:
+ if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0)
+ return;
+ port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
+ netid = "tcp6";
+ netid_len = 4;
+ break;
+ default:
+ /* we only support tcp and tcp6 */
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ /* port is appended as its high and low byte in decimal */
+ snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff);
+ len = strlcat(addrbuf, portbuf, sizeof(addrbuf));
+
+ p = xdr_reserve_space(xdr, 4 + netid_len);
+ xdr_encode_opaque(p, netid, netid_len);
+
+ p = xdr_reserve_space(xdr, 4 + len);
+ xdr_encode_opaque(p, addrbuf, len);
+}
+
+/*
+ * Encode @t as 12 bytes: 64-bit seconds followed by 32-bit nanoseconds.
+ *
+ * NOTE(review): the xdr_reserve_space() result is used without a NULL
+ * check.
+ */
+static void
+ff_layout_encode_nfstime(struct xdr_stream *xdr,
+ ktime_t t)
+{
+ struct timespec64 ts;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 12);
+ ts = ktime_to_timespec64(t);
+ p = xdr_encode_hyper(p, ts.tv_sec);
+ *p++ = cpu_to_be32(ts.tv_nsec);
+}
+
+/*
+ * Encode one I/O latency record from @stat: five 64-bit counters
+ * (ops requested, bytes requested, ops completed, bytes completed,
+ * bytes not delivered) followed by the total busy time and the aggregate
+ * completion time as nfstime values.
+ */
+static void
+ff_layout_encode_io_latency(struct xdr_stream *xdr,
+ struct nfs4_ff_io_stat *stat)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 5 * 8);
+ p = xdr_encode_hyper(p, stat->ops_requested);
+ p = xdr_encode_hyper(p, stat->bytes_requested);
+ p = xdr_encode_hyper(p, stat->ops_completed);
+ p = xdr_encode_hyper(p, stat->bytes_completed);
+ p = xdr_encode_hyper(p, stat->bytes_not_delivered);
+ ff_layout_encode_nfstime(xdr, stat->total_busy_time);
+ ff_layout_encode_nfstime(xdr, stat->aggregate_completion_time);
+}
+
+/*
+ * Encode the flexfile layoutupdate for @mirror: the first address of its
+ * data server, its first filehandle, the read and write latency records
+ * (sampled under mirror->lock), the time elapsed since mirror->start_time
+ * and a trailing boolean that is always false.  @devinfo is not used.
+ */
+static void
+ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
+ const struct nfs42_layoutstat_devinfo *devinfo,
+ struct nfs4_ff_layout_mirror *mirror)
+{
+ struct nfs4_pnfs_ds_addr *da;
+ struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
+ struct nfs_fh *fh = &mirror->fh_versions[0];
+ __be32 *p;
+
+ da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
+ dprintk("%s: DS %s: encoding address %s\n",
+ __func__, ds->ds_remotestr, da->da_remotestr);
+ /* netaddr4 */
+ ff_layout_encode_netaddr(xdr, da);
+ /* nfs_fh4 */
+ p = xdr_reserve_space(xdr, 4 + fh->size);
+ xdr_encode_opaque(p, fh->data, fh->size);
+ /* ff_io_latency4 read */
+ spin_lock(&mirror->lock);
+ ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat);
+ /* ff_io_latency4 write */
+ ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat);
+ spin_unlock(&mirror->lock);
+ /* nfstime4 */
+ ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time));
+ /* bool */
+ p = xdr_reserve_space(xdr, 4);
+ *p = cpu_to_be32(false);
+}
+
+/*
+ * .encode callback for LAYOUTSTATS private data.  Reserves a 32-bit length
+ * word, encodes the layoutupdate for the mirror in opaque->data, then
+ * back-fills the length.  @args is not used.
+ */
+static void
+ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args,
+ const struct nfs4_xdr_opaque_data *opaque)
+{
+ struct nfs42_layoutstat_devinfo *devinfo = container_of(opaque,
+ struct nfs42_layoutstat_devinfo, ld_private);
+ __be32 *start;
+
+ /* layoutupdate length */
+ start = xdr_reserve_space(xdr, 4);
+ ff_layout_encode_ff_layoutupdate(xdr, devinfo, opaque->data);
+
+ /* bytes encoded after the length word (pointer difference is in 32-bit words) */
+ *start = cpu_to_be32((xdr->p - start - 1) * 4);
+}
+
+/*
+ * .free callback for LAYOUTSTATS private data: release the mirror that is
+ * stored in opaque->data.
+ */
+static void
+ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque)
+{
+	ff_layout_put_mirror(opaque->data);
+}
+
+/*
+ * Opaque-data callbacks installed on each devinfo->ld_private by
+ * ff_layout_mirror_prepare_stats().
+ */
+static const struct nfs4_xdr_opaque_ops layoutstat_ops = {
+ .encode = ff_layout_encode_layoutstats,
+ .free = ff_layout_free_layoutstats,
+};
+
+/*
+ * Fill up to @dev_limit entries of @devinfo with statistics snapshots of
+ * the mirrors on @lo's mirror list and return the number of entries filled.
+ *
+ * A mirror is skipped when it has no usable mirror_ds, when its
+ * NFS4_FF_MIRROR_STAT_AVAIL bit is not set (the bit is cleared as a side
+ * effect of the test), or when its refcount has already dropped to zero.
+ * Each reported mirror is pinned with an extra reference that is stored in
+ * devinfo->ld_private.data.
+ *
+ * Both callers in this file invoke this with the inode's i_lock held.
+ */
+static int
+ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
+ struct nfs42_layoutstat_devinfo *devinfo,
+ int dev_limit)
+{
+ struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_deviceid_node *dev;
+ int i = 0;
+
+ list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
+ if (i >= dev_limit)
+ break;
+ if (IS_ERR_OR_NULL(mirror->mirror_ds))
+ continue;
+ if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags))
+ continue;
+ /* mirror reference is dropped by ff_layout_free_layoutstats() */
+ if (!refcount_inc_not_zero(&mirror->ref))
+ continue;
+ dev = &mirror->mirror_ds->id_node;
+ memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
+ devinfo->offset = 0;
+ devinfo->length = NFS4_MAX_UINT64;
+ /* read the counters consistently under the mirror lock */
+ spin_lock(&mirror->lock);
+ devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
+ devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
+ devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
+ devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
+ spin_unlock(&mirror->lock);
+ devinfo->layout_type = LAYOUT_FLEX_FILES;
+ devinfo->ld_private.ops = &layoutstat_ops;
+ devinfo->ld_private.data = mirror;
+
+ devinfo++;
+ i++;
+ }
+ return i;
+}
+
+/*
+ * .prepare_layoutstats hook: allocate args->devinfo and fill it with the
+ * statistics of up to PNFS_LAYOUTSTATS_MAXDEV mirrors.
+ *
+ * Returns 0 on success, -ENOMEM if the array cannot be allocated, or
+ * -ENOENT (with args->devinfo freed and reset to NULL) when no mirror had
+ * statistics to report.
+ */
+static int
+ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
+{
+	struct pnfs_layout_hdr *lo;
+
+	/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
+	args->devinfo = kmalloc_array(PNFS_LAYOUTSTATS_MAXDEV,
+				      sizeof(*args->devinfo), GFP_NOIO);
+	if (args->devinfo == NULL)
+		return -ENOMEM;
+
+	spin_lock(&args->inode->i_lock);
+	lo = &FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout)->generic_hdr;
+	args->num_dev = ff_layout_mirror_prepare_stats(lo, &args->devinfo[0],
+						       PNFS_LAYOUTSTATS_MAXDEV);
+	spin_unlock(&args->inode->i_lock);
+
+	if (args->num_dev)
+		return 0;
+
+	kfree(args->devinfo);
+	args->devinfo = NULL;
+	return -ENOENT;
+}
+
+/*
+ * .set_layoutdriver hook: when NFSv4.2 support is built in, advertise
+ * NFS_CAP_LAYOUTSTATS on @server.  Always returns 0; @dummy is unused.
+ */
+static int
+ff_layout_set_layoutdriver(struct nfs_server *server,
+ const struct nfs_fh *dummy)
+{
+#if IS_ENABLED(CONFIG_NFS_V4_2)
+ server->caps |= NFS_CAP_LAYOUTSTATS;
+#endif
+ return 0;
+}
+
+/*
+ * Commit operations for the flexfile layout.  Only the DS-info setup and
+ * release and the commit_pagelist hook are flexfile specific; the
+ * remaining hooks are shared pnfs helpers.
+ */
+static const struct pnfs_commit_ops ff_layout_commit_ops = {
+ .setup_ds_info = ff_layout_setup_ds_info,
+ .release_ds_info = ff_layout_release_ds_info,
+ .mark_request_commit = pnfs_layout_mark_request_commit,
+ .clear_request_commit = pnfs_generic_clear_request_commit,
+ .scan_commit_lists = pnfs_generic_scan_commit_lists,
+ .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
+ .commit_pagelist = ff_layout_commit_pagelist,
+};
+
+/*
+ * Layout driver descriptor for LAYOUT_FLEX_FILES.  It is registered with
+ * the pnfs core in nfs4flexfilelayout_init() and unregistered in
+ * nfs4flexfilelayout_exit().
+ */
+static struct pnfs_layoutdriver_type flexfilelayout_type = {
+ .id = LAYOUT_FLEX_FILES,
+ .name = "LAYOUT_FLEX_FILES",
+ .owner = THIS_MODULE,
+ .flags = PNFS_LAYOUTGET_ON_OPEN,
+ .max_layoutget_response = 4096, /* 1 page or so... */
+ .set_layoutdriver = ff_layout_set_layoutdriver,
+ .alloc_layout_hdr = ff_layout_alloc_layout_hdr,
+ .free_layout_hdr = ff_layout_free_layout_hdr,
+ .alloc_lseg = ff_layout_alloc_lseg,
+ .free_lseg = ff_layout_free_lseg,
+ .add_lseg = ff_layout_add_lseg,
+ .pg_read_ops = &ff_layout_pg_read_ops,
+ .pg_write_ops = &ff_layout_pg_write_ops,
+ .get_ds_info = ff_layout_get_ds_info,
+ .free_deviceid_node = ff_layout_free_deviceid_node,
+ .read_pagelist = ff_layout_read_pagelist,
+ .write_pagelist = ff_layout_write_pagelist,
+ .alloc_deviceid_node = ff_layout_alloc_deviceid_node,
+ .prepare_layoutreturn = ff_layout_prepare_layoutreturn,
+ .sync = pnfs_nfs_generic_sync,
+ .prepare_layoutstats = ff_layout_prepare_layoutstats,
+};
+
+/*
+ * Module init: log a message and register the flexfile layout driver.
+ * Returns the result of pnfs_register_layoutdriver().
+ */
+static int __init nfs4flexfilelayout_init(void)
+{
+ printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
+ __func__);
+ return pnfs_register_layoutdriver(&flexfilelayout_type);
+}
+
+/* Module exit: log a message and unregister the flexfile layout driver. */
+static void __exit nfs4flexfilelayout_exit(void)
+{
+ printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
+ __func__);
+ pnfs_unregister_layoutdriver(&flexfilelayout_type);
+}
+
+/*
+ * Module metadata.
+ * NOTE(review): the trailing "4" in the alias presumably is the numeric
+ * value of LAYOUT_FLEX_FILES, used for on-demand loading - confirm.
+ */
+MODULE_ALIAS("nfs-layouttype4-4");
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("The NFSv4 flexfile layout driver");
+
+module_init(nfs4flexfilelayout_init);
+module_exit(nfs4flexfilelayout_exit);
+
+/* io_maxretrans is exposed as an unsigned short parameter with mode 0644 */
+module_param(io_maxretrans, ushort, 0644);
+MODULE_PARM_DESC(io_maxretrans, "The number of times the NFSv4.1 client "
+ "retries an I/O request before returning an error. ");
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
new file mode 100644
index 000000000..354a031c6
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -0,0 +1,226 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NFSv4 flexfile layout driver data structures.
+ *
+ * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
+ *
+ * Tao Peng <bergwolf@primarydata.com>
+ */
+
+#ifndef FS_NFS_NFS4FLEXFILELAYOUT_H
+#define FS_NFS_NFS4FLEXFILELAYOUT_H
+
+/*
+ * Bit flags; FF_FLAGS_NO_IO_THRU_MDS and FF_FLAGS_NO_READ_IO are tested
+ * against nfs4_ff_layout_segment.flags by the inline helpers in this header.
+ */
+#define FF_FLAGS_NO_LAYOUTCOMMIT 1
+#define FF_FLAGS_NO_IO_THRU_MDS 2
+#define FF_FLAGS_NO_READ_IO 4
+
+#include <linux/refcount.h>
+#include "../pnfs.h"
+
+/* XXX: Let's filter out insanely large mirror counts for now to avoid OOM
+ * due to network errors etc. */
+#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
+
+/* LAYOUTSTATS report interval in ms */
+#define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L)
+/* Number of devinfo[] slots in struct nfs4_flexfile_layoutreturn_args */
+#define FF_LAYOUTSTATS_MAXDEV 4
+
+/*
+ * One NFS version entry advertised for a data server, as decoded by
+ * nfs4_ff_alloc_deviceid_node().
+ */
+struct nfs4_ff_ds_version {
+ u32 version; /* NFS major version; the decoder accepts 3 or 4 */
+ u32 minor_version; /* 0 for v3, 0-2 for v4 */
+ u32 rsize; /* read size, capped at NFS_MAX_FILE_IO_SIZE */
+ u32 wsize; /* write size, capped at NFS_MAX_FILE_IO_SIZE */
+ bool tightly_coupled; /* taken verbatim from the device info */
+};
+
+/*
+ * Flexfile device: a generic deviceid node plus the decoded version list
+ * and the data server it refers to.
+ * chained in global deviceid hlist
+ */
+struct nfs4_ff_layout_ds {
+ struct nfs4_deviceid_node id_node;
+ u32 ds_versions_cnt; /* number of entries in ds_versions */
+ struct nfs4_ff_ds_version *ds_versions; /* kcalloc()ed array */
+ struct nfs4_pnfs_ds *ds; /* obtained from nfs4_pnfs_ds_add() */
+};
+
+/*
+ * One recorded data server I/O error, created by
+ * ff_layout_track_ds_error().
+ */
+struct nfs4_ff_layout_ds_err {
+ struct list_head list; /* linked in nfs4_flexfile_layout error_list */
+ u64 offset; /* start of the affected byte range */
+ u64 length; /* length of the affected byte range */
+ int status; /* non-zero status passed by the caller */
+ enum nfs_opnum4 opnum; /* operation number passed by the caller */
+ nfs4_stateid stateid; /* copy of the mirror's stateid */
+ struct nfs4_deviceid deviceid; /* copy of the mirror's device id */
+};
+
+/*
+ * Per-direction I/O counters.  ff_layout_encode_io_latency() sends the
+ * fields on the wire in exactly this order.
+ */
+struct nfs4_ff_io_stat {
+ __u64 ops_requested;
+ __u64 bytes_requested;
+ __u64 ops_completed;
+ __u64 bytes_completed;
+ __u64 bytes_not_delivered;
+ ktime_t total_busy_time;
+ ktime_t aggregate_completion_time;
+};
+
+/*
+ * NOTE(review): presumably tracks when the mirror became busy and how many
+ * operations are in flight; the users are outside this header - confirm.
+ */
+struct nfs4_ff_busy_timer {
+ ktime_t start_time;
+ atomic_t n_ops;
+};
+
+/* Statistics kept per mirror and per direction (read or write). */
+struct nfs4_ff_layoutstat {
+ struct nfs4_ff_io_stat io_stat;
+ struct nfs4_ff_busy_timer busy_timer;
+};
+
+/* One mirror of a flexfile layout. */
+struct nfs4_ff_layout_mirror {
+ struct pnfs_layout_hdr *layout;
+ struct list_head mirrors; /* entry in nfs4_flexfile_layout.mirrors */
+ u32 ds_count;
+ u32 efficiency;
+ struct nfs4_deviceid devid; /* key used to look up mirror_ds */
+ struct nfs4_ff_layout_ds *mirror_ds; /* may be NULL or an ERR_PTR */
+ u32 fh_versions_cnt;
+ struct nfs_fh *fh_versions; /* only entry 0 is currently used */
+ nfs4_stateid stateid;
+ const struct cred __rcu *ro_cred; /* used for IOMODE_READ */
+ const struct cred __rcu *rw_cred; /* used for all other iomodes */
+ refcount_t ref;
+ spinlock_t lock; /* held while reading read_stat/write_stat */
+ unsigned long flags; /* NFS4_FF_MIRROR_* bit numbers */
+ struct nfs4_ff_layoutstat read_stat;
+ struct nfs4_ff_layoutstat write_stat;
+ ktime_t start_time; /* base for the duration sent in layoutupdate */
+ u32 report_interval;
+};
+
+/* Bit number in nfs4_ff_layout_mirror.flags: statistics are available */
+#define NFS4_FF_MIRROR_STAT_AVAIL (0)
+
+/*
+ * Flexfile layout segment: the generic segment followed by a flexible
+ * array of mirror pointers.
+ */
+struct nfs4_ff_layout_segment {
+ struct pnfs_layout_segment generic_hdr;
+ u64 stripe_unit;
+ u32 flags; /* FF_FLAGS_* bits */
+ u32 mirror_array_cnt; /* number of entries in mirror_array */
+ struct nfs4_ff_layout_mirror *mirror_array[];
+};
+
+/* Flexfile layout header: the generic header plus per-layout state. */
+struct nfs4_flexfile_layout {
+ struct pnfs_layout_hdr generic_hdr;
+ struct pnfs_ds_commit_info commit_info;
+ struct list_head mirrors; /* nfs4_ff_layout_mirror */
+ struct list_head error_list; /* nfs4_ff_layout_ds_err */
+ ktime_t last_report_time; /* Layoutstat report times */
+};
+
+/* Private data attached to a LAYOUTRETURN by the flexfile driver. */
+struct nfs4_flexfile_layoutreturn_args {
+ struct list_head errors; /* nfs4_ff_layout_ds_err to report */
+ struct nfs42_layoutstat_devinfo devinfo[FF_LAYOUTSTATS_MAXDEV];
+ unsigned int num_errors; /* number of entries on errors */
+ unsigned int num_dev; /* number of valid devinfo[] entries */
+ struct page *pages[1]; /* buffer page the body is encoded into */
+};
+
+/* Return the flexfile layout that embeds the generic layout header @lo. */
+static inline struct nfs4_flexfile_layout *
+FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo)
+{
+ return container_of(lo, struct nfs4_flexfile_layout, generic_hdr);
+}
+
+/* Return the flexfile segment that embeds the generic segment @lseg. */
+static inline struct nfs4_ff_layout_segment *
+FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg)
+{
+ return container_of(lseg,
+ struct nfs4_ff_layout_segment,
+ generic_hdr);
+}
+
+/* Return the flexfile device that embeds the generic deviceid @node. */
+static inline struct nfs4_ff_layout_ds *
+FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node)
+{
+ return container_of(node, struct nfs4_ff_layout_ds, id_node);
+}
+
+/*
+ * Return mirror number @idx of @lseg, or NULL when @idx is outside the
+ * segment's mirror array.
+ */
+static inline struct nfs4_ff_layout_mirror *
+FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx)
+{
+	struct nfs4_ff_layout_segment *ffl = FF_LAYOUT_LSEG(lseg);
+
+	if (idx >= ffl->mirror_array_cnt)
+		return NULL;
+	return ffl->mirror_array[idx];
+}
+
+/*
+ * Return the deviceid node of mirror @idx of @lseg, or NULL when the
+ * mirror does not exist or has no valid device attached (mirror_ds is
+ * NULL or an error pointer).
+ */
+static inline struct nfs4_deviceid_node *
+FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
+{
+	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx);
+	struct nfs4_ff_layout_ds *ds;
+
+	if (mirror == NULL)
+		return NULL;
+
+	/* read mirror_ds once so the check and the use see the same value */
+	ds = mirror->mirror_ds;
+	return IS_ERR_OR_NULL(ds) ? NULL : &ds->id_node;
+}
+
+/* Return the number of mirrors in @lseg. */
+static inline u32
+FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg)
+{
+ return FF_LAYOUT_LSEG(lseg)->mirror_array_cnt;
+}
+
+/* True when @lseg has FF_FLAGS_NO_IO_THRU_MDS set. */
+static inline bool
+ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg)
+{
+ return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_IO_THRU_MDS;
+}
+
+/* True when @lseg has FF_FLAGS_NO_READ_IO set. */
+static inline bool
+ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg)
+{
+ return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO;
+}
+
+/*
+ * Return the NFS major version of the first version entry of @mirror's
+ * device.  The caller must ensure that mirror->mirror_ds is a valid
+ * pointer with at least one ds_versions entry; neither is checked here.
+ */
+static inline int
+nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror)
+{
+ return mirror->mirror_ds->ds_versions[0].version;
+}
+
+struct nfs4_ff_layout_ds *
+nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags);
+void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
+void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
+int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
+ struct nfs4_ff_layout_mirror *mirror, u64 offset,
+ u64 length, int status, enum nfs_opnum4 opnum,
+ gfp_t gfp_flags);
+void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg);
+int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head);
+void ff_layout_free_ds_ioerr(struct list_head *head);
+unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range,
+ struct list_head *head,
+ unsigned int maxnum);
+struct nfs_fh *
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror);
+void
+nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
+ nfs4_stateid *stateid);
+
+struct nfs4_pnfs_ds *
+nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
+ struct nfs4_ff_layout_mirror *mirror,
+ bool fail_return);
+
+struct rpc_clnt *
+nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
+ struct nfs_client *ds_clp,
+ struct inode *inode);
+const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
+ const struct pnfs_layout_range *range,
+ const struct cred *mdscred);
+bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg);
+bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg);
+
+#endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
new file mode 100644
index 000000000..1f1229710
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -0,0 +1,619 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device operations for the pnfs nfs4 flexfile layout driver.
+ *
+ * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
+ *
+ * Tao Peng <bergwolf@primarydata.com>
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/sunrpc/addr.h>
+
+#include "../internal.h"
+#include "../nfs4session.h"
+#include "flexfilelayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+static unsigned int dataserver_timeo = NFS_DEF_TCP_TIMEO;
+static unsigned int dataserver_retrans;
+
+static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
+
+/*
+ * Drop a reference on the deviceid node of @mirror_ds.  NULL and error
+ * pointers are ignored.
+ */
+void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
+{
+	if (IS_ERR_OR_NULL(mirror_ds))
+		return;
+	nfs4_put_deviceid_node(&mirror_ds->id_node);
+}
+
+/*
+ * Free a flexfile device: print its device id, drop the data server
+ * reference, free the version array and free the structure itself after an
+ * RCU grace period.
+ */
+void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
+{
+ nfs4_print_deviceid(&mirror_ds->id_node.deviceid);
+ nfs4_pnfs_ds_put(mirror_ds->ds);
+ kfree(mirror_ds->ds_versions);
+ kfree_rcu(mirror_ds, id_node.rcu);
+}
+
+/* Free every nfs4_pnfs_ds_addr that is still queued on @dsaddrs. */
+static void ff_layout_drain_ds_addrs(struct list_head *dsaddrs)
+{
+	struct nfs4_pnfs_ds_addr *da;
+
+	while (!list_empty(dsaddrs)) {
+		da = list_first_entry(dsaddrs, struct nfs4_pnfs_ds_addr,
+				      da_node);
+		list_del_init(&da->da_node);
+		kfree(da->da_remotestr);
+		kfree(da);
+	}
+}
+
+/*
+ * Decode opaque device data and construct new_ds using it.
+ *
+ * The device body consists of a multipath address list followed by a list
+ * of supported NFS versions.  Addresses that cannot be decoded are skipped;
+ * at least one usable address and at least one version entry are required,
+ * and every version must be v3, v4.0, v4.1 or v4.2.
+ *
+ * Returns the new device on success or NULL on any failure.  On failure
+ * every decoded address and all intermediate allocations are released.
+ */
+struct nfs4_ff_layout_ds *
+nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+			    gfp_t gfp_flags)
+{
+	struct xdr_stream stream;
+	struct xdr_buf buf;
+	struct page *scratch;
+	struct list_head dsaddrs;
+	struct nfs4_pnfs_ds_addr *da;
+	struct nfs4_ff_layout_ds *new_ds = NULL;
+	struct nfs4_ff_ds_version *ds_versions = NULL;
+	u32 mp_count;
+	u32 version_count;
+	__be32 *p;
+	int i, ret = -ENOMEM;
+
+	/* set up xdr stream */
+	scratch = alloc_page(gfp_flags);
+	if (!scratch)
+		goto out_err;
+
+	new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags);
+	if (!new_ds)
+		goto out_scratch;
+
+	nfs4_init_deviceid_node(&new_ds->id_node,
+				server,
+				&pdev->dev_id);
+	INIT_LIST_HEAD(&dsaddrs);
+
+	xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
+	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+	/* multipath count */
+	p = xdr_inline_decode(&stream, 4);
+	if (unlikely(!p))
+		goto out_err_drain_dsaddrs;
+	mp_count = be32_to_cpup(p);
+	dprintk("%s: multipath ds count %d\n", __func__, mp_count);
+
+	for (i = 0; i < mp_count; i++) {
+		/* multipath ds */
+		da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
+					    &stream, gfp_flags);
+		if (da)
+			list_add_tail(&da->da_node, &dsaddrs);
+	}
+	if (list_empty(&dsaddrs)) {
+		dprintk("%s: no suitable DS addresses found\n",
+			__func__);
+		ret = -ENOMEDIUM;
+		goto out_err_drain_dsaddrs;
+	}
+
+	/* version count */
+	p = xdr_inline_decode(&stream, 4);
+	if (unlikely(!p))
+		goto out_err_drain_dsaddrs;
+	version_count = be32_to_cpup(p);
+	dprintk("%s: version count %d\n", __func__, version_count);
+
+	/*
+	 * nfs4_ff_layout_ds_version() reads ds_versions[0] unconditionally,
+	 * so a device without any version entry cannot be used.
+	 */
+	if (version_count == 0) {
+		ret = -EINVAL;
+		goto out_err_drain_dsaddrs;
+	}
+
+	ds_versions = kcalloc(version_count,
+			      sizeof(struct nfs4_ff_ds_version),
+			      gfp_flags);
+	/* dsaddrs is non-empty here, so it must be drained on failure too */
+	if (!ds_versions)
+		goto out_err_drain_dsaddrs;
+
+	for (i = 0; i < version_count; i++) {
+		/* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) +
+		 * tightly_coupled(4) */
+		p = xdr_inline_decode(&stream, 20);
+		if (unlikely(!p))
+			goto out_err_drain_dsaddrs;
+		ds_versions[i].version = be32_to_cpup(p++);
+		ds_versions[i].minor_version = be32_to_cpup(p++);
+		ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL);
+		ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL);
+		ds_versions[i].tightly_coupled = be32_to_cpup(p);
+
+		if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE)
+			ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE;
+		if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE)
+			ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE;
+
+		/*
+		 * check for valid major/minor combination.
+		 * currently we support dataserver which talk:
+		 *   v3, v4.0, v4.1, v4.2
+		 */
+		if (!((ds_versions[i].version == 3 && ds_versions[i].minor_version == 0) ||
+			(ds_versions[i].version == 4 && ds_versions[i].minor_version < 3))) {
+			dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__,
+				i, ds_versions[i].version,
+				ds_versions[i].minor_version);
+			ret = -EPROTONOSUPPORT;
+			goto out_err_drain_dsaddrs;
+		}
+
+		dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n",
+			__func__, i, ds_versions[i].version,
+			ds_versions[i].minor_version,
+			ds_versions[i].rsize,
+			ds_versions[i].wsize,
+			ds_versions[i].tightly_coupled);
+	}
+
+	new_ds->ds_versions = ds_versions;
+	new_ds->ds_versions_cnt = version_count;
+
+	new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
+	if (!new_ds->ds)
+		goto out_err_drain_dsaddrs;
+
+	/* If DS was already in cache, free ds addrs */
+	ff_layout_drain_ds_addrs(&dsaddrs);
+
+	__free_page(scratch);
+	return new_ds;
+
+out_err_drain_dsaddrs:
+	ff_layout_drain_ds_addrs(&dsaddrs);
+
+	/* kfree(NULL) is a no-op, so this is safe before the allocation */
+	kfree(ds_versions);
+out_scratch:
+	__free_page(scratch);
+out_err:
+	kfree(new_ds);
+
+	dprintk("%s ERROR: returning %d\n", __func__, ret);
+	return NULL;
+}
+
+/*
+ * Grow the byte range of @err so that it also covers
+ * [@offset, @offset + @length).
+ */
+static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
+			    u64 offset, u64 length)
+{
+	u64 old_end = pnfs_end_offset(err->offset, err->length);
+	u64 new_end = pnfs_end_offset(offset, length);
+	u64 start = min_t(u64, err->offset, offset);
+
+	err->offset = start;
+	err->length = max_t(u64, old_end, new_end) - start;
+}
+
+/*
+ * Three-way comparison of two recorded DS errors, used to keep the error
+ * list sorted.  Keys, in order: opnum, status, stateid bytes, device id
+ * bytes, byte range.  Two entries that agree on everything but the range
+ * compare equal (0) when their ranges overlap or touch.
+ */
+static int
+ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1,
+		const struct nfs4_ff_layout_ds_err *e2)
+{
+	int cmp;
+
+	if (e1->opnum < e2->opnum)
+		return -1;
+	if (e1->opnum > e2->opnum)
+		return 1;
+
+	if (e1->status < e2->status)
+		return -1;
+	if (e1->status > e2->status)
+		return 1;
+
+	cmp = memcmp(e1->stateid.data, e2->stateid.data,
+			sizeof(e1->stateid.data));
+	if (cmp == 0)
+		cmp = memcmp(&e1->deviceid, &e2->deviceid,
+				sizeof(e1->deviceid));
+	if (cmp != 0)
+		return cmp;
+
+	/* e1 ends strictly before e2 starts */
+	if (pnfs_end_offset(e1->offset, e1->length) < e2->offset)
+		return -1;
+	/* e1 starts strictly after e2 ends */
+	if (e1->offset > pnfs_end_offset(e2->offset, e2->length))
+		return 1;
+	/* If ranges overlap or are contiguous, they are the same */
+	return 0;
+}
+
+/*
+ * Insert @dserr into the sorted error list of @flo.
+ *
+ * If an existing entry compares equal under ff_ds_error_match(), @dserr is
+ * widened to cover that entry's range, takes its place in the list and the
+ * old entry is freed.  Otherwise @dserr is inserted in front of the first
+ * entry that sorts after it, or at the tail.
+ *
+ * The only caller in this file holds the inode's i_lock.
+ */
+static void
+ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
+ struct nfs4_ff_layout_ds_err *dserr)
+{
+ struct nfs4_ff_layout_ds_err *err, *tmp;
+ struct list_head *head = &flo->error_list;
+ int match;
+
+ /* Do insertion sort w/ merges */
+ list_for_each_entry_safe(err, tmp, &flo->error_list, list) {
+ match = ff_ds_error_match(err, dserr);
+ if (match < 0)
+ continue;
+ if (match > 0) {
+ /* Add entry "dserr" _before_ entry "err" */
+ head = &err->list;
+ break;
+ }
+ /* Entries match, so merge "err" into "dserr" */
+ extend_ds_error(dserr, err->offset, err->length);
+ list_replace(&err->list, &dserr->list);
+ kfree(err);
+ return;
+ }
+
+ /* list_add_tail() on "head" inserts immediately before it */
+ list_add_tail(&dserr->list, head);
+}
+
+/*
+ * Record a data server I/O error against @flo.
+ *
+ * Returns 0 when @status is 0 (nothing to record) or when the error was
+ * queued, -EINVAL when @mirror has no valid device, and -ENOMEM when the
+ * error record cannot be allocated.  The record copies the mirror's stateid
+ * and device id and is inserted under the inode's i_lock.
+ */
+int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
+ struct nfs4_ff_layout_mirror *mirror, u64 offset,
+ u64 length, int status, enum nfs_opnum4 opnum,
+ gfp_t gfp_flags)
+{
+ struct nfs4_ff_layout_ds_err *dserr;
+
+ if (status == 0)
+ return 0;
+
+ if (IS_ERR_OR_NULL(mirror->mirror_ds))
+ return -EINVAL;
+
+ dserr = kmalloc(sizeof(*dserr), gfp_flags);
+ if (!dserr)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&dserr->list);
+ dserr->offset = offset;
+ dserr->length = length;
+ dserr->status = status;
+ dserr->opnum = opnum;
+ nfs4_stateid_copy(&dserr->stateid, &mirror->stateid);
+ memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid,
+ NFS4_DEVICEID4_SIZE);
+
+ spin_lock(&flo->generic_hdr.plh_inode->i_lock);
+ ff_layout_add_ds_error_locked(flo, dserr);
+ spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
+ return 0;
+}
+
+/*
+ * Take a reference on the mirror's credential for the given iomode
+ * (ro_cred for IOMODE_READ, rw_cred otherwise).  Returns NULL if the mirror
+ * has no credential set.  If get_cred_rcu() fails, the pointer is re-read
+ * and the attempt repeated.
+ */
+static const struct cred *
+ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
+{
+	const struct cred *cred, __rcu **pcred;
+
+	pcred = (iomode == IOMODE_READ) ? &mirror->ro_cred : &mirror->rw_cred;
+
+	rcu_read_lock();
+	for (;;) {
+		cred = rcu_dereference(*pcred);
+		if (!cred)
+			break;
+
+		cred = get_cred_rcu(cred);
+		if (cred)
+			break;
+	}
+	rcu_read_unlock();
+
+	return cred;
+}
+
+/*
+ * Return the filehandle to use with the mirror's data server.
+ * FIXME: only the first entry of fh_versions is ever used.
+ */
+struct nfs_fh *
+nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror)
+{
+	struct nfs_fh *fh = &mirror->fh_versions[0];
+
+	return fh;
+}
+
+/*
+ * Copy the mirror's stateid into @stateid when the DS version reported by
+ * nfs4_ff_layout_ds_version() is 4; otherwise leave @stateid untouched.
+ */
+void
+nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
+				 nfs4_stateid *stateid)
+{
+	if (nfs4_ff_layout_ds_version(mirror) != 4)
+		return;
+
+	nfs4_stateid_copy(stateid, &mirror->stateid);
+}
+
+/*
+ * Make sure mirror->mirror_ds is populated.
+ *
+ * If it is still NULL, look up the deviceid node and install the result
+ * (or ERR_PTR(-ENODEV) when the lookup fails) with cmpxchg().  Returns
+ * true when mirror->mirror_ds ends up as a non-error pointer, false when
+ * @mirror is NULL or mirror_ds is an error pointer.
+ */
+static bool
+ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo,
+			 struct nfs4_ff_layout_mirror *mirror)
+{
+	if (!mirror)
+		return false;
+
+	if (!mirror->mirror_ds) {
+		struct nfs4_ff_layout_ds *new_ds = ERR_PTR(-ENODEV);
+		struct nfs4_deviceid_node *devnode;
+
+		devnode = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode),
+						 &mirror->devid,
+						 lo->plh_lc_cred, GFP_KERNEL);
+		if (devnode)
+			new_ds = FF_LAYOUT_MIRROR_DS(devnode);
+
+		/*
+		 * Somebody else installed a value first: drop the reference
+		 * we obtained, if we obtained one.
+		 */
+		if (cmpxchg(&mirror->mirror_ds, NULL, new_ds) != NULL &&
+		    new_ds != ERR_PTR(-ENODEV))
+			nfs4_put_deviceid_node(devnode);
+	}
+
+	return !IS_ERR(mirror->mirror_ds);
+}
+
+/**
+ * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
+ * @lseg: the layout segment we're operating on
+ * @mirror: layout mirror describing the DS to use
+ * @fail_return: return layout on connect failure?
+ *
+ * Try to prepare a DS connection to accept an RPC call. This involves
+ * selecting a mirror to use and connecting the client to it if it's not
+ * already connected.
+ *
+ * Since we only need a single functioning mirror to satisfy a read, we don't
+ * want to return the layout if there is one. For writes though, any down
+ * mirror should result in a LAYOUTRETURN. @fail_return is how we distinguish
+ * between the two cases.
+ *
+ * After a successful connect, the rsize and wsize stored in ds_versions[0]
+ * are capped at nfs_block_size(rpc_max_payload(...)) for the DS client.
+ *
+ * On failure (the mirror's mirror_ds could not be initialised, or
+ * nfs4_pnfs_ds_connect() returned non-zero) an NFS4ERR_NXIO/OP_ILLEGAL error
+ * covering the segment's range is tracked, ff_layout_send_layouterror() is
+ * called, and the layout is marked for return when @fail_return is set or
+ * ff_layout_has_available_ds() reports no usable DS.
+ *
+ * Returns a pointer to a connected DS object on success or NULL on failure.
+ */
+struct nfs4_pnfs_ds *
+nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
+ struct nfs4_ff_layout_mirror *mirror,
+ bool fail_return)
+{
+ struct nfs4_pnfs_ds *ds = NULL;
+ struct inode *ino = lseg->pls_layout->plh_inode;
+ struct nfs_server *s = NFS_SERVER(ino);
+ unsigned int max_payload;
+ int status;
+
+ if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror))
+ goto noconnect;
+
+ ds = mirror->mirror_ds->ds;
+ if (READ_ONCE(ds->ds_clp)) /* ds_clp already set: skip the connect */
+ goto out;
+ /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
+ smp_rmb();
+
+ /* FIXME: For now we assume the server sent only one version of NFS
+ * to use for the DS.
+ */
+ status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node,
+ dataserver_timeo, dataserver_retrans,
+ mirror->mirror_ds->ds_versions[0].version,
+ mirror->mirror_ds->ds_versions[0].minor_version);
+
+ /* connect success, check rsize/wsize limit */
+ if (!status) {
+ max_payload =
+ nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
+ NULL);
+ if (mirror->mirror_ds->ds_versions[0].rsize > max_payload)
+ mirror->mirror_ds->ds_versions[0].rsize = max_payload;
+ if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
+ mirror->mirror_ds->ds_versions[0].wsize = max_payload;
+ goto out;
+ }
+noconnect:
+ ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
+ mirror, lseg->pls_range.offset,
+ lseg->pls_range.length, NFS4ERR_NXIO,
+ OP_ILLEGAL, GFP_NOIO); /* return value is deliberately not checked here */
+ ff_layout_send_layouterror(lseg);
+ if (fail_return || !ff_layout_has_available_ds(lseg))
+ pnfs_error_mark_layout_for_return(ino, lseg);
+ ds = NULL; /* failure path always reports "no DS" */
+out:
+ return ds;
+}
+
+/*
+ * Pick the credential to use for I/O to a data server and return it with a
+ * reference held.
+ *
+ * For a non-NULL mirror whose ds_versions[0] is not tightly coupled, the
+ * mirror's own credential for range->iomode is preferred.  In every other
+ * case, or when the mirror has no such credential, a reference to @mdscred
+ * is returned instead.
+ */
+const struct cred *
+ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
+		      const struct pnfs_layout_range *range,
+		      const struct cred *mdscred)
+{
+	const struct cred *cred = NULL;
+
+	if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled)
+		cred = ff_layout_get_mirror_cred(mirror, range->iomode);
+
+	if (!cred)
+		cred = get_cred(mdscred);
+
+	return cred;
+}
+
+/**
+ * nfs4_ff_find_or_create_ds_client - Find or create a DS rpc client
+ * @mirror: pointer to the mirror
+ * @ds_clp: nfs_client for the DS
+ * @inode: pointer to inode
+ *
+ * Find or create a DS rpc client with the MDS server rpc client auth flavor
+ * in the nfs_client cl_ds_clients list.
+ *
+ * Only DS versions 3 and 4 (as recorded in ds_versions[0]) are handled;
+ * any other version hits BUG().
+ */
+struct rpc_clnt *
+nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
+				 struct nfs_client *ds_clp, struct inode *inode)
+{
+	/* For NFSv3 DS, flavor is set when creating DS connections */
+	if (mirror->mirror_ds->ds_versions[0].version == 3)
+		return ds_clp->cl_rpcclient;
+
+	if (mirror->mirror_ds->ds_versions[0].version == 4)
+		return nfs4_find_or_create_ds_client(ds_clp, inode);
+
+	BUG();
+}
+
+/*
+ * Unlink and free every nfs4_ff_layout_ds_err on the list at @head,
+ * leaving the list empty.
+ */
+void ff_layout_free_ds_ioerr(struct list_head *head)
+{
+	struct nfs4_ff_layout_ds_err *victim;
+
+	for (;;) {
+		if (list_empty(head))
+			break;
+		victim = list_first_entry(head, struct nfs4_ff_layout_ds_err,
+					  list);
+		list_del(&victim->list);
+		kfree(victim);
+	}
+}
+
+/*
+ * XDR-encode every DS error record on @head into @xdr.
+ *
+ * Each record is written as: offset, length, stateid, an array count of 1,
+ * deviceid, status, opnum.  Returns 0 on success or -ENOBUFS as soon as
+ * space for a record cannot be reserved.
+ *
+ * Called with inode i_lock held.
+ */
+int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head)
+{
+	struct nfs4_ff_layout_ds_err *err;
+	__be32 *p;
+
+	list_for_each_entry(err, head, list) {
+		/*
+		 * offset(8) + length(8) + stateid + array length(4)
+		 * + deviceid + status(4) + opnum(4)
+		 */
+		p = xdr_reserve_space(xdr,
+				      8 + 8 + NFS4_STATEID_SIZE + 4 +
+				      NFS4_DEVICEID4_SIZE + 4 + 4);
+		if (unlikely(p == NULL))
+			return -ENOBUFS;
+
+		p = xdr_encode_hyper(p, err->offset);
+		p = xdr_encode_hyper(p, err->length);
+		p = xdr_encode_opaque_fixed(p, &err->stateid,
+					    NFS4_STATEID_SIZE);
+		/* The error array always carries exactly one element */
+		*p++ = cpu_to_be32(1);
+		p = xdr_encode_opaque_fixed(p, &err->deviceid,
+					    NFS4_DEVICEID4_SIZE);
+		*p++ = cpu_to_be32(err->status);
+		*p++ = cpu_to_be32(err->opnum);
+
+		dprintk("%s: offset %llu length %llu status %d op %d\n",
+			__func__, err->offset, err->length, err->status,
+			err->opnum);
+	}
+
+	return 0;
+}
+
+/*
+ * Move up to @maxnum error records that intersect @range from the layout's
+ * error_list onto @head, under the inode's i_lock.  Scanning stops at the
+ * first intersecting record found once the budget is exhausted.
+ * Returns the number of records moved.
+ */
+static
+unsigned int do_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
+				      const struct pnfs_layout_range *range,
+				      struct list_head *head,
+				      unsigned int maxnum)
+{
+	struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo);
+	struct inode *inode = lo->plh_inode;
+	struct nfs4_ff_layout_ds_err *cur, *next;
+	unsigned int moved = 0;
+
+	spin_lock(&inode->i_lock);
+	list_for_each_entry_safe(cur, next, &flo->error_list, list) {
+		u64 cur_end = pnfs_end_offset(cur->offset, cur->length);
+		u64 range_end = pnfs_end_offset(range->offset, range->length);
+
+		if (!pnfs_is_range_intersecting(cur->offset, cur_end,
+						range->offset, range_end))
+			continue;
+		if (maxnum == 0)
+			break;
+		list_move(&cur->list, head);
+		maxnum--;
+		moved++;
+	}
+	spin_unlock(&inode->i_lock);
+
+	return moved;
+}
+
+/*
+ * Collect up to @maxnum error records intersecting @range onto @head and
+ * return how many were collected.  When the limit was reached, every
+ * remaining intersecting record is pulled off the layout and freed.
+ */
+unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
+				      const struct pnfs_layout_range *range,
+				      struct list_head *head,
+				      unsigned int maxnum)
+{
+	unsigned int fetched = do_layout_fetch_ds_ioerr(lo, range, head, maxnum);
+	LIST_HEAD(discard);
+
+	if (fetched != maxnum)
+		return fetched;
+
+	/* Limit hit: drain and drop whatever still matches the range */
+	do_layout_fetch_ds_ioerr(lo, range, &discard, -1);
+	ff_layout_free_ds_ioerr(&discard);
+	return fetched;
+}
+
+/*
+ * A read layout is usable if at least one mirror either has not had its
+ * mirror_ds initialised yet, or has a deviceid that is not flagged
+ * unavailable.  NULL mirrors and mirrors with an error mirror_ds are
+ * skipped.
+ */
+static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+	u32 idx;
+
+	for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
+		mirror = FF_LAYOUT_COMP(lseg, idx);
+		if (!mirror)
+			continue;
+		if (!mirror->mirror_ds)
+			return true;
+		if (IS_ERR(mirror->mirror_ds))
+			continue;
+		if (!nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * A read/write layout is usable only if it has at least one mirror and no
+ * mirror is NULL, carries an error mirror_ds, or has a deviceid flagged
+ * unavailable.  Mirrors whose mirror_ds is still NULL are not counted
+ * against the layout.
+ */
+static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+	struct nfs4_ff_layout_mirror *mirror;
+	u32 idx;
+
+	for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
+		mirror = FF_LAYOUT_COMP(lseg, idx);
+		if (!mirror)
+			return false;
+		if (IS_ERR(mirror->mirror_ds))
+			return false;
+		if (mirror->mirror_ds &&
+		    nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node))
+			return false;
+	}
+
+	return FF_LAYOUT_MIRROR_COUNT(lseg) != 0;
+}
+
+/*
+ * Dispatch on the segment's iomode: a READ layout needs one usable mirror,
+ * any other iomode is checked with the RW rule (all mirrors usable).
+ */
+static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+	if (lseg->pls_range.iomode != IOMODE_READ)
+		return ff_rw_layout_has_available_ds(lseg);
+
+	return ff_read_layout_has_available_ds(lseg);
+}
+
+/*
+ * True when ff_layout_no_fallback_to_mds() holds for the segment, or when
+ * the segment still has an available data server.
+ */
+bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg)
+{
+	if (ff_layout_no_fallback_to_mds(lseg))
+		return true;
+
+	return ff_layout_has_available_ds(lseg);
+}
+
+/*
+ * True only for an IOMODE_RW segment for which ff_layout_no_read_on_rw()
+ * holds.
+ */
+bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg)
+{
+	if (lseg->pls_range.iomode != IOMODE_RW)
+		return false;
+
+	return ff_layout_no_read_on_rw(lseg);
+}
+/*
+ * Module parameters (mode 0644).  Both values are handed to
+ * nfs4_pnfs_ds_connect() by nfs4_ff_layout_prepare_ds().
+ */
+module_param(dataserver_retrans, uint, 0644);
+MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
+ "retries a request before it attempts further "
+ " recovery action.");
+module_param(dataserver_timeo, uint, 0644);
+MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
+ "NFSv4.1 client waits for a response from a "
+ " data server before it retries an NFS request.");
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
new file mode 100644
index 000000000..d60c086c6
--- /dev/null
+++ b/fs/nfs/fs_context.c
@@ -0,0 +1,1520 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/fs_context.c
+ *
+ * Copyright (C) 1992 Rick Sladkey
+ * Conversion to new mount api Copyright (C) David Howells
+ *
+ * NFS mount handling.
+ *
+ * Split from fs/nfs/super.c by David Howells <dhowells@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include "nfs.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_MOUNT
+
+#if IS_ENABLED(CONFIG_NFS_V3)
+#define NFS_DEFAULT_VERSION 3
+#else
+#define NFS_DEFAULT_VERSION 2
+#endif
+
+#define NFS_MAX_CONNECTIONS 16
+/*
+ * Token for each mount option listed in nfs_fs_parameters[].  fs_parse()
+ * hands one of these back to nfs_fs_context_parse_param(), which switches
+ * on it.  Opt_bg, Opt_fg, Opt_intr and Opt_retry have table entries but no
+ * case in that switch, so they are accepted without changing the context.
+ */
+enum nfs_param {
+ Opt_ac,
+ Opt_acdirmax,
+ Opt_acdirmin,
+ Opt_acl,
+ Opt_acregmax,
+ Opt_acregmin,
+ Opt_actimeo,
+ Opt_addr,
+ Opt_bg,
+ Opt_bsize,
+ Opt_clientaddr,
+ Opt_cto,
+ Opt_fg,
+ Opt_fscache,
+ Opt_fscache_flag,
+ Opt_hard,
+ Opt_intr,
+ Opt_local_lock,
+ Opt_lock,
+ Opt_lookupcache,
+ Opt_migration,
+ Opt_minorversion,
+ Opt_mountaddr,
+ Opt_mounthost,
+ Opt_mountport,
+ Opt_mountproto,
+ Opt_mountvers,
+ Opt_namelen,
+ Opt_nconnect,
+ Opt_port,
+ Opt_posix,
+ Opt_proto,
+ Opt_rdirplus,
+ Opt_rdma,
+ Opt_resvport,
+ Opt_retrans,
+ Opt_retry,
+ Opt_rsize,
+ Opt_sec,
+ Opt_sharecache,
+ Opt_sloppy,
+ Opt_soft,
+ Opt_softerr,
+ Opt_softreval,
+ Opt_source,
+ Opt_tcp,
+ Opt_timeo,
+ Opt_udp,
+ Opt_v, /* shared by the "v2".."v4.2" flag spellings */
+ Opt_vers, /* shared by "vers=" and "nfsvers=" */
+ Opt_wsize,
+};
+/*
+ * Keywords accepted by "local_lock=".  The matched value reaches
+ * nfs_fs_context_parse_param() in result.uint_32.
+ */
+enum {
+ Opt_local_lock_all,
+ Opt_local_lock_flock,
+ Opt_local_lock_none,
+ Opt_local_lock_posix,
+};
+
+static const struct constant_table nfs_param_enums_local_lock[] = {
+ { "all", Opt_local_lock_all },
+ { "flock", Opt_local_lock_flock },
+ { "posix", Opt_local_lock_posix },
+ { "none", Opt_local_lock_none },
+ {}
+};
+/*
+ * Keywords accepted by "lookupcache=".  "pos" and "positive" are two
+ * spellings of the same value.
+ */
+enum {
+ Opt_lookupcache_all,
+ Opt_lookupcache_none,
+ Opt_lookupcache_positive,
+};
+
+static const struct constant_table nfs_param_enums_lookupcache[] = {
+ { "all", Opt_lookupcache_all },
+ { "none", Opt_lookupcache_none },
+ { "pos", Opt_lookupcache_positive },
+ { "positive", Opt_lookupcache_positive },
+ {}
+};
+
+/*
+ * Mount option table handed to fs_parse() by nfs_fs_context_parse_param().
+ *
+ * Entries declared with fsparam_flag_no() also accept a "no" prefix; the
+ * handler then sees result.negated set.  "softreval" is declared that way
+ * because its handler distinguishes the negated form: with plain
+ * fsparam_flag() the "nosoftreval" spelling could never be matched and the
+ * handler's negated branch was unreachable.
+ *
+ * "fsc" appears twice on purpose: once as a flag ("fsc"/"nofsc") and once
+ * as a string ("fsc=<id>").
+ */
+static const struct fs_parameter_spec nfs_fs_parameters[] = {
+ fsparam_flag_no("ac", Opt_ac),
+ fsparam_u32 ("acdirmax", Opt_acdirmax),
+ fsparam_u32 ("acdirmin", Opt_acdirmin),
+ fsparam_flag_no("acl", Opt_acl),
+ fsparam_u32 ("acregmax", Opt_acregmax),
+ fsparam_u32 ("acregmin", Opt_acregmin),
+ fsparam_u32 ("actimeo", Opt_actimeo),
+ fsparam_string("addr", Opt_addr),
+ fsparam_flag ("bg", Opt_bg),
+ fsparam_u32 ("bsize", Opt_bsize),
+ fsparam_string("clientaddr", Opt_clientaddr),
+ fsparam_flag_no("cto", Opt_cto),
+ fsparam_flag ("fg", Opt_fg),
+ fsparam_flag_no("fsc", Opt_fscache_flag),
+ fsparam_string("fsc", Opt_fscache),
+ fsparam_flag ("hard", Opt_hard),
+ __fsparam(NULL, "intr", Opt_intr,
+ fs_param_neg_with_no|fs_param_deprecated, NULL),
+ fsparam_enum ("local_lock", Opt_local_lock, nfs_param_enums_local_lock),
+ fsparam_flag_no("lock", Opt_lock),
+ fsparam_enum ("lookupcache", Opt_lookupcache, nfs_param_enums_lookupcache),
+ fsparam_flag_no("migration", Opt_migration),
+ fsparam_u32 ("minorversion", Opt_minorversion),
+ fsparam_string("mountaddr", Opt_mountaddr),
+ fsparam_string("mounthost", Opt_mounthost),
+ fsparam_u32 ("mountport", Opt_mountport),
+ fsparam_string("mountproto", Opt_mountproto),
+ fsparam_u32 ("mountvers", Opt_mountvers),
+ fsparam_u32 ("namlen", Opt_namelen),
+ fsparam_u32 ("nconnect", Opt_nconnect),
+ fsparam_string("nfsvers", Opt_vers),
+ fsparam_u32 ("port", Opt_port),
+ fsparam_flag_no("posix", Opt_posix),
+ fsparam_string("proto", Opt_proto),
+ fsparam_flag_no("rdirplus", Opt_rdirplus),
+ fsparam_flag ("rdma", Opt_rdma),
+ fsparam_flag_no("resvport", Opt_resvport),
+ fsparam_u32 ("retrans", Opt_retrans),
+ fsparam_string("retry", Opt_retry),
+ fsparam_u32 ("rsize", Opt_rsize),
+ fsparam_string("sec", Opt_sec),
+ fsparam_flag_no("sharecache", Opt_sharecache),
+ fsparam_flag ("sloppy", Opt_sloppy),
+ fsparam_flag ("soft", Opt_soft),
+ fsparam_flag ("softerr", Opt_softerr),
+ fsparam_flag_no("softreval", Opt_softreval),
+ fsparam_string("source", Opt_source),
+ fsparam_flag ("tcp", Opt_tcp),
+ fsparam_u32 ("timeo", Opt_timeo),
+ fsparam_flag ("udp", Opt_udp),
+ fsparam_flag ("v2", Opt_v),
+ fsparam_flag ("v3", Opt_v),
+ fsparam_flag ("v4", Opt_v),
+ fsparam_flag ("v4.0", Opt_v),
+ fsparam_flag ("v4.1", Opt_v),
+ fsparam_flag ("v4.2", Opt_v),
+ fsparam_string("vers", Opt_vers),
+ fsparam_u32 ("wsize", Opt_wsize),
+ {}
+};
+/*
+ * Version strings understood by nfs_parse_version_string(), which looks
+ * them up with lookup_constant().
+ */
+enum {
+ Opt_vers_2,
+ Opt_vers_3,
+ Opt_vers_4,
+ Opt_vers_4_0,
+ Opt_vers_4_1,
+ Opt_vers_4_2,
+};
+
+static const struct constant_table nfs_vers_tokens[] = {
+ { "2", Opt_vers_2 },
+ { "3", Opt_vers_3 },
+ { "4", Opt_vers_4 },
+ { "4.0", Opt_vers_4_0 },
+ { "4.1", Opt_vers_4_1 },
+ { "4.2", Opt_vers_4_2 },
+ {}
+};
+/*
+ * Transport names looked up for the "proto=" and "mountproto=" options in
+ * nfs_fs_context_parse_param().
+ */
+enum {
+ Opt_xprt_rdma,
+ Opt_xprt_rdma6,
+ Opt_xprt_tcp,
+ Opt_xprt_tcp6,
+ Opt_xprt_udp,
+ Opt_xprt_udp6,
+ nr__Opt_xprt
+};
+
+static const struct constant_table nfs_xprt_protocol_tokens[] = {
+ { "rdma", Opt_xprt_rdma },
+ { "rdma6", Opt_xprt_rdma6 },
+ { "tcp", Opt_xprt_tcp },
+ { "tcp6", Opt_xprt_tcp6 },
+ { "udp", Opt_xprt_udp },
+ { "udp6", Opt_xprt_udp6 },
+ {}
+};
+/*
+ * Security flavor names accepted in a "sec=" list; translated to RPC
+ * pseudoflavors by nfs_parse_security_flavors().  "none" and "null" map to
+ * the same token.
+ */
+enum {
+ Opt_sec_krb5,
+ Opt_sec_krb5i,
+ Opt_sec_krb5p,
+ Opt_sec_lkey,
+ Opt_sec_lkeyi,
+ Opt_sec_lkeyp,
+ Opt_sec_none,
+ Opt_sec_spkm,
+ Opt_sec_spkmi,
+ Opt_sec_spkmp,
+ Opt_sec_sys,
+ nr__Opt_sec
+};
+
+static const struct constant_table nfs_secflavor_tokens[] = {
+ { "krb5", Opt_sec_krb5 },
+ { "krb5i", Opt_sec_krb5i },
+ { "krb5p", Opt_sec_krb5p },
+ { "lkey", Opt_sec_lkey },
+ { "lkeyi", Opt_sec_lkeyi },
+ { "lkeyp", Opt_sec_lkeyp },
+ { "none", Opt_sec_none },
+ { "null", Opt_sec_none },
+ { "spkm3", Opt_sec_spkm },
+ { "spkm3i", Opt_sec_spkmi },
+ { "spkm3p", Opt_sec_spkmp },
+ { "sys", Opt_sec_sys },
+ {}
+};
+
+/*
+ * Sanity-check a server address supplied by the mount command.
+ *
+ * Returns non-zero when @addr is an AF_INET or AF_INET6 address other than
+ * that family's ANY address.  Returns 0 for the ANY address, and logs and
+ * returns 0 for any other address family.
+ */
+static int nfs_verify_server_address(struct sockaddr *addr)
+{
+	if (addr->sa_family == AF_INET) {
+		struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+
+		return sin->sin_addr.s_addr != htonl(INADDR_ANY);
+	}
+
+	if (addr->sa_family == AF_INET6) {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
+
+		return !ipv6_addr_any(&sin6->sin6_addr);
+	}
+
+	dfprintk(MOUNT, "NFS: Invalid IP address specified\n");
+	return 0;
+}
+
+/*
+ * Sanity check the NFS transport protocol: anything other than UDP, TCP
+ * or RDMA is replaced with TCP.
+ */
+static void nfs_validate_transport_protocol(struct nfs_fs_context *ctx)
+{
+	if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP ||
+	    ctx->nfs_server.protocol == XPRT_TRANSPORT_TCP ||
+	    ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA)
+		return;
+
+	ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+}
+
+/*
+ * For text based NFSv2/v3 mounts, the mount protocol transport default
+ * depends on the NFS transport: an explicit UDP or TCP mount transport is
+ * kept; otherwise UDP follows a UDP NFS transport, and TCP follows a TCP or
+ * RDMA NFS transport.
+ */
+static void nfs_set_mount_transport_protocol(struct nfs_fs_context *ctx)
+{
+	nfs_validate_transport_protocol(ctx);
+
+	/* An explicitly chosen mount transport wins */
+	if (ctx->mount_server.protocol == XPRT_TRANSPORT_UDP)
+		return;
+	if (ctx->mount_server.protocol == XPRT_TRANSPORT_TCP)
+		return;
+
+	if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
+		ctx->mount_server.protocol = XPRT_TRANSPORT_UDP;
+	else if (ctx->nfs_server.protocol == XPRT_TRANSPORT_TCP ||
+		 ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA)
+		ctx->mount_server.protocol = XPRT_TRANSPORT_TCP;
+}
+
+/*
+ * Add 'flavor' to 'auth_info' if not already present.
+ *
+ * Returns 0 if 'flavor' was appended or was already in the list.  If the
+ * list is considered full, "NFS: too many sec= flavors" is reported through
+ * nfs_invalf() and its result is returned (the caller in this file treats a
+ * negative return as failure).
+ *
+ * NOTE(review): the capacity test is
+ * "flavor_len + 1 >= ARRAY_SIZE(auth_info->flavors)", so an append is
+ * refused while one array slot is still unused.  Confirm whether keeping
+ * that slot free is intentional.
+ */
+static int nfs_auth_info_add(struct fs_context *fc,
+ struct nfs_auth_info *auth_info,
+ rpc_authflavor_t flavor)
+{
+ unsigned int i;
+ unsigned int max_flavor_len = ARRAY_SIZE(auth_info->flavors);
+
+ /* make sure this flavor isn't already in the list */
+ for (i = 0; i < auth_info->flavor_len; i++) {
+ if (flavor == auth_info->flavors[i])
+ return 0;
+ }
+
+ if (auth_info->flavor_len + 1 >= max_flavor_len)
+ return nfs_invalf(fc, "NFS: too many sec= flavors");
+
+ auth_info->flavors[auth_info->flavor_len++] = flavor;
+ return 0;
+}
+
+/*
+ * Parse the value of the 'sec=' option.
+ *
+ * The value is a ':'-separated list of flavor names; empty elements are
+ * skipped.  Each name is translated to its RPC pseudoflavor and added to
+ * ctx->auth_info.  The string in param->string is consumed by strsep().
+ * Returns 0 on success, the nfs_invalf() result for an unknown name, or
+ * the negative result of nfs_auth_info_add().
+ */
+static int nfs_parse_security_flavors(struct fs_context *fc,
+				      struct fs_parameter *param)
+{
+	/* Pseudoflavor for each token value in nfs_secflavor_tokens[] */
+	static const rpc_authflavor_t flavor_for_token[nr__Opt_sec] = {
+		[Opt_sec_none]	= RPC_AUTH_NULL,
+		[Opt_sec_sys]	= RPC_AUTH_UNIX,
+		[Opt_sec_krb5]	= RPC_AUTH_GSS_KRB5,
+		[Opt_sec_krb5i]	= RPC_AUTH_GSS_KRB5I,
+		[Opt_sec_krb5p]	= RPC_AUTH_GSS_KRB5P,
+		[Opt_sec_lkey]	= RPC_AUTH_GSS_LKEY,
+		[Opt_sec_lkeyi]	= RPC_AUTH_GSS_LKEYI,
+		[Opt_sec_lkeyp]	= RPC_AUTH_GSS_LKEYP,
+		[Opt_sec_spkm]	= RPC_AUTH_GSS_SPKM,
+		[Opt_sec_spkmi]	= RPC_AUTH_GSS_SPKMI,
+		[Opt_sec_spkmp]	= RPC_AUTH_GSS_SPKMP,
+	};
+	struct nfs_fs_context *ctx = nfs_fc2context(fc);
+	char *rest = param->string;
+	char *name;
+	int token;
+	int ret;
+
+	dfprintk(MOUNT, "NFS: parsing %s=%s option\n", param->key, param->string);
+
+	while ((name = strsep(&rest, ":")) != NULL) {
+		if (*name == '\0')
+			continue;
+
+		token = lookup_constant(nfs_secflavor_tokens, name, -1);
+		if (token < 0 || token >= nr__Opt_sec)
+			return nfs_invalf(fc, "NFS: sec=%s option not recognized", name);
+
+		ret = nfs_auth_info_add(fc, &ctx->auth_info,
+					flavor_for_token[token]);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Translate a version string ("2", "3", "4", "4.0", "4.1", "4.2") into
+ * ctx->version / ctx->minorversion and the NFS_MOUNT_VER3 flag.
+ *
+ * NFS_MOUNT_VER3 is cleared up front and set again only for "3".  A bare
+ * "4" leaves ctx->minorversion untouched.  Returns 0 on success or the
+ * nfs_invalf() result for an unrecognised string.
+ */
+static int nfs_parse_version_string(struct fs_context *fc,
+				    const char *string)
+{
+	struct nfs_fs_context *ctx = nfs_fc2context(fc);
+	int token;
+
+	ctx->flags &= ~NFS_MOUNT_VER3;
+
+	token = lookup_constant(nfs_vers_tokens, string, -1);
+	switch (token) {
+	case Opt_vers_2:
+		ctx->version = 2;
+		return 0;
+	case Opt_vers_3:
+		ctx->flags |= NFS_MOUNT_VER3;
+		ctx->version = 3;
+		return 0;
+	case Opt_vers_4:
+		/*
+		 * Backward compatibility option. In future, the mount
+		 * program should always supply a NFSv4 minor version number.
+		 */
+		ctx->version = 4;
+		return 0;
+	case Opt_vers_4_0:
+	case Opt_vers_4_1:
+	case Opt_vers_4_2:
+		/* The three 4.x tokens are consecutive enum values */
+		ctx->version = 4;
+		ctx->minorversion = token - Opt_vers_4_0;
+		return 0;
+	default:
+		return nfs_invalf(fc, "NFS: Unsupported NFS version");
+	}
+}
+
+/*
+ * Parse a single mount parameter.
+ *
+ * The parameter is matched against nfs_fs_parameters[] with fs_parse() and
+ * the resulting token selects which field(s) of the nfs_fs_context to
+ * update.  String-valued options that are kept (source, fsc=, clientaddr,
+ * mounthost) take ownership of param->string and clear it in @param.
+ *
+ * Returns 0 when the parameter was handled (tokens without a case below are
+ * accepted and ignored).  If fs_parse() fails, its negative result is
+ * returned, or 1 when the "sloppy" option has already been seen.  Bad
+ * values are reported through nfs_invalf() and its result is returned.
+ */
+static int nfs_fs_context_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct fs_parse_result result;
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ unsigned short protofamily, mountfamily;
+ unsigned int len;
+ int ret, opt;
+
+ dfprintk(MOUNT, "NFS: parsing nfs mount option '%s'\n", param->key);
+
+ opt = fs_parse(fc, nfs_fs_parameters, param, &result);
+ if (opt < 0)
+ return ctx->sloppy ? 1 : opt;
+
+ switch (opt) {
+ case Opt_source:
+ if (fc->source)
+ return nfs_invalf(fc, "NFS: Multiple sources not supported");
+ fc->source = param->string;
+ param->string = NULL;
+ break;
+
+ /*
+ * boolean options: foo/nofoo
+ */
+ case Opt_soft:
+ ctx->flags |= NFS_MOUNT_SOFT;
+ ctx->flags &= ~NFS_MOUNT_SOFTERR;
+ break;
+ case Opt_softerr:
+ ctx->flags |= NFS_MOUNT_SOFTERR | NFS_MOUNT_SOFTREVAL;
+ ctx->flags &= ~NFS_MOUNT_SOFT;
+ break;
+ case Opt_hard:
+ ctx->flags &= ~(NFS_MOUNT_SOFT |
+ NFS_MOUNT_SOFTERR |
+ NFS_MOUNT_SOFTREVAL);
+ break;
+ case Opt_softreval:
+ if (result.negated)
+ ctx->flags &= ~NFS_MOUNT_SOFTREVAL;
+ else
+ ctx->flags |= NFS_MOUNT_SOFTREVAL;
+ break;
+ case Opt_posix:
+ if (result.negated)
+ ctx->flags &= ~NFS_MOUNT_POSIX;
+ else
+ ctx->flags |= NFS_MOUNT_POSIX;
+ break;
+ case Opt_cto: /* flag is stored inverted: "nocto" sets NOCTO */
+ if (result.negated)
+ ctx->flags |= NFS_MOUNT_NOCTO;
+ else
+ ctx->flags &= ~NFS_MOUNT_NOCTO;
+ break;
+ case Opt_ac:
+ if (result.negated)
+ ctx->flags |= NFS_MOUNT_NOAC;
+ else
+ ctx->flags &= ~NFS_MOUNT_NOAC;
+ break;
+ case Opt_lock: /* "nolock" also turns on both local-lock flags */
+ if (result.negated) {
+ ctx->flags |= NFS_MOUNT_NONLM;
+ ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL);
+ } else {
+ ctx->flags &= ~NFS_MOUNT_NONLM;
+ ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL);
+ }
+ break;
+ case Opt_udp:
+ ctx->flags &= ~NFS_MOUNT_TCP;
+ ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP;
+ break;
+ case Opt_tcp:
+ ctx->flags |= NFS_MOUNT_TCP;
+ ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+ break;
+ case Opt_rdma:
+ ctx->flags |= NFS_MOUNT_TCP; /* for side protocols */
+ ctx->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
+ xprt_load_transport(param->key);
+ break;
+ case Opt_acl:
+ if (result.negated)
+ ctx->flags |= NFS_MOUNT_NOACL;
+ else
+ ctx->flags &= ~NFS_MOUNT_NOACL;
+ break;
+ case Opt_rdirplus:
+ if (result.negated)
+ ctx->flags |= NFS_MOUNT_NORDIRPLUS;
+ else
+ ctx->flags &= ~NFS_MOUNT_NORDIRPLUS;
+ break;
+ case Opt_sharecache:
+ if (result.negated)
+ ctx->flags |= NFS_MOUNT_UNSHARED;
+ else
+ ctx->flags &= ~NFS_MOUNT_UNSHARED;
+ break;
+ case Opt_resvport:
+ if (result.negated)
+ ctx->flags |= NFS_MOUNT_NORESVPORT;
+ else
+ ctx->flags &= ~NFS_MOUNT_NORESVPORT;
+ break;
+ case Opt_fscache_flag: /* "fsc"/"nofsc": drops any earlier fsc=<id> string */
+ if (result.negated)
+ ctx->options &= ~NFS_OPTION_FSCACHE;
+ else
+ ctx->options |= NFS_OPTION_FSCACHE;
+ kfree(ctx->fscache_uniq);
+ ctx->fscache_uniq = NULL;
+ break;
+ case Opt_fscache: /* "fsc=<id>": keep the string, replacing any earlier one */
+ ctx->options |= NFS_OPTION_FSCACHE;
+ kfree(ctx->fscache_uniq);
+ ctx->fscache_uniq = param->string;
+ param->string = NULL;
+ break;
+ case Opt_migration:
+ if (result.negated)
+ ctx->options &= ~NFS_OPTION_MIGRATION;
+ else
+ ctx->options |= NFS_OPTION_MIGRATION;
+ break;
+
+ /*
+ * options that take numeric values
+ */
+ case Opt_port:
+ if (result.uint_32 > USHRT_MAX)
+ goto out_of_bounds;
+ ctx->nfs_server.port = result.uint_32;
+ break;
+ case Opt_rsize:
+ ctx->rsize = result.uint_32;
+ break;
+ case Opt_wsize:
+ ctx->wsize = result.uint_32;
+ break;
+ case Opt_bsize:
+ ctx->bsize = result.uint_32;
+ break;
+ case Opt_timeo:
+ if (result.uint_32 < 1 || result.uint_32 > INT_MAX)
+ goto out_of_bounds;
+ ctx->timeo = result.uint_32;
+ break;
+ case Opt_retrans:
+ if (result.uint_32 > INT_MAX)
+ goto out_of_bounds;
+ ctx->retrans = result.uint_32;
+ break;
+ case Opt_acregmin:
+ ctx->acregmin = result.uint_32;
+ break;
+ case Opt_acregmax:
+ ctx->acregmax = result.uint_32;
+ break;
+ case Opt_acdirmin:
+ ctx->acdirmin = result.uint_32;
+ break;
+ case Opt_acdirmax:
+ ctx->acdirmax = result.uint_32;
+ break;
+ case Opt_actimeo: /* one value for all four attribute-cache timeouts */
+ ctx->acregmin = result.uint_32;
+ ctx->acregmax = result.uint_32;
+ ctx->acdirmin = result.uint_32;
+ ctx->acdirmax = result.uint_32;
+ break;
+ case Opt_namelen:
+ ctx->namlen = result.uint_32;
+ break;
+ case Opt_mountport:
+ if (result.uint_32 > USHRT_MAX)
+ goto out_of_bounds;
+ ctx->mount_server.port = result.uint_32;
+ break;
+ case Opt_mountvers:
+ if (result.uint_32 < NFS_MNT_VERSION ||
+ result.uint_32 > NFS_MNT3_VERSION)
+ goto out_of_bounds;
+ ctx->mount_server.version = result.uint_32;
+ break;
+ case Opt_minorversion:
+ if (result.uint_32 > NFS4_MAX_MINOR_VERSION)
+ goto out_of_bounds;
+ ctx->minorversion = result.uint_32;
+ break;
+
+ /*
+ * options that take text values
+ */
+ case Opt_v: /* "v3", "v4.1", ...: the version is the key minus its leading 'v' */
+ ret = nfs_parse_version_string(fc, param->key + 1);
+ if (ret < 0)
+ return ret;
+ break;
+ case Opt_vers:
+ ret = nfs_parse_version_string(fc, param->string);
+ if (ret < 0)
+ return ret;
+ break;
+ case Opt_sec:
+ ret = nfs_parse_security_flavors(fc, param);
+ if (ret < 0)
+ return ret;
+ break;
+
+ case Opt_proto:
+ protofamily = AF_INET;
+ switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) {
+ case Opt_xprt_udp6:
+ protofamily = AF_INET6;
+ fallthrough;
+ case Opt_xprt_udp:
+ ctx->flags &= ~NFS_MOUNT_TCP;
+ ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP;
+ break;
+ case Opt_xprt_tcp6:
+ protofamily = AF_INET6;
+ fallthrough;
+ case Opt_xprt_tcp:
+ ctx->flags |= NFS_MOUNT_TCP;
+ ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+ break;
+ case Opt_xprt_rdma6:
+ protofamily = AF_INET6;
+ fallthrough;
+ case Opt_xprt_rdma:
+ /* vector side protocols to TCP */
+ ctx->flags |= NFS_MOUNT_TCP;
+ ctx->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
+ xprt_load_transport(param->string);
+ break;
+ default:
+ return nfs_invalf(fc, "NFS: Unrecognized transport protocol");
+ }
+
+ ctx->protofamily = protofamily;
+ break;
+
+ case Opt_mountproto:
+ mountfamily = AF_INET;
+ switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) {
+ case Opt_xprt_udp6:
+ mountfamily = AF_INET6;
+ fallthrough;
+ case Opt_xprt_udp:
+ ctx->mount_server.protocol = XPRT_TRANSPORT_UDP;
+ break;
+ case Opt_xprt_tcp6:
+ mountfamily = AF_INET6;
+ fallthrough;
+ case Opt_xprt_tcp:
+ ctx->mount_server.protocol = XPRT_TRANSPORT_TCP;
+ break;
+ case Opt_xprt_rdma: /* not used for side protocols */
+ default:
+ return nfs_invalf(fc, "NFS: Unrecognized transport protocol");
+ }
+ ctx->mountfamily = mountfamily;
+ break;
+
+ case Opt_addr:
+ len = rpc_pton(fc->net_ns, param->string, param->size,
+ &ctx->nfs_server.address,
+ sizeof(ctx->nfs_server._address));
+ if (len == 0)
+ goto out_invalid_address;
+ ctx->nfs_server.addrlen = len;
+ break;
+ case Opt_clientaddr:
+ kfree(ctx->client_address);
+ ctx->client_address = param->string;
+ param->string = NULL;
+ break;
+ case Opt_mounthost:
+ kfree(ctx->mount_server.hostname);
+ ctx->mount_server.hostname = param->string;
+ param->string = NULL;
+ break;
+ case Opt_mountaddr:
+ len = rpc_pton(fc->net_ns, param->string, param->size,
+ &ctx->mount_server.address,
+ sizeof(ctx->mount_server._address));
+ if (len == 0)
+ goto out_invalid_address;
+ ctx->mount_server.addrlen = len;
+ break;
+ case Opt_nconnect:
+ if (result.uint_32 < 1 || result.uint_32 > NFS_MAX_CONNECTIONS)
+ goto out_of_bounds;
+ ctx->nfs_server.nconnect = result.uint_32;
+ break;
+ case Opt_lookupcache:
+ switch (result.uint_32) {
+ case Opt_lookupcache_all:
+ ctx->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE);
+ break;
+ case Opt_lookupcache_positive:
+ ctx->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE;
+ ctx->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG;
+ break;
+ case Opt_lookupcache_none:
+ ctx->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE;
+ break;
+ default:
+ goto out_invalid_value;
+ }
+ break;
+ case Opt_local_lock:
+ switch (result.uint_32) {
+ case Opt_local_lock_all:
+ ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK |
+ NFS_MOUNT_LOCAL_FCNTL);
+ break;
+ case Opt_local_lock_flock:
+ ctx->flags |= NFS_MOUNT_LOCAL_FLOCK;
+ break;
+ case Opt_local_lock_posix:
+ ctx->flags |= NFS_MOUNT_LOCAL_FCNTL;
+ break;
+ case Opt_local_lock_none:
+ ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK |
+ NFS_MOUNT_LOCAL_FCNTL);
+ break;
+ default:
+ goto out_invalid_value;
+ }
+ break;
+
+ /*
+ * Special options
+ */
+ case Opt_sloppy: /* only affects parameters parsed after this one */
+ ctx->sloppy = true;
+ dfprintk(MOUNT, "NFS: relaxing parsing rules\n");
+ break;
+ }
+
+ return 0;
+
+out_invalid_value:
+ return nfs_invalf(fc, "NFS: Bad mount option value specified");
+out_invalid_address:
+ return nfs_invalf(fc, "NFS: Bad IP address specified");
+out_of_bounds:
+ return nfs_invalf(fc, "NFS: Value for '%s' out of range", param->key);
+}
+
+/*
+ * Split fc->source into "hostname:export_path".
+ *
+ * The leftmost colon marks the split between the server's hostname and
+ * the export path.  If the hostname starts with a left square bracket, it
+ * runs up to the matching "]:" and may itself contain colons.  For an
+ * unbracketed name, anything from the first comma on is dropped (hostname
+ * lists are not supported).
+ *
+ * Returns 0 on success, -EINVAL for a missing or malformed source,
+ * -ENAMETOOLONG when the hostname exceeds @maxnamlen or the path exceeds
+ * @maxpathlen, and -ENOMEM on allocation failure.
+ *
+ * Note: caller frees hostname and export path, even on error.
+ */
+static int nfs_parse_source(struct fs_context *fc,
+			    size_t maxnamlen, size_t maxpathlen)
+{
+	struct nfs_fs_context *ctx = nfs_fc2context(fc);
+	const char *host = fc->source;
+	const char *sep;	/* the ':' between hostname and path */
+	const char *path;
+	size_t hostlen, pathlen;
+
+	if (unlikely(!host || !*host)) {
+		dfprintk(MOUNT, "NFS: device name not specified\n");
+		return -EINVAL;
+	}
+
+	if (*host == '[') {
+		/* Hostname protected with square brackets */
+		const char *close;
+
+		host++;
+		close = strchr(host, ']');
+		if (close == NULL || close[1] != ':')
+			return nfs_invalf(fc, "NFS: device name not in host:path format");
+
+		hostlen = close - host;
+		sep = close + 1;
+	} else {
+		const char *comma;
+
+		sep = strchr(host, ':');
+		if (sep == NULL)
+			return nfs_invalf(fc, "NFS: device name not in host:path format");
+		hostlen = sep - host;
+
+		/* kill possible hostname list: not supported */
+		comma = memchr(host, ',', hostlen);
+		if (comma)
+			hostlen = comma - host;
+	}
+
+	if (hostlen > maxnamlen) {
+		nfs_errorf(fc, "NFS: server hostname too long");
+		return -ENAMETOOLONG;
+	}
+
+	kfree(ctx->nfs_server.hostname);
+
+	/* N.B. caller will free nfs_server.hostname in all cases */
+	ctx->nfs_server.hostname = kmemdup_nul(host, hostlen, GFP_KERNEL);
+	if (!ctx->nfs_server.hostname) {
+		nfs_errorf(fc, "NFS: not enough memory to parse device name");
+		return -ENOMEM;
+	}
+
+	path = sep + 1;
+	pathlen = strlen(path);
+	if (pathlen > maxpathlen) {
+		nfs_errorf(fc, "NFS: export pathname too long");
+		return -ENAMETOOLONG;
+	}
+
+	ctx->nfs_server.export_path = kmemdup_nul(path, pathlen, GFP_KERNEL);
+	if (!ctx->nfs_server.export_path) {
+		nfs_errorf(fc, "NFS: not enough memory to parse device name");
+		return -ENOMEM;
+	}
+
+	dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", ctx->nfs_server.export_path);
+	return 0;
+}
+
+/* A context that already has a root set (fc->root != NULL) is a remount. */
+static inline bool is_remount_fc(struct fs_context *fc)
+{
+	if (fc->root == NULL)
+		return false;
+
+	return true;
+}
+
+/*
+ * Parse monolithic NFS2/NFS3 mount data
+ * - fills in the mount root filehandle
+ *
+ * @fc:   filesystem context being configured
+ * @data: legacy binary struct nfs_mount_data from mount(2), or NULL
+ *
+ * Versions 1-6 of the binary structure are decoded here.  The switch falls
+ * through from the oldest version to the newest so that fields an older
+ * layout lacked are given defaults before the common version-6 code runs.
+ * Any other version value is handed to generic_parse_monolithic().
+ *
+ * A NULL @data is accepted only for a remount.
+ *
+ * Returns 0 on success, -ENOMEM if the hostname cannot be duplicated, the
+ * error from vfs_parse_fs_string() for a bad security context, and the
+ * result of nfs_invalf() for every other rejected input.
+ *
+ * For option strings, user space handles the following behaviors:
+ *
+ * + DNS: mapping server host name to IP address ("addr=" option)
+ *
+ * + failure mode: how to behave if a mount request can't be handled
+ *   immediately ("fg/bg" option)
+ *
+ * + retry: how often to retry a mount request ("retry=" option)
+ *
+ * + breaking back: trying proto=udp after proto=tcp, v2 after v3,
+ *   mountproto=tcp after mountproto=udp, and so on
+ */
+static int nfs23_parse_monolithic(struct fs_context *fc,
+				  struct nfs_mount_data *data)
+{
+	struct nfs_fs_context *ctx = nfs_fc2context(fc);
+	struct nfs_fh *mntfh = ctx->mntfh;
+	struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
+	int extra_flags = NFS_MOUNT_LEGACY_INTERFACE;
+
+	if (data == NULL)
+		goto out_no_data;
+
+	ctx->version = NFS_DEFAULT_VERSION;
+	switch (data->version) {
+	case 1:
+		data->namlen = 0;
+		fallthrough;
+	case 2:
+		data->bsize = 0;
+		fallthrough;
+	case 3:
+		if (data->flags & NFS_MOUNT_VER3)
+			goto out_no_v3;
+		/* these layouts only carry the NFSv2-sized old_root handle */
+		data->root.size = NFS2_FHSIZE;
+		memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
+		/* Turn off security negotiation */
+		extra_flags |= NFS_MOUNT_SECFLAVOUR;
+		fallthrough;
+	case 4:
+		if (data->flags & NFS_MOUNT_SECFLAVOUR)
+			goto out_no_sec;
+		fallthrough;
+	case 5:
+		memset(data->context, 0, sizeof(data->context));
+		fallthrough;
+	case 6:
+		if (data->flags & NFS_MOUNT_VER3) {
+			if (data->root.size > NFS3_FHSIZE || data->root.size == 0)
+				goto out_invalid_fh;
+			mntfh->size = data->root.size;
+			ctx->version = 3;
+		} else {
+			mntfh->size = NFS2_FHSIZE;
+			ctx->version = 2;
+		}
+
+		/* copy the handle and zero whatever is left of the buffer */
+		memcpy(mntfh->data, data->root.data, mntfh->size);
+		if (mntfh->size < sizeof(mntfh->data))
+			memset(mntfh->data + mntfh->size, 0,
+			       sizeof(mntfh->data) - mntfh->size);
+
+		/*
+		 * for proto == XPRT_TRANSPORT_UDP, which is what uses
+		 * to_exponential, implying shift: limit the shift value
+		 * to BITS_PER_LONG (majortimeo is unsigned long).  A literal
+		 * 64 would still let 32..63 through on 32-bit builds.
+		 */
+		if (!(data->flags & NFS_MOUNT_TCP)) /* this will be UDP */
+			if (data->retrans >= BITS_PER_LONG) /* shift value is too large */
+				goto out_invalid_data;
+
+		/*
+		 * Translate to nfs_fs_context, which nfs_fill_super
+		 * can deal with.
+		 */
+		ctx->flags	= data->flags & NFS_MOUNT_FLAGMASK;
+		ctx->flags	|= extra_flags;
+		ctx->rsize	= data->rsize;
+		ctx->wsize	= data->wsize;
+		ctx->timeo	= data->timeo;
+		ctx->retrans	= data->retrans;
+		ctx->acregmin	= data->acregmin;
+		ctx->acregmax	= data->acregmax;
+		ctx->acdirmin	= data->acdirmin;
+		ctx->acdirmax	= data->acdirmax;
+		ctx->need_mount	= false;
+
+		/* the binary interface only carries an AF_INET address */
+		memcpy(sap, &data->addr, sizeof(data->addr));
+		ctx->nfs_server.addrlen = sizeof(data->addr);
+		ctx->nfs_server.port = ntohs(data->addr.sin_port);
+		if (sap->sa_family != AF_INET ||
+		    !nfs_verify_server_address(sap))
+			goto out_no_address;
+
+		if (!(data->flags & NFS_MOUNT_TCP))
+			ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP;
+		/* N.B. caller will free nfs_server.hostname in all cases */
+		ctx->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
+		if (!ctx->nfs_server.hostname)
+			goto out_nomem;
+
+		ctx->namlen		= data->namlen;
+		ctx->bsize		= data->bsize;
+
+		if (data->flags & NFS_MOUNT_SECFLAVOUR)
+			ctx->selected_flavor = data->pseudoflavor;
+		else
+			ctx->selected_flavor = RPC_AUTH_UNIX;
+
+		if (!(data->flags & NFS_MOUNT_NONLM))
+			ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK|
+					NFS_MOUNT_LOCAL_FCNTL);
+		else
+			ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK|
+				       NFS_MOUNT_LOCAL_FCNTL);
+
+		/*
+		 * The legacy version 6 binary mount data from userspace has a
+		 * field used only to transport selinux information into the
+		 * kernel.  To continue to support that functionality we
+		 * have a touch of selinux knowledge here in the NFS code. The
+		 * userspace code converted context=blah to just blah so we are
+		 * converting back to the full string selinux understands.
+		 */
+		if (data->context[0]) {
+#ifdef CONFIG_SECURITY_SELINUX
+			int ret;
+
+			/* force termination before taking strlen() */
+			data->context[NFS_MAX_CONTEXT_LEN] = '\0';
+			ret = vfs_parse_fs_string(fc, "context",
+						  data->context, strlen(data->context));
+			if (ret < 0)
+				return ret;
+#else
+			return -EINVAL;
+#endif
+		}
+
+		break;
+	default:
+		goto generic;
+	}
+
+	ctx->skip_reconfig_option_check = true;
+	return 0;
+
+generic:
+	return generic_parse_monolithic(fc, data);
+
+out_no_data:
+	if (is_remount_fc(fc)) {
+		ctx->skip_reconfig_option_check = true;
+		return 0;
+	}
+	return nfs_invalf(fc, "NFS: mount program didn't pass any mount data");
+
+out_no_v3:
+	return nfs_invalf(fc, "NFS: nfs_mount_data version does not support v3");
+
+out_no_sec:
+	return nfs_invalf(fc, "NFS: nfs_mount_data version supports only AUTH_SYS");
+
+out_nomem:
+	dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
+	return -ENOMEM;
+
+out_no_address:
+	return nfs_invalf(fc, "NFS: mount program didn't pass remote address");
+
+out_invalid_fh:
+	return nfs_invalf(fc, "NFS: invalid root filehandle");
+
+out_invalid_data:
+	return nfs_invalf(fc, "NFS: invalid binary mount data");
+}
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+struct compat_nfs_string { /* compat-syscall image of struct nfs_string */
+ compat_uint_t len; /* copied to nfs_string.len by compat_nfs_string() */
+ compat_uptr_t data; /* compat user pointer, widened with compat_ptr() */
+};
+
+/*
+ * Expand a compat-layout string descriptor into a native nfs_string.
+ * Both source fields are fetched before anything is stored.
+ */
+static inline void compat_nfs_string(struct nfs_string *dst,
+				     struct compat_nfs_string *src)
+{
+	compat_uptr_t uptr = src->data;
+	compat_uint_t nbytes = src->len;
+
+	dst->data = compat_ptr(uptr);
+	dst->len = nbytes;
+}
+
+struct compat_nfs4_mount_data_v1 { /* compat-syscall layout of version 1 struct nfs4_mount_data */
+ compat_int_t version;
+ compat_int_t flags;
+ compat_int_t rsize;
+ compat_int_t wsize;
+ compat_int_t timeo;
+ compat_int_t retrans;
+ compat_int_t acregmin;
+ compat_int_t acregmax;
+ compat_int_t acdirmin;
+ compat_int_t acdirmax;
+ struct compat_nfs_string client_addr; /* strings are widened by compat_nfs_string() */
+ struct compat_nfs_string mnt_path;
+ struct compat_nfs_string hostname;
+ compat_uint_t host_addrlen;
+ compat_uptr_t host_addr; /* compat user pointer, widened with compat_ptr() */
+ compat_int_t proto;
+ compat_int_t auth_flavourlen;
+ compat_uptr_t auth_flavours; /* compat user pointer, widened with compat_ptr() */
+};
+
+static void nfs4_compat_mount_data_conv(struct nfs4_mount_data *data)
+{
+ struct compat_nfs4_mount_data_v1 *compat =
+ (struct compat_nfs4_mount_data_v1 *)data;
+
+ /*
+ * Rewrite compat-layout mount data into the native layout in place:
+ * 'compat' points at the very same buffer as 'data'.
+ *
+ * copy the fields backwards
+ *
+ * NOTE(review): presumably every native field sits at the same or a
+ * higher offset than its compat counterpart, so walking from the last
+ * field to the first never overwrites a compat field that has not been
+ * read yet - confirm against the struct nfs4_mount_data definition
+ * before reordering anything here.
+ */
+ data->auth_flavours = compat_ptr(compat->auth_flavours);
+ data->auth_flavourlen = compat->auth_flavourlen;
+ data->proto = compat->proto;
+ data->host_addr = compat_ptr(compat->host_addr);
+ data->host_addrlen = compat->host_addrlen;
+ compat_nfs_string(&data->hostname, &compat->hostname);
+ compat_nfs_string(&data->mnt_path, &compat->mnt_path);
+ compat_nfs_string(&data->client_addr, &compat->client_addr);
+ data->acdirmax = compat->acdirmax;
+ data->acdirmin = compat->acdirmin;
+ data->acregmax = compat->acregmax;
+ data->acregmin = compat->acregmin;
+ data->retrans = compat->retrans;
+ data->timeo = compat->timeo;
+ data->wsize = compat->wsize;
+ data->rsize = compat->rsize;
+ data->flags = compat->flags;
+ data->version = compat->version;
+}
+
+/*
+ * Validate NFSv4 mount options
+ *
+ * Decodes a version 1 binary struct nfs4_mount_data into the
+ * nfs_fs_context. Any other version value is handed to
+ * generic_parse_monolithic(). A NULL @data is accepted only for a
+ * remount.
+ *
+ * The server address, the auth flavour and the three strings are pulled
+ * from user space with copy_from_user()/strndup_user(). Strings that were
+ * already duplicated when a later step fails stay attached to the context.
+ *
+ * Returns 0 on success, -EFAULT when a user copy fails, the error from
+ * strndup_user(), or the result of nfs_invalf() for rejected values.
+ */
+static int nfs4_parse_monolithic(struct fs_context *fc,
+ struct nfs4_mount_data *data)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
+ char *c;
+
+ if (!data) {
+ if (is_remount_fc(fc))
+ goto done;
+ return nfs_invalf(fc,
+ "NFS4: mount program didn't pass any mount data");
+ }
+
+ ctx->version = 4;
+
+ if (data->version != 1)
+ return generic_parse_monolithic(fc, data);
+
+ if (in_compat_syscall())
+ nfs4_compat_mount_data_conv(data); /* widen the compat layout in place */
+
+ if (data->host_addrlen > sizeof(ctx->nfs_server.address)) /* must fit the context's address storage */
+ goto out_no_address;
+ if (data->host_addrlen == 0)
+ goto out_no_address;
+ ctx->nfs_server.addrlen = data->host_addrlen;
+ if (copy_from_user(sap, data->host_addr, data->host_addrlen))
+ return -EFAULT;
+ if (!nfs_verify_server_address(sap))
+ goto out_no_address;
+ ctx->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);
+
+ if (data->auth_flavourlen) {
+ rpc_authflavor_t pseudoflavor;
+
+ if (data->auth_flavourlen > 1) /* only a single flavour is accepted */
+ goto out_inval_auth;
+ if (copy_from_user(&pseudoflavor, data->auth_flavours,
+ sizeof(pseudoflavor)))
+ return -EFAULT;
+ ctx->selected_flavor = pseudoflavor;
+ } else {
+ ctx->selected_flavor = RPC_AUTH_UNIX;
+ }
+
+ c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+ ctx->nfs_server.hostname = c;
+
+ c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+ ctx->nfs_server.export_path = c;
+ dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c);
+
+ c = strndup_user(data->client_addr.data, 16); /* NOTE(review): origin of the 16-byte cap is not visible here */
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+ ctx->client_address = c;
+
+ /*
+ * Translate to nfs_fs_context, which nfs_fill_super
+ * can deal with.
+ */
+
+ ctx->flags = data->flags & NFS4_MOUNT_FLAGMASK;
+ ctx->rsize = data->rsize;
+ ctx->wsize = data->wsize;
+ ctx->timeo = data->timeo;
+ ctx->retrans = data->retrans;
+ ctx->acregmin = data->acregmin;
+ ctx->acregmax = data->acregmax;
+ ctx->acdirmin = data->acdirmin;
+ ctx->acdirmax = data->acdirmax;
+ ctx->nfs_server.protocol = data->proto;
+ nfs_validate_transport_protocol(ctx);
+ if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
+ goto out_invalid_transport_udp;
+done:
+ ctx->skip_reconfig_option_check = true;
+ return 0;
+
+out_inval_auth:
+ return nfs_invalf(fc, "NFS4: Invalid number of RPC auth flavours %d",
+ data->auth_flavourlen);
+
+out_no_address:
+ return nfs_invalf(fc, "NFS4: mount program didn't pass remote address");
+
+out_invalid_transport_udp:
+ return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
+}
+#endif
+
+/*
+ * Entry point for a binary mount-data blob from sys_mount(): pick the
+ * decoder that matches the filesystem type the context was opened with.
+ */
+static int nfs_fs_context_parse_monolithic(struct fs_context *fc,
+					   void *data)
+{
+	const struct file_system_type *fstype = fc->fs_type;
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+	if (fstype == &nfs4_fs_type)
+		return nfs4_parse_monolithic(fc, data);
+#endif
+	if (fstype != &nfs_fs_type)
+		return nfs_invalf(fc, "NFS: Unsupported monolithic data version");
+
+	return nfs23_parse_monolithic(fc, data);
+}
+
+/*
+ * Validate the preparsed information in the config.
+ *
+ * Checks the option combination, picks the default port and the
+ * hostname/path length limits for the selected NFS version, splits
+ * fc->source through nfs_parse_source(), and makes sure ctx->nfs_mod and
+ * fc->fs_type refer to the module for ctx->version.
+ *
+ * Returns 0 on success; -EPROTONOSUPPORT when version 4 is requested but
+ * CONFIG_NFS_V4 is off; the error from nfs_parse_source() or
+ * get_nfs_version(); otherwise the result of nfs_invalf().
+ */
+static int nfs_fs_context_validate(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct nfs_subversion *nfs_mod;
+ struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
+ int max_namelen = PAGE_SIZE; /* non-v4 limits; replaced below for v4 */
+ int max_pathlen = NFS_MAXPATHLEN;
+ int port = 0; /* default port handed to nfs_set_port() */
+ int ret;
+
+ if (!fc->source)
+ goto out_no_device_name;
+
+ /* Check for sanity first. */
+ if (ctx->minorversion && ctx->version != 4)
+ goto out_minorversion_mismatch;
+
+ if (ctx->options & NFS_OPTION_MIGRATION &&
+ (ctx->version != 4 || ctx->minorversion != 0))
+ goto out_migration_misuse;
+
+ /* Verify that any proto=/mountproto= options match the address
+ * families in the addr=/mountaddr= options.
+ */
+ if (ctx->protofamily != AF_UNSPEC &&
+ ctx->protofamily != ctx->nfs_server.address.sa_family)
+ goto out_proto_mismatch;
+
+ if (ctx->mountfamily != AF_UNSPEC) {
+ if (ctx->mount_server.addrlen) {
+ if (ctx->mountfamily != ctx->mount_server.address.sa_family)
+ goto out_mountproto_mismatch;
+ } else { /* no mount server address given: compare with the NFS server's */
+ if (ctx->mountfamily != ctx->nfs_server.address.sa_family)
+ goto out_mountproto_mismatch;
+ }
+ }
+
+ if (!nfs_verify_server_address(sap))
+ goto out_no_address;
+
+ if (ctx->version == 4) {
+ if (IS_ENABLED(CONFIG_NFS_V4)) {
+ if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA)
+ port = NFS_RDMA_PORT;
+ else
+ port = NFS_PORT;
+ max_namelen = NFS4_MAXNAMLEN;
+ max_pathlen = NFS4_MAXPATHLEN;
+ nfs_validate_transport_protocol(ctx);
+ if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
+ goto out_invalid_transport_udp;
+ ctx->flags &= ~(NFS_MOUNT_NONLM | NFS_MOUNT_NOACL |
+ NFS_MOUNT_VER3 | NFS_MOUNT_LOCAL_FLOCK |
+ NFS_MOUNT_LOCAL_FCNTL);
+ } else {
+ goto out_v4_not_compiled;
+ }
+ } else {
+ nfs_set_mount_transport_protocol(ctx);
+#ifdef CONFIG_NFS_DISABLE_UDP_SUPPORT
+ if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
+ goto out_invalid_transport_udp;
+#endif
+ if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA)
+ port = NFS_RDMA_PORT;
+ }
+
+ nfs_set_port(sap, &ctx->nfs_server.port, port);
+
+ ret = nfs_parse_source(fc, max_namelen, max_pathlen);
+ if (ret < 0)
+ return ret;
+
+ /* Load the NFS protocol module if we haven't done so yet */
+ if (!ctx->nfs_mod) {
+ nfs_mod = get_nfs_version(ctx->version);
+ if (IS_ERR(nfs_mod)) {
+ ret = PTR_ERR(nfs_mod);
+ goto out_version_unavailable;
+ }
+ ctx->nfs_mod = nfs_mod;
+ }
+
+ /* Ensure the filesystem context has the correct fs_type */
+ if (fc->fs_type != ctx->nfs_mod->nfs_fs) {
+ module_put(fc->fs_type->owner); /* the module reference moves along with fs_type */
+ __module_get(ctx->nfs_mod->nfs_fs->owner);
+ fc->fs_type = ctx->nfs_mod->nfs_fs;
+ }
+ return 0;
+
+out_no_device_name:
+ return nfs_invalf(fc, "NFS: Device name not specified");
+out_v4_not_compiled:
+ nfs_errorf(fc, "NFS: NFSv4 is not compiled into kernel");
+ return -EPROTONOSUPPORT;
+out_invalid_transport_udp:
+ return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
+out_no_address:
+ return nfs_invalf(fc, "NFS: mount program didn't pass remote address");
+out_mountproto_mismatch:
+ return nfs_invalf(fc, "NFS: Mount server address does not match mountproto= option");
+out_proto_mismatch:
+ return nfs_invalf(fc, "NFS: Server address does not match proto= option");
+out_minorversion_mismatch:
+ return nfs_invalf(fc, "NFS: Mount option vers=%u does not support minorversion=%u",
+ ctx->version, ctx->minorversion);
+out_migration_misuse:
+ return nfs_invalf(fc, "NFS: 'Migration' not supported for this NFS version");
+out_version_unavailable:
+ nfs_errorf(fc, "NFS: Version unavailable");
+ return ret;
+}
+
+/*
+ * Build the superblock for a validated context.  Internal contexts go
+ * straight to nfs_get_tree_common(); all others use the protocol
+ * version's try_get_tree() operation.
+ */
+static int nfs_get_tree(struct fs_context *fc)
+{
+	struct nfs_fs_context *ctx = nfs_fc2context(fc);
+	int ret;
+
+	ret = nfs_fs_context_validate(fc);
+	if (ret)
+		return ret;
+
+	if (ctx->internal)
+		return nfs_get_tree_common(fc);
+
+	return ctx->nfs_mod->rpc_ops->try_get_tree(fc);
+}
+
+/*
+ * Duplicate the NFS-private part of a filesystem context.
+ *
+ * The source context is copied wholesale, then every resource pointer in
+ * the copy is dealt with: the mount filehandle gets its own buffer with
+ * the same contents, an extra module reference is taken for nfs_mod, and
+ * the string and fattr pointers are cleared so the copy does not share
+ * them with the source.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int nfs_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
+{
+	struct nfs_fs_context *orig = nfs_fc2context(src_fc);
+	struct nfs_fs_context *copy;
+	struct nfs_fh *fh;
+
+	copy = kmemdup(orig, sizeof(*copy), GFP_KERNEL);
+	if (!copy)
+		return -ENOMEM;
+
+	fh = nfs_alloc_fhandle();
+	if (!fh) {
+		kfree(copy);
+		return -ENOMEM;
+	}
+	nfs_copy_fh(fh, orig->mntfh);
+	copy->mntfh = fh;
+
+	__module_get(copy->nfs_mod->owner);
+
+	/* these still belong to the source context */
+	copy->client_address = NULL;
+	copy->mount_server.hostname = NULL;
+	copy->nfs_server.export_path = NULL;
+	copy->nfs_server.hostname = NULL;
+	copy->fscache_uniq = NULL;
+	copy->clone_data.fattr = NULL;
+
+	fc->fs_private = copy;
+	return 0;
+}
+
+/*
+ * Tear down the NFS-private part of a filesystem context, releasing every
+ * resource it still owns.  A context without private data is left alone.
+ */
+static void nfs_fs_context_free(struct fs_context *fc)
+{
+	struct nfs_fs_context *ctx = nfs_fc2context(fc);
+
+	if (!ctx)
+		return;
+
+	if (ctx->server)
+		nfs_free_server(ctx->server);
+	if (ctx->nfs_mod)
+		put_nfs_version(ctx->nfs_mod);
+
+	kfree(ctx->client_address);
+	kfree(ctx->mount_server.hostname);
+	kfree(ctx->nfs_server.export_path);
+	kfree(ctx->nfs_server.hostname);
+	kfree(ctx->fscache_uniq);
+	nfs_free_fhandle(ctx->mntfh);
+	nfs_free_fattr(ctx->clone_data.fattr);
+	kfree(ctx);
+}
+
+static const struct fs_context_operations nfs_fs_context_ops = { /* installed on fc->ops by nfs_init_fs_context() */
+ .free = nfs_fs_context_free,
+ .dup = nfs_fs_context_dup,
+ .parse_param = nfs_fs_context_parse_param,
+ .parse_monolithic = nfs_fs_context_parse_monolithic, /* binary mount data; dispatches on fc->fs_type */
+ .get_tree = nfs_get_tree, /* validates the context, then creates the superblock */
+ .reconfigure = nfs_reconfigure,
+};
+
+/*
+ * Allocate and attach the NFS-private part of a filesystem context.
+ *
+ * A context that already has a root dentry is a reconfiguration and is
+ * seeded from the live nfs_server; otherwise the built-in defaults are
+ * used.  We use the namespaces attached to the context, which may be the
+ * current process's namespaces or a container's.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int nfs_init_fs_context(struct fs_context *fc)
+{
+	struct nfs_fs_context *ctx;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (unlikely(!ctx))
+		return -ENOMEM;
+
+	ctx->mntfh = nfs_alloc_fhandle();
+	if (unlikely(!ctx->mntfh)) {
+		kfree(ctx);
+		return -ENOMEM;
+	}
+
+	ctx->protofamily = AF_UNSPEC;
+	ctx->mountfamily = AF_UNSPEC;
+	ctx->mount_server.port = NFS_UNSPEC_PORT;
+
+	if (!fc->root) {
+		/* new mount: start from the defaults */
+		ctx->timeo = NFS_UNSPEC_TIMEO;
+		ctx->retrans = NFS_UNSPEC_RETRANS;
+		ctx->acregmin = NFS_DEF_ACREGMIN;
+		ctx->acregmax = NFS_DEF_ACREGMAX;
+		ctx->acdirmin = NFS_DEF_ACDIRMIN;
+		ctx->acdirmax = NFS_DEF_ACDIRMAX;
+		ctx->nfs_server.port = NFS_UNSPEC_PORT;
+		ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+		ctx->selected_flavor = RPC_AUTH_MAXFLAVOR;
+		ctx->minorversion = 0;
+		ctx->need_mount = true;
+	} else {
+		/* reconfigure: start from what the mounted server uses now */
+		struct nfs_server *nfss = fc->root->d_sb->s_fs_info;
+		struct nfs_client *clp = nfss->nfs_client;
+		struct net *net = clp->cl_net;
+
+		ctx->flags = nfss->flags;
+		ctx->rsize = nfss->rsize;
+		ctx->wsize = nfss->wsize;
+		ctx->retrans = nfss->client->cl_timeout->to_retries;
+		ctx->selected_flavor = nfss->client->cl_auth->au_flavor;
+		ctx->acregmin = nfss->acregmin / HZ;
+		ctx->acregmax = nfss->acregmax / HZ;
+		ctx->acdirmin = nfss->acdirmin / HZ;
+		ctx->acdirmax = nfss->acdirmax / HZ;
+		ctx->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
+		ctx->nfs_server.port = nfss->port;
+		ctx->nfs_server.addrlen = clp->cl_addrlen;
+		ctx->version = clp->rpc_ops->version;
+		ctx->minorversion = clp->cl_minorversion;
+
+		memcpy(&ctx->nfs_server.address, &clp->cl_addr,
+		       ctx->nfs_server.addrlen);
+
+		/* the context must use the client's network namespace */
+		if (fc->net_ns != net) {
+			put_net(fc->net_ns);
+			fc->net_ns = get_net(net);
+		}
+
+		ctx->nfs_mod = clp->cl_nfs_mod;
+		__module_get(ctx->nfs_mod->owner);
+	}
+
+	fc->fs_private = ctx;
+	fc->ops = &nfs_fs_context_ops;
+	return 0;
+}
+
+struct file_system_type nfs_fs_type = { /* "nfs" type; binary mount data for it goes to nfs23_parse_monolithic() */
+ .owner = THIS_MODULE,
+ .name = "nfs",
+ .init_fs_context = nfs_init_fs_context, /* allocates the nfs_fs_context and sets fc->ops */
+ .parameters = nfs_fs_parameters,
+ .kill_sb = nfs_kill_super,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
+};
+MODULE_ALIAS_FS("nfs");
+EXPORT_SYMBOL_GPL(nfs_fs_type);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+struct file_system_type nfs4_fs_type = { /* "nfs4" type; binary mount data for it goes to nfs4_parse_monolithic() */
+ .owner = THIS_MODULE,
+ .name = "nfs4",
+ .init_fs_context = nfs_init_fs_context, /* same context setup as the "nfs" type */
+ .parameters = nfs_fs_parameters,
+ .kill_sb = nfs_kill_super,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
+};
+MODULE_ALIAS_FS("nfs4");
+MODULE_ALIAS("nfs4");
+EXPORT_SYMBOL_GPL(nfs4_fs_type);
+#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
new file mode 100644
index 000000000..573b1da93
--- /dev/null
+++ b/fs/nfs/fscache-index.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* NFS FS-Cache index structure definition
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/in6.h>
+#include <linux/iversion.h>
+
+#include "internal.h"
+#include "fscache.h"
+
+#define NFSDBG_FACILITY NFSDBG_FSCACHE
+
+/*
+ * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks
+ * the cookie for the top-level index object for NFS into here. The top-level
+ * index can then have other cache objects inserted into it.
+ */
+struct fscache_netfs nfs_fscache_netfs = {
+ .name = "nfs",
+ .version = 0,
+};
+
+/*
+ * Register NFS for caching
+ * - hands nfs_fscache_netfs to fscache_register_netfs() and returns its
+ * result unchanged
+ */
+int nfs_fscache_register(void)
+{
+ return fscache_register_netfs(&nfs_fscache_netfs);
+}
+
+/*
+ * Unregister NFS for caching
+ * - hands nfs_fscache_netfs to fscache_unregister_netfs()
+ */
+void nfs_fscache_unregister(void)
+{
+ fscache_unregister_netfs(&nfs_fscache_netfs);
+}
+
+/*
+ * Define the server object for FS-Cache. This is used to describe a server
+ * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and
+ * server address parameters.
+ *
+ * Only the name and the index type are set; no callbacks are supplied.
+ */
+const struct fscache_cookie_def nfs_fscache_server_index_def = {
+ .name = "NFS.server",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+};
+
+/*
+ * Define the superblock object for FS-Cache. This is used to describe a
+ * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS
+ * parameters that might cause a separate superblock.
+ *
+ * Only the name and the index type are set; no callbacks are supplied.
+ */
+const struct fscache_cookie_def nfs_fscache_super_index_def = {
+ .name = "NFS.super",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+};
+
+/*
+ * Decide whether a cached object still matches the in-memory inode.
+ * - The auxiliary data stored with the cache object is compared against a
+ *   freshly built record of the inode's mtime, ctime and, for NFSv4, its
+ *   change attribute.
+ * - A length mismatch or any differing byte marks the object obsolete.
+ */
+static
+enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
+						  const void *data,
+						  uint16_t datalen,
+						  loff_t object_size)
+{
+	struct nfs_inode *nfsi = cookie_netfs_data;
+	struct inode *inode = &nfsi->vfs_inode;
+	struct nfs_fscache_inode_auxdata expect;
+
+	if (datalen != sizeof(expect))
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	/* zero first so that padding takes part in the comparison predictably */
+	memset(&expect, 0, sizeof(expect));
+	expect.mtime_sec = inode->i_mtime.tv_sec;
+	expect.mtime_nsec = inode->i_mtime.tv_nsec;
+	expect.ctime_sec = inode->i_ctime.tv_sec;
+	expect.ctime_nsec = inode->i_ctime.tv_nsec;
+
+	if (NFS_SERVER(inode)->nfs_client->rpc_ops->version == 4)
+		expect.change_attr = inode_peek_iversion_raw(inode);
+
+	return memcmp(data, &expect, datalen) ?
+		FSCACHE_CHECKAUX_OBSOLETE : FSCACHE_CHECKAUX_OKAY;
+}
+
+/*
+ * Get an extra reference on a read context.
+ * - This function can be absent if the completion function doesn't require a
+ * context.
+ * - The read context is passed back to NFS in the event that a data read on the
+ * cache fails with EIO - in which case the server must be contacted to
+ * retrieve the data, which requires the read context for security.
+ * - @context is handed straight to get_nfs_open_context();
+ * @cookie_netfs_data is not used.
+ */
+static void nfs_fh_get_context(void *cookie_netfs_data, void *context)
+{
+ get_nfs_open_context(context);
+}
+
+/*
+ * Drop the extra reference taken on a read context.
+ * - May be left out when the completion function needs no context.
+ * - A NULL context is ignored.
+ */
+static void nfs_fh_put_context(void *cookie_netfs_data, void *context)
+{
+	if (!context)
+		return;
+
+	put_nfs_open_context(context);
+}
+
+/*
+ * Define the inode object for FS-Cache. This is used to describe an inode
+ * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for
+ * an inode.
+ *
+ * Coherency is managed by comparing the copies of i_mtime, i_ctime and, for
+ * NFSv4, the change attribute held in the cache auxiliary data for the data
+ * storage object with those in the inode struct in memory. The check_aux
+ * callback receives the object size but does not examine it.
+ */
+const struct fscache_cookie_def nfs_fscache_inode_object_def = {
+ .name = "NFS.fh",
+ .type = FSCACHE_COOKIE_TYPE_DATAFILE,
+ .check_aux = nfs_fscache_inode_check_aux,
+ .get_context = nfs_fh_get_context,
+ .put_context = nfs_fh_put_context,
+};
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
new file mode 100644
index 000000000..a60df88ef
--- /dev/null
+++ b/fs/nfs/fscache.c
@@ -0,0 +1,516 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* NFS filesystem cache interface
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/in6.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/iversion.h>
+
+#include "internal.h"
+#include "iostat.h"
+#include "fscache.h"
+
+#define NFSDBG_FACILITY NFSDBG_FSCACHE
+
+static struct rb_root nfs_fscache_keys = RB_ROOT;
+static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
+
+/*
+ * Layout of the key for an NFS server cache object.
+ *
+ * The structure is packed and is zeroed before being filled in by
+ * nfs_fscache_get_client_cookie(), which passes only the header plus the
+ * address member that matches hdr.family to fscache.
+ */
+struct nfs_server_key {
+ struct {
+ uint16_t nfsversion; /* NFS protocol version */
+ uint32_t minorversion; /* NFSv4 minor version */
+ uint16_t family; /* address family */
+ __be16 port; /* IP port */
+ } hdr;
+ union {
+ struct in_addr ipv4_addr; /* IPv4 address */
+ struct in6_addr ipv6_addr; /* IPv6 address */
+ };
+} __packed;
+
+/*
+ * Acquire the per-client index cookie for an NFS client.
+ * - The cookie is keyed by protocol version, minor version, address
+ *   family, port and server address.
+ * - An index cookie is always requested for the client; filehandle
+ *   cookies are obtained per superblock, depending on the mount flags.
+ * - For an address family other than IPv4/IPv6 a warning is logged and
+ *   the client is left without a cookie.
+ */
+void nfs_fscache_get_client_cookie(struct nfs_client *clp)
+{
+	struct nfs_server_key key;
+	uint16_t keylen = sizeof(key.hdr);
+
+	memset(&key, 0, sizeof(key));
+	key.hdr.nfsversion = clp->rpc_ops->version;
+	key.hdr.minorversion = clp->cl_minorversion;
+	key.hdr.family = clp->cl_addr.ss_family;
+
+	if (clp->cl_addr.ss_family == AF_INET) {
+		const struct sockaddr_in *v4 =
+			(struct sockaddr_in *) &clp->cl_addr;
+
+		key.hdr.port = v4->sin_port;
+		key.ipv4_addr = v4->sin_addr;
+		keylen += sizeof(key.ipv4_addr);
+	} else if (clp->cl_addr.ss_family == AF_INET6) {
+		const struct sockaddr_in6 *v6 =
+			(struct sockaddr_in6 *) &clp->cl_addr;
+
+		key.hdr.port = v6->sin6_port;
+		key.ipv6_addr = v6->sin6_addr;
+		keylen += sizeof(key.ipv6_addr);
+	} else {
+		printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
+		       clp->cl_addr.ss_family);
+		clp->fscache = NULL;
+		return;
+	}
+
+	/* create a cache index for looking up filehandles */
+	clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
+					      &nfs_fscache_server_index_def,
+					      &key, keylen,
+					      NULL, 0,
+					      clp, 0, true);
+	dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
+		 clp, clp->fscache);
+}
+
+/*
+ * Give the per-client cookie back to fscache and forget it.
+ */
+void nfs_fscache_release_client_cookie(struct nfs_client *clp)
+{
+	struct fscache_cookie *cookie = clp->fscache;
+
+	dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n",
+		 clp, cookie);
+
+	fscache_relinquish_cookie(cookie, NULL, false);
+	clp->fscache = NULL;
+}
+
+/*
+ * Get the cache cookie for an NFS superblock. We have to handle
+ * uniquification here because the cache doesn't do it for us.
+ *
+ * The default uniquifier is just an empty string, but it may be overridden
+ * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent
+ * superblock across an automount point of some nature.
+ *
+ * @ulen bytes of @uniq are copied into the key; a NULL @uniq is replaced by
+ * an empty string of length 1. Keys are kept in the nfs_fscache_keys rb-tree
+ * under nfs_fscache_keys_lock, ordered by nfs_client pointer, then by the
+ * fixed key bytes, then by the uniquifier. If the allocation fails or an
+ * identical key is already in the tree, nfss->fscache and nfss->fscache_key
+ * are left NULL.
+ */
+void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int ulen)
+{
+ struct nfs_fscache_key *key, *xkey;
+ struct nfs_server *nfss = NFS_SB(sb);
+ struct rb_node **p, *parent;
+ int diff;
+
+ nfss->fscache_key = NULL;
+ nfss->fscache = NULL;
+ if (!uniq) {
+ uniq = "";
+ ulen = 1; /* the empty string's single NUL byte */
+ }
+
+ key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL); /* uniquifier bytes follow the fixed part */
+ if (!key)
+ return;
+
+ key->nfs_client = nfss->nfs_client;
+ key->key.super.s_flags = sb->s_flags & NFS_SB_MASK;
+ key->key.nfs_server.flags = nfss->flags;
+ key->key.nfs_server.rsize = nfss->rsize;
+ key->key.nfs_server.wsize = nfss->wsize;
+ key->key.nfs_server.acregmin = nfss->acregmin;
+ key->key.nfs_server.acregmax = nfss->acregmax;
+ key->key.nfs_server.acdirmin = nfss->acdirmin;
+ key->key.nfs_server.acdirmax = nfss->acdirmax;
+ key->key.nfs_server.fsid = nfss->fsid;
+ key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor;
+
+ key->key.uniq_len = ulen;
+ memcpy(key->key.uniquifier, uniq, ulen);
+
+ spin_lock(&nfs_fscache_keys_lock);
+ p = &nfs_fscache_keys.rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ xkey = rb_entry(parent, struct nfs_fscache_key, node);
+
+ if (key->nfs_client < xkey->nfs_client)
+ goto go_left;
+ if (key->nfs_client > xkey->nfs_client)
+ goto go_right;
+
+ diff = memcmp(&key->key, &xkey->key, sizeof(key->key)); /* fixed part, uniq_len included */
+ if (diff < 0)
+ goto go_left;
+ if (diff > 0)
+ goto go_right;
+
+ if (key->key.uniq_len == 0)
+ goto non_unique;
+ diff = memcmp(key->key.uniquifier,
+ xkey->key.uniquifier,
+ key->key.uniq_len);
+ if (diff < 0)
+ goto go_left;
+ if (diff > 0)
+ goto go_right;
+ goto non_unique; /* every component matched an existing key */
+
+ go_left:
+ p = &(*p)->rb_left;
+ continue;
+ go_right:
+ p = &(*p)->rb_right;
+ }
+
+ rb_link_node(&key->node, parent, p);
+ rb_insert_color(&key->node, &nfs_fscache_keys);
+ spin_unlock(&nfs_fscache_keys_lock);
+ nfss->fscache_key = key;
+
+ /* create a cache index for looking up filehandles */
+ nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
+ &nfs_fscache_super_index_def,
+ &key->key,
+ sizeof(key->key) + ulen,
+ NULL, 0,
+ nfss, 0, true);
+ dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
+ nfss, nfss->fscache);
+ return;
+
+non_unique:
+ spin_unlock(&nfs_fscache_keys_lock);
+ kfree(key);
+ nfss->fscache_key = NULL;
+ nfss->fscache = NULL;
+ printk(KERN_WARNING "NFS:"
+ " Cache request denied due to non-unique superblock keys\n");
+}
+
+/*
+ * Give the per-superblock cookie back to fscache and, if this superblock
+ * registered a uniquifying key, take it out of the key tree and free it.
+ */
+void nfs_fscache_release_super_cookie(struct super_block *sb)
+{
+	struct nfs_server *nfss = NFS_SB(sb);
+	struct nfs_fscache_key *key;
+
+	dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n",
+		 nfss, nfss->fscache);
+
+	fscache_relinquish_cookie(nfss->fscache, NULL, false);
+	nfss->fscache = NULL;
+
+	key = nfss->fscache_key;
+	if (!key)
+		return;
+
+	spin_lock(&nfs_fscache_keys_lock);
+	rb_erase(&key->node, &nfs_fscache_keys);
+	spin_unlock(&nfs_fscache_keys_lock);
+	kfree(key);
+	nfss->fscache_key = NULL;
+}
+
+/*
+ * Fill @auxdata from the inode's current mtime and ctime and, on an NFSv4
+ * mount, its change attribute.  The record is zeroed first.
+ */
+static void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata,
+				       struct nfs_inode *nfsi)
+{
+	struct inode *inode = &nfsi->vfs_inode;
+
+	memset(auxdata, 0, sizeof(*auxdata));
+
+	auxdata->mtime_sec = inode->i_mtime.tv_sec;
+	auxdata->mtime_nsec = inode->i_mtime.tv_nsec;
+	auxdata->ctime_sec = inode->i_ctime.tv_sec;
+	auxdata->ctime_nsec = inode->i_ctime.tv_nsec;
+
+	if (NFS_SERVER(inode)->nfs_client->rpc_ops->version == 4)
+		auxdata->change_attr = inode_peek_iversion_raw(inode);
+}
+
+/*
+ * Set up the per-inode cache cookie for an NFS inode.  Only regular files
+ * on a superblock that has a cookie get one; every other inode is left
+ * with a NULL cookie.
+ */
+void nfs_fscache_init_inode(struct inode *inode)
+{
+	struct nfs_server *nfss = NFS_SERVER(inode);
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_fscache_inode_auxdata auxdata;
+
+	nfsi->fscache = NULL;
+	if (!nfss->fscache || !S_ISREG(inode->i_mode))
+		return;
+
+	nfs_fscache_update_auxdata(&auxdata, nfsi);
+
+	/* keyed by filehandle; the cookie is not enabled at this point */
+	nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
+					       &nfs_fscache_inode_object_def,
+					       nfsi->fh.data, nfsi->fh.size,
+					       &auxdata, sizeof(auxdata),
+					       nfsi, nfsi->vfs_inode.i_size, false);
+}
+
+/*
+ * Hand the per-inode cookie back to fscache, passing along auxiliary data
+ * rebuilt from the inode's current state.
+ */
+void nfs_fscache_clear_inode(struct inode *inode)
+{
+	struct fscache_cookie *cookie = nfs_i_fscache(inode);
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_fscache_inode_auxdata auxdata;
+
+	dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie);
+
+	nfs_fscache_update_auxdata(&auxdata, nfsi);
+	fscache_relinquish_cookie(cookie, &auxdata, false);
+	nfsi->fscache = NULL;
+}
+
+/*
+ * Callback for fscache_enable_cookie(): caching may be switched on only
+ * while nobody has the inode open for writing.
+ */
+static bool nfs_fscache_can_enable(void *data)
+{
+	struct inode *inode = data;
+
+	if (inode_is_open_for_write(inode))
+		return false;
+	return true;
+}
+
+/*
+ * Switch caching on or off for a file that is being opened.
+ *
+ * The cookie is allocated when the inode is initialised but is not enabled
+ * then; enabling is put off until open time so that stat() and access()
+ * do not thrash the cache.
+ *
+ * For now, with NFS, only regular files that are not open for writing can
+ * use the cache:
+ *  - if the inode is currently open for writing, the cookie is disabled
+ *    and all of the inode's pages are uncached;
+ *  - otherwise the cookie is enabled, provided nfs_fscache_can_enable()
+ *    still agrees.
+ *
+ * The caller uses the file struct to pin i_writecount on the inode before
+ * calling us when a file is opened for writing, so we can rely on that.
+ *
+ * Several nfs_open() calls may run this in parallel.
+ */
+void nfs_fscache_open_file(struct inode *inode, struct file *filp)
+{
+	struct fscache_cookie *cookie = nfs_i_fscache(inode);
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_fscache_inode_auxdata auxdata;
+
+	if (!fscache_cookie_valid(cookie))
+		return;
+
+	nfs_fscache_update_auxdata(&auxdata, nfsi);
+
+	if (!inode_is_open_for_write(inode)) {
+		dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi);
+		fscache_enable_cookie(cookie, &auxdata, nfsi->vfs_inode.i_size,
+				      nfs_fscache_can_enable, inode);
+		if (fscache_cookie_enabled(cookie))
+			set_bit(NFS_INO_FSCACHE, &nfsi->flags);
+		return;
+	}
+
+	dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi);
+	clear_bit(NFS_INO_FSCACHE, &nfsi->flags);
+	fscache_disable_cookie(cookie, &auxdata, true);
+	fscache_uncache_all_inode_pages(cookie, inode);
+}
+EXPORT_SYMBOL_GPL(nfs_fscache_open_file);
+
+/*
+ * Drop the caching state attached to a page, unless the page is busy
+ * interacting with the cache.
+ * - Returns 1 if the page can be released, 0 if it is still busy.
+ * - A page that is not marked PageFsCache can always be released.
+ */
+int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+	struct fscache_cookie *cookie;
+
+	if (!PageFsCache(page))
+		return 1;
+
+	cookie = nfs_i_fscache(page->mapping->host);
+	BUG_ON(!cookie);
+	dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
+		 cookie, page, NFS_I(page->mapping->host));
+
+	if (!fscache_maybe_release_page(cookie, page, gfp))
+		return 0;
+
+	nfs_inc_fscache_stats(page->mapping->host,
+			      NFSIOS_FSCACHE_PAGES_UNCACHED);
+	return 1;
+}
+
+/*
+ * Release the caching state associated with a page if undergoing complete page
+ * invalidation.
+ * - Calls fscache_wait_on_page_write() for the page first, then uncaches the
+ *   page and bumps the NFSIOS_FSCACHE_PAGES_UNCACHED counter.
+ * - The inode must have a cookie and the page must be locked; either
+ *   condition failing trips a BUG_ON().
+ * - The counter is charged to page->mapping->host rather than to @inode.
+ */
+void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
+{
+ struct fscache_cookie *cookie = nfs_i_fscache(inode);
+
+ BUG_ON(!cookie);
+
+ dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n",
+ cookie, page, NFS_I(inode));
+
+ fscache_wait_on_page_write(cookie, page);
+
+ BUG_ON(!PageLocked(page));
+ fscache_uncache_page(cookie, page);
+ nfs_inc_fscache_stats(page->mapping->host,
+ NFSIOS_FSCACHE_PAGES_UNCACHED);
+}
+
+/*
+ * Handle completion of a page being read from the cache.
+ * - Called in process (keventd) context.
+ * - @context is the nfs_open_context that was handed to
+ *   fscache_read_or_alloc_page()/fscache_read_or_alloc_pages().
+ */
+static void nfs_readpage_from_fscache_complete(struct page *page,
+ void *context,
+ int error)
+{
+ dfprintk(FSCACHE,
+ "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n",
+ page, context, error);
+
+ /* on success the page is marked up to date and unlocked; if the cache
+ * read completed with an error, retry the page via nfs_readpage_async()
+ * and only unlock it here if that call itself returns an error */
+ if (!error) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ } else {
+ error = nfs_readpage_async(context, page->mapping->host, page);
+ if (error)
+ unlock_page(page);
+ }
+}
+
+/*
+ * Retrieve a page from fscache
+ * - Returns 0 if the cache accepted the read (completion is reported through
+ *   nfs_readpage_from_fscache_complete()), 1 if the cache answered -ENOBUFS
+ *   or -ENODATA, and any other fscache result unchanged.
+ */
+int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+				struct inode *inode, struct page *page)
+{
+	int rc;
+
+	dfprintk(FSCACHE,
+		 "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
+		 nfs_i_fscache(inode), page, page->index, page->flags, inode);
+
+	rc = fscache_read_or_alloc_page(nfs_i_fscache(inode), page,
+					nfs_readpage_from_fscache_complete,
+					ctx, GFP_KERNEL);
+
+	if (rc == 0) {
+		/* read BIO submitted (page in fscache) */
+		dfprintk(FSCACHE,
+			 "NFS: readpage_from_fscache: BIO submitted\n");
+		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK);
+		return 0;
+	}
+
+	if (rc == -ENOBUFS || rc == -ENODATA) {
+		/* inode not in cache, or page not in cache */
+		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL);
+		dfprintk(FSCACHE,
+			 "NFS: readpage_from_fscache %d\n", rc);
+		return 1;
+	}
+
+	dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", rc);
+	nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL);
+	return rc;
+}
+
+/*
+ * Retrieve a set of pages from fscache
+ * @ctx: open context handed to the completion handler
+ * @inode: inode whose cookie is read from
+ * @mapping: address space passed on to fscache_read_or_alloc_pages()
+ * @pages: list of pages to be read
+ * @nr_pages: in: number of pages on @pages; out: the count left behind by
+ *            fscache_read_or_alloc_pages()
+ *
+ * Returns 0 if reads were submitted for all pages, 1 if the cache answered
+ * -ENOBUFS or -ENODATA, and any other fscache result unchanged.
+ */
+int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+				 struct inode *inode,
+				 struct address_space *mapping,
+				 struct list_head *pages,
+				 unsigned *nr_pages)
+{
+	unsigned npages = *nr_pages;
+	int ret;
+
+	dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
+		 nfs_i_fscache(inode), npages, inode);
+
+	ret = fscache_read_or_alloc_pages(nfs_i_fscache(inode),
+					  mapping, pages, nr_pages,
+					  nfs_readpage_from_fscache_complete,
+					  ctx,
+					  mapping_gfp_mask(mapping));
+	/*
+	 * Only the pages the cache took off the list count as read OK.  The
+	 * *nr_pages that are left are charged to READ_FAIL just below, so
+	 * adding the full npages here would count those pages twice.
+	 */
+	if (*nr_pages < npages)
+		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK,
+				      npages - *nr_pages);
+	if (*nr_pages > 0)
+		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL,
+				      *nr_pages);
+
+	switch (ret) {
+	case 0: /* read submitted to the cache for all pages */
+		BUG_ON(!list_empty(pages));
+		BUG_ON(*nr_pages != 0);
+		dfprintk(FSCACHE,
+			 "NFS: nfs_getpages_from_fscache: submitted\n");
+
+		return ret;
+
+	case -ENOBUFS: /* some pages aren't cached and can't be */
+	case -ENODATA: /* some pages aren't cached */
+		dfprintk(FSCACHE,
+			 "NFS: nfs_getpages_from_fscache: no page: %d\n", ret);
+		return 1;
+
+	default:
+		dfprintk(FSCACHE,
+			 "NFS: nfs_getpages_from_fscache: ret %d\n", ret);
+	}
+
+	return ret;
+}
+
+/*
+ * Store a newly fetched page in fscache
+ * - PG_fscache must be set on the page
+ * - If the cache rejects the write, the page is uncached again and both the
+ *   WRITTEN_FAIL and UNCACHED counters are bumped.
+ */
+void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
+{
+	int rc;
+
+	dfprintk(FSCACHE,
+		 "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
+		 nfs_i_fscache(inode), page, page->index, page->flags, sync);
+
+	rc = fscache_write_page(nfs_i_fscache(inode), page,
+				inode->i_size, GFP_KERNEL);
+	dfprintk(FSCACHE,
+		 "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
+		 page, page->index, page->flags, rc);
+
+	if (rc == 0) {
+		nfs_inc_fscache_stats(inode,
+				      NFSIOS_FSCACHE_PAGES_WRITTEN_OK);
+		return;
+	}
+
+	fscache_uncache_page(nfs_i_fscache(inode), page);
+	nfs_inc_fscache_stats(inode,
+			      NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL);
+	nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED);
+}
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
new file mode 100644
index 000000000..6754c8607
--- /dev/null
+++ b/fs/nfs/fscache.h
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* NFS filesystem cache interface definitions
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#ifndef _NFS_FSCACHE_H
+#define _NFS_FSCACHE_H
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/fscache.h>
+
+#ifdef CONFIG_NFS_FSCACHE
+
+/*
+ * set of NFS FS-Cache objects that form a superblock key
+ *
+ * The comparable part lives in the nested "key" struct, which ends in a
+ * variable-length uniquifier string.
+ * NOTE(review): presumably instances are allocated with uniq_len extra bytes
+ * for that string - confirm against the code that allocates the key.
+ */
+struct nfs_fscache_key {
+ struct rb_node node; /* tree linkage; the tree itself is defined elsewhere */
+ struct nfs_client *nfs_client; /* the server */
+
+ /* the elements of the unique key - as used by nfs_compare_super() and
+ * nfs_compare_mount_options() to distinguish superblocks */
+ struct {
+ struct {
+ unsigned long s_flags; /* various flags
+ * (& NFS_MS_MASK) */
+ } super;
+
+ struct {
+ struct nfs_fsid fsid;
+ int flags;
+ unsigned int rsize; /* read size */
+ unsigned int wsize; /* write size */
+ unsigned int acregmin; /* attr cache timeouts */
+ unsigned int acregmax;
+ unsigned int acdirmin;
+ unsigned int acdirmax;
+ } nfs_server;
+
+ struct {
+ rpc_authflavor_t au_flavor;
+ } rpc_auth;
+
+ /* uniquifier - can be used if nfs_server.flags includes
+ * NFS_MOUNT_UNSHARED */
+ u8 uniq_len; /* presumably the length of uniquifier[] - confirm */
+ char uniquifier[0]; /* zero-length trailing array: must stay last */
+ } key;
+};
+
+/*
+ * Definition of the auxiliary data attached to NFS inode storage objects
+ * within the cache.
+ *
+ * The contents of this struct are recorded in the on-disk local cache in the
+ * auxiliary data attached to the data storage object backing an inode. This
+ * permits coherency to be managed when a new inode binds to an already extant
+ * cache object.
+ *
+ * nfs_fscache_open_file() fills one of these in with
+ * nfs_fscache_update_auxdata() and passes it to fscache_enable_cookie() or
+ * fscache_disable_cookie().
+ */
+struct nfs_fscache_inode_auxdata {
+ s64 mtime_sec; /* modification time: seconds */
+ s64 mtime_nsec; /* modification time: nanoseconds */
+ s64 ctime_sec; /* change time: seconds */
+ s64 ctime_nsec; /* change time: nanoseconds */
+ u64 change_attr; /* NOTE(review): presumably the NFS change attribute - confirm */
+};
+
+/*
+ * fscache-index.c
+ */
+extern struct fscache_netfs nfs_fscache_netfs;
+extern const struct fscache_cookie_def nfs_fscache_server_index_def;
+extern const struct fscache_cookie_def nfs_fscache_super_index_def;
+extern const struct fscache_cookie_def nfs_fscache_inode_object_def;
+
+extern int nfs_fscache_register(void);
+extern void nfs_fscache_unregister(void);
+
+/*
+ * fscache.c
+ */
+extern void nfs_fscache_get_client_cookie(struct nfs_client *);
+extern void nfs_fscache_release_client_cookie(struct nfs_client *);
+
+extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int);
+extern void nfs_fscache_release_super_cookie(struct super_block *);
+
+extern void nfs_fscache_init_inode(struct inode *);
+extern void nfs_fscache_clear_inode(struct inode *);
+extern void nfs_fscache_open_file(struct inode *, struct file *);
+
+extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);
+extern int nfs_fscache_release_page(struct page *, gfp_t);
+
+extern int __nfs_readpage_from_fscache(struct nfs_open_context *,
+ struct inode *, struct page *);
+extern int __nfs_readpages_from_fscache(struct nfs_open_context *,
+ struct inode *, struct address_space *,
+ struct list_head *, unsigned *);
+extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int);
+
+/*
+ * Wait for a page to finish being written to the cache.  Pages for which
+ * PageFsCache() is false are skipped.
+ */
+static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
+						  struct page *page)
+{
+	if (!PageFsCache(page))
+		return;
+
+	fscache_wait_on_page_write(nfsi->fscache, page);
+}
+
+/*
+ * Drop the caching state of a page that is being invalidated completely.
+ * Only pages for which PageFsCache() is true are passed on to
+ * __nfs_fscache_invalidate_page().
+ */
+static inline void nfs_fscache_invalidate_page(struct page *page,
+					       struct inode *inode)
+{
+	if (!PageFsCache(page))
+		return;
+
+	__nfs_fscache_invalidate_page(page, inode);
+}
+
+/*
+ * Read one page from the inode's data storage object in the cache.
+ * Returns -ENOBUFS straight away when the inode has no cookie.
+ */
+static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+					    struct inode *inode,
+					    struct page *page)
+{
+	if (!NFS_I(inode)->fscache)
+		return -ENOBUFS;
+
+	return __nfs_readpage_from_fscache(ctx, inode, page);
+}
+
+/*
+ * Read a list of pages from the inode's data storage object in the cache.
+ * Returns -ENOBUFS straight away when the inode has no cookie.
+ */
+static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+					     struct inode *inode,
+					     struct address_space *mapping,
+					     struct list_head *pages,
+					     unsigned *nr_pages)
+{
+	if (!NFS_I(inode)->fscache)
+		return -ENOBUFS;
+
+	return __nfs_readpages_from_fscache(ctx, inode, mapping, pages,
+					    nr_pages);
+}
+
+/*
+ * Hand a page that was just fetched from the server to the cache for storage
+ * in the inode's data storage object.  Pages for which PageFsCache() is false
+ * are left alone.
+ */
+static inline void nfs_readpage_to_fscache(struct inode *inode,
+					   struct page *page,
+					   int sync)
+{
+	if (!PageFsCache(page))
+		return;
+
+	__nfs_readpage_to_fscache(inode, page, sync);
+}
+
+/*
+ * Invalidate the contents of fscache for this inode. This will not sleep.
+ * The inode's cookie (NFS_I(inode)->fscache) is passed to
+ * fscache_invalidate() without being checked here first.
+ */
+static inline void nfs_fscache_invalidate(struct inode *inode)
+{
+ fscache_invalidate(NFS_I(inode)->fscache);
+}
+
+/*
+ * Wait for an object to finish being invalidated.
+ * The inode's cookie (NFS_I(inode)->fscache) is passed to
+ * fscache_wait_on_invalidate() without being checked here first.
+ */
+static inline void nfs_fscache_wait_on_invalidate(struct inode *inode)
+{
+ fscache_wait_on_invalidate(NFS_I(inode)->fscache);
+}
+
+/*
+ * Describe the caching state of a server as text: "yes" when server->fscache
+ * is set, otherwise "no " (padded with a trailing space).
+ */
+static inline const char *nfs_server_fscache_state(struct nfs_server *server)
+{
+	return server->fscache ? "yes" : "no ";
+}
+
+#else /* CONFIG_NFS_FSCACHE */
+static inline int nfs_fscache_register(void) { return 0; } /* nothing to register: report success */
+static inline void nfs_fscache_unregister(void) {}
+
+static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {}
+static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
+
+static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
+
+static inline void nfs_fscache_init_inode(struct inode *inode) {}
+static inline void nfs_fscache_clear_inode(struct inode *inode) {}
+static inline void nfs_fscache_open_file(struct inode *inode,
+ struct file *filp) {}
+
+static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+ return 1; /* True: may release page */
+}
+static inline void nfs_fscache_invalidate_page(struct page *page,
+ struct inode *inode) {}
+static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
+ struct page *page) {}
+
+static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+ struct inode *inode,
+ struct page *page)
+{
+ return -ENOBUFS; /* same answer the CONFIG_NFS_FSCACHE wrapper gives for an inode without a cookie */
+}
+static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+ struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ return -ENOBUFS; /* same answer the CONFIG_NFS_FSCACHE wrapper gives for an inode without a cookie */
+}
+static inline void nfs_readpage_to_fscache(struct inode *inode,
+ struct page *page, int sync) {}
+
+
+static inline void nfs_fscache_invalidate(struct inode *inode) {}
+static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) {}
+
+static inline const char *nfs_server_fscache_state(struct nfs_server *server)
+{
+ return "no "; /* trailing space matches the string used when caching is compiled in */
+}
+
+#endif /* CONFIG_NFS_FSCACHE */
+#endif /* _NFS_FSCACHE_H */
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
new file mode 100644
index 000000000..aaeeb4659
--- /dev/null
+++ b/fs/nfs/getroot.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* getroot.c: get the root dentry for an NFS mount
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_CLIENT
+
+/*
+ * Set the superblock root dentry.
+ * Note that this function frees the inode in case of error.
+ *
+ * Nothing is done (and 0 is returned) if the superblock already has a root;
+ * -ENOMEM is returned if d_make_root() fails.
+ */
+static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *inode)
+{
+	/* The mntroot acts as the dummy root dentry for this superblock */
+	if (sb->s_root != NULL)
+		return 0;
+
+	sb->s_root = d_make_root(inode);
+	if (sb->s_root == NULL)
+		return -ENOMEM;
+	ihold(inode);
+
+	/*
+	 * Ensure that this dentry is invisible to d_find_alias().
+	 * Otherwise, it may be spliced into the tree by
+	 * d_splice_alias if a parent directory from the same
+	 * filesystem gets mounted at a later time.
+	 * This again causes shrink_dcache_for_umount_subtree() to
+	 * Oops, since the test for IS_ROOT() will fail.
+	 */
+	spin_lock(&d_inode(sb->s_root)->i_lock);
+	spin_lock(&sb->s_root->d_lock);
+	hlist_del_init(&sb->s_root->d_u.d_alias);
+	spin_unlock(&sb->s_root->d_lock);
+	spin_unlock(&d_inode(sb->s_root)->i_lock);
+
+	return 0;
+}
+
+/*
+ * get an NFS2/NFS3 root dentry from the root filehandle
+ *
+ * On success returns 0 with fc->root set to the root dentry. On failure
+ * returns a negative errno; if fc->root had already been set it is dropped
+ * with dput() and reset to NULL.
+ *
+ * A copy of fc->source is made up front. It is either attached to the root
+ * dentry as d_fsdata or freed before returning.
+ */
+int nfs_get_root(struct super_block *s, struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct nfs_server *server = NFS_SB(s);
+ struct nfs_fsinfo fsinfo;
+ struct dentry *root;
+ struct inode *inode;
+ char *name;
+ int error = -ENOMEM; /* result for the three allocation failures below */
+ unsigned long kflags = 0, kflags_out = 0;
+
+ name = kstrdup(fc->source, GFP_KERNEL);
+ if (!name)
+ goto out;
+
+ /* get the actual root for this mount */
+ fsinfo.fattr = nfs_alloc_fattr();
+ if (fsinfo.fattr == NULL)
+ goto out_name;
+
+ fsinfo.fattr->label = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(fsinfo.fattr->label))
+ goto out_fattr;
+ error = server->nfs_client->rpc_ops->getroot(server, ctx->mntfh, &fsinfo);
+ if (error < 0) {
+ dprintk("nfs_get_root: getattr error = %d\n", -error);
+ nfs_errorf(fc, "NFS: Couldn't getattr on root");
+ goto out_label;
+ }
+
+ inode = nfs_fhget(s, ctx->mntfh, fsinfo.fattr, NULL);
+ if (IS_ERR(inode)) {
+ dprintk("nfs_get_root: get root inode failed\n");
+ error = PTR_ERR(inode);
+ nfs_errorf(fc, "NFS: Couldn't get root inode");
+ goto out_label;
+ }
+
+ error = nfs_superblock_set_dummy_root(s, inode);
+ if (error != 0)
+ goto out_label;
+
+ /* root dentries normally start off anonymous and get spliced in later
+ * if the dentry tree reaches them; however if the dentry already
+ * exists, we'll pick it up at this point and use it as the root
+ */
+ root = d_obtain_root(inode);
+ if (IS_ERR(root)) {
+ dprintk("nfs_get_root: get root dentry failed\n");
+ error = PTR_ERR(root);
+ nfs_errorf(fc, "NFS: Couldn't get root dentry");
+ goto out_label;
+ }
+
+ security_d_instantiate(root, inode);
+ spin_lock(&root->d_lock);
+ if (IS_ROOT(root) && !root->d_fsdata &&
+ !(root->d_flags & DCACHE_NFSFS_RENAMED)) {
+ root->d_fsdata = name;
+ name = NULL; /* now referenced by the dentry: keep kfree(name) below from freeing it */
+ }
+ spin_unlock(&root->d_lock);
+ fc->root = root;
+ if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
+ kflags |= SECURITY_LSM_NATIVE_LABELS;
+ if (ctx->clone_data.sb) {
+ if (d_inode(fc->root)->i_fop != &nfs_dir_operations) {
+ error = -ESTALE;
+ goto error_splat_root;
+ }
+ /* clone lsm security options from the parent to the new sb */
+ error = security_sb_clone_mnt_opts(ctx->clone_data.sb,
+ s, kflags, &kflags_out);
+ } else {
+ error = security_sb_set_mnt_opts(s, fc->security,
+ kflags, &kflags_out);
+ }
+ if (error)
+ goto error_splat_root;
+ if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL &&
+ !(kflags_out & SECURITY_LSM_NATIVE_LABELS))
+ NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; /* native labels were requested but not reported back in kflags_out */
+
+ nfs_setsecurity(inode, fsinfo.fattr, fsinfo.fattr->label);
+ error = 0;
+
+out_label: /* the success path falls through these cleanup labels as well */
+ nfs4_label_free(fsinfo.fattr->label);
+out_fattr:
+ nfs_free_fattr(fsinfo.fattr);
+out_name:
+ kfree(name);
+out:
+ return error;
+error_splat_root:
+ dput(fc->root);
+ fc->root = NULL;
+ goto out_label;
+}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
new file mode 100644
index 000000000..36f415278
--- /dev/null
+++ b/fs/nfs/inode.c
@@ -0,0 +1,2346 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/inode.c
+ *
+ * Copyright (C) 1992 Rick Sladkey
+ *
+ * nfs inode and superblock handling functions
+ *
+ * Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some
+ * experimental NFS changes. Modularisation taken straight from SYS5 fs.
+ *
+ * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
+ * J.S.Peatfield@damtp.cam.ac.uk
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched/signal.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/nfs_xdr.h>
+#include <linux/slab.h>
+#include <linux/compat.h>
+#include <linux/freezer.h>
+#include <linux/uaccess.h>
+#include <linux/iversion.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+#include "fscache.h"
+#include "pnfs.h"
+#include "nfs.h"
+#include "netns.h"
+#include "sysfs.h"
+
+#include "nfstrace.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1
+
+/* Default is to see 64-bit inode numbers */
+static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
+
+static int nfs_update_inode(struct inode *, struct nfs_fattr *);
+
+static struct kmem_cache * nfs_inode_cachep;
+
+/*
+ * Map the fileid carried in a set of attributes to the inode-number value
+ * produced by nfs_fileid_to_ino_t().
+ */
+static inline unsigned long
+nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
+{
+	unsigned long ino = nfs_fileid_to_ino_t(fattr->fileid);
+
+	return ino;
+}
+
+/*
+ * Schedule once (freezable), then report -ERESTARTSYS if a signal that
+ * matters for @mode is pending on the current task, else 0.
+ */
+static int nfs_wait_killable(int mode)
+{
+	freezable_schedule_unsafe();
+
+	return signal_pending_state(mode, current) ? -ERESTARTSYS : 0;
+}
+
+/*
+ * Wait-bit flavour of nfs_wait_killable(): @key is not looked at, the wait is
+ * delegated to nfs_wait_killable() and its result is returned.
+ */
+int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
+{
+	int status = nfs_wait_killable(mode);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
+
+/**
+ * nfs_compat_user_ino64 - returns the user-visible inode number
+ * @fileid: 64-bit fileid
+ *
+ * This function returns a 32-bit inode number if the boot parameter
+ * nfs.enable_ino64 is zero.
+ *
+ * With enable_ino64 set, @fileid is returned unchanged. Otherwise it is
+ * truncated to the local "ino" type (compat_ulong_t under CONFIG_COMPAT,
+ * unsigned long otherwise). When that type is narrower than @fileid, the
+ * discarded upper bits are XORed into the result so that they still
+ * influence it; when it is not narrower, the value passes through intact.
+ */
+u64 nfs_compat_user_ino64(u64 fileid)
+{
+#ifdef CONFIG_COMPAT
+ compat_ulong_t ino;
+#else
+ unsigned long ino;
+#endif
+
+ if (enable_ino64)
+ return fileid;
+ ino = fileid;
+ if (sizeof(ino) < sizeof(fileid))
+ ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8; /* fold the truncated high bits back in */
+ return ino;
+}
+
+/*
+ * Tell the VFS whether to drop the inode: always for a stale inode, otherwise
+ * whenever generic_drop_inode() says so.  Returns 1 or 0.
+ */
+int nfs_drop_inode(struct inode *inode)
+{
+	if (NFS_STALE(inode))
+		return 1;
+
+	return generic_drop_inode(inode) != 0;
+}
+EXPORT_SYMBOL_GPL(nfs_drop_inode);
+
+/*
+ * Release the NFS-private caches hanging off an inode: the ACL cache, the
+ * access cache and the fscache state.
+ */
+void nfs_clear_inode(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	/* Neither of these conditions should ever be seen here... */
+	WARN_ON_ONCE(nfs_have_writebacks(inode));
+	WARN_ON_ONCE(!list_empty(&nfsi->open_files));
+
+	nfs_zap_acl_cache(inode);
+	nfs_access_zap_cache(inode);
+	nfs_fscache_clear_inode(inode);
+}
+EXPORT_SYMBOL_GPL(nfs_clear_inode);
+
+/*
+ * Evict an inode: truncate its page cache, run clear_inode() on it, then
+ * release the NFS-specific state with nfs_clear_inode().
+ */
+void nfs_evict_inode(struct inode *inode)
+{
+	struct address_space *mapping = &inode->i_data;
+
+	truncate_inode_pages_final(mapping);
+	clear_inode(inode);
+	nfs_clear_inode(inode);
+}
+
+/*
+ * Call inode_dio_wait() on the inode, then write everything back with
+ * nfs_wb_all().  Returns the result of nfs_wb_all().
+ */
+int nfs_sync_inode(struct inode *inode)
+{
+	int status;
+
+	inode_dio_wait(inode);
+	status = nfs_wb_all(inode);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(nfs_sync_inode);
+
+/**
+ * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
+ * @mapping: pointer to struct address_space
+ *
+ * Returns 0 if the mapping holds no pages, otherwise the result of
+ * nfs_wb_all() on the owning inode after the mapping has been unmapped.
+ */
+int nfs_sync_mapping(struct address_space *mapping)
+{
+	if (mapping->nrpages == 0)
+		return 0;
+
+	unmap_mapping_range(mapping, 0, 0, 0);
+	return nfs_wb_all(mapping->host);
+}
+
+/*
+ * Has the attribute cache timed out?  Returns 0 while jiffies lies inside the
+ * window that starts at read_cache_jiffies and lasts attrtimeo, 1 otherwise.
+ */
+static int nfs_attribute_timeout(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	if (time_in_range_open(jiffies, nfsi->read_cache_jiffies,
+			       nfsi->read_cache_jiffies + nfsi->attrtimeo))
+		return 0;
+	return 1;
+}
+
+/*
+ * Cache validity test used while a read delegation is held: only the
+ * cache_validity bits are consulted, never the attribute timeout.
+ */
+static bool nfs_check_cache_invalid_delegated(struct inode *inode, unsigned long flags)
+{
+	unsigned long validity = READ_ONCE(NFS_I(inode)->cache_validity);
+	bool forced = (validity & NFS_INO_REVAL_FORCED) != 0;
+
+	/* Special case for the pagecache or access cache */
+	if (flags == NFS_INO_REVAL_PAGECACHE && !forced)
+		return false;
+
+	return (validity & flags) != 0;
+}
+
+/*
+ * Cache validity test used without a delegation: the cache is invalid if any
+ * of the requested bits is set or the attribute cache has timed out.
+ */
+static bool nfs_check_cache_invalid_not_delegated(struct inode *inode, unsigned long flags)
+{
+	unsigned long validity = READ_ONCE(NFS_I(inode)->cache_validity);
+
+	return (validity & flags) != 0 || nfs_attribute_timeout(inode);
+}
+
+/*
+ * Report whether any of the cache_validity bits in @flags makes the inode's
+ * cached state invalid, picking the delegated or non-delegated rules
+ * according to whether a read delegation is held.
+ */
+bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags)
+{
+	bool delegated = NFS_PROTO(inode)->have_delegation(inode, FMODE_READ);
+
+	return delegated ? nfs_check_cache_invalid_delegated(inode, flags)
+			 : nfs_check_cache_invalid_not_delegated(inode, flags);
+}
+EXPORT_SYMBOL_GPL(nfs_check_cache_invalid);
+
+/*
+ * Does this inode have an xattr cache attached?  Always false when
+ * CONFIG_NFS_V4_2 is not set.
+ */
+static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi)
+{
+#ifdef CONFIG_NFS_V4_2
+	return nfsi->xattr_cache != NULL;
+#else
+	return false;
+#endif
+}
+
+static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ bool have_delegation = NFS_PROTO(inode)->have_delegation(inode, FMODE_READ);
+
+ if (have_delegation) { /* read delegation held: filter what may be marked invalid */
+ if (!(flags & NFS_INO_REVAL_FORCED))
+ flags &= ~NFS_INO_INVALID_OTHER;
+ flags &= ~(NFS_INO_INVALID_CHANGE
+ | NFS_INO_INVALID_SIZE
+ | NFS_INO_REVAL_PAGECACHE
+ | NFS_INO_INVALID_XATTR);
+ } else if (flags & NFS_INO_REVAL_PAGECACHE) /* no delegation: pagecache revalidation also invalidates change attr and size */
+ flags |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE;
+
+ if (!nfs_has_xattr_cache(nfsi)) /* no xattr cache, so nothing to mark invalid */
+ flags &= ~NFS_INO_INVALID_XATTR;
+ if (inode->i_mapping->nrpages == 0) /* no pages in the mapping, so no data to mark invalid */
+ flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER);
+ nfsi->cache_validity |= flags; /* only the bits that survived the filtering are recorded */
+ if (flags & NFS_INO_INVALID_DATA) /* invalidate fscache along with the data */
+ nfs_fscache_invalidate(inode);
+}
+
+/*
+ * Invalidate the local caches
+ *
+ * Resets the attribute timeout to its minimum, clears the cookie verifier and
+ * marks attributes, access, ACL, xattr and pagecache state invalid.  Cached
+ * data is marked invalid as well for regular files, directories and symlinks.
+ */
+static void nfs_zap_caches_locked(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	int mode = inode->i_mode;
+	unsigned long invalid = NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL
+				| NFS_INO_INVALID_XATTR
+				| NFS_INO_REVAL_PAGECACHE;
+
+	nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
+
+	nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
+	nfsi->attrtimeo_timestamp = jiffies;
+
+	memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
+
+	/* only these file types carry data that needs invalidating */
+	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
+		invalid |= NFS_INO_INVALID_DATA;
+	nfs_set_cache_invalid(inode, invalid);
+
+	nfs_zap_label_cache_locked(nfsi);
+}
+
+/*
+ * Invalidate the local caches of an inode, taking inode->i_lock around
+ * nfs_zap_caches_locked().
+ */
+void nfs_zap_caches(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	nfs_zap_caches_locked(inode);
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Mark the inode's cached data invalid, but only if @mapping actually holds
+ * pages.  inode->i_lock is taken around the update.
+ */
+void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
+{
+	if (mapping->nrpages == 0)
+		return;
+
+	spin_lock(&inode->i_lock);
+	nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Throw away cached ACLs: call the protocol's clear_acl_cache hook if it has
+ * one, then clear NFS_INO_INVALID_ACL under inode->i_lock.
+ */
+void nfs_zap_acl_cache(struct inode *inode)
+{
+	void (*zap)(struct inode *) = NFS_PROTO(inode)->clear_acl_cache;
+
+	if (zap)
+		zap(inode);
+
+	spin_lock(&inode->i_lock);
+	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL;
+	spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_zap_acl_cache);
+
+/*
+ * Mark the cached atime of an inode invalid, taking inode->i_lock around the
+ * update.
+ */
+void nfs_invalidate_atime(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
+	spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_invalidate_atime);
+
+/*
+ * Flag the inode as stale and zap its caches, without unhashing it.
+ * NB: the caller must hold inode->i_lock!
+ */
+static void nfs_set_inode_stale_locked(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	set_bit(NFS_INO_STALE, &nfsi->flags);
+	nfs_zap_caches_locked(inode);
+	trace_nfs_set_inode_stale(inode);
+}
+
+/*
+ * Flag the inode as stale, taking inode->i_lock around
+ * nfs_set_inode_stale_locked().
+ */
+void nfs_set_inode_stale(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	nfs_set_inode_stale_locked(inode);
+	spin_unlock(&inode->i_lock);
+}
+
+struct nfs_find_desc {
+ struct nfs_fh *fh; /* file handle compared by nfs_find_actor() and copied by nfs_init_locked() */
+ struct nfs_fattr *fattr; /* attributes supplying the fileid and mode used by both callbacks */
+};
+
+/*
+ * In NFSv3 we can have 64bit inode numbers. In order to support
+ * this, and re-exported directories (also seen in NFSv2)
+ * we are forced to allow 2 different inodes to have the same
+ * i_ino.
+ *
+ * Match callback for the inode hash lookups: returns 1 only if the inode has
+ * the wanted fileid, type and file handle and is neither bad nor stale.
+ */
+static int
+nfs_find_actor(struct inode *inode, void *opaque)
+{
+	struct nfs_find_desc *desc = opaque;
+	struct nfs_fattr *fattr = desc->fattr;
+
+	if (NFS_FILEID(inode) != fattr->fileid ||
+	    inode_wrong_type(inode, fattr->mode) ||
+	    nfs_compare_fh(NFS_FH(inode), desc->fh) ||
+	    is_bad_inode(inode) ||
+	    NFS_STALE(inode))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Initialisation callback handed to iget5_locked() by nfs_fhget(): seed the
+ * inode with the fileid, mode and file handle from the lookup descriptor.
+ * Always returns 0.
+ */
+static int
+nfs_init_locked(struct inode *inode, void *opaque)
+{
+	struct nfs_find_desc *desc = opaque;
+
+	set_nfs_fileid(inode, desc->fattr->fileid);
+	inode->i_mode = desc->fattr->mode;
+	nfs_copy_fh(NFS_FH(inode), desc->fh);
+
+	return 0;
+}
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+/*
+ * Clear NFS_INO_INVALID_LABEL in the inode's cache_validity, under
+ * inode->i_lock.
+ */
+static void nfs_clear_label_invalid(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	spin_lock(&inode->i_lock);
+	nfsi->cache_validity &= ~NFS_INO_INVALID_LABEL;
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Pass a security label received from the server on to the LSM.
+ *
+ * Nothing happens without a label, without a label in @fattr, or if the inode
+ * has no i_security.  A failure of security_inode_notifysecctx() is logged
+ * only; the label-invalid bit is cleared either way.
+ */
+void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+					struct nfs4_label *label)
+{
+	int error;
+
+	if (label == NULL)
+		return;
+
+	if (!(fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) ||
+	    !inode->i_security)
+		return;
+
+	error = security_inode_notifysecctx(inode, label->label,
+					    label->len);
+	if (error)
+		printk(KERN_ERR "%s() %s %d "
+				"security_inode_notifysecctx() %d\n",
+				__func__,
+				(char *)label->label,
+				label->len, error);
+	nfs_clear_label_invalid(inode);
+}
+
+/*
+ * Allocate a security label with a zeroed NFS4_MAXLABELLEN byte buffer.
+ *
+ * Returns NULL if the client's minor version is below 2 or the server lacks
+ * NFS_CAP_SECURITY_LABEL, ERR_PTR(-ENOMEM) if an allocation fails, and the
+ * new label otherwise.
+ */
+struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
+{
+	struct nfs4_label *label;
+	int minor_version = server->nfs_client->cl_minorversion;
+
+	if (minor_version < 2)
+		return NULL;
+	if (!(server->caps & NFS_CAP_SECURITY_LABEL))
+		return NULL;
+
+	label = kzalloc(sizeof(struct nfs4_label), flags);
+	if (!label)
+		return ERR_PTR(-ENOMEM);
+
+	label->label = kzalloc(NFS4_MAXLABELLEN, flags);
+	if (!label->label) {
+		kfree(label);
+		return ERR_PTR(-ENOMEM);
+	}
+	label->len = NFS4_MAXLABELLEN;
+
+	return label;
+}
+EXPORT_SYMBOL_GPL(nfs4_label_alloc);
+#else
+void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+ struct nfs4_label *label) /* no-op variant built when CONFIG_NFS_V4_SECURITY_LABEL is not set */
+{
+}
+#endif
+EXPORT_SYMBOL_GPL(nfs_setsecurity);
+
+/*
+ * Search for inode identified by fh, fileid and i_mode in inode cache.
+ * Returns NULL without searching if @fattr carries no fileid or no type;
+ * otherwise returns whatever ilookup5() finds.
+ */
+struct inode *
+nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh)
+{
+	struct nfs_find_desc desc = {
+		.fh	= fh,
+		.fattr	= fattr,
+	};
+	struct inode *found;
+
+	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
+		return NULL;
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
+		return NULL;
+
+	found = ilookup5(sb, nfs_fattr_to_ino_t(fattr), nfs_find_actor, &desc);
+
+	dprintk("%s: returning %p\n", __func__, found);
+	return found;
+}
+
+/*
+ * This is our front-end to iget that looks up inodes by file handle
+ * instead of inode number.
+ *
+ * The inode is looked up (or created) with iget5_locked(), hashed on the
+ * fileid and matched by nfs_find_actor(). A new inode (I_NEW) is populated
+ * from @fattr; attributes missing from @fattr are flagged in cache_validity
+ * (for most of them only if the server is capable of supplying them). An
+ * inode that already existed is refreshed with nfs_refresh_inode() instead.
+ *
+ * Returns the inode, or an ERR_PTR(): -ENOENT if @fattr lacks the fileid or
+ * the type, -ENOMEM if iget5_locked() returns NULL, or the error from
+ * nfs_refresh_inode().
+ */
+struct inode *
+nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label)
+{
+ struct nfs_find_desc desc = {
+ .fh = fh,
+ .fattr = fattr
+ };
+ struct inode *inode = ERR_PTR(-ENOENT);
+ unsigned long hash;
+
+ nfs_attr_check_mountpoint(sb, fattr);
+
+ if (nfs_attr_use_mounted_on_fileid(fattr))
+ fattr->fileid = fattr->mounted_on_fileid;
+ else if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
+ goto out_no_inode;
+ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
+ goto out_no_inode;
+
+ hash = nfs_fattr_to_ino_t(fattr);
+
+ inode = iget5_locked(sb, hash, nfs_find_actor, nfs_init_locked, &desc);
+ if (inode == NULL) {
+ inode = ERR_PTR(-ENOMEM);
+ goto out_no_inode;
+ }
+
+ if (inode->i_state & I_NEW) {
+ struct nfs_inode *nfsi = NFS_I(inode);
+ unsigned long now = jiffies;
+
+ /* We set i_ino for the few things that still rely on it,
+ * such as stat(2) */
+ inode->i_ino = hash;
+
+ /* We can't support update_atime(), since the server will reset it */
+ inode->i_flags |= S_NOATIME|S_NOCMTIME;
+ inode->i_mode = fattr->mode;
+ nfsi->cache_validity = 0;
+ if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
+ && nfs_server_capable(inode, NFS_CAP_MODE))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+ /* Why so? Because we want revalidate for devices/FIFOs, and
+ * that's precisely what we have in nfs_file_inode_operations.
+ */
+ inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops;
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
+ inode->i_data.a_ops = &nfs_file_aops;
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
+ inode->i_fop = &nfs_dir_operations;
+ inode->i_data.a_ops = &nfs_dir_aops;
+ /* Deal with crossing mountpoints */
+ if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT ||
+ fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
+ if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
+ inode->i_op = &nfs_referral_inode_operations;
+ else
+ inode->i_op = &nfs_mountpoint_inode_operations;
+ inode->i_fop = NULL;
+ inode->i_flags |= S_AUTOMOUNT;
+ }
+ } else if (S_ISLNK(inode->i_mode)) {
+ inode->i_op = &nfs_symlink_inode_operations;
+ inode_nohighmem(inode);
+ } else
+ init_special_inode(inode, inode->i_mode, fattr->rdev);
+
+ memset(&inode->i_atime, 0, sizeof(inode->i_atime)); /* defaults first; overwritten below by whatever fattr carries */
+ memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
+ memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
+ inode_set_iversion_raw(inode, 0);
+ inode->i_size = 0;
+ clear_nlink(inode);
+ inode->i_uid = make_kuid(&init_user_ns, -2);
+ inode->i_gid = make_kgid(&init_user_ns, -2);
+ inode->i_blocks = 0;
+ memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+ nfsi->write_io = 0;
+ nfsi->read_io = 0;
+
+ nfsi->read_cache_jiffies = fattr->time_start;
+ nfsi->attr_gencount = fattr->gencount;
+ if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+ inode->i_atime = fattr->atime;
+ else if (nfs_server_capable(inode, NFS_CAP_ATIME))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
+ if (fattr->valid & NFS_ATTR_FATTR_MTIME)
+ inode->i_mtime = fattr->mtime;
+ else if (nfs_server_capable(inode, NFS_CAP_MTIME))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
+ if (fattr->valid & NFS_ATTR_FATTR_CTIME)
+ inode->i_ctime = fattr->ctime;
+ else if (nfs_server_capable(inode, NFS_CAP_CTIME))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME);
+ if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
+ inode_set_iversion_raw(inode, fattr->change_attr);
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE);
+ if (fattr->valid & NFS_ATTR_FATTR_SIZE)
+ inode->i_size = nfs_size_to_loff_t(fattr->size);
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_SIZE);
+ if (fattr->valid & NFS_ATTR_FATTR_NLINK)
+ set_nlink(inode, fattr->nlink);
+ else if (nfs_server_capable(inode, NFS_CAP_NLINK))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+ if (fattr->valid & NFS_ATTR_FATTR_OWNER)
+ inode->i_uid = fattr->uid;
+ else if (nfs_server_capable(inode, NFS_CAP_OWNER))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+ if (fattr->valid & NFS_ATTR_FATTR_GROUP)
+ inode->i_gid = fattr->gid;
+ else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+ if (nfs_server_capable(inode, NFS_CAP_XATTR))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR);
+ if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+ inode->i_blocks = fattr->du.nfs2.blocks;
+ if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
+ /*
+ * report the blocks in 512byte units
+ */
+ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
+ }
+
+ if (nfsi->cache_validity != 0) /* at least one attribute was flagged invalid above */
+ nfsi->cache_validity |= NFS_INO_REVAL_FORCED;
+
+ nfs_setsecurity(inode, fattr, label);
+
+ nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
+ nfsi->attrtimeo_timestamp = now;
+ nfsi->access_cache = RB_ROOT;
+
+ nfs_fscache_init_inode(inode);
+
+ unlock_new_inode(inode);
+ } else {
+ int err = nfs_refresh_inode(inode, fattr);
+ if (err < 0) {
+ iput(inode); /* drop the reference iget5_locked() returned */
+ inode = ERR_PTR(err);
+ goto out_no_inode;
+ }
+ }
+ dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n",
+ inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(inode),
+ nfs_display_fhandle_hash(fh),
+ atomic_read(&inode->i_count));
+
+out:
+ return inode;
+
+out_no_inode:
+ dprintk("nfs_fhget: iget failed with error %ld\n", PTR_ERR(inode));
+ goto out;
+}
+EXPORT_SYMBOL_GPL(nfs_fhget);
+
+#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN)
+/*
+ * nfs_setattr - change the attributes of the inode behind @dentry
+ * @dentry: dentry whose inode is being modified
+ * @attr: requested changes; attr->ia_valid is trimmed in place
+ *
+ * NFS_VALID_ATTRS is the set of ia_valid bits that survive the trimming.
+ * If nothing other than ATTR_FILE/ATTR_OPEN remains, the function returns 0
+ * without calling the protocol's ->setattr().
+ *
+ * Returns 0 on success or when nothing is left to change, otherwise the
+ * error from inode_newsize_ok(), -ENOMEM if no nfs_fattr could be
+ * allocated, or the error from ->setattr() / nfs_refresh_inode().
+ */
+int
+nfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = d_inode(dentry);
+ struct nfs_fattr *fattr;
+ int error = 0;
+
+ nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
+
+ /* skip mode change if it's just for clearing setuid/setgid */
+ if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
+ attr->ia_valid &= ~ATTR_MODE;
+
+ if (attr->ia_valid & ATTR_SIZE) {
+ BUG_ON(!S_ISREG(inode->i_mode));
+
+ error = inode_newsize_ok(inode, attr->ia_size);
+ if (error)
+ return error;
+
+ if (attr->ia_size == i_size_read(inode))
+ attr->ia_valid &= ~ATTR_SIZE; /* size already matches: drop the request */
+ }
+
+ /* Optimization: if the end result is no change, don't RPC */
+ attr->ia_valid &= NFS_VALID_ATTRS;
+ if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)
+ return 0;
+
+ /* From here on every exit goes through 'out' so the trace pair matches. */
+ trace_nfs_setattr_enter(inode);
+
+ /* Write all dirty data */
+ if (S_ISREG(inode->i_mode))
+ nfs_sync_inode(inode); /* return value is not checked */
+
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
+ if (error == 0)
+ error = nfs_refresh_inode(inode, fattr);
+ nfs_free_fattr(fattr);
+out:
+ trace_nfs_setattr_exit(inode, error);
+ return error;
+}
+EXPORT_SYMBOL_GPL(nfs_setattr);
+
+/**
+ * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
+ *
+ * This is a copy of the common vmtruncate, but with the locking
+ * corrected to take into account the fact that NFS requires
+ * inode->i_size to be updated under the inode->i_lock.
+ * Note: must be called with inode->i_lock held!
+ *
+ * The lock is dropped around truncate_pagecache() and re-taken before
+ * returning, so the caller gets it back held.
+ *
+ * Return: 0 on success, or the error from inode_newsize_ok(), in which
+ * case i_size, the cache flags and the page cache are left untouched.
+ */
+static int nfs_vmtruncate(struct inode * inode, loff_t offset)
+{
+ int err;
+
+ err = inode_newsize_ok(inode, offset);
+ if (err)
+ goto out;
+
+ i_size_write(inode, offset);
+ /* Optimisation */
+ if (offset == 0)
+ NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_DATA |
+ NFS_INO_DATA_INVAL_DEFER);
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; /* i_size was just written above */
+
+ spin_unlock(&inode->i_lock);
+ truncate_pagecache(inode, offset);
+ spin_lock(&inode->i_lock);
+out:
+ return err;
+}
+
+/**
+ * nfs_setattr_update_inode - Update inode metadata after a setattr call.
+ * @inode: pointer to struct inode
+ * @attr: pointer to struct iattr
+ * @fattr: pointer to struct nfs_fattr
+ *
+ * Note: we do this in the *proc.c in order to ensure that
+ * it works for things like exclusive creates too.
+ *
+ * Takes and releases inode->i_lock itself. @fattr gets a fresh generation
+ * count, which is also stored in the inode, before any attribute is
+ * applied. For each group of attributes named in @attr->ia_valid the
+ * values from @fattr are preferred; where @fattr lacks them, either the
+ * value from @attr is used or the matching cache-invalid flag is set.
+ * Finally, if @fattr carries any valid attribute, nfs_update_inode() is run.
+ */
+void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
+ struct nfs_fattr *fattr)
+{
+ /* Barrier: bump the attribute generation count. */
+ nfs_fattr_set_barrier(fattr);
+
+ spin_lock(&inode->i_lock);
+ NFS_I(inode)->attr_gencount = fattr->gencount;
+ if ((attr->ia_valid & ATTR_SIZE) != 0) {
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
+ nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
+ nfs_vmtruncate(inode, attr->ia_size); /* return value ignored; i_lock is held again afterwards */
+ }
+ if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_CTIME;
+ if ((attr->ia_valid & ATTR_MODE) != 0) {
+ /* replace only the permission bits, keep the file type bits */
+ int mode = attr->ia_mode & S_IALLUGO;
+ mode |= inode->i_mode & ~S_IALLUGO;
+ inode->i_mode = mode;
+ }
+ if ((attr->ia_valid & ATTR_UID) != 0)
+ inode->i_uid = attr->ia_uid;
+ if ((attr->ia_valid & ATTR_GID) != 0)
+ inode->i_gid = attr->ia_gid;
+ if (fattr->valid & NFS_ATTR_FATTR_CTIME)
+ inode->i_ctime = fattr->ctime;
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE
+ | NFS_INO_INVALID_CTIME);
+ /* mode/owner changed: cached access results and ACLs are flagged invalid */
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL);
+ }
+ if (attr->ia_valid & (ATTR_ATIME_SET|ATTR_ATIME)) {
+ NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_ATIME
+ | NFS_INO_INVALID_CTIME);
+ if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+ inode->i_atime = fattr->atime;
+ else if (attr->ia_valid & ATTR_ATIME_SET)
+ inode->i_atime = attr->ia_atime;
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
+
+ if (fattr->valid & NFS_ATTR_FATTR_CTIME)
+ inode->i_ctime = fattr->ctime;
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE
+ | NFS_INO_INVALID_CTIME);
+ }
+ if (attr->ia_valid & (ATTR_MTIME_SET|ATTR_MTIME)) {
+ NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_MTIME
+ | NFS_INO_INVALID_CTIME);
+ if (fattr->valid & NFS_ATTR_FATTR_MTIME)
+ inode->i_mtime = fattr->mtime;
+ else if (attr->ia_valid & ATTR_MTIME_SET)
+ inode->i_mtime = attr->ia_mtime;
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
+
+ if (fattr->valid & NFS_ATTR_FATTR_CTIME)
+ inode->i_ctime = fattr->ctime;
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE
+ | NFS_INO_INVALID_CTIME);
+ }
+ if (fattr->valid)
+ nfs_update_inode(inode, fattr); /* return value is not checked */
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
+
+/*
+ * An attribute lookup on @dentry missed the cache: if the server is
+ * READDIRPLUS capable, force READDIRPLUS use on the parent directory.
+ */
+static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry)
+{
+	struct dentry *dir;
+
+	if (nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) {
+		/* dget_parent() hands back a reference; dropped by dput() below */
+		dir = dget_parent(dentry);
+		nfs_force_use_readdirplus(d_inode(dir));
+		dput(dir);
+	}
+}
+
+/*
+ * An attribute lookup on @dentry was served from the cache: if the server
+ * is READDIRPLUS capable, advise READDIRPLUS use on the parent directory.
+ */
+static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry)
+{
+	struct dentry *dir;
+
+	if (nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) {
+		/* dget_parent() hands back a reference; dropped by dput() below */
+		dir = dget_parent(dentry);
+		nfs_advise_use_readdirplus(d_inode(dir));
+		dput(dir);
+	}
+}
+
+/*
+ * True when the cached attributes or label are flagged invalid, or when
+ * the attribute cache has expired.
+ */
+static bool nfs_need_revalidate_inode(struct inode *inode)
+{
+	const unsigned long stale_bits =
+		NFS_INO_INVALID_ATTR | NFS_INO_INVALID_LABEL;
+
+	return (NFS_I(inode)->cache_validity & stale_bits) != 0 ||
+	       nfs_attribute_cache_expired(inode);
+}
+/*
+ * nfs_getattr - fill in @stat for the inode behind @path
+ * @path: path being queried; path->mnt supplies the noatime mount flags
+ * @stat: result buffer, filled by generic_fillattr()
+ * @request_mask: STATX_* bits the caller is interested in
+ * @query_flags: AT_STATX_* flags (FORCE_SYNC / DONT_SYNC are examined)
+ *
+ * With AT_STATX_DONT_SYNC (and no FORCE_SYNC) the cached attributes are
+ * returned as they are. Otherwise dirty data of regular files is flushed
+ * when c/mtime is requested, and the inode is revalidated if the cache is
+ * stale for any of the requested attributes or FORCE_SYNC is set.
+ *
+ * Returns 0 on success or the error from __nfs_revalidate_inode(); on
+ * error @stat is not filled in.
+ */
+int nfs_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct nfs_server *server = NFS_SERVER(inode);
+ unsigned long cache_validity;
+ int err = 0;
+ bool force_sync = query_flags & AT_STATX_FORCE_SYNC;
+ bool do_update = false;
+
+ trace_nfs_getattr_enter(inode);
+
+ if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) {
+ nfs_readdirplus_parent_cache_hit(path->dentry);
+ goto out_no_update;
+ }
+
+ /* Flush out writes to the server in order to update c/mtime. */
+ if ((request_mask & (STATX_CTIME | STATX_MTIME)) &&
+ S_ISREG(inode->i_mode))
+ filemap_write_and_wait(inode->i_mapping); /* return value is not checked */
+
+ /*
+ * We may force a getattr if the user cares about atime.
+ *
+ * Note that we only have to check the vfsmount flags here:
+ * - NFS always sets S_NOATIME, so checking it would give a
+ * bogus result
+ * - NFS never sets SB_NOATIME or SB_NODIRATIME so there is
+ * no point in checking those.
+ */
+ if ((path->mnt->mnt_flags & MNT_NOATIME) ||
+ ((path->mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
+ request_mask &= ~STATX_ATIME;
+
+ /* Is the user requesting attributes that might need revalidation? */
+ if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME|
+ STATX_MTIME|STATX_UID|STATX_GID|
+ STATX_SIZE|STATX_BLOCKS)))
+ goto out_no_revalidate;
+
+ /* Check whether the cached attributes are stale */
+ do_update |= force_sync || nfs_attribute_cache_expired(inode);
+ cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
+ do_update |= cache_validity &
+ (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL);
+ if (request_mask & STATX_ATIME)
+ do_update |= cache_validity & NFS_INO_INVALID_ATIME;
+ if (request_mask & (STATX_CTIME|STATX_MTIME))
+ do_update |= cache_validity & NFS_INO_REVAL_PAGECACHE;
+ if (request_mask & STATX_BLOCKS)
+ do_update |= cache_validity & NFS_INO_INVALID_BLOCKS;
+ if (do_update) {
+ /* Update the attribute cache */
+ if (!(server->flags & NFS_MOUNT_NOAC))
+ nfs_readdirplus_parent_cache_miss(path->dentry);
+ else
+ nfs_readdirplus_parent_cache_hit(path->dentry);
+ err = __nfs_revalidate_inode(server, inode);
+ if (err)
+ goto out; /* skips generic_fillattr(): @stat stays unfilled */
+ } else
+ nfs_readdirplus_parent_cache_hit(path->dentry);
+out_no_revalidate:
+ /* Only return attributes that were revalidated. */
+ stat->result_mask &= request_mask;
+out_no_update:
+ generic_fillattr(inode, stat);
+ stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
+ if (S_ISDIR(inode->i_mode))
+ stat->blksize = NFS_SERVER(inode)->dtsize;
+out:
+ trace_nfs_getattr_exit(inode, err);
+ return err;
+}
+EXPORT_SYMBOL_GPL(nfs_getattr);
+/*
+ * Initialise a lock context: one reference, owned by current->files,
+ * on no list, and with no I/O counted. ->open_context is not touched
+ * here; the callers in this file set it themselves.
+ */
+static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
+{
+ refcount_set(&l_ctx->count, 1);
+ l_ctx->lockowner = current->files;
+ INIT_LIST_HEAD(&l_ctx->list);
+ atomic_set(&l_ctx->io_count, 0);
+}
+/*
+ * Look up the lock context on @ctx whose owner is current->files and take
+ * a reference on it. Entries whose refcount has already dropped to zero
+ * are skipped. Returns the referenced context, or NULL if none was found.
+ *
+ * nfs_get_lock_context() calls this once under rcu_read_lock() and once
+ * under inode->i_lock.
+ */
+static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
+{
+ struct nfs_lock_context *pos;
+
+ list_for_each_entry_rcu(pos, &ctx->lock_context.list, list) {
+ if (pos->lockowner != current->files)
+ continue;
+ if (refcount_inc_not_zero(&pos->count))
+ return pos;
+ }
+ return NULL;
+}
+/*
+ * nfs_get_lock_context - find or create the lock context of current->files
+ * @ctx: open context to search / attach to
+ *
+ * Returns the lock context with a reference held, ERR_PTR(-ENOMEM) if a
+ * new one could not be allocated, or ERR_PTR(-EBADF) if no reference on
+ * @ctx could be obtained for the new lock context.
+ */
+struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
+{
+ struct nfs_lock_context *res, *new = NULL;
+ struct inode *inode = d_inode(ctx->dentry);
+
+ rcu_read_lock();
+ res = __nfs_find_lock_context(ctx);
+ rcu_read_unlock();
+ if (res == NULL) {
+ new = kmalloc(sizeof(*new), GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ nfs_init_lock_context(new);
+ spin_lock(&inode->i_lock);
+ res = __nfs_find_lock_context(ctx); /* search again, now under i_lock */
+ if (res == NULL) {
+ new->open_context = get_nfs_open_context(ctx);
+ if (new->open_context) {
+ list_add_tail_rcu(&new->list,
+ &ctx->lock_context.list);
+ res = new;
+ new = NULL;
+ } else
+ res = ERR_PTR(-EBADF);
+ }
+ spin_unlock(&inode->i_lock);
+ kfree(new); /* NULL if 'new' was linked in, otherwise frees the unused allocation */
+ }
+ return res;
+}
+EXPORT_SYMBOL_GPL(nfs_get_lock_context);
+/*
+ * nfs_put_lock_context - drop a reference on a lock context
+ * @l_ctx: lock context to release
+ *
+ * When the last reference goes away the context is unlinked under
+ * inode->i_lock, its reference on the open context is dropped, and the
+ * memory is freed via kfree_rcu().
+ */
+void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
+{
+ struct nfs_open_context *ctx = l_ctx->open_context;
+ struct inode *inode = d_inode(ctx->dentry);
+
+ if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock))
+ return;
+ list_del_rcu(&l_ctx->list);
+ spin_unlock(&inode->i_lock);
+ put_nfs_open_context(ctx);
+ kfree_rcu(l_ctx, rcu_head);
+}
+EXPORT_SYMBOL_GPL(nfs_put_lock_context);
+
+/**
+ * nfs_close_context - Common close_context() routine NFSv2/v3
+ * @ctx: pointer to context
+ * @is_sync: is this a synchronous close
+ *
+ * Ensure that the attributes are up to date if we're mounted
+ * with close-to-open semantics and we have cached data that will
+ * need to be revalidated on open.
+ *
+ * The revalidation is skipped unless all of the checks below pass; its
+ * return value is not used.
+ */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+ struct nfs_inode *nfsi;
+ struct inode *inode;
+ struct nfs_server *server;
+
+ if (!(ctx->mode & FMODE_WRITE))
+ return; /* context was not opened for writing */
+ if (!is_sync)
+ return;
+ inode = d_inode(ctx->dentry);
+ if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ return;
+ nfsi = NFS_I(inode);
+ if (inode->i_mapping->nrpages == 0)
+ return; /* no cached pages */
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ return; /* cached data is already flagged invalid */
+ if (!list_empty(&nfsi->open_files))
+ return; /* other open contexts remain on the inode */
+ server = NFS_SERVER(inode);
+ if (server->flags & NFS_MOUNT_NOCTO)
+ return;
+ nfs_revalidate_inode(server, inode);
+}
+EXPORT_SYMBOL_GPL(nfs_close_context);
+/*
+ * alloc_nfs_open_context - allocate and initialise an open context
+ * @dentry: dentry the context refers to; a reference is taken with dget()
+ * @f_mode: open mode stored in the context
+ * @filp: file being opened, or NULL; supplies the credential and is
+ *        recorded as the flock owner
+ *
+ * The context also holds nfs_sb_active() on the dentry's superblock and a
+ * credential reference (filp->f_cred, or the current credential when
+ * @filp is NULL). It starts with one reference and is not linked onto any
+ * list.
+ *
+ * Returns the new context or ERR_PTR(-ENOMEM).
+ */
+struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
+ fmode_t f_mode,
+ struct file *filp)
+{
+ struct nfs_open_context *ctx;
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+ nfs_sb_active(dentry->d_sb);
+ ctx->dentry = dget(dentry);
+ if (filp)
+ ctx->cred = get_cred(filp->f_cred);
+ else
+ ctx->cred = get_current_cred();
+ ctx->ll_cred = NULL;
+ ctx->state = NULL;
+ ctx->mode = f_mode;
+ ctx->flags = 0;
+ ctx->error = 0;
+ ctx->flock_owner = (fl_owner_t)filp;
+ nfs_init_lock_context(&ctx->lock_context); /* sets the context's refcount to 1 */
+ ctx->lock_context.open_context = ctx;
+ INIT_LIST_HEAD(&ctx->list);
+ ctx->mdsthreshold = NULL;
+ return ctx;
+}
+EXPORT_SYMBOL_GPL(alloc_nfs_open_context);
+
+/*
+ * Take a reference on @ctx. Returns @ctx, or NULL when @ctx is NULL or
+ * its reference count has already reached zero.
+ */
+struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
+{
+	if (ctx == NULL)
+		return NULL;
+	if (!refcount_inc_not_zero(&ctx->lock_context.count))
+		return NULL;
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(get_nfs_open_context);
+/*
+ * Drop a reference on @ctx. On the last reference: unlink the context from
+ * its list (under inode->i_lock), call the protocol's ->close_context()
+ * with @is_sync, release the credential, dentry, superblock activity and
+ * ll_cred, free mdsthreshold, and free the context via kfree_rcu().
+ *
+ * NOTE(review): inode is dereferenced for i_lock in the list_empty() branch
+ * but NULL-checked right after; presumably a context that is on a list
+ * always has an inode - confirm.
+ */
+static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
+{
+ struct inode *inode = d_inode(ctx->dentry);
+ struct super_block *sb = ctx->dentry->d_sb; /* read before the dput() below */
+
+ if (!refcount_dec_and_test(&ctx->lock_context.count))
+ return;
+ if (!list_empty(&ctx->list)) {
+ spin_lock(&inode->i_lock);
+ list_del_rcu(&ctx->list);
+ spin_unlock(&inode->i_lock);
+ }
+ if (inode != NULL)
+ NFS_PROTO(inode)->close_context(ctx, is_sync);
+ put_cred(ctx->cred);
+ dput(ctx->dentry);
+ nfs_sb_deactive(sb);
+ put_rpccred(ctx->ll_cred);
+ kfree(ctx->mdsthreshold);
+ kfree_rcu(ctx, rcu_head);
+}
+/* Drop a reference on @ctx; passes is_sync == 0 to __put_nfs_open_context(). */
+void put_nfs_open_context(struct nfs_open_context *ctx)
+{
+ __put_nfs_open_context(ctx, 0);
+}
+EXPORT_SYMBOL_GPL(put_nfs_open_context);
+/* Drop a reference on @ctx; passes is_sync == 1 to __put_nfs_open_context(). */
+static void put_nfs_open_context_sync(struct nfs_open_context *ctx)
+{
+ __put_nfs_open_context(ctx, 1);
+}
+
+/*
+ * Link @ctx onto the tail of its inode's open_files list, under
+ * inode->i_lock.
+ *
+ * If this is the first open context on the inode and a data invalidation
+ * was deferred (NFS_INO_DATA_INVAL_DEFER), the cached data is flagged
+ * invalid now, together with NFS_INO_REVAL_FORCED.
+ *
+ * Original note: Ensure that mmap has a recent RPC credential for use when
+ * writing out shared pages.
+ * NOTE(review): that sentence is not obviously reflected in the code
+ * below - confirm whether it still applies.
+ */
+void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
+{
+ struct inode *inode = d_inode(ctx->dentry);
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ spin_lock(&inode->i_lock);
+ if (list_empty(&nfsi->open_files) &&
+ (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
+ nfsi->cache_validity |= NFS_INO_INVALID_DATA |
+ NFS_INO_REVAL_FORCED;
+ list_add_tail_rcu(&ctx->list, &nfsi->open_files);
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
+/*
+ * Bind @ctx to @filp: store a new reference in filp->private_data, mark
+ * the context NFS_CONTEXT_FILE_OPEN, and attach it to the inode's list of
+ * open contexts if it is not on a list yet.
+ *
+ * NOTE(review): the result of get_nfs_open_context() is stored unchecked;
+ * presumably callers always pass a live context - confirm.
+ */
+void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+{
+ filp->private_data = get_nfs_open_context(ctx);
+ set_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags);
+ if (list_empty(&ctx->list))
+ nfs_inode_attach_open_context(ctx);
+}
+EXPORT_SYMBOL_GPL(nfs_file_set_open_context);
+
+/*
+ * Given an inode, search for an open context with the desired characteristics
+ *
+ * @cred: credential to match with cred_fscmp(); NULL matches any credential
+ * @mode: must equal the context's FMODE_READ|FMODE_WRITE bits exactly
+ *
+ * Only contexts flagged NFS_CONTEXT_FILE_OPEN are considered. The list is
+ * walked under rcu_read_lock(). Returns the first matching context on
+ * which a reference could be taken, or NULL.
+ */
+struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct cred *cred, fmode_t mode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_open_context *pos, *ctx = NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pos, &nfsi->open_files, list) {
+ if (cred != NULL && cred_fscmp(pos->cred, cred) != 0)
+ continue;
+ if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
+ continue;
+ if (!test_bit(NFS_CONTEXT_FILE_OPEN, &pos->flags))
+ continue;
+ ctx = get_nfs_open_context(pos);
+ if (ctx)
+ break;
+ }
+ rcu_read_unlock();
+ return ctx;
+}
+/*
+ * Detach the open context from @filp: clear NFS_CONTEXT_FILE_OPEN, reset
+ * filp->private_data and drop the file's reference with is_sync set.
+ * Does nothing if the file has no open context.
+ */
+void nfs_file_clear_open_context(struct file *filp)
+{
+ struct nfs_open_context *ctx = nfs_file_open_context(filp);
+
+ if (ctx) {
+ struct inode *inode = d_inode(ctx->dentry);
+
+ clear_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags);
+ /*
+ * A fatal error was hit on an earlier write. Try to write
+ * back every page again.
+ */
+ if (ctx->error < 0)
+ invalidate_inode_pages2(inode->i_mapping);
+ filp->private_data = NULL;
+ put_nfs_open_context_sync(ctx);
+ }
+}
+
+/*
+ * These allocate and release file read/write context information.
+ *
+ * nfs_open() allocates an open context for @filp, binds it to the file
+ * and then drops the allocation reference, leaving the file's own
+ * reference in place. Returns 0, or the error from
+ * alloc_nfs_open_context().
+ */
+int nfs_open(struct inode *inode, struct file *filp)
+{
+ struct nfs_open_context *ctx;
+
+ ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ nfs_file_set_open_context(filp, ctx);
+ put_nfs_open_context(ctx);
+ nfs_fscache_open_file(inode, filp);
+ return 0;
+}
+
+/*
+ * This function is called whenever some part of NFS notices that
+ * the cached attributes have to be refreshed.
+ *
+ * It fetches fresh attributes with the protocol's ->getattr() and applies
+ * them with nfs_refresh_inode().
+ *
+ * Returns 0 on success; -ESTALE for a bad or stale inode; the error from
+ * pnfs_sync_inode() (regular files only); -ENOMEM if no nfs_fattr could
+ * be allocated; the error from nfs4_label_alloc(), ->getattr() or
+ * nfs_refresh_inode(). A getattr -ETIMEDOUT is turned into 0 when the
+ * server has NFS_MOUNT_SOFTREVAL set.
+ */
+int
+__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
+{
+ int status = -ESTALE;
+ struct nfs4_label *label = NULL;
+ struct nfs_fattr *fattr = NULL;
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n",
+ inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode));
+
+ trace_nfs_revalidate_inode_enter(inode);
+
+ if (is_bad_inode(inode))
+ goto out;
+ if (NFS_STALE(inode))
+ goto out;
+
+ /* pNFS: Attributes aren't updated until we layoutcommit */
+ if (S_ISREG(inode->i_mode)) {
+ status = pnfs_sync_inode(inode, false);
+ if (status)
+ goto out;
+ }
+
+ status = -ENOMEM;
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ goto out;
+
+ nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
+
+ label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+ if (IS_ERR(label)) {
+ status = PTR_ERR(label);
+ goto out; /* 'out', not 'err_out': label is an error pointer and must not be freed */
+ }
+
+ status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr,
+ label, inode);
+ if (status != 0) {
+ dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n",
+ inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(inode), status);
+ switch (status) {
+ case -ETIMEDOUT:
+ /* A soft timeout occurred. Use cached information? */
+ if (server->flags & NFS_MOUNT_SOFTREVAL)
+ status = 0;
+ break;
+ case -ESTALE:
+ if (!S_ISDIR(inode->i_mode))
+ nfs_set_inode_stale(inode);
+ else
+ nfs_zap_caches(inode);
+ }
+ goto err_out;
+ }
+
+ status = nfs_refresh_inode(inode, fattr);
+ if (status) {
+ dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n",
+ inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(inode), status);
+ goto err_out;
+ }
+
+ if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
+ nfs_zap_acl_cache(inode);
+
+ nfs_setsecurity(inode, fattr, label);
+
+ dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n",
+ inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(inode));
+
+err_out:
+ nfs4_label_free(label);
+out:
+ nfs_free_fattr(fattr); /* fattr is still NULL on the early exits above */
+ trace_nfs_revalidate_inode_exit(inode, status);
+ return status;
+}
+
+/*
+ * Report whether the attribute cache of @inode has timed out. Returns 0
+ * when the attributes are delegated, otherwise the result of
+ * nfs_attribute_timeout().
+ */
+int nfs_attribute_cache_expired(struct inode *inode)
+{
+	return nfs_have_delegated_attributes(inode) ?
+		0 : nfs_attribute_timeout(inode);
+}
+
+/**
+ * nfs_revalidate_inode - Revalidate the inode attributes
+ * @server: pointer to nfs_server struct
+ * @inode: pointer to inode struct
+ *
+ * Refreshes the cached attributes from the server, but only when
+ * nfs_need_revalidate_inode() says the cache needs it.
+ *
+ * Return: the result of __nfs_revalidate_inode() when a refresh was
+ * needed; otherwise -ESTALE for a stale inode and 0 for a valid one.
+ */
+int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
+{
+	if (nfs_need_revalidate_inode(inode))
+		return __nfs_revalidate_inode(server, inode);
+	if (NFS_STALE(inode))
+		return -ESTALE;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
+/*
+ * Throw away the cached pages of @mapping. Regular files are synced with
+ * nfs_sync_mapping() first; for directories the cached cookie verifier is
+ * zeroed as well (under inode->i_lock).
+ *
+ * Returns 0 on success, or the negative error from nfs_sync_mapping() /
+ * invalidate_inode_pages2(), in which case the later steps are skipped.
+ */
+static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ int ret;
+
+ if (mapping->nrpages != 0) {
+ if (S_ISREG(inode->i_mode)) {
+ ret = nfs_sync_mapping(mapping);
+ if (ret < 0)
+ return ret;
+ }
+ ret = invalidate_inode_pages2(mapping);
+ if (ret < 0)
+ return ret;
+ }
+ if (S_ISDIR(inode->i_mode)) {
+ spin_lock(&inode->i_lock);
+ memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+ spin_unlock(&inode->i_lock);
+ }
+ nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
+ nfs_fscache_wait_on_invalidate(inode);
+
+ dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n",
+ inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(inode));
+ return 0;
+}
+
+/*
+ * True when the inode must be revalidated before its page cache can be
+ * trusted: NFS_INO_REVAL_PAGECACHE is flagged, or the inode is stale.
+ */
+bool nfs_mapping_need_revalidate_inode(struct inode *inode)
+{
+	if (nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE))
+		return true;
+	return NFS_STALE(inode);
+}
+/*
+ * Non-blocking check of whether the page cache of @inode can be used as it
+ * is. Nothing is revalidated or invalidated here.
+ *
+ * Returns 0 for swapfiles and when nothing needs doing; -ECHILD when the
+ * inode needs revalidating, an invalidation is in progress
+ * (NFS_INO_INVALIDATING), or the data is flagged NFS_INO_INVALID_DATA.
+ * NOTE(review): presumably -ECHILD tells an RCU-walk caller to retry in a
+ * context that may block - confirm against the callers.
+ */
+int nfs_revalidate_mapping_rcu(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ unsigned long *bitlock = &nfsi->flags;
+ int ret = 0;
+
+ if (IS_SWAPFILE(inode))
+ goto out;
+ if (nfs_mapping_need_revalidate_inode(inode)) {
+ ret = -ECHILD;
+ goto out;
+ }
+ spin_lock(&inode->i_lock);
+ if (test_bit(NFS_INO_INVALIDATING, bitlock) ||
+ (nfsi->cache_validity & NFS_INO_INVALID_DATA))
+ ret = -ECHILD;
+ spin_unlock(&inode->i_lock);
+out:
+ return ret;
+}
+
+/**
+ * nfs_revalidate_mapping - Revalidate the pagecache
+ * @inode: pointer to host inode
+ * @mapping: pointer to mapping
+ *
+ * Revalidates the inode if required, then, if the cached data is flagged
+ * NFS_INO_INVALID_DATA, invalidates the mapping while holding the
+ * NFS_INO_INVALIDATING bit lock.
+ *
+ * Return: 0 for swapfiles and when nothing had to be invalidated;
+ * otherwise the negative error from __nfs_revalidate_inode(), a non-zero
+ * result from waiting on the bit lock, or the result of
+ * nfs_invalidate_mapping().
+ */
+int nfs_revalidate_mapping(struct inode *inode,
+ struct address_space *mapping)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ unsigned long *bitlock = &nfsi->flags;
+ int ret = 0;
+
+ /* swapfiles are not supposed to be shared. */
+ if (IS_SWAPFILE(inode))
+ goto out;
+
+ if (nfs_mapping_need_revalidate_inode(inode)) {
+ ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (ret < 0)
+ goto out;
+ }
+
+ /*
+ * We must clear NFS_INO_INVALID_DATA first to ensure that
+ * invalidations that come in while we're shooting down the mappings
+ * are respected. But, that leaves a race window where one revalidator
+ * can clear the flag, and then another checks it before the mapping
+ * gets invalidated. Fix that by serializing access to this part of
+ * the function.
+ *
+ * At the same time, we need to allow other tasks to see whether we
+ * might be in the middle of invalidating the pages, so we only set
+ * the bit lock here if it looks like we're going to be doing that.
+ */
+ for (;;) {
+ ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING,
+ nfs_wait_bit_killable, TASK_KILLABLE);
+ if (ret)
+ goto out;
+ spin_lock(&inode->i_lock);
+ if (test_bit(NFS_INO_INVALIDATING, bitlock)) {
+ /* the bit was set again after the wait: wait once more */
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ break; /* leaves the loop with i_lock still held */
+ spin_unlock(&inode->i_lock);
+ goto out;
+ }
+
+ set_bit(NFS_INO_INVALIDATING, bitlock);
+ smp_wmb();
+ nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA|
+ NFS_INO_DATA_INVAL_DEFER);
+ spin_unlock(&inode->i_lock);
+ trace_nfs_invalidate_mapping_enter(inode);
+ ret = nfs_invalidate_mapping(inode, mapping);
+ trace_nfs_invalidate_mapping_exit(inode, ret);
+
+ clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
+ smp_mb__after_atomic();
+ wake_up_bit(bitlock, NFS_INO_INVALIDATING);
+out:
+ return ret;
+}
+
+/*
+ * True when @nfsi is a regular file with at least one open context that
+ * is currently open for writing.
+ */
+static bool nfs_file_has_writers(struct nfs_inode *nfsi)
+{
+	struct inode *vfs = &nfsi->vfs_inode;
+
+	return S_ISREG(vfs->i_mode) &&
+	       !list_empty(&nfsi->open_files) &&
+	       inode_is_open_for_write(vfs);
+}
+
+/* True when the file has writers and its I/O is buffered. */
+static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi)
+{
+	if (!nfs_file_has_writers(nfsi))
+		return false;
+	return nfs_file_io_is_buffered(nfsi);
+}
+/*
+ * Apply weak cache consistency data from @fattr to @inode.
+ *
+ * For the change attribute, ctime, mtime and size in turn: if @fattr
+ * carries both the pre-operation and the post-operation value, and the
+ * pre-operation value equals what the inode currently caches, the cached
+ * value is replaced by the post-operation one. The size is left alone
+ * while the inode has writebacks.
+ */
+static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+ struct timespec64 ts;
+
+ if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
+ && (fattr->valid & NFS_ATTR_FATTR_CHANGE)
+ && inode_eq_iversion_raw(inode, fattr->pre_change_attr)) {
+ inode_set_iversion_raw(inode, fattr->change_attr);
+ if (S_ISDIR(inode->i_mode))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
+ else if (nfs_server_capable(inode, NFS_CAP_XATTR))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR);
+ }
+ /* If we have atomic WCC data, we may update some attributes */
+ ts = inode->i_ctime;
+ if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
+ && (fattr->valid & NFS_ATTR_FATTR_CTIME)
+ && timespec64_equal(&ts, &fattr->pre_ctime)) {
+ inode->i_ctime = fattr->ctime;
+ }
+
+ ts = inode->i_mtime;
+ if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
+ && (fattr->valid & NFS_ATTR_FATTR_MTIME)
+ && timespec64_equal(&ts, &fattr->pre_mtime)) {
+ inode->i_mtime = fattr->mtime;
+ if (S_ISDIR(inode->i_mode))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
+ }
+ if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
+ && (fattr->valid & NFS_ATTR_FATTR_SIZE)
+ && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
+ && !nfs_have_writebacks(inode)) {
+ i_size_write(inode, nfs_size_to_loff_t(fattr->size));
+ }
+}
+
+/**
+ * nfs_check_inode_attributes - verify consistency of the inode attribute cache
+ * @inode: pointer to inode
+ * @fattr: updated attributes
+ *
+ * Verifies the attribute cache. If we have just changed the attributes,
+ * so that fattr carries weak cache consistency data, then it may
+ * also update the ctime/mtime/change_attribute.
+ *
+ * The inode's attribute values are not overwritten here: every mismatch
+ * between @fattr and the cached value only sets the corresponding
+ * cache-invalid flag(s). Change attribute, mtime, ctime and size are
+ * compared only when the file has no buffered writers.
+ *
+ * Return: 0 normally (also when a read delegation is held or only a
+ * mounted-on fileid is present), -ESTALE if the fileid or the file type
+ * no longer matches.
+ */
+static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fattr)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ loff_t cur_size, new_isize;
+ unsigned long invalid = 0;
+ struct timespec64 ts;
+
+ if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ return 0;
+
+ if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) {
+ /* Only a mounted-on-fileid? Just exit */
+ if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
+ return 0;
+ /* Has the inode gone and changed behind our back? */
+ } else if (nfsi->fileid != fattr->fileid) {
+ /* Is this perhaps the mounted-on fileid? */
+ if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) &&
+ nfsi->fileid == fattr->mounted_on_fileid)
+ return 0;
+ return -ESTALE;
+ }
+ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode))
+ return -ESTALE;
+
+
+ if (!nfs_file_has_buffered_writers(nfsi)) {
+ /* Verify a few of the more important attributes */
+ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr))
+ invalid |= NFS_INO_INVALID_CHANGE
+ | NFS_INO_REVAL_PAGECACHE;
+
+ ts = inode->i_mtime;
+ if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime))
+ invalid |= NFS_INO_INVALID_MTIME;
+
+ ts = inode->i_ctime;
+ if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec64_equal(&ts, &fattr->ctime))
+ invalid |= NFS_INO_INVALID_CTIME;
+
+ if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+ cur_size = i_size_read(inode);
+ new_isize = nfs_size_to_loff_t(fattr->size);
+ if (cur_size != new_isize)
+ invalid |= NFS_INO_INVALID_SIZE
+ | NFS_INO_REVAL_PAGECACHE;
+ }
+ }
+
+ /* Have any file permissions changed? */
+ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
+ invalid |= NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_OTHER;
+ if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid))
+ invalid |= NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_OTHER;
+ if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid))
+ invalid |= NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_INVALID_OTHER;
+
+ /* Has the link count changed? */
+ if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
+ invalid |= NFS_INO_INVALID_OTHER;
+
+ ts = inode->i_atime;
+ if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime))
+ invalid |= NFS_INO_INVALID_ATIME;
+
+ if (invalid != 0)
+ nfs_set_cache_invalid(inode, invalid);
+
+ nfsi->read_cache_jiffies = fattr->time_start;
+ return 0;
+}
+/*
+ * File-wide attribute generation counter. nfs_fattr_init() and
+ * nfs_fattr_set_barrier() stamp an nfs_fattr with a freshly incremented
+ * value; nfs_inode_attrs_need_update() compares such stamps.
+ */
+static atomic_long_t nfs_attr_generation_counter;
+
+/* Return the current counter value without changing it. */
+static unsigned long nfs_read_attr_generation_counter(void)
+{
+ return atomic_long_read(&nfs_attr_generation_counter);
+}
+
+/* Increment the counter and return the new value. */
+unsigned long nfs_inc_attr_generation_counter(void)
+{
+ return atomic_long_inc_return(&nfs_attr_generation_counter);
+}
+EXPORT_SYMBOL_GPL(nfs_inc_attr_generation_counter);
+/*
+ * nfs_fattr_init - reset an nfs_fattr before it is filled in
+ * @fattr: attributes to initialise
+ *
+ * Marks no attribute as valid, records the current jiffies as the start
+ * time, assigns a new generation count and clears the owner/group name
+ * pointers. Other fields are left as they are.
+ */
+void nfs_fattr_init(struct nfs_fattr *fattr)
+{
+ fattr->valid = 0;
+ fattr->time_start = jiffies;
+ fattr->gencount = nfs_inc_attr_generation_counter();
+ fattr->owner_name = NULL;
+ fattr->group_name = NULL;
+}
+EXPORT_SYMBOL_GPL(nfs_fattr_init);
+
+/**
+ * nfs_fattr_set_barrier - give @fattr a fresh attribute generation count
+ * @fattr: attributes
+ *
+ * Used to set a barrier after an attribute was updated. This
+ * barrier ensures that older attributes from RPC calls that may
+ * have raced with our update cannot clobber these new values.
+ * Note that you are still responsible for ensuring that other
+ * operations which change the attribute on the server do not
+ * collide.
+ *
+ * Only @fattr->gencount is modified.
+ */
+void nfs_fattr_set_barrier(struct nfs_fattr *fattr)
+{
+ fattr->gencount = nfs_inc_attr_generation_counter();
+}
+
+/*
+ * Allocate an nfs_fattr with GFP_NOFS and initialise it with
+ * nfs_fattr_init(). Returns NULL if the allocation fails.
+ */
+struct nfs_fattr *nfs_alloc_fattr(void)
+{
+	struct nfs_fattr *fattr = kmalloc(sizeof(*fattr), GFP_NOFS);
+
+	if (fattr == NULL)
+		return NULL;
+	nfs_fattr_init(fattr);
+	return fattr;
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_fattr);
+
+/*
+ * Allocate an empty NFS file handle with GFP_NOFS. Only ->size is
+ * initialised (to 0); the data bytes are left uninitialised.
+ * Returns NULL if the allocation fails.
+ */
+struct nfs_fh *nfs_alloc_fhandle(void)
+{
+	struct nfs_fh *fh;
+
+	/* sizeof(*fh) ties the size to the variable, as nfs_alloc_fattr() does */
+	fh = kmalloc(sizeof(*fh), GFP_NOFS);
+	if (fh != NULL)
+		fh->size = 0;
+	return fh;
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_fhandle);
+
+#ifdef NFS_DEBUG
+/*
+ * _nfs_display_fhandle_hash - calculate the crc32 hash for the filehandle
+ * in the same way that wireshark does
+ *
+ * @fh: file handle
+ *
+ * For debugging only. Returns whatever nfs_fhandle_hash() computes for @fh.
+ */
+u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh)
+{
+ /* wireshark uses 32-bit AUTODIN crc and does a bitwise
+ * not on the result */
+ return nfs_fhandle_hash(fh);
+}
+EXPORT_SYMBOL_GPL(_nfs_display_fhandle_hash);
+
+/*
+ * _nfs_display_fhandle - display an NFS file handle on the console
+ *
+ * @fh: file handle to display
+ * @caption: display caption
+ *
+ * For debugging only. A NULL or zero-length handle is reported as empty.
+ * Otherwise the handle is printed as big-endian 32-bit words, up to four
+ * per output line.
+ */
+void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption)
+{
+ unsigned short i;
+
+ if (fh == NULL || fh->size == 0) {
+ printk(KERN_DEFAULT "%s at %p is empty\n", caption, fh);
+ return;
+ }
+
+ printk(KERN_DEFAULT "%s at %p is %u bytes, crc: 0x%08x:\n",
+ caption, fh, fh->size, _nfs_display_fhandle_hash(fh));
+ for (i = 0; i < fh->size; i += 16) {
+ __be32 *pos = (__be32 *)&fh->data[i];
+
+ /* index of the last 32-bit word that still holds handle bytes on this line */
+ switch ((fh->size - i - 1) >> 2) {
+ case 0:
+ printk(KERN_DEFAULT " %08x\n",
+ be32_to_cpup(pos));
+ break;
+ case 1:
+ printk(KERN_DEFAULT " %08x %08x\n",
+ be32_to_cpup(pos), be32_to_cpup(pos + 1));
+ break;
+ case 2:
+ printk(KERN_DEFAULT " %08x %08x %08x\n",
+ be32_to_cpup(pos), be32_to_cpup(pos + 1),
+ be32_to_cpup(pos + 2));
+ break;
+ default:
+ printk(KERN_DEFAULT " %08x %08x %08x %08x\n",
+ be32_to_cpup(pos), be32_to_cpup(pos + 1),
+ be32_to_cpup(pos + 2), be32_to_cpup(pos + 3));
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(_nfs_display_fhandle);
+#endif
+
+/**
+ * nfs_inode_attrs_need_update - check if the inode attributes need updating
+ * @inode: pointer to inode
+ * @fattr: attributes
+ *
+ * Attempt to divine whether or not an RPC call reply carrying stale
+ * attributes got scheduled after another call carrying updated ones.
+ *
+ * The decision is made on generation counts alone: @fattr is treated as
+ * newer when its gencount is ahead of the one recorded in the inode.
+ * The second test covers wraparound of 'attr_gencount': an inode whose
+ * recorded generation is ahead of the global counter is always updated.
+ *
+ * The function returns non-zero if it thinks the attributes in 'fattr' are
+ * more recent than the ones cached in the inode.
+ */
+static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+	unsigned long cached_gen = NFS_I(inode)->attr_gencount;
+
+	/* signed difference keeps the comparison valid across counter wrap */
+	if ((long)(fattr->gencount - cached_gen) > 0)
+		return 1;
+	return (long)(cached_gen - nfs_read_attr_generation_counter()) > 0;
+}
+
+/*
+ * Apply @fattr to @inode: a full nfs_update_inode() when the attributes
+ * are judged newer than the cached ones, otherwise only a consistency
+ * check with nfs_check_inode_attributes(). Returns the result of
+ * whichever of the two was called.
+ */
+static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
+{
+	int status;
+
+	trace_nfs_refresh_inode_enter(inode);
+
+	status = nfs_inode_attrs_need_update(inode, fattr) ?
+			nfs_update_inode(inode, fattr) :
+			nfs_check_inode_attributes(inode, fattr);
+
+	trace_nfs_refresh_inode_exit(inode, status);
+	return status;
+}
+
+/**
+ * nfs_refresh_inode - try to update the inode attribute cache
+ * @inode: pointer to inode
+ * @fattr: updated attributes
+ *
+ * Check that an RPC call that returned attributes has not overlapped with
+ * other recent updates of the inode metadata, then decide whether it is
+ * safe to do a full update of the inode attributes, or whether just to
+ * call nfs_check_inode_attributes.
+ *
+ * Nothing is done, and 0 is returned, when @fattr carries none of the
+ * NFS_ATTR_FATTR bits.  Otherwise the refresh runs under inode->i_lock
+ * and its result is returned.
+ */
+int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+	int ret = 0;
+
+	if ((fattr->valid & NFS_ATTR_FATTR) != 0) {
+		spin_lock(&inode->i_lock);
+		ret = nfs_refresh_inode_locked(inode, fattr);
+		spin_unlock(&inode->i_lock);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_refresh_inode);
+
+/*
+ * Mark the cache bits in @invalid as invalid (adding
+ * NFS_INO_INVALID_DATA for directories), then refresh the inode from
+ * @fattr if it carries any of the NFS_ATTR_FATTR bits.
+ */
+static int nfs_post_op_update_inode_locked(struct inode *inode,
+		struct nfs_fattr *fattr, unsigned int invalid)
+{
+	unsigned int flags = invalid;
+
+	if (S_ISDIR(inode->i_mode))
+		flags |= NFS_INO_INVALID_DATA;
+	nfs_set_cache_invalid(inode, flags);
+
+	if ((fattr->valid & NFS_ATTR_FATTR) != 0)
+		return nfs_refresh_inode_locked(inode, fattr);
+	return 0;
+}
+
+/**
+ * nfs_post_op_update_inode - try to update the inode attribute cache
+ * @inode: pointer to inode
+ * @fattr: updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it.
+ *
+ * NB: if the server didn't return any post op attributes, this
+ * function will force the retrieval of attributes before the next
+ * NFS request.  Thus it should be used only for operations that
+ * are expected to change one or more attributes, to avoid
+ * unnecessary NFS requests and trips through nfs_update_inode().
+ *
+ * The whole sequence, including nfs_fattr_set_barrier(), runs under
+ * inode->i_lock.
+ */
+int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+	const unsigned int invalid = NFS_INO_INVALID_CHANGE
+				   | NFS_INO_INVALID_CTIME
+				   | NFS_INO_REVAL_FORCED;
+	int ret;
+
+	spin_lock(&inode->i_lock);
+	nfs_fattr_set_barrier(fattr);
+	ret = nfs_post_op_update_inode_locked(inode, fattr, invalid);
+	spin_unlock(&inode->i_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
+
+/**
+ * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache
+ * @inode: pointer to inode
+ * @fattr: updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it. Fake up
+ * weak cache consistency data, if none exist.
+ *
+ * For each of change attribute, ctime, mtime and size that @fattr
+ * carries without a matching pre-operation value, the pre-operation
+ * value is taken from the inode.  If @fattr has no NFS_ATTR_FATTR bits,
+ * or is not newer than the inode's cached attributes, all
+ * pre-operation bits are cleared from @fattr instead.
+ *
+ * This function is mainly designed to be used by the ->write_done() functions.
+ */
+int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr)
+{
+	if ((fattr->valid & NFS_ATTR_FATTR) != 0 &&
+	    nfs_inode_attrs_need_update(inode, fattr)) {
+		/* Fill in whichever pre-op attributes the reply lacks */
+		if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
+		    (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
+			fattr->pre_change_attr = inode_peek_iversion_raw(inode);
+			fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
+		}
+		if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
+		    (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) {
+			fattr->pre_ctime = inode->i_ctime;
+			fattr->valid |= NFS_ATTR_FATTR_PRECTIME;
+		}
+		if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
+		    (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
+			fattr->pre_mtime = inode->i_mtime;
+			fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
+		}
+		if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
+		    (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) {
+			fattr->pre_size = i_size_read(inode);
+			fattr->valid |= NFS_ATTR_FATTR_PRESIZE;
+		}
+	} else {
+		/* Don't do a WCC update if these attributes are already stale */
+		fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
+				| NFS_ATTR_FATTR_PRESIZE
+				| NFS_ATTR_FATTR_PREMTIME
+				| NFS_ATTR_FATTR_PRECTIME);
+	}
+
+	return nfs_post_op_update_inode_locked(inode, fattr,
+			NFS_INO_INVALID_CHANGE
+			| NFS_INO_INVALID_CTIME
+			| NFS_INO_INVALID_MTIME
+			| NFS_INO_INVALID_BLOCKS);
+}
+
+/**
+ * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
+ * @inode: pointer to inode
+ * @fattr: updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it. Fake up
+ * weak cache consistency data, if none exist.
+ *
+ * Takes inode->i_lock, sets the barrier on @fattr and hands over to
+ * nfs_post_op_update_inode_force_wcc_locked().
+ *
+ * This function is mainly designed to be used by the ->write_done() functions.
+ */
+int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
+{
+	int ret;
+
+	spin_lock(&inode->i_lock);
+	nfs_fattr_set_barrier(fattr);
+	ret = nfs_post_op_update_inode_force_wcc_locked(inode, fattr);
+	spin_unlock(&inode->i_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc);
+
+
+/*
+ * Many nfs protocol calls return the new file attributes after
+ * an operation.  Here we update the inode to reflect the state
+ * of the server's inode.
+ *
+ * This is a bit tricky because we have to make sure all dirty pages
+ * have been sent off to the server before calling invalidate_inode_pages.
+ * To make sure no other process adds more write requests while we try
+ * our best to flush them, we make them sleep during the attribute refresh.
+ *
+ * A very similar scenario holds for the dir cache.
+ *
+ * For every attribute that @fattr carries, the inode field is updated
+ * and, where a change implies stale cached state, the matching
+ * NFS_INO_INVALID_* bits are collected in 'invalid'.  For every attribute
+ * that @fattr does not carry, the invalid bits that were set on entry
+ * (save_cache_validity) are put back and the update no longer counts as
+ * a full revalidation (cache_revalidated).
+ *
+ * Returns 0 on success, including the case where @fattr only describes
+ * the mounted-on fileid.  Returns -ESTALE, after calling
+ * nfs_set_inode_stale_locked(), if the fileid or the file type reported
+ * by the server no longer matches the inode.
+ */
+static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+	struct nfs_server *server;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	loff_t cur_isize, new_isize;
+	unsigned long invalid = 0;
+	unsigned long now = jiffies;
+	unsigned long save_cache_validity;
+	bool have_writers = nfs_file_has_buffered_writers(nfsi);
+	bool cache_revalidated = true;
+	bool attr_changed = false;
+	bool have_delegation;
+
+	dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
+			__func__, inode->i_sb->s_id, inode->i_ino,
+			nfs_display_fhandle_hash(NFS_FH(inode)),
+			atomic_read(&inode->i_count), fattr->valid);
+
+	if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) {
+		/* Only a mounted-on-fileid? Just exit */
+		if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
+			return 0;
+	/* Has the inode gone and changed behind our back? */
+	} else if (nfsi->fileid != fattr->fileid) {
+		/* Is this perhaps the mounted-on fileid? */
+		if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) &&
+		    nfsi->fileid == fattr->mounted_on_fileid)
+			return 0;
+		printk(KERN_ERR "NFS: server %s error: fileid changed\n"
+			"fsid %s: expected fileid 0x%Lx, got 0x%Lx\n",
+			NFS_SERVER(inode)->nfs_client->cl_hostname,
+			inode->i_sb->s_id, (long long)nfsi->fileid,
+			(long long)fattr->fileid);
+		goto out_err;
+	}
+
+	/*
+	 * Make sure the inode's type hasn't changed.
+	 */
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode)) {
+		/*
+		 * Big trouble! The inode has become a different object.
+		 */
+		printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n",
+				__func__, inode->i_ino, inode->i_mode, fattr->mode);
+		goto out_err;
+	}
+
+	server = NFS_SERVER(inode);
+	/* Update the fsid? */
+	if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
+			!nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
+			!IS_AUTOMOUNT(inode))
+		server->fsid = fattr->fsid;
+
+	/* Save the delegation state before clearing cache_validity */
+	have_delegation = nfs_have_delegated_attributes(inode);
+
+	/*
+	 * Update the read time so we don't revalidate too often.
+	 */
+	nfsi->read_cache_jiffies = fattr->time_start;
+
+	/*
+	 * Remember which bits were invalid on entry, then optimistically
+	 * clear them; they are restored below for anything @fattr lacks.
+	 */
+	save_cache_validity = nfsi->cache_validity;
+	nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+			| NFS_INO_INVALID_ATIME
+			| NFS_INO_REVAL_FORCED
+			| NFS_INO_REVAL_PAGECACHE
+			| NFS_INO_INVALID_BLOCKS);
+
+	/* Do atomic weak cache consistency updates */
+	nfs_wcc_update_inode(inode, fattr);
+
+	if (pnfs_layoutcommit_outstanding(inode)) {
+		nfsi->cache_validity |=
+			save_cache_validity &
+			(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME |
+			 NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE |
+			 NFS_INO_REVAL_FORCED);
+		cache_revalidated = false;
+	}
+
+	/* More cache consistency checks */
+	if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
+		if (!inode_eq_iversion_raw(inode, fattr->change_attr)) {
+			/* Could it be a race with writeback? */
+			if (!(have_writers || have_delegation)) {
+				invalid |= NFS_INO_INVALID_DATA
+					| NFS_INO_INVALID_ACCESS
+					| NFS_INO_INVALID_ACL
+					| NFS_INO_INVALID_XATTR;
+				/* Force revalidate of all attributes */
+				save_cache_validity |= NFS_INO_INVALID_CTIME
+					| NFS_INO_INVALID_MTIME
+					| NFS_INO_INVALID_SIZE
+					| NFS_INO_INVALID_OTHER;
+				if (S_ISDIR(inode->i_mode))
+					nfs_force_lookup_revalidate(inode);
+				/* i_ino is an unsigned long, hence %lu */
+				dprintk("NFS: change_attr change on server for file %s/%lu\n",
+						inode->i_sb->s_id,
+						inode->i_ino);
+			} else if (!have_delegation)
+				nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
+			inode_set_iversion_raw(inode, fattr->change_attr);
+			attr_changed = true;
+		}
+	} else {
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_CHANGE
+				| NFS_INO_REVAL_PAGECACHE
+				| NFS_INO_REVAL_FORCED);
+		cache_revalidated = false;
+	}
+
+	if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
+		inode->i_mtime = fattr->mtime;
+	} else if (server->caps & NFS_CAP_MTIME) {
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_MTIME
+				| NFS_INO_REVAL_FORCED);
+		cache_revalidated = false;
+	}
+
+	if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
+		inode->i_ctime = fattr->ctime;
+	} else if (server->caps & NFS_CAP_CTIME) {
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_CTIME
+				| NFS_INO_REVAL_FORCED);
+		cache_revalidated = false;
+	}
+
+	/* Check if our cached file size is stale */
+	if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+		new_isize = nfs_size_to_loff_t(fattr->size);
+		cur_isize = i_size_read(inode);
+		if (new_isize != cur_isize && !have_delegation) {
+			/* Do we perhaps have any outstanding writes, or has
+			 * the file grown beyond our last write? */
+			if (!nfs_have_writebacks(inode) || new_isize > cur_isize) {
+				i_size_write(inode, new_isize);
+				if (!have_writers)
+					invalid |= NFS_INO_INVALID_DATA;
+				attr_changed = true;
+			}
+			/* i_ino is an unsigned long, hence %lu */
+			dprintk("NFS: isize change on server for file %s/%lu "
+					"(%Ld to %Ld)\n",
+					inode->i_sb->s_id,
+					inode->i_ino,
+					(long long)cur_isize,
+					(long long)new_isize);
+		}
+	} else {
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_SIZE
+				| NFS_INO_REVAL_PAGECACHE
+				| NFS_INO_REVAL_FORCED);
+		cache_revalidated = false;
+	}
+
+
+	if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+		inode->i_atime = fattr->atime;
+	else if (server->caps & NFS_CAP_ATIME) {
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_ATIME
+				| NFS_INO_REVAL_FORCED);
+		cache_revalidated = false;
+	}
+
+	if (fattr->valid & NFS_ATTR_FATTR_MODE) {
+		if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
+			/* Keep the file type bits, take the permission bits */
+			umode_t newmode = inode->i_mode & S_IFMT;
+			newmode |= fattr->mode & S_IALLUGO;
+			inode->i_mode = newmode;
+			invalid |= NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL;
+			attr_changed = true;
+		}
+	} else if (server->caps & NFS_CAP_MODE) {
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_OTHER
+				| NFS_INO_REVAL_FORCED);
+		cache_revalidated = false;
+	}
+
+	if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
+		if (!uid_eq(inode->i_uid, fattr->uid)) {
+			invalid |= NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL;
+			inode->i_uid = fattr->uid;
+			attr_changed = true;
+		}
+	} else if (server->caps & NFS_CAP_OWNER) {
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_OTHER
+				| NFS_INO_REVAL_FORCED);
+		cache_revalidated = false;
+	}
+
+	if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
+		if (!gid_eq(inode->i_gid, fattr->gid)) {
+			invalid |= NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL;
+			inode->i_gid = fattr->gid;
+			attr_changed = true;
+		}
+	} else if (server->caps & NFS_CAP_OWNER_GROUP) {
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_OTHER
+				| NFS_INO_REVAL_FORCED);
+		cache_revalidated = false;
+	}
+
+	if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
+		if (inode->i_nlink != fattr->nlink) {
+			if (S_ISDIR(inode->i_mode))
+				invalid |= NFS_INO_INVALID_DATA;
+			set_nlink(inode, fattr->nlink);
+			attr_changed = true;
+		}
+	} else if (server->caps & NFS_CAP_NLINK) {
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_OTHER
+				| NFS_INO_REVAL_FORCED);
+		cache_revalidated = false;
+	}
+
+	if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
+		/*
+		 * report the blocks in 512byte units
+		 */
+		inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
+	} else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+		inode->i_blocks = fattr->du.nfs2.blocks;
+	else {
+		nfsi->cache_validity |= save_cache_validity &
+				(NFS_INO_INVALID_BLOCKS
+				| NFS_INO_REVAL_FORCED);
+		cache_revalidated = false;
+	}
+
+	/* Update attrtimeo value if we're out of the unstable period */
+	if (attr_changed) {
+		/* Something changed: fall back to the minimum timeout */
+		invalid &= ~NFS_INO_INVALID_ATTR;
+		nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
+		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
+		nfsi->attrtimeo_timestamp = now;
+		/* Set barrier to be more recent than all outstanding updates */
+		nfsi->attr_gencount = nfs_inc_attr_generation_counter();
+	} else {
+		if (cache_revalidated) {
+			/*
+			 * Nothing changed and the current timeout window has
+			 * passed: double the timeout, capped at the maximum.
+			 */
+			if (!time_in_range_open(now, nfsi->attrtimeo_timestamp,
+				nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
+				nfsi->attrtimeo <<= 1;
+				if (nfsi->attrtimeo > NFS_MAXATTRTIMEO(inode))
+					nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+			}
+			nfsi->attrtimeo_timestamp = now;
+		}
+		/* Set the barrier to be more recent than this fattr */
+		if ((long)(fattr->gencount - nfsi->attr_gencount) > 0)
+			nfsi->attr_gencount = fattr->gencount;
+	}
+
+	/*
+	 * Only regular files, directories and symlinks get
+	 * NFS_INO_INVALID_DATA; drop it for every other file type.
+	 */
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
+				|| S_ISLNK(inode->i_mode)))
+		invalid &= ~NFS_INO_INVALID_DATA;
+	nfs_set_cache_invalid(inode, invalid);
+
+	return 0;
+ out_err:
+	/*
+	 * No need to worry about unhashing the dentry, as the
+	 * lookup validation will know that the inode is bad.
+	 * (But we fall through to invalidate the caches.)
+	 */
+	nfs_set_inode_stale_locked(inode);
+	return -ESTALE;
+}
+
+/*
+ * Allocate an nfs_inode from nfs_inode_cachep and reset the fields
+ * below.  Returns the embedded VFS inode, or NULL if the allocation
+ * fails.
+ */
+struct inode *nfs_alloc_inode(struct super_block *sb)
+{
+	struct nfs_inode *nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL);
+
+	if (nfsi == NULL)
+		return NULL;
+
+	nfsi->cache_validity = 0UL;
+	nfsi->flags = 0UL;
+#if IS_ENABLED(CONFIG_NFS_V4)
+	nfsi->nfs4_acl = NULL;
+#endif /* CONFIG_NFS_V4 */
+#ifdef CONFIG_NFS_V4_2
+	nfsi->xattr_cache = NULL;
+#endif
+	return &nfsi->vfs_inode;
+}
+EXPORT_SYMBOL_GPL(nfs_alloc_inode);
+
+/* Return the nfs_inode that embeds @inode to nfs_inode_cachep. */
+void nfs_free_inode(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	kmem_cache_free(nfs_inode_cachep, nfsi);
+}
+EXPORT_SYMBOL_GPL(nfs_free_inode);
+
+/*
+ * NFSv4-only part of the slab constructor; compiles to nothing when
+ * CONFIG_NFS_V4 is disabled.
+ */
+static inline void nfs4_init_once(struct nfs_inode *nfsi)
+{
+#if IS_ENABLED(CONFIG_NFS_V4)
+	nfsi->delegation = NULL;
+	nfsi->layout = NULL;
+	INIT_LIST_HEAD(&nfsi->open_states);
+	init_rwsem(&nfsi->rwsem);
+#endif
+}
+
+/*
+ * Constructor passed to kmem_cache_create() for nfs_inode_cachep:
+ * initialises the embedded VFS inode, the list heads, counters and
+ * locks of a struct nfs_inode.
+ */
+static void init_once(void *foo)
+{
+	struct nfs_inode *nfsi = foo;
+
+	inode_init_once(&nfsi->vfs_inode);
+
+	/* list heads */
+	INIT_LIST_HEAD(&nfsi->open_files);
+	INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
+	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
+	INIT_LIST_HEAD(&nfsi->commit_info.list);
+
+	/* counters */
+	atomic_long_set(&nfsi->nrequests, 0);
+	atomic_long_set(&nfsi->commit_info.ncommit, 0);
+	atomic_set(&nfsi->commit_info.rpcs_out, 0);
+
+	/* locks */
+	init_rwsem(&nfsi->rmdir_sem);
+	mutex_init(&nfsi->commit_mutex);
+
+	nfs4_init_once(nfsi);
+	nfsi->cache_change_attribute = 0;
+}
+
+/*
+ * Create the "nfs_inode_cache" slab cache with init_once() as its
+ * constructor.  Returns 0 on success or -ENOMEM.
+ */
+static int __init nfs_init_inodecache(void)
+{
+	nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
+					     sizeof(struct nfs_inode),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+					     init_once);
+
+	return nfs_inode_cachep != NULL ? 0 : -ENOMEM;
+}
+
+static void nfs_destroy_inodecache(void)
+{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache: rcu_barrier() has to come first so that no inode
+ * is still waiting to be freed back into nfs_inode_cachep when
+ * kmem_cache_destroy() runs.
+ */
+ rcu_barrier();
+ kmem_cache_destroy(nfs_inode_cachep);
+}
+
+struct workqueue_struct *nfsiod_workqueue;
+EXPORT_SYMBOL_GPL(nfsiod_workqueue);
+
+/*
+ * start up the nfsiod workqueue
+ *
+ * Returns 0 and publishes the queue in nfsiod_workqueue, or -ENOMEM
+ * (leaving nfsiod_workqueue untouched) if it cannot be allocated.
+ */
+static int nfsiod_start(void)
+{
+	struct workqueue_struct *queue;
+
+	dprintk("RPC:       creating workqueue nfsiod\n");
+	queue = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+	if (queue != NULL) {
+		nfsiod_workqueue = queue;
+		return 0;
+	}
+	return -ENOMEM;
+}
+
+/*
+ * Destroy the nfsiod workqueue
+ *
+ * Does nothing if the queue was never created.  The global pointer is
+ * cleared before the queue is destroyed.
+ */
+static void nfsiod_stop(void)
+{
+	struct workqueue_struct *queue = nfsiod_workqueue;
+
+	if (queue != NULL) {
+		nfsiod_workqueue = NULL;
+		destroy_workqueue(queue);
+	}
+}
+
+unsigned int nfs_net_id;
+EXPORT_SYMBOL_GPL(nfs_net_id);
+
+/*
+ * Per-network-namespace init: calls nfs_clients_init() and then
+ * returns the result of nfs_fs_proc_net_init().
+ */
+static int nfs_net_init(struct net *net)
+{
+	int err;
+
+	nfs_clients_init(net);
+	err = nfs_fs_proc_net_init(net);
+	return err;
+}
+
+/* Per-network-namespace exit: the two init steps, undone in reverse. */
+static void nfs_net_exit(struct net *net)
+{
+	nfs_fs_proc_net_exit(net);
+	nfs_clients_exit(net);
+}
+
+/* Registered with register_pernet_subsys() from init_nfs_fs(). */
+static struct pernet_operations nfs_net_ops = {
+	.id   = &nfs_net_id,
+	.size = sizeof(struct nfs_net),
+	.init = nfs_net_init,
+	.exit = nfs_net_exit,
+};
+
+/*
+ * Initialize NFS
+ *
+ * Module entry point.  Calls, in this order: nfs_sysfs_init(),
+ * register_pernet_subsys(), nfs_fscache_register(), nfsiod_start(),
+ * nfs_fs_proc_init(), nfs_init_nfspagecache(), nfs_init_inodecache(),
+ * nfs_init_readpagecache(), nfs_init_writepagecache(),
+ * nfs_init_directcache(), rpc_proc_register() and register_nfs_fs().
+ *
+ * When a step fails, control jumps to the outN label that undoes all
+ * the steps completed before it, in reverse order, and the error from
+ * the failing step is returned.  The result of rpc_proc_register() is
+ * not checked.  Returns 0 when every step succeeded.
+ */
+static int __init init_nfs_fs(void)
+{
+ int err;
+
+ err = nfs_sysfs_init();
+ if (err < 0)
+ goto out10;
+
+ err = register_pernet_subsys(&nfs_net_ops);
+ if (err < 0)
+ goto out9;
+
+ err = nfs_fscache_register();
+ if (err < 0)
+ goto out8;
+
+ err = nfsiod_start();
+ if (err)
+ goto out7;
+
+ err = nfs_fs_proc_init();
+ if (err)
+ goto out6;
+
+ err = nfs_init_nfspagecache();
+ if (err)
+ goto out5;
+
+ err = nfs_init_inodecache();
+ if (err)
+ goto out4;
+
+ err = nfs_init_readpagecache();
+ if (err)
+ goto out3;
+
+ err = nfs_init_writepagecache();
+ if (err)
+ goto out2;
+
+ err = nfs_init_directcache();
+ if (err)
+ goto out1;
+
+ rpc_proc_register(&init_net, &nfs_rpcstat);
+
+ err = register_nfs_fs();
+ if (err)
+ goto out0;
+
+ return 0;
+out0:
+ rpc_proc_unregister(&init_net, "nfs");
+ nfs_destroy_directcache();
+out1:
+ nfs_destroy_writepagecache();
+out2:
+ nfs_destroy_readpagecache();
+out3:
+ nfs_destroy_inodecache();
+out4:
+ nfs_destroy_nfspagecache();
+out5:
+ nfs_fs_proc_exit();
+out6:
+ nfsiod_stop();
+out7:
+ nfs_fscache_unregister();
+out8:
+ unregister_pernet_subsys(&nfs_net_ops);
+out9:
+ nfs_sysfs_exit();
+out10:
+ return err;
+}
+
+static void __exit exit_nfs_fs(void)
+{
+ nfs_destroy_directcache(); /* module unload: undo what init_nfs_fs() set up */
+ nfs_destroy_writepagecache();
+ nfs_destroy_readpagecache();
+ nfs_destroy_inodecache();
+ nfs_destroy_nfspagecache();
+ nfs_fscache_unregister();
+ unregister_pernet_subsys(&nfs_net_ops);
+ rpc_proc_unregister(&init_net, "nfs"); /* pairs with rpc_proc_register() in init_nfs_fs() */
+ unregister_nfs_fs(); /* pairs with register_nfs_fs() */
+ nfs_fs_proc_exit();
+ nfsiod_stop(); /* pairs with nfsiod_start() */
+ nfs_sysfs_exit();
+}
+
+/* Not quite true; I just maintain it */
+MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
+MODULE_LICENSE("GPL");
+module_param(enable_ino64, bool, 0644);
+
+module_init(init_nfs_fs)
+module_exit(exit_nfs_fs)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
new file mode 100644
index 000000000..a7e0970b5
--- /dev/null
+++ b/fs/nfs/internal.h
@@ -0,0 +1,852 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NFS internal definitions
+ */
+
+#include "nfs4_fs.h"
+#include <linux/fs_context.h>
+#include <linux/security.h>
+#include <linux/crc32.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/nfs_page.h>
+#include <linux/wait_bit.h>
+
+#define NFS_SB_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS)
+
+extern const struct export_operations nfs_export_ops;
+
+struct nfs_string;
+struct nfs_pageio_descriptor;
+
+/*
+ * Flag @fattr with NFS_ATTR_FATTR_MOUNTPOINT when its fsid differs from
+ * the fsid recorded for the server of superblock @parent.
+ */
+static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)
+{
+	if (nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid))
+		return;
+
+	fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT;
+}
+
+/*
+ * Returns 1 when @fattr carries a mounted-on fileid and is flagged as
+ * either a mountpoint or an NFSv4 referral, 0 otherwise.
+ */
+static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr)
+{
+	if (!(fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID))
+		return 0;
+
+	return (fattr->valid & (NFS_ATTR_FATTR_MOUNTPOINT |
+				NFS_ATTR_FATTR_V4_REFERRAL)) != 0;
+}
+
+/*
+ * True only if the mount has NFS_MOUNT_SOFTREVAL set, @dentry is
+ * positive and its inode has a non-empty file handle.
+ */
+static inline bool nfs_lookup_is_soft_revalidate(const struct dentry *dentry)
+{
+	return (NFS_SB(dentry->d_sb)->flags & NFS_MOUNT_SOFTREVAL) &&
+	       d_is_positive(dentry) &&
+	       NFS_FH(d_inode(dentry))->size;
+}
+
+/*
+ * Translate open(2)-style @flags into an fmode_t: FMODE_EXEC is copied
+ * straight from @flags, and the access mode selects FMODE_READ and/or
+ * FMODE_WRITE (anything other than O_RDONLY or O_WRONLY gets both).
+ */
+static inline fmode_t flags_to_mode(int flags)
+{
+	fmode_t mode = (__force fmode_t)flags & FMODE_EXEC;
+
+	switch (flags & O_ACCMODE) {
+	case O_RDONLY:
+		mode |= FMODE_READ;
+		break;
+	case O_WRONLY:
+		mode |= FMODE_WRITE;
+		break;
+	default:
+		mode |= FMODE_READ | FMODE_WRITE;
+		break;
+	}
+	return mode;
+}
+
+/*
+ * Note: RFC 1813 doesn't limit the number of auth flavors that
+ * a server can return, so make something up.
+ */
+#define NFS_MAX_SECFLAVORS (12)
+
+/*
+ * Value used if the user did not specify a port value.
+ */
+#define NFS_UNSPEC_PORT (-1)
+
+#define NFS_UNSPEC_RETRANS (UINT_MAX)
+#define NFS_UNSPEC_TIMEO (UINT_MAX)
+
+/*
+ * Maximum number of pages that readdir can use for creating
+ * a vmapped array of pages.
+ */
+#define NFS_MAX_READDIR_PAGES 8
+
+struct nfs_client_initdata {
+ unsigned long init_flags;
+ const char *hostname; /* Hostname of the server */
+ const struct sockaddr *addr; /* Address of the server */
+ const char *nodename; /* Hostname of the client */
+ const char *ip_addr; /* IP address of the client */
+ size_t addrlen;
+ struct nfs_subversion *nfs_mod;
+ int proto;
+ u32 minorversion;
+ unsigned int nconnect;
+ struct net *net;
+ const struct rpc_timeout *timeparms;
+ const struct cred *cred;
+};
+
+/*
+ * In-kernel mount arguments
+ */
+struct nfs_fs_context {
+ bool internal;
+ bool skip_reconfig_option_check;
+ bool need_mount;
+ bool sloppy;
+ unsigned int flags; /* NFS{,4}_MOUNT_* flags */
+ unsigned int rsize, wsize;
+ unsigned int timeo, retrans;
+ unsigned int acregmin, acregmax;
+ unsigned int acdirmin, acdirmax;
+ unsigned int namlen;
+ unsigned int options;
+ unsigned int bsize;
+ struct nfs_auth_info auth_info;
+ rpc_authflavor_t selected_flavor;
+ char *client_address;
+ unsigned int version;
+ unsigned int minorversion;
+ char *fscache_uniq;
+ unsigned short protofamily;
+ unsigned short mountfamily;
+
+ struct {
+ union {
+ struct sockaddr address;
+ struct sockaddr_storage _address;
+ };
+ size_t addrlen;
+ char *hostname;
+ u32 version;
+ int port;
+ unsigned short protocol;
+ } mount_server;
+
+ struct {
+ union {
+ struct sockaddr address;
+ struct sockaddr_storage _address;
+ };
+ size_t addrlen;
+ char *hostname;
+ char *export_path;
+ int port;
+ unsigned short protocol;
+ unsigned short nconnect;
+ unsigned short export_path_len;
+ } nfs_server;
+
+ struct nfs_fh *mntfh;
+ struct nfs_server *server;
+ struct nfs_subversion *nfs_mod;
+
+ /* Information for a cloned mount. */
+ struct nfs_clone_mount {
+ struct super_block *sb;
+ struct dentry *dentry;
+ struct nfs_fattr *fattr;
+ unsigned int inherited_bsize;
+ } clone_data;
+};
+
+#define nfs_errorf(fc, fmt, ...) ((fc)->log.log ? \
+ errorf(fc, fmt, ## __VA_ARGS__) : \
+ ({ dprintk(fmt "\n", ## __VA_ARGS__); }))
+
+#define nfs_ferrorf(fc, fac, fmt, ...) ((fc)->log.log ? \
+ errorf(fc, fmt, ## __VA_ARGS__) : \
+ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); }))
+
+#define nfs_invalf(fc, fmt, ...) ((fc)->log.log ? \
+ invalf(fc, fmt, ## __VA_ARGS__) : \
+ ({ dprintk(fmt "\n", ## __VA_ARGS__); -EINVAL; }))
+
+#define nfs_finvalf(fc, fac, fmt, ...) ((fc)->log.log ? \
+ invalf(fc, fmt, ## __VA_ARGS__) : \
+ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); -EINVAL; }))
+
+#define nfs_warnf(fc, fmt, ...) ((fc)->log.log ? \
+ warnf(fc, fmt, ## __VA_ARGS__) : \
+ ({ dprintk(fmt "\n", ## __VA_ARGS__); }))
+
+#define nfs_fwarnf(fc, fac, fmt, ...) ((fc)->log.log ? \
+ warnf(fc, fmt, ## __VA_ARGS__) : \
+ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); }))
+
+/* Fetch the NFS mount context stored in @fc->fs_private. */
+static inline struct nfs_fs_context *nfs_fc2context(const struct fs_context *fc)
+{
+	struct nfs_fs_context *ctx = fc->fs_private;
+
+	return ctx;
+}
+
+/* mount_clnt.c */
+struct nfs_mount_request {
+ struct sockaddr *sap;
+ size_t salen;
+ char *hostname;
+ char *dirpath;
+ u32 version;
+ unsigned short protocol;
+ struct nfs_fh *fh;
+ int noresvport;
+ unsigned int *auth_flav_len;
+ rpc_authflavor_t *auth_flavs;
+ struct net *net;
+};
+
+extern int nfs_mount(struct nfs_mount_request *info);
+extern void nfs_umount(const struct nfs_mount_request *info);
+
+/* client.c */
+extern const struct rpc_program nfs_program;
+extern void nfs_clients_init(struct net *net);
+extern void nfs_clients_exit(struct net *net);
+extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *);
+int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t);
+struct nfs_client *nfs_get_client(const struct nfs_client_initdata *);
+int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
+void nfs_server_insert_lists(struct nfs_server *);
+void nfs_server_remove_lists(struct nfs_server *);
+void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans);
+int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t,
+ rpc_authflavor_t);
+struct nfs_server *nfs_alloc_server(void);
+void nfs_server_copy_userdata(struct nfs_server *, struct nfs_server *);
+
+extern void nfs_put_client(struct nfs_client *);
+extern void nfs_free_client(struct nfs_client *);
+extern struct nfs_client *nfs4_find_client_ident(struct net *, int);
+extern struct nfs_client *
+nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
+ struct nfs4_sessionid *, u32);
+extern struct nfs_server *nfs_create_server(struct fs_context *);
+extern struct nfs_server *nfs4_create_server(struct fs_context *);
+extern struct nfs_server *nfs4_create_referral_server(struct fs_context *);
+extern int nfs4_update_server(struct nfs_server *server, const char *hostname,
+ struct sockaddr *sap, size_t salen,
+ struct net *net);
+extern void nfs_free_server(struct nfs_server *server);
+extern struct nfs_server *nfs_clone_server(struct nfs_server *,
+ struct nfs_fh *,
+ struct nfs_fattr *,
+ rpc_authflavor_t);
+extern bool nfs_client_init_is_complete(const struct nfs_client *clp);
+extern int nfs_client_init_status(const struct nfs_client *clp);
+extern int nfs_wait_client_init_complete(const struct nfs_client *clp);
+extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
+extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
+ const struct sockaddr *ds_addr,
+ int ds_addrlen, int ds_proto,
+ unsigned int ds_timeo,
+ unsigned int ds_retrans,
+ u32 minor_version);
+extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
+ struct inode *);
+extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
+ const struct sockaddr *ds_addr, int ds_addrlen,
+ int ds_proto, unsigned int ds_timeo,
+ unsigned int ds_retrans);
+#ifdef CONFIG_PROC_FS
+extern int __init nfs_fs_proc_init(void);
+extern void nfs_fs_proc_exit(void);
+extern int nfs_fs_proc_net_init(struct net *net);
+extern void nfs_fs_proc_net_exit(struct net *net);
+#else
+static inline int nfs_fs_proc_net_init(struct net *net)
+{
+ return 0;
+}
+static inline void nfs_fs_proc_net_exit(struct net *net)
+{
+}
+static inline int nfs_fs_proc_init(void)
+{
+ return 0;
+}
+static inline void nfs_fs_proc_exit(void)
+{
+}
+#endif
+
+/* callback_xdr.c */
+extern const struct svc_version nfs4_callback_version1;
+extern const struct svc_version nfs4_callback_version4;
+
+/* fs_context.c */
+extern struct file_system_type nfs_fs_type;
+
+/* pagelist.c */
+extern int __init nfs_init_nfspagecache(void);
+extern void nfs_destroy_nfspagecache(void);
+extern int __init nfs_init_readpagecache(void);
+extern void nfs_destroy_readpagecache(void);
+extern int __init nfs_init_writepagecache(void);
+extern void nfs_destroy_writepagecache(void);
+
+extern int __init nfs_init_directcache(void);
+extern void nfs_destroy_directcache(void);
+extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr,
+ void (*release)(struct nfs_pgio_header *hdr));
+void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
+int nfs_iocounter_wait(struct nfs_lock_context *l_ctx);
+
+extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
+struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
+void nfs_pgio_header_free(struct nfs_pgio_header *);
+int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
+ const struct cred *cred, const struct nfs_rpc_ops *rpc_ops,
+ const struct rpc_call_ops *call_ops, int how, int flags);
+void nfs_free_request(struct nfs_page *req);
+struct nfs_pgio_mirror *
+nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
+
+/*
+ * Two open contexts match when cred_fscmp() reports their creds as
+ * equal and they point at the same state.
+ */
+static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1,
+		const struct nfs_open_context *ctx2)
+{
+	if (cred_fscmp(ctx1->cred, ctx2->cred) != 0)
+		return false;
+
+	return ctx1->state == ctx2->state;
+}
+
+/* nfs2xdr.c */
+extern const struct rpc_procinfo nfs_procedures[];
+extern int nfs2_decode_dirent(struct xdr_stream *,
+ struct nfs_entry *, bool);
+
+/* nfs3xdr.c */
+extern const struct rpc_procinfo nfs3_procedures[];
+extern int nfs3_decode_dirent(struct xdr_stream *,
+ struct nfs_entry *, bool);
+
+/* nfs4xdr.c */
+#if IS_ENABLED(CONFIG_NFS_V4)
+extern int nfs4_decode_dirent(struct xdr_stream *,
+ struct nfs_entry *, bool);
+#endif
+#ifdef CONFIG_NFS_V4_1
+extern const u32 nfs41_maxread_overhead;
+extern const u32 nfs41_maxwrite_overhead;
+extern const u32 nfs41_maxgetdevinfo_overhead;
+#endif
+
+/* nfs4proc.c */
+#if IS_ENABLED(CONFIG_NFS_V4)
+extern const struct rpc_procinfo nfs4_procedures[];
+#endif
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags);
+/*
+ * Copy the lfs, pi, len and label bytes of @src into @dst.  Returns
+ * @dst, or NULL without copying anything if either pointer is NULL or
+ * @src->len exceeds NFS4_MAXLABELLEN.
+ */
+static inline struct nfs4_label *
+nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
+{
+	if (dst == NULL || src == NULL || src->len > NFS4_MAXLABELLEN)
+		return NULL;
+
+	dst->lfs = src->lfs;
+	dst->pi = src->pi;
+	dst->len = src->len;
+	memcpy(dst->label, src->label, src->len);
+
+	return dst;
+}
+/* Free @label and its label buffer; a NULL @label is ignored. */
+static inline void nfs4_label_free(struct nfs4_label *label)
+{
+	if (!label)
+		return;
+
+	kfree(label->label);
+	kfree(label);
+}
+
+/*
+ * Set NFS_INO_INVALID_LABEL on @nfsi, but only when the server has the
+ * NFS_CAP_SECURITY_LABEL capability.
+ */
+static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
+{
+	if (!nfs_server_capable(&nfsi->vfs_inode, NFS_CAP_SECURITY_LABEL))
+		return;
+
+	nfsi->cache_validity |= NFS_INO_INVALID_LABEL;
+}
+#else
+static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; }
+static inline void nfs4_label_free(void *label) {}
+static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
+{
+}
+static inline struct nfs4_label *
+nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
+{
+ return NULL;
+}
+#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
+
+/* proc.c */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
+extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
+ const struct nfs_client_initdata *);
+
+/* dir.c */
+extern void nfs_advise_use_readdirplus(struct inode *dir);
+extern void nfs_force_use_readdirplus(struct inode *dir);
+extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc);
+extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc);
+struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int);
+int nfs_create(struct inode *, struct dentry *, umode_t, bool);
+int nfs_mkdir(struct inode *, struct dentry *, umode_t);
+int nfs_rmdir(struct inode *, struct dentry *);
+int nfs_unlink(struct inode *, struct dentry *);
+int nfs_symlink(struct inode *, struct dentry *, const char *);
+int nfs_link(struct dentry *, struct inode *, struct dentry *);
+int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
+int nfs_rename(struct inode *, struct dentry *,
+ struct inode *, struct dentry *, unsigned int);
+
+/* file.c */
+int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
+loff_t nfs_file_llseek(struct file *, loff_t, int);
+ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
+int nfs_file_mmap(struct file *, struct vm_area_struct *);
+ssize_t nfs_file_write(struct kiocb *, struct iov_iter *);
+int nfs_file_release(struct inode *, struct file *);
+int nfs_lock(struct file *, int, struct file_lock *);
+int nfs_flock(struct file *, int, struct file_lock *);
+int nfs_check_flags(int);
+
+/* inode.c */
+extern struct workqueue_struct *nfsiod_workqueue;
+extern struct inode *nfs_alloc_inode(struct super_block *sb);
+extern void nfs_free_inode(struct inode *);
+extern int nfs_write_inode(struct inode *, struct writeback_control *);
+extern int nfs_drop_inode(struct inode *);
+extern void nfs_clear_inode(struct inode *);
+extern void nfs_evict_inode(struct inode *);
+void nfs_zap_acl_cache(struct inode *inode);
+extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
+extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
+extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode);
+
+/* super.c */
+extern const struct super_operations nfs_sops;
+bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t);
+int nfs_try_get_tree(struct fs_context *);
+int nfs_get_tree_common(struct fs_context *);
+void nfs_kill_super(struct super_block *);
+
+extern struct rpc_stat nfs_rpcstat;
+
+extern int __init register_nfs_fs(void);
+extern void __exit unregister_nfs_fs(void);
+extern bool nfs_sb_active(struct super_block *sb);
+extern void nfs_sb_deactive(struct super_block *sb);
+extern int nfs_client_for_each_server(struct nfs_client *clp,
+ int (*fn)(struct nfs_server *, void *),
+ void *data);
+/* io.c */
+extern void nfs_start_io_read(struct inode *inode);
+extern void nfs_end_io_read(struct inode *inode);
+extern void nfs_start_io_write(struct inode *inode);
+extern void nfs_end_io_write(struct inode *inode);
+extern void nfs_start_io_direct(struct inode *inode);
+extern void nfs_end_io_direct(struct inode *inode);
+
+static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi)
+{
+	return !test_bit(NFS_INO_ODIRECT, &nfsi->flags); /* ODIRECT bit clear */
+}
+
+/* namespace.c */
+#define NFS_PATH_CANONICAL 1
+extern char *nfs_path(char **p, struct dentry *dentry,
+ char *buffer, ssize_t buflen, unsigned flags);
+extern struct vfsmount *nfs_d_automount(struct path *path);
+int nfs_submount(struct fs_context *, struct nfs_server *);
+int nfs_do_submount(struct fs_context *);
+
+/* getroot.c */
+extern int nfs_get_root(struct super_block *s, struct fs_context *fc);
+#if IS_ENABLED(CONFIG_NFS_V4)
+extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool);
+#endif
+
+struct nfs_pgio_completion_ops;
+/* read.c */
+extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode, bool force_mds,
+ const struct nfs_pgio_completion_ops *compl_ops);
+extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
+extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
+
+/* super.c */
+void nfs_umount_begin(struct super_block *);
+int nfs_statfs(struct dentry *, struct kstatfs *);
+int nfs_show_options(struct seq_file *, struct dentry *);
+int nfs_show_devname(struct seq_file *, struct dentry *);
+int nfs_show_path(struct seq_file *, struct dentry *);
+int nfs_show_stats(struct seq_file *, struct dentry *);
+int nfs_reconfigure(struct fs_context *);
+
+/* write.c */
+extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode, int ioflags, bool force_mds,
+ const struct nfs_pgio_completion_ops *compl_ops);
+extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
+extern void nfs_commit_free(struct nfs_commit_data *p);
+extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
+extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
+extern int nfs_initiate_commit(struct rpc_clnt *clnt,
+ struct nfs_commit_data *data,
+ const struct nfs_rpc_ops *nfs_ops,
+ const struct rpc_call_ops *call_ops,
+ int how, int flags);
+extern void nfs_init_commit(struct nfs_commit_data *data,
+ struct list_head *head,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo);
+int nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
+ struct nfs_commit_info *cinfo, int max);
+unsigned long nfs_reqs_to_commit(struct nfs_commit_info *);
+int nfs_scan_commit(struct inode *inode, struct list_head *dst,
+ struct nfs_commit_info *cinfo);
+void nfs_mark_request_commit(struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
+int nfs_write_need_commit(struct nfs_pgio_header *);
+void nfs_writeback_update_inode(struct nfs_pgio_header *hdr);
+int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
+ int how, struct nfs_commit_info *cinfo);
+void nfs_retry_commit(struct list_head *page_list,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
+void nfs_commitdata_release(struct nfs_commit_data *data);
+void nfs_request_add_commit_list(struct nfs_page *req,
+ struct nfs_commit_info *cinfo);
+void nfs_request_add_commit_list_locked(struct nfs_page *req,
+ struct list_head *dst,
+ struct nfs_commit_info *cinfo);
+void nfs_request_remove_commit_list(struct nfs_page *req,
+ struct nfs_commit_info *cinfo);
+void nfs_init_cinfo(struct nfs_commit_info *cinfo,
+ struct inode *inode,
+ struct nfs_direct_req *dreq);
+int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode);
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
+
+int nfs_filemap_write_and_wait_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend);
+
+#ifdef CONFIG_NFS_V4_1
+static inline void
+pnfs_bucket_clear_pnfs_ds_commit_verifiers(struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets)
+{
+ unsigned int i;
+
+ for (i = 0; i < nbuckets; i++)
+ buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
+}
+static inline
+void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
+{
+ struct pnfs_commit_array *array;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &cinfo->commits, cinfo_list)
+ pnfs_bucket_clear_pnfs_ds_commit_verifiers(array->buckets,
+ array->nbuckets);
+ rcu_read_unlock();
+}
+#else
+static inline
+void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
+{
+}
+#endif
+
+#ifdef CONFIG_MIGRATION
+extern int nfs_migrate_page(struct address_space *,
+ struct page *, struct page *, enum migrate_mode);
+#endif
+
+static inline int
+nfs_write_verifier_cmp(const struct nfs_write_verifier *v1,
+ const struct nfs_write_verifier *v2)
+{
+ return memcmp(v1->data, v2->data, sizeof(v1->data));
+}
+
+static inline bool
+nfs_write_match_verf(const struct nfs_writeverf *verf, struct nfs_page *req)
+{
+	if (verf->committed <= NFS_UNSTABLE)	/* must be > NFS_UNSTABLE */
+		return false;
+	return nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier) == 0;
+}
+
+static inline gfp_t nfs_io_gfp_mask(void)
+{
+ if (current->flags & PF_WQ_WORKER)
+ return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+ return GFP_KERNEL;
+}
+
+/* unlink.c */
+extern struct rpc_task *
+nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
+ struct dentry *old_dentry, struct dentry *new_dentry,
+ void (*complete)(struct rpc_task *, struct nfs_renamedata *));
+extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);
+
+/* direct.c */
+void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
+ struct nfs_direct_req *dreq);
+extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
+
+/* nfs4proc.c */
+extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
+ const struct nfs_client_initdata *);
+extern int nfs40_walk_client_list(struct nfs_client *clp,
+ struct nfs_client **result,
+ const struct cred *cred);
+extern int nfs41_walk_client_list(struct nfs_client *clp,
+ struct nfs_client **result,
+ const struct cred *cred);
+extern void nfs4_test_session_trunk(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ void *data);
+
+static inline struct inode *nfs_igrab_and_active(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (!sb || !nfs_sb_active(sb))
+		return NULL;
+	if (igrab(inode) != NULL)
+		return inode;
+	nfs_sb_deactive(sb);	/* igrab() failed: undo nfs_sb_active() */
+	return NULL;
+}
+
+static inline void nfs_iput_and_deactive(struct inode *inode)
+{
+	struct super_block *sb;
+
+	if (inode == NULL)
+		return;
+	sb = inode->i_sb;	/* fetched before iput(), as before */
+	iput(inode);
+	nfs_sb_deactive(sb);
+}
+
+/*
+ * Determine the device name as a string
+ */
+static inline char *nfs_devname(struct dentry *dentry,
+ char *buffer, ssize_t buflen)
+{
+ char *dummy;
+ return nfs_path(&dummy, dentry, buffer, buflen, NFS_PATH_CANONICAL);
+}
+
+/*
+ * Round bsize down to a power of two; log2 is stored in *nrbitsp if non-NULL
+ */
+static inline
+unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
+{
+	/* scan unless bsize is already a power of two and no log2 is wanted */
+	if ((bsize & (bsize - 1)) || nrbitsp) {
+		unsigned char nrbits;
+
+		/* 1UL, not 1: "1 << 31" is undefined for a 32-bit signed int */
+		for (nrbits = 31; nrbits && !(bsize & (1UL << nrbits)); nrbits--)
+			;
+		bsize = 1UL << nrbits;
+		if (nrbitsp)
+			*nrbitsp = nrbits;
+	}
+	return bsize;
+}
+
+/*
+ * Calculate the number of 512byte blocks used (rounded up, max ULONG_MAX).
+ */
+static inline blkcnt_t nfs_calc_block_size(u64 tsize)
+{
+	blkcnt_t used = (tsize >> 9) + !!(tsize & 511); /* tsize + 511 may wrap */
+	return (used > ULONG_MAX) ? ULONG_MAX : used;
+}
+
+/*
+ * Compute and set NFS server blocksize
+ */
+static inline
+unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
+{
+ if (bsize < NFS_MIN_FILE_IO_SIZE)
+ bsize = NFS_DEF_FILE_IO_SIZE;
+ else if (bsize >= NFS_MAX_FILE_IO_SIZE)
+ bsize = NFS_MAX_FILE_IO_SIZE;
+
+ return nfs_block_bits(bsize, nrbitsp);
+}
+
+/*
+ * Determine the maximum file size for a superblock
+ */
+static inline
+void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
+{
+ sb->s_maxbytes = (loff_t)maxfilesize;
+ if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0)
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+}
+
+/*
+ * Record the page as unstable (an extra writeback period) and mark its
+ * inode as dirty.
+ */
+static inline
+void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
+{
+ if (!cinfo->dreq) {
+ struct inode *inode = page_file_mapping(page)->host;
+
+ /* This page is really still in write-back - just that the
+ * writeback is happening on the server now.
+ */
+ inc_node_page_state(page, NR_WRITEBACK);
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ }
+}
+
+/*
+ * Determine how many bytes of @page lie within the inode's i_size:
+ * PAGE_SIZE before the last page, a partial count on it, 0 past it.
+ */
+static inline
+unsigned int nfs_page_length(struct page *page)
+{
+	loff_t size = i_size_read(page_file_mapping(page)->host);
+	pgoff_t idx, last;
+
+	if (size <= 0)
+		return 0;
+	idx = page_index(page);
+	last = (size - 1) >> PAGE_SHIFT;	/* page holding the final byte */
+	if (idx == last)
+		return ((size - 1) & ~PAGE_MASK) + 1;
+	return idx < last ? PAGE_SIZE : 0;
+}
+
+/*
+ * Convert a umode to a dirent->d_type
+ */
+static inline
+unsigned char nfs_umode_to_dtype(umode_t mode)
+{
+ return (mode >> 12) & 15;
+}
+
+/*
+ * Determine the number of pages in an array of length 'len' and
+ * with a base offset of 'base'
+ */
+static inline
+unsigned int nfs_page_array_len(unsigned int base, size_t len)
+{
+ return ((unsigned long)len + (unsigned long)base +
+ PAGE_SIZE - 1) >> PAGE_SHIFT;
+}
+
+/*
+ * Convert a struct timespec64 into a 64-bit change attribute
+ *
+ * This does approximately the same thing as timespec64_to_ns(),
+ * but for calculation efficiency, we multiply the seconds by
+ * 1024*1024*1024.
+ */
+static inline
+u64 nfs_timespec_to_change_attr(const struct timespec64 *ts)
+{
+ return ((u64)ts->tv_sec << 30) + ts->tv_nsec;
+}
+
+#ifdef CONFIG_CRC32
+/**
+ * nfs_fhandle_hash - calculate the crc32 hash for the filehandle
+ * @fh: pointer to filehandle
+ *
+ * Return: the bitwise-inverted crc32_le() (seed 0xFFFFFFFF) of the fh->size
+ * bytes at fh->data, a value compatible with the one shown by "wireshark".
+ */
+static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
+{
+ return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size);
+}
+static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
+{
+ return ~crc32_le(0xFFFFFFFF, &stateid->other[0],
+ NFS4_STATEID_OTHER_SIZE);
+}
+#else
+static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
+{
+ return 0;
+}
+static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
+{
+	return 0;	/* !CONFIG_CRC32 stub; const matches the CRC32 variant */
+}
+#endif
+
+static inline bool nfs_error_is_fatal(int err)
+{
+ switch (err) {
+ case -ERESTARTSYS:
+ case -EINTR:
+ case -EACCES:
+ case -EDQUOT:
+ case -EFBIG:
+ case -EIO:
+ case -ENOSPC:
+ case -EROFS:
+ case -ESTALE:
+ case -E2BIG:
+ case -ENOMEM:
+ case -ETIMEDOUT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool nfs_error_is_fatal_on_server(int err)
+{
+	/*
+	 * 0, -ERESTARTSYS, -EINTR and -ENOMEM are excluded up front, even
+	 * though nfs_error_is_fatal() reports the last three as fatal.
+	 */
+	if (err == 0 || err == -ERESTARTSYS ||
+	    err == -EINTR || err == -ENOMEM)
+		return false;
+	return nfs_error_is_fatal(err);
+}
+
+/*
+ * Select between a default port value and a user-specified port value.
+ * If a zero value is set, then autobind will be used.
+ */
+static inline void nfs_set_port(struct sockaddr *sap, int *port,
+ const unsigned short default_port)
+{
+ if (*port == NFS_UNSPEC_PORT)
+ *port = default_port;
+
+ rpc_set_port(sap, *port);
+}
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
new file mode 100644
index 000000000..5088fda9b
--- /dev/null
+++ b/fs/nfs/io.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2016 Trond Myklebust
+ *
+ * I/O and data path helper functionality.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/rwsem.h>
+#include <linux/fs.h>
+#include <linux/nfs_fs.h>
+
+#include "internal.h"
+
+/* Call with exclusively locked inode->i_rwsem */
+static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
+{
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
+ clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
+ inode_dio_wait(inode);
+ }
+}
+
+/**
+ * nfs_start_io_read - declare the file is being used for buffered reads
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that buffered read operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas direct I/O
+ * operations need to wait to grab an exclusive lock in order to set
+ * NFS_INO_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
+ */
+void
+nfs_start_io_read(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ /* Be an optimist! */
+ down_read(&inode->i_rwsem);
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0)
+ return;
+ up_read(&inode->i_rwsem);
+ /* Slow path.... */
+ down_write(&inode->i_rwsem);
+ nfs_block_o_direct(nfsi, inode);
+ downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * nfs_end_io_read - declare that the buffered read operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_read(struct inode *inode)
+{
+ up_read(&inode->i_rwsem);
+}
+
+/**
+ * nfs_start_io_write - declare the file is being used for buffered writes
+ * @inode: file inode
+ *
+ * Declare that a buffered write operation is about to start: take
+ * inode->i_rwsem exclusively and ensure that we block all direct I/O.
+ */
+void
+nfs_start_io_write(struct inode *inode)
+{
+ down_write(&inode->i_rwsem);
+ nfs_block_o_direct(NFS_I(inode), inode);
+}
+
+/**
+ * nfs_end_io_write - declare that the buffered write operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered write operation is done, and release the
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_write(struct inode *inode)
+{
+ up_write(&inode->i_rwsem);
+}
+
+/* Call with exclusively locked inode->i_rwsem */
+static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode)
+{
+ if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
+ set_bit(NFS_INO_ODIRECT, &nfsi->flags);
+ nfs_sync_mapping(inode->i_mapping);
+ }
+}
+
+/**
+ * nfs_start_io_direct - declare the file is being used for direct i/o
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is about to start, and ensure
+ * that we block all buffered I/O.
+ * On exit, the function ensures that the NFS_INO_ODIRECT flag is set,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that direct I/O operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas buffered I/O
+ * operations need to wait to grab an exclusive lock in order to clear
+ * NFS_INO_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
+ */
+void
+nfs_start_io_direct(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ /* Be an optimist! */
+ down_read(&inode->i_rwsem);
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0)
+ return; /* flag already set: return still holding the shared lock */
+ up_read(&inode->i_rwsem);
+ /* Slow path.... */
+ down_write(&inode->i_rwsem);
+ nfs_block_buffered(nfsi, inode);
+ downgrade_write(&inode->i_rwsem); /* return holding the lock shared */
+}
+
+/**
+ * nfs_end_io_direct - declare that the direct i/o operation is done
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_direct(struct inode *inode)
+{
+ up_read(&inode->i_rwsem);
+}
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
new file mode 100644
index 000000000..2ddaab1ac
--- /dev/null
+++ b/fs/nfs/iostat.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * linux/fs/nfs/iostat.h
+ *
+ * Declarations for NFS client per-mount statistics
+ *
+ * Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com>
+ *
+ */
+
+#ifndef _NFS_IOSTAT
+#define _NFS_IOSTAT
+
+#include <linux/percpu.h>
+#include <linux/cache.h>
+#include <linux/nfs_iostat.h>
+
+struct nfs_iostats {
+ unsigned long long bytes[__NFSIOS_BYTESMAX];
+#ifdef CONFIG_NFS_FSCACHE
+ unsigned long long fscache[__NFSIOS_FSCACHEMAX];
+#endif
+ unsigned long events[__NFSIOS_COUNTSMAX];
+} ____cacheline_aligned;
+
+static inline void nfs_inc_server_stats(const struct nfs_server *server,
+ enum nfs_stat_eventcounters stat)
+{
+ this_cpu_inc(server->io_stats->events[stat]);
+}
+
+static inline void nfs_inc_stats(const struct inode *inode,
+ enum nfs_stat_eventcounters stat)
+{
+ nfs_inc_server_stats(NFS_SERVER(inode), stat);
+}
+
+static inline void nfs_add_server_stats(const struct nfs_server *server,
+ enum nfs_stat_bytecounters stat,
+ long addend)
+{
+ this_cpu_add(server->io_stats->bytes[stat], addend);
+}
+
+static inline void nfs_add_stats(const struct inode *inode,
+ enum nfs_stat_bytecounters stat,
+ long addend)
+{
+ nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
+}
+
+#ifdef CONFIG_NFS_FSCACHE
+static inline void nfs_add_fscache_stats(struct inode *inode,
+ enum nfs_stat_fscachecounters stat,
+ long addend)
+{
+ this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
+}
+static inline void nfs_inc_fscache_stats(struct inode *inode,
+ enum nfs_stat_fscachecounters stat)
+{
+ this_cpu_inc(NFS_SERVER(inode)->io_stats->fscache[stat]);
+}
+#endif
+
+static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
+{
+ return alloc_percpu(struct nfs_iostats);
+}
+
+static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats)
+{
+ if (stats != NULL)
+ free_percpu(stats);
+}
+
+#endif /* _NFS_IOSTAT */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
new file mode 100644
index 000000000..dda5c3e65
--- /dev/null
+++ b/fs/nfs/mount_clnt.c
@@ -0,0 +1,540 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * In-kernel MOUNT protocol client
+ *
+ * Copyright (C) 1997, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/uio.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/nfs_fs.h>
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_MOUNT
+
+/*
+ * Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4
+ */
+#define MNTPATHLEN (1024)
+
+/*
+ * XDR data type sizes
+ */
+#define encode_dirpath_sz (1 + XDR_QUADLEN(MNTPATHLEN))
+#define MNT_status_sz (1)
+#define MNT_fhandle_sz XDR_QUADLEN(NFS2_FHSIZE)
+#define MNT_fhandlev3_sz XDR_QUADLEN(NFS3_FHSIZE)
+#define MNT_authflav3_sz (1 + NFS_MAX_SECFLAVORS)
+
+/*
+ * XDR argument and result sizes
+ */
+#define MNT_enc_dirpath_sz encode_dirpath_sz
+#define MNT_dec_mountres_sz (MNT_status_sz + MNT_fhandle_sz)
+#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandlev3_sz + \
+ MNT_authflav3_sz)
+
+/*
+ * Defined by RFC 1094, section A.5
+ */
+enum {
+ MOUNTPROC_NULL = 0,
+ MOUNTPROC_MNT = 1,
+ MOUNTPROC_DUMP = 2,
+ MOUNTPROC_UMNT = 3,
+ MOUNTPROC_UMNTALL = 4,
+ MOUNTPROC_EXPORT = 5,
+};
+
+/*
+ * Defined by RFC 1813, section 5.2
+ */
+enum {
+ MOUNTPROC3_NULL = 0,
+ MOUNTPROC3_MNT = 1,
+ MOUNTPROC3_DUMP = 2,
+ MOUNTPROC3_UMNT = 3,
+ MOUNTPROC3_UMNTALL = 4,
+ MOUNTPROC3_EXPORT = 5,
+};
+
+static const struct rpc_program mnt_program;
+
+/*
+ * Defined by OpenGroup XNFS Version 3W, chapter 8
+ */
+enum mountstat {
+ MNT_OK = 0,
+ MNT_EPERM = 1,
+ MNT_ENOENT = 2,
+ MNT_EACCES = 13,
+ MNT_EINVAL = 22,
+};
+
+static const struct {
+	u32 status;	/* MNT status code as decoded from the reply */
+	int errno;	/* negative errno stored in mountres.errno */
+} mnt_errtbl[] = {
+	{ .status = MNT_OK,		.errno = 0,		},
+	{ .status = MNT_EPERM,		.errno = -EPERM,	},
+	{ .status = MNT_ENOENT,		.errno = -ENOENT,	},
+	{ .status = MNT_EACCES,		.errno = -EACCES,	},
+	{ .status = MNT_EINVAL,		.errno = -EINVAL,	},
+};
+
+/*
+ * Defined by RFC 1813, section 5.1.5
+ */
+enum mountstat3 {
+ MNT3_OK = 0, /* no error */
+ MNT3ERR_PERM = 1, /* Not owner */
+ MNT3ERR_NOENT = 2, /* No such file or directory */
+ MNT3ERR_IO = 5, /* I/O error */
+ MNT3ERR_ACCES = 13, /* Permission denied */
+ MNT3ERR_NOTDIR = 20, /* Not a directory */
+ MNT3ERR_INVAL = 22, /* Invalid argument */
+ MNT3ERR_NAMETOOLONG = 63, /* Filename too long */
+ MNT3ERR_NOTSUPP = 10004, /* Operation not supported */
+ MNT3ERR_SERVERFAULT = 10006, /* A failure on the server */
+};
+
+static const struct {
+	u32 status;	/* MNT3 status code as decoded from the reply */
+	int errno;	/* negative errno stored in mountres.errno */
+} mnt3_errtbl[] = {
+	{ .status = MNT3_OK,			.errno = 0,		},
+	{ .status = MNT3ERR_PERM,		.errno = -EPERM,	},
+	{ .status = MNT3ERR_NOENT,		.errno = -ENOENT,	},
+	{ .status = MNT3ERR_IO,			.errno = -EIO,		},
+	{ .status = MNT3ERR_ACCES,		.errno = -EACCES,	},
+	{ .status = MNT3ERR_NOTDIR,		.errno = -ENOTDIR,	},
+	{ .status = MNT3ERR_INVAL,		.errno = -EINVAL,	},
+	{ .status = MNT3ERR_NAMETOOLONG,	.errno = -ENAMETOOLONG,	},
+	{ .status = MNT3ERR_NOTSUPP,		.errno = -ENOTSUPP,	},
+	{ .status = MNT3ERR_SERVERFAULT,	.errno = -EREMOTEIO,	},
+};
+
+struct mountres {
+ int errno;
+ struct nfs_fh *fh;
+ unsigned int *auth_count;
+ rpc_authflavor_t *auth_flavors;
+};
+
+struct mnt_fhstatus {
+ u32 status;
+ struct nfs_fh *fh;
+};
+
+/**
+ * nfs_mount - Obtain an NFS file handle for the given host and path
+ * @info: pointer to mount request arguments
+ *
+ * Uses default timeout parameters specified by underlying transport. On
+ * successful return, the auth_flavs list and auth_flav_len will be populated
+ * with the list from the server or a faked-up list if the server didn't
+ * provide one.
+ */
+int nfs_mount(struct nfs_mount_request *info)
+{
+ struct mountres result = {
+ .fh = info->fh,
+ .auth_count = info->auth_flav_len,
+ .auth_flavors = info->auth_flavs,
+ };
+ struct rpc_message msg = {
+ .rpc_argp = info->dirpath,
+ .rpc_resp = &result,
+ };
+ struct rpc_create_args args = {
+ .net = info->net,
+ .protocol = info->protocol,
+ .address = info->sap,
+ .addrsize = info->salen,
+ .servername = info->hostname,
+ .program = &mnt_program,
+ .version = info->version,
+ .authflavor = RPC_AUTH_UNIX,
+ .cred = current_cred(),
+ };
+ struct rpc_clnt *mnt_clnt;
+ int status;
+
+ dprintk("NFS: sending MNT request for %s:%s\n",
+ (info->hostname ? info->hostname : "server"),
+ info->dirpath);
+
+ if (strlen(info->dirpath) > MNTPATHLEN)
+ return -ENAMETOOLONG;
+
+ if (info->noresvport)
+ args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+
+ mnt_clnt = rpc_create(&args);
+ if (IS_ERR(mnt_clnt))
+ goto out_clnt_err;
+
+ if (info->version == NFS_MNT3_VERSION)
+ msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT];
+ else
+ msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT];
+
+ status = rpc_call_sync(mnt_clnt, &msg, RPC_TASK_SOFT|RPC_TASK_TIMEOUT);
+ rpc_shutdown_client(mnt_clnt);
+
+ if (status < 0)
+ goto out_call_err;
+ if (result.errno != 0)
+ goto out_mnt_err;
+
+ dprintk("NFS: MNT request succeeded\n");
+ status = 0;
+
+ /*
+ * If the server didn't provide a flavor list, allow the
+ * client to try any flavor.
+ */
+ if (info->version != NFS_MNT3_VERSION || *info->auth_flav_len == 0) {
+ dprintk("NFS: Faking up auth_flavs list\n");
+ info->auth_flavs[0] = RPC_AUTH_NULL;
+ *info->auth_flav_len = 1;
+ }
+out:
+ return status;
+
+out_clnt_err:
+ status = PTR_ERR(mnt_clnt);
+ dprintk("NFS: failed to create MNT RPC client, status=%d\n", status);
+ goto out;
+
+out_call_err:
+ dprintk("NFS: MNT request failed, status=%d\n", status);
+ goto out;
+
+out_mnt_err:
+ dprintk("NFS: MNT server returned result %d\n", result.errno);
+ status = result.errno;
+ goto out;
+}
+
+/**
+ * nfs_umount - Notify a server that we have unmounted this export
+ * @info: pointer to umount request arguments
+ *
+ * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always
+ * use UDP.
+ */
+void nfs_umount(const struct nfs_mount_request *info)
+{
+ static const struct rpc_timeout nfs_umnt_timeout = {
+ .to_initval = 1 * HZ,
+ .to_maxval = 3 * HZ,
+ .to_retries = 2,
+ };
+ struct rpc_create_args args = {
+ .net = info->net,
+ .protocol = IPPROTO_UDP,
+ .address = info->sap,
+ .addrsize = info->salen,
+ .timeout = &nfs_umnt_timeout,
+ .servername = info->hostname,
+ .program = &mnt_program,
+ .version = info->version,
+ .authflavor = RPC_AUTH_UNIX,
+ .flags = RPC_CLNT_CREATE_NOPING,
+ .cred = current_cred(),
+ };
+ struct rpc_message msg = {
+ .rpc_argp = info->dirpath,
+ };
+ struct rpc_clnt *clnt;
+ int status;
+
+ if (strlen(info->dirpath) > MNTPATHLEN)
+ return;
+
+ if (info->noresvport)
+ args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+
+ clnt = rpc_create(&args);
+ if (IS_ERR(clnt))
+ goto out_clnt_err;
+
+ dprintk("NFS: sending UMNT request for %s:%s\n",
+ (info->hostname ? info->hostname : "server"), info->dirpath);
+
+ if (info->version == NFS_MNT3_VERSION)
+ msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT];
+ else
+ msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT];
+
+ status = rpc_call_sync(clnt, &msg, 0);
+ rpc_shutdown_client(clnt);
+
+ if (unlikely(status < 0))
+ goto out_call_err;
+
+ return;
+
+out_clnt_err:
+ dprintk("NFS: failed to create UMNT RPC client, status=%ld\n",
+ PTR_ERR(clnt));
+ return;
+
+out_call_err:
+ dprintk("NFS: UMNT request failed, status=%d\n", status);
+}
+
+/*
+ * XDR encode/decode functions for MOUNT
+ */
+
+static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
+{
+ const u32 pathname_len = strlen(pathname);
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4 + pathname_len);
+ xdr_encode_opaque(p, pathname, pathname_len);
+}
+
+static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *dirpath)
+{
+ encode_mntdirpath(xdr, dirpath);
+}
+
+/*
+ * RFC 1094: "A non-zero status indicates some sort of error. In this
+ * case, the status is a UNIX error number." This can be problematic
+ * if the server and client use different errno values for the same
+ * error.
+ *
+ * However, the OpenGroup XNFS spec provides a simple mapping that is
+ * independent of local errno values on the server and the client.
+ */
+static int decode_status(struct xdr_stream *xdr, struct mountres *res)
+{
+ unsigned int i;
+ u32 status;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -EIO;
+ status = be32_to_cpup(p);
+
+ for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
+ if (mnt_errtbl[i].status == status) {
+ res->errno = mnt_errtbl[i].errno;
+ return 0;
+ }
+ }
+
+ dprintk("NFS: unrecognized MNT status code: %u\n", status);
+ res->errno = -EACCES;
+ return 0;
+}
+
+static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res)
+{
+ struct nfs_fh *fh = res->fh;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS2_FHSIZE);
+ if (unlikely(p == NULL))
+ return -EIO;
+
+ fh->size = NFS2_FHSIZE;
+ memcpy(fh->data, p, NFS2_FHSIZE);
+ return 0;
+}
+
+static int mnt_xdr_dec_mountres(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct mountres *res = data;
+ int status;
+
+ status = decode_status(xdr, res);
+ if (unlikely(status != 0 || res->errno != 0))
+ return status;
+ return decode_fhandle(xdr, res);
+}
+
+static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
+{
+ unsigned int i;
+ u32 status;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -EIO;
+ status = be32_to_cpup(p);
+
+ for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
+ if (mnt3_errtbl[i].status == status) {
+ res->errno = mnt3_errtbl[i].errno;
+ return 0;
+ }
+ }
+
+ dprintk("NFS: unrecognized MNT3 status code: %u\n", status);
+ res->errno = -EACCES;
+ return 0;
+}
+
+static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res)
+{
+ struct nfs_fh *fh = res->fh;
+ u32 size;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -EIO;
+
+ size = be32_to_cpup(p);
+ if (size > NFS3_FHSIZE || size == 0)
+ return -EIO;
+
+ p = xdr_inline_decode(xdr, size);
+ if (unlikely(p == NULL))
+ return -EIO;
+
+ fh->size = size;
+ memcpy(fh->data, p, size);
+ return 0;
+}
+
+static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
+{
+ rpc_authflavor_t *flavors = res->auth_flavors;
+ unsigned int *count = res->auth_count;
+ u32 entries, i;
+ __be32 *p;
+ /* Fill flavors[] from the reply; *count is capacity on entry, length on exit */
+ if (*count == 0)
+ return 0; /* nothing is decoded and *count stays 0 */
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -EIO;
+ entries = be32_to_cpup(p);
+ dprintk("NFS: received %u auth flavors\n", entries);
+ if (entries > NFS_MAX_SECFLAVORS)
+ entries = NFS_MAX_SECFLAVORS;
+ /* pull the (clamped) list out of the stream before limiting it to *count */
+ p = xdr_inline_decode(xdr, 4 * entries);
+ if (unlikely(p == NULL))
+ return -EIO;
+
+ if (entries > *count)
+ entries = *count;
+
+ for (i = 0; i < entries; i++) {
+ flavors[i] = be32_to_cpup(p++);
+ dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]);
+ }
+ *count = i; /* number of flavors actually stored */
+
+ return 0;
+}
+
+static int mnt_xdr_dec_mountres3(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct mountres *res = data;
+ int status;
+
+ status = decode_fhs_status(xdr, res);
+ if (unlikely(status != 0 || res->errno != 0))
+ return status;
+ status = decode_fhandle3(xdr, res);
+ if (unlikely(status != 0)) {
+ res->errno = -EBADHANDLE;
+ return 0;
+ }
+ return decode_auth_flavors(xdr, res);
+}
+
+static const struct rpc_procinfo mnt_procedures[] = {
+ [MOUNTPROC_MNT] = {
+ .p_proc = MOUNTPROC_MNT,
+ .p_encode = mnt_xdr_enc_dirpath,
+ .p_decode = mnt_xdr_dec_mountres,
+ .p_arglen = MNT_enc_dirpath_sz,
+ .p_replen = MNT_dec_mountres_sz,
+ .p_statidx = MOUNTPROC_MNT,
+ .p_name = "MOUNT",
+ },
+ [MOUNTPROC_UMNT] = {
+ .p_proc = MOUNTPROC_UMNT,
+ .p_encode = mnt_xdr_enc_dirpath,
+ .p_arglen = MNT_enc_dirpath_sz,
+ .p_statidx = MOUNTPROC_UMNT,
+ .p_name = "UMOUNT",
+ },
+};
+
+static const struct rpc_procinfo mnt3_procedures[] = {
+ [MOUNTPROC3_MNT] = {
+ .p_proc = MOUNTPROC3_MNT,
+ .p_encode = mnt_xdr_enc_dirpath,
+ .p_decode = mnt_xdr_dec_mountres3,
+ .p_arglen = MNT_enc_dirpath_sz,
+ .p_replen = MNT_dec_mountres3_sz,
+ .p_statidx = MOUNTPROC3_MNT,
+ .p_name = "MOUNT",
+ },
+ [MOUNTPROC3_UMNT] = {
+ .p_proc = MOUNTPROC3_UMNT,
+ .p_encode = mnt_xdr_enc_dirpath,
+ .p_arglen = MNT_enc_dirpath_sz,
+ .p_statidx = MOUNTPROC3_UMNT,
+ .p_name = "UMOUNT",
+ },
+};
+
+static unsigned int mnt_counts[ARRAY_SIZE(mnt_procedures)];
+static const struct rpc_version mnt_version1 = {
+ .number = 1,
+ .nrprocs = ARRAY_SIZE(mnt_procedures),
+ .procs = mnt_procedures,
+ .counts = mnt_counts,
+};
+
+static unsigned int mnt3_counts[ARRAY_SIZE(mnt3_procedures)];
+static const struct rpc_version mnt_version3 = {
+ .number = 3,
+ .nrprocs = ARRAY_SIZE(mnt3_procedures),
+ .procs = mnt3_procedures,
+ .counts = mnt3_counts,
+};
+
+static const struct rpc_version *mnt_version[] = {
+ NULL,
+ &mnt_version1,
+ NULL,
+ &mnt_version3,
+};
+
+static struct rpc_stat mnt_stats;
+
+static const struct rpc_program mnt_program = {
+ .name = "mount",
+ .number = NFS_MNT_PROGRAM,
+ .nrvers = ARRAY_SIZE(mnt_version),
+ .version = mnt_version,
+ .stats = &mnt_stats,
+};
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
new file mode 100644
index 000000000..1f03445b5
--- /dev/null
+++ b/fs/nfs/namespace.c
@@ -0,0 +1,367 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/namespace.c
+ *
+ * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
+ * - Modified by David Howells <dhowells@redhat.com>
+ *
+ * NFS namespace
+ */
+
+#include <linux/module.h>
+#include <linux/dcache.h>
+#include <linux/gfp.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/nfs_fs.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/vfs.h>
+#include <linux/sunrpc/gss_api.h>
+#include "internal.h"
+#include "nfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+static void nfs_expire_automounts(struct work_struct *work);
+
+/* Mounts created by nfs_d_automount() are queued here for expiry. */
+static LIST_HEAD(nfs_automount_list);
+/* Delayed work that runs nfs_expire_automounts(). */
+static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts);
+/*
+ * Expiry delay as passed to schedule_delayed_work(); the module parameter
+ * setter in this file stores seconds * HZ here, or -1*HZ to disable.
+ */
+int nfs_mountpoint_expiry_timeout = 500 * HZ;
+
+/*
+ * nfs_path - reconstruct the path given an arbitrary dentry
+ * @p - used to return pointer to the end of devname part of path
+ * @dentry_in - pointer to dentry
+ * @buffer - result buffer
+ * @buflen_in - length of buffer
+ * @flags - options (see below)
+ *
+ * Helper function for constructing the server pathname
+ * by arbitrary hashed dentry.
+ *
+ * This is mainly for use in figuring out the path on the
+ * server side when automounting on top of an existing partition
+ * and in generating /proc/mounts and friends.
+ *
+ * Supported flags:
+ * NFS_PATH_CANONICAL: ensure there is exactly one slash after
+ * the original device (export) name
+ * (if unset, the original name is returned verbatim)
+ *
+ * The string is assembled backwards from the end of @buffer.  Returns a
+ * pointer into @buffer at the start of the result, or
+ * ERR_PTR(-ENAMETOOLONG) if it does not fit.  If the root dentry has no
+ * d_fsdata, a warning is raised and only the path part is returned.
+ */
+char *nfs_path(char **p, struct dentry *dentry_in, char *buffer,
+ ssize_t buflen_in, unsigned flags)
+{
+ char *end;
+ int namelen;
+ unsigned seq;
+ const char *base;
+ struct dentry *dentry;
+ ssize_t buflen;
+
+rename_retry:
+ /* Every attempt starts over from the caller-supplied dentry and length. */
+ buflen = buflen_in;
+ dentry = dentry_in;
+ end = buffer+buflen;
+ *--end = '\0';
+ buflen--;
+
+ seq = read_seqbegin(&rename_lock);
+ rcu_read_lock();
+ /*
+ * Walk up through d_parent, prepending "/<name>" for each component.
+ * The loop exits with d_lock still held on the root dentry.
+ */
+ while (1) {
+ spin_lock(&dentry->d_lock);
+ if (IS_ROOT(dentry))
+ break;
+ namelen = dentry->d_name.len;
+ buflen -= namelen + 1;
+ if (buflen < 0)
+ goto Elong_unlock;
+ end -= namelen;
+ memcpy(end, dentry->d_name.name, namelen);
+ *--end = '/';
+ spin_unlock(&dentry->d_lock);
+ dentry = dentry->d_parent;
+ }
+ /* rename_lock changed during the walk: drop locks and rebuild. */
+ if (read_seqretry(&rename_lock, seq)) {
+ spin_unlock(&dentry->d_lock);
+ rcu_read_unlock();
+ goto rename_retry;
+ }
+ if ((flags & NFS_PATH_CANONICAL) && *end != '/') {
+ if (--buflen < 0) {
+ spin_unlock(&dentry->d_lock);
+ rcu_read_unlock();
+ goto Elong;
+ }
+ *--end = '/';
+ }
+ *p = end;
+ /* The root dentry's d_fsdata supplies the string placed before the path. */
+ base = dentry->d_fsdata;
+ if (!base) {
+ spin_unlock(&dentry->d_lock);
+ rcu_read_unlock();
+ WARN_ON(1);
+ return end;
+ }
+ namelen = strlen(base);
+ if (*end == '/') {
+ /* Strip off excess slashes in base string */
+ while (namelen > 0 && base[namelen - 1] == '/')
+ namelen--;
+ }
+ buflen -= namelen;
+ if (buflen < 0) {
+ spin_unlock(&dentry->d_lock);
+ rcu_read_unlock();
+ goto Elong;
+ }
+ end -= namelen;
+ memcpy(end, base, namelen);
+ spin_unlock(&dentry->d_lock);
+ rcu_read_unlock();
+ return end;
+Elong_unlock:
+ spin_unlock(&dentry->d_lock);
+ rcu_read_unlock();
+ /* An overflow seen during a racing rename is retried, not reported. */
+ if (read_seqretry(&rename_lock, seq))
+ goto rename_retry;
+Elong:
+ return ERR_PTR(-ENAMETOOLONG);
+}
+EXPORT_SYMBOL_GPL(nfs_path);
+
+/*
+ * nfs_d_automount - Handle crossing a mountpoint on the server
+ * @path - The mountpoint
+ *
+ * When we encounter a mountpoint on the server, we want to set up
+ * a mountpoint on the client too, to prevent inode numbers from
+ * colliding, and to allow "df" to work properly.
+ * On NFSv4, we also want to allow for the fact that different
+ * filesystems may be migrated to different servers in a failover
+ * situation, and that different filesystems may want to use
+ * different security flavours.
+ *
+ * Returns the new vfsmount or an ERR_PTR: -ESTALE when @path->dentry is
+ * a root dentry, -ENOMEM when the fattr allocation fails, otherwise the
+ * error reported by fs_context_for_submount(), the ->submount()
+ * operation or vfs_create_mount().
+ */
+struct vfsmount *nfs_d_automount(struct path *path)
+{
+ struct nfs_fs_context *ctx;
+ struct fs_context *fc;
+ struct vfsmount *mnt = ERR_PTR(-ENOMEM);
+ struct nfs_server *server = NFS_SB(path->dentry->d_sb);
+ struct nfs_client *client = server->nfs_client;
+ int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout);
+ int ret;
+
+ if (IS_ROOT(path->dentry))
+ return ERR_PTR(-ESTALE);
+
+ /* Open a new filesystem context, transferring parameters from the
+ * parent superblock, including the network namespace.
+ */
+ fc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry);
+ if (IS_ERR(fc))
+ return ERR_CAST(fc);
+
+ ctx = nfs_fc2context(fc);
+ ctx->clone_data.dentry = path->dentry;
+ ctx->clone_data.sb = path->dentry->d_sb;
+ ctx->clone_data.fattr = nfs_alloc_fattr();
+ if (!ctx->clone_data.fattr)
+ goto out_fc;
+
+ /* Make the context use the client's network namespace. */
+ if (fc->net_ns != client->cl_net) {
+ put_net(fc->net_ns);
+ fc->net_ns = get_net(client->cl_net);
+ }
+
+ /* for submounts we want the same server; referrals will reassign */
+ memcpy(&ctx->nfs_server.address, &client->cl_addr, client->cl_addrlen);
+ ctx->nfs_server.addrlen = client->cl_addrlen;
+ ctx->nfs_server.port = server->port;
+
+ ctx->version = client->rpc_ops->version;
+ ctx->minorversion = client->cl_minorversion;
+ ctx->nfs_mod = client->cl_nfs_mod;
+ /* Take a module reference for the nfs_mod now stored in the context. */
+ __module_get(ctx->nfs_mod->owner);
+
+ ret = client->rpc_ops->submount(fc, server);
+ if (ret < 0) {
+ mnt = ERR_PTR(ret);
+ goto out_fc;
+ }
+
+ /*
+ * NOTE(review): s_umount is released here; presumably it is still
+ * write-held from the tree creation done inside ->submount() - confirm.
+ */
+ up_write(&fc->root->d_sb->s_umount);
+ mnt = vfs_create_mount(fc);
+ if (IS_ERR(mnt))
+ goto out_fc;
+
+ mntget(mnt); /* prevent immediate expiration */
+ /* Expiry disabled: return the mount without queuing it for expiry. */
+ if (timeout <= 0)
+ goto out_fc;
+
+ mnt_set_expiry(mnt, &nfs_automount_list);
+ schedule_delayed_work(&nfs_automount_task, timeout);
+
+out_fc:
+ put_fs_context(fc);
+ return mnt;
+}
+
+/*
+ * getattr for referral inodes.  An inode whose file handle has zero
+ * length is answered from the in-core inode via generic_fillattr();
+ * any other inode is passed on to nfs_getattr().
+ */
+static int
+nfs_namespace_getattr(const struct path *path, struct kstat *stat,
+			u32 request_mask, unsigned int query_flags)
+{
+	if (NFS_FH(d_inode(path->dentry))->size == 0) {
+		generic_fillattr(d_inode(path->dentry), stat);
+		return 0;
+	}
+	return nfs_getattr(path, stat, request_mask, query_flags);
+}
+
+/*
+ * setattr for referral inodes.  Refused with -EACCES when the inode's
+ * file handle has zero length, otherwise delegated to nfs_setattr().
+ */
+static int
+nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	if (NFS_FH(d_inode(dentry))->size == 0)
+		return -EACCES;
+	return nfs_setattr(dentry, attr);
+}
+
+/* Mountpoint inodes use the regular NFS attribute handlers directly. */
+const struct inode_operations nfs_mountpoint_inode_operations = {
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+};
+
+/*
+ * Referral inodes go through the nfs_namespace_*() wrappers, which
+ * special-case inodes whose file handle has zero length.
+ */
+const struct inode_operations nfs_referral_inode_operations = {
+ .getattr = nfs_namespace_getattr,
+ .setattr = nfs_namespace_setattr,
+};
+
+/*
+ * Work handler for nfs_automount_task: run one expiry pass over
+ * nfs_automount_list, then re-arm the work while entries remain and the
+ * timeout is positive.
+ */
+static void nfs_expire_automounts(struct work_struct *work)
+{
+	int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout);
+
+	mark_mounts_for_expiry(&nfs_automount_list);
+	if (list_empty(&nfs_automount_list) || timeout <= 0)
+		return;
+	schedule_delayed_work(&nfs_automount_task, timeout);
+}
+
+/* Cancel the pending expiry work once nfs_automount_list is empty. */
+void nfs_release_automount_timer(void)
+{
+	if (!list_empty(&nfs_automount_list))
+		return;
+	cancel_delayed_work(&nfs_automount_task);
+}
+
+/**
+ * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
+ * @fc: filesystem context; its NFS private data (struct nfs_fs_context)
+ *      supplies the clone parameters
+ *
+ * Clones the parent's nfs_server, derives the "source" string for the
+ * submount from the dentry and asks the VFS to build the tree.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int nfs_do_submount(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct dentry *dentry = ctx->clone_data.dentry;
+ struct nfs_server *server;
+ char *buffer, *p;
+ int ret;
+
+ /* create a new volume representation */
+ server = ctx->nfs_mod->rpc_ops->clone_server(NFS_SB(ctx->clone_data.sb),
+ ctx->mntfh,
+ ctx->clone_data.fattr,
+ ctx->selected_flavor);
+
+ if (IS_ERR(server))
+ return PTR_ERR(server);
+
+ ctx->server = server;
+
+ /* Scratch buffer for the device name; freed before returning. */
+ buffer = kmalloc(4096, GFP_USER);
+ if (!buffer)
+ return -ENOMEM;
+
+ ctx->internal = true;
+ ctx->clone_data.inherited_bsize = ctx->clone_data.sb->s_blocksize_bits;
+
+ p = nfs_devname(dentry, buffer, 4096);
+ if (IS_ERR(p)) {
+ nfs_errorf(fc, "NFS: Couldn't determine submount pathname");
+ ret = PTR_ERR(p);
+ } else {
+ /* The value length passed runs from p to the end of the buffer. */
+ ret = vfs_parse_fs_string(fc, "source", p, buffer + 4096 - p);
+ if (!ret)
+ ret = vfs_get_tree(fc);
+ }
+ kfree(buffer);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_do_submount);
+
+/*
+ * nfs_submount - look up the mountpoint's attributes, then create the submount
+ * @fc: filesystem context for the new mount
+ * @server: nfs_server whose client performs the lookup
+ *
+ * Issues a ->lookup() of the dentry in its parent to fill ctx->mntfh and
+ * ctx->clone_data.fattr, records the auth flavor of @server's RPC client
+ * and hands over to nfs_do_submount().
+ *
+ * Returns 0 on success, the lookup error, or the nfs_do_submount() result.
+ */
+int nfs_submount(struct fs_context *fc, struct nfs_server *server)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct dentry *dentry = ctx->clone_data.dentry;
+ struct dentry *parent = dget_parent(dentry);
+ int err;
+
+ /* Look it up again to get its attributes */
+ err = server->nfs_client->rpc_ops->lookup(d_inode(parent), dentry,
+ ctx->mntfh, ctx->clone_data.fattr,
+ NULL);
+ /* The parent reference is only needed for the lookup itself. */
+ dput(parent);
+ if (err != 0)
+ return err;
+
+ ctx->selected_flavor = server->client->cl_auth->au_flavor;
+ return nfs_do_submount(fc);
+}
+EXPORT_SYMBOL_GPL(nfs_submount);
+
+/*
+ * Module parameter setter.  @val is parsed as a number of seconds.
+ * A positive value is stored as seconds * HZ (capped at INT_MAX) and,
+ * if automounts are queued, the expiry work is re-timed.  Any other
+ * value stores -1*HZ and cancels the expiry work.
+ *
+ * Returns 0, or -EINVAL when @val is NULL or does not parse.
+ */
+static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp)
+{
+	long secs;
+
+	if (!val)
+		return -EINVAL;
+	if (kstrtol(val, 0, &secs))
+		return -EINVAL;
+
+	if (secs <= 0) {
+		*((int *)kp->arg) = -1*HZ;
+		cancel_delayed_work(&nfs_automount_task);
+		return 0;
+	}
+
+	/* Convert to the stored unit, saturating instead of overflowing. */
+	if (secs >= INT_MAX / HZ)
+		secs = INT_MAX;
+	else
+		secs *= HZ;
+	*((int *)kp->arg) = secs;
+	if (!list_empty(&nfs_automount_list))
+		mod_delayed_work(system_wq, &nfs_automount_task, secs);
+	return 0;
+}
+
+/*
+ * Module parameter getter.  Prints the stored value converted back to
+ * seconds (rounded up), or -1 when the stored value is not positive.
+ */
+static int param_get_nfs_timeout(char *buffer, const struct kernel_param *kp)
+{
+	long num = *((int *)kp->arg);
+
+	if (num <= 0)
+		num = -1;
+	else if (num >= INT_MAX - (HZ - 1))
+		num = INT_MAX / HZ;	/* rounding up would exceed INT_MAX */
+	else
+		num = (num + (HZ - 1)) / HZ;
+
+	return scnprintf(buffer, PAGE_SIZE, "%li\n", num);
+}
+
+/* Parameter type "nfs_timeout": an int backed by the handlers above. */
+static const struct kernel_param_ops param_ops_nfs_timeout = {
+ .set = param_set_nfs_timeout,
+ .get = param_get_nfs_timeout,
+};
+#define param_check_nfs_timeout(name, p) __param_check(name, p, int);
+
+/*
+ * Expose nfs_mountpoint_expiry_timeout through the "nfs_timeout"
+ * parameter type (mode 0644).  The two description literals are
+ * concatenated by the compiler, so the first ends with a space.
+ */
+module_param(nfs_mountpoint_expiry_timeout, nfs_timeout, 0644);
+MODULE_PARM_DESC(nfs_mountpoint_expiry_timeout,
+		"Set the NFS automounted mountpoint timeout value (seconds). "
+		"Values <= 0 turn expiration off.");
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
new file mode 100644
index 000000000..c8374f74d
--- /dev/null
+++ b/fs/nfs/netns.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NFS-private data for each "struct net". Accessed with net_generic().
+ */
+
+#ifndef __NFS_NETNS_H__
+#define __NFS_NETNS_H__
+
+#include <linux/nfs4.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+/*
+ * Message embedded in struct nfs_net as bl_mount_reply.
+ * NOTE(review): presumably the reply exchanged over bl_device_pipe - confirm.
+ */
+struct bl_dev_msg {
+ int32_t status;
+ uint32_t major, minor;
+};
+
+struct nfs_netns_client;
+
+/* NFS-private state kept for each "struct net" (see nfs_net_id below). */
+struct nfs_net {
+ struct cache_detail *nfs_dns_resolve;
+ struct rpc_pipe *bl_device_pipe;
+ struct bl_dev_msg bl_mount_reply;
+ wait_queue_head_t bl_wq;
+ struct mutex bl_mutex;
+ struct list_head nfs_client_list;
+ struct list_head nfs_volume_list;
+#if IS_ENABLED(CONFIG_NFS_V4)
+ struct idr cb_ident_idr; /* Protected by nfs_client_lock */
+ unsigned short nfs_callback_tcpport;
+ unsigned short nfs_callback_tcpport6;
+ int cb_users[NFS4_MAX_MINOR_VERSION + 1]; /* one slot per minor version */
+#endif
+ struct nfs_netns_client *nfs_client;
+ spinlock_t nfs_client_lock;
+ ktime_t boot_time;
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *proc_nfsfs;
+#endif
+};
+
+extern unsigned int nfs_net_id;
+
+#endif
diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h
new file mode 100644
index 000000000..5ba00610a
--- /dev/null
+++ b/fs/nfs/nfs.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2012 Netapp, Inc. All rights reserved.
+ *
+ * Function and structures exported by the NFS module
+ * for use by NFS version-specific modules.
+ */
+#ifndef __LINUX_INTERNAL_NFS_H
+#define __LINUX_INTERNAL_NFS_H
+
+#include <linux/fs.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/nfs_xdr.h>
+
+/*
+ * Descriptor that an NFS version-specific module fills in and passes to
+ * register_nfs_version() / unregister_nfs_version().
+ */
+struct nfs_subversion {
+ struct module *owner; /* THIS_MODULE pointer */
+ struct file_system_type *nfs_fs; /* NFS filesystem type */
+ const struct rpc_version *rpc_vers; /* NFS version information */
+ const struct nfs_rpc_ops *rpc_ops; /* NFS operations */
+ const struct super_operations *sops; /* NFS Super operations */
+ const struct xattr_handler **xattr; /* NFS xattr handlers */
+ struct list_head list; /* List of NFS versions */
+};
+
+/* Lookup by version number, and the matching release. */
+struct nfs_subversion *get_nfs_version(unsigned int);
+void put_nfs_version(struct nfs_subversion *);
+/* Registration interface used by the version-specific modules. */
+void register_nfs_version(struct nfs_subversion *);
+void unregister_nfs_version(struct nfs_subversion *);
+
+#endif /* __LINUX_INTERNAL_NFS_H */
diff --git a/fs/nfs/nfs2super.c b/fs/nfs/nfs2super.c
new file mode 100644
index 000000000..467f21ee6
--- /dev/null
+++ b/fs/nfs/nfs2super.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2012 Netapp, Inc. All rights reserved.
+ */
+#include <linux/module.h>
+#include <linux/nfs_fs.h>
+#include "internal.h"
+#include "nfs.h"
+
+/* NFSv2 descriptor; .xattr is left unset (zero-initialised). */
+static struct nfs_subversion nfs_v2 = {
+ .owner = THIS_MODULE,
+ .nfs_fs = &nfs_fs_type,
+ .rpc_vers = &nfs_version2,
+ .rpc_ops = &nfs_v2_clientops,
+ .sops = &nfs_sops,
+};
+
+/* Module init: register the NFSv2 descriptor.  Always returns 0. */
+static int __init init_nfs_v2(void)
+{
+ register_nfs_version(&nfs_v2);
+ return 0;
+}
+
+/* Module exit: unregister the NFSv2 descriptor. */
+static void __exit exit_nfs_v2(void)
+{
+ unregister_nfs_version(&nfs_v2);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_nfs_v2);
+module_exit(exit_nfs_v2);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
new file mode 100644
index 000000000..b34196da1
--- /dev/null
+++ b/fs/nfs/nfs2xdr.c
@@ -0,0 +1,1156 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/nfs2xdr.c
+ *
+ * XDR functions to encode/decode NFS RPC arguments and results.
+ *
+ * Copyright (C) 1992, 1993, 1994 Rick Sladkey
+ * Copyright (C) 1996 Olaf Kirch
+ * 04 Aug 1998 Ion Badulescu <ionut@cs.columbia.edu>
+ * FIFO's need special handling in NFSv2
+ */
+
+#include <linux/param.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs2.h>
+#include <linux/nfs_fs.h>
+#include "nfstrace.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_XDR
+
+/* Mapping from NFS error code to "errno" error code. */
+#define errno_NFSERR_IO EIO
+
+/*
+ * Declare the space requirements for NFS arguments and replies as
+ * number of 32bit-words
+ */
+/* Basic data types */
+#define NFS_fhandle_sz (8)
+#define NFS_sattr_sz (8)
+#define NFS_filename_sz (1+(NFS2_MAXNAMLEN>>2))
+#define NFS_path_sz (1+(NFS2_MAXPATHLEN>>2))
+#define NFS_fattr_sz (17)
+#define NFS_info_sz (5)
+#define NFS_entry_sz (NFS_filename_sz+3)
+
+/* Call arguments, composed from the basic sizes above */
+#define NFS_diropargs_sz (NFS_fhandle_sz+NFS_filename_sz)
+#define NFS_removeargs_sz (NFS_fhandle_sz+NFS_filename_sz)
+#define NFS_sattrargs_sz (NFS_fhandle_sz+NFS_sattr_sz)
+#define NFS_readlinkargs_sz (NFS_fhandle_sz)
+#define NFS_readargs_sz (NFS_fhandle_sz+3)
+#define NFS_writeargs_sz (NFS_fhandle_sz+4)
+#define NFS_createargs_sz (NFS_diropargs_sz+NFS_sattr_sz)
+#define NFS_renameargs_sz (NFS_diropargs_sz+NFS_diropargs_sz)
+#define NFS_linkargs_sz (NFS_fhandle_sz+NFS_diropargs_sz)
+#define NFS_symlinkargs_sz (NFS_diropargs_sz+1+NFS_sattr_sz)
+#define NFS_readdirargs_sz (NFS_fhandle_sz+2)
+
+/* Reply results */
+#define NFS_attrstat_sz (1+NFS_fattr_sz)
+#define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz)
+#define NFS_readlinkres_sz (2+1)
+#define NFS_readres_sz (1+NFS_fattr_sz+1+1)
+#define NFS_writeres_sz (NFS_attrstat_sz)
+#define NFS_stat_sz (1)
+#define NFS_readdirres_sz (1+1)
+#define NFS_statfsres_sz (1+NFS_info_sz)
+
+static int nfs_stat_to_errno(enum nfs_stat);
+
+/*
+ * Encode/decode NFSv2 basic data types
+ *
+ * Basic NFSv2 data types are defined in section 2.3 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions. For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+
+/*
+ * Return the user namespace of @clnt's credential, or &init_user_ns
+ * when @clnt is NULL or has no credential attached.
+ */
+static struct user_namespace *rpc_userns(const struct rpc_clnt *clnt)
+{
+	if (!clnt || !clnt->cl_cred)
+		return &init_user_ns;
+	return clnt->cl_cred->user_ns;
+}
+
+/*
+ * Return the user namespace for the RPC client behind @rqstp's task,
+ * or &init_user_ns when the request has no task.
+ */
+static struct user_namespace *rpc_rqst_userns(const struct rpc_rqst *rqstp)
+{
+	if (!rqstp->rq_task)
+		return &init_user_ns;
+	return rpc_userns(rqstp->rq_task->tk_client);
+}
+
+/*
+ * typedef opaque nfsdata<>;
+ *
+ * Reads the 4-byte length and pulls that many bytes of page data.  If
+ * fewer bytes were received than the length claims, the count is
+ * clamped to what was received.  Returns the resulting count, or -EIO
+ * when the length word itself is missing.
+ */
+static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result)
+{
+	u32 count, recvd;
+	__be32 *p = xdr_inline_decode(xdr, 4);
+
+	if (unlikely(!p))
+		return -EIO;
+
+	count = be32_to_cpup(p);
+	recvd = xdr_read_pages(xdr, count);
+	if (unlikely(count > recvd)) {
+		dprintk("NFS: server cheating in read result: "
+			"count %u > recvd %u\n", count, recvd);
+		count = recvd;
+	}
+
+	result->eof = 0;	/* NFSv2 does not pass EOF flag on the wire. */
+	result->count = count;
+	return count;
+}
+
+/*
+ * enum stat {
+ *	NFS_OK = 0,
+ *	NFSERR_PERM = 1,
+ *	NFSERR_NOENT = 2,
+ *	NFSERR_IO = 5,
+ *	NFSERR_NXIO = 6,
+ *	NFSERR_ACCES = 13,
+ *	NFSERR_EXIST = 17,
+ *	NFSERR_NODEV = 19,
+ *	NFSERR_NOTDIR = 20,
+ *	NFSERR_ISDIR = 21,
+ *	NFSERR_FBIG = 27,
+ *	NFSERR_NOSPC = 28,
+ *	NFSERR_ROFS = 30,
+ *	NFSERR_NAMETOOLONG = 63,
+ *	NFSERR_NOTEMPTY = 66,
+ *	NFSERR_DQUOT = 69,
+ *	NFSERR_STALE = 70,
+ *	NFSERR_WFLUSH = 99
+ * };
+ *
+ * Stores the decoded status in @status; a non-OK status is also traced.
+ * Returns 0 once the status word is read (whatever its value), or -EIO
+ * when the word is missing from the stream.
+ */
+static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status)
+{
+	__be32 *p = xdr_inline_decode(xdr, 4);
+
+	if (unlikely(!p))
+		return -EIO;
+	if (likely(*p == cpu_to_be32(NFS_OK))) {
+		*status = 0;
+		return 0;
+	}
+	*status = be32_to_cpup(p);
+	trace_nfs_xdr_status(xdr, (int)*status);
+	return 0;
+}
+
+/*
+ * 2.3.2.  ftype
+ *
+ *	enum ftype {
+ *		NFNON = 0,
+ *		NFREG = 1,
+ *		NFDIR = 2,
+ *		NFBLK = 3,
+ *		NFCHR = 4,
+ *		NFLNK = 5
+ *	};
+ *
+ * Values above NF2FIFO are reported as NFBAD.  Returns the position
+ * just past the decoded word.
+ */
+static __be32 *xdr_decode_ftype(__be32 *p, u32 *type)
+{
+	u32 ftype = be32_to_cpup(p);
+
+	if (unlikely(ftype > NF2FIFO))
+		ftype = NFBAD;
+	*type = ftype;
+	return p + 1;
+}
+
+/*
+ * 2.3.3.  fhandle
+ *
+ *	typedef opaque fhandle[FHSIZE];
+ *
+ * Copies NFS2_FHSIZE bytes of @fh->data into the stream; no length
+ * word is written.
+ */
+static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+	__be32 *dst = xdr_reserve_space(xdr, NFS2_FHSIZE);
+
+	memcpy(dst, fh->data, NFS2_FHSIZE);
+}
+
+/*
+ * Read a fixed-size NFSv2 file handle into @fh and set its size to
+ * NFS2_FHSIZE.  Returns 0, or -EIO when the stream is too short.
+ */
+static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *src = xdr_inline_decode(xdr, NFS2_FHSIZE);
+
+	if (unlikely(!src))
+		return -EIO;
+	memcpy(fh->data, src, NFS2_FHSIZE);
+	fh->size = NFS2_FHSIZE;
+	return 0;
+}
+
+/*
+ * 2.3.4.  timeval
+ *
+ *	struct timeval {
+ *		unsigned int seconds;
+ *		unsigned int useconds;
+ *	};
+ *
+ * Writes seconds (truncated to 32 bits) and microseconds derived from
+ * tv_nsec.  Returns the position past the two words.
+ */
+static __be32 *xdr_encode_time(__be32 *p, const struct timespec64 *timep)
+{
+	p[0] = cpu_to_be32((u32)timep->tv_sec);
+	p[1] = (timep->tv_nsec != 0) ?
+		cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC) :
+		cpu_to_be32(0);
+	return p + 2;
+}
+
+/*
+ * Passing the invalid value useconds=1000000 is a Sun convention for
+ * "set to current server time".  It's needed to make permissions checks
+ * for the "touch" program across v2 mounts to Solaris and Irix servers
+ * work correctly.  See description of sattr in section 6.1 of "NFS
+ * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5.
+ */
+static __be32 *xdr_encode_current_server_time(__be32 *p,
+					      const struct timespec64 *timep)
+{
+	p[0] = cpu_to_be32(timep->tv_sec);
+	p[1] = cpu_to_be32(1000000);	/* the "current server time" marker */
+	return p + 2;
+}
+
+/*
+ * Decode a two-word timeval (seconds, microseconds) into @timep,
+ * scaling the second word to nanoseconds.  Returns the position past
+ * the two words.
+ */
+static __be32 *xdr_decode_time(__be32 *p, struct timespec64 *timep)
+{
+	timep->tv_sec = be32_to_cpup(p);
+	timep->tv_nsec = be32_to_cpup(p + 1) * NSEC_PER_USEC;
+	return p + 2;
+}
+
+/*
+ * 2.3.5. fattr
+ *
+ * struct fattr {
+ * ftype type;
+ * unsigned int mode;
+ * unsigned int nlink;
+ * unsigned int uid;
+ * unsigned int gid;
+ * unsigned int size;
+ * unsigned int blocksize;
+ * unsigned int rdev;
+ * unsigned int blocks;
+ * unsigned int fsid;
+ * unsigned int fileid;
+ * timeval atime;
+ * timeval mtime;
+ * timeval ctime;
+ * };
+ *
+ * Decodes the fixed-size fattr (NFS_fattr_sz words) into @fattr, mapping
+ * the wire uid/gid through @userns.  Returns 0 on success, -EIO when the
+ * stream is too short, or -EINVAL when the uid or gid does not map to a
+ * valid id in @userns.  Note that fattr->valid is set before the ids are
+ * checked.
+ */
+static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+ struct user_namespace *userns)
+{
+ u32 rdev, type;
+ __be32 *p;
+
+ /* The whole structure is pulled in one go; fields are read in wire order. */
+ p = xdr_inline_decode(xdr, NFS_fattr_sz << 2);
+ if (unlikely(!p))
+ return -EIO;
+
+ fattr->valid |= NFS_ATTR_FATTR_V2;
+
+ p = xdr_decode_ftype(p, &type);
+
+ fattr->mode = be32_to_cpup(p++);
+ fattr->nlink = be32_to_cpup(p++);
+ fattr->uid = make_kuid(userns, be32_to_cpup(p++));
+ if (!uid_valid(fattr->uid))
+ goto out_uid;
+ fattr->gid = make_kgid(userns, be32_to_cpup(p++));
+ if (!gid_valid(fattr->gid))
+ goto out_gid;
+
+ fattr->size = be32_to_cpup(p++);
+ fattr->du.nfs2.blocksize = be32_to_cpup(p++);
+
+ rdev = be32_to_cpup(p++);
+ fattr->rdev = new_decode_dev(rdev);
+ /* A char device with rdev NFS2_FIFO_DEV is presented as a FIFO. */
+ if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) {
+ fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
+ fattr->rdev = 0;
+ }
+
+ fattr->du.nfs2.blocks = be32_to_cpup(p++);
+ /* Only one fsid word is on the wire; the minor part is zeroed. */
+ fattr->fsid.major = be32_to_cpup(p++);
+ fattr->fsid.minor = 0;
+ fattr->fileid = be32_to_cpup(p++);
+
+ p = xdr_decode_time(p, &fattr->atime);
+ p = xdr_decode_time(p, &fattr->mtime);
+ xdr_decode_time(p, &fattr->ctime);
+ /* The change attribute is derived from the decoded ctime. */
+ fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime);
+
+ return 0;
+out_uid:
+ dprintk("NFS: returned invalid uid\n");
+ return -EINVAL;
+out_gid:
+ dprintk("NFS: returned invalid gid\n");
+ return -EINVAL;
+}
+
+/*
+ * 2.3.6. sattr
+ *
+ * struct sattr {
+ * unsigned int mode;
+ * unsigned int uid;
+ * unsigned int gid;
+ * unsigned int size;
+ * timeval atime;
+ * timeval mtime;
+ * };
+ */
+
+#define NFS2_SATTR_NOT_SET (0xffffffff)
+
+/*
+ * Encode a timeval whose two words both carry NFS2_SATTR_NOT_SET.
+ * Returns the position past the two words.
+ */
+static __be32 *xdr_time_not_set(__be32 *p)
+{
+	p[0] = cpu_to_be32(NFS2_SATTR_NOT_SET);
+	p[1] = cpu_to_be32(NFS2_SATTR_NOT_SET);
+	return p + 2;
+}
+
+/*
+ * Encode an sattr (NFS_sattr_sz words) from @attr.  Each field not
+ * selected in attr->ia_valid is sent as NFS2_SATTR_NOT_SET.  uid/gid are
+ * translated through @userns.
+ *
+ * NOTE(review): the xdr_reserve_space() result is used unchecked here;
+ * presumably the send buffer is pre-sized from p_arglen - confirm.
+ */
+static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr,
+ struct user_namespace *userns)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, NFS_sattr_sz << 2);
+
+ if (attr->ia_valid & ATTR_MODE)
+ *p++ = cpu_to_be32(attr->ia_mode);
+ else
+ *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+ if (attr->ia_valid & ATTR_UID)
+ *p++ = cpu_to_be32(from_kuid_munged(userns, attr->ia_uid));
+ else
+ *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+ if (attr->ia_valid & ATTR_GID)
+ *p++ = cpu_to_be32(from_kgid_munged(userns, attr->ia_gid));
+ else
+ *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+ /* The size field is 32 bits on the wire; ia_size is truncated. */
+ if (attr->ia_valid & ATTR_SIZE)
+ *p++ = cpu_to_be32((u32)attr->ia_size);
+ else
+ *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
+
+ /*
+ * Times: an explicit value when *_SET is given, the "current server
+ * time" marker when only ATTR_ATIME/ATTR_MTIME is given, else not set.
+ */
+ if (attr->ia_valid & ATTR_ATIME_SET)
+ p = xdr_encode_time(p, &attr->ia_atime);
+ else if (attr->ia_valid & ATTR_ATIME)
+ p = xdr_encode_current_server_time(p, &attr->ia_atime);
+ else
+ p = xdr_time_not_set(p);
+ if (attr->ia_valid & ATTR_MTIME_SET)
+ xdr_encode_time(p, &attr->ia_mtime);
+ else if (attr->ia_valid & ATTR_MTIME)
+ xdr_encode_current_server_time(p, &attr->ia_mtime);
+ else
+ xdr_time_not_set(p);
+}
+
+/*
+ * 2.3.7.  filename
+ *
+ *	typedef string filename<MAXNAMLEN>;
+ *
+ * Warns once if @length exceeds NFS2_MAXNAMLEN, then encodes the name
+ * as a counted opaque regardless.
+ */
+static void encode_filename(struct xdr_stream *xdr,
+			    const char *name, u32 length)
+{
+	__be32 *dst;
+
+	WARN_ON_ONCE(length > NFS2_MAXNAMLEN);
+	dst = xdr_reserve_space(xdr, 4 + length);
+	xdr_encode_opaque(dst, name, length);
+}
+
+/*
+ * Decode a filename<MAXNAMLEN> without copying it: on success *@name is
+ * the pointer returned by xdr_inline_decode() for the name bytes and
+ * *@length is the decoded byte count.
+ *
+ * Returns 0 on success, -EIO when the stream is too short, or
+ * -ENAMETOOLONG when the length exceeds NFS2_MAXNAMLEN.
+ */
+static int decode_filename_inline(struct xdr_stream *xdr,
+				  const char **name, u32 *length)
+{
+	__be32 *p;
+	u32 count;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		return -EIO;
+	count = be32_to_cpup(p);
+	/* Use the NFSv2 limit here, matching encode_filename(). */
+	if (count > NFS2_MAXNAMLEN)
+		goto out_nametoolong;
+	p = xdr_inline_decode(xdr, count);
+	if (unlikely(!p))
+		return -EIO;
+	*name = (const char *)p;
+	*length = count;
+	return 0;
+out_nametoolong:
+	dprintk("NFS: returned filename too long: %u\n", count);
+	return -ENAMETOOLONG;
+}
+
+/*
+ * 2.3.8.  path
+ *
+ *	typedef string path<MAXPATHLEN>;
+ *
+ * Writes the length word inline and attaches @pages as the path data.
+ */
+static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length)
+{
+	__be32 *lenp = xdr_reserve_space(xdr, 4);
+
+	*lenp = cpu_to_be32(length);
+	xdr_write_pages(xdr, pages, 0, length);
+}
+
+/*
+ * Decode a path whose data lands in the reply pages and NUL-terminate
+ * it there.  Returns 0, -EIO when the stream is short or fewer bytes
+ * arrived than announced, or -ENAMETOOLONG when the announced length
+ * does not fit the page buffer or exceeds NFS_MAXPATHLEN.
+ */
+static int decode_path(struct xdr_stream *xdr)
+{
+	u32 length, recvd;
+	__be32 *p = xdr_inline_decode(xdr, 4);
+
+	if (unlikely(!p))
+		return -EIO;
+
+	length = be32_to_cpup(p);
+	if (unlikely(length >= xdr->buf->page_len ||
+		     length > NFS_MAXPATHLEN)) {
+		dprintk("NFS: returned pathname too long: %u\n", length);
+		return -ENAMETOOLONG;
+	}
+
+	recvd = xdr_read_pages(xdr, length);
+	if (unlikely(length > recvd)) {
+		dprintk("NFS: server cheating in pathname result: "
+			"length %u > received %u\n", length, recvd);
+		return -EIO;
+	}
+
+	xdr_terminate_string(xdr->buf, length);
+	return 0;
+}
+
+/*
+ * 2.3.9.  attrstat
+ *
+ *	union attrstat switch (stat status) {
+ *	case NFS_OK:
+ *		fattr attributes;
+ *	default:
+ *		void;
+ *	};
+ *
+ * If @op_status is non-NULL it receives the raw status.  Returns the
+ * decode_fattr() result for NFS_OK, otherwise the errno mapped from
+ * the status.
+ */
+static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result,
+			   __u32 *op_status,
+			   struct user_namespace *userns)
+{
+	enum nfs_stat status;
+	int error = decode_stat(xdr, &status);
+
+	if (unlikely(error))
+		return error;
+	if (op_status)
+		*op_status = status;
+	if (status != NFS_OK)
+		return nfs_stat_to_errno(status);
+	return decode_fattr(xdr, result, userns);
+}
+
+/*
+ * 2.3.10. diropargs
+ *
+ * struct diropargs {
+ * fhandle dir;
+ * filename name;
+ * };
+ *
+ * Encodes the directory handle @fh followed by @name of @length bytes.
+ */
+static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh,
+ const char *name, u32 length)
+{
+ encode_fhandle(xdr, fh);
+ encode_filename(xdr, name, length);
+}
+
+/*
+ * 2.3.11.  diropres
+ *
+ *	union diropres switch (stat status) {
+ *	case NFS_OK:
+ *		struct {
+ *			fhandle file;
+ *			fattr attributes;
+ *		} diropok;
+ *	default:
+ *		void;
+ *	};
+ *
+ * decode_diropok() handles the NFS_OK arm: file handle, then attributes.
+ */
+static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result,
+			  struct user_namespace *userns)
+{
+	int error = decode_fhandle(xdr, result->fh);
+
+	if (unlikely(error))
+		return error;
+	return decode_fattr(xdr, result->fattr, userns);
+}
+
+/*
+ * Decode a full diropres: the status word, then the diropok body when
+ * the status is NFS_OK.  A non-OK status is returned as its errno.
+ */
+static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result,
+			   struct user_namespace *userns)
+{
+	enum nfs_stat status;
+	int error = decode_stat(xdr, &status);
+
+	if (unlikely(error))
+		return error;
+	if (status != NFS_OK)
+		return nfs_stat_to_errno(status);
+	return decode_diropok(xdr, result, userns);
+}
+
+
+/*
+ * NFSv2 XDR encode functions
+ *
+ * NFSv2 argument types are defined in section 2.2 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
+ */
+
+/* Encoder for calls whose only argument is a file handle (@data). */
+static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req,
+				 struct xdr_stream *xdr,
+				 const void *data)
+{
+	encode_fhandle(xdr, (const struct nfs_fh *)data);
+}
+
+/*
+ * 2.2.3.  sattrargs
+ *
+ *	struct sattrargs {
+ *		fhandle file;
+ *		sattr attributes;
+ *	};
+ */
+static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const void *data)
+{
+	const struct nfs_sattrargs *sa = data;
+	struct user_namespace *userns = rpc_rqst_userns(req);
+
+	encode_fhandle(xdr, sa->fh);
+	encode_sattr(xdr, sa->sattr, userns);
+}
+
+/* Encoder for calls taking a (directory handle, name) pair. */
+static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const void *data)
+{
+	const struct nfs_diropargs *dirop = data;
+
+	encode_diropargs(xdr, dirop->fh, dirop->name, dirop->len);
+}
+
+/*
+ * Encode the READLINK argument (a file handle) and set up the caller's
+ * pages to receive the reply data.
+ */
+static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
+				      struct xdr_stream *xdr,
+				      const void *data)
+{
+	const struct nfs_readlinkargs *rl = data;
+
+	encode_fhandle(xdr, rl->fh);
+	rpc_prepare_reply_pages(req, rl->pages, rl->pgbase,
+				rl->pglen, NFS_readlinkres_sz);
+}
+
+/*
+ * 2.2.7.  readargs
+ *
+ *	struct readargs {
+ *		fhandle file;
+ *		unsigned offset;
+ *		unsigned count;
+ *		unsigned totalcount;
+ *	};
+ *
+ * Offset and count are narrowed to 32 bits; totalcount repeats count.
+ */
+static void encode_readargs(struct xdr_stream *xdr,
+			    const struct nfs_pgio_args *args)
+{
+	u32 off = args->offset;
+	u32 len = args->count;
+	__be32 *p;
+
+	encode_fhandle(xdr, args->fh);
+
+	p = xdr_reserve_space(xdr, 4 + 4 + 4);
+	p[0] = cpu_to_be32(off);	/* offset */
+	p[1] = cpu_to_be32(len);	/* count */
+	p[2] = cpu_to_be32(len);	/* totalcount */
+}
+
+/*
+ * Encode a READ call, set up the caller's pages to receive the data
+ * and flag the receive buffer with XDRBUF_READ.
+ */
+static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
+				  struct xdr_stream *xdr,
+				  const void *data)
+{
+	const struct nfs_pgio_args *rd = data;
+
+	encode_readargs(xdr, rd);
+	rpc_prepare_reply_pages(req, rd->pages, rd->pgbase,
+				rd->count, NFS_readres_sz);
+	req->rq_rcv_buf.flags |= XDRBUF_READ;
+}
+
+/*
+ * 2.2.9.  writeargs
+ *
+ *	struct writeargs {
+ *		fhandle file;
+ *		unsigned beginoffset;
+ *		unsigned offset;
+ *		unsigned totalcount;
+ *		nfsdata data;
+ *	};
+ *
+ * beginoffset repeats offset and totalcount repeats count; the payload
+ * pages are attached after the nfsdata length word.
+ */
+static void encode_writeargs(struct xdr_stream *xdr,
+			     const struct nfs_pgio_args *args)
+{
+	u32 off = args->offset;
+	u32 len = args->count;
+	__be32 *p;
+
+	encode_fhandle(xdr, args->fh);
+
+	p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
+	p[0] = cpu_to_be32(off);	/* beginoffset */
+	p[1] = cpu_to_be32(off);	/* offset */
+	p[2] = cpu_to_be32(len);	/* totalcount */
+	p[3] = cpu_to_be32(len);	/* nfsdata length */
+	xdr_write_pages(xdr, args->pages, args->pgbase, len);
+}
+
+/* Encode a WRITE call and flag the send buffer with XDRBUF_WRITE. */
+static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const void *data)
+{
+	encode_writeargs(xdr, (const struct nfs_pgio_args *)data);
+	xdr->buf->flags |= XDRBUF_WRITE;
+}
+
+/*
+ * 2.2.10.  createargs
+ *
+ *	struct createargs {
+ *		diropargs where;
+ *		sattr attributes;
+ *	};
+ */
+static void nfs2_xdr_enc_createargs(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const void *data)
+{
+	const struct nfs_createargs *cr = data;
+	struct user_namespace *userns = rpc_rqst_userns(req);
+
+	encode_diropargs(xdr, cr->fh, cr->name, cr->len);
+	encode_sattr(xdr, cr->sattr, userns);
+}
+
+/* Encode the (directory handle, name) pair of a remove request. */
+static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const void *data)
+{
+	const struct nfs_removeargs *rm = data;
+
+	encode_diropargs(xdr, rm->fh, rm->name.name, rm->name.len);
+}
+
+/*
+ * 2.2.12.  renameargs
+ *
+ *	struct renameargs {
+ *		diropargs from;
+ *		diropargs to;
+ *	};
+ */
+static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const void *data)
+{
+	const struct nfs_renameargs *rn = data;
+
+	/* "from" first, then "to" */
+	encode_diropargs(xdr, rn->old_dir,
+			 rn->old_name->name, rn->old_name->len);
+	encode_diropargs(xdr, rn->new_dir,
+			 rn->new_name->name, rn->new_name->len);
+}
+
+/*
+ * 2.2.13.  linkargs
+ *
+ *	struct linkargs {
+ *		fhandle from;
+ *		diropargs to;
+ *	};
+ */
+static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req,
+				  struct xdr_stream *xdr,
+				  const void *data)
+{
+	const struct nfs_linkargs *ln = data;
+
+	encode_fhandle(xdr, ln->fromfh);
+	encode_diropargs(xdr, ln->tofh, ln->toname, ln->tolen);
+}
+
+/*
+ * 2.2.14.  symlinkargs
+ *
+ *	struct symlinkargs {
+ *		diropargs from;
+ *		path to;
+ *		sattr attributes;
+ *	};
+ */
+static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const void *data)
+{
+	const struct nfs_symlinkargs *sl = data;
+
+	encode_diropargs(xdr, sl->fromfh, sl->fromname, sl->fromlen);
+	encode_path(xdr, sl->pages, sl->pathlen);
+	encode_sattr(xdr, sl->sattr, rpc_rqst_userns(req));
+}
+
+/*
+ * 2.2.17.  readdirargs
+ *
+ *	struct readdirargs {
+ *		fhandle dir;
+ *		nfscookie cookie;
+ *		unsigned count;
+ *	};
+ */
+static void encode_readdirargs(struct xdr_stream *xdr,
+			       const struct nfs_readdirargs *args)
+{
+	__be32 *p;
+
+	encode_fhandle(xdr, args->fh);
+
+	p = xdr_reserve_space(xdr, 4 + 4);
+	p[0] = cpu_to_be32(args->cookie);
+	p[1] = cpu_to_be32(args->count);
+}
+
+/*
+ * Encode a READDIR call and set up the caller's pages (from offset 0)
+ * to receive the reply data.
+ */
+static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     const void *data)
+{
+	const struct nfs_readdirargs *rd = data;
+
+	encode_readdirargs(xdr, rd);
+	rpc_prepare_reply_pages(req, rd->pages, 0,
+				rd->count, NFS_readdirres_sz);
+}
+
+/*
+ * NFSv2 XDR decode functions
+ *
+ * NFSv2 result types are defined in section 2.2 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
+ */
+
+/*
+ * Decode a reply that consists of a status word only.  Returns 0 for
+ * NFS_OK, the mapped errno for any other status, or the decode_stat()
+ * error.
+ */
+static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr,
+			     void *__unused)
+{
+	enum nfs_stat status;
+	int error = decode_stat(xdr, &status);
+
+	if (unlikely(error))
+		return error;
+	if (status != NFS_OK)
+		return nfs_stat_to_errno(status);
+	return 0;
+}
+
+/*
+ * Decode an attrstat reply into @result (passed to decode_attrstat() as
+ * the nfs_fattr destination); the raw status is not reported back.
+ */
+static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *result)
+{
+ return decode_attrstat(xdr, result, NULL, rpc_rqst_userns(req));
+}
+
+/*
+ * Decode a diropres reply into @result (passed to decode_diropres() as
+ * the nfs_diropok destination), using the request's user namespace.
+ */
+static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *result)
+{
+ return decode_diropres(xdr, result, rpc_rqst_userns(req));
+}
+
+/*
+ * 2.2.6.  readlinkres
+ *
+ *	union readlinkres switch (stat status) {
+ *	case NFS_OK:
+ *		path data;
+ *	default:
+ *		void;
+ *	};
+ *
+ * The path is only decoded when the server reported NFS_OK; any
+ * other status is mapped to a local errno.
+ */
+static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req,
+				    struct xdr_stream *xdr, void *__unused)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		return error;
+	if (status != NFS_OK)
+		return nfs_stat_to_errno(status);
+	return decode_path(xdr);
+}
+
+/*
+ * 2.2.7.  readres
+ *
+ *	union readres switch (stat status) {
+ *	case NFS_OK:
+ *		fattr		attributes;
+ *		nfsdata		data;
+ *	default:
+ *		void;
+ *	};
+ *
+ * The raw NFS status is recorded in result->op_status as soon as it
+ * has been decoded, before it is checked against NFS_OK.
+ */
+static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				void *data)
+{
+	struct nfs_pgio_res *res = data;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		return error;
+
+	res->op_status = status;
+	if (status != NFS_OK)
+		return nfs_stat_to_errno(status);
+
+	error = decode_fattr(xdr, res->fattr, rpc_rqst_userns(req));
+	if (unlikely(error))
+		return error;
+
+	return decode_nfsdata(xdr, res);
+}
+
+/*
+ * WRITE reply decoder.  The reply body is an attrstat; the NFS status
+ * is stored through &res->op_status by decode_attrstat().
+ */
+static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 void *data)
+{
+	struct nfs_pgio_res *res = data;
+
+	/* NFSv2 has no unstable writes: every write is "file sync". */
+	res->verf->committed = NFS_FILE_SYNC;
+
+	return decode_attrstat(xdr, res->fattr, &res->op_status,
+			       rpc_rqst_userns(req));
+}
+
+/**
+ * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in
+ *                      the local page cache.
+ * @xdr: XDR stream where entry resides
+ * @entry: buffer to fill in with entry data
+ * @plus: boolean indicating whether this should be a readdirplus entry
+ *
+ * Returns zero if successful, otherwise a negative errno value is
+ * returned: -EAGAIN when the stream runs short (or the list ends
+ * without the eof flag set), -EBADCOOKIE once the end-of-directory
+ * marker has been seen (entry->eof is set to 1 in that case), and
+ * -ENAMETOOLONG when the filename decoder reports it.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ *
+ * 2.2.17.  entry
+ *
+ *	struct entry {
+ *		unsigned	fileid;
+ *		filename	name;
+ *		nfscookie	cookie;
+ *		entry		*nextentry;
+ *	};
+ */
+int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+		       bool plus)
+{
+	__be32 *word;
+	int status;
+
+	/* "Another entry follows" discriminator. */
+	word = xdr_inline_decode(xdr, 4);
+	if (unlikely(word == NULL))
+		return -EAGAIN;
+	if (*word == xdr_zero) {
+		/* End of the entry list: the next word is the eof flag. */
+		word = xdr_inline_decode(xdr, 4);
+		if (unlikely(word == NULL))
+			return -EAGAIN;
+		if (*word == xdr_zero)
+			return -EAGAIN;
+		entry->eof = 1;
+		return -EBADCOOKIE;
+	}
+
+	/* fileid */
+	word = xdr_inline_decode(xdr, 4);
+	if (unlikely(word == NULL))
+		return -EAGAIN;
+	entry->ino = be32_to_cpup(word);
+
+	/* name: only -ENAMETOOLONG is passed through unchanged */
+	status = decode_filename_inline(xdr, &entry->name, &entry->len);
+	if (unlikely(status)) {
+		if (status == -ENAMETOOLONG)
+			return -ENAMETOOLONG;
+		return -EAGAIN;
+	}
+
+	/*
+	 * The type (size and byte order) of nfscookie isn't defined in
+	 * RFC 1094.  This implementation assumes that it's an XDR uint32.
+	 */
+	entry->prev_cookie = entry->cookie;
+	word = xdr_inline_decode(xdr, 4);
+	if (unlikely(word == NULL))
+		return -EAGAIN;
+	entry->cookie = be32_to_cpup(word);
+
+	entry->d_type = DT_UNKNOWN;
+
+	return 0;
+}
+
+/*
+ * 2.2.17. readdirres
+ *
+ * union readdirres switch (stat status) {
+ * case NFS_OK:
+ * struct {
+ * entry *entries;
+ * bool eof;
+ * } readdirok;
+ * default:
+ * void;
+ * };
+ *
+ * Read the directory contents into the page cache, but don't
+ * touch them. The actual decoding is done by nfs2_decode_dirent()
+ * during subsequent nfs_readdir() calls.
+ *
+ * decode_readdirok() asks xdr_read_pages() for the full page_len of
+ * the receive buffer and returns that call's result unchanged.
+ */
+static int decode_readdirok(struct xdr_stream *xdr)
+{
+ return xdr_read_pages(xdr, xdr->buf->page_len);
+}
+
+/*
+ * READDIR reply decoder: check the status word and, on NFS_OK, let
+ * decode_readdirok() deal with the page data.
+ */
+static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req,
+				   struct xdr_stream *xdr, void *__unused)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		return error;
+	if (status != NFS_OK)
+		return nfs_stat_to_errno(status);
+	return decode_readdirok(xdr);
+}
+
+/*
+ * 2.2.18.  statfsres
+ *
+ *	union statfsres (stat status) {
+ *	case NFS_OK:
+ *		struct {
+ *			unsigned tsize;
+ *			unsigned bsize;
+ *			unsigned blocks;
+ *			unsigned bfree;
+ *			unsigned bavail;
+ *		} info;
+ *	default:
+ *		void;
+ *	};
+ *
+ * decode_info() pulls the five counters out of the stream in one go
+ * (NFS_info_sz words) and returns -EIO if they are not all there.
+ */
+static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result)
+{
+	__be32 *words;
+
+	words = xdr_inline_decode(xdr, NFS_info_sz << 2);
+	if (unlikely(words == NULL))
+		return -EIO;
+
+	result->tsize  = be32_to_cpup(&words[0]);
+	result->bsize  = be32_to_cpup(&words[1]);
+	result->blocks = be32_to_cpup(&words[2]);
+	result->bfree  = be32_to_cpup(&words[3]);
+	result->bavail = be32_to_cpup(&words[4]);
+	return 0;
+}
+
+/*
+ * STATFS reply decoder: the info block is decoded into @result only
+ * when the server answered NFS_OK.
+ */
+static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  void *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		return error;
+	if (status != NFS_OK)
+		return nfs_stat_to_errno(status);
+	return decode_info(xdr, result);
+}
+
+
+/*
+ * We need to translate between nfs status return values and
+ * the local errno values which may not be the same.
+ *
+ * Each row pairs an NFS status code with the negative errno handed
+ * back to callers. The final { -1, -EIO } row is a sentinel:
+ * nfs_stat_to_errno() stops scanning when it reaches stat == -1 and
+ * returns that row's errno for any status it did not find.
+ */
+static const struct {
+ int stat;
+ int errno;
+} nfs_errtbl[] = {
+ { NFS_OK, 0 },
+ { NFSERR_PERM, -EPERM },
+ { NFSERR_NOENT, -ENOENT },
+ { NFSERR_IO, -errno_NFSERR_IO},
+ { NFSERR_NXIO, -ENXIO },
+/* { NFSERR_EAGAIN, -EAGAIN }, */
+ { NFSERR_ACCES, -EACCES },
+ { NFSERR_EXIST, -EEXIST },
+ { NFSERR_XDEV, -EXDEV },
+ { NFSERR_NODEV, -ENODEV },
+ { NFSERR_NOTDIR, -ENOTDIR },
+ { NFSERR_ISDIR, -EISDIR },
+ { NFSERR_INVAL, -EINVAL },
+ { NFSERR_FBIG, -EFBIG },
+ { NFSERR_NOSPC, -ENOSPC },
+ { NFSERR_ROFS, -EROFS },
+ { NFSERR_MLINK, -EMLINK },
+ { NFSERR_NAMETOOLONG, -ENAMETOOLONG },
+ { NFSERR_NOTEMPTY, -ENOTEMPTY },
+ { NFSERR_DQUOT, -EDQUOT },
+ { NFSERR_STALE, -ESTALE },
+ { NFSERR_REMOTE, -EREMOTE },
+#ifdef EWFLUSH
+ { NFSERR_WFLUSH, -EWFLUSH },
+#endif
+ { NFSERR_BADHANDLE, -EBADHANDLE },
+ { NFSERR_NOT_SYNC, -ENOTSYNC },
+ { NFSERR_BAD_COOKIE, -EBADCOOKIE },
+ { NFSERR_NOTSUPP, -ENOTSUPP },
+ { NFSERR_TOOSMALL, -ETOOSMALL },
+ { NFSERR_SERVERFAULT, -EREMOTEIO },
+ { NFSERR_BADTYPE, -EBADTYPE },
+ { NFSERR_JUKEBOX, -EJUKEBOX },
+ { -1, -EIO }
+};
+
+/**
+ * nfs_stat_to_errno - convert an NFS status code to a local errno
+ * @status: NFS status code to convert
+ *
+ * Walks nfs_errtbl[] up to its -1 terminator.  Returns the matching
+ * local errno value, or the terminator's errno (-EIO) if the NFS
+ * status code is not recognized.  This function is used jointly by
+ * NFSv2 and NFSv3.
+ */
+static int nfs_stat_to_errno(enum nfs_stat status)
+{
+	int idx = 0;
+
+	while (nfs_errtbl[idx].stat != -1) {
+		if (nfs_errtbl[idx].stat == (int)status)
+			return nfs_errtbl[idx].errno;
+		idx++;
+	}
+
+	/* idx now addresses the sentinel row. */
+	dprintk("NFS: Unrecognized nfs status value: %u\n", status);
+	return nfs_errtbl[idx].errno;
+}
+
+#define PROC(proc, argtype, restype, timer) \
+[NFSPROC_##proc] = { \
+ .p_proc = NFSPROC_##proc, \
+ .p_encode = nfs2_xdr_enc_##argtype, \
+ .p_decode = nfs2_xdr_dec_##restype, \
+ .p_arglen = NFS_##argtype##_sz, \
+ .p_replen = NFS_##restype##_sz, \
+ .p_timer = timer, \
+ .p_statidx = NFSPROC_##proc, \
+ .p_name = #proc, \
+ }
+const struct rpc_procinfo nfs_procedures[] = { /* slot NFSPROC_<proc> is filled by PROC(): encoder/decoder and size names are pasted together from argtype/restype */
+ PROC(GETATTR, fhandle, attrstat, 1),
+ PROC(SETATTR, sattrargs, attrstat, 0),
+ PROC(LOOKUP, diropargs, diropres, 2),
+ PROC(READLINK, readlinkargs, readlinkres, 3),
+ PROC(READ, readargs, readres, 3),
+ PROC(WRITE, writeargs, writeres, 4),
+ PROC(CREATE, createargs, diropres, 0),
+ PROC(REMOVE, removeargs, stat, 0),
+ PROC(RENAME, renameargs, stat, 0),
+ PROC(LINK, linkargs, stat, 0),
+ PROC(SYMLINK, symlinkargs, stat, 0),
+ PROC(MKDIR, createargs, diropres, 0), /* shares the CREATE encoder and decoder */
+ PROC(RMDIR, diropargs, stat, 0),
+ PROC(READDIR, readdirargs, readdirres, 3),
+ PROC(STATFS, fhandle, statfsres, 0),
+};
+
+/* Call counters for NFSv2, one slot per nfs_procedures[] element. */
+static unsigned int nfs_version2_counts[ARRAY_SIZE(nfs_procedures)];
+
+/* RPC version descriptor tying version number 2 to nfs_procedures[]. */
+const struct rpc_version nfs_version2 = {
+	.number			= 2,
+	.procs			= nfs_procedures,
+	.nrprocs		= ARRAY_SIZE(nfs_procedures),
+	.counts			= nfs_version2_counts,
+};
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
new file mode 100644
index 000000000..1b950b66b
--- /dev/null
+++ b/fs/nfs/nfs3_fs.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2014 Anna Schumaker.
+ *
+ * NFSv3-specific filesystem definitions and declarations
+ *
+ * Declarations are grouped by the source file named in the comment
+ * above each group.
+ */
+#ifndef __LINUX_FS_NFS_NFS3_FS_H
+#define __LINUX_FS_NFS_NFS3_FS_H
+
+/*
+ * nfs3acl.c
+ *
+ * Without CONFIG_NFS_V3_ACL, nfs3_proc_setacls() is an inline stub
+ * that returns 0 and nfs3_listxattr is defined as NULL.
+ */
+#ifdef CONFIG_NFS_V3_ACL
+extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
+extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+ struct posix_acl *dfacl);
+extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
+extern const struct xattr_handler *nfs3_xattr_handlers[];
+#else
+static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+ struct posix_acl *dfacl)
+{
+ return 0;
+}
+#define nfs3_listxattr NULL
+#endif /* CONFIG_NFS_V3_ACL */
+
+/* nfs3client.c */
+struct nfs_server *nfs3_create_server(struct fs_context *);
+struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
+ struct nfs_fattr *, rpc_authflavor_t);
+
+/* nfs3super.c */
+extern struct nfs_subversion nfs_v3;
+
+#endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
new file mode 100644
index 000000000..c6c863382
--- /dev/null
+++ b/fs/nfs/nfs3acl.c
@@ -0,0 +1,344 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs_fs.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/nfsacl.h>
+
+#include "internal.h"
+#include "nfs3_fs.h"
+
+#define NFSDBG_FACILITY NFSDBG_PROC
+
+/*
+ * nfs3_prepare_get_acl, nfs3_complete_get_acl, nfs3_abort_get_acl: Helpers for
+ * caching get_acl results in a race-free way.  See fs/posix_acl.c:get_acl()
+ * for explanations.
+ *
+ * nfs3_prepare_get_acl() tries to replace ACL_NOT_CACHED in *p with
+ * the current task's sentinel.
+ */
+static void nfs3_prepare_get_acl(struct posix_acl **p)
+{
+	struct posix_acl *mine = uncached_acl_sentinel(current);
+
+	/*
+	 * The outcome is deliberately ignored: if the swap fails we are
+	 * not the first reader, or a sentinel is already in place.
+	 */
+	(void)cmpxchg(p, ACL_NOT_CACHED, mine);
+}
+
+/*
+ * Try to publish @acl in the cache slot *p.  A reference is taken up
+ * front for the slot and dropped again if the slot no longer holds
+ * the current task's sentinel.
+ */
+static void nfs3_complete_get_acl(struct posix_acl **p, struct posix_acl *acl)
+{
+	struct posix_acl *mine = uncached_acl_sentinel(current);
+	struct posix_acl *seen;
+
+	posix_acl_dup(acl);
+	seen = cmpxchg(p, mine, acl);
+	if (seen != mine)
+		posix_acl_release(acl);	/* lost the race: do not cache */
+}
+
+/*
+ * Failure path: if *p still holds the current task's sentinel, put
+ * ACL_NOT_CACHED back; otherwise leave the slot alone.
+ */
+static void nfs3_abort_get_acl(struct posix_acl **p)
+{
+	struct posix_acl *mine = uncached_acl_sentinel(current);
+
+	cmpxchg(p, mine, ACL_NOT_CACHED);
+}
+
+/**
+ * nfs3_get_acl - fetch a POSIX ACL with the NFSACL GETACL procedure
+ * @inode: inode whose ACL is wanted
+ * @type: ACL_TYPE_ACCESS selects the access ACL; any other value
+ *        yields the default ACL
+ *
+ * Returns the requested ACL, which may be NULL, or an ERR_PTR():
+ * -EOPNOTSUPP if the server lacks NFS_CAP_ACLS or rejects the call,
+ * -ENOMEM if no fattr could be allocated, -EIO if the reply does not
+ * cover everything that was asked for, or the error from inode
+ * revalidation or the RPC itself.  NULL is also returned without any
+ * RPC when there is nothing to ask for (default ACL of a
+ * non-directory).
+ *
+ * On success both cache slots of @inode are updated: each ACL present
+ * in the reply mask is offered to the cache, the other slot is
+ * forgotten.  On failure any sentinel this task installed is removed.
+ */
+struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct page *pages[NFSACL_MAXPAGES] = { };
+	struct nfs3_getaclargs args = {
+		.fh = NFS_FH(inode),
+		/* The xdr layer may allocate pages here. */
+		.pages = pages,
+	};
+	struct nfs3_getaclres res = {
+		NULL,
+	};
+	struct rpc_message msg = {
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+	};
+	int status, count;
+
+	if (!nfs_server_capable(inode, NFS_CAP_ACLS))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	status = nfs_revalidate_inode(server, inode);
+	if (status < 0)
+		return ERR_PTR(status);
+
+	/*
+	 * Only get the access acl when explicitly requested: We don't
+	 * need it for access decisions, and only some applications use
+	 * it. Applications which request the access acl first are not
+	 * penalized from this optimization.
+	 */
+	if (type == ACL_TYPE_ACCESS)
+		args.mask |= NFS_ACLCNT|NFS_ACL;
+	if (S_ISDIR(inode->i_mode))
+		args.mask |= NFS_DFACLCNT|NFS_DFACL;
+	if (args.mask == 0)
+		return NULL;
+
+	dprintk("NFS call getacl\n");
+	msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	/* Claim the cache slots for every ACL we are about to fetch. */
+	if (args.mask & NFS_ACL)
+		nfs3_prepare_get_acl(&inode->i_acl);
+	if (args.mask & NFS_DFACL)
+		nfs3_prepare_get_acl(&inode->i_default_acl);
+
+	status = rpc_call_sync(server->client_acl, &msg, 0);
+	dprintk("NFS reply getacl: %d\n", status);
+
+	/* pages may have been allocated at the xdr layer. */
+	for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++)
+		__free_page(args.pages[count]);
+
+	switch (status) {
+	case 0:
+		status = nfs_refresh_inode(inode, res.fattr);
+		break;
+	case -EPFNOSUPPORT:
+	case -EPROTONOSUPPORT:
+		dprintk("NFS_V3_ACL extension not supported; disabling\n");
+		server->caps &= ~NFS_CAP_ACLS;
+		fallthrough;
+	case -ENOTSUPP:
+		status = -EOPNOTSUPP;
+		/* Explicit jump instead of silently falling into default. */
+		goto getout;
+	default:
+		goto getout;
+	}
+	if ((args.mask & res.mask) != args.mask) {
+		status = -EIO;
+		goto getout;
+	}
+
+	/* An access ACL that is empty or equivalent to the mode is dropped. */
+	if (res.acl_access != NULL) {
+		if ((posix_acl_equiv_mode(res.acl_access, NULL) == 0) ||
+		    res.acl_access->a_count == 0) {
+			posix_acl_release(res.acl_access);
+			res.acl_access = NULL;
+		}
+	}
+
+	if (res.mask & NFS_ACL)
+		nfs3_complete_get_acl(&inode->i_acl, res.acl_access);
+	else
+		forget_cached_acl(inode, ACL_TYPE_ACCESS);
+
+	if (res.mask & NFS_DFACL)
+		nfs3_complete_get_acl(&inode->i_default_acl, res.acl_default);
+	else
+		forget_cached_acl(inode, ACL_TYPE_DEFAULT);
+
+	nfs_free_fattr(res.fattr);
+	/* Hand back the ACL that was asked for, release the other one. */
+	if (type == ACL_TYPE_ACCESS) {
+		posix_acl_release(res.acl_default);
+		return res.acl_access;
+	} else {
+		posix_acl_release(res.acl_access);
+		return res.acl_default;
+	}
+
+getout:
+	nfs3_abort_get_acl(&inode->i_acl);
+	nfs3_abort_get_acl(&inode->i_default_acl);
+	posix_acl_release(res.acl_access);
+	posix_acl_release(res.acl_default);
+	nfs_free_fattr(res.fattr);
+	return ERR_PTR(status);
+}
+
+/*
+ * Send an NFSACL SETACL call carrying @acl and, for directories,
+ * @dfacl.
+ *
+ * Returns 0 without any RPC when there is nothing to set,
+ * -EOPNOTSUPP when ACLs are not (or turn out not to be) supported,
+ * -ENOSPC when either ACL has more than NFS_ACL_MAX_ENTRIES entries,
+ * -ENOMEM on allocation failure, otherwise the RPC / inode refresh
+ * status.
+ */
+static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+		struct posix_acl *dfacl)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_fattr *fattr;
+	struct page *pages[NFSACL_MAXPAGES];
+	struct nfs3_setaclargs args = {
+		.inode = inode,
+		.mask = NFS_ACL,
+		.acl_access = acl,
+		.pages = pages,
+	};
+	struct rpc_message msg = {
+		.rpc_argp	= &args,
+		.rpc_resp	= &fattr,
+	};
+	int status = 0;
+
+	if (acl == NULL && (!S_ISDIR(inode->i_mode) || dfacl == NULL))
+		goto out;
+
+	if (!nfs_server_capable(inode, NFS_CAP_ACLS)) {
+		status = -EOPNOTSUPP;
+		goto out;
+	}
+
+	/* We are doing this here because XDR marshalling does not
+	 * return any results, it BUGs. */
+	if ((acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES) ||
+	    (dfacl != NULL && dfacl->a_count > NFS_ACL_MAX_ENTRIES)) {
+		status = -ENOSPC;
+		goto out;
+	}
+
+	if (S_ISDIR(inode->i_mode)) {
+		args.mask |= NFS_DFACL;
+		args.acl_default = dfacl;
+		args.len = nfsacl_size(acl, dfacl);
+	} else {
+		args.len = nfsacl_size(acl, NULL);
+	}
+
+	/* Too big for the inline buffer: back the ACLs with pages. */
+	if (args.len > NFS_ACL_INLINE_BUFSIZE) {
+		unsigned int wanted = 1 + ((args.len - 1) >> PAGE_SHIFT);
+
+		do {
+			args.pages[args.npages] = alloc_page(GFP_KERNEL);
+			if (args.pages[args.npages] == NULL) {
+				status = -ENOMEM;
+				goto out_freepages;
+			}
+			args.npages++;
+		} while (args.npages < wanted);
+	}
+
+	dprintk("NFS call setacl\n");
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL) {
+		status = -ENOMEM;
+		goto out_freepages;
+	}
+
+	msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
+	msg.rpc_resp = fattr;
+	status = rpc_call_sync(server->client_acl, &msg, 0);
+	/* Cached access results and ACLs are dropped whatever the outcome. */
+	nfs_access_zap_cache(inode);
+	nfs_zap_acl_cache(inode);
+	dprintk("NFS reply setacl: %d\n", status);
+
+	switch (status) {
+	case 0:
+		status = nfs_refresh_inode(inode, fattr);
+		break;
+	case -EPFNOSUPPORT:
+	case -EPROTONOSUPPORT:
+		dprintk("NFS_V3_ACL SETACL RPC not supported"
+				"(will not retry)\n");
+		server->caps &= ~NFS_CAP_ACLS;
+		fallthrough;
+	case -ENOTSUPP:
+		status = -EOPNOTSUPP;
+	}
+	nfs_free_fattr(fattr);
+out_freepages:
+	while (args.npages != 0)
+		__free_page(args.pages[--args.npages]);
+out:
+	return status;
+}
+
+/*
+ * Set both ACLs on @inode, treating "ACLs not supported"
+ * (-EOPNOTSUPP) as success.
+ */
+int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+		struct posix_acl *dfacl)
+{
+	int ret = __nfs3_proc_setacls(inode, acl, dfacl);
+
+	if (ret == -EOPNOTSUPP)
+		return 0;
+	return ret;
+}
+
+/*
+ * Set one ACL of @inode.  SETACL is always issued with an access ACL
+ * and (for directories) a default ACL, so the ACL that was not passed
+ * in is looked up with get_acl() first; a missing access ACL is
+ * synthesised from the file mode.  Every ACL obtained here, as
+ * opposed to the caller's @acl, is released before returning.
+ */
+int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	struct posix_acl *orig = acl, *dfacl = NULL, *alloc;
+	int status;
+
+	if (S_ISDIR(inode->i_mode)) {
+		switch (type) {
+		case ACL_TYPE_ACCESS:
+			alloc = get_acl(inode, ACL_TYPE_DEFAULT);
+			if (IS_ERR(alloc)) {
+				status = PTR_ERR(alloc);
+				goto out;
+			}
+			dfacl = alloc;
+			break;
+
+		case ACL_TYPE_DEFAULT:
+			alloc = get_acl(inode, ACL_TYPE_ACCESS);
+			if (IS_ERR(alloc)) {
+				status = PTR_ERR(alloc);
+				goto out;
+			}
+			/* The caller's ACL becomes the default ACL. */
+			dfacl = acl;
+			acl = alloc;
+			break;
+		}
+	}
+
+	if (acl == NULL) {
+		alloc = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+		if (IS_ERR(alloc)) {
+			status = PTR_ERR(alloc);
+			goto out;
+		}
+		acl = alloc;
+	}
+	status = __nfs3_proc_setacls(inode, acl, dfacl);
+out:
+	/* Never drop the reference that belongs to the caller. */
+	if (acl != orig)
+		posix_acl_release(acl);
+	if (dfacl != orig)
+		posix_acl_release(dfacl);
+	return status;
+}
+
+const struct xattr_handler *nfs3_xattr_handlers[] = { /* NULL-terminated list: the access and default POSIX ACL handlers */
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+ NULL,
+};
+
+/*
+ * Account for, and if a buffer was supplied emit, the xattr @name
+ * when @inode has an ACL of the given @type.
+ *
+ * *result is the running byte count; it grows by strlen(name) + 1
+ * for an existing ACL.  With @size == 0 only the count is updated.
+ * Returns -ERANGE if the name does not fit, 0 otherwise (including
+ * when there is no ACL or the lookup failed).
+ */
+static int
+nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data,
+		size_t size, ssize_t *result)
+{
+	/* Destination is fixed before *result is advanced. */
+	char *dest = data + *result;
+	struct posix_acl *acl = get_acl(inode, type);
+
+	if (IS_ERR_OR_NULL(acl))
+		return 0;
+
+	/* Only the existence of the ACL matters here. */
+	posix_acl_release(acl);
+
+	*result += strlen(name);
+	*result += 1;
+	if (size == 0)
+		return 0;
+	if (*result > size)
+		return -ERANGE;
+
+	strcpy(dest, name);
+	return 0;
+}
+
+/*
+ * List the POSIX ACL xattr names of @dentry's inode: the access ACL
+ * name first, then the default ACL name.  Returns the accumulated
+ * length, or the first error reported by nfs3_list_one_acl().
+ */
+ssize_t
+nfs3_listxattr(struct dentry *dentry, char *data, size_t size)
+{
+	struct inode *inode = d_inode(dentry);
+	ssize_t total = 0;
+	int error;
+
+	error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS,
+			XATTR_NAME_POSIX_ACL_ACCESS, data, size, &total);
+	if (error == 0)
+		error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT,
+				XATTR_NAME_POSIX_ACL_DEFAULT, data, size,
+				&total);
+	if (error)
+		return error;
+	return total;
+}
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
new file mode 100644
index 000000000..b49359afa
--- /dev/null
+++ b/fs/nfs/nfs3client.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/sunrpc/addr.h>
+#include "internal.h"
+#include "nfs3_fs.h"
+
+#ifdef CONFIG_NFS_V3_ACL
+static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; /* referenced back from nfsacl_program.stats below */
+static const struct rpc_version *nfsacl_version[] = {
+ [3] = &nfsacl_version3, /* only index 3 is populated */
+};
+
+const struct rpc_program nfsacl_program = { /* program descriptor bound by nfs_init_server_aclclient() */
+ .name = "nfsacl",
+ .number = NFS_ACL_PROGRAM,
+ .nrvers = ARRAY_SIZE(nfsacl_version),
+ .version = nfsacl_version,
+ .stats = &nfsacl_rpcstat,
+};
+
+/*
+ * Initialise an NFSv3 ACL client connection
+ *
+ * Unless the mount asked for NFS_MOUNT_NOACL, bind version 3 of the
+ * nfsacl program to the server's RPC client.  NFS_CAP_ACLS is set
+ * when that succeeds and cleared in every other case.
+ */
+static void nfs_init_server_aclclient(struct nfs_server *server)
+{
+	if (!(server->flags & NFS_MOUNT_NOACL)) {
+		server->client_acl = rpc_bind_new_program(server->client,
+							  &nfsacl_program, 3);
+		if (!IS_ERR(server->client_acl)) {
+			/* No errors! Assume that Sun nfsacls are supported */
+			server->caps |= NFS_CAP_ACLS;
+			return;
+		}
+	}
+
+	server->caps &= ~NFS_CAP_ACLS;
+}
+#else
+/* Built without NFSv3 ACL support: clear the capability and the noacl flag. */
+static inline void nfs_init_server_aclclient(struct nfs_server *server)
+{
+	server->caps &= ~NFS_CAP_ACLS;
+	server->flags &= ~NFS_MOUNT_NOACL;
+}
+#endif
+
+/*
+ * Create an NFSv3 server object and, if that worked, set up its ACL
+ * client.  An ERR_PTR() from nfs_create_server() is returned as-is.
+ */
+struct nfs_server *nfs3_create_server(struct fs_context *fc)
+{
+	struct nfs_server *server = nfs_create_server(fc);
+
+	if (IS_ERR(server))
+		return server;
+
+	/* Create a client RPC handle for the NFS v3 ACL management interface */
+	nfs_init_server_aclclient(server);
+	return server;
+}
+
+/*
+ * Clone @source.  The ACL client of the clone is only initialised
+ * when the clone succeeded and source->client_acl is not an
+ * ERR_PTR().
+ */
+struct nfs_server *nfs3_clone_server(struct nfs_server *source,
+				     struct nfs_fh *fh,
+				     struct nfs_fattr *fattr,
+				     rpc_authflavor_t flavor)
+{
+	struct nfs_server *clone = nfs_clone_server(source, fh, fattr, flavor);
+
+	if (IS_ERR(clone) || IS_ERR(source->client_acl))
+		return clone;
+
+	nfs_init_server_aclclient(clone);
+	return clone;
+}
+
+/*
+ * Set up a pNFS Data Server client over NFSv3.
+ *
+ * Return any existing nfs_client that matches server address,port,version
+ * and minorversion.
+ *
+ * For a new nfs_client, use a soft mount (default), a low retrans and a
+ * low timeout interval so that if a connection is lost, we retry through
+ * the MDS.
+ *
+ * Node name, IP address, network namespace and credential are taken
+ * from the MDS.  Returns ERR_PTR(-EINVAL) if @ds_addr cannot be
+ * rendered as a string, otherwise whatever nfs_get_client() returns.
+ */
+struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
+		const struct sockaddr *ds_addr, int ds_addrlen,
+		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans)
+{
+	struct rpc_timeout ds_timeout;
+	struct nfs_client *mds_clp = mds_srv->nfs_client;
+	struct nfs_client_initdata cl_init = {
+		.addr = ds_addr,
+		.addrlen = ds_addrlen,
+		.nodename = mds_clp->cl_rpcclient->cl_nodename,
+		.ip_addr = mds_clp->cl_ipaddr,
+		.nfs_mod = &nfs_v3,
+		.proto = ds_proto,
+		.net = mds_clp->cl_net,
+		.timeparms = &ds_timeout,
+		.cred = mds_srv->cred,
+	};
+	char hostname[INET6_ADDRSTRLEN + 1];
+
+	/* fake a hostname because lockd wants it */
+	if (rpc_ntop(ds_addr, hostname, sizeof(hostname)) <= 0)
+		return ERR_PTR(-EINVAL);
+	cl_init.hostname = hostname;
+
+	/* Multiple connections are only inherited for TCP. */
+	if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP)
+		cl_init.nconnect = mds_clp->cl_nconnect;
+
+	if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
+		__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+
+	__set_bit(NFS_CS_DS, &cl_init.init_flags);
+
+	/* Use the MDS nfs_client cl_ipaddr. */
+	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
+
+	return nfs_get_client(&cl_init);
+}
+EXPORT_SYMBOL_GPL(nfs3_set_ds_client);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
new file mode 100644
index 000000000..e1491def7
--- /dev/null
+++ b/fs/nfs/nfs3proc.c
@@ -0,0 +1,1044 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/nfs3proc.c
+ *
+ * Client-side NFSv3 procedures stubs.
+ *
+ * Copyright (C) 1997, Olaf Kirch
+ */
+
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/slab.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/lockd/bind.h>
+#include <linux/nfs_mount.h>
+#include <linux/freezer.h>
+#include <linux/xattr.h>
+
+#include "iostat.h"
+#include "internal.h"
+#include "nfs3_fs.h"
+
+#define NFSDBG_FACILITY NFSDBG_PROC
+
+/*
+ * A wrapper to handle the EJUKEBOX error messages
+ *
+ * The call is repeated for as long as it fails with -EJUKEBOX,
+ * sleeping NFS_JUKEBOX_RETRY_TIME between attempts.  A fatal signal
+ * pending after the sleep ends the loop with -ERESTARTSYS.
+ */
+static int
+nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
+{
+	int res;
+
+	for (;;) {
+		res = rpc_call_sync(clnt, msg, flags);
+		if (res != -EJUKEBOX)
+			return res;
+		freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
+		if (fatal_signal_pending(current))
+			return -ERESTARTSYS;
+	}
+}
+
+/* From here on, rpc_call_sync() in this file means the wrapper above. */
+#define rpc_call_sync(clnt, msg, flags)	nfs3_rpc_wrapper(clnt, msg, flags)
+
+/*
+ * Asynchronous counterpart of the EJUKEBOX retry logic.
+ *
+ * Returns 0 if @task did not finish with -EJUKEBOX.  Otherwise the
+ * delay is counted in the NFSIOS_DELAY statistic of @inode, the task
+ * status is cleared, the call is restarted after a delay of
+ * NFS_JUKEBOX_RETRY_TIME, and 1 is returned.
+ */
+static int
+nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode)
+{
+	if (task->tk_status != -EJUKEBOX)
+		return 0;
+	/* tk_status is known to be -EJUKEBOX from here on. */
+	nfs_inc_stats(inode, NFSIOS_DELAY);
+	task->tk_status = 0;
+	rpc_restart_call(task);
+	rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
+	return 1;
+}
+
+/*
+ * Issue FSINFO for @fhandle on @client.  If that succeeds but did not
+ * bring back file attributes (NFS_ATTR_FATTR not set), follow up with
+ * a GETATTR into info->fattr.
+ */
+static int
+do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle,
+		 struct nfs_fsinfo *info)
+{
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_FSINFO],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= info,
+	};
+	int status;
+
+	dprintk("%s: call fsinfo\n", __func__);
+	nfs_fattr_init(info->fattr);
+	status = rpc_call_sync(client, &msg, 0);
+	dprintk("%s: reply fsinfo: %d\n", __func__, status);
+	if (status != 0 || (info->fattr->valid & NFS_ATTR_FATTR))
+		return status;
+
+	msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
+	msg.rpc_resp = info->fattr;
+	status = rpc_call_sync(client, &msg, 0);
+	dprintk("%s: reply getattr: %d\n", __func__, status);
+	return status;
+}
+
+/*
+ * Bare-bones access to getattr: this is for nfs_get_root/nfs_get_sb
+ *
+ * The server's own RPC client is tried first.  On failure, and only
+ * if the nfs_client's RPC client is a different one, the request is
+ * repeated on that client.
+ */
+static int
+nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
+		   struct nfs_fsinfo *info)
+{
+	int status = do_proc_get_root(server->client, fhandle, info);
+
+	if (status == 0)
+		return status;
+	if (server->nfs_client->cl_rpcclient == server->client)
+		return status;
+	return do_proc_get_root(server->nfs_client->cl_rpcclient, fhandle,
+				info);
+}
+
+/*
+ * One function for each procedure in the NFS protocol.
+ */
+/*
+ * GETATTR for @fhandle into @fattr.  @label is not used here.
+ */
+static int
+nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_fattr *fattr, struct nfs4_label *label,
+		struct inode *inode)
+{
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_GETATTR],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= fattr,
+	};
+	unsigned short task_flags = 0;
+	int status;
+
+	/* Is this an attribute revalidation, subject to softreval? */
+	if (inode != NULL && (server->flags & NFS_MOUNT_SOFTREVAL) != 0)
+		task_flags |= RPC_TASK_TIMEOUT;
+
+	dprintk("NFS call getattr\n");
+	nfs_fattr_init(fattr);
+	status = rpc_call_sync(server->client, &msg, task_flags);
+	dprintk("NFS reply getattr: %d\n", status);
+	return status;
+}
+
+/*
+ * SETATTR on the inode behind @dentry.  When the attributes came
+ * with an open file (ATTR_FILE), that file's credential is used for
+ * the call.  On success the inode is updated from the reply and the
+ * ACL cache is zapped if it has been flagged invalid.
+ */
+static int
+nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
+			struct iattr *sattr)
+{
+	struct inode *inode = d_inode(dentry);
+	struct nfs3_sattrargs arg = {
+		.fh		= NFS_FH(inode),
+		.sattr		= sattr,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_SETATTR],
+		.rpc_argp	= &arg,
+		.rpc_resp	= fattr,
+	};
+	int status;
+
+	dprintk("NFS call setattr\n");
+	if (sattr->ia_valid & ATTR_FILE)
+		msg.rpc_cred = nfs_file_cred(sattr->ia_file);
+	nfs_fattr_init(fattr);
+	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	if (status != 0)
+		goto out;
+
+	nfs_setattr_update_inode(inode, sattr, fattr);
+	if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+		nfs_zap_acl_cache(inode);
+out:
+	dprintk("NFS reply setattr: %d\n", status);
+	return status;
+}
+
+/*
+ * LOOKUP of @dentry's name in @dir.  The directory is refreshed from
+ * the reply's directory attributes regardless of the outcome.  If the
+ * lookup worked but returned no attributes for the object, a GETATTR
+ * on the new file handle fills them in.  @label is not used here.
+ */
+static int
+nfs3_proc_lookup(struct inode *dir, struct dentry *dentry,
+		 struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+		 struct nfs4_label *label)
+{
+	struct nfs3_diropargs arg = {
+		.fh		= NFS_FH(dir),
+		.name		= dentry->d_name.name,
+		.len		= dentry->d_name.len
+	};
+	struct nfs3_diropres res = {
+		.fh		= fhandle,
+		.fattr		= fattr
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_LOOKUP],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
+	unsigned short task_flags = 0;
+	int status;
+
+	/* Is this an attribute revalidation, subject to softreval? */
+	if (nfs_lookup_is_soft_revalidate(dentry))
+		task_flags |= RPC_TASK_TIMEOUT;
+
+	res.dir_attr = nfs_alloc_fattr();
+	if (!res.dir_attr)
+		return -ENOMEM;
+
+	dprintk("NFS call lookup %pd2\n", dentry);
+	nfs_fattr_init(fattr);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, task_flags);
+	nfs_refresh_inode(dir, res.dir_attr);
+
+	/* No post-op attributes for the object: ask for them explicitly. */
+	if (status >= 0 && (fattr->valid & NFS_ATTR_FATTR) == 0) {
+		msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
+		msg.rpc_argp = fhandle;
+		msg.rpc_resp = fattr;
+		status = rpc_call_sync(NFS_CLIENT(dir), &msg, task_flags);
+	}
+
+	nfs_free_fattr(res.dir_attr);
+	dprintk("NFS reply lookup: %d\n", status);
+	return status;
+}
+
+/*
+ * ACCESS call for @inode using the mask and credential in @entry.
+ * The inode is refreshed from the reply attributes whether or not the
+ * call succeeded; the access mask in @entry is only updated on
+ * success.  Returns -ENOMEM if no fattr could be allocated.
+ */
+static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
+{
+	struct nfs3_accessargs arg = {
+		.fh		= NFS_FH(inode),
+		.access		= entry->mask,
+	};
+	struct nfs3_accessres res;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_ACCESS],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+		.rpc_cred	= entry->cred,
+	};
+	int status;
+
+	dprintk("NFS call access\n");
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr != NULL) {
+		status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+		nfs_refresh_inode(inode, res.fattr);
+		if (status == 0)
+			nfs_access_set_mask(entry, res.access);
+		nfs_free_fattr(res.fattr);
+	} else {
+		status = -ENOMEM;
+	}
+	dprintk("NFS reply access: %d\n", status);
+	return status;
+}
+
+/*
+ * READLINK for @inode with @page as the data page (@pgbase and
+ * @pglen are passed on in the arguments).  The inode is refreshed
+ * from the reply attributes.  Returns -ENOMEM if no fattr could be
+ * allocated.
+ */
+static int nfs3_proc_readlink(struct inode *inode, struct page *page,
+		unsigned int pgbase, unsigned int pglen)
+{
+	struct nfs_fattr *fattr;
+	struct nfs3_readlinkargs args = {
+		.fh		= NFS_FH(inode),
+		.pgbase		= pgbase,
+		.pglen		= pglen,
+		.pages		= &page
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_READLINK],
+		.rpc_argp	= &args,
+	};
+	int status;
+
+	dprintk("NFS call readlink\n");
+	fattr = nfs_alloc_fattr();
+	if (fattr != NULL) {
+		msg.rpc_resp = fattr;
+
+		status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+		nfs_refresh_inode(inode, fattr);
+		nfs_free_fattr(fattr);
+	} else {
+		status = -ENOMEM;
+	}
+	dprintk("NFS reply readlink: %d\n", status);
+	return status;
+}
+
+struct nfs3_createdata { /* one allocation holding message, arguments, result and result buffers for a create-type call */
+ struct rpc_message msg;
+ union { /* msg.rpc_argp is pointed at this union by nfs3_alloc_createdata() */
+ struct nfs3_createargs create;
+ struct nfs3_mkdirargs mkdir;
+ struct nfs3_symlinkargs symlink;
+ struct nfs3_mknodargs mknod;
+ } arg;
+ struct nfs3_diropres res; /* its fh/fattr/dir_attr pointers are aimed at the three members below */
+ struct nfs_fh fh;
+ struct nfs_fattr fattr;
+ struct nfs_fattr dir_attr;
+};
+
+/*
+ * Allocate a zeroed nfs3_createdata and wire its internal pointers:
+ * the message refers to the embedded argument union and result, and
+ * the result refers to the embedded file handle and attribute
+ * buffers, both of which are initialised.  Returns NULL on
+ * allocation failure.
+ */
+static struct nfs3_createdata *nfs3_alloc_createdata(void)
+{
+	struct nfs3_createdata *data = kzalloc(sizeof(*data), GFP_KERNEL);
+
+	if (data == NULL)
+		return NULL;
+
+	data->msg.rpc_argp = &data->arg;
+	data->msg.rpc_resp = &data->res;
+	data->res.fh = &data->fh;
+	data->res.fattr = &data->fattr;
+	data->res.dir_attr = &data->dir_attr;
+	nfs_fattr_init(data->res.fattr);
+	nfs_fattr_init(data->res.dir_attr);
+	return data;
+}
+
+/*
+ * Run the prepared create-type call in @data against @dir.  The
+ * directory is updated from the reply's post-op attributes in any
+ * case.  Returns ERR_PTR(status) on failure, otherwise the result of
+ * nfs_add_or_obtain() for the new object.
+ */
+static struct dentry *
+nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
+{
+	int status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
+
+	nfs_post_op_update_inode(dir, data->res.dir_attr);
+	if (status == 0)
+		return nfs_add_or_obtain(dentry, data->res.fh,
+					 data->res.fattr, NULL);
+	return ERR_PTR(status);
+}
+
+static void nfs3_free_createdata(struct nfs3_createdata *data)
+{
+ kfree(data); /* everything lives in the one allocation; nfs3_proc_create() also calls this with data == NULL */
+}
+
+/*
+ * Create a regular file.
+ *
+ * With O_EXCL in @flags the CREATE is first attempted in exclusive
+ * mode, using a verifier built from jiffies and the current pid; each
+ * -ENOTSUPP reply steps the mode down (exclusive, guarded, unchecked)
+ * and retries. After an exclusive create the attributes in @sattr
+ * are applied with a separate SETATTR. Finally the ACLs obtained
+ * from posix_acl_create() are set on the new inode.
+ */
+static int
+nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+ int flags)
+{
+ struct posix_acl *default_acl, *acl;
+ struct nfs3_createdata *data;
+ struct dentry *d_alias;
+ int status = -ENOMEM;
+
+ dprintk("NFS call create %pd\n", dentry);
+
+ data = nfs3_alloc_createdata();
+ if (data == NULL)
+ goto out;
+
+ data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_CREATE];
+ data->arg.create.fh = NFS_FH(dir);
+ data->arg.create.name = dentry->d_name.name;
+ data->arg.create.len = dentry->d_name.len;
+ data->arg.create.sattr = sattr;
+
+ data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
+ if (flags & O_EXCL) {
+ data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE;
+ data->arg.create.verifier[0] = cpu_to_be32(jiffies);
+ data->arg.create.verifier[1] = cpu_to_be32(current->pid);
+ }
+
+ status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
+ if (status)
+ goto out;
+
+ for (;;) {
+ d_alias = nfs3_do_create(dir, dentry, data);
+ status = PTR_ERR_OR_ZERO(d_alias);
+
+ if (status != -ENOTSUPP)
+ break;
+ /* If the server doesn't support the exclusive creation
+ * semantics, try again with simple 'guarded' mode.
+ * A guarded create that is rejected the same way is retried
+ * unchecked; an unchecked one gives up. */
+ switch (data->arg.create.createmode) {
+ case NFS3_CREATE_EXCLUSIVE:
+ data->arg.create.createmode = NFS3_CREATE_GUARDED;
+ break;
+
+ case NFS3_CREATE_GUARDED:
+ data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
+ break;
+
+ case NFS3_CREATE_UNCHECKED:
+ goto out_release_acls;
+ }
+ nfs_fattr_init(data->res.dir_attr); /* reset the result buffers before the retry */
+ nfs_fattr_init(data->res.fattr);
+ }
+
+ if (status != 0)
+ goto out_release_acls;
+
+ if (d_alias)
+ dentry = d_alias; /* continue with the dentry returned by nfs3_do_create() */
+
+ /* When we created the file with exclusive semantics, make
+ * sure we set the attributes afterwards. */
+ if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) {
+ dprintk("NFS call setattr (post-create)\n");
+
+ if (!(sattr->ia_valid & ATTR_ATIME_SET))
+ sattr->ia_valid |= ATTR_ATIME;
+ if (!(sattr->ia_valid & ATTR_MTIME_SET))
+ sattr->ia_valid |= ATTR_MTIME;
+
+ /* Note: we could use a guarded setattr here, but I'm
+ * not sure this buys us anything (and I'd have
+ * to revamp the NFSv3 XDR code) */
+ status = nfs3_proc_setattr(dentry, data->res.fattr, sattr);
+ nfs_post_op_update_inode(d_inode(dentry), data->res.fattr);
+ dprintk("NFS reply setattr (post-create): %d\n", status);
+ if (status != 0)
+ goto out_dput;
+ }
+
+ status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
+
+out_dput:
+ dput(d_alias);
+out_release_acls:
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
+out:
+ nfs3_free_createdata(data);
+ dprintk("NFS reply create: %d\n", status);
+ return status;
+}
+
+/*
+ * Synchronously issue an NFSv3 REMOVE for @dentry's name inside @dir.
+ *
+ * Returns -ENOMEM when the directory attribute buffer cannot be allocated,
+ * otherwise the value returned by rpc_call_sync().
+ */
+static int
+nfs3_proc_remove(struct inode *dir, struct dentry *dentry)
+{
+	struct nfs_removeres result;
+	struct nfs_removeargs args = {
+		.fh	= NFS_FH(dir),
+		.name	= dentry->d_name,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_REMOVE],
+		.rpc_argp	= &args,
+		.rpc_resp	= &result,
+	};
+	int err;
+
+	dprintk("NFS call remove %pd2\n", dentry);
+	result.dir_attr = nfs_alloc_fattr();
+	if (result.dir_attr != NULL) {
+		err = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+		/* The directory attributes are applied whatever the RPC status was. */
+		nfs_post_op_update_inode(dir, result.dir_attr);
+		nfs_free_fattr(result.dir_attr);
+	} else {
+		err = -ENOMEM;
+	}
+	dprintk("NFS reply remove: %d\n", err);
+	return err;
+}
+
+/*
+ * Select the NFSv3 REMOVE procedure for an unlink RPC message.
+ * @dentry and @inode are not used here.
+ */
+static void nfs3_proc_unlink_setup(struct rpc_message *msg,
+				   struct dentry *dentry,
+				   struct inode *inode)
+{
+	msg->rpc_proc = nfs3_procedures + NFS3PROC_REMOVE;
+}
+
+/* Prepare hook for unlink: start the call immediately; @data is unused. */
+static void
+nfs3_proc_unlink_rpc_prepare(struct rpc_task *task,
+			     struct nfs_unlinkdata *data)
+{
+	rpc_call_start(task);
+}
+
+/*
+ * Completion hook for an unlink RPC.
+ *
+ * Returns 0 when nfs3_async_handle_jukebox() returns non-zero for the task;
+ * otherwise applies the directory attributes from the reply and returns 1.
+ */
+static int nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
+{
+	struct nfs_removeres *reply;
+
+	if (!nfs3_async_handle_jukebox(task, dir)) {
+		reply = task->tk_msg.rpc_resp;
+		nfs_post_op_update_inode(dir, reply->dir_attr);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Select the NFSv3 RENAME procedure for a rename RPC message.
+ * Neither dentry argument is used here.
+ */
+static void nfs3_proc_rename_setup(struct rpc_message *msg,
+				   struct dentry *old_dentry,
+				   struct dentry *new_dentry)
+{
+	msg->rpc_proc = nfs3_procedures + NFS3PROC_RENAME;
+}
+
+/* Prepare hook for rename: start the call immediately; @data is unused. */
+static void
+nfs3_proc_rename_rpc_prepare(struct rpc_task *task,
+			     struct nfs_renamedata *data)
+{
+	rpc_call_start(task);
+}
+
+/*
+ * Completion hook for a rename RPC.
+ *
+ * Returns 0 when nfs3_async_handle_jukebox() returns non-zero for the task;
+ * otherwise applies the attributes of both directories from the reply and
+ * returns 1.
+ */
+static int nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+				 struct inode *new_dir)
+{
+	struct nfs_renameres *reply;
+
+	if (nfs3_async_handle_jukebox(task, old_dir) != 0)
+		return 0;
+
+	reply = task->tk_msg.rpc_resp;
+	nfs_post_op_update_inode(old_dir, reply->old_fattr);
+	nfs_post_op_update_inode(new_dir, reply->new_fattr);
+	return 1;
+}
+
+/*
+ * Synchronously issue an NFSv3 LINK creating @name in @dir for @inode.
+ *
+ * Returns -ENOMEM if either attribute buffer cannot be allocated, otherwise
+ * the value returned by rpc_call_sync().
+ */
+static int
+nfs3_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
+{
+	struct nfs3_linkres result;
+	struct nfs3_linkargs args = {
+		.fromfh	= NFS_FH(inode),
+		.tofh	= NFS_FH(dir),
+		.toname	= name->name,
+		.tolen	= name->len
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_LINK],
+		.rpc_argp	= &args,
+		.rpc_resp	= &result,
+	};
+	int err = -ENOMEM;
+
+	dprintk("NFS call link %s\n", name->name);
+	result.fattr = nfs_alloc_fattr();
+	result.dir_attr = nfs_alloc_fattr();
+	if (result.fattr != NULL && result.dir_attr != NULL) {
+		err = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+		nfs_post_op_update_inode(dir, result.dir_attr);
+		nfs_post_op_update_inode(inode, result.fattr);
+	}
+	/* Both buffers are released on every path, including partial allocation. */
+	nfs_free_fattr(result.dir_attr);
+	nfs_free_fattr(result.fattr);
+	dprintk("NFS reply link: %d\n", err);
+	return err;
+}
+
+/*
+ * Create a symbolic link named by @dentry in @dir whose target is the @len
+ * bytes held in @page.
+ *
+ * Returns -ENAMETOOLONG when @len exceeds NFS3_MAXPATHLEN, -ENOMEM when the
+ * create data cannot be allocated, otherwise the result of nfs3_do_create().
+ */
+static int
+nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
+		  unsigned int len, struct iattr *sattr)
+{
+	struct nfs3_createdata *data;
+	struct dentry *alias;
+	int err;
+
+	if (len > NFS3_MAXPATHLEN)
+		return -ENAMETOOLONG;
+
+	dprintk("NFS call symlink %pd\n", dentry);
+
+	data = nfs3_alloc_createdata();
+	if (data != NULL) {
+		data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK];
+		data->arg.symlink.fromfh = NFS_FH(dir);
+		data->arg.symlink.fromname = dentry->d_name.name;
+		data->arg.symlink.fromlen = dentry->d_name.len;
+		data->arg.symlink.pages = &page;
+		data->arg.symlink.pathlen = len;
+		data->arg.symlink.sattr = sattr;
+
+		alias = nfs3_do_create(dir, dentry, data);
+		err = PTR_ERR_OR_ZERO(alias);
+		/* On success the returned dentry reference is dropped straight away. */
+		if (!err)
+			dput(alias);
+
+		nfs3_free_createdata(data);
+	} else {
+		err = -ENOMEM;
+	}
+	dprintk("NFS reply symlink: %d\n", err);
+	return err;
+}
+
+/*
+ * Create the directory named by @dentry in @dir, then push the ACLs computed
+ * by posix_acl_create() to the new directory.
+ *
+ * Returns -ENOMEM when the create data cannot be allocated, otherwise the
+ * first failure reported by posix_acl_create(), nfs3_do_create() or
+ * nfs3_proc_setacls(), or 0.
+ */
+static int
+nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
+{
+	struct posix_acl *default_acl, *acl;
+	struct nfs3_createdata *data;
+	struct dentry *alias;
+	int err = -ENOMEM;
+
+	dprintk("NFS call mkdir %pd\n", dentry);
+
+	data = nfs3_alloc_createdata();
+	if (!data)
+		goto free_data;
+
+	err = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
+	if (err != 0)
+		goto free_data;
+
+	data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR];
+	data->arg.mkdir.fh = NFS_FH(dir);
+	data->arg.mkdir.name = dentry->d_name.name;
+	data->arg.mkdir.len = dentry->d_name.len;
+	data->arg.mkdir.sattr = sattr;
+
+	alias = nfs3_do_create(dir, dentry, data);
+	err = PTR_ERR_OR_ZERO(alias);
+	if (err == 0) {
+		/* Prefer the alias dentry when nfs3_do_create() handed one back. */
+		err = nfs3_proc_setacls(d_inode(alias ? alias : dentry),
+					acl, default_acl);
+		dput(alias);
+	}
+
+	posix_acl_release(acl);
+	posix_acl_release(default_acl);
+free_data:
+	nfs3_free_createdata(data);
+	dprintk("NFS reply mkdir: %d\n", err);
+	return err;
+}
+
+/*
+ * Synchronously issue an NFSv3 RMDIR for @name inside @dir.
+ *
+ * Returns -ENOMEM when the directory attribute buffer cannot be allocated,
+ * otherwise the value returned by rpc_call_sync().
+ */
+static int
+nfs3_proc_rmdir(struct inode *dir, const struct qstr *name)
+{
+	struct nfs3_diropargs args = {
+		.fh	= NFS_FH(dir),
+		.name	= name->name,
+		.len	= name->len
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_RMDIR],
+		.rpc_argp	= &args,
+	};
+	struct nfs_fattr *dir_attr;
+	int err = -ENOMEM;
+
+	dprintk("NFS call rmdir %s\n", name->name);
+	dir_attr = nfs_alloc_fattr();
+	if (dir_attr != NULL) {
+		/* The reply buffer is only known once the allocation succeeded. */
+		msg.rpc_resp = dir_attr;
+		err = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+		nfs_post_op_update_inode(dir, dir_attr);
+		nfs_free_fattr(dir_attr);
+	}
+	dprintk("NFS reply rmdir: %d\n", err);
+	return err;
+}
+
+/*
+ * The READDIR implementation is somewhat hackish - we pass the user buffer
+ * to the encode function, which installs it in the receive iovec.
+ * The decode function itself doesn't perform any decoding, it just makes
+ * sure the reply is syntactically correct.
+ *
+ * Also note that this implementation handles both plain readdir and
+ * readdirplus: @plus selects NFS3PROC_READDIRPLUS instead of
+ * NFS3PROC_READDIR.
+ *
+ * The request carries the directory's cached cookie verifier, and the reply
+ * structure points at that same verifier storage.
+ *
+ * Returns -ENOMEM when the directory attribute buffer cannot be allocated,
+ * otherwise the value returned by rpc_call_sync().
+ */
+static int
+nfs3_proc_readdir(struct dentry *dentry, const struct cred *cred,
+		  u64 cookie, struct page **pages, unsigned int count, bool plus)
+{
+	struct inode		*dir = d_inode(dentry);
+	__be32			*verf = NFS_I(dir)->cookieverf;
+	struct nfs3_readdirargs	arg = {
+		.fh		= NFS_FH(dir),
+		.cookie		= cookie,
+		.verf		= {verf[0], verf[1]},
+		.plus		= plus,
+		.count		= count,
+		.pages		= pages
+	};
+	struct nfs3_readdirres	res = {
+		.verf		= verf,
+		.plus		= plus
+	};
+	struct rpc_message	msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_READDIR],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+		.rpc_cred	= cred,
+	};
+	int status = -ENOMEM;
+
+	if (plus)
+		msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS];
+
+	/*
+	 * The cookie is a full 64-bit value: print all of it as unsigned
+	 * rather than truncating it to 32 bits and showing it as signed.
+	 */
+	dprintk("NFS call readdir%s %llu\n",
+			plus? "plus" : "", (unsigned long long) cookie);
+
+	res.dir_attr = nfs_alloc_fattr();
+	if (res.dir_attr == NULL)
+		goto out;
+
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+
+	/* Done regardless of the RPC status. */
+	nfs_invalidate_atime(dir);
+	nfs_refresh_inode(dir, res.dir_attr);
+
+	nfs_free_fattr(res.dir_attr);
+out:
+	dprintk("NFS reply readdir%s: %d\n",
+			plus? "plus" : "", status);
+	return status;
+}
+
+/*
+ * Create a special file named by @dentry in @dir via NFSv3 MKNOD, then push
+ * the ACLs computed by posix_acl_create() to the new inode.
+ *
+ * Only block devices, character devices, FIFOs and sockets are accepted;
+ * any other file type in sattr->ia_mode yields -EINVAL.  Returns -ENOMEM
+ * when the create data cannot be allocated, otherwise the first failure
+ * reported by posix_acl_create(), nfs3_do_create() or nfs3_proc_setacls(),
+ * or 0.
+ */
+static int
+nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+		dev_t rdev)
+{
+	struct posix_acl *default_acl, *acl;
+	struct nfs3_createdata *data;
+	struct dentry *alias;
+	int err = -ENOMEM;
+
+	dprintk("NFS call mknod %pd %u:%u\n", dentry,
+			MAJOR(rdev), MINOR(rdev));
+
+	data = nfs3_alloc_createdata();
+	if (!data)
+		goto free_data;
+
+	err = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
+	if (err != 0)
+		goto free_data;
+
+	data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD];
+	data->arg.mknod.fh = NFS_FH(dir);
+	data->arg.mknod.name = dentry->d_name.name;
+	data->arg.mknod.len = dentry->d_name.len;
+	data->arg.mknod.sattr = sattr;
+	data->arg.mknod.rdev = rdev;
+
+	/* Translate the S_IFMT file type into the NFSv3 wire type. */
+	switch (sattr->ia_mode & S_IFMT) {
+	case S_IFBLK:
+		data->arg.mknod.type = NF3BLK;
+		break;
+	case S_IFCHR:
+		data->arg.mknod.type = NF3CHR;
+		break;
+	case S_IFIFO:
+		data->arg.mknod.type = NF3FIFO;
+		break;
+	case S_IFSOCK:
+		data->arg.mknod.type = NF3SOCK;
+		break;
+	default:
+		err = -EINVAL;
+		goto drop_acls;
+	}
+
+	alias = nfs3_do_create(dir, dentry, data);
+	err = PTR_ERR_OR_ZERO(alias);
+	if (err == 0) {
+		/* Prefer the alias dentry when nfs3_do_create() handed one back. */
+		err = nfs3_proc_setacls(d_inode(alias ? alias : dentry),
+					acl, default_acl);
+		dput(alias);
+	}
+drop_acls:
+	posix_acl_release(acl);
+	posix_acl_release(default_acl);
+free_data:
+	nfs3_free_createdata(data);
+	dprintk("NFS reply mknod: %d\n", err);
+	return err;
+}
+
+/*
+ * Synchronous NFSv3 FSSTAT on @fhandle; the reply is decoded into @stat.
+ * stat->fattr is re-initialised before the call.  Returns the value of
+ * rpc_call_sync().
+ */
+static int nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
+			    struct nfs_fsstat *stat)
+{
+	struct rpc_message msg = {
+		.rpc_resp	= stat,
+		.rpc_argp	= fhandle,
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_FSSTAT],
+	};
+	int rc;
+
+	dprintk("NFS call fsstat\n");
+	nfs_fattr_init(stat->fattr);
+	rc = rpc_call_sync(server->client, &msg, 0);
+	dprintk("NFS reply fsstat: %d\n", rc);
+	return rc;
+}
+
+/*
+ * Synchronous NFSv3 FSINFO on @fhandle using the RPC client @client; the
+ * reply is decoded into @info.  info->fattr is re-initialised before the
+ * call.  Returns the value of rpc_call_sync().
+ */
+static int do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle,
+			  struct nfs_fsinfo *info)
+{
+	struct rpc_message msg = {
+		.rpc_resp	= info,
+		.rpc_argp	= fhandle,
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_FSINFO],
+	};
+	int rc;
+
+	dprintk("NFS call fsinfo\n");
+	nfs_fattr_init(info->fattr);
+	rc = rpc_call_sync(client, &msg, 0);
+	dprintk("NFS reply fsinfo: %d\n", rc);
+	return rc;
+}
+
+/*
+ * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via
+ * nfs_create_server.
+ *
+ * The call is first made on server->client.  If that fails and the
+ * nfs_client's cl_rpcclient is a different RPC client, the call is retried
+ * once on that one and its status is returned instead.
+ */
+static int
+nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+		 struct nfs_fsinfo *info)
+{
+	struct rpc_clnt *fallback;
+	int err;
+
+	err = do_proc_fsinfo(server->client, fhandle, info);
+	if (err == 0)
+		return 0;
+
+	fallback = server->nfs_client->cl_rpcclient;
+	if (fallback == server->client)
+		return err;
+
+	return do_proc_fsinfo(fallback, fhandle, info);
+}
+
+/*
+ * Synchronous NFSv3 PATHCONF on @fhandle; the reply is decoded into @info.
+ * info->fattr is re-initialised before the call.  Returns the value of
+ * rpc_call_sync().
+ */
+static int nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
+			      struct nfs_pathconf *info)
+{
+	struct rpc_message msg = {
+		.rpc_resp	= info,
+		.rpc_argp	= fhandle,
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_PATHCONF],
+	};
+	int rc;
+
+	dprintk("NFS call pathconf\n");
+	nfs_fattr_init(info->fattr);
+	rc = rpc_call_sync(server->client, &msg, 0);
+	dprintk("NFS reply pathconf: %d\n", rc);
+	return rc;
+}
+
+/*
+ * Completion hook for a READ.
+ *
+ * When the header carries its own pgio_done_cb, that callback decides the
+ * result.  Otherwise returns -EAGAIN if nfs3_async_handle_jukebox() returns
+ * non-zero, and 0 after refreshing the inode from the reply attributes.
+ */
+static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+	struct inode *inode = hdr->inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+
+	if (hdr->pgio_done_cb)
+		return hdr->pgio_done_cb(task, hdr);
+
+	if (nfs3_async_handle_jukebox(task, inode) != 0)
+		return -EAGAIN;
+
+	/*
+	 * Record the reply length of the first successful READ; cmpxchg()
+	 * only stores it while read_hdrsize is still 0.
+	 */
+	if (task->tk_status >= 0) {
+		if (!server->read_hdrsize)
+			cmpxchg(&server->read_hdrsize, 0, hdr->res.replen);
+	}
+
+	nfs_invalidate_atime(inode);
+	nfs_refresh_inode(inode, &hdr->fattr);
+	return 0;
+}
+
+/*
+ * Select the NFSv3 READ procedure and seed the expected reply length from
+ * the server's recorded read_hdrsize.
+ */
+static void
+nfs3_proc_read_setup(struct nfs_pgio_header *hdr, struct rpc_message *msg)
+{
+	struct nfs_server *server = NFS_SERVER(hdr->inode);
+
+	msg->rpc_proc = nfs3_procedures + NFS3PROC_READ;
+	hdr->args.replen = server->read_hdrsize;
+}
+
+/* Prepare hook for page I/O: start the call and report success; @hdr is unused. */
+static int
+nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+	rpc_call_start(task);
+
+	return 0;
+}
+
+/*
+ * Completion hook for a WRITE.
+ *
+ * When the header carries its own pgio_done_cb, that callback decides the
+ * result.  Otherwise returns -EAGAIN if nfs3_async_handle_jukebox() returns
+ * non-zero, and 0 after updating the inode when the task status is not an
+ * error.
+ */
+static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+	struct inode *inode = hdr->inode;
+
+	if (hdr->pgio_done_cb)
+		return hdr->pgio_done_cb(task, hdr);
+
+	if (nfs3_async_handle_jukebox(task, inode) != 0)
+		return -EAGAIN;
+
+	if (!(task->tk_status < 0))
+		nfs_writeback_update_inode(hdr);
+
+	return 0;
+}
+
+/* Select the NFSv3 WRITE procedure; @hdr and @clnt are left untouched. */
+static void
+nfs3_proc_write_setup(struct nfs_pgio_header *hdr, struct rpc_message *msg,
+		      struct rpc_clnt **clnt)
+{
+	msg->rpc_proc = nfs3_procedures + NFS3PROC_WRITE;
+}
+
+/* Prepare hook for commit: start the call immediately; @data is unused. */
+static void
+nfs3_proc_commit_rpc_prepare(struct rpc_task *task,
+			     struct nfs_commit_data *data)
+{
+	rpc_call_start(task);
+}
+
+/*
+ * Completion hook for a COMMIT.
+ *
+ * When @data carries its own commit_done_cb, that callback decides the
+ * result.  Otherwise returns -EAGAIN if nfs3_async_handle_jukebox() returns
+ * non-zero, and 0 after refreshing the inode from the reply attributes.
+ */
+static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data)
+{
+	struct inode *inode;
+
+	if (data->commit_done_cb)
+		return data->commit_done_cb(task, data);
+
+	inode = data->inode;
+	if (nfs3_async_handle_jukebox(task, inode) != 0)
+		return -EAGAIN;
+
+	nfs_refresh_inode(inode, data->res.fattr);
+	return 0;
+}
+
+/* Select the NFSv3 COMMIT procedure; @data and @clnt are left untouched. */
+static void
+nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg,
+		       struct rpc_clnt **clnt)
+{
+	msg->rpc_proc = nfs3_procedures + NFS3PROC_COMMIT;
+}
+
+/*
+ * NLM "alloc call" hook.  @data is an optional struct nfs_lock_context.
+ * When it is set and its open context has NFS_CONTEXT_UNLOCK, take an extra
+ * reference on the open context and on its lock context.
+ */
+static void nfs3_nlm_alloc_call(void *data)
+{
+	struct nfs_lock_context *lock_ctx = data;
+	struct nfs_open_context *open_ctx;
+
+	if (lock_ctx == NULL)
+		return;
+	if (!test_bit(NFS_CONTEXT_UNLOCK, &lock_ctx->open_context->flags))
+		return;
+
+	open_ctx = lock_ctx->open_context;
+	get_nfs_open_context(open_ctx);
+	nfs_get_lock_context(lock_ctx->open_context);
+}
+
+/*
+ * NLM "unlock prepare" hook.  @data is an optional struct nfs_lock_context.
+ * Returns the result of nfs_async_iocounter_wait() when the lock context is
+ * set and its open context has NFS_CONTEXT_UNLOCK; otherwise false.
+ */
+static bool nfs3_nlm_unlock_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs_lock_context *lock_ctx = data;
+
+	if (lock_ctx == NULL)
+		return false;
+	if (!test_bit(NFS_CONTEXT_UNLOCK, &lock_ctx->open_context->flags))
+		return false;
+
+	return nfs_async_iocounter_wait(task, lock_ctx);
+}
+
+/*
+ * NLM "release call" hook.  @data is an optional struct nfs_lock_context.
+ * When it is set and its open context has NFS_CONTEXT_UNLOCK, drop one
+ * reference on the lock context and one on the open context.
+ */
+static void nfs3_nlm_release_call(void *data)
+{
+	struct nfs_lock_context *lock_ctx = data;
+	struct nfs_open_context *open_ctx;
+
+	if (lock_ctx == NULL)
+		return;
+	if (!test_bit(NFS_CONTEXT_UNLOCK, &lock_ctx->open_context->flags))
+		return;
+
+	/* Save the open context first: lock_ctx is not touched after its put. */
+	open_ctx = lock_ctx->open_context;
+	nfs_put_lock_context(lock_ctx);
+	put_nfs_open_context(open_ctx);
+}
+
+static const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = { /* NLM client hooks; installed as nfs_v3_clientops.nlmclnt_ops */
+ .nlmclnt_alloc_call = nfs3_nlm_alloc_call, /* takes open/lock context references when NFS_CONTEXT_UNLOCK is set */
+ .nlmclnt_unlock_prepare = nfs3_nlm_unlock_prepare, /* returns nfs_async_iocounter_wait() under the same condition */
+ .nlmclnt_release_call = nfs3_nlm_release_call, /* drops the lock and open context references again */
+};
+
+/*
+ * Forward a lock request to the NLM client for the server of @filp's inode.
+ *
+ * For FL_CLOSE requests a lock context is looked up; when that succeeds,
+ * NFS_CONTEXT_UNLOCK is set on the open context and the lock context is
+ * passed to nlmclnt_proc() as callback data.  If the lookup fails, NULL is
+ * passed instead.  The lock context reference is dropped once nlmclnt_proc()
+ * returns.
+ *
+ * Returns the value of nlmclnt_proc().
+ */
+static int
+nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = file_inode(filp);
+	struct nfs_open_context *open_ctx = nfs_file_open_context(filp);
+	struct nfs_lock_context *lock_ctx = NULL;
+	int rc;
+
+	if (fl->fl_flags & FL_CLOSE) {
+		struct nfs_lock_context *found = nfs_get_lock_context(open_ctx);
+
+		if (!IS_ERR(found)) {
+			set_bit(NFS_CONTEXT_UNLOCK, &open_ctx->flags);
+			lock_ctx = found;
+		}
+	}
+
+	rc = nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, lock_ctx);
+
+	if (lock_ctx != NULL)
+		nfs_put_lock_context(lock_ctx);
+
+	return rc;
+}
+
+/* Delegation query for NFSv3: always answers 0, whatever @inode and @flags are. */
+static int
+nfs3_have_delegation(struct inode *inode, fmode_t flags)
+{
+	return 0;
+}
+
+static const struct inode_operations nfs3_dir_inode_operations = { /* installed as nfs_v3_clientops.dir_inode_ops */
+ .create = nfs_create,
+ .lookup = nfs_lookup,
+ .link = nfs_link,
+ .unlink = nfs_unlink,
+ .symlink = nfs_symlink,
+ .mkdir = nfs_mkdir,
+ .rmdir = nfs_rmdir,
+ .mknod = nfs_mknod,
+ .rename = nfs_rename,
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+#ifdef CONFIG_NFS_V3_ACL /* the three hooks below exist only when NFSv3 ACL support is configured */
+ .listxattr = nfs3_listxattr,
+ .get_acl = nfs3_get_acl,
+ .set_acl = nfs3_set_acl,
+#endif
+};
+
+static const struct inode_operations nfs3_file_inode_operations = { /* installed as nfs_v3_clientops.file_inode_ops */
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+#ifdef CONFIG_NFS_V3_ACL /* the three hooks below exist only when NFSv3 ACL support is configured */
+ .listxattr = nfs3_listxattr,
+ .get_acl = nfs3_get_acl,
+ .set_acl = nfs3_set_acl,
+#endif
+};
+
+const struct nfs_rpc_ops nfs_v3_clientops = { /* NFSv3 operations table; referenced by nfs_v3.rpc_ops */
+ .version = 3, /* protocol version */
+ .dentry_ops = &nfs_dentry_operations,
+ .dir_inode_ops = &nfs3_dir_inode_operations,
+ .file_inode_ops = &nfs3_file_inode_operations,
+ .file_ops = &nfs_file_operations,
+ .nlmclnt_ops = &nlmclnt_fl_close_lock_ops, /* NLM hooks keyed on NFS_CONTEXT_UNLOCK */
+ .getroot = nfs3_proc_get_root,
+ .submount = nfs_submount,
+ .try_get_tree = nfs_try_get_tree,
+ .getattr = nfs3_proc_getattr,
+ .setattr = nfs3_proc_setattr,
+ .lookup = nfs3_proc_lookup,
+ .access = nfs3_proc_access,
+ .readlink = nfs3_proc_readlink,
+ .create = nfs3_proc_create,
+ .remove = nfs3_proc_remove, /* synchronous REMOVE */
+ .unlink_setup = nfs3_proc_unlink_setup, /* unlink_* hooks also use the REMOVE procedure */
+ .unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare,
+ .unlink_done = nfs3_proc_unlink_done,
+ .rename_setup = nfs3_proc_rename_setup,
+ .rename_rpc_prepare = nfs3_proc_rename_rpc_prepare,
+ .rename_done = nfs3_proc_rename_done,
+ .link = nfs3_proc_link,
+ .symlink = nfs3_proc_symlink,
+ .mkdir = nfs3_proc_mkdir,
+ .rmdir = nfs3_proc_rmdir,
+ .readdir = nfs3_proc_readdir, /* serves both READDIR and READDIRPLUS */
+ .mknod = nfs3_proc_mknod,
+ .statfs = nfs3_proc_statfs,
+ .fsinfo = nfs3_proc_fsinfo,
+ .pathconf = nfs3_proc_pathconf,
+ .decode_dirent = nfs3_decode_dirent,
+ .pgio_rpc_prepare = nfs3_proc_pgio_rpc_prepare,
+ .read_setup = nfs3_proc_read_setup,
+ .read_done = nfs3_read_done,
+ .write_setup = nfs3_proc_write_setup,
+ .write_done = nfs3_write_done,
+ .commit_setup = nfs3_proc_commit_setup,
+ .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare,
+ .commit_done = nfs3_commit_done,
+ .lock = nfs3_proc_lock,
+ .clear_acl_cache = forget_all_cached_acls,
+ .close_context = nfs_close_context,
+ .have_delegation = nfs3_have_delegation, /* always returns 0 */
+ .alloc_client = nfs_alloc_client,
+ .init_client = nfs_init_client,
+ .free_client = nfs_free_client,
+ .create_server = nfs3_create_server,
+ .clone_server = nfs3_clone_server,
+};
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
new file mode 100644
index 000000000..7c5809431
--- /dev/null
+++ b/fs/nfs/nfs3super.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2012 Netapp, Inc. All rights reserved.
+ */
+#include <linux/module.h>
+#include <linux/nfs_fs.h>
+#include "internal.h"
+#include "nfs3_fs.h"
+#include "nfs.h"
+
+struct nfs_subversion nfs_v3 = { /* descriptor passed to register_nfs_version()/unregister_nfs_version() */
+ .owner = THIS_MODULE,
+ .nfs_fs = &nfs_fs_type,
+ .rpc_vers = &nfs_version3,
+ .rpc_ops = &nfs_v3_clientops, /* NFSv3 operations table */
+ .sops = &nfs_sops,
+#ifdef CONFIG_NFS_V3_ACL /* xattr handlers are only set when NFSv3 ACL support is configured */
+ .xattr = nfs3_xattr_handlers,
+#endif
+};
+
+/* Module init: register the NFSv3 subversion descriptor; always returns 0. */
+static int __init
+init_nfs_v3(void)
+{
+	register_nfs_version(&nfs_v3);
+
+	return 0;
+}
+
+/* Module exit: unregister the NFSv3 subversion descriptor. */
+static void __exit
+exit_nfs_v3(void)
+{
+	unregister_nfs_version(&nfs_v3);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_nfs_v3);
+module_exit(exit_nfs_v3);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
new file mode 100644
index 000000000..509f32845
--- /dev/null
+++ b/fs/nfs/nfs3xdr.c
@@ -0,0 +1,2576 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/nfs3xdr.c
+ *
+ * XDR functions to encode/decode NFSv3 RPC arguments and results.
+ *
+ * Copyright (C) 1996, 1997 Olaf Kirch
+ */
+
+#include <linux/param.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/kdev_t.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfsacl.h>
+#include "nfstrace.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_XDR
+
+/* Mapping from NFS error code to "errno" error code. */
+#define errno_NFSERR_IO EIO
+
+/*
+ * Declare the space requirements for NFS arguments and replies as
+ * number of 32bit-words
+ */
+#define NFS3_fhandle_sz (1+16) /* 17 words = 68 bytes */
+#define NFS3_fh_sz (NFS3_fhandle_sz) /* shorthand */
+#define NFS3_post_op_fh_sz (1+NFS3_fh_sz)
+#define NFS3_sattr_sz (15) /* worst case of encode_sattr3(): 6 discriminators + 9 words of values */
+#define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2))
+#define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2))
+#define NFS3_fattr_sz (21) /* the 21 words consumed by decode_fattr3() */
+#define NFS3_cookieverf_sz (NFS3_COOKIEVERFSIZE>>2) /* bytes converted to words */
+#define NFS3_wcc_attr_sz (6) /* size3 (2) + two nfstime3 (2 each) */
+#define NFS3_pre_op_attr_sz (1+NFS3_wcc_attr_sz)
+#define NFS3_post_op_attr_sz (1+NFS3_fattr_sz) /* discriminator word + fattr3 */
+#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz)
+#define NFS3_diropargs_sz (NFS3_fh_sz+NFS3_filename_sz)
+
+#define NFS3_getattrargs_sz (NFS3_fh_sz) /* *args_sz: sizes of encoded call arguments */
+#define NFS3_setattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3)
+#define NFS3_lookupargs_sz (NFS3_fh_sz+NFS3_filename_sz)
+#define NFS3_accessargs_sz (NFS3_fh_sz+1)
+#define NFS3_readlinkargs_sz (NFS3_fh_sz)
+#define NFS3_readargs_sz (NFS3_fh_sz+3)
+#define NFS3_writeargs_sz (NFS3_fh_sz+5)
+#define NFS3_createargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz)
+#define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz)
+#define NFS3_symlinkargs_sz (NFS3_diropargs_sz+1+NFS3_sattr_sz)
+#define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz)
+#define NFS3_removeargs_sz (NFS3_fh_sz+NFS3_filename_sz)
+#define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz)
+#define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz)
+#define NFS3_readdirargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+3)
+#define NFS3_readdirplusargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+4)
+#define NFS3_commitargs_sz (NFS3_fh_sz+3)
+
+#define NFS3_getattrres_sz (1+NFS3_fattr_sz) /* *res_sz: sizes of decoded replies */
+#define NFS3_setattrres_sz (1+NFS3_wcc_data_sz)
+#define NFS3_removeres_sz (NFS3_setattrres_sz)
+#define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
+#define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1)
+#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+1)
+#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+1)
+#define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4)
+#define NFS3_createres_sz (1+NFS3_post_op_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
+#define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz))
+#define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
+#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+1)
+#define NFS3_fsstatres_sz (1+NFS3_post_op_attr_sz+13)
+#define NFS3_fsinfores_sz (1+NFS3_post_op_attr_sz+12)
+#define NFS3_pathconfres_sz (1+NFS3_post_op_attr_sz+6)
+#define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2)
+
+#define ACL3_getaclargs_sz (NFS3_fh_sz+1) /* ACL3_*: same convention for the ACL calls */
+#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \
+ XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
+#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \
+ XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+1)
+#define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz)
+
+static int nfs3_stat_to_errno(enum nfs_stat);
+
+/*
+ * Map file type to S_IFMT bits
+ */
+static const umode_t nfs_type2fmt[] = { /* indexed by ftype3 value; read by xdr_decode_ftype3() */
+ [NF3BAD] = 0,
+ [NF3REG] = S_IFREG,
+ [NF3DIR] = S_IFDIR,
+ [NF3BLK] = S_IFBLK,
+ [NF3CHR] = S_IFCHR,
+ [NF3LNK] = S_IFLNK,
+ [NF3SOCK] = S_IFSOCK,
+ [NF3FIFO] = S_IFIFO,
+};
+
+/*
+ * Return the user namespace of @clnt's credential, or &init_user_ns when
+ * @clnt is NULL or has no credential.
+ */
+static struct user_namespace *rpc_userns(const struct rpc_clnt *clnt)
+{
+	if (clnt == NULL || clnt->cl_cred == NULL)
+		return &init_user_ns;
+
+	return clnt->cl_cred->user_ns;
+}
+
+/*
+ * Return the user namespace for the RPC client owning @rqstp's task, or
+ * &init_user_ns when the request has no task.
+ */
+static struct user_namespace *rpc_rqst_userns(const struct rpc_rqst *rqstp)
+{
+	if (rqstp->rq_task == NULL)
+		return &init_user_ns;
+
+	return rpc_userns(rqstp->rq_task->tk_client);
+}
+
+/*
+ * Encode/decode NFSv3 basic data types
+ *
+ * Basic NFSv3 data types are defined in section 2.5 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions. For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+
+/* Append @value to the stream as one big-endian 32-bit word. */
+static void encode_uint32(struct xdr_stream *xdr, u32 value)
+{
+	__be32 *slot;
+
+	slot = xdr_reserve_space(xdr, 4);
+	slot[0] = cpu_to_be32(value);
+}
+
+/*
+ * Read one big-endian 32-bit word from the stream into @value.
+ * Returns 0, or -EIO when the stream cannot supply 4 bytes.
+ */
+static int decode_uint32(struct xdr_stream *xdr, u32 *value)
+{
+	__be32 *word = xdr_inline_decode(xdr, 4);
+
+	if (unlikely(word == NULL))
+		return -EIO;
+
+	*value = be32_to_cpup(word);
+	return 0;
+}
+
+/*
+ * Read one 64-bit XDR hyper from the stream into @value.
+ * Returns 0, or -EIO when the stream cannot supply 8 bytes.
+ */
+static int decode_uint64(struct xdr_stream *xdr, u64 *value)
+{
+	__be32 *words = xdr_inline_decode(xdr, 8);
+
+	if (unlikely(words == NULL))
+		return -EIO;
+
+	xdr_decode_hyper(words, value);
+	return 0;
+}
+
+/*
+ * fileid3
+ *
+ * typedef uint64 fileid3;
+ */
+/* Decode a fileid3 at @p as an XDR hyper; returns what xdr_decode_hyper() returns. */
+static __be32 *
+xdr_decode_fileid3(__be32 *p, u64 *fileid)
+{
+	return xdr_decode_hyper(p, fileid);
+}
+
+/* Stream variant: a fileid3 is decoded exactly like a uint64. */
+static int
+decode_fileid3(struct xdr_stream *xdr, u64 *fileid)
+{
+	return decode_uint64(xdr, fileid);
+}
+
+/*
+ * filename3
+ *
+ * typedef string filename3<>;
+ */
+/*
+ * Append @name (@length bytes) to the stream as an XDR opaque.
+ * Warns once if @length exceeds NFS3_MAXNAMLEN but encodes it regardless.
+ */
+static void encode_filename3(struct xdr_stream *xdr,
+			     const char *name, u32 length)
+{
+	__be32 *dst;
+
+	WARN_ON_ONCE(length > NFS3_MAXNAMLEN);
+	/* 4 bytes for the length word, then the name bytes. */
+	dst = xdr_reserve_space(xdr, 4 + length);
+	xdr_encode_opaque(dst, name, length);
+}
+
+/*
+ * Decode a filename3 in place: on success *@name points into the receive
+ * buffer and *@length holds its byte count.
+ *
+ * Returns 0, -EIO when the stream is too short, or -ENAMETOOLONG when the
+ * encoded length exceeds NFS3_MAXNAMLEN.
+ */
+static int decode_inline_filename3(struct xdr_stream *xdr,
+				   const char **name, u32 *length)
+{
+	__be32 *p;
+	u32 namelen;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return -EIO;
+
+	namelen = be32_to_cpup(p);
+	if (namelen > NFS3_MAXNAMLEN) {
+		dprintk("NFS: returned filename too long: %u\n", namelen);
+		return -ENAMETOOLONG;
+	}
+
+	p = xdr_inline_decode(xdr, namelen);
+	if (unlikely(p == NULL))
+		return -EIO;
+
+	*name = (const char *)p;
+	*length = namelen;
+	return 0;
+}
+
+/*
+ * nfspath3
+ *
+ * typedef string nfspath3<>;
+ */
+/*
+ * Encode an nfspath3: the length word goes inline, the path bytes are
+ * attached from @pages starting at offset 0.
+ */
+static void
+encode_nfspath3(struct xdr_stream *xdr, struct page **pages, const u32 length)
+{
+	encode_uint32(xdr, length);
+	xdr_write_pages(xdr, pages, 0, length);
+}
+
+/*
+ * Decode an nfspath3 whose bytes land in the receive pages, and terminate
+ * the string there.
+ *
+ * Returns 0, -EIO when the stream is too short or fewer bytes arrived than
+ * the reply claims, or -ENAMETOOLONG when the claimed length does not fit
+ * the page buffer or exceeds NFS3_MAXPATHLEN.
+ */
+static int decode_nfspath3(struct xdr_stream *xdr)
+{
+	u32 pathlen, received;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return -EIO;
+
+	pathlen = be32_to_cpup(p);
+	if (unlikely(pathlen >= xdr->buf->page_len ||
+		     pathlen > NFS3_MAXPATHLEN)) {
+		dprintk("NFS: returned pathname too long: %u\n", pathlen);
+		return -ENAMETOOLONG;
+	}
+
+	received = xdr_read_pages(xdr, pathlen);
+	if (unlikely(pathlen > received)) {
+		dprintk("NFS: server cheating in pathname result: "
+			"count %u > recvd %u\n", pathlen, received);
+		return -EIO;
+	}
+
+	xdr_terminate_string(xdr->buf, pathlen);
+	return 0;
+}
+
+/*
+ * cookie3
+ *
+ * typedef uint64 cookie3;
+ */
+/* Encode @cookie at @p as an XDR hyper; returns what xdr_encode_hyper() returns. */
+static __be32 *
+xdr_encode_cookie3(__be32 *p, u64 cookie)
+{
+	return xdr_encode_hyper(p, cookie);
+}
+
+/* Stream variant: a cookie3 is decoded exactly like a uint64. */
+static int
+decode_cookie3(struct xdr_stream *xdr, u64 *cookie)
+{
+	return decode_uint64(xdr, cookie);
+}
+
+/*
+ * cookieverf3
+ *
+ * typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE];
+ */
+/*
+ * Copy the NFS3_COOKIEVERFSIZE-byte @verifier to @p and return the position
+ * just past it.
+ */
+static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier)
+{
+	__be32 *next = p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE);
+
+	memcpy(p, verifier, NFS3_COOKIEVERFSIZE);
+	return next;
+}
+
+/*
+ * Copy a NFS3_COOKIEVERFSIZE-byte cookie verifier from the stream into
+ * @verifier.  Returns 0, or -EIO when the stream is too short.
+ */
+static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier)
+{
+	__be32 *src = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
+
+	if (unlikely(src == NULL))
+		return -EIO;
+
+	memcpy(verifier, src, NFS3_COOKIEVERFSIZE);
+	return 0;
+}
+
+/*
+ * createverf3
+ *
+ * typedef opaque createverf3[NFS3_CREATEVERFSIZE];
+ */
+/* Append the NFS3_CREATEVERFSIZE-byte create verifier to the stream. */
+static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier)
+{
+	__be32 *dst = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE);
+
+	memcpy(dst, verifier, NFS3_CREATEVERFSIZE);
+}
+
+/*
+ * Copy a NFS3_WRITEVERFSIZE-byte write verifier from the stream into
+ * verifier->data.  Returns 0, or -EIO when the stream is too short.
+ */
+static int decode_writeverf3(struct xdr_stream *xdr,
+			     struct nfs_write_verifier *verifier)
+{
+	__be32 *src = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE);
+
+	if (unlikely(src == NULL))
+		return -EIO;
+
+	memcpy(verifier->data, src, NFS3_WRITEVERFSIZE);
+	return 0;
+}
+
+/*
+ * size3
+ *
+ * typedef uint64 size3;
+ */
+/* Decode a size3 at @p as an XDR hyper; returns what xdr_decode_hyper() returns. */
+static __be32 *
+xdr_decode_size3(__be32 *p, u64 *size)
+{
+	return xdr_decode_hyper(p, size);
+}
+
+/*
+ * nfsstat3
+ *
+ * enum nfsstat3 {
+ * NFS3_OK = 0,
+ * ...
+ * }
+ */
+#define NFS3_OK NFS_OK
+
+/*
+ * Decode the nfsstat3 word of a reply into *@status.
+ *
+ * Returns -EIO only when the stream is too short.  A non-OK status is not a
+ * decode failure: it is stored in *@status, traced, and 0 is returned.
+ */
+static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status)
+{
+	__be32 *p = xdr_inline_decode(xdr, 4);
+
+	if (unlikely(p == NULL))
+		return -EIO;
+
+	if (likely(*p == cpu_to_be32(NFS3_OK))) {
+		*status = 0;
+	} else {
+		*status = be32_to_cpup(p);
+		trace_nfs_xdr_status(xdr, (int)*status);
+	}
+	return 0;
+}
+
+/*
+ * ftype3
+ *
+ * enum ftype3 {
+ * NF3REG = 1,
+ * NF3DIR = 2,
+ * NF3BLK = 3,
+ * NF3CHR = 4,
+ * NF3LNK = 5,
+ * NF3SOCK = 6,
+ * NF3FIFO = 7
+ * };
+ */
+/* An ftype3 goes on the wire as a plain 32-bit word. */
+static void
+encode_ftype3(struct xdr_stream *xdr, const u32 type)
+{
+	encode_uint32(xdr, type);
+}
+
+/*
+ * Decode an ftype3 at @p and store the matching S_IFMT bits in *@mode.
+ * Values above NF3FIFO are looked up as NF3NON.  Returns the position of
+ * the next word.
+ */
+static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode)
+{
+	u32 type = be32_to_cpup(p);
+
+	*mode = nfs_type2fmt[type > NF3FIFO ? NF3NON : type];
+	return p + 1;
+}
+
+/*
+ * specdata3
+ *
+ * struct specdata3 {
+ * uint32 specdata1;
+ * uint32 specdata2;
+ * };
+ */
+/* Append @rdev to the stream as two words: major number, then minor number. */
+static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev)
+{
+	__be32 *pair = xdr_reserve_space(xdr, 8);
+
+	pair[0] = cpu_to_be32(MAJOR(rdev));
+	pair[1] = cpu_to_be32(MINOR(rdev));
+}
+
+/*
+ * Decode a specdata3 (major, minor) at @p into *@rdev.
+ * If the pair does not survive a round trip through MKDEV(), *@rdev is set
+ * to 0.  Returns the position just past the two words.
+ */
+static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev)
+{
+	const unsigned int major = be32_to_cpup(p);
+	const unsigned int minor = be32_to_cpup(p + 1);
+	dev_t dev = MKDEV(major, minor);
+
+	if (MAJOR(dev) != major || MINOR(dev) != minor)
+		dev = 0;
+
+	*rdev = dev;
+	return p + 2;
+}
+
+/*
+ * nfs_fh3
+ *
+ * struct nfs_fh3 {
+ * opaque data<NFS3_FHSIZE>;
+ * };
+ */
+/*
+ * Append the file handle @fh to the stream as an XDR opaque.
+ * Warns once if fh->size exceeds NFS3_FHSIZE but encodes it regardless.
+ */
+static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+	__be32 *dst;
+
+	WARN_ON_ONCE(fh->size > NFS3_FHSIZE);
+	/* 4 bytes for the length word, then the handle bytes. */
+	dst = xdr_reserve_space(xdr, 4 + fh->size);
+	xdr_encode_opaque(dst, fh->data, fh->size);
+}
+
+/*
+ * Decode an nfs_fh3 from the stream into @fh (size and data).
+ *
+ * Returns 0, -EIO when the stream is too short, or -E2BIG when the encoded
+ * length exceeds NFS3_FHSIZE; @fh is left unmodified on failure.
+ */
+static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *p;
+	u32 fhlen;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return -EIO;
+
+	fhlen = be32_to_cpup(p);
+	if (unlikely(fhlen > NFS3_FHSIZE)) {
+		dprintk("NFS: file handle size (%u) too big\n", fhlen);
+		return -E2BIG;
+	}
+
+	p = xdr_inline_decode(xdr, fhlen);
+	if (unlikely(p == NULL))
+		return -EIO;
+
+	fh->size = fhlen;
+	memcpy(fh->data, p, fhlen);
+	return 0;
+}
+
+/* Clear every byte of the file handle structure @fh. */
+static void zero_nfs_fh3(struct nfs_fh *fh)
+{
+	memset(fh, 0, sizeof(struct nfs_fh));
+}
+
+/*
+ * nfstime3
+ *
+ * struct nfstime3 {
+ * uint32 seconds;
+ * uint32 nseconds;
+ * };
+ */
+/*
+ * Encode @timep at @p as an nfstime3: seconds (truncated to 32 bits), then
+ * nanoseconds.  Returns the position just past the two words.
+ */
+static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec64 *timep)
+{
+	p[0] = cpu_to_be32((u32)timep->tv_sec);
+	p[1] = cpu_to_be32(timep->tv_nsec);
+	return p + 2;
+}
+
+/*
+ * Decode an nfstime3 at @p into @timep: seconds first, then nanoseconds.
+ * Returns the position just past the two words.
+ */
+static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec64 *timep)
+{
+	timep->tv_sec = be32_to_cpup(p);
+	timep->tv_nsec = be32_to_cpup(p + 1);
+	return p + 2;
+}
+
+/*
+ * sattr3
+ *
+ * enum time_how {
+ * DONT_CHANGE = 0,
+ * SET_TO_SERVER_TIME = 1,
+ * SET_TO_CLIENT_TIME = 2
+ * };
+ *
+ * union set_mode3 switch (bool set_it) {
+ * case TRUE:
+ * mode3 mode;
+ * default:
+ * void;
+ * };
+ *
+ * union set_uid3 switch (bool set_it) {
+ * case TRUE:
+ * uid3 uid;
+ * default:
+ * void;
+ * };
+ *
+ * union set_gid3 switch (bool set_it) {
+ * case TRUE:
+ * gid3 gid;
+ * default:
+ * void;
+ * };
+ *
+ * union set_size3 switch (bool set_it) {
+ * case TRUE:
+ * size3 size;
+ * default:
+ * void;
+ * };
+ *
+ * union set_atime switch (time_how set_it) {
+ * case SET_TO_CLIENT_TIME:
+ * nfstime3 atime;
+ * default:
+ * void;
+ * };
+ *
+ * union set_mtime switch (time_how set_it) {
+ * case SET_TO_CLIENT_TIME:
+ * nfstime3 mtime;
+ * default:
+ * void;
+ * };
+ *
+ * struct sattr3 {
+ * set_mode3 mode;
+ * set_uid3 uid;
+ * set_gid3 gid;
+ * set_size3 size;
+ * set_atime atime;
+ * set_mtime mtime;
+ * };
+ */
+/*
+ * Append @attr to the stream as a sattr3.
+ *
+ * Each of the six fields starts with a discriminator word.  Mode, uid, gid
+ * and size follow it with their value when the matching ATTR_* bit is set in
+ * attr->ia_valid.  For atime and mtime the discriminator is 2 followed by
+ * the time when ATTR_*_SET is set, 1 with no value when only ATTR_ATIME or
+ * ATTR_MTIME is set, and 0 otherwise.  uid and gid are translated through
+ * @userns.
+ */
+static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr,
+		struct user_namespace *userns)
+{
+	const unsigned int valid = attr->ia_valid;
+	u32 nbytes;
+	__be32 *p;
+
+	/*
+	 * Reserve everything in one go: six discriminator words are always
+	 * present, plus the payload of every field that is actually set.
+	 */
+	nbytes = 6 * 4;
+	nbytes += (valid & ATTR_MODE) ? 4 : 0;
+	nbytes += (valid & ATTR_UID) ? 4 : 0;
+	nbytes += (valid & ATTR_GID) ? 4 : 0;
+	nbytes += (valid & ATTR_SIZE) ? 8 : 0;
+	nbytes += (valid & ATTR_ATIME_SET) ? 8 : 0;
+	nbytes += (valid & ATTR_MTIME_SET) ? 8 : 0;
+	p = xdr_reserve_space(xdr, nbytes);
+
+	/* set_mode3 */
+	*p++ = (valid & ATTR_MODE) ? xdr_one : xdr_zero;
+	if (valid & ATTR_MODE)
+		*p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO);
+
+	/* set_uid3 */
+	*p++ = (valid & ATTR_UID) ? xdr_one : xdr_zero;
+	if (valid & ATTR_UID)
+		*p++ = cpu_to_be32(from_kuid_munged(userns, attr->ia_uid));
+
+	/* set_gid3 */
+	*p++ = (valid & ATTR_GID) ? xdr_one : xdr_zero;
+	if (valid & ATTR_GID)
+		*p++ = cpu_to_be32(from_kgid_munged(userns, attr->ia_gid));
+
+	/* set_size3 */
+	*p++ = (valid & ATTR_SIZE) ? xdr_one : xdr_zero;
+	if (valid & ATTR_SIZE)
+		p = xdr_encode_hyper(p, (u64)attr->ia_size);
+
+	/* set_atime */
+	if (valid & ATTR_ATIME_SET) {
+		*p++ = xdr_two;
+		p = xdr_encode_nfstime3(p, &attr->ia_atime);
+	} else {
+		*p++ = (valid & ATTR_ATIME) ? xdr_one : xdr_zero;
+	}
+
+	/* set_mtime: last field, so the cursor is not needed afterwards */
+	if (valid & ATTR_MTIME_SET) {
+		*p++ = xdr_two;
+		xdr_encode_nfstime3(p, &attr->ia_mtime);
+	} else {
+		*p = (valid & ATTR_MTIME) ? xdr_one : xdr_zero;
+	}
+}
+
+/*
+ * fattr3
+ *
+ * struct fattr3 {
+ * ftype3 type;
+ * mode3 mode;
+ * uint32 nlink;
+ * uid3 uid;
+ * gid3 gid;
+ * size3 size;
+ * size3 used;
+ * specdata3 rdev;
+ * uint64 fsid;
+ * fileid3 fileid;
+ * nfstime3 atime;
+ * nfstime3 mtime;
+ * nfstime3 ctime;
+ * };
+ */
+/*
+ * Decode a fattr3 from the stream into @fattr and mark it with
+ * NFS_ATTR_FATTR_V3.  uid and gid are mapped through @userns.
+ *
+ * Returns 0, -EIO when the stream is too short, or -EINVAL when the uid or
+ * gid does not map to a valid id.  On -EINVAL the fields decoded before the
+ * failing one have already been written to @fattr.
+ */
+static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		struct user_namespace *userns)
+{
+	umode_t fmode;
+	__be32 *p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2);
+
+	if (unlikely(p == NULL))
+		return -EIO;
+
+	p = xdr_decode_ftype3(p, &fmode);
+
+	/* The file type bits come from ftype3, not from the mode word. */
+	fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode;
+	fattr->nlink = be32_to_cpup(p++);
+
+	fattr->uid = make_kuid(userns, be32_to_cpup(p++));
+	if (!uid_valid(fattr->uid)) {
+		dprintk("NFS: returned invalid uid\n");
+		return -EINVAL;
+	}
+
+	fattr->gid = make_kgid(userns, be32_to_cpup(p++));
+	if (!gid_valid(fattr->gid)) {
+		dprintk("NFS: returned invalid gid\n");
+		return -EINVAL;
+	}
+
+	p = xdr_decode_size3(p, &fattr->size);
+	p = xdr_decode_size3(p, &fattr->du.nfs3.used);
+	p = xdr_decode_specdata3(p, &fattr->rdev);
+
+	/* The 64-bit fsid fills the major part; the minor part is zeroed. */
+	p = xdr_decode_hyper(p, &fattr->fsid.major);
+	fattr->fsid.minor = 0;
+
+	p = xdr_decode_fileid3(p, &fattr->fileid);
+	p = xdr_decode_nfstime3(p, &fattr->atime);
+	p = xdr_decode_nfstime3(p, &fattr->mtime);
+	xdr_decode_nfstime3(p, &fattr->ctime);
+	/* The change attribute is derived from ctime. */
+	fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime);
+
+	fattr->valid |= NFS_ATTR_FATTR_V3;
+	return 0;
+}
+
+/*
+ * post_op_attr
+ *
+ * union post_op_attr switch (bool attributes_follow) {
+ * case TRUE:
+ * fattr3 attributes;
+ * case FALSE:
+ * void;
+ * };
+ */
+/*
+ * Decode a post_op_attr: a 4-byte attributes_follow flag, then a fattr3
+ * when the flag is non-zero.  Returns 0 when no attributes follow, -EIO
+ * if the flag cannot be read, otherwise the result of decode_fattr3().
+ */
+static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+			       struct user_namespace *userns)
+{
+	__be32 *follows = xdr_inline_decode(xdr, 4);
+
+	if (unlikely(!follows))
+		return -EIO;
+	if (*follows == xdr_zero)
+		return 0;
+	return decode_fattr3(xdr, fattr, userns);
+}
+
+/*
+ * wcc_attr
+ * struct wcc_attr {
+ * size3 size;
+ * nfstime3 mtime;
+ * nfstime3 ctime;
+ * };
+ */
+/*
+ * Decode a wcc_attr (NFS3_wcc_attr_sz XDR words) into the pre_* fields of
+ * @fattr and mark those fields valid.  Returns 0, or -EIO if the stream
+ * is too short (in which case @fattr is left untouched).
+ */
+static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+	__be32 *cur = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2);
+
+	if (unlikely(cur == NULL))
+		return -EIO;
+
+	fattr->valid |= NFS_ATTR_FATTR_PRESIZE |
+			NFS_ATTR_FATTR_PRECHANGE |
+			NFS_ATTR_FATTR_PREMTIME |
+			NFS_ATTR_FATTR_PRECTIME;
+
+	cur = xdr_decode_size3(cur, &fattr->pre_size);
+	cur = xdr_decode_nfstime3(cur, &fattr->pre_mtime);
+	xdr_decode_nfstime3(cur, &fattr->pre_ctime);
+
+	/* the pre-op change attribute is derived from the pre-op ctime */
+	fattr->pre_change_attr = nfs_timespec_to_change_attr(&fattr->pre_ctime);
+	return 0;
+}
+
+/*
+ * pre_op_attr
+ * union pre_op_attr switch (bool attributes_follow) {
+ * case TRUE:
+ * wcc_attr attributes;
+ * case FALSE:
+ * void;
+ * };
+ *
+ * wcc_data
+ *
+ * struct wcc_data {
+ * pre_op_attr before;
+ * post_op_attr after;
+ * };
+ */
+/*
+ * Decode a pre_op_attr: a 4-byte attributes_follow flag, then a wcc_attr
+ * when the flag is non-zero.  Returns 0 when nothing follows, -EIO if the
+ * flag cannot be read, otherwise the result of decode_wcc_attr().
+ */
+static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+	__be32 *follows = xdr_inline_decode(xdr, 4);
+
+	if (unlikely(!follows))
+		return -EIO;
+	if (*follows == xdr_zero)
+		return 0;
+	return decode_wcc_attr(xdr, fattr);
+}
+
+/*
+ * Decode a wcc_data: the pre_op_attr followed by the post_op_attr, both
+ * into the same @fattr.  Stops at, and returns, the first decode error.
+ */
+static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+			   struct user_namespace *userns)
+{
+	int status = decode_pre_op_attr(xdr, fattr);
+
+	if (unlikely(status))
+		return status;
+	return decode_post_op_attr(xdr, fattr, userns);
+}
+
+/*
+ * post_op_fh3
+ *
+ * union post_op_fh3 switch (bool handle_follows) {
+ * case TRUE:
+ * nfs_fh3 handle;
+ * case FALSE:
+ * void;
+ * };
+ */
+/*
+ * Decode a post_op_fh3: a 4-byte handle_follows flag, then an nfs_fh3
+ * when the flag is non-zero.  When no handle follows, @fh is cleared with
+ * zero_nfs_fh3() and 0 is returned.  Returns -EIO if the flag cannot be
+ * read, otherwise the result of decode_nfs_fh3().
+ */
+static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *follows = xdr_inline_decode(xdr, 4);
+
+	if (unlikely(!follows))
+		return -EIO;
+	if (*follows == xdr_zero) {
+		zero_nfs_fh3(fh);
+		return 0;
+	}
+	return decode_nfs_fh3(xdr, fh);
+}
+
+/*
+ * diropargs3
+ *
+ * struct diropargs3 {
+ * nfs_fh3 dir;
+ * filename3 name;
+ * };
+ */
+/*
+ * Encode a diropargs3: the directory file handle @fh followed by the
+ * entry name @name of @length bytes, in that wire order.
+ */
+static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh,
+ const char *name, u32 length)
+{
+ encode_nfs_fh3(xdr, fh);
+ encode_filename3(xdr, name, length);
+}
+
+
+/*
+ * NFSv3 XDR encode functions
+ *
+ * NFSv3 argument types are defined in section 3.3 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
+ */
+
+/*
+ * 3.3.1 GETATTR3args
+ *
+ * struct GETATTR3args {
+ * nfs_fh3 object;
+ * };
+ */
+/*
+ * Encode GETATTR3args.  @data is the file handle itself (a struct nfs_fh),
+ * not a wrapper structure; @req is not used.
+ */
+static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_fh *fh = data;
+
+ encode_nfs_fh3(xdr, fh);
+}
+
+/*
+ * 3.3.2 SETATTR3args
+ *
+ * union sattrguard3 switch (bool check) {
+ * case TRUE:
+ * nfstime3 obj_ctime;
+ * case FALSE:
+ * void;
+ * };
+ *
+ * struct SETATTR3args {
+ * nfs_fh3 object;
+ * sattr3 new_attributes;
+ * sattrguard3 guard;
+ * };
+ */
+/*
+ * Encode the sattrguard3 union.  With args->guard clear only the FALSE
+ * discriminant (4 bytes) is written; otherwise the TRUE discriminant is
+ * followed by args->guardtime as an 8-byte nfstime3.
+ */
+static void encode_sattrguard3(struct xdr_stream *xdr,
+			       const struct nfs3_sattrargs *args)
+{
+	__be32 *out;
+
+	if (!args->guard) {
+		out = xdr_reserve_space(xdr, 4);
+		*out = xdr_zero;
+		return;
+	}
+
+	out = xdr_reserve_space(xdr, 4 + 8);
+	*out++ = xdr_one;
+	xdr_encode_nfstime3(out, &args->guardtime);
+}
+
+/*
+ * Encode SETATTR3args from a struct nfs3_sattrargs passed in @data:
+ * file handle, new attributes (ids translated through the request's user
+ * namespace), then the guard.
+ */
+static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_sattrargs *args = data;
+ encode_nfs_fh3(xdr, args->fh);
+ encode_sattr3(xdr, args->sattr, rpc_rqst_userns(req));
+ encode_sattrguard3(xdr, args);
+}
+
+/*
+ * 3.3.3 LOOKUP3args
+ *
+ * struct LOOKUP3args {
+ * diropargs3 what;
+ * };
+ */
+/*
+ * Encode LOOKUP3args from a struct nfs3_diropargs passed in @data:
+ * a single diropargs3 (directory handle plus name).  @req is not used.
+ */
+static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_diropargs *args = data;
+
+ encode_diropargs3(xdr, args->fh, args->name, args->len);
+}
+
+/*
+ * 3.3.4 ACCESS3args
+ *
+ * struct ACCESS3args {
+ * nfs_fh3 object;
+ * uint32 access;
+ * };
+ */
+/*
+ * Encode the body of ACCESS3args: the object's file handle followed by
+ * the 32-bit access mask from @args.
+ */
+static void encode_access3args(struct xdr_stream *xdr,
+ const struct nfs3_accessargs *args)
+{
+ encode_nfs_fh3(xdr, args->fh);
+ encode_uint32(xdr, args->access);
+}
+
+/*
+ * Encoder entry point for ACCESS: casts @data to struct nfs3_accessargs
+ * and defers to encode_access3args().  @req is not used.
+ */
+static void nfs3_xdr_enc_access3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_accessargs *args = data;
+
+ encode_access3args(xdr, args);
+}
+
+/*
+ * 3.3.5 READLINK3args
+ *
+ * struct READLINK3args {
+ * nfs_fh3 symlink;
+ * };
+ */
+/*
+ * Encode READLINK3args (just the symlink's file handle) and register the
+ * caller's pages with the request for the reply.
+ */
+static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_readlinkargs *args = data;
+
+ encode_nfs_fh3(xdr, args->fh);
+ /*
+ * NOTE(review): presumably this makes the reply's path data land in
+ * args->pages, with NFS3_readlinkres_sz describing the reply header
+ * that precedes it - confirm against rpc_prepare_reply_pages().
+ */
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->pglen, NFS3_readlinkres_sz);
+}
+
+/*
+ * 3.3.6 READ3args
+ *
+ * struct READ3args {
+ * nfs_fh3 file;
+ * offset3 offset;
+ * count3 count;
+ * };
+ */
+/*
+ * Encode the body of READ3args: file handle, 64-bit offset, 32-bit count.
+ */
+static void encode_read3args(struct xdr_stream *xdr,
+ const struct nfs_pgio_args *args)
+{
+ __be32 *p;
+
+ encode_nfs_fh3(xdr, args->fh);
+
+ /* 8 bytes of offset3 followed by 4 bytes of count3 */
+ p = xdr_reserve_space(xdr, 8 + 4);
+ p = xdr_encode_hyper(p, args->offset);
+ *p = cpu_to_be32(args->count);
+}
+
+/*
+ * Encoder entry point for READ.  Encodes READ3args, registers the
+ * caller's pages for args->count bytes of reply data and flags the
+ * receive buffer with XDRBUF_READ.  The reply header length passed along
+ * is args->replen when that is non-zero, NFS3_readres_sz otherwise.
+ */
+static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const void *data)
+{
+	const struct nfs_pgio_args *args = data;
+	unsigned int hdr_words = NFS3_readres_sz;
+
+	if (args->replen)
+		hdr_words = args->replen;
+
+	encode_read3args(xdr, args);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+				args->count, hdr_words);
+	req->rq_rcv_buf.flags |= XDRBUF_READ;
+}
+
+/*
+ * 3.3.7 WRITE3args
+ *
+ * enum stable_how {
+ * UNSTABLE = 0,
+ * DATA_SYNC = 1,
+ * FILE_SYNC = 2
+ * };
+ *
+ * struct WRITE3args {
+ * nfs_fh3 file;
+ * offset3 offset;
+ * count3 count;
+ * stable_how stable;
+ * opaque data<>;
+ * };
+ */
+/*
+ * Encode the body of WRITE3args: file handle, offset, count, stable_how,
+ * then the data itself as a variable-length opaque taken from
+ * args->pages.
+ */
+static void encode_write3args(struct xdr_stream *xdr,
+ const struct nfs_pgio_args *args)
+{
+ __be32 *p;
+
+ encode_nfs_fh3(xdr, args->fh);
+
+ /* offset3 (8) + count3 (4) + stable_how (4) + opaque length (4) */
+ p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4);
+ p = xdr_encode_hyper(p, args->offset);
+ *p++ = cpu_to_be32(args->count);
+ *p++ = cpu_to_be32(args->stable);
+ /* args->count is sent a second time as the length of "opaque data<>" */
+ *p = cpu_to_be32(args->count);
+ xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
+}
+
+/*
+ * Encoder entry point for WRITE: encodes WRITE3args from the
+ * struct nfs_pgio_args in @data and flags the send buffer with
+ * XDRBUF_WRITE.  @req is not used.
+ */
+static void nfs3_xdr_enc_write3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_pgio_args *args = data;
+
+ encode_write3args(xdr, args);
+ xdr->buf->flags |= XDRBUF_WRITE;
+}
+
+/*
+ * 3.3.8 CREATE3args
+ *
+ * enum createmode3 {
+ * UNCHECKED = 0,
+ * GUARDED = 1,
+ * EXCLUSIVE = 2
+ * };
+ *
+ * union createhow3 switch (createmode3 mode) {
+ * case UNCHECKED:
+ * case GUARDED:
+ * sattr3 obj_attributes;
+ * case EXCLUSIVE:
+ * createverf3 verf;
+ * };
+ *
+ * struct CREATE3args {
+ * diropargs3 where;
+ * createhow3 how;
+ * };
+ */
+/*
+ * Encode the createhow3 union: the create mode discriminant, then either
+ * the initial attributes (UNCHECKED and GUARDED) or the create verifier
+ * (EXCLUSIVE).  Any other mode value hits BUG().
+ */
+static void encode_createhow3(struct xdr_stream *xdr,
+ const struct nfs3_createargs *args,
+ struct user_namespace *userns)
+{
+ encode_uint32(xdr, args->createmode);
+ switch (args->createmode) {
+ case NFS3_CREATE_UNCHECKED:
+ case NFS3_CREATE_GUARDED:
+ encode_sattr3(xdr, args->sattr, userns);
+ break;
+ case NFS3_CREATE_EXCLUSIVE:
+ encode_createverf3(xdr, args->verifier);
+ break;
+ default:
+ /* the discriminant has already been written at this point */
+ BUG();
+ }
+}
+
+/*
+ * Encode CREATE3args from a struct nfs3_createargs passed in @data:
+ * the diropargs3 naming the new object, then the createhow3 union.
+ */
+static void nfs3_xdr_enc_create3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_createargs *args = data;
+
+ encode_diropargs3(xdr, args->fh, args->name, args->len);
+ encode_createhow3(xdr, args, rpc_rqst_userns(req));
+}
+
+/*
+ * 3.3.9 MKDIR3args
+ *
+ * struct MKDIR3args {
+ * diropargs3 where;
+ * sattr3 attributes;
+ * };
+ */
+/*
+ * Encode MKDIR3args from a struct nfs3_mkdirargs passed in @data:
+ * the diropargs3 naming the new directory, then its initial attributes.
+ */
+static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_mkdirargs *args = data;
+
+ encode_diropargs3(xdr, args->fh, args->name, args->len);
+ encode_sattr3(xdr, args->sattr, rpc_rqst_userns(req));
+}
+
+/*
+ * 3.3.10 SYMLINK3args
+ *
+ * struct symlinkdata3 {
+ * sattr3 symlink_attributes;
+ * nfspath3 symlink_data;
+ * };
+ *
+ * struct SYMLINK3args {
+ * diropargs3 where;
+ * symlinkdata3 symlink;
+ * };
+ */
+/*
+ * Encode a symlinkdata3: the link's attributes followed by the target
+ * path held in args->pages.  @data is a struct nfs3_symlinkargs.
+ */
+static void encode_symlinkdata3(struct xdr_stream *xdr,
+ const void *data,
+ struct user_namespace *userns)
+{
+ const struct nfs3_symlinkargs *args = data;
+
+ encode_sattr3(xdr, args->sattr, userns);
+ encode_nfspath3(xdr, args->pages, args->pathlen);
+}
+
+/*
+ * Encode SYMLINK3args from a struct nfs3_symlinkargs passed in @data:
+ * the diropargs3 naming the link, then its symlinkdata3.  The send
+ * buffer is flagged with XDRBUF_WRITE afterwards.
+ */
+static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_symlinkargs *args = data;
+
+ encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
+ encode_symlinkdata3(xdr, args, rpc_rqst_userns(req));
+ xdr->buf->flags |= XDRBUF_WRITE;
+}
+
+/*
+ * 3.3.11 MKNOD3args
+ *
+ * struct devicedata3 {
+ * sattr3 dev_attributes;
+ * specdata3 spec;
+ * };
+ *
+ * union mknoddata3 switch (ftype3 type) {
+ * case NF3CHR:
+ * case NF3BLK:
+ * devicedata3 device;
+ * case NF3SOCK:
+ * case NF3FIFO:
+ * sattr3 pipe_attributes;
+ * default:
+ * void;
+ * };
+ *
+ * struct MKNOD3args {
+ * diropargs3 where;
+ * mknoddata3 what;
+ * };
+ */
+/*
+ * Encode a devicedata3: the device node's attributes followed by its
+ * device number (args->rdev) as a specdata3.
+ */
+static void encode_devicedata3(struct xdr_stream *xdr,
+ const struct nfs3_mknodargs *args,
+ struct user_namespace *userns)
+{
+ encode_sattr3(xdr, args->sattr, userns);
+ encode_specdata3(xdr, args->rdev);
+}
+
+/*
+ * Encode the mknoddata3 union: the file type discriminant, then the arm
+ * selected by it.  Character and block devices carry a devicedata3,
+ * sockets and FIFOs carry only attributes, regular files and directories
+ * carry nothing.  Any other type hits BUG().
+ */
+static void encode_mknoddata3(struct xdr_stream *xdr,
+ const struct nfs3_mknodargs *args,
+ struct user_namespace *userns)
+{
+ encode_ftype3(xdr, args->type);
+ switch (args->type) {
+ case NF3CHR:
+ case NF3BLK:
+ encode_devicedata3(xdr, args, userns);
+ break;
+ case NF3SOCK:
+ case NF3FIFO:
+ encode_sattr3(xdr, args->sattr, userns);
+ break;
+ case NF3REG:
+ case NF3DIR:
+ /* void arm: only the discriminant goes on the wire */
+ break;
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Encode MKNOD3args from a struct nfs3_mknodargs passed in @data:
+ * the diropargs3 naming the new node, then the mknoddata3 union.
+ */
+static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_mknodargs *args = data;
+
+ encode_diropargs3(xdr, args->fh, args->name, args->len);
+ encode_mknoddata3(xdr, args, rpc_rqst_userns(req));
+}
+
+/*
+ * 3.3.12 REMOVE3args
+ *
+ * struct REMOVE3args {
+ * diropargs3 object;
+ * };
+ */
+/*
+ * Encode REMOVE3args from a struct nfs_removeargs passed in @data:
+ * a single diropargs3 built from the directory handle and args->name.
+ * @req is not used.
+ */
+static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_removeargs *args = data;
+
+ encode_diropargs3(xdr, args->fh, args->name.name, args->name.len);
+}
+
+/*
+ * 3.3.14 RENAME3args
+ *
+ * struct RENAME3args {
+ * diropargs3 from;
+ * diropargs3 to;
+ * };
+ */
+/*
+ * Encode RENAME3args from a struct nfs_renameargs passed in @data:
+ * the "from" diropargs3 (old directory and name) followed by the "to"
+ * diropargs3 (new directory and name).  @req is not used.
+ */
+static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_renameargs *args = data;
+ const struct qstr *old = args->old_name;
+ const struct qstr *new = args->new_name;
+
+ encode_diropargs3(xdr, args->old_dir, old->name, old->len);
+ encode_diropargs3(xdr, args->new_dir, new->name, new->len);
+}
+
+/*
+ * 3.3.15 LINK3args
+ *
+ * struct LINK3args {
+ * nfs_fh3 file;
+ * diropargs3 link;
+ * };
+ */
+/*
+ * Encode LINK3args from a struct nfs3_linkargs passed in @data:
+ * the handle of the existing file, then the diropargs3 (target directory
+ * and name) for the new link.  @req is not used.
+ */
+static void nfs3_xdr_enc_link3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_linkargs *args = data;
+
+ encode_nfs_fh3(xdr, args->fromfh);
+ encode_diropargs3(xdr, args->tofh, args->toname, args->tolen);
+}
+
+/*
+ * 3.3.16 READDIR3args
+ *
+ * struct READDIR3args {
+ * nfs_fh3 dir;
+ * cookie3 cookie;
+ * cookieverf3 cookieverf;
+ * count3 count;
+ * };
+ */
+/*
+ * Encode the body of READDIR3args: directory handle, cookie, cookie
+ * verifier and the byte count taken from args->count.
+ */
+static void encode_readdir3args(struct xdr_stream *xdr,
+ const struct nfs3_readdirargs *args)
+{
+ __be32 *p;
+
+ encode_nfs_fh3(xdr, args->fh);
+
+ /* cookie3 (8) + cookieverf3 (NFS3_COOKIEVERFSIZE) + count3 (4) */
+ p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4);
+ p = xdr_encode_cookie3(p, args->cookie);
+ p = xdr_encode_cookieverf3(p, args->verf);
+ *p = cpu_to_be32(args->count);
+}
+
+/*
+ * Encoder entry point for READDIR: encodes READDIR3args from the
+ * struct nfs3_readdirargs in @data, then registers args->pages (from
+ * offset 0, args->count bytes) with the request for the reply.
+ */
+static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_readdirargs *args = data;
+
+ encode_readdir3args(xdr, args);
+ rpc_prepare_reply_pages(req, args->pages, 0,
+ args->count, NFS3_readdirres_sz);
+}
+
+/*
+ * 3.3.17 READDIRPLUS3args
+ *
+ * struct READDIRPLUS3args {
+ * nfs_fh3 dir;
+ * cookie3 cookie;
+ * cookieverf3 cookieverf;
+ * count3 dircount;
+ * count3 maxcount;
+ * };
+ */
+/*
+ * Encode the body of READDIRPLUS3args: directory handle, cookie, cookie
+ * verifier, dircount and maxcount.  Both counts are derived from
+ * args->count.
+ */
+static void encode_readdirplus3args(struct xdr_stream *xdr,
+ const struct nfs3_readdirargs *args)
+{
+ __be32 *p;
+
+ encode_nfs_fh3(xdr, args->fh);
+
+ /* cookie3 (8) + cookieverf3 + dircount (4) + maxcount (4) */
+ p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4);
+ p = xdr_encode_cookie3(p, args->cookie);
+ p = xdr_encode_cookieverf3(p, args->verf);
+
+ /*
+ * readdirplus: need dircount + buffer size.
+ * We just make sure we make dircount big enough
+ */
+ /* dircount is sent as one eighth of args->count */
+ *p++ = cpu_to_be32(args->count >> 3);
+
+ /* maxcount is args->count itself */
+ *p = cpu_to_be32(args->count);
+}
+
+/*
+ * Encoder entry point for READDIRPLUS: encodes READDIRPLUS3args from the
+ * struct nfs3_readdirargs in @data, then registers args->pages (from
+ * offset 0, args->count bytes) with the request for the reply.  The same
+ * NFS3_readdirres_sz header size is used as for plain READDIR.
+ */
+static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_readdirargs *args = data;
+
+ encode_readdirplus3args(xdr, args);
+ rpc_prepare_reply_pages(req, args->pages, 0,
+ args->count, NFS3_readdirres_sz);
+}
+
+/*
+ * 3.3.21 COMMIT3args
+ *
+ * struct COMMIT3args {
+ * nfs_fh3 file;
+ * offset3 offset;
+ * count3 count;
+ * };
+ */
+/*
+ * Encode the body of COMMIT3args: file handle, 64-bit offset, 32-bit
+ * count.
+ */
+static void encode_commit3args(struct xdr_stream *xdr,
+ const struct nfs_commitargs *args)
+{
+ __be32 *p;
+
+ encode_nfs_fh3(xdr, args->fh);
+
+ /* 8 bytes of offset3 followed by 4 bytes of count3 */
+ p = xdr_reserve_space(xdr, 8 + 4);
+ p = xdr_encode_hyper(p, args->offset);
+ *p = cpu_to_be32(args->count);
+}
+
+/*
+ * Encoder entry point for COMMIT: casts @data to struct nfs_commitargs
+ * and defers to encode_commit3args().  @req is not used.
+ */
+static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_commitargs *args = data;
+
+ encode_commit3args(xdr, args);
+}
+
+#ifdef CONFIG_NFS_V3_ACL
+
+/*
+ * Encode the GETACL arguments from a struct nfs3_getaclargs passed in
+ * @data: file handle followed by the 32-bit mask.  Reply pages are only
+ * set up when the mask asks for an access or default ACL.
+ */
+static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_getaclargs *args = data;
+
+ encode_nfs_fh3(xdr, args->fh);
+ encode_uint32(xdr, args->mask);
+ if (args->mask & (NFS_ACL | NFS_DFACL)) {
+ /* room for up to NFSACL_MAXPAGES pages of ACL data in the reply */
+ rpc_prepare_reply_pages(req, args->pages, 0,
+ NFSACL_MAXPAGES << PAGE_SHIFT,
+ ACL3_getaclres_sz);
+ /*
+ * NOTE(review): XDRBUF_SPARSE_PAGES presumably tells the transport
+ * that entries of args->pages may be unallocated - confirm.
+ */
+ req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
+ }
+}
+
+/*
+ * Encode the SETACL arguments from a struct nfs3_setaclargs passed in
+ * @data: file handle, mask, then the access ACL and the default ACL.
+ * An ACL whose bit is not set in the mask is passed to nfsacl_encode()
+ * as NULL.  A negative return from nfsacl_encode() triggers BUG_ON();
+ * this function has no way to report an error to its caller.
+ */
+static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs3_setaclargs *args = data;
+ unsigned int base;
+ int error;
+
+ encode_nfs_fh3(xdr, NFS_FH(args->inode));
+ encode_uint32(xdr, args->mask);
+
+ /*
+ * Remember where the ACL data starts.
+ * NOTE(review): rq_slen is presumably the length of the send buffer
+ * encoded so far - confirm.
+ */
+ base = req->rq_slen;
+ /* make args->len bytes available, in pages if the caller supplied any */
+ if (args->npages != 0)
+ xdr_write_pages(xdr, args->pages, 0, args->len);
+ else
+ xdr_reserve_space(xdr, args->len);
+
+ error = nfsacl_encode(xdr->buf, base, args->inode,
+ (args->mask & NFS_ACL) ?
+ args->acl_access : NULL, 1, 0);
+ /* FIXME: this is just broken */
+ BUG_ON(error < 0);
+ /*
+ * The non-negative return value is used as the offset of the second
+ * ACL relative to base - presumably the encoded length of the first.
+ */
+ error = nfsacl_encode(xdr->buf, base + error, args->inode,
+ (args->mask & NFS_DFACL) ?
+ args->acl_default : NULL, 1,
+ NFS_ACL_DEFAULT);
+ BUG_ON(error < 0);
+}
+
+#endif /* CONFIG_NFS_V3_ACL */
+
+/*
+ * NFSv3 XDR decode functions
+ *
+ * NFSv3 result types are defined in section 3.3 of RFC 1813:
+ * "NFS Version 3 Protocol Specification".
+ */
+
+/*
+ * 3.3.1 GETATTR3res
+ *
+ * struct GETATTR3resok {
+ * fattr3 obj_attributes;
+ * };
+ *
+ * union GETATTR3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * GETATTR3resok resok;
+ * default:
+ * void;
+ * };
+ */
+/*
+ * Decode a GETATTR3res.  @result is the attribute buffer handed to
+ * decode_fattr3().  Returns 0 on success, a decode error, or the errno
+ * nfs3_stat_to_errno() yields for a status other than NFS3_OK (nothing
+ * follows the status in that case).
+ */
+static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    void *result)
+{
+	enum nfs_stat status;
+	int error = decode_nfsstat3(xdr, &status);
+
+	if (unlikely(error))
+		return error;
+	if (status != NFS3_OK)
+		return nfs3_stat_to_errno(status);
+	return decode_fattr3(xdr, result, rpc_rqst_userns(req));
+}
+
+/*
+ * 3.3.2 SETATTR3res
+ *
+ * struct SETATTR3resok {
+ * wcc_data obj_wcc;
+ * };
+ *
+ * struct SETATTR3resfail {
+ * wcc_data obj_wcc;
+ * };
+ *
+ * union SETATTR3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * SETATTR3resok resok;
+ * default:
+ * SETATTR3resfail resfail;
+ * };
+ */
+/*
+ * Decode a SETATTR3res.  The wcc_data is present for every status, so it
+ * is decoded into @result before the status is examined.  Returns 0 on
+ * success, a decode error, or the errno for a non-OK status.
+ */
+static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    void *result)
+{
+	enum nfs_stat status;
+	int error = decode_nfsstat3(xdr, &status);
+
+	if (unlikely(error))
+		return error;
+
+	error = decode_wcc_data(xdr, result, rpc_rqst_userns(req));
+	if (unlikely(error))
+		return error;
+
+	if (status != NFS3_OK)
+		return nfs3_stat_to_errno(status);
+	return 0;
+}
+
+/*
+ * 3.3.3 LOOKUP3res
+ *
+ * struct LOOKUP3resok {
+ * nfs_fh3 object;
+ * post_op_attr obj_attributes;
+ * post_op_attr dir_attributes;
+ * };
+ *
+ * struct LOOKUP3resfail {
+ * post_op_attr dir_attributes;
+ * };
+ *
+ * union LOOKUP3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * LOOKUP3resok resok;
+ * default:
+ * LOOKUP3resfail resfail;
+ * };
+ */
+/*
+ * Decode a LOOKUP3res into the struct nfs3_diropres passed in @data.
+ *
+ * On NFS3_OK the object handle, object attributes and directory
+ * attributes are decoded in that order.  On any other status only the
+ * directory attributes follow; they are decoded and the status is then
+ * converted with nfs3_stat_to_errno().  A decode error takes precedence
+ * over the status in both cases.
+ */
+static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   void *data)
+{
+	struct user_namespace *userns = rpc_rqst_userns(req);
+	struct nfs3_diropres *result = data;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		return error;
+
+	if (status != NFS3_OK) {
+		error = decode_post_op_attr(xdr, result->dir_attr, userns);
+		if (unlikely(error))
+			return error;
+		return nfs3_stat_to_errno(status);
+	}
+
+	error = decode_nfs_fh3(xdr, result->fh);
+	if (unlikely(error))
+		return error;
+
+	error = decode_post_op_attr(xdr, result->fattr, userns);
+	if (unlikely(error))
+		return error;
+
+	return decode_post_op_attr(xdr, result->dir_attr, userns);
+}
+
+/*
+ * 3.3.4 ACCESS3res
+ *
+ * struct ACCESS3resok {
+ * post_op_attr obj_attributes;
+ * uint32 access;
+ * };
+ *
+ * struct ACCESS3resfail {
+ * post_op_attr obj_attributes;
+ * };
+ *
+ * union ACCESS3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * ACCESS3resok resok;
+ * default:
+ * ACCESS3resfail resfail;
+ * };
+ */
+/*
+ * Decode an ACCESS3res into the struct nfs3_accessres passed in @data.
+ * The object attributes precede the status-dependent part, so they are
+ * decoded first; the access word is read only for NFS3_OK.  Returns 0, a
+ * decode error, or the errno for a non-OK status.
+ */
+static int nfs3_xdr_dec_access3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   void *data)
+{
+	struct nfs3_accessres *result = data;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		return error;
+
+	error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req));
+	if (unlikely(error))
+		return error;
+
+	if (status != NFS3_OK)
+		return nfs3_stat_to_errno(status);
+	return decode_uint32(xdr, &result->access);
+}
+
+/*
+ * 3.3.5 READLINK3res
+ *
+ * struct READLINK3resok {
+ * post_op_attr symlink_attributes;
+ * nfspath3 data;
+ * };
+ *
+ * struct READLINK3resfail {
+ * post_op_attr symlink_attributes;
+ * };
+ *
+ * union READLINK3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * READLINK3resok resok;
+ * default:
+ * READLINK3resfail resfail;
+ * };
+ */
+/*
+ * Decode a READLINK3res.  @result is the attribute buffer for the
+ * symlink's post-op attributes, which are decoded for every status; the
+ * path itself is decoded only for NFS3_OK.  Returns 0, a decode error, or
+ * the errno for a non-OK status.
+ */
+static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     void *result)
+{
+	enum nfs_stat status;
+	int error = decode_nfsstat3(xdr, &status);
+
+	if (unlikely(error))
+		return error;
+
+	error = decode_post_op_attr(xdr, result, rpc_rqst_userns(req));
+	if (unlikely(error))
+		return error;
+
+	if (status != NFS3_OK)
+		return nfs3_stat_to_errno(status);
+	return decode_nfspath3(xdr);
+}
+
+/*
+ * 3.3.6 READ3res
+ *
+ * struct READ3resok {
+ * post_op_attr file_attributes;
+ * count3 count;
+ * bool eof;
+ * opaque data<>;
+ * };
+ *
+ * struct READ3resfail {
+ * post_op_attr file_attributes;
+ * };
+ *
+ * union READ3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * READ3resok resok;
+ * default:
+ * READ3resfail resfail;
+ * };
+ */
+/*
+ * Decode the tail of a READ3resok: count, eof flag and the length word of
+ * the opaque data, then pull the data into the reply pages.
+ *
+ * Returns the number of bytes accepted (also stored in result->count), or
+ * -EIO if the header is truncated or count and the opaque length
+ * disagree.  If fewer bytes arrived than the server claimed, the count is
+ * clamped to what was received and eof is forced to 0.
+ */
+static int decode_read3resok(struct xdr_stream *xdr,
+			     struct nfs_pgio_res *result)
+{
+	u32 eof, count, ocount, recvd;
+	__be32 *cur;
+
+	cur = xdr_inline_decode(xdr, 4 + 4 + 4);
+	if (unlikely(cur == NULL))
+		return -EIO;
+
+	count = be32_to_cpup(cur++);
+	eof = be32_to_cpup(cur++);
+	ocount = be32_to_cpup(cur++);
+	if (unlikely(ocount != count)) {
+		dprintk("NFS: READ count doesn't match length of opaque: "
+			"count %u != ocount %u\n", count, ocount);
+		return -EIO;
+	}
+
+	recvd = xdr_read_pages(xdr, count);
+	if (unlikely(count > recvd)) {
+		dprintk("NFS: server cheating in read result: "
+			"count %u > recvd %u\n", count, recvd);
+		count = recvd;
+		eof = 0;
+	}
+
+	result->eof = eof;
+	result->count = count;
+	return count;
+}
+
+/*
+ * Decode a READ3res into the struct nfs_pgio_res passed in @data.
+ *
+ * The file attributes are decoded for every status and the raw status is
+ * recorded in result->op_status.  For NFS3_OK, result->replen is set from
+ * the stream distance consumed so far (in 4-byte words, plus 4) before
+ * the data portion is decoded.  Returns the byte count from
+ * decode_read3resok(), a decode error, or the errno for a non-OK status.
+ */
+static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 void *data)
+{
+	struct nfs_pgio_res *result = data;
+	unsigned int start = xdr_stream_pos(xdr);
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		return error;
+
+	error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req));
+	if (unlikely(error))
+		return error;
+
+	result->op_status = status;
+	if (status != NFS3_OK)
+		return nfs3_stat_to_errno(status);
+
+	result->replen = 4 + ((xdr_stream_pos(xdr) - start) >> 2);
+	return decode_read3resok(xdr, result);
+}
+
+/*
+ * 3.3.7 WRITE3res
+ *
+ * enum stable_how {
+ * UNSTABLE = 0,
+ * DATA_SYNC = 1,
+ * FILE_SYNC = 2
+ * };
+ *
+ * struct WRITE3resok {
+ * wcc_data file_wcc;
+ * count3 count;
+ * stable_how committed;
+ * writeverf3 verf;
+ * };
+ *
+ * struct WRITE3resfail {
+ * wcc_data file_wcc;
+ * };
+ *
+ * union WRITE3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * WRITE3resok resok;
+ * default:
+ * WRITE3resfail resfail;
+ * };
+ */
+/*
+ * Decode the tail of a WRITE3resok: count, committed level and the write
+ * verifier.  Returns the byte count (also stored in result->count), or
+ * -EIO if the stream is truncated, the committed value is above
+ * NFS_FILE_SYNC, or the verifier cannot be decoded.
+ */
+static int decode_write3resok(struct xdr_stream *xdr,
+			      struct nfs_pgio_res *result)
+{
+	__be32 *cur = xdr_inline_decode(xdr, 4 + 4);
+
+	if (unlikely(cur == NULL))
+		return -EIO;
+
+	result->count = be32_to_cpup(cur++);
+	result->verf->committed = be32_to_cpup(cur);
+	if (unlikely(result->verf->committed > NFS_FILE_SYNC)) {
+		dprintk("NFS: bad stable_how value: %u\n",
+			result->verf->committed);
+		return -EIO;
+	}
+
+	if (decode_writeverf3(xdr, &result->verf->verifier))
+		return -EIO;
+	return result->count;
+}
+
+/*
+ * Decode a WRITE3res into the struct nfs_pgio_res passed in @data.
+ * The wcc_data is decoded for every status and the raw status is stored
+ * in result->op_status.  Returns the byte count from
+ * decode_write3resok(), a decode error, or the errno for a non-OK status.
+ */
+static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  void *data)
+{
+	struct nfs_pgio_res *result = data;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		return error;
+
+	error = decode_wcc_data(xdr, result->fattr, rpc_rqst_userns(req));
+	if (unlikely(error))
+		return error;
+
+	result->op_status = status;
+	if (status != NFS3_OK)
+		return nfs3_stat_to_errno(status);
+	return decode_write3resok(xdr, result);
+}
+
+/*
+ * 3.3.8 CREATE3res
+ *
+ * struct CREATE3resok {
+ * post_op_fh3 obj;
+ * post_op_attr obj_attributes;
+ * wcc_data dir_wcc;
+ * };
+ *
+ * struct CREATE3resfail {
+ * wcc_data dir_wcc;
+ * };
+ *
+ * union CREATE3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * CREATE3resok resok;
+ * default:
+ * CREATE3resfail resfail;
+ * };
+ */
+/*
+ * Decode a CREATE3resok: optional file handle, optional attributes of the
+ * new object, then the directory's wcc_data.  Returns 0 or the first
+ * decode error.
+ */
+static int decode_create3resok(struct xdr_stream *xdr,
+			       struct nfs3_diropres *result,
+			       struct user_namespace *userns)
+{
+	int error;
+
+	error = decode_post_op_fh3(xdr, result->fh);
+	if (unlikely(error))
+		return error;
+
+	error = decode_post_op_attr(xdr, result->fattr, userns);
+	if (unlikely(error))
+		return error;
+
+	/*
+	 * A server may omit the file handle.  Without one the attributes
+	 * are discarded too, so that the client has to issue a LOOKUP to
+	 * learn the handle and attributes of the new object.
+	 */
+	if (result->fh->size == 0)
+		result->fattr->valid = 0;
+
+	return decode_wcc_data(xdr, result->dir_attr, userns);
+}
+
+/*
+ * Decode a CREATE3res into the struct nfs3_diropres passed in @data.
+ * On NFS3_OK the resok body is decoded.  On any other status only the
+ * directory wcc_data follows; it is decoded and the status is converted
+ * with nfs3_stat_to_errno().  A decode error takes precedence over the
+ * status.
+ */
+static int nfs3_xdr_dec_create3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   void *data)
+{
+	struct user_namespace *userns = rpc_rqst_userns(req);
+	struct nfs3_diropres *result = data;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		return error;
+
+	if (status == NFS3_OK)
+		return decode_create3resok(xdr, result, userns);
+
+	error = decode_wcc_data(xdr, result->dir_attr, userns);
+	if (unlikely(error))
+		return error;
+	return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.12 REMOVE3res
+ *
+ * struct REMOVE3resok {
+ * wcc_data dir_wcc;
+ * };
+ *
+ * struct REMOVE3resfail {
+ * wcc_data dir_wcc;
+ * };
+ *
+ * union REMOVE3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * REMOVE3resok resok;
+ * default:
+ * REMOVE3resfail resfail;
+ * };
+ */
+/*
+ * Decode a REMOVE3res into the struct nfs_removeres passed in @data.
+ * The directory wcc_data is present for every status and is decoded
+ * before the status is examined.  Returns 0, a decode error, or the errno
+ * for a non-OK status.
+ */
+static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   void *data)
+{
+	struct nfs_removeres *result = data;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		return error;
+
+	error = decode_wcc_data(xdr, result->dir_attr, rpc_rqst_userns(req));
+	if (unlikely(error))
+		return error;
+
+	if (status != NFS3_OK)
+		return nfs3_stat_to_errno(status);
+	return 0;
+}
+
+/*
+ * 3.3.14 RENAME3res
+ *
+ * struct RENAME3resok {
+ * wcc_data fromdir_wcc;
+ * wcc_data todir_wcc;
+ * };
+ *
+ * struct RENAME3resfail {
+ * wcc_data fromdir_wcc;
+ * wcc_data todir_wcc;
+ * };
+ *
+ * union RENAME3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * RENAME3resok resok;
+ * default:
+ * RENAME3resfail resfail;
+ * };
+ */
+/*
+ * Decode a RENAME3res into the struct nfs_renameres passed in @data.
+ * Both wcc_data items (source directory, then target directory) are
+ * present for every status and are decoded before the status is
+ * examined.  Returns 0, a decode error, or the errno for a non-OK status.
+ */
+static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   void *data)
+{
+	struct user_namespace *userns = rpc_rqst_userns(req);
+	struct nfs_renameres *result = data;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		return error;
+
+	error = decode_wcc_data(xdr, result->old_fattr, userns);
+	if (unlikely(error))
+		return error;
+
+	error = decode_wcc_data(xdr, result->new_fattr, userns);
+	if (unlikely(error))
+		return error;
+
+	if (status != NFS3_OK)
+		return nfs3_stat_to_errno(status);
+	return 0;
+}
+
+/*
+ * 3.3.15 LINK3res
+ *
+ * struct LINK3resok {
+ * post_op_attr file_attributes;
+ * wcc_data linkdir_wcc;
+ * };
+ *
+ * struct LINK3resfail {
+ * post_op_attr file_attributes;
+ * wcc_data linkdir_wcc;
+ * };
+ *
+ * union LINK3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * LINK3resok resok;
+ * default:
+ * LINK3resfail resfail;
+ * };
+ */
+/*
+ * Decode a LINK3res into the struct nfs3_linkres passed in @data.
+ * The file's post-op attributes and the link directory's wcc_data are
+ * present for every status and are decoded before the status is
+ * examined.  Returns 0, a decode error, or the errno for a non-OK status.
+ */
+static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 void *data)
+{
+	struct user_namespace *userns = rpc_rqst_userns(req);
+	struct nfs3_linkres *result = data;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		return error;
+
+	error = decode_post_op_attr(xdr, result->fattr, userns);
+	if (unlikely(error))
+		return error;
+
+	error = decode_wcc_data(xdr, result->dir_attr, userns);
+	if (unlikely(error))
+		return error;
+
+	if (status != NFS3_OK)
+		return nfs3_stat_to_errno(status);
+	return 0;
+}
+
+/**
+ * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in
+ * the local page cache
+ * @xdr: XDR stream where entry resides
+ * @entry: buffer to fill in with entry data
+ * @plus: boolean indicating whether this should be a readdirplus entry
+ *
+ * Returns zero if successful, otherwise a negative errno value is
+ * returned:
+ *   -EAGAIN       the stream ran out while decoding, or the end of the
+ *                 entry list was reached with the eof flag clear
+ *   -EBADCOOKIE   the end of the entry list was reached with the eof
+ *                 flag set; @entry->eof is set to 1
+ *   -ENAMETOOLONG passed through from decoding the entry name
+ *
+ * @entry->cookie and @entry->prev_cookie are only updated on success.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ *
+ * 3.3.16 entry3
+ *
+ * struct entry3 {
+ * fileid3 fileid;
+ * filename3 name;
+ * cookie3 cookie;
+ * entry3 *nextentry;
+ * };
+ *
+ * 3.3.17 entryplus3
+ * struct entryplus3 {
+ * fileid3 fileid;
+ * filename3 name;
+ * cookie3 cookie;
+ * post_op_attr name_attributes;
+ * post_op_fh3 name_handle;
+ * entryplus3 *nextentry;
+ * };
+ */
+int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+ bool plus)
+{
+ struct user_namespace *userns = rpc_userns(entry->server->client);
+ __be32 *p;
+ int error;
+ u64 new_cookie;
+
+ /* "value follows" flag of the entry list */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EAGAIN;
+ if (*p == xdr_zero) {
+ /* no more entries: the next word is the dirlist eof flag */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EAGAIN;
+ if (*p == xdr_zero)
+ return -EAGAIN;
+ entry->eof = 1;
+ return -EBADCOOKIE;
+ }
+
+ error = decode_fileid3(xdr, &entry->ino);
+ if (unlikely(error))
+ return -EAGAIN;
+
+ error = decode_inline_filename3(xdr, &entry->name, &entry->len);
+ if (unlikely(error))
+ /* only -ENAMETOOLONG is reported as-is; anything else is -EAGAIN */
+ return error == -ENAMETOOLONG ? -ENAMETOOLONG : -EAGAIN;
+
+ /* held in a local so entry->cookie stays untouched on later failure */
+ error = decode_cookie3(xdr, &new_cookie);
+ if (unlikely(error))
+ return -EAGAIN;
+
+ entry->d_type = DT_UNKNOWN;
+
+ if (plus) {
+ entry->fattr->valid = 0;
+ error = decode_post_op_attr(xdr, entry->fattr, userns);
+ if (unlikely(error))
+ return -EAGAIN;
+ /* a file type is only known when attributes were returned */
+ if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
+ entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
+
+ /*
+ * The fileid in the attributes differs from the one in the
+ * entry: record the entry's fileid as mounted_on_fileid.
+ */
+ if (entry->fattr->fileid != entry->ino) {
+ entry->fattr->mounted_on_fileid = entry->ino;
+ entry->fattr->valid |= NFS_ATTR_FATTR_MOUNTED_ON_FILEID;
+ }
+
+ /* In fact, a post_op_fh3: */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EAGAIN;
+ if (*p != xdr_zero) {
+ error = decode_nfs_fh3(xdr, entry->fh);
+ if (unlikely(error))
+ return -EAGAIN;
+ } else
+ zero_nfs_fh3(entry->fh);
+ }
+
+ /* commit the cookies only after the whole entry decoded cleanly */
+ entry->prev_cookie = entry->cookie;
+ entry->cookie = new_cookie;
+
+ return 0;
+}
+
+/*
+ * 3.3.16 READDIR3res
+ *
+ * struct dirlist3 {
+ * entry3 *entries;
+ * bool eof;
+ * };
+ *
+ * struct READDIR3resok {
+ * post_op_attr dir_attributes;
+ * cookieverf3 cookieverf;
+ * dirlist3 reply;
+ * };
+ *
+ * struct READDIR3resfail {
+ * post_op_attr dir_attributes;
+ * };
+ *
+ * union READDIR3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * READDIR3resok resok;
+ * default:
+ * READDIR3resfail resfail;
+ * };
+ *
+ * Read the directory contents into the page cache, but otherwise
+ * don't touch them. The actual decoding is done by nfs3_decode_dirent()
+ * during subsequent nfs_readdir() calls.
+ */
+/*
+ * Hand the page portion of the reply (xdr->buf->page_len bytes) to
+ * xdr_read_pages() without interpreting it, and return that call's
+ * result.
+ */
+static int decode_dirlist3(struct xdr_stream *xdr)
+{
+ return xdr_read_pages(xdr, xdr->buf->page_len);
+}
+
+/*
+ * Decode a READDIR3resok: directory attributes, cookie verifier (stored
+ * through result->verf), then the entry list via decode_dirlist3().
+ * Returns the first decode error, otherwise whatever decode_dirlist3()
+ * returns.
+ */
+static int decode_readdir3resok(struct xdr_stream *xdr,
+				struct nfs3_readdirres *result,
+				struct user_namespace *userns)
+{
+	int error;
+
+	error = decode_post_op_attr(xdr, result->dir_attr, userns);
+	if (unlikely(error))
+		return error;
+
+	/* XXX: do we need to check if result->verf != NULL ? */
+	error = decode_cookieverf3(xdr, result->verf);
+	if (unlikely(error))
+		return error;
+
+	return decode_dirlist3(xdr);
+}
+
+/*
+ * Decode a READDIR3res into the struct nfs3_readdirres passed in @data.
+ * On NFS3_OK the resok body is decoded.  On any other status only the
+ * directory attributes follow; they are decoded and the status is
+ * converted with nfs3_stat_to_errno().  A decode error takes precedence
+ * over the status.
+ */
+static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    void *data)
+{
+	struct nfs3_readdirres *result = data;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		return error;
+
+	if (status == NFS3_OK)
+		return decode_readdir3resok(xdr, result, rpc_rqst_userns(req));
+
+	error = decode_post_op_attr(xdr, result->dir_attr, rpc_rqst_userns(req));
+	if (unlikely(error))
+		return error;
+	return nfs3_stat_to_errno(status);
+}
+
+/*
+ * 3.3.18 FSSTAT3res
+ *
+ * struct FSSTAT3resok {
+ * post_op_attr obj_attributes;
+ * size3 tbytes;
+ * size3 fbytes;
+ * size3 abytes;
+ * size3 tfiles;
+ * size3 ffiles;
+ * size3 afiles;
+ * uint32 invarsec;
+ * };
+ *
+ * struct FSSTAT3resfail {
+ * post_op_attr obj_attributes;
+ * };
+ *
+ * union FSSTAT3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * FSSTAT3resok resok;
+ * default:
+ * FSSTAT3resfail resfail;
+ * };
+ */
+static int decode_fsstat3resok(struct xdr_stream *xdr,
+			       struct nfs_fsstat *result)
+{
+	__be32 *cursor;
+
+	/* six 8-byte size3 counters followed by the 4-byte invarsec word */
+	cursor = xdr_inline_decode(xdr, 8 * 6 + 4);
+	if (unlikely(!cursor))
+		return -EIO;
+
+	/* byte counters ... */
+	cursor = xdr_decode_size3(cursor, &result->tbytes);
+	cursor = xdr_decode_size3(cursor, &result->fbytes);
+	cursor = xdr_decode_size3(cursor, &result->abytes);
+	/* ... then file counters */
+	cursor = xdr_decode_size3(cursor, &result->tfiles);
+	cursor = xdr_decode_size3(cursor, &result->ffiles);
+	xdr_decode_size3(cursor, &result->afiles);
+
+	/* ignore invarsec */
+	return 0;
+}
+
+/*
+ * Decode an FSSTAT3res. The post-op attributes are present in both arms
+ * of the union, so they are decoded before the status is acted upon.
+ */
+static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   void *data)
+{
+	struct nfs_fsstat *result = data;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		return error;
+
+	error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req));
+	if (unlikely(error))
+		return error;
+
+	if (status != NFS3_OK)
+		return nfs3_stat_to_errno(status);
+
+	return decode_fsstat3resok(xdr, result);
+}
+
+/*
+ * 3.3.19 FSINFO3res
+ *
+ * struct FSINFO3resok {
+ * post_op_attr obj_attributes;
+ * uint32 rtmax;
+ * uint32 rtpref;
+ * uint32 rtmult;
+ * uint32 wtmax;
+ * uint32 wtpref;
+ * uint32 wtmult;
+ * uint32 dtpref;
+ * size3 maxfilesize;
+ * nfstime3 time_delta;
+ * uint32 properties;
+ * };
+ *
+ * struct FSINFO3resfail {
+ * post_op_attr obj_attributes;
+ * };
+ *
+ * union FSINFO3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * FSINFO3resok resok;
+ * default:
+ * FSINFO3resfail resfail;
+ * };
+ */
+static int decode_fsinfo3resok(struct xdr_stream *xdr,
+			       struct nfs_fsinfo *result)
+{
+	__be32 *cursor;
+
+	/* seven uint32 words, maxfilesize, time_delta, then properties */
+	cursor = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4);
+	if (unlikely(!cursor))
+		return -EIO;
+
+	result->rtmax = be32_to_cpu(*cursor++);
+	result->rtpref = be32_to_cpu(*cursor++);
+	result->rtmult = be32_to_cpu(*cursor++);
+	result->wtmax = be32_to_cpu(*cursor++);
+	result->wtpref = be32_to_cpu(*cursor++);
+	result->wtmult = be32_to_cpu(*cursor++);
+	result->dtpref = be32_to_cpu(*cursor++);
+
+	cursor = xdr_decode_size3(cursor, &result->maxfilesize);
+	xdr_decode_nfstime3(cursor, &result->time_delta);
+
+	/* ignore properties */
+	result->lease_time = 0;
+	return 0;
+}
+
+/*
+ * Decode an FSINFO3res. The post-op attributes are present in both arms
+ * of the union, so they are decoded before the status is acted upon.
+ */
+static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   void *data)
+{
+	struct nfs_fsinfo *result = data;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		return error;
+
+	error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req));
+	if (unlikely(error))
+		return error;
+
+	if (status != NFS3_OK)
+		return nfs3_stat_to_errno(status);
+
+	return decode_fsinfo3resok(xdr, result);
+}
+
+/*
+ * 3.3.20 PATHCONF3res
+ *
+ * struct PATHCONF3resok {
+ * post_op_attr obj_attributes;
+ * uint32 linkmax;
+ * uint32 name_max;
+ * bool no_trunc;
+ * bool chown_restricted;
+ * bool case_insensitive;
+ * bool case_preserving;
+ * };
+ *
+ * struct PATHCONF3resfail {
+ * post_op_attr obj_attributes;
+ * };
+ *
+ * union PATHCONF3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * PATHCONF3resok resok;
+ * default:
+ * PATHCONF3resfail resfail;
+ * };
+ */
+static int decode_pathconf3resok(struct xdr_stream *xdr,
+				 struct nfs_pathconf *result)
+{
+	__be32 *words;
+
+	/* all six 4-byte fields are consumed even though only two are kept */
+	words = xdr_inline_decode(xdr, 4 * 6);
+	if (unlikely(!words))
+		return -EIO;
+
+	result->max_link = be32_to_cpu(words[0]);
+	result->max_namelen = be32_to_cpu(words[1]);
+	/* ignore remaining fields */
+	return 0;
+}
+
+/*
+ * Decode a PATHCONF3res. The post-op attributes are present in both arms
+ * of the union, so they are decoded before the status is acted upon.
+ */
+static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     void *data)
+{
+	struct nfs_pathconf *result = data;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		return error;
+
+	error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req));
+	if (unlikely(error))
+		return error;
+
+	if (status != NFS3_OK)
+		return nfs3_stat_to_errno(status);
+
+	return decode_pathconf3resok(xdr, result);
+}
+
+/*
+ * 3.3.21 COMMIT3res
+ *
+ * struct COMMIT3resok {
+ * wcc_data file_wcc;
+ * writeverf3 verf;
+ * };
+ *
+ * struct COMMIT3resfail {
+ * wcc_data file_wcc;
+ * };
+ *
+ * union COMMIT3res switch (nfsstat3 status) {
+ * case NFS3_OK:
+ * COMMIT3resok resok;
+ * default:
+ * COMMIT3resfail resfail;
+ * };
+ */
+static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   void *data)
+{
+	struct nfs_commitres *result = data;
+	struct nfs_writeverf *wverf = result->verf;
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		return error;
+
+	/* wcc_data is present in both the resok and the resfail arm */
+	error = decode_wcc_data(xdr, result->fattr, rpc_rqst_userns(req));
+	if (unlikely(error))
+		return error;
+
+	/* the raw status is recorded before it is turned into an errno */
+	result->op_status = status;
+	if (status != NFS3_OK)
+		return nfs3_stat_to_errno(status);
+
+	error = decode_writeverf3(xdr, &wverf->verifier);
+	if (error)
+		return error;
+
+	wverf->committed = NFS_FILE_SYNC;
+	return 0;
+}
+
+#ifdef CONFIG_NFS_V3_ACL
+
+/*
+ * Decode the body of a successful GETACL reply: post-op attributes, the
+ * mask word, then the access ACL followed by the default ACL. For each
+ * ACL, the count and/or the ACL itself is only stored when the matching
+ * bit is set in the returned mask; otherwise NULL is passed to the decoder.
+ *
+ * Returns zero on success, -EINVAL if the mask carries unknown bits, or
+ * whatever non-positive value a sub-decoder returned.
+ */
+static inline int decode_getacl3resok(struct xdr_stream *xdr,
+				      struct nfs3_getaclres *result,
+				      struct user_namespace *userns)
+{
+	size_t hdrlen;
+	int error;
+
+	error = decode_post_op_attr(xdr, result->fattr, userns);
+	if (unlikely(error))
+		return error;
+
+	error = decode_uint32(xdr, &result->mask);
+	if (unlikely(error))
+		return error;
+
+	if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
+		return -EINVAL;
+
+	/* the ACLs are decoded straight from the buffer, starting here */
+	hdrlen = xdr_stream_pos(xdr);
+
+	error = nfsacl_decode(xdr->buf, hdrlen,
+			      (result->mask & NFS_ACLCNT) ?
+					&result->acl_access_count : NULL,
+			      (result->mask & NFS_ACL) ?
+					&result->acl_access : NULL);
+	if (unlikely(error <= 0))
+		return error;
+
+	/* a positive return value is added to the offset for the second ACL */
+	error = nfsacl_decode(xdr->buf, hdrlen + error,
+			      (result->mask & NFS_DFACLCNT) ?
+					&result->acl_default_count : NULL,
+			      (result->mask & NFS_DFACL) ?
+					&result->acl_default : NULL);
+	if (unlikely(error <= 0))
+		return error;
+
+	return 0;
+}
+
+/*
+ * Decode a GETACL reply: on NFS3_OK the resok body is decoded, any other
+ * status is returned as the errno produced by nfs3_stat_to_errno().
+ */
+static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   void *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		return error;
+
+	if (status != NFS3_OK)
+		return nfs3_stat_to_errno(status);
+
+	return decode_getacl3resok(xdr, result, rpc_rqst_userns(req));
+}
+
+/*
+ * Decode a SETACL reply: on NFS3_OK the post-op attributes are decoded
+ * into @result, any other status is returned as the errno produced by
+ * nfs3_stat_to_errno().
+ */
+static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   void *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_nfsstat3(xdr, &status);
+	if (unlikely(error))
+		return error;
+
+	if (status != NFS3_OK)
+		return nfs3_stat_to_errno(status);
+
+	return decode_post_op_attr(xdr, result, rpc_rqst_userns(req));
+}
+
+#endif /* CONFIG_NFS_V3_ACL */
+
+
+/*
+ * We need to translate between nfs status return values and
+ * the local errno values which may not be the same.
+ *
+ * Each entry pairs an NFS status code (.stat) with the negative local
+ * errno (.errno) handed back for it. The final { -1, -EIO } entry is the
+ * end-of-table sentinel: lookups stop at .stat == -1 and its .errno is
+ * the fallback for any status code that is not listed.
+ */
+static const struct {
+ int stat;
+ int errno;
+} nfs_errtbl[] = {
+ { NFS_OK, 0 },
+ { NFSERR_PERM, -EPERM },
+ { NFSERR_NOENT, -ENOENT },
+ { NFSERR_IO, -errno_NFSERR_IO},
+ { NFSERR_NXIO, -ENXIO },
+/* { NFSERR_EAGAIN, -EAGAIN }, */
+ { NFSERR_ACCES, -EACCES },
+ { NFSERR_EXIST, -EEXIST },
+ { NFSERR_XDEV, -EXDEV },
+ { NFSERR_NODEV, -ENODEV },
+ { NFSERR_NOTDIR, -ENOTDIR },
+ { NFSERR_ISDIR, -EISDIR },
+ { NFSERR_INVAL, -EINVAL },
+ { NFSERR_FBIG, -EFBIG },
+ { NFSERR_NOSPC, -ENOSPC },
+ { NFSERR_ROFS, -EROFS },
+ { NFSERR_MLINK, -EMLINK },
+ { NFSERR_NAMETOOLONG, -ENAMETOOLONG },
+ { NFSERR_NOTEMPTY, -ENOTEMPTY },
+ { NFSERR_DQUOT, -EDQUOT },
+ { NFSERR_STALE, -ESTALE },
+ { NFSERR_REMOTE, -EREMOTE },
+#ifdef EWFLUSH
+ { NFSERR_WFLUSH, -EWFLUSH },
+#endif
+ { NFSERR_BADHANDLE, -EBADHANDLE },
+ { NFSERR_NOT_SYNC, -ENOTSYNC },
+ { NFSERR_BAD_COOKIE, -EBADCOOKIE },
+ { NFSERR_NOTSUPP, -ENOTSUPP },
+ { NFSERR_TOOSMALL, -ETOOSMALL },
+ { NFSERR_SERVERFAULT, -EREMOTEIO },
+ { NFSERR_BADTYPE, -EBADTYPE },
+ { NFSERR_JUKEBOX, -EJUKEBOX },
+ { -1, -EIO } /* sentinel; must stay last */
+};
+
+/**
+ * nfs3_stat_to_errno - convert an NFS status code to a local errno
+ * @status: NFS status code to convert
+ *
+ * Walks nfs_errtbl[] up to its -1 sentinel looking for @status.
+ *
+ * Returns the matching local errno value, or the sentinel's -EIO (after
+ * emitting a debug message) if the NFS status code is not recognized.
+ * This function is used jointly by NFSv2 and NFSv3.
+ */
+static int nfs3_stat_to_errno(enum nfs_stat status)
+{
+	int idx = 0;
+
+	while (nfs_errtbl[idx].stat != -1) {
+		if (nfs_errtbl[idx].stat == (int)status)
+			return nfs_errtbl[idx].errno;
+		idx++;
+	}
+
+	/* idx now addresses the sentinel entry */
+	dprintk("NFS: Unrecognized nfs status value: %u\n", status);
+	return nfs_errtbl[idx].errno;
+}
+
+
+/*
+ * Build the nfs3_procedures[] slot for one NFSv3 procedure. @proc picks
+ * the NFS3PROC_* index (also used for .p_proc and .p_statidx) and the
+ * printable name; @argtype and @restype pick the nfs3_xdr_enc_*3args
+ * encoder and nfs3_xdr_dec_*3res decoder together with their NFS3_*_sz
+ * size constants; @timer is stored in .p_timer.
+ */
+#define PROC(proc, argtype, restype, timer)				\
+[NFS3PROC_##proc] = {							\
+	.p_name = #proc,						\
+	.p_proc = NFS3PROC_##proc,					\
+	.p_statidx = NFS3PROC_##proc,					\
+	.p_timer = timer,						\
+	.p_arglen = NFS3_##argtype##args_sz,				\
+	.p_replen = NFS3_##restype##res_sz,				\
+	.p_encode = nfs3_xdr_enc_##argtype##3args,			\
+	.p_decode = nfs3_xdr_dec_##restype##3res,			\
+	}
+
+const struct rpc_procinfo nfs3_procedures[] = { /* indexed by NFS3PROC_*; columns: proc, arg encoder, result decoder, timer */
+ PROC(GETATTR, getattr, getattr, 1),
+ PROC(SETATTR, setattr, setattr, 0),
+ PROC(LOOKUP, lookup, lookup, 2),
+ PROC(ACCESS, access, access, 1),
+ PROC(READLINK, readlink, readlink, 3),
+ PROC(READ, read, read, 3),
+ PROC(WRITE, write, write, 4),
+ PROC(CREATE, create, create, 0),
+ PROC(MKDIR, mkdir, create, 0), /* MKDIR, SYMLINK and MKNOD share the create result decoder */
+ PROC(SYMLINK, symlink, create, 0),
+ PROC(MKNOD, mknod, create, 0),
+ PROC(REMOVE, remove, remove, 0),
+ PROC(RMDIR, lookup, setattr, 0), /* intentional reuse: lookup arg encoder, setattr result decoder */
+ PROC(RENAME, rename, rename, 0),
+ PROC(LINK, link, link, 0),
+ PROC(READDIR, readdir, readdir, 3),
+ PROC(READDIRPLUS, readdirplus, readdir, 3), /* same result decoder as READDIR */
+ PROC(FSSTAT, getattr, fsstat, 0), /* FSSTAT, FSINFO and PATHCONF reuse the getattr arg encoder */
+ PROC(FSINFO, getattr, fsinfo, 0),
+ PROC(PATHCONF, getattr, pathconf, 0),
+ PROC(COMMIT, commit, commit, 5),
+};
+
+static unsigned int nfs_version3_counts[ARRAY_SIZE(nfs3_procedures)]; /* one counter slot per nfs3_procedures[] entry */
+const struct rpc_version nfs_version3 = { /* RPC version descriptor for NFS version 3 */
+ .number = 3,
+ .nrprocs = ARRAY_SIZE(nfs3_procedures),
+ .procs = nfs3_procedures,
+ .counts = nfs_version3_counts, /* presumably per-procedure call statistics -- confirm in sunrpc */
+};
+
+#ifdef CONFIG_NFS_V3_ACL
+static const struct rpc_procinfo nfs3_acl_procedures[] = { /* ACL side-protocol procedures, indexed by ACLPROC3_* */
+ [ACLPROC3_GETACL] = {
+ .p_proc = ACLPROC3_GETACL,
+ .p_encode = nfs3_xdr_enc_getacl3args,
+ .p_decode = nfs3_xdr_dec_getacl3res,
+ .p_arglen = ACL3_getaclargs_sz,
+ .p_replen = ACL3_getaclres_sz,
+ .p_timer = 1,
+ .p_name = "GETACL",
+ },
+ [ACLPROC3_SETACL] = {
+ .p_proc = ACLPROC3_SETACL,
+ .p_encode = nfs3_xdr_enc_setacl3args,
+ .p_decode = nfs3_xdr_dec_setacl3res,
+ .p_arglen = ACL3_setaclargs_sz,
+ .p_replen = ACL3_setaclres_sz,
+ .p_timer = 0,
+ .p_name = "SETACL",
+ },
+}; /* NOTE(review): unlike PROC() entries, no .p_statidx is set here -- confirm that is intended */
+
+static unsigned int nfs3_acl_counts[ARRAY_SIZE(nfs3_acl_procedures)]; /* one counter slot per nfs3_acl_procedures[] entry */
+const struct rpc_version nfsacl_version3 = { /* RPC version descriptor for the version 3 ACL procedures */
+ .number = 3,
+ .nrprocs = ARRAY_SIZE(nfs3_acl_procedures),
+ .procs = nfs3_acl_procedures,
+ .counts = nfs3_acl_counts,
+};
+#endif /* CONFIG_NFS_V3_ACL */
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
new file mode 100644
index 000000000..0fe5aacbc
--- /dev/null
+++ b/fs/nfs/nfs42.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
+ */
+
+#ifndef __LINUX_FS_NFS_NFS4_2_H
+#define __LINUX_FS_NFS_NFS4_2_H
+
+#include <linux/xattr.h>
+
+/*
+ * FIXME: four LAYOUTSTATS calls per compound at most! Do we need to support
+ * more? Need to consider not to pre-alloc too much for a compound.
+ */
+#define PNFS_LAYOUTSTATS_MAXDEV (4)
+
+/* nfs4.2proc.c */
+#ifdef CONFIG_NFS_V4_2
+int nfs42_proc_allocate(struct file *, loff_t, loff_t);
+ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t,
+ struct nl4_server *, nfs4_stateid *, bool);
+int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
+loff_t nfs42_proc_llseek(struct file *, loff_t, int);
+int nfs42_proc_layoutstats_generic(struct nfs_server *,
+ struct nfs42_layoutstat_data *);
+int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t);
+int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
+ const struct nfs42_layout_error *errors,
+ size_t n);
+int nfs42_proc_copy_notify(struct file *, struct file *,
+ struct nfs42_copy_notify_res *);
+/*
+ * Report whether @in and @out are served by the same server, judged by
+ * comparing the server-owner of each file's nfs_client with
+ * nfs4_check_serverowner_major_id().
+ */
+static inline bool nfs42_files_from_same_server(struct file *in,
+						struct file *out)
+{
+	struct nfs_server *srv_in = NFS_SERVER(file_inode(in));
+	struct nfs_server *srv_out = NFS_SERVER(file_inode(out));
+
+	return nfs4_check_serverowner_major_id(
+				srv_in->nfs_client->cl_serverowner,
+				srv_out->nfs_client->cl_serverowner);
+}
+
+ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name,
+ void *buf, size_t buflen);
+int nfs42_proc_setxattr(struct inode *inode, const char *name,
+ const void *buf, size_t buflen, int flags);
+ssize_t nfs42_proc_listxattrs(struct inode *inode, void *buf,
+ size_t buflen, u64 *cookiep, bool *eofp);
+int nfs42_proc_removexattr(struct inode *inode, const char *name);
+
+/*
+ * Maximum XDR buffer size needed for a listxattr buffer of buflen size.
+ *
+ * The upper boundary is a buffer with all 1-byte sized attribute names.
+ * They would be 7 bytes long in the eventual buffer ("user.x\0"), and
+ * 8 bytes long XDR-encoded.
+ *
+ * Include the trailing eof word as well.
+ */
+static inline u32 nfs42_listxattr_xdrsize(u32 buflen)
+{
+	/* most names that fit: each needs the "user." prefix + 1 char + NUL */
+	u32 max_names = buflen / (XATTR_USER_PREFIX_LEN + 2);
+
+	/* 8 XDR bytes per such name, plus the 4-byte trailing eof word */
+	return max_names * 8 + 4;
+}
+#endif /* CONFIG_NFS_V4_2 */
+#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
new file mode 100644
index 000000000..dfeea7120
--- /dev/null
+++ b/fs/nfs/nfs42proc.c
@@ -0,0 +1,1373 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
+ */
+#include <linux/fs.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_xdr.h>
+#include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
+#include "nfs42.h"
+#include "iostat.h"
+#include "pnfs.h"
+#include "nfs4session.h"
+#include "internal.h"
+#include "delegation.h"
+#include "nfs4trace.h"
+
+#define NFSDBG_FACILITY NFSDBG_PROC
+static int nfs42_do_offload_cancel_async(struct file *dst, nfs4_stateid *std);
+
+/*
+ * Fill @naddr with the netid and address of the server that @filep lives
+ * on. The address is the peer's address string followed by ".hi.lo",
+ * where hi/lo are the two bytes of the hard-coded port 2049.
+ */
+static void nfs42_set_netaddr(struct file *filep, struct nfs42_netaddr *naddr)
+{
+	struct nfs_client *clp = (NFS_SERVER(file_inode(filep)))->nfs_client;
+	unsigned short port = 2049;
+	const char *peer;
+
+	/* the strings handed out by rpc_peeraddr2str() are used under RCU */
+	rcu_read_lock();
+
+	peer = rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_NETID);
+	naddr->netid_len = scnprintf(naddr->netid, sizeof(naddr->netid),
+				     "%s", peer);
+
+	peer = rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR);
+	naddr->addr_len = scnprintf(naddr->addr, sizeof(naddr->addr),
+				    "%s.%u.%u", peer, port >> 8, port & 255);
+
+	rcu_read_unlock();
+}
+
+/*
+ * Issue one ALLOCATE/DEALLOCATE style call described by @msg for the byte
+ * range [@offset, @offset + @len) of @filep, using a write stateid taken
+ * from @lock. On success the returned attributes are folded back into the
+ * inode. Returns zero or a negative error; -EAGAIN from the stateid
+ * lookup is reported as -NFS4ERR_BAD_STATEID.
+ */
+static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
+		struct nfs_lock_context *lock, loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(filep);
+	struct nfs_server *server = NFS_SERVER(inode);
+	u32 bitmask[3];
+	struct nfs42_falloc_args args = {
+		.falloc_fh = NFS_FH(inode),
+		.falloc_offset = offset,
+		.falloc_length = len,
+		.falloc_bitmask = bitmask,
+	};
+	struct nfs42_falloc_res res = {
+		.falloc_server = server,
+	};
+	int ret;
+
+	msg->rpc_argp = &args;
+	msg->rpc_resp = &res;
+
+	ret = nfs4_set_rw_stateid(&args.falloc_stateid, lock->open_context,
+				  lock, FMODE_WRITE);
+	if (ret == -EAGAIN)
+		ret = -NFS4ERR_BAD_STATEID;
+	if (ret)
+		return ret;
+
+	/* start from the cache-consistency attributes, plus space_used if supported */
+	memcpy(bitmask, server->cache_consistency_bitmask, sizeof(bitmask));
+	if (server->attr_bitmask[1] & FATTR4_WORD1_SPACE_USED)
+		bitmask[1] |= FATTR4_WORD1_SPACE_USED;
+
+	res.falloc_fattr = nfs_alloc_fattr();
+	if (!res.falloc_fattr)
+		return -ENOMEM;
+
+	ret = nfs4_call_sync(server->client, server, msg,
+			     &args.seq_args, &res.seq_res, 0);
+	if (!ret)
+		ret = nfs_post_op_update_inode_force_wcc(inode,
+							 res.falloc_fattr);
+
+	kfree(res.falloc_fattr);
+	return ret;
+}
+
+/*
+ * Retry wrapper around _nfs42_proc_fallocate(): takes a lock context for
+ * @filep, syncs the inode, then repeats the call for as long as
+ * nfs4_handle_exception() asks for a retry. -ENOTSUPP from the call is
+ * reported to the caller as -EOPNOTSUPP without further handling.
+ */
+static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
+				loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(filep);
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs4_exception exception = { };
+	struct nfs_lock_context *lock;
+	int err;
+
+	lock = nfs_get_lock_context(nfs_file_open_context(filep));
+	if (IS_ERR(lock))
+		return PTR_ERR(lock);
+
+	exception.inode = inode;
+	exception.state = lock->open_context->state;
+
+	err = nfs_sync_inode(inode);
+	if (err)
+		goto put_lock;
+
+	for (;;) {
+		err = _nfs42_proc_fallocate(msg, filep, lock, offset, len);
+		if (err == -ENOTSUPP) {
+			err = -EOPNOTSUPP;
+			break;
+		}
+		err = nfs4_handle_exception(server, err, &exception);
+		if (!exception.retry)
+			break;
+	}
+
+put_lock:
+	nfs_put_lock_context(lock);
+	return err;
+}
+
+/*
+ * ALLOCATE the range [@offset, @offset + @len) of @filep under the inode
+ * lock. Returns -EOPNOTSUPP right away if the server lacks
+ * NFS_CAP_ALLOCATE, and clears that capability if the call itself
+ * reports -EOPNOTSUPP.
+ */
+int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(filep);
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE],
+	};
+	int ret;
+
+	if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
+		return -EOPNOTSUPP;
+
+	inode_lock(inode);
+	ret = nfs42_proc_fallocate(&msg, filep, offset, len);
+	if (ret == -EOPNOTSUPP)
+		NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE;
+	inode_unlock(inode);
+
+	return ret;
+}
+
+/*
+ * DEALLOCATE the range [@offset, @offset + @len) of @filep under the
+ * inode lock. On success the page cache for that range is truncated; if
+ * the call reports -EOPNOTSUPP the NFS_CAP_DEALLOCATE capability is
+ * cleared. Returns -EOPNOTSUPP right away if the capability is not set.
+ */
+int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(filep);
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DEALLOCATE],
+	};
+	int ret;
+
+	if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE))
+		return -EOPNOTSUPP;
+
+	inode_lock(inode);
+	ret = nfs42_proc_fallocate(&msg, filep, offset, len);
+	switch (ret) {
+	case 0:
+		/* last byte of the range is inclusive */
+		truncate_pagecache_range(inode, offset, (offset + len) - 1);
+		break;
+	case -EOPNOTSUPP:
+		NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
+		break;
+	default:
+		break;
+	}
+	inode_unlock(inode);
+
+	return ret;
+}
+
+/*
+ * Wait for the outcome of an asynchronous COPY whose stateid is in
+ * @res->write_res.stateid.
+ *
+ * If a matching entry is already on the client's pending_cb_stateids
+ * list, it is taken off the list and its result is used directly.
+ * Otherwise a new nfs4_copy_state is queued on the server copy list(s)
+ * and the caller sleeps (interruptibly) on its completion.
+ *
+ * On the normal path the count and verifier are copied into @res and
+ * -copy->error is returned. If the wait is interrupted, or the copy
+ * state has flags set or failed with NFS4ERR_PARTNER_NO_AUTH (in which
+ * case *@restart is set and -EAGAIN is returned), the offload is
+ * cancelled on the destination and, for cross-server copies, the source.
+ */
+static int handle_async_copy(struct nfs42_copy_res *res,
+			     struct nfs_server *dst_server,
+			     struct nfs_server *src_server,
+			     struct file *src,
+			     struct file *dst,
+			     nfs4_stateid *src_stateid,
+			     bool *restart)
+{
+	struct nfs4_copy_state *copy, *iter;
+	struct nfs4_copy_state *pending = NULL;
+	int status = NFS4_OK;
+	struct nfs_open_context *dst_ctx = nfs_file_open_context(dst);
+	struct nfs_open_context *src_ctx = nfs_file_open_context(src);
+
+	copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS);
+	if (!copy)
+		return -ENOMEM;
+
+	spin_lock(&dst_server->nfs_client->cl_lock);
+	list_for_each_entry(iter,
+			    &dst_server->nfs_client->pending_cb_stateids,
+			    copies) {
+		if (memcmp(&res->write_res.stateid, &iter->stateid,
+			   NFS4_STATEID_SIZE) == 0) {
+			pending = iter;
+			list_del(&iter->copies);
+			break;
+		}
+	}
+	if (pending) {
+		/* a result is already waiting: use it in place of our allocation */
+		spin_unlock(&dst_server->nfs_client->cl_lock);
+		kfree(copy);
+		copy = pending;
+		goto harvest;
+	}
+
+	memcpy(&copy->stateid, &res->write_res.stateid, NFS4_STATEID_SIZE);
+	init_completion(&copy->completion);
+	copy->parent_dst_state = dst_ctx->state;
+	copy->parent_src_state = src_ctx->state;
+
+	list_add_tail(&copy->copies, &dst_server->ss_copies);
+	spin_unlock(&dst_server->nfs_client->cl_lock);
+
+	if (dst_server != src_server) {
+		spin_lock(&src_server->nfs_client->cl_lock);
+		list_add_tail(&copy->src_copies, &src_server->ss_copies);
+		spin_unlock(&src_server->nfs_client->cl_lock);
+	}
+
+	status = wait_for_completion_interruptible(&copy->completion);
+
+	/* whatever the outcome, unhook the state from the list(s) again */
+	spin_lock(&dst_server->nfs_client->cl_lock);
+	list_del_init(&copy->copies);
+	spin_unlock(&dst_server->nfs_client->cl_lock);
+	if (dst_server != src_server) {
+		spin_lock(&src_server->nfs_client->cl_lock);
+		list_del_init(&copy->src_copies);
+		spin_unlock(&src_server->nfs_client->cl_lock);
+	}
+
+	if (status == -ERESTARTSYS)
+		goto cancel;
+	if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) {
+		status = -EAGAIN;
+		*restart = true;
+		goto cancel;
+	}
+
+harvest:
+	res->write_res.count = copy->count;
+	memcpy(&res->write_res.verifier, &copy->verf, sizeof(copy->verf));
+	status = -copy->error;
+
+release:
+	kfree(copy);
+	return status;
+
+cancel:
+	nfs42_do_offload_cancel_async(dst, &copy->stateid);
+	if (!nfs42_files_from_same_server(src, dst))
+		nfs42_do_offload_cancel_async(src, src_stateid);
+	goto release;
+}
+
+/*
+ * COMMIT the @res->write_res.count bytes written at @pos_dst of @dst and
+ * compare the commit verifier with the one returned by the copy.
+ * Returns zero if they agree, -EAGAIN if they differ, -ENOMEM if the
+ * verifier cannot be allocated, or the error from the commit itself.
+ */
+static int process_copy_commit(struct file *dst, loff_t pos_dst,
+			       struct nfs42_copy_res *res)
+{
+	struct nfs_commitres cres;
+	int status;
+
+	cres.verf = kzalloc(sizeof(*cres.verf), GFP_NOFS);
+	if (!cres.verf)
+		return -ENOMEM;
+
+	status = nfs4_proc_commit(dst, pos_dst, res->write_res.count, &cres);
+	if (!status &&
+	    nfs_write_verifier_cmp(&res->write_res.verifier.verifier,
+				   &cres.verf->verifier)) {
+		dprintk("commit verf differs from copy verf\n");
+		status = -EAGAIN;
+	}
+
+	kfree(cres.verf);
+	return status;
+}
+
+/*
+ * Perform one COPY attempt from @src to @dst as described by @args.
+ *
+ * Steps, in order: choose the source stateid (the copy-notify stateid
+ * when @nss is given, otherwise a read stateid from @src_lock), flush the
+ * source range, pick the destination write stateid, sync the destination
+ * inode, send the COPY, wait for an asynchronous copy if the reply is not
+ * synchronous, COMMIT if the data is not yet stable, and finally
+ * invalidate the cached destination pages and mark both inodes' cached
+ * attributes for revalidation.
+ *
+ * Returns the number of bytes copied or a negative error. -EAGAIN from
+ * a stateid lookup is reported as -NFS4ERR_BAD_STATEID.
+ */
+static ssize_t _nfs42_proc_copy(struct file *src,
+				struct nfs_lock_context *src_lock,
+				struct file *dst,
+				struct nfs_lock_context *dst_lock,
+				struct nfs42_copy_args *args,
+				struct nfs42_copy_res *res,
+				struct nl4_server *nss,
+				nfs4_stateid *cnr_stateid,
+				bool *restart)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY],
+		.rpc_argp = args,
+		.rpc_resp = res,
+	};
+	struct inode *dst_inode = file_inode(dst);
+	struct inode *src_inode = file_inode(src);
+	struct nfs_server *dst_server = NFS_SERVER(dst_inode);
+	struct nfs_server *src_server = NFS_SERVER(src_inode);
+	loff_t pos_src = args->src_pos;
+	loff_t pos_dst = args->dst_pos;
+	size_t count = args->count;
+	ssize_t status;
+
+	if (nss) {
+		/* server-to-server copy: use the stateid from COPY_NOTIFY */
+		args->cp_src = nss;
+		nfs4_stateid_copy(&args->src_stateid, cnr_stateid);
+	} else {
+		status = nfs4_set_rw_stateid(&args->src_stateid,
+				src_lock->open_context, src_lock, FMODE_READ);
+		if (status == -EAGAIN)
+			status = -NFS4ERR_BAD_STATEID;
+		if (status)
+			return status;
+	}
+
+	/* write back dirty source pages in the range being copied */
+	status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping,
+			pos_src, pos_src + (loff_t)count - 1);
+	if (status)
+		return status;
+
+	status = nfs4_set_rw_stateid(&args->dst_stateid, dst_lock->open_context,
+				     dst_lock, FMODE_WRITE);
+	if (status == -EAGAIN)
+		status = -NFS4ERR_BAD_STATEID;
+	if (status)
+		return status;
+
+	status = nfs_sync_inode(dst_inode);
+	if (status)
+		return status;
+
+	/* a commit verifier is only needed for a synchronous request */
+	res->commit_res.verf = NULL;
+	if (args->sync) {
+		res->commit_res.verf =
+			kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS);
+		if (!res->commit_res.verf)
+			return -ENOMEM;
+	}
+
+	set_bit(NFS_CLNT_SRC_SSC_COPY_STATE,
+		&src_lock->open_context->state->flags);
+	set_bit(NFS_CLNT_DST_SSC_COPY_STATE,
+		&dst_lock->open_context->state->flags);
+
+	status = nfs4_call_sync(dst_server->client, dst_server, &msg,
+				&args->seq_args, &res->seq_res, 0);
+	if (status == -ENOTSUPP)
+		dst_server->caps &= ~NFS_CAP_COPY;
+	if (status)
+		goto free_verf;
+
+	if (args->sync &&
+	    nfs_write_verifier_cmp(&res->write_res.verifier.verifier,
+				   &res->commit_res.verf->verifier)) {
+		status = -EAGAIN;
+		goto free_verf;
+	}
+
+	if (!res->synchronous) {
+		status = handle_async_copy(res, dst_server, src_server, src,
+					   dst, &args->src_stateid, restart);
+		if (status)
+			goto free_verf;
+	}
+
+	if ((!res->synchronous || !args->sync) &&
+	    res->write_res.verifier.committed != NFS_FILE_SYNC) {
+		status = process_copy_commit(dst, pos_dst, res);
+		if (status)
+			goto free_verf;
+	}
+
+	/* drop cached destination pages covering the copied range */
+	WARN_ON_ONCE(invalidate_inode_pages2_range(dst_inode->i_mapping,
+					pos_dst >> PAGE_SHIFT,
+					(pos_dst + res->write_res.count - 1) >> PAGE_SHIFT));
+
+	spin_lock(&dst_inode->i_lock);
+	NFS_I(dst_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE |
+			NFS_INO_REVAL_FORCED | NFS_INO_INVALID_SIZE |
+			NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA);
+	spin_unlock(&dst_inode->i_lock);
+
+	spin_lock(&src_inode->i_lock);
+	NFS_I(src_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE |
+			NFS_INO_REVAL_FORCED | NFS_INO_INVALID_ATIME);
+	spin_unlock(&src_inode->i_lock);
+
+	status = res->write_res.count;
+
+free_verf:
+	if (args->sync)
+		kfree(res->commit_res.verf);
+	return status;
+}
+
+/*
+ * Copy @count bytes from @src at @pos_src to @dst at @pos_dst, retrying
+ * _nfs42_proc_copy() under the destination inode lock until it succeeds
+ * or fails for good. @nss and @cnr_stateid are passed through to
+ * _nfs42_proc_copy(); @sync is the initial value of the request's sync
+ * flag.
+ *
+ * Returns the number of bytes copied or a negative error. Several
+ * failures (see the loop body) are reported as -EOPNOTSUPP.
+ */
+ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
+			struct file *dst, loff_t pos_dst, size_t count,
+			struct nl4_server *nss,
+			nfs4_stateid *cnr_stateid, bool sync)
+{
+	struct nfs_server *server = NFS_SERVER(file_inode(dst));
+	struct nfs_lock_context *src_lock;
+	struct nfs_lock_context *dst_lock;
+	struct nfs42_copy_args args = {
+		.src_fh = NFS_FH(file_inode(src)),
+		.src_pos = pos_src,
+		.dst_fh = NFS_FH(file_inode(dst)),
+		.dst_pos = pos_dst,
+		.count = count,
+		.sync = sync,
+	};
+	struct nfs42_copy_res res;
+	struct nfs4_exception src_exception = {
+		.inode = file_inode(src),
+		.stateid = &args.src_stateid,
+	};
+	struct nfs4_exception dst_exception = {
+		.inode = file_inode(dst),
+		.stateid = &args.dst_stateid,
+	};
+	ssize_t err, src_err;
+	bool restart = false;
+
+	src_lock = nfs_get_lock_context(nfs_file_open_context(src));
+	if (IS_ERR(src_lock))
+		return PTR_ERR(src_lock);
+
+	src_exception.state = src_lock->open_context->state;
+
+	dst_lock = nfs_get_lock_context(nfs_file_open_context(dst));
+	if (IS_ERR(dst_lock)) {
+		err = PTR_ERR(dst_lock);
+		goto out_put_src_lock;
+	}
+
+	dst_exception.state = dst_lock->open_context->state;
+
+	do {
+		inode_lock(file_inode(dst));
+		err = _nfs42_proc_copy(src, src_lock,
+				       dst, dst_lock,
+				       &args, &res,
+				       nss, cnr_stateid, &restart);
+		inode_unlock(file_inode(dst));
+
+		if (err >= 0)
+			break;
+
+		/* same-server copy not supported: give up */
+		if (err == -ENOTSUPP &&
+		    nfs42_files_from_same_server(src, dst)) {
+			err = -EOPNOTSUPP;
+			break;
+		}
+
+		/* -EAGAIN: retry here unless a restart was requested */
+		if (err == -EAGAIN) {
+			if (restart)
+				break;
+			dst_exception.retry = 1;
+			continue;
+		}
+
+		/* retry using the synchronous/asynchronous mode from the reply */
+		if (err == -NFS4ERR_OFFLOAD_NO_REQS &&
+		    args.sync != res.synchronous) {
+			args.sync = res.synchronous;
+			dst_exception.retry = 1;
+			continue;
+		}
+
+		/* cross-server copy refused: cancel on the source and give up */
+		if ((err == -ESTALE ||
+		     err == -NFS4ERR_OFFLOAD_DENIED ||
+		     err == -ENOTSUPP) &&
+		    !nfs42_files_from_same_server(src, dst)) {
+			nfs42_do_offload_cancel_async(src, &args.src_stateid);
+			err = -EOPNOTSUPP;
+			break;
+		}
+
+		/* both sides get a chance to recover from the same error */
+		src_err = nfs4_handle_exception(server, err, &src_exception);
+		err = nfs4_handle_exception(server, err, &dst_exception);
+		if (!err)
+			err = src_err;
+	} while (src_exception.retry || dst_exception.retry);
+
+	nfs_put_lock_context(dst_lock);
+out_put_src_lock:
+	nfs_put_lock_context(src_lock);
+	return err;
+}
+
+struct nfs42_offloadcancel_data { /* callback data for the async OFFLOAD_CANCEL task */
+ struct nfs_server *seq_server; /* server whose nfs_client is used for sequence setup and error handling */
+ struct nfs42_offload_status_args args; /* RPC arguments (becomes rpc_argp) */
+ struct nfs42_offload_status_res res; /* RPC results (becomes rpc_resp) */
+};
+
+/* rpc_call_prepare hook: set up the sequence for the cancel call. */
+static void nfs42_offload_cancel_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs42_offloadcancel_data *cancel = calldata;
+	struct nfs_client *clp = cancel->seq_server->nfs_client;
+
+	nfs4_setup_sequence(clp, &cancel->args.osa_seq_args,
+			    &cancel->res.osr_seq_res, task);
+}
+
+/*
+ * rpc_call_done hook: finish the sequence, then restart the call if the
+ * task failed and the async error handler answers -EAGAIN.
+ */
+static void nfs42_offload_cancel_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs42_offloadcancel_data *cancel = calldata;
+
+	nfs41_sequence_done(task, &cancel->res.osr_seq_res);
+
+	if (!task->tk_status)
+		return;
+
+	if (nfs4_async_handle_error(task, cancel->seq_server, NULL,
+				    NULL) == -EAGAIN)
+		rpc_restart_call_prepare(task);
+}
+
+/* rpc_release hook: free the callback data allocated for the cancel call. */
+static void nfs42_free_offloadcancel_data(void *data)
+{
+	struct nfs42_offloadcancel_data *cancel = data;
+
+	kfree(cancel);
+}
+
+static const struct rpc_call_ops nfs42_offload_cancel_ops = { /* callbacks for the async OFFLOAD_CANCEL task */
+ .rpc_call_prepare = nfs42_offload_cancel_prepare, /* sequence setup */
+ .rpc_call_done = nfs42_offload_cancel_done, /* sequence completion, restart on -EAGAIN */
+ .rpc_release = nfs42_free_offloadcancel_data, /* kfree() of the callback data */
+};
+
+/*
+ * Send OFFLOAD_CANCEL for @stateid to the server that @dst lives on,
+ * using an async RPC task, and wait for that task to complete.
+ *
+ * Returns -EOPNOTSUPP if the server lacks NFS_CAP_OFFLOAD_CANCEL,
+ * -ENOMEM if the call data cannot be allocated, the error from
+ * rpc_run_task() if the task cannot be started, or otherwise the result
+ * of rpc_wait_for_completion_task(). The capability is cleared when that
+ * result is -ENOTSUPP.
+ */
+static int nfs42_do_offload_cancel_async(struct file *dst,
+					 nfs4_stateid *stateid)
+{
+	struct nfs_server *dst_server = NFS_SERVER(file_inode(dst));
+	struct nfs_open_context *ctx = nfs_file_open_context(dst);
+	struct nfs42_offloadcancel_data *cancel = NULL;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OFFLOAD_CANCEL],
+		.rpc_cred = ctx->cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = dst_server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs42_offload_cancel_ops,
+		.workqueue = nfsiod_workqueue,
+		.flags = RPC_TASK_ASYNC,
+	};
+	struct rpc_task *task;
+	int status;
+
+	if (!(dst_server->caps & NFS_CAP_OFFLOAD_CANCEL))
+		return -EOPNOTSUPP;
+
+	cancel = kzalloc(sizeof(*cancel), GFP_NOFS);
+	if (!cancel)
+		return -ENOMEM;
+
+	cancel->seq_server = dst_server;
+	cancel->args.osa_src_fh = NFS_FH(file_inode(dst));
+	memcpy(&cancel->args.osa_stateid, stateid,
+	       sizeof(cancel->args.osa_stateid));
+
+	msg.rpc_argp = &cancel->args;
+	msg.rpc_resp = &cancel->res;
+	task_setup_data.callback_data = cancel;
+	nfs4_init_sequence(&cancel->args.osa_seq_args,
+			   &cancel->res.osr_seq_res, 1, 0);
+
+	/* NOTE(review): cancel is not freed here on failure; presumably the rpc_release hook covers it -- confirm */
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	status = rpc_wait_for_completion_task(task);
+	if (status == -ENOTSUPP)
+		dst_server->caps &= ~NFS_CAP_OFFLOAD_CANCEL;
+	rpc_put_task(task);
+	return status;
+}
+
+/*
+ * Send one COPY_NOTIFY for @src to the source server, using a read
+ * stateid obtained from @src's open context. @dst is not used here.
+ *
+ * Returns zero or a negative error; -EAGAIN from the stateid lookup is
+ * reported as -NFS4ERR_BAD_STATEID, and -ENOTSUPP from the call clears
+ * NFS_CAP_COPY_NOTIFY on the source server.
+ */
+static int _nfs42_proc_copy_notify(struct file *src, struct file *dst,
+		struct nfs42_copy_notify_args *args,
+		struct nfs42_copy_notify_res *res)
+{
+	struct nfs_server *src_server = NFS_SERVER(file_inode(src));
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY_NOTIFY],
+		.rpc_argp = args,
+		.rpc_resp = res,
+	};
+	struct nfs_open_context *open_ctx;
+	struct nfs_lock_context *lock_ctx;
+	int status;
+
+	/* hold a reference on the open context for the duration of the call */
+	open_ctx = get_nfs_open_context(nfs_file_open_context(src));
+	lock_ctx = nfs_get_lock_context(open_ctx);
+	if (IS_ERR(lock_ctx)) {
+		status = PTR_ERR(lock_ctx);
+		goto put_ctx;
+	}
+
+	status = nfs4_set_rw_stateid(&args->cna_src_stateid, open_ctx,
+				     lock_ctx, FMODE_READ);
+	nfs_put_lock_context(lock_ctx);
+	if (status == -EAGAIN)
+		status = -NFS4ERR_BAD_STATEID;
+	if (status)
+		goto put_ctx;
+
+	status = nfs4_call_sync(src_server->client, src_server, &msg,
+				&args->cna_seq_args, &res->cnr_seq_res, 0);
+	if (status == -ENOTSUPP)
+		src_server->caps &= ~NFS_CAP_COPY_NOTIFY;
+
+put_ctx:
+	put_nfs_open_context(nfs_file_open_context(src));
+	return status;
+}
+
+/*
+ * Ask the server holding @src to authorize a copy towards the server
+ * holding @dst. The destination is described as a netaddr built by
+ * nfs42_set_netaddr(), and the reply is stored in @res.
+ *
+ * The call is repeated for as long as nfs4_handle_exception() asks for
+ * a retry. Returns -EOPNOTSUPP if the source server lacks
+ * NFS_CAP_COPY_NOTIFY or the call reports -ENOTSUPP, -ENOMEM if the
+ * arguments cannot be allocated, or otherwise the final status.
+ */
+int nfs42_proc_copy_notify(struct file *src, struct file *dst,
+				struct nfs42_copy_notify_res *res)
+{
+	struct nfs_server *src_server = NFS_SERVER(file_inode(src));
+	struct nfs42_copy_notify_args *args;
+	struct nfs4_exception exception = {
+		.inode = file_inode(src),
+	};
+	int status;
+
+	if (!(src_server->caps & NFS_CAP_COPY_NOTIFY))
+		return -EOPNOTSUPP;
+
+	args = kzalloc(sizeof(struct nfs42_copy_notify_args), GFP_NOFS);
+	if (args == NULL)
+		return -ENOMEM;
+
+	/* terminated with ';' (was ','), so each assignment is its own statement */
+	args->cna_src_fh = NFS_FH(file_inode(src));
+	args->cna_dst.nl4_type = NL4_NETADDR;
+	nfs42_set_netaddr(dst, &args->cna_dst.u.nl4_addr);
+	exception.stateid = &args->cna_src_stateid;
+
+	do {
+		status = _nfs42_proc_copy_notify(src, dst, args, res);
+		if (status == -ENOTSUPP) {
+			status = -EOPNOTSUPP;
+			goto out;
+		}
+		status = nfs4_handle_exception(src_server, status, &exception);
+	} while (exception.retry);
+
+out:
+	kfree(args);
+	return status;
+}
+
+/*
+ * Send one SEEK for @filep starting at @offset, looking for a hole when
+ * @whence is SEEK_HOLE and for data otherwise. Dirty pages from @offset
+ * onwards are written back first.
+ *
+ * Returns the new file position set through vfs_setpos(), or a negative
+ * error: -ENOTSUPP if the server lacks NFS_CAP_SEEK (also cleared when
+ * the call itself reports -ENOTSUPP), -NFS4ERR_BAD_STATEID in place of
+ * -EAGAIN from the stateid lookup, and -NFS4ERR_NXIO when a SEEK_DATA
+ * request comes back with the eof flag set.
+ */
+static loff_t _nfs42_proc_llseek(struct file *filep,
+		struct nfs_lock_context *lock, loff_t offset, int whence)
+{
+	struct inode *inode = file_inode(filep);
+	struct nfs42_seek_args args = {
+		.sa_fh = NFS_FH(inode),
+		.sa_offset = offset,
+		.sa_what = (whence == SEEK_HOLE) ?
+				NFS4_CONTENT_HOLE : NFS4_CONTENT_DATA,
+	};
+	struct nfs42_seek_res res;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEEK],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	struct nfs_server *server = NFS_SERVER(inode);
+	int ret;
+
+	if (!nfs_server_capable(inode, NFS_CAP_SEEK))
+		return -ENOTSUPP;
+
+	ret = nfs4_set_rw_stateid(&args.sa_stateid, lock->open_context,
+				  lock, FMODE_READ);
+	if (ret == -EAGAIN)
+		ret = -NFS4ERR_BAD_STATEID;
+	if (ret)
+		return ret;
+
+	ret = nfs_filemap_write_and_wait_range(inode->i_mapping,
+					       offset, LLONG_MAX);
+	if (ret)
+		return ret;
+
+	ret = nfs4_call_sync(server->client, server, &msg,
+			     &args.seq_args, &res.seq_res, 0);
+	if (ret == -ENOTSUPP)
+		server->caps &= ~NFS_CAP_SEEK;
+	if (ret)
+		return ret;
+
+	if (whence == SEEK_DATA && res.sr_eof)
+		return -NFS4ERR_NXIO;
+
+	return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
+}
+
+/*
+ * Retry wrapper around _nfs42_proc_llseek().
+ *
+ * Returns the new file position (>= 0) or a negative error.  -ENOTSUPP from
+ * the lower layer is reported as -EOPNOTSUPP without consulting the
+ * exception handler; every other error goes through
+ * nfs4_handle_exception(), which decides whether to try again.
+ */
+loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
+{
+	struct inode *inode = file_inode(filep);
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs4_exception exception = { };
+	struct nfs_lock_context *lock;
+	loff_t pos;
+
+	lock = nfs_get_lock_context(nfs_file_open_context(filep));
+	if (IS_ERR(lock))
+		return PTR_ERR(lock);
+
+	exception.inode = inode;
+	exception.state = lock->open_context->state;
+
+	for (;;) {
+		pos = _nfs42_proc_llseek(filep, lock, offset, whence);
+		if (pos >= 0)
+			break;
+		if (pos == -ENOTSUPP) {
+			pos = -EOPNOTSUPP;
+			break;
+		}
+		pos = nfs4_handle_exception(server, pos, &exception);
+		if (!exception.retry)
+			break;
+	}
+
+	nfs_put_lock_context(lock);
+	return pos;
+}
+
+
+/*
+ * rpc_call_prepare callback for LAYOUTSTATS.
+ *
+ * Copies the inode's current layout stateid into the arguments while holding
+ * i_lock, then calls nfs4_setup_sequence().  If the layout is not valid the
+ * task is exited with status 0 instead.
+ */
+static void
+nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs42_layoutstat_data *data = calldata;
+	struct inode *inode = data->inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct pnfs_layout_hdr *hdr;
+	bool valid;
+
+	spin_lock(&inode->i_lock);
+	hdr = NFS_I(inode)->layout;
+	valid = pnfs_layout_is_valid(hdr);
+	if (valid)
+		nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid);
+	spin_unlock(&inode->i_lock);
+
+	if (!valid) {
+		rpc_exit(task, 0);
+		return;
+	}
+
+	nfs4_setup_sequence(server->nfs_client, &data->args.seq_args,
+			    &data->res.seq_res, task);
+}
+
+/*
+ * rpc_call_done callback for LAYOUTSTATS.
+ *
+ * On success it returns straight away.  On failure it reacts according to
+ * task->tk_status (destroy the layout, invalidate the layout stateid,
+ * restart the call, or drop NFS_CAP_LAYOUTSTATS) and then emits the
+ * nfs4_layoutstats tracepoint.
+ */
+static void
+nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs42_layoutstat_data *data = calldata;
+	struct inode *inode = data->inode;
+	struct pnfs_layout_hdr *hdr;
+
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return;
+
+	switch (task->tk_status) {
+	case 0:
+		return;
+	case -NFS4ERR_BADHANDLE:
+	case -ESTALE:
+		pnfs_destroy_layout(NFS_I(inode));
+		break;
+	case -NFS4ERR_EXPIRED:
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_DELEG_REVOKED:
+	case -NFS4ERR_STALE_STATEID:
+	case -NFS4ERR_BAD_STATEID: {
+		LIST_HEAD(stale_lsegs);
+
+		spin_lock(&inode->i_lock);
+		hdr = NFS_I(inode)->layout;
+		/* Only act if the stateid we sent is still the current one. */
+		if (!pnfs_layout_is_valid(hdr) ||
+		    !nfs4_stateid_match(&data->args.stateid,
+					&hdr->plh_stateid)) {
+			spin_unlock(&inode->i_lock);
+			break;
+		}
+
+		/*
+		 * Mark the bad layout state as invalid, then retry
+		 * with the current stateid.
+		 */
+		pnfs_mark_layout_stateid_invalid(hdr, &stale_lsegs);
+		spin_unlock(&inode->i_lock);
+		pnfs_free_lseg_list(&stale_lsegs);
+		nfs_commit_inode(inode, 0);
+		break;
+	}
+	case -NFS4ERR_OLD_STATEID:
+		spin_lock(&inode->i_lock);
+		hdr = NFS_I(inode)->layout;
+		if (pnfs_layout_is_valid(hdr) &&
+		    nfs4_stateid_match_other(&data->args.stateid,
+					     &hdr->plh_stateid)) {
+			/* Do we need to delay before resending? */
+			if (!nfs4_stateid_is_newer(&hdr->plh_stateid,
+						   &data->args.stateid))
+				rpc_delay(task, HZ);
+			rpc_restart_call_prepare(task);
+		}
+		spin_unlock(&inode->i_lock);
+		break;
+	case -ENOTSUPP:
+	case -EOPNOTSUPP:
+		NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
+		break;
+	}
+
+	trace_nfs4_layoutstats(inode, &data->args.stateid, task->tk_status);
+}
+
+/*
+ * rpc_release callback for LAYOUTSTATS; also called directly by
+ * nfs42_proc_layoutstats_generic() when the inode cannot be pinned.
+ *
+ * Runs each device's private free hook, drops the layout header reference,
+ * clears NFS_INO_LAYOUTSTATS on the inode, releases the inode and frees both
+ * the devinfo array and @calldata itself.
+ */
+static void
+nfs42_layoutstat_release(void *calldata)
+{
+	struct nfs42_layoutstat_data *data = calldata;
+	struct nfs42_layoutstat_devinfo *devinfo = data->args.devinfo;
+	struct nfs42_layoutstat_devinfo *dev;
+	int i;
+
+	for (i = 0; i < data->args.num_dev; i++) {
+		dev = &devinfo[i];
+		if (!dev->ld_private.ops || !dev->ld_private.ops->free)
+			continue;
+		dev->ld_private.ops->free(&dev->ld_private);
+	}
+
+	pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout);
+
+	/* Barriers bracket the flag clear, as for any non-value-returning bitop. */
+	smp_mb__before_atomic();
+	clear_bit(NFS_INO_LAYOUTSTATS, &NFS_I(data->args.inode)->flags);
+	smp_mb__after_atomic();
+
+	nfs_iput_and_deactive(data->inode);
+	kfree(data->args.devinfo);
+	kfree(data);
+}
+
+static const struct rpc_call_ops nfs42_layoutstat_ops = { /* callbacks of the async LAYOUTSTATS task set up in nfs42_proc_layoutstats_generic() */
+ .rpc_call_prepare = nfs42_layoutstat_prepare, /* picks up the layout stateid, sets up the sequence */
+ .rpc_call_done = nfs42_layoutstat_done, /* reacts to task->tk_status */
+ .rpc_release = nfs42_layoutstat_release, /* frees the calldata */
+};
+
+/*
+ * Start an asynchronous LAYOUTSTATS call described by @data.
+ *
+ * Takes an active reference on data->args.inode first; if that fails @data
+ * is released here and -EAGAIN is returned.  Otherwise returns 0 once the
+ * task has been started, or the error encoded in the pointer returned by
+ * rpc_run_task().
+ */
+int nfs42_proc_layoutstats_generic(struct nfs_server *server,
+				   struct nfs42_layoutstat_data *data)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTSTATS],
+		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
+	};
+	struct rpc_task_setup setup = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs42_layoutstat_ops,
+		.callback_data = data,
+		.flags = RPC_TASK_ASYNC,
+	};
+	struct rpc_task *task;
+
+	data->inode = nfs_igrab_and_active(data->args.inode);
+	if (data->inode == NULL) {
+		nfs42_layoutstat_release(data);
+		return -EAGAIN;
+	}
+
+	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0);
+
+	task = rpc_run_task(&setup);
+	if (!IS_ERR(task)) {
+		/* The task runs on its own; we do not wait for it. */
+		rpc_put_task(task);
+		return 0;
+	}
+	return PTR_ERR(task);
+}
+
+/*
+ * Allocate zeroed call data for LAYOUTERROR and pin what it refers to: an
+ * active reference on the layout's inode and a reference on @lseg.
+ *
+ * Returns the new structure, or NULL if the allocation or either reference
+ * could not be obtained (anything already taken is dropped again).
+ */
+static struct nfs42_layouterror_data *
+nfs42_alloc_layouterror_data(struct pnfs_layout_segment *lseg, gfp_t gfp_flags)
+{
+	struct inode *inode = lseg->pls_layout->plh_inode;
+	struct nfs42_layouterror_data *data;
+
+	data = kzalloc(sizeof(*data), gfp_flags);
+	if (!data)
+		return NULL;
+
+	data->inode = nfs_igrab_and_active(inode);
+	data->args.inode = data->inode;
+	if (!data->inode)
+		goto out_free;
+
+	data->lseg = pnfs_get_lseg(lseg);
+	if (!data->lseg)
+		goto out_iput;
+
+	return data;
+
+out_iput:
+	nfs_iput_and_deactive(data->inode);
+out_free:
+	kfree(data);
+	return NULL;
+}
+
+/*
+ * Undo nfs42_alloc_layouterror_data(): drop the layout segment reference,
+ * release the inode, then free @data.
+ */
+static void
+nfs42_free_layouterror_data(struct nfs42_layouterror_data *data)
+{
+	struct pnfs_layout_segment *lseg = data->lseg;
+	struct inode *inode = data->inode;
+
+	pnfs_put_lseg(lseg);
+	nfs_iput_and_deactive(inode);
+	kfree(data);
+}
+
+/*
+ * rpc_call_prepare callback for LAYOUTERROR.
+ *
+ * Stamps every queued error record with the layout's current stateid while
+ * holding i_lock, then calls nfs4_setup_sequence().  If the layout is not
+ * valid the task is exited with status 0 instead.
+ */
+static void
+nfs42_layouterror_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs42_layouterror_data *data = calldata;
+	struct inode *inode = data->inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct pnfs_layout_hdr *hdr = data->lseg->pls_layout;
+	unsigned int idx;
+	bool valid;
+
+	spin_lock(&inode->i_lock);
+	valid = pnfs_layout_is_valid(hdr);
+	if (valid) {
+		for (idx = 0; idx < data->args.num_errors; idx++)
+			nfs4_stateid_copy(&data->args.errors[idx].stateid,
+					  &hdr->plh_stateid);
+	}
+	spin_unlock(&inode->i_lock);
+
+	if (!valid) {
+		rpc_exit(task, 0);
+		return;
+	}
+
+	nfs4_setup_sequence(server->nfs_client, &data->args.seq_args,
+			    &data->res.seq_res, task);
+}
+
+/*
+ * rpc_call_done callback for LAYOUTERROR.
+ *
+ * Mirrors nfs42_layoutstat_done(), except that the layout header comes from
+ * the pinned segment and the stateid compared is that of the first error
+ * record.  Emits the nfs4_layouterror tracepoint for every non-zero status.
+ */
+static void
+nfs42_layouterror_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs42_layouterror_data *data = calldata;
+	struct inode *inode = data->inode;
+	struct pnfs_layout_hdr *hdr = data->lseg->pls_layout;
+
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return;
+
+	switch (task->tk_status) {
+	case 0:
+		return;
+	case -NFS4ERR_BADHANDLE:
+	case -ESTALE:
+		pnfs_destroy_layout(NFS_I(inode));
+		break;
+	case -NFS4ERR_EXPIRED:
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_DELEG_REVOKED:
+	case -NFS4ERR_STALE_STATEID:
+	case -NFS4ERR_BAD_STATEID: {
+		LIST_HEAD(stale_lsegs);
+
+		spin_lock(&inode->i_lock);
+		/* Only act if the stateid we sent is still the current one. */
+		if (!pnfs_layout_is_valid(hdr) ||
+		    !nfs4_stateid_match(&data->args.errors[0].stateid,
+					&hdr->plh_stateid)) {
+			spin_unlock(&inode->i_lock);
+			break;
+		}
+
+		/*
+		 * Mark the bad layout state as invalid, then retry
+		 * with the current stateid.
+		 */
+		pnfs_mark_layout_stateid_invalid(hdr, &stale_lsegs);
+		spin_unlock(&inode->i_lock);
+		pnfs_free_lseg_list(&stale_lsegs);
+		nfs_commit_inode(inode, 0);
+		break;
+	}
+	case -NFS4ERR_OLD_STATEID:
+		spin_lock(&inode->i_lock);
+		if (pnfs_layout_is_valid(hdr) &&
+		    nfs4_stateid_match_other(&data->args.errors[0].stateid,
+					     &hdr->plh_stateid)) {
+			/* Do we need to delay before resending? */
+			if (!nfs4_stateid_is_newer(&hdr->plh_stateid,
+					&data->args.errors[0].stateid))
+				rpc_delay(task, HZ);
+			rpc_restart_call_prepare(task);
+		}
+		spin_unlock(&inode->i_lock);
+		break;
+	case -ENOTSUPP:
+	case -EOPNOTSUPP:
+		NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTERROR;
+		break;
+	}
+
+	trace_nfs4_layouterror(inode, &data->args.errors[0].stateid,
+			       task->tk_status);
+}
+
+/* rpc_release callback for LAYOUTERROR: hand the call data to its destructor. */
+static void
+nfs42_layouterror_release(void *calldata)
+{
+	nfs42_free_layouterror_data(calldata);
+}
+
+static const struct rpc_call_ops nfs42_layouterror_ops = { /* callbacks of the async LAYOUTERROR task set up in nfs42_proc_layouterror() */
+ .rpc_call_prepare = nfs42_layouterror_prepare, /* stamps the error records with the layout stateid */
+ .rpc_call_done = nfs42_layouterror_done, /* reacts to task->tk_status */
+ .rpc_release = nfs42_layouterror_release, /* frees the calldata */
+};
+
+/*
+ * Report @n layout errors for @lseg to the server with an asynchronous
+ * LAYOUTERROR call.
+ *
+ * @lseg:   layout segment the errors belong to
+ * @errors: array of @n records; they are copied, the caller keeps ownership
+ * @n:      number of records, at most NFS42_LAYOUTERROR_MAX
+ *
+ * Returns 0 once the task has been started, -EOPNOTSUPP if the server lacks
+ * NFS_CAP_LAYOUTERROR, -EINVAL if @n is too large, -ENOMEM if the call data
+ * cannot be set up, or the error encoded in the pointer returned by
+ * rpc_run_task().
+ */
+int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
+		const struct nfs42_layout_error *errors, size_t n)
+{
+	struct inode *inode = lseg->pls_layout->plh_inode;
+	struct nfs42_layouterror_data *data;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTERROR],
+	};
+	struct rpc_task_setup setup = {
+		.rpc_message = &msg,
+		.callback_ops = &nfs42_layouterror_ops,
+		.flags = RPC_TASK_ASYNC,
+	};
+	struct rpc_task *task;
+	unsigned int idx;
+
+	if (!nfs_server_capable(inode, NFS_CAP_LAYOUTERROR))
+		return -EOPNOTSUPP;
+	if (n > NFS42_LAYOUTERROR_MAX)
+		return -EINVAL;
+
+	data = nfs42_alloc_layouterror_data(lseg, GFP_NOFS);
+	if (data == NULL)
+		return -ENOMEM;
+
+	/* Counters start at zero because the call data is kzalloc()ed. */
+	for (idx = 0; idx < n; idx++) {
+		data->args.errors[idx] = errors[idx];
+		data->args.num_errors++;
+		data->res.num_errors++;
+	}
+
+	msg.rpc_argp = &data->args;
+	msg.rpc_resp = &data->res;
+	setup.callback_data = data;
+	setup.rpc_client = NFS_SERVER(inode)->client;
+	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0);
+
+	task = rpc_run_task(&setup);
+	if (!IS_ERR(task)) {
+		rpc_put_task(task);
+		return 0;
+	}
+	return PTR_ERR(task);
+}
+EXPORT_SYMBOL_GPL(nfs42_proc_layouterror);
+
+/*
+ * Perform one CLONE round trip copying @count bytes from @src_offset in
+ * @src_f to @dst_offset in @dst_f.
+ *
+ * @msg has its argument and result pointers aimed at on-stack structures
+ * for the duration of the call.  On success the destination inode is
+ * updated from the attributes returned with the reply.
+ *
+ * Returns 0 or a negative error; -EAGAIN from stateid selection is turned
+ * into -NFS4ERR_BAD_STATEID.
+ */
+static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
+		struct file *dst_f, struct nfs_lock_context *src_lock,
+		struct nfs_lock_context *dst_lock, loff_t src_offset,
+		loff_t dst_offset, loff_t count)
+{
+	struct inode *src_inode = file_inode(src_f);
+	struct inode *dst_inode = file_inode(dst_f);
+	struct nfs_server *server = NFS_SERVER(dst_inode);
+	struct nfs42_clone_args clone_args = {
+		.src_fh = NFS_FH(src_inode),
+		.dst_fh = NFS_FH(dst_inode),
+		.src_offset = src_offset,
+		.dst_offset = dst_offset,
+		.count = count,
+		.dst_bitmask = server->cache_consistency_bitmask,
+	};
+	struct nfs42_clone_res clone_res = {
+		.server = server,
+	};
+	int rc;
+
+	msg->rpc_argp = &clone_args;
+	msg->rpc_resp = &clone_res;
+
+	/* Source is read, destination is written; stop at the first failure. */
+	rc = nfs4_set_rw_stateid(&clone_args.src_stateid,
+				 src_lock->open_context, src_lock, FMODE_READ);
+	if (rc == 0)
+		rc = nfs4_set_rw_stateid(&clone_args.dst_stateid,
+					 dst_lock->open_context, dst_lock,
+					 FMODE_WRITE);
+	if (rc == -EAGAIN)
+		return -NFS4ERR_BAD_STATEID;
+	if (rc)
+		return rc;
+
+	clone_res.dst_fattr = nfs_alloc_fattr();
+	if (clone_res.dst_fattr == NULL)
+		return -ENOMEM;
+
+	rc = nfs4_call_sync(server->client, server, msg,
+			    &clone_args.seq_args, &clone_res.seq_res, 0);
+	if (rc == 0)
+		rc = nfs_post_op_update_inode(dst_inode, clone_res.dst_fattr);
+
+	kfree(clone_res.dst_fattr);
+	return rc;
+}
+
+/*
+ * CLONE entry point: obtain lock contexts for both files and run
+ * _nfs42_proc_clone() until neither exception handler asks for a retry.
+ *
+ * Returns 0 or a negative error.  -ENOTSUPP/-EOPNOTSUPP from the call clears
+ * NFS_CAP_CLONE on the source file's server and is reported as -EOPNOTSUPP.
+ */
+int nfs42_proc_clone(struct file *src_f, struct file *dst_f,
+		     loff_t src_offset, loff_t dst_offset, loff_t count)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLONE],
+	};
+	struct inode *inode = file_inode(src_f);
+	struct nfs_server *server = NFS_SERVER(file_inode(src_f));
+	struct nfs4_exception src_exception = { };
+	struct nfs4_exception dst_exception = { };
+	struct nfs_lock_context *src_lock;
+	struct nfs_lock_context *dst_lock;
+	int err, src_err;
+
+	if (!nfs_server_capable(inode, NFS_CAP_CLONE))
+		return -EOPNOTSUPP;
+
+	src_lock = nfs_get_lock_context(nfs_file_open_context(src_f));
+	if (IS_ERR(src_lock))
+		return PTR_ERR(src_lock);
+
+	src_exception.inode = file_inode(src_f);
+	src_exception.state = src_lock->open_context->state;
+
+	dst_lock = nfs_get_lock_context(nfs_file_open_context(dst_f));
+	if (IS_ERR(dst_lock)) {
+		err = PTR_ERR(dst_lock);
+		goto out_put_src_lock;
+	}
+
+	dst_exception.inode = file_inode(dst_f);
+	dst_exception.state = dst_lock->open_context->state;
+
+	for (;;) {
+		err = _nfs42_proc_clone(&msg, src_f, dst_f, src_lock, dst_lock,
+					src_offset, dst_offset, count);
+		if (err == -ENOTSUPP || err == -EOPNOTSUPP) {
+			NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE;
+			err = -EOPNOTSUPP;
+			break;
+		}
+
+		/* Same raw error goes to both handlers; dst's verdict wins. */
+		src_err = nfs4_handle_exception(server, err, &src_exception);
+		err = nfs4_handle_exception(server, err, &dst_exception);
+		if (!err)
+			err = src_err;
+
+		if (!src_exception.retry && !dst_exception.retry)
+			break;
+	}
+
+	nfs_put_lock_context(dst_lock);
+out_put_src_lock:
+	nfs_put_lock_context(src_lock);
+	return err;
+}
+
+#define NFS4XATTR_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE) /* pages needed to hold an XATTR_SIZE_MAX-byte value; sizes the on-stack page arrays below */
+
+/*
+ * Perform one REMOVEXATTR round trip for attribute @name on @inode.
+ *
+ * Returns 0 or a negative error.  On success the inode's change attribute
+ * is updated from the reply, using the jiffies value sampled before the
+ * call.
+ */
+static int _nfs42_proc_removexattr(struct inode *inode, const char *name)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs42_removexattrargs rm_args = {
+		.fh = NFS_FH(inode),
+		.xattr_name = name,
+	};
+	struct nfs42_removexattrres rm_res;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVEXATTR],
+		.rpc_argp = &rm_args,
+		.rpc_resp = &rm_res,
+	};
+	unsigned long start = jiffies;
+	int rc;
+
+	rc = nfs4_call_sync(server->client, server, &msg, &rm_args.seq_args,
+			    &rm_res.seq_res, 1);
+	if (rc)
+		return rc;
+
+	nfs4_update_changeattr(inode, &rm_res.cinfo, start, 0);
+	return 0;
+}
+
+/*
+ * Perform one SETXATTR round trip, setting @name to the @buflen bytes at
+ * @buf on @inode.
+ *
+ * @flags is passed through to the server as xattr_flags.  Returns 0 or a
+ * negative error: -ERANGE if @buflen exceeds server->sxasize, or whatever
+ * nfs4_buf_to_pages_noslab() / the call itself fails with.  On success the
+ * inode's change attribute is updated from the reply.
+ */
+static int _nfs42_proc_setxattr(struct inode *inode, const char *name,
+				const void *buf, size_t buflen, int flags)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct page *pages[NFS4XATTR_MAXPAGES];
+	struct nfs42_setxattrargs arg = {
+		.fh = NFS_FH(inode),
+		.xattr_pages = pages,
+		.xattr_len = buflen,
+		.xattr_name = name,
+		.xattr_flags = flags,
+	};
+	struct nfs42_setxattrres res;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETXATTR],
+		.rpc_argp = &arg,
+		.rpc_resp = &res,
+	};
+	unsigned long start = jiffies;
+	int npages = 0;
+	int rc;
+
+	if (buflen > server->sxasize)
+		return -ERANGE;
+
+	/* A zero-length value needs no pages at all. */
+	if (buflen > 0) {
+		npages = nfs4_buf_to_pages_noslab(buf, buflen, arg.xattr_pages);
+		if (npages < 0)
+			return npages;
+	}
+
+	rc = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
+			    &res.seq_res, 1);
+
+	/* Drop the page references regardless of how the call went. */
+	while (npages-- > 0)
+		put_page(pages[npages]);
+
+	if (rc == 0)
+		nfs4_update_changeattr(inode, &res.cinfo, start, 0);
+
+	return rc;
+}
+
+/*
+ * Perform one GETXATTR round trip for attribute @name on @inode.
+ *
+ * @buf:    destination for the value; ignored when @buflen is 0
+ * @buflen: size of @buf, or 0 to only query the value's length
+ *
+ * Returns the length of the value on success, -ERANGE if @buflen is non-zero
+ * but smaller than the value, or the negative error from the call.
+ *
+ * The page array starts out zeroed and is handed to the RPC layer through
+ * arg.xattr_pages; this function never allocates pages itself, so any slot
+ * that is non-NULL afterwards was presumably filled in by the receive path
+ * (not visible here).  Every such page is released on every exit path -
+ * previously the -ERANGE and call-failure returns skipped the cleanup.
+ */
+static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name,
+				void *buf, size_t buflen)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct page *pages[NFS4XATTR_MAXPAGES] = {};
+	struct nfs42_getxattrargs arg = {
+		.fh = NFS_FH(inode),
+		.xattr_pages = pages,
+		.xattr_len = buflen,
+		.xattr_name = name,
+	};
+	struct nfs42_getxattrres res;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETXATTR],
+		.rpc_argp = &arg,
+		.rpc_resp = &res,
+	};
+	ssize_t ret;
+	unsigned int i;
+
+	ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
+	    &res.seq_res, 0);
+	if (ret < 0)
+		goto out_free;
+
+	/*
+	 * Normally, the caching is done one layer up, but for successful
+	 * RPCs, always cache the result here, even if the caller was
+	 * just querying the length, or if the reply was too big for
+	 * the caller. This avoids a second RPC in the case of the
+	 * common query-alloc-retrieve cycle for xattrs.
+	 *
+	 * Note that xattr_len is always capped to XATTR_SIZE_MAX.
+	 */
+	nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len);
+
+	if (buflen) {
+		if (res.xattr_len > buflen) {
+			ret = -ERANGE;
+			goto out_free;
+		}
+		_copy_from_pages(buf, pages, 0, res.xattr_len);
+	}
+
+	ret = res.xattr_len;
+
+out_free:
+	for (i = 0; i < ARRAY_SIZE(pages); i++) {
+		if (pages[i])
+			__free_page(pages[i]);
+	}
+
+	return ret;
+}
+
+/*
+ * Perform one LISTXATTRS round trip for @inode.
+ *
+ * @buf, @buflen: caller's buffer, passed to the decoder through the result
+ *                structure
+ * @cookiep:      in: cookie to continue from; out: cookie from the reply
+ * @eofp:         out: end-of-list indication from the reply
+ *
+ * The receive pages are sized from nfs42_listxattr_xdrsize(@buflen), capped
+ * at server->lxasize.  Returns res.copied on success, -ENOMEM if any
+ * allocation fails, or the negative error from the call.  @cookiep and
+ * @eofp are only written on success.
+ */
+static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf,
+				size_t buflen, u64 *cookiep, bool *eofp)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct page **pages;
+	struct nfs42_listxattrsargs arg = {
+		.fh = NFS_FH(inode),
+		.cookie = *cookiep,
+	};
+	struct nfs42_listxattrsres res = {
+		.eof = false,
+		.xattr_buf = buf,
+		.xattr_len = buflen,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LISTXATTRS],
+		.rpc_argp = &arg,
+		.rpc_resp = &res,
+	};
+	u32 xdrlen;
+	int ret = -ENOMEM;
+	int npages, idx;
+
+	res.scratch = alloc_page(GFP_KERNEL);
+	if (!res.scratch)
+		return ret;
+
+	xdrlen = nfs42_listxattr_xdrsize(buflen);
+	if (xdrlen > server->lxasize)
+		xdrlen = server->lxasize;
+	npages = xdrlen / PAGE_SIZE + 1;
+
+	pages = kcalloc(npages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		goto out_free_scratch;
+
+	for (idx = 0; idx < npages; idx++) {
+		pages[idx] = alloc_page(GFP_KERNEL);
+		if (!pages[idx])
+			goto out_free_pages;
+	}
+
+	arg.xattr_pages = pages;
+	arg.count = xdrlen;
+
+	ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
+			     &res.seq_res, 0);
+	if (ret >= 0) {
+		ret = res.copied;
+		*cookiep = res.cookie;
+		*eofp = res.eof;
+	}
+
+out_free_pages:
+	/* kcalloc() zeroed the array, so unfilled slots are simply skipped. */
+	for (idx = 0; idx < npages; idx++) {
+		if (pages[idx])
+			__free_page(pages[idx]);
+	}
+	kfree(pages);
+out_free_scratch:
+	__free_page(res.scratch);
+	return ret;
+}
+
+/*
+ * GETXATTR with retry: repeat _nfs42_proc_getxattr() for as long as
+ * nfs4_handle_exception() asks for it.  Returns the value length (>= 0) or
+ * a negative error.
+ */
+ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name,
+			    void *buf, size_t buflen)
+{
+	struct nfs4_exception exception = { };
+	ssize_t ret;
+
+	for (;;) {
+		ret = _nfs42_proc_getxattr(inode, name, buf, buflen);
+		if (ret >= 0)
+			break;
+		ret = nfs4_handle_exception(NFS_SERVER(inode), ret,
+					    &exception);
+		if (!exception.retry)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * SETXATTR with retry: repeat _nfs42_proc_setxattr() for as long as
+ * nfs4_handle_exception() asks for it.  Returns 0 or a negative error.
+ */
+int nfs42_proc_setxattr(struct inode *inode, const char *name,
+			const void *buf, size_t buflen, int flags)
+{
+	struct nfs4_exception exception = { };
+	int ret;
+
+	for (;;) {
+		ret = _nfs42_proc_setxattr(inode, name, buf, buflen, flags);
+		if (ret == 0)
+			break;
+		ret = nfs4_handle_exception(NFS_SERVER(inode), ret,
+					    &exception);
+		if (!exception.retry)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * LISTXATTRS with retry: repeat _nfs42_proc_listxattrs() for as long as
+ * nfs4_handle_exception() asks for it.  Returns the lower layer's
+ * non-negative result or a negative error.
+ */
+ssize_t nfs42_proc_listxattrs(struct inode *inode, void *buf,
+			      size_t buflen, u64 *cookiep, bool *eofp)
+{
+	struct nfs4_exception exception = { };
+	ssize_t ret;
+
+	for (;;) {
+		ret = _nfs42_proc_listxattrs(inode, buf, buflen,
+					     cookiep, eofp);
+		if (ret >= 0)
+			break;
+		ret = nfs4_handle_exception(NFS_SERVER(inode), ret,
+					    &exception);
+		if (!exception.retry)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * REMOVEXATTR with retry: repeat _nfs42_proc_removexattr() for as long as
+ * nfs4_handle_exception() asks for it.  Returns 0 or a negative error.
+ */
+int nfs42_proc_removexattr(struct inode *inode, const char *name)
+{
+	struct nfs4_exception exception = { };
+	int ret;
+
+	for (;;) {
+		ret = _nfs42_proc_removexattr(inode, name);
+		if (ret == 0)
+			break;
+		ret = nfs4_handle_exception(NFS_SERVER(inode), ret,
+					    &exception);
+		if (!exception.retry)
+			break;
+	}
+
+	return ret;
+}
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
new file mode 100644
index 000000000..6c2ce7991
--- /dev/null
+++ b/fs/nfs/nfs42xattr.c
@@ -0,0 +1,1057 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2019, 2020 Amazon.com, Inc. or its affiliates. All rights reserved.
+ *
+ * User extended attribute client side cache functions.
+ *
+ * Author: Frank van der Linden <fllinden@amazon.com>
+ */
+#include <linux/errno.h>
+#include <linux/nfs_fs.h>
+#include <linux/hashtable.h>
+#include <linux/refcount.h>
+#include <uapi/linux/xattr.h>
+
+#include "nfs4_fs.h"
+#include "internal.h"
+
+/*
+ * User extended attributes client side caching is implemented by having
+ * a cache structure attached to NFS inodes. This structure is allocated
+ * when needed, and freed when the cache is zapped.
+ *
+ * The cache structure contains a hash table of entries, and a pointer
+ * to a special-cased entry for the listxattr cache.
+ *
+ * Accessing and allocating / freeing the caches is done via reference
+ * counting. The cache entries use a similar refcounting scheme.
+ *
+ * This makes freeing a cache, both from the shrinker and from the
+ * zap cache path, easy. It also means that, in current use cases,
+ * the large majority of inodes will not waste any memory, as they
+ * will never have any user extended attributes assigned to them.
+ *
+ * Attribute entries are hashed in to a simple hash table. They are
+ * also part of an LRU.
+ *
+ * There are three shrinkers.
+ *
+ * Two shrinkers deal with the cache entries themselves: one for
+ * large entries (> PAGE_SIZE), and one for smaller entries. The
+ * shrinker for the larger entries works more aggressively than
+ * those for the smaller entries.
+ *
+ * The other shrinker frees the cache structures themselves.
+ */
+
+/*
+ * 64 buckets is a good default. There is likely no reasonable
+ * workload that uses more than even 64 user extended attributes.
+ * You can certainly add a lot more - but you get what you ask for
+ * in those circumstances.
+ */
+#define NFS4_XATTR_HASH_SIZE 64 /* must stay a power of two: nfs4_xattr_hash_bucket() masks the hash with (size - 1) */
+
+#define NFSDBG_FACILITY NFSDBG_XATTRCACHE
+
+struct nfs4_xattr_cache; /* forward declarations: bucket, cache and entry refer to each other */
+struct nfs4_xattr_entry;
+
+struct nfs4_xattr_bucket { /* one hash chain of an xattr cache */
+ spinlock_t lock; /* held around every walk or change of hlist */
+ struct hlist_head hlist; /* entries whose name hashes to this bucket */
+ struct nfs4_xattr_cache *cache; /* owning cache, set in nfs4_xattr_hash_init() */
+ bool draining; /* set when the cache is being discarded; nfs4_xattr_hash_add() then refuses new entries */
+};
+
+struct nfs4_xattr_cache { /* per-inode cache of user extended attributes */
+ struct kref ref; /* freed through nfs4_xattr_free_cache_cb() when it drops to zero */
+ struct nfs4_xattr_bucket buckets[NFS4_XATTR_HASH_SIZE]; /* name -> entry hash table */
+ struct list_head lru; /* linkage on nfs4_xattr_cache_lru */
+ struct list_head dispose; /* NOTE(review): not used in the code visible here - presumably a shrinker work list; confirm */
+ atomic_long_t nent; /* number of distinct names currently hashed */
+ spinlock_t listxattr_lock; /* protects the listxattr pointer */
+ struct inode *inode; /* owning inode; cleared by nfs4_xattr_cache_unlink() */
+ struct nfs4_xattr_entry *listxattr; /* cached listxattr result, NULL, or ERR_PTR(-ESTALE) once draining */
+};
+
+struct nfs4_xattr_entry { /* one cached attribute, or the cached listxattr result */
+ struct kref ref; /* freed through nfs4_xattr_free_entry_cb() when it drops to zero */
+ struct hlist_node hnode; /* linkage on bucket->hlist */
+ struct list_head lru; /* linkage on one of the two entry LRUs, chosen by flags */
+ struct list_head dispose; /* NOTE(review): not used in the code visible here - presumably a shrinker work list; confirm */
+ char *xattr_name; /* points just past this struct in the same allocation; NULL for the listxattr entry */
+ void *xattr_value; /* inline after the name, separately kvmalloc()ed (EXTVAL), or NULL when xattr_size is 0 */
+ size_t xattr_size; /* length of the value in bytes */
+ struct nfs4_xattr_bucket *bucket; /* set by nfs4_xattr_hash_add(); NULL until then */
+ uint32_t flags; /* NFS4_XATTR_ENTRY_* */
+};
+
+#define NFS4_XATTR_ENTRY_EXTVAL 0x0001 /* entry->xattr_value was allocated separately with kvmalloc() */
+
+/*
+ * LRU list of NFS inodes that have xattr caches.
+ */
+static struct list_lru nfs4_xattr_cache_lru;
+static struct list_lru nfs4_xattr_entry_lru; /* entries without NFS4_XATTR_ENTRY_EXTVAL */
+static struct list_lru nfs4_xattr_large_entry_lru; /* entries with NFS4_XATTR_ENTRY_EXTVAL */
+
+static struct kmem_cache *nfs4_xattr_cache_cachep; /* slab that struct nfs4_xattr_cache objects come from */
+
+/*
+ * Hashing helper functions.
+ */
+static void
+nfs4_xattr_hash_init(struct nfs4_xattr_cache *cache)
+{
+	struct nfs4_xattr_bucket *bucket;
+	unsigned int idx;
+
+	/* Put every bucket into its empty, non-draining state. */
+	for (idx = 0; idx < NFS4_XATTR_HASH_SIZE; idx++) {
+		bucket = &cache->buckets[idx];
+
+		INIT_HLIST_HEAD(&bucket->hlist);
+		spin_lock_init(&bucket->lock);
+		bucket->cache = cache;
+		bucket->draining = false;
+	}
+}
+
+/*
+ * Locking order:
+ * 1. inode i_lock or bucket lock
+ * 2. list_lru lock (taken by list_lru_* functions)
+ */
+
+/*
+ * Wrapper functions to add a cache entry to the right LRU.
+ */
+static bool
+nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry)
+{
+	/* Entries with an external value live on the "large" LRU. */
+	if (entry->flags & NFS4_XATTR_ENTRY_EXTVAL)
+		return list_lru_add(&nfs4_xattr_large_entry_lru, &entry->lru);
+
+	return list_lru_add(&nfs4_xattr_entry_lru, &entry->lru);
+}
+
+/* Counterpart of nfs4_xattr_entry_lru_add(): remove from the matching LRU. */
+static bool
+nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry)
+{
+	if (entry->flags & NFS4_XATTR_ENTRY_EXTVAL)
+		return list_lru_del(&nfs4_xattr_large_entry_lru, &entry->lru);
+
+	return list_lru_del(&nfs4_xattr_entry_lru, &entry->lru);
+}
+
+/*
+ * This function allocates cache entries. They are the normal
+ * extended attribute name/value pairs, but may also be a listxattr
+ * cache. Those allocations use the same entry so that they can be
+ * treated as one by the memory shrinker.
+ *
+ * xattr cache entries are allocated together with names. If the
+ * value fits in to one page with the entry structure and the name,
+ * it will also be part of the same allocation (kmalloc). This is
+ * expected to be the vast majority of cases. Larger allocations
+ * have a value pointer that is allocated separately by kvmalloc.
+ *
+ * Parameters:
+ *
+ * @name: Name of the extended attribute. NULL for listxattr cache
+ * entry.
+ * @value: Value of attribute, or listxattr cache. NULL if the
+ * value is to be copied from pages instead.
+ * @pages: Pages to copy the value from, if not NULL. Passed in to
+ * make it easier to copy the value after an RPC, even if
+ * the value will not be passed up to application (e.g.
+ * for a 'query' getxattr with NULL buffer).
+ * @len: Length of the value. Can be 0 for zero-length attributes.
+ * @value and @pages will be NULL if @len is 0.
+ */
+static struct nfs4_xattr_entry *
+nfs4_xattr_alloc_entry(const char *name, const void *value,
+		       struct page **pages, size_t len)
+{
+	struct nfs4_xattr_entry *entry;
+	size_t namelen = 0;
+	size_t total = sizeof(struct nfs4_xattr_entry);
+	uint32_t flags = 0;
+	char *base;
+	char *name_copy = NULL;
+	void *val_copy = NULL;
+
+	BUILD_BUG_ON(sizeof(struct nfs4_xattr_entry) +
+		     XATTR_NAME_MAX + 1 > PAGE_SIZE);
+
+	/* Layout of the main allocation: header, name (with NUL), value. */
+	if (name != NULL)
+		namelen = strlen(name) + 1;
+	total += namelen;
+
+	/* The value is inlined only if everything fits in one page. */
+	if (total + len > PAGE_SIZE)
+		flags = NFS4_XATTR_ENTRY_EXTVAL;
+	else
+		total += len;
+
+	base = kmalloc(total, GFP_KERNEL_ACCOUNT | GFP_NOFS);
+	if (base == NULL)
+		return NULL;
+	entry = (struct nfs4_xattr_entry *)base;
+
+	if (name != NULL) {
+		name_copy = base + sizeof(struct nfs4_xattr_entry);
+		memcpy(name_copy, name, namelen);
+	}
+
+	if (flags & NFS4_XATTR_ENTRY_EXTVAL) {
+		val_copy = kvmalloc(len, GFP_KERNEL_ACCOUNT | GFP_NOFS);
+		if (val_copy == NULL) {
+			kfree(base);
+			return NULL;
+		}
+	} else if (len != 0) {
+		val_copy = base + sizeof(struct nfs4_xattr_entry) + namelen;
+	}
+
+	/* A direct buffer takes precedence over the page array. */
+	if (val_copy != NULL) {
+		if (value != NULL)
+			memcpy(val_copy, value, len);
+		else
+			_copy_from_pages(val_copy, pages, 0, len);
+	}
+
+	kref_init(&entry->ref);
+	INIT_HLIST_NODE(&entry->hnode);
+	INIT_LIST_HEAD(&entry->lru);
+	INIT_LIST_HEAD(&entry->dispose);
+	entry->xattr_name = name_copy;
+	entry->xattr_value = val_copy;
+	entry->xattr_size = len;
+	entry->bucket = NULL;
+	entry->flags = flags;
+
+	return entry;
+}
+
+/*
+ * Free an entry.  Only a value carrying NFS4_XATTR_ENTRY_EXTVAL is a
+ * separate allocation; otherwise it is part of the entry's own buffer.
+ */
+static void
+nfs4_xattr_free_entry(struct nfs4_xattr_entry *entry)
+{
+	bool external = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) != 0;
+
+	if (external)
+		kvfree(entry->xattr_value);
+	kfree(entry);
+}
+
+/*
+ * kref release function for entries.  An entry that is still linked on an
+ * LRU triggers a warning and is deliberately not freed.
+ */
+static void
+nfs4_xattr_free_entry_cb(struct kref *kref)
+{
+	struct nfs4_xattr_entry *entry =
+		container_of(kref, struct nfs4_xattr_entry, ref);
+
+	if (WARN_ON(!list_empty(&entry->lru)))
+		return;
+
+	nfs4_xattr_free_entry(entry);
+}
+
+/*
+ * kref release function for caches.  Every bucket must already be empty; a
+ * non-empty one triggers a warning and the cache is not freed.  The draining
+ * flags and the listxattr pointer are reset before the object goes back to
+ * the slab (presumably so it matches what the slab constructor sets up -
+ * the constructor is not visible here).
+ */
+static void
+nfs4_xattr_free_cache_cb(struct kref *kref)
+{
+	struct nfs4_xattr_cache *cache =
+		container_of(kref, struct nfs4_xattr_cache, ref);
+	struct nfs4_xattr_bucket *bucket;
+	int idx;
+
+	for (idx = 0; idx < NFS4_XATTR_HASH_SIZE; idx++) {
+		bucket = &cache->buckets[idx];
+		if (WARN_ON(!hlist_empty(&bucket->hlist)))
+			return;
+		bucket->draining = false;
+	}
+
+	cache->listxattr = NULL;
+
+	kmem_cache_free(nfs4_xattr_cache_cachep, cache);
+}
+
+/*
+ * Allocate a cache with one reference held and an entry count of zero.
+ * Returns NULL on allocation failure.  Buckets and locks are not touched
+ * here (presumably initialised by the slab constructor - not visible here).
+ */
+static struct nfs4_xattr_cache *
+nfs4_xattr_alloc_cache(void)
+{
+	struct nfs4_xattr_cache *cache;
+
+	cache = kmem_cache_alloc(nfs4_xattr_cache_cachep,
+				 GFP_KERNEL_ACCOUNT | GFP_NOFS);
+	if (cache != NULL) {
+		kref_init(&cache->ref);
+		atomic_long_set(&cache->nent, 0);
+	}
+
+	return cache;
+}
+
+/*
+ * Set the listxattr cache, which is a special-cased cache entry.
+ * The special value ERR_PTR(-ESTALE) is used to indicate that
+ * the cache is being drained - this prevents a new listxattr
+ * cache from being added to what is now a stale cache.
+ */
+static int
+nfs4_xattr_set_listcache(struct nfs4_xattr_cache *cache,
+			 struct nfs4_xattr_entry *new)
+{
+	struct nfs4_xattr_entry *prev;
+
+	spin_lock(&cache->listxattr_lock);
+
+	prev = cache->listxattr;
+
+	/* Already marked stale: refuse, and leave the marker in place. */
+	if (prev == ERR_PTR(-ESTALE)) {
+		spin_unlock(&cache->listxattr_lock);
+		return 0;
+	}
+
+	cache->listxattr = new;
+
+	/* Only a real entry goes on an LRU, never NULL or the marker. */
+	if (new != NULL && new != ERR_PTR(-ESTALE))
+		nfs4_xattr_entry_lru_add(new);
+
+	if (prev != NULL) {
+		nfs4_xattr_entry_lru_del(prev);
+		kref_put(&prev->ref, nfs4_xattr_free_entry_cb);
+	}
+
+	spin_unlock(&cache->listxattr_lock);
+
+	return 1;
+}
+
+/*
+ * Unlink a cache from its parent inode, clearing out an invalid
+ * cache. Must be called with i_lock held.
+ */
+static struct nfs4_xattr_cache *
+nfs4_xattr_cache_unlink(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs4_xattr_cache *detached = nfsi->xattr_cache;
+
+	if (detached != NULL) {
+		list_lru_del(&nfs4_xattr_cache_lru, &detached->lru);
+		detached->inode = NULL;
+	}
+
+	/* The inode now has no cache, so there is nothing left to be invalid. */
+	nfsi->xattr_cache = NULL;
+	nfsi->cache_validity &= ~NFS_INO_INVALID_XATTR;
+
+	return detached;
+}
+
+/*
+ * Discard a cache. Called by get_cache() if there was an old,
+ * invalid cache. Can also be called from a shrinker callback.
+ *
+ * The cache is dead, it has already been unlinked from its inode,
+ * and no longer appears on the cache LRU list.
+ *
+ * Mark all buckets as draining, so that no new entries are added. This
+ * could still happen in the unlikely, but possible case that another
+ * thread had grabbed a reference before it was unlinked from the inode,
+ * and is still holding it for an add operation.
+ *
+ * Remove all entries from the LRU lists, so that there is no longer
+ * any way to 'find' this cache. Then, remove the entries from the hash
+ * table.
+ *
+ * At that point, the cache will remain empty and can be freed when the final
+ * reference drops, which is very likely the kref_put at the end of
+ * this function, or the one called immediately afterwards in the
+ * shrinker callback.
+ */
+static void
+nfs4_xattr_discard_cache(struct nfs4_xattr_cache *cache)
+{
+	struct nfs4_xattr_bucket *bucket;
+	struct nfs4_xattr_entry *entry;
+	struct hlist_node *next;
+	unsigned int idx;
+
+	/* Block any further listxattr caching on this cache. */
+	nfs4_xattr_set_listcache(cache, ERR_PTR(-ESTALE));
+
+	for (idx = 0; idx < NFS4_XATTR_HASH_SIZE; idx++) {
+		bucket = &cache->buckets[idx];
+
+		spin_lock(&bucket->lock);
+		/* From here on nfs4_xattr_hash_add() rejects this bucket. */
+		bucket->draining = true;
+		hlist_for_each_entry_safe(entry, next, &bucket->hlist, hnode) {
+			nfs4_xattr_entry_lru_del(entry);
+			hlist_del_init(&entry->hnode);
+			kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+		}
+		spin_unlock(&bucket->lock);
+	}
+
+	atomic_long_set(&cache->nent, 0);
+
+	kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+/*
+ * Get a referenced copy of the cache structure. Avoid doing allocs
+ * while holding i_lock. Which means that we do some optimistic allocation,
+ * and might have to free the result in rare cases.
+ *
+ * This function only checks the NFS_INO_INVALID_XATTR cache validity bit
+ * and acts accordingly, replacing the cache when needed. For the read case
+ * (!add), this means that the caller must make sure that the cache
+ * is valid before calling this function. getxattr and listxattr call
+ * revalidate_inode to do this. The attribute cache timeout (for the
+ * non-delegated case) is expected to be dealt with in the revalidate
+ * call.
+ */
+
+static struct nfs4_xattr_cache *
+nfs4_xattr_get_cache(struct inode *inode, int add)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs4_xattr_cache *cache = NULL;
+	struct nfs4_xattr_cache *stale = NULL;
+	struct nfs4_xattr_cache *winner;
+
+	spin_lock(&inode->i_lock);
+	if (nfsi->cache_validity & NFS_INO_INVALID_XATTR) {
+		/* Detach the invalid cache; it is discarded at the end. */
+		stale = nfs4_xattr_cache_unlink(inode);
+	} else {
+		cache = nfsi->xattr_cache;
+		if (cache != NULL)
+			kref_get(&cache->ref);
+	}
+	spin_unlock(&inode->i_lock);
+
+	/* Readers, and adders that found a cache, are done. */
+	if (!add || cache != NULL)
+		goto out;
+
+	/* Allocate outside i_lock, then re-check under it. */
+	cache = nfs4_xattr_alloc_cache();
+	if (cache == NULL)
+		goto out;
+
+	spin_lock(&inode->i_lock);
+	if (nfsi->cache_validity & NFS_INO_INVALID_XATTR) {
+		/*
+		 * The cache was invalidated again. Give up,
+		 * since what we want to enter is now likely
+		 * outdated anyway.
+		 */
+		spin_unlock(&inode->i_lock);
+		kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+		cache = NULL;
+		goto out;
+	}
+
+	/*
+	 * Check if someone beat us to it.
+	 */
+	winner = nfsi->xattr_cache;
+	if (winner != NULL) {
+		kref_get(&winner->ref);
+	} else {
+		/* Extra reference is the one held through the inode. */
+		kref_get(&cache->ref);
+		nfsi->xattr_cache = cache;
+		cache->inode = inode;
+		list_lru_add(&nfs4_xattr_cache_lru, &cache->lru);
+	}
+	spin_unlock(&inode->i_lock);
+
+	/*
+	 * If there was a race, throw away the cache we just
+	 * allocated, and use the new one allocated by someone
+	 * else.
+	 */
+	if (winner != NULL) {
+		kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+		cache = winner;
+	}
+
+out:
+	/*
+	 * Discard the now orphaned old cache.
+	 */
+	if (stale != NULL)
+		nfs4_xattr_discard_cache(stale);
+
+	return cache;
+}
+
+/*
+ * Map an attribute name to its bucket: jhash of the name (without the NUL),
+ * masked down to the size of the bucket array.
+ */
+static inline struct nfs4_xattr_bucket *
+nfs4_xattr_hash_bucket(struct nfs4_xattr_cache *cache, const char *name)
+{
+	unsigned int slot;
+
+	slot = jhash(name, strlen(name), 0) &
+	       (ARRAY_SIZE(cache->buckets) - 1);
+
+	return &cache->buckets[slot];
+}
+
+/*
+ * Find the entry called @name in @bucket.  Returns it without taking a
+ * reference, or NULL if there is none.  The callers in this file hold
+ * bucket->lock around the call.
+ */
+static struct nfs4_xattr_entry *
+nfs4_xattr_get_entry(struct nfs4_xattr_bucket *bucket, const char *name)
+{
+	struct nfs4_xattr_entry *cur;
+
+	hlist_for_each_entry(cur, &bucket->hlist, hnode) {
+		if (strcmp(cur->xattr_name, name) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Insert @entry into @cache, replacing any existing entry of the same name.
+ *
+ * Returns 1 if the entry was inserted, 0 if the target bucket is draining
+ * and the entry was left untouched apart from its bucket pointer.  The
+ * entry count only grows when no entry of that name existed before.
+ */
+static int
+nfs4_xattr_hash_add(struct nfs4_xattr_cache *cache,
+		    struct nfs4_xattr_entry *entry)
+{
+	struct nfs4_xattr_bucket *bucket;
+	struct nfs4_xattr_entry *replaced = NULL;
+	int added = 0;
+
+	bucket = nfs4_xattr_hash_bucket(cache, entry->xattr_name);
+	entry->bucket = bucket;
+
+	spin_lock(&bucket->lock);
+
+	if (!bucket->draining) {
+		replaced = nfs4_xattr_get_entry(bucket, entry->xattr_name);
+		if (replaced == NULL) {
+			atomic_long_inc(&cache->nent);
+		} else {
+			hlist_del_init(&replaced->hnode);
+			nfs4_xattr_entry_lru_del(replaced);
+		}
+
+		hlist_add_head(&entry->hnode, &bucket->hlist);
+		nfs4_xattr_entry_lru_add(entry);
+		added = 1;
+	}
+
+	spin_unlock(&bucket->lock);
+
+	/* Drop the hash table's reference on the old entry outside the lock. */
+	if (replaced != NULL)
+		kref_put(&replaced->ref, nfs4_xattr_free_entry_cb);
+
+	return added;
+}
+
+/*
+ * Remove the entry named @name from @cache, if there is one: unhash it,
+ * take it off its LRU, decrement the entry count and drop a reference.
+ */
+static void
+nfs4_xattr_hash_remove(struct nfs4_xattr_cache *cache, const char *name)
+{
+	struct nfs4_xattr_bucket *bucket = nfs4_xattr_hash_bucket(cache, name);
+	struct nfs4_xattr_entry *victim;
+
+	spin_lock(&bucket->lock);
+
+	victim = nfs4_xattr_get_entry(bucket, name);
+	if (victim == NULL) {
+		/* Nothing cached under this name. */
+		spin_unlock(&bucket->lock);
+		return;
+	}
+
+	hlist_del_init(&victim->hnode);
+	nfs4_xattr_entry_lru_del(victim);
+	atomic_long_dec(&cache->nent);
+
+	spin_unlock(&bucket->lock);
+
+	/* The reference is dropped outside the bucket lock. */
+	kref_put(&victim->ref, nfs4_xattr_free_entry_cb);
+}
+
+/*
+ * Find the entry named @name in @cache. On success the entry is returned
+ * with an extra reference that the caller must drop; NULL means no entry.
+ */
+static struct nfs4_xattr_entry *
+nfs4_xattr_hash_find(struct nfs4_xattr_cache *cache, const char *name)
+{
+	struct nfs4_xattr_bucket *bucket = nfs4_xattr_hash_bucket(cache, name);
+	struct nfs4_xattr_entry *found;
+
+	spin_lock(&bucket->lock);
+	found = nfs4_xattr_get_entry(bucket, name);
+	if (found)
+		kref_get(&found->ref);
+	spin_unlock(&bucket->lock);
+
+	return found;
+}
+
+/*
+ * Entry point to retrieve an entry from the cache.
+ *
+ * A @buflen of zero is a length probe. Returns the value length on a hit,
+ * -ERANGE if @buf is too small for the value, and -ENOENT if the inode has
+ * no cache or the cache has no entry for @name.
+ */
+ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name, char *buf,
+			 ssize_t buflen)
+{
+	struct nfs4_xattr_cache *cache;
+	struct nfs4_xattr_entry *entry;
+	ssize_t ret;
+
+	cache = nfs4_xattr_get_cache(inode, 0);
+	if (cache == NULL)
+		return -ENOENT;
+
+	entry = nfs4_xattr_hash_find(cache, name);
+	if (entry == NULL) {
+		dprintk("%s: cache miss '%s'\n", __func__, name);
+		ret = -ENOENT;
+		goto out_put_cache;
+	}
+
+	dprintk("%s: cache hit '%s', len %lu\n", __func__,
+		entry->xattr_name, (unsigned long)entry->xattr_size);
+
+	if (buflen != 0 && buflen < entry->xattr_size) {
+		ret = -ERANGE;
+	} else {
+		/* A zero buflen only probes for the length: skip the copy. */
+		if (buflen != 0)
+			memcpy(buf, entry->xattr_value, entry->xattr_size);
+		ret = entry->xattr_size;
+	}
+
+	/* Drop the reference taken by nfs4_xattr_hash_find(). */
+	kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+
+out_put_cache:
+	kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+
+	return ret;
+}
+
+/*
+ * Retrieve a cached list of xattrs from the cache.
+ *
+ * A @buflen of zero is a length probe. Returns the list length, -ERANGE
+ * if @buf is too small, or -ENOENT if there is no cache or no usable
+ * cached list (none stored, or the slot holds ERR_PTR(-ESTALE)).
+ */
+ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf, ssize_t buflen)
+{
+	struct nfs4_xattr_cache *cache;
+	struct nfs4_xattr_entry *list;
+	ssize_t ret = -ENOENT;
+
+	cache = nfs4_xattr_get_cache(inode, 0);
+	if (cache == NULL)
+		return -ENOENT;
+
+	spin_lock(&cache->listxattr_lock);
+
+	list = cache->listxattr;
+	if (list != NULL && list != ERR_PTR(-ESTALE)) {
+		if (buflen != 0 && list->xattr_size > buflen) {
+			ret = -ERANGE;
+		} else {
+			/* A zero buflen only probes for the length. */
+			if (buflen != 0)
+				memcpy(buf, list->xattr_value,
+				       list->xattr_size);
+			ret = list->xattr_size;
+		}
+	}
+
+	spin_unlock(&cache->listxattr_lock);
+
+	kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+
+	return ret;
+}
+
+/*
+ * Add an xattr to the cache.
+ *
+ * This also invalidates the xattr list cache. Failures (no cache, no
+ * memory for the entry, bucket refusing the insert) are silent: the value
+ * is simply not cached.
+ */
+void nfs4_xattr_cache_add(struct inode *inode, const char *name,
+			  const char *buf, struct page **pages, ssize_t buflen)
+{
+	struct nfs4_xattr_cache *cache;
+	struct nfs4_xattr_entry *entry;
+
+	dprintk("%s: add '%s' len %lu\n", __func__,
+		name, (unsigned long)buflen);
+
+	cache = nfs4_xattr_get_cache(inode, 1);
+	if (!cache)
+		return;
+
+	entry = nfs4_xattr_alloc_entry(name, buf, pages, buflen);
+	if (entry) {
+		(void)nfs4_xattr_set_listcache(cache, NULL);
+
+		/* The insert was refused: release our entry again. */
+		if (!nfs4_xattr_hash_add(cache, entry))
+			kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+	}
+
+	kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+
+/*
+ * Remove an xattr from the cache.
+ *
+ * This also invalidates the xattr list cache. Nothing happens if the
+ * inode currently has no cache.
+ */
+void nfs4_xattr_cache_remove(struct inode *inode, const char *name)
+{
+	struct nfs4_xattr_cache *cache;
+
+	dprintk("%s: remove '%s'\n", __func__, name);
+
+	cache = nfs4_xattr_get_cache(inode, 0);
+	if (!cache)
+		return;
+
+	(void)nfs4_xattr_set_listcache(cache, NULL);
+	nfs4_xattr_hash_remove(cache, name);
+
+	/* Drop the reference obtained from nfs4_xattr_get_cache(). */
+	kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+/*
+ * Cache listxattr output, replacing any possible old one.
+ *
+ * The list is stored as a nameless entry (allocated with a NULL name).
+ */
+void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf,
+			       ssize_t buflen)
+{
+	struct nfs4_xattr_cache *cache;
+	struct nfs4_xattr_entry *entry;
+
+	cache = nfs4_xattr_get_cache(inode, 1);
+	if (!cache)
+		return;
+
+	entry = nfs4_xattr_alloc_entry(NULL, buf, NULL, buflen);
+	if (entry) {
+		/*
+		 * This is just there to be able to get to bucket->cache,
+		 * which is obviously the same for all buckets, so just
+		 * use bucket 0.
+		 */
+		entry->bucket = &cache->buckets[0];
+
+		/* Not installed: release our entry again. */
+		if (!nfs4_xattr_set_listcache(cache, entry))
+			kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+	}
+
+	kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+}
+
+/*
+ * Zap the entire cache. Called when an inode is evicted.
+ *
+ * The cache is unlinked from the inode under i_lock and discarded after
+ * the lock has been released.
+ */
+void nfs4_xattr_cache_zap(struct inode *inode)
+{
+	struct nfs4_xattr_cache *detached;
+
+	spin_lock(&inode->i_lock);
+	detached = nfs4_xattr_cache_unlink(inode);
+	spin_unlock(&inode->i_lock);
+
+	if (detached != NULL)
+		nfs4_xattr_discard_cache(detached);
+}
+
+/*
+ * The large-entry LRU is shrunk more aggressively than the entry and
+ * cache LRUs, by setting @seeks to 1 for its shrinker.
+ *
+ * Cache structures are freed only when they've become empty, after
+ * pruning all but one entry.
+ */
+
+/* Forward declarations of the shrinker callbacks defined further down. */
+static unsigned long nfs4_xattr_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc);
+static unsigned long nfs4_xattr_entry_count(struct shrinker *shrink,
+ struct shrink_control *sc);
+static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc);
+static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink,
+ struct shrink_control *sc);
+
+/* Shrinker for whole per-inode cache structures. */
+static struct shrinker nfs4_xattr_cache_shrinker = {
+ .count_objects = nfs4_xattr_cache_count,
+ .scan_objects = nfs4_xattr_cache_scan,
+ .seeks = DEFAULT_SEEKS,
+ .flags = SHRINKER_MEMCG_AWARE,
+};
+
+/* Shrinker for the regular entry LRU. */
+static struct shrinker nfs4_xattr_entry_shrinker = {
+ .count_objects = nfs4_xattr_entry_count,
+ .scan_objects = nfs4_xattr_entry_scan,
+ .seeks = DEFAULT_SEEKS,
+ .batch = 512,
+ .flags = SHRINKER_MEMCG_AWARE,
+};
+
+/*
+ * Shrinker for the large-entry LRU. It shares its callbacks with the
+ * regular entry shrinker (the callbacks select the LRU by comparing the
+ * shrinker pointer) but uses .seeks = 1.
+ */
+static struct shrinker nfs4_xattr_large_entry_shrinker = {
+ .count_objects = nfs4_xattr_entry_count,
+ .scan_objects = nfs4_xattr_entry_scan,
+ .seeks = 1,
+ .batch = 512,
+ .flags = SHRINKER_MEMCG_AWARE,
+};
+
+/*
+ * list_lru walk callback for the cache LRU.
+ *
+ * Caches holding more than one entry are skipped. Otherwise the cache is
+ * unlinked from its inode, removed from the LRU and queued on the dispose
+ * list passed in @arg, with an extra reference held for the disposer.
+ */
+static enum lru_status
+cache_lru_isolate(struct list_head *item,
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+ struct list_head *dispose = arg;
+ struct inode *inode;
+ struct nfs4_xattr_cache *cache = container_of(item,
+ struct nfs4_xattr_cache, lru);
+
+ if (atomic_long_read(&cache->nent) > 1)
+ return LRU_SKIP;
+
+ /*
+ * If a cache structure is on the LRU list, we know that
+ * its inode is valid. Try to lock it to break the link.
+ * Since we're inverting the lock order here, only try.
+ */
+ inode = cache->inode;
+
+ if (!spin_trylock(&inode->i_lock))
+ return LRU_SKIP;
+
+ /* Reference dropped again by nfs4_xattr_cache_scan(). */
+ kref_get(&cache->ref);
+
+ cache->inode = NULL;
+ NFS_I(inode)->xattr_cache = NULL;
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_XATTR;
+ list_lru_isolate(lru, &cache->lru);
+
+ spin_unlock(&inode->i_lock);
+
+ list_add_tail(&cache->dispose, dispose);
+ return LRU_REMOVED;
+}
+
+/*
+ * Shrinker scan callback for cache structures: isolate candidates from
+ * the cache LRU, then discard each one and drop the reference that
+ * cache_lru_isolate() took. Returns the count reported by the LRU walk.
+ */
+static unsigned long
+nfs4_xattr_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	LIST_HEAD(dispose);
+	struct nfs4_xattr_cache *cache, *next;
+	unsigned long freed;
+
+	freed = list_lru_shrink_walk(&nfs4_xattr_cache_lru, sc,
+				     cache_lru_isolate, &dispose);
+
+	list_for_each_entry_safe(cache, next, &dispose, dispose) {
+		list_del_init(&cache->dispose);
+		nfs4_xattr_discard_cache(cache);
+		kref_put(&cache->ref, nfs4_xattr_free_cache_cb);
+	}
+
+	return freed;
+}
+
+
+/*
+ * Shrinker count callback for cache structures: the cache LRU length for
+ * this shrink_control, passed through vfs_pressure_ratio().
+ */
+static unsigned long
+nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	return vfs_pressure_ratio(list_lru_shrink_count(&nfs4_xattr_cache_lru,
+							sc));
+}
+
+/*
+ * list_lru walk callback for both entry LRUs.
+ *
+ * Detaches the entry from its parent, removes it from the LRU and queues
+ * it on the dispose list passed in @arg, with an extra reference held for
+ * the disposer. Returns LRU_SKIP if the parent's lock cannot be taken.
+ */
+static enum lru_status
+entry_lru_isolate(struct list_head *item,
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+ struct list_head *dispose = arg;
+ struct nfs4_xattr_bucket *bucket;
+ struct nfs4_xattr_cache *cache;
+ struct nfs4_xattr_entry *entry = container_of(item,
+ struct nfs4_xattr_entry, lru);
+
+ bucket = entry->bucket;
+ cache = bucket->cache;
+
+ /*
+ * Unhook the entry from its parent (either a cache bucket
+ * or a cache structure if it's a listxattr buf), so that
+ * it's no longer found. Then add it to the isolate list,
+ * to be freed later.
+ *
+ * In both cases, we're reverting lock order, so use
+ * trylock and skip the entry if we can't get the lock.
+ */
+ if (entry->xattr_name != NULL) {
+ /* Regular cache entry */
+ if (!spin_trylock(&bucket->lock))
+ return LRU_SKIP;
+
+ /* Reference dropped again by nfs4_xattr_entry_scan(). */
+ kref_get(&entry->ref);
+
+ hlist_del_init(&entry->hnode);
+ atomic_long_dec(&cache->nent);
+ list_lru_isolate(lru, &entry->lru);
+
+ spin_unlock(&bucket->lock);
+ } else {
+ /* Listxattr cache entry */
+ if (!spin_trylock(&cache->listxattr_lock))
+ return LRU_SKIP;
+
+ /* Reference dropped again by nfs4_xattr_entry_scan(). */
+ kref_get(&entry->ref);
+
+ cache->listxattr = NULL;
+ list_lru_isolate(lru, &entry->lru);
+
+ spin_unlock(&cache->listxattr_lock);
+ }
+
+ list_add_tail(&entry->dispose, dispose);
+ return LRU_REMOVED;
+}
+
+/*
+ * Shrinker scan callback shared by the regular and large entry shrinkers.
+ * The LRU to walk is chosen by which shrinker is calling. Returns the
+ * count reported by the LRU walk.
+ */
+static unsigned long
+nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	LIST_HEAD(dispose);
+	struct nfs4_xattr_entry *entry, *next;
+	struct list_lru *lru;
+	unsigned long freed;
+
+	if (shrink == &nfs4_xattr_large_entry_shrinker)
+		lru = &nfs4_xattr_large_entry_lru;
+	else
+		lru = &nfs4_xattr_entry_lru;
+
+	freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose);
+
+	list_for_each_entry_safe(entry, next, &dispose, dispose) {
+		list_del_init(&entry->dispose);
+
+		/*
+		 * Drop two references: the one that we just grabbed
+		 * in entry_lru_isolate, and the one that was set
+		 * when the entry was first allocated.
+		 */
+		kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+		kref_put(&entry->ref, nfs4_xattr_free_entry_cb);
+	}
+
+	return freed;
+}
+
+/*
+ * Shrinker count callback shared by the regular and large entry
+ * shrinkers: the length of the matching LRU for this shrink_control,
+ * passed through vfs_pressure_ratio().
+ */
+static unsigned long
+nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct list_lru *lru;
+
+	if (shrink == &nfs4_xattr_large_entry_shrinker)
+		lru = &nfs4_xattr_large_entry_lru;
+	else
+		lru = &nfs4_xattr_entry_lru;
+
+	return vfs_pressure_ratio(list_lru_shrink_count(lru, sc));
+}
+
+
+/*
+ * Slab constructor for struct nfs4_xattr_cache (passed to
+ * kmem_cache_create() in nfs4_xattr_cache_init()): sets up the lock, the
+ * entry counter, the hash buckets, the list cache slot and the list heads.
+ */
+static void nfs4_xattr_cache_init_once(void *p)
+{
+	struct nfs4_xattr_cache *cache = p;
+
+	spin_lock_init(&cache->listxattr_lock);
+	atomic_long_set(&cache->nent, 0);
+	nfs4_xattr_hash_init(cache);
+	cache->listxattr = NULL;
+	INIT_LIST_HEAD(&cache->lru);
+	INIT_LIST_HEAD(&cache->dispose);
+}
+
+/*
+ * Register @shrinker and then initialize the memcg-aware @lru tied to it.
+ *
+ * The order matters: list_lru_init_memcg() copies the shrinker's id into
+ * the lru, and for a SHRINKER_MEMCG_AWARE shrinker that id is only
+ * assigned by register_shrinker(). Initializing the lru first records a
+ * stale id, so memcg reclaim would flag the wrong shrinker.
+ *
+ * Returns 0 on success or a negative errno; on failure nothing is left
+ * registered or initialized.
+ */
+static int __init nfs4_xattr_shrinker_init(struct shrinker *shrinker,
+					   struct list_lru *lru)
+{
+	int ret;
+
+	ret = register_shrinker(shrinker);
+	if (ret)
+		return ret;
+
+	ret = list_lru_init_memcg(lru, shrinker);
+	if (ret)
+		unregister_shrinker(shrinker);
+
+	return ret;
+}
+
+/* Undo nfs4_xattr_shrinker_init() for one shrinker/lru pair. */
+static void nfs4_xattr_shrinker_destroy(struct shrinker *shrinker,
+					struct list_lru *lru)
+{
+	unregister_shrinker(shrinker);
+	list_lru_destroy(lru);
+}
+
+/*
+ * Set up the xattr cache: the slab cache for cache structures and the
+ * three shrinker/LRU pairs (caches, entries, large entries).
+ *
+ * Returns 0 on success or a negative errno, in which case everything
+ * that had been set up is torn down again.
+ */
+int __init nfs4_xattr_cache_init(void)
+{
+	int ret = 0;
+
+	nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache",
+	    sizeof(struct nfs4_xattr_cache), 0,
+	    (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+	    nfs4_xattr_cache_init_once);
+	if (nfs4_xattr_cache_cachep == NULL)
+		return -ENOMEM;
+
+	ret = nfs4_xattr_shrinker_init(&nfs4_xattr_cache_shrinker,
+				       &nfs4_xattr_cache_lru);
+	if (ret)
+		goto out1;
+
+	ret = nfs4_xattr_shrinker_init(&nfs4_xattr_entry_shrinker,
+				       &nfs4_xattr_entry_lru);
+	if (ret)
+		goto out2;
+
+	ret = nfs4_xattr_shrinker_init(&nfs4_xattr_large_entry_shrinker,
+				       &nfs4_xattr_large_entry_lru);
+	if (!ret)
+		return 0;
+
+	nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker,
+				    &nfs4_xattr_entry_lru);
+out2:
+	nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker,
+				    &nfs4_xattr_cache_lru);
+out1:
+	kmem_cache_destroy(nfs4_xattr_cache_cachep);
+
+	return ret;
+}
+
+/*
+ * Tear down the xattr cache: unregister the three shrinkers first, then
+ * destroy their LRUs, and finally destroy the slab cache.
+ */
+void nfs4_xattr_cache_exit(void)
+{
+ unregister_shrinker(&nfs4_xattr_large_entry_shrinker);
+ unregister_shrinker(&nfs4_xattr_entry_shrinker);
+ unregister_shrinker(&nfs4_xattr_cache_shrinker);
+ list_lru_destroy(&nfs4_xattr_large_entry_lru);
+ list_lru_destroy(&nfs4_xattr_entry_lru);
+ list_lru_destroy(&nfs4_xattr_cache_lru);
+ kmem_cache_destroy(nfs4_xattr_cache_cachep);
+}
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
new file mode 100644
index 000000000..f2248d9d4
--- /dev/null
+++ b/fs/nfs/nfs42xdr.c
@@ -0,0 +1,1592 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
+ */
+#ifndef __LINUX_FS_NFS_NFS4_2XDR_H
+#define __LINUX_FS_NFS_NFS4_2XDR_H
+
+#include "nfs42.h"
+
+/*
+ * Maximum encoded/decoded sizes of the individual NFSv4.2 operations.
+ * Byte lengths are converted with XDR_QUADLEN(), so the values are
+ * presumably counted in 4-byte XDR words.
+ */
+#define encode_fallocate_maxsz (encode_stateid_maxsz + \
+ 2 /* offset */ + \
+ 2 /* length */)
+#define NFS42_WRITE_RES_SIZE (1 /* wr_callback_id size */ +\
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ 2 /* wr_count */ + \
+ 1 /* wr_committed */ + \
+ XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define encode_allocate_maxsz (op_encode_hdr_maxsz + \
+ encode_fallocate_maxsz)
+#define decode_allocate_maxsz (op_decode_hdr_maxsz)
+#define encode_copy_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ 2 + 2 + 2 + 1 + 1 + 1 +\
+ 1 + /* One cnr_source_server */\
+ 1 + /* nl4_type */ \
+ 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT))
+#define decode_copy_maxsz (op_decode_hdr_maxsz + \
+ NFS42_WRITE_RES_SIZE + \
+ 1 /* cr_consecutive */ + \
+ 1 /* cr_synchronous */)
+#define encode_offload_cancel_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define decode_offload_cancel_maxsz (op_decode_hdr_maxsz)
+#define encode_copy_notify_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ 1 + /* nl4_type */ \
+ 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT))
+#define decode_copy_notify_maxsz (op_decode_hdr_maxsz + \
+ 3 + /* cnr_lease_time */\
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ 1 + /* Support 1 cnr_source_server */\
+ 1 + /* nl4_type */ \
+ 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT))
+#define encode_deallocate_maxsz (op_encode_hdr_maxsz + \
+ encode_fallocate_maxsz)
+#define decode_deallocate_maxsz (op_decode_hdr_maxsz)
+#define encode_read_plus_maxsz (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + 3)
+#define NFS42_READ_PLUS_SEGMENT_SIZE (1 /* data_content4 */ + \
+ 2 /* data_info4.di_offset */ + \
+ 2 /* data_info4.di_length */)
+#define decode_read_plus_maxsz (op_decode_hdr_maxsz + \
+ 1 /* rpr_eof */ + \
+ 1 /* rpr_contents count */ + \
+ 2 * NFS42_READ_PLUS_SEGMENT_SIZE)
+#define encode_seek_maxsz (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + \
+ 2 /* offset */ + \
+ 1 /* whence */)
+#define decode_seek_maxsz (op_decode_hdr_maxsz + \
+ 1 /* eof */ + \
+ 1 /* whence */ + \
+ 2 /* offset */ + \
+ 2 /* length */)
+#define encode_io_info_maxsz 4
+/*
+ * NOTE(review): this encode size starts from op_decode_hdr_maxsz, unlike
+ * the other encode_*_maxsz macros above which use op_encode_hdr_maxsz;
+ * confirm whether that is intentional.
+ */
+#define encode_layoutstats_maxsz (op_decode_hdr_maxsz + \
+ 2 /* offset */ + \
+ 2 /* length */ + \
+ encode_stateid_maxsz + \
+ encode_io_info_maxsz + \
+ encode_io_info_maxsz + \
+ 1 /* opaque devaddr4 length */ + \
+ XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
+#define decode_layoutstats_maxsz (op_decode_hdr_maxsz)
+#define encode_device_error_maxsz (XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
+ 1 /* status */ + 1 /* opnum */)
+/*
+ * NOTE(review): as with encode_layoutstats_maxsz, this encode size uses
+ * op_decode_hdr_maxsz; confirm whether that is intentional.
+ */
+#define encode_layouterror_maxsz (op_decode_hdr_maxsz + \
+ 2 /* offset */ + \
+ 2 /* length */ + \
+ encode_stateid_maxsz + \
+ 1 /* Array size */ + \
+ encode_device_error_maxsz)
+#define decode_layouterror_maxsz (op_decode_hdr_maxsz)
+/*
+ * NOTE(review): no op header term is included here, although
+ * encode_clone() does emit an op header; confirm whether
+ * op_encode_hdr_maxsz should be added.
+ */
+#define encode_clone_maxsz (encode_stateid_maxsz + \
+ encode_stateid_maxsz + \
+ 2 /* src offset */ + \
+ 2 /* dst offset */ + \
+ 2 /* count */)
+#define decode_clone_maxsz (op_decode_hdr_maxsz)
+
+/*
+ * Maximum sizes of whole COMPOUND requests (NFS4_enc_*) and replies
+ * (NFS4_dec_*): the compound header plus the per-operation sizes of
+ * every operation the compound may contain.
+ */
+#define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_allocate_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_allocate_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_copy_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_savefh_maxsz + \
+ encode_putfh_maxsz + \
+ encode_copy_maxsz + \
+ encode_commit_maxsz)
+#define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_savefh_maxsz + \
+ decode_putfh_maxsz + \
+ decode_copy_maxsz + \
+ decode_commit_maxsz)
+#define NFS4_enc_offload_cancel_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_offload_cancel_maxsz)
+#define NFS4_dec_offload_cancel_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_offload_cancel_maxsz)
+/*
+ * NOTE(review): the COPY_NOTIFY sizes have no sequence term, unlike the
+ * other compounds in this table; confirm against the encoder.
+ */
+#define NFS4_enc_copy_notify_sz (compound_encode_hdr_maxsz + \
+ encode_putfh_maxsz + \
+ encode_copy_notify_maxsz)
+#define NFS4_dec_copy_notify_sz (compound_decode_hdr_maxsz + \
+ decode_putfh_maxsz + \
+ decode_copy_notify_maxsz)
+#define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_deallocate_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_deallocate_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_deallocate_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_read_plus_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_read_plus_maxsz)
+#define NFS4_dec_read_plus_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_read_plus_maxsz)
+#define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_seek_maxsz)
+#define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_seek_maxsz)
+#define NFS4_enc_layoutstats_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ PNFS_LAYOUTSTATS_MAXDEV * encode_layoutstats_maxsz)
+#define NFS4_dec_layoutstats_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
+#define NFS4_enc_layouterror_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ NFS42_LAYOUTERROR_MAX * \
+ encode_layouterror_maxsz)
+#define NFS4_dec_layouterror_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ NFS42_LAYOUTERROR_MAX * \
+ decode_layouterror_maxsz)
+#define NFS4_enc_clone_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_savefh_maxsz + \
+ encode_putfh_maxsz + \
+ encode_clone_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_clone_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_savefh_maxsz + \
+ decode_putfh_maxsz + \
+ decode_clone_maxsz + \
+ decode_getattr_maxsz)
+
+/* Not limited by NFS itself, limited by the generic xattr code */
+#define nfs4_xattr_name_maxsz XDR_QUADLEN(XATTR_NAME_MAX)
+
+/*
+ * Per-operation sizes for the extended attribute operations. The xattr
+ * value and the list of names are not included in these sizes; only a
+ * few fixed words are counted next to the op header and the name.
+ */
+#define encode_getxattr_maxsz (op_encode_hdr_maxsz + 1 + \
+ nfs4_xattr_name_maxsz)
+#define decode_getxattr_maxsz (op_decode_hdr_maxsz + 1 + 1)
+#define encode_setxattr_maxsz (op_encode_hdr_maxsz + \
+ 1 + nfs4_xattr_name_maxsz + 1)
+#define decode_setxattr_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz)
+#define encode_listxattrs_maxsz (op_encode_hdr_maxsz + 2 + 1)
+#define decode_listxattrs_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1 + 1)
+#define encode_removexattr_maxsz (op_encode_hdr_maxsz + 1 + \
+ nfs4_xattr_name_maxsz)
+#define decode_removexattr_maxsz (op_decode_hdr_maxsz + \
+ decode_change_info_maxsz)
+
+/* Whole-compound sizes for the extended attribute requests and replies. */
+#define NFS4_enc_getxattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_getxattr_maxsz)
+#define NFS4_dec_getxattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_getxattr_maxsz)
+#define NFS4_enc_setxattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_setxattr_maxsz)
+#define NFS4_dec_setxattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_setxattr_maxsz)
+#define NFS4_enc_listxattrs_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_listxattrs_maxsz)
+#define NFS4_dec_listxattrs_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_listxattrs_maxsz)
+#define NFS4_enc_removexattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_removexattr_maxsz)
+#define NFS4_dec_removexattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_removexattr_maxsz)
+
+/*
+ * These values specify the maximum amount of data that is not
+ * associated with the extended attribute value or extended
+ * attribute list in the SETXATTR, GETXATTR and LISTXATTRS
+ * operations respectively.
+ *
+ * Each sum is multiplied by XDR_UNIT, so the constants are byte counts.
+ * The SETXATTR overhead is built from the request (encode) sizes and
+ * includes the attribute name; the other two are built from the reply
+ * (decode) sizes.
+ */
+const u32 nfs42_maxsetxattr_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_encode_hdr_maxsz +
+ encode_sequence_maxsz +
+ encode_putfh_maxsz + 1 +
+ nfs4_xattr_name_maxsz)
+ * XDR_UNIT);
+
+const u32 nfs42_maxgetxattr_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_decode_hdr_maxsz +
+ decode_sequence_maxsz +
+ decode_putfh_maxsz + 1) * XDR_UNIT);
+
+const u32 nfs42_maxlistxattrs_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_decode_hdr_maxsz +
+ decode_sequence_maxsz +
+ decode_putfh_maxsz + 3) * XDR_UNIT);
+
+/*
+ * Encode the argument body shared by ALLOCATE and DEALLOCATE: the
+ * stateid followed by the 64-bit offset and length, in that order.
+ */
+static void encode_fallocate(struct xdr_stream *xdr,
+ const struct nfs42_falloc_args *args)
+{
+ encode_nfs4_stateid(xdr, &args->falloc_stateid);
+ encode_uint64(xdr, args->falloc_offset);
+ encode_uint64(xdr, args->falloc_length);
+}
+
+/*
+ * Encode an ALLOCATE operation: the op header (decode_allocate_maxsz is
+ * handed to encode_op_hdr(), presumably as the expected reply size)
+ * followed by the shared fallocate arguments.
+ */
+static void encode_allocate(struct xdr_stream *xdr,
+ const struct nfs42_falloc_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_ALLOCATE, decode_allocate_maxsz, hdr);
+ encode_fallocate(xdr, args);
+}
+
+/*
+ * Encode a server location: the type word, then either a single string
+ * (NL4_NAME, NL4_URL) or the netid and address strings (NL4_NETADDR).
+ * Any other type triggers a one-time warning and encodes nothing more.
+ */
+static void encode_nl4_server(struct xdr_stream *xdr,
+			      const struct nl4_server *ns)
+{
+	encode_uint32(xdr, ns->nl4_type);
+
+	if (ns->nl4_type == NL4_NAME || ns->nl4_type == NL4_URL) {
+		encode_string(xdr, ns->u.nl4_str_sz, ns->u.nl4_str);
+	} else if (ns->nl4_type == NL4_NETADDR) {
+		encode_string(xdr, ns->u.nl4_addr.netid_len,
+			      ns->u.nl4_addr.netid);
+		encode_string(xdr, ns->u.nl4_addr.addr_len,
+			      ns->u.nl4_addr.addr);
+	} else {
+		WARN_ON_ONCE(1);
+	}
+}
+
+/*
+ * Encode a COPY operation: both stateids, source and destination
+ * positions, the byte count, the consecutive and synchronous flags, and
+ * finally the source server list (empty for an intra-server copy, one
+ * entry when args->cp_src is set).
+ */
+static void encode_copy(struct xdr_stream *xdr,
+			const struct nfs42_copy_args *args,
+			struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_COPY, decode_copy_maxsz, hdr);
+	encode_nfs4_stateid(xdr, &args->src_stateid);
+	encode_nfs4_stateid(xdr, &args->dst_stateid);
+
+	encode_uint64(xdr, args->src_pos);
+	encode_uint64(xdr, args->dst_pos);
+	encode_uint64(xdr, args->count);
+
+	encode_uint32(xdr, 1); /* consecutive = true */
+	encode_uint32(xdr, args->sync);
+
+	if (args->cp_src != NULL) {
+		encode_uint32(xdr, 1); /* supporting 1 server */
+		encode_nl4_server(xdr, args->cp_src);
+	} else {
+		/* intra-ssc */
+		encode_uint32(xdr, 0); /* no src server list */
+	}
+}
+
+/* Encode an OFFLOAD_CANCEL operation: op header plus the stateid. */
+static void encode_offload_cancel(struct xdr_stream *xdr,
+ const struct nfs42_offload_status_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_OFFLOAD_CANCEL, decode_offload_cancel_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->osa_stateid);
+}
+
+/*
+ * Encode a COPY_NOTIFY operation: op header, the source stateid, then
+ * the destination server location.
+ */
+static void encode_copy_notify(struct xdr_stream *xdr,
+ const struct nfs42_copy_notify_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_COPY_NOTIFY, decode_copy_notify_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->cna_src_stateid);
+ encode_nl4_server(xdr, &args->cna_dst);
+}
+
+/*
+ * Encode a DEALLOCATE operation: op header followed by the shared
+ * fallocate arguments.
+ */
+static void encode_deallocate(struct xdr_stream *xdr,
+ const struct nfs42_falloc_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_DEALLOCATE, decode_deallocate_maxsz, hdr);
+ encode_fallocate(xdr, args);
+}
+
+/*
+ * Encode a READ_PLUS operation: op header, stateid, 64-bit offset and
+ * 32-bit count.
+ */
+static void encode_read_plus(struct xdr_stream *xdr,
+ const struct nfs_pgio_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_READ_PLUS, decode_read_plus_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->stateid);
+ encode_uint64(xdr, args->offset);
+ encode_uint32(xdr, args->count);
+}
+
+/*
+ * Encode a SEEK operation: op header, stateid, 64-bit offset and the
+ * 32-bit sa_what selector.
+ */
+static void encode_seek(struct xdr_stream *xdr,
+ const struct nfs42_seek_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_SEEK, decode_seek_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->sa_stateid);
+ encode_uint64(xdr, args->sa_offset);
+ encode_uint32(xdr, args->sa_what);
+}
+
+/*
+ * Encode one LAYOUTSTATS operation for @devinfo: offset and length, the
+ * stateid from @args, the read/write counters, the device id and the
+ * layout type, followed by the layout-driver private data (or a zero
+ * word if the driver supplies no encoder).
+ */
+static void encode_layoutstats(struct xdr_stream *xdr,
+ const struct nfs42_layoutstat_args *args,
+ struct nfs42_layoutstat_devinfo *devinfo,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_LAYOUTSTATS, decode_layoutstats_maxsz, hdr);
+ p = reserve_space(xdr, 8 + 8);
+ p = xdr_encode_hyper(p, devinfo->offset);
+ p = xdr_encode_hyper(p, devinfo->length);
+ encode_nfs4_stateid(xdr, &args->stateid);
+ /* Four 64-bit counters, the device id and the layout type word. */
+ p = reserve_space(xdr, 4*8 + NFS4_DEVICEID4_SIZE + 4);
+ p = xdr_encode_hyper(p, devinfo->read_count);
+ p = xdr_encode_hyper(p, devinfo->read_bytes);
+ p = xdr_encode_hyper(p, devinfo->write_count);
+ p = xdr_encode_hyper(p, devinfo->write_bytes);
+ p = xdr_encode_opaque_fixed(p, devinfo->dev_id.data,
+ NFS4_DEVICEID4_SIZE);
+ /* Encode layoutupdate4 */
+ *p++ = cpu_to_be32(devinfo->layout_type);
+ if (devinfo->ld_private.ops)
+ devinfo->ld_private.ops->encode(xdr, args,
+ &devinfo->ld_private);
+ else
+ encode_uint32(xdr, 0);
+}
+
+/*
+ * Encode a CLONE operation: op header, source and destination stateids,
+ * then source offset, destination offset and count as three 64-bit
+ * values.
+ */
+static void encode_clone(struct xdr_stream *xdr,
+ const struct nfs42_clone_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_CLONE, decode_clone_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->src_stateid);
+ encode_nfs4_stateid(xdr, &args->dst_stateid);
+ p = reserve_space(xdr, 3*8);
+ p = xdr_encode_hyper(p, args->src_offset);
+ p = xdr_encode_hyper(p, args->dst_offset);
+ xdr_encode_hyper(p, args->count);
+}
+
+/*
+ * Encode one device error: the fixed-size device id followed by the
+ * 32-bit status and opnum.
+ */
+static void encode_device_error(struct xdr_stream *xdr,
+ const struct nfs42_device_error *error)
+{
+ __be32 *p;
+
+ p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 2*4);
+ p = xdr_encode_opaque_fixed(p, error->dev_id.data,
+ NFS4_DEVICEID4_SIZE);
+ *p++ = cpu_to_be32(error->status);
+ *p = cpu_to_be32(error->opnum);
+}
+
+/*
+ * Encode a LAYOUTERROR operation: op header, offset, length, stateid and
+ * an error array that always holds exactly one element, args->errors[0].
+ */
+static void encode_layouterror(struct xdr_stream *xdr,
+ const struct nfs42_layout_error *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_LAYOUTERROR, decode_layouterror_maxsz, hdr);
+ p = reserve_space(xdr, 8 + 8);
+ p = xdr_encode_hyper(p, args->offset);
+ p = xdr_encode_hyper(p, args->length);
+ encode_nfs4_stateid(xdr, &args->stateid);
+ p = reserve_space(xdr, 4);
+ /* Array size: a single device error follows. */
+ *p = cpu_to_be32(1);
+ encode_device_error(xdr, &args->errors[0]);
+}
+
+/*
+ * Encode a SETXATTR operation: op header, flags, attribute name, value
+ * length and, for a non-empty value, the value pages.
+ */
+static void encode_setxattr(struct xdr_stream *xdr,
+ const struct nfs42_setxattrargs *arg,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ /*
+ * xattr_flags is sent on the wire unchanged, so the XATTR_* flag
+ * values must equal their SETXATTR4_* counterparts.
+ */
+ BUILD_BUG_ON(XATTR_CREATE != SETXATTR4_CREATE);
+ BUILD_BUG_ON(XATTR_REPLACE != SETXATTR4_REPLACE);
+
+ encode_op_hdr(xdr, OP_SETXATTR, decode_setxattr_maxsz, hdr);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(arg->xattr_flags);
+ encode_string(xdr, strlen(arg->xattr_name), arg->xattr_name);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(arg->xattr_len);
+ if (arg->xattr_len)
+ xdr_write_pages(xdr, arg->xattr_pages, 0, arg->xattr_len);
+}
+
+/*
+ * Decode a SETXATTR reply: the op header, then the change info into
+ * @cinfo. Returns 0 or the error from whichever step failed.
+ */
+static int decode_setxattr(struct xdr_stream *xdr,
+			   struct nfs4_change_info *cinfo)
+{
+	int status = decode_op_hdr(xdr, OP_SETXATTR);
+
+	if (status)
+		return status;
+
+	return decode_change_info(xdr, cinfo);
+}
+
+
+/* Encode a GETXATTR operation: op header plus the attribute name. */
+static void encode_getxattr(struct xdr_stream *xdr, const char *name,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_GETXATTR, decode_getxattr_maxsz, hdr);
+ encode_string(xdr, strlen(name), name);
+}
+
+/*
+ * Decode a GETXATTR reply.
+ *
+ * Stores the value length in res->xattr_len and, for a non-empty value,
+ * positions the stream on the page data. Returns 0 on success, -EIO on a
+ * short or truncated reply, -ERANGE if the value is larger than the
+ * receive page buffer, or the error from the op header.
+ */
+static int decode_getxattr(struct xdr_stream *xdr,
+			   struct nfs42_getxattrres *res,
+			   struct rpc_rqst *req)
+{
+	__be32 *p;
+	u32 len;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_GETXATTR);
+	if (status)
+		return status;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		return -EIO;
+
+	len = be32_to_cpup(p);
+	if (len > req->rq_rcv_buf.page_len)
+		return -ERANGE;
+
+	res->xattr_len = len;
+
+	/* Fewer bytes available than announced means a truncated reply. */
+	if (len > 0 && xdr_read_pages(xdr, len) < len)
+		return -EIO;
+
+	return 0;
+}
+
+/* Encode a REMOVEXATTR operation: op header plus the attribute name. */
+static void encode_removexattr(struct xdr_stream *xdr, const char *name,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_REMOVEXATTR, decode_removexattr_maxsz, hdr);
+ encode_string(xdr, strlen(name), name);
+}
+
+
+/*
+ * Decode a REMOVEXATTR reply: the op header, then the change info into
+ * @cinfo. Returns 0 or the error from whichever step failed.
+ */
+static int decode_removexattr(struct xdr_stream *xdr,
+			      struct nfs4_change_info *cinfo)
+{
+	int status = decode_op_hdr(xdr, OP_REMOVEXATTR);
+
+	if (status)
+		return status;
+
+	return decode_change_info(xdr, cinfo);
+}
+
+/*
+ * Encode a LISTXATTRS operation: op header, the 64-bit cookie and the
+ * 32-bit maximum reply length.
+ */
+static void encode_listxattrs(struct xdr_stream *xdr,
+ const struct nfs42_listxattrsargs *arg,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_LISTXATTRS, decode_listxattrs_maxsz, hdr);
+
+ /* 8 bytes of cookie plus 4 bytes of maxcount. */
+ p = reserve_space(xdr, 12);
+ if (unlikely(!p))
+ return;
+
+ p = xdr_encode_hyper(p, arg->cookie);
+ /*
+ * RFC 8276 says to specify the full max length of the LISTXATTRS
+ * XDR reply. Count is set to the XDR length of the names array
+ * plus the EOF marker. So, add the cookie and the names count.
+ */
+ *p = cpu_to_be32(arg->count + 8 + 4);
+}
+
+/*
+ * Decode a LISTXATTRS reply.
+ *
+ * Stores the cookie in res->cookie and then walks the returned names.
+ * When res->xattr_buf is set, each name is copied into it with the
+ * XATTR_USER_PREFIX prepended and a terminating NUL appended; when it is
+ * NULL the names are only measured. On success res->copied holds the
+ * total length and res->eof the server's EOF flag.
+ *
+ * Returns 0 on success, -EIO if the stream runs short, -ERANGE if a name
+ * is too long or the buffer too small (turned into -E2BIG when the buffer
+ * was already XATTR_LIST_MAX bytes), or the error from the op header.
+ */
+static int decode_listxattrs(struct xdr_stream *xdr,
+ struct nfs42_listxattrsres *res)
+{
+ int status;
+ __be32 *p;
+ u32 count, len, ulen;
+ size_t left, copied;
+ char *buf;
+
+ status = decode_op_hdr(xdr, OP_LISTXATTRS);
+ if (status) {
+ /*
+ * Special case: for LISTXATTRS, NFS4ERR_TOOSMALL
+ * should be translated to ERANGE.
+ */
+ if (status == -ETOOSMALL)
+ status = -ERANGE;
+ goto out;
+ }
+
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ return -EIO;
+
+ xdr_decode_hyper(p, &res->cookie);
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ left = res->xattr_len;
+ buf = res->xattr_buf;
+
+ count = be32_to_cpup(p);
+ copied = 0;
+
+ /*
+ * We have asked for enough room to encode the maximum number
+ * of possible attribute names, so everything should fit.
+ *
+ * But, don't rely on that assumption. Just decode entries
+ * until they don't fit anymore, just in case the server did
+ * something odd.
+ */
+ while (count--) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ /* The name must still fit once the user prefix is added. */
+ len = be32_to_cpup(p);
+ if (len > (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN)) {
+ status = -ERANGE;
+ goto out;
+ }
+
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return -EIO;
+
+ /* Length in the output list: prefix + name + NUL. */
+ ulen = len + XATTR_USER_PREFIX_LEN + 1;
+ if (buf) {
+ if (ulen > left) {
+ status = -ERANGE;
+ goto out;
+ }
+
+ memcpy(buf, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+ memcpy(buf + XATTR_USER_PREFIX_LEN, p, len);
+
+ buf[ulen - 1] = 0;
+ buf += ulen;
+ left -= ulen;
+ }
+ copied += ulen;
+ }
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ res->eof = be32_to_cpup(p);
+ res->copied = copied;
+
+out:
+ /* A maximum-sized buffer that is still too small is reported as E2BIG. */
+ if (status == -ERANGE && res->xattr_len == XATTR_LIST_MAX)
+ status = -E2BIG;
+
+ return status;
+}
+
+/*
+ * Encode ALLOCATE request
+ *
+ * The compound is SEQUENCE, PUTFH, ALLOCATE, GETATTR.
+ */
+static void nfs4_xdr_enc_allocate(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_falloc_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->falloc_fh, &hdr);
+ encode_allocate(xdr, args, &hdr);
+ encode_getfattr(xdr, args->falloc_bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode the COMMIT operation that follows a synchronous COPY.
+ *
+ * After the op header, 12 bytes are reserved: the 64-bit destination
+ * offset (args->dst_pos) followed by the 32-bit byte count (args->count).
+ */
+static void encode_copy_commit(struct xdr_stream *xdr,
+ const struct nfs42_copy_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr);
+ /* 8 bytes offset + 4 bytes count */
+ p = reserve_space(xdr, 12);
+ p = xdr_encode_hyper(p, args->dst_pos);
+ *p = cpu_to_be32(args->count);
+}
+
+/*
+ * Encode COPY request
+ *
+ * @data is a const struct nfs42_copy_args. The compound is built in this
+ * order: compound header, sequence, putfh (args->src_fh), savefh,
+ * putfh (args->dst_fh), copy and, only when args->sync is set, a commit
+ * encoded by encode_copy_commit(). encode_nops() is called last.
+ */
+static void nfs4_xdr_enc_copy(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_copy_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->src_fh, &hdr);
+ encode_savefh(xdr, &hdr);
+ encode_putfh(xdr, args->dst_fh, &hdr);
+ encode_copy(xdr, args, &hdr);
+ if (args->sync)
+ encode_copy_commit(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode OFFLOAD_CANCEL request
+ *
+ * @data is a const struct nfs42_offload_status_args. The compound is built
+ * in this order: compound header, sequence, putfh (args->osa_src_fh),
+ * offload_cancel. encode_nops() is called last.
+ */
+static void nfs4_xdr_enc_offload_cancel(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_offload_status_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->osa_seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->osa_seq_args, &hdr);
+ encode_putfh(xdr, args->osa_src_fh, &hdr);
+ encode_offload_cancel(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode COPY_NOTIFY request
+ *
+ * @data is a const struct nfs42_copy_notify_args. The compound is built in
+ * this order: compound header, sequence, putfh (args->cna_src_fh),
+ * copy_notify. encode_nops() is called last.
+ */
+static void nfs4_xdr_enc_copy_notify(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_copy_notify_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->cna_seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->cna_seq_args, &hdr);
+ encode_putfh(xdr, args->cna_src_fh, &hdr);
+ encode_copy_notify(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode DEALLOCATE request
+ *
+ * @data is a const struct nfs42_falloc_args (the same argument type the
+ * ALLOCATE encoder takes). The compound is built in this order: compound
+ * header, sequence, putfh (args->falloc_fh), deallocate,
+ * getfattr (args->falloc_bitmask). encode_nops() is called last.
+ */
+static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_falloc_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->falloc_fh, &hdr);
+ encode_deallocate(xdr, args, &hdr);
+ encode_getfattr(xdr, args->falloc_bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode READ_PLUS request
+ *
+ * @data is a const struct nfs_pgio_args. The compound is built in this
+ * order: compound header, sequence, putfh (args->fh), read_plus.
+ * Before encode_nops() the reply pages are registered with
+ * rpc_prepare_reply_pages(): args->pages, starting at args->pgbase, for
+ * args->count bytes, placed after hdr.replen.
+ */
+static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs_pgio_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_read_plus(xdr, args, &hdr);
+
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+ args->count, hdr.replen);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode SEEK request
+ *
+ * @data is a const struct nfs42_seek_args. The compound is built in this
+ * order: compound header, sequence, putfh (args->sa_fh), seek.
+ * encode_nops() is called last.
+ */
+static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_seek_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->sa_fh, &hdr);
+ encode_seek(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode LAYOUTSTATS request
+ *
+ * @data is a const struct nfs42_layoutstat_args. After the compound
+ * header, sequence and putfh (args->fh), one layoutstats operation is
+ * encoded per entry of args->devinfo[], args->num_dev entries in total.
+ * encode_nops() is called last.
+ */
+static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_layoutstat_args *args = data;
+ int i;
+
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ /* Only warns: num_dev is not clamped, the loop still runs num_dev times */
+ WARN_ON(args->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
+ for (i = 0; i < args->num_dev; i++)
+ encode_layoutstats(xdr, args, &args->devinfo[i], &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode CLONE request
+ *
+ * @data is a const struct nfs42_clone_args. The compound is built in this
+ * order: compound header, sequence, putfh (args->src_fh), savefh,
+ * putfh (args->dst_fh), clone, getfattr (args->dst_bitmask).
+ * encode_nops() is called last.
+ */
+static void nfs4_xdr_enc_clone(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_clone_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->src_fh, &hdr);
+ encode_savefh(xdr, &hdr);
+ encode_putfh(xdr, args->dst_fh, &hdr);
+ encode_clone(xdr, args, &hdr);
+ encode_getfattr(xdr, args->dst_bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Encode LAYOUTERROR request
+ *
+ * @data is a const struct nfs42_layouterror_args. After the compound
+ * header, sequence and putfh (the file handle of args->inode, obtained
+ * with NFS_FH()), one layouterror operation is encoded per entry of
+ * args->errors[], args->num_errors entries in total. encode_nops() is
+ * called last.
+ */
+static void nfs4_xdr_enc_layouterror(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_layouterror_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+ int i;
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+ for (i = 0; i < args->num_errors; i++)
+ encode_layouterror(xdr, &args->errors[i], &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Decode the ALLOCATE result, which consists of the operation header only.
+ * @res is not touched. Returns the status of decode_op_hdr().
+ */
+static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
+{
+ return decode_op_hdr(xdr, OP_ALLOCATE);
+}
+
+/*
+ * Decode the write response carried in a COPY result.
+ *
+ * Consumed from the stream, in order: a 32-bit stateid count (only 0 or 1
+ * is accepted), the stateid itself when the count is 1, a 64-bit byte
+ * count (res->count), a 32-bit committed level (res->verifier.committed)
+ * and finally the write verifier (res->verifier.verifier).
+ *
+ * The stateid count is held in an unsigned variable. Stored in a signed
+ * int, a wire value of 0x80000000 or above would turn negative, pass the
+ * "> 1" check and be handled as if the server had sent no stateid at all.
+ *
+ * Returns 0 on success, -EREMOTEIO when more than one stateid is
+ * announced, -EIO when the stream cannot be decoded, or the status of
+ * decode_verifier().
+ */
+static int decode_write_response(struct xdr_stream *xdr,
+				 struct nfs42_write_res *res)
+{
+	__be32 *p;
+	uint32_t count;
+	int status;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		return -EIO;
+	count = be32_to_cpup(p);
+	if (count > 1)
+		return -EREMOTEIO;
+	else if (count == 1) {
+		status = decode_opaque_fixed(xdr, &res->stateid,
+				NFS4_STATEID_SIZE);
+		if (unlikely(status))
+			return -EIO;
+	}
+	/* 8 bytes byte count + 4 bytes committed level */
+	p = xdr_inline_decode(xdr, 8 + 4);
+	if (unlikely(!p))
+		return -EIO;
+	p = xdr_decode_hyper(p, &res->count);
+	res->verifier.committed = be32_to_cpup(p);
+	return decode_verifier(xdr, &res->verifier.verifier);
+}
+
+/*
+ * Decode one server location entry into @ns.
+ *
+ * The 32-bit type is read first and stored in ns->nl4_type:
+ *  - NL4_NAME / NL4_URL: one opaque string of at most NFS4_OPAQUE_LIMIT
+ *    bytes, copied to ns->u.nl4_str with its length in ns->u.nl4_str_sz.
+ *  - NL4_NETADDR: a netid of at most RPCBIND_MAXNETIDLEN bytes and a
+ *    universal address of at most RPCBIND_MAXUADDRLEN bytes, copied to
+ *    ns->u.nl4_addr together with their lengths.
+ * No NUL terminator is written after any of the copied strings here.
+ *
+ * Returns 0 on success, the status of decode_opaque_inline() if that
+ * fails, and -EIO for a missing type word, an oversized string or an
+ * unknown type (the latter also triggers WARN_ON_ONCE).
+ */
+static int decode_nl4_server(struct xdr_stream *xdr, struct nl4_server *ns)
+{
+ struct nfs42_netaddr *naddr;
+ uint32_t dummy;
+ char *dummy_str;
+ __be32 *p;
+ int status;
+
+ /* nl_type */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ ns->nl4_type = be32_to_cpup(p);
+ switch (ns->nl4_type) {
+ case NL4_NAME:
+ case NL4_URL:
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
+ /* length check comes before the copy into the fixed-size field */
+ if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
+ return -EIO;
+ memcpy(&ns->u.nl4_str, dummy_str, dummy);
+ ns->u.nl4_str_sz = dummy;
+ break;
+ case NL4_NETADDR:
+ naddr = &ns->u.nl4_addr;
+
+ /* netid string */
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
+ if (unlikely(dummy > RPCBIND_MAXNETIDLEN))
+ return -EIO;
+ naddr->netid_len = dummy;
+ memcpy(naddr->netid, dummy_str, naddr->netid_len);
+
+ /* uaddr string */
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
+ if (unlikely(dummy > RPCBIND_MAXUADDRLEN))
+ return -EIO;
+ naddr->addr_len = dummy;
+ memcpy(naddr->addr, dummy_str, naddr->addr_len);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EIO;
+ }
+ return 0;
+}
+
+/*
+ * Decode the two 32-bit copy requirement words into res->consecutive and
+ * res->synchronous, in that order.
+ *
+ * Returns 0 on success or -EIO when the 8 bytes are not available.
+ */
+static int decode_copy_requirements(struct xdr_stream *xdr,
+				    struct nfs42_copy_res *res)
+{
+	__be32 *words = xdr_inline_decode(xdr, 4 + 4);
+
+	if (unlikely(!words))
+		return -EIO;
+
+	res->consecutive = be32_to_cpup(&words[0]);
+	res->synchronous = be32_to_cpup(&words[1]);
+	return 0;
+}
+
+/*
+ * Decode the COPY result.
+ *
+ * On a zero operation status the write response is decoded into
+ * res->write_res, followed by the copy requirements. When the operation
+ * status equals NFS4ERR_OFFLOAD_NO_REQS only the copy requirements are
+ * decoded and NFS4ERR_OFFLOAD_NO_REQS is returned. Any other non-zero
+ * status is returned unchanged.
+ *
+ * NOTE(review): the comparison and the return value use the positive
+ * NFS4ERR_OFFLOAD_NO_REQS constant; confirm that decode_op_hdr() reports
+ * this error with that sign.
+ */
+static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res)
+{
+	int err = decode_op_hdr(xdr, OP_COPY);
+
+	if (err == NFS4ERR_OFFLOAD_NO_REQS) {
+		err = decode_copy_requirements(xdr, res);
+		if (!err)
+			err = NFS4ERR_OFFLOAD_NO_REQS;
+		return err;
+	}
+	if (err)
+		return err;
+
+	err = decode_write_response(xdr, &res->write_res);
+	if (!err)
+		err = decode_copy_requirements(xdr, res);
+	return err;
+}
+
+/*
+ * Decode the OFFLOAD_CANCEL result, which consists of the operation header
+ * only. @res is not touched. Returns the status of decode_op_hdr().
+ */
+static int decode_offload_cancel(struct xdr_stream *xdr,
+ struct nfs42_offload_status_res *res)
+{
+ return decode_op_hdr(xdr, OP_OFFLOAD_CANCEL);
+}
+
+/*
+ * Decode the COPY_NOTIFY result into @res.
+ *
+ * Consumed from the stream, in order: the lease time (64-bit seconds and
+ * 32-bit nseconds), the stateid, the number of source servers and the
+ * first source server entry (res->cnr_src). Further entries are not
+ * decoded; a warning is logged when the server sends more than one.
+ *
+ * The server count is unsigned so that values of 0x80000000 and above are
+ * still recognised as "more than one", and an empty list is rejected
+ * instead of decoding an entry the reply says is not there.
+ *
+ * Returns 0 on success, the status of decode_op_hdr() if the operation
+ * failed, and -EIO for any decoding problem.
+ */
+static int decode_copy_notify(struct xdr_stream *xdr,
+			      struct nfs42_copy_notify_res *res)
+{
+	__be32 *p;
+	uint32_t count;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_COPY_NOTIFY);
+	if (status)
+		return status;
+	/* cnr_lease_time */
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		return -EIO;
+	p = xdr_decode_hyper(p, &res->cnr_lease_time.seconds);
+	res->cnr_lease_time.nseconds = be32_to_cpup(p);
+
+	status = decode_opaque_fixed(xdr, &res->cnr_stateid, NFS4_STATEID_SIZE);
+	if (unlikely(status))
+		return -EIO;
+
+	/* number of source addresses */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		return -EIO;
+
+	count = be32_to_cpup(p);
+	/* No entry follows an empty list, so there is nothing to decode */
+	if (count == 0)
+		return -EIO;
+	if (count > 1)
+		pr_warn("NFS: %s: nsvr %u > Supported. Use first servers\n",
+			 __func__, count);
+
+	status = decode_nl4_server(xdr, &res->cnr_src);
+	if (unlikely(status))
+		return -EIO;
+	return 0;
+}
+
+/*
+ * Decode the DEALLOCATE result, which consists of the operation header
+ * only. @res is not touched. Returns the status of decode_op_hdr().
+ */
+static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
+{
+ return decode_op_hdr(xdr, OP_DEALLOCATE);
+}
+
+/*
+ * Decode one READ_PLUS data segment.
+ *
+ * The segment header is a 64-bit offset (decoded but not used) and a
+ * 32-bit length. xdr_align_data() is asked to place that many bytes at
+ * position res->count, and res->count grows by what it reports.
+ *
+ * Returns 0 when the whole segment was taken, 1 when the header is
+ * missing or fewer bytes than announced were taken.
+ */
+static int decode_read_plus_data(struct xdr_stream *xdr,
+				 struct nfs_pgio_res *res)
+{
+	uint32_t seg_len, taken;
+	uint64_t seg_offset;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 8 + 4);
+	if (!p)
+		return 1;
+
+	p = xdr_decode_hyper(p, &seg_offset);
+	seg_len = be32_to_cpup(p);
+
+	taken = xdr_align_data(xdr, res->count, seg_len);
+	res->count += taken;
+
+	return (seg_len > taken) ? 1 : 0;
+}
+
+/*
+ * Decode one READ_PLUS hole segment.
+ *
+ * The segment header is a 64-bit offset (decoded but not used) and a
+ * 64-bit length. xdr_expand_hole() is asked to open a hole of that length
+ * at position res->count, and res->count grows by what it reports.
+ * @eof is accepted but not used.
+ *
+ * Returns 0 when the whole hole was expanded, 1 when the header is
+ * missing or less than the announced length was expanded.
+ */
+static int decode_read_plus_hole(struct xdr_stream *xdr, struct nfs_pgio_res *res,
+				 uint32_t *eof)
+{
+	uint64_t hole_offset, hole_len, expanded;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 8 + 8);
+	if (!p)
+		return 1;
+
+	p = xdr_decode_hyper(p, &hole_offset);
+	p = xdr_decode_hyper(p, &hole_len);
+
+	expanded = xdr_expand_hole(xdr, res->count, hole_len);
+	res->count += expanded;
+
+	return (expanded < hole_len) ? 1 : 0;
+}
+
+/*
+ * Decode the READ_PLUS result: the eof flag, the segment count and then
+ * each segment, which is either data or a hole.
+ *
+ * Returns:
+ *  - the status of decode_op_hdr() if the operation failed;
+ *  - -EIO if the eof/segment-count words are missing, or if the very first
+ *    segment could not be decoded completely;
+ *  - -EINVAL for a segment type other than NFS4_CONTENT_DATA or
+ *    NFS4_CONTENT_HOLE;
+ *  - 0 otherwise. When every segment was decoded res->eof is the value
+ *    sent by the server; when decoding stopped early at a later segment
+ *    res->eof is cleared and res->count holds what was gathered so far.
+ */
+static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
+{
+ uint32_t eof, segments, type;
+ int status, i;
+ __be32 *p;
+
+ status = decode_op_hdr(xdr, OP_READ_PLUS);
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 4 + 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ eof = be32_to_cpup(p++);
+ segments = be32_to_cpup(p++);
+ if (segments == 0)
+ goto out;
+
+ for (i = 0; i < segments; i++) {
+ p = xdr_inline_decode(xdr, 4);
+ if (!p)
+ goto early_out;
+
+ type = be32_to_cpup(p++);
+ if (type == NFS4_CONTENT_DATA)
+ status = decode_read_plus_data(xdr, res);
+ else if (type == NFS4_CONTENT_HOLE)
+ status = decode_read_plus_hole(xdr, res, &eof);
+ else
+ return -EINVAL;
+
+ /* helpers return > 0 for a short segment, which ends decoding early */
+ if (status < 0)
+ return status;
+ if (status > 0)
+ goto early_out;
+ }
+
+out:
+ res->eof = eof;
+ return 0;
+early_out:
+ /* i is the index of the segment that stopped decoding */
+ if (unlikely(!i))
+ return -EIO;
+ res->eof = 0;
+ return 0;
+}
+
+/*
+ * Decode the SEEK result: a 32-bit eof flag into res->sr_eof followed by
+ * a 64-bit offset into res->sr_offset.
+ *
+ * Returns 0 on success, the status of decode_op_hdr() if the operation
+ * failed, or -EIO when the 12 result bytes are not available.
+ */
+static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
+{
+	__be32 *p;
+	int err = decode_op_hdr(xdr, OP_SEEK);
+
+	if (err)
+		return err;
+
+	p = xdr_inline_decode(xdr, 4 + 8);
+	if (unlikely(!p))
+		return -EIO;
+
+	res->sr_eof = be32_to_cpup(p);
+	xdr_decode_hyper(p + 1, &res->sr_offset);
+	return 0;
+}
+
+/*
+ * Decode one LAYOUTSTATS result, which consists of the operation header
+ * only. Returns the status of decode_op_hdr().
+ */
+static int decode_layoutstats(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_LAYOUTSTATS);
+}
+
+/*
+ * Decode the CLONE result, which consists of the operation header only.
+ * Returns the status of decode_op_hdr().
+ */
+static int decode_clone(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_CLONE);
+}
+
+/*
+ * Decode one LAYOUTERROR result, which consists of the operation header
+ * only. Returns the status of decode_op_hdr().
+ */
+static int decode_layouterror(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_LAYOUTERROR);
+}
+
+/*
+ * Decode ALLOCATE response
+ *
+ * @data is a struct nfs42_falloc_res. The reply is walked in order:
+ * compound header, sequence, putfh, allocate. The first non-zero status
+ * is returned unchanged. After a successful allocate the getfattr result
+ * is decoded as well, but its status is not reported to the caller.
+ */
+static int nfs4_xdr_dec_allocate(struct rpc_rqst *rqstp,
+				 struct xdr_stream *xdr,
+				 void *data)
+{
+	struct nfs42_falloc_res *res = data;
+	struct compound_hdr hdr;
+	int err;
+
+	err = decode_compound_hdr(xdr, &hdr);
+	if (err)
+		return err;
+
+	err = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (err)
+		return err;
+
+	err = decode_putfh(xdr);
+	if (err)
+		return err;
+
+	err = decode_allocate(xdr, res);
+	if (err)
+		return err;
+
+	decode_getfattr(xdr, res->falloc_fattr, res->falloc_server);
+	return 0;
+}
+
+/*
+ * Decode COPY response
+ *
+ * @data is a struct nfs42_copy_res. The reply is walked in order:
+ * compound header, sequence, putfh, savefh, putfh, copy. The first
+ * non-zero status is returned unchanged. A commit result is decoded
+ * afterwards only when res->commit_res.verf is set.
+ */
+static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp,
+			     struct xdr_stream *xdr,
+			     void *data)
+{
+	struct nfs42_copy_res *res = data;
+	struct compound_hdr hdr;
+	int err;
+
+	err = decode_compound_hdr(xdr, &hdr);
+	if (err)
+		return err;
+
+	err = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (err)
+		return err;
+
+	/* source file handle, saved, then destination file handle */
+	err = decode_putfh(xdr);
+	if (err)
+		return err;
+
+	err = decode_savefh(xdr);
+	if (err)
+		return err;
+
+	err = decode_putfh(xdr);
+	if (err)
+		return err;
+
+	err = decode_copy(xdr, res);
+	if (err)
+		return err;
+
+	if (!res->commit_res.verf)
+		return 0;
+	return decode_commit(xdr, &res->commit_res);
+}
+
+/*
+ * Decode OFFLOAD_CANCEL response
+ *
+ * @data is a struct nfs42_offload_status_res. The reply is walked in
+ * order: compound header, sequence, putfh, offload_cancel. The first
+ * non-zero status is returned unchanged.
+ */
+static int nfs4_xdr_dec_offload_cancel(struct rpc_rqst *rqstp,
+				       struct xdr_stream *xdr,
+				       void *data)
+{
+	struct nfs42_offload_status_res *res = data;
+	struct compound_hdr hdr;
+	int err;
+
+	err = decode_compound_hdr(xdr, &hdr);
+	if (err)
+		return err;
+
+	err = decode_sequence(xdr, &res->osr_seq_res, rqstp);
+	if (err)
+		return err;
+
+	err = decode_putfh(xdr);
+	if (err)
+		return err;
+
+	return decode_offload_cancel(xdr, res);
+}
+
+/*
+ * Decode COPY_NOTIFY response
+ *
+ * @data is a struct nfs42_copy_notify_res. The reply is walked in order:
+ * compound header, sequence, putfh, copy_notify. The first non-zero
+ * status is returned unchanged.
+ */
+static int nfs4_xdr_dec_copy_notify(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    void *data)
+{
+	struct nfs42_copy_notify_res *res = data;
+	struct compound_hdr hdr;
+	int err;
+
+	err = decode_compound_hdr(xdr, &hdr);
+	if (err)
+		return err;
+
+	err = decode_sequence(xdr, &res->cnr_seq_res, rqstp);
+	if (err)
+		return err;
+
+	err = decode_putfh(xdr);
+	if (err)
+		return err;
+
+	return decode_copy_notify(xdr, res);
+}
+
+/*
+ * Decode DEALLOCATE response
+ *
+ * @data is a struct nfs42_falloc_res. The reply is walked in order:
+ * compound header, sequence, putfh, deallocate. The first non-zero status
+ * is returned unchanged. After a successful deallocate the getfattr
+ * result is decoded as well, but its status is not reported to the caller.
+ */
+static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp,
+				   struct xdr_stream *xdr,
+				   void *data)
+{
+	struct nfs42_falloc_res *res = data;
+	struct compound_hdr hdr;
+	int err;
+
+	err = decode_compound_hdr(xdr, &hdr);
+	if (err)
+		return err;
+
+	err = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (err)
+		return err;
+
+	err = decode_putfh(xdr);
+	if (err)
+		return err;
+
+	err = decode_deallocate(xdr, res);
+	if (err)
+		return err;
+
+	decode_getfattr(xdr, res->falloc_fattr, res->falloc_server);
+	return 0;
+}
+
+/*
+ * Decode READ_PLUS response
+ *
+ * @data is a struct nfs_pgio_res. The reply is walked in order: compound
+ * header, sequence, putfh, read_plus. The first non-zero status is
+ * returned unchanged; when everything decodes, the return value is
+ * res->count rather than 0.
+ */
+static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp,
+				  struct xdr_stream *xdr,
+				  void *data)
+{
+	struct nfs_pgio_res *res = data;
+	struct compound_hdr hdr;
+	int err;
+
+	err = decode_compound_hdr(xdr, &hdr);
+	if (err)
+		return err;
+
+	err = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (err)
+		return err;
+
+	err = decode_putfh(xdr);
+	if (err)
+		return err;
+
+	err = decode_read_plus(xdr, res);
+	if (err)
+		return err;
+
+	return res->count;
+}
+
+/*
+ * Decode SEEK response
+ *
+ * @data is a struct nfs42_seek_res. The reply is walked in order:
+ * compound header, sequence, putfh, seek. The first non-zero status is
+ * returned unchanged.
+ */
+static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp,
+			     struct xdr_stream *xdr,
+			     void *data)
+{
+	struct nfs42_seek_res *res = data;
+	struct compound_hdr hdr;
+	int err;
+
+	err = decode_compound_hdr(xdr, &hdr);
+	if (err)
+		return err;
+
+	err = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (err)
+		return err;
+
+	err = decode_putfh(xdr);
+	if (err)
+		return err;
+
+	return decode_seek(xdr, res);
+}
+
+/*
+ * Decode LAYOUTSTATS response
+ *
+ * @data is a struct nfs42_layoutstat_res. After the compound header,
+ * sequence and putfh, res->num_dev layoutstats results are decoded,
+ * stopping at the first failure. The final status is returned and also
+ * stored in res->rpc_status, on success and on failure alike.
+ */
+static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    void *data)
+{
+	struct nfs42_layoutstat_res *res = data;
+	struct compound_hdr hdr;
+	int err, dev;
+
+	err = decode_compound_hdr(xdr, &hdr);
+	if (!err)
+		err = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!err)
+		err = decode_putfh(xdr);
+	if (!err) {
+		/* Only warns: the loop below still runs num_dev times */
+		WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
+		for (dev = 0; dev < res->num_dev; dev++) {
+			err = decode_layoutstats(xdr);
+			if (err)
+				break;
+		}
+	}
+
+	res->rpc_status = err;
+	return err;
+}
+
+/*
+ * Decode CLONE response
+ *
+ * @data is a struct nfs42_clone_res. The reply is walked in order:
+ * compound header, sequence, putfh, savefh, putfh, clone. After a
+ * successful clone the getfattr result is decoded as well, but its status
+ * is not reported. The final status is returned and also stored in
+ * res->rpc_status, on success and on failure alike.
+ */
+static int nfs4_xdr_dec_clone(struct rpc_rqst *rqstp,
+			      struct xdr_stream *xdr,
+			      void *data)
+{
+	struct nfs42_clone_res *res = data;
+	struct compound_hdr hdr;
+	int err;
+
+	err = decode_compound_hdr(xdr, &hdr);
+	if (!err)
+		err = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!err)
+		err = decode_putfh(xdr);
+	if (!err)
+		err = decode_savefh(xdr);
+	if (!err)
+		err = decode_putfh(xdr);
+	if (!err)
+		err = decode_clone(xdr);
+	if (!err)
+		decode_getfattr(xdr, res->dst_fattr, res->server);
+
+	res->rpc_status = err;
+	return err;
+}
+
+/*
+ * Decode LAYOUTERROR response
+ *
+ * @data is a struct nfs42_layouterror_res. After the compound header,
+ * sequence and putfh, up to res->num_errors layouterror results are
+ * decoded, stopping at the first failure. The final status is returned
+ * and also stored in res->rpc_status, on success and on failure alike.
+ */
+static int nfs4_xdr_dec_layouterror(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    void *data)
+{
+	struct nfs42_layouterror_res *res = data;
+	struct compound_hdr hdr;
+	int err, n;
+
+	err = decode_compound_hdr(xdr, &hdr);
+	if (!err)
+		err = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!err)
+		err = decode_putfh(xdr);
+
+	for (n = 0; !err && n < res->num_errors; n++)
+		err = decode_layouterror(xdr);
+
+	res->rpc_status = err;
+	return err;
+}
+
+#ifdef CONFIG_NFS_V4_2
+/*
+ * Encode SETXATTR request
+ *
+ * @data is a const struct nfs42_setxattrargs. The compound is built in
+ * this order: compound header, sequence, putfh (args->fh), setxattr.
+ * encode_nops() is called last.
+ */
+static void nfs4_xdr_enc_setxattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_setxattrargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_setxattr(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Decode SETXATTR response
+ *
+ * @data is a struct nfs42_setxattrres. The reply is walked in order:
+ * compound header, sequence, putfh, setxattr (which fills res->cinfo).
+ * The first non-zero status is returned unchanged.
+ */
+static int nfs4_xdr_dec_setxattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+				 void *data)
+{
+	struct nfs42_setxattrres *res = data;
+	struct compound_hdr hdr;
+	int err;
+
+	err = decode_compound_hdr(xdr, &hdr);
+	if (err)
+		return err;
+
+	err = decode_sequence(xdr, &res->seq_res, req);
+	if (err)
+		return err;
+
+	err = decode_putfh(xdr);
+	if (err)
+		return err;
+
+	return decode_setxattr(xdr, &res->cinfo);
+}
+
+/*
+ * Encode GETXATTR request
+ *
+ * @data is a const struct nfs42_getxattrargs. The compound is built in
+ * this order: compound header, sequence, putfh (args->fh), getxattr
+ * (args->xattr_name). The reply pages args->xattr_pages are then
+ * registered for args->xattr_len bytes, or XATTR_SIZE_MAX bytes when
+ * args->xattr_len is 0, and XDRBUF_SPARSE_PAGES is set on the receive
+ * buffer. encode_nops() is called last.
+ */
+static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_getxattrargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+ size_t plen;
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_getxattr(xdr, args->xattr_name, &hdr);
+
+ /* a zero xattr_len falls back to the maximum xattr value size */
+ plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX;
+
+ rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen,
+ hdr.replen);
+ req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
+
+ encode_nops(&hdr);
+}
+
+/*
+ * Decode GETXATTR response
+ *
+ * @data is a struct nfs42_getxattrres. The reply is walked in order:
+ * compound header, sequence, putfh, getxattr. The first non-zero status
+ * is returned unchanged.
+ */
+static int nfs4_xdr_dec_getxattr(struct rpc_rqst *rqstp,
+				 struct xdr_stream *xdr, void *data)
+{
+	struct nfs42_getxattrres *res = data;
+	struct compound_hdr hdr;
+	int err;
+
+	err = decode_compound_hdr(xdr, &hdr);
+	if (err)
+		return err;
+
+	err = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (err)
+		return err;
+
+	err = decode_putfh(xdr);
+	if (err)
+		return err;
+
+	return decode_getxattr(xdr, res, rqstp);
+}
+
+/*
+ * Encode LISTXATTRS request
+ *
+ * @data is a const struct nfs42_listxattrsargs. The compound is built in
+ * this order: compound header, sequence, putfh (args->fh), listxattrs.
+ * The reply pages args->xattr_pages are then registered for args->count
+ * bytes. encode_nops() is called last.
+ */
+static void nfs4_xdr_enc_listxattrs(struct rpc_rqst *req,
+ struct xdr_stream *xdr, const void *data)
+{
+ const struct nfs42_listxattrsargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_listxattrs(xdr, args, &hdr);
+
+ rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->count,
+ hdr.replen);
+
+ encode_nops(&hdr);
+}
+
+/*
+ * Decode LISTXATTRS response
+ *
+ * @data is a struct nfs42_listxattrsres. Before anything is decoded the
+ * page res->scratch (PAGE_SIZE bytes) is installed as the stream's
+ * scratch buffer. The reply is then walked in order: compound header,
+ * sequence, putfh, listxattrs. The first non-zero status is returned
+ * unchanged.
+ */
+static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp,
+				   struct xdr_stream *xdr, void *data)
+{
+	struct nfs42_listxattrsres *res = data;
+	struct compound_hdr hdr;
+	int err;
+
+	xdr_set_scratch_buffer(xdr, page_address(res->scratch), PAGE_SIZE);
+
+	err = decode_compound_hdr(xdr, &hdr);
+	if (err)
+		return err;
+
+	err = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (err)
+		return err;
+
+	err = decode_putfh(xdr);
+	if (err)
+		return err;
+
+	return decode_listxattrs(xdr, res);
+}
+
+/*
+ * Encode REMOVEXATTR request
+ *
+ * @data is a const struct nfs42_removexattrargs. The compound is built in
+ * this order: compound header, sequence, putfh (args->fh), removexattr
+ * (args->xattr_name). encode_nops() is called last.
+ */
+static void nfs4_xdr_enc_removexattr(struct rpc_rqst *req,
+ struct xdr_stream *xdr, const void *data)
+{
+ const struct nfs42_removexattrargs *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_removexattr(xdr, args->xattr_name, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
+ * Decode REMOVEXATTR response
+ *
+ * @data is a struct nfs42_removexattrres. The reply is walked in order:
+ * compound header, sequence, putfh, removexattr (which fills res->cinfo).
+ * The first non-zero status is returned unchanged.
+ */
+static int nfs4_xdr_dec_removexattr(struct rpc_rqst *req,
+				    struct xdr_stream *xdr, void *data)
+{
+	struct nfs42_removexattrres *res = data;
+	struct compound_hdr hdr;
+	int err;
+
+	err = decode_compound_hdr(xdr, &hdr);
+	if (err)
+		return err;
+
+	err = decode_sequence(xdr, &res->seq_res, req);
+	if (err)
+		return err;
+
+	err = decode_putfh(xdr);
+	if (err)
+		return err;
+
+	return decode_removexattr(xdr, &res->cinfo);
+}
+#endif
+#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
new file mode 100644
index 000000000..8b41c0b86
--- /dev/null
+++ b/fs/nfs/nfs4_fs.h
@@ -0,0 +1,671 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * linux/fs/nfs/nfs4_fs.h
+ *
+ * Copyright (C) 2005 Trond Myklebust
+ *
+ * NFSv4-specific filesystem definitions and declarations
+ */
+
+#ifndef __LINUX_FS_NFS_NFS4_FS_H
+#define __LINUX_FS_NFS_NFS4_FS_H
+
+#if defined(CONFIG_NFS_V4_2)
+#define NFS4_MAX_MINOR_VERSION 2
+#elif defined(CONFIG_NFS_V4_1)
+#define NFS4_MAX_MINOR_VERSION 1
+#else
+#define NFS4_MAX_MINOR_VERSION 0
+#endif
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+
+#define NFS4_MAX_LOOP_ON_RECOVER (10)
+
+#include <linux/seqlock.h>
+
+struct idmap;
+
+/*
+ * NFSv4 client state identifiers. NFS4CLNT_MANAGER_RUNNING is 0 and the
+ * rest are numbered consecutively, so the order of the enumerators fixes
+ * their values.
+ * NOTE(review): presumably used as bit numbers in a per-client state word;
+ * confirm at the users.
+ */
+enum nfs4_client_state {
+ NFS4CLNT_MANAGER_RUNNING = 0,
+ NFS4CLNT_CHECK_LEASE,
+ NFS4CLNT_LEASE_EXPIRED,
+ NFS4CLNT_RECLAIM_REBOOT,
+ NFS4CLNT_RECLAIM_NOGRACE,
+ NFS4CLNT_DELEGRETURN,
+ NFS4CLNT_SESSION_RESET,
+ NFS4CLNT_LEASE_CONFIRM,
+ NFS4CLNT_SERVER_SCOPE_MISMATCH,
+ NFS4CLNT_PURGE_STATE,
+ NFS4CLNT_BIND_CONN_TO_SESSION,
+ NFS4CLNT_MOVED,
+ NFS4CLNT_LEASE_MOVED,
+ NFS4CLNT_DELEGATION_EXPIRED,
+ NFS4CLNT_RUN_MANAGER,
+ NFS4CLNT_MANAGER_AVAILABLE,
+ NFS4CLNT_RECALL_RUNNING,
+ NFS4CLNT_RECALL_ANY_LAYOUT_READ,
+ NFS4CLNT_RECALL_ANY_LAYOUT_RW,
+ NFS4CLNT_DELEGRETURN_DELAYED,
+};
+
+#define NFS4_RENEW_TIMEOUT 0x01
+#define NFS4_RENEW_DELEGATION_CB 0x02
+
+struct nfs_seqid_counter;
+/*
+ * Operations table for one NFSv4 minor version: the minor version number,
+ * its initial capability bits, a set of callbacks, and pointers to the
+ * call_sync, state recovery (reboot and nograce), state renewal and
+ * migration recovery operation tables.
+ */
+struct nfs4_minor_version_ops {
+ u32 minor_version;
+ unsigned init_caps;
+
+ /* client set-up and tear-down */
+ int (*init_client)(struct nfs_client *);
+ void (*shutdown_client)(struct nfs_client *);
+ /* compares two stateids */
+ bool (*match_stateid)(const nfs4_stateid *,
+ const nfs4_stateid *);
+ int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
+ struct nfs_fsinfo *);
+ void (*free_lock_state)(struct nfs_server *,
+ struct nfs4_lock_state *);
+ int (*test_and_free_expired)(struct nfs_server *,
+ nfs4_stateid *, const struct cred *);
+ struct nfs_seqid *
+ (*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+ void (*session_trunk)(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt, void *data);
+ /* operation tables referenced by this minor version */
+ const struct rpc_call_ops *call_sync_ops;
+ const struct nfs4_state_recovery_ops *reboot_recovery_ops;
+ const struct nfs4_state_recovery_ops *nograce_recovery_ops;
+ const struct nfs4_state_maintenance_ops *state_renewal_ops;
+ const struct nfs4_mig_recovery_ops *mig_recovery_ops;
+};
+
+/* Flag for nfs_seqid_counter.flags; set by nfs_confirm_seqid() */
+#define NFS_SEQID_CONFIRMED 1
+/*
+ * Sequence-id counter together with the list and wait queue used to order
+ * the RPC calls that use it.
+ */
+struct nfs_seqid_counter {
+ ktime_t create_time;
+ int owner_id;
+ int flags; /* NFS_SEQID_CONFIRMED */
+ u32 counter;
+ spinlock_t lock; /* Protects the list */
+ struct list_head list; /* Defines sequence of RPC calls */
+ struct rpc_wait_queue wait; /* RPC call delay queue */
+};
+
+/*
+ * One entry tying an RPC task to a sequence-id counter.
+ * NOTE(review): 'list' presumably links into nfs_seqid_counter.list;
+ * confirm at the users.
+ */
+struct nfs_seqid {
+ struct nfs_seqid_counter *sequence; /* counter this entry belongs to */
+ struct list_head list;
+ struct rpc_task *task;
+};
+
+/*
+ * Set NFS_SEQID_CONFIRMED in seqid->flags when seqid_mutating_err()
+ * returns non-zero for the negated @status; otherwise leave the counter
+ * untouched.
+ */
+static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status)
+{
+	if (!seqid_mutating_err(-status))
+		return;
+
+	seqid->flags |= NFS_SEQID_CONFIRMED;
+}
+
+/*
+ * NFS4 state_owners and lock_owners are simply labels for ordered
+ * sequences of RPC calls. Their sole purpose is to provide once-only
+ * semantics by allowing the server to identify replayed requests.
+ *
+ * A state owner belongs to one server (so_server), carries the credential
+ * it is associated with (so_cred) and heads the list of its states
+ * (so_states). nfs4_state documents that its open-mode counters are
+ * protected by so_lock.
+ */
+struct nfs4_state_owner {
+ struct nfs_server *so_server;
+ struct list_head so_lru;
+ unsigned long so_expires;
+ struct rb_node so_server_node;
+
+ const struct cred *so_cred; /* Associated cred */
+
+ spinlock_t so_lock;
+ atomic_t so_count;
+ unsigned long so_flags;
+ struct list_head so_states;
+ struct nfs_seqid_counter so_seqid; /* sequence-id counter of this owner */
+ seqcount_spinlock_t so_reclaim_seqcount;
+ struct mutex so_delegreturn_mutex;
+};
+
+/*
+ * State-owner reclaim identifiers (values 0 and 1).
+ * NOTE(review): presumably bit numbers for nfs4_state_owner.so_flags and
+ * the values stored in nfs4_state_recovery_ops.owner_flag_bit; confirm at
+ * the users.
+ */
+enum {
+ NFS_OWNER_RECLAIM_REBOOT,
+ NFS_OWNER_RECLAIM_NOGRACE
+};
+
+#define NFS_LOCK_NEW 0
+#define NFS_LOCK_RECLAIM 1
+#define NFS_LOCK_EXPIRED 2
+
+/*
+ * struct nfs4_state maintains the client-side state for a given
+ * (state_owner,inode) tuple (OPEN) or state_owner (LOCK).
+ *
+ * OPEN:
+ * In order to know when to OPEN_DOWNGRADE or CLOSE the state on the server,
+ * we need to know how many files are open for reading or writing on a
+ * given inode. This information too is stored here.
+ *
+ * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
+ */
+
+/*
+ * Lock state attached to an open state: it holds the lock stateid
+ * (ls_stateid), the lock owner (ls_owner), its own sequence-id counter
+ * and a reference count.
+ */
+struct nfs4_lock_state {
+ struct list_head ls_locks; /* Other lock stateids */
+ struct nfs4_state * ls_state; /* Pointer to open state */
+/* NOTE(review): presumably bit numbers for ls_flags below; confirm */
+#define NFS_LOCK_INITIALIZED 0
+#define NFS_LOCK_LOST 1
+ unsigned long ls_flags;
+ struct nfs_seqid_counter ls_seqid;
+ nfs4_stateid ls_stateid;
+ refcount_t ls_count;
+ fl_owner_t ls_owner;
+};
+
+/*
+ * bits for nfs4_state->flags
+ *
+ * The enumerators are numbered consecutively from 0 (LK_STATE_IN_USE), so
+ * their order fixes the bit positions.
+ */
+enum {
+ LK_STATE_IN_USE,
+ NFS_DELEGATED_STATE, /* Current stateid is delegation */
+ NFS_OPEN_STATE, /* OPEN stateid is set */
+ NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */
+ NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */
+ NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */
+ NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */
+ NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */
+ NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */
+ NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */
+ NFS_STATE_MAY_NOTIFY_LOCK, /* server may CB_NOTIFY_LOCK */
+ NFS_STATE_CHANGE_WAIT, /* A state changing operation is outstanding */
+ NFS_CLNT_DST_SSC_COPY_STATE, /* dst server open state on client*/
+ NFS_CLNT_SRC_SSC_COPY_STATE, /* src server open state on client*/
+ NFS_SRV_SSC_COPY_STATE, /* ssc state on the dst server */
+};
+
+/*
+ * Client-side open state. It sits on three lists (per state owner, per
+ * inode, and as head of its lock states), records the current and the
+ * OPEN stateid, and counts how many opens of each access mode refer to it.
+ */
+struct nfs4_state {
+ struct list_head open_states; /* List of states for the same state_owner */
+ struct list_head inode_states; /* List of states for the same inode */
+ struct list_head lock_states; /* List of subservient lock stateids */
+
+ struct nfs4_state_owner *owner; /* Pointer to the open owner */
+ struct inode *inode; /* Pointer to the inode */
+
+ unsigned long flags; /* Do we hold any locks? */
+ spinlock_t state_lock; /* Protects the lock_states list */
+
+ seqlock_t seqlock; /* Protects the stateid/open_stateid */
+ nfs4_stateid stateid; /* Current stateid: may be delegation */
+ nfs4_stateid open_stateid; /* OPEN stateid */
+
+ /* The following 3 fields are protected by owner->so_lock */
+ unsigned int n_rdonly; /* Number of read-only references */
+ unsigned int n_wronly; /* Number of write-only references */
+ unsigned int n_rdwr; /* Number of read/write references */
+ fmode_t state; /* State on the server (R,W, or RW) */
+ refcount_t count; /* reference count of this structure */
+
+ wait_queue_head_t waitq;
+ struct rcu_head rcu_head;
+};
+
+
+/*
+ * Context handed to the NFSv4 error handling code (see the
+ * nfs4_handle_exception() declaration in this header): the state, inode
+ * and stateid the failing call referred to, a timeout value, and one-bit
+ * flags.
+ * NOTE(review): which flags are inputs and which are set by the handler
+ * is not visible here; confirm in nfs4proc.c.
+ */
+struct nfs4_exception {
+ struct nfs4_state *state;
+ struct inode *inode;
+ nfs4_stateid *stateid;
+ long timeout;
+ unsigned char task_is_privileged : 1;
+ unsigned char delay : 1,
+ recovering : 1,
+ retry : 1;
+ bool interruptible;
+};
+
+/*
+ * State recovery operations. nfs4_minor_version_ops refers to two of
+ * these tables, one for reboot recovery and one for nograce recovery.
+ * NOTE(review): owner_flag_bit / state_flag_bit presumably select the
+ * NFS_OWNER_RECLAIM_* and NFS_STATE_RECLAIM_* bits this table handles;
+ * confirm at the users.
+ */
+struct nfs4_state_recovery_ops {
+ int owner_flag_bit;
+ int state_flag_bit;
+ int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
+ int (*recover_lock)(struct nfs4_state *, struct file_lock *);
+ int (*establish_clid)(struct nfs_client *, const struct cred *);
+ int (*reclaim_complete)(struct nfs_client *, const struct cred *);
+ int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
+ const struct cred *);
+};
+
+/*
+ * Everything that travels with one OPEN: the open arguments and results
+ * (o_arg / o_res), the open-confirm arguments and results (c_arg / c_res),
+ * attribute and label storage, the dentries involved, the owner and
+ * resulting state, and status flags. Reference counted through kref.
+ */
+struct nfs4_opendata {
+ struct kref kref;
+ struct nfs_openargs o_arg;
+ struct nfs_openres o_res;
+ struct nfs_open_confirmargs c_arg;
+ struct nfs_open_confirmres c_res;
+ struct nfs4_string owner_name;
+ struct nfs4_string group_name;
+ struct nfs4_label *a_label;
+ struct nfs_fattr f_attr;
+ struct nfs4_label *f_label;
+ struct dentry *dir;
+ struct dentry *dentry;
+ struct nfs4_state_owner *owner;
+ struct nfs4_state *state;
+ struct iattr attrs;
+ struct nfs4_layoutget *lgp;
+ unsigned long timestamp;
+ /* status flags and the RPC status of the call */
+ bool rpc_done;
+ bool file_created;
+ bool is_recover;
+ bool cancelled;
+ int rpc_status;
+};
+
+/*
+ * Pairs an nfs_client with a credential.
+ * NOTE(review): presumably the 'data' argument handed to the
+ * session_trunk callback when a transport is added; confirm at the users.
+ */
+struct nfs4_add_xprt_data {
+ struct nfs_client *clp;
+ const struct cred *cred;
+};
+
+/*
+ * State renewal operations; nfs4_minor_version_ops points to one of these
+ * tables through its state_renewal_ops member.
+ */
+struct nfs4_state_maintenance_ops {
+ int (*sched_state_renewal)(struct nfs_client *, const struct cred *, unsigned);
+ const struct cred * (*get_state_renewal_cred)(struct nfs_client *);
+ int (*renew_lease)(struct nfs_client *, const struct cred *);
+};
+
+/*
+ * Migration recovery operations; nfs4_minor_version_ops points to one of
+ * these tables through its mig_recovery_ops member. The signatures match
+ * the nfs4_proc_get_locations() and nfs4_proc_fsid_present() declarations
+ * found in this header.
+ */
+struct nfs4_mig_recovery_ops {
+ int (*get_locations)(struct inode *, struct nfs4_fs_locations *,
+ struct page *, const struct cred *);
+ int (*fsid_present)(struct inode *, const struct cred *);
+};
+
+extern const struct dentry_operations nfs4_dentry_operations;
+
+/* dir.c */
+int nfs_atomic_open(struct inode *, struct dentry *, struct file *,
+ unsigned, umode_t);
+
+/* fs_context.c */
+extern struct file_system_type nfs4_fs_type;
+
+/* nfs4namespace.c */
+struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *,
+ const struct qstr *);
+int nfs4_submount(struct fs_context *, struct nfs_server *);
+int nfs4_replace_transport(struct nfs_server *server,
+ const struct nfs4_fs_locations *locations);
+size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa,
+ size_t salen, struct net *net, int port);
+/* nfs4proc.c */
+extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *);
+extern int nfs4_async_handle_error(struct rpc_task *task,
+ struct nfs_server *server,
+ struct nfs4_state *state, long *timeout);
+extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *,
+ struct rpc_message *, struct nfs4_sequence_args *,
+ struct nfs4_sequence_res *, int);
+extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int, int);
+extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, const struct cred *, struct nfs4_setclientid_res *);
+extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, const struct cred *);
+extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
+extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, const struct cred *cred);
+extern int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred);
+extern int nfs4_destroy_clientid(struct nfs_client *clp);
+extern int nfs4_init_clientid(struct nfs_client *, const struct cred *);
+extern int nfs41_init_clientid(struct nfs_client *, const struct cred *);
+extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait);
+extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
+extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *,
+ struct nfs4_fs_locations *, struct page *);
+extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *,
+ struct page *page, const struct cred *);
+extern int nfs4_proc_fsid_present(struct inode *, const struct cred *);
+extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *,
+ struct dentry *,
+ struct nfs_fh *,
+ struct nfs_fattr *);
+extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
+extern const struct xattr_handler *nfs4_xattr_handlers[];
+extern int nfs4_set_rw_stateid(nfs4_stateid *stateid,
+ const struct nfs_open_context *ctx,
+ const struct nfs_lock_context *l_ctx,
+ fmode_t fmode);
+extern int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct nfs4_label *label,
+ struct inode *inode);
+extern int update_open_stateid(struct nfs4_state *state,
+ const nfs4_stateid *open_stateid,
+ const nfs4_stateid *deleg_stateid,
+ fmode_t fmode);
+
+extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
+ struct nfs_fsinfo *fsinfo);
+extern void nfs4_update_changeattr(struct inode *dir,
+ struct nfs4_change_info *cinfo,
+ unsigned long timestamp,
+ unsigned long cache_validity);
+extern int nfs4_buf_to_pages_noslab(const void *buf, size_t buflen,
+ struct page **pages);
+
+#if defined(CONFIG_NFS_V4_1)
+extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *);
+extern int nfs4_proc_create_session(struct nfs_client *, const struct cred *);
+extern int nfs4_proc_destroy_session(struct nfs4_session *, const struct cred *);
+extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
+ bool sync);
+extern int nfs4_detect_session_trunking(struct nfs_client *clp,
+ struct nfs41_exchange_id_res *res, struct rpc_xprt *xprt);
+
+/*
+ * Report whether the pNFS role bits of the client's exchange flags
+ * (those selected by EXCHGID4_FLAG_MASK_PNFS) are exactly
+ * EXCHGID4_FLAG_USE_PNFS_DS.
+ */
+static inline bool
+is_ds_only_client(struct nfs_client *clp)
+{
+	if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) !=
+	    EXCHGID4_FLAG_USE_PNFS_DS)
+		return false;
+	return true;
+}
+
+/*
+ * Report whether EXCHGID4_FLAG_USE_PNFS_DS is set in the client's
+ * exchange flags, regardless of any other flag.
+ */
+static inline bool
+is_ds_client(struct nfs_client *clp)
+{
+	return (clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS) != 0;
+}
+
+/*
+ * Switch @msg to the machine credential and *@clntp to the nfs_client's
+ * own rpc_clnt when @sp4_mode is set in clp->cl_sp4_flags.
+ *
+ * Returns true if the credential and client were switched, false if
+ * @msg and *@clntp were left untouched.
+ */
+static inline bool
+_nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
+		    struct rpc_clnt **clntp, struct rpc_message *msg)
+{
+	bool cleanup_op = sp4_mode == NFS_SP4_MACH_CRED_CLEANUP ||
+			  sp4_mode == NFS_SP4_MACH_CRED_PNFS_CLEANUP;
+	rpc_authflavor_t flavor;
+
+	/*
+	 * Using machine creds for cleanup operations is only relevant
+	 * if the client credentials might expire, so don't bother for
+	 * RPC_AUTH_UNIX. If the file was only exported to sec=sys, the
+	 * PUTFH would fail anyway.
+	 */
+	if (cleanup_op && (*clntp)->cl_auth->au_flavor == RPC_AUTH_UNIX)
+		return false;
+
+	if (!test_bit(sp4_mode, &clp->cl_sp4_flags))
+		return false;
+
+	msg->rpc_cred = rpc_machine_cred();
+
+	flavor = clp->cl_rpcclient->cl_auth->au_flavor;
+	WARN_ON_ONCE(flavor != RPC_AUTH_GSS_KRB5I &&
+		     flavor != RPC_AUTH_GSS_KRB5P);
+	*clntp = clp->cl_rpcclient;
+
+	return true;
+}
+
+/*
+ * Decide whether an rpc_message should be sent with the machine cred
+ * under SP4_MACH_CRED and, if so, switch both the credential and the
+ * authflavor (by using the nfs_client's rpc_clnt, which will be
+ * krb5i/p). Call this before rpc_call_sync/rpc_call_async.
+ *
+ * Thin wrapper around _nfs4_state_protect() that discards its result.
+ */
+static inline void
+nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
+		   struct rpc_clnt **clntp, struct rpc_message *msg)
+{
+	(void)_nfs4_state_protect(clp, sp4_mode, clntp, msg);
+}
+
+/*
+ * WRITE flavour of nfs4_state_protect().
+ * When WRITE may use the machine cred but COMMIT may not, every write
+ * sent with the machine cred is forced to NFS_FILE_SYNC.
+ */
+static inline void
+nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
+			 struct rpc_message *msg, struct nfs_pgio_header *hdr)
+{
+	/* Machine cred not in use for this WRITE: nothing to adjust. */
+	if (!_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg))
+		return;
+
+	/* COMMIT can use the machine cred too: unstable writes are fine. */
+	if (test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags))
+		return;
+
+	hdr->args.stable = NFS_FILE_SYNC;
+}
+#else /* CONFIG_NFS_V4_1 */
+/* Stub used when CONFIG_NFS_V4_1 is not defined: always false. */
+static inline bool
+is_ds_only_client(struct nfs_client *clp)
+{
+ return false;
+}
+
+/* Stub used when CONFIG_NFS_V4_1 is not defined: always false. */
+static inline bool
+is_ds_client(struct nfs_client *clp)
+{
+ return false;
+}
+
+/* Stub used when CONFIG_NFS_V4_1 is not defined: does nothing. */
+static inline void
+nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags,
+ struct rpc_clnt **clntp, struct rpc_message *msg)
+{
+}
+
+/* Stub used when CONFIG_NFS_V4_1 is not defined: does nothing. */
+static inline void
+nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
+ struct rpc_message *msg, struct nfs_pgio_header *hdr)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
+
+extern const u32 nfs4_fattr_bitmap[3];
+extern const u32 nfs4_statfs_bitmap[3];
+extern const u32 nfs4_pathconf_bitmap[3];
+extern const u32 nfs4_fsinfo_bitmap[3];
+extern const u32 nfs4_fs_locations_bitmap[3];
+
+void nfs40_shutdown_client(struct nfs_client *);
+void nfs41_shutdown_client(struct nfs_client *);
+int nfs40_init_client(struct nfs_client *);
+int nfs41_init_client(struct nfs_client *);
+void nfs4_free_client(struct nfs_client *);
+
+struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *);
+
+/* nfs4renewd.c */
+extern void nfs4_schedule_state_renewal(struct nfs_client *);
+extern void nfs4_renewd_prepare_shutdown(struct nfs_server *);
+extern void nfs4_kill_renewd(struct nfs_client *);
+extern void nfs4_renew_state(struct work_struct *);
+extern void nfs4_set_lease_period(struct nfs_client *clp, unsigned long lease);
+
+
+/* nfs4state.c */
+extern const nfs4_stateid current_stateid;
+
+const struct cred *nfs4_get_clid_cred(struct nfs_client *clp);
+const struct cred *nfs4_get_machine_cred(struct nfs_client *clp);
+const struct cred *nfs4_get_renew_cred(struct nfs_client *clp);
+int nfs4_discover_server_trunking(struct nfs_client *clp,
+ struct nfs_client **);
+int nfs40_discover_server_trunking(struct nfs_client *clp,
+ struct nfs_client **, const struct cred *);
+#if defined(CONFIG_NFS_V4_1)
+int nfs41_discover_server_trunking(struct nfs_client *clp,
+ struct nfs_client **, const struct cred *);
+extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
+extern void nfs41_notify_server(struct nfs_client *);
+bool nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1,
+ struct nfs41_server_owner *o2);
+#else
+/* Stub used when CONFIG_NFS_V4_1 is not defined: does nothing. */
+static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, const struct cred *, gfp_t);
+extern void nfs4_put_state_owner(struct nfs4_state_owner *);
+extern void nfs4_purge_state_owners(struct nfs_server *, struct list_head *);
+extern void nfs4_free_state_owners(struct list_head *head);
+extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
+extern void nfs4_put_open_state(struct nfs4_state *);
+extern void nfs4_close_state(struct nfs4_state *, fmode_t);
+extern void nfs4_close_sync(struct nfs4_state *, fmode_t);
+extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
+extern void nfs_inode_find_state_and_recover(struct inode *inode,
+ const nfs4_stateid *stateid);
+extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *, struct nfs4_state *);
+extern void nfs4_schedule_lease_recovery(struct nfs_client *);
+extern int nfs4_wait_clnt_recover(struct nfs_client *clp);
+extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);
+extern void nfs4_schedule_state_manager(struct nfs_client *);
+extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
+extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
+extern int nfs4_schedule_migration_recovery(const struct nfs_server *);
+extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *);
+extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags, bool);
+extern void nfs41_handle_server_scope(struct nfs_client *,
+ struct nfs41_server_scope **);
+extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
+extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
+extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t,
+ const struct nfs_lock_context *, nfs4_stateid *,
+ const struct cred **);
+extern bool nfs4_copy_open_stateid(nfs4_stateid *dst,
+ struct nfs4_state *state);
+
+extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
+extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
+extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
+extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
+extern void nfs_release_seqid(struct nfs_seqid *seqid);
+extern void nfs_free_seqid(struct nfs_seqid *seqid);
+extern int nfs4_setup_sequence(struct nfs_client *client,
+ struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ struct rpc_task *task);
+extern int nfs4_sequence_done(struct rpc_task *task,
+ struct nfs4_sequence_res *res);
+
+extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp);
+extern int nfs4_proc_commit(struct file *dst, __u64 offset, __u32 count, struct nfs_commitres *res);
+extern const nfs4_stateid zero_stateid;
+extern const nfs4_stateid invalid_stateid;
+
+/* nfs4super.c */
+struct nfs_mount_info;
+extern struct nfs_subversion nfs_v4;
+extern bool nfs4_disable_idmapping;
+extern unsigned short max_session_slots;
+extern unsigned short max_session_cb_slots;
+extern unsigned short send_implementation_id;
+extern bool recover_lost_locks;
+
+#define NFS4_CLIENT_ID_UNIQ_LEN (64)
+extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN];
+
+extern int nfs4_try_get_tree(struct fs_context *);
+extern int nfs4_get_referral_tree(struct fs_context *);
+
+/* nfs4sysctl.c */
+#ifdef CONFIG_SYSCTL
+int nfs4_register_sysctl(void);
+void nfs4_unregister_sysctl(void);
+#else
+/* Stub used when CONFIG_SYSCTL is not defined: reports success. */
+static inline int nfs4_register_sysctl(void)
+{
+ return 0;
+}
+
+/* Stub used when CONFIG_SYSCTL is not defined: does nothing. */
+static inline void nfs4_unregister_sysctl(void)
+{
+}
+#endif
+
+/* nfs4xdr.c */
+extern const struct rpc_procinfo nfs4_procedures[];
+
+#ifdef CONFIG_NFS_V4_2
+extern const u32 nfs42_maxsetxattr_overhead;
+extern const u32 nfs42_maxgetxattr_overhead;
+extern const u32 nfs42_maxlistxattrs_overhead;
+#endif
+
+struct nfs4_mount_data;
+
+/* callback_xdr.c */
+extern const struct svc_version nfs4_callback_version1;
+extern const struct svc_version nfs4_callback_version4;
+
+/* Copy the raw stateid bytes and the stateid type from @src into @dst. */
+static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
+{
+	const size_t nbytes = sizeof(dst->data);
+
+	memcpy(dst->data, src->data, nbytes);
+	dst->type = src->type;
+}
+
+/* True when both stateids have the same type and identical raw bytes. */
+static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src)
+{
+	return dst->type == src->type &&
+	       memcmp(dst->data, src->data, sizeof(dst->data)) == 0;
+}
+
+/*
+ * True when the "other" fields of the two stateids are identical.
+ * Neither the seqid nor the type is compared.
+ */
+static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src)
+{
+	return !memcmp(dst->other, src->other, NFS4_STATEID_OTHER_SIZE);
+}
+
+/*
+ * True when the seqid of @s1 is ahead of the seqid of @s2. The
+ * difference is evaluated as a signed 32-bit value so that the
+ * comparison still works across seqid wraparound.
+ */
+static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stateid *s2)
+{
+	u32 seq1 = be32_to_cpu(s1->seqid);
+	u32 seq2 = be32_to_cpu(s2->seqid);
+	s32 delta = (s32)(seq1 - seq2);
+
+	return delta > 0;
+}
+
+/*
+ * True when the seqid of @s2 directly follows the seqid of @s1:
+ * either it is exactly one greater, or @s1 holds 0xffffffff and @s2
+ * holds 1.
+ */
+static inline bool nfs4_stateid_is_next(const nfs4_stateid *s1, const nfs4_stateid *s2)
+{
+	u32 seq1 = be32_to_cpu(s1->seqid);
+	u32 seq2 = be32_to_cpu(s2->seqid);
+	bool plain_successor = seq2 == seq1 + 1U;
+	bool wrapped_successor = seq1 == 0xffffffffU && seq2 == 1U;
+
+	return plain_successor || wrapped_successor;
+}
+
+/*
+ * True when @dst and @src share the same "other" field and @dst is not
+ * newer than @src. A zero seqid in @src disables the age comparison.
+ */
+static inline bool nfs4_stateid_match_or_older(const nfs4_stateid *dst, const nfs4_stateid *src)
+{
+	if (!nfs4_stateid_match_other(dst, src))
+		return false;
+	if (!src->seqid)
+		return true;
+	return !nfs4_stateid_is_newer(dst, src);
+}
+
+/*
+ * Advance the seqid of @s1 by one, skipping the value zero when the
+ * counter wraps around.
+ */
+static inline void nfs4_stateid_seqid_inc(nfs4_stateid *s1)
+{
+	u32 next = be32_to_cpu(s1->seqid) + 1;
+
+	if (next == 0)
+		next = 1;
+	s1->seqid = cpu_to_be32(next);
+}
+
+/* True unless NFS_STATE_RECOVERY_FAILED is set in the state's flags. */
+static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)
+{
+	return !test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags);
+}
+
+/*
+ * True when @state has NFS_OPEN_STATE set and the "other" field of its
+ * open stateid matches that of @stateid.
+ */
+static inline bool nfs4_state_match_open_stateid_other(const struct nfs4_state *state,
+		const nfs4_stateid *stateid)
+{
+	if (!test_bit(NFS_OPEN_STATE, &state->flags))
+		return false;
+	return nfs4_stateid_match_other(&state->open_stateid, stateid);
+}
+
+/* nfs42xattr.c */
+#ifdef CONFIG_NFS_V4_2
+extern int __init nfs4_xattr_cache_init(void);
+extern void nfs4_xattr_cache_exit(void);
+extern void nfs4_xattr_cache_add(struct inode *inode, const char *name,
+ const char *buf, struct page **pages,
+ ssize_t buflen);
+extern void nfs4_xattr_cache_remove(struct inode *inode, const char *name);
+extern ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name,
+ char *buf, ssize_t buflen);
+extern void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf,
+ ssize_t buflen);
+extern ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf,
+ ssize_t buflen);
+extern void nfs4_xattr_cache_zap(struct inode *inode);
+#else
+/* Stub used when CONFIG_NFS_V4_2 is not defined: does nothing. */
+static inline void nfs4_xattr_cache_zap(struct inode *inode)
+{
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+#else /* CONFIG_NFS_V4 */
+
+#define nfs4_close_state(a, b) do { } while (0)
+#define nfs4_close_sync(a, b) do { } while (0)
+#define nfs4_state_protect(a, b, c, d) do { } while (0)
+#define nfs4_state_protect_write(a, b, c, d) do { } while (0)
+
+
+#endif /* CONFIG_NFS_V4 */
+#endif /* __LINUX_FS_NFS_NFS4_FS.H */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
new file mode 100644
index 000000000..252c99c76
--- /dev/null
+++ b/fs/nfs/nfs4client.c
@@ -0,0 +1,1342 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+#include <linux/module.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/auth.h>
+#include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/bc_xprt.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include "internal.h"
+#include "callback.h"
+#include "delegation.h"
+#include "nfs4session.h"
+#include "nfs4idmap.h"
+#include "pnfs.h"
+#include "netns.h"
+
+#define NFSDBG_FACILITY NFSDBG_CLIENT
+
+/*
+ * Get a unique NFSv4.0 callback identifier which will be used
+ * by the V4.0 callback service to lookup the nfs_client struct.
+ *
+ * Nothing is allocated unless the client is version 4 with minor
+ * version 0. Returns 0 on success (or when nothing was needed),
+ * otherwise the negative value returned by idr_alloc().
+ */
+static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
+{
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+	int id;
+
+	if (clp->rpc_ops->version != 4 || minorversion != 0)
+		return 0;
+
+	/* Preload outside the spinlock so idr_alloc() can use GFP_NOWAIT. */
+	idr_preload(GFP_KERNEL);
+	spin_lock(&nn->nfs_client_lock);
+	id = idr_alloc(&nn->cb_ident_idr, clp, 1, 0, GFP_NOWAIT);
+	if (id >= 0)
+		clp->cl_cb_ident = id;
+	spin_unlock(&nn->nfs_client_lock);
+	idr_preload_end();
+
+	if (id < 0)
+		return id;
+	return 0;
+}
+
+#ifdef CONFIG_NFS_V4_1
+/*
+ * Per auth flavor data server rpc clients
+ *
+ * One entry is kept on the nfs_client's cl_ds_clients list for each
+ * auth flavor in use; entries are looked up by the au_flavor of
+ * their rpc_clnt.
+ */
+struct nfs4_ds_server {
+ struct list_head list; /* ds_clp->cl_ds_clients */
+ struct rpc_clnt *rpc_clnt; /* rpc client bound to one auth flavor */
+};
+
+/**
+ * nfs4_find_ds_client - Common lookup case for DS I/O
+ * @ds_clp: pointer to the DS's nfs_client
+ * @flavor: rpc auth flavour to match
+ *
+ * Walks ds_clp->cl_ds_clients under rcu_read_lock() and returns the
+ * first entry whose rpc client uses @flavor, or NULL if there is none.
+ */
+static struct nfs4_ds_server *
+nfs4_find_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor)
+{
+	struct nfs4_ds_server *pos;
+	struct nfs4_ds_server *match = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(pos, &ds_clp->cl_ds_clients, list) {
+		if (pos->rpc_clnt->cl_auth->au_flavor == flavor) {
+			match = pos;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return match;
+}
+
+/*
+ * Insert @new into ds_clp->cl_ds_clients unless an entry for @flavor is
+ * already present. The search and the insertion are both done under
+ * ds_clp->cl_lock.
+ *
+ * Returns the existing entry for @flavor when there is one, otherwise
+ * @new (which may be NULL, in which case nothing is inserted).
+ */
+static struct nfs4_ds_server *
+nfs4_add_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor,
+		   struct nfs4_ds_server *new)
+{
+	struct nfs4_ds_server *pos;
+	struct nfs4_ds_server *existing = NULL;
+
+	spin_lock(&ds_clp->cl_lock);
+	list_for_each_entry(pos, &ds_clp->cl_ds_clients, list) {
+		if (pos->rpc_clnt->cl_auth->au_flavor == flavor) {
+			existing = pos;
+			break;
+		}
+	}
+	if (existing == NULL) {
+		if (new)
+			list_add_rcu(&new->list, &ds_clp->cl_ds_clients);
+		existing = new;
+	}
+	spin_unlock(&ds_clp->cl_lock); /* need some lock to protect list */
+
+	return existing;
+}
+
+/*
+ * Allocate a nfs4_ds_server whose rpc client is a clone of
+ * ds_clp->cl_rpcclient set to use @flavor.
+ *
+ * Returns the new entry (not yet on any list), ERR_PTR(-ENOMEM) if the
+ * allocation fails, or the error from rpc_clone_client_set_auth().
+ */
+static struct nfs4_ds_server *
+nfs4_alloc_ds_server(struct nfs_client *ds_clp, rpc_authflavor_t flavor)
+{
+	struct nfs4_ds_server *dss = kmalloc(sizeof(*dss), GFP_NOFS);
+	struct rpc_clnt *clnt;
+
+	if (!dss)
+		return ERR_PTR(-ENOMEM);
+
+	clnt = rpc_clone_client_set_auth(ds_clp->cl_rpcclient, flavor);
+	if (IS_ERR(clnt)) {
+		kfree(dss);
+		return ERR_CAST(clnt);
+	}
+
+	dss->rpc_clnt = clnt;
+	INIT_LIST_HEAD(&dss->list);
+
+	return dss;
+}
+
+/*
+ * Release the rpc client held by @dss and free the entry itself.
+ * The entry is not unlinked here; the only caller visible in this
+ * file passes an entry that was never added to a list.
+ */
+static void
+nfs4_free_ds_server(struct nfs4_ds_server *dss)
+{
+ rpc_release_client(dss->rpc_clnt);
+ kfree(dss);
+}
+
+/**
+ * nfs4_find_or_create_ds_client - Find or create a DS rpc client
+ * @ds_clp: pointer to the DS's nfs_client
+ * @inode: pointer to the inode
+ *
+ * Find or create a DS rpc client with the MDS server rpc client auth
+ * flavor in the nfs_client cl_ds_clients list.
+ *
+ * Returns the rpc client, or an ERR_PTR if a new one could not be
+ * allocated.
+ */
+struct rpc_clnt *
+nfs4_find_or_create_ds_client(struct nfs_client *ds_clp, struct inode *inode)
+{
+	rpc_authflavor_t flavor = NFS_SERVER(inode)->client->cl_auth->au_flavor;
+	struct nfs4_ds_server *dss, *new;
+
+	dss = nfs4_find_ds_client(ds_clp, flavor);
+	if (dss == NULL) {
+		new = nfs4_alloc_ds_server(ds_clp, flavor);
+		if (IS_ERR(new))
+			return ERR_CAST(new);
+
+		/* Another entry for this flavor may have been added meanwhile. */
+		dss = nfs4_add_ds_client(ds_clp, flavor, new);
+		if (dss != new)
+			nfs4_free_ds_server(new);
+	}
+
+	return dss->rpc_clnt;
+}
+EXPORT_SYMBOL_GPL(nfs4_find_or_create_ds_client);
+
+/*
+ * Empty clp->cl_ds_clients: unlink each entry, shut down its rpc
+ * client and free it.
+ */
+static void
+nfs4_shutdown_ds_clients(struct nfs_client *clp)
+{
+	struct list_head *head = &clp->cl_ds_clients;
+	struct nfs4_ds_server *dss;
+
+	while (!list_empty(head)) {
+		dss = list_entry(head->next, struct nfs4_ds_server, list);
+		list_del(&dss->list);
+		rpc_shutdown_client(dss->rpc_clnt);
+		kfree(dss);
+	}
+}
+
+/*
+ * Empty clp->pending_cb_stateids, unlinking and freeing every
+ * nfs4_copy_state still queued on it.
+ */
+static void
+nfs4_cleanup_callback(struct nfs_client *clp)
+{
+	struct list_head *head = &clp->pending_cb_stateids;
+	struct nfs4_copy_state *cp_state;
+
+	while (!list_empty(head)) {
+		cp_state = list_entry(head->next,
+				      struct nfs4_copy_state, copies);
+		list_del(&cp_state->copies);
+		kfree(cp_state);
+	}
+}
+
+/*
+ * Tear down the session-related state of @clp. Does nothing when the
+ * client has no session.
+ */
+void nfs41_shutdown_client(struct nfs_client *clp)
+{
+	if (!nfs4_has_session(clp))
+		return;
+
+	nfs4_cleanup_callback(clp);
+	nfs4_shutdown_ds_clients(clp);
+	nfs4_destroy_session(clp->cl_session);
+	nfs4_destroy_clientid(clp);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Shut down and free the slot table attached to @clp, if any.
+ */
+void nfs40_shutdown_client(struct nfs_client *clp)
+{
+	struct nfs4_slot_table *tbl = clp->cl_slot_tbl;
+
+	if (!tbl)
+		return;
+
+	nfs4_shutdown_slot_table(tbl);
+	kfree(tbl);
+}
+
+/*
+ * Allocate an nfs_client via nfs_alloc_client() and set up its
+ * NFSv4-specific parts: callback ident, minor version ops, initial
+ * state and flags, rpc client, callback address and idmapper.
+ *
+ * Returns the new nfs_client, or an ERR_PTR on failure. On failure
+ * after the allocation the client is released with nfs_free_client().
+ *
+ * NOTE(review): the error path does not visibly remove the entry that
+ * nfs_get_cb_ident_idr() may have stored in cb_ident_idr; confirm that
+ * nfs_free_client() (not shown here) takes care of it.
+ */
+struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
+{
+ char buf[INET6_ADDRSTRLEN + 1];
+ const char *ip_addr = cl_init->ip_addr;
+ struct nfs_client *clp = nfs_alloc_client(cl_init);
+ int err;
+
+ if (IS_ERR(clp))
+ return clp;
+
+ err = nfs_get_cb_ident_idr(clp, cl_init->minorversion);
+ if (err)
+ goto error;
+
+ /* Checked before minorversion is used to index nfs_v4_minor_ops[]. */
+ if (cl_init->minorversion > NFS4_MAX_MINOR_VERSION) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ spin_lock_init(&clp->cl_lock);
+ INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
+ INIT_LIST_HEAD(&clp->cl_ds_clients);
+ rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
+ /* The client starts out with only the lease-expired bit set. */
+ clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
+ clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
+ clp->cl_mig_gen = 1;
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+ init_waitqueue_head(&clp->cl_lock_waitq);
+#endif
+ INIT_LIST_HEAD(&clp->pending_cb_stateids);
+
+ if (cl_init->minorversion != 0)
+ __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags);
+ __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
+ __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
+
+ /*
+ * Set up the connection to the server before we add it to the
+ * global list.
+ *
+ * RPC_AUTH_GSS_KRB5I is tried first; RPC_AUTH_UNIX is used only
+ * when that attempt fails with -EINVAL.
+ */
+ err = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I);
+ if (err == -EINVAL)
+ err = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
+ if (err < 0)
+ goto error;
+
+ /* If no clientaddr= option was specified, find a usable cb address */
+ if (ip_addr == NULL) {
+ struct sockaddr_storage cb_addr;
+ struct sockaddr *sap = (struct sockaddr *)&cb_addr;
+
+ err = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr));
+ if (err < 0)
+ goto error;
+ err = rpc_ntop(sap, buf, sizeof(buf));
+ if (err < 0)
+ goto error;
+ ip_addr = (const char *)buf;
+ }
+ strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
+
+ err = nfs_idmap_new(clp);
+ if (err < 0) {
+ dprintk("%s: failed to create idmapper. Error = %d\n",
+ __func__, err);
+ goto error;
+ }
+ /* Record that an idmapper exists so nfs4_shutdown_client() deletes it. */
+ __set_bit(NFS_CS_IDMAP, &clp->cl_res_state);
+ return clp;
+
+error:
+ nfs_free_client(clp);
+ return ERR_PTR(err);
+}
+
+/*
+ * Destroy the NFS4 callback service.
+ *
+ * Only acts when NFS_CS_CALLBACK was set in cl_res_state; the bit is
+ * cleared in the same step.
+ */
+static void nfs4_destroy_callback(struct nfs_client *clp)
+{
+	if (!__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
+		return;
+
+	nfs_callback_down(clp->cl_mvops->minor_version, clp->cl_net);
+}
+
+/*
+ * Release the NFSv4-specific resources of @clp. Each optional resource
+ * (renew daemon, callback service, idmapper) is torn down only if its
+ * NFS_CS_* bit is set in cl_res_state; the bit is cleared as it is
+ * tested. The nfs_client itself is not freed here.
+ */
+static void nfs4_shutdown_client(struct nfs_client *clp)
+{
+ if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
+ nfs4_kill_renewd(clp);
+ /* Minor-version specific teardown (nfs40/nfs41_shutdown_client). */
+ clp->cl_mvops->shutdown_client(clp);
+ nfs4_destroy_callback(clp);
+ if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
+ nfs_idmap_delete(clp);
+
+ rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
+ kfree(clp->cl_serverowner);
+ kfree(clp->cl_serverscope);
+ kfree(clp->cl_implid);
+ kfree(clp->cl_owner_id);
+}
+
+/*
+ * Release the NFSv4-specific state of @clp, then free the nfs_client
+ * through nfs_free_client().
+ */
+void nfs4_free_client(struct nfs_client *clp)
+{
+ nfs4_shutdown_client(clp);
+ nfs_free_client(clp);
+}
+
+/*
+ * Initialize the NFS4 callback service.
+ *
+ * For clients with a session the backchannel is set up on the
+ * transport first. On success NFS_CS_CALLBACK is set in cl_res_state.
+ * Returns 0 on success or a negative error.
+ */
+static int nfs4_init_callback(struct nfs_client *clp)
+{
+	struct rpc_xprt *xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt);
+	int status;
+
+	if (nfs4_has_session(clp)) {
+		status = xprt_setup_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
+		if (status < 0)
+			return status;
+	}
+
+	status = nfs_callback_up(clp->cl_mvops->minor_version, xprt);
+	if (status >= 0) {
+		__set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
+		return 0;
+	}
+
+	dprintk("%s: failed to start callback. Error = %d\n",
+		__func__, status);
+	return status;
+}
+
+/**
+ * nfs40_init_client - nfs_client initialization tasks for NFSv4.0
+ * @clp: nfs_client to initialize
+ *
+ * Allocates and sets up the slot table and attaches it to @clp.
+ *
+ * Returns zero on success, or a negative errno if some error occurred.
+ */
+int nfs40_init_client(struct nfs_client *clp)
+{
+	struct nfs4_slot_table *tbl = kzalloc(sizeof(*tbl), GFP_NOFS);
+	int ret;
+
+	if (!tbl)
+		return -ENOMEM;
+
+	ret = nfs4_setup_slot_table(tbl, NFS4_MAX_SLOT_TABLE,
+				    "NFSv4.0 transport Slot table");
+	if (ret == 0) {
+		clp->cl_slot_tbl = tbl;
+		return 0;
+	}
+
+	/* Setup failed: undo whatever was set up and drop the table. */
+	nfs4_shutdown_slot_table(tbl);
+	kfree(tbl);
+	return ret;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+/**
+ * nfs41_init_client - nfs_client initialization tasks for NFSv4.1+
+ * @clp: nfs_client to initialize
+ *
+ * Returns zero on success, or a negative errno if some error occurred.
+ */
+int nfs41_init_client(struct nfs_client *clp)
+{
+	/*
+	 * Create the session and mark it expired.
+	 * When a SEQUENCE operation encounters the expired session
+	 * it will do session recovery to initialize it.
+	 */
+	struct nfs4_session *session = nfs4_alloc_session(clp);
+
+	if (session == NULL)
+		return -ENOMEM;
+	clp->cl_session = session;
+
+	/*
+	 * The create session reply races with the server back
+	 * channel probe. Mark the client NFS_CS_SESSION_INITING
+	 * so that the client back channel can find the
+	 * nfs_client struct.
+	 */
+	nfs_mark_client_ready(clp, NFS_CS_SESSION_INITING);
+
+	return 0;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Initialize the minor version specific parts of an NFS4 client record,
+ * then bring up the callback service. The callback service is not
+ * started if the minor version hook reports a non-zero status, which
+ * is returned as is.
+ */
+static int nfs4_init_client_minor_version(struct nfs_client *clp)
+{
+	int ret = clp->cl_mvops->init_client(clp);
+
+	return ret ? ret : nfs4_init_callback(clp);
+}
+
+/**
+ * nfs4_init_client - Initialise an NFS4 client record
+ *
+ * @clp: nfs_client to initialise
+ * @cl_init: pointer to nfs_client_initdata
+ *
+ * Runs the minor version specific initialisation and server trunking
+ * discovery. If discovery yields a different nfs_client, @clp is
+ * marked as failed (-EPERM) and the discovered client is returned
+ * instead. Except when @clp is already NFS_CS_READY, nfs_put_client()
+ * is called on @clp on both the success and the error path.
+ *
+ * Returns pointer to an NFS client, or an ERR_PTR value.
+ */
+struct nfs_client *nfs4_init_client(struct nfs_client *clp,
+ const struct nfs_client_initdata *cl_init)
+{
+ struct nfs_client *old;
+ int error;
+
+ if (clp->cl_cons_state == NFS_CS_READY)
+ /* the client is initialised already */
+ return clp;
+
+ error = nfs4_init_client_minor_version(clp);
+ if (error < 0)
+ goto error;
+
+ /* On success "old" is the client to use; it may be clp itself. */
+ error = nfs4_discover_server_trunking(clp, &old);
+ if (error < 0)
+ goto error;
+
+ if (clp != old) {
+ clp->cl_preserve_clid = true;
+ /*
+ * Mark the client as having failed initialization so other
+ * processes walking the nfs_client_list in nfs_match_client()
+ * won't try to use it.
+ */
+ nfs_mark_client_ready(clp, -EPERM);
+ }
+ clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags);
+ /* NOTE(review): presumably discovery returned "old" with its own reference - confirm. */
+ nfs_put_client(clp);
+ return old;
+
+error:
+ nfs_mark_client_ready(clp, error);
+ nfs_put_client(clp);
+ return ERR_PTR(error);
+}
+
+/*
+ * SETCLIENTID just did a callback update with the callback ident in
+ * "drop," but server trunking discovery claims "drop" and "keep" are
+ * actually the same server. Swap the callback IDs so that "keep"
+ * will continue to use the callback ident the server now knows about,
+ * and so that "keep"'s original callback ident is destroyed when
+ * "drop" is freed.
+ *
+ * Both the cb_ident_idr entries and the cl_cb_ident fields are swapped
+ * under nn->nfs_client_lock. Nothing is done if the two idents are
+ * already equal.
+ */
+static void nfs4_swap_callback_idents(struct nfs_client *keep,
+ struct nfs_client *drop)
+{
+ struct nfs_net *nn = net_generic(keep->cl_net, nfs_net_id);
+ unsigned int save = keep->cl_cb_ident; /* keep's original ident */
+
+ if (keep->cl_cb_ident == drop->cl_cb_ident)
+ return;
+
+ dprintk("%s: keeping callback ident %u and dropping ident %u\n",
+ __func__, keep->cl_cb_ident, drop->cl_cb_ident);
+
+ spin_lock(&nn->nfs_client_lock);
+
+ /* drop's ident now resolves to keep ... */
+ idr_replace(&nn->cb_ident_idr, keep, drop->cl_cb_ident);
+ keep->cl_cb_ident = drop->cl_cb_ident;
+
+ /* ... and keep's original ident now resolves to drop. */
+ idr_replace(&nn->cb_ident_idr, drop, save);
+ drop->cl_cb_ident = save;
+
+ spin_unlock(&nn->nfs_client_lock);
+}
+
+/*
+ * Compare the owner id strings of two clients. A client without an
+ * owner id is treated as matching anything.
+ */
+static bool nfs4_match_client_owner_id(const struct nfs_client *clp1,
+		const struct nfs_client *clp2)
+{
+	const char *id1 = clp1->cl_owner_id;
+	const char *id2 = clp2->cl_owner_id;
+
+	if (!id1 || !id2)
+		return true;
+	return !strcmp(id1, id2);
+}
+
+/* True when the two verifiers hold byte-identical data. */
+static bool nfs4_same_verifier(nfs4_verifier *v1, nfs4_verifier *v2)
+{
+	return !memcmp(v1->data, v2->data, sizeof(v1->data));
+}
+
+/*
+ * Decide whether the list entry "pos" may refer to the same client as
+ * "new". Shared by the v4.0 and v4.1 client list walkers.
+ *
+ * Called with nn->nfs_client_lock held. If "pos" is still being
+ * initialised the lock is dropped while waiting for it and retaken
+ * before returning; in that case the reference held in *prev is
+ * released and *prev is set to "pos", which now carries a reference.
+ *
+ * Returns 0 if "pos" matches, 1 if it does not, or the negative status
+ * from nfs_wait_client_init_complete().
+ */
+static int nfs4_match_client(struct nfs_client *pos, struct nfs_client *new,
+ struct nfs_client **prev, struct nfs_net *nn)
+{
+ int status;
+
+ if (pos->rpc_ops != new->rpc_ops)
+ return 1;
+
+ if (pos->cl_minorversion != new->cl_minorversion)
+ return 1;
+
+ /* If "pos" isn't marked ready, we can't trust the
+ * remaining fields in "pos", especially the client
+ * ID and serverowner fields. Wait for CREATE_SESSION
+ * to finish. */
+ if (pos->cl_cons_state > NFS_CS_READY) {
+ /* Pin "pos" so it survives while the lock is dropped. */
+ refcount_inc(&pos->cl_count);
+ spin_unlock(&nn->nfs_client_lock);
+
+ nfs_put_client(*prev);
+ *prev = pos;
+
+ status = nfs_wait_client_init_complete(pos);
+ spin_lock(&nn->nfs_client_lock);
+
+ if (status < 0)
+ return status;
+ }
+
+ if (pos->cl_cons_state != NFS_CS_READY)
+ return 1;
+
+ if (pos->cl_clientid != new->cl_clientid)
+ return 1;
+
+ /* NFSv4.1 always uses the uniform string, however someone
+ * might switch the uniquifier string on us.
+ */
+ if (!nfs4_match_client_owner_id(pos, new))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * nfs40_walk_client_list - Find server that recognizes a client ID
+ *
+ * @new: nfs_client with client ID to test
+ * @result: OUT: found nfs_client, or new
+ * @cred: credential to use for trunking test
+ *
+ * Walks the per-net client list and sends SETCLIENTID_CONFIRM to every
+ * candidate until one succeeds. The list lock is dropped around each
+ * RPC; "prev" keeps the current entry pinned so the walk can resume.
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status.
+ * If zero is returned, an nfs_client pointer is planted in "result."
+ *
+ * NB: nfs40_walk_client_list() relies on the new nfs_client being
+ * the last nfs_client on the list.
+ */
+int nfs40_walk_client_list(struct nfs_client *new,
+			   struct nfs_client **result,
+			   const struct cred *cred)
+{
+	struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
+	struct nfs_client *pos, *prev = NULL;
+	struct nfs4_setclientid_res clid = {
+		.clientid = new->cl_clientid,
+		.confirm = new->cl_confirm,
+	};
+	int status = -NFS4ERR_STALE_CLIENTID;
+
+	spin_lock(&nn->nfs_client_lock);
+	list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+		if (pos == new)
+			goto found;
+
+		status = nfs4_match_client(pos, new, &prev, nn);
+		if (status < 0)
+			goto out_unlock;
+		if (status != 0)
+			continue;
+		/*
+		 * We just sent a new SETCLIENTID, which should have
+		 * caused the server to return a new cl_confirm. So if
+		 * cl_confirm is the same, then this is a different
+		 * server that just returned the same cl_confirm by
+		 * coincidence:
+		 */
+		if ((new != pos) && nfs4_same_verifier(&pos->cl_confirm,
+						       &new->cl_confirm))
+			continue;
+		/*
+		 * But if the cl_confirm's are different, then the only
+		 * way that a SETCLIENTID_CONFIRM to pos can succeed is
+		 * if new and pos point to the same server:
+		 */
+found:
+		/* Pin "pos" before the lock is dropped for the RPC. */
+		refcount_inc(&pos->cl_count);
+		spin_unlock(&nn->nfs_client_lock);
+
+		nfs_put_client(prev);
+		prev = pos;
+
+		status = nfs4_proc_setclientid_confirm(pos, &clid, cred);
+		switch (status) {
+		case -NFS4ERR_STALE_CLIENTID:
+			/* Not this server: keep walking the list. */
+			break;
+		case 0:
+			nfs4_swap_callback_idents(pos, new);
+			pos->cl_confirm = new->cl_confirm;
+			nfs_mark_client_ready(pos, NFS_CS_READY);
+
+			/* The reference on "pos" is handed to the caller. */
+			prev = NULL;
+			*result = pos;
+			goto out;
+		case -ERESTARTSYS:
+		case -ETIMEDOUT:
+			/* The callback path may have been inadvertently
+			 * changed. Schedule recovery!
+			 */
+			nfs4_schedule_path_down_recovery(pos);
+			/* Explicit exit instead of falling into default. */
+			goto out;
+		default:
+			goto out;
+		}
+
+		spin_lock(&nn->nfs_client_lock);
+	}
+out_unlock:
+	spin_unlock(&nn->nfs_client_lock);
+
+	/* No match found. The server lost our clientid */
+out:
+	nfs_put_client(prev);
+	return status;
+}
+
+#ifdef CONFIG_NFS_V4_1
+/*
+ * Returns true if the server major ids match, i.e. they have the same
+ * length and the same bytes.
+ */
+bool
+nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1,
+				struct nfs41_server_owner *o2)
+{
+	if (o1->major_id_sz == o2->major_id_sz)
+		return !memcmp(o1->major_id, o2->major_id, o1->major_id_sz);
+	return false;
+}
+
+/*
+ * Returns true if the server scopes match, i.e. they have the same
+ * length and the same bytes.
+ */
+static bool
+nfs4_check_server_scope(struct nfs41_server_scope *s1,
+			struct nfs41_server_scope *s2)
+{
+	if (s1->server_scope_sz == s2->server_scope_sz)
+		return !memcmp(s1->server_scope, s2->server_scope,
+			       s1->server_scope_sz);
+	return false;
+}
+
+/**
+ * nfs4_detect_session_trunking - Checks for session trunking.
+ * @clp: original mount nfs_client
+ * @res: result structure from an exchange_id using the original mount
+ * nfs_client with a new multi_addr transport
+ * @xprt: pointer to the transport to add.
+ *
+ * Called after a successful EXCHANGE_ID on a multi-addr connection.
+ * Compares the client ID, the server owner major and minor IDs and the
+ * server scope in @res with those recorded in @clp, and logs the
+ * outcome. This function only validates; @xprt is used solely for the
+ * log message, so adding the transport is presumably left to the
+ * caller - confirm.
+ *
+ * Returns zero on success, otherwise -EINVAL
+ *
+ * Note: since the exchange_id for the new multi_addr transport uses the
+ * same nfs_client from the original mount, the cl_owner_id is reused,
+ * so eir_clientowner is the same.
+ */
+int nfs4_detect_session_trunking(struct nfs_client *clp,
+ struct nfs41_exchange_id_res *res,
+ struct rpc_xprt *xprt)
+{
+ /* Check eir_clientid */
+ if (clp->cl_clientid != res->clientid)
+ goto out_err;
+
+ /* Check eir_server_owner so_major_id */
+ if (!nfs4_check_serverowner_major_id(clp->cl_serverowner,
+ res->server_owner))
+ goto out_err;
+
+ /* Check eir_server_owner so_minor_id */
+ if (clp->cl_serverowner->minor_id != res->server_owner->minor_id)
+ goto out_err;
+
+ /* Check eir_server_scope */
+ if (!nfs4_check_server_scope(clp->cl_serverscope, res->server_scope))
+ goto out_err;
+
+ pr_info("NFS: %s: Session trunking succeeded for %s\n",
+ clp->cl_hostname,
+ xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+ return 0;
+out_err:
+ pr_info("NFS: %s: Session trunking failed for %s\n", clp->cl_hostname,
+ xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+ return -EINVAL;
+}
+
+/**
+ * nfs41_walk_client_list - Find nfs_client that matches a client/server owner
+ *
+ * @new: nfs_client with client ID to test
+ * @result: OUT: found nfs_client, or new
+ * @cred: credential to use for trunking test
+ *
+ * Walks the per-net client list under nn->nfs_client_lock. The first
+ * entry that is either @new itself or passes nfs4_match_client() and
+ * has the same server owner major ID is returned with an extra
+ * reference taken on it. @cred is not used by this function.
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status.
+ * If zero is returned, an nfs_client pointer is planted in "result."
+ *
+ * NB: nfs41_walk_client_list() relies on the new nfs_client being
+ * the last nfs_client on the list.
+ */
+int nfs41_walk_client_list(struct nfs_client *new,
+ struct nfs_client **result,
+ const struct cred *cred)
+{
+ struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
+ struct nfs_client *pos, *prev = NULL;
+ int status = -NFS4ERR_STALE_CLIENTID;
+
+ spin_lock(&nn->nfs_client_lock);
+ list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+ if (pos == new)
+ goto found;
+
+ /* May drop and retake nfs_client_lock; "prev" pins the entry. */
+ status = nfs4_match_client(pos, new, &prev, nn);
+ if (status < 0)
+ goto out;
+ if (status != 0)
+ continue;
+
+ /*
+ * Note that session trunking is just a special subcase of
+ * client id trunking. In either case, we want to fall back
+ * to using the existing nfs_client.
+ */
+ if (!nfs4_check_serverowner_major_id(pos->cl_serverowner,
+ new->cl_serverowner))
+ continue;
+
+found:
+ /* This reference is handed to the caller through *result. */
+ refcount_inc(&pos->cl_count);
+ *result = pos;
+ status = 0;
+ break;
+ }
+
+out:
+ spin_unlock(&nn->nfs_client_lock);
+ nfs_put_client(prev);
+ return status;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Release the NFSv4-specific state attached to an nfs_server: return
+ * its delegations, detach the pNFS layout driver, then purge the state
+ * owners onto a private list and free them.
+ */
+static void nfs4_destroy_server(struct nfs_server *server)
+{
+	LIST_HEAD(doomed_owners);
+
+	nfs_server_return_all_delegations(server);
+	unset_pnfs_layoutdriver(server);
+
+	nfs4_purge_state_owners(server, &doomed_owners);
+	nfs4_free_state_owners(&doomed_owners);
+}
+
+/*
+ * NFSv4.0 callback thread helper
+ *
+ * Look up an nfs_client in the per-net cb_ident_idr by its callback
+ * identifier.  On a hit, cl_count is incremented under nfs_client_lock
+ * before the pointer is returned.  Returns NULL when the identifier
+ * is not present.
+ */
+struct nfs_client *
+nfs4_find_client_ident(struct net *net, int cb_ident)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct nfs_client *found;
+
+	spin_lock(&nn->nfs_client_lock);
+	found = idr_find(&nn->cb_ident_idr, cb_ident);
+	if (found != NULL)
+		refcount_inc(&found->cl_count);
+	spin_unlock(&nn->nfs_client_lock);
+
+	return found;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Common match routine for v4.0 and v4.1 callback services.
+ *
+ * Returns true when @clp is in state NFS_CS_READY or
+ * NFS_CS_SESSION_INITING, speaks NFS version 4 with the requested
+ * @minorversion, and its stored address compares equal to @addr under
+ * rpc_cmp_addr().
+ */
+static bool nfs4_cb_match_client(const struct sockaddr *addr,
+ struct nfs_client *clp, u32 minorversion)
+{
+ struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
+
+ /* Don't match clients that failed to initialise */
+ if (!(clp->cl_cons_state == NFS_CS_READY ||
+ clp->cl_cons_state == NFS_CS_SESSION_INITING))
+ return false;
+
+ /*
+ * Order the cl_cons_state read above before the field reads below.
+ * NOTE(review): presumably pairs with a write barrier where the
+ * state is published -- confirm against the writer.
+ */
+ smp_rmb();
+
+ /* Match the version and minorversion */
+ if (clp->rpc_ops->version != 4 ||
+ clp->cl_minorversion != minorversion)
+ return false;
+
+ /* Match only the IP address, not the port number */
+ return rpc_cmp_addr(addr, clap);
+}
+
+/*
+ * NFSv4.1 callback thread helper
+ * For CB_COMPOUND calls, find a client by IP address, protocol version,
+ * minorversion, and sessionID
+ *
+ * The per-net client list is scanned under nfs_client_lock.  The first
+ * entry that passes nfs4_cb_match_client(), has a session, and whose
+ * session id equals @sid is returned with cl_count incremented.
+ *
+ * Returns NULL if no such client
+ */
+struct nfs_client *
+nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
+			   struct nfs4_sessionid *sid, u32 minorversion)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct nfs_client *pos, *match = NULL;
+
+	spin_lock(&nn->nfs_client_lock);
+	list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+		/* Evaluated left to right; the memcmp needs a session */
+		if (nfs4_cb_match_client(addr, pos, minorversion) &&
+		    nfs4_has_session(pos) &&
+		    memcmp(pos->cl_session->sess_id.data,
+			   sid->data, NFS4_MAX_SESSIONID_LEN) == 0) {
+			refcount_inc(&pos->cl_count);
+			match = pos;
+			break;
+		}
+	}
+	spin_unlock(&nn->nfs_client_lock);
+
+	return match;
+}
+
+#else /* CONFIG_NFS_V4_1 */
+
+/*
+ * Variant built when CONFIG_NFS_V4_1 is not defined: it ignores its
+ * arguments and always reports that no client matches.
+ */
+struct nfs_client *
+nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
+ struct nfs4_sessionid *sid, u32 minorversion)
+{
+ return NULL;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Set up an NFS4 client
+ *
+ * Fills an nfs_client_initdata from the arguments and from @server's
+ * mount flags, options and migration status, then asks nfs_get_client()
+ * for a client.  server->port is set from @addr before the lookup.
+ *
+ * Returns 0 with server->nfs_client pointing at the client, the error
+ * encoded by nfs_get_client(), or -ELOOP (after dropping the new
+ * reference) when the lookup returned the client @server already uses.
+ */
+static int nfs4_set_client(struct nfs_server *server,
+		const char *hostname,
+		const struct sockaddr *addr,
+		const size_t addrlen,
+		const char *ip_addr,
+		int proto, const struct rpc_timeout *timeparms,
+		u32 minorversion, unsigned int nconnect,
+		struct net *net)
+{
+	struct nfs_client_initdata cl_init = {
+		.nfs_mod = &nfs_v4,
+		.net = net,
+		.hostname = hostname,
+		.addr = addr,
+		.addrlen = addrlen,
+		.ip_addr = ip_addr,
+		.proto = proto,
+		.timeparms = timeparms,
+		.minorversion = minorversion,
+		.cred = server->cred,
+	};
+	struct nfs_client *client;
+
+	/* nconnect is only passed on for TCP; otherwise it stays zero */
+	if (proto == XPRT_TRANSPORT_TCP)
+		cl_init.nconnect = nconnect;
+
+	/* Translate mount-level settings into client init flags */
+	if (minorversion == 0)
+		__set_bit(NFS_CS_REUSEPORT, &cl_init.init_flags);
+	if (server->flags & NFS_MOUNT_NORESVPORT)
+		__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+	if (server->options & NFS_OPTION_MIGRATION)
+		__set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
+	if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status))
+		__set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags);
+
+	server->port = rpc_get_port(addr);
+
+	/* Allocate or find a client reference we can use */
+	client = nfs_get_client(&cl_init);
+	if (IS_ERR(client))
+		return PTR_ERR(client);
+
+	if (client == server->nfs_client) {
+		nfs_put_client(client);
+		return -ELOOP;
+	}
+
+	/*
+	 * Query for the lease time on clientid setup or renewal
+	 *
+	 * Note that this will be set on nfs_clients that were created
+	 * only for the DS role and did not set this bit, but now will
+	 * serve a dual role.
+	 */
+	set_bit(NFS_CS_CHECK_LEASE_TIME, &client->cl_res_state);
+
+	server->nfs_client = client;
+	return 0;
+}
+
+/*
+ * Set up a pNFS Data Server client.
+ *
+ * Return any existing nfs_client that matches server address,port,version
+ * and minorversion.
+ *
+ * For a new nfs_client, use a soft mount (default), a low retrans and a
+ * low timeout interval so that if a connection is lost, we retry through
+ * the MDS.
+ *
+ * The textual form of @ds_addr is used as the hostname.  Returns the
+ * nfs_get_client() result, or ERR_PTR(-EINVAL) when @ds_addr cannot be
+ * rendered as a string.
+ */
+struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
+		const struct sockaddr *ds_addr, int ds_addrlen,
+		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
+		u32 minor_version)
+{
+	struct nfs_client *mds_clp = mds_srv->nfs_client;
+	struct rpc_timeout ds_timeout;
+	char addr_str[INET6_ADDRSTRLEN + 1];
+	/*
+	 * The cred, nodename and net namespace are taken from the MDS.  Use
+	 * the MDS nfs_client cl_ipaddr so as to use the same EXCHANGE_ID
+	 * co_ownerid as the MDS (section 13.1 RFC 5661).
+	 */
+	struct nfs_client_initdata cl_init = {
+		.nfs_mod = &nfs_v4,
+		.net = mds_clp->cl_net,
+		.addr = ds_addr,
+		.addrlen = ds_addrlen,
+		.proto = ds_proto,
+		.minorversion = minor_version,
+		.nodename = mds_clp->cl_rpcclient->cl_nodename,
+		.ip_addr = mds_clp->cl_ipaddr,
+		.timeparms = &ds_timeout,
+		.cred = mds_srv->cred,
+	};
+
+	if (rpc_ntop(ds_addr, addr_str, sizeof(addr_str)) <= 0)
+		return ERR_PTR(-EINVAL);
+	cl_init.hostname = addr_str;
+
+	/* Inherit the MDS nconnect setting, for TCP only */
+	if (ds_proto == XPRT_TRANSPORT_TCP && mds_clp->cl_nconnect > 1)
+		cl_init.nconnect = mds_clp->cl_nconnect;
+
+	if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
+		__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+
+	/* Fill in ds_timeout, which cl_init.timeparms already points at */
+	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
+
+	return nfs_get_client(&cl_init);
+}
+EXPORT_SYMBOL_GPL(nfs4_set_ds_client);
+
+/*
+ * Session has been established, and the client marked ready.
+ * Limit the mount rsize, wsize and dtsize using negotiated fore
+ * channel attributes.
+ *
+ * rsize and dtsize are capped at max_resp_sz less the read overhead;
+ * wsize is capped at max_rqst_sz less the write overhead.  Does nothing
+ * without CONFIG_NFS_V4_1 or when the client has no session.
+ */
+static void nfs4_session_limit_rwsize(struct nfs_server *server)
+{
+#ifdef CONFIG_NFS_V4_1
+	struct nfs_client *clp = server->nfs_client;
+	u32 resp_limit;
+	u32 rqst_limit;
+
+	if (!nfs4_has_session(clp))
+		return;
+
+	resp_limit = clp->cl_session->fc_attrs.max_resp_sz -
+			nfs41_maxread_overhead;
+	rqst_limit = clp->cl_session->fc_attrs.max_rqst_sz -
+			nfs41_maxwrite_overhead;
+
+	/* Values are only ever lowered, never raised */
+	if (server->rsize > resp_limit)
+		server->rsize = resp_limit;
+	if (server->dtsize > resp_limit)
+		server->dtsize = resp_limit;
+	if (server->wsize > rqst_limit)
+		server->wsize = rqst_limit;
+#endif /* CONFIG_NFS_V4_1 */
+}
+
+/*
+ * Limit xattr sizes using the channel attributes.
+ *
+ * gxasize and lxasize are capped at max_resp_sz less the getxattr and
+ * listxattrs overheads; sxasize is capped at max_rqst_sz less the
+ * setxattr overhead.  Does nothing without CONFIG_NFS_V4_2 or when the
+ * client has no session.
+ */
+static void nfs4_session_limit_xasize(struct nfs_server *server)
+{
+#ifdef CONFIG_NFS_V4_2
+	struct nfs_client *clp = server->nfs_client;
+	u32 getxattr_cap;
+	u32 setxattr_cap;
+	u32 listxattrs_cap;
+
+	if (!nfs4_has_session(clp))
+		return;
+
+	getxattr_cap = clp->cl_session->fc_attrs.max_resp_sz -
+			nfs42_maxgetxattr_overhead;
+	setxattr_cap = clp->cl_session->fc_attrs.max_rqst_sz -
+			nfs42_maxsetxattr_overhead;
+	listxattrs_cap = clp->cl_session->fc_attrs.max_resp_sz -
+			nfs42_maxlistxattrs_overhead;
+
+	/* Values are only ever lowered, never raised */
+	if (server->gxasize > getxattr_cap)
+		server->gxasize = getxattr_cap;
+	if (server->sxasize > setxattr_cap)
+		server->sxasize = setxattr_cap;
+	if (server->lxasize > listxattrs_cap)
+		server->lxasize = listxattrs_cap;
+#endif
+}
+
+/*
+ * Finish setting up an nfs_server whose nfs_client is already attached.
+ *
+ * Initialises the session, derives the capability bits, fetches the
+ * root filehandle into @mntfh, probes fsinfo, clamps the transfer and
+ * xattr sizes to the session limits, bounds namelen, and finally puts
+ * the server on the lists and installs nfs4_destroy_server as its
+ * destroy hook.  @auth_probe is passed through to nfs4_get_rootfh().
+ *
+ * Returns 0 on success, -EPROTONOSUPPORT for a DS-only client, -ENOMEM
+ * if the fattr cannot be allocated, or the negative result of the step
+ * that failed.
+ */
+static int nfs4_server_common_setup(struct nfs_server *server,
+ struct nfs_fh *mntfh, bool auth_probe)
+{
+ struct nfs_fattr *fattr;
+ int error;
+
+ /* data servers support only a subset of NFSv4.1 */
+ if (is_ds_only_client(server->nfs_client))
+ return -EPROTONOSUPPORT;
+
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ return -ENOMEM;
+
+ /* We must ensure the session is initialised first */
+ error = nfs4_init_session(server->nfs_client);
+ if (error < 0)
+ goto out;
+
+ /* Set the basic capabilities */
+ server->caps |= server->nfs_client->cl_mvops->init_caps;
+ if (server->flags & NFS_MOUNT_NORDIRPLUS)
+ server->caps &= ~NFS_CAP_READDIRPLUS;
+ /* READ_PLUS is switched off when the transport is RDMA */
+ if (server->nfs_client->cl_proto == XPRT_TRANSPORT_RDMA)
+ server->caps &= ~NFS_CAP_READ_PLUS;
+ /*
+ * Don't use NFS uid/gid mapping when nfs4_disable_idmapping is set
+ * and the RPC client's auth flavor is RPC_AUTH_UNIX.
+ */
+ if (nfs4_disable_idmapping &&
+ server->client->cl_auth->au_flavor == RPC_AUTH_UNIX)
+ server->caps |= NFS_CAP_UIDGID_NOMAP;
+
+
+ /* Probe the root fh to retrieve its FSID and filehandle */
+ error = nfs4_get_rootfh(server, mntfh, auth_probe);
+ if (error < 0)
+ goto out;
+
+ dprintk("Server FSID: %llx:%llx\n",
+ (unsigned long long) server->fsid.major,
+ (unsigned long long) server->fsid.minor);
+ nfs_display_fhandle(mntfh, "Pseudo-fs root FH");
+
+ error = nfs_probe_fsinfo(server, mntfh, fattr);
+ if (error < 0)
+ goto out;
+
+ nfs4_session_limit_rwsize(server);
+ nfs4_session_limit_xasize(server);
+
+ /* An unset or oversized namelen falls back to NFS4_MAXNAMLEN */
+ if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
+ server->namelen = NFS4_MAXNAMLEN;
+
+ nfs_server_insert_lists(server);
+ server->mount_time = jiffies;
+ server->destroy = nfs4_destroy_server;
+out:
+ /* Reached on success as well; error then holds the last result */
+ nfs_free_fattr(fattr);
+ return error;
+}
+
+/*
+ * Initialise a version 4 nfs_server from the mount context
+ *
+ * Copies flags, options and auth_info from the nfs_fs_context, picks
+ * the auth flavor to start with, attaches an nfs_client through
+ * nfs4_set_client(), applies the rsize/wsize and attribute cache
+ * settings, and creates the server's RPC client.
+ *
+ * Returns the nfs4_set_client() error if that step fails, otherwise
+ * the result of nfs_init_server_rpcclient().
+ */
+static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct rpc_timeout timeparms;
+ int error;
+
+ nfs_init_timeout_values(&timeparms, ctx->nfs_server.protocol,
+ ctx->timeo, ctx->retrans);
+
+ /* Initialise the client representation from the mount data */
+ server->flags = ctx->flags;
+ server->options = ctx->options;
+ server->auth_info = ctx->auth_info;
+
+ /* Use the first specified auth flavor. If this flavor isn't
+ * allowed by the server, use the SECINFO path to try the
+ * other specified flavors */
+ if (ctx->auth_info.flavor_len >= 1)
+ ctx->selected_flavor = ctx->auth_info.flavors[0];
+ else
+ ctx->selected_flavor = RPC_AUTH_UNIX;
+
+ /* Get a client record */
+ error = nfs4_set_client(server,
+ ctx->nfs_server.hostname,
+ &ctx->nfs_server.address,
+ ctx->nfs_server.addrlen,
+ ctx->client_address,
+ ctx->nfs_server.protocol,
+ &timeparms,
+ ctx->minorversion,
+ ctx->nfs_server.nconnect,
+ fc->net_ns);
+ if (error < 0)
+ return error;
+
+ /* A zero rsize/wsize in the context leaves the server value alone */
+ if (ctx->rsize)
+ server->rsize = nfs_block_size(ctx->rsize, NULL);
+ if (ctx->wsize)
+ server->wsize = nfs_block_size(ctx->wsize, NULL);
+
+ /* Context values are multiplied by HZ before being stored */
+ server->acregmin = ctx->acregmin * HZ;
+ server->acregmax = ctx->acregmax * HZ;
+ server->acdirmin = ctx->acdirmin * HZ;
+ server->acdirmax = ctx->acdirmax * HZ;
+ /* Replaces the port nfs4_set_client() took from the address */
+ server->port = ctx->nfs_server.port;
+
+ return nfs_init_server_rpcclient(server, &timeparms,
+ ctx->selected_flavor);
+}
+
+/*
+ * Create a version 4 volume record
+ * - keyed on server and FSID
+ *
+ * The new nfs_server takes a reference on the caller's current
+ * credentials.  Returns the server, ERR_PTR(-ENOMEM) if it cannot be
+ * allocated, or the encoded error from setup, in which case the
+ * partially built server has been freed.
+ */
+struct nfs_server *nfs4_create_server(struct fs_context *fc)
+{
+	struct nfs_fs_context *ctx = nfs_fc2context(fc);
+	struct nfs_server *server = nfs_alloc_server();
+	bool auth_probe;
+	int error;
+
+	if (server == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	server->cred = get_cred(current_cred());
+
+	/* Probe for a flavor only when the context lists none */
+	auth_probe = (ctx->auth_info.flavor_len < 1);
+
+	/* set up the general RPC client, then the v4 specifics */
+	error = nfs4_init_server(server, fc);
+	if (error >= 0)
+		error = nfs4_server_common_setup(server, ctx->mntfh,
+						 auth_probe);
+	if (error < 0) {
+		nfs_free_server(server);
+		return ERR_PTR(error);
+	}
+
+	return server;
+}
+
+/*
+ * Create an NFS4 referral server record
+ *
+ * The new nfs_server inherits its credentials, user data, timeout,
+ * minor version, nconnect and net namespace from the parent server
+ * found through ctx->clone_data.sb.  When RDMA support is built in, an
+ * RDMA client on NFS_RDMA_PORT is tried first; otherwise, or if that
+ * attempt returns non-zero, a TCP client on NFS_PORT is used.
+ *
+ * Returns the server, or an ERR_PTR() after freeing the partially
+ * built server.
+ */
+struct nfs_server *nfs4_create_referral_server(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct nfs_client *parent_client;
+ struct nfs_server *server, *parent_server;
+ bool auth_probe;
+ int error;
+
+ server = nfs_alloc_server();
+ if (!server)
+ return ERR_PTR(-ENOMEM);
+
+ parent_server = NFS_SB(ctx->clone_data.sb);
+ parent_client = parent_server->nfs_client;
+
+ server->cred = get_cred(parent_server->cred);
+
+ /* Initialise the client representation from the parent server */
+ nfs_server_copy_userdata(server, parent_server);
+
+ /* Get a client representation */
+#if IS_ENABLED(CONFIG_SUNRPC_XPRT_RDMA)
+ /* First choice: RDMA. Note this rewrites the port in the context. */
+ rpc_set_port(&ctx->nfs_server.address, NFS_RDMA_PORT);
+ error = nfs4_set_client(server,
+ ctx->nfs_server.hostname,
+ &ctx->nfs_server.address,
+ ctx->nfs_server.addrlen,
+ parent_client->cl_ipaddr,
+ XPRT_TRANSPORT_RDMA,
+ parent_server->client->cl_timeout,
+ parent_client->cl_mvops->minor_version,
+ parent_client->cl_nconnect,
+ parent_client->cl_net);
+ /* Any failure here is not fatal; fall through and try TCP */
+ if (!error)
+ goto init_server;
+#endif /* IS_ENABLED(CONFIG_SUNRPC_XPRT_RDMA) */
+
+ rpc_set_port(&ctx->nfs_server.address, NFS_PORT);
+ error = nfs4_set_client(server,
+ ctx->nfs_server.hostname,
+ &ctx->nfs_server.address,
+ ctx->nfs_server.addrlen,
+ parent_client->cl_ipaddr,
+ XPRT_TRANSPORT_TCP,
+ parent_server->client->cl_timeout,
+ parent_client->cl_mvops->minor_version,
+ parent_client->cl_nconnect,
+ parent_client->cl_net);
+ if (error < 0)
+ goto error;
+
+ /* The label only exists when the RDMA attempt above is compiled in */
+#if IS_ENABLED(CONFIG_SUNRPC_XPRT_RDMA)
+init_server:
+#endif
+ error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout,
+ ctx->selected_flavor);
+ if (error < 0)
+ goto error;
+
+ /* Probe for a flavor only when the parent server lists none */
+ auth_probe = parent_server->auth_info.flavor_len < 1;
+
+ error = nfs4_server_common_setup(server, ctx->mntfh, auth_probe);
+ if (error < 0)
+ goto error;
+
+ return server;
+
+error:
+ nfs_free_server(server);
+ return ERR_PTR(error);
+}
+
+/*
+ * Grab the destination's particulars, including lease expiry time.
+ *
+ * Runs nfs_probe_fsinfo() against the filehandle of the superblock's
+ * root inode, using a temporary fattr.
+ *
+ * Returns -ENOMEM if the fattr cannot be allocated, otherwise the
+ * nfs_probe_fsinfo() result.
+ */
+static int nfs_probe_destination(struct nfs_server *server)
+{
+	struct inode *root = d_inode(server->super->s_root);
+	struct nfs_fattr *attrs = nfs_alloc_fattr();
+	int status;
+
+	if (!attrs)
+		return -ENOMEM;
+
+	/* Sanity: the probe won't work if the destination server
+	 * does not recognize the migrated FH. */
+	status = nfs_probe_fsinfo(server, NFS_FH(root), attrs);
+	nfs_free_fattr(attrs);
+
+	return status;
+}
+
+/**
+ * nfs4_update_server - Move an nfs_server to a different nfs_client
+ *
+ * @server: represents FSID to be moved
+ * @hostname: new end-point's hostname
+ * @sap: new end-point's socket address
+ * @salen: size of "sap"
+ * @net: net namespace
+ *
+ * The nfs_server must be quiescent before this function is invoked.
+ * Either its session is drained (NFSv4.1+), or its transport is
+ * plugged and drained (NFSv4.0).
+ *
+ * The RPC transport is switched first.  The server is then taken off
+ * the lists, attached to a client for the new end-point, and put back
+ * on the lists.  Once it has been taken off, every exit path puts it
+ * back, including the failure to duplicate @hostname.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs4_update_server(struct nfs_server *server, const char *hostname,
+		       struct sockaddr *sap, size_t salen, struct net *net)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct rpc_clnt *clnt = server->client;
+	struct xprt_create xargs = {
+		.ident		= clp->cl_proto,
+		.net		= net,
+		.dstaddr	= sap,
+		.addrlen	= salen,
+		.servername	= hostname,
+	};
+	char buf[INET6_ADDRSTRLEN + 1];
+	struct sockaddr_storage address;
+	struct sockaddr *localaddr = (struct sockaddr *)&address;
+	int error;
+
+	error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout);
+	if (error != 0)
+		return error;
+
+	/* The local address, as text, becomes the new client's ip_addr */
+	error = rpc_localaddr(clnt, localaddr, sizeof(address));
+	if (error != 0)
+		return error;
+
+	if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0)
+		return -EAFNOSUPPORT;
+
+	nfs_server_remove_lists(server);
+	/* The bit is only held across the lookup; nfs4_set_client tests it */
+	set_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status);
+	error = nfs4_set_client(server, hostname, sap, salen, buf,
+				clp->cl_proto, clnt->cl_timeout,
+				clp->cl_minorversion,
+				clp->cl_nconnect, net);
+	clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status);
+	if (error != 0) {
+		nfs_server_insert_lists(server);
+		return error;
+	}
+	/* server->nfs_client now holds the new client; drop the old one */
+	nfs_put_client(clp);
+
+	if (server->nfs_client->cl_hostname == NULL) {
+		server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL);
+		if (server->nfs_client->cl_hostname == NULL)
+			error = -ENOMEM;
+	}
+
+	/*
+	 * Put the server back on the lists before reporting -ENOMEM, so
+	 * that this path leaves it listed just like the nfs4_set_client()
+	 * failure path above.
+	 */
+	nfs_server_insert_lists(server);
+	if (error != 0)
+		return error;
+
+	return nfs_probe_destination(server);
+}
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
new file mode 100644
index 000000000..70cd0d764
--- /dev/null
+++ b/fs/nfs/nfs4file.c
@@ -0,0 +1,470 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/nfs4file.c
+ *
+ * Copyright (C) 1992 Rick Sladkey
+ */
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/falloc.h>
+#include <linux/mount.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_ssc.h>
+#include "delegation.h"
+#include "internal.h"
+#include "iostat.h"
+#include "fscache.h"
+#include "pnfs.h"
+
+#include "nfstrace.h"
+
+#ifdef CONFIG_NFS_V4_2
+#include "nfs42.h"
+#endif
+
+#define NFSDBG_FACILITY NFSDBG_FILE
+
+/*
+ * ->open() for NFSv4 regular files.
+ *
+ * Allocates an open context and runs the protocol's open_context()
+ * operation against the parent directory, with O_CREAT and O_EXCL
+ * masked out.  On success the context is attached to @filp.
+ *
+ * Returns 0 on success, the nfs_check_flags() or context allocation
+ * error, the open_context() error, or -EOPENSTALE after dropping the
+ * dentry when the open failed with one of the errors listed in the
+ * switch below or yielded a different inode than the dentry's.
+ */
+static int
+nfs4_file_open(struct inode *inode, struct file *filp)
+{
+ struct nfs_open_context *ctx;
+ struct dentry *dentry = file_dentry(filp);
+ struct dentry *parent = NULL;
+ struct inode *dir;
+ unsigned openflags = filp->f_flags;
+ fmode_t f_mode;
+ struct iattr attr;
+ int err;
+
+ /*
+ * If no cached dentry exists or if it's negative, NFSv4 handled the
+ * opens in ->lookup() or ->create().
+ *
+ * We only get this far for a cached positive dentry. We skipped
+ * revalidation, so handle it here by dropping the dentry and returning
+ * -EOPENSTALE. The VFS will retry the lookup/create/open.
+ */
+
+ dprintk("NFS: open file(%pd2)\n", dentry);
+
+ err = nfs_check_flags(openflags);
+ if (err)
+ return err;
+
+ /*
+ * With both access-mode bits set (O_ACCMODE == 3), add the mode bits
+ * that flags_to_mode() derives from the open flags.
+ */
+ f_mode = filp->f_mode;
+ if ((openflags & O_ACCMODE) == 3)
+ f_mode |= flags_to_mode(openflags);
+
+ /* We can't create new files here */
+ openflags &= ~(O_CREAT|O_EXCL);
+
+ parent = dget_parent(dentry);
+ dir = d_inode(parent);
+
+ /* err is only meaningful here when ctx is an error pointer */
+ ctx = alloc_nfs_open_context(file_dentry(filp), f_mode, filp);
+ err = PTR_ERR(ctx);
+ if (IS_ERR(ctx))
+ goto out;
+
+ attr.ia_valid = ATTR_OPEN;
+ if (openflags & O_TRUNC) {
+ attr.ia_valid |= ATTR_SIZE;
+ attr.ia_size = 0;
+ /* Write back cached data before the truncating open */
+ filemap_write_and_wait(inode->i_mapping);
+ }
+
+ /* From here on "inode" is whatever the open returned */
+ inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ switch (err) {
+ default:
+ goto out_put_ctx;
+ case -ENOENT:
+ case -ESTALE:
+ case -EISDIR:
+ case -ENOTDIR:
+ case -ELOOP:
+ goto out_drop;
+ }
+ }
+ /* The open resolved to a different inode than the cached dentry */
+ if (inode != d_inode(dentry))
+ goto out_drop;
+
+ nfs_file_set_open_context(filp, ctx);
+ nfs_fscache_open_file(inode, filp);
+ err = 0;
+
+ /* Also the success path: drop the local context reference */
+out_put_ctx:
+ put_nfs_open_context(ctx);
+out:
+ dput(parent);
+ return err;
+
+out_drop:
+ d_drop(dentry);
+ err = -EOPENSTALE;
+ goto out_put_ctx;
+}
+
+/*
+ * Flush all dirty pages, and check for write errors.
+ *
+ * Files not opened for writing return 0 straight away.  Otherwise the
+ * delegation state decides between starting writeback without waiting
+ * and a full flush whose writeback error, if any, is returned.
+ */
+static int
+nfs4_file_flush(struct file *file, fl_owner_t id)
+{
+	struct inode *inode = file_inode(file);
+	errseq_t since;
+
+	dprintk("NFS: flush(%pD2)\n", file);
+	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
+
+	if (!(file->f_mode & FMODE_WRITE))
+		return 0;
+
+	/*
+	 * If we're holding a write delegation, then check if we're required
+	 * to flush the i/o on close.
+	 */
+	if (nfs4_delegation_flush_on_close(inode)) {
+		/* Flush writes to the server and return any errors */
+		since = filemap_sample_wb_err(file->f_mapping);
+		nfs_wb_all(inode);
+		return filemap_check_wb_err(file->f_mapping, since);
+	}
+
+	/* No flush required on close: just start the i/o now */
+	return filemap_fdatawrite(file->f_mapping);
+}
+
+#ifdef CONFIG_NFS_V4_2
+/*
+ * Try to offload a copy with nfs42_proc_copy().
+ *
+ * When the two files are not on the same server, a COPY_NOTIFY is sent
+ * first and its source-server information and stateid are handed to
+ * the copy.  @flags is not used here.
+ *
+ * Returns the nfs42_proc_copy() result, or:
+ *   -EXDEV       @file_in does not use nfs4_file_operations
+ *   -EOPNOTSUPP  either server lacks NFS_CAP_COPY, both files are the
+ *                same inode, an inter-server copy is too small to be
+ *                worth offloading, or COPY_NOTIFY failed
+ *   -ENOMEM      the COPY_NOTIFY result could not be allocated
+ * An -EAGAIN result restarts the attempt from the same-server check.
+ */
+static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
+				      struct file *file_out, loff_t pos_out,
+				      size_t count, unsigned int flags)
+{
+	struct nfs42_copy_notify_res *cn_resp;
+	struct nl4_server *nss;
+	nfs4_stateid *cnrs;
+	ssize_t ret;
+	bool sync = false;
+
+	/* Only offload the copy if the source is an NFSv4 file as well */
+	if (file_in->f_op != &nfs4_file_operations)
+		return -EXDEV;
+	if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY) ||
+	    !nfs_server_capable(file_inode(file_in), NFS_CAP_COPY))
+		return -EOPNOTSUPP;
+	if (file_inode(file_in) == file_inode(file_out))
+		return -EOPNOTSUPP;
+	/* If the copy size is no larger than 2 RPC payloads, make it
+	 * synchronous.
+	 */
+	if (count <= 2 * NFS_SERVER(file_inode(file_in))->rsize)
+		sync = true;
+retry:
+	/*
+	 * Every attempt starts without COPY_NOTIFY state, so a retry can
+	 * never hand pointers into an already freed result to the copy.
+	 */
+	cn_resp = NULL;
+	nss = NULL;
+	cnrs = NULL;
+
+	if (!nfs42_files_from_same_server(file_in, file_out)) {
+		/* For an inter-server copy, if the copy size is no larger
+		 * than 14 RPC payloads, fall back to a traditional copy.
+		 * There are 14 RPCs during an NFSv4.x mount between
+		 * source/dest servers.
+		 */
+		if (sync ||
+			count <= 14 * NFS_SERVER(file_inode(file_in))->rsize)
+			return -EOPNOTSUPP;
+		cn_resp = kzalloc(sizeof(struct nfs42_copy_notify_res),
+				GFP_NOFS);
+		if (unlikely(cn_resp == NULL))
+			return -ENOMEM;
+
+		ret = nfs42_proc_copy_notify(file_in, file_out, cn_resp);
+		if (ret) {
+			/* Any COPY_NOTIFY failure means "no offload" */
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
+		nss = &cn_resp->cnr_src;
+		cnrs = &cn_resp->cnr_stateid;
+	}
+	ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count,
+				nss, cnrs, sync);
+out:
+	/*
+	 * kfree(NULL) is a no-op, so free unconditionally rather than
+	 * asking the same-server question a second time.
+	 */
+	kfree(cn_resp);
+	if (ret == -EAGAIN)
+		goto retry;
+	return ret;
+}
+
+/*
+ * ->copy_file_range(): try the offloaded copy first and fall back to
+ * generic_copy_file_range() when it answers -EOPNOTSUPP or -EXDEV.
+ * Any other result is returned unchanged.
+ */
+static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
+				    struct file *file_out, loff_t pos_out,
+				    size_t count, unsigned int flags)
+{
+	ssize_t copied;
+
+	copied = __nfs4_copy_file_range(file_in, pos_in, file_out, pos_out,
+					count, flags);
+	switch (copied) {
+	case -EOPNOTSUPP:
+	case -EXDEV:
+		return generic_copy_file_range(file_in, pos_in, file_out,
+					       pos_out, count, flags);
+	default:
+		return copied;
+	}
+}
+
+/*
+ * ->llseek(): SEEK_HOLE and SEEK_DATA go to nfs42_proc_llseek() first.
+ * Every other whence value, and a -EOPNOTSUPP answer from that call,
+ * is handled by nfs_file_llseek().
+ */
+static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
+{
+	if (whence == SEEK_HOLE || whence == SEEK_DATA) {
+		loff_t pos = nfs42_proc_llseek(filep, offset, whence);
+
+		if (pos != -EOPNOTSUPP)
+			return pos;
+	}
+
+	return nfs_file_llseek(filep, offset, whence);
+}
+
+/*
+ * ->fallocate(): only regular files are accepted, and only two modes:
+ * 0 (allocate) and FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE
+ * (deallocate).  Anything else returns -EOPNOTSUPP.  The end of the
+ * range is checked with inode_newsize_ok() before the request is sent.
+ */
+static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(filep);
+	long status;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	switch (mode) {
+	case 0:
+	case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	status = inode_newsize_ok(inode, offset + len);
+	if (status < 0)
+		return status;
+
+	return (mode & FALLOC_FL_PUNCH_HOLE) ?
+		nfs42_proc_deallocate(filep, offset, len) :
+		nfs42_proc_allocate(filep, offset, len);
+}
+
+/*
+ * ->remap_file_range(): clone @count bytes from @src_file at @src_off
+ * into @dst_file at @dst_off using nfs42_proc_clone().
+ *
+ * Returns @count on success, or a negative errno:
+ *   -EOPNOTSUPP  REMAP_FILE_DEDUP was requested
+ *   -EINVAL      a flag outside REMAP_FILE_ADVISORY is set, or the range
+ *                violates the server's clone_blksize alignment
+ *   -ETXTBSY     either inode is a swapfile
+ * or the error from syncing either inode or from the clone itself.
+ */
+static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off, loff_t count,
+ unsigned int remap_flags)
+{
+ struct inode *dst_inode = file_inode(dst_file);
+ struct nfs_server *server = NFS_SERVER(dst_inode);
+ struct inode *src_inode = file_inode(src_file);
+ unsigned int bs = server->clone_blksize;
+ bool same_inode = false;
+ int ret;
+
+ /* NFS does not support deduplication. */
+ if (remap_flags & REMAP_FILE_DEDUP)
+ return -EOPNOTSUPP;
+
+ if (remap_flags & ~REMAP_FILE_ADVISORY)
+ return -EINVAL;
+
+ if (IS_SWAPFILE(dst_inode) || IS_SWAPFILE(src_inode))
+ return -ETXTBSY;
+
+ /* check alignment w.r.t. clone_blksize */
+ ret = -EINVAL;
+ if (bs) {
+ if (!IS_ALIGNED(src_off, bs) || !IS_ALIGNED(dst_off, bs))
+ goto out;
+ /* An unaligned length is only accepted if it ends at source EOF */
+ if (!IS_ALIGNED(count, bs) && i_size_read(src_inode) != (src_off + count))
+ goto out;
+ }
+
+ if (src_inode == dst_inode)
+ same_inode = true;
+
+ /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
+ /*
+ * Two distinct inodes are always locked in address order, the lower
+ * address first, so that concurrent callers agree on the order.
+ */
+ if (same_inode) {
+ inode_lock(src_inode);
+ } else if (dst_inode < src_inode) {
+ inode_lock_nested(dst_inode, I_MUTEX_PARENT);
+ inode_lock_nested(src_inode, I_MUTEX_CHILD);
+ } else {
+ inode_lock_nested(src_inode, I_MUTEX_PARENT);
+ inode_lock_nested(dst_inode, I_MUTEX_CHILD);
+ }
+
+ /* flush all pending writes on both src and dst so that server
+ * has the latest data */
+ ret = nfs_sync_inode(src_inode);
+ if (ret)
+ goto out_unlock;
+ ret = nfs_sync_inode(dst_inode);
+ if (ret)
+ goto out_unlock;
+
+ ret = nfs42_proc_clone(src_file, dst_file, src_off, dst_off, count);
+
+ /* truncate inode page cache of the dst range so that future reads can fetch
+ * new data from server */
+ if (!ret)
+ truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1);
+
+out_unlock:
+ /* Unlock in the reverse of the order used above */
+ if (same_inode) {
+ inode_unlock(src_inode);
+ } else if (dst_inode < src_inode) {
+ inode_unlock(src_inode);
+ inode_unlock(dst_inode);
+ } else {
+ inode_unlock(dst_inode);
+ inode_unlock(src_inode);
+ }
+out:
+ /* A zero ret becomes the number of bytes remapped */
+ return ret < 0 ? ret : count;
+}
+
+/*
+ * Counter used to give each pseudo file created below a distinct name;
+ * it is post-incremented without any locking in this file.
+ */
+static int read_name_gen = 1;
+#define SSC_READ_NAME_BODY "ssc_read_%d"
+
+/*
+ * Open, on the mount @ss_mnt, a read-only pseudo file for the object
+ * identified by filehandle @src_fh, and seed its open state with
+ * @stateid.
+ *
+ * Returns the new struct file on success, or an ERR_PTR():
+ *   the nfs4_proc_getattr() error,
+ *   -EBADF   the filehandle does not refer to a regular file,
+ *   -ENOMEM  the fattr or the name buffer could not be allocated,
+ *   -EINVAL  no state owner or open state could be obtained,
+ * or the error from nfs_fhget(), alloc_file_pseudo() or
+ * alloc_nfs_open_context().
+ */
+static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
+ struct nfs_fh *src_fh, nfs4_stateid *stateid)
+{
+ struct nfs_fattr *fattr = nfs_alloc_fattr();
+ struct file *filep, *res;
+ struct nfs_server *server;
+ struct inode *r_ino = NULL;
+ struct nfs_open_context *ctx;
+ struct nfs4_state_owner *sp;
+ char *read_name = NULL;
+ int len, status = 0;
+
+ server = NFS_SERVER(ss_mnt->mnt_root->d_inode);
+
+ if (!fattr)
+ return ERR_PTR(-ENOMEM);
+
+ status = nfs4_proc_getattr(server, src_fh, fattr, NULL, NULL);
+ if (status < 0) {
+ res = ERR_PTR(status);
+ goto out;
+ }
+
+ /* Only regular files are accepted */
+ if (!S_ISREG(fattr->mode)) {
+ res = ERR_PTR(-EBADF);
+ goto out;
+ }
+
+ res = ERR_PTR(-ENOMEM);
+ /* Room for the format string plus 16 bytes for the counter digits */
+ len = strlen(SSC_READ_NAME_BODY) + 16;
+ read_name = kzalloc(len, GFP_NOFS);
+ if (read_name == NULL)
+ goto out;
+ snprintf(read_name, len, SSC_READ_NAME_BODY, read_name_gen++);
+
+ r_ino = nfs_fhget(ss_mnt->mnt_root->d_inode->i_sb, src_fh, fattr,
+ NULL);
+ if (IS_ERR(r_ino)) {
+ res = ERR_CAST(r_ino);
+ goto out_free_name;
+ }
+
+ filep = alloc_file_pseudo(r_ino, ss_mnt, read_name, FMODE_READ,
+ r_ino->i_fop);
+ if (IS_ERR(filep)) {
+ res = ERR_CAST(filep);
+ /* No file took over the inode: drop it here */
+ iput(r_ino);
+ goto out_free_name;
+ }
+ filep->f_mode |= FMODE_READ;
+
+ ctx = alloc_nfs_open_context(filep->f_path.dentry, filep->f_mode,
+ filep);
+ if (IS_ERR(ctx)) {
+ res = ERR_CAST(ctx);
+ goto out_filep;
+ }
+
+ res = ERR_PTR(-EINVAL);
+ sp = nfs4_get_state_owner(server, ctx->cred, GFP_KERNEL);
+ if (sp == NULL)
+ goto out_ctx;
+
+ ctx->state = nfs4_get_open_state(r_ino, sp);
+ if (ctx->state == NULL)
+ goto out_stateowner;
+
+ /* Mark the state as a copy source and install the given stateid */
+ set_bit(NFS_SRV_SSC_COPY_STATE, &ctx->state->flags);
+ memcpy(&ctx->state->open_stateid.other, &stateid->other,
+ NFS4_STATEID_OTHER_SIZE);
+ update_open_stateid(ctx->state, stateid, NULL, filep->f_mode);
+ set_bit(NFS_OPEN_STATE, &ctx->state->flags);
+
+ /* Attach the context to the file, then drop the local reference */
+ nfs_file_set_open_context(filep, ctx);
+ put_nfs_open_context(ctx);
+
+ file_ra_state_init(&filep->f_ra, filep->f_mapping->host->i_mapping);
+ res = filep;
+ /* Common exit for success and failure: res already holds the result */
+out_free_name:
+ kfree(read_name);
+out:
+ nfs_free_fattr(fattr);
+ return res;
+ /* Error unwinding, in reverse order of acquisition */
+out_stateowner:
+ nfs4_put_state_owner(sp);
+out_ctx:
+ put_nfs_open_context(ctx);
+out_filep:
+ fput(filep);
+ goto out_free_name;
+}
+
+/*
+ * Close hook for a file opened by __nfs42_ssc_open(): clear every flag
+ * on the open state attached to the file's open context.
+ */
+static void __nfs42_ssc_close(struct file *filep)
+{
+	nfs_file_open_context(filep)->state->flags = 0;
+}
+
+/*
+ * Open/close callbacks handed to nfs42_ssc_register() and
+ * nfs42_ssc_unregister() by the wrappers in this file.
+ */
+static const struct nfs4_ssc_client_ops nfs4_ssc_clnt_ops_tbl = {
+ .sco_open = __nfs42_ssc_open,
+ .sco_close = __nfs42_ssc_close,
+};
+
+/**
+ * nfs42_ssc_register_ops - Wrapper to register NFS_V4 ops in nfs_common
+ *
+ * Passes the static nfs4_ssc_clnt_ops_tbl table to nfs42_ssc_register().
+ *
+ * Return values:
+ * None
+ */
+void nfs42_ssc_register_ops(void)
+{
+ nfs42_ssc_register(&nfs4_ssc_clnt_ops_tbl);
+}
+
+/**
+ * nfs42_ssc_unregister_ops - wrapper to un-register NFS_V4 ops in nfs_common
+ *
+ * Passes the static nfs4_ssc_clnt_ops_tbl table to nfs42_ssc_unregister().
+ *
+ * Return values:
+ * None.
+ */
+void nfs42_ssc_unregister_ops(void)
+{
+ nfs42_ssc_unregister(&nfs4_ssc_clnt_ops_tbl);
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+/*
+ * File operations for NFSv4 files.  open and flush use the NFSv4
+ * versions defined in this file; the remaining nfs_* handlers are
+ * defined elsewhere.
+ */
+const struct file_operations nfs4_file_operations = {
+ .read_iter = nfs_file_read,
+ .write_iter = nfs_file_write,
+ .mmap = nfs_file_mmap,
+ .open = nfs4_file_open,
+ .flush = nfs4_file_flush,
+ .release = nfs_file_release,
+ .fsync = nfs_file_fsync,
+ .lock = nfs_lock,
+ .flock = nfs_flock,
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
+ .check_flags = nfs_check_flags,
+ .setlease = simple_nosetlease,
+ /* The v4.2-only operations are compiled in with CONFIG_NFS_V4_2 */
+#ifdef CONFIG_NFS_V4_2
+ .copy_file_range = nfs4_copy_file_range,
+ .llseek = nfs4_file_llseek,
+ .fallocate = nfs42_fallocate,
+ .remap_file_range = nfs42_remap_file_range,
+#else
+ .llseek = nfs_file_llseek,
+#endif
+};
diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c
new file mode 100644
index 000000000..1a69479a3
--- /dev/null
+++ b/fs/nfs/nfs4getroot.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+* Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+* Written by David Howells (dhowells@redhat.com)
+*/
+
+#include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_CLIENT
+
+/*
+ * Fetch the root filehandle of @server into @mntfh and record the
+ * filesystem's FSID in server->fsid.
+ *
+ * Returns 0 on success, -ENOMEM if the fattr cannot be allocated, the
+ * negative result of nfs4_proc_get_rootfh(), or -ENOTDIR when the
+ * returned attributes lack a type or do not describe a directory.
+ */
+int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_probe)
+{
+	struct nfs_fsinfo fsinfo;
+	int ret;
+
+	fsinfo.fattr = nfs_alloc_fattr();
+	if (fsinfo.fattr == NULL) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
+	/* Start by getting the root filehandle from the server */
+	ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo, auth_probe);
+	if (ret < 0) {
+		dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret);
+	} else if ((fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE) &&
+		   S_ISDIR(fsinfo.fattr->mode)) {
+		memcpy(&server->fsid, &fsinfo.fattr->fsid,
+		       sizeof(server->fsid));
+	} else {
+		printk(KERN_ERR "nfs4_get_rootfh:"
+		       " getroot encountered non-directory\n");
+		ret = -ENOTDIR;
+	}
+
+out_free:
+	nfs_free_fattr(fsinfo.fattr);
+	return ret;
+}
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
new file mode 100644
index 000000000..ec6afd3c4
--- /dev/null
+++ b/fs/nfs/nfs4idmap.c
@@ -0,0 +1,806 @@
+/*
+ * fs/nfs/nfs4idmap.c
+ *
+ * UID and GID to name mapping for clients.
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/types.h>
+#include <linux/parser.h>
+#include <linux/fs.h>
+#include <net/net_namespace.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <keys/user-type.h>
+#include <keys/request_key_auth-type.h>
+#include <linux/module.h>
+#include <linux/user_namespace.h>
+
+#include "internal.h"
+#include "netns.h"
+#include "nfs4idmap.h"
+#include "nfs4trace.h"
+
+#define NFS_UINT_MAXLEN 11
+
+static const struct cred *id_resolver_cache;
+static struct key_type key_type_id_resolver_legacy;
+
+struct idmap_legacy_upcalldata {
+ struct rpc_pipe_msg pipe_msg; /* message queued on the idmap pipe; container_of() anchor in idmap_pipe_destroy_msg() */
+ struct idmap_msg idmap_msg; /* request handed to userspace; the downcall reply is compared against it */
+ struct key *authkey; /* reference taken with key_get() in nfs_idmap_legacy_upcall(), dropped in nfs_idmap_complete_pipe_upcall() */
+ struct idmap *idmap; /* idmap instance that issued this upcall */
+};
+
+struct idmap {
+ struct rpc_pipe_dir_object idmap_pdo; /* registered with rpc_add_pipe_dir_object() in nfs_idmap_new() */
+ struct rpc_pipe *idmap_pipe; /* pipe created by rpc_mkpipe_data() in nfs_idmap_new() */
+ struct idmap_legacy_upcalldata *idmap_upcall_data; /* pending legacy upcall or NULL; claimed with xchg()/cmpxchg() */
+ struct mutex idmap_mutex; /* held around request_key_with_auxdata() in nfs_idmap_request_key() */
+ struct user_namespace *user_ns; /* reference from get_user_ns() in nfs_idmap_new(); released with put_user_ns() */
+};
+
+/*
+ * Return the user namespace recorded in @idmap, falling back to
+ * &init_user_ns when @idmap is NULL or has no namespace set.
+ */
+static struct user_namespace *idmap_userns(const struct idmap *idmap)
+{
+	struct user_namespace *ns = &init_user_ns;
+
+	if (idmap != NULL && idmap->user_ns != NULL)
+		ns = idmap->user_ns;
+	return ns;
+}
+
+/**
+ * nfs_fattr_init_names - attach owner/group name storage to an nfs_fattr
+ * @fattr: fully initialised struct nfs_fattr
+ * @owner_name: nfs4_string recorded as fattr->owner_name
+ * @group_name: nfs4_string recorded as fattr->group_name
+ *
+ * Only the two pointers are stored; nothing is allocated or copied.
+ */
+void nfs_fattr_init_names(struct nfs_fattr *fattr,
+		struct nfs4_string *owner_name,
+		struct nfs4_string *group_name)
+{
+	fattr->group_name = group_name;
+	fattr->owner_name = owner_name;
+}
+
+/* Clear the owner-name valid bit and release the owner name buffer. */
+static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr)
+{
+	struct nfs4_string *owner = fattr->owner_name;
+
+	fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME;
+	kfree(owner->data);
+}
+
+/* Clear the group-name valid bit and release the group name buffer. */
+static void nfs_fattr_free_group_name(struct nfs_fattr *fattr)
+{
+	struct nfs4_string *group = fattr->group_name;
+
+	fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME;
+	kfree(group->data);
+}
+
+/*
+ * If @fattr carries an owner name string, try to translate it to a uid.
+ * On a successful translation the uid is stored and NFS_ATTR_FATTR_OWNER
+ * is set.
+ *
+ * Returns true when an owner name was present (whether or not it could
+ * be translated), false otherwise.
+ */
+static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+	struct nfs4_string *name;
+	kuid_t mapped;
+
+	if ((fattr->valid & NFS_ATTR_FATTR_OWNER_NAME) == 0)
+		return false;
+
+	name = fattr->owner_name;
+	if (nfs_map_name_to_uid(server, name->data, name->len, &mapped) != 0)
+		return true;
+
+	fattr->uid = mapped;
+	fattr->valid |= NFS_ATTR_FATTR_OWNER;
+	return true;
+}
+
+/*
+ * If @fattr carries a group name string, try to translate it to a gid.
+ * On a successful translation the gid is stored and NFS_ATTR_FATTR_GROUP
+ * is set.
+ *
+ * Returns true when a group name was present (whether or not it could
+ * be translated), false otherwise.
+ */
+static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+	struct nfs4_string *name;
+	kgid_t mapped;
+
+	if ((fattr->valid & NFS_ATTR_FATTR_GROUP_NAME) == 0)
+		return false;
+
+	name = fattr->group_name;
+	if (nfs_map_group_to_gid(server, name->data, name->len, &mapped) != 0)
+		return true;
+
+	fattr->gid = mapped;
+	fattr->valid |= NFS_ATTR_FATTR_GROUP;
+	return true;
+}
+
+/**
+ * nfs_fattr_free_names - release the cached NFSv4 owner and group strings
+ * @fattr: a fully initialised nfs_fattr structure
+ *
+ * For each of the owner and group name, if its valid bit is set the bit
+ * is cleared and the string buffer is freed.
+ */
+void nfs_fattr_free_names(struct nfs_fattr *fattr)
+{
+	if ((fattr->valid & NFS_ATTR_FATTR_OWNER_NAME) != 0) {
+		fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME;
+		kfree(fattr->owner_name->data);
+	}
+	if ((fattr->valid & NFS_ATTR_FATTR_GROUP_NAME) != 0) {
+		fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME;
+		kfree(fattr->group_name->data);
+	}
+}
+
+/**
+ * nfs_fattr_map_and_free_names - translate owner/group strings, then free them
+ * @server: pointer to the filesystem nfs_server structure
+ * @fattr: a fully initialised nfs_fattr structure
+ *
+ * Each name string that is present in @fattr is passed through the
+ * name-to-id mapping and its buffer is then released, regardless of
+ * whether the translation succeeded.
+ */
+void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+	bool had_name;
+
+	had_name = nfs_fattr_map_owner_name(server, fattr);
+	if (had_name)
+		nfs_fattr_free_owner_name(fattr);
+
+	had_name = nfs_fattr_map_group_name(server, fattr);
+	if (had_name)
+		nfs_fattr_free_group_name(fattr);
+}
+
+/*
+ * Try to interpret @name (not necessarily NUL-terminated, @namelen bytes)
+ * as a plain numeric id.
+ *
+ * Returns 1 and stores the value in @res on success.  Returns 0, leaving
+ * @res untouched, if the string contains '@', is 16 bytes or longer,
+ * is not accepted by kstrtoul(), or does not fit in 32 bits.
+ */
+int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
+{
+	unsigned long val;
+	char buf[16];
+
+	if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf))
+		return 0;
+	memcpy(buf, name, namelen);
+	buf[namelen] = '\0';
+	if (kstrtoul(buf, 0, &val) != 0)
+		return 0;
+	/*
+	 * unsigned long may be wider than __u32: refuse out-of-range values
+	 * instead of silently truncating them to an unrelated id.
+	 */
+	if (val != (__u32)val)
+		return 0;
+	*res = val;
+	return 1;
+}
+EXPORT_SYMBOL_GPL(nfs_map_string_to_numeric);
+
+/*
+ * Format @id as an unsigned decimal string into @buf.
+ * Returns whatever snprintf() returns for that conversion.
+ */
+static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
+{
+	int len = snprintf(buf, buflen, "%u", id);
+
+	return len;
+}
+
+/*
+ * The "id_resolver" key type.  Except for .instantiate, which uses
+ * generic_key_instantiate, every callback is the user_* implementation.
+ */
+static struct key_type key_type_id_resolver = {
+	.name		= "id_resolver",
+	.instantiate	= generic_key_instantiate,
+	.preparse	= user_preparse,
+	.free_preparse	= user_free_preparse,
+	.describe	= user_describe,
+	.read		= user_read,
+	.revoke		= user_revoke,
+	.destroy	= user_destroy,
+};
+
+/*
+ * Set up the idmapper key infrastructure: prepare a kernel credential,
+ * allocate the ".id_resolver" keyring, register both key types, and
+ * publish the credential (with the keyring installed as its thread
+ * keyring) in id_resolver_cache.
+ *
+ * Returns 0 on success or a negative errno, undoing every completed step.
+ */
+int nfs_idmap_init(void)
+{
+	struct key *cache_keyring;
+	struct cred *kcred;
+	int err;
+
+	printk(KERN_NOTICE "NFS: Registering the %s key type\n",
+		key_type_id_resolver.name);
+
+	kcred = prepare_kernel_cred(NULL);
+	if (kcred == NULL)
+		return -ENOMEM;
+
+	cache_keyring = keyring_alloc(".id_resolver",
+				      GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, kcred,
+				      (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+				      KEY_USR_VIEW | KEY_USR_READ,
+				      KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+	if (IS_ERR(cache_keyring)) {
+		err = PTR_ERR(cache_keyring);
+		goto out_put_cred;
+	}
+
+	err = register_key_type(&key_type_id_resolver);
+	if (err < 0)
+		goto out_put_keyring;
+
+	err = register_key_type(&key_type_id_resolver_legacy);
+	if (err < 0)
+		goto out_unregister;
+
+	/* Everything is in place: hand the keyring over to the credential */
+	set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &cache_keyring->flags);
+	kcred->thread_keyring = cache_keyring;
+	kcred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+	id_resolver_cache = kcred;
+	return 0;
+
+out_unregister:
+	unregister_key_type(&key_type_id_resolver);
+out_put_keyring:
+	key_put(cache_keyring);
+out_put_cred:
+	put_cred(kcred);
+	return err;
+}
+
+/*
+ * Undo nfs_idmap_init(): revoke the cache keyring, unregister both key
+ * types and drop the credential held in id_resolver_cache.
+ */
+void nfs_idmap_quit(void)
+{
+	const struct cred *cache_cred = id_resolver_cache;
+
+	key_revoke(cache_cred->thread_keyring);
+	unregister_key_type(&key_type_id_resolver);
+	unregister_key_type(&key_type_id_resolver_legacy);
+	put_cred(cache_cred);
+}
+
+/*
+ * Build the "<type>:<name>" description string passed to request_key().
+ *
+ * A buffer of typelen + namelen + 2 bytes (separator and terminating NUL
+ * included) is allocated and stored in *desc; the caller must kfree() it.
+ *
+ * Returns the size of that buffer on success.  If the allocation fails,
+ * *desc is NULL and -ENOMEM is returned.
+ */
+static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
+				const char *type, size_t typelen, char **desc)
+{
+	size_t desclen = typelen + namelen + 2;
+	char *buf = kmalloc(desclen, GFP_KERNEL);
+
+	*desc = buf;
+	if (buf == NULL)
+		return -ENOMEM;
+
+	memcpy(buf, type, typelen);
+	buf[typelen] = ':';
+	memcpy(buf + typelen + 1, name, namelen);
+	buf[typelen + 1 + namelen] = '\0';
+
+	return desclen;
+}
+
+/*
+ * Request the key described by "<type>:<name>".
+ *
+ * The "id_resolver" key type is tried first, but only when the idmap has
+ * no user namespace or uses &init_user_ns.  If that was skipped or
+ * failed, the legacy key type is requested with @idmap as auxiliary
+ * data, serialised by idmap->idmap_mutex.
+ *
+ * Returns the key, or an ERR_PTR() on failure.
+ */
+static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
+					 const char *type, struct idmap *idmap)
+{
+	struct key *rkey = ERR_PTR(-EAGAIN);
+	ssize_t desclen;
+	char *desc;
+
+	desclen = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
+	if (desclen < 0)
+		return ERR_PTR(desclen);
+
+	if (idmap->user_ns == NULL || idmap->user_ns == &init_user_ns)
+		rkey = request_key(&key_type_id_resolver, desc, "");
+
+	if (IS_ERR(rkey)) {
+		/* Fall back to the legacy pipe-based upcall */
+		mutex_lock(&idmap->idmap_mutex);
+		rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
+						desc, NULL, "", 0, idmap);
+		mutex_unlock(&idmap->idmap_mutex);
+	}
+
+	if (!IS_ERR(rkey))
+		set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags);
+
+	kfree(desc);
+	return rkey;
+}
+
+/*
+ * Look up the "<type>:<name>" key using the id_resolver_cache credentials
+ * and copy its payload into @data.
+ *
+ * Returns the number of payload bytes copied, -EINVAL if the payload is
+ * empty or larger than @data_size, or the error from the key lookup or
+ * validation.
+ * NOTE(review): a NULL payload yields PTR_ERR(NULL), so 0 is returned in
+ * that case - confirm callers expect that.
+ */
+static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
+				 const char *type, void *data,
+				 size_t data_size, struct idmap *idmap)
+{
+	const struct user_key_payload *payload;
+	const struct cred *old_cred;
+	struct key *rkey;
+	ssize_t ret;
+
+	/* Perform the request with the idmapper's own credentials */
+	old_cred = override_creds(id_resolver_cache);
+	rkey = nfs_idmap_request_key(name, namelen, type, idmap);
+	revert_creds(old_cred);
+
+	if (IS_ERR(rkey))
+		return PTR_ERR(rkey);
+
+	rcu_read_lock();
+	rkey->perm |= KEY_USR_VIEW;
+
+	ret = key_validate(rkey);
+	if (ret < 0)
+		goto out_unlock;
+
+	payload = user_key_payload_rcu(rkey);
+	if (IS_ERR_OR_NULL(payload)) {
+		ret = PTR_ERR(payload);
+		goto out_unlock;
+	}
+
+	ret = payload->datalen;
+	if (ret <= 0 || ret > data_size)
+		ret = -EINVAL;
+	else
+		memcpy(data, payload->data, ret);
+
+out_unlock:
+	rcu_read_unlock();
+	key_put(rkey);
+	return ret;
+}
+
+/*
+ * ID -> Name
+ *
+ * Format @id as a decimal string and look up the matching key of the
+ * given @type, copying its payload into @buf.
+ *
+ * Returns the result of nfs_idmap_get_key(), with every failure reported
+ * as -EINVAL.
+ */
+static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf,
+				     size_t buflen, struct idmap *idmap)
+{
+	char id_str[NFS_UINT_MAXLEN];
+	ssize_t ret;
+	int id_len;
+
+	id_len = nfs_map_numeric_to_string(id, id_str, sizeof(id_str));
+	ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap);
+
+	return ret < 0 ? -EINVAL : ret;
+}
+
+/*
+ * Name -> ID
+ *
+ * Look up the key for @name of the given @type and parse its payload as
+ * a decimal number into @id.
+ *
+ * Returns 0 on success, -EINVAL if no usable payload was obtained, or
+ * the error returned by kstrtol().
+ */
+static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *type,
+			       __u32 *id, struct idmap *idmap)
+{
+	/* One spare byte so the payload can always be NUL-terminated */
+	char id_str[NFS_UINT_MAXLEN + 1];
+	long id_long;
+	ssize_t data_size;
+	int ret = 0;
+
+	data_size = nfs_idmap_get_key(name, namelen, type, id_str,
+				      NFS_UINT_MAXLEN, idmap);
+	if (data_size <= 0) {
+		ret = -EINVAL;
+	} else {
+		/*
+		 * The payload is copied verbatim and need not contain a NUL;
+		 * terminate it so kstrtol() cannot read past the buffer.
+		 * nfs_idmap_get_key() never returns more than the size
+		 * passed in, so the index is within id_str.
+		 */
+		id_str[data_size] = '\0';
+		ret = kstrtol(id_str, 10, &id_long);
+		if (!ret)
+			*id = (__u32)id_long;
+	}
+	return ret;
+}
+
+/* idmap classic begins here */
+
+enum { /* token ids matched through nfs_idmap_tokens; Opt_find_err belongs to the table's NULL-pattern entry */
+ Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err
+};
+
+static const match_table_t nfs_idmap_tokens = { /* key description patterns parsed by nfs_idmap_prepare_message() */
+ { Opt_find_uid, "uid:%s" }, /* user name -> id (IDMAP_CONV_NAMETOID, IDMAP_TYPE_USER) */
+ { Opt_find_gid, "gid:%s" }, /* group name -> id (IDMAP_CONV_NAMETOID, IDMAP_TYPE_GROUP) */
+ { Opt_find_user, "user:%s" }, /* user id -> name (IDMAP_CONV_IDTONAME, IDMAP_TYPE_USER) */
+ { Opt_find_group, "group:%s" }, /* group id -> name (IDMAP_CONV_IDTONAME, IDMAP_TYPE_GROUP) */
+ { Opt_find_err, NULL } /* NULL pattern: last entry of the table */
+};
+
+static int nfs_idmap_legacy_upcall(struct key *, void *);
+static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
+ size_t);
+static void idmap_release_pipe(struct inode *);
+static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
+
+/* Pipe callbacks handed to rpc_mkpipe_data() for the idmap pipe. */
+static const struct rpc_pipe_ops idmap_upcall_ops = {
+	.upcall		= rpc_pipe_generic_upcall,
+	.downcall	= idmap_pipe_downcall,
+	.destroy_msg	= idmap_pipe_destroy_msg,
+	.release_pipe	= idmap_release_pipe,
+};
+
+/*
+ * The "id_legacy" key type.  It mirrors key_type_id_resolver but adds a
+ * .request_key callback, nfs_idmap_legacy_upcall().
+ */
+static struct key_type key_type_id_resolver_legacy = {
+	.name		= "id_legacy",
+	.request_key	= nfs_idmap_legacy_upcall,
+	.instantiate	= generic_key_instantiate,
+	.preparse	= user_preparse,
+	.free_preparse	= user_free_preparse,
+	.describe	= user_describe,
+	.read		= user_read,
+	.revoke		= user_revoke,
+	.destroy	= user_destroy,
+};
+
+/*
+ * Pipe dir object .destroy callback: unlink the idmap pipe's dentry, if
+ * it has one, and forget it.
+ */
+static void nfs_idmap_pipe_destroy(struct dentry *dir,
+		struct rpc_pipe_dir_object *pdo)
+{
+	struct idmap *idmap = pdo->pdo_data;
+	struct rpc_pipe *pipe = idmap->idmap_pipe;
+
+	if (!pipe->dentry)
+		return;
+
+	rpc_unlink(pipe->dentry);
+	pipe->dentry = NULL;
+}
+
+/*
+ * Pipe dir object .create callback: create the "idmap" pipe dentry under
+ * @dir and remember it in the pipe.
+ *
+ * Returns 0 on success or the error from rpc_mkpipe_dentry().
+ */
+static int nfs_idmap_pipe_create(struct dentry *dir,
+		struct rpc_pipe_dir_object *pdo)
+{
+	struct idmap *idmap = pdo->pdo_data;
+	struct rpc_pipe *pipe = idmap->idmap_pipe;
+	struct dentry *pipe_dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe);
+
+	if (IS_ERR(pipe_dentry))
+		return PTR_ERR(pipe_dentry);
+
+	pipe->dentry = pipe_dentry;
+	return 0;
+}
+
+/* Callbacks registered through rpc_init_pipe_dir_object() in nfs_idmap_new(). */
+static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = {
+	.destroy	= nfs_idmap_pipe_destroy,
+	.create		= nfs_idmap_pipe_create,
+};
+
+/*
+ * Allocate and register the idmap for @clp: take a reference on the RPC
+ * client credential's user namespace, create the upcall pipe and add the
+ * pipe dir object to the client's pipedir objects.
+ *
+ * Returns 0 with clp->cl_idmap set, or a negative errno with everything
+ * released again.
+ */
+int
+nfs_idmap_new(struct nfs_client *clp)
+{
+	struct rpc_pipe *pipe;
+	struct idmap *idmap;
+	int error;
+
+	idmap = kzalloc(sizeof(*idmap), GFP_KERNEL);
+	if (!idmap)
+		return -ENOMEM;
+
+	mutex_init(&idmap->idmap_mutex);
+	idmap->user_ns = get_user_ns(clp->cl_rpcclient->cl_cred->user_ns);
+
+	rpc_init_pipe_dir_object(&idmap->idmap_pdo,
+				 &nfs_idmap_pipe_dir_object_ops,
+				 idmap);
+
+	pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0);
+	if (IS_ERR(pipe)) {
+		error = PTR_ERR(pipe);
+		goto out_free;
+	}
+	idmap->idmap_pipe = pipe;
+
+	error = rpc_add_pipe_dir_object(clp->cl_net,
+					&clp->cl_rpcclient->cl_pipedir_objects,
+					&idmap->idmap_pdo);
+	if (error)
+		goto out_destroy_pipe;
+
+	clp->cl_idmap = idmap;
+	return 0;
+
+out_destroy_pipe:
+	rpc_destroy_pipe_data(idmap->idmap_pipe);
+out_free:
+	put_user_ns(idmap->user_ns);
+	kfree(idmap);
+	return error;
+}
+
+/*
+ * Detach and free the idmap of @clp, if it has one: remove the pipe dir
+ * object, destroy the pipe, drop the user namespace reference and free
+ * the structure.
+ */
+void
+nfs_idmap_delete(struct nfs_client *clp)
+{
+	struct idmap *idmap = clp->cl_idmap;
+
+	if (idmap == NULL)
+		return;
+
+	clp->cl_idmap = NULL;
+	rpc_remove_pipe_dir_object(clp->cl_net,
+				   &clp->cl_rpcclient->cl_pipedir_objects,
+				   &idmap->idmap_pdo);
+	rpc_destroy_pipe_data(idmap->idmap_pipe);
+	put_user_ns(idmap->user_ns);
+	kfree(idmap);
+}
+
+/*
+ * Parse the key description @desc and fill in the idmap message @im and
+ * the pipe message @msg accordingly.
+ *
+ * "uid:"/"gid:" descriptions become name-to-id requests carrying the
+ * name; "user:"/"group:" descriptions become id-to-name requests carrying
+ * the parsed id.  @msg is only set up when parsing succeeds.
+ *
+ * Returns the match_strlcpy() result for name-to-id requests, 0 for
+ * id-to-name requests, the match_int() error, or -EINVAL for an
+ * unrecognised description.
+ */
+static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
+				     struct idmap_msg *im,
+				     struct rpc_pipe_msg *msg)
+{
+	substring_t substr;
+	int token, ret;
+
+	/* Group is the default; the user variants override it below */
+	im->im_type = IDMAP_TYPE_GROUP;
+	token = match_token(desc, nfs_idmap_tokens, &substr);
+
+	if (token == Opt_find_uid || token == Opt_find_user)
+		im->im_type = IDMAP_TYPE_USER;
+
+	switch (token) {
+	case Opt_find_uid:
+	case Opt_find_gid:
+		im->im_conv = IDMAP_CONV_NAMETOID;
+		ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ);
+		break;
+
+	case Opt_find_user:
+	case Opt_find_group:
+		im->im_conv = IDMAP_CONV_IDTONAME;
+		ret = match_int(&substr, &im->im_id);
+		if (ret)
+			return ret;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	msg->data = im;
+	msg->len = sizeof(struct idmap_msg);
+	return ret;
+}
+
+/*
+ * Record @data as the idmap's pending upcall.  Warns once and returns
+ * false if another upcall is already recorded; returns true otherwise.
+ */
+static bool
+nfs_idmap_prepare_pipe_upcall(struct idmap *idmap,
+		struct idmap_legacy_upcalldata *data)
+{
+	if (WARN_ON_ONCE(idmap->idmap_upcall_data != NULL))
+		return false;
+
+	idmap->idmap_upcall_data = data;
+	return true;
+}
+
+/*
+ * Finish an upcall with result @ret: complete the key request, drop the
+ * authkey reference held by @data and free @data.
+ */
+static void nfs_idmap_complete_pipe_upcall(struct idmap_legacy_upcalldata *data,
+					   int ret)
+{
+	struct key *authkey = data->authkey;
+
+	complete_request_key(authkey, ret);
+	key_put(authkey);
+	kfree(data);
+}
+
+/*
+ * Complete @data with @ret, but only if it is still the idmap's pending
+ * upcall; the cmpxchg() ensures that whoever clears the pointer is the
+ * one that completes the upcall.
+ */
+static void nfs_idmap_abort_pipe_upcall(struct idmap *idmap,
+					struct idmap_legacy_upcalldata *data,
+					int ret)
+{
+	struct idmap_legacy_upcalldata *pending;
+
+	pending = cmpxchg(&idmap->idmap_upcall_data, data, NULL);
+	if (pending != data)
+		return;
+
+	nfs_idmap_complete_pipe_upcall(data, ret);
+}
+
+/*
+ * .request_key callback of the "id_legacy" key type.
+ *
+ * Builds an idmap message from the target key's description and queues
+ * it on the idmap pipe passed in through @aux.  On success the request
+ * is completed later, from the pipe callbacks; on any failure here the
+ * key request is completed with the error before returning.
+ *
+ * Returns the result of rpc_queue_upcall() or a negative errno.
+ */
+static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux)
+{
+	struct idmap_legacy_upcalldata *data;
+	struct request_key_auth *rka = get_request_key_auth(authkey);
+	struct rpc_pipe_msg *msg;
+	struct idmap_msg *im;
+	struct idmap *idmap = (struct idmap *)aux;
+	struct key *key = rka->target_key;
+	int ret = -ENOKEY;
+
+	if (!aux)
+		goto out1;
+
+	/*
+	 * msg and im are embedded in data, which is freed by
+	 * nfs_idmap_complete_pipe_upcall() once the upcall is queued.
+	 */
+	ret = -ENOMEM;
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		goto out1;
+
+	msg = &data->pipe_msg;
+	im = &data->idmap_msg;
+	data->idmap = idmap;
+	data->authkey = key_get(authkey);
+
+	ret = nfs_idmap_prepare_message(key->description, idmap, im, msg);
+	if (ret < 0)
+		goto out2;
+
+	ret = -EAGAIN;
+	if (!nfs_idmap_prepare_pipe_upcall(idmap, data))
+		goto out2;
+
+	ret = rpc_queue_upcall(idmap->idmap_pipe, msg);
+	if (ret < 0)
+		nfs_idmap_abort_pipe_upcall(idmap, data, ret);
+
+	return ret;
+out2:
+	/* Drop the reference taken by key_get() above before discarding data */
+	key_put(data->authkey);
+	kfree(data);
+out1:
+	complete_request_key(authkey, ret);
+	return ret;
+}
+
+/*
+ * Instantiate @key with @data and link it into the thread keyring of the
+ * id_resolver_cache credential.
+ */
+static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen)
+{
+	struct key *cache_keyring = id_resolver_cache->thread_keyring;
+
+	return key_instantiate_and_link(key, data, datalen,
+					cache_keyring, authkey);
+}
+
+/*
+ * Check that the reply @im answers the request @upcall and, if so,
+ * instantiate @key with the answer.
+ *
+ * Returns -ENOKEY if type, conversion, name or id do not match the
+ * request, -EINVAL for an unknown conversion, and otherwise the result
+ * of nfs_idmap_instantiate().
+ */
+static int nfs_idmap_read_and_verify_message(struct idmap_msg *im,
+		struct idmap_msg *upcall,
+		struct key *key, struct key *authkey)
+{
+	char id_str[NFS_UINT_MAXLEN];
+	size_t len;
+
+	if (upcall->im_type != im->im_type || upcall->im_conv != im->im_conv)
+		return -ENOKEY;
+
+	switch (im->im_conv) {
+	case IDMAP_CONV_NAMETOID:
+		if (strcmp(upcall->im_name, im->im_name) != 0)
+			return -ENOKEY;
+		/* The payload includes the terminating NUL */
+		len = 1 + nfs_map_numeric_to_string(im->im_id, id_str,
+						    sizeof(id_str));
+		return nfs_idmap_instantiate(key, authkey, id_str, len);
+
+	case IDMAP_CONV_IDTONAME:
+		if (upcall->im_id != im->im_id)
+			return -ENOKEY;
+		len = strlen(im->im_name);
+		return nfs_idmap_instantiate(key, authkey, im->im_name, len);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Pipe .downcall callback: consume the reply written to the idmap pipe.
+ *
+ * The pending upcall is claimed first; without one -ENOKEY is returned.
+ * The reply must be exactly one struct idmap_msg, flagged successful,
+ * with a non-empty NUL-terminated name, and must match the request.
+ * Whatever the outcome, the claimed upcall is completed with the result.
+ *
+ * Returns @mlen on success or a negative errno.
+ */
+static ssize_t
+idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
+{
+	struct rpc_inode *rpci = RPC_I(file_inode(filp));
+	struct idmap *idmap = (struct idmap *)rpci->private;
+	struct idmap_legacy_upcalldata *data;
+	struct request_key_auth *rka;
+	struct key *authkey;
+	struct idmap_msg im;
+	size_t reply_namelen;
+	int ret;
+
+	/*
+	 * Take ownership of the pending upcall.  Completing it below frees
+	 * data, so nothing may touch it after that point.
+	 */
+	data = xchg(&idmap->idmap_upcall_data, NULL);
+	if (!data)
+		return -ENOKEY;
+
+	authkey = data->authkey;
+	rka = get_request_key_auth(authkey);
+
+	if (mlen != sizeof(im)) {
+		ret = -ENOSPC;
+		goto out_complete;
+	}
+
+	if (copy_from_user(&im, src, mlen) != 0) {
+		ret = -EFAULT;
+		goto out_complete;
+	}
+
+	if ((im.im_status & IDMAP_STATUS_SUCCESS) == 0) {
+		ret = -ENOKEY;
+		goto out_complete;
+	}
+
+	reply_namelen = strnlen(im.im_name, IDMAP_NAMESZ);
+	if (reply_namelen == 0 || reply_namelen == IDMAP_NAMESZ) {
+		ret = -EINVAL;
+		goto out_complete;
+	}
+
+	ret = nfs_idmap_read_and_verify_message(&im, &data->idmap_msg,
+						rka->target_key, authkey);
+	if (ret >= 0) {
+		key_set_timeout(rka->target_key, nfs_idmap_cache_timeout);
+		ret = mlen;
+	}
+
+out_complete:
+	nfs_idmap_complete_pipe_upcall(data, ret);
+	return ret;
+}
+
+/*
+ * Pipe .destroy_msg callback: if the message carries an error, abort the
+ * upcall it belongs to with that error.
+ */
+static void
+idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+	struct idmap_legacy_upcalldata *data;
+	struct idmap *idmap;
+
+	data = container_of(msg, struct idmap_legacy_upcalldata, pipe_msg);
+	idmap = data->idmap;
+
+	if (msg->errno == 0)
+		return;
+
+	nfs_idmap_abort_pipe_upcall(idmap, data, msg->errno);
+}
+
+/*
+ * Pipe .release_pipe callback: claim any pending upcall and complete it
+ * with -EPIPE.
+ */
+static void
+idmap_release_pipe(struct inode *inode)
+{
+	struct rpc_inode *rpci = RPC_I(inode);
+	struct idmap *idmap = (struct idmap *)rpci->private;
+	struct idmap_legacy_upcalldata *pending;
+
+	pending = xchg(&idmap->idmap_upcall_data, NULL);
+	if (pending == NULL)
+		return;
+
+	nfs_idmap_complete_pipe_upcall(pending, -EPIPE);
+}
+
+/*
+ * Translate an owner name to a kuid.  A purely numeric name is used
+ * directly; anything else goes through the idmapper "uid" lookup.  The
+ * id is then converted in the idmap's user namespace.
+ *
+ * Returns 0 on success, the lookup error, or -ERANGE if the id has no
+ * valid mapping in that namespace.
+ */
+int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid)
+{
+	struct idmap *idmap = server->nfs_client->cl_idmap;
+	__u32 id = -1;
+	int ret;
+
+	if (nfs_map_string_to_numeric(name, namelen, &id))
+		ret = 0;
+	else
+		ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap);
+
+	if (!ret) {
+		*uid = make_kuid(idmap_userns(idmap), id);
+		if (!uid_valid(*uid))
+			ret = -ERANGE;
+	}
+
+	trace_nfs4_map_name_to_uid(name, namelen, id, ret);
+	return ret;
+}
+
+/*
+ * Translate a group name to a kgid.  A purely numeric name is used
+ * directly; anything else goes through the idmapper "gid" lookup.  The
+ * id is then converted in the idmap's user namespace.
+ *
+ * Returns 0 on success, the lookup error, or -ERANGE if the id has no
+ * valid mapping in that namespace.
+ */
+int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, kgid_t *gid)
+{
+	struct idmap *idmap = server->nfs_client->cl_idmap;
+	__u32 id = -1;
+	int ret;
+
+	if (nfs_map_string_to_numeric(name, namelen, &id))
+		ret = 0;
+	else
+		ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap);
+
+	if (!ret) {
+		*gid = make_kgid(idmap_userns(idmap), id);
+		if (!gid_valid(*gid))
+			ret = -ERANGE;
+	}
+
+	trace_nfs4_map_group_to_gid(name, namelen, id, ret);
+	return ret;
+}
+
+/*
+ * Translate @uid to a name in @buf.  Unless the server has
+ * NFS_CAP_UIDGID_NOMAP set, the idmapper "user" lookup is tried first;
+ * if it is skipped or fails, the numeric id is written as a decimal
+ * string instead.
+ *
+ * Returns the length reported by whichever conversion produced the name.
+ */
+int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, size_t buflen)
+{
+	struct idmap *idmap = server->nfs_client->cl_idmap;
+	__u32 id = from_kuid_munged(idmap_userns(idmap), uid);
+	int ret;
+
+	if (server->caps & NFS_CAP_UIDGID_NOMAP)
+		ret = -EINVAL;
+	else
+		ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap);
+
+	if (ret < 0)
+		ret = nfs_map_numeric_to_string(id, buf, buflen);
+
+	trace_nfs4_map_uid_to_name(buf, ret, id, ret);
+	return ret;
+}
+/*
+ * Translate @gid to a name in @buf.  Unless the server has
+ * NFS_CAP_UIDGID_NOMAP set, the idmapper "group" lookup is tried first;
+ * if it is skipped or fails, the numeric id is written as a decimal
+ * string instead.
+ *
+ * Returns the length reported by whichever conversion produced the name.
+ */
+int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen)
+{
+	struct idmap *idmap = server->nfs_client->cl_idmap;
+	__u32 id = from_kgid_munged(idmap_userns(idmap), gid);
+	int ret;
+
+	if (server->caps & NFS_CAP_UIDGID_NOMAP)
+		ret = -EINVAL;
+	else
+		ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap);
+
+	if (ret < 0)
+		ret = nfs_map_numeric_to_string(id, buf, buflen);
+
+	trace_nfs4_map_gid_to_group(buf, ret, id, ret);
+	return ret;
+}
diff --git a/fs/nfs/nfs4idmap.h b/fs/nfs/nfs4idmap.h
new file mode 100644
index 000000000..de44d7330
--- /dev/null
+++ b/fs/nfs/nfs4idmap.h
@@ -0,0 +1,68 @@
+/*
+ * fs/nfs/nfs4idmap.h
+ *
+ * UID and GID to name mapping for clients.
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef NFS_IDMAP_H
+#define NFS_IDMAP_H
+
+#include <linux/uidgid.h>
+#include <uapi/linux/nfs_idmap.h>
+
+
+/* Forward declaration to make this header independent of others */
+struct nfs_client;
+struct nfs_server;
+struct nfs_fattr;
+struct nfs4_string;
+
+int nfs_idmap_init(void);
+void nfs_idmap_quit(void);
+int nfs_idmap_new(struct nfs_client *);
+void nfs_idmap_delete(struct nfs_client *);
+
+void nfs_fattr_init_names(struct nfs_fattr *fattr,
+ struct nfs4_string *owner_name,
+ struct nfs4_string *group_name);
+void nfs_fattr_free_names(struct nfs_fattr *);
+void nfs_fattr_map_and_free_names(struct nfs_server *, struct nfs_fattr *);
+
+int nfs_map_name_to_uid(const struct nfs_server *, const char *, size_t, kuid_t *);
+int nfs_map_group_to_gid(const struct nfs_server *, const char *, size_t, kgid_t *);
+int nfs_map_uid_to_name(const struct nfs_server *, kuid_t, char *, size_t);
+int nfs_map_gid_to_group(const struct nfs_server *, kgid_t, char *, size_t);
+
+int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res);
+
+extern unsigned int nfs_idmap_cache_timeout;
+#endif /* NFS_IDMAP_H */
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
new file mode 100644
index 000000000..3680c8da5
--- /dev/null
+++ b/fs/nfs/nfs4namespace.c
@@ -0,0 +1,572 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/nfs4namespace.c
+ *
+ * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
+ * - Modified by David Howells <dhowells@redhat.com>
+ *
+ * NFSv4 namespace
+ */
+
+#include <linux/module.h>
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include "internal.h"
+#include "nfs4_fs.h"
+#include "nfs.h"
+#include "dns_resolve.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+/*
+ * Compute how long an NFSv4 pathname is when written as a posix path:
+ * every component contributes a leading '/' plus its own length, and no
+ * trailing slash is counted.
+ *
+ * Returns that length, or -ENAMETOOLONG if a component exceeds NAME_MAX
+ * or the running total exceeds PATH_MAX.
+ */
+static ssize_t nfs4_pathname_len(const struct nfs4_pathname *pathname)
+{
+	ssize_t total = 0;
+	int n;
+
+	for (n = 0; n < pathname->ncomponents; n++) {
+		const struct nfs4_string *component = &pathname->components[n];
+
+		if (component->len > NAME_MAX)
+			return -ENAMETOOLONG;
+
+		total += 1 + component->len;	/* "/" + component */
+		if (total > PATH_MAX)
+			return -ENAMETOOLONG;
+	}
+
+	return total;
+}
+
+/*
+ * Render the NFSv4 pathname components as a newly allocated posix path
+ * ("/a/b/c").  The length, without the terminating NUL, is stored in
+ * *_len before the buffer is allocated.
+ *
+ * Returns the string (to be released with kfree()), or an ERR_PTR()
+ * carrying the nfs4_pathname_len() error or -ENOMEM.
+ */
+static char *nfs4_pathname_string(const struct nfs4_pathname *pathname,
+				  unsigned short *_len)
+{
+	ssize_t pathlen = nfs4_pathname_len(pathname);
+	char *str, *cursor;
+	int n;
+
+	if (pathlen < 0)
+		return ERR_PTR(pathlen);
+	*_len = pathlen;
+
+	str = kmalloc(pathlen + 1, GFP_KERNEL);
+	if (str == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	cursor = str;
+	for (n = 0; n < pathname->ncomponents; n++) {
+		const struct nfs4_string *component = &pathname->components[n];
+
+		*cursor++ = '/';
+		memcpy(cursor, component->data, component->len);
+		cursor += component->len;
+	}
+	*cursor = 0;
+
+	return str;
+}
+
+/*
+ * Locate the path part of a "<server>:<path>" string.
+ *
+ * nfspath - the "<server>:<path>" string
+ * end     - one past the last char that could contain "<server>:"
+ *
+ * A server written as a '['...']' escaped IPv6 address must be followed
+ * directly by ':'; otherwise the string is split at its first ':'.  The
+ * separator has to lie before @end.
+ *
+ * Returns a pointer to the character after the separator, or NULL if no
+ * acceptable separator is found.
+ */
+static char *nfs_path_component(const char *nfspath, const char *end)
+{
+	char *sep;
+
+	if (*nfspath != '[') {
+		/* plain host name: the first colon is the separator */
+		sep = strchr(nfspath, ':');
+		if (sep == NULL || sep >= end)
+			return NULL;
+		return sep + 1;
+	}
+
+	/* bracketed IPv6 address: the colon must follow the ']' */
+	sep = strchr(nfspath, ']');
+	if (sep == NULL)
+		return NULL;
+	sep++;
+	if (sep >= end || *sep != ':')
+		return NULL;
+	return sep + 1;
+}
+
+/*
+ * Determine the mount path of @dentry as a string.
+ *
+ * The canonical path is built by nfs_path() in @buffer; when it contains
+ * a "<server>:" prefix, the pointer past that prefix is returned,
+ * otherwise the nfs_path() result (possibly an ERR_PTR()) is returned
+ * unchanged.
+ */
+static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen)
+{
+	char *limit;
+	char *full;
+	char *component;
+
+	full = nfs_path(&limit, dentry, buffer, buflen, NFS_PATH_CANONICAL);
+	if (IS_ERR(full))
+		return full;
+
+	component = nfs_path_component(full, limit);
+	return component ? component : full;
+}
+
+/*
+ * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we
+ * believe to be the server path to this dentry.
+ *
+ * Returns 0 if the path of @dentry starts with the fs_path from
+ * @locations, -ENOENT if it does not, -ENOMEM on allocation failure, or
+ * the error from building either path string.
+ */
+static int nfs4_validate_fspath(struct dentry *dentry,
+				const struct nfs4_fs_locations *locations,
+				struct nfs_fs_context *ctx)
+{
+	const char *path;
+	char *fs_path;
+	unsigned short len;
+	char *buf;
+	int ret = 0;
+
+	buf = kmalloc(4096, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	path = nfs4_path(dentry, buf, 4096);
+	if (IS_ERR(path)) {
+		ret = PTR_ERR(path);
+		goto out_free_buf;
+	}
+
+	fs_path = nfs4_pathname_string(&locations->fs_path, &len);
+	if (IS_ERR(fs_path)) {
+		ret = PTR_ERR(fs_path);
+		goto out_free_buf;
+	}
+
+	/*
+	 * path is produced from buf, so the message below must be printed
+	 * before buf is released.
+	 */
+	if (strncmp(path, fs_path, len) != 0) {
+		dprintk("%s: path %s does not begin with fsroot %s\n",
+			__func__, path, ctx->nfs_server.export_path);
+		ret = -ENOENT;
+	}
+
+	kfree(fs_path);
+out_free_buf:
+	kfree(buf);
+	return ret;
+}
+
+/*
+ * Turn the server name in @string into a socket address in @sa.
+ *
+ * The name is tried with rpc_pton() first; if that succeeds and @port is
+ * non-zero, the port is set on the address.  Otherwise it is tried with
+ * rpc_uaddr2sockaddr(), and finally resolved via nfs_dns_resolve_name().
+ *
+ * Returns the value produced by the conversion that succeeded, or 0 if
+ * none of them did.
+ */
+size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa,
+			     size_t salen, struct net *net, int port)
+{
+	ssize_t ret;
+
+	ret = rpc_pton(net, string, len, sa, salen);
+	if (ret != 0) {
+		if (port)
+			rpc_set_port(sa, port);
+		return ret;
+	}
+
+	ret = rpc_uaddr2sockaddr(net, string, len, sa, salen);
+	if (ret != 0)
+		return ret;
+
+	ret = nfs_dns_resolve_name(net, string, len, sa, salen);
+	if (ret < 0)
+		ret = 0;
+	return ret;
+}
+
+/**
+ * nfs_find_best_sec - Find a security mechanism supported locally
+ * @clnt: pointer to rpc_clnt
+ * @server: NFS server struct
+ * @flavors: List of security tuples returned by SECINFO procedure
+ *
+ * Walk "flavors" in the order given and return an rpc client for the
+ * first entry that is one of RPC_AUTH_NULL, RPC_AUTH_UNIX or
+ * RPC_AUTH_GSS, maps to a pseudoflavor accepted by the server's
+ * auth_info, can be used to clone @clnt, and for which a credential can
+ * be looked up.
+ *
+ * Return -EPERM if no matching flavor is found in the array.
+ *
+ * Please call rpc_shutdown_client() when you are done with this rpc client.
+ *
+ */
+static struct rpc_clnt *nfs_find_best_sec(struct rpc_clnt *clnt,
+					  struct nfs_server *server,
+					  struct nfs4_secinfo_flavors *flavors)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < flavors->num_flavors; idx++) {
+		struct nfs4_secinfo4 *secinfo = &flavors->flavors[idx];
+		rpc_authflavor_t pflavor;
+		struct rpc_clnt *new;
+		struct rpc_cred *cred;
+
+		switch (secinfo->flavor) {
+		case RPC_AUTH_NULL:
+		case RPC_AUTH_UNIX:
+		case RPC_AUTH_GSS:
+			break;
+		default:
+			continue;
+		}
+
+		pflavor = rpcauth_get_pseudoflavor(secinfo->flavor,
+						   &secinfo->flavor_info);
+		/* does the pseudoflavor match a sec= mount opt? */
+		if (pflavor == RPC_AUTH_MAXFLAVOR ||
+		    !nfs_auth_info_match(&server->auth_info, pflavor))
+			continue;
+
+		/* Cloning creates an rpc_auth for the flavor */
+		new = rpc_clone_client_set_auth(clnt, pflavor);
+		if (IS_ERR(new))
+			continue;
+
+		/*
+		 * Check that the user actually can use the flavor.  This
+		 * is mostly for RPC_AUTH_GSS where cr_init obtains a gss
+		 * context.
+		 */
+		cred = rpcauth_lookupcred(new->cl_auth, 0);
+		if (IS_ERR(cred)) {
+			rpc_shutdown_client(new);
+			continue;
+		}
+		put_rpccred(cred);
+		return new;
+	}
+
+	return ERR_PTR(-EPERM);
+}
+
+/**
+ * nfs4_negotiate_security - in response to an NFS4ERR_WRONGSEC on lookup,
+ * return an rpc_clnt that uses the best available security flavor with
+ * respect to the secinfo flavor list and the sec= mount options.
+ *
+ * @clnt: RPC client to clone
+ * @inode: directory inode
+ * @name: lookup name
+ *
+ * The flavor list is fetched with nfs4_proc_secinfo() into a temporary
+ * page and handed to nfs_find_best_sec().  Returns the new client or an
+ * ERR_PTR().
+ *
+ * Please call rpc_shutdown_client() when you are done with this rpc client.
+ */
+struct rpc_clnt *
+nfs4_negotiate_security(struct rpc_clnt *clnt, struct inode *inode,
+			const struct qstr *name)
+{
+	struct nfs4_secinfo_flavors *flavors;
+	struct rpc_clnt *result;
+	struct page *page;
+	int err;
+
+	page = alloc_page(GFP_KERNEL);
+	if (page == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	flavors = page_address(page);
+
+	err = nfs4_proc_secinfo(inode, name, flavors);
+	if (err < 0)
+		result = ERR_PTR(err);
+	else
+		result = nfs_find_best_sec(clnt, NFS_SERVER(inode), flavors);
+
+	put_page(page);
+	return result;
+}
+
+static int try_location(struct fs_context *fc,
+ const struct nfs4_fs_location *location)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ unsigned int len, s;
+ char *export_path, *source, *p;
+ int ret = -ENOENT; /* result when no server of @location is ever tried */
+
+ /* Allocate a buffer big enough to hold any of the hostnames plus a
+ * terminating char and also a buffer big enough to hold the hostname
+ * plus a colon plus the path.
+ *
+ * The servers of @location are then tried in order: each name that
+ * parses to an address is installed in the context as hostname and
+ * "<server>:<path>" source, and nfs4_get_referral_tree() is called.
+ * The first server for which that returns 0 ends the search;
+ * otherwise the last error (or -ENOENT) is returned.
+ */
+ len = 0;
+ for (s = 0; s < location->nservers; s++) {
+ const struct nfs4_string *buf = &location->servers[s];
+ if (buf->len > len)
+ len = buf->len; /* track the longest server name */
+ }
+
+ kfree(ctx->nfs_server.hostname); /* the context's previous hostname buffer is replaced */
+ ctx->nfs_server.hostname = kmalloc(len + 1, GFP_KERNEL);
+ if (!ctx->nfs_server.hostname)
+ return -ENOMEM;
+
+ export_path = nfs4_pathname_string(&location->rootpath,
+ &ctx->nfs_server.export_path_len);
+ if (IS_ERR(export_path))
+ return PTR_ERR(export_path);
+
+ kfree(ctx->nfs_server.export_path); /* the context takes over the new export path string */
+ ctx->nfs_server.export_path = export_path;
+
+ source = kmalloc(len + 1 + ctx->nfs_server.export_path_len + 1,
+ GFP_KERNEL);
+ if (!source)
+ return -ENOMEM;
+
+ kfree(fc->source); /* fc->source is replaced by the new buffer */
+ fc->source = source;
+ for (s = 0; s < location->nservers; s++) {
+ const struct nfs4_string *buf = &location->servers[s];
+
+ if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
+ continue; /* names containing IPV6_SCOPE_DELIMITER are skipped */
+
+ ctx->nfs_server.addrlen =
+ nfs_parse_server_name(buf->data, buf->len,
+ &ctx->nfs_server.address,
+ sizeof(ctx->nfs_server._address),
+ fc->net_ns, 0);
+ if (ctx->nfs_server.addrlen == 0)
+ continue; /* name could not be turned into an address */
+
+ rpc_set_port(&ctx->nfs_server.address, NFS_PORT);
+
+ memcpy(ctx->nfs_server.hostname, buf->data, buf->len);
+ ctx->nfs_server.hostname[buf->len] = '\0';
+
+ p = source; /* assemble <server>:<export path> in fc->source */
+ memcpy(p, buf->data, buf->len);
+ p += buf->len;
+ *p++ = ':';
+ memcpy(p, ctx->nfs_server.export_path, ctx->nfs_server.export_path_len);
+ p += ctx->nfs_server.export_path_len;
+ *p = 0;
+
+ ret = nfs4_get_referral_tree(fc);
+ if (ret == 0)
+ return 0; /* first server that works ends the search */
+ }
+
+ return ret;
+}
+
+/**
+ * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
+ * @fc: filesystem context; its NFS context supplies the referral dentry
+ * @locations: array of NFSv4 server location information
+ *
+ * Validates the fs path with nfs4_validate_fspath() and then offers each
+ * entry that has at least one server and a non-empty root path to
+ * try_location(), in array order.
+ *
+ * Returns 0 once an entry succeeds, -ENOENT if @locations is NULL or empty
+ * or no entry was usable, the nfs4_validate_fspath() error, or otherwise
+ * the error of the last try_location() call.
+ */
+static int nfs_follow_referral(struct fs_context *fc,
+			       const struct nfs4_fs_locations *locations)
+{
+	struct nfs_fs_context *ctx = nfs_fc2context(fc);
+	int status;
+	int i;
+
+	if (locations == NULL || locations->nlocations <= 0)
+		return -ENOENT;
+
+	dprintk("%s: referral at %pd2\n", __func__, ctx->clone_data.dentry);
+
+	/* Ensure fs path is a prefix of current dentry path */
+	status = nfs4_validate_fspath(ctx->clone_data.dentry, locations, ctx);
+	if (status < 0)
+		return status;
+
+	status = -ENOENT;
+	for (i = 0; i < locations->nlocations; i++) {
+		const struct nfs4_fs_location *candidate = &locations->locations[i];
+
+		if (candidate != NULL && candidate->nservers > 0 &&
+		    candidate->rootpath.ncomponents != 0) {
+			status = try_location(fc, candidate);
+			if (status == 0)
+				return 0;
+		}
+	}
+
+	return status;
+}
+
+/*
+ * nfs_do_refmount - handle crossing a referral on server
+ * @fc: filesystem context; its NFS context supplies the referral dentry
+ * @client: RPC client used for the fs_locations query
+ *
+ * Fetches the fs_locations of the referral dentry from its parent directory
+ * and passes them to nfs_follow_referral().
+ *
+ * Returns -ENOMEM on allocation failure, the non-zero result of
+ * nfs4_proc_fs_locations(), -ENOENT when the reply has no locations or an
+ * empty fs path, or otherwise the result of nfs_follow_referral().
+ */
+static int nfs_do_refmount(struct fs_context *fc, struct rpc_clnt *client)
+{
+	struct nfs_fs_context *ctx = nfs_fc2context(fc);
+	struct nfs4_fs_locations *locs = NULL;
+	struct dentry *referral, *dir;
+	struct page *scratch;
+	int status;
+
+	scratch = alloc_page(GFP_KERNEL);
+	if (scratch == NULL)
+		return -ENOMEM;
+
+	status = -ENOMEM;
+	locs = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+	if (locs == NULL)
+		goto out_free_page;
+
+	/* Get locations */
+	referral = ctx->clone_data.dentry;
+	dir = dget_parent(referral);
+	dprintk("%s: getting locations for %pd2\n",
+		__func__, referral);
+
+	status = nfs4_proc_fs_locations(client, d_inode(dir), &referral->d_name, locs, scratch);
+	dput(dir);
+	if (status != 0)
+		goto out_free_locs;
+
+	if (locs->nlocations > 0 && locs->fs_path.ncomponents > 0)
+		status = nfs_follow_referral(fc, locs);
+	else
+		status = -ENOENT;
+out_free_locs:
+	kfree(locs);
+out_free_page:
+	__free_page(scratch);
+	return status;
+}
+
+/*
+ * nfs4_submount - mount the filesystem found beneath an NFSv4 mountpoint
+ *
+ * Looks the mountpoint dentry up again in its parent directory to obtain
+ * its attributes and an RPC client with a suitable security flavor, records
+ * that flavor in the context, and then does either a referral mount or an
+ * ordinary submount depending on NFS_ATTR_FATTR_V4_REFERRAL.
+ *
+ * Returns the lookup error, or the result of the chosen mount routine.
+ * @server is not used by this function.
+ */
+int nfs4_submount(struct fs_context *fc, struct nfs_server *server)
+{
+	struct nfs_fs_context *ctx = nfs_fc2context(fc);
+	struct dentry *mountpoint = ctx->clone_data.dentry;
+	struct dentry *dir = dget_parent(mountpoint);
+	struct rpc_clnt *clnt;
+	int status;
+
+	/* Look it up again to get its attributes and sec flavor */
+	clnt = nfs4_proc_lookup_mountpoint(d_inode(dir), mountpoint, ctx->mntfh,
+					   ctx->clone_data.fattr);
+	dput(dir);
+	if (IS_ERR(clnt))
+		return PTR_ERR(clnt);
+
+	ctx->selected_flavor = clnt->cl_auth->au_flavor;
+	status = (ctx->clone_data.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) ?
+			nfs_do_refmount(fc, clnt) : nfs_do_submount(fc);
+
+	rpc_shutdown_client(clnt);
+	return status;
+}
+
+/*
+ * Try one location from the fs_locations array.
+ *
+ * Each server name of @location is tried in order. Names that are empty,
+ * longer than PAGE_SIZE, contain an IPv6 scope delimiter or cannot be
+ * parsed into an address are skipped. The first name for which
+ * nfs4_update_server() succeeds ends the search.
+ *
+ * Returns zero on success, or a negative errno value: -ENOENT if no server
+ * name was usable, -ENOMEM on allocation failure, or otherwise the error of
+ * the last nfs4_update_server() call.
+ */
+static int nfs4_try_replacing_one_location(struct nfs_server *server,
+		const struct nfs4_fs_location *location)
+{
+	const size_t addr_bufsize = sizeof(struct sockaddr_storage);
+	struct net *net = rpc_net_ns(server->client);
+	struct sockaddr *sap;
+	unsigned int s;
+	size_t salen;
+	int error;
+
+	sap = kmalloc(addr_bufsize, GFP_KERNEL);
+	if (sap == NULL)
+		return -ENOMEM;
+
+	error = -ENOENT;
+	for (s = 0; s < location->nservers; s++) {
+		const struct nfs4_string *buf = &location->servers[s];
+		char *hostname;
+
+		if (buf->len <= 0 || buf->len > PAGE_SIZE)
+			continue;
+
+		if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len) != NULL)
+			continue;
+
+		salen = nfs_parse_server_name(buf->data, buf->len,
+						sap, addr_bufsize, net, 0);
+		if (salen == 0)
+			continue;
+		rpc_set_port(sap, NFS_PORT);
+
+		/* nfs4_update_server() is given a NUL-terminated copy. */
+		error = -ENOMEM;
+		hostname = kmemdup_nul(buf->data, buf->len, GFP_KERNEL);
+		if (hostname == NULL)
+			break;
+
+		error = nfs4_update_server(server, hostname, sap, salen, net);
+		kfree(hostname);
+		if (error == 0)
+			break;
+	}
+
+	kfree(sap);
+	return error;
+}
+
+/**
+ * nfs4_replace_transport - set up transport to destination server
+ *
+ * @server: export being migrated
+ * @locations: fs_locations array
+ *
+ * Returns zero on success, or a negative errno value. -ENOENT is returned
+ * when @locations is NULL or empty, or when none of its entries has both a
+ * server and a root path.
+ *
+ * The client tries all the entries in the "locations" array, in the
+ * order returned by the server, until one works or the end of the
+ * array is reached.
+ */
+int nfs4_replace_transport(struct nfs_server *server,
+			   const struct nfs4_fs_locations *locations)
+{
+	int loc, error;
+
+	/*
+	 * No scratch pages are allocated here: the per-location helper never
+	 * used them, and seeding the result with -ENOMEM made "every location
+	 * skipped" look like an allocation failure.
+	 */
+	error = -ENOENT;
+	if (locations == NULL || locations->nlocations <= 0)
+		return error;
+
+	for (loc = 0; loc < locations->nlocations; loc++) {
+		const struct nfs4_fs_location *location =
+						&locations->locations[loc];
+
+		if (location == NULL || location->nservers <= 0 ||
+		    location->rootpath.ncomponents == 0)
+			continue;
+
+		error = nfs4_try_replacing_one_location(server, location);
+		if (error == 0)
+			break;
+	}
+
+	return error;
+}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
new file mode 100644
index 000000000..7c3c96ed6
--- /dev/null
+++ b/fs/nfs/nfs4proc.c
@@ -0,0 +1,10530 @@
+/*
+ * fs/nfs/nfs4proc.c
+ *
+ * Client-side procedure declarations for NFSv4.
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Kendrick Smith <kmsmith@umich.edu>
+ * Andy Adamson <andros@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/ratelimit.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/nfs_mount.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/xattr.h>
+#include <linux/utsname.h>
+#include <linux/freezer.h>
+#include <linux/iversion.h>
+
+#include "nfs4_fs.h"
+#include "delegation.h"
+#include "internal.h"
+#include "iostat.h"
+#include "callback.h"
+#include "pnfs.h"
+#include "netns.h"
+#include "sysfs.h"
+#include "nfs4idmap.h"
+#include "nfs4session.h"
+#include "fscache.h"
+#include "nfs42.h"
+
+#include "nfs4trace.h"
+
+#ifdef CONFIG_NFS_V4_2
+#include "nfs42.h"
+#endif /* CONFIG_NFS_V4_2 */
+
+#define NFSDBG_FACILITY NFSDBG_PROC
+
+#define NFS4_BITMASK_SZ 3
+
+#define NFS4_POLL_RETRY_MIN (HZ/10)
+#define NFS4_POLL_RETRY_MAX (15*HZ)
+
+/* file attributes which can be mapped to nfs attributes */
+#define NFS4_VALID_ATTRS (ATTR_MODE \
+ | ATTR_UID \
+ | ATTR_GID \
+ | ATTR_SIZE \
+ | ATTR_ATIME \
+ | ATTR_MTIME \
+ | ATTR_CTIME \
+ | ATTR_ATIME_SET \
+ | ATTR_MTIME_SET)
+
+struct nfs4_opendata;
+static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
+static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
+static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label, struct inode *inode);
+static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
+ struct nfs_fattr *fattr, struct iattr *sattr,
+ struct nfs_open_context *ctx, struct nfs4_label *ilabel,
+ struct nfs4_label *olabel);
+#ifdef CONFIG_NFS_V4_1
+static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
+ const struct cred *cred,
+ struct nfs4_slot *slot,
+ bool is_privileged);
+static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
+ const struct cred *);
+static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
+ const struct cred *, bool);
+#endif
+static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ],
+ const __u32 *src, struct inode *inode,
+ struct nfs_server *server,
+ struct nfs4_label *label);
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+/*
+ * Fill @label with the security context for a dentry about to be created.
+ * Returns @label on success; NULL if @label is NULL, the server lacks
+ * NFS_CAP_SECURITY_LABEL, or security_dentry_init_security() fails.
+ */
+static inline struct nfs4_label *
+nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
+	struct iattr *sattr, struct nfs4_label *label)
+{
+	int status;
+
+	if (!label || nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
+		return NULL;
+
+	label->lfs = 0;
+	label->pi = 0;
+	label->len = 0;
+	label->label = NULL;
+
+	status = security_dentry_init_security(dentry, sattr->ia_mode,
+				&dentry->d_name, (void **)&label->label, &label->len);
+
+	return status == 0 ? label : NULL;
+}
+
+/* Release the security context held by @label; a NULL @label is ignored. */
+static inline void
+nfs4_label_release_security(struct nfs4_label *label)
+{
+	if (label == NULL)
+		return;
+	security_release_secctx(label->label, label->len);
+}
+
+/* Pick the attribute bitmask: with a label, or the "_nl" variant without. */
+static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
+{
+	return label ? server->attr_bitmask : server->attr_bitmask_nl;
+}
+#else
+/* Security labels are compiled out: no label is ever produced. */
+static inline struct nfs4_label *
+nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
+	struct iattr *sattr, struct nfs4_label *l)
+{
+	return NULL;
+}
+
+/* Security labels are compiled out: nothing to release. */
+static inline void
+nfs4_label_release_security(struct nfs4_label *label)
+{
+}
+
+/* Security labels are compiled out: always the server's attr_bitmask. */
+static inline u32 *
+nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
+{
+	return server->attr_bitmask;
+}
+#endif
+
+/*
+ * Prevent leaks of NFSv4 errors into userland.
+ *
+ * Values of -1000 and above are returned unchanged. Values below that are
+ * translated to an errno; any code without an explicit mapping becomes -EIO
+ * after a debug message.
+ */
+static int nfs4_map_errors(int err)
+{
+	int mapped = -EIO;
+
+	if (err >= -1000)
+		return err;
+
+	switch (err) {
+	case -NFS4ERR_RESOURCE:
+	case -NFS4ERR_LAYOUTTRYLATER:
+	case -NFS4ERR_RECALLCONFLICT:
+	case -NFS4ERR_RETURNCONFLICT:
+		mapped = -EREMOTEIO;
+		break;
+	case -NFS4ERR_WRONGSEC:
+	case -NFS4ERR_WRONG_CRED:
+		mapped = -EPERM;
+		break;
+	case -NFS4ERR_BADOWNER:
+	case -NFS4ERR_BADNAME:
+		mapped = -EINVAL;
+		break;
+	case -NFS4ERR_SHARE_DENIED:
+		mapped = -EACCES;
+		break;
+	case -NFS4ERR_MINOR_VERS_MISMATCH:
+		mapped = -EPROTONOSUPPORT;
+		break;
+	case -NFS4ERR_FILE_OPEN:
+		mapped = -EBUSY;
+		break;
+	default:
+		dprintk("%s could not handle NFSv4 error %d\n",
+				__func__, -err);
+		break;
+	}
+	return mapped;
+}
+
+/*
+ * This is our standard bitmap for GETATTR requests.
+ * Each table below has three 32-bit words, one per FATTR4_WORD0_*,
+ * FATTR4_WORD1_* and FATTR4_WORD2_* group; words without an initializer
+ * are zero.
+ */
+const u32 nfs4_fattr_bitmap[3] = {
+ FATTR4_WORD0_TYPE
+ | FATTR4_WORD0_CHANGE
+ | FATTR4_WORD0_SIZE
+ | FATTR4_WORD0_FSID
+ | FATTR4_WORD0_FILEID,
+ FATTR4_WORD1_MODE
+ | FATTR4_WORD1_NUMLINKS
+ | FATTR4_WORD1_OWNER
+ | FATTR4_WORD1_OWNER_GROUP
+ | FATTR4_WORD1_RAWDEV
+ | FATTR4_WORD1_SPACE_USED
+ | FATTR4_WORD1_TIME_ACCESS
+ | FATTR4_WORD1_TIME_METADATA
+ | FATTR4_WORD1_TIME_MODIFY
+ | FATTR4_WORD1_MOUNTED_ON_FILEID,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ FATTR4_WORD2_SECURITY_LABEL
+#endif
+};
+
+/*
+ * Same as nfs4_fattr_bitmap except that MOUNTED_ON_FILEID is left out of
+ * word 1 and MDSTHRESHOLD is added to word 2. Presumably used for OPEN on
+ * pNFS-capable servers, going by the name - confirm against the users.
+ */
+static const u32 nfs4_pnfs_open_bitmap[3] = {
+ FATTR4_WORD0_TYPE
+ | FATTR4_WORD0_CHANGE
+ | FATTR4_WORD0_SIZE
+ | FATTR4_WORD0_FSID
+ | FATTR4_WORD0_FILEID,
+ FATTR4_WORD1_MODE
+ | FATTR4_WORD1_NUMLINKS
+ | FATTR4_WORD1_OWNER
+ | FATTR4_WORD1_OWNER_GROUP
+ | FATTR4_WORD1_RAWDEV
+ | FATTR4_WORD1_SPACE_USED
+ | FATTR4_WORD1_TIME_ACCESS
+ | FATTR4_WORD1_TIME_METADATA
+ | FATTR4_WORD1_TIME_MODIFY,
+ FATTR4_WORD2_MDSTHRESHOLD
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ | FATTR4_WORD2_SECURITY_LABEL
+#endif
+};
+
+/* Minimal bitmap: only the object type and file id. */
+static const u32 nfs4_open_noattr_bitmap[3] = {
+ FATTR4_WORD0_TYPE
+ | FATTR4_WORD0_FILEID,
+};
+
+/* File-count and space attributes (available, free, total). */
+const u32 nfs4_statfs_bitmap[3] = {
+ FATTR4_WORD0_FILES_AVAIL
+ | FATTR4_WORD0_FILES_FREE
+ | FATTR4_WORD0_FILES_TOTAL,
+ FATTR4_WORD1_SPACE_AVAIL
+ | FATTR4_WORD1_SPACE_FREE
+ | FATTR4_WORD1_SPACE_TOTAL
+};
+
+/* Maximum link count and maximum name length. */
+const u32 nfs4_pathconf_bitmap[3] = {
+ FATTR4_WORD0_MAXLINK
+ | FATTR4_WORD0_MAXNAME,
+ 0
+};
+
+/*
+ * Filesystem limits and properties: maximum file/read/write sizes, lease
+ * time, time delta, layout types, layout and clone block sizes, and xattr
+ * support.
+ */
+const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
+ | FATTR4_WORD0_MAXREAD
+ | FATTR4_WORD0_MAXWRITE
+ | FATTR4_WORD0_LEASE_TIME,
+ FATTR4_WORD1_TIME_DELTA
+ | FATTR4_WORD1_FS_LAYOUT_TYPES,
+ FATTR4_WORD2_LAYOUT_BLKSIZE
+ | FATTR4_WORD2_CLONE_BLKSIZE
+ | FATTR4_WORD2_XATTR_SUPPORT
+};
+
+/*
+ * FS_LOCATIONS plus a subset of the standard attributes; TYPE, MODE and
+ * NUMLINKS are not requested here.
+ */
+const u32 nfs4_fs_locations_bitmap[3] = {
+ FATTR4_WORD0_CHANGE
+ | FATTR4_WORD0_SIZE
+ | FATTR4_WORD0_FSID
+ | FATTR4_WORD0_FILEID
+ | FATTR4_WORD0_FS_LOCATIONS,
+ FATTR4_WORD1_OWNER
+ | FATTR4_WORD1_OWNER_GROUP
+ | FATTR4_WORD1_RAWDEV
+ | FATTR4_WORD1_SPACE_USED
+ | FATTR4_WORD1_TIME_ACCESS
+ | FATTR4_WORD1_TIME_METADATA
+ | FATTR4_WORD1_TIME_MODIFY
+ | FATTR4_WORD1_MOUNTED_ON_FILEID,
+};
+
+/*
+ * Copy the NFS4_BITMASK_SZ-word bitmap @src to @dst and, when @inode holds
+ * a read delegation, drop the SIZE and CHANGE bits whose cached values are
+ * not flagged invalid. Unless NFS_INO_REVAL_FORCED is set, both cached
+ * values are treated as valid.
+ */
+static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src,
+		struct inode *inode)
+{
+	unsigned long validity;
+
+	memcpy(dst, src, NFS4_BITMASK_SZ*sizeof(*dst));
+
+	if (!inode)
+		return;
+	if (!nfs4_have_delegation(inode, FMODE_READ))
+		return;
+
+	validity = READ_ONCE(NFS_I(inode)->cache_validity);
+	if ((validity & NFS_INO_REVAL_FORCED) == 0)
+		validity &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE);
+
+	if ((validity & NFS_INO_INVALID_SIZE) == 0)
+		dst[0] &= ~FATTR4_WORD0_SIZE;
+
+	if ((validity & NFS_INO_INVALID_CHANGE) == 0)
+		dst[0] &= ~FATTR4_WORD0_CHANGE;
+}
+
+/* SETATTR flavour of the bitmap copy; it simply defers to nfs4_bitmap_copy_adjust(). */
+static void nfs4_bitmap_copy_adjust_setattr(__u32 *dst,
+ const __u32 *src, struct inode *inode)
+{
+ nfs4_bitmap_copy_adjust(dst, src, inode);
+}
+
+/*
+ * Prepare @readdir for the given directory @cookie.
+ *
+ * cookie > 2:  the cookie and @verifier are passed through unchanged.
+ * cookie == 2: cookie 0 and a zeroed verifier are sent, nothing is faked.
+ * cookie 0/1:  as for 2, and additionally XDR entries for ".." (and for "."
+ *              when cookie is 0) are written at the start of the first
+ *              page; readdir->pgbase and readdir->count are then adjusted
+ *              by the number of bytes written.
+ */
+static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
+ struct nfs4_readdir_arg *readdir)
+{
+ unsigned int attrs = FATTR4_WORD0_FILEID | FATTR4_WORD0_TYPE;
+ __be32 *start, *p;
+
+ if (cookie > 2) {
+ readdir->cookie = cookie;
+ memcpy(&readdir->verifier, verifier, sizeof(readdir->verifier));
+ return;
+ }
+
+ readdir->cookie = 0;
+ memset(&readdir->verifier, 0, sizeof(readdir->verifier));
+ if (cookie == 2)
+ return;
+
+ /*
+ * NFSv4 servers do not return entries for '.' and '..'
+ * Therefore, we fake these entries here. We let '.'
+ * have cookie 0 and '..' have cookie 1. Note that
+ * when talking to the server, we always send cookie 0
+ * instead of 1 or 2.
+ */
+ start = p = kmap_atomic(*readdir->pages);
+
+ if (cookie == 0) {
+ *p++ = xdr_one; /* next */
+ *p++ = xdr_zero; /* cookie, first word */
+ *p++ = xdr_one; /* cookie, second word */
+ *p++ = xdr_one; /* entry len */
+ memcpy(p, ".\0\0\0", 4); /* entry */
+ p++;
+ *p++ = xdr_one; /* bitmap length */
+ *p++ = htonl(attrs); /* bitmap */
+ *p++ = htonl(12); /* attribute buffer length */
+ *p++ = htonl(NF4DIR);
+ p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry)));
+ }
+
+ *p++ = xdr_one; /* next */
+ *p++ = xdr_zero; /* cookie, first word */
+ *p++ = xdr_two; /* cookie, second word */
+ *p++ = xdr_two; /* entry len */
+ memcpy(p, "..\0\0", 4); /* entry */
+ p++;
+ *p++ = xdr_one; /* bitmap length */
+ *p++ = htonl(attrs); /* bitmap */
+ *p++ = htonl(12); /* attribute buffer length */
+ *p++ = htonl(NF4DIR);
+ p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent)));
+
+ /* Bytes written for the fake entries become the page offset. */
+ readdir->pgbase = (char *)p - (char *)start;
+ readdir->count -= readdir->pgbase;
+ kunmap_atomic(start);
+}
+
+/*
+ * Record @version as the pre-change attribute of @fattr, unless a
+ * pre-change attribute has already been marked valid.
+ */
+static void nfs4_fattr_set_prechange(struct nfs_fattr *fattr, u64 version)
+{
+	if (fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
+		return;
+
+	fattr->pre_change_attr = version;
+	fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
+}
+
+/*
+ * Dispatch to the test_and_free_expired() operation of the minor version
+ * in use by @server's client.
+ */
+static void nfs4_test_and_free_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		const struct cred *cred)
+{
+	server->nfs_client->cl_mvops->test_and_free_expired(server, stateid, cred);
+}
+
+/*
+ * Mark @stateid as revoked (modifying it in place) and then run the
+ * test-and-free operation on it.
+ */
+static void __nfs4_free_revoked_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ const struct cred *cred)
+{
+ stateid->type = NFS4_REVOKED_STATEID_TYPE;
+ nfs4_test_and_free_stateid(server, stateid, cred);
+}
+
+/*
+ * Const-safe variant of __nfs4_free_revoked_stateid(): works on a local
+ * copy so that the caller's @stateid is left untouched.
+ */
+static void nfs4_free_revoked_stateid(struct nfs_server *server,
+ const nfs4_stateid *stateid,
+ const struct cred *cred)
+{
+ nfs4_stateid tmp;
+
+ nfs4_stateid_copy(&tmp, stateid);
+ __nfs4_free_revoked_stateid(server, &tmp, cred);
+}
+
+/*
+ * Compute the next retry delay.
+ *
+ * Without a @timeout pointer the maximum delay is returned. Otherwise the
+ * stored value is clamped to [NFS4_POLL_RETRY_MIN, NFS4_POLL_RETRY_MAX]
+ * (non-positive values become the minimum), that clamped value is returned,
+ * and twice the clamped value is stored for the next call.
+ */
+static long nfs4_update_delay(long *timeout)
+{
+	long delay;
+
+	if (timeout == NULL)
+		return NFS4_POLL_RETRY_MAX;
+
+	delay = *timeout;
+	if (delay <= 0)
+		delay = NFS4_POLL_RETRY_MIN;
+	if (delay > NFS4_POLL_RETRY_MAX)
+		delay = NFS4_POLL_RETRY_MAX;
+
+	*timeout = delay << 1;
+	return delay;
+}
+
+/*
+ * Sleep (killable) for the next back-off interval.
+ * Returns -EINTR if a fatal signal is pending afterwards, 0 otherwise.
+ */
+static int nfs4_delay_killable(long *timeout)
+{
+	long delay;
+
+	might_sleep();
+
+	delay = nfs4_update_delay(timeout);
+	freezable_schedule_timeout_killable_unsafe(delay);
+
+	return __fatal_signal_pending(current) ? -EINTR : 0;
+}
+
+/*
+ * Sleep (interruptible) for the next back-off interval.
+ * Returns 0 if no signal is pending afterwards, -EINTR for a fatal signal
+ * and -ERESTARTSYS for any other pending signal.
+ */
+static int nfs4_delay_interruptible(long *timeout)
+{
+	long delay;
+
+	might_sleep();
+
+	delay = nfs4_update_delay(timeout);
+	freezable_schedule_timeout_interruptible_unsafe(delay);
+
+	if (!signal_pending(current))
+		return 0;
+	if (__fatal_signal_pending(current))
+		return -EINTR;
+	return -ERESTARTSYS;
+}
+
+/* Back off once, using the interruptible or the killable sleep variant. */
+static int nfs4_delay(long *timeout, bool interruptible)
+{
+	return interruptible ? nfs4_delay_interruptible(timeout)
+			     : nfs4_delay_killable(timeout);
+}
+
+/*
+ * Return @stateid if it is an open, lock or delegation stateid;
+ * return NULL for a NULL pointer or any other stateid type.
+ */
+static const nfs4_stateid *
+nfs4_recoverable_stateid(const nfs4_stateid *stateid)
+{
+	if (stateid == NULL)
+		return NULL;
+
+	if (stateid->type == NFS4_OPEN_STATEID_TYPE ||
+	    stateid->type == NFS4_LOCK_STATEID_TYPE ||
+	    stateid->type == NFS4_DELEGATION_STATEID_TYPE)
+		return stateid;
+
+	return NULL;
+}
+
+/*
+ * Classify an NFSv4 error; shared by nfs4_handle_exception() and
+ * nfs4_async_handle_exception(). This function itself does not wait.
+ *
+ * The outcome is reported through @exception:
+ *   ->delay      the caller should back off and retry (returns 0)
+ *   ->recovering state/lease/session recovery is under way; the caller
+ *                should wait for it (returns 0)
+ *   ->retry      the call may be retried right away
+ * For errors that fall through the switch the return value is the error
+ * code passed through nfs4_map_errors().
+ */
+static int nfs4_do_handle_exception(struct nfs_server *server,
+ int errorcode, struct nfs4_exception *exception)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state *state = exception->state;
+ const nfs4_stateid *stateid;
+ struct inode *inode = exception->inode;
+ int ret = errorcode;
+
+ exception->delay = 0;
+ exception->recovering = 0;
+ exception->retry = 0;
+
+ /* Prefer the stateid given by the caller, else the one in @state. */
+ stateid = nfs4_recoverable_stateid(exception->stateid);
+ if (stateid == NULL && state != NULL)
+ stateid = nfs4_recoverable_stateid(&state->stateid);
+
+ switch(errorcode) {
+ case 0:
+ return 0;
+ case -NFS4ERR_BADHANDLE:
+ case -ESTALE:
+ if (inode != NULL && S_ISREG(inode->i_mode))
+ pnfs_destroy_layout(NFS_I(inode));
+ break;
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_PARTNER_NO_AUTH:
+ if (inode != NULL && stateid != NULL) {
+ nfs_inode_find_state_and_recover(inode,
+ stateid);
+ goto wait_on_recovery;
+ }
+ fallthrough;
+ case -NFS4ERR_OPENMODE:
+ if (inode) {
+ int err;
+
+ err = nfs_async_inode_return_delegation(inode,
+ stateid);
+ if (err == 0)
+ goto wait_on_recovery;
+ if (stateid != NULL && stateid->type == NFS4_DELEGATION_STATEID_TYPE) {
+ exception->retry = 1;
+ break;
+ }
+ }
+ if (state == NULL)
+ break;
+ ret = nfs4_schedule_stateid_recovery(server, state);
+ if (ret < 0)
+ break;
+ goto wait_on_recovery;
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_STALE_CLIENTID:
+ nfs4_schedule_lease_recovery(clp);
+ goto wait_on_recovery;
+ case -NFS4ERR_MOVED:
+ ret = nfs4_schedule_migration_recovery(server);
+ if (ret < 0)
+ break;
+ goto wait_on_recovery;
+ case -NFS4ERR_LEASE_MOVED:
+ nfs4_schedule_lease_moved_recovery(clp);
+ goto wait_on_recovery;
+#if defined(CONFIG_NFS_V4_1)
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_SEQ_FALSE_RETRY:
+ case -NFS4ERR_SEQ_MISORDERED:
+ /* Handled in nfs41_sequence_process() */
+ goto wait_on_recovery;
+#endif /* defined(CONFIG_NFS_V4_1) */
+ case -NFS4ERR_FILE_OPEN:
+ if (exception->timeout > HZ) {
+ /* We have retried a decent amount, time to
+ * fail
+ */
+ ret = -EBUSY;
+ break;
+ }
+ fallthrough;
+ case -NFS4ERR_DELAY:
+ nfs_inc_server_stats(server, NFSIOS_DELAY);
+ fallthrough;
+ case -NFS4ERR_GRACE:
+ case -NFS4ERR_LAYOUTTRYLATER:
+ case -NFS4ERR_RECALLCONFLICT:
+ case -NFS4ERR_RETURNCONFLICT:
+ exception->delay = 1;
+ return 0;
+
+ case -NFS4ERR_RETRY_UNCACHED_REP:
+ case -NFS4ERR_OLD_STATEID:
+ exception->retry = 1;
+ break;
+ case -NFS4ERR_BADOWNER:
+ /* The following works around a Linux server bug! */
+ case -NFS4ERR_BADNAME:
+ if (server->caps & NFS_CAP_UIDGID_NOMAP) {
+ server->caps &= ~NFS_CAP_UIDGID_NOMAP;
+ exception->retry = 1;
+ printk(KERN_WARNING "NFS: v4 server %s "
+ "does not accept raw "
+ "uid/gids. "
+ "Reenabling the idmapper.\n",
+ server->nfs_client->cl_hostname);
+ }
+ }
+ /* We failed to handle the error */
+ return nfs4_map_errors(ret);
+wait_on_recovery:
+ exception->recovering = 1;
+ return 0;
+}
+
+/*
+ * Synchronous NFSv4 error handler, for processes that are allowed to sleep.
+ *
+ * After classifying @errorcode this either sleeps for the back-off delay or
+ * waits for client recovery to finish. When that wait returns 0,
+ * exception->retry is set so the caller repeats the operation.
+ *
+ * Returns -EDEADLOCK if recovery is needed but the task is privileged,
+ * -EIO if migration recovery has failed, the result of the wait, or the
+ * mapped error when neither a delay nor recovery applies.
+ */
+int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+{
+	struct nfs_client *clp = server->nfs_client;
+	int status;
+
+	status = nfs4_do_handle_exception(server, errorcode, exception);
+
+	if (exception->delay) {
+		status = nfs4_delay(&exception->timeout,
+				    exception->interruptible);
+	} else if (exception->recovering) {
+		if (exception->task_is_privileged)
+			return -EDEADLOCK;
+		status = nfs4_wait_clnt_recover(clp);
+		if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+			return -EIO;
+	} else {
+		return status;
+	}
+
+	if (status == 0)
+		exception->retry = 1;
+	return status;
+}
+
+/*
+ * Asynchronous counterpart of nfs4_handle_exception(), for use from RPC
+ * callbacks: instead of sleeping, the task is given an RPC delay or is
+ * queued on the client's recovery wait queue.
+ *
+ * Returns -EDEADLOCK if recovery is needed but the task is privileged,
+ * -EIO if no delay or recovery applies and migration recovery has failed,
+ * or otherwise the result of the classification. On a 0 result after a
+ * delay or recovery, exception->retry is set.
+ */
+static int
+nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server,
+		int errorcode, struct nfs4_exception *exception)
+{
+	struct nfs_client *clp = server->nfs_client;
+	int status;
+
+	status = nfs4_do_handle_exception(server, errorcode, exception);
+
+	if (exception->delay) {
+		rpc_delay(task, nfs4_update_delay(&exception->timeout));
+	} else if (exception->recovering) {
+		if (exception->task_is_privileged)
+			return -EDEADLOCK;
+		rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
+		/* Wake the task again if the state manager is not running. */
+		if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
+			rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
+	} else {
+		if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+			status = -EIO;
+		return status;
+	}
+
+	if (status == 0) {
+		exception->retry = 1;
+		/*
+		 * For NFS4ERR_MOVED, the client transport will need to
+		 * be recomputed after migration recovery has completed.
+		 */
+		if (errorcode == -NFS4ERR_MOVED)
+			rpc_task_release_transport(task);
+	}
+	return status;
+}
+
+/*
+ * Convenience wrapper around nfs4_async_handle_exception() for callers that
+ * only have a state and an optional back-off @timeout.
+ *
+ * A non-negative task->tk_status is left alone. Otherwise tk_status is
+ * replaced by the handler's result, and *@timeout is updated when a delay
+ * was requested.
+ *
+ * Returns -EAGAIN if the call should be retried, 0 otherwise.
+ */
+int
+nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server,
+			struct nfs4_state *state, long *timeout)
+{
+	struct nfs4_exception exc = {
+		.state = state,
+	};
+
+	if (task->tk_status >= 0)
+		return 0;
+
+	if (timeout != NULL)
+		exc.timeout = *timeout;
+
+	task->tk_status = nfs4_async_handle_exception(task, server,
+						      task->tk_status,
+						      &exc);
+
+	if (timeout != NULL && exc.delay)
+		*timeout = exc.timeout;
+
+	return exc.retry ? -EAGAIN : 0;
+}
+
+/*
+ * Return 'true' if the rpc_client used by 'clp' has the krb5i or krb5p
+ * authentication flavor (i.e. is integrity protected), 'false' otherwise.
+ */
+static bool _nfs4_is_integrity_protected(struct nfs_client *clp)
+{
+	switch (clp->cl_rpcclient->cl_auth->au_flavor) {
+	case RPC_AUTH_GSS_KRB5I:
+	case RPC_AUTH_GSS_KRB5P:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * Move clp->cl_last_renewal forward to @timestamp, under clp->cl_lock.
+ * The stored value is never moved backwards.
+ */
+static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp)
+{
+ spin_lock(&clp->cl_lock);
+ if (time_before(clp->cl_last_renewal,timestamp))
+ clp->cl_last_renewal = timestamp;
+ spin_unlock(&clp->cl_lock);
+}
+
+/*
+ * Renew the lease of @server's client, but only when the client has no
+ * session; with a session this function does nothing.
+ */
+static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
+{
+	struct nfs_client *clp = server->nfs_client;
+
+	if (nfs4_has_session(clp))
+		return;
+
+	do_renew_lease(clp, timestamp);
+}
+
+/*
+ * Callback data handed to the nfs41_call_sync_*() RPC callbacks: the
+ * server the call is for plus the sequence arguments and result of the
+ * call.
+ */
+struct nfs4_call_sync_data {
+ const struct nfs_server *seq_server;
+ struct nfs4_sequence_args *seq_args;
+ struct nfs4_sequence_res *seq_res;
+};
+
+/*
+ * Initialise the sequence arguments and result of a call: no slot is
+ * assigned yet on either side, and the @cache_reply and @privileged flags
+ * are stored in @args.
+ */
+void nfs4_init_sequence(struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res, int cache_reply,
+ int privileged)
+{
+ args->sa_slot = NULL;
+ args->sa_cache_this = cache_reply;
+ args->sa_privileged = privileged;
+
+ res->sr_slot = NULL;
+}
+
+/*
+ * Give up the slot held in @res: under the table lock it is handed to a
+ * waiter if nfs41_wake_and_assign_slot() finds one, and freed otherwise.
+ * The caller must make sure res->sr_slot is not NULL.
+ */
+static void nfs40_sequence_free_slot(struct nfs4_sequence_res *res)
+{
+	struct nfs4_slot *held = res->sr_slot;
+	struct nfs4_slot_table *table = held->table;
+
+	spin_lock(&table->slot_tbl_lock);
+	if (!nfs41_wake_and_assign_slot(table, held))
+		nfs4_free_slot(table, held);
+	spin_unlock(&table->slot_tbl_lock);
+
+	res->sr_slot = NULL;
+}
+
+/*
+ * Completion handling for the non-session case: release the slot, if any.
+ * Always returns 1; @task is not used.
+ */
+static int nfs40_sequence_done(struct rpc_task *task,
+ struct nfs4_sequence_res *res)
+{
+ if (res->sr_slot != NULL)
+ nfs40_sequence_free_slot(res);
+ return 1;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+/*
+ * Release a session slot; a NULL @slot is ignored.
+ *
+ * The slot's sequence number is bumped if the last SEQUENCE on it
+ * completed. The slot is then handed to a waiter or freed. The server is
+ * notified only when highest_used_slotid exceeded target_highest_slotid on
+ * entry, no waiter took the slot, and freeing it left the table with no
+ * slot in use. Finally, any sleepers on slot_waitq are woken.
+ */
+static void nfs41_release_slot(struct nfs4_slot *slot)
+{
+ struct nfs4_session *session;
+ struct nfs4_slot_table *tbl;
+ bool send_new_highest_used_slotid = false;
+
+ if (!slot)
+ return;
+ tbl = slot->table;
+ session = tbl->session;
+
+ /* Bump the slot sequence number */
+ if (slot->seq_done)
+ slot->seq_nr++;
+ slot->seq_done = 0;
+
+ spin_lock(&tbl->slot_tbl_lock);
+ /* Be nice to the server: try to ensure that the last transmitted
+ * value for highest_user_slotid <= target_highest_slotid
+ */
+ if (tbl->highest_used_slotid > tbl->target_highest_slotid)
+ send_new_highest_used_slotid = true;
+
+ /* The slot went straight to a waiter, so it stays in use. */
+ if (nfs41_wake_and_assign_slot(tbl, slot)) {
+ send_new_highest_used_slotid = false;
+ goto out_unlock;
+ }
+ nfs4_free_slot(tbl, slot);
+
+ /* Other slots are still in use: no notification needed. */
+ if (tbl->highest_used_slotid != NFS4_NO_SLOT)
+ send_new_highest_used_slotid = false;
+out_unlock:
+ spin_unlock(&tbl->slot_tbl_lock);
+ if (send_new_highest_used_slotid)
+ nfs41_notify_server(session->clp);
+ if (waitqueue_active(&tbl->slot_waitq))
+ wake_up_all(&tbl->slot_waitq);
+}
+
+/* Release the session slot held in @res (if any) and clear the pointer. */
+static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
+{
+ nfs41_release_slot(res->sr_slot);
+ res->sr_slot = NULL;
+}
+
+/*
+ * Remember @seqnr as the highest sequence number sent on @slot if it is
+ * ahead of the recorded one. The comparison uses a signed 32-bit
+ * difference, so it stays correct across sequence number wrap-around.
+ */
+static void nfs4_slot_sequence_record_sent(struct nfs4_slot *slot,
+		u32 seqnr)
+{
+	if ((s32)(seqnr - slot->seq_nr_highest_sent) <= 0)
+		return;
+
+	slot->seq_nr_highest_sent = seqnr;
+}
+/*
+ * Record @seqnr as acknowledged on @slot; an acknowledged number also
+ * counts as sent.
+ */
+static void nfs4_slot_sequence_acked(struct nfs4_slot *slot, u32 seqnr)
+{
+ nfs4_slot_sequence_record_sent(slot, seqnr);
+ slot->seq_nr_last_acked = seqnr;
+}
+
+/*
+ * Fire off a privileged SEQUENCE call on @slot and drop the task reference
+ * asynchronously. A failure to start the task is silently ignored.
+ */
+static void nfs4_probe_sequence(struct nfs_client *client, const struct cred *cred,
+		struct nfs4_slot *slot)
+{
+	struct rpc_task *probe;
+
+	probe = _nfs41_proc_sequence(client, cred, slot, true);
+	if (IS_ERR(probe))
+		return;
+
+	rpc_put_task_async(probe);
+}
+
+/*
+ * Process the result of the SEQUENCE operation of an NFSv4.1 call.
+ *
+ * Returns 1 when the caller should carry on processing the reply, and 0
+ * when the RPC call has been restarted (in which case the caller must not
+ * touch the reply). Nothing is done if @res holds no slot, if the task was
+ * never sent, or if the slot has already been marked done.
+ *
+ * The slot is released here only on the session-recovery and
+ * restart-without-delay paths; otherwise releasing it is left to the caller.
+ */
+static int nfs41_sequence_process(struct rpc_task *task,
+ struct nfs4_sequence_res *res)
+{
+ struct nfs4_session *session;
+ struct nfs4_slot *slot = res->sr_slot;
+ struct nfs_client *clp;
+ int status;
+ int ret = 1;
+
+ if (slot == NULL)
+ goto out_noaction;
+ /* don't increment the sequence number if the task wasn't sent */
+ if (!RPC_WAS_SENT(task) || slot->seq_done)
+ goto out;
+
+ session = slot->table->session;
+ clp = session->clp;
+
+ trace_nfs4_sequence_done(session, res);
+
+ /* A dead-session status on the task overrides the SEQUENCE status. */
+ status = res->sr_status;
+ if (task->tk_status == -NFS4ERR_DEADSESSION)
+ status = -NFS4ERR_DEADSESSION;
+
+ /* Check the SEQUENCE operation status */
+ switch (status) {
+ case 0:
+ /* Mark this sequence number as having been acked */
+ nfs4_slot_sequence_acked(slot, slot->seq_nr);
+ /* Update the slot's sequence and clientid lease timer */
+ slot->seq_done = 1;
+ do_renew_lease(clp, res->sr_timestamp);
+ /* Check sequence flags */
+ nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags,
+ !!slot->privileged);
+ nfs41_update_target_slotid(slot->table, slot, res);
+ break;
+ case 1:
+ /*
+ * sr_status remains 1 if an RPC level error occurred.
+ * The server may or may not have processed the sequence
+ * operation..
+ */
+ nfs4_slot_sequence_record_sent(slot, slot->seq_nr);
+ slot->seq_done = 1;
+ goto out;
+ case -NFS4ERR_DELAY:
+ /* The server detected a resend of the RPC call and
+ * returned NFS4ERR_DELAY as per Section 2.10.6.2
+ * of RFC5661.
+ */
+ dprintk("%s: slot=%u seq=%u: Operation in progress\n",
+ __func__,
+ slot->slot_nr,
+ slot->seq_nr);
+ goto out_retry;
+ case -NFS4ERR_RETRY_UNCACHED_REP:
+ case -NFS4ERR_SEQ_FALSE_RETRY:
+ /*
+ * The server thinks we tried to replay a request.
+ * Retry the call after bumping the sequence ID.
+ */
+ nfs4_slot_sequence_acked(slot, slot->seq_nr);
+ goto retry_new_seq;
+ case -NFS4ERR_BADSLOT:
+ /*
+ * The slot id we used was probably retired. Try again
+ * using a different slot id.
+ */
+ if (slot->slot_nr < slot->table->target_highest_slotid)
+ goto session_recover;
+ goto retry_nowait;
+ case -NFS4ERR_SEQ_MISORDERED:
+ nfs4_slot_sequence_record_sent(slot, slot->seq_nr);
+ /*
+ * Were one or more calls using this slot interrupted?
+ * If the server never received the request, then our
+ * transmitted slot sequence number may be too high. However,
+ * if the server did receive the request then it might
+ * accidentally give us a reply with a mismatched operation.
+ * We can sort this out by sending a lone sequence operation
+ * to the server on the same slot.
+ */
+ if ((s32)(slot->seq_nr - slot->seq_nr_last_acked) > 1) {
+ slot->seq_nr--;
+ if (task->tk_msg.rpc_proc != &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE]) {
+ nfs4_probe_sequence(clp, task->tk_msg.rpc_cred, slot);
+ res->sr_slot = NULL;
+ }
+ goto retry_nowait;
+ }
+ /*
+ * RFC5661:
+ * A retry might be sent while the original request is
+ * still in progress on the replier. The replier SHOULD
+ * deal with the issue by returning NFS4ERR_DELAY as the
+ * reply to SEQUENCE or CB_SEQUENCE operation, but
+ * implementations MAY return NFS4ERR_SEQ_MISORDERED.
+ *
+ * Restart the search after a delay.
+ */
+ slot->seq_nr = slot->seq_nr_highest_sent;
+ goto out_retry;
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ goto session_recover;
+ default:
+ /* Just update the slot sequence no. */
+ slot->seq_done = 1;
+ }
+out:
+ /* The session may be reset by one of the error handlers. */
+ dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
+out_noaction:
+ return ret;
+session_recover:
+ /* Mark the slot table as draining and schedule session recovery. */
+ set_bit(NFS4_SLOT_TBL_DRAINING, &session->fc_slot_table.slot_tbl_state);
+ nfs4_schedule_session_recovery(session, status);
+ dprintk("%s ERROR: %d Reset session\n", __func__, status);
+ nfs41_sequence_free_slot(res);
+ goto out;
+retry_new_seq:
+ ++slot->seq_nr;
+retry_nowait:
+ /* Restart immediately; the slot is released first. */
+ if (rpc_restart_call_prepare(task)) {
+ nfs41_sequence_free_slot(res);
+ task->tk_status = 0;
+ ret = 0;
+ }
+ goto out;
+out_retry:
+ /* Restart after the maximum poll delay; the slot is kept. */
+ if (!rpc_restart_call(task))
+ goto out;
+ rpc_delay(task, NFS4_POLL_RETRY_MAX);
+ return 0;
+}
+
+/*
+ * Finish SEQUENCE handling for an NFSv4.1 call.
+ *
+ * Returns 0 when nfs41_sequence_process() returns 0.  Otherwise any slot
+ * still attached to @res is released and 1 is returned.
+ */
+int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+	int processed = nfs41_sequence_process(task, res);
+
+	if (processed == 0)
+		return 0;
+	if (res->sr_slot)
+		nfs41_sequence_free_slot(res);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(nfs41_sequence_done);
+
+/*
+ * Dispatch sequence processing on the slot attached to @res: slots whose
+ * table has a session go to nfs41_sequence_process(), all others to
+ * nfs40_sequence_done().  Returns 1 when no slot is attached.
+ */
+static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+	struct nfs4_slot *slot = res->sr_slot;
+
+	if (!slot)
+		return 1;
+	if (slot->table->session)
+		return nfs41_sequence_process(task, res);
+	return nfs40_sequence_done(task, res);
+}
+
+/*
+ * Release the slot attached to @res, if any, using the v4.1 or v4.0
+ * helper depending on whether the slot's table belongs to a session.
+ */
+static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res)
+{
+	struct nfs4_slot *slot = res->sr_slot;
+
+	if (!slot)
+		return;
+	if (slot->table->session)
+		nfs41_sequence_free_slot(res);
+	else
+		nfs40_sequence_free_slot(res);
+}
+
+/*
+ * Complete sequence handling for @task.  Returns 1 when no slot is
+ * attached to @res; otherwise returns the result of the v4.1 or v4.0
+ * "done" helper, chosen by whether the slot's table has a session.
+ */
+int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+	struct nfs4_slot *slot = res->sr_slot;
+
+	if (slot == NULL)
+		return 1;
+	if (slot->table->session)
+		return nfs41_sequence_done(task, res);
+	return nfs40_sequence_done(task, res);
+}
+EXPORT_SYMBOL_GPL(nfs4_sequence_done);
+
+/* rpc_call_prepare callback: set up the sequence for a synchronous call. */
+static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_call_sync_data *sync = calldata;
+
+	dprintk("--> %s data->seq_server %p\n", __func__, sync->seq_server);
+
+	nfs4_setup_sequence(sync->seq_server->nfs_client,
+			    sync->seq_args, sync->seq_res, task);
+}
+
+/* rpc_call_done callback: run nfs41_sequence_done() on the call's result. */
+static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_call_sync_data *sync = calldata;
+	struct nfs4_sequence_res *res = sync->seq_res;
+
+	nfs41_sequence_done(task, res);
+}
+
+/* RPC callbacks for synchronous calls that go through nfs41_sequence_done(). */
+static const struct rpc_call_ops nfs41_call_sync_ops = {
+ .rpc_call_prepare = nfs41_call_sync_prepare,
+ .rpc_call_done = nfs41_call_sync_done,
+};
+
+#else /* !CONFIG_NFS_V4_1 */
+
+/* Without CONFIG_NFS_V4_1 only the v4.0 completion path exists. */
+static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+	int status = nfs40_sequence_done(task, res);
+
+	return status;
+}
+
+/* Release the v4.0 slot attached to @res, if there is one. */
+static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res)
+{
+	if (!res->sr_slot)
+		return;
+	nfs40_sequence_free_slot(res);
+}
+
+/* Without CONFIG_NFS_V4_1 this simply forwards to nfs40_sequence_done(). */
+int nfs4_sequence_done(struct rpc_task *task,
+		       struct nfs4_sequence_res *res)
+{
+	int status = nfs40_sequence_done(task, res);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(nfs4_sequence_done);
+
+#endif /* !CONFIG_NFS_V4_1 */
+
+/*
+ * Reset the per-call sequence result: status 1, no status flags, and a
+ * timestamp taken from the current jiffies value.
+ */
+static void nfs41_sequence_res_init(struct nfs4_sequence_res *res)
+{
+	res->sr_status = 1;
+	res->sr_status_flags = 0;
+	res->sr_timestamp = jiffies;
+}
+
+/*
+ * Bind @slot to both @args and @res and record whether the request is
+ * privileged.  A NULL @slot is ignored.
+ */
+static
+void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args,
+			       struct nfs4_sequence_res *res,
+			       struct nfs4_slot *slot)
+{
+	if (slot) {
+		slot->privileged = !!args->sa_privileged;
+		args->sa_slot = slot;
+		res->sr_slot = slot;
+	}
+}
+
+/*
+ * Attach a slot to @args/@res and start the RPC call for @task.
+ *
+ * The slot table is client->cl_slot_tbl, or the session's fc_slot_table
+ * when nfs4_get_session() returns a session.
+ *
+ * Returns 0 after rpc_call_start() has been called.  Returns -EAGAIN
+ * after putting @task to sleep on the table's slot_tbl_waitq, which
+ * happens when the table is draining and the request is not privileged,
+ * or when nfs4_alloc_slot() returns an error.
+ */
+int nfs4_setup_sequence(struct nfs_client *client,
+ struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ struct rpc_task *task)
+{
+ struct nfs4_session *session = nfs4_get_session(client);
+ struct nfs4_slot_table *tbl = client->cl_slot_tbl;
+ struct nfs4_slot *slot;
+
+ /* slot already allocated? */
+ if (res->sr_slot != NULL)
+ goto out_start;
+
+ if (session)
+ tbl = &session->fc_slot_table;
+
+ spin_lock(&tbl->slot_tbl_lock);
+ /* The state manager will wait until the slot table is empty */
+ if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
+ goto out_sleep;
+
+ slot = nfs4_alloc_slot(tbl);
+ if (IS_ERR(slot)) {
+ /* -ENOMEM sleeps with a timeout; any other error sleeps without one */
+ if (slot == ERR_PTR(-ENOMEM))
+ goto out_sleep_timeout;
+ goto out_sleep;
+ }
+ spin_unlock(&tbl->slot_tbl_lock);
+
+ nfs4_sequence_attach_slot(args, res, slot);
+
+ trace_nfs4_setup_sequence(session, args);
+out_start:
+ nfs41_sequence_res_init(res);
+ rpc_call_start(task);
+ return 0;
+out_sleep_timeout:
+ /* Try again in 1/4 second */
+ if (args->sa_privileged)
+ rpc_sleep_on_priority_timeout(&tbl->slot_tbl_waitq, task,
+ jiffies + (HZ >> 2), RPC_PRIORITY_PRIVILEGED);
+ else
+ rpc_sleep_on_timeout(&tbl->slot_tbl_waitq, task,
+ NULL, jiffies + (HZ >> 2));
+ spin_unlock(&tbl->slot_tbl_lock);
+ return -EAGAIN;
+out_sleep:
+ /* The task is queued while slot_tbl_lock is still held */
+ if (args->sa_privileged)
+ rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task,
+ RPC_PRIORITY_PRIVILEGED);
+ else
+ rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
+ spin_unlock(&tbl->slot_tbl_lock);
+ return -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(nfs4_setup_sequence);
+
+/* rpc_call_prepare callback: set up the sequence for a synchronous call. */
+static void nfs40_call_sync_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_call_sync_data *sync = calldata;
+
+	nfs4_setup_sequence(sync->seq_server->nfs_client,
+			    sync->seq_args, sync->seq_res, task);
+}
+
+/* rpc_call_done callback: run nfs4_sequence_done() on the call's result. */
+static void nfs40_call_sync_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_call_sync_data *sync = calldata;
+	struct nfs4_sequence_res *res = sync->seq_res;
+
+	nfs4_sequence_done(task, res);
+}
+
+/* RPC callbacks for synchronous calls that go through nfs4_sequence_done(). */
+static const struct rpc_call_ops nfs40_call_sync_ops = {
+ .rpc_call_prepare = nfs40_call_sync_prepare,
+ .rpc_call_done = nfs40_call_sync_done,
+};
+
+/*
+ * Run the task described by @task_setup and return its tk_status, or the
+ * error encoded in the pointer when rpc_run_task() fails.
+ */
+static int nfs4_call_sync_custom(struct rpc_task_setup *task_setup)
+{
+	struct rpc_task *task = rpc_run_task(task_setup);
+	int status;
+
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	status = task->tk_status;
+	rpc_put_task(task);
+	return status;
+}
+
+/*
+ * Build the call-sync callback data and task setup for @msg, using the
+ * call_sync_ops of the server's minor-version operations, then run the
+ * task synchronously via nfs4_call_sync_custom().
+ */
+static int nfs4_do_call_sync(struct rpc_clnt *clnt,
+			     struct nfs_server *server,
+			     struct rpc_message *msg,
+			     struct nfs4_sequence_args *args,
+			     struct nfs4_sequence_res *res,
+			     unsigned short task_flags)
+{
+	struct nfs4_call_sync_data sync_data = {
+		.seq_res = res,
+		.seq_args = args,
+		.seq_server = server,
+	};
+	struct rpc_task_setup setup = {
+		.flags = task_flags,
+		.callback_data = &sync_data,
+		.callback_ops = server->nfs_client->cl_mvops->call_sync_ops,
+		.rpc_message = msg,
+		.rpc_client = clnt,
+	};
+
+	return nfs4_call_sync_custom(&setup);
+}
+
+/* Synchronous call with no extra RPC task flags. */
+static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
+				   struct nfs_server *server,
+				   struct rpc_message *msg,
+				   struct nfs4_sequence_args *args,
+				   struct nfs4_sequence_res *res)
+{
+	const unsigned short no_flags = 0;
+
+	return nfs4_do_call_sync(clnt, server, msg, args, res, no_flags);
+}
+
+
+/*
+ * Initialise the sequence arguments/results (passing 0 as the last
+ * argument of nfs4_init_sequence()) and perform the call synchronously.
+ */
+int nfs4_call_sync(struct rpc_clnt *clnt,
+		   struct nfs_server *server,
+		   struct rpc_message *msg,
+		   struct nfs4_sequence_args *args,
+		   struct nfs4_sequence_res *res,
+		   int cache_reply)
+{
+	int status;
+
+	nfs4_init_sequence(args, res, cache_reply, 0);
+	status = nfs4_call_sync_sequence(clnt, server, msg, args, res);
+	return status;
+}
+
+/* Mark NFS_INO_INVALID_OTHER on @inode and bump its link count. */
+static void
+nfs4_inc_nlink_locked(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	nfsi->cache_validity |= NFS_INO_INVALID_OTHER;
+	inc_nlink(inode);
+}
+
+/* Mark NFS_INO_INVALID_OTHER on @inode and drop its link count. */
+static void
+nfs4_dec_nlink_locked(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	nfsi->cache_validity |= NFS_INO_INVALID_OTHER;
+	drop_nlink(inode);
+}
+
+/*
+ * Apply the change information in @cinfo to @inode.
+ *
+ * ctime and mtime (plus the caller's @cache_validity bits) are always
+ * flagged invalid.  When the update was atomic and cinfo->before matches
+ * the cached change attribute, NFS_INO_REVAL_PAGECACHE is cleared and
+ * attrtimeo_timestamp is refreshed; otherwise more cached state is
+ * invalidated.  The change attribute is then set to cinfo->after and
+ * @timestamp is stored in read_cache_jiffies.
+ *
+ * NOTE(review): the _locked suffix suggests the caller holds
+ * inode->i_lock, as nfs4_update_changeattr() does - confirm for other
+ * callers.
+ */
+static void
+nfs4_update_changeattr_locked(struct inode *inode,
+ struct nfs4_change_info *cinfo,
+ unsigned long timestamp, unsigned long cache_validity)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ nfsi->cache_validity |= NFS_INO_INVALID_CTIME
+ | NFS_INO_INVALID_MTIME
+ | cache_validity;
+
+ if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(inode)) {
+ /* Atomic update on top of the version we had cached */
+ nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
+ nfsi->attrtimeo_timestamp = jiffies;
+ } else {
+ if (S_ISDIR(inode->i_mode)) {
+ nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+ nfs_force_lookup_revalidate(inode);
+ } else {
+ /* Non-directory: skip the revalidation flag if we hold a read delegation */
+ if (!NFS_PROTO(inode)->have_delegation(inode,
+ FMODE_READ))
+ nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
+ }
+
+ /* The cached version was not the "before" version */
+ if (cinfo->before != inode_peek_iversion_raw(inode))
+ nfsi->cache_validity |= NFS_INO_INVALID_ACCESS |
+ NFS_INO_INVALID_ACL |
+ NFS_INO_INVALID_XATTR;
+ }
+ inode_set_iversion_raw(inode, cinfo->after);
+ nfsi->read_cache_jiffies = timestamp;
+ nfsi->attr_gencount = nfs_inc_attr_generation_counter();
+ nfsi->cache_validity &= ~NFS_INO_INVALID_CHANGE;
+
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ nfs_fscache_invalidate(inode);
+}
+
+/*
+ * Locked wrapper: take dir->i_lock around
+ * nfs4_update_changeattr_locked().
+ */
+void
+nfs4_update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo,
+		       unsigned long timestamp, unsigned long cache_validity)
+{
+	spinlock_t *lock = &dir->i_lock;
+
+	spin_lock(lock);
+	nfs4_update_changeattr_locked(dir, cinfo, timestamp, cache_validity);
+	spin_unlock(lock);
+}
+
+/*
+ * Creation attributes handed to nfs4_opendata_alloc(); consulted there
+ * when the open flags include O_CREAT.
+ */
+struct nfs4_open_createattrs {
+ struct nfs4_label *label; /* copied into the OPEN arguments via nfs4_label_copy() */
+ struct iattr *sattr; /* copied into the opendata when non-NULL and ia_valid != 0 */
+ const __u32 verf[2]; /* copied into the OPEN arguments' verifier data */
+};
+
+/*
+ * On -EINVAL from a server that currently advertises
+ * NFS_CAP_ATOMIC_OPEN_V1, drop that capability and request a retry.
+ * Returns true only when the capability was cleared.
+ */
+static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server,
+		int err, struct nfs4_exception *exception)
+{
+	if (err != -EINVAL || !(server->caps & NFS_CAP_ATOMIC_OPEN_V1))
+		return false;
+
+	server->caps &= ~NFS_CAP_ATOMIC_OPEN_V1;
+	exception->retry = 1;
+	return true;
+}
+
+/* Read/write/exec bits of the open context's mode. */
+static fmode_t _nfs4_ctx_to_accessmode(const struct nfs_open_context *ctx)
+{
+	const fmode_t access_bits = FMODE_READ | FMODE_WRITE | FMODE_EXEC;
+
+	return ctx->mode & access_bits;
+}
+
+/*
+ * Read/write bits of the open context's mode; FMODE_EXEC additionally
+ * implies FMODE_READ.
+ */
+static fmode_t _nfs4_ctx_to_openmode(const struct nfs_open_context *ctx)
+{
+	fmode_t openmode = ctx->mode & (FMODE_READ|FMODE_WRITE);
+
+	if (ctx->mode & FMODE_EXEC)
+		openmode |= FMODE_READ;
+	return openmode;
+}
+
+/*
+ * Translate an open mode into NFSv4 share-access bits.  When the server
+ * has NFS_CAP_ATOMIC_OPEN_V1 and the open uses O_DIRECT, also ask for no
+ * delegation.
+ */
+static u32
+nfs4_map_atomic_open_share(struct nfs_server *server,
+		fmode_t fmode, int openflags)
+{
+	const fmode_t rw = fmode & (FMODE_READ | FMODE_WRITE);
+	u32 share = 0;
+
+	if (rw == FMODE_READ)
+		share = NFS4_SHARE_ACCESS_READ;
+	else if (rw == FMODE_WRITE)
+		share = NFS4_SHARE_ACCESS_WRITE;
+	else if (rw == (FMODE_READ|FMODE_WRITE))
+		share = NFS4_SHARE_ACCESS_BOTH;
+
+	/* Want no delegation if we're using O_DIRECT */
+	if ((server->caps & NFS_CAP_ATOMIC_OPEN_V1) && (openflags & O_DIRECT))
+		share |= NFS4_SHARE_WANT_NO_DELEG;
+	return share;
+}
+
+/*
+ * Servers without NFS_CAP_ATOMIC_OPEN_V1 get the *_FH claim types mapped
+ * to their non-FH counterparts; every other claim passes through.
+ */
+static enum open_claim_type4
+nfs4_map_atomic_open_claim(struct nfs_server *server,
+		enum open_claim_type4 claim)
+{
+	if (server->caps & NFS_CAP_ATOMIC_OPEN_V1)
+		return claim;
+
+	if (claim == NFS4_OPEN_CLAIM_FH)
+		return NFS4_OPEN_CLAIM_NULL;
+	if (claim == NFS4_OPEN_CLAIM_DELEG_CUR_FH)
+		return NFS4_OPEN_CLAIM_DELEGATE_CUR;
+	if (claim == NFS4_OPEN_CLAIM_DELEG_PREV_FH)
+		return NFS4_OPEN_CLAIM_DELEGATE_PREV;
+	return claim;
+}
+
+/*
+ * Point the OPEN/OPEN_CONFIRM results at the opendata's own buffers and
+ * mirror the relevant argument fields, then reset the attribute buffer.
+ */
+static void nfs4_init_opendata_res(struct nfs4_opendata *p)
+{
+	/* Values mirrored from the arguments */
+	p->o_res.server = p->o_arg.server;
+	p->o_res.access_request = p->o_arg.access;
+	p->o_res.seqid = p->o_arg.seqid;
+	p->c_res.seqid = p->c_arg.seqid;
+
+	/* Result buffers owned by the opendata */
+	p->o_res.f_label = p->f_label;
+	p->o_res.f_attr = &p->f_attr;
+
+	nfs_fattr_init(&p->f_attr);
+	nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name);
+}
+
+/*
+ * Allocate and initialise the state needed for an OPEN of @dentry.
+ *
+ * Takes references on the parent dentry, @dentry, the superblock and the
+ * state owner @sp; they are dropped again in nfs4_opendata_free().
+ * @c supplies creation attributes and may be NULL.
+ *
+ * Returns the new opendata with its kref initialised, or NULL when any
+ * allocation fails (in which case only the parent reference had been
+ * taken and it is released here).
+ */
+static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
+ struct nfs4_state_owner *sp, fmode_t fmode, int flags,
+ const struct nfs4_open_createattrs *c,
+ enum open_claim_type4 claim,
+ gfp_t gfp_mask)
+{
+ struct dentry *parent = dget_parent(dentry);
+ struct inode *dir = d_inode(parent);
+ struct nfs_server *server = NFS_SERVER(dir);
+ struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+ struct nfs4_label *label = (c != NULL) ? c->label : NULL;
+ struct nfs4_opendata *p;
+
+ p = kzalloc(sizeof(*p), gfp_mask);
+ if (p == NULL)
+ goto err;
+
+ p->f_label = nfs4_label_alloc(server, gfp_mask);
+ if (IS_ERR(p->f_label))
+ goto err_free_p;
+
+ p->a_label = nfs4_label_alloc(server, gfp_mask);
+ if (IS_ERR(p->a_label))
+ goto err_free_f;
+
+ alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
+ p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask);
+ if (IS_ERR(p->o_arg.seqid))
+ goto err_free_label;
+ /* No failure paths beyond this point */
+ nfs_sb_active(dentry->d_sb);
+ p->dentry = dget(dentry);
+ p->dir = parent;
+ p->owner = sp;
+ atomic_inc(&sp->so_count);
+ p->o_arg.open_flags = flags;
+ p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
+ p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
+ p->o_arg.share_access = nfs4_map_atomic_open_share(server,
+ fmode, flags);
+ if (flags & O_CREAT) {
+ p->o_arg.umask = current_umask();
+ p->o_arg.label = nfs4_label_copy(p->a_label, label);
+ /* NOTE(review): c is dereferenced unguarded here; presumably O_CREAT callers always pass non-NULL c - confirm */
+ if (c->sattr != NULL && c->sattr->ia_valid != 0) {
+ p->o_arg.u.attrs = &p->attrs;
+ memcpy(&p->attrs, c->sattr, sizeof(p->attrs));
+
+ memcpy(p->o_arg.u.verifier.data, c->verf,
+ sizeof(p->o_arg.u.verifier.data));
+ }
+ }
+ /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
+ * will return permission denied for all bits until close */
+ if (!(flags & O_EXCL)) {
+ /* ask server to check for all possible rights as results
+ * are cached */
+ switch (p->o_arg.claim) {
+ default:
+ break;
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_FH:
+ p->o_arg.access = NFS4_ACCESS_READ |
+ NFS4_ACCESS_MODIFY |
+ NFS4_ACCESS_EXTEND |
+ NFS4_ACCESS_EXECUTE;
+#ifdef CONFIG_NFS_V4_2
+ if (server->caps & NFS_CAP_XATTR)
+ p->o_arg.access |= NFS4_ACCESS_XAREAD |
+ NFS4_ACCESS_XAWRITE |
+ NFS4_ACCESS_XALIST;
+#endif
+ }
+ }
+ p->o_arg.clientid = server->nfs_client->cl_clientid;
+ p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time);
+ p->o_arg.id.uniquifier = sp->so_seqid.owner_id;
+ p->o_arg.name = &dentry->d_name;
+ p->o_arg.server = server;
+ p->o_arg.bitmask = nfs4_bitmask(server, label);
+ p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0];
+ /* Claims by name use the directory's filehandle, the others the file's own */
+ switch (p->o_arg.claim) {
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+ p->o_arg.fh = NFS_FH(dir);
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ case NFS4_OPEN_CLAIM_FH:
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+ p->o_arg.fh = NFS_FH(d_inode(dentry));
+ }
+ /* The confirm arguments reuse the filehandle, stateid and seqid of the OPEN */
+ p->c_arg.fh = &p->o_res.fh;
+ p->c_arg.stateid = &p->o_res.stateid;
+ p->c_arg.seqid = p->o_arg.seqid;
+ nfs4_init_opendata_res(p);
+ kref_init(&p->kref);
+ return p;
+
+err_free_label:
+ nfs4_label_free(p->a_label);
+err_free_f:
+ nfs4_label_free(p->f_label);
+err_free_p:
+ kfree(p);
+err:
+ dput(parent);
+ return NULL;
+}
+
+/*
+ * kref release callback for struct nfs4_opendata: drops every reference
+ * and allocation held by the opendata, then frees it.
+ */
+static void nfs4_opendata_free(struct kref *kref)
+{
+ struct nfs4_opendata *p = container_of(kref,
+ struct nfs4_opendata, kref);
+ /* Remember the superblock before the dentry reference is dropped */
+ struct super_block *sb = p->dentry->d_sb;
+
+ nfs4_lgopen_release(p->lgp);
+ nfs_free_seqid(p->o_arg.seqid);
+ nfs4_sequence_free_slot(&p->o_res.seq_res);
+ if (p->state != NULL)
+ nfs4_put_open_state(p->state);
+ nfs4_put_state_owner(p->owner);
+
+ nfs4_label_free(p->a_label);
+ nfs4_label_free(p->f_label);
+
+ dput(p->dir);
+ dput(p->dentry);
+ nfs_sb_deactive(sb);
+ nfs_fattr_free_names(&p->f_attr);
+ kfree(p->f_attr.mdsthreshold);
+ kfree(p);
+}
+
+/* Drop one reference on @p; a NULL pointer is accepted and ignored. */
+static void nfs4_opendata_put(struct nfs4_opendata *p)
+{
+	if (!p)
+		return;
+	kref_put(&p->kref, nfs4_opendata_free);
+}
+
+/*
+ * Report whether @state has a non-zero open count for the read/write
+ * combination in @fmode.  A mode with neither bit set triggers a
+ * one-time warning and yields false.
+ */
+static bool nfs4_mode_match_open_stateid(struct nfs4_state *state,
+		fmode_t fmode)
+{
+	const fmode_t rw = fmode & (FMODE_READ|FMODE_WRITE);
+
+	if (rw == (FMODE_READ|FMODE_WRITE))
+		return state->n_rdwr != 0;
+	if (rw == FMODE_WRITE)
+		return state->n_wronly != 0;
+	if (rw == FMODE_READ)
+		return state->n_rdonly != 0;
+
+	WARN_ON_ONCE(1);
+	return false;
+}
+
+/*
+ * Decide whether an open can be satisfied from existing open state.
+ *
+ * Never for O_EXCL/O_TRUNC opens or for CLAIM_NULL/CLAIM_FH.  Otherwise
+ * the state flag for the requested mode must be set and the matching
+ * open count must be non-zero.  Returns 1 if so, 0 if not.
+ */
+static int can_open_cached(struct nfs4_state *state, fmode_t mode,
+		int open_mode, enum open_claim_type4 claim)
+{
+	if (open_mode & (O_EXCL|O_TRUNC))
+		return 0;
+	if (claim == NFS4_OPEN_CLAIM_NULL || claim == NFS4_OPEN_CLAIM_FH)
+		return 0;
+
+	switch (mode & (FMODE_READ|FMODE_WRITE)) {
+	case FMODE_READ:
+		return test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0
+			&& state->n_rdonly != 0;
+	case FMODE_WRITE:
+		return test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0
+			&& state->n_wronly != 0;
+	case FMODE_READ|FMODE_WRITE:
+		return test_bit(NFS_O_RDWR_STATE, &state->flags) != 0
+			&& state->n_rdwr != 0;
+	}
+	return 0;
+}
+
+/*
+ * Decide whether @delegation can satisfy an open with mode @fmode.
+ *
+ * The delegation must exist and cover @fmode.  CLAIM_NULL and CLAIM_FH
+ * qualify; CLAIM_PREVIOUS qualifies only while the delegation is not
+ * flagged NFS_DELEGATION_NEED_RECLAIM; all other claims do not.  On
+ * success the delegation is marked referenced and 1 is returned.
+ */
+static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode,
+		enum open_claim_type4 claim)
+{
+	if (delegation == NULL || (delegation->type & fmode) != fmode)
+		return 0;
+
+	if (claim == NFS4_OPEN_CLAIM_PREVIOUS) {
+		if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
+			return 0;
+	} else if (claim != NFS4_OPEN_CLAIM_NULL &&
+		   claim != NFS4_OPEN_CLAIM_FH) {
+		return 0;
+	}
+
+	nfs_mark_delegation_referenced(delegation);
+	return 1;
+}
+
+/*
+ * Bump the open counter matching @fmode exactly and fold @fmode into the
+ * state's mode.
+ */
+static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
+{
+	if (fmode == FMODE_WRITE)
+		state->n_wronly++;
+	else if (fmode == FMODE_READ)
+		state->n_rdonly++;
+	else if (fmode == (FMODE_READ|FMODE_WRITE))
+		state->n_rdwr++;
+
+	nfs4_state_set_mode_locked(state, state->state | fmode);
+}
+
+#ifdef CONFIG_NFS_V4_1
+/*
+ * True when some open mode has a non-zero open count but its state flag
+ * is not set.
+ */
+static bool nfs_open_stateid_recover_openmode(struct nfs4_state *state)
+{
+	return (state->n_rdonly &&
+		!test_bit(NFS_O_RDONLY_STATE, &state->flags)) ||
+	       (state->n_wronly &&
+		!test_bit(NFS_O_WRONLY_STATE, &state->flags)) ||
+	       (state->n_rdwr &&
+		!test_bit(NFS_O_RDWR_STATE, &state->flags));
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/* Wake everyone on state->waitq if NFS_STATE_CHANGE_WAIT was set. */
+static void nfs_state_log_update_open_stateid(struct nfs4_state *state)
+{
+	if (!test_and_clear_bit(NFS_STATE_CHANGE_WAIT, &state->flags))
+		return;
+	wake_up_all(&state->waitq);
+}
+
+/*
+ * Clear all three per-mode open state flags.  If any flag was set while
+ * its open count is non-zero, mark the state for no-grace reclaim.
+ */
+static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
+{
+	struct nfs_client *clp = state->owner->so_server->nfs_client;
+	bool lost_rdonly, lost_wronly, lost_rdwr;
+
+	/* All three bits are always cleared, in this order */
+	lost_rdonly = test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags) &&
+		      state->n_rdonly;
+	lost_wronly = test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags) &&
+		      state->n_wronly;
+	lost_rdwr = test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags) &&
+		    state->n_rdwr;
+
+	if (lost_rdonly || lost_wronly || lost_rdwr)
+		nfs4_state_mark_reclaim_nograce(clp, state);
+}
+
+/*
+ * Decide whether @stateid may replace the current open stateid now.
+ *
+ * If the state is open and @stateid shares its "other" field with the
+ * current open stateid, it must be the next seqid in line.  Otherwise
+ * (first OPEN of this generation, or the server handed out a new
+ * stateid) its seqid must be 1.
+ *
+ * Note: this depends on the server following RFC7530 Section 9.1.4.2
+ * and RFC5661 Section 8.2.2, i.e. stateid seqids start at 1 and are
+ * incremented on every state transition.
+ */
+static bool nfs_stateid_is_sequential(struct nfs4_state *state,
+		const nfs4_stateid *stateid)
+{
+	/* The common case - we're updating to a new sequence number */
+	if (test_bit(NFS_OPEN_STATE, &state->flags) &&
+	    nfs4_stateid_match_other(stateid, &state->open_stateid))
+		return nfs4_stateid_is_next(&state->open_stateid, stateid);
+
+	/* New stateid, or the first OPEN in this generation */
+	return stateid->seqid == cpu_to_be32(1);
+}
+
+/*
+ * Re-set the per-mode state flags for every mode that still has a
+ * non-zero open count, and NFS_OPEN_STATE if there is at least one.
+ */
+static void nfs_resync_open_stateid_locked(struct nfs4_state *state)
+{
+	bool have_opens = false;
+
+	if (state->n_wronly) {
+		set_bit(NFS_O_WRONLY_STATE, &state->flags);
+		have_opens = true;
+	}
+	if (state->n_rdonly) {
+		set_bit(NFS_O_RDONLY_STATE, &state->flags);
+		have_opens = true;
+	}
+	if (state->n_rdwr) {
+		set_bit(NFS_O_RDWR_STATE, &state->flags);
+		have_opens = true;
+	}
+	if (have_opens)
+		set_bit(NFS_OPEN_STATE, &state->flags);
+}
+
+/*
+ * Clear the open-mode flags that no longer apply once the state has been
+ * reduced to @fmode, and optionally install @stateid as the new open
+ * stateid.
+ *
+ * NFS_O_RDWR_STATE is always cleared; the remaining flags depend on which
+ * of read/write remain in @fmode (none remaining also clears
+ * NFS_OPEN_STATE).  With a NULL @stateid only the flags are touched.
+ */
+static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
+ nfs4_stateid *stateid, fmode_t fmode)
+{
+ clear_bit(NFS_O_RDWR_STATE, &state->flags);
+ switch (fmode & (FMODE_READ|FMODE_WRITE)) {
+ case FMODE_WRITE:
+ clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+ break;
+ case FMODE_READ:
+ clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+ break;
+ case 0:
+ clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+ clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+ clear_bit(NFS_OPEN_STATE, &state->flags);
+ }
+ if (stateid == NULL)
+ return;
+ /* Handle OPEN+OPEN_DOWNGRADE races */
+ /* Same stateid but not newer than ours: restore the flags from the open counts */
+ if (nfs4_stateid_match_other(stateid, &state->open_stateid) &&
+ !nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
+ nfs_resync_open_stateid_locked(state);
+ goto out;
+ }
+ /* While delegated, state->stateid is left untouched */
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
+ nfs4_stateid_copy(&state->stateid, stateid);
+ nfs4_stateid_copy(&state->open_stateid, stateid);
+ trace_nfs4_open_stateid_update(state->inode, stateid, 0);
+out:
+ nfs_state_log_update_open_stateid(state);
+}
+
+/*
+ * Seqlock-protected wrapper around nfs_clear_open_stateid_locked().
+ * Afterwards the state manager is scheduled if the state is flagged
+ * NFS_STATE_RECLAIM_NOGRACE.
+ */
+static void nfs_clear_open_stateid(struct nfs4_state *state,
+ nfs4_stateid *arg_stateid,
+ nfs4_stateid *stateid, fmode_t fmode)
+{
+ write_seqlock(&state->seqlock);
+ /* Ignore, if the CLOSE argument doesn't match the current stateid */
+ if (nfs4_state_match_open_stateid_other(state, arg_stateid))
+ nfs_clear_open_stateid_locked(state, stateid, fmode);
+ write_sequnlock(&state->seqlock);
+ if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
+ nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
+}
+
+/*
+ * Install @stateid as the open stateid of @state, waiting if necessary
+ * until it is the next one in sequence.
+ *
+ * Called with so_lock, the state seqlock and the RCU read lock held; all
+ * three are dropped while sleeping and re-taken before the loop retries.
+ * The wait ends when the stateid becomes sequential, after one wait that
+ * timed out (5*HZ) or was cut short by a fatal signal, or immediately
+ * when the client has no session.
+ *
+ * If the state was open with a different stateid, that old stateid is
+ * copied to @freeme.
+ */
+static void nfs_set_open_stateid_locked(struct nfs4_state *state,
+ const nfs4_stateid *stateid, nfs4_stateid *freeme)
+ __must_hold(&state->owner->so_lock)
+ __must_hold(&state->seqlock)
+ __must_hold(RCU)
+
+{
+ DEFINE_WAIT(wait);
+ int status = 0;
+ for (;;) {
+
+ if (nfs_stateid_is_sequential(state, stateid))
+ break;
+
+ /* The previous wait timed out or was interrupted: stop waiting */
+ if (status)
+ break;
+ /* Rely on seqids for serialisation with NFSv4.0 */
+ if (!nfs4_has_session(NFS_SERVER(state->inode)->nfs_client))
+ break;
+
+ set_bit(NFS_STATE_CHANGE_WAIT, &state->flags);
+ prepare_to_wait(&state->waitq, &wait, TASK_KILLABLE);
+ /*
+ * Ensure we process the state changes in the same order
+ * in which the server processed them by delaying the
+ * update of the stateid until we are in sequence.
+ */
+ write_sequnlock(&state->seqlock);
+ spin_unlock(&state->owner->so_lock);
+ rcu_read_unlock();
+ trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0);
+
+ if (!fatal_signal_pending(current)) {
+ if (schedule_timeout(5*HZ) == 0)
+ status = -EAGAIN;
+ else
+ status = 0;
+ } else
+ status = -EINTR;
+ finish_wait(&state->waitq, &wait);
+ /* Re-take the locks in the order the caller acquired them */
+ rcu_read_lock();
+ spin_lock(&state->owner->so_lock);
+ write_seqlock(&state->seqlock);
+ }
+
+ /* A different stateid replaces the current one: hand the old one back */
+ if (test_bit(NFS_OPEN_STATE, &state->flags) &&
+ !nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+ nfs4_stateid_copy(freeme, &state->open_stateid);
+ nfs_test_and_clear_all_open_stateid(state);
+ }
+
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
+ nfs4_stateid_copy(&state->stateid, stateid);
+ nfs4_stateid_copy(&state->open_stateid, stateid);
+ trace_nfs4_open_stateid_update(state->inode, stateid, status);
+ nfs_state_log_update_open_stateid(state);
+}
+
+/*
+ * Under the state seqlock: install @open_stateid, set the state flag
+ * matching @fmode exactly, and mark the state as open.
+ */
+static void nfs_state_set_open_stateid(struct nfs4_state *state,
+		const nfs4_stateid *open_stateid,
+		fmode_t fmode,
+		nfs4_stateid *freeme)
+{
+	/* Serialise the stateid update and the flag changes */
+	write_seqlock(&state->seqlock);
+	nfs_set_open_stateid_locked(state, open_stateid, freeme);
+
+	if (fmode == FMODE_READ)
+		set_bit(NFS_O_RDONLY_STATE, &state->flags);
+	else if (fmode == FMODE_WRITE)
+		set_bit(NFS_O_WRONLY_STATE, &state->flags);
+	else if (fmode == (FMODE_READ|FMODE_WRITE))
+		set_bit(NFS_O_RDWR_STATE, &state->flags);
+
+	set_bit(NFS_OPEN_STATE, &state->flags);
+	write_sequnlock(&state->seqlock);
+}
+
+/* Clear the three per-mode open flags followed by NFS_OPEN_STATE. */
+static void nfs_state_clear_open_state_flags(struct nfs4_state *state)
+{
+	static const int open_bits[] = {
+		NFS_O_RDWR_STATE,
+		NFS_O_WRONLY_STATE,
+		NFS_O_RDONLY_STATE,
+		NFS_OPEN_STATE,
+	};
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(open_bits); i++)
+		clear_bit(open_bits[i], &state->flags);
+}
+
+/*
+ * Make @deleg_stateid the state's current stateid and flag the state as
+ * delegated.  @fmode is not used by this function.
+ */
+static void nfs_state_set_delegation(struct nfs4_state *state,
+ const nfs4_stateid *deleg_stateid,
+ fmode_t fmode)
+{
+ /*
+ * Serialise the stateid update and the NFS_DELEGATED_STATE flag
+ * change under the state seqlock.
+ */
+ write_seqlock(&state->seqlock);
+ nfs4_stateid_copy(&state->stateid, deleg_stateid);
+ set_bit(NFS_DELEGATED_STATE, &state->flags);
+ write_sequnlock(&state->seqlock);
+}
+
+/*
+ * Under the state seqlock: fall back from the delegation stateid to the
+ * open stateid and clear NFS_DELEGATED_STATE.
+ */
+static void nfs_state_clear_delegation(struct nfs4_state *state)
+{
+	seqlock_t *lock = &state->seqlock;
+
+	write_seqlock(lock);
+	nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+	clear_bit(NFS_DELEGATED_STATE, &state->flags);
+	write_sequnlock(lock);
+}
+
+/*
+ * Update @state after an open, from an open stateid and/or a delegation.
+ *
+ * @open_stateid: new open stateid, or NULL if there is none.
+ * @delegation: delegation stateid that must match the inode's current
+ * delegation, or NULL to accept whatever delegation is current.
+ * @fmode: open mode; only the read/write bits are used.
+ *
+ * Returns 1 if the state was updated from either source (the open
+ * counters are bumped in that case), 0 otherwise.
+ */
+int update_open_stateid(struct nfs4_state *state,
+ const nfs4_stateid *open_stateid,
+ const nfs4_stateid *delegation,
+ fmode_t fmode)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs_inode *nfsi = NFS_I(state->inode);
+ struct nfs_delegation *deleg_cur;
+ nfs4_stateid freeme = { };
+ int ret = 0;
+
+ fmode &= (FMODE_READ|FMODE_WRITE);
+
+ rcu_read_lock();
+ spin_lock(&state->owner->so_lock);
+ if (open_stateid != NULL) {
+ nfs_state_set_open_stateid(state, open_stateid, fmode, &freeme);
+ ret = 1;
+ }
+
+ deleg_cur = nfs4_get_valid_delegation(state->inode);
+ if (deleg_cur == NULL)
+ goto no_delegation;
+
+ spin_lock(&deleg_cur->lock);
+ /* Re-check under the delegation lock: still current, not being returned, covers fmode */
+ if (rcu_dereference(nfsi->delegation) != deleg_cur ||
+ test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) ||
+ (deleg_cur->type & fmode) != fmode)
+ goto no_delegation_unlock;
+
+ if (delegation == NULL)
+ delegation = &deleg_cur->stateid;
+ else if (!nfs4_stateid_match_other(&deleg_cur->stateid, delegation))
+ goto no_delegation_unlock;
+
+ nfs_mark_delegation_referenced(deleg_cur);
+ nfs_state_set_delegation(state, &deleg_cur->stateid, fmode);
+ ret = 1;
+no_delegation_unlock:
+ spin_unlock(&deleg_cur->lock);
+no_delegation:
+ if (ret)
+ update_open_stateflags(state, fmode);
+ spin_unlock(&state->owner->so_lock);
+ rcu_read_unlock();
+
+ if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
+ nfs4_schedule_state_manager(clp);
+ /* A non-zero type means an old open stateid was handed back in freeme */
+ if (freeme.type != 0)
+ nfs4_test_and_free_stateid(server, &freeme,
+ state->owner->so_cred);
+
+ return ret;
+}
+
+/*
+ * Replace the lock stateid of @lsp with @stateid, but only when both
+ * refer to the same stateid ("other" fields match) and @stateid is the
+ * newer one.  Returns true if the copy happened.
+ */
+static bool nfs4_update_lock_stateid(struct nfs4_lock_state *lsp,
+		const nfs4_stateid *stateid)
+{
+	struct nfs4_state *state = lsp->ls_state;
+	bool updated = false;
+
+	spin_lock(&state->state_lock);
+	if (nfs4_stateid_match_other(stateid, &lsp->ls_stateid) &&
+	    nfs4_stateid_is_newer(stateid, &lsp->ls_stateid)) {
+		nfs4_stateid_copy(&lsp->ls_stateid, stateid);
+		updated = true;
+	}
+	spin_unlock(&state->state_lock);
+	return updated;
+}
+
+/*
+ * Return the inode's delegation if one is held and it does not cover
+ * the read/write bits of @fmode.
+ */
+static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode)
+{
+	struct nfs_delegation *delegation;
+	bool keep;
+
+	fmode &= FMODE_READ|FMODE_WRITE;
+
+	rcu_read_lock();
+	delegation = nfs4_get_valid_delegation(inode);
+	keep = delegation == NULL || (delegation->type & fmode) == fmode;
+	rcu_read_unlock();
+
+	if (!keep)
+		nfs4_inode_return_delegation(inode);
+}
+
+/*
+ * Try to satisfy an open without an OPEN call, first from existing open
+ * state and then from a delegation.
+ *
+ * Returns opendata->state with an extra reference on success.  Returns
+ * ERR_PTR(-EAGAIN) when neither source can satisfy the open, or the
+ * nfs_may_open() error when that check fails on a non-recovery open.
+ */
+static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
+{
+ struct nfs4_state *state = opendata->state;
+ struct nfs_delegation *delegation;
+ int open_mode = opendata->o_arg.open_flags;
+ fmode_t fmode = opendata->o_arg.fmode;
+ enum open_claim_type4 claim = opendata->o_arg.claim;
+ nfs4_stateid stateid;
+ int ret = -EAGAIN;
+
+ for (;;) {
+ spin_lock(&state->owner->so_lock);
+ if (can_open_cached(state, fmode, open_mode, claim)) {
+ update_open_stateflags(state, fmode);
+ spin_unlock(&state->owner->so_lock);
+ goto out_return_state;
+ }
+ spin_unlock(&state->owner->so_lock);
+ rcu_read_lock();
+ delegation = nfs4_get_valid_delegation(state->inode);
+ if (!can_open_delegated(delegation, fmode, claim)) {
+ rcu_read_unlock();
+ break;
+ }
+ /* Save the delegation */
+ nfs4_stateid_copy(&stateid, &delegation->stateid);
+ rcu_read_unlock();
+ nfs_release_seqid(opendata->o_arg.seqid);
+ /* The permission check is skipped for recovery opens */
+ if (!opendata->is_recover) {
+ ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
+ if (ret != 0)
+ goto out;
+ }
+ ret = -EAGAIN;
+
+ /* Try to update the stateid using the delegation */
+ if (update_open_stateid(state, NULL, &stateid, fmode))
+ goto out_return_state;
+ /* Update failed: loop and re-evaluate both sources */
+ }
+out:
+ return ERR_PTR(ret);
+out_return_state:
+ refcount_inc(&state->count);
+ return state;
+}
+
+/*
+ * Process a delegation returned by an OPEN.
+ *
+ * A delegation returned for a CLAIM_DELEGATE_CUR / CLAIM_DELEG_CUR_FH
+ * open is only logged and then ignored.  Otherwise the delegation is
+ * installed as new, or reclaimed if the inode's existing delegation is
+ * flagged NFS_DELEGATION_NEED_RECLAIM.  If the OPEN result has do_recall
+ * set, an asynchronous return of the delegation is started.
+ */
+static void
+nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state)
+{
+ struct nfs_client *clp = NFS_SERVER(state->inode)->nfs_client;
+ struct nfs_delegation *delegation;
+ int delegation_flags = 0;
+
+ /* Snapshot the flags of any existing delegation under RCU */
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+ if (delegation)
+ delegation_flags = delegation->flags;
+ rcu_read_unlock();
+ switch (data->o_arg.claim) {
+ default:
+ break;
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ pr_err_ratelimited("NFS: Broken NFSv4 server %s is "
+ "returning a delegation for "
+ "OPEN(CLAIM_DELEGATE_CUR)\n",
+ clp->cl_hostname);
+ return;
+ }
+ if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
+ nfs_inode_set_delegation(state->inode,
+ data->owner->so_cred,
+ data->o_res.delegation_type,
+ &data->o_res.delegation,
+ data->o_res.pagemod_limit);
+ else
+ nfs_inode_reclaim_delegation(state->inode,
+ data->owner->so_cred,
+ data->o_res.delegation_type,
+ &data->o_res.delegation,
+ data->o_res.pagemod_limit);
+
+ if (data->o_res.do_recall)
+ nfs_async_inode_return_delegation(state->inode,
+ &data->o_res.delegation);
+}
+
+/*
+ * Check the inode attributes against the CLAIM_PREVIOUS returned attributes
+ * and update the nfs4_state.
+ *
+ * If the RPC was not performed, returns the recorded rpc_status as an
+ * error pointer when it is non-zero, otherwise falls back to
+ * nfs4_try_open_cached().  On success returns data->state with an extra
+ * reference; returns ERR_PTR(-EAGAIN) when the stateid update fails.
+ */
+static struct nfs4_state *
+_nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
+{
+ struct inode *inode = data->state->inode;
+ struct nfs4_state *state = data->state;
+ int ret;
+
+ if (!data->rpc_done) {
+ if (data->rpc_status)
+ return ERR_PTR(data->rpc_status);
+ return nfs4_try_open_cached(data);
+ }
+
+ ret = nfs_refresh_inode(inode, &data->f_attr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ /* A non-zero delegation type means the OPEN returned a delegation */
+ if (data->o_res.delegation_type != 0)
+ nfs4_opendata_check_deleg(data, state);
+
+ if (!update_open_stateid(state, &data->o_res.stateid,
+ NULL, data->o_arg.fmode))
+ return ERR_PTR(-EAGAIN);
+ refcount_inc(&state->count);
+
+ return state;
+}
+
+/*
+ * Obtain a referenced inode for the opened file.
+ *
+ * For claims by name the inode is looked up from the returned
+ * filehandle and attributes; without valid attributes the result is
+ * ERR_PTR(-EAGAIN).  For all other claims the dentry's inode is used
+ * (with an extra reference) and refreshed from the returned attributes.
+ */
+static struct inode *
+nfs4_opendata_get_inode(struct nfs4_opendata *data)
+{
+	struct inode *inode;
+
+	switch (data->o_arg.claim) {
+	case NFS4_OPEN_CLAIM_NULL:
+	case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+	case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+		break;
+	default:
+		inode = d_inode(data->dentry);
+		ihold(inode);
+		nfs_refresh_inode(inode, &data->f_attr);
+		return inode;
+	}
+
+	if (!(data->f_attr.valid & NFS_ATTR_FATTR))
+		return ERR_PTR(-EAGAIN);
+	return nfs_fhget(data->dir->d_sb, &data->o_res.fh,
+			 &data->f_attr, data->f_label);
+}
+
+/*
+ * Find (or create) the nfs4_state for the opened inode.  The opendata's
+ * own state is reused, with an extra reference, when it refers to the
+ * same inode.  Returns ERR_PTR(-ENOMEM) if no state could be obtained,
+ * or the error from nfs4_opendata_get_inode().
+ */
+static struct nfs4_state *
+nfs4_opendata_find_nfs4_state(struct nfs4_opendata *data)
+{
+	struct inode *inode = nfs4_opendata_get_inode(data);
+	struct nfs4_state *state;
+
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	state = data->state;
+	if (state != NULL && state->inode == inode)
+		refcount_inc(&state->count);
+	else
+		state = nfs4_get_open_state(inode, data->owner);
+	iput(inode);
+
+	if (state == NULL)
+		return ERR_PTR(-ENOMEM);
+	return state;
+}
+
+/*
+ * Convert a completed (or skipped) OPEN into an nfs4_state.
+ *
+ * If the RPC was not performed, tries a cached open.  Otherwise looks up
+ * the state for the opened inode, processes any returned delegation and
+ * updates the open stateid; a failed update drops the state and yields
+ * ERR_PTR(-EAGAIN).  The seqid is released on every path.
+ */
+static struct nfs4_state *
+_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
+{
+ struct nfs4_state *state;
+
+ if (!data->rpc_done) {
+ state = nfs4_try_open_cached(data);
+ trace_nfs4_cached_open(data->state);
+ goto out;
+ }
+
+ state = nfs4_opendata_find_nfs4_state(data);
+ if (IS_ERR(state))
+ goto out;
+
+ /* A non-zero delegation type means the OPEN returned a delegation */
+ if (data->o_res.delegation_type != 0)
+ nfs4_opendata_check_deleg(data, state);
+ if (!update_open_stateid(state, &data->o_res.stateid,
+ NULL, data->o_arg.fmode)) {
+ nfs4_put_open_state(state);
+ state = ERR_PTR(-EAGAIN);
+ }
+out:
+ nfs_release_seqid(data->o_arg.seqid);
+ return state;
+}
+
+/*
+ * Turn the opendata into an nfs4_state, using the reclaim variant for
+ * CLAIM_PREVIOUS, and release the sequence slot of the OPEN result
+ * afterwards.
+ */
+static struct nfs4_state *
+nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
+{
+	const bool reclaim = data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS;
+	struct nfs4_state *state;
+
+	state = reclaim ? _nfs4_opendata_reclaim_to_nfs4_state(data)
+			: _nfs4_opendata_to_nfs4_state(data);
+	nfs4_sequence_free_slot(&data->o_res.seq_res);
+	return state;
+}
+
+/*
+ * Walk the inode's open files under RCU and return, with a reference
+ * held, the first context that belongs to @state and whose mode includes
+ * all bits of @mode.  Returns ERR_PTR(-ENOENT) if there is none.
+ */
+static struct nfs_open_context *
+nfs4_state_find_open_context_mode(struct nfs4_state *state, fmode_t mode)
+{
+	struct nfs_inode *nfsi = NFS_I(state->inode);
+	struct nfs_open_context *found = ERR_PTR(-ENOENT);
+	struct nfs_open_context *ctx;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
+		if (ctx->state == state &&
+		    (ctx->mode & mode) == mode &&
+		    get_nfs_open_context(ctx)) {
+			found = ctx;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return found;
+}
+
+/*
+ * Find an open context for @state, preferring read+write, then write,
+ * then read.  Returns the result of the last lookup if all of them fail.
+ */
+static struct nfs_open_context *
+nfs4_state_find_open_context(struct nfs4_state *state)
+{
+	struct nfs_open_context *ctx;
+
+	ctx = nfs4_state_find_open_context_mode(state, FMODE_READ|FMODE_WRITE);
+	if (IS_ERR(ctx)) {
+		ctx = nfs4_state_find_open_context_mode(state, FMODE_WRITE);
+		if (IS_ERR(ctx))
+			ctx = nfs4_state_find_open_context_mode(state, FMODE_READ);
+	}
+	return ctx;
+}
+/*
+ * Allocate open data for recovering @state with the given @claim type.
+ * The dentry is taken from @ctx and the state owner from @state; the
+ * allocation uses GFP_NOFS. The new opendata points at @state and an
+ * extra reference on @state is taken for it.
+ *
+ * Returns the opendata or ERR_PTR(-ENOMEM).
+ */
+static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx,
+ struct nfs4_state *state, enum open_claim_type4 claim)
+{
+ struct nfs4_opendata *opendata;
+
+ opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0,
+ NULL, claim, GFP_NOFS);
+ if (opendata == NULL)
+ return ERR_PTR(-ENOMEM);
+ opendata->state = state;
+ refcount_inc(&state->count);
+ return opendata;
+}
+/*
+ * Re-issue an OPEN for a single share mode (@fmode) during recovery.
+ *
+ * Returns 0 without doing anything when nfs4_mode_match_open_stateid()
+ * is false for @fmode. Otherwise the open arguments are set up for
+ * @fmode, both result structures are zeroed and re-initialised, and the
+ * OPEN is sent with _nfs4_recover_proc_open(). -ESTALE is returned if the
+ * state that comes back is not the one being recovered.
+ */
+static int nfs4_open_recover_helper(struct nfs4_opendata *opendata,
+ fmode_t fmode)
+{
+ struct nfs4_state *newstate;
+ struct nfs_server *server = NFS_SB(opendata->dentry->d_sb);
+ int openflags = opendata->o_arg.open_flags;
+ int ret;
+
+ if (!nfs4_mode_match_open_stateid(opendata->state, fmode))
+ return 0;
+ opendata->o_arg.fmode = fmode;
+ opendata->o_arg.share_access =
+ nfs4_map_atomic_open_share(server, fmode, openflags);
+ memset(&opendata->o_res, 0, sizeof(opendata->o_res));
+ memset(&opendata->c_res, 0, sizeof(opendata->c_res));
+ nfs4_init_opendata_res(opendata);
+ ret = _nfs4_recover_proc_open(opendata);
+ if (ret != 0)
+ return ret;
+ newstate = nfs4_opendata_to_nfs4_state(opendata);
+ if (IS_ERR(newstate))
+ return PTR_ERR(newstate);
+ if (newstate != opendata->state)
+ ret = -ESTALE;
+ nfs4_close_state(newstate, fmode); /* runs on both the success and the -ESTALE path */
+ return ret;
+}
+
+/*
+ * Recover the open state for each share mode in turn (read/write,
+ * write-only, read-only), stopping at the first failure. Afterwards,
+ * unless the state is delegated, state->stateid is brought in line with
+ * state->open_stateid.
+ *
+ * Returns 0 on success or the first non-zero helper result.
+ */
+static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *state)
+{
+	static const fmode_t recover_modes[] = {
+		FMODE_READ|FMODE_WRITE,
+		FMODE_WRITE,
+		FMODE_READ,
+	};
+	unsigned int idx;
+	int err;
+
+	/* memory barrier prior to reading state->n_* */
+	smp_rmb();
+	for (idx = 0; idx < ARRAY_SIZE(recover_modes); idx++) {
+		err = nfs4_open_recover_helper(opendata, recover_modes[idx]);
+		if (err != 0)
+			return err;
+	}
+
+	/*
+	 * We may have performed cached opens for all three recoveries.
+	 * Check if we need to update the current stateid.
+	 */
+	if (test_bit(NFS_DELEGATED_STATE, &state->flags))
+		return 0;
+	if (nfs4_stateid_match(&state->stateid, &state->open_stateid))
+		return 0;
+
+	write_seqlock(&state->seqlock);
+	/* The delegated flag is tested again now that the seqlock is held. */
+	if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
+		nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+	write_sequnlock(&state->seqlock);
+	return 0;
+}
+
+/*
+ * OPEN_RECLAIM:
+ * reclaim state on the server after a reboot.
+ *
+ * Builds CLAIM_PREVIOUS open data for @state and runs nfs4_open_recover()
+ * on it. If the inode has a delegation flagged
+ * NFS_DELEGATION_NEED_RECLAIM, that delegation's type is placed in the
+ * open arguments; otherwise 0 is used.
+ *
+ * Returns 0 or a negative error.
+ */
+static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state)
+{
+ struct nfs_delegation *delegation;
+ struct nfs4_opendata *opendata;
+ fmode_t delegation_type = 0;
+ int status;
+
+ opendata = nfs4_open_recoverdata_alloc(ctx, state,
+ NFS4_OPEN_CLAIM_PREVIOUS);
+ if (IS_ERR(opendata))
+ return PTR_ERR(opendata);
+ rcu_read_lock(); /* the delegation pointer is only dereferenced inside this section */
+ delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+ if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0)
+ delegation_type = delegation->type;
+ rcu_read_unlock();
+ opendata->o_arg.u.delegation_type = delegation_type;
+ status = nfs4_open_recover(opendata, state);
+ nfs4_opendata_put(opendata);
+ return status;
+}
+/*
+ * Retry wrapper around _nfs4_do_open_reclaim().
+ *
+ * When nfs4_clear_cap_atomic_open_v1() returns true the loop condition
+ * (exception.retry) is evaluated straight away. -NFS4ERR_DELAY is passed
+ * to nfs4_handle_exception(), which decides via exception.retry whether
+ * to go round again. Any other result leaves the loop.
+ *
+ * Returns the result of the last attempt.
+ */
+static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state *state)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs4_exception exception = { };
+ int err;
+ do {
+ err = _nfs4_do_open_reclaim(ctx, state);
+ trace_nfs4_open_reclaim(ctx, 0, err);
+ if (nfs4_clear_cap_atomic_open_v1(server, err, &exception))
+ continue;
+ if (err != -NFS4ERR_DELAY)
+ break;
+ nfs4_handle_exception(server, err, &exception);
+ } while (exception.retry);
+ return err;
+}
+/*
+ * Reclaim the open @state. An open context referring to @state is looked
+ * up first; without one, -EAGAIN is returned. The delegated flag and the
+ * open-state flags are cleared before the reclaim is attempted, and the
+ * context reference is dropped afterwards. @sp is not used here.
+ */
+static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+ struct nfs_open_context *ctx;
+ int ret;
+
+ ctx = nfs4_state_find_open_context(state);
+ if (IS_ERR(ctx))
+ return -EAGAIN;
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ nfs_state_clear_open_state_flags(state);
+ ret = nfs4_do_open_reclaim(ctx, state);
+ put_nfs_open_context(ctx);
+ return ret;
+}
+
+/*
+ * Decide what to do with an error seen while recalling a delegation.
+ *
+ * @server:  server the state belongs to
+ * @state:   open state being converted
+ * @stateid: delegation stateid being recalled
+ * @fl:      lock being recovered, or NULL
+ * @err:     result of the recall step
+ *
+ * 0, -ENOENT, -EAGAIN, -ESTALE and -ETIMEDOUT are returned unchanged, as
+ * is any unrecognised error (after logging it). Session, lease,
+ * migration and stateid errors schedule the matching recovery and return
+ * -EAGAIN. -NFS4ERR_DELAY/-NFS4ERR_GRACE sleep for one second and return
+ * -EAGAIN. -ENOMEM/-NFS4ERR_DENIED mark the lock owner of @fl (if any)
+ * as NFS_LOCK_LOST and return 0.
+ */
+static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, struct file_lock *fl, int err)
+{
+	switch (err) {
+	default:
+		printk(KERN_ERR "NFS: %s: unhandled error "
+				"%d.\n", __func__, err);
+		/* Unknown errors are logged, then returned unchanged below. */
+		fallthrough;
+	case 0:
+	case -ENOENT:
+	case -EAGAIN:
+	case -ESTALE:
+	case -ETIMEDOUT:
+		break;
+	case -NFS4ERR_BADSESSION:
+	case -NFS4ERR_BADSLOT:
+	case -NFS4ERR_BAD_HIGH_SLOT:
+	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+	case -NFS4ERR_DEADSESSION:
+		return -EAGAIN;
+	case -NFS4ERR_STALE_CLIENTID:
+	case -NFS4ERR_STALE_STATEID:
+		/* Don't recall a delegation if it was lost */
+		nfs4_schedule_lease_recovery(server->nfs_client);
+		return -EAGAIN;
+	case -NFS4ERR_MOVED:
+		nfs4_schedule_migration_recovery(server);
+		return -EAGAIN;
+	case -NFS4ERR_LEASE_MOVED:
+		nfs4_schedule_lease_moved_recovery(server->nfs_client);
+		return -EAGAIN;
+	case -NFS4ERR_DELEG_REVOKED:
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_EXPIRED:
+	case -NFS4ERR_BAD_STATEID:
+	case -NFS4ERR_OPENMODE:
+		nfs_inode_find_state_and_recover(state->inode,
+				stateid);
+		nfs4_schedule_stateid_recovery(server, state);
+		return -EAGAIN;
+	case -NFS4ERR_DELAY:
+	case -NFS4ERR_GRACE:
+		ssleep(1);
+		return -EAGAIN;
+	case -ENOMEM:
+	case -NFS4ERR_DENIED:
+		if (fl) {
+			struct nfs4_lock_state *lsp = fl->fl_u.nfs4_fl.owner;
+			if (lsp)
+				set_bit(NFS_LOCK_LOST, &lsp->ls_flags);
+		}
+		return 0;
+	}
+	return err;
+}
+/*
+ * Re-open @state on the server using the delegation @stateid
+ * (CLAIM_DELEG_CUR_FH). For each share mode whose NFS_O_*_STATE flag is
+ * not set on @state, nfs4_open_recover_helper() is called; the first
+ * error stops the sequence. If every step succeeds the delegation is
+ * cleared from the state. The outcome is passed through
+ * nfs4_handle_delegation_recall_error() before being returned.
+ */
+int nfs4_open_delegation_recall(struct nfs_open_context *ctx,
+ struct nfs4_state *state, const nfs4_stateid *stateid)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs4_opendata *opendata;
+ int err = 0;
+
+ opendata = nfs4_open_recoverdata_alloc(ctx, state,
+ NFS4_OPEN_CLAIM_DELEG_CUR_FH);
+ if (IS_ERR(opendata))
+ return PTR_ERR(opendata);
+ nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid);
+ if (!test_bit(NFS_O_RDWR_STATE, &state->flags)) {
+ err = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE);
+ if (err)
+ goto out;
+ }
+ if (!test_bit(NFS_O_WRONLY_STATE, &state->flags)) {
+ err = nfs4_open_recover_helper(opendata, FMODE_WRITE);
+ if (err)
+ goto out;
+ }
+ if (!test_bit(NFS_O_RDONLY_STATE, &state->flags)) {
+ err = nfs4_open_recover_helper(opendata, FMODE_READ);
+ if (err)
+ goto out;
+ }
+ nfs_state_clear_delegation(state); /* only reached when no helper failed */
+out:
+ nfs4_opendata_put(opendata);
+ return nfs4_handle_delegation_recall_error(server, state, stateid, NULL, err);
+}
+/* rpc_call_prepare for OPEN_CONFIRM: set up the sequence using the confirm args/result. */
+static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_opendata *data = calldata;
+
+ nfs4_setup_sequence(data->o_arg.server->nfs_client,
+ &data->c_arg.seq_args, &data->c_res.seq_res, task);
+}
+/*
+ * rpc_call_done for OPEN_CONFIRM. Records the task status in
+ * data->rpc_status. On success the stateid from the confirm reply is
+ * copied into the open result, the owner's seqid is confirmed, the lease
+ * is renewed using the timestamp taken before the call, and rpc_done is
+ * set. On failure rpc_done stays false.
+ */
+static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_opendata *data = calldata;
+
+ nfs40_sequence_done(task, &data->c_res.seq_res);
+
+ data->rpc_status = task->tk_status;
+ if (data->rpc_status == 0) {
+ nfs4_stateid_copy(&data->o_res.stateid, &data->c_res.stateid);
+ nfs_confirm_seqid(&data->owner->so_seqid, 0);
+ renew_lease(data->o_res.server, data->timestamp);
+ data->rpc_done = true;
+ }
+}
+
+/*
+ * rpc_release for OPEN_CONFIRM. Cleanup is only needed when the request
+ * was cancelled and the RPC nevertheless completed: the state it
+ * produced is looked up and closed again. The opendata reference is
+ * dropped in every case.
+ */
+static void nfs4_open_confirm_release(void *calldata)
+{
+	struct nfs4_opendata *data = calldata;
+
+	/*
+	 * If this request hasn't been cancelled, do nothing.
+	 * In case of error (rpc_done not set), no cleanup either.
+	 */
+	if (data->cancelled && data->rpc_done) {
+		struct nfs4_state *state = nfs4_opendata_to_nfs4_state(data);
+
+		if (!IS_ERR(state))
+			nfs4_close_state(state, data->o_arg.fmode);
+	}
+	nfs4_opendata_put(data);
+}
+/* RPC callbacks for the OPEN_CONFIRM task started by _nfs4_proc_open_confirm(). */
+static const struct rpc_call_ops nfs4_open_confirm_ops = {
+ .rpc_call_prepare = nfs4_open_confirm_prepare,
+ .rpc_call_done = nfs4_open_confirm_done,
+ .rpc_release = nfs4_open_confirm_release,
+};
+
+/*
+ * Send OPEN_CONFIRM as an asynchronous RPC task and wait for it.
+ *
+ * An extra reference on @data is taken before the task is started, and
+ * rpc_done/rpc_status/timestamp are reset. If rpc_run_task() fails its
+ * error is returned. If the wait returns non-zero, the request is marked
+ * cancelled and that value is returned; otherwise the status recorded by
+ * the done callback (data->rpc_status) is returned.
+ *
+ * Note: On error, nfs4_proc_open_confirm will free the struct nfs4_opendata
+ */
+static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
+{
+ struct nfs_server *server = NFS_SERVER(d_inode(data->dir));
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
+ .rpc_argp = &data->c_arg,
+ .rpc_resp = &data->c_res,
+ .rpc_cred = data->owner->so_cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = server->client,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_open_confirm_ops,
+ .callback_data = data,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
+ };
+ int status;
+
+ nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1,
+ data->is_recover);
+ kref_get(&data->kref);
+ data->rpc_done = false;
+ data->rpc_status = 0;
+ data->timestamp = jiffies;
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ status = rpc_wait_for_completion_task(task);
+ if (status != 0) {
+ data->cancelled = true; /* read by nfs4_open_confirm_release() */
+ smp_wmb();
+ } else
+ status = data->rpc_status;
+ rpc_put_task(task);
+ return status;
+}
+/*
+ * rpc_call_prepare for OPEN.
+ *
+ * Waits for the open seqid first. If an existing state is attached and
+ * the open can be satisfied from cached state or from a valid
+ * delegation, task->tk_action is set to NULL instead of preparing a
+ * call. Otherwise the client id is filled in, the OPEN_NOATTR procedure
+ * is selected for FH-based and reclaim/delegation claims, the sequence
+ * is set up, and the create mode is chosen from the open flags and the
+ * session/minor-version properties of the client.
+ */
+static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_opendata *data = calldata;
+ struct nfs4_state_owner *sp = data->owner;
+ struct nfs_client *clp = sp->so_server->nfs_client;
+ enum open_claim_type4 claim = data->o_arg.claim;
+
+ if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
+ goto out_wait;
+ /*
+ * Check if we still need to send an OPEN call, or if we can use
+ * a delegation instead.
+ */
+ if (data->state != NULL) {
+ struct nfs_delegation *delegation;
+
+ if (can_open_cached(data->state, data->o_arg.fmode,
+ data->o_arg.open_flags, claim))
+ goto out_no_action;
+ rcu_read_lock();
+ delegation = nfs4_get_valid_delegation(data->state->inode);
+ if (can_open_delegated(delegation, data->o_arg.fmode, claim))
+ goto unlock_no_action; /* RCU read lock is released at that label */
+ rcu_read_unlock();
+ }
+ /* Update client id. */
+ data->o_arg.clientid = clp->cl_clientid;
+ switch (claim) {
+ default:
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+ data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0];
+ fallthrough;
+ case NFS4_OPEN_CLAIM_FH:
+ task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
+ }
+ data->timestamp = jiffies;
+ if (nfs4_setup_sequence(data->o_arg.server->nfs_client,
+ &data->o_arg.seq_args,
+ &data->o_res.seq_res,
+ task) != 0)
+ nfs_release_seqid(data->o_arg.seqid);
+
+ /* Set the create mode (note dependency on the session type) */
+ data->o_arg.createmode = NFS4_CREATE_UNCHECKED;
+ if (data->o_arg.open_flags & O_EXCL) {
+ data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE;
+ if (nfs4_has_persistent_session(clp))
+ data->o_arg.createmode = NFS4_CREATE_GUARDED;
+ else if (clp->cl_mvops->minor_version > 0)
+ data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE4_1;
+ }
+ return;
+unlock_no_action:
+ trace_nfs4_cached_open(data->state);
+ rcu_read_unlock();
+out_no_action:
+ task->tk_action = NULL;
+out_wait:
+ nfs4_sequence_done(task, &data->o_res.seq_res);
+}
+/*
+ * rpc_call_done for OPEN.
+ *
+ * Stores the task status in data->rpc_status and returns early when
+ * nfs4_sequence_process() returns 0. On a successful reply that reports
+ * a file type, anything other than a regular file overrides rpc_status:
+ * -ELOOP for a symlink, -EISDIR for a directory, -ENOTDIR for the rest.
+ * The lease is renewed and, unless the server requested OPEN_CONFIRM,
+ * the owner's seqid is confirmed. rpc_done is set at the end whether or
+ * not the task succeeded.
+ */
+static void nfs4_open_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_opendata *data = calldata;
+
+ data->rpc_status = task->tk_status;
+
+ if (!nfs4_sequence_process(task, &data->o_res.seq_res))
+ return;
+
+ if (task->tk_status == 0) {
+ if (data->o_res.f_attr->valid & NFS_ATTR_FATTR_TYPE) {
+ switch (data->o_res.f_attr->mode & S_IFMT) {
+ case S_IFREG:
+ break;
+ case S_IFLNK:
+ data->rpc_status = -ELOOP;
+ break;
+ case S_IFDIR:
+ data->rpc_status = -EISDIR;
+ break;
+ default:
+ data->rpc_status = -ENOTDIR;
+ }
+ }
+ renew_lease(data->o_res.server, data->timestamp);
+ if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM))
+ nfs_confirm_seqid(&data->owner->so_seqid, 0);
+ }
+ data->rpc_done = true;
+}
+
+/*
+ * rpc_release for OPEN. When the request was cancelled but the OPEN
+ * still succeeded and needs no OPEN_CONFIRM, the state it produced is
+ * looked up and closed again. The opendata reference is dropped in
+ * every case.
+ */
+static void nfs4_open_release(void *calldata)
+{
+	struct nfs4_opendata *data = calldata;
+	bool undo_open;
+
+	/*
+	 * If this request hasn't been cancelled, do nothing.
+	 * In case of error, no cleanup!
+	 * In case we need an open_confirm, no cleanup!
+	 */
+	undo_open = data->cancelled &&
+		    data->rpc_status == 0 && data->rpc_done &&
+		    !(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM);
+	if (undo_open) {
+		struct nfs4_state *state = nfs4_opendata_to_nfs4_state(data);
+
+		if (!IS_ERR(state))
+			nfs4_close_state(state, data->o_arg.fmode);
+	}
+	nfs4_opendata_put(data);
+}
+/* RPC callbacks for the OPEN task started by nfs4_run_open_task(). */
+static const struct rpc_call_ops nfs4_open_ops = {
+ .rpc_call_prepare = nfs4_open_prepare,
+ .rpc_call_done = nfs4_open_done,
+ .rpc_release = nfs4_open_release,
+};
+/*
+ * Run OPEN as an asynchronous RPC task and wait for it to finish.
+ *
+ * @ctx == NULL selects the recovery variant: the sequence is initialised
+ * with its last argument set to 1, data->is_recover is set and
+ * RPC_TASK_TIMEOUT is added to the task flags. With a context the last
+ * argument is 0 and pnfs_lgopen_prepare() is called.
+ *
+ * Returns the rpc_run_task() error, the non-zero wait status (after
+ * marking the request cancelled), or data->rpc_status.
+ */
+static int nfs4_run_open_task(struct nfs4_opendata *data,
+ struct nfs_open_context *ctx)
+{
+ struct inode *dir = d_inode(data->dir);
+ struct nfs_server *server = NFS_SERVER(dir);
+ struct nfs_openargs *o_arg = &data->o_arg;
+ struct nfs_openres *o_res = &data->o_res;
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN],
+ .rpc_argp = o_arg,
+ .rpc_resp = o_res,
+ .rpc_cred = data->owner->so_cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = server->client,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_open_ops,
+ .callback_data = data,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
+ };
+ int status;
+
+ kref_get(&data->kref);
+ data->rpc_done = false;
+ data->rpc_status = 0;
+ data->cancelled = false;
+ data->is_recover = false;
+ if (!ctx) {
+ nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1, 1);
+ data->is_recover = true;
+ task_setup_data.flags |= RPC_TASK_TIMEOUT;
+ } else {
+ nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1, 0);
+ pnfs_lgopen_prepare(data, ctx);
+ }
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ status = rpc_wait_for_completion_task(task);
+ if (status != 0) {
+ data->cancelled = true; /* read by nfs4_open_release() */
+ smp_wmb();
+ } else
+ status = data->rpc_status;
+ rpc_put_task(task);
+
+ return status;
+}
+/*
+ * Recovery flavour of the OPEN call: runs the open task without an open
+ * context. If the task failed or did not complete an RPC, its status is
+ * returned as is. Otherwise the attribute names are mapped and freed
+ * and, if the server asked for it, OPEN_CONFIRM is sent and its status
+ * returned.
+ */
+static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
+{
+ struct inode *dir = d_inode(data->dir);
+ struct nfs_openres *o_res = &data->o_res;
+ int status;
+
+ status = nfs4_run_open_task(data, NULL);
+ if (status != 0 || !data->rpc_done)
+ return status;
+
+ nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr);
+
+ if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM)
+ status = _nfs4_proc_open_confirm(data);
+
+ return status;
+}
+
+/*
+ * Additional permission checks in order to distinguish between an
+ * open for read, and an open for execute. This works around the
+ * fact that NFSv4 OPEN treats read and execute permissions as being
+ * the same.
+ * Note that in the non-execute case, we want to turn off permission
+ * checking if we just created a new file (POSIX open() semantics).
+ *
+ * The access result from the OPEN reply is also added to the inode's
+ * access cache for @cred.
+ *
+ * Returns 0 if access is granted (or cannot be judged yet), -EACCES
+ * otherwise.
+ */
+static int nfs4_opendata_access(const struct cred *cred,
+				struct nfs4_opendata *opendata,
+				struct nfs4_state *state, fmode_t fmode,
+				int openflags)
+{
+	const u32 checked = NFS4_ACCESS_READ | NFS4_ACCESS_EXECUTE |
+			    NFS4_ACCESS_LOOKUP;
+	struct nfs_access_entry entry;
+	u32 wanted = 0;
+
+	/* access call failed or for some reason the server doesn't
+	 * support any access modes -- defer access call until later */
+	if (!opendata->o_res.access_supported)
+		return 0;
+
+	/*
+	 * Use openflags to check for exec, because fmode won't
+	 * always have FMODE_EXEC set when file open for exec.
+	 */
+	if (openflags & __FMODE_EXEC) {
+		/* ONLY check for exec rights */
+		wanted = S_ISDIR(state->inode->i_mode) ?
+				NFS4_ACCESS_LOOKUP : NFS4_ACCESS_EXECUTE;
+	} else if ((fmode & FMODE_READ) && !opendata->file_created) {
+		wanted = NFS4_ACCESS_READ;
+	}
+
+	entry.cred = cred;
+	nfs_access_set_mask(&entry, opendata->o_res.access_result);
+	nfs_access_add_cache(state->inode, &entry);
+
+	/* Deny only if a wanted right is missing from the granted mask. */
+	return (wanted & ~entry.mask & checked) ? -EACCES : 0;
+}
+
+/*
+ * Send an OPEN and post-process the reply.
+ *
+ * If no RPC completed, the task status is returned unchanged. A failure
+ * of -NFS4ERR_BADNAME on an open without O_CREAT is reported as -ENOENT.
+ * On success: the file is considered created for O_CREAT|O_EXCL, or for
+ * O_CREAT when the directory change info differs before/after; the
+ * directory change attribute is updated when the file was created or the
+ * cached version differs; NFS_CAP_POSIX_LOCK is cleared if the reply
+ * lacks NFS4_OPEN_RESULT_LOCKTYPE_POSIX; OPEN_CONFIRM is sent when
+ * requested; and a separate GETATTR is issued if the reply attributes
+ * do not have NFS_ATTR_FATTR set (its result is not checked here).
+ *
+ * Note: On error, nfs4_proc_open will free the struct nfs4_opendata
+ */
+static int _nfs4_proc_open(struct nfs4_opendata *data,
+ struct nfs_open_context *ctx)
+{
+ struct inode *dir = d_inode(data->dir);
+ struct nfs_server *server = NFS_SERVER(dir);
+ struct nfs_openargs *o_arg = &data->o_arg;
+ struct nfs_openres *o_res = &data->o_res;
+ int status;
+
+ status = nfs4_run_open_task(data, ctx);
+ if (!data->rpc_done)
+ return status;
+ if (status != 0) {
+ if (status == -NFS4ERR_BADNAME &&
+ !(o_arg->open_flags & O_CREAT))
+ return -ENOENT;
+ return status;
+ }
+
+ nfs_fattr_map_and_free_names(server, &data->f_attr);
+
+ if (o_arg->open_flags & O_CREAT) {
+ if (o_arg->open_flags & O_EXCL)
+ data->file_created = true;
+ else if (o_res->cinfo.before != o_res->cinfo.after)
+ data->file_created = true;
+ if (data->file_created ||
+ inode_peek_iversion_raw(dir) != o_res->cinfo.after)
+ nfs4_update_changeattr(dir, &o_res->cinfo,
+ o_res->f_attr->time_start,
+ NFS_INO_INVALID_DATA);
+ }
+ if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
+ server->caps &= ~NFS_CAP_POSIX_LOCK;
+ if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
+ status = _nfs4_proc_open_confirm(data);
+ if (status != 0)
+ return status;
+ }
+ if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) {
+ nfs4_sequence_free_slot(&o_res->seq_res); /* slot is freed before the GETATTR call below */
+ nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr,
+ o_res->f_label, NULL);
+ }
+ return 0;
+}
+
+/*
+ * OPEN_EXPIRED:
+ * reclaim state on the server after a network partition.
+ * Assumes caller holds the appropriate lock
+ *
+ * Builds CLAIM_FH open data for @state and runs nfs4_open_recover(). If
+ * that yields -ESTALE the context's dentry is dropped.
+ * Returns 0 or a negative error.
+ */
+static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state)
+{
+ struct nfs4_opendata *opendata;
+ int ret;
+
+ opendata = nfs4_open_recoverdata_alloc(ctx, state, NFS4_OPEN_CLAIM_FH);
+ if (IS_ERR(opendata))
+ return PTR_ERR(opendata);
+ /*
+ * We're not recovering a delegation, so ask for no delegation.
+ * Otherwise the recovery thread could deadlock with an outstanding
+ * delegation return.
+ */
+ opendata->o_arg.open_flags = O_DIRECT;
+ ret = nfs4_open_recover(opendata, state);
+ if (ret == -ESTALE)
+ d_drop(ctx->dentry);
+ nfs4_opendata_put(opendata);
+ return ret;
+}
+/*
+ * Retry wrapper around _nfs4_open_expired().
+ *
+ * -NFS4ERR_GRACE and -NFS4ERR_DELAY are handed to
+ * nfs4_handle_exception() and err is reset to 0; the loop then repeats
+ * while exception.retry is set. Any other result leaves the loop and is
+ * returned unchanged.
+ */
+static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs4_exception exception = { };
+ int err;
+
+ do {
+ err = _nfs4_open_expired(ctx, state);
+ trace_nfs4_open_expired(ctx, 0, err);
+ if (nfs4_clear_cap_atomic_open_v1(server, err, &exception))
+ continue;
+ switch (err) {
+ default:
+ goto out;
+ case -NFS4ERR_GRACE:
+ case -NFS4ERR_DELAY:
+ nfs4_handle_exception(server, err, &exception);
+ err = 0;
+ }
+ } while (exception.retry);
+out:
+ return err;
+}
+/*
+ * Recover an expired open @state. An open context referring to @state is
+ * looked up first; without one, -EAGAIN is returned. The context
+ * reference is dropped once recovery has been attempted. @sp is not
+ * used here.
+ */
+static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+ struct nfs_open_context *ctx;
+ int ret;
+
+ ctx = nfs4_state_find_open_context(state);
+ if (IS_ERR(ctx))
+ return -EAGAIN;
+ ret = nfs4_do_open_expired(ctx, state);
+ put_nfs_open_context(ctx);
+ return ret;
+}
+/*
+ * Remove the bad delegation identified by @stateid from the state's
+ * inode, then clear the delegation from @state itself.
+ */
+static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state,
+ const nfs4_stateid *stateid)
+{
+ nfs_remove_bad_delegation(state->inode, stateid);
+ nfs_state_clear_delegation(state);
+}
+
+/*
+ * If the state's inode currently has a delegation pointer set, remove
+ * that delegation and clear it from @state. No stateid is supplied for
+ * the removal. Does nothing when there is no delegation.
+ */
+static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
+{
+	if (!rcu_access_pointer(NFS_I(state->inode)->delegation))
+		return;
+	nfs_finish_clear_delegation_stateid(state, NULL);
+}
+/*
+ * NFSv4.0 handler for an expired open state: drop any delegation, clear
+ * the open-state flags, then run the common open-expired recovery.
+ */
+static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+ /* NFSv4.0 doesn't allow for delegation recovery on open expire */
+ nfs40_clear_delegation_stateid(state);
+ nfs_state_clear_open_state_flags(state);
+ return nfs4_open_expired(sp, state);
+}
+/*
+ * NFSv4.0 variant of the test-and-free hook: performs no test and always
+ * returns -NFS4ERR_BAD_STATEID. None of the arguments are used.
+ */
+static int nfs40_test_and_free_expired_stateid(struct nfs_server *server,
+ nfs4_stateid *stateid,
+ const struct cred *cred)
+{
+ return -NFS4ERR_BAD_STATEID;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Test @stateid against the server and free it if it is no longer valid.
+ *
+ * Invalid and special stateids are rejected with -NFS4ERR_BAD_STATEID
+ * without contacting the server. A stateid already typed as revoked is
+ * freed without testing. Otherwise the stateid is tested; if the server
+ * reports it expired or revoked it is freed and -NFS4ERR_EXPIRED is
+ * returned, while any other test result is returned unchanged.
+ */
+static int nfs41_test_and_free_expired_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		const struct cred *cred)
+{
+	const int type = stateid->type;
+
+	if (type == NFS4_INVALID_STATEID_TYPE ||
+	    type == NFS4_SPECIAL_STATEID_TYPE)
+		return -NFS4ERR_BAD_STATEID;
+
+	if (type != NFS4_REVOKED_STATEID_TYPE) {
+		int status = nfs41_test_stateid(server, stateid, cred);
+
+		if (status != -NFS4ERR_EXPIRED &&
+		    status != -NFS4ERR_ADMIN_REVOKED &&
+		    status != -NFS4ERR_DELEG_REVOKED)
+			return status;
+	}
+
+	/* Ack the revoked state to the server */
+	nfs41_free_stateid(server, stateid, cred, true);
+	return -NFS4ERR_EXPIRED;
+}
+/*
+ * Check whether the delegation on the state's inode has expired.
+ *
+ * With no delegation present, the delegation is cleared from @state and
+ * NFS_OK is returned. If NFS_DELEGATION_TEST_EXPIRED was not set on the
+ * delegation, NFS_OK is returned without testing. Otherwise a copy of
+ * the delegation stateid is tested with the delegation's credential;
+ * -NFS4ERR_EXPIRED or -NFS4ERR_BAD_STATEID cause the delegation to be
+ * removed (NFS_OK is still returned), and any other status is returned
+ * to the caller.
+ */
+static int nfs41_check_delegation_stateid(struct nfs4_state *state)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ nfs4_stateid stateid;
+ struct nfs_delegation *delegation;
+ const struct cred *cred = NULL;
+ int status, ret = NFS_OK;
+
+ /* Get the delegation credential for use by test/free_stateid */
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+ if (delegation == NULL) {
+ rcu_read_unlock();
+ nfs_state_clear_delegation(state);
+ return NFS_OK;
+ }
+
+ spin_lock(&delegation->lock);
+ nfs4_stateid_copy(&stateid, &delegation->stateid); /* work on a private copy once the lock is dropped */
+
+ if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED,
+ &delegation->flags)) {
+ spin_unlock(&delegation->lock);
+ rcu_read_unlock();
+ return NFS_OK;
+ }
+
+ if (delegation->cred)
+ cred = get_cred(delegation->cred);
+ spin_unlock(&delegation->lock);
+ rcu_read_unlock();
+ status = nfs41_test_and_free_expired_stateid(server, &stateid, cred);
+ trace_nfs4_test_delegation_stateid(state, NULL, status);
+ if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID)
+ nfs_finish_clear_delegation_stateid(state, &stateid);
+ else
+ ret = status;
+
+ put_cred(cred); /* cred is NULL here if the delegation had none */
+ return ret;
+}
+/*
+ * Re-sync the delegation stateid held in @state.
+ *
+ * If @state is flagged as delegated, nfs4_copy_delegation_stateid()
+ * yields a stateid for state->state, and nfs4_stateid_match_other()
+ * accepts it against state->stateid, that stateid is installed on the
+ * state. In every other case the delegation is cleared from the state.
+ */
+static void nfs41_delegation_recover_stateid(struct nfs4_state *state)
+{
+ nfs4_stateid tmp;
+
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags) &&
+ nfs4_copy_delegation_stateid(state->inode, state->state,
+ &tmp, NULL) &&
+ nfs4_stateid_match_other(&state->stateid, &tmp))
+ nfs_state_set_delegation(state, &tmp, state->state);
+ else
+ nfs_state_clear_delegation(state);
+}
+
+/**
+ * nfs41_check_expired_locks - possibly free a lock stateid
+ *
+ * @state: NFSv4 state for an inode
+ *
+ * Walks the state's lock_states list and tests every initialised lock
+ * stateid. state_lock is dropped around each test; the entry being
+ * tested is pinned with a reference that is released when the walk moves
+ * on or ends. Lock stateids reported expired or bad are marked
+ * uninitialised and invalid, and flagged NFS_LOCK_LOST unless
+ * recover_lost_locks is set. Any other non-OK status aborts the walk.
+ *
+ * Returns NFS_OK if recovery for this stateid is now finished.
+ * Otherwise a negative NFS4ERR value is returned.
+ */
+static int nfs41_check_expired_locks(struct nfs4_state *state)
+{
+ int status, ret = NFS_OK;
+ struct nfs4_lock_state *lsp, *prev = NULL;
+ struct nfs_server *server = NFS_SERVER(state->inode);
+
+ if (!test_bit(LK_STATE_IN_USE, &state->flags))
+ goto out;
+
+ spin_lock(&state->state_lock);
+ list_for_each_entry(lsp, &state->lock_states, ls_locks) {
+ if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
+ const struct cred *cred = lsp->ls_state->owner->so_cred;
+
+ refcount_inc(&lsp->ls_count); /* taken before state_lock is dropped */
+ spin_unlock(&state->state_lock);
+
+ nfs4_put_lock_state(prev); /* release the previously tested entry */
+ prev = lsp;
+
+ status = nfs41_test_and_free_expired_stateid(server,
+ &lsp->ls_stateid,
+ cred);
+ trace_nfs4_test_lock_stateid(state, lsp, status);
+ if (status == -NFS4ERR_EXPIRED ||
+ status == -NFS4ERR_BAD_STATEID) {
+ clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
+ lsp->ls_stateid.type = NFS4_INVALID_STATEID_TYPE;
+ if (!recover_lost_locks)
+ set_bit(NFS_LOCK_LOST, &lsp->ls_flags);
+ } else if (status != NFS_OK) {
+ ret = status;
+ nfs4_put_lock_state(prev); /* state_lock is not held on this exit */
+ goto out;
+ }
+ spin_lock(&state->state_lock);
+ }
+ }
+ spin_unlock(&state->state_lock);
+ nfs4_put_lock_state(prev);
+out:
+ return ret;
+}
+
+/**
+ * nfs41_check_open_stateid - possibly free an open stateid
+ *
+ * @state: NFSv4 state for an inode
+ *
+ * Without NFS_OPEN_STATE set, -NFS4ERR_BAD_STATEID is returned at once.
+ * Otherwise the open stateid is tested with the owner's credential; if
+ * it is expired or bad, the open-state flags are cleared, the stateid is
+ * marked invalid and the status is returned. If
+ * nfs_open_stateid_recover_openmode() reports a problem,
+ * -NFS4ERR_OPENMODE is returned.
+ *
+ * Returns NFS_OK if recovery for this stateid is now finished.
+ * Otherwise a negative NFS4ERR value is returned.
+ */
+static int nfs41_check_open_stateid(struct nfs4_state *state)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ nfs4_stateid *stateid = &state->open_stateid;
+ const struct cred *cred = state->owner->so_cred;
+ int status;
+
+ if (test_bit(NFS_OPEN_STATE, &state->flags) == 0)
+ return -NFS4ERR_BAD_STATEID;
+ status = nfs41_test_and_free_expired_stateid(server, stateid, cred);
+ trace_nfs4_test_open_stateid(state, NULL, status);
+ if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) {
+ nfs_state_clear_open_state_flags(state);
+ stateid->type = NFS4_INVALID_STATEID_TYPE;
+ return status;
+ }
+ if (nfs_open_stateid_recover_openmode(state))
+ return -NFS4ERR_OPENMODE;
+ return NFS_OK;
+}
+/*
+ * NFSv4.1 handler for an expired open state. Checks, in order, the
+ * delegation stateid, the lock stateids and the open stateid. A failure
+ * of the delegation or lock check is returned directly. If the open
+ * stateid check does not return NFS_OK, the common open-expired recovery
+ * is run and its result returned.
+ */
+static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+ int status;
+
+ status = nfs41_check_delegation_stateid(state);
+ if (status != NFS_OK)
+ return status;
+ nfs41_delegation_recover_stateid(state);
+
+ status = nfs41_check_expired_locks(state);
+ if (status != NFS_OK)
+ return status;
+ status = nfs41_check_open_stateid(state);
+ if (status != NFS_OK)
+ status = nfs4_open_expired(sp, state);
+ return status;
+}
+#endif
+
+/*
+ * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-*
+ * fields corresponding to attributes that were used to store the verifier.
+ * Make sure we clobber those fields in the later setattr call
+ *
+ * Returns the ia_valid bits that still have to be sent with a follow-up
+ * SETATTR. *label is set to NULL when the reply's attrset does not
+ * include the security label bit.
+ */
+static unsigned nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
+				struct iattr *sattr, struct nfs4_label **label)
+{
+	const __u32 *excl_mask = opendata->o_arg.server->exclcreat_bitmask;
+	const bool is_excl4_1 =
+		opendata->o_arg.createmode == NFS4_CREATE_EXCLUSIVE4_1;
+	__u32 used[3];
+	unsigned pending = 0;
+	unsigned word;
+
+	/* For EXCLUSIVE4_1, ignore the bits covered by exclcreat_bitmask. */
+	for (word = 0; word < ARRAY_SIZE(used); word++) {
+		used[word] = opendata->o_res.attrset[word];
+		if (is_excl4_1)
+			used[word] &= ~excl_mask[word];
+	}
+
+	if (opendata->o_arg.createmode == NFS4_CREATE_EXCLUSIVE)
+		pending = sattr->ia_valid;
+
+	if (used[1] & (FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET))
+		pending |= (sattr->ia_valid & ATTR_ATIME_SET) ?
+				ATTR_ATIME_SET : ATTR_ATIME;
+
+	if (used[1] & (FATTR4_WORD1_TIME_MODIFY|FATTR4_WORD1_TIME_MODIFY_SET))
+		pending |= (sattr->ia_valid & ATTR_MTIME_SET) ?
+				ATTR_MTIME_SET : ATTR_MTIME;
+
+	if (!(used[2] & FATTR4_WORD2_SECURITY_LABEL))
+		*label = NULL;
+	return pending;
+}
+/*
+ * Perform the OPEN and attach the resulting state to @ctx.
+ *
+ * After _nfs4_proc_open() succeeds the state is stored in ctx->state and
+ * the POSIX-lock and lock-notify flags are set from the server
+ * capabilities and the reply. A negative dentry is dropped and, if an
+ * alias for the opened inode is found or spliced in, that alias replaces
+ * ctx->dentry. For the name-based claim types the dentry verifier is
+ * updated. Layoutget results are parsed before the access check. If the
+ * dentry's inode is the state's inode, the context is attached to the
+ * inode and stateid recovery is scheduled when the owner's reclaim
+ * seqcount changed since the start of the open.
+ *
+ * Unless the request was cancelled, the layoutget data and the sequence
+ * slot are released before returning.
+ *
+ * Returns 0 or a negative error.
+ */
+static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
+ int flags, struct nfs_open_context *ctx)
+{
+ struct nfs4_state_owner *sp = opendata->owner;
+ struct nfs_server *server = sp->so_server;
+ struct dentry *dentry;
+ struct nfs4_state *state;
+ fmode_t acc_mode = _nfs4_ctx_to_accessmode(ctx);
+ struct inode *dir = d_inode(opendata->dir);
+ unsigned long dir_verifier;
+ unsigned int seq;
+ int ret;
+
+ seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+ dir_verifier = nfs_save_change_attribute(dir);
+
+ ret = _nfs4_proc_open(opendata, ctx);
+ if (ret != 0)
+ goto out;
+
+ state = _nfs4_opendata_to_nfs4_state(opendata);
+ ret = PTR_ERR(state); /* only meaningful if IS_ERR() below is true; overwritten later otherwise */
+ if (IS_ERR(state))
+ goto out;
+ ctx->state = state;
+ if (server->caps & NFS_CAP_POSIX_LOCK)
+ set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
+ if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK)
+ set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags);
+
+ dentry = opendata->dentry;
+ if (d_really_is_negative(dentry)) {
+ struct dentry *alias;
+ d_drop(dentry);
+ alias = d_exact_alias(dentry, state->inode);
+ if (!alias)
+ alias = d_splice_alias(igrab(state->inode), dentry);
+ /* d_splice_alias() can't fail here - it's a non-directory */
+ if (alias) {
+ dput(ctx->dentry);
+ ctx->dentry = dentry = alias;
+ }
+ }
+
+ switch(opendata->o_arg.claim) {
+ default:
+ break;
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+ if (!opendata->rpc_done)
+ break;
+ if (opendata->o_res.delegation_type != 0)
+ dir_verifier = nfs_save_change_attribute(dir);
+ nfs_set_verifier(dentry, dir_verifier);
+ }
+
+ /* Parse layoutget results before we check for access */
+ pnfs_parse_lgopen(state->inode, opendata->lgp, ctx);
+
+ ret = nfs4_opendata_access(sp->so_cred, opendata, state,
+ acc_mode, flags);
+ if (ret != 0)
+ goto out;
+
+ if (d_inode(dentry) == state->inode) {
+ nfs_inode_attach_open_context(ctx);
+ if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+ nfs4_schedule_stateid_recovery(server, state);
+ }
+
+out:
+ if (!opendata->cancelled) {
+ if (opendata->lgp) {
+ nfs4_lgopen_release(opendata->lgp);
+ opendata->lgp = NULL;
+ }
+ nfs4_sequence_free_slot(&opendata->o_res.seq_res);
+ }
+ return ret;
+}
+
+/*
+ * Returns a referenced nfs4_state
+ *
+ * More precisely: performs one OPEN attempt for @ctx in directory @dir
+ * and returns 0 or a negative error; on success the state is available
+ * in ctx->state.
+ *
+ * Steps: get the state owner for the context's credential, recover an
+ * expired lease if needed, return an incompatible delegation on an
+ * already-positive dentry, allocate the open data (CLAIM_FH for a
+ * positive dentry, CLAIM_NULL otherwise), allocate an output label when
+ * a label was supplied and an mdsthreshold when the server advertises
+ * that attribute, then run the open. For exclusive creates that were not
+ * sent as GUARDED, attributes the OPEN did not set are applied with a
+ * follow-up SETATTR. *@opened is set to 1 if the file was created.
+ */
+static int _nfs4_do_open(struct inode *dir,
+ struct nfs_open_context *ctx,
+ int flags,
+ const struct nfs4_open_createattrs *c,
+ int *opened)
+{
+ struct nfs4_state_owner *sp;
+ struct nfs4_state *state = NULL;
+ struct nfs_server *server = NFS_SERVER(dir);
+ struct nfs4_opendata *opendata;
+ struct dentry *dentry = ctx->dentry;
+ const struct cred *cred = ctx->cred;
+ struct nfs4_threshold **ctx_th = &ctx->mdsthreshold;
+ fmode_t fmode = _nfs4_ctx_to_openmode(ctx);
+ enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
+ struct iattr *sattr = c->sattr;
+ struct nfs4_label *label = c->label;
+ struct nfs4_label *olabel = NULL;
+ int status;
+
+ /* Protect against reboot recovery conflicts */
+ status = -ENOMEM;
+ sp = nfs4_get_state_owner(server, cred, GFP_KERNEL);
+ if (sp == NULL) {
+ dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n");
+ goto out_err;
+ }
+ status = nfs4_client_recover_expired_lease(server->nfs_client);
+ if (status != 0)
+ goto err_put_state_owner;
+ if (d_really_is_positive(dentry))
+ nfs4_return_incompatible_delegation(d_inode(dentry), fmode);
+ status = -ENOMEM; /* default for the allocation failures below */
+ if (d_really_is_positive(dentry))
+ claim = NFS4_OPEN_CLAIM_FH;
+ opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags,
+ c, claim, GFP_KERNEL);
+ if (opendata == NULL)
+ goto err_put_state_owner;
+
+ if (label) {
+ olabel = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(olabel)) {
+ status = PTR_ERR(olabel);
+ goto err_opendata_put;
+ }
+ }
+
+ if (server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
+ if (!opendata->f_attr.mdsthreshold) {
+ opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
+ if (!opendata->f_attr.mdsthreshold)
+ goto err_free_label;
+ }
+ opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
+ }
+ if (d_really_is_positive(dentry))
+ opendata->state = nfs4_get_open_state(d_inode(dentry), sp);
+
+ status = _nfs4_open_and_get_state(opendata, flags, ctx);
+ if (status != 0)
+ goto err_free_label;
+ state = ctx->state;
+
+ if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
+ (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
+ unsigned attrs = nfs4_exclusive_attrset(opendata, sattr, &label);
+ /*
+ * send create attributes which was not set by open
+ * with an extra setattr.
+ */
+ if (attrs || label) {
+ unsigned ia_old = sattr->ia_valid;
+
+ sattr->ia_valid = attrs; /* temporarily narrowed; restored from ia_old below */
+ nfs_fattr_init(opendata->o_res.f_attr);
+ status = nfs4_do_setattr(state->inode, cred,
+ opendata->o_res.f_attr, sattr,
+ ctx, label, olabel);
+ if (status == 0) {
+ nfs_setattr_update_inode(state->inode, sattr,
+ opendata->o_res.f_attr);
+ nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+ }
+ sattr->ia_valid = ia_old;
+ }
+ }
+ if (opened && opendata->file_created)
+ *opened = 1;
+
+ if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {
+ *ctx_th = opendata->f_attr.mdsthreshold; /* hand the threshold over to the context */
+ opendata->f_attr.mdsthreshold = NULL;
+ }
+
+ nfs4_label_free(olabel);
+
+ nfs4_opendata_put(opendata);
+ nfs4_put_state_owner(sp);
+ return 0; /* a failed follow-up SETATTR above does not change this result */
+err_free_label:
+ nfs4_label_free(olabel);
+err_opendata_put:
+ nfs4_opendata_put(opendata);
+err_put_state_owner:
+ nfs4_put_state_owner(sp);
+out_err:
+ return status;
+}
+
+/*
+ * Open a file, retrying as required.
+ *
+ * Calls _nfs4_do_open() in a loop. The create verifier is built once,
+ * from jiffies and the current pid, and reused for every attempt.
+ * -NFS4ERR_BAD_SEQID (with a rate-limited warning), -NFS4ERR_BAD_STATEID,
+ * -NFS4ERR_EXPIRED (after scheduling lease recovery) and -EAGAIN always
+ * cause a retry. A true return from nfs4_clear_cap_atomic_open_v1()
+ * re-evaluates exception.retry. All other errors go through
+ * nfs4_handle_exception(), which decides whether to retry.
+ *
+ * Returns ctx->state on success, or an ERR_PTR() built from the value
+ * nfs4_handle_exception() returned.
+ */
+static struct nfs4_state *nfs4_do_open(struct inode *dir,
+ struct nfs_open_context *ctx,
+ int flags,
+ struct iattr *sattr,
+ struct nfs4_label *label,
+ int *opened)
+{
+ struct nfs_server *server = NFS_SERVER(dir);
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ struct nfs4_state *res;
+ struct nfs4_open_createattrs c = {
+ .label = label,
+ .sattr = sattr,
+ .verf = {
+ [0] = (__u32)jiffies,
+ [1] = (__u32)current->pid,
+ },
+ };
+ int status;
+
+ do {
+ status = _nfs4_do_open(dir, ctx, flags, &c, opened);
+ res = ctx->state;
+ trace_nfs4_open_file(ctx, flags, status);
+ if (status == 0)
+ break;
+ /* NOTE: BAD_SEQID means the server and client disagree about the
+ * book-keeping w.r.t. state-changing operations
+ * (OPEN/CLOSE/LOCK/LOCKU...)
+ * It is actually a sign of a bug on the client or on the server.
+ *
+ * If we receive a BAD_SEQID error in the particular case of
+ * doing an OPEN, we assume that nfs_increment_open_seqid() will
+ * have unhashed the old state_owner for us, and that we can
+ * therefore safely retry using a new one. We should still warn
+ * the user though...
+ */
+ if (status == -NFS4ERR_BAD_SEQID) {
+ pr_warn_ratelimited("NFS: v4 server %s "
+ " returned a bad sequence-id error!\n",
+ NFS_SERVER(dir)->nfs_client->cl_hostname);
+ exception.retry = 1;
+ continue;
+ }
+ /*
+ * BAD_STATEID on OPEN means that the server cancelled our
+ * state before it received the OPEN_CONFIRM.
+ * Recover by retrying the request as per the discussion
+ * on Page 181 of RFC3530.
+ */
+ if (status == -NFS4ERR_BAD_STATEID) {
+ exception.retry = 1;
+ continue;
+ }
+ if (status == -NFS4ERR_EXPIRED) {
+ nfs4_schedule_lease_recovery(server->nfs_client);
+ exception.retry = 1;
+ continue;
+ }
+ if (status == -EAGAIN) {
+ /* We must have found a delegation */
+ exception.retry = 1;
+ continue;
+ }
+ if (nfs4_clear_cap_atomic_open_v1(server, status, &exception))
+ continue;
+ res = ERR_PTR(nfs4_handle_exception(server,
+ status, &exception));
+ } while (exception.retry);
+ return res;
+}
+
+/*
+ * Issue a single SETATTR call for @inode.
+ *
+ * The stateid placed in @arg->stateid is chosen as follows:
+ *  - no ATTR_SIZE in the request: nfs4_inode_make_writeable() is called
+ *    and the zero stateid is used;
+ *  - otherwise a write delegation stateid if
+ *    nfs4_copy_delegation_stateid() provides one;
+ *  - otherwise a stateid selected from ctx->state when @ctx has one;
+ *  - otherwise the zero stateid.
+ *
+ * Returns -EBADF if the open stateid of ctx->state is not valid or if
+ * nfs4_select_rw_stateid() reports -EIO, the error from
+ * nfs_get_lock_context() if that fails, and otherwise the status of the
+ * RPC call.
+ */
+static int _nfs4_do_setattr(struct inode *inode,
+ struct nfs_setattrargs *arg,
+ struct nfs_setattrres *res,
+ const struct cred *cred,
+ struct nfs_open_context *ctx)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+ .rpc_argp = arg,
+ .rpc_resp = res,
+ .rpc_cred = cred,
+ };
+ const struct cred *delegation_cred = NULL;
+ /* Sampled before the call; handed to renew_lease() on success */
+ unsigned long timestamp = jiffies;
+ bool truncate;
+ int status;
+
+ nfs_fattr_init(res->fattr);
+
+ /* Servers should only apply open mode checks for file size changes */
+ truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false;
+ if (!truncate) {
+ nfs4_inode_make_writeable(inode);
+ goto zero_stateid;
+ }
+
+ if (nfs4_copy_delegation_stateid(inode, FMODE_WRITE, &arg->stateid, &delegation_cred)) {
+ /* Use that stateid */
+ } else if (ctx != NULL && ctx->state) {
+ struct nfs_lock_context *l_ctx;
+ if (!nfs4_valid_open_stateid(ctx->state))
+ return -EBADF;
+ l_ctx = nfs_get_lock_context(ctx);
+ if (IS_ERR(l_ctx))
+ return PTR_ERR(l_ctx);
+ status = nfs4_select_rw_stateid(ctx->state, FMODE_WRITE, l_ctx,
+ &arg->stateid, &delegation_cred);
+ nfs_put_lock_context(l_ctx);
+ if (status == -EIO)
+ return -EBADF;
+ else if (status == -EAGAIN)
+ goto zero_stateid;
+ } else {
+ /* The label lives inside this branch; both gotos above land here */
+zero_stateid:
+ nfs4_stateid_copy(&arg->stateid, &zero_stateid);
+ }
+ /* A credential returned with the stateid replaces the caller's @cred */
+ if (delegation_cred)
+ msg.rpc_cred = delegation_cred;
+
+ status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1);
+
+ put_cred(delegation_cred);
+ if (status == 0 && ctx != NULL)
+ renew_lease(server, timestamp);
+ trace_nfs4_setattr(inode, &arg->stateid, status);
+ return status;
+}
+
+/*
+ * SETATTR with exception handling.
+ *
+ * Rebuilds the attribute bitmask and calls _nfs4_do_setattr() on every
+ * pass, looping while nfs4_handle_exception() sets exception.retry.
+ *
+ * On -NFS4ERR_OPENMODE, if the open context's state does not include
+ * FMODE_WRITE, the loop is abandoned with -EACCES when ATTR_OPEN is set
+ * in @sattr and -EBADF otherwise.
+ *
+ * Returns the last value produced by nfs4_handle_exception(), or one of
+ * the two errors above.
+ */
+static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
+ struct nfs_fattr *fattr, struct iattr *sattr,
+ struct nfs_open_context *ctx, struct nfs4_label *ilabel,
+ struct nfs4_label *olabel)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ __u32 bitmask[NFS4_BITMASK_SZ];
+ struct nfs4_state *state = ctx ? ctx->state : NULL;
+ struct nfs_setattrargs arg = {
+ .fh = NFS_FH(inode),
+ .iap = sattr,
+ .server = server,
+ .bitmask = bitmask,
+ .label = ilabel,
+ };
+ struct nfs_setattrres res = {
+ .fattr = fattr,
+ .label = olabel,
+ .server = server,
+ };
+ struct nfs4_exception exception = {
+ .state = state,
+ .inode = inode,
+ .stateid = &arg.stateid,
+ };
+ int err;
+
+ do {
+ nfs4_bitmap_copy_adjust_setattr(bitmask,
+ nfs4_bitmask(server, olabel),
+ inode);
+
+ err = _nfs4_do_setattr(inode, &arg, &res, cred, ctx);
+ switch (err) {
+ case -NFS4ERR_OPENMODE:
+ if (!(sattr->ia_valid & ATTR_SIZE)) {
+ pr_warn_once("NFSv4: server %s is incorrectly "
+ "applying open mode checks to "
+ "a SETATTR that is not "
+ "changing file size.\n",
+ server->nfs_client->cl_hostname);
+ }
+ if (state && !(state->state & FMODE_WRITE)) {
+ err = -EBADF;
+ if (sattr->ia_valid & ATTR_OPEN)
+ err = -EACCES;
+ goto out;
+ }
+ }
+ /* Every status that did not "goto out" above ends up here */
+ err = nfs4_handle_exception(server, err, &exception);
+ } while (exception.retry);
+out:
+ return err;
+}
+
+/*
+ * Defer to pnfs_wait_on_layoutreturn() only when @inode is non-NULL and
+ * nfs_have_layout() reports a layout for it; otherwise return false.
+ */
+static bool
+nfs4_wait_on_layoutreturn(struct inode *inode, struct rpc_task *task)
+{
+	if (inode != NULL && nfs_have_layout(inode))
+		return pnfs_wait_on_layoutreturn(inode, task);
+
+	return false;
+}
+
+/*
+ * Update the seqid of an open stateid
+ *
+ * Brings @dst in line with @state->open_stateid under the state's seqlock:
+ *  - if the "other" part of @dst does not match the state, the whole open
+ *    stateid is copied into @dst;
+ *  - otherwise only dst->seqid is replaced, and only when it is behind
+ *    the state's seqid.
+ * Nothing is done when nfs4_valid_open_stateid() fails.
+ */
+static void nfs4_sync_open_stateid(nfs4_stateid *dst,
+ struct nfs4_state *state)
+{
+ __be32 seqid_open;
+ u32 dst_seqid;
+ int seq;
+
+ for (;;) {
+ if (!nfs4_valid_open_stateid(state))
+ break;
+ seq = read_seqbegin(&state->seqlock);
+ if (!nfs4_state_match_open_stateid_other(state, dst)) {
+ nfs4_stateid_copy(dst, &state->open_stateid);
+ /* Redo the copy if the seqlock sequence changed meanwhile */
+ if (read_seqretry(&state->seqlock, seq))
+ continue;
+ break;
+ }
+ seqid_open = state->open_stateid.seqid;
+ if (read_seqretry(&state->seqlock, seq))
+ continue;
+
+ dst_seqid = be32_to_cpu(dst->seqid);
+ /* Signed difference, so the comparison survives seqid wraparound */
+ if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) < 0)
+ dst->seqid = seqid_open;
+ break;
+ }
+}
+
+/*
+ * Update the seqid of an open stateid after receiving
+ * NFS4ERR_OLD_STATEID
+ *
+ * Returns true when dst->seqid was changed, either to the state's newer
+ * seqid or, after a full 5 second wait, to dst's own seqid plus one.
+ * Returns false when the open stateid is no longer valid, when the
+ * "other" part of @dst does not match the state, or when a fatal signal
+ * is pending.
+ */
+static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst,
+ struct nfs4_state *state)
+{
+ __be32 seqid_open;
+ u32 dst_seqid;
+ bool ret;
+ /*
+ * NOTE(review): status is only initialised here, not per iteration.
+ * Once one wait is cut short (status = 0), a later wait that runs the
+ * full timeout leaves status at 0 and loops again instead of reaching
+ * the "lost a seqid" path - confirm this is intended.
+ */
+ int seq, status = -EAGAIN;
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ ret = false;
+ if (!nfs4_valid_open_stateid(state))
+ break;
+ seq = read_seqbegin(&state->seqlock);
+ if (!nfs4_state_match_open_stateid_other(state, dst)) {
+ if (read_seqretry(&state->seqlock, seq))
+ continue;
+ break;
+ }
+
+ write_seqlock(&state->seqlock);
+ seqid_open = state->open_stateid.seqid;
+
+ dst_seqid = be32_to_cpu(dst->seqid);
+
+ /* Did another OPEN bump the state's seqid? try again: */
+ if ((s32)(be32_to_cpu(seqid_open) - dst_seqid) > 0) {
+ dst->seqid = seqid_open;
+ write_sequnlock(&state->seqlock);
+ ret = true;
+ break;
+ }
+
+ /* server says we're behind but we haven't seen the update yet */
+ set_bit(NFS_STATE_CHANGE_WAIT, &state->flags);
+ /* Queue on the waitq before dropping the seqlock */
+ prepare_to_wait(&state->waitq, &wait, TASK_KILLABLE);
+ write_sequnlock(&state->seqlock);
+ trace_nfs4_close_stateid_update_wait(state->inode, dst, 0);
+
+ if (fatal_signal_pending(current))
+ status = -EINTR;
+ else
+ /* A non-zero return means the sleep ended before the timeout */
+ if (schedule_timeout(5*HZ) != 0)
+ status = 0;
+
+ finish_wait(&state->waitq, &wait);
+
+ if (!status)
+ continue;
+ if (status == -EINTR)
+ break;
+
+ /* we slept the whole 5 seconds, we must have lost a seqid */
+ dst->seqid = cpu_to_be32(dst_seqid + 1);
+ ret = true;
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Per-call data for the asynchronous CLOSE / OPEN_DOWNGRADE task started
+ * by nfs4_do_close() and released by nfs4_free_closedata().
+ */
+struct nfs4_closedata {
+ /* Set from state->inode in nfs4_do_close() */
+ struct inode *inode;
+ /* Open state being closed or downgraded */
+ struct nfs4_state *state;
+ /* RPC argument and result, wired up as msg.rpc_argp / msg.rpc_resp */
+ struct nfs_closeargs arg;
+ struct nfs_closeres res;
+ /* Layoutreturn state; roc holds the result of pnfs_roc() */
+ struct {
+ struct nfs4_layoutreturn_args arg;
+ struct nfs4_layoutreturn_res res;
+ struct nfs4_xdr_opaque_data ld_private;
+ u32 roc_barrier;
+ bool roc;
+ } lr;
+ /* Storage that res.fattr points at */
+ struct nfs_fattr fattr;
+ /* jiffies sampled in nfs4_close_prepare(), used for renew_lease() */
+ unsigned long timestamp;
+};
+
+/*
+ * rpc_release callback of nfs4_close_ops: drops everything the close
+ * calldata holds and frees it.
+ */
+static void nfs4_free_closedata(void *data)
+{
+ struct nfs4_closedata *calldata = data;
+ /* Fetched up front, before the open state reference is put below */
+ struct nfs4_state_owner *sp = calldata->state->owner;
+ struct super_block *sb = calldata->state->inode->i_sb;
+
+ if (calldata->lr.roc)
+ pnfs_roc_release(&calldata->lr.arg, &calldata->lr.res,
+ calldata->res.lr_ret);
+ nfs4_put_open_state(calldata->state);
+ nfs_free_seqid(calldata->arg.seqid);
+ nfs4_put_state_owner(sp);
+ /* Pairs with the nfs_sb_active() call in nfs4_do_close() */
+ nfs_sb_deactive(sb);
+ kfree(calldata);
+}
+
+/*
+ * rpc_call_done callback of nfs4_close_ops.
+ *
+ * Returns immediately if nfs4_sequence_done() returns zero. Otherwise the
+ * task status decides between three outcomes:
+ *  - restart the call (out_restart),
+ *  - clear the open stateid via nfs_clear_open_stateid() and finish, or
+ *  - finish without touching the open stateid (-NFS4ERR_OLD_STATEID that
+ *    could not be refreshed).
+ * Every path that passes the sequence check ends in out_release, which
+ * zeroes task->tk_status, releases the seqid and refreshes the inode
+ * from calldata->fattr.
+ */
+static void nfs4_close_done(struct rpc_task *task, void *data)
+{
+ struct nfs4_closedata *calldata = data;
+ struct nfs4_state *state = calldata->state;
+ struct nfs_server *server = NFS_SERVER(calldata->inode);
+ /* Stays NULL unless the call succeeded */
+ nfs4_stateid *res_stateid = NULL;
+ struct nfs4_exception exception = {
+ .state = state,
+ .inode = calldata->inode,
+ .stateid = &calldata->arg.stateid,
+ };
+
+ dprintk("%s: begin!\n", __func__);
+ if (!nfs4_sequence_done(task, &calldata->res.seq_res))
+ return;
+ trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status);
+
+ /* Handle Layoutreturn errors */
+ if (pnfs_roc_done(task, &calldata->arg.lr_args, &calldata->res.lr_res,
+ &calldata->res.lr_ret) == -EAGAIN)
+ goto out_restart;
+
+ /* hmm. we are done with the inode, and in the process of freeing
+ * the state_owner. we keep this around to process errors
+ */
+ switch (task->tk_status) {
+ case 0:
+ res_stateid = &calldata->res.stateid;
+ renew_lease(server, calldata->timestamp);
+ break;
+ case -NFS4ERR_ACCESS:
+ /* Retry once without the attribute bitmask / fattr result */
+ if (calldata->arg.bitmask != NULL) {
+ calldata->arg.bitmask = NULL;
+ calldata->res.fattr = NULL;
+ goto out_restart;
+
+ }
+ break;
+ case -NFS4ERR_OLD_STATEID:
+ /* Did we race with OPEN? */
+ if (nfs4_refresh_open_old_stateid(&calldata->arg.stateid,
+ state))
+ goto out_restart;
+ goto out_release;
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_EXPIRED:
+ nfs4_free_revoked_stateid(server,
+ &calldata->arg.stateid,
+ task->tk_msg.rpc_cred);
+ fallthrough;
+ case -NFS4ERR_BAD_STATEID:
+ /* fmode == 0 is the value nfs4_close_prepare() uses for a full CLOSE */
+ if (calldata->arg.fmode == 0)
+ break;
+ fallthrough;
+ default:
+ task->tk_status = nfs4_async_handle_exception(task,
+ server, task->tk_status, &exception);
+ if (exception.retry)
+ goto out_restart;
+ }
+ nfs_clear_open_stateid(state, &calldata->arg.stateid,
+ res_stateid, calldata->arg.fmode);
+out_release:
+ task->tk_status = 0;
+ nfs_release_seqid(calldata->arg.seqid);
+ nfs_refresh_inode(calldata->inode, &calldata->fattr);
+ dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);
+ return;
+out_restart:
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
+ goto out_release;
+}
+
+/*
+ * rpc_call_prepare callback of nfs4_close_ops.
+ *
+ * Works out, from the state's n_rdonly/n_wronly/n_rdwr counts and its
+ * NFS_O_*_STATE flags, which share modes must be kept (arg.fmode) and
+ * whether any RPC is needed at all (call_close). The procedure defaults
+ * to OPEN_DOWNGRADE and is switched to CLOSE when no mode remains.
+ *
+ * If no call is needed, task->tk_action is cleared so the task ends
+ * without nfs4_close_done() being run.
+ */
+static void nfs4_close_prepare(struct rpc_task *task, void *data)
+{
+ struct nfs4_closedata *calldata = data;
+ struct nfs4_state *state = calldata->state;
+ struct inode *inode = calldata->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct pnfs_layout_hdr *lo;
+ bool is_rdonly, is_wronly, is_rdwr;
+ int call_close = 0;
+
+ dprintk("%s: begin!\n", __func__);
+ if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
+ goto out_wait;
+
+ task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
+ spin_lock(&state->owner->so_lock);
+ is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
+ is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
+ is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
+ /* Calculate the change in open mode */
+ calldata->arg.fmode = 0;
+ if (state->n_rdwr == 0) {
+ if (state->n_rdonly == 0)
+ call_close |= is_rdonly;
+ else if (is_rdonly)
+ calldata->arg.fmode |= FMODE_READ;
+ if (state->n_wronly == 0)
+ call_close |= is_wronly;
+ else if (is_wronly)
+ calldata->arg.fmode |= FMODE_WRITE;
+ if (calldata->arg.fmode != (FMODE_READ|FMODE_WRITE))
+ call_close |= is_rdwr;
+ } else if (is_rdwr)
+ calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
+
+ nfs4_sync_open_stateid(&calldata->arg.stateid, state);
+ /* An invalid open stateid means there is nothing to send */
+ if (!nfs4_valid_open_stateid(state))
+ call_close = 0;
+ spin_unlock(&state->owner->so_lock);
+
+ if (!call_close) {
+ /* Note: exit _without_ calling nfs4_close_done */
+ goto out_no_action;
+ }
+
+ if (!calldata->lr.roc && nfs4_wait_on_layoutreturn(inode, task)) {
+ nfs_release_seqid(calldata->arg.seqid);
+ goto out_wait;
+ }
+
+ /* Drop the layoutreturn arguments if the layout is no longer valid */
+ lo = calldata->arg.lr_args ? calldata->arg.lr_args->layout : NULL;
+ if (lo && !pnfs_layout_is_valid(lo)) {
+ calldata->arg.lr_args = NULL;
+ calldata->res.lr_res = NULL;
+ }
+
+ if (calldata->arg.fmode == 0)
+ task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
+
+ if (calldata->arg.fmode == 0 || calldata->arg.fmode == FMODE_READ) {
+ /* Close-to-open cache consistency revalidation */
+ if (!nfs4_have_delegation(inode, FMODE_READ)) {
+ nfs4_bitmask_set(calldata->arg.bitmask_store,
+ server->cache_consistency_bitmask,
+ inode, server, NULL);
+ calldata->arg.bitmask = calldata->arg.bitmask_store;
+ } else
+ calldata->arg.bitmask = NULL;
+ }
+
+ calldata->arg.share_access =
+ nfs4_map_atomic_open_share(NFS_SERVER(inode),
+ calldata->arg.fmode, 0);
+
+ /* Keep arg.bitmask and res.fattr either both set or both NULL */
+ if (calldata->res.fattr == NULL)
+ calldata->arg.bitmask = NULL;
+ else if (calldata->arg.bitmask == NULL)
+ calldata->res.fattr = NULL;
+ calldata->timestamp = jiffies;
+ if (nfs4_setup_sequence(NFS_SERVER(inode)->nfs_client,
+ &calldata->arg.seq_args,
+ &calldata->res.seq_res,
+ task) != 0)
+ nfs_release_seqid(calldata->arg.seqid);
+ dprintk("%s: done!\n", __func__);
+ return;
+out_no_action:
+ task->tk_action = NULL;
+out_wait:
+ nfs4_sequence_done(task, &calldata->res.seq_res);
+}
+
+/* RPC callbacks for the asynchronous close task started by nfs4_do_close() */
+static const struct rpc_call_ops nfs4_close_ops = {
+ .rpc_call_prepare = nfs4_close_prepare,
+ .rpc_call_done = nfs4_close_done,
+ .rpc_release = nfs4_free_closedata,
+};
+
+/*
+ * It is possible for data to be read/written from a mem-mapped file
+ * after the sys_close call (which hits the vfs layer as a flush).
+ * This means that we can't safely call nfsv4 close on a file until
+ * the inode is cleared. This in turn means that we are not good
+ * NFSv4 citizens - we do not indicate to the server to update the file's
+ * share state even when we are done with one of the three share
+ * stateid's in the inode.
+ *
+ * NOTE: Caller must be holding the sp->so_owner semaphore!
+ *
+ * Allocates the close calldata with @gfp_mask and starts an asynchronous
+ * RPC task using nfs4_close_ops. When @wait is non-zero the function
+ * waits for the task via rpc_wait_for_completion_task().
+ *
+ * Returns 0 (or the wait result), the error from rpc_run_task(), or
+ * -ENOMEM when the calldata, the open stateid copy or the seqid cannot
+ * be obtained. On those -ENOMEM paths the open state and state owner
+ * references are put here.
+ */
+int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+ struct nfs4_closedata *calldata;
+ struct nfs4_state_owner *sp = state->owner;
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE],
+ .rpc_cred = state->owner->so_cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = server->client,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_close_ops,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
+ };
+ int status = -ENOMEM;
+
+ nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP,
+ &task_setup_data.rpc_client, &msg);
+
+ calldata = kzalloc(sizeof(*calldata), gfp_mask);
+ if (calldata == NULL)
+ goto out;
+ nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1, 0);
+ calldata->inode = state->inode;
+ calldata->state = state;
+ calldata->arg.fh = NFS_FH(state->inode);
+ if (!nfs4_copy_open_stateid(&calldata->arg.stateid, state))
+ goto out_free_calldata;
+ /* Serialization for the sequence id */
+ alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
+ calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask);
+ if (IS_ERR(calldata->arg.seqid))
+ goto out_free_calldata;
+ nfs_fattr_init(&calldata->fattr);
+ calldata->arg.fmode = 0;
+ calldata->lr.arg.ld_private = &calldata->lr.ld_private;
+ calldata->res.fattr = &calldata->fattr;
+ calldata->res.seqid = calldata->arg.seqid;
+ calldata->res.server = server;
+ calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ /* Only attach layoutreturn arguments when pnfs_roc() asks for them */
+ calldata->lr.roc = pnfs_roc(state->inode,
+ &calldata->lr.arg, &calldata->lr.res, msg.rpc_cred);
+ if (calldata->lr.roc) {
+ calldata->arg.lr_args = &calldata->lr.arg;
+ calldata->res.lr_res = &calldata->lr.res;
+ }
+ /* Undone by nfs_sb_deactive() in nfs4_free_closedata() */
+ nfs_sb_active(calldata->inode->i_sb);
+
+ msg.rpc_argp = &calldata->arg;
+ msg.rpc_resp = &calldata->res;
+ task_setup_data.callback_data = calldata;
+ task = rpc_run_task(&task_setup_data);
+ /*
+ * NOTE(review): no cleanup is done here on failure; presumably
+ * rpc_run_task() invokes the rpc_release callback itself - confirm.
+ */
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ status = 0;
+ if (wait)
+ status = rpc_wait_for_completion_task(task);
+ rpc_put_task(task);
+ return status;
+out_free_calldata:
+ kfree(calldata);
+out:
+ nfs4_put_open_state(state);
+ nfs4_put_state_owner(sp);
+ return status;
+}
+
+/*
+ * Open @ctx under @dir via nfs4_do_open(), using a security label
+ * initialised from the dentry and @attr for the duration of the call.
+ *
+ * Returns the inode of the resulting state, or the ERR_PTR() produced by
+ * nfs4_do_open().
+ */
+static struct inode *
+nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx,
+		int open_flags, struct iattr *attr, int *opened)
+{
+	struct nfs4_label l;
+	struct nfs4_label *label;
+	struct nfs4_state *state;
+
+	label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
+
+	/* Protect against concurrent sillydeletes */
+	state = nfs4_do_open(dir, ctx, open_flags, attr, label, opened);
+
+	/* The label is only needed while the open is in flight */
+	nfs4_label_release_security(label);
+
+	if (!IS_ERR(state))
+		return state->inode;
+	return ERR_CAST(state);
+}
+
+/*
+ * Close the state attached to @ctx, if any: through nfs4_close_sync()
+ * when @is_sync is non-zero, through nfs4_close_state() otherwise.
+ */
+static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+	if (ctx->state == NULL)
+		return;
+
+	if (!is_sync) {
+		nfs4_close_state(ctx->state, _nfs4_ctx_to_openmode(ctx));
+		return;
+	}
+	nfs4_close_sync(ctx->state, _nfs4_ctx_to_openmode(ctx));
+}
+
+/*
+ * Per-minor-version attribute masks, used by _nfs4_server_capabilities()
+ * to trim the server's reply. Each is "2*X - 1"; assuming X is a single
+ * bit, that is every bit up to and including X - TODO confirm against the
+ * FATTR4_* definitions.
+ */
+#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL)
+#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL)
+#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_XATTR_SUPPORT - 1UL)
+
+/*
+ * Query the server's capabilities for @fhandle and record them in
+ * @server.
+ *
+ * On success this updates server->attr_bitmask, attr_bitmask_nl,
+ * cache_consistency_bitmask, exclcreat_bitmask, acl_bitmask,
+ * fh_expire_type and the NFS_CAP_* bits listed below in server->caps.
+ * On failure @server is left untouched.
+ *
+ * Returns the status of the RPC call.
+ */
+static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
+{
+ u32 bitmask[3] = {}, minorversion = server->nfs_client->cl_minorversion;
+ struct nfs4_server_caps_arg args = {
+ .fhandle = fhandle,
+ .bitmask = bitmask,
+ };
+ struct nfs4_server_caps_res res = {};
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SERVER_CAPS],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ int status;
+ int i;
+
+ bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS |
+ FATTR4_WORD0_FH_EXPIRE_TYPE |
+ FATTR4_WORD0_LINK_SUPPORT |
+ FATTR4_WORD0_SYMLINK_SUPPORT |
+ FATTR4_WORD0_ACLSUPPORT;
+ /* Only requested for non-zero minor versions */
+ if (minorversion)
+ bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+
+ status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+ if (status == 0) {
+ /* Sanity check the server answers */
+ switch (minorversion) {
+ case 0:
+ res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK;
+ res.attr_bitmask[2] = 0;
+ break;
+ case 1:
+ res.attr_bitmask[2] &= FATTR4_WORD2_NFS41_MASK;
+ break;
+ case 2:
+ res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK;
+ }
+ memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
+ /* Clear every capability derived below, then re-add what the reply shows */
+ server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
+ NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
+ NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|
+ NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
+ NFS_CAP_CTIME|NFS_CAP_MTIME|
+ NFS_CAP_SECURITY_LABEL);
+ if (res.attr_bitmask[0] & FATTR4_WORD0_ACL &&
+ res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
+ server->caps |= NFS_CAP_ACLS;
+ if (res.has_links != 0)
+ server->caps |= NFS_CAP_HARDLINKS;
+ if (res.has_symlinks != 0)
+ server->caps |= NFS_CAP_SYMLINKS;
+ if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID)
+ server->caps |= NFS_CAP_FILEID;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_MODE)
+ server->caps |= NFS_CAP_MODE;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS)
+ server->caps |= NFS_CAP_NLINK;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER)
+ server->caps |= NFS_CAP_OWNER;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP)
+ server->caps |= NFS_CAP_OWNER_GROUP;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS)
+ server->caps |= NFS_CAP_ATIME;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA)
+ server->caps |= NFS_CAP_CTIME;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
+ server->caps |= NFS_CAP_MTIME;
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
+ server->caps |= NFS_CAP_SECURITY_LABEL;
+#endif
+ /*
+ * attr_bitmask_nl is the same mask minus the security label bit.
+ * NOTE(review): the length uses sizeof(server->attr_bitmask), not
+ * sizeof(server->attr_bitmask_nl) - confirm both arrays match.
+ */
+ memcpy(server->attr_bitmask_nl, res.attr_bitmask,
+ sizeof(server->attr_bitmask));
+ server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+
+ /* Restrict the cache consistency mask to change, size, ctime and mtime */
+ memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
+ server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
+ server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+ server->cache_consistency_bitmask[2] = 0;
+
+ /* Avoid a regression due to buggy server */
+ for (i = 0; i < ARRAY_SIZE(res.exclcreat_bitmask); i++)
+ res.exclcreat_bitmask[i] &= res.attr_bitmask[i];
+ memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask,
+ sizeof(server->exclcreat_bitmask));
+
+ server->acl_bitmask = res.acl_bitmask;
+ server->fh_expire_type = res.fh_expire_type;
+ }
+
+ return status;
+}
+
+/*
+ * Probe the server capabilities for @fhandle, repeating the probe for as
+ * long as nfs4_handle_exception() requests a retry.
+ *
+ * Returns the final value from nfs4_handle_exception().
+ */
+int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
+{
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int status;
+	int err;
+
+	do {
+		status = _nfs4_server_capabilities(server, fhandle);
+		err = nfs4_handle_exception(server, status, &exception);
+	} while (exception.retry);
+
+	return err;
+}
+
+/*
+ * Issue one LOOKUP_ROOT call, filling @fhandle and info->fattr.
+ *
+ * The request uses nfs4_fattr_bitmap with the security label bit masked
+ * out of word 2. Returns the status of the RPC call.
+ */
+static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_fsinfo *info)
+{
+	u32 bitmask[3];
+	struct nfs4_lookup_root_arg args = {
+		.bitmask = bitmask,
+	};
+	struct nfs4_lookup_res res = {
+		.server = server,
+		.fattr = info->fattr,
+		.fh = fhandle,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP_ROOT],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+
+	nfs_fattr_init(info->fattr);
+
+	/*
+	 * Process the label in the upcoming getfattr
+	 */
+	bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL;
+	bitmask[1] = nfs4_fattr_bitmap[1];
+	bitmask[0] = nfs4_fattr_bitmap[0];
+
+	return nfs4_call_sync(server->client, server, &msg,
+			      &args.seq_args, &res.seq_res, 0);
+}
+
+/*
+ * Look up the server root with exception handling.
+ *
+ * Success and -NFS4ERR_WRONGSEC are returned to the caller as they are;
+ * every other status goes through nfs4_handle_exception() and may cause
+ * another attempt.
+ */
+static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_fsinfo *info)
+{
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int err;
+
+	do {
+		err = _nfs4_lookup_root(server, fhandle, info);
+		trace_nfs4_lookup_root(server, fhandle, info->fattr, err);
+		/* WRONGSEC is left for the caller to act on */
+		if (err == 0 || err == -NFS4ERR_WRONGSEC)
+			break;
+		err = nfs4_handle_exception(server, err, &exception);
+	} while (exception.retry);
+
+	return err;
+}
+
+/*
+ * Create an RPC auth of pseudoflavor @flavor on server->client, then look
+ * up the root.
+ *
+ * Returns -EACCES if rpcauth_create() fails, otherwise the result of
+ * nfs4_lookup_root().
+ */
+static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_fsinfo *info, rpc_authflavor_t flavor)
+{
+	struct rpc_auth_create_args auth_args = {
+		.pseudoflavor = flavor,
+	};
+	struct rpc_auth *auth = rpcauth_create(&auth_args, server->client);
+
+	if (!IS_ERR(auth))
+		return nfs4_lookup_root(server, fhandle, info);
+
+	return -EACCES;
+}
+
+/*
+ * Retry pseudoroot lookup with various security flavors. We do this when:
+ *
+ *   NFSv4.0: the PUTROOTFH operation returns NFS4ERR_WRONGSEC
+ *   NFSv4.1: the server does not support the SECINFO_NO_NAME operation
+ *
+ * The flavors listed in server->auth_info are tried when there are any;
+ * otherwise a built-in default list is used. The walk stops at the first
+ * flavor whose lookup returns something other than -NFS4ERR_WRONGSEC or
+ * -EACCES.
+ *
+ * Returns zero on success, or a negative NFS4ERR value, or a
+ * negative errno value.
+ */
+static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_fsinfo *info)
+{
+	/* Per 3530bis 15.33.5 */
+	static const rpc_authflavor_t flav_array[] = {
+		RPC_AUTH_GSS_KRB5P,
+		RPC_AUTH_GSS_KRB5I,
+		RPC_AUTH_GSS_KRB5,
+		RPC_AUTH_UNIX,			/* courtesy */
+		RPC_AUTH_NULL,
+	};
+	/* no flavors specified by user, try default list */
+	const rpc_authflavor_t *flavors = flav_array;
+	size_t nr_flavors = ARRAY_SIZE(flav_array);
+	int status = -EPERM;
+	size_t i;
+
+	if (server->auth_info.flavor_len > 0) {
+		/* try each flavor specified by user */
+		flavors = server->auth_info.flavors;
+		nr_flavors = server->auth_info.flavor_len;
+	}
+
+	for (i = 0; i < nr_flavors; i++) {
+		status = nfs4_lookup_root_sec(server, fhandle, info,
+					      flavors[i]);
+		if (status != -NFS4ERR_WRONGSEC && status != -EACCES)
+			break;
+	}
+
+	/*
+	 * -EACCES could mean that the user doesn't have correct permissions
+	 * to access the mount.  It could also mean that we tried to mount
+	 * with a gss auth flavor, but rpc.gssd isn't running.  Either way,
+	 * existing mount programs don't handle -EACCES very well so it should
+	 * be mapped to -EPERM instead.
+	 */
+	return (status == -EACCES) ? -EPERM : status;
+}
+
+/**
+ * nfs4_proc_get_rootfh - get file handle for server's pseudoroot
+ * @server: initialized nfs_server handle
+ * @fhandle: we fill in the pseudo-fs root file handle
+ * @info: we fill in an FSINFO struct
+ * @auth_probe: probe the auth flavours
+ *
+ * When @auth_probe is false the root is first looked up with the current
+ * security flavour, and the flavour probe (cl_mvops->find_root_sec) is
+ * run only if that lookup fails with -NFS4ERR_WRONGSEC. When @auth_probe
+ * is true the probe is run straight away.
+ *
+ * On success the server capabilities and fsinfo are fetched as well.
+ *
+ * Returns zero on success, or a negative errno.
+ */
+int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
+			 struct nfs_fsinfo *info,
+			 bool auth_probe)
+{
+	int status = 0;
+
+	if (!auth_probe)
+		status = nfs4_lookup_root(server, fhandle, info);
+
+	/*
+	 * nfs4_lookup_root() hands back WRONGSEC as a negative value, so the
+	 * comparison must use -NFS4ERR_WRONGSEC for the fallback to trigger.
+	 */
+	if (auth_probe || status == -NFS4ERR_WRONGSEC)
+		status = server->nfs_client->cl_mvops->find_root_sec(server,
+				fhandle, info);
+
+	if (status == 0)
+		status = nfs4_server_capabilities(server, fhandle);
+	if (status == 0)
+		status = nfs4_do_fsinfo(server, fhandle, info);
+
+	return nfs4_map_errors(status);
+}
+
+/*
+ * Fetch the server capabilities and the attributes of @mntfh, and copy
+ * the returned fsid into server->fsid when it is valid and differs.
+ *
+ * Returns the first negative error encountered, otherwise the result of
+ * nfs4_proc_getattr().
+ */
+static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
+			      struct nfs_fsinfo *info)
+{
+	struct nfs_fattr *fattr = info->fattr;
+	struct nfs4_label *label = fattr->label;
+	int error;
+
+	error = nfs4_server_capabilities(server, mntfh);
+	if (error < 0) {
+		dprintk("nfs4_get_root: getcaps error = %d\n", -error);
+		return error;
+	}
+
+	error = nfs4_proc_getattr(server, mntfh, fattr, label, NULL);
+	if (error < 0) {
+		dprintk("nfs4_get_root: getattr error = %d\n", -error);
+		return error;
+	}
+
+	/* Adopt the fsid from the reply only if one was actually returned */
+	if ((fattr->valid & NFS_ATTR_FATTR_FSID) &&
+	    !nfs_fsid_equal(&server->fsid, &fattr->fsid))
+		memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
+
+	return error;
+}
+
+/*
+ * Get locations and (maybe) other attributes of a referral.
+ * Note that we'll actually follow the referral later when
+ * we detect fsid mismatch in inode revalidation
+ *
+ * On success @fattr is overwritten with the attributes that came back
+ * with the locations and @fhandle is zeroed. Returns -ENOMEM if either
+ * allocation fails, -NFS4ERR_MOVED if the returned fsid equals that of
+ * @dir's server, or the status of nfs4_proc_fs_locations().
+ */
+static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir,
+			     const struct qstr *name, struct nfs_fattr *fattr,
+			     struct nfs_fh *fhandle)
+{
+	struct nfs4_fs_locations *locations = NULL;
+	struct page *page;
+	int status = -ENOMEM;
+
+	page = alloc_page(GFP_KERNEL);
+	if (page == NULL)
+		goto out_free;
+	locations = kmalloc(sizeof(*locations), GFP_KERNEL);
+	if (locations == NULL)
+		goto out_free;
+
+	status = nfs4_proc_fs_locations(client, dir, name, locations, page);
+	if (status != 0)
+		goto out_free;
+
+	/*
+	 * If the fsid didn't change, this is a migration event, not a
+	 * referral.  Cause us to drop into the exception handler, which
+	 * will kick off migration recovery.
+	 */
+	if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {
+		dprintk("%s: server did not return a different fsid for"
+			" a referral at %s\n", __func__, name->name);
+		status = -NFS4ERR_MOVED;
+		goto out_free;
+	}
+
+	/* Fixup attributes for the nfs_lookup() call to nfs_fhget() */
+	nfs_fixup_referral_attributes(&locations->fattr);
+
+	/* replace the lookup nfs_fattr with the locations nfs_fattr */
+	memcpy(fattr, &locations->fattr, sizeof(*fattr));
+	memset(fhandle, 0, sizeof(*fhandle));
+
+out_free:
+	if (page != NULL)
+		__free_page(page);
+	kfree(locations);
+	return status;
+}
+
+/*
+ * Issue one GETATTR call for @fhandle, storing the result in @fattr and
+ * @label.
+ *
+ * @inode may be NULL. When it is set and the mount has
+ * NFS_MOUNT_SOFTREVAL, the call is made with RPC_TASK_TIMEOUT.
+ *
+ * Returns the status of the RPC call.
+ */
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct nfs4_label *label,
+ struct inode *inode)
+{
+ __u32 bitmask[NFS4_BITMASK_SZ];
+ struct nfs4_getattr_arg args = {
+ .fh = fhandle,
+ .bitmask = bitmask,
+ };
+ struct nfs4_getattr_res res = {
+ .fattr = fattr,
+ .label = label,
+ .server = server,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ unsigned short task_flags = 0;
+
+ /* Is this an attribute revalidation, subject to softreval? */
+ if (inode && (server->flags & NFS_MOUNT_SOFTREVAL))
+ task_flags |= RPC_TASK_TIMEOUT;
+
+ /* Build the request mask from the server's bitmask for this label */
+ nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, label), inode);
+
+ nfs_fattr_init(fattr);
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
+ return nfs4_do_call_sync(server->client, server, &msg,
+ &args.seq_args, &res.seq_res, task_flags);
+}
+
+/*
+ * GETATTR with exception handling: repeats _nfs4_proc_getattr() while
+ * nfs4_handle_exception() requests a retry and returns its final value.
+ */
+int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+		      struct nfs_fattr *fattr, struct nfs4_label *label,
+		      struct inode *inode)
+{
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int status;
+	int err;
+
+	do {
+		status = _nfs4_proc_getattr(server, fhandle, fattr, label, inode);
+		trace_nfs4_getattr(server, fhandle, fattr, status);
+		err = nfs4_handle_exception(server, status, &exception);
+	} while (exception.retry);
+
+	return err;
+}
+
+/*
+ * The file is not closed if it is opened due to a request to change
+ * the size of the file. The open call will not be needed once the
+ * VFS layer lookup-intents are implemented.
+ *
+ * Close is called when the inode is destroyed.
+ * If we haven't opened the file for O_WRONLY, we
+ * need to in the size_change case to obtain a stateid.
+ *
+ * Got race?
+ * Because OPEN is always done by name in nfsv4, it is
+ * possible that we opened a different file by the same
+ * name. We can recognize this race condition, but we
+ * can't do anything about it besides returning an error.
+ *
+ * This will be fixed with VFS changes (lookup-intent).
+ *
+ * Returns 0 without an RPC when nothing but ATTR_FILE/ATTR_OPEN remains
+ * in @sattr, the error from nfs4_label_alloc() if that fails, and
+ * otherwise the status of nfs4_do_setattr(). On success the inode is
+ * updated from @sattr and the returned @fattr.
+ */
+static int
+nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
+ struct iattr *sattr)
+{
+ struct inode *inode = d_inode(dentry);
+ const struct cred *cred = NULL;
+ struct nfs_open_context *ctx = NULL;
+ struct nfs4_label *label = NULL;
+ int status;
+
+ /* Shrinking the file: commit and return the layout first if required */
+ if (pnfs_ld_layoutret_on_setattr(inode) &&
+ sattr->ia_valid & ATTR_SIZE &&
+ sattr->ia_size < i_size_read(inode))
+ pnfs_commit_and_return_layout(inode);
+
+ nfs_fattr_init(fattr);
+
+ /* Deal with open(O_TRUNC) */
+ if (sattr->ia_valid & ATTR_OPEN)
+ sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME);
+
+ /* Optimization: if the end result is no change, don't RPC */
+ if ((sattr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)
+ return 0;
+
+ /* Search for an existing open(O_WRITE) file */
+ if (sattr->ia_valid & ATTR_FILE) {
+
+ ctx = nfs_file_open_context(sattr->ia_file);
+ if (ctx)
+ cred = ctx->cred;
+ }
+
+ label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+ if (IS_ERR(label))
+ return PTR_ERR(label);
+
+ /* Return any delegations if we're going to change ACLs */
+ if ((sattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
+ nfs4_inode_make_writeable(inode);
+
+ /* No input label is sent; @label only receives the returned one */
+ status = nfs4_do_setattr(inode, cred, fattr, sattr, ctx, NULL, label);
+ if (status == 0) {
+ nfs_setattr_update_inode(inode, sattr, fattr);
+ nfs_setsecurity(inode, fattr, label);
+ }
+ nfs4_label_free(label);
+ return status;
+}
+
+/*
+ * Issue one LOOKUP call for @dentry's name in @dir over @clnt, filling
+ * @fhandle, @fattr and @label.
+ *
+ * The call is made with RPC_TASK_TIMEOUT when
+ * nfs_lookup_is_soft_revalidate() is true for @dentry.
+ *
+ * Returns the status of the RPC call.
+ */
+static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
+ struct dentry *dentry, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct nfs4_label *label)
+{
+ struct nfs_server *server = NFS_SERVER(dir);
+ int status;
+ struct nfs4_lookup_arg args = {
+ .bitmask = server->attr_bitmask,
+ .dir_fh = NFS_FH(dir),
+ .name = &dentry->d_name,
+ };
+ struct nfs4_lookup_res res = {
+ .server = server,
+ .fattr = fattr,
+ .label = label,
+ .fh = fhandle,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ unsigned short task_flags = 0;
+
+ /* Is this an attribute revalidation, subject to softreval? */
+ if (nfs_lookup_is_soft_revalidate(dentry))
+ task_flags |= RPC_TASK_TIMEOUT;
+
+ /* NOTE(review): this replaces the .bitmask value set in the initializer */
+ args.bitmask = nfs4_bitmask(server, label);
+
+ nfs_fattr_init(fattr);
+
+ dprintk("NFS call lookup %pd2\n", dentry);
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
+ status = nfs4_do_call_sync(clnt, server, &msg,
+ &args.seq_args, &res.seq_res, task_flags);
+ dprintk("NFS reply lookup: %d\n", status);
+ return status;
+}
+
+/*
+ * Overwrite @fattr's mode and link count with those of a directory
+ * readable and searchable by everyone with two links, and mark the type,
+ * mode, nlink and mountpoint attributes as valid.
+ */
+static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr)
+{
+	fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
+	fattr->nlink = 2;
+	fattr->valid |= NFS_ATTR_FATTR_TYPE |
+			NFS_ATTR_FATTR_MODE |
+			NFS_ATTR_FATTR_NLINK |
+			NFS_ATTR_FATTR_MOUNTPOINT;
+}
+
+/*
+ * LOOKUP with exception handling, referral handling and security
+ * negotiation.
+ *
+ * @clnt is in/out: it supplies the RPC client to start with. If the
+ * server answers -NFS4ERR_WRONGSEC, a new client is obtained from
+ * nfs4_negotiate_security() and the lookup is retried with it. On
+ * success *@clnt is set to the client that was used; on failure a
+ * negotiated client is shut down here and *@clnt is left unchanged.
+ *
+ * Returns 0 on success, -ENOENT for -NFS4ERR_BADNAME, -EPERM when
+ * WRONGSEC is seen again after negotiation, the error from
+ * nfs4_negotiate_security(), or the result of nfs4_get_referral() /
+ * nfs4_handle_exception().
+ */
+static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
+ struct dentry *dentry, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr, struct nfs4_label *label)
+{
+ struct nfs4_exception exception = {
+ .interruptible = true,
+ };
+ struct rpc_clnt *client = *clnt;
+ const struct qstr *name = &dentry->d_name;
+ int err;
+ do {
+ err = _nfs4_proc_lookup(client, dir, dentry, fhandle, fattr, label);
+ trace_nfs4_lookup(dir, name, err);
+ switch (err) {
+ case -NFS4ERR_BADNAME:
+ err = -ENOENT;
+ goto out;
+ case -NFS4ERR_MOVED:
+ err = nfs4_get_referral(client, dir, name, fattr, fhandle);
+ if (err == -NFS4ERR_MOVED)
+ err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception);
+ goto out;
+ case -NFS4ERR_WRONGSEC:
+ err = -EPERM;
+ /* Already running on a negotiated client: give up */
+ if (client != *clnt)
+ goto out;
+ client = nfs4_negotiate_security(client, dir, name);
+ if (IS_ERR(client))
+ return PTR_ERR(client);
+
+ exception.retry = 1;
+ break;
+ default:
+ err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception);
+ }
+ } while (exception.retry);
+
+out:
+ if (err == 0)
+ *clnt = client;
+ else if (client != *clnt)
+ rpc_shutdown_client(client);
+
+ return err;
+}
+
+/*
+ * Look up @dentry in @dir.  If the lookup ended up on a client other than
+ * the directory's own, that client is shut down and @fattr is replaced by
+ * the synthetic attributes from nfs_fixup_secinfo_attributes().
+ */
+static int nfs4_proc_lookup(struct inode *dir, struct dentry *dentry,
+		struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+		struct nfs4_label *label)
+{
+	struct rpc_clnt *clnt = NFS_CLIENT(dir);
+	int err;
+
+	err = nfs4_proc_lookup_common(&clnt, dir, dentry, fhandle, fattr, label);
+	if (clnt == NFS_CLIENT(dir))
+		return err;
+
+	rpc_shutdown_client(clnt);
+	nfs_fixup_secinfo_attributes(fattr);
+	return err;
+}
+
+/*
+ * Look up @dentry in @dir and return an RPC client for it.
+ *
+ * Returns ERR_PTR() on lookup failure.  Otherwise returns either a clone of
+ * the directory's client, or the different client the lookup switched to.
+ */
+struct rpc_clnt *
+nfs4_proc_lookup_mountpoint(struct inode *dir, struct dentry *dentry,
+		struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+{
+	struct rpc_clnt *clnt = NFS_CLIENT(dir);
+	int err;
+
+	err = nfs4_proc_lookup_common(&clnt, dir, dentry, fhandle, fattr, NULL);
+	if (err < 0)
+		return ERR_PTR(err);
+	if (clnt != NFS_CLIENT(dir))
+		return clnt;
+	return rpc_clone_client(clnt);
+}
+
+/*
+ * Send one synchronous LOOKUPP for @inode.  Results are decoded into
+ * @fhandle, @fattr and @label; the raw call status is returned.
+ */
+static int _nfs4_proc_lookupp(struct inode *inode,
+		struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+		struct nfs4_label *label)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct rpc_clnt *clnt = NFS_CLIENT(inode);
+	struct nfs4_lookupp_arg args = {
+		.bitmask = server->attr_bitmask,
+		.fh = NFS_FH(inode),
+	};
+	struct nfs4_lookupp_res res = {
+		.server = server,
+		.fattr = fattr,
+		.label = label,
+		.fh = fhandle,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUPP],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	int ret;
+
+	/* The attribute mask that is actually sent depends on @label. */
+	args.bitmask = nfs4_bitmask(server, label);
+
+	nfs_fattr_init(fattr);
+
+	dprintk("NFS call lookupp ino=0x%lx\n", inode->i_ino);
+	ret = nfs4_call_sync(clnt, server, &msg, &args.seq_args,
+			&res.seq_res, 0);
+	dprintk("NFS reply lookupp: %d\n", ret);
+	return ret;
+}
+
+/*
+ * LOOKUPP with retry: repeat the call for as long as
+ * nfs4_handle_exception() asks for a retry, then return its result.
+ */
+static int nfs4_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle,
+		struct nfs_fattr *fattr, struct nfs4_label *label)
+{
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int status;
+
+	for (;;) {
+		status = _nfs4_proc_lookupp(inode, fhandle, fattr, label);
+		trace_nfs4_lookupp(inode, status);
+		status = nfs4_handle_exception(NFS_SERVER(inode), status,
+				&exception);
+		if (!exception.retry)
+			break;
+	}
+	return status;
+}
+
+/*
+ * Send one synchronous ACCESS call for @inode using @entry's mask and cred.
+ *
+ * When no read delegation is held, cache-consistency attributes are
+ * requested as well and used to refresh the inode on success.
+ * Returns -ENOMEM if the attribute buffer cannot be allocated, otherwise
+ * the status of the call.
+ */
+static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs4_accessargs args = {
+		.fh = NFS_FH(inode),
+		.access = entry->mask,
+	};
+	struct nfs4_accessres res = {
+		.server = server,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+		.rpc_cred = entry->cred,
+	};
+	int ret;
+
+	if (!nfs4_have_delegation(inode, FMODE_READ)) {
+		res.fattr = nfs_alloc_fattr();
+		if (res.fattr == NULL)
+			return -ENOMEM;
+		args.bitmask = server->cache_consistency_bitmask;
+	}
+
+	ret = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+	if (ret == 0) {
+		nfs_access_set_mask(entry, res.access);
+		if (res.fattr != NULL)
+			nfs_refresh_inode(inode, res.fattr);
+	}
+	/* res.fattr may still be NULL here (delegation held). */
+	nfs_free_fattr(res.fattr);
+	return ret;
+}
+
+/*
+ * ACCESS with retry: repeat the call for as long as
+ * nfs4_handle_exception() asks for a retry, then return its result.
+ */
+static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
+{
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int status;
+
+	for (;;) {
+		status = _nfs4_proc_access(inode, entry);
+		trace_nfs4_access(inode, status);
+		status = nfs4_handle_exception(NFS_SERVER(inode), status,
+				&exception);
+		if (!exception.retry)
+			break;
+	}
+	return status;
+}
+
+/*
+ * TODO: For the time being, we don't try to get any attributes
+ * along with any of the zero-copy operations READ, READDIR,
+ * READLINK, WRITE.
+ *
+ * In the case of the first three, we want to put the GETATTR
+ * after the read-type operation -- this is because it is hard
+ * to predict the length of a GETATTR response in v4, and thus
+ * align the READ data correctly. This means that the GETATTR
+ * may end up partially falling into the page cache, and we should
+ * shift it into the 'tail' of the xdr_buf before processing.
+ * To do this efficiently, we need to know the total length
+ * of data received, which doesn't seem to be available outside
+ * of the RPC layer.
+ *
+ * In the case of WRITE, we also want to put the GETATTR after
+ * the operation -- in this case because we want to make sure
+ * we get the post-operation mtime and size.
+ *
+ * Both of these changes to the XDR layer would in fact be quite
+ * minor, but I decided to leave them for a subsequent patch.
+ */
+static int _nfs4_proc_readlink(struct inode *inode, struct page *page,
+		unsigned int pgbase, unsigned int pglen)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs4_readlink args = {
+		.fh = NFS_FH(inode),
+		.pgbase = pgbase,
+		.pglen = pglen,
+		.pages = &page,
+	};
+	struct nfs4_readlink_res res;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READLINK],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+
+	/* One synchronous READLINK; the link text is received into @page. */
+	return nfs4_call_sync(server->client, server, &msg,
+			&args.seq_args, &res.seq_res, 0);
+}
+
+/*
+ * READLINK with retry: repeat the call for as long as
+ * nfs4_handle_exception() asks for a retry, then return its result.
+ */
+static int nfs4_proc_readlink(struct inode *inode, struct page *page,
+		unsigned int pgbase, unsigned int pglen)
+{
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int status;
+
+	for (;;) {
+		status = _nfs4_proc_readlink(inode, page, pgbase, pglen);
+		trace_nfs4_readlink(inode, status);
+		status = nfs4_handle_exception(NFS_SERVER(inode), status,
+				&exception);
+		if (!exception.retry)
+			break;
+	}
+	return status;
+}
+
+/*
+ * This is just for mknod.  open(O_CREAT) will always do ->open_context().
+ *
+ * Creates the file by opening it through a temporary open context, which
+ * is dropped again before returning.  The umask is applied to the mode
+ * here only when the server does not advertise the MODE_UMASK attribute.
+ */
+static int
+nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+		int flags)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs4_label l, *ilabel;
+	struct nfs_open_context *ctx;
+	struct nfs4_state *state;
+	int err = 0;
+
+	ctx = alloc_nfs_open_context(dentry, FMODE_READ, NULL);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
+
+	if ((server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK) == 0)
+		sattr->ia_mode &= ~current_umask();
+
+	state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL);
+	if (IS_ERR(state))
+		err = PTR_ERR(state);
+
+	nfs4_label_release_security(ilabel);
+	put_nfs_open_context(ctx);
+	return err;
+}
+
+/*
+ * Send one synchronous REMOVE of @name from @dir.
+ *
+ * On success the directory's change attribute is updated under i_lock, and
+ * when a directory was removed (@ftype == NF4DIR) the parent's link count
+ * is dropped, provided it is above 2.
+ */
+static int
+_nfs4_proc_remove(struct inode *dir, const struct qstr *name, u32 ftype)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs_removeargs args = {
+		.fh = NFS_FH(dir),
+		.name = *name,
+	};
+	struct nfs_removeres res = {
+		.server = server,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	/* Sampled before the call is issued. */
+	unsigned long start = jiffies;
+	int ret;
+
+	ret = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
+	if (ret != 0)
+		return ret;
+
+	spin_lock(&dir->i_lock);
+	nfs4_update_changeattr_locked(dir, &res.cinfo, start,
+			NFS_INO_INVALID_DATA);
+	/* Removing a directory decrements nlink in the parent */
+	if (ftype == NF4DIR && dir->i_nlink > 2)
+		nfs4_dec_nlink_locked(dir);
+	spin_unlock(&dir->i_lock);
+	return ret;
+}
+
+/*
+ * Remove the non-directory entry @dentry from @dir, with retry.
+ *
+ * If the dentry has an inode, its delegation is returned first when this is
+ * the last link; otherwise the inode is only made writeable.
+ */
+static int nfs4_proc_remove(struct inode *dir, struct dentry *dentry)
+{
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	struct inode *victim = d_inode(dentry);
+	int status;
+
+	if (victim != NULL) {
+		if (victim->i_nlink != 1)
+			nfs4_inode_make_writeable(victim);
+		else
+			nfs4_inode_return_delegation(victim);
+	}
+
+	for (;;) {
+		status = _nfs4_proc_remove(dir, &dentry->d_name, NF4REG);
+		trace_nfs4_remove(dir, &dentry->d_name, status);
+		status = nfs4_handle_exception(NFS_SERVER(dir), status,
+				&exception);
+		if (!exception.retry)
+			break;
+	}
+	return status;
+}
+
+/*
+ * Remove the directory @name from @dir: REMOVE with ftype NF4DIR, repeated
+ * for as long as nfs4_handle_exception() asks for a retry.
+ */
+static int nfs4_proc_rmdir(struct inode *dir, const struct qstr *name)
+{
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int status;
+
+	for (;;) {
+		status = _nfs4_proc_remove(dir, name, NF4DIR);
+		trace_nfs4_remove(dir, name, status);
+		status = nfs4_handle_exception(NFS_SERVER(dir), status,
+				&exception);
+		if (!exception.retry)
+			break;
+	}
+	return status;
+}
+
+/*
+ * Prepare @msg for an asynchronous REMOVE of @dentry: select the procedure,
+ * initialise the sequence arguments and the directory attribute buffer, and
+ * return any delegation held on @inode (if one is given).
+ */
+static void nfs4_proc_unlink_setup(struct rpc_message *msg,
+		struct dentry *dentry,
+		struct inode *inode)
+{
+	struct nfs_removeres *res = msg->rpc_resp;
+	struct nfs_removeargs *args = msg->rpc_argp;
+
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
+	res->server = NFS_SB(dentry->d_sb);
+	nfs4_init_sequence(&args->seq_args, &res->seq_res, 1, 0);
+
+	nfs_fattr_init(res->dir_attr);
+
+	if (inode != NULL)
+		nfs4_inode_return_delegation(inode);
+}
+
+/* rpc_prepare step of an async unlink: set up the sequence for @task. */
+static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
+{
+	struct nfs_client *clp = NFS_SB(data->dentry->d_sb)->nfs_client;
+
+	nfs4_setup_sequence(clp, &data->args.seq_args, &data->res.seq_res,
+			task);
+}
+
+/*
+ * Completion step of an async unlink.
+ *
+ * Returns 0 when sequence processing is not finished or the error handler
+ * answered -EAGAIN, and 1 otherwise.  On success the change attribute of
+ * @dir is updated before returning 1.
+ */
+static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
+{
+	struct nfs_unlinkdata *data = task->tk_calldata;
+	struct nfs_removeres *res = &data->res;
+	int handled;
+
+	if (!nfs4_sequence_done(task, &res->seq_res))
+		return 0;
+
+	handled = nfs4_async_handle_error(task, res->server, NULL,
+			&data->timeout);
+	if (handled == -EAGAIN)
+		return 0;
+
+	if (task->tk_status == 0) {
+		nfs4_update_changeattr(dir, &res->cinfo,
+				res->dir_attr->time_start,
+				NFS_INO_INVALID_DATA);
+	}
+	return 1;
+}
+
+/*
+ * Prepare @msg for an asynchronous RENAME.  The source inode, if any, is
+ * made writeable; a delegation on an existing target inode is returned.
+ */
+static void nfs4_proc_rename_setup(struct rpc_message *msg,
+		struct dentry *old_dentry,
+		struct dentry *new_dentry)
+{
+	struct nfs_renameargs *arg = msg->rpc_argp;
+	struct nfs_renameres *res = msg->rpc_resp;
+	struct inode *source = d_inode(old_dentry);
+	struct inode *target = d_inode(new_dentry);
+
+	if (source != NULL)
+		nfs4_inode_make_writeable(source);
+	if (target != NULL)
+		nfs4_inode_return_delegation(target);
+
+	res->server = NFS_SB(old_dentry->d_sb);
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
+	nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1, 0);
+}
+
+/* rpc_prepare step of an async rename: set up the sequence for @task. */
+static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
+{
+	struct nfs_client *clp = NFS_SERVER(data->old_dir)->nfs_client;
+
+	nfs4_setup_sequence(clp, &data->args.seq_args, &data->res.seq_res,
+			task);
+}
+
+/*
+ * Completion step of an async rename.
+ *
+ * Returns 0 when sequence processing is not finished or the error handler
+ * answered -EAGAIN, and 1 otherwise.  On success the change attributes of
+ * the affected directories are updated.
+ */
+static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+		struct inode *new_dir)
+{
+	struct nfs_renamedata *data = task->tk_calldata;
+	struct nfs_renameres *res = &data->res;
+
+	if (!nfs4_sequence_done(task, &res->seq_res))
+		return 0;
+	if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN)
+		return 0;
+
+	if (task->tk_status != 0)
+		return 1;
+
+	if (new_dir == old_dir) {
+		/* Rename within one directory: only its data is invalidated. */
+		nfs4_update_changeattr(old_dir, &res->old_cinfo,
+				res->old_fattr->time_start,
+				NFS_INO_INVALID_DATA);
+		return 1;
+	}
+
+	/* Note: If we moved a directory, nlink will change */
+	nfs4_update_changeattr(old_dir, &res->old_cinfo,
+			res->old_fattr->time_start,
+			NFS_INO_INVALID_OTHER |
+			NFS_INO_INVALID_DATA);
+	nfs4_update_changeattr(new_dir, &res->new_cinfo,
+			res->new_fattr->time_start,
+			NFS_INO_INVALID_OTHER |
+			NFS_INO_INVALID_DATA);
+	return 1;
+}
+
+/*
+ * Send one synchronous LINK creating @name in @dir for @inode.
+ *
+ * Returns -ENOMEM if the attribute buffer cannot be allocated, the error
+ * from nfs4_label_alloc() if that fails, otherwise the status of the call
+ * (or of the post-op inode update that follows a successful call).
+ */
+static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	__u32 bitmask[NFS4_BITMASK_SZ];
+	struct nfs4_link_arg arg = {
+		.fh = NFS_FH(inode),
+		.dir_fh = NFS_FH(dir),
+		.name = name,
+		.bitmask = bitmask,
+	};
+	struct nfs4_link_res res = {
+		.server = server,
+		.label = NULL,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
+		.rpc_argp = &arg,
+		.rpc_resp = &res,
+	};
+	int err = -ENOMEM;
+
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr == NULL)
+		goto out_free_fattr;
+
+	res.label = nfs4_label_alloc(server, GFP_KERNEL);
+	if (IS_ERR(res.label)) {
+		err = PTR_ERR(res.label);
+		goto out_free_fattr;
+	}
+
+	nfs4_inode_make_writeable(inode);
+	nfs4_bitmap_copy_adjust_setattr(bitmask, nfs4_bitmask(server, res.label), inode);
+
+	err = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+	if (err != 0)
+		goto out_free_label;
+
+	nfs4_update_changeattr(dir, &res.cinfo, res.fattr->time_start,
+			NFS_INO_INVALID_DATA);
+	err = nfs_post_op_update_inode(inode, res.fattr);
+	if (err == 0)
+		nfs_setsecurity(inode, res.fattr, res.label);
+
+out_free_label:
+	nfs4_label_free(res.label);
+out_free_fattr:
+	nfs_free_fattr(res.fattr);
+	return err;
+}
+
+/*
+ * LINK with retry: repeat the call for as long as
+ * nfs4_handle_exception() asks for a retry, then return its result.
+ */
+static int nfs4_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
+{
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int status;
+
+	for (;;) {
+		status = _nfs4_proc_link(inode, dir, name);
+		status = nfs4_handle_exception(NFS_SERVER(inode), status,
+				&exception);
+		if (!exception.retry)
+			break;
+	}
+	return status;
+}
+
+struct nfs4_createdata { /* heap-allocated state for one CREATE-style call */
+ struct rpc_message msg; /* rpc_argp/rpc_resp point at arg/res below */
+ struct nfs4_create_arg arg; /* call arguments */
+ struct nfs4_create_res res; /* call results; refers to fh, fattr and label */
+ struct nfs_fh fh; /* backing storage for res.fh */
+ struct nfs_fattr fattr; /* backing storage for res.fattr */
+ struct nfs4_label *label; /* from nfs4_label_alloc(); freed by nfs4_free_createdata() */
+};
+
+/*
+ * Allocate and pre-fill the state for creating @name of type @ftype in
+ * @dir with attributes @sattr.  The message defaults to the CREATE
+ * procedure; callers may override fields afterwards.
+ *
+ * Returns NULL if either the structure or its label cannot be allocated.
+ */
+static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
+		const struct qstr *name, struct iattr *sattr, u32 ftype)
+{
+	struct nfs4_createdata *data;
+	struct nfs_server *server;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (data == NULL)
+		return NULL;
+
+	server = NFS_SERVER(dir);
+
+	data->label = nfs4_label_alloc(server, GFP_KERNEL);
+	if (IS_ERR(data->label)) {
+		kfree(data);
+		return NULL;
+	}
+
+	/* RPC message wiring */
+	data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
+	data->msg.rpc_argp = &data->arg;
+	data->msg.rpc_resp = &data->res;
+
+	/* Arguments */
+	data->arg.dir_fh = NFS_FH(dir);
+	data->arg.server = server;
+	data->arg.name = name;
+	data->arg.attrs = sattr;
+	data->arg.ftype = ftype;
+	data->arg.bitmask = nfs4_bitmask(server, data->label);
+	data->arg.umask = current_umask();
+
+	/* Results decode into storage embedded in @data */
+	data->res.server = server;
+	data->res.fh = &data->fh;
+	data->res.fattr = &data->fattr;
+	data->res.label = data->label;
+	nfs_fattr_init(data->res.fattr);
+
+	return data;
+}
+
+/*
+ * Run the call described by @data synchronously.  On success, update the
+ * change attribute of @dir (and bump its link count when a directory was
+ * created), then instantiate @dentry from the returned handle/attributes.
+ */
+static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
+{
+	int ret;
+
+	ret = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg,
+			&data->arg.seq_args, &data->res.seq_res, 1);
+	if (ret != 0)
+		return ret;
+
+	spin_lock(&dir->i_lock);
+	nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo,
+			data->res.fattr->time_start,
+			NFS_INO_INVALID_DATA);
+	/* Creating a directory bumps nlink in the parent */
+	if (data->arg.ftype == NF4DIR)
+		nfs4_inc_nlink_locked(dir);
+	spin_unlock(&dir->i_lock);
+
+	return nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);
+}
+
+/* Release @data together with the label allocated for it. */
+static void nfs4_free_createdata(struct nfs4_createdata *data)
+{
+	struct nfs4_label *label = data->label;
+
+	nfs4_label_free(label);
+	kfree(data);
+}
+
+/*
+ * Create the symlink @dentry in @dir; the target text is @len bytes in
+ * @page.  Returns -ENAMETOOLONG when @len exceeds NFS4_MAXPATHLEN and
+ * -ENOMEM when the call state cannot be allocated.
+ */
+static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
+		struct page *page, unsigned int len, struct iattr *sattr,
+		struct nfs4_label *label)
+{
+	struct nfs4_createdata *data;
+	int ret;
+
+	if (len > NFS4_MAXPATHLEN)
+		return -ENAMETOOLONG;
+
+	data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4LNK);
+	if (data == NULL)
+		return -ENOMEM;
+
+	data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
+	data->arg.u.symlink.pages = &page;
+	data->arg.u.symlink.len = len;
+	data->arg.label = label;
+
+	ret = nfs4_do_create(dir, dentry, data);
+
+	nfs4_free_createdata(data);
+	return ret;
+}
+
+/*
+ * SYMLINK with retry.  The security label is set up once before the loop
+ * and released once after it.
+ */
+static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
+		struct page *page, unsigned int len, struct iattr *sattr)
+{
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	struct nfs4_label l, *label;
+	int status;
+
+	label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
+	for (;;) {
+		status = _nfs4_proc_symlink(dir, dentry, page, len, sattr, label);
+		trace_nfs4_symlink(dir, &dentry->d_name, status);
+		status = nfs4_handle_exception(NFS_SERVER(dir), status,
+				&exception);
+		if (!exception.retry)
+			break;
+	}
+
+	nfs4_label_release_security(label);
+	return status;
+}
+
+/*
+ * Create the directory @dentry in @dir with one CREATE call of type
+ * NF4DIR.  Returns -ENOMEM when the call state cannot be allocated.
+ */
+static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
+		struct iattr *sattr, struct nfs4_label *label)
+{
+	struct nfs4_createdata *data;
+	int ret;
+
+	data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR);
+	if (data == NULL)
+		return -ENOMEM;
+
+	data->arg.label = label;
+	ret = nfs4_do_create(dir, dentry, data);
+
+	nfs4_free_createdata(data);
+	return ret;
+}
+
+/*
+ * MKDIR with retry.  The umask is applied to the requested mode here only
+ * when the server does not advertise the MODE_UMASK attribute.
+ */
+static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
+		struct iattr *sattr)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	struct nfs4_label l, *label;
+	int status;
+
+	label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
+	if ((server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK) == 0)
+		sattr->ia_mode &= ~current_umask();
+
+	for (;;) {
+		status = _nfs4_proc_mkdir(dir, dentry, sattr, label);
+		trace_nfs4_mkdir(dir, &dentry->d_name, status);
+		status = nfs4_handle_exception(NFS_SERVER(dir), status,
+				&exception);
+		if (!exception.retry)
+			break;
+	}
+	nfs4_label_release_security(label);
+
+	return status;
+}
+
+/*
+ * Send one synchronous READDIR for the directory behind @dentry, starting
+ * at @cookie and receiving into @pages.
+ *
+ * On a non-negative result the directory's cookie verifier is replaced by
+ * the one from the reply and args.pgbase is added to the returned value.
+ * The cached atime of the directory is invalidated in every case.
+ */
+static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred,
+		u64 cookie, struct page **pages, unsigned int count, bool plus)
+{
+	struct inode *dir = d_inode(dentry);
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs4_readdir_arg args = {
+		.fh = NFS_FH(dir),
+		.pages = pages,
+		.pgbase = 0,
+		.count = count,
+		.plus = plus,
+	};
+	struct nfs4_readdir_res res;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READDIR],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+		.rpc_cred = cred,
+	};
+	int ret;
+
+	dprintk("%s: dentry = %pd2, cookie = %Lu\n", __func__,
+			dentry,
+			(unsigned long long)cookie);
+
+	/* Use the mask without the label bit unless labels are supported. */
+	if (server->caps & NFS_CAP_SECURITY_LABEL)
+		args.bitmask = server->attr_bitmask;
+	else
+		args.bitmask = server->attr_bitmask_nl;
+
+	nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args);
+	res.pgbase = args.pgbase;
+	ret = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
+			&res.seq_res, 0);
+	if (ret >= 0) {
+		memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE);
+		ret += args.pgbase;
+	}
+
+	nfs_invalidate_atime(dir);
+
+	dprintk("%s: returns %d\n", __func__, ret);
+	return ret;
+}
+
+/*
+ * READDIR with retry: repeat the call for as long as
+ * nfs4_handle_exception() asks for a retry, then return its result.
+ */
+static int nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred,
+		u64 cookie, struct page **pages, unsigned int count, bool plus)
+{
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int status;
+
+	for (;;) {
+		status = _nfs4_proc_readdir(dentry, cred, cookie,
+				pages, count, plus);
+		trace_nfs4_readdir(d_inode(dentry), status);
+		status = nfs4_handle_exception(NFS_SERVER(d_inode(dentry)), status,
+				&exception);
+		if (!exception.retry)
+			break;
+	}
+	return status;
+}
+
+/*
+ * Create a special file @dentry in @dir with one CREATE call.
+ *
+ * The NFSv4 file type is derived from sattr->ia_mode: FIFO, block device,
+ * character device or socket.  Device nodes also carry the major/minor of
+ * @rdev.  Any other file type yields -EINVAL; allocation failure yields
+ * -ENOMEM.
+ */
+static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
+		struct iattr *sattr, struct nfs4_label *label, dev_t rdev)
+{
+	struct nfs4_createdata *data;
+	int mode = sattr->ia_mode;
+	int ret;
+
+	/* NF4SOCK is the default type; the switch overrides it as needed. */
+	data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK);
+	if (data == NULL)
+		return -ENOMEM;
+
+	switch (mode & S_IFMT) {
+	case S_IFIFO:
+		data->arg.ftype = NF4FIFO;
+		break;
+	case S_IFBLK:
+		data->arg.ftype = NF4BLK;
+		data->arg.u.device.specdata1 = MAJOR(rdev);
+		data->arg.u.device.specdata2 = MINOR(rdev);
+		break;
+	case S_IFCHR:
+		data->arg.ftype = NF4CHR;
+		data->arg.u.device.specdata1 = MAJOR(rdev);
+		data->arg.u.device.specdata2 = MINOR(rdev);
+		break;
+	case S_IFSOCK:
+		break;
+	default:
+		ret = -EINVAL;
+		goto out_free;
+	}
+
+	data->arg.label = label;
+	ret = nfs4_do_create(dir, dentry, data);
+out_free:
+	nfs4_free_createdata(data);
+	return ret;
+}
+
+/*
+ * MKNOD with retry.  The umask is applied to the requested mode here only
+ * when the server does not advertise the MODE_UMASK attribute.
+ */
+static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
+		struct iattr *sattr, dev_t rdev)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	struct nfs4_label l, *label;
+	int status;
+
+	label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
+	if ((server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK) == 0)
+		sattr->ia_mode &= ~current_umask();
+
+	for (;;) {
+		status = _nfs4_proc_mknod(dir, dentry, sattr, label, rdev);
+		trace_nfs4_mknod(dir, &dentry->d_name, status);
+		status = nfs4_handle_exception(NFS_SERVER(dir), status,
+				&exception);
+		if (!exception.retry)
+			break;
+	}
+
+	nfs4_label_release_security(label);
+
+	return status;
+}
+
+/*
+ * Send one synchronous STATFS call for @fhandle, decoding into @fsstat.
+ * fsstat->fattr is re-initialised before the call.
+ */
+static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_fsstat *fsstat)
+{
+	struct nfs4_statfs_arg args = {
+		.fh = fhandle,
+		.bitmask = server->attr_bitmask,
+	};
+	struct nfs4_statfs_res res = {
+		.fsstat = fsstat,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_STATFS],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	int ret;
+
+	nfs_fattr_init(fsstat->fattr);
+	ret = nfs4_call_sync(server->client, server, &msg,
+			&args.seq_args, &res.seq_res, 0);
+	return ret;
+}
+
+/*
+ * STATFS with retry: repeat the call for as long as
+ * nfs4_handle_exception() asks for a retry, then return its result.
+ */
+static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat)
+{
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int status;
+
+	for (;;) {
+		status = _nfs4_proc_statfs(server, fhandle, fsstat);
+		status = nfs4_handle_exception(server, status, &exception);
+		if (!exception.retry)
+			break;
+	}
+	return status;
+}
+
+/* Send one synchronous FSINFO call for @fhandle, decoding into @fsinfo. */
+static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_fsinfo *fsinfo)
+{
+	struct nfs4_fsinfo_arg args = {
+		.fh = fhandle,
+		.bitmask = server->attr_bitmask,
+	};
+	struct nfs4_fsinfo_res res = {
+		.fsinfo = fsinfo,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSINFO],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	int ret;
+
+	ret = nfs4_call_sync(server->client, server, &msg,
+			&args.seq_args, &res.seq_res, 0);
+	return ret;
+}
+
+/*
+ * FSINFO with retry.  On success the client's lease period is set from the
+ * returned lease_time (converted to jiffies) and 0 is returned at once,
+ * without going through nfs4_handle_exception().
+ */
+static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
+{
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int status;
+
+	for (;;) {
+		status = _nfs4_do_fsinfo(server, fhandle, fsinfo);
+		trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, status);
+		if (status == 0) {
+			nfs4_set_lease_period(server->nfs_client, fsinfo->lease_time * HZ);
+			return status;
+		}
+		status = nfs4_handle_exception(server, status, &exception);
+		if (!exception.retry)
+			return status;
+	}
+}
+
+/*
+ * Fetch filesystem info for @fhandle.  On success, record the block size
+ * for pNFS and select the layout driver from the returned information.
+ */
+static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
+{
+	int err;
+
+	nfs_fattr_init(fsinfo->fattr);
+	err = nfs4_do_fsinfo(server, fhandle, fsinfo);
+	if (err != 0)
+		return err;
+
+	/* block layout checks this! */
+	server->pnfs_blksize = fsinfo->blksize;
+	set_pnfs_layoutdriver(server, fhandle, fsinfo);
+	return err;
+}
+
+/*
+ * Send one synchronous PATHCONF call for @fhandle.
+ *
+ * If the server supports none of the pathconf attributes in bitmap word 0,
+ * no call is made: @pathconf is zeroed and 0 is returned.
+ */
+static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_pathconf *pathconf)
+{
+	struct nfs4_pathconf_arg args = {
+		.fh = fhandle,
+		.bitmask = server->attr_bitmask,
+	};
+	struct nfs4_pathconf_res res = {
+		.pathconf = pathconf,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PATHCONF],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	__u32 supported = args.bitmask[0] & nfs4_pathconf_bitmap[0];
+
+	/* None of the pathconf attributes are mandatory to implement */
+	if (supported == 0) {
+		memset(pathconf, 0, sizeof(*pathconf));
+		return 0;
+	}
+
+	nfs_fattr_init(pathconf->fattr);
+	return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+}
+
+/*
+ * PATHCONF with retry: repeat the call for as long as
+ * nfs4_handle_exception() asks for a retry, then return its result.
+ */
+static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_pathconf *pathconf)
+{
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int status;
+
+	for (;;) {
+		status = _nfs4_proc_pathconf(server, fhandle, pathconf);
+		status = nfs4_handle_exception(server, status, &exception);
+		if (!exception.retry)
+			break;
+	}
+	return status;
+}
+
+/*
+ * Fill @stateid with the stateid to use for I/O in mode @fmode on the
+ * state attached to @ctx, taking @l_ctx into account.  Thin wrapper around
+ * nfs4_select_rw_stateid(); its return value is passed through.
+ */
+int nfs4_set_rw_stateid(nfs4_stateid *stateid,
+		const struct nfs_open_context *ctx,
+		const struct nfs_lock_context *l_ctx,
+		fmode_t fmode)
+{
+	int ret;
+
+	ret = nfs4_select_rw_stateid(ctx->state, fmode, l_ctx, stateid, NULL);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid);
+
+/*
+ * Report whether @stateid still matches the stateid that would be chosen
+ * now for @ctx/@l_ctx in mode @fmode.  A selection failure of -EIO is also
+ * reported as "current".
+ */
+static bool nfs4_stateid_is_current(nfs4_stateid *stateid,
+		const struct nfs_open_context *ctx,
+		const struct nfs_lock_context *l_ctx,
+		fmode_t fmode)
+{
+	nfs4_stateid latest;
+	int ret;
+
+	ret = nfs4_set_rw_stateid(&latest, ctx, l_ctx, fmode);
+	/* If the current stateid represents a lost lock, then exit */
+	if (ret == -EIO)
+		return true;
+	return nfs4_stateid_match(stateid, &latest);
+}
+
+/* True when @err is one of the errors this file treats as a stale stateid. */
+static bool nfs4_error_stateid_expired(int err)
+{
+	switch (err) {
+	case -NFS4ERR_DELEG_REVOKED:
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_BAD_STATEID:
+	case -NFS4ERR_STALE_STATEID:
+	case -NFS4ERR_OLD_STATEID:
+	case -NFS4ERR_OPENMODE:
+	case -NFS4ERR_EXPIRED:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * Default completion callback for READ.
+ *
+ * A negative task status goes through the async exception handler; if that
+ * requests a retry the call is restarted and -EAGAIN returned.  The lease
+ * is renewed only when the final status is positive.
+ */
+static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+	struct nfs_server *server = NFS_SERVER(hdr->inode);
+
+	trace_nfs4_read(hdr, task->tk_status);
+
+	if (task->tk_status < 0) {
+		struct nfs4_exception exception = {
+			.inode = hdr->inode,
+			.state = hdr->args.context->state,
+			.stateid = &hdr->args.stateid,
+		};
+		int handled;
+
+		handled = nfs4_async_handle_exception(task, server,
+				task->tk_status, &exception);
+		task->tk_status = handled;
+		if (exception.retry) {
+			rpc_restart_call_prepare(task);
+			return -EAGAIN;
+		}
+	}
+
+	if (task->tk_status > 0)
+		renew_lease(server, hdr->timestamp);
+	return 0;
+}
+
+/*
+ * If the READ failed with a stale-stateid error and the stateid it used is
+ * no longer the current one, restart the call and return true.
+ */
+static bool nfs4_read_stateid_changed(struct rpc_task *task,
+		struct nfs_pgio_args *args)
+{
+	if (!nfs4_error_stateid_expired(task->tk_status))
+		return false;
+	if (nfs4_stateid_is_current(&args->stateid, args->context,
+				args->lock_context, FMODE_READ))
+		return false;
+
+	rpc_restart_call_prepare(task);
+	return true;
+}
+
+/*
+ * Handle a READ_PLUS call answered with -ENOTSUPP: clear the READ_PLUS
+ * capability on the server, switch the message to plain READ, restart the
+ * call and return true.  Returns false in every other case.
+ */
+static bool nfs4_read_plus_not_supported(struct rpc_task *task,
+		struct nfs_pgio_header *hdr)
+{
+	struct nfs_server *server = NFS_SERVER(hdr->inode);
+	struct rpc_message *msg = &task->tk_msg;
+
+	if (msg->rpc_proc != &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS])
+		return false;
+	if (!(server->caps & NFS_CAP_READ_PLUS))
+		return false;
+	if (task->tk_status != -ENOTSUPP)
+		return false;
+
+	server->caps &= ~NFS_CAP_READ_PLUS;
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+	rpc_restart_call_prepare(task);
+	return true;
+}
+
+/*
+ * rpc_call_done step for READ.  Returns -EAGAIN whenever the call has to
+ * be (or already was) restarted; otherwise hands over to the header's
+ * completion callback, falling back to nfs4_read_done_cb().
+ */
+static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+	dprintk("--> %s\n", __func__);
+
+	if (!nfs4_sequence_done(task, &hdr->res.seq_res))
+		return -EAGAIN;
+	if (nfs4_read_stateid_changed(task, &hdr->args))
+		return -EAGAIN;
+	if (nfs4_read_plus_not_supported(task, hdr))
+		return -EAGAIN;
+
+	if (task->tk_status > 0)
+		nfs_invalidate_atime(hdr->inode);
+
+	if (hdr->pgio_done_cb)
+		return hdr->pgio_done_cb(task, hdr);
+	return nfs4_read_done_cb(task, hdr);
+}
+
+/*
+ * Choose the read procedure for @msg: READ_PLUS when it is compiled in and
+ * the server has the READ_PLUS capability, plain READ otherwise.
+ */
+static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg)
+{
+#if defined CONFIG_NFS_V4_2 && defined CONFIG_NFS_V4_2_READ_PLUS
+	if (server->caps & NFS_CAP_READ_PLUS) {
+		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS];
+		return;
+	}
+#endif /* CONFIG_NFS_V4_2 && CONFIG_NFS_V4_2_READ_PLUS */
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+}
+
+/*
+ * Prepare @hdr and @msg for a READ: stamp the start time, install the
+ * default completion callback if none is set, pick the read procedure and
+ * initialise the sequence arguments.
+ */
+static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
+		struct rpc_message *msg)
+{
+	struct nfs_server *server = NFS_SERVER(hdr->inode);
+
+	hdr->timestamp = jiffies;
+	if (hdr->pgio_done_cb == NULL)
+		hdr->pgio_done_cb = nfs4_read_done_cb;
+	nfs42_read_plus_support(server, msg);
+	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
+}
+
+/*
+ * rpc_prepare step shared by READ and WRITE.
+ *
+ * Returns 0 straight away when nfs4_setup_sequence() returns non-zero.
+ * Otherwise the I/O stateid is selected; -EIO is returned if that fails
+ * with -EIO or if the open context is flagged NFS_CONTEXT_BAD.
+ */
+static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task,
+		struct nfs_pgio_header *hdr)
+{
+	struct nfs_client *clp = NFS_SERVER(hdr->inode)->nfs_client;
+	int ret;
+
+	if (nfs4_setup_sequence(clp, &hdr->args.seq_args,
+				&hdr->res.seq_res, task))
+		return 0;
+
+	ret = nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+			hdr->args.lock_context, hdr->rw_mode);
+	if (ret == -EIO)
+		return -EIO;
+
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags)))
+		return -EIO;
+	return 0;
+}
+
+/*
+ * Default completion callback for WRITE.
+ *
+ * A negative task status goes through the async exception handler; if that
+ * requests a retry the call is restarted and -EAGAIN returned.  When the
+ * final status is non-negative the lease is renewed and the inode updated
+ * from the write result.
+ */
+static int nfs4_write_done_cb(struct rpc_task *task,
+		struct nfs_pgio_header *hdr)
+{
+	struct inode *inode = hdr->inode;
+
+	trace_nfs4_write(hdr, task->tk_status);
+
+	if (task->tk_status < 0) {
+		struct nfs4_exception exception = {
+			.inode = hdr->inode,
+			.state = hdr->args.context->state,
+			.stateid = &hdr->args.stateid,
+		};
+		int handled;
+
+		handled = nfs4_async_handle_exception(task,
+				NFS_SERVER(inode), task->tk_status,
+				&exception);
+		task->tk_status = handled;
+		if (exception.retry) {
+			rpc_restart_call_prepare(task);
+			return -EAGAIN;
+		}
+	}
+
+	if (task->tk_status < 0)
+		return 0;
+
+	renew_lease(NFS_SERVER(inode), hdr->timestamp);
+	nfs_writeback_update_inode(hdr);
+	return 0;
+}
+
+/*
+ * If the WRITE failed with a stale-stateid error and the stateid it used is
+ * no longer the current one, restart the call and return true.
+ */
+static bool nfs4_write_stateid_changed(struct rpc_task *task,
+		struct nfs_pgio_args *args)
+{
+	if (!nfs4_error_stateid_expired(task->tk_status))
+		return false;
+	if (nfs4_stateid_is_current(&args->stateid, args->context,
+				args->lock_context, FMODE_WRITE))
+		return false;
+
+	rpc_restart_call_prepare(task);
+	return true;
+}
+
+/*
+ * rpc_call_done step for WRITE.  Returns -EAGAIN whenever the call has to
+ * be (or already was) restarted; otherwise hands over to the header's
+ * completion callback, falling back to nfs4_write_done_cb().
+ */
+static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+	if (!nfs4_sequence_done(task, &hdr->res.seq_res))
+		return -EAGAIN;
+	if (nfs4_write_stateid_changed(task, &hdr->args))
+		return -EAGAIN;
+
+	if (hdr->pgio_done_cb)
+		return hdr->pgio_done_cb(task, hdr);
+	return nfs4_write_done_cb(task, hdr);
+}
+
+/* Decide whether a WRITE should also ask the server for attributes. */
+static
+bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
+{
+	/* Don't request attributes for pNFS or O_DIRECT writes */
+	if (hdr->ds_clp != NULL)
+		return false;
+	if (hdr->dreq != NULL)
+		return false;
+	/*
+	 * Otherwise, request attributes if and only if we don't hold
+	 * a delegation
+	 */
+	return !nfs4_have_delegation(hdr->inode, FMODE_READ);
+}
+
+/*
+ * Build the attribute request mask @bitmask for @inode.
+ *
+ * Starts from a copy of @src, adds the attributes whose cached copies are
+ * flagged invalid in the inode's cache_validity, adjusts the SIZE bit
+ * according to delegation state, and finally restricts the result to what
+ * @server supports.  The label bit is only considered when @label is
+ * non-NULL with a non-zero length.
+ */
+static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src,
+		struct inode *inode, struct nfs_server *server,
+		struct nfs4_label *label)
+{
+	unsigned long validity = READ_ONCE(NFS_I(inode)->cache_validity);
+	unsigned int word;
+
+	memcpy(bitmask, src, sizeof(*bitmask) * NFS4_BITMASK_SZ);
+
+	/* Bitmap word 0 */
+	if (validity & (NFS_INO_INVALID_CHANGE | NFS_INO_REVAL_PAGECACHE))
+		bitmask[0] |= FATTR4_WORD0_CHANGE;
+
+	/* Bitmap word 1 */
+	if (validity & NFS_INO_INVALID_ATIME)
+		bitmask[1] |= FATTR4_WORD1_TIME_ACCESS;
+	if (validity & NFS_INO_INVALID_OTHER)
+		bitmask[1] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER |
+				FATTR4_WORD1_OWNER_GROUP |
+				FATTR4_WORD1_NUMLINKS;
+	if (validity & NFS_INO_INVALID_CTIME)
+		bitmask[1] |= FATTR4_WORD1_TIME_METADATA;
+	if (validity & NFS_INO_INVALID_MTIME)
+		bitmask[1] |= FATTR4_WORD1_TIME_MODIFY;
+	if (validity & NFS_INO_INVALID_BLOCKS)
+		bitmask[1] |= FATTR4_WORD1_SPACE_USED;
+
+	/* Bitmap word 2 */
+	if (label && label->len && (validity & NFS_INO_INVALID_LABEL))
+		bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL;
+
+	/*
+	 * SIZE: dropped while a read delegation is held (unless a forced
+	 * revalidation is pending), otherwise added when the cached size
+	 * or page cache needs revalidating.
+	 */
+	if (nfs4_have_delegation(inode, FMODE_READ) &&
+	    !(validity & NFS_INO_REVAL_FORCED))
+		bitmask[0] &= ~FATTR4_WORD0_SIZE;
+	else if (validity &
+			(NFS_INO_INVALID_SIZE | NFS_INO_REVAL_PAGECACHE))
+		bitmask[0] |= FATTR4_WORD0_SIZE;
+
+	/* Never ask for anything the server does not support. */
+	for (word = 0; word < NFS4_BITMASK_SZ; word++)
+		bitmask[word] &= server->attr_bitmask[word];
+}
+
+/*
+ * Prepare @hdr and @msg for a WRITE.
+ *
+ * Attributes are requested along with the write only when
+ * nfs4_write_need_cache_consistency_data() says so; in that case the mask
+ * is built into hdr->args.bitmask_store.  Also installs the default
+ * completion callback if none is set and stamps the start time.
+ */
+static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
+		struct rpc_message *msg,
+		struct rpc_clnt **clnt)
+{
+	struct nfs_server *server = NFS_SERVER(hdr->inode);
+	struct nfs_client *clp;
+
+	if (nfs4_write_need_cache_consistency_data(hdr)) {
+		nfs4_bitmask_set(hdr->args.bitmask_store,
+				server->cache_consistency_bitmask,
+				hdr->inode, server, NULL);
+		hdr->args.bitmask = hdr->args.bitmask_store;
+	} else {
+		hdr->args.bitmask = NULL;
+		hdr->res.fattr = NULL;
+	}
+
+	if (hdr->pgio_done_cb == NULL)
+		hdr->pgio_done_cb = nfs4_write_done_cb;
+	hdr->res.server = server;
+	hdr->timestamp = jiffies;
+
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
+	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
+
+	/* Prefer the data server's client when one is attached. */
+	clp = hdr->ds_clp ? hdr->ds_clp : server->nfs_client;
+	nfs4_state_protect_write(clp, clnt, msg, hdr);
+}
+
+/* rpc_call_prepare step for COMMIT: set up the sequence for this task. */
+static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
+{
+	struct nfs_server *server = NFS_SERVER(data->inode);
+
+	nfs4_setup_sequence(server->nfs_client, &data->args.seq_args,
+			    &data->res.seq_res, task);
+}
+
+/*
+ * Default COMMIT completion callback.
+ *
+ * Returns -EAGAIN after restarting the call when the async error handler
+ * asks for a retry, and 0 otherwise.
+ */
+static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data)
+{
+	struct nfs_server *server = NFS_SERVER(data->inode);
+
+	trace_nfs4_commit(data, task->tk_status);
+	if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
+		return 0;
+
+	rpc_restart_call_prepare(task);
+	return -EAGAIN;
+}
+
+/*
+ * COMMIT completion entry point: once sequence processing is finished,
+ * hand off to the per-call commit_done_cb; otherwise report -EAGAIN.
+ */
+static int nfs4_commit_done(struct rpc_task *task, struct nfs_commit_data *data)
+{
+	if (nfs4_sequence_done(task, &data->res.seq_res))
+		return data->commit_done_cb(task, data);
+	return -EAGAIN;
+}
+
+/*
+ * Fill in the commit data and rpc_message for an NFSv4 COMMIT.  A
+ * completion callback already set by the caller is left in place.
+ */
+static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg,
+				   struct rpc_clnt **clnt)
+{
+	struct nfs_server *server = NFS_SERVER(data->inode);
+	/* Prefer the data-server client when one is attached */
+	struct nfs_client *clp = data->ds_clp ? data->ds_clp : server->nfs_client;
+
+	if (!data->commit_done_cb)
+		data->commit_done_cb = nfs4_commit_done_cb;
+	data->res.server = server;
+
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
+	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0);
+	nfs4_state_protect(clp, NFS_SP4_MACH_CRED_COMMIT, clnt, msg);
+}
+
+/*
+ * Issue one synchronous COMMIT for the file @dst.  The file handle in
+ * @args is filled in here; offset/count are supplied by the caller.
+ * Returns the status of nfs4_call_sync().
+ */
+static int _nfs4_proc_commit(struct file *dst, struct nfs_commitargs *args,
+			     struct nfs_commitres *res)
+{
+	struct inode *dst_inode = file_inode(dst);
+	struct nfs_server *server = NFS_SERVER(dst_inode);
+	struct rpc_message msg = { };
+
+	args->fh = NFS_FH(dst_inode);
+
+	msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
+	msg.rpc_argp = args;
+	msg.rpc_resp = res;
+
+	return nfs4_call_sync(server->client, server, &msg,
+			      &args->seq_args, &res->seq_res, 1);
+}
+
+/*
+ * Synchronously COMMIT @count bytes at @offset of file @dst, retrying
+ * for as long as nfs4_handle_exception() requests it.  Returns the
+ * status produced by the last nfs4_handle_exception() call.
+ */
+int nfs4_proc_commit(struct file *dst, __u64 offset, __u32 count, struct nfs_commitres *res)
+{
+	struct nfs_server *dst_server = NFS_SERVER(file_inode(dst));
+	struct nfs4_exception exception = { };
+	struct nfs_commitargs args = { };
+	int status;
+
+	args.offset = offset;
+	args.count = count;
+
+	for (;;) {
+		status = _nfs4_proc_commit(dst, &args, res);
+		status = nfs4_handle_exception(dst_server, status, &exception);
+		if (!exception.retry)
+			break;
+	}
+
+	return status;
+}
+
+/* Callback data for an asynchronous RENEW queued by nfs4_proc_async_renew(). */
+struct nfs4_renewdata {
+ struct nfs_client *client; /* client being renewed; a cl_count reference is held for the call */
+ unsigned long timestamp; /* jiffies sampled when the RENEW was queued */
+};
+
+/*
+ * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
+ * standalone procedure for queueing an asynchronous RENEW.
+ *
+ * nfs4_renew_release() is the rpc_release callback of that call: it
+ * reschedules state renewal when the client's reference count is still
+ * above one, then drops the client reference and frees the callback data.
+ */
+static void nfs4_renew_release(void *calldata)
+{
+	struct nfs4_renewdata *renew = calldata;
+	struct nfs_client *clp = renew->client;
+	bool still_referenced = refcount_read(&clp->cl_count) > 1;
+
+	if (still_referenced)
+		nfs4_schedule_state_renewal(clp);
+	nfs_put_client(clp);
+	kfree(renew);
+}
+
+/*
+ * rpc_call_done callback for the asynchronous RENEW.
+ *
+ * task->tk_status carries negative NFSv4 error codes (see the
+ * -NFS4ERR_LEASE_MOVED case below), so every comparison here must use the
+ * negated constant.
+ *
+ *  - 0:                      the lease is renewed.
+ *  - -NFS4ERR_LEASE_MOVED:   schedule lease-moved recovery, then renew.
+ *  - -NFS4ERR_CB_PATH_DOWN:  schedule callback path-down recovery, then
+ *                            renew.
+ *  - anything else:          schedule lease recovery and do not renew.
+ *
+ * For any error other than LEASE_MOVED nothing is done if the
+ * NFS_CS_RENEWD bit is clear (we are shutting down).
+ */
+static void nfs4_renew_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_renewdata *data = calldata;
+	struct nfs_client *clp = data->client;
+	unsigned long timestamp = data->timestamp;
+
+	trace_nfs4_renew_async(clp, task->tk_status);
+	switch (task->tk_status) {
+	case 0:
+		break;
+	case -NFS4ERR_LEASE_MOVED:
+		nfs4_schedule_lease_moved_recovery(clp);
+		break;
+	default:
+		/* Unless we're shutting down, schedule state recovery! */
+		if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0)
+			return;
+		/*
+		 * tk_status is negative; comparing against the positive
+		 * constant would make this test always true and the
+		 * path-down recovery below unreachable.
+		 */
+		if (task->tk_status != -NFS4ERR_CB_PATH_DOWN) {
+			nfs4_schedule_lease_recovery(clp);
+			return;
+		}
+		nfs4_schedule_path_down_recovery(clp);
+	}
+	do_renew_lease(clp, timestamp);
+}
+
+/* RPC callbacks for the asynchronous RENEW issued by nfs4_proc_async_renew(). */
+static const struct rpc_call_ops nfs4_renew_ops = {
+ .rpc_call_done = nfs4_renew_done,
+ .rpc_release = nfs4_renew_release,
+};
+
+/*
+ * Queue an asynchronous RENEW for @clp using @cred.
+ *
+ * Returns 0 without doing anything when @renew_flags is zero, -EIO when
+ * no reference on the client can be taken, -ENOMEM when the callback data
+ * cannot be allocated, and otherwise the result of rpc_call_async().
+ */
+static int nfs4_proc_async_renew(struct nfs_client *clp, const struct cred *cred, unsigned renew_flags)
+{
+	struct nfs4_renewdata *renew;
+	struct rpc_message msg = { };
+
+	if (!renew_flags)
+		return 0;
+	/* Pin the client for the lifetime of the async call */
+	if (!refcount_inc_not_zero(&clp->cl_count))
+		return -EIO;
+
+	renew = kmalloc(sizeof(*renew), GFP_NOFS);
+	if (!renew) {
+		nfs_put_client(clp);
+		return -ENOMEM;
+	}
+	renew->client = clp;
+	renew->timestamp = jiffies;
+
+	msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW];
+	msg.rpc_argp = clp;
+	msg.rpc_cred = cred;
+
+	return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT,
+			      &nfs4_renew_ops, renew);
+}
+
+/*
+ * Synchronous RENEW.  The lease is renewed with the timestamp sampled
+ * before the call was sent.  Returns 0 on success or the negative status
+ * from rpc_call_sync().
+ */
+static int nfs4_proc_renew(struct nfs_client *clp, const struct cred *cred)
+{
+	unsigned long now = jiffies;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_RENEW],
+		.rpc_argp	= clp,
+		.rpc_cred	= cred,
+	};
+	int status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+
+	if (status >= 0) {
+		do_renew_lease(clp, now);
+		status = 0;
+	}
+	return status;
+}
+
+/*
+ * Returns non-zero when NFS_CAP_ACLS is set in the server's capability
+ * mask.  The value is the masked bit itself, not a normalised 0/1.
+ */
+static inline int nfs4_server_supports_acls(struct nfs_server *server)
+{
+ return server->caps & NFS_CAP_ACLS;
+}
+
+/*
+ * Number of pages needed to hold an ACL of up to XATTR_SIZE_MAX bytes
+ * (rounded up).  This sizes the on-stack page-pointer array in
+ * __nfs4_proc_set_acl(), so it assumes that it's OK to put
+ * sizeof(struct page *) * NFS4ACL_MAXPAGES bytes on the stack.
+ */
+#define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE)
+
+/*
+ * Copy @buflen bytes from @buf into freshly allocated pages, storing the
+ * page pointers in @pages in order.  At least one page is always
+ * allocated, even for a zero @buflen.
+ *
+ * Returns the number of pages filled, or -ENOMEM after freeing every page
+ * allocated so far.  The caller owns the pages on success.
+ */
+int nfs4_buf_to_pages_noslab(const void *buf, size_t buflen,
+			     struct page **pages)
+{
+	const char *src = buf;
+	int filled = 0;
+
+	do {
+		size_t chunk = min_t(size_t, PAGE_SIZE, buflen);
+		struct page *page = alloc_page(GFP_KERNEL);
+
+		if (!page)
+			goto unwind;
+		memcpy(page_address(page), src, chunk);
+		pages[filled++] = page;
+		src += chunk;
+		buflen -= chunk;
+	} while (buflen != 0);
+
+	return filled;
+
+unwind:
+	/* Release the pages in reverse order of allocation */
+	while (filled > 0)
+		__free_page(pages[--filled]);
+	return -ENOMEM;
+}
+
+/* Per-inode ACL cache entry, installed by nfs4_set_cached_acl(). */
+struct nfs4_cached_acl {
+ int cached; /* 1 when data[] holds the ACL bytes, 0 when only len is recorded */
+ size_t len; /* length of the ACL in bytes */
+ char data[]; /* ACL bytes; only allocated and filled when cached != 0 */
+};
+
+/*
+ * Replace the inode's cached ACL entry with @acl (which may be NULL),
+ * freeing the previous entry.  Done under inode->i_lock.
+ */
+static void nfs4_set_cached_acl(struct inode *inode, struct nfs4_cached_acl *acl)
+{
+	spinlock_t *lock = &inode->i_lock;
+
+	spin_lock(lock);
+	kfree(NFS_I(inode)->nfs4_acl);
+	NFS_I(inode)->nfs4_acl = acl;
+	spin_unlock(lock);
+}
+
+/* Drop the inode's cached ACL entry by installing NULL in its place. */
+static void nfs4_zap_acl_attr(struct inode *inode)
+{
+ nfs4_set_cached_acl(inode, NULL);
+}
+
+/*
+ * Try to satisfy an ACL read from the per-inode cache.
+ *
+ * Returns the ACL length on success (also when @buf is NULL, i.e. a
+ * length-only query), -ENOENT when nothing usable is cached, or -ERANGE
+ * when @buflen is too small for the cached ACL.
+ */
+static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_t buflen)
+{
+	struct nfs4_cached_acl *acl;
+	int ret;
+
+	spin_lock(&inode->i_lock);
+	acl = NFS_I(inode)->nfs4_acl;
+	if (!acl) {
+		ret = -ENOENT;
+	} else if (!buf) {
+		/* user is just asking for length */
+		ret = acl->len;
+	} else if (!acl->cached) {
+		/* only the length was cached; the data must be refetched */
+		ret = -ENOENT;
+	} else if (acl->len > buflen) {
+		/* see getxattr(2) man page */
+		ret = -ERANGE;
+	} else {
+		memcpy(buf, acl->data, acl->len);
+		ret = acl->len;
+	}
+	spin_unlock(&inode->i_lock);
+
+	return ret;
+}
+
+/*
+ * Install a new cache entry for an ACL of @acl_len bytes found at offset
+ * @pgbase in @pages.
+ *
+ * When header plus ACL fit in one page the data itself is cached;
+ * otherwise only the length is recorded.  If the allocation fails, the
+ * existing cache entry is replaced by NULL.
+ */
+static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size_t pgbase, size_t acl_len)
+{
+	struct nfs4_cached_acl *acl;
+	size_t full_size = sizeof(*acl) + acl_len;
+	bool cache_data = full_size <= PAGE_SIZE;
+
+	acl = kmalloc(cache_data ? full_size : sizeof(*acl), GFP_KERNEL);
+	if (acl != NULL) {
+		acl->cached = cache_data ? 1 : 0;
+		if (cache_data)
+			_copy_from_pages(acl->data, pages, pgbase, acl_len);
+		acl->len = acl_len;
+	}
+	nfs4_set_cached_acl(inode, acl);
+}
+
+/*
+ * The getxattr API returns the required buffer length when called with a
+ * NULL buf. The NFSv4 acl tool then calls getxattr again after allocating
+ * the required buf. On a NULL buf, we send a page of data to the server
+ * guessing that the ACL request can be serviced by a page. If so, we cache
+ * up to the page of ACL data, and the 2nd call to getxattr is serviced by
+ * the cache. If not so, we throw away the page, and cache the required
+ * length. The next getxattr call will then produce another round trip to
+ * the server, this time with the input buf of the required size.
+ */
+/*
+ * Fetch the ACL from the server with a single GETACL call.
+ *
+ * @buf may be NULL for a length-only query.  Returns the ACL length on
+ * success, -ENOMEM on allocation failure, -ERANGE when the ACL does not
+ * fit in the caller's buffer, or the status from nfs4_call_sync().
+ */
+static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
+{
+ struct page **pages;
+ struct nfs_getaclargs args = {
+ .fh = NFS_FH(inode),
+ .acl_len = buflen,
+ };
+ struct nfs_getaclres res = {
+ .acl_len = buflen,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ unsigned int npages;
+ int ret = -ENOMEM, i;
+ struct nfs_server *server = NFS_SERVER(inode);
+
+ /* No buffer size given: size the receive pages from the server's rsize */
+ if (buflen == 0)
+ buflen = server->rsize;
+
+ /*
+ * One page more than buflen strictly needs.
+ * NOTE(review): presumably headroom for reply data that precedes the
+ * ACL (see res.acl_data_offset) -- confirm against the XDR decoder.
+ */
+ npages = DIV_ROUND_UP(buflen, PAGE_SIZE) + 1;
+ pages = kmalloc_array(npages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+
+ args.acl_pages = pages;
+
+ /* On failure, i is the number of pages allocated so far (used at out_free) */
+ for (i = 0; i < npages; i++) {
+ pages[i] = alloc_page(GFP_KERNEL);
+ if (!pages[i])
+ goto out_free;
+ }
+
+ /* for decoding across pages */
+ res.acl_scratch = alloc_page(GFP_KERNEL);
+ if (!res.acl_scratch)
+ goto out_free;
+
+ args.acl_len = npages * PAGE_SIZE;
+
+ dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n",
+ __func__, buf, buflen, npages, args.acl_len);
+ ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode),
+ &msg, &args.seq_args, &res.seq_res, 0);
+ if (ret)
+ goto out_free;
+
+ /* Handle the case where the passed-in buffer is too short */
+ if (res.acl_flags & NFS4_ACL_TRUNC) {
+ /* Did the user only issue a request for the acl length? */
+ if (buf == NULL)
+ goto out_ok;
+ ret = -ERANGE;
+ goto out_free;
+ }
+ /* Not truncated: update the per-inode cache before copying out */
+ nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len);
+ if (buf) {
+ if (res.acl_len > buflen) {
+ ret = -ERANGE;
+ goto out_free;
+ }
+ _copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len);
+ }
+out_ok:
+ ret = res.acl_len;
+out_free:
+ /* i equals the number of successfully allocated pages at this point */
+ while (--i >= 0)
+ __free_page(pages[i]);
+ if (res.acl_scratch)
+ __free_page(res.acl_scratch);
+ kfree(pages);
+ return ret;
+}
+
+/*
+ * Fetch the ACL from the server, retrying while nfs4_handle_exception()
+ * asks for it.  A non-negative result from the underlying call is
+ * returned immediately without consulting the exception handler.
+ */
+static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
+{
+	struct nfs4_exception exception = { .interruptible = true };
+	ssize_t ret;
+
+	for (;;) {
+		ret = __nfs4_get_acl_uncached(inode, buf, buflen);
+		trace_nfs4_get_acl(inode, ret);
+		if (ret >= 0)
+			return ret;
+		ret = nfs4_handle_exception(NFS_SERVER(inode), ret, &exception);
+		if (!exception.retry)
+			return ret;
+	}
+}
+
+/*
+ * Read the NFSv4 ACL of @inode into @buf (or just report its length when
+ * @buf is NULL), using the per-inode cache when possible.
+ *
+ * Returns -EOPNOTSUPP when the server lacks ACL support, a negative
+ * status from inode revalidation, or the result of the cached / uncached
+ * read.
+ */
+static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	int ret;
+
+	if (!nfs4_server_supports_acls(server))
+		return -EOPNOTSUPP;
+
+	ret = nfs_revalidate_inode(server, inode);
+	if (ret < 0)
+		return ret;
+
+	if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+		nfs_zap_acl_cache(inode);
+
+	ret = nfs4_read_cached_acl(inode, buf, buflen);
+	/*
+	 * -ENOENT is returned if there is no ACL or if there is an ACL
+	 * but no cached acl data, just the acl length; go to the server.
+	 */
+	if (ret == -ENOENT)
+		return nfs4_get_acl_uncached(inode, buf, buflen);
+	return ret;
+}
+
+/*
+ * Send one SETACL call carrying the @buflen bytes at @buf.
+ *
+ * Returns -EINVAL for an empty buffer, -EOPNOTSUPP when the server lacks
+ * ACL support, -ERANGE when the ACL needs more than NFS4ACL_MAXPAGES
+ * pages, -ENOMEM when the pages cannot be allocated, or the status of
+ * the RPC.  Whatever the RPC status, the attribute, access and ACL
+ * caches of the inode are invalidated afterwards.
+ */
+static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct page *pages[NFS4ACL_MAXPAGES];
+	struct nfs_setaclargs arg = {
+		.fh		= NFS_FH(inode),
+		.acl_pages	= pages,
+		.acl_len	= buflen,
+	};
+	struct nfs_setaclres res;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_SETACL],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
+	int nr_pages, ret;
+
+	/* You can't remove system.nfs4_acl: */
+	if (!buflen)
+		return -EINVAL;
+	if (!nfs4_server_supports_acls(server))
+		return -EOPNOTSUPP;
+	if (DIV_ROUND_UP(buflen, PAGE_SIZE) > ARRAY_SIZE(pages))
+		return -ERANGE;
+
+	nr_pages = nfs4_buf_to_pages_noslab(buf, buflen, arg.acl_pages);
+	if (nr_pages < 0)
+		return nr_pages;
+
+	nfs4_inode_make_writeable(inode);
+	ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+
+	/*
+	 * Free each page after tx, so the only ref left is
+	 * held by the network stack
+	 */
+	while (nr_pages > 0)
+		put_page(pages[--nr_pages]);
+
+	/*
+	 * Acl update can result in inode attribute update.
+	 * so mark the attribute cache invalid.
+	 */
+	spin_lock(&inode->i_lock);
+	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE |
+					NFS_INO_INVALID_CTIME |
+					NFS_INO_REVAL_FORCED;
+	spin_unlock(&inode->i_lock);
+
+	nfs_access_zap_cache(inode);
+	nfs_zap_acl_cache(inode);
+	return ret;
+}
+
+/*
+ * Set the NFSv4 ACL of @inode, retrying while nfs4_handle_exception()
+ * asks for it.  BADOWNER/BADNAME replies are mapped to -EINVAL without
+ * any retry.
+ */
+static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen)
+{
+	struct nfs4_exception exception = { };
+	int err;
+
+	for (;;) {
+		err = __nfs4_proc_set_acl(inode, buf, buflen);
+		trace_nfs4_set_acl(inode, err);
+		switch (err) {
+		case -NFS4ERR_BADOWNER:
+		case -NFS4ERR_BADNAME:
+			/*
+			 * no need to retry since the kernel
+			 * isn't involved in encoding the ACEs.
+			 */
+			return -EINVAL;
+		default:
+			break;
+		}
+		err = nfs4_handle_exception(NFS_SERVER(inode), err,
+					    &exception);
+		if (!exception.retry)
+			return err;
+	}
+}
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+/*
+ * Issue one GETATTR asking only for the security label, decoding it into
+ * the caller's @buf of @buflen bytes.
+ *
+ * Returns the label length, -ENOENT when the reply carries no security
+ * label, or the status of the RPC.
+ */
+static int _nfs4_get_security_label(struct inode *inode, void *buf,
+				    size_t buflen)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_fattr fattr;
+	struct nfs4_label label = {
+		.len	= buflen,
+		.label	= buf,
+	};
+	u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+	struct nfs4_getattr_arg arg = {
+		.fh		= NFS_FH(inode),
+		.bitmask	= bitmask,
+	};
+	struct nfs4_getattr_res res = {
+		.fattr		= &fattr,
+		.label		= &label,
+		.server		= server,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
+	int ret;
+
+	nfs_fattr_init(&fattr);
+
+	ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0);
+	if (ret != 0)
+		return ret;
+	if (fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL)
+		return label.len;
+	return -ENOENT;
+}
+
+/*
+ * Fetch the security label of @inode, retrying while
+ * nfs4_handle_exception() asks for it.  Returns -EOPNOTSUPP when the
+ * server lacks NFS_CAP_SECURITY_LABEL.
+ */
+static int nfs4_get_security_label(struct inode *inode, void *buf,
+				   size_t buflen)
+{
+	struct nfs4_exception exception = { .interruptible = true };
+	int err;
+
+	if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+		return -EOPNOTSUPP;
+
+	for (;;) {
+		err = _nfs4_get_security_label(inode, buf, buflen);
+		trace_nfs4_get_security_label(inode, err);
+		err = nfs4_handle_exception(NFS_SERVER(inode), err,
+					    &exception);
+		if (!exception.retry)
+			break;
+	}
+	return err;
+}
+
+/*
+ * Issue one SETATTR that changes only the security label to @ilabel,
+ * using the zero stateid.  Post-op attributes and label are returned in
+ * @fattr and @olabel.  Returns the status of the RPC.
+ */
+static int _nfs4_do_set_security_label(struct inode *inode,
+				       struct nfs4_label *ilabel,
+				       struct nfs_fattr *fattr,
+				       struct nfs4_label *olabel)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+	struct iattr sattr = {0};
+	struct nfs_setattrargs arg = {
+		.fh		= NFS_FH(inode),
+		.iap		= &sattr,
+		.server		= server,
+		.bitmask	= bitmask,
+		.label		= ilabel,
+	};
+	struct nfs_setattrres res = {
+		.fattr		= fattr,
+		.label		= olabel,
+		.server		= server,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
+	int status;
+
+	nfs4_stateid_copy(&arg.stateid, &zero_stateid);
+
+	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+	if (status != 0)
+		dprintk("%s failed: %d\n", __func__, status);
+
+	return status;
+}
+
+/*
+ * Set the security label of @inode, retrying while
+ * nfs4_handle_exception() asks for it.
+ */
+static int nfs4_do_set_security_label(struct inode *inode,
+				      struct nfs4_label *ilabel,
+				      struct nfs_fattr *fattr,
+				      struct nfs4_label *olabel)
+{
+	struct nfs4_exception exception = { };
+	int err;
+
+	for (;;) {
+		err = _nfs4_do_set_security_label(inode, ilabel,
+						  fattr, olabel);
+		trace_nfs4_set_security_label(inode, err);
+		err = nfs4_handle_exception(NFS_SERVER(inode), err,
+					    &exception);
+		if (!exception.retry)
+			break;
+	}
+	return err;
+}
+
+/*
+ * Set the security label of @inode to the @buflen bytes at @buf and, on
+ * success, push the resulting label into the inode via nfs_setsecurity().
+ *
+ * Returns 0 on success, -EOPNOTSUPP when the server lacks
+ * NFS_CAP_SECURITY_LABEL, the (negative) error from nfs4_label_alloc(),
+ * or the status of the SETATTR.
+ */
+static int
+nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen)
+{
+	struct nfs4_label ilabel, *olabel = NULL;
+	struct nfs_fattr fattr;
+	int status;
+
+	if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+		return -EOPNOTSUPP;
+
+	nfs_fattr_init(&fattr);
+
+	ilabel.pi = 0;
+	ilabel.lfs = 0;
+	ilabel.label = (char *)buf;
+	ilabel.len = buflen;
+
+	olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+	if (IS_ERR(olabel)) {
+		/*
+		 * PTR_ERR() already yields a negative errno; negating it
+		 * would hand a positive value back to the caller.
+		 */
+		status = PTR_ERR(olabel);
+		goto out;
+	}
+
+	status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel);
+	if (status == 0)
+		nfs_setsecurity(inode, &fattr, olabel);
+
+	nfs4_label_free(olabel);
+out:
+	return status;
+}
+#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
+
+
+/*
+ * Fill @bootverf with the client's boot verifier: the per-net boot time
+ * in nanoseconds as two big-endian 32-bit words, or all-ones when
+ * NFS4CLNT_PURGE_STATE is set.
+ */
+static void nfs4_init_boot_verifier(const struct nfs_client *clp,
+				    nfs4_verifier *bootverf)
+{
+	__be32 verf[2];
+
+	if (!test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
+		struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+		u64 boot_ns = ktime_to_ns(nn->boot_time);
+
+		verf[0] = cpu_to_be32(boot_ns >> 32);
+		verf[1] = cpu_to_be32(boot_ns);
+	} else {
+		/*
+		 * An impossible timestamp guarantees this value
+		 * will never match a generated boot time.
+		 */
+		verf[0] = cpu_to_be32(U32_MAX);
+		verf[1] = cpu_to_be32(U32_MAX);
+	}
+	memcpy(bootverf->data, verf, sizeof(bootverf->data));
+}
+
+/*
+ * Copy the client-id uniquifier into @buf (at most @buflen bytes,
+ * NUL-terminated).  The per-net identifier takes precedence; the
+ * nfs4_client_id_uniquifier string is used only when the former left
+ * @buf empty.  Returns the length of the resulting string (0 if none).
+ */
+static size_t
+nfs4_get_uniquifier(struct nfs_client *clp, char *buf, size_t buflen)
+{
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+	struct nfs_netns_client *nn_clp = nn->nfs_client;
+
+	buf[0] = '\0';
+
+	if (nn_clp != NULL) {
+		const char *id;
+
+		rcu_read_lock();
+		id = rcu_dereference(nn_clp->identifier);
+		if (id != NULL)
+			strscpy(buf, id, buflen);
+		rcu_read_unlock();
+	}
+
+	if (buf[0] == '\0' && nfs4_client_id_uniquifier[0] != '\0')
+		strscpy(buf, nfs4_client_id_uniquifier, buflen);
+
+	return strlen(buf);
+}
+
+/*
+ * Build clp->cl_owner_id in the non-uniform form
+ * "Linux NFSv4.0 <nodename>[/<uniquifier>]/<server address>".
+ *
+ * Does nothing if an owner id is already set.  Returns 0 on success,
+ * -EINVAL when the string would exceed NFS4_OPAQUE_LIMIT, or -ENOMEM.
+ */
+static int
+nfs4_init_nonuniform_client_string(struct nfs_client *clp)
+{
+	char uniq[NFS4_CLIENT_ID_UNIQ_LEN];
+	size_t uniq_len;
+	size_t len;
+	char *str;
+
+	if (clp->cl_owner_id)
+		return 0;
+
+	/* "Linux NFSv4.0 " (14) + nodename + '/' + address + NUL */
+	rcu_read_lock();
+	len = 14 +
+		strlen(clp->cl_rpcclient->cl_nodename) +
+		1 +
+		strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
+		1;
+	rcu_read_unlock();
+
+	/* An optional uniquifier adds its own length plus one separator */
+	uniq_len = nfs4_get_uniquifier(clp, uniq, sizeof(uniq));
+	if (uniq_len != 0)
+		len += uniq_len + 1;
+
+	if (len > NFS4_OPAQUE_LIMIT + 1)
+		return -EINVAL;
+
+	/*
+	 * Since this string is allocated at mount time, and held until the
+	 * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+	 * about a memory-reclaim deadlock.
+	 */
+	str = kmalloc(len, GFP_KERNEL);
+	if (str == NULL)
+		return -ENOMEM;
+
+	rcu_read_lock();
+	if (uniq_len == 0)
+		scnprintf(str, len, "Linux NFSv4.0 %s/%s",
+			  clp->cl_rpcclient->cl_nodename,
+			  rpc_peeraddr2str(clp->cl_rpcclient,
+					   RPC_DISPLAY_ADDR));
+	else
+		scnprintf(str, len, "Linux NFSv4.0 %s/%s/%s",
+			  clp->cl_rpcclient->cl_nodename, uniq,
+			  rpc_peeraddr2str(clp->cl_rpcclient,
+					   RPC_DISPLAY_ADDR));
+	rcu_read_unlock();
+
+	clp->cl_owner_id = str;
+	return 0;
+}
+
+/*
+ * Build clp->cl_owner_id in the uniform form
+ * "Linux NFSv<major>.<minor> [<uniquifier>/]<nodename>".
+ *
+ * Does nothing if an owner id is already set.  Returns 0 on success,
+ * -EINVAL when the string would exceed NFS4_OPAQUE_LIMIT, or -ENOMEM.
+ */
+static int
+nfs4_init_uniform_client_string(struct nfs_client *clp)
+{
+	char uniq[NFS4_CLIENT_ID_UNIQ_LEN];
+	size_t uniq_len;
+	size_t len;
+	char *str;
+
+	if (clp->cl_owner_id)
+		return 0;
+
+	/*
+	 * "Linux NFSv" (10) + up to 10 digits + '.' + up to 10 digits +
+	 * ' ' + nodename + NUL
+	 */
+	len = 10 + 10 + 1 + 10 + 1 +
+		strlen(clp->cl_rpcclient->cl_nodename) + 1;
+
+	/* An optional uniquifier adds its own length plus one separator */
+	uniq_len = nfs4_get_uniquifier(clp, uniq, sizeof(uniq));
+	if (uniq_len != 0)
+		len += uniq_len + 1;
+
+	if (len > NFS4_OPAQUE_LIMIT + 1)
+		return -EINVAL;
+
+	/*
+	 * Since this string is allocated at mount time, and held until the
+	 * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+	 * about a memory-reclaim deadlock.
+	 */
+	str = kmalloc(len, GFP_KERNEL);
+	if (str == NULL)
+		return -ENOMEM;
+
+	if (uniq_len == 0)
+		scnprintf(str, len, "Linux NFSv%u.%u %s",
+			  clp->rpc_ops->version, clp->cl_minorversion,
+			  clp->cl_rpcclient->cl_nodename);
+	else
+		scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
+			  clp->rpc_ops->version, clp->cl_minorversion,
+			  uniq, clp->cl_rpcclient->cl_nodename);
+
+	clp->cl_owner_id = str;
+	return 0;
+}
+
+/*
+ * nfs4_callback_up_net() starts only "tcp" and "tcp6" callback
+ * services. Advertise one based on the address family of the
+ * clientaddr: an address containing ':' is advertised as "tcp6",
+ * anything else as "tcp".
+ *
+ * Returns the number of characters written to @buf.
+ */
+static unsigned int
+nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len)
+{
+	if (strchr(clp->cl_ipaddr, ':') == NULL)
+		return scnprintf(buf, len, "tcp");
+
+	return scnprintf(buf, len, "tcp6");
+}
+
+/*
+ * rpc_call_done callback for SETCLIENTID: on success, take a reference
+ * to the credential the request was sent with and store it in the
+ * call's nfs4_setclientid structure.
+ */
+static void nfs4_setclientid_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_setclientid *sc = calldata;
+
+	if (task->tk_status != 0)
+		return;
+	sc->sc_cred = get_rpccred(task->tk_rqstp->rq_cred);
+}
+
+/* RPC callbacks for SETCLIENTID; only the completion hook is needed. */
+static const struct rpc_call_ops nfs4_setclientid_ops = {
+ .rpc_call_done = nfs4_setclientid_done,
+};
+
+/**
+ * nfs4_proc_setclientid - Negotiate client ID
+ * @clp: state data structure
+ * @program: RPC program for NFSv4 callback service
+ * @port: IP port number for NFS4 callback service
+ * @cred: credential to use for this call
+ * @res: where to place the result
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status code.
+ */
+int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
+ unsigned short port, const struct cred *cred,
+ struct nfs4_setclientid_res *res)
+{
+ nfs4_verifier sc_verifier;
+ struct nfs4_setclientid setclientid = {
+ .sc_verifier = &sc_verifier,
+ .sc_prog = program,
+ .sc_clnt = clp,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
+ .rpc_argp = &setclientid,
+ .rpc_resp = res,
+ .rpc_cred = cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clp->cl_rpcclient,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_setclientid_ops,
+ .callback_data = &setclientid,
+ .flags = RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN,
+ };
+ /* Sampled before the call so a successful reply renews the lease from here */
+ unsigned long now = jiffies;
+ int status;
+
+ /* nfs_client_id4 */
+ nfs4_init_boot_verifier(clp, &sc_verifier);
+
+ /* Build cl_owner_id if not already set; the form depends on NFS_CS_MIGRATION */
+ if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags))
+ status = nfs4_init_uniform_client_string(clp);
+ else
+ status = nfs4_init_nonuniform_client_string(clp);
+
+ if (status)
+ goto out;
+
+ /* cb_client4 */
+ setclientid.sc_netid_len =
+ nfs4_init_callback_netid(clp,
+ setclientid.sc_netid,
+ sizeof(setclientid.sc_netid));
+ /* Callback address: client IP followed by the port's high and low bytes */
+ setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,
+ sizeof(setclientid.sc_uaddr), "%s.%u.%u",
+ clp->cl_ipaddr, port >> 8, port & 255);
+
+ dprintk("NFS call setclientid auth=%s, '%s'\n",
+ clp->cl_rpcclient->cl_auth->au_ops->au_name,
+ clp->cl_owner_id);
+
+ status = nfs4_call_sync_custom(&task_setup_data);
+ /* sc_cred is set by nfs4_setclientid_done() when the call succeeded */
+ if (setclientid.sc_cred) {
+ kfree(clp->cl_acceptor);
+ clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
+ put_rpccred(setclientid.sc_cred);
+ }
+
+ if (status == 0)
+ do_renew_lease(clp, now);
+out:
+ trace_nfs4_setclientid(clp, status);
+ dprintk("NFS reply setclientid: %d\n", status);
+ return status;
+}
+
+/**
+ * nfs4_proc_setclientid_confirm - Confirm client ID
+ * @clp: state data structure
+ * @arg: result of a previous SETCLIENTID
+ * @cred: credential to use for this call
+ *
+ * Sends a synchronous SETCLIENTID_CONFIRM carrying @arg.
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status code.
+ */
+int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
+				  struct nfs4_setclientid_res *arg,
+				  const struct cred *cred)
+{
+	const int task_flags = RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN;
+	struct rpc_message msg = { };
+	int status;
+
+	msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM];
+	msg.rpc_argp = arg;
+	msg.rpc_cred = cred;
+
+	dprintk("NFS call setclientid_confirm auth=%s, (client ID %llx)\n",
+		clp->cl_rpcclient->cl_auth->au_ops->au_name,
+		clp->cl_clientid);
+	status = rpc_call_sync(clp->cl_rpcclient, &msg, task_flags);
+	trace_nfs4_setclientid_confirm(clp, status);
+	dprintk("NFS reply setclientid_confirm: %d\n", status);
+	return status;
+}
+
+/* Per-call state for an asynchronous DELEGRETURN; see _nfs4_proc_delegreturn(). */
+struct nfs4_delegreturndata {
+ struct nfs4_delegreturnargs args;
+ struct nfs4_delegreturnres res;
+ struct nfs_fh fh; /* private copy of the file handle; args.fhandle points here */
+ nfs4_stateid stateid; /* private copy of the stateid; args.stateid points here */
+ unsigned long timestamp; /* jiffies at setup, used to renew the lease on success */
+ /* Layoutreturn state, used only when pnfs_roc() reported true (roc) */
+ struct {
+ struct nfs4_layoutreturn_args arg;
+ struct nfs4_layoutreturn_res res;
+ struct nfs4_xdr_opaque_data ld_private;
+ u32 roc_barrier;
+ bool roc;
+ } lr;
+ struct nfs_fattr fattr; /* attributes returned by the call; res.fattr points here */
+ int rpc_status; /* final task status, read back by synchronous callers */
+ struct inode *inode; /* result of nfs_igrab_and_active(); may be NULL */
+};
+
+/*
+ * rpc_call_done callback for DELEGRETURN.
+ *
+ * Either restarts the call (out_restart) or marks the delegation as
+ * returned and records the final status in data->rpc_status.
+ */
+static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_delegreturndata *data = calldata;
+ struct nfs4_exception exception = {
+ .inode = data->inode,
+ .stateid = &data->stateid,
+ .task_is_privileged = data->args.seq_args.sa_privileged,
+ };
+
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return;
+
+ trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status);
+
+ /* Handle Layoutreturn errors */
+ if (pnfs_roc_done(task, &data->args.lr_args, &data->res.lr_res,
+ &data->res.lr_ret) == -EAGAIN)
+ goto out_restart;
+
+ switch (task->tk_status) {
+ case 0:
+ renew_lease(data->res.server, data->timestamp);
+ break;
+ /* Revoked/expired stateid: free it, then treat the return as done */
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ nfs4_free_revoked_stateid(data->res.server,
+ data->args.stateid,
+ task->tk_msg.rpc_cred);
+ fallthrough;
+ /* These errors are swallowed: the status is reset to success */
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_STALE_STATEID:
+ case -ETIMEDOUT:
+ task->tk_status = 0;
+ break;
+ case -NFS4ERR_OLD_STATEID:
+ /* Refresh the stateid (or bump its seqid) and retry without attributes */
+ if (!nfs4_refresh_delegation_stateid(&data->stateid, data->inode))
+ nfs4_stateid_seqid_inc(&data->stateid);
+ if (data->args.bitmask) {
+ data->args.bitmask = NULL;
+ data->res.fattr = NULL;
+ }
+ goto out_restart;
+ case -NFS4ERR_ACCESS:
+ /* If attributes were requested, retry once without them */
+ if (data->args.bitmask) {
+ data->args.bitmask = NULL;
+ data->res.fattr = NULL;
+ goto out_restart;
+ }
+ fallthrough;
+ default:
+ task->tk_status = nfs4_async_handle_exception(task,
+ data->res.server, task->tk_status,
+ &exception);
+ if (exception.retry)
+ goto out_restart;
+ }
+ nfs_delegation_mark_returned(data->inode, data->args.stateid);
+ data->rpc_status = task->tk_status;
+ return;
+out_restart:
+ /* Clear the status before re-running the prepare step */
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
+}
+
+/*
+ * rpc_release callback for DELEGRETURN: finish any return-on-close
+ * layoutreturn, refresh the inode from the returned attributes (if an
+ * inode reference was taken), drop that reference and free the call data.
+ */
+static void nfs4_delegreturn_release(void *calldata)
+{
+	struct nfs4_delegreturndata *d_data = calldata;
+	struct inode *inode = d_data->inode;
+
+	if (d_data->lr.roc)
+		pnfs_roc_release(&d_data->lr.arg, &d_data->lr.res,
+				 d_data->res.lr_ret);
+
+	if (inode != NULL) {
+		nfs4_fattr_set_prechange(&d_data->fattr,
+					 inode_peek_iversion_raw(inode));
+		nfs_refresh_inode(inode, &d_data->fattr);
+		nfs_iput_and_deactive(inode);
+	}
+
+	kfree(d_data);
+}
+
+/*
+ * rpc_call_prepare callback for DELEGRETURN.
+ *
+ * Unless this call carries its own layoutreturn, wait for an outstanding
+ * layoutreturn on the inode first.  A layoutreturn whose layout is no
+ * longer valid is dropped from the call before the sequence is set up.
+ */
+static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs4_delegreturndata *d_data = data;
+	struct pnfs_layout_hdr *lo = NULL;
+
+	if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) {
+		nfs4_sequence_done(task, &d_data->res.seq_res);
+		return;
+	}
+
+	if (d_data->args.lr_args)
+		lo = d_data->args.lr_args->layout;
+	if (lo != NULL && !pnfs_layout_is_valid(lo)) {
+		d_data->args.lr_args = NULL;
+		d_data->res.lr_res = NULL;
+	}
+
+	nfs4_setup_sequence(d_data->res.server->nfs_client,
+			    &d_data->args.seq_args,
+			    &d_data->res.seq_res,
+			    task);
+}
+
+/* RPC callbacks for the DELEGRETURN task started by _nfs4_proc_delegreturn(). */
+static const struct rpc_call_ops nfs4_delegreturn_ops = {
+ .rpc_call_prepare = nfs4_delegreturn_prepare,
+ .rpc_call_done = nfs4_delegreturn_done,
+ .rpc_release = nfs4_delegreturn_release,
+};
+
+/*
+ * Start one DELEGRETURN for @stateid on @inode as an async RPC task.
+ *
+ * When @issync is non-zero, wait for the task and return the status the
+ * completion callback stored in data->rpc_status; otherwise return 0 as
+ * soon as the task has been started.  Returns -ENOMEM or the
+ * rpc_run_task() error when the call cannot be started.
+ */
+static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync)
+{
+ struct nfs4_delegreturndata *data;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN],
+ .rpc_cred = cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = server->client,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_delegreturn_ops,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
+ };
+ int status = 0;
+
+ data = kzalloc(sizeof(*data), GFP_NOFS);
+ if (data == NULL)
+ return -ENOMEM;
+
+ nfs4_state_protect(server->nfs_client,
+ NFS_SP4_MACH_CRED_CLEANUP,
+ &task_setup_data.rpc_client, &msg);
+
+ /* The args point at private copies of fh/stateid held in data */
+ data->args.fhandle = &data->fh;
+ data->args.stateid = &data->stateid;
+ nfs4_bitmask_set(data->args.bitmask_store,
+ server->cache_consistency_bitmask, inode, server,
+ NULL);
+ data->args.bitmask = data->args.bitmask_store;
+ nfs_copy_fh(&data->fh, NFS_FH(inode));
+ nfs4_stateid_copy(&data->stateid, stateid);
+ data->res.fattr = &data->fattr;
+ data->res.server = server;
+ data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ data->lr.arg.ld_private = &data->lr.ld_private;
+ nfs_fattr_init(data->res.fattr);
+ data->timestamp = jiffies;
+ data->rpc_status = 0;
+ data->inode = nfs_igrab_and_active(inode);
+ /* Attach a return-on-close layoutreturn when pnfs_roc() says so */
+ if (data->inode || issync) {
+ data->lr.roc = pnfs_roc(inode, &data->lr.arg, &data->lr.res,
+ cred);
+ if (data->lr.roc) {
+ data->args.lr_args = &data->lr.arg;
+ data->res.lr_res = &data->lr.res;
+ }
+ }
+
+ /*
+ * NOTE(review): the last nfs4_init_sequence() argument is 1 only when
+ * no inode reference could be taken -- presumably the "privileged"
+ * flag later read as seq_args.sa_privileged; confirm.
+ */
+ if (!data->inode)
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1,
+ 1);
+ else
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1,
+ 0);
+ task_setup_data.callback_data = data;
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ task = rpc_run_task(&task_setup_data);
+ /*
+ * NOTE(review): data is not freed here -- presumably rpc_run_task()
+ * invokes the rpc_release callback on failure; confirm.
+ */
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ if (!issync)
+ goto out;
+ status = rpc_wait_for_completion_task(task);
+ if (status != 0)
+ goto out;
+ status = data->rpc_status;
+out:
+ rpc_put_task(task);
+ return status;
+}
+
+/*
+ * Return the delegation identified by @stateid, retrying while
+ * nfs4_handle_exception() asks for it.  STALE_STATEID and EXPIRED are
+ * treated as success.  Returns 0 or a negative error.
+ */
+int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs4_exception exception = { };
+	int err;
+
+	for (;;) {
+		err = _nfs4_proc_delegreturn(inode, cred, stateid, issync);
+		trace_nfs4_delegreturn(inode, stateid, err);
+		if (err == 0 ||
+		    err == -NFS4ERR_STALE_STATEID ||
+		    err == -NFS4ERR_EXPIRED)
+			return 0;
+		err = nfs4_handle_exception(server, err, &exception);
+		if (!exception.retry)
+			return err;
+	}
+}
+
+/*
+ * Issue one LOCKT to test whether @request could be granted.
+ *
+ * On a 0 reply the request is marked F_UNLCK (no conflict).  A DENIED
+ * reply is also reported as success; the result structure's "denied"
+ * pointer refers to @request.  Returns 0 or a negative error.
+ */
+static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+	struct inode *inode = state->inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs_lockt_args arg = {
+		.fh	= NFS_FH(inode),
+		.fl	= request,
+	};
+	struct nfs_lockt_res res = {
+		.denied	= request,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_LOCKT],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+		.rpc_cred	= state->owner->so_cred,
+	};
+	struct nfs4_lock_state *lsp;
+	int status;
+
+	arg.lock_owner.clientid = clp->cl_clientid;
+	status = nfs4_set_lock_state(state, request);
+	if (status != 0)
+		return status;
+
+	lsp = request->fl_u.nfs4_fl.owner;
+	arg.lock_owner.id = lsp->ls_seqid.owner_id;
+	arg.lock_owner.s_dev = server->s_dev;
+
+	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+	if (status == 0)
+		request->fl_type = F_UNLCK;
+	else if (status == -NFS4ERR_DENIED)
+		status = 0;
+
+	/* Undo what nfs4_set_lock_state() attached to the request */
+	request->fl_ops->fl_release_private(request);
+	request->fl_ops = NULL;
+	return status;
+}
+
+/*
+ * Test for a conflicting lock, retrying while nfs4_handle_exception()
+ * asks for it.
+ */
+static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+	struct nfs4_exception exception = { .interruptible = true };
+	int err;
+
+	for (;;) {
+		err = _nfs4_proc_getlk(state, cmd, request);
+		trace_nfs4_get_lock(request, state, cmd, err);
+		err = nfs4_handle_exception(NFS_SERVER(state->inode), err,
+					    &exception);
+		if (!exception.retry)
+			break;
+	}
+	return err;
+}
+
+/*
+ * Update the seqid of a lock stateid after receiving
+ * NFS4ERR_OLD_STATEID.
+ *
+ * If @dst still refers to the same state as @lsp's stateid, take over the
+ * newer seqid from @lsp, or bump @dst's seqid when @lsp's is not newer.
+ * Returns true when @dst was updated, false when the stateids no longer
+ * match.
+ */
+static bool nfs4_refresh_lock_old_stateid(nfs4_stateid *dst,
+		struct nfs4_lock_state *lsp)
+{
+	struct nfs4_state *state = lsp->ls_state;
+	bool updated = false;
+
+	spin_lock(&state->state_lock);
+	if (nfs4_stateid_match_other(dst, &lsp->ls_stateid)) {
+		if (nfs4_stateid_is_newer(&lsp->ls_stateid, dst))
+			dst->seqid = lsp->ls_stateid.seqid;
+		else
+			nfs4_stateid_seqid_inc(dst);
+		updated = true;
+	}
+	spin_unlock(&state->state_lock);
+
+	return updated;
+}
+
+/*
+ * Overwrite @dst with the current lock stateid of @lsp.  Returns true
+ * when the two did not match in their "other" field beforehand.
+ */
+static bool nfs4_sync_lock_stateid(nfs4_stateid *dst,
+		struct nfs4_lock_state *lsp)
+{
+	spinlock_t *lock = &lsp->ls_state->state_lock;
+	bool mismatched;
+
+	spin_lock(lock);
+	mismatched = !nfs4_stateid_match_other(dst, &lsp->ls_stateid);
+	nfs4_stateid_copy(dst, &lsp->ls_stateid);
+	spin_unlock(lock);
+
+	return mismatched;
+}
+
+/* Per-call state for a LOCKU request; built by nfs4_alloc_unlockdata(). */
+struct nfs4_unlockdata {
+ struct nfs_locku_args arg;
+ struct nfs_locku_res res;
+ struct nfs4_lock_state *lsp; /* lock state being unlocked; put when the call data is released */
+ struct nfs_open_context *ctx; /* open context reference, keeps the file open until unlocking is done */
+ struct nfs_lock_context *l_ctx; /* lock context obtained from ctx */
+ struct file_lock fl; /* private copy of the caller's file_lock; arg.fl points here */
+ struct nfs_server *server;
+ unsigned long timestamp; /* passed to renew_lease() when the call succeeds */
+};
+
+/*
+ * Allocate and initialise the call data for a LOCKU on @lsp.
+ *
+ * Takes references on the open context and its lock context, copies @fl
+ * into the call data and snapshots the current lock stateid under the
+ * state lock.  Returns NULL when the allocation fails.
+ */
+static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
+		struct nfs_open_context *ctx,
+		struct nfs4_lock_state *lsp,
+		struct nfs_seqid *seqid)
+{
+	struct nfs4_state *state = lsp->ls_state;
+	struct inode *inode = state->inode;
+	struct nfs4_unlockdata *calldata;
+
+	calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
+	if (!calldata)
+		return NULL;
+
+	calldata->arg.fh = NFS_FH(inode);
+	calldata->arg.fl = &calldata->fl;
+	calldata->arg.seqid = seqid;
+	calldata->res.seqid = seqid;
+	calldata->lsp = lsp;
+
+	/* Ensure we don't close file until we're done freeing locks! */
+	calldata->ctx = get_nfs_open_context(ctx);
+	calldata->l_ctx = nfs_get_lock_context(ctx);
+
+	locks_init_lock(&calldata->fl);
+	locks_copy_lock(&calldata->fl, fl);
+	calldata->server = NFS_SERVER(inode);
+
+	spin_lock(&state->state_lock);
+	nfs4_stateid_copy(&calldata->arg.stateid, &lsp->ls_stateid);
+	spin_unlock(&state->state_lock);
+
+	return calldata;
+}
+
+/*
+ * rpc_release callback for LOCKU: frees the seqid, drops the lock state,
+ * lock context and open context held by the calldata, then frees the
+ * calldata itself.
+ */
+static void nfs4_locku_release_calldata(void *data)
+{
+ struct nfs4_unlockdata *calldata = data;
+ nfs_free_seqid(calldata->arg.seqid);
+ nfs4_put_lock_state(calldata->lsp);
+ nfs_put_lock_context(calldata->l_ctx);
+ put_nfs_open_context(calldata->ctx);
+ kfree(calldata);
+}
+
+/*
+ * rpc_call_done callback for LOCKU.
+ *
+ * Returns early, without releasing the seqid, when nfs4_sequence_done()
+ * returns false.  Otherwise the task status is dispatched below and the
+ * seqid is released on the way out, whether or not the call is restarted.
+ */
+static void nfs4_locku_done(struct rpc_task *task, void *data)
+{
+ struct nfs4_unlockdata *calldata = data;
+ struct nfs4_exception exception = {
+ .inode = calldata->lsp->ls_state->inode,
+ .stateid = &calldata->arg.stateid,
+ };
+
+ if (!nfs4_sequence_done(task, &calldata->res.seq_res))
+ return;
+ switch (task->tk_status) {
+ case 0:
+ renew_lease(calldata->server, calldata->timestamp);
+ /* Apply the unlock to the local lock table as well */
+ locks_lock_inode_wait(calldata->lsp->ls_state->inode, &calldata->fl);
+ /* Done if the returned stateid could be recorded; else fall through */
+ if (nfs4_update_lock_stateid(calldata->lsp,
+ &calldata->res.stateid))
+ break;
+ fallthrough;
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ nfs4_free_revoked_stateid(calldata->server,
+ &calldata->arg.stateid,
+ task->tk_msg.rpc_cred);
+ fallthrough;
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_STALE_STATEID:
+ /* Retry only if the lock state now carries a different stateid */
+ if (nfs4_sync_lock_stateid(&calldata->arg.stateid,
+ calldata->lsp))
+ rpc_restart_call_prepare(task);
+ break;
+ case -NFS4ERR_OLD_STATEID:
+ /* Retry with a refreshed seqid if the stateid still matches */
+ if (nfs4_refresh_lock_old_stateid(&calldata->arg.stateid,
+ calldata->lsp))
+ rpc_restart_call_prepare(task);
+ break;
+ default:
+ task->tk_status = nfs4_async_handle_exception(task,
+ calldata->server, task->tk_status,
+ &exception);
+ if (exception.retry)
+ rpc_restart_call_prepare(task);
+ }
+ nfs_release_seqid(calldata->arg.seqid);
+}
+
+/*
+ * rpc_call_prepare callback for LOCKU.
+ *
+ * Waits on the I/O counter (when the open context is flagged
+ * NFS_CONTEXT_UNLOCK) and on the seqid before setting up the sequence.
+ * If the lock state is not flagged NFS_LOCK_INITIALIZED, the task's
+ * action is cleared so that no RPC is sent.
+ */
+static void nfs4_locku_prepare(struct rpc_task *task, void *data)
+{
+ struct nfs4_unlockdata *calldata = data;
+
+ if (test_bit(NFS_CONTEXT_UNLOCK, &calldata->l_ctx->open_context->flags) &&
+ nfs_async_iocounter_wait(task, calldata->l_ctx))
+ return;
+
+ if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
+ goto out_wait;
+ if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
+ /* Note: exit _without_ running nfs4_locku_done */
+ goto out_no_action;
+ }
+ calldata->timestamp = jiffies;
+ /* A non-zero return from sequence setup gives the seqid back */
+ if (nfs4_setup_sequence(calldata->server->nfs_client,
+ &calldata->arg.seq_args,
+ &calldata->res.seq_res,
+ task) != 0)
+ nfs_release_seqid(calldata->arg.seqid);
+ return;
+out_no_action:
+ task->tk_action = NULL;
+out_wait:
+ nfs4_sequence_done(task, &calldata->res.seq_res);
+}
+
+/* RPC callbacks used by nfs4_do_unlck() for the asynchronous LOCKU task. */
+static const struct rpc_call_ops nfs4_locku_ops = {
+ .rpc_call_prepare = nfs4_locku_prepare,
+ .rpc_call_done = nfs4_locku_done,
+ .rpc_release = nfs4_locku_release_calldata,
+};
+
+/*
+ * Start an asynchronous LOCKU for @fl.
+ *
+ * @fl is forced to F_UNLCK, and NFS_CONTEXT_UNLOCK is set on @ctx when
+ * @fl carries FL_CLOSE.
+ *
+ * Returns the task from rpc_run_task(), or ERR_PTR(-ENOMEM) when the
+ * calldata cannot be allocated; in that case @seqid is freed here.
+ */
+static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
+ struct nfs_open_context *ctx,
+ struct nfs4_lock_state *lsp,
+ struct nfs_seqid *seqid)
+{
+ struct nfs4_unlockdata *data;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU],
+ .rpc_cred = ctx->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = NFS_CLIENT(lsp->ls_state->inode),
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_locku_ops,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC,
+ };
+
+ /* May replace the RPC client and message credential set up above */
+ nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client,
+ NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg);
+
+ /* Ensure this is an unlock - when canceling a lock, the
+ * canceled lock is passed in, and it won't be an unlock.
+ */
+ fl->fl_type = F_UNLCK;
+ if (fl->fl_flags & FL_CLOSE)
+ set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags);
+
+ data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid);
+ if (data == NULL) {
+ nfs_free_seqid(seqid);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1, 0);
+ msg.rpc_argp = &data->arg;
+ msg.rpc_resp = &data->res;
+ task_setup_data.callback_data = data;
+ return rpc_run_task(&task_setup_data);
+}
+
+/*
+ * Release a lock: drop it from the local lock table first, then, unless
+ * the lock state was never established on the server, send a LOCKU and
+ * wait for it to complete.
+ *
+ * @request->fl_flags is temporarily modified (FL_EXISTS is added) and is
+ * restored to its original value before returning.
+ *
+ * Returns 0 on success or when there is nothing to send, the error from
+ * nfs4_set_lock_state(), -ENOMEM if the seqid cannot be allocated, or the
+ * error from starting/waiting for the RPC task.
+ */
+static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+	struct inode *inode = state->inode;
+	struct nfs4_state_owner *sp = state->owner;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_seqid *seqid;
+	struct nfs4_lock_state *lsp;
+	struct rpc_task *task;
+	struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+	int status = 0;
+	/*
+	 * Must be wide enough for every flag bit: the saved value is
+	 * written back at "out", and a narrower type would silently clear
+	 * any flag above bit 7.
+	 */
+	unsigned int fl_flags = request->fl_flags;
+
+	status = nfs4_set_lock_state(state, request);
+	/* Unlock _before_ we do the RPC call */
+	request->fl_flags |= FL_EXISTS;
+	/* Exclude nfs_delegation_claim_locks() */
+	mutex_lock(&sp->so_delegreturn_mutex);
+	/* Exclude nfs4_reclaim_open_stateid() - note nesting! */
+	down_read(&nfsi->rwsem);
+	if (locks_lock_inode_wait(inode, request) == -ENOENT) {
+		/* No local lock matched the request: skip the RPC */
+		up_read(&nfsi->rwsem);
+		mutex_unlock(&sp->so_delegreturn_mutex);
+		goto out;
+	}
+	up_read(&nfsi->rwsem);
+	mutex_unlock(&sp->so_delegreturn_mutex);
+	if (status != 0)
+		goto out;
+	/* Is this a delegated lock? */
+	lsp = request->fl_u.nfs4_fl.owner;
+	if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0)
+		goto out;
+	alloc_seqid = NFS_SERVER(inode)->nfs_client->cl_mvops->alloc_seqid;
+	seqid = alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
+	status = -ENOMEM;
+	if (IS_ERR(seqid))
+		goto out;
+	task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid);
+	status = PTR_ERR(task);
+	if (IS_ERR(task))
+		goto out;
+	status = rpc_wait_for_completion_task(task);
+	rpc_put_task(task);
+out:
+	request->fl_flags = fl_flags;
+	trace_nfs4_unlock(request, state, F_SETLK, status);
+	return status;
+}
+
+/* Per-call state for an asynchronous LOCK RPC (see nfs4_lock_ops). */
+struct nfs4_lockdata {
+ struct nfs_lock_args arg; /* RPC arguments */
+ struct nfs_lock_res res; /* RPC results */
+ struct nfs4_lock_state *lsp; /* lock state; put in nfs4_lock_release() */
+ struct nfs_open_context *ctx; /* open context; put in nfs4_lock_release() */
+ struct file_lock fl; /* private copy of the requested lock */
+ unsigned long timestamp; /* jiffies at prepare time, passed to renew_lease() */
+ int rpc_status; /* outcome reported back to _nfs4_do_setlk() */
+ int cancelled; /* set when the waiter gave up on the task */
+ struct nfs_server *server;
+};
+
+/*
+ * Allocate and fill in the calldata for a LOCK request.
+ *
+ * Allocates an open seqid (from the state owner's counter) and a lock
+ * seqid (through the minor-version alloc_seqid operation), takes a
+ * reference on @ctx and copies @fl into the calldata.
+ *
+ * Returns the calldata, or NULL if any of the allocations fails; partial
+ * allocations are undone before returning NULL.
+ */
+static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
+ struct nfs_open_context *ctx, struct nfs4_lock_state *lsp,
+ gfp_t gfp_mask)
+{
+ struct nfs4_lockdata *p;
+ struct inode *inode = lsp->ls_state->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+
+ p = kzalloc(sizeof(*p), gfp_mask);
+ if (p == NULL)
+ return NULL;
+
+ p->arg.fh = NFS_FH(inode);
+ p->arg.fl = &p->fl;
+ p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask);
+ if (IS_ERR(p->arg.open_seqid))
+ goto out_free;
+ alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
+ p->arg.lock_seqid = alloc_seqid(&lsp->ls_seqid, gfp_mask);
+ if (IS_ERR(p->arg.lock_seqid))
+ goto out_free_seqid;
+ p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
+ p->arg.lock_owner.id = lsp->ls_seqid.owner_id;
+ p->arg.lock_owner.s_dev = server->s_dev;
+ p->res.lock_seqid = p->arg.lock_seqid;
+ p->lsp = lsp;
+ p->server = server;
+ p->ctx = get_nfs_open_context(ctx);
+ locks_init_lock(&p->fl);
+ locks_copy_lock(&p->fl, fl);
+ return p;
+out_free_seqid:
+ nfs_free_seqid(p->arg.open_seqid);
+out_free:
+ kfree(p);
+ return NULL;
+}
+
+/*
+ * rpc_call_prepare callback for LOCK.
+ *
+ * Waits on the lock seqid and, when the lock state is not yet flagged
+ * NFS_LOCK_INITIALIZED, on the open seqid too; in that case the request
+ * is sent with the open stateid and new_lock_owner set.  If the open
+ * stateid is no longer valid the task is stopped with rpc_status -EBADF.
+ */
+static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_lockdata *data = calldata;
+ struct nfs4_state *state = data->lsp->ls_state;
+
+ dprintk("%s: begin!\n", __func__);
+ if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
+ goto out_wait;
+ /* Do we need to do an open_to_lock_owner? */
+ if (!test_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags)) {
+ if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) {
+ goto out_release_lock_seqid;
+ }
+ nfs4_stateid_copy(&data->arg.open_stateid,
+ &state->open_stateid);
+ data->arg.new_lock_owner = 1;
+ data->res.open_seqid = data->arg.open_seqid;
+ } else {
+ data->arg.new_lock_owner = 0;
+ nfs4_stateid_copy(&data->arg.lock_stateid,
+ &data->lsp->ls_stateid);
+ }
+ if (!nfs4_valid_open_stateid(state)) {
+ data->rpc_status = -EBADF;
+ task->tk_action = NULL;
+ goto out_release_open_seqid;
+ }
+ data->timestamp = jiffies;
+ if (nfs4_setup_sequence(data->server->nfs_client,
+ &data->arg.seq_args,
+ &data->res.seq_res,
+ task) == 0)
+ return;
+ /* Failure paths: give back the seqids in reverse order of acquisition */
+out_release_open_seqid:
+ nfs_release_seqid(data->arg.open_seqid);
+out_release_lock_seqid:
+ nfs_release_seqid(data->arg.lock_seqid);
+out_wait:
+ nfs4_sequence_done(task, &data->res.seq_res);
+ dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
+}
+
+/*
+ * rpc_call_done callback for LOCK.
+ *
+ * Records the task status in data->rpc_status and, depending on that
+ * status, either updates the lock state or restarts the call.  A restart
+ * is suppressed once the request has been marked cancelled.
+ */
+static void nfs4_lock_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_lockdata *data = calldata;
+ struct nfs4_lock_state *lsp = data->lsp;
+ struct nfs_server *server = NFS_SERVER(d_inode(data->ctx->dentry));
+
+ dprintk("%s: begin!\n", __func__);
+
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return;
+
+ data->rpc_status = task->tk_status;
+ switch (task->tk_status) {
+ case 0:
+ renew_lease(server, data->timestamp);
+ /* For a new, non-cancelled lock: record it locally as well */
+ if (data->arg.new_lock && !data->cancelled) {
+ data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
+ if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0)
+ goto out_restart;
+ }
+ if (data->arg.new_lock_owner != 0) {
+ /* First LOCK for this owner: adopt the returned stateid */
+ nfs_confirm_seqid(&lsp->ls_seqid, 0);
+ nfs4_stateid_copy(&lsp->ls_stateid, &data->res.stateid);
+ set_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
+ } else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid))
+ goto out_restart;
+ break;
+ case -NFS4ERR_OLD_STATEID:
+ if (data->arg.new_lock_owner != 0 &&
+ nfs4_refresh_open_old_stateid(&data->arg.open_stateid,
+ lsp->ls_state))
+ goto out_restart;
+ if (nfs4_refresh_lock_old_stateid(&data->arg.lock_stateid, lsp))
+ goto out_restart;
+ fallthrough;
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_EXPIRED:
+ /* Restart if the stateid that was sent is no longer the current one */
+ if (data->arg.new_lock_owner != 0) {
+ if (!nfs4_stateid_match(&data->arg.open_stateid,
+ &lsp->ls_state->open_stateid))
+ goto out_restart;
+ else if (nfs4_async_handle_error(task, server, lsp->ls_state, NULL) == -EAGAIN)
+ goto out_restart;
+ } else if (!nfs4_stateid_match(&data->arg.lock_stateid,
+ &lsp->ls_stateid))
+ goto out_restart;
+ }
+out_done:
+ dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status);
+ return;
+out_restart:
+ if (!data->cancelled)
+ rpc_restart_call_prepare(task);
+ goto out_done;
+}
+
+/*
+ * rpc_release callback for LOCK.
+ *
+ * If the request was cancelled but the LOCK itself succeeded, an
+ * asynchronous unlock is started and the lock seqid is handed over to it;
+ * otherwise the lock seqid is freed here.  The remaining calldata
+ * resources are released in every case.
+ */
+static void nfs4_lock_release(void *calldata)
+{
+ struct nfs4_lockdata *data = calldata;
+
+ dprintk("%s: begin!\n", __func__);
+ nfs_free_seqid(data->arg.open_seqid);
+ if (data->cancelled && data->rpc_status == 0) {
+ struct rpc_task *task;
+ task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,
+ data->arg.lock_seqid);
+ if (!IS_ERR(task))
+ rpc_put_task_async(task);
+ dprintk("%s: cancelling lock!\n", __func__);
+ } else
+ nfs_free_seqid(data->arg.lock_seqid);
+ nfs4_put_lock_state(data->lsp);
+ put_nfs_open_context(data->ctx);
+ kfree(data);
+ dprintk("%s: done!\n", __func__);
+}
+
+/* RPC callbacks used by _nfs4_do_setlk() for the LOCK task. */
+static const struct rpc_call_ops nfs4_lock_ops = {
+ .rpc_call_prepare = nfs4_lock_prepare,
+ .rpc_call_done = nfs4_lock_done,
+ .rpc_release = nfs4_lock_release,
+};
+
+/*
+ * React to a stateid-related failure of a LOCK request.
+ *
+ * For the errors listed below the lock seqid loses its CONFIRMED flag
+ * and recovery is scheduled: stateid recovery (only if the request used
+ * a new lock owner or the lock state is initialized) or, for
+ * NFS4ERR_STALE_STATEID, lease recovery.  Other errors are ignored here.
+ */
+static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
+{
+ switch (error) {
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_BAD_STATEID:
+ lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+ if (new_lock_owner != 0 ||
+ test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0)
+ nfs4_schedule_stateid_recovery(server, lsp->ls_state);
+ break;
+ case -NFS4ERR_STALE_STATEID:
+ lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+ nfs4_schedule_lease_recovery(server->nfs_client);
+ }
+}
+
+/*
+ * Send one LOCK request and wait for it to finish.
+ *
+ * @recovery_type selects between a new lock (NFS_LOCK_NEW) and the
+ * recovery variants; it determines the allocation mask, the last
+ * argument of nfs4_init_sequence() and whether arg.reclaim or
+ * arg.new_lock is set.
+ *
+ * Returns -ENOMEM if the calldata cannot be allocated, the error from
+ * rpc_run_task(), the error from waiting on the task (in which case the
+ * request is marked cancelled), or the RPC status recorded by
+ * nfs4_lock_done().
+ */
+static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int recovery_type)
+{
+ struct nfs4_lockdata *data;
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK],
+ .rpc_cred = state->owner->so_cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = NFS_CLIENT(state->inode),
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_lock_ops,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
+ };
+ int ret;
+
+ dprintk("%s: begin!\n", __func__);
+ data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
+ fl->fl_u.nfs4_fl.owner,
+ recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS);
+ if (data == NULL)
+ return -ENOMEM;
+ /* Blocking request: ask the server for a blocking lock */
+ if (IS_SETLKW(cmd))
+ data->arg.block = 1;
+ nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1,
+ recovery_type > NFS_LOCK_NEW);
+ msg.rpc_argp = &data->arg;
+ msg.rpc_resp = &data->res;
+ task_setup_data.callback_data = data;
+ if (recovery_type > NFS_LOCK_NEW) {
+ if (recovery_type == NFS_LOCK_RECLAIM)
+ data->arg.reclaim = NFS_LOCK_RECLAIM;
+ } else
+ data->arg.new_lock = 1;
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ ret = rpc_wait_for_completion_task(task);
+ if (ret == 0) {
+ ret = data->rpc_status;
+ if (ret)
+ nfs4_handle_setlk_error(data->server, data->lsp,
+ data->arg.new_lock_owner, ret);
+ } else
+ /* The wait failed: tell the callbacks the result is unwanted */
+ data->cancelled = true;
+ trace_nfs4_set_lock(fl, state, &data->res.stateid, cmd, ret);
+ rpc_put_task(task);
+ dprintk("%s: done, ret = %d!\n", __func__, ret);
+ return ret;
+}
+
+/*
+ * Reclaim @request on the server (NFS_LOCK_RECLAIM).
+ *
+ * Returns 0 without contacting the server while the state is flagged
+ * NFS_DELEGATED_STATE.  Only NFS4ERR_DELAY is retried, through
+ * nfs4_handle_exception(); any other result is returned as is.
+ */
+static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs4_exception exception = {
+ .inode = state->inode,
+ };
+ int err;
+
+ do {
+ /* Cache the lock if possible... */
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
+ return 0;
+ err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
+ if (err != -NFS4ERR_DELAY)
+ break;
+ nfs4_handle_exception(server, err, &exception);
+ } while (exception.retry);
+ return err;
+}
+
+/*
+ * Re-establish @request after its state expired (NFS_LOCK_EXPIRED).
+ *
+ * When recover_lost_locks is not set, the lock state is only flagged
+ * NFS_LOCK_LOST and 0 is returned.  Otherwise the lock is re-requested;
+ * NFS4ERR_GRACE and NFS4ERR_DELAY go through nfs4_handle_exception() and
+ * are reported as 0, any other result ends the loop and is returned.
+ */
+static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request)
+{
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs4_exception exception = {
+ .inode = state->inode,
+ };
+ int err;
+
+ err = nfs4_set_lock_state(state, request);
+ if (err != 0)
+ return err;
+ if (!recover_lost_locks) {
+ set_bit(NFS_LOCK_LOST, &request->fl_u.nfs4_fl.owner->ls_flags);
+ return 0;
+ }
+ do {
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
+ return 0;
+ err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED);
+ switch (err) {
+ default:
+ goto out;
+ case -NFS4ERR_GRACE:
+ case -NFS4ERR_DELAY:
+ nfs4_handle_exception(server, err, &exception);
+ err = 0;
+ }
+ } while (exception.retry);
+out:
+ return err;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * NFSv4.1 variant of lock-expiry handling: nothing is done when the lock
+ * state is flagged NFS_LOCK_INITIALIZED or NFS_LOCK_LOST; otherwise the
+ * work is delegated to nfs4_lock_expired().
+ */
+static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request)
+{
+ struct nfs4_lock_state *lsp;
+ int status;
+
+ status = nfs4_set_lock_state(state, request);
+ if (status != 0)
+ return status;
+ lsp = request->fl_u.nfs4_fl.owner;
+ if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) ||
+ test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
+ return 0;
+ return nfs4_lock_expired(state, request);
+}
+#endif
+
+/*
+ * Make one attempt at acquiring @request.
+ *
+ * The request is first checked against the local lock table with
+ * FL_ACCESS set.  If the state is flagged NFS_DELEGATED_STATE the lock is
+ * then taken locally only (with FL_SLEEP cleared); otherwise a LOCK is
+ * sent to the server.
+ *
+ * @request->fl_flags is restored to its original value before returning.
+ * Returns the status of whichever step ended the attempt.
+ */
+static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+	struct nfs_inode *nfsi = NFS_I(state->inode);
+	struct nfs4_state_owner *sp = state->owner;
+	/*
+	 * Must be wide enough for every flag bit: this copy is both used
+	 * for the cached-lock path and written back at "out", so a
+	 * narrower type would silently drop any flag above bit 7.
+	 */
+	unsigned int fl_flags = request->fl_flags;
+	int status;
+
+	request->fl_flags |= FL_ACCESS;
+	status = locks_lock_inode_wait(state->inode, request);
+	if (status < 0)
+		goto out;
+	mutex_lock(&sp->so_delegreturn_mutex);
+	down_read(&nfsi->rwsem);
+	if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
+		/* Yes: cache locks! */
+		/* ...but avoid races with delegation recall... */
+		request->fl_flags = fl_flags & ~FL_SLEEP;
+		status = locks_lock_inode_wait(state->inode, request);
+		up_read(&nfsi->rwsem);
+		mutex_unlock(&sp->so_delegreturn_mutex);
+		goto out;
+	}
+	up_read(&nfsi->rwsem);
+	mutex_unlock(&sp->so_delegreturn_mutex);
+	status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
+out:
+	request->fl_flags = fl_flags;
+	return status;
+}
+
+/*
+ * Acquire @request, repeating the attempt for as long as
+ * nfs4_handle_exception() sets exception.retry.  A NFS4ERR_DENIED result
+ * is translated to -EAGAIN before it is handed to the exception handler.
+ */
+static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+	struct nfs4_exception exception = {
+		.state = state,
+		.inode = state->inode,
+		.interruptible = true,
+	};
+	int err;
+
+	for (;;) {
+		int status = _nfs4_proc_setlk(state, cmd, request);
+
+		if (status == -NFS4ERR_DENIED)
+			status = -EAGAIN;
+		err = nfs4_handle_exception(NFS_SERVER(state->inode),
+				status, &exception);
+		if (!exception.retry)
+			break;
+	}
+	return err;
+}
+
+#define NFS4_LOCK_MINTIMEOUT (1 * HZ)
+#define NFS4_LOCK_MAXTIMEOUT (30 * HZ)
+
+/*
+ * Polling retry loop for lock requests.
+ *
+ * Calls nfs4_proc_setlk() until it returns something other than -EAGAIN,
+ * the command is a non-blocking SETLK, or a signal is pending.  Between
+ * attempts it sleeps, doubling the delay from NFS4_LOCK_MINTIMEOUT up to
+ * NFS4_LOCK_MAXTIMEOUT.
+ *
+ * Returns the last nfs4_proc_setlk() status, or -ERESTARTSYS when the
+ * loop is left because of a signal.
+ */
+static int
+nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd,
+ struct file_lock *request)
+{
+ int status = -ERESTARTSYS;
+ unsigned long timeout = NFS4_LOCK_MINTIMEOUT;
+
+ while(!signalled()) {
+ status = nfs4_proc_setlk(state, cmd, request);
+ if ((status != -EAGAIN) || IS_SETLK(cmd))
+ break;
+ freezable_schedule_timeout_interruptible(timeout);
+ timeout *= 2;
+ timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout);
+ status = -ERESTARTSYS;
+ }
+ return status;
+}
+
+#ifdef CONFIG_NFS_V4_1
+/* Context attached to a wait queue entry by nfs4_retry_setlk(). */
+struct nfs4_lock_waiter {
+ struct task_struct *task; /* task to wake */
+ struct inode *inode; /* inode the lock request is for */
+ struct nfs_lowner *owner; /* lock owner the waiter is interested in */
+};
+
+/*
+ * Wake function installed on the lock wait queue entries.
+ *
+ * With a non-NULL @key (a struct cb_notify_lock_args) the waiter is woken
+ * only if both the lock owner (id and s_dev) and the file handle match;
+ * otherwise 0 is returned without waking.  On a successful wake-up the
+ * entry is removed from the queue.
+ */
+static int
+nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key)
+{
+ int ret;
+ struct nfs4_lock_waiter *waiter = wait->private;
+
+ /* NULL key means to wake up everyone */
+ if (key) {
+ struct cb_notify_lock_args *cbnl = key;
+ struct nfs_lowner *lowner = &cbnl->cbnl_owner,
+ *wowner = waiter->owner;
+
+ /* Only wake if the callback was for the same owner. */
+ if (lowner->id != wowner->id || lowner->s_dev != wowner->s_dev)
+ return 0;
+
+ /* Make sure it's for the right inode */
+ if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh))
+ return 0;
+ }
+
+ /* override "private" so we can use woken_wake_function */
+ wait->private = waiter->task;
+ ret = woken_wake_function(wait, mode, flags, key);
+ if (ret)
+ list_del_init(&wait->entry);
+ /* restore "private" for any later invocation of this function */
+ wait->private = waiter;
+ return ret;
+}
+
+/*
+ * Retry loop for lock requests that can make use of lock notifications.
+ *
+ * Falls back to nfs4_retry_setlk_simple() when the state is not flagged
+ * NFS_STATE_MAY_NOTIFY_LOCK.  Otherwise the task queues itself on the
+ * client's cl_lock_waitq before each attempt and, on -EAGAIN for a
+ * blocking request, sleeps for up to NFS4_LOCK_MAXTIMEOUT or until
+ * nfs4_wake_lock_waiter() wakes it.
+ *
+ * Returns the last nfs4_proc_setlk() status, or -ERESTARTSYS when the
+ * loop is left because of a signal.
+ */
+static int
+nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+ int status = -ERESTARTSYS;
+ struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner;
+ struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs_client *clp = server->nfs_client;
+ wait_queue_head_t *q = &clp->cl_lock_waitq;
+ struct nfs_lowner owner = { .clientid = clp->cl_clientid,
+ .id = lsp->ls_seqid.owner_id,
+ .s_dev = server->s_dev };
+ struct nfs4_lock_waiter waiter = { .task = current,
+ .inode = state->inode,
+ .owner = &owner};
+ wait_queue_entry_t wait;
+
+ /* Don't bother with waitqueue if we don't expect a callback */
+ if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags))
+ return nfs4_retry_setlk_simple(state, cmd, request);
+
+ init_wait(&wait);
+ wait.private = &waiter;
+ wait.func = nfs4_wake_lock_waiter;
+
+ while(!signalled()) {
+ /* Queue up before the attempt, ahead of any sleep below */
+ add_wait_queue(q, &wait);
+ status = nfs4_proc_setlk(state, cmd, request);
+ if ((status != -EAGAIN) || IS_SETLK(cmd)) {
+ finish_wait(q, &wait);
+ break;
+ }
+
+ status = -ERESTARTSYS;
+ freezer_do_not_count();
+ wait_woken(&wait, TASK_INTERRUPTIBLE, NFS4_LOCK_MAXTIMEOUT);
+ freezer_count();
+ finish_wait(q, &wait);
+ }
+
+ return status;
+}
+#else /* !CONFIG_NFS_V4_1 */
+/* Without CONFIG_NFS_V4_1 only the polling retry loop is available. */
+static inline int
+nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+ return nfs4_retry_setlk_simple(state, cmd, request);
+}
+#endif
+
+/*
+ * Entry point for lock operations on an NFSv4 file.
+ *
+ * Dispatches GETLK requests and unlocks (both return 0 when the open
+ * context has no state), and validates SETLK/SETLKW requests before
+ * handing them to nfs4_retry_setlk().
+ *
+ * Returns -EINVAL for any other command, -ENOLCK when there is no open
+ * state or POSIX locks are not enabled for it, -EBADF when the file mode
+ * does not permit the requested lock type, or the result of the selected
+ * operation.
+ */
+static int
+nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
+{
+ struct nfs_open_context *ctx;
+ struct nfs4_state *state;
+ int status;
+
+ /* verify open state */
+ ctx = nfs_file_open_context(filp);
+ state = ctx->state;
+
+ if (IS_GETLK(cmd)) {
+ if (state != NULL)
+ return nfs4_proc_getlk(state, F_GETLK, request);
+ return 0;
+ }
+
+ if (!(IS_SETLK(cmd) || IS_SETLKW(cmd)))
+ return -EINVAL;
+
+ if (request->fl_type == F_UNLCK) {
+ if (state != NULL)
+ return nfs4_proc_unlck(state, cmd, request);
+ return 0;
+ }
+
+ if (state == NULL)
+ return -ENOLCK;
+
+ if ((request->fl_flags & FL_POSIX) &&
+ !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
+ return -ENOLCK;
+
+ /*
+ * Don't rely on the VFS having checked the file open mode,
+ * since it won't do this for flock() locks.
+ */
+ switch (request->fl_type) {
+ case F_RDLCK:
+ if (!(filp->f_mode & FMODE_READ))
+ return -EBADF;
+ break;
+ case F_WRLCK:
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+ }
+
+ status = nfs4_set_lock_state(state, request);
+ if (status != 0)
+ return status;
+
+ return nfs4_retry_setlk(state, cmd, request);
+}
+
+/*
+ * Push the lock @fl to the server as part of a delegation recall.
+ *
+ * The LOCK is repeated, with a one second pause between attempts, for as
+ * long as the server answers NFS4ERR_DELAY.  The final status is passed
+ * through nfs4_handle_delegation_recall_error(), whose result is
+ * returned.  A failure of nfs4_set_lock_state() is returned directly.
+ */
+int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid)
+{
+	struct nfs_server *server = NFS_SERVER(state->inode);
+	int err = nfs4_set_lock_state(state, fl);
+
+	if (err != 0)
+		return err;
+	for (;;) {
+		err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
+		if (err != -NFS4ERR_DELAY)
+			break;
+		ssleep(1);
+	}
+	return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err);
+}
+
+/* Per-call state for an asynchronous RELEASE_LOCKOWNER RPC. */
+struct nfs_release_lockowner_data {
+ struct nfs4_lock_state *lsp; /* freed in the release callback */
+ struct nfs_server *server;
+ struct nfs_release_lockowner_args args; /* RPC arguments */
+ struct nfs_release_lockowner_res res; /* RPC results */
+ unsigned long timestamp; /* jiffies at prepare time, passed to renew_lease() */
+};
+
+/*
+ * rpc_call_prepare callback for RELEASE_LOCKOWNER: sets up the sequence,
+ * refreshes the client id in the arguments and records the start time.
+ * The return value of nfs4_setup_sequence() is not examined here.
+ */
+static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs_release_lockowner_data *data = calldata;
+ struct nfs_server *server = data->server;
+ nfs4_setup_sequence(server->nfs_client, &data->args.seq_args,
+ &data->res.seq_res, task);
+ data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
+ data->timestamp = jiffies;
+}
+
+/*
+ * rpc_call_done callback for RELEASE_LOCKOWNER.
+ *
+ * Success renews the lease; a stale client id or expired lease schedules
+ * lease recovery; NFS4ERR_LEASE_MOVED and NFS4ERR_DELAY restart the call
+ * when the async error handler returns -EAGAIN.  Other statuses are not
+ * acted upon.
+ */
+static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs_release_lockowner_data *data = calldata;
+ struct nfs_server *server = data->server;
+
+ nfs40_sequence_done(task, &data->res.seq_res);
+
+ switch (task->tk_status) {
+ case 0:
+ renew_lease(server, data->timestamp);
+ break;
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_EXPIRED:
+ nfs4_schedule_lease_recovery(server->nfs_client);
+ break;
+ case -NFS4ERR_LEASE_MOVED:
+ case -NFS4ERR_DELAY:
+ if (nfs4_async_handle_error(task, server,
+ NULL, NULL) == -EAGAIN)
+ rpc_restart_call_prepare(task);
+ }
+}
+
+/*
+ * rpc_release callback for RELEASE_LOCKOWNER: frees the lock state and
+ * then the calldata.
+ */
+static void nfs4_release_lockowner_release(void *calldata)
+{
+ struct nfs_release_lockowner_data *data = calldata;
+ nfs4_free_lock_state(data->server, data->lsp);
+ kfree(calldata);
+}
+
+/* RPC callbacks used by nfs4_release_lockowner(). */
+static const struct rpc_call_ops nfs4_release_lockowner_ops = {
+ .rpc_call_prepare = nfs4_release_lockowner_prepare,
+ .rpc_call_done = nfs4_release_lockowner_done,
+ .rpc_release = nfs4_release_lockowner_release,
+};
+
+/*
+ * Start an asynchronous RELEASE_LOCKOWNER for @lsp.
+ *
+ * Nothing is sent when the client's minor version is not 0 or when the
+ * calldata cannot be allocated.  Once the call has been started, @lsp is
+ * freed by the release callback.
+ *
+ * NOTE(review): on the two early returns @lsp is not freed by this
+ * function - confirm that callers account for that.
+ */
+static void
+nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
+{
+ struct nfs_release_lockowner_data *data;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],
+ };
+
+ if (server->nfs_client->cl_mvops->minor_version != 0)
+ return;
+
+ /* kmalloc, not kzalloc: fields not assigned below start out uninitialised */
+ data = kmalloc(sizeof(*data), GFP_NOFS);
+ if (!data)
+ return;
+ data->lsp = lsp;
+ data->server = server;
+ data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
+ data->args.lock_owner.id = lsp->ls_seqid.owner_id;
+ data->args.lock_owner.s_dev = server->s_dev;
+
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0);
+ rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
+}
+
+#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
+
+/*
+ * xattr "set" handler for the NFSv4 ACL attribute: forwards the buffer to
+ * nfs4_proc_set_acl().  @handler, @unused, @key and @flags are ignored.
+ */
+static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
+{
+ return nfs4_proc_set_acl(inode, buf, buflen);
+}
+
+/*
+ * xattr "get" handler for the NFSv4 ACL attribute: forwards the request
+ * to nfs4_proc_get_acl().  @handler, @unused and @key are ignored.
+ */
+static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, void *buf, size_t buflen)
+{
+ return nfs4_proc_get_acl(inode, buf, buflen);
+}
+
+/* Report whether the server backing @dentry's inode supports ACLs. */
+static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
+{
+ return nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry)));
+}
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+
+/*
+ * xattr "set" handler for security labels.
+ *
+ * Returns -EOPNOTSUPP unless security_ismaclabel() accepts @key;
+ * otherwise the result of nfs4_set_security_label().
+ */
+static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
+		struct dentry *unused, struct inode *inode,
+		const char *key, const void *buf,
+		size_t buflen, int flags)
+{
+	if (!security_ismaclabel(key))
+		return -EOPNOTSUPP;
+
+	return nfs4_set_security_label(inode, buf, buflen);
+}
+
+/*
+ * xattr "get" handler for security labels.
+ *
+ * Returns -EOPNOTSUPP unless security_ismaclabel() accepts @key;
+ * otherwise the result of nfs4_get_security_label().
+ */
+static int nfs4_xattr_get_nfs4_label(const struct xattr_handler *handler,
+		struct dentry *unused, struct inode *inode,
+		const char *key, void *buf, size_t buflen)
+{
+	if (!security_ismaclabel(key))
+		return -EOPNOTSUPP;
+
+	return nfs4_get_security_label(inode, buf, buflen);
+}
+
+/*
+ * List the security xattr names of @inode.
+ *
+ * Returns 0 when the server lacks NFS_CAP_SECURITY_LABEL; otherwise the
+ * value from security_inode_listsecurity(), except that -ERANGE is
+ * returned when @list_len is non-zero and smaller than that value.
+ */
+static ssize_t
+nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
+{
+ int len = 0;
+
+ if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) {
+ len = security_inode_listsecurity(inode, list, list_len);
+ /* list_len == 0 is a size query and is never "too small" */
+ if (len >= 0 && list_len && len > list_len)
+ return -ERANGE;
+ }
+ return len;
+}
+
+/* xattr handler for names carrying XATTR_SECURITY_PREFIX. */
+static const struct xattr_handler nfs4_xattr_nfs4_label_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = nfs4_xattr_get_nfs4_label,
+ .set = nfs4_xattr_set_nfs4_label,
+};
+
+#else
+
+/* Without CONFIG_NFS_V4_SECURITY_LABEL there are no label names to list. */
+static ssize_t
+nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
+{
+ return 0;
+}
+
+#endif
+
+#ifdef CONFIG_NFS_V4_2
+/*
+ * xattr "set" handler for user extended attributes.
+ *
+ * A NULL @buf removes the attribute; otherwise it is set to @buf.  After
+ * a successful RPC the xattr cache is updated to match.
+ *
+ * Returns -EOPNOTSUPP when the server lacks NFS_CAP_XATTR, -EACCES when a
+ * cached access mask is available and lacks NFS_ACCESS_XAWRITE, or the
+ * result of the remove/set operation.
+ */
+static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
+{
+ u32 mask;
+ int ret;
+
+ if (!nfs_server_capable(inode, NFS_CAP_XATTR))
+ return -EOPNOTSUPP;
+
+ /*
+ * There is no mapping from the MAY_* flags to the NFS_ACCESS_XA*
+ * flags right now. Handling of xattr operations use the normal
+ * file read/write permissions.
+ *
+ * Just in case the server has other ideas (which RFC 8276 allows),
+ * do a cached access check for the XA* flags to possibly avoid
+ * doing an RPC and getting EACCES back.
+ */
+ if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) {
+ if (!(mask & NFS_ACCESS_XAWRITE))
+ return -EACCES;
+ }
+
+ if (buf == NULL) {
+ ret = nfs42_proc_removexattr(inode, key);
+ if (!ret)
+ nfs4_xattr_cache_remove(inode, key);
+ } else {
+ ret = nfs42_proc_setxattr(inode, key, buf, buflen, flags);
+ if (!ret)
+ nfs4_xattr_cache_add(inode, key, buf, NULL, buflen);
+ }
+
+ return ret;
+}
+
+/*
+ * xattr "get" handler for user extended attributes.
+ *
+ * The inode is revalidated first; the value is then taken from the xattr
+ * cache, and only a cache result of -ENOENT leads to a GETXATTR request
+ * to the server.
+ *
+ * Returns -EOPNOTSUPP when the server lacks NFS_CAP_XATTR, -EACCES when a
+ * cached access mask is available and lacks NFS_ACCESS_XAREAD, the
+ * revalidation error, or the result of the cache lookup / RPC.
+ */
+static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler,
+		struct dentry *unused, struct inode *inode,
+		const char *key, void *buf, size_t buflen)
+{
+	u32 mask;
+	ssize_t ret;
+
+	if (!nfs_server_capable(inode, NFS_CAP_XATTR))
+		return -EOPNOTSUPP;
+
+	if (!nfs_access_get_cached(inode, current_cred(), &mask, true) &&
+	    !(mask & NFS_ACCESS_XAREAD))
+		return -EACCES;
+
+	ret = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	if (ret)
+		return ret;
+
+	/* Anything but "not cached" (hit or hard error) is final */
+	ret = nfs4_xattr_cache_get(inode, key, buf, buflen);
+	if (ret != -ENOENT)
+		return ret;
+
+	return nfs42_proc_getxattr(inode, key, buf, buflen);
+}
+
+/*
+ * List the user extended attribute names of @inode.
+ *
+ * The list is served from the xattr cache when possible; on a cache
+ * result of -ENOENT it is fetched from the server with repeated
+ * LISTXATTRS calls until eof is reported.  When @list_len is zero no
+ * buffer is passed and only the total size is accumulated.
+ *
+ * Returns 0 when the server lacks NFS_CAP_XATTR or a cached access mask
+ * is available and lacks NFS_ACCESS_XALIST, a negative error, or the
+ * total size of the names.
+ */
+static ssize_t
+nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len)
+{
+ u64 cookie;
+ bool eof;
+ ssize_t ret, size;
+ char *buf;
+ size_t buflen;
+ u32 mask;
+
+ if (!nfs_server_capable(inode, NFS_CAP_XATTR))
+ return 0;
+
+ if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) {
+ if (!(mask & NFS_ACCESS_XALIST))
+ return 0;
+ }
+
+ ret = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (ret)
+ return ret;
+
+ /* Anything but -ENOENT (a hit or a hard error) is returned as is */
+ ret = nfs4_xattr_cache_list(inode, list, list_len);
+ if (ret >= 0 || (ret < 0 && ret != -ENOENT))
+ return ret;
+
+ cookie = 0;
+ eof = false;
+ buflen = list_len ? list_len : XATTR_LIST_MAX;
+ buf = list_len ? list : NULL;
+ size = 0;
+
+ while (!eof) {
+ ret = nfs42_proc_listxattrs(inode, buf, buflen,
+ &cookie, &eof);
+ if (ret < 0)
+ return ret;
+
+ /* Advance the output window only when a buffer was supplied */
+ if (list_len) {
+ buf += ret;
+ buflen -= ret;
+ }
+ size += ret;
+ }
+
+ if (list_len)
+ nfs4_xattr_cache_set_list(inode, list, size);
+
+ return size;
+}
+
+#else
+
+/* Without CONFIG_NFS_V4_2 there are no user xattr names to list. */
+static ssize_t
+nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len)
+{
+ return 0;
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+/*
+ * nfs_fhget will use either the mounted_on_fileid or the fileid.
+ *
+ * When @fattr carries a file id of either kind together with an fsid and
+ * fs_locations, mark it as a referral and fill in a directory type,
+ * mode and link count.  Otherwise @fattr is left untouched.
+ */
+static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)
+{
+	bool has_fileid = (fattr->valid & (NFS_ATTR_FATTR_MOUNTED_ON_FILEID |
+					   NFS_ATTR_FATTR_FILEID)) != 0;
+
+	if (!has_fileid)
+		return;
+	if (!(fattr->valid & NFS_ATTR_FATTR_FSID))
+		return;
+	if (!(fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS))
+		return;
+
+	fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
+		NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_V4_REFERRAL;
+	fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
+	fattr->nlink = 2;
+}
+
+/*
+ * Make one FS_LOCATIONS request for the entry @name in directory @dir.
+ *
+ * @fs_locations is reset (attributes, server, location count) before the
+ * call and receives the result; @page is handed to the RPC arguments.
+ *
+ * Returns the status of nfs4_call_sync().
+ */
+static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
+		const struct qstr *name,
+		struct nfs4_fs_locations *fs_locations,
+		struct page *page)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	/*
+	 * Zero-initialised so that the third word, which is never assigned
+	 * below, has a defined value when the array is passed on through
+	 * args.bitmask.
+	 */
+	u32 bitmask[3] = { 0 };
+	struct nfs4_fs_locations_arg args = {
+		.dir_fh = NFS_FH(dir),
+		.name = name,
+		.page = page,
+		.bitmask = bitmask,
+	};
+	struct nfs4_fs_locations_res res = {
+		.fs_locations = fs_locations,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	int status;
+
+	dprintk("%s: start\n", __func__);
+
+	bitmask[0] = nfs4_fattr_bitmap[0] | FATTR4_WORD0_FS_LOCATIONS;
+	bitmask[1] = nfs4_fattr_bitmap[1];
+
+	/* Ask for the fileid of the absent filesystem if mounted_on_fileid
+	 * is not supported */
+	if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
+		bitmask[0] &= ~FATTR4_WORD0_FILEID;
+	else
+		bitmask[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+
+	nfs_fattr_init(&fs_locations->fattr);
+	fs_locations->server = server;
+	fs_locations->nlocations = 0;
+	status = nfs4_call_sync(client, server, &msg, &args.seq_args, &res.seq_res, 0);
+	dprintk("%s: returned status = %d\n", __func__, status);
+	return status;
+}
+
+/*
+ * Fetch the fs_locations attribute for @name in @dir, repeating the
+ * request for as long as nfs4_handle_exception() sets exception.retry.
+ *
+ * Returns the value nfs4_handle_exception() produced for the last attempt.
+ */
+int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
+		const struct qstr *name,
+		struct nfs4_fs_locations *fs_locations,
+		struct page *page)
+{
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int err;
+
+	for (;;) {
+		int status = _nfs4_proc_fs_locations(client, dir, name,
+				fs_locations, page);
+
+		trace_nfs4_get_fs_locations(dir, name, status);
+		err = nfs4_handle_exception(NFS_SERVER(dir), status,
+				&exception);
+		if (!exception.retry)
+			break;
+	}
+	return err;
+}
+
+/*
+ * This operation also signals the server that this client is
+ * performing migration recovery. The server can stop returning
+ * NFS4ERR_LEASE_MOVED to this client. A RENEW operation is
+ * appended to this compound to identify the client ID which is
+ * performing recovery.
+ *
+ * @locations is reset before the call and receives the result.
+ * Returns the RPC status on failure; on success the lease is renewed,
+ * using the time sampled before the call, and 0 is returned.
+ */
+static int _nfs40_proc_get_locations(struct inode *inode,
+ struct nfs4_fs_locations *locations,
+ struct page *page, const struct cred *cred)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct rpc_clnt *clnt = server->client;
+ u32 bitmask[2] = {
+ [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+ };
+ struct nfs4_fs_locations_arg args = {
+ .clientid = server->nfs_client->cl_clientid,
+ .fh = NFS_FH(inode),
+ .page = page,
+ .bitmask = bitmask,
+ .migration = 1, /* skip LOOKUP */
+ .renew = 1, /* append RENEW */
+ };
+ struct nfs4_fs_locations_res res = {
+ .fs_locations = locations,
+ .migration = 1,
+ .renew = 1,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
+ };
+ unsigned long now = jiffies;
+ int status;
+
+ nfs_fattr_init(&locations->fattr);
+ locations->server = server;
+ locations->nlocations = 0;
+
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1);
+ status = nfs4_call_sync_sequence(clnt, server, &msg,
+ &args.seq_args, &res.seq_res);
+ if (status)
+ return status;
+
+ renew_lease(server, now);
+ return 0;
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * This operation also signals the server that this client is
+ * performing migration recovery. The server can stop asserting
+ * SEQ4_STATUS_LEASE_MOVED for this client. The client ID
+ * performing this operation is identified in the SEQUENCE
+ * operation in this compound.
+ *
+ * When the client supports GETATTR(fs_locations_info), it can
+ * be plumbed in here.
+ */
+static int _nfs41_proc_get_locations(struct inode *inode,
+		struct nfs4_fs_locations *locations,
+		struct page *page, const struct cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	/* Ask only for the FSID and fs_locations attributes. */
+	u32 bitmask[2] = {
+		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+	};
+	struct nfs4_fs_locations_arg args = {
+		.fh = NFS_FH(inode),
+		.page = page,
+		.bitmask = bitmask,
+		.migration = 1,		/* skip LOOKUP */
+	};
+	struct nfs4_fs_locations_res res = {
+		.fs_locations = locations,
+		.migration = 1,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+		.rpc_cred = cred,
+	};
+	int status;
+
+	/* Reset the caller's result structure before issuing the call. */
+	nfs_fattr_init(&locations->fattr);
+	locations->server = server;
+	locations->nlocations = 0;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1);
+	status = nfs4_call_sync_sequence(server->client, server, &msg,
+			&args.seq_args, &res.seq_res);
+	if (status != NFS4_OK)
+		return status;
+
+	/* A successful call can still carry the LEASE_MOVED status flag. */
+	if (res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED)
+		return -NFS4ERR_LEASE_MOVED;
+	return status;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_proc_get_locations - discover locations for a migrated FSID
+ * @inode: inode on FSID that is migrating
+ * @locations: result of query
+ * @page: buffer
+ * @cred: credential to use for this operation
+ *
+ * Returns NFS4_OK on success, a negative NFS4ERR status code if the
+ * operation failed, or a negative errno if a local error occurred.
+ *
+ * On success, "locations" is filled in, but if the server has
+ * no locations information, NFS_ATTR_FATTR_V4_LOCATIONS is not
+ * asserted.
+ *
+ * -NFS4ERR_LEASE_MOVED is returned if the server still has leases
+ * from this client that require migration recovery.
+ */
+int nfs4_proc_get_locations(struct inode *inode,
+		struct nfs4_fs_locations *locations,
+		struct page *page, const struct cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	const struct nfs4_mig_recovery_ops *ops =
+		clp->cl_mvops->mig_recovery_ops;
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int status;
+
+	dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__,
+		(unsigned long long)server->fsid.major,
+		(unsigned long long)server->fsid.minor,
+		clp->cl_hostname);
+	nfs_display_fhandle(NFS_FH(inode), __func__);
+
+	/*
+	 * Only -NFS4ERR_DELAY is routed through nfs4_handle_exception();
+	 * any other result is handed back to the caller untouched.
+	 */
+	while (true) {
+		status = ops->get_locations(inode, locations, page, cred);
+		if (status != -NFS4ERR_DELAY)
+			break;
+		nfs4_handle_exception(server, status, &exception);
+		if (!exception.retry)
+			break;
+	}
+	return status;
+}
+
+/*
+ * This operation also signals the server that this client is
+ * performing "lease moved" recovery. The server can stop
+ * returning NFS4ERR_LEASE_MOVED to this client. A RENEW operation
+ * is appended to this compound to identify the client ID which is
+ * performing recovery.
+ */
+static int _nfs40_proc_fsid_present(struct inode *inode, const struct cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_fsid_present_arg args = {
+		.fh = NFS_FH(inode),
+		.clientid = clp->cl_clientid,
+		.renew = 1,		/* append RENEW */
+	};
+	struct nfs4_fsid_present_res res = {
+		.renew = 1,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+		.rpc_cred = cred,
+	};
+	/* Sampled before the call; passed to do_renew_lease() on success. */
+	unsigned long now = jiffies;
+	int status;
+
+	/* File handle for the reply; freed again right after the call. */
+	res.fh = nfs_alloc_fhandle();
+	if (!res.fh)
+		return -ENOMEM;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1);
+	status = nfs4_call_sync_sequence(server->client, server, &msg,
+			&args.seq_args, &res.seq_res);
+	nfs_free_fhandle(res.fh);
+	if (status == 0)
+		do_renew_lease(clp, now);
+	return status;
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * This operation also signals the server that this client is
+ * performing "lease moved" recovery. The server can stop asserting
+ * SEQ4_STATUS_LEASE_MOVED for this client. The client ID performing
+ * this operation is identified in the SEQUENCE operation in this
+ * compound.
+ */
+static int _nfs41_proc_fsid_present(struct inode *inode, const struct cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs4_fsid_present_arg args = {
+		.fh = NFS_FH(inode),
+	};
+	struct nfs4_fsid_present_res res = {
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+		.rpc_cred = cred,
+	};
+	int status;
+
+	/* File handle for the reply; freed again right after the call. */
+	res.fh = nfs_alloc_fhandle();
+	if (!res.fh)
+		return -ENOMEM;
+
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1);
+	status = nfs4_call_sync_sequence(server->client, server, &msg,
+			&args.seq_args, &res.seq_res);
+	nfs_free_fhandle(res.fh);
+	if (status != NFS4_OK)
+		return status;
+
+	/* A successful call can still carry the LEASE_MOVED status flag. */
+	if (res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED)
+		return -NFS4ERR_LEASE_MOVED;
+	return status;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_proc_fsid_present - Is this FSID present or absent on server?
+ * @inode: inode on FSID to check
+ * @cred: credential to use for this operation
+ *
+ * Server indicates whether the FSID is present, moved, or not
+ * recognized. This operation is necessary to clear a LEASE_MOVED
+ * condition for this client ID.
+ *
+ * Returns NFS4_OK if the FSID is present on this server,
+ * -NFS4ERR_MOVED if the FSID is no longer present, a negative
+ * NFS4ERR code if some error occurred on the server, or a
+ * negative errno if a local failure occurred.
+ */
+int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	const struct nfs4_mig_recovery_ops *ops =
+		clp->cl_mvops->mig_recovery_ops;
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int status;
+
+	dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__,
+		(unsigned long long)server->fsid.major,
+		(unsigned long long)server->fsid.minor,
+		clp->cl_hostname);
+	nfs_display_fhandle(NFS_FH(inode), __func__);
+
+	/*
+	 * Only -NFS4ERR_DELAY is routed through nfs4_handle_exception();
+	 * any other result is handed back to the caller untouched.
+	 */
+	while (true) {
+		status = ops->fsid_present(inode, cred);
+		if (status != -NFS4ERR_DELAY)
+			break;
+		nfs4_handle_exception(server, status, &exception);
+		if (!exception.retry)
+			break;
+	}
+	return status;
+}
+
+/*
+ * If 'use_integrity' is true and the state management nfs_client
+ * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient
+ * and the machine credential as per RFC3530bis and RFC5661 Security
+ * Considerations sections. Otherwise, just use the user cred with the
+ * filesystem's rpc_client.
+ */
+static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name,
+		struct nfs4_secinfo_flavors *flavors, bool use_integrity)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct rpc_clnt *clnt = server->client;
+	struct nfs_client *clp = server->nfs_client;
+	const struct cred *cred = NULL;
+	struct nfs4_secinfo_arg args = {
+		.dir_fh = NFS_FH(dir),
+		.name = name,
+	};
+	struct nfs4_secinfo_res res = {
+		.flavors = flavors,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	struct nfs4_call_sync_data data = {
+		.seq_server = server,
+		.seq_args = &args.seq_args,
+		.seq_res = &res.seq_res,
+	};
+	struct rpc_task_setup task_setup = {
+		.rpc_client = clnt,
+		.rpc_message = &msg,
+		.callback_ops = clp->cl_mvops->call_sync_ops,
+		.callback_data = &data,
+		.flags = RPC_TASK_NO_ROUND_ROBIN,
+	};
+	int status;
+
+	if (use_integrity) {
+		/* Switch to the client's own rpc_clnt and its clid cred. */
+		cred = nfs4_get_clid_cred(clp);
+		msg.rpc_cred = cred;
+		clnt = clp->cl_rpcclient;
+		task_setup.rpc_client = clnt;
+	}
+
+	dprintk("NFS call secinfo %s\n", name->name);
+
+	/*
+	 * NOTE(review): nfs4_state_protect() is handed &clnt, but
+	 * task_setup.rpc_client is not refreshed afterwards - confirm
+	 * whether a client change made there is meant to be picked up.
+	 */
+	nfs4_state_protect(clp, NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg);
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
+	status = nfs4_call_sync_custom(&task_setup);
+
+	dprintk("NFS reply secinfo: %d\n", status);
+
+	/* cred is still NULL here when use_integrity was false. */
+	put_cred(cred);
+	return status;
+}
+
+/*
+ * Issue SECINFO for @name in @dir, first with integrity protection when
+ * the client qualifies for it, then without, and feed the outcome to
+ * nfs4_handle_exception() until no retry is requested.
+ */
+int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name,
+		struct nfs4_secinfo_flavors *flavors)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int err;
+
+	do {
+		/* try to use integrity protection with machine cred */
+		if (_nfs4_is_integrity_protected(server->nfs_client))
+			err = _nfs4_proc_secinfo(dir, name, flavors, true);
+		else
+			err = -NFS4ERR_WRONGSEC;
+
+		/*
+		 * if unable to use integrity protection, or SECINFO with
+		 * integrity protection returns NFS4ERR_WRONGSEC (which is
+		 * disallowed by spec, but exists in deployed servers) use
+		 * the current filesystem's rpc_client and the user cred.
+		 */
+		if (err == -NFS4ERR_WRONGSEC)
+			err = _nfs4_proc_secinfo(dir, name, flavors, false);
+
+		trace_nfs4_secinfo(dir, name, err);
+		err = nfs4_handle_exception(server, err, &exception);
+	} while (exception.retry);
+	return err;
+}
+
+#ifdef CONFIG_NFS_V4_1
+/*
+ * Check the exchange flags returned by the server for invalid flags, having
+ * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or
+ * DS flags set.
+ */
+static int nfs4_check_cl_exchange_flags(u32 flags, u32 version)
+{
+	/* No bit outside the reply mask for this minor version may be set. */
+	if (version >= 2) {
+		if (flags & ~EXCHGID4_2_FLAG_MASK_R)
+			return -NFS4ERR_INVAL;
+	} else {
+		if (flags & ~EXCHGID4_FLAG_MASK_R)
+			return -NFS4ERR_INVAL;
+	}
+
+	/* USE_PNFS_MDS and USE_NON_PNFS must not be set together. */
+	if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) &&
+	    (flags & EXCHGID4_FLAG_USE_NON_PNFS))
+		return -NFS4ERR_INVAL;
+
+	/* At least one bit of the pNFS role mask has to be present. */
+	if ((flags & (EXCHGID4_FLAG_MASK_PNFS)) == 0)
+		return -NFS4ERR_INVAL;
+
+	return NFS_OK;
+}
+
+/* Two server scopes match when their sizes and their bytes are equal. */
+static bool
+nfs41_same_server_scope(struct nfs41_server_scope *a,
+		struct nfs41_server_scope *b)
+{
+	return a->server_scope_sz == b->server_scope_sz &&
+	       memcmp(a->server_scope, b->server_scope,
+		      a->server_scope_sz) == 0;
+}
+
+/*
+ * Completion callback for BIND_CONN_TO_SESSION.
+ *
+ * A BADSESSION/DEADSESSION status schedules session recovery. Otherwise,
+ * when FORE_OR_BOTH was requested and the reply direction is not BOTH,
+ * the connection is closed and the call is restarted a bounded number
+ * of times.
+ */
+static void
+nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs41_bind_conn_to_session_args *args = task->tk_msg.rpc_argp;
+	struct nfs41_bind_conn_to_session_res *res = task->tk_msg.rpc_resp;
+	struct nfs_client *clp = args->client;
+
+	if (task->tk_status == -NFS4ERR_BADSESSION ||
+	    task->tk_status == -NFS4ERR_DEADSESSION) {
+		nfs4_schedule_session_recovery(clp->cl_session,
+				task->tk_status);
+		return;
+	}
+
+	if (args->dir != NFS4_CDFC4_FORE_OR_BOTH)
+		return;
+	if (res->dir == NFS4_CDFS4_BOTH)
+		return;
+
+	rpc_task_close_connection(task);
+	if (args->retries++ < MAX_BIND_CONN_TO_SESSION_RETRIES)
+		rpc_restart_call(task);
+}
+
+static const struct rpc_call_ops nfs4_bind_one_conn_to_session_ops = {
+	.rpc_call_done = nfs4_bind_one_conn_to_session_done,
+};
+
+/*
+ * nfs4_proc_bind_one_conn_to_session()
+ *
+ * The 4.1 client currently uses the same TCP connection for the
+ * fore and backchannel.
+ */
+static
+int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt,
+		struct rpc_xprt *xprt,
+		struct nfs_client *clp,
+		const struct cred *cred)
+{
+	int status;
+	struct nfs41_bind_conn_to_session_args args = {
+		.client = clp,
+		.dir = NFS4_CDFC4_FORE_OR_BOTH,
+		.retries = 0,
+	};
+	/*
+	 * Zero-initialised on purpose: nfs4_bind_one_conn_to_session_done()
+	 * reads res.dir for every completion status other than BADSESSION /
+	 * DEADSESSION, including failures that may leave the result
+	 * untouched, so it must never hold stack garbage.
+	 */
+	struct nfs41_bind_conn_to_session_res res = {
+		.dir = 0,
+	};
+	struct rpc_message msg = {
+		.rpc_proc =
+			&nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+		.rpc_cred = cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = clnt,
+		.rpc_xprt = xprt,
+		.callback_ops = &nfs4_bind_one_conn_to_session_ops,
+		.rpc_message = &msg,
+		.flags = RPC_TASK_TIMEOUT,
+	};
+	struct rpc_task *task;
+
+	nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id);
+	/* Without a back channel on the session, bind the fore channel only. */
+	if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
+		args.dir = NFS4_CDFC4_FORE;
+
+	/* Do not set the backchannel flag unless this is clnt->cl_xprt */
+	if (xprt != rcu_access_pointer(clnt->cl_xprt))
+		args.dir = NFS4_CDFC4_FORE;
+
+	task = rpc_run_task(&task_setup_data);
+	if (!IS_ERR(task)) {
+		status = task->tk_status;
+		rpc_put_task(task);
+	} else
+		status = PTR_ERR(task);
+	trace_nfs4_bind_conn_to_session(clp, status);
+	if (status == 0) {
+		/* Sanity-check the reply against what was requested. */
+		if (memcmp(res.sessionid.data,
+			   clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) {
+			dprintk("NFS: %s: Session ID mismatch\n", __func__);
+			return -EIO;
+		}
+		if ((res.dir & args.dir) != res.dir || res.dir == 0) {
+			dprintk("NFS: %s: Unexpected direction from server\n",
+				__func__);
+			return -EIO;
+		}
+		if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) {
+			dprintk("NFS: %s: Server returned RDMA mode = true\n",
+				__func__);
+			return -EIO;
+		}
+	}
+
+	return status;
+}
+
+/* Arguments handed to each per-transport BIND_CONN_TO_SESSION call. */
+struct rpc_bind_conn_calldata {
+	struct nfs_client *clp;
+	const struct cred *cred;
+};
+
+/* Per-transport callback: unpack @calldata and bind @xprt to the session. */
+static int
+nfs4_proc_bind_conn_to_session_callback(struct rpc_clnt *clnt,
+		struct rpc_xprt *xprt,
+		void *calldata)
+{
+	struct rpc_bind_conn_calldata *bind = calldata;
+	struct nfs_client *clp = bind->clp;
+	const struct cred *cred = bind->cred;
+
+	return nfs4_proc_bind_one_conn_to_session(clnt, xprt, clp, cred);
+}
+
+/*
+ * Run BIND_CONN_TO_SESSION through rpc_clnt_iterate_for_each_xprt() on
+ * clp->cl_rpcclient and return that iterator's result.
+ */
+int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, const struct cred *cred)
+{
+	struct rpc_bind_conn_calldata bind = {
+		.cred = cred,
+		.clp = clp,
+	};
+
+	return rpc_clnt_iterate_for_each_xprt(clp->cl_rpcclient,
+			nfs4_proc_bind_conn_to_session_callback, &bind);
+}
+
+/*
+ * Minimum set of SP4_MACH_CRED operations from RFC 5661 in the enforce map
+ * and operations we'd like to see to enable certain features in the allow map
+ */
+static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = {
+	.how = SP4_MACH_CRED,
+	/* Operations requested in the enforce map (second bitmap word). */
+	.enforce.u.words = {
+		[1] = (1 << (OP_EXCHANGE_ID - 32)) |
+		      (1 << (OP_BIND_CONN_TO_SESSION - 32)) |
+		      (1 << (OP_CREATE_SESSION - 32)) |
+		      (1 << (OP_DESTROY_SESSION - 32)) |
+		      (1 << (OP_DESTROY_CLIENTID - 32))
+	},
+	/* Operations requested in the allow map (both bitmap words). */
+	.allow.u.words = {
+		[0] = (1 << (OP_CLOSE)) |
+		      (1 << (OP_COMMIT)) |
+		      (1 << (OP_DELEGRETURN)) |
+		      (1 << (OP_LOCKU)) |
+		      (1 << (OP_OPEN_DOWNGRADE)),
+		[1] = (1 << (OP_SECINFO - 32)) |
+		      (1 << (OP_SECINFO_NO_NAME - 32)) |
+		      (1 << (OP_LAYOUTRETURN - 32)) |
+		      (1 << (OP_TEST_STATEID - 32)) |
+		      (1 << (OP_FREE_STATEID - 32)) |
+		      (1 << (OP_WRITE - 32))
+	}
+};
+
+/*
+ * Select the state protection mode for client `clp' given the server results
+ * from exchange_id in `sp'.
+ *
+ * Returns 0 on success, negative errno otherwise.
+ */
+static int nfs4_sp4_select_mode(struct nfs_client *clp,
+		struct nfs41_state_protection *sp)
+{
+	/* The only operations this client accepts in the enforce map. */
+	static const u32 supported_enforce[NFS4_OP_MAP_NUM_WORDS] = {
+		[1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
+		      1 << (OP_EXCHANGE_ID - 32) |
+		      1 << (OP_CREATE_SESSION - 32) |
+		      1 << (OP_DESTROY_SESSION - 32) |
+		      1 << (OP_DESTROY_CLIENTID - 32)
+	};
+	unsigned long flags = 0;
+	unsigned int i;
+	int ret = 0;
+
+	/* Anything but SP4_MACH_CRED leaves every mode flag cleared. */
+	if (sp->how != SP4_MACH_CRED)
+		goto out;
+
+	/* Print state protect result */
+	dfprintk(MOUNT, "Server SP4_MACH_CRED support:\n");
+	for (i = 0; i <= LAST_NFS4_OP; i++) {
+		if (test_bit(i, sp->enforce.u.longs))
+			dfprintk(MOUNT, " enforce op %d\n", i);
+		if (test_bit(i, sp->allow.u.longs))
+			dfprintk(MOUNT, " allow op %d\n", i);
+	}
+
+	/* make sure nothing is on enforce list that isn't supported */
+	for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) {
+		if ((sp->enforce.u.words[i] & ~supported_enforce[i]) == 0)
+			continue;
+		dfprintk(MOUNT, "sp4_mach_cred: disabled\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Minimal mode - state operations are allowed to use machine
+	 * credential. Note this already happens by default, so the
+	 * client doesn't have to do anything more than the negotiation.
+	 *
+	 * NOTE: we don't care if EXCHANGE_ID is in the list -
+	 * we're already using the machine cred for exchange_id
+	 * and will never use a different cred.
+	 */
+	if (!test_bit(OP_BIND_CONN_TO_SESSION, sp->enforce.u.longs) ||
+	    !test_bit(OP_CREATE_SESSION, sp->enforce.u.longs) ||
+	    !test_bit(OP_DESTROY_SESSION, sp->enforce.u.longs) ||
+	    !test_bit(OP_DESTROY_CLIENTID, sp->enforce.u.longs)) {
+		dfprintk(MOUNT, "sp4_mach_cred: disabled\n");
+		ret = -EINVAL;
+		goto out;
+	}
+	dfprintk(MOUNT, "sp4_mach_cred:\n");
+	dfprintk(MOUNT, " minimal mode enabled\n");
+	__set_bit(NFS_SP4_MACH_CRED_MINIMAL, &flags);
+
+	/* Each optional mode needs all of its operations in the allow map. */
+	if (test_bit(OP_CLOSE, sp->allow.u.longs) &&
+	    test_bit(OP_OPEN_DOWNGRADE, sp->allow.u.longs) &&
+	    test_bit(OP_DELEGRETURN, sp->allow.u.longs) &&
+	    test_bit(OP_LOCKU, sp->allow.u.longs)) {
+		dfprintk(MOUNT, " cleanup mode enabled\n");
+		__set_bit(NFS_SP4_MACH_CRED_CLEANUP, &flags);
+	}
+
+	if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) {
+		dfprintk(MOUNT, " pnfs cleanup mode enabled\n");
+		__set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP, &flags);
+	}
+
+	if (test_bit(OP_SECINFO, sp->allow.u.longs) &&
+	    test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) {
+		dfprintk(MOUNT, " secinfo mode enabled\n");
+		__set_bit(NFS_SP4_MACH_CRED_SECINFO, &flags);
+	}
+
+	if (test_bit(OP_TEST_STATEID, sp->allow.u.longs) &&
+	    test_bit(OP_FREE_STATEID, sp->allow.u.longs)) {
+		dfprintk(MOUNT, " stateid mode enabled\n");
+		__set_bit(NFS_SP4_MACH_CRED_STATEID, &flags);
+	}
+
+	if (test_bit(OP_WRITE, sp->allow.u.longs)) {
+		dfprintk(MOUNT, " write mode enabled\n");
+		__set_bit(NFS_SP4_MACH_CRED_WRITE, &flags);
+	}
+
+	if (test_bit(OP_COMMIT, sp->allow.u.longs)) {
+		dfprintk(MOUNT, " commit mode enabled\n");
+		__set_bit(NFS_SP4_MACH_CRED_COMMIT, &flags);
+	}
+out:
+	/* Stored on every path; still zero when selection was rejected. */
+	clp->cl_sp4_flags = flags;
+	return ret;
+}
+
+/* Heap-allocated call data for one EXCHANGE_ID task. */
+struct nfs41_exchange_id_data {
+	struct nfs41_exchange_id_res res;
+	struct nfs41_exchange_id_args args;
+};
+
+/*
+ * rpc_release callback: drop the client reference held by the call data,
+ * then free the result buffers and the call data itself.
+ */
+static void nfs4_exchange_id_release(void *data)
+{
+	struct nfs41_exchange_id_data *cdata = data;
+	struct nfs41_exchange_id_res *res = &cdata->res;
+
+	nfs_put_client(cdata->args.client);
+	kfree(res->impl_id);
+	kfree(res->server_scope);
+	kfree(res->server_owner);
+	kfree(cdata);
+}
+
+static const struct rpc_call_ops nfs4_exchange_id_call_ops = {
+	.rpc_release = nfs4_exchange_id_release,
+};
+
+/*
+ * nfs4_run_exchange_id()
+ *
+ * Build the call data for an EXCHANGE_ID operation and start the RPC task.
+ * When @xprt is non-NULL the task is bound to that transport, flagged
+ * RPC_TASK_SOFTCONN, and sends the verifier saved in clp->cl_confirm.
+ *
+ * Returns the result of rpc_run_task(), or an ERR_PTR() when taking the
+ * client reference, an allocation, the client string setup or the
+ * @sp4_how check fails.
+ */
+static struct rpc_task *
+nfs4_run_exchange_id(struct nfs_client *clp, const struct cred *cred,
+		u32 sp4_how, struct rpc_xprt *xprt)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID],
+		.rpc_cred = cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = clp->cl_rpcclient,
+		.callback_ops = &nfs4_exchange_id_call_ops,
+		.rpc_message = &msg,
+		.flags = RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN,
+	};
+	struct nfs41_exchange_id_data *calldata;
+	int status;
+
+	/* Dropped again at "out" below or by nfs4_exchange_id_release(). */
+	if (!refcount_inc_not_zero(&clp->cl_count))
+		return ERR_PTR(-EIO);
+
+	calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
+	if (!calldata) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	nfs4_init_boot_verifier(clp, &calldata->args.verifier);
+
+	status = nfs4_init_uniform_client_string(clp);
+	if (status)
+		goto out_calldata;
+
+	/* Every allocation failure from here on reports -ENOMEM. */
+	status = -ENOMEM;
+
+	calldata->res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
+			GFP_NOFS);
+	if (unlikely(!calldata->res.server_owner))
+		goto out_calldata;
+
+	calldata->res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
+			GFP_NOFS);
+	if (unlikely(!calldata->res.server_scope))
+		goto out_server_owner;
+
+	calldata->res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS);
+	if (unlikely(!calldata->res.impl_id))
+		goto out_server_scope;
+
+	if (sp4_how == SP4_NONE) {
+		calldata->args.state_protect.how = SP4_NONE;
+	} else if (sp4_how == SP4_MACH_CRED) {
+		calldata->args.state_protect = nfs4_sp4_mach_cred_request;
+	} else {
+		/* unsupported! */
+		WARN_ON_ONCE(1);
+		status = -EINVAL;
+		goto out_impl_id;
+	}
+
+	if (xprt) {
+		task_setup_data.rpc_xprt = xprt;
+		task_setup_data.flags |= RPC_TASK_SOFTCONN;
+		memcpy(calldata->args.verifier.data, clp->cl_confirm.data,
+		       sizeof(calldata->args.verifier.data));
+	}
+
+	calldata->args.client = clp;
+	calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+			       EXCHGID4_FLAG_BIND_PRINC_STATEID;
+#ifdef CONFIG_NFS_V4_1_MIGRATION
+	calldata->args.flags |= EXCHGID4_FLAG_SUPP_MOVED_MIGR;
+#endif
+
+	msg.rpc_argp = &calldata->args;
+	msg.rpc_resp = &calldata->res;
+	task_setup_data.callback_data = calldata;
+
+	return rpc_run_task(&task_setup_data);
+
+	/* Error unwinding, in reverse order of acquisition. */
+out_impl_id:
+	kfree(calldata->res.impl_id);
+out_server_scope:
+	kfree(calldata->res.server_scope);
+out_server_owner:
+	kfree(calldata->res.server_owner);
+out_calldata:
+	kfree(calldata);
+out:
+	nfs_put_client(clp);
+	return ERR_PTR(status);
+}
+
+/*
+ * _nfs4_proc_exchange_id()
+ *
+ * Wrapper for EXCHANGE_ID operation.
+ */
+static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred,
+		u32 sp4_how)
+{
+	struct nfs41_exchange_id_args *argp;
+	struct nfs41_exchange_id_res *resp;
+	struct rpc_task *task;
+	/* Sampled before the call; passed to do_renew_lease() on success. */
+	unsigned long now = jiffies;
+	int status;
+
+	task = nfs4_run_exchange_id(clp, cred, sp4_how, NULL);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	argp = task->tk_msg.rpc_argp;
+	resp = task->tk_msg.rpc_resp;
+
+	/* Each validation step runs only while the previous one succeeded. */
+	status = task->tk_status;
+	if (status == 0)
+		status = nfs4_check_cl_exchange_flags(resp->flags,
+				clp->cl_mvops->minor_version);
+	if (status == 0)
+		status = nfs4_sp4_select_mode(clp, &resp->state_protect);
+	if (status != 0)
+		goto out;
+
+	do_renew_lease(clp, now);
+
+	clp->cl_clientid = resp->clientid;
+	clp->cl_exchange_flags = resp->flags;
+	clp->cl_seqid = resp->seqid;
+	/* Client ID is not confirmed */
+	if (!(resp->flags & EXCHGID4_FLAG_CONFIRMED_R))
+		clear_bit(NFS4_SESSION_ESTABLISHED,
+			  &clp->cl_session->session_state);
+
+	if (clp->cl_serverscope != NULL &&
+	    !nfs41_same_server_scope(clp->cl_serverscope,
+				     resp->server_scope)) {
+		dprintk("%s: server_scope mismatch detected\n",
+			__func__);
+		set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
+	}
+
+	/*
+	 * Hand the new buffers to the client; the previous ones end up in
+	 * the result structure, which nfs4_exchange_id_release() frees.
+	 */
+	swap(clp->cl_serverowner, resp->server_owner);
+	swap(clp->cl_serverscope, resp->server_scope);
+	swap(clp->cl_implid, resp->impl_id);
+
+	/* Save the EXCHANGE_ID verifier for session trunk tests */
+	memcpy(clp->cl_confirm.data, argp->verifier.data,
+	       sizeof(clp->cl_confirm.data));
+out:
+	trace_nfs4_exchange_id(clp, status);
+	rpc_put_task(task);
+	return status;
+}
+
+/*
+ * nfs4_proc_exchange_id()
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status code.
+ *
+ * Since the clientid has expired, all compounds using sessions
+ * associated with the stale clientid will be returning
+ * NFS4ERR_BADSESSION in the sequence operation, and will therefore
+ * be in some phase of session reset.
+ *
+ * Will attempt to negotiate SP4_MACH_CRED if krb5i / krb5p auth is used.
+ */
+int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred)
+{
+	rpc_authflavor_t authflavor = clp->cl_rpcclient->cl_auth->au_flavor;
+
+	/* try SP4_MACH_CRED if krb5i/p */
+	switch (authflavor) {
+	case RPC_AUTH_GSS_KRB5I:
+	case RPC_AUTH_GSS_KRB5P:
+		if (_nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED) == 0)
+			return 0;
+		break;
+	default:
+		break;
+	}
+
+	/* try SP4_NONE */
+	return _nfs4_proc_exchange_id(clp, cred, SP4_NONE);
+}
+
+/**
+ * nfs4_test_session_trunk
+ *
+ * This is an add_xprt_test() test function called from
+ * rpc_clnt_setup_test_and_add_xprt.
+ *
+ * The rpc_xprt_switch is referenced by rpc_clnt_setup_test_and_add_xprt
+ * and is dereferenced in nfs4_exchange_id_release
+ *
+ * Upon success, add the new transport to the rpc_clnt
+ *
+ * @clnt: struct rpc_clnt to get new transport
+ * @xprt: the rpc_xprt to test
+ * @data: call data for _nfs4_proc_exchange_id.
+ */
+void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
+		void *data)
+{
+	struct nfs4_add_xprt_data *adata = data;
+	struct nfs_client *clp = adata->clp;
+	struct rpc_task *task;
+	u32 sp4_how;
+
+	dprintk("--> %s try %s\n", __func__,
+		xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+	/* Request SP4_MACH_CRED only when the client has sp4 flags set. */
+	if (clp->cl_sp4_flags == 0)
+		sp4_how = SP4_NONE;
+	else
+		sp4_how = SP4_MACH_CRED;
+
+	/* Test connection for session trunking. Async exchange_id call */
+	task = nfs4_run_exchange_id(clp, adata->cred, sp4_how, xprt);
+	if (IS_ERR(task))
+		return;
+
+	/* The transport is added only if both the call and the check pass. */
+	if (task->tk_status == 0 &&
+	    nfs4_detect_session_trunking(clp, task->tk_msg.rpc_resp,
+					 xprt) == 0)
+		rpc_clnt_xprt_switch_add_xprt(clnt, xprt);
+
+	rpc_put_task(task);
+}
+EXPORT_SYMBOL_GPL(nfs4_test_session_trunk);
+
+/*
+ * Send a single DESTROY_CLIENTID call for @clp using @cred.
+ *
+ * Returns the status of rpc_call_sync(); a non-zero status is also
+ * reported through dprintk().
+ */
+static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
+		const struct cred *cred)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_CLIENTID],
+		.rpc_argp = clp,
+		.rpc_cred = cred,
+	};
+	int status;
+
+	status = rpc_call_sync(clp->cl_rpcclient, &msg,
+			RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
+	trace_nfs4_destroy_clientid(clp, status);
+	/* Terminate the message with a newline like every other dprintk. */
+	if (status)
+		dprintk("NFS: Got error %d from the server %s on "
+			"DESTROY_CLIENTID.\n", status, clp->cl_hostname);
+	return status;
+}
+
+/*
+ * Retry DESTROY_CLIENTID up to NFS4_MAX_LOOP_ON_RECOVER times, sleeping
+ * one second after each -NFS4ERR_DELAY or -NFS4ERR_CLIENTID_BUSY result.
+ * Any other result is returned immediately.
+ *
+ * NOTE(review): when every attempt ends in DELAY/CLIENTID_BUSY this
+ * returns 0 rather than the last error - confirm that is intended.
+ */
+static int nfs4_proc_destroy_clientid(struct nfs_client *clp,
+		const struct cred *cred)
+{
+	unsigned int tries = NFS4_MAX_LOOP_ON_RECOVER;
+	int ret;
+
+	while (tries-- != 0) {
+		ret = _nfs4_proc_destroy_clientid(clp, cred);
+		if (ret != -NFS4ERR_DELAY && ret != -NFS4ERR_CLIENTID_BUSY)
+			return ret;
+		ssleep(1);
+	}
+	return 0;
+}
+
+/*
+ * Destroy the client ID of @clp on the server unless the minor version is
+ * below 1, no exchange flags are recorded, or cl_preserve_clid is set.
+ * The recorded exchange flags are cleared when the call returns 0 or
+ * -NFS4ERR_STALE_CLIENTID.
+ */
+int nfs4_destroy_clientid(struct nfs_client *clp)
+{
+	const struct cred *cred;
+	int ret;
+
+	if (clp->cl_mvops->minor_version < 1 ||
+	    clp->cl_exchange_flags == 0 ||
+	    clp->cl_preserve_clid)
+		return 0;
+
+	cred = nfs4_get_clid_cred(clp);
+	ret = nfs4_proc_destroy_clientid(clp, cred);
+	put_cred(cred);
+
+	if (ret == 0 || ret == -NFS4ERR_STALE_CLIENTID)
+		clp->cl_exchange_flags = 0;
+	return ret;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+/* Call data shared by the GET_LEASE_TIME prepare and done callbacks. */
+struct nfs4_get_lease_time_data {
+	struct nfs4_get_lease_time_args *args;
+	struct nfs4_get_lease_time_res *res;
+	struct nfs_client *clp;
+};
+
+static void nfs4_get_lease_time_prepare(struct rpc_task *task,
+		void *calldata)
+{
+	struct nfs4_get_lease_time_data *data = calldata;
+	struct nfs4_sequence_args *seq_args = &data->args->la_seq_args;
+	struct nfs4_sequence_res *seq_res = &data->res->lr_seq_res;
+
+	dprintk("--> %s\n", __func__);
+	/*
+	 * just setup sequence, do not trigger session recovery
+	 * since we're invoked within one
+	 */
+	nfs4_setup_sequence(data->clp, seq_args, seq_res, task);
+	dprintk("<-- %s\n", __func__);
+}
+
+/*
+ * Called from nfs4_state_manager thread for session setup, so don't recover
+ * from sequence operation or clientid errors.
+ */
+static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_get_lease_time_data *data = calldata;
+
+	dprintk("--> %s\n", __func__);
+	if (!nfs4_sequence_done(task, &data->res->lr_seq_res))
+		return;
+
+	/* DELAY and GRACE: back off, clear the status, then restart. */
+	if (task->tk_status == -NFS4ERR_DELAY ||
+	    task->tk_status == -NFS4ERR_GRACE) {
+		dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
+		rpc_delay(task, NFS4_POLL_RETRY_MIN);
+		task->tk_status = 0;
+		rpc_restart_call_prepare(task);
+		return;
+	}
+
+	/* RETRY_UNCACHED_REP: restart straight away, status left as is. */
+	if (task->tk_status == -NFS4ERR_RETRY_UNCACHED_REP) {
+		rpc_restart_call_prepare(task);
+		return;
+	}
+	dprintk("<-- %s\n", __func__);
+}
+
+static const struct rpc_call_ops nfs4_get_lease_time_ops = {
+	.rpc_call_prepare = nfs4_get_lease_time_prepare,
+	.rpc_call_done = nfs4_get_lease_time_done,
+};
+
+/*
+ * Issue GET_LEASE_TIME on clp->cl_rpcclient with @fsinfo as the result
+ * buffer, using the nfs4_get_lease_time_ops callbacks.
+ *
+ * Returns the result of nfs4_call_sync_custom().
+ */
+int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
+{
+	struct nfs4_get_lease_time_args args;
+	struct nfs4_get_lease_time_res res = {
+		.lr_fsinfo = fsinfo,
+	};
+	struct nfs4_get_lease_time_data calldata = {
+		.clp = clp,
+		.args = &args,
+		.res = &res,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GET_LEASE_TIME],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	struct rpc_task_setup setup = {
+		.rpc_client = clp->cl_rpcclient,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_get_lease_time_ops,
+		.callback_data = &calldata,
+		.flags = RPC_TASK_TIMEOUT,
+	};
+
+	nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0, 1);
+	return nfs4_call_sync_custom(&setup);
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * Initialize the values to be used by the client in CREATE_SESSION
+ * If nfs4_init_session set the fore channel request and response sizes,
+ * use them.
+ *
+ * Set the back channel max_resp_sz_cached to zero to force the client to
+ * always set csa_cachethis to FALSE because the current implementation
+ * of the back channel DRC only supports caching the CB_SEQUENCE operation.
+ */
+static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args,
+		struct rpc_clnt *clnt)
+{
+	struct nfs4_channel_attrs *fc = &args->fc_attrs;
+	struct nfs4_channel_attrs *bc = &args->bc_attrs;
+	unsigned int max_rqst_sz, max_resp_sz;
+	unsigned int max_bc_payload = rpc_max_bc_payload(clnt);
+	unsigned int max_bc_slots = rpc_num_bc_slots(clnt);
+
+	max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
+	max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
+
+	/* Fore channel attributes */
+	fc->max_rqst_sz = max_rqst_sz;
+	fc->max_resp_sz = max_resp_sz;
+	fc->max_ops = NFS4_MAX_OPS;
+	fc->max_reqs = max_session_slots;
+
+	dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "
+		"max_ops=%u max_reqs=%u\n",
+		__func__,
+		fc->max_rqst_sz, fc->max_resp_sz,
+		fc->max_ops, fc->max_reqs);
+
+	/* Back channel attributes */
+	bc->max_rqst_sz = max_bc_payload;
+	bc->max_resp_sz = max_bc_payload;
+	bc->max_resp_sz_cached = 0;
+	bc->max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
+	/* At least one slot, but never more than the transport offers. */
+	bc->max_reqs = max_t(unsigned short, max_session_cb_slots, 1);
+	if (bc->max_reqs > max_bc_slots)
+		bc->max_reqs = max_bc_slots;
+
+	dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
+		"max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
+		__func__,
+		bc->max_rqst_sz, bc->max_resp_sz,
+		bc->max_resp_sz_cached, bc->max_ops,
+		bc->max_reqs);
+}
+
+/*
+ * Check the fore channel attributes returned by CREATE_SESSION against
+ * the ones that were sent. Returns 0 if acceptable, -EINVAL otherwise.
+ * An oversized max_reqs is clamped to NFS4_MAX_SLOT_TABLE in @res.
+ */
+static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args,
+		struct nfs41_create_session_res *res)
+{
+	struct nfs4_channel_attrs *sent = &args->fc_attrs;
+	struct nfs4_channel_attrs *rcvd = &res->fc_attrs;
+
+	/*
+	 * Our requested max_ops is the minimum we need; we're not
+	 * prepared to break up compounds into smaller pieces than that.
+	 * So, no point even trying to continue if the server won't
+	 * cooperate.
+	 */
+	if (rcvd->max_resp_sz > sent->max_resp_sz ||
+	    rcvd->max_ops < sent->max_ops ||
+	    rcvd->max_reqs == 0)
+		return -EINVAL;
+
+	if (rcvd->max_reqs > NFS4_MAX_SLOT_TABLE)
+		rcvd->max_reqs = NFS4_MAX_SLOT_TABLE;
+	return 0;
+}
+
+/*
+ * Check the back channel attributes returned by CREATE_SESSION against
+ * the ones that were sent. Nothing is checked when the reply does not
+ * carry SESSION4_BACK_CHAN. Returns 0 if acceptable, -EINVAL otherwise.
+ */
+static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args,
+		struct nfs41_create_session_res *res)
+{
+	struct nfs4_channel_attrs *sent = &args->bc_attrs;
+	struct nfs4_channel_attrs *rcvd = &res->bc_attrs;
+
+	if ((res->flags & SESSION4_BACK_CHAN) == 0)
+		return 0;
+
+	if (rcvd->max_rqst_sz > sent->max_rqst_sz ||
+	    rcvd->max_resp_sz < sent->max_resp_sz ||
+	    rcvd->max_resp_sz_cached > sent->max_resp_sz_cached ||
+	    rcvd->max_ops > sent->max_ops ||
+	    rcvd->max_reqs > sent->max_reqs)
+		return -EINVAL;
+
+	return 0;
+}
+
+/* Verify the fore channel first; the back channel only if that passed. */
+static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
+		struct nfs41_create_session_res *res)
+{
+	int ret = nfs4_verify_fore_channel_attrs(args, res);
+
+	if (ret == 0)
+		ret = nfs4_verify_back_channel_attrs(args, res);
+	return ret;
+}
+
+/* Copy the negotiated CREATE_SESSION results from @res into @session. */
+static void nfs4_update_session(struct nfs4_session *session,
+		struct nfs41_create_session_res *res)
+{
+	struct nfs_client *clp = session->clp;
+
+	nfs4_copy_sessionid(&session->sess_id, &res->sessionid);
+
+	/* Mark client id and session as being confirmed */
+	clp->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
+	set_bit(NFS4_SESSION_ESTABLISHED, &session->session_state);
+
+	session->flags = res->flags;
+	memcpy(&session->fc_attrs, &res->fc_attrs, sizeof(session->fc_attrs));
+
+	/* Back channel attributes are taken only if one was granted. */
+	if (!(res->flags & SESSION4_BACK_CHAN))
+		return;
+	memcpy(&session->bc_attrs, &res->bc_attrs,
+	       sizeof(session->bc_attrs));
+}
+
+/*
+ * Send one synchronous CREATE_SESSION for @clp and, on success, validate
+ * the negotiated channel attributes and store the result in
+ * clp->cl_session.
+ *
+ * clp->cl_seqid is incremented for every outcome except the errors listed
+ * in the switch below.  Returns 0 or a negative error.
+ */
+static int _nfs4_proc_create_session(struct nfs_client *clp,
+		const struct cred *cred)
+{
+	struct nfs4_session *session = clp->cl_session;
+	struct nfs41_create_session_args args = {
+		.client = clp,
+		.clientid = clp->cl_clientid,
+		.seqid = clp->cl_seqid,
+		.cb_program = NFS4_CALLBACK,
+	};
+	struct nfs41_create_session_res res;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+		.rpc_cred = cred,
+	};
+	int status;
+
+	nfs4_init_channel_attrs(&args, clp->cl_rpcclient);
+	args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
+
+	status = rpc_call_sync(session->clp->cl_rpcclient, &msg,
+			       RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
+	trace_nfs4_create_session(clp, status);
+
+	switch (status) {
+	case -NFS4ERR_STALE_CLIENTID:
+	case -NFS4ERR_DELAY:
+	case -ETIMEDOUT:
+	case -EACCES:
+	case -EAGAIN:
+		/* These leave the clientid slot sequence id untouched */
+		return status;
+	}
+
+	/* Increment the clientid slot sequence id */
+	clp->cl_seqid++;
+	if (status)
+		return status;
+
+	/* Verify the session's negotiated channel_attrs values */
+	status = nfs4_verify_channel_attrs(&args, &res);
+	if (!status)
+		nfs4_update_session(session, &res);
+	return status;
+}
+
+/*
+ * Issues a CREATE_SESSION operation to the server and, when that
+ * succeeds, initialises or resets the session slot tables.
+ * It is the responsibility of the caller to verify the session is
+ * expired before calling this routine.
+ *
+ * Returns 0 on success or the first error from either step.
+ */
+int nfs4_proc_create_session(struct nfs_client *clp, const struct cred *cred)
+{
+	struct nfs4_session *session = clp->cl_session;
+	unsigned *ptr;
+	int status;
+
+	dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
+
+	status = _nfs4_proc_create_session(clp, cred);
+	if (!status) {
+		/* Init or reset the session slot tables */
+		status = nfs4_setup_session_slot_tables(session);
+		dprintk("slot table setup returned %d\n", status);
+	}
+
+	if (!status) {
+		/* Debug dump of the session id as four words */
+		ptr = (unsigned *)&session->sess_id.data[0];
+		dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__,
+			clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]);
+	}
+
+	dprintk("<-- %s\n", __func__);
+	return status;
+}
+
+/*
+ * Issue the over-the-wire RPC DESTROY_SESSION.
+ * The caller must serialize access to this routine.
+ *
+ * Clears NFS4_SESSION_ESTABLISHED; if the bit was not set, no RPC is sent
+ * and 0 is returned.  Otherwise returns the status of the RPC.
+ */
+int nfs4_proc_destroy_session(struct nfs4_session *session,
+		const struct cred *cred)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION],
+		.rpc_argp = session,
+		.rpc_cred = cred,
+	};
+	int rc;
+
+	dprintk("--> nfs4_proc_destroy_session\n");
+
+	/* session is still being setup */
+	if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state))
+		return 0;
+
+	rc = rpc_call_sync(session->clp->cl_rpcclient, &msg,
+			   RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN);
+	trace_nfs4_destroy_session(session->clp, rc);
+
+	if (rc != 0)
+		dprintk("NFS: Got error %d from the server on DESTROY_SESSION. "
+			"Session has been destroyed regardless...\n", rc);
+
+	dprintk("<-- nfs4_proc_destroy_session\n");
+	return rc;
+}
+
+/*
+ * Renew the cl_session lease.
+ *
+ * Per-call state for the SEQUENCE RPC: allocated in _nfs41_proc_sequence()
+ * and freed in nfs41_sequence_release().
+ */
+struct nfs4_sequence_data {
+ struct nfs_client *clp; /* client the call is for; its cl_count reference is dropped on release */
+ struct nfs4_sequence_args args; /* installed as the message's rpc_argp */
+ struct nfs4_sequence_res res; /* installed as the message's rpc_resp */
+};
+
+/*
+ * rpc_release callback for the SEQUENCE call.  Reschedules state renewal
+ * if cl_count shows other references besides ours, then drops our client
+ * reference and frees the call data.
+ */
+static void nfs41_sequence_release(void *data)
+{
+	struct nfs4_sequence_data *seq = data;
+	struct nfs_client *client = seq->clp;
+
+	if (refcount_read(&client->cl_count) > 1)
+		nfs4_schedule_state_renewal(client);
+
+	nfs_put_client(client);
+	kfree(seq);
+}
+
+/*
+ * Decide how to react to task->tk_status of a SEQUENCE call.
+ *
+ * NFS4ERR_DELAY: delay the task by NFS4_POLL_RETRY_MAX and return -EAGAIN
+ * so the caller restarts it.  Any other status: schedule lease recovery
+ * and return 0.
+ */
+static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp)
+{
+	if (task->tk_status == -NFS4ERR_DELAY) {
+		rpc_delay(task, NFS4_POLL_RETRY_MAX);
+		return -EAGAIN;
+	}
+
+	nfs4_schedule_lease_recovery(clp);
+	return 0;
+}
+
+/*
+ * rpc_call_done callback for the SEQUENCE call.
+ *
+ * Returns early if nfs41_sequence_done() returns 0.  On a negative
+ * tk_status the error handler is skipped when cl_count is 1; otherwise
+ * the call is restarted if the handler asks for it with -EAGAIN.
+ */
+static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
+{
+	struct nfs4_sequence_data *seq = data;
+	struct nfs_client *client = seq->clp;
+	bool sole_ref = false;
+
+	if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp))
+		return;
+
+	trace_nfs4_sequence(client, task->tk_status);
+	if (task->tk_status < 0) {
+		dprintk("%s ERROR %d\n", __func__, task->tk_status);
+		sole_ref = refcount_read(&client->cl_count) == 1;
+		if (!sole_ref &&
+		    nfs41_sequence_handle_errors(task, client) == -EAGAIN) {
+			rpc_restart_call_prepare(task);
+			return;
+		}
+	}
+
+	if (!sole_ref)
+		dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
+	dprintk("<-- %s\n", __func__);
+}
+
+/*
+ * rpc_call_prepare callback for the SEQUENCE call: set up the sequence
+ * using the argument/result buffers already attached to the RPC message.
+ */
+static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs4_sequence_data *seq = data;
+	struct nfs4_sequence_args *seq_args = task->tk_msg.rpc_argp;
+	struct nfs4_sequence_res *seq_res = task->tk_msg.rpc_resp;
+
+	nfs4_setup_sequence(seq->clp, seq_args, seq_res, task);
+}
+
+/* RPC callbacks used by _nfs41_proc_sequence() for the SEQUENCE call. */
+static const struct rpc_call_ops nfs41_sequence_ops = {
+	.rpc_call_prepare = nfs41_sequence_prepare,
+	.rpc_call_done = nfs41_sequence_call_done,
+	.rpc_release = nfs41_sequence_release,
+};
+
+/*
+ * Build and start an asynchronous SEQUENCE RPC on @clp's cl_rpcclient.
+ *
+ * @slot is attached to the call with nfs4_sequence_attach_slot(); on every
+ * failure path in this function it is handed to nfs41_release_slot().
+ *
+ * Returns the started rpc_task, or an ERR_PTR: -EIO when no reference on
+ * @clp could be taken, -ENOMEM when the call data could not be allocated,
+ * or whatever rpc_run_task() returned.
+ */
+static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
+ const struct cred *cred,
+ struct nfs4_slot *slot,
+ bool is_privileged)
+{
+ struct nfs4_sequence_data *calldata;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
+ .rpc_cred = cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clp->cl_rpcclient,
+ .rpc_message = &msg,
+ .callback_ops = &nfs41_sequence_ops,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
+ };
+ struct rpc_task *ret;
+
+ /* The call holds a client reference; nfs41_sequence_release() drops it */
+ ret = ERR_PTR(-EIO);
+ if (!refcount_inc_not_zero(&clp->cl_count))
+ goto out_err;
+
+ ret = ERR_PTR(-ENOMEM);
+ calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
+ if (calldata == NULL)
+ goto out_put_clp;
+ nfs4_init_sequence(&calldata->args, &calldata->res, 0, is_privileged);
+ nfs4_sequence_attach_slot(&calldata->args, &calldata->res, slot);
+ msg.rpc_argp = &calldata->args;
+ msg.rpc_resp = &calldata->res;
+ calldata->clp = clp;
+ task_setup_data.callback_data = calldata;
+
+ ret = rpc_run_task(&task_setup_data);
+ /*
+ * NOTE(review): this path skips out_put_clp and does not free calldata;
+ * presumably rpc_run_task() has already run the rpc_release callback on
+ * failure -- confirm against the SUNRPC implementation.
+ */
+ if (IS_ERR(ret))
+ goto out_err;
+ return ret;
+out_put_clp:
+ nfs_put_client(clp);
+out_err:
+ nfs41_release_slot(slot);
+ return ret;
+}
+
+/*
+ * Start a non-privileged SEQUENCE call without waiting for it.
+ *
+ * Returns -EAGAIN without sending anything unless NFS4_RENEW_TIMEOUT is
+ * set in @renew_flags; otherwise 0 once the task was started, or the
+ * error from _nfs41_proc_sequence().
+ */
+static int nfs41_proc_async_sequence(struct nfs_client *clp, const struct cred *cred, unsigned renew_flags)
+{
+	struct rpc_task *task;
+	int ret;
+
+	if (!(renew_flags & NFS4_RENEW_TIMEOUT))
+		return -EAGAIN;
+
+	task = _nfs41_proc_sequence(clp, cred, NULL, false);
+	if (IS_ERR(task)) {
+		ret = PTR_ERR(task);
+	} else {
+		rpc_put_task_async(task);
+		ret = 0;
+	}
+
+	dprintk("<-- %s status=%d\n", __func__, ret);
+	return ret;
+}
+
+/*
+ * Send a privileged SEQUENCE call and wait for it to finish.
+ *
+ * Returns the error from starting or waiting for the task if either
+ * failed, otherwise the task's final tk_status.
+ */
+static int nfs4_proc_sequence(struct nfs_client *clp, const struct cred *cred)
+{
+	struct rpc_task *task;
+	int ret;
+
+	task = _nfs41_proc_sequence(clp, cred, NULL, true);
+	if (!IS_ERR(task)) {
+		ret = rpc_wait_for_completion_task(task);
+		if (ret == 0)
+			ret = task->tk_status;
+		rpc_put_task(task);
+	} else {
+		ret = PTR_ERR(task);
+	}
+
+	dprintk("<-- %s status=%d\n", __func__, ret);
+	return ret;
+}
+
+/*
+ * Per-call state for RECLAIM_COMPLETE: allocated in
+ * nfs41_proc_reclaim_complete() and freed by
+ * nfs4_free_reclaim_complete_data().
+ */
+struct nfs4_reclaim_complete_data {
+ struct nfs_client *clp; /* client the call is issued for */
+ struct nfs41_reclaim_complete_args arg; /* installed as the message's rpc_argp */
+ struct nfs41_reclaim_complete_res res; /* installed as the message's rpc_resp */
+};
+
+/* rpc_call_prepare callback for RECLAIM_COMPLETE: set up the sequence. */
+static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs4_reclaim_complete_data *rc = data;
+	struct nfs4_sequence_args *seq_args = &rc->arg.seq_args;
+	struct nfs4_sequence_res *seq_res = &rc->res.seq_res;
+
+	nfs4_setup_sequence(rc->clp, seq_args, seq_res, task);
+}
+
+/*
+ * Decide how to react to task->tk_status of a RECLAIM_COMPLETE call.
+ *
+ * Returns -EAGAIN when the caller should restart the call (after an
+ * rpc_delay() in the NFS4ERR_DELAY case), 0 otherwise.  Success wakes all
+ * waiters on clp->cl_lock_waitq; statuses not listed explicitly schedule
+ * lease recovery.
+ */
+static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp)
+{
+	switch (task->tk_status) {
+	case -NFS4ERR_DELAY:
+		rpc_delay(task, NFS4_POLL_RETRY_MAX);
+		fallthrough;
+	case -NFS4ERR_RETRY_UNCACHED_REP:
+	case -EACCES:
+		dprintk("%s: failed to reclaim complete error %d for server %s, retrying\n",
+			__func__, task->tk_status, clp->cl_hostname);
+		return -EAGAIN;
+	case 0:
+		wake_up_all(&clp->cl_lock_waitq);
+		break;
+	case -NFS4ERR_COMPLETE_ALREADY:
+	case -NFS4ERR_WRONG_CRED: /* What to do here? */
+	case -NFS4ERR_BADSESSION:
+	case -NFS4ERR_DEADSESSION:
+	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+		/* Nothing further to do for these */
+		break;
+	default:
+		nfs4_schedule_lease_recovery(clp);
+		break;
+	}
+	return 0;
+}
+
+/*
+ * rpc_call_done callback for RECLAIM_COMPLETE.  Returns early if
+ * nfs41_sequence_done() returns 0, and restarts the call when the error
+ * handler asks for a retry.
+ */
+static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
+{
+	struct nfs4_reclaim_complete_data *rc = data;
+	struct nfs_client *client = rc->clp;
+
+	dprintk("--> %s\n", __func__);
+	if (!nfs41_sequence_done(task, &rc->res.seq_res))
+		return;
+
+	trace_nfs4_reclaim_complete(client, task->tk_status);
+	if (nfs41_reclaim_complete_handle_errors(task, client) != -EAGAIN) {
+		dprintk("<-- %s\n", __func__);
+		return;
+	}
+	rpc_restart_call_prepare(task);
+}
+
+/* rpc_release callback for RECLAIM_COMPLETE: free the per-call data. */
+static void nfs4_free_reclaim_complete_data(void *data)
+{
+	kfree(data);
+}
+
+/* RPC callbacks used by nfs41_proc_reclaim_complete(). */
+static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = {
+ .rpc_call_prepare = nfs4_reclaim_complete_prepare,
+ .rpc_call_done = nfs4_reclaim_complete_done,
+ .rpc_release = nfs4_free_reclaim_complete_data,
+};
+
+/*
+ * Issue a global reclaim complete (one_fs == 0) for @clp.
+ *
+ * Returns -ENOMEM if the call data cannot be allocated, otherwise the
+ * result of nfs4_call_sync_custom().
+ */
+static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
+		const struct cred *cred)
+{
+	struct nfs4_reclaim_complete_data *rc;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
+		.rpc_cred = cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = clp->cl_rpcclient,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_reclaim_complete_call_ops,
+		.flags = RPC_TASK_NO_ROUND_ROBIN,
+	};
+	int status;
+
+	dprintk("--> %s\n", __func__);
+	rc = kzalloc(sizeof(*rc), GFP_NOFS);
+	if (rc != NULL) {
+		rc->clp = clp;
+		rc->arg.one_fs = 0;
+
+		nfs4_init_sequence(&rc->arg.seq_args, &rc->res.seq_res, 0, 1);
+		msg.rpc_argp = &rc->arg;
+		msg.rpc_resp = &rc->res;
+		task_setup_data.callback_data = rc;
+		status = nfs4_call_sync_custom(&task_setup_data);
+	} else {
+		status = -ENOMEM;
+	}
+
+	dprintk("<-- %s status=%d\n", __func__, status);
+	return status;
+}
+
+/*
+ * rpc_call_prepare callback for LAYOUTGET: set up the sequence on the
+ * nfs_client that owns the inode's server.
+ */
+static void
+nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutget *lgp = calldata;
+	struct nfs_client *clp = NFS_SERVER(lgp->args.inode)->nfs_client;
+
+	dprintk("--> %s\n", __func__);
+	nfs4_setup_sequence(clp, &lgp->args.seq_args, &lgp->res.seq_res, task);
+	dprintk("<-- %s\n", __func__);
+}
+
+/*
+ * rpc_call_done callback for LAYOUTGET: only processes the sequence
+ * result here.
+ */
+static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutget *layoutget = calldata;
+	struct nfs4_sequence_res *seq_res = &layoutget->res.seq_res;
+
+	dprintk("--> %s\n", __func__);
+	nfs41_sequence_process(task, seq_res);
+	dprintk("<-- %s\n", __func__);
+}
+
+/*
+ * Translate the NFSv4 status of a completed LAYOUTGET (task->tk_status)
+ * into the value nfs4_proc_layoutget() returns to its caller.
+ *
+ * The sequence slot is freed first.  Statuses handled entirely inside the
+ * switch jump straight to "out"; the remaining ones fall out of the switch
+ * and are passed to nfs4_handle_exception().  A status already chosen in
+ * the switch (-EBUSY, -ERECALLCONFLICT) takes precedence over the value
+ * derived from nfs4_handle_exception().
+ *
+ * Returns 0 or a negative errno.
+ */
+static int
+nfs4_layoutget_handle_exception(struct rpc_task *task,
+ struct nfs4_layoutget *lgp, struct nfs4_exception *exception)
+{
+ struct inode *inode = lgp->args.inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct pnfs_layout_hdr *lo;
+ int nfs4err = task->tk_status;
+ int err, status = 0;
+ LIST_HEAD(head);
+
+ dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
+
+ nfs4_sequence_free_slot(&lgp->res.seq_res);
+
+ switch (nfs4err) {
+ case 0:
+ goto out;
+
+ /*
+ * NFS4ERR_LAYOUTUNAVAILABLE means we are not supposed to use pnfs
+ * on the file. Return -ENODATA to tell the upper layer to retry
+ * in-band.
+ */
+ case -NFS4ERR_LAYOUTUNAVAILABLE:
+ status = -ENODATA;
+ goto out;
+ /*
+ * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
+ * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
+ */
+ case -NFS4ERR_BADLAYOUT:
+ status = -EOVERFLOW;
+ goto out;
+ /*
+ * NFS4ERR_LAYOUTTRYLATER is a conflict with another client
+ * (or clients) writing to the same RAID stripe except when
+ * the minlength argument is 0 (see RFC5661 section 18.43.3).
+ *
+ * Treat it like we would RECALLCONFLICT -- we retry for a little
+ * while, and then eventually give up.
+ */
+ case -NFS4ERR_LAYOUTTRYLATER:
+ if (lgp->args.minlength == 0) {
+ status = -EOVERFLOW;
+ goto out;
+ }
+ status = -EBUSY;
+ break;
+ case -NFS4ERR_RECALLCONFLICT:
+ case -NFS4ERR_RETURNCONFLICT:
+ status = -ERECALLCONFLICT;
+ break;
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_BAD_STATEID:
+ exception->timeout = 0;
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ /* If the open stateid was bad, then recover it. */
+ if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
+ !nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) {
+ spin_unlock(&inode->i_lock);
+ exception->state = lgp->args.ctx->state;
+ exception->stateid = &lgp->args.stateid;
+ break;
+ }
+
+ /*
+ * Mark the bad layout state as invalid, then retry
+ */
+ pnfs_mark_layout_stateid_invalid(lo, &head);
+ spin_unlock(&inode->i_lock);
+ /* The segments collected on "head" are freed after i_lock is dropped */
+ nfs_commit_inode(inode, 0);
+ pnfs_free_lseg_list(&head);
+ status = -EAGAIN;
+ goto out;
+ }
+
+ /* Always run the generic handler; its result is used only if status is still 0 */
+ err = nfs4_handle_exception(server, nfs4err, exception);
+ if (!status) {
+ if (exception->retry)
+ status = -EAGAIN;
+ else
+ status = err;
+ }
+out:
+ dprintk("<-- %s\n", __func__);
+ return status;
+}
+
+/*
+ * Number of pages needed to hold the session's fore channel max_resp_sz,
+ * as computed by nfs_page_array_len() with a zero base offset.
+ */
+size_t max_response_pages(struct nfs_server *server)
+{
+	struct nfs4_session *session = server->nfs_client->cl_session;
+	u32 resp_bytes = session->fc_attrs.max_resp_sz;
+
+	return nfs_page_array_len(0, resp_bytes);
+}
+
+/*
+ * rpc_release callback for LAYOUTGET: free the sequence slot and then the
+ * layoutget call data.
+ */
+static void nfs4_layoutget_release(void *calldata)
+{
+	struct nfs4_layoutget *layoutget = calldata;
+
+	dprintk("--> %s\n", __func__);
+
+	nfs4_sequence_free_slot(&layoutget->res.seq_res);
+	pnfs_layoutget_free(layoutget);
+
+	dprintk("<-- %s\n", __func__);
+}
+
+/* RPC callbacks used by nfs4_proc_layoutget(). */
+static const struct rpc_call_ops nfs4_layoutget_call_ops = {
+ .rpc_call_prepare = nfs4_layoutget_prepare,
+ .rpc_call_done = nfs4_layoutget_done,
+ .rpc_release = nfs4_layoutget_release,
+};
+
+/*
+ * Send a LAYOUTGET for @lgp and wait for it to complete.
+ *
+ * @lgp:     prepared call data.  It is handed to the RPC task as callback
+ *           data and freed by the task's release callback
+ *           (nfs4_layoutget_release), not by this function.
+ * @timeout: in: current retry delay; out: updated delay when the call
+ *           failed or the server returned a zero-length layout.
+ *
+ * Returns what pnfs_layout_process() yields when the call succeeded with a
+ * non-empty layout, or an ERR_PTR(): the error from starting or waiting
+ * for the task, the status chosen by nfs4_layoutget_handle_exception(),
+ * or -EAGAIN for a zero-length layout.
+ */
+struct pnfs_layout_segment *
+nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
+{
+	struct inode *inode = lgp->args.inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
+		.rpc_argp = &lgp->args,
+		.rpc_resp = &lgp->res,
+		.rpc_cred = lgp->cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_layoutget_call_ops,
+		.callback_data = lgp,
+		.flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
+	};
+	struct pnfs_layout_segment *lseg = NULL;
+	struct nfs4_exception exception = {
+		.inode = inode,
+		.timeout = *timeout,
+	};
+	int status = 0;
+
+	dprintk("--> %s\n", __func__);
+
+	/* nfs4_layoutget_release calls pnfs_put_layout_hdr */
+	pnfs_get_layout_hdr(NFS_I(inode)->layout);
+
+	nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0);
+
+	task = rpc_run_task(&task_setup_data);
+	/*
+	 * Like the other rpc_run_task() callers in this file, never pass an
+	 * ERR_PTR on to rpc_wait_for_completion_task()/rpc_put_task().
+	 * lgp is deliberately not touched on this path.
+	 * NOTE(review): presumably the release callback has already run and
+	 * freed lgp when rpc_run_task() fails -- confirm against SUNRPC.
+	 */
+	if (IS_ERR(task))
+		return ERR_CAST(task);
+
+	status = rpc_wait_for_completion_task(task);
+	if (status != 0)
+		goto out;
+
+	if (task->tk_status < 0) {
+		status = nfs4_layoutget_handle_exception(task, lgp, &exception);
+		*timeout = exception.timeout;
+	} else if (lgp->res.layoutp->len == 0) {
+		/* Empty layout: ask the caller to retry after a backoff */
+		status = -EAGAIN;
+		*timeout = nfs4_update_delay(&exception.timeout);
+	} else
+		lseg = pnfs_layout_process(lgp);
+out:
+	/* lgp is still referenced here, so trace before dropping the task */
+	trace_nfs4_layoutget(lgp->args.ctx,
+			&lgp->args.range,
+			&lgp->res.range,
+			&lgp->res.stateid,
+			status);
+
+	rpc_put_task(task);
+	dprintk("<-- %s status=%d\n", __func__, status);
+	if (status)
+		return ERR_PTR(status);
+	return lseg;
+}
+
+/*
+ * rpc_call_prepare callback for LAYOUTRETURN: set up the sequence, then
+ * end the task with status 0 if the layout is not valid.
+ */
+static void
+nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutreturn *layoutreturn = calldata;
+
+	dprintk("--> %s\n", __func__);
+	nfs4_setup_sequence(layoutreturn->clp, &layoutreturn->args.seq_args,
+			    &layoutreturn->res.seq_res, task);
+
+	if (pnfs_layout_is_valid(layoutreturn->args.layout))
+		return;
+	rpc_exit(task, 0);
+}
+
+/*
+ * rpc_call_done callback for LAYOUTRETURN.
+ *
+ * Apart from a successful stateid refresh after NFS4ERR_OLD_STATEID and a
+ * retryable NFS4ERR_DELAY, which both restart the call, every NFSv4 error
+ * is turned into success (tk_status = 0).
+ */
+static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_layoutreturn *lrp = calldata;
+ struct nfs_server *server;
+
+ dprintk("--> %s\n", __func__);
+
+ if (!nfs41_sequence_process(task, &lrp->res.seq_res))
+ return;
+
+ /*
+ * Was there an RPC level error? Assume the call succeeded,
+ * and that we need to release the layout
+ */
+ if (task->tk_rpc_status != 0 && RPC_WAS_SENT(task)) {
+ lrp->res.lrs_present = 0;
+ return;
+ }
+
+ server = NFS_SERVER(lrp->args.inode);
+ switch (task->tk_status) {
+ case -NFS4ERR_OLD_STATEID:
+ /* Retry with the refreshed stateid if there is one, else treat as done */
+ if (nfs4_layout_refresh_old_stateid(&lrp->args.stateid,
+ &lrp->args.range,
+ lrp->args.inode))
+ goto out_restart;
+ fallthrough;
+ default:
+ task->tk_status = 0;
+ fallthrough;
+ case 0:
+ break;
+ case -NFS4ERR_DELAY:
+ if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
+ break;
+ goto out_restart;
+ }
+ dprintk("<-- %s\n", __func__);
+ return;
+out_restart:
+ /* Clear the status and give back the slot before re-running prepare */
+ task->tk_status = 0;
+ nfs4_sequence_free_slot(&lrp->res.seq_res);
+ rpc_restart_call_prepare(task);
+}
+
+/*
+ * rpc_release callback for LAYOUTRETURN (also called directly by
+ * nfs4_proc_layoutreturn() when the inode cannot be grabbed).
+ *
+ * Frees the returned layout segments, passing the stateid from the reply
+ * only when lrs_present is set, then releases the slot, the layout
+ * driver's private data, the layout header, the inode, the cred and
+ * finally the call data itself.
+ */
+static void nfs4_layoutreturn_release(void *calldata)
+{
+	struct nfs4_layoutreturn *lrp = calldata;
+	struct pnfs_layout_hdr *lo = lrp->args.layout;
+	const nfs4_stateid *reply_stateid = NULL;
+
+	dprintk("--> %s\n", __func__);
+
+	if (lrp->res.lrs_present)
+		reply_stateid = &lrp->res.stateid;
+	pnfs_layoutreturn_free_lsegs(lo, &lrp->args.stateid, &lrp->args.range,
+				     reply_stateid);
+
+	nfs4_sequence_free_slot(&lrp->res.seq_res);
+
+	if (lrp->ld_private.ops && lrp->ld_private.ops->free)
+		lrp->ld_private.ops->free(&lrp->ld_private);
+
+	pnfs_put_layout_hdr(lrp->args.layout);
+	nfs_iput_and_deactive(lrp->inode);
+	put_cred(lrp->cred);
+	kfree(calldata);
+
+	dprintk("<-- %s\n", __func__);
+}
+
+/* RPC callbacks used by nfs4_proc_layoutreturn(). */
+static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
+ .rpc_call_prepare = nfs4_layoutreturn_prepare,
+ .rpc_call_done = nfs4_layoutreturn_done,
+ .rpc_release = nfs4_layoutreturn_release,
+};
+
+/*
+ * Send a LAYOUTRETURN described by @lrp.
+ *
+ * @sync: when false the task runs with RPC_TASK_ASYNC and the call is
+ *        abandoned with -EAGAIN (releasing @lrp) if the inode cannot be
+ *        grabbed; when true the call is made privileged in that case.
+ *
+ * Returns the task's tk_status for a sync call, 0 for an async call that
+ * was started, or the error from rpc_run_task().
+ */
+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
+{
+	struct nfs_server *server = NFS_SERVER(lrp->args.inode);
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
+		.rpc_argp = &lrp->args,
+		.rpc_resp = &lrp->res,
+		.rpc_cred = lrp->cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_layoutreturn_call_ops,
+		.callback_data = lrp,
+	};
+	int status;
+
+	nfs4_state_protect(server->nfs_client,
+			NFS_SP4_MACH_CRED_PNFS_CLEANUP,
+			&task_setup_data.rpc_client, &msg);
+
+	dprintk("--> %s\n", __func__);
+	lrp->inode = nfs_igrab_and_active(lrp->args.inode);
+	if (!sync) {
+		if (lrp->inode == NULL) {
+			nfs4_layoutreturn_release(lrp);
+			return -EAGAIN;
+		}
+		task_setup_data.flags |= RPC_TASK_ASYNC;
+	}
+
+	/* Privileged only when we could not take the inode */
+	nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1,
+			   lrp->inode ? 0 : 1);
+
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	status = sync ? task->tk_status : 0;
+	trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid, status);
+	dprintk("<-- %s status=%d\n", __func__, status);
+	rpc_put_task(task);
+	return status;
+}
+
+/*
+ * Send one synchronous GETDEVICEINFO for @pdev, asking for CHANGE and
+ * DELETE device id notifications.  If the notification mask in the result
+ * differs from what was requested, @pdev is marked nocache.
+ *
+ * Returns the status of nfs4_call_sync().
+ */
+static int
+_nfs4_proc_getdeviceinfo(struct nfs_server *server,
+		struct pnfs_device *pdev,
+		const struct cred *cred)
+{
+	struct nfs4_getdeviceinfo_args gdi_args = {
+		.pdev = pdev,
+		.notify_types = NOTIFY_DEVICEID4_CHANGE |
+			NOTIFY_DEVICEID4_DELETE,
+	};
+	struct nfs4_getdeviceinfo_res gdi_res = {
+		.pdev = pdev,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
+		.rpc_argp = &gdi_args,
+		.rpc_resp = &gdi_res,
+		.rpc_cred = cred,
+	};
+	int rc;
+
+	dprintk("--> %s\n", __func__);
+	rc = nfs4_call_sync(server->client, server, &msg,
+			    &gdi_args.seq_args, &gdi_res.seq_res, 0);
+
+	/* Bits we never asked for */
+	if (gdi_res.notification & ~gdi_args.notify_types)
+		dprintk("%s: unsupported notification\n", __func__);
+	if (gdi_res.notification != gdi_args.notify_types)
+		pdev->nocache = 1;
+
+	dprintk("<-- %s status=%d\n", __func__, rc);
+
+	return rc;
+}
+
+/*
+ * GETDEVICEINFO with the standard exception loop: repeat the call for as
+ * long as nfs4_handle_exception() requests a retry, then return its last
+ * result.
+ */
+int nfs4_proc_getdeviceinfo(struct nfs_server *server,
+		struct pnfs_device *pdev,
+		const struct cred *cred)
+{
+	struct nfs4_exception exception = { };
+	int err;
+
+	do {
+		int status = _nfs4_proc_getdeviceinfo(server, pdev, cred);
+
+		err = nfs4_handle_exception(server, status, &exception);
+	} while (exception.retry);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
+
+/*
+ * rpc_call_prepare callback for LAYOUTCOMMIT: set up the sequence on the
+ * nfs_client that owns the inode's server.
+ */
+static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutcommit_data *lc = calldata;
+	struct nfs_client *clp = NFS_SERVER(lc->args.inode)->nfs_client;
+
+	nfs4_setup_sequence(clp, &lc->args.seq_args, &lc->res.seq_res, task);
+}
+
+/*
+ * rpc_call_done callback for LAYOUTCOMMIT.
+ *
+ * Returns early if nfs41_sequence_done() returns 0.  A fixed set of NFSv4
+ * errors is deliberately ignored by rewriting tk_status to 0; any other
+ * error goes through nfs4_async_handle_error() and the call is restarted
+ * when that returns -EAGAIN.
+ */
+static void
+nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutcommit_data *data = calldata;
+	struct nfs_server *server = NFS_SERVER(data->args.inode);
+
+	if (!nfs41_sequence_done(task, &data->res.seq_res))
+		return;
+
+	switch (task->tk_status) { /* Just ignore these failures */
+	case -NFS4ERR_DELEG_REVOKED: /* layout was recalled */
+	case -NFS4ERR_BADIOMODE:     /* no IOMODE_RW layout for range */
+	case -NFS4ERR_BADLAYOUT:     /* no layout */
+	case -NFS4ERR_GRACE:	     /* loca_reclaim always false */
+		task->tk_status = 0;
+		fallthrough;	/* annotated like the other switches in this file */
+	case 0:
+		break;
+	default:
+		if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
+			rpc_restart_call_prepare(task);
+			return;
+		}
+	}
+}
+
+/*
+ * rpc_release callback for LAYOUTCOMMIT (also called directly by
+ * nfs4_proc_layoutcommit() when the inode cannot be grabbed).
+ *
+ * Runs the pNFS layoutcommit cleanup, updates the inode from the returned
+ * attributes, then drops the cred and inode and frees the call data.
+ */
+static void nfs4_layoutcommit_release(void *calldata)
+{
+	struct nfs4_layoutcommit_data *lc = calldata;
+
+	pnfs_cleanup_layoutcommit(lc);
+	nfs_post_op_update_inode_force_wcc(lc->args.inode, lc->res.fattr);
+
+	put_cred(lc->cred);
+	nfs_iput_and_deactive(lc->inode);
+	kfree(lc);
+}
+
+/* RPC callbacks used by nfs4_proc_layoutcommit(). */
+static const struct rpc_call_ops nfs4_layoutcommit_ops = {
+ .rpc_call_prepare = nfs4_layoutcommit_prepare,
+ .rpc_call_done = nfs4_layoutcommit_done,
+ .rpc_release = nfs4_layoutcommit_release,
+};
+
+/*
+ * Send a LAYOUTCOMMIT described by @data, using the rpc_task embedded in
+ * @data.
+ *
+ * @sync: when false the inode is grabbed and the task runs with
+ *        RPC_TASK_ASYNC; if the inode cannot be grabbed, @data is released
+ *        and -EAGAIN is returned.
+ *
+ * Returns the task's tk_status for a sync call, 0 for an async call that
+ * was started, or the error from rpc_run_task().
+ */
+int
+nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT],
+		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
+		.rpc_cred = data->cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.task = &data->task,
+		.rpc_client = NFS_CLIENT(data->args.inode),
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_layoutcommit_ops,
+		.callback_data = data,
+	};
+	struct rpc_task *task;
+	int status;
+
+	dprintk("NFS: initiating layoutcommit call. sync %d "
+		"lbw: %llu inode %lu\n", sync,
+		data->args.lastbytewritten,
+		data->args.inode->i_ino);
+
+	if (!sync) {
+		data->inode = nfs_igrab_and_active(data->args.inode);
+		if (!data->inode) {
+			nfs4_layoutcommit_release(data);
+			return -EAGAIN;
+		}
+		task_setup_data.flags = RPC_TASK_ASYNC;
+	}
+
+	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0);
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	status = sync ? task->tk_status : 0;
+	trace_nfs4_layoutcommit(data->args.inode, &data->args.stateid, status);
+	dprintk("%s: status %d\n", __func__, status);
+	rpc_put_task(task);
+	return status;
+}
+
+/*
+ * Send one SECINFO_NO_NAME (style SECINFO_STYLE_CURRENT_FH) and store the
+ * reply in @flavors.
+ *
+ * With @use_integrity the call goes out on the state management
+ * nfs_client's cl_rpcclient, which uses krb5i (if possible) as per
+ * RFC3530bis and RFC5661 Security Considerations sections, using the
+ * cred from nfs4_get_clid_cred().  Without it the call uses
+ * server->client and no explicit cred is set on the message.
+ *
+ * Returns the status of nfs4_call_sync_custom().
+ */
+static int
+_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_fsinfo *info,
+		struct nfs4_secinfo_flavors *flavors, bool use_integrity)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs41_secinfo_no_name_args sn_args = {
+		.style = SECINFO_STYLE_CURRENT_FH,
+	};
+	struct nfs4_secinfo_res sn_res = {
+		.flavors = flavors,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO_NO_NAME],
+		.rpc_argp = &sn_args,
+		.rpc_resp = &sn_res,
+	};
+	struct nfs4_call_sync_data data = {
+		.seq_server = server,
+		.seq_args = &sn_args.seq_args,
+		.seq_res = &sn_res.seq_res,
+	};
+	struct rpc_task_setup task_setup = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = clp->cl_mvops->call_sync_ops,
+		.callback_data = &data,
+		.flags = RPC_TASK_NO_ROUND_ROBIN,
+	};
+	const struct cred *cred = NULL;
+	int status;
+
+	if (use_integrity) {
+		task_setup.rpc_client = clp->cl_rpcclient;
+
+		cred = nfs4_get_clid_cred(clp);
+		msg.rpc_cred = cred;
+	}
+
+	dprintk("--> %s\n", __func__);
+	nfs4_init_sequence(&sn_args.seq_args, &sn_res.seq_res, 0, 0);
+	status = nfs4_call_sync_custom(&task_setup);
+	dprintk("<-- %s status=%d\n", __func__, status);
+
+	/* cred stays NULL when integrity protection was not requested */
+	put_cred(cred);
+
+	return status;
+}
+
+/*
+ * SECINFO_NO_NAME with retry handling.
+ *
+ * Each round first tries the integrity-protected variant when the client
+ * is integrity protected, and falls back to the plain variant when that
+ * is unavailable or returns NFS4ERR_WRONGSEC (which is disallowed by
+ * spec, but exists in deployed servers).  0, -NFS4ERR_WRONGSEC and
+ * -ENOTSUPP are returned to the caller as-is; anything else goes through
+ * nfs4_handle_exception(), which decides whether to loop.
+ */
+static int
+nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors)
+{
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int err;
+
+	do {
+		/* Pretend WRONGSEC so the fallback runs if integrity is unused */
+		err = -NFS4ERR_WRONGSEC;
+
+		/* try to use integrity protection with machine cred */
+		if (_nfs4_is_integrity_protected(server->nfs_client))
+			err = _nfs41_proc_secinfo_no_name(server, fhandle, info,
+							  flavors, true);
+
+		/* fall back to the filesystem's rpc_client and the user cred */
+		if (err == -NFS4ERR_WRONGSEC)
+			err = _nfs41_proc_secinfo_no_name(server, fhandle, info,
+							  flavors, false);
+
+		if (err == 0 || err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP)
+			return err;
+
+		err = nfs4_handle_exception(server, err, &exception);
+	} while (exception.retry);
+
+	return err;
+}
+
+/*
+ * Pick a security flavor for the root filehandle using SECINFO_NO_NAME.
+ *
+ * The flavors the server lists are tried in order; each one that maps to
+ * a pseudoflavor permitted by server->auth_info is passed to
+ * nfs4_lookup_root_sec() until one succeeds.  If SECINFO_NO_NAME returns
+ * -NFS4ERR_WRONGSEC or -ENOTSUPP, fall back on the "guess and check"
+ * method, nfs4_find_root_sec().
+ *
+ * Returns 0 on success, -ENOMEM if the reply page cannot be allocated,
+ * -EPERM if the last flavor examined was unusable (or none was listed),
+ * and otherwise the last error seen, with -EACCES mapped to -EPERM.
+ */
+static int
+nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_fsinfo *info)
+{
+	rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
+	struct nfs4_secinfo_flavors *flavors;
+	struct nfs4_secinfo4 *secinfo;
+	struct page *page;
+	int err;
+	int i;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	flavors = page_address(page);
+	err = nfs41_proc_secinfo_no_name(server, fhandle, info, flavors);
+
+	if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) {
+		err = nfs4_find_root_sec(server, fhandle, info);
+		goto out_freepage;
+	}
+	if (err)
+		goto out_freepage;
+
+	for (i = 0; i < flavors->num_flavors; i++) {
+		secinfo = &flavors->flavors[i];
+
+		if (secinfo->flavor == RPC_AUTH_NULL ||
+		    secinfo->flavor == RPC_AUTH_UNIX ||
+		    secinfo->flavor == RPC_AUTH_GSS)
+			flavor = rpcauth_get_pseudoflavor(secinfo->flavor,
+					&secinfo->flavor_info);
+		else
+			flavor = RPC_AUTH_MAXFLAVOR;
+
+		if (!nfs_auth_info_match(&server->auth_info, flavor))
+			flavor = RPC_AUTH_MAXFLAVOR;
+
+		/* RPC_AUTH_MAXFLAVOR marks "not usable"; try the next entry */
+		if (flavor == RPC_AUTH_MAXFLAVOR)
+			continue;
+
+		err = nfs4_lookup_root_sec(server, fhandle, info, flavor);
+		if (!err)
+			break;
+	}
+
+	if (flavor == RPC_AUTH_MAXFLAVOR)
+		err = -EPERM;
+
+out_freepage:
+	put_page(page);
+	return err == -EACCES ? -EPERM : err;
+}
+
+/*
+ * Send one privileged TEST_STATEID for @stateid.
+ *
+ * Returns the call status if the compound itself failed, otherwise the
+ * negated per-stateid status from the reply.
+ */
+static int _nfs41_test_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		const struct cred *cred)
+{
+	struct nfs41_test_stateid_args ts_args = {
+		.stateid = stateid,
+	};
+	struct nfs41_test_stateid_res ts_res;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID],
+		.rpc_argp = &ts_args,
+		.rpc_resp = &ts_res,
+		.rpc_cred = cred,
+	};
+	struct rpc_clnt *rpc_client = server->client;
+	int status;
+
+	nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID,
+		&rpc_client, &msg);
+
+	dprintk("NFS call test_stateid %p\n", stateid);
+	nfs4_init_sequence(&ts_args.seq_args, &ts_res.seq_res, 0, 1);
+	status = nfs4_call_sync_sequence(rpc_client, server, &msg,
+			&ts_args.seq_args, &ts_res.seq_res);
+	if (status == NFS_OK) {
+		dprintk("NFS reply test_stateid: succeeded, %d\n", -ts_res.status);
+		return -ts_res.status;
+	}
+
+	dprintk("NFS reply test_stateid: failed, %d\n", status);
+	return status;
+}
+
+/*
+ * Limited exception handling for callers that only want to retry on
+ * delay and session errors.
+ *
+ * exception->retry is cleared first.  Delay-type errors are passed to
+ * nfs4_handle_exception(), session errors to nfs4_do_handle_exception();
+ * every other value of @err leaves retry cleared.
+ */
+static void nfs4_handle_delay_or_session_error(struct nfs_server *server,
+		int err, struct nfs4_exception *exception)
+{
+	exception->retry = 0;
+
+	switch (err) {
+	case -NFS4ERR_DELAY:
+	case -NFS4ERR_RETRY_UNCACHED_REP:
+		nfs4_handle_exception(server, err, exception);
+		return;
+	case -NFS4ERR_BADSESSION:
+	case -NFS4ERR_BADSLOT:
+	case -NFS4ERR_BAD_HIGH_SLOT:
+	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+	case -NFS4ERR_DEADSESSION:
+		nfs4_do_handle_exception(server, err, exception);
+		return;
+	default:
+		return;
+	}
+}
+
+/**
+ * nfs41_test_stateid - perform a TEST_STATEID operation
+ *
+ * @server: server / transport on which to perform the operation
+ * @stateid: state ID to test
+ * @cred: credential
+ *
+ * The operation is repeated for as long as the delay/session error
+ * handler requests a retry.
+ *
+ * Returns NFS_OK if the server recognizes that "stateid" is valid.
+ * Otherwise a negative NFS4ERR value is returned if the operation
+ * failed or the state ID is not currently valid.
+ */
+static int nfs41_test_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		const struct cred *cred)
+{
+	struct nfs4_exception exception = {
+		.interruptible = true,
+	};
+	int err;
+
+	for (;;) {
+		err = _nfs41_test_stateid(server, stateid, cred);
+		nfs4_handle_delay_or_session_error(server, err, &exception);
+		if (!exception.retry)
+			return err;
+	}
+}
+
+/*
+ * Per-call state for FREE_STATEID: allocated in nfs41_free_stateid() and
+ * freed by nfs41_free_stateid_release().
+ */
+struct nfs_free_stateid_data {
+ struct nfs_server *server; /* server the stateid belongs to; used for sequence setup and error handling */
+ struct nfs41_free_stateid_args args; /* installed as the message's rpc_argp; holds a copy of the stateid */
+ struct nfs41_free_stateid_res res; /* installed as the message's rpc_resp */
+};
+
+/* rpc_call_prepare callback for FREE_STATEID: set up the sequence. */
+static void nfs41_free_stateid_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs_free_stateid_data *fs = calldata;
+	struct nfs_client *clp = fs->server->nfs_client;
+
+	nfs4_setup_sequence(clp, &fs->args.seq_args, &fs->res.seq_res, task);
+}
+
+/*
+ * rpc_call_done callback for FREE_STATEID.  Only NFS4ERR_DELAY is acted
+ * on: the call is restarted when the async error handler returns -EAGAIN.
+ */
+static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs_free_stateid_data *fs = calldata;
+
+	nfs41_sequence_done(task, &fs->res.seq_res);
+
+	if (task->tk_status != -NFS4ERR_DELAY)
+		return;
+	if (nfs4_async_handle_error(task, fs->server, NULL, NULL) == -EAGAIN)
+		rpc_restart_call_prepare(task);
+}
+
+/* rpc_release callback for FREE_STATEID: free the per-call data. */
+static void nfs41_free_stateid_release(void *calldata)
+{
+ kfree(calldata);
+}
+
+/* RPC callbacks used by nfs41_free_stateid(). */
+static const struct rpc_call_ops nfs41_free_stateid_ops = {
+ .rpc_call_prepare = nfs41_free_stateid_prepare,
+ .rpc_call_done = nfs41_free_stateid_done,
+ .rpc_release = nfs41_free_stateid_release,
+};
+
+/**
+ * nfs41_free_stateid - perform a FREE_STATEID operation
+ *
+ * @server: server / transport on which to perform the operation
+ * @stateid: state ID to release
+ * @cred: credential
+ * @privileged: set to true if this call needs to be privileged
+ *
+ * Note: this function is always asynchronous.
+ *
+ * Returns 0 once the task has been started, -ENOMEM if the call data
+ * cannot be allocated, or the error from rpc_run_task().
+ */
+static int nfs41_free_stateid(struct nfs_server *server,
+		const nfs4_stateid *stateid,
+		const struct cred *cred,
+		bool privileged)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID],
+		.rpc_cred = cred,
+	};
+	struct rpc_task_setup task_setup = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs41_free_stateid_ops,
+		.flags = RPC_TASK_ASYNC,
+	};
+	struct nfs_free_stateid_data *fs;
+	struct rpc_task *task;
+
+	nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID,
+		&task_setup.rpc_client, &msg);
+
+	dprintk("NFS call free_stateid %p\n", stateid);
+	fs = kmalloc(sizeof(*fs), GFP_NOFS);
+	if (fs == NULL)
+		return -ENOMEM;
+
+	/* The call works on its own copy of the stateid */
+	fs->server = server;
+	nfs4_stateid_copy(&fs->args.stateid, stateid);
+
+	task_setup.callback_data = fs;
+	msg.rpc_argp = &fs->args;
+	msg.rpc_resp = &fs->res;
+	nfs4_init_sequence(&fs->args.seq_args, &fs->res.seq_res, 1, privileged);
+
+	task = rpc_run_task(&task_setup);
+	if (!IS_ERR(task)) {
+		rpc_put_task(task);
+		return 0;
+	}
+	return PTR_ERR(task);
+}
+
+/*
+ * Start a non-privileged FREE_STATEID for the lock stateid using the
+ * state owner's cred, then free the lock state itself.  The result of the
+ * FREE_STATEID request is not checked.
+ */
+static void
+nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
+{
+	nfs41_free_stateid(server, &lsp->ls_stateid,
+			   lsp->ls_state->owner->so_cred, false);
+	nfs4_free_lock_state(server, lsp);
+}
+
+/*
+ * Compare two stateids: type and "other" must be identical, and the
+ * seqids must either be equal or at least one of them must be zero.
+ */
+static bool nfs41_match_stateid(const nfs4_stateid *s1,
+		const nfs4_stateid *s2)
+{
+	if (s1->type != s2->type)
+		return false;
+	if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0)
+		return false;
+
+	return s1->seqid == s2->seqid || s1->seqid == 0 || s2->seqid == 0;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Stateid comparison defined outside the CONFIG_NFS_V4_1 section; simply
+ * defers to nfs4_stateid_match().
+ */
+static bool nfs4_match_stateid(const nfs4_stateid *s1,
+ const nfs4_stateid *s2)
+{
+ return nfs4_stateid_match(s1, s2);
+}
+
+
+/*
+ * Recovery operations installed as .reboot_recovery_ops of the NFSv4.0
+ * minor version ops. Keyed on the *_RECLAIM_REBOOT owner/state flag bits
+ * and wired to the reclaim variants of the open and lock handlers.
+ */
+static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
+ .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
+ .state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
+ .recover_open = nfs4_open_reclaim,
+ .recover_lock = nfs4_lock_reclaim,
+ .establish_clid = nfs4_init_clientid,
+ .detect_trunking = nfs40_discover_server_trunking,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Reboot recovery operations shared by the NFSv4.1 and NFSv4.2 minor
+ * version ops. Compared with the v4.0 table it uses the nfs41_ variants
+ * for client id setup and trunking detection, and adds reclaim_complete.
+ */
+static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
+ .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
+ .state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
+ .recover_open = nfs4_open_reclaim,
+ .recover_lock = nfs4_lock_reclaim,
+ .establish_clid = nfs41_init_clientid,
+ .reclaim_complete = nfs41_proc_reclaim_complete,
+ .detect_trunking = nfs41_discover_server_trunking,
+};
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Recovery operations installed as .nograce_recovery_ops of the NFSv4.0
+ * minor version ops. Keyed on the *_RECLAIM_NOGRACE flag bits and wired
+ * to the "expired" variants of the open and lock handlers.
+ */
+static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
+ .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
+ .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
+ .recover_open = nfs40_open_expired,
+ .recover_lock = nfs4_lock_expired,
+ .establish_clid = nfs4_init_clientid,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * No-grace recovery operations shared by the NFSv4.1 and NFSv4.2 minor
+ * version ops; same flag bits as the v4.0 table, nfs41_ handlers.
+ */
+static const struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
+ .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
+ .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
+ .recover_open = nfs41_open_expired,
+ .recover_lock = nfs41_lock_expired,
+ .establish_clid = nfs41_init_clientid,
+};
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Lease maintenance operations for NFSv4.0: asynchronous and synchronous
+ * renewal go through the nfs4_proc_*renew helpers, with the credential
+ * supplied by nfs4_get_renew_cred.
+ */
+static const struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = {
+ .sched_state_renewal = nfs4_proc_async_renew,
+ .get_state_renewal_cred = nfs4_get_renew_cred,
+ .renew_lease = nfs4_proc_renew,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Lease maintenance operations for NFSv4.1 and later: renewal goes
+ * through the *_sequence helpers and uses the machine credential.
+ */
+static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
+ .sched_state_renewal = nfs41_proc_async_sequence,
+ .get_state_renewal_cred = nfs4_get_machine_cred,
+ .renew_lease = nfs4_proc_sequence,
+};
+#endif
+
+/* Migration recovery operations installed in the NFSv4.0 minor version ops. */
+static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = {
+ .get_locations = _nfs40_proc_get_locations,
+ .fsid_present = _nfs40_proc_fsid_present,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+/* Migration recovery operations shared by the NFSv4.1 and NFSv4.2 ops. */
+static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = {
+ .get_locations = _nfs41_proc_get_locations,
+ .fsid_present = _nfs41_proc_fsid_present,
+};
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Operations and initial capability mask for NFSv4 minor version 0.
+ * Registered as entry [0] of nfs_v4_minor_ops[].
+ */
+static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
+ .minor_version = 0,
+ .init_caps = NFS_CAP_READDIRPLUS
+ | NFS_CAP_ATOMIC_OPEN
+ | NFS_CAP_POSIX_LOCK,
+ .init_client = nfs40_init_client,
+ .shutdown_client = nfs40_shutdown_client,
+ .match_stateid = nfs4_match_stateid,
+ .find_root_sec = nfs4_find_root_sec,
+ .free_lock_state = nfs4_release_lockowner,
+ .test_and_free_expired = nfs40_test_and_free_expired_stateid,
+ .alloc_seqid = nfs_alloc_seqid,
+ .call_sync_ops = &nfs40_call_sync_ops,
+ .reboot_recovery_ops = &nfs40_reboot_recovery_ops,
+ .nograce_recovery_ops = &nfs40_nograce_recovery_ops,
+ .state_renewal_ops = &nfs40_state_renewal_ops,
+ .mig_recovery_ops = &nfs40_mig_recovery_ops,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * alloc_seqid implementation for NFSv4.1 and later: ignores both
+ * arguments and never allocates, always returning NULL.
+ */
+static struct nfs_seqid *
+nfs_alloc_no_seqid(struct nfs_seqid_counter *arg1, gfp_t arg2)
+{
+ return NULL;
+}
+
+/*
+ * Operations and initial capability mask for NFSv4 minor version 1.
+ * Registered as entry [1] of nfs_v4_minor_ops[]. Unlike the v4.0 table
+ * it installs a session_trunk handler and the no-op seqid allocator.
+ */
+static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
+ .minor_version = 1,
+ .init_caps = NFS_CAP_READDIRPLUS
+ | NFS_CAP_ATOMIC_OPEN
+ | NFS_CAP_POSIX_LOCK
+ | NFS_CAP_STATEID_NFSV41
+ | NFS_CAP_ATOMIC_OPEN_V1
+ | NFS_CAP_LGOPEN,
+ .init_client = nfs41_init_client,
+ .shutdown_client = nfs41_shutdown_client,
+ .match_stateid = nfs41_match_stateid,
+ .find_root_sec = nfs41_find_root_sec,
+ .free_lock_state = nfs41_free_lock_state,
+ .test_and_free_expired = nfs41_test_and_free_expired_stateid,
+ .alloc_seqid = nfs_alloc_no_seqid,
+ .session_trunk = nfs4_test_session_trunk,
+ .call_sync_ops = &nfs41_call_sync_ops,
+ .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
+ .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
+ .state_renewal_ops = &nfs41_state_renewal_ops,
+ .mig_recovery_ops = &nfs41_mig_recovery_ops,
+};
+#endif
+
+#if defined(CONFIG_NFS_V4_2)
+/*
+ * Operations and initial capability mask for NFSv4 minor version 2.
+ * Registered as entry [2] of nfs_v4_minor_ops[]. Every handler is the
+ * same one used by the v4.1 table; only the capability mask is larger.
+ * NOTE(review): the nfs41_ symbols referenced here are defined under
+ * CONFIG_NFS_V4_1, so CONFIG_NFS_V4_2 presumably implies it - confirm.
+ */
+static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
+ .minor_version = 2,
+ .init_caps = NFS_CAP_READDIRPLUS
+ | NFS_CAP_ATOMIC_OPEN
+ | NFS_CAP_POSIX_LOCK
+ | NFS_CAP_STATEID_NFSV41
+ | NFS_CAP_ATOMIC_OPEN_V1
+ | NFS_CAP_LGOPEN
+ | NFS_CAP_ALLOCATE
+ | NFS_CAP_COPY
+ | NFS_CAP_OFFLOAD_CANCEL
+ | NFS_CAP_COPY_NOTIFY
+ | NFS_CAP_DEALLOCATE
+ | NFS_CAP_SEEK
+ | NFS_CAP_LAYOUTSTATS
+ | NFS_CAP_CLONE
+ | NFS_CAP_LAYOUTERROR
+ | NFS_CAP_READ_PLUS,
+ .init_client = nfs41_init_client,
+ .shutdown_client = nfs41_shutdown_client,
+ .match_stateid = nfs41_match_stateid,
+ .find_root_sec = nfs41_find_root_sec,
+ .free_lock_state = nfs41_free_lock_state,
+ .call_sync_ops = &nfs41_call_sync_ops,
+ .test_and_free_expired = nfs41_test_and_free_expired_stateid,
+ .alloc_seqid = nfs_alloc_no_seqid,
+ .session_trunk = nfs4_test_session_trunk,
+ .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
+ .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
+ .state_renewal_ops = &nfs41_state_renewal_ops,
+ .mig_recovery_ops = &nfs41_mig_recovery_ops,
+};
+#endif
+
+/*
+ * Minor version ops tables, indexed by NFSv4 minor version number.
+ * Entries for 4.1 and 4.2 are only present when the corresponding
+ * CONFIG option is enabled.
+ */
+const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
+ [0] = &nfs_v4_0_minor_ops,
+#if defined(CONFIG_NFS_V4_1)
+ [1] = &nfs_v4_1_minor_ops,
+#endif
+#if defined(CONFIG_NFS_V4_2)
+ [2] = &nfs_v4_2_minor_ops,
+#endif
+};
+
+/*
+ * Build the xattr name list for @dentry from three sources in order:
+ * the generic handlers, the NFSv4 security label and the NFSv4 user
+ * xattrs. After each source, a non-NULL @list is advanced and @size is
+ * reduced by what that source reported. Returns the sum of the three
+ * lengths, or the first negative error encountered.
+ */
+static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+	ssize_t total, len;
+
+	len = generic_listxattr(dentry, list, size);
+	if (len < 0)
+		return len;
+	total = len;
+	if (list != NULL) {
+		list += len;
+		size -= len;
+	}
+
+	len = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size);
+	if (len < 0)
+		return len;
+	total += len;
+	if (list != NULL) {
+		list += len;
+		size -= len;
+	}
+
+	len = nfs4_listxattr_nfs4_user(d_inode(dentry), list, size);
+	if (len < 0)
+		return len;
+
+	return total + len;
+}
+
+/*
+ * enable_swap hook: schedule the state manager for the client that
+ * serves @inode.
+ */
+static void nfs4_enable_swap(struct inode *inode)
+{
+	/* The state manager thread must always be running.
+	 * It will notice the client is a swapper, and stay put.
+	 */
+	nfs4_schedule_state_manager(NFS_SERVER(inode)->nfs_client);
+}
+
+/*
+ * disable_swap hook: flag the state manager of the client serving @inode
+ * to run, mark it as no longer available, and wake anything waiting on
+ * clp->cl_state.
+ */
+static void nfs4_disable_swap(struct inode *inode)
+{
+ /* The state manager thread will now exit once it is
+ * woken.
+ */
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+
+ /* Both bits are updated before the wake-up so the woken thread
+ * observes them.
+ */
+ set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+ wake_up_var(&clp->cl_state);
+}
+
+/*
+ * Inode operations for NFSv4 directories (installed as .dir_inode_ops of
+ * nfs_v4_clientops). All handlers are the common nfs_ ones except
+ * listxattr, which is NFSv4 specific.
+ */
+static const struct inode_operations nfs4_dir_inode_operations = {
+ .create = nfs_create,
+ .lookup = nfs_lookup,
+ .atomic_open = nfs_atomic_open,
+ .link = nfs_link,
+ .unlink = nfs_unlink,
+ .symlink = nfs_symlink,
+ .mkdir = nfs_mkdir,
+ .rmdir = nfs_rmdir,
+ .mknod = nfs_mknod,
+ .rename = nfs_rename,
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+ .listxattr = nfs4_listxattr,
+};
+
+/*
+ * Inode operations for NFSv4 non-directory inodes (installed as
+ * .file_inode_ops of nfs_v4_clientops).
+ */
+static const struct inode_operations nfs4_file_inode_operations = {
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+ .listxattr = nfs4_listxattr,
+};
+
+/*
+ * The NFS version 4 implementation of the generic nfs_rpc_ops interface:
+ * it binds the protocol-independent entry points to their nfs4_
+ * implementations and to the NFSv4 dentry, inode and file operation
+ * tables.
+ */
+const struct nfs_rpc_ops nfs_v4_clientops = {
+ .version = 4, /* protocol version */
+ .dentry_ops = &nfs4_dentry_operations,
+ .dir_inode_ops = &nfs4_dir_inode_operations,
+ .file_inode_ops = &nfs4_file_inode_operations,
+ .file_ops = &nfs4_file_operations,
+ .getroot = nfs4_proc_get_root,
+ .submount = nfs4_submount,
+ .try_get_tree = nfs4_try_get_tree,
+ .getattr = nfs4_proc_getattr,
+ .setattr = nfs4_proc_setattr,
+ .lookup = nfs4_proc_lookup,
+ .lookupp = nfs4_proc_lookupp,
+ .access = nfs4_proc_access,
+ .readlink = nfs4_proc_readlink,
+ .create = nfs4_proc_create,
+ .remove = nfs4_proc_remove,
+ .unlink_setup = nfs4_proc_unlink_setup,
+ .unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare,
+ .unlink_done = nfs4_proc_unlink_done,
+ .rename_setup = nfs4_proc_rename_setup,
+ .rename_rpc_prepare = nfs4_proc_rename_rpc_prepare,
+ .rename_done = nfs4_proc_rename_done,
+ .link = nfs4_proc_link,
+ .symlink = nfs4_proc_symlink,
+ .mkdir = nfs4_proc_mkdir,
+ .rmdir = nfs4_proc_rmdir,
+ .readdir = nfs4_proc_readdir,
+ .mknod = nfs4_proc_mknod,
+ .statfs = nfs4_proc_statfs,
+ .fsinfo = nfs4_proc_fsinfo,
+ .pathconf = nfs4_proc_pathconf,
+ .set_capabilities = nfs4_server_capabilities,
+ .decode_dirent = nfs4_decode_dirent,
+ .pgio_rpc_prepare = nfs4_proc_pgio_rpc_prepare,
+ .read_setup = nfs4_proc_read_setup,
+ .read_done = nfs4_read_done,
+ .write_setup = nfs4_proc_write_setup,
+ .write_done = nfs4_write_done,
+ .commit_setup = nfs4_proc_commit_setup,
+ .commit_rpc_prepare = nfs4_proc_commit_rpc_prepare,
+ .commit_done = nfs4_commit_done,
+ .lock = nfs4_proc_lock,
+ .clear_acl_cache = nfs4_zap_acl_attr,
+ .close_context = nfs4_close_context,
+ .open_context = nfs4_atomic_open,
+ .have_delegation = nfs4_have_delegation,
+ .alloc_client = nfs4_alloc_client,
+ .init_client = nfs4_init_client,
+ .free_client = nfs4_free_client,
+ .create_server = nfs4_create_server,
+ /* The only entry that reuses the version-independent helper. */
+ .clone_server = nfs_clone_server,
+ .enable_swap = nfs4_enable_swap,
+ .disable_swap = nfs4_disable_swap,
+};
+
+/*
+ * xattr handler for the NFSv4 ACL attribute. It matches one exact
+ * attribute name (.name) rather than a prefix.
+ */
+static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
+ .name = XATTR_NAME_NFSV4_ACL,
+ .list = nfs4_xattr_list_nfs4_acl,
+ .get = nfs4_xattr_get_nfs4_acl,
+ .set = nfs4_xattr_set_nfs4_acl,
+};
+
+#ifdef CONFIG_NFS_V4_2
+/*
+ * xattr handler for the user namespace (matched by prefix). It has no
+ * .list callback; nfs4_listxattr() calls nfs4_listxattr_nfs4_user()
+ * directly for listing.
+ */
+static const struct xattr_handler nfs4_xattr_nfs4_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = nfs4_xattr_get_nfs4_user,
+ .set = nfs4_xattr_set_nfs4_user,
+};
+#endif
+
+/*
+ * NULL-terminated list of the xattr handlers supported by NFSv4. The
+ * label and user handlers are only included when their CONFIG options
+ * are enabled.
+ */
+const struct xattr_handler *nfs4_xattr_handlers[] = {
+ &nfs4_xattr_nfs4_acl_handler,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ &nfs4_xattr_nfs4_label_handler,
+#endif
+#ifdef CONFIG_NFS_V4_2
+ &nfs4_xattr_nfs4_user_handler,
+#endif
+ NULL
+};
+
+/*
+ * Local variables:
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
new file mode 100644
index 000000000..ff876dda7
--- /dev/null
+++ b/fs/nfs/nfs4renewd.c
@@ -0,0 +1,157 @@
+/*
+ * fs/nfs/nfs4renewd.c
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Kendrick Smith <kmsmith@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Implementation of the NFSv4 "renew daemon", which wakes up periodically to
+ * send a RENEW, to keep state alive on the server. The daemon is implemented
+ * as a delayed work item (cl_renewd) queued on the system workqueue, not a
+ * real kernel thread. There is one renewd per nfs_client.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/clnt.h>
+
+#include <linux/nfs.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
+#include "delegation.h"
+
+#define NFSDBG_FACILITY NFSDBG_STATE
+
+/*
+ * nfs4_renew_state - delayed work handler that keeps the lease alive
+ * @work: the work item embedded in nfs_client.cl_renewd
+ *
+ * Decides whether a renewal is due, queues an asynchronous renewal
+ * through the minor version's state_renewal_ops, and normally re-arms
+ * itself through nfs4_schedule_state_renewal(). The handler is not
+ * re-armed when renewal has been stopped, when no credential is
+ * available and no delegations are held, or when sched_state_renewal()
+ * returns anything other than -EAGAIN or -ENOMEM.
+ */
+void
+nfs4_renew_state(struct work_struct *work)
+{
+ const struct nfs4_state_maintenance_ops *ops;
+ struct nfs_client *clp =
+ container_of(work, struct nfs_client, cl_renewd.work);
+ const struct cred *cred;
+ long lease;
+ unsigned long last, now;
+ unsigned renew_flags = 0;
+
+ ops = clp->cl_mvops->state_renewal_ops;
+ dprintk("%s: start\n", __func__);
+
+ if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state))
+ goto out;
+
+ lease = clp->cl_lease_time;
+ last = clp->cl_last_renewal;
+ now = jiffies;
+ /* Are we close to a lease timeout? */
+ if (time_after(now, last + lease/3))
+ renew_flags |= NFS4_RENEW_TIMEOUT;
+ if (nfs_delegations_present(clp))
+ renew_flags |= NFS4_RENEW_DELEGATION_CB;
+
+ if (renew_flags != 0) {
+ cred = ops->get_state_renewal_cred(clp);
+ if (cred == NULL) {
+ /* No credential: without delegations, just mark the
+ * lease expired and stop; with delegations, expire them
+ * all and fall through to re-arm the timer.
+ */
+ if (!(renew_flags & NFS4_RENEW_DELEGATION_CB)) {
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ goto out;
+ }
+ nfs_expire_all_delegations(clp);
+ } else {
+ int ret;
+
+ /* Queue an asynchronous RENEW. */
+ ret = ops->sched_state_renewal(clp, cred, renew_flags);
+ put_cred(cred);
+ /* Only -EAGAIN and -ENOMEM re-arm the timer here.
+ * NOTE(review): on success (0) the re-arm is skipped;
+ * presumably the RPC completion path does it - confirm.
+ */
+ switch (ret) {
+ default:
+ goto out_exp;
+ case -EAGAIN:
+ case -ENOMEM:
+ break;
+ }
+ }
+ } else {
+ dprintk("%s: failed to call renewd. Reason: lease not expired \n",
+ __func__);
+ }
+ nfs4_schedule_state_renewal(clp);
+out_exp:
+ nfs_expire_unreferenced_delegations(clp);
+out:
+ dprintk("%s: done\n", __func__);
+}
+
+/*
+ * nfs4_schedule_state_renewal - (re)arm the lease renewal work
+ * @clp: client whose cl_renewd work is to be scheduled
+ *
+ * The delay is chosen so that the work fires once two thirds of the
+ * lease period have elapsed since cl_last_renewal, but never sooner
+ * than 5 seconds from now. Also sets NFS_CS_RENEWD in cl_res_state.
+ */
+void
+nfs4_schedule_state_renewal(struct nfs_client *clp)
+{
+ long timeout;
+
+ spin_lock(&clp->cl_lock);
+ /* Remaining jiffies until last_renewal + 2/3 lease; may be negative. */
+ timeout = (2 * clp->cl_lease_time) / 3 + (long)clp->cl_last_renewal
+ - (long)jiffies;
+ if (timeout < 5 * HZ)
+ timeout = 5 * HZ;
+ /* The value logged is the delay in seconds, rounded up. */
+ dprintk("%s: requeueing work. Lease period = %ld\n",
+ __func__, (timeout + HZ - 1) / HZ);
+ mod_delayed_work(system_wq, &clp->cl_renewd, timeout);
+ set_bit(NFS_CS_RENEWD, &clp->cl_res_state);
+ spin_unlock(&clp->cl_lock);
+}
+
+/*
+ * nfs4_kill_renewd - cancel the renewal work of @clp
+ *
+ * Uses the _sync cancel variant, so a handler that is currently running
+ * has finished by the time this returns.
+ */
+void
+nfs4_kill_renewd(struct nfs_client *clp)
+{
+ cancel_delayed_work_sync(&clp->cl_renewd);
+}
+
+/**
+ * nfs4_set_lease_period - Sets the lease period on a nfs_client
+ *
+ * @clp: pointer to nfs_client
+ * @lease: new value for lease period
+ *
+ * Stores @lease under cl_lock, then updates the connect timeouts of the
+ * client's RPC client from it.
+ */
+void nfs4_set_lease_period(struct nfs_client *clp,
+		unsigned long lease)
+{
+	/* Cap maximum reconnect timeout at 1/2 lease period */
+	unsigned long max_reconnect_timeout = lease / 2;
+
+	spin_lock(&clp->cl_lock);
+	clp->cl_lease_time = lease;
+	spin_unlock(&clp->cl_lock);
+
+	rpc_set_connect_timeout(clp->cl_rpcclient, lease,
+				max_reconnect_timeout);
+}
+
+/*
+ * Local variables:
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
new file mode 100644
index 000000000..4145a0138
--- /dev/null
+++ b/fs/nfs/nfs4session.c
@@ -0,0 +1,653 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * fs/nfs/nfs4session.c
+ *
+ * Copyright (c) 2012 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/bc_xprt.h>
+#include <linux/nfs.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/module.h>
+
+#include "nfs4_fs.h"
+#include "internal.h"
+#include "nfs4session.h"
+#include "callback.h"
+
+#define NFSDBG_FACILITY NFSDBG_STATE
+
+/*
+ * nfs4_init_slot_table - one-time initialisation of a slot table
+ * @tbl: table to initialise
+ * @queue: name given to the table's RPC wait queue
+ *
+ * Sets up the lock, both wait queues and the completion, and marks the
+ * table as having no slot in use. No slots are allocated here.
+ */
+static void nfs4_init_slot_table(struct nfs4_slot_table *tbl, const char *queue)
+{
+ tbl->highest_used_slotid = NFS4_NO_SLOT;
+ spin_lock_init(&tbl->slot_tbl_lock);
+ rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue);
+ init_waitqueue_head(&tbl->slot_waitq);
+ init_completion(&tbl->complete);
+}
+
+/*
+ * nfs4_shrink_slot_table - free retired slots from the slot table
+ *
+ * Keeps the first @newsize entries of the slot list and frees all the
+ * entries after them, decrementing tbl->max_slots for each one freed.
+ * Does nothing when @newsize is not smaller than tbl->max_slots.
+ */
+static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize)
+{
+	struct nfs4_slot **link = &tbl->slots;
+	u32 kept;
+
+	if (newsize >= tbl->max_slots)
+		return;
+
+	/* Step over the entries that are kept. */
+	for (kept = 0; kept < newsize; kept++)
+		link = &(*link)->next;
+
+	/* Unlink and free everything that follows. */
+	while (*link != NULL) {
+		struct nfs4_slot *retired = *link;
+
+		*link = retired->next;
+		kfree(retired);
+		tbl->max_slots--;
+	}
+}
+
+/**
+ * nfs4_slot_tbl_drain_complete - wake waiters when drain is complete
+ * @tbl: controlling slot table
+ *
+ * Signals tbl->complete, but only while the table is draining.
+ */
+void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl)
+{
+	if (!nfs4_slot_tbl_draining(tbl))
+		return;
+	complete(&tbl->complete);
+}
+
+/*
+ * nfs4_free_slot - release a slot and keep highest_used_slotid current
+ *
+ * Releasing a slot only requires clearing its bit in the used_slots
+ * bitmap. If the slot was the highest one in use, the bitmap is searched
+ * below it for the new highest used slot, since lower slots may or may
+ * not be in use. When nothing is left in use, highest_used_slotid
+ * becomes NFS4_NO_SLOT and a pending drain is completed.
+ *
+ * Must be called while holding tbl->slot_tbl_lock
+ */
+void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
+{
+	const u32 freed = slot->slot_nr;
+
+	__clear_bit(freed, tbl->used_slots);
+
+	if (freed == tbl->highest_used_slotid) {
+		/* Yields a value >= freed when no lower bit is set. */
+		u32 below = find_last_bit(tbl->used_slots, freed);
+
+		if (below >= freed) {
+			tbl->highest_used_slotid = NFS4_NO_SLOT;
+			nfs4_slot_tbl_drain_complete(tbl);
+		} else {
+			tbl->highest_used_slotid = below;
+		}
+	}
+	dprintk("%s: slotid %u highest_used_slotid %u\n", __func__,
+		freed, tbl->highest_used_slotid);
+}
+
+/*
+ * Allocate a zeroed slot for @tbl with slot number @slotid. All sequence
+ * counters start at @seq_init, except seq_nr_last_acked which starts one
+ * below it. Returns NULL if the allocation fails.
+ */
+static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl,
+		u32 slotid, u32 seq_init, gfp_t gfp_mask)
+{
+	struct nfs4_slot *fresh = kzalloc(sizeof(*fresh), gfp_mask);
+
+	if (fresh == NULL)
+		return NULL;
+
+	fresh->table = tbl;
+	fresh->slot_nr = slotid;
+	fresh->seq_nr = seq_init;
+	fresh->seq_nr_highest_sent = seq_init;
+	fresh->seq_nr_last_acked = seq_init - 1;
+	return fresh;
+}
+
+/*
+ * Walk the slot list looking for slot number @slotid. Whenever the end
+ * of the list is reached first, a new slot numbered tbl->max_slots is
+ * appended (and max_slots incremented) and the walk continues.
+ * Returns the slot, or ERR_PTR(-ENOMEM) if an allocation fails.
+ */
+static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl,
+		u32 slotid, u32 seq_init, gfp_t gfp_mask)
+{
+	struct nfs4_slot **link;
+
+	for (link = &tbl->slots; ; link = &(*link)->next) {
+		if (*link == NULL) {
+			*link = nfs4_new_slot(tbl, tbl->max_slots,
+					      seq_init, gfp_mask);
+			if (*link == NULL)
+				return ERR_PTR(-ENOMEM);
+			tbl->max_slots++;
+		}
+		if ((*link)->slot_nr == slotid)
+			return *link;
+	}
+}
+
+/*
+ * Mark @slot as used in the bitmap, raise highest_used_slotid if needed
+ * and stamp the slot with the table's current generation.
+ */
+static void nfs4_lock_slot(struct nfs4_slot_table *tbl,
+		struct nfs4_slot *slot)
+{
+	const u32 nr = slot->slot_nr;
+	/* NFS4_NO_SLOT is the largest u32, so it needs its own test. */
+	const bool none_in_use = tbl->highest_used_slotid == NFS4_NO_SLOT;
+
+	__set_bit(nr, tbl->used_slots);
+	if (none_in_use || nr > tbl->highest_used_slotid)
+		tbl->highest_used_slotid = nr;
+	slot->generation = tbl->generation;
+}
+
+/*
+ * nfs4_try_to_lock_slot - Given a slot try to allocate it
+ *
+ * Returns true and marks the slot as used if it was free, false if it
+ * was already in use.
+ *
+ * Note: must be called with the slot_tbl_lock held.
+ */
+bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
+{
+	bool was_free = !nfs4_test_locked_slot(tbl, slot->slot_nr);
+
+	if (was_free)
+		nfs4_lock_slot(tbl, slot);
+	return was_free;
+}
+
+/*
+ * nfs4_lookup_slot - Find a slot but don't allocate it
+ *
+ * The slot is not marked as used. Its structure is created on demand
+ * (GFP_NOWAIT) if it does not exist yet. Returns ERR_PTR(-E2BIG) when
+ * @slotid is above tbl->max_slotid.
+ *
+ * Note: must be called with the slot_tbl_lock held.
+ */
+struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid)
+{
+	if (slotid > tbl->max_slotid)
+		return ERR_PTR(-E2BIG);
+	return nfs4_find_or_create_slot(tbl, slotid, 0, GFP_NOWAIT);
+}
+
+/*
+ * Fetch the current sequence number of slot @slotid into @seq_nr.
+ * Returns 0 on success or the negative errno from nfs4_lookup_slot(),
+ * in which case @seq_nr is left untouched.
+ */
+static int nfs4_slot_get_seqid(struct nfs4_slot_table *tbl, u32 slotid,
+		u32 *seq_nr)
+	__must_hold(&tbl->slot_tbl_lock)
+{
+	struct nfs4_slot *slot = nfs4_lookup_slot(tbl, slotid);
+
+	if (IS_ERR(slot))
+		return PTR_ERR(slot);
+
+	*seq_nr = slot->seq_nr;
+	return 0;
+}
+
+/*
+ * nfs4_slot_seqid_in_use - test if a slot sequence id is still in use
+ *
+ * Given a slot table, slot id and sequence number, determine if the
+ * RPC call in question is still in flight: the slot must exist, carry
+ * exactly that sequence number, and be marked as used. This function is
+ * mainly intended for use by the callback channel.
+ */
+static bool nfs4_slot_seqid_in_use(struct nfs4_slot_table *tbl,
+		u32 slotid, u32 seq_nr)
+{
+	u32 cur_seq = 0;
+	bool in_flight;
+
+	spin_lock(&tbl->slot_tbl_lock);
+	in_flight = nfs4_slot_get_seqid(tbl, slotid, &cur_seq) == 0 &&
+		    cur_seq == seq_nr &&
+		    test_bit(slotid, tbl->used_slots);
+	spin_unlock(&tbl->slot_tbl_lock);
+	return in_flight;
+}
+
+/*
+ * nfs4_slot_wait_on_seqid - wait until a slot sequence id is complete
+ *
+ * Given a slot table, slot id and sequence number, sleep on
+ * tbl->slot_waitq until the corresponding RPC call is no longer in
+ * flight. Returns 0 once that is the case, or -ETIMEDOUT if @timeout
+ * expires first. This function is mainly intended for use by the
+ * callback channel.
+ */
+int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl,
+		u32 slotid, u32 seq_nr,
+		unsigned long timeout)
+{
+	long remaining;
+
+	remaining = wait_event_timeout(tbl->slot_waitq,
+			!nfs4_slot_seqid_in_use(tbl, slotid, seq_nr),
+			timeout);
+	return remaining == 0 ? -ETIMEDOUT : 0;
+}
+
+/*
+ * nfs4_alloc_slot - efficiently look for a free slot
+ *
+ * nfs4_alloc_slot looks for an unset bit in the used_slots bitmap,
+ * considering only slot ids up to and including tbl->max_slotid.
+ * If found, we mark the slot as used and update the highest_used_slotid.
+ *
+ * Returns the slot, ERR_PTR(-EBUSY) if every permitted slot is in use,
+ * or ERR_PTR(-ENOMEM) if the slot structure could not be allocated.
+ *
+ * Note: must be called with the slot_tbl_lock held.
+ */
+struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl)
+{
+ struct nfs4_slot *ret = ERR_PTR(-EBUSY);
+ u32 slotid;
+
+ /* The value printed as "max_slots" is max_slotid + 1. */
+ dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n",
+ __func__, tbl->used_slots[0], tbl->highest_used_slotid,
+ tbl->max_slotid + 1);
+ slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1);
+ /* A result above max_slotid means no free bit was found. */
+ if (slotid <= tbl->max_slotid) {
+ ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
+ if (!IS_ERR(ret))
+ nfs4_lock_slot(tbl, ret);
+ }
+ dprintk("<-- %s used_slots=%04lx highest_used=%u slotid=%u\n",
+ __func__, tbl->used_slots[0], tbl->highest_used_slotid,
+ !IS_ERR(ret) ? ret->slot_nr : NFS4_NO_SLOT);
+ return ret;
+}
+
+/*
+ * Make sure the table holds at least @max_reqs slots by looking up
+ * (and thereby creating) slot number @max_reqs - 1. New slots start
+ * with sequence value @ivalue. Returns 0 or -ENOMEM.
+ */
+static int nfs4_grow_slot_table(struct nfs4_slot_table *tbl,
+		u32 max_reqs, u32 ivalue)
+{
+	struct nfs4_slot *last;
+
+	if (max_reqs <= tbl->max_slots)
+		return 0;
+
+	last = nfs4_find_or_create_slot(tbl, max_reqs - 1, ivalue, GFP_NOFS);
+	return IS_ERR(last) ? -ENOMEM : 0;
+}
+
+/*
+ * Trim the table to @server_highest_slotid + 1 slots, restart the
+ * sequence counters of every remaining slot from @ivalue, and reset the
+ * table's slot id limits and target-tracking state.
+ */
+static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl,
+		u32 server_highest_slotid,
+		u32 ivalue)
+{
+	struct nfs4_slot *slot;
+
+	nfs4_shrink_slot_table(tbl, server_highest_slotid + 1);
+
+	for (slot = tbl->slots; slot != NULL; slot = slot->next) {
+		slot->seq_nr = ivalue;
+		slot->seq_nr_highest_sent = ivalue;
+		slot->seq_nr_last_acked = ivalue - 1;
+	}
+
+	tbl->highest_used_slotid = NFS4_NO_SLOT;
+	tbl->target_highest_slotid = server_highest_slotid;
+	tbl->server_highest_slotid = server_highest_slotid;
+	tbl->d_target_highest_slotid = 0;
+	tbl->d2_target_highest_slotid = 0;
+	tbl->max_slotid = server_highest_slotid;
+}
+
+/*
+ * (re)Initialise a slot table
+ *
+ * @max_reqs is clamped to NFS4_MAX_SLOT_TABLE. The table is first grown
+ * to that size (outside the lock), then reset under slot_tbl_lock.
+ * Returns 0 on success or the error from growing the table, in which
+ * case the table is not reset.
+ */
+static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl,
+		u32 max_reqs, u32 ivalue)
+{
+	int ret;
+
+	dprintk("--> %s: max_reqs=%u, tbl->max_slots %u\n", __func__,
+		max_reqs, tbl->max_slots);
+
+	max_reqs = min(max_reqs, NFS4_MAX_SLOT_TABLE);
+
+	ret = nfs4_grow_slot_table(tbl, max_reqs, ivalue);
+	if (ret == 0) {
+		spin_lock(&tbl->slot_tbl_lock);
+		nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue);
+		spin_unlock(&tbl->slot_tbl_lock);
+
+		dprintk("%s: tbl=%p slots=%p max_slots=%u\n", __func__,
+			tbl, tbl->slots, tbl->max_slots);
+	}
+	dprintk("<-- %s: return %d\n", __func__, ret);
+	return ret;
+}
+
+/*
+ * nfs4_release_slot_table - release all slot table entries
+ *
+ * Implemented as a shrink to size zero, which frees every slot.
+ */
+static void nfs4_release_slot_table(struct nfs4_slot_table *tbl)
+{
+ nfs4_shrink_slot_table(tbl, 0);
+}
+
+/**
+ * nfs4_shutdown_slot_table - release resources attached to a slot table
+ * @tbl: slot table to shut down
+ *
+ * Frees every slot, then destroys the table's RPC wait queue.
+ */
+void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl)
+{
+ nfs4_release_slot_table(tbl);
+ rpc_destroy_wait_queue(&tbl->slot_tbl_waitq);
+}
+
+/**
+ * nfs4_setup_slot_table - prepare a stand-alone slot table for use
+ * @tbl: slot table to set up
+ * @max_reqs: maximum number of requests allowed
+ * @queue: name to give RPC wait queue
+ *
+ * Initialises the table and allocates its slots, with slot sequence
+ * numbers starting at 0.
+ *
+ * Returns zero on success, or a negative errno.
+ */
+int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, unsigned int max_reqs,
+ const char *queue)
+{
+ nfs4_init_slot_table(tbl, queue);
+ return nfs4_realloc_slot_table(tbl, max_reqs, 0);
+}
+
+/*
+ * Callback passed to rpc_wake_up_first(): try to hand the slot @pslot to
+ * the waiting @task. Returns false, leaving the task untouched, when the
+ * table is draining and the task's call is not privileged. Otherwise
+ * the slot is recorded in the task's sequence arguments and results and
+ * true is returned.
+ */
+static bool nfs41_assign_slot(struct rpc_task *task, void *pslot)
+{
+ struct nfs4_sequence_args *args = task->tk_msg.rpc_argp;
+ struct nfs4_sequence_res *res = task->tk_msg.rpc_resp;
+ struct nfs4_slot *slot = pslot;
+ struct nfs4_slot_table *tbl = slot->table;
+
+ if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
+ return false;
+ slot->generation = tbl->generation;
+ args->sa_slot = slot;
+ res->sr_timestamp = jiffies;
+ res->sr_slot = slot;
+ res->sr_status_flags = 0;
+ /* NOTE(review): presumably 1 marks "no SEQUENCE result yet" - confirm. */
+ res->sr_status = 1;
+ return true;
+}
+
+/*
+ * Offer @slot to the first task waiting on the table's RPC wait queue.
+ * Returns true if a task was woken with the slot assigned to it.
+ */
+static bool __nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
+		struct nfs4_slot *slot)
+{
+	return rpc_wake_up_first(&tbl->slot_tbl_waitq,
+				 nfs41_assign_slot, slot) ? true : false;
+}
+
+/*
+ * Hand @slot directly to a waiting task, unless its slot number is above
+ * the table's current max_slotid. Returns true if a waiter took it.
+ */
+bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
+		struct nfs4_slot *slot)
+{
+	return slot->slot_nr <= tbl->max_slotid &&
+	       __nfs41_wake_and_assign_slot(tbl, slot);
+}
+
+/*
+ * Allocate a free slot and try to give it to a waiting task. If nobody
+ * takes it, the slot is freed again. Returns true only when a waiter
+ * was woken with the slot.
+ */
+static bool nfs41_try_wake_next_slot_table_entry(struct nfs4_slot_table *tbl)
+{
+	struct nfs4_slot *slot = nfs4_alloc_slot(tbl);
+
+	if (IS_ERR(slot))
+		return false;
+	if (__nfs41_wake_and_assign_slot(tbl, slot))
+		return true;
+
+	nfs4_free_slot(tbl, slot);
+	return false;
+}
+
+/*
+ * Keep handing free slots to waiting tasks until either no slot can be
+ * allocated or no waiter accepts one.
+ */
+void nfs41_wake_slot_table(struct nfs4_slot_table *tbl)
+{
+	while (nfs41_try_wake_next_slot_table_entry(tbl))
+		continue;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+/*
+ * Recompute tbl->max_slotid as the smallest of: the hard table limit,
+ * @target_highest_slotid, the server's highest slot id and the table's
+ * current target. Then wake waiters that may now be able to get a slot.
+ */
+static void nfs41_set_max_slotid_locked(struct nfs4_slot_table *tbl,
+		u32 target_highest_slotid)
+{
+	u32 limit = min(NFS4_MAX_SLOT_TABLE - 1, target_highest_slotid);
+
+	limit = min(limit, tbl->server_highest_slotid);
+	limit = min(limit, tbl->target_highest_slotid);
+
+	tbl->max_slotid = limit;
+	nfs41_wake_slot_table(tbl);
+}
+
+/*
+ * Update the client's idea of target_highest_slotid. The generation
+ * counter is bumped only when the value actually changes.
+ */
+static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl,
+		u32 target_highest_slotid)
+{
+	if (tbl->target_highest_slotid != target_highest_slotid) {
+		tbl->target_highest_slotid = target_highest_slotid;
+		tbl->generation++;
+	}
+}
+
+/*
+ * Unconditionally adopt @target_highest_slotid as the new target under
+ * slot_tbl_lock. Unlike nfs41_update_target_slotid() there is no outlier
+ * filtering; the derivative history is cleared instead.
+ */
+void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
+ u32 target_highest_slotid)
+{
+ spin_lock(&tbl->slot_tbl_lock);
+ nfs41_set_target_slotid_locked(tbl, target_highest_slotid);
+ tbl->d_target_highest_slotid = 0;
+ tbl->d2_target_highest_slotid = 0;
+ nfs41_set_max_slotid_locked(tbl, target_highest_slotid);
+ spin_unlock(&tbl->slot_tbl_lock);
+}
+
+/*
+ * Record a new server highest slot id and free the slots above it.
+ * Nothing happens if the value is unchanged, or if a slot with a higher
+ * id than @highest_slotid is currently in use.
+ */
+static void nfs41_set_server_slotid_locked(struct nfs4_slot_table *tbl,
+		u32 highest_slotid)
+{
+	if (tbl->server_highest_slotid == highest_slotid ||
+	    tbl->highest_used_slotid > highest_slotid)
+		return;
+
+	/* Deallocate slots */
+	nfs4_shrink_slot_table(tbl, highest_slotid + 1);
+	tbl->server_highest_slotid = highest_slotid;
+}
+
+/*
+ * Damped difference between two samples: with d = s1 - s2, returns 0
+ * for d == 0, (d - 1) >> 1 for negative d and (d + 1) >> 1 for
+ * positive d.
+ */
+static s32 nfs41_derivative_target_slotid(s32 s1, s32 s2)
+{
+	s32 delta = s1 - s2;
+
+	if (delta < 0)
+		return (delta - 1) >> 1;
+	if (delta > 0)
+		return (delta + 1) >> 1;
+	return 0;
+}
+
+/* Sign of @s1: 1 if positive, -1 if negative, 0 if zero. */
+static int nfs41_sign_s32(s32 s1)
+{
+	return (s1 > 0) - (s1 < 0);
+}
+
+/* True if either value is zero or both have the same sign. */
+static bool nfs41_same_sign_or_zero_s32(s32 s1, s32 s2)
+{
+	return s1 == 0 || s2 == 0 ||
+	       nfs41_sign_s32(s1) == nfs41_sign_s32(s2);
+}
+
+/*
+ * Try to eliminate outliers by checking for sharp changes in the
+ * derivatives and second derivatives.
+ *
+ * @new_target is reported as an outlier only when both the first and
+ * the second derivative flip sign relative to the stored ones (a zero
+ * on either side never counts as a flip). The stored derivatives are
+ * always replaced by the newly computed ones.
+ */
+static bool nfs41_is_outlier_target_slotid(struct nfs4_slot_table *tbl,
+		u32 new_target)
+{
+	s32 d_target, d2_target;
+	bool d_consistent, d2_consistent;
+
+	d_target = nfs41_derivative_target_slotid(new_target,
+			tbl->target_highest_slotid);
+	d2_target = nfs41_derivative_target_slotid(d_target,
+			tbl->d_target_highest_slotid);
+
+	d_consistent = nfs41_same_sign_or_zero_s32(d_target,
+			tbl->d_target_highest_slotid);
+	d2_consistent = nfs41_same_sign_or_zero_s32(d2_target,
+			tbl->d2_target_highest_slotid);
+
+	tbl->d_target_highest_slotid = d_target;
+	tbl->d2_target_highest_slotid = d2_target;
+
+	return !(d_consistent || d2_consistent);
+}
+
+/*
+ * Apply the slot id hints carried in a SEQUENCE result @res that was
+ * received on @slot. Everything is done under slot_tbl_lock.
+ */
+void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
+ struct nfs4_slot *slot,
+ struct nfs4_sequence_res *res)
+{
+ spin_lock(&tbl->slot_tbl_lock);
+ /* Adopt the new target unless it looks like an outlier. */
+ if (!nfs41_is_outlier_target_slotid(tbl, res->sr_target_highest_slotid))
+ nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid);
+ /* Only apply the server's highest slot id if the target has not
+ * changed since this slot was stamped with the table generation.
+ */
+ if (tbl->generation == slot->generation)
+ nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid);
+ nfs41_set_max_slotid_locked(tbl, res->sr_target_highest_slotid);
+ spin_unlock(&tbl->slot_tbl_lock);
+}
+
+/*
+ * Free every slot of both the fore channel and back channel tables of
+ * @session. The tables themselves and their wait queues stay usable.
+ */
+static void nfs4_release_session_slot_tables(struct nfs4_session *session)
+{
+ nfs4_release_slot_table(&session->fc_slot_table);
+ nfs4_release_slot_table(&session->bc_slot_table);
+}
+
+/*
+ * Initialize or reset the forechannel and backchannel tables
+ *
+ * The fore channel table is sized from fc_attrs.max_reqs with sequence
+ * numbers starting at 1. The back channel table is only set up when the
+ * session has SESSION4_BACK_CHAN set; it is sized from bc_attrs.max_reqs
+ * with sequence numbers starting at 0.
+ *
+ * Returns 0 or a negative errno from (re)allocating a table.
+ */
+int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
+{
+ struct nfs4_slot_table *tbl;
+ int status;
+
+ dprintk("--> %s\n", __func__);
+ /* Fore channel */
+ tbl = &ses->fc_slot_table;
+ tbl->session = ses;
+ status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
+ if (status || !(ses->flags & SESSION4_BACK_CHAN)) /* -ENOMEM */
+ return status;
+ /* Back channel */
+ tbl = &ses->bc_slot_table;
+ tbl->session = ses;
+ status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
+ /* Both tables are only torn down if the back channel has no slots
+ * at all; a failed resize of an existing table leaves them alone.
+ */
+ if (status && tbl->slots == NULL)
+ /* Fore and back channel share a connection so get
+ * both slot tables or neither */
+ nfs4_release_session_slot_tables(ses);
+ return status;
+}
+
+/*
+ * Allocate a zeroed session for @clp with both slot tables initialised
+ * (but holding no slots) and only the NFS4_SESSION_INITING state bit
+ * set. Returns NULL if the allocation fails.
+ */
+struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
+{
+	struct nfs4_session *ses = kzalloc(sizeof(*ses), GFP_NOFS);
+
+	if (ses == NULL)
+		return NULL;
+
+	ses->clp = clp;
+	ses->session_state = 1<<NFS4_SESSION_INITING;
+	nfs4_init_slot_table(&ses->fc_slot_table, "ForeChannel Slot table");
+	nfs4_init_slot_table(&ses->bc_slot_table, "BackChannel Slot table");
+	return ses;
+}
+
+/*
+ * Shut down both slot tables of @session: free their slots and destroy
+ * their RPC wait queues.
+ */
+static void nfs4_destroy_session_slot_tables(struct nfs4_session *session)
+{
+ nfs4_shutdown_slot_table(&session->fc_slot_table);
+ nfs4_shutdown_slot_table(&session->bc_slot_table);
+}
+
+/*
+ * Tear down @session: ask the server to destroy it, destroy the
+ * transport's backchannel, shut down both slot tables and free the
+ * session structure. The result of nfs4_proc_destroy_session() is not
+ * checked; local teardown always proceeds.
+ */
+void nfs4_destroy_session(struct nfs4_session *session)
+{
+ struct rpc_xprt *xprt;
+ const struct cred *cred;
+
+ cred = nfs4_get_clid_cred(session->clp);
+ nfs4_proc_destroy_session(session, cred);
+ put_cred(cred);
+
+ /* NOTE(review): xprt is used after rcu_read_unlock() without taking
+ * a reference; presumably the caller keeps the transport alive -
+ * confirm.
+ */
+ rcu_read_lock();
+ xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt);
+ rcu_read_unlock();
+ dprintk("%s Destroy backchannel for xprt %p\n",
+ __func__, xprt);
+ xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
+ nfs4_destroy_session_slot_tables(session);
+ kfree(session);
+}
+
+/*
+ * With sessions, the client is not marked ready until after a
+ * successful EXCHANGE_ID and CREATE_SESSION.
+ *
+ * Map cl_cons_state errors to EPROTONOSUPPORT to indicate that
+ * other versions of NFS can be tried.
+ *
+ * Returns 0 when the client is ready, the error from lease recovery if
+ * that fails, or -EPROTONOSUPPORT if the client is not ready.
+ */
+static int nfs41_check_session_ready(struct nfs_client *clp)
+{
+ int ret;
+
+ /* Session still being set up: run lease recovery first. */
+ if (clp->cl_cons_state == NFS_CS_SESSION_INITING) {
+ ret = nfs4_client_recover_expired_lease(clp);
+ if (ret)
+ return ret;
+ }
+ if (clp->cl_cons_state < NFS_CS_READY)
+ return -EPROTONOSUPPORT;
+ /* Read barrier after observing the ready state.
+ * NOTE(review): presumably pairs with a write barrier where
+ * cl_cons_state is set - confirm.
+ */
+ smp_rmb();
+ return 0;
+}
+
+/*
+ * Finish session initialisation for @clp: clear NFS4_SESSION_INITING
+ * and check that the session is ready. Clients without a session
+ * return 0 immediately.
+ */
+int nfs4_init_session(struct nfs_client *clp)
+{
+ if (!nfs4_has_session(clp))
+ return 0;
+
+ clear_bit(NFS4_SESSION_INITING, &clp->cl_session->session_state);
+ return nfs41_check_session_ready(clp);
+}
+
+/*
+ * Finish session initialisation for a data server client.
+ * @clp: the data server's nfs_client
+ * @lease_time: lease time to copy to @clp (the MDS lease, per the
+ * comment below)
+ *
+ * Returns 0 on success, the error from the session readiness check, or
+ * -ENODEV if @clp does not have the DS role.
+ */
+int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time)
+{
+ struct nfs4_session *session = clp->cl_session;
+ int ret;
+
+ spin_lock(&clp->cl_lock);
+ /* Only the caller that clears INITING sets the lease values. */
+ if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
+ /*
+ * Do not set NFS_CS_CHECK_LEASE_TIME instead set the
+ * DS lease to be equal to the MDS lease.
+ */
+ clp->cl_lease_time = lease_time;
+ clp->cl_last_renewal = jiffies;
+ }
+ spin_unlock(&clp->cl_lock);
+
+ ret = nfs41_check_session_ready(clp);
+ if (ret)
+ return ret;
+ /* Test for the DS role */
+ if (!is_ds_client(clp))
+ return -ENODEV;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
+
+#endif /* defined(CONFIG_NFS_V4_1) */
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
new file mode 100644
index 000000000..b996ee23f
--- /dev/null
+++ b/fs/nfs/nfs4session.h
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * fs/nfs/nfs4session.h
+ *
+ * Copyright (c) 2012 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ */
+#ifndef __LINUX_FS_NFS_NFS4SESSION_H
+#define __LINUX_FS_NFS_NFS4SESSION_H
+
+/*
+ * Slot table sizing.  NFS4_MAX_SLOT_TABLE is the maximum number of slots
+ * to use; it also sizes the used_slots bitmap in struct nfs4_slot_table.
+ * NFS4_NO_SLOT is the slot id value meaning "no slot": the drain code
+ * compares highest_used_slotid against it to see whether a table is idle.
+ * NOTE(review): the two *_DEF_* values look like default fore channel and
+ * callback channel table sizes - confirm against their users.
+ */
+#define NFS4_DEF_SLOT_TABLE_SIZE (64U)
+#define NFS4_DEF_CB_SLOT_TABLE_SIZE (16U)
+#define NFS4_MAX_SLOT_TABLE (1024U)
+#define NFS4_NO_SLOT ((u32)-1)
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+
+/*
+ * Sessions slot seqid.
+ * NOTE(review): the field roles noted below are read off the names and
+ * types only; confirm against the slot table implementation.
+ */
+struct nfs4_slot {
+ struct nfs4_slot_table *table; /* presumably the table owning this slot */
+ struct nfs4_slot *next; /* presumably chains the table's slots */
+ unsigned long generation;
+ u32 slot_nr; /* presumably this slot's id */
+ u32 seq_nr;
+ u32 seq_nr_last_acked;
+ u32 seq_nr_highest_sent;
+ unsigned int privileged : 1,
+ seq_done : 1;
+};
+
+/* Sessions: bit numbers used in nfs4_slot_table.slot_tbl_state */
+enum nfs4_slot_tbl_state {
+ NFS4_SLOT_TBL_DRAINING, /* tested by nfs4_slot_tbl_draining() */
+};
+
+#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long)) /* longs needed for one bit per slot */
+struct nfs4_slot_table {
+ struct nfs4_session *session; /* Parent session */
+ struct nfs4_slot *slots; /* seqid per slot */
+ unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */
+ spinlock_t slot_tbl_lock; /* held for wakeups and highest_used_slotid checks */
+ struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */
+ wait_queue_head_t slot_waitq; /* Completion wait on slot */
+ u32 max_slots; /* # slots in table */
+ u32 max_slotid; /* Max allowed slotid value */
+ u32 highest_used_slotid; /* sent to server on each SEQ.
+ * op for dynamic resizing;
+ * NFS4_NO_SLOT when no slot is in use */
+ u32 target_highest_slotid; /* Server max_slot target */
+ u32 server_highest_slotid; /* Server highest slotid */
+ s32 d_target_highest_slotid; /* Derivative */
+ s32 d2_target_highest_slotid; /* 2nd derivative */
+ unsigned long generation; /* Generation counter for
+ target_highest_slotid */
+ struct completion complete; /* waited on while the table drains */
+ unsigned long slot_tbl_state; /* bit flags, see enum nfs4_slot_tbl_state */
+};
+
+/*
+ * Session related parameters
+ *
+ * One session carries two slot tables, one per channel, each with its
+ * own channel attributes.
+ */
+struct nfs4_session {
+ struct nfs4_sessionid sess_id;
+ u32 flags; /* tested against SESSION4_PERSIST */
+ unsigned long session_state; /* bit flags, see enum nfs4_session_state */
+ u32 hash_alg;
+ u32 ssv_len;
+
+ /* The fore and back channel */
+ struct nfs4_channel_attrs fc_attrs;
+ struct nfs4_slot_table fc_slot_table;
+ struct nfs4_channel_attrs bc_attrs;
+ struct nfs4_slot_table bc_slot_table;
+ struct nfs_client *clp; /* client used when the session is destroyed */
+};
+
+enum nfs4_session_state { /* bit numbers used in nfs4_session.session_state */
+ NFS4_SESSION_INITING, /* cleared by nfs4_init_session() and nfs4_init_ds_session() */
+ NFS4_SESSION_ESTABLISHED,
+};
+
+extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
+ unsigned int max_reqs, const char *queue);
+extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl);
+extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
+extern struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid);
+extern int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl,
+ u32 slotid, u32 seq_nr,
+ unsigned long timeout);
+extern bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
+extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
+extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
+bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
+ struct nfs4_slot *slot);
+void nfs41_wake_slot_table(struct nfs4_slot_table *tbl);
+
+/*
+ * Report whether NFS4_SLOT_TBL_DRAINING is currently set in the table's
+ * slot_tbl_state.
+ */
+static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl)
+{
+	return test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state) != 0;
+}
+
+/*
+ * Report whether bit @slotid is set in the table's used_slots bitmap.
+ * No range check is made on @slotid.
+ */
+static inline bool nfs4_test_locked_slot(const struct nfs4_slot_table *tbl,
+					 u32 slotid)
+{
+	const unsigned long *bitmap = tbl->used_slots;
+
+	return test_bit(slotid, bitmap) != 0;
+}
+/*
+ * Return the session attached to @clp.  The pointer may be NULL; the
+ * nfs4_has_session() helper tests for exactly that.
+ */
+static inline struct nfs4_session *nfs4_get_session(const struct nfs_client *clp)
+{
+ return clp->cl_session;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
+ u32 target_highest_slotid);
+extern void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
+ struct nfs4_slot *slot,
+ struct nfs4_sequence_res *res);
+
+extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses);
+
+extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
+extern void nfs4_destroy_session(struct nfs4_session *session);
+extern int nfs4_init_session(struct nfs_client *clp);
+extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
+
+/*
+ * Determine if sessions are in use.
+ *
+ * Returns 1 when @clp has a session attached, 0 otherwise.
+ */
+static inline int nfs4_has_session(const struct nfs_client *clp)
+{
+	return clp->cl_session != NULL;
+}
+
+/*
+ * Determine if the client's session is persistent.
+ *
+ * Returns 0 when there is no session.  Otherwise returns the session
+ * flags masked with SESSION4_PERSIST, i.e. a non-zero value (not
+ * normalised to 1) when the flag is set.
+ */
+static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
+{
+	if (!nfs4_has_session(clp))
+		return 0;
+	return clp->cl_session->flags & SESSION4_PERSIST;
+}
+/*
+ * Copy NFS4_MAX_SESSIONID_LEN bytes of session id data from @src to
+ * @dst.  memcpy() is used, so the two ids must not overlap.
+ */
+static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst,
+ const struct nfs4_sessionid *src)
+{
+ memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN);
+}
+
+#ifdef CONFIG_CRC32
+/*
+ * nfs_session_id_hash - calculate the crc32 hash for the session id
+ * @sess_id: pointer to the session id whose data[] array is hashed
+ *
+ * Evaluates to the bitwise complement of crc32_le(), seeded with
+ * 0xFFFFFFFF, over the whole data[] array.  Without CONFIG_CRC32 the
+ * macro evaluates to 0 and its argument is not used.
+ */
+#define nfs_session_id_hash(sess_id) \
+ (~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data)))
+#else
+#define nfs_session_id_hash(session) (0)
+#endif
+#else /* defined(CONFIG_NFS_V4_1) */
+/*
+ * Stubs for builds without CONFIG_NFS_V4_1: there are no sessions, so
+ * session initialisation trivially succeeds.
+ */
+static inline int nfs4_init_session(struct nfs_client *clp)
+{
+ return 0;
+}
+
+/*
+ * Determine if sessions are in use.
+ * Always 0 in this configuration.
+ */
+static inline int nfs4_has_session(const struct nfs_client *clp)
+{
+ return 0;
+}
+
+static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
+{
+ return 0;
+}
+
+#define nfs_session_id_hash(session) (0)
+
+#endif /* defined(CONFIG_NFS_V4_1) */
+#endif /* IS_ENABLED(CONFIG_NFS_V4) */
+#endif /* __LINUX_FS_NFS_NFS4SESSION_H */
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
new file mode 100644
index 000000000..afb617a4a
--- /dev/null
+++ b/fs/nfs/nfs4state.c
@@ -0,0 +1,2768 @@
+/*
+ * fs/nfs/nfs4state.c
+ *
+ * Client-side state management for NFSv4.
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Kendrick Smith <kmsmith@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Implementation of the NFSv4 state model. For the time being,
+ * this is minimal, but will be made much more complex in a
+ * subsequent patch.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/nfs_fs.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/ratelimit.h>
+#include <linux/workqueue.h>
+#include <linux/bitops.h>
+#include <linux/jiffies.h>
+#include <linux/sched/mm.h>
+
+#include <linux/sunrpc/clnt.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "internal.h"
+#include "nfs4idmap.h"
+#include "nfs4session.h"
+#include "pnfs.h"
+#include "netns.h"
+#include "nfs4trace.h"
+
+#define NFSDBG_FACILITY NFSDBG_STATE
+
+#define OPENOWNER_POOL_SIZE 8
+
+static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp);
+/*
+ * Well-known stateid constants.  zero_stateid has all-zero data and
+ * current_stateid has only its fourth data byte set to 1; both are typed
+ * NFS4_SPECIAL_STATEID_TYPE.  invalid_stateid has its first four data
+ * bytes set to 0xff and is typed NFS4_INVALID_STATEID_TYPE.
+ */
+const nfs4_stateid zero_stateid = {
+ { .data = { 0 } },
+ .type = NFS4_SPECIAL_STATEID_TYPE,
+};
+const nfs4_stateid invalid_stateid = {
+ {
+ /* Funky initialiser keeps older gcc versions happy */
+ .data = { 0xff, 0xff, 0xff, 0xff, 0 },
+ },
+ .type = NFS4_INVALID_STATEID_TYPE,
+};
+
+const nfs4_stateid current_stateid = {
+ {
+ /* Funky initialiser keeps older gcc versions happy */
+ .data = { 0x0, 0x0, 0x0, 0x1, 0 },
+ },
+ .type = NFS4_SPECIAL_STATEID_TYPE,
+};
+
+static DEFINE_MUTEX(nfs_clid_init_mutex);
+
+/*
+ * nfs4_setup_state_renewal - start lease renewal for a client
+ * @clp: client state handle
+ *
+ * When NFS_CS_CHECK_LEASE_TIME is set, the lease time is first fetched
+ * with nfs4_proc_get_lease_time() and installed, scaled by HZ, through
+ * nfs4_set_lease_period().  Renewal is scheduled unless that fetch
+ * failed.
+ *
+ * Returns zero, or the non-zero status of the failed fetch.
+ */
+static int nfs4_setup_state_renewal(struct nfs_client *clp)
+{
+	if (test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
+		struct nfs_fsinfo fsinfo;
+		int err;
+
+		err = nfs4_proc_get_lease_time(clp, &fsinfo);
+		if (err != 0)
+			return err;
+		nfs4_set_lease_period(clp, fsinfo.lease_time * HZ);
+	}
+
+	nfs4_schedule_state_renewal(clp);
+	return 0;
+}
+/*
+ * nfs4_init_clientid - establish a client ID with SETCLIENTID
+ * @clp: client state handle
+ * @cred: credential passed to both RPCs
+ *
+ * Calls nfs4_proc_setclientid() and then nfs4_proc_setclientid_confirm().
+ * NFS4CLNT_LEASE_CONFIRM is set between the two steps, so a call that
+ * finds the bit already set skips straight to the confirm step, using
+ * the clientid/confirm values cached in @clp.  The callback port is
+ * chosen by the address family of @clp.
+ *
+ * On success the bit is cleared and state renewal is set up; the return
+ * value of nfs4_setup_state_renewal() is not checked.
+ *
+ * Returns zero, or the non-zero status of the step that failed.
+ */
+int nfs4_init_clientid(struct nfs_client *clp, const struct cred *cred)
+{
+ struct nfs4_setclientid_res clid = {
+ .clientid = clp->cl_clientid,
+ .confirm = clp->cl_confirm,
+ };
+ unsigned short port;
+ int status;
+ struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+
+ if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state))
+ goto do_confirm;
+ port = nn->nfs_callback_tcpport;
+ if (clp->cl_addr.ss_family == AF_INET6)
+ port = nn->nfs_callback_tcpport6;
+
+ status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
+ if (status != 0)
+ goto out;
+ clp->cl_clientid = clid.clientid;
+ clp->cl_confirm = clid.confirm;
+ set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+do_confirm:
+ status = nfs4_proc_setclientid_confirm(clp, &clid, cred);
+ if (status != 0)
+ goto out;
+ clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+ nfs4_setup_state_renewal(clp);
+out:
+ return status;
+}
+
+/**
+ * nfs40_discover_server_trunking - Detect server IP address trunking (mv0)
+ *
+ * @clp: nfs_client under test
+ * @result: OUT: found nfs_client, or clp
+ * @cred: credential to use for trunking test
+ *
+ * Issues SETCLIENTID using the callback port that matches the address
+ * family of @clp, stores the returned clientid and confirm values in
+ * @clp, then hands off to nfs40_walk_client_list() to fill in @result.
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status.
+ * If zero is returned, an nfs_client pointer is planted in
+ * "result".
+ *
+ * Note: The returned client may not yet be marked ready.
+ */
+int nfs40_discover_server_trunking(struct nfs_client *clp,
+				   struct nfs_client **result,
+				   const struct cred *cred)
+{
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+	struct nfs4_setclientid_res clid = {
+		.clientid = clp->cl_clientid,
+		.confirm = clp->cl_confirm,
+	};
+	unsigned short port;
+	int status;
+
+	if (clp->cl_addr.ss_family == AF_INET6)
+		port = nn->nfs_callback_tcpport6;
+	else
+		port = nn->nfs_callback_tcpport;
+
+	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
+	if (status != 0)
+		return status;
+	clp->cl_clientid = clid.clientid;
+	clp->cl_confirm = clid.confirm;
+
+	status = nfs40_walk_client_list(clp, result, cred);
+	if (status != 0)
+		return status;
+
+	/*
+	 * Sustain the lease, even if it's empty.  If the clientid4
+	 * goes stale it's of no use for trunking discovery.
+	 */
+	nfs4_schedule_state_renewal(*result);
+
+	/* If the client state needs to recover, do it. */
+	if (clp->cl_state)
+		nfs4_schedule_state_manager(clp);
+	return 0;
+}
+/*
+ * Return rpc_machine_cred() passed through get_cred().  @clp is not
+ * used.  Callers in this file test the result against NULL.
+ */
+const struct cred *nfs4_get_machine_cred(struct nfs_client *clp)
+{
+ return get_cred(rpc_machine_cred());
+}
+
+static void nfs4_root_machine_cred(struct nfs_client *clp)
+{
+
+ /*
+ * Force root creds instead of machine: drop the principal from
+ * both the nfs_client and its RPC client.
+ */
+ clp->cl_principal = NULL;
+ clp->cl_rpcclient->cl_principal = NULL;
+}
+
+/*
+ * Walk @server's state owners in rbtree order and return a new
+ * reference to the cred of the first owner that has any open state.
+ * Returns NULL when every owner's so_states list is empty.
+ */
+static const struct cred *
+nfs4_get_renew_cred_server_locked(struct nfs_server *server)
+{
+	struct rb_node *node;
+
+	for (node = rb_first(&server->state_owners); node != NULL;
+	     node = rb_next(node)) {
+		struct nfs4_state_owner *owner;
+
+		owner = rb_entry(node, struct nfs4_state_owner, so_server_node);
+		if (!list_empty(&owner->so_states))
+			return get_cred(owner->so_cred);
+	}
+	return NULL;
+}
+
+/**
+ * nfs4_get_renew_cred - Acquire credential for a renew operation
+ * @clp: client state handle
+ *
+ * The machine credential is preferred.  Failing that, the superblocks of
+ * @clp are searched for a state owner with open state and its cred is
+ * used.
+ *
+ * Returns a cred with reference count bumped, or NULL.
+ * Takes clp->cl_lock itself, so the caller must not hold it.
+ */
+const struct cred *nfs4_get_renew_cred(struct nfs_client *clp)
+{
+ const struct cred *cred = NULL;
+ struct nfs_server *server;
+
+ /* Use machine credentials if available */
+ cred = nfs4_get_machine_cred(clp);
+ if (cred != NULL)
+ goto out;
+
+ spin_lock(&clp->cl_lock);
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ cred = nfs4_get_renew_cred_server_locked(server);
+ if (cred != NULL)
+ break;
+ }
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+
+out:
+ return cred;
+}
+
+/*
+ * Stop draining @tbl.  If NFS4_SLOT_TBL_DRAINING was set it is cleared
+ * and the table's waiters are woken under slot_tbl_lock; a table that
+ * was not draining is left alone.
+ */
+static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl)
+{
+	if (!test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state))
+		return;
+
+	spin_lock(&tbl->slot_tbl_lock);
+	nfs41_wake_slot_table(tbl);
+	spin_unlock(&tbl->slot_tbl_lock);
+}
+
+/*
+ * Stop draining the client's slot tables.  A client with cl_slot_tbl set
+ * has only that table; otherwise, if a session exists, its back channel
+ * table is released first and then its fore channel table.
+ */
+static void nfs4_end_drain_session(struct nfs_client *clp)
+{
+	struct nfs4_session *ses = clp->cl_session;
+
+	if (clp->cl_slot_tbl) {
+		nfs4_end_drain_slot_table(clp->cl_slot_tbl);
+	} else if (ses != NULL) {
+		nfs4_end_drain_slot_table(&ses->bc_slot_table);
+		nfs4_end_drain_slot_table(&ses->fc_slot_table);
+	}
+}
+/*
+ * Mark @tbl as draining and, if any slot is still in use
+ * (highest_used_slotid != NFS4_NO_SLOT), wait interruptibly on
+ * tbl->complete.  The completion is re-armed while slot_tbl_lock is
+ * still held, before the lock is dropped and the wait begins.
+ *
+ * Returns zero if the table was already idle, otherwise the result of
+ * wait_for_completion_interruptible().
+ */
+static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl)
+{
+ set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
+ spin_lock(&tbl->slot_tbl_lock);
+ if (tbl->highest_used_slotid != NFS4_NO_SLOT) {
+ reinit_completion(&tbl->complete);
+ spin_unlock(&tbl->slot_tbl_lock);
+ return wait_for_completion_interruptible(&tbl->complete);
+ }
+ spin_unlock(&tbl->slot_tbl_lock);
+ return 0;
+}
+
+/*
+ * nfs4_begin_drain_session - start draining the client's slot tables
+ * @clp: client state handle
+ *
+ * A client with cl_slot_tbl set drains just that table.  Otherwise the
+ * session's back channel table is drained first and then the fore
+ * channel table.  A client with neither a cl_slot_tbl nor a session has
+ * nothing to drain and reports success; this mirrors the NULL check in
+ * nfs4_end_drain_session() instead of dereferencing a missing session.
+ *
+ * Returns zero, or the non-zero result of nfs4_drain_slot_tbl().
+ */
+static int nfs4_begin_drain_session(struct nfs_client *clp)
+{
+	struct nfs4_session *ses = clp->cl_session;
+	int ret;
+
+	if (clp->cl_slot_tbl)
+		return nfs4_drain_slot_tbl(clp->cl_slot_tbl);
+
+	/* No session: nothing to drain */
+	if (ses == NULL)
+		return 0;
+
+	/* back channel */
+	ret = nfs4_drain_slot_tbl(&ses->bc_slot_table);
+	if (ret)
+		return ret;
+	/* fore channel */
+	return nfs4_drain_slot_tbl(&ses->fc_slot_table);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Clear the LEASE_CONFIRM, SESSION_RESET and BIND_CONN_TO_SESSION state
+ * bits of @clp and set up state renewal.  The return value of
+ * nfs4_setup_state_renewal() is not checked.
+ */
+static void nfs41_finish_session_reset(struct nfs_client *clp)
+{
+ clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+ clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+ /* create_session negotiated new slot table */
+ clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+ nfs4_setup_state_renewal(clp);
+}
+/*
+ * nfs41_init_clientid - establish a client ID and a session
+ * @clp: client state handle
+ * @cred: credential passed to both RPCs
+ *
+ * Calls nfs4_proc_exchange_id() and then nfs4_proc_create_session().
+ * NFS4CLNT_LEASE_CONFIRM is set between the two steps, so a call that
+ * finds the bit already set goes straight to session creation.
+ *
+ * Once the session exists: reboot reclaim is started unless the exchange
+ * flags carry EXCHGID4_FLAG_CONFIRMED_R, the session-reset bookkeeping is
+ * finished, and the client is marked NFS_CS_READY.
+ *
+ * Returns zero, or the non-zero status of the step that failed.
+ */
+int nfs41_init_clientid(struct nfs_client *clp, const struct cred *cred)
+{
+ int status;
+
+ if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state))
+ goto do_confirm;
+ status = nfs4_proc_exchange_id(clp, cred);
+ if (status != 0)
+ goto out;
+ set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+do_confirm:
+ status = nfs4_proc_create_session(clp, cred);
+ if (status != 0)
+ goto out;
+ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R))
+ nfs4_state_start_reclaim_reboot(clp);
+ nfs41_finish_session_reset(clp);
+ nfs_mark_client_ready(clp, NFS_CS_READY);
+out:
+ return status;
+}
+
+/**
+ * nfs41_discover_server_trunking - Detect server IP address trunking (mv1)
+ *
+ * @clp: nfs_client under test
+ * @result: OUT: found nfs_client, or clp
+ * @cred: credential to use for trunking test
+ *
+ * Returns NFS4_OK, a negative errno, or a negative NFS4ERR status.
+ * If NFS4_OK is returned, an nfs_client pointer is planted in
+ * "result".
+ *
+ * When the client list walk selects a different client (*result != clp),
+ * zero is returned straight away.  Otherwise the state manager is
+ * scheduled and the function waits for client initialisation to
+ * complete; if that wait fails, nfs_put_client() is called on @clp
+ * before the error is returned.
+ *
+ * Note: The returned client may not yet be marked ready.
+ */
+int nfs41_discover_server_trunking(struct nfs_client *clp,
+ struct nfs_client **result,
+ const struct cred *cred)
+{
+ int status;
+
+ status = nfs4_proc_exchange_id(clp, cred);
+ if (status != NFS4_OK)
+ return status;
+
+ status = nfs41_walk_client_list(clp, result, cred);
+ if (status < 0)
+ return status;
+ if (clp != *result)
+ return 0;
+
+ /*
+ * Purge state if the client id was established in a prior
+ * instance and the client id could not have arrived on the
+ * server via Transparent State Migration.
+ */
+ if (clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R) {
+ if (!test_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags))
+ set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
+ else
+ set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+ }
+ nfs4_schedule_state_manager(clp);
+ status = nfs_wait_client_init_complete(clp);
+ if (status < 0)
+ nfs_put_client(clp);
+ return status;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_get_clid_cred - Acquire credential for a setclientid operation
+ * @clp: client state handle
+ *
+ * This is simply the machine credential as handed out by
+ * nfs4_get_machine_cred().
+ *
+ * Returns a cred with reference count bumped, or NULL.
+ */
+const struct cred *nfs4_get_clid_cred(struct nfs_client *clp)
+{
+	return nfs4_get_machine_cred(clp);
+}
+
+/*
+ * Look up the state owner for @cred in @server's rbtree, which is
+ * ordered by cred_fscmp().  On a hit the owner is taken off the LRU if
+ * it was parked there, its so_count is bumped, and it is returned.
+ * Returns NULL when no owner matches.
+ *
+ * The caller in this file, nfs4_get_state_owner(), holds clp->cl_lock
+ * across the call.
+ */
+static struct nfs4_state_owner *
+nfs4_find_state_owner_locked(struct nfs_server *server, const struct cred *cred)
+{
+	struct rb_node *node = server->state_owners.rb_node;
+
+	while (node != NULL) {
+		struct nfs4_state_owner *owner;
+		int order;
+
+		owner = rb_entry(node, struct nfs4_state_owner, so_server_node);
+		order = cred_fscmp(cred, owner->so_cred);
+		if (order == 0) {
+			if (!list_empty(&owner->so_lru))
+				list_del_init(&owner->so_lru);
+			atomic_inc(&owner->so_count);
+			return owner;
+		}
+		node = order < 0 ? node->rb_left : node->rb_right;
+	}
+	return NULL;
+}
+/*
+ * Insert @new into its server's rbtree of state owners, which is
+ * ordered by cred_fscmp().  If an owner with an equal cred is already
+ * present, that owner is taken off the LRU (if on it), has its so_count
+ * bumped and is returned instead; @new is then left untouched for the
+ * caller to dispose of.  Returns @new when it was inserted.
+ */
+static struct nfs4_state_owner *
+nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
+{
+ struct nfs_server *server = new->so_server;
+ struct rb_node **p = &server->state_owners.rb_node,
+ *parent = NULL;
+ struct nfs4_state_owner *sp;
+ int cmp;
+
+ while (*p != NULL) {
+ parent = *p;
+ sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
+ cmp = cred_fscmp(new->so_cred, sp->so_cred);
+
+ if (cmp < 0)
+ p = &parent->rb_left;
+ else if (cmp > 0)
+ p = &parent->rb_right;
+ else {
+ if (!list_empty(&sp->so_lru))
+ list_del_init(&sp->so_lru);
+ atomic_inc(&sp->so_count);
+ return sp;
+ }
+ }
+ rb_link_node(&new->so_server_node, parent, p);
+ rb_insert_color(&new->so_server_node, &server->state_owners);
+ return new;
+}
+
+/*
+ * Erase @sp from its server's rbtree of state owners.  An owner whose
+ * rbtree node is marked empty is not in the tree, so it is left alone.
+ */
+static void
+nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
+{
+	if (RB_EMPTY_NODE(&sp->so_server_node))
+		return;
+	rb_erase(&sp->so_server_node, &sp->so_server->state_owners);
+}
+/*
+ * Initialise the create time, flags, counter, lock, list and RPC wait
+ * queue of @sc.  The owner_id field is not touched here; callers assign
+ * it themselves.
+ */
+static void
+nfs4_init_seqid_counter(struct nfs_seqid_counter *sc)
+{
+ sc->create_time = ktime_get();
+ sc->flags = 0;
+ sc->counter = 0;
+ spin_lock_init(&sc->lock);
+ INIT_LIST_HEAD(&sc->list);
+ rpc_init_wait_queue(&sc->wait, "Seqid_waitqueue");
+}
+/*
+ * Tear down the RPC wait queue that nfs4_init_seqid_counter() set up
+ * in @sc.
+ */
+static void
+nfs4_destroy_seqid_counter(struct nfs_seqid_counter *sc)
+{
+ rpc_destroy_wait_queue(&sc->wait);
+}
+
+/*
+ * nfs4_alloc_state_owner(): this is called on the OPEN or CREATE path to
+ * create a new state_owner.
+ *
+ * The owner is zero-allocated with @gfp_flags, given an id from
+ * server->openowner_id, a reference on @cred and an initial so_count of
+ * one.  It is not inserted into the server's rbtree here.
+ *
+ * Returns NULL if either the allocation or the id reservation fails.
+ */
+static struct nfs4_state_owner *
+nfs4_alloc_state_owner(struct nfs_server *server,
+ const struct cred *cred,
+ gfp_t gfp_flags)
+{
+ struct nfs4_state_owner *sp;
+
+ sp = kzalloc(sizeof(*sp), gfp_flags);
+ if (!sp)
+ return NULL;
+ sp->so_seqid.owner_id = ida_simple_get(&server->openowner_id, 0, 0,
+ gfp_flags);
+ if (sp->so_seqid.owner_id < 0) {
+ kfree(sp);
+ return NULL;
+ }
+ sp->so_server = server;
+ sp->so_cred = get_cred(cred);
+ spin_lock_init(&sp->so_lock);
+ INIT_LIST_HEAD(&sp->so_states);
+ nfs4_init_seqid_counter(&sp->so_seqid);
+ atomic_set(&sp->so_count, 1);
+ INIT_LIST_HEAD(&sp->so_lru);
+ seqcount_spinlock_init(&sp->so_reclaim_seqcount, &sp->so_lock);
+ mutex_init(&sp->so_delegreturn_mutex);
+ return sp;
+}
+
+static void
+nfs4_reset_state_owner(struct nfs4_state_owner *sp)
+{
+ /* This state_owner is no longer usable, but must
+ * remain in place so that state recovery can find it
+ * and the opens associated with it.
+ * It may also be used for a new 'open' request to
+ * return a delegation to the server.
+ * So update the 'create_time' so that it looks like
+ * a new state_owner. This will cause the server to
+ * request an OPEN_CONFIRM to start a new sequence.
+ * Only so_seqid.create_time is changed here.
+ */
+ sp->so_seqid.create_time = ktime_get();
+}
+/*
+ * Release what nfs4_alloc_state_owner() acquired: the seqid counter, the
+ * cred reference and the open-owner id, then free @sp itself.
+ */
+static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
+{
+ nfs4_destroy_seqid_counter(&sp->so_seqid);
+ put_cred(sp->so_cred);
+ ida_simple_remove(&sp->so_server->openowner_id, sp->so_seqid.owner_id);
+ kfree(sp);
+}
+/*
+ * Garbage-collect cached state owners of @server.
+ *
+ * Under cl_lock, owners on server->state_owners_lru whose so_expires
+ * stamp is not within [now - cl_lease_time, now] are removed from the
+ * rbtree and moved to a private list.  The walk stops at the first owner
+ * that is still in range.  The collected owners are freed after the lock
+ * has been dropped.
+ */
+static void nfs4_gc_state_owners(struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state_owner *sp, *tmp;
+ unsigned long time_min, time_max;
+ LIST_HEAD(doomed);
+
+ spin_lock(&clp->cl_lock);
+ time_max = jiffies;
+ time_min = (long)time_max - (long)clp->cl_lease_time;
+ list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
+ /* NB: LRU is sorted so that oldest is at the head */
+ if (time_in_range(sp->so_expires, time_min, time_max))
+ break;
+ list_move(&sp->so_lru, &doomed);
+ nfs4_remove_state_owner_locked(sp);
+ }
+ spin_unlock(&clp->cl_lock);
+
+ list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
+ list_del(&sp->so_lru);
+ nfs4_free_state_owner(sp);
+ }
+}
+
+/**
+ * nfs4_get_state_owner - Look up a state owner given a credential
+ * @server: nfs_server to search
+ * @cred: RPC credential to match
+ * @gfp_flags: allocation mode
+ *
+ * An existing owner is returned with its so_count bumped.  Otherwise a
+ * new owner is allocated outside cl_lock and inserted under it; if an
+ * owner for the same cred was inserted in the meantime, that one is
+ * returned and the fresh allocation is freed.  The reference is dropped
+ * with nfs4_put_state_owner().
+ *
+ * nfs4_gc_state_owners() runs on every exit path, including failure.
+ *
+ * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
+ */
+struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
+ const struct cred *cred,
+ gfp_t gfp_flags)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state_owner *sp, *new;
+
+ spin_lock(&clp->cl_lock);
+ sp = nfs4_find_state_owner_locked(server, cred);
+ spin_unlock(&clp->cl_lock);
+ if (sp != NULL)
+ goto out;
+ new = nfs4_alloc_state_owner(server, cred, gfp_flags);
+ if (new == NULL)
+ goto out;
+ spin_lock(&clp->cl_lock);
+ sp = nfs4_insert_state_owner_locked(new);
+ spin_unlock(&clp->cl_lock);
+ if (sp != new)
+ nfs4_free_state_owner(new);
+out:
+ nfs4_gc_state_owners(server);
+ return sp;
+}
+
+/**
+ * nfs4_put_state_owner - Release a nfs4_state_owner
+ * @sp: state owner data to release
+ *
+ * Note that we keep released state owners on an LRU
+ * list.
+ * This caches valid state owners so that they can be
+ * reused, to avoid the OPEN_CONFIRM on minor version 0.
+ * It also pins the uniquifier of dropped state owners for
+ * a while, to ensure that those state owner names are
+ * never reused.
+ *
+ * When so_count drops to zero the owner is stamped with the current
+ * jiffies and appended to server->state_owners_lru under cl_lock.  It
+ * is not freed and not removed from the rbtree here.
+ */
+void nfs4_put_state_owner(struct nfs4_state_owner *sp)
+{
+ struct nfs_server *server = sp->so_server;
+ struct nfs_client *clp = server->nfs_client;
+
+ if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
+ return;
+
+ sp->so_expires = jiffies;
+ list_add_tail(&sp->so_lru, &server->state_owners_lru);
+ spin_unlock(&clp->cl_lock);
+}
+
+/**
+ * nfs4_purge_state_owners - Release all cached state owners
+ * @server: nfs_server with cached state owners to release
+ * @head: resulting list of state owners
+ *
+ * Called at umount time. Remaining state owners will be on
+ * the LRU with ref count of zero.
+ * Under cl_lock, every owner on the LRU is moved to @head and removed
+ * from the server's rbtree.
+ * Note that the state owners are not freed, but are added
+ * to the list @head, which can later be used as an argument
+ * to nfs4_free_state_owners.
+ */
+void nfs4_purge_state_owners(struct nfs_server *server, struct list_head *head)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state_owner *sp, *tmp;
+
+ spin_lock(&clp->cl_lock);
+ list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
+ list_move(&sp->so_lru, head);
+ nfs4_remove_state_owner_locked(sp);
+ }
+ spin_unlock(&clp->cl_lock);
+}
+
+/**
+ * nfs4_free_state_owners - Free a list of purged state owners
+ * @head: list of state owners to free, linked through so_lru
+ *
+ * Frees a list of state owners that was generated by
+ * nfs4_purge_state_owners.  No lock is taken here.
+ */
+void nfs4_free_state_owners(struct list_head *head)
+{
+ struct nfs4_state_owner *sp, *tmp;
+
+ list_for_each_entry_safe(sp, tmp, head, so_lru) {
+ list_del(&sp->so_lru);
+ nfs4_free_state_owner(sp);
+ }
+}
+/*
+ * Allocate a zeroed nfs4_state with GFP_NOFS and initialise its
+ * reference count (to one), lock_states list, state_lock, seqlock and
+ * wait queue.  The owner and inode are left for the caller to fill in.
+ * Returns NULL if the allocation fails.
+ */
+static struct nfs4_state *
+nfs4_alloc_open_state(void)
+{
+ struct nfs4_state *state;
+
+ state = kzalloc(sizeof(*state), GFP_NOFS);
+ if (!state)
+ return NULL;
+ refcount_set(&state->count, 1);
+ INIT_LIST_HEAD(&state->lock_states);
+ spin_lock_init(&state->state_lock);
+ seqlock_init(&state->seqlock);
+ init_waitqueue_head(&state->waitq);
+ return state;
+}
+/*
+ * Record @fmode as the open mode of @state; a no-op if it is unchanged.
+ * When the FMODE_WRITE bit changes, the state is also repositioned in
+ * its owner's so_states list: to the head when write access is gained,
+ * to the tail when it is lost.  __nfs4_close() calls this with
+ * owner->so_lock held.
+ */
+void
+nfs4_state_set_mode_locked(struct nfs4_state *state, fmode_t fmode)
+{
+ if (state->state == fmode)
+ return;
+ /* NB! List reordering - see the reclaim code for why. */
+ if ((fmode & FMODE_WRITE) != (state->state & FMODE_WRITE)) {
+ if (fmode & FMODE_WRITE)
+ list_move(&state->open_states, &state->owner->so_states);
+ else
+ list_move_tail(&state->open_states, &state->owner->so_states);
+ }
+ state->state = fmode;
+}
+
+/*
+ * Search the inode's open_states list for a state that belongs to
+ * @owner, passes nfs4_valid_open_stateid(), and whose reference count
+ * could be raised from a non-zero value.  Returns that state with the
+ * extra reference held, or NULL.
+ *
+ * The list is walked with the RCU iterator; callers in this file hold
+ * either rcu_read_lock() or inode->i_lock.
+ */
+static struct nfs4_state *
+__nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs4_state *pos;
+
+	list_for_each_entry_rcu(pos, &nfsi->open_states, inode_states) {
+		if (pos->owner == owner &&
+		    nfs4_valid_open_stateid(pos) &&
+		    refcount_inc_not_zero(&pos->count))
+			return pos;
+	}
+	return NULL;
+}
+/*
+ * Free @state through kfree_rcu().  The inode's open_states list is
+ * walked with the RCU list iterator in __nfs4_find_state_byowner().
+ */
+static void
+nfs4_free_open_state(struct nfs4_state *state)
+{
+ kfree_rcu(state, rcu_head);
+}
+/*
+ * Find or create the open state for (@inode, @owner).
+ *
+ * A lockless RCU lookup is tried first.  On a miss, a candidate is
+ * allocated and the lookup is repeated under owner->so_lock and
+ * inode->i_lock.  If it still misses, the candidate is linked into both
+ * the inode's and the owner's lists, taking a reference on the owner
+ * and on the inode.  If the second lookup hits, the candidate is freed.
+ *
+ * Returns the state with a reference held, or NULL when nothing was
+ * found and the allocation failed.
+ */
+struct nfs4_state *
+nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner)
+{
+ struct nfs4_state *state, *new;
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ rcu_read_lock();
+ state = __nfs4_find_state_byowner(inode, owner);
+ rcu_read_unlock();
+ if (state)
+ goto out;
+ new = nfs4_alloc_open_state();
+ spin_lock(&owner->so_lock);
+ spin_lock(&inode->i_lock);
+ state = __nfs4_find_state_byowner(inode, owner);
+ if (state == NULL && new != NULL) {
+ state = new;
+ state->owner = owner;
+ atomic_inc(&owner->so_count);
+ ihold(inode);
+ state->inode = inode;
+ list_add_rcu(&state->inode_states, &nfsi->open_states);
+ spin_unlock(&inode->i_lock);
+ /* Note: The reclaim code dictates that we add stateless
+ * and read-only stateids to the end of the list */
+ list_add_tail(&state->open_states, &owner->so_states);
+ spin_unlock(&owner->so_lock);
+ } else {
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&owner->so_lock);
+ if (new)
+ nfs4_free_open_state(new);
+ }
+out:
+ return state;
+}
+/*
+ * Drop a reference on @state.  When the last reference goes, the state
+ * is unlinked from the inode's and the owner's lists under
+ * owner->so_lock and inode->i_lock.  After the locks are released,
+ * nfs4_inode_return_delegation_on_close() is called, the inode
+ * reference is dropped, the state is freed and the owner reference is
+ * put.
+ */
+void nfs4_put_open_state(struct nfs4_state *state)
+{
+ struct inode *inode = state->inode;
+ struct nfs4_state_owner *owner = state->owner;
+
+ if (!refcount_dec_and_lock(&state->count, &owner->so_lock))
+ return;
+ spin_lock(&inode->i_lock);
+ list_del_rcu(&state->inode_states);
+ list_del(&state->open_states);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&owner->so_lock);
+ nfs4_inode_return_delegation_on_close(inode);
+ iput(inode);
+ nfs4_free_open_state(state);
+ nfs4_put_state_owner(owner);
+}
+
+/*
+ * Close the current file.
+ *
+ * Drops one open of mode @fmode from the counters of @state and works
+ * out which open modes are still needed.  If a mode is no longer needed
+ * while a matching NFS_O_*_STATE flag is still set, nfs4_do_close() is
+ * called with @gfp_mask and @wait.  Otherwise the references on the
+ * state and on the owner are simply dropped.
+ *
+ * NFS_DELEGATED_STATE is cleared once no open mode remains.
+ * NOTE(review): on the nfs4_do_close() path the owner reference taken at
+ * the top is presumably consumed by that call - confirm.
+ */
+static void __nfs4_close(struct nfs4_state *state,
+ fmode_t fmode, gfp_t gfp_mask, int wait)
+{
+ struct nfs4_state_owner *owner = state->owner;
+ int call_close = 0;
+ fmode_t newstate;
+
+ atomic_inc(&owner->so_count);
+ /* Protect against nfs4_find_state() */
+ spin_lock(&owner->so_lock);
+ switch (fmode & (FMODE_READ | FMODE_WRITE)) {
+ case FMODE_READ:
+ state->n_rdonly--;
+ break;
+ case FMODE_WRITE:
+ state->n_wronly--;
+ break;
+ case FMODE_READ|FMODE_WRITE:
+ state->n_rdwr--;
+ }
+ newstate = FMODE_READ|FMODE_WRITE;
+ if (state->n_rdwr == 0) {
+ if (state->n_rdonly == 0) {
+ newstate &= ~FMODE_READ;
+ call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags);
+ call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
+ }
+ if (state->n_wronly == 0) {
+ newstate &= ~FMODE_WRITE;
+ call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags);
+ call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags);
+ }
+ if (newstate == 0)
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ }
+ nfs4_state_set_mode_locked(state, newstate);
+ spin_unlock(&owner->so_lock);
+
+ if (!call_close) {
+ nfs4_put_open_state(state);
+ nfs4_put_state_owner(owner);
+ } else
+ nfs4_do_close(state, gfp_mask, wait);
+}
+/*
+ * Drop one open of mode @fmode from @state via __nfs4_close(), using
+ * GFP_NOFS and wait = 0.
+ */
+void nfs4_close_state(struct nfs4_state *state, fmode_t fmode)
+{
+ __nfs4_close(state, fmode, GFP_NOFS, 0);
+}
+/*
+ * Drop one open of mode @fmode from @state via __nfs4_close(), using
+ * GFP_KERNEL and wait = 1.
+ */
+void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
+{
+ __nfs4_close(state, fmode, GFP_KERNEL, 1);
+}
+
+/*
+ * Search the state->lock_states for an existing lock_owner
+ * that is compatible with either of the given owners.
+ * If the second is non-zero, then the first refers to a Posix-lock
+ * owner (current->files) and the second refers to a flock/OFD
+ * owner (struct file*). In that case, prefer a match for the first
+ * owner.
+ * If both sorts of locks are held on the one file we cannot know
+ * which stateid was intended to be used, so a "correct" choice cannot
+ * be made. Failing that, a "consistent" choice is preferable. The
+ * consistent choice we make is to prefer the first owner, that of a
+ * Posix lock.
+ *
+ * The walk stops at the first match for fl_owner.  If only fl_owner2
+ * matches, the last such entry in the list is used.  The returned lock
+ * state has its ls_count raised; NULL is returned when nothing matches.
+ * Callers in this file hold state->state_lock.
+ */
+static struct nfs4_lock_state *
+__nfs4_find_lock_state(struct nfs4_state *state,
+ fl_owner_t fl_owner, fl_owner_t fl_owner2)
+{
+ struct nfs4_lock_state *pos, *ret = NULL;
+ list_for_each_entry(pos, &state->lock_states, ls_locks) {
+ if (pos->ls_owner == fl_owner) {
+ ret = pos;
+ break;
+ }
+ if (pos->ls_owner == fl_owner2)
+ ret = pos;
+ }
+ if (ret)
+ refcount_inc(&ret->ls_count);
+ return ret;
+}
+
+/*
+ * Allocate a new lock_state for @fl_owner on @state.
+ *
+ * The structure is zero-allocated with GFP_NOFS, given an id from
+ * server->lockowner_id and an initial ls_count of one.  It is not added
+ * to state->lock_states here; nfs4_get_lock_state() does that.
+ *
+ * The owner id is reserved before the seqid counter is initialised, as
+ * nfs4_alloc_state_owner() does, so that the failure path has nothing
+ * to tear down beyond the allocation itself.
+ *
+ * Returns NULL if either the allocation or the id reservation fails.
+ */
+static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+{
+	struct nfs4_lock_state *lsp;
+	struct nfs_server *server = state->owner->so_server;
+
+	lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
+	if (lsp == NULL)
+		return NULL;
+	lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
+	if (lsp->ls_seqid.owner_id < 0) {
+		kfree(lsp);
+		return NULL;
+	}
+	/* Does not touch owner_id, so the id reserved above is kept */
+	nfs4_init_seqid_counter(&lsp->ls_seqid);
+	refcount_set(&lsp->ls_count, 1);
+	lsp->ls_state = state;
+	lsp->ls_owner = fl_owner;
+	INIT_LIST_HEAD(&lsp->ls_locks);
+	return lsp;
+}
+/*
+ * Return the lock-owner id of @lsp to server->lockowner_id, destroy its
+ * seqid counter and free it.
+ */
+void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
+{
+ ida_simple_remove(&server->lockowner_id, lsp->ls_seqid.owner_id);
+ nfs4_destroy_seqid_counter(&lsp->ls_seqid);
+ kfree(lsp);
+}
+
+/*
+ * Return a compatible lock_state. If no initialized lock_state structure
+ * exists, return an uninitialized one.
+ *
+ * The lookup runs under state->state_lock.  On a miss the lock is
+ * dropped, a candidate is allocated, and the lookup is retried.  If it
+ * misses again the candidate is added to state->lock_states and
+ * LK_STATE_IN_USE is set.  If the retry hits, the unused candidate is
+ * freed.
+ *
+ * Returns NULL only when the allocation fails.
+ */
+static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
+{
+ struct nfs4_lock_state *lsp, *new = NULL;
+
+ for(;;) {
+ spin_lock(&state->state_lock);
+ lsp = __nfs4_find_lock_state(state, owner, NULL);
+ if (lsp != NULL)
+ break;
+ if (new != NULL) {
+ list_add(&new->ls_locks, &state->lock_states);
+ set_bit(LK_STATE_IN_USE, &state->flags);
+ lsp = new;
+ new = NULL;
+ break;
+ }
+ spin_unlock(&state->state_lock);
+ new = nfs4_alloc_lock_state(state, owner);
+ if (new == NULL)
+ return NULL;
+ }
+ spin_unlock(&state->state_lock);
+ if (new != NULL)
+ nfs4_free_lock_state(state->owner->so_server, new);
+ return lsp;
+}
+
+/*
+ * Release reference to lock_state, and free it if we see that
+ * it is no longer in use
+ *
+ * A NULL @lsp is accepted and ignored.  When the last reference goes,
+ * the lock state is unlinked under state->state_lock and
+ * LK_STATE_IN_USE is cleared if the list became empty.  A lock state
+ * flagged NFS_LOCK_INITIALIZED is released through the minor version's
+ * free_lock_state operation; any other is freed with
+ * nfs4_free_lock_state().
+ */
+void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
+{
+ struct nfs_server *server;
+ struct nfs4_state *state;
+
+ if (lsp == NULL)
+ return;
+ state = lsp->ls_state;
+ if (!refcount_dec_and_lock(&lsp->ls_count, &state->state_lock))
+ return;
+ list_del(&lsp->ls_locks);
+ if (list_empty(&state->lock_states))
+ clear_bit(LK_STATE_IN_USE, &state->flags);
+ spin_unlock(&state->state_lock);
+ server = state->owner->so_server;
+ if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
+ struct nfs_client *clp = server->nfs_client;
+
+ clp->cl_mvops->free_lock_state(server, lsp);
+ } else
+ nfs4_free_lock_state(server, lsp);
+}
+/*
+ * fl_copy_lock hook: make @dst share the nfs4_lock_state of @src and
+ * take an extra reference on it.
+ */
+static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
+{
+ struct nfs4_lock_state *lsp = src->fl_u.nfs4_fl.owner;
+
+ dst->fl_u.nfs4_fl.owner = lsp;
+ refcount_inc(&lsp->ls_count);
+}
+
+/*
+ * ->fl_release_private hook: drop the reference this file_lock holds on its
+ * NFSv4 lock state.
+ */
+static void nfs4_fl_release_lock(struct file_lock *fl)
+{
+ nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner);
+}
+
+/*
+ * file_lock callbacks installed by nfs4_set_lock_state(); they keep the
+ * lock state's reference count in step with file_lock copies and releases.
+ */
+static const struct file_lock_operations nfs4_fl_lock_ops = {
+ .fl_copy_lock = nfs4_fl_copy_lock,
+ .fl_release_private = nfs4_fl_release_lock,
+};
+
+/*
+ * Bind an NFSv4 lock state to @fl.
+ *
+ * A file_lock that already has fl_ops installed is left untouched. Otherwise
+ * the lock state for fl->fl_owner is looked up (or created) and recorded in
+ * the lock together with the NFSv4 file_lock operations.
+ *
+ * Returns 0 on success, or -ENOMEM if no lock state could be obtained.
+ */
+int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
+{
+	struct nfs4_lock_state *owner_state;
+
+	if (fl->fl_ops == NULL) {
+		owner_state = nfs4_get_lock_state(state, fl->fl_owner);
+		if (!owner_state)
+			return -ENOMEM;
+		fl->fl_u.nfs4_fl.owner = owner_state;
+		fl->fl_ops = &nfs4_fl_lock_ops;
+	}
+	return 0;
+}
+
+/*
+ * Copy the lock stateid that matches the lock context's owners into @dst.
+ *
+ * Returns 0 if an initialized lock stateid was copied, -EIO if the matching
+ * lock state is flagged NFS_LOCK_LOST, and -ENOENT otherwise (no lock
+ * context, no lock states in use, or no initialized match). @dst is only
+ * written in the 0 case.
+ */
+static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
+ struct nfs4_state *state,
+ const struct nfs_lock_context *l_ctx)
+{
+ struct nfs4_lock_state *lsp;
+ fl_owner_t fl_owner, fl_flock_owner;
+ int ret = -ENOENT;
+
+ if (l_ctx == NULL)
+ goto out;
+
+ /* Cheap unlocked check: skip the lookup when no lock states exist */
+ if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
+ goto out;
+
+ fl_owner = l_ctx->lockowner;
+ fl_flock_owner = l_ctx->open_context->flock_owner;
+
+ spin_lock(&state->state_lock);
+ lsp = __nfs4_find_lock_state(state, fl_owner, fl_flock_owner);
+ if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
+ ret = -EIO;
+ else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
+ nfs4_stateid_copy(dst, &lsp->ls_stateid);
+ ret = 0;
+ }
+ spin_unlock(&state->state_lock);
+ /*
+ * Balances the lookup above; lsp may be NULL here.
+ * NOTE(review): presumably __nfs4_find_lock_state() returns a counted
+ * reference - it is defined outside this view, confirm.
+ */
+ nfs4_put_lock_state(lsp);
+out:
+ return ret;
+}
+
+/*
+ * Take a consistent snapshot of the open stateid.
+ *
+ * Under the state's seqlock read side, copy state->open_stateid into @dst
+ * if NFS_OPEN_STATE is set, or the zero stateid otherwise; the copy is
+ * repeated until the sequence was stable across the read.
+ *
+ * Returns true if the open stateid was copied, false if the zero stateid
+ * was used.
+ */
+bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
+{
+	const nfs4_stateid *from;
+	bool have_open;
+	int seq;
+
+	do {
+		seq = read_seqbegin(&state->seqlock);
+		have_open = test_bit(NFS_OPEN_STATE, &state->flags);
+		from = have_open ? &state->open_stateid : &zero_stateid;
+		nfs4_stateid_copy(dst, from);
+	} while (read_seqretry(&state->seqlock, seq));
+	return have_open;
+}
+
+/*
+ * Byte-range lock aware utility to initialize the stateid of read/write
+ * requests.
+ *
+ * Candidates are tried in this order: lock stateid, delegation stateid,
+ * open stateid. If @cred is non-NULL, *cred is reset to NULL before the
+ * delegation lookup, which may then fill it in.
+ *
+ * Returns 0 when a stateid was selected, -EIO if the open stateid is not
+ * valid or the matching lock was lost, and -EAGAIN if only the zero stateid
+ * could be copied.
+ */
+int nfs4_select_rw_stateid(struct nfs4_state *state,
+ fmode_t fmode, const struct nfs_lock_context *l_ctx,
+ nfs4_stateid *dst, const struct cred **cred)
+{
+ int ret;
+
+ /* Early exit: @dst and *cred are left untouched on this path */
+ if (!nfs4_valid_open_stateid(state))
+ return -EIO;
+ if (cred != NULL)
+ *cred = NULL;
+ ret = nfs4_copy_lock_stateid(dst, state, l_ctx);
+ if (ret == -EIO)
+ /* A lost lock - don't even consider delegations */
+ goto out;
+ /* returns true if delegation stateid found and copied */
+ if (nfs4_copy_delegation_stateid(state->inode, fmode, dst, cred)) {
+ ret = 0;
+ goto out;
+ }
+ if (ret != -ENOENT)
+ /* nfs4_copy_delegation_stateid() didn't over-write
+ * dst, so it still has the lock stateid which we now
+ * choose to use.
+ */
+ goto out;
+ ret = nfs4_copy_open_stateid(dst, state) ? 0 : -EAGAIN;
+out:
+ /* Servers with NFS_CAP_STATEID_NFSV41 get a zeroed seqid in @dst */
+ if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41))
+ dst->seqid = 0;
+ return ret;
+}
+
+/*
+ * Allocate a seqid tracker bound to @counter.
+ *
+ * The new object starts off-queue (empty list node) and with no RPC task
+ * attached. Returns the object, or ERR_PTR(-ENOMEM) - never NULL.
+ */
+struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask)
+{
+	struct nfs_seqid *seqid = kmalloc(sizeof(*seqid), gfp_mask);
+
+	if (!seqid)
+		return ERR_PTR(-ENOMEM);
+
+	seqid->task = NULL;
+	seqid->sequence = counter;
+	INIT_LIST_HEAD(&seqid->list);
+	return seqid;
+}
+
+/*
+ * Take @seqid off its sequence queue and wake the task of the entry that is
+ * now first in line, if any.
+ *
+ * Does nothing for a NULL @seqid or one that is not currently queued.
+ */
+void nfs_release_seqid(struct nfs_seqid *seqid)
+{
+ struct nfs_seqid_counter *sequence;
+
+ /* The list_empty() test is made before taking sequence->lock */
+ if (seqid == NULL || list_empty(&seqid->list))
+ return;
+ sequence = seqid->sequence;
+ spin_lock(&sequence->lock);
+ /* _init variant so a later list_empty(&seqid->list) sees "not queued" */
+ list_del_init(&seqid->list);
+ if (!list_empty(&sequence->list)) {
+ struct nfs_seqid *next;
+
+ next = list_first_entry(&sequence->list,
+ struct nfs_seqid, list);
+ rpc_wake_up_queued_task(&sequence->wait, next->task);
+ }
+ spin_unlock(&sequence->lock);
+}
+
+/*
+ * Dequeue @seqid (if it is queued) and free it. nfs_release_seqid() returns
+ * early for a NULL @seqid.
+ */
+void nfs_free_seqid(struct nfs_seqid *seqid)
+{
+ nfs_release_seqid(seqid);
+ kfree(seqid);
+}
+
+/*
+ * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or
+ * failed with a seqid incrementing error -
+ * see comments nfs4.h:seqid_mutating_error()
+ *
+ * @status: result of the operation (0 or a negative error)
+ * @seqid: must be non-NULL; its ->sequence counter is the one bumped
+ *
+ * The errors listed in the switch leave the counter untouched. A BAD_SEQID
+ * on a sequence flagged NFS_SEQID_CONFIRMED returns silently; on an
+ * unconfirmed sequence it is logged (rate-limited) and likewise leaves the
+ * counter untouched. Every other status increments the counter.
+ */
+static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
+{
+	switch (status) {
+	case 0:
+		break;
+	case -NFS4ERR_BAD_SEQID:
+		if (seqid->sequence->flags & NFS_SEQID_CONFIRMED)
+			return;
+		pr_warn_ratelimited("NFS: v4 server returned a bad"
+				" sequence-id error on an"
+				" unconfirmed sequence %p!\n",
+				seqid->sequence);
+		/* Deliberate: BAD_SEQID does not mutate the seqid either */
+		fallthrough;
+	case -NFS4ERR_STALE_CLIENTID:
+	case -NFS4ERR_STALE_STATEID:
+	case -NFS4ERR_BAD_STATEID:
+	case -NFS4ERR_BADXDR:
+	case -NFS4ERR_RESOURCE:
+	case -NFS4ERR_NOFILEHANDLE:
+	case -NFS4ERR_MOVED:
+		/* Non-seqid mutating errors */
+		return;
+	}
+	/*
+	 * Note: no locking needed as we are guaranteed to be first
+	 * on the sequence list
+	 */
+	seqid->sequence->counter++;
+}
+
+/*
+ * Account for the outcome of an open-owner sequenced operation.
+ *
+ * A NULL @seqid is ignored. On -NFS4ERR_BAD_SEQID the owning state owner is
+ * reset. The counter itself is only advanced when the client has no
+ * session.
+ */
+void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
+{
+ struct nfs4_state_owner *sp;
+
+ if (seqid == NULL)
+ return;
+
+ /* The counter is embedded in the state owner as so_seqid */
+ sp = container_of(seqid->sequence, struct nfs4_state_owner, so_seqid);
+ if (status == -NFS4ERR_BAD_SEQID)
+ nfs4_reset_state_owner(sp);
+ if (!nfs4_has_session(sp->so_server->nfs_client))
+ nfs_increment_seqid(status, seqid);
+}
+
+/*
+ * Lock-owner flavour of the seqid bump: after a LOCK/LOCKU completes,
+ * advance the counter unless the status is one of the non-mutating errors
+ * (see nfs4.h:seqid_mutating_error()). A NULL @seqid is a no-op.
+ */
+void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
+{
+	if (seqid == NULL)
+		return;
+
+	nfs_increment_seqid(status, seqid);
+}
+
+/*
+ * Queue @task behind earlier users of the same seqid counter.
+ *
+ * @seqid is appended to its counter's list if it is not queued yet. If it
+ * is then the first entry, the caller may proceed and 0 is returned.
+ * Otherwise @task is put to sleep on the counter's wait queue and -EAGAIN
+ * is returned. A NULL @seqid returns 0 immediately.
+ */
+int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
+{
+	struct nfs_seqid_counter *counter;
+	int ret = 0;
+
+	if (!seqid)
+		return 0;
+
+	counter = seqid->sequence;
+	spin_lock(&counter->lock);
+	seqid->task = task;
+	if (list_empty(&seqid->list))
+		list_add_tail(&seqid->list, &counter->list);
+	if (list_first_entry(&counter->list, struct nfs_seqid, list) != seqid) {
+		rpc_sleep_on(&counter->wait, task, NULL);
+		ret = -EAGAIN;
+	}
+	spin_unlock(&counter->lock);
+	return ret;
+}
+
+static int nfs4_run_state_manager(void *);
+
+/*
+ * Clear NFS4CLNT_MANAGER_RUNNING with memory barriers on both sides, then
+ * wake anyone waiting on that bit and every task queued on cl_rpcwaitq.
+ */
+static void nfs4_clear_state_manager_bit(struct nfs_client *clp)
+{
+ smp_mb__before_atomic();
+ clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
+ smp_mb__after_atomic();
+ wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING);
+ rpc_wake_up(&clp->cl_rpcwaitq);
+}
+
+/*
+ * Schedule the nfs_client asynchronous state management routine
+ *
+ * Always sets NFS4CLNT_RUN_MANAGER. A new "<peer address>-manager" kthread
+ * is started only if NFS4CLNT_MANAGER_RUNNING was not already set. If the
+ * kthread cannot be created, the references and state bits taken here are
+ * rolled back.
+ */
+void nfs4_schedule_state_manager(struct nfs_client *clp)
+{
+ struct task_struct *task;
+ char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];
+ struct rpc_clnt *clnt = clp->cl_rpcclient;
+ bool swapon = false;
+
+ set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+
+ /*
+ * cl_swapper is non-zero (presumably swap over NFS is active -
+ * confirm): if NFS4CLNT_MANAGER_AVAILABLE was already set, just wake
+ * waiters on cl_state and return; otherwise the bit is set here and we
+ * go on to start the thread.
+ */
+ if (atomic_read(&clnt->cl_swapper)) {
+ swapon = !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE,
+ &clp->cl_state);
+ if (!swapon) {
+ wake_up_var(&clp->cl_state);
+ return;
+ }
+ }
+
+ /* Bit already set: nothing more to start from here */
+ if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
+ return;
+
+ /*
+ * Module and client references taken for the new thread; they are
+ * dropped below if the thread cannot be started.
+ * NOTE(review): presumably nfs4_run_state_manager() drops them on
+ * exit - not visible here, confirm.
+ */
+ __module_get(THIS_MODULE);
+ refcount_inc(&clp->cl_count);
+
+ /* The rcu_read_lock() is not strictly necessary, as the state
+ * manager is the only thread that ever changes the rpc_xprt
+ * after it's initialized. At this point, we're single threaded. */
+ rcu_read_lock();
+ snprintf(buf, sizeof(buf), "%s-manager",
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+ rcu_read_unlock();
+ task = kthread_run(nfs4_run_state_manager, clp, "%s", buf);
+ if (IS_ERR(task)) {
+ printk(KERN_ERR "%s: kthread_run: %ld\n",
+ __func__, PTR_ERR(task));
+ /* A client still initialising is marked ready with the error */
+ if (!nfs_client_init_is_complete(clp))
+ nfs_mark_client_ready(clp, PTR_ERR(task));
+ /* Only undo MANAGER_AVAILABLE if this call was the one to set it */
+ if (swapon)
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+ nfs4_clear_state_manager_bit(clp);
+ nfs_put_client(clp);
+ module_put(THIS_MODULE);
+ }
+}
+
+/*
+ * Schedule a lease recovery attempt
+ *
+ * A NULL @clp is ignored. NFS4CLNT_CHECK_LEASE is only set when the lease
+ * is not already flagged NFS4CLNT_LEASE_EXPIRED; the state manager is
+ * scheduled in either case.
+ */
+void nfs4_schedule_lease_recovery(struct nfs_client *clp)
+{
+ if (!clp)
+ return;
+ if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+ set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+ dprintk("%s: scheduling lease recovery for server %s\n", __func__,
+ clp->cl_hostname);
+ nfs4_schedule_state_manager(clp);
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery);
+
+/**
+ * nfs4_schedule_migration_recovery - trigger migration recovery
+ *
+ * @server: FSID that is migrating
+ *
+ * Marks @server as NFS_MIG_IN_TRANSITION, sets NFS4CLNT_MOVED on its client
+ * and schedules the state manager.
+ *
+ * Returns zero if recovery has started, otherwise a negative NFS4ERR
+ * value is returned (-NFS4ERR_IO when the server does not use persistent
+ * file handles or when an earlier migration attempt already failed).
+ */
+int nfs4_schedule_migration_recovery(const struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+
+ if (server->fh_expire_type != NFS4_FH_PERSISTENT) {
+ pr_err("NFS: volatile file handles not supported (server %s)\n",
+ clp->cl_hostname);
+ return -NFS4ERR_IO;
+ }
+
+ /* A previous migration attempt failed for this server: do not retry */
+ if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+ return -NFS4ERR_IO;
+
+ dprintk("%s: scheduling migration recovery for (%llx:%llx) on %s\n",
+ __func__,
+ (unsigned long long)server->fsid.major,
+ (unsigned long long)server->fsid.minor,
+ clp->cl_hostname);
+
+ /* The cast drops the const qualifier so mig_status can be updated */
+ set_bit(NFS_MIG_IN_TRANSITION,
+ &((struct nfs_server *)server)->mig_status);
+ set_bit(NFS4CLNT_MOVED, &clp->cl_state);
+
+ nfs4_schedule_state_manager(clp);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_migration_recovery);
+
+/**
+ * nfs4_schedule_lease_moved_recovery - start lease-moved recovery
+ *
+ * @clp: server to check for moved leases
+ *
+ * Sets NFS4CLNT_LEASE_MOVED on @clp and schedules the state manager.
+ */
+void nfs4_schedule_lease_moved_recovery(struct nfs_client *clp)
+{
+ dprintk("%s: scheduling lease-moved recovery for client ID %llx on %s\n",
+ __func__, clp->cl_clientid, clp->cl_hostname);
+
+ set_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state);
+ nfs4_schedule_state_manager(clp);
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_lease_moved_recovery);
+
+/*
+ * Sleep (killably) until NFS4CLNT_MANAGER_RUNNING clears.
+ *
+ * A reference on @clp is held for the duration of the wait. Returns the
+ * non-zero result of the wait if it did not complete normally; otherwise
+ * returns clp->cl_cons_state when that is negative, else 0.
+ */
+int nfs4_wait_clnt_recover(struct nfs_client *clp)
+{
+	int err;
+
+	might_sleep();
+
+	refcount_inc(&clp->cl_count);
+	err = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
+				 nfs_wait_bit_killable, TASK_KILLABLE);
+	if (err == 0 && clp->cl_cons_state < 0)
+		err = clp->cl_cons_state;
+	nfs_put_client(clp);
+	return err;
+}
+
+/*
+ * Wait for the lease to be usable, kicking the state manager as needed.
+ *
+ * Makes up to NFS4_MAX_LOOP_ON_RECOVER attempts. Each one waits for the
+ * state manager to finish; if neither NFS4CLNT_LEASE_EXPIRED nor
+ * NFS4CLNT_CHECK_LEASE is set afterwards the function returns 0, otherwise
+ * the state manager is scheduled again.
+ *
+ * Returns 0 on success, the error from nfs4_wait_clnt_recover(), or -EIO
+ * when every attempt was used up.
+ */
+int nfs4_client_recover_expired_lease(struct nfs_client *clp)
+{
+ unsigned int loop;
+ int ret;
+
+ for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
+ ret = nfs4_wait_clnt_recover(clp);
+ if (ret != 0)
+ break;
+ if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
+ !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
+ break;
+ nfs4_schedule_state_manager(clp);
+ /* Value returned if this turns out to be the last iteration */
+ ret = -EIO;
+ }
+ return ret;
+}
+
+/*
+ * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
+ * @clp: client to process
+ *
+ * Set the NFS4CLNT_LEASE_EXPIRED state in order to force a
+ * resend of the SETCLIENTID and hence re-establish the
+ * callback channel. Then return all existing delegations.
+ *
+ * This helper only sets the flag and expires the delegations; it does not
+ * schedule the state manager itself.
+ */
+static void nfs40_handle_cb_pathdown(struct nfs_client *clp)
+{
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ nfs_expire_all_delegations(clp);
+ dprintk("%s: handling CB_PATHDOWN recovery for server %s\n", __func__,
+ clp->cl_hostname);
+}
+
+/*
+ * Run the CB_PATH_DOWN handling for @clp and then schedule the state
+ * manager.
+ */
+void nfs4_schedule_path_down_recovery(struct nfs_client *clp)
+{
+ nfs40_handle_cb_pathdown(clp);
+ nfs4_schedule_state_manager(clp);
+}
+
+/*
+ * Flag @state, its owner and @clp for reboot reclaim.
+ *
+ * Returns 1 if the state was marked, 0 if it was skipped because its open
+ * stateid is not valid or because it is already marked for no-grace
+ * reclaim.
+ */
+static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
+{
+
+ if (!nfs4_valid_open_stateid(state))
+ return 0;
+ /* Set first, then re-check NOGRACE and back out if it is set */
+ set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+ /* Don't recover state that expired before the reboot */
+ if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) {
+ clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+ return 0;
+ }
+ set_bit(NFS_OWNER_RECLAIM_REBOOT, &state->owner->so_flags);
+ set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+ return 1;
+}
+
+/*
+ * Flag @state, its owner and @clp for no-grace reclaim, cancelling any
+ * pending reboot-reclaim mark on the state.
+ *
+ * Returns 1 if the state was marked, 0 if its open stateid is not valid.
+ */
+int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
+{
+ if (!nfs4_valid_open_stateid(state))
+ return 0;
+ set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
+ clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+ set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags);
+ set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
+ return 1;
+}
+
+/*
+ * Mark @state for no-grace reclaim, ask the delegation code to recover any
+ * delegation matching state->stateid, and schedule the state manager.
+ *
+ * Returns 0 on success, or -EBADF if the state could not be marked (its
+ * open stateid is not valid).
+ */
+int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state)
+{
+ struct nfs_client *clp = server->nfs_client;
+
+ if (!nfs4_state_mark_reclaim_nograce(clp, state))
+ return -EBADF;
+ nfs_inode_find_delegation_state_and_recover(state->inode,
+ &state->stateid);
+ dprintk("%s: scheduling stateid recovery for server %s\n", __func__,
+ clp->cl_hostname);
+ nfs4_schedule_state_manager(clp);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery);
+
+/*
+ * Scan the lock states attached to @state and return the first one that is
+ * initialized and whose stateid matches @stateid (or is older), or NULL if
+ * there is none. No reference is taken on the result. The only caller in
+ * view holds state->state_lock around the call.
+ */
+static struct nfs4_lock_state *
+nfs_state_find_lock_state_by_stateid(struct nfs4_state *state,
+		const nfs4_stateid *stateid)
+{
+	struct nfs4_lock_state *lsp;
+
+	list_for_each_entry(lsp, &state->lock_states, ls_locks) {
+		if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) &&
+		    nfs4_stateid_match_or_older(&lsp->ls_stateid, stateid))
+			return lsp;
+	}
+	return NULL;
+}
+
+/*
+ * Report whether any initialized lock state of @state matches @stateid.
+ *
+ * The lock-state list is only searched (under state->state_lock) when
+ * LK_STATE_IN_USE is set; otherwise the answer is false straight away.
+ */
+static bool nfs_state_lock_state_matches_stateid(struct nfs4_state *state,
+		const nfs4_stateid *stateid)
+{
+	bool matched;
+
+	if (!test_bit(LK_STATE_IN_USE, &state->flags))
+		return false;
+
+	spin_lock(&state->state_lock);
+	matched = nfs_state_find_lock_state_by_stateid(state, stateid) != NULL;
+	spin_unlock(&state->state_lock);
+	return matched;
+}
+
+/*
+ * Find every open state on @inode that @stateid refers to and mark it for
+ * no-grace reclaim.
+ *
+ * For each open context the state is checked, in order, against its
+ * stateid, its open stateid (only if NFS_OPEN_STATE is set) and its lock
+ * stateids. Delegation state is always handed to the delegation code; the
+ * state manager is only scheduled if at least one state was marked.
+ */
+void nfs_inode_find_state_and_recover(struct inode *inode,
+ const nfs4_stateid *stateid)
+{
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_open_context *ctx;
+ struct nfs4_state *state;
+ bool found = false;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
+ state = ctx->state;
+ if (state == NULL)
+ continue;
+ /* 1: the state's current stateid */
+ if (nfs4_stateid_match_or_older(&state->stateid, stateid) &&
+ nfs4_state_mark_reclaim_nograce(clp, state)) {
+ found = true;
+ continue;
+ }
+ /* 2: the open stateid */
+ if (test_bit(NFS_OPEN_STATE, &state->flags) &&
+ nfs4_stateid_match_or_older(&state->open_stateid, stateid) &&
+ nfs4_state_mark_reclaim_nograce(clp, state)) {
+ found = true;
+ continue;
+ }
+ /* 3: any initialized lock stateid hanging off the state */
+ if (nfs_state_lock_state_matches_stateid(state, stateid) &&
+ nfs4_state_mark_reclaim_nograce(clp, state))
+ found = true;
+ }
+ rcu_read_unlock();
+
+ nfs_inode_find_delegation_state_and_recover(inode, stateid);
+ if (found)
+ nfs4_schedule_state_manager(clp);
+}
+
+/*
+ * Set NFS_CONTEXT_BAD on every open context of the inode that uses @state,
+ * and log a warning naming the file and @err for each of them.
+ */
+static void nfs4_state_mark_open_context_bad(struct nfs4_state *state, int err)
+{
+ struct inode *inode = state->inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_open_context *ctx;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
+ if (ctx->state != state)
+ continue;
+ set_bit(NFS_CONTEXT_BAD, &ctx->flags);
+ pr_warn("NFSv4: state recovery failed for open file %pd2, "
+ "error = %d\n", ctx->dentry, err);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Record that recovery of @state failed: set NFS_STATE_RECOVERY_FAILED on
+ * the state and mark all open contexts that use it as bad.
+ */
+static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error)
+{
+ set_bit(NFS_STATE_RECOVERY_FAILED, &state->flags);
+ nfs4_state_mark_open_context_bad(state, error);
+}
+
+
+/*
+ * Re-establish the locks held on @state's inode through ops->recover_lock.
+ *
+ * The inode's POSIX lock list is walked first, then its flock list; only
+ * locks whose open context uses @state are recovered. Errors in the first
+ * group of the switch abort the walk and are returned to the caller. Any
+ * other failure flags the lock's lock state NFS_LOCK_LOST and the walk
+ * carries on.
+ *
+ * Returns 0, or the error that aborted the walk.
+ */
+static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
+{
+ struct inode *inode = state->inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct file_lock *fl;
+ struct nfs4_lock_state *lsp;
+ int status = 0;
+ struct file_lock_context *flctx = inode->i_flctx;
+ struct list_head *list;
+
+ if (flctx == NULL)
+ return 0;
+
+ list = &flctx->flc_posix;
+
+ /* Guard against delegation returns and new lock/unlock calls */
+ down_write(&nfsi->rwsem);
+ spin_lock(&flctx->flc_lock);
+restart:
+ list_for_each_entry(fl, list, fl_list) {
+ if (nfs_file_open_context(fl->fl_file)->state != state)
+ continue;
+ /*
+ * flc_lock is dropped around the recover_lock call.
+ * NOTE(review): the walk resumes from fl afterwards; presumably
+ * holding nfsi->rwsem for write keeps the list stable - confirm.
+ */
+ spin_unlock(&flctx->flc_lock);
+ status = ops->recover_lock(state, fl);
+ switch (status) {
+ case 0:
+ break;
+ case -ETIMEDOUT:
+ case -ESTALE:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_NO_GRACE:
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ /* flc_lock is not held on this path; only rwsem needs releasing */
+ goto out;
+ default:
+ pr_err("NFS: %s: unhandled error %d\n",
+ __func__, status);
+ fallthrough;
+ case -ENOMEM:
+ case -NFS4ERR_DENIED:
+ case -NFS4ERR_RECLAIM_BAD:
+ case -NFS4ERR_RECLAIM_CONFLICT:
+ /* This lock is lost: flag its lock state and carry on */
+ lsp = fl->fl_u.nfs4_fl.owner;
+ if (lsp)
+ set_bit(NFS_LOCK_LOST, &lsp->ls_flags);
+ status = 0;
+ }
+ spin_lock(&flctx->flc_lock);
+ }
+ /* POSIX list finished: repeat the same walk over the flock list */
+ if (list == &flctx->flc_posix) {
+ list = &flctx->flc_flock;
+ goto restart;
+ }
+ spin_unlock(&flctx->flc_lock);
+out:
+ up_write(&nfsi->rwsem);
+ return status;
+}
+
+#ifdef CONFIG_NFS_V4_2
+/*
+ * After @state has been reclaimed, flag the copy operations tied to it.
+ *
+ * Does nothing unless @state carries NFS_CLNT_DST_SSC_COPY_STATE or
+ * NFS_CLNT_SRC_SSC_COPY_STATE. Under the client's cl_lock, the server's
+ * ss_copies list is walked twice (via the copies member, then via the
+ * src_copies member). Each copy that is not skipped gets flags set to 1,
+ * and its completion is signalled if NFS_CLNT_DST_SSC_COPY_STATE was still
+ * set on @state.
+ */
+static void nfs42_complete_copies(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+ struct nfs4_copy_state *copy;
+
+ if (!test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags) &&
+ !test_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags))
+ return;
+
+ spin_lock(&sp->so_server->nfs_client->cl_lock);
+ list_for_each_entry(copy, &sp->so_server->ss_copies, copies) {
+ /* Skip copies whose destination state is a different stateid */
+ if ((test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags) &&
+ !nfs4_stateid_match_other(&state->stateid,
+ &copy->parent_dst_state->stateid)))
+ continue;
+ copy->flags = 1;
+ if (test_and_clear_bit(NFS_CLNT_DST_SSC_COPY_STATE,
+ &state->flags)) {
+ clear_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags);
+ complete(&copy->completion);
+ }
+ }
+ /*
+ * NOTE(review): this second walk uses the same ss_copies list head with
+ * the src_copies member, and tests/clears NFS_CLNT_DST_SSC_COPY_STATE
+ * rather than the SRC bit - confirm both are intended.
+ */
+ list_for_each_entry(copy, &sp->so_server->ss_copies, src_copies) {
+ if ((test_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags) &&
+ !nfs4_stateid_match_other(&state->stateid,
+ &copy->parent_src_state->stateid)))
+ continue;
+ copy->flags = 1;
+ if (test_and_clear_bit(NFS_CLNT_DST_SSC_COPY_STATE,
+ &state->flags))
+ complete(&copy->completion);
+ }
+ spin_unlock(&sp->so_server->nfs_client->cl_lock);
+}
+#else /* !CONFIG_NFS_V4_2 */
+/* No-op stand-in used when CONFIG_NFS_V4_2 is not enabled */
+static inline void nfs42_complete_copies(struct nfs4_state_owner *sp,
+ struct nfs4_state *state)
+{
+}
+#endif /* CONFIG_NFS_V4_2 */
+
+/*
+ * Reclaim a single open state: recover the open itself, then its locks,
+ * then finish off any copy operations tied to it.
+ *
+ * Returns a negative error from ops->recover_open() or
+ * nfs4_reclaim_locks() as soon as either fails; otherwise returns the
+ * (non-negative) status of the lock reclaim after clearing
+ * NFS_STATE_RECLAIM_NOGRACE on the state.
+ */
+static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_state *state,
+ const struct nfs4_state_recovery_ops *ops)
+{
+ struct nfs4_lock_state *lock;
+ int status;
+
+ status = ops->recover_open(sp, state);
+ if (status < 0)
+ return status;
+
+ status = nfs4_reclaim_locks(state, ops);
+ if (status < 0)
+ return status;
+
+ /*
+ * Without a delegation, every lock state should be initialized again by
+ * now; warn (rate-limited) about any that is not.
+ */
+ if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) {
+ spin_lock(&state->state_lock);
+ list_for_each_entry(lock, &state->lock_states, ls_locks) {
+ trace_nfs4_state_lock_reclaim(state, lock);
+ if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
+ pr_warn_ratelimited("NFS: %s: Lock reclaim failed!\n", __func__);
+ }
+ spin_unlock(&state->state_lock);
+ }
+
+ nfs42_complete_copies(sp, state);
+ clear_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
+ return status;
+}
+
+/*
+ * Reclaim every open state of state owner @sp that carries
+ * ops->state_flag_bit.
+ *
+ * The walk over sp->so_states restarts from the head after each reclaimed
+ * state, because so_lock is dropped while the reclaim runs. The whole
+ * operation is bracketed by a write section on sp->so_reclaim_seqcount.
+ *
+ * Returns 0 when the list has been fully processed, -EIO if a state with
+ * NFS_SRV_SSC_COPY_STATE was encountered (CONFIG_NFS_V4_2 only), or the
+ * error that made the walk bail out through out_err.
+ */
+static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops)
+{
+ struct nfs4_state *state;
+ unsigned int loop = 0;
+ int status = 0;
+#ifdef CONFIG_NFS_V4_2
+ bool found_ssc_copy_state = false;
+#endif /* CONFIG_NFS_V4_2 */
+
+ /* Note: we rely on the sp->so_states list being ordered
+ * so that we always reclaim open(O_RDWR) and/or open(O_WRITE)
+ * states first.
+ * This is needed to ensure that the server won't give us any
+ * read delegations that we have to return if, say, we are
+ * recovering after a network partition or a reboot from a
+ * server that doesn't support a grace period.
+ */
+ spin_lock(&sp->so_lock);
+ raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
+restart:
+ list_for_each_entry(state, &sp->so_states, open_states) {
+ /* The flag is consumed here, so a restart does not revisit the state */
+ if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
+ continue;
+ if (!nfs4_valid_open_stateid(state))
+ continue;
+ if (state->state == 0)
+ continue;
+#ifdef CONFIG_NFS_V4_2
+ if (test_bit(NFS_SRV_SSC_COPY_STATE, &state->flags)) {
+ nfs4_state_mark_recovery_failed(state, -EIO);
+ found_ssc_copy_state = true;
+ continue;
+ }
+#endif /* CONFIG_NFS_V4_2 */
+ /* Hold the state across the unlocked reclaim below */
+ refcount_inc(&state->count);
+ spin_unlock(&sp->so_lock);
+ status = __nfs4_reclaim_open_state(sp, state, ops);
+
+ switch (status) {
+ default:
+ if (status >= 0) {
+ loop = 0;
+ break;
+ }
+ printk(KERN_ERR "NFS: %s: unhandled error %d\n", __func__, status);
+ fallthrough;
+ case -ENOENT:
+ case -ENOMEM:
+ case -EACCES:
+ case -EROFS:
+ case -EIO:
+ case -ESTALE:
+ /* Open state on this file cannot be recovered */
+ nfs4_state_mark_recovery_failed(state, status);
+ break;
+ case -EAGAIN:
+ /* Sleep, then re-flag the state for another try (limited by loop) */
+ ssleep(1);
+ if (loop++ < 10) {
+ set_bit(ops->state_flag_bit, &state->flags);
+ break;
+ }
+ fallthrough;
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_RECLAIM_BAD:
+ case -NFS4ERR_RECLAIM_CONFLICT:
+ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
+ break;
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_NO_GRACE:
+ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
+ fallthrough;
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case -ETIMEDOUT:
+ /* These errors abort the whole walk and are returned */
+ goto out_err;
+ }
+ nfs4_put_open_state(state);
+ spin_lock(&sp->so_lock);
+ goto restart;
+ }
+ raw_write_seqcount_end(&sp->so_reclaim_seqcount);
+ spin_unlock(&sp->so_lock);
+#ifdef CONFIG_NFS_V4_2
+ if (found_ssc_copy_state)
+ return -EIO;
+#endif /* CONFIG_NFS_V4_2 */
+ return 0;
+out_err:
+ /* so_lock was dropped before the reclaim; retake it to end the seqcount */
+ nfs4_put_open_state(state);
+ spin_lock(&sp->so_lock);
+ raw_write_seqcount_end(&sp->so_reclaim_seqcount);
+ spin_unlock(&sp->so_lock);
+ return status;
+}
+
+/*
+ * Forget the open-mode and delegation flags of @state and, under
+ * state_lock, reset every attached lock state: its seqid flags are zeroed
+ * and NFS_LOCK_INITIALIZED is cleared.
+ */
+static void nfs4_clear_open_state(struct nfs4_state *state)
+{
+ struct nfs4_lock_state *lock;
+
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+ clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+ clear_bit(NFS_O_RDWR_STATE, &state->flags);
+ spin_lock(&state->state_lock);
+ list_for_each_entry(lock, &state->lock_states, ls_locks) {
+ lock->ls_seqid.flags = 0;
+ clear_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags);
+ }
+ spin_unlock(&state->state_lock);
+}
+
+/*
+ * For every state owner of @server: zero the owner's seqid flags, then call
+ * @mark_reclaim on each of its open states and clear the open state of
+ * those for which the callback returns non-zero.
+ *
+ * Runs under clp->cl_lock, taking each owner's so_lock in turn.
+ */
+static void nfs4_reset_seqids(struct nfs_server *server,
+ int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state_owner *sp;
+ struct rb_node *pos;
+ struct nfs4_state *state;
+
+ spin_lock(&clp->cl_lock);
+ for (pos = rb_first(&server->state_owners);
+ pos != NULL;
+ pos = rb_next(pos)) {
+ sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
+ sp->so_seqid.flags = 0;
+ spin_lock(&sp->so_lock);
+ list_for_each_entry(state, &sp->so_states, open_states) {
+ if (mark_reclaim(clp, state))
+ nfs4_clear_open_state(state);
+ }
+ spin_unlock(&sp->so_lock);
+ }
+ spin_unlock(&clp->cl_lock);
+}
+
+/*
+ * Apply nfs4_reset_seqids() with @mark_reclaim to every nfs_server on
+ * @clp's cl_superblocks list, walking the list under rcu_read_lock().
+ */
+static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp,
+ int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
+{
+ struct nfs_server *server;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs4_reset_seqids(server, mark_reclaim);
+ rcu_read_unlock();
+}
+
+/*
+ * Begin reboot recovery for @clp: set NFS4CLNT_RECLAIM_REBOOT, mark all
+ * delegations for reclaim and mark every open state for reboot reclaim.
+ */
+static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
+{
+ set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+ /* Mark all delegations for reclaim */
+ nfs_delegation_mark_reclaim(clp);
+ nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot);
+}
+
+/*
+ * Tell the server that state reclaim is finished, if the recovery ops
+ * provide a reclaim_complete hook.
+ *
+ * Returns the hook's result, or 0 when no hook is defined.
+ */
+static int nfs4_reclaim_complete(struct nfs_client *clp,
+		const struct nfs4_state_recovery_ops *ops,
+		const struct cred *cred)
+{
+	/* No hook for this set of recovery ops: nothing to notify */
+	if (!ops->reclaim_complete)
+		return 0;
+
+	return ops->reclaim_complete(clp, cred);
+}
+
+/*
+ * For every open state of @server that is still flagged
+ * NFS_STATE_RECLAIM_REBOOT, clear that flag and mark the state for no-grace
+ * reclaim instead.
+ *
+ * Runs under clp->cl_lock, taking each owner's so_lock in turn.
+ */
+static void nfs4_clear_reclaim_server(struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state_owner *sp;
+ struct rb_node *pos;
+ struct nfs4_state *state;
+
+ spin_lock(&clp->cl_lock);
+ for (pos = rb_first(&server->state_owners);
+ pos != NULL;
+ pos = rb_next(pos)) {
+ sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
+ spin_lock(&sp->so_lock);
+ list_for_each_entry(state, &sp->so_states, open_states) {
+ if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT,
+ &state->flags))
+ continue;
+ nfs4_state_mark_reclaim_nograce(clp, state);
+ }
+ spin_unlock(&sp->so_lock);
+ }
+ spin_unlock(&clp->cl_lock);
+}
+
+/*
+ * End the reboot-reclaim phase for @clp.
+ *
+ * If NFS4CLNT_RECLAIM_REBOOT was set, clear it, convert the leftover
+ * reboot-reclaim marks on every server into no-grace marks, and reap
+ * delegations that were not reclaimed.
+ *
+ * Returns 1 if the phase was active, 0 if there was nothing to do.
+ */
+static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+
+ if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+ return 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs4_clear_reclaim_server(server);
+ rcu_read_unlock();
+
+ nfs_delegation_reap_unclaimed(clp);
+ return 1;
+}
+
+/*
+ * Finish reboot recovery: if the reboot-reclaim phase was active, close it
+ * and send the reclaim-complete notification using the clientid credential.
+ *
+ * If that notification fails with -NFS4ERR_CONN_NOT_BOUND_TO_SESSION,
+ * NFS4CLNT_RECLAIM_REBOOT is set again; any other result is ignored here.
+ */
+static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
+{
+ const struct nfs4_state_recovery_ops *ops;
+ const struct cred *cred;
+ int err;
+
+ if (!nfs4_state_clear_reclaim_reboot(clp))
+ return;
+ ops = clp->cl_mvops->reboot_recovery_ops;
+ cred = nfs4_get_clid_cred(clp);
+ err = nfs4_reclaim_complete(clp, ops, cred);
+ put_cred(cred);
+ if (err == -NFS4ERR_CONN_NOT_BOUND_TO_SESSION)
+ set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+}
+
+/*
+ * Begin no-grace recovery for @clp: flag all delegations to be tested for
+ * expiry and mark every open state for no-grace reclaim.
+ */
+static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
+{
+ nfs_mark_test_expired_all_delegations(clp);
+ nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
+}
+
+/*
+ * Translate an error seen during recovery into the state-manager flags
+ * that will drive the next recovery step.
+ *
+ * Returns 0 if @error is 0 or one of the errors handled below (the required
+ * follow-up has then been flagged on @clp); any other error is returned
+ * unchanged.
+ */
+static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
+{
+ switch (error) {
+ case 0:
+ break;
+ case -NFS4ERR_CB_PATH_DOWN:
+ nfs40_handle_cb_pathdown(clp);
+ break;
+ case -NFS4ERR_NO_GRACE:
+ nfs4_state_end_reclaim_reboot(clp);
+ break;
+ case -NFS4ERR_STALE_CLIENTID:
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ nfs4_state_start_reclaim_reboot(clp);
+ break;
+ case -NFS4ERR_EXPIRED:
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ nfs4_state_start_reclaim_nograce(clp);
+ break;
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_SEQ_FALSE_RETRY:
+ case -NFS4ERR_SEQ_MISORDERED:
+ set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+ /* Zero session reset errors */
+ break;
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+ break;
+ default:
+ dprintk("%s: failed to handle error %d for server %s\n",
+ __func__, error, clp->cl_hostname);
+ return error;
+ }
+ dprintk("%s: handled error %d for server %s\n", __func__, error,
+ clp->cl_hostname);
+ return 0;
+}
+
+/*
+ * Reclaim the open state of every state owner flagged with
+ * ops->owner_flag_bit, across all servers of @clp.
+ *
+ * State owners purged from each server are collected on the local
+ * "freeme" list and released once no lock is held. The scan restarts
+ * from the first server after each owner, because cl_lock and the RCU read
+ * lock are dropped while that owner's state is reclaimed.
+ *
+ * Returns 0 when nothing is left to reclaim. If reclaiming an owner fails,
+ * the owner is re-flagged and the error is passed to
+ * nfs4_recovery_handle_error(): a handled error yields -EAGAIN, an
+ * unhandled one is returned as is.
+ */
+static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
+{
+	struct nfs4_state_owner *sp;
+	struct nfs_server *server;
+	struct rb_node *pos;
+	LIST_HEAD(freeme);
+	int status = 0;
+
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		nfs4_purge_state_owners(server, &freeme);
+		spin_lock(&clp->cl_lock);
+		for (pos = rb_first(&server->state_owners);
+		     pos != NULL;
+		     pos = rb_next(pos)) {
+			sp = rb_entry(pos,
+				struct nfs4_state_owner, so_server_node);
+			if (!test_and_clear_bit(ops->owner_flag_bit,
+							&sp->so_flags))
+				continue;
+			/* Skip owners whose reference count already hit zero */
+			if (!atomic_inc_not_zero(&sp->so_count))
+				continue;
+			spin_unlock(&clp->cl_lock);
+			rcu_read_unlock();
+
+			status = nfs4_reclaim_open_state(sp, ops);
+			if (status < 0) {
+				/* Re-flag the owner so a later pass retries it */
+				set_bit(ops->owner_flag_bit, &sp->so_flags);
+				nfs4_put_state_owner(sp);
+				status = nfs4_recovery_handle_error(clp, status);
+				/*
+				 * Release the owners collected so far; without
+				 * this the early return would leak them.
+				 */
+				nfs4_free_state_owners(&freeme);
+				return (status != 0) ? status : -EAGAIN;
+			}
+
+			nfs4_put_state_owner(sp);
+			goto restart;
+		}
+		spin_unlock(&clp->cl_lock);
+	}
+	rcu_read_unlock();
+	nfs4_free_state_owners(&freeme);
+	return 0;
+}
+
+/*
+ * Probe the lease with a renew operation.
+ *
+ * Returns 0 straight away if the lease is already flagged as expired. The
+ * renewal credential is used, falling back to the clientid credential; with
+ * no credential at all the status is -ENOKEY. A renew that fails with
+ * -ETIMEDOUT re-arms NFS4CLNT_CHECK_LEASE and returns 0. Every other status
+ * is passed through nfs4_recovery_handle_error().
+ */
+static int nfs4_check_lease(struct nfs_client *clp)
+{
+ const struct cred *cred;
+ const struct nfs4_state_maintenance_ops *ops =
+ clp->cl_mvops->state_renewal_ops;
+ int status;
+
+ /* Is the client already known to have an expired lease? */
+ if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+ return 0;
+ cred = ops->get_state_renewal_cred(clp);
+ if (cred == NULL) {
+ cred = nfs4_get_clid_cred(clp);
+ /* Status used only if the fallback credential is missing too */
+ status = -ENOKEY;
+ if (cred == NULL)
+ goto out;
+ }
+ status = ops->renew_lease(clp, cred);
+ put_cred(cred);
+ if (status == -ETIMEDOUT) {
+ set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+ return 0;
+ }
+out:
+ return nfs4_recovery_handle_error(clp, status);
+}
+
+/* Set NFS4CLNT_LEASE_EXPIRED and reclaim reboot state for all v4.0 errors
+ * and for recoverable errors on EXCHANGE_ID for v4.1
+ *
+ * Returns 0 when the error was handled (NFS4CLNT_LEASE_EXPIRED is then
+ * set), otherwise a negative error: -ESERVERFAULT, -EPERM,
+ * -EPROTONOSUPPORT, or @status itself for anything not listed.
+ */
+static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
+{
+ switch (status) {
+ case -NFS4ERR_SEQ_MISORDERED:
+ /* Give up if NFS4CLNT_PURGE_STATE was already set */
+ if (test_and_set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state))
+ return -ESERVERFAULT;
+ /* Lease confirmation error: retry after purging the lease */
+ ssleep(1);
+ clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+ break;
+ case -NFS4ERR_STALE_CLIENTID:
+ clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+ nfs4_state_start_reclaim_reboot(clp);
+ break;
+ case -NFS4ERR_CLID_INUSE:
+ pr_err("NFS: Server %s reports our clientid is in use\n",
+ clp->cl_hostname);
+ nfs_mark_client_ready(clp, -EPERM);
+ clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+ return -EPERM;
+ case -EACCES:
+ case -NFS4ERR_DELAY:
+ case -EAGAIN:
+ /* Sleep a second; the caller sees 0 and LEASE_EXPIRED set */
+ ssleep(1);
+ break;
+
+ case -NFS4ERR_MINOR_VERS_MISMATCH:
+ if (clp->cl_cons_state == NFS_CS_SESSION_INITING)
+ nfs_mark_client_ready(clp, -EPROTONOSUPPORT);
+ dprintk("%s: exit with error %d for server %s\n",
+ __func__, -EPROTONOSUPPORT, clp->cl_hostname);
+ return -EPROTONOSUPPORT;
+ case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
+ * in nfs4_exchange_id */
+ default:
+ dprintk("%s: exit with error %d for server %s\n", __func__,
+ status, clp->cl_hostname);
+ return status;
+ }
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ dprintk("%s: handled error %d for server %s\n", __func__, status,
+ clp->cl_hostname);
+ return 0;
+}
+
+/*
+ * (Re-)establish the client ID with the server.
+ *
+ * Starts draining the session, then calls the reboot recovery ops'
+ * establish_clid with the clientid credential. On success all pNFS layouts
+ * of the client are destroyed.
+ *
+ * Returns 0 on success, -ENOENT if no credential is available, or the
+ * error from the drain or from establish_clid.
+ */
+static int nfs4_establish_lease(struct nfs_client *clp)
+{
+ const struct cred *cred;
+ const struct nfs4_state_recovery_ops *ops =
+ clp->cl_mvops->reboot_recovery_ops;
+ int status;
+
+ status = nfs4_begin_drain_session(clp);
+ if (status != 0)
+ return status;
+ cred = nfs4_get_clid_cred(clp);
+ if (cred == NULL)
+ return -ENOENT;
+ status = ops->establish_clid(clp, cred);
+ put_cred(cred);
+ if (status != 0)
+ return status;
+ pnfs_destroy_all_layouts(clp);
+ return 0;
+}
+
+/*
+ * Returns zero or a negative errno. NFS4ERR values are converted
+ * to local errno values.
+ *
+ * Establishes the lease; on failure the error is handed to
+ * nfs4_handle_reclaim_lease_error(). On success the lease-check and
+ * lease-expired flags are cleared and the kind of state reclaim to run next
+ * is selected.
+ */
+static int nfs4_reclaim_lease(struct nfs_client *clp)
+{
+ int status;
+
+ status = nfs4_establish_lease(clp);
+ if (status < 0)
+ return nfs4_handle_reclaim_lease_error(clp, status);
+ /* A server scope mismatch was flagged: start no-grace reclaim */
+ if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state))
+ nfs4_state_start_reclaim_nograce(clp);
+ /* Reboot reclaim is requested only when no-grace reclaim is not pending */
+ if (!test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state))
+ set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+ clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+ clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ return 0;
+}
+
+/*
+ * Establish the lease, then clear NFS4CLNT_PURGE_STATE, flag the lease as
+ * expired and start no-grace reclaim of all state.
+ *
+ * Returns 0 on success; if establishing the lease fails, returns the result
+ * of nfs4_handle_reclaim_lease_error().
+ */
+static int nfs4_purge_lease(struct nfs_client *clp)
+{
+ int status;
+
+ status = nfs4_establish_lease(clp);
+ if (status < 0)
+ return nfs4_handle_reclaim_lease_error(clp, status);
+ clear_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ nfs4_state_start_reclaim_nograce(clp);
+ return 0;
+}
+
+/*
+ * Try remote migration of one FSID from a source server to a
+ * destination server. The source server provides a list of
+ * potential destinations.
+ *
+ * Returns zero or a negative NFS4ERR status code.
+ *
+ * Any non-zero result is logged and sets NFS_MIG_FAILED on @server.
+ */
+static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_fs_locations *locations = NULL;
+ struct inode *inode;
+ struct page *page;
+ int status, result;
+
+ dprintk("--> %s: FSID %llx:%llx on \"%s\"\n", __func__,
+ (unsigned long long)server->fsid.major,
+ (unsigned long long)server->fsid.minor,
+ clp->cl_hostname);
+
+ result = 0;
+ page = alloc_page(GFP_KERNEL);
+ locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+ /*
+ * NOTE(review): result is still 0 here, so an allocation failure is
+ * reported to the caller as success and NFS_MIG_FAILED is not set -
+ * confirm this is intended.
+ */
+ if (page == NULL || locations == NULL) {
+ dprintk("<-- %s: no memory\n", __func__);
+ goto out;
+ }
+
+ inode = d_inode(server->super->s_root);
+ result = nfs4_proc_get_locations(inode, locations, page, cred);
+ if (result) {
+ dprintk("<-- %s: failed to retrieve fs_locations: %d\n",
+ __func__, result);
+ goto out;
+ }
+
+ /* From here until success every failure returns -NFS4ERR_NXIO ... */
+ result = -NFS4ERR_NXIO;
+ if (!locations->nlocations)
+ goto out;
+
+ if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) {
+ dprintk("<-- %s: No fs_locations data, migration skipped\n",
+ __func__);
+ goto out;
+ }
+
+ /* ... except a failed session drain, which returns its own status */
+ status = nfs4_begin_drain_session(clp);
+ if (status != 0) {
+ result = status;
+ goto out;
+ }
+
+ status = nfs4_replace_transport(server, locations);
+ if (status != 0) {
+ dprintk("<-- %s: failed to replace transport: %d\n",
+ __func__, status);
+ goto out;
+ }
+
+ result = 0;
+ dprintk("<-- %s: migration succeeded\n", __func__);
+
+out:
+ /* Either allocation may have failed; free whatever was obtained */
+ if (page != NULL)
+ __free_page(page);
+ kfree(locations);
+ if (result) {
+ pr_err("NFS: migration recovery failed (server %s)\n",
+ clp->cl_hostname);
+ set_bit(NFS_MIG_FAILED, &server->mig_status);
+ }
+ return result;
+}
+
+/*
+ * Walk clp->cl_superblocks and attempt migration for every nfs_server
+ * that has NFS_MIG_IN_TRANSITION set in its mig_status.
+ *
+ * Returns zero or a negative NFS4ERR status code. -NFS4ERR_NOENT is
+ * returned when no state renewal credential is available; a failure of
+ * nfs4_try_migration() stops the walk and is returned as is.
+ */
+static int nfs4_handle_migration(struct nfs_client *clp)
+{
+ const struct nfs4_state_maintenance_ops *ops =
+ clp->cl_mvops->state_renewal_ops;
+ struct nfs_server *server;
+ const struct cred *cred;
+
+ dprintk("%s: migration reported on \"%s\"\n", __func__,
+ clp->cl_hostname);
+
+ cred = ops->get_state_renewal_cred(clp);
+ if (cred == NULL)
+ return -NFS4ERR_NOENT;
+
+ /*
+ * New generation number: servers already stamped with it are
+ * skipped when the list walk is restarted.
+ */
+ clp->cl_mig_gen++;
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ int status;
+
+ if (server->mig_gen == clp->cl_mig_gen)
+ continue;
+ server->mig_gen = clp->cl_mig_gen;
+
+ if (!test_and_clear_bit(NFS_MIG_IN_TRANSITION,
+ &server->mig_status))
+ continue;
+
+ /*
+ * The RCU read lock is dropped before the migration attempt,
+ * so the walk is restarted from the list head afterwards.
+ */
+ rcu_read_unlock();
+ status = nfs4_try_migration(server, cred);
+ if (status < 0) {
+ put_cred(cred);
+ return status;
+ }
+ goto restart;
+ }
+ rcu_read_unlock();
+ put_cred(cred);
+ return 0;
+}
+
+/*
+ * Test each nfs_server on the clp's cl_superblocks list to see
+ * if it's moved to another server. Stop when the server no longer
+ * returns NFS4ERR_LEASE_MOVED.
+ *
+ * Returns -NFS4ERR_NOENT when no state renewal credential is available,
+ * otherwise zero; the result of nfs4_try_migration() is only used to
+ * decide whether to keep scanning.
+ */
+static int nfs4_handle_lease_moved(struct nfs_client *clp)
+{
+ const struct nfs4_state_maintenance_ops *ops =
+ clp->cl_mvops->state_renewal_ops;
+ struct nfs_server *server;
+ const struct cred *cred;
+
+ dprintk("%s: lease moved reported on \"%s\"\n", __func__,
+ clp->cl_hostname);
+
+ cred = ops->get_state_renewal_cred(clp);
+ if (cred == NULL)
+ return -NFS4ERR_NOENT;
+
+ /* Servers stamped with the new generation are skipped on restart */
+ clp->cl_mig_gen++;
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ struct inode *inode;
+ int status;
+
+ if (server->mig_gen == clp->cl_mig_gen)
+ continue;
+ server->mig_gen = clp->cl_mig_gen;
+
+ /*
+ * The RCU read lock is dropped before probing the server, so
+ * every path below leaves the loop (restart or out).
+ */
+ rcu_read_unlock();
+
+ inode = d_inode(server->super->s_root);
+ status = nfs4_proc_fsid_present(inode, cred);
+ if (status != -NFS4ERR_MOVED)
+ goto restart; /* wasn't this one */
+ if (nfs4_try_migration(server, cred) == -NFS4ERR_LEASE_MOVED)
+ goto restart; /* there are more */
+ goto out;
+ }
+ rcu_read_unlock();
+
+out:
+ put_cred(cred);
+ return 0;
+}
+
+/**
+ * nfs4_discover_server_trunking - Detect server IP address trunking
+ *
+ * @clp: nfs_client under test
+ * @result: OUT: found nfs_client, or clp
+ *
+ * Returns zero or a negative errno. If zero is returned,
+ * an nfs_client pointer is planted in "result".
+ *
+ * The probe, including all of its retries, runs with
+ * nfs_clid_init_mutex held.
+ *
+ * Note: since we are invoked in process context, and
+ * not from inside the state manager, we cannot use
+ * nfs4_handle_reclaim_lease_error().
+ */
+int nfs4_discover_server_trunking(struct nfs_client *clp,
+ struct nfs_client **result)
+{
+ const struct nfs4_state_recovery_ops *ops =
+ clp->cl_mvops->reboot_recovery_ops;
+ struct rpc_clnt *clnt;
+ const struct cred *cred;
+ int i, status;
+
+ dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname);
+
+ clnt = clp->cl_rpcclient;
+ /* i counts -EACCES results; only the first one retries with the root machine cred */
+ i = 0;
+
+ mutex_lock(&nfs_clid_init_mutex);
+again:
+ /* A fresh credential is taken for every attempt */
+ status = -ENOENT;
+ cred = nfs4_get_clid_cred(clp);
+ if (cred == NULL)
+ goto out_unlock;
+
+ status = ops->detect_trunking(clp, result, cred);
+ put_cred(cred);
+ switch (status) {
+ case 0:
+ case -EINTR:
+ case -ERESTARTSYS:
+ /* Returned to the caller unchanged */
+ break;
+ case -ETIMEDOUT:
+ /* Soft-retry clients give up on timeout; others retry after a delay */
+ if (clnt->cl_softrtry)
+ break;
+ fallthrough;
+ case -NFS4ERR_DELAY:
+ case -EAGAIN:
+ ssleep(1);
+ fallthrough;
+ case -NFS4ERR_STALE_CLIENTID:
+ dprintk("NFS: %s after status %d, retrying\n",
+ __func__, status);
+ goto again;
+ case -EACCES:
+ if (i++ == 0) {
+ nfs4_root_machine_cred(clp);
+ goto again;
+ }
+ if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX)
+ break;
+ fallthrough;
+ case -NFS4ERR_CLID_INUSE:
+ case -NFS4ERR_WRONGSEC:
+ /* No point in retrying if we already used RPC_AUTH_UNIX */
+ if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX) {
+ status = -EPERM;
+ break;
+ }
+ clnt = rpc_clone_client_set_auth(clnt, RPC_AUTH_UNIX);
+ if (IS_ERR(clnt)) {
+ status = PTR_ERR(clnt);
+ break;
+ }
+ /* Note: this is safe because we haven't yet marked the
+ * client as ready, so we are the only user of
+ * clp->cl_rpcclient
+ */
+ /* Install the AUTH_UNIX clone, shut down the old client, then retry */
+ clnt = xchg(&clp->cl_rpcclient, clnt);
+ rpc_shutdown_client(clnt);
+ clnt = clp->cl_rpcclient;
+ goto again;
+
+ case -NFS4ERR_MINOR_VERS_MISMATCH:
+ status = -EPROTONOSUPPORT;
+ break;
+
+ case -EKEYEXPIRED:
+ case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
+ * in nfs4_exchange_id */
+ status = -EKEYEXPIRED;
+ break;
+ default:
+ pr_warn("NFS: %s unhandled error %d. Exiting with error EIO\n",
+ __func__, status);
+ status = -EIO;
+ }
+
+out_unlock:
+ mutex_unlock(&nfs_clid_init_mutex);
+ dprintk("NFS: %s: status = %d\n", __func__, status);
+ return status;
+}
+
+#ifdef CONFIG_NFS_V4_1
+/*
+ * Flag the session for recovery and kick the state manager.
+ *
+ * -NFS4ERR_CONN_NOT_BOUND_TO_SESSION requests a BIND_CONN_TO_SESSION;
+ * every other error requests a session reset.
+ */
+void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
+{
+	struct nfs_client *clp = session->clp;
+
+	if (err == -NFS4ERR_CONN_NOT_BOUND_TO_SESSION)
+		set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+	else
+		set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+
+	nfs4_schedule_state_manager(clp);
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
+
+/*
+ * Ask the state manager to ping the server: setting CHECK_LEASE makes
+ * it send a SEQUENCE.
+ */
+void nfs41_notify_server(struct nfs_client *clp)
+{
+	set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+
+	nfs4_schedule_state_manager(clp);
+}
+
+/*
+ * Schedule a purge and no-grace reclaim of all state, unless the lease
+ * is already marked as expired.
+ */
+static void nfs4_reset_all_state(struct nfs_client *clp)
+{
+	/* LEASE_EXPIRED already set: nothing more is scheduled here */
+	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) != 0)
+		return;
+
+	set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
+	clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+	nfs4_state_start_reclaim_nograce(clp);
+	dprintk("%s: scheduling reset of all state for server %s!\n",
+		__func__, clp->cl_hostname);
+	nfs4_schedule_state_manager(clp);
+}
+
+/*
+ * Start reboot reclaim and wake the state manager, unless the lease is
+ * already marked as expired.
+ */
+static void nfs41_handle_server_reboot(struct nfs_client *clp)
+{
+	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) != 0)
+		return;
+
+	nfs4_state_start_reclaim_reboot(clp);
+	dprintk("%s: server %s rebooted!\n", __func__,
+		clp->cl_hostname);
+	nfs4_schedule_state_manager(clp);
+}
+
+/* All state was revoked: reset everything, then log the event. */
+static void nfs41_handle_all_state_revoked(struct nfs_client *clp)
+{
+	nfs4_reset_all_state(clp);
+
+	dprintk("%s: state revoked on server %s\n",
+		__func__, clp->cl_hostname);
+}
+
+/* Some state was revoked: start no-grace reclaim and wake the state manager. */
+static void nfs41_handle_some_state_revoked(struct nfs_client *clp)
+{
+	nfs4_state_start_reclaim_nograce(clp);
+	nfs4_schedule_state_manager(clp);
+	dprintk("%s: state revoked on server %s\n",
+		__func__, clp->cl_hostname);
+}
+
+/*
+ * Recallable state was revoked: drop every layout and test all
+ * delegations for expiry.
+ */
+static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp)
+{
+	/* FIXME: For now, we destroy all layouts. */
+	pnfs_destroy_all_layouts(clp);
+
+	nfs_test_expired_all_delegations(clp);
+
+	dprintk("%s: Recallable state revoked on server %s!\n",
+		__func__, clp->cl_hostname);
+}
+
+/* Backchannel fault reported: request a session reset from the state manager. */
+static void nfs41_handle_backchannel_fault(struct nfs_client *clp)
+{
+	set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+	nfs4_schedule_state_manager(clp);
+	dprintk("%s: server %s declared a backchannel fault\n",
+		__func__, clp->cl_hostname);
+}
+
+/*
+ * Callback path is down: request BIND_CONN_TO_SESSION, waking the state
+ * manager only if the request was not already pending.
+ */
+static void nfs41_handle_cb_path_down(struct nfs_client *clp)
+{
+	if (test_and_set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state))
+		return;
+
+	nfs4_schedule_state_manager(clp);
+}
+
+/*
+ * Dispatch on the status flags returned by a SEQUENCE operation.
+ *
+ * @clp: client the flags were reported for
+ * @flags: SEQ4_STATUS_* bit mask; zero means nothing to do
+ * @recovery: true when called from the state manager thread
+ */
+void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags,
+		bool recovery)
+{
+	if (flags == 0)
+		return;
+
+	dprintk("%s: \"%s\" (client ID %llx) flags=0x%08x\n",
+		__func__, clp->cl_hostname, clp->cl_clientid, flags);
+
+	/*
+	 * If we're called from the state manager thread, then assume we're
+	 * already handling the RECLAIM_NEEDED and/or STATE_REVOKED.
+	 * Those flags are expected to remain set until we're done
+	 * recovering (see RFC5661, section 18.46.3).
+	 */
+	if (!recovery) {
+		if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
+			nfs41_handle_server_reboot(clp);
+
+		if (flags & SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED)
+			nfs41_handle_all_state_revoked(clp);
+
+		if (flags & (SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
+			     SEQ4_STATUS_ADMIN_STATE_REVOKED))
+			nfs41_handle_some_state_revoked(clp);
+
+		if (flags & SEQ4_STATUS_LEASE_MOVED)
+			nfs4_schedule_lease_moved_recovery(clp);
+
+		if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
+			nfs41_handle_recallable_state_revoked(clp);
+	}
+
+	/* Backchannel problems are handled in both modes */
+	if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT) {
+		nfs41_handle_backchannel_fault(clp);
+	} else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
+			    SEQ4_STATUS_CB_PATH_DOWN_SESSION)) {
+		nfs41_handle_cb_path_down(clp);
+	}
+}
+
+/*
+ * Destroy the current session and create a new one.
+ *
+ * Does nothing (returns 0) when the client has no session. Returns 0
+ * on success and also when the destroy has to be retried later, in
+ * which case NFS4CLNT_SESSION_RESET is set again. Other failures
+ * return the value produced by nfs4_begin_drain_session(),
+ * nfs4_recovery_handle_error() or nfs4_handle_reclaim_lease_error().
+ */
+static int nfs4_reset_session(struct nfs_client *clp)
+{
+ const struct cred *cred;
+ int status;
+
+ if (!nfs4_has_session(clp))
+ return 0;
+ status = nfs4_begin_drain_session(clp);
+ if (status != 0)
+ return status;
+ /* cred is released at the out label on every path below */
+ cred = nfs4_get_clid_cred(clp);
+ status = nfs4_proc_destroy_session(clp->cl_session, cred);
+ switch (status) {
+ case 0:
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_DEADSESSION:
+ /* Carry on and create the replacement session */
+ break;
+ case -NFS4ERR_BACK_CHAN_BUSY:
+ case -NFS4ERR_DELAY:
+ /* Re-arm the reset, report success, and pause before returning */
+ set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+ status = 0;
+ ssleep(1);
+ goto out;
+ default:
+ status = nfs4_recovery_handle_error(clp, status);
+ goto out;
+ }
+
+ /* Zero the old session id before creating the new session */
+ memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN);
+ status = nfs4_proc_create_session(clp, cred);
+ if (status) {
+ dprintk("%s: session reset failed with status %d for server %s!\n",
+ __func__, status, clp->cl_hostname);
+ status = nfs4_handle_reclaim_lease_error(clp, status);
+ goto out;
+ }
+ nfs41_finish_session_reset(clp);
+ dprintk("%s: session reset was successful for server %s!\n",
+ __func__, clp->cl_hostname);
+out:
+ put_cred(cred);
+ return status;
+}
+
+/*
+ * Send BIND_CONN_TO_SESSION for a client that has a session.
+ *
+ * Returns 0 when there is no session, on success, and on
+ * -NFS4ERR_DELAY (the request is re-armed after a one second sleep).
+ * Other failures return the value produced by
+ * nfs4_begin_drain_session() or nfs4_recovery_handle_error().
+ */
+static int nfs4_bind_conn_to_session(struct nfs_client *clp)
+{
+	const struct cred *cred;
+	int err;
+
+	if (!nfs4_has_session(clp))
+		return 0;
+
+	err = nfs4_begin_drain_session(clp);
+	if (err != 0)
+		return err;
+
+	cred = nfs4_get_clid_cred(clp);
+	err = nfs4_proc_bind_conn_to_session(clp, cred);
+	put_cred(cred);
+
+	clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+
+	if (err == 0) {
+		dprintk("%s: bind_conn_to_session was successful for server %s!\n",
+			__func__, clp->cl_hostname);
+		return 0;
+	}
+
+	if (err != -NFS4ERR_DELAY)
+		return nfs4_recovery_handle_error(clp, err);
+
+	/* Server asked us to wait: pause, then re-arm the request */
+	ssleep(1);
+	set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+	return 0;
+}
+
+/*
+ * Return unused layouts for whichever RECALL_ANY iomodes were requested,
+ * then ask for another state manager pass.
+ */
+static void nfs4_layoutreturn_any_run(struct nfs_client *clp)
+{
+	int iomode = 0;
+
+	/* Both request bits are consumed, whether or not the other is set */
+	if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &clp->cl_state)) {
+		iomode += IOMODE_READ;
+	}
+	if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &clp->cl_state)) {
+		iomode += IOMODE_RW;
+	}
+
+	/* Note: IOMODE_READ + IOMODE_RW == IOMODE_ANY */
+	if (!iomode)
+		return;
+
+	pnfs_layout_return_unused_byclid(clp, iomode);
+	set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+}
+#else /* CONFIG_NFS_V4_1 */
+/* Without CONFIG_NFS_V4_1 these helpers are no-ops. */
+static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
+
+/* No-op stub; always returns 0. */
+static int nfs4_bind_conn_to_session(struct nfs_client *clp)
+{
+ return 0;
+}
+
+/* No-op stub. */
+static void nfs4_layoutreturn_any_run(struct nfs_client *clp)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Main state recovery loop.
+ *
+ * Each pass inspects the NFS4CLNT_* bits in clp->cl_state in a fixed
+ * priority order and runs the first recovery step that is pending.
+ * Most steps "continue" so that the next pass re-evaluates the bits
+ * from the top. When nothing is left, the session drain is ended, the
+ * manager bit is cleared and pending delegation/layout returns are run.
+ *
+ * A step that fails jumps to out_error, which logs the failing section,
+ * sleeps for a second and releases the manager.
+ */
+static void nfs4_state_manager(struct nfs_client *clp)
+{
+ unsigned int memflags;
+ int status = 0;
+ const char *section = "", *section_sep = "";
+
+ /*
+ * State recovery can deadlock if the direct reclaim code tries
+ * to start NFS writeback. So ensure memory allocations are all
+ * GFP_NOFS.
+ */
+ memflags = memalloc_nofs_save();
+
+ /* Ensure exclusive access to NFSv4 state */
+ do {
+ trace_nfs4_state_mgr(clp);
+ clear_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+ if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
+ section = "purge state";
+ status = nfs4_purge_lease(clp);
+ if (status < 0)
+ goto out_error;
+ continue;
+ }
+
+ if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) {
+ section = "lease expired";
+ /* We're going to have to re-establish a clientid */
+ status = nfs4_reclaim_lease(clp);
+ if (status < 0)
+ goto out_error;
+ continue;
+ }
+
+ /* Initialize or reset the session */
+ if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) {
+ section = "reset session";
+ status = nfs4_reset_session(clp);
+ /* An expired lease takes priority over the reset's own status */
+ if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+ continue;
+ if (status < 0)
+ goto out_error;
+ }
+
+ /* Send BIND_CONN_TO_SESSION */
+ if (test_and_clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION,
+ &clp->cl_state)) {
+ section = "bind conn to session";
+ status = nfs4_bind_conn_to_session(clp);
+ if (status < 0)
+ goto out_error;
+ continue;
+ }
+
+ if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
+ section = "check lease";
+ status = nfs4_check_lease(clp);
+ if (status < 0)
+ goto out_error;
+ continue;
+ }
+
+ /* The two migration steps fall through to the steps below on success */
+ if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
+ section = "migration";
+ status = nfs4_handle_migration(clp);
+ if (status < 0)
+ goto out_error;
+ }
+
+ if (test_and_clear_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state)) {
+ section = "lease moved";
+ status = nfs4_handle_lease_moved(clp);
+ if (status < 0)
+ goto out_error;
+ }
+
+ /* First recover reboot state... */
+ if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
+ section = "reclaim reboot";
+ status = nfs4_do_reclaim(clp,
+ clp->cl_mvops->reboot_recovery_ops);
+ /* -EAGAIN restarts the pass without ending the reclaim */
+ if (status == -EAGAIN)
+ continue;
+ if (status < 0)
+ goto out_error;
+ nfs4_state_end_reclaim_reboot(clp);
+ continue;
+ }
+
+ /* Detect expired delegations... */
+ if (test_and_clear_bit(NFS4CLNT_DELEGATION_EXPIRED, &clp->cl_state)) {
+ section = "detect expired delegations";
+ nfs_reap_expired_delegations(clp);
+ continue;
+ }
+
+ /* Now recover expired state... */
+ if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
+ section = "reclaim nograce";
+ status = nfs4_do_reclaim(clp,
+ clp->cl_mvops->nograce_recovery_ops);
+ if (status == -EAGAIN)
+ continue;
+ if (status < 0)
+ goto out_error;
+ clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
+ }
+
+ /* Nothing left to recover: leave NOFS scope and release the manager */
+ memalloc_nofs_restore(memflags);
+ nfs4_end_drain_session(clp);
+ nfs4_clear_state_manager_bit(clp);
+
+ /*
+ * More work was queued while we were running: if we can take
+ * MANAGER_RUNNING again, re-enter NOFS scope and loop.
+ */
+ if (test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
+ !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING,
+ &clp->cl_state)) {
+ memflags = memalloc_nofs_save();
+ continue;
+ }
+
+ /* RECALL_RUNNING guards the delegation/layout return section */
+ if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) {
+ if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
+ nfs_client_return_marked_delegations(clp);
+ set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+ }
+ nfs4_layoutreturn_any_run(clp);
+ clear_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state);
+ }
+
+ return;
+
+ } while (refcount_read(&clp->cl_count) > 1 && !signalled());
+ /* Loop ended because we hold the last reference or were signalled */
+ goto out_drain;
+
+out_error:
+ if (strlen(section))
+ section_sep = ": ";
+ trace_nfs4_state_mgr_failed(clp, section, status);
+ pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s"
+ " with error %d\n", section_sep, section,
+ clp->cl_hostname, -status);
+ ssleep(1);
+out_drain:
+ memalloc_nofs_restore(memflags);
+ nfs4_end_drain_session(clp);
+ nfs4_clear_state_manager_bit(clp);
+}
+
+/*
+ * Thread function for the state manager.
+ *
+ * @ptr: the nfs_client to manage; its reference is dropped with
+ * nfs_put_client() before the thread exits.
+ *
+ * Runs nfs4_state_manager() and re-runs it for as long as more work is
+ * flagged and the manager can be re-acquired. While
+ * NFS4CLNT_MANAGER_AVAILABLE is set, the thread waits for
+ * NFS4CLNT_RUN_MANAGER instead of exiting.
+ */
+static int nfs4_run_state_manager(void *ptr)
+{
+ struct nfs_client *clp = ptr;
+ struct rpc_clnt *cl = clp->cl_rpcclient;
+
+ /* Walk up to the top-level rpc_clnt, which is its own parent */
+ while (cl != cl->cl_parent)
+ cl = cl->cl_parent;
+
+ allow_signal(SIGKILL);
+again:
+ nfs4_state_manager(clp);
+
+ if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) &&
+ !test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) {
+ /* Wait, interruptibly, until RUN_MANAGER is set */
+ wait_var_event_interruptible(&clp->cl_state,
+ test_bit(NFS4CLNT_RUN_MANAGER,
+ &clp->cl_state));
+ /* Stop staying available once the top-level client's cl_swapper count is zero */
+ if (!atomic_read(&cl->cl_swapper))
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+ if (refcount_read(&clp->cl_count) > 1 && !signalled() &&
+ !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state))
+ goto again;
+ /* Either no longer a swapper, or were signalled */
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+ }
+
+ /* Final check for work queued since the manager was released */
+ if (refcount_read(&clp->cl_count) > 1 && !signalled() &&
+ test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
+ !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state))
+ goto again;
+
+ nfs_put_client(clp);
+ module_put_and_exit(0);
+ return 0;
+}
+
+/*
+ * Local variables:
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
new file mode 100644
index 000000000..d09bcfd7d
--- /dev/null
+++ b/fs/nfs/nfs4super.c
@@ -0,0 +1,314 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2012 Bryan Schumaker <bjschuma@netapp.com>
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_ssc.h>
+#include "delegation.h"
+#include "internal.h"
+#include "nfs4_fs.h"
+#include "nfs4idmap.h"
+#include "dns_resolve.h"
+#include "pnfs.h"
+#include "nfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc);
+static void nfs4_evict_inode(struct inode *inode);
+
+/*
+ * Superblock operations for NFSv4. Only write_inode and evict_inode
+ * are NFSv4-specific (defined in this file); the rest are nfs_* handlers.
+ */
+static const struct super_operations nfs4_sops = {
+ .alloc_inode = nfs_alloc_inode,
+ .free_inode = nfs_free_inode,
+ .write_inode = nfs4_write_inode,
+ .drop_inode = nfs_drop_inode,
+ .statfs = nfs_statfs,
+ .evict_inode = nfs4_evict_inode,
+ .umount_begin = nfs_umount_begin,
+ .show_options = nfs_show_options,
+ .show_devname = nfs_show_devname,
+ .show_path = nfs_show_path,
+ .show_stats = nfs_show_stats,
+};
+
+/*
+ * NFSv4 version descriptor, registered by init_nfs_v4() and
+ * unregistered by exit_nfs_v4().
+ */
+struct nfs_subversion nfs_v4 = {
+ .owner = THIS_MODULE,
+ .nfs_fs = &nfs4_fs_type,
+ .rpc_vers = &nfs_version4,
+ .rpc_ops = &nfs_v4_clientops,
+ .sops = &nfs4_sops,
+ .xattr = nfs4_xattr_handlers,
+};
+
+/*
+ * Write back the inode, then commit its pNFS layout.
+ *
+ * The layout commit runs only if nfs_write_inode() succeeded, and is
+ * synchronous for WB_SYNC_ALL writeback.
+ *
+ * Returns the first non-zero result, or zero.
+ */
+static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	int err;
+
+	err = nfs_write_inode(inode, wbc);
+	if (err != 0)
+		return err;
+
+	return pnfs_layoutcommit_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
+/*
+ * Clean out any remaining NFSv4 state that might be left over due
+ * to open() calls that passed nfs_atomic_lookup, but failed to call
+ * nfs_open().
+ *
+ * The page cache is truncated and the VFS inode cleared first; the
+ * delegation, the layout, the generic NFS inode state and the xattr
+ * cache are then released in that order.
+ */
+static void nfs4_evict_inode(struct inode *inode)
+{
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+ /* If we are holding a delegation, return and free it */
+ nfs_inode_evict_delegation(inode);
+ /* Note that above delegreturn would trigger pnfs return-on-close */
+ pnfs_return_layout(inode);
+ pnfs_destroy_layout_final(NFS_I(inode));
+ /* Then run the standard NFS clear_inode() code */
+ nfs_clear_inode(inode);
+ nfs4_xattr_cache_zap(inode);
+}
+
+/*
+ * Per-task count of nested referral mounts in progress, used by
+ * nfs_referral_loop_protect()/nfs_referral_loop_unprotect().
+ */
+struct nfs_referral_count {
+ struct list_head list; /* link in nfs_referral_count_list */
+ const struct task_struct *task; /* task the count belongs to */
+ unsigned int referral_count; /* current nesting depth */
+};
+
+/* All live entries; protected by nfs_referral_count_list_lock */
+static LIST_HEAD(nfs_referral_count_list);
+static DEFINE_SPINLOCK(nfs_referral_count_list_lock);
+
+/*
+ * Look up the referral count entry that belongs to the current task.
+ * Both callers in this file hold nfs_referral_count_list_lock.
+ *
+ * Returns the entry, or NULL if the current task has none.
+ */
+static struct nfs_referral_count *nfs_find_referral_count(void)
+{
+	struct nfs_referral_count *entry;
+	struct nfs_referral_count *found = NULL;
+
+	list_for_each_entry(entry, &nfs_referral_count_list, list) {
+		if (entry->task != current)
+			continue;
+		found = entry;
+		break;
+	}
+	return found;
+}
+
+#define NFS_MAX_NESTED_REFERRALS 2
+
+/*
+ * Account for one more nested referral mount by the current task.
+ *
+ * Returns 0 on success, -ENOMEM if no tracking entry could be
+ * allocated, or -ELOOP if the task is already at
+ * NFS_MAX_NESTED_REFERRALS.
+ */
+static int nfs_referral_loop_protect(void)
+{
+	struct nfs_referral_count *existing, *fresh;
+	int ret = 0;
+
+	/* Allocate up front so nothing sleeps under the spinlock */
+	fresh = kmalloc(sizeof(*fresh), GFP_KERNEL);
+	if (fresh == NULL)
+		return -ENOMEM;
+	fresh->task = current;
+	fresh->referral_count = 1;
+
+	spin_lock(&nfs_referral_count_list_lock);
+	existing = nfs_find_referral_count();
+	if (existing == NULL) {
+		/* First referral for this task: the list now owns "fresh" */
+		list_add(&fresh->list, &nfs_referral_count_list);
+		fresh = NULL;
+	} else if (existing->referral_count < NFS_MAX_NESTED_REFERRALS) {
+		existing->referral_count++;
+	} else {
+		ret = -ELOOP;
+	}
+	spin_unlock(&nfs_referral_count_list_lock);
+
+	/* Frees the unused entry; a no-op when it was queued above */
+	kfree(fresh);
+	return ret;
+}
+
+/*
+ * Drop one level of referral nesting for the current task, freeing the
+ * tracking entry when the count reaches zero.
+ *
+ * The entry is dereferenced without a NULL check, so this must be
+ * paired with a successful nfs_referral_loop_protect().
+ */
+static void nfs_referral_loop_unprotect(void)
+{
+	struct nfs_referral_count *entry;
+	struct nfs_referral_count *victim = NULL;
+
+	spin_lock(&nfs_referral_count_list_lock);
+	entry = nfs_find_referral_count();
+	entry->referral_count--;
+	if (entry->referral_count == 0) {
+		list_del(&entry->list);
+		victim = entry;
+	}
+	spin_unlock(&nfs_referral_count_list_lock);
+
+	/* Free outside the lock; a no-op when the entry is still in use */
+	kfree(victim);
+}
+
+/*
+ * Mount the server's root and walk from there to @export_path.
+ *
+ * @server: result of a server-creation call; may be an ERR_PTR, in
+ * which case its error is returned immediately
+ * @fc: filesystem context of the mount being set up; on success
+ * fc->root is set to the dentry found at @export_path
+ * @hostname: server name used to build the "source" parameter
+ * @export_path: path to walk to below the server's root
+ *
+ * Returns zero or a negative errno.
+ */
+static int do_nfs4_mount(struct nfs_server *server,
+ struct fs_context *fc,
+ const char *hostname,
+ const char *export_path)
+{
+ struct nfs_fs_context *root_ctx;
+ struct fs_context *root_fc;
+ struct vfsmount *root_mnt;
+ struct dentry *dentry;
+ size_t len;
+ int ret;
+
+ struct fs_parameter param = {
+ .key = "source",
+ .type = fs_value_is_string,
+ .dirfd = -1,
+ };
+
+ if (IS_ERR(server))
+ return PTR_ERR(server);
+
+ root_fc = vfs_dup_fs_context(fc);
+ if (IS_ERR(root_fc)) {
+ /* No context took the server yet, so free it here */
+ nfs_free_server(server);
+ return PTR_ERR(root_fc);
+ }
+ /* Drop the duplicated source; a new one is parsed in below */
+ kfree(root_fc->source);
+ root_fc->source = NULL;
+
+ root_ctx = nfs_fc2context(root_fc);
+ root_ctx->internal = true;
+ /*
+ * NOTE(review): later error paths only call put_fs_context(), so the
+ * server is presumably released together with root_fc - confirm.
+ */
+ root_ctx->server = server;
+ /* We leave export_path unset as it's not used to find the root. */
+
+ /* Room for "[" + hostname + "]:/" plus the terminating NUL */
+ len = strlen(hostname) + 5;
+ param.string = kmalloc(len, GFP_KERNEL);
+ if (param.string == NULL) {
+ put_fs_context(root_fc);
+ return -ENOMEM;
+ }
+
+ /* Does the hostname need to be enclosed in brackets? */
+ if (strchr(hostname, ':'))
+ param.size = snprintf(param.string, len, "[%s]:/", hostname);
+ else
+ param.size = snprintf(param.string, len, "%s:/", hostname);
+ ret = vfs_parse_fs_param(root_fc, &param);
+ kfree(param.string);
+ if (ret < 0) {
+ put_fs_context(root_fc);
+ return ret;
+ }
+ root_mnt = fc_mount(root_fc);
+ put_fs_context(root_fc);
+
+ if (IS_ERR(root_mnt))
+ return PTR_ERR(root_mnt);
+
+ /* Limit nested referrals for this task while walking the subtree */
+ ret = nfs_referral_loop_protect();
+ if (ret) {
+ mntput(root_mnt);
+ return ret;
+ }
+
+ dentry = mount_subtree(root_mnt, export_path);
+ nfs_referral_loop_unprotect();
+
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ fc->root = dentry;
+ return 0;
+}
+
+/*
+ * Set up an NFSv4 mount for @fc.
+ *
+ * Returns zero or the negative errno from do_nfs4_mount().
+ */
+int nfs4_try_get_tree(struct fs_context *fc)
+{
+	struct nfs_fs_context *ctx = nfs_fc2context(fc);
+	struct nfs_server *server;
+	int err;
+
+	dfprintk(MOUNT, "--> nfs4_try_get_tree()\n");
+
+	/* We create a mount for the server's root, walk to the requested
+	 * location and then create another mount for that.
+	 */
+	server = nfs4_create_server(fc);
+	err = do_nfs4_mount(server, fc,
+			    ctx->nfs_server.hostname,
+			    ctx->nfs_server.export_path);
+	if (!err) {
+		dfprintk(MOUNT, "<-- nfs4_try_get_tree() = 0\n");
+		return 0;
+	}
+
+	nfs_ferrorf(fc, MOUNT, "NFS4: Couldn't follow remote path");
+	dfprintk(MOUNT, "<-- nfs4_try_get_tree() = %d [error]\n", err);
+	return err;
+}
+
+/*
+ * Create an NFS4 server record on referral traversal
+ *
+ * Creates a referral server for @fc and mounts it via do_nfs4_mount().
+ *
+ * Returns zero or the negative errno from do_nfs4_mount().
+ */
+int nfs4_get_referral_tree(struct fs_context *fc)
+{
+	struct nfs_fs_context *ctx = nfs_fc2context(fc);
+	int err;
+
+	/*
+	 * The entry trace used to print the stale name
+	 * "nfs4_referral_mount" via dprintk(); use this function's name
+	 * and the MOUNT facility so it pairs with the exit traces below.
+	 */
+	dfprintk(MOUNT, "--> nfs4_get_referral_tree()\n");
+
+	/* create a new volume representation */
+	err = do_nfs4_mount(nfs4_create_referral_server(fc),
+			    fc, ctx->nfs_server.hostname,
+			    ctx->nfs_server.export_path);
+	if (err) {
+		nfs_ferrorf(fc, MOUNT, "NFS4: Couldn't follow remote path");
+		dfprintk(MOUNT, "<-- nfs4_get_referral_tree() = %d [error]\n", err);
+	} else {
+		dfprintk(MOUNT, "<-- nfs4_get_referral_tree() = 0\n");
+	}
+	return err;
+}
+
+/*
+ * Module init: bring up the DNS resolver, the idmapper, the xattr cache
+ * (CONFIG_NFS_V4_2 only) and the sysctl table, then register NFSv4.
+ *
+ * On failure every step that already succeeded is undone in reverse
+ * order and the error of the failing step is returned.
+ */
+static int __init init_nfs_v4(void)
+{
+	int err;
+
+	err = nfs_dns_resolver_init();
+	if (err)
+		goto out;
+
+	err = nfs_idmap_init();
+	if (err)
+		goto out1;
+
+#ifdef CONFIG_NFS_V4_2
+	err = nfs4_xattr_cache_init();
+	if (err)
+		goto out2;
+#endif
+
+	err = nfs4_register_sysctl();
+	if (err)
+		goto out3;
+
+#ifdef CONFIG_NFS_V4_2
+	nfs42_ssc_register_ops();
+#endif
+	register_nfs_version(&nfs_v4);
+	return 0;
+out3:
+#ifdef CONFIG_NFS_V4_2
+	/* The xattr cache was leaked on this path before */
+	nfs4_xattr_cache_exit();
+#endif
+out2:
+	nfs_idmap_quit();
+out1:
+	nfs_dns_resolver_destroy();
+out:
+	return err;
+}
+
+/*
+ * Module exit: unregister NFSv4 and tear down everything that
+ * init_nfs_v4() set up.
+ */
+static void __exit exit_nfs_v4(void)
+{
+ /* Not called in the _init(), conditionally loaded */
+ nfs4_pnfs_v3_ds_connect_unload();
+
+ unregister_nfs_version(&nfs_v4);
+#ifdef CONFIG_NFS_V4_2
+ nfs4_xattr_cache_exit();
+ nfs42_ssc_unregister_ops();
+#endif
+ nfs4_unregister_sysctl();
+ nfs_idmap_quit();
+ nfs_dns_resolver_destroy();
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_nfs_v4);
+module_exit(exit_nfs_v4);
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
new file mode 100644
index 000000000..c394e4447
--- /dev/null
+++ b/fs/nfs/nfs4sysctl.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/nfs4sysctl.c
+ *
+ * Sysctl interface to NFS v4 parameters
+ *
+ * Copyright (c) 2006 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/sysctl.h>
+#include <linux/nfs_fs.h>
+
+#include "nfs4_fs.h"
+#include "nfs4idmap.h"
+#include "callback.h"
+
+static const int nfs_set_port_min;
+static const int nfs_set_port_max = 65535;
+static struct ctl_table_header *nfs4_callback_sysctl_table;
+
+/*
+ * Leaf entries below fs/nfs: the callback TCP port (range-checked
+ * against nfs_set_port_min..nfs_set_port_max) and the idmap cache timeout.
+ */
+static struct ctl_table nfs4_cb_sysctls[] = {
+ {
+ .procname = "nfs_callback_tcpport",
+ .data = &nfs_callback_set_tcpport,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (int *)&nfs_set_port_min,
+ .extra2 = (int *)&nfs_set_port_max,
+ },
+ {
+ .procname = "idmap_cache_timeout",
+ .data = &nfs_idmap_cache_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+/* The "nfs" directory that holds the entries of nfs4_cb_sysctls */
+static struct ctl_table nfs4_cb_sysctl_dir[] = {
+ {
+ .procname = "nfs",
+ .mode = 0555,
+ .child = nfs4_cb_sysctls,
+ },
+ { }
+};
+
+/* The "fs" root directory; this is the table passed to register_sysctl_table() */
+static struct ctl_table nfs4_cb_sysctl_root[] = {
+ {
+ .procname = "fs",
+ .mode = 0555,
+ .child = nfs4_cb_sysctl_dir,
+ },
+ { }
+};
+
+/*
+ * Register the NFSv4 sysctl tree.
+ *
+ * Returns 0 on success or -ENOMEM if registration failed.
+ */
+int nfs4_register_sysctl(void)
+{
+	nfs4_callback_sysctl_table = register_sysctl_table(nfs4_cb_sysctl_root);
+
+	return nfs4_callback_sysctl_table != NULL ? 0 : -ENOMEM;
+}
+
+/* Unregister the NFSv4 sysctl tree and forget its header. */
+void nfs4_unregister_sysctl(void)
+{
+	struct ctl_table_header *header = nfs4_callback_sysctl_table;
+
+	unregister_sysctl_table(header);
+	nfs4_callback_sysctl_table = NULL;
+}
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
new file mode 100644
index 000000000..d9ac556be
--- /dev/null
+++ b/fs/nfs/nfs4trace.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
+#include "internal.h"
+#include "nfs4session.h"
+#include "callback.h"
+#include "pnfs.h"
+
+#define CREATE_TRACE_POINTS
+#include "nfs4trace.h"
+
+#ifdef CONFIG_NFS_V4_1
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_read);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_commit_ds);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_init_read);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_init_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_get_mirror_count);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_done);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_done);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_pagelist);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_pagelist);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_read_error);
+EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_write_error);
+EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_commit_error);
+#endif
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
new file mode 100644
index 000000000..d862df976
--- /dev/null
+++ b/fs/nfs/nfs4trace.h
@@ -0,0 +1,2311 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nfs4
+
+#if !defined(_TRACE_NFS4_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NFS4_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * TRACE_DEFINE_ENUM() declarations for every errno constant that appears in
+ * the show_nfsv4_errors() table below.
+ */
+TRACE_DEFINE_ENUM(EPERM);
+TRACE_DEFINE_ENUM(ENOENT);
+TRACE_DEFINE_ENUM(EIO);
+TRACE_DEFINE_ENUM(ENXIO);
+TRACE_DEFINE_ENUM(EACCES);
+TRACE_DEFINE_ENUM(EEXIST);
+TRACE_DEFINE_ENUM(EXDEV);
+TRACE_DEFINE_ENUM(ENOTDIR);
+TRACE_DEFINE_ENUM(EISDIR);
+TRACE_DEFINE_ENUM(EFBIG);
+TRACE_DEFINE_ENUM(ENOSPC);
+TRACE_DEFINE_ENUM(EROFS);
+TRACE_DEFINE_ENUM(EMLINK);
+TRACE_DEFINE_ENUM(ENAMETOOLONG);
+TRACE_DEFINE_ENUM(ENOTEMPTY);
+TRACE_DEFINE_ENUM(EDQUOT);
+TRACE_DEFINE_ENUM(ESTALE);
+TRACE_DEFINE_ENUM(EBADHANDLE);
+TRACE_DEFINE_ENUM(EBADCOOKIE);
+TRACE_DEFINE_ENUM(ENOTSUPP);
+TRACE_DEFINE_ENUM(ETOOSMALL);
+TRACE_DEFINE_ENUM(EREMOTEIO);
+TRACE_DEFINE_ENUM(EBADTYPE);
+TRACE_DEFINE_ENUM(EAGAIN);
+TRACE_DEFINE_ENUM(ELOOP);
+TRACE_DEFINE_ENUM(EOPNOTSUPP);
+TRACE_DEFINE_ENUM(EDEADLK);
+TRACE_DEFINE_ENUM(ENOMEM);
+TRACE_DEFINE_ENUM(EKEYEXPIRED);
+TRACE_DEFINE_ENUM(ETIMEDOUT);
+TRACE_DEFINE_ENUM(ERESTARTSYS);
+TRACE_DEFINE_ENUM(ECONNREFUSED);
+TRACE_DEFINE_ENUM(ECONNRESET);
+TRACE_DEFINE_ENUM(ENETUNREACH);
+TRACE_DEFINE_ENUM(EHOSTUNREACH);
+TRACE_DEFINE_ENUM(EHOSTDOWN);
+TRACE_DEFINE_ENUM(EPIPE);
+TRACE_DEFINE_ENUM(EPFNOSUPPORT);
+TRACE_DEFINE_ENUM(EPROTONOSUPPORT);
+
+/*
+ * TRACE_DEFINE_ENUM() declarations for NFS4_OK and every NFS4ERR_* constant
+ * that appears in the show_nfsv4_errors() table below, including the two
+ * RESET_TO_* values listed there as internal to the Linux NFS client.
+ */
+TRACE_DEFINE_ENUM(NFS4_OK);
+TRACE_DEFINE_ENUM(NFS4ERR_ACCESS);
+TRACE_DEFINE_ENUM(NFS4ERR_ATTRNOTSUPP);
+TRACE_DEFINE_ENUM(NFS4ERR_ADMIN_REVOKED);
+TRACE_DEFINE_ENUM(NFS4ERR_BACK_CHAN_BUSY);
+TRACE_DEFINE_ENUM(NFS4ERR_BADCHAR);
+TRACE_DEFINE_ENUM(NFS4ERR_BADHANDLE);
+TRACE_DEFINE_ENUM(NFS4ERR_BADIOMODE);
+TRACE_DEFINE_ENUM(NFS4ERR_BADLAYOUT);
+TRACE_DEFINE_ENUM(NFS4ERR_BADLABEL);
+TRACE_DEFINE_ENUM(NFS4ERR_BADNAME);
+TRACE_DEFINE_ENUM(NFS4ERR_BADOWNER);
+TRACE_DEFINE_ENUM(NFS4ERR_BADSESSION);
+TRACE_DEFINE_ENUM(NFS4ERR_BADSLOT);
+TRACE_DEFINE_ENUM(NFS4ERR_BADTYPE);
+TRACE_DEFINE_ENUM(NFS4ERR_BADXDR);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_COOKIE);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_HIGH_SLOT);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_RANGE);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_SEQID);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_SESSION_DIGEST);
+TRACE_DEFINE_ENUM(NFS4ERR_BAD_STATEID);
+TRACE_DEFINE_ENUM(NFS4ERR_CB_PATH_DOWN);
+TRACE_DEFINE_ENUM(NFS4ERR_CLID_INUSE);
+TRACE_DEFINE_ENUM(NFS4ERR_CLIENTID_BUSY);
+TRACE_DEFINE_ENUM(NFS4ERR_COMPLETE_ALREADY);
+TRACE_DEFINE_ENUM(NFS4ERR_CONN_NOT_BOUND_TO_SESSION);
+TRACE_DEFINE_ENUM(NFS4ERR_DEADLOCK);
+TRACE_DEFINE_ENUM(NFS4ERR_DEADSESSION);
+TRACE_DEFINE_ENUM(NFS4ERR_DELAY);
+TRACE_DEFINE_ENUM(NFS4ERR_DELEG_ALREADY_WANTED);
+TRACE_DEFINE_ENUM(NFS4ERR_DELEG_REVOKED);
+TRACE_DEFINE_ENUM(NFS4ERR_DENIED);
+TRACE_DEFINE_ENUM(NFS4ERR_DIRDELEG_UNAVAIL);
+TRACE_DEFINE_ENUM(NFS4ERR_DQUOT);
+TRACE_DEFINE_ENUM(NFS4ERR_ENCR_ALG_UNSUPP);
+TRACE_DEFINE_ENUM(NFS4ERR_EXIST);
+TRACE_DEFINE_ENUM(NFS4ERR_EXPIRED);
+TRACE_DEFINE_ENUM(NFS4ERR_FBIG);
+TRACE_DEFINE_ENUM(NFS4ERR_FHEXPIRED);
+TRACE_DEFINE_ENUM(NFS4ERR_FILE_OPEN);
+TRACE_DEFINE_ENUM(NFS4ERR_GRACE);
+TRACE_DEFINE_ENUM(NFS4ERR_HASH_ALG_UNSUPP);
+TRACE_DEFINE_ENUM(NFS4ERR_INVAL);
+TRACE_DEFINE_ENUM(NFS4ERR_IO);
+TRACE_DEFINE_ENUM(NFS4ERR_ISDIR);
+TRACE_DEFINE_ENUM(NFS4ERR_LAYOUTTRYLATER);
+TRACE_DEFINE_ENUM(NFS4ERR_LAYOUTUNAVAILABLE);
+TRACE_DEFINE_ENUM(NFS4ERR_LEASE_MOVED);
+TRACE_DEFINE_ENUM(NFS4ERR_LOCKED);
+TRACE_DEFINE_ENUM(NFS4ERR_LOCKS_HELD);
+TRACE_DEFINE_ENUM(NFS4ERR_LOCK_RANGE);
+TRACE_DEFINE_ENUM(NFS4ERR_MINOR_VERS_MISMATCH);
+TRACE_DEFINE_ENUM(NFS4ERR_MLINK);
+TRACE_DEFINE_ENUM(NFS4ERR_MOVED);
+TRACE_DEFINE_ENUM(NFS4ERR_NAMETOOLONG);
+TRACE_DEFINE_ENUM(NFS4ERR_NOENT);
+TRACE_DEFINE_ENUM(NFS4ERR_NOFILEHANDLE);
+TRACE_DEFINE_ENUM(NFS4ERR_NOMATCHING_LAYOUT);
+TRACE_DEFINE_ENUM(NFS4ERR_NOSPC);
+TRACE_DEFINE_ENUM(NFS4ERR_NOTDIR);
+TRACE_DEFINE_ENUM(NFS4ERR_NOTEMPTY);
+TRACE_DEFINE_ENUM(NFS4ERR_NOTSUPP);
+TRACE_DEFINE_ENUM(NFS4ERR_NOT_ONLY_OP);
+TRACE_DEFINE_ENUM(NFS4ERR_NOT_SAME);
+TRACE_DEFINE_ENUM(NFS4ERR_NO_GRACE);
+TRACE_DEFINE_ENUM(NFS4ERR_NXIO);
+TRACE_DEFINE_ENUM(NFS4ERR_OLD_STATEID);
+TRACE_DEFINE_ENUM(NFS4ERR_OPENMODE);
+TRACE_DEFINE_ENUM(NFS4ERR_OP_ILLEGAL);
+TRACE_DEFINE_ENUM(NFS4ERR_OP_NOT_IN_SESSION);
+TRACE_DEFINE_ENUM(NFS4ERR_PERM);
+TRACE_DEFINE_ENUM(NFS4ERR_PNFS_IO_HOLE);
+TRACE_DEFINE_ENUM(NFS4ERR_PNFS_NO_LAYOUT);
+TRACE_DEFINE_ENUM(NFS4ERR_RECALLCONFLICT);
+TRACE_DEFINE_ENUM(NFS4ERR_RECLAIM_BAD);
+TRACE_DEFINE_ENUM(NFS4ERR_RECLAIM_CONFLICT);
+TRACE_DEFINE_ENUM(NFS4ERR_REJECT_DELEG);
+TRACE_DEFINE_ENUM(NFS4ERR_REP_TOO_BIG);
+TRACE_DEFINE_ENUM(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+TRACE_DEFINE_ENUM(NFS4ERR_REQ_TOO_BIG);
+TRACE_DEFINE_ENUM(NFS4ERR_RESOURCE);
+TRACE_DEFINE_ENUM(NFS4ERR_RESTOREFH);
+TRACE_DEFINE_ENUM(NFS4ERR_RETRY_UNCACHED_REP);
+TRACE_DEFINE_ENUM(NFS4ERR_RETURNCONFLICT);
+TRACE_DEFINE_ENUM(NFS4ERR_ROFS);
+TRACE_DEFINE_ENUM(NFS4ERR_SAME);
+TRACE_DEFINE_ENUM(NFS4ERR_SHARE_DENIED);
+TRACE_DEFINE_ENUM(NFS4ERR_SEQUENCE_POS);
+TRACE_DEFINE_ENUM(NFS4ERR_SEQ_FALSE_RETRY);
+TRACE_DEFINE_ENUM(NFS4ERR_SEQ_MISORDERED);
+TRACE_DEFINE_ENUM(NFS4ERR_SERVERFAULT);
+TRACE_DEFINE_ENUM(NFS4ERR_STALE);
+TRACE_DEFINE_ENUM(NFS4ERR_STALE_CLIENTID);
+TRACE_DEFINE_ENUM(NFS4ERR_STALE_STATEID);
+TRACE_DEFINE_ENUM(NFS4ERR_SYMLINK);
+TRACE_DEFINE_ENUM(NFS4ERR_TOOSMALL);
+TRACE_DEFINE_ENUM(NFS4ERR_TOO_MANY_OPS);
+TRACE_DEFINE_ENUM(NFS4ERR_UNKNOWN_LAYOUTTYPE);
+TRACE_DEFINE_ENUM(NFS4ERR_UNSAFE_COMPOUND);
+TRACE_DEFINE_ENUM(NFS4ERR_WRONGSEC);
+TRACE_DEFINE_ENUM(NFS4ERR_WRONG_CRED);
+TRACE_DEFINE_ENUM(NFS4ERR_WRONG_TYPE);
+TRACE_DEFINE_ENUM(NFS4ERR_XDEV);
+
+TRACE_DEFINE_ENUM(NFS4ERR_RESET_TO_MDS);
+TRACE_DEFINE_ENUM(NFS4ERR_RESET_TO_PNFS);
+
+/*
+ * show_nfsv4_errors - translate an error value into a symbolic name.
+ *
+ * Wraps __print_symbolic() with a table of errno values and NFSv4 status
+ * codes. The table is keyed on the positive constants; the events in this
+ * file pass the value they stored in __entry->error and print the numeric
+ * form separately as -__entry->error.
+ */
+#define show_nfsv4_errors(error) \
+ __print_symbolic(error, \
+ { NFS4_OK, "OK" }, \
+ /* Mapped by nfs4_stat_to_errno() */ \
+ { EPERM, "EPERM" }, \
+ { ENOENT, "ENOENT" }, \
+ { EIO, "EIO" }, \
+ { ENXIO, "ENXIO" }, \
+ { EACCES, "EACCES" }, \
+ { EEXIST, "EEXIST" }, \
+ { EXDEV, "EXDEV" }, \
+ { ENOTDIR, "ENOTDIR" }, \
+ { EISDIR, "EISDIR" }, \
+ { EFBIG, "EFBIG" }, \
+ { ENOSPC, "ENOSPC" }, \
+ { EROFS, "EROFS" }, \
+ { EMLINK, "EMLINK" }, \
+ { ENAMETOOLONG, "ENAMETOOLONG" }, \
+ { ENOTEMPTY, "ENOTEMPTY" }, \
+ { EDQUOT, "EDQUOT" }, \
+ { ESTALE, "ESTALE" }, \
+ { EBADHANDLE, "EBADHANDLE" }, \
+ { EBADCOOKIE, "EBADCOOKIE" }, \
+ { ENOTSUPP, "ENOTSUPP" }, \
+ { ETOOSMALL, "ETOOSMALL" }, \
+ { EREMOTEIO, "EREMOTEIO" }, \
+ { EBADTYPE, "EBADTYPE" }, \
+ { EAGAIN, "EAGAIN" }, \
+ { ELOOP, "ELOOP" }, \
+ { EOPNOTSUPP, "EOPNOTSUPP" }, \
+ { EDEADLK, "EDEADLK" }, \
+ /* RPC errors */ \
+ { ENOMEM, "ENOMEM" }, \
+ { EKEYEXPIRED, "EKEYEXPIRED" }, \
+ { ETIMEDOUT, "ETIMEDOUT" }, \
+ { ERESTARTSYS, "ERESTARTSYS" }, \
+ { ECONNREFUSED, "ECONNREFUSED" }, \
+ { ECONNRESET, "ECONNRESET" }, \
+ { ENETUNREACH, "ENETUNREACH" }, \
+ { EHOSTUNREACH, "EHOSTUNREACH" }, \
+ { EHOSTDOWN, "EHOSTDOWN" }, \
+ { EPIPE, "EPIPE" }, \
+ { EPFNOSUPPORT, "EPFNOSUPPORT" }, \
+ { EPROTONOSUPPORT, "EPROTONOSUPPORT" }, \
+ /* NFSv4 native errors */ \
+ { NFS4ERR_ACCESS, "ACCESS" }, \
+ { NFS4ERR_ATTRNOTSUPP, "ATTRNOTSUPP" }, \
+ { NFS4ERR_ADMIN_REVOKED, "ADMIN_REVOKED" }, \
+ { NFS4ERR_BACK_CHAN_BUSY, "BACK_CHAN_BUSY" }, \
+ { NFS4ERR_BADCHAR, "BADCHAR" }, \
+ { NFS4ERR_BADHANDLE, "BADHANDLE" }, \
+ { NFS4ERR_BADIOMODE, "BADIOMODE" }, \
+ { NFS4ERR_BADLAYOUT, "BADLAYOUT" }, \
+ { NFS4ERR_BADLABEL, "BADLABEL" }, \
+ { NFS4ERR_BADNAME, "BADNAME" }, \
+ { NFS4ERR_BADOWNER, "BADOWNER" }, \
+ { NFS4ERR_BADSESSION, "BADSESSION" }, \
+ { NFS4ERR_BADSLOT, "BADSLOT" }, \
+ { NFS4ERR_BADTYPE, "BADTYPE" }, \
+ { NFS4ERR_BADXDR, "BADXDR" }, \
+ { NFS4ERR_BAD_COOKIE, "BAD_COOKIE" }, \
+ { NFS4ERR_BAD_HIGH_SLOT, "BAD_HIGH_SLOT" }, \
+ { NFS4ERR_BAD_RANGE, "BAD_RANGE" }, \
+ { NFS4ERR_BAD_SEQID, "BAD_SEQID" }, \
+ { NFS4ERR_BAD_SESSION_DIGEST, "BAD_SESSION_DIGEST" }, \
+ { NFS4ERR_BAD_STATEID, "BAD_STATEID" }, \
+ { NFS4ERR_CB_PATH_DOWN, "CB_PATH_DOWN" }, \
+ { NFS4ERR_CLID_INUSE, "CLID_INUSE" }, \
+ { NFS4ERR_CLIENTID_BUSY, "CLIENTID_BUSY" }, \
+ { NFS4ERR_COMPLETE_ALREADY, "COMPLETE_ALREADY" }, \
+ { NFS4ERR_CONN_NOT_BOUND_TO_SESSION, \
+ "CONN_NOT_BOUND_TO_SESSION" }, \
+ { NFS4ERR_DEADLOCK, "DEADLOCK" }, \
+ { NFS4ERR_DEADSESSION, "DEAD_SESSION" }, \
+ { NFS4ERR_DELAY, "DELAY" }, \
+ { NFS4ERR_DELEG_ALREADY_WANTED, \
+ "DELEG_ALREADY_WANTED" }, \
+ { NFS4ERR_DELEG_REVOKED, "DELEG_REVOKED" }, \
+ { NFS4ERR_DENIED, "DENIED" }, \
+ { NFS4ERR_DIRDELEG_UNAVAIL, "DIRDELEG_UNAVAIL" }, \
+ { NFS4ERR_DQUOT, "DQUOT" }, \
+ { NFS4ERR_ENCR_ALG_UNSUPP, "ENCR_ALG_UNSUPP" }, \
+ { NFS4ERR_EXIST, "EXIST" }, \
+ { NFS4ERR_EXPIRED, "EXPIRED" }, \
+ { NFS4ERR_FBIG, "FBIG" }, \
+ { NFS4ERR_FHEXPIRED, "FHEXPIRED" }, \
+ { NFS4ERR_FILE_OPEN, "FILE_OPEN" }, \
+ { NFS4ERR_GRACE, "GRACE" }, \
+ { NFS4ERR_HASH_ALG_UNSUPP, "HASH_ALG_UNSUPP" }, \
+ { NFS4ERR_INVAL, "INVAL" }, \
+ { NFS4ERR_IO, "IO" }, \
+ { NFS4ERR_ISDIR, "ISDIR" }, \
+ { NFS4ERR_LAYOUTTRYLATER, "LAYOUTTRYLATER" }, \
+ { NFS4ERR_LAYOUTUNAVAILABLE, "LAYOUTUNAVAILABLE" }, \
+ { NFS4ERR_LEASE_MOVED, "LEASE_MOVED" }, \
+ { NFS4ERR_LOCKED, "LOCKED" }, \
+ { NFS4ERR_LOCKS_HELD, "LOCKS_HELD" }, \
+ { NFS4ERR_LOCK_RANGE, "LOCK_RANGE" }, \
+ { NFS4ERR_MINOR_VERS_MISMATCH, "MINOR_VERS_MISMATCH" }, \
+ { NFS4ERR_MLINK, "MLINK" }, \
+ { NFS4ERR_MOVED, "MOVED" }, \
+ { NFS4ERR_NAMETOOLONG, "NAMETOOLONG" }, \
+ { NFS4ERR_NOENT, "NOENT" }, \
+ { NFS4ERR_NOFILEHANDLE, "NOFILEHANDLE" }, \
+ { NFS4ERR_NOMATCHING_LAYOUT, "NOMATCHING_LAYOUT" }, \
+ { NFS4ERR_NOSPC, "NOSPC" }, \
+ { NFS4ERR_NOTDIR, "NOTDIR" }, \
+ { NFS4ERR_NOTEMPTY, "NOTEMPTY" }, \
+ { NFS4ERR_NOTSUPP, "NOTSUPP" }, \
+ { NFS4ERR_NOT_ONLY_OP, "NOT_ONLY_OP" }, \
+ { NFS4ERR_NOT_SAME, "NOT_SAME" }, \
+ { NFS4ERR_NO_GRACE, "NO_GRACE" }, \
+ { NFS4ERR_NXIO, "NXIO" }, \
+ { NFS4ERR_OLD_STATEID, "OLD_STATEID" }, \
+ { NFS4ERR_OPENMODE, "OPENMODE" }, \
+ { NFS4ERR_OP_ILLEGAL, "OP_ILLEGAL" }, \
+ { NFS4ERR_OP_NOT_IN_SESSION, "OP_NOT_IN_SESSION" }, \
+ { NFS4ERR_PERM, "PERM" }, \
+ { NFS4ERR_PNFS_IO_HOLE, "PNFS_IO_HOLE" }, \
+ { NFS4ERR_PNFS_NO_LAYOUT, "PNFS_NO_LAYOUT" }, \
+ { NFS4ERR_RECALLCONFLICT, "RECALLCONFLICT" }, \
+ { NFS4ERR_RECLAIM_BAD, "RECLAIM_BAD" }, \
+ { NFS4ERR_RECLAIM_CONFLICT, "RECLAIM_CONFLICT" }, \
+ { NFS4ERR_REJECT_DELEG, "REJECT_DELEG" }, \
+ { NFS4ERR_REP_TOO_BIG, "REP_TOO_BIG" }, \
+ { NFS4ERR_REP_TOO_BIG_TO_CACHE, \
+ "REP_TOO_BIG_TO_CACHE" }, \
+ { NFS4ERR_REQ_TOO_BIG, "REQ_TOO_BIG" }, \
+ { NFS4ERR_RESOURCE, "RESOURCE" }, \
+ { NFS4ERR_RESTOREFH, "RESTOREFH" }, \
+ { NFS4ERR_RETRY_UNCACHED_REP, "RETRY_UNCACHED_REP" }, \
+ { NFS4ERR_RETURNCONFLICT, "RETURNCONFLICT" }, \
+ { NFS4ERR_ROFS, "ROFS" }, \
+ { NFS4ERR_SAME, "SAME" }, \
+ { NFS4ERR_SHARE_DENIED, "SHARE_DENIED" }, \
+ { NFS4ERR_SEQUENCE_POS, "SEQUENCE_POS" }, \
+ { NFS4ERR_SEQ_FALSE_RETRY, "SEQ_FALSE_RETRY" }, \
+ { NFS4ERR_SEQ_MISORDERED, "SEQ_MISORDERED" }, \
+ { NFS4ERR_SERVERFAULT, "SERVERFAULT" }, \
+ { NFS4ERR_STALE, "STALE" }, \
+ { NFS4ERR_STALE_CLIENTID, "STALE_CLIENTID" }, \
+ { NFS4ERR_STALE_STATEID, "STALE_STATEID" }, \
+ { NFS4ERR_SYMLINK, "SYMLINK" }, \
+ { NFS4ERR_TOOSMALL, "TOOSMALL" }, \
+ { NFS4ERR_TOO_MANY_OPS, "TOO_MANY_OPS" }, \
+ { NFS4ERR_UNKNOWN_LAYOUTTYPE, "UNKNOWN_LAYOUTTYPE" }, \
+ { NFS4ERR_UNSAFE_COMPOUND, "UNSAFE_COMPOUND" }, \
+ { NFS4ERR_WRONGSEC, "WRONGSEC" }, \
+ { NFS4ERR_WRONG_CRED, "WRONG_CRED" }, \
+ { NFS4ERR_WRONG_TYPE, "WRONG_TYPE" }, \
+ { NFS4ERR_XDEV, "XDEV" }, \
+ /* ***** Internal to Linux NFS client ***** */ \
+ { NFS4ERR_RESET_TO_MDS, "RESET_TO_MDS" }, \
+ { NFS4ERR_RESET_TO_PNFS, "RESET_TO_PNFS" })
+
+/*
+ * show_open_flags - render the O_CREAT, O_EXCL, O_TRUNC and O_DIRECT bits of
+ * @flags as a "|"-separated list via __print_flags(). No other bits are named
+ * in the table.
+ */
+#define show_open_flags(flags) \
+ __print_flags(flags, "|", \
+ { O_CREAT, "O_CREAT" }, \
+ { O_EXCL, "O_EXCL" }, \
+ { O_TRUNC, "O_TRUNC" }, \
+ { O_DIRECT, "O_DIRECT" })
+
+/*
+ * show_fmode_flags - render the FMODE_READ, FMODE_WRITE and FMODE_EXEC bits
+ * of @mode as a "|"-separated list. The FMODE_* constants are __force-cast to
+ * unsigned long for use as table keys.
+ */
+#define show_fmode_flags(mode) \
+ __print_flags(mode, "|", \
+ { ((__force unsigned long)FMODE_READ), "READ" }, \
+ { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \
+ { ((__force unsigned long)FMODE_EXEC), "EXEC" })
+
+/*
+ * show_nfs_fattr_flags - render the NFS_ATTR_FATTR_* bits set in @valid as a
+ * "|"-separated list. @valid is cast to unsigned long before being handed to
+ * __print_flags().
+ */
+#define show_nfs_fattr_flags(valid) \
+ __print_flags((unsigned long)valid, "|", \
+ { NFS_ATTR_FATTR_TYPE, "TYPE" }, \
+ { NFS_ATTR_FATTR_MODE, "MODE" }, \
+ { NFS_ATTR_FATTR_NLINK, "NLINK" }, \
+ { NFS_ATTR_FATTR_OWNER, "OWNER" }, \
+ { NFS_ATTR_FATTR_GROUP, "GROUP" }, \
+ { NFS_ATTR_FATTR_RDEV, "RDEV" }, \
+ { NFS_ATTR_FATTR_SIZE, "SIZE" }, \
+ { NFS_ATTR_FATTR_FSID, "FSID" }, \
+ { NFS_ATTR_FATTR_FILEID, "FILEID" }, \
+ { NFS_ATTR_FATTR_ATIME, "ATIME" }, \
+ { NFS_ATTR_FATTR_MTIME, "MTIME" }, \
+ { NFS_ATTR_FATTR_CTIME, "CTIME" }, \
+ { NFS_ATTR_FATTR_CHANGE, "CHANGE" }, \
+ { NFS_ATTR_FATTR_OWNER_NAME, "OWNER_NAME" }, \
+ { NFS_ATTR_FATTR_GROUP_NAME, "GROUP_NAME" })
+
+/*
+ * Event class nfs4_clientid_event.
+ *
+ * @clp:   client whose cl_hostname is recorded as "dstaddr"
+ * @error: status of the operation; a negative value is stored as its
+ *         magnitude, anything else is stored as 0
+ *
+ * Output: the error as a negative number plus its symbolic name, then the
+ * recorded hostname.
+ */
+DECLARE_EVENT_CLASS(nfs4_clientid_event,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ int error
+ ),
+
+ TP_ARGS(clp, error),
+
+ TP_STRUCT__entry(
+ __string(dstaddr, clp->cl_hostname)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error < 0 ? -error : 0;
+ __assign_str(dstaddr, clp->cl_hostname);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) dstaddr=%s",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ __get_str(dstaddr)
+ )
+);
+/*
+ * DEFINE_NFS4_CLIENTID_EVENT - instantiate an event called @name from the
+ * nfs4_clientid_event class, using the class's (clp, error) prototype.
+ *
+ * The instances after the #ifdef are only defined when CONFIG_NFS_V4_1 is
+ * set; that conditional region continues past this list.
+ */
+#define DEFINE_NFS4_CLIENTID_EVENT(name) \
+ DEFINE_EVENT(nfs4_clientid_event, name, \
+ TP_PROTO( \
+ const struct nfs_client *clp, \
+ int error \
+ ), \
+ TP_ARGS(clp, error))
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid_confirm);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew_async);
+#ifdef CONFIG_NFS_V4_1
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_exchange_id);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_create_session);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_session);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_clientid);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete);
+
+/*
+ * show_nfs4_sequence_status_flags - render the SEQ4_STATUS_* bits set in
+ * @status as a "|"-separated list. @status is cast to unsigned long before
+ * being handed to __print_flags().
+ */
+#define show_nfs4_sequence_status_flags(status) \
+ __print_flags((unsigned long)status, "|", \
+ { SEQ4_STATUS_CB_PATH_DOWN, "CB_PATH_DOWN" }, \
+ { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING, \
+ "CB_GSS_CONTEXTS_EXPIRING" }, \
+ { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED, \
+ "CB_GSS_CONTEXTS_EXPIRED" }, \
+ { SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED, \
+ "EXPIRED_ALL_STATE_REVOKED" }, \
+ { SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED, \
+ "EXPIRED_SOME_STATE_REVOKED" }, \
+ { SEQ4_STATUS_ADMIN_STATE_REVOKED, \
+ "ADMIN_STATE_REVOKED" }, \
+ { SEQ4_STATUS_RECALLABLE_STATE_REVOKED, \
+ "RECALLABLE_STATE_REVOKED" }, \
+ { SEQ4_STATUS_LEASE_MOVED, "LEASE_MOVED" }, \
+ { SEQ4_STATUS_RESTART_RECLAIM_NEEDED, \
+ "RESTART_RECLAIM_NEEDED" }, \
+ { SEQ4_STATUS_CB_PATH_DOWN_SESSION, \
+ "CB_PATH_DOWN_SESSION" }, \
+ { SEQ4_STATUS_BACKCHANNEL_FAULT, \
+ "BACKCHANNEL_FAULT" })
+
+/*
+ * nfs4_sequence_done
+ *
+ * @session: session whose sess_id is hashed into the "session" field
+ * @res:     sequence result; supplies the slot (res->sr_slot), the highest
+ *           and target-highest slot ids, the status flags and the status
+ *
+ * A negative res->sr_status is stored as its magnitude; any other value is
+ * stored as 0. res->sr_slot is dereferenced without a NULL check.
+ */
+TRACE_EVENT(nfs4_sequence_done,
+ TP_PROTO(
+ const struct nfs4_session *session,
+ const struct nfs4_sequence_res *res
+ ),
+ TP_ARGS(session, res),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, session)
+ __field(unsigned int, slot_nr)
+ __field(unsigned int, seq_nr)
+ __field(unsigned int, highest_slotid)
+ __field(unsigned int, target_highest_slotid)
+ __field(unsigned int, status_flags)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ const struct nfs4_slot *sr_slot = res->sr_slot;
+ __entry->session = nfs_session_id_hash(&session->sess_id);
+ __entry->slot_nr = sr_slot->slot_nr;
+ __entry->seq_nr = sr_slot->seq_nr;
+ __entry->highest_slotid = res->sr_highest_slotid;
+ __entry->target_highest_slotid =
+ res->sr_target_highest_slotid;
+ __entry->status_flags = res->sr_status_flags;
+ __entry->error = res->sr_status < 0 ?
+ -res->sr_status : 0;
+ ),
+ TP_printk(
+ "error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u "
+ "highest_slotid=%u target_highest_slotid=%u "
+ "status_flags=%u (%s)",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ __entry->session,
+ __entry->slot_nr,
+ __entry->seq_nr,
+ __entry->highest_slotid,
+ __entry->target_highest_slotid,
+ __entry->status_flags,
+ show_nfs4_sequence_status_flags(__entry->status_flags)
+ )
+);
+
+/* Forward declarations for the callback sequence events below. */
+struct cb_sequenceargs;
+struct cb_sequenceres;
+
+/*
+ * nfs4_cb_sequence
+ *
+ * @args:   callback sequence arguments; session id (hashed), slot id,
+ *          sequence id, highest slot id and cachethis are recorded
+ * @res:    accepted by the prototype but not read by this event
+ * @status: big-endian status, converted with be32_to_cpu() and stored
+ *          as-is in "error"
+ *
+ * "cachethis" is recorded in the entry but is not part of the printed
+ * output.
+ */
+TRACE_EVENT(nfs4_cb_sequence,
+ TP_PROTO(
+ const struct cb_sequenceargs *args,
+ const struct cb_sequenceres *res,
+ __be32 status
+ ),
+ TP_ARGS(args, res, status),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, session)
+ __field(unsigned int, slot_nr)
+ __field(unsigned int, seq_nr)
+ __field(unsigned int, highest_slotid)
+ __field(unsigned int, cachethis)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ __entry->session = nfs_session_id_hash(&args->csa_sessionid);
+ __entry->slot_nr = args->csa_slotid;
+ __entry->seq_nr = args->csa_sequenceid;
+ __entry->highest_slotid = args->csa_highestslotid;
+ __entry->cachethis = args->csa_cachethis;
+ __entry->error = be32_to_cpu(status);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u "
+ "highest_slotid=%u",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ __entry->session,
+ __entry->slot_nr,
+ __entry->seq_nr,
+ __entry->highest_slotid
+ )
+);
+
+/*
+ * nfs4_cb_seqid_err
+ *
+ * Records the same fields as nfs4_cb_sequence, taken from @args, together
+ * with @status converted by be32_to_cpu(). "cachethis" is recorded but not
+ * printed.
+ */
+TRACE_EVENT(nfs4_cb_seqid_err,
+ TP_PROTO(
+ const struct cb_sequenceargs *args,
+ __be32 status
+ ),
+ TP_ARGS(args, status),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, session)
+ __field(unsigned int, slot_nr)
+ __field(unsigned int, seq_nr)
+ __field(unsigned int, highest_slotid)
+ __field(unsigned int, cachethis)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ __entry->session = nfs_session_id_hash(&args->csa_sessionid);
+ __entry->slot_nr = args->csa_slotid;
+ __entry->seq_nr = args->csa_sequenceid;
+ __entry->highest_slotid = args->csa_highestslotid;
+ __entry->cachethis = args->csa_cachethis;
+ __entry->error = be32_to_cpu(status);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u "
+ "highest_slotid=%u",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ __entry->session,
+ __entry->slot_nr,
+ __entry->seq_nr,
+ __entry->highest_slotid
+ )
+);
+
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * nfs4_setup_sequence
+ *
+ * @session: may be NULL, in which case the "session" field is recorded as 0;
+ *           otherwise its sess_id is hashed
+ * @args:    sequence arguments; args->sa_slot supplies slot_nr, seq_nr and,
+ *           through its table, highest_used_slotid
+ *
+ * args->sa_slot and its table pointer are dereferenced without NULL checks.
+ */
+TRACE_EVENT(nfs4_setup_sequence,
+ TP_PROTO(
+ const struct nfs4_session *session,
+ const struct nfs4_sequence_args *args
+ ),
+ TP_ARGS(session, args),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, session)
+ __field(unsigned int, slot_nr)
+ __field(unsigned int, seq_nr)
+ __field(unsigned int, highest_used_slotid)
+ ),
+
+ TP_fast_assign(
+ const struct nfs4_slot *sa_slot = args->sa_slot;
+ __entry->session = session ? nfs_session_id_hash(&session->sess_id) : 0;
+ __entry->slot_nr = sa_slot->slot_nr;
+ __entry->seq_nr = sa_slot->seq_nr;
+ __entry->highest_used_slotid =
+ sa_slot->table->highest_used_slotid;
+ ),
+ TP_printk(
+ "session=0x%08x slot_nr=%u seq_nr=%u "
+ "highest_used_slotid=%u",
+ __entry->session,
+ __entry->slot_nr,
+ __entry->seq_nr,
+ __entry->highest_used_slotid
+ )
+);
+
+/*
+ * TRACE_DEFINE_ENUM() declarations for the NFS4CLNT_* constants used by
+ * show_nfs4_clp_state() below.
+ */
+TRACE_DEFINE_ENUM(NFS4CLNT_MANAGER_RUNNING);
+TRACE_DEFINE_ENUM(NFS4CLNT_CHECK_LEASE);
+TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_EXPIRED);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECLAIM_REBOOT);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECLAIM_NOGRACE);
+TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN);
+TRACE_DEFINE_ENUM(NFS4CLNT_SESSION_RESET);
+TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_CONFIRM);
+TRACE_DEFINE_ENUM(NFS4CLNT_SERVER_SCOPE_MISMATCH);
+TRACE_DEFINE_ENUM(NFS4CLNT_PURGE_STATE);
+TRACE_DEFINE_ENUM(NFS4CLNT_BIND_CONN_TO_SESSION);
+TRACE_DEFINE_ENUM(NFS4CLNT_MOVED);
+TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_MOVED);
+TRACE_DEFINE_ENUM(NFS4CLNT_DELEGATION_EXPIRED);
+TRACE_DEFINE_ENUM(NFS4CLNT_RUN_MANAGER);
+TRACE_DEFINE_ENUM(NFS4CLNT_MANAGER_AVAILABLE);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_RUNNING);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_READ);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_RW);
+TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_DELAYED);
+
+/*
+ * show_nfs4_clp_state - render the NFS4CLNT_* bits set in @state as a
+ * "|"-separated list. Each NFS4CLNT_* constant is wrapped in BIT() to form
+ * the mask that __print_flags() tests; each label is the constant's name
+ * without the NFS4CLNT_ prefix.
+ */
+#define show_nfs4_clp_state(state) \
+ __print_flags(state, "|", \
+ { BIT(NFS4CLNT_MANAGER_RUNNING), "MANAGER_RUNNING" }, \
+ { BIT(NFS4CLNT_CHECK_LEASE), "CHECK_LEASE" }, \
+ { BIT(NFS4CLNT_LEASE_EXPIRED), "LEASE_EXPIRED" }, \
+ { BIT(NFS4CLNT_RECLAIM_REBOOT), "RECLAIM_REBOOT" }, \
+ { BIT(NFS4CLNT_RECLAIM_NOGRACE), "RECLAIM_NOGRACE" }, \
+ { BIT(NFS4CLNT_DELEGRETURN), "DELEGRETURN" }, \
+ { BIT(NFS4CLNT_SESSION_RESET), "SESSION_RESET" }, \
+ { BIT(NFS4CLNT_LEASE_CONFIRM), "LEASE_CONFIRM" }, \
+ { BIT(NFS4CLNT_SERVER_SCOPE_MISMATCH), "SERVER_SCOPE_MISMATCH" }, \
+ { BIT(NFS4CLNT_PURGE_STATE), "PURGE_STATE" }, \
+ { BIT(NFS4CLNT_BIND_CONN_TO_SESSION), "BIND_CONN_TO_SESSION" }, \
+ { BIT(NFS4CLNT_MOVED), "MOVED" }, \
+ { BIT(NFS4CLNT_LEASE_MOVED), "LEASE_MOVED" }, \
+ { BIT(NFS4CLNT_DELEGATION_EXPIRED), "DELEGATION_EXPIRED" }, \
+ { BIT(NFS4CLNT_RUN_MANAGER), "RUN_MANAGER" }, \
+ { BIT(NFS4CLNT_MANAGER_AVAILABLE), "MANAGER_AVAILABLE" }, \
+ { BIT(NFS4CLNT_RECALL_RUNNING), "RECALL_RUNNING" }, \
+ { BIT(NFS4CLNT_RECALL_ANY_LAYOUT_READ), "RECALL_ANY_LAYOUT_READ" }, \
+ { BIT(NFS4CLNT_RECALL_ANY_LAYOUT_RW), "RECALL_ANY_LAYOUT_RW" }, \
+ { BIT(NFS4CLNT_DELEGRETURN_DELAYED), "DELEGRETURN_DELAYED" })
+
+/*
+ * nfs4_state_mgr
+ *
+ * @clp: client whose cl_state bitmask and cl_hostname are recorded
+ *
+ * Output: the hostname followed by the state bits decoded with
+ * show_nfs4_clp_state().
+ */
+TRACE_EVENT(nfs4_state_mgr,
+ TP_PROTO(
+ const struct nfs_client *clp
+ ),
+
+ TP_ARGS(clp),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, state)
+ __string(hostname, clp->cl_hostname)
+ ),
+
+ TP_fast_assign(
+ __entry->state = clp->cl_state;
+ __assign_str(hostname, clp->cl_hostname);
+ ),
+
+ TP_printk(
+ "hostname=%s clp state=%s", __get_str(hostname),
+ show_nfs4_clp_state(__entry->state)
+ )
+);
+
+/*
+ * nfs4_state_mgr_failed
+ *
+ * @clp:     client whose cl_state bitmask and cl_hostname are recorded
+ * @section: string copied into the "section" field
+ * @status:  a negative value is stored as its magnitude, anything else is
+ *           stored as 0
+ *
+ * Output: hostname, decoded client state, the error (numeric and symbolic)
+ * and the section string.
+ */
+TRACE_EVENT(nfs4_state_mgr_failed,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const char *section,
+ int status
+ ),
+
+ TP_ARGS(clp, section, status),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(unsigned long, state)
+ __string(hostname, clp->cl_hostname)
+ __string(section, section)
+ ),
+
+ TP_fast_assign(
+ __entry->error = status < 0 ? -status : 0;
+ __entry->state = clp->cl_state;
+ __assign_str(hostname, clp->cl_hostname);
+ __assign_str(section, section);
+ ),
+
+ TP_printk(
+ "hostname=%s clp state=%s error=%ld (%s) section=%s",
+ __get_str(hostname),
+ show_nfs4_clp_state(__entry->state), -__entry->error,
+ show_nfsv4_errors(__entry->error), __get_str(section)
+
+ )
+);
+
+/*
+ * nfs4_xdr_status
+ *
+ * @xdr:   XDR stream; xdr->rqst and its rq_task are dereferenced to obtain
+ *         the task pid, the RPC client id and the XID
+ * @op:    operation number, recorded unchanged
+ * @error: status value, recorded unchanged (no sign handling here)
+ *
+ * The XID is converted with be32_to_cpu() before being stored.
+ */
+TRACE_EVENT(nfs4_xdr_status,
+ TP_PROTO(
+ const struct xdr_stream *xdr,
+ u32 op,
+ u32 error
+ ),
+
+ TP_ARGS(xdr, op, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, task_id)
+ __field(unsigned int, client_id)
+ __field(u32, xid)
+ __field(u32, op)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ const struct rpc_rqst *rqstp = xdr->rqst;
+ const struct rpc_task *task = rqstp->rq_task;
+
+ __entry->task_id = task->tk_pid;
+ __entry->client_id = task->tk_client->cl_clid;
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->op = op;
+ __entry->error = error;
+ ),
+
+ TP_printk(
+ "task:%u@%d xid=0x%08x error=%ld (%s) operation=%u",
+ __entry->task_id, __entry->client_id, __entry->xid,
+ -__entry->error, show_nfsv4_errors(__entry->error),
+ __entry->op
+ )
+);
+
+/*
+ * Event class nfs4_cb_error_class.
+ *
+ * @xid:      big-endian XID, converted with be32_to_cpu() before storing
+ * @cb_ident: callback identifier, stored unchanged as "cbident"
+ *
+ * Both values are printed as 8-digit hexadecimal numbers.
+ */
+DECLARE_EVENT_CLASS(nfs4_cb_error_class,
+ TP_PROTO(
+ __be32 xid,
+ u32 cb_ident
+ ),
+
+ TP_ARGS(xid, cb_ident),
+
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(u32, cbident)
+ ),
+
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(xid);
+ __entry->cbident = cb_ident;
+ ),
+
+ TP_printk(
+ "xid=0x%08x cb_ident=0x%08x",
+ __entry->xid, __entry->cbident
+ )
+);
+
+/*
+ * DEFINE_CB_ERROR_EVENT - instantiate an event from nfs4_cb_error_class.
+ * The event name is @name prefixed with "nfs_cb_", so the two instances
+ * below are nfs_cb_no_clp and nfs_cb_badprinc.
+ */
+#define DEFINE_CB_ERROR_EVENT(name) \
+ DEFINE_EVENT(nfs4_cb_error_class, nfs_cb_##name, \
+ TP_PROTO( \
+ __be32 xid, \
+ u32 cb_ident \
+ ), \
+ TP_ARGS(xid, cb_ident))
+
+DEFINE_CB_ERROR_EVENT(no_clp);
+DEFINE_CB_ERROR_EVENT(badprinc);
+
+/*
+ * Event class nfs4_open_event.
+ *
+ * @ctx:   open context; supplies the mode, the dentry (name, parent
+ *         directory, superblock device) and the nfs4_state
+ * @flags: open flags, decoded on output with show_open_flags()
+ * @error: status of the open
+ *
+ * ctx->state may be NULL or an ERR_PTR value: in that case the stateid
+ * fields, the fileid and the fhandle hash are all recorded as 0. The fileid
+ * and fhandle are also 0 when the state has no inode.
+ *
+ * NOTE(review): the error is stored as plain -error, without the
+ * "error < 0 ? -error : 0" clamp used by other events in this file; confirm
+ * whether callers can pass a positive value here.
+ */
+DECLARE_EVENT_CLASS(nfs4_open_event,
+ TP_PROTO(
+ const struct nfs_open_context *ctx,
+ int flags,
+ int error
+ ),
+
+ TP_ARGS(ctx, flags, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(unsigned int, flags)
+ __field(unsigned int, fmode)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, dir)
+ __string(name, ctx->dentry->d_name.name)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, openstateid_seq)
+ __field(u32, openstateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct nfs4_state *state = ctx->state;
+ const struct inode *inode = NULL;
+
+ __entry->error = -error;
+ __entry->flags = flags;
+ __entry->fmode = (__force unsigned int)ctx->mode;
+ __entry->dev = ctx->dentry->d_sb->s_dev;
+ /* Only a usable state contributes an inode and stateids. */
+ if (!IS_ERR_OR_NULL(state)) {
+ inode = state->inode;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ __entry->openstateid_seq =
+ be32_to_cpu(state->open_stateid.seqid);
+ __entry->openstateid_hash =
+ nfs_stateid_hash(&state->open_stateid);
+ } else {
+ __entry->stateid_seq = 0;
+ __entry->stateid_hash = 0;
+ __entry->openstateid_seq = 0;
+ __entry->openstateid_hash = 0;
+ }
+ if (inode != NULL) {
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ } else {
+ __entry->fileid = 0;
+ __entry->fhandle = 0;
+ }
+ /* "dir" is the fileid of the dentry's parent directory. */
+ __entry->dir = NFS_FILEID(d_inode(ctx->dentry->d_parent));
+ __assign_str(name, ctx->dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) flags=%d (%s) fmode=%s "
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "name=%02x:%02x:%llu/%s stateid=%d:0x%08x "
+ "openstateid=%d:0x%08x",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ __entry->flags,
+ show_open_flags(__entry->flags),
+ show_fmode_flags(__entry->fmode),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name),
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->openstateid_seq, __entry->openstateid_hash
+ )
+);
+
+/*
+ * DEFINE_NFS4_OPEN_EVENT - instantiate an event called @name from the
+ * nfs4_open_event class, using the class's (ctx, flags, error) prototype.
+ */
+#define DEFINE_NFS4_OPEN_EVENT(name) \
+ DEFINE_EVENT(nfs4_open_event, name, \
+ TP_PROTO( \
+ const struct nfs_open_context *ctx, \
+ int flags, \
+ int error \
+ ), \
+ TP_ARGS(ctx, flags, error))
+DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim);
+DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired);
+DEFINE_NFS4_OPEN_EVENT(nfs4_open_file);
+
+/*
+ * nfs4_cached_open
+ *
+ * @state: open state; state->inode supplies the device, fileid and file
+ *         handle hash, state->state is recorded as "fmode", and
+ *         state->stateid supplies the seqid and hash
+ *
+ * On output an fmode of 0 is printed as "closed"; otherwise it is decoded
+ * with show_fmode_flags(). state->inode is dereferenced without a NULL
+ * check.
+ */
+TRACE_EVENT(nfs4_cached_open,
+ TP_PROTO(
+ const struct nfs4_state *state
+ ),
+ TP_ARGS(state),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned int, fmode)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->fmode = (__force unsigned int)state->state;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ ),
+
+ TP_printk(
+ "fmode=%s fileid=%02x:%02x:%llu "
+ "fhandle=0x%08x stateid=%d:0x%08x",
+ __entry->fmode ? show_fmode_flags(__entry->fmode) :
+ "closed",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
+/*
+ * nfs4_close
+ *
+ * @state: open state; state->inode supplies device, fileid and file handle
+ *         hash, and state->state is recorded as "fmode"
+ * @args:  close arguments; args->stateid supplies the seqid and hash that
+ *         are printed under the "openstateid" label
+ * @res:   accepted by the prototype but not read by this event
+ * @error: a negative value is stored as its magnitude, anything else as 0
+ *
+ * On output an fmode of 0 is printed as "closed".
+ */
+TRACE_EVENT(nfs4_close,
+ TP_PROTO(
+ const struct nfs4_state *state,
+ const struct nfs_closeargs *args,
+ const struct nfs_closeres *res,
+ int error
+ ),
+
+ TP_ARGS(state, args, res, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned int, fmode)
+ __field(unsigned long, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->fmode = (__force unsigned int)state->state;
+ __entry->error = error < 0 ? -error : 0;
+ __entry->stateid_seq =
+ be32_to_cpu(args->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fmode=%s fileid=%02x:%02x:%llu "
+ "fhandle=0x%08x openstateid=%d:0x%08x",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ __entry->fmode ? show_fmode_flags(__entry->fmode) :
+ "closed",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
+/*
+ * TRACE_DEFINE_ENUM() declarations for the lock command and lock type
+ * constants used by show_lock_cmd() and show_lock_type() below.
+ */
+TRACE_DEFINE_ENUM(F_GETLK);
+TRACE_DEFINE_ENUM(F_SETLK);
+TRACE_DEFINE_ENUM(F_SETLKW);
+TRACE_DEFINE_ENUM(F_RDLCK);
+TRACE_DEFINE_ENUM(F_WRLCK);
+TRACE_DEFINE_ENUM(F_UNLCK);
+
+/*
+ * show_lock_cmd - map a lock command (F_GETLK, F_SETLK or F_SETLKW) to its
+ * name without the "F_" prefix. The argument is cast to int before lookup.
+ */
+#define show_lock_cmd(type) \
+ __print_symbolic((int)type, \
+ { F_GETLK, "GETLK" }, \
+ { F_SETLK, "SETLK" }, \
+ { F_SETLKW, "SETLKW" })
+/*
+ * show_lock_type - map a lock type (F_RDLCK, F_WRLCK or F_UNLCK) to its name
+ * without the "F_" prefix. The argument is cast to int before lookup.
+ */
+#define show_lock_type(type) \
+ __print_symbolic((int)type, \
+ { F_RDLCK, "RDLCK" }, \
+ { F_WRLCK, "WRLCK" }, \
+ { F_UNLCK, "UNLCK" })
+
+/*
+ * Event class nfs4_lock_event.
+ *
+ * @request: lock request; fl_type, fl_start and fl_end are recorded
+ * @state:   open state; state->inode supplies device, fileid and file handle
+ *           hash, state->stateid supplies the seqid and hash
+ * @cmd:     lock command, decoded on output with show_lock_cmd()
+ * @error:   a negative value is stored as its magnitude, anything else as 0
+ *
+ * The lock type is stored in a char field and decoded on output with
+ * show_lock_type().
+ */
+DECLARE_EVENT_CLASS(nfs4_lock_event,
+ TP_PROTO(
+ const struct file_lock *request,
+ const struct nfs4_state *state,
+ int cmd,
+ int error
+ ),
+
+ TP_ARGS(request, state, cmd, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(int, cmd)
+ __field(char, type)
+ __field(loff_t, start)
+ __field(loff_t, end)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->error = error < 0 ? -error : 0;
+ __entry->cmd = cmd;
+ __entry->type = request->fl_type;
+ __entry->start = request->fl_start;
+ __entry->end = request->fl_end;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) cmd=%s:%s range=%lld:%lld "
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ show_lock_cmd(__entry->cmd),
+ show_lock_type(__entry->type),
+ (long long)__entry->start,
+ (long long)__entry->end,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
+/*
+ * DEFINE_NFS4_LOCK_EVENT - instantiate an event called @name from the
+ * nfs4_lock_event class, using the class's (request, state, cmd, error)
+ * prototype.
+ */
+#define DEFINE_NFS4_LOCK_EVENT(name) \
+ DEFINE_EVENT(nfs4_lock_event, name, \
+ TP_PROTO( \
+ const struct file_lock *request, \
+ const struct nfs4_state *state, \
+ int cmd, \
+ int error \
+ ), \
+ TP_ARGS(request, state, cmd, error))
+DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock);
+DEFINE_NFS4_LOCK_EVENT(nfs4_unlock);
+
+/*
+ * nfs4_set_lock
+ *
+ * Records the same fields as the nfs4_lock_event class and additionally the
+ * seqid and hash of @lockstateid.
+ *
+ * @request:     lock request; fl_type, fl_start and fl_end are recorded
+ * @state:       open state supplying the inode and the open stateid
+ * @lockstateid: lock stateid; dereferenced without a NULL check
+ * @cmd:         lock command, decoded on output with show_lock_cmd()
+ * @error:       a negative value is stored as its magnitude, anything else
+ *               as 0
+ */
+TRACE_EVENT(nfs4_set_lock,
+ TP_PROTO(
+ const struct file_lock *request,
+ const struct nfs4_state *state,
+ const nfs4_stateid *lockstateid,
+ int cmd,
+ int error
+ ),
+
+ TP_ARGS(request, state, lockstateid, cmd, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(int, cmd)
+ __field(char, type)
+ __field(loff_t, start)
+ __field(loff_t, end)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, lockstateid_seq)
+ __field(u32, lockstateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->error = error < 0 ? -error : 0;
+ __entry->cmd = cmd;
+ __entry->type = request->fl_type;
+ __entry->start = request->fl_start;
+ __entry->end = request->fl_end;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ __entry->lockstateid_seq =
+ be32_to_cpu(lockstateid->seqid);
+ __entry->lockstateid_hash =
+ nfs_stateid_hash(lockstateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) cmd=%s:%s range=%lld:%lld "
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x lockstateid=%d:0x%08x",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ show_lock_cmd(__entry->cmd),
+ show_lock_type(__entry->type),
+ (long long)__entry->start,
+ (long long)__entry->end,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->lockstateid_seq, __entry->lockstateid_hash
+ )
+);
+
+TRACE_DEFINE_ENUM(LK_STATE_IN_USE); /* one TRACE_DEFINE_ENUM per value named in show_nfs4_state_flags() below */
+TRACE_DEFINE_ENUM(NFS_DELEGATED_STATE);
+TRACE_DEFINE_ENUM(NFS_OPEN_STATE);
+TRACE_DEFINE_ENUM(NFS_O_RDONLY_STATE);
+TRACE_DEFINE_ENUM(NFS_O_WRONLY_STATE);
+TRACE_DEFINE_ENUM(NFS_O_RDWR_STATE);
+TRACE_DEFINE_ENUM(NFS_STATE_RECLAIM_REBOOT);
+TRACE_DEFINE_ENUM(NFS_STATE_RECLAIM_NOGRACE);
+TRACE_DEFINE_ENUM(NFS_STATE_POSIX_LOCKS);
+TRACE_DEFINE_ENUM(NFS_STATE_RECOVERY_FAILED);
+TRACE_DEFINE_ENUM(NFS_STATE_MAY_NOTIFY_LOCK);
+TRACE_DEFINE_ENUM(NFS_STATE_CHANGE_WAIT);
+TRACE_DEFINE_ENUM(NFS_CLNT_DST_SSC_COPY_STATE);
+TRACE_DEFINE_ENUM(NFS_CLNT_SRC_SSC_COPY_STATE);
+TRACE_DEFINE_ENUM(NFS_SRV_SSC_COPY_STATE);
+
+#define show_nfs4_state_flags(flags) \
+ __print_flags(flags, "|", \
+ { LK_STATE_IN_USE, "IN_USE" }, \
+ { NFS_DELEGATED_STATE, "DELEGATED" }, \
+ { NFS_OPEN_STATE, "OPEN" }, \
+ { NFS_O_RDONLY_STATE, "O_RDONLY" }, \
+ { NFS_O_WRONLY_STATE, "O_WRONLY" }, \
+ { NFS_O_RDWR_STATE, "O_RDWR" }, \
+ { NFS_STATE_RECLAIM_REBOOT, "RECLAIM_REBOOT" }, \
+ { NFS_STATE_RECLAIM_NOGRACE, "RECLAIM_NOGRACE" }, \
+ { NFS_STATE_POSIX_LOCKS, "POSIX_LOCKS" }, \
+ { NFS_STATE_RECOVERY_FAILED, "RECOVERY_FAILED" }, \
+ { NFS_STATE_MAY_NOTIFY_LOCK, "MAY_NOTIFY_LOCK" }, \
+ { NFS_STATE_CHANGE_WAIT, "CHANGE_WAIT" }, \
+ { NFS_CLNT_DST_SSC_COPY_STATE, "CLNT_DST_SSC_COPY" }, \
+ { NFS_CLNT_SRC_SSC_COPY_STATE, "CLNT_SRC_SSC_COPY" }, \
+ { NFS_SRV_SSC_COPY_STATE, "SRV_SSC_COPY" }) /* passes the flags word to __print_flags with "|" as the delimiter */
+
+#define show_nfs4_lock_flags(flags) \
+ __print_flags(flags, "|", \
+ { BIT(NFS_LOCK_INITIALIZED), "INITIALIZED" }, \
+ { BIT(NFS_LOCK_LOST), "LOST" }) /* NFS_LOCK_* are wrapped in BIT() here; the state-flag table lists its values directly */
+
+TRACE_EVENT(nfs4_state_lock_reclaim, /* stand-alone event: file identity, open stateid, and the flag words of the state and the lock */
+ TP_PROTO(
+ const struct nfs4_state *state,
+ const struct nfs4_lock_state *lock
+ ),
+
+ TP_ARGS(state, lock),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned long, state_flags)
+ __field(unsigned long, lock_flags)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->state_flags = state->flags; /* decoded by show_nfs4_state_flags() at print time */
+ __entry->lock_flags = lock->ls_flags; /* decoded by show_nfs4_lock_flags() at print time */
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x state_flags=%s lock_flags=%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid, __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ show_nfs4_state_flags(__entry->state_flags),
+ show_nfs4_lock_flags(__entry->lock_flags)
+ )
+); /* terminated with ';' like every other event definition in this header */
+
+DECLARE_EVENT_CLASS(nfs4_set_delegation_event, /* event class: inode identity plus the fmode passed by the caller */
+ TP_PROTO(
+ const struct inode *inode,
+ fmode_t fmode
+ ),
+
+ TP_ARGS(inode, fmode),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned int, fmode)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->fmode = (__force unsigned int)fmode; /* fmode_t is stored as a plain unsigned int via a __force cast */
+ ),
+
+ TP_printk(
+ "fmode=%s fileid=%02x:%02x:%llu fhandle=0x%08x",
+ show_fmode_flags(__entry->fmode),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle
+ )
+);
+#define DEFINE_NFS4_SET_DELEGATION_EVENT(name) \
+ DEFINE_EVENT(nfs4_set_delegation_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ fmode_t fmode \
+ ), \
+ TP_ARGS(inode, fmode)) /* defines event "name" on the nfs4_set_delegation_event class */
+DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_set_delegation); /* events sharing the nfs4_set_delegation_event layout */
+DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_reclaim_delegation);
+
+TRACE_EVENT(nfs4_delegreturn_exit, /* stand-alone event built from the DELEGRETURN args/res structures; no inode, so no fileid is recorded */
+ TP_PROTO(
+ const struct nfs4_delegreturnargs *args,
+ const struct nfs4_delegreturnres *res,
+ int error
+ ),
+
+ TP_ARGS(args, res, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(unsigned long, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = res->server->s_dev; /* device comes from the result's server, not from an inode */
+ __entry->fhandle = nfs_fhandle_hash(args->fhandle);
+ __entry->error = error < 0 ? -error : 0; /* keep the magnitude of a negative code; anything else is recorded as 0 */
+ __entry->stateid_seq =
+ be32_to_cpu(args->stateid->seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(args->stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) dev=%02x:%02x fhandle=0x%08x "
+ "stateid=%d:0x%08x",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
+#ifdef CONFIG_NFS_V4_1
+DECLARE_EVENT_CLASS(nfs4_test_stateid_event, /* event class: file identity, open stateid and normalized error */
+ TP_PROTO(
+ const struct nfs4_state *state,
+ const struct nfs4_lock_state *lsp,
+ int error
+ ),
+
+ TP_ARGS(state, lsp, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode; /* note: lsp is accepted but not read anywhere in this assignment block */
+
+ __entry->error = error < 0 ? -error : 0;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
+#define DEFINE_NFS4_TEST_STATEID_EVENT(name) \
+ DEFINE_EVENT(nfs4_test_stateid_event, name, \
+ TP_PROTO( \
+ const struct nfs4_state *state, \
+ const struct nfs4_lock_state *lsp, \
+ int error \
+ ), \
+ TP_ARGS(state, lsp, error)) /* defines event "name" on the nfs4_test_stateid_event class */
+DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_delegation_stateid); /* three events, all with the same record layout */
+DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_open_stateid);
+DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_lock_stateid);
+#endif /* CONFIG_NFS_V4_1 */
+
+DECLARE_EVENT_CLASS(nfs4_lookup_event,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct qstr *name,
+ int error
+ ),
+
+ TP_ARGS(dir, name, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, error)
+ __field(u64, dir)
+ __string(name, name->name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->error = -error;
+ __assign_str(name, name->name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) name=%02x:%02x:%llu/%s",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+#define DEFINE_NFS4_LOOKUP_EVENT(name) \
+ DEFINE_EVENT(nfs4_lookup_event, name, \
+ TP_PROTO( \
+ const struct inode *dir, \
+ const struct qstr *name, \
+ int error \
+ ), \
+ TP_ARGS(dir, name, error)) /* defines event "name" on the nfs4_lookup_event class */
+
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_lookup); /* all of these record (directory, name, error) */
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_symlink);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_mkdir);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_mknod);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_remove);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_get_fs_locations);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_secinfo);
+
+TRACE_EVENT(nfs4_lookupp, /* stand-alone event: device and fileid of the inode passed in, plus normalized error */
+ TP_PROTO(
+ const struct inode *inode,
+ int error
+ ),
+
+ TP_ARGS(inode, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, ino)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = NFS_FILEID(inode); /* the field is called "ino" but holds the NFS fileid */
+ __entry->error = error < 0 ? -error : 0;
+ ),
+
+ TP_printk(
+ "error=%ld (%s) inode=%02x:%02x:%llu",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->ino
+ )
+);
+
+TRACE_EVENT(nfs4_rename, /* stand-alone event: old and new (directory fileid, name) pairs plus normalized error */
+ TP_PROTO(
+ const struct inode *olddir,
+ const struct qstr *oldname,
+ const struct inode *newdir,
+ const struct qstr *newname,
+ int error
+ ),
+
+ TP_ARGS(olddir, oldname, newdir, newname, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, error)
+ __field(u64, olddir)
+ __string(oldname, oldname->name)
+ __field(u64, newdir)
+ __string(newname, newname->name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = olddir->i_sb->s_dev; /* only olddir's device is recorded; it is printed for both names below */
+ __entry->olddir = NFS_FILEID(olddir);
+ __entry->newdir = NFS_FILEID(newdir);
+ __entry->error = error < 0 ? -error : 0;
+ __assign_str(oldname, oldname->name);
+ __assign_str(newname, newname->name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) oldname=%02x:%02x:%llu/%s "
+ "newname=%02x:%02x:%llu/%s",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->olddir,
+ __get_str(oldname),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->newdir,
+ __get_str(newname)
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs4_inode_event, /* event class: inode identity (dev, fileid, filehandle hash) and normalized error */
+ TP_PROTO(
+ const struct inode *inode,
+ int error
+ ),
+
+ TP_ARGS(inode, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->error = error < 0 ? -error : 0; /* keep the magnitude of a negative code; anything else is recorded as 0 */
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle
+ )
+);
+
+#define DEFINE_NFS4_INODE_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ int error \
+ ), \
+ TP_ARGS(inode, error)) /* defines event "name" on the nfs4_inode_event class */
+
+DEFINE_NFS4_INODE_EVENT(nfs4_access); /* all of these record (inode, error) */
+DEFINE_NFS4_INODE_EVENT(nfs4_readlink);
+DEFINE_NFS4_INODE_EVENT(nfs4_readdir);
+DEFINE_NFS4_INODE_EVENT(nfs4_get_acl);
+DEFINE_NFS4_INODE_EVENT(nfs4_set_acl);
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label); /* only defined when security labels are configured */
+DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label);
+#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
+
+DECLARE_EVENT_CLASS(nfs4_inode_stateid_event, /* event class: inode identity, a caller-supplied stateid, and normalized error */
+ TP_PROTO(
+ const struct inode *inode,
+ const nfs4_stateid *stateid,
+ int error
+ ),
+
+ TP_ARGS(inode, stateid, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned long, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->error = error < 0 ? -error : 0;
+ __entry->stateid_seq =
+ be32_to_cpu(stateid->seqid); /* stateid is dereferenced unconditionally */
+ __entry->stateid_hash =
+ nfs_stateid_hash(stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
+#define DEFINE_NFS4_INODE_STATEID_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_stateid_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ const nfs4_stateid *stateid, \
+ int error \
+ ), \
+ TP_ARGS(inode, stateid, error)) /* defines event "name" on the nfs4_inode_stateid_event class */
+
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr); /* all of these record (inode, stateid, error) */
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update_wait);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_close_stateid_update_wait);
+
+DECLARE_EVENT_CLASS(nfs4_getattr_event, /* event class keyed by server + filehandle + returned attributes; takes no inode */
+ TP_PROTO(
+ const struct nfs_server *server,
+ const struct nfs_fh *fhandle,
+ const struct nfs_fattr *fattr,
+ int error
+ ),
+
+ TP_ARGS(server, fhandle, fattr, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned int, valid)
+ __field(unsigned long, error)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = server->s_dev;
+ __entry->valid = fattr->valid;
+ __entry->fhandle = nfs_fhandle_hash(fhandle);
+ __entry->fileid = (fattr->valid & NFS_ATTR_FATTR_FILEID) ? fattr->fileid : 0; /* fileid is recorded only when fattr->valid marks it present, else 0 */
+ __entry->error = error < 0 ? -error : 0;
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "valid=%s",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ show_nfs_fattr_flags(__entry->valid)
+ )
+);
+
+#define DEFINE_NFS4_GETATTR_EVENT(name) \
+ DEFINE_EVENT(nfs4_getattr_event, name, \
+ TP_PROTO( \
+ const struct nfs_server *server, \
+ const struct nfs_fh *fhandle, \
+ const struct nfs_fattr *fattr, \
+ int error \
+ ), \
+ TP_ARGS(server, fhandle, fattr, error)) /* defines event "name" on the nfs4_getattr_event class */
+DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr); /* all of these record (server, fhandle, fattr, error) */
+DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root);
+DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo);
+
+DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const struct nfs_fh *fhandle,
+ const struct inode *inode,
+ int error
+ ),
+
+ TP_ARGS(clp, fhandle, inode, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __string(dstaddr, clp ? clp->cl_hostname : "unknown")
+ ),
+
+ TP_fast_assign(
+ __entry->error = error < 0 ? -error : 0;
+ __entry->fhandle = nfs_fhandle_hash(fhandle);
+ if (!IS_ERR_OR_NULL(inode)) {
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ } else {
+ __entry->fileid = 0;
+ __entry->dev = 0;
+ }
+ __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown")
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "dstaddr=%s",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __get_str(dstaddr)
+ )
+);
+
+#define DEFINE_NFS4_INODE_CALLBACK_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_callback_event, name, \
+ TP_PROTO( \
+ const struct nfs_client *clp, \
+ const struct nfs_fh *fhandle, \
+ const struct inode *inode, \
+ int error \
+ ), \
+ TP_ARGS(clp, fhandle, inode, error)) /* defines event "name" on the nfs4_inode_callback_event class */
+DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr); /* the only event of this class defined here */
+
+DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, /* event class for callbacks that also carry a stateid */
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const struct nfs_fh *fhandle,
+ const struct inode *inode,
+ const nfs4_stateid *stateid,
+ int error
+ ),
+
+ TP_ARGS(clp, fhandle, inode, stateid, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __string(dstaddr, clp ? clp->cl_hostname : "unknown")
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error < 0 ? -error : 0;
+ __entry->fhandle = nfs_fhandle_hash(fhandle);
+ if (!IS_ERR_OR_NULL(inode)) { /* inode may be NULL or an ERR_PTR; identity fields are zeroed in that case */
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ } else {
+ __entry->fileid = 0;
+ __entry->dev = 0;
+ }
+ __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown"); /* explicit ';' so the next statement does not depend on the macro's expansion */
+ __entry->stateid_seq =
+ be32_to_cpu(stateid->seqid); /* unlike inode and clp, stateid is dereferenced unconditionally */
+ __entry->stateid_hash =
+ nfs_stateid_hash(stateid);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x dstaddr=%s",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __get_str(dstaddr)
+ )
+);
+
+#define DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_stateid_callback_event, name, \
+ TP_PROTO( \
+ const struct nfs_client *clp, \
+ const struct nfs_fh *fhandle, \
+ const struct inode *inode, \
+ const nfs4_stateid *stateid, \
+ int error \
+ ), \
+ TP_ARGS(clp, fhandle, inode, stateid, error)) /* defines event "name" on the nfs4_inode_stateid_callback_event class */
+DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall); /* both record (clp, fhandle, inode, stateid, error) */
+DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file);
+
+DECLARE_EVENT_CLASS(nfs4_idmap_event,
+ TP_PROTO(
+ const char *name,
+ int len,
+ u32 id,
+ int error
+ ),
+
+ TP_ARGS(name, len, id, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(u32, id)
+ __dynamic_array(char, name, len > 0 ? len + 1 : 1)
+ ),
+
+ TP_fast_assign(
+ if (len < 0)
+ len = 0;
+ __entry->error = error < 0 ? error : 0;
+ __entry->id = id;
+ memcpy(__get_str(name), name, len);
+ __get_str(name)[len] = 0;
+ ),
+
+ TP_printk(
+ "error=%ld (%s) id=%u name=%s",
+ -__entry->error, show_nfsv4_errors(__entry->error),
+ __entry->id,
+ __get_str(name)
+ )
+);
+#define DEFINE_NFS4_IDMAP_EVENT(name) \
+ DEFINE_EVENT(nfs4_idmap_event, name, \
+ TP_PROTO( \
+ const char *name, \
+ int len, \
+ u32 id, \
+ int error \
+ ), \
+ TP_ARGS(name, len, id, error)) /* the macro parameter "name" is also the prototype's parameter name, so it is substituted there too */
+DEFINE_NFS4_IDMAP_EVENT(nfs4_map_name_to_uid); /* all of these record (name, len, id, error) */
+DEFINE_NFS4_IDMAP_EVENT(nfs4_map_group_to_gid);
+DEFINE_NFS4_IDMAP_EVENT(nfs4_map_uid_to_name);
+DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);
+
+#ifdef CONFIG_NFS_V4_1
+#define NFS4_LSEG_LAYOUT_STATEID_HASH(lseg) \
+ ((lseg) ? nfs_stateid_hash(&(lseg)->pls_layout->plh_stateid) : 0) /* hash of the layout stateid behind lseg, or 0 for a NULL lseg; argument parenthesized so any expression is safe */
+#else
+#define NFS4_LSEG_LAYOUT_STATEID_HASH(lseg) (0) /* always 0 when CONFIG_NFS_V4_1 is not set */
+#endif /* CONFIG_NFS_V4_1 */
+
+DECLARE_EVENT_CLASS(nfs4_read_event, /* event class built from an nfs_pgio_header: file identity, I/O range, open stateid, layout stateid */
+ TP_PROTO(
+ const struct nfs_pgio_header *hdr,
+ int error
+ ),
+
+ TP_ARGS(hdr, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, arg_count)
+ __field(u32, res_count)
+ __field(unsigned long, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh; /* prefer the filehandle in the args; fall back to the inode's own */
+ const struct nfs4_state *state =
+ hdr->args.context->state; /* dereferenced below without a NULL check */
+ const struct pnfs_layout_segment *lseg = hdr->lseg;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->offset = hdr->args.offset;
+ __entry->arg_count = hdr->args.count;
+ __entry->res_count = hdr->res.count;
+ __entry->error = error < 0 ? -error : 0;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ __entry->layoutstateid_seq = lseg ? lseg->pls_seq : 0; /* lseg may be NULL; both layout fields are then 0 */
+ __entry->layoutstateid_hash =
+ NFS4_LSEG_LAYOUT_STATEID_HASH(lseg);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u stateid=%d:0x%08x "
+ "layoutstateid=%d:0x%08x",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset,
+ __entry->arg_count, __entry->res_count,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
+ )
+);
+#define DEFINE_NFS4_READ_EVENT(name) \
+ DEFINE_EVENT(nfs4_read_event, name, \
+ TP_PROTO( \
+ const struct nfs_pgio_header *hdr, \
+ int error \
+ ), \
+ TP_ARGS(hdr, error)) /* defines event "name" on the nfs4_read_event class */
+DEFINE_NFS4_READ_EVENT(nfs4_read);
+#ifdef CONFIG_NFS_V4_1
+DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read); /* same layout as nfs4_read; only defined under CONFIG_NFS_V4_1 */
+#endif /* CONFIG_NFS_V4_1 */
+
+DECLARE_EVENT_CLASS(nfs4_write_event, /* event class with the same fields and assignments as nfs4_read_event, declared separately */
+ TP_PROTO(
+ const struct nfs_pgio_header *hdr,
+ int error
+ ),
+
+ TP_ARGS(hdr, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, arg_count)
+ __field(u32, res_count)
+ __field(unsigned long, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh; /* prefer the filehandle in the args; fall back to the inode's own */
+ const struct nfs4_state *state =
+ hdr->args.context->state; /* dereferenced below without a NULL check */
+ const struct pnfs_layout_segment *lseg = hdr->lseg;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->offset = hdr->args.offset;
+ __entry->arg_count = hdr->args.count;
+ __entry->res_count = hdr->res.count;
+ __entry->error = error < 0 ? -error : 0;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ __entry->layoutstateid_seq = lseg ? lseg->pls_seq : 0; /* lseg may be NULL; both layout fields are then 0 */
+ __entry->layoutstateid_hash =
+ NFS4_LSEG_LAYOUT_STATEID_HASH(lseg);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u stateid=%d:0x%08x "
+ "layoutstateid=%d:0x%08x",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset,
+ __entry->arg_count, __entry->res_count,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
+ )
+);
+
+#define DEFINE_NFS4_WRITE_EVENT(name) \
+ DEFINE_EVENT(nfs4_write_event, name, \
+ TP_PROTO( \
+ const struct nfs_pgio_header *hdr, \
+ int error \
+ ), \
+ TP_ARGS(hdr, error)) /* defines event "name" on the nfs4_write_event class */
+DEFINE_NFS4_WRITE_EVENT(nfs4_write);
+#ifdef CONFIG_NFS_V4_1
+DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write); /* same layout as nfs4_write; only defined under CONFIG_NFS_V4_1 */
+#endif /* CONFIG_NFS_V4_1 */
+
+DECLARE_EVENT_CLASS(nfs4_commit_event, /* event class built from an nfs_commit_data: file identity, range, layout stateid; no open stateid */
+ TP_PROTO(
+ const struct nfs_commit_data *data,
+ int error
+ ),
+
+ TP_ARGS(data, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned long, error)
+ __field(loff_t, offset)
+ __field(u32, count)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = data->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = data->args.fh ?
+ data->args.fh : &nfsi->fh; /* prefer the filehandle in the args; fall back to the inode's own */
+ const struct pnfs_layout_segment *lseg = data->lseg;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->offset = data->args.offset;
+ __entry->count = data->args.count;
+ __entry->error = error < 0 ? -error : 0;
+ __entry->layoutstateid_seq = lseg ? lseg->pls_seq : 0; /* lseg may be NULL; both layout fields are then 0 */
+ __entry->layoutstateid_hash =
+ NFS4_LSEG_LAYOUT_STATEID_HASH(lseg);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u layoutstateid=%d:0x%08x",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset,
+ __entry->count,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
+ )
+);
+#define DEFINE_NFS4_COMMIT_EVENT(name) \
+ DEFINE_EVENT(nfs4_commit_event, name, \
+ TP_PROTO( \
+ const struct nfs_commit_data *data, \
+ int error \
+ ), \
+ TP_ARGS(data, error)) /* defines event "name" on the nfs4_commit_event class */
+DEFINE_NFS4_COMMIT_EVENT(nfs4_commit);
+#ifdef CONFIG_NFS_V4_1
+DEFINE_NFS4_COMMIT_EVENT(nfs4_pnfs_commit_ds); /* first definition inside a CONFIG_NFS_V4_1 region that stays open past this line */
+
+TRACE_DEFINE_ENUM(IOMODE_READ); /* one TRACE_DEFINE_ENUM per value named in show_pnfs_iomode() below */
+TRACE_DEFINE_ENUM(IOMODE_RW);
+TRACE_DEFINE_ENUM(IOMODE_ANY);
+
+#define show_pnfs_iomode(iomode) \
+ __print_symbolic(iomode, \
+ { IOMODE_READ, "READ" }, \
+ { IOMODE_RW, "RW" }, \
+ { IOMODE_ANY, "ANY" }) /* passes the iomode value to __print_symbolic with a three-entry name table */
+
+TRACE_EVENT(nfs4_layoutget, /* stand-alone event: requested layout range, open stateid, and (on success) the layout stateid */
+ TP_PROTO(
+ const struct nfs_open_context *ctx,
+ const struct pnfs_layout_range *args,
+ const struct pnfs_layout_range *res,
+ const nfs4_stateid *layout_stateid,
+ int error
+ ),
+
+ TP_ARGS(ctx, args, res, layout_stateid, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u32, iomode)
+ __field(u64, offset)
+ __field(u64, count)
+ __field(unsigned long, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = d_inode(ctx->dentry);
+ const struct nfs4_state *state = ctx->state;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->iomode = args->iomode; /* range fields come from args (the request); res is not read here */
+ __entry->offset = args->offset;
+ __entry->count = args->length;
+ __entry->error = error < 0 ? -error : 0;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ if (!error) { /* layout_stateid is only dereferenced when error == 0 */
+ __entry->layoutstateid_seq =
+ be32_to_cpu(layout_stateid->seqid);
+ __entry->layoutstateid_hash =
+ nfs_stateid_hash(layout_stateid);
+ } else {
+ __entry->layoutstateid_seq = 0;
+ __entry->layoutstateid_hash = 0;
+ }
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "iomode=%s offset=%llu count=%llu stateid=%d:0x%08x "
+ "layoutstateid=%d:0x%08x",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ show_pnfs_iomode(__entry->iomode),
+ (unsigned long long)__entry->offset,
+ (unsigned long long)__entry->count,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
+ )
+);
+
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutcommit); /* layout events that reuse the nfs4_inode_stateid_event class (inode, stateid, error) */
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn_on_close);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layouterror);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutstats);
+
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_UNKNOWN); /* one TRACE_DEFINE_ENUM per value named in show_pnfs_update_layout_reason() below */
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_NO_PNFS);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_MDSTHRESH);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_NOMEM);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_BULK_RECALL);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_FOUND_CACHED);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RETURN);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_BLOCKED);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_INVALID_OPEN);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RETRY);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_EXIT);
+
+#define show_pnfs_update_layout_reason(reason) \
+ __print_symbolic(reason, \
+ { PNFS_UPDATE_LAYOUT_UNKNOWN, "unknown" }, \
+ { PNFS_UPDATE_LAYOUT_NO_PNFS, "no pnfs" }, \
+ { PNFS_UPDATE_LAYOUT_RD_ZEROLEN, "read+zerolen" }, \
+ { PNFS_UPDATE_LAYOUT_MDSTHRESH, "mdsthresh" }, \
+ { PNFS_UPDATE_LAYOUT_NOMEM, "nomem" }, \
+ { PNFS_UPDATE_LAYOUT_BULK_RECALL, "bulk recall" }, \
+ { PNFS_UPDATE_LAYOUT_IO_TEST_FAIL, "io test fail" }, \
+ { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \
+ { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \
+ { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \
+ { PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" }, \
+ { PNFS_UPDATE_LAYOUT_RETRY, "retrying" }, \
+ { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }, \
+ { PNFS_UPDATE_LAYOUT_EXIT, "exit" }) /* passes the reason value to __print_symbolic with this name table */
+
+TRACE_EVENT(pnfs_update_layout, /* stand-alone event: I/O range, iomode, layout stateid, lseg pointer and a reason code; has no error field */
+ TP_PROTO(struct inode *inode,
+ loff_t pos,
+ u64 count,
+ enum pnfs_iomode iomode,
+ struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ enum pnfs_update_layout_reason reason
+ ),
+ TP_ARGS(inode, pos, count, iomode, lo, lseg, reason),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u32, fhandle)
+ __field(loff_t, pos)
+ __field(u64, count)
+ __field(enum pnfs_iomode, iomode)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
+ __field(long, lseg)
+ __field(enum pnfs_update_layout_reason, reason)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->pos = pos;
+ __entry->count = count;
+ __entry->iomode = iomode;
+ __entry->reason = reason;
+ if (lo != NULL) { /* lo may be NULL; the layout stateid fields are then 0 */
+ __entry->layoutstateid_seq =
+ be32_to_cpu(lo->plh_stateid.seqid);
+ __entry->layoutstateid_hash =
+ nfs_stateid_hash(&lo->plh_stateid);
+ } else {
+ __entry->layoutstateid_seq = 0;
+ __entry->layoutstateid_hash = 0;
+ }
+ __entry->lseg = (long)lseg; /* only the pointer value is recorded; lseg is never dereferenced */
+ ),
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "iomode=%s pos=%llu count=%llu "
+ "layoutstateid=%d:0x%08x lseg=0x%lx (%s)",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ show_pnfs_iomode(__entry->iomode),
+ (unsigned long long)__entry->pos,
+ (unsigned long long)__entry->count,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash,
+ __entry->lseg,
+ show_pnfs_update_layout_reason(__entry->reason)
+ )
+);
+
+DECLARE_EVENT_CLASS(pnfs_layout_event, /* event class: same record as pnfs_update_layout minus the reason field */
+ TP_PROTO(struct inode *inode,
+ loff_t pos,
+ u64 count,
+ enum pnfs_iomode iomode,
+ struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg
+ ),
+ TP_ARGS(inode, pos, count, iomode, lo, lseg),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u32, fhandle)
+ __field(loff_t, pos)
+ __field(u64, count)
+ __field(enum pnfs_iomode, iomode)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
+ __field(long, lseg)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->pos = pos;
+ __entry->count = count;
+ __entry->iomode = iomode;
+ if (lo != NULL) { /* lo may be NULL; the layout stateid fields are then 0 */
+ __entry->layoutstateid_seq =
+ be32_to_cpu(lo->plh_stateid.seqid);
+ __entry->layoutstateid_hash =
+ nfs_stateid_hash(&lo->plh_stateid);
+ } else {
+ __entry->layoutstateid_seq = 0;
+ __entry->layoutstateid_hash = 0;
+ }
+ __entry->lseg = (long)lseg; /* only the pointer value is recorded; lseg is never dereferenced */
+ ),
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "iomode=%s pos=%llu count=%llu "
+ "layoutstateid=%d:0x%08x lseg=0x%lx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ show_pnfs_iomode(__entry->iomode),
+ (unsigned long long)__entry->pos,
+ (unsigned long long)__entry->count,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash,
+ __entry->lseg
+ )
+);
+
+#define DEFINE_PNFS_LAYOUT_EVENT(name) \
+ DEFINE_EVENT(pnfs_layout_event, name, \
+ TP_PROTO(struct inode *inode, \
+ loff_t pos, \
+ u64 count, \
+ enum pnfs_iomode iomode, \
+ struct pnfs_layout_hdr *lo, \
+ struct pnfs_layout_segment *lseg \
+ ), \
+ TP_ARGS(inode, pos, count, iomode, lo, lseg)) /* defines event "name" on the pnfs_layout_event class */
+
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_init_read); /* all of these share the pnfs_layout_event record layout */
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_init_write);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_get_mirror_count);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_done);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_done);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_pagelist);
+DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_pagelist);
+
+DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, /* event class for flexfiles I/O errors: op status, file identity, range, args stateid, data-server address */
+ TP_PROTO(
+ const struct nfs_pgio_header *hdr
+ ),
+
+ TP_ARGS(hdr),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, count)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __string(dstaddr, hdr->ds_clp ?
+ rpc_peeraddr2str(hdr->ds_clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown")
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+
+ __entry->error = hdr->res.op_status; /* taken straight from res.op_status; there is no errno-style argument to normalize */
+ __entry->fhandle = nfs_fhandle_hash(hdr->args.fh);
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->offset = hdr->args.offset;
+ __entry->count = hdr->args.count;
+ __entry->stateid_seq =
+ be32_to_cpu(hdr->args.stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&hdr->args.stateid);
+ __assign_str(dstaddr, hdr->ds_clp ?
+ rpc_peeraddr2str(hdr->ds_clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown"); /* must be the same expression as in __string() above */
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (unsigned long long)__entry->offset, __entry->count, /* loff_t cast to match %llu, as the other events in this header do */
+ __entry->stateid_seq, __entry->stateid_hash,
+ __get_str(dstaddr)
+ )
+);
+
+#define DEFINE_NFS4_FLEXFILES_IO_EVENT(name) \
+ DEFINE_EVENT(nfs4_flexfiles_io_event, name, \
+ TP_PROTO( \
+ const struct nfs_pgio_header *hdr \
+ ), \
+ TP_ARGS(hdr)) /* defines event "name" on the nfs4_flexfiles_io_event class */
+DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_read_error); /* both take only the nfs_pgio_header */
+DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_write_error);
+
+TRACE_EVENT(ff_layout_commit_error, /* stand-alone event for flexfiles commit errors: op status, file identity, range, data-server address */
+ TP_PROTO(
+ const struct nfs_commit_data *data
+ ),
+
+ TP_ARGS(data),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, count)
+ __string(dstaddr, data->ds_clp ?
+ rpc_peeraddr2str(data->ds_clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown")
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = data->inode;
+
+ __entry->error = data->res.op_status; /* taken straight from res.op_status; there is no errno-style argument to normalize */
+ __entry->fhandle = nfs_fhandle_hash(data->args.fh);
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->offset = data->args.offset;
+ __entry->count = data->args.count;
+ __assign_str(dstaddr, data->ds_clp ?
+ rpc_peeraddr2str(data->ds_clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown"); /* must be the same expression as in __string() above */
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%llu count=%u dstaddr=%s",
+ -__entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (unsigned long long)__entry->offset, __entry->count, /* loff_t cast to match %llu, as the other events in this header do */
+ __get_str(dstaddr)
+ )
+);
+
+
+#endif /* CONFIG_NFS_V4_1 */
+
+#endif /* _TRACE_NFS4_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE nfs4trace
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
new file mode 100644
index 000000000..f1e599553
--- /dev/null
+++ b/fs/nfs/nfs4xdr.c
@@ -0,0 +1,7634 @@
+/*
+ * fs/nfs/nfs4xdr.c
+ *
+ * Client-side XDR for NFSv4.
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Kendrick Smith <kmsmith@umich.edu>
+ * Andy Adamson <andros@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/param.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/kdev_t.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/msg_prot.h>
+#include <linux/sunrpc/gss_api.h>
+#include <linux/nfs.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+
+#include "nfs4_fs.h"
+#include "nfs4trace.h"
+#include "internal.h"
+#include "nfs4idmap.h"
+#include "nfs4session.h"
+#include "pnfs.h"
+#include "netns.h"
+
+#define NFSDBG_FACILITY NFSDBG_XDR
+
+/* Mapping from NFS error code to "errno" error code. */
+#define errno_NFSERR_IO EIO
+
+struct compound_hdr;
+static int nfs4_stat_to_errno(int);
+static void encode_layoutget(struct xdr_stream *xdr,
+ const struct nfs4_layoutget_args *args,
+ struct compound_hdr *hdr);
+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
+ struct nfs4_layoutget_res *res);
+
+/* NFSv4 COMPOUND tags are only wanted for debugging purposes */
+#ifdef DEBUG
+#define NFS4_MAXTAGLEN 20
+#else
+#define NFS4_MAXTAGLEN 0
+#endif
+
+/* lock,open owner id:
+ * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
+ */
+#define open_owner_id_maxsz (1 + 2 + 1 + 1 + 2)
+#define lock_owner_id_maxsz (1 + 1 + 4)
+#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
+#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
+#define op_encode_hdr_maxsz (1)
+#define op_decode_hdr_maxsz (2)
+#define encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define decode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define decode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define encode_putfh_maxsz (op_encode_hdr_maxsz + 1 + \
+ (NFS4_FHSIZE >> 2))
+#define decode_putfh_maxsz (op_decode_hdr_maxsz)
+#define encode_putrootfh_maxsz (op_encode_hdr_maxsz)
+#define decode_putrootfh_maxsz (op_decode_hdr_maxsz)
+#define encode_getfh_maxsz (op_encode_hdr_maxsz)
+#define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \
+ ((3+NFS4_FHSIZE) >> 2))
+#define nfs4_fattr_bitmap_maxsz 4
+#define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
+#define nfstime4_maxsz (3)
+#define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2))
+#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
+#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */
+#define nfs4_label_maxsz (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN))
+#else
+#define nfs4_label_maxsz 0
+#endif
+/* We support only one layout type per file system */
+#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
+/* This is based on getfattr, which uses the most attributes: */
+#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
+ 3*nfstime4_maxsz + \
+ nfs4_owner_maxsz + \
+ nfs4_group_maxsz + nfs4_label_maxsz + \
+ decode_mdsthreshold_maxsz))
+#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \
+ nfs4_fattr_value_maxsz)
+#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
+#define encode_attrs_maxsz (nfs4_fattr_bitmap_maxsz + \
+ 1 + 2 + 1 + \
+ nfs4_owner_maxsz + \
+ nfs4_group_maxsz + \
+ nfs4_label_maxsz + \
+ 1 + nfstime4_maxsz + \
+ 1 + nfstime4_maxsz)
+#define encode_savefh_maxsz (op_encode_hdr_maxsz)
+#define decode_savefh_maxsz (op_decode_hdr_maxsz)
+#define encode_restorefh_maxsz (op_encode_hdr_maxsz)
+#define decode_restorefh_maxsz (op_decode_hdr_maxsz)
+#define encode_fsinfo_maxsz (encode_getattr_maxsz)
+/* The 5 accounts for the PNFS attributes, and assumes that at most three
+ * layout types will be returned.
+ */
+#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \
+ nfs4_fattr_bitmap_maxsz + 4 + 8 + 5)
+#define encode_renew_maxsz (op_encode_hdr_maxsz + 3)
+#define decode_renew_maxsz (op_decode_hdr_maxsz)
+#define encode_setclientid_maxsz \
+ (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
+ /* client name */ \
+ 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+ 1 /* sc_prog */ + \
+ 1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
+ 1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \
+ 1) /* sc_cb_ident */
+#define decode_setclientid_maxsz \
+ (op_decode_hdr_maxsz + \
+ 2 /* clientid */ + \
+ XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
+ 1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
+ 1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN))
+#define encode_setclientid_confirm_maxsz \
+ (op_encode_hdr_maxsz + \
+ 3 + (NFS4_VERIFIER_SIZE >> 2))
+#define decode_setclientid_confirm_maxsz \
+ (op_decode_hdr_maxsz)
+#define encode_lookup_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz)
+#define decode_lookup_maxsz (op_decode_hdr_maxsz)
+#define encode_lookupp_maxsz (op_encode_hdr_maxsz)
+#define decode_lookupp_maxsz (op_decode_hdr_maxsz)
+#define encode_share_access_maxsz \
+ (2)
+#define encode_createmode_maxsz (1 + encode_attrs_maxsz + encode_verifier_maxsz)
+#define encode_opentype_maxsz (1 + encode_createmode_maxsz)
+#define encode_claim_null_maxsz (1 + nfs4_name_maxsz)
+#define encode_open_maxsz (op_encode_hdr_maxsz + \
+ 2 + encode_share_access_maxsz + 2 + \
+ open_owner_id_maxsz + \
+ encode_opentype_maxsz + \
+ encode_claim_null_maxsz)
+#define decode_space_limit_maxsz (3)
+#define decode_ace_maxsz (3 + nfs4_owner_maxsz)
+#define decode_delegation_maxsz (1 + decode_stateid_maxsz + 1 + \
+ decode_space_limit_maxsz + \
+ decode_ace_maxsz)
+#define decode_change_info_maxsz (5)
+#define decode_open_maxsz (op_decode_hdr_maxsz + \
+ decode_stateid_maxsz + \
+ decode_change_info_maxsz + 1 + \
+ nfs4_fattr_bitmap_maxsz + \
+ decode_delegation_maxsz)
+#define encode_open_confirm_maxsz \
+ (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + 1)
+#define decode_open_confirm_maxsz \
+ (op_decode_hdr_maxsz + \
+ decode_stateid_maxsz)
+#define encode_open_downgrade_maxsz \
+ (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + 1 + \
+ encode_share_access_maxsz)
+#define decode_open_downgrade_maxsz \
+ (op_decode_hdr_maxsz + \
+ decode_stateid_maxsz)
+#define encode_close_maxsz (op_encode_hdr_maxsz + \
+ 1 + encode_stateid_maxsz)
+#define decode_close_maxsz (op_decode_hdr_maxsz + \
+ decode_stateid_maxsz)
+#define encode_setattr_maxsz (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + \
+ encode_attrs_maxsz)
+#define decode_setattr_maxsz (op_decode_hdr_maxsz + \
+ nfs4_fattr_bitmap_maxsz)
+#define encode_read_maxsz (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + 3)
+#define decode_read_maxsz (op_decode_hdr_maxsz + 2 + 1)
+#define encode_readdir_maxsz (op_encode_hdr_maxsz + \
+ 2 + encode_verifier_maxsz + 5 + \
+ nfs4_label_maxsz)
+#define decode_readdir_maxsz (op_decode_hdr_maxsz + \
+ decode_verifier_maxsz + 1)
+#define encode_readlink_maxsz (op_encode_hdr_maxsz)
+#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1 + 1)
+#define encode_write_maxsz (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + 4)
+#define decode_write_maxsz (op_decode_hdr_maxsz + \
+ 2 + decode_verifier_maxsz)
+#define encode_commit_maxsz (op_encode_hdr_maxsz + 3)
+#define decode_commit_maxsz (op_decode_hdr_maxsz + \
+ decode_verifier_maxsz)
+#define encode_remove_maxsz (op_encode_hdr_maxsz + \
+ nfs4_name_maxsz)
+#define decode_remove_maxsz (op_decode_hdr_maxsz + \
+ decode_change_info_maxsz)
+#define encode_rename_maxsz (op_encode_hdr_maxsz + \
+ 2 * nfs4_name_maxsz)
+#define decode_rename_maxsz (op_decode_hdr_maxsz + \
+ decode_change_info_maxsz + \
+ decode_change_info_maxsz)
+#define encode_link_maxsz (op_encode_hdr_maxsz + \
+ nfs4_name_maxsz)
+#define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz)
+#define encode_lockowner_maxsz (7)
+#define encode_lock_maxsz (op_encode_hdr_maxsz + \
+ 7 + \
+ 1 + encode_stateid_maxsz + 1 + \
+ encode_lockowner_maxsz)
+#define decode_lock_denied_maxsz \
+ (8 + decode_lockowner_maxsz)
+#define decode_lock_maxsz (op_decode_hdr_maxsz + \
+ decode_lock_denied_maxsz)
+#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \
+ encode_lockowner_maxsz)
+#define decode_lockt_maxsz (op_decode_hdr_maxsz + \
+ decode_lock_denied_maxsz)
+#define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \
+ encode_stateid_maxsz + \
+ 4)
+#define decode_locku_maxsz (op_decode_hdr_maxsz + \
+ decode_stateid_maxsz)
+#define encode_release_lockowner_maxsz \
+ (op_encode_hdr_maxsz + \
+ encode_lockowner_maxsz)
+#define decode_release_lockowner_maxsz \
+ (op_decode_hdr_maxsz)
+#define encode_access_maxsz (op_encode_hdr_maxsz + 1)
+#define decode_access_maxsz (op_decode_hdr_maxsz + 2)
+#define encode_symlink_maxsz (op_encode_hdr_maxsz + \
+ 1 + nfs4_name_maxsz + \
+ 1 + \
+ nfs4_fattr_maxsz)
+#define decode_symlink_maxsz (op_decode_hdr_maxsz + 8)
+#define encode_create_maxsz (op_encode_hdr_maxsz + \
+ 1 + 2 + nfs4_name_maxsz + \
+ encode_attrs_maxsz)
+#define decode_create_maxsz (op_decode_hdr_maxsz + \
+ decode_change_info_maxsz + \
+ nfs4_fattr_bitmap_maxsz)
+#define encode_statfs_maxsz (encode_getattr_maxsz)
+#define decode_statfs_maxsz (decode_getattr_maxsz)
+#define encode_delegreturn_maxsz (op_encode_hdr_maxsz + 4)
+#define decode_delegreturn_maxsz (op_decode_hdr_maxsz)
+#define encode_getacl_maxsz (encode_getattr_maxsz)
+#define decode_getacl_maxsz (op_decode_hdr_maxsz + \
+ nfs4_fattr_bitmap_maxsz + 1 + 1)
+#define encode_setacl_maxsz (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + 3)
+#define decode_setacl_maxsz (decode_setattr_maxsz)
+#define encode_fs_locations_maxsz \
+ (encode_getattr_maxsz)
+#define decode_fs_locations_maxsz \
+ (1)
+#define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz)
+#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4))
+
+#if defined(CONFIG_NFS_V4_1)
+#define NFS4_MAX_MACHINE_NAME_LEN (64)
+#define IMPL_NAME_LIMIT (sizeof(utsname()->sysname) + sizeof(utsname()->release) + \
+ sizeof(utsname()->version) + sizeof(utsname()->machine) + 8)
+
+#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + /* EXCHANGE_ID argument size, in 32-bit XDR words */ \
+ encode_verifier_maxsz + \
+ 1 /* co_ownerid.len */ + \
+ /* eia_clientowner */ \
+ 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+ 1 /* flags */ + \
+ 1 /* spa_how */ + \
+ /* max is SP4_MACH_CRED (for now): the two op bitmaps below */ \
+ 1 + NFS4_OP_MAP_NUM_WORDS + \
+ 1 + NFS4_OP_MAP_NUM_WORDS + \
+ 1 /* implementation id array of size 1 */ + \
+ 1 /* nii_domain */ + \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+ 1 /* nii_name */ + \
+ XDR_QUADLEN(IMPL_NAME_LIMIT) + \
+ 3 /* nii_date */)
+#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + /* EXCHANGE_ID result size, in 32-bit XDR words */ \
+ 2 /* eir_clientid */ + \
+ 1 /* eir_sequenceid */ + \
+ 1 /* eir_flags */ + \
+ 1 /* spr_how */ + \
+ /* max is SP4_MACH_CRED (for now): the two op bitmaps below */ \
+ 1 + NFS4_OP_MAP_NUM_WORDS + \
+ 1 + NFS4_OP_MAP_NUM_WORDS + \
+ 2 /* eir_server_owner.so_minor_id */ + \
+ /* eir_server_owner.so_major_id<> */ \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
+ /* eir_server_scope<> */ \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
+ 1 /* eir_server_impl_id array length */ + \
+ 1 /* nii_domain */ + \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+ 1 /* nii_name */ + \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+ 3 /* nii_date */)
+#define encode_channel_attrs_maxsz (6 + 1 /* ca_rdma_ird.len (0) */)
+#define decode_channel_attrs_maxsz (6 + \
+ 1 /* ca_rdma_ird.len */ + \
+ 1 /* ca_rdma_ird */)
+#define encode_create_session_maxsz (op_encode_hdr_maxsz + \
+ 2 /* csa_clientid */ + \
+ 1 /* csa_sequence */ + \
+ 1 /* csa_flags */ + \
+ encode_channel_attrs_maxsz + \
+ encode_channel_attrs_maxsz + \
+ 1 /* csa_cb_program */ + \
+ 1 /* csa_sec_parms.len (1) */ + \
+ 1 /* cb_secflavor (AUTH_SYS) */ + \
+ 1 /* stamp */ + \
+ 1 /* machinename.len */ + \
+ XDR_QUADLEN(NFS4_MAX_MACHINE_NAME_LEN) + \
+ 1 /* uid */ + \
+ 1 /* gid */ + \
+ 1 /* gids.len (0) */)
+#define decode_create_session_maxsz (op_decode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+ 1 /* csr_sequence */ + \
+ 1 /* csr_flags */ + \
+ decode_channel_attrs_maxsz + \
+ decode_channel_attrs_maxsz)
+#define encode_bind_conn_to_session_maxsz (op_encode_hdr_maxsz + \
+ /* bctsa_sessid */ \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+ 1 /* bctsa_dir */ + \
+ 1 /* bctsa_use_conn_in_rdma_mode */)
+#define decode_bind_conn_to_session_maxsz (op_decode_hdr_maxsz + \
+ /* bctsr_sessid */ \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+ 1 /* bctsr_dir */ + \
+ 1 /* bctsr_use_conn_in_rdma_mode */)
+#define encode_destroy_session_maxsz (op_encode_hdr_maxsz + 4)
+#define decode_destroy_session_maxsz (op_decode_hdr_maxsz)
+#define encode_destroy_clientid_maxsz (op_encode_hdr_maxsz + 2)
+#define decode_destroy_clientid_maxsz (op_decode_hdr_maxsz)
+#define encode_sequence_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4)
+#define decode_sequence_maxsz (op_decode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
+#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
+#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
+ 1 /* layout type */ + \
+ 1 /* maxcount */ + \
+ 1 /* bitmap size */ + \
+ 1 /* notification bitmap length */ + \
+ 1 /* notification bitmap, word 0 */)
+#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
+ 1 /* layout type */ + \
+ 1 /* opaque devaddr4 length */ + \
+ /* devaddr4 payload is read into page */ \
+ 1 /* notification bitmap length */ + \
+ 1 /* notification bitmap, word 0 */ + \
+ 1 /* possible XDR padding */)
+#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
+ encode_stateid_maxsz)
+#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
+ decode_stateid_maxsz + \
+ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + 1)
+#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \
+ 2 /* offset */ + \
+ 2 /* length */ + \
+ 1 /* reclaim */ + \
+ encode_stateid_maxsz + \
+ 1 /* new offset (true) */ + \
+ 2 /* last byte written */ + \
+ 1 /* nt_timechanged (false) */ + \
+ 1 /* layoutupdate4 layout type */ + \
+ 1 /* layoutupdate4 opaque len */)
+ /* The actual content of layoutupdate4 is not counted here: it
+ should be allocated by drivers and spliced in
+ using xdr_write_pages. */
+#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
+#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + \
+ 1 + \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT))
+#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
+ 1 + decode_stateid_maxsz)
+#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1)
+#define decode_secinfo_no_name_maxsz decode_secinfo_maxsz
+#define encode_test_stateid_maxsz (op_encode_hdr_maxsz + 2 + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define decode_test_stateid_maxsz (op_decode_hdr_maxsz + 2 + 1)
+#define encode_free_stateid_maxsz (op_encode_hdr_maxsz + 1 + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define decode_free_stateid_maxsz (op_decode_hdr_maxsz)
+#else /* CONFIG_NFS_V4_1 */
+#define encode_sequence_maxsz 0
+#define decode_sequence_maxsz 0
+#define encode_layoutreturn_maxsz 0
+#define decode_layoutreturn_maxsz 0
+#define encode_layoutget_maxsz 0
+#define decode_layoutget_maxsz 0
+#endif /* CONFIG_NFS_V4_1 */
+
+#define NFS4_enc_compound_sz (1024) /* XXX: large enough? */
+#define NFS4_dec_compound_sz (1024) /* XXX: large enough? */
+#define NFS4_enc_read_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_read_maxsz)
+#define NFS4_dec_read_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_read_maxsz)
+#define NFS4_enc_readlink_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_readlink_maxsz)
+#define NFS4_dec_readlink_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_readlink_maxsz)
+#define NFS4_enc_readdir_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_readdir_maxsz)
+#define NFS4_dec_readdir_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_readdir_maxsz)
+#define NFS4_enc_write_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_write_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_write_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_write_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_commit_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_commit_maxsz)
+#define NFS4_dec_commit_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_commit_maxsz)
+#define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_open_maxsz + \
+ encode_access_maxsz + \
+ encode_getfh_maxsz + \
+ encode_getattr_maxsz + \
+ encode_layoutget_maxsz)
+#define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_open_maxsz + \
+ decode_access_maxsz + \
+ decode_getfh_maxsz + \
+ decode_getattr_maxsz + \
+ decode_layoutget_maxsz)
+#define NFS4_enc_open_confirm_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_putfh_maxsz + \
+ encode_open_confirm_maxsz)
+#define NFS4_dec_open_confirm_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_putfh_maxsz + \
+ decode_open_confirm_maxsz)
+#define NFS4_enc_open_noattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_open_maxsz + \
+ encode_access_maxsz + \
+ encode_getattr_maxsz + \
+ encode_layoutget_maxsz)
+#define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_open_maxsz + \
+ decode_access_maxsz + \
+ decode_getattr_maxsz + \
+ decode_layoutget_maxsz)
+#define NFS4_enc_open_downgrade_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_layoutreturn_maxsz + \
+ encode_open_downgrade_maxsz)
+#define NFS4_dec_open_downgrade_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutreturn_maxsz + \
+ decode_open_downgrade_maxsz)
+#define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_layoutreturn_maxsz + \
+ encode_close_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutreturn_maxsz + \
+ decode_close_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_setattr_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_setattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_setattr_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_fsinfo_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_fsinfo_maxsz)
+#define NFS4_dec_fsinfo_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_fsinfo_maxsz)
+#define NFS4_enc_renew_sz (compound_encode_hdr_maxsz + \
+ encode_renew_maxsz)
+#define NFS4_dec_renew_sz (compound_decode_hdr_maxsz + \
+ decode_renew_maxsz)
+#define NFS4_enc_setclientid_sz (compound_encode_hdr_maxsz + \
+ encode_setclientid_maxsz)
+#define NFS4_dec_setclientid_sz (compound_decode_hdr_maxsz + \
+ decode_setclientid_maxsz)
+#define NFS4_enc_setclientid_confirm_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_setclientid_confirm_maxsz)
+#define NFS4_dec_setclientid_confirm_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_setclientid_confirm_maxsz)
+#define NFS4_enc_lock_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_lock_maxsz)
+#define NFS4_dec_lock_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_lock_maxsz)
+#define NFS4_enc_lockt_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_lockt_maxsz)
+#define NFS4_dec_lockt_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_lockt_maxsz)
+#define NFS4_enc_locku_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_locku_maxsz)
+#define NFS4_dec_locku_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_locku_maxsz)
+#define NFS4_enc_release_lockowner_sz /* COMPOUND header + RELEASE_LOCKOWNER op (op header included) */ \
+ (compound_encode_hdr_maxsz + \
+ encode_release_lockowner_maxsz)
+#define NFS4_dec_release_lockowner_sz /* COMPOUND header + RELEASE_LOCKOWNER result (op header only) */ \
+ (compound_decode_hdr_maxsz + \
+ decode_release_lockowner_maxsz)
+#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_access_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_access_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_access_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_getattr_maxsz + \
+ encode_renew_maxsz)
+#define NFS4_dec_getattr_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_getattr_maxsz + \
+ decode_renew_maxsz)
+#define NFS4_enc_lookup_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_lookup_maxsz + \
+ encode_getattr_maxsz + \
+ encode_getfh_maxsz)
+#define NFS4_dec_lookup_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_lookup_maxsz + \
+ decode_getattr_maxsz + \
+ decode_getfh_maxsz)
+#define NFS4_enc_lookupp_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_lookupp_maxsz + \
+ encode_getattr_maxsz + \
+ encode_getfh_maxsz)
+#define NFS4_dec_lookupp_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_lookupp_maxsz + \
+ decode_getattr_maxsz + \
+ decode_getfh_maxsz)
+#define NFS4_enc_lookup_root_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putrootfh_maxsz + \
+ encode_getattr_maxsz + \
+ encode_getfh_maxsz)
+#define NFS4_dec_lookup_root_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putrootfh_maxsz + \
+ decode_getattr_maxsz + \
+ decode_getfh_maxsz)
+#define NFS4_enc_remove_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_remove_maxsz)
+#define NFS4_dec_remove_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_remove_maxsz)
+#define NFS4_enc_rename_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_savefh_maxsz + \
+ encode_putfh_maxsz + \
+ encode_rename_maxsz)
+#define NFS4_dec_rename_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_savefh_maxsz + \
+ decode_putfh_maxsz + \
+ decode_rename_maxsz)
+#define NFS4_enc_link_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_savefh_maxsz + \
+ encode_putfh_maxsz + \
+ encode_link_maxsz + \
+ encode_restorefh_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_link_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_savefh_maxsz + \
+ decode_putfh_maxsz + \
+ decode_link_maxsz + \
+ decode_restorefh_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_symlink_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_symlink_maxsz + \
+ encode_getattr_maxsz + \
+ encode_getfh_maxsz)
+#define NFS4_dec_symlink_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_symlink_maxsz + \
+ decode_getattr_maxsz + \
+ decode_getfh_maxsz)
+#define NFS4_enc_create_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_create_maxsz + \
+ encode_getfh_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_create_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_create_maxsz + \
+ decode_getfh_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_pathconf_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_pathconf_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_statfs_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_statfs_maxsz)
+#define NFS4_dec_statfs_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_statfs_maxsz)
+#define NFS4_enc_server_caps_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_server_caps_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_layoutreturn_maxsz + \
+ encode_delegreturn_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutreturn_maxsz + \
+ decode_delegreturn_maxsz + \
+ decode_getattr_maxsz)
+#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_getacl_maxsz)
+#define NFS4_dec_getacl_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_getacl_maxsz)
+#define NFS4_enc_setacl_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_setacl_maxsz)
+#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_setacl_maxsz)
+#define NFS4_enc_fs_locations_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_lookup_maxsz + \
+ encode_fs_locations_maxsz + \
+ encode_renew_maxsz)
+#define NFS4_dec_fs_locations_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_lookup_maxsz + \
+ decode_fs_locations_maxsz + \
+ decode_renew_maxsz)
+#define NFS4_enc_secinfo_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_secinfo_maxsz)
+#define NFS4_dec_secinfo_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_secinfo_maxsz)
+#define NFS4_enc_fsid_present_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_getfh_maxsz + \
+ encode_renew_maxsz)
+#define NFS4_dec_fsid_present_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_getfh_maxsz + \
+ decode_renew_maxsz)
+#if defined(CONFIG_NFS_V4_1)
+#define NFS4_enc_bind_conn_to_session_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_bind_conn_to_session_maxsz)
+#define NFS4_dec_bind_conn_to_session_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_bind_conn_to_session_maxsz)
+#define NFS4_enc_exchange_id_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_exchange_id_maxsz)
+#define NFS4_dec_exchange_id_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_exchange_id_maxsz)
+#define NFS4_enc_create_session_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_create_session_maxsz)
+#define NFS4_dec_create_session_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_create_session_maxsz)
+#define NFS4_enc_destroy_session_sz (compound_encode_hdr_maxsz + \
+ encode_destroy_session_maxsz)
+#define NFS4_dec_destroy_session_sz (compound_decode_hdr_maxsz + \
+ decode_destroy_session_maxsz)
+#define NFS4_enc_destroy_clientid_sz (compound_encode_hdr_maxsz + \
+ encode_destroy_clientid_maxsz)
+#define NFS4_dec_destroy_clientid_sz (compound_decode_hdr_maxsz + \
+ decode_destroy_clientid_maxsz)
+#define NFS4_enc_sequence_sz /* encode side: uses the encode header size like every other NFS4_enc_* */ \
+ (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz)
+#define NFS4_dec_sequence_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz)
+#endif
+#define NFS4_enc_get_lease_time_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putrootfh_maxsz + \
+ encode_fsinfo_maxsz)
+#define NFS4_dec_get_lease_time_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putrootfh_maxsz + \
+ decode_fsinfo_maxsz)
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Per-procedure size bounds for the NFSv4.1-only compounds.  Each macro sums
+ * the compound header size and the sizes of the ops the compound contains;
+ * the NFS4_enc_* form bounds the request, the NFS4_dec_* form the reply.
+ */
+/* RECLAIM_COMPLETE: SEQUENCE + RECLAIM_COMPLETE */
+#define NFS4_enc_reclaim_complete_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_reclaim_complete_maxsz)
+#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_reclaim_complete_maxsz)
+/* GETDEVICEINFO: SEQUENCE + GETDEVICEINFO */
+#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz +\
+ encode_getdeviceinfo_maxsz)
+#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_getdeviceinfo_maxsz)
+/* LAYOUTGET: SEQUENCE + PUTFH + LAYOUTGET */
+#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_layoutget_maxsz)
+#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutget_maxsz)
+/* LAYOUTCOMMIT: SEQUENCE + PUTFH + LAYOUTCOMMIT + GETATTR */
+#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz +\
+ encode_putfh_maxsz + \
+ encode_layoutcommit_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutcommit_maxsz + \
+ decode_getattr_maxsz)
+/* LAYOUTRETURN: SEQUENCE + PUTFH + LAYOUTRETURN */
+#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_layoutreturn_maxsz)
+#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutreturn_maxsz)
+/* SECINFO_NO_NAME: SEQUENCE + PUTROOTFH + SECINFO_NO_NAME */
+#define NFS4_enc_secinfo_no_name_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putrootfh_maxsz +\
+ encode_secinfo_no_name_maxsz)
+#define NFS4_dec_secinfo_no_name_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putrootfh_maxsz + \
+ decode_secinfo_no_name_maxsz)
+/* TEST_STATEID: SEQUENCE + TEST_STATEID */
+#define NFS4_enc_test_stateid_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_test_stateid_maxsz)
+#define NFS4_dec_test_stateid_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_test_stateid_maxsz)
+/* FREE_STATEID: SEQUENCE + FREE_STATEID */
+#define NFS4_enc_free_stateid_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_free_stateid_maxsz)
+#define NFS4_dec_free_stateid_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_free_stateid_maxsz)
+
+/*
+ * Fixed per-request overheads.  Each value is a sum of header and op sizes
+ * multiplied by XDR_UNIT.
+ */
+/* Overhead of a write request: RPC header + compound header + SEQUENCE + PUTFH + GETATTR. */
+const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_encode_hdr_maxsz +
+ encode_sequence_maxsz +
+ encode_putfh_maxsz +
+ encode_getattr_maxsz) *
+ XDR_UNIT);
+
+/*
+ * Overhead of a read reply: RPC header + compound header + SEQUENCE + PUTFH.
+ * NOTE(review): this uses RPC_MAX_HEADER_WITH_AUTH together with the decode_*
+ * sizes, whereas nfs41_maxgetdevinfo_overhead below uses
+ * RPC_MAX_REPHEADER_WITH_AUTH - confirm which header constant is intended.
+ */
+const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_decode_hdr_maxsz +
+ decode_sequence_maxsz +
+ decode_putfh_maxsz) *
+ XDR_UNIT);
+
+/* Overhead of a GETDEVICEINFO reply: RPC reply header + compound header + SEQUENCE. */
+const u32 nfs41_maxgetdevinfo_overhead = ((RPC_MAX_REPHEADER_WITH_AUTH +
+ compound_decode_hdr_maxsz +
+ decode_sequence_maxsz) *
+ XDR_UNIT);
+EXPORT_SYMBOL_GPL(nfs41_maxgetdevinfo_overhead);
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Lookup table indexed by NFSv4 file type (NF4*), giving the matching
+ * S_IF* file-format bits.  NF4BAD, NF4ATTRDIR and NF4NAMEDATTR map to 0.
+ */
+static const umode_t nfs_type2fmt[] = {
+ [NF4BAD] = 0,
+ [NF4REG] = S_IFREG,
+ [NF4DIR] = S_IFDIR,
+ [NF4BLK] = S_IFBLK,
+ [NF4CHR] = S_IFCHR,
+ [NF4LNK] = S_IFLNK,
+ [NF4SOCK] = S_IFSOCK,
+ [NF4FIFO] = S_IFIFO,
+ [NF4ATTRDIR] = 0,
+ [NF4NAMEDATTR] = 0,
+};
+
+/*
+ * Bookkeeping for one COMPOUND request while it is being encoded (and,
+ * presumably, decoded - the decode side is outside this excerpt).
+ */
+struct compound_hdr {
+ int32_t status;
+ /* number of ops encoded so far; bumped by encode_op_hdr() */
+ uint32_t nops;
+ /* location of the op count in the send buffer, patched by encode_nops() */
+ __be32 * nops_p;
+ /* length and contents of the compound tag */
+ uint32_t taglen;
+ char * tag;
+ uint32_t replen; /* expected reply words */
+ u32 minorversion;
+};
+
+/*
+ * Reserve @nbytes in the XDR send stream and return a pointer to the space.
+ * A NULL result from xdr_reserve_space() triggers BUG_ON().
+ */
+static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes)
+{
+	__be32 *space;
+
+	space = xdr_reserve_space(xdr, nbytes);
+	BUG_ON(space == NULL);
+	return space;
+}
+
+/* Append @len bytes from @buf as fixed-length opaque data; warn once if encoding fails. */
+static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len)
+{
+	ssize_t ret = xdr_stream_encode_opaque_fixed(xdr, buf, len);
+
+	WARN_ON_ONCE(ret < 0);
+}
+
+/* Append @len bytes of @str as variable-length opaque data; warn once if encoding fails. */
+static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
+{
+	ssize_t ret = xdr_stream_encode_opaque(xdr, str, len);
+
+	WARN_ON_ONCE(ret < 0);
+}
+
+/* Append the 32-bit value @n to the stream; warn once if encoding fails. */
+static void encode_uint32(struct xdr_stream *xdr, u32 n)
+{
+	ssize_t ret = xdr_stream_encode_u32(xdr, n);
+
+	WARN_ON_ONCE(ret < 0);
+}
+
+/* Append the 64-bit value @n to the stream; warn once if encoding fails. */
+static void encode_uint64(struct xdr_stream *xdr, u64 n)
+{
+	ssize_t ret = xdr_stream_encode_u64(xdr, n);
+
+	WARN_ON_ONCE(ret < 0);
+}
+
+/*
+ * Encode the first @len words of @bitmap as a uint32 array, after dropping
+ * trailing all-zero words.
+ *
+ * Returns the number of words encoded, or the negative value returned by
+ * xdr_stream_encode_uint32_array() (a warning is emitted once in that case).
+ */
+static ssize_t xdr_encode_bitmap4(struct xdr_stream *xdr,
+		const __u32 *bitmap, size_t len)
+{
+	ssize_t status;
+
+	/* Shrink len until the last word is non-zero (or nothing is left) */
+	for (; len != 0; len--) {
+		if (bitmap[len - 1] != 0)
+			break;
+	}
+
+	status = xdr_stream_encode_uint32_array(xdr, bitmap, len);
+	if (WARN_ON_ONCE(status < 0))
+		return status;
+	return len;
+}
+
+/*
+ * Store @bitmap AND @mask, word by word, into @res.
+ *
+ * Trailing words that are zero in either @bitmap or @mask are trimmed first;
+ * only the remaining words of @res are written.  Returns the trimmed length.
+ */
+static size_t mask_bitmap4(const __u32 *bitmap, const __u32 *mask,
+		__u32 *res, size_t len)
+{
+	size_t word;
+
+	while (len != 0) {
+		if (bitmap[len - 1] != 0 && mask[len - 1] != 0)
+			break;
+		len--;
+	}
+
+	for (word = 0; word < len; word++)
+		res[word] = bitmap[word] & mask[word];
+
+	return len;
+}
+
+/* Encode the counter behind @seqid, or 0 when no seqid is supplied. */
+static void encode_nfs4_seqid(struct xdr_stream *xdr,
+		const struct nfs_seqid *seqid)
+{
+	encode_uint32(xdr, seqid != NULL ? seqid->sequence->counter : 0);
+}
+
+/*
+ * Start a COMPOUND request: tag, minor version and a placeholder op count.
+ * The address of the op count is remembered in hdr->nops_p so that
+ * encode_nops() can patch in the final value.  @req is not used here.
+ */
+static void encode_compound_hdr(struct xdr_stream *xdr,
+		struct rpc_rqst *req,
+		struct compound_hdr *hdr)
+{
+	__be32 *words;
+
+	/*
+	 * Initialize the running count of expected data in the reply.
+	 * NOTE: the replied tag SHOULD be the same as the one sent,
+	 * but the server is not REQUIRED to echo it back.
+	 */
+	hdr->replen = 3 + hdr->taglen;
+
+	WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN);
+	encode_string(xdr, hdr->taglen, hdr->tag);
+
+	words = reserve_space(xdr, 8);
+	words[0] = cpu_to_be32(hdr->minorversion);
+	words[1] = cpu_to_be32(hdr->nops);
+	hdr->nops_p = &words[1];
+}
+
+/*
+ * Emit the opcode @op and account for it in @hdr: one more operation, and
+ * @replen more expected reply data.
+ */
+static void encode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 op,
+		uint32_t replen,
+		struct compound_hdr *hdr)
+{
+	hdr->nops += 1;
+	hdr->replen += replen;
+	encode_uint32(xdr, op);
+}
+
+/*
+ * Patch the final operation count into the slot recorded by
+ * encode_compound_hdr().  Warns once if the count exceeds NFS4_MAX_OPS.
+ */
+static void encode_nops(struct compound_hdr *hdr)
+{
+	WARN_ON_ONCE(hdr->nops > NFS4_MAX_OPS);
+	*hdr->nops_p = cpu_to_be32(hdr->nops);
+}
+
+/* Encode @stateid as NFS4_STATEID_SIZE bytes of fixed-length opaque data. */
+static void encode_nfs4_stateid(struct xdr_stream *xdr, const nfs4_stateid *stateid)
+{
+ encode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
+}
+
+/* Encode the NFS4_VERIFIER_SIZE bytes of @verf->data as fixed-length opaque data. */
+static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
+{
+ encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);
+}
+
+/*
+ * Write @t at @p as a 64-bit seconds value followed by a 32-bit nanoseconds
+ * value.  Returns the position just past the encoded time.
+ */
+static __be32 *
+xdr_encode_nfstime4(__be32 *p, const struct timespec64 *t)
+{
+	__be32 *nsec = xdr_encode_hyper(p, t->tv_sec);
+
+	*nsec = cpu_to_be32(t->tv_nsec);
+	return nsec + 1;
+}
+
+/*
+ * Encode a settable-attribute list: an attribute bitmap followed by an
+ * opaque blob holding the attribute values.
+ *
+ * @iap:      attributes requested by the caller (ia_valid selects which)
+ * @label:    optional security label, may be NULL
+ * @umask:    optional umask; when non-NULL and the server advertises
+ *            MODE_UMASK, mode is sent as mode+umask instead of plain mode
+ * @server:   used to map uid/gid to owner/group strings
+ * @attrmask: bitmap an attribute must be present in to be sent
+ *
+ * The function works in two passes: the first builds the bitmap (bmval) and
+ * totals the byte length of the values (len); the second writes the values
+ * in the same order.  The two passes must stay in step.
+ */
+static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
+ const struct nfs4_label *label,
+ const umode_t *umask,
+ const struct nfs_server *server,
+ const uint32_t attrmask[])
+{
+ char owner_name[IDMAP_NAMESZ];
+ char owner_group[IDMAP_NAMESZ];
+ int owner_namelen = 0;
+ int owner_grouplen = 0;
+ __be32 *p;
+ uint32_t len = 0;
+ uint32_t bmval[3] = { 0 };
+
+ /*
+ * We reserve enough space to write the entire attribute buffer at once.
+ */
+ if ((iap->ia_valid & ATTR_SIZE) && (attrmask[0] & FATTR4_WORD0_SIZE)) {
+ bmval[0] |= FATTR4_WORD0_SIZE;
+ len += 8;
+ }
+ if (iap->ia_valid & ATTR_MODE) {
+ /* mode+umask (8 bytes) is preferred over plain mode (4 bytes) */
+ if (umask && (attrmask[2] & FATTR4_WORD2_MODE_UMASK)) {
+ bmval[2] |= FATTR4_WORD2_MODE_UMASK;
+ len += 8;
+ } else if (attrmask[1] & FATTR4_WORD1_MODE) {
+ bmval[1] |= FATTR4_WORD1_MODE;
+ len += 4;
+ }
+ }
+ if ((iap->ia_valid & ATTR_UID) && (attrmask[1] & FATTR4_WORD1_OWNER)) {
+ owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
+ if (owner_namelen < 0) {
+ dprintk("nfs: couldn't resolve uid %d to string\n",
+ from_kuid(&init_user_ns, iap->ia_uid));
+ /* XXX */
+ /* mapping failed: fall back to the literal name "nobody" */
+ strcpy(owner_name, "nobody");
+ owner_namelen = sizeof("nobody") - 1;
+ /* goto out; */
+ }
+ bmval[1] |= FATTR4_WORD1_OWNER;
+ /* 4-byte length word plus the name padded to a multiple of 4 */
+ len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
+ }
+ if ((iap->ia_valid & ATTR_GID) &&
+ (attrmask[1] & FATTR4_WORD1_OWNER_GROUP)) {
+ owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);
+ if (owner_grouplen < 0) {
+ dprintk("nfs: couldn't resolve gid %d to string\n",
+ from_kgid(&init_user_ns, iap->ia_gid));
+ /* mapping failed: fall back to the literal name "nobody" */
+ strcpy(owner_group, "nobody");
+ owner_grouplen = sizeof("nobody") - 1;
+ /* goto out; */
+ }
+ bmval[1] |= FATTR4_WORD1_OWNER_GROUP;
+ len += 4 + (XDR_QUADLEN(owner_grouplen) << 2);
+ }
+ if (attrmask[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
+ /* explicit time: selector + nfstime4; otherwise selector only */
+ if (iap->ia_valid & ATTR_ATIME_SET) {
+ bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
+ len += 4 + (nfstime4_maxsz << 2);
+ } else if (iap->ia_valid & ATTR_ATIME) {
+ bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
+ len += 4;
+ }
+ }
+ if (attrmask[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
+ if (iap->ia_valid & ATTR_MTIME_SET) {
+ bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
+ len += 4 + (nfstime4_maxsz << 2);
+ } else if (iap->ia_valid & ATTR_MTIME) {
+ bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
+ len += 4;
+ }
+ }
+
+ if (label && (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL)) {
+ /* lfs + pi + label length words, then the padded label bytes */
+ len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
+ bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
+ }
+
+ xdr_encode_bitmap4(xdr, bmval, ARRAY_SIZE(bmval));
+ /*
+ * NOTE(review): the return value is not checked; p is written through
+ * below, so this relies on the reservation succeeding - confirm.
+ */
+ xdr_stream_encode_opaque_inline(xdr, (void **)&p, len);
+
+ /* Second pass: write the values in the order counted above */
+ if (bmval[0] & FATTR4_WORD0_SIZE)
+ p = xdr_encode_hyper(p, iap->ia_size);
+ if (bmval[1] & FATTR4_WORD1_MODE)
+ *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
+ if (bmval[1] & FATTR4_WORD1_OWNER)
+ p = xdr_encode_opaque(p, owner_name, owner_namelen);
+ if (bmval[1] & FATTR4_WORD1_OWNER_GROUP)
+ p = xdr_encode_opaque(p, owner_group, owner_grouplen);
+ if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
+ if (iap->ia_valid & ATTR_ATIME_SET) {
+ *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+ p = xdr_encode_nfstime4(p, &iap->ia_atime);
+ } else
+ *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
+ }
+ if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
+ if (iap->ia_valid & ATTR_MTIME_SET) {
+ *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+ p = xdr_encode_nfstime4(p, &iap->ia_mtime);
+ } else
+ *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
+ }
+ if (label && (bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) {
+ *p++ = cpu_to_be32(label->lfs);
+ *p++ = cpu_to_be32(label->pi);
+ *p++ = cpu_to_be32(label->len);
+ p = xdr_encode_opaque_fixed(p, label->label, label->len);
+ }
+ if (bmval[2] & FATTR4_WORD2_MODE_UMASK) {
+ *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
+ *p++ = cpu_to_be32(*umask);
+ }
+
+/* out: */
+}
+
+/* Append an ACCESS op: op header followed by the @access mask as a 32-bit value. */
+static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_ACCESS, decode_access_maxsz, hdr);
+ encode_uint32(xdr, access);
+}
+
+/* Append a CLOSE op: op header, then the seqid and the stateid from @arg. */
+static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr);
+ encode_nfs4_seqid(xdr, arg->seqid);
+ encode_nfs4_stateid(xdr, &arg->stateid);
+}
+
+/* Append a COMMIT op: op header, 64-bit offset, 32-bit count. */
+static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr)
+{
+	__be32 *body;
+
+	encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr);
+
+	/* 12 bytes: words 0-1 hold the offset, word 2 the count */
+	body = reserve_space(xdr, 12);
+	xdr_encode_hyper(body, args->offset);
+	body[2] = cpu_to_be32(args->count);
+}
+
+/*
+ * Append a CREATE op: op header, object type, type-specific data (symlink
+ * target for NF4LNK, device numbers for NF4BLK/NF4CHR, nothing otherwise),
+ * the new name and finally its initial attributes.
+ */
+static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_CREATE, decode_create_maxsz, hdr);
+	encode_uint32(xdr, create->ftype);
+
+	if (create->ftype == NF4LNK) {
+		/* length word, then the target carried in the page list */
+		p = reserve_space(xdr, 4);
+		*p = cpu_to_be32(create->u.symlink.len);
+		xdr_write_pages(xdr, create->u.symlink.pages, 0,
+				create->u.symlink.len);
+		xdr->buf->flags |= XDRBUF_WRITE;
+	} else if (create->ftype == NF4BLK || create->ftype == NF4CHR) {
+		p = reserve_space(xdr, 8);
+		p[0] = cpu_to_be32(create->u.device.specdata1);
+		p[1] = cpu_to_be32(create->u.device.specdata2);
+	}
+
+	encode_string(xdr, create->name->len, create->name->name);
+	encode_attrs(xdr, create->attrs, create->label, &create->umask,
+			create->server, create->server->attr_bitmask);
+}
+
+/*
+ * Append a GETATTR op requesting the attributes in @bitmap (@len words).
+ * When @mask is non-NULL the request is restricted to @bitmap AND @mask;
+ * in that case @len is clamped (with a one-time warning) to the size of the
+ * on-stack scratch bitmap.
+ */
+static void encode_getattr(struct xdr_stream *xdr,
+		const __u32 *bitmap, const __u32 *mask, size_t len,
+		struct compound_hdr *hdr)
+{
+	__u32 masked_bitmap[nfs4_fattr_bitmap_maxsz];
+
+	encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);
+
+	if (!mask) {
+		xdr_encode_bitmap4(xdr, bitmap, len);
+		return;
+	}
+
+	if (WARN_ON_ONCE(len > ARRAY_SIZE(masked_bitmap)))
+		len = ARRAY_SIZE(masked_bitmap);
+	len = mask_bitmap4(bitmap, mask, masked_bitmap, len);
+	xdr_encode_bitmap4(xdr, masked_bitmap, len);
+}
+
+/* Append a GETATTR op for nfs4_fattr_bitmap, masked by @bitmask. */
+static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+{
+ encode_getattr(xdr, nfs4_fattr_bitmap, bitmask,
+ ARRAY_SIZE(nfs4_fattr_bitmap), hdr);
+}
+
+/*
+ * Append a GETATTR op for @open_bitmap, masked by @bitmask.  The bitmap
+ * length is fixed at 3 words here.
+ */
+static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask,
+ const u32 *open_bitmap,
+ struct compound_hdr *hdr)
+{
+ encode_getattr(xdr, open_bitmap, bitmask, 3, hdr);
+}
+
+/* Append a GETATTR op for nfs4_fsinfo_bitmap, masked by @bitmask. */
+static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+{
+ encode_getattr(xdr, nfs4_fsinfo_bitmap, bitmask,
+ ARRAY_SIZE(nfs4_fsinfo_bitmap), hdr);
+}
+
+/* Append a GETATTR op for nfs4_fs_locations_bitmap, masked by @bitmask. */
+static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+{
+ encode_getattr(xdr, nfs4_fs_locations_bitmap, bitmask,
+ ARRAY_SIZE(nfs4_fs_locations_bitmap), hdr);
+}
+
+/* Append a GETFH op; it consists of the op header only. */
+static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_GETFH, decode_getfh_maxsz, hdr);
+}
+
+/* Append a LINK op: op header followed by the new link @name. */
+static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_LINK, decode_link_maxsz, hdr);
+ encode_string(xdr, name->len, name->name);
+}
+
+/*
+ * Map a file_lock to an NFSv4 lock type.  F_RDLCK selects a read lock,
+ * anything else a write lock; a non-zero @block selects the blocking ("W")
+ * variant.
+ */
+static inline int nfs4_lock_type(struct file_lock *fl, int block)
+{
+	const bool is_read = (fl->fl_type == F_RDLCK);
+
+	if (block)
+		return is_read ? NFS4_READW_LT : NFS4_WRITEW_LT;
+	return is_read ? NFS4_READ_LT : NFS4_WRITE_LT;
+}
+
+/*
+ * Length of the byte range covered by @fl.  A lock that ends at OFFSET_MAX
+ * is reported as all-ones.
+ */
+static inline uint64_t nfs4_lock_length(struct file_lock *fl)
+{
+	return fl->fl_end == OFFSET_MAX ?
+		~(uint64_t)0 : fl->fl_end - fl->fl_start + 1;
+}
+
+/*
+ * Encode a lock owner: the 64-bit client id followed by a 20-byte opaque
+ * owner made of the tag "lock id:", the device number and the 64-bit id.
+ */
+static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner)
+{
+	/* 32 bytes = 8 words, laid out as annotated below */
+	__be32 *w = reserve_space(xdr, 32);
+
+	xdr_encode_hyper(w, lowner->clientid);		/* words 0-1 */
+	w[2] = cpu_to_be32(20);				/* owner length */
+	xdr_encode_opaque_fixed(w + 3, "lock id:", 8);	/* words 3-4 */
+	w[5] = cpu_to_be32(lowner->s_dev);
+	xdr_encode_hyper(w + 6, lowner->id);		/* words 6-7 */
+}
+
+/*
+ * Append a LOCK op.
+ *
+ * Fixed part (28 bytes after the opcode): type, reclaim, offset, length,
+ * new_lock_owner.  It is followed by either
+ *  - open_seqid, open_stateid, lock_seqid, lock owner  (new lock owner), or
+ *  - lock_stateid, lock_seqid                           (existing owner).
+ */
+static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args, struct compound_hdr *hdr)
+{
+	__be32 *w;
+
+	encode_op_hdr(xdr, OP_LOCK, decode_lock_maxsz, hdr);
+
+	w = reserve_space(xdr, 28);
+	w[0] = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
+	w[1] = cpu_to_be32(args->reclaim);
+	xdr_encode_hyper(w + 2, args->fl->fl_start);
+	xdr_encode_hyper(w + 4, nfs4_lock_length(args->fl));
+	w[6] = cpu_to_be32(args->new_lock_owner);
+
+	if (!args->new_lock_owner) {
+		encode_nfs4_stateid(xdr, &args->lock_stateid);
+		encode_nfs4_seqid(xdr, args->lock_seqid);
+		return;
+	}
+
+	encode_nfs4_seqid(xdr, args->open_seqid);
+	encode_nfs4_stateid(xdr, &args->open_stateid);
+	encode_nfs4_seqid(xdr, args->lock_seqid);
+	encode_lockowner(xdr, &args->lock_owner);
+}
+
+/*
+ * Append a LOCKT op: op header, non-blocking lock type, offset, length and
+ * the lock owner.
+ */
+static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr)
+{
+	__be32 *w;
+
+	encode_op_hdr(xdr, OP_LOCKT, decode_lockt_maxsz, hdr);
+
+	/* 20 bytes: type (word 0), offset (words 1-2), length (words 3-4) */
+	w = reserve_space(xdr, 20);
+	w[0] = cpu_to_be32(nfs4_lock_type(args->fl, 0));
+	xdr_encode_hyper(w + 1, args->fl->fl_start);
+	xdr_encode_hyper(w + 3, nfs4_lock_length(args->fl));
+
+	encode_lockowner(xdr, &args->lock_owner);
+}
+
+/*
+ * Append a LOCKU op: op header, non-blocking lock type, seqid, stateid,
+ * then the 64-bit offset and length of the range.
+ */
+static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr)
+{
+	__be32 *range;
+
+	encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr);
+	encode_uint32(xdr, nfs4_lock_type(args->fl, 0));
+	encode_nfs4_seqid(xdr, args->seqid);
+	encode_nfs4_stateid(xdr, &args->stateid);
+
+	range = reserve_space(xdr, 16);
+	xdr_encode_hyper(range, args->fl->fl_start);
+	xdr_encode_hyper(range + 2, nfs4_lock_length(args->fl));
+}
+
+/* Append a RELEASE_LOCKOWNER op: op header followed by the lock owner. */
+static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_RELEASE_LOCKOWNER, decode_release_lockowner_maxsz, hdr);
+ encode_lockowner(xdr, lowner);
+}
+
+/* Append a LOOKUP op: op header followed by the component @name. */
+static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_LOOKUP, decode_lookup_maxsz, hdr);
+ encode_string(xdr, name->len, name->name);
+}
+
+/* Append a LOOKUPP op; it consists of the op header only. */
+static void encode_lookupp(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_LOOKUPP, decode_lookupp_maxsz, hdr);
+}
+
+/* Encode the share_access / share_deny pair; share_deny is always sent as 0. */
+static void encode_share_access(struct xdr_stream *xdr, u32 share_access)
+{
+	__be32 *pair = reserve_space(xdr, 8);
+
+	pair[0] = cpu_to_be32(share_access);
+	pair[1] = cpu_to_be32(0);	/* for linux, share_deny = 0 always */
+}
+
+/*
+ * Encode the leading arguments of OPEN: seqid, share access/deny, and the
+ * open owner (client id plus a 24-byte opaque owner).
+ */
+static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
+{
+	__be32 *w;
+
+	encode_nfs4_seqid(xdr, arg->seqid);
+	encode_share_access(xdr, arg->share_access);
+
+	/*
+	 * 36 bytes: clientid 8, ownerlen 4, then the 24-byte owner:
+	 * "open id:" 8, s_dev 4, uniquifier 4, create_time 8.
+	 */
+	w = reserve_space(xdr, 36);
+	xdr_encode_hyper(w, arg->clientid);		/* words 0-1 */
+	w[2] = cpu_to_be32(24);				/* owner length */
+	xdr_encode_opaque_fixed(w + 3, "open id:", 8);	/* words 3-4 */
+	w[5] = cpu_to_be32(arg->server->s_dev);
+	w[6] = cpu_to_be32(arg->id.uniquifier);
+	xdr_encode_hyper(w + 7, arg->id.create_time);	/* words 7-8 */
+}
+
+/*
+ * Encode the create mode of an OPEN with O_CREAT, followed by what each mode
+ * carries: attributes for UNCHECKED/GUARDED, a verifier for EXCLUSIVE, and a
+ * verifier plus attributes (limited by exclcreat_bitmask) for EXCLUSIVE4_1.
+ * For any other mode value the reserved word is not written.
+ */
+static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
+{
+	__be32 *mode_p = reserve_space(xdr, 4);
+
+	switch (arg->createmode) {
+	case NFS4_CREATE_UNCHECKED:
+	case NFS4_CREATE_GUARDED:
+		/* both modes send the mode value followed by the attributes */
+		*mode_p = cpu_to_be32(arg->createmode);
+		encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask,
+				arg->server, arg->server->attr_bitmask);
+		break;
+	case NFS4_CREATE_EXCLUSIVE:
+		*mode_p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
+		encode_nfs4_verifier(xdr, &arg->u.verifier);
+		break;
+	case NFS4_CREATE_EXCLUSIVE4_1:
+		*mode_p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
+		encode_nfs4_verifier(xdr, &arg->u.verifier);
+		encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask,
+				arg->server, arg->server->exclcreat_bitmask);
+		break;
+	}
+}
+
+/*
+ * Encode the open type: NOCREATE when O_CREAT is absent from open_flags,
+ * otherwise CREATE followed by the create mode.
+ */
+static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *arg)
+{
+	__be32 *type_p = reserve_space(xdr, 4);
+
+	if (!(arg->open_flags & O_CREAT)) {
+		*type_p = cpu_to_be32(NFS4_OPEN_NOCREATE);
+		return;
+	}
+
+	*type_p = cpu_to_be32(NFS4_OPEN_CREATE);
+	encode_createmode(xdr, arg);
+}
+
+/*
+ * Encode a delegation type: 0 -> NONE, FMODE_READ -> READ,
+ * FMODE_WRITE|FMODE_READ -> WRITE.  Any other value is a BUG().
+ */
+static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delegation_type)
+{
+	__be32 *type_p = reserve_space(xdr, 4);
+
+	if (delegation_type == 0)
+		*type_p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
+	else if (delegation_type == FMODE_READ)
+		*type_p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
+	else if (delegation_type == (FMODE_WRITE|FMODE_READ))
+		*type_p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
+	else
+		BUG();
+}
+
+/* Encode a CLAIM_NULL open claim: the claim type followed by the file @name. */
+static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *name)
+{
+	__be32 *claim = reserve_space(xdr, 4);
+
+	*claim = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
+	encode_string(xdr, name->len, name->name);
+}
+
+/* Encode a CLAIM_PREVIOUS open claim: the claim type followed by the delegation @type. */
+static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
+{
+	__be32 *claim = reserve_space(xdr, 4);
+
+	*claim = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
+	encode_delegation_type(xdr, type);
+}
+
+/*
+ * Encode a CLAIM_DELEGATE_CUR open claim: the claim type, the delegation
+ * @stateid and the file @name.
+ */
+static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struct qstr *name, const nfs4_stateid *stateid)
+{
+	__be32 *claim = reserve_space(xdr, 4);
+
+	*claim = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
+	encode_nfs4_stateid(xdr, stateid);
+	encode_string(xdr, name->len, name->name);
+}
+
+/* Encode a CLAIM_FH open claim; it carries the claim type only. */
+static inline void encode_claim_fh(struct xdr_stream *xdr)
+{
+	__be32 *claim = reserve_space(xdr, 4);
+
+	*claim = cpu_to_be32(NFS4_OPEN_CLAIM_FH);
+}
+
+/* Encode a CLAIM_DELEG_CUR_FH open claim: the claim type followed by the delegation @stateid. */
+static inline void encode_claim_delegate_cur_fh(struct xdr_stream *xdr, const nfs4_stateid *stateid)
+{
+	__be32 *claim = reserve_space(xdr, 4);
+
+	*claim = cpu_to_be32(NFS4_OPEN_CLAIM_DELEG_CUR_FH);
+	encode_nfs4_stateid(xdr, stateid);
+}
+
+/*
+ * Append an OPEN op: op header, the common open arguments, the open type
+ * and finally the claim selected by arg->claim.  An unknown claim type is a
+ * BUG().
+ */
+static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_OPEN, decode_open_maxsz, hdr);
+ encode_openhdr(xdr, arg);
+ encode_opentype(xdr, arg);
+ /* the claim decides which members of arg->u / arg->name are used */
+ switch (arg->claim) {
+ case NFS4_OPEN_CLAIM_NULL:
+ encode_claim_null(xdr, arg->name);
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ encode_claim_previous(xdr, arg->u.delegation_type);
+ break;
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation);
+ break;
+ case NFS4_OPEN_CLAIM_FH:
+ encode_claim_fh(xdr);
+ break;
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ encode_claim_delegate_cur_fh(xdr, &arg->u.delegation);
+ break;
+ default:
+ BUG();
+ }
+}
+
+/* Append an OPEN_CONFIRM op: op header, then the stateid and the seqid from @arg. */
+static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_OPEN_CONFIRM, decode_open_confirm_maxsz, hdr);
+ encode_nfs4_stateid(xdr, arg->stateid);
+ encode_nfs4_seqid(xdr, arg->seqid);
+}
+
+/* Append an OPEN_DOWNGRADE op: op header, stateid, seqid and the new share access. */
+static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &arg->stateid);
+ encode_nfs4_seqid(xdr, arg->seqid);
+ encode_share_access(xdr, arg->share_access);
+}
+
+/* Append a PUTFH op: op header followed by the file handle bytes of @fh. */
+static void
+encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_PUTFH, decode_putfh_maxsz, hdr);
+ encode_string(xdr, fh->size, fh->data);
+}
+
+/* Append a PUTROOTFH op; it consists of the op header only. */
+static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr);
+}
+
+/* Append a READ op: op header, stateid, 64-bit offset and 32-bit count. */
+static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args,
+		struct compound_hdr *hdr)
+{
+	__be32 *body;
+
+	encode_op_hdr(xdr, OP_READ, decode_read_maxsz, hdr);
+	encode_nfs4_stateid(xdr, &args->stateid);
+
+	/* 12 bytes: words 0-1 hold the offset, word 2 the count */
+	body = reserve_space(xdr, 12);
+	xdr_encode_hyper(body, args->offset);
+	body[2] = cpu_to_be32(args->count);
+}
+
+/*
+ * Append a READDIR op: op header, cookie, verifier, dircount, maxcount and
+ * the bitmap of attributes wanted for each entry.
+ *
+ * The attribute request starts from RDATTR_ERROR and MOUNTED_ON_FILEID, is
+ * widened for readdirplus, and is then restricted to readdir->bitmask.
+ * dircount is half of readdir->count, or a quarter for readdirplus.
+ * @req is not used here.
+ */
+static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
+{
+	uint32_t attrs[3] = {
+		FATTR4_WORD0_RDATTR_ERROR,
+		FATTR4_WORD1_MOUNTED_ON_FILEID,
+	};
+	uint32_t dircount = readdir->count >> 1;
+	uint32_t attrlen = 0;
+	__be32 verf[2];
+	__be32 *p;
+	unsigned int i;
+
+	if (readdir->plus) {
+		attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
+			FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE|FATTR4_WORD0_FILEID;
+		attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER|
+			FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
+			FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
+			FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+		attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
+		dircount >>= 1;
+	}
+
+	/* Fall back to plain fileid when mounted_on_fileid is unsupported */
+	if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID))
+		attrs[0] |= FATTR4_WORD0_FILEID;
+
+	/* Restrict to the bitmask; attrlen ends up one past the last non-zero word */
+	for (i = 0; i < ARRAY_SIZE(attrs); i++) {
+		attrs[i] &= readdir->bitmask[i];
+		if (attrs[i])
+			attrlen = i + 1;
+	}
+
+	encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
+	encode_uint64(xdr, readdir->cookie);
+	encode_nfs4_verifier(xdr, &readdir->verifier);
+
+	p = reserve_space(xdr, 12 + (attrlen << 2));
+	p[0] = cpu_to_be32(dircount);
+	p[1] = cpu_to_be32(readdir->count);
+	p[2] = cpu_to_be32(attrlen);
+	for (i = 0; i < attrlen; i++)
+		p[3 + i] = cpu_to_be32(attrs[i]);
+
+	/* verf is only needed for the debug message below */
+	memcpy(verf, readdir->verifier.data, sizeof(verf));
+
+	dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n",
+			__func__,
+			(unsigned long long)readdir->cookie,
+			verf[0], verf[1],
+			attrs[0] & readdir->bitmask[0],
+			attrs[1] & readdir->bitmask[1],
+			attrs[2] & readdir->bitmask[2]);
+}
+
+/* Append a READLINK op; only the op header is emitted (@readlink and @req are unused). */
+static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_READLINK, decode_readlink_maxsz, hdr);
+}
+
+/* Append a REMOVE op: op header followed by the entry @name. */
+static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_REMOVE, decode_remove_maxsz, hdr);
+ encode_string(xdr, name->len, name->name);
+}
+
+/* Append a RENAME op: op header, then the old name and the new name. */
+static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_RENAME, decode_rename_maxsz, hdr);
+ encode_string(xdr, oldname->len, oldname->name);
+ encode_string(xdr, newname->len, newname->name);
+}
+
+/* Append a RENEW op: op header followed by the 64-bit client id @clid. */
+static void encode_renew(struct xdr_stream *xdr, clientid4 clid,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_RENEW, decode_renew_maxsz, hdr);
+ encode_uint64(xdr, clid);
+}
+
+/* Append a RESTOREFH op; it consists of the op header only. */
+static void
+encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr);
+}
+
+/*
+ * Append a SETATTR op that sets only the ACL attribute: op header, the zero
+ * stateid, a one-word bitmap with FATTR4_WORD0_ACL, the ACL length, and the
+ * ACL data itself taken from arg->acl_pages.
+ */
+static void
+encode_setacl(struct xdr_stream *xdr, const struct nfs_setaclargs *arg,
+		struct compound_hdr *hdr)
+{
+	__be32 *bitmap;
+	__be32 *length;
+
+	encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr);
+	encode_nfs4_stateid(xdr, &zero_stateid);
+
+	bitmap = reserve_space(xdr, 2*4);
+	bitmap[0] = cpu_to_be32(1);		/* bitmap length in words */
+	bitmap[1] = cpu_to_be32(FATTR4_WORD0_ACL);
+
+	length = reserve_space(xdr, 4);
+	*length = cpu_to_be32(arg->acl_len);
+	xdr_write_pages(xdr, arg->acl_pages, 0, arg->acl_len);
+}
+
+/* Append a SAVEFH op; it consists of the op header only. */
+static void
+encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_SAVEFH, decode_savefh_maxsz, hdr);
+}
+
+/*
+ * Append a SETATTR op: op header, stateid, then the attributes from
+ * arg->iap / arg->label limited by the server's attr_bitmask.  No umask is
+ * passed to encode_attrs().
+ */
+static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &arg->stateid);
+ encode_attrs(xdr, arg->iap, arg->label, NULL, server,
+ server->attr_bitmask);
+}
+
+/*
+ * Append a SETCLIENTID op: op header, verifier, client owner id string,
+ * callback program number, callback netid and address strings, and the
+ * callback ident.
+ */
+static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
+{
+	const struct nfs_client *clp = setclientid->sc_clnt;
+	__be32 *word;
+
+	encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr);
+	encode_nfs4_verifier(xdr, setclientid->sc_verifier);
+
+	encode_string(xdr, strlen(clp->cl_owner_id), clp->cl_owner_id);
+
+	word = reserve_space(xdr, 4);
+	*word = cpu_to_be32(setclientid->sc_prog);
+
+	encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
+	encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
+
+	word = reserve_space(xdr, 4);
+	*word = cpu_to_be32(clp->cl_cb_ident);
+}
+
+/* Append a SETCLIENTID_CONFIRM op: op header, 64-bit client id, confirm verifier. */
+static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM,
+ decode_setclientid_confirm_maxsz, hdr);
+ encode_uint64(xdr, arg->clientid);
+ encode_nfs4_verifier(xdr, &arg->confirm);
+}
+
+/*
+ * Append a WRITE op: op header, stateid, 64-bit offset, stability level and
+ * byte count, followed by the data pages.
+ */
+static void encode_write(struct xdr_stream *xdr, const struct nfs_pgio_args *args,
+		struct compound_hdr *hdr)
+{
+	__be32 *body;
+
+	encode_op_hdr(xdr, OP_WRITE, decode_write_maxsz, hdr);
+	encode_nfs4_stateid(xdr, &args->stateid);
+
+	/* 16 bytes: offset (words 0-1), stable (word 2), count (word 3) */
+	body = reserve_space(xdr, 16);
+	xdr_encode_hyper(body, args->offset);
+	body[2] = cpu_to_be32(args->stable);
+	body[3] = cpu_to_be32(args->count);
+
+	xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
+}
+
+/* Append a DELEGRETURN op: op header followed by the delegation @stateid. */
+static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_DELEGRETURN, decode_delegreturn_maxsz, hdr);
+ encode_nfs4_stateid(xdr, stateid);
+}
+
+/* Append a SECINFO op: op header followed by the component @name. */
+static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_SECINFO, decode_secinfo_maxsz, hdr);
+ encode_string(xdr, name->len, name->name);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/* NFSv4.1 operations */
+/*
+ * Append a BIND_CONN_TO_SESSION op: op header, session id, channel
+ * direction, and a boolean saying whether the connection is to be used in
+ * RDMA mode.
+ *
+ * The trailing words are reserved through reserve_space(), like every other
+ * encoder in this file, so a failed reservation is caught by its BUG_ON()
+ * instead of being dereferenced unchecked.
+ */
+static void encode_bind_conn_to_session(struct xdr_stream *xdr,
+		const struct nfs41_bind_conn_to_session_args *args,
+		struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION,
+		decode_bind_conn_to_session_maxsz, hdr);
+	encode_opaque_fixed(xdr, args->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+	p = reserve_space(xdr, 8);
+	*p++ = cpu_to_be32(args->dir);
+	*p = (args->use_conn_in_rdma_mode) ? cpu_to_be32(1) : cpu_to_be32(0);
+}
+
+/*
+ * Encode an operation map as a word count (NFS4_OP_MAP_NUM_WORDS) followed
+ * by that many 32-bit words from @op_map.
+ */
+static void encode_op_map(struct xdr_stream *xdr, const struct nfs4_op_map *op_map)
+{
+ unsigned int i;
+ encode_uint32(xdr, NFS4_OP_MAP_NUM_WORDS);
+ for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++)
+ encode_uint32(xdr, op_map->u.words[i]);
+}
+
+/*
+ * Append an EXCHANGE_ID op: op header, verifier, client owner id, flags,
+ * state protection (none, or the enforce/allow op maps for SP4_MACH_CRED),
+ * and an optional single-entry implementation id.
+ *
+ * The implementation name is "<sysname> <release> <version> <machine>" from
+ * utsname().  It is formatted with scnprintf() so that the length passed to
+ * encode_string() can never exceed what was actually written to impl_name,
+ * even if the text were truncated.
+ */
+static void encode_exchange_id(struct xdr_stream *xdr,
+		const struct nfs41_exchange_id_args *args,
+		struct compound_hdr *hdr)
+{
+	__be32 *p;
+	char impl_name[IMPL_NAME_LIMIT];
+	int len = 0;
+
+	encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr);
+	encode_nfs4_verifier(xdr, &args->verifier);
+
+	encode_string(xdr, strlen(args->client->cl_owner_id),
+			args->client->cl_owner_id);
+
+	encode_uint32(xdr, args->flags);
+	encode_uint32(xdr, args->state_protect.how);
+
+	switch (args->state_protect.how) {
+	case SP4_NONE:
+		break;
+	case SP4_MACH_CRED:
+		encode_op_map(xdr, &args->state_protect.enforce);
+		encode_op_map(xdr, &args->state_protect.allow);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+
+	/*
+	 * Only send an implementation id when it is enabled and a non-empty
+	 * domain string is configured.
+	 * NOTE(review): the upper bound compares the domain size against
+	 * sizeof(impl_name) + 1, although the domain is not stored in
+	 * impl_name - kept as is, confirm the intent.
+	 */
+	if (send_implementation_id &&
+	    sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 &&
+	    sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN)
+		<= sizeof(impl_name) + 1)
+		len = scnprintf(impl_name, sizeof(impl_name), "%s %s %s %s",
+				utsname()->sysname, utsname()->release,
+				utsname()->version, utsname()->machine);
+
+	if (len > 0) {
+		encode_uint32(xdr, 1);	/* implementation id array length=1 */
+
+		encode_string(xdr,
+			sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - 1,
+			CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN);
+		encode_string(xdr, len, impl_name);
+		/* just send zeros for nii_date - the date is in nii_name */
+		p = reserve_space(xdr, 12);
+		p = xdr_encode_hyper(p, 0);
+		*p = cpu_to_be32(0);
+	} else
+		encode_uint32(xdr, 0);	/* implementation id array length=0 */
+}
+
+/*
+ * Append a CREATE_SESSION op: client id, sequence id and flags, the fore and
+ * back channel attributes, the callback program, and a single AUTH_UNIX
+ * callback security entry built from the RPC client's node name.
+ *
+ * Everything after the op header is written into one reservation, so the
+ * size passed to reserve_space() must match the writes that follow.
+ */
+static void encode_create_session(struct xdr_stream *xdr,
+ const struct nfs41_create_session_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+ struct nfs_client *clp = args->client;
+ struct rpc_clnt *clnt = clp->cl_rpcclient;
+ struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+ u32 max_resp_sz_cached;
+
+ /*
+ * Assumes OPEN is the biggest non-idempotent compound.
+ * 2 is the verifier.
+ */
+ max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + 2)
+ * XDR_UNIT + RPC_MAX_AUTH_SIZE;
+
+ encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr);
+ /*
+ * 16: clientid + seqid + flags; 2*28: two channels of 7 words each;
+ * 20: cb_program, entry count, flavor, stamp, nodename length;
+ * cl_nodelen: nodename bytes; 12: uid, gid, gid count.
+ */
+ p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12);
+ p = xdr_encode_hyper(p, args->clientid);
+ *p++ = cpu_to_be32(args->seqid); /*Sequence id */
+ *p++ = cpu_to_be32(args->flags); /*flags */
+
+ /* Fore Channel */
+ *p++ = cpu_to_be32(0); /* header padding size */
+ *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */
+ *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */
+ *p++ = cpu_to_be32(max_resp_sz_cached); /* Max resp sz cached */
+ *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */
+ *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */
+ *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
+
+ /* Back Channel */
+ *p++ = cpu_to_be32(0); /* header padding size */
+ *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */
+ *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */
+ *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */
+ *p++ = cpu_to_be32(args->bc_attrs.max_ops); /* max operations */
+ *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */
+ *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
+
+ *p++ = cpu_to_be32(args->cb_program); /* cb_program */
+ /* one callback security entry follows */
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */
+
+ /* authsys_parms rfc1831 */
+ *p++ = cpu_to_be32(ktime_to_ns(nn->boot_time)); /* stamp */
+ p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
+ *p++ = cpu_to_be32(0); /* UID */
+ *p++ = cpu_to_be32(0); /* GID */
+ *p = cpu_to_be32(0); /* No more gids */
+}
+
+/* Append a DESTROY_SESSION op: op header followed by the session id of @session. */
+static void encode_destroy_session(struct xdr_stream *xdr,
+ const struct nfs4_session *session,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_DESTROY_SESSION, decode_destroy_session_maxsz, hdr);
+ encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+}
+
+/* Append a DESTROY_CLIENTID op: op header followed by the 64-bit @clientid. */
+static void encode_destroy_clientid(struct xdr_stream *xdr,
+ uint64_t clientid,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_DESTROY_CLIENTID, decode_destroy_clientid_maxsz, hdr);
+ encode_uint64(xdr, clientid);
+}
+
+/* Append a RECLAIM_COMPLETE op: op header followed by args->one_fs as a 32-bit value. */
+static void encode_reclaim_complete(struct xdr_stream *xdr,
+ const struct nfs41_reclaim_complete_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_RECLAIM_COMPLETE, decode_reclaim_complete_maxsz, hdr);
+ encode_uint32(xdr, args->one_fs);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Append a SEQUENCE op for the slot in @args: session id, slot sequence
+ * number, slot number, highest used slot id and the cache_this flag.
+ *
+ * Nothing is encoded when the slot's table has no session, and the whole
+ * body is compiled out without CONFIG_NFS_V4_1.
+ */
+static void encode_sequence(struct xdr_stream *xdr,
+		const struct nfs4_sequence_args *args,
+		struct compound_hdr *hdr)
+{
+#if defined(CONFIG_NFS_V4_1)
+	struct nfs4_slot *slot = args->sa_slot;
+	struct nfs4_slot_table *tp = slot->table;
+	struct nfs4_session *session = tp->session;
+	__be32 *p;
+
+	if (session == NULL)
+		return;
+
+	encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr);
+
+	/*
+	 * Sessionid + seqid + slotid + max slotid + cache_this
+	 */
+	dprintk("%s: sessionid=%u:%u:%u:%u seqid=%d slotid=%d "
+		"max_slotid=%d cache_this=%d\n",
+		__func__,
+		((u32 *)session->sess_id.data)[0],
+		((u32 *)session->sess_id.data)[1],
+		((u32 *)session->sess_id.data)[2],
+		((u32 *)session->sess_id.data)[3],
+		slot->seq_nr, slot->slot_nr,
+		tp->highest_used_slotid, args->sa_cache_this);
+
+	p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16);
+	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+	p[0] = cpu_to_be32(slot->seq_nr);
+	p[1] = cpu_to_be32(slot->slot_nr);
+	p[2] = cpu_to_be32(tp->highest_used_slotid);
+	p[3] = cpu_to_be32(args->sa_cache_this);
+#endif /* CONFIG_NFS_V4_1 */
+}
+
+#ifdef CONFIG_NFS_V4_1
+/*
+ * Emit a GETDEVICEINFO operation: device id, layout type and maxcount,
+ * then a one-word notification bitmap.
+ */
+static void
+encode_getdeviceinfo(struct xdr_stream *xdr,
+		const struct nfs4_getdeviceinfo_args *args,
+		struct compound_hdr *hdr)
+{
+	__be32 *wp;
+
+	encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr);
+
+	/* Device id + layout type + gdia_maxcount. */
+	wp = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4);
+	wp = xdr_encode_opaque_fixed(wp, args->pdev->dev_id.data,
+				     NFS4_DEVICEID4_SIZE);
+	*wp++ = cpu_to_be32(args->pdev->layout_type);
+	*wp++ = cpu_to_be32(args->pdev->maxcount);
+
+	/* Bitmap length (always 1) + the notify types word. */
+	wp = reserve_space(xdr, 4 + 4);
+	*wp++ = cpu_to_be32(1);
+	*wp = cpu_to_be32(args->notify_types);
+}
+
+/*
+ * Emit a LAYOUTGET operation.
+ *
+ * Encodes, in order: the "signal layout available" flag (always 0), layout
+ * type, iomode, offset, length and minlength (36 bytes in total), followed
+ * by the stateid and maxcount.
+ */
+static void
+encode_layoutget(struct xdr_stream *xdr,
+		const struct nfs4_layoutget_args *args,
+		struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_LAYOUTGET, decode_layoutget_maxsz, hdr);
+	p = reserve_space(xdr, 36);
+	*p++ = cpu_to_be32(0);     /* Signal layout available */
+	*p++ = cpu_to_be32(args->type);
+	*p++ = cpu_to_be32(args->range.iomode);
+	p = xdr_encode_hyper(p, args->range.offset);
+	p = xdr_encode_hyper(p, args->range.length);
+	p = xdr_encode_hyper(p, args->minlength);
+	encode_nfs4_stateid(xdr, &args->stateid);
+	encode_uint32(xdr, args->maxcount);
+
+	/*
+	 * offset and length are encoded as 64-bit hypers above; print them
+	 * as unsigned long long so the log is not truncated where
+	 * unsigned long is only 32 bits wide.
+	 */
+	dprintk("%s: 1st type:0x%x iomode:%d off:%llu len:%llu mc:%d\n",
+		__func__,
+		args->type,
+		args->range.iomode,
+		(unsigned long long)args->range.offset,
+		(unsigned long long)args->range.length,
+		args->maxcount);
+}
+
+/*
+ * Emit a LAYOUTCOMMIT operation covering the file from offset 0.
+ *
+ * @inode is not used by this function; the inode is taken from @args.
+ * Always returns 0.
+ */
+static int
+encode_layoutcommit(struct xdr_stream *xdr,
+		struct inode *inode,
+		const struct nfs4_layoutcommit_args *args,
+		struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
+		NFS_SERVER(args->inode)->pnfs_curr_ld->id);
+
+	encode_op_hdr(xdr, OP_LAYOUTCOMMIT, decode_layoutcommit_maxsz, hdr);
+
+	/* offset (8) + length (8) + reclaim (4); only whole file layouts */
+	p = reserve_space(xdr, 20);
+	p = xdr_encode_hyper(p, 0);
+	p = xdr_encode_hyper(p, args->lastbytewritten + 1);
+	*p = cpu_to_be32(0);
+	encode_nfs4_stateid(xdr, &args->stateid);
+
+	if (args->lastbytewritten == U64_MAX) {
+		/* newoffset = FALSE, then the two trailing words */
+		p = reserve_space(xdr, 12);
+		*p++ = cpu_to_be32(0);
+	} else {
+		/* newoffset = TRUE + offset, then the two trailing words */
+		p = reserve_space(xdr, 20);
+		*p++ = cpu_to_be32(1);
+		p = xdr_encode_hyper(p, args->lastbytewritten);
+	}
+	/* Never send time_modify_changed */
+	*p++ = cpu_to_be32(0);
+	/* Layout type */
+	*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);
+
+	encode_uint32(xdr, args->layoutupdate_len);
+	if (args->layoutupdate_pages)
+		xdr_write_pages(xdr, args->layoutupdate_pages, 0,
+				args->layoutupdate_len);
+
+	return 0;
+}
+
+/*
+ * Emit a LAYOUTRETURN operation of type RETURN_FILE.
+ *
+ * The stateid is encoded while holding the inode's i_lock. The trailing
+ * body is produced by the layout driver's encode callback when one is
+ * set; otherwise a single zero word is written.
+ */
+static void
+encode_layoutreturn(struct xdr_stream *xdr,
+		const struct nfs4_layoutreturn_args *args,
+		struct compound_hdr *hdr)
+{
+	__be32 *wp;
+
+	encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr);
+
+	wp = reserve_space(xdr, 16);
+	*wp++ = cpu_to_be32(0);		/* reclaim. always 0 for now */
+	*wp++ = cpu_to_be32(args->layout_type);
+	*wp++ = cpu_to_be32(args->range.iomode);
+	*wp = cpu_to_be32(RETURN_FILE);
+
+	wp = reserve_space(xdr, 16);
+	wp = xdr_encode_hyper(wp, args->range.offset);
+	wp = xdr_encode_hyper(wp, args->range.length);
+
+	spin_lock(&args->inode->i_lock);
+	encode_nfs4_stateid(xdr, &args->stateid);
+	spin_unlock(&args->inode->i_lock);
+
+	if (!args->ld_private->ops || !args->ld_private->ops->encode) {
+		encode_uint32(xdr, 0);
+		return;
+	}
+	args->ld_private->ops->encode(xdr, args, args->ld_private);
+}
+
+/*
+ * Emit the SECINFO_NO_NAME op header followed by the requested style.
+ * Always returns 0.
+ */
+static int
+encode_secinfo_no_name(struct xdr_stream *xdr,
+		const struct nfs41_secinfo_no_name_args *args,
+		struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_SECINFO_NO_NAME,
+		      decode_secinfo_no_name_maxsz, hdr);
+	encode_uint32(xdr, args->style);
+
+	return 0;
+}
+
+/* Emit the TEST_STATEID op header, a count of one, and the single stateid. */
+static void encode_test_stateid(struct xdr_stream *xdr,
+		const struct nfs41_test_stateid_args *args,
+		struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_TEST_STATEID, decode_test_stateid_maxsz, hdr);
+	encode_uint32(xdr, 1);		/* number of stateids that follow */
+	encode_nfs4_stateid(xdr, args->stateid);
+}
+
+/* Emit the FREE_STATEID op header followed by the stateid held in @args. */
+static void encode_free_stateid(struct xdr_stream *xdr,
+		const struct nfs41_free_stateid_args *args,
+		struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr);
+	encode_nfs4_stateid(xdr, &args->stateid);
+}
+#else
+/*
+ * Stub used when CONFIG_NFS_V4_1 is not defined: encodes nothing, so
+ * callers can invoke encode_layoutreturn() unconditionally.
+ */
+static inline void
+encode_layoutreturn(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ struct compound_hdr *hdr)
+{
+}
+
+/*
+ * Stub used when CONFIG_NFS_V4_1 is not defined: encodes nothing, so
+ * callers can invoke encode_layoutget() unconditionally. Declared inline
+ * to match the encode_layoutreturn() stub.
+ */
+static inline void
+encode_layoutget(struct xdr_stream *xdr,
+		const struct nfs4_layoutget_args *args,
+		struct compound_hdr *hdr)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * END OF "GENERIC" ENCODE ROUTINES.
+ */
+
+/*
+ * Return the minor version to place in the compound header: the value
+ * from the session's client when CONFIG_NFS_V4_1 is defined and the
+ * slot table has a session, otherwise 0.
+ */
+static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
+{
+	u32 minor = 0;
+#if defined(CONFIG_NFS_V4_1)
+	struct nfs4_session *session = args->sa_slot->table->session;
+
+	if (session != NULL)
+		minor = session->clp->cl_mvops->minor_version;
+#endif /* CONFIG_NFS_V4_1 */
+	return minor;
+}
+
+/*
+ * ACCESS compound: sequence, putfh, access, and a getfattr only when the
+ * caller supplied an attribute bitmask.
+ */
+static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_accessargs *access_args = data;
+	const struct nfs4_sequence_args *seq = &access_args->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, access_args->fh, &compound);
+	encode_access(xdr, access_args->access, &compound);
+	if (access_args->bitmask != NULL)
+		encode_getfattr(xdr, access_args->bitmask, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * LOOKUP compound: sequence, putfh (directory), lookup, getfh, getfattr.
+ */
+static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_lookup_arg *lookup = data;
+	const struct nfs4_sequence_args *seq = &lookup->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, lookup->dir_fh, &compound);
+	encode_lookup(xdr, lookup->name, &compound);
+	encode_getfh(xdr, &compound);
+	encode_getfattr(xdr, lookup->bitmask, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * LOOKUPP compound: sequence, putfh, lookupp, getfh, getfattr.
+ */
+static void nfs4_xdr_enc_lookupp(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_lookupp_arg *lookupp = data;
+	const struct nfs4_sequence_args *seq = &lookupp->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, lookupp->fh, &compound);
+	encode_lookupp(xdr, &compound);
+	encode_getfh(xdr, &compound);
+	encode_getfattr(xdr, lookupp->bitmask, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * LOOKUP_ROOT compound: sequence, putrootfh, getfh, getfattr.
+ */
+static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_lookup_root_arg *root = data;
+	const struct nfs4_sequence_args *seq = &root->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putrootfh(xdr, &compound);
+	encode_getfh(xdr, &compound);
+	encode_getfattr(xdr, root->bitmask, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * REMOVE compound: sequence, putfh, remove (by name).
+ */
+static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_removeargs *rm = data;
+	const struct nfs4_sequence_args *seq = &rm->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, rm->fh, &compound);
+	encode_remove(xdr, &rm->name, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * RENAME compound: sequence, putfh (old dir), savefh, putfh (new dir),
+ * rename (old name -> new name).
+ */
+static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_renameargs *mv = data;
+	const struct nfs4_sequence_args *seq = &mv->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, mv->old_dir, &compound);
+	encode_savefh(xdr, &compound);
+	encode_putfh(xdr, mv->new_dir, &compound);
+	encode_rename(xdr, mv->old_name, mv->new_name, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * LINK compound: sequence, putfh (file), savefh, putfh (directory), link,
+ * restorefh, getfattr.
+ */
+static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_link_arg *ln = data;
+	const struct nfs4_sequence_args *seq = &ln->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, ln->fh, &compound);
+	encode_savefh(xdr, &compound);
+	encode_putfh(xdr, ln->dir_fh, &compound);
+	encode_link(xdr, ln->name, &compound);
+	encode_restorefh(xdr, &compound);
+	encode_getfattr(xdr, ln->bitmask, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * CREATE compound: sequence, putfh (directory), create, getfh, getfattr.
+ */
+static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_create_arg *create = data;
+	const struct nfs4_sequence_args *seq = &create->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, create->dir_fh, &compound);
+	encode_create(xdr, create, &compound);
+	encode_getfh(xdr, &compound);
+	encode_getfattr(xdr, create->bitmask, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * SYMLINK request: encoded exactly like CREATE, so simply forward the
+ * argument block to nfs4_xdr_enc_create().
+ */
+static void nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	nfs4_xdr_enc_create(req, xdr, data);
+}
+
+/*
+ * GETATTR compound: sequence, putfh, getfattr.
+ */
+static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_getattr_arg *getattr = data;
+	const struct nfs4_sequence_args *seq = &getattr->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, getattr->fh, &compound);
+	encode_getfattr(xdr, getattr->bitmask, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * CLOSE compound: sequence, putfh, optional layoutreturn, optional
+ * getfattr, close.
+ */
+static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_closeargs *close = data;
+	const struct nfs4_sequence_args *seq = &close->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, close->fh, &compound);
+	if (close->lr_args != NULL)
+		encode_layoutreturn(xdr, close->lr_args, &compound);
+	if (close->bitmask != NULL)
+		encode_getfattr(xdr, close->bitmask, &compound);
+	encode_close(xdr, close, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * OPEN compound: sequence, putfh, open, getfh, optional access,
+ * getfattr_open, and an optional layoutget whose reply pages are set up
+ * from the layoutget arguments.
+ */
+static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_openargs *open = data;
+	const struct nfs4_sequence_args *seq = &open->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, open->fh, &compound);
+	encode_open(xdr, open, &compound);
+	encode_getfh(xdr, &compound);
+	if (open->access)
+		encode_access(xdr, open->access, &compound);
+	encode_getfattr_open(xdr, open->bitmask, open->open_bitmap, &compound);
+	if (open->lg_args != NULL) {
+		encode_layoutget(xdr, open->lg_args, &compound);
+		/* Uses the reply length accumulated so far in the header. */
+		rpc_prepare_reply_pages(req, open->lg_args->layout.pages, 0,
+					open->lg_args->layout.pglen,
+					compound.replen);
+	}
+	encode_nops(&compound);
+}
+
+/*
+ * OPEN_CONFIRM compound: putfh, open_confirm. No sequence op is encoded
+ * and the header is otherwise zero-initialised.
+ */
+static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_open_confirmargs *confirm = data;
+	struct compound_hdr compound = {
+		.nops = 0,
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_putfh(xdr, confirm->fh, &compound);
+	encode_open_confirm(xdr, confirm, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * OPEN compound without a getfh: sequence, putfh, open, optional access,
+ * getfattr_open, and an optional layoutget whose reply pages are set up
+ * from the layoutget arguments.
+ */
+static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_openargs *open = data;
+	const struct nfs4_sequence_args *seq = &open->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, open->fh, &compound);
+	encode_open(xdr, open, &compound);
+	if (open->access)
+		encode_access(xdr, open->access, &compound);
+	encode_getfattr_open(xdr, open->bitmask, open->open_bitmap, &compound);
+	if (open->lg_args != NULL) {
+		encode_layoutget(xdr, open->lg_args, &compound);
+		/* Uses the reply length accumulated so far in the header. */
+		rpc_prepare_reply_pages(req, open->lg_args->layout.pages, 0,
+					open->lg_args->layout.pglen,
+					compound.replen);
+	}
+	encode_nops(&compound);
+}
+
+/*
+ * OPEN_DOWNGRADE compound: sequence, putfh, optional layoutreturn,
+ * open_downgrade.
+ */
+static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_closeargs *downgrade = data;
+	const struct nfs4_sequence_args *seq = &downgrade->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, downgrade->fh, &compound);
+	if (downgrade->lr_args != NULL)
+		encode_layoutreturn(xdr, downgrade->lr_args, &compound);
+	encode_open_downgrade(xdr, downgrade, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * LOCK compound: sequence, putfh, lock.
+ */
+static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_lock_args *lock = data;
+	const struct nfs4_sequence_args *seq = &lock->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, lock->fh, &compound);
+	encode_lock(xdr, lock, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * LOCKT compound: sequence, putfh, lockt.
+ */
+static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_lockt_args *lockt = data;
+	const struct nfs4_sequence_args *seq = &lockt->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, lockt->fh, &compound);
+	encode_lockt(xdr, lockt, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * LOCKU compound: sequence, putfh, locku.
+ */
+static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_locku_args *locku = data;
+	const struct nfs4_sequence_args *seq = &locku->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, locku->fh, &compound);
+	encode_locku(xdr, locku, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * RELEASE_LOCKOWNER compound: a single release_lockowner op, always
+ * encoded with minor version 0 and without a sequence op.
+ */
+static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_release_lockowner_args *release = data;
+	struct compound_hdr compound = {
+		.minorversion = 0,
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_release_lockowner(xdr, &release->lock_owner, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * READLINK compound: sequence, putfh, readlink. The reply pages are set
+ * up from the caller's page array once the ops have been encoded.
+ */
+static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_readlink *readlink = data;
+	const struct nfs4_sequence_args *seq = &readlink->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, readlink->fh, &compound);
+	encode_readlink(xdr, readlink, req, &compound);
+
+	rpc_prepare_reply_pages(req, readlink->pages, readlink->pgbase,
+				readlink->pglen, compound.replen);
+	encode_nops(&compound);
+}
+
+/*
+ * READDIR compound: sequence, putfh, readdir. The reply pages are set up
+ * from the caller's page array once the ops have been encoded.
+ */
+static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_readdir_arg *readdir = data;
+	const struct nfs4_sequence_args *seq = &readdir->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, readdir->fh, &compound);
+	encode_readdir(xdr, readdir, req, &compound);
+
+	rpc_prepare_reply_pages(req, readdir->pages, readdir->pgbase,
+				readdir->count, compound.replen);
+	encode_nops(&compound);
+}
+
+/*
+ * READ compound: sequence, putfh, read. The reply pages are set up from
+ * the caller's page array and the receive buffer is flagged XDRBUF_READ.
+ */
+static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_pgio_args *rd = data;
+	const struct nfs4_sequence_args *seq = &rd->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, rd->fh, &compound);
+	encode_read(xdr, rd, &compound);
+
+	rpc_prepare_reply_pages(req, rd->pages, rd->pgbase,
+				rd->count, compound.replen);
+	req->rq_rcv_buf.flags |= XDRBUF_READ;
+	encode_nops(&compound);
+}
+
+/*
+ * SETATTR compound: sequence, putfh, setattr, getfattr.
+ */
+static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_setattrargs *setattr = data;
+	const struct nfs4_sequence_args *seq = &setattr->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, setattr->fh, &compound);
+	encode_setattr(xdr, setattr, setattr->server, &compound);
+	encode_getfattr(xdr, setattr->bitmask, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * GETACL compound: sequence, putfh, then a getattr requesting only the
+ * ACL attribute. The reply length is sampled after putfh (plus one op
+ * header) so the ACL data can be received into the caller's pages.
+ */
+static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_getaclargs *getacl = data;
+	const struct nfs4_sequence_args *seq = &getacl->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+	const __u32 acl_bitmap[] = { FATTR4_WORD0_ACL };
+	uint32_t hdr_words;
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, getacl->fh, &compound);
+
+	/* Must be read before encode_getattr() is called. */
+	hdr_words = compound.replen + op_decode_hdr_maxsz;
+	encode_getattr(xdr, acl_bitmap, NULL,
+		       ARRAY_SIZE(acl_bitmap), &compound);
+
+	rpc_prepare_reply_pages(req, getacl->acl_pages, 0,
+				getacl->acl_len, hdr_words + 1);
+	encode_nops(&compound);
+}
+
+/*
+ * WRITE compound: sequence, putfh, write, and a getfattr only when the
+ * caller supplied a bitmask. The send buffer is flagged XDRBUF_WRITE.
+ */
+static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_pgio_args *wr = data;
+	const struct nfs4_sequence_args *seq = &wr->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, wr->fh, &compound);
+	encode_write(xdr, wr, &compound);
+	req->rq_snd_buf.flags |= XDRBUF_WRITE;
+	if (wr->bitmask != NULL)
+		encode_getfattr(xdr, wr->bitmask, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * COMMIT compound: sequence, putfh, commit.
+ */
+static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_commitargs *commit = data;
+	const struct nfs4_sequence_args *seq = &commit->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, commit->fh, &compound);
+	encode_commit(xdr, commit, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * FSINFO compound: sequence, putfh, fsinfo (with the caller's bitmask).
+ */
+static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_fsinfo_arg *fsinfo = data;
+	const struct nfs4_sequence_args *seq = &fsinfo->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, fsinfo->fh, &compound);
+	encode_fsinfo(xdr, fsinfo->bitmask, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * PATHCONF compound: sequence, putfh, then a getattr built from
+ * nfs4_pathconf_bitmap and the caller's bitmask.
+ */
+static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_pathconf_arg *pathconf = data;
+	const struct nfs4_sequence_args *seq = &pathconf->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, pathconf->fh, &compound);
+	encode_getattr(xdr, nfs4_pathconf_bitmap, pathconf->bitmask,
+		       ARRAY_SIZE(nfs4_pathconf_bitmap), &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * STATFS compound: sequence, putfh, then a getattr built from
+ * nfs4_statfs_bitmap and the caller's bitmask.
+ */
+static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_statfs_arg *statfs = data;
+	const struct nfs4_sequence_args *seq = &statfs->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, statfs->fh, &compound);
+	encode_getattr(xdr, nfs4_statfs_bitmap, statfs->bitmask,
+		       ARRAY_SIZE(nfs4_statfs_bitmap), &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * GETATTR_BITMAP (server capabilities) compound: sequence, putfh, then a
+ * getattr over the caller's bitmask, passed with a length of 3 words.
+ */
+static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_server_caps_arg *caps = data;
+	const struct nfs4_sequence_args *seq = &caps->seq_args;
+	const u32 *wanted = caps->bitmask;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, caps->fhandle, &compound);
+	encode_getattr(xdr, wanted, NULL, 3, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * RENEW compound: a single renew op for the client's clientid. No
+ * sequence op is encoded.
+ */
+static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_client *client = data;
+	struct compound_hdr compound = {
+		.nops = 0,
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_renew(xdr, client->cl_clientid, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * SETCLIENTID compound: a single setclientid op. No sequence op is
+ * encoded.
+ */
+static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_setclientid *setclientid = data;
+	struct compound_hdr compound = {
+		.nops = 0,
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_setclientid(xdr, setclientid, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * SETCLIENTID_CONFIRM compound: a single setclientid_confirm op. No
+ * sequence op is encoded.
+ */
+static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_setclientid_res *confirm = data;
+	struct compound_hdr compound = {
+		.nops = 0,
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_setclientid_confirm(xdr, confirm, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * DELEGRETURN compound: sequence, putfh, optional layoutreturn, optional
+ * getfattr, delegreturn.
+ */
+static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_delegreturnargs *delegreturn = data;
+	const struct nfs4_sequence_args *seq = &delegreturn->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, delegreturn->fhandle, &compound);
+	if (delegreturn->lr_args != NULL)
+		encode_layoutreturn(xdr, delegreturn->lr_args, &compound);
+	if (delegreturn->bitmask != NULL)
+		encode_getfattr(xdr, delegreturn->bitmask, &compound);
+	encode_delegreturn(xdr, delegreturn->stateid, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * FS_LOCATIONS compound.
+ *
+ * Migration case:  sequence, putfh (fh), fs_locations, optional renew.
+ * Otherwise:       sequence, putfh (dir_fh), lookup, fs_locations.
+ *
+ * The reply length is sampled immediately before fs_locations is encoded
+ * so the result can be received into the single caller-provided page.
+ */
+static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_fs_locations_arg *locations = data;
+	const struct nfs4_sequence_args *seq = &locations->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+	uint32_t hdr_words;
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+
+	if (locations->migration) {
+		encode_putfh(xdr, locations->fh, &compound);
+	} else {
+		encode_putfh(xdr, locations->dir_fh, &compound);
+		encode_lookup(xdr, locations->name, &compound);
+	}
+
+	hdr_words = compound.replen;
+	encode_fs_locations(xdr, locations->bitmask, &compound);
+
+	if (locations->migration && locations->renew)
+		encode_renew(xdr, locations->clientid, &compound);
+
+	rpc_prepare_reply_pages(req, (struct page **)&locations->page, 0,
+				PAGE_SIZE, hdr_words + 1);
+	encode_nops(&compound);
+}
+
+/*
+ * SECINFO compound: sequence, putfh (directory), secinfo (by name).
+ */
+static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_secinfo_arg *secinfo = data;
+	const struct nfs4_sequence_args *seq = &secinfo->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, secinfo->dir_fh, &compound);
+	encode_secinfo(xdr, secinfo->name, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * FSID_PRESENT compound: sequence, putfh, getfh, and a renew only when
+ * the caller asked for one.
+ */
+static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_fsid_present_arg *present = data;
+	const struct nfs4_sequence_args *seq = &present->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, present->fh, &compound);
+	encode_getfh(xdr, &compound);
+	if (present->renew)
+		encode_renew(xdr, present->clientid, &compound);
+	encode_nops(&compound);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * BIND_CONN_TO_SESSION compound: a single bind_conn_to_session op. The
+ * minor version comes from the client in the arguments.
+ */
+static void nfs4_xdr_enc_bind_conn_to_session(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs41_bind_conn_to_session_args *bind = data;
+	struct compound_hdr compound = {
+		.minorversion = bind->client->cl_mvops->minor_version,
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_bind_conn_to_session(xdr, bind, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * EXCHANGE_ID compound: a single exchange_id op. The minor version comes
+ * from the client in the arguments.
+ */
+static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs41_exchange_id_args *exchange = data;
+	struct compound_hdr compound = {
+		.minorversion = exchange->client->cl_mvops->minor_version,
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_exchange_id(xdr, exchange, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * CREATE_SESSION compound: a single create_session op. The minor version
+ * comes from the client in the arguments.
+ */
+static void nfs4_xdr_enc_create_session(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs41_create_session_args *create = data;
+	struct compound_hdr compound = {
+		.minorversion = create->client->cl_mvops->minor_version,
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_create_session(xdr, create, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * DESTROY_SESSION compound: a single destroy_session op. The minor
+ * version comes from the session's client.
+ */
+static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_session *sess = data;
+	struct compound_hdr compound = {
+		.minorversion = sess->clp->cl_mvops->minor_version,
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_destroy_session(xdr, sess, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * DESTROY_CLIENTID compound: a single destroy_clientid op for the
+ * client's clientid. The minor version comes from the client.
+ */
+static void nfs4_xdr_enc_destroy_clientid(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_client *client = data;
+	struct compound_hdr compound = {
+		.minorversion = client->cl_mvops->minor_version,
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_destroy_clientid(xdr, client->cl_clientid, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * SEQUENCE compound: the compound header followed by a sequence op only.
+ */
+static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_sequence_args *seq = data;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_nops(&compound);
+}
+
+#endif
+
+/*
+ * GET_LEASE_TIME compound: sequence, putrootfh, then an fsinfo request
+ * whose bitmap asks only for the lease time attribute.
+ */
+static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_get_lease_time_args *lease = data;
+	const struct nfs4_sequence_args *seq = &lease->la_seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+	const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 };
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putrootfh(xdr, &compound);
+	encode_fsinfo(xdr, lease_bitmap, &compound);
+	encode_nops(&compound);
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * RECLAIM_COMPLETE compound: sequence, reclaim_complete.
+ */
+static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs41_reclaim_complete_args *reclaim = data;
+	const struct nfs4_sequence_args *seq = &reclaim->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_reclaim_complete(xdr, reclaim, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * GETDEVICEINFO compound: sequence, getdeviceinfo.
+ *
+ * The reply length is sampled after the sequence op (plus one op header)
+ * and before getdeviceinfo is encoded; the device_addr4 opaque data is
+ * then received into the pages described by args->pdev.
+ */
+static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_getdeviceinfo_args *devinfo = data;
+	const struct nfs4_sequence_args *seq = &devinfo->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+	uint32_t hdr_words;
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+
+	hdr_words = compound.replen + op_decode_hdr_maxsz;
+
+	encode_getdeviceinfo(xdr, devinfo, &compound);
+
+	/* Set up the reply kvec. */
+	rpc_prepare_reply_pages(req, devinfo->pdev->pages,
+				devinfo->pdev->pgbase,
+				devinfo->pdev->pglen, hdr_words + 2 + 1);
+	encode_nops(&compound);
+}
+
+/*
+ * LAYOUTGET compound: sequence, putfh (file handle of args->inode),
+ * layoutget. The layout is received into the caller's layout pages.
+ */
+static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_layoutget_args *layoutget = data;
+	const struct nfs4_sequence_args *seq = &layoutget->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, NFS_FH(layoutget->inode), &compound);
+	encode_layoutget(xdr, layoutget, &compound);
+
+	rpc_prepare_reply_pages(req, layoutget->layout.pages, 0,
+				layoutget->layout.pglen, compound.replen);
+	encode_nops(&compound);
+}
+
+/*
+ * LAYOUTCOMMIT compound: sequence, putfh (file handle of args->inode),
+ * layoutcommit, getfattr.
+ *
+ * The inode handed to encode_layoutcommit() is read directly from the
+ * argument block; that is the same field the enclosing
+ * nfs4_layoutcommit_data exposes as data->args.inode.
+ */
+static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *priv)
+{
+	const struct nfs4_layoutcommit_args *commit = priv;
+	const struct nfs4_sequence_args *seq = &commit->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, NFS_FH(commit->inode), &compound);
+	encode_layoutcommit(xdr, commit->inode, commit, &compound);
+	encode_getfattr(xdr, commit->bitmask, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * LAYOUTRETURN compound: sequence, putfh (file handle of args->inode),
+ * layoutreturn.
+ */
+static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs4_layoutreturn_args *layoutreturn = data;
+	const struct nfs4_sequence_args *seq = &layoutreturn->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putfh(xdr, NFS_FH(layoutreturn->inode), &compound);
+	encode_layoutreturn(xdr, layoutreturn, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * SECINFO_NO_NAME compound: sequence, putrootfh, secinfo_no_name. The
+ * helper's return value is not used.
+ */
+static void nfs4_xdr_enc_secinfo_no_name(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs41_secinfo_no_name_args *secinfo = data;
+	const struct nfs4_sequence_args *seq = &secinfo->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_putrootfh(xdr, &compound);
+	encode_secinfo_no_name(xdr, secinfo, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * TEST_STATEID compound: sequence, test_stateid.
+ */
+static void nfs4_xdr_enc_test_stateid(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs41_test_stateid_args *test = data;
+	const struct nfs4_sequence_args *seq = &test->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_test_stateid(xdr, test, &compound);
+	encode_nops(&compound);
+}
+
+/*
+ * FREE_STATEID compound: sequence, free_stateid.
+ */
+static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs41_free_stateid_args *free_args = data;
+	const struct nfs4_sequence_args *seq = &free_args->seq_args;
+	struct compound_hdr compound = {
+		.minorversion = nfs4_xdr_minorversion(seq),
+	};
+
+	encode_compound_hdr(xdr, req, &compound);
+	encode_sequence(xdr, seq, &compound);
+	encode_free_stateid(xdr, free_args, &compound);
+	encode_nops(&compound);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Decode a variable-length opaque of at most NFS4_OPAQUE_LIMIT bytes in
+ * place.  On success *string is set by xdr_stream_decode_opaque_inline(),
+ * *len receives the byte count and 0 is returned.  Any negative result from
+ * the helper is reported as -EIO and *len is left untouched.
+ */
+static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
+{
+	ssize_t nbytes;
+
+	nbytes = xdr_stream_decode_opaque_inline(xdr, (void **)string,
+						 NFS4_OPAQUE_LIMIT);
+	if (likely(nbytes >= 0)) {
+		*len = nbytes;
+		return 0;
+	}
+	return -EIO;
+}
+
+/*
+ * Decode the header of a COMPOUND reply: status, tag and operation count.
+ *
+ * Fills hdr->status, hdr->taglen, hdr->tag (pointing into the receive
+ * buffer) and hdr->nops.  Returns -EIO if the stream is too short or the
+ * tag length is implausible, the errno mapped from hdr->status if the reply
+ * carries no operations, and 0 otherwise.
+ */
+static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		return -EIO;
+	hdr->status = be32_to_cpup(p++);
+	hdr->taglen = be32_to_cpup(p);
+
+	/*
+	 * The tag length comes straight off the wire.  Without a bound,
+	 * "taglen + 4" can wrap around in 32-bit arithmetic, so the length
+	 * check below would cover far fewer bytes than the pointer advance
+	 * by XDR_QUADLEN(taglen) then skips.
+	 */
+	if (unlikely(hdr->taglen > NFS4_OPAQUE_LIMIT))
+		return -EIO;
+
+	/* Tag bytes (padded to a word boundary) plus the 4-byte op count. */
+	p = xdr_inline_decode(xdr, hdr->taglen + 4);
+	if (unlikely(!p))
+		return -EIO;
+	hdr->tag = (char *)p;
+	p += XDR_QUADLEN(hdr->taglen);
+	hdr->nops = be32_to_cpup(p);
+	if (unlikely(hdr->nops < 1))
+		return nfs4_stat_to_errno(hdr->status);
+	return 0;
+}
+
+/*
+ * Decode the 8-byte header of one operation result (opcode + status).
+ *
+ * Returns true when the opcode matched @expected; *nfs_retval is then 0
+ * for NFS_OK or the errno mapped from the server status.  Returns false
+ * when the stream is too short (*nfs_retval = -EIO) or the opcode is not
+ * the expected one (*nfs_retval = -EREMOTEIO).
+ */
+static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected,
+		int *nfs_retval)
+{
+	__be32 *ptr;
+	uint32_t opnum;
+	int32_t nfserr;
+
+	ptr = xdr_inline_decode(xdr, 8);
+	if (unlikely(ptr == NULL)) {
+		*nfs_retval = -EIO;
+		return false;
+	}
+
+	opnum = be32_to_cpup(ptr++);
+	if (unlikely(opnum != expected)) {
+		dprintk("nfs: Server returned operation"
+			" %d but we issued a request for %d\n",
+				opnum, expected);
+		*nfs_retval = -EREMOTEIO;
+		return false;
+	}
+
+	if (likely(*ptr == cpu_to_be32(NFS_OK))) {
+		*nfs_retval = 0;
+		return true;
+	}
+
+	/* Right operation, but the server reported a failure for it. */
+	nfserr = be32_to_cpup(ptr);
+	trace_nfs4_xdr_status(xdr, opnum, nfserr);
+	*nfs_retval = nfs4_stat_to_errno(nfserr);
+	return true;
+}
+
+/*
+ * Convenience wrapper around __decode_op_hdr() that discards the boolean
+ * result and returns only the status code it produced.
+ */
+static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
+{
+	int status;
+
+	(void)__decode_op_hdr(xdr, expected, &status);
+	return status;
+}
+
+/*
+ * Dummy routine: consumes 12 bytes followed by one inline opaque from the
+ * stream and throws the decoded values away.  @ace is not used.
+ * Returns 0 on success or -EIO if the stream is too short.
+ */
+static int decode_ace(struct xdr_stream *xdr, void *ace)
+{
+	unsigned int ignored_len;
+	char *ignored_data;
+
+	if (unlikely(xdr_inline_decode(xdr, 12) == NULL))
+		return -EIO;
+	return decode_opaque_inline(xdr, &ignored_len, &ignored_data);
+}
+
+/*
+ * Decode a counted array of 32-bit words into @bitmap (room for @sz words).
+ *
+ * Returns the non-negative result of xdr_stream_decode_uint32_array() on
+ * success.  An -EMSGSIZE result from that helper is not treated as an
+ * error: @sz is returned instead.  Every other failure becomes -EIO.
+ */
+static ssize_t
+decode_bitmap4(struct xdr_stream *xdr, uint32_t *bitmap, size_t sz)
+{
+	ssize_t nwords = xdr_stream_decode_uint32_array(xdr, bitmap, sz);
+
+	if (likely(nwords >= 0))
+		return nwords;
+	if (nwords == -EMSGSIZE)
+		return sz;
+	return -EIO;
+}
+
+/*
+ * Decode a three-word attribute bitmap into @bitmap.
+ * Returns 0 on success or the negative error from decode_bitmap4().
+ */
+static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
+{
+	ssize_t nwords = decode_bitmap4(xdr, bitmap, 3);
+
+	if (nwords < 0)
+		return nwords;
+	return 0;
+}
+
+/*
+ * Read the 4-byte attribute length into *attrlen and record the stream
+ * position immediately after it in *savep (verify_attr_len() later
+ * measures consumed bytes from that position).
+ * Returns 0 on success or -EIO if the stream is too short.
+ */
+static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigned int *savep)
+{
+	__be32 *ptr = xdr_inline_decode(xdr, 4);
+
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	*attrlen = be32_to_cpup(ptr);
+	*savep = xdr_stream_pos(xdr);
+	return 0;
+}
+
+/*
+ * Decode the FATTR4_WORD0_SUPPORTED_ATTRS attribute into @bitmask.
+ *
+ * When the bit is set in bitmap[0], a three-word bitmap is decoded into
+ * @bitmask and the bit is cleared; otherwise @bitmask is zeroed.
+ * Returns 0, or the negative error from decode_attr_bitmap().
+ */
+static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
+{
+	int status;
+
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS))) {
+		bitmask[2] = 0;
+		bitmask[1] = 0;
+		bitmask[0] = 0;
+	} else {
+		status = decode_attr_bitmap(xdr, bitmask);
+		if (unlikely(status < 0))
+			return status;
+		bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
+	}
+	dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
+		bitmask[0], bitmask[1], bitmask[2]);
+	return 0;
+}
+
+/*
+ * Decode the FATTR4_WORD0_TYPE attribute.
+ *
+ * *type starts as 0.  Returns -EIO if any bit below FATTR4_WORD0_TYPE is
+ * still set in bitmap[0], if the stream is too short, or if the decoded
+ * value lies outside [NF4REG, NF4NAMEDATTR].  Returns NFS_ATTR_FATTR_TYPE
+ * when the attribute was decoded (its bit is cleared) and 0 when absent.
+ */
+static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type)
+{
+	__be32 *ptr;
+	int flags = 0;
+
+	*type = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_TYPE)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 4);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	*type = be32_to_cpup(ptr);
+	if (*type < NF4REG || *type > NF4NAMEDATTR) {
+		dprintk("%s: bad type %d\n", __func__, *type);
+		return -EIO;
+	}
+	bitmap[0] &= ~FATTR4_WORD0_TYPE;
+	flags = NFS_ATTR_FATTR_TYPE;
+out:
+	dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
+	return flags;
+}
+
+/*
+ * Decode the FATTR4_WORD0_FH_EXPIRE_TYPE attribute into *type (0 when the
+ * attribute is absent).  Returns -EIO if a lower bit of bitmap[0] is still
+ * set or the stream is too short, 0 otherwise.  The bit is cleared once
+ * the value has been read.
+ */
+static int decode_attr_fh_expire_type(struct xdr_stream *xdr,
+				      uint32_t *bitmap, uint32_t *type)
+{
+	__be32 *ptr;
+
+	*type = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FH_EXPIRE_TYPE - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 4);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	*type = be32_to_cpup(ptr);
+	bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE;
+out:
+	dprintk("%s: expire type=0x%x\n", __func__, *type);
+	return 0;
+}
+
+/*
+ * Decode the 64-bit FATTR4_WORD0_CHANGE attribute into *change (0 when
+ * absent).  Returns -EIO if a lower bit of bitmap[0] is still set or the
+ * stream is too short, NFS_ATTR_FATTR_CHANGE when the value was decoded
+ * (bit cleared), and 0 when the attribute was not present.
+ */
+static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
+{
+	__be32 *ptr;
+	int flags = 0;
+
+	*change = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_CHANGE)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 8);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	xdr_decode_hyper(ptr, change);
+	bitmap[0] &= ~FATTR4_WORD0_CHANGE;
+	flags = NFS_ATTR_FATTR_CHANGE;
+out:
+	dprintk("%s: change attribute=%Lu\n", __func__,
+		(unsigned long long)*change);
+	return flags;
+}
+
+/*
+ * Decode the 64-bit FATTR4_WORD0_SIZE attribute into *size (0 when absent).
+ * Returns -EIO if a lower bit of bitmap[0] is still set or the stream is
+ * too short, NFS_ATTR_FATTR_SIZE when the value was decoded (bit cleared),
+ * and 0 when the attribute was not present.
+ */
+static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
+{
+	__be32 *ptr;
+	int flags = 0;
+
+	*size = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_SIZE)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 8);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	xdr_decode_hyper(ptr, size);
+	bitmap[0] &= ~FATTR4_WORD0_SIZE;
+	flags = NFS_ATTR_FATTR_SIZE;
+out:
+	dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
+	return flags;
+}
+
+/*
+ * Decode the FATTR4_WORD0_LINK_SUPPORT attribute into *res (0 when absent).
+ * Returns -EIO if a lower bit of bitmap[0] is still set or the stream is
+ * too short, 0 otherwise.  The bit is cleared once the value is read.
+ */
+static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+	__be32 *ptr;
+
+	*res = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 4);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	*res = be32_to_cpup(ptr);
+	bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
+out:
+	dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
+	return 0;
+}
+
+/*
+ * Decode the FATTR4_WORD0_SYMLINK_SUPPORT attribute into *res (0 when
+ * absent).  Returns -EIO if a lower bit of bitmap[0] is still set or the
+ * stream is too short, 0 otherwise.  The bit is cleared once the value is
+ * read.
+ */
+static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+	__be32 *ptr;
+
+	*res = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 4);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	*res = be32_to_cpup(ptr);
+	bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
+out:
+	dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
+	return 0;
+}
+
+/*
+ * Decode the FATTR4_WORD0_FSID attribute: two consecutive 64-bit values
+ * stored in fsid->major and fsid->minor (both 0 when absent).
+ * Returns -EIO if a lower bit of bitmap[0] is still set or the stream is
+ * too short, NFS_ATTR_FATTR_FSID when decoded (bit cleared), 0 when absent.
+ */
+static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
+{
+	__be32 *ptr;
+	int flags = 0;
+
+	fsid->major = 0;
+	fsid->minor = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FSID)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 16);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	ptr = xdr_decode_hyper(ptr, &fsid->major);
+	xdr_decode_hyper(ptr, &fsid->minor);
+	bitmap[0] &= ~FATTR4_WORD0_FSID;
+	flags = NFS_ATTR_FATTR_FSID;
+out:
+	dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__,
+		(unsigned long long)fsid->major,
+		(unsigned long long)fsid->minor);
+	return flags;
+}
+
+/*
+ * Decode the FATTR4_WORD0_LEASE_TIME attribute into *res.  *res is preset
+ * to 60 and keeps that value when the attribute is absent.
+ * Returns -EIO if a lower bit of bitmap[0] is still set or the stream is
+ * too short, 0 otherwise.  The bit is cleared once the value is read.
+ */
+static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+	__be32 *ptr;
+
+	*res = 60;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_LEASE_TIME)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 4);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	*res = be32_to_cpup(ptr);
+	bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
+out:
+	dprintk("%s: lease time=%u\n", __func__, (unsigned int)*res);
+	return 0;
+}
+
+/*
+ * Decode the FATTR4_WORD0_RDATTR_ERROR attribute.  When present, *res is
+ * set to the negated 32-bit value read from the stream and the bit is
+ * cleared; when absent, *res is left untouched.
+ * Returns -EIO if a lower bit of bitmap[0] is still set or the stream is
+ * too short, 0 otherwise.
+ */
+static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res)
+{
+	__be32 *ptr;
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)))
+		return 0;
+
+	ptr = xdr_inline_decode(xdr, 4);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
+	*res = -be32_to_cpup(ptr);
+	return 0;
+}
+
+/*
+ * Decode the FATTR4_WORD2_SUPPATTR_EXCLCREAT attribute into @bitmask.
+ *
+ * When the bit is set in bitmap[2], a three-word bitmap is decoded into
+ * @bitmask and the bit is cleared; otherwise @bitmask is zeroed.
+ * Returns 0, or the negative error from decode_attr_bitmap().
+ */
+static int decode_attr_exclcreat_supported(struct xdr_stream *xdr,
+				 uint32_t *bitmap, uint32_t *bitmask)
+{
+	int status;
+
+	if (unlikely(!(bitmap[2] & FATTR4_WORD2_SUPPATTR_EXCLCREAT))) {
+		bitmask[2] = 0;
+		bitmask[1] = 0;
+		bitmask[0] = 0;
+	} else {
+		status = decode_attr_bitmap(xdr, bitmask);
+		if (unlikely(status < 0))
+			return status;
+		bitmap[2] &= ~FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+	}
+	dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
+		bitmask[0], bitmask[1], bitmask[2]);
+	return 0;
+}
+
+/*
+ * Decode the FATTR4_WORD0_FILEHANDLE attribute.
+ *
+ * @fh may be NULL, in which case the handle is consumed from the stream
+ * but not stored.  A non-NULL @fh is zeroed up front and, when the
+ * attribute is present, filled with the decoded bytes and length.
+ * Returns -EIO if a lower bit of bitmap[0] is still set, if the length
+ * exceeds NFS4_FHSIZE, or if the stream is too short; 0 otherwise.
+ */
+static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
+{
+	__be32 *ptr;
+	u32 fhlen;
+
+	if (fh != NULL)
+		memset(fh, 0, sizeof(*fh));
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FILEHANDLE)))
+		return 0;
+
+	ptr = xdr_inline_decode(xdr, 4);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	fhlen = be32_to_cpup(ptr);
+	if (fhlen > NFS4_FHSIZE)
+		return -EIO;
+
+	ptr = xdr_inline_decode(xdr, fhlen);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	if (fh != NULL) {
+		memcpy(fh->data, ptr, fhlen);
+		fh->size = fhlen;
+	}
+	bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE;
+	return 0;
+}
+
+/*
+ * Decode the FATTR4_WORD0_ACLSUPPORT attribute into *res (0 when absent).
+ * Returns -EIO if a lower bit of bitmap[0] is still set or the stream is
+ * too short, 0 otherwise.  The bit is cleared once the value is read.
+ */
+static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+	__be32 *ptr;
+
+	*res = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 4);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	*res = be32_to_cpup(ptr);
+	bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
+out:
+	dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
+	return 0;
+}
+
+/*
+ * Decode the 64-bit FATTR4_WORD0_FILEID attribute into *fileid (0 when
+ * absent).  Returns -EIO if a lower bit of bitmap[0] is still set or the
+ * stream is too short, NFS_ATTR_FATTR_FILEID when decoded (bit cleared),
+ * and 0 when the attribute was not present.
+ */
+static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
+{
+	__be32 *ptr;
+	int flags = 0;
+
+	*fileid = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FILEID)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 8);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	xdr_decode_hyper(ptr, fileid);
+	bitmap[0] &= ~FATTR4_WORD0_FILEID;
+	flags = NFS_ATTR_FATTR_FILEID;
+out:
+	dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
+	return flags;
+}
+
+/*
+ * Decode the 64-bit FATTR4_WORD1_MOUNTED_ON_FILEID attribute into *fileid
+ * (0 when absent).  Returns -EIO if a lower bit of bitmap[1] is still set
+ * or the stream is too short, NFS_ATTR_FATTR_MOUNTED_ON_FILEID when
+ * decoded (bit cleared), and 0 when the attribute was not present.
+ */
+static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
+{
+	__be32 *ptr;
+	int flags = 0;
+
+	*fileid = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 8);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	xdr_decode_hyper(ptr, fileid);
+	bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+	flags = NFS_ATTR_FATTR_MOUNTED_ON_FILEID;
+out:
+	dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
+	return flags;
+}
+
+/*
+ * Decode the 64-bit FATTR4_WORD0_FILES_AVAIL attribute into *res (0 when
+ * absent).  Returns -EIO if a lower bit of bitmap[0] is still set or the
+ * stream is too short, 0 otherwise.  The bit is cleared once the value is
+ * read.
+ */
+static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+	__be32 *ptr;
+
+	*res = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 8);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	xdr_decode_hyper(ptr, res);
+	bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
+out:
+	dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
+	return 0;
+}
+
+/*
+ * Decode the 64-bit FATTR4_WORD0_FILES_FREE attribute into *res (0 when
+ * absent).  Returns -EIO if a lower bit of bitmap[0] is still set or the
+ * stream is too short, 0 otherwise.  The bit is cleared once the value is
+ * read.
+ */
+static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+	__be32 *ptr;
+
+	*res = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FILES_FREE)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 8);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	xdr_decode_hyper(ptr, res);
+	bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
+out:
+	dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
+	return 0;
+}
+
+/*
+ * Decode the 64-bit FATTR4_WORD0_FILES_TOTAL attribute into *res (0 when
+ * absent).  Returns -EIO if a lower bit of bitmap[0] is still set or the
+ * stream is too short, 0 otherwise.  The bit is cleared once the value is
+ * read.
+ */
+static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+	__be32 *ptr;
+
+	*res = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 8);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	xdr_decode_hyper(ptr, res);
+	bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
+out:
+	dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
+	return 0;
+}
+
+/*
+ * Decode a pathname: a 32-bit component count followed by that many
+ * inline opaque components.
+ *
+ * Each component's len/data pair is stored in path->components[] and
+ * path->ncomponents counts the entries decoded so far.  A count of zero
+ * is turned into a single component with len 0 and data NULL.
+ *
+ * Returns 0 on success, or -EIO if the stream is too short, the count
+ * exceeds NFS4_PATHNAME_MAXCOMPONENTS, or a component fails to decode.
+ */
+static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
+{
+ u32 n;
+ __be32 *p;
+ int status = 0;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ n = be32_to_cpup(p);
+ if (n == 0)
+ goto root_path;
+ dprintk("pathname4: ");
+ /* Refuse counts that would overrun path->components[]. */
+ if (n > NFS4_PATHNAME_MAXCOMPONENTS) {
+ dprintk("cannot parse %d components in path\n", n);
+ goto out_eio;
+ }
+ for (path->ncomponents = 0; path->ncomponents < n; path->ncomponents++) {
+ struct nfs4_string *component = &path->components[path->ncomponents];
+ status = decode_opaque_inline(xdr, &component->len, &component->data);
+ if (unlikely(status != 0))
+ goto out_eio;
+ /*
+ * NOTE(review): inside this loop ncomponents is always < n, so the
+ * "!= n" test below is always true -- confirm what was intended.
+ */
+ ifdebug (XDR)
+ pr_cont("%s%.*s ",
+ (path->ncomponents != n ? "/ " : ""),
+ component->len, component->data);
+ }
+out:
+ return status;
+root_path:
+/* a root pathname is sent as a zero component4 */
+ path->ncomponents = 1;
+ path->components[0].len=0;
+ path->components[0].data=NULL;
+ dprintk("pathname4: /\n");
+ goto out;
+out_eio:
+ /* Log the status seen so far, then report every failure as -EIO. */
+ dprintk(" status %d", status);
+ status = -EIO;
+ goto out;
+}
+
+/*
+ * Decode the FATTR4_WORD0_FS_LOCATIONS attribute into @res.
+ *
+ * Layout consumed from the stream: a pathname (res->fs_path), a 32-bit
+ * location count, then per location a 32-bit server count, that many
+ * inline opaque server names and a pathname (loc->rootpath).
+ *
+ * At most NFS4_FS_LOCATIONS_MAXENTRIES locations are stored; decoding
+ * stops when that limit is reached.  At most NFS4_FS_LOCATION_MAXSERVERS
+ * server names are stored per location; surplus names are decoded and
+ * discarded so the stream stays in step.
+ *
+ * Returns -EIO if a lower bit of bitmap[0] is still set, if @res is NULL
+ * although the attribute is present, or on any decode failure.  Returns
+ * NFS_ATTR_FATTR_V4_LOCATIONS when at least one location was stored, and
+ * 0 when the attribute was absent or no location was stored.
+ */
+static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
+{
+	int n;
+	__be32 *p;
+	int status = -EIO;
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U)))
+		goto out;
+	status = 0;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
+		goto out;
+	bitmap[0] &= ~FATTR4_WORD0_FS_LOCATIONS;
+	status = -EIO;
+	/* Ignore borken servers that return unrequested attrs */
+	if (unlikely(res == NULL))
+		goto out;
+	dprintk("%s: fsroot:\n", __func__);
+	status = decode_pathname(xdr, &res->fs_path);
+	if (unlikely(status != 0))
+		goto out;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_eio;
+	n = be32_to_cpup(p);
+	for (res->nlocations = 0; res->nlocations < n; res->nlocations++) {
+		u32 m;
+		struct nfs4_fs_location *loc;
+
+		if (res->nlocations == NFS4_FS_LOCATIONS_MAXENTRIES)
+			break;
+		loc = &res->locations[res->nlocations];
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_eio;
+		m = be32_to_cpup(p);
+
+		dprintk("%s: servers:\n", __func__);
+		for (loc->nservers = 0; loc->nservers < m; loc->nservers++) {
+			struct nfs4_string *server;
+
+			if (loc->nservers == NFS4_FS_LOCATION_MAXSERVERS) {
+				unsigned int i;
+				dprintk("%s: using first %u of %u servers "
+					"returned for location %u\n",
+						__func__,
+						NFS4_FS_LOCATION_MAXSERVERS,
+						m, res->nlocations);
+				/* Skip the names we have no room for. */
+				for (i = loc->nservers; i < m; i++) {
+					unsigned int len;
+					char *data;
+					status = decode_opaque_inline(xdr, &len, &data);
+					if (unlikely(status != 0))
+						goto out_eio;
+				}
+				break;
+			}
+			server = &loc->servers[loc->nservers];
+			status = decode_opaque_inline(xdr, &server->len, &server->data);
+			if (unlikely(status != 0))
+				goto out_eio;
+			/*
+			 * server->data points into the receive buffer and is
+			 * not NUL-terminated, so bound the print by its length.
+			 */
+			dprintk("%.*s ", (int)server->len, server->data);
+		}
+		status = decode_pathname(xdr, &loc->rootpath);
+		if (unlikely(status != 0))
+			goto out_eio;
+	}
+	if (res->nlocations != 0)
+		status = NFS_ATTR_FATTR_V4_LOCATIONS;
+out:
+	dprintk("%s: fs_locations done, error = %d\n", __func__, status);
+	return status;
+out_eio:
+	status = -EIO;
+	goto out;
+}
+
+/*
+ * Decode the 64-bit FATTR4_WORD0_MAXFILESIZE attribute into *res (0 when
+ * absent).  Returns -EIO if a lower bit of bitmap[0] is still set or the
+ * stream is too short, 0 otherwise.  The bit is cleared once the value is
+ * read.
+ */
+static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+	__be32 *ptr;
+
+	*res = 0;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 8);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	xdr_decode_hyper(ptr, res);
+	bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
+out:
+	dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
+	return 0;
+}
+
+/*
+ * Decode the FATTR4_WORD0_MAXLINK attribute into *maxlink.  *maxlink is
+ * preset to 1 and keeps that value when the attribute is absent.
+ * Returns -EIO if a lower bit of bitmap[0] is still set or the stream is
+ * too short, 0 otherwise.  The bit is cleared once the value is read.
+ */
+static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
+{
+	__be32 *ptr;
+
+	*maxlink = 1;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_MAXLINK)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 4);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	*maxlink = be32_to_cpup(ptr);
+	bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
+out:
+	dprintk("%s: maxlink=%u\n", __func__, *maxlink);
+	return 0;
+}
+
+/*
+ * Decode the FATTR4_WORD0_MAXNAME attribute into *maxname.  *maxname is
+ * preset to 1024 and keeps that value when the attribute is absent.
+ * Returns -EIO if a lower bit of bitmap[0] is still set or the stream is
+ * too short, 0 otherwise.  The bit is cleared once the value is read.
+ */
+static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
+{
+	__be32 *ptr;
+
+	*maxname = 1024;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_MAXNAME)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 4);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	*maxname = be32_to_cpup(ptr);
+	bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
+out:
+	dprintk("%s: maxname=%u\n", __func__, *maxname);
+	return 0;
+}
+
+/*
+ * Decode the 64-bit FATTR4_WORD0_MAXREAD attribute into the 32-bit *res.
+ * Values above 0x7FFFFFFF are clamped to 0x7FFFFFFF.  *res is preset to
+ * 1024 and keeps that value when the attribute is absent.
+ * Returns -EIO if a lower bit of bitmap[0] is still set or the stream is
+ * too short, 0 otherwise.  The bit is cleared once the value is read.
+ */
+static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+	uint64_t wire_value;
+	__be32 *ptr;
+
+	*res = 1024;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXREAD - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_MAXREAD)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 8);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	xdr_decode_hyper(ptr, &wire_value);
+	*res = (wire_value > 0x7FFFFFFF) ? 0x7FFFFFFF : (uint32_t)wire_value;
+	bitmap[0] &= ~FATTR4_WORD0_MAXREAD;
+out:
+	dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res);
+	return 0;
+}
+
+/*
+ * Decode the 64-bit FATTR4_WORD0_MAXWRITE attribute into the 32-bit *res.
+ * Values above 0x7FFFFFFF are clamped to 0x7FFFFFFF.  *res is preset to
+ * 1024 and keeps that value when the attribute is absent.
+ * Returns -EIO if a lower bit of bitmap[0] is still set or the stream is
+ * too short, 0 otherwise.  The bit is cleared once the value is read.
+ */
+static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
+{
+	uint64_t wire_value;
+	__be32 *ptr;
+
+	*res = 1024;
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXWRITE - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_MAXWRITE)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 8);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	xdr_decode_hyper(ptr, &wire_value);
+	*res = (wire_value > 0x7FFFFFFF) ? 0x7FFFFFFF : (uint32_t)wire_value;
+	bitmap[0] &= ~FATTR4_WORD0_MAXWRITE;
+out:
+	dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res);
+	return 0;
+}
+
+/*
+ * Decode the FATTR4_WORD1_MODE attribute into *mode (0 when absent).  The
+ * S_IFMT bits of the wire value are masked off before it is stored.
+ * Returns -EIO if a lower bit of bitmap[1] is still set or the stream is
+ * too short, NFS_ATTR_FATTR_MODE when decoded (bit cleared), 0 when absent.
+ */
+static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
+{
+	uint32_t wire_mode;
+	__be32 *ptr;
+	int flags = 0;
+
+	*mode = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[1] & FATTR4_WORD1_MODE)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 4);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	wire_mode = be32_to_cpup(ptr);
+	*mode = wire_mode & ~S_IFMT;
+	bitmap[1] &= ~FATTR4_WORD1_MODE;
+	flags = NFS_ATTR_FATTR_MODE;
+out:
+	dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
+	return flags;
+}
+
+/*
+ * Decode the FATTR4_WORD1_NUMLINKS attribute into *nlink.  *nlink is
+ * preset to 1 and keeps that value when the attribute is absent.
+ * Returns -EIO if a lower bit of bitmap[1] is still set or the stream is
+ * too short, NFS_ATTR_FATTR_NLINK when decoded (bit cleared), 0 when absent.
+ */
+static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
+{
+	__be32 *ptr;
+	int flags = 0;
+
+	*nlink = 1;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[1] & FATTR4_WORD1_NUMLINKS)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 4);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	*nlink = be32_to_cpup(ptr);
+	bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
+	flags = NFS_ATTR_FATTR_NLINK;
+out:
+	dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
+	return flags;
+}
+
+/*
+ * Decode a string of at most XDR_MAX_NETOBJ bytes into name->data via
+ * xdr_stream_decode_string_dup(), using @gfp_flags.
+ *
+ * name->len is set to the helper's result when that is positive and to 0
+ * otherwise.  The helper's result is returned unchanged.
+ */
+static ssize_t decode_nfs4_string(struct xdr_stream *xdr,
+		struct nfs4_string *name, gfp_t gfp_flags)
+{
+	ssize_t nbytes;
+
+	nbytes = xdr_stream_decode_string_dup(xdr, &name->data,
+					      XDR_MAX_NETOBJ, gfp_flags);
+	if (nbytes > 0)
+		name->len = nbytes;
+	else
+		name->len = 0;
+	return nbytes;
+}
+
+/*
+ * Decode the FATTR4_WORD1_OWNER attribute.
+ *
+ * *uid is preset to make_kuid(&init_user_ns, -2).  The bit is cleared as
+ * soon as the attribute is known to be present.  Then:
+ *  - with a non-NULL @owner_name the string is duplicated into it and
+ *    NFS_ATTR_FATTR_OWNER_NAME is returned;
+ *  - otherwise the name is decoded in place, mapped through
+ *    nfs_map_name_to_uid(), and NFS_ATTR_FATTR_OWNER is returned.
+ *
+ * Returns -EIO if a lower bit of bitmap[1] is still set or the decode
+ * helper reported -EBADMSG.  Returns 0 when the attribute is absent, the
+ * name is empty, another decode error occurred, or the mapping failed.
+ */
+static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
+		const struct nfs_server *server, kuid_t *uid,
+		struct nfs4_string *owner_name)
+{
+	ssize_t len;
+	char *name;
+
+	*uid = make_kuid(&init_user_ns, -2);
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
+		return -EIO;
+	if (!(bitmap[1] & FATTR4_WORD1_OWNER))
+		return 0;
+	bitmap[1] &= ~FATTR4_WORD1_OWNER;
+
+	if (owner_name == NULL) {
+		len = xdr_stream_decode_opaque_inline(xdr, (void **)&name,
+						      XDR_MAX_NETOBJ);
+		if (len > 0 && nfs_map_name_to_uid(server, name, len, uid) == 0) {
+			dprintk("%s: uid=%d\n", __func__, (int)from_kuid(&init_user_ns, *uid));
+			return NFS_ATTR_FATTR_OWNER;
+		}
+	} else {
+		len = decode_nfs4_string(xdr, owner_name, GFP_NOIO);
+		if (len > 0) {
+			dprintk("%s: name=%s\n", __func__, owner_name->data);
+			return NFS_ATTR_FATTR_OWNER_NAME;
+		}
+	}
+
+	/* Only a malformed message is fatal; everything else is ignored. */
+	if (len == -EBADMSG)
+		return -EIO;
+	return 0;
+}
+
+/*
+ * Decode the FATTR4_WORD1_OWNER_GROUP attribute.
+ *
+ * *gid is preset to make_kgid(&init_user_ns, -2).  The bit is cleared as
+ * soon as the attribute is known to be present.  Then:
+ *  - with a non-NULL @group_name the string is duplicated into it and
+ *    NFS_ATTR_FATTR_GROUP_NAME is returned;
+ *  - otherwise the name is decoded in place, mapped through
+ *    nfs_map_group_to_gid(), and NFS_ATTR_FATTR_GROUP is returned.
+ *
+ * Returns -EIO if a lower bit of bitmap[1] is still set or the decode
+ * helper reported -EBADMSG.  Returns 0 when the attribute is absent, the
+ * name is empty, another decode error occurred, or the mapping failed.
+ */
+static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
+		const struct nfs_server *server, kgid_t *gid,
+		struct nfs4_string *group_name)
+{
+	ssize_t len;
+	char *name;
+
+	*gid = make_kgid(&init_user_ns, -2);
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
+		return -EIO;
+	if (!(bitmap[1] & FATTR4_WORD1_OWNER_GROUP))
+		return 0;
+	bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
+
+	if (group_name == NULL) {
+		len = xdr_stream_decode_opaque_inline(xdr, (void **)&name,
+						      XDR_MAX_NETOBJ);
+		if (len > 0 && nfs_map_group_to_gid(server, name, len, gid) == 0) {
+			dprintk("%s: gid=%d\n", __func__, (int)from_kgid(&init_user_ns, *gid));
+			return NFS_ATTR_FATTR_GROUP;
+		}
+	} else {
+		len = decode_nfs4_string(xdr, group_name, GFP_NOIO);
+		if (len > 0) {
+			dprintk("%s: name=%s\n", __func__, group_name->data);
+			return NFS_ATTR_FATTR_GROUP_NAME;
+		}
+	}
+
+	/* Only a malformed message is fatal; everything else is ignored. */
+	if (len == -EBADMSG)
+		return -EIO;
+	return 0;
+}
+
+/*
+ * Decode the FATTR4_WORD1_RAWDEV attribute: two 32-bit values (major,
+ * minor).  *rdev is preset to MKDEV(0,0) and is only overwritten when the
+ * pair survives a round trip through MKDEV()/MAJOR()/MINOR().
+ * Returns -EIO if a lower bit of bitmap[1] is still set or the stream is
+ * too short, NFS_ATTR_FATTR_RDEV when decoded (bit cleared), 0 when absent.
+ */
+static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
+{
+	uint32_t rdev_major = 0;
+	uint32_t rdev_minor = 0;
+	dev_t candidate;
+	__be32 *ptr;
+	int flags = 0;
+
+	*rdev = MKDEV(0,0);
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[1] & FATTR4_WORD1_RAWDEV)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 8);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	rdev_major = be32_to_cpup(ptr++);
+	rdev_minor = be32_to_cpup(ptr);
+	candidate = MKDEV(rdev_major, rdev_minor);
+	if (MAJOR(candidate) == rdev_major && MINOR(candidate) == rdev_minor)
+		*rdev = candidate;
+	bitmap[1] &= ~ FATTR4_WORD1_RAWDEV;
+	flags = NFS_ATTR_FATTR_RDEV;
+out:
+	dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, rdev_major, rdev_minor);
+	return flags;
+}
+
+/*
+ * Decode the 64-bit FATTR4_WORD1_SPACE_AVAIL attribute into *res (0 when
+ * absent).  Returns -EIO if a lower bit of bitmap[1] is still set or the
+ * stream is too short, 0 otherwise.  The bit is cleared once the value is
+ * read.
+ */
+static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+	__be32 *ptr;
+
+	*res = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 8);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	xdr_decode_hyper(ptr, res);
+	bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
+out:
+	dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
+	return 0;
+}
+
+/*
+ * Decode the 64-bit FATTR4_WORD1_SPACE_FREE attribute into *res (0 when
+ * absent).  Returns -EIO if a lower bit of bitmap[1] is still set or the
+ * stream is too short, 0 otherwise.  The bit is cleared once the value is
+ * read.
+ */
+static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+	__be32 *ptr;
+
+	*res = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[1] & FATTR4_WORD1_SPACE_FREE)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 8);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	xdr_decode_hyper(ptr, res);
+	bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
+out:
+	dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
+	return 0;
+}
+
+/*
+ * Decode the 64-bit FATTR4_WORD1_SPACE_TOTAL attribute into *res (0 when
+ * absent).  Returns -EIO if a lower bit of bitmap[1] is still set or the
+ * stream is too short, 0 otherwise.  The bit is cleared once the value is
+ * read.
+ */
+static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
+{
+	__be32 *ptr;
+
+	*res = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 8);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	xdr_decode_hyper(ptr, res);
+	bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
+out:
+	dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
+	return 0;
+}
+
+/*
+ * Decode the 64-bit FATTR4_WORD1_SPACE_USED attribute into *used (0 when
+ * absent).  Returns -EIO if a lower bit of bitmap[1] is still set or the
+ * stream is too short, NFS_ATTR_FATTR_SPACE_USED when decoded (bit
+ * cleared), and 0 when the attribute was not present.
+ */
+static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
+{
+	__be32 *ptr;
+	int flags = 0;
+
+	*used = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[1] & FATTR4_WORD1_SPACE_USED)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 8);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	xdr_decode_hyper(ptr, used);
+	bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
+	flags = NFS_ATTR_FATTR_SPACE_USED;
+out:
+	dprintk("%s: space used=%Lu\n", __func__,
+		(unsigned long long)*used);
+	return flags;
+}
+
+/*
+ * Decode a timestamp from @p: a 64-bit value stored in t->tv_sec followed
+ * by a 32-bit value stored in t->tv_nsec.
+ * Returns the pointer just past the consumed words.
+ */
+static __be32 *
+xdr_decode_nfstime4(__be32 *p, struct timespec64 *t)
+{
+	__u64 seconds;
+
+	p = xdr_decode_hyper(p, &seconds);
+	t->tv_sec = seconds;
+	t->tv_nsec = be32_to_cpup(p);
+	return p + 1;
+}
+
+/*
+ * Pull one timestamp (nfstime4_maxsz words) from the stream into @time.
+ * Returns 0 on success or -EIO if the stream is too short.
+ */
+static int decode_attr_time(struct xdr_stream *xdr, struct timespec64 *time)
+{
+	__be32 *ptr = xdr_inline_decode(xdr, nfstime4_maxsz << 2);
+
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	xdr_decode_nfstime4(ptr, time);
+	return 0;
+}
+
+/*
+ * Decode the FATTR4_WORD1_TIME_ACCESS attribute into @time (zeroed first).
+ * Returns -EIO if a lower bit of bitmap[1] is still set, the error from
+ * decode_attr_time() if that fails, NFS_ATTR_FATTR_ATIME when decoded, and
+ * 0 when absent.  The bit is cleared whenever the attribute was present,
+ * even if decoding it failed.
+ */
+static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time)
+{
+	int status = 0;
+
+	time->tv_sec = 0;
+	time->tv_nsec = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_ACCESS - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)))
+		goto out;
+
+	status = decode_attr_time(xdr, time);
+	if (status == 0)
+		status = NFS_ATTR_FATTR_ATIME;
+	bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS;
+out:
+	dprintk("%s: atime=%lld\n", __func__, time->tv_sec);
+	return status;
+}
+
+/*
+ * Decode the FATTR4_WORD1_TIME_METADATA attribute into @time (zeroed
+ * first).  Returns -EIO if a lower bit of bitmap[1] is still set, the
+ * error from decode_attr_time() if that fails, NFS_ATTR_FATTR_CTIME when
+ * decoded, and 0 when absent.  The bit is cleared whenever the attribute
+ * was present, even if decoding it failed.
+ */
+static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time)
+{
+	int status = 0;
+
+	time->tv_sec = 0;
+	time->tv_nsec = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_METADATA - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[1] & FATTR4_WORD1_TIME_METADATA)))
+		goto out;
+
+	status = decode_attr_time(xdr, time);
+	if (status == 0)
+		status = NFS_ATTR_FATTR_CTIME;
+	bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA;
+out:
+	dprintk("%s: ctime=%lld\n", __func__, time->tv_sec);
+	return status;
+}
+
+/*
+ * Decode the FATTR4_WORD1_TIME_DELTA attribute into @time (zeroed first).
+ * Returns -EIO if a lower bit of bitmap[1] is still set, otherwise the
+ * result of decode_attr_time() when the attribute is present and 0 when
+ * it is absent.  The bit is cleared whenever the attribute was present.
+ */
+static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
+				  struct timespec64 *time)
+{
+	int status = 0;
+
+	time->tv_sec = 0;
+	time->tv_nsec = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[1] & FATTR4_WORD1_TIME_DELTA)))
+		goto out;
+
+	status = decode_attr_time(xdr, time);
+	bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA;
+out:
+	dprintk("%s: time_delta=%lld %ld\n", __func__, time->tv_sec,
+		time->tv_nsec);
+	return status;
+}
+
+/*
+ * Decode the FATTR4_WORD2_SECURITY_LABEL attribute.
+ *
+ * Wire layout consumed here: three 32-bit words (lfs, pi, len) followed
+ * by len bytes of label data.
+ *
+ * @label may be NULL.  The label is copied only when @label is non-NULL,
+ * label->len is non-zero on entry, and the decoded length is below
+ * NFS4_MAXLABELLEN.  label->len on entry is compared against the decoded
+ * length before the copy -- presumably it holds the capacity of
+ * label->label; confirm against the callers.
+ *
+ * Returns -EIO if a lower bit of bitmap[2] is still set or the stream is
+ * too short, -ERANGE if label->len is smaller than the decoded length,
+ * NFS_ATTR_FATTR_V4_SECURITY_LABEL when a label was copied, 0 otherwise.
+ */
+static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
+ struct nfs4_label *label)
+{
+ uint32_t pi = 0;
+ uint32_t lfs = 0;
+ __u32 len;
+ __be32 *p;
+ int status = 0;
+
+ if (unlikely(bitmap[2] & (FATTR4_WORD2_SECURITY_LABEL - 1U)))
+ return -EIO;
+ if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ lfs = be32_to_cpup(p++);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ pi = be32_to_cpup(p++);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ len = be32_to_cpup(p++);
+ /* Consume the label bytes even if they end up being discarded. */
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return -EIO;
+ bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ if (len < NFS4_MAXLABELLEN) {
+ if (label && label->len) {
+ /* Never copy more than label->len allowed on entry. */
+ if (label->len < len)
+ return -ERANGE;
+ memcpy(label->label, p, len);
+ label->len = len;
+ label->pi = pi;
+ label->lfs = lfs;
+ status = NFS_ATTR_FATTR_V4_SECURITY_LABEL;
+ }
+ } else
+ printk(KERN_WARNING "%s: label too long (%u)!\n",
+ __func__, len);
+ if (label && label->label)
+ dprintk("%s: label=%.*s, len=%d, PI=%d, LFS=%d\n",
+ __func__, label->len, (char *)label->label,
+ label->len, label->pi, label->lfs);
+ }
+ return status;
+}
+
+/*
+ * Decode the FATTR4_WORD1_TIME_MODIFY attribute into @time (zeroed first).
+ * Returns -EIO if a lower bit of bitmap[1] is still set, the error from
+ * decode_attr_time() if that fails, NFS_ATTR_FATTR_MTIME when decoded, and
+ * 0 when absent.  The bit is cleared whenever the attribute was present,
+ * even if decoding it failed.
+ */
+static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time)
+{
+	int status = 0;
+
+	time->tv_sec = 0;
+	time->tv_nsec = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_MODIFY - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)))
+		goto out;
+
+	status = decode_attr_time(xdr, time);
+	if (status == 0)
+		status = NFS_ATTR_FATTR_MTIME;
+	bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY;
+out:
+	dprintk("%s: mtime=%lld\n", __func__, time->tv_sec);
+	return status;
+}
+
+/*
+ * Decode the FATTR4_WORD2_XATTR_SUPPORT attribute into *res (0 when
+ * absent).  Returns -EIO if a lower bit of bitmap[2] is still set or the
+ * stream is too short, 0 otherwise.  The bit is cleared once the value is
+ * read.
+ */
+static int decode_attr_xattrsupport(struct xdr_stream *xdr, uint32_t *bitmap,
+				    uint32_t *res)
+{
+	__be32 *ptr;
+
+	*res = 0;
+	if (unlikely(bitmap[2] & (FATTR4_WORD2_XATTR_SUPPORT - 1U)))
+		return -EIO;
+	if (unlikely(!(bitmap[2] & FATTR4_WORD2_XATTR_SUPPORT)))
+		goto out;
+
+	ptr = xdr_inline_decode(xdr, 4);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	*res = be32_to_cpup(ptr);
+	bitmap[2] &= ~FATTR4_WORD2_XATTR_SUPPORT;
+out:
+	dprintk("%s: XATTR support=%s\n", __func__,
+		*res == 0 ? "false" : "true");
+	return 0;
+}
+
+/*
+ * Check that the number of 32-bit words consumed since stream position
+ * @savep equals the word count implied by @attrlen.
+ * Returns 0 on a match and -EIO on a mismatch.
+ */
+static int verify_attr_len(struct xdr_stream *xdr, unsigned int savep, uint32_t attrlen)
+{
+	unsigned int announced = XDR_QUADLEN(attrlen);
+	unsigned int consumed = (xdr_stream_pos(xdr) - savep) >> 2;
+
+	if (likely(announced == consumed))
+		return 0;
+
+	dprintk("%s: server returned incorrect attribute length: "
+		"%u %c %u\n",
+			__func__,
+			announced << 2,
+			(announced < consumed) ? '<' : '>',
+			consumed << 2);
+	return -EIO;
+}
+
+/*
+ * Decode a 20-byte change_info: a 32-bit atomic flag followed by the
+ * 64-bit before and after values, stored in the matching @cinfo fields.
+ * Returns 0 on success or -EIO if the stream is too short.
+ */
+static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
+{
+	__be32 *ptr = xdr_inline_decode(xdr, 20);
+
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	cinfo->atomic = be32_to_cpup(ptr);
+	ptr = xdr_decode_hyper(ptr + 1, &cinfo->before);
+	xdr_decode_hyper(ptr, &cinfo->after);
+	return 0;
+}
+
+/*
+ * Decode an OP_ACCESS result: the operation header followed by two 32-bit
+ * words stored in *supported and *access, in that order.
+ * Returns the non-zero status from decode_op_hdr(), -EIO if the stream is
+ * too short, or 0 on success.  The outputs are written only on success.
+ */
+static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access)
+{
+	uint32_t supp_mask, access_mask;
+	__be32 *ptr;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_ACCESS);
+	if (status != 0)
+		return status;
+
+	ptr = xdr_inline_decode(xdr, 8);
+	if (unlikely(ptr == NULL))
+		return -EIO;
+	supp_mask = be32_to_cpup(ptr);
+	access_mask = be32_to_cpup(ptr + 1);
+	*supported = supp_mask;
+	*access = access_mask;
+	return 0;
+}
+
+/*
+ * Copy a fixed-length opaque of @len bytes from the stream into @buf.
+ * Any negative result from xdr_stream_decode_opaque_fixed() is reported
+ * as -EIO; otherwise 0 is returned.
+ */
+static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
+{
+	if (unlikely(xdr_stream_decode_opaque_fixed(xdr, buf, len) < 0))
+		return -EIO;
+	return 0;
+}
+
+/*
+ * Decode NFS4_STATEID_SIZE bytes from the stream into @stateid.
+ * Returns 0 on success or -EIO.
+ */
+static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	int status;
+
+	status = decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
+	return status;
+}
+
+/*
+ * Tag @stateid as NFS4_OPEN_STATEID_TYPE, then decode its contents from
+ * the stream.  Returns the result of decode_stateid().
+ */
+static int decode_open_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	int status;
+
+	stateid->type = NFS4_OPEN_STATEID_TYPE;
+	status = decode_stateid(xdr, stateid);
+	return status;
+}
+
+/*
+ * Tag @stateid as NFS4_LOCK_STATEID_TYPE, then decode its contents from
+ * the stream.  Returns the result of decode_stateid().
+ */
+static int decode_lock_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	int status;
+
+	stateid->type = NFS4_LOCK_STATEID_TYPE;
+	status = decode_stateid(xdr, stateid);
+	return status;
+}
+
+/*
+ * Tag @stateid as NFS4_DELEGATION_STATEID_TYPE, then decode its contents
+ * from the stream.  Returns the result of decode_stateid().
+ */
+static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	int status;
+
+	stateid->type = NFS4_DELEGATION_STATEID_TYPE;
+	status = decode_stateid(xdr, stateid);
+	return status;
+}
+
+/*
+ * Set @stateid to a copy of invalid_stateid, then consume one stateid
+ * from the stream into a scratch variable that is thrown away.
+ * Returns the result of decode_stateid().
+ */
+static int decode_invalid_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	nfs4_stateid scratch;
+
+	nfs4_stateid_copy(stateid, &invalid_stateid);
+	return decode_stateid(xdr, &scratch);
+}
+
+/*
+ * Decode an OP_CLOSE result.
+ *
+ * nfs_increment_open_seqid() is called with the header status unless that
+ * status is -EIO.  On a zero status the returned stateid is consumed via
+ * decode_invalid_stateid().  Returns the header status, or the result of
+ * decoding the stateid.
+ */
+static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
+{
+	int status = decode_op_hdr(xdr, OP_CLOSE);
+
+	if (status != -EIO)
+		nfs_increment_open_seqid(status, res->seqid);
+	if (status != 0)
+		return status;
+	return decode_invalid_stateid(xdr, &res->stateid);
+}
+
+/*
+ * Decode NFS4_VERIFIER_SIZE bytes from the stream into @verifier.
+ * Returns 0 on success or -EIO.
+ */
+static int decode_verifier(struct xdr_stream *xdr, void *verifier)
+{
+	int status;
+
+	status = decode_opaque_fixed(xdr, verifier, NFS4_VERIFIER_SIZE);
+	return status;
+}
+
+/*
+ * Decode NFS4_VERIFIER_SIZE bytes from the stream into verifier->data.
+ * Returns 0 on success or -EIO.
+ */
+static int decode_write_verifier(struct xdr_stream *xdr, struct nfs_write_verifier *verifier)
+{
+	int status;
+
+	status = decode_opaque_fixed(xdr, verifier->data, NFS4_VERIFIER_SIZE);
+	return status;
+}
+
+/*
+ * Decode an OP_COMMIT result: the operation header followed by a write
+ * verifier stored in res->verf->verifier.  On success
+ * res->verf->committed is set to NFS_FILE_SYNC.
+ * Returns 0 on success or the first non-zero status encountered.
+ */
+static int decode_commit(struct xdr_stream *xdr, struct nfs_commitres *res)
+{
+	struct nfs_writeverf *writeverf = res->verf;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_COMMIT);
+	if (status != 0)
+		return status;
+	status = decode_write_verifier(xdr, &writeverf->verifier);
+	if (status != 0)
+		return status;
+	writeverf->committed = NFS_FILE_SYNC;
+	return 0;
+}
+
+/*
+ * Decode an OP_CREATE result: the operation header, a change_info stored
+ * in @cinfo, and a counted bitmap whose words are consumed and discarded.
+ *
+ * Returns the non-zero status from decode_op_hdr() or
+ * decode_change_info(), -EIO if the stream is too short or the bitmap
+ * length is implausible, and 0 on success.
+ */
+static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
+{
+	__be32 *p;
+	uint32_t bmlen;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_CREATE);
+	if (status)
+		return status;
+	status = decode_change_info(xdr, cinfo);
+	if (status)
+		return status;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		return -EIO;
+	bmlen = be32_to_cpup(p);
+	/*
+	 * bmlen counts 32-bit words.  Reject values whose byte length would
+	 * wrap around in 32-bit arithmetic; otherwise a huge count could be
+	 * accepted as a small (or zero-byte) request.
+	 */
+	if (unlikely(bmlen > (~0U >> 2)))
+		return -EIO;
+	p = xdr_inline_decode(xdr, bmlen << 2);
+	if (likely(p))
+		return 0;
+	return -EIO;
+}
+
+/*
+ * Decode the OP_GETATTR reply used to fill a struct nfs4_server_caps_res.
+ *
+ * After the operation header, attribute bitmap and attribute length, the
+ * attributes are decoded in this order: supported attrs, fh expire type,
+ * link support, symlink support, ACL support, exclusive-create supported
+ * attrs.  The consumed length is finally checked with verify_attr_len().
+ * Returns 0 on success or the first non-zero status encountered.
+ */
+static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
+{
+	unsigned int start;
+	uint32_t attrlen, bitmap[3] = {0};
+	int status;
+
+	status = decode_op_hdr(xdr, OP_GETATTR);
+	if (status != 0)
+		goto out;
+	status = decode_attr_bitmap(xdr, bitmap);
+	if (status != 0)
+		goto out;
+	status = decode_attr_length(xdr, &attrlen, &start);
+	if (status != 0)
+		goto out;
+	status = decode_attr_supported(xdr, bitmap, res->attr_bitmask);
+	if (status != 0)
+		goto out;
+	status = decode_attr_fh_expire_type(xdr, bitmap,
+					    &res->fh_expire_type);
+	if (status != 0)
+		goto out;
+	status = decode_attr_link_support(xdr, bitmap, &res->has_links);
+	if (status != 0)
+		goto out;
+	status = decode_attr_symlink_support(xdr, bitmap, &res->has_symlinks);
+	if (status != 0)
+		goto out;
+	status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask);
+	if (status != 0)
+		goto out;
+	status = decode_attr_exclcreat_supported(xdr, bitmap,
+						 res->exclcreat_bitmask);
+	if (status != 0)
+		goto out;
+	status = verify_attr_len(xdr, start, attrlen);
+out:
+	dprintk("%s: xdr returned %d!\n", __func__, -status);
+	return status;
+}
+
+ /*
+  * Decode the GETATTR reply used for statfs.
+  *
+  * Fills @fsstat with the available/free/total file counts, then the
+  * available/free/total byte counts.  Between the two groups bitmap[0] must
+  * be empty, otherwise -EIO is returned.  The consumed attribute length is
+  * checked with verify_attr_len() at the end.
+  */
+ static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
+ {
+ 	uint32_t attrlen, bitmap[3] = {0};
+ 	unsigned int savep;
+ 	int status;
+
+ 	status = decode_op_hdr(xdr, OP_GETATTR);
+ 	if (status != 0)
+ 		goto out;
+ 	status = decode_attr_bitmap(xdr, bitmap);
+ 	if (status != 0)
+ 		goto out;
+ 	status = decode_attr_length(xdr, &attrlen, &savep);
+ 	if (status != 0)
+ 		goto out;
+
+ 	/* File counts */
+ 	status = decode_attr_files_avail(xdr, bitmap, &fsstat->afiles);
+ 	if (status != 0)
+ 		goto out;
+ 	status = decode_attr_files_free(xdr, bitmap, &fsstat->ffiles);
+ 	if (status != 0)
+ 		goto out;
+ 	status = decode_attr_files_total(xdr, bitmap, &fsstat->tfiles);
+ 	if (status != 0)
+ 		goto out;
+
+ 	/* Nothing may be left over in the first bitmap word. */
+ 	if (unlikely(bitmap[0])) {
+ 		status = -EIO;
+ 		goto out;
+ 	}
+
+ 	/* Byte counts */
+ 	status = decode_attr_space_avail(xdr, bitmap, &fsstat->abytes);
+ 	if (status != 0)
+ 		goto out;
+ 	status = decode_attr_space_free(xdr, bitmap, &fsstat->fbytes);
+ 	if (status != 0)
+ 		goto out;
+ 	status = decode_attr_space_total(xdr, bitmap, &fsstat->tbytes);
+ 	if (status != 0)
+ 		goto out;
+
+ 	status = verify_attr_len(xdr, savep, attrlen);
+ out:
+ 	dprintk("%s: xdr returned %d!\n", __func__, -status);
+ 	return status;
+ }
+
+ /*
+  * Decode the GETATTR reply used for pathconf: stores the maxlink and
+  * maxname attributes in @pathconf and checks the consumed attribute length
+  * with verify_attr_len().  The first non-zero status is returned.
+  */
+ static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
+ {
+ 	uint32_t attrlen, bitmap[3] = {0};
+ 	unsigned int savep;
+ 	int status;
+
+ 	status = decode_op_hdr(xdr, OP_GETATTR);
+ 	if (status != 0)
+ 		goto out;
+ 	status = decode_attr_bitmap(xdr, bitmap);
+ 	if (status != 0)
+ 		goto out;
+ 	status = decode_attr_length(xdr, &attrlen, &savep);
+ 	if (status != 0)
+ 		goto out;
+
+ 	status = decode_attr_maxlink(xdr, bitmap, &pathconf->max_link);
+ 	if (status != 0)
+ 		goto out;
+ 	status = decode_attr_maxname(xdr, bitmap, &pathconf->max_namelen);
+ 	if (status != 0)
+ 		goto out;
+
+ 	status = verify_attr_len(xdr, savep, attrlen);
+ out:
+ 	dprintk("%s: xdr returned %d!\n", __func__, -status);
+ 	return status;
+ }
+
+ /*
+  * Decode one 64-bit threshold hint.
+  *
+  * @res is always reset to 0 first.  The 8-byte value is read from the
+  * stream only when @hint_bit is set in bitmap[0].  Returns 0, or -EIO when
+  * the stream is too short.
+  */
+ static int decode_threshold_hint(struct xdr_stream *xdr,
+ 				 uint32_t *bitmap,
+ 				 uint64_t *res,
+ 				 uint32_t hint_bit)
+ {
+ 	__be32 *p;
+
+ 	*res = 0;
+ 	if (unlikely(!(bitmap[0] & hint_bit)))
+ 		return 0;
+
+ 	p = xdr_inline_decode(xdr, 8);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	xdr_decode_hyper(p, res);
+ 	return 0;
+ }
+
+ /*
+  * Decode a single threshold item into @res.
+  *
+  * Reads the layout type, the hint bitmap and the hint-list length, then the
+  * four hints (rd_sz, wr_sz, rd_io_sz, wr_io_sz) via decode_threshold_hint().
+  * After verify_attr_len() the first bitmap word is saved in res->bm.
+  * Returns -EIO if the layout type cannot be read, otherwise the status of
+  * the last decode step.
+  */
+ static int decode_first_threshold_item4(struct xdr_stream *xdr,
+ 					struct nfs4_threshold *res)
+ {
+ 	uint32_t bitmap[3] = {0,}, attrlen;
+ 	unsigned int savep;
+ 	__be32 *p;
+ 	int ret;
+
+ 	/* layout type */
+ 	p = xdr_inline_decode(xdr, 4);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	res->l_type = be32_to_cpup(p);
+
+ 	/* thi_hintset bitmap */
+ 	ret = decode_attr_bitmap(xdr, bitmap);
+ 	if (ret < 0)
+ 		goto out;
+
+ 	/* thi_hintlist length */
+ 	ret = decode_attr_length(xdr, &attrlen, &savep);
+ 	if (ret < 0)
+ 		goto out;
+
+ 	/* thi_hintlist */
+ 	ret = decode_threshold_hint(xdr, bitmap, &res->rd_sz, THRESHOLD_RD);
+ 	if (ret < 0)
+ 		goto out;
+ 	ret = decode_threshold_hint(xdr, bitmap, &res->wr_sz, THRESHOLD_WR);
+ 	if (ret < 0)
+ 		goto out;
+ 	ret = decode_threshold_hint(xdr, bitmap, &res->rd_io_sz,
+ 				    THRESHOLD_RD_IO);
+ 	if (ret < 0)
+ 		goto out;
+ 	ret = decode_threshold_hint(xdr, bitmap, &res->wr_io_sz,
+ 				    THRESHOLD_WR_IO);
+ 	if (ret < 0)
+ 		goto out;
+
+ 	ret = verify_attr_len(xdr, savep, attrlen);
+ 	res->bm = bitmap[0];
+
+ 	dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
+ 		__func__, res->bm, res->rd_sz, res->wr_sz, res->rd_io_sz,
+ 		res->wr_io_sz);
+ out:
+ 	dprintk("%s ret=%d!\n", __func__, ret);
+ 	return ret;
+ }
+
+/*
+ * Thresholds on pNFS direct I/O vrs MDS I/O
+ */
+static int decode_attr_mdsthreshold(struct xdr_stream *xdr,
+ uint32_t *bitmap,
+ struct nfs4_threshold *res)
+{
+ __be32 *p;
+ int status = 0;
+ uint32_t num;
+
+ if (unlikely(bitmap[2] & (FATTR4_WORD2_MDSTHRESHOLD - 1U)))
+ return -EIO;
+ if (bitmap[2] & FATTR4_WORD2_MDSTHRESHOLD) {
+ /* Did the server return an unrequested attribute? */
+ if (unlikely(res == NULL))
+ return -EREMOTEIO;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ num = be32_to_cpup(p);
+ if (num == 0)
+ return 0;
+ if (num > 1)
+ printk(KERN_INFO "%s: Warning: Multiple pNFS layout "
+ "drivers per filesystem not supported\n",
+ __func__);
+
+ status = decode_first_threshold_item4(xdr, res);
+ bitmap[2] &= ~FATTR4_WORD2_MDSTHRESHOLD;
+ }
+ return status;
+}
+
+ /*
+  * Decode the attribute values of a GETATTR reply into @fattr.
+  *
+  * Attributes are decoded in a fixed order by the decode_attr_*() helpers.
+  * A negative helper status aborts decoding and is returned.  For most
+  * attributes a non-negative status is OR-ed into fattr->valid; the error
+  * and filehandle attributes are decoded without touching fattr->valid.
+  * bitmap[0] must be empty once the first group is done and bitmap[1] once
+  * the second group is done, otherwise -EIO is returned.  The security
+  * label is decoded only when @label is non-NULL.
+  *
+  * Returns a negative error, or the non-negative status of the last helper
+  * that ran.
+  */
+ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
+ 		struct nfs_fattr *fattr, struct nfs_fh *fh,
+ 		struct nfs4_fs_locations *fs_loc, struct nfs4_label *label,
+ 		const struct nfs_server *server)
+ {
+ 	umode_t perm_bits = 0;
+ 	int32_t attr_err = 0;
+ 	uint32_t ftype;
+ 	int status;
+
+ 	/* --- attributes carried in bitmap word 0 --- */
+ 	status = decode_attr_type(xdr, bitmap, &ftype);
+ 	if (status < 0)
+ 		goto out;
+ 	if (status != 0) {
+ 		fattr->mode = nfs_type2fmt[ftype];
+ 		fattr->valid |= status;
+ 	} else {
+ 		fattr->mode = 0;
+ 	}
+
+ 	status = decode_attr_change(xdr, bitmap, &fattr->change_attr);
+ 	if (status < 0)
+ 		goto out;
+ 	fattr->valid |= status;
+
+ 	status = decode_attr_size(xdr, bitmap, &fattr->size);
+ 	if (status < 0)
+ 		goto out;
+ 	fattr->valid |= status;
+
+ 	status = decode_attr_fsid(xdr, bitmap, &fattr->fsid);
+ 	if (status < 0)
+ 		goto out;
+ 	fattr->valid |= status;
+
+ 	/* The error value is consumed but not reported through fattr->valid. */
+ 	status = decode_attr_error(xdr, bitmap, &attr_err);
+ 	if (status < 0)
+ 		goto out;
+
+ 	status = decode_attr_filehandle(xdr, bitmap, fh);
+ 	if (status < 0)
+ 		goto out;
+
+ 	status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
+ 	if (status < 0)
+ 		goto out;
+ 	fattr->valid |= status;
+
+ 	status = decode_attr_fs_locations(xdr, bitmap, fs_loc);
+ 	if (status < 0)
+ 		goto out;
+ 	fattr->valid |= status;
+
+ 	if (unlikely(bitmap[0])) {
+ 		status = -EIO;
+ 		goto out;
+ 	}
+
+ 	/* --- attributes carried in bitmap word 1 --- */
+ 	status = decode_attr_mode(xdr, bitmap, &perm_bits);
+ 	if (status < 0)
+ 		goto out;
+ 	if (status != 0) {
+ 		fattr->mode |= perm_bits;
+ 		fattr->valid |= status;
+ 	}
+
+ 	status = decode_attr_nlink(xdr, bitmap, &fattr->nlink);
+ 	if (status < 0)
+ 		goto out;
+ 	fattr->valid |= status;
+
+ 	status = decode_attr_owner(xdr, bitmap, server, &fattr->uid,
+ 				   fattr->owner_name);
+ 	if (status < 0)
+ 		goto out;
+ 	fattr->valid |= status;
+
+ 	status = decode_attr_group(xdr, bitmap, server, &fattr->gid,
+ 				   fattr->group_name);
+ 	if (status < 0)
+ 		goto out;
+ 	fattr->valid |= status;
+
+ 	status = decode_attr_rdev(xdr, bitmap, &fattr->rdev);
+ 	if (status < 0)
+ 		goto out;
+ 	fattr->valid |= status;
+
+ 	status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used);
+ 	if (status < 0)
+ 		goto out;
+ 	fattr->valid |= status;
+
+ 	status = decode_attr_time_access(xdr, bitmap, &fattr->atime);
+ 	if (status < 0)
+ 		goto out;
+ 	fattr->valid |= status;
+
+ 	status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime);
+ 	if (status < 0)
+ 		goto out;
+ 	fattr->valid |= status;
+
+ 	status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime);
+ 	if (status < 0)
+ 		goto out;
+ 	fattr->valid |= status;
+
+ 	status = decode_attr_mounted_on_fileid(xdr, bitmap,
+ 					       &fattr->mounted_on_fileid);
+ 	if (status < 0)
+ 		goto out;
+ 	fattr->valid |= status;
+
+ 	if (unlikely(bitmap[1])) {
+ 		status = -EIO;
+ 		goto out;
+ 	}
+
+ 	/* --- attributes carried in bitmap word 2 --- */
+ 	status = decode_attr_mdsthreshold(xdr, bitmap, fattr->mdsthreshold);
+ 	if (status < 0)
+ 		goto out;
+
+ 	if (label) {
+ 		status = decode_attr_security_label(xdr, bitmap, label);
+ 		if (status < 0)
+ 			goto out;
+ 		fattr->valid |= status;
+ 	}
+
+ out:
+ 	dprintk("%s: xdr returned %d\n", __func__, -status);
+ 	return status;
+ }
+
+ /*
+  * Decode a complete GETATTR reply: op header, attribute bitmap, attribute
+  * length, the attribute values (via decode_getfattr_attrs()) and finally a
+  * length check with verify_attr_len().  A negative status from any step is
+  * returned immediately.
+  */
+ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+ 		struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc,
+ 		struct nfs4_label *label, const struct nfs_server *server)
+ {
+ 	uint32_t bitmap[3] = {0};
+ 	uint32_t attrlen;
+ 	unsigned int savep;
+ 	int ret;
+
+ 	ret = decode_op_hdr(xdr, OP_GETATTR);
+ 	if (ret < 0)
+ 		goto out;
+
+ 	ret = decode_attr_bitmap(xdr, bitmap);
+ 	if (ret < 0)
+ 		goto out;
+
+ 	ret = decode_attr_length(xdr, &attrlen, &savep);
+ 	if (ret < 0)
+ 		goto out;
+
+ 	ret = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc,
+ 				    label, server);
+ 	if (ret < 0)
+ 		goto out;
+
+ 	ret = verify_attr_len(xdr, savep, attrlen);
+ out:
+ 	dprintk("%s: xdr returned %d\n", __func__, -ret);
+ 	return ret;
+ }
+
+ /*
+  * GETATTR decode that also fills @label; no file handle or fs_locations
+  * are requested (both are passed as NULL).
+  */
+ static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+ 		struct nfs4_label *label, const struct nfs_server *server)
+ {
+ 	struct nfs_fh *no_fh = NULL;
+ 	struct nfs4_fs_locations *no_locs = NULL;
+
+ 	return decode_getfattr_generic(xdr, fattr, no_fh, no_locs, label, server);
+ }
+
+ /*
+  * Plain GETATTR decode: file handle, fs_locations and security label are
+  * all passed as NULL to decode_getfattr_generic().
+  */
+ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+ 		const struct nfs_server *server)
+ {
+ 	int ret;
+
+ 	ret = decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server);
+ 	return ret;
+ }
+
+/*
+ * Decode potentially multiple layout types.
+ */
+static int decode_pnfs_layout_types(struct xdr_stream *xdr,
+ struct nfs_fsinfo *fsinfo)
+{
+ __be32 *p;
+ uint32_t i;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ fsinfo->nlayouttypes = be32_to_cpup(p);
+
+ /* pNFS is not supported by the underlying file system */
+ if (fsinfo->nlayouttypes == 0)
+ return 0;
+
+ /* Decode and set first layout type, move xdr->p past unused types */
+ p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4);
+ if (unlikely(!p))
+ return -EIO;
+
+ /* If we get too many, then just cap it at the max */
+ if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) {
+ printk(KERN_INFO "NFS: %s: Warning: Too many (%u) pNFS layout types\n",
+ __func__, fsinfo->nlayouttypes);
+ fsinfo->nlayouttypes = NFS_MAX_LAYOUT_TYPES;
+ }
+
+ for(i = 0; i < fsinfo->nlayouttypes; ++i)
+ fsinfo->layouttype[i] = be32_to_cpup(p++);
+ return 0;
+}
+
+/*
+ * The type of file system exported.
+ * Note we must ensure that layouttype is set in any non-error case.
+ */
+static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
+ struct nfs_fsinfo *fsinfo)
+{
+ int status = 0;
+
+ dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
+ if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
+ return -EIO;
+ if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) {
+ status = decode_pnfs_layout_types(xdr, fsinfo);
+ bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
+ }
+ return status;
+}
+
+/*
+ * The prefered block size for layout directed io
+ */
+static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
+ uint32_t *res)
+{
+ __be32 *p;
+
+ dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+ *res = 0;
+ if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *res = be32_to_cpup(p);
+ bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
+ }
+ return 0;
+}
+
+/*
+ * The granularity of a CLONE operation.
+ */
+static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
+ uint32_t *res)
+{
+ __be32 *p;
+
+ dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+ *res = 0;
+ if (bitmap[2] & FATTR4_WORD2_CLONE_BLKSIZE) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ *res = be32_to_cpup(p);
+ bitmap[2] &= ~FATTR4_WORD2_CLONE_BLKSIZE;
+ }
+ return 0;
+}
+
+ /*
+  * Decode the GETATTR reply used for fsinfo.
+  *
+  * rtmult and wtmult are fixed at 512.  The lease time, maximum file size,
+  * maximum read size and maximum write size are decoded first; rtpref and
+  * dtpref mirror rtmax, and wtpref mirrors wtmax.  bitmap[0] must then be
+  * empty.  Next come time_delta and the pNFS layout types, after which
+  * bitmap[1] must be empty.  Finally the layout block size, clone block size
+  * and xattr support are decoded and the consumed length is checked with
+  * verify_attr_len().  The first non-zero status is returned; leftover
+  * bitmap bits yield -EIO.
+  */
+ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
+ {
+ 	unsigned int savep;
+ 	/* Zero-initialised like every other attribute bitmap in this file. */
+ 	uint32_t attrlen, bitmap[3] = {0};
+ 	int status;
+
+ 	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+ 		goto xdr_error;
+ 	if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+ 		goto xdr_error;
+ 	if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+ 		goto xdr_error;
+
+ 	fsinfo->rtmult = fsinfo->wtmult = 512;	/* ??? */
+
+ 	if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0)
+ 		goto xdr_error;
+ 	if ((status = decode_attr_maxfilesize(xdr, bitmap, &fsinfo->maxfilesize)) != 0)
+ 		goto xdr_error;
+ 	if ((status = decode_attr_maxread(xdr, bitmap, &fsinfo->rtmax)) != 0)
+ 		goto xdr_error;
+ 	fsinfo->rtpref = fsinfo->dtpref = fsinfo->rtmax;
+ 	if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
+ 		goto xdr_error;
+ 	fsinfo->wtpref = fsinfo->wtmax;
+
+ 	/* Nothing may be left over in the first bitmap word. */
+ 	status = -EIO;
+ 	if (unlikely(bitmap[0]))
+ 		goto xdr_error;
+
+ 	status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
+ 	if (status != 0)
+ 		goto xdr_error;
+ 	status = decode_attr_pnfstype(xdr, bitmap, fsinfo);
+ 	if (status != 0)
+ 		goto xdr_error;
+
+ 	/* Nothing may be left over in the second bitmap word. */
+ 	status = -EIO;
+ 	if (unlikely(bitmap[1]))
+ 		goto xdr_error;
+
+ 	status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
+ 	if (status)
+ 		goto xdr_error;
+ 	status = decode_attr_clone_blksize(xdr, bitmap, &fsinfo->clone_blksize);
+ 	if (status)
+ 		goto xdr_error;
+
+ 	status = decode_attr_xattrsupport(xdr, bitmap,
+ 					  &fsinfo->xattr_support);
+ 	if (status)
+ 		goto xdr_error;
+
+ 	status = verify_attr_len(xdr, savep, attrlen);
+ xdr_error:
+ 	dprintk("%s: xdr returned %d!\n", __func__, -status);
+ 	return status;
+ }
+
+ /*
+  * Decode a GETFH reply into @fh.
+  *
+  * @fh is zeroed before anything else so that it stays comparable even on
+  * failure.  A handle longer than NFS4_FHSIZE, or a short stream, yields
+  * -EIO; a non-zero op-header status is returned unchanged.
+  */
+ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
+ {
+ 	uint32_t fhlen;
+ 	__be32 *p;
+ 	int ret;
+
+ 	/* Zero handle first to allow comparisons */
+ 	memset(fh, 0, sizeof(*fh));
+
+ 	ret = decode_op_hdr(xdr, OP_GETFH);
+ 	if (ret)
+ 		return ret;
+
+ 	p = xdr_inline_decode(xdr, 4);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	fhlen = be32_to_cpup(p);
+ 	if (fhlen > NFS4_FHSIZE)
+ 		return -EIO;
+ 	fh->size = fhlen;
+
+ 	p = xdr_inline_decode(xdr, fhlen);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	memcpy(fh->data, p, fhlen);
+ 	return 0;
+ }
+
+ /*
+  * Decode a LINK reply: op header followed by the change info, which is
+  * stored in @cinfo.  A non-zero header status is returned unchanged.
+  */
+ static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
+ {
+ 	int ret = decode_op_hdr(xdr, OP_LINK);
+
+ 	if (!ret)
+ 		ret = decode_change_info(xdr, cinfo);
+ 	return ret;
+ }
+
+/*
+ * We create the owner, so we know a proper owner.id length is 4.
+ */
+static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
+{
+ uint64_t offset, length, clientid;
+ __be32 *p;
+ uint32_t namelen, type;
+
+ p = xdr_inline_decode(xdr, 32); /* read 32 bytes */
+ if (unlikely(!p))
+ return -EIO;
+ p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */
+ p = xdr_decode_hyper(p, &length);
+ type = be32_to_cpup(p++); /* 4 byte read */
+ if (fl != NULL) { /* manipulate file lock */
+ fl->fl_start = (loff_t)offset;
+ fl->fl_end = fl->fl_start + (loff_t)length - 1;
+ if (length == ~(uint64_t)0)
+ fl->fl_end = OFFSET_MAX;
+ fl->fl_type = F_WRLCK;
+ if (type & 1)
+ fl->fl_type = F_RDLCK;
+ fl->fl_pid = 0;
+ }
+ p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
+ namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */
+ p = xdr_inline_decode(xdr, namelen); /* variable size field */
+ if (likely(!p))
+ return -EIO;
+ return -NFS4ERR_DENIED;
+}
+
+ /*
+  * Decode a LOCK reply.
+  *
+  * -EIO from the op header, or a failure to decode the lock stateid, is
+  * returned without touching any seqid.  On -NFS4ERR_DENIED the denial
+  * payload is consumed via decode_lock_denied().  In every remaining case
+  * the open seqid (when res->open_seqid is set) and the lock seqid are
+  * advanced with the resulting status.
+  */
+ static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
+ {
+ 	int ret;
+
+ 	ret = decode_op_hdr(xdr, OP_LOCK);
+ 	if (ret == -EIO)
+ 		return ret;
+
+ 	if (ret == 0) {
+ 		ret = decode_lock_stateid(xdr, &res->stateid);
+ 		if (unlikely(ret))
+ 			return ret;
+ 	} else if (ret == -NFS4ERR_DENIED) {
+ 		ret = decode_lock_denied(xdr, NULL);
+ 	}
+
+ 	if (res->open_seqid != NULL)
+ 		nfs_increment_open_seqid(ret, res->open_seqid);
+ 	nfs_increment_lock_seqid(ret, res->lock_seqid);
+ 	return ret;
+ }
+
+ /*
+  * Decode a LOCKT reply.  When the header reports -NFS4ERR_DENIED the
+  * conflicting lock is decoded into res->denied; any other header status is
+  * returned unchanged.
+  */
+ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res)
+ {
+ 	int ret = decode_op_hdr(xdr, OP_LOCKT);
+
+ 	if (ret != -NFS4ERR_DENIED)
+ 		return ret;
+ 	return decode_lock_denied(xdr, res->denied);
+ }
+
+ /*
+  * Decode a LOCKU reply.  The lock seqid is advanced for every op-header
+  * outcome except -EIO; on success the lock stateid is decoded into
+  * res->stateid.
+  */
+ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
+ {
+ 	int ret = decode_op_hdr(xdr, OP_LOCKU);
+
+ 	if (ret != -EIO)
+ 		nfs_increment_lock_seqid(ret, res->seqid);
+ 	if (ret != 0)
+ 		return ret;
+ 	return decode_lock_stateid(xdr, &res->stateid);
+ }
+
+ /* Decode the op header of a RELEASE_LOCKOWNER reply; there is no body. */
+ static int decode_release_lockowner(struct xdr_stream *xdr)
+ {
+ 	int ret = decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER);
+
+ 	return ret;
+ }
+
+ /* Decode the op header of a LOOKUP reply; there is no body. */
+ static int decode_lookup(struct xdr_stream *xdr)
+ {
+ 	int ret = decode_op_hdr(xdr, OP_LOOKUP);
+
+ 	return ret;
+ }
+
+ /* Decode the op header of a LOOKUPP reply; there is no body. */
+ static int decode_lookupp(struct xdr_stream *xdr)
+ {
+ 	int ret = decode_op_hdr(xdr, OP_LOOKUPP);
+
+ 	return ret;
+ }
+
+ /*
+  * This is too sick!
+  *
+  * Decode a 12-byte space limit and convert it to a page count.  For
+  * NFS4_LIMIT_SIZE the limit is a 64-bit byte count; for NFS4_LIMIT_BLOCKS
+  * it is the product of a block count and a block size.  Any other limit
+  * type leaves the limit at 0.  The byte limit is shifted right by
+  * PAGE_SHIFT and clamped to ULONG_MAX before being stored in
+  * @pagemod_limit.  Returns 0, or -EIO on a short stream.
+  */
+ static int decode_space_limit(struct xdr_stream *xdr,
+ 		unsigned long *pagemod_limit)
+ {
+ 	uint32_t limit_type, nblocks, blocksize;
+ 	u64 nbytes = 0;
+ 	__be32 *p;
+
+ 	p = xdr_inline_decode(xdr, 12);
+ 	if (unlikely(!p))
+ 		return -EIO;
+
+ 	limit_type = be32_to_cpup(p++);
+ 	if (limit_type == NFS4_LIMIT_SIZE) {
+ 		xdr_decode_hyper(p, &nbytes);
+ 	} else if (limit_type == NFS4_LIMIT_BLOCKS) {
+ 		nblocks = be32_to_cpup(p++);
+ 		blocksize = be32_to_cpup(p);
+ 		nbytes = (uint64_t)nblocks * (uint64_t)blocksize;
+ 	}
+
+ 	nbytes >>= PAGE_SHIFT;
+ 	*pagemod_limit = min_t(u64, nbytes, ULONG_MAX);
+ 	return 0;
+ }
+
+ /*
+  * Decode the body of a read or write delegation.
+  *
+  * Stores the delegation stateid and the recall flag in @res.  A read
+  * delegation sets res->delegation_type to FMODE_READ; a write delegation
+  * sets it to FMODE_WRITE|FMODE_READ and additionally decodes the space
+  * limit into res->pagemod_limit.  The trailing ACE is consumed with
+  * decode_ace(xdr, NULL) and its status is returned.
+  */
+ static int decode_rw_delegation(struct xdr_stream *xdr,
+ 		uint32_t delegation_type,
+ 		struct nfs_openres *res)
+ {
+ 	__be32 *p;
+ 	int ret;
+
+ 	ret = decode_delegation_stateid(xdr, &res->delegation);
+ 	if (unlikely(ret))
+ 		return ret;
+
+ 	p = xdr_inline_decode(xdr, 4);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	res->do_recall = be32_to_cpup(p);
+
+ 	if (delegation_type == NFS4_OPEN_DELEGATE_READ) {
+ 		res->delegation_type = FMODE_READ;
+ 	} else if (delegation_type == NFS4_OPEN_DELEGATE_WRITE) {
+ 		res->delegation_type = FMODE_WRITE|FMODE_READ;
+ 		if (decode_space_limit(xdr, &res->pagemod_limit) < 0)
+ 			return -EIO;
+ 	}
+ 	return decode_ace(xdr, NULL);
+ }
+
+ /*
+  * Decode the "no delegation" reason of an OPEN reply.
+  *
+  * The reason code is read and, for WND4_CONTENTION and WND4_RESOURCE, one
+  * additional 4-byte word is consumed (its value is ignored for now).
+  * Nothing is stored in @res.  Returns 0, or -EIO when the stream is too
+  * short for either word.
+  */
+ static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
+ {
+ 	__be32 *p;
+ 	uint32_t why_no_delegation;
+
+ 	p = xdr_inline_decode(xdr, 4);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	why_no_delegation = be32_to_cpup(p);
+ 	switch (why_no_delegation) {
+ 	case WND4_CONTENTION:
+ 	case WND4_RESOURCE:
+ 		/*
+ 		 * Ignore the value for now, but do not report success when
+ 		 * the word is missing from the reply.
+ 		 */
+ 		p = xdr_inline_decode(xdr, 4);
+ 		if (unlikely(!p))
+ 			return -EIO;
+ 		break;
+ 	default:
+ 		break;
+ 	}
+ 	return 0;
+ }
+
+ /*
+  * Decode the delegation part of an OPEN reply.
+  *
+  * res->delegation_type is reset to 0, then the delegation type read from
+  * the stream selects the decoder: none (returns 0), read/write
+  * (decode_rw_delegation()) or "none, with reason" (decode_no_delegation()).
+  * An unknown type or a short stream yields -EIO.
+  */
+ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
+ {
+ 	uint32_t delegation_type;
+ 	int ret = -EIO;
+ 	__be32 *p;
+
+ 	p = xdr_inline_decode(xdr, 4);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	delegation_type = be32_to_cpup(p);
+ 	res->delegation_type = 0;
+
+ 	switch (delegation_type) {
+ 	case NFS4_OPEN_DELEGATE_NONE:
+ 		ret = 0;
+ 		break;
+ 	case NFS4_OPEN_DELEGATE_READ:
+ 	case NFS4_OPEN_DELEGATE_WRITE:
+ 		ret = decode_rw_delegation(xdr, delegation_type, res);
+ 		break;
+ 	case NFS4_OPEN_DELEGATE_NONE_EXT:
+ 		ret = decode_no_delegation(xdr, res);
+ 		break;
+ 	}
+ 	return ret;
+ }
+
+ /*
+  * Decode an OPEN reply.
+  *
+  * If __decode_op_hdr() returns false, the status it stored is returned at
+  * once.  Otherwise the open seqid is advanced with that status and a
+  * non-zero status is returned.  On success the open stateid, change info
+  * and result flags are stored in @res.  The attribute bitmap (at most 10
+  * words are accepted) is copied into res->attrset[], keeping the first
+  * NFS4_BITMAP_SIZE words and zero-filling any that were not sent.  The
+  * delegation is decoded last and its status returned.
+  */
+ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
+ {
+ 	__be32 *p;
+ 	uint32_t savewords, bmlen, i;
+ 	int status;
+
+ 	if (!__decode_op_hdr(xdr, OP_OPEN, &status))
+ 		return status;
+ 	nfs_increment_open_seqid(status, res->seqid);
+ 	if (status)
+ 		return status;
+ 	status = decode_open_stateid(xdr, &res->stateid);
+ 	if (unlikely(status))
+ 		return status;
+
+ 	/*
+ 	 * Propagate a change-info failure like the CREATE/LINK/REMOVE/RENAME
+ 	 * decoders do, rather than carrying on with the rest of the reply.
+ 	 */
+ 	status = decode_change_info(xdr, &res->cinfo);
+ 	if (unlikely(status))
+ 		return status;
+
+ 	p = xdr_inline_decode(xdr, 8);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	res->rflags = be32_to_cpup(p++);
+ 	bmlen = be32_to_cpup(p);
+ 	if (bmlen > 10)
+ 		goto xdr_error;
+
+ 	p = xdr_inline_decode(xdr, bmlen << 2);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
+ 	for (i = 0; i < savewords; ++i)
+ 		res->attrset[i] = be32_to_cpup(p++);
+ 	for (; i < NFS4_BITMAP_SIZE; i++)
+ 		res->attrset[i] = 0;
+
+ 	return decode_delegation(xdr, res);
+ xdr_error:
+ 	dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen);
+ 	return -EIO;
+ }
+
+ /*
+  * Decode an OPEN_CONFIRM reply.  The open seqid is advanced for every
+  * op-header outcome except -EIO; on success the open stateid is decoded
+  * into res->stateid.
+  */
+ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
+ {
+ 	int ret = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
+
+ 	if (ret != -EIO)
+ 		nfs_increment_open_seqid(ret, res->seqid);
+ 	if (ret)
+ 		return ret;
+ 	return decode_open_stateid(xdr, &res->stateid);
+ }
+
+ /*
+  * Decode an OPEN_DOWNGRADE reply.  The open seqid is advanced for every
+  * op-header outcome except -EIO; on success the open stateid is decoded
+  * into res->stateid.
+  */
+ static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
+ {
+ 	int ret = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
+
+ 	if (ret != -EIO)
+ 		nfs_increment_open_seqid(ret, res->seqid);
+ 	if (ret)
+ 		return ret;
+ 	return decode_open_stateid(xdr, &res->stateid);
+ }
+
+ /* Decode the op header of a PUTFH reply; there is no body. */
+ static int decode_putfh(struct xdr_stream *xdr)
+ {
+ 	int ret = decode_op_hdr(xdr, OP_PUTFH);
+
+ 	return ret;
+ }
+
+ /* Decode the op header of a PUTROOTFH reply; there is no body. */
+ static int decode_putrootfh(struct xdr_stream *xdr)
+ {
+ 	int ret = decode_op_hdr(xdr, OP_PUTROOTFH);
+
+ 	return ret;
+ }
+
+ /*
+  * Decode a READ reply.
+  *
+  * Reads the eof flag and byte count, then hands the data to
+  * xdr_read_pages().  If fewer bytes were received than the server claimed,
+  * the count is clamped to what was received and eof is cleared.  The
+  * results are stored in res->eof and res->count.  @req is not used.
+  */
+ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req,
+ 		       struct nfs_pgio_res *res)
+ {
+ 	uint32_t count, eof, recvd;
+ 	__be32 *p;
+ 	int ret;
+
+ 	ret = decode_op_hdr(xdr, OP_READ);
+ 	if (ret)
+ 		return ret;
+
+ 	p = xdr_inline_decode(xdr, 8);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	eof = be32_to_cpup(p++);
+ 	count = be32_to_cpup(p);
+
+ 	recvd = xdr_read_pages(xdr, count);
+ 	if (recvd < count) {
+ 		dprintk("NFS: server cheating in read reply: "
+ 				"count %u > recvd %u\n", count, recvd);
+ 		res->eof = 0;
+ 		res->count = recvd;
+ 	} else {
+ 		res->eof = eof;
+ 		res->count = count;
+ 	}
+ 	return 0;
+ }
+
+ /*
+  * Decode a READDIR reply: the verifier is stored in readdir->verifier and
+  * logged, then xdr_read_pages() is called for the whole page section of the
+  * buffer and its result is returned.  @req is not used.
+  */
+ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
+ {
+ 	__be32 verf[2];
+ 	int ret;
+
+ 	ret = decode_op_hdr(xdr, OP_READDIR);
+ 	if (ret)
+ 		return ret;
+ 	ret = decode_verifier(xdr, readdir->verifier.data);
+ 	if (unlikely(ret))
+ 		return ret;
+
+ 	/* Local copy of the verifier, used only for the debug message. */
+ 	memcpy(verf, readdir->verifier.data, sizeof(verf));
+ 	dprintk("%s: verifier = %08x:%08x\n",
+ 			__func__, verf[0], verf[1]);
+ 	return xdr_read_pages(xdr, xdr->buf->page_len);
+ }
+
+ /*
+  * Decode a READLINK reply.
+  *
+  * A link length of zero, or one that does not leave room inside the page
+  * section of the receive buffer, yields -ENAMETOOLONG.  If fewer bytes were
+  * received than announced, -EIO is returned.  On success the link text in
+  * the receive buffer is null-terminated.
+  */
+ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
+ {
+ 	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
+ 	u32 linklen, recvd;
+ 	__be32 *p;
+ 	int ret;
+
+ 	ret = decode_op_hdr(xdr, OP_READLINK);
+ 	if (ret)
+ 		return ret;
+
+ 	/* Convert length of symlink */
+ 	p = xdr_inline_decode(xdr, 4);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	linklen = be32_to_cpup(p);
+ 	if (linklen == 0 || linklen >= rcvbuf->page_len) {
+ 		dprintk("nfs: server returned giant symlink!\n");
+ 		return -ENAMETOOLONG;
+ 	}
+
+ 	recvd = xdr_read_pages(xdr, linklen);
+ 	if (recvd < linklen) {
+ 		dprintk("NFS: server cheating in readlink reply: "
+ 				"count %u > recvd %u\n", linklen, recvd);
+ 		return -EIO;
+ 	}
+
+ 	/*
+ 	 * The XDR encode routine has set things up so that
+ 	 * the link text will be copied directly into the
+ 	 * buffer.  We just have to do overflow-checking,
+ 	 * and null-terminate the text (the VFS expects
+ 	 * null-termination).
+ 	 */
+ 	xdr_terminate_string(rcvbuf, linklen);
+ 	return 0;
+ }
+
+ /*
+  * Decode a REMOVE reply: op header followed by the change info, which is
+  * stored in @cinfo.  A non-zero header status is returned unchanged.
+  */
+ static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
+ {
+ 	int ret = decode_op_hdr(xdr, OP_REMOVE);
+
+ 	if (ret)
+ 		return ret;
+ 	return decode_change_info(xdr, cinfo);
+ }
+
+ /*
+  * Decode a RENAME reply: op header followed by two change-info records,
+  * stored in @old_cinfo and @new_cinfo in that order.  The first non-zero
+  * status is returned.
+  */
+ static int decode_rename(struct xdr_stream *xdr, struct nfs4_change_info *old_cinfo,
+ 			 struct nfs4_change_info *new_cinfo)
+ {
+ 	int ret;
+
+ 	ret = decode_op_hdr(xdr, OP_RENAME);
+ 	if (ret)
+ 		return ret;
+ 	ret = decode_change_info(xdr, old_cinfo);
+ 	if (ret)
+ 		return ret;
+ 	return decode_change_info(xdr, new_cinfo);
+ }
+
+ /* Decode the op header of a RENEW reply; there is no body. */
+ static int decode_renew(struct xdr_stream *xdr)
+ {
+ 	int ret = decode_op_hdr(xdr, OP_RENEW);
+
+ 	return ret;
+ }
+
+ /* Decode the op header of a RESTOREFH reply; there is no body. */
+ static int
+ decode_restorefh(struct xdr_stream *xdr)
+ {
+ 	int ret = decode_op_hdr(xdr, OP_RESTOREFH);
+
+ 	return ret;
+ }
+
+ /*
+  * Decode the GETATTR reply that carries an ACL.
+  *
+  * res->acl_len is reset to 0 first.  After the op header the stream enters
+  * the page section, where the attribute bitmap and attribute length are
+  * decoded.  Any bit below FATTR4_WORD0_ACL in bitmap[0] yields -EIO; a
+  * missing ACL bit yields -EOPNOTSUPP.  Otherwise the ACL offset within the
+  * pages and its length are recorded in @res, and NFS4_ACL_TRUNC is set in
+  * res->acl_flags when the data does not fit what was received.
+  * @req is not used.
+  */
+ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
+ 			 struct nfs_getaclres *res)
+ {
+ 	uint32_t bitmap[3] = {0};
+ 	uint32_t attrlen;
+ 	unsigned int savep;
+ 	int status;
+
+ 	res->acl_len = 0;
+ 	status = decode_op_hdr(xdr, OP_GETATTR);
+ 	if (status != 0)
+ 		return status;
+
+ 	xdr_enter_page(xdr, xdr->buf->page_len);
+
+ 	status = decode_attr_bitmap(xdr, bitmap);
+ 	if (status != 0)
+ 		return status;
+ 	status = decode_attr_length(xdr, &attrlen, &savep);
+ 	if (status != 0)
+ 		return status;
+
+ 	if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U)))
+ 		return -EIO;
+ 	if (unlikely(!(bitmap[0] & FATTR4_WORD0_ACL)))
+ 		return -EOPNOTSUPP;
+
+ 	/* The bitmap (xdr len + bitmaps) and the attr xdr len words
+ 	 * are stored with the acl data to handle the problem of
+ 	 * variable length bitmaps.*/
+ 	res->acl_data_offset = xdr_page_pos(xdr);
+ 	res->acl_len = attrlen;
+
+ 	/* Check for receive buffer overflow */
+ 	if (res->acl_len > (xdr->nwords << 2) ||
+ 	    res->acl_len + res->acl_data_offset > xdr->buf->page_len) {
+ 		res->acl_flags |= NFS4_ACL_TRUNC;
+ 		dprintk("NFS: acl reply: attrlen %u > page_len %u\n",
+ 				attrlen, xdr->nwords << 2);
+ 	}
+ 	return status;
+ }
+
+ /* Decode the op header of a SAVEFH reply; there is no body. */
+ static int
+ decode_savefh(struct xdr_stream *xdr)
+ {
+ 	int ret = decode_op_hdr(xdr, OP_SAVEFH);
+
+ 	return ret;
+ }
+
+ /*
+  * Decode a SETATTR reply.  The bitmap that follows the op header is
+  * consumed with decode_bitmap4(xdr, NULL, 0); a negative result from that
+  * call is reported as -EIO.
+  */
+ static int decode_setattr(struct xdr_stream *xdr)
+ {
+ 	int ret = decode_op_hdr(xdr, OP_SETATTR);
+
+ 	if (ret)
+ 		return ret;
+ 	return decode_bitmap4(xdr, NULL, 0) < 0 ? -EIO : 0;
+ }
+
+ /*
+  * Decode a SETCLIENTID reply.
+  *
+  * The op header is parsed by hand here.  On NFS_OK the clientid and the
+  * confirm verifier are stored in @res and 0 is returned.  On
+  * NFSERR_CLID_INUSE the two strings in the reply (netid, uaddr) are skipped
+  * and -NFSERR_CLID_INUSE is returned.  Any other server status is mapped
+  * through nfs4_stat_to_errno().  An unexpected opcode or a short stream
+  * yields -EIO.
+  */
+ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res)
+ {
+ 	uint32_t opnum, len;
+ 	int32_t nfserr;
+ 	__be32 *p;
+
+ 	p = xdr_inline_decode(xdr, 8);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	opnum = be32_to_cpup(p++);
+ 	if (opnum != OP_SETCLIENTID) {
+ 		dprintk("nfs: decode_setclientid: Server returned operation"
+ 			" %d\n", opnum);
+ 		return -EIO;
+ 	}
+ 	nfserr = be32_to_cpup(p);
+
+ 	if (nfserr == NFS_OK) {
+ 		p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
+ 		if (unlikely(!p))
+ 			return -EIO;
+ 		p = xdr_decode_hyper(p, &res->clientid);
+ 		memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE);
+ 		return 0;
+ 	}
+
+ 	if (nfserr != NFSERR_CLID_INUSE)
+ 		return nfs4_stat_to_errno(nfserr);
+
+ 	/* skip netid string */
+ 	p = xdr_inline_decode(xdr, 4);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	len = be32_to_cpup(p);
+ 	p = xdr_inline_decode(xdr, len);
+ 	if (unlikely(!p))
+ 		return -EIO;
+
+ 	/* skip uaddr string */
+ 	p = xdr_inline_decode(xdr, 4);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	len = be32_to_cpup(p);
+ 	p = xdr_inline_decode(xdr, len);
+ 	if (unlikely(!p))
+ 		return -EIO;
+
+ 	return -NFSERR_CLID_INUSE;
+ }
+
+ /* Decode the op header of a SETCLIENTID_CONFIRM reply; there is no body. */
+ static int decode_setclientid_confirm(struct xdr_stream *xdr)
+ {
+ 	int ret = decode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM);
+
+ 	return ret;
+ }
+
+ /*
+  * Decode a WRITE reply: the byte count goes to res->count, the commitment
+  * level to res->verf->committed and the write verifier to
+  * res->verf->verifier.  Returns the verifier decode status, -EIO on a short
+  * stream, or a non-zero op-header status.
+  */
+ static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res)
+ {
+ 	__be32 *p;
+ 	int ret;
+
+ 	ret = decode_op_hdr(xdr, OP_WRITE);
+ 	if (ret)
+ 		return ret;
+
+ 	p = xdr_inline_decode(xdr, 8);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	res->count = be32_to_cpup(&p[0]);
+ 	res->verf->committed = be32_to_cpup(&p[1]);
+
+ 	return decode_write_verifier(xdr, &res->verf->verifier);
+ }
+
+ /* Decode the op header of a DELEGRETURN reply; there is no body. */
+ static int decode_delegreturn(struct xdr_stream *xdr)
+ {
+ 	int ret = decode_op_hdr(xdr, OP_DELEGRETURN);
+
+ 	return ret;
+ }
+
+ /*
+  * Decode the RPCSEC_GSS details of one SECINFO flavor into
+  * flavor->flavor_info: the OID (rejected with -EINVAL when longer than
+  * GSS_OID_MAX_LEN), followed by the qop and service values.  A short stream
+  * yields -EIO.
+  */
+ static int decode_secinfo_gss(struct xdr_stream *xdr,
+ 			      struct nfs4_secinfo4 *flavor)
+ {
+ 	__be32 *p;
+ 	u32 len;
+
+ 	p = xdr_inline_decode(xdr, 4);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	len = be32_to_cpup(p);
+ 	if (len > GSS_OID_MAX_LEN)
+ 		return -EINVAL;
+
+ 	p = xdr_inline_decode(xdr, len);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	memcpy(flavor->flavor_info.oid.data, p, len);
+ 	flavor->flavor_info.oid.len = len;
+
+ 	p = xdr_inline_decode(xdr, 8);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	flavor->flavor_info.qop = be32_to_cpup(&p[0]);
+ 	flavor->flavor_info.service = be32_to_cpup(&p[1]);
+
+ 	return 0;
+ }
+
+ /*
+  * Decode the flavor list shared by the SECINFO-style replies.
+  *
+  * res->flavors->num_flavors counts the entries that were fully decoded.
+  * Decoding stops quietly (returning 0) once the next entry would extend
+  * past PAGE_SIZE bytes from the start of res->flavors.  RPC_AUTH_GSS
+  * entries carry extra data handled by decode_secinfo_gss(), whose non-zero
+  * status is returned.  A short stream yields -EIO.
+  */
+ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
+ {
+ 	struct nfs4_secinfo4 *entry;
+ 	unsigned int idx, total;
+ 	__be32 *p;
+ 	int ret;
+
+ 	p = xdr_inline_decode(xdr, 4);
+ 	if (unlikely(!p))
+ 		return -EIO;
+
+ 	res->flavors->num_flavors = 0;
+ 	total = be32_to_cpup(p);
+
+ 	for (idx = 0; idx < total; idx++) {
+ 		entry = &res->flavors->flavors[idx];
+ 		/* Stop before an entry that would not fit in the page. */
+ 		if ((char *)(entry + 1) - (char *)res->flavors > PAGE_SIZE)
+ 			break;
+
+ 		p = xdr_inline_decode(xdr, 4);
+ 		if (unlikely(!p))
+ 			return -EIO;
+ 		entry->flavor = be32_to_cpup(p);
+
+ 		if (entry->flavor == RPC_AUTH_GSS) {
+ 			ret = decode_secinfo_gss(xdr, entry);
+ 			if (ret)
+ 				return ret;
+ 		}
+ 		res->flavors->num_flavors++;
+ 	}
+
+ 	return 0;
+ }
+
+ /* Decode a SECINFO reply: op header, then the common flavor list. */
+ static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
+ {
+ 	int ret;
+
+ 	ret = decode_op_hdr(xdr, OP_SECINFO);
+ 	if (!ret)
+ 		ret = decode_secinfo_common(xdr, res);
+ 	return ret;
+ }
+
+#if defined(CONFIG_NFS_V4_1)
+ /* Decode a SECINFO_NO_NAME reply: op header, then the common flavor list. */
+ static int decode_secinfo_no_name(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
+ {
+ 	int ret;
+
+ 	ret = decode_op_hdr(xdr, OP_SECINFO_NO_NAME);
+ 	if (!ret)
+ 		ret = decode_secinfo_common(xdr, res);
+ 	return ret;
+ }
+
+ /*
+  * Decode an operation bitmap into op_map->u.words[].
+  *
+  * The word count is read first and rejected with -EIO when it exceeds
+  * NFS4_OP_MAP_NUM_WORDS; that many 32-bit words are then copied.  Words
+  * beyond the decoded count are left untouched.  Returns 0, or -EIO when the
+  * stream is too short for the count or for the words themselves.
+  */
+ static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map)
+ {
+ 	__be32 *p;
+ 	uint32_t bitmap_words;
+ 	unsigned int i;
+
+ 	p = xdr_inline_decode(xdr, 4);
+ 	if (!p)
+ 		return -EIO;
+ 	bitmap_words = be32_to_cpup(p++);
+ 	if (bitmap_words > NFS4_OP_MAP_NUM_WORDS)
+ 		return -EIO;
+ 	p = xdr_inline_decode(xdr, 4 * bitmap_words);
+ 	/* A truncated reply must not be dereferenced in the loop below. */
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	for (i = 0; i < bitmap_words; i++)
+ 		op_map->u.words[i] = be32_to_cpup(p++);
+
+ 	return 0;
+ }
+
+ /*
+  * Decode an EXCHANGE_ID reply into @res.
+  *
+  * Fields are taken in wire order: clientid, seqid, flags and the state
+  * protection mode (only SP4_NONE and SP4_MACH_CRED are accepted; the latter
+  * carries the "enforce" and "allow" op maps), the server owner's minor and
+  * major ids, the server scope, and finally the implementation id.  Only the
+  * first implementation id entry (domain, name, date) is decoded.
+  *
+  * NOTE(review): the memcpy() lengths below come straight from
+  * decode_opaque_inline(); presumably that helper bounds them to the size of
+  * the destination arrays - confirm against its definition.
+  */
+ static int decode_exchange_id(struct xdr_stream *xdr,
+ 			      struct nfs41_exchange_id_res *res)
+ {
+ 	uint32_t opaque_len, impl_id_count;
+ 	char *opaque;
+ 	__be32 *p;
+ 	int ret;
+
+ 	ret = decode_op_hdr(xdr, OP_EXCHANGE_ID);
+ 	if (ret)
+ 		return ret;
+
+ 	/* clientid */
+ 	p = xdr_inline_decode(xdr, 8);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	xdr_decode_hyper(p, &res->clientid);
+
+ 	/* seqid, flags, state protection mode */
+ 	p = xdr_inline_decode(xdr, 12);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	res->seqid = be32_to_cpup(p++);
+ 	res->flags = be32_to_cpup(p++);
+ 	res->state_protect.how = be32_to_cpup(p);
+
+ 	if (res->state_protect.how == SP4_MACH_CRED) {
+ 		ret = decode_op_map(xdr, &res->state_protect.enforce);
+ 		if (ret)
+ 			return ret;
+ 		ret = decode_op_map(xdr, &res->state_protect.allow);
+ 		if (ret)
+ 			return ret;
+ 	} else if (res->state_protect.how != SP4_NONE) {
+ 		WARN_ON_ONCE(1);
+ 		return -EIO;
+ 	}
+
+ 	/* server_owner4.so_minor_id */
+ 	p = xdr_inline_decode(xdr, 8);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	xdr_decode_hyper(p, &res->server_owner->minor_id);
+
+ 	/* server_owner4.so_major_id */
+ 	ret = decode_opaque_inline(xdr, &opaque_len, &opaque);
+ 	if (unlikely(ret))
+ 		return ret;
+ 	memcpy(res->server_owner->major_id, opaque, opaque_len);
+ 	res->server_owner->major_id_sz = opaque_len;
+
+ 	/* server_scope4 */
+ 	ret = decode_opaque_inline(xdr, &opaque_len, &opaque);
+ 	if (unlikely(ret))
+ 		return ret;
+ 	memcpy(res->server_scope->server_scope, opaque, opaque_len);
+ 	res->server_scope->server_scope_sz = opaque_len;
+
+ 	/* Implementation Id */
+ 	p = xdr_inline_decode(xdr, 4);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	impl_id_count = be32_to_cpup(p);
+ 	if (!impl_id_count)
+ 		return 0;
+
+ 	/* nii_domain */
+ 	ret = decode_opaque_inline(xdr, &opaque_len, &opaque);
+ 	if (unlikely(ret))
+ 		return ret;
+ 	memcpy(res->impl_id->domain, opaque, opaque_len);
+
+ 	/* nii_name */
+ 	ret = decode_opaque_inline(xdr, &opaque_len, &opaque);
+ 	if (unlikely(ret))
+ 		return ret;
+ 	memcpy(res->impl_id->name, opaque, opaque_len);
+
+ 	/* nii_date */
+ 	p = xdr_inline_decode(xdr, 12);
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	p = xdr_decode_hyper(p, &res->impl_id->date.seconds);
+ 	res->impl_id->date.nseconds = be32_to_cpup(p);
+
+ 	/* if there's more than one entry, ignore the rest */
+ 	return 0;
+ }
+
+ /*
+  * Decode one set of channel attributes into @attrs.
+  *
+  * A non-zero header pad size is rejected with -EINVAL (header padding is
+  * not supported yet).  The request/response size limits, max_ops and
+  * max_reqs are stored in @attrs.  At most one rdma attribute is accepted:
+  * a count above 1 is rejected with -EINVAL, and a single entry is skipped.
+  * A short stream yields -EIO.
+  */
+ static int decode_chan_attrs(struct xdr_stream *xdr,
+ 			     struct nfs4_channel_attrs *attrs)
+ {
+ 	u32 nr_attrs, headerpadsz;
+ 	__be32 *p;
+
+ 	p = xdr_inline_decode(xdr, 28);
+ 	if (unlikely(!p))
+ 		return -EIO;
+
+ 	headerpadsz = be32_to_cpup(p++);
+ 	if (headerpadsz)
+ 		return -EINVAL;	/* no support for header padding yet */
+
+ 	attrs->max_rqst_sz = be32_to_cpup(p++);
+ 	attrs->max_resp_sz = be32_to_cpup(p++);
+ 	attrs->max_resp_sz_cached = be32_to_cpup(p++);
+ 	attrs->max_ops = be32_to_cpup(p++);
+ 	attrs->max_reqs = be32_to_cpup(p++);
+
+ 	nr_attrs = be32_to_cpup(p);
+ 	if (unlikely(nr_attrs > 1)) {
+ 		printk(KERN_WARNING "NFS: %s: Invalid rdma channel attrs "
+ 			"count %u\n", __func__, nr_attrs);
+ 		return -EINVAL;
+ 	}
+ 	if (nr_attrs == 0)
+ 		return 0;
+
+ 	p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */
+ 	if (unlikely(!p))
+ 		return -EIO;
+ 	return 0;
+ }
+
+ /* Copy a fixed NFS4_MAX_SESSIONID_LEN-byte session id from the stream into @sid. */
+ static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
+ {
+ 	void *dst = sid->data;
+
+ 	return decode_opaque_fixed(xdr, dst, NFS4_MAX_SESSIONID_LEN);
+ }
+
+/*
+ * Decode a BIND_CONN_TO_SESSION result: the session id, the direction
+ * flags and the rdma-mode boolean.
+ *
+ * Returns 0 on success, the status from decode_op_hdr()/decode_sessionid()
+ * if either fails, or -EIO if the stream is short or the direction value
+ * is 0 or greater than NFS4_CDFS4_BOTH.
+ */
+static int decode_bind_conn_to_session(struct xdr_stream *xdr,
+		struct nfs41_bind_conn_to_session_res *res)
+{
+	int status;
+	__be32 *p;
+
+	status = decode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION);
+	if (unlikely(status))
+		return status;
+	status = decode_sessionid(xdr, &res->sessionid);
+	if (unlikely(status))
+		return status;
+
+	/* two words: dir flags, then the rdma mode bool */
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		return -EIO;
+
+	res->dir = be32_to_cpup(p++);
+	if (res->dir == 0 || res->dir > NFS4_CDFS4_BOTH)
+		return -EIO;
+
+	res->use_conn_in_rdma_mode = (be32_to_cpup(p) != 0);
+	return 0;
+}
+
+/*
+ * Decode a CREATE_SESSION result: session id, seqid, flags, then the
+ * channel attributes stored in res->fc_attrs followed by res->bc_attrs.
+ *
+ * Returns 0 on success, -EIO if the seqid/flags words are missing, or the
+ * first non-zero status reported by a helper.
+ */
+static int decode_create_session(struct xdr_stream *xdr,
+		struct nfs41_create_session_res *res)
+{
+	int status;
+	__be32 *p;
+
+	status = decode_op_hdr(xdr, OP_CREATE_SESSION);
+	if (unlikely(status))
+		return status;
+	status = decode_sessionid(xdr, &res->sessionid);
+	if (unlikely(status))
+		return status;
+
+	/* seqid, flags */
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		return -EIO;
+	res->seqid = be32_to_cpup(p++);
+	res->flags = be32_to_cpup(p);
+
+	/* Channel attributes: fc_attrs first, bc_attrs second */
+	status = decode_chan_attrs(xdr, &res->fc_attrs);
+	if (status)
+		return status;
+	return decode_chan_attrs(xdr, &res->bc_attrs);
+}
+
+/*
+ * Decode a DESTROY_SESSION result; only the operation header is examined.
+ * @dummy is not used.
+ */
+static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
+{
+	int status = decode_op_hdr(xdr, OP_DESTROY_SESSION);
+
+	return status;
+}
+
+/*
+ * Decode a DESTROY_CLIENTID result; only the operation header is examined.
+ * @dummy is not used.
+ */
+static int decode_destroy_clientid(struct xdr_stream *xdr, void *dummy)
+{
+	int status = decode_op_hdr(xdr, OP_DESTROY_CLIENTID);
+
+	return status;
+}
+
+/*
+ * Decode a RECLAIM_COMPLETE result; only the operation header is examined.
+ * @dummy is not used.
+ */
+static int decode_reclaim_complete(struct xdr_stream *xdr, void *dummy)
+{
+	int status = decode_op_hdr(xdr, OP_RECLAIM_COMPLETE);
+
+	return status;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Decode a SEQUENCE result and check it against the slot in @res.
+ *
+ * Without CONFIG_NFS_V4_1, or when @res has no slot or the slot's table
+ * has no session, nothing is decoded and 0 is returned.
+ *
+ * Otherwise the session id, sequence number and slot id in the reply must
+ * match the session and slot recorded in @res; a mismatch yields
+ * -EREMOTEIO.  A short stream yields -EIO.  On the decoding path the
+ * outcome is also stored in res->sr_status.
+ */
+static int decode_sequence(struct xdr_stream *xdr,
+		struct nfs4_sequence_res *res,
+		struct rpc_rqst *rqstp)
+{
+#if defined(CONFIG_NFS_V4_1)
+	struct nfs4_session *session;
+	struct nfs4_sessionid id;
+	u32 seqid, slotid;
+	int status;
+	__be32 *p;
+
+	if (res->sr_slot == NULL || !res->sr_slot->table->session)
+		return 0;
+
+	status = decode_op_hdr(xdr, OP_SEQUENCE);
+	if (unlikely(status))
+		goto out_err;
+	status = decode_sessionid(xdr, &id);
+	if (unlikely(status))
+		goto out_err;
+
+	/*
+	 * If the server returns different values for sessionID, slotID or
+	 * sequence number, the server is looney tunes.
+	 */
+	status = -EREMOTEIO;
+	session = res->sr_slot->table->session;
+
+	if (memcmp(id.data, session->sess_id.data,
+		   NFS4_MAX_SESSIONID_LEN) != 0) {
+		dprintk("%s Invalid session id\n", __func__);
+		goto out_err;
+	}
+
+	/* seqid, slot id, highest slot id, target highest slot id, flags */
+	p = xdr_inline_decode(xdr, 20);
+	if (unlikely(!p)) {
+		status = -EIO;
+		goto out_err;
+	}
+
+	seqid = be32_to_cpup(p++);
+	if (seqid != res->sr_slot->seq_nr) {
+		dprintk("%s Invalid sequence number\n", __func__);
+		goto out_err;
+	}
+
+	slotid = be32_to_cpup(p++);
+	if (slotid != res->sr_slot->slot_nr) {
+		dprintk("%s Invalid slot id\n", __func__);
+		goto out_err;
+	}
+
+	res->sr_highest_slotid = be32_to_cpup(p++);
+	res->sr_target_highest_slotid = be32_to_cpup(p++);
+	res->sr_status_flags = be32_to_cpup(p);
+	status = 0;
+out_err:
+	res->sr_status = status;
+	return status;
+#else  /* CONFIG_NFS_V4_1 */
+	return 0;
+#endif /* CONFIG_NFS_V4_1 */
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Tag @stateid as NFS4_LAYOUT_STATEID_TYPE, then decode it from the
+ * stream with decode_stateid().  Returns decode_stateid()'s status.
+ */
+static int decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	int status;
+
+	stateid->type = NFS4_LAYOUT_STATEID_TYPE;
+	status = decode_stateid(xdr, stateid);
+	return status;
+}
+
+/*
+ * Decode a GETDEVICEINFO result.
+ *
+ * If the operation header reports -ETOOSMALL, the following word is stored
+ * in pdev->mincount before that status is returned.  On success the layout
+ * type in the reply must equal pdev->layout_type, the opaque device address
+ * length is stored in pdev->mincount and handed to xdr_read_pages(), and
+ * the notification bitmap is parsed: its first word is stored in
+ * res->notification and every further word must be zero.
+ *
+ * Returns 0 on success, the status from decode_op_hdr() on an operation
+ * error, -EINVAL on a layout type mismatch, or -EIO if the stream is short,
+ * the bitmap word count is implausibly large, or a bitmap word past the
+ * first one is non-zero.
+ */
+static int decode_getdeviceinfo(struct xdr_stream *xdr,
+		struct nfs4_getdeviceinfo_res *res)
+{
+	struct pnfs_device *pdev = res->pdev;
+	__be32 *p;
+	uint32_t len, type;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_GETDEVICEINFO);
+	if (status) {
+		if (status == -ETOOSMALL) {
+			p = xdr_inline_decode(xdr, 4);
+			if (unlikely(!p))
+				return -EIO;
+			pdev->mincount = be32_to_cpup(p);
+			dprintk("%s: Min count too small. mincnt = %u\n",
+				__func__, pdev->mincount);
+		}
+		return status;
+	}
+
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		return -EIO;
+	type = be32_to_cpup(p++);
+	if (type != pdev->layout_type) {
+		dprintk("%s: layout mismatch req: %u pdev: %u\n",
+			__func__, pdev->layout_type, type);
+		return -EINVAL;
+	}
+	/*
+	 * Get the length of the opaque device_addr4. xdr_read_pages places
+	 * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
+	 * and places the remaining xdr data in xdr_buf->tail
+	 */
+	pdev->mincount = be32_to_cpup(p);
+	if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount)
+		return -EIO;
+
+	/* Parse notification bitmap, verifying that it is zero. */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		return -EIO;
+	len = be32_to_cpup(p);
+	if (len) {
+		uint32_t i;
+
+		/*
+		 * len comes straight off the wire.  Reject any count for
+		 * which 4 * len would wrap in 32-bit arithmetic: a wrapped
+		 * byte count would validate fewer words than the loop
+		 * below goes on to read.
+		 */
+		if (unlikely(len > (uint32_t)~0U / 4))
+			return -EIO;
+
+		p = xdr_inline_decode(xdr, 4 * len);
+		if (unlikely(!p))
+			return -EIO;
+
+		res->notification = be32_to_cpup(p++);
+		for (i = 1; i < len; i++) {
+			if (be32_to_cpup(p++)) {
+				dprintk("%s: unsupported notification\n",
+					__func__);
+				return -EIO;
+			}
+		}
+	}
+	return 0;
+}
+
+/*
+ * Decode a LAYOUTGET result.
+ *
+ * Fills in res->return_on_close, res->stateid, res->range, res->type and
+ * res->layoutp->len, then passes the layout body length to
+ * xdr_read_pages().  Only the first entry of the returned layout array is
+ * decoded; further entries are ignored.
+ *
+ * The outcome is stored in res->status and also returned:
+ *   0        on success,
+ *   -EIO     if xdr_inline_decode() cannot supply the requested words,
+ *   -EINVAL  if the layout array is empty or the layout body is longer
+ *            than the data received,
+ *   or the status reported by decode_op_hdr() / decode_layout_stateid().
+ */
+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
+		struct nfs4_layoutget_res *res)
+{
+	__be32 *p;
+	int status;
+	u32 layout_count;
+	u32 recvd;
+
+	status = decode_op_hdr(xdr, OP_LAYOUTGET);
+	if (status)
+		goto out;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->return_on_close = be32_to_cpup(p);
+
+	/*
+	 * Do not carry on if the stateid could not be decoded: everything
+	 * parsed after that point would be read from the wrong place in
+	 * the reply.
+	 */
+	status = decode_layout_stateid(xdr, &res->stateid);
+	if (unlikely(status))
+		goto out;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	layout_count = be32_to_cpup(p);
+	if (!layout_count) {
+		dprintk("%s: server responded with empty layout array\n",
+			__func__);
+		status = -EINVAL;
+		goto out;
+	}
+
+	/* offset (8), length (8), iomode (4), type (4), body length (4) */
+	p = xdr_inline_decode(xdr, 28);
+	if (unlikely(!p))
+		goto out_overflow;
+	p = xdr_decode_hyper(p, &res->range.offset);
+	p = xdr_decode_hyper(p, &res->range.length);
+	res->range.iomode = be32_to_cpup(p++);
+	res->type = be32_to_cpup(p++);
+	res->layoutp->len = be32_to_cpup(p);
+
+	dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
+		__func__,
+		(unsigned long)res->range.offset,
+		(unsigned long)res->range.length,
+		res->range.iomode,
+		res->type,
+		res->layoutp->len);
+
+	recvd = xdr_read_pages(xdr, res->layoutp->len);
+	if (res->layoutp->len > recvd) {
+		dprintk("NFS: server cheating in layoutget reply: "
+				"layout len %u > recvd %u\n",
+				res->layoutp->len, recvd);
+		status = -EINVAL;
+		goto out;
+	}
+
+	if (layout_count > 1) {
+		/* We only handle a length one array at the moment.  Any
+		 * further entries are just ignored.  Note that this means
+		 * the client may see a response that is less than the
+		 * minimum it requested.
+		 */
+		dprintk("%s: server responded with %u layouts, dropping tail\n",
+			__func__, layout_count);
+	}
+
+out:
+	res->status = status;
+	return status;
+out_overflow:
+	status = -EIO;
+	goto out;
+}
+
+/*
+ * Decode a LAYOUTRETURN result.
+ *
+ * The word after the operation header is stored in res->lrs_present.  If
+ * it is non-zero a layout stateid follows and is decoded into res->stateid;
+ * otherwise res->stateid is set to a copy of invalid_stateid.
+ *
+ * Returns 0 on success, -EIO if the lrs_present word is missing, or the
+ * status reported by decode_op_hdr() / decode_layout_stateid().
+ */
+static int decode_layoutreturn(struct xdr_stream *xdr,
+		struct nfs4_layoutreturn_res *res)
+{
+	int status;
+	__be32 *p;
+
+	status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
+	if (status)
+		return status;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		return -EIO;
+	res->lrs_present = be32_to_cpup(p);
+
+	if (!res->lrs_present) {
+		nfs4_stateid_copy(&res->stateid, &invalid_stateid);
+		return status;
+	}
+	return decode_layout_stateid(xdr, &res->stateid);
+}
+
+/*
+ * Decode a LAYOUTCOMMIT result.
+ *
+ * The operation header status is recorded in res->status.  When the
+ * "size changed" word is non-zero, the 8-byte new size that follows is
+ * consumed and discarded.
+ *
+ * Returns 0 on success, the operation header status if it is non-zero, or
+ * -EIO if the stream is short.
+ */
+static int decode_layoutcommit(struct xdr_stream *xdr,
+		struct rpc_rqst *req,
+		struct nfs4_layoutcommit_res *res)
+{
+	int status;
+	__be32 *p;
+
+	status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
+	res->status = status;
+	if (status)
+		return status;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		return -EIO;
+	if (be32_to_cpup(p) == 0)
+		return 0;
+
+	/* size changed: throw away new size */
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		return -EIO;
+	return 0;
+}
+
+/*
+ * Decode a TEST_STATEID result.
+ *
+ * The reply must carry exactly one per-stateid status; that status word is
+ * stored in res->status.
+ *
+ * Returns 0 on success, the operation header status if it is non-zero, or
+ * -EIO if the stream is short or the result count is not 1.
+ */
+static int decode_test_stateid(struct xdr_stream *xdr,
+		struct nfs41_test_stateid_res *res)
+{
+	int status;
+	__be32 *p;
+
+	status = decode_op_hdr(xdr, OP_TEST_STATEID);
+	if (status)
+		return status;
+
+	/* number of results */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		return -EIO;
+	if (be32_to_cpup(p) != 1)
+		return -EIO;
+
+	/* the single result */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		return -EIO;
+	res->status = be32_to_cpup(p);
+
+	return 0;
+}
+
+/*
+ * Decode a FREE_STATEID result: the operation header status is stored in
+ * res->status and that stored value is returned.
+ */
+static int decode_free_stateid(struct xdr_stream *xdr,
+		struct nfs41_free_stateid_res *res)
+{
+	res->status = decode_op_hdr(xdr, OP_FREE_STATEID);
+
+	return res->status;
+}
+#else
+/*
+ * Stand-in used when CONFIG_NFS_V4_1 is not defined: decodes nothing and
+ * reports success.
+ */
+static inline int decode_layoutreturn(struct xdr_stream *xdr,
+		struct nfs4_layoutreturn_res *res)
+{
+	return 0;
+}
+
+/*
+ * Stand-in used when CONFIG_NFS_V4_1 is not defined: decodes nothing and
+ * reports success.
+ */
+static int decode_layoutget(struct xdr_stream *xdr,
+		struct rpc_rqst *req,
+		struct nfs4_layoutget_res *res)
+{
+	return 0;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * END OF "GENERIC" DECODE ROUTINES.
+ */
+
+/*
+ * Decode an OPEN_DOWNGRADE reply.
+ *
+ * Decodes the compound header, SEQUENCE and PUTFH results, then an
+ * optional LAYOUTRETURN result (only when res->lr_res is set; its status
+ * is saved in res->lr_ret), and finally the OPEN_DOWNGRADE result.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs_closeres *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (status)
+		return status;
+
+	if (res->lr_res) {
+		status = decode_layoutreturn(xdr, res->lr_res);
+		res->lr_ret = status;
+		if (status)
+			return status;
+	}
+	return decode_open_downgrade(xdr, res);
+}
+
+/*
+ * Decode an ACCESS reply.
+ *
+ * Decodes the compound header, SEQUENCE, PUTFH and ACCESS results.  When
+ * res->fattr is set, attributes are decoded as well, but the outcome of
+ * that last step does not affect the return value.
+ * Returns the first non-zero status of the mandatory steps, or 0.
+ */
+static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_accessres *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_access(xdr, &res->supported, &res->access);
+	if (!status && res->fattr)
+		decode_getfattr(xdr, res->fattr, res->server);
+	return status;
+}
+
+/*
+ * Decode a LOOKUP reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH, LOOKUP, GETFH (into
+ * res->fh) and attributes with label.  Decoding stops at the first step
+ * that reports a non-zero status, which is then returned.
+ */
+static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_lookup_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_lookup(xdr);
+	if (!status)
+		status = decode_getfh(xdr, res->fh);
+	if (!status)
+		status = decode_getfattr_label(xdr, res->fattr, res->label,
+					       res->server);
+	return status;
+}
+
+/*
+ * Decode a LOOKUPP reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH, LOOKUPP, GETFH (into
+ * res->fh) and attributes with label.  Decoding stops at the first step
+ * that reports a non-zero status, which is then returned.
+ */
+static int nfs4_xdr_dec_lookupp(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_lookupp_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_lookupp(xdr);
+	if (!status)
+		status = decode_getfh(xdr, res->fh);
+	if (!status)
+		status = decode_getfattr_label(xdr, res->fattr, res->label,
+					       res->server);
+	return status;
+}
+
+/*
+ * Decode a LOOKUP_ROOT reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTROOTFH, GETFH (into
+ * res->fh) and attributes with label.  Decoding stops at the first step
+ * that reports a non-zero status, which is then returned.
+ */
+static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_lookup_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putrootfh(xdr);
+	if (!status)
+		status = decode_getfh(xdr, res->fh);
+	if (!status)
+		status = decode_getfattr_label(xdr, res->fattr,
+					       res->label, res->server);
+	return status;
+}
+
+/*
+ * Decode a REMOVE reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH and REMOVE (change
+ * info into res->cinfo).  Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs_removeres *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_remove(xdr, &res->cinfo);
+	return status;
+}
+
+/*
+ * Decode a RENAME reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH, SAVEFH, PUTFH and
+ * RENAME (change info into res->old_cinfo and res->new_cinfo).
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs_renameres *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_savefh(xdr);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo);
+	return status;
+}
+
+/*
+ * Decode a LINK reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH, SAVEFH, PUTFH, LINK
+ * (change info into res->cinfo) and RESTOREFH.  If all of those succeed,
+ * attributes with label are decoded too, but the outcome of that last step
+ * does not affect the return value.
+ * Returns the first non-zero status of the mandatory steps, or 0.
+ */
+static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_link_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_savefh(xdr);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_link(xdr, &res->cinfo);
+	/*
+	 * Note order: OP_LINK leaves the directory as the current
+	 *             filehandle.
+	 */
+	if (!status)
+		status = decode_restorefh(xdr);
+	if (!status)
+		decode_getfattr_label(xdr, res->fattr, res->label,
+				      res->server);
+	return status;
+}
+
+/*
+ * Decode a CREATE reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH, CREATE (change info
+ * into res->dir_cinfo) and GETFH (into res->fh).  If all of those succeed,
+ * attributes with label are decoded too, but the outcome of that last step
+ * does not affect the return value.
+ * Returns the first non-zero status of the mandatory steps, or 0.
+ */
+static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_create_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_create(xdr, &res->dir_cinfo);
+	if (!status)
+		status = decode_getfh(xdr, res->fh);
+	if (!status)
+		decode_getfattr_label(xdr, res->fattr, res->label,
+				      res->server);
+	return status;
+}
+
+/*
+ * Decode a SYMLINK reply.  The work is delegated unchanged to
+ * nfs4_xdr_dec_create(), whose status is returned.
+ */
+static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *res)
+{
+	int status = nfs4_xdr_dec_create(rqstp, xdr, res);
+
+	return status;
+}
+
+/*
+ * Decode a GETATTR reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH and attributes with
+ * label.  Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_getattr_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_getfattr_label(xdr, res->fattr, res->label,
+					       res->server);
+	return status;
+}
+
+/*
+ * Encode a SETACL request.
+ *
+ * Emits, in order: the compound header (minor version taken from
+ * args->seq_args), SEQUENCE, PUTFH for args->fh and the SETACL operation,
+ * then calls encode_nops() on the header.
+ */
+static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr,
+		const void *data)
+{
+	const struct nfs_setaclargs *args = data;
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_setacl(xdr, args, &hdr);
+
+	encode_nops(&hdr);
+}
+
+/*
+ * Decode a SETACL reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH and the SETATTR
+ * result.  Returns the first non-zero status, or 0.
+ */
+static int
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs_setaclres *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_setattr(xdr);
+	return status;
+}
+
+/*
+ * Decode a GETACL reply.
+ *
+ * When res->acl_scratch is set, its page is installed as the stream's
+ * scratch buffer (PAGE_SIZE bytes) before anything is decoded.  Then, in
+ * order: compound header, SEQUENCE, PUTFH and the GETACL result.
+ * Returns the first non-zero status, or 0.
+ */
+static int
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs_getaclres *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	if (res->acl_scratch != NULL)
+		xdr_set_scratch_buffer(xdr, page_address(res->acl_scratch),
+				       PAGE_SIZE);
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_getacl(xdr, rqstp, res);
+	return status;
+}
+
+/*
+ * Decode a CLOSE reply.
+ *
+ * Decodes the compound header, SEQUENCE and PUTFH results, then an
+ * optional LAYOUTRETURN result (only when res->lr_res is set; its status
+ * is saved in res->lr_ret), optional attributes (only when res->fattr is
+ * set) and finally the CLOSE result.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs_closeres *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (status)
+		return status;
+
+	if (res->lr_res) {
+		status = decode_layoutreturn(xdr, res->lr_res);
+		res->lr_ret = status;
+		if (status)
+			return status;
+	}
+	if (res->fattr != NULL) {
+		status = decode_getfattr(xdr, res->fattr, res->server);
+		if (status != 0)
+			return status;
+	}
+	return decode_close(xdr, res);
+}
+
+/*
+ * Decode an OPEN reply.
+ *
+ * Mandatory steps, in order: compound header, SEQUENCE, PUTFH, OPEN and
+ * GETFH (into res->fh); the first non-zero status among them is returned.
+ *
+ * Once those succeed, the remaining results are decoded without their
+ * status being propagated: ACCESS (only when res->access_request is set),
+ * attributes with label, and LAYOUTGET (only when res->lg_res is set).
+ */
+static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs_openres *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_open(xdr, res);
+	if (!status)
+		status = decode_getfh(xdr, &res->fh);
+	if (status)
+		return status;
+
+	if (res->access_request)
+		decode_access(xdr, &res->access_supported,
+			      &res->access_result);
+	decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server);
+	if (res->lg_res)
+		decode_layoutget(xdr, rqstp, res->lg_res);
+	return status;
+}
+
+/*
+ * Decode an OPEN_CONFIRM reply.
+ *
+ * Steps, in order: compound header, PUTFH and the OPEN_CONFIRM result.
+ * No SEQUENCE result is decoded here.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs_open_confirmres *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_open_confirm(xdr, res);
+	return status;
+}
+
+/*
+ * Decode an OPEN reply for the "noattr" variant.
+ *
+ * Mandatory steps, in order: compound header, SEQUENCE, PUTFH and OPEN;
+ * the first non-zero status among them is returned.  Unlike
+ * nfs4_xdr_dec_open(), no GETFH result is decoded and attributes are
+ * decoded without a label.
+ *
+ * Once the mandatory steps succeed, the remaining results are decoded
+ * without their status being propagated: ACCESS (only when
+ * res->access_request is set), attributes, and LAYOUTGET (only when
+ * res->lg_res is set).
+ */
+static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs_openres *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_open(xdr, res);
+	if (status)
+		return status;
+
+	if (res->access_request)
+		decode_access(xdr, &res->access_supported,
+			      &res->access_result);
+	decode_getfattr(xdr, res->f_attr, res->server);
+	if (res->lg_res)
+		decode_layoutget(xdr, rqstp, res->lg_res);
+	return status;
+}
+
+/*
+ * Decode a SETATTR reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH and SETATTR.  If all
+ * of those succeed, attributes with label are decoded too, but the outcome
+ * of that last step does not affect the return value.
+ * Returns the first non-zero status of the mandatory steps, or 0.
+ */
+static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs_setattrres *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_setattr(xdr);
+	if (!status)
+		decode_getfattr_label(xdr, res->fattr, res->label,
+				      res->server);
+	return status;
+}
+
+/*
+ * Decode a LOCK reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH and the LOCK result.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs_lock_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_lock(xdr, res);
+	return status;
+}
+
+/*
+ * Decode a LOCKT reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH and the LOCKT result.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs_lockt_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_lockt(xdr, res);
+	return status;
+}
+
+/*
+ * Decode a LOCKU reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH and the LOCKU result.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs_locku_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_locku(xdr, res);
+	return status;
+}
+
+/*
+ * Decode a RELEASE_LOCKOWNER reply: the compound header followed by the
+ * RELEASE_LOCKOWNER result.  @dummy is not used.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp,
+		struct xdr_stream *xdr, void *dummy)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		return status;
+	return decode_release_lockowner(xdr);
+}
+
+/*
+ * Decode a READLINK reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH and the READLINK
+ * result.  Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_readlink_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_readlink(xdr, rqstp);
+	return status;
+}
+
+/*
+ * Decode a READDIR reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH and the READDIR
+ * result.  Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_readdir_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_readdir(xdr, rqstp, res);
+	return status;
+}
+
+/*
+ * Decode a READ reply.
+ *
+ * hdr.status is copied to res->op_status right after the compound header
+ * is decoded, whether or not that decode succeeded.  Then, in order:
+ * SEQUENCE, PUTFH and the READ result.
+ *
+ * Returns the first non-zero status; if every step succeeds, returns
+ * res->count instead of 0.
+ */
+static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs_pgio_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	res->op_status = hdr.status;
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_read(xdr, rqstp, res);
+	if (!status)
+		status = res->count;
+	return status;
+}
+
+/*
+ * Decode a WRITE reply.
+ *
+ * hdr.status is copied to res->op_status right after the compound header
+ * is decoded, whether or not that decode succeeded.  Then, in order:
+ * SEQUENCE, PUTFH and the WRITE result.  When res->fattr is set,
+ * attributes are decoded as well, but the outcome of that step is not
+ * propagated.
+ *
+ * Returns the first non-zero status of the mandatory steps; if they all
+ * succeed, returns res->count instead of 0.
+ */
+static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs_pgio_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	res->op_status = hdr.status;
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_write(xdr, res);
+	if (status)
+		return status;
+
+	if (res->fattr)
+		decode_getfattr(xdr, res->fattr, res->server);
+	status = res->count;
+	return status;
+}
+
+/*
+ * Decode a COMMIT reply.
+ *
+ * hdr.status is copied to res->op_status right after the compound header
+ * is decoded, whether or not that decode succeeded.  Then, in order:
+ * SEQUENCE, PUTFH and the COMMIT result.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs_commitres *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	res->op_status = hdr.status;
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_commit(xdr, res);
+	return status;
+}
+
+/*
+ * Decode an FSINFO reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH and the fsinfo
+ * attributes (into res->fsinfo).  Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_fsinfo_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		return status;
+	status = decode_sequence(xdr, &res->seq_res, req);
+	if (status)
+		return status;
+	status = decode_putfh(xdr);
+	if (status)
+		return status;
+	return decode_fsinfo(xdr, res->fsinfo);
+}
+
+/*
+ * Decode a PATHCONF reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH and the pathconf
+ * attributes (into res->pathconf).  Returns the first non-zero status,
+ * or 0.
+ */
+static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_pathconf_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		return status;
+	status = decode_sequence(xdr, &res->seq_res, req);
+	if (status)
+		return status;
+	status = decode_putfh(xdr);
+	if (status)
+		return status;
+	return decode_pathconf(xdr, res->pathconf);
+}
+
+/*
+ * Decode a STATFS reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH and the statfs
+ * attributes (into res->fsstat).  Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_statfs_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		return status;
+	status = decode_sequence(xdr, &res->seq_res, req);
+	if (status)
+		return status;
+	status = decode_putfh(xdr);
+	if (status)
+		return status;
+	return decode_statfs(xdr, res->fsstat);
+}
+
+/*
+ * Decode a GETATTR_BITMAP (server capabilities) reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH and the server
+ * capabilities result.  Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_server_caps_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, req);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_server_caps(xdr, res);
+	return status;
+}
+
+/*
+ * Decode a RENEW reply: the compound header followed by the RENEW result.
+ * The third argument is not used.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+		void *__unused)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		return status;
+	return decode_renew(xdr);
+}
+
+/*
+ * Decode a SETCLIENTID reply: the compound header followed by the
+ * SETCLIENTID result, which is stored through @data.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_setclientid_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		return status;
+	return decode_setclientid(xdr, res);
+}
+
+/*
+ * Decode a SETCLIENTID_CONFIRM reply: the compound header followed by the
+ * SETCLIENTID_CONFIRM result.  @data is not used.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		void *data)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		return status;
+	return decode_setclientid_confirm(xdr);
+}
+
+/*
+ * Decode a DELEGRETURN reply.
+ *
+ * Decodes the compound header, SEQUENCE and PUTFH results, then an
+ * optional LAYOUTRETURN result (only when res->lr_res is set; its status
+ * is saved in res->lr_ret), optional attributes (only when res->fattr is
+ * set) and finally the DELEGRETURN result.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_delegreturnres *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (status)
+		return status;
+
+	if (res->lr_res) {
+		status = decode_layoutreturn(xdr, res->lr_res);
+		res->lr_ret = status;
+		if (status)
+			return status;
+	}
+	if (res->fattr) {
+		status = decode_getfattr(xdr, res->fattr, res->server);
+		if (status != 0)
+			return status;
+	}
+	return decode_delegreturn(xdr);
+}
+
+/*
+ * Decode an FS_LOCATIONS reply.
+ *
+ * Common prefix: compound header, SEQUENCE and PUTFH.
+ *
+ * When res->migration is not set, a LOOKUP result is decoded next.  In
+ * both cases the stream then enters the page data (PAGE_SIZE) and the
+ * attributes are decoded into res->fs_locations.  When res->migration and
+ * res->renew are both set, a RENEW result is decoded last.
+ *
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
+		struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_fs_locations_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, req);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (status)
+		return status;
+
+	if (!res->migration) {
+		status = decode_lookup(xdr);
+		if (status)
+			return status;
+	}
+
+	xdr_enter_page(xdr, PAGE_SIZE);
+	status = decode_getfattr_generic(xdr,
+					 &res->fs_locations->fattr,
+					 NULL, res->fs_locations,
+					 NULL, res->fs_locations->server);
+	if (status)
+		return status;
+
+	if (res->migration && res->renew)
+		status = decode_renew(xdr);
+	return status;
+}
+
+/*
+ * Decode a SECINFO reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH and the SECINFO
+ * result.  Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_secinfo_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_secinfo(xdr, res);
+	return status;
+}
+
+/*
+ * Decode an FSID_PRESENT reply.
+ *
+ * Steps, in order: compound header, SEQUENCE, PUTFH and GETFH (into
+ * res->fh); when res->renew is set a RENEW result is decoded afterwards.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_fsid_present(struct rpc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *data)
+{
+	struct nfs4_fsid_present_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_getfh(xdr, res->fh);
+	if (!status && res->renew)
+		status = decode_renew(xdr);
+	return status;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Decode a BIND_CONN_TO_SESSION reply: the compound header followed by
+ * the BIND_CONN_TO_SESSION result, which is stored through @res.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_bind_conn_to_session(struct rpc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		return status;
+	return decode_bind_conn_to_session(xdr, res);
+}
+
+/*
+ * Decode an EXCHANGE_ID reply: the compound header followed by the
+ * EXCHANGE_ID result, which is stored through @res.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		return status;
+	return decode_exchange_id(xdr, res);
+}
+
+/*
+ * Decode a CREATE_SESSION reply: the compound header followed by the
+ * CREATE_SESSION result, which is stored through @res.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		return status;
+	return decode_create_session(xdr, res);
+}
+
+/*
+ * Decode a DESTROY_SESSION reply: the compound header followed by the
+ * DESTROY_SESSION result.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		return status;
+	return decode_destroy_session(xdr, res);
+}
+
+/*
+ * Decode a DESTROY_CLIENTID reply: the compound header followed by the
+ * DESTROY_CLIENTID result.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_destroy_clientid(struct rpc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		return status;
+	return decode_destroy_clientid(xdr, res);
+}
+
+/*
+ * Decode a stand-alone SEQUENCE reply: the compound header followed by
+ * the SEQUENCE result, which is stored through @res.
+ * Returns the first non-zero status, or 0.
+ */
+static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp,
+		struct xdr_stream *xdr,
+		void *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		return status;
+	return decode_sequence(xdr, res, rqstp);
+}
+
+#endif
+
+/*
+ * Decode GET_LEASE_TIME response
+ */
+static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp,
+				       struct xdr_stream *xdr,
+				       void *data)
+{
+	struct nfs4_get_lease_time_res *res = data;
+	struct compound_hdr hdr;
+	int err;
+
+	/*
+	 * Decode the reply operations in order (header, SEQUENCE,
+	 * PUTROOTFH, FSINFO) and stop at the first one that fails.
+	 */
+	err = decode_compound_hdr(xdr, &hdr);
+	if (err)
+		return err;
+
+	err = decode_sequence(xdr, &res->lr_seq_res, rqstp);
+	if (err)
+		return err;
+
+	err = decode_putrootfh(xdr);
+	if (err)
+		return err;
+
+	return decode_fsinfo(xdr, res->lr_fsinfo);
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * Decode RECLAIM_COMPLETE response
+ */
+static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
+					 struct xdr_stream *xdr,
+					 void *data)
+{
+	struct nfs41_reclaim_complete_res *res = data;
+	struct compound_hdr hdr;
+	int err;
+
+	err = decode_compound_hdr(xdr, &hdr);
+	if (err)
+		return err;
+
+	err = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (err)
+		return err;
+
+	/* No result buffer is handed to the RECLAIM_COMPLETE decoder. */
+	return decode_reclaim_complete(xdr, NULL);
+}
+
+/*
+ * Decode GETDEVINFO response
+ */
+static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
+				      struct xdr_stream *xdr,
+				      void *data)
+{
+	struct nfs4_getdeviceinfo_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	/* Each step runs only while every earlier step returned zero. */
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status == 0)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status == 0)
+		status = decode_getdeviceinfo(xdr, res);
+	return status;
+}
+
+/*
+ * Decode LAYOUTGET response
+ */
+static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp,
+				  struct xdr_stream *xdr,
+				  void *data)
+{
+	struct nfs4_layoutget_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	/* Each step runs only while every earlier step returned zero. */
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_layoutget(xdr, rqstp, res);
+	return status;
+}
+
+/*
+ * Decode LAYOUTRETURN response
+ */
+static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     void *data)
+{
+	struct nfs4_layoutreturn_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	/* Each step runs only while every earlier step returned zero. */
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_layoutreturn(xdr, res);
+	return status;
+}
+
+/*
+ * Decode LAYOUTCOMMIT response
+ */
+static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     void *data)
+{
+	struct nfs4_layoutcommit_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	/* Each step runs only while every earlier step returned zero. */
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putfh(xdr);
+	if (!status)
+		status = decode_layoutcommit(xdr, rqstp, res);
+
+	/*
+	 * The trailing attributes are decoded only after a successful
+	 * LAYOUTCOMMIT; the result of decode_getfattr() is not folded
+	 * into the returned status.
+	 */
+	if (!status)
+		decode_getfattr(xdr, res->fattr, res->server);
+	return status;
+}
+
+/*
+ * Decode SECINFO_NO_NAME response
+ */
+static int nfs4_xdr_dec_secinfo_no_name(struct rpc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					void *data)
+{
+	struct nfs4_secinfo_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	/* Each step runs only while every earlier step returned zero. */
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_putrootfh(xdr);
+	if (!status)
+		status = decode_secinfo_no_name(xdr, res);
+	return status;
+}
+
+/*
+ * Decode TEST_STATEID response
+ */
+static int nfs4_xdr_dec_test_stateid(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     void *data)
+{
+	struct nfs41_test_stateid_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	/* Each step runs only while every earlier step returned zero. */
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_test_stateid(xdr, res);
+	return status;
+}
+
+/*
+ * Decode FREE_STATEID response
+ */
+static int nfs4_xdr_dec_free_stateid(struct rpc_rqst *rqstp,
+				     struct xdr_stream *xdr,
+				     void *data)
+{
+	struct nfs41_free_stateid_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	/* Each step runs only while every earlier step returned zero. */
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (!status)
+		status = decode_free_stateid(xdr, res);
+	return status;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in
+ * the local page cache.
+ * @xdr: XDR stream where entry resides
+ * @entry: buffer to fill in with entry data
+ * @plus: boolean indicating whether this should be a readdirplus entry
+ *
+ * Returns zero if successful, otherwise a negative errno value is
+ * returned.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ */
+int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+ bool plus)
+{
+ unsigned int savep;
+ uint32_t bitmap[3] = {0};
+ uint32_t len;
+ uint64_t new_cookie;
+ /* First word: non-zero means an entry follows. */
+ __be32 *p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EAGAIN;
+ if (*p == xdr_zero) {
+ /*
+ * No entry here.  The next word decides the outcome: zero
+ * yields -EAGAIN, non-zero marks end-of-directory in
+ * entry->eof and yields -EBADCOOKIE.
+ */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EAGAIN;
+ if (*p == xdr_zero)
+ return -EAGAIN;
+ entry->eof = 1;
+ return -EBADCOOKIE;
+ }
+
+ /* 12 bytes: an 8-byte cookie followed by the 4-byte name length. */
+ p = xdr_inline_decode(xdr, 12);
+ if (unlikely(!p))
+ return -EAGAIN;
+ p = xdr_decode_hyper(p, &new_cookie);
+ entry->len = be32_to_cpup(p);
+
+ /* The name is not copied; entry->name points at the decoded bytes. */
+ p = xdr_inline_decode(xdr, entry->len);
+ if (unlikely(!p))
+ return -EAGAIN;
+ entry->name = (const char *) p;
+
+ /*
+ * In case the server doesn't return an inode number,
+ * we fake one here. (We don't use inode number 0,
+ * since glibc seems to choke on it...)
+ */
+ entry->ino = 1;
+ entry->fattr->valid = 0;
+
+ if (decode_attr_bitmap(xdr, bitmap) < 0)
+ return -EAGAIN;
+
+ /* len and savep are filled in here but not used again in this function. */
+ if (decode_attr_length(xdr, &len, &savep) < 0)
+ return -EAGAIN;
+
+ if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
+ NULL, entry->label, entry->server) < 0)
+ return -EAGAIN;
+ /* Prefer the mounted-on fileid over the plain fileid for the inode number. */
+ if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
+ entry->ino = entry->fattr->mounted_on_fileid;
+ else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
+ entry->ino = entry->fattr->fileid;
+
+ entry->d_type = DT_UNKNOWN;
+ if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
+ entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
+
+ /* The cookies are only advanced once the whole entry decoded cleanly. */
+ entry->prev_cookie = entry->cookie;
+ entry->cookie = new_cookie;
+
+ return 0;
+}
+
+/*
+ * We need to translate between nfs status return values and
+ * the local errno values which may not be the same.
+ *
+ * The table is only ever read, so it is declared const.  The final
+ * entry, whose stat is -1, is the end-of-table sentinel that
+ * nfs4_stat_to_errno() stops on.
+ */
+static const struct {
+	int stat;
+	int errno;
+} nfs_errtbl[] = {
+	{ NFS4_OK,		0		},
+	{ NFS4ERR_PERM,		-EPERM		},
+	{ NFS4ERR_NOENT,	-ENOENT		},
+	{ NFS4ERR_IO,		-errno_NFSERR_IO},
+	{ NFS4ERR_NXIO,		-ENXIO		},
+	{ NFS4ERR_ACCESS,	-EACCES		},
+	{ NFS4ERR_EXIST,	-EEXIST		},
+	{ NFS4ERR_XDEV,		-EXDEV		},
+	{ NFS4ERR_NOTDIR,	-ENOTDIR	},
+	{ NFS4ERR_ISDIR,	-EISDIR		},
+	{ NFS4ERR_INVAL,	-EINVAL		},
+	{ NFS4ERR_FBIG,		-EFBIG		},
+	{ NFS4ERR_NOSPC,	-ENOSPC		},
+	{ NFS4ERR_ROFS,		-EROFS		},
+	{ NFS4ERR_MLINK,	-EMLINK		},
+	{ NFS4ERR_NAMETOOLONG,	-ENAMETOOLONG	},
+	{ NFS4ERR_NOTEMPTY,	-ENOTEMPTY	},
+	{ NFS4ERR_DQUOT,	-EDQUOT		},
+	{ NFS4ERR_STALE,	-ESTALE		},
+	{ NFS4ERR_BADHANDLE,	-EBADHANDLE	},
+	{ NFS4ERR_BAD_COOKIE,	-EBADCOOKIE	},
+	{ NFS4ERR_NOTSUPP,	-ENOTSUPP	},
+	{ NFS4ERR_TOOSMALL,	-ETOOSMALL	},
+	{ NFS4ERR_SERVERFAULT,	-EREMOTEIO	},
+	{ NFS4ERR_BADTYPE,	-EBADTYPE	},
+	{ NFS4ERR_LOCKED,	-EAGAIN		},
+	{ NFS4ERR_SYMLINK,	-ELOOP		},
+	{ NFS4ERR_OP_ILLEGAL,	-EOPNOTSUPP	},
+	{ NFS4ERR_DEADLOCK,	-EDEADLK	},
+	{ NFS4ERR_NOXATTR,	-ENODATA	},
+	{ NFS4ERR_XATTR2BIG,	-E2BIG		},
+	{ -1,			-EIO		}
+};
+
+/*
+ * Convert an NFS error code to a local one.
+ * This one is used for NFSv4 status codes.
+ */
+static int
+nfs4_stat_to_errno(int stat)
+{
+	int idx = 0;
+
+	/* Linear scan; the table ends at the entry whose stat is -1. */
+	while (nfs_errtbl[idx].stat != -1) {
+		if (nfs_errtbl[idx].stat == stat)
+			return nfs_errtbl[idx].errno;
+		idx++;
+	}
+
+	/*
+	 * If we cannot translate the error, the recovery routines should
+	 * handle it.
+	 * Note: remaining NFSv4 error codes have values > 10000, so should
+	 * not conflict with native Linux error codes.
+	 */
+	if (stat > 10000 && stat <= 10100)
+		return -stat;
+
+	/* The server is looney tunes. */
+	return -EREMOTEIO;
+}
+
+#ifdef CONFIG_NFS_V4_2
+#include "nfs42xdr.c"
+#endif /* CONFIG_NFS_V4_2 */
+
+/*
+ * PROC() builds one designated-initializer entry for the procedure
+ * table, indexed by NFSPROC4_CLNT_<proc>.  Every entry is sent as
+ * NFSPROC4_COMPOUND; the encode/decode callbacks and the argument/reply
+ * sizes are derived from the argtype/restype tokens by pasting them onto
+ * the nfs4_xdr_ and NFS4_..._sz names.
+ */
+#define PROC(proc, argtype, restype) \
+[NFSPROC4_CLNT_##proc] = { \
+ .p_proc = NFSPROC4_COMPOUND, \
+ .p_encode = nfs4_xdr_##argtype, \
+ .p_decode = nfs4_xdr_##restype, \
+ .p_arglen = NFS4_##argtype##_sz, \
+ .p_replen = NFS4_##restype##_sz, \
+ .p_statidx = NFSPROC4_CLNT_##proc, \
+ .p_name = #proc, \
+}
+
+/* STUB() reserves the table slot but only fills in the procedure name. */
+#define STUB(proc) \
+[NFSPROC4_CLNT_##proc] = { \
+ .p_name = #proc, \
+}
+
+/* PROC41() is PROC() when CONFIG_NFS_V4_1 is defined, otherwise STUB(). */
+#if defined(CONFIG_NFS_V4_1)
+#define PROC41(proc, argtype, restype) \
+ PROC(proc, argtype, restype)
+#else
+#define PROC41(proc, argtype, restype) \
+ STUB(proc)
+#endif
+
+/* PROC42() is PROC() when CONFIG_NFS_V4_2 is defined, otherwise STUB(). */
+#if defined(CONFIG_NFS_V4_2)
+#define PROC42(proc, argtype, restype) \
+ PROC(proc, argtype, restype)
+#else
+#define PROC42(proc, argtype, restype) \
+ STUB(proc)
+#endif
+
+/*
+ * NFSv4 client procedure table.  Entries are placed by designated
+ * initializer at index NFSPROC4_CLNT_<name>, so the order of the lines
+ * below does not determine the slot.  PROC41()/PROC42() entries become
+ * name-only stubs when the corresponding minor version is not
+ * configured; GETDEVICELIST is always a stub.
+ */
+const struct rpc_procinfo nfs4_procedures[] = {
+ PROC(READ, enc_read, dec_read),
+ PROC(WRITE, enc_write, dec_write),
+ PROC(COMMIT, enc_commit, dec_commit),
+ PROC(OPEN, enc_open, dec_open),
+ PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm),
+ PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr),
+ PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade),
+ PROC(CLOSE, enc_close, dec_close),
+ PROC(SETATTR, enc_setattr, dec_setattr),
+ PROC(FSINFO, enc_fsinfo, dec_fsinfo),
+ PROC(RENEW, enc_renew, dec_renew),
+ PROC(SETCLIENTID, enc_setclientid, dec_setclientid),
+ PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm),
+ PROC(LOCK, enc_lock, dec_lock),
+ PROC(LOCKT, enc_lockt, dec_lockt),
+ PROC(LOCKU, enc_locku, dec_locku),
+ PROC(ACCESS, enc_access, dec_access),
+ PROC(GETATTR, enc_getattr, dec_getattr),
+ PROC(LOOKUP, enc_lookup, dec_lookup),
+ PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root),
+ PROC(REMOVE, enc_remove, dec_remove),
+ PROC(RENAME, enc_rename, dec_rename),
+ PROC(LINK, enc_link, dec_link),
+ PROC(SYMLINK, enc_symlink, dec_symlink),
+ PROC(CREATE, enc_create, dec_create),
+ PROC(PATHCONF, enc_pathconf, dec_pathconf),
+ PROC(STATFS, enc_statfs, dec_statfs),
+ PROC(READLINK, enc_readlink, dec_readlink),
+ PROC(READDIR, enc_readdir, dec_readdir),
+ PROC(SERVER_CAPS, enc_server_caps, dec_server_caps),
+ PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn),
+ PROC(GETACL, enc_getacl, dec_getacl),
+ PROC(SETACL, enc_setacl, dec_setacl),
+ PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
+ PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
+ PROC(SECINFO, enc_secinfo, dec_secinfo),
+ PROC(FSID_PRESENT, enc_fsid_present, dec_fsid_present),
+ PROC41(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
+ PROC41(CREATE_SESSION, enc_create_session, dec_create_session),
+ PROC41(DESTROY_SESSION, enc_destroy_session, dec_destroy_session),
+ PROC41(SEQUENCE, enc_sequence, dec_sequence),
+ PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
+ PROC41(RECLAIM_COMPLETE,enc_reclaim_complete, dec_reclaim_complete),
+ PROC41(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
+ PROC41(LAYOUTGET, enc_layoutget, dec_layoutget),
+ PROC41(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
+ PROC41(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
+ PROC41(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name),
+ PROC41(TEST_STATEID, enc_test_stateid, dec_test_stateid),
+ PROC41(FREE_STATEID, enc_free_stateid, dec_free_stateid),
+ STUB(GETDEVICELIST),
+ PROC41(BIND_CONN_TO_SESSION,
+ enc_bind_conn_to_session, dec_bind_conn_to_session),
+ PROC41(DESTROY_CLIENTID,enc_destroy_clientid, dec_destroy_clientid),
+ PROC42(SEEK, enc_seek, dec_seek),
+ PROC42(ALLOCATE, enc_allocate, dec_allocate),
+ PROC42(DEALLOCATE, enc_deallocate, dec_deallocate),
+ PROC42(LAYOUTSTATS, enc_layoutstats, dec_layoutstats),
+ PROC42(CLONE, enc_clone, dec_clone),
+ PROC42(COPY, enc_copy, dec_copy),
+ PROC42(OFFLOAD_CANCEL, enc_offload_cancel, dec_offload_cancel),
+ PROC42(COPY_NOTIFY, enc_copy_notify, dec_copy_notify),
+ PROC(LOOKUPP, enc_lookupp, dec_lookupp),
+ PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror),
+ PROC42(GETXATTR, enc_getxattr, dec_getxattr),
+ PROC42(SETXATTR, enc_setxattr, dec_setxattr),
+ PROC42(LISTXATTRS, enc_listxattrs, dec_listxattrs),
+ PROC42(REMOVEXATTR, enc_removexattr, dec_removexattr),
+ PROC42(READ_PLUS, enc_read_plus, dec_read_plus),
+};
+
+/*
+ * RPC version descriptor for NFS version 4.  It points at the procedure
+ * table above and at a counter array with one slot per table entry.
+ */
+static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
+const struct rpc_version nfs_version4 = {
+ .number = 4,
+ .nrprocs = ARRAY_SIZE(nfs4_procedures),
+ .procs = nfs4_procedures,
+ .counts = nfs_version4_counts,
+};
+
+/*
+ * Local variables:
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
new file mode 100644
index 000000000..fa1483088
--- /dev/null
+++ b/fs/nfs/nfsroot.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1995, 1996 Gero Kuhlmann <gero@gkminix.han.de>
+ *
+ * Allow an NFS filesystem to be mounted as root. The way this works is:
+ * (1) Use the IP autoconfig mechanism to set local IP addresses and routes.
+ * (2) Construct the device string and the options string using DHCP
+ * option 17 and/or kernel command line options.
+ * (3) When mount_root() sets up the root file system, pass these strings
+ * to the NFS client's regular mount interface via sys_mount().
+ *
+ *
+ * Changes:
+ *
+ * Alan Cox : Removed get_address name clash with FPU.
+ * Alan Cox : Reformatted a bit.
+ * Gero Kuhlmann : Code cleanup
+ * Michael Rausch : Fixed recognition of an incoming RARP answer.
+ * Martin Mares : (2.0) Auto-configuration via BOOTP supported.
+ * Martin Mares : Manual selection of interface & BOOTP/RARP.
+ * Martin Mares : Using network routes instead of host routes,
+ * allowing the default configuration to be used
+ * for normal operation of the host.
+ * Martin Mares : Randomized timer with exponential backoff
+ * installed to minimize network congestion.
+ * Martin Mares : Code cleanup.
+ * Martin Mares : (2.1) BOOTP and RARP made configuration options.
+ * Martin Mares : Server hostname generation fixed.
+ * Gerd Knorr : Fixed wired inode handling
+ * Martin Mares : (2.2) "0.0.0.0" addresses from command line ignored.
+ * Martin Mares : RARP replies not tested for server address.
+ * Gero Kuhlmann : (2.3) Some bug fixes and code cleanup again (please
+ * send me your new patches _before_ bothering
+ * Linus so that I don't always have to cleanup
+ * _afterwards_ - thanks)
+ * Gero Kuhlmann : Last changes of Martin Mares undone.
+ * Gero Kuhlmann : RARP replies are tested for specified server
+ * again. However, it's now possible to have
+ * different RARP and NFS servers.
+ * Gero Kuhlmann : "0.0.0.0" addresses from command line are
+ * now mapped to INADDR_NONE.
+ * Gero Kuhlmann : Fixed a bug which prevented BOOTP path name
+ * from being used (thanks to Leo Spiekman)
+ * Andy Walker : Allow to specify the NFS server in nfs_root
+ * without giving a path name
+ * Swen Thümmler : Allow to specify the NFS options in nfs_root
+ * without giving a path name. Fix BOOTP request
+ * for domainname (domainname is NIS domain, not
+ * DNS domain!). Skip dummy devices for BOOTP.
+ * Jacek Zapala : Fixed a bug which prevented server-ip address
+ * from nfsroot parameter from being used.
+ * Olaf Kirch : Adapted to new NFS code.
+ * Jakub Jelinek : Free used code segment.
+ * Marko Kohtala : Fixed some bugs.
+ * Martin Mares : Debug message cleanup
+ * Martin Mares : Changed to use the new generic IP layer autoconfig
+ * code. BOOTP and RARP moved there.
+ * Martin Mares : Default path now contains host name instead of
+ * host IP address (but host name defaults to IP
+ * address anyway).
+ * Martin Mares : Use root_server_addr appropriately during setup.
+ * Martin Mares : Rewrote parameter parsing, now hopefully giving
+ * correct overriding.
+ * Trond Myklebust : Add in preliminary support for NFSv3 and TCP.
+ * Fix bug in root_nfs_addr(). nfs_data.namlen
+ * is NOT for the length of the hostname.
+ * Hua Qin : Support for mounting root file system via
+ * NFS over TCP.
+ * Fabian Frederick: Option parser rebuilt (using parser lib)
+ * Chuck Lever : Use super.c's text-based mount option parsing
+ * Chuck Lever : Add "nfsrootdebug".
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/nfs.h>
+#include <linux/nfs_fs.h>
+#include <linux/utsname.h>
+#include <linux/root_dev.h>
+#include <net/ipconfig.h>
+
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_ROOT
+
+/* Default path we try to mount. "%s" gets replaced by our IP address */
+#define NFS_ROOT "/tftpboot/%s"
+
+/*
+ * Default NFSROOT mount options.  The protocol version is the first one
+ * enabled in the kernel configuration, checked in the order v2, v3;
+ * v4 is used when neither is enabled.  Transport and transfer sizes are
+ * the same in all three cases.
+ */
+#if defined(CONFIG_NFS_V2)
+#define NFS_DEF_OPTIONS "vers=2,tcp,rsize=4096,wsize=4096"
+#elif defined(CONFIG_NFS_V3)
+#define NFS_DEF_OPTIONS "vers=3,tcp,rsize=4096,wsize=4096"
+#else
+#define NFS_DEF_OPTIONS "vers=4,tcp,rsize=4096,wsize=4096"
+#endif
+
+/* Parameters passed from the kernel command line */
+static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = "";
+
+/* Text-based mount options passed to super.c */
+static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS;
+
+/* Address of NFS server */
+static __be32 servaddr __initdata = htonl(INADDR_NONE);
+
+/* Name of directory to mount */
+static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
+
+/* server:export path string passed to super.c */
+static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
+
+#ifdef NFS_DEBUG
+/*
+ * When the "nfsrootdebug" kernel command line option is specified,
+ * enable debugging messages for NFSROOT.
+ */
+static int __init nfs_root_debug(char *__unused)
+{
+	/* Switch on both the NFSROOT and the mount debugging facilities. */
+	nfs_debug |= (NFSDBG_ROOT | NFSDBG_MOUNT);
+
+	return 1;
+}
+
+__setup("nfsrootdebug", nfs_root_debug);
+#endif
+
+/*
+ * Parse NFS server and directory information passed on the kernel
+ * command line.
+ *
+ * nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
+ *
+ * If there is a "%s" token in the <root-dir> string, it is replaced
+ * by the ASCII-representation of the client's IP address.
+ */
+static int __init nfs_root_setup(char *line)
+{
+	const char first = line[0];
+	/* A path, an options list or a leading digit is taken verbatim. */
+	const bool verbatim = first == '/' || first == ',' ||
+			      (first >= '0' && first <= '9');
+
+	ROOT_DEV = Root_NFS;
+
+	if (verbatim) {
+		strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms));
+	} else {
+		/* Upper bound on the length of NFS_ROOT with @line filled in. */
+		const size_t need = strlen(line) + sizeof(NFS_ROOT) - 1;
+
+		/* Cut @line short so the sprintf() below cannot overrun. */
+		if (need >= sizeof(nfs_root_parms))
+			line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0';
+		sprintf(nfs_root_parms, NFS_ROOT, line);
+	}
+
+	/*
+	 * Pick up the address of the NFS server holding our root file
+	 * system, if the parameter named one.
+	 *
+	 * Note: root_nfs_parse_addr() removes the server-ip from
+	 * nfs_root_parms, if it exists.
+	 */
+	root_server_addr = root_nfs_parse_addr(nfs_root_parms);
+
+	return 1;
+}
+
+__setup("nfsroot=", nfs_root_setup);
+
+/*
+ * Copy @src into @dest, a buffer of @destlen bytes.
+ *
+ * Returns 0 on success and -1 if @src did not fit.  strlcpy() returns
+ * strlen(@src), and the copy is already truncated when that equals
+ * @destlen (no room left for the terminating NUL), so the test must be
+ * ">=" rather than ">".
+ */
+static int __init root_nfs_copy(char *dest, const char *src,
+				     const size_t destlen)
+{
+	if (strlcpy(dest, src, destlen) >= destlen)
+		return -1;
+	return 0;
+}
+
+/*
+ * Append @src to the comma-separated list in @dest, a buffer of
+ * @destlen bytes, inserting a ',' first unless @dest is empty or
+ * already ends in one.
+ *
+ * Returns 0 on success and -1 if the result did not fit.  strlcat()
+ * returns the length of the string it tried to build, and the result
+ * is already truncated when that equals @destlen, so the tests must be
+ * ">=" rather than ">".
+ */
+static int __init root_nfs_cat(char *dest, const char *src,
+			       const size_t destlen)
+{
+	size_t len = strlen(dest);
+
+	if (len && dest[len - 1] != ',')
+		if (strlcat(dest, ",", destlen) >= destlen)
+			return -1;
+
+	if (strlcat(dest, src, destlen) >= destlen)
+		return -1;
+	return 0;
+}
+
+/*
+ * Parse out root export path and mount options from
+ * passed-in string @incoming.
+ *
+ * Copy the export path into @exppath.
+ */
+static int __init root_nfs_parse_options(char *incoming, char *exppath,
+					 const size_t exppathlen)
+{
+	/* Everything up to the first ',' is the NFS remote path. */
+	char *path = strsep(&incoming, ",");
+
+	/* An empty path or the word "default" leaves @exppath untouched. */
+	if (*path != '\0' && strcmp(path, "default") != 0) {
+		if (root_nfs_copy(exppath, path, exppathlen))
+			return -1;
+	}
+
+	/* Nothing after the path: no mount options to add. */
+	if (incoming == NULL || *incoming == '\0')
+		return 0;
+
+	/* The remainder is appended to our root options buffer. */
+	if (root_nfs_cat(nfs_root_options, incoming,
+			 sizeof(nfs_root_options)))
+		return -1;
+	return 0;
+}
+
+/*
+ * Decode the export directory path name and NFS options from
+ * the kernel command line. This has to be done late in order to
+ * use a dynamically acquired client IP address for the remote
+ * root directory path.
+ *
+ * Returns zero if successful; otherwise -1 is returned.
+ */
+static int __init root_nfs_data(char *cmdline)
+{
+ /* Sized for the literal prefix plus a dotted-quad IPv4 address. */
+ char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
+ int len, retval = -1;
+ char *tmp = NULL;
+ const size_t tmplen = sizeof(nfs_export_path);
+
+ /* Scratch buffer for the export path template; starts out as NFS_ROOT. */
+ tmp = kzalloc(tmplen, GFP_KERNEL);
+ if (tmp == NULL)
+ goto out_nomem;
+ strcpy(tmp, NFS_ROOT);
+
+ /*
+ * root_server_path is parsed first and the command line second, so a
+ * path given on the command line replaces one taken from
+ * root_server_path; mount options from both are appended to
+ * nfs_root_options.
+ */
+ if (root_server_path[0] != '\0') {
+ dprintk("Root-NFS: DHCPv4 option 17: %s\n",
+ root_server_path);
+ if (root_nfs_parse_options(root_server_path, tmp, tmplen))
+ goto out_optionstoolong;
+ }
+
+ if (cmdline[0] != '\0') {
+ dprintk("Root-NFS: nfsroot=%s\n", cmdline);
+ if (root_nfs_parse_options(cmdline, tmp, tmplen))
+ goto out_optionstoolong;
+ }
+
+ /*
+ * Append mandatory options for nfsroot so they override
+ * what has come before
+ */
+ snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4",
+ &servaddr);
+ if (root_nfs_cat(nfs_root_options, mand_options,
+ sizeof(nfs_root_options)))
+ goto out_optionstoolong;
+
+ /*
+ * Set up nfs_root_device. For NFS mounts, this looks like
+ *
+ * server:/path
+ *
+ * At this point, utsname()->nodename contains our local
+ * IP address or hostname, set by ipconfig. If "%s" exists
+ * in tmp, substitute the nodename, then shovel the whole
+ * mess into nfs_root_device.
+ */
+ /*
+ * NOTE(review): tmp is passed as the format string, so it is expected
+ * to hold at most one "%s" conversion -- confirm its sources are trusted.
+ */
+ len = snprintf(nfs_export_path, sizeof(nfs_export_path),
+ tmp, utsname()->nodename);
+ if (len >= (int)sizeof(nfs_export_path))
+ goto out_devnametoolong;
+ len = snprintf(nfs_root_device, sizeof(nfs_root_device),
+ "%pI4:%s", &servaddr, nfs_export_path);
+ if (len >= (int)sizeof(nfs_root_device))
+ goto out_devnametoolong;
+
+ retval = 0;
+
+ /* Common exit: every error label below jumps back here to free tmp. */
+out:
+ kfree(tmp);
+ return retval;
+out_nomem:
+ printk(KERN_ERR "Root-NFS: could not allocate memory\n");
+ goto out;
+out_optionstoolong:
+ printk(KERN_ERR "Root-NFS: mount options string too long\n");
+ goto out;
+out_devnametoolong:
+ printk(KERN_ERR "Root-NFS: root device name too long.\n");
+ goto out;
+}
+
+/**
+ * nfs_root_data - Return prepared 'data' for NFSROOT mount
+ * @root_device: OUT: address of string containing NFSROOT device
+ * @root_data: OUT: address of string containing NFSROOT mount options
+ *
+ * Returns zero and sets @root_device and @root_data if successful,
+ * otherwise -1 is returned.
+ */
+int __init nfs_root_data(char **root_device, char **root_data)
+{
+	/* Latch the server address for the string builders. */
+	servaddr = root_server_addr;
+
+	/* INADDR_NONE means no server address was ever supplied. */
+	if (servaddr == htonl(INADDR_NONE)) {
+		printk(KERN_ERR "Root-NFS: no NFS server address\n");
+		return -1;
+	}
+
+	/* Build the device and option strings from the saved parameters. */
+	if (root_nfs_data(nfs_root_parms) < 0)
+		return -1;
+
+	*root_data = nfs_root_options;
+	*root_device = nfs_root_device;
+	return 0;
+}
diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c
new file mode 100644
index 000000000..a90b36350
--- /dev/null
+++ b/fs/nfs/nfstrace.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/nfs_fs.h>
+#include <linux/namei.h>
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+#include "nfstrace.h"
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_status);
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
new file mode 100644
index 000000000..5a59dcdce
--- /dev/null
+++ b/fs/nfs/nfstrace.h
@@ -0,0 +1,1444 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nfs
+
+#if !defined(_TRACE_NFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NFS_H
+
+#include <linux/tracepoint.h>
+#include <linux/iversion.h>
+
+/* Register the DT_* values used by nfs_show_file_type() below. */
+TRACE_DEFINE_ENUM(DT_UNKNOWN);
+TRACE_DEFINE_ENUM(DT_FIFO);
+TRACE_DEFINE_ENUM(DT_CHR);
+TRACE_DEFINE_ENUM(DT_DIR);
+TRACE_DEFINE_ENUM(DT_BLK);
+TRACE_DEFINE_ENUM(DT_REG);
+TRACE_DEFINE_ENUM(DT_LNK);
+TRACE_DEFINE_ENUM(DT_SOCK);
+TRACE_DEFINE_ENUM(DT_WHT);
+
+/* Render a DT_* directory-entry type value as its symbolic name. */
+#define nfs_show_file_type(ftype) \
+ __print_symbolic(ftype, \
+ { DT_UNKNOWN, "UNKNOWN" }, \
+ { DT_FIFO, "FIFO" }, \
+ { DT_CHR, "CHR" }, \
+ { DT_DIR, "DIR" }, \
+ { DT_BLK, "BLK" }, \
+ { DT_REG, "REG" }, \
+ { DT_LNK, "LNK" }, \
+ { DT_SOCK, "SOCK" }, \
+ { DT_WHT, "WHT" })
+
+/*
+ * Register every cache-validity flag that nfs_show_cache_validity()
+ * prints by name.  NFS_INO_INVALID_XATTR is listed in the macro below,
+ * so it is registered here as well, like its siblings.
+ */
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_DATA);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_ATIME);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_ACCESS);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_ACL);
+TRACE_DEFINE_ENUM(NFS_INO_REVAL_PAGECACHE);
+TRACE_DEFINE_ENUM(NFS_INO_REVAL_FORCED);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_LABEL);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_CHANGE);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_CTIME);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_MTIME);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_SIZE);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_OTHER);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_XATTR);
+
+/* Render a cache_validity word as a '|'-separated list of flag names. */
+#define nfs_show_cache_validity(v) \
+	__print_flags(v, "|", \
+			{ NFS_INO_INVALID_DATA, "INVALID_DATA" }, \
+			{ NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \
+			{ NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \
+			{ NFS_INO_INVALID_ACL, "INVALID_ACL" }, \
+			{ NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \
+			{ NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \
+			{ NFS_INO_INVALID_LABEL, "INVALID_LABEL" }, \
+			{ NFS_INO_INVALID_CHANGE, "INVALID_CHANGE" }, \
+			{ NFS_INO_INVALID_CTIME, "INVALID_CTIME" }, \
+			{ NFS_INO_INVALID_MTIME, "INVALID_MTIME" }, \
+			{ NFS_INO_INVALID_SIZE, "INVALID_SIZE" }, \
+			{ NFS_INO_INVALID_OTHER, "INVALID_OTHER" }, \
+			{ NFS_INO_INVALID_XATTR, "INVALID_XATTR" })
+
+/* Register the NFS_INO_* values used by nfs_show_nfsi_flags() below. */
+TRACE_DEFINE_ENUM(NFS_INO_ADVISE_RDPLUS);
+TRACE_DEFINE_ENUM(NFS_INO_STALE);
+TRACE_DEFINE_ENUM(NFS_INO_ACL_LRU_SET);
+TRACE_DEFINE_ENUM(NFS_INO_INVALIDATING);
+TRACE_DEFINE_ENUM(NFS_INO_FSCACHE);
+TRACE_DEFINE_ENUM(NFS_INO_FSCACHE_LOCK);
+TRACE_DEFINE_ENUM(NFS_INO_LAYOUTCOMMIT);
+TRACE_DEFINE_ENUM(NFS_INO_LAYOUTCOMMITTING);
+TRACE_DEFINE_ENUM(NFS_INO_LAYOUTSTATS);
+TRACE_DEFINE_ENUM(NFS_INO_ODIRECT);
+
+/*
+ * Render an nfsi flags word as a '|'-separated list of names.  The
+ * NFS_INO_* values are wrapped in BIT() here, i.e. they are used as bit
+ * numbers rather than masks.
+ */
+#define nfs_show_nfsi_flags(v) \
+ __print_flags(v, "|", \
+ { BIT(NFS_INO_ADVISE_RDPLUS), "ADVISE_RDPLUS" }, \
+ { BIT(NFS_INO_STALE), "STALE" }, \
+ { BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \
+ { BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \
+ { BIT(NFS_INO_FSCACHE), "FSCACHE" }, \
+ { BIT(NFS_INO_FSCACHE_LOCK), "FSCACHE_LOCK" }, \
+ { BIT(NFS_INO_LAYOUTCOMMIT), "NEED_LAYOUTCOMMIT" }, \
+ { BIT(NFS_INO_LAYOUTCOMMITTING), "LAYOUTCOMMIT" }, \
+ { BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \
+ { BIT(NFS_INO_ODIRECT), "ODIRECT" })
+
+/*
+ * Event class for tracepoints that take only an inode.  It records the
+ * superblock device number, the NFS fileid, a hash of the file handle
+ * and the raw inode version.
+ */
+DECLARE_EVENT_CLASS(nfs_inode_event,
+ TP_PROTO(
+ const struct inode *inode
+ ),
+
+ TP_ARGS(inode),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode_peek_iversion_raw(inode);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu ",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (unsigned long long)__entry->version
+ )
+);
+
+/*
+ * Event class for tracepoints that take an inode and a result code.
+ * Besides the identity fields it records the file type, size, nfsi
+ * flags and cache validity.  The error is stored as a non-negative
+ * number (a negative input is negated, anything else becomes 0) and is
+ * negated again when printed.
+ */
+DECLARE_EVENT_CLASS(nfs_inode_event_done,
+ TP_PROTO(
+ const struct inode *inode,
+ int error
+ ),
+
+ TP_ARGS(inode, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(unsigned char, type)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, size)
+ __field(unsigned long, nfsi_flags)
+ __field(unsigned long, cache_validity)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ __entry->error = error < 0 ? -error : 0;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->type = nfs_umode_to_dtype(inode->i_mode);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->size = i_size_read(inode);
+ __entry->nfsi_flags = nfsi->flags;
+ __entry->cache_validity = nfsi->cache_validity;
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "type=%u (%s) version=%llu size=%lld "
+ "cache_validity=0x%lx (%s) nfs_flags=0x%lx (%s)",
+ -__entry->error, nfs_show_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->type,
+ nfs_show_file_type(__entry->type),
+ (unsigned long long)__entry->version,
+ (long long)__entry->size,
+ __entry->cache_validity,
+ nfs_show_cache_validity(__entry->cache_validity),
+ __entry->nfsi_flags,
+ nfs_show_nfsi_flags(__entry->nfsi_flags)
+ )
+);
+
+/* Define a tracepoint of class nfs_inode_event (inode argument only). */
+#define DEFINE_NFS_INODE_EVENT(name) \
+ DEFINE_EVENT(nfs_inode_event, name, \
+ TP_PROTO( \
+ const struct inode *inode \
+ ), \
+ TP_ARGS(inode))
+/* Define a tracepoint of class nfs_inode_event_done (inode plus error). */
+#define DEFINE_NFS_INODE_EVENT_DONE(name) \
+ DEFINE_EVENT(nfs_inode_event_done, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ int error \
+ ), \
+ TP_ARGS(inode, error))
+/* The *_enter events use the inode-only class, the *_exit events the _DONE class. */
+DEFINE_NFS_INODE_EVENT(nfs_set_inode_stale);
+DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit);
+DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_revalidate_inode_exit);
+DEFINE_NFS_INODE_EVENT(nfs_invalidate_mapping_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_invalidate_mapping_exit);
+DEFINE_NFS_INODE_EVENT(nfs_getattr_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_getattr_exit);
+DEFINE_NFS_INODE_EVENT(nfs_setattr_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_setattr_exit);
+DEFINE_NFS_INODE_EVENT(nfs_writeback_page_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_page_exit);
+DEFINE_NFS_INODE_EVENT(nfs_writeback_inode_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_inode_exit);
+DEFINE_NFS_INODE_EVENT(nfs_fsync_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit);
+DEFINE_NFS_INODE_EVENT(nfs_access_enter);
+
+/*
+ * Stand-alone event for the end of an access check.  It records the
+ * same fields as the nfs_inode_event_done class plus the requested
+ * mask and the permitted bits, both printed in hex.
+ */
+TRACE_EVENT(nfs_access_exit,
+ TP_PROTO(
+ const struct inode *inode,
+ unsigned int mask,
+ unsigned int permitted,
+ int error
+ ),
+
+ TP_ARGS(inode, mask, permitted, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(unsigned char, type)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, size)
+ __field(unsigned long, nfsi_flags)
+ __field(unsigned long, cache_validity)
+ __field(unsigned int, mask)
+ __field(unsigned int, permitted)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ __entry->error = error < 0 ? -error : 0;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->type = nfs_umode_to_dtype(inode->i_mode);
+ __entry->version = inode_peek_iversion_raw(inode);
+ __entry->size = i_size_read(inode);
+ __entry->nfsi_flags = nfsi->flags;
+ __entry->cache_validity = nfsi->cache_validity;
+ __entry->mask = mask;
+ __entry->permitted = permitted;
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "type=%u (%s) version=%llu size=%lld "
+ "cache_validity=0x%lx (%s) nfs_flags=0x%lx (%s) "
+ "mask=0x%x permitted=0x%x",
+ -__entry->error, nfs_show_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->type,
+ nfs_show_file_type(__entry->type),
+ (unsigned long long)__entry->version,
+ (long long)__entry->size,
+ __entry->cache_validity,
+ nfs_show_cache_validity(__entry->cache_validity),
+ __entry->nfsi_flags,
+ nfs_show_nfsi_flags(__entry->nfsi_flags),
+ __entry->mask, __entry->permitted
+ )
+);
+/*
+ * LOOKUP_* flag decoding.
+ *
+ * Each constant is passed to TRACE_DEFINE_ENUM() and then listed in
+ * show_lookup_flags(), which renders a flags word as a "|"-separated
+ * list of the names below via __print_flags().
+ */
+TRACE_DEFINE_ENUM(LOOKUP_FOLLOW);
+TRACE_DEFINE_ENUM(LOOKUP_DIRECTORY);
+TRACE_DEFINE_ENUM(LOOKUP_AUTOMOUNT);
+TRACE_DEFINE_ENUM(LOOKUP_PARENT);
+TRACE_DEFINE_ENUM(LOOKUP_REVAL);
+TRACE_DEFINE_ENUM(LOOKUP_RCU);
+TRACE_DEFINE_ENUM(LOOKUP_OPEN);
+TRACE_DEFINE_ENUM(LOOKUP_CREATE);
+TRACE_DEFINE_ENUM(LOOKUP_EXCL);
+TRACE_DEFINE_ENUM(LOOKUP_RENAME_TARGET);
+TRACE_DEFINE_ENUM(LOOKUP_JUMPED);
+TRACE_DEFINE_ENUM(LOOKUP_ROOT);
+TRACE_DEFINE_ENUM(LOOKUP_EMPTY);
+TRACE_DEFINE_ENUM(LOOKUP_DOWN);
+
+#define show_lookup_flags(flags) \
+ __print_flags(flags, "|", \
+ { LOOKUP_FOLLOW, "FOLLOW" }, \
+ { LOOKUP_DIRECTORY, "DIRECTORY" }, \
+ { LOOKUP_AUTOMOUNT, "AUTOMOUNT" }, \
+ { LOOKUP_PARENT, "PARENT" }, \
+ { LOOKUP_REVAL, "REVAL" }, \
+ { LOOKUP_RCU, "RCU" }, \
+ { LOOKUP_OPEN, "OPEN" }, \
+ { LOOKUP_CREATE, "CREATE" }, \
+ { LOOKUP_EXCL, "EXCL" }, \
+ { LOOKUP_RENAME_TARGET, "RENAME_TARGET" }, \
+ { LOOKUP_JUMPED, "JUMPED" }, \
+ { LOOKUP_ROOT, "ROOT" }, \
+ { LOOKUP_EMPTY, "EMPTY" }, \
+ { LOOKUP_DOWN, "DOWN" })
+/*
+ * nfs_lookup_event - event class for the start of a lookup-style operation.
+ *
+ * Records the lookup flags, the parent directory (device + fileid) and a
+ * copy of the dentry name.  Printed as "dev:fileid/name".
+ */
+DECLARE_EVENT_CLASS(nfs_lookup_event,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry,
+ unsigned int flags
+ ),
+
+ TP_ARGS(dir, dentry, flags),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, flags)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->flags = flags;
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
+ __entry->flags,
+ show_lookup_flags(__entry->flags),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+/* Instantiate a tracepoint called @name from the nfs_lookup_event class. */
+#define DEFINE_NFS_LOOKUP_EVENT(name) \
+ DEFINE_EVENT(nfs_lookup_event, name, \
+ TP_PROTO( \
+ const struct inode *dir, \
+ const struct dentry *dentry, \
+ unsigned int flags \
+ ), \
+ TP_ARGS(dir, dentry, flags))
+/*
+ * nfs_lookup_event_done - event class for the completion of a lookup-style
+ * operation.
+ *
+ * Same payload as nfs_lookup_event plus the error, which is stored as a
+ * non-negative magnitude (0 when error >= 0) and negated again for display.
+ */
+DECLARE_EVENT_CLASS(nfs_lookup_event_done,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry,
+ unsigned int flags,
+ int error
+ ),
+
+ TP_ARGS(dir, dentry, flags, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(unsigned long, flags)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->error = error < 0 ? -error : 0; /* magnitude only; non-negative input becomes 0 */
+ __entry->flags = flags;
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
+ -__entry->error, nfs_show_status(__entry->error),
+ __entry->flags,
+ show_lookup_flags(__entry->flags),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+/* Instantiate a tracepoint called @name from the nfs_lookup_event_done class. */
+#define DEFINE_NFS_LOOKUP_EVENT_DONE(name) \
+ DEFINE_EVENT(nfs_lookup_event_done, name, \
+ TP_PROTO( \
+ const struct inode *dir, \
+ const struct dentry *dentry, \
+ unsigned int flags, \
+ int error \
+ ), \
+ TP_ARGS(dir, dentry, flags, error))
+/* Enter/exit tracepoint pairs built from the two lookup event classes. */
+DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_enter);
+DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit);
+DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter);
+DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit);
+/*
+ * open(2) flag decoding.
+ *
+ * show_open_flags() renders a flags word as a "|"-separated list of the
+ * O_* names below via __print_flags().  O_RDONLY is not listed.
+ */
+TRACE_DEFINE_ENUM(O_WRONLY);
+TRACE_DEFINE_ENUM(O_RDWR);
+TRACE_DEFINE_ENUM(O_CREAT);
+TRACE_DEFINE_ENUM(O_EXCL);
+TRACE_DEFINE_ENUM(O_NOCTTY);
+TRACE_DEFINE_ENUM(O_TRUNC);
+TRACE_DEFINE_ENUM(O_APPEND);
+TRACE_DEFINE_ENUM(O_NONBLOCK);
+TRACE_DEFINE_ENUM(O_DSYNC);
+TRACE_DEFINE_ENUM(O_DIRECT);
+TRACE_DEFINE_ENUM(O_LARGEFILE);
+TRACE_DEFINE_ENUM(O_DIRECTORY);
+TRACE_DEFINE_ENUM(O_NOFOLLOW);
+TRACE_DEFINE_ENUM(O_NOATIME);
+TRACE_DEFINE_ENUM(O_CLOEXEC);
+
+#define show_open_flags(flags) \
+ __print_flags(flags, "|", \
+ { O_WRONLY, "O_WRONLY" }, \
+ { O_RDWR, "O_RDWR" }, \
+ { O_CREAT, "O_CREAT" }, \
+ { O_EXCL, "O_EXCL" }, \
+ { O_NOCTTY, "O_NOCTTY" }, \
+ { O_TRUNC, "O_TRUNC" }, \
+ { O_APPEND, "O_APPEND" }, \
+ { O_NONBLOCK, "O_NONBLOCK" }, \
+ { O_DSYNC, "O_DSYNC" }, \
+ { O_DIRECT, "O_DIRECT" }, \
+ { O_LARGEFILE, "O_LARGEFILE" }, \
+ { O_DIRECTORY, "O_DIRECTORY" }, \
+ { O_NOFOLLOW, "O_NOFOLLOW" }, \
+ { O_NOATIME, "O_NOATIME" }, \
+ { O_CLOEXEC, "O_CLOEXEC" })
+/*
+ * fmode_t decoding: show_fmode_flags() prints READ/WRITE/EXEC joined by "|".
+ * The __force casts convert the FMODE_* constants to plain unsigned long
+ * for the __print_flags() table.
+ */
+TRACE_DEFINE_ENUM(FMODE_READ);
+TRACE_DEFINE_ENUM(FMODE_WRITE);
+TRACE_DEFINE_ENUM(FMODE_EXEC);
+
+#define show_fmode_flags(mode) \
+ __print_flags(mode, "|", \
+ { ((__force unsigned long)FMODE_READ), "READ" }, \
+ { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \
+ { ((__force unsigned long)FMODE_EXEC), "EXEC" })
+/*
+ * nfs_atomic_open_enter - tracepoint for the start of an atomic open.
+ *
+ * Records the open flags, the fmode taken from the open context, the
+ * parent directory (device + fileid) and the name of the context's dentry.
+ */
+TRACE_EVENT(nfs_atomic_open_enter,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct nfs_open_context *ctx,
+ unsigned int flags
+ ),
+
+ TP_ARGS(dir, ctx, flags),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, flags)
+ __field(unsigned int, fmode)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, ctx->dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->flags = flags;
+ __entry->fmode = (__force unsigned int)ctx->mode;
+ __assign_str(name, ctx->dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "flags=0x%lx (%s) fmode=%s name=%02x:%02x:%llu/%s",
+ __entry->flags,
+ show_open_flags(__entry->flags),
+ show_fmode_flags(__entry->fmode),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+/*
+ * nfs_atomic_open_exit - tracepoint for the completion of an atomic open.
+ *
+ * Same payload as nfs_atomic_open_enter plus the error.
+ * NOTE(review): the error is stored as plain -error, without the
+ * "error < 0 ? -error : 0" clamp used by nfs_lookup_event_done; confirm
+ * callers never pass a positive value.
+ */
+TRACE_EVENT(nfs_atomic_open_exit,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct nfs_open_context *ctx,
+ unsigned int flags,
+ int error
+ ),
+
+ TP_ARGS(dir, ctx, flags, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(unsigned long, flags)
+ __field(unsigned int, fmode)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, ctx->dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->error = -error; /* negated here, negated back in TP_printk */
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->flags = flags;
+ __entry->fmode = (__force unsigned int)ctx->mode;
+ __assign_str(name, ctx->dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) flags=0x%lx (%s) fmode=%s "
+ "name=%02x:%02x:%llu/%s",
+ -__entry->error, nfs_show_status(__entry->error),
+ __entry->flags,
+ show_open_flags(__entry->flags),
+ show_fmode_flags(__entry->fmode),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+/*
+ * nfs_create_enter - tracepoint for the start of a create.
+ *
+ * Records the flags (decoded with show_open_flags()), the parent directory
+ * (device + fileid) and the dentry name.
+ */
+TRACE_EVENT(nfs_create_enter,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry,
+ unsigned int flags
+ ),
+
+ TP_ARGS(dir, dentry, flags),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, flags)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->flags = flags;
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
+ __entry->flags,
+ show_open_flags(__entry->flags),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+/*
+ * nfs_create_exit - tracepoint for the completion of a create.
+ *
+ * Same payload as nfs_create_enter plus the error, stored as -error and
+ * negated back when printed.
+ */
+TRACE_EVENT(nfs_create_exit,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry,
+ unsigned int flags,
+ int error
+ ),
+
+ TP_ARGS(dir, dentry, flags, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(unsigned long, flags)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->error = -error; /* no clamp for error >= 0, unlike the lookup class */
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->flags = flags;
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
+ -__entry->error, nfs_show_status(__entry->error),
+ __entry->flags,
+ show_open_flags(__entry->flags),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+/*
+ * nfs_directory_event - event class for the start of a directory operation.
+ *
+ * Records only the parent directory (device + fileid) and the dentry name.
+ */
+DECLARE_EVENT_CLASS(nfs_directory_event,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry
+ ),
+
+ TP_ARGS(dir, dentry),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "name=%02x:%02x:%llu/%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+/* Instantiate a tracepoint called @name from the nfs_directory_event class. */
+#define DEFINE_NFS_DIRECTORY_EVENT(name) \
+ DEFINE_EVENT(nfs_directory_event, name, \
+ TP_PROTO( \
+ const struct inode *dir, \
+ const struct dentry *dentry \
+ ), \
+ TP_ARGS(dir, dentry))
+/*
+ * nfs_directory_event_done - event class for the completion of a directory
+ * operation.
+ *
+ * Adds the error to the nfs_directory_event payload; the error is stored
+ * as a non-negative magnitude (0 when error >= 0).
+ */
+DECLARE_EVENT_CLASS(nfs_directory_event_done,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry,
+ int error
+ ),
+
+ TP_ARGS(dir, dentry, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->error = error < 0 ? -error : 0; /* magnitude only; non-negative input becomes 0 */
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) name=%02x:%02x:%llu/%s",
+ -__entry->error, nfs_show_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+/* Instantiate a tracepoint called @name from the nfs_directory_event_done class. */
+#define DEFINE_NFS_DIRECTORY_EVENT_DONE(name) \
+ DEFINE_EVENT(nfs_directory_event_done, name, \
+ TP_PROTO( \
+ const struct inode *dir, \
+ const struct dentry *dentry, \
+ int error \
+ ), \
+ TP_ARGS(dir, dentry, error))
+/* Enter/exit tracepoint pairs built from the two directory event classes. */
+DEFINE_NFS_DIRECTORY_EVENT(nfs_mknod_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mknod_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_mkdir_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mkdir_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_rmdir_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_rmdir_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_remove_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_remove_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_unlink_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_unlink_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_symlink_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_symlink_exit);
+/*
+ * nfs_link_enter - tracepoint for the start of a link operation.
+ *
+ * Records the fileid of @inode, the fileid of @dir and the dentry name.
+ * Only one device number is stored (from @inode's superblock) and it is
+ * printed for both the fileid and the directory.
+ */
+TRACE_EVENT(nfs_link_enter,
+ TP_PROTO(
+ const struct inode *inode,
+ const struct inode *dir,
+ const struct dentry *dentry
+ ),
+
+ TP_ARGS(inode, dir, dentry),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dir = NFS_FILEID(dir);
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->fileid,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+/*
+ * nfs_link_exit - tracepoint for the completion of a link operation.
+ *
+ * Same payload as nfs_link_enter plus the error, stored as a non-negative
+ * magnitude (0 when error >= 0) and negated again for display.
+ */
+TRACE_EVENT(nfs_link_exit,
+ TP_PROTO(
+ const struct inode *inode,
+ const struct inode *dir,
+ const struct dentry *dentry,
+ int error
+ ),
+
+ TP_ARGS(inode, dir, dentry, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dir = NFS_FILEID(dir);
+ __entry->error = error < 0 ? -error : 0; /* magnitude only; non-negative input becomes 0 */
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s",
+ -__entry->error, nfs_show_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->fileid,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+/*
+ * nfs_rename_event - event class for the start of a rename.
+ *
+ * Records the old and new directory fileids and copies of both dentry
+ * names.  A single device number, taken from @old_dir, is printed for
+ * both the old and the new name.
+ */
+DECLARE_EVENT_CLASS(nfs_rename_event,
+ TP_PROTO(
+ const struct inode *old_dir,
+ const struct dentry *old_dentry,
+ const struct inode *new_dir,
+ const struct dentry *new_dentry
+ ),
+
+ TP_ARGS(old_dir, old_dentry, new_dir, new_dentry),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, old_dir)
+ __field(u64, new_dir)
+ __string(old_name, old_dentry->d_name.name)
+ __string(new_name, new_dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = old_dir->i_sb->s_dev;
+ __entry->old_dir = NFS_FILEID(old_dir);
+ __entry->new_dir = NFS_FILEID(new_dir);
+ __assign_str(old_name, old_dentry->d_name.name);
+ __assign_str(new_name, new_dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "old_name=%02x:%02x:%llu/%s new_name=%02x:%02x:%llu/%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->old_dir,
+ __get_str(old_name),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->new_dir,
+ __get_str(new_name)
+ )
+);
+#define DEFINE_NFS_RENAME_EVENT(name) /* instantiate @name from nfs_rename_event */ \
+ DEFINE_EVENT(nfs_rename_event, name, \
+ TP_PROTO( \
+ const struct inode *old_dir, \
+ const struct dentry *old_dentry, \
+ const struct inode *new_dir, \
+ const struct dentry *new_dentry \
+ ), \
+ TP_ARGS(old_dir, old_dentry, new_dir, new_dentry))
+/*
+ * nfs_rename_event_done - event class for the completion of a rename.
+ *
+ * Same payload as nfs_rename_event plus the error, stored as -error and
+ * negated back when printed (no clamp for error >= 0).
+ */
+DECLARE_EVENT_CLASS(nfs_rename_event_done,
+ TP_PROTO(
+ const struct inode *old_dir,
+ const struct dentry *old_dentry,
+ const struct inode *new_dir,
+ const struct dentry *new_dentry,
+ int error
+ ),
+
+ TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, error)
+ __field(u64, old_dir)
+ __string(old_name, old_dentry->d_name.name)
+ __field(u64, new_dir)
+ __string(new_name, new_dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = old_dir->i_sb->s_dev;
+ __entry->error = -error; /* negated here, negated back in TP_printk */
+ __entry->old_dir = NFS_FILEID(old_dir);
+ __entry->new_dir = NFS_FILEID(new_dir);
+ __assign_str(old_name, old_dentry->d_name.name);
+ __assign_str(new_name, new_dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%ld (%s) old_name=%02x:%02x:%llu/%s "
+ "new_name=%02x:%02x:%llu/%s",
+ -__entry->error, nfs_show_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->old_dir,
+ __get_str(old_name),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->new_dir,
+ __get_str(new_name)
+ )
+);
+#define DEFINE_NFS_RENAME_EVENT_DONE(name) /* instantiate @name from nfs_rename_event_done */ \
+ DEFINE_EVENT(nfs_rename_event_done, name, \
+ TP_PROTO( \
+ const struct inode *old_dir, \
+ const struct dentry *old_dentry, \
+ const struct inode *new_dir, \
+ const struct dentry *new_dentry, \
+ int error \
+ ), \
+ TP_ARGS(old_dir, old_dentry, new_dir, \
+ new_dentry, error))
+/* Rename tracepoints; nfs_sillyrename_rename reuses the "done" class. */
+DEFINE_NFS_RENAME_EVENT(nfs_rename_enter);
+DEFINE_NFS_RENAME_EVENT_DONE(nfs_rename_exit);
+
+DEFINE_NFS_RENAME_EVENT_DONE(nfs_sillyrename_rename);
+/*
+ * nfs_sillyrename_unlink - tracepoint reporting the result of unlinking a
+ * sillyrenamed file.
+ *
+ * The name comes from data->args.name as pointer + length, so it is copied
+ * with memcpy() into a dynamic array sized len + 1 and NUL-terminated by
+ * hand.  The directory is the inode of the dentry's parent.
+ */
+TRACE_EVENT(nfs_sillyrename_unlink,
+ TP_PROTO(
+ const struct nfs_unlinkdata *data,
+ int error
+ ),
+
+ TP_ARGS(data, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, error)
+ __field(u64, dir)
+ __dynamic_array(char, name, data->args.name.len + 1)
+ ),
+
+ TP_fast_assign(
+ struct inode *dir = d_inode(data->dentry->d_parent);
+ size_t len = data->args.name.len;
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->error = -error; /* negated here, negated back in TP_printk */
+ memcpy(__get_str(name),
+ data->args.name.name, len);
+ __get_str(name)[len] = 0; /* terminator occupies the extra byte reserved above */
+ ),
+
+ TP_printk(
+ "error=%ld (%s) name=%02x:%02x:%llu/%s",
+ -__entry->error, nfs_show_status(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+/*
+ * nfs_initiate_read - tracepoint describing a read request.
+ *
+ * Records the inode identity plus the offset and byte count from the
+ * request arguments.  The filehandle hash uses hdr->args.fh when it is
+ * set, otherwise the inode's own filehandle.
+ */
+TRACE_EVENT(nfs_initiate_read,
+ TP_PROTO(
+ const struct nfs_pgio_header *hdr
+ ),
+
+ TP_ARGS(hdr),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, count)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh;
+
+ __entry->offset = hdr->args.offset;
+ __entry->count = hdr->args.count;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset, __entry->count
+ )
+);
+/*
+ * nfs_readpage_done - tracepoint describing a finished read.
+ *
+ * Records the requested offset/count, the count and eof flag from the
+ * result, and the RPC task status.  " eof" is appended to the output
+ * only when the eof flag is set.
+ */
+TRACE_EVENT(nfs_readpage_done,
+ TP_PROTO(
+ const struct rpc_task *task,
+ const struct nfs_pgio_header *hdr
+ ),
+
+ TP_ARGS(task, hdr),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, arg_count)
+ __field(u32, res_count)
+ __field(bool, eof)
+ __field(int, status)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh;
+
+ __entry->status = task->tk_status;
+ __entry->offset = hdr->args.offset;
+ __entry->arg_count = hdr->args.count;
+ __entry->res_count = hdr->res.count;
+ __entry->eof = hdr->res.eof;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u status=%d%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset, __entry->arg_count,
+ __entry->res_count, __entry->status,
+ __entry->eof ? " eof" : ""
+ )
+);
+/*
+ * nfs_readpage_short - tracepoint with the same record layout and output
+ * format as nfs_readpage_done.
+ * NOTE(review): presumably emitted when a read returns fewer bytes than
+ * requested -- confirm against the caller.
+ */
+TRACE_EVENT(nfs_readpage_short,
+ TP_PROTO(
+ const struct rpc_task *task,
+ const struct nfs_pgio_header *hdr
+ ),
+
+ TP_ARGS(task, hdr),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, arg_count)
+ __field(u32, res_count)
+ __field(bool, eof)
+ __field(int, status)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh;
+
+ __entry->status = task->tk_status;
+ __entry->offset = hdr->args.offset;
+ __entry->arg_count = hdr->args.count;
+ __entry->res_count = hdr->res.count;
+ __entry->eof = hdr->res.eof;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u status=%d%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset, __entry->arg_count,
+ __entry->res_count, __entry->status,
+ __entry->eof ? " eof" : ""
+ )
+);
+
+/*
+ * nfs_pgio_error - tracepoint reporting an error on a page I/O request.
+ *
+ * Records the inode identity, the requested offset/count, the count from
+ * the result, the caller-supplied position @pos and the caller-supplied
+ * @error (stored unmodified in "status").  The filehandle hash uses
+ * hdr->args.fh when it is set, otherwise the inode's own filehandle.
+ */
+TRACE_EVENT(nfs_pgio_error,
+ TP_PROTO(
+ const struct nfs_pgio_header *hdr,
+ int error,
+ loff_t pos
+ ),
+
+ TP_ARGS(hdr, error, pos),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, arg_count)
+ __field(u32, res_count)
+ __field(loff_t, pos)
+ __field(int, status)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh;
+
+ __entry->status = error;
+ __entry->offset = hdr->args.offset;
+ __entry->arg_count = hdr->args.count;
+ __entry->res_count = hdr->res.count;
+ /* pos is printed below; without this store it would be stale ring-buffer data */
+ __entry->pos = pos;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk("fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u pos=%llu status=%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid, __entry->fhandle,
+ (long long)__entry->offset, __entry->arg_count, __entry->res_count,
+ __entry->pos, __entry->status
+ )
+);
+/*
+ * Write stability decoding: nfs_show_stable() maps one of the three
+ * NFS_UNSTABLE / NFS_DATA_SYNC / NFS_FILE_SYNC values to its name via
+ * __print_symbolic().
+ */
+TRACE_DEFINE_ENUM(NFS_UNSTABLE);
+TRACE_DEFINE_ENUM(NFS_DATA_SYNC);
+TRACE_DEFINE_ENUM(NFS_FILE_SYNC);
+
+#define nfs_show_stable(stable) \
+ __print_symbolic(stable, \
+ { NFS_UNSTABLE, "UNSTABLE" }, \
+ { NFS_DATA_SYNC, "DATA_SYNC" }, \
+ { NFS_FILE_SYNC, "FILE_SYNC" })
+/*
+ * nfs_initiate_write - tracepoint describing a write request.
+ *
+ * Records the inode identity, the offset and byte count from the request
+ * arguments, and the requested stability level (hdr->args.stable).
+ */
+TRACE_EVENT(nfs_initiate_write,
+ TP_PROTO(
+ const struct nfs_pgio_header *hdr
+ ),
+
+ TP_ARGS(hdr),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, count)
+ __field(enum nfs3_stable_how, stable)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh;
+
+ __entry->offset = hdr->args.offset;
+ __entry->count = hdr->args.count;
+ __entry->stable = hdr->args.stable;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u stable=%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset, __entry->count,
+ nfs_show_stable(__entry->stable)
+ )
+);
+/*
+ * nfs_writeback_done - tracepoint describing a finished write.
+ *
+ * Records the requested offset/count, the count from the result, the RPC
+ * task status, and the write verifier from the result: its "committed"
+ * level is stored as "stable" and its verifier bytes are copied and
+ * printed as hex.  hdr->res.verf is dereferenced unconditionally.
+ */
+TRACE_EVENT(nfs_writeback_done,
+ TP_PROTO(
+ const struct rpc_task *task,
+ const struct nfs_pgio_header *hdr
+ ),
+
+ TP_ARGS(task, hdr),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, arg_count)
+ __field(u32, res_count)
+ __field(int, status)
+ __field(enum nfs3_stable_how, stable)
+ __array(char, verifier, NFS4_VERIFIER_SIZE)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = hdr->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = hdr->args.fh ?
+ hdr->args.fh : &nfsi->fh;
+ const struct nfs_writeverf *verf = hdr->res.verf;
+
+ __entry->status = task->tk_status;
+ __entry->offset = hdr->args.offset;
+ __entry->arg_count = hdr->args.count;
+ __entry->res_count = hdr->res.count;
+ __entry->stable = verf->committed; /* level from the result, not the one requested in args */
+ memcpy(__entry->verifier,
+ &verf->verifier,
+ NFS4_VERIFIER_SIZE);
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u status=%d stable=%s "
+ "verifier=%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset, __entry->arg_count,
+ __entry->res_count, __entry->status,
+ nfs_show_stable(__entry->stable),
+ __print_hex_str(__entry->verifier, NFS4_VERIFIER_SIZE)
+ )
+);
+/*
+ * nfs_page_error_class - event class reporting an error on one nfs_page.
+ *
+ * Records the request pointer itself plus its wb_index, wb_offset,
+ * wb_pgbase and wb_bytes fields, and the error exactly as passed in.
+ */
+DECLARE_EVENT_CLASS(nfs_page_error_class,
+ TP_PROTO(
+ const struct nfs_page *req,
+ int error
+ ),
+
+ TP_ARGS(req, error),
+
+ TP_STRUCT__entry(
+ __field(const void *, req)
+ __field(pgoff_t, index)
+ __field(unsigned int, offset)
+ __field(unsigned int, pgbase)
+ __field(unsigned int, bytes)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ __entry->req = req;
+ __entry->index = req->wb_index;
+ __entry->offset = req->wb_offset;
+ __entry->pgbase = req->wb_pgbase;
+ __entry->bytes = req->wb_bytes;
+ __entry->error = error;
+ ),
+
+ TP_printk(
+ "req=%p index=%lu offset=%u pgbase=%u bytes=%u error=%d",
+ __entry->req, __entry->index, __entry->offset,
+ __entry->pgbase, __entry->bytes, __entry->error
+ )
+);
+/* Instantiate a tracepoint called @name from the nfs_page_error_class class. */
+#define DEFINE_NFS_PAGEERR_EVENT(name) \
+ DEFINE_EVENT(nfs_page_error_class, name, \
+ TP_PROTO( \
+ const struct nfs_page *req, \
+ int error \
+ ), \
+ TP_ARGS(req, error))
+/* Per-request error tracepoints built from nfs_page_error_class. */
+DEFINE_NFS_PAGEERR_EVENT(nfs_write_error);
+DEFINE_NFS_PAGEERR_EVENT(nfs_comp_error);
+DEFINE_NFS_PAGEERR_EVENT(nfs_commit_error);
+/*
+ * nfs_initiate_commit - tracepoint describing a commit request.
+ *
+ * Records the inode identity plus the offset and count from the commit
+ * arguments.  The filehandle hash uses data->args.fh when it is set,
+ * otherwise the inode's own filehandle.
+ */
+TRACE_EVENT(nfs_initiate_commit,
+ TP_PROTO(
+ const struct nfs_commit_data *data
+ ),
+
+ TP_ARGS(data),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(u32, count)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = data->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = data->args.fh ?
+ data->args.fh : &nfsi->fh;
+
+ __entry->offset = data->args.offset;
+ __entry->count = data->args.count;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset, __entry->count
+ )
+);
+/*
+ * nfs_commit_done - tracepoint describing a finished commit.
+ *
+ * Records the inode identity, the commit offset, the RPC task status and
+ * the verifier from the result ("committed" level and verifier bytes,
+ * printed as hex).  data->res.verf is dereferenced unconditionally.
+ */
+TRACE_EVENT(nfs_commit_done,
+ TP_PROTO(
+ const struct rpc_task *task,
+ const struct nfs_commit_data *data
+ ),
+
+ TP_ARGS(task, data),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(int, status)
+ __field(enum nfs3_stable_how, stable)
+ __array(char, verifier, NFS4_VERIFIER_SIZE)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = data->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = data->args.fh ?
+ data->args.fh : &nfsi->fh;
+ const struct nfs_writeverf *verf = data->res.verf;
+
+ __entry->status = task->tk_status;
+ __entry->offset = data->args.offset;
+ __entry->stable = verf->committed;
+ memcpy(__entry->verifier,
+ &verf->verifier,
+ NFS4_VERIFIER_SIZE);
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld status=%d stable=%s verifier=%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset, __entry->status,
+ nfs_show_stable(__entry->stable),
+ __print_hex_str(__entry->verifier, NFS4_VERIFIER_SIZE)
+ )
+);
+/*
+ * nfs_fh_to_dentry - tracepoint reporting a filehandle-to-dentry
+ * conversion.
+ *
+ * Takes the superblock, filehandle and fileid directly (no inode) and
+ * stores the error unmodified as a signed int.
+ * NOTE(review): the format string ends with a trailing space.
+ */
+TRACE_EVENT(nfs_fh_to_dentry,
+ TP_PROTO(
+ const struct super_block *sb,
+ const struct nfs_fh *fh,
+ u64 fileid,
+ int error
+ ),
+
+ TP_ARGS(sb, fh, fileid, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error;
+ __entry->dev = sb->s_dev;
+ __entry->fileid = fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ ),
+
+ TP_printk(
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x ",
+ __entry->error,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle
+ )
+);
+/*
+ * Status decoding.
+ *
+ * nfs_show_status() maps a (positive) status value to a short name via
+ * __print_symbolic().  The table is made of NFS_OK / NFSERR_* values with
+ * one plain errno, ECHILD, mixed in.  Note that NFSERR_SERVERFAULT is
+ * labelled "REMOTEIO".
+ */
+TRACE_DEFINE_ENUM(NFS_OK);
+TRACE_DEFINE_ENUM(NFSERR_PERM);
+TRACE_DEFINE_ENUM(NFSERR_NOENT);
+TRACE_DEFINE_ENUM(NFSERR_IO);
+TRACE_DEFINE_ENUM(NFSERR_NXIO);
+TRACE_DEFINE_ENUM(ECHILD);
+TRACE_DEFINE_ENUM(NFSERR_EAGAIN);
+TRACE_DEFINE_ENUM(NFSERR_ACCES);
+TRACE_DEFINE_ENUM(NFSERR_EXIST);
+TRACE_DEFINE_ENUM(NFSERR_XDEV);
+TRACE_DEFINE_ENUM(NFSERR_NODEV);
+TRACE_DEFINE_ENUM(NFSERR_NOTDIR);
+TRACE_DEFINE_ENUM(NFSERR_ISDIR);
+TRACE_DEFINE_ENUM(NFSERR_INVAL);
+TRACE_DEFINE_ENUM(NFSERR_FBIG);
+TRACE_DEFINE_ENUM(NFSERR_NOSPC);
+TRACE_DEFINE_ENUM(NFSERR_ROFS);
+TRACE_DEFINE_ENUM(NFSERR_MLINK);
+TRACE_DEFINE_ENUM(NFSERR_OPNOTSUPP);
+TRACE_DEFINE_ENUM(NFSERR_NAMETOOLONG);
+TRACE_DEFINE_ENUM(NFSERR_NOTEMPTY);
+TRACE_DEFINE_ENUM(NFSERR_DQUOT);
+TRACE_DEFINE_ENUM(NFSERR_STALE);
+TRACE_DEFINE_ENUM(NFSERR_REMOTE);
+TRACE_DEFINE_ENUM(NFSERR_WFLUSH);
+TRACE_DEFINE_ENUM(NFSERR_BADHANDLE);
+TRACE_DEFINE_ENUM(NFSERR_NOT_SYNC);
+TRACE_DEFINE_ENUM(NFSERR_BAD_COOKIE);
+TRACE_DEFINE_ENUM(NFSERR_NOTSUPP);
+TRACE_DEFINE_ENUM(NFSERR_TOOSMALL);
+TRACE_DEFINE_ENUM(NFSERR_SERVERFAULT);
+TRACE_DEFINE_ENUM(NFSERR_BADTYPE);
+TRACE_DEFINE_ENUM(NFSERR_JUKEBOX);
+
+#define nfs_show_status(x) \
+ __print_symbolic(x, \
+ { NFS_OK, "OK" }, \
+ { NFSERR_PERM, "PERM" }, \
+ { NFSERR_NOENT, "NOENT" }, \
+ { NFSERR_IO, "IO" }, \
+ { NFSERR_NXIO, "NXIO" }, \
+ { ECHILD, "CHILD" }, \
+ { NFSERR_EAGAIN, "AGAIN" }, \
+ { NFSERR_ACCES, "ACCES" }, \
+ { NFSERR_EXIST, "EXIST" }, \
+ { NFSERR_XDEV, "XDEV" }, \
+ { NFSERR_NODEV, "NODEV" }, \
+ { NFSERR_NOTDIR, "NOTDIR" }, \
+ { NFSERR_ISDIR, "ISDIR" }, \
+ { NFSERR_INVAL, "INVAL" }, \
+ { NFSERR_FBIG, "FBIG" }, \
+ { NFSERR_NOSPC, "NOSPC" }, \
+ { NFSERR_ROFS, "ROFS" }, \
+ { NFSERR_MLINK, "MLINK" }, \
+ { NFSERR_OPNOTSUPP, "OPNOTSUPP" }, \
+ { NFSERR_NAMETOOLONG, "NAMETOOLONG" }, \
+ { NFSERR_NOTEMPTY, "NOTEMPTY" }, \
+ { NFSERR_DQUOT, "DQUOT" }, \
+ { NFSERR_STALE, "STALE" }, \
+ { NFSERR_REMOTE, "REMOTE" }, \
+ { NFSERR_WFLUSH, "WFLUSH" }, \
+ { NFSERR_BADHANDLE, "BADHANDLE" }, \
+ { NFSERR_NOT_SYNC, "NOTSYNC" }, \
+ { NFSERR_BAD_COOKIE, "BADCOOKIE" }, \
+ { NFSERR_NOTSUPP, "NOTSUPP" }, \
+ { NFSERR_TOOSMALL, "TOOSMALL" }, \
+ { NFSERR_SERVERFAULT, "REMOTEIO" }, \
+ { NFSERR_BADTYPE, "BADTYPE" }, \
+ { NFSERR_JUKEBOX, "JUKEBOX" })
+
+/*
+ * nfs_xdr_status - tracepoint reporting a status seen on an XDR stream.
+ *
+ * The RPC request and task are reached through xdr->rqst; the event
+ * records the task and client ids, the XID (converted to CPU byte order),
+ * the client's program name and version, and the procedure name.
+ * Unlike the *_exit events, @error is stored as passed in and only
+ * negated for the numeric part of the output.
+ */
+TRACE_EVENT(nfs_xdr_status,
+ TP_PROTO(
+ const struct xdr_stream *xdr,
+ int error
+ ),
+
+ TP_ARGS(xdr, error),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, task_id)
+ __field(unsigned int, client_id)
+ __field(u32, xid)
+ __field(int, version)
+ __field(unsigned long, error)
+ __string(program,
+ xdr->rqst->rq_task->tk_client->cl_program->name)
+ __string(procedure,
+ xdr->rqst->rq_task->tk_msg.rpc_proc->p_name)
+ ),
+
+ TP_fast_assign(
+ const struct rpc_rqst *rqstp = xdr->rqst;
+ const struct rpc_task *task = rqstp->rq_task;
+
+ __entry->task_id = task->tk_pid;
+ __entry->client_id = task->tk_client->cl_clid;
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->version = task->tk_client->cl_vers;
+ __entry->error = error;
+ /* terminate explicitly rather than rely on the macro's expansion */
+ __assign_str(program,
+ task->tk_client->cl_program->name);
+ __assign_str(procedure, task->tk_msg.rpc_proc->p_name);
+ ),
+
+ TP_printk(
+ "task:%u@%d xid=0x%08x %sv%d %s error=%ld (%s)",
+ __entry->task_id, __entry->client_id, __entry->xid,
+ __get_str(program), __entry->version,
+ __get_str(procedure), -__entry->error,
+ nfs_show_status(__entry->error)
+ )
+);
+
+#endif /* _TRACE_NFS_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE nfstrace
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
new file mode 100644
index 000000000..17fef6eb4
--- /dev/null
+++ b/fs/nfs/pagelist.c
@@ -0,0 +1,1470 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/pagelist.c
+ *
+ * A set of helper functions for managing NFS read and write requests.
+ * The main purpose of these routines is to provide support for the
+ * coalescing of several requests into a single RPC call.
+ *
+ * Copyright 2000, 2001 (c) Trond Myklebust <trond.myklebust@fys.uio.no>
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/sched.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/nfs_mount.h>
+#include <linux/export.h>
+
+#include "internal.h"
+#include "pnfs.h"
+#include "nfstrace.h"
+
+#define NFSDBG_FACILITY NFSDBG_PAGECACHE
+
+static struct kmem_cache *nfs_page_cachep; /* slab cache that struct nfs_page objects are allocated from and freed to */
+static const struct rpc_call_ops nfs_pgio_common_ops; /* declared here so nfs_generic_pgio() can reference it; its initialiser is not in this part of the file */
+
+/*
+ * Look up mirror @idx of @desc. If the pageio ops provide a pg_get_mirror()
+ * hook it decides; otherwise the first mirror is returned regardless of @idx.
+ */
+static struct nfs_pgio_mirror *
+nfs_pgio_get_mirror(struct nfs_pageio_descriptor *desc, u32 idx)
+{
+	if (!desc->pg_ops->pg_get_mirror)
+		return &desc->pg_mirrors[0];
+	return desc->pg_ops->pg_get_mirror(desc, idx);
+}
+
+/* Return the mirror of @desc selected by desc->pg_mirror_idx. */
+struct nfs_pgio_mirror *
+nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc)
+{
+	u32 midx = desc->pg_mirror_idx;
+
+	return nfs_pgio_get_mirror(desc, midx);
+}
+EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror);
+
+/*
+ * Ask the pageio ops to select mirror @idx and return whatever the
+ * pg_set_mirror() hook returns. Without a hook nothing is changed and the
+ * descriptor's current pg_mirror_idx is returned.
+ */
+static u32
+nfs_pgio_set_current_mirror(struct nfs_pageio_descriptor *desc, u32 idx)
+{
+	if (!desc->pg_ops->pg_set_mirror)
+		return desc->pg_mirror_idx;
+	return desc->pg_ops->pg_set_mirror(desc, idx);
+}
+
+/*
+ * nfs_pgheader_init - fill in a pgio header from a pageio descriptor
+ * @desc: descriptor whose current mirror supplies the request list
+ * @hdr: header to initialise
+ * @release: callback stored in hdr->release
+ *
+ * The first request on the current mirror's list becomes hdr->req and
+ * provides the credential and starting offset. The completion ops'
+ * optional init_hdr() hook runs before the mirror index is recorded.
+ */
+void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
+		       struct nfs_pgio_header *hdr,
+		       void (*release)(struct nfs_pgio_header *hdr))
+{
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+	struct nfs_page *first = nfs_list_entry(mirror->pg_list.next);
+
+	hdr->req = first;
+	hdr->inode = desc->pg_inode;
+	hdr->cred = nfs_req_openctx(first)->cred;
+	hdr->io_start = req_offset(first);
+	hdr->good_bytes = mirror->pg_count;
+	hdr->io_completion = desc->pg_io_completion;
+	hdr->dreq = desc->pg_dreq;
+	hdr->release = release;
+	hdr->completion_ops = desc->pg_completion_ops;
+	if (hdr->completion_ops->init_hdr != NULL)
+		hdr->completion_ops->init_hdr(hdr);
+
+	hdr->pgio_mirror_idx = desc->pg_mirror_idx;
+}
+EXPORT_SYMBOL_GPL(nfs_pgheader_init);
+
+/*
+ * nfs_set_pgio_error - record an I/O error at file position @pos
+ *
+ * Emits the nfs_pgio_error tracepoint, then, only if the error lies before
+ * the current end of good data, truncates hdr->good_bytes to the error
+ * position, clears NFS_IOHDR_EOF and stores @error unless NFS_IOHDR_ERROR
+ * was already set.
+ */
+void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
+{
+	unsigned int good = pos - hdr->io_start;
+
+	trace_nfs_pgio_error(hdr, error, pos);
+	if (hdr->good_bytes <= good)
+		return;
+
+	hdr->good_bytes = good;
+	clear_bit(NFS_IOHDR_EOF, &hdr->flags);
+	if (test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags))
+		return;
+	hdr->error = error;
+}
+
+/*
+ * Allocate a zeroed nfs_page from nfs_page_cachep and initialise its
+ * wb_list head. Returns NULL if the allocation fails.
+ */
+static inline struct nfs_page *nfs_page_alloc(void)
+{
+	struct nfs_page *req;
+
+	req = kmem_cache_zalloc(nfs_page_cachep, nfs_io_gfp_mask());
+	if (req == NULL)
+		return NULL;
+	INIT_LIST_HEAD(&req->wb_list);
+	return req;
+}
+
+/* Hand an nfs_page back to nfs_page_cachep. */
+static inline void nfs_page_free(struct nfs_page *p)
+{
+	kmem_cache_free(nfs_page_cachep, p);
+}
+
+/**
+ * nfs_iocounter_wait - wait for i/o to complete
+ * @l_ctx: nfs_lock_context with io_counter to use
+ *
+ * Sleeps killably until l_ctx->io_count reads as zero.
+ *
+ * returns -ERESTARTSYS if interrupted by a fatal signal.
+ * Otherwise returns 0 once the io_count hits 0.
+ */
+int
+nfs_iocounter_wait(struct nfs_lock_context *l_ctx)
+{
+	atomic_t *counter = &l_ctx->io_count;
+
+	return wait_var_event_killable(counter, !atomic_read(counter));
+}
+
+/**
+ * nfs_async_iocounter_wait - wait on a rpc_waitqueue for I/O
+ * to complete
+ * @task: the rpc_task that should wait
+ * @l_ctx: nfs_lock_context with io_counter to check
+ *
+ * Returns true if there is outstanding I/O to wait on and the
+ * task has been put to sleep.
+ */
+bool
+nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx)
+{
+	struct inode *inode = d_inode(l_ctx->open_context->dentry);
+	struct rpc_wait_queue *waitq = &NFS_SERVER(inode)->uoc_rpcwaitq;
+	bool sleeping = false;
+
+	if (atomic_read(&l_ctx->io_count) > 0) {
+		rpc_sleep_on(waitq, task, NULL);
+		sleeping = true;
+	}
+
+	/* the counter is read a second time: if it is now zero, undo the sleep */
+	if (atomic_read(&l_ctx->io_count) == 0) {
+		rpc_wake_up_queued_task(waitq, task);
+		sleeping = false;
+	}
+
+	return sleeping;
+}
+EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait);
+
+/*
+ * nfs_page_group_lock_head - page lock the head of the page group
+ * @req: any member of the page group
+ *
+ * Keeps trying nfs_lock_request() on the head, waiting on it between
+ * attempts. Returns the locked head, or an ERR_PTR() carrying the error
+ * from nfs_wait_on_request(). If the head is not @req itself, an extra
+ * reference is taken on the head before it is returned.
+ */
+struct nfs_page *
+nfs_page_group_lock_head(struct nfs_page *req)
+{
+	struct nfs_page *head = req->wb_head;
+	int err;
+
+	for (;;) {
+		if (nfs_lock_request(head))
+			break;
+		err = nfs_wait_on_request(head);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+	if (head != req)
+		kref_get(&head->wb_kref);
+	return head;
+}
+
+/*
+ * nfs_unroll_locks - unlock all newly locked reqs
+ * @head: head request of page group, must be holding head lock
+ * @req: request that couldn't lock; the walk stops when it is reached
+ *
+ * This is a helper function for nfs_lock_and_join_requests. It walks the
+ * group from the member after @head up to (not including) @req, and
+ * unlocks and releases every member whose refcount is non-zero.
+ * It does not return a value.
+ */
+static void
+nfs_unroll_locks(struct nfs_page *head, struct nfs_page *req)
+{
+	struct nfs_page *cur = head->wb_this_page;
+
+	/* relinquish all the locks successfully grabbed this run */
+	while (cur != req) {
+		if (kref_read(&cur->wb_kref) != 0)
+			nfs_unlock_and_release_request(cur);
+		/* the link is read only after the release, as before */
+		cur = cur->wb_this_page;
+	}
+}
+
+/*
+ * nfs_page_group_lock_subreq - try to lock a subrequest
+ * @head: head request of page group
+ * @subreq: request to lock
+ *
+ * This is a helper function for nfs_lock_and_join_requests which
+ * must be called with the head request and page group both locked.
+ * On error, it returns with the page group unlocked.
+ *
+ * Returns 0 when @subreq is locked (a reference on it is then held), and
+ * also 0 when @subreq's refcount was already zero, in which case it is
+ * left alone. Returns a negative value if waiting on @subreq or re-taking
+ * the page group lock failed; by then the previously locked members have
+ * been unrolled and the reference on @subreq dropped.
+ */
+static int
+nfs_page_group_lock_subreq(struct nfs_page *head, struct nfs_page *subreq)
+{
+ int ret;
+
+ if (!kref_get_unless_zero(&subreq->wb_kref)) /* refcount already zero: skip this member */
+ return 0;
+ while (!nfs_lock_request(subreq)) {
+ nfs_page_group_unlock(head); /* group lock is dropped across the wait */
+ ret = nfs_wait_on_request(subreq);
+ if (!ret)
+ ret = nfs_page_group_lock(head); /* re-take the group lock before retrying */
+ if (ret < 0) {
+ nfs_unroll_locks(head, subreq);
+ nfs_release_request(subreq); /* drop the reference taken above */
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/*
+ * nfs_page_group_lock_subrequests - try to lock the subrequests
+ * @head: head request of page group
+ *
+ * This is a helper function for nfs_lock_and_join_requests which
+ * must be called with the head request locked.
+ *
+ * Takes the page group lock, locks each member other than @head in list
+ * order, then drops the group lock and returns 0. A negative value from
+ * nfs_page_group_lock() or nfs_page_group_lock_subreq() is returned
+ * immediately, without unlocking the group here.
+ */
+int nfs_page_group_lock_subrequests(struct nfs_page *head)
+{
+	struct nfs_page *cur;
+	int err = nfs_page_group_lock(head);
+
+	if (err < 0)
+		return err;
+
+	/* lock each request in the page group */
+	cur = head->wb_this_page;
+	while (cur != head) {
+		err = nfs_page_group_lock_subreq(head, cur);
+		if (err < 0)
+			return err;
+		cur = cur->wb_this_page;
+	}
+
+	nfs_page_group_unlock(head);
+	return 0;
+}
+
+/*
+ * nfs_page_set_headlock - set the request PG_HEADLOCK
+ * @req: request that is to be locked
+ *
+ * this lock must be held when modifying req->wb_head
+ *
+ * If PG_HEADLOCK was clear it is taken immediately. Otherwise
+ * PG_CONTENDED1 is set on the request (nfs_page_clear_headlock() only
+ * issues a wake-up when it sees that flag) and the caller sleeps in
+ * TASK_UNINTERRUPTIBLE state until it acquires the bit.
+ *
+ * return 0 on success, < 0 on error
+ */
+int
+nfs_page_set_headlock(struct nfs_page *req)
+{
+ if (!test_and_set_bit(PG_HEADLOCK, &req->wb_flags)) /* uncontended: lock taken */
+ return 0;
+
+ set_bit(PG_CONTENDED1, &req->wb_flags); /* tell the unlocker a waiter exists */
+ smp_mb__after_atomic(); /* barrier after the set_bit(), before going to wait on the lock bit */
+ return wait_on_bit_lock(&req->wb_flags, PG_HEADLOCK,
+ TASK_UNINTERRUPTIBLE);
+}
+
+/*
+ * nfs_page_clear_headlock - clear the request PG_HEADLOCK
+ * @req: request that is to be unlocked
+ *
+ * Clears PG_HEADLOCK between memory barriers, then wakes any waiter on
+ * that bit, but only if PG_CONTENDED1 has been set on the request.
+ */
+void
+nfs_page_clear_headlock(struct nfs_page *req)
+{
+ smp_mb__before_atomic();
+ clear_bit(PG_HEADLOCK, &req->wb_flags);
+ smp_mb__after_atomic();
+ if (!test_bit(PG_CONTENDED1, &req->wb_flags)) /* nobody ever had to wait: skip the wake-up */
+ return;
+ wake_up_bit(&req->wb_flags, PG_HEADLOCK);
+}
+
+/*
+ * nfs_page_group_lock - lock the head of the page group
+ * @req: request in group that is to be locked
+ *
+ * this lock must be held when traversing or modifying the page
+ * group list
+ *
+ * PG_HEADLOCK is taken on @req first and then, if @req is not the head,
+ * on the head as well.
+ *
+ * return 0 on success, < 0 on error
+ */
+int
+nfs_page_group_lock(struct nfs_page *req)
+{
+	int err = nfs_page_set_headlock(req);
+
+	if (err != 0)
+		return err;
+	if (req->wb_head == req)
+		return 0;
+	return nfs_page_set_headlock(req->wb_head);
+}
+
+/*
+ * nfs_page_group_unlock - unlock the head of the page group
+ * @req: request in group that is to be unlocked
+ *
+ * Releases PG_HEADLOCK on the head first (when @req is not the head),
+ * then on @req itself.
+ */
+void
+nfs_page_group_unlock(struct nfs_page *req)
+{
+	struct nfs_page *head = req->wb_head;
+
+	if (head != req)
+		nfs_page_clear_headlock(head);
+	nfs_page_clear_headlock(req);
+}
+
+/*
+ * nfs_page_group_sync_on_bit_locked
+ *
+ * must be called with page group lock held
+ *
+ * Sets @bit on @req, then checks every other member of the group. If one
+ * of them does not have @bit set, returns false and leaves the bits as
+ * they are. If all have it, clears @bit on every member and returns true.
+ */
+static bool
+nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
+{
+	struct nfs_page *head = req->wb_head;
+	struct nfs_page *pos;
+
+	WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
+	WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
+
+	for (pos = req->wb_this_page; pos != req; pos = pos->wb_this_page) {
+		if (!test_bit(bit, &pos->wb_flags))
+			return false;
+	}
+
+	/* true! reset all bits, starting with @req and going round the ring */
+	clear_bit(bit, &req->wb_flags);
+	for (pos = req->wb_this_page; pos != req; pos = pos->wb_this_page)
+		clear_bit(bit, &pos->wb_flags);
+
+	return true;
+}
+
+/*
+ * nfs_page_group_sync_on_bit - set bit on current request, but only
+ *   return true if the bit is set for all requests in page group
+ * @req - request in page group
+ * @bit - PG_* bit that is used to sync page group
+ *
+ * Wrapper that takes the page group lock around
+ * nfs_page_group_sync_on_bit_locked().
+ */
+bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
+{
+	bool all_set;
+
+	nfs_page_group_lock(req);
+	all_set = nfs_page_group_sync_on_bit_locked(req, bit);
+	nfs_page_group_unlock(req);
+
+	return all_set;
+}
+
+/*
+ * nfs_page_group_init - Initialize the page group linkage for @req
+ * @req - a new nfs request
+ * @prev - the previous request in page group, or NULL if @req is the first
+ *         or only request in the group (the head).
+ */
+static inline void
+nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
+{
+	struct nfs_page *head;
+
+	WARN_ON_ONCE(prev == req);
+
+	if (prev == NULL) {
+		/* a head request: it is its own head and its own successor */
+		req->wb_head = req;
+		req->wb_this_page = req;
+		return;
+	}
+
+	/* a subrequest: @prev is expected to be the last member */
+	WARN_ON_ONCE(prev->wb_this_page != prev->wb_head);
+	WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags));
+	head = prev->wb_head;
+	req->wb_head = head;
+	req->wb_this_page = prev->wb_this_page;
+	prev->wb_this_page = req;
+
+	/* All subrequests take a ref on the head request until
+	 * nfs_page_group_destroy is called */
+	kref_get(&head->wb_kref);
+
+	/* grab extra ref and bump the request count if head request
+	 * has extra ref from the write/commit path to handle handoff
+	 * between write and commit lists. */
+	if (test_bit(PG_INODE_REF, &head->wb_flags)) {
+		struct inode *inode = page_file_mapping(req->wb_page)->host;
+
+		set_bit(PG_INODE_REF, &req->wb_flags);
+		kref_get(&req->wb_kref);
+		atomic_long_inc(&NFS_I(inode)->nrequests);
+	}
+}
+
+/*
+ * nfs_page_group_destroy - sync the destruction of page groups
+ * @kref - the wb_kref embedded in the request that no longer needs the
+ *         page group; this is the release callback given to kref_put()
+ *
+ * releases the page group reference from each member once all
+ * members have called this function.
+ *
+ * PG_TEARDOWN is used as the sync bit: only the call that finds it set on
+ * every member unlinks and frees all members. In every case a subrequest
+ * then drops the reference it holds on the head.
+ */
+static void
+nfs_page_group_destroy(struct kref *kref)
+{
+ struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
+ struct nfs_page *head = req->wb_head; /* saved now: req may be freed in the loop below */
+ struct nfs_page *tmp, *next;
+
+ if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN)) /* some member has not got here yet */
+ goto out;
+
+ tmp = req;
+ do {
+ next = tmp->wb_this_page; /* fetch the link before tmp is unlinked and freed */
+ /* unlink and free */
+ tmp->wb_this_page = tmp;
+ tmp->wb_head = tmp;
+ nfs_free_request(tmp);
+ tmp = next;
+ } while (tmp != req);
+out:
+ /* subrequests must release the ref on the head request */
+ if (head != req)
+ nfs_release_request(head);
+}
+
+/*
+ * Allocate and fill in a bare nfs_page; page group linkage is left to the
+ * caller.
+ *
+ * Returns ERR_PTR(-EBADF) if the open context is flagged NFS_CONTEXT_BAD,
+ * ERR_PTR(-ENOMEM) if allocation fails, otherwise the new request. The
+ * request takes a reference on @l_ctx, bumps its io_count and, when @page
+ * is non-NULL, takes a reference on the page.
+ */
+static struct nfs_page *
+__nfs_create_request(struct nfs_lock_context *l_ctx, struct page *page,
+		   unsigned int pgbase, unsigned int offset,
+		   unsigned int count)
+{
+	struct nfs_open_context *ctx = l_ctx->open_context;
+	struct nfs_page *req;
+
+	if (test_bit(NFS_CONTEXT_BAD, &ctx->flags))
+		return ERR_PTR(-EBADF);
+
+	/* try to allocate the request struct */
+	req = nfs_page_alloc();
+	if (!req)
+		return ERR_PTR(-ENOMEM);
+
+	req->wb_lock_context = l_ctx;
+	refcount_inc(&l_ctx->count);
+	atomic_inc(&l_ctx->io_count);
+
+	/* Initialize the request struct. Initially, we assume a
+	 * long write-back delay. This will be adjusted in
+	 * update_nfs_request below if the region is not locked. */
+	req->wb_page = page;
+	if (page != NULL) {
+		req->wb_index = page_index(page);
+		get_page(page);
+	}
+	req->wb_pgbase = pgbase;
+	req->wb_offset = offset;
+	req->wb_bytes = count;
+	req->wb_nio = 0;
+	kref_init(&req->wb_kref);
+	return req;
+}
+
+/**
+ * nfs_create_request - Create an NFS read/write request.
+ * @ctx: open context to use
+ * @page: page to write
+ * @offset: starting offset within the page for the write
+ * @count: number of bytes to read/write
+ *
+ * The page must be locked by the caller. This makes sure we never
+ * create two different requests for the same page.
+ * User should ensure it is safe to sleep in this function.
+ *
+ * Returns the new request, set up as the head of its own page group, or
+ * an ERR_PTR() from nfs_get_lock_context() or __nfs_create_request().
+ */
+struct nfs_page *
+nfs_create_request(struct nfs_open_context *ctx, struct page *page,
+		   unsigned int offset, unsigned int count)
+{
+	struct nfs_lock_context *l_ctx;
+	struct nfs_page *req;
+
+	l_ctx = nfs_get_lock_context(ctx);
+	if (IS_ERR(l_ctx))
+		return ERR_CAST(l_ctx);
+
+	/* @offset is passed as both the page base and the offset */
+	req = __nfs_create_request(l_ctx, page, offset, offset, count);
+	if (!IS_ERR(req))
+		nfs_page_group_init(req, NULL);
+
+	/* drop the reference obtained by nfs_get_lock_context() above */
+	nfs_put_lock_context(l_ctx);
+	return req;
+}
+
+/*
+ * Create a new request on the same page and lock context as @req, lock it,
+ * and append it to @req's page group after the current last member.
+ * Returns the new subrequest or the ERR_PTR() from __nfs_create_request().
+ */
+static struct nfs_page *
+nfs_create_subreq(struct nfs_page *req,
+		  unsigned int pgbase,
+		  unsigned int offset,
+		  unsigned int count)
+{
+	struct nfs_page *tail;
+	struct nfs_page *sub;
+
+	sub = __nfs_create_request(req->wb_lock_context, req->wb_page,
+				   pgbase, offset, count);
+	if (IS_ERR(sub))
+		return sub;
+
+	/* find the last request: the one whose link points back at the head */
+	tail = req->wb_head;
+	while (tail->wb_this_page != req->wb_head)
+		tail = tail->wb_this_page;
+
+	nfs_lock_request(sub);
+	sub->wb_index = req->wb_index;
+	nfs_page_group_init(sub, tail);
+	sub->wb_nio = req->wb_nio;
+	return sub;
+}
+
+/**
+ * nfs_unlock_request - Unlock request and wake up sleepers.
+ * @req: pointer to request
+ *
+ * Calls BUG() if the request is not currently busy. Otherwise clears
+ * PG_BUSY between memory barriers and wakes waiters on that bit, but only
+ * if PG_CONTENDED2 has been set on the request.
+ */
+void nfs_unlock_request(struct nfs_page *req)
+{
+ if (!NFS_WBACK_BUSY(req)) {
+ printk(KERN_ERR "NFS: Invalid unlock attempted\n");
+ BUG();
+ }
+ smp_mb__before_atomic();
+ clear_bit(PG_BUSY, &req->wb_flags);
+ smp_mb__after_atomic();
+ if (!test_bit(PG_CONTENDED2, &req->wb_flags)) /* set by nfs_wait_on_request() before it sleeps */
+ return;
+ wake_up_bit(&req->wb_flags, PG_BUSY);
+}
+
+/**
+ * nfs_unlock_and_release_request - Unlock request and release the nfs_page
+ * @req: pointer to request
+ *
+ * Calls nfs_unlock_request() followed by nfs_release_request(), in that
+ * order.
+ */
+void nfs_unlock_and_release_request(struct nfs_page *req)
+{
+ nfs_unlock_request(req);
+ nfs_release_request(req);
+}
+
+/*
+ * nfs_clear_request - Free up all resources allocated to the request
+ * @req: request to strip
+ *
+ * Release page and open context resources associated with a read/write
+ * request after it has completed. When this drops the lock context's
+ * io_count to zero, waiters on that counter are woken, and if the open
+ * context has NFS_CONTEXT_UNLOCK set the server's uoc_rpcwaitq is woken
+ * too.
+ */
+static void nfs_clear_request(struct nfs_page *req)
+{
+	struct nfs_lock_context *l_ctx = req->wb_lock_context;
+	struct page *page = req->wb_page;
+
+	if (page) {
+		put_page(page);
+		req->wb_page = NULL;
+	}
+	if (!l_ctx)
+		return;
+
+	if (atomic_dec_and_test(&l_ctx->io_count)) {
+		struct nfs_open_context *ctx;
+
+		wake_up_var(&l_ctx->io_count);
+		ctx = l_ctx->open_context;
+		if (test_bit(NFS_CONTEXT_UNLOCK, &ctx->flags))
+			rpc_wake_up(&NFS_SERVER(d_inode(ctx->dentry))->uoc_rpcwaitq);
+	}
+	nfs_put_lock_context(l_ctx);
+	req->wb_lock_context = NULL;
+}
+
+/**
+ * nfs_free_request - Free an NFS read/write request
+ * @req: request to free
+ *
+ * Warns (once per check) if @req is still linked to another group member
+ * or still has one of the page group sync bits set, then releases the
+ * request's page and lock context and returns the nfs_page to its cache.
+ *
+ * Note: this header originally named nfs_release_request ("Release the
+ * count on an NFS read/write request") and carried the remark below.
+ * Note: Should never be called with the spinlock held!
+ */
+void nfs_free_request(struct nfs_page *req)
+{
+ WARN_ON_ONCE(req->wb_this_page != req);
+
+ /* extra debug: make sure no sync bits are still set */
+ WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags));
+
+ /* Release struct file and open context */
+ nfs_clear_request(req);
+ nfs_page_free(req);
+}
+/*
+ * nfs_release_request - drop one reference on @req.
+ * When the reference count reaches zero, nfs_page_group_destroy() runs as
+ * the kref release callback.
+ */
+void nfs_release_request(struct nfs_page *req)
+{
+ kref_put(&req->wb_kref, nfs_page_group_destroy);
+}
+EXPORT_SYMBOL_GPL(nfs_release_request);
+
+/**
+ * nfs_wait_on_request - Wait for a request to complete.
+ * @req: request to wait upon.
+ *
+ * Interruptible by fatal signals only.
+ * NOTE(review): the wait below is issued with TASK_UNINTERRUPTIBLE, which
+ * does not match the sentence above — confirm which is intended.
+ * The user is responsible for holding a count on the request.
+ *
+ * Returns 0 at once if PG_BUSY is not set; otherwise marks the request
+ * with PG_CONTENDED2 and returns the result of wait_on_bit_io().
+ */
+int
+nfs_wait_on_request(struct nfs_page *req)
+{
+ if (!test_bit(PG_BUSY, &req->wb_flags))
+ return 0;
+ set_bit(PG_CONTENDED2, &req->wb_flags); /* nfs_unlock_request() only wakes waiters when this is set */
+ smp_mb__after_atomic();
+ return wait_on_bit_io(&req->wb_flags, PG_BUSY,
+ TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL_GPL(nfs_wait_on_request);
+
+/*
+ * nfs_generic_pg_test - determine if requests can be coalesced
+ * @desc: pointer to descriptor
+ * @prev: previous request in desc, or NULL
+ * @req: this request
+ *
+ * Returns zero if @req cannot be coalesced into @desc, otherwise it returns
+ * the number of bytes of @req that fit: the smaller of req->wb_bytes and
+ * the space left in the current mirror (pg_bsize - pg_count).
+ */
+size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
+			   struct nfs_page *prev, struct nfs_page *req)
+{
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+	size_t room;
+
+	if (mirror->pg_count > mirror->pg_bsize) {
+		/* should never happen */
+		WARN_ON_ONCE(1);
+		return 0;
+	}
+
+	/*
+	 * Limit the request size so that we can still allocate a page array
+	 * for it without upsetting the slab allocator.
+	 */
+	if (((mirror->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
+			sizeof(struct page *) > PAGE_SIZE)
+		return 0;
+
+	room = mirror->pg_bsize - mirror->pg_count;
+	return min(room, (size_t)req->wb_bytes);
+}
+EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
+
+/*
+ * Allocate a pgio header through @ops->rw_alloc_header(). On success the
+ * header's page list is initialised and @ops is remembered in hdr->rw_ops.
+ * Returns NULL if the allocation hook returned NULL.
+ */
+struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops)
+{
+	struct nfs_pgio_header *hdr = ops->rw_alloc_header();
+
+	if (!hdr)
+		return NULL;
+	INIT_LIST_HEAD(&hdr->pages);
+	hdr->rw_ops = ops;
+	return hdr;
+}
+EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc);
+
+/**
+ * nfs_pgio_data_destroy - make @hdr suitable for reuse
+ *
+ * Frees memory and releases refs from nfs_generic_pgio, so that it may
+ * be called again: the open context reference held in hdr->args.context
+ * is dropped, and the page vector is freed unless it is the array built
+ * into the header.
+ *
+ * @hdr: A header that has had nfs_generic_pgio called
+ */
+static void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr)
+{
+	struct nfs_page_array *pg_array = &hdr->page_array;
+
+	if (hdr->args.context)
+		put_nfs_open_context(hdr->args.context);
+	if (pg_array->pagevec != pg_array->page_array)
+		kfree(pg_array->pagevec);
+}
+
+/*
+ * nfs_pgio_header_free - Free a read or write header
+ * @hdr: The header to free
+ *
+ * Releases what nfs_pgio_data_destroy() releases, then hands the header to
+ * the rw_free_header() hook of the ops that allocated it.
+ */
+void nfs_pgio_header_free(struct nfs_pgio_header *hdr)
+{
+	const struct nfs_rw_ops *ops = hdr->rw_ops;
+
+	nfs_pgio_data_destroy(hdr);
+	ops->rw_free_header(hdr);
+}
+EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
+
+/**
+ * nfs_pgio_rpcsetup - Set up arguments for a pageio call
+ * @hdr: The pageio hdr
+ * @count: Number of bytes to read
+ * @how: How to commit data (writes only)
+ * @cinfo: Commit information for the call (writes only)
+ *
+ * Fills hdr->args from the header's first request (hdr->req) and resets
+ * hdr->res. A reference is taken on the request's open context and stored
+ * in hdr->args.context. The stable mode defaults to NFS_UNSTABLE and
+ * becomes NFS_FILE_SYNC when FLUSH_STABLE is set in @how, or when only
+ * FLUSH_COND_STABLE is set and nfs_reqs_to_commit(@cinfo) is zero.
+ */
+static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
+ unsigned int count,
+ int how, struct nfs_commit_info *cinfo)
+{
+ struct nfs_page *req = hdr->req;
+
+ /* Set up the RPC argument and reply structs
+ * NB: take care not to mess about with hdr->commit et al. */
+
+ hdr->args.fh = NFS_FH(hdr->inode);
+ hdr->args.offset = req_offset(req);
+ /* pnfs_set_layoutcommit needs this */
+ hdr->mds_offset = hdr->args.offset;
+ hdr->args.pgbase = req->wb_pgbase;
+ hdr->args.pages = hdr->page_array.pagevec;
+ hdr->args.count = count;
+ hdr->args.context = get_nfs_open_context(nfs_req_openctx(req));
+ hdr->args.lock_context = req->wb_lock_context;
+ hdr->args.stable = NFS_UNSTABLE;
+ switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
+ case 0: /* neither flag: stay unstable */
+ break;
+ case FLUSH_COND_STABLE: /* only the conditional flag is set */
+ if (nfs_reqs_to_commit(cinfo)) /* non-zero: stay unstable */
+ break;
+ fallthrough;
+ default: /* FLUSH_STABLE set, or conditional with a zero nfs_reqs_to_commit() */
+ hdr->args.stable = NFS_FILE_SYNC;
+ }
+
+ hdr->res.fattr = &hdr->fattr;
+ hdr->res.count = 0;
+ hdr->res.eof = 0;
+ hdr->res.verf = &hdr->verf;
+ nfs_fattr_init(&hdr->fattr);
+}
+
+/**
+ * nfs_pgio_prepare - Prepare pageio hdr to go over the wire
+ * @task: The current task
+ * @calldata: pageio header to prepare
+ *
+ * Delegates to the protocol's pgio_rpc_prepare(); a non-zero result
+ * terminates the task with that status via rpc_exit().
+ */
+static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs_pgio_header *hdr = calldata;
+	int status = NFS_PROTO(hdr->inode)->pgio_rpc_prepare(task, hdr);
+
+	if (status != 0)
+		rpc_exit(task, status);
+}
+/*
+ * nfs_initiate_pgio - build and start the RPC task for a pgio header.
+ *
+ * The RPC message points at hdr->args / hdr->res and uses @cred. The task
+ * is embedded in @hdr (hdr->task), is always asynchronous (RPC_TASK_ASYNC
+ * is or'ed into @flags) and is given nfsiod_workqueue. Before the task is
+ * run, the header's rw_initiate() hook is called with the message, the
+ * setup data, @rpc_ops and @how.
+ *
+ * Returns 0 once the task has been started, or the error encoded in the
+ * pointer returned by rpc_run_task().
+ */
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
+ const struct cred *cred, const struct nfs_rpc_ops *rpc_ops,
+ const struct rpc_call_ops *call_ops, int how, int flags)
+{
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_argp = &hdr->args,
+ .rpc_resp = &hdr->res,
+ .rpc_cred = cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clnt,
+ .task = &hdr->task,
+ .rpc_message = &msg,
+ .callback_ops = call_ops,
+ .callback_data = hdr,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC | flags,
+ };
+
+ hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
+
+ dprintk("NFS: initiated pgio call "
+ "(req %s/%llu, %u bytes @ offset %llu)\n",
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
+
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ rpc_put_task(task); /* drop the reference returned by rpc_run_task(); the caller does not keep the task */
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
+
+/**
+ * nfs_pgio_error - Clean up from a pageio error
+ * @hdr: pageio header
+ *
+ * Flags the header with NFS_IOHDR_REDO and then runs its completion
+ * callback.
+ */
+static void nfs_pgio_error(struct nfs_pgio_header *hdr)
+{
+ set_bit(NFS_IOHDR_REDO, &hdr->flags);
+ hdr->completion_ops->completion(hdr);
+}
+
+/**
+ * nfs_pgio_release - Release pageio data
+ * @calldata: The pageio header to release
+ *
+ * Runs the header's completion callback.
+ */
+static void nfs_pgio_release(void *calldata)
+{
+ struct nfs_pgio_header *hdr = calldata;
+ hdr->completion_ops->completion(hdr);
+}
+
+/*
+ * Reset one mirror to its empty state: no queued requests, all counters
+ * and flags zero, and a block size of @bsize.
+ */
+static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror,
+				   unsigned int bsize)
+{
+	INIT_LIST_HEAD(&mirror->pg_list);
+	mirror->pg_bsize = bsize;
+	mirror->pg_base = 0;
+	mirror->pg_count = 0;
+	mirror->pg_bytes_written = 0;
+	mirror->pg_recoalesce = 0;
+}
+
+/**
+ * nfs_pageio_init - initialise a page io descriptor
+ * @desc: pointer to descriptor
+ * @inode: pointer to inode
+ * @pg_ops: pointer to pageio operations
+ * @compl_ops: pointer to pageio completion operations
+ * @rw_ops: pointer to nfs read/write operations
+ * @bsize: io block size
+ * @io_flags: extra parameters for the io function
+ *
+ * The descriptor starts out with exactly one mirror, taken from
+ * pg_mirrors_static, and with no error, layout segment, io completion or
+ * direct request attached.
+ */
+void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
+		     struct inode *inode,
+		     const struct nfs_pageio_ops *pg_ops,
+		     const struct nfs_pgio_completion_ops *compl_ops,
+		     const struct nfs_rw_ops *rw_ops,
+		     size_t bsize,
+		     int io_flags)
+{
+	/* values supplied by the caller */
+	desc->pg_inode = inode;
+	desc->pg_ops = pg_ops;
+	desc->pg_completion_ops = compl_ops;
+	desc->pg_rw_ops = rw_ops;
+	desc->pg_ioflags = io_flags;
+	desc->pg_bsize = bsize;
+
+	/* everything else starts cleared */
+	desc->pg_moreio = 0;
+	desc->pg_error = 0;
+	desc->pg_maxretrans = 0;
+	desc->pg_lseg = NULL;
+	desc->pg_io_completion = NULL;
+	desc->pg_dreq = NULL;
+
+	/* a single mirror, using pg_mirrors_static */
+	desc->pg_mirrors_dynamic = NULL;
+	desc->pg_mirrors = desc->pg_mirrors_static;
+	desc->pg_mirror_count = 1;
+	desc->pg_mirror_idx = 0;
+	nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize);
+}
+
+/**
+ * nfs_pgio_result - Basic pageio error handling
+ * @task: The task that ran
+ * @calldata: Pageio header to check
+ *
+ * Gives the rw_done() hook first refusal; if it returns non-zero nothing
+ * more is done here. Otherwise a negative task status is recorded with
+ * nfs_set_pgio_error() at the call's starting offset, and any other
+ * status is passed on to the rw_result() hook.
+ */
+static void nfs_pgio_result(struct rpc_task *task, void *calldata)
+{
+	struct nfs_pgio_header *hdr = calldata;
+	struct inode *inode = hdr->inode;
+
+	dprintk("NFS: %s: %5u, (status %d)\n", __func__,
+		task->tk_pid, task->tk_status);
+
+	if (hdr->rw_ops->rw_done(task, hdr, inode) != 0)
+		return;
+
+	if (task->tk_status >= 0) {
+		hdr->rw_ops->rw_result(task, hdr);
+		return;
+	}
+	nfs_set_pgio_error(hdr, task->tk_status, hdr->args.offset);
+}
+
+/*
+ * Create an RPC task for the given read or write request and kick it.
+ * The page must have been locked by the caller.
+ *
+ * It may happen that the page we're passed is not marked dirty.
+ * This is the case if nfs_updatepage detects a conflicting request
+ * that has been written but not committed.
+ *
+ * What this function itself does: it moves every request queued on the
+ * current mirror's pg_list onto hdr->pages, fills hdr->page_array with
+ * the distinct pages backing those requests, and sets up the RPC
+ * arguments. It does not start the task; in this file
+ * nfs_generic_pg_pgios() calls nfs_initiate_pgio() after this returns 0.
+ *
+ * Returns 0 on success. On failure the header is completed through
+ * nfs_pgio_error(), desc->pg_error is set to -ENOMEM (page vector
+ * allocation failed) or -EINVAL (page count mismatch), and that value is
+ * returned.
+ */
+int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr)
+{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+ struct nfs_page *req;
+ struct page **pages,
+ *last_page;
+ struct list_head *head = &mirror->pg_list;
+ struct nfs_commit_info cinfo;
+ struct nfs_page_array *pg_array = &hdr->page_array;
+ unsigned int pagecount, pageused;
+ gfp_t gfp_flags = nfs_io_gfp_mask();
+
+ pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
+ pg_array->npages = pagecount;
+
+ if (pagecount <= ARRAY_SIZE(pg_array->page_array)) /* fits in the array built into the header */
+ pg_array->pagevec = pg_array->page_array;
+ else {
+ pg_array->pagevec = kcalloc(pagecount, sizeof(struct page *), gfp_flags);
+ if (!pg_array->pagevec) {
+ pg_array->npages = 0;
+ nfs_pgio_error(hdr);
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
+ }
+ }
+
+ nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
+ pages = hdr->page_array.pagevec;
+ last_page = NULL;
+ pageused = 0;
+ while (!list_empty(head)) {
+ req = nfs_list_entry(head->next);
+ nfs_list_move_request(req, &hdr->pages);
+
+ if (!last_page || last_page != req->wb_page) { /* consecutive requests on the same page share one slot */
+ pageused++;
+ if (pageused > pagecount) /* more pages than were sized for: stop before writing past pagevec */
+ break;
+ *pages++ = last_page = req->wb_page;
+ }
+ }
+ if (WARN_ON_ONCE(pageused != pagecount)) {
+ nfs_pgio_error(hdr);
+ desc->pg_error = -EINVAL;
+ return desc->pg_error;
+ }
+
+ if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
+ (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) /* more I/O was flagged or commits are pending: drop the conditional-stable flag */
+ desc->pg_ioflags &= ~FLUSH_COND_STABLE;
+
+ /* Set up the argument struct */
+ nfs_pgio_rpcsetup(hdr, mirror->pg_count, desc->pg_ioflags, &cinfo);
+ desc->pg_rpc_callops = &nfs_pgio_common_ops;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_generic_pgio);
+
+/*
+ * Allocate a header for the descriptor's current mirror, populate it with
+ * nfs_generic_pgio() and, if that succeeded, start the RPC.
+ *
+ * Returns 0 on success. Returns -ENOMEM (also stored in desc->pg_error)
+ * if no header could be allocated, otherwise whatever nfs_generic_pgio()
+ * or nfs_initiate_pgio() returned.
+ */
+static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
+{
+	struct nfs_pgio_header *hdr;
+	int err;
+
+	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+	if (hdr == NULL) {
+		desc->pg_error = -ENOMEM;
+		return desc->pg_error;
+	}
+
+	nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
+	err = nfs_generic_pgio(desc, hdr);
+	if (err != 0)
+		return err;
+
+	return nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
+				 hdr,
+				 hdr->cred,
+				 NFS_PROTO(hdr->inode),
+				 desc->pg_rpc_callops,
+				 desc->pg_ioflags,
+				 RPC_TASK_CRED_NOREF);
+}
+
+/*
+ * Provide storage for @mirror_count mirrors. Any previously allocated
+ * dynamic array is freed first. A count of one uses pg_mirrors_static;
+ * larger counts get a freshly allocated array, initialised with the
+ * descriptor's block size and remembered in pg_mirrors_dynamic.
+ * Returns the array to use, or NULL if the allocation failed.
+ */
+static struct nfs_pgio_mirror *
+nfs_pageio_alloc_mirrors(struct nfs_pageio_descriptor *desc,
+			 unsigned int mirror_count)
+{
+	struct nfs_pgio_mirror *mirrors;
+	unsigned int i;
+
+	kfree(desc->pg_mirrors_dynamic);
+	desc->pg_mirrors_dynamic = NULL;
+	if (mirror_count == 1)
+		return desc->pg_mirrors_static;
+
+	mirrors = kmalloc_array(mirror_count, sizeof(*mirrors), nfs_io_gfp_mask());
+	if (!mirrors)
+		return NULL;
+
+	for (i = 0; i < mirror_count; i++)
+		nfs_pageio_mirror_init(&mirrors[i], desc->pg_bsize);
+	desc->pg_mirrors_dynamic = mirrors;
+	return mirrors;
+}
+
+/*
+ * nfs_pageio_setup_mirroring - determine if mirroring is to be used
+ *				by calling the pg_get_mirror_count op
+ *
+ * Nothing is changed when the descriptor already carries an error or the
+ * reported count equals the current one. A count of zero or above
+ * NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX sets pg_error to -EINVAL. If mirror
+ * storage cannot be allocated, pg_error becomes -ENOMEM and the descriptor
+ * falls back to a single mirror in pg_mirrors_static.
+ */
+static void nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
+				       struct nfs_page *req)
+{
+	unsigned int count = 1;
+
+	if (pgio->pg_ops->pg_get_mirror_count)
+		count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
+	if (pgio->pg_error < 0 || count == pgio->pg_mirror_count)
+		return;
+
+	if (count == 0 || count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) {
+		pgio->pg_error = -EINVAL;
+		return;
+	}
+
+	pgio->pg_mirrors = nfs_pageio_alloc_mirrors(pgio, count);
+	if (!pgio->pg_mirrors) {
+		pgio->pg_error = -ENOMEM;
+		pgio->pg_mirrors = pgio->pg_mirrors_static;
+		count = 1;
+	}
+	pgio->pg_mirror_count = count;
+}
+
+/*
+ * Return the descriptor to its single-mirror configuration: free any
+ * dynamically allocated mirror array and point pg_mirrors back at
+ * pg_mirrors_static with index 0 and a count of 1.
+ */
+static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
+{
+	kfree(pgio->pg_mirrors_dynamic);
+	pgio->pg_mirrors_dynamic = NULL;
+	pgio->pg_mirrors = pgio->pg_mirrors_static;
+	pgio->pg_mirror_idx = 0;
+	pgio->pg_mirror_count = 1;
+}
+
+/* Two lock contexts match when they have the same lockowner. */
+static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
+				   const struct nfs_lock_context *l2)
+{
+	return l2->lockowner == l1->lockowner;
+}
+
+/**
+ * nfs_coalesce_size - test two requests for compatibility
+ * @prev: pointer to nfs_page
+ * @req: pointer to nfs_page
+ * @pgio: pointer to nfs_pageio_descriptor
+ *
+ * The nfs_page structures 'prev' and 'req' are compared to ensure that the
+ * page data area they describe is contiguous, and that their RPC
+ * credentials, NFSv4 open state, and lockowners are the same.
+ *
+ * When @prev is NULL none of these checks apply and the decision is left
+ * entirely to the descriptor's pg_test() op.
+ *
+ * Returns size of the request that can be coalesced
+ */
+static unsigned int nfs_coalesce_size(struct nfs_page *prev,
+ struct nfs_page *req,
+ struct nfs_pageio_descriptor *pgio)
+{
+ struct file_lock_context *flctx;
+
+ if (prev) {
+ if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev)))
+ return 0;
+ flctx = d_inode(nfs_req_openctx(req)->dentry)->i_flctx;
+ if (flctx != NULL &&
+ !(list_empty_careful(&flctx->flc_posix) &&
+ list_empty_careful(&flctx->flc_flock)) &&
+ !nfs_match_lock_context(req->wb_lock_context,
+ prev->wb_lock_context)) /* lockowners only need to match when the inode's posix or flock list is non-empty */
+ return 0;
+ if (req_offset(req) != req_offset(prev) + prev->wb_bytes) /* must follow @prev directly in the file */
+ return 0;
+ if (req->wb_page == prev->wb_page) {
+ if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes) /* same page: must start where @prev ends */
+ return 0;
+ } else {
+ if (req->wb_pgbase != 0 ||
+ prev->wb_pgbase + prev->wb_bytes != PAGE_SIZE) /* different pages: @prev must end at its page end and @req start at offset 0 */
+ return 0;
+ }
+ }
+ return pgio->pg_ops->pg_test(pgio, prev, req);
+}
+
+/**
+ * nfs_pageio_do_add_request - Attempt to coalesce a request into a page list.
+ * @desc: destination io descriptor
+ * @req: request
+ *
+ * If the request 'req' was successfully coalesced into the existing list
+ * of pages 'desc', it returns the size of req.
+ *
+ * Otherwise @req is not queued and the value returned is smaller than
+ * req->wb_bytes: the size reported by nfs_coalesce_size(), or 0 when
+ * desc->pg_error has been set (pg_init() failure, or the retry limit
+ * pg_maxretrans was exceeded).
+ */
+static unsigned int
+nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *req)
+{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+ struct nfs_page *prev = NULL;
+ unsigned int size;
+
+ if (list_empty(&mirror->pg_list)) { /* first request of a new batch on this mirror */
+ if (desc->pg_ops->pg_init)
+ desc->pg_ops->pg_init(desc, req);
+ if (desc->pg_error < 0)
+ return 0;
+ mirror->pg_base = req->wb_pgbase;
+ mirror->pg_count = 0;
+ mirror->pg_recoalesce = 0;
+ } else
+ prev = nfs_list_entry(mirror->pg_list.prev); /* last request already queued */
+
+ if (desc->pg_maxretrans && req->wb_nio > desc->pg_maxretrans) { /* a limit of 0 disables this check */
+ if (NFS_SERVER(desc->pg_inode)->flags & NFS_MOUNT_SOFTERR)
+ desc->pg_error = -ETIMEDOUT;
+ else
+ desc->pg_error = -EIO;
+ return 0;
+ }
+
+ size = nfs_coalesce_size(prev, req, desc);
+ if (size < req->wb_bytes) /* does not fit whole: report how much would fit, queue nothing */
+ return size;
+ nfs_list_move_request(req, &mirror->pg_list);
+ mirror->pg_count += req->wb_bytes;
+ return req->wb_bytes;
+}
+
+/*
+ * Helper for nfs_pageio_add_request and nfs_pageio_complete
+ *
+ * Runs the descriptor's pg_doio() op if the current mirror has requests
+ * queued. A negative result is stored in desc->pg_error. If the list is
+ * empty afterwards, the mirror's pg_count is added to pg_bytes_written and
+ * its count, base and recoalesce flag are reset.
+ */
+static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
+{
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+	int error;
+
+	if (list_empty(&mirror->pg_list))
+		return;
+
+	error = desc->pg_ops->pg_doio(desc);
+	if (error < 0)
+		desc->pg_error = error;
+
+	if (!list_empty(&mirror->pg_list))
+		return;
+
+	mirror->pg_bytes_written += mirror->pg_count;
+	mirror->pg_count = 0;
+	mirror->pg_base = 0;
+	mirror->pg_recoalesce = 0;
+}
+
+/*
+ * Move @req onto a private list and pass that list, together with the
+ * descriptor's current pg_error, to the completion ops' error_cleanup().
+ */
+static void
+nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc,
+			   struct nfs_page *req)
+{
+	LIST_HEAD(failed);
+
+	nfs_list_move_request(req, &failed);
+	desc->pg_completion_ops->error_cleanup(&failed, desc->pg_error);
+}
+
+/**
+ * __nfs_pageio_add_request - Attempt to coalesce a request into a page list.
+ * @desc: destination io descriptor
+ * @req: request
+ *
+ * This may split a request into subrequests which are all part of the
+ * same page group. If so, it will submit @req as the last one, to ensure
+ * the pointer to @req is still valid in case of failure.
+ *
+ * Each time only part of @req fits, a subrequest covering that part is
+ * created and queued, and @req is shrunk to the remainder. When nothing
+ * fits, the queued I/O is issued and the attempt is retried.
+ *
+ * Returns true if the request 'req' was successfully coalesced into the
+ * existing list of pages 'desc'. Returns 0 if desc->pg_error was set, if
+ * the mirror asked for recoalescing after I/O was issued, or if a
+ * subrequest could not be created.
+ */
+static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *req)
+{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+ struct nfs_page *subreq;
+ unsigned int size, subreq_size;
+
+ nfs_page_group_lock(req);
+
+ subreq = req;
+ subreq_size = subreq->wb_bytes;
+ for(;;) {
+ size = nfs_pageio_do_add_request(desc, subreq);
+ if (size == subreq_size) {
+ /* We successfully submitted a request */
+ if (subreq == req) /* the remainder itself went in: done */
+ break;
+ req->wb_pgbase += size; /* a subrequest went in: shrink @req to what is left */
+ req->wb_bytes -= size;
+ req->wb_offset += size;
+ subreq_size = req->wb_bytes;
+ subreq = req;
+ continue;
+ }
+ if (WARN_ON_ONCE(subreq != req)) { /* a subrequest sized to fit was not accepted whole */
+ nfs_page_group_unlock(req);
+ nfs_pageio_cleanup_request(desc, subreq);
+ subreq = req;
+ subreq_size = req->wb_bytes;
+ nfs_page_group_lock(req);
+ }
+ if (!size) {
+ /* Can't coalesce any more, so do I/O */
+ nfs_page_group_unlock(req); /* the group lock is not held while I/O is issued */
+ desc->pg_moreio = 1;
+ nfs_pageio_doio(desc);
+ if (desc->pg_error < 0 || mirror->pg_recoalesce)
+ return 0;
+ /* retry add_request for this subreq */
+ nfs_page_group_lock(req);
+ continue;
+ }
+ subreq = nfs_create_subreq(req, req->wb_pgbase,
+ req->wb_offset, size); /* carve off the leading @size bytes that do fit */
+ if (IS_ERR(subreq))
+ goto err_ptr;
+ subreq_size = size;
+ }
+
+ nfs_page_group_unlock(req);
+ return 1;
+err_ptr:
+ desc->pg_error = PTR_ERR(subreq);
+ nfs_page_group_unlock(req);
+ return 0;
+}
+
+/*
+ * Take every request queued on the current mirror off its pg_list, reset the
+ * mirror's coalescing state, and feed the requests back through
+ * __nfs_pageio_add_request(). The pass is repeated for as long as the
+ * mirror's pg_recoalesce flag is set again afterwards.
+ *
+ * Returns 1 normally. Returns 0 if a request could not be re-added and
+ * desc->pg_error is negative; in that case the requests not yet re-added are
+ * spliced onto the tail of pg_list and pg_recoalesce is left set.
+ */
+static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
+{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+ LIST_HEAD(head);
+
+ do {
+ /* Move the queued requests to a private list and start afresh */
+ list_splice_init(&mirror->pg_list, &head);
+ mirror->pg_count = 0;
+ mirror->pg_base = 0;
+ mirror->pg_recoalesce = 0;
+
+ while (!list_empty(&head)) {
+ struct nfs_page *req;
+
+ req = list_first_entry(&head, struct nfs_page, wb_list);
+ if (__nfs_pageio_add_request(desc, req))
+ continue;
+ if (desc->pg_error < 0) {
+ /* Hand the leftovers back so the caller can clean them up */
+ list_splice_tail(&head, &mirror->pg_list);
+ mirror->pg_recoalesce = 1;
+ return 0;
+ }
+ break;
+ }
+ } while (mirror->pg_recoalesce);
+ return 1;
+}
+
+/*
+ * Add @req to the current mirror of @desc, recoalescing the mirror and
+ * retrying for as long as that keeps succeeding.
+ *
+ * Returns non-zero once the request has been added. Returns 0 if
+ * desc->pg_error is negative after a failed add, or if nfs_do_recoalesce()
+ * returns 0.
+ */
+static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc,
+					 struct nfs_page *req)
+{
+	int added;
+
+	for (;;) {
+		added = __nfs_pageio_add_request(desc, req);
+		if (added != 0 || desc->pg_error < 0)
+			return added;
+
+		/* Not added and no error: recoalesce, then try again */
+		added = nfs_do_recoalesce(desc);
+		if (added == 0)
+			return added;
+	}
+}
+
+/*
+ * If an error is recorded in desc->pg_error, pass every mirror's pending
+ * request list to the completion ops' error_cleanup callback.
+ */
+static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc)
+{
+	u32 i;
+
+	if (desc->pg_error == 0)
+		return;
+
+	for (i = 0; i < desc->pg_mirror_count; i++) {
+		struct nfs_pgio_mirror *m = nfs_pgio_get_mirror(desc, i);
+
+		desc->pg_completion_ops->error_cleanup(&m->pg_list,
+						       desc->pg_error);
+	}
+}
+
+/*
+ * nfs_pageio_add_request - add a request to every mirror of a descriptor
+ * @desc: destination io descriptor
+ * @req: request
+ *
+ * Each mirror with index 1 and above gets its own subrequest, created with
+ * nfs_create_subreq() from the pgbase/offset/bytes @req had on entry.
+ * @req itself is added to mirror 0 last.
+ *
+ * Returns 1 on success. Returns 0 on failure, after calling
+ * nfs_pageio_error_cleanup() on @desc.
+ */
+int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *req)
+{
+ u32 midx;
+ unsigned int pgbase, offset, bytes;
+ struct nfs_page *dupreq;
+
+ pgbase = req->wb_pgbase;
+ offset = req->wb_offset;
+ bytes = req->wb_bytes;
+
+ nfs_pageio_setup_mirroring(desc, req);
+ if (desc->pg_error < 0)
+ goto out_failed;
+
+ /* Create the mirror instances first, and fire them off */
+ for (midx = 1; midx < desc->pg_mirror_count; midx++) {
+ nfs_page_group_lock(req);
+
+ dupreq = nfs_create_subreq(req,
+ pgbase, offset, bytes);
+
+ nfs_page_group_unlock(req);
+ if (IS_ERR(dupreq)) {
+ desc->pg_error = PTR_ERR(dupreq);
+ goto out_failed;
+ }
+
+ nfs_pgio_set_current_mirror(desc, midx);
+ if (!nfs_pageio_add_request_mirror(desc, dupreq))
+ goto out_cleanup_subreq;
+ }
+
+ /* Finally queue the original request on mirror 0 */
+ nfs_pgio_set_current_mirror(desc, 0);
+ if (!nfs_pageio_add_request_mirror(desc, req))
+ goto out_failed;
+
+ return 1;
+
+out_cleanup_subreq:
+ /* The subrequest that could not be added is cleaned up separately */
+ nfs_pageio_cleanup_request(desc, dupreq);
+out_failed:
+ nfs_pageio_error_cleanup(desc);
+ return 0;
+}
+
+/*
+ * nfs_pageio_complete_mirror - Complete I/O on one mirror of an
+ *				nfs_pageio_descriptor
+ * @desc: pointer to io descriptor
+ * @mirror_idx: index of the mirror to complete
+ *
+ * Selects @mirror_idx as the current mirror for the duration of the call
+ * and, before returning, re-selects the index that
+ * nfs_pgio_set_current_mirror() handed back.
+ */
+static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
+				       u32 mirror_idx)
+{
+	u32 saved_idx = nfs_pgio_set_current_mirror(desc, mirror_idx);
+	struct nfs_pgio_mirror *m = nfs_pgio_current_mirror(desc);
+
+	/*
+	 * Issue the I/O, then keep going only while there is no error, the
+	 * mirror asks to be recoalesced, and recoalescing succeeds.
+	 */
+	do {
+		nfs_pageio_doio(desc);
+	} while (desc->pg_error >= 0 && m->pg_recoalesce &&
+		 nfs_do_recoalesce(desc));
+
+	nfs_pgio_set_current_mirror(desc, saved_idx);
+}
+
+/*
+ * nfs_pageio_resend - Transfer requests to new descriptor and resend
+ * @desc - the pageio descriptor to add requests to
+ * @hdr - the pgio header to move request from
+ *
+ * Try to move each request (nfs_page) from @hdr to @desc then attempt
+ * to send them. Requests that could not be moved are passed to the
+ * header's error_cleanup callback and the error is recorded on @hdr.
+ *
+ * Returns 0 on success and < 0 on error.
+ */
+int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
+		      struct nfs_pgio_header *hdr)
+{
+	LIST_HEAD(pending);
+	int err;
+
+	desc->pg_io_completion = hdr->io_completion;
+	desc->pg_dreq = hdr->dreq;
+	list_splice_init(&hdr->pages, &pending);
+
+	/* Feed requests to @desc until the list drains or one is refused */
+	while (!list_empty(&pending)) {
+		if (!nfs_pageio_add_request(desc,
+					    nfs_list_entry(pending.next)))
+			break;
+	}
+	nfs_pageio_complete(desc);
+
+	if (list_empty(&pending))
+		return 0;
+
+	/* Fall back to -EIO when the descriptor recorded no error itself */
+	err = desc->pg_error < 0 ? desc->pg_error : -EIO;
+	hdr->completion_ops->error_cleanup(&pending, err);
+	nfs_set_pgio_error(hdr, err, hdr->io_start);
+	return err;
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_resend);
+
+/**
+ * nfs_pageio_complete - Complete I/O then cleanup an nfs_pageio_descriptor
+ * @desc: pointer to io descriptor
+ *
+ * Completes every mirror in index order, runs the error cleanup if
+ * desc->pg_error is negative, calls the optional pg_cleanup op and finally
+ * nfs_pageio_cleanup_mirroring().
+ */
+void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
+{
+	u32 i = 0;
+
+	while (i < desc->pg_mirror_count) {
+		nfs_pageio_complete_mirror(desc, i);
+		i++;
+	}
+
+	if (desc->pg_error < 0)
+		nfs_pageio_error_cleanup(desc);
+
+	/* pg_cleanup is optional */
+	if (desc->pg_ops->pg_cleanup != NULL)
+		desc->pg_ops->pg_cleanup(desc);
+
+	nfs_pageio_cleanup_mirroring(desc);
+}
+
+/**
+ * nfs_pageio_cond_complete - Conditional I/O completion
+ * @desc: pointer to io descriptor
+ * @index: page index
+ *
+ * It is important to ensure that processes don't try to take locks
+ * on non-contiguous ranges of pages as that might deadlock. This
+ * function should be called before attempting to wait on a locked
+ * nfs_page. It will complete the I/O if the page index 'index'
+ * is not contiguous with the existing list of pages in 'desc'.
+ *
+ * Each mirror's last queued request is checked; the first mismatch
+ * completes the whole descriptor and ends the scan.
+ */
+void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
+{
+	u32 i;
+
+	for (i = 0; i < desc->pg_mirror_count; i++) {
+		struct nfs_pgio_mirror *m = nfs_pgio_get_mirror(desc, i);
+		struct nfs_page *last;
+
+		if (list_empty(&m->pg_list))
+			continue;
+
+		last = nfs_list_entry(m->pg_list.prev);
+		if (index == last->wb_index + 1)
+			continue;
+
+		nfs_pageio_complete(desc);
+		break;
+	}
+}
+
+/*
+ * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
+ * @pgio: pointer to io descriptor
+ *
+ * Implemented as a plain call to nfs_pageio_complete() on @pgio.
+ * NOTE(review): the mirror count is not written in this function itself;
+ * presumably it is reset somewhere under nfs_pageio_complete() - confirm.
+ */
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
+{
+ nfs_pageio_complete(pgio);
+}
+
+/*
+ * Create the "nfs_page" slab cache used for struct nfs_page objects.
+ *
+ * Returns 0 on success, or -ENOMEM if the cache could not be created.
+ */
+int __init nfs_init_nfspagecache(void)
+{
+	nfs_page_cachep = kmem_cache_create("nfs_page",
+					    sizeof(struct nfs_page),
+					    0, SLAB_HWCACHE_ALIGN,
+					    NULL);
+
+	return nfs_page_cachep != NULL ? 0 : -ENOMEM;
+}
+
+/* Destroy the slab cache held in nfs_page_cachep. */
+void nfs_destroy_nfspagecache(void)
+{
+ kmem_cache_destroy(nfs_page_cachep);
+}
+
+/* RPC callback table wiring up the nfs_pgio_* prepare/done/release handlers */
+static const struct rpc_call_ops nfs_pgio_common_ops = {
+ .rpc_call_prepare = nfs_pgio_prepare,
+ .rpc_call_done = nfs_pgio_result,
+ .rpc_release = nfs_pgio_release,
+};
+
+/*
+ * Page I/O ops built from the generic helpers: nfs_generic_pg_test for
+ * pg_test and nfs_generic_pg_pgios for pg_doio. No pg_cleanup is set.
+ */
+const struct nfs_pageio_ops nfs_pgio_rw_ops = {
+ .pg_test = nfs_generic_pg_test,
+ .pg_doio = nfs_generic_pg_pgios,
+};
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
new file mode 100644
index 000000000..ed6a3ed83
--- /dev/null
+++ b/fs/nfs/pnfs.c
@@ -0,0 +1,3360 @@
+/*
+ * pNFS functions to call and manage layout drivers.
+ *
+ * Copyright (c) 2002 [year of first publication]
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
+ * Dean Hildebrand <dhildebz@umich.edu>
+ *
+ * Permission is granted to use, copy, create derivative works, and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the University of Michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. If
+ * the above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * This software is provided as is, without representation or warranty
+ * of any kind either express or implied, including without limitation
+ * the implied warranties of merchantability, fitness for a particular
+ * purpose, or noninfringement. The Regents of the University of
+ * Michigan shall not be liable for any damages, including special,
+ * indirect, incidental, or consequential damages, with respect to any
+ * claim arising out of or in connection with the use of the software,
+ * even if it has been or is hereafter advised of the possibility of
+ * such damages.
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/module.h>
+#include <linux/sort.h>
+#include "internal.h"
+#include "pnfs.h"
+#include "iostat.h"
+#include "nfs4trace.h"
+#include "delegation.h"
+#include "nfs42.h"
+#include "nfs4_fs.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS
+#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
+
+/* Locking:
+ *
+ * pnfs_spinlock:
+ * protects pnfs_modules_tbl.
+ */
+static DEFINE_SPINLOCK(pnfs_spinlock);
+
+/*
+ * pnfs_modules_tbl holds all pnfs modules
+ * (layout drivers are linked in through their pnfs_tblid member)
+ */
+static LIST_HEAD(pnfs_modules_tbl);
+
+/* Forward declarations of static helpers that are used before their definition */
+static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);
+static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
+ struct list_head *free_me,
+ const struct pnfs_layout_range *range,
+ u32 seq);
+static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
+ struct list_head *tmp_list);
+
+/*
+ * Return the registered pnfs layout driver module matching given id, or
+ * NULL if none is registered. The _locked suffix indicates the caller is
+ * expected to hold pnfs_spinlock, as both callers in this file do.
+ */
+static struct pnfs_layoutdriver_type *
+find_pnfs_driver_locked(u32 id)
+{
+	struct pnfs_layoutdriver_type *cur;
+	struct pnfs_layoutdriver_type *found = NULL;
+
+	list_for_each_entry(cur, &pnfs_modules_tbl, pnfs_tblid) {
+		if (cur->id == id) {
+			found = cur;
+			break;
+		}
+	}
+
+	dprintk("%s: Searching for id %u, found %p\n", __func__, id, found);
+	return found;
+}
+
+/*
+ * Look up the layout driver registered for @id and take a reference on its
+ * owning module. Returns NULL if no such driver is registered or the module
+ * reference could not be taken.
+ */
+static struct pnfs_layoutdriver_type *
+find_pnfs_driver(u32 id)
+{
+	struct pnfs_layoutdriver_type *ld;
+
+	spin_lock(&pnfs_spinlock);
+	ld = find_pnfs_driver_locked(id);
+	if (ld != NULL) {
+		if (!try_module_get(ld->owner)) {
+			dprintk("%s: Could not grab reference on module\n", __func__);
+			ld = NULL;
+		}
+	}
+	spin_unlock(&pnfs_spinlock);
+
+	return ld;
+}
+
+/*
+ * Non-static wrapper around find_pnfs_driver(): returns the driver registered
+ * for @id with a module reference held, or NULL.
+ */
+const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id)
+{
+ return find_pnfs_driver(id);
+}
+
+/* Drop the module reference of layout driver @ld; a NULL @ld is ignored. */
+void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld)
+{
+	if (ld == NULL)
+		return;
+
+	module_put(ld->owner);
+}
+
+/*
+ * Detach the current layout driver, if any, from @nfss.
+ *
+ * Calls the driver's optional clear_layoutdriver op, drops this server's
+ * contribution to the client's cl_mds_count and releases the driver's module
+ * reference. nfss->pnfs_curr_ld is NULL on return in every case.
+ */
+void
+unset_pnfs_layoutdriver(struct nfs_server *nfss)
+{
+ if (nfss->pnfs_curr_ld) {
+ if (nfss->pnfs_curr_ld->clear_layoutdriver)
+ nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
+ /* Decrement the MDS count. Purge the deviceid cache if zero */
+ if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count))
+ nfs4_deviceid_purge_client(nfss->nfs_client);
+ module_put(nfss->pnfs_curr_ld->owner);
+ }
+ nfss->pnfs_curr_ld = NULL;
+}
+
+/*
+ * When the server sends a list of layout types, we choose one in the order
+ * given in the list below.
+ *
+ * The list is terminated by a 0 entry, which ld_cmp() uses as its end marker.
+ *
+ * FIXME: should this list be configurable in some fashion? module param?
+ * mount option? something else?
+ */
+static const u32 ld_prefs[] = {
+ LAYOUT_SCSI,
+ LAYOUT_BLOCK_VOLUME,
+ LAYOUT_OSD2_OBJECTS,
+ LAYOUT_FLEX_FILES,
+ LAYOUT_NFSV4_1_FILES,
+ 0
+};
+
+/*
+ * sort() comparator ordering layout type ids by their position in ld_prefs.
+ *
+ * @e1, @e2: pointers to the two u32 layout type ids being compared.
+ *
+ * Returns 0 when both ids are identical or when neither appears in
+ * ld_prefs, -1 when @e1's id comes first in ld_prefs, and 1 when @e2's id
+ * comes first.
+ */
+static int
+ld_cmp(const void *e1, const void *e2)
+{
+	u32 ld1 = *(const u32 *)e1;
+	u32 ld2 = *(const u32 *)e2;
+	int i;
+
+	/*
+	 * Identical ids must compare equal; without this check two equal
+	 * preferred ids would compare as "less than" in both directions.
+	 */
+	if (ld1 == ld2)
+		return 0;
+
+	for (i = 0; ld_prefs[i] != 0; i++) {
+		if (ld1 == ld_prefs[i])
+			return -1;
+
+		if (ld2 == ld_prefs[i])
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Try to set the server's pnfs module to one of the pnfs layout types
+ * advertised in @fsinfo.
+ * Currently only one pNFS layout driver per filesystem is supported.
+ *
+ * @server: server whose pnfs_curr_ld is set (or cleared on failure)
+ * @mntfh: filehandle passed through to the driver's set_layoutdriver op
+ * @fsinfo: carries layouttype[]/nlayouttypes, the array of layout types
+ *          supported by MDS; the array is sorted in place by preference
+ *
+ * On any failure the server is left with pnfs_curr_ld == NULL.
+ */
+void
+set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
+ struct nfs_fsinfo *fsinfo)
+{
+ struct pnfs_layoutdriver_type *ld_type = NULL;
+ u32 id;
+ int i;
+
+ if (fsinfo->nlayouttypes == 0)
+ goto out_no_driver;
+ if (!(server->nfs_client->cl_exchange_flags &
+ (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
+ printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n",
+ __func__, server->nfs_client->cl_exchange_flags);
+ goto out_no_driver;
+ }
+
+ /* Order the advertised layout types by our preference (see ld_cmp) */
+ sort(fsinfo->layouttype, fsinfo->nlayouttypes,
+ sizeof(*fsinfo->layouttype), ld_cmp, NULL);
+
+ /* Take the first type with a registered driver, trying a module load once */
+ for (i = 0; i < fsinfo->nlayouttypes; i++) {
+ id = fsinfo->layouttype[i];
+ ld_type = find_pnfs_driver(id);
+ if (!ld_type) {
+ request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX,
+ id);
+ ld_type = find_pnfs_driver(id);
+ }
+ if (ld_type)
+ break;
+ }
+
+ if (!ld_type) {
+ dprintk("%s: No pNFS module found!\n", __func__);
+ goto out_no_driver;
+ }
+
+ server->pnfs_curr_ld = ld_type;
+ if (ld_type->set_layoutdriver
+ && ld_type->set_layoutdriver(server, mntfh)) {
+ printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
+ "driver %u.\n", __func__, id);
+ /* Give back the module reference taken by find_pnfs_driver() */
+ module_put(ld_type->owner);
+ goto out_no_driver;
+ }
+ /* Bump the MDS count */
+ atomic_inc(&server->nfs_client->cl_mds_count);
+
+ dprintk("%s: pNFS module for %u set\n", __func__, id);
+ return;
+
+out_no_driver:
+ dprintk("%s: Using NFSv4 I/O\n", __func__);
+ server->pnfs_curr_ld = NULL;
+}
+
+/*
+ * Register a pNFS layout driver in pnfs_modules_tbl.
+ *
+ * @ld_type: driver description; its id must be non-zero and it must provide
+ *           both alloc_lseg and free_lseg.
+ *
+ * Returns 0 on success, or -EINVAL if @ld_type fails the checks above or a
+ * driver with the same id is already registered.
+ */
+int
+pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
+{
+	int status = -EINVAL;
+	struct pnfs_layoutdriver_type *tmp;
+
+	if (ld_type->id == 0) {
+		printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
+		return status;
+	}
+	if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
+		printk(KERN_ERR "NFS: %s Layout driver must provide "
+		       "alloc_lseg and free_lseg.\n", __func__);
+		return status;
+	}
+
+	spin_lock(&pnfs_spinlock);
+	tmp = find_pnfs_driver_locked(ld_type->id);
+	if (!tmp) {
+		list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
+		status = 0;
+		dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
+			ld_type->name);
+	} else {
+		/* The id is unsigned: print it with %u, as the dprintk above does */
+		printk(KERN_ERR "NFS: %s Module with id %u already loaded!\n",
+		       __func__, ld_type->id);
+	}
+	spin_unlock(&pnfs_spinlock);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
+
+/* Unlink layout driver @ld_type from pnfs_modules_tbl, under pnfs_spinlock. */
+void
+pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
+{
+ dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
+ spin_lock(&pnfs_spinlock);
+ list_del(&ld_type->pnfs_tblid);
+ spin_unlock(&pnfs_spinlock);
+}
+EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
+
+/*
+ * pNFS client layout cache
+ */
+
+/*
+ * Take an additional reference on layout header @lo.
+ * Need to hold i_lock if caller does not already hold reference
+ */
+void
+pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ refcount_inc(&lo->plh_refcount);
+}
+
+/*
+ * Allocate a layout header for @ino through the alloc_layout_hdr op of the
+ * layout driver currently set on the inode's server.
+ */
+static struct pnfs_layout_hdr *
+pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
+{
+	return NFS_SERVER(ino)->pnfs_curr_ld->alloc_layout_hdr(ino, gfp_flags);
+}
+
+/*
+ * Release a layout header through the layout driver's free_layout_hdr op.
+ *
+ * If the header is still flagged NFS_LAYOUT_HASHED it is first unlinked from
+ * its plh_layouts list under the client's cl_lock. The reference held on
+ * plh_lc_cred is dropped before the driver callback runs.
+ */
+static void
+pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct nfs_server *server = NFS_SERVER(lo->plh_inode);
+	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
+
+	if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
+		struct nfs_client *clp = server->nfs_client;
+
+		spin_lock(&clp->cl_lock);
+		list_del_rcu(&lo->plh_layouts);
+		spin_unlock(&clp->cl_lock);
+	}
+	put_cred(lo->plh_lc_cred);
+	/* void function: call the op instead of "return"-ing its result */
+	ld->free_layout_hdr(lo);
+}
+
+/*
+ * Disconnect layout header @lo from its inode: clear the nfs_inode's layout
+ * pointer and zero its read_io/write_io counters.
+ */
+static void
+pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct nfs_inode *ni = NFS_I(lo->plh_inode);
+
+	dprintk("%s: freeing layout cache %p\n", __func__, lo);
+	ni->layout = NULL;
+
+	/* Reset MDS Threshold I/O counters */
+	ni->read_io = 0;
+	ni->write_io = 0;
+}
+
+/*
+ * Drop a reference on a layout header; a NULL @lo is ignored.
+ *
+ * pnfs_layoutreturn_before_put_layout_hdr() is called first. When the last
+ * reference goes away the header is detached from the inode under i_lock
+ * and then freed, and waiters on @lo are woken if the inode was flagged
+ * I_FREEING or I_CLEAR.
+ */
+void
+pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct inode *inode;
+	unsigned long state;
+
+	if (lo == NULL)
+		return;
+
+	inode = lo->plh_inode;
+	pnfs_layoutreturn_before_put_layout_hdr(lo);
+
+	/* i_lock is only held on return when the count dropped to zero */
+	if (!refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock))
+		return;
+
+	if (!list_empty(&lo->plh_segs))
+		WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
+	pnfs_detach_layout_hdr(lo);
+	/* Sample i_state while i_lock is still held */
+	state = inode->i_state;
+	spin_unlock(&inode->i_lock);
+
+	pnfs_free_layout_hdr(lo);
+	/* Notify pnfs_destroy_layout_final() that we're done */
+	if (state & (I_FREEING | I_CLEAR))
+		wake_up_var(lo);
+}
+
+/*
+ * Try to igrab() the inode behind layout header @lo.
+ *
+ * Returns the inode on success. On failure NFS_LAYOUT_INODE_FREEING is set
+ * in lo->plh_flags and NULL is returned.
+ */
+static struct inode *
+pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct inode *grabbed = igrab(lo->plh_inode);
+
+	if (grabbed == NULL)
+		set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
+
+	return grabbed;
+}
+
+/*
+ * Compare 2 layout stateid sequence ids, to see which is newer,
+ * taking into account wraparound issues.
+ *
+ * Returns true when @s1 is newer than @s2, judged by the sign of their
+ * 32-bit difference.
+ */
+static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
+{
+	s32 delta = s1 - s2;
+
+	return delta > 0;
+}
+
+/*
+ * Advance lo->plh_barrier to @newseq if the barrier is currently unset (0)
+ * or @newseq is newer than it.
+ */
+static void pnfs_barrier_update(struct pnfs_layout_hdr *lo, u32 newseq)
+{
+	const bool newer = pnfs_seqid_is_newer(newseq, lo->plh_barrier);
+
+	if (newer || lo->plh_barrier == 0)
+		lo->plh_barrier = newseq;
+}
+
+/*
+ * Record on layout header @lo that a layoutreturn has been requested.
+ *
+ * @iomode: iomode to return; widened to IOMODE_ANY if a different non-zero
+ *          iomode is already recorded in plh_return_iomode
+ * @seq: sequence id to record, or 0 to use the seqid of lo->plh_stateid
+ *
+ * Sets NFS_LAYOUT_RETURN_REQUESTED, raises plh_return_seq if @seq is newer
+ * (or none is recorded) and updates the barrier with the same sequence id.
+ */
+static void
+pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
+ u32 seq)
+{
+ if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
+ iomode = IOMODE_ANY;
+ lo->plh_return_iomode = iomode;
+ set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+ /*
+ * We must set lo->plh_return_seq to avoid livelocks with
+ * pnfs_layout_need_return()
+ */
+ if (seq == 0)
+ seq = be32_to_cpu(lo->plh_stateid.seqid);
+ if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq))
+ lo->plh_return_seq = seq;
+ pnfs_barrier_update(lo, seq);
+}
+
+/*
+ * Reset the layoutreturn bookkeeping on @lo (iomode, sequence and the
+ * NFS_LAYOUT_RETURN_REQUESTED flag), then record it afresh for every
+ * segment on plh_segs that still has NFS_LSEG_LAYOUTRETURN set.
+ */
+static void
+pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
+{
+	struct pnfs_layout_segment *seg;
+
+	lo->plh_return_iomode = 0;
+	lo->plh_return_seq = 0;
+	clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+
+	list_for_each_entry(seg, &lo->plh_segs, pls_list) {
+		if (test_bit(NFS_LSEG_LAYOUTRETURN, &seg->pls_flags))
+			pnfs_set_plh_return_info(lo, seg->pls_range.iomode, 0);
+	}
+}
+
+/*
+ * Clear the NFS_LAYOUT_RETURN and NFS_LAYOUT_RETURN_LOCK bits on @lo, then
+ * wake anything waiting on the NFS_LAYOUT_RETURN bit as well as the RPC
+ * tasks queued on the server's roc_rpcwaitq.
+ */
+static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
+{
+ clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
+ clear_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags);
+ /* Barrier between the bit clears above and the wake-ups below */
+ smp_mb__after_atomic();
+ wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
+ rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
+}
+
+/*
+ * Strip the ROC, LAYOUTRETURN, VALID and LAYOUTCOMMIT flags from @lseg.
+ *
+ * For each of VALID and LAYOUTCOMMIT that was set, one reference is dropped
+ * via pnfs_lseg_dec_and_remove_zero(), which moves the segment onto
+ * @free_me when its count reaches zero.
+ */
+static void
+pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg,
+ struct list_head *free_me)
+{
+ clear_bit(NFS_LSEG_ROC, &lseg->pls_flags);
+ clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
+ if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags))
+ pnfs_lseg_dec_and_remove_zero(lseg, free_me);
+ if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
+ pnfs_lseg_dec_and_remove_zero(lseg, free_me);
+}
+
+/*
+ * Update the seqid of a layout stateid after receiving
+ * NFS4ERR_OLD_STATEID
+ *
+ * @dst: stateid used by the failed call; its seqid may be rewritten
+ * @dst_range: set to the full IOMODE_ANY range when @dst is brought up to
+ *             the layout header's seqid
+ * @inode: inode whose layout is consulted, under its i_lock
+ *
+ * Nothing is changed unless the inode has a valid layout whose stateid
+ * matches @dst in its "other" field.
+ *
+ * Returns true if @dst was updated, false otherwise.
+ */
+bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
+ struct pnfs_layout_range *dst_range,
+ struct inode *inode)
+{
+ struct pnfs_layout_hdr *lo;
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+ bool ret = false;
+ LIST_HEAD(head);
+ int err;
+
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (lo && pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match_other(dst, &lo->plh_stateid)) {
+ /* Is our call using the most recent seqid? If so, bump it */
+ if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) {
+ nfs4_stateid_seqid_inc(dst);
+ ret = true;
+ goto out;
+ }
+ /* Try to update the seqid to the most recent */
+ err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0);
+ if (err != -EBUSY) {
+ dst->seqid = lo->plh_stateid.seqid;
+ *dst_range = range;
+ ret = true;
+ }
+ }
+out:
+ spin_unlock(&inode->i_lock);
+ /* Segments collected on head are freed outside the spinlock */
+ pnfs_free_lseg_list(&head);
+ return ret;
+}
+
+/*
+ * Mark a pnfs_layout_hdr and all associated layout segments as invalid
+ *
+ * In order to continue using the pnfs_layout_hdr, a full recovery
+ * is required.
+ * Note that caller must hold inode->i_lock.
+ *
+ * @lo: layout header to invalidate
+ * @lseg_list: list that collects the segments to be freed by the caller
+ *
+ * Returns non-zero if lo->plh_segs is still non-empty afterwards, 0 if it
+ * is empty.
+ */
+int
+pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
+ struct list_head *lseg_list)
+{
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+ struct pnfs_layout_segment *lseg, *next;
+
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
+ pnfs_clear_lseg_state(lseg, lseg_list);
+ pnfs_clear_layoutreturn_info(lo);
+ /* The all-covering range above matches every segment awaiting return */
+ pnfs_free_returned_lsegs(lo, lseg_list, &range, 0);
+ set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
+ /* A return is flagged and nobody holds RETURN_LOCK: release the waiters */
+ if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
+ !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
+ pnfs_clear_layoutreturn_waitbit(lo);
+ return !list_empty(&lo->plh_segs);
+}
+
+/*
+ * Map an iomode to its layout failure flag: NFS_LAYOUT_RW_FAILED for
+ * IOMODE_RW, NFS_LAYOUT_RO_FAILED for anything else.
+ */
+static int
+pnfs_iomode_to_fail_bit(u32 iomode)
+{
+	if (iomode == IOMODE_RW)
+		return NFS_LAYOUT_RW_FAILED;
+
+	return NFS_LAYOUT_RO_FAILED;
+}
+
+/*
+ * Stamp lo->plh_retry_timestamp with the current jiffies and set @fail_bit.
+ * A layout header reference is taken only when the bit was not already set.
+ */
+static void
+pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
+{
+	lo->plh_retry_timestamp = jiffies;
+
+	if (test_and_set_bit(fail_bit, &lo->plh_flags))
+		return;
+
+	refcount_inc(&lo->plh_refcount);
+}
+
+/*
+ * Clear @fail_bit on @lo; if it was set, drop the layout header reference
+ * that pnfs_layout_set_fail_bit() took for it.
+ */
+static void
+pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
+{
+	if (!test_and_clear_bit(fail_bit, &lo->plh_flags))
+		return;
+
+	refcount_dec(&lo->plh_refcount);
+}
+
+/*
+ * Flag layout I/O in mode @iomode as failed on @lo and invalidate every
+ * segment matching that iomode over the whole file range. Segments that
+ * become free are released after i_lock has been dropped.
+ */
+static void
+pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
+{
+	const int fail_bit = pnfs_iomode_to_fail_bit(iomode);
+	struct inode *inode = lo->plh_inode;
+	struct pnfs_layout_range whole_file = {
+		.iomode = iomode,
+		.offset = 0,
+		.length = NFS4_MAX_UINT64,
+	};
+	LIST_HEAD(doomed);
+
+	spin_lock(&inode->i_lock);
+	pnfs_layout_set_fail_bit(lo, fail_bit);
+	pnfs_mark_matching_lsegs_invalid(lo, &doomed, &whole_file, 0);
+	spin_unlock(&inode->i_lock);
+
+	pnfs_free_lseg_list(&doomed);
+	dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
+		iomode == IOMODE_RW ? "RW" : "READ");
+}
+
+/*
+ * Report whether layout I/O in mode @iomode is currently marked failed.
+ *
+ * Returns true while the fail bit is set and plh_retry_timestamp lies within
+ * the last PNFS_LAYOUTGET_RETRY_TIMEOUT jiffies. Once the timestamp falls
+ * outside that window the fail bit is cleared and false is returned.
+ */
+static bool
+pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
+{
+	const int fail_bit = pnfs_iomode_to_fail_bit(iomode);
+	unsigned long now, window_start;
+
+	if (!test_bit(fail_bit, &lo->plh_flags))
+		return false;
+
+	now = jiffies;
+	window_start = now - PNFS_LAYOUTGET_RETRY_TIMEOUT;
+	if (time_in_range(lo->plh_retry_timestamp, window_start, now))
+		return true;
+
+	/* It is time to retry the failed layoutgets */
+	pnfs_layout_clear_fail_bit(lo, fail_bit);
+	return false;
+}
+
+/*
+ * Initialise the generic part of a layout segment.
+ *
+ * The segment starts with a reference count of 1 and NFS_LSEG_VALID set,
+ * points back at @lo, copies @range, and records the seqid of @stateid
+ * (converted from big-endian) as its sequence number.
+ */
+static void
+pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
+	       const struct pnfs_layout_range *range,
+	       const nfs4_stateid *stateid)
+{
+	/* Identity of the segment */
+	lseg->pls_layout = lo;
+	lseg->pls_range = *range;
+	lseg->pls_seq = be32_to_cpu(stateid->seqid);
+
+	/* List linkage */
+	INIT_LIST_HEAD(&lseg->pls_list);
+	INIT_LIST_HEAD(&lseg->pls_lc_list);
+	INIT_LIST_HEAD(&lseg->pls_commits);
+
+	/* Lifetime state */
+	refcount_set(&lseg->pls_refcount, 1);
+	set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
+}
+
+/*
+ * Free @lseg through the free_lseg op of the layout driver set on the
+ * segment's server; a NULL @lseg is ignored.
+ */
+static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
+{
+	struct inode *inode;
+
+	if (lseg == NULL)
+		return;
+
+	inode = lseg->pls_layout->plh_inode;
+	NFS_SERVER(inode)->pnfs_curr_ld->free_lseg(lseg);
+}
+
+/*
+ * Unlink @lseg from its list and drop the layout header reference it held.
+ *
+ * Warns if the segment is still flagged NFS_LSEG_VALID. Unless the segment
+ * is flagged for layoutreturn: once plh_segs is empty and no return is
+ * requested or in progress, NFS_LAYOUT_BULK_RECALL is cleared, and the
+ * stateid is marked invalid if no layoutgets are outstanding.
+ */
+static void
+pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg)
+{
+ WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+ list_del_init(&lseg->pls_list);
+ /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
+ refcount_dec(&lo->plh_refcount);
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ return;
+ if (list_empty(&lo->plh_segs) &&
+ !test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
+ !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
+ if (atomic_read(&lo->plh_outstanding) == 0)
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+ }
+}
+
+/*
+ * If @lseg was flagged NFS_LSEG_LAYOUTRETURN (the flag is cleared here) and
+ * @lo is still valid, record the return info on @lo and park the segment on
+ * lo->plh_return_segs.
+ *
+ * Returns true if the segment was parked, false otherwise.
+ */
+static bool
+pnfs_cache_lseg_for_layoutreturn(struct pnfs_layout_hdr *lo,
+				 struct pnfs_layout_segment *lseg)
+{
+	if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) ||
+	    !pnfs_layout_is_valid(lo))
+		return false;
+
+	pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
+	list_move_tail(&lseg->pls_list, &lo->plh_return_segs);
+	return true;
+}
+
+/*
+ * Drop a reference on a layout segment; a NULL @lseg is ignored.
+ *
+ * When the count reaches zero and NFS_LSEG_VALID is clear, the segment is
+ * removed from its layout header under i_lock. It is then either parked on
+ * plh_return_segs by pnfs_cache_lseg_for_layoutreturn() or freed. A
+ * temporary layout header reference is held across the removal.
+ */
+void
+pnfs_put_lseg(struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_layout_hdr *lo;
+ struct inode *inode;
+
+ if (!lseg)
+ return;
+
+ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
+ refcount_read(&lseg->pls_refcount),
+ test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+
+ lo = lseg->pls_layout;
+ inode = lo->plh_inode;
+
+ if (refcount_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
+ /* Still flagged valid: leave the segment where it is */
+ if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+ pnfs_get_layout_hdr(lo);
+ pnfs_layout_remove_lseg(lo, lseg);
+ /* Parked for layoutreturn: it must not be freed below */
+ if (pnfs_cache_lseg_for_layoutreturn(lo, lseg))
+ lseg = NULL;
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg(lseg);
+ pnfs_put_layout_hdr(lo);
+ }
+}
+EXPORT_SYMBOL_GPL(pnfs_put_lseg);
+
+/*
+ * is l2 fully contained in l1?
+ * start1 end1
+ * [----------------------------------)
+ * start2 end2
+ * [----------------)
+ *
+ * End offsets are derived from offset and length with pnfs_end_offset().
+ */
+static bool
+pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
+			  const struct pnfs_layout_range *l2)
+{
+	const u64 outer_start = l1->offset;
+	const u64 outer_end = pnfs_end_offset(outer_start, l1->length);
+	const u64 inner_start = l2->offset;
+	const u64 inner_end = pnfs_end_offset(inner_start, l2->length);
+
+	if (outer_start > inner_start)
+		return false;
+
+	return outer_end >= inner_end;
+}
+
+/*
+ * Drop one reference on @lseg. If that was the last one, remove the segment
+ * from its layout header and add it to @tmp_list.
+ *
+ * Returns true if the segment was moved to @tmp_list, false if references
+ * remain.
+ */
+static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
+ struct list_head *tmp_list)
+{
+ if (!refcount_dec_and_test(&lseg->pls_refcount))
+ return false;
+ pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
+ list_add(&lseg->pls_list, tmp_list);
+ return true;
+}
+
+/*
+ * Clear NFS_LSEG_VALID on @lseg and, if it was set, drop the reference that
+ * kept the segment in the list.
+ *
+ * Returns 1 if lseg is removed from list, 0 otherwise
+ */
+static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
+			     struct list_head *tmp_list)
+{
+	if (!test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags))
+		return 0;
+
+	/* Remove the reference keeping the lseg in the
+	 * list. It will now be removed when all
+	 * outstanding io is finished.
+	 */
+	dprintk("%s: lseg %p ref %d\n", __func__, lseg,
+		refcount_read(&lseg->pls_refcount));
+
+	return pnfs_lseg_dec_and_remove_zero(lseg, tmp_list) ? 1 : 0;
+}
+
+/*
+ * Does @lseg_range fall under @recall_range? The iomodes must agree (a
+ * recall iomode of IOMODE_ANY matches everything) and the two ranges must
+ * intersect.
+ */
+static bool
+pnfs_should_free_range(const struct pnfs_layout_range *lseg_range,
+		       const struct pnfs_layout_range *recall_range)
+{
+	if (recall_range->iomode != IOMODE_ANY &&
+	    lseg_range->iomode != recall_range->iomode)
+		return false;
+
+	return pnfs_lseg_range_intersecting(lseg_range, recall_range);
+}
+
+/*
+ * Decide whether @lseg is covered by a recall.
+ *
+ * @recall_range: range to match, or NULL to match any range
+ * @seq: if non-zero, segments with a sequence newer than @seq never match
+ */
+static bool
+pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg,
+		       const struct pnfs_layout_range *recall_range,
+		       u32 seq)
+{
+	const bool too_new = seq != 0 &&
+			     pnfs_seqid_is_newer(lseg->pls_seq, seq);
+
+	if (too_new)
+		return false;
+
+	return recall_range == NULL ||
+	       pnfs_should_free_range(&lseg->pls_range, recall_range);
+}
+
+/**
+ * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
+ * @lo: layout header containing the lsegs
+ * @tmp_list: list head where doomed lsegs should go
+ * @recall_range: optional recall range argument to match (may be NULL)
+ * @seq: only invalidate lsegs obtained prior to this sequence (may be 0)
+ *
+ * Walk the list of lsegs in the layout header, and tear down any that should
+ * be destroyed. If "recall_range" is specified then the segment must match
+ * that range. If "seq" is non-zero, then only match segments that were handed
+ * out at or before that sequence.
+ *
+ * Returns number of matching invalid lsegs remaining in list after scanning
+ * it and purging them.
+ */
+int
+pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+				 struct list_head *tmp_list,
+				 const struct pnfs_layout_range *recall_range,
+				 u32 seq)
+{
+	struct pnfs_layout_segment *seg, *tmp;
+	int left = 0;
+
+	dprintk("%s:Begin lo %p\n", __func__, lo);
+
+	if (list_empty(&lo->plh_segs))
+		return 0;
+
+	list_for_each_entry_safe(seg, tmp, &lo->plh_segs, pls_list) {
+		if (!pnfs_match_lseg_recall(seg, recall_range, seq))
+			continue;
+
+		dprintk("%s: freeing lseg %p iomode %d seq %u "
+			"offset %llu length %llu\n", __func__,
+			seg, seg->pls_range.iomode, seg->pls_seq,
+			seg->pls_range.offset, seg->pls_range.length);
+		/* Matching segments that could not be removed yet are counted */
+		if (mark_lseg_invalid(seg, tmp_list) == 0)
+			left++;
+	}
+
+	dprintk("%s:Return %i\n", __func__, left);
+	return left;
+}
+
+/*
+ * Move every segment on lo->plh_return_segs that matches @range and @seq
+ * (as judged by pnfs_match_lseg_recall()) onto the tail of @free_me.
+ */
+static void
+pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
+			 struct list_head *free_me,
+			 const struct pnfs_layout_range *range,
+			 u32 seq)
+{
+	struct pnfs_layout_segment *seg, *tmp;
+
+	list_for_each_entry_safe(seg, tmp, &lo->plh_return_segs, pls_list) {
+		if (!pnfs_match_lseg_recall(seg, range, seq))
+			continue;
+		list_move_tail(&seg->pls_list, free_me);
+	}
+}
+
+/*
+ * Unlink and free every layout segment on @free_me.
+ *
+ * note free_me must contain lsegs from a single layout_hdr
+ */
+void
+pnfs_free_lseg_list(struct list_head *free_me)
+{
+	struct pnfs_layout_segment *seg;
+
+	while (!list_empty(free_me)) {
+		seg = list_first_entry(free_me, struct pnfs_layout_segment,
+				       pls_list);
+		list_del(&seg->pls_list);
+		pnfs_free_lseg(seg);
+	}
+}
+
+/*
+ * Tear down the layout attached to @nfsi, if there is one.
+ *
+ * Under i_lock the layout stateid is marked invalid and both I/O failure
+ * bits are cleared; the collected segments are then freed, the inode's
+ * commits are kicked with nfs_commit_inode() and the temporary header
+ * reference is dropped.
+ *
+ * Returns the layout header pointer that was found (NULL if none). The
+ * reference taken here has already been dropped by then, so callers may
+ * only use the value as an identity token.
+ */
+static struct pnfs_layout_hdr *__pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+ struct pnfs_layout_hdr *lo;
+ LIST_HEAD(tmp_list);
+
+ spin_lock(&nfsi->vfs_inode.i_lock);
+ lo = nfsi->layout;
+ if (lo) {
+ pnfs_get_layout_hdr(lo);
+ pnfs_mark_layout_stateid_invalid(lo, &tmp_list);
+ pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
+ pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
+ spin_unlock(&nfsi->vfs_inode.i_lock);
+ pnfs_free_lseg_list(&tmp_list);
+ nfs_commit_inode(&nfsi->vfs_inode, 0);
+ pnfs_put_layout_hdr(lo);
+ } else
+ spin_unlock(&nfsi->vfs_inode.i_lock);
+ return lo;
+}
+
+/*
+ * Exported wrapper around __pnfs_destroy_layout() that discards its return
+ * value.
+ */
+void pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+ __pnfs_destroy_layout(nfsi);
+}
+EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
+
+/*
+ * Check, under i_lock, whether @nfsi no longer points at layout header @lo.
+ * Only the pointer value of @lo is compared; it is never dereferenced.
+ */
+static bool pnfs_layout_removed(struct nfs_inode *nfsi,
+				struct pnfs_layout_hdr *lo)
+{
+	struct inode *inode = &nfsi->vfs_inode;
+	bool gone;
+
+	spin_lock(&inode->i_lock);
+	gone = (nfsi->layout != lo);
+	spin_unlock(&inode->i_lock);
+
+	return gone;
+}
+
+/*
+ * Destroy the layout attached to @nfsi and, if there was one, wait until
+ * the inode no longer points at that layout header.
+ */
+void pnfs_destroy_layout_final(struct nfs_inode *nfsi)
+{
+	struct pnfs_layout_hdr *lo = __pnfs_destroy_layout(nfsi);
+
+	if (lo == NULL)
+		return;
+
+	wait_var_event(lo, pnfs_layout_removed(nfsi, lo));
+}
+
+/*
+ * Queue the layout header of @inode on @layout_list for bulk destruction.
+ *
+ * Done under i_lock, and only if the inode has a layout that is not already
+ * on a bulk destroy list; a layout header reference is taken for the list.
+ *
+ * Returns true if the header was queued, false otherwise.
+ */
+static bool
+pnfs_layout_add_bulk_destroy_list(struct inode *inode,
+				  struct list_head *layout_list)
+{
+	struct pnfs_layout_hdr *lo;
+	bool queued;
+
+	spin_lock(&inode->i_lock);
+	lo = NFS_I(inode)->layout;
+	queued = lo != NULL && list_empty(&lo->plh_bulk_destroy);
+	if (queued) {
+		pnfs_get_layout_hdr(lo);
+		list_add(&lo->plh_bulk_destroy, layout_list);
+	}
+	spin_unlock(&inode->i_lock);
+
+	return queued;
+}
+
+/*
+ * Caller must hold rcu_read_lock and clp->cl_lock
+ *
+ * Walk server->layouts and queue each eligible layout header on
+ * @layout_list. Headers flagged NFS_LAYOUT_INVALID_STID or
+ * NFS_LAYOUT_INODE_FREEING, or already on a bulk destroy list, are skipped.
+ *
+ * Returns 0 once the walk finishes, or stops early because nfs_sb_active()
+ * failed. Returns -EAGAIN after having dropped and re-taken both cl_lock and
+ * the RCU read lock; the callers in this file restart their scan in that
+ * case.
+ */
+static int
+pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
+ struct nfs_server *server,
+ struct list_head *layout_list)
+ __must_hold(&clp->cl_lock)
+ __must_hold(RCU)
+{
+ struct pnfs_layout_hdr *lo, *next;
+ struct inode *inode;
+
+ list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
+ if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
+ test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) ||
+ !list_empty(&lo->plh_bulk_destroy))
+ continue;
+ /* If the sb is being destroyed, just bail */
+ if (!nfs_sb_active(server->super))
+ break;
+ inode = pnfs_grab_inode_layout_hdr(lo);
+ if (inode != NULL) {
+ if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags))
+ list_del_rcu(&lo->plh_layouts);
+ /* Queued: the inode and sb references stay held for the list */
+ if (pnfs_layout_add_bulk_destroy_list(inode,
+ layout_list))
+ continue;
+ /* Not queued: the locks are dropped before iput() is called */
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+ iput(inode);
+ } else {
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+ }
+ /* Locks were dropped above: undo nfs_sb_active() and ask for a rescan */
+ nfs_sb_deactive(server->super);
+ spin_lock(&clp->cl_lock);
+ rcu_read_lock();
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+/*
+ * Destroy every layout header queued on @layout_list.
+ *
+ * For each header: run a layoutcommit on the inode, invalidate the layout
+ * stateid under i_lock, free the collected segments, kick the inode's
+ * commits, and drop the header and inode references held for the list.
+ *
+ * @is_bulk_recall: if true, NFS_LAYOUT_BULK_RECALL is set on headers that
+ *                  still have segments after invalidation
+ *
+ * Returns 0, or -EAGAIN if at least one header still had segments left.
+ */
+static int
+pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
+ bool is_bulk_recall)
+{
+ struct pnfs_layout_hdr *lo;
+ struct inode *inode;
+ LIST_HEAD(lseg_list);
+ int ret = 0;
+
+ while (!list_empty(layout_list)) {
+ lo = list_entry(layout_list->next, struct pnfs_layout_hdr,
+ plh_bulk_destroy);
+ dprintk("%s freeing layout for inode %lu\n", __func__,
+ lo->plh_inode->i_ino);
+ inode = lo->plh_inode;
+
+ pnfs_layoutcommit_inode(inode, false);
+
+ spin_lock(&inode->i_lock);
+ list_del_init(&lo->plh_bulk_destroy);
+ if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) {
+ if (is_bulk_recall)
+ set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+ ret = -EAGAIN;
+ }
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&lseg_list);
+ /* Free all lsegs that are attached to commit buckets */
+ nfs_commit_inode(inode, 0);
+ pnfs_put_layout_hdr(lo);
+ nfs_iput_and_deactive(inode);
+ }
+ return ret;
+}
+
+/*
+ * Destroy the layouts of every nfs_server of @clp whose fsid equals @fsid.
+ *
+ * @is_recall is passed through to pnfs_layout_free_bulk_destroy_list().
+ *
+ * Returns 0 if nothing was queued, otherwise the result of
+ * pnfs_layout_free_bulk_destroy_list().
+ */
+int
+pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
+ struct nfs_fsid *fsid,
+ bool is_recall)
+{
+ struct nfs_server *server;
+ LIST_HEAD(layout_list);
+
+ spin_lock(&clp->cl_lock);
+ rcu_read_lock();
+restart:
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0)
+ continue;
+ /*
+ * A non-zero return means the helper dropped and re-took the
+ * locks, so the superblock list must be walked from the start.
+ */
+ if (pnfs_layout_bulk_destroy_byserver_locked(clp,
+ server,
+ &layout_list) != 0)
+ goto restart;
+ }
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+
+ if (list_empty(&layout_list))
+ return 0;
+ return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
+}
+
+/*
+ * Destroy the layouts of every nfs_server attached to @clp.
+ *
+ * @is_recall is passed through to pnfs_layout_free_bulk_destroy_list().
+ *
+ * Returns 0 if nothing was queued, otherwise the result of
+ * pnfs_layout_free_bulk_destroy_list().
+ */
+int
+pnfs_destroy_layouts_byclid(struct nfs_client *clp,
+ bool is_recall)
+{
+ struct nfs_server *server;
+ LIST_HEAD(layout_list);
+
+ spin_lock(&clp->cl_lock);
+ rcu_read_lock();
+restart:
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ /*
+ * A non-zero return means the helper dropped and re-took the
+ * locks, so the superblock list must be walked from the start.
+ */
+ if (pnfs_layout_bulk_destroy_byserver_locked(clp,
+ server,
+ &layout_list) != 0)
+ goto restart;
+ }
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+
+ if (list_empty(&layout_list))
+ return 0;
+ return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
+}
+
+/*
+ * Called by the state manager to remove all layouts established under an
+ * expired lease.
+ *
+ * The client's device ids are marked invalid and purged before the layouts
+ * themselves are destroyed; the result of the layout destruction is not
+ * reported to the caller.
+ */
+void
+pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+ nfs4_deviceid_mark_client_invalid(clp);
+ nfs4_deviceid_purge_client(clp);
+
+ /* Not a bulk recall, hence is_recall == false */
+ pnfs_destroy_layouts_byclid(clp, false);
+}
+
+/*
+ * Swap the credential cached in the layout header for @cred.
+ *
+ * Nothing happens when @cred is NULL or when cred_fscmp() reports it as
+ * equal to the cached one. Otherwise a reference is taken on @cred, it is
+ * installed with xchg(), and the previously cached credential is released.
+ */
+static void
+pnfs_set_layout_cred(struct pnfs_layout_hdr *lo, const struct cred *cred)
+{
+	const struct cred *prev;
+
+	if (!cred || cred_fscmp(lo->plh_lc_cred, cred) == 0)
+		return;
+
+	prev = xchg(&lo->plh_lc_cred, get_cred(cred));
+	put_cred(prev);
+}
+
+/*
+ * Update lo->plh_stateid with @new if it is more recent.
+ *
+ * @cred: credential to adopt; only used when the layout is currently
+ *        invalid (see pnfs_set_layout_cred()).
+ * @update_barrier: when true, the barrier is moved to @new's seqid.
+ *
+ * If the layout is not valid, @new is adopted unconditionally: the stateid
+ * is copied, the barrier is reset to its seqid, pending layoutreturn info
+ * is cleared and NFS_LAYOUT_INVALID_STID is cleared.
+ */
+void
+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
+ const struct cred *cred, bool update_barrier)
+{
+ u32 oldseq = be32_to_cpu(lo->plh_stateid.seqid);
+ u32 newseq = be32_to_cpu(new->seqid);
+
+ /* No valid stateid yet: start afresh from @new */
+ if (!pnfs_layout_is_valid(lo)) {
+ pnfs_set_layout_cred(lo, cred);
+ nfs4_stateid_copy(&lo->plh_stateid, new);
+ lo->plh_barrier = newseq;
+ pnfs_clear_layoutreturn_info(lo);
+ clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ return;
+ }
+
+ if (pnfs_seqid_is_newer(newseq, oldseq))
+ nfs4_stateid_copy(&lo->plh_stateid, new);
+
+ if (update_barrier) {
+ pnfs_barrier_update(lo, newseq);
+ return;
+ }
+ /*
+ * Because of wraparound, we want to keep the barrier
+ * "close" to the current seqids. We really only want to
+ * get here from a layoutget call.
+ */
+ if (atomic_read(&lo->plh_outstanding) == 1)
+ pnfs_barrier_update(lo, be32_to_cpu(lo->plh_stateid.seqid));
+}
+
+/*
+ * Tell whether @stateid is blocked by the layout's barrier.
+ *
+ * A barrier of zero never blocks. Otherwise the stateid is blocked when
+ * pnfs_seqid_is_newer() reports the barrier as newer than its seqid.
+ */
+static bool
+pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
+		const nfs4_stateid *stateid)
+{
+	if (!lo->plh_barrier)
+		return false;
+	return pnfs_seqid_is_newer(lo->plh_barrier,
+				   be32_to_cpu(stateid->seqid));
+}
+
+/*
+ * LAYOUTGETs are blocked while lo->plh_block_lgets is non-zero or while
+ * NFS_LAYOUT_BULK_RECALL is set on the layout header.
+ */
+static bool
+pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
+{
+	if (lo->plh_block_lgets)
+		return true;
+	return test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+}
+
+/*
+ * Find the nfs_server to use for a LAYOUTGET.
+ *
+ * With an inode, that inode's server is returned. Without one, the server
+ * is taken from the inode of the parent directory of @ctx's dentry; the
+ * parent dentry reference is only held for the duration of the lookup.
+ */
+static struct nfs_server *
+pnfs_find_server(struct inode *inode, struct nfs_open_context *ctx)
+{
+	struct nfs_server *found;
+	struct dentry *parent;
+
+	if (inode)
+		return NFS_SERVER(inode);
+
+	parent = dget_parent(ctx->dentry);
+	found = NFS_SERVER(parent->d_inode);
+	dput(parent);
+	return found;
+}
+
+/*
+ * Free a page array built by nfs4_alloc_pages().
+ *
+ * @pages: array of page pointers, may be NULL (in which case nothing is done)
+ * @size:  number of slots to examine
+ *
+ * Pages are released slot by slot until @size slots were visited or a NULL
+ * slot is met, then the array itself is freed.
+ */
+static void nfs4_free_pages(struct page **pages, size_t size)
+{
+	size_t i;	/* same type as @size: no signed/unsigned comparison */
+
+	if (!pages)
+		return;
+
+	for (i = 0; i < size; i++) {
+		if (!pages[i])
+			break;
+		__free_page(pages[i]);
+	}
+	kfree(pages);
+}
+
+/*
+ * Allocate an array of @size page pointers and one page for every slot.
+ *
+ * @size:      number of pages wanted
+ * @gfp_flags: allocation flags used for both the array and the pages
+ *
+ * Returns the array, or NULL on allocation failure, in which case every
+ * page obtained so far and the array itself have been freed.
+ */
+static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags)
+{
+	struct page **pages;
+	size_t i;	/* same type as @size: no signed/unsigned comparison */
+
+	pages = kmalloc_array(size, sizeof(struct page *), gfp_flags);
+	if (!pages) {
+		dprintk("%s: can't alloc array of %zu pages\n", __func__, size);
+		return NULL;
+	}
+
+	for (i = 0; i < size; i++) {
+		pages[i] = alloc_page(gfp_flags);
+		if (!pages[i]) {
+			dprintk("%s: failed to allocate page\n", __func__);
+			/* Only the first i slots hold valid pages */
+			nfs4_free_pages(pages, i);
+			return NULL;
+		}
+	}
+
+	return pages;
+}
+
+/*
+ * Allocate and fill in the arguments for a LAYOUTGET.
+ *
+ * @ino:       inode the layout is for; may be NULL, in which case the server
+ *             is looked up through @ctx (see pnfs_find_server())
+ * @ctx:       open context; dereferenced unconditionally, a reference is
+ *             taken with get_nfs_open_context()
+ * @stateid:   stateid to send
+ * @range:     requested byte range and iomode
+ * @gfp_flags: allocation flags, also recorded in the returned structure
+ *
+ * Returns the new nfs4_layoutget, or NULL if memory could not be allocated.
+ */
+static struct nfs4_layoutget *
+pnfs_alloc_init_layoutget_args(struct inode *ino,
+ struct nfs_open_context *ctx,
+ const nfs4_stateid *stateid,
+ const struct pnfs_layout_range *range,
+ gfp_t gfp_flags)
+{
+ struct nfs_server *server = pnfs_find_server(ino, ctx);
+ size_t max_reply_sz = server->pnfs_curr_ld->max_layoutget_response;
+ size_t max_pages = max_response_pages(server);
+ struct nfs4_layoutget *lgp;
+
+ dprintk("--> %s\n", __func__);
+
+ lgp = kzalloc(sizeof(*lgp), gfp_flags);
+ if (lgp == NULL)
+ return NULL;
+
+ /*
+ * If the layout driver advertises a maximum response size, use it
+ * to shrink (never grow) the number of reply pages.
+ */
+ if (max_reply_sz) {
+ size_t npages = (max_reply_sz + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (npages < max_pages)
+ max_pages = npages;
+ }
+
+ lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags);
+ if (!lgp->args.layout.pages) {
+ kfree(lgp);
+ return NULL;
+ }
+ lgp->args.layout.pglen = max_pages * PAGE_SIZE;
+ lgp->res.layoutp = &lgp->args.layout;
+
+ /* Don't confuse uninitialised result and success */
+ lgp->res.status = -NFS4ERR_DELAY;
+
+ /* minlength: one page, clipped to the length of the request */
+ lgp->args.minlength = PAGE_SIZE;
+ if (lgp->args.minlength > range->length)
+ lgp->args.minlength = range->length;
+ if (ino) {
+ loff_t i_size = i_size_read(ino);
+
+ /* For reads, do not insist on bytes beyond the current file size */
+ if (range->iomode == IOMODE_READ) {
+ if (range->offset >= i_size)
+ lgp->args.minlength = 0;
+ else if (i_size - range->offset < lgp->args.minlength)
+ lgp->args.minlength = i_size - range->offset;
+ }
+ }
+ lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+ pnfs_copy_range(&lgp->args.range, range);
+ lgp->args.type = server->pnfs_curr_ld->id;
+ lgp->args.inode = ino;
+ lgp->args.ctx = get_nfs_open_context(ctx);
+ nfs4_stateid_copy(&lgp->args.stateid, stateid);
+ lgp->gfp_flags = gfp_flags;
+ /* NOTE(review): no get_cred() here - the cred is borrowed from @ctx */
+ lgp->cred = ctx->cred;
+ return lgp;
+}
+
+/*
+ * Release everything held by a nfs4_layoutget: the reply pages, a reference
+ * on the inode's layout header (only when an inode is attached), the open
+ * context reference, and the structure itself.
+ */
+void pnfs_layoutget_free(struct nfs4_layoutget *lgp)
+{
+ /* Recover the page count from the byte length stored at allocation time */
+ size_t max_pages = lgp->args.layout.pglen / PAGE_SIZE;
+
+ nfs4_free_pages(lgp->args.layout.pages, max_pages);
+ /*
+ * NOTE(review): the matching pnfs_get_layout_hdr() is not in this
+ * function - presumably taken by the code that issued the LAYOUTGET.
+ */
+ if (lgp->args.inode)
+ pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout);
+ put_nfs_open_context(lgp->args.ctx);
+ kfree(lgp);
+}
+
+/*
+ * Cancel a pending layoutcommit on @inode.
+ *
+ * If NFS_INO_LAYOUTCOMMIT was set, it is cleared and every segment that
+ * carried NFS_LSEG_LAYOUTCOMMIT has that bit cleared and is handed to
+ * pnfs_lseg_dec_and_remove_zero() together with @head.
+ */
+static void pnfs_clear_layoutcommit(struct inode *inode,
+		struct list_head *head)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct pnfs_layout_segment *seg, *seg_next;
+
+	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+		return;
+
+	list_for_each_entry_safe(seg, seg_next, &nfsi->layout->plh_segs,
+				 pls_list) {
+		if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &seg->pls_flags))
+			pnfs_lseg_dec_and_remove_zero(seg, head);
+	}
+}
+
+/*
+ * Post-process a LAYOUTRETURN: drop the returned segments and update the
+ * layout stateid.
+ *
+ * @arg_stateid: stateid that was sent, or NULL to skip all segment work
+ * @range:       range that was returned
+ * @stateid:     stateid from the reply, or NULL if there was none; in the
+ *               NULL case the whole layout stateid is marked invalid
+ *
+ * The layoutreturn wait bit is cleared in every case.
+ */
+void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *arg_stateid,
+ const struct pnfs_layout_range *range,
+ const nfs4_stateid *stateid)
+{
+ struct inode *inode = lo->plh_inode;
+ LIST_HEAD(freeme);
+
+ spin_lock(&inode->i_lock);
+ /* Only act if the layout is valid and still the one we returned */
+ if (!pnfs_layout_is_valid(lo) || !arg_stateid ||
+ !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid))
+ goto out_unlock;
+ if (stateid) {
+ u32 seq = be32_to_cpu(arg_stateid->seqid);
+
+ pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);
+ pnfs_free_returned_lsegs(lo, &freeme, range, seq);
+ pnfs_set_layout_stateid(lo, stateid, NULL, true);
+ } else
+ pnfs_mark_layout_stateid_invalid(lo, &freeme);
+out_unlock:
+ pnfs_clear_layoutreturn_waitbit(lo);
+ spin_unlock(&inode->i_lock);
+ /* Segments gathered under i_lock are freed after dropping it */
+ pnfs_free_lseg_list(&freeme);
+
+}
+
+/*
+ * Try to claim the right to send a LAYOUTRETURN for @lo.
+ *
+ * @stateid: out - stateid to send
+ * @cred:    out - referenced credential to send it with
+ * @iomode:  out, optional - iomode to return
+ *
+ * Returns false, leaving the outputs untouched, if LAYOUTGETs are
+ * outstanding or NFS_LAYOUT_RETURN_LOCK is already held. Returns true
+ * after setting NFS_LAYOUT_RETURN_LOCK and NFS_LAYOUT_RETURN and taking a
+ * reference on @lo.
+ *
+ * The callers visible in this file invoke it with inode->i_lock held.
+ */
+static bool
+pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
+ nfs4_stateid *stateid,
+ const struct cred **cred,
+ enum pnfs_iomode *iomode)
+{
+ /* Serialise LAYOUTGET/LAYOUTRETURN */
+ if (atomic_read(&lo->plh_outstanding) != 0)
+ return false;
+ if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
+ return false;
+ set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
+ pnfs_get_layout_hdr(lo);
+ nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ *cred = get_cred(lo->plh_lc_cred);
+ if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
+ /* Use the recorded return seqid/iomode, then reset them */
+ if (lo->plh_return_seq != 0)
+ stateid->seqid = cpu_to_be32(lo->plh_return_seq);
+ if (iomode != NULL)
+ *iomode = lo->plh_return_iomode;
+ pnfs_clear_layoutreturn_info(lo);
+ } else if (iomode != NULL)
+ *iomode = IOMODE_ANY;
+ pnfs_barrier_update(lo, be32_to_cpu(stateid->seqid));
+ return true;
+}
+
+/*
+ * Fill in LAYOUTRETURN arguments for layout header @lo.
+ *
+ * The returned range always starts at offset 0 with length
+ * NFS4_MAX_UINT64; only the iomode comes from the caller. The layout type
+ * is taken from the layout driver of the inode's server.
+ */
+static void
+pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args,
+		struct pnfs_layout_hdr *lo,
+		const nfs4_stateid *stateid,
+		enum pnfs_iomode iomode)
+{
+	struct inode *ino = lo->plh_inode;
+
+	args->inode = ino;
+	args->layout = lo;
+	args->layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
+
+	args->range.offset = 0;
+	args->range.length = NFS4_MAX_UINT64;
+	args->range.iomode = iomode;
+
+	nfs4_stateid_copy(&args->stateid, stateid);
+}
+
+/*
+ * Build and send a LAYOUTRETURN for @lo.
+ *
+ * @stateid: stateid to return
+ * @pcred:   in/out - the credential is taken over by this function and
+ *           *pcred is reset to NULL
+ * @iomode:  iomode to return
+ * @sync:    passed through to nfs4_proc_layoutreturn()
+ *
+ * On allocation failure the layoutreturn wait bit is cleared and both the
+ * credential and a layout header reference are dropped here.
+ *
+ * Returns -ENOMEM or the status of nfs4_proc_layoutreturn().
+ */
+static int
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *stateid,
+ const struct cred **pcred,
+ enum pnfs_iomode iomode,
+ bool sync)
+{
+ struct inode *ino = lo->plh_inode;
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
+ struct nfs4_layoutreturn *lrp;
+ const struct cred *cred = *pcred;
+ int status = 0;
+
+ /* Ownership of the credential moves to this function */
+ *pcred = NULL;
+ lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
+ if (unlikely(lrp == NULL)) {
+ status = -ENOMEM;
+ spin_lock(&ino->i_lock);
+ pnfs_clear_layoutreturn_waitbit(lo);
+ spin_unlock(&ino->i_lock);
+ put_cred(cred);
+ pnfs_put_layout_hdr(lo);
+ goto out;
+ }
+
+ pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode);
+ lrp->args.ld_private = &lrp->ld_private;
+ lrp->clp = NFS_SERVER(ino)->nfs_client;
+ lrp->cred = cred;
+ /* Optional layout-driver hook to add its private data */
+ if (ld->prepare_layoutreturn)
+ ld->prepare_layoutreturn(&lrp->args);
+
+ status = nfs4_proc_layoutreturn(lrp, sync);
+out:
+ dprintk("<-- %s status: %d\n", __func__, status);
+ return status;
+}
+
+/*
+ * Mark the segments matching @iomode/@seq over the whole file for return,
+ * collecting them on lo->plh_return_segs.
+ *
+ * Returns false only when pnfs_mark_matching_lsegs_return() reports -EBUSY.
+ */
+static bool
+pnfs_layout_segments_returnable(struct pnfs_layout_hdr *lo,
+		enum pnfs_iomode iomode,
+		u32 seq)
+{
+	struct pnfs_layout_range whole_file = {
+		.iomode = iomode,
+		.length = NFS4_MAX_UINT64,
+	};
+
+	if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
+					    &whole_file, seq) == -EBUSY)
+		return false;
+	return true;
+}
+
+/*
+ * Return true if a layoutreturn is needed: one must have been requested
+ * (NFS_LAYOUT_RETURN_REQUESTED) and the segments covered by the recorded
+ * return iomode/seqid must be returnable.
+ */
+static bool
+pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
+{
+	if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+		return pnfs_layout_segments_returnable(lo,
+						       lo->plh_return_iomode,
+						       lo->plh_return_seq);
+	return false;
+}
+
+/*
+ * Send an asynchronous LAYOUTRETURN for @lo if one has been requested and
+ * can be sent now. Does nothing when NFS_LAYOUT_RETURN_REQUESTED is clear.
+ */
+static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct inode *inode= lo->plh_inode;
+
+ /* Cheap unlocked check first; re-evaluated under i_lock below */
+ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ return;
+ spin_lock(&inode->i_lock);
+ if (pnfs_layout_need_return(lo)) {
+ const struct cred *cred;
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode;
+ bool send;
+
+ send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
+ spin_unlock(&inode->i_lock);
+ if (send) {
+ /* Send an async layoutreturn so we don't deadlock */
+ pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
+ }
+ } else
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
+ * when the layout segment list is empty.
+ *
+ * Note that a pnfs_layout_hdr can exist with an empty layout segment
+ * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the
+ * deviceid is marked invalid.
+ *
+ * Returns 0 if there was nothing to send, otherwise the status of the
+ * synchronous pnfs_send_layoutreturn() call.
+ */
+int
+_pnfs_return_layout(struct inode *ino)
+{
+ struct pnfs_layout_hdr *lo = NULL;
+ struct nfs_inode *nfsi = NFS_I(ino);
+ /* Whole file, any iomode */
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+ LIST_HEAD(tmp_list);
+ const struct cred *cred;
+ nfs4_stateid stateid;
+ int status = 0;
+ bool send, valid_layout;
+
+ dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);
+
+ spin_lock(&ino->i_lock);
+ lo = nfsi->layout;
+ if (!lo) {
+ spin_unlock(&ino->i_lock);
+ dprintk("NFS: %s no layout to return\n", __func__);
+ goto out;
+ }
+ /* Reference matched in nfs4_layoutreturn_release */
+ pnfs_get_layout_hdr(lo);
+ /* Is there an outstanding layoutreturn ? */
+ if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
+ /* Wait for it without holding i_lock */
+ spin_unlock(&ino->i_lock);
+ if (wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
+ TASK_UNINTERRUPTIBLE))
+ goto out_put_layout_hdr;
+ spin_lock(&ino->i_lock);
+ }
+ /* Sample validity before the segments are marked for return */
+ valid_layout = pnfs_layout_is_valid(lo);
+ pnfs_clear_layoutcommit(ino, &tmp_list);
+ pnfs_mark_matching_lsegs_return(lo, &tmp_list, &range, 0);
+
+ /* Optional layout-driver hook for the returned range */
+ if (NFS_SERVER(ino)->pnfs_curr_ld->return_range)
+ NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
+
+ /* Don't send a LAYOUTRETURN if list was initially empty */
+ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) ||
+ !valid_layout) {
+ spin_unlock(&ino->i_lock);
+ dprintk("NFS: %s no layout segments to return\n", __func__);
+ goto out_wait_layoutreturn;
+ }
+
+ send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL);
+ spin_unlock(&ino->i_lock);
+ if (send)
+ status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true);
+out_wait_layoutreturn:
+ /* Do not return while a layoutreturn is still in flight */
+ wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE);
+out_put_layout_hdr:
+ pnfs_free_lseg_list(&tmp_list);
+ pnfs_put_layout_hdr(lo);
+out:
+ dprintk("<-- %s status: %d\n", __func__, status);
+ return status;
+}
+
+/*
+ * Wait for writeback on @inode, do a synchronous layoutcommit and, if that
+ * succeeded, return the layout. New LAYOUTGETs are blocked for the duration
+ * through lo->plh_block_lgets.
+ *
+ * Returns 0 if the inode has no layout, otherwise the first non-zero result
+ * of pnfs_layoutcommit_inode() / _pnfs_return_layout(), or 0.
+ */
+int
+pnfs_commit_and_return_layout(struct inode *inode)
+{
+ struct pnfs_layout_hdr *lo;
+ int ret;
+
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (lo == NULL) {
+ spin_unlock(&inode->i_lock);
+ return 0;
+ }
+ /* Pin the header: it is used again after i_lock is dropped */
+ pnfs_get_layout_hdr(lo);
+ /* Block new layoutgets and read/write to ds */
+ lo->plh_block_lgets++;
+ spin_unlock(&inode->i_lock);
+ filemap_fdatawait(inode->i_mapping);
+ ret = pnfs_layoutcommit_inode(inode, true);
+ if (ret == 0)
+ ret = _pnfs_return_layout(inode);
+ spin_lock(&inode->i_lock);
+ lo->plh_block_lgets--;
+ spin_unlock(&inode->i_lock);
+ pnfs_put_layout_hdr(lo);
+ return ret;
+}
+
+/*
+ * Return-on-close: decide whether the layout of @ino should be returned
+ * and, if so, set up the LAYOUTRETURN.
+ *
+ * @args: filled in when true is returned
+ * @res:  lrs_present is reset when true is returned
+ * @cred: credential of the caller's operation; the layoutreturn is only
+ *        handed back to the caller if it matches the layout's credential
+ *
+ * Returns true when @args has been initialised for the caller to send the
+ * layoutreturn itself. Returns false otherwise; if a layoutreturn was
+ * prepared but the credentials differ, it is sent synchronously from here
+ * instead. A synchronous layoutcommit is issued in all cases that get past
+ * the initial nfs_have_layout() check.
+ */
+bool pnfs_roc(struct inode *ino,
+ struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ const struct cred *cred)
+{
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct nfs_open_context *ctx;
+ struct nfs4_state *state;
+ struct pnfs_layout_hdr *lo;
+ struct pnfs_layout_segment *lseg, *next;
+ const struct cred *lc_cred;
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode = 0;
+ bool layoutreturn = false, roc = false;
+ bool skip_read = false;
+
+ if (!nfs_have_layout(ino))
+ return false;
+retry:
+ rcu_read_lock();
+ spin_lock(&ino->i_lock);
+ lo = nfsi->layout;
+ if (!lo || !pnfs_layout_is_valid(lo) ||
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ /*
+ * NOTE(review): lo is NULL from here on, so the final
+ * pnfs_put_layout_hdr(lo) relies on that helper accepting
+ * NULL - confirm against its definition.
+ */
+ lo = NULL;
+ goto out_noroc;
+ }
+ pnfs_get_layout_hdr(lo);
+ /* A layoutreturn is in flight: wait for it, then start over */
+ if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
+ spin_unlock(&ino->i_lock);
+ rcu_read_unlock();
+ wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
+ TASK_UNINTERRUPTIBLE);
+ pnfs_put_layout_hdr(lo);
+ goto retry;
+ }
+
+ /* no roc if we hold a delegation */
+ if (nfs4_check_delegation(ino, FMODE_READ)) {
+ if (nfs4_check_delegation(ino, FMODE_WRITE))
+ goto out_noroc;
+ /* Read-only delegation: keep the read layouts */
+ skip_read = true;
+ }
+
+ list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
+ state = ctx->state;
+ if (state == NULL)
+ continue;
+ /* Don't return layout if there is open file state */
+ if (state->state & FMODE_WRITE)
+ goto out_noroc;
+ if (state->state & FMODE_READ)
+ skip_read = true;
+ }
+
+
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) {
+ if (skip_read && lseg->pls_range.iomode == IOMODE_READ)
+ continue;
+ /* Only segments flagged return-on-close are returned */
+ if (!test_and_clear_bit(NFS_LSEG_ROC, &lseg->pls_flags))
+ continue;
+ /*
+ * Note: mark lseg for return so pnfs_layout_remove_lseg
+ * doesn't invalidate the layout for us.
+ */
+ set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
+ if (!mark_lseg_invalid(lseg, &lo->plh_return_segs))
+ continue;
+ pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
+ }
+
+ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ goto out_noroc;
+
+ /*
+ * A layoutreturn has been requested: try to claim it. The
+ * reference taken by pnfs_prepare_layoutreturn() is the one
+ * dropped in pnfs_roc_release().
+ */
+ layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode);
+ /* If the creds don't match, we can't compound the layoutreturn */
+ if (!layoutreturn || cred_fscmp(cred, lc_cred) != 0)
+ goto out_noroc;
+
+ /* The caller sends the layoutreturn; nothing left to send from here */
+ roc = layoutreturn;
+ pnfs_init_layoutreturn_args(args, lo, &stateid, iomode);
+ res->lrs_present = 0;
+ layoutreturn = false;
+ put_cred(lc_cred);
+
+out_noroc:
+ spin_unlock(&ino->i_lock);
+ rcu_read_unlock();
+ pnfs_layoutcommit_inode(ino, true);
+ if (roc) {
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
+ if (ld->prepare_layoutreturn)
+ ld->prepare_layoutreturn(args);
+ pnfs_put_layout_hdr(lo);
+ return true;
+ }
+ /* Prepared but not handed to the caller: send it ourselves */
+ if (layoutreturn)
+ pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, true);
+ pnfs_put_layout_hdr(lo);
+ return false;
+}
+
+/*
+ * Interpret the LAYOUTRETURN status of a return-on-close.
+ *
+ * @task:  the RPC task that carried the layoutreturn
+ * @argpp: in/out - layoutreturn arguments; *argpp == NULL means there was
+ *         no layoutreturn and 0 is returned immediately
+ * @respp: in/out - layoutreturn result
+ * @ret:   in/out - layoutreturn status, may be rewritten
+ *
+ * When the function falls out of the switch, *argpp and *respp are reset
+ * to NULL; the early returns leave them in place.
+ *
+ * Returns 0 or -EAGAIN.
+ */
+int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp,
+ struct nfs4_layoutreturn_res **respp, int *ret)
+{
+ struct nfs4_layoutreturn_args *arg = *argpp;
+ /* Default for every status not handled explicitly below */
+ int retval = -EAGAIN;
+
+ if (!arg)
+ return 0;
+ /* Handle Layoutreturn errors */
+ switch (*ret) {
+ case 0:
+ retval = 0;
+ break;
+ case -NFS4ERR_NOMATCHING_LAYOUT:
+ /* Was there an RPC level error? If not, retry */
+ if (task->tk_rpc_status == 0)
+ break;
+ /* If the call was not sent, let caller handle it */
+ if (!RPC_WAS_SENT(task))
+ return 0;
+ /*
+ * Otherwise, assume the call succeeded and
+ * that we need to release the layout
+ */
+ *ret = 0;
+ (*respp)->lrs_present = 0;
+ retval = 0;
+ break;
+ case -NFS4ERR_DELAY:
+ /* Let the caller handle the retry */
+ *ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ return 0;
+ case -NFS4ERR_OLD_STATEID:
+ /* Could not refresh the stateid: give up on this layoutreturn */
+ if (!nfs4_layout_refresh_old_stateid(&arg->stateid,
+ &arg->range, arg->inode))
+ break;
+ /* Stateid refreshed: keep the arguments for the retry */
+ *ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ return -EAGAIN;
+ }
+ *argpp = NULL;
+ *respp = NULL;
+ return retval;
+}
+
+/*
+ * Finish a return-on-close layoutreturn.
+ *
+ * @args: the arguments set up by pnfs_roc()
+ * @res:  the layoutreturn result
+ * @ret:  final status of the layoutreturn
+ *
+ * Frees the returned segments as appropriate for @ret, releases any layout
+ * driver private data and drops a reference on the layout header.
+ */
+void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ int ret)
+{
+ struct pnfs_layout_hdr *lo = args->layout;
+ struct inode *inode = args->inode;
+ const nfs4_stateid *arg_stateid = NULL;
+ const nfs4_stateid *res_stateid = NULL;
+ struct nfs4_xdr_opaque_data *ld_private = args->ld_private;
+
+ switch (ret) {
+ case -NFS4ERR_NOMATCHING_LAYOUT:
+ /*
+ * Re-request the return if the layout is still the one we
+ * tried to return. arg_stateid stays NULL, so no segments
+ * are freed below.
+ */
+ spin_lock(&inode->i_lock);
+ if (pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match_other(&args->stateid, &lo->plh_stateid))
+ pnfs_set_plh_return_info(lo, args->range.iomode, 0);
+ spin_unlock(&inode->i_lock);
+ break;
+ case 0:
+ if (res->lrs_present)
+ res_stateid = &res->stateid;
+ fallthrough;
+ default:
+ arg_stateid = &args->stateid;
+ }
+ trace_nfs4_layoutreturn_on_close(args->inode, &args->stateid, ret);
+ pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range,
+ res_stateid);
+ if (ld_private && ld_private->ops && ld_private->ops->free)
+ ld_private->ops->free(ld_private);
+ pnfs_put_layout_hdr(lo);
+}
+
+/*
+ * If a layoutreturn is in progress on @ino (NFS_LAYOUT_RETURN set), put
+ * @task to sleep on the server's roc_rpcwaitq.
+ *
+ * Returns true if the task was put to sleep, false otherwise.
+ */
+bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
+{
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct pnfs_layout_hdr *lo;
+ bool sleep = false;
+
+ /* we might not have grabbed lo reference. so need to check under
+ * i_lock */
+ spin_lock(&ino->i_lock);
+ lo = nfsi->layout;
+ if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
+ rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
+ sleep = true;
+ }
+ spin_unlock(&ino->i_lock);
+ return sleep;
+}
+
+/*
+ * Compare two layout segments for sorting into layout cache.
+ * We want to preferentially return RW over RO layouts, so ensure those
+ * are seen first.
+ *
+ * Returns a negative, zero or positive value when @l1 sorts before, equal
+ * to, or after @l2. Keys, in order: offset (ascending), length
+ * (descending), then IOMODE_READ after any other iomode.
+ */
+static s64
+pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
+{
+ s64 d;
+
+ /* high offset > low offset */
+ d = l1->offset - l2->offset;
+ if (d)
+ return d;
+
+ /* short length > long length */
+ d = l2->length - l1->length;
+ if (d)
+ return d;
+
+ /* read > read/write */
+ return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
+}
+
+/*
+ * True when @l1 sorts strictly after @l2 according to
+ * pnfs_lseg_range_cmp().
+ */
+static bool
+pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
+		const struct pnfs_layout_range *l2)
+{
+	s64 order = pnfs_lseg_range_cmp(l1, l2);
+
+	return order > 0;
+}
+
+/*
+ * Default "do_merge" callback for pnfs_generic_layout_insert_lseg():
+ * never merge a new segment with an existing one.
+ */
+static bool
+pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
+ struct pnfs_layout_segment *old)
+{
+ return false;
+}
+
+/*
+ * Insert @lseg into lo->plh_segs, keeping the list ordered.
+ *
+ * @is_after: returns true if its first range sorts after its second
+ * @do_merge: returns true if the existing segment (second argument) is
+ *            superseded by the new one (first argument); such segments
+ *            are invalidated and collected on @free_me
+ * @free_me:  list receiving invalidated segments
+ *
+ * Invalid segments already on the list are ignored. @lseg is placed in
+ * front of the first valid segment it does not sort after, or at the tail.
+ * A reference on @lo is taken for the inserted segment.
+ */
+void
+pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ bool (*is_after)(const struct pnfs_layout_range *,
+ const struct pnfs_layout_range *),
+ bool (*do_merge)(struct pnfs_layout_segment *,
+ struct pnfs_layout_segment *),
+ struct list_head *free_me)
+{
+ struct pnfs_layout_segment *lp, *tmp;
+
+ dprintk("%s:Begin\n", __func__);
+
+ list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
+ if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
+ continue;
+ if (do_merge(lseg, lp)) {
+ mark_lseg_invalid(lp, free_me);
+ continue;
+ }
+ if (is_after(&lseg->pls_range, &lp->pls_range))
+ continue;
+ /* list_add_tail() on lp's node links lseg just before lp */
+ list_add_tail(&lseg->pls_list, &lp->pls_list);
+ dprintk("%s: inserted lseg %p "
+ "iomode %d offset %llu length %llu before "
+ "lp %p iomode %d offset %llu length %llu\n",
+ __func__, lseg, lseg->pls_range.iomode,
+ lseg->pls_range.offset, lseg->pls_range.length,
+ lp, lp->pls_range.iomode, lp->pls_range.offset,
+ lp->pls_range.length);
+ goto out;
+ }
+ list_add_tail(&lseg->pls_list, &lo->plh_segs);
+ dprintk("%s: inserted lseg %p "
+ "iomode %d offset %llu length %llu at tail\n",
+ __func__, lseg, lseg->pls_range.iomode,
+ lseg->pls_range.offset, lseg->pls_range.length);
+out:
+ pnfs_get_layout_hdr(lo);
+
+ dprintk("%s:Return\n", __func__);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);
+
+/*
+ * Add @lseg to layout header @lo.
+ *
+ * The layout driver's add_lseg hook is used when it provides one; otherwise
+ * the generic ordered insert is used with the default comparison and the
+ * never-merge policy.
+ */
+static void
+pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+		struct pnfs_layout_segment *lseg,
+		struct list_head *free_me)
+{
+	struct pnfs_layoutdriver_type *driver;
+
+	driver = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
+	if (driver->add_lseg != NULL) {
+		driver->add_lseg(lo, lseg, free_me);
+		return;
+	}
+
+	pnfs_generic_layout_insert_lseg(lo, lseg,
+					pnfs_lseg_range_is_after,
+					pnfs_lseg_no_merge,
+					free_me);
+}
+
+/*
+ * Allocate a layout header for @ino and initialise it.
+ *
+ * The new header starts with a reference count of 1, empty lists, a
+ * referenced copy of @ctx's credential and NFS_LAYOUT_INVALID_STID set.
+ * It is not attached to the inode here.
+ *
+ * Returns the header, or NULL if the allocation failed.
+ */
+static struct pnfs_layout_hdr *
+alloc_init_layout_hdr(struct inode *ino,
+ struct nfs_open_context *ctx,
+ gfp_t gfp_flags)
+{
+ struct pnfs_layout_hdr *lo;
+
+ lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
+ if (!lo)
+ return NULL;
+ refcount_set(&lo->plh_refcount, 1);
+ INIT_LIST_HEAD(&lo->plh_layouts);
+ INIT_LIST_HEAD(&lo->plh_segs);
+ INIT_LIST_HEAD(&lo->plh_return_segs);
+ INIT_LIST_HEAD(&lo->plh_bulk_destroy);
+ lo->plh_inode = ino;
+ lo->plh_lc_cred = get_cred(ctx->cred);
+ /* No layout stateid yet (non-atomic update of the flags word) */
+ lo->plh_flags |= 1 << NFS_LAYOUT_INVALID_STID;
+ return lo;
+}
+
+/*
+ * Return the inode's layout header, allocating one if there is none.
+ *
+ * Called with ino->i_lock held; the lock is dropped around the allocation
+ * and held again on return.
+ *
+ * An existing header is returned with an extra reference. A freshly
+ * allocated one is installed in the inode and returned with the initial
+ * reference set by alloc_init_layout_hdr(). Returns NULL only if the
+ * allocation failed and no other task installed a header meanwhile.
+ */
+static struct pnfs_layout_hdr *
+pnfs_find_alloc_layout(struct inode *ino,
+ struct nfs_open_context *ctx,
+ gfp_t gfp_flags)
+ __releases(&ino->i_lock)
+ __acquires(&ino->i_lock)
+{
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct pnfs_layout_hdr *new = NULL;
+
+ dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
+
+ if (nfsi->layout != NULL)
+ goto out_existing;
+ /* Allocate outside of the spinlock */
+ spin_unlock(&ino->i_lock);
+ new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
+ spin_lock(&ino->i_lock);
+
+ if (likely(nfsi->layout == NULL)) { /* Won the race? */
+ nfsi->layout = new;
+ return new;
+ } else if (new != NULL)
+ /* Lost the race: discard ours and use the installed header */
+ pnfs_free_layout_hdr(new);
+out_existing:
+ pnfs_get_layout_hdr(nfsi->layout);
+ return nfsi->layout;
+}
+
+/*
+ * iomode matching rules:
+ *
+ * requested   lseg     strict_iomode   match
+ * iomode      iomode
+ * ---------   ------   -------------   -----
+ * ANY         READ     N/A             true
+ * ANY         RW       N/A             true
+ * RW          READ     N/A             false
+ * RW          RW       N/A             true
+ * READ        READ     N/A             true
+ * READ        RW       true            false
+ * READ        RW       false           true
+ *
+ * Note that, as coded, any iomode mismatch is rejected when
+ * strict_iomode is set.
+ *
+ * Besides the iomode, the segment must intersect the requested range and
+ * must contain its first byte.
+ */
+static bool
+pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
+		const struct pnfs_layout_range *range,
+		bool strict_iomode)
+{
+	struct pnfs_layout_range first_byte;
+
+	/* A RW request can only be satisfied by a RW segment */
+	if (range->iomode == IOMODE_RW && ls_range->iomode != IOMODE_RW)
+		return false;
+	if (strict_iomode && range->iomode != ls_range->iomode)
+		return false;
+	if (!pnfs_lseg_range_intersecting(ls_range, range))
+		return false;
+
+	/* first_byte covers only the first byte in the range */
+	first_byte = *range;
+	first_byte.length = 1;
+	return pnfs_lseg_range_contained(ls_range, &first_byte);
+}
+
+/*
+ * lookup range in layout
+ *
+ * Returns the first valid segment on lo->plh_segs that matches @range
+ * (see pnfs_lseg_range_match()), with a reference taken through
+ * pnfs_get_lseg(), or NULL if there is none.
+ */
+static struct pnfs_layout_segment *
+pnfs_find_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range,
+ bool strict_iomode)
+{
+ struct pnfs_layout_segment *lseg, *ret = NULL;
+
+ dprintk("%s:Begin\n", __func__);
+
+ list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
+ if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
+ pnfs_lseg_range_match(&lseg->pls_range, range,
+ strict_iomode)) {
+ ret = pnfs_get_lseg(lseg);
+ break;
+ }
+ }
+
+ dprintk("%s:Return lseg %p ref %d\n",
+ __func__, ret, ret ? refcount_read(&ret->pls_refcount) : 0);
+ return ret;
+}
+
+/*
+ * Use mdsthreshold hints set at each OPEN to determine if I/O should go
+ * to the MDS or over pNFS
+ *
+ * The nfs_inode read_io and write_io fields are cumulative counters reset
+ * when there are no layout segments. Note that in pnfs_update_layout iomode
+ * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
+ * WRITE request.
+ *
+ * A return of true means use MDS I/O.
+ *
+ * From rfc 5661:
+ * If a file's size is smaller than the file size threshold, data accesses
+ * SHOULD be sent to the metadata server. If an I/O request has a length that
+ * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
+ * server. If both file size and I/O size are provided, the client SHOULD
+ * reach or exceed both thresholds before sending its read or write
+ * requests to the data server.
+ */
+static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
+ struct inode *ino, int iomode)
+{
+ struct nfs4_threshold *t = ctx->mdsthreshold;
+ struct nfs_inode *nfsi = NFS_I(ino);
+ loff_t fsize = i_size_read(ino);
+ /*
+ * size/io: the value is below the respective threshold.
+ * size_set/io_set: the server supplied that threshold.
+ */
+ bool size = false, size_set = false, io = false, io_set = false, ret = false;
+
+ /* No thresholds were given: never force MDS I/O */
+ if (t == NULL)
+ return ret;
+
+ dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
+ __func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);
+
+ switch (iomode) {
+ case IOMODE_READ:
+ if (t->bm & THRESHOLD_RD) {
+ dprintk("%s fsize %llu\n", __func__, fsize);
+ size_set = true;
+ if (fsize < t->rd_sz)
+ size = true;
+ }
+ if (t->bm & THRESHOLD_RD_IO) {
+ dprintk("%s nfsi->read_io %llu\n", __func__,
+ nfsi->read_io);
+ io_set = true;
+ if (nfsi->read_io < t->rd_io_sz)
+ io = true;
+ }
+ break;
+ case IOMODE_RW:
+ if (t->bm & THRESHOLD_WR) {
+ dprintk("%s fsize %llu\n", __func__, fsize);
+ size_set = true;
+ if (fsize < t->wr_sz)
+ size = true;
+ }
+ if (t->bm & THRESHOLD_WR_IO) {
+ dprintk("%s nfsi->write_io %llu\n", __func__,
+ nfsi->write_io);
+ io_set = true;
+ if (nfsi->write_io < t->wr_io_sz)
+ io = true;
+ }
+ break;
+ }
+ /*
+ * With both thresholds supplied, use the MDS only while both values
+ * are below them; with a single one, that one decides.
+ */
+ if (size_set && io_set) {
+ if (size && io)
+ ret = true;
+ } else if (size || io)
+ ret = true;
+
+ dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
+ return ret;
+}
+
+/*
+ * Wait (killably) for an in-flight layoutreturn on @lo to finish before a
+ * LAYOUTGET is retried.
+ *
+ * Returns the result of wait_on_bit_action() on NFS_LAYOUT_RETURN.
+ */
+static int pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
+{
+ /*
+ * send layoutcommit as it can hold up layoutreturn due to lseg
+ * reference
+ */
+ pnfs_layoutcommit_inode(lo->plh_inode, false);
+ return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
+ nfs_wait_bit_killable,
+ TASK_KILLABLE);
+}
+
+/* Account for one more LAYOUTGET in flight on @lo. */
+static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
+{
+ atomic_inc(&lo->plh_outstanding);
+}
+
+/*
+ * Account for the completion of a LAYOUTGET on @lo. When the last
+ * outstanding one finishes and NFS_LAYOUT_DRAIN was set, the bit is
+ * cleared and its waiters are woken.
+ */
+static void nfs_layoutget_end(struct pnfs_layout_hdr *lo)
+{
+ if (atomic_dec_and_test(&lo->plh_outstanding) &&
+ test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags))
+ wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN);
+}
+
+/* Tell whether NFS_LAYOUT_FIRST_LAYOUTGET is currently set on @lo. */
+static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo)
+{
+ return test_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags);
+}
+
+/*
+ * Release the NFS_LAYOUT_FIRST_LAYOUTGET bit lock and wake up any task
+ * waiting on it.
+ */
+static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
+{
+ unsigned long *bitlock = &lo->plh_flags;
+
+ clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
+ /* Order the bit clear before the waiter check in wake_up_bit() */
+ smp_mb__after_atomic();
+ wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
+}
+
+/*
+ * Put @lo on @server's list of layouts, once.
+ *
+ * NFS_LAYOUT_HASHED records that the header is already listed; only the
+ * caller that sets the bit performs the insertion, under the client's
+ * cl_lock.
+ */
+static void _add_to_server_list(struct pnfs_layout_hdr *lo,
+		struct nfs_server *server)
+{
+	struct nfs_client *client;
+
+	if (test_and_set_bit(NFS_LAYOUT_HASHED, &lo->plh_flags))
+		return;
+
+	client = server->nfs_client;
+
+	/* The lo must be on the clp list if there is any
+	 * chance of a CB_LAYOUTRECALL(FILE) coming in.
+	 */
+	spin_lock(&client->cl_lock);
+	list_add_tail_rcu(&lo->plh_layouts, &server->layouts);
+	spin_unlock(&client->cl_lock);
+}
+
+/*
+ * Layout segment is retrieved from the server if not cached.
+ * The appropriate layout segment is referenced and returned to the caller.
+ */
+struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino,
+ struct nfs_open_context *ctx,
+ loff_t pos,
+ u64 count,
+ enum pnfs_iomode iomode,
+ bool strict_iomode,
+ gfp_t gfp_flags)
+{
+ struct pnfs_layout_range arg = {
+ .iomode = iomode,
+ .offset = pos,
+ .length = count,
+ };
+ unsigned pg_offset;
+ struct nfs_server *server = NFS_SERVER(ino);
+ struct nfs_client *clp = server->nfs_client;
+ struct pnfs_layout_hdr *lo = NULL;
+ struct pnfs_layout_segment *lseg = NULL;
+ struct nfs4_layoutget *lgp;
+ nfs4_stateid stateid;
+ long timeout = 0;
+ unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
+ bool first;
+
+ if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_NO_PNFS);
+ goto out;
+ }
+
+ if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_MDSTHRESH);
+ goto out;
+ }
+
+lookup_again:
+ lseg = ERR_PTR(nfs4_client_recover_expired_lease(clp));
+ if (IS_ERR(lseg))
+ goto out;
+ first = false;
+ spin_lock(&ino->i_lock);
+ lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
+ if (lo == NULL) {
+ spin_unlock(&ino->i_lock);
+ lseg = ERR_PTR(-ENOMEM);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_NOMEM);
+ goto out;
+ }
+
+ /* Do we even need to bother with this? */
+ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_BULK_RECALL);
+ dprintk("%s matches recall, use MDS\n", __func__);
+ goto out_unlock;
+ }
+
+ /* if LAYOUTGET already failed once we don't try again */
+ if (pnfs_layout_io_test_failed(lo, iomode)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
+ goto out_unlock;
+ }
+
+ /*
+ * If the layout segment list is empty, but there are outstanding
+ * layoutget calls, then they might be subject to a layoutrecall.
+ */
+ if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) &&
+ atomic_read(&lo->plh_outstanding) != 0) {
+ spin_unlock(&ino->i_lock);
+ lseg = ERR_PTR(wait_on_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN,
+ TASK_KILLABLE));
+ if (IS_ERR(lseg))
+ goto out_put_layout_hdr;
+ pnfs_put_layout_hdr(lo);
+ goto lookup_again;
+ }
+
+ /*
+ * Because we free lsegs when sending LAYOUTRETURN, we need to wait
+ * for LAYOUTRETURN.
+ */
+ if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
+ spin_unlock(&ino->i_lock);
+ dprintk("%s wait for layoutreturn\n", __func__);
+ lseg = ERR_PTR(pnfs_prepare_to_retry_layoutget(lo));
+ if (!IS_ERR(lseg)) {
+ pnfs_put_layout_hdr(lo);
+ dprintk("%s retrying\n", __func__);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ lseg,
+ PNFS_UPDATE_LAYOUT_RETRY);
+ goto lookup_again;
+ }
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_RETURN);
+ goto out_put_layout_hdr;
+ }
+
+ lseg = pnfs_find_lseg(lo, &arg, strict_iomode);
+ if (lseg) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_FOUND_CACHED);
+ goto out_unlock;
+ }
+
+ /*
+ * Choose a stateid for the LAYOUTGET. If we don't have a layout
+ * stateid, or it has been invalidated, then we must use the open
+ * stateid.
+ */
+ if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
+ int status;
+
+ /*
+ * The first layoutget for the file. Need to serialize per
+ * RFC 5661 Errata 3208.
+ */
+ if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
+ &lo->plh_flags)) {
+ spin_unlock(&ino->i_lock);
+ lseg = ERR_PTR(wait_on_bit(&lo->plh_flags,
+ NFS_LAYOUT_FIRST_LAYOUTGET,
+ TASK_KILLABLE));
+ if (IS_ERR(lseg))
+ goto out_put_layout_hdr;
+ pnfs_put_layout_hdr(lo);
+ dprintk("%s retrying\n", __func__);
+ goto lookup_again;
+ }
+
+ spin_unlock(&ino->i_lock);
+ first = true;
+ status = nfs4_select_rw_stateid(ctx->state,
+ iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ,
+ NULL, &stateid, NULL);
+ if (status != 0) {
+ lseg = ERR_PTR(status);
+ trace_pnfs_update_layout(ino, pos, count,
+ iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_INVALID_OPEN);
+ nfs4_schedule_stateid_recovery(server, ctx->state);
+ pnfs_clear_first_layoutget(lo);
+ pnfs_put_layout_hdr(lo);
+ goto lookup_again;
+ }
+ spin_lock(&ino->i_lock);
+ } else {
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+ }
+
+ if (pnfs_layoutgets_blocked(lo)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_BLOCKED);
+ goto out_unlock;
+ }
+ nfs_layoutget_begin(lo);
+ spin_unlock(&ino->i_lock);
+
+ _add_to_server_list(lo, server);
+
+ pg_offset = arg.offset & ~PAGE_MASK;
+ if (pg_offset) {
+ arg.offset -= pg_offset;
+ arg.length += pg_offset;
+ }
+ if (arg.length != NFS4_MAX_UINT64)
+ arg.length = PAGE_ALIGN(arg.length);
+
+ lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags);
+ if (!lgp) {
+ lseg = ERR_PTR(-ENOMEM);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL,
+ PNFS_UPDATE_LAYOUT_NOMEM);
+ nfs_layoutget_end(lo);
+ goto out_put_layout_hdr;
+ }
+
+ lseg = nfs4_proc_layoutget(lgp, &timeout);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
+ nfs_layoutget_end(lo);
+ if (IS_ERR(lseg)) {
+ switch(PTR_ERR(lseg)) {
+ case -EBUSY:
+ if (time_after(jiffies, giveup))
+ lseg = NULL;
+ break;
+ case -ERECALLCONFLICT:
+ case -EAGAIN:
+ break;
+ case -ENODATA:
+ /* The server returned NFS4ERR_LAYOUTUNAVAILABLE */
+ pnfs_layout_set_fail_bit(
+ lo, pnfs_iomode_to_fail_bit(iomode));
+ lseg = NULL;
+ goto out_put_layout_hdr;
+ default:
+ if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
+ pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+ lseg = NULL;
+ }
+ goto out_put_layout_hdr;
+ }
+ if (lseg) {
+ if (first)
+ pnfs_clear_first_layoutget(lo);
+ trace_pnfs_update_layout(ino, pos, count,
+ iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
+ pnfs_put_layout_hdr(lo);
+ goto lookup_again;
+ }
+ } else {
+ pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+ }
+
+out_put_layout_hdr:
+ if (first)
+ pnfs_clear_first_layoutget(lo);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+ PNFS_UPDATE_LAYOUT_EXIT);
+ pnfs_put_layout_hdr(lo);
+out:
+ dprintk("%s: inode %s/%llu pNFS layout segment %s for "
+ "(%s, offset: %llu, length: %llu)\n",
+ __func__, ino->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(ino),
+ IS_ERR_OR_NULL(lseg) ? "not found" : "found",
+ iomode==IOMODE_RW ? "read/write" : "read-only",
+ (unsigned long long)pos,
+ (unsigned long long)count);
+ return lseg;
+out_unlock:
+ spin_unlock(&ino->i_lock);
+ goto out_put_layout_hdr;
+}
+EXPORT_SYMBOL_GPL(pnfs_update_layout);
+
+/*
+ * Sanity-check a layout range.
+ *
+ * Returns true only if the iomode is IOMODE_READ or IOMODE_RW, the offset
+ * is not NFS4_MAX_UINT64, the length is non-zero and, unless the length is
+ * NFS4_MAX_UINT64, the length does not exceed NFS4_MAX_UINT64 - offset.
+ */
+static bool
+pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
+{
+	if (range->iomode != IOMODE_READ && range->iomode != IOMODE_RW)
+		return false;
+	if (range->offset == NFS4_MAX_UINT64 || range->length == 0)
+		return false;
+	/* A length of NFS4_MAX_UINT64 is exempt from the overflow check */
+	if (range->length == NFS4_MAX_UINT64)
+		return true;
+	return range->length <= NFS4_MAX_UINT64 - range->offset;
+}
+
+/*
+ * Find or allocate the layout header of @ino and claim it for a first
+ * LAYOUTGET.
+ *
+ * The header is returned only if NFS_LAYOUT_INVALID_STID is set,
+ * NFS_LAYOUT_RETURN is clear, layoutgets are not blocked, and this caller
+ * is the one that set NFS_LAYOUT_FIRST_LAYOUTGET.  In that case
+ * nfs_layoutget_begin() has been called and the header has been passed to
+ * _add_to_server_list().  Otherwise pnfs_put_layout_hdr() is called on the
+ * header and NULL is returned.
+ */
+static struct pnfs_layout_hdr *
+_pnfs_grab_empty_layout(struct inode *ino, struct nfs_open_context *ctx)
+{
+	struct pnfs_layout_hdr *hdr;
+	bool claimed;
+
+	spin_lock(&ino->i_lock);
+	hdr = pnfs_find_alloc_layout(ino, ctx, GFP_KERNEL);
+	/* Evaluation order matters: test_and_set_bit() must come last */
+	claimed = hdr &&
+		  test_bit(NFS_LAYOUT_INVALID_STID, &hdr->plh_flags) &&
+		  !test_bit(NFS_LAYOUT_RETURN, &hdr->plh_flags) &&
+		  !pnfs_layoutgets_blocked(hdr) &&
+		  !test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &hdr->plh_flags);
+	if (!claimed) {
+		spin_unlock(&ino->i_lock);
+		pnfs_put_layout_hdr(hdr);
+		return NULL;
+	}
+
+	nfs_layoutget_begin(hdr);
+	spin_unlock(&ino->i_lock);
+	_add_to_server_list(hdr, NFS_SERVER(ino));
+	return hdr;
+}
+
+/*
+ * Attach LAYOUTGET arguments and results to the open data, using the inode
+ * found through data->dentry.  The requested range is the whole file, in
+ * IOMODE_RW if the open mode includes FMODE_WRITE and IOMODE_READ
+ * otherwise.  Nothing is attached if the heuristic below declines, if no
+ * empty layout could be grabbed, or if the allocation fails.
+ */
+static void _lgopen_prepare_attached(struct nfs4_opendata *data,
+				     struct nfs_open_context *ctx)
+{
+	struct inode *ino = data->dentry->d_inode;
+	struct pnfs_layout_range whole_file = {
+		.iomode = IOMODE_READ,
+		.offset = 0,
+		.length = NFS4_MAX_UINT64,
+	};
+	struct pnfs_layout_hdr *hdr;
+	struct nfs4_layoutget *lgp;
+
+	if (data->o_arg.fmode & FMODE_WRITE)
+		whole_file.iomode = IOMODE_RW;
+
+	/*
+	 * Heuristic: for reads, don't send a layoutget if the file is empty
+	 * or the mapping already holds pages.
+	 */
+	if (whole_file.iomode == IOMODE_READ) {
+		if (i_size_read(ino) == 0)
+			return;
+		if (ino->i_mapping->nrpages != 0)
+			return;
+	}
+
+	hdr = _pnfs_grab_empty_layout(ino, ctx);
+	if (!hdr)
+		return;
+
+	lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid,
+					     &whole_file, GFP_KERNEL);
+	if (lgp) {
+		data->lgp = lgp;
+		data->o_arg.lg_args = &lgp->args;
+		data->o_res.lg_res = &lgp->res;
+		return;
+	}
+
+	/* Allocation failed: undo what _pnfs_grab_empty_layout() set up */
+	pnfs_clear_first_layoutget(hdr);
+	nfs_layoutget_end(hdr);
+	pnfs_put_layout_hdr(hdr);
+}
+
+/*
+ * Attach LAYOUTGET arguments and results to the open data without passing
+ * an inode to pnfs_alloc_init_layoutget_args().  The requested range is
+ * the whole file.  If the allocation fails the open data is left untouched.
+ */
+static void _lgopen_prepare_floating(struct nfs4_opendata *data,
+				     struct nfs_open_context *ctx)
+{
+	struct pnfs_layout_range whole_file = {
+		.iomode = IOMODE_READ,
+		.offset = 0,
+		.length = NFS4_MAX_UINT64,
+	};
+	struct nfs4_layoutget *lgp;
+
+	if (data->o_arg.fmode & FMODE_WRITE)
+		whole_file.iomode = IOMODE_RW;
+
+	lgp = pnfs_alloc_init_layoutget_args(NULL, ctx, &current_stateid,
+					     &whole_file, GFP_KERNEL);
+	if (lgp) {
+		data->lgp = lgp;
+		data->o_arg.lg_args = &lgp->args;
+		data->o_res.lg_res = &lgp->res;
+	}
+}
+
+/*
+ * Set up a LAYOUTGET for this open if appropriate.  Nothing is done unless
+ * pNFS is enabled for the server of the parent directory, its layout
+ * driver sets PNFS_LAYOUTGET_ON_OPEN, and the server has NFS_CAP_LGOPEN.
+ * The "attached" variant is used when data->state is set, the "floating"
+ * variant otherwise.
+ */
+void pnfs_lgopen_prepare(struct nfs4_opendata *data,
+			 struct nfs_open_context *ctx)
+{
+	struct inode *dir = data->dir->d_inode;
+	struct nfs_server *server = NFS_SERVER(dir);
+
+	if (!pnfs_enabled_sb(server))
+		return;
+	if (!(server->pnfs_curr_ld->flags & PNFS_LAYOUTGET_ON_OPEN))
+		return;
+	/* Could check on max_ops, but currently hardcoded high enough */
+	if (!nfs_server_capable(dir, NFS_CAP_LGOPEN))
+		return;
+
+	if (data->state)
+		_lgopen_prepare_attached(data, ctx);
+	else
+		_lgopen_prepare_floating(data, ctx);
+}
+
+/*
+ * Process the LAYOUTGET result carried by @lgp.
+ *
+ * A NULL @lgp is ignored.  With a non-zero result status nothing is
+ * processed; for the statuses listed in the switch below NFS_CAP_LGOPEN is
+ * also cleared from the server capabilities.  With a zero status the reply
+ * is handed to pnfs_layout_process(); if that yields a segment, the fail
+ * bit for the requested iomode is cleared and pnfs_put_lseg() is called on
+ * the segment.
+ */
+void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp,
+		       struct nfs_open_context *ctx)
+{
+	struct nfs_server *srv = NFS_SERVER(ino);
+	struct pnfs_layout_segment *lseg;
+	struct pnfs_layout_hdr *lo;
+	u32 iomode;
+
+	if (!lgp)
+		return;
+	dprintk("%s: entered with status %i\n", __func__, lgp->res.status);
+
+	switch (lgp->res.status) {
+	case 0:
+		break;
+	/*
+	 * Halt lgopen attempts if the server doesn't recognise
+	 * the "current stateid" value, the layout type, or the
+	 * layoutget operation as being valid.
+	 * Also if it complains about too many ops in the compound
+	 * or of the request/reply being too big.
+	 */
+	case -NFS4ERR_BAD_STATEID:
+	case -NFS4ERR_NOTSUPP:
+	case -NFS4ERR_REP_TOO_BIG:
+	case -NFS4ERR_REP_TOO_BIG_TO_CACHE:
+	case -NFS4ERR_REQ_TOO_BIG:
+	case -NFS4ERR_TOO_MANY_OPS:
+	case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
+		srv->caps &= ~NFS_CAP_LGOPEN;
+		return;
+	default:
+		return;
+	}
+
+	if (lgp->args.inode) {
+		lo = NFS_I(lgp->args.inode)->layout;
+	} else {
+		/* No inode was attached when the layoutget was prepared */
+		lo = _pnfs_grab_empty_layout(ino, ctx);
+		if (!lo)
+			return;
+		lgp->args.inode = ino;
+	}
+
+	lseg = pnfs_layout_process(lgp);
+	if (IS_ERR(lseg))
+		return;
+
+	iomode = lgp->args.range.iomode;
+	pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+	pnfs_put_lseg(lseg);
+}
+
+/*
+ * Release a layoutget prepared for an open.  If an inode is attached to
+ * the args, pnfs_clear_first_layoutget() and nfs_layoutget_end() are
+ * called on that inode's layout before the layoutget is freed.  A NULL
+ * @lgp is a no-op.
+ */
+void nfs4_lgopen_release(struct nfs4_layoutget *lgp)
+{
+	struct inode *inode;
+
+	if (lgp == NULL)
+		return;
+
+	inode = lgp->args.inode;
+	if (inode) {
+		struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
+
+		pnfs_clear_first_layoutget(lo);
+		nfs_layoutget_end(lo);
+	}
+	pnfs_layoutget_free(lgp);
+}
+
+/*
+ * Build a layout segment from a LAYOUTGET reply and insert it into the
+ * layout header of lgp->args.inode.
+ *
+ * Returns the inserted segment, on which pnfs_get_lseg() has been called,
+ * or an ERR_PTR():
+ *  -EINVAL if the range in the reply fails the sanity check;
+ *  the error from the layout driver's alloc_lseg() (-ENOMEM if it
+ *  returned NULL);
+ *  -EAGAIN if the reply had to be discarded (the out_forget paths).
+ */
+struct pnfs_layout_segment *
+pnfs_layout_process(struct nfs4_layoutget *lgp)
+{
+ struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
+ struct nfs4_layoutget_res *res = &lgp->res;
+ struct pnfs_layout_segment *lseg;
+ struct inode *ino = lo->plh_inode;
+ LIST_HEAD(free_me);
+
+ if (!pnfs_sanity_check_layout_range(&res->range))
+ return ERR_PTR(-EINVAL);
+
+ /* Inject layout blob into I/O device driver */
+ lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
+ if (IS_ERR_OR_NULL(lseg)) {
+ /* A NULL return from the driver is reported as -ENOMEM */
+ if (!lseg)
+ lseg = ERR_PTR(-ENOMEM);
+
+ dprintk("%s: Could not allocate layout: error %ld\n",
+ __func__, PTR_ERR(lseg));
+ return lseg;
+ }
+
+ pnfs_init_lseg(lo, lseg, &res->range, &res->stateid);
+
+ /* Everything from here to the unlock runs under the inode's i_lock */
+ spin_lock(&ino->i_lock);
+ if (pnfs_layoutgets_blocked(lo)) {
+ dprintk("%s forget reply due to state\n", __func__);
+ goto out_forget;
+ }
+
+ /* While NFS_LAYOUT_DRAIN is set only a first layoutget is accepted */
+ if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) &&
+ !pnfs_is_first_layoutget(lo))
+ goto out_forget;
+
+ if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
+ /* existing state ID, make sure the sequence number matches. */
+ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
+ if (!pnfs_layout_is_valid(lo))
+ lo->plh_barrier = 0;
+ dprintk("%s forget reply due to sequence\n", __func__);
+ goto out_forget;
+ }
+ pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, false);
+ } else if (pnfs_layout_is_valid(lo)) {
+ /*
+ * We got an entirely new state ID. Mark all segments for the
+ * inode invalid, and retry the layoutget
+ */
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .length = NFS4_MAX_UINT64,
+ };
+ pnfs_set_plh_return_info(lo, IOMODE_ANY, 0);
+ pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
+ &range, 0);
+ goto out_forget;
+ } else {
+ /* We have a completely new layout */
+ pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true);
+ }
+
+ /* Reference taken here is the one handed back to the caller */
+ pnfs_get_lseg(lseg);
+ pnfs_layout_insert_lseg(lo, lseg, &free_me);
+
+
+ if (res->return_on_close)
+ set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
+
+ spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&free_me);
+ return lseg;
+
+out_forget:
+ /* Discard the freshly allocated segment and report -EAGAIN */
+ spin_unlock(&ino->i_lock);
+ /* NOTE(review): presumably free_lseg() relies on pls_layout - confirm */
+ lseg->pls_layout = lo;
+ NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+ pnfs_free_lseg_list(&free_me);
+ return ERR_PTR(-EAGAIN);
+}
+
+/**
+ * pnfs_mark_matching_lsegs_return - Free or return matching layout segments
+ * @lo: pointer to layout header
+ * @tmp_list: list header to be used with pnfs_free_lseg_list()
+ * @return_range: describe layout segment ranges to be returned
+ * @seq: stateid seqid to match
+ *
+ * This function is mainly intended for use by layoutrecall. It attempts
+ * to free the layout segment immediately, or else to mark it for return
+ * as soon as its reference count drops to zero.
+ *
+ * The caller must hold the i_lock of the layout's inode.
+ *
+ * Returns
+ * - 0: a layoutreturn needs to be scheduled.
+ * - -EBUSY: there are layout segments that are still in use.
+ * - -ENOENT: there are no layout segments that need to be returned.
+ */
+int
+pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
+ struct list_head *tmp_list,
+ const struct pnfs_layout_range *return_range,
+ u32 seq)
+{
+ struct pnfs_layout_segment *lseg, *next;
+ int remaining = 0;
+
+ dprintk("%s:Begin lo %p\n", __func__, lo);
+
+ assert_spin_locked(&lo->plh_inode->i_lock);
+
+ /* A return is already requested: use the header's own return list */
+ if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ tmp_list = &lo->plh_return_segs;
+
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
+ if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
+ dprintk("%s: marking lseg %p iomode %d "
+ "offset %llu length %llu\n", __func__,
+ lseg, lseg->pls_range.iomode,
+ lseg->pls_range.offset,
+ lseg->pls_range.length);
+ /* Note: once switched, tmp_list stays switched for later lsegs */
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ tmp_list = &lo->plh_return_segs;
+ if (mark_lseg_invalid(lseg, tmp_list))
+ continue;
+ /* Not invalidated now: count it and flag it for layoutreturn */
+ remaining++;
+ set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
+ }
+
+ if (remaining) {
+ pnfs_set_plh_return_info(lo, return_range->iomode, seq);
+ return -EBUSY;
+ }
+
+ if (!list_empty(&lo->plh_return_segs)) {
+ pnfs_set_plh_return_info(lo, return_range->iomode, seq);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+/*
+ * Mark the segments of @inode's layout that match @range for return.
+ *
+ * Does nothing if the layout is not valid.  Otherwise the return info is
+ * recorded and the matching segments are marked.  Unless the marking
+ * reported -EBUSY, a layoutreturn is prepared and, if that succeeds, sent
+ * after i_lock has been dropped.  On -EBUSY nfs_commit_inode() is called
+ * instead.
+ */
+static void
+pnfs_mark_layout_for_return(struct inode *inode,
+ const struct pnfs_layout_range *range)
+{
+ struct pnfs_layout_hdr *lo;
+ bool return_now = false;
+
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (!pnfs_layout_is_valid(lo)) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+ pnfs_set_plh_return_info(lo, range->iomode, 0);
+ /*
+ * mark all matching lsegs so that we are sure to have no live
+ * segments at hand when sending layoutreturn. See pnfs_put_lseg()
+ * for how it works.
+ */
+ if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, range, 0) != -EBUSY) {
+ const struct cred *cred;
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode;
+
+ /* Prepare under i_lock, send only after the lock is released */
+ return_now = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
+ spin_unlock(&inode->i_lock);
+ if (return_now)
+ pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
+ } else {
+ /* Some segments are still in use */
+ spin_unlock(&inode->i_lock);
+ nfs_commit_inode(inode, 0);
+ }
+}
+
+/*
+ * Mark the layout of @inode for return over the whole file, using the
+ * iomode of @lseg.
+ */
+void pnfs_error_mark_layout_for_return(struct inode *inode,
+				       struct pnfs_layout_segment *lseg)
+{
+	struct pnfs_layout_range whole_file = {
+		.iomode = lseg->pls_range.iomode,
+		.offset = 0,
+		.length = NFS4_MAX_UINT64,
+	};
+
+	pnfs_mark_layout_for_return(inode, &whole_file);
+}
+EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
+
+/*
+ * A layout can be returned when it is valid and neither
+ * NFS_LAYOUT_INODE_FREEING nor NFS_LAYOUT_RETURN is set in its flags.
+ */
+static bool
+pnfs_layout_can_be_returned(struct pnfs_layout_hdr *lo)
+{
+	if (!pnfs_layout_is_valid(lo))
+		return false;
+	if (test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags))
+		return false;
+	return !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
+}
+
+/*
+ * Return the first segment on lo->plh_segs that is valid, is not flagged
+ * for layoutreturn, has iomode @iomode (any iomode when @iomode is
+ * IOMODE_ANY) and intersects @range.  Returns NULL if there is none.
+ */
+static struct pnfs_layout_segment *
+pnfs_find_first_lseg(struct pnfs_layout_hdr *lo,
+		     const struct pnfs_layout_range *range,
+		     enum pnfs_iomode iomode)
+{
+	struct pnfs_layout_segment *seg;
+
+	list_for_each_entry(seg, &lo->plh_segs, pls_list) {
+		bool live = test_bit(NFS_LSEG_VALID, &seg->pls_flags) &&
+			    !test_bit(NFS_LSEG_LAYOUTRETURN, &seg->pls_flags);
+		bool mode_ok = iomode == IOMODE_ANY ||
+			       seg->pls_range.iomode == iomode;
+
+		if (live && mode_ok &&
+		    pnfs_lseg_range_intersecting(&seg->pls_range, range))
+			return seg;
+	}
+	return NULL;
+}
+
+/*
+ * Find open file states whose mode matches that of the range.
+ *
+ * Returns false if the layout cannot be returned or has no segment
+ * matching @range.  Otherwise the FMODE_READ/FMODE_WRITE bits of every
+ * open context with state are collected; write opens are ignored for an
+ * IOMODE_READ range, and read opens are ignored for an IOMODE_RW range
+ * when an IOMODE_READ segment also intersects it.  The result is true
+ * when no open mode bit is left.
+ */
+static bool
+pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo,
+				 const struct pnfs_layout_range *range)
+{
+	struct nfs_open_context *ctx;
+	struct list_head *head;
+	fmode_t open_modes = 0;
+
+	if (!pnfs_layout_can_be_returned(lo))
+		return false;
+	if (!pnfs_find_first_lseg(lo, range, range->iomode))
+		return false;
+
+	head = &NFS_I(lo->plh_inode)->open_files;
+	list_for_each_entry_rcu(ctx, head, list) {
+		if (!ctx->state)
+			continue;
+		open_modes |= ctx->state->state & (FMODE_READ|FMODE_WRITE);
+	}
+
+	if (range->iomode == IOMODE_READ) {
+		open_modes &= ~FMODE_WRITE;
+	} else if (range->iomode == IOMODE_RW) {
+		if (pnfs_find_first_lseg(lo, range, IOMODE_READ))
+			open_modes &= ~FMODE_READ;
+	}
+	return open_modes == 0;
+}
+
+/*
+ * Return the unused layouts of @server that match the range passed in
+ * @data (a struct pnfs_layout_range).
+ *
+ * The server's layout list is walked under rcu_read_lock().  Each time a
+ * layout is picked, the RCU read lock is dropped, so the walk restarts
+ * from the head of the list afterwards.  Always returns 0.
+ */
+static int pnfs_layout_return_unused_byserver(struct nfs_server *server,
+ void *data)
+{
+ const struct pnfs_layout_range *range = data;
+ const struct cred *cred;
+ struct pnfs_layout_hdr *lo;
+ struct inode *inode;
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode;
+
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
+ inode = lo->plh_inode;
+ /* Cheap unlocked pre-check; repeated below under i_lock */
+ if (!inode || !pnfs_layout_can_be_returned(lo) ||
+ test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ continue;
+ spin_lock(&inode->i_lock);
+ if (!lo->plh_inode ||
+ !pnfs_should_return_unused_layout(lo, range)) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ /* Hold a reference across the unlocked section below */
+ pnfs_get_layout_hdr(lo);
+ pnfs_set_plh_return_info(lo, range->iomode, 0);
+ if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
+ range, 0) != 0 ||
+ !pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode)) {
+ /* Nothing to send for this layout: drop it and rescan */
+ spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
+ pnfs_put_layout_hdr(lo);
+ cond_resched();
+ goto restart;
+ }
+ spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
+ pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
+ pnfs_put_layout_hdr(lo);
+ cond_resched();
+ goto restart;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+/*
+ * Run pnfs_layout_return_unused_byserver() on every server of @clp, for
+ * the whole file range in the given @iomode.
+ */
+void
+pnfs_layout_return_unused_byclid(struct nfs_client *clp,
+				 enum pnfs_iomode iomode)
+{
+	struct pnfs_layout_range whole_file = {
+		.iomode = iomode,
+		.offset = 0,
+		.length = NFS4_MAX_UINT64,
+	};
+
+	nfs_client_for_each_server(clp, pnfs_layout_return_unused_byserver,
+				   &whole_file);
+}
+
+/*
+ * Drop pgio->pg_lseg if it is set but no longer has NFS_LSEG_VALID.
+ */
+void
+pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio)
+{
+	struct pnfs_layout_segment *lseg = pgio->pg_lseg;
+
+	if (lseg && !test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
+		pnfs_put_lseg(lseg);
+		pgio->pg_lseg = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout);
+
+/*
+ * If pgio->pg_lseg is set and does not intersect the request, put the
+ * segment and clear pgio->pg_lseg.
+ */
+void
+pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+	struct pnfs_layout_segment *lseg = pgio->pg_lseg;
+
+	if (!lseg)
+		return;
+	if (pnfs_lseg_request_intersecting(lseg, req))
+		return;
+
+	pnfs_put_lseg(lseg);
+	pgio->pg_lseg = NULL;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range);
+
+/*
+ * Make sure @pgio has a usable layout segment for reading @req.
+ *
+ * A cached segment that is invalid or does not intersect the request is
+ * dropped first.  If no segment is left, one is requested with
+ * pnfs_update_layout() in IOMODE_READ.  The requested size is the distance
+ * from the request offset to i_size for non-direct I/O, or the bytes left
+ * in the direct request otherwise.  On an error pointer, pg_error is set
+ * and pg_lseg is left NULL.  If there is still no segment, the descriptor
+ * is reset to read through the MDS.
+ */
+void
+pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+	pnfs_generic_pg_check_layout(pgio);
+	pnfs_generic_pg_check_range(pgio, req);
+	if (pgio->pg_lseg == NULL) {
+		/* Only needed here; assigned on both branches below */
+		u64 rd_size;
+
+		if (pgio->pg_dreq == NULL)
+			rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
+		else
+			rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
+		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+						   nfs_req_openctx(req),
+						   req_offset(req),
+						   rd_size,
+						   IOMODE_READ,
+						   false,
+						   GFP_KERNEL);
+		if (IS_ERR(pgio->pg_lseg)) {
+			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+			pgio->pg_lseg = NULL;
+			return;
+		}
+	}
+	/* If no lseg, fall back to read through mds */
+	if (pgio->pg_lseg == NULL)
+		nfs_pageio_reset_read_mds(pgio);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
+
+/*
+ * Make sure @pgio has a usable layout segment for writing @req.
+ *
+ * A cached segment that is invalid or does not intersect the request is
+ * dropped first.  If no segment is left, one covering @wb_size bytes from
+ * the request offset is requested in IOMODE_RW.  On an error pointer,
+ * pg_error is set and pg_lseg stays NULL.  If no segment was obtained, the
+ * descriptor is reset to write through the MDS.
+ */
+void
+pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
+			   struct nfs_page *req, u64 wb_size)
+{
+	struct pnfs_layout_segment *lseg;
+
+	pnfs_generic_pg_check_layout(pgio);
+	pnfs_generic_pg_check_range(pgio, req);
+	if (pgio->pg_lseg != NULL)
+		return;
+
+	lseg = pnfs_update_layout(pgio->pg_inode,
+				  nfs_req_openctx(req),
+				  req_offset(req),
+				  wb_size,
+				  IOMODE_RW,
+				  false,
+				  GFP_KERNEL);
+	if (IS_ERR(lseg)) {
+		pgio->pg_error = PTR_ERR(lseg);
+		return;
+	}
+	pgio->pg_lseg = lseg;
+
+	/* If no lseg, fall back to write through mds */
+	if (lseg == NULL)
+		nfs_pageio_reset_write_mds(pgio);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
+
+/*
+ * Put the layout segment held by @desc, if any, and clear the pointer.
+ */
+void
+pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
+{
+	struct pnfs_layout_segment *lseg = desc->pg_lseg;
+
+	if (!lseg)
+		return;
+	pnfs_put_lseg(lseg);
+	desc->pg_lseg = NULL;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
+
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the
+ * number of bytes (maximum @req->wb_bytes) that can be coalesced.
+ *
+ * The answer of nfs_generic_pg_test() is used as the starting point.  When
+ * a layout segment is attached, the result is further limited to the bytes
+ * remaining in that segment, and is 0 if the request starts at or beyond
+ * the end of the segment.
+ */
+size_t
+pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
+		     struct nfs_page *prev, struct nfs_page *req)
+{
+	const struct pnfs_layout_range *seg_range;
+	u64 seg_end, req_start, seg_left;
+	unsigned int size;
+
+	size = nfs_generic_pg_test(pgio, prev, req);
+	if (!size || !pgio->pg_lseg)
+		return size;
+
+	/*
+	 * Note that seg_end is the offset of the first byte that lies
+	 * outside the pnfs_layout_range, not of its last byte. FIXME?
+	 */
+	seg_range = &pgio->pg_lseg->pls_range;
+	seg_end = pnfs_end_offset(seg_range->offset, seg_range->length);
+	req_start = req_offset(req);
+
+	/* start of request is past the last byte of this segment */
+	if (req_start >= seg_end)
+		return 0;
+
+	/* clamp to what is left in the segment */
+	seg_left = seg_end - req_start;
+	return seg_left < size ? (unsigned int)seg_left : size;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
+
+/*
+ * Resend every request of @hdr as FLUSH_STABLE writes through the MDS,
+ * and flag the open context with NFS_CONTEXT_RESEND_WRITES.  Returns the
+ * result of nfs_pageio_resend().
+ */
+int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
+{
+	struct nfs_pageio_descriptor mds_desc;
+
+	nfs_pageio_init_write(&mds_desc, hdr->inode, FLUSH_STABLE, true,
+			      hdr->completion_ops);
+	set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags);
+	return nfs_pageio_resend(&mds_desc, hdr);
+}
+EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
+
+/*
+ * Handle a failed pNFS write: return the layout if the layout driver sets
+ * PNFS_LAYOUTRET_ON_ERROR, then resend through the MDS unless the header
+ * was already flagged NFS_IOHDR_REDO.
+ */
+static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
+{
+	struct pnfs_layoutdriver_type *ld;
+
+	dprintk("pnfs write error = %d\n", hdr->pnfs_error);
+
+	ld = NFS_SERVER(hdr->inode)->pnfs_curr_ld;
+	if (ld->flags & PNFS_LAYOUTRET_ON_ERROR)
+		pnfs_return_layout(hdr->inode);
+
+	if (test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
+		return;
+	hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
+}
+
+/*
+ * Called by non rpc-based layout drivers
+ *
+ * Completes a write.  Without an error the layoutcommit range is extended
+ * to mds_offset + res.count and the MDS rpc_call_done callback is run.
+ * hdr->pnfs_error is read again afterwards, so an error present at that
+ * point is handled by pnfs_ld_handle_write_error().  The header is always
+ * released through mds_ops->rpc_release().
+ */
+void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
+{
+ if (likely(!hdr->pnfs_error)) {
+ pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
+ hdr->mds_offset + hdr->res.count);
+ hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
+ }
+ trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
+ if (unlikely(hdr->pnfs_error))
+ pnfs_ld_handle_write_error(hdr);
+ hdr->mds_ops->rpc_release(hdr);
+}
+EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
+
+/*
+ * Redirect the write described by @hdr to the MDS.  Unless the header was
+ * already flagged NFS_IOHDR_REDO, its pages are moved to the tail of the
+ * current mirror's list, the descriptor is reset for MDS writes and the
+ * mirror is marked for recoalescing.  The completion callback always runs.
+ */
+static void
+pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
+		       struct nfs_pgio_header *hdr)
+{
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+	bool already_redo = test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags);
+
+	if (!already_redo) {
+		list_splice_tail_init(&hdr->pages, &mirror->pg_list);
+		nfs_pageio_reset_write_mds(desc);
+		mirror->pg_recoalesce = 1;
+	}
+	hdr->completion_ops->completion(hdr);
+}
+
+/*
+ * Call the layout driver's write_pagelist() for @hdr after recording
+ * @call_ops as the MDS callbacks.  The NFSIOS_PNFS_WRITE counter is bumped
+ * unless the driver answered PNFS_NOT_ATTEMPTED.  @lseg is not used here.
+ */
+static enum pnfs_try_status
+pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
+		       const struct rpc_call_ops *call_ops,
+		       struct pnfs_layout_segment *lseg,
+		       int how)
+{
+	struct inode *inode = hdr->inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+	enum pnfs_try_status result;
+
+	hdr->mds_ops = call_ops;
+
+	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
+		inode->i_ino, hdr->args.count, hdr->args.offset, how);
+
+	result = server->pnfs_curr_ld->write_pagelist(hdr, how);
+	if (result != PNFS_NOT_ATTEMPTED)
+		nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
+
+	dprintk("%s End (trypnfs:%d)\n", __func__, result);
+	return result;
+}
+
+/*
+ * Try to issue the write in @hdr through the layout driver and act on the
+ * driver's answer:
+ *  PNFS_NOT_ATTEMPTED - fall back to writing through the MDS;
+ *  PNFS_ATTEMPTED     - nothing more to do here;
+ *  PNFS_TRY_AGAIN     - give the pages back to the current mirror for
+ *                       recoalescing (once per header) and release @hdr.
+ */
+static void
+pnfs_do_write(struct nfs_pageio_descriptor *desc,
+	      struct nfs_pgio_header *hdr, int how)
+{
+	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
+	struct pnfs_layout_segment *lseg = desc->pg_lseg;
+	enum pnfs_try_status trypnfs;
+
+	trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
+	switch (trypnfs) {
+	case PNFS_NOT_ATTEMPTED:
+		pnfs_write_through_mds(desc, hdr);
+		/* explicit break: no implicit fall-through into the next case */
+		break;
+	case PNFS_ATTEMPTED:
+		break;
+	case PNFS_TRY_AGAIN:
+		/* cleanup hdr and prepare to redo pnfs */
+		if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+			struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+			list_splice_init(&hdr->pages, &mirror->pg_list);
+			mirror->pg_recoalesce = 1;
+		}
+		hdr->mds_ops->rpc_release(hdr);
+		break;
+	}
+}
+
+/*
+ * Release callback installed on write headers by
+ * pnfs_generic_pg_writepages(): put the header's layout segment, then
+ * free the header itself.
+ */
+static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
+{
+	struct pnfs_layout_segment *lseg = hdr->lseg;
+
+	pnfs_put_lseg(lseg);
+	nfs_pgio_header_free(hdr);
+}
+
+/*
+ * Allocate a pgio header for @desc, give it a reference to the
+ * descriptor's layout segment and submit the write through pNFS.
+ *
+ * Returns -ENOMEM (also stored in desc->pg_error) if the header cannot be
+ * allocated, otherwise the result of nfs_generic_pgio().
+ */
+int
+pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
+{
+	struct nfs_pgio_header *hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+	int err;
+
+	if (!hdr) {
+		desc->pg_error = -ENOMEM;
+		return desc->pg_error;
+	}
+
+	nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
+	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
+
+	err = nfs_generic_pgio(desc, hdr);
+	if (err)
+		return err;
+
+	pnfs_do_write(desc, hdr, desc->pg_ioflags);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
+
+/*
+ * Resend every request of @hdr as reads through the MDS.  Returns the
+ * result of nfs_pageio_resend().
+ */
+int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
+{
+	struct nfs_pageio_descriptor mds_desc;
+
+	nfs_pageio_init_read(&mds_desc, hdr->inode, true, hdr->completion_ops);
+	return nfs_pageio_resend(&mds_desc, hdr);
+}
+EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
+
+/*
+ * Handle a failed pNFS read: return the layout if the layout driver sets
+ * PNFS_LAYOUTRET_ON_ERROR, then resend through the MDS unless the header
+ * was already flagged NFS_IOHDR_REDO.
+ */
+static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
+{
+	struct pnfs_layoutdriver_type *ld;
+
+	dprintk("pnfs read error = %d\n", hdr->pnfs_error);
+
+	ld = NFS_SERVER(hdr->inode)->pnfs_curr_ld;
+	if (ld->flags & PNFS_LAYOUTRET_ON_ERROR)
+		pnfs_return_layout(hdr->inode);
+
+	if (test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
+		return;
+	hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
+}
+
+/*
+ * Called by non rpc-based layout drivers
+ *
+ * Completes a read.  Without an error the MDS rpc_call_done callback is
+ * run.  hdr->pnfs_error is read again afterwards, so an error present at
+ * that point is handled by pnfs_ld_handle_read_error().  The header is
+ * always released through mds_ops->rpc_release().
+ */
+void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
+{
+ if (likely(!hdr->pnfs_error))
+ hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
+ trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
+ if (unlikely(hdr->pnfs_error))
+ pnfs_ld_handle_read_error(hdr);
+ hdr->mds_ops->rpc_release(hdr);
+}
+EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
+
+/*
+ * Redirect the read described by @hdr to the MDS.  Unless the header was
+ * already flagged NFS_IOHDR_REDO, its pages are moved to the tail of the
+ * current mirror's list, the descriptor is reset for MDS reads and the
+ * mirror is marked for recoalescing.  The completion callback always runs.
+ */
+static void
+pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
+		      struct nfs_pgio_header *hdr)
+{
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+	bool already_redo = test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags);
+
+	if (!already_redo) {
+		list_splice_tail_init(&hdr->pages, &mirror->pg_list);
+		nfs_pageio_reset_read_mds(desc);
+		mirror->pg_recoalesce = 1;
+	}
+	hdr->completion_ops->completion(hdr);
+}
+
+/*
+ * Call the layout driver's read_pagelist() for @hdr after recording
+ * @call_ops as the MDS callbacks.  The NFSIOS_PNFS_READ counter is bumped
+ * unless the driver answered PNFS_NOT_ATTEMPTED.  @lseg is not used here.
+ */
+static enum pnfs_try_status
+pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
+		      const struct rpc_call_ops *call_ops,
+		      struct pnfs_layout_segment *lseg)
+{
+	struct inode *inode = hdr->inode;
+	struct nfs_server *server = NFS_SERVER(inode);
+	enum pnfs_try_status result;
+
+	hdr->mds_ops = call_ops;
+
+	dprintk("%s: Reading ino:%lu %u@%llu\n",
+		__func__, inode->i_ino, hdr->args.count, hdr->args.offset);
+
+	result = server->pnfs_curr_ld->read_pagelist(hdr);
+	if (result != PNFS_NOT_ATTEMPTED)
+		nfs_inc_stats(inode, NFSIOS_PNFS_READ);
+
+	dprintk("%s End (trypnfs:%d)\n", __func__, result);
+	return result;
+}
+
+/*
+ * Resend all requests through pnfs, using mirror @mirror_idx.  Does
+ * nothing if the header was already flagged NFS_IOHDR_REDO.
+ */
+void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr,
+			   unsigned int mirror_idx)
+{
+	struct nfs_pageio_descriptor resend_desc;
+
+	if (test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
+		return;
+
+	/* Prevent deadlocks with layoutreturn! */
+	pnfs_put_lseg(hdr->lseg);
+	hdr->lseg = NULL;
+
+	nfs_pageio_init_read(&resend_desc, hdr->inode, false,
+			     hdr->completion_ops);
+	resend_desc.pg_mirror_idx = mirror_idx;
+	hdr->task.tk_status = nfs_pageio_resend(&resend_desc, hdr);
+}
+EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
+
+/*
+ * Try to issue the read in @hdr through the layout driver and act on the
+ * driver's answer:
+ *  PNFS_NOT_ATTEMPTED - fall back to reading through the MDS;
+ *  PNFS_ATTEMPTED     - nothing more to do here;
+ *  PNFS_TRY_AGAIN     - give the pages back to the current mirror for
+ *                       recoalescing (once per header) and release @hdr.
+ */
+static void
+pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
+{
+	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
+	struct pnfs_layout_segment *lseg = desc->pg_lseg;
+	enum pnfs_try_status trypnfs;
+
+	trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
+	switch (trypnfs) {
+	case PNFS_NOT_ATTEMPTED:
+		pnfs_read_through_mds(desc, hdr);
+		/* explicit break: no implicit fall-through into the next case */
+		break;
+	case PNFS_ATTEMPTED:
+		break;
+	case PNFS_TRY_AGAIN:
+		/* cleanup hdr and prepare to redo pnfs */
+		if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+			struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+			list_splice_init(&hdr->pages, &mirror->pg_list);
+			mirror->pg_recoalesce = 1;
+		}
+		hdr->mds_ops->rpc_release(hdr);
+		break;
+	}
+}
+
+/*
+ * Release callback installed on read headers by
+ * pnfs_generic_pg_readpages(): put the header's layout segment, then free
+ * the header itself.
+ */
+static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
+{
+	struct pnfs_layout_segment *lseg = hdr->lseg;
+
+	pnfs_put_lseg(lseg);
+	nfs_pgio_header_free(hdr);
+}
+
+/*
+ * Allocate a pgio header for @desc, give it a reference to the
+ * descriptor's layout segment and submit the read through pNFS.
+ *
+ * Returns -ENOMEM (also stored in desc->pg_error) if the header cannot be
+ * allocated, otherwise the result of nfs_generic_pgio().
+ */
+int
+pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
+{
+	struct nfs_pgio_header *hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+	int err;
+
+	if (!hdr) {
+		desc->pg_error = -ENOMEM;
+		return desc->pg_error;
+	}
+
+	nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
+	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
+
+	err = nfs_generic_pgio(desc, hdr);
+	if (err)
+		return err;
+
+	pnfs_do_read(desc, hdr);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
+
+/*
+ * Clear NFS_INO_LAYOUTCOMMITTING in the NFS inode flags and wake up
+ * anyone waiting on that bit.
+ */
+static void pnfs_clear_layoutcommitting(struct inode *inode)
+{
+	unsigned long *flags = &NFS_I(inode)->flags;
+
+	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, flags);
+	/* order the clear before the wake-up */
+	smp_mb__after_atomic();
+	wake_up_bit(flags, NFS_INO_LAYOUTCOMMITTING);
+}
+
+/*
+ * There can be multiple RW segments.
+ *
+ * Add to @listp every IOMODE_RW segment of the inode's layout that had
+ * NFS_LSEG_LAYOUTCOMMIT set, clearing the bit as it goes.
+ */
+static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
+{
+	struct pnfs_layout_segment *seg;
+
+	list_for_each_entry(seg, &NFS_I(inode)->layout->plh_segs, pls_list) {
+		if (seg->pls_range.iomode != IOMODE_RW)
+			continue;
+		if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &seg->pls_flags))
+			continue;
+		list_add(&seg->pls_lc_list, listp);
+	}
+}
+
+/*
+ * Empty the layoutcommit list @listp, putting each segment on it, then
+ * clear the inode's NFS_INO_LAYOUTCOMMITTING state.
+ */
+static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
+{
+	struct pnfs_layout_segment *seg, *next;
+
+	list_for_each_entry_safe(seg, next, listp, pls_lc_list) {
+		list_del_init(&seg->pls_lc_list);
+		/* Matched by references in pnfs_set_layoutcommit */
+		pnfs_put_lseg(seg);
+	}
+
+	pnfs_clear_layoutcommitting(inode);
+}
+
+/*
+ * Flag I/O failure on the layout that owns @lseg, for the iomode of
+ * @lseg.
+ */
+void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
+{
+	struct pnfs_layout_hdr *lo = lseg->pls_layout;
+
+	pnfs_layout_io_set_failed(lo, lseg->pls_range.iomode);
+}
+EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
+
+/*
+ * Record that a LAYOUTCOMMIT is needed for @inode up to @end_pos.
+ *
+ * The first caller to set NFS_INO_LAYOUTCOMMIT initialises the layout's
+ * plh_lwb to @end_pos and marks the inode dirty; later callers only raise
+ * plh_lwb.  A reference on @lseg is taken the first time
+ * NFS_LSEG_LAYOUTCOMMIT is set on it.
+ */
+void
+pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg,
+ loff_t end_pos)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ bool mark_as_dirty = false;
+
+ spin_lock(&inode->i_lock);
+ if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
+ nfsi->layout->plh_lwb = end_pos;
+ mark_as_dirty = true;
+ dprintk("%s: Set layoutcommit for inode %lu ",
+ __func__, inode->i_ino);
+ } else if (end_pos > nfsi->layout->plh_lwb)
+ nfsi->layout->plh_lwb = end_pos;
+ if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) {
+ /* references matched in nfs4_layoutcommit_release */
+ pnfs_get_lseg(lseg);
+ }
+ spin_unlock(&inode->i_lock);
+ /* NOTE(review): plh_lwb is read here after i_lock was dropped */
+ dprintk("%s: lseg %p end_pos %llu\n",
+ __func__, lseg, nfsi->layout->plh_lwb);
+
+ /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
+ * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
+ if (mark_as_dirty)
+ mark_inode_dirty_sync(inode);
+}
+EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
+
+/*
+ * Finish a layoutcommit: run the layout driver's optional
+ * cleanup_layoutcommit() hook, then release the segments collected on
+ * data->lseg_list.
+ */
+void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
+{
+	struct pnfs_layoutdriver_type *ld;
+
+	ld = NFS_SERVER(data->args.inode)->pnfs_curr_ld;
+	if (ld->cleanup_layoutcommit)
+		ld->cleanup_layoutcommit(data);
+
+	pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list);
+}
+
+/*
+ * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
+ * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
+ * data to disk to allow the server to recover the data if it crashes.
+ * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
+ * is off, and a COMMIT is sent to a data server, or
+ * if WRITEs to a data server return NFS_DATA_SYNC.
+ *
+ * Returns 0 if no layoutcommit is outstanding or none turned out to be
+ * needed, -EAGAIN if another layoutcommit is in progress and @sync is
+ * false, -ENOMEM if the request data cannot be allocated, or the status of
+ * the wait, of the driver's prepare_layoutcommit() hook, or of
+ * nfs4_proc_layoutcommit().  On any non-zero status the inode is marked
+ * dirty again.
+ */
+int
+pnfs_layoutcommit_inode(struct inode *inode, bool sync)
+{
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ struct nfs4_layoutcommit_data *data;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ loff_t end_pos;
+ int status;
+
+ if (!pnfs_layoutcommit_outstanding(inode))
+ return 0;
+
+ dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
+
+ /* NFS_INO_LAYOUTCOMMITTING acts as a bit lock around the commit */
+ status = -EAGAIN;
+ if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
+ if (!sync)
+ goto out;
+ status = wait_on_bit_lock_action(&nfsi->flags,
+ NFS_INO_LAYOUTCOMMITTING,
+ nfs_wait_bit_killable,
+ TASK_KILLABLE);
+ if (status)
+ goto out;
+ }
+
+ status = -ENOMEM;
+ /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
+ data = kzalloc(sizeof(*data), GFP_NOFS);
+ if (!data)
+ goto clear_layoutcommitting;
+
+ status = 0;
+ spin_lock(&inode->i_lock);
+ /* Nothing left to commit: unwind with status 0 */
+ if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+ goto out_unlock;
+
+ INIT_LIST_HEAD(&data->lseg_list);
+ pnfs_list_write_lseg(inode, &data->lseg_list);
+
+ end_pos = nfsi->layout->plh_lwb;
+
+ nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
+ data->cred = get_cred(nfsi->layout->plh_lc_cred);
+ spin_unlock(&inode->i_lock);
+
+ data->args.inode = inode;
+ nfs_fattr_init(&data->fattr);
+ data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
+ data->res.fattr = &data->fattr;
+ /* end_pos is one past the last byte written; 0 maps to U64_MAX */
+ if (end_pos != 0)
+ data->args.lastbytewritten = end_pos - 1;
+ else
+ data->args.lastbytewritten = U64_MAX;
+ data->res.server = NFS_SERVER(inode);
+
+ if (ld->prepare_layoutcommit) {
+ status = ld->prepare_layoutcommit(&data->args);
+ if (status) {
+ /* Re-arm the layoutcommit so that it is retried later */
+ put_cred(data->cred);
+ spin_lock(&inode->i_lock);
+ set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
+ if (end_pos > nfsi->layout->plh_lwb)
+ nfsi->layout->plh_lwb = end_pos;
+ /* NOTE(review): segments moved to data->lseg_list are not put back here - confirm */
+ goto out_unlock;
+ }
+ }
+
+
+ status = nfs4_proc_layoutcommit(data, sync);
+out:
+ if (status)
+ mark_inode_dirty_sync(inode);
+ dprintk("<-- %s status %d\n", __func__, status);
+ return status;
+out_unlock:
+ spin_unlock(&inode->i_lock);
+ kfree(data);
+clear_layoutcommitting:
+ pnfs_clear_layoutcommitting(inode);
+ goto out;
+}
+EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
+
+/*
+ * Run a synchronous layoutcommit on @inode.  @datasync is not used.
+ */
+int
+pnfs_generic_sync(struct inode *inode, bool datasync)
+{
+	const bool wait = true;
+
+	return pnfs_layoutcommit_inode(inode, wait);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_sync);
+
+/*
+ * Allocate a zeroed struct nfs4_threshold with GFP_NOFS.  Returns NULL on
+ * allocation failure.
+ */
+struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
+{
+	struct nfs4_threshold *thp = kzalloc(sizeof(*thp), GFP_NOFS);
+
+	if (!thp)
+		dprintk("%s mdsthreshold allocation failed\n", __func__);
+	return thp;
+}
+
+#if IS_ENABLED(CONFIG_NFS_V4_2)
+/*
+ * Send layout statistics for @inode.
+ *
+ * Returns 0 without doing anything if pNFS is not enabled, the layout
+ * driver has no prepare_layoutstats() hook, the server lacks
+ * NFS_CAP_LAYOUTSTATS, NFS_INO_LAYOUTSTATS is already set, or the inode
+ * has no layout.  Otherwise returns -ENOMEM, the status of
+ * prepare_layoutstats(), or the status of nfs42_proc_layoutstats_generic().
+ */
+int
+pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
+{
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs42_layoutstat_data *data;
+ struct pnfs_layout_hdr *hdr;
+ int status = 0;
+
+ if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
+ goto out;
+
+ if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
+ goto out;
+
+ /* Only one layoutstats report per inode at a time */
+ if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
+ goto out;
+
+ spin_lock(&inode->i_lock);
+ if (!NFS_I(inode)->layout) {
+ spin_unlock(&inode->i_lock);
+ goto out_clear_layoutstats;
+ }
+ hdr = NFS_I(inode)->layout;
+ pnfs_get_layout_hdr(hdr);
+ spin_unlock(&inode->i_lock);
+
+ data = kzalloc(sizeof(*data), gfp_flags);
+ if (!data) {
+ status = -ENOMEM;
+ goto out_put;
+ }
+
+ data->args.fh = NFS_FH(inode);
+ data->args.inode = inode;
+ status = ld->prepare_layoutstats(&data->args);
+ if (status)
+ goto out_free;
+
+ /*
+ * NOTE(review): no cleanup happens here on this path; presumably the
+ * callee takes over data, the header reference and the
+ * NFS_INO_LAYOUTSTATS bit - confirm.
+ */
+ status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data);
+
+out:
+ dprintk("%s returns %d\n", __func__, status);
+ return status;
+
+out_free:
+ kfree(data);
+out_put:
+ pnfs_put_layout_hdr(hdr);
+out_clear_layoutstats:
+ smp_mb__before_atomic();
+ clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
+ smp_mb__after_atomic();
+ goto out;
+}
+EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
+#endif
+
+/*
+ * Module parameter (unsigned int, mode 0644), exported for use by other
+ * modules.  NOTE(review): its unit and consumers are not visible here.
+ */
+unsigned int layoutstats_timer;
+module_param(layoutstats_timer, uint, 0644);
+EXPORT_SYMBOL_GPL(layoutstats_timer);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
new file mode 100644
index 000000000..a7cf84a66
--- /dev/null
+++ b/fs/nfs/pnfs.h
@@ -0,0 +1,925 @@
+/*
+ * pNFS client data structures.
+ *
+ * Copyright (c) 2002
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
+ * Dean Hildebrand <dhildebz@umich.edu>
+ *
+ * Permission is granted to use, copy, create derivative works, and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the University of Michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. If
+ * the above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * This software is provided as is, without representation or warranty
+ * of any kind either express or implied, including without limitation
+ * the implied warranties of merchantability, fitness for a particular
+ * purpose, or noninfringement. The Regents of the University of
+ * Michigan shall not be liable for any damages, including special,
+ * indirect, incidental, or consequential damages, with respect to any
+ * claim arising out of or in connection with the use of the software,
+ * even if it has been or is hereafter advised of the possibility of
+ * such damages.
+ */
+
+#ifndef FS_NFS_PNFS_H
+#define FS_NFS_PNFS_H
+
+#include <linux/refcount.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/workqueue.h>
+
+struct nfs4_opendata;
+
+/*
+ * Bit numbers for pnfs_layout_segment.pls_flags, tested with test_bit()
+ * (see pnfs_is_valid_lseg()).
+ */
+enum {
+ NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
+ NFS_LSEG_ROC, /* roc bit received from server */
+ NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */
+ NFS_LSEG_LAYOUTRETURN, /* layoutreturn bit set for layoutreturn */
+ NFS_LSEG_UNAVAILABLE, /* unavailable bit set for temporary problem */
+};
+
+/*
+ * Individual ip address: one socket address (and its length) together with
+ * a printable form, linked into a list through da_node.
+ */
+struct nfs4_pnfs_ds_addr {
+ struct sockaddr_storage da_addr;
+ size_t da_addrlen;
+ struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */
+ char *da_remotestr; /* human readable addr+port */
+};
+
+/*
+ * A data server entry: a list of addresses (ds_addrs), a printable summary
+ * of them, an associated nfs_client, a reference count and state bits.
+ */
+struct nfs4_pnfs_ds {
+ struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
+ char *ds_remotestr; /* comma sep list of addrs */
+ struct list_head ds_addrs;
+ struct nfs_client *ds_clp;
+ refcount_t ds_count;
+ /* NOTE(review): NFS4DS_CONNECTING looks like a bit number in ds_state - confirm */
+ unsigned long ds_state;
+#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
+};
+
+/*
+ * One layout segment.  References are taken with pnfs_get_lseg() and
+ * dropped with pnfs_put_lseg().
+ */
+struct pnfs_layout_segment {
+ struct list_head pls_list;
+ struct list_head pls_lc_list;
+ struct list_head pls_commits;
+ /* offset/length covered; used by the range-intersection helpers */
+ struct pnfs_layout_range pls_range;
+ refcount_t pls_refcount;
+ u32 pls_seq;
+ /* NFS_LSEG_* bit numbers */
+ unsigned long pls_flags;
+ /* presumably the layout header this segment belongs to - confirm */
+ struct pnfs_layout_hdr *pls_layout;
+};
+
+/*
+ * Result of a layout driver's read_pagelist/write_pagelist hook:
+ * PNFS_ATTEMPTED means the layout code attempted the I/O,
+ * PNFS_NOT_ATTEMPTED means fall back to normal NFS.
+ * NOTE(review): PNFS_TRY_AGAIN presumably asks the caller to retry - confirm.
+ */
+enum pnfs_try_status {
+ PNFS_ATTEMPTED = 0,
+ PNFS_NOT_ATTEMPTED = 1,
+ PNFS_TRY_AGAIN = 2,
+};
+
+/* error codes for internal use */
+#define NFS4ERR_RESET_TO_MDS 12001
+#define NFS4ERR_RESET_TO_PNFS 12002
+
+#ifdef CONFIG_NFS_V4_1
+
+#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
+
+/*
+ * Default data server connection timeout and retrans values.
+ * Set by module parameters dataserver_timeo and dataserver_retrans.
+ */
+#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */
+#define NFS4_DEF_DS_RETRANS 5
+#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ)
+
+/*
+ * Bit numbers for pnfs_layout_hdr.plh_flags, used with set_bit()/test_bit()
+ * (see pnfs_layout_is_valid() and pnfs_return_layout()).
+ */
+enum {
+ NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
+ NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
+ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
+ NFS_LAYOUT_RETURN, /* layoutreturn in progress */
+ NFS_LAYOUT_RETURN_LOCK, /* Serialise layoutreturn */
+ NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */
+ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
+ NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
+ NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */
+ NFS_LAYOUT_HASHED, /* The layout visible */
+ NFS_LAYOUT_DRAIN,
+};
+
+/*
+ * Bit masks for pnfs_layoutdriver_type.flags (tested with '&' in
+ * pnfs_ld_layoutret_on_setattr() and pnfs_ld_read_whole_page()).
+ */
+enum layoutdriver_policy_flags {
+ /* Should the pNFS client commit and return the layout upon truncate to
+ * a smaller size */
+ PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
+ PNFS_LAYOUTRET_ON_ERROR = 1 << 1,
+ PNFS_READ_WHOLE_PAGE = 1 << 2,
+ PNFS_LAYOUTGET_ON_OPEN = 1 << 3,
+};
+
+struct nfs4_deviceid_node;
+
+/*
+ * Per-layout driver specific registration structure, passed to
+ * pnfs_register_layoutdriver()/pnfs_unregister_layoutdriver().
+ */
+struct pnfs_layoutdriver_type {
+ struct list_head pnfs_tblid;
+ /* compared against nfs4_threshold.l_type in pnfs_use_threshold() */
+ const u32 id;
+ const char *name;
+ struct module *owner;
+ /* mask of enum layoutdriver_policy_flags values */
+ unsigned flags;
+ unsigned max_deviceinfo_size;
+ unsigned max_layoutget_response;
+
+ int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
+ int (*clear_layoutdriver) (struct nfs_server *);
+
+ struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
+ void (*free_layout_hdr) (struct pnfs_layout_hdr *);
+
+ struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
+ void (*free_lseg) (struct pnfs_layout_segment *lseg);
+ void (*add_lseg) (struct pnfs_layout_hdr *layoutid,
+ struct pnfs_layout_segment *lseg,
+ struct list_head *free_me);
+
+ void (*return_range) (struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range);
+
+ /* test for nfs page cache coalescing */
+ const struct nfs_pageio_ops *pg_read_ops;
+ const struct nfs_pageio_ops *pg_write_ops;
+
+ /* optional: pnfs_get_ds_info() returns NULL when this is unset */
+ struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode);
+
+ /* called without a NULL check by pnfs_sync_inode() */
+ int (*sync)(struct inode *inode, bool datasync);
+
+ /*
+ * Return PNFS_ATTEMPTED to indicate the layout code has attempted
+ * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
+ */
+ enum pnfs_try_status (*read_pagelist)(struct nfs_pgio_header *);
+ enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
+
+ void (*free_deviceid_node) (struct nfs4_deviceid_node *);
+ struct nfs4_deviceid_node * (*alloc_deviceid_node)
+ (struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags);
+
+ int (*prepare_layoutreturn) (struct nfs4_layoutreturn_args *);
+
+ void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
+ int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args);
+ /* optional: pnfs_report_layoutstat() is a no-op when this is unset */
+ int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
+};
+
+/*
+ * Commit-path hooks reached through pnfs_ds_commit_info.ops.  The inline
+ * pnfs_commit_list(), pnfs_mark_request_commit(),
+ * pnfs_clear_request_commit(), pnfs_scan_commit_lists(),
+ * pnfs_recover_commit_reqs(), pnfs_search_commit_reqs() and
+ * pnfs_release_ds_info() wrappers in this header dispatch to them.
+ */
+struct pnfs_commit_ops {
+ void (*setup_ds_info)(struct pnfs_ds_commit_info *,
+ struct pnfs_layout_segment *);
+ void (*release_ds_info)(struct pnfs_ds_commit_info *,
+ struct inode *inode);
+ int (*commit_pagelist)(struct inode *inode,
+ struct list_head *mds_pages,
+ int how,
+ struct nfs_commit_info *cinfo);
+ void (*mark_request_commit) (struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
+ void (*clear_request_commit) (struct nfs_page *req,
+ struct nfs_commit_info *cinfo);
+ int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
+ int max);
+ void (*recover_commit_reqs) (struct list_head *list,
+ struct nfs_commit_info *cinfo);
+ struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
+ struct page *page);
+};
+
+/*
+ * Layout header; NFS_I(inode)->layout points at one.  References are taken
+ * with pnfs_get_layout_hdr() and dropped with pnfs_put_layout_hdr().
+ */
+struct pnfs_layout_hdr {
+ refcount_t plh_refcount;
+ atomic_t plh_outstanding; /* number of RPCs out */
+ struct list_head plh_layouts; /* other client layouts */
+ struct list_head plh_bulk_destroy;
+ struct list_head plh_segs; /* layout segments list */
+ struct list_head plh_return_segs; /* invalid layout segments */
+ unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
+ unsigned long plh_retry_timestamp;
+ unsigned long plh_flags; /* NFS_LAYOUT_* bit numbers */
+ nfs4_stateid plh_stateid;
+ u32 plh_barrier; /* ignore lower seqids */
+ u32 plh_return_seq;
+ enum pnfs_iomode plh_return_iomode;
+ loff_t plh_lwb; /* last write byte for layoutcommit */
+ const struct cred *plh_lc_cred; /* layoutcommit cred */
+ struct inode *plh_inode;
+ struct rcu_head plh_rcu;
+};
+
+/*
+ * Device description handed to nfs4_proc_getdeviceinfo() and to a layout
+ * driver's alloc_deviceid_node hook.
+ */
+struct pnfs_device {
+ struct nfs4_deviceid dev_id;
+ unsigned int layout_type;
+ unsigned int mincount;
+ unsigned int maxcount; /* gdia_maxcount */
+ struct page **pages;
+ unsigned int pgbase;
+ unsigned int pglen; /* reply buffer length */
+ unsigned char nocache : 1;/* May not be cached */
+};
+
+#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
+
+/*
+ * Fixed-capacity list of device ids (at most NFS4_PNFS_GETDEVLIST_MAXNUM).
+ * NOTE(review): num_devs presumably counts the valid dev_id[] entries and
+ * eof flags the end of the enumeration - confirm against the users.
+ */
+struct pnfs_devicelist {
+ unsigned int eof;
+ unsigned int num_devs;
+ struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
+};
+
+extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
+extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
+extern const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id);
+extern void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld);
+
+/* nfs4proc.c */
+extern size_t max_response_pages(struct nfs_server *server);
+extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
+ struct pnfs_device *dev,
+ const struct cred *cred);
+extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout);
+extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
+
+/* pnfs.c */
+void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
+void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
+
+void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *);
+void unset_pnfs_layoutdriver(struct nfs_server *);
+void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio);
+void pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req);
+void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
+int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
+void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req, u64 wb_size);
+void pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *);
+int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
+size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *prev, struct nfs_page *req);
+void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg);
+struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp);
+void pnfs_layoutget_free(struct nfs4_layoutget *lgp);
+void pnfs_free_lseg_list(struct list_head *tmp_list);
+void pnfs_destroy_layout(struct nfs_inode *);
+void pnfs_destroy_layout_final(struct nfs_inode *);
+void pnfs_destroy_all_layouts(struct nfs_client *);
+int pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
+ struct nfs_fsid *fsid,
+ bool is_recall);
+int pnfs_destroy_layouts_byclid(struct nfs_client *clp,
+ bool is_recall);
+bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
+ struct pnfs_layout_range *dst_range,
+ struct inode *inode);
+void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
+void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *new,
+ const struct cred *cred,
+ bool update_barrier);
+int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+ struct list_head *tmp_list,
+ const struct pnfs_layout_range *recall_range,
+ u32 seq);
+int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
+ struct list_head *tmp_list,
+ const struct pnfs_layout_range *recall_range,
+ u32 seq);
+int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
+ struct list_head *lseg_list);
+bool pnfs_roc(struct inode *ino,
+ struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ const struct cred *cred);
+int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp,
+ struct nfs4_layoutreturn_res **respp, int *ret);
+void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ int ret);
+bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task);
+void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t);
+void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
+int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
+int pnfs_generic_sync(struct inode *inode, bool datasync);
+int pnfs_nfs_generic_sync(struct inode *inode, bool datasync);
+int _pnfs_return_layout(struct inode *);
+int pnfs_commit_and_return_layout(struct inode *);
+void pnfs_ld_write_done(struct nfs_pgio_header *);
+void pnfs_ld_read_done(struct nfs_pgio_header *);
+void pnfs_read_resend_pnfs(struct nfs_pgio_header *, unsigned int mirror_idx);
+struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
+ struct nfs_open_context *ctx,
+ loff_t pos,
+ u64 count,
+ enum pnfs_iomode iomode,
+ bool strict_iomode,
+ gfp_t gfp_flags);
+void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *arg_stateid,
+ const struct pnfs_layout_range *range,
+ const nfs4_stateid *stateid);
+
+void pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ bool (*is_after)(const struct pnfs_layout_range *lseg_range,
+ const struct pnfs_layout_range *old),
+ bool (*do_merge)(struct pnfs_layout_segment *lseg,
+ struct pnfs_layout_segment *old),
+ struct list_head *free_me);
+
+void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
+int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
+int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
+struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
+void pnfs_error_mark_layout_for_return(struct inode *inode,
+ struct pnfs_layout_segment *lseg);
+void pnfs_layout_return_unused_byclid(struct nfs_client *clp,
+ enum pnfs_iomode iomode);
+
+/* nfs4_deviceid_flags */
+enum {
+ NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */
+ NFS_DEVICEID_UNAVAILABLE, /* device temporarily unavailable */
+ NFS_DEVICEID_NOCACHE, /* device may not be cached */
+};
+
+/* pnfs_dev.c */
+/*
+ * A device id entry.  nfs4_get_deviceid() takes a reference by incrementing
+ * 'ref'; nfs4_put_deviceid_node() is the matching release.
+ */
+struct nfs4_deviceid_node {
+ struct hlist_node node;
+ struct hlist_node tmpnode;
+ const struct pnfs_layoutdriver_type *ld;
+ const struct nfs_client *nfs_client;
+ unsigned long flags; /* NFS_DEVICEID_* bit numbers */
+ unsigned long timestamp_unavailable;
+ struct nfs4_deviceid deviceid;
+ struct rcu_head rcu;
+ atomic_t ref;
+};
+
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, const struct cred *cred,
+ gfp_t gfp_mask);
+void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
+ const struct nfs4_deviceid *);
+bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
+void nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node);
+void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
+bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
+void nfs4_deviceid_purge_client(const struct nfs_client *);
+
+/* pnfs_nfs.c */
+struct pnfs_commit_array *pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags);
+void pnfs_free_commit_array(struct pnfs_commit_array *p);
+struct pnfs_commit_array *pnfs_add_commit_array(struct pnfs_ds_commit_info *,
+ struct pnfs_commit_array *,
+ struct pnfs_layout_segment *);
+
+void pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg);
+void pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo);
+
+void pnfs_generic_clear_request_commit(struct nfs_page *req,
+ struct nfs_commit_info *cinfo);
+void pnfs_generic_commit_release(void *calldata);
+void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data);
+void pnfs_generic_rw_release(void *data);
+void pnfs_generic_recover_commit_reqs(struct list_head *dst,
+ struct nfs_commit_info *cinfo);
+struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo,
+ struct page *page);
+int pnfs_generic_commit_pagelist(struct inode *inode,
+ struct list_head *mds_pages,
+ int how,
+ struct nfs_commit_info *cinfo,
+ int (*initiate_commit)(struct nfs_commit_data *data,
+ int how));
+int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max);
+void pnfs_generic_write_commit_done(struct rpc_task *task, void *data);
+void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds);
+struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs,
+ gfp_t gfp_flags);
+void nfs4_pnfs_v3_ds_connect_unload(void);
+int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
+ struct nfs4_deviceid_node *devid, unsigned int timeo,
+ unsigned int retrans, u32 version, u32 minor_version);
+struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net,
+ struct xdr_stream *xdr,
+ gfp_t gfp_flags);
+void pnfs_layout_mark_request_commit(struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
+void pnfs_lgopen_prepare(struct nfs4_opendata *data,
+ struct nfs_open_context *ctx);
+void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp,
+ struct nfs_open_context *ctx);
+void nfs4_lgopen_release(struct nfs4_layoutget *lgp);
+
+/* Report whether @inode currently has a layout header attached. */
+static inline bool nfs_have_layout(struct inode *inode)
+{
+	const struct nfs_inode *nfsi = NFS_I(inode);
+
+	return nfsi->layout != NULL;
+}
+
+/* True unless NFS_LAYOUT_INVALID_STID is set in the header's plh_flags. */
+static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo)
+{
+	return !test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+}
+
+/*
+ * Take a reference on @d by incrementing its 'ref' counter and return @d.
+ * @d must not be NULL: it is dereferenced unconditionally.
+ */
+static inline struct nfs4_deviceid_node *
+nfs4_get_deviceid(struct nfs4_deviceid_node *d)
+{
+ atomic_inc(&d->ref);
+ return d;
+}
+
+/*
+ * Take an additional reference on @lseg and return it.
+ *
+ * A NULL @lseg is passed straight through; otherwise pls_refcount is
+ * incremented, followed by smp_mb__after_atomic().
+ */
+static inline struct pnfs_layout_segment *
+pnfs_get_lseg(struct pnfs_layout_segment *lseg)
+{
+	if (lseg == NULL)
+		return NULL;
+
+	refcount_inc(&lseg->pls_refcount);
+	smp_mb__after_atomic();
+	return lseg;
+}
+
+/* True when NFS_LSEG_VALID is set in the segment's pls_flags. */
+static inline bool
+pnfs_is_valid_lseg(struct pnfs_layout_segment *lseg)
+{
+	if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
+		return true;
+	return false;
+}
+
+/* Returns 1 when a layout driver is attached to this nfs_server, else 0 */
+static inline int pnfs_enabled_sb(struct nfs_server *nfss)
+{
+	return nfss->pnfs_curr_ld ? 1 : 0;
+}
+
+/*
+ * Dispatch a commit to the commit_pagelist hook of cinfo->ds.
+ *
+ * Returns PNFS_NOT_ATTEMPTED when cinfo->ds is NULL or its ncommitting
+ * count is zero, otherwise the hook's return value.
+ */
+static inline int
+pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how,
+		 struct nfs_commit_info *cinfo)
+{
+	struct pnfs_ds_commit_info *ds_info = cinfo->ds;
+
+	if (ds_info != NULL && ds_info->ncommitting != 0)
+		return ds_info->ops->commit_pagelist(inode, mds_pages, how,
+						     cinfo);
+	return PNFS_NOT_ATTEMPTED;
+}
+
+/*
+ * Ask the inode's layout driver for its pnfs_ds_commit_info.
+ *
+ * Returns NULL when no driver is attached or the driver provides no
+ * get_ds_info hook.
+ */
+static inline struct pnfs_ds_commit_info *
+pnfs_get_ds_info(struct inode *inode)
+{
+	struct pnfs_layoutdriver_type *driver = NFS_SERVER(inode)->pnfs_curr_ld;
+
+	if (driver != NULL && driver->get_ds_info != NULL)
+		return driver->get_ds_info(inode);
+	return NULL;
+}
+
+/*
+ * Copy the ops pointer of the inode's pnfs_ds_commit_info into @fl_cinfo.
+ * @fl_cinfo is left untouched when pnfs_get_ds_info() returns NULL.
+ */
+static inline void
+pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+	struct pnfs_ds_commit_info *src = pnfs_get_ds_info(inode);
+
+	if (src == NULL)
+		return;
+	fl_cinfo->ops = src->ops;
+}
+
+/* Reset @fl_cinfo: no ops attached and an empty 'commits' list. */
+static inline void
+pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo)
+{
+	fl_cinfo->ops = NULL;
+	INIT_LIST_HEAD(&fl_cinfo->commits);
+}
+
+/*
+ * Invoke the release_ds_info hook for @fl_cinfo, if ops and the hook are
+ * both present; otherwise do nothing.
+ */
+static inline void
+pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+	if (fl_cinfo->ops == NULL)
+		return;
+	if (fl_cinfo->ops->release_ds_info == NULL)
+		return;
+	fl_cinfo->ops->release_ds_info(fl_cinfo, inode);
+}
+
+/* Set the NFS_DEVICEID_INVALID bit in @node's flags. */
+static inline void
+pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node)
+{
+ set_bit(NFS_DEVICEID_INVALID, &node->flags);
+}
+
+/*
+ * Pass @req to the mark_request_commit hook of cinfo->ds.
+ *
+ * Returns true when the hook was called.  Returns false, without touching
+ * @req, when @lseg is NULL or when cinfo->ds, its ops table or the hook
+ * itself is missing.
+ */
+static inline bool
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
+			 struct nfs_commit_info *cinfo, u32 ds_commit_idx)
+{
+	struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+	/*
+	 * cinfo->ds may be NULL (pnfs_commit_list(), pnfs_clear_request_commit()
+	 * and friends all test for it), so check it before reaching for ->ops.
+	 */
+	if (!lseg || !fl_cinfo || !fl_cinfo->ops ||
+	    !fl_cinfo->ops->mark_request_commit)
+		return false;
+	fl_cinfo->ops->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
+	return true;
+}
+
+/*
+ * Pass @req to the clear_request_commit hook of cinfo->ds.
+ *
+ * Returns true when the hook was called, false when cinfo->ds, its ops
+ * table or the hook is missing.
+ */
+static inline bool
+pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo)
+{
+	struct pnfs_ds_commit_info *ds_info = cinfo->ds;
+
+	if (ds_info && ds_info->ops && ds_info->ops->clear_request_commit) {
+		ds_info->ops->clear_request_commit(req, cinfo);
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Forward to the scan_commit_lists hook of cinfo->ds.
+ *
+ * Returns 0 when cinfo->ds is NULL or its nwritten count is zero, otherwise
+ * the hook's return value.  @inode is not used.
+ */
+static inline int
+pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
+		       int max)
+{
+	struct pnfs_ds_commit_info *ds_info = cinfo->ds;
+
+	if (ds_info != NULL && ds_info->nwritten != 0)
+		return ds_info->ops->scan_commit_lists(cinfo, max);
+	return 0;
+}
+
+/*
+ * Forward to the recover_commit_reqs hook of cinfo->ds; a no-op when
+ * cinfo->ds is NULL or its nwritten count is zero.
+ */
+static inline void
+pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo)
+{
+	struct pnfs_ds_commit_info *ds_info = cinfo->ds;
+
+	if (ds_info == NULL || ds_info->nwritten == 0)
+		return;
+	ds_info->ops->recover_commit_reqs(head, cinfo);
+}
+
+/*
+ * Look @page up through the search_commit_reqs hook of cinfo->ds.
+ *
+ * Returns the hook's result, or NULL when cinfo->ds, its ops table or the
+ * hook is missing.  @inode is not used.
+ */
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+			struct page *page)
+{
+	struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+	/*
+	 * cinfo->ds may be NULL (the sibling wrappers such as
+	 * pnfs_clear_request_commit() test for it), so guard the dereference.
+	 */
+	if (!fl_cinfo || !fl_cinfo->ops || !fl_cinfo->ops->search_commit_reqs)
+		return NULL;
+	return fl_cinfo->ops->search_commit_reqs(cinfo, page);
+}
+
+/*
+ * Should the pNFS client commit and return the layout upon a setattr?
+ * True only when a layout driver is attached and it sets
+ * PNFS_LAYOUTRET_ON_SETATTR in its flags.
+ */
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+
+	if (pnfs_enabled_sb(server))
+		return server->pnfs_curr_ld->flags & PNFS_LAYOUTRET_ON_SETATTR;
+	return false;
+}
+
+/*
+ * True only when a layout driver is attached and it sets
+ * PNFS_READ_WHOLE_PAGE in its flags.
+ */
+static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+
+	if (pnfs_enabled_sb(server))
+		return server->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE;
+	return false;
+}
+
+/*
+ * Call the layout driver's sync hook for @inode.
+ *
+ * Returns 0 when no layout driver is attached, otherwise the hook's return
+ * value.  The hook pointer itself is not checked for NULL.
+ */
+static inline int
+pnfs_sync_inode(struct inode *inode, bool datasync)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+
+	if (pnfs_enabled_sb(server))
+		return server->pnfs_curr_ld->sync(inode, datasync);
+	return 0;
+}
+
+/*
+ * True when either NFS_INO_LAYOUTCOMMIT or NFS_INO_LAYOUTCOMMITTING is set
+ * in the inode's NFS flags.
+ */
+static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	if (test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+		return true;
+	return test_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
+}
+
+/*
+ * Request the return of @ino's layout.
+ *
+ * When a layout driver is attached and the inode has a layout header, set
+ * NFS_LAYOUT_RETURN_REQUESTED on it and return the result of
+ * _pnfs_return_layout().  Otherwise return 0.
+ */
+static inline int pnfs_return_layout(struct inode *ino)
+{
+	struct nfs_server *nfss = NFS_SERVER(ino);
+	struct nfs_inode *nfsi = NFS_I(ino);
+
+	if (!pnfs_enabled_sb(nfss) || !nfsi->layout)
+		return 0;
+
+	set_bit(NFS_LAYOUT_RETURN_REQUESTED, &nfsi->layout->plh_flags);
+	return _pnfs_return_layout(ino);
+}
+
+/*
+ * True when @dst and @src are both non-NULL, @src has a non-zero bitmap
+ * (bm), and the attached layout driver's id equals src->l_type.
+ */
+static inline bool
+pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
+		   struct nfs_server *nfss)
+{
+	if (!dst || !src || src->bm == 0)
+		return false;
+	if (!nfss->pnfs_curr_ld)
+		return false;
+	return nfss->pnfs_curr_ld->id == src->l_type;
+}
+
+/*
+ * Last byte offset of the range that starts at @offset and spans @len bytes.
+ * Yields NFS4_MAX_UINT64 when @len is NFS4_MAX_UINT64 or when
+ * @len >= NFS4_MAX_UINT64 - @offset.
+ */
+static inline u64
+pnfs_calc_offset_end(u64 offset, u64 len)
+{
+	bool unbounded = len == NFS4_MAX_UINT64 ||
+			 len >= NFS4_MAX_UINT64 - offset;
+
+	return unbounded ? NFS4_MAX_UINT64 : offset + len - 1;
+}
+
+/*
+ * Length of the inclusive byte range [@offset, @end].
+ * Yields NFS4_MAX_UINT64 when @end is NFS4_MAX_UINT64 or @end <= @offset.
+ */
+static inline u64
+pnfs_calc_offset_length(u64 offset, u64 end)
+{
+	if (end != NFS4_MAX_UINT64 && end > offset)
+		return end - offset + 1;
+	return NFS4_MAX_UINT64;
+}
+
+/* Copy the whole of *@src into *@dst (a byte-wise memcpy of the struct). */
+static inline void
+pnfs_copy_range(struct pnfs_layout_range *dst,
+ const struct pnfs_layout_range *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+
+/*
+ * Exclusive end offset (@start + @len), clamped to NFS4_MAX_UINT64 whenever
+ * @len >= NFS4_MAX_UINT64 - @start.
+ */
+static inline u64
+pnfs_end_offset(u64 start, u64 len)
+{
+	u64 room = NFS4_MAX_UINT64 - start;
+
+	return len >= room ? NFS4_MAX_UINT64 : start + len;
+}
+
+/*
+ * Are 2 ranges intersecting?
+ *
+ * Each range is half-open, [start, end); an end of NFS4_MAX_UINT64 is
+ * treated as unbounded:
+ *
+ *	[start1 ............................ end1)
+ *	          [start2 ........ end2)
+ */
+static inline bool
+pnfs_is_range_intersecting(u64 start1, u64 end1, u64 start2, u64 end2)
+{
+	/* Range 2 starts at or beyond the (bounded) end of range 1 */
+	if (end1 != NFS4_MAX_UINT64 && start2 >= end1)
+		return false;
+	/* Range 1 starts at or beyond the (bounded) end of range 2 */
+	if (end2 != NFS4_MAX_UINT64 && start1 >= end2)
+		return false;
+	return true;
+}
+
+/*
+ * Do the two layout ranges overlap?  Each range's end is derived from its
+ * offset and length with pnfs_end_offset().
+ */
+static inline bool
+pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
+			     const struct pnfs_layout_range *l2)
+{
+	return pnfs_is_range_intersecting(l1->offset,
+			pnfs_end_offset(l1->offset, l1->length),
+			l2->offset,
+			pnfs_end_offset(l2->offset, l2->length));
+}
+
+/*
+ * Does the byte range of @req (req_offset(req) for wb_bytes bytes) overlap
+ * the range covered by @lseg?
+ */
+static inline bool
+pnfs_lseg_request_intersecting(struct pnfs_layout_segment *lseg, struct nfs_page *req)
+{
+	const struct pnfs_layout_range *range = &lseg->pls_range;
+	u64 seg_end = pnfs_end_offset(range->offset, range->length);
+	u64 req_end = req_offset(req) + req->wb_bytes;
+
+	return pnfs_is_range_intersecting(range->offset, seg_end,
+					  req_offset(req), req_end);
+}
+
+extern unsigned int layoutstats_timer;
+
+#ifdef NFS_DEBUG
+void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
+#else
+/* Stub used when NFS_DEBUG is not defined: prints nothing. */
+static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
+{
+}
+
+#endif /* NFS_DEBUG */
+#else /* CONFIG_NFS_V4_1 */
+
+/* !CONFIG_NFS_V4_1 stub: always reports that there is no layout. */
+static inline bool nfs_have_layout(struct inode *inode)
+{
+ return false;
+}
+
+/* !CONFIG_NFS_V4_1 stub: does nothing. */
+static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+}
+
+/* !CONFIG_NFS_V4_1 stub: does nothing. */
+static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+}
+
+/* !CONFIG_NFS_V4_1 stub: does nothing. */
+static inline void pnfs_destroy_layout_final(struct nfs_inode *nfsi)
+{
+}
+
+/*
+ * !CONFIG_NFS_V4_1 stub: takes no reference and returns NULL regardless of
+ * @lseg (unlike the CONFIG_NFS_V4_1 version, which returns @lseg).
+ */
+static inline struct pnfs_layout_segment *
+pnfs_get_lseg(struct pnfs_layout_segment *lseg)
+{
+ return NULL;
+}
+
+/* !CONFIG_NFS_V4_1 stub: does nothing. */
+static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg)
+{
+}
+
+/* !CONFIG_NFS_V4_1 stub: nothing to return, always 0. */
+static inline int pnfs_return_layout(struct inode *ino)
+{
+ return 0;
+}
+
+/* !CONFIG_NFS_V4_1 stub: always returns 0. */
+static inline int pnfs_commit_and_return_layout(struct inode *inode)
+{
+ return 0;
+}
+
+/* !CONFIG_NFS_V4_1 stub: always false. */
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+ return false;
+}
+
+/* !CONFIG_NFS_V4_1 stub: always false. */
+static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+ return false;
+}
+
+/* !CONFIG_NFS_V4_1 stub: no layout driver to sync through, always 0. */
+static inline int
+pnfs_sync_inode(struct inode *inode, bool datasync)
+{
+ return 0;
+}
+
+/* !CONFIG_NFS_V4_1 stub: always false. */
+static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+ return false;
+}
+
+
+/* !CONFIG_NFS_V4_1 stub: leaves @args and @res untouched, always false. */
+static inline bool
+pnfs_roc(struct inode *ino,
+ struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ const struct cred *cred)
+{
+ return false;
+}
+
+/* !CONFIG_NFS_V4_1 stub: modifies none of its arguments, always returns 0. */
+static inline int
+pnfs_roc_done(struct rpc_task *task,
+ struct nfs4_layoutreturn_args **argpp,
+ struct nfs4_layoutreturn_res **respp,
+ int *ret)
+{
+ return 0;
+}
+
+/* !CONFIG_NFS_V4_1 stub: does nothing. */
+static inline void
+pnfs_roc_release(struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ int ret)
+{
+}
+
+/* !CONFIG_NFS_V4_1 stub: never waits, always false. */
+static inline bool
+pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
+{
+ return false;
+}
+
+/* !CONFIG_NFS_V4_1 stub: does nothing. */
+static inline void set_pnfs_layoutdriver(struct nfs_server *s,
+ const struct nfs_fh *mntfh,
+ struct nfs_fsinfo *fsinfo)
+{
+}
+
+/* !CONFIG_NFS_V4_1 stub: does nothing. */
+static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
+{
+}
+
+/* !CONFIG_NFS_V4_1 stub: always PNFS_NOT_ATTEMPTED. */
+static inline int
+pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how,
+ struct nfs_commit_info *cinfo)
+{
+ return PNFS_NOT_ATTEMPTED;
+}
+
+/* !CONFIG_NFS_V4_1 stub: always NULL. */
+static inline struct pnfs_ds_commit_info *
+pnfs_get_ds_info(struct inode *inode)
+{
+ return NULL;
+}
+
+/* !CONFIG_NFS_V4_1 stub: leaves @fl_cinfo untouched. */
+static inline void
+pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+}
+
+/* !CONFIG_NFS_V4_1 stub: leaves @fl_cinfo untouched. */
+static inline void
+pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo)
+{
+}
+
+/* !CONFIG_NFS_V4_1 stub: does nothing. */
+static inline void
+pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+}
+
+/* !CONFIG_NFS_V4_1 stub: never handles @req, always false. */
+static inline bool
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo, u32 ds_commit_idx)
+{
+ return false;
+}
+
+/* !CONFIG_NFS_V4_1 stub: never handles @req, always false. */
+static inline bool
+pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo)
+{
+ return false;
+}
+
+/* !CONFIG_NFS_V4_1 stub: always returns 0. */
+static inline int
+pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
+ int max)
+{
+ return 0;
+}
+
+/* !CONFIG_NFS_V4_1 stub: leaves @head untouched. */
+static inline void
+pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo)
+{
+}
+
+/* !CONFIG_NFS_V4_1 stub: never finds a request, always NULL. */
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+ struct page *page)
+{
+ return NULL;
+}
+
+/* !CONFIG_NFS_V4_1 stub: always returns 0. */
+static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
+{
+ return 0;
+}
+
+/* !CONFIG_NFS_V4_1 stub: thresholds are never used, always false. */
+static inline bool
+pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
+ struct nfs_server *nfss)
+{
+ return false;
+}
+
+/* !CONFIG_NFS_V4_1 stub: allocates nothing, always NULL. */
+static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
+{
+ return NULL;
+}
+
+/* !CONFIG_NFS_V4_1 stub: does nothing. */
+static inline void nfs4_pnfs_v3_ds_connect_unload(void)
+{
+}
+
+/* !CONFIG_NFS_V4_1 stub: leaves @dst and @dst_range untouched, always false. */
+static inline bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
+ struct pnfs_layout_range *dst_range,
+ struct inode *inode)
+{
+ return false;
+}
+
+/* !CONFIG_NFS_V4_1 stub: does nothing. */
+static inline void pnfs_lgopen_prepare(struct nfs4_opendata *data,
+ struct nfs_open_context *ctx)
+{
+}
+
+/* !CONFIG_NFS_V4_1 stub: does nothing. */
+static inline void pnfs_parse_lgopen(struct inode *ino,
+ struct nfs4_layoutget *lgp,
+ struct nfs_open_context *ctx)
+{
+}
+
+/* !CONFIG_NFS_V4_1 stub: does nothing. */
+static inline void nfs4_lgopen_release(struct nfs4_layoutget *lgp)
+{
+}
+
+/* !CONFIG_NFS_V4_1 stub: @lo is not examined; every layout is reported invalid. */
+static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo)
+{
+ return false;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+#if IS_ENABLED(CONFIG_NFS_V4_2)
+int pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags);
+#else
+/* Stub used when CONFIG_NFS_V4_2 is not enabled: reports nothing, returns 0. */
+static inline int
+pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
+{
+ return 0;
+}
+#endif
+
+#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
new file mode 100644
index 000000000..d4829f3f2
--- /dev/null
+++ b/fs/nfs/pnfs_dev.c
@@ -0,0 +1,377 @@
+/*
+ * Device operations for the pnfs client.
+ *
+ * Copyright (c) 2002
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
+ * Dean Hildebrand <dhildebz@umich.edu>
+ * Garth Goodson <Garth.Goodson@netapp.com>
+ *
+ * Permission is granted to use, copy, create derivative works, and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the University of Michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. If
+ * the above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * This software is provided as is, without representation or warranty
+ * of any kind either express or implied, including without limitation
+ * the implied warranties of merchantability, fitness for a particular
+ * purpose, or noninfringement. The Regents of the University of
+ * Michigan shall not be liable for any damages, including special,
+ * indirect, incidental, or consequential damages, with respect to any
+ * claim arising out of or in connection with the use of the software,
+ * even if it has been or is hereafter advised of the possibility of
+ * such damages.
+ */
+
+#include <linux/export.h>
+#include <linux/nfs_fs.h>
+#include "nfs4session.h"
+#include "internal.h"
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS
+
+/*
+ * Device ID RCU cache. A device ID is unique per server and layout type.
+ */
+#define NFS4_DEVICE_ID_HASH_BITS 5
+#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
+#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
+
+
+static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
+static DEFINE_SPINLOCK(nfs4_deviceid_lock);
+
+#ifdef NFS_DEBUG
+/*
+ * Debug helper: emit the device id through dprintk() as four 32-bit
+ * words printed in hex, back to back.
+ */
+void
+nfs4_print_deviceid(const struct nfs4_deviceid *id)
+{
+	const u32 *words = (const u32 *)id;
+
+	dprintk("%s: device id= [%x%x%x%x]\n", __func__,
+		words[0], words[1], words[2], words[3]);
+}
+EXPORT_SYMBOL_GPL(nfs4_print_deviceid);
+#endif
+
+/*
+ * Map a device id to a bucket index of nfs4_deviceid_cache.
+ *
+ * Folds the NFS4_DEVICEID4_SIZE bytes of @id->data with a multiply-by-37
+ * polynomial and masks the result down to the table size.
+ */
+static inline u32
+nfs4_deviceid_hash(const struct nfs4_deviceid *id)
+{
+	const unsigned char *bytes = (const unsigned char *)id->data;
+	unsigned int pos;
+	u32 acc = 0;
+
+	for (pos = 0; pos < NFS4_DEVICEID4_SIZE; pos++)
+		acc = acc * 37 + bytes[pos];
+
+	return acc & NFS4_DEVICE_ID_HASH_MASK;
+}
+
+/*
+ * Walk one hash bucket looking for a node that matches the layout driver,
+ * the nfs_client and the device id, and whose reference count is non-zero.
+ *
+ * Uses the RCU list iterator; no reference is taken on the returned node.
+ * Returns NULL when the bucket holds no such node.
+ */
+static struct nfs4_deviceid_node *
+_lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
+		 const struct nfs_client *clp, const struct nfs4_deviceid *id,
+		 long hash)
+{
+	struct nfs4_deviceid_node *node;
+
+	hlist_for_each_entry_rcu(node, &nfs4_deviceid_cache[hash], node) {
+		if (node->ld != ld || node->nfs_client != clp)
+			continue;
+		if (memcmp(&node->deviceid, id, sizeof(*id)) != 0)
+			continue;
+		/* a matching entry whose count already hit zero is skipped */
+		if (atomic_read(&node->ref))
+			return node;
+	}
+	return NULL;
+}
+
+/*
+ * Issue GETDEVICEINFO for @dev_id and let the current layout driver turn
+ * the reply into a deviceid node.
+ *
+ * A temporary struct pnfs_device and an array of reply pages are allocated
+ * with @gfp_flags and always freed again before returning.
+ *
+ * Returns the node produced by the driver's alloc_deviceid_node(), or NULL
+ * if an allocation failed, the RPC returned non-zero, or the driver
+ * returned NULL.
+ */
+static struct nfs4_deviceid_node *
+nfs4_get_device_info(struct nfs_server *server,
+ const struct nfs4_deviceid *dev_id,
+ const struct cred *cred, gfp_t gfp_flags)
+{
+ struct nfs4_deviceid_node *d = NULL;
+ struct pnfs_device *pdev = NULL;
+ struct page **pages = NULL;
+ u32 max_resp_sz;
+ int max_pages;
+ int rc, i;
+
+ /*
+ * Use the session max response size as the basis for setting
+ * GETDEVICEINFO's maxcount
+ */
+ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+ /* a non-zero, smaller driver limit overrides the session value */
+ if (server->pnfs_curr_ld->max_deviceinfo_size &&
+ server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz)
+ max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size;
+ max_pages = nfs_page_array_len(0, max_resp_sz);
+ dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
+ __func__, server, max_resp_sz, max_pages);
+
+ pdev = kzalloc(sizeof(*pdev), gfp_flags);
+ if (!pdev)
+ return NULL;
+
+ pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
+ if (!pages)
+ goto out_free_pdev;
+
+ /* on failure 'i' is the number of pages successfully allocated */
+ for (i = 0; i < max_pages; i++) {
+ pages[i] = alloc_page(gfp_flags);
+ if (!pages[i])
+ goto out_free_pages;
+ }
+
+ memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
+ pdev->layout_type = server->pnfs_curr_ld->id;
+ pdev->pages = pages;
+ pdev->pgbase = 0;
+ pdev->pglen = max_resp_sz;
+ pdev->mincount = 0;
+ pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
+
+ rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
+ dprintk("%s getdevice info returns %d\n", __func__, rc);
+ if (rc)
+ goto out_free_pages;
+
+ /*
+ * Found new device, need to decode it and then add it to the
+ * list of known devices for this mountpoint.
+ */
+ d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
+ gfp_flags);
+ if (d && pdev->nocache)
+ set_bit(NFS_DEVICEID_NOCACHE, &d->flags);
+
+ /* success falls through: i == max_pages here, so every page is freed */
+out_free_pages:
+ while (--i >= 0)
+ __free_page(pages[i]);
+ kfree(pages);
+out_free_pdev:
+ kfree(pdev);
+ dprintk("<-- %s d %p\n", __func__, d);
+ return d;
+}
+
+/*
+ * Lookup a deviceid in cache and get a reference count on it if found
+ *
+ * @server nfs_server whose layout driver and nfs_client key the lookup
+ * @id deviceid to look up
+ * @hash index of the cache bucket to search
+ *
+ * Returns the node with its reference count raised by one, or NULL if no
+ * matching node with a non-zero count exists.
+ */
+static struct nfs4_deviceid_node *
+__nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, long hash)
+{
+ struct nfs4_deviceid_node *d;
+
+ rcu_read_lock();
+ d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
+ hash);
+ /* the count may have dropped to zero since the lookup: do not revive */
+ if (d != NULL && !atomic_inc_not_zero(&d->ref))
+ d = NULL;
+ rcu_read_unlock();
+ return d;
+}
+
+/*
+ * Return a referenced deviceid node for @id, fetching it from the server
+ * on a cache miss.
+ *
+ * @server nfs_server providing the layout driver and nfs_client
+ * @id deviceid to find
+ * @cred credential passed on to the GETDEVICEINFO helper
+ * @gfp_mask allocation flags passed on to the GETDEVICEINFO helper
+ *
+ * Returns NULL if the node is not cached and could not be obtained.
+ */
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, const struct cred *cred,
+ gfp_t gfp_mask)
+{
+ long hash = nfs4_deviceid_hash(id);
+ struct nfs4_deviceid_node *d, *new;
+
+ /* fast path: already cached */
+ d = __nfs4_find_get_deviceid(server, id, hash);
+ if (d)
+ return d;
+
+ /* not cached: fetch without holding nfs4_deviceid_lock */
+ new = nfs4_get_device_info(server, id, cred, gfp_mask);
+ if (!new)
+ return new;
+
+ spin_lock(&nfs4_deviceid_lock);
+ /* re-check under the lock: another task may have inserted it meanwhile */
+ d = __nfs4_find_get_deviceid(server, id, hash);
+ if (d) {
+ spin_unlock(&nfs4_deviceid_lock);
+ server->pnfs_curr_ld->free_deviceid_node(new);
+ return d;
+ }
+ hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
+ /* extra reference for the cache, on top of the one handed to the caller */
+ atomic_inc(&new->ref);
+ spin_unlock(&nfs4_deviceid_lock);
+
+ return new;
+}
+EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
+
+/*
+ * Remove a deviceid from cache
+ *
+ * @ld layout driver the deviceid belongs to
+ * @clp nfs_client associated with deviceid
+ * @id the deviceid to unhash
+ *
+ * If a matching node is found it is unhashed, its NFS_DEVICEID_NOCACHE
+ * flag is cleared and one reference is dropped. Nothing is returned.
+ */
+void
+nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
+ const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+ struct nfs4_deviceid_node *d;
+
+ spin_lock(&nfs4_deviceid_lock);
+ rcu_read_lock();
+ d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
+ rcu_read_unlock();
+ if (!d) {
+ spin_unlock(&nfs4_deviceid_lock);
+ return;
+ }
+ hlist_del_init_rcu(&d->node);
+ /* cleared so the put below takes the plain decrement path */
+ clear_bit(NFS_DEVICEID_NOCACHE, &d->flags);
+ spin_unlock(&nfs4_deviceid_lock);
+
+ /* balance the reference taken when the node was hashed into the cache */
+ nfs4_put_deviceid_node(d);
+}
+EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
+
+/*
+ * Initialise the generic part of a deviceid node: copy in the device id,
+ * bind the node to @server's layout driver and nfs_client, clear the flags,
+ * reset both hash-list links and start the reference count at 1.
+ */
+void
+nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server,
+			const struct nfs4_deviceid *id)
+{
+	/* identity */
+	d->deviceid = *id;
+	d->ld = server->pnfs_curr_ld;
+	d->nfs_client = server->nfs_client;
+
+	/* state */
+	d->flags = 0;
+	INIT_HLIST_NODE(&d->tmpnode);
+	INIT_HLIST_NODE(&d->node);
+	atomic_set(&d->ref, 1);
+}
+EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
+
+/*
+ * Dereference a deviceid node and delete it when its reference count drops
+ * to zero.
+ *
+ * @d deviceid node to put
+ *
+ * return true iff the node was deleted
+ * Note that the test for d->ref == 0 is sufficient to establish
+ * that the node is no longer hashed in the global device id cache.
+ */
+bool
+nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
+{
+ if (test_bit(NFS_DEVICEID_NOCACHE, &d->flags)) {
+ /* plain decrement unless the count is exactly 2 */
+ if (atomic_add_unless(&d->ref, -1, 2))
+ return false;
+ /*
+ * Count was 2: unhash the node. nfs4_delete_deviceid() clears
+ * NOCACHE and drops one reference itself; ours is dropped below.
+ */
+ nfs4_delete_deviceid(d->ld, d->nfs_client, &d->deviceid);
+ }
+ if (!atomic_dec_and_test(&d->ref))
+ return false;
+ d->ld->free_deviceid_node(d);
+ return true;
+}
+EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
+
+/*
+ * Clear NFS_DEVICEID_UNAVAILABLE on @node if it is currently set, followed
+ * by a memory barrier. Does nothing when the flag is already clear.
+ */
+void
+nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node)
+{
+	if (!test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags))
+		return;
+
+	clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+	smp_mb__after_atomic();
+}
+EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_available);
+
+/*
+ * Record the current jiffies in @node->timestamp_unavailable and then set
+ * NFS_DEVICEID_UNAVAILABLE. Barriers are placed on both sides of set_bit(),
+ * so the timestamp store is ordered before the flag becomes visible.
+ */
+void
+nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node)
+{
+ node->timestamp_unavailable = jiffies;
+ smp_mb__before_atomic();
+ set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+ smp_mb__after_atomic();
+}
+EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable);
+
+/*
+ * Report whether @node is still inside its "unavailable" window.
+ *
+ * Returns true when NFS_DEVICEID_UNAVAILABLE is set and the recorded
+ * timestamp lies within the last PNFS_DEVICE_RETRY_TIMEOUT jiffies.
+ * If the flag is set but the window has passed, the flag is cleared
+ * and false is returned.
+ */
+bool
+nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node)
+{
+	unsigned long now, window_start;
+
+	if (!test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags))
+		return false;
+
+	now = jiffies;
+	window_start = now - PNFS_DEVICE_RETRY_TIMEOUT;
+	if (time_in_range(node->timestamp_unavailable, window_start, now))
+		return true;
+
+	/* window expired: let callers try the device again */
+	clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+	smp_mb__after_atomic();
+	return false;
+}
+EXPORT_SYMBOL_GPL(nfs4_test_deviceid_unavailable);
+
+/*
+ * Unhash every node in bucket @hash that belongs to @clp and drop one
+ * reference on each.
+ *
+ * Matching nodes are moved to a private list (linked through ->tmpnode)
+ * under nfs4_deviceid_lock; the puts happen after the lock is released.
+ */
+static void
+_deviceid_purge_client(const struct nfs_client *clp, long hash)
+{
+ struct nfs4_deviceid_node *d;
+ HLIST_HEAD(tmp);
+
+ spin_lock(&nfs4_deviceid_lock);
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node)
+ if (d->nfs_client == clp && atomic_read(&d->ref)) {
+ hlist_del_init_rcu(&d->node);
+ hlist_add_head(&d->tmpnode, &tmp);
+ /* already unhashed here, so the put need not do it again */
+ clear_bit(NFS_DEVICEID_NOCACHE, &d->flags);
+ }
+ rcu_read_unlock();
+ spin_unlock(&nfs4_deviceid_lock);
+
+ if (hlist_empty(&tmp))
+ return;
+
+ /* drop the references outside the spinlock */
+ while (!hlist_empty(&tmp)) {
+ d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode);
+ hlist_del(&d->tmpnode);
+ nfs4_put_deviceid_node(d);
+ }
+}
+
+/*
+ * Purge all cached deviceids of @clp by running _deviceid_purge_client()
+ * over every hash bucket. Does nothing unless @clp's exchange flags
+ * include EXCHGID4_FLAG_USE_PNFS_MDS.
+ */
+void
+nfs4_deviceid_purge_client(const struct nfs_client *clp)
+{
+	long bucket;
+
+	if ((clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS) == 0)
+		return;
+
+	for (bucket = 0; bucket < NFS4_DEVICE_ID_HASH_SIZE; bucket++)
+		_deviceid_purge_client(clp, bucket);
+}
+
+/*
+ * Stop use of all deviceids associated with an nfs_client: set
+ * NFS_DEVICEID_INVALID on every cached node owned by @clp.
+ * The whole table is walked under rcu_read_lock().
+ */
+void
+nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
+{
+	struct nfs4_deviceid_node *node;
+	int bucket;
+
+	rcu_read_lock();
+	for (bucket = 0; bucket < NFS4_DEVICE_ID_HASH_SIZE; bucket++) {
+		hlist_for_each_entry_rcu(node, &nfs4_deviceid_cache[bucket], node) {
+			if (node->nfs_client != clp)
+				continue;
+			set_bit(NFS_DEVICEID_INVALID, &node->flags);
+		}
+	}
+	rcu_read_unlock();
+}
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
new file mode 100644
index 000000000..a2ad8bb87
--- /dev/null
+++ b/fs/nfs/pnfs_nfs.c
@@ -0,0 +1,1215 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Common NFS I/O operations for the pnfs file based
+ * layout drivers.
+ *
+ * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
+ *
+ * Tom Haynes <loghyr@primarydata.com>
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/module.h>
+
+#include "nfs4session.h"
+#include "internal.h"
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS
+
+/*
+ * Release callback for a read/write header: drop the reference on the
+ * data server client (hdr->ds_clp), then chain to the MDS rpc_release op.
+ *
+ * @data the struct nfs_pgio_header, passed as an opaque pointer
+ */
+void pnfs_generic_rw_release(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ nfs_put_client(hdr->ds_clp);
+ hdr->mds_ops->rpc_release(data);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_rw_release);
+
+/*
+ * Fake up some data that will cause nfs_commit_release to retry the writes:
+ * the task status is forced to 0, the verifier is zeroed and the result is
+ * marked NFS_UNSTABLE.
+ */
+void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data)
+{
+ struct nfs_writeverf *verf = data->res.verf;
+
+ data->task.tk_status = 0;
+ memset(&verf->verifier, 0, sizeof(verf->verifier));
+ verf->committed = NFS_UNSTABLE;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_prepare_to_resend_writes);
+
+/*
+ * rpc_call_done callback for a commit: forwards straight to the MDS
+ * rpc_call_done op stored in the commit data.
+ *
+ * @task the RPC task that completed
+ * @data the struct nfs_commit_data, passed as an opaque pointer
+ */
+void pnfs_generic_write_commit_done(struct rpc_task *task, void *data)
+{
+ struct nfs_commit_data *wdata = data;
+
+ /* Note this may cause RPC to be resent */
+ wdata->mds_ops->rpc_call_done(task, data);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_write_commit_done);
+
+/*
+ * Release callback for a commit: run the completion op, then drop the
+ * layout segment reference, the data server client reference, and finally
+ * release the commit data itself.
+ *
+ * @calldata the struct nfs_commit_data, passed as an opaque pointer
+ */
+void pnfs_generic_commit_release(void *calldata)
+{
+ struct nfs_commit_data *data = calldata;
+
+ data->completion_ops->completion(data);
+ pnfs_put_lseg(data->lseg);
+ nfs_put_client(data->ds_clp);
+ nfs_commitdata_release(data);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
+
+/*
+ * Detach the layout segment from @bucket once the bucket is idle.
+ *
+ * If both the committing and the written lists are empty, bucket->lseg is
+ * cleared and its previous value returned so the caller can put it.
+ * Otherwise the bucket is left alone and NULL is returned.
+ */
+static struct pnfs_layout_segment *
+pnfs_free_bucket_lseg(struct pnfs_commit_bucket *bucket)
+{
+	struct pnfs_layout_segment *detached;
+
+	if (!list_empty(&bucket->committing) || !list_empty(&bucket->written))
+		return NULL;
+
+	detached = bucket->lseg;
+	bucket->lseg = NULL;
+	return detached;
+}
+
+/* The generic layer is about to remove the req from the commit list.
+ * If this will make the bucket empty, it will need to put the lseg reference.
+ * Note this must be called holding nfsi->commit_mutex
+ *
+ * @req request being taken off its commit list
+ * @cinfo commit info whose ds->nwritten counter is adjusted
+ */
+void
+pnfs_generic_clear_request_commit(struct nfs_page *req,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_commit_bucket *bucket = NULL;
+
+ /* flag not set: only the generic list removal below is performed */
+ if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
+ goto out;
+ cinfo->ds->nwritten--;
+ /*
+ * req is the only entry on its list, so its one neighbour is the list
+ * head, which is taken to be a bucket's "written" head.
+ */
+ if (list_is_singular(&req->wb_list))
+ bucket = list_first_entry(&req->wb_list,
+ struct pnfs_commit_bucket, written);
+out:
+ nfs_request_remove_commit_list(req, cinfo);
+ /* pnfs_free_bucket_lseg() yields NULL if the bucket is still in use */
+ if (bucket)
+ pnfs_put_lseg(pnfs_free_bucket_lseg(bucket));
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit);
+
+/*
+ * Allocate a commit array with room for @n buckets.
+ *
+ * The list links are initialised, the array and every bucket start with a
+ * NULL lseg, both bucket lists start empty and each bucket's direct_verf
+ * is marked NFS_INVALID_STABLE_HOW. The refcount is not set here.
+ *
+ * Returns NULL if the allocation fails.
+ */
+struct pnfs_commit_array *
+pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags)
+{
+	struct pnfs_commit_array *array;
+	size_t i;
+
+	array = kmalloc(struct_size(array, buckets, n), gfp_flags);
+	if (array == NULL)
+		return NULL;
+
+	array->nbuckets = n;
+	INIT_LIST_HEAD(&array->cinfo_list);
+	INIT_LIST_HEAD(&array->lseg_list);
+	array->lseg = NULL;
+
+	for (i = 0; i < n; i++) {
+		struct pnfs_commit_bucket *bucket = &array->buckets[i];
+
+		INIT_LIST_HEAD(&bucket->written);
+		INIT_LIST_HEAD(&bucket->committing);
+		bucket->lseg = NULL;
+		bucket->direct_verf.committed = NFS_INVALID_STABLE_HOW;
+	}
+	return array;
+}
+EXPORT_SYMBOL_GPL(pnfs_alloc_commit_array);
+
+/*
+ * Free a commit array via kfree_rcu(), i.e. the memory is released only
+ * after an RCU grace period.
+ */
+void
+pnfs_free_commit_array(struct pnfs_commit_array *p)
+{
+ kfree_rcu(p, rcu);
+}
+EXPORT_SYMBOL_GPL(pnfs_free_commit_array);
+
+/*
+ * Search @fl_cinfo->commits for the array attached to @lseg.
+ *
+ * Uses the RCU list iterator and takes no reference on the result.
+ * Returns NULL if no array is attached to @lseg.
+ */
+static struct pnfs_commit_array *
+pnfs_find_commit_array_by_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array;
+
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ if (array->lseg == lseg)
+ return array;
+ }
+ return NULL;
+}
+
+/*
+ * Attach @new to @lseg unless an array for @lseg is already registered.
+ *
+ * Returns the existing array if there is one (@new is then left untouched
+ * and remains the caller's to dispose of); otherwise @new, now linked on
+ * both @fl_cinfo->commits and @lseg->pls_commits with a refcount of 1.
+ *
+ * NOTE(review): no locking is done here; presumably the caller serialises
+ * updates to these lists - confirm against the callers.
+ */
+struct pnfs_commit_array *
+pnfs_add_commit_array(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_commit_array *new,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array;
+
+ array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+ if (array)
+ return array;
+ new->lseg = lseg;
+ refcount_set(&new->refcount, 1);
+ list_add_rcu(&new->cinfo_list, &fl_cinfo->commits);
+ list_add(&new->lseg_list, &lseg->pls_commits);
+ return new;
+}
+EXPORT_SYMBOL_GPL(pnfs_add_commit_array);
+
+/*
+ * Find the commit array for @lseg, asking the layout driver to set one up
+ * if none exists yet.
+ *
+ * The setup_ds_info() op is called outside the RCU read section, then the
+ * lookup is repeated. The result may still be NULL, and no reference is
+ * taken on it.
+ */
+static struct pnfs_commit_array *
+pnfs_lookup_commit_array(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array;
+
+ rcu_read_lock();
+ array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+ if (!array) {
+ rcu_read_unlock();
+ fl_cinfo->ops->setup_ds_info(fl_cinfo, lseg);
+ rcu_read_lock();
+ array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+ }
+ rcu_read_unlock();
+ return array;
+}
+
+/*
+ * Unlink @array from both of its lists and free it (RCU-deferred).
+ * NOTE(review): the "_locked" suffix suggests the caller holds the lock
+ * protecting these lists (pnfs_put_commit_array() calls this with
+ * inode->i_lock held) - confirm for the other callers.
+ */
+static void
+pnfs_release_commit_array_locked(struct pnfs_commit_array *array)
+{
+ list_del_rcu(&array->cinfo_list);
+ list_del(&array->lseg_list);
+ pnfs_free_commit_array(array);
+}
+
+/*
+ * Drop one reference on @array; on the last put, unlink and free it.
+ * Takes no lock itself.
+ */
+static void
+pnfs_put_commit_array_locked(struct pnfs_commit_array *array)
+{
+ if (refcount_dec_and_test(&array->refcount))
+ pnfs_release_commit_array_locked(array);
+}
+
+/*
+ * Drop one reference on @array. If it was the last one, @inode->i_lock is
+ * taken (by refcount_dec_and_lock()) around the unlink-and-free.
+ */
+static void
+pnfs_put_commit_array(struct pnfs_commit_array *array, struct inode *inode)
+{
+ if (refcount_dec_and_lock(&array->refcount, &inode->i_lock)) {
+ pnfs_release_commit_array_locked(array);
+ spin_unlock(&inode->i_lock);
+ }
+}
+
+/*
+ * Try to take a reference on @array.
+ * Returns @array on success, or NULL if its refcount had already reached
+ * zero.
+ */
+static struct pnfs_commit_array *
+pnfs_get_commit_array(struct pnfs_commit_array *array)
+{
+	return refcount_inc_not_zero(&array->refcount) ? array : NULL;
+}
+
+/*
+ * Detach @array from its layout segment (lseg pointer cleared, lseg_list
+ * link reset) and drop one reference without taking any lock.
+ */
+static void
+pnfs_remove_and_free_commit_array(struct pnfs_commit_array *array)
+{
+ array->lseg = NULL;
+ /* _init variant: the later list_del() on final release stays safe */
+ list_del_init(&array->lseg_list);
+ pnfs_put_commit_array_locked(array);
+}
+
+/*
+ * Detach and put every commit array linked on @lseg->pls_commits.
+ * @fl_cinfo is not used by this implementation.
+ */
+void
+pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array, *tmp;
+
+ /* _safe iterator: each array is unlinked from this list as we go */
+ list_for_each_entry_safe(array, tmp, &lseg->pls_commits, lseg_list)
+ pnfs_remove_and_free_commit_array(array);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_release_lseg);
+
+/*
+ * Detach and put every commit array linked on @fl_cinfo->commits.
+ */
+void
+pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo)
+{
+ struct pnfs_commit_array *array, *tmp;
+
+ /* _safe iterator: the put may unlink the array from this list */
+ list_for_each_entry_safe(array, tmp, &fl_cinfo->commits, cinfo_list)
+ pnfs_remove_and_free_commit_array(array);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_destroy);
+
+/*
+ * Locks the nfs_page requests for commit and moves them to
+ * @bucket->committing.
+ *
+ * At most @max requests are passed on to nfs_scan_commit_list(); the
+ * nwritten/ncommitting counters in @cinfo->ds are adjusted by the number
+ * moved. The inode's commit_mutex must be held (asserted via lockdep).
+ *
+ * Returns the number of requests moved.
+ */
+static int
+pnfs_bucket_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
+ struct nfs_commit_info *cinfo,
+ int max)
+{
+ struct list_head *src = &bucket->written;
+ struct list_head *dst = &bucket->committing;
+ int ret;
+
+ lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
+ ret = nfs_scan_commit_list(src, dst, cinfo, max);
+ if (ret) {
+ cinfo->ds->nwritten -= ret;
+ cinfo->ds->ncommitting += ret;
+ }
+ return ret;
+}
+
+/*
+ * Scan up to @nbuckets buckets in order, moving requests from each
+ * bucket's written list to its committing list until @max requests have
+ * been moved in total or the buckets run out.
+ *
+ * Returns the total number of requests moved.
+ */
+static int pnfs_bucket_scan_array(struct nfs_commit_info *cinfo,
+				  struct pnfs_commit_bucket *buckets,
+				  unsigned int nbuckets,
+				  int max)
+{
+	unsigned int pos = 0;
+	int total = 0;
+
+	while (pos < nbuckets && max != 0) {
+		int moved = pnfs_bucket_scan_ds_commit_list(&buckets[pos],
+							    cinfo, max);
+
+		total += moved;
+		max -= moved;
+		pos++;
+	}
+	return total;
+}
+
+/* Move reqs from written to committing lists, returning count
+ * of number moved.
+ *
+ * Walks every commit array of @cinfo->ds that still has an lseg, stopping
+ * once @max requests have been moved.
+ */
+int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_array *array;
+ int rv = 0, cnt;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ /* skip arrays already detached from their lseg or being freed */
+ if (!array->lseg || !pnfs_get_commit_array(array))
+ continue;
+ /* the reference pins the array while the scan runs outside RCU */
+ rcu_read_unlock();
+ cnt = pnfs_bucket_scan_array(cinfo, array->buckets,
+ array->nbuckets, max);
+ /* re-enter the RCU section before dropping the reference */
+ rcu_read_lock();
+ pnfs_put_commit_array(array, cinfo->inode);
+ rv += cnt;
+ max -= cnt;
+ if (!max)
+ break;
+ }
+ rcu_read_unlock();
+ return rv;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists);
+
+/*
+ * Move the requests on each bucket's written list to @dst.
+ *
+ * When a bucket becomes completely idle its lseg reference is dropped and
+ * the scan restarts from the first bucket.
+ * NOTE(review): presumably the restart is needed because the put may
+ * change the bucket state - confirm.
+ *
+ * Returns the total number of requests moved.
+ */
+static unsigned int
+pnfs_bucket_recover_commit_reqs(struct list_head *dst,
+ struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_commit_bucket *b;
+ struct pnfs_layout_segment *freeme;
+ unsigned int nwritten, ret = 0;
+ unsigned int i;
+
+restart:
+ for (i = 0, b = buckets; i < nbuckets; i++, b++) {
+ /* a max of 0 is passed here, unlike the bounded scan path */
+ nwritten = nfs_scan_commit_list(&b->written, dst, cinfo, 0);
+ if (!nwritten)
+ continue;
+ ret += nwritten;
+ freeme = pnfs_free_bucket_lseg(b);
+ if (freeme) {
+ pnfs_put_lseg(freeme);
+ goto restart;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Pull everything off the buckets' written lists and dump into @dst.
+ *
+ * Every commit array of @cinfo->ds that still has an lseg is visited and
+ * fl_cinfo->nwritten is reduced by the number of requests moved.
+ * The inode's commit_mutex must be held (asserted via lockdep).
+ */
+void pnfs_generic_recover_commit_reqs(struct list_head *dst,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_array *array;
+ unsigned int nwritten;
+
+ lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ /* skip arrays already detached from their lseg or being freed */
+ if (!array->lseg || !pnfs_get_commit_array(array))
+ continue;
+ /* the reference pins the array while we work outside RCU */
+ rcu_read_unlock();
+ nwritten = pnfs_bucket_recover_commit_reqs(dst,
+ array->buckets,
+ array->nbuckets,
+ cinfo);
+ rcu_read_lock();
+ pnfs_put_commit_array(array, cinfo->inode);
+ fl_cinfo->nwritten -= nwritten;
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs);
+
+/*
+ * Look through the written and committing lists of @nbuckets buckets for a
+ * request whose wb_page is @page.
+ *
+ * Returns that request's wb_head, or NULL if nothing matches.
+ */
+static struct nfs_page *
+pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets, struct page *page)
+{
+ struct nfs_page *req;
+ struct pnfs_commit_bucket *b;
+ unsigned int i;
+
+ /* Linearly search the commit lists for each bucket until a matching
+ * request is found */
+ for (i = 0, b = buckets; i < nbuckets; i++, b++) {
+ list_for_each_entry(req, &b->written, wb_list) {
+ if (req->wb_page == page)
+ return req->wb_head;
+ }
+ list_for_each_entry(req, &b->committing, wb_list) {
+ if (req->wb_page == page)
+ return req->wb_head;
+ }
+ }
+ return NULL;
+}
+
+/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head request
+ * for @page
+ * @cinfo - commit info for current inode
+ * @page - page to search for matching head request
+ *
+ * Returns the head request if one is found, otherwise returns NULL.
+ */
+struct nfs_page *
+pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_array *array;
+ struct nfs_page *req;
+
+ /*
+ * NOTE(review): plain (non-RCU) iteration with no array reference
+ * taken; presumably the caller holds a lock that keeps the list
+ * stable - confirm.
+ */
+ list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) {
+ req = pnfs_bucket_search_commit_reqs(array->buckets,
+ array->nbuckets, page);
+ if (req)
+ return req;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_search_commit_reqs);
+
+/*
+ * Move every request on @bucket->committing to @head and return a layout
+ * segment reference for the caller to put.
+ *
+ * cinfo->ds->ncommitting is decremented once per request moved. If the
+ * bucket is idle afterwards its own lseg reference is handed over;
+ * otherwise pnfs_get_lseg() is called on bucket->lseg and its result
+ * returned.
+ */
+static struct pnfs_layout_segment *
+pnfs_bucket_get_committing(struct list_head *head,
+ struct pnfs_commit_bucket *bucket,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_layout_segment *lseg;
+ struct list_head *pos;
+
+ /* the loop only counts entries; nothing is unlinked here */
+ list_for_each(pos, &bucket->committing)
+ cinfo->ds->ncommitting--;
+ list_splice_init(&bucket->committing, head);
+ lseg = pnfs_free_bucket_lseg(bucket);
+ if (!lseg)
+ lseg = pnfs_get_lseg(bucket->lseg);
+ return lseg;
+}
+
+/*
+ * Allocate a commit data structure and fill it from @bucket: the bucket's
+ * committing requests go onto data->pages and data->lseg receives the
+ * layout segment reference.
+ *
+ * Returns NULL if the allocation fails; the bucket is then left untouched.
+ */
+static struct nfs_commit_data *
+pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket,
+ struct nfs_commit_info *cinfo)
+{
+ struct nfs_commit_data *data = nfs_commitdata_alloc();
+
+ if (!data)
+ return NULL;
+ data->lseg = pnfs_bucket_get_committing(&data->pages, bucket, cinfo);
+ return data;
+}
+
+/*
+ * Hand the requests queued on the committing lists of buckets
+ * @idx .. @nbuckets - 1 back to nfs_retry_commit().
+ *
+ * @buckets first bucket of the array
+ * @nbuckets number of buckets in the array
+ * @cinfo commit info; supplies the inode whose commit_mutex is taken
+ * @idx index of the first bucket to process
+ *
+ * Each bucket is drained under the inode's commit_mutex, its requests are
+ * retried with the bucket's own index as the data server commit index, and
+ * the layout segment reference obtained while draining is dropped.
+ */
+static void pnfs_generic_retry_commit(struct pnfs_commit_bucket *buckets,
+				      unsigned int nbuckets,
+				      struct nfs_commit_info *cinfo,
+				      unsigned int idx)
+{
+	struct pnfs_commit_bucket *bucket;
+	struct pnfs_layout_segment *freeme;
+	LIST_HEAD(pages);
+
+	for (; idx < nbuckets; idx++) {
+		/*
+		 * Derive the bucket from idx so the two cannot drift apart:
+		 * the index given to nfs_retry_commit() must name the bucket
+		 * that was actually drained.
+		 */
+		bucket = &buckets[idx];
+		if (list_empty(&bucket->committing))
+			continue;
+		mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+		freeme = pnfs_bucket_get_committing(&pages, bucket, cinfo);
+		mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+		nfs_retry_commit(&pages, freeme, cinfo, idx);
+		pnfs_put_lseg(freeme);
+	}
+}
+
+/*
+ * Create one commit data structure per bucket that has requests on its
+ * committing list and append each to @list.
+ *
+ * If an allocation fails, the buckets from the failing index onwards are
+ * passed to pnfs_generic_retry_commit().
+ *
+ * Returns the number of commit data structures added to @list.
+ */
+static unsigned int
+pnfs_bucket_alloc_ds_commits(struct list_head *list,
+ struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_commit_bucket *bucket;
+ struct nfs_commit_data *data;
+ unsigned int i;
+ unsigned int nreq = 0;
+
+ for (i = 0, bucket = buckets; i < nbuckets; i++, bucket++) {
+ /* unlocked peek; repeated below once the mutex is held */
+ if (list_empty(&bucket->committing))
+ continue;
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+ if (!list_empty(&bucket->committing)) {
+ data = pnfs_bucket_fetch_commitdata(bucket, cinfo);
+ if (!data)
+ goto out_error;
+ data->ds_commit_index = i;
+ list_add_tail(&data->list, list);
+ nreq++;
+ }
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+ }
+ return nreq;
+out_error:
+ /* reached with commit_mutex still held */
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+ /* Clean up on error */
+ pnfs_generic_retry_commit(buckets, nbuckets, cinfo, i);
+ return nreq;
+}
+
+/*
+ * Run pnfs_bucket_alloc_ds_commits() over every commit array of @fl_cinfo
+ * that still has an lseg, collecting the commit data structures on @list.
+ *
+ * Returns the total number of commit data structures created.
+ */
+static unsigned int
+pnfs_alloc_ds_commits_list(struct list_head *list,
+ struct pnfs_ds_commit_info *fl_cinfo,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_commit_array *array;
+ unsigned int ret = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ /* skip arrays already detached from their lseg or being freed */
+ if (!array->lseg || !pnfs_get_commit_array(array))
+ continue;
+ /* the reference pins the array; the helper takes a mutex */
+ rcu_read_unlock();
+ ret += pnfs_bucket_alloc_ds_commits(list, array->buckets,
+ array->nbuckets, cinfo);
+ rcu_read_lock();
+ pnfs_put_commit_array(array, cinfo->inode);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * This follows nfs_commit_list pretty closely
+ *
+ * @inode inode being committed
+ * @mds_pages requests to commit through the MDS (may be empty)
+ * @how flags passed through to the commit initiators
+ * @cinfo commit info; @cinfo->ds holds the per-data-server buckets
+ * @initiate_commit layout driver callback that sends one DS commit
+ *
+ * Builds one commit for @mds_pages (ds_commit_index -1) plus one per
+ * non-empty data server bucket, then initiates them all.
+ *
+ * Returns -ENOMEM if the MDS commit data cannot be allocated (the MDS
+ * requests are handed to nfs_retry_commit() first), PNFS_ATTEMPTED
+ * otherwise. The return value of @initiate_commit is ignored.
+ */
+int
+pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
+ int how, struct nfs_commit_info *cinfo,
+ int (*initiate_commit)(struct nfs_commit_data *data,
+ int how))
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct nfs_commit_data *data, *tmp;
+ LIST_HEAD(list);
+ unsigned int nreq = 0;
+
+ if (!list_empty(mds_pages)) {
+ data = nfs_commitdata_alloc();
+ if (!data) {
+ nfs_retry_commit(mds_pages, NULL, cinfo, -1);
+ return -ENOMEM;
+ }
+ /* a negative index marks the commit that goes to the MDS */
+ data->ds_commit_index = -1;
+ list_splice_init(mds_pages, &data->pages);
+ list_add_tail(&data->list, &list);
+ nreq++;
+ }
+
+ nreq += pnfs_alloc_ds_commits_list(&list, fl_cinfo, cinfo);
+ if (nreq == 0)
+ goto out;
+
+ list_for_each_entry_safe(data, tmp, &list, list) {
+ list_del(&data->list);
+ if (data->ds_commit_index < 0) {
+ nfs_init_commit(data, NULL, NULL, cinfo);
+ nfs_initiate_commit(NFS_CLIENT(inode), data,
+ NFS_PROTO(data->inode),
+ data->mds_ops, how,
+ RPC_TASK_CRED_NOREF);
+ } else {
+ nfs_init_commit(data, NULL, data->lseg, cinfo);
+ initiate_commit(data, how);
+ }
+ }
+out:
+ return PNFS_ATTEMPTED;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist);
+
+/*
+ * Data server cache
+ *
+ * Data servers can be mapped to different device ids.
+ * nfs4_pnfs_ds reference counting
+ * - set to 1 on allocation
+ * - incremented when a device id maps a data server already in the cache.
+ * - decremented when deviceid is removed from the cache.
+ */
+static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
+static LIST_HEAD(nfs4_data_server_cache);
+
+/* Debug routines */
+
+/*
+ * Print a data server's address string, reference count, client pointer
+ * and the client's exchange flags (0 if there is no client) at
+ * KERN_WARNING level. A NULL @ds is reported instead of dereferenced.
+ */
+static void
+print_ds(struct nfs4_pnfs_ds *ds)
+{
+ if (ds == NULL) {
+ printk(KERN_WARNING "%s NULL device\n", __func__);
+ return;
+ }
+ printk(KERN_WARNING " ds %s\n"
+ " ref count %d\n"
+ " client %p\n"
+ " cl_exchange_flags %x\n",
+ ds->ds_remotestr,
+ refcount_read(&ds->ds_count), ds->ds_clp,
+ ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
+}
+
+/*
+ * Compare two socket addresses.
+ *
+ * AF_INET: equal when address and port match.
+ * AF_INET6: equal when address and port match; a link-local @addr1 must
+ * additionally have the same scope id as @addr2.
+ * Differing families and any other family compare as not equal.
+ */
+static bool
+same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
+{
+	if (addr1->sa_family != addr2->sa_family)
+		return false;
+
+	if (addr1->sa_family == AF_INET) {
+		struct sockaddr_in *lhs = (struct sockaddr_in *)addr1;
+		struct sockaddr_in *rhs = (struct sockaddr_in *)addr2;
+
+		return lhs->sin_addr.s_addr == rhs->sin_addr.s_addr &&
+		       lhs->sin_port == rhs->sin_port;
+	}
+
+	if (addr1->sa_family == AF_INET6) {
+		struct sockaddr_in6 *lhs6 = (struct sockaddr_in6 *)addr1;
+		struct sockaddr_in6 *rhs6 = (struct sockaddr_in6 *)addr2;
+
+		/* LINKLOCAL addresses must have matching scope_id */
+		if (ipv6_addr_src_scope(&lhs6->sin6_addr) ==
+					IPV6_ADDR_SCOPE_LINKLOCAL &&
+		    lhs6->sin6_scope_id != rhs6->sin6_scope_id)
+			return false;
+
+		return ipv6_addr_equal(&lhs6->sin6_addr, &rhs6->sin6_addr) &&
+		       lhs6->sin6_port == rhs6->sin6_port;
+	}
+
+	dprintk("%s: unhandled address family: %u\n",
+		__func__, addr1->sa_family);
+	return false;
+}
+
+/*
+ * Checks whether every address in 'dsaddrs1' also appears in 'dsaddrs2',
+ * i.e. whether 'dsaddrs1' is a subset of 'dsaddrs2'. If so, declare a
+ * match. An empty 'dsaddrs1' never matches.
+ */
+static bool
+_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
+			       const struct list_head *dsaddrs2)
+{
+	struct nfs4_pnfs_ds_addr *wanted, *candidate;
+	bool found = false;
+
+	list_for_each_entry(wanted, dsaddrs1, da_node) {
+		found = false;
+		list_for_each_entry(candidate, dsaddrs2, da_node) {
+			if (same_sockaddr((struct sockaddr *)&wanted->da_addr,
+					  (struct sockaddr *)&candidate->da_addr)) {
+				found = true;
+				break;
+			}
+		}
+		/* one address without a counterpart rules out a match */
+		if (!found)
+			return false;
+	}
+	return found;
+}
+
+/*
+ * Lookup DS by addresses. nfs4_ds_cache_lock is held
+ *
+ * Returns the first cached data server for which the address comparison
+ * against @dsaddrs succeeds, or NULL. No reference is taken here.
+ */
+static struct nfs4_pnfs_ds *
+_data_server_lookup_locked(const struct list_head *dsaddrs)
+{
+ struct nfs4_pnfs_ds *ds;
+
+ list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
+ if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
+ return ds;
+ return NULL;
+}
+
+/*
+ * Tear down a data server entry: put its nfs_client, free every address
+ * on ds_addrs together with its string, then free the entry's own string
+ * and the entry itself. Does not touch the data server cache list.
+ */
+static void destroy_ds(struct nfs4_pnfs_ds *ds)
+{
+ struct nfs4_pnfs_ds_addr *da;
+
+ dprintk("--> %s\n", __func__);
+ ifdebug(FACILITY)
+ print_ds(ds);
+
+ nfs_put_client(ds->ds_clp);
+
+ while (!list_empty(&ds->ds_addrs)) {
+ da = list_first_entry(&ds->ds_addrs,
+ struct nfs4_pnfs_ds_addr,
+ da_node);
+ list_del_init(&da->da_node);
+ kfree(da->da_remotestr);
+ kfree(da);
+ }
+
+ kfree(ds->ds_remotestr);
+ kfree(ds);
+}
+
+/*
+ * Drop one reference on @ds. On the last put the entry is removed from
+ * the data server cache under nfs4_ds_cache_lock and then destroyed with
+ * the lock released.
+ */
+void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds)
+{
+ if (refcount_dec_and_lock(&ds->ds_count,
+ &nfs4_ds_cache_lock)) {
+ list_del_init(&ds->ds_node);
+ spin_unlock(&nfs4_ds_cache_lock);
+ destroy_ds(ds);
+ }
+}
+EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_put);
+
+/*
+ * Create a string with a human readable address and port to avoid
+ * complicated setup around many dprintks.
+ *
+ * The result has the form "{addr1,addr2,}": every address string is
+ * followed by a comma and the whole list is wrapped in braces.
+ *
+ * Returns a kzalloc'ed string the caller must kfree, or NULL if the
+ * allocation fails or the length bookkeeping runs out of room.
+ */
+static char *
+nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
+{
+ struct nfs4_pnfs_ds_addr *da;
+ char *remotestr;
+ size_t len;
+ char *p;
+
+ /* first pass: compute the buffer size */
+ len = 3; /* '{', '}' and eol */
+ list_for_each_entry(da, dsaddrs, da_node) {
+ len += strlen(da->da_remotestr) + 1; /* string plus comma */
+ }
+
+ remotestr = kzalloc(len, gfp_flags);
+ if (!remotestr)
+ return NULL;
+
+ /* second pass: fill the buffer; 'len' now tracks the space left */
+ p = remotestr;
+ *(p++) = '{';
+ len--;
+ list_for_each_entry(da, dsaddrs, da_node) {
+ size_t ll = strlen(da->da_remotestr);
+
+ if (ll > len)
+ goto out_err;
+
+ memcpy(p, da->da_remotestr, ll);
+ p += ll;
+ len -= ll;
+
+ if (len < 1)
+ goto out_err;
+ (*p++) = ',';
+ len--;
+ }
+ /* need room for the closing brace and the terminating NUL */
+ if (len < 2)
+ goto out_err;
+ *(p++) = '}';
+ *p = '\0';
+ return remotestr;
+out_err:
+ kfree(remotestr);
+ return NULL;
+}
+
+/*
+ * Given a list of multipath struct nfs4_pnfs_ds_addr, add it to ds cache if
+ * uncached and return cached struct nfs4_pnfs_ds.
+ *
+ * On a cache miss the addresses are spliced out of @dsaddrs into the new
+ * entry, which starts with a reference count of 1. On a hit the existing
+ * entry's count is incremented and @dsaddrs is left untouched.
+ *
+ * Returns NULL if @dsaddrs is empty or the entry cannot be allocated.
+ */
+struct nfs4_pnfs_ds *
+nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
+{
+ struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
+ char *remotestr;
+
+ if (list_empty(dsaddrs)) {
+ dprintk("%s: no addresses defined\n", __func__);
+ goto out;
+ }
+
+ /* allocate before taking the spinlock */
+ ds = kzalloc(sizeof(*ds), gfp_flags);
+ if (!ds)
+ goto out;
+
+ /* this is only used for debugging, so it's ok if it's NULL */
+ remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
+
+ spin_lock(&nfs4_ds_cache_lock);
+ tmp_ds = _data_server_lookup_locked(dsaddrs);
+ if (tmp_ds == NULL) {
+ INIT_LIST_HEAD(&ds->ds_addrs);
+ list_splice_init(dsaddrs, &ds->ds_addrs);
+ ds->ds_remotestr = remotestr;
+ refcount_set(&ds->ds_count, 1);
+ INIT_LIST_HEAD(&ds->ds_node);
+ ds->ds_clp = NULL;
+ list_add(&ds->ds_node, &nfs4_data_server_cache);
+ dprintk("%s add new data server %s\n", __func__,
+ ds->ds_remotestr);
+ } else {
+ /* already cached: discard the speculative allocations */
+ kfree(remotestr);
+ kfree(ds);
+ refcount_inc(&tmp_ds->ds_count);
+ dprintk("%s data server %s found, inc'ed ds_count to %d\n",
+ __func__, tmp_ds->ds_remotestr,
+ refcount_read(&tmp_ds->ds_count));
+ ds = tmp_ds;
+ }
+ spin_unlock(&nfs4_ds_cache_lock);
+out:
+ return ds;
+}
+EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add);
+
+/*
+ * Sleep (killable) until the NFS4DS_CONNECTING bit in @ds->ds_state is
+ * clear. Returns whatever wait_on_bit() returns.
+ */
+static int nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
+{
+ might_sleep();
+ return wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, TASK_KILLABLE);
+}
+
+/*
+ * Clear NFS4DS_CONNECTING in @ds->ds_state and wake anybody waiting on
+ * that bit. A barrier is issued before the bit is cleared.
+ */
+static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
+{
+ smp_mb__before_atomic();
+ clear_and_wake_up_bit(NFS4DS_CONNECTING, &ds->ds_state);
+}
+
+/*
+ * Pointer to the NFSv3 data server connect routine. NULL until
+ * load_v3_ds_connect() resolves it with symbol_request(); reset to NULL by
+ * nfs4_pnfs_v3_ds_connect_unload().
+ */
+static struct nfs_client *(*get_v3_ds_connect)(
+ struct nfs_server *mds_srv,
+ const struct sockaddr *ds_addr,
+ int ds_addrlen,
+ int ds_proto,
+ unsigned int ds_timeo,
+ unsigned int ds_retrans);
+
+/*
+ * Resolve nfs3_set_ds_client on first use and cache it in
+ * get_v3_ds_connect. Warns once if the symbol cannot be obtained.
+ *
+ * Returns true when the connect routine is available.
+ */
+static bool load_v3_ds_connect(void)
+{
+ if (!get_v3_ds_connect) {
+ get_v3_ds_connect = symbol_request(nfs3_set_ds_client);
+ WARN_ON_ONCE(!get_v3_ds_connect);
+ }
+
+ return(get_v3_ds_connect != NULL);
+}
+
+/*
+ * Release the symbol reference taken by load_v3_ds_connect(), if any, and
+ * forget the cached pointer.
+ */
+void nfs4_pnfs_v3_ds_connect_unload(void)
+{
+ if (get_v3_ds_connect) {
+ symbol_put(nfs3_set_ds_client);
+ get_v3_ds_connect = NULL;
+ }
+}
+
+/*
+ * Connect to an NFSv3 data server, trying each of its addresses in turn.
+ *
+ * The first address that connects yields the nfs_client; every later
+ * address is added to that client as an additional transport. On success
+ * the client is published in @ds->ds_clp.
+ *
+ * Returns 0 on success, or the error of the last failed connect attempt
+ * (-EIO if @ds has no addresses).
+ * NOTE(review): if the NFSv3 connect symbol cannot be loaded this returns
+ * 0 with ds->ds_clp left unset - confirm that callers check ds_clp.
+ */
+static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
+ struct nfs4_pnfs_ds *ds,
+ unsigned int timeo,
+ unsigned int retrans)
+{
+ struct nfs_client *clp = ERR_PTR(-EIO);
+ struct nfs4_pnfs_ds_addr *da;
+ int status = 0;
+
+ dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr);
+
+ if (!load_v3_ds_connect())
+ goto out;
+
+ list_for_each_entry(da, &ds->ds_addrs, da_node) {
+ dprintk("%s: DS %s: trying address %s\n",
+ __func__, ds->ds_remotestr, da->da_remotestr);
+
+ /* a client already exists: attach this address to it */
+ if (!IS_ERR(clp)) {
+ struct xprt_create xprt_args = {
+ .ident = XPRT_TRANSPORT_TCP,
+ .net = clp->cl_net,
+ .dstaddr = (struct sockaddr *)&da->da_addr,
+ .addrlen = da->da_addrlen,
+ .servername = clp->cl_hostname,
+ };
+ /* Add this address as an alias */
+ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
+ rpc_clnt_test_and_add_xprt, NULL);
+ continue;
+ }
+ clp = get_v3_ds_connect(mds_srv,
+ (struct sockaddr *)&da->da_addr,
+ da->da_addrlen, IPPROTO_TCP,
+ timeo, retrans);
+ if (IS_ERR(clp))
+ continue;
+ /* both soft-error flags are switched off on the new client */
+ clp->cl_rpcclient->cl_softerr = 0;
+ clp->cl_rpcclient->cl_softrtry = 0;
+ }
+
+ if (IS_ERR(clp)) {
+ status = PTR_ERR(clp);
+ goto out;
+ }
+
+ /* write barrier before the client pointer is published */
+ smp_wmb();
+ WRITE_ONCE(ds->ds_clp, clp);
+ dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
+out:
+ return status;
+}
+
+/*
+ * Connect to an NFSv4 data server, walking its address list.
+ *
+ * While there is no usable client (or the client's minor version ops have
+ * no session_trunk method) an address is used to create a client and
+ * initialise its DS session.  Once a trunking-capable client exists, each
+ * further address is tested for session trunking and added to it as an
+ * alias.  On success the client is published in ds->ds_clp.
+ */
+static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
+				 struct nfs4_pnfs_ds *ds,
+				 unsigned int timeo,
+				 unsigned int retrans,
+				 u32 minor_version)
+{
+	struct nfs_client *clp = ERR_PTR(-EIO);
+	struct nfs4_pnfs_ds_addr *da;
+	int status = 0;
+
+	dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr);
+
+	list_for_each_entry(da, &ds->ds_addrs, da_node) {
+		dprintk("%s: DS %s: trying address %s\n",
+			__func__, ds->ds_remotestr, da->da_remotestr);
+
+		if (IS_ERR(clp) || !clp->cl_mvops->session_trunk) {
+			clp = nfs4_set_ds_client(mds_srv,
+						(struct sockaddr *)&da->da_addr,
+						da->da_addrlen, IPPROTO_TCP,
+						timeo, retrans, minor_version);
+			if (IS_ERR(clp))
+				continue;
+
+			status = nfs4_init_ds_session(clp,
+					mds_srv->nfs_client->cl_lease_time);
+			if (status) {
+				/* Session setup failed: drop it, try the next address */
+				nfs_put_client(clp);
+				clp = ERR_PTR(-EIO);
+			}
+		} else {
+			struct xprt_create xprt_args = {
+				.ident = XPRT_TRANSPORT_TCP,
+				.net = clp->cl_net,
+				.dstaddr = (struct sockaddr *)&da->da_addr,
+				.addrlen = da->da_addrlen,
+				.servername = clp->cl_hostname,
+			};
+			struct nfs4_add_xprt_data xprtdata = {
+				.clp = clp,
+				.cred = nfs4_get_clid_cred(clp),
+			};
+			struct rpc_add_xprt_test rpcdata = {
+				.add_xprt_test = clp->cl_mvops->session_trunk,
+				.data = &xprtdata,
+			};
+
+			/*
+			 * Test this address for session trunking and
+			 * add as an alias
+			 */
+			rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
+					  rpc_clnt_setup_test_and_add_xprt,
+					  &rpcdata);
+			if (xprtdata.cred)
+				put_cred(xprtdata.cred);
+		}
+	}
+
+	if (IS_ERR(clp))
+		return PTR_ERR(clp);
+
+	smp_wmb();
+	WRITE_ONCE(ds->ds_clp, clp);
+	dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
+	return status;
+}
+
+/*
+ * Create an rpc connection to the nfs4_pnfs_ds data server.
+ * Currently only supports IPv4 and IPv6 addresses.
+ * If connection fails, make devid unavailable and return a -errno.
+ *
+ * Only one task at a time performs the connect: it owns the
+ * NFS4DS_CONNECTING bit in ds->ds_state while the others wait for the bit
+ * to clear and then reuse ds->ds_clp.  @version selects the NFSv3 or NFSv4
+ * connect helper; any other value yields -EPROTONOSUPPORT.
+ */
+int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
+			  struct nfs4_deviceid_node *devid, unsigned int timeo,
+			  unsigned int retrans, u32 version, u32 minor_version)
+{
+	int err;
+
+	/* Wait for any connect in progress, then try to claim the bit */
+	do {
+		err = nfs4_wait_ds_connect(ds);
+		if (err || ds->ds_clp)
+			goto out;
+		if (nfs4_test_deviceid_unavailable(devid))
+			return -ENODEV;
+	} while (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) != 0);
+
+	/* Someone may have connected between the wait and the bit grab */
+	if (ds->ds_clp)
+		goto connect_done;
+
+	switch (version) {
+	case 3:
+		err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, retrans);
+		break;
+	case 4:
+		err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo, retrans,
+					       minor_version);
+		break;
+	default:
+		/* version is a u32, so print it unsigned */
+		dprintk("%s: unsupported DS version %u\n", __func__, version);
+		err = -EPROTONOSUPPORT;
+		break;
+	}
+
+connect_done:
+	nfs4_clear_ds_conn_bit(ds);
+out:
+	/*
+	 * At this point the ds->ds_clp should be ready, but it might have
+	 * hit an error.
+	 */
+	if (!err) {
+		if (!ds->ds_clp || !nfs_client_init_is_complete(ds->ds_clp)) {
+			WARN_ON_ONCE(ds->ds_clp ||
+				!nfs4_test_deviceid_unavailable(devid));
+			return -EINVAL;
+		}
+		err = nfs_client_init_status(ds->ds_clp);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect);
+
+/*
+ * Currently only supports ipv4, ipv6 and one multi-path address.
+ *
+ * Decode one netaddr (r_netid string followed by r_addr string) from @xdr
+ * into a newly allocated nfs4_pnfs_ds_addr.  The r_addr is expected in
+ * universal-address form: the textual address followed by ".p1.p2", where
+ * p1 and p2 are the high and low port octets in decimal.
+ *
+ * Returns the new entry, or NULL on a decode error, allocation failure,
+ * malformed address or port, unsupported address family, or when r_netid
+ * does not match the address family ("tcp" / "tcp6").
+ */
+struct nfs4_pnfs_ds_addr *
+nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags)
+{
+	struct nfs4_pnfs_ds_addr *da = NULL;
+	char *buf, *portstr;
+	__be16 port;
+	int nlen, rlen;
+	int tmp[2];
+	__be32 *p;
+	char *netid, *match_netid;
+	size_t len, match_netid_len;
+	char *startsep = "";
+	char *endsep = "";
+
+
+	/* r_netid */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_err;
+	nlen = be32_to_cpup(p++);
+
+	p = xdr_inline_decode(xdr, nlen);
+	if (unlikely(!p))
+		goto out_err;
+
+	netid = kmalloc(nlen+1, gfp_flags);
+	if (unlikely(!netid))
+		goto out_err;
+
+	netid[nlen] = '\0';
+	memcpy(netid, p, nlen);
+
+	/* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_free_netid;
+	rlen = be32_to_cpup(p);
+
+	p = xdr_inline_decode(xdr, rlen);
+	if (unlikely(!p))
+		goto out_free_netid;
+
+	/* port is ".ABC.DEF", 8 chars max */
+	if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
+		dprintk("%s: Invalid address, length %d\n", __func__,
+			rlen);
+		goto out_free_netid;
+	}
+	buf = kmalloc(rlen + 1, gfp_flags);
+	if (!buf) {
+		dprintk("%s: Not enough memory\n", __func__);
+		goto out_free_netid;
+	}
+	buf[rlen] = '\0';
+	memcpy(buf, p, rlen);
+
+	/* replace port '.' with '-' */
+	portstr = strrchr(buf, '.');
+	if (!portstr) {
+		dprintk("%s: Failed finding expected dot in port\n",
+			__func__);
+		goto out_free_buf;
+	}
+	*portstr = '-';
+
+	/* find '.' between address and port */
+	portstr = strrchr(buf, '.');
+	if (!portstr) {
+		dprintk("%s: Failed finding expected dot between address and "
+			"port\n", __func__);
+		goto out_free_buf;
+	}
+	*portstr = '\0';
+
+	da = kzalloc(sizeof(*da), gfp_flags);
+	if (unlikely(!da))
+		goto out_free_buf;
+
+	INIT_LIST_HEAD(&da->da_node);
+
+	if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
+		      sizeof(da->da_addr))) {
+		dprintk("%s: error parsing address %s\n", __func__, buf);
+		goto out_free_da;
+	}
+
+	/*
+	 * Both port octets must be present and in 0..255.  Otherwise tmp[]
+	 * would be used uninitialised or produce a bogus port.
+	 */
+	portstr++;
+	if (sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]) != 2 ||
+	    tmp[0] < 0 || tmp[0] > 255 ||
+	    tmp[1] < 0 || tmp[1] > 255) {
+		dprintk("%s: invalid port \"%s\"\n", __func__, portstr);
+		goto out_free_da;
+	}
+	port = htons((tmp[0] << 8) | (tmp[1]));
+
+	switch (da->da_addr.ss_family) {
+	case AF_INET:
+		((struct sockaddr_in *)&da->da_addr)->sin_port = port;
+		da->da_addrlen = sizeof(struct sockaddr_in);
+		match_netid = "tcp";
+		match_netid_len = 3;
+		break;
+
+	case AF_INET6:
+		((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
+		da->da_addrlen = sizeof(struct sockaddr_in6);
+		match_netid = "tcp6";
+		match_netid_len = 4;
+		startsep = "[";
+		endsep = "]";
+		break;
+
+	default:
+		dprintk("%s: unsupported address family: %u\n",
+			__func__, da->da_addr.ss_family);
+		goto out_free_da;
+	}
+
+	if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
+		dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
+			__func__, netid, match_netid);
+		goto out_free_da;
+	}
+
+	/* save human readable address */
+	len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
+	da->da_remotestr = kzalloc(len, gfp_flags);
+
+	/* NULL is ok, only used for dprintk */
+	if (da->da_remotestr)
+		snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
+			 buf, endsep, ntohs(port));
+
+	dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
+	kfree(buf);
+	kfree(netid);
+	return da;
+
+out_free_da:
+	kfree(da);
+out_free_buf:
+	dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
+	kfree(buf);
+out_free_netid:
+	kfree(netid);
+out_err:
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(nfs4_decode_mp_ds_addr);
+
+/*
+ * Queue @req on the "written" list of commit bucket @ds_commit_idx for
+ * @lseg, under the inode's commit_mutex.  If no commit array is found for
+ * the lseg, or the lseg is no longer valid, the write is rescheduled via
+ * the completion ops instead.
+ */
+void
+pnfs_layout_mark_request_commit(struct nfs_page *req,
+				struct pnfs_layout_segment *lseg,
+				struct nfs_commit_info *cinfo,
+				u32 ds_commit_idx)
+{
+	struct pnfs_commit_array *array;
+	struct pnfs_commit_bucket *bucket;
+
+	mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+	array = pnfs_lookup_commit_array(cinfo->ds, lseg);
+	if (array == NULL || !pnfs_is_valid_lseg(lseg)) {
+		mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+		cinfo->completion_ops->resched_write(cinfo, req);
+		return;
+	}
+
+	bucket = &array->buckets[ds_commit_idx];
+	/* Non-empty buckets hold a reference on the lseg. That ref
+	 * is normally transferred to the COMMIT call and released
+	 * there.  It could also be released if the last req is pulled
+	 * off due to a rewrite, in which case it will be done in
+	 * pnfs_common_clear_request_commit
+	 */
+	if (bucket->lseg == NULL)
+		bucket->lseg = pnfs_get_lseg(lseg);
+	set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+	cinfo->ds->nwritten++;
+
+	nfs_request_add_commit_list_locked(req, &bucket->written, cinfo);
+	mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+	nfs_mark_page_unstable(req->wb_page, cinfo);
+}
+EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
+
+/*
+ * If a layoutcommit is outstanding on @inode, commit the inode
+ * synchronously and, unless @datasync is set, follow up with a
+ * synchronous layoutcommit.  Returns 0 or a negative error.
+ */
+int
+pnfs_nfs_generic_sync(struct inode *inode, bool datasync)
+{
+	int err;
+
+	if (!pnfs_layoutcommit_outstanding(inode))
+		return 0;
+
+	err = nfs_commit_inode(inode, FLUSH_SYNC);
+	if (err < 0)
+		return err;
+
+	return datasync ? 0 : pnfs_layoutcommit_inode(inode, true);
+}
+EXPORT_SYMBOL_GPL(pnfs_nfs_generic_sync);
+
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
new file mode 100644
index 000000000..15c865cc8
--- /dev/null
+++ b/fs/nfs/proc.c
@@ -0,0 +1,763 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/proc.c
+ *
+ * Copyright (C) 1992, 1993, 1994 Rick Sladkey
+ *
+ * OS-independent nfs remote procedure call functions
+ *
+ * Tuned by Alan Cox <A.Cox@swansea.ac.uk> for >3K buffers
+ * so at last we can have decent(ish) throughput off a
+ * Sun server.
+ *
+ * Coding optimized and cleaned up by Florian La Roche.
+ * Note: Error returns are optimized for NFS_OK, which isn't translated via
+ * nfs_stat_to_errno(), but happens to be already the right return code.
+ *
+ * Also, the code currently doesn't check the size of the packet, when
+ * it decodes the packet.
+ *
+ * Feel free to fix it and mail me the diffs if it worries you.
+ *
+ * Completely rewritten to support the new RPC call interface;
+ * rewrote and moved the entire XDR stuff to xdr.c
+ * --Olaf Kirch June 1996
+ *
+ * The code below initializes all auto variables explicitly, otherwise
+ * it will fail to work as a module (gcc generates a memset call for an
+ * incomplete struct).
+ */
+
+#include <linux/types.h>
+#include <linux/param.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/pagemap.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs2.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/lockd/bind.h>
+#include <linux/freezer.h>
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_PROC
+
+/*
+ * Issue @msg on the server's RPC client; if that fails and the
+ * nfs_client's own RPC client is a different one, retry on the latter.
+ */
+static int nfs_proc_get_root_call(struct nfs_server *server,
+				  struct rpc_message *msg)
+{
+	int err = rpc_call_sync(server->client, msg, 0);
+
+	/* Retry with default authentication if different */
+	if (err && server->nfs_client->cl_rpcclient != server->client)
+		err = rpc_call_sync(server->nfs_client->cl_rpcclient, msg, 0);
+	return err;
+}
+
+/*
+ * Bare-bones access to getattr: this is for nfs_read_super.
+ *
+ * Fetches the attributes for @fhandle into info->fattr with GETATTR, then
+ * uses STATFS to fill in the transfer-size fields of @info.  Returns 0 or
+ * the status of the RPC that failed.
+ */
+static int
+nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
+		  struct nfs_fsinfo *info)
+{
+	struct nfs2_fsstat fsstat;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_GETATTR],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= info->fattr,
+	};
+	int status;
+
+	dprintk("%s: call getattr\n", __func__);
+	nfs_fattr_init(info->fattr);
+	status = nfs_proc_get_root_call(server, &msg);
+	dprintk("%s: reply getattr: %d\n", __func__, status);
+	if (status)
+		return status;
+
+	dprintk("%s: call statfs\n", __func__);
+	msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS];
+	msg.rpc_resp = &fsstat;
+	status = nfs_proc_get_root_call(server, &msg);
+	dprintk("%s: reply statfs: %d\n", __func__, status);
+	if (status)
+		return status;
+
+	info->rtmax = NFS_MAXDATA;
+	info->wtmax = NFS_MAXDATA;
+	info->rtpref = fsstat.tsize;
+	info->wtpref = fsstat.tsize;
+	info->dtpref = fsstat.tsize;
+	info->rtmult = fsstat.bsize;
+	info->wtmult = fsstat.bsize;
+	info->maxfilesize = 0x7FFFFFFF;
+	info->lease_time = 0;
+	return 0;
+}
+
+/*
+ * One function for each procedure in the NFS protocol.
+ */
+/*
+ * GETATTR: fetch the attributes for @fhandle into @fattr.
+ * @label is not used here.  Returns the rpc_call_sync() status.
+ */
+static int
+nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_fattr *fattr, struct nfs4_label *label,
+		struct inode *inode)
+{
+	/* Is this an attribute revalidation, subject to softreval? */
+	unsigned short task_flags =
+		(inode && (server->flags & NFS_MOUNT_SOFTREVAL)) ?
+			RPC_TASK_TIMEOUT : 0;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_GETATTR],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= fattr,
+	};
+	int ret;
+
+	dprintk("NFS call getattr\n");
+	nfs_fattr_init(fattr);
+	ret = rpc_call_sync(server->client, &msg, task_flags);
+	dprintk("NFS reply getattr: %d\n", ret);
+	return ret;
+}
+
+/*
+ * SETATTR: apply @sattr to the inode behind @dentry.  The reply's
+ * attributes land in @fattr and, on success, are folded into the inode.
+ * Note that sattr->ia_mode is masked in place.
+ */
+static int
+nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
+		 struct iattr *sattr)
+{
+	struct inode *inode = d_inode(dentry);
+	struct nfs_sattrargs args = {
+		.fh	= NFS_FH(inode),
+		.sattr	= sattr
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_SETATTR],
+		.rpc_argp	= &args,
+		.rpc_resp	= fattr,
+	};
+	int ret;
+
+	/* Mask out the non-modebit related stuff from attr->ia_mode */
+	sattr->ia_mode &= S_IALLUGO;
+
+	dprintk("NFS call setattr\n");
+	/* When the change comes through an open file, use that file's cred */
+	if (sattr->ia_valid & ATTR_FILE)
+		msg.rpc_cred = nfs_file_cred(sattr->ia_file);
+	nfs_fattr_init(fattr);
+	ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	if (!ret)
+		nfs_setattr_update_inode(inode, sattr, fattr);
+	dprintk("NFS reply setattr: %d\n", ret);
+	return ret;
+}
+
+/*
+ * LOOKUP: resolve @dentry's name in @dir, filling in @fhandle and @fattr.
+ * @label is not used here.  Returns the rpc_call_sync() status.
+ */
+static int
+nfs_proc_lookup(struct inode *dir, struct dentry *dentry,
+		struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+		struct nfs4_label *label)
+{
+	struct nfs_diropargs args = {
+		.fh	= NFS_FH(dir),
+		.name	= dentry->d_name.name,
+		.len	= dentry->d_name.len
+	};
+	struct nfs_diropok result = {
+		.fh	= fhandle,
+		.fattr	= fattr
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_LOOKUP],
+		.rpc_argp	= &args,
+		.rpc_resp	= &result,
+	};
+	/* Is this an attribute revalidation, subject to softreval? */
+	unsigned short task_flags =
+		nfs_lookup_is_soft_revalidate(dentry) ? RPC_TASK_TIMEOUT : 0;
+	int ret;
+
+	dprintk("NFS call lookup %pd2\n", dentry);
+	nfs_fattr_init(fattr);
+	ret = rpc_call_sync(NFS_CLIENT(dir), &msg, task_flags);
+	dprintk("NFS reply lookup: %d\n", ret);
+	return ret;
+}
+
+/*
+ * READLINK: issue the call for @inode, passing @page (with @pgbase and
+ * @pglen) as the page array in the arguments.  Returns the
+ * rpc_call_sync() status.
+ */
+static int nfs_proc_readlink(struct inode *inode, struct page *page,
+		unsigned int pgbase, unsigned int pglen)
+{
+	struct nfs_readlinkargs rl_args = {
+		.fh	= NFS_FH(inode),
+		.pgbase	= pgbase,
+		.pglen	= pglen,
+		.pages	= &page
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_READLINK],
+		.rpc_argp	= &rl_args,
+	};
+	int ret;
+
+	dprintk("NFS call readlink\n");
+	ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	dprintk("NFS reply readlink: %d\n", ret);
+	return ret;
+}
+
+struct nfs_createdata { /* arguments and result storage shared by the create, mknod and mkdir calls */
+ struct nfs_createargs arg; /* passed as rpc_argp */
+ struct nfs_diropok res; /* passed as rpc_resp; its fh/fattr point at the two members below */
+ struct nfs_fh fhandle;
+ struct nfs_fattr fattr;
+};
+
+/*
+ * Allocate and initialise the argument/result bundle for a create-type
+ * call on @dentry in @dir with attributes @sattr.  The result's file
+ * handle and attribute pointers are wired to the embedded storage.
+ * Returns NULL if the allocation fails; free with nfs_free_createdata().
+ */
+static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir,
+		struct dentry *dentry, struct iattr *sattr)
+{
+	struct nfs_createdata *data;
+
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (data == NULL)
+		return NULL;
+
+	data->arg.fh = NFS_FH(dir);
+	data->arg.name = dentry->d_name.name;
+	data->arg.len = dentry->d_name.len;
+	data->arg.sattr = sattr;
+	nfs_fattr_init(&data->fattr);
+	/* Zero-length handle: nothing valid in it yet */
+	data->fhandle.size = 0;
+	data->res.fh = &data->fhandle;
+	data->res.fattr = &data->fattr;
+	return data;
+}
+
+static void nfs_free_createdata(const struct nfs_createdata *data)
+{
+ kfree(data); /* releases the bundle obtained from nfs_alloc_createdata() */
+}
+
+/*
+ * CREATE: create @dentry in @dir with attributes @sattr and instantiate
+ * the dentry from the reply.  @flags is not used here.  Returns 0,
+ * -ENOMEM, or the status of the RPC / nfs_instantiate().
+ */
+static int
+nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+		int flags)
+{
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_CREATE],
+	};
+	struct nfs_createdata *cdata;
+	int status = -ENOMEM;
+
+	dprintk("NFS call create %pd\n", dentry);
+	cdata = nfs_alloc_createdata(dir, dentry, sattr);
+	if (cdata != NULL) {
+		msg.rpc_argp = &cdata->arg;
+		msg.rpc_resp = &cdata->res;
+		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+		/* The directory changed (or may have): force revalidation */
+		nfs_mark_for_revalidate(dir);
+		if (status == 0)
+			status = nfs_instantiate(dentry, cdata->res.fh,
+						 cdata->res.fattr, NULL);
+		nfs_free_createdata(cdata);
+	}
+	dprintk("NFS reply create: %d\n", status);
+	return status;
+}
+
+/*
+ * In NFSv2, mknod is grafted onto the create call.
+ *
+ * A FIFO is first requested as a character device with no size; if the
+ * call fails with -EINVAL it is retried with the original mode.  For
+ * character and block devices the encoded device number travels in the
+ * size attribute.  Note that @sattr is modified in place.
+ */
+static int
+nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
+	       dev_t rdev)
+{
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_CREATE],
+	};
+	struct nfs_createdata *cdata;
+	umode_t orig_mode;
+	int status = -ENOMEM;
+
+	dprintk("NFS call mknod %pd\n", dentry);
+
+	orig_mode = sattr->ia_mode;
+	if (S_ISFIFO(orig_mode)) {
+		sattr->ia_mode = (orig_mode & ~S_IFMT) | S_IFCHR;
+		sattr->ia_valid &= ~ATTR_SIZE;
+	} else if (S_ISCHR(orig_mode) || S_ISBLK(orig_mode)) {
+		sattr->ia_valid |= ATTR_SIZE;
+		sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */
+	}
+
+	cdata = nfs_alloc_createdata(dir, dentry, sattr);
+	if (cdata != NULL) {
+		msg.rpc_argp = &cdata->arg;
+		msg.rpc_resp = &cdata->res;
+
+		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+		nfs_mark_for_revalidate(dir);
+
+		/* Retry a rejected FIFO request with its real mode */
+		if (status == -EINVAL && S_ISFIFO(orig_mode)) {
+			sattr->ia_mode = orig_mode;
+			nfs_fattr_init(cdata->res.fattr);
+			status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+		}
+		if (status == 0)
+			status = nfs_instantiate(dentry, cdata->res.fh,
+						 cdata->res.fattr, NULL);
+		nfs_free_createdata(cdata);
+	}
+	dprintk("NFS reply mknod: %d\n", status);
+	return status;
+}
+
+/*
+ * REMOVE: synchronously remove @dentry's name from @dir, then mark the
+ * directory for revalidation whatever the outcome.
+ */
+static int
+nfs_proc_remove(struct inode *dir, struct dentry *dentry)
+{
+	struct nfs_removeargs rm_args = {
+		.fh	= NFS_FH(dir),
+		.name	= dentry->d_name,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_REMOVE],
+		.rpc_argp	= &rm_args,
+	};
+	int ret;
+
+	dprintk("NFS call remove %pd2\n",dentry);
+	ret = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	nfs_mark_for_revalidate(dir);
+
+	dprintk("NFS reply remove: %d\n", ret);
+	return ret;
+}
+
+static void
+nfs_proc_unlink_setup(struct rpc_message *msg,
+ struct dentry *dentry,
+ struct inode *inode)
+{
+ msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE]; /* unlink uses the REMOVE procedure; dentry and inode are unused here */
+}
+
+static void nfs_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
+{
+ rpc_call_start(task); /* nothing to prepare here: start the call straight away (data is unused) */
+}
+
+static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
+{
+ nfs_mark_for_revalidate(dir); /* directory contents may have changed */
+ return 1; /* always 1; NOTE(review): presumably tells the caller the call is finished — confirm against the caller */
+}
+
+static void
+nfs_proc_rename_setup(struct rpc_message *msg,
+ struct dentry *old_dentry,
+ struct dentry *new_dentry)
+{
+ msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME]; /* select the RENAME procedure; both dentries are unused here */
+}
+
+static void nfs_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
+{
+ rpc_call_start(task); /* nothing to prepare here: start the call straight away (data is unused) */
+}
+
+static int
+nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
+ struct inode *new_dir)
+{
+ nfs_mark_for_revalidate(old_dir); /* both directories may have changed */
+ nfs_mark_for_revalidate(new_dir);
+ return 1; /* always 1; NOTE(review): presumably tells the caller the call is finished — confirm against the caller */
+}
+
+/*
+ * LINK: create a hard link to @inode named @name in @dir.  Both the
+ * inode and the directory are marked for revalidation afterwards,
+ * whatever the outcome.
+ */
+static int
+nfs_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
+{
+	struct nfs_linkargs link_args = {
+		.fromfh	= NFS_FH(inode),
+		.tofh	= NFS_FH(dir),
+		.toname	= name->name,
+		.tolen	= name->len
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_LINK],
+		.rpc_argp	= &link_args,
+	};
+	int ret;
+
+	dprintk("NFS call link %s\n", name->name);
+	ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	nfs_mark_for_revalidate(inode);
+	nfs_mark_for_revalidate(dir);
+	dprintk("NFS reply link: %d\n", ret);
+	return ret;
+}
+
+/*
+ * SYMLINK: create symlink @dentry in @dir whose target (of length @len)
+ * is held in @page.  Returns -ENAMETOOLONG if @len exceeds
+ * NFS2_MAXPATHLEN, -ENOMEM on allocation failure, otherwise the status
+ * of the RPC / nfs_instantiate().
+ */
+static int
+nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
+		 unsigned int len, struct iattr *sattr)
+{
+	struct nfs_fh *fh = NULL;
+	struct nfs_fattr *fattr = NULL;
+	struct nfs_symlinkargs sl_args = {
+		.fromfh		= NFS_FH(dir),
+		.fromname	= dentry->d_name.name,
+		.fromlen	= dentry->d_name.len,
+		.pages		= &page,
+		.pathlen	= len,
+		.sattr		= sattr
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_SYMLINK],
+		.rpc_argp	= &sl_args,
+	};
+	int status;
+
+	dprintk("NFS call symlink %pd\n", dentry);
+
+	if (len <= NFS2_MAXPATHLEN) {
+		fh = nfs_alloc_fhandle();
+		fattr = nfs_alloc_fattr();
+		if (fh != NULL && fattr != NULL) {
+			status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+			nfs_mark_for_revalidate(dir);
+
+			/*
+			 * V2 SYMLINK requests don't return any attributes.
+			 * Setting the filehandle size to zero indicates to
+			 * nfs_instantiate that it should fill in the data
+			 * with a LOOKUP call on the wire.
+			 */
+			if (status == 0)
+				status = nfs_instantiate(dentry, fh, fattr,
+							 NULL);
+		} else {
+			status = -ENOMEM;
+		}
+		nfs_free_fattr(fattr);
+		nfs_free_fhandle(fh);
+	} else {
+		status = -ENAMETOOLONG;
+	}
+
+	dprintk("NFS reply symlink: %d\n", status);
+	return status;
+}
+
+/*
+ * MKDIR: create directory @dentry in @dir with attributes @sattr and
+ * instantiate the dentry from the reply.  Returns 0, -ENOMEM, or the
+ * status of the RPC / nfs_instantiate().
+ */
+static int
+nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
+{
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_MKDIR],
+	};
+	struct nfs_createdata *cdata;
+	int status = -ENOMEM;
+
+	dprintk("NFS call mkdir %pd\n", dentry);
+	cdata = nfs_alloc_createdata(dir, dentry, sattr);
+	if (cdata != NULL) {
+		msg.rpc_argp = &cdata->arg;
+		msg.rpc_resp = &cdata->res;
+
+		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+		nfs_mark_for_revalidate(dir);
+		if (status == 0)
+			status = nfs_instantiate(dentry, cdata->res.fh,
+						 cdata->res.fattr, NULL);
+		nfs_free_createdata(cdata);
+	}
+	dprintk("NFS reply mkdir: %d\n", status);
+	return status;
+}
+
+/*
+ * RMDIR: remove the directory called @name from @dir, then mark @dir
+ * for revalidation whatever the outcome.
+ */
+static int
+nfs_proc_rmdir(struct inode *dir, const struct qstr *name)
+{
+	struct nfs_diropargs rd_args = {
+		.fh	= NFS_FH(dir),
+		.name	= name->name,
+		.len	= name->len
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_RMDIR],
+		.rpc_argp	= &rd_args,
+	};
+	int ret;
+
+	dprintk("NFS call rmdir %s\n", name->name);
+	ret = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	nfs_mark_for_revalidate(dir);
+	dprintk("NFS reply rmdir: %d\n", ret);
+	return ret;
+}
+
+/*
+ * The READDIR implementation is somewhat hackish - we pass a temporary
+ * buffer to the encode function, which installs it in the receive
+ * iovec. The decode function just parses the reply to make sure it
+ * is syntactically correct; the entries themselves are decoded
+ * from nfs_readdir by calling the decode_entry function directly.
+ */
+/*
+ * READDIR: read up to @count bytes of directory entries for @dentry into
+ * @pages, starting at @cookie and using @cred.  @plus is not used here.
+ * The directory's atime is invalidated afterwards whatever the outcome.
+ */
+static int
+nfs_proc_readdir(struct dentry *dentry, const struct cred *cred,
+		 u64 cookie, struct page **pages, unsigned int count, bool plus)
+{
+	struct inode		*dir = d_inode(dentry);
+	struct nfs_readdirargs	arg = {
+		.fh		= NFS_FH(dir),
+		.cookie		= cookie,
+		.count		= count,
+		.pages		= pages,
+	};
+	struct rpc_message	msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_READDIR],
+		.rpc_argp	= &arg,
+		.rpc_cred	= cred,
+	};
+	int			status;
+
+	/* Print the full 64-bit cookie rather than a truncated signed int */
+	dprintk("NFS call readdir %llu\n", (unsigned long long)cookie);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+
+	nfs_invalidate_atime(dir);
+
+	dprintk("NFS reply readdir: %d\n", status);
+	return status;
+}
+
+/*
+ * STATFS: fill @stat from the server's reply.  Byte totals are the block
+ * counts scaled by the block size; the file counts are reported as zero.
+ */
+static int
+nfs_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
+			struct nfs_fsstat *stat)
+{
+	struct nfs2_fsstat res;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_STATFS],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= &res,
+	};
+	int ret;
+
+	dprintk("NFS call statfs\n");
+	nfs_fattr_init(stat->fattr);
+	ret = rpc_call_sync(server->client, &msg, 0);
+	dprintk("NFS reply statfs: %d\n", ret);
+	if (ret)
+		return ret;
+
+	/* Widen before multiplying */
+	stat->tbytes = (u64)res.blocks * res.bsize;
+	stat->fbytes = (u64)res.bfree * res.bsize;
+	stat->abytes = (u64)res.bavail * res.bsize;
+	stat->tfiles = 0;
+	stat->ffiles = 0;
+	stat->afiles = 0;
+	return ret;
+}
+
+/*
+ * FSINFO is answered with a STATFS call: the preferred sizes and
+ * multiples come from the reply, while the maxima, the maximum file size
+ * and the lease time are fixed values.
+ */
+static int
+nfs_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+			struct nfs_fsinfo *info)
+{
+	struct nfs2_fsstat res;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_STATFS],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= &res,
+	};
+	int ret;
+
+	dprintk("NFS call fsinfo\n");
+	nfs_fattr_init(info->fattr);
+	ret = rpc_call_sync(server->client, &msg, 0);
+	dprintk("NFS reply fsinfo: %d\n", ret);
+	if (ret)
+		return ret;
+
+	info->rtmax = NFS_MAXDATA;
+	info->wtmax = NFS_MAXDATA;
+	info->rtpref = res.tsize;
+	info->wtpref = res.tsize;
+	info->dtpref = res.tsize;
+	info->rtmult = res.bsize;
+	info->wtmult = res.bsize;
+	info->maxfilesize = 0x7FFFFFFF;
+	info->lease_time = 0;
+	return ret;
+}
+
+static int
+nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_pathconf *info)
+{
+ info->max_link = 0; /* no RPC is issued: fixed values are reported; server and fhandle are unused */
+ info->max_namelen = NFS2_MAXNAMLEN;
+ return 0;
+}
+
+/*
+ * READ completion: invalidate the inode's atime and, when the task did
+ * not fail, refresh the inode from the returned attributes and work out
+ * the eof flag.  Always returns 0.
+ */
+static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+	struct inode *inode = hdr->inode;
+
+	nfs_invalidate_atime(inode);
+	if (task->tk_status < 0)
+		return 0;
+
+	nfs_refresh_inode(inode, hdr->res.fattr);
+	/* Emulate the eof flag, which isn't normally needed in NFSv2
+	 * as it is guaranteed to always return the file attributes
+	 */
+	if (hdr->res.count == 0 && hdr->args.count > 0)
+		hdr->res.eof = 1;
+	else if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size)
+		hdr->res.eof = 1;
+	return 0;
+}
+
+static void nfs_proc_read_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg)
+{
+ msg->rpc_proc = &nfs_procedures[NFSPROC_READ]; /* select the READ procedure; hdr is unused here */
+}
+
+static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ rpc_call_start(task); /* nothing to prepare here: start the call straight away (hdr is unused) */
+ return 0;
+}
+
+/*
+ * WRITE completion: when the task did not fail, record the full request
+ * length as the count written and update the inode.  Always returns 0.
+ */
+static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
+{
+	if (task->tk_status < 0)
+		return 0;
+
+	hdr->res.count = hdr->args.count;
+	nfs_writeback_update_inode(hdr);
+	return 0;
+}
+
+static void nfs_proc_write_setup(struct nfs_pgio_header *hdr,
+ struct rpc_message *msg,
+ struct rpc_clnt **clnt)
+{
+ /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
+ hdr->args.stable = NFS_FILE_SYNC;
+ msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE]; /* select the WRITE procedure; clnt is left untouched */
+}
+
+static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
+{
+ BUG(); /* must never run; NOTE(review): presumably because nfs_proc_write_setup forces NFS_FILE_SYNC, so no COMMIT is needed — confirm */
+}
+
+static void
+nfs_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg,
+ struct rpc_clnt **clnt)
+{
+ BUG(); /* must never run; NOTE(review): presumably because nfs_proc_write_setup forces NFS_FILE_SYNC, so no COMMIT is needed — confirm */
+}
+
+/*
+ * Forward a lock request on @filp to nlmclnt_proc() using the NLM host
+ * of the server the file lives on.
+ */
+static int
+nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	struct nfs_server *server = NFS_SERVER(file_inode(filp));
+
+	return nlmclnt_proc(server->nlm_host, cmd, fl, NULL);
+}
+
+/* Helper functions for NFS lock bounds checking */
+#define NFS_LOCK32_OFFSET_MAX ((__s32)0x7fffffffUL)
+/*
+ * Check that the range of @fl fits in signed 32-bit offsets.  An fl_end
+ * of OFFSET_MAX is treated as NFS_LOCK32_OFFSET_MAX.  Returns 0 if the
+ * range is representable, -EINVAL otherwise.
+ */
+static int nfs_lock_check_bounds(const struct file_lock *fl)
+{
+	__s32 start = (__s32)fl->fl_start;
+	__s32 end = NFS_LOCK32_OFFSET_MAX;
+
+	/* The value must survive the round trip through 32 bits */
+	if ((loff_t)start != fl->fl_start)
+		return -EINVAL;
+
+	if (fl->fl_end != OFFSET_MAX) {
+		end = (__s32)fl->fl_end;
+		if ((loff_t)end != fl->fl_end)
+			return -EINVAL;
+	}
+
+	if (start < 0 || start > end)
+		return -EINVAL;
+	return 0;
+}
+
+static int nfs_have_delegation(struct inode *inode, fmode_t flags)
+{
+ return 0; /* unconditionally reports that no delegation is held; both arguments are ignored */
+}
+
+static const struct inode_operations nfs_dir_inode_operations = { /* installed as nfs_v2_clientops.dir_inode_ops */
+ .create = nfs_create,
+ .lookup = nfs_lookup,
+ .link = nfs_link,
+ .unlink = nfs_unlink,
+ .symlink = nfs_symlink,
+ .mkdir = nfs_mkdir,
+ .rmdir = nfs_rmdir,
+ .mknod = nfs_mknod,
+ .rename = nfs_rename,
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+};
+
+static const struct inode_operations nfs_file_inode_operations = { /* installed as nfs_v2_clientops.file_inode_ops */
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+};
+
+const struct nfs_rpc_ops nfs_v2_clientops = { /* operations table binding the nfs_proc_* routines in this file to protocol version 2 */
+ .version = 2, /* protocol version */
+ .dentry_ops = &nfs_dentry_operations,
+ .dir_inode_ops = &nfs_dir_inode_operations,
+ .file_inode_ops = &nfs_file_inode_operations,
+ .file_ops = &nfs_file_operations,
+ .getroot = nfs_proc_get_root,
+ .submount = nfs_submount,
+ .try_get_tree = nfs_try_get_tree,
+ .getattr = nfs_proc_getattr,
+ .setattr = nfs_proc_setattr,
+ .lookup = nfs_proc_lookup,
+ .access = NULL, /* access */
+ .readlink = nfs_proc_readlink,
+ .create = nfs_proc_create,
+ .remove = nfs_proc_remove,
+ .unlink_setup = nfs_proc_unlink_setup,
+ .unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare,
+ .unlink_done = nfs_proc_unlink_done,
+ .rename_setup = nfs_proc_rename_setup,
+ .rename_rpc_prepare = nfs_proc_rename_rpc_prepare,
+ .rename_done = nfs_proc_rename_done,
+ .link = nfs_proc_link,
+ .symlink = nfs_proc_symlink,
+ .mkdir = nfs_proc_mkdir,
+ .rmdir = nfs_proc_rmdir,
+ .readdir = nfs_proc_readdir,
+ .mknod = nfs_proc_mknod,
+ .statfs = nfs_proc_statfs,
+ .fsinfo = nfs_proc_fsinfo,
+ .pathconf = nfs_proc_pathconf,
+ .decode_dirent = nfs2_decode_dirent,
+ .pgio_rpc_prepare = nfs_proc_pgio_rpc_prepare,
+ .read_setup = nfs_proc_read_setup,
+ .read_done = nfs_read_done,
+ .write_setup = nfs_proc_write_setup,
+ .write_done = nfs_write_done,
+ .commit_setup = nfs_proc_commit_setup, /* BUG()s if ever called */
+ .commit_rpc_prepare = nfs_proc_commit_rpc_prepare, /* BUG()s if ever called */
+ .lock = nfs_proc_lock,
+ .lock_check_bounds = nfs_lock_check_bounds,
+ .close_context = nfs_close_context,
+ .have_delegation = nfs_have_delegation, /* always returns 0 */
+ .alloc_client = nfs_alloc_client,
+ .init_client = nfs_init_client,
+ .free_client = nfs_free_client,
+ .create_server = nfs_create_server,
+ .clone_server = nfs_clone_server,
+};
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
new file mode 100644
index 000000000..eb854f1f8
--- /dev/null
+++ b/fs/nfs/read.c
@@ -0,0 +1,486 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/read.c
+ *
+ * Block I/O for NFS
+ *
+ * Partial copy of Linus' read cache modifications to fs/nfs/file.c
+ * modified for async RPC by okir@monad.swb.de
+ */
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/module.h>
+
+#include "nfs4_fs.h"
+#include "internal.h"
+#include "iostat.h"
+#include "fscache.h"
+#include "pnfs.h"
+#include "nfstrace.h"
+
+#define NFSDBG_FACILITY NFSDBG_PAGECACHE
+
+/* Forward declarations: both tables are defined further down in this file. */
+static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
+static const struct nfs_rw_ops nfs_rw_read_ops;
+
+/*
+ * Slab cache for read I/O headers; created by nfs_init_readpagecache() and
+ * destroyed by nfs_destroy_readpagecache().
+ */
+static struct kmem_cache *nfs_rdata_cachep;
+
+/*
+ * Allocate a zeroed I/O header from the read slab cache and tag it as a
+ * read (rw_mode = FMODE_READ). Returns NULL if the allocation fails.
+ */
+static struct nfs_pgio_header *nfs_readhdr_alloc(void)
+{
+	struct nfs_pgio_header *hdr;
+
+	hdr = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
+	if (!hdr)
+		return NULL;
+
+	hdr->rw_mode = FMODE_READ;
+	return hdr;
+}
+
+/* Return a read I/O header to the slab cache it was allocated from. */
+static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
+{
+ kmem_cache_free(nfs_rdata_cachep, rhdr);
+}
+
+/*
+ * Complete a read without any I/O: zero the whole page, mark it up to date
+ * and unlock it. Callers in this file use it when nfs_page_length() is 0.
+ * Always returns 0.
+ */
+static int nfs_return_empty_page(struct page *page)
+{
+	zero_user(page, 0, PAGE_SIZE);
+	SetPageUptodate(page);
+	unlock_page(page);
+
+	return 0;
+}
+
+/*
+ * Set up @pgio for reading from @inode.
+ *
+ * The generic nfs_pgio_rw_ops are used unless (with CONFIG_NFS_V4_1) the
+ * server has a current pNFS layout driver and @force_mds is false, in which
+ * case the layout driver's pg_read_ops are used instead. The block size
+ * handed to nfs_pageio_init() is the server's rsize.
+ */
+void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
+			  struct inode *inode, bool force_mds,
+			  const struct nfs_pgio_completion_ops *compl_ops)
+{
+	struct nfs_server *nfss = NFS_SERVER(inode);
+	const struct nfs_pageio_ops *ops = &nfs_pgio_rw_ops;
+
+#ifdef CONFIG_NFS_V4_1
+	if (!force_mds && nfss->pnfs_curr_ld)
+		ops = nfss->pnfs_curr_ld->pg_read_ops;
+#endif
+	nfs_pageio_init(pgio, inode, ops, compl_ops, &nfs_rw_read_ops,
+			nfss->rsize, 0);
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
+
+/*
+ * Switch an already initialised read descriptor back to the generic
+ * nfs_pgio_rw_ops: run the current ops' pg_cleanup hook (if any), install
+ * the generic ops and reset the first mirror's block size to the server's
+ * rsize.
+ */
+void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
+{
+	const struct nfs_pageio_ops *old_ops = pgio->pg_ops;
+
+	if (old_ops && old_ops->pg_cleanup)
+		old_ops->pg_cleanup(pgio);
+
+	pgio->pg_ops = &nfs_pgio_rw_ops;
+
+	/* read path should never have more than one mirror */
+	WARN_ON_ONCE(pgio->pg_mirror_count != 1);
+
+	pgio->pg_mirrors[0].pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
+
+/*
+ * Finish with one read request.
+ *
+ * @req:   request being retired; its reference is dropped on every path.
+ * @error: 0 or a negative errno describing how the read ended.
+ *
+ * The page is unlocked only when nfs_page_group_sync_on_bit(req,
+ * PG_UNLOCKPAGE) returns true (presumably once the whole page group has
+ * reached this point - confirm against the page-group helpers).
+ */
+static void nfs_readpage_release(struct nfs_page *req, int error)
+{
+ struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
+ struct page *page = req->wb_page;
+
+ dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
+ (long long)req_offset(req));
+
+ /* -ETIMEDOUT is explicitly excluded from the errors that flag the page */
+ if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
+ SetPageError(page);
+ if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
+ struct address_space *mapping = page_file_mapping(page);
+
+ /*
+ * Up-to-date pages are passed to fscache; a page that is not up
+ * to date and has neither the error nor the private flag set is
+ * removed from the mapping.
+ */
+ if (PageUptodate(page))
+ nfs_readpage_to_fscache(inode, page, 0);
+ else if (!PageError(page) && !PagePrivate(page))
+ generic_error_remove_page(mapping, page);
+ unlock_page(page);
+ }
+ nfs_release_request(req);
+}
+
+/*
+ * Queue an asynchronous read for a single page.
+ *
+ * @ctx:   open context the request is created against.
+ * @inode: inode the page belongs to.
+ * @page:  page to fill.
+ *
+ * A zero-length page (per nfs_page_length()) is completed immediately by
+ * nfs_return_empty_page(). Otherwise one request covering the first @len
+ * bytes is created, the rest of the page is zeroed, and the request is
+ * pushed through a local pageio descriptor.
+ *
+ * Returns 0, the error from nfs_create_request() (the page is unlocked in
+ * that case), or the descriptor's pg_error if that is negative.
+ */
+int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
+ struct page *page)
+{
+ struct nfs_page *new;
+ unsigned int len;
+ struct nfs_pageio_descriptor pgio;
+ struct nfs_pgio_mirror *pgm;
+
+ len = nfs_page_length(page);
+ if (len == 0)
+ return nfs_return_empty_page(page);
+ new = nfs_create_request(ctx, page, 0, len);
+ if (IS_ERR(new)) {
+ unlock_page(page);
+ return PTR_ERR(new);
+ }
+ /* the request only covers [0, len); clear the remainder of the page */
+ if (len < PAGE_SIZE)
+ zero_user_segment(page, len, PAGE_SIZE);
+
+ nfs_pageio_init_read(&pgio, inode, false,
+ &nfs_async_read_completion_ops);
+ /* if the request could not be queued, retire it here with pg_error */
+ if (!nfs_pageio_add_request(&pgio, new)) {
+ nfs_list_remove_request(new);
+ nfs_readpage_release(new, pgio.pg_error);
+ }
+ nfs_pageio_complete(&pgio);
+
+ /* It doesn't make sense to do mirrored reads! */
+ WARN_ON_ONCE(pgio.pg_mirror_count != 1);
+
+ /* account the mirror's pg_bytes_written counter into the inode's read_io */
+ pgm = &pgio.pg_mirrors[0];
+ NFS_I(inode)->read_io += pgm->pg_bytes_written;
+
+ return pgio.pg_error < 0 ? pgio.pg_error : 0;
+}
+
+/*
+ * Mark the request's page up to date, but only when
+ * nfs_page_group_sync_on_bit(req, PG_UPTODATE) returns true.
+ */
+static void nfs_page_group_set_uptodate(struct nfs_page *req)
+{
+ if (nfs_page_group_sync_on_bit(req, PG_UPTODATE))
+ SetPageUptodate(req->wb_page);
+}
+
+/*
+ * Completion callback for a read I/O header (installed as .completion in
+ * nfs_async_read_completion_ops).
+ *
+ * If NFS_IOHDR_REDO is set, no request is touched and only hdr->release()
+ * runs. Otherwise every request on hdr->pages is retired in list order while
+ * @bytes tracks the running total of request lengths, which is compared
+ * against hdr->good_bytes to decide which pages become up to date, which
+ * byte ranges get zeroed (EOF case) and which requests carry hdr->error.
+ */
+static void nfs_read_completion(struct nfs_pgio_header *hdr)
+{
+ unsigned long bytes = 0;
+ int error;
+
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
+ goto out;
+ while (!list_empty(&hdr->pages)) {
+ struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+ struct page *page = req->wb_page;
+ unsigned long start = req->wb_pgbase;
+ unsigned long end = req->wb_pgbase + req->wb_bytes;
+
+ if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
+ /* note: regions of the page not covered by a
+ * request are zeroed in nfs_readpage_async /
+ * readpage_async_filler */
+ if (bytes > hdr->good_bytes) {
+ /* nothing in this request was good, so zero
+ * the full extent of the request */
+ zero_user_segment(page, start, end);
+
+ } else if (hdr->good_bytes - bytes < req->wb_bytes) {
+ /* part of this request has good bytes, but
+ * not all. zero the bad bytes */
+ start += hdr->good_bytes - bytes;
+ WARN_ON(start < req->wb_pgbase);
+ zero_user_segment(page, start, end);
+ }
+ }
+ error = 0;
+ /* from here on, bytes includes this request */
+ bytes += req->wb_bytes;
+ if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
+ /* requests entirely within good_bytes are still valid */
+ if (bytes <= hdr->good_bytes)
+ nfs_page_group_set_uptodate(req);
+ else {
+ /* record the error on the open context as well */
+ error = hdr->error;
+ xchg(&nfs_req_openctx(req)->error, error);
+ }
+ } else
+ nfs_page_group_set_uptodate(req);
+ nfs_list_remove_request(req);
+ nfs_readpage_release(req, error);
+ }
+out:
+ hdr->release(hdr);
+}
+
+/*
+ * .rw_initiate hook for reads: add NFS_RPC_SWAPFLAGS to the task flags when
+ * the inode is a swap file, let the protocol's read_setup() prepare the
+ * message, then emit the tracepoint. @how is not used here.
+ */
+static void nfs_initiate_read(struct nfs_pgio_header *hdr,
+			      struct rpc_message *msg,
+			      const struct nfs_rpc_ops *rpc_ops,
+			      struct rpc_task_setup *task_setup_data, int how)
+{
+	if (IS_SWAPFILE(hdr->inode))
+		task_setup_data->flags |= NFS_RPC_SWAPFLAGS;
+
+	rpc_ops->read_setup(hdr, msg);
+	trace_nfs_initiate_read(hdr);
+}
+
+/*
+ * .error_cleanup hook: drain @head, retiring every request on it with
+ * @error via nfs_readpage_release().
+ */
+static void nfs_async_read_error(struct list_head *head, int error)
+{
+	while (!list_empty(head)) {
+		struct nfs_page *first = nfs_list_entry(head->next);
+
+		nfs_list_remove_request(first);
+		nfs_readpage_release(first, error);
+	}
+}
+
+/* Completion callbacks passed to nfs_pageio_init_read() by the read paths in this file. */
+static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
+ .error_cleanup = nfs_async_read_error,
+ .completion = nfs_read_completion,
+};
+
+/*
+ * RPC completion hook for reads (.rw_done): called whether a reply was
+ * received or an error occurred (timeout or socket shutdown).
+ *
+ * The protocol's read_done() runs first; a non-zero result from it is
+ * returned unchanged. Otherwise the byte counter and tracepoint are updated
+ * and, on -ESTALE, the inode is marked stale and flagged for revalidation.
+ * Returns 0 in that case.
+ */
+static int nfs_readpage_done(struct rpc_task *task,
+			     struct nfs_pgio_header *hdr,
+			     struct inode *inode)
+{
+	int ret;
+
+	ret = NFS_PROTO(inode)->read_done(task, hdr);
+	if (ret)
+		return ret;
+
+	nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, hdr->res.count);
+	trace_nfs_readpage_done(task, hdr);
+
+	if (task->tk_status != -ESTALE)
+		return 0;
+
+	nfs_set_inode_stale(inode);
+	nfs_mark_for_revalidate(inode);
+	return 0;
+}
+
+/*
+ * Handle a short read (called from nfs_readpage_result() when the reply was
+ * not EOF and returned fewer bytes than requested).
+ *
+ * - No bytes returned: record -EIO at the request offset and give up.
+ * - Task without tk_ops: set hdr->pnfs_error to -EAGAIN and return.
+ * - Otherwise: advance the offsets past the bytes received, shrink the
+ *   remaining count, clear the result and restart the RPC call.
+ */
+static void nfs_readpage_retry(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ struct nfs_pgio_args *argp = &hdr->args;
+ struct nfs_pgio_res *resp = &hdr->res;
+
+ /* This is a short read! */
+ nfs_inc_stats(hdr->inode, NFSIOS_SHORTREAD);
+ trace_nfs_readpage_short(task, hdr);
+
+ /* Has the server at least made some progress? */
+ if (resp->count == 0) {
+ nfs_set_pgio_error(hdr, -EIO, argp->offset);
+ return;
+ }
+
+ /* For non rpc-based layout drivers, retry-through-MDS */
+ if (!task->tk_ops) {
+ hdr->pnfs_error = -EAGAIN;
+ return;
+ }
+
+ /* Yes, so retry the read at the end of the hdr */
+ hdr->mds_offset += resp->count;
+ argp->offset += resp->count;
+ argp->pgbase += resp->count;
+ argp->count -= resp->count;
+ /* reset the result fields before the call is reissued */
+ resp->count = 0;
+ resp->eof = 0;
+ rpc_restart_call_prepare(task);
+}
+
+/*
+ * .rw_result hook for reads.
+ *
+ * Without EOF, a reply shorter than the request is handed to
+ * nfs_readpage_retry(). With EOF, good_bytes is clamped to the number of
+ * bytes between io_start and the end of the returned data; when that
+ * actually lowers good_bytes, NFS_IOHDR_EOF is set and NFS_IOHDR_ERROR is
+ * cleared.
+ */
+static void nfs_readpage_result(struct rpc_task *task,
+				struct nfs_pgio_header *hdr)
+{
+	loff_t end_pos;
+	unsigned int valid;
+
+	if (!hdr->res.eof) {
+		if (hdr->res.count < hdr->args.count)
+			nfs_readpage_retry(task, hdr);
+		return;
+	}
+
+	end_pos = hdr->args.offset + hdr->res.count;
+	valid = end_pos - hdr->io_start;
+	if (hdr->good_bytes <= valid)
+		return;
+
+	hdr->good_bytes = valid;
+	set_bit(NFS_IOHDR_EOF, &hdr->flags);
+	clear_bit(NFS_IOHDR_ERROR, &hdr->flags);
+}
+
+/*
+ * Read a page over NFS.
+ *
+ * @file: open file the read is for, or NULL, in which case an open context
+ *        with FMODE_READ is looked up on the inode (-EBADF if none exists).
+ * @page: page to fill; the function visibly unlocks it on the out_unlock
+ *        paths.
+ *
+ * Steps: flush pending writes for the page, return early if it is already
+ * up to date or the inode is stale (-ESTALE), try fscache unless the inode
+ * is IS_SYNC, then issue the read with nfs_readpage_async() and wait
+ * (killably) for the page to be unlocked. If the page is still not up to
+ * date after a successful wait, the error stored on the open context is
+ * returned.
+ */
+int nfs_readpage(struct file *file, struct page *page)
+{
+ struct nfs_open_context *ctx;
+ struct inode *inode = page_file_mapping(page)->host;
+ int error;
+
+ dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
+ page, PAGE_SIZE, page_index(page));
+ nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
+ nfs_add_stats(inode, NFSIOS_READPAGES, 1);
+
+ /*
+ * Try to flush any pending writes to the file..
+ *
+ * NOTE! Because we own the page lock, there cannot
+ * be any new pending writes generated at this point
+ * for this page (other pages can be written to).
+ */
+ error = nfs_wb_page(inode, page);
+ if (error)
+ goto out_unlock;
+ /* error is 0 here, so an already up-to-date page returns success */
+ if (PageUptodate(page))
+ goto out_unlock;
+
+ error = -ESTALE;
+ if (NFS_STALE(inode))
+ goto out_unlock;
+
+ if (file == NULL) {
+ error = -EBADF;
+ ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
+ if (ctx == NULL)
+ goto out_unlock;
+ } else
+ ctx = get_nfs_open_context(nfs_file_open_context(file));
+
+ /* from here on a context reference is held; exit through "out" */
+ if (!IS_SYNC(inode)) {
+ error = nfs_readpage_from_fscache(ctx, inode, page);
+ if (error == 0)
+ goto out;
+ }
+
+ /* clear any previous error on the context before starting the read */
+ xchg(&ctx->error, 0);
+ error = nfs_readpage_async(ctx, inode, page);
+ if (!error) {
+ error = wait_on_page_locked_killable(page);
+ if (!PageUptodate(page) && !error)
+ error = xchg(&ctx->error, 0);
+ }
+out:
+ put_nfs_open_context(ctx);
+ return error;
+out_unlock:
+ unlock_page(page);
+ return error;
+}
+
+/* Cookie handed to readpage_async_filler() through read_cache_pages(). */
+struct nfs_readdesc {
+ struct nfs_pageio_descriptor *pgio; /* descriptor requests are added to */
+ struct nfs_open_context *ctx; /* open context used to create requests */
+};
+
+/*
+ * Per-page callback used by nfs_readpages() via read_cache_pages().
+ *
+ * @data: the struct nfs_readdesc for this batch.
+ * @page: page to queue for reading.
+ *
+ * Returns 0 when the page was completed as empty or queued, the error from
+ * nfs_create_request() (page unlocked here), or the descriptor's pg_error
+ * when the request could not be added (request released here).
+ */
+static int
+readpage_async_filler(void *data, struct page *page)
+{
+	struct nfs_readdesc *desc = data;
+	unsigned int len = nfs_page_length(page);
+	struct nfs_page *req;
+	int error;
+
+	if (len == 0)
+		return nfs_return_empty_page(page);
+
+	req = nfs_create_request(desc->ctx, page, 0, len);
+	if (IS_ERR(req)) {
+		error = PTR_ERR(req);
+		unlock_page(page);
+		return error;
+	}
+
+	/* the request covers only [0, len); clear the tail of the page */
+	if (len < PAGE_SIZE)
+		zero_user_segment(page, len, PAGE_SIZE);
+	if (nfs_pageio_add_request(desc->pgio, req))
+		return 0;
+
+	nfs_list_remove_request(req);
+	error = desc->pgio->pg_error;
+	nfs_readpage_release(req, error);
+	return error;
+}
+
+/*
+ * Read a list of pages for @mapping.
+ *
+ * @filp:     open file the read is for, or NULL, in which case an open
+ *            context with FMODE_READ is looked up on the inode.
+ * @mapping:  address space the pages belong to.
+ * @pages:    list of pages to read.
+ * @nr_pages: number of pages on @pages.
+ *
+ * Returns -ESTALE if the inode is stale, -EBADF if no open context can be
+ * found, 0 if fscache satisfied every page, and otherwise the result of
+ * read_cache_pages().
+ */
+int nfs_readpages(struct file *filp, struct address_space *mapping,
+		struct list_head *pages, unsigned nr_pages)
+{
+	struct nfs_pageio_descriptor pgio;
+	struct nfs_pgio_mirror *pgm;
+	struct nfs_readdesc desc = {
+		.pgio = &pgio,
+	};
+	struct inode *inode = mapping->host;
+	unsigned long npages;
+	int ret = -ESTALE;
+
+	/* nr_pages is unsigned, so it must be printed with %u, not %d */
+	dprintk("NFS: nfs_readpages (%s/%Lu %u)\n",
+			inode->i_sb->s_id,
+			(unsigned long long)NFS_FILEID(inode),
+			nr_pages);
+	nfs_inc_stats(inode, NFSIOS_VFSREADPAGES);
+
+	if (NFS_STALE(inode))
+		goto out;
+
+	if (filp == NULL) {
+		desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
+		if (desc.ctx == NULL)
+			return -EBADF;
+	} else
+		desc.ctx = get_nfs_open_context(nfs_file_open_context(filp));
+
+	/* attempt to read as many of the pages as possible from the cache
+	 * - this returns -ENOBUFS immediately if the cookie is negative
+	 */
+	ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping,
+					 pages, &nr_pages);
+	if (ret == 0)
+		goto read_complete; /* all pages were read */
+
+	nfs_pageio_init_read(&pgio, inode, false,
+			     &nfs_async_read_completion_ops);
+
+	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
+	nfs_pageio_complete(&pgio);
+
+	/* It doesn't make sense to do mirrored reads! */
+	WARN_ON_ONCE(pgio.pg_mirror_count != 1);
+
+	/* account the mirror's byte counter, and the pages it rounds up to */
+	pgm = &pgio.pg_mirrors[0];
+	NFS_I(inode)->read_io += pgm->pg_bytes_written;
+	npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >>
+		 PAGE_SHIFT;
+	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
+read_complete:
+	put_nfs_open_context(desc.ctx);
+out:
+	return ret;
+}
+
+/*
+ * Create the slab cache used for read I/O headers.
+ * Returns 0 on success or -ENOMEM if the cache cannot be created.
+ */
+int __init nfs_init_readpagecache(void)
+{
+	struct kmem_cache *cache;
+
+	cache = kmem_cache_create("nfs_read_data",
+				  sizeof(struct nfs_pgio_header),
+				  0, SLAB_HWCACHE_ALIGN, NULL);
+	nfs_rdata_cachep = cache;
+
+	return cache ? 0 : -ENOMEM;
+}
+
+/* Destroy the slab cache created by nfs_init_readpagecache(). */
+void nfs_destroy_readpagecache(void)
+{
+ kmem_cache_destroy(nfs_rdata_cachep);
+}
+
+/* Read-side hooks passed to nfs_pageio_init() by nfs_pageio_init_read(). */
+static const struct nfs_rw_ops nfs_rw_read_ops = {
+ .rw_alloc_header = nfs_readhdr_alloc,
+ .rw_free_header = nfs_readhdr_free,
+ .rw_done = nfs_readpage_done,
+ .rw_result = nfs_readpage_result,
+ .rw_initiate = nfs_initiate_read,
+};
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
new file mode 100644
index 000000000..b3fcc27b9
--- /dev/null
+++ b/fs/nfs/super.c
@@ -0,0 +1,1401 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/super.c
+ *
+ * Copyright (C) 1992 Rick Sladkey
+ *
+ * nfs superblock handling functions
+ *
+ * Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some
+ * experimental NFS changes. Modularisation taken straight from SYS5 fs.
+ *
+ * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
+ * J.S.Peatfield@damtp.cam.ac.uk
+ *
+ * Split from inode.c by David Howells <dhowells@redhat.com>
+ *
+ * - superblocks are indexed on server only - all inodes, dentries, etc. associated with a
+ * particular server are held in the same superblock
+ * - NFS superblocks can have several effective roots to the dentry tree
+ * - directory type roots are spliced into the tree when a path from one root reaches the root
+ * of another (see nfs_lookup())
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/xprtsock.h>
+#include <linux/sunrpc/xprtrdma.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/slab.h>
+#include <net/ipv6.h>
+#include <linux/netdevice.h>
+#include <linux/nfs_xdr.h>
+#include <linux/magic.h>
+#include <linux/parser.h>
+#include <linux/nsproxy.h>
+#include <linux/rcupdate.h>
+
+#include <linux/uaccess.h>
+#include <linux/nfs_ssc.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+#include "fscache.h"
+#include "nfs4session.h"
+#include "pnfs.h"
+#include "nfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+/* Superblock operations table for NFS; exported for use by other modules. */
+const struct super_operations nfs_sops = {
+ .alloc_inode = nfs_alloc_inode,
+ .free_inode = nfs_free_inode,
+ .write_inode = nfs_write_inode,
+ .drop_inode = nfs_drop_inode,
+ .statfs = nfs_statfs,
+ .evict_inode = nfs_evict_inode,
+ .umount_begin = nfs_umount_begin,
+ .show_options = nfs_show_options,
+ .show_devname = nfs_show_devname,
+ .show_path = nfs_show_path,
+ .show_stats = nfs_show_stats,
+};
+EXPORT_SYMBOL_GPL(nfs_sops);
+
+/* Client ops table registered/unregistered by nfs_ssc_{register,unregister}_ops(). */
+static const struct nfs_ssc_client_ops nfs_ssc_clnt_ops_tbl = {
+ .sco_sb_deactive = nfs_sb_deactive,
+};
+
+/*
+ * Register/unregister the nfs4 filesystem type. When CONFIG_NFS_V4 is not
+ * enabled both helpers are no-ops (registration reports success).
+ */
+#if IS_ENABLED(CONFIG_NFS_V4)
+static int __init register_nfs4_fs(void)
+{
+ return register_filesystem(&nfs4_fs_type);
+}
+
+static void unregister_nfs4_fs(void)
+{
+ unregister_filesystem(&nfs4_fs_type);
+}
+#else
+static int __init register_nfs4_fs(void)
+{
+ return 0;
+}
+
+static void unregister_nfs4_fs(void)
+{
+}
+#endif
+
+/* Register this file's ssc client ops table. */
+static void nfs_ssc_register_ops(void)
+{
+ nfs_ssc_register(&nfs_ssc_clnt_ops_tbl);
+}
+
+/* Unregister this file's ssc client ops table. */
+static void nfs_ssc_unregister_ops(void)
+{
+ nfs_ssc_unregister(&nfs_ssc_clnt_ops_tbl);
+}
+
+/*
+ * Shrinker registered in register_nfs_fs(); its callbacks are the
+ * access-cache count and scan functions.
+ */
+static struct shrinker acl_shrinker = {
+ .count_objects = nfs_access_cache_count,
+ .scan_objects = nfs_access_cache_scan,
+ .seeks = DEFAULT_SEEKS,
+};
+
+/*
+ * Register the NFS filesystems
+ *
+ * Registers, in order: the nfs filesystem type, the nfs4 filesystem type,
+ * the sysctl entries and the shrinker, then the ssc ops. On failure the
+ * steps already completed are undone in reverse order and the negative
+ * error code of the failing step is returned; returns 0 on success.
+ */
+int __init register_nfs_fs(void)
+{
+ int ret;
+
+ ret = register_filesystem(&nfs_fs_type);
+ if (ret < 0)
+ goto error_0;
+
+ ret = register_nfs4_fs();
+ if (ret < 0)
+ goto error_1;
+
+ ret = nfs_register_sysctl();
+ if (ret < 0)
+ goto error_2;
+ ret = register_shrinker(&acl_shrinker);
+ if (ret < 0)
+ goto error_3;
+ /* returns void, so there is nothing to unwind for this step */
+ nfs_ssc_register_ops();
+ return 0;
+error_3:
+ nfs_unregister_sysctl();
+error_2:
+ unregister_nfs4_fs();
+error_1:
+ unregister_filesystem(&nfs_fs_type);
+error_0:
+ return ret;
+}
+
+/*
+ * Unregister the NFS filesystems
+ *
+ * Tears down everything register_nfs_fs() set up: shrinker, sysctl entries,
+ * nfs4 filesystem type, ssc ops and finally the nfs filesystem type.
+ */
+void __exit unregister_nfs_fs(void)
+{
+ unregister_shrinker(&acl_shrinker);
+ nfs_unregister_sysctl();
+ unregister_nfs4_fs();
+ nfs_ssc_unregister_ops();
+ unregister_filesystem(&nfs_fs_type);
+}
+
+/*
+ * Take a reference on server->active for @sb.
+ *
+ * Returns false (taking nothing) if sb->s_active is already zero. Otherwise
+ * s_active is bumped first; only the caller that moves server->active from
+ * 0 to 1 keeps that s_active reference, every other caller drops it again.
+ * Pair with nfs_sb_deactive().
+ */
+bool nfs_sb_active(struct super_block *sb)
+{
+ struct nfs_server *server = NFS_SB(sb);
+
+ if (!atomic_inc_not_zero(&sb->s_active))
+ return false;
+ /* not the first holder: give back the extra s_active reference */
+ if (atomic_inc_return(&server->active) != 1)
+ atomic_dec(&sb->s_active);
+ return true;
+}
+EXPORT_SYMBOL_GPL(nfs_sb_active);
+
+/*
+ * Drop a reference on server->active for @sb; when the count reaches zero
+ * the superblock is released with deactivate_super().
+ */
+void nfs_sb_deactive(struct super_block *sb)
+{
+ struct nfs_server *server = NFS_SB(sb);
+
+ if (atomic_dec_and_test(&server->active))
+ deactivate_super(sb);
+}
+EXPORT_SYMBOL_GPL(nfs_sb_deactive);
+
+/*
+ * Call @fn(server, @data) for each server on @head (linked through
+ * client_link) whose superblock can be pinned with nfs_sb_active().
+ *
+ * The list is walked under rcu_read_lock(), but the lock is dropped around
+ * each @fn call. The most recently visited server stays pinned in @last
+ * until the next entry has been pinned (or the walk ends), and is only
+ * released outside the RCU read section.
+ *
+ * Stops at, and returns, the first non-zero value from @fn; returns 0 if
+ * every call returned 0.
+ */
+static int __nfs_list_for_each_server(struct list_head *head,
+ int (*fn)(struct nfs_server *, void *),
+ void *data)
+{
+ struct nfs_server *server, *last = NULL;
+ int ret = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, head, client_link) {
+ /* skip servers without a superblock or whose superblock cannot be pinned */
+ if (!(server->super && nfs_sb_active(server->super)))
+ continue;
+ rcu_read_unlock();
+ if (last)
+ nfs_sb_deactive(last->super);
+ last = server;
+ ret = fn(server, data);
+ if (ret)
+ goto out;
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+out:
+ if (last)
+ nfs_sb_deactive(last->super);
+ return ret;
+}
+
+/*
+ * Run @fn over the servers on @clp->cl_superblocks; see
+ * __nfs_list_for_each_server() for iteration and return semantics.
+ */
+int nfs_client_for_each_server(struct nfs_client *clp,
+ int (*fn)(struct nfs_server *, void *),
+ void *data)
+{
+ return __nfs_list_for_each_server(&clp->cl_superblocks, fn, data);
+}
+EXPORT_SYMBOL_GPL(nfs_client_for_each_server);
+
+/*
+ * Deliver file system statistics to userspace
+ *
+ * Issues the protocol's statfs call for @dentry's file handle and fills
+ * @buf from the reply. Byte counts are converted to blocks of the
+ * superblock's block size, rounding up. On -ESTALE the caches of the parent
+ * directory's inode are zapped before the error is returned.
+ *
+ * Returns 0 on success, -ENOMEM if the attribute buffer cannot be
+ * allocated, or the negative error from the statfs call.
+ */
+int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct nfs_server *server = NFS_SB(dentry->d_sb);
+ unsigned char blockbits;
+ unsigned long blockres;
+ struct nfs_fh *fh = NFS_FH(d_inode(dentry));
+ struct nfs_fsstat res;
+ int error = -ENOMEM;
+
+ res.fattr = nfs_alloc_fattr();
+ if (res.fattr == NULL)
+ goto out_err;
+
+ error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
+ if (unlikely(error == -ESTALE)) {
+ struct dentry *pd_dentry;
+
+ pd_dentry = dget_parent(dentry);
+ nfs_zap_caches(d_inode(pd_dentry));
+ dput(pd_dentry);
+ }
+ /* the fattr is only needed for the call itself */
+ nfs_free_fattr(res.fattr);
+ if (error < 0)
+ goto out_err;
+
+ buf->f_type = NFS_SUPER_MAGIC;
+
+ /*
+ * Current versions of glibc do not correctly handle the
+ * case where f_frsize != f_bsize. Eventually we want to
+ * report the value of wtmult in this field.
+ */
+ buf->f_frsize = dentry->d_sb->s_blocksize;
+
+ /*
+ * On most *nix systems, f_blocks, f_bfree, and f_bavail
+ * are reported in units of f_frsize. Linux hasn't had
+ * an f_frsize field in its statfs struct until recently,
+ * thus historically Linux's sys_statfs reports these
+ * fields in units of f_bsize.
+ */
+ buf->f_bsize = dentry->d_sb->s_blocksize;
+ blockbits = dentry->d_sb->s_blocksize_bits;
+ /* blockres is (block size - 1): adding it before the shift rounds up */
+ blockres = (1 << blockbits) - 1;
+ buf->f_blocks = (res.tbytes + blockres) >> blockbits;
+ buf->f_bfree = (res.fbytes + blockres) >> blockbits;
+ buf->f_bavail = (res.abytes + blockres) >> blockbits;
+
+ buf->f_files = res.tfiles;
+ buf->f_ffree = res.afiles;
+
+ buf->f_namelen = server->namelen;
+
+ return 0;
+
+ out_err:
+ dprintk("%s: statfs error = %d\n", __func__, -error);
+ return error;
+}
+EXPORT_SYMBOL_GPL(nfs_statfs);
+
+/*
+ * Translate an RPC security pseudoflavour into the name shown in mount
+ * options. Flavours missing from the table map to "unknown".
+ */
+static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
+{
+	static const struct {
+		rpc_authflavor_t flavour;
+		const char *str;
+	} sec_flavours[NFS_AUTH_INFO_MAX_FLAVORS] = {
+		/* update NFS_AUTH_INFO_MAX_FLAVORS when this list changes! */
+		{ RPC_AUTH_NULL, "null" },
+		{ RPC_AUTH_UNIX, "sys" },
+		{ RPC_AUTH_GSS_KRB5, "krb5" },
+		{ RPC_AUTH_GSS_KRB5I, "krb5i" },
+		{ RPC_AUTH_GSS_KRB5P, "krb5p" },
+		{ RPC_AUTH_GSS_LKEY, "lkey" },
+		{ RPC_AUTH_GSS_LKEYI, "lkeyi" },
+		{ RPC_AUTH_GSS_LKEYP, "lkeyp" },
+		{ RPC_AUTH_GSS_SPKM, "spkm" },
+		{ RPC_AUTH_GSS_SPKMI, "spkmi" },
+		{ RPC_AUTH_GSS_SPKMP, "spkmp" },
+		{ UINT_MAX, "unknown" }
+	};
+	int idx = 0;
+
+	/* stop on a match or on the UINT_MAX sentinel, whichever comes first */
+	while (sec_flavours[idx].flavour != UINT_MAX &&
+	       sec_flavours[idx].flavour != flavour)
+		idx++;
+
+	return sec_flavours[idx].str;
+}
+
+/*
+ * Emit ",mountproto=<netid>" for the mountd transport.
+ *
+ * The netid is chosen from the address family (AF_INET / AF_INET6) and the
+ * mountd protocol (UDP / TCP). Any other combination has no netid: nothing
+ * is printed unless @showdefaults is set, in which case "auto" is shown.
+ */
+static void nfs_show_mountd_netid(struct seq_file *m, struct nfs_server *nfss,
+				  int showdefaults)
+{
+	struct sockaddr *sap = (struct sockaddr *) &nfss->mountd_address;
+	const bool is_udp = nfss->mountd_protocol == IPPROTO_UDP;
+	const bool is_tcp = nfss->mountd_protocol == IPPROTO_TCP;
+	char *proto = NULL;
+
+	if (sap->sa_family == AF_INET) {
+		if (is_udp)
+			proto = RPCBIND_NETID_UDP;
+		else if (is_tcp)
+			proto = RPCBIND_NETID_TCP;
+	} else if (sap->sa_family == AF_INET6) {
+		if (is_udp)
+			proto = RPCBIND_NETID_UDP6;
+		else if (is_tcp)
+			proto = RPCBIND_NETID_TCP6;
+	}
+
+	if (!proto && !showdefaults)
+		return;
+
+	seq_printf(m, ",mountproto=%s", proto ?: "auto");
+}
+
+/*
+ * Emit the mountd-related options (mountaddr, mountvers, mountport,
+ * mountproto) for @nfss. Nothing is printed when the mount carries the
+ * NFS_MOUNT_LEGACY_INTERFACE flag. With @showdefaults set, unset values are
+ * printed as well.
+ */
+static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
+ int showdefaults)
+{
+ struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address;
+
+ if (nfss->flags & NFS_MOUNT_LEGACY_INTERFACE)
+ return;
+
+ switch (sap->sa_family) {
+ case AF_INET: {
+ struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+ seq_printf(m, ",mountaddr=%pI4", &sin->sin_addr.s_addr);
+ break;
+ }
+ case AF_INET6: {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+ seq_printf(m, ",mountaddr=%pI6c", &sin6->sin6_addr);
+ break;
+ }
+ default:
+ if (showdefaults)
+ seq_puts(m, ",mountaddr=unspecified");
+ }
+
+ if (nfss->mountd_version || showdefaults)
+ seq_printf(m, ",mountvers=%u", nfss->mountd_version);
+ /* a port of 0 or NFS_UNSPEC_PORT counts as "not set" */
+ if ((nfss->mountd_port &&
+ nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) ||
+ showdefaults)
+ seq_printf(m, ",mountport=%u", nfss->mountd_port);
+
+ nfs_show_mountd_netid(m, nfss, showdefaults);
+}
+
+/*
+ * Emit the NFSv4-only mount option ",clientaddr=<cl_ipaddr>". Compiled to
+ * an empty stub when CONFIG_NFS_V4 is not enabled. @showdefaults is unused
+ * in both variants.
+ */
+#if IS_ENABLED(CONFIG_NFS_V4)
+static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
+ int showdefaults)
+{
+ struct nfs_client *clp = nfss->nfs_client;
+
+ seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
+}
+#else
+static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
+ int showdefaults)
+{
+}
+#endif
+
+/*
+ * Emit ",vers=<version>"; for version 4 the minor version is appended as
+ * ".<minorversion>".
+ */
+static void nfs_show_nfs_version(struct seq_file *m,
+				 unsigned int version,
+				 unsigned int minorversion)
+{
+	if (version != 4) {
+		seq_printf(m, ",vers=%u", version);
+		return;
+	}
+
+	seq_printf(m, ",vers=%u.%u", version, minorversion);
+}
+
+/*
+ * Describe the mount options in force on this server representation
+ *
+ * @m:            seq_file the comma-prefixed options are appended to.
+ * @nfss:         server whose settings are described.
+ * @showdefaults: non-zero to print attribute-cache timeouts (and, through
+ *                the mountd helper, mountd settings) even when they are
+ *                at their default values.
+ *
+ * The order of the emitted options is fixed by the statement order below.
+ */
+static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
+ int showdefaults)
+{
+ /* flag -> option text; nostr is printed when the flag is clear */
+ static const struct proc_nfs_info {
+ int flag;
+ const char *str;
+ const char *nostr;
+ } nfs_info[] = {
+ { NFS_MOUNT_SOFT, ",soft", "" },
+ { NFS_MOUNT_SOFTERR, ",softerr", "" },
+ { NFS_MOUNT_SOFTREVAL, ",softreval", "" },
+ { NFS_MOUNT_POSIX, ",posix", "" },
+ { NFS_MOUNT_NOCTO, ",nocto", "" },
+ { NFS_MOUNT_NOAC, ",noac", "" },
+ { NFS_MOUNT_NONLM, ",nolock", "" },
+ { NFS_MOUNT_NOACL, ",noacl", "" },
+ { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
+ { NFS_MOUNT_UNSHARED, ",nosharecache", "" },
+ { NFS_MOUNT_NORESVPORT, ",noresvport", "" },
+ { 0, NULL, NULL }
+ };
+ const struct proc_nfs_info *nfs_infop;
+ struct nfs_client *clp = nfss->nfs_client;
+ u32 version = clp->rpc_ops->version;
+ int local_flock, local_fcntl;
+
+ nfs_show_nfs_version(m, version, clp->cl_minorversion);
+ seq_printf(m, ",rsize=%u", nfss->rsize);
+ seq_printf(m, ",wsize=%u", nfss->wsize);
+ if (nfss->bsize != 0)
+ seq_printf(m, ",bsize=%u", nfss->bsize);
+ seq_printf(m, ",namlen=%u", nfss->namelen);
+ /* attribute-cache timeouts are stored scaled by HZ and shown divided by it */
+ if (nfss->acregmin != NFS_DEF_ACREGMIN*HZ || showdefaults)
+ seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ);
+ if (nfss->acregmax != NFS_DEF_ACREGMAX*HZ || showdefaults)
+ seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ);
+ if (nfss->acdirmin != NFS_DEF_ACDIRMIN*HZ || showdefaults)
+ seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ);
+ if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults)
+ seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ);
+ /* "hard" is shown when neither soft nor softerr is set */
+ if (!(nfss->flags & (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)))
+ seq_puts(m, ",hard");
+ for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
+ if (nfss->flags & nfs_infop->flag)
+ seq_puts(m, nfs_infop->str);
+ else
+ seq_puts(m, nfs_infop->nostr);
+ }
+ rcu_read_lock();
+ seq_printf(m, ",proto=%s",
+ rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID));
+ rcu_read_unlock();
+ if (clp->cl_nconnect > 0)
+ seq_printf(m, ",nconnect=%u", clp->cl_nconnect);
+ /* v4: show the port only if it differs from NFS_PORT; otherwise if non-zero */
+ if (version == 4) {
+ if (nfss->port != NFS_PORT)
+ seq_printf(m, ",port=%u", nfss->port);
+ } else
+ if (nfss->port)
+ seq_printf(m, ",port=%u", nfss->port);
+
+ seq_printf(m, ",timeo=%lu", 10U * nfss->client->cl_timeout->to_initval / HZ);
+ seq_printf(m, ",retrans=%u", nfss->client->cl_timeout->to_retries);
+ seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
+
+ if (version != 4)
+ nfs_show_mountd_options(m, nfss, showdefaults);
+ else
+ nfs_show_nfsv4_options(m, nfss, showdefaults);
+
+ if (nfss->options & NFS_OPTION_FSCACHE)
+ seq_puts(m, ",fsc");
+
+ if (nfss->options & NFS_OPTION_MIGRATION)
+ seq_puts(m, ",migration");
+
+ if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) {
+ if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
+ seq_puts(m, ",lookupcache=none");
+ else
+ seq_puts(m, ",lookupcache=pos");
+ }
+
+ local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK;
+ local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL;
+
+ /* local_lock is always reported: none, all, flock or posix */
+ if (!local_flock && !local_fcntl)
+ seq_puts(m, ",local_lock=none");
+ else if (local_flock && local_fcntl)
+ seq_puts(m, ",local_lock=all");
+ else if (local_flock)
+ seq_puts(m, ",local_lock=flock");
+ else
+ seq_puts(m, ",local_lock=posix");
+}
+
+/*
+ * Describe the mount options on this VFS mountpoint
+ *
+ * Prints the non-default mount options for the superblock of @root,
+ * followed by ",addr=<server address>". Always returns 0.
+ */
+int nfs_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct nfs_server *nfss = NFS_SB(root->d_sb);
+
+ nfs_show_mount_options(m, nfss, 0);
+
+ rcu_read_lock();
+ seq_printf(m, ",addr=%s",
+ rpc_peeraddr2str(nfss->nfs_client->cl_rpcclient,
+ RPC_DISPLAY_ADDR));
+ rcu_read_unlock();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_show_options);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+/*
+ * Append the lease state of @server's client to @m:
+ * ",lease_time=<seconds>" and ",lease_expired=<seconds>", where the latter
+ * is 0 while the lease (last renewal + lease time) is still in the future
+ * and otherwise the time elapsed since it lapsed.
+ */
+static void show_lease(struct seq_file *m, struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+	unsigned long expire;
+
+	/* the printed values are unsigned long quantities, hence %lu */
+	seq_printf(m, ",lease_time=%lu", clp->cl_lease_time / HZ);
+	expire = clp->cl_last_renewal + clp->cl_lease_time;
+	seq_printf(m, ",lease_expired=%lu",
+		   time_after(expire, jiffies) ? 0 : (jiffies - expire) / HZ);
+}
+/*
+ * Append ",sessions" when nfs4_has_session() is true for the server's
+ * client; an empty stub without CONFIG_NFS_V4_1.
+ */
+#ifdef CONFIG_NFS_V4_1
+static void show_sessions(struct seq_file *m, struct nfs_server *server)
+{
+ if (nfs4_has_session(server->nfs_client))
+ seq_puts(m, ",sessions");
+}
+#else
+static void show_sessions(struct seq_file *m, struct nfs_server *server) {}
+#endif
+#endif
+
+#ifdef CONFIG_NFS_V4_1
+/*
+ * Append ",pnfs=<layout driver name>" to @m, or ",pnfs=not configured"
+ * when the server has no current layout driver.
+ */
+static void show_pnfs(struct seq_file *m, struct nfs_server *server)
+{
+	const char *ld_name = "not configured";
+
+	if (server->pnfs_curr_ld)
+		ld_name = server->pnfs_curr_ld->name;
+
+	seq_printf(m, ",pnfs=%s", ld_name);
+}
+
+/*
+ * Append an "impl_id:" line (name, domain, date) to @m when the server's
+ * client has an implementation id; prints nothing otherwise.
+ */
+static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss)
+{
+	struct nfs41_impl_id *id;
+
+	if (!nfss->nfs_client)
+		return;
+
+	id = nfss->nfs_client->cl_implid;
+	if (!id)
+		return;
+
+	seq_printf(m, "\n\timpl_id:\tname='%s',domain='%s',"
+		   "date='%llu,%u'",
+		   id->name, id->domain,
+		   id->date.seconds, id->date.nseconds);
+}
+#else
+/*
+ * Empty stand-ins used when CONFIG_NFS_V4_1 is not set. The show_pnfs()
+ * stub is only built when CONFIG_NFS_V4 is enabled.
+ */
+#if IS_ENABLED(CONFIG_NFS_V4)
+static void show_pnfs(struct seq_file *m, struct nfs_server *server)
+{
+}
+#endif
+static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss)
+{
+}
+#endif
+
+/*
+ * Print the device name for @root into @m, escaping space, tab, newline
+ * and backslash. The name is built by nfs_path() in a temporary page.
+ *
+ * Returns 0 on success, -ENOMEM if the page cannot be allocated, or the
+ * error returned by nfs_path().
+ */
+int nfs_show_devname(struct seq_file *m, struct dentry *root)
+{
+	char *buf = (char *) __get_free_page(GFP_KERNEL);
+	char *devname, *dummy;
+	int err = 0;
+
+	if (!buf)
+		return -ENOMEM;
+
+	devname = nfs_path(&dummy, root, buf, PAGE_SIZE, 0);
+	if (!IS_ERR(devname))
+		seq_escape(m, devname, " \t\n\\");
+	else
+		err = PTR_ERR(devname);
+
+	free_page((unsigned long)buf);
+	return err;
+}
+EXPORT_SYMBOL_GPL(nfs_show_devname);
+
+/* Always prints "/" as the path, regardless of @dentry; returns 0. */
+int nfs_show_path(struct seq_file *m, struct dentry *dentry)
+{
+ seq_puts(m, "/");
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_show_path);
+
+/*
+ * Present statistical information for this VFS mountpoint
+ *
+ * Output, in order: stats version, "opts:" (all mount options, including
+ * defaults), "age:", optional implementation id, "caps:", an "nfsv4:" line
+ * for version 4 mounts, "sec:", then the "events:" / "bytes:" (and, with
+ * fscache, "fsc:") counters summed over all possible CPUs, followed by the
+ * RPC client statistics. Always returns 0.
+ */
+int nfs_show_stats(struct seq_file *m, struct dentry *root)
+{
+ int i, cpu;
+ struct nfs_server *nfss = NFS_SB(root->d_sb);
+ struct rpc_auth *auth = nfss->client->cl_auth;
+ struct nfs_iostats totals = { };
+
+ seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS);
+
+ /*
+ * Display all mount option settings
+ */
+ seq_puts(m, "\n\topts:\t");
+ seq_puts(m, sb_rdonly(root->d_sb) ? "ro" : "rw");
+ seq_puts(m, root->d_sb->s_flags & SB_SYNCHRONOUS ? ",sync" : "");
+ seq_puts(m, root->d_sb->s_flags & SB_NOATIME ? ",noatime" : "");
+ seq_puts(m, root->d_sb->s_flags & SB_NODIRATIME ? ",nodiratime" : "");
+ nfs_show_mount_options(m, nfss, 1);
+
+ /* mount age: jiffies since mount_time, divided by HZ */
+ seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
+
+ show_implementation_id(m, nfss);
+
+ seq_puts(m, "\n\tcaps:\t");
+ seq_printf(m, "caps=0x%x", nfss->caps);
+ seq_printf(m, ",wtmult=%u", nfss->wtmult);
+ seq_printf(m, ",dtsize=%u", nfss->dtsize);
+ seq_printf(m, ",bsize=%u", nfss->bsize);
+ seq_printf(m, ",namlen=%u", nfss->namelen);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+ if (nfss->nfs_client->rpc_ops->version == 4) {
+ seq_puts(m, "\n\tnfsv4:\t");
+ seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
+ seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
+ seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]);
+ seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
+ show_sessions(m, nfss);
+ show_pnfs(m, nfss);
+ show_lease(m, nfss);
+ }
+#endif
+
+ /*
+ * Display security flavor in effect for this mount
+ */
+ seq_printf(m, "\n\tsec:\tflavor=%u", auth->au_ops->au_flavor);
+ if (auth->au_flavor)
+ seq_printf(m, ",pseudoflavor=%u", auth->au_flavor);
+
+ /*
+ * Display superblock I/O counters
+ */
+ for_each_possible_cpu(cpu) {
+ struct nfs_iostats *stats;
+
+ preempt_disable();
+ stats = per_cpu_ptr(nfss->io_stats, cpu);
+
+ /* fold this CPU's counters into the running totals */
+ for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
+ totals.events[i] += stats->events[i];
+ for (i = 0; i < __NFSIOS_BYTESMAX; i++)
+ totals.bytes[i] += stats->bytes[i];
+#ifdef CONFIG_NFS_FSCACHE
+ for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
+ totals.fscache[i] += stats->fscache[i];
+#endif
+
+ preempt_enable();
+ }
+
+ seq_puts(m, "\n\tevents:\t");
+ for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
+ seq_printf(m, "%lu ", totals.events[i]);
+ seq_puts(m, "\n\tbytes:\t");
+ for (i = 0; i < __NFSIOS_BYTESMAX; i++)
+ seq_printf(m, "%Lu ", totals.bytes[i]);
+#ifdef CONFIG_NFS_FSCACHE
+ /* the fsc line is printed only for mounts with the fscache option */
+ if (nfss->options & NFS_OPTION_FSCACHE) {
+ seq_puts(m, "\n\tfsc:\t");
+ for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
+ seq_printf(m, "%Lu ", totals.fscache[i]);
+ }
+#endif
+ seq_putc(m, '\n');
+
+ rpc_clnt_show_stats(m, nfss->client);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_show_stats);
+
+/*
+ * Begin a forced unmount: kill all RPC tasks on the server's ACL client
+ * and on its main client, skipping either one if the pointer is an
+ * error value.
+ *
+ * NOTE(review): the previous header spoke of removing automounted
+ * mountpoints; nothing in this body does that - confirm whether that
+ * happens elsewhere.
+ */
+void nfs_umount_begin(struct super_block *sb)
+{
+ struct nfs_server *server;
+ struct rpc_clnt *rpc;
+
+ server = NFS_SB(sb);
+ /* -EIO all pending I/O */
+ rpc = server->client_acl;
+ if (!IS_ERR(rpc))
+ rpc_killall_tasks(rpc);
+ rpc = server->client;
+ if (!IS_ERR(rpc))
+ rpc_killall_tasks(rpc);
+}
+EXPORT_SYMBOL_GPL(nfs_umount_begin);
+
+/*
+ * Test whether 'match' is acceptable under auth_info.  An empty flavor
+ * list accepts every flavor; a non-empty list accepts only its members.
+ */
+bool nfs_auth_info_match(const struct nfs_auth_info *auth_info,
+			 rpc_authflavor_t match)
+{
+	bool found = !auth_info->flavor_len;
+	int idx = 0;
+
+	/* 'found' starts out true only for an empty list (loop is skipped) */
+	while (!found && idx < auth_info->flavor_len) {
+		found = auth_info->flavors[idx] == match;
+		idx++;
+	}
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(nfs_auth_info_match);
+
+/*
+ * Check the flavors requested via ctx->auth_info against the list the
+ * server returned.  On success ctx->selected_flavor is set and 0 is
+ * returned; if nothing is usable the result is -EACCES.
+ */
+static int nfs_verify_authflavors(struct nfs_fs_context *ctx,
+				  rpc_authflavor_t *server_authlist,
+				  unsigned int count)
+{
+	rpc_authflavor_t chosen = RPC_AUTH_MAXFLAVOR;
+	bool server_has_null = false;
+	bool matched = false;
+	unsigned int n;
+
+	/*
+	 * If the sec= mount option is used, the specified flavor or AUTH_NULL
+	 * must be in the list returned by the server.
+	 *
+	 * AUTH_NULL has a special meaning when it's in the server list - it
+	 * means that the server will ignore the rpc creds, so any flavor
+	 * can be used but still use the sec= that was specified.
+	 *
+	 * Note also that the MNT procedure in MNTv1 does not return a list
+	 * of supported security flavors. In this case, nfs_mount() fabricates
+	 * a security flavor list containing just AUTH_NULL.
+	 */
+	for (n = 0; n < count && !matched; n++) {
+		chosen = server_authlist[n];
+
+		if (nfs_auth_info_match(&ctx->auth_info, chosen))
+			matched = true;
+		else if (chosen == RPC_AUTH_NULL)
+			server_has_null = true;
+	}
+
+	if (!matched) {
+		if (!server_has_null) {
+			dfprintk(MOUNT,
+				 "NFS: specified auth flavors not supported by server\n");
+			return -EACCES;
+		}
+		/* No direct match, but AUTH_NULL was listed: use the first requested flavor */
+		chosen = ctx->auth_info.flavors[0];
+	}
+
+	ctx->selected_flavor = chosen;
+	dfprintk(MOUNT, "NFS: using auth flavor %u\n", ctx->selected_flavor);
+	return 0;
+}
+
+/*
+ * Ask the remote server's MOUNT service for the NFS file handle that
+ * corresponds to the export path held in the fs_context.  root_fh and the
+ * server_authlist buffer (with its length) are handed to nfs_mount()
+ * through the request structure.  Returns nfs_mount()'s status.
+ */
+static int nfs_request_mount(struct fs_context *fc,
+			     struct nfs_fh *root_fh,
+			     rpc_authflavor_t *server_authlist,
+			     unsigned int *server_authlist_len)
+{
+	struct nfs_fs_context *ctx = nfs_fc2context(fc);
+	struct nfs_mount_request request = {
+		.sap		= (struct sockaddr *)
+					&ctx->mount_server.address,
+		.dirpath	= ctx->nfs_server.export_path,
+		.protocol	= ctx->mount_server.protocol,
+		.fh		= root_fh,
+		.noresvport	= ctx->flags & NFS_MOUNT_NORESVPORT,
+		.auth_flav_len	= server_authlist_len,
+		.auth_flavs	= server_authlist,
+		.net		= fc->net_ns,
+	};
+	int err;
+
+	/*
+	 * No MOUNT protocol version was chosen yet: derive it from the NFS
+	 * version (NFS_MNT_VERSION for version 2, NFS_MNT3_VERSION otherwise).
+	 */
+	if (ctx->mount_server.version == 0)
+		ctx->mount_server.version = ctx->version == 2 ?
+				NFS_MNT_VERSION : NFS_MNT3_VERSION;
+	request.version = ctx->mount_server.version;
+
+	/* Prefer the mount server's own hostname when one was given. */
+	request.hostname = ctx->mount_server.hostname ?
+			ctx->mount_server.hostname : ctx->nfs_server.hostname;
+
+	/*
+	 * Construct the mount server's address: with no explicit mount
+	 * server address, reuse the NFS server's address and length.
+	 */
+	if (ctx->mount_server.address.sa_family == AF_UNSPEC) {
+		memcpy(request.sap, &ctx->nfs_server.address,
+		       ctx->nfs_server.addrlen);
+		ctx->mount_server.addrlen = ctx->nfs_server.addrlen;
+	}
+	request.salen = ctx->mount_server.addrlen;
+	nfs_set_port(request.sap, &ctx->mount_server.port, 0);
+
+	/*
+	 * Now ask the mount server to map our export path
+	 * to a file handle.
+	 */
+	err = nfs_mount(&request);
+	if (err) {
+		dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
+			 request.hostname, err);
+	}
+
+	return err;
+}
+
+/*
+ * Obtain the root file handle through the MOUNT protocol, select a
+ * security flavor and create the nfs_server for this mount.
+ *
+ * With a sec= option, the requested flavors are checked against the
+ * server's list and the server is created with the selected one.  Without
+ * it, the flavors offered by the server are tried in order (never
+ * AUTH_NULL itself), with AUTH_UNIX as a last resort when the server
+ * listed AUTH_NULL and AUTH_UNIX has not been tried yet.
+ *
+ * Returns the new nfs_server, or an ERR_PTR() on failure.
+ */
+static struct nfs_server *nfs_try_mount_request(struct fs_context *fc)
+{
+	struct nfs_fs_context *ctx = nfs_fc2context(fc);
+	int status;
+	unsigned int i;
+	bool tried_auth_unix = false;
+	bool auth_null_in_list = false;
+	struct nfs_server *server = ERR_PTR(-EACCES);
+	rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS];
+	unsigned int authlist_len = ARRAY_SIZE(authlist);
+
+	status = nfs_request_mount(fc, ctx->mntfh, authlist, &authlist_len);
+	if (status)
+		return ERR_PTR(status);
+
+	/*
+	 * Was a sec= authflavor specified in the options? First, verify
+	 * whether the server supports it, and then just try to use it if so.
+	 *
+	 * nfs_verify_authflavors() already logs either the flavor it picked
+	 * or the failure, so nothing is logged here: ctx->selected_flavor is
+	 * only meaningful once verification has succeeded.
+	 */
+	if (ctx->auth_info.flavor_len > 0) {
+		status = nfs_verify_authflavors(ctx, authlist, authlist_len);
+		if (status)
+			return ERR_PTR(status);
+		return ctx->nfs_mod->rpc_ops->create_server(fc);
+	}
+
+	/*
+	 * No sec= option was provided. RFC 2623, section 2.7 suggests we
+	 * SHOULD prefer the flavor listed first. However, some servers list
+	 * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
+	 */
+	for (i = 0; i < authlist_len; ++i) {
+		rpc_authflavor_t flavor;
+		struct rpcsec_gss_info info;
+
+		flavor = authlist[i];
+		switch (flavor) {
+		case RPC_AUTH_UNIX:
+			tried_auth_unix = true;
+			break;
+		case RPC_AUTH_NULL:
+			auth_null_in_list = true;
+			continue;
+		default:
+			/* Skip flavors rpcauth_get_gssinfo() cannot resolve */
+			if (rpcauth_get_gssinfo(flavor, &info) != 0)
+				continue;
+			break;
+		}
+		dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
+		ctx->selected_flavor = flavor;
+		server = ctx->nfs_mod->rpc_ops->create_server(fc);
+		if (!IS_ERR(server))
+			return server;
+	}
+
+	/*
+	 * Nothing we tried so far worked. At this point, give up if we've
+	 * already tried AUTH_UNIX or if the server's list doesn't contain
+	 * AUTH_NULL.  'server' still holds the last error (or the initial
+	 * -EACCES if no flavor was attempted at all).
+	 */
+	if (tried_auth_unix || !auth_null_in_list)
+		return server;
+
+	/* Last chance! Try AUTH_UNIX */
+	dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
+	ctx->selected_flavor = RPC_AUTH_UNIX;
+	return ctx->nfs_mod->rpc_ops->create_server(fc);
+}
+
+/*
+ * Create the nfs_server for this mount - through the MOUNT protocol when
+ * ctx->need_mount is set, directly otherwise - and then build the tree.
+ * The result (possibly an ERR_PTR) is stored in ctx->server for
+ * nfs_get_tree_common() to pick up.
+ */
+int nfs_try_get_tree(struct fs_context *fc)
+{
+	struct nfs_fs_context *ctx = nfs_fc2context(fc);
+
+	ctx->server = ctx->need_mount ?
+			nfs_try_mount_request(fc) :
+			ctx->nfs_mod->rpc_ops->create_server(fc);
+
+	return nfs_get_tree_common(fc);
+}
+EXPORT_SYMBOL_GPL(nfs_try_get_tree);
+
+
+/*
+ * Mount flags that are left out when the flags of an existing mount are
+ * compared with newly requested ones.  NFS_REMOUNT_CMP_FLAGMASK is applied
+ * by nfs_compare_remount_data(); NFS_MOUNT_CMP_FLAGMASK, which additionally
+ * leaves out NFS_MOUNT_UNSHARED and NFS_MOUNT_NORESVPORT, is applied by
+ * nfs_compare_mount_options().
+ */
+#define NFS_REMOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
+ | NFS_MOUNT_SECURE \
+ | NFS_MOUNT_TCP \
+ | NFS_MOUNT_VER3 \
+ | NFS_MOUNT_KERBEROS \
+ | NFS_MOUNT_NONLM \
+ | NFS_MOUNT_BROKEN_SUID \
+ | NFS_MOUNT_STRICTLOCK \
+ | NFS_MOUNT_LEGACY_INTERFACE)
+
+#define NFS_MOUNT_CMP_FLAGMASK (NFS_REMOUNT_CMP_FLAGMASK & \
+ ~(NFS_MOUNT_UNSHARED | NFS_MOUNT_NORESVPORT))
+
+/*
+ * Compare the options of a remount request (ctx) with the settings the
+ * mounted server (nfss) is using.  Returns 0 if they all agree and
+ * -EINVAL as soon as one of them differs.
+ */
+static int
+nfs_compare_remount_data(struct nfs_server *nfss,
+			 struct nfs_fs_context *ctx)
+{
+	struct nfs_client *clp = nfss->nfs_client;
+	struct rpc_clnt *rpc = nfss->client;
+
+	/* Mount flags, transfer sizes, protocol version, retries, security */
+	if ((ctx->flags ^ nfss->flags) & NFS_REMOUNT_CMP_FLAGMASK ||
+	    ctx->rsize != nfss->rsize ||
+	    ctx->wsize != nfss->wsize ||
+	    ctx->version != clp->rpc_ops->version ||
+	    ctx->minorversion != clp->cl_minorversion ||
+	    ctx->retrans != rpc->cl_timeout->to_retries ||
+	    !nfs_auth_info_match(&ctx->auth_info, rpc->cl_auth->au_flavor))
+		return -EINVAL;
+
+	/* Timeouts: the server-side values are divided by HZ to compare */
+	if (ctx->acregmin != nfss->acregmin / HZ ||
+	    ctx->acregmax != nfss->acregmax / HZ ||
+	    ctx->acdirmin != nfss->acdirmin / HZ ||
+	    ctx->acdirmax != nfss->acdirmax / HZ ||
+	    ctx->timeo != (10U * rpc->cl_timeout->to_initval / HZ))
+		return -EINVAL;
+
+	/* fscache option, then the server's port and address */
+	if ((ctx->options & NFS_OPTION_FSCACHE) != (nfss->options & NFS_OPTION_FSCACHE) ||
+	    ctx->nfs_server.port != nfss->port ||
+	    ctx->nfs_server.addrlen != clp->cl_addrlen ||
+	    !rpc_cmp_addr((struct sockaddr *)&ctx->nfs_server.address,
+			  (struct sockaddr *)&clp->cl_addr))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Handle a remount request: sync the filesystem, then only verify that
+ * the requested options match the ones in use.  Returns 0 on success or
+ * the -EINVAL produced by nfs_compare_remount_data().
+ */
+int nfs_reconfigure(struct fs_context *fc)
+{
+	struct super_block *sb = fc->root->d_sb;
+	struct nfs_fs_context *ctx = nfs_fc2context(fc);
+
+	sync_filesystem(sb);
+
+	/*
+	 * Userspace mount programs that send binary options generally send
+	 * them populated with default values. We have no way to know which
+	 * ones were explicitly specified. Fall back to legacy behavior and
+	 * just return success.
+	 */
+	if (ctx->skip_reconfig_option_check)
+		return 0;
+
+	/*
+	 * noac is a special case. It implies -o sync, but that's not
+	 * necessarily reflected in the mtab options. reconfigure_super
+	 * will clear SB_SYNCHRONOUS if -o sync wasn't specified in the
+	 * remount options, so we have to explicitly reset it.
+	 */
+	if (ctx->flags & NFS_MOUNT_NOAC) {
+		fc->sb_flags_mask |= SB_SYNCHRONOUS;
+		fc->sb_flags |= SB_SYNCHRONOUS;
+	}
+
+	/* compare new mount options with old ones */
+	return nfs_compare_remount_data(sb->s_fs_info, ctx);
+}
+EXPORT_SYMBOL_GPL(nfs_reconfigure);
+
+/*
+ * Finish setting up an NFS superblock: operations, block size,
+ * per-version timestamp limits and flags, id string and maximum file size.
+ * ctx may be NULL, in which case no explicit block size is applied.
+ */
+static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx)
+{
+	struct nfs_server *server = NFS_SB(sb);
+	struct nfs_client *clp = server->nfs_client;
+
+	sb->s_magic = NFS_SUPER_MAGIC;
+	sb->s_op = clp->cl_nfs_mod->sops;
+	sb->s_xattr = clp->cl_nfs_mod->xattr;
+
+	/* Block size: ctx->bsize if given, else derived from the wsize */
+	sb->s_blocksize = 0;
+	sb->s_blocksize_bits = 0;
+	if (ctx && ctx->bsize)
+		sb->s_blocksize = nfs_block_size(ctx->bsize, &sb->s_blocksize_bits);
+	if (sb->s_blocksize == 0)
+		sb->s_blocksize = nfs_block_bits(server->wsize,
+						 &sb->s_blocksize_bits);
+
+	switch (clp->rpc_ops->version) {
+	case 2:
+		sb->s_time_gran = 1000;
+		sb->s_time_min = 0;
+		sb->s_time_max = U32_MAX;
+		break;
+	case 3:
+		/*
+		 * The VFS shouldn't apply the umask to mode bits.
+		 * We will do so ourselves when necessary.
+		 */
+		sb->s_flags |= SB_POSIXACL;
+		sb->s_export_op = &nfs_export_ops;
+		sb->s_time_gran = 1;
+		sb->s_time_min = 0;
+		sb->s_time_max = U32_MAX;
+		break;
+	case 4:
+		sb->s_flags |= SB_POSIXACL;
+		/* Export operations only with NFS_CAP_ATOMIC_OPEN_V1 */
+		if (server->caps & NFS_CAP_ATOMIC_OPEN_V1)
+			sb->s_export_op = &nfs_export_ops;
+		sb->s_time_gran = 1;
+		sb->s_time_min = S64_MIN;
+		sb->s_time_max = S64_MAX;
+		break;
+	}
+
+	/* We probably want something more informative here */
+	snprintf(sb->s_id, sizeof(sb->s_id),
+		 "%u:%u", MAJOR(sb->s_dev), MINOR(sb->s_dev));
+
+	nfs_super_set_maxbytes(sb, server->maxfilesize);
+}
+
+/*
+ * Return 1 when the existing superblock s was set up with the same mount
+ * options as the new server b / context fc, and 0 when they differ.
+ */
+static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b,
+				     const struct fs_context *fc)
+{
+	const struct nfs_server *a = s->s_fs_info;
+
+	if ((s->s_flags & NFS_SB_MASK) != (fc->sb_flags & NFS_SB_MASK))
+		return 0;
+	if (a->nfs_client != b->nfs_client)
+		return 0;
+	if ((a->flags ^ b->flags) & NFS_MOUNT_CMP_FLAGMASK)
+		return 0;
+
+	/* transfer sizes */
+	if (a->wsize != b->wsize || a->rsize != b->rsize)
+		return 0;
+
+	/* ac{reg,dir}{min,max} settings */
+	if (a->acregmin != b->acregmin || a->acregmax != b->acregmax ||
+	    a->acdirmin != b->acdirmin || a->acdirmax != b->acdirmax)
+		return 0;
+
+	/* RPC authentication flavor */
+	if (a->client->cl_auth->au_flavor != b->client->cl_auth->au_flavor)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Initialise a new superblock on behalf of sget_fc(): install the dentry
+ * operations, call set_anon_super() and, if that succeeds, record the
+ * resulting s->s_dev in the nfs_server.  Returns set_anon_super()'s result.
+ */
+static int nfs_set_super(struct super_block *s, struct fs_context *fc)
+{
+	struct nfs_server *nfss = fc->s_fs_info;
+	int err;
+
+	s->s_d_op = nfss->nfs_client->rpc_ops->dentry_ops;
+
+	err = set_anon_super(s, nfss);
+	if (err)
+		return err;
+
+	nfss->s_dev = s->s_dev;
+	return 0;
+}
+
+/*
+ * Return 1 if the transports of both servers belong to the same network
+ * namespace and their nfs_clients have the same IPv4 or IPv6 address and
+ * port.  Return 0 otherwise, including for any other address family.
+ */
+static int nfs_compare_super_address(struct nfs_server *server1,
+				     struct nfs_server *server2)
+{
+	struct sockaddr *sa1 = (struct sockaddr *)&server1->nfs_client->cl_addr;
+	struct sockaddr *sa2 = (struct sockaddr *)&server2->nfs_client->cl_addr;
+
+	if (!net_eq(server1->client->cl_xprt->xprt_net,
+		    server2->client->cl_xprt->xprt_net))
+		return 0;
+
+	if (sa1->sa_family != sa2->sa_family)
+		return 0;
+
+	if (sa1->sa_family == AF_INET) {
+		struct sockaddr_in *in1 = (struct sockaddr_in *)sa1;
+		struct sockaddr_in *in2 = (struct sockaddr_in *)sa2;
+
+		return in1->sin_addr.s_addr == in2->sin_addr.s_addr &&
+		       in1->sin_port == in2->sin_port;
+	}
+
+	if (sa1->sa_family == AF_INET6) {
+		struct sockaddr_in6 *in1 = (struct sockaddr_in6 *)sa1;
+		struct sockaddr_in6 *in2 = (struct sockaddr_in6 *)sa2;
+
+		return ipv6_addr_equal(&in1->sin6_addr, &in2->sin6_addr) &&
+		       in1->sin6_port == in2->sin6_port;
+	}
+
+	/* Unknown address family: never treated as equal */
+	return 0;
+}
+
+/*
+ * Return 1 if both servers use the same user namespace, 0 otherwise.  The
+ * namespace is taken from the RPC client's cl_cred; a server without a
+ * client or without a cl_cred counts as using init_user_ns.
+ */
+static int nfs_compare_userns(const struct nfs_server *old,
+			      const struct nfs_server *new)
+{
+	const struct user_namespace *oldns, *newns;
+
+	if (old->client && old->client->cl_cred)
+		oldns = old->client->cl_cred->user_ns;
+	else
+		oldns = &init_user_ns;
+
+	if (new->client && new->client->cl_cred)
+		newns = new->client->cl_cred->user_ns;
+	else
+		newns = &init_user_ns;
+
+	return oldns == newns;
+}
+
+/*
+ * Comparison callback handed to sget_fc(): return nonzero if the existing
+ * superblock sb may be shared with the server carried in fc->s_fs_info.
+ * Address, fsid, user namespace and mount options must all match, and the
+ * existing mount must not be flagged NFS_MOUNT_UNSHARED.
+ */
+static int nfs_compare_super(struct super_block *sb, struct fs_context *fc)
+{
+	struct nfs_server *old = NFS_SB(sb);
+	struct nfs_server *new = fc->s_fs_info;
+
+	/* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */
+	if (!nfs_compare_super_address(old, new) ||
+	    (old->flags & NFS_MOUNT_UNSHARED) ||
+	    memcmp(&old->fsid, &new->fsid, sizeof(old->fsid)) != 0 ||
+	    !nfs_compare_userns(old, new))
+		return 0;
+
+	return nfs_compare_mount_options(sb, new, fc);
+}
+
+#ifdef CONFIG_NFS_FSCACHE
+/*
+ * Reset the superblock's fscache state and, if caching is enabled, request
+ * a cache cookie for it.  For a clone (ctx->clone_data.sb set) both the
+ * FSCACHE option and the uniquifier come from the superblock being cloned;
+ * otherwise they come from the mount context.  Nothing beyond the reset
+ * happens when ctx is NULL or caching is not enabled.
+ */
+static void nfs_get_cache_cookie(struct super_block *sb,
+				 struct nfs_fs_context *ctx)
+{
+	struct nfs_server *nfss = NFS_SB(sb);
+	struct nfs_server *parent;
+	char *uniq = NULL;
+	int ulen = 0;
+
+	nfss->fscache_key = NULL;
+	nfss->fscache = NULL;
+
+	if (!ctx)
+		return;
+
+	if (!ctx->clone_data.sb) {
+		if (!(ctx->options & NFS_OPTION_FSCACHE))
+			return;
+		if (ctx->fscache_uniq) {
+			uniq = ctx->fscache_uniq;
+			ulen = strlen(ctx->fscache_uniq);
+		}
+	} else {
+		parent = NFS_SB(ctx->clone_data.sb);
+		if (!(parent->options & NFS_OPTION_FSCACHE))
+			return;
+		if (parent->fscache_key) {
+			uniq = parent->fscache_key->key.uniquifier;
+			ulen = parent->fscache_key->key.uniq_len;
+		}
+	}
+
+	nfs_fscache_get_super_cookie(sb, uniq, ulen);
+}
+#else
+/* Without CONFIG_NFS_FSCACHE there is no cache cookie to set up. */
+static void nfs_get_cache_cookie(struct super_block *sb,
+				 struct nfs_fs_context *ctx)
+{
+}
+#endif
+
+/*
+ * Get (or share) the superblock for the nfs_server held in ctx->server and
+ * obtain its root through nfs_get_root().  ctx->server is consumed: the
+ * pointer is cleared and, if it held an error, that error is returned.
+ * Returns 0 on success or a negative error code.
+ */
+int nfs_get_tree_common(struct fs_context *fc)
+{
+ struct nfs_fs_context *ctx = nfs_fc2context(fc);
+ struct super_block *s;
+ int (*compare_super)(struct super_block *, struct fs_context *) = nfs_compare_super;
+ struct nfs_server *server = ctx->server;
+ int error;
+
+ /* Take the server over from the context; it may be an ERR_PTR */
+ ctx->server = NULL;
+ if (IS_ERR(server))
+ return PTR_ERR(server);
+
+ /* Unshared mounts pass no comparison callback to sget_fc() */
+ if (server->flags & NFS_MOUNT_UNSHARED)
+ compare_super = NULL;
+
+ /* -o noac implies -o sync */
+ if (server->flags & NFS_MOUNT_NOAC)
+ fc->sb_flags |= SB_SYNCHRONOUS;
+
+ /* A clone inherits SB_SYNCHRONOUS from the superblock it is cloned from */
+ if (ctx->clone_data.sb)
+ if (ctx->clone_data.sb->s_flags & SB_SYNCHRONOUS)
+ fc->sb_flags |= SB_SYNCHRONOUS;
+
+ if (server->caps & NFS_CAP_SECURITY_LABEL)
+ fc->lsm_flags |= SECURITY_LSM_NATIVE_LABELS;
+
+ /* Get a superblock - note that we may end up sharing one that already exists */
+ fc->s_fs_info = server;
+ s = sget_fc(fc, compare_super, nfs_set_super);
+ fc->s_fs_info = NULL;
+ if (IS_ERR(s)) {
+ error = PTR_ERR(s);
+ nfs_errorf(fc, "NFS: Couldn't get superblock");
+ goto out_err_nosb;
+ }
+
+ /*
+ * If the superblock belongs to a different nfs_server, ours is not
+ * needed and is freed here; otherwise finish binding it to the
+ * superblock (bdi name, io_pages, back pointer).
+ */
+ if (s->s_fs_info != server) {
+ nfs_free_server(server);
+ server = NULL;
+ } else {
+ error = super_setup_bdi_name(s, "%u:%u", MAJOR(server->s_dev),
+ MINOR(server->s_dev));
+ if (error)
+ goto error_splat_super;
+ s->s_bdi->io_pages = server->rpages;
+ server->super = s;
+ }
+
+ if (!s->s_root) {
+ unsigned bsize = ctx->clone_data.inherited_bsize;
+ /* initial superblock/root creation */
+ nfs_fill_super(s, ctx);
+ /* An inherited block size (given as a shift count) overrides nfs_fill_super()'s choice */
+ if (bsize) {
+ s->s_blocksize_bits = bsize;
+ s->s_blocksize = 1U << bsize;
+ }
+ nfs_get_cache_cookie(s, ctx);
+ }
+
+ error = nfs_get_root(s, fc);
+ if (error < 0) {
+ nfs_errorf(fc, "NFS: Couldn't get root dentry");
+ goto error_splat_super;
+ }
+
+ s->s_flags |= SB_ACTIVE;
+ error = 0;
+
+out:
+ return error;
+
+ /* No superblock was obtained: the server is still ours to free */
+out_err_nosb:
+ nfs_free_server(server);
+ goto out;
+ /* A superblock exists: drop it instead of freeing the server directly */
+error_splat_super:
+ deactivate_locked_super(s);
+ goto out;
+}
+
+/*
+ * Destroy an NFS2/3 superblock
+ *
+ * Shuts the superblock down, releases its fscache cookie, frees the
+ * nfs_server attached to it and finally gives back the anonymous device
+ * number.  s->s_dev is sampled up front and released last.
+ */
+void nfs_kill_super(struct super_block *s)
+{
+ struct nfs_server *server = NFS_SB(s);
+ dev_t dev = s->s_dev;
+
+ generic_shutdown_super(s);
+
+ nfs_fscache_release_super_cookie(s);
+
+ nfs_free_server(server);
+ free_anon_bdev(dev);
+}
+EXPORT_SYMBOL_GPL(nfs_kill_super);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+
+/*
+ * NFS v4 module parameters need to stay in the
+ * NFS client for backwards compatibility
+ */
+/* Callback TCP port; exposed as module parameter "callback_tcpport" */
+unsigned int nfs_callback_set_tcpport;
+/* Number of threads that will be assigned to the NFSv4 callback channels */
+unsigned short nfs_callback_nr_threads;
+/* Default cache timeout is 10 minutes */
+unsigned int nfs_idmap_cache_timeout = 600;
+/* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */
+bool nfs4_disable_idmapping = true;
+/* Maximum number of outstanding NFSv4.1 requests the client will negotiate */
+unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE;
+/* Maximum number of parallel NFSv4.1 callbacks processed for a given server */
+unsigned short max_session_cb_slots = NFS4_DEF_CB_SLOT_TABLE_SIZE;
+/* Send implementation ID with NFSv4.1 exchange_id */
+unsigned short send_implementation_id = 1;
+/* nfs_client_id4 uniquifier string; exposed as "nfs4_unique_id" */
+char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
+/* If the server reports a lock might be lost, try to recover it anyway */
+bool recover_lost_locks = false;
+
+EXPORT_SYMBOL_GPL(nfs_callback_nr_threads);
+EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
+EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout);
+EXPORT_SYMBOL_GPL(nfs4_disable_idmapping);
+EXPORT_SYMBOL_GPL(max_session_slots);
+EXPORT_SYMBOL_GPL(max_session_cb_slots);
+EXPORT_SYMBOL_GPL(send_implementation_id);
+EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier);
+EXPORT_SYMBOL_GPL(recover_lost_locks);
+
+#define NFS_CALLBACK_MAXPORTNR (65535U)
+
+/*
+ * Setter for "portnr" module parameters: parse val as an unsigned number
+ * (base chosen by kstrtoul() from the prefix) and store it in the unsigned
+ * int behind kp->arg.  A NULL string, a parse failure or a value above
+ * NFS_CALLBACK_MAXPORTNR yields -EINVAL and leaves the target untouched.
+ */
+static int param_set_portnr(const char *val, const struct kernel_param *kp)
+{
+	unsigned int *portp = kp->arg;
+	unsigned long port;
+
+	if (!val || kstrtoul(val, 0, &port) != 0)
+		return -EINVAL;
+	if (port > NFS_CALLBACK_MAXPORTNR)
+		return -EINVAL;
+
+	*portp = port;
+	return 0;
+}
+static const struct kernel_param_ops param_ops_portnr = {
+	.set = param_set_portnr,
+	.get = param_get_uint,
+};
+#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
+
+/*
+ * Module parameter registrations for the NFSv4 variables defined above.
+ * callback_tcpport uses the custom "portnr" type, so writes go through
+ * param_set_portnr().
+ */
+module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
+module_param_named(callback_nr_threads, nfs_callback_nr_threads, ushort, 0644);
+MODULE_PARM_DESC(callback_nr_threads, "Number of threads that will be "
+ "assigned to the NFSv4 callback channels.");
+/* NOTE(review): the variable is declared unsigned int but registered as int - confirm this is intended */
+module_param(nfs_idmap_cache_timeout, int, 0644);
+module_param(nfs4_disable_idmapping, bool, 0644);
+module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier,
+ NFS4_CLIENT_ID_UNIQ_LEN, 0600);
+MODULE_PARM_DESC(nfs4_disable_idmapping,
+ "Turn off NFSv4 idmapping when using 'sec=sys'");
+module_param(max_session_slots, ushort, 0644);
+MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
+ "requests the client will negotiate");
+module_param(max_session_cb_slots, ushort, 0644);
+MODULE_PARM_DESC(max_session_cb_slots, "Maximum number of parallel NFSv4.1 "
+ "callbacks the client will process for a given server");
+module_param(send_implementation_id, ushort, 0644);
+MODULE_PARM_DESC(send_implementation_id,
+ "Send implementation ID with NFSv4.1 exchange_id");
+/* Description for the nfs4_unique_id string parameter registered above */
+MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string");
+
+module_param(recover_lost_locks, bool, 0644);
+MODULE_PARM_DESC(recover_lost_locks,
+ "If the server reports that a lock might be lost, "
+ "try to recover it risking data corruption.");
+
+
+#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
new file mode 100644
index 000000000..25ba299fd
--- /dev/null
+++ b/fs/nfs/symlink.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/symlink.c
+ *
+ * Copyright (C) 1992 Rick Sladkey
+ *
+ * Optimization changes Copyright (C) 1994 Florian La Roche
+ *
+ * Jun 7 1999, cache symlink lookups in the page cache. -DaveM
+ *
+ * nfs symlink handling code
+ */
+
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs2.h>
+#include <linux/nfs_fs.h>
+#include <linux/pagemap.h>
+#include <linux/stat.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+
+/* Symlink caching in the page cache is even more simplistic
+ * and straight-forward than readdir caching.
+ */
+
+/*
+ * Page cache filler for symlinks: read the link target into 'page' with
+ * the protocol's readlink operation.  'data' is the symlink's inode.
+ * The page is marked up to date on success or in error on failure and is
+ * unlocked either way.  Returns 0, or -EIO for any readlink failure.
+ */
+static int nfs_symlink_filler(void *data, struct page *page)
+{
+	struct inode *inode = data;
+	bool failed = NFS_PROTO(inode)->readlink(inode, page, 0, PAGE_SIZE) < 0;
+
+	if (failed)
+		SetPageError(page);
+	else
+		SetPageUptodate(page);
+	unlock_page(page);
+
+	return failed ? -EIO : 0;
+}
+
+/*
+ * ->get_link() for NFS symlinks: return the link target, which is kept in
+ * page 0 of the inode's page cache.
+ *
+ * With a dentry, the mapping is revalidated and the page is obtained with
+ * read_cache_page(), using nfs_symlink_filler() as the filler.  Without a
+ * dentry, the mapping is revalidated with nfs_revalidate_mapping_rcu() and
+ * only a page that is already cached and up to date is used; otherwise
+ * -ECHILD is returned.
+ *
+ * On success the page reference is handed to 'done' (page_put_link).
+ * Errors are returned as ERR_PTR() values.
+ */
+static const char *nfs_get_link(struct dentry *dentry,
+				struct inode *inode,
+				struct delayed_call *done)
+{
+	struct page *page;
+	int status;
+
+	if (dentry) {
+		status = nfs_revalidate_mapping(inode, inode->i_mapping);
+		if (status)
+			return ERR_PTR(status);
+
+		page = read_cache_page(&inode->i_data, 0, nfs_symlink_filler,
+				       inode);
+		if (IS_ERR(page))
+			return ERR_CAST(page);
+	} else {
+		status = nfs_revalidate_mapping_rcu(inode);
+		if (status)
+			return ERR_PTR(status);
+
+		page = find_get_page(inode->i_mapping, 0);
+		if (!page)
+			return ERR_PTR(-ECHILD);
+		if (!PageUptodate(page)) {
+			/* Cached but not readable yet: drop our reference */
+			put_page(page);
+			return ERR_PTR(-ECHILD);
+		}
+	}
+
+	set_delayed_call(done, page_put_link, page);
+	return page_address(page);
+}
+
+/*
+ * Inode operations for NFS symlinks: the link target is resolved by
+ * nfs_get_link(); attribute get/set use the common nfs_getattr() and
+ * nfs_setattr().
+ */
+const struct inode_operations nfs_symlink_inode_operations = {
+ .get_link = nfs_get_link,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+};
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
new file mode 100644
index 000000000..7aea195dd
--- /dev/null
+++ b/fs/nfs/sysctl.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/sysctl.c
+ *
+ * Sysctl interface to NFS parameters
+ */
+#include <linux/types.h>
+#include <linux/linkage.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+#include <linux/nfs_fs.h>
+
+/* Handle returned by register_sysctl_table(); NULL while not registered */
+static struct ctl_table_header *nfs_callback_sysctl_table;
+
+/* The NFS sysctl entries themselves */
+static struct ctl_table nfs_cb_sysctls[] = {
+ {
+ /* Backed by nfs_mountpoint_expiry_timeout; handled by proc_dointvec_jiffies */
+ .procname = "nfs_mountpoint_timeout",
+ .data = &nfs_mountpoint_expiry_timeout,
+ .maxlen = sizeof(nfs_mountpoint_expiry_timeout),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ /* Backed by nfs_congestion_kb; plain integer handler */
+ .procname = "nfs_congestion_kb",
+ .data = &nfs_congestion_kb,
+ .maxlen = sizeof(nfs_congestion_kb),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+/* "nfs" directory holding the entries above */
+static struct ctl_table nfs_cb_sysctl_dir[] = {
+ {
+ .procname = "nfs",
+ .mode = 0555,
+ .child = nfs_cb_sysctls,
+ },
+ { }
+};
+
+/* "fs" directory holding the "nfs" directory; this is the table that gets registered */
+static struct ctl_table nfs_cb_sysctl_root[] = {
+ {
+ .procname = "fs",
+ .mode = 0555,
+ .child = nfs_cb_sysctl_dir,
+ },
+ { }
+};
+
+/*
+ * Register the fs/nfs sysctl tree and remember its header.
+ * Returns 0 on success, -ENOMEM if registration returned NULL.
+ */
+int nfs_register_sysctl(void)
+{
+	nfs_callback_sysctl_table = register_sysctl_table(nfs_cb_sysctl_root);
+
+	return nfs_callback_sysctl_table ? 0 : -ENOMEM;
+}
+
+/*
+ * Unregister the sysctl tree registered by nfs_register_sysctl() and
+ * clear the saved header.
+ */
+void nfs_unregister_sysctl(void)
+{
+ unregister_sysctl_table(nfs_callback_sysctl_table);
+ nfs_callback_sysctl_table = NULL;
+}
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
new file mode 100644
index 000000000..f7f778e3e
--- /dev/null
+++ b/fs/nfs/sysfs.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Hammerspace Inc
+ */
+
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/nfs_fs.h>
+#include <linux/rcupdate.h>
+
+#include "nfs4_fs.h"
+#include "netns.h"
+#include "sysfs.h"
+
+struct kobject *nfs_client_kobj;
+static struct kset *nfs_kset;
+
+/* kobj_type release hook: the kobject was allocated on its own, so just free it */
+static void nfs_netns_object_release(struct kobject *kobj)
+{
+ kfree(kobj);
+}
+
+/* Children of this object are tagged with the network namespace type */
+static const struct kobj_ns_type_operations *nfs_netns_object_child_ns_type(
+ struct kobject *kobj)
+{
+ return &net_ns_type_operations;
+}
+
+/* kobj_type for the bare kobjects created by nfs_netns_object_alloc() */
+static struct kobj_type nfs_netns_object_type = {
+ .release = nfs_netns_object_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .child_ns_type = nfs_netns_object_child_ns_type,
+};
+
+/*
+ * Allocate a kobject of type nfs_netns_object_type named 'name', make it a
+ * member of 'kset' and add it below 'parent'.  Returns the kobject, or NULL
+ * if allocation or registration failed.
+ */
+static struct kobject *nfs_netns_object_alloc(const char *name,
+		struct kset *kset, struct kobject *parent)
+{
+	struct kobject *kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
+
+	if (!kobj)
+		return NULL;
+
+	kobj->kset = kset;
+	if (kobject_init_and_add(kobj, &nfs_netns_object_type,
+				 parent, "%s", name) != 0) {
+		/* Drop the reference rather than calling kfree() directly */
+		kobject_put(kobj);
+		return NULL;
+	}
+
+	return kobj;
+}
+
+/*
+ * Create the "nfs" kset below fs_kobj and the "net" object inside it.
+ * Returns 0 on success; on failure everything created so far is undone
+ * and -ENOMEM is returned.
+ */
+int nfs_sysfs_init(void)
+{
+	nfs_kset = kset_create_and_add("nfs", NULL, fs_kobj);
+	if (!nfs_kset)
+		return -ENOMEM;
+
+	nfs_client_kobj = nfs_netns_object_alloc("net", nfs_kset, NULL);
+	if (nfs_client_kobj)
+		return 0;
+
+	kset_unregister(nfs_kset);
+	nfs_kset = NULL;
+	return -ENOMEM;
+}
+
+/*
+ * Undo nfs_sysfs_init(): drop the reference on the "net" object first,
+ * then unregister the "nfs" kset that contains it.
+ */
+void nfs_sysfs_exit(void)
+{
+ kobject_put(nfs_client_kobj);
+ kset_unregister(nfs_kset);
+}
+
+/*
+ * sysfs show handler for the "identifier" attribute: print the current
+ * identifier string followed by a newline.  The string is read under
+ * rcu_read_lock() because the store handler may replace it concurrently.
+ * Returns the number of bytes written to buf.
+ */
+static ssize_t nfs_netns_identifier_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	struct nfs_netns_client *clp;
+	ssize_t len;
+
+	clp = container_of(kobj, struct nfs_netns_client, kobject);
+
+	rcu_read_lock();
+	len = scnprintf(buf, PAGE_SIZE, "%s\n", rcu_dereference(clp->identifier));
+	rcu_read_unlock();
+
+	return len;
+}
+
+/*
+ * Return the length of the first 'len' bytes of 'c' once every trailing
+ * '\n' has been dropped.  The buffer itself is not modified.
+ */
+static size_t nfs_string_strip(const char *c, size_t len)
+{
+	for (; len > 0; len--) {
+		if (c[len - 1] != '\n')
+			break;
+	}
+
+	return len;
+}
+
+/*
+ * sysfs store handler for the "identifier" attribute: replace the
+ * identifier with a copy of the written text.  At most CONTAINER_ID_MAXLEN
+ * bytes are considered and trailing newlines are stripped.
+ *
+ * Returns count on success, -ENOMEM if the copy cannot be allocated, and 0
+ * if nothing is left after stripping (the identifier is then unchanged).
+ */
+static ssize_t nfs_netns_identifier_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct nfs_netns_client *c = container_of(kobj,
+ struct nfs_netns_client,
+ kobject);
+ const char *old;
+ char *p;
+ size_t len;
+
+ len = nfs_string_strip(buf, min_t(size_t, count, CONTAINER_ID_MAXLEN));
+ /* NOTE(review): a 0 return reports that nothing was consumed - confirm writers cope with that */
+ if (!len)
+ return 0;
+ p = kmemdup_nul(buf, len, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ /* Publish the new string atomically and collect the previous one */
+ old = rcu_dereference_protected(xchg(&c->identifier, (char __rcu *)p), 1);
+ if (old) {
+ /* Wait for readers inside rcu_read_lock() before freeing the old string */
+ synchronize_rcu();
+ kfree(old);
+ }
+ return count;
+}
+
+/* kobj_type release hook: free the identifier string and the client itself */
+static void nfs_netns_client_release(struct kobject *kobj)
+{
+ struct nfs_netns_client *c = container_of(kobj,
+ struct nfs_netns_client,
+ kobject);
+
+ kfree(rcu_dereference_raw(c->identifier));
+ kfree(c);
+}
+
+/* kobj_type namespace hook: report the struct net stored in the client */
+static const void *nfs_netns_client_namespace(struct kobject *kobj)
+{
+ return container_of(kobj, struct nfs_netns_client, kobject)->net;
+}
+
+/* The "identifier" attribute: mode 0644, backed by the show/store handlers above */
+static struct kobj_attribute nfs_netns_client_id = __ATTR(identifier,
+ 0644, nfs_netns_identifier_show, nfs_netns_identifier_store);
+
+/* Default attribute list for nfs_netns_client kobjects (NULL terminated) */
+static struct attribute *nfs_netns_client_attrs[] = {
+ &nfs_netns_client_id.attr,
+ NULL,
+};
+
+/* kobj_type for the "nfs_client" objects created by nfs_netns_client_alloc() */
+static struct kobj_type nfs_netns_client_type = {
+ .release = nfs_netns_client_release,
+ .default_attrs = nfs_netns_client_attrs,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .namespace = nfs_netns_client_namespace,
+};
+
+/*
+ * Allocate an nfs_netns_client for 'net' and add its kobject, named
+ * "nfs_client", below 'parent' as a member of nfs_kset.  Returns the new
+ * client, or NULL if allocation or registration failed.
+ */
+static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent,
+		struct net *net)
+{
+	struct nfs_netns_client *clp = kzalloc(sizeof(*clp), GFP_KERNEL);
+
+	if (!clp)
+		return NULL;
+
+	clp->net = net;
+	clp->kobject.kset = nfs_kset;
+	if (kobject_init_and_add(&clp->kobject, &nfs_netns_client_type,
+				 parent, "nfs_client") != 0) {
+		/* Drop the reference rather than calling kfree() directly */
+		kobject_put(&clp->kobject);
+		return NULL;
+	}
+
+	return clp;
+}
+
+/*
+ * Create the sysfs "nfs_client" object for a network namespace below
+ * nfs_client_kobj and announce it with a KOBJ_ADD uevent.  If the object
+ * cannot be created, netns->nfs_client is left untouched and no error is
+ * reported.
+ */
+void nfs_netns_sysfs_setup(struct nfs_net *netns, struct net *net)
+{
+	struct nfs_netns_client *clp;
+
+	clp = nfs_netns_client_alloc(nfs_client_kobj, net);
+	if (!clp)
+		return;
+
+	netns->nfs_client = clp;
+	kobject_uevent(&clp->kobject, KOBJ_ADD);
+}
+
+/*
+ * Tear down the sysfs object set up by nfs_netns_sysfs_setup(): emit a
+ * KOBJ_REMOVE uevent, delete the kobject, drop the reference and clear
+ * netns->nfs_client.  Does nothing if no object was set up.
+ */
+void nfs_netns_sysfs_destroy(struct nfs_net *netns)
+{
+	struct nfs_netns_client *clp = netns->nfs_client;
+
+	if (!clp)
+		return;
+
+	kobject_uevent(&clp->kobject, KOBJ_REMOVE);
+	kobject_del(&clp->kobject);
+	kobject_put(&clp->kobject);
+	netns->nfs_client = NULL;
+}
diff --git a/fs/nfs/sysfs.h b/fs/nfs/sysfs.h
new file mode 100644
index 000000000..5501ef573
--- /dev/null
+++ b/fs/nfs/sysfs.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2019 Hammerspace Inc
+ */
+
+#ifndef __NFS_SYSFS_H
+#define __NFS_SYSFS_H
+
+/* Upper bound on the number of bytes accepted for an identifier */
+#define CONTAINER_ID_MAXLEN (64)
+
+/* Per network namespace sysfs object of the NFS client */
+struct nfs_netns_client {
+ struct kobject kobject; /* embedded kobject of this object */
+ struct net *net; /* network namespace this object belongs to */
+ const char __rcu *identifier; /* RCU-managed identifier string */
+};
+
+/* Parent kobject for the per-namespace nfs_netns_client objects */
+extern struct kobject *nfs_client_kobj;
+
+/* Global sysfs setup and teardown */
+extern int nfs_sysfs_init(void);
+extern void nfs_sysfs_exit(void);
+
+/* Per network namespace setup and teardown */
+void nfs_netns_sysfs_setup(struct nfs_net *netns, struct net *net);
+void nfs_netns_sysfs_destroy(struct nfs_net *netns);
+
+#endif
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
new file mode 100644
index 000000000..b27ebdcce
--- /dev/null
+++ b/fs/nfs/unlink.c
@@ -0,0 +1,521 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/nfs/unlink.c
+ *
+ * nfs sillydelete handling
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/dcache.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs_fs.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/namei.h>
+#include <linux/fsnotify.h>
+
+#include "internal.h"
+#include "nfs4_fs.h"
+#include "iostat.h"
+#include "delegation.h"
+
+#include "nfstrace.h"
+
+/**
+ * nfs_free_unlinkdata - release data from a sillydelete operation.
+ * @data: pointer to unlink structure.
+ */
+static void
+nfs_free_unlinkdata(struct nfs_unlinkdata *data)
+{
+ put_cred(data->cred); /* reference taken by get_current_cred() in nfs_async_unlink() */
+ kfree(data->args.name.name); /* kstrdup()'d copy of the name made in nfs_async_unlink() */
+ kfree(data);
+}
+
+/**
+ * nfs_async_unlink_done - Sillydelete post-processing
+ * @task: rpc_task of the sillydelete
+ * @calldata: pointer to nfs_unlinkdata
+ *
+ * Do the directory attribute update.
+ */
+static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs_unlinkdata *data = calldata;
+ struct inode *dir = d_inode(data->dentry->d_parent);
+
+ trace_nfs_sillyrename_unlink(data, task->tk_status);
+ if (!NFS_PROTO(dir)->unlink_done(task, dir)) /* a zero return from ->unlink_done() makes us restart the call */
+ rpc_restart_call_prepare(task);
+}
+
+/**
+ * nfs_async_unlink_release - Release the sillydelete data.
+ * @calldata: struct nfs_unlinkdata to release
+ *
+ * We need to call nfs_free_unlinkdata as a 'tk_release' task since the
+ * rpc_task would be freed too.
+ */
+static void nfs_async_unlink_release(void *calldata)
+{
+ struct nfs_unlinkdata *data = calldata;
+ struct dentry *dentry = data->dentry;
+ struct super_block *sb = dentry->d_sb; /* saved now: still needed after data is freed and the dentry is put */
+
+ up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem); /* pairs with down_read_non_owner() in nfs_call_unlink() */
+ d_lookup_done(dentry);
+ nfs_free_unlinkdata(data);
+ dput(dentry);
+ nfs_sb_deactive(sb); /* pairs with nfs_sb_active() in nfs_do_call_unlink() */
+}
+
+static void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
+{ /* rpc_call_prepare hook of nfs_unlink_ops: delegate to the protocol's ->unlink_rpc_prepare(). */
+ struct nfs_unlinkdata *data = calldata;
+ struct inode *dir = d_inode(data->dentry->d_parent);
+ NFS_PROTO(dir)->unlink_rpc_prepare(task, data);
+}
+
+static const struct rpc_call_ops nfs_unlink_ops = { /* callbacks for the async unlink task started by nfs_do_call_unlink() */
+ .rpc_call_done = nfs_async_unlink_done,
+ .rpc_release = nfs_async_unlink_release,
+ .rpc_call_prepare = nfs_unlink_prepare,
+};
+
+static void nfs_do_call_unlink(struct inode *inode, struct nfs_unlinkdata *data)
+{ /* Start the asynchronous unlink RPC described by @data in the parent directory of data->dentry. */
+ struct rpc_message msg = {
+ .rpc_argp = &data->args,
+ .rpc_resp = &data->res,
+ .rpc_cred = data->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_message = &msg,
+ .callback_ops = &nfs_unlink_ops,
+ .callback_data = data,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
+ };
+ struct rpc_task *task;
+ struct inode *dir = d_inode(data->dentry->d_parent);
+ nfs_sb_active(dir->i_sb); /* released by nfs_sb_deactive() in nfs_async_unlink_release() */
+ data->args.fh = NFS_FH(dir);
+ nfs_fattr_init(data->res.dir_attr);
+
+ NFS_PROTO(dir)->unlink_setup(&msg, data->dentry, inode);
+
+ task_setup_data.rpc_client = NFS_CLIENT(dir);
+ task = rpc_run_task(&task_setup_data);
+ if (!IS_ERR(task))
+ rpc_put_task_async(task); /* the result is not waited for here; completion runs through nfs_unlink_ops */
+}
+
+static int nfs_call_unlink(struct dentry *dentry, struct inode *inode, struct nfs_unlinkdata *data)
+{ /* Returns 1 if @data was handed on (to an alias dentry or to the unlink RPC), 0 if the caller still owns it. */
+ struct inode *dir = d_inode(dentry->d_parent);
+ struct dentry *alias;
+
+ down_read_non_owner(&NFS_I(dir)->rmdir_sem); /* dropped on each early return below; otherwise by nfs_async_unlink_release() */
+ alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
+ if (IS_ERR(alias)) {
+ up_read_non_owner(&NFS_I(dir)->rmdir_sem);
+ return 0;
+ }
+ if (!d_in_lookup(alias)) {
+ int ret;
+ void *devname_garbage = NULL;
+
+ /*
+ * Hey, we raced with lookup... See if we need to transfer
+ * the sillyrename information to the aliased dentry.
+ */
+ spin_lock(&alias->d_lock);
+ if (d_really_is_positive(alias) &&
+ !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
+ devname_garbage = alias->d_fsdata;
+ alias->d_fsdata = data;
+ alias->d_flags |= DCACHE_NFSFS_RENAMED;
+ ret = 1;
+ } else
+ ret = 0;
+ spin_unlock(&alias->d_lock);
+ dput(alias);
+ up_read_non_owner(&NFS_I(dir)->rmdir_sem);
+ /*
+ * If we'd displaced old cached devname, free it. At that
+ * point dentry is definitely not a root, so we won't need
+ * that anymore.
+ */
+ kfree(devname_garbage);
+ return ret;
+ }
+ data->dentry = alias; /* the in-lookup alias is released later via d_lookup_done()/dput() in nfs_async_unlink_release() */
+ nfs_do_call_unlink(inode, data);
+ return 1;
+}
+
+/**
+ * nfs_async_unlink - record the unlink to run once a sillyrenamed file is released
+ * @dentry: dentry being sillyrenamed; the unlink data is stored in its d_fsdata
+ * @name: silly name the file is being renamed to, i.e. the name to unlink later
+ */
+static int
+nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
+{
+ struct nfs_unlinkdata *data;
+ int status = -ENOMEM; /* result if either allocation below fails */
+ void *devname_garbage = NULL;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (data == NULL)
+ goto out;
+ data->args.name.name = kstrdup(name->name, GFP_KERNEL);
+ if (!data->args.name.name)
+ goto out_free;
+ data->args.name.len = name->len;
+
+ data->cred = get_current_cred();
+ data->res.dir_attr = &data->dir_attr;
+ init_waitqueue_head(&data->wq);
+
+ status = -EBUSY; /* result if the dentry is already flagged DCACHE_NFSFS_RENAMED */
+ spin_lock(&dentry->d_lock);
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+ goto out_unlock;
+ dentry->d_flags |= DCACHE_NFSFS_RENAMED;
+ devname_garbage = dentry->d_fsdata;
+ dentry->d_fsdata = data;
+ spin_unlock(&dentry->d_lock);
+ /*
+ * If we'd displaced old cached devname, free it. At that
+ * point dentry is definitely not a root, so we won't need
+ * that anymore.
+ */
+ kfree(devname_garbage);
+ return 0;
+out_unlock:
+ spin_unlock(&dentry->d_lock);
+ put_cred(data->cred);
+ kfree(data->args.name.name);
+out_free:
+ kfree(data);
+out:
+ return status;
+}
+
+/**
+ * nfs_complete_unlink - Initialize completion of the sillydelete
+ * @dentry: dentry to delete
+ * @inode: inode
+ *
+ * Since we're most likely to be called by dentry_iput(), we
+ * only use the dentry to find the sillydelete data (its d_fsdata). The data
+ * is freed here unless nfs_call_unlink() hands it on.
+ */
+void
+nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
+{
+ struct nfs_unlinkdata *data;
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+ data = dentry->d_fsdata; /* detach the unlink data stored by nfs_async_unlink() */
+ dentry->d_fsdata = NULL;
+ spin_unlock(&dentry->d_lock);
+
+ if (NFS_STALE(inode) || !nfs_call_unlink(dentry, inode, data)) /* stale inode: no RPC is attempted at all */
+ nfs_free_unlinkdata(data);
+}
+
+/* Cancel a queued async unlink. Called when a sillyrename run fails. */
+static void
+nfs_cancel_async_unlink(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { /* only act if nfs_async_unlink() actually queued data */
+ struct nfs_unlinkdata *data = dentry->d_fsdata;
+
+ dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+ dentry->d_fsdata = NULL;
+ spin_unlock(&dentry->d_lock); /* unlock before freeing: the data is already detached from the dentry */
+ nfs_free_unlinkdata(data);
+ return;
+ }
+ spin_unlock(&dentry->d_lock);
+}
+
+/**
+ * nfs_async_rename_done - Sillyrename post-processing
+ * @task: rpc_task of the sillyrename
+ * @calldata: nfs_renamedata for the sillyrename
+ *
+ * Do the directory attribute updates and the d_move
+ */
+static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs_renamedata *data = calldata;
+ struct inode *old_dir = data->old_dir;
+ struct inode *new_dir = data->new_dir;
+ struct dentry *old_dentry = data->old_dentry;
+
+ trace_nfs_sillyrename_rename(old_dir, old_dentry,
+ new_dir, data->new_dentry, task->tk_status);
+ if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { /* a zero return makes us restart the call and skip ->complete for now */
+ rpc_restart_call_prepare(task);
+ return;
+ }
+
+ if (data->complete) /* optional callback supplied to nfs_async_rename() */
+ data->complete(task, data);
+}
+
+/**
+ * nfs_async_rename_release - Release the sillyrename data.
+ * @calldata: the struct nfs_renamedata to be released
+ */
+static void nfs_async_rename_release(void *calldata)
+{
+ struct nfs_renamedata *data = calldata;
+ struct super_block *sb = data->old_dir->i_sb; /* saved now: used after the iput() calls below */
+
+ if (d_really_is_positive(data->old_dentry))
+ nfs_mark_for_revalidate(d_inode(data->old_dentry));
+
+ /* The result of the rename is unknown. Play it safe by
+ * forcing a new lookup */
+ if (data->cancelled) {
+ spin_lock(&data->old_dir->i_lock);
+ nfs_force_lookup_revalidate(data->old_dir);
+ spin_unlock(&data->old_dir->i_lock);
+ if (data->new_dir != data->old_dir) { /* avoid revalidating the same directory twice */
+ spin_lock(&data->new_dir->i_lock);
+ nfs_force_lookup_revalidate(data->new_dir);
+ spin_unlock(&data->new_dir->i_lock);
+ }
+ }
+
+ dput(data->old_dentry); /* this and the next three calls drop the references taken in nfs_async_rename() */
+ dput(data->new_dentry);
+ iput(data->old_dir);
+ iput(data->new_dir);
+ nfs_sb_deactive(sb); /* pairs with nfs_sb_active() in nfs_async_rename() */
+ put_cred(data->cred); /* pairs with get_current_cred() in nfs_async_rename() */
+ kfree(data);
+}
+
+static void nfs_rename_prepare(struct rpc_task *task, void *calldata)
+{ /* rpc_call_prepare hook of nfs_rename_ops: delegate to the protocol's ->rename_rpc_prepare(). */
+ struct nfs_renamedata *data = calldata;
+ NFS_PROTO(data->old_dir)->rename_rpc_prepare(task, data);
+}
+
+static const struct rpc_call_ops nfs_rename_ops = { /* callbacks for the rename task started by nfs_async_rename() */
+ .rpc_call_done = nfs_async_rename_done,
+ .rpc_release = nfs_async_rename_release,
+ .rpc_call_prepare = nfs_rename_prepare,
+};
+
+/**
+ * nfs_async_rename - perform an asynchronous rename operation
+ * @old_dir: directory that currently holds the dentry to be renamed
+ * @new_dir: target directory for the rename
+ * @old_dentry: original dentry to be renamed
+ * @new_dentry: dentry to which the old_dentry should be renamed
+ * @complete: Function to run on successful completion
+ *
+ * It's expected that valid references to the dentries and inodes are held
+ */
+struct rpc_task *
+nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
+ struct dentry *old_dentry, struct dentry *new_dentry,
+ void (*complete)(struct rpc_task *, struct nfs_renamedata *))
+{ /* Returns the result of rpc_run_task(), or ERR_PTR(-ENOMEM) if the rename data cannot be allocated. */
+ struct nfs_renamedata *data;
+ struct rpc_message msg = { };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_message = &msg,
+ .callback_ops = &nfs_rename_ops,
+ .workqueue = nfsiod_workqueue,
+ .rpc_client = NFS_CLIENT(old_dir),
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
+ };
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (data == NULL)
+ return ERR_PTR(-ENOMEM);
+ task_setup_data.callback_data = data;
+
+ data->cred = get_current_cred(); /* dropped by put_cred() in nfs_async_rename_release() */
+
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ msg.rpc_cred = data->cred;
+
+ /* set up nfs_renamedata */
+ data->old_dir = old_dir;
+ ihold(old_dir); /* inode and dentry references taken here are dropped in nfs_async_rename_release() */
+ data->new_dir = new_dir;
+ ihold(new_dir);
+ data->old_dentry = dget(old_dentry);
+ data->new_dentry = dget(new_dentry);
+ nfs_fattr_init(&data->old_fattr);
+ nfs_fattr_init(&data->new_fattr);
+ data->complete = complete;
+
+ /* set up nfs_renameargs */
+ data->args.old_dir = NFS_FH(old_dir);
+ data->args.old_name = &old_dentry->d_name;
+ data->args.new_dir = NFS_FH(new_dir);
+ data->args.new_name = &new_dentry->d_name;
+
+ /* set up nfs_renameres */
+ data->res.old_fattr = &data->old_fattr;
+ data->res.new_fattr = &data->new_fattr;
+
+ nfs_sb_active(old_dir->i_sb); /* released by nfs_sb_deactive() in nfs_async_rename_release() */
+
+ NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dentry, new_dentry);
+
+ return rpc_run_task(&task_setup_data);
+}
+
+/*
+ * Perform tasks needed when a sillyrename is done such as cancelling the
+ * queued async unlink if it failed.
+ */
+static void
+nfs_complete_sillyrename(struct rpc_task *task, struct nfs_renamedata *data)
+{
+ struct dentry *dentry = data->old_dentry; /* the dentry on which nfs_async_unlink() queued the unlink */
+
+ if (task->tk_status != 0) { /* rename failed: drop the unlink queued for it */
+ nfs_cancel_async_unlink(dentry);
+ return;
+ }
+}
+
+#define SILLYNAME_PREFIX ".nfs" /* leading part of every silly name */
+#define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1) /* sizeof counts the NUL, hence the -1 */
+#define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1) /* two hex digits per byte of the fileid */
+#define SILLYNAME_COUNTER_LEN ((unsigned)sizeof(unsigned int) << 1) /* two hex digits per byte of the counter */
+#define SILLYNAME_LEN (SILLYNAME_PREFIX_LEN + \
+ SILLYNAME_FILEID_LEN + \
+ SILLYNAME_COUNTER_LEN)
+
+/**
+ * nfs_sillyrename - Perform a silly-rename of a dentry
+ * @dir: inode of directory that contains dentry
+ * @dentry: dentry to be sillyrenamed
+ *
+ * NFSv2/3 is stateless and the server doesn't know when the client is
+ * holding a file open. To prevent application problems when a file is
+ * unlinked while it's still open, the client performs a "silly-rename".
+ * That is, it renames the file to a hidden file in the same directory,
+ * and only performs the unlink once the last reference to it is put.
+ *
+ * The final cleanup is done during dentry_iput.
+ *
+ * (Note: NFSv4 is stateful, and has opens, so in theory an NFSv4 server
+ * could take responsibility for keeping open files referenced. The server
+ * would also need to ensure that opened-but-deleted files were kept over
+ * reboots. However, we may not assume a server does so. (RFC 5661
+ * does provide an OPEN4_RESULT_PRESERVE_UNLINKED flag that a server can
+ * use to advertise that it does this; some day we may take advantage of
+ * it.))
+ */
+int
+nfs_sillyrename(struct inode *dir, struct dentry *dentry)
+{
+ static unsigned int sillycounter; /* shared by all callers; incremented below without any locking in this function */
+ unsigned char silly[SILLYNAME_LEN + 1]; /* +1 for the terminating NUL */
+ unsigned long long fileid;
+ struct dentry *sdentry;
+ struct inode *inode = d_inode(dentry);
+ struct rpc_task *task;
+ int error = -EBUSY; /* returned if the dentry is already sillyrenamed or the name lookup fails */
+
+ dfprintk(VFS, "NFS: silly-rename(%pd2, ct=%d)\n",
+ dentry, d_count(dentry));
+ nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
+
+ /*
+ * We don't allow a dentry to be silly-renamed twice.
+ */
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+ goto out;
+
+ fileid = NFS_FILEID(d_inode(dentry));
+
+ sdentry = NULL;
+ do {
+ int slen;
+ dput(sdentry); /* NULL on the first pass; afterwards the positive dentry from the previous attempt */
+ sillycounter++;
+ slen = scnprintf(silly, sizeof(silly),
+ SILLYNAME_PREFIX "%0*llx%0*x",
+ SILLYNAME_FILEID_LEN, fileid,
+ SILLYNAME_COUNTER_LEN, sillycounter);
+
+ dfprintk(VFS, "NFS: trying to rename %pd to %s\n",
+ dentry, silly);
+
+ sdentry = lookup_one_len(silly, dentry->d_parent, slen);
+ /*
+ * N.B. Better to return EBUSY here ... it could be
+ * dangerous to delete the file while it's in use.
+ */
+ if (IS_ERR(sdentry))
+ goto out;
+ } while (d_inode(sdentry) != NULL); /* need negative lookup */
+
+ ihold(inode); /* dropped by iput() at out_dput */
+
+ /* queue unlink first. Can't do this from rpc_release as it
+ * has to allocate memory
+ */
+ error = nfs_async_unlink(dentry, &sdentry->d_name);
+ if (error)
+ goto out_dput;
+
+ /* run the rename task, undo unlink if it fails */
+ task = nfs_async_rename(dir, dir, dentry, sdentry,
+ nfs_complete_sillyrename);
+ if (IS_ERR(task)) {
+ error = -EBUSY; /* the specific error in task is not passed on */
+ nfs_cancel_async_unlink(dentry);
+ goto out_dput;
+ }
+
+ /* wait for the RPC task to complete, unless a SIGKILL intervenes */
+ error = rpc_wait_for_completion_task(task);
+ if (error == 0)
+ error = task->tk_status;
+ switch (error) { /* any other error falls through the switch and is returned unchanged */
+ case 0:
+ /* The rename succeeded */
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ spin_lock(&inode->i_lock);
+ NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter();
+ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE
+ | NFS_INO_INVALID_CTIME
+ | NFS_INO_REVAL_FORCED;
+ spin_unlock(&inode->i_lock);
+ d_move(dentry, sdentry);
+ break;
+ case -ERESTARTSYS:
+ /* The result of the rename is unknown. Play it safe by
+ * forcing a new lookup */
+ d_drop(dentry);
+ d_drop(sdentry);
+ }
+ rpc_put_task(task); /* drop the task reference returned by nfs_async_rename() */
+out_dput:
+ iput(inode);
+ dput(sdentry);
+out:
+ return error;
+}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
new file mode 100644
index 000000000..d3cd099ff
--- /dev/null
+++ b/fs/nfs/write.c
@@ -0,0 +1,2180 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/nfs/write.c
+ *
+ * Write file data over NFS.
+ *
+ * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/writeback.h>
+#include <linux/swap.h>
+#include <linux/migrate.h>
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs_page.h>
+#include <linux/backing-dev.h>
+#include <linux/export.h>
+#include <linux/freezer.h>
+#include <linux/wait.h>
+#include <linux/iversion.h>
+
+#include <linux/uaccess.h>
+#include <linux/sched/mm.h>
+
+#include "delegation.h"
+#include "internal.h"
+#include "iostat.h"
+#include "nfs4_fs.h"
+#include "fscache.h"
+#include "pnfs.h"
+
+#include "nfstrace.h"
+
+#define NFSDBG_FACILITY NFSDBG_PAGECACHE
+
+#define MIN_POOL_WRITE (32)
+#define MIN_POOL_COMMIT (4)
+
+struct nfs_io_completion {
+ void (*complete)(void *data); /* invoked with ->data when the last reference is dropped */
+ void *data; /* opaque argument for ->complete */
+ struct kref refcount; /* initialised by nfs_io_completion_init() */
+};
+
+/*
+ * Local function declarations
+ */
+static void nfs_redirty_request(struct nfs_page *req);
+static const struct rpc_call_ops nfs_commit_ops;
+static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
+static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
+static const struct nfs_rw_ops nfs_rw_write_ops;
+static void nfs_inode_remove_request(struct nfs_page *req);
+static void nfs_clear_request_commit(struct nfs_commit_info *cinfo,
+ struct nfs_page *req);
+static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
+ struct inode *inode);
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+ struct page *page);
+
+static struct kmem_cache *nfs_wdata_cachep;
+static mempool_t *nfs_wdata_mempool;
+static struct kmem_cache *nfs_cdata_cachep;
+static mempool_t *nfs_commit_mempool;
+
+struct nfs_commit_data *nfs_commitdata_alloc(void)
+{ /* Allocate a zeroed nfs_commit_data with an empty ->pages list; returns NULL if both allocators fail. */
+ struct nfs_commit_data *p;
+
+ p = kmem_cache_zalloc(nfs_cdata_cachep, nfs_io_gfp_mask());
+ if (!p) { /* slab allocation failed: fall back to the mempool with GFP_NOWAIT */
+ p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT);
+ if (!p)
+ return NULL;
+ memset(p, 0, sizeof(*p)); /* match the zeroing kmem_cache_zalloc() would have done */
+ }
+ INIT_LIST_HEAD(&p->pages);
+ return p;
+}
+EXPORT_SYMBOL_GPL(nfs_commitdata_alloc);
+
+void nfs_commit_free(struct nfs_commit_data *p)
+{ /* Return @p through nfs_commit_mempool; NOTE(review): assumes the mempool is backed by nfs_cdata_cachep - confirm where the pool is created. */
+ mempool_free(p, nfs_commit_mempool);
+}
+EXPORT_SYMBOL_GPL(nfs_commit_free);
+
+static struct nfs_pgio_header *nfs_writehdr_alloc(void)
+{ /* Allocate a zeroed nfs_pgio_header marked FMODE_WRITE; returns NULL if both allocators fail. */
+ struct nfs_pgio_header *p;
+
+ p = kmem_cache_zalloc(nfs_wdata_cachep, nfs_io_gfp_mask());
+ if (!p) { /* slab allocation failed: fall back to the mempool with GFP_NOWAIT */
+ p = mempool_alloc(nfs_wdata_mempool, GFP_NOWAIT);
+ if (!p)
+ return NULL;
+ memset(p, 0, sizeof(*p)); /* match the zeroing kmem_cache_zalloc() would have done */
+ }
+ p->rw_mode = FMODE_WRITE;
+ return p;
+}
+
+static void nfs_writehdr_free(struct nfs_pgio_header *hdr)
+{ /* Return @hdr through nfs_wdata_mempool; NOTE(review): assumes the mempool is backed by nfs_wdata_cachep - confirm where the pool is created. */
+ mempool_free(hdr, nfs_wdata_mempool);
+}
+
+static struct nfs_io_completion *nfs_io_completion_alloc(gfp_t gfp_flags)
+{ /* Returns uninitialised memory (or NULL); the caller must run nfs_io_completion_init() before use. */
+ return kmalloc(sizeof(struct nfs_io_completion), gfp_flags);
+}
+
+static void nfs_io_completion_init(struct nfs_io_completion *ioc,
+ void (*complete)(void *), void *data)
+{ /* Fill in the callback and its argument and initialise the reference count. */
+ ioc->complete = complete;
+ ioc->data = data;
+ kref_init(&ioc->refcount);
+}
+
+static void nfs_io_completion_release(struct kref *kref)
+{ /* kref release function used by nfs_io_completion_put(): run the callback, then free the object. */
+ struct nfs_io_completion *ioc = container_of(kref,
+ struct nfs_io_completion, refcount);
+ ioc->complete(ioc->data);
+ kfree(ioc);
+}
+
+static void nfs_io_completion_get(struct nfs_io_completion *ioc)
+{ /* Take a reference on @ioc; a NULL @ioc is accepted and ignored. */
+ if (ioc != NULL)
+ kref_get(&ioc->refcount);
+}
+
+static void nfs_io_completion_put(struct nfs_io_completion *ioc)
+{ /* Drop a reference on @ioc (NULL is ignored); the final put runs nfs_io_completion_release(). */
+ if (ioc != NULL)
+ kref_put(&ioc->refcount, nfs_io_completion_release);
+}
+
+static void
+nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode)
+{
+ if (!test_and_set_bit(PG_INODE_REF, &req->wb_flags)) { /* only the caller that newly sets the bit takes the reference */
+ kref_get(&req->wb_kref);
+ atomic_long_inc(&NFS_I(inode)->nrequests); /* decremented again when PG_INODE_REF is cleared, e.g. in nfs_destroy_unlinked_subrequests() */
+ }
+}
+
+static int
+nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
+{ /* If PG_REMOVE is set on @req, clear it and retake the inode reference; returns 0 or the page group lock error. */
+ int ret;
+
+ if (!test_bit(PG_REMOVE, &req->wb_flags)) /* cheap unlocked check before taking the page group lock */
+ return 0;
+ ret = nfs_page_group_lock(req);
+ if (ret)
+ return ret;
+ if (test_and_clear_bit(PG_REMOVE, &req->wb_flags)) /* re-tested now that the page group lock is held */
+ nfs_page_set_inode_ref(req, inode);
+ nfs_page_group_unlock(req);
+ return 0;
+}
+
+static struct nfs_page *
+nfs_page_private_request(struct page *page)
+{ /* Return the nfs_page stored in @page's private field, or NULL if PagePrivate is clear; takes no lock or reference. */
+ if (!PagePrivate(page))
+ return NULL;
+ return (struct nfs_page *)page_private(page);
+}
+
+/*
+ * nfs_page_find_private_request - find head request attached to @page
+ *
+ * takes mapping->private_lock itself while reading the page's private data.
+ *
+ * returns matching head request with reference held, or NULL if not found.
+ */
+static struct nfs_page *
+nfs_page_find_private_request(struct page *page)
+{
+ struct address_space *mapping = page_file_mapping(page);
+ struct nfs_page *req;
+
+ if (!PagePrivate(page)) /* unlocked fast path */
+ return NULL;
+ spin_lock(&mapping->private_lock);
+ req = nfs_page_private_request(page); /* repeats the PagePrivate test, now under the lock */
+ if (req) {
+ WARN_ON_ONCE(req->wb_head != req);
+ kref_get(&req->wb_kref);
+ }
+ spin_unlock(&mapping->private_lock);
+ return req;
+}
+
+static struct nfs_page *
+nfs_page_find_swap_request(struct page *page)
+{ /* For swap-cache pages, search the commit lists for the head request; returns it with a reference held, or NULL. */
+ struct inode *inode = page_file_mapping(page)->host;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_page *req = NULL;
+ if (!PageSwapCache(page)) /* unlocked fast path */
+ return NULL;
+ mutex_lock(&nfsi->commit_mutex);
+ if (PageSwapCache(page)) { /* re-tested under commit_mutex */
+ req = nfs_page_search_commits_for_head_request_locked(nfsi,
+ page);
+ if (req) {
+ WARN_ON_ONCE(req->wb_head != req);
+ kref_get(&req->wb_kref);
+ }
+ }
+ mutex_unlock(&nfsi->commit_mutex);
+ return req;
+}
+
+/*
+ * nfs_page_find_head_request - find head request associated with @page
+ *
+ * returns matching head request with reference held, or NULL if not found.
+ */
+static struct nfs_page *nfs_page_find_head_request(struct page *page)
+{
+ struct nfs_page *req;
+
+ req = nfs_page_find_private_request(page); /* first try the request attached to the page itself */
+ if (!req)
+ req = nfs_page_find_swap_request(page); /* otherwise fall back to the swap-cache lookup */
+ return req;
+}
+
+static struct nfs_page *nfs_find_and_lock_page_request(struct page *page)
+{ /* Returns the locked head request for @page, NULL if the page has no request, or an ERR_PTR on failure. */
+ struct inode *inode = page_file_mapping(page)->host;
+ struct nfs_page *req, *head;
+ int ret;
+
+ for (;;) { /* retried until the locked head is still the request attached to the page */
+ req = nfs_page_find_head_request(page);
+ if (!req)
+ return req;
+ head = nfs_page_group_lock_head(req);
+ if (head != req) /* also true when head is an ERR_PTR: the lookup reference is dropped either way */
+ nfs_release_request(req);
+ if (IS_ERR(head))
+ return head;
+ ret = nfs_cancel_remove_inode(head, inode);
+ if (ret < 0) {
+ nfs_unlock_and_release_request(head);
+ return ERR_PTR(ret);
+ }
+ /* Ensure that nobody removed the request before we locked it */
+ if (head == nfs_page_private_request(page))
+ break;
+ if (PageSwapCache(page)) /* swap-cache requests are not found through the page's private field, so accept head */
+ break;
+ nfs_unlock_and_release_request(head);
+ }
+ return head;
+}
+
+/* Adjust the file length if we're writing beyond the end */
+static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
+{
+ struct inode *inode = page_file_mapping(page)->host;
+ loff_t end, i_size;
+ pgoff_t end_index;
+
+ spin_lock(&inode->i_lock);
+ i_size = i_size_read(inode);
+ end_index = (i_size - 1) >> PAGE_SHIFT; /* index of the page holding the last byte; only used when i_size > 0 */
+ if (i_size > 0 && page_index(page) < end_index) /* page lies before the last page of the file: the write cannot extend it */
+ goto out;
+ end = page_file_offset(page) + ((loff_t)offset+count); /* file offset just past the written range */
+ if (i_size >= end)
+ goto out;
+ i_size_write(inode, end);
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; /* the size was just set here, so stop treating it as invalid */
+ nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
+out:
+ spin_unlock(&inode->i_lock);
+}
+
+/* A writeback failed: mark the page as bad, and invalidate the page cache */
+static void nfs_set_pageerror(struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+
+ nfs_zap_mapping(mapping->host, mapping); /* called before, and outside of, the i_lock section below */
+ /* Force file size revalidation */
+ spin_lock(&inode->i_lock);
+ NFS_I(inode)->cache_validity |= NFS_INO_REVAL_FORCED |
+ NFS_INO_REVAL_PAGECACHE |
+ NFS_INO_INVALID_SIZE;
+ spin_unlock(&inode->i_lock);
+}
+
+static void nfs_mapping_set_error(struct page *page, int error)
+{ /* Record a write error on @page, on its mapping and on the superblock, then invalidate cached state. */
+ struct address_space *mapping = page_file_mapping(page);
+
+ SetPageError(page);
+ filemap_set_wb_err(mapping, error);
+ if (mapping->host)
+ errseq_set(&mapping->host->i_sb->s_wb_err,
+ error == -ENOSPC ? -ENOSPC : -EIO); /* the superblock only records -ENOSPC or -EIO; the mapping keeps @error */
+ nfs_set_pageerror(mapping);
+}
+
+/*
+ * nfs_page_group_search_locked
+ * @head - head request of page group
+ * @page_offset - offset into page
+ *
+ * Search page group with head @head to find a request that contains the
+ * page offset @page_offset.
+ *
+ * Returns a pointer to the first matching nfs request, or NULL if no
+ * match is found.
+ *
+ * Must be called with the page group lock held
+ */
+static struct nfs_page *
+nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset)
+{
+ struct nfs_page *req;
+
+ req = head;
+ do { /* the group is a circular list linked through wb_this_page */
+ if (page_offset >= req->wb_pgbase &&
+ page_offset < (req->wb_pgbase + req->wb_bytes)) /* half-open range [wb_pgbase, wb_pgbase + wb_bytes) */
+ return req;
+
+ req = req->wb_this_page;
+ } while (req != head);
+
+ return NULL;
+}
+
+/*
+ * nfs_page_group_covers_page
+ * @req - any request of the page group; the search starts at req->wb_head
+ *
+ * Return true if the page group that @req belongs to covers the whole page,
+ * returns false otherwise
+ */
+static bool nfs_page_group_covers_page(struct nfs_page *req)
+{
+ struct nfs_page *tmp;
+ unsigned int pos = 0;
+ unsigned int len = nfs_page_length(req->wb_page);
+
+ nfs_page_group_lock(req); /* return value is not checked here */
+
+ for (;;) { /* advance pos across contiguous requests until a gap is found */
+ tmp = nfs_page_group_search_locked(req->wb_head, pos);
+ if (!tmp)
+ break;
+ pos = tmp->wb_pgbase + tmp->wb_bytes;
+ }
+
+ nfs_page_group_unlock(req);
+ return pos >= len;
+}
+
+/* We can set the PG_uptodate flag if we see that a write request
+ * covers the full page.
+ */
+static void nfs_mark_uptodate(struct nfs_page *req)
+{
+ if (PageUptodate(req->wb_page)) /* already up to date: nothing to do */
+ return;
+ if (!nfs_page_group_covers_page(req)) /* only part of the page is covered by write requests */
+ return;
+ SetPageUptodate(req->wb_page);
+}
+
+static int wb_priority(struct writeback_control *wbc)
+{ /* Map @wbc to flush flags: FLUSH_COND_STABLE for WB_SYNC_ALL writeback, 0 otherwise. */
+ int ret = 0;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ ret = FLUSH_COND_STABLE;
+ return ret;
+}
+
+/*
+ * NFS congestion control
+ */
+
+int nfs_congestion_kb; /* congestion threshold in kilobytes; converted to pages by the macros below */
+
+#define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10)) /* kB -> pages */
+#define NFS_CONGESTION_OFF_THRESH \
+ (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) /* three quarters of the on-threshold */
+
+static void nfs_set_page_writeback(struct page *page)
+{ /* Mark @page as under writeback and flag the bdi congested once the server's writeback count exceeds the on-threshold. */
+ struct inode *inode = page_file_mapping(page)->host;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+ int ret = test_set_page_writeback(page);
+
+ WARN_ON_ONCE(ret != 0); /* the page is not expected to be under writeback already */
+
+ if (atomic_long_inc_return(&nfss->writeback) >
+ NFS_CONGESTION_ON_THRESH)
+ set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+}
+
+static void nfs_end_page_writeback(struct nfs_page *req)
+{ /* Unlock @req; if the group sync on PG_WB_END reports done, end page writeback and update congestion state. */
+ struct inode *inode = page_file_mapping(req->wb_page)->host;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+ bool is_done;
+
+ is_done = nfs_page_group_sync_on_bit(req, PG_WB_END); /* NOTE(review): presumably true only once every request in the group has set the bit - confirm in pagelist.c */
+ nfs_unlock_request(req);
+ if (!is_done)
+ return;
+
+ end_page_writeback(req->wb_page);
+ if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) /* counterpart of the increment in nfs_set_page_writeback() */
+ clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+}
+
+/*
+ * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests
+ *
+ * @destroy_list - request list (using wb_this_page) terminated by @old_head
+ * @old_head - the old head of the list
+ * @inode - inode whose nrequests count drops for each PG_INODE_REF cleared
+ * All subrequests must be locked and removed from all lists, so at this point
+ * they are only "active" in this function, and possibly in nfs_wait_on_request
+ * with a reference held by some other context.
+ */
+static void
+nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
+ struct nfs_page *old_head,
+ struct inode *inode)
+{
+ while (destroy_list) {
+ struct nfs_page *subreq = destroy_list;
+
+ destroy_list = (subreq->wb_this_page == old_head) ?
+ NULL : subreq->wb_this_page; /* advance first: subreq's link is overwritten below */
+
+ /* Note: lock subreq in order to change subreq->wb_head */
+ nfs_page_set_headlock(subreq);
+ WARN_ON_ONCE(old_head != subreq->wb_head);
+
+ /* make sure old group is not used */
+ subreq->wb_this_page = subreq;
+ subreq->wb_head = subreq;
+
+ clear_bit(PG_REMOVE, &subreq->wb_flags);
+
+ /* Note: races with nfs_page_group_destroy() */
+ if (!kref_read(&subreq->wb_kref)) {
+ /* Check if we raced with nfs_page_group_destroy() */
+ if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) {
+ nfs_page_clear_headlock(subreq);
+ nfs_free_request(subreq);
+ } else
+ nfs_page_clear_headlock(subreq);
+ continue; /* no references left on subreq: skip the release calls below */
+ }
+ nfs_page_clear_headlock(subreq);
+
+ nfs_release_request(old_head); /* NOTE(review): presumably the reference this subrequest held on its old head - confirm in pagelist.c */
+
+ if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) { /* undo nfs_page_set_inode_ref() */
+ nfs_release_request(subreq);
+ atomic_long_dec(&NFS_I(inode)->nrequests);
+ }
+
+ /* subreq is now totally disconnected from page group or any
+ * write / commit lists. last chance to wake any waiters */
+ nfs_unlock_and_release_request(subreq);
+ }
+}
+
+/*
+ * nfs_join_page_group - destroy subrequests of the head req
+ * @head: head request of the page group to be joined
+ * @cinfo: commit info passed to nfs_clear_request_commit() for each request
+ * @inode: Inode to which the request belongs.
+ *
+ * The caller must already have locked every request in the group. This
+ * function updates the head request to cover the whole range covered by the
+ * (former) group. All subrequests are removed from any commit lists,
+ * unlinked from the group and destroyed.
+ */
+void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo,
+ struct inode *inode)
+{
+ struct nfs_page *subreq;
+ struct nfs_page *destroy_list = NULL;
+ unsigned int pgbase, off, bytes;
+
+ pgbase = head->wb_pgbase;
+ bytes = head->wb_bytes;
+ off = head->wb_offset;
+ for (subreq = head->wb_this_page; subreq != head;
+ subreq = subreq->wb_this_page) {
+ /* Subrequests should always form a contiguous range */
+ if (pgbase > subreq->wb_pgbase) { /* subrequest starts earlier: move the start of the range back */
+ off -= pgbase - subreq->wb_pgbase;
+ bytes += pgbase - subreq->wb_pgbase;
+ pgbase = subreq->wb_pgbase;
+ }
+ bytes = max(subreq->wb_pgbase + subreq->wb_bytes
+ - pgbase, bytes); /* extend the end if this subrequest reaches further */
+ }
+
+ /* Set the head request's range to cover the former page group */
+ head->wb_pgbase = pgbase;
+ head->wb_bytes = bytes;
+ head->wb_offset = off;
+
+ /* Now that all requests are locked, make sure they aren't on any list.
+ * Commit list removal accounting is done after locks are dropped */
+ subreq = head;
+ do {
+ nfs_clear_request_commit(cinfo, subreq);
+ subreq = subreq->wb_this_page;
+ } while (subreq != head);
+
+ /* unlink subrequests from head, destroy them later */
+ if (head->wb_this_page != head) {
+ /* destroy list will be terminated by head */
+ destroy_list = head->wb_this_page;
+ head->wb_this_page = head;
+ }
+
+ nfs_destroy_unlinked_subrequests(destroy_list, head, inode); /* returns at once when destroy_list is NULL */
+}
+
+/*
+ * nfs_lock_and_join_requests - join all subreqs to the head req
+ * @page: the page used to lookup the "page group" of nfs_page structures
+ *
+ * This function joins all sub requests to the head request by first
+ * locking all requests in the group, cancelling any pending operations
+ * and finally updating the head request to cover the whole range covered by
+ * the (former) group. All subrequests are removed from any write or commit
+ * lists, unlinked from the group and destroyed.
+ *
+ * Returns a locked, referenced pointer to the head request - which after
+ * this call is guaranteed to be the only request associated with the page.
+ * Returns NULL if no requests are found for @page, or a ERR_PTR if an
+ * error was encountered.
+ */
+static struct nfs_page *
+nfs_lock_and_join_requests(struct page *page)
+{
+ struct inode *inode = page_file_mapping(page)->host;
+ struct nfs_page *head;
+ struct nfs_commit_info cinfo;
+ int ret;
+
+ nfs_init_cinfo_from_inode(&cinfo, inode);
+ /*
+ * A reference is taken only on the head request which acts as a
+ * reference to the whole page group - the group will not be destroyed
+ * until the head reference is released.
+ */
+ head = nfs_find_and_lock_page_request(page);
+ if (IS_ERR_OR_NULL(head)) /* NULL (no request) and ERR_PTR are both passed straight to the caller */
+ return head;
+
+ /* lock each request in the page group */
+ ret = nfs_page_group_lock_subrequests(head);
+ if (ret < 0) {
+ nfs_unlock_and_release_request(head);
+ return ERR_PTR(ret);
+ }
+
+ nfs_join_page_group(head, &cinfo, inode);
+
+ return head;
+}
+
+static void nfs_write_error(struct nfs_page *req, int error)
+{ /* Give up on @req: record @error on its page and mapping, remove the request, end writeback and drop a reference. */
+ trace_nfs_write_error(req, error);
+ nfs_mapping_set_error(req->wb_page, error);
+ nfs_inode_remove_request(req);
+ nfs_end_page_writeback(req); /* also unlocks the request */
+ nfs_release_request(req);
+}
+
+/*
+ * Find an associated nfs write request, and prepare to flush it out
+ * May return an error if the user signalled nfs_wait_on_request().
+ */
+static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
+ struct page *page)
+{
+ struct nfs_page *req;
+ int ret = 0; /* returned as-is when the page has no request */
+
+ req = nfs_lock_and_join_requests(page);
+ if (!req)
+ goto out;
+ ret = PTR_ERR(req); /* only meaningful when IS_ERR(req); reassigned below otherwise */
+ if (IS_ERR(req))
+ goto out;
+
+ nfs_set_page_writeback(page);
+ WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
+
+ /* If there is a fatal error that covers this write, just exit */
+ ret = pgio->pg_error;
+ if (nfs_error_is_fatal_on_server(ret))
+ goto out_launder;
+
+ ret = 0;
+ if (!nfs_pageio_add_request(pgio, req)) {
+ ret = pgio->pg_error;
+ /*
+ * Remove the problematic req upon fatal errors on the server
+ */
+ if (nfs_error_is_fatal(ret)) {
+ if (nfs_error_is_fatal_on_server(ret))
+ goto out_launder;
+ } else
+ ret = -EAGAIN; /* non-fatal: nfs_do_writepage() redirties the page on -EAGAIN */
+ nfs_redirty_request(req);
+ pgio->pg_error = 0; /* reset the descriptor error now that it has been captured in ret */
+ } else
+ nfs_add_stats(page_file_mapping(page)->host,
+ NFSIOS_WRITEPAGES, 1);
+out:
+ return ret;
+out_launder:
+ nfs_write_error(req, ret);
+ return 0; /* the error was recorded on the mapping by nfs_write_error(); the caller sees success */
+}
+
+/*
+ * Flush a single page through @pgio.
+ *
+ * When nfs_page_async_flush() returns -EAGAIN the page is redirtied for
+ * writeback and AOP_WRITEPAGE_ACTIVATE is returned instead; every other
+ * result is passed through unchanged.
+ */
+static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
+			    struct nfs_pageio_descriptor *pgio)
+{
+	int status;
+
+	nfs_pageio_cond_complete(pgio, page_index(page));
+	status = nfs_page_async_flush(pgio, page);
+	if (status != -EAGAIN)
+		return status;
+
+	redirty_page_for_writepage(wbc, page);
+	return AOP_WRITEPAGE_ACTIVATE;
+}
+
+/*
+ * Write an mmapped page to the server.
+ */
+static int nfs_writepage_locked(struct page *page,
+				struct writeback_control *wbc)
+{
+	struct inode *host = page_file_mapping(page)->host;
+	struct nfs_pageio_descriptor desc;
+	int status;
+
+	nfs_inc_stats(host, NFSIOS_VFSWRITEPAGE);
+	nfs_pageio_init_write(&desc, host, 0, false,
+			      &nfs_async_write_completion_ops);
+	status = nfs_do_writepage(page, wbc, &desc);
+	/* Clear the descriptor's error before completing the I/O */
+	desc.pg_error = 0;
+	nfs_pageio_complete(&desc);
+	return status;
+}
+
+/*
+ * Write out @page via nfs_writepage_locked(). The page is unlocked here
+ * unless the result is AOP_WRITEPAGE_ACTIVATE.
+ */
+int nfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int status = nfs_writepage_locked(page, wbc);
+
+	if (status == AOP_WRITEPAGE_ACTIVATE)
+		return status;
+
+	unlock_page(page);
+	return status;
+}
+
+/*
+ * Per-page callback handed to write_cache_pages(); @data is the pageio
+ * descriptor. The page is unlocked here unless nfs_do_writepage() returns
+ * AOP_WRITEPAGE_ACTIVATE.
+ */
+static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data)
+{
+	int status = nfs_do_writepage(page, wbc, data);
+
+	if (status == AOP_WRITEPAGE_ACTIVATE)
+		return status;
+
+	unlock_page(page);
+	return status;
+}
+
+/*
+ * I/O completion hook: calls nfs_commit_inode() on @inode with a second
+ * argument of 0.
+ */
+static void nfs_io_completion_commit(void *inode)
+{
+	struct inode *target = inode;
+
+	nfs_commit_inode(target, 0);
+}
+
+/*
+ * Write back the dirty pages of @mapping via write_cache_pages().
+ *
+ * Returns the negative error from write_cache_pages(), or 0 for any
+ * non-negative result.
+ */
+int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	struct inode *host = mapping->host;
+	struct nfs_io_completion *completion;
+	struct nfs_pageio_descriptor desc;
+	int status;
+
+	nfs_inc_stats(host, NFSIOS_VFSWRITEPAGES);
+
+	/* An allocation failure is tolerated: completion just stays NULL */
+	completion = nfs_io_completion_alloc(GFP_KERNEL);
+	if (completion != NULL)
+		nfs_io_completion_init(completion, nfs_io_completion_commit,
+				       host);
+
+	nfs_pageio_init_write(&desc, host, wb_priority(wbc), false,
+			      &nfs_async_write_completion_ops);
+	desc.pg_io_completion = completion;
+	status = write_cache_pages(mapping, wbc, nfs_writepages_callback,
+				   &desc);
+	desc.pg_error = 0;
+	nfs_pageio_complete(&desc);
+	nfs_io_completion_put(completion);
+
+	return status < 0 ? status : 0;
+}
+
+/*
+ * Insert a write request into an inode
+ *
+ * Locks @req, attaches it to its page through the page private field
+ * (skipped for swap cache pages), increments the inode's nrequests
+ * counter, sets PG_INODE_REF and takes an additional wb_kref reference.
+ * If the inode had no writebacks and a write delegation is held, the raw
+ * inode version is bumped first.
+ */
+static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
+{
+ struct address_space *mapping = page_file_mapping(req->wb_page);
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ WARN_ON_ONCE(req->wb_this_page != req);
+
+ /* Lock the request! */
+ nfs_lock_request(req);
+
+ /*
+ * Swap-space should not get truncated. Hence no need to plug the race
+ * with invalidate/truncate.
+ */
+ spin_lock(&mapping->private_lock);
+ if (!nfs_have_writebacks(inode) &&
+ NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
+ inode_inc_iversion_raw(inode);
+ if (likely(!PageSwapCache(req->wb_page))) {
+ set_bit(PG_MAPPED, &req->wb_flags);
+ SetPagePrivate(req->wb_page);
+ set_page_private(req->wb_page, (unsigned long)req); /* page private now points at req */
+ }
+ spin_unlock(&mapping->private_lock);
+ atomic_long_inc(&nfsi->nrequests);
+ /* this is a head request for a page group - mark it as having an
+ * extra reference so sub groups can follow suit.
+ * This flag also informs pgio layer when to bump nrequests when
+ * adding subrequests. */
+ WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags));
+ kref_get(&req->wb_kref); /* the reference that PG_INODE_REF stands for */
+}
+
+/*
+ * Remove a write request from an inode
+ *
+ * When nfs_page_group_sync_on_bit(req, PG_REMOVE) returns true, the head
+ * request's page is detached from it (page private and PG_MAPPED are
+ * cleared) under mapping->private_lock, except when the head has no page
+ * or the page is in the swap cache. Independently of that, if @req has
+ * PG_INODE_REF set, the flag is cleared, one request reference is released
+ * and the inode's nrequests counter is decremented.
+ */
+static void nfs_inode_remove_request(struct nfs_page *req)
+{
+ struct address_space *mapping = page_file_mapping(req->wb_page);
+ struct inode *inode = mapping->host;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_page *head;
+
+ if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
+ head = req->wb_head;
+
+ spin_lock(&mapping->private_lock);
+ if (likely(head->wb_page && !PageSwapCache(head->wb_page))) {
+ set_page_private(head->wb_page, 0);
+ ClearPagePrivate(head->wb_page);
+ clear_bit(PG_MAPPED, &head->wb_flags);
+ }
+ spin_unlock(&mapping->private_lock);
+ }
+
+ if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
+ nfs_release_request(req); /* drops the reference that PG_INODE_REF stood for */
+ atomic_long_dec(&nfsi->nrequests);
+ }
+}
+
+/* Mark the page behind @req dirty; a request without a page is left alone. */
+static void
+nfs_mark_request_dirty(struct nfs_page *req)
+{
+	struct page *page = req->wb_page;
+
+	if (!page)
+		return;
+	__set_page_dirty_nobuffers(page);
+}
+
+/*
+ * nfs_page_search_commits_for_head_request_locked
+ *
+ * Search through commit lists on @inode for the head request for @page.
+ * Must be called while holding the inode (which is cinfo) lock.
+ *
+ * Returns the head request if found, or NULL if not found.
+ */
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+						 struct page *page)
+{
+	struct inode *inode = &nfsi->vfs_inode;
+	struct nfs_commit_info cinfo;
+	struct nfs_page *hit, *cursor, *next;
+
+	nfs_init_cinfo_from_inode(&cinfo, inode);
+
+	/* The pnfs commit lists are consulted first */
+	hit = pnfs_search_commit_reqs(inode, &cinfo, page);
+	if (hit != NULL)
+		return hit->wb_head;
+
+	/* Fall back to a linear walk of the MDS commit list */
+	list_for_each_entry_safe(cursor, next, &cinfo.mds->list, wb_list) {
+		if (cursor->wb_page != page)
+			continue;
+		return cursor->wb_head;
+	}
+
+	return NULL;
+}
+
+/**
+ * nfs_request_add_commit_list_locked - add request to a commit list
+ * @req: pointer to a struct nfs_page
+ * @dst: commit list head
+ * @cinfo: holds list lock and accounting info
+ *
+ * This sets the PG_CLEAN bit, updates the cinfo count of
+ * number of outstanding requests requiring a commit as well as
+ * the MM page stats.
+ *
+ * The caller must hold NFS_I(cinfo->inode)->commit_mutex, and the
+ * nfs_page lock.
+ */
+void
+nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
+				   struct nfs_commit_info *cinfo)
+{
+	struct nfs_mds_commit_info *mds_info = cinfo->mds;
+
+	/* Flag the request, queue it on @dst, then account for it */
+	set_bit(PG_CLEAN, &req->wb_flags);
+	nfs_list_add_request(req, dst);
+	atomic_long_inc(&mds_info->ncommit);
+}
+EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
+
+/**
+ * nfs_request_add_commit_list - add request to a commit list
+ * @req: pointer to a struct nfs_page
+ * @cinfo: holds list lock and accounting info
+ *
+ * This sets the PG_CLEAN bit, updates the cinfo count of
+ * number of outstanding requests requiring a commit as well as
+ * the MM page stats.
+ *
+ * The caller must _not_ hold NFS_I(cinfo->inode)->commit_mutex, but must be
+ * holding the nfs_page lock.
+ */
+void
+nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo)
+{
+	struct nfs_inode *nfsi = NFS_I(cinfo->inode);
+
+	/* The list manipulation itself happens under commit_mutex */
+	mutex_lock(&nfsi->commit_mutex);
+	nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo);
+	mutex_unlock(&nfsi->commit_mutex);
+
+	if (!req->wb_page)
+		return;
+	nfs_mark_page_unstable(req->wb_page, cinfo);
+}
+EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
+
+/**
+ * nfs_request_remove_commit_list - Remove request from a commit list
+ * @req: pointer to a nfs_page
+ * @cinfo: holds list lock and accounting info
+ *
+ * This clears the PG_CLEAN bit, and updates the cinfo's count of
+ * number of outstanding requests requiring a commit
+ * It does not update the MM page stats.
+ *
+ * The caller _must_ hold NFS_I(cinfo->inode)->commit_mutex and the
+ * nfs_page lock.
+ */
+void
+nfs_request_remove_commit_list(struct nfs_page *req,
+			       struct nfs_commit_info *cinfo)
+{
+	/* Only a request that still carried PG_CLEAN is unlinked */
+	if (test_and_clear_bit(PG_CLEAN, &req->wb_flags)) {
+		nfs_list_remove_request(req);
+		atomic_long_dec(&cinfo->mds->ncommit);
+	}
+}
+EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list);
+
+/*
+ * Initialise @cinfo from @inode alone: the MDS commit info comes from the
+ * nfs_inode, the DS info from pnfs_get_ds_info(), no direct request is
+ * attached, and nfs_commit_completion_ops is used for completion.
+ */
+static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
+				      struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	cinfo->inode = inode;
+	cinfo->mds = &nfsi->commit_info;
+	cinfo->ds = pnfs_get_ds_info(inode);
+	cinfo->dreq = NULL;
+	cinfo->completion_ops = &nfs_commit_completion_ops;
+}
+
+/*
+ * Initialise @cinfo from @dreq when one is supplied, otherwise from
+ * @inode.
+ */
+void nfs_init_cinfo(struct nfs_commit_info *cinfo,
+		    struct inode *inode,
+		    struct nfs_direct_req *dreq)
+{
+	if (!dreq) {
+		nfs_init_cinfo_from_inode(cinfo, inode);
+		return;
+	}
+	nfs_init_cinfo_from_dreq(cinfo, dreq);
+}
+EXPORT_SYMBOL_GPL(nfs_init_cinfo);
+
+/*
+ * Add a request to the inode's commit list.
+ */
+void
+nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
+			struct nfs_commit_info *cinfo, u32 ds_commit_idx)
+{
+	/* Use the inode's own list only if pnfs did not take the request */
+	if (!pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx))
+		nfs_request_add_commit_list(req, cinfo);
+}
+
+/*
+ * Decrement the NR_WRITEBACK node counter for @page and the WB_WRITEBACK
+ * statistic of the backing device of the page's inode.
+ */
+static void
+nfs_clear_page_commit(struct page *page)
+{
+	struct inode *host;
+
+	dec_node_page_state(page, NR_WRITEBACK);
+	host = page_file_mapping(page)->host;
+	dec_wb_stat(&inode_to_bdi(host)->wb, WB_WRITEBACK);
+}
+
+/* Called holding the request lock on @req */
+static void nfs_clear_request_commit(struct nfs_commit_info *cinfo,
+				     struct nfs_page *req)
+{
+	struct nfs_open_context *ctx;
+	struct inode *inode;
+
+	/* Nothing to undo unless the request is flagged PG_CLEAN */
+	if (!test_bit(PG_CLEAN, &req->wb_flags))
+		return;
+
+	ctx = nfs_req_openctx(req);
+	inode = d_inode(ctx->dentry);
+
+	mutex_lock(&NFS_I(inode)->commit_mutex);
+	/* Let pnfs try first; otherwise unlink from the regular commit list */
+	if (!pnfs_clear_request_commit(req, cinfo))
+		nfs_request_remove_commit_list(req, cinfo);
+	mutex_unlock(&NFS_I(inode)->commit_mutex);
+
+	nfs_clear_page_commit(req->wb_page);
+}
+
+/*
+ * Decide from the write verifier in @hdr whether a commit is needed.
+ *
+ * NFS_DATA_SYNC: non-zero only when there is no layout segment.
+ * Otherwise: non-zero unless the server reported NFS_FILE_SYNC.
+ */
+int nfs_write_need_commit(struct nfs_pgio_header *hdr)
+{
+	if (hdr->verf.committed != NFS_DATA_SYNC)
+		return hdr->verf.committed != NFS_FILE_SYNC;
+
+	return hdr->lseg == NULL;
+}
+
+/* init_hdr hook: take a reference on the header's I/O completion. */
+static void nfs_async_write_init(struct nfs_pgio_header *hdr)
+{
+	struct nfs_io_completion *completion = hdr->io_completion;
+
+	nfs_io_completion_get(completion);
+}
+
+/*
+ * Completion hook for a write header.
+ *
+ * Unless NFS_IOHDR_REDO is set, each request on hdr->pages is taken off
+ * the list and then either:
+ *  - has the header's error recorded and is removed from the inode, when
+ *    NFS_IOHDR_ERROR is set and the request lies beyond hdr->good_bytes;
+ *  - is queued for commit, when nfs_write_need_commit() says so; or
+ *  - is simply removed from the inode.
+ * In all three cases page writeback is ended and a request reference is
+ * released. Finally the I/O completion reference is dropped and the
+ * header is released.
+ */
+static void nfs_write_completion(struct nfs_pgio_header *hdr)
+{
+	struct nfs_commit_info cinfo;
+	unsigned long done_bytes = 0;
+
+	if (!test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+		nfs_init_cinfo_from_inode(&cinfo, hdr->inode);
+
+		while (!list_empty(&hdr->pages)) {
+			struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+
+			done_bytes += req->wb_bytes;
+			nfs_list_remove_request(req);
+
+			if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
+			    (hdr->good_bytes < done_bytes)) {
+				trace_nfs_comp_error(req, hdr->error);
+				nfs_mapping_set_error(req->wb_page, hdr->error);
+				nfs_inode_remove_request(req);
+			} else if (nfs_write_need_commit(hdr)) {
+				/* Reset wb_nio, since the write was successful. */
+				req->wb_nio = 0;
+				memcpy(&req->wb_verf, &hdr->verf.verifier,
+				       sizeof(req->wb_verf));
+				nfs_mark_request_commit(req, hdr->lseg, &cinfo,
+							hdr->pgio_mirror_idx);
+			} else {
+				nfs_inode_remove_request(req);
+			}
+
+			nfs_end_page_writeback(req);
+			nfs_release_request(req);
+		}
+	}
+
+	nfs_io_completion_put(hdr->io_completion);
+	hdr->release(hdr);
+}
+
+/* Return the current value of the MDS ncommit counter in @cinfo. */
+unsigned long
+nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
+{
+	struct nfs_mds_commit_info *mds_info = cinfo->mds;
+
+	return atomic_long_read(&mds_info->ncommit);
+}
+
+/* NFS_I(cinfo->inode)->commit_mutex held by caller */
+int
+nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
+		     struct nfs_commit_info *cinfo, int max)
+{
+	struct nfs_page *req, *next;
+	int moved = 0;
+
+	list_for_each_entry_safe(req, next, src, wb_list) {
+		kref_get(&req->wb_kref);
+		if (nfs_lock_request(req)) {
+			/* Locked: move the request from @src to @dst */
+			nfs_request_remove_commit_list(req, cinfo);
+			clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+			nfs_list_add_request(req, dst);
+			moved++;
+			/* The @max limit is not applied to direct requests */
+			if ((moved == max) && !cinfo->dreq)
+				break;
+			cond_resched();
+		} else {
+			/* Could not lock it: drop our reference and skip */
+			nfs_release_request(req);
+		}
+	}
+	return moved;
+}
+EXPORT_SYMBOL_GPL(nfs_scan_commit_list);
+
+/*
+ * nfs_scan_commit - Scan an inode for commit requests
+ * @inode: NFS inode to scan
+ * @dst: mds destination list
+ * @cinfo: mds and ds lists of reqs ready to commit
+ *
+ * Moves requests from the inode's 'commit' request list.
+ * The requests are *not* checked to ensure that they form a contiguous set.
+ */
+int
+nfs_scan_commit(struct inode *inode, struct list_head *dst,
+		struct nfs_commit_info *cinfo)
+{
+	int moved = 0;
+
+	/* Unlocked fast path: nothing is waiting for a commit */
+	if (atomic_long_read(&cinfo->mds->ncommit) == 0)
+		return 0;
+
+	mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+	/* Re-check now that commit_mutex is held */
+	if (atomic_long_read(&cinfo->mds->ncommit) > 0) {
+		moved = nfs_scan_commit_list(&cinfo->mds->list, dst,
+					     cinfo, INT_MAX);
+		moved += pnfs_scan_commit_lists(inode, cinfo, INT_MAX - moved);
+	}
+	mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+
+	return moved;
+}
+
+/*
+ * Search for an existing write request, and attempt to update
+ * it to reflect a new dirty region on a given page.
+ *
+ * If the attempt fails, then the existing request is flushed out
+ * to disk.
+ */
+static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
+						  struct page *page,
+						  unsigned int offset,
+						  unsigned int bytes)
+{
+	unsigned int new_end = offset + bytes;
+	unsigned int old_end;
+	struct nfs_page *req;
+	int error;
+
+	req = nfs_lock_and_join_requests(page);
+	if (IS_ERR_OR_NULL(req))
+		return req;
+
+	old_end = req->wb_offset + req->wb_bytes;
+
+	/*
+	 * Non-contiguous regions cannot be merged. Requests with the wrong
+	 * owner have already been flushed by nfs_flush_incompatible().
+	 */
+	if (offset > old_end || new_end < req->wb_offset) {
+		/*
+		 * The request is marked dirty because
+		 * nfs_lock_and_join_requests() cannot preserve commit
+		 * flags, so the write has to be replayed.
+		 */
+		nfs_mark_request_dirty(req);
+		nfs_unlock_and_release_request(req);
+		error = nfs_wb_page(inode, page);
+		return (error < 0) ? ERR_PTR(error) : NULL;
+	}
+
+	/* The regions touch or overlap: grow the request to cover both */
+	if (offset < req->wb_offset) {
+		req->wb_offset = offset;
+		req->wb_pgbase = offset;
+	}
+	req->wb_bytes = max(new_end, old_end) - req->wb_offset;
+	req->wb_nio = 0;
+	return req;
+}
+
+/*
+ * Try to update an existing write request, or create one if there is none.
+ *
+ * Note: Should always be called with the Page Lock held to prevent races
+ * if we have to add a new request. Also assumes that the caller has
+ * already called nfs_flush_incompatible() if necessary.
+ */
+static struct nfs_page *nfs_setup_write_request(struct nfs_open_context *ctx,
+		struct page *page, unsigned int offset, unsigned int bytes)
+{
+	struct inode *host = page_file_mapping(page)->host;
+	struct nfs_page *req;
+
+	/* An updated request or an ERR_PTR is returned as it is */
+	req = nfs_try_to_update_request(host, page, offset, bytes);
+	if (req != NULL)
+		return req;
+
+	/* Nothing to update: create a request and attach it to the inode */
+	req = nfs_create_request(ctx, page, offset, bytes);
+	if (!IS_ERR(req))
+		nfs_inode_add_request(host, req);
+	return req;
+}
+
+/*
+ * Obtain a write request covering @count bytes at @offset in @page, then
+ * grow the file, mark the request up to date and dirty, and unlock and
+ * release it.
+ *
+ * Returns 0, or the PTR_ERR() value from nfs_setup_write_request().
+ */
+static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
+			       unsigned int offset, unsigned int count)
+{
+	struct nfs_page *wreq = nfs_setup_write_request(ctx, page, offset,
+							count);
+
+	if (IS_ERR(wreq))
+		return PTR_ERR(wreq);
+
+	/* Update file length */
+	nfs_grow_file(page, offset, count);
+	nfs_mark_uptodate(wreq);
+	nfs_mark_request_dirty(wreq);
+	nfs_unlock_and_release_request(wreq);
+	return 0;
+}
+
+/*
+ * Flush out requests on @page that are incompatible with a write through
+ * @file.
+ *
+ * Returns 0 when the page has no head request or the head request found
+ * does not need flushing; otherwise the first non-zero status returned by
+ * nfs_wb_page().
+ */
+int nfs_flush_incompatible(struct file *file, struct page *page)
+{
+	struct nfs_open_context *ctx = nfs_file_open_context(file);
+	struct file_lock_context *flctx = file_inode(file)->i_flctx;
+	struct nfs_lock_context *l_ctx;
+	struct nfs_page *req;
+	int do_flush, status;
+
+	/*
+	 * Look for a request on this page. One that belongs to another
+	 * file is flushed before anything is copied into the page, because
+	 * NFSv2 lacks an ACCESS-type call. A request left over from a
+	 * dropped page is treated the same way.
+	 */
+	for (;;) {
+		req = nfs_page_find_head_request(page);
+		if (req == NULL)
+			return 0;
+
+		l_ctx = req->wb_lock_context;
+		do_flush = req->wb_page != page ||
+			!nfs_match_open_context(nfs_req_openctx(req), ctx);
+		/* With file locks present the lock owner has to match too */
+		if (l_ctx && flctx &&
+		    (!list_empty_careful(&flctx->flc_posix) ||
+		     !list_empty_careful(&flctx->flc_flock)))
+			do_flush |= l_ctx->lockowner != current->files;
+		nfs_release_request(req);
+
+		if (!do_flush)
+			return 0;
+
+		status = nfs_wb_page(page_file_mapping(page)->host, page);
+		if (status != 0)
+			return status;
+	}
+}
+
+/*
+ * Avoid buffered writes when an open context credential's key would
+ * expire soon.
+ *
+ * Returns -EACCES if the key will expire within RPC_KEY_EXPIRE_FAIL.
+ *
+ * Return 0 and set a credential flag which triggers the inode to flush
+ * and performs NFS_FILE_SYNC writes if the key will expire within
+ * RPC_KEY_EXPIRE_TIMEO.
+ */
+int
+nfs_key_timeout_notify(struct file *filp, struct inode *inode)
+{
+	struct nfs_open_context *ctx = nfs_file_open_context(filp);
+
+	if (!nfs_ctx_key_to_expire(ctx, inode))
+		return 0;
+	/* ll_cred is inspected only after the expiry check has run */
+	if (ctx->ll_cred)
+		return 0;
+
+	/* Already expired! */
+	return -EACCES;
+}
+
+/*
+ * Test if the open context credential key is marked to expire soon.
+ */
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode)
+{
+	struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
+	struct auth_cred acred = {
+		.cred = ctx->cred,
+	};
+	struct rpc_cred *cred = ctx->ll_cred;
+
+	/* Drop a cached credential that no longer matches the context */
+	if (cred != NULL && !cred->cr_ops->crmatch(&acred, cred, 0)) {
+		put_rpccred(cred);
+		ctx->ll_cred = NULL;
+		cred = NULL;
+	}
+	if (cred == NULL)
+		cred = auth->au_ops->lookup_cred(auth, &acred, 0);
+
+	/* Without a usable credential the key is reported as expiring */
+	if (IS_ERR_OR_NULL(cred))
+		return true;
+
+	ctx->ll_cred = cred;
+	if (!cred->cr_ops->crkey_timeout)
+		return false;
+	return cred->cr_ops->crkey_timeout(cred) != 0;
+}
+
+/*
+ * If the page cache is marked as unsafe or invalid, then we can't rely on
+ * the PageUptodate() flag. In this case, we will need to turn off
+ * write optimisations that depend on the page contents being correct.
+ */
+static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	/* The first two checks are skipped when attributes are delegated */
+	if (!nfs_have_delegated_attributes(inode)) {
+		if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
+			return false;
+		smp_rmb();
+		if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags))
+			return false;
+	}
+
+	if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+		return false;
+
+	return PageUptodate(page) != 0;
+}
+
+/* True when @fl is a write lock spanning offsets 0 through OFFSET_MAX. */
+static bool
+is_whole_file_wrlock(struct file_lock *fl)
+{
+	if (fl->fl_start != 0)
+		return false;
+	if (fl->fl_end != OFFSET_MAX)
+		return false;
+	return fl->fl_type == F_WRLCK;
+}
+
+/* If we know the page is up to date, and we're not using byte range locks (or
+ * if we have the whole file locked for writing), it may be more efficient to
+ * extend the write to cover the entire page in order to avoid fragmentation
+ * inefficiencies.
+ *
+ * If the file is opened for synchronous writes then we can just skip the rest
+ * of the checks.
+ */
+static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
+{
+	struct file_lock_context *flctx = inode->i_flctx;
+	struct file_lock *first;
+	int extend = 0;
+
+	if (file->f_flags & O_DSYNC)
+		return 0;
+	if (!nfs_write_pageuptodate(page, inode))
+		return 0;
+	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
+		return 1;
+
+	/* With no locks on the inode at all the write may be extended */
+	if (!flctx)
+		return 1;
+	if (list_empty_careful(&flctx->flc_flock) &&
+	    list_empty_careful(&flctx->flc_posix))
+		return 1;
+
+	/* Otherwise only a whole-file write lock will do */
+	spin_lock(&flctx->flc_lock);
+	if (!list_empty(&flctx->flc_posix)) {
+		first = list_first_entry(&flctx->flc_posix, struct file_lock,
+					 fl_list);
+		extend = is_whole_file_wrlock(first) ? 1 : 0;
+	} else if (!list_empty(&flctx->flc_flock)) {
+		first = list_first_entry(&flctx->flc_flock, struct file_lock,
+					 fl_list);
+		extend = (first->fl_type == F_WRLCK) ? 1 : 0;
+	}
+	spin_unlock(&flctx->flc_lock);
+
+	return extend;
+}
+
+/*
+ * Update and possibly write a cached page of an NFS file.
+ *
+ * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad
+ * things with a page scheduled for an RPC call (e.g. invalidate it).
+ */
+int nfs_updatepage(struct file *file, struct page *page,
+		   unsigned int offset, unsigned int count)
+{
+	struct nfs_open_context *ctx = nfs_file_open_context(file);
+	struct address_space *mapping = page_file_mapping(page);
+	struct inode *inode = mapping->host;
+	int status = 0;
+
+	nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
+
+	dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n",
+		file, count, (long long)(page_file_offset(page) + offset));
+
+	/* A zero-length update only produces the debug output */
+	if (count != 0) {
+		/* Where allowed, widen the write to start at offset 0 */
+		if (nfs_can_extend_write(file, page, inode)) {
+			count = max(count + offset, nfs_page_length(page));
+			offset = 0;
+		}
+
+		status = nfs_writepage_setup(ctx, page, offset, count);
+		if (status >= 0)
+			__set_page_dirty_nobuffers(page);
+		else
+			nfs_set_pageerror(mapping);
+	}
+
+	dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
+		status, (long long)i_size_read(inode));
+	return status;
+}
+
+/*
+ * Map the FLUSH_HIGHPRI / FLUSH_LOWPRI bits of @how to an RPC priority.
+ * Any other combination of the two bits yields RPC_PRIORITY_NORMAL.
+ */
+static int flush_task_priority(int how)
+{
+	int prio_bits = how & (FLUSH_HIGHPRI|FLUSH_LOWPRI);
+
+	if (prio_bits == FLUSH_HIGHPRI)
+		return RPC_PRIORITY_HIGH;
+	if (prio_bits == FLUSH_LOWPRI)
+		return RPC_PRIORITY_LOW;
+	return RPC_PRIORITY_NORMAL;
+}
+
+/*
+ * Prepare a WRITE for @hdr: set the task priority derived from @how, call
+ * the protocol's ->write_setup() and emit the initiate-write tracepoint.
+ */
+static void nfs_initiate_write(struct nfs_pgio_header *hdr,
+			       struct rpc_message *msg,
+			       const struct nfs_rpc_ops *rpc_ops,
+			       struct rpc_task_setup *task_setup_data, int how)
+{
+	task_setup_data->priority = flush_task_priority(how);
+	rpc_ops->write_setup(hdr, msg, &task_setup_data->rpc_client);
+	trace_nfs_initiate_write(hdr);
+}
+
+/* If a nfs_flush_* function fails, it should remove reqs from @head and
+ * call this on each, which will prepare them to be retried on next
+ * writeback using standard nfs.
+ */
+static void nfs_redirty_request(struct nfs_page *req)
+{
+	struct nfs_open_context *ctx;
+
+	/* Bump the transmission count */
+	req->wb_nio += 1;
+	nfs_mark_request_dirty(req);
+
+	/* Flag the open context as having writes to resend */
+	ctx = nfs_req_openctx(req);
+	set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
+
+	nfs_end_page_writeback(req);
+	nfs_release_request(req);
+}
+
+/*
+ * error_cleanup hook: empty @head, discarding each request through
+ * nfs_write_error() when @error is fatal on the server and redirtying it
+ * otherwise.
+ */
+static void nfs_async_write_error(struct list_head *head, int error)
+{
+	while (!list_empty(head)) {
+		struct nfs_page *req = nfs_list_entry(head->next);
+
+		nfs_list_remove_request(req);
+		if (!nfs_error_is_fatal_on_server(error)) {
+			nfs_redirty_request(req);
+			continue;
+		}
+		nfs_write_error(req, error);
+	}
+}
+
+/*
+ * reschedule_io hook: run the header's requests through
+ * nfs_async_write_error() with error 0, then start writeback on the byte
+ * range the header covered.
+ */
+static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr)
+{
+	struct nfs_pgio_args *args = &hdr->args;
+
+	nfs_async_write_error(&hdr->pages, 0);
+	filemap_fdatawrite_range(hdr->inode->i_mapping, args->offset,
+				 args->offset + args->count - 1);
+}
+
+/*
+ * Completion callbacks that the writepage paths in this file pass to
+ * nfs_pageio_init_write().
+ */
+static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
+	.init_hdr	= nfs_async_write_init,
+	.completion	= nfs_write_completion,
+	.error_cleanup	= nfs_async_write_error,
+	.reschedule_io	= nfs_async_write_reschedule_io,
+};
+
+/*
+ * Initialise @pgio for writes to @inode, using the server's wsize as the
+ * block size. With CONFIG_NFS_V4_1, the current pNFS layout driver's write
+ * ops replace the default ops unless @force_mds is set.
+ */
+void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
+			   struct inode *inode, int ioflags, bool force_mds,
+			   const struct nfs_pgio_completion_ops *compl_ops)
+{
+	const struct nfs_pageio_ops *write_ops = &nfs_pgio_rw_ops;
+	struct nfs_server *nfss = NFS_SERVER(inode);
+
+#ifdef CONFIG_NFS_V4_1
+	if (!force_mds && nfss->pnfs_curr_ld)
+		write_ops = nfss->pnfs_curr_ld->pg_write_ops;
+#endif
+	nfs_pageio_init(pgio, inode, write_ops, compl_ops, &nfs_rw_write_ops,
+			nfss->wsize, ioflags);
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_init_write);
+
+/*
+ * Switch @pgio back to the default nfs_pgio_rw_ops: run the current ops'
+ * pg_cleanup hook if there is one, stop mirroring, and set the first
+ * mirror's block size to the server's wsize.
+ */
+void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
+{
+	const struct nfs_pageio_ops *cur_ops = pgio->pg_ops;
+
+	if (cur_ops && cur_ops->pg_cleanup)
+		cur_ops->pg_cleanup(pgio);
+
+	pgio->pg_ops = &nfs_pgio_rw_ops;
+
+	nfs_pageio_stop_mirroring(pgio);
+
+	/* The mirror array is only looked at after mirroring has stopped */
+	pgio->pg_mirrors[0].pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
+}
+EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
+
+
+/*
+ * RPC prepare hook for COMMIT: forwards to the protocol-specific
+ * ->commit_rpc_prepare() of the inode stored in the commit data.
+ */
+void nfs_commit_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs_commit_data *cdata = calldata;
+	struct inode *inode = cdata->inode;
+
+	NFS_PROTO(inode)->commit_rpc_prepare(task, cdata);
+}
+
+/*
+ * Special version of should_remove_suid() that ignores capabilities.
+ */
+static int nfs_should_remove_suid(const struct inode *inode)
+{
+	umode_t mode = inode->i_mode;
+	int kill = 0;
+
+	/* Anything that is not a regular file always yields 0 */
+	if (!S_ISREG(mode))
+		return 0;
+
+	/* suid always must be killed */
+	if (unlikely(mode & S_ISUID))
+		kill |= ATTR_KILL_SUID;
+
+	/*
+	 * sgid counts only together with group-exec; without exec bits it
+	 * is just a mandatory locking mark and is left alone.
+	 */
+	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+		kill |= ATTR_KILL_SGID;
+
+	return kill;
+}
+
+/*
+ * Adjust the size information in @fattr after a WRITE.
+ *
+ * If @fattr carries no size, the end of the written range (args.offset +
+ * res.count) is stored as its size. A size smaller than the inode's
+ * current i_size is marked invalid. If the size equals the end of the
+ * written range, an attribute barrier is set and the size is marked valid.
+ */
+static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr,
+				       struct nfs_fattr *fattr)
+{
+	u64 written_end = hdr->args.offset + hdr->res.count;
+
+	if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
+		fattr->size = written_end;
+
+	if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) {
+		fattr->valid &= ~NFS_ATTR_FATTR_SIZE;
+		return;
+	}
+
+	if (fattr->size == written_end) {
+		/* Set attribute barrier */
+		nfs_fattr_set_barrier(fattr);
+		/* ...and update size */
+		fattr->valid |= NFS_ATTR_FATTR_SIZE;
+	}
+}
+
+/*
+ * Apply the post-WRITE attributes in @hdr to its inode, with the size
+ * check and the update both done under inode->i_lock.
+ */
+void nfs_writeback_update_inode(struct nfs_pgio_header *hdr)
+{
+	struct inode *inode = hdr->inode;
+
+	spin_lock(&inode->i_lock);
+	nfs_writeback_check_extend(hdr, &hdr->fattr);
+	nfs_post_op_update_inode_force_wcc_locked(inode, &hdr->fattr);
+	spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_writeback_update_inode);
+
+/*
+ * This function is called when the WRITE call is complete.
+ *
+ * Runs the protocol-specific ->write_done() callback and returns its
+ * status if that is non-zero. Otherwise it accounts hdr->res.count under
+ * NFSIOS_SERVERWRITTENBYTES, emits a debug message (at most once per
+ * 300 * HZ jiffies) when the server committed the data less stably than
+ * requested, sets NFS_INO_INVALID_OTHER in cache_validity when
+ * nfs_should_remove_suid() returns non-zero, and returns 0.
+ */
+static int nfs_writeback_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr,
+ struct inode *inode)
+{
+ int status;
+
+ /*
+ * ->write_done will attempt to use post-op attributes to detect
+ * conflicting writes by other clients. A strict interpretation
+ * of close-to-open would allow us to continue caching even if
+ * another writer had changed the file, but some applications
+ * depend on tighter cache coherency when writing.
+ */
+ status = NFS_PROTO(inode)->write_done(task, hdr);
+ if (status != 0)
+ return status;
+
+ nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
+ trace_nfs_writeback_done(task, hdr);
+
+ if (hdr->res.verf->committed < hdr->args.stable &&
+ task->tk_status >= 0) {
+ /* We tried a write call, but the server did not
+ * commit data to stable storage even though we
+ * requested it.
+ * Note: There is a known bug in Tru64 < 5.0 in which
+ * the server reports NFS_DATA_SYNC, but performs
+ * NFS_FILE_SYNC. We therefore implement this checking
+ * as a dprintk() in order to avoid filling syslog.
+ */
+ static unsigned long complain; /* jiffies value after which the next message may be printed */
+
+ /* Note this will print the MDS for a DS write */
+ if (time_before(complain, jiffies)) {
+ dprintk("NFS: faulty NFS server %s:"
+ " (committed = %d) != (stable = %d)\n",
+ NFS_SERVER(inode)->nfs_client->cl_hostname,
+ hdr->res.verf->committed, hdr->args.stable);
+ complain = jiffies + 300 * HZ;
+ }
+ }
+
+ /* Deal with the suid/sgid bit corner case */
+ if (nfs_should_remove_suid(inode)) {
+ spin_lock(&inode->i_lock);
+ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_OTHER;
+ spin_unlock(&inode->i_lock);
+ }
+ return 0;
+}
+
+/*
+ * This function is called when the WRITE call is complete.
+ *
+ * It only acts on short writes (resp->count < argp->count):
+ *  - if the server wrote nothing, a warning is printed (at most once per
+ *    300 * HZ jiffies), -EIO is recorded on the header and in
+ *    task->tk_status, and the function returns;
+ *  - if the task has no tk_ops, hdr->pnfs_error is set to -EAGAIN and the
+ *    function returns;
+ *  - otherwise the arguments are adjusted (advanced past the bytes already
+ *    written, or switched to NFS_FILE_SYNC if the reply was
+ *    NFS_UNSTABLE), the result fields are reset and the RPC call is
+ *    restarted via rpc_restart_call_prepare().
+ */
+static void nfs_writeback_result(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ struct nfs_pgio_args *argp = &hdr->args;
+ struct nfs_pgio_res *resp = &hdr->res;
+
+ if (resp->count < argp->count) {
+ static unsigned long complain; /* jiffies value after which the next warning may be printed */
+
+ /* This is a short write! */
+ nfs_inc_stats(hdr->inode, NFSIOS_SHORTWRITE);
+
+ /* Has the server at least made some progress? */
+ if (resp->count == 0) {
+ if (time_before(complain, jiffies)) {
+ printk(KERN_WARNING
+ "NFS: Server wrote zero bytes, expected %u.\n",
+ argp->count);
+ complain = jiffies + 300 * HZ;
+ }
+ nfs_set_pgio_error(hdr, -EIO, argp->offset);
+ task->tk_status = -EIO;
+ return;
+ }
+
+ /* For non rpc-based layout drivers, retry-through-MDS */
+ if (!task->tk_ops) {
+ hdr->pnfs_error = -EAGAIN;
+ return;
+ }
+
+ /* Was this an NFSv2 write or an NFSv3 stable write? */
+ if (resp->verf->committed != NFS_UNSTABLE) {
+ /* Resend from where the server left off */
+ hdr->mds_offset += resp->count;
+ argp->offset += resp->count;
+ argp->pgbase += resp->count;
+ argp->count -= resp->count;
+ } else {
+ /* Resend as a stable write in order to avoid
+ * headaches in the case of a server crash.
+ */
+ argp->stable = NFS_FILE_SYNC;
+ }
+ resp->count = 0; /* reset the result fields before restarting the call */
+ resp->verf->committed = 0;
+ rpc_restart_call_prepare(task);
+ }
+}
+
+/*
+ * Wait, killably, until cinfo->rpcs_out reads as zero. Returns the result
+ * of wait_var_event_killable().
+ */
+static int wait_on_commit(struct nfs_mds_commit_info *cinfo)
+{
+	return wait_var_event_killable(&cinfo->rpcs_out,
+				       atomic_read(&cinfo->rpcs_out) == 0);
+}
+
+/* Increment the rpcs_out counter of @cinfo. */
+static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
+{
+	atomic_t *outstanding = &cinfo->rpcs_out;
+
+	atomic_inc(outstanding);
+}
+
+/*
+ * Decrement the rpcs_out counter of @cinfo. When it reaches zero, waiters
+ * on the counter are woken and true is returned; otherwise false.
+ */
+bool nfs_commit_end(struct nfs_mds_commit_info *cinfo)
+{
+	if (!atomic_dec_and_test(&cinfo->rpcs_out))
+		return false;
+
+	wake_up_var(&cinfo->rpcs_out);
+	return true;
+}
+
+/* Drop the open context reference held by @data, then free @data. */
+void nfs_commitdata_release(struct nfs_commit_data *data)
+{
+	struct nfs_open_context *ctx = data->context;
+
+	put_nfs_open_context(ctx);
+	nfs_commit_free(data);
+}
+EXPORT_SYMBOL_GPL(nfs_commitdata_release);
+
+/*
+ * Start an asynchronous COMMIT RPC described by @data on @clnt.
+ *
+ * If @how contains FLUSH_SYNC, the call waits for the task to complete
+ * before dropping its task reference. Returns 0, or the PTR_ERR() value
+ * from rpc_run_task().
+ */
+int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
+			const struct nfs_rpc_ops *nfs_ops,
+			const struct rpc_call_ops *call_ops,
+			int how, int flags)
+{
+	struct rpc_message msg = {
+		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
+		.rpc_cred = data->cred,
+	};
+	struct rpc_task_setup setup = {
+		.task = &data->task,
+		.rpc_client = clnt,
+		.rpc_message = &msg,
+		.callback_ops = call_ops,
+		.callback_data = data,
+		.workqueue = nfsiod_workqueue,
+		.flags = RPC_TASK_ASYNC | flags,
+		.priority = flush_task_priority(how),
+	};
+	struct rpc_task *task;
+
+	/* Set up the initial task struct. */
+	nfs_ops->commit_setup(data, &msg, &setup.rpc_client);
+	trace_nfs_initiate_commit(data);
+
+	dprintk("NFS: initiated commit call\n");
+
+	task = rpc_run_task(&setup);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	if (how & FLUSH_SYNC)
+		rpc_wait_for_completion_task(task);
+	rpc_put_task(task);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_initiate_commit);
+
+/*
+ * Return the largest end offset (req_offset() + wb_bytes) of the requests
+ * on @head, or 0 when the list is empty.
+ */
+static loff_t nfs_get_lwb(struct list_head *head)
+{
+	struct nfs_page *req;
+	loff_t lwb = 0;
+
+	list_for_each_entry(req, head, wb_list) {
+		loff_t req_end = req_offset(req) + req->wb_bytes;
+
+		if (req_end > lwb)
+			lwb = req_end;
+	}
+
+	return lwb;
+}
+
+/*
+ * Set up the argument/result storage required for the RPC call.
+ */
+void nfs_init_commit(struct nfs_commit_data *data,
+		     struct list_head *head,
+		     struct pnfs_layout_segment *lseg,
+		     struct nfs_commit_info *cinfo)
+{
+	struct nfs_open_context *open_ctx;
+	struct nfs_page *lead;
+
+	/*
+	 * Set up the RPC argument and reply structs
+	 * NB: take care not to mess about with data->commit et al.
+	 */
+	if (head != NULL)
+		list_splice_init(head, &data->pages);
+
+	/* The first request supplies the open context and the inode */
+	lead = nfs_list_entry(data->pages.next);
+	open_ctx = nfs_req_openctx(lead);
+
+	data->inode = d_inode(open_ctx->dentry);
+	data->cred = open_ctx->cred;
+	data->lseg = lseg;	/* reference transferred */
+	/* only set lwb for pnfs commit */
+	if (lseg != NULL)
+		data->lwb = nfs_get_lwb(&data->pages);
+	data->mds_ops = &nfs_commit_ops;
+	data->completion_ops = cinfo->completion_ops;
+	data->dreq = cinfo->dreq;
+
+	data->args.fh = NFS_FH(data->inode);
+	/* Note: we always request a commit of the entire inode */
+	data->args.offset = 0;
+	data->args.count = 0;
+	data->context = get_nfs_open_context(open_ctx);
+	data->res.fattr = &data->fattr;
+	data->res.verf = &data->verf;
+	nfs_fattr_init(&data->fattr);
+	nfs_commit_begin(cinfo->mds);
+}
+EXPORT_SYMBOL_GPL(nfs_init_commit);
+
+/*
+ * Put every request on @page_list back onto a commit list so the commit
+ * can be retried, then unlock and release each of them. When no direct
+ * request is involved, the page's commit accounting is cleared as well.
+ */
+void nfs_retry_commit(struct list_head *page_list,
+		      struct pnfs_layout_segment *lseg,
+		      struct nfs_commit_info *cinfo,
+		      u32 ds_commit_idx)
+{
+	while (!list_empty(page_list)) {
+		struct nfs_page *req = nfs_list_entry(page_list->next);
+
+		nfs_list_remove_request(req);
+		nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx);
+		if (cinfo->dreq == NULL)
+			nfs_clear_page_commit(req->wb_page);
+		nfs_unlock_and_release_request(req);
+	}
+}
+EXPORT_SYMBOL_GPL(nfs_retry_commit);
+
+/* Mark the page of @req dirty; @cinfo is not used. */
+static void
+nfs_commit_resched_write(struct nfs_commit_info *cinfo,
+			 struct nfs_page *req)
+{
+	struct page *page = req->wb_page;
+
+	__set_page_dirty_nobuffers(page);
+}
+
+/*
+ * Commit dirty pages
+ */
+static int
+nfs_commit_list(struct inode *inode, struct list_head *head, int how,
+		struct nfs_commit_info *cinfo)
+{
+	struct nfs_commit_data *cdata;
+
+	/* another commit raced with us */
+	if (list_empty(head))
+		return 0;
+
+	cdata = nfs_commitdata_alloc();
+	if (cdata != NULL) {
+		/* Set up the argument struct */
+		nfs_init_commit(cdata, head, NULL, cinfo);
+		return nfs_initiate_commit(NFS_CLIENT(inode), cdata,
+					   NFS_PROTO(inode), cdata->mds_ops,
+					   how, RPC_TASK_CRED_NOREF);
+	}
+
+	/* Allocation failed: requeue the requests for a later commit */
+	nfs_retry_commit(head, NULL, cinfo, -1);
+	return -ENOMEM;
+}
+
+/*
+ * COMMIT call returned
+ */
+static void nfs_commit_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs_commit_data *data = calldata;
+
+ dprintk("NFS: %5u nfs_commit_done (status %d)\n",
+ task->tk_pid, task->tk_status);
+
+ /* Call the NFS version-specific code */
+ NFS_PROTO(data->inode)->commit_done(task, data);
+ trace_nfs_commit_done(task, data);
+}
+
+static void nfs_commit_release_pages(struct nfs_commit_data *data)
+{
+ const struct nfs_writeverf *verf = data->res.verf;
+ struct nfs_page *req;
+ int status = data->task.tk_status; /* negative: the COMMIT failed */
+ struct nfs_commit_info cinfo;
+ struct nfs_server *nfss;
+
+ while (!list_empty(&data->pages)) {
+ req = nfs_list_entry(data->pages.next);
+ nfs_list_remove_request(req);
+ if (req->wb_page) /* page-level state is touched only when a page is attached */
+ nfs_clear_page_commit(req->wb_page);
+
+ dprintk("NFS: commit (%s/%llu %d@%lld)",
+ nfs_req_openctx(req)->dentry->d_sb->s_id,
+ (unsigned long long)NFS_FILEID(d_inode(nfs_req_openctx(req)->dentry)),
+ req->wb_bytes,
+ (long long)req_offset(req));
+ if (status < 0) {
+ if (req->wb_page) {
+ trace_nfs_commit_error(req, status);
+ nfs_mapping_set_error(req->wb_page, status); /* record the error on the page's mapping */
+ nfs_inode_remove_request(req);
+ }
+ dprintk_cont(", error = %d\n", status);
+ goto next;
+ }
+
+ /* Okay, COMMIT succeeded, apparently. Check the verifier
+ * returned by the server against all stored verfs. */
+ if (nfs_write_match_verf(verf, req)) {
+ /* We have a match */
+ if (req->wb_page)
+ nfs_inode_remove_request(req);
+ dprintk_cont(" OK\n");
+ goto next;
+ }
+ /* We have a mismatch. Write the page again */
+ dprintk_cont(" mismatch\n");
+ nfs_mark_request_dirty(req);
+ set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags); /* flag the open context for a resend */
+ next:
+ nfs_unlock_and_release_request(req);
+ /* Latency breaker */
+ cond_resched();
+ }
+ nfss = NFS_SERVER(data->inode);
+ if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) /* writeback fell below the "off" threshold */
+ clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC);
+
+ nfs_init_cinfo(&cinfo, data->inode, data->dreq);
+ nfs_commit_end(cinfo.mds); /* presumably balances the nfs_commit_begin() done in nfs_init_commit() */
+}
+
+static void nfs_commit_release(void *calldata)
+{
+ struct nfs_commit_data *data = calldata;
+
+ data->completion_ops->completion(data);
+ nfs_commitdata_release(calldata);
+}
+
+static const struct rpc_call_ops nfs_commit_ops = {
+ .rpc_call_prepare = nfs_commit_prepare,
+ .rpc_call_done = nfs_commit_done,
+ .rpc_release = nfs_commit_release,
+};
+
+static const struct nfs_commit_completion_ops nfs_commit_completion_ops = {
+ .completion = nfs_commit_release_pages,
+ .resched_write = nfs_commit_resched_write,
+};
+
+int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
+ int how, struct nfs_commit_info *cinfo)
+{
+ int status;
+
+ status = pnfs_commit_list(inode, head, how, cinfo); /* give pNFS the first chance */
+ if (status == PNFS_NOT_ATTEMPTED) /* pNFS declined: use the plain commit path */
+ status = nfs_commit_list(inode, head, how, cinfo);
+ return status;
+}
+
+static int __nfs_commit_inode(struct inode *inode, int how,
+ struct writeback_control *wbc)
+{
+ LIST_HEAD(head);
+ struct nfs_commit_info cinfo;
+ int may_wait = how & FLUSH_SYNC; /* remember whether the caller wants to wait */
+ int ret, nscan;
+
+ how &= ~FLUSH_SYNC; /* FLUSH_SYNC is never passed down; waiting happens at the end */
+ nfs_init_cinfo_from_inode(&cinfo, inode);
+ nfs_commit_begin(cinfo.mds);
+ for (;;) {
+ ret = nscan = nfs_scan_commit(inode, &head, &cinfo);
+ if (ret <= 0) /* nothing scanned, or an error */
+ break;
+ ret = nfs_generic_commit_list(inode, &head, how, &cinfo);
+ if (ret < 0)
+ break;
+ ret = 0;
+ if (wbc && wbc->sync_mode == WB_SYNC_NONE) { /* charge the scanned requests to nr_to_write, floor at 0 */
+ if (nscan < wbc->nr_to_write)
+ wbc->nr_to_write -= nscan;
+ else
+ wbc->nr_to_write = 0;
+ }
+ if (nscan < INT_MAX) /* presumably a short scan means nothing is left to collect - confirm */
+ break;
+ cond_resched();
+ }
+ nfs_commit_end(cinfo.mds);
+ if (ret || !may_wait)
+ return ret;
+ return wait_on_commit(cinfo.mds); /* synchronous callers wait here */
+}
+
+int nfs_commit_inode(struct inode *inode, int how)
+{
+ return __nfs_commit_inode(inode, how, NULL);
+}
+EXPORT_SYMBOL_GPL(nfs_commit_inode);
+
+int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ int flags = FLUSH_SYNC; /* default: wait for the COMMIT */
+ int ret = 0;
+
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ /* no commits means nothing needs to be done */
+ if (!atomic_long_read(&nfsi->commit_info.ncommit))
+ goto check_requests_outstanding;
+
+ /* Don't commit yet if this is a non-blocking flush and there
+ * are a lot of outstanding writes for this mapping.
+ */
+ if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))
+ goto out_mark_dirty;
+
+ /* don't wait for the COMMIT response */
+ flags = 0;
+ }
+
+ ret = __nfs_commit_inode(inode, flags, wbc);
+ if (!ret) {
+ if (flags & FLUSH_SYNC) /* __nfs_commit_inode() already waited for the COMMIT */
+ return 0;
+ } else if (atomic_long_read(&nfsi->commit_info.ncommit)) /* failed with requests still awaiting commit */
+ goto out_mark_dirty;
+
+check_requests_outstanding:
+ if (!atomic_read(&nfsi->commit_info.rpcs_out)) /* no commit RPCs in flight: done */
+ return ret;
+out_mark_dirty:
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC); /* presumably so writeback revisits this inode later */
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_write_inode);
+
+/*
+ * Wrapper for filemap_write_and_wait_range()
+ *
+ * Needed for pNFS in order to ensure data becomes visible to the
+ * client.
+ */
+int nfs_filemap_write_and_wait_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend)
+{
+ int ret;
+
+ ret = filemap_write_and_wait_range(mapping, lstart, lend);
+ if (ret == 0)
+ ret = pnfs_sync_inode(mapping->host, true);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_filemap_write_and_wait_range);
+
+/*
+ * Flush the inode: write back and wait on its pages, then COMMIT synchronously.
+ */
+int nfs_wb_all(struct inode *inode)
+{
+ int ret;
+
+ trace_nfs_writeback_inode_enter(inode);
+
+ ret = filemap_write_and_wait(inode->i_mapping);
+ if (ret)
+ goto out;
+ ret = nfs_commit_inode(inode, FLUSH_SYNC);
+ if (ret < 0)
+ goto out;
+ pnfs_sync_inode(inode, true); /* NOTE(review): result discarded here, unlike nfs_filemap_write_and_wait_range() */
+ ret = 0;
+
+out:
+ trace_nfs_writeback_inode_exit(inode, ret);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_wb_all);
+
+int nfs_wb_page_cancel(struct inode *inode, struct page *page)
+{
+ struct nfs_page *req;
+ int ret = 0;
+
+ wait_on_page_writeback(page);
+
+ /* blocking call to cancel all requests and join to a single (head)
+ * request */
+ req = nfs_lock_and_join_requests(page);
+
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ } else if (req) {
+ /* all requests from this page have been cancelled by
+ * nfs_lock_and_join_requests, so just remove the head
+ * request from the inode / page_private pointer and
+ * release it */
+ nfs_inode_remove_request(req);
+ nfs_unlock_and_release_request(req);
+ }
+
+ return ret;
+}
+
+/*
+ * Write back all requests on one page - we do this before reading it.
+ */
+int nfs_wb_page(struct inode *inode, struct page *page)
+{
+ loff_t range_start = page_file_offset(page);
+ loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1); /* inclusive end: exactly this one page */
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = 0,
+ .range_start = range_start,
+ .range_end = range_end,
+ };
+ int ret;
+
+ trace_nfs_writeback_page_enter(inode);
+
+ for (;;) {
+ wait_on_page_writeback(page);
+ if (clear_page_dirty_for_io(page)) { /* page was dirty: write it out */
+ ret = nfs_writepage_locked(page, &wbc);
+ if (ret < 0)
+ goto out_error;
+ continue; /* loop again to wait for that writeback */
+ }
+ ret = 0;
+ if (!PagePrivate(page)) /* PagePrivate clear: stop, no commit needed (presumably no request left on the page) */
+ break;
+ ret = nfs_commit_inode(inode, FLUSH_SYNC);
+ if (ret < 0)
+ goto out_error;
+ }
+out_error: /* also reached on success, with ret == 0 */
+ trace_nfs_writeback_page_exit(inode, ret);
+ return ret;
+}
+
+#ifdef CONFIG_MIGRATION
+int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
+ struct page *page, enum migrate_mode mode)
+{
+ /*
+ * If PagePrivate is set, then the page is currently associated with
+ * an in-progress read or write request. Don't try to migrate it.
+ *
+ * FIXME: we could do this in principle, but we'll need a way to ensure
+ * that we can safely release the inode reference while holding
+ * the page lock.
+ */
+ if (PagePrivate(page))
+ return -EBUSY;
+
+ if (!nfs_fscache_release_page(page, GFP_KERNEL))
+ return -EBUSY;
+
+ return migrate_page(mapping, newpage, page, mode);
+}
+#endif
+
+int __init nfs_init_writepagecache(void)
+{
+ nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
+ sizeof(struct nfs_pgio_header),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (nfs_wdata_cachep == NULL)
+ return -ENOMEM;
+
+ nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE,
+ nfs_wdata_cachep);
+ if (nfs_wdata_mempool == NULL)
+ goto out_destroy_write_cache;
+
+ nfs_cdata_cachep = kmem_cache_create("nfs_commit_data",
+ sizeof(struct nfs_commit_data),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (nfs_cdata_cachep == NULL)
+ goto out_destroy_write_mempool;
+
+ nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
+ nfs_cdata_cachep);
+ if (nfs_commit_mempool == NULL)
+ goto out_destroy_commit_cache;
+
+ /*
+ * NFS congestion size, scale with available memory.
+ *
+ * 64MB: 8192k
+ * 128MB: 11585k
+ * 256MB: 16384k
+ * 512MB: 23170k
+ * 1GB: 32768k
+ * 2GB: 46340k
+ * 4GB: 65536k
+ * 8GB: 92681k
+ * 16GB: 131072k
+ *
+ * This allows larger machines to have larger/more transfers.
+ * Limit the default to 256M
+ */
+ nfs_congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10);
+ if (nfs_congestion_kb > 256*1024)
+ nfs_congestion_kb = 256*1024;
+
+ return 0;
+
+out_destroy_commit_cache:
+ kmem_cache_destroy(nfs_cdata_cachep);
+out_destroy_write_mempool:
+ mempool_destroy(nfs_wdata_mempool);
+out_destroy_write_cache:
+ kmem_cache_destroy(nfs_wdata_cachep);
+ return -ENOMEM;
+}
+
+void nfs_destroy_writepagecache(void)
+{
+ mempool_destroy(nfs_commit_mempool); /* each mempool goes before the slab cache it was created on */
+ kmem_cache_destroy(nfs_cdata_cachep);
+ mempool_destroy(nfs_wdata_mempool);
+ kmem_cache_destroy(nfs_wdata_cachep);
+}
+
+static const struct nfs_rw_ops nfs_rw_write_ops = {
+ .rw_alloc_header = nfs_writehdr_alloc,
+ .rw_free_header = nfs_writehdr_free,
+ .rw_done = nfs_writeback_done,
+ .rw_result = nfs_writeback_result,
+ .rw_initiate = nfs_initiate_write,
+};
diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile
new file mode 100644
index 000000000..fa82f5aaa
--- /dev/null
+++ b/fs/nfs_common/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for Linux filesystem routines that are shared by client and server.
+#
+
+obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o
+nfs_acl-objs := nfsacl.o
+
+obj-$(CONFIG_GRACE_PERIOD) += grace.o
+obj-$(CONFIG_GRACE_PERIOD) += nfs_ssc.o
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c
new file mode 100644
index 000000000..26f2a50ec
--- /dev/null
+++ b/fs/nfs_common/grace.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Common code for control of lockd and nfsv4 grace periods.
+ *
+ * Transplanted from lockd code
+ */
+
+#include <linux/module.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <linux/fs.h>
+
+static unsigned int grace_net_id;
+static DEFINE_SPINLOCK(grace_lock);
+
+/**
+ * locks_start_grace
+ * @net: net namespace that this lock manager belongs to
+ * @lm: who this grace period is for
+ *
+ * A grace period is a period during which locks should not be given
+ * out. Currently grace periods are only enforced by the two lock
+ * managers (lockd and nfsd), using the locks_in_grace() function to
+ * check when they are in a grace period.
+ *
+ * This function is called to start a grace period.
+ */
+void
+locks_start_grace(struct net *net, struct lock_manager *lm)
+{
+ struct list_head *grace_list = net_generic(net, grace_net_id); /* this namespace's grace list */
+
+ spin_lock(&grace_lock);
+ if (list_empty(&lm->list)) /* lm is not on a grace list yet */
+ list_add(&lm->list, grace_list);
+ else /* already listed: warn instead of adding it twice */
+ WARN(1, "double list_add attempt detected in net %x %s\n",
+ net->ns.inum, (net == &init_net) ? "(init_net)" : "");
+ spin_unlock(&grace_lock);
+}
+
+/**
+ * locks_end_grace
+ * @lm: who this grace period is for; it is removed from whichever
+ *	grace list it is currently on (there is no @net argument)
+ *
+ * Call this function to state that the given lock manager is ready to
+ * resume regular locking. The grace period will not end until all lock
+ * managers that called locks_start_grace() also call locks_end_grace().
+ * Note that callers count on it being safe to call this more than once,
+ * and the second call should be a no-op.
+ */
+void
+locks_end_grace(struct lock_manager *lm)
+{
+ spin_lock(&grace_lock);
+ list_del_init(&lm->list);
+ spin_unlock(&grace_lock);
+}
+EXPORT_SYMBOL_GPL(locks_end_grace);
+
+static bool
+__state_in_grace(struct net *net, bool open)
+{
+ struct list_head *grace_list = net_generic(net, grace_net_id);
+ struct lock_manager *lm;
+
+ if (!open) /* locks: in grace while any lock manager is still listed */
+ return !list_empty(grace_list); /* note: checked without taking grace_lock */
+
+ spin_lock(&grace_lock);
+ list_for_each_entry(lm, grace_list, list) {
+ if (lm->block_opens) { /* opens: only managers with block_opens set count */
+ spin_unlock(&grace_lock);
+ return true;
+ }
+ }
+ spin_unlock(&grace_lock);
+ return false;
+}
+
+/**
+ * locks_in_grace
+ *
+ * Lock managers call this function to determine when it is OK for them
+ * to answer ordinary lock requests, and when they should accept only
+ * lock reclaims.
+ */
+bool locks_in_grace(struct net *net)
+{
+ return __state_in_grace(net, false);
+}
+EXPORT_SYMBOL_GPL(locks_in_grace);
+
+bool opens_in_grace(struct net *net)
+{
+ return __state_in_grace(net, true);
+}
+EXPORT_SYMBOL_GPL(opens_in_grace);
+
+static int __net_init
+grace_init_net(struct net *net)
+{
+ struct list_head *grace_list = net_generic(net, grace_net_id);
+
+ INIT_LIST_HEAD(grace_list);
+ return 0;
+}
+
+static void __net_exit
+grace_exit_net(struct net *net)
+{
+ struct list_head *grace_list = net_generic(net, grace_net_id);
+
+ WARN_ONCE(!list_empty(grace_list),
+ "net %x %s: grace_list is not empty\n",
+ net->ns.inum, __func__);
+}
+
+static struct pernet_operations grace_net_ops = {
+ .init = grace_init_net,
+ .exit = grace_exit_net,
+ .id = &grace_net_id,
+ .size = sizeof(struct list_head),
+};
+
+static int __init
+init_grace(void)
+{
+ return register_pernet_subsys(&grace_net_ops);
+}
+
+static void __exit
+exit_grace(void)
+{
+ unregister_pernet_subsys(&grace_net_ops);
+}
+
+MODULE_AUTHOR("Jeff Layton <jlayton@primarydata.com>");
+MODULE_LICENSE("GPL");
+module_init(init_grace)
+module_exit(exit_grace)
diff --git a/fs/nfs_common/nfs_ssc.c b/fs/nfs_common/nfs_ssc.c
new file mode 100644
index 000000000..f43bbb373
--- /dev/null
+++ b/fs/nfs_common/nfs_ssc.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * fs/nfs_common/nfs_ssc.c
+ *
+ * Helper for knfsd's SSC to access ops in NFS client modules
+ *
+ * Author: Dai Ngo <dai.ngo@oracle.com>
+ *
+ * Copyright (c) 2020, Oracle and/or its affiliates.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/nfs_ssc.h>
+#include "../nfs/nfs4_fs.h"
+
+MODULE_LICENSE("GPL");
+
+struct nfs_ssc_client_ops_tbl nfs_ssc_client_tbl;
+EXPORT_SYMBOL_GPL(nfs_ssc_client_tbl);
+
+#ifdef CONFIG_NFS_V4_2
+/**
+ * nfs42_ssc_register - install the NFS_V4 client ops in the nfs_ssc_client_tbl
+ * @ops: NFS_V4 ops to be installed
+ *
+ * Return values:
+ * None
+ */
+void nfs42_ssc_register(const struct nfs4_ssc_client_ops *ops)
+{
+ nfs_ssc_client_tbl.ssc_nfs4_ops = ops;
+}
+EXPORT_SYMBOL_GPL(nfs42_ssc_register);
+
+/**
+ * nfs42_ssc_unregister - uninstall the NFS_V4 client ops from
+ * the nfs_ssc_client_tbl
+ * @ops: ops to be uninstalled
+ *
+ * Return values:
+ * None
+ */
+void nfs42_ssc_unregister(const struct nfs4_ssc_client_ops *ops)
+{
+ if (nfs_ssc_client_tbl.ssc_nfs4_ops != ops)
+ return;
+
+ nfs_ssc_client_tbl.ssc_nfs4_ops = NULL;
+}
+EXPORT_SYMBOL_GPL(nfs42_ssc_unregister);
+#endif /* CONFIG_NFS_V4_2 */
+
+#ifdef CONFIG_NFS_V4_2
+/**
+ * nfs_ssc_register - install the NFS_FS client ops in the nfs_ssc_client_tbl
+ * @ops: NFS_FS ops to be installed
+ *
+ * Return values:
+ * None
+ */
+void nfs_ssc_register(const struct nfs_ssc_client_ops *ops)
+{
+ nfs_ssc_client_tbl.ssc_nfs_ops = ops;
+}
+EXPORT_SYMBOL_GPL(nfs_ssc_register);
+
+/**
+ * nfs_ssc_unregister - uninstall the NFS_FS client ops from
+ * the nfs_ssc_client_tbl
+ * @ops: ops to be uninstalled
+ *
+ * Return values:
+ * None
+ */
+void nfs_ssc_unregister(const struct nfs_ssc_client_ops *ops)
+{
+ if (nfs_ssc_client_tbl.ssc_nfs_ops != ops)
+ return;
+ nfs_ssc_client_tbl.ssc_nfs_ops = NULL;
+}
+EXPORT_SYMBOL_GPL(nfs_ssc_unregister);
+
+#else
+void nfs_ssc_register(const struct nfs_ssc_client_ops *ops)
+{
+}
+EXPORT_SYMBOL_GPL(nfs_ssc_register);
+
+void nfs_ssc_unregister(const struct nfs_ssc_client_ops *ops)
+{
+}
+EXPORT_SYMBOL_GPL(nfs_ssc_unregister);
+#endif /* CONFIG_NFS_V4_2 */
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
new file mode 100644
index 000000000..d056ad2fd
--- /dev/null
+++ b/fs/nfs_common/nfsacl.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * fs/nfs_common/nfsacl.c
+ *
+ * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de>
+ */
+
+/*
+ * The Solaris nfsacl protocol represents some ACLs slightly differently
+ * than POSIX 1003.1e draft 17 does (and we do):
+ *
+ * - Minimal ACLs always have an ACL_MASK entry, so they have
+ * four instead of three entries.
+ * - The ACL_MASK entry in such minimal ACLs always has the same
+ * permissions as the ACL_GROUP_OBJ entry. (In extended ACLs
+ * the ACL_MASK and ACL_GROUP_OBJ entries may differ.)
+ * - The identifier fields of the ACL_USER_OBJ and ACL_GROUP_OBJ
+ * entries contain the identifiers of the owner and owning group.
+ * (In POSIX ACLs we always set them to ACL_UNDEFINED_ID).
+ * - ACL entries in the kernel are kept sorted in ascending order
+ * of (e_tag, e_id). Solaris ACLs are unsorted.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/nfsacl.h>
+#include <linux/nfs3.h>
+#include <linux/sort.h>
+
+MODULE_LICENSE("GPL");
+
+struct nfsacl_encode_desc {
+ struct xdr_array2_desc desc;
+ unsigned int count;
+ struct posix_acl *acl;
+ int typeflag;
+ kuid_t uid;
+ kgid_t gid;
+};
+
+struct nfsacl_simple_acl {
+ struct posix_acl acl;
+ struct posix_acl_entry ace[4];
+};
+
+static int
+xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
+{
+ struct nfsacl_encode_desc *nfsacl_desc =
+ (struct nfsacl_encode_desc *) desc; /* desc is the first member of nfsacl_encode_desc */
+ __be32 *p = elem;
+
+ struct posix_acl_entry *entry =
+ &nfsacl_desc->acl->a_entries[nfsacl_desc->count++]; /* next ACE, in array order */
+
+ *p++ = htonl(entry->e_tag | nfsacl_desc->typeflag); /* word 1: tag plus type flag */
+ switch(entry->e_tag) { /* word 2: the id that goes with the tag */
+ case ACL_USER_OBJ:
+ *p++ = htonl(from_kuid(&init_user_ns, nfsacl_desc->uid)); /* owner uid from the descriptor, not the entry */
+ break;
+ case ACL_GROUP_OBJ:
+ *p++ = htonl(from_kgid(&init_user_ns, nfsacl_desc->gid)); /* owning gid from the descriptor, not the entry */
+ break;
+ case ACL_USER:
+ *p++ = htonl(from_kuid(&init_user_ns, entry->e_uid));
+ break;
+ case ACL_GROUP:
+ *p++ = htonl(from_kgid(&init_user_ns, entry->e_gid));
+ break;
+ default: /* Solaris depends on that! */
+ *p++ = 0;
+ break;
+ }
+ *p++ = htonl(entry->e_perm & S_IRWXO); /* word 3: permissions, masked with S_IRWXO */
+ return 0;
+}
+
+/**
+ * nfsacl_encode - Encode an NFSv3 ACL
+ *
+ * @buf: destination xdr_buf to contain XDR encoded ACL
+ * @base: byte offset in xdr_buf where XDR'd ACL begins
+ * @inode: inode of file whose ACL this is
+ * @acl: posix_acl to encode
+ * @encode_entries: whether to encode ACEs as well
+ * @typeflag: ACL type: NFS_ACL_DEFAULT or zero
+ *
+ * Returns size of encoded ACL in bytes or a negative errno value.
+ */
+int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
+ struct posix_acl *acl, int encode_entries, int typeflag)
+{
+ int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0; /* a non-empty ACL is sent with at least 4 entries */
+ struct nfsacl_encode_desc nfsacl_desc = {
+ .desc = {
+ .elem_size = 12, /* xdr_nfsace_encode() writes three 32-bit words per ACE */
+ .array_len = encode_entries ? entries : 0,
+ .xcode = xdr_nfsace_encode,
+ },
+ .acl = acl,
+ .typeflag = typeflag,
+ .uid = inode->i_uid,
+ .gid = inode->i_gid,
+ };
+ struct nfsacl_simple_acl aclbuf;
+ int err;
+
+ if (entries > NFS_ACL_MAX_ENTRIES ||
+ xdr_encode_word(buf, base, entries)) /* the entry count goes at @base even when no ACEs follow */
+ return -EINVAL;
+ if (encode_entries && acl && acl->a_count == 3) { /* minimal ACL: add the ACL_MASK entry the wire format expects */
+ struct posix_acl *acl2 = &aclbuf.acl;
+
+ /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is
+ * invoked in contexts where a memory allocation failure is
+ * fatal. Fortunately this fake ACL is small enough to
+ * construct on the stack. */
+ posix_acl_init(acl2, 4);
+
+ /* Insert entries in canonical order: other orders seem
+ to confuse Solaris VxFS. */
+ acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */
+ acl2->a_entries[1] = acl->a_entries[1]; /* ACL_GROUP_OBJ */
+ acl2->a_entries[2] = acl->a_entries[1]; /* ACL_MASK: copy of the group entry, retagged below */
+ acl2->a_entries[2].e_tag = ACL_MASK;
+ acl2->a_entries[3] = acl->a_entries[2]; /* ACL_OTHER */
+ nfsacl_desc.acl = acl2;
+ }
+ err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc); /* the array starts right after the count word */
+ if (!err)
+ err = 8 + nfsacl_desc.desc.elem_size * /* 8: presumably the count word plus the array-length word */
+ nfsacl_desc.desc.array_len;
+ return err;
+}
+EXPORT_SYMBOL_GPL(nfsacl_encode);
+
+struct nfsacl_decode_desc {
+ struct xdr_array2_desc desc;
+ unsigned int count;
+ struct posix_acl *acl;
+};
+
+static int
+xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem)
+{
+ struct nfsacl_decode_desc *nfsacl_desc =
+ (struct nfsacl_decode_desc *) desc; /* desc is the first member of nfsacl_decode_desc */
+ __be32 *p = elem;
+ struct posix_acl_entry *entry;
+ unsigned int id;
+
+ if (!nfsacl_desc->acl) { /* no ACL yet: allocate one sized by array_len */
+ if (desc->array_len > NFS_ACL_MAX_ENTRIES)
+ return -EINVAL;
+ nfsacl_desc->acl = posix_acl_alloc(desc->array_len, GFP_KERNEL);
+ if (!nfsacl_desc->acl)
+ return -ENOMEM;
+ nfsacl_desc->count = 0;
+ }
+
+ entry = &nfsacl_desc->acl->a_entries[nfsacl_desc->count++];
+ entry->e_tag = ntohl(*p++) & ~NFS_ACL_DEFAULT; /* word 1: tag, with the NFS_ACL_DEFAULT flag stripped */
+ id = ntohl(*p++); /* word 2: id, used only for ACL_USER and ACL_GROUP */
+ entry->e_perm = ntohl(*p++); /* word 3: permissions, validated per tag below */
+
+ switch(entry->e_tag) {
+ case ACL_USER:
+ entry->e_uid = make_kuid(&init_user_ns, id);
+ if (!uid_valid(entry->e_uid))
+ return -EINVAL;
+ break;
+ case ACL_GROUP:
+ entry->e_gid = make_kgid(&init_user_ns, id);
+ if (!gid_valid(entry->e_gid))
+ return -EINVAL;
+ break;
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_OTHER:
+ if (entry->e_perm & ~S_IRWXO) /* reject any bit outside S_IRWXO */
+ return -EINVAL;
+ break;
+ case ACL_MASK:
+ /* Solaris sometimes sets additional bits in the mask */
+ entry->e_perm &= S_IRWXO;
+ break;
+ default: /* unknown tag */
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+cmp_acl_entry(const void *x, const void *y)
+{
+ const struct posix_acl_entry *a = x, *b = y;
+
+ if (a->e_tag != b->e_tag)
+ return a->e_tag - b->e_tag;
+ else if ((a->e_tag == ACL_USER) && uid_gt(a->e_uid, b->e_uid))
+ return 1;
+ else if ((a->e_tag == ACL_USER) && uid_lt(a->e_uid, b->e_uid))
+ return -1;
+ else if ((a->e_tag == ACL_GROUP) && gid_gt(a->e_gid, b->e_gid))
+ return 1;
+ else if ((a->e_tag == ACL_GROUP) && gid_lt(a->e_gid, b->e_gid))
+ return -1;
+ else
+ return 0;
+}
+
+/*
+ * Convert from a Solaris ACL to a POSIX 1003.1e draft 17 ACL.
+ */
+static int
+posix_acl_from_nfsacl(struct posix_acl *acl)
+{
+ struct posix_acl_entry *pa, *pe,
+ *group_obj = NULL, *mask = NULL;
+
+ if (!acl) /* nothing was decoded: nothing to convert */
+ return 0;
+
+ sort(acl->a_entries, acl->a_count, sizeof(struct posix_acl_entry),
+ cmp_acl_entry, NULL); /* order the entries by tag, then by uid/gid */
+
+ /* Find the ACL_GROUP_OBJ and ACL_MASK entries. */
+ FOREACH_ACL_ENTRY(pa, acl, pe) {
+ switch(pa->e_tag) {
+ case ACL_USER_OBJ:
+ break;
+ case ACL_GROUP_OBJ:
+ group_obj = pa;
+ break;
+ case ACL_MASK:
+ mask = pa;
+ fallthrough;
+ case ACL_OTHER:
+ break;
+ }
+ }
+ if (acl->a_count == 4 && group_obj && mask &&
+ mask->e_perm == group_obj->e_perm) { /* four entries with mask equal to group_obj */
+ /* remove bogus ACL_MASK entry */
+ memmove(mask, mask+1, (3 - (mask - acl->a_entries)) * /* shift the entries after the mask down by one */
+ sizeof(struct posix_acl_entry));
+ acl->a_count = 3;
+ }
+ return 0; /* every path returns 0 */
+}
+
+/**
+ * nfsacl_decode - Decode an NFSv3 ACL
+ *
+ * @buf: xdr_buf containing XDR'd ACL data to decode
+ * @base: byte offset in xdr_buf where XDR'd ACL begins
+ * @aclcnt: count of ACEs in decoded posix_acl
+ * @pacl: buffer in which to place decoded posix_acl
+ *
+ * Returns the length of the decoded ACL in bytes, or a negative errno value.
+ */
+int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
+ struct posix_acl **pacl)
+{
+ struct nfsacl_decode_desc nfsacl_desc = {
+ .desc = {
+ .elem_size = 12, /* xdr_nfsace_decode() reads three 32-bit words per ACE */
+ .xcode = pacl ? xdr_nfsace_decode : NULL, /* no per-entry callback when the caller does not want the ACL */
+ },
+ };
+ u32 entries;
+ int err;
+
+ if (xdr_decode_word(buf, base, &entries) ||
+ entries > NFS_ACL_MAX_ENTRIES)
+ return -EINVAL;
+ nfsacl_desc.desc.array_maxlen = entries; /* the array may not hold more than the count word says */
+ err = xdr_decode_array2(buf, base + 4, &nfsacl_desc.desc);
+ if (err) /* NOTE(review): an ACL already allocated by the callback is not released on this path - confirm */
+ return err;
+ if (pacl) {
+ if (entries != nfsacl_desc.desc.array_len || /* count word and array length must agree */
+ posix_acl_from_nfsacl(nfsacl_desc.acl) != 0) {
+ posix_acl_release(nfsacl_desc.acl);
+ return -EINVAL;
+ }
+ *pacl = nfsacl_desc.acl; /* the caller receives the decoded ACL */
+ }
+ if (aclcnt)
+ *aclcnt = entries;
+ return 8 + nfsacl_desc.desc.elem_size * /* 8: presumably the count word plus the array-length word */
+ nfsacl_desc.desc.array_len;
+}
+EXPORT_SYMBOL_GPL(nfsacl_decode);
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
new file mode 100644
index 000000000..248f1459c
--- /dev/null
+++ b/fs/nfsd/Kconfig
@@ -0,0 +1,159 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config NFSD
+ tristate "NFS server support"
+ depends on INET
+ depends on FILE_LOCKING
+ depends on FSNOTIFY
+ select LOCKD
+ select SUNRPC
+ select EXPORTFS
+ select NFS_ACL_SUPPORT if NFSD_V2_ACL
+ depends on MULTIUSER
+ help
+ Choose Y here if you want to allow other computers to access
+ files residing on this system using Sun's Network File System
+ protocol. To compile the NFS server support as a module,
+ choose M here: the module will be called nfsd.
+
+ You may choose to use a user-space NFS server instead, in which
+ case you can choose N here.
+
+ To export local file systems using NFS, you also need to install
+ user space programs which can be found in the Linux nfs-utils
+ package, available from http://linux-nfs.org/. More detail about
+ the Linux NFS server implementation is available via the
+ exports(5) man page.
+
+ Below you can choose which versions of the NFS protocol are
+ available to clients mounting the NFS server on this system.
+ Support for NFS version 2 (RFC 1094) is always available when
+ CONFIG_NFSD is selected.
+
+ If unsure, say N.
+
+config NFSD_V2_ACL
+ bool
+ depends on NFSD
+
+config NFSD_V3
+ bool "NFS server support for NFS version 3"
+ depends on NFSD
+ help
+ This option enables support in your system's NFS server for
+ version 3 of the NFS protocol (RFC 1813).
+
+ If unsure, say Y.
+
+config NFSD_V3_ACL
+ bool "NFS server support for the NFSv3 ACL protocol extension"
+ depends on NFSD_V3
+ select NFSD_V2_ACL
+ help
+ Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
+ never became an official part of the NFS version 3 protocol.
+ This protocol extension allows applications on NFS clients to
+ manipulate POSIX Access Control Lists on files residing on NFS
+ servers. NFS servers enforce POSIX ACLs on local files whether
+ this protocol is available or not.
+
+ This option enables support in your system's NFS server for the
+ NFSv3 ACL protocol extension allowing NFS clients to manipulate
+ POSIX ACLs on files exported by your system's NFS server. NFS
+ clients which support the Solaris NFSv3 ACL protocol can then
+ access and modify ACLs on your NFS server.
+
+ To store ACLs on your NFS server, you also need to enable ACL-
+ related CONFIG options for your local file systems of choice.
+
+ If unsure, say N.
+
+config NFSD_V4
+ bool "NFS server support for NFS version 4"
+ depends on NFSD && PROC_FS
+ select NFSD_V3
+ select FS_POSIX_ACL
+ select SUNRPC_GSS
+ select CRYPTO
+ select CRYPTO_MD5
+ select CRYPTO_SHA256
+ select GRACE_PERIOD
+ help
+ This option enables support in your system's NFS server for
+ version 4 of the NFS protocol (RFC 3530).
+
+ To export files using NFSv4, you need to install additional user
+ space programs which can be found in the Linux nfs-utils package,
+ available from http://linux-nfs.org/.
+
+ If unsure, say N.
+
+config NFSD_PNFS
+ bool
+
+config NFSD_BLOCKLAYOUT
+ bool "NFSv4.1 server support for pNFS block layouts"
+ depends on NFSD_V4 && BLOCK
+ select NFSD_PNFS
+ select EXPORTFS_BLOCK_OPS
+ help
+ This option enables support for exporting pNFS block layouts
+ in the kernel's NFS server. The pNFS block layout enables NFS
+ clients to directly perform I/O to block devices accessible to both
+ the server and the clients. See RFC 5663 for more details.
+
+ If unsure, say N.
+
+config NFSD_SCSILAYOUT
+ bool "NFSv4.1 server support for pNFS SCSI layouts"
+ depends on NFSD_V4 && BLOCK
+ select NFSD_PNFS
+ select EXPORTFS_BLOCK_OPS
+ select BLK_SCSI_REQUEST
+ help
+ This option enables support for exporting pNFS SCSI layouts
+ in the kernel's NFS server. The pNFS SCSI layout enables NFS
+ clients to directly perform I/O to SCSI devices accessible to both
+ the server and the clients. See draft-ietf-nfsv4-scsi-layout for
+ more details.
+
+ If unsure, say N.
+
+config NFSD_FLEXFILELAYOUT
+ bool "NFSv4.1 server support for pNFS Flex File layouts"
+ depends on NFSD_V4
+ select NFSD_PNFS
+ help
+ This option enables support for exporting pNFS Flex File
+ layouts in the kernel's NFS server. The pNFS Flex File layout
+ enables NFS clients to directly perform I/O to NFSv3 devices
+ accessible to both the server and the clients. See
+ draft-ietf-nfsv4-flex-files for more details.
+
+ Warning, this server implements the bare minimum functionality
+ to be a flex file server - it is for testing the client,
+ not for use in production.
+
+ If unsure, say N.
+
+config NFSD_V4_2_INTER_SSC
+ bool "NFSv4.2 inter server to server COPY"
+ depends on NFSD_V4 && NFS_V4_1 && NFS_V4_2
+ help
+ This option enables support for NFSv4.2 inter server to
+ server copy where the destination server calls the NFSv4.2
+ client to read the data to copy from the source server.
+
+ If unsure, say N.
+
+config NFSD_V4_SECURITY_LABEL
+ bool "Provide Security Label support for NFSv4 server"
+ depends on NFSD_V4 && SECURITY
+ help
+
+ Say Y here if you want to enable fine-grained security label attribute
+ support for NFS version 4. Security labels allow security modules like
+ SELinux and Smack to label files to facilitate enforcement of their policies.
+ Without this an NFSv4 mount will have the same label on each file.
+
+ If you do not wish to enable fine-grained security labels for SELinux or
+ Smack policies on NFSv4 files, say N.
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
new file mode 100644
index 000000000..3f0983e93
--- /dev/null
+++ b/fs/nfsd/Makefile
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the Linux nfs server
+#
+
+ccflags-y += -I$(src) # needed for trace events
+
+obj-$(CONFIG_NFSD) += nfsd.o
+
+# this one should be compiled first, as the tracing macros can easily blow up
+nfsd-y += trace.o
+
+nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
+ export.o auth.o lockd.o nfscache.o nfsxdr.o \
+ stats.o filecache.o
+nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
+nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
+nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
+nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
+ nfs4acl.o nfs4callback.o nfs4recover.o
+nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
+nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
+nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
+nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
new file mode 100644
index 000000000..ba14d2f4b
--- /dev/null
+++ b/fs/nfsd/acl.h
@@ -0,0 +1,51 @@
+/*
+ * Common NFSv4 ACL handling definitions.
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef LINUX_NFS4_ACL_H
+#define LINUX_NFS4_ACL_H
+
+struct nfs4_acl;
+struct svc_fh;
+struct svc_rqst;
+
+int nfs4_acl_bytes(int entries);
+int nfs4_acl_get_whotype(char *, u32);
+__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who);
+
+int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
+ struct nfs4_acl **acl);
+__be32 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfs4_acl *acl);
+
+#endif /* LINUX_NFS4_ACL_H */
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
new file mode 100644
index 000000000..fdf2aad73
--- /dev/null
+++ b/fs/nfsd/auth.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */
+
+#include <linux/sched.h>
+#include "nfsd.h"
+#include "auth.h"
+
+/*
+ * Select the export flags that apply to a request: the flags of the
+ * per-flavor entry whose pseudoflavor equals the request's credential
+ * flavor, or the export-wide ex_flags when no entry matches.
+ */
+int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
+{
+	struct exp_flavor_info *cur = exp->ex_flavors;
+	struct exp_flavor_info *const stop = cur + exp->ex_nflavors;
+
+	while (cur < stop) {
+		if (cur->pseudoflavor == rqstp->rq_cred.cr_flavor)
+			return cur->flags;
+		cur++;
+	}
+
+	return exp->ex_flags;
+}
+
+/*
+ * Build a credential set for the current thread from the RPC request's
+ * credential, adjusted by the squash flags that nfsexp_flags() selects
+ * for this export, and install it as the thread's override credentials.
+ *
+ * Returns 0 on success, or -ENOMEM when either the new credentials or
+ * the group list cannot be allocated.
+ */
+int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
+{
+ struct group_info *rqgi;
+ struct group_info *gi;
+ struct cred *new;
+ int i;
+ int flags = nfsexp_flags(rqstp, exp);
+
+ validate_process_creds();
+
+ /* discard any old override before preparing the new set */
+ revert_creds(get_cred(current_real_cred()));
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+
+ /* start from the uid/gid carried in the request's credential */
+ new->fsuid = rqstp->rq_cred.cr_uid;
+ new->fsgid = rqstp->rq_cred.cr_gid;
+
+ rqgi = rqstp->rq_cred.cr_group_info;
+
+ if (flags & NFSEXP_ALLSQUASH) {
+ /* everyone becomes the export's anonymous user, with no groups */
+ new->fsuid = exp->ex_anon_uid;
+ new->fsgid = exp->ex_anon_gid;
+ gi = groups_alloc(0);
+ if (!gi)
+ goto oom;
+ } else if (flags & NFSEXP_ROOTSQUASH) {
+ /* only root ids are replaced by the export's anonymous ids */
+ if (uid_eq(new->fsuid, GLOBAL_ROOT_UID))
+ new->fsuid = exp->ex_anon_uid;
+ if (gid_eq(new->fsgid, GLOBAL_ROOT_GID))
+ new->fsgid = exp->ex_anon_gid;
+
+ gi = groups_alloc(rqgi->ngroups);
+ if (!gi)
+ goto oom;
+
+ /* copy the request's groups, squashing any root gid on the way */
+ for (i = 0; i < rqgi->ngroups; i++) {
+ if (gid_eq(GLOBAL_ROOT_GID, rqgi->gid[i]))
+ gi->gid[i] = exp->ex_anon_gid;
+ else
+ gi->gid[i] = rqgi->gid[i];
+ }
+
+ /* Each thread allocates its own gi, no race */
+ groups_sort(gi);
+ } else {
+ /* no squashing: use the request's group list as is */
+ gi = get_group_info(rqgi);
+ }
+
+ /* an invalid fsuid/fsgid also falls back to the anonymous ids */
+ if (uid_eq(new->fsuid, INVALID_UID))
+ new->fsuid = exp->ex_anon_uid;
+ if (gid_eq(new->fsgid, INVALID_GID))
+ new->fsgid = exp->ex_anon_gid;
+
+ set_groups(new, gi);
+ put_group_info(gi);
+
+ /*
+ * A non-root fsuid has the nfsd capability set dropped from its
+ * effective set; a root fsuid has it raised, bounded by cap_permitted.
+ */
+ if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID))
+ new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
+ else
+ new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
+ new->cap_permitted);
+ validate_process_creds();
+ /* install the new set and drop the references no longer needed */
+ put_cred(override_creds(new));
+ put_cred(new);
+ validate_process_creds();
+ return 0;
+
+oom:
+ /* group list allocation failed: throw away the half-built creds */
+ abort_creds(new);
+ return -ENOMEM;
+}
+
diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h
new file mode 100644
index 000000000..dbd66424f
--- /dev/null
+++ b/fs/nfsd/auth.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * nfsd-specific authentication stuff.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#ifndef LINUX_NFSD_AUTH_H
+#define LINUX_NFSD_AUTH_H
+
+/*
+ * Set the current process's fsuid/fsgid etc to those of the NFS
+ * client user
+ */
+int nfsd_setuser(struct svc_rqst *, struct svc_export *);
+
+#endif /* LINUX_NFSD_AUTH_H */
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
new file mode 100644
index 000000000..a07c39c94
--- /dev/null
+++ b/fs/nfsd/blocklayout.c
@@ -0,0 +1,433 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014-2016 Christoph Hellwig.
+ */
+#include <linux/exportfs.h>
+#include <linux/iomap.h>
+#include <linux/genhd.h>
+#include <linux/slab.h>
+#include <linux/pr.h>
+
+#include <linux/nfsd/debug.h>
+#include <scsi/scsi_proto.h>
+#include <scsi/scsi_common.h>
+#include <scsi/scsi_request.h>
+
+#include "blocklayoutxdr.h"
+#include "pnfs.h"
+#include "filecache.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+
+/*
+ * Handle LAYOUTGET for the block-based layout types.
+ *
+ * Asks the filesystem's map_blocks export operation for the extent
+ * backing the requested segment, translates the returned iomap into a
+ * single pnfs_block_extent stored in args->lg_content, and narrows the
+ * segment to the range that was actually mapped.
+ *
+ * Returns 0 on success, nfserr_layoutunavailable when no usable extent
+ * can be offered, or nfserrno() of the underlying error. On every
+ * failure path seg->length is reset to 0.
+ */
+static __be32
+nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
+ struct nfsd4_layoutget *args)
+{
+ struct nfsd4_layout_seg *seg = &args->lg_seg;
+ struct super_block *sb = inode->i_sb;
+ u32 block_size = i_blocksize(inode);
+ struct pnfs_block_extent *bex;
+ struct iomap iomap;
+ u32 device_generation = 0;
+ int error;
+
+ /* the requested offset must sit on a block boundary */
+ if (seg->offset & (block_size - 1)) {
+ dprintk("pnfsd: I/O misaligned\n");
+ goto out_layoutunavailable;
+ }
+
+ /*
+ * Some clients barf on non-zero block numbers for NONE or INVALID
+ * layouts, so make sure to zero the whole structure.
+ */
+ error = -ENOMEM;
+ bex = kzalloc(sizeof(*bex), GFP_KERNEL);
+ if (!bex)
+ goto out_error;
+ /* stored in args right away; not freed on the error paths here */
+ args->lg_content = bex;
+
+ error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
+ &iomap, seg->iomode != IOMODE_READ,
+ &device_generation);
+ if (error) {
+ /* -ENXIO is reported as "layout unavailable", not as a hard error */
+ if (error == -ENXIO)
+ goto out_layoutunavailable;
+ goto out_error;
+ }
+
+ if (iomap.length < args->lg_minlength) {
+ dprintk("pnfsd: extent smaller than minlength\n");
+ goto out_layoutunavailable;
+ }
+
+ /* translate the iomap type into a pNFS block extent state */
+ switch (iomap.type) {
+ case IOMAP_MAPPED:
+ if (seg->iomode == IOMODE_READ)
+ bex->es = PNFS_BLOCK_READ_DATA;
+ else
+ bex->es = PNFS_BLOCK_READWRITE_DATA;
+ bex->soff = iomap.addr;
+ break;
+ case IOMAP_UNWRITTEN:
+ if (seg->iomode & IOMODE_RW) {
+ /*
+ * Crack monkey special case from section 2.3.1.
+ */
+ if (args->lg_minlength == 0) {
+ dprintk("pnfsd: no soup for you!\n");
+ goto out_layoutunavailable;
+ }
+
+ bex->es = PNFS_BLOCK_INVALID_DATA;
+ bex->soff = iomap.addr;
+ break;
+ }
+ /* unwritten extent without IOMODE_RW is handled like a hole */
+ fallthrough;
+ case IOMAP_HOLE:
+ if (seg->iomode == IOMODE_READ) {
+ bex->es = PNFS_BLOCK_NONE_DATA;
+ break;
+ }
+ /* a hole for a non-read layout is unexpected: warn below */
+ fallthrough;
+ case IOMAP_DELALLOC:
+ default:
+ WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
+ goto out_layoutunavailable;
+ }
+
+ error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
+ if (error)
+ goto out_error;
+ bex->foff = iomap.offset;
+ bex->len = iomap.length;
+
+ /* report back the range that was really mapped */
+ seg->offset = iomap.offset;
+ seg->length = iomap.length;
+
+ dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es);
+ return 0;
+
+out_error:
+ seg->length = 0;
+ return nfserrno(error);
+out_layoutunavailable:
+ seg->length = 0;
+ return nfserr_layoutunavailable;
+}
+
+/*
+ * Commit a decoded list of extents through the filesystem's
+ * commit_blocks export operation.
+ *
+ * The times passed down are lcp->lc_mtime, which is first replaced by
+ * the current time when it is UTIME_NOW or older than the inode's mtime.
+ * A size update is requested only when the last written byte lies
+ * beyond the current inode size.
+ *
+ * Takes ownership of @iomaps and frees it before returning.
+ */
+static __be32
+nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp,
+		struct iomap *iomaps, int nr_iomaps)
+{
+	struct iattr attrs = {
+		.ia_valid = ATTR_ATIME | ATTR_CTIME | ATTR_MTIME,
+	};
+	const loff_t end_of_write = lcp->lc_last_wr + 1;
+	int ret;
+
+	if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
+	    timespec64_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
+		lcp->lc_mtime = current_time(inode);
+
+	attrs.ia_mtime = lcp->lc_mtime;
+	attrs.ia_ctime = lcp->lc_mtime;
+	attrs.ia_atime = lcp->lc_mtime;
+
+	if (end_of_write > i_size_read(inode)) {
+		attrs.ia_size = end_of_write;
+		attrs.ia_valid |= ATTR_SIZE;
+	}
+
+	ret = inode->i_sb->s_export_op->commit_blocks(inode, iomaps,
+			nr_iomaps, &attrs);
+
+	kfree(iomaps);
+	return nfserrno(ret);
+}
+
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
+/*
+ * Allocate a one-volume device address, hand it to @gdp, and describe
+ * the volume as PNFS_BLOCK_VOLUME_SIMPLE using the signature and offset
+ * filled in by the filesystem's get_uuid export operation.
+ *
+ * Returns -ENOMEM when the allocation fails, otherwise the get_uuid
+ * result.
+ */
+static int
+nfsd4_block_get_device_info_simple(struct super_block *sb,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	struct pnfs_block_deviceaddr *addr;
+	struct pnfs_block_volume *vol;
+
+	addr = kzalloc(sizeof(*addr) + sizeof(addr->volumes[0]), GFP_KERNEL);
+	if (!addr)
+		return -ENOMEM;
+
+	gdp->gd_device = addr;
+	addr->nr_volumes = 1;
+
+	vol = addr->volumes;
+	vol->type = PNFS_BLOCK_VOLUME_SIMPLE;
+	/* in: capacity of the signature buffer */
+	vol->simple.sig_len = PNFS_BLOCK_UUID_LEN;
+
+	return sb->s_export_op->get_uuid(sb, vol->simple.sig,
+			&vol->simple.sig_len, &vol->simple.offset);
+}
+
+/*
+ * GETDEVICEINFO for the block layout: refused with nfserr_inval when the
+ * superblock's device is a partition, otherwise answered with a single
+ * simple volume.
+ */
+static __be32
+nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
+		struct svc_rqst *rqstp,
+		struct nfs4_client *clp,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	int err;
+
+	if (bdev_is_partition(sb->s_bdev))
+		return nfserr_inval;
+
+	err = nfsd4_block_get_device_info_simple(sb, gdp);
+	return nfserrno(err);
+}
+
+/*
+ * LAYOUTCOMMIT for the block layout: decode the client's extent list
+ * and, if that succeeds, commit it.
+ */
+static __be32
+nfsd4_block_proc_layoutcommit(struct inode *inode,
+		struct nfsd4_layoutcommit *lcp)
+{
+	struct iomap *extents;
+	int count;
+
+	count = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
+			lcp->lc_up_len, &extents, i_blocksize(inode));
+	if (count >= 0)
+		return nfsd4_block_commit_blocks(inode, lcp, extents, count);
+
+	/* a negative count is the decoder's errno */
+	return nfserrno(count);
+}
+
+/*
+ * Operations table for the block layout type: wires the nfsd4_block_*
+ * handlers above and the nfsd4_block_encode_* encoders into
+ * struct nfsd4_layout_ops.
+ */
+const struct nfsd4_layout_ops bl_layout_ops = {
+ /*
+ * Pretend that we send notification to the client. This is a blatant
+ * lie to force recent Linux clients to cache our device IDs.
+ * We rarely ever change the device ID, so the harm of leaking deviceids
+ * for a while isn't too bad. Unfortunately RFC5661 is a complete mess
+ * in this regard, but I filed errata 4119 for this a while ago, and
+ * hopefully the Linux client will eventually start caching deviceids
+ * without this again.
+ */
+ .notify_types =
+ NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
+ .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo,
+ .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo,
+ .proc_layoutget = nfsd4_block_proc_layoutget,
+ .encode_layoutget = nfsd4_block_encode_layoutget,
+ .proc_layoutcommit = nfsd4_block_proc_layoutcommit,
+};
+#endif /* CONFIG_NFSD_BLOCKLAYOUT */
+
+#ifdef CONFIG_NFSD_SCSILAYOUT
+/*
+ * Fill in the SCSI designator fields of @b for the device behind @bdev.
+ *
+ * Sends an INQUIRY with the EVPD bit set for page 0x83 through the
+ * device's request queue and walks the designation descriptors in the
+ * reply. Only binary EUI-64 and NAA designators with LU association and
+ * a length of 8, 12 or 16 bytes are accepted; the walk stops at the
+ * first 16-byte one, otherwise the last acceptable one seen is kept.
+ *
+ * If the reply announces more data than the initial buffer holds, the
+ * command is retried once with a buffer of the announced size.
+ *
+ * Returns 0 on success, -EINVAL if the queue does not support SCSI
+ * passthrough, -ENOMEM on allocation failure, -EIO if the command fails
+ * or the reply length is invalid, or the error from blk_rq_map_kern().
+ */
+static int nfsd4_scsi_identify_device(struct block_device *bdev,
+		struct pnfs_block_volume *b)
+{
+	struct request_queue *q = bdev->bd_disk->queue;
+	struct request *rq;
+	struct scsi_request *req;
+	/*
+	 * The allocation length (passed in bytes 3 and 4 of the INQUIRY
+	 * command descriptor block) specifies the number of bytes that have
+	 * been allocated for the data-in buffer.
+	 * 252 is the highest one-byte value that is a multiple of 4.
+	 * 65532 is the highest two-byte value that is a multiple of 4.
+	 */
+	size_t bufflen = 252, maxlen = 65532, len, id_len;
+	u8 *buf, *d, *end, type, assoc;
+	int retries = 1, error;
+
+	if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q)))
+		return -EINVAL;
+
+again:
+	buf = kzalloc(bufflen, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	rq = blk_get_request(q, REQ_OP_SCSI_IN, 0);
+	if (IS_ERR(rq)) {
+		error = -ENOMEM;
+		goto out_free_buf;
+	}
+	req = scsi_req(rq);
+
+	error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
+	if (error)
+		goto out_put_request;
+
+	req->cmd[0] = INQUIRY;
+	req->cmd[1] = 1;
+	req->cmd[2] = 0x83;
+	req->cmd[3] = bufflen >> 8;
+	req->cmd[4] = bufflen & 0xff;
+	req->cmd_len = COMMAND_SIZE(INQUIRY);
+
+	blk_execute_rq(rq->q, NULL, rq, 1);
+	if (req->result) {
+		pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
+			req->result);
+		error = -EIO;
+		goto out_put_request;
+	}
+
+	/* page length from the reply header plus the 4 header bytes */
+	len = (buf[2] << 8) + buf[3] + 4;
+	if (len > bufflen) {
+		if (len <= maxlen && retries--) {
+			blk_put_request(rq);
+			kfree(buf);
+			bufflen = len;
+			goto again;
+		}
+		pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zu)\n",
+			len);
+		/* error is still 0 here; do not report a bad reply as success */
+		error = -EIO;
+		goto out_put_request;
+	}
+
+	end = buf + len;
+	/* each descriptor has a 4-byte header followed by id_len bytes */
+	for (d = buf + 4; d + 4 <= end; d += id_len + 4) {
+		id_len = d[3];
+		/* stop at a descriptor that claims to run past the reply */
+		if (d + 4 + id_len > end)
+			break;
+
+		type = d[1] & 0xf;
+		assoc = (d[1] >> 4) & 0x3;
+
+		/*
+		 * We only care about a EUI-64 and NAA designator types
+		 * with LU association.
+		 */
+		if (assoc != 0x00)
+			continue;
+		if (type != 0x02 && type != 0x03)
+			continue;
+		if (id_len != 8 && id_len != 12 && id_len != 16)
+			continue;
+
+		b->scsi.code_set = PS_CODE_SET_BINARY;
+		b->scsi.designator_type = type == 0x02 ?
+			PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA;
+		b->scsi.designator_len = id_len;
+		memcpy(b->scsi.designator, d + 4, id_len);
+
+		/*
+		 * If we found a 8 or 12 byte descriptor continue on to
+		 * see if a 16 byte one is available. If we find a
+		 * 16 byte descriptor we're done.
+		 */
+		if (id_len == 16)
+			break;
+	}
+
+out_put_request:
+	blk_put_request(rq);
+out_free_buf:
+	kfree(buf);
+	return error;
+}
+
+#define NFSD_MDS_PR_KEY 0x0100000000000000ULL
+
+/*
+ * The reservation key of a client is derived from its client ID: cl_boot
+ * in the upper 32 bits, cl_id in the lower bits. Having one key per
+ * client lets us fence exactly that client when a recall fails.
+ */
+static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp)
+{
+	u64 key = clp->cl_clientid.cl_boot;
+
+	key <<= 32;
+	key |= clp->cl_clientid.cl_id;
+	return key;
+}
+
+/*
+ * Build the device address for the SCSI layout and prepare the device.
+ *
+ * Allocates a one-volume device address and attaches it to @gdp, fills
+ * in the client's reservation key and the device designator, then
+ * registers NFSD_MDS_PR_KEY on the device and takes a
+ * PR_EXCLUSIVE_ACCESS_REG_ONLY reservation with it.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, the error from
+ * nfsd4_scsi_identify_device(), or -EINVAL when the device has no
+ * pr_ops or registering/reserving fails.
+ */
+static int
+nfsd4_block_get_device_info_scsi(struct super_block *sb,
+		struct nfs4_client *clp,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	struct block_device *bdev = sb->s_bdev;
+	struct pnfs_block_deviceaddr *addr;
+	struct pnfs_block_volume *vol;
+	const struct pr_ops *pr;
+	int ret;
+
+	addr = kzalloc(sizeof(*addr) + sizeof(addr->volumes[0]), GFP_KERNEL);
+	if (!addr)
+		return -ENOMEM;
+
+	gdp->gd_device = addr;
+	addr->nr_volumes = 1;
+
+	vol = addr->volumes;
+	vol->type = PNFS_BLOCK_VOLUME_SCSI;
+	vol->scsi.pr_key = nfsd4_scsi_pr_key(clp);
+
+	ret = nfsd4_scsi_identify_device(bdev, vol);
+	if (ret)
+		return ret;
+
+	pr = bdev->bd_disk->fops->pr_ops;
+	if (!pr) {
+		pr_err("pNFS: device %s does not support PRs.\n",
+			sb->s_id);
+		return -EINVAL;
+	}
+
+	if (pr->pr_register(bdev, 0, NFSD_MDS_PR_KEY, true)) {
+		pr_err("pNFS: failed to register key for device %s.\n",
+			sb->s_id);
+		return -EINVAL;
+	}
+
+	if (pr->pr_reserve(bdev, NFSD_MDS_PR_KEY,
+			PR_EXCLUSIVE_ACCESS_REG_ONLY, 0)) {
+		pr_err("pNFS: failed to reserve device %s.\n",
+			sb->s_id);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * GETDEVICEINFO for the SCSI layout: refused with nfserr_inval when the
+ * superblock's device is a partition, otherwise answered with a single
+ * SCSI volume.
+ */
+static __be32
+nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb,
+		struct svc_rqst *rqstp,
+		struct nfs4_client *clp,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	int err;
+
+	if (bdev_is_partition(sb->s_bdev))
+		return nfserr_inval;
+
+	err = nfsd4_block_get_device_info_scsi(sb, clp, gdp);
+	return nfserrno(err);
+}
+/*
+ * LAYOUTCOMMIT for the SCSI layout: decode the client's range list and,
+ * if that succeeds, commit it.
+ */
+static __be32
+nfsd4_scsi_proc_layoutcommit(struct inode *inode,
+		struct nfsd4_layoutcommit *lcp)
+{
+	struct iomap *ranges;
+	int count;
+
+	count = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
+			lcp->lc_up_len, &ranges, i_blocksize(inode));
+	if (count >= 0)
+		return nfsd4_block_commit_blocks(inode, lcp, ranges, count);
+
+	/* a negative count is the decoder's errno */
+	return nfserrno(count);
+}
+
+/*
+ * Fence the client owning layout state @ls: preempt the client's
+ * reservation key on the block device backing the layout's file, using
+ * NFSD_MDS_PR_KEY as our own key.
+ */
+static void
+nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls)
+{
+	struct block_device *bdev =
+		ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_bdev;
+	const struct pr_ops *pr = bdev->bd_disk->fops->pr_ops;
+	u64 client_key = nfsd4_scsi_pr_key(ls->ls_stid.sc_client);
+
+	pr->pr_preempt(bdev, NFSD_MDS_PR_KEY, client_key, 0, true);
+}
+
+/*
+ * Operations table for the SCSI layout type. It reuses the block layout
+ * handlers for LAYOUTGET and for encoding, and adds SCSI-specific
+ * getdeviceinfo, layoutcommit and client fencing.
+ */
+const struct nfsd4_layout_ops scsi_layout_ops = {
+ /*
+ * Pretend that we send notification to the client. This is a blatant
+ * lie to force recent Linux clients to cache our device IDs.
+ * We rarely ever change the device ID, so the harm of leaking deviceids
+ * for a while isn't too bad. Unfortunately RFC5661 is a complete mess
+ * in this regard, but I filed errata 4119 for this a while ago, and
+ * hopefully the Linux client will eventually start caching deviceids
+ * without this again.
+ */
+ .notify_types =
+ NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
+ .proc_getdeviceinfo = nfsd4_scsi_proc_getdeviceinfo,
+ .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo,
+ .proc_layoutget = nfsd4_block_proc_layoutget,
+ .encode_layoutget = nfsd4_block_encode_layoutget,
+ .proc_layoutcommit = nfsd4_scsi_proc_layoutcommit,
+ .fence_client = nfsd4_scsi_fence_client,
+};
+#endif /* CONFIG_NFSD_SCSILAYOUT */
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
new file mode 100644
index 000000000..2455dc8be
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014-2016 Christoph Hellwig.
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/exportfs.h>
+#include <linux/iomap.h>
+#include <linux/nfs4.h>
+
+#include "nfsd.h"
+#include "blocklayoutxdr.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+
+/*
+ * Encode the single extent stored in lgp->lg_content as the layout body:
+ * a length word, an extent count of one, then the device id, file
+ * offset, length, storage offset and extent state.
+ *
+ * Returns 0, or nfserr_toosmall when the stream has no room left.
+ */
+__be32
+nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
+		struct nfsd4_layoutget *lgp)
+{
+	struct pnfs_block_extent *ext = lgp->lg_content;
+	/* everything that follows the leading length word */
+	int body_len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
+	__be32 *out;
+
+	out = xdr_reserve_space(xdr, sizeof(__be32) + body_len);
+	if (!out)
+		return nfserr_toosmall;
+
+	*out++ = cpu_to_be32(body_len);
+	*out++ = cpu_to_be32(1); /* we always return a single extent */
+
+	out = xdr_encode_opaque_fixed(out, &ext->vol_id,
+			sizeof(struct nfsd4_deviceid));
+	out = xdr_encode_hyper(out, ext->foff);
+	out = xdr_encode_hyper(out, ext->len);
+	out = xdr_encode_hyper(out, ext->soff);
+	*out++ = cpu_to_be32(ext->es);
+
+	return 0;
+}
+
+/*
+ * Encode one volume description into @xdr.
+ *
+ * Returns the number of bytes written, -ETOOSMALL when the stream has no
+ * room left, or -ENOTSUPP for a volume type other than SIMPLE or SCSI.
+ */
+static int
+nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
+{
+	__be32 *out;
+	int nbytes;
+
+	if (b->type == PNFS_BLOCK_VOLUME_SIMPLE) {
+		nbytes = 4 + 4 + 8 + 4 + (XDR_QUADLEN(b->simple.sig_len) << 2);
+		out = xdr_reserve_space(xdr, nbytes);
+		if (!out)
+			return -ETOOSMALL;
+
+		*out++ = cpu_to_be32(b->type);
+		*out++ = cpu_to_be32(1); /* single signature */
+		out = xdr_encode_hyper(out, b->simple.offset);
+		out = xdr_encode_opaque(out, b->simple.sig, b->simple.sig_len);
+		return nbytes;
+	}
+
+	if (b->type == PNFS_BLOCK_VOLUME_SCSI) {
+		nbytes = 4 + 4 + 4 + 4 + (XDR_QUADLEN(b->scsi.designator_len) << 2) + 8;
+		out = xdr_reserve_space(xdr, nbytes);
+		if (!out)
+			return -ETOOSMALL;
+
+		*out++ = cpu_to_be32(b->type);
+		*out++ = cpu_to_be32(b->scsi.code_set);
+		*out++ = cpu_to_be32(b->scsi.designator_type);
+		out = xdr_encode_opaque(out, b->scsi.designator, b->scsi.designator_len);
+		out = xdr_encode_hyper(out, b->scsi.pr_key);
+		return nbytes;
+	}
+
+	return -ENOTSUPP;
+}
+
+/*
+ * Encode the device address in gdp->gd_device: a length word and a
+ * volume count, followed by every volume.
+ *
+ * With a zero gd_maxcount only a zero word is emitted. Returns 0 /
+ * nfs_ok on success, nfserr_resource when the stream has no room for
+ * the header, or nfserrno() of a volume encoding failure.
+ */
+__be32
+nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	struct pnfs_block_deviceaddr *addr = gdp->gd_device;
+	int total = sizeof(__be32), i;
+	__be32 *hdr;
+
+	/*
+	 * See paragraph 5 of RFC 8881 S18.40.3.
+	 */
+	if (gdp->gd_maxcount == 0) {
+		if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+			return nfserr_resource;
+		return nfs_ok;
+	}
+
+	/* room for the length and volume count, filled in at the end */
+	hdr = xdr_reserve_space(xdr, total + sizeof(__be32));
+	if (!hdr)
+		return nfserr_resource;
+
+	for (i = 0; i < addr->nr_volumes; i++) {
+		int written = nfsd4_block_encode_volume(xdr, &addr->volumes[i]);
+
+		if (written < 0)
+			return nfserrno(written);
+		total += written;
+	}
+
+	/* the overall length is only known once all volumes are encoded */
+	hdr[0] = cpu_to_be32(total);
+	hdr[1] = cpu_to_be32(addr->nr_volumes);
+	return 0;
+}
+
+/*
+ * Decode the extent list a client sends with LAYOUTCOMMIT for the block
+ * layout type.
+ *
+ * @p:          XDR data: a 32-bit extent count followed by that many
+ *              extents of PNFS_BLOCK_EXTENT_SIZE bytes each
+ * @len:        number of bytes available at @p
+ * @iomapp:     on success, receives a kcalloc()ed array with the file
+ *              offset and length of every extent; the caller frees it
+ * @block_size: every file offset, length and storage offset must be a
+ *              multiple of this (checked with a mask, so a power of two
+ *              is assumed)
+ *
+ * Only extents in state PNFS_BLOCK_READWRITE_DATA are accepted.
+ *
+ * Returns the number of extents decoded, -EINVAL for malformed input,
+ * or -ENOMEM.
+ */
+int
+nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+		u32 block_size)
+{
+	struct iomap *iomaps;
+	u32 nr_iomaps, i;
+
+	if (len < sizeof(u32)) {
+		dprintk("%s: extent array too small: %u\n", __func__, len);
+		return -EINVAL;
+	}
+	len -= sizeof(u32);
+	if (len % PNFS_BLOCK_EXTENT_SIZE) {
+		dprintk("%s: extent array invalid: %u\n", __func__, len);
+		return -EINVAL;
+	}
+
+	/* the announced count must match what the buffer can hold */
+	nr_iomaps = be32_to_cpup(p++);
+	if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) {
+		dprintk("%s: extent array size mismatch: %u/%u\n",
+			__func__, len, nr_iomaps);
+		return -EINVAL;
+	}
+
+	iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
+	if (!iomaps) {
+		dprintk("%s: failed to allocate extent array\n", __func__);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < nr_iomaps; i++) {
+		struct pnfs_block_extent bex;
+
+		memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
+		p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
+
+		p = xdr_decode_hyper(p, &bex.foff);
+		if (bex.foff & (block_size - 1)) {
+			dprintk("%s: unaligned offset 0x%llx\n",
+				__func__, bex.foff);
+			goto fail;
+		}
+		p = xdr_decode_hyper(p, &bex.len);
+		if (bex.len & (block_size - 1)) {
+			/* report the offending length, not the file offset */
+			dprintk("%s: unaligned length 0x%llx\n",
+				__func__, bex.len);
+			goto fail;
+		}
+		p = xdr_decode_hyper(p, &bex.soff);
+		if (bex.soff & (block_size - 1)) {
+			dprintk("%s: unaligned disk offset 0x%llx\n",
+				__func__, bex.soff);
+			goto fail;
+		}
+		bex.es = be32_to_cpup(p++);
+		if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
+			dprintk("%s: incorrect extent state %d\n",
+				__func__, bex.es);
+			goto fail;
+		}
+
+		/* only the file range is handed on to the caller */
+		iomaps[i].offset = bex.foff;
+		iomaps[i].length = bex.len;
+	}
+
+	*iomapp = iomaps;
+	return nr_iomaps;
+fail:
+	kfree(iomaps);
+	return -EINVAL;
+}
+
+/*
+ * Decode the range list a client sends with LAYOUTCOMMIT for the SCSI
+ * layout type.
+ *
+ * @p:          XDR data: a 32-bit range count followed by that many
+ *              (offset, length) pairs of PNFS_SCSI_RANGE_SIZE bytes each
+ * @len:        number of bytes available at @p
+ * @iomapp:     on success, receives a kcalloc()ed array with the offset
+ *              and length of every range; the caller frees it
+ * @block_size: every offset and length must be a multiple of this
+ *              (checked with a mask, so a power of two is assumed)
+ *
+ * Returns the number of ranges decoded, -EINVAL for malformed input, or
+ * -ENOMEM.
+ */
+int
+nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+		u32 block_size)
+{
+	struct iomap *iomaps;
+	u32 nr_iomaps, i;
+	u64 expected;
+
+	if (len < sizeof(u32)) {
+		dprintk("%s: extent array too small: %u\n", __func__, len);
+		return -EINVAL;
+	}
+
+	nr_iomaps = be32_to_cpup(p++);
+	/*
+	 * Compute the expected size in 64 bits: nr_iomaps comes from the
+	 * client, and a 32-bit product could wrap around and match @len even
+	 * though the buffer is far too small for that many ranges.
+	 */
+	expected = sizeof(__be32) + (u64)nr_iomaps * PNFS_SCSI_RANGE_SIZE;
+	if (len != expected) {
+		dprintk("%s: extent array size mismatch: %u/%llu\n",
+			__func__, len, expected);
+		return -EINVAL;
+	}
+
+	iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
+	if (!iomaps) {
+		dprintk("%s: failed to allocate extent array\n", __func__);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < nr_iomaps; i++) {
+		u64 val;
+
+		p = xdr_decode_hyper(p, &val);
+		if (val & (block_size - 1)) {
+			dprintk("%s: unaligned offset 0x%llx\n", __func__, val);
+			goto fail;
+		}
+		iomaps[i].offset = val;
+
+		p = xdr_decode_hyper(p, &val);
+		if (val & (block_size - 1)) {
+			dprintk("%s: unaligned length 0x%llx\n", __func__, val);
+			goto fail;
+		}
+		iomaps[i].length = val;
+	}
+
+	*iomapp = iomaps;
+	return nr_iomaps;
+fail:
+	kfree(iomaps);
+	return -EINVAL;
+}
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
new file mode 100644
index 000000000..bc5166bfe
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NFSD_BLOCKLAYOUTXDR_H
+#define _NFSD_BLOCKLAYOUTXDR_H 1
+
+#include <linux/blkdev.h>
+#include "xdr4.h"
+
+struct iomap;
+struct xdr_stream;
+
+/*
+ * One extent of a block layout, as encoded by
+ * nfsd4_block_encode_layoutget() and decoded from a layout update.
+ */
+struct pnfs_block_extent {
+ struct nfsd4_deviceid vol_id; /* device id, set by nfsd4_set_deviceid() */
+ u64 foff; /* file offset (iomap.offset in layoutget) */
+ u64 len; /* length (iomap.length in layoutget) */
+ u64 soff; /* storage offset (iomap.addr in layoutget) */
+ enum pnfs_block_extent_state es; /* PNFS_BLOCK_*_DATA state */
+};
+
+/*
+ * An (offset, length) pair. NOTE(review): not referenced by the code
+ * visible here; presumably mirrors the ranges decoded by
+ * nfsd4_scsi_decode_layoutupdate() - confirm against its users.
+ */
+struct pnfs_block_range {
+ u64 foff;
+ u64 len;
+};
+
+/*
+ * Random upper cap for the uuid length to avoid unbounded allocation.
+ * Not actually limited by the protocol.
+ */
+#define PNFS_BLOCK_UUID_LEN 128
+
+/*
+ * Description of one volume of a device address. @type selects which
+ * member of the union is valid (see nfsd4_block_encode_volume()).
+ */
+struct pnfs_block_volume {
+ enum pnfs_block_volume_type type;
+ union {
+ /* PNFS_BLOCK_VOLUME_SIMPLE: signature reported by get_uuid */
+ struct {
+ u64 offset;
+ u32 sig_len; /* number of valid bytes in sig[] */
+ u8 sig[PNFS_BLOCK_UUID_LEN];
+ } simple;
+ /* PNFS_BLOCK_VOLUME_SCSI: designator taken from INQUIRY page 0x83 */
+ struct {
+ enum scsi_code_set code_set;
+ enum scsi_designator_type designator_type;
+ int designator_len; /* number of valid bytes in designator[] */
+ u8 designator[256];
+ u64 pr_key; /* reservation key of the client */
+ } scsi;
+ };
+};
+
+/*
+ * Device address returned by GETDEVICEINFO: a count followed by that
+ * many volumes. Allocated with room for the trailing array by the
+ * get_device_info helpers in blocklayout.c.
+ */
+struct pnfs_block_deviceaddr {
+ u32 nr_volumes; /* number of entries in volumes[] */
+ struct pnfs_block_volume volumes[];
+};
+
+__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
+ struct nfsd4_getdeviceinfo *gdp);
+__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
+ struct nfsd4_layoutget *lgp);
+int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+ u32 block_size);
+int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+ u32 block_size);
+
+#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
new file mode 100644
index 000000000..65c331f75
--- /dev/null
+++ b/fs/nfsd/cache.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Request reply cache. This was heavily inspired by the
+ * implementation in 4.3BSD/4.4BSD.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#ifndef NFSCACHE_H
+#define NFSCACHE_H
+
+#include <linux/sunrpc/svc.h>
+#include "netns.h"
+
+/*
+ * Representation of a reply cache entry.
+ *
+ * Note that we use a sockaddr_in6 to hold the address instead of the more
+ * typical sockaddr_storage. This is for space reasons, since sockaddr_storage
+ * is much larger than a sockaddr_in6.
+ */
+struct svc_cacherep {
+ struct {
+ /* Keep often-read xid, csum in the same cache line: */
+ __be32 k_xid;
+ __wsum k_csum;
+ u32 k_proc;
+ u32 k_prot;
+ u32 k_vers;
+ unsigned int k_len;
+ struct sockaddr_in6 k_addr;
+ } c_key;
+
+ struct rb_node c_node;
+ struct list_head c_lru;
+ unsigned char c_state, /* unused, inprog, done */
+ c_type, /* status, buffer */
+ c_secure : 1; /* req came from port < 1024 */
+ unsigned long c_timestamp;
+ /*
+ * Cached reply, reached through the c_replvec / c_replstat macros
+ * below. NOTE(review): c_type presumably tells which member is in
+ * use - confirm in nfscache.c.
+ */
+ union {
+ struct kvec u_vec;
+ __be32 u_status;
+ } c_u;
+};
+
+#define c_replvec c_u.u_vec
+#define c_replstat c_u.u_status
+
+/* cache entry states */
+enum {
+ RC_UNUSED,
+ RC_INPROG,
+ RC_DONE
+};
+
+/* return values */
+enum {
+ RC_DROPIT,
+ RC_REPLY,
+ RC_DOIT
+};
+
+/*
+ * Cache types.
+ * We may want to add more types one day, e.g. for diropres and
+ * attrstat replies. Using cache entries with fixed length instead
+ * of buffer pointers may be more efficient.
+ */
+enum {
+ RC_NOCACHE,
+ RC_REPLSTAT,
+ RC_REPLBUFF,
+};
+
+/* Cache entries expire after this time period */
+#define RC_EXPIRE (120 * HZ)
+
+/* Checksum this amount of the request */
+#define RC_CSUMLEN (256U)
+
+int nfsd_drc_slab_create(void);
+void nfsd_drc_slab_free(void);
+int nfsd_reply_cache_init(struct nfsd_net *);
+void nfsd_reply_cache_shutdown(struct nfsd_net *);
+int nfsd_cache_lookup(struct svc_rqst *);
+void nfsd_cache_update(struct svc_rqst *, int, __be32 *);
+int nfsd_reply_cache_stats_open(struct inode *, struct file *);
+
+#endif /* NFSCACHE_H */
diff --git a/fs/nfsd/current_stateid.h b/fs/nfsd/current_stateid.h
new file mode 100644
index 000000000..c28540d86
--- /dev/null
+++ b/fs/nfsd/current_stateid.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NFSD4_CURRENT_STATE_H
+#define _NFSD4_CURRENT_STATE_H
+
+#include "state.h"
+#include "xdr4.h"
+
+extern void clear_current_stateid(struct nfsd4_compound_state *cstate);
+/*
+ * functions to set current state id
+ */
+extern void nfsd4_set_opendowngradestateid(struct nfsd4_compound_state *,
+ union nfsd4_op_u *);
+extern void nfsd4_set_openstateid(struct nfsd4_compound_state *,
+ union nfsd4_op_u *);
+extern void nfsd4_set_lockstateid(struct nfsd4_compound_state *,
+ union nfsd4_op_u *);
+extern void nfsd4_set_closestateid(struct nfsd4_compound_state *,
+ union nfsd4_op_u *);
+
+/*
+ * functions to consume current state id
+ */
+extern void nfsd4_get_opendowngradestateid(struct nfsd4_compound_state *,
+ union nfsd4_op_u *);
+extern void nfsd4_get_delegreturnstateid(struct nfsd4_compound_state *,
+ union nfsd4_op_u *);
+extern void nfsd4_get_freestateid(struct nfsd4_compound_state *,
+ union nfsd4_op_u *);
+extern void nfsd4_get_setattrstateid(struct nfsd4_compound_state *,
+ union nfsd4_op_u *);
+extern void nfsd4_get_closestateid(struct nfsd4_compound_state *,
+ union nfsd4_op_u *);
+extern void nfsd4_get_lockustateid(struct nfsd4_compound_state *,
+ union nfsd4_op_u *);
+extern void nfsd4_get_readstateid(struct nfsd4_compound_state *,
+ union nfsd4_op_u *);
+extern void nfsd4_get_writestateid(struct nfsd4_compound_state *,
+ union nfsd4_op_u *);
+
+#endif /* _NFSD4_CURRENT_STATE_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
new file mode 100644
index 000000000..21e404e7c
--- /dev/null
+++ b/fs/nfsd/export.c
@@ -0,0 +1,1329 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NFS exporting and validation.
+ *
+ * We maintain a list of clients, each of which has a list of
+ * exports. To export an fs to a given client, you first have
+ * to create the client entry with NFSCTL_ADDCLIENT, which
+ * creates a client control block and adds it to the hash
+ * table. Then, you call NFSCTL_EXPORT for each fs.
+ *
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de>
+ */
+
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/module.h>
+#include <linux/exportfs.h>
+#include <linux/sunrpc/svc_xprt.h>
+
+#include "nfsd.h"
+#include "nfsfh.h"
+#include "netns.h"
+#include "pnfs.h"
+#include "filecache.h"
+#include "trace.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_EXPORT
+
+/*
+ * We have two caches.
+ * One maps client+vfsmnt+dentry to export options - the export map
+ * The other maps client+filehandle-fragment to export options. - the expkey map
+ *
+ * The export options are actually stored in the first map, and the
+ * second map contains a reference to the entry in the first map.
+ */
+
+#define EXPKEY_HASHBITS 8
+#define EXPKEY_HASHMAX (1 << EXPKEY_HASHBITS)
+#define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1)
+
+/*
+ * kref release callback for an svc_expkey (wired up as .cache_put in
+ * svc_expkey_cache_template).  Drops the references held by the entry
+ * and frees it with kfree_rcu().
+ */
+static void expkey_put(struct kref *ref)
+{
+ struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref);
+
+ /*
+ * The path reference is only dropped for valid, non-negative entries;
+ * presumably those are the only ones for which expkey_update() took one.
+ */
+ if (test_bit(CACHE_VALID, &key->h.flags) &&
+ !test_bit(CACHE_NEGATIVE, &key->h.flags))
+ path_put(&key->ek_path);
+ auth_domain_put(key->ek_client);
+ kfree_rcu(key, ek_rcu);
+}
+
+/* .cache_upcall hook: hand the request to sunrpc_cache_pipe_upcall(). */
+static int expkey_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall(cd, h);
+}
+
+/*
+ * .cache_request hook: format the upcall line for an expkey entry into
+ * the buffer at *bpp (remaining space in *blen).  The line consists of
+ * the client name, the fsid type in decimal and the fsid bytes in hex.
+ */
+static void expkey_request(struct cache_detail *cd,
+ struct cache_head *h,
+ char **bpp, int *blen)
+{
+ /* client fsidtype \xfsid */
+ struct svc_expkey *ek = container_of(h, struct svc_expkey, h);
+ char type[5];
+
+ qword_add(bpp, blen, ek->ek_client->name);
+ snprintf(type, 5, "%d", ek->ek_fsidtype);
+ qword_add(bpp, blen, type);
+ qword_addhex(bpp, blen, (char*)ek->ek_fsid, key_len(ek->ek_fsidtype));
+ /* terminate the line: overwrite the byte just before the write cursor */
+ (*bpp)[-1] = '\n';
+}
+
+static struct svc_expkey *svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new,
+ struct svc_expkey *old);
+static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *);
+
+/*
+ * .cache_parse hook: parse one downcall line for the expkey cache and
+ * install the result.
+ *
+ * The message must end in '\n' (which is replaced by NUL).  Fields are
+ * read with qword_get(): client domain name, decimal fsidtype, fsid
+ * bytes, expiry, and an optional path.  An empty path installs a
+ * CACHE_NEGATIVE entry; otherwise the path is resolved with kern_path().
+ *
+ * Returns 0 on success or a negative errno: -EINVAL for malformed input,
+ * -ENOENT if the domain is not found, -ENOMEM if the buffer allocation,
+ * the lookup or the update fails, or the error returned by kern_path().
+ */
+static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
+{
+ /* client fsidtype fsid expiry [path] */
+ char *buf;
+ int len;
+ struct auth_domain *dom = NULL;
+ int err;
+ int fsidtype;
+ char *ep;
+ struct svc_expkey key;
+ struct svc_expkey *ek = NULL;
+
+ if (mesg[mlen - 1] != '\n')
+ return -EINVAL;
+ mesg[mlen-1] = 0;
+
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ err = -ENOMEM;
+ if (!buf)
+ goto out;
+
+ /* field 1: client domain name */
+ err = -EINVAL;
+ if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+ goto out;
+
+ err = -ENOENT;
+ dom = auth_domain_find(buf);
+ if (!dom)
+ goto out;
+ dprintk("found domain %s\n", buf);
+
+ /* field 2: fsid type, must be entirely numeric and a known type */
+ err = -EINVAL;
+ if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+ goto out;
+ fsidtype = simple_strtoul(buf, &ep, 10);
+ if (*ep)
+ goto out;
+ dprintk("found fsidtype %d\n", fsidtype);
+ if (key_len(fsidtype)==0) /* invalid type */
+ goto out;
+ /* field 3: fsid, whose decoded length must match the type */
+ if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+ goto out;
+ dprintk("found fsid length %d\n", len);
+ if (len != key_len(fsidtype))
+ goto out;
+
+ /* OK, we seem to have a valid key */
+ key.h.flags = 0;
+ key.h.expiry_time = get_expiry(&mesg);
+ if (key.h.expiry_time == 0)
+ goto out;
+
+ key.ek_client = dom;
+ key.ek_fsidtype = fsidtype;
+ memcpy(key.ek_fsid, buf, len);
+
+ ek = svc_expkey_lookup(cd, &key);
+ err = -ENOMEM;
+ if (!ek)
+ goto out;
+
+ /* now we want a pathname, or empty meaning NEGATIVE */
+ err = -EINVAL;
+ len = qword_get(&mesg, buf, PAGE_SIZE);
+ if (len < 0)
+ goto out;
+ dprintk("Path seems to be <%s>\n", buf);
+ err = 0;
+ if (len == 0) {
+ set_bit(CACHE_NEGATIVE, &key.h.flags);
+ ek = svc_expkey_update(cd, &key, ek);
+ if (ek)
+ trace_nfsd_expkey_update(ek, NULL);
+ else
+ err = -ENOMEM;
+ } else {
+ err = kern_path(buf, 0, &key.ek_path);
+ if (err)
+ goto out;
+
+ dprintk("Found the path %s\n", buf);
+
+ ek = svc_expkey_update(cd, &key, ek);
+ if (ek)
+ trace_nfsd_expkey_update(ek, buf);
+ else
+ err = -ENOMEM;
+ /* drop the reference obtained from kern_path() above */
+ path_put(&key.ek_path);
+ }
+ cache_flush();
+ out:
+ /* common exit: release whatever was acquired before the jump */
+ if (ek)
+ cache_put(&ek->h, cd);
+ if (dom)
+ auth_domain_put(dom);
+ kfree(buf);
+ return err;
+}
+
+/*
+ * .cache_show hook: print one expkey entry to the seq_file, or the
+ * column header line when @h is NULL.  The path is only printed for
+ * valid, non-negative entries.  Always returns 0.
+ */
+static int expkey_show(struct seq_file *m,
+ struct cache_detail *cd,
+ struct cache_head *h)
+{
+ struct svc_expkey *ek ;
+ int i;
+
+ if (h ==NULL) {
+ seq_puts(m, "#domain fsidtype fsid [path]\n");
+ return 0;
+ }
+ ek = container_of(h, struct svc_expkey, h);
+ seq_printf(m, "%s %d 0x", ek->ek_client->name,
+ ek->ek_fsidtype);
+ /* the fsid is dumped as 32-bit words, key_len() being in bytes */
+ for (i=0; i < key_len(ek->ek_fsidtype)/4; i++)
+ seq_printf(m, "%08x", ek->ek_fsid[i]);
+ if (test_bit(CACHE_VALID, &h->flags) &&
+ !test_bit(CACHE_NEGATIVE, &h->flags)) {
+ seq_printf(m, " ");
+ seq_path(m, &ek->ek_path, "\\ \t\n");
+ }
+ seq_printf(m, "\n");
+ return 0;
+}
+
+/*
+ * .match hook: two expkey entries are the same key when fsid type,
+ * client domain and the fsid bytes (compared over the length implied by
+ * the fsid type) all agree.  Returns 1 on a match, 0 otherwise.
+ */
+static inline int expkey_match (struct cache_head *a, struct cache_head *b)
+{
+	struct svc_expkey *lhs = container_of(a, struct svc_expkey, h);
+	struct svc_expkey *rhs = container_of(b, struct svc_expkey, h);
+
+	return lhs->ek_fsidtype == rhs->ek_fsidtype &&
+	       lhs->ek_client == rhs->ek_client &&
+	       memcmp(lhs->ek_fsid, rhs->ek_fsid,
+		      key_len(lhs->ek_fsidtype)) == 0;
+}
+
+/*
+ * .init hook: copy the key fields (client, fsid type, fsid) from @citem
+ * into the new entry @cnew, taking a reference on the client domain.
+ */
+static inline void expkey_init(struct cache_head *cnew,
+ struct cache_head *citem)
+{
+ struct svc_expkey *new = container_of(cnew, struct svc_expkey, h);
+ struct svc_expkey *item = container_of(citem, struct svc_expkey, h);
+
+ kref_get(&item->ek_client->ref);
+ new->ek_client = item->ek_client;
+ new->ek_fsidtype = item->ek_fsidtype;
+
+ /* copies the whole fsid array, not just key_len() bytes */
+ memcpy(new->ek_fsid, item->ek_fsid, sizeof(new->ek_fsid));
+}
+
+/*
+ * .update hook: copy the content (the path) from @citem into @cnew and
+ * take a path reference for the new entry.
+ */
+static inline void expkey_update(struct cache_head *cnew,
+ struct cache_head *citem)
+{
+ struct svc_expkey *new = container_of(cnew, struct svc_expkey, h);
+ struct svc_expkey *item = container_of(citem, struct svc_expkey, h);
+
+ new->ek_path = item->ek_path;
+ path_get(&item->ek_path);
+}
+
+/*
+ * .alloc hook: allocate an (uninitialised) svc_expkey and return its
+ * embedded cache_head, or NULL if the allocation fails.
+ */
+static struct cache_head *expkey_alloc(void)
+{
+	struct svc_expkey *key = kmalloc(sizeof(*key), GFP_KERNEL);
+
+	return key ? &key->h : NULL;
+}
+
+/*
+ * .flush hook: purge the nfsd file cache for the network namespace of
+ * the current task.
+ */
+static void expkey_flush(void)
+{
+ /*
+ * Take the nfsd_mutex here to ensure that the file cache is not
+ * destroyed while we're in the middle of flushing.
+ */
+ mutex_lock(&nfsd_mutex);
+ nfsd_file_cache_purge(current->nsproxy->net_ns);
+ mutex_unlock(&nfsd_mutex);
+}
+
+/*
+ * Template for the expkey cache ("nfsd.fh"); nfsd_export_init() passes
+ * it to cache_create_net() to create the per-net instance.
+ */
+static const struct cache_detail svc_expkey_cache_template = {
+ .owner = THIS_MODULE,
+ .hash_size = EXPKEY_HASHMAX,
+ .name = "nfsd.fh",
+ .cache_put = expkey_put,
+ .cache_upcall = expkey_upcall,
+ .cache_request = expkey_request,
+ .cache_parse = expkey_parse,
+ .cache_show = expkey_show,
+ .match = expkey_match,
+ .init = expkey_init,
+ .update = expkey_update,
+ .alloc = expkey_alloc,
+ .flush = expkey_flush,
+};
+
+/*
+ * Compute the hash bucket for an expkey from its fsid type, the fsid
+ * bytes and the client pointer; the result is masked to EXPKEY_HASHMASK.
+ */
+static int
+svc_expkey_hash(struct svc_expkey *item)
+{
+	int fsid_len = key_len(item->ek_fsidtype);
+	int bucket = item->ek_fsidtype;
+
+	bucket ^= hash_mem((char*)item->ek_fsid, fsid_len, EXPKEY_HASHBITS);
+	bucket ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS);
+	return bucket & EXPKEY_HASHMASK;
+}
+
+/*
+ * Look @item up in cache @cd via sunrpc_cache_lookup_rcu().  Returns the
+ * svc_expkey containing the cache_head that call yields, or NULL if it
+ * returns NULL.
+ */
+static struct svc_expkey *
+svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *item)
+{
+	struct cache_head *found;
+
+	found = sunrpc_cache_lookup_rcu(cd, &item->h, svc_expkey_hash(item));
+	if (!found)
+		return NULL;
+	return container_of(found, struct svc_expkey, h);
+}
+
+/*
+ * Update cache entry @old with the content of @new via
+ * sunrpc_cache_update(); the bucket is derived from @new.  Returns the
+ * svc_expkey containing the cache_head that call yields, or NULL if it
+ * returns NULL.
+ */
+static struct svc_expkey *
+svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new,
+		  struct svc_expkey *old)
+{
+	struct cache_head *updated;
+
+	updated = sunrpc_cache_update(cd, &new->h, &old->h,
+				      svc_expkey_hash(new));
+	if (!updated)
+		return NULL;
+	return container_of(updated, struct svc_expkey, h);
+}
+
+
+#define EXPORT_HASHBITS 8
+#define EXPORT_HASHMAX (1<< EXPORT_HASHBITS)
+
+/*
+ * Free the locations array of @fsloc together with the path and hosts
+ * string of each entry, then set fsloc->locations to NULL.  A NULL
+ * locations pointer is a no-op.  locations_count is left untouched.
+ */
+static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc)
+{
+ struct nfsd4_fs_location *locations = fsloc->locations;
+ int i;
+
+ if (!locations)
+ return;
+
+ for (i = 0; i < fsloc->locations_count; i++) {
+ kfree(locations[i].path);
+ kfree(locations[i].hosts);
+ }
+
+ kfree(locations);
+ fsloc->locations = NULL;
+}
+
+/*
+ * RCU callback: free the memory owned by an svc_export once a grace
+ * period has elapsed.  ex_fslocs and ex_uuid are read by
+ * svc_export_show(), which e_show() calls after it has already dropped
+ * its reference, so they must not be freed before the struct itself.
+ */
+static void svc_export_release(struct rcu_head *rcu_head)
+{
+	struct svc_export *exp = container_of(rcu_head, struct svc_export,
+					      ex_rcu);
+
+	nfsd4_fslocs_free(&exp->ex_fslocs);
+	kfree(exp->ex_uuid);
+	kfree(exp);
+}
+
+/*
+ * kref release callback for an svc_export (wired up as .cache_put in
+ * svc_export_cache_template).  The path and client references are
+ * dropped immediately; the memory owned by the entry is freed from
+ * svc_export_release() after an RCU grace period.
+ */
+static void svc_export_put(struct kref *ref)
+{
+	struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
+
+	path_put(&exp->ex_path);
+	auth_domain_put(exp->ex_client);
+	call_rcu(&exp->ex_rcu, svc_export_release);
+}
+
+/* .cache_upcall hook: hand the request to sunrpc_cache_pipe_upcall(). */
+static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall(cd, h);
+}
+
+/*
+ * .cache_request hook: format the upcall line for an export entry into
+ * the buffer at *bpp (remaining space in *blen): the client name
+ * followed by the export path.
+ */
+static void svc_export_request(struct cache_detail *cd,
+ struct cache_head *h,
+ char **bpp, int *blen)
+{
+ /* client path */
+ struct svc_export *exp = container_of(h, struct svc_export, h);
+ char *pth;
+
+ qword_add(bpp, blen, exp->ex_client->name);
+ /* the unused tail of the output buffer doubles as d_path() scratch space */
+ pth = d_path(&exp->ex_path, *bpp, *blen);
+ if (IS_ERR(pth)) {
+ /* is this correct? */
+ (*bpp)[0] = '\n';
+ return;
+ }
+ qword_add(bpp, blen, pth);
+ /* terminate the line: overwrite the byte just before the write cursor */
+ (*bpp)[-1] = '\n';
+}
+
+static struct svc_export *svc_export_update(struct svc_export *new,
+ struct svc_export *old);
+static struct svc_export *svc_export_lookup(struct svc_export *);
+
+/*
+ * Check whether @inode may be exported with the given @flags and @uuid.
+ *
+ * May modify *flags: NFSEXP_V4ROOT exports are forced read-only.
+ * Returns 0 if the export is acceptable, -ENOTDIR if the inode is not a
+ * directory, symlink or regular file, or -EINVAL if the filesystem
+ * cannot be identified or lacks the needed export operations.
+ */
+static int check_export(struct inode *inode, int *flags, unsigned char *uuid)
+{
+
+ /*
+ * We currently export only dirs, regular files, and (for v4
+ * pseudoroot) symlinks.
+ */
+ if (!S_ISDIR(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode) &&
+ !S_ISREG(inode->i_mode))
+ return -ENOTDIR;
+
+ /*
+ * Mountd should never pass down a writeable V4ROOT export, but,
+ * just to make sure:
+ */
+ if (*flags & NFSEXP_V4ROOT)
+ *flags |= NFSEXP_READONLY;
+
+ /* There are two requirements on a filesystem to be exportable.
+ * 1: We must be able to identify the filesystem from a number.
+ * either a device number (so FS_REQUIRES_DEV needed)
+ * or an FSID number (so NFSEXP_FSID or ->uuid is needed).
+ * 2: We must be able to find an inode from a filehandle.
+ * This means that s_export_op must be set.
+ */
+ if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) &&
+ !(*flags & NFSEXP_FSID) &&
+ uuid == NULL) {
+ dprintk("exp_export: export of non-dev fs without fsid\n");
+ return -EINVAL;
+ }
+
+ if (!inode->i_sb->s_export_op ||
+ !inode->i_sb->s_export_op->fh_to_dentry) {
+ dprintk("exp_export: export of invalid fs type.\n");
+ return -EINVAL;
+ }
+
+ return 0;
+
+}
+
+#ifdef CONFIG_NFSD_V4
+
+/*
+ * Parse the arguments of an "fsloc" export option from *mesg into
+ * @fsloc: a list size, then that many (hosts, path) pairs, then a
+ * "migrated" value of 0 or 1.  @buf is a PAGE_SIZE scratch buffer.
+ *
+ * Returns 0 on success (including an empty list), -EINVAL if fsloc was
+ * already set or the input is malformed, -ENOMEM on allocation failure,
+ * or the error from get_uint()/get_int().  On failure after the array
+ * has been allocated, everything allocated here is freed again.
+ */
+static int
+fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc)
+{
+ int len;
+ int migrated, i, err;
+
+ /* more than one fsloc */
+ if (fsloc->locations)
+ return -EINVAL;
+
+ /* listsize */
+ err = get_uint(mesg, &fsloc->locations_count);
+ if (err)
+ return err;
+ if (fsloc->locations_count > MAX_FS_LOCATIONS)
+ return -EINVAL;
+ if (fsloc->locations_count == 0)
+ return 0;
+
+ /* zeroed array, so a partial parse can be freed safely below */
+ fsloc->locations = kcalloc(fsloc->locations_count,
+ sizeof(struct nfsd4_fs_location),
+ GFP_KERNEL);
+ if (!fsloc->locations)
+ return -ENOMEM;
+ for (i=0; i < fsloc->locations_count; i++) {
+ /* colon separated host list */
+ err = -EINVAL;
+ len = qword_get(mesg, buf, PAGE_SIZE);
+ if (len <= 0)
+ goto out_free_all;
+ err = -ENOMEM;
+ fsloc->locations[i].hosts = kstrdup(buf, GFP_KERNEL);
+ if (!fsloc->locations[i].hosts)
+ goto out_free_all;
+ err = -EINVAL;
+ /* slash separated path component list */
+ len = qword_get(mesg, buf, PAGE_SIZE);
+ if (len <= 0)
+ goto out_free_all;
+ err = -ENOMEM;
+ fsloc->locations[i].path = kstrdup(buf, GFP_KERNEL);
+ if (!fsloc->locations[i].path)
+ goto out_free_all;
+ }
+ /* migrated */
+ err = get_int(mesg, &migrated);
+ if (err)
+ goto out_free_all;
+ err = -EINVAL;
+ if (migrated < 0 || migrated > 1)
+ goto out_free_all;
+ fsloc->migrated = migrated;
+ return 0;
+out_free_all:
+ nfsd4_fslocs_free(fsloc);
+ return err;
+}
+
+/*
+ * Parse the arguments of a "secinfo" export option from *mesg into
+ * exp->ex_flavors: a list size followed by that many
+ * (pseudoflavor, flags) pairs.
+ *
+ * Returns 0 on success, -EINVAL if secinfo was already set, the list is
+ * longer than MAX_SECINFO_LIST, or a flavor's flags differ from
+ * exp->ex_flags outside NFSEXP_SECINFO_FLAGS, or the error from
+ * get_uint().  ex_nflavors is only set once the whole list has parsed.
+ */
+static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp)
+{
+ struct exp_flavor_info *f;
+ u32 listsize;
+ int err;
+
+ /* more than one secinfo */
+ if (exp->ex_nflavors)
+ return -EINVAL;
+
+ err = get_uint(mesg, &listsize);
+ if (err)
+ return err;
+ if (listsize > MAX_SECINFO_LIST)
+ return -EINVAL;
+
+ for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) {
+ err = get_uint(mesg, &f->pseudoflavor);
+ if (err)
+ return err;
+ /*
+ * XXX: It would be nice to also check whether this
+ * pseudoflavor is supported, so we can discover the
+ * problem at export time instead of when a client fails
+ * to authenticate.
+ */
+ err = get_uint(mesg, &f->flags);
+ if (err)
+ return err;
+ /* Only some flags are allowed to differ between flavors: */
+ if (~NFSEXP_SECINFO_FLAGS & (f->flags ^ exp->ex_flags))
+ return -EINVAL;
+ }
+ exp->ex_nflavors = listsize;
+ return 0;
+}
+
+#else /* CONFIG_NFSD_V4 */
+/*
+ * Stubs used when CONFIG_NFSD_V4 is not set: they read nothing from the
+ * message and report success.
+ */
+static inline int
+fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc){return 0;}
+static inline int
+secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; }
+#endif
+
+/*
+ * Parse the argument of a "uuid" export option from *mesg and store a
+ * freshly allocated EX_UUID_LEN-byte copy in *puuid.
+ *
+ * Returns 0 on success, -EINVAL if a uuid was already set or the
+ * decoded word is not exactly EX_UUID_LEN bytes, -ENOMEM if the copy
+ * cannot be allocated.
+ */
+static inline int
+nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid)
+{
+ int len;
+
+ /* more than one uuid */
+ if (*puuid)
+ return -EINVAL;
+
+ /* expect a 16 byte uuid encoded as \xXXXX... */
+ len = qword_get(mesg, buf, PAGE_SIZE);
+ if (len != EX_UUID_LEN)
+ return -EINVAL;
+
+ *puuid = kmemdup(buf, EX_UUID_LEN, GFP_KERNEL);
+ if (*puuid == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * .cache_parse hook: parse one downcall line for the export cache and
+ * install the result.
+ *
+ * The message must end in '\n' (which is replaced by NUL).  Expected
+ * fields: client domain, path, expiry, then optionally flags, anon uid,
+ * anon gid, fsid and any of the "fsloc", "uuid" and "secinfo" options.
+ * If the flags field is absent (get_int() returns -ENOENT) a
+ * CACHE_NEGATIVE entry is installed instead.
+ *
+ * Returns 0 on success or a negative errno.  An entry whose expiry time
+ * is already in the past is not installed, but 0 is still returned.
+ */
+static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
+{
+ /* client path expiry [flags anonuid anongid fsid] */
+ char *buf;
+ int len;
+ int err;
+ struct auth_domain *dom = NULL;
+ struct svc_export exp = {}, *expp;
+ int an_int;
+
+ if (mesg[mlen-1] != '\n')
+ return -EINVAL;
+ mesg[mlen-1] = 0;
+
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ /* client */
+ err = -EINVAL;
+ len = qword_get(&mesg, buf, PAGE_SIZE);
+ if (len <= 0)
+ goto out;
+
+ err = -ENOENT;
+ dom = auth_domain_find(buf);
+ if (!dom)
+ goto out;
+
+ /* path */
+ err = -EINVAL;
+ if ((len = qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+ goto out1;
+
+ err = kern_path(buf, 0, &exp.ex_path);
+ if (err)
+ goto out1;
+
+ exp.ex_client = dom;
+ exp.cd = cd;
+ exp.ex_devid_map = NULL;
+
+ /* expiry */
+ err = -EINVAL;
+ exp.h.expiry_time = get_expiry(&mesg);
+ if (exp.h.expiry_time == 0)
+ goto out3;
+
+ /* flags */
+ err = get_int(&mesg, &an_int);
+ if (err == -ENOENT) {
+ /* nothing after the expiry: record a negative entry */
+ err = 0;
+ set_bit(CACHE_NEGATIVE, &exp.h.flags);
+ } else {
+ if (err || an_int < 0)
+ goto out3;
+ exp.ex_flags= an_int;
+
+ /* anon uid */
+ err = get_int(&mesg, &an_int);
+ if (err)
+ goto out3;
+ exp.ex_anon_uid= make_kuid(current_user_ns(), an_int);
+
+ /* anon gid */
+ err = get_int(&mesg, &an_int);
+ if (err)
+ goto out3;
+ exp.ex_anon_gid= make_kgid(current_user_ns(), an_int);
+
+ /* fsid */
+ err = get_int(&mesg, &an_int);
+ if (err)
+ goto out3;
+ exp.ex_fsid = an_int;
+
+ /* optional keyword-introduced options */
+ while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) {
+ if (strcmp(buf, "fsloc") == 0)
+ err = fsloc_parse(&mesg, buf, &exp.ex_fslocs);
+ else if (strcmp(buf, "uuid") == 0)
+ err = nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid);
+ else if (strcmp(buf, "secinfo") == 0)
+ err = secinfo_parse(&mesg, buf, &exp);
+ else
+ /* quietly ignore unknown words and anything
+ * following. Newer user-space can try to set
+ * new values, then see what the result was.
+ */
+ break;
+ if (err)
+ goto out4;
+ }
+
+ err = check_export(d_inode(exp.ex_path.dentry), &exp.ex_flags,
+ exp.ex_uuid);
+ if (err)
+ goto out4;
+ /*
+ * No point caching this if it would immediately expire.
+ * Also, this protects exportfs's dummy export from the
+ * anon_uid/anon_gid checks:
+ */
+ if (exp.h.expiry_time < seconds_since_boot())
+ goto out4;
+ /*
+ * For some reason exportfs has been passing down an
+ * invalid (-1) uid & gid on the "dummy" export which it
+ * uses to test export support. To make sure exportfs
+ * sees errors from check_export we therefore need to
+ * delay these checks till after check_export:
+ */
+ err = -EINVAL;
+ if (!uid_valid(exp.ex_anon_uid))
+ goto out4;
+ if (!gid_valid(exp.ex_anon_gid))
+ goto out4;
+ err = 0;
+
+ nfsd4_setup_layout_type(&exp);
+ }
+
+ expp = svc_export_lookup(&exp);
+ if (!expp) {
+ err = -ENOMEM;
+ goto out4;
+ }
+ expp = svc_export_update(&exp, expp);
+ if (expp) {
+ trace_nfsd_export_update(expp);
+ cache_flush();
+ exp_put(expp);
+ } else
+ err = -ENOMEM;
+out4:
+ /* free whatever option data the on-stack exp still points to */
+ nfsd4_fslocs_free(&exp.ex_fslocs);
+ kfree(exp.ex_uuid);
+out3:
+ /* drop the reference obtained from kern_path() */
+ path_put(&exp.ex_path);
+out1:
+ auth_domain_put(dom);
+out:
+ kfree(buf);
+ return err;
+}
+
+static void exp_flags(struct seq_file *m, int flag, int fsid,
+ kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs);
+static void show_secinfo(struct seq_file *m, struct svc_export *exp);
+
+/*
+ * .cache_show hook: print one export entry to the seq_file as
+ * "path<TAB>client(options)", or the column header line when @h is
+ * NULL.  Options are only printed for valid, non-negative entries.
+ * Always returns 0.
+ */
+static int svc_export_show(struct seq_file *m,
+ struct cache_detail *cd,
+ struct cache_head *h)
+{
+ struct svc_export *exp ;
+
+ if (h ==NULL) {
+ seq_puts(m, "#path domain(flags)\n");
+ return 0;
+ }
+ exp = container_of(h, struct svc_export, h);
+ seq_path(m, &exp->ex_path, " \t\n\\");
+ seq_putc(m, '\t');
+ seq_escape(m, exp->ex_client->name, " \t\n\\");
+ seq_putc(m, '(');
+ if (test_bit(CACHE_VALID, &h->flags) &&
+ !test_bit(CACHE_NEGATIVE, &h->flags)) {
+ exp_flags(m, exp->ex_flags, exp->ex_fsid,
+ exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs);
+ if (exp->ex_uuid) {
+ int i;
+ seq_puts(m, ",uuid=");
+ /* hex bytes, with a ':' between each group of four */
+ for (i = 0; i < EX_UUID_LEN; i++) {
+ if ((i&3) == 0 && i)
+ seq_putc(m, ':');
+ seq_printf(m, "%02x", exp->ex_uuid[i]);
+ }
+ }
+ show_secinfo(m, exp);
+ }
+ seq_puts(m, ")\n");
+ return 0;
+}
+/*
+ * .match hook: two export entries are the same key when they have the
+ * same client domain and refer to the same path.  Returns 1 on a match,
+ * 0 otherwise.
+ */
+static int svc_export_match(struct cache_head *a, struct cache_head *b)
+{
+	struct svc_export *cur = container_of(a, struct svc_export, h);
+	struct svc_export *cand = container_of(b, struct svc_export, h);
+
+	if (cur->ex_client != cand->ex_client)
+		return 0;
+	return path_equal(&cur->ex_path, &cand->ex_path) ? 1 : 0;
+}
+
+/*
+ * .init hook: copy the key fields (client, path, owning cache) from
+ * @citem into the new entry @cnew, taking references on the client
+ * domain and the path, and reset the fs-locations, layout-type and uuid
+ * fields of the new entry.
+ */
+static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
+{
+ struct svc_export *new = container_of(cnew, struct svc_export, h);
+ struct svc_export *item = container_of(citem, struct svc_export, h);
+
+ kref_get(&item->ex_client->ref);
+ new->ex_client = item->ex_client;
+ new->ex_path = item->ex_path;
+ path_get(&item->ex_path);
+ new->ex_fslocs.locations = NULL;
+ new->ex_fslocs.locations_count = 0;
+ new->ex_fslocs.migrated = 0;
+ new->ex_layout_types = 0;
+ new->ex_uuid = NULL;
+ new->cd = item->cd;
+}
+
+/*
+ * .update hook: copy the export options from @citem into @cnew.
+ *
+ * The pointers ex_devid_map, ex_uuid and ex_fslocs.locations are moved
+ * rather than copied: the source fields are cleared so that only the
+ * new entry refers to them afterwards.
+ */
+static void export_update(struct cache_head *cnew, struct cache_head *citem)
+{
+ struct svc_export *new = container_of(cnew, struct svc_export, h);
+ struct svc_export *item = container_of(citem, struct svc_export, h);
+ int i;
+
+ new->ex_flags = item->ex_flags;
+ new->ex_anon_uid = item->ex_anon_uid;
+ new->ex_anon_gid = item->ex_anon_gid;
+ new->ex_fsid = item->ex_fsid;
+ new->ex_devid_map = item->ex_devid_map;
+ item->ex_devid_map = NULL;
+ new->ex_uuid = item->ex_uuid;
+ item->ex_uuid = NULL;
+ new->ex_fslocs.locations = item->ex_fslocs.locations;
+ item->ex_fslocs.locations = NULL;
+ new->ex_fslocs.locations_count = item->ex_fslocs.locations_count;
+ item->ex_fslocs.locations_count = 0;
+ new->ex_fslocs.migrated = item->ex_fslocs.migrated;
+ item->ex_fslocs.migrated = 0;
+ new->ex_layout_types = item->ex_layout_types;
+ new->ex_nflavors = item->ex_nflavors;
+ /* copies every slot, not only the first ex_nflavors */
+ for (i = 0; i < MAX_SECINFO_LIST; i++) {
+ new->ex_flavors[i] = item->ex_flavors[i];
+ }
+}
+
+/*
+ * .alloc hook: allocate an (uninitialised) svc_export and return its
+ * embedded cache_head, or NULL if the allocation fails.
+ */
+static struct cache_head *svc_export_alloc(void)
+{
+	struct svc_export *exp = kmalloc(sizeof(*exp), GFP_KERNEL);
+
+	return exp ? &exp->h : NULL;
+}
+
+/*
+ * Template for the export cache ("nfsd.export"); nfsd_export_init()
+ * passes it to cache_create_net() to create the per-net instance.
+ */
+static const struct cache_detail svc_export_cache_template = {
+ .owner = THIS_MODULE,
+ .hash_size = EXPORT_HASHMAX,
+ .name = "nfsd.export",
+ .cache_put = svc_export_put,
+ .cache_upcall = svc_export_upcall,
+ .cache_request = svc_export_request,
+ .cache_parse = svc_export_parse,
+ .cache_show = svc_export_show,
+ .match = svc_export_match,
+ .init = svc_export_init,
+ .update = export_update,
+ .alloc = svc_export_alloc,
+};
+
+/*
+ * Compute the hash bucket for an export by combining the hashes of its
+ * client, dentry and vfsmount pointers.
+ */
+static int
+svc_export_hash(struct svc_export *exp)
+{
+	return hash_ptr(exp->ex_client, EXPORT_HASHBITS) ^
+	       hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS) ^
+	       hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS);
+}
+
+/*
+ * Look @exp up in the cache it names in exp->cd via
+ * sunrpc_cache_lookup_rcu().  Returns the svc_export containing the
+ * cache_head that call yields, or NULL if it returns NULL.
+ */
+static struct svc_export *
+svc_export_lookup(struct svc_export *exp)
+{
+	struct cache_head *found;
+
+	found = sunrpc_cache_lookup_rcu(exp->cd, &exp->h,
+					svc_export_hash(exp));
+	if (!found)
+		return NULL;
+	return container_of(found, struct svc_export, h);
+}
+
+/*
+ * Update cache entry @old with the content of @new via
+ * sunrpc_cache_update(); both the cache and the bucket are taken from
+ * @old.  Returns the svc_export containing the cache_head that call
+ * yields, or NULL if it returns NULL.
+ */
+static struct svc_export *
+svc_export_update(struct svc_export *new, struct svc_export *old)
+{
+	struct cache_head *updated;
+
+	updated = sunrpc_cache_update(old->cd, &new->h, &old->h,
+				      svc_export_hash(old));
+	if (!updated)
+		return NULL;
+	return container_of(updated, struct svc_export, h);
+}
+
+
+/*
+ * Find the expkey entry for (@clp, @fsid_type, @fsidv) in cache @cd and
+ * validate it with cache_check().
+ *
+ * Returns the entry on success, otherwise an ERR_PTR: -ENOENT if @clp
+ * is NULL, -ENOMEM if the lookup yields nothing, or the error reported
+ * by cache_check().
+ */
+static struct svc_expkey *
+exp_find_key(struct cache_detail *cd, struct auth_domain *clp, int fsid_type,
+ u32 *fsidv, struct cache_req *reqp)
+{
+ struct svc_expkey key, *ek;
+ int err;
+
+ if (!clp)
+ return ERR_PTR(-ENOENT);
+
+ /* only the key fields of the on-stack template are filled in */
+ key.ek_client = clp;
+ key.ek_fsidtype = fsid_type;
+ memcpy(key.ek_fsid, fsidv, key_len(fsid_type));
+
+ ek = svc_expkey_lookup(cd, &key);
+ if (ek == NULL)
+ return ERR_PTR(-ENOMEM);
+ err = cache_check(cd, &ek->h, reqp);
+ if (err) {
+ trace_nfsd_exp_find_key(&key, err);
+ return ERR_PTR(err);
+ }
+ return ek;
+}
+
+/*
+ * Find the export entry for (@clp, @path) in cache @cd and validate it
+ * with cache_check().
+ *
+ * Returns the entry on success, otherwise an ERR_PTR: -ENOENT if @clp
+ * is NULL, -ENOMEM if the lookup yields nothing, or the error reported
+ * by cache_check().
+ */
+static struct svc_export *
+exp_get_by_name(struct cache_detail *cd, struct auth_domain *clp,
+ const struct path *path, struct cache_req *reqp)
+{
+ struct svc_export *exp, key;
+ int err;
+
+ if (!clp)
+ return ERR_PTR(-ENOENT);
+
+ /* only the key fields of the on-stack template are filled in */
+ key.ex_client = clp;
+ key.ex_path = *path;
+ key.cd = cd;
+
+ exp = svc_export_lookup(&key);
+ if (exp == NULL)
+ return ERR_PTR(-ENOMEM);
+ err = cache_check(cd, &exp->h, reqp);
+ if (err) {
+ trace_nfsd_exp_get_by_name(&key, err);
+ return ERR_PTR(err);
+ }
+ return exp;
+}
+
+/*
+ * Find the export entry for a given dentry.
+ *
+ * Tries @path itself first and, while the lookup reports -ENOENT, walks
+ * up through the parent dentries until an export is found, another
+ * error occurs, or the root dentry has been tried.  path->dentry is
+ * modified during the walk and restored to its original value before
+ * returning.  Returns the export or an ERR_PTR.
+ */
+static struct svc_export *
+exp_parent(struct cache_detail *cd, struct auth_domain *clp, struct path *path)
+{
+ struct dentry *saved = dget(path->dentry);
+ struct svc_export *exp = exp_get_by_name(cd, clp, path, NULL);
+
+ while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
+ struct dentry *parent = dget_parent(path->dentry);
+ dput(path->dentry);
+ path->dentry = parent;
+ exp = exp_get_by_name(cd, clp, path, NULL);
+ }
+ /* drop the reference on whichever dentry the walk ended at */
+ dput(path->dentry);
+ path->dentry = saved;
+ return exp;
+}
+
+
+
+/*
+ * Obtain the root fh on behalf of a client.
+ * This could be done in user space, but I feel that it adds some safety
+ * since it's harder to fool a kernel module than a user space program.
+ *
+ * Resolves @name, finds the export covering it for client @clp (walking
+ * up parent directories via exp_parent()), composes a file handle and
+ * copies it into @f.
+ *
+ * Returns 0 on success, -EPERM if the path cannot be resolved, the
+ * error from the export lookup, or -EINVAL if fh_compose() fails.
+ */
+int
+exp_rootfh(struct net *net, struct auth_domain *clp, char *name,
+ struct knfsd_fh *f, int maxsize)
+{
+ struct svc_export *exp;
+ struct path path;
+ struct inode *inode;
+ struct svc_fh fh;
+ int err;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct cache_detail *cd = nn->svc_export_cache;
+
+ err = -EPERM;
+ /* NB: we probably ought to check that it's NUL-terminated */
+ if (kern_path(name, 0, &path)) {
+ /* NOTE(review): this message has no log level and no trailing newline */
+ printk("nfsd: exp_rootfh path not found %s", name);
+ return err;
+ }
+ inode = d_inode(path.dentry);
+
+ dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n",
+ name, path.dentry, clp->name,
+ inode->i_sb->s_id, inode->i_ino);
+ exp = exp_parent(cd, clp, &path);
+ if (IS_ERR(exp)) {
+ err = PTR_ERR(exp);
+ goto out;
+ }
+
+ /*
+ * fh must be initialized before calling fh_compose
+ */
+ fh_init(&fh, maxsize);
+ if (fh_compose(&fh, exp, path.dentry, NULL))
+ err = -EINVAL;
+ else
+ err = 0;
+ /*
+ * The handle is copied out even when fh_compose() failed.
+ * NOTE(review): the copy is sizeof(struct knfsd_fh), not bounded by
+ * maxsize - confirm callers always pass a full-size buffer.
+ */
+ memcpy(f, &fh.fh_handle, sizeof(struct knfsd_fh));
+ fh_put(&fh);
+ exp_put(exp);
+out:
+ path_put(&path);
+ return err;
+}
+
+/*
+ * Find the export for (@clp, @fsid_type, @fsidv): look the fsid up in
+ * the expkey cache of the same net namespace as @cd to obtain a path,
+ * then look that path up in the export cache @cd.
+ *
+ * Returns the export or an ERR_PTR from either lookup.
+ */
+static struct svc_export *exp_find(struct cache_detail *cd,
+ struct auth_domain *clp, int fsid_type,
+ u32 *fsidv, struct cache_req *reqp)
+{
+ struct svc_export *exp;
+ struct nfsd_net *nn = net_generic(cd->net, nfsd_net_id);
+ struct svc_expkey *ek = exp_find_key(nn->svc_expkey_cache, clp, fsid_type, fsidv, reqp);
+ if (IS_ERR(ek))
+ return ERR_CAST(ek);
+
+ exp = exp_get_by_name(cd, clp, &ek->ek_path, reqp);
+ /* the expkey was only needed for its path */
+ cache_put(&ek->h, nn->svc_expkey_cache);
+
+ if (IS_ERR(exp))
+ return ERR_CAST(exp);
+ return exp;
+}
+
+/*
+ * Check whether the security flavor of request @rqstp is acceptable for
+ * export @exp.
+ *
+ * Returns 0 if access is allowed; otherwise nfserr_acces for protocol
+ * versions below 4 and nfserr_wrongsec for version 4 and above.
+ */
+__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
+{
+ struct exp_flavor_info *f;
+ struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+
+ /* legacy gss-only clients are always OK: */
+ if (exp->ex_client == rqstp->rq_gssclient)
+ return 0;
+ /* ip-address based client; check sec= export option: */
+ for (f = exp->ex_flavors; f < end; f++) {
+ if (f->pseudoflavor == rqstp->rq_cred.cr_flavor)
+ return 0;
+ }
+ /* defaults in absence of sec= options: */
+ if (exp->ex_nflavors == 0) {
+ if (rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL ||
+ rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX)
+ return 0;
+ }
+
+ /* If the compound op contains a spo_must_allowed op,
+ * it will be sent with integrity/protection which
+ * will have to be expressly allowed on mounts that
+ * don't support it
+ */
+
+ if (nfsd4_spo_must_allow(rqstp))
+ return 0;
+
+ return rqstp->rq_vers < 4 ? nfserr_acces : nfserr_wrongsec;
+}
+
+/*
+ * Uses rq_client and rq_gssclient to find an export; uses rq_client (an
+ * auth_unix client) if it's available and has secinfo information;
+ * otherwise, will try to use rq_gssclient.
+ *
+ * Called from functions that handle requests; functions that do work on
+ * behalf of mountd are passed a single client name to use, and should
+ * use exp_get_by_name() or exp_find().
+ *
+ * Returns the export found for @path, or an ERR_PTR; ERR_PTR(-ENOENT)
+ * when neither client yields an export.
+ */
+struct svc_export *
+rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path)
+{
+ struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT);
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct cache_detail *cd = nn->svc_export_cache;
+
+ if (rqstp->rq_client == NULL)
+ goto gss;
+
+ /* First try the auth_unix client: */
+ exp = exp_get_by_name(cd, rqstp->rq_client, path, &rqstp->rq_chandle);
+ if (PTR_ERR(exp) == -ENOENT)
+ goto gss;
+ if (IS_ERR(exp))
+ return exp;
+ /* If it has secinfo, assume there are no gss/... clients */
+ if (exp->ex_nflavors > 0)
+ return exp;
+gss:
+ /* Otherwise, try falling back on gss client */
+ if (rqstp->rq_gssclient == NULL)
+ return exp;
+ gssexp = exp_get_by_name(cd, rqstp->rq_gssclient, path, &rqstp->rq_chandle);
+ if (PTR_ERR(gssexp) == -ENOENT)
+ return exp;
+ /* the gss result (export or error) replaces any auth_unix export */
+ if (!IS_ERR(exp))
+ exp_put(exp);
+ return gssexp;
+}
+
+/*
+ * Find the export for an fsid on behalf of request @rqstp.  Like
+ * rqst_exp_get_by_name(), but keyed by (@fsid_type, @fsidv) through
+ * exp_find(): the auth_unix client is tried first and the gss client is
+ * used as a fallback.
+ *
+ * Returns the export, or an ERR_PTR; ERR_PTR(-ENOENT) when neither
+ * client yields an export.
+ */
+struct svc_export *
+rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv)
+{
+ struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT);
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct cache_detail *cd = nn->svc_export_cache;
+
+ if (rqstp->rq_client == NULL)
+ goto gss;
+
+ /* First try the auth_unix client: */
+ exp = exp_find(cd, rqstp->rq_client, fsid_type,
+ fsidv, &rqstp->rq_chandle);
+ if (PTR_ERR(exp) == -ENOENT)
+ goto gss;
+ if (IS_ERR(exp))
+ return exp;
+ /* If it has secinfo, assume there are no gss/... clients */
+ if (exp->ex_nflavors > 0)
+ return exp;
+gss:
+ /* Otherwise, try falling back on gss client */
+ if (rqstp->rq_gssclient == NULL)
+ return exp;
+ gssexp = exp_find(cd, rqstp->rq_gssclient, fsid_type, fsidv,
+ &rqstp->rq_chandle);
+ if (PTR_ERR(gssexp) == -ENOENT)
+ return exp;
+ /* the gss result (export or error) replaces any auth_unix export */
+ if (!IS_ERR(exp))
+ exp_put(exp);
+ return gssexp;
+}
+
+/*
+ * Request-side counterpart of exp_parent(): find the export covering
+ * @path for request @rqstp, walking up through the parent dentries
+ * while the lookup reports -ENOENT.  path->dentry is modified during
+ * the walk and restored before returning.  Returns the export or an
+ * ERR_PTR.
+ */
+struct svc_export *
+rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
+{
+ struct dentry *saved = dget(path->dentry);
+ struct svc_export *exp = rqst_exp_get_by_name(rqstp, path);
+
+ while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
+ struct dentry *parent = dget_parent(path->dentry);
+ dput(path->dentry);
+ path->dentry = parent;
+ exp = rqst_exp_get_by_name(rqstp, path);
+ }
+ /* drop the reference on whichever dentry the walk ended at */
+ dput(path->dentry);
+ path->dentry = saved;
+ return exp;
+}
+
+/*
+ * Find the export with numeric fsid 0 for request @rqstp.  Returns the
+ * export or an ERR_PTR, as rqst_exp_find() does.
+ */
+struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp)
+{
+ u32 fsidv[2];
+
+ mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
+
+ return rqst_exp_find(rqstp, FSID_NUM, fsidv);
+}
+
+/*
+ * Called when we need the filehandle for the root of the pseudofs,
+ * for a given NFSv4 client. The root is defined to be the
+ * export point with fsid==0
+ *
+ * Composes the handle into @fhp.  Returns the status of fh_compose(),
+ * or the nfs error mapped from the lookup failure if no such export is
+ * found.
+ */
+__be32
+exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
+{
+ struct svc_export *exp;
+ __be32 rv;
+
+ exp = rqst_find_fsidzero_export(rqstp);
+ if (IS_ERR(exp))
+ return nfserrno(PTR_ERR(exp));
+ rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
+ exp_put(exp);
+ return rv;
+}
+
+/*
+ * Table mapping export flag bits to their option names, used by
+ * show_expflags().  name[0] is printed when the flag is set, name[1]
+ * when it is clear; an empty string prints nothing.  The table ends
+ * with an entry whose flag is 0.
+ */
+static struct flags {
+ int flag;
+ char *name[2];
+} expflags[] = {
+ { NFSEXP_READONLY, {"ro", "rw"}},
+ { NFSEXP_INSECURE_PORT, {"insecure", ""}},
+ { NFSEXP_ROOTSQUASH, {"root_squash", "no_root_squash"}},
+ { NFSEXP_ALLSQUASH, {"all_squash", ""}},
+ { NFSEXP_ASYNC, {"async", "sync"}},
+ { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}},
+ { NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}},
+ { NFSEXP_NOHIDE, {"nohide", ""}},
+ { NFSEXP_CROSSMOUNT, {"crossmnt", ""}},
+ { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
+ { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
+ { NFSEXP_V4ROOT, {"v4root", ""}},
+ { NFSEXP_PNFS, {"pnfs", ""}},
+ { NFSEXP_SECURITY_LABEL, {"security_label", ""}},
+ { 0, {"", ""}}
+};
+
+/*
+ * Print the option names for @flags as a comma-separated list, using
+ * the expflags table.  Only table entries whose flag bit lies within
+ * @mask are considered.
+ */
+static void show_expflags(struct seq_file *m, int flags, int mask)
+{
+ struct flags *flg;
+ int state, first = 0;
+
+ for (flg = expflags; flg->flag; flg++) {
+ if (flg->flag & ~mask)
+ continue;
+ /* index 0 selects the "set" name, index 1 the "clear" name */
+ state = (flg->flag & flags) ? 0 : 1;
+ if (*flg->name[state])
+ seq_printf(m, "%s%s", first++?",":"", flg->name[state]);
+ }
+}
+
+/*
+ * Print a comma followed by the names of the per-flavor
+ * (NFSEXP_SECINFO_FLAGS) options in @flags.
+ */
+static void show_secinfo_flags(struct seq_file *m, int flags)
+{
+ seq_printf(m, ",");
+ show_expflags(m, flags, NFSEXP_SECINFO_FLAGS);
+}
+
+/*
+ * Return true if @f and @g agree on every bit covered by
+ * NFSEXP_SECINFO_FLAGS; bits outside that mask are ignored.
+ */
+static bool secinfo_flags_equal(int f, int g)
+{
+	return (f & NFSEXP_SECINFO_FLAGS) == (g & NFSEXP_SECINFO_FLAGS);
+}
+
+/*
+ * Print one run of consecutive flavors that share the same secinfo
+ * flags, as ",sec=A:B:...", starting at *fp.  *fp is advanced past the
+ * run (it must not already equal @end on entry).  Returns the flags of
+ * the first flavor of the run.
+ */
+static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end)
+{
+ int flags;
+
+ flags = (*fp)->flags;
+ seq_printf(m, ",sec=%d", (*fp)->pseudoflavor);
+ (*fp)++;
+ while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) {
+ seq_printf(m, ":%d", (*fp)->pseudoflavor);
+ (*fp)++;
+ }
+ return flags;
+}
+
+/*
+ * Print the sec= options of @exp, grouping consecutive flavors with
+ * equal secinfo flags into one run.  Prints nothing if the export has
+ * no flavors.
+ */
+static void show_secinfo(struct seq_file *m, struct svc_export *exp)
+{
+ struct exp_flavor_info *f;
+ struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+ int flags;
+
+ if (exp->ex_nflavors == 0)
+ return;
+ f = exp->ex_flavors;
+ flags = show_secinfo_run(m, &f, end);
+ /* the first run's flags are omitted when they match the export's own */
+ if (!secinfo_flags_equal(flags, exp->ex_flags))
+ show_secinfo_flags(m, flags);
+ while (f != end) {
+ flags = show_secinfo_run(m, &f, end);
+ show_secinfo_flags(m, flags);
+ }
+}
+
+/*
+ * Print the option list of an export: the flag names, then fsid= if
+ * NFSEXP_FSID is set, anonuid=/anongid= unless they have one of the two
+ * default values tested below, and the fs locations (as "refer" when
+ * migrated, "replicas" otherwise) if any are present.  Ids are
+ * translated through the user namespace of the file's opener.
+ */
+static void exp_flags(struct seq_file *m, int flag, int fsid,
+ kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fsloc)
+{
+ struct user_namespace *userns = m->file->f_cred->user_ns;
+
+ show_expflags(m, flag, NFSEXP_ALLFLAGS);
+ if (flag & NFSEXP_FSID)
+ seq_printf(m, ",fsid=%d", fsid);
+ if (!uid_eq(anonu, make_kuid(userns, (uid_t)-2)) &&
+ !uid_eq(anonu, make_kuid(userns, 0x10000-2)))
+ seq_printf(m, ",anonuid=%u", from_kuid_munged(userns, anonu));
+ if (!gid_eq(anong, make_kgid(userns, (gid_t)-2)) &&
+ !gid_eq(anong, make_kgid(userns, 0x10000-2)))
+ seq_printf(m, ",anongid=%u", from_kgid_munged(userns, anong));
+ if (fsloc && fsloc->locations_count > 0) {
+ char *loctype = (fsloc->migrated) ? "refer" : "replicas";
+ int i;
+
+ /* entries are printed as path@hosts, separated by ';' */
+ seq_printf(m, ",%s=", loctype);
+ seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\");
+ seq_putc(m, '@');
+ seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\");
+ for (i = 1; i < fsloc->locations_count; i++) {
+ seq_putc(m, ';');
+ seq_escape(m, fsloc->locations[i].path, ",;@ \t\n\\");
+ seq_putc(m, '@');
+ seq_escape(m, fsloc->locations[i].hosts, ",;@ \t\n\\");
+ }
+ }
+}
+
+/*
+ * seq_file ->show callback for the exports listing (nfs_exports_op).
+ *
+ * Prints the two header lines for SEQ_START_TOKEN.  For a cache entry
+ * it validates the entry with cache_check() and, if that succeeds,
+ * prints it with svc_export_show().  Entries that are being torn down
+ * or that fail validation are skipped.  Always returns 0 except for
+ * whatever svc_export_show() returns.
+ */
+static int e_show(struct seq_file *m, void *p)
+{
+	struct cache_head *cp = p;
+	struct svc_export *exp = container_of(cp, struct svc_export, h);
+	struct cache_detail *cd = m->private;
+
+	if (p == SEQ_START_TOKEN) {
+		seq_puts(m, "# Version 1.1\n");
+		seq_puts(m, "# Path Client(Flags) # IPs\n");
+		return 0;
+	}
+
+	/*
+	 * The iterator (cache_seq_*_rcu) hands us entries without holding a
+	 * reference, so the refcount may already have dropped to zero and
+	 * svc_export_put() may be running.  Taking an unconditional
+	 * reference would resurrect such an entry; skip it instead.
+	 */
+	if (!kref_get_unless_zero(&exp->h.ref))
+		return 0;
+
+	/* no exp_put() on failure, matching the other cache_check() callers */
+	if (cache_check(cd, &exp->h, NULL))
+		return 0;
+
+	exp_put(exp);
+	return svc_export_show(m, cd, cp);
+}
+
+/*
+ * seq_file operations for listing the export cache: iteration uses the
+ * cache_seq_*_rcu helpers and each entry is printed by e_show().
+ */
+const struct seq_operations nfs_exports_op = {
+ .start = cache_seq_start_rcu,
+ .next = cache_seq_next_rcu,
+ .stop = cache_seq_stop_rcu,
+ .show = e_show,
+};
+
+/*
+ * Initialize the exports module.
+ *
+ * Creates and registers the per-net export and expkey caches for @net.
+ * Returns 0 on success or a negative errno; on failure everything set
+ * up so far is torn down again in reverse order.
+ */
+int
+nfsd_export_init(struct net *net)
+{
+ int rv;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ dprintk("nfsd: initializing export module (net: %x).\n", net->ns.inum);
+
+ nn->svc_export_cache = cache_create_net(&svc_export_cache_template, net);
+ if (IS_ERR(nn->svc_export_cache))
+ return PTR_ERR(nn->svc_export_cache);
+ rv = cache_register_net(nn->svc_export_cache, net);
+ if (rv)
+ goto destroy_export_cache;
+
+ nn->svc_expkey_cache = cache_create_net(&svc_expkey_cache_template, net);
+ if (IS_ERR(nn->svc_expkey_cache)) {
+ rv = PTR_ERR(nn->svc_expkey_cache);
+ goto unregister_export_cache;
+ }
+ rv = cache_register_net(nn->svc_expkey_cache, net);
+ if (rv)
+ goto destroy_expkey_cache;
+ return 0;
+
+ /* error unwinding: each label undoes one earlier step and falls through */
+destroy_expkey_cache:
+ cache_destroy_net(nn->svc_expkey_cache, net);
+unregister_export_cache:
+ cache_unregister_net(nn->svc_export_cache, net);
+destroy_export_cache:
+ cache_destroy_net(nn->svc_export_cache, net);
+ return rv;
+}
+
+/*
+ * Flush exports table - called when last nfsd thread is killed
+ *
+ * Purges the expkey cache first, then the export cache, for @net.
+ */
+void
+nfsd_export_flush(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ cache_purge(nn->svc_expkey_cache);
+ cache_purge(nn->svc_export_cache);
+}
+
+/*
+ * Shutdown the exports module.
+ *
+ * Unregisters and destroys both per-net caches created by
+ * nfsd_export_init(), then calls svcauth_unix_purge() for @net.
+ */
+void
+nfsd_export_shutdown(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ dprintk("nfsd: shutting down export module (net: %x).\n", net->ns.inum);
+
+ /* both caches are unregistered before either one is destroyed */
+ cache_unregister_net(nn->svc_expkey_cache, net);
+ cache_unregister_net(nn->svc_export_cache, net);
+ cache_destroy_net(nn->svc_expkey_cache, net);
+ cache_destroy_net(nn->svc_export_cache, net);
+ svcauth_unix_purge(net);
+
+ dprintk("nfsd: export shutdown complete (net: %x).\n", net->ns.inum);
+}
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
new file mode 100644
index 000000000..e7daa1f24
--- /dev/null
+++ b/fs/nfsd/export.h
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ */
+#ifndef NFSD_EXPORT_H
+#define NFSD_EXPORT_H
+
+#include <linux/sunrpc/cache.h>
+#include <uapi/linux/nfsd/export.h>
+#include <linux/nfs4.h>
+
+struct knfsd_fh;
+struct svc_fh;
+struct svc_rqst;
+
+/*
+ * FS Locations
+ */
+
+#define MAX_FS_LOCATIONS 128
+
+/* Element type of the locations array in struct nfsd4_fs_locations. */
+struct nfsd4_fs_location {
+ char *hosts; /* colon separated list of hosts */
+ char *path; /* slash separated list of path components */
+};
+
+/* Set of fs locations embedded in an export (svc_export.ex_fslocs). */
+struct nfsd4_fs_locations {
+ uint32_t locations_count; /* presumably number of entries in locations - confirm */
+ struct nfsd4_fs_location *locations;
+/* If we're not actually serving this data ourselves (only providing a
+ * list of replicas that do serve it) then we set "migrated": */
+ int migrated;
+};
+
+/*
+ * We keep an array of pseudoflavors with the export, in order from most
+ * to least preferred. For the foreseeable future, we don't expect more
+ * than the eight pseudoflavors null, unix, krb5, krb5i, krb5p, spkm3,
+ * spkm3i, and spkm3p (and using all 8 at once should be rare).
+ */
+#define MAX_SECINFO_LIST 8
+#define EX_UUID_LEN 16
+
+/* Element type of svc_export.ex_flavors[]. */
+struct exp_flavor_info {
+ u32 pseudoflavor;
+ u32 flags; /* presumably export flags that apply to this flavor - confirm */
+};
+
+/*
+ * An export, kept as an entry in a sunrpc cache: it embeds a cache_head
+ * and records the cache_detail it belongs to so that exp_put() can call
+ * cache_put() without further context.
+ */
+struct svc_export {
+ struct cache_head h; /* reference taken/dropped by exp_get()/exp_put() */
+ struct auth_domain * ex_client;
+ int ex_flags; /* NFSEXP_* bits, tested by the EX_*() macros */
+ struct path ex_path;
+ kuid_t ex_anon_uid;
+ kgid_t ex_anon_gid;
+ int ex_fsid;
+ unsigned char * ex_uuid; /* 16 byte fsid */
+ struct nfsd4_fs_locations ex_fslocs;
+ uint32_t ex_nflavors; /* presumably number of valid ex_flavors[] entries - confirm */
+ struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST];
+ u32 ex_layout_types;
+ struct nfsd4_deviceid_map *ex_devid_map;
+ struct cache_detail *cd; /* cache passed to cache_put() in exp_put() */
+ struct rcu_head ex_rcu;
+};
+
+/* an "export key" (expkey) maps a filehandle fragment to an
+ * svc_export for a given client. There can be several per export,
+ * for the different fsid types.
+ */
+struct svc_expkey {
+ struct cache_head h;
+
+ struct auth_domain * ek_client;
+ int ek_fsidtype;
+ u32 ek_fsid[6];
+
+ struct path ek_path;
+ struct rcu_head ek_rcu;
+};
+
+/*
+ * Convenience tests on an export's ex_flags. EX_ISSYNC() is true when
+ * NFSEXP_ASYNC is clear; the other two yield the masked flag bit itself
+ * (non-zero when set), not a normalised 0/1.
+ */
+#define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC))
+#define EX_NOHIDE(exp) ((exp)->ex_flags & NFSEXP_NOHIDE)
+#define EX_WGATHER(exp) ((exp)->ex_flags & NFSEXP_GATHERED_WRITES)
+
+int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp);
+__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp);
+
+/*
+ * Function declarations
+ */
+int nfsd_export_init(struct net *);
+void nfsd_export_shutdown(struct net *);
+void nfsd_export_flush(struct net *);
+struct svc_export * rqst_exp_get_by_name(struct svc_rqst *,
+ struct path *);
+struct svc_export * rqst_exp_parent(struct svc_rqst *,
+ struct path *);
+struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *);
+int exp_rootfh(struct net *, struct auth_domain *,
+ char *path, struct knfsd_fh *, int maxsize);
+__be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *);
+__be32 nfserrno(int errno);
+
+/* Drop a reference on @exp by calling cache_put() on its cache_head and owning cache. */
+static inline void exp_put(struct svc_export *exp)
+{
+ cache_put(&exp->h, exp->cd);
+}
+
+/* Take a reference on @exp via cache_get() and return @exp for call chaining. */
+static inline struct svc_export *exp_get(struct svc_export *exp)
+{
+ cache_get(&exp->h);
+ return exp;
+}
+struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *);
+
+#endif /* NFSD_EXPORT_H */
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
new file mode 100644
index 000000000..76bee0a0d
--- /dev/null
+++ b/fs/nfsd/fault_inject.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
+ *
+ * Uses debugfs to create fault injection points for client testing
+ */
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+
+#include "state.h"
+#include "netns.h"
+
+/*
+ * One fault injection point, exposed as a single debugfs file.
+ */
+struct nfsd_fault_inject_op {
+ char *file; /* name of the debugfs file */
+ u64 (*get)(void); /* called by a read at file offset 0 */
+ u64 (*set_val)(u64); /* called when the written text is not an address */
+ u64 (*set_clnt)(struct sockaddr_storage *, size_t); /* called when rpc_pton() parses the written text */
+};
+
+static struct dentry *debug_dir;
+
+/*
+ * Read handler for the fault injection files: formats the value returned
+ * by op->get() as "%llu\n" and copies it to userspace.
+ *
+ * op->get() is only called when the read starts at offset 0. The result is
+ * kept in a function-static variable so that reads at later offsets format
+ * the same number; being static, that variable is shared by all files and
+ * all readers using this handler.
+ */
+static ssize_t fault_inject_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ static u64 val;
+ char read_buf[25];
+ size_t size;
+ loff_t pos = *ppos;
+ struct nfsd_fault_inject_op *op = file_inode(file)->i_private;
+
+ if (!pos)
+ val = op->get();
+ size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val);
+
+ return simple_read_from_buffer(buf, len, ppos, read_buf, size);
+}
+
+/*
+ * Write handler for the fault injection files.
+ *
+ * At most sizeof(write_buf) - 1 bytes of input are examined, and anything
+ * from the first newline onwards is discarded. If rpc_pton() accepts the
+ * text as an address, op->set_clnt() is called with it; otherwise the text
+ * is parsed as a number with simple_strtoll() and passed to op->set_val().
+ *
+ * Returns -EFAULT if the user buffer cannot be copied, else @len, even
+ * when only part of the input was used.
+ */
+static ssize_t fault_inject_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ char write_buf[INET6_ADDRSTRLEN];
+ size_t size = min(sizeof(write_buf) - 1, len);
+ struct net *net = current->nsproxy->net_ns;
+ struct sockaddr_storage sa;
+ struct nfsd_fault_inject_op *op = file_inode(file)->i_private;
+ u64 val;
+ char *nl;
+
+ if (copy_from_user(write_buf, buf, size))
+ return -EFAULT;
+ write_buf[size] = '\0';
+
+ /* Deal with any embedded newlines in the string */
+ nl = strchr(write_buf, '\n');
+ if (nl) {
+ size = nl - write_buf;
+ *nl = '\0';
+ }
+
+ /* size is reused here to hold rpc_pton()'s result */
+ size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa));
+ if (size > 0) {
+ val = op->set_clnt(&sa, size);
+ if (val)
+ pr_info("NFSD [%s]: Client %s had %llu state object(s)\n",
+ op->file, write_buf, val);
+ } else {
+ val = simple_strtoll(write_buf, NULL, 0);
+ if (val == 0)
+ pr_info("NFSD Fault Injection: %s (all)", op->file);
+ else
+ pr_info("NFSD Fault Injection: %s (n = %llu)",
+ op->file, val);
+ val = op->set_val(val);
+ pr_info("NFSD: %s: found %llu", op->file, val);
+ }
+ return len; /* on success, claim we got the whole input */
+}
+
+static const struct file_operations fops_nfsd = {
+ .owner = THIS_MODULE,
+ .read = fault_inject_read,
+ .write = fault_inject_write,
+};
+
+/* Remove the debugfs directory set up by nfsd_fault_inject_init() and everything in it. */
+void nfsd_fault_inject_cleanup(void)
+{
+ debugfs_remove_recursive(debug_dir);
+}
+
+/*
+ * Table of fault injection points; nfsd_fault_inject_init() creates one
+ * debugfs file per entry. Note that "recall_delegations" uses the same
+ * .get handler as "forget_delegations".
+ */
+static struct nfsd_fault_inject_op inject_ops[] = {
+ {
+ .file = "forget_clients",
+ .get = nfsd_inject_print_clients,
+ .set_val = nfsd_inject_forget_clients,
+ .set_clnt = nfsd_inject_forget_client,
+ },
+ {
+ .file = "forget_locks",
+ .get = nfsd_inject_print_locks,
+ .set_val = nfsd_inject_forget_locks,
+ .set_clnt = nfsd_inject_forget_client_locks,
+ },
+ {
+ .file = "forget_openowners",
+ .get = nfsd_inject_print_openowners,
+ .set_val = nfsd_inject_forget_openowners,
+ .set_clnt = nfsd_inject_forget_client_openowners,
+ },
+ {
+ .file = "forget_delegations",
+ .get = nfsd_inject_print_delegations,
+ .set_val = nfsd_inject_forget_delegations,
+ .set_clnt = nfsd_inject_forget_client_delegations,
+ },
+ {
+ .file = "recall_delegations",
+ .get = nfsd_inject_print_delegations,
+ .set_val = nfsd_inject_recall_delegations,
+ .set_clnt = nfsd_inject_recall_client_delegations,
+ },
+};
+
+/*
+ * Create the "nfsd" debugfs directory and, inside it, one owner
+ * read/write regular file for every entry of inject_ops[]. Each file
+ * carries a pointer to its table entry as private data.
+ */
+void nfsd_fault_inject_init(void)
+{
+	const umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+	struct nfsd_fault_inject_op *op = inject_ops;
+	struct nfsd_fault_inject_op *end = inject_ops + ARRAY_SIZE(inject_ops);
+
+	debug_dir = debugfs_create_dir("nfsd", NULL);
+
+	for (; op < end; op++)
+		debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd);
+}
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
new file mode 100644
index 000000000..e30e1ddc1
--- /dev/null
+++ b/fs/nfsd/filecache.c
@@ -0,0 +1,1092 @@
+/*
+ * Open file cache.
+ *
+ * (c) 2015 - Jeff Layton <jeff.layton@primarydata.com>
+ */
+
+#include <linux/hash.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/sched.h>
+#include <linux/list_lru.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/fsnotify.h>
+#include <linux/seq_file.h>
+
+#include "vfs.h"
+#include "nfsd.h"
+#include "nfsfh.h"
+#include "netns.h"
+#include "filecache.h"
+#include "trace.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_FH
+
+/* FIXME: dynamically size this for the machine somehow? */
+#define NFSD_FILE_HASH_BITS 12
+#define NFSD_FILE_HASH_SIZE (1 << NFSD_FILE_HASH_BITS)
+/* Delay used when queueing the garbage collection work */
+#define NFSD_LAUNDRETTE_DELAY (2 * HZ)
+
+/* Bit number in nfsd_file_lru_flags */
+#define NFSD_FILE_SHUTDOWN (1)
+/* nfsd_file_acquire() runs the gc once the entry count reaches this after an insert */
+#define NFSD_FILE_LRU_THRESHOLD (4096UL)
+/* nfsd_file_put() runs the gc once the entry count reaches this */
+#define NFSD_FILE_LRU_LIMIT (NFSD_FILE_LRU_THRESHOLD << 2)
+
+/* We only care about NFSD_MAY_READ/WRITE for this cache */
+#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE)
+
+/* One bucket of nfsd_file_hashtbl. */
+struct nfsd_fcache_bucket {
+ struct hlist_head nfb_head; /* chain of nfsd_file.nf_node */
+ spinlock_t nfb_lock; /* held while adding to or removing from the chain */
+ unsigned int nfb_count; /* current number of entries in the chain */
+ unsigned int nfb_maxcount; /* largest nfb_count seen at insert time */
+};
+
+static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
+
+/*
+ * Per-net deferred disposal context: files queued on freeme are released
+ * later by the work item, which runs nfsd_file_delayed_close().
+ */
+struct nfsd_fcache_disposal {
+ struct list_head list; /* link in the global laundrettes list */
+ struct work_struct work; /* runs nfsd_file_delayed_close() */
+ struct net *net; /* namespace this context serves */
+ spinlock_t lock; /* protects freeme */
+ struct list_head freeme; /* nfsd_files (via nf_lru) awaiting release */
+ struct rcu_head rcu; /* used by kfree_rcu() when freeing this object */
+};
+
+static struct workqueue_struct *nfsd_filecache_wq __read_mostly;
+
+static struct kmem_cache *nfsd_file_slab;
+static struct kmem_cache *nfsd_file_mark_slab;
+static struct nfsd_fcache_bucket *nfsd_file_hashtbl;
+static struct list_lru nfsd_file_lru;
+static long nfsd_file_lru_flags;
+static struct fsnotify_group *nfsd_file_fsnotify_group;
+static atomic_long_t nfsd_filecache_count;
+static struct delayed_work nfsd_filecache_laundrette;
+static DEFINE_SPINLOCK(laundrette_lock);
+static LIST_HEAD(laundrettes);
+
+static void nfsd_file_gc(void);
+
+/*
+ * Queue the garbage collection work on system_wq to run after
+ * NFSD_LAUNDRETTE_DELAY. Does nothing if the cache is empty or the
+ * NFSD_FILE_SHUTDOWN bit is set.
+ */
+static void
+nfsd_file_schedule_laundrette(void)
+{
+ long count = atomic_long_read(&nfsd_filecache_count);
+
+ if (count == 0 || test_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags))
+ return;
+
+ queue_delayed_work(system_wq, &nfsd_filecache_laundrette,
+ NFSD_LAUNDRETTE_DELAY);
+}
+
+/*
+ * Final release of an nfsd_file: drop its credential reference and return
+ * the object to nfsd_file_slab. Used as the call_rcu() callback in
+ * nfsd_file_free(), and called directly by nfsd_file_acquire() for an
+ * entry that was allocated but never hashed.
+ */
+static void
+nfsd_file_slab_free(struct rcu_head *rcu)
+{
+ struct nfsd_file *nf = container_of(rcu, struct nfsd_file, nf_rcu);
+
+ put_cred(nf->nf_cred);
+ kmem_cache_free(nfsd_file_slab, nf);
+}
+
+/*
+ * .free_mark callback of nfsd_file_fsnotify_ops: return the nfsd_file_mark
+ * that contains @mark to nfsd_file_mark_slab.
+ */
+static void
+nfsd_file_mark_free(struct fsnotify_mark *mark)
+{
+ struct nfsd_file_mark *nfm = container_of(mark, struct nfsd_file_mark,
+ nfm_mark);
+
+ kmem_cache_free(nfsd_file_mark_slab, nfm);
+}
+
+/*
+ * Try to take an nfm_ref reference on @nfm. Yields @nfm on success, or
+ * NULL when the count had already reached zero.
+ */
+static struct nfsd_file_mark *
+nfsd_file_mark_get(struct nfsd_file_mark *nfm)
+{
+	return refcount_inc_not_zero(&nfm->nfm_ref) ? nfm : NULL;
+}
+
+/*
+ * Drop an nfm_ref reference. When the last one goes, destroy the fsnotify
+ * mark in nfsd's group and put the reference on the mark itself.
+ */
+static void
+nfsd_file_mark_put(struct nfsd_file_mark *nfm)
+{
+ if (refcount_dec_and_test(&nfm->nfm_ref)) {
+ fsnotify_destroy_mark(&nfm->nfm_mark, nfsd_file_fsnotify_group);
+ fsnotify_put_mark(&nfm->nfm_mark);
+ }
+}
+
+/*
+ * Return a referenced nfsd_file_mark for nf->nf_inode, either by taking a
+ * reference on the mark already attached to the inode in nfsd's fsnotify
+ * group or by allocating and attaching a new one.
+ *
+ * The loop is repeated while fsnotify_add_inode_mark() fails with -EEXIST.
+ * Returns NULL if allocation fails or the add fails with any other error.
+ */
+static struct nfsd_file_mark *
+nfsd_file_mark_find_or_create(struct nfsd_file *nf)
+{
+ int err;
+ struct fsnotify_mark *mark;
+ struct nfsd_file_mark *nfm = NULL, *new;
+ struct inode *inode = nf->nf_inode;
+
+ do {
+ mutex_lock(&nfsd_file_fsnotify_group->mark_mutex);
+ mark = fsnotify_find_mark(&inode->i_fsnotify_marks,
+ nfsd_file_fsnotify_group);
+ if (mark) {
+ nfm = nfsd_file_mark_get(container_of(mark,
+ struct nfsd_file_mark,
+ nfm_mark));
+ mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
+ if (nfm) {
+ /* got an nfm_ref; drop the reference on the found mark */
+ fsnotify_put_mark(mark);
+ break;
+ }
+ /* Avoid soft lockup race with nfsd_file_mark_put() */
+ fsnotify_destroy_mark(mark, nfsd_file_fsnotify_group);
+ fsnotify_put_mark(mark);
+ } else
+ mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
+
+ /* allocate a new nfm */
+ new = kmem_cache_alloc(nfsd_file_mark_slab, GFP_KERNEL);
+ if (!new)
+ return NULL;
+ fsnotify_init_mark(&new->nfm_mark, nfsd_file_fsnotify_group);
+ new->nfm_mark.mask = FS_ATTRIB|FS_DELETE_SELF;
+ refcount_set(&new->nfm_ref, 1);
+
+ err = fsnotify_add_inode_mark(&new->nfm_mark, inode, 0);
+
+ /*
+ * If the add was successful, then return the object.
+ * Otherwise, we need to put the reference we hold on the
+ * nfm_mark. The fsnotify code will take a reference and put
+ * it on failure, so we can't just free it directly. It's also
+ * not safe to call fsnotify_destroy_mark on it as the
+ * mark->group will be NULL. Thus, we can't let the nfm_ref
+ * counter drive the destruction at this point.
+ */
+ if (likely(!err))
+ nfm = new;
+ else
+ fsnotify_put_mark(&new->nfm_mark);
+ } while (unlikely(err == -EEXIST));
+
+ return nfm;
+}
+
+/*
+ * Allocate and initialise an unhashed nfsd_file for @inode in @net.
+ *
+ * The new object starts with one reference, no open file, no mark, and the
+ * credentials of the current task. Only the NFSD_FILE_MAY_MASK bits of
+ * @may are stored in nf_may. If @may has NFSD_MAY_NOT_BREAK_LEASE set, the
+ * BREAK_READ/BREAK_WRITE flags are set to match the requested access.
+ *
+ * Returns the new object, or NULL if the slab allocation fails.
+ */
+static struct nfsd_file *
+nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval,
+ struct net *net)
+{
+ struct nfsd_file *nf;
+
+ nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL);
+ if (nf) {
+ INIT_HLIST_NODE(&nf->nf_node);
+ INIT_LIST_HEAD(&nf->nf_lru);
+ nf->nf_file = NULL;
+ nf->nf_cred = get_current_cred();
+ nf->nf_net = net;
+ nf->nf_flags = 0;
+ nf->nf_inode = inode;
+ nf->nf_hashval = hashval;
+ refcount_set(&nf->nf_ref, 1);
+ nf->nf_may = may & NFSD_FILE_MAY_MASK;
+ if (may & NFSD_MAY_NOT_BREAK_LEASE) {
+ if (may & NFSD_MAY_WRITE)
+ __set_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags);
+ if (may & NFSD_MAY_READ)
+ __set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
+ }
+ nf->nf_mark = NULL;
+ trace_nfsd_file_alloc(nf);
+ }
+ return nf;
+}
+
+/*
+ * Tear down an nfsd_file whose last reference has been dropped: put its
+ * mark (if any), close its file (if any), and free the object after an
+ * RCU grace period.
+ *
+ * Returns true if a file was closed, which nfsd_file_dispose_list_sync()
+ * uses to decide whether to call flush_delayed_fput().
+ */
+static bool
+nfsd_file_free(struct nfsd_file *nf)
+{
+ bool flush = false;
+
+ trace_nfsd_file_put_final(nf);
+ if (nf->nf_mark)
+ nfsd_file_mark_put(nf->nf_mark);
+ if (nf->nf_file) {
+ /* extra reference held across filp_close(), dropped by the fput() below */
+ get_file(nf->nf_file);
+ filp_close(nf->nf_file, NULL);
+ fput(nf->nf_file);
+ flush = true;
+ }
+ call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
+ return flush;
+}
+
+/*
+ * Report whether @nf has an open file with write mode whose mapping still
+ * has pages tagged dirty or under writeback. Entries without a file, or
+ * whose file lacks FMODE_WRITE, always report false.
+ */
+static bool
+nfsd_file_check_writeback(struct nfsd_file *nf)
+{
+	struct file *filp = nf->nf_file;
+
+	if (filp && (filp->f_mode & FMODE_WRITE)) {
+		struct address_space *as = filp->f_mapping;
+
+		if (mapping_tagged(as, PAGECACHE_TAG_DIRTY))
+			return true;
+		return mapping_tagged(as, PAGECACHE_TAG_WRITEBACK);
+	}
+	return false;
+}
+
+/*
+ * Check the mapping of @nf's file for a writeback error relative to the
+ * file's f_wb_err. Yields 0 when there is no file or it lacks FMODE_WRITE;
+ * otherwise the result of filemap_check_wb_err().
+ */
+static int
+nfsd_file_check_write_error(struct nfsd_file *nf)
+{
+	struct file *filp = nf->nf_file;
+
+	if (filp && (filp->f_mode & FMODE_WRITE))
+		return filemap_check_wb_err(filp->f_mapping,
+					    READ_ONCE(filp->f_wb_err));
+	return 0;
+}
+
+/*
+ * Remove @nf from its hash chain and adjust the bucket and global counts.
+ * The caller must hold the bucket lock. If a write error is pending on the
+ * file, the boot verifier of nf->nf_net is reset first.
+ *
+ * This does not touch the NFSD_FILE_HASHED flag; callers clear it.
+ */
+static void
+nfsd_file_do_unhash(struct nfsd_file *nf)
+{
+ lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+
+ trace_nfsd_file_unhash(nf);
+
+ if (nfsd_file_check_write_error(nf))
+ nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id));
+ --nfsd_file_hashtbl[nf->nf_hashval].nfb_count;
+ hlist_del_rcu(&nf->nf_node);
+ atomic_long_dec(&nfsd_filecache_count);
+}
+
+/*
+ * If @nf is still marked NFSD_FILE_HASHED, clear the flag, unhash it and
+ * take it off the LRU. Returns true if this call did the unhashing, false
+ * if the flag was already clear. nfsd_file_do_unhash() asserts that the
+ * bucket lock is held.
+ */
+static bool
+nfsd_file_unhash(struct nfsd_file *nf)
+{
+ if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ nfsd_file_do_unhash(nf);
+ if (!list_empty(&nf->nf_lru))
+ list_lru_del(&nfsd_file_lru, &nf->nf_lru);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Unhash @nf and release one reference to it. The bucket lock must be held.
+ *
+ * If that reference is not the last one it is simply dropped. If it is the
+ * last one it is kept, and @nf is queued on @dispose (via nf_lru) so that
+ * the caller can put it after releasing the bucket lock.
+ *
+ * Return true if the file was unhashed.
+ */
+static bool
+nfsd_file_unhash_and_release_locked(struct nfsd_file *nf, struct list_head *dispose)
+{
+ lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+
+ trace_nfsd_file_unhash_and_release_locked(nf);
+ if (!nfsd_file_unhash(nf))
+ return false;
+ /* keep final reference for nfsd_file_lru_dispose */
+ if (refcount_dec_not_one(&nf->nf_ref))
+ return true;
+
+ list_add(&nf->nf_lru, dispose);
+ return true;
+}
+
+/*
+ * Drop one reference to @nf and free it if that was the last. Warns if an
+ * entry that is about to be freed is still marked NFSD_FILE_HASHED.
+ */
+static void
+nfsd_file_put_noref(struct nfsd_file *nf)
+{
+ trace_nfsd_file_put(nf);
+
+ if (refcount_dec_and_test(&nf->nf_ref)) {
+ WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
+ nfsd_file_free(nf);
+ }
+}
+
+/*
+ * Drop a reference to @nf that was obtained for use by a caller.
+ *
+ * The entry is marked NFSD_FILE_REFERENCED so that the next LRU scan skips
+ * it once. When the reference count is above 2 or there is no open file,
+ * nothing else happens. Otherwise dirty data is flushed, the garbage
+ * collector is scheduled if the entry was hashed, and the collector is run
+ * immediately if the cache has reached NFSD_FILE_LRU_LIMIT entries.
+ */
+void
+nfsd_file_put(struct nfsd_file *nf)
+{
+ bool is_hashed;
+
+ set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
+ /* presumably 2 == hash table reference + the one being put - confirm */
+ if (refcount_read(&nf->nf_ref) > 2 || !nf->nf_file) {
+ nfsd_file_put_noref(nf);
+ return;
+ }
+
+ filemap_flush(nf->nf_file->f_mapping);
+ /* sample the flag before the put: nf may be freed by it */
+ is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0;
+ nfsd_file_put_noref(nf);
+ if (is_hashed)
+ nfsd_file_schedule_laundrette();
+ if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT)
+ nfsd_file_gc();
+}
+
+/*
+ * Try to take a reference on @nf. Yields @nf on success, or NULL when the
+ * reference count had already dropped to zero.
+ */
+struct nfsd_file *
+nfsd_file_get(struct nfsd_file *nf)
+{
+	return likely(refcount_inc_not_zero(&nf->nf_ref)) ? nf : NULL;
+}
+
+/*
+ * Empty @dispose: unlink each nfsd_file (linked via nf_lru) and put one
+ * reference on it, freeing it if that was the last.
+ */
+static void
+nfsd_file_dispose_list(struct list_head *dispose)
+{
+ struct nfsd_file *nf;
+
+ while(!list_empty(dispose)) {
+ nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
+ list_del(&nf->nf_lru);
+ nfsd_file_put_noref(nf);
+ }
+}
+
+/*
+ * Like nfsd_file_dispose_list(), but if any entry had an open file that
+ * was closed, call flush_delayed_fput() before returning.
+ */
+static void
+nfsd_file_dispose_list_sync(struct list_head *dispose)
+{
+ bool flush = false;
+ struct nfsd_file *nf;
+
+ while(!list_empty(dispose)) {
+ nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
+ list_del(&nf->nf_lru);
+ if (!refcount_dec_and_test(&nf->nf_ref))
+ continue;
+ if (nfsd_file_free(nf))
+ flush = true;
+ }
+ if (flush)
+ flush_delayed_fput();
+}
+
+/* Move everything queued on l->freeme onto @dst, under l->lock, leaving freeme empty. */
+static void
+nfsd_file_list_remove_disposal(struct list_head *dst,
+ struct nfsd_fcache_disposal *l)
+{
+ spin_lock(&l->lock);
+ list_splice_init(&l->freeme, dst);
+ spin_unlock(&l->lock);
+}
+
+/*
+ * Hand the nfsd_files on @files to the disposal context registered for
+ * @net and queue its work item on nfsd_filecache_wq.
+ *
+ * If no context for @net is found on the laundrettes list, @files is left
+ * untouched.
+ */
+static void
+nfsd_file_list_add_disposal(struct list_head *files, struct net *net)
+{
+ struct nfsd_fcache_disposal *l;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(l, &laundrettes, list) {
+ if (l->net == net) {
+ spin_lock(&l->lock);
+ list_splice_tail_init(files, &l->freeme);
+ spin_unlock(&l->lock);
+ queue_work(nfsd_filecache_wq, &l->work);
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+
+/* Move every nfsd_file on @src that belongs to @net to the tail of @dst. */
+static void
+nfsd_file_list_add_pernet(struct list_head *dst, struct list_head *src,
+ struct net *net)
+{
+ struct nfsd_file *nf, *tmp;
+
+ list_for_each_entry_safe(nf, tmp, src, nf_lru) {
+ if (nf->nf_net == net)
+ list_move_tail(&nf->nf_lru, dst);
+ }
+}
+
+/*
+ * Dispose of the files on @dispose asynchronously: repeatedly take the
+ * first entry's net, gather all entries of that net onto a local list, and
+ * hand that list to the net's disposal context.
+ */
+static void
+nfsd_file_dispose_list_delayed(struct list_head *dispose)
+{
+ LIST_HEAD(list);
+ struct nfsd_file *nf;
+
+ while(!list_empty(dispose)) {
+ nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
+ nfsd_file_list_add_pernet(&list, dispose, nf->nf_net);
+ /* NOTE(review): entries stay on the local list if no context exists for this net */
+ nfsd_file_list_add_disposal(&list, nf->nf_net);
+ }
+}
+
+/*
+ * LRU walk callback: decide whether one nfsd_file can be reclaimed.
+ *
+ * @arg is the list head onto which reclaimable entries are moved. An entry
+ * is reclaimed only if nothing but the hash table references it, it has no
+ * dirty or writeback pages, it has not been put since the last scan, and
+ * this call is the one that clears its NFSD_FILE_HASHED flag. The entry is
+ * not removed from its hash chain here; nfsd_file_lru_walk_list() does
+ * that afterwards.
+ *
+ * Note this can deadlock with nfsd_file_cache_purge.
+ */
+static enum lru_status
+nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
+ spinlock_t *lock, void *arg)
+ __releases(lock)
+ __acquires(lock)
+{
+ struct list_head *head = arg;
+ struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
+
+ /*
+ * Do a lockless refcount check. The hashtable holds one reference, so
+ * we look to see if anything else has a reference, or if any have
+ * been put since the shrinker last ran. Those don't get unhashed and
+ * released.
+ *
+ * Note that in the put path, we set the flag and then decrement the
+ * counter. Here we check the counter and then test and clear the flag.
+ * That order is deliberate to ensure that we can do this locklessly.
+ */
+ if (refcount_read(&nf->nf_ref) > 1)
+ goto out_skip;
+
+ /*
+ * Don't throw out files that are still undergoing I/O or
+ * that have uncleared errors pending.
+ */
+ if (nfsd_file_check_writeback(nf))
+ goto out_skip;
+
+ if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags))
+ goto out_skip;
+
+ if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags))
+ goto out_skip;
+
+ list_lru_isolate_move(lru, &nf->nf_lru, head);
+ return LRU_REMOVED;
+out_skip:
+ return LRU_SKIP;
+}
+
+/*
+ * Walk the LRU and reclaim idle entries.
+ *
+ * With a shrink_control the walk is bounded by it; with @sc == NULL the
+ * walk is done with a limit of LONG_MAX. Entries isolated by
+ * nfsd_file_lru_cb() already have NFSD_FILE_HASHED cleared, so they are
+ * removed from their hash chains here under the bucket lock and then
+ * passed on for delayed disposal.
+ *
+ * Returns the value returned by the list_lru walk.
+ */
+static unsigned long
+nfsd_file_lru_walk_list(struct shrink_control *sc)
+{
+ LIST_HEAD(head);
+ struct nfsd_file *nf;
+ unsigned long ret;
+
+ if (sc)
+ ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
+ nfsd_file_lru_cb, &head);
+ else
+ ret = list_lru_walk(&nfsd_file_lru,
+ nfsd_file_lru_cb,
+ &head, LONG_MAX);
+ list_for_each_entry(nf, &head, nf_lru) {
+ spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+ nfsd_file_do_unhash(nf);
+ spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+ }
+ nfsd_file_dispose_list_delayed(&head);
+ return ret;
+}
+
+/* Run a garbage collection pass over the whole LRU (no shrink_control). */
+static void
+nfsd_file_gc(void)
+{
+ nfsd_file_lru_walk_list(NULL);
+}
+
+/*
+ * Work function of nfsd_filecache_laundrette: run a gc pass, then ask for
+ * the work to be queued again (which only happens while the cache is
+ * non-empty and not shutting down).
+ */
+static void
+nfsd_file_gc_worker(struct work_struct *work)
+{
+ nfsd_file_gc();
+ nfsd_file_schedule_laundrette();
+}
+
+/* Shrinker .count_objects callback: report the number of entries on the LRU. */
+static unsigned long
+nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc)
+{
+ return list_lru_count(&nfsd_file_lru);
+}
+
+/* Shrinker .scan_objects callback: walk the LRU as directed by @sc. */
+static unsigned long
+nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
+{
+ return nfsd_file_lru_walk_list(sc);
+}
+
+static struct shrinker nfsd_file_shrinker = {
+ .scan_objects = nfsd_file_lru_scan,
+ .count_objects = nfsd_file_lru_count,
+ .seeks = 1,
+};
+
+/*
+ * Under the bucket lock for @hashval, unhash every nfsd_file in that
+ * bucket whose nf_inode is @inode and release the hash table's reference.
+ * Entries left holding only their final reference are queued on @dispose
+ * for the caller to put.
+ */
+static void
+__nfsd_file_close_inode(struct inode *inode, unsigned int hashval,
+ struct list_head *dispose)
+{
+ struct nfsd_file *nf;
+ struct hlist_node *tmp;
+
+ spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
+ hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) {
+ if (inode == nf->nf_inode)
+ nfsd_file_unhash_and_release_locked(nf, dispose);
+ }
+ spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+}
+
+/**
+ * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
+ * @inode: inode of the file to attempt to remove
+ *
+ * Scan the hash bucket that "inode" maps to for cached files belonging to
+ * it. Each match is unhashed and loses the hashtable's reference; entries
+ * whose last reference that was are destroyed before returning, including
+ * completion of the final __fput for any file that got closed.
+ */
+void
+nfsd_file_close_inode_sync(struct inode *inode)
+{
+	LIST_HEAD(victims);
+	unsigned int bucket;
+
+	bucket = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
+	__nfsd_file_close_inode(inode, bucket, &victims);
+	trace_nfsd_file_close_inode_sync(inode, bucket, !list_empty(&victims));
+	nfsd_file_dispose_list_sync(&victims);
+}
+
+/**
+ * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
+ * @inode: inode of the file to attempt to remove
+ *
+ * Walk the whole hash bucket, looking for any files that correspond to "inode".
+ * If any do, then unhash them and put the hashtable reference to them.
+ * Entries for which that was the last reference are not destroyed here but
+ * handed to the per-net disposal work.
+ */
+static void
+nfsd_file_close_inode(struct inode *inode)
+{
+ unsigned int hashval = (unsigned int)hash_long(inode->i_ino,
+ NFSD_FILE_HASH_BITS);
+ LIST_HEAD(dispose);
+
+ __nfsd_file_close_inode(inode, hashval, &dispose);
+ trace_nfsd_file_close_inode(inode, hashval, !list_empty(&dispose));
+ nfsd_file_dispose_list_delayed(&dispose);
+}
+
+/**
+ * nfsd_file_delayed_close - release nfsd_files queued for disposal
+ * @work: work item embedded in a struct nfsd_fcache_disposal
+ *
+ * Take everything currently queued on the disposal context's freeme list
+ * and put the reference held on each entry, freeing those whose reference
+ * count drops to zero.
+ *
+ * Note this can deadlock with nfsd_file_cache_purge.
+ */
+static void
+nfsd_file_delayed_close(struct work_struct *work)
+{
+ LIST_HEAD(head);
+ struct nfsd_fcache_disposal *l = container_of(work,
+ struct nfsd_fcache_disposal, work);
+
+ nfsd_file_list_remove_disposal(&head, l);
+ nfsd_file_dispose_list(&head);
+}
+
+/*
+ * Lease notifier callback. @data is the file_lock being set up; for
+ * FL_LEASE locks, synchronously close any cached nfsd_files for the inode
+ * the lease is on. Always returns 0.
+ */
+static int
+nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg,
+ void *data)
+{
+ struct file_lock *fl = data;
+
+ /* Only close files for F_SETLEASE leases */
+ if (fl->fl_flags & FL_LEASE)
+ nfsd_file_close_inode_sync(file_inode(fl->fl_file));
+ return 0;
+}
+
+static struct notifier_block nfsd_file_lease_notifier = {
+ .notifier_call = nfsd_file_lease_notifier_call,
+};
+
+/*
+ * fsnotify .handle_inode_event callback for marks placed by the file
+ * cache. Closes the cached nfsd_files of @inode (with delayed disposal),
+ * except when the event includes FS_ATTRIB and the inode still has links.
+ * Always returns 0.
+ */
+static int
+nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *name, u32 cookie)
+{
+ trace_nfsd_file_fsnotify_handle_event(inode, mask);
+
+ /* Should be no marks on non-regular files */
+ if (!S_ISREG(inode->i_mode)) {
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ /* don't close files if this was not the last link */
+ if (mask & FS_ATTRIB) {
+ if (inode->i_nlink)
+ return 0;
+ }
+
+ nfsd_file_close_inode(inode);
+ return 0;
+}
+
+
+static const struct fsnotify_ops nfsd_file_fsnotify_ops = {
+ .handle_inode_event = nfsd_file_fsnotify_handle_event,
+ .free_mark = nfsd_file_mark_free,
+};
+
+/*
+ * Set up the global state of the nfsd file cache: the workqueue, hash
+ * table, slab caches, LRU, shrinker, lease notifier and fsnotify group.
+ *
+ * Clears NFSD_FILE_SHUTDOWN unconditionally, and returns 0 straight away
+ * if the hash table already exists.
+ *
+ * Returns 0 on success or a negative errno. On failure everything set up
+ * so far is torn down again and the related globals are reset to NULL, so
+ * a later call starts from scratch.
+ */
+int
+nfsd_file_cache_init(void)
+{
+	int ret = -ENOMEM;
+	unsigned int i;
+
+	clear_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags);
+
+	if (nfsd_file_hashtbl)
+		return 0;
+
+	nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", 0, 0);
+	if (!nfsd_filecache_wq)
+		goto out;
+
+	nfsd_file_hashtbl = kvcalloc(NFSD_FILE_HASH_SIZE,
+				sizeof(*nfsd_file_hashtbl), GFP_KERNEL);
+	if (!nfsd_file_hashtbl) {
+		pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n");
+		goto out_err;
+	}
+
+	nfsd_file_slab = kmem_cache_create("nfsd_file",
+				sizeof(struct nfsd_file), 0, 0, NULL);
+	if (!nfsd_file_slab) {
+		pr_err("nfsd: unable to create nfsd_file_slab\n");
+		goto out_err;
+	}
+
+	nfsd_file_mark_slab = kmem_cache_create("nfsd_file_mark",
+				sizeof(struct nfsd_file_mark), 0, 0, NULL);
+	if (!nfsd_file_mark_slab) {
+		pr_err("nfsd: unable to create nfsd_file_mark_slab\n");
+		goto out_err;
+	}
+
+
+	ret = list_lru_init(&nfsd_file_lru);
+	if (ret) {
+		pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret);
+		goto out_err;
+	}
+
+	ret = register_shrinker(&nfsd_file_shrinker);
+	if (ret) {
+		pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret);
+		goto out_lru;
+	}
+
+	ret = lease_register_notifier(&nfsd_file_lease_notifier);
+	if (ret) {
+		pr_err("nfsd: unable to register lease notifier: %d\n", ret);
+		goto out_shrinker;
+	}
+
+	nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops);
+	if (IS_ERR(nfsd_file_fsnotify_group)) {
+		pr_err("nfsd: unable to create fsnotify group: %ld\n",
+			PTR_ERR(nfsd_file_fsnotify_group));
+		/*
+		 * ret is 0 here from the successful notifier registration;
+		 * without this the teardown below would be reported to the
+		 * caller as success.
+		 */
+		ret = PTR_ERR(nfsd_file_fsnotify_group);
+		nfsd_file_fsnotify_group = NULL;
+		goto out_notifier;
+	}
+
+	for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+		INIT_HLIST_HEAD(&nfsd_file_hashtbl[i].nfb_head);
+		spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock);
+	}
+
+	INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_gc_worker);
+out:
+	return ret;
+	/* Error unwinding: each label undoes one more step, newest first. */
+out_notifier:
+	lease_unregister_notifier(&nfsd_file_lease_notifier);
+out_shrinker:
+	unregister_shrinker(&nfsd_file_shrinker);
+out_lru:
+	list_lru_destroy(&nfsd_file_lru);
+out_err:
+	kmem_cache_destroy(nfsd_file_slab);
+	nfsd_file_slab = NULL;
+	kmem_cache_destroy(nfsd_file_mark_slab);
+	nfsd_file_mark_slab = NULL;
+	kvfree(nfsd_file_hashtbl);
+	nfsd_file_hashtbl = NULL;
+	destroy_workqueue(nfsd_filecache_wq);
+	nfsd_filecache_wq = NULL;
+	goto out;
+}
+
+/*
+ * Unhash and release every cached nfsd_file belonging to @net, or every
+ * cached nfsd_file when @net is NULL. Does nothing if the hash table has
+ * not been allocated. Entries are put after each bucket's lock has been
+ * dropped.
+ *
+ * Note this can deadlock with nfsd_file_lru_cb.
+ */
+void
+nfsd_file_cache_purge(struct net *net)
+{
+ unsigned int i;
+ struct nfsd_file *nf;
+ struct hlist_node *next;
+ LIST_HEAD(dispose);
+ bool del;
+
+ if (!nfsd_file_hashtbl)
+ return;
+
+ for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+ struct nfsd_fcache_bucket *nfb = &nfsd_file_hashtbl[i];
+
+ spin_lock(&nfb->nfb_lock);
+ hlist_for_each_entry_safe(nf, next, &nfb->nfb_head, nf_node) {
+ if (net && nf->nf_net != net)
+ continue;
+ del = nfsd_file_unhash_and_release_locked(nf, &dispose);
+
+ /*
+ * Deadlock detected! Something marked this entry as
+ * unhashed, but hasn't removed it from the hash list.
+ */
+ WARN_ON_ONCE(!del);
+ }
+ spin_unlock(&nfb->nfb_lock);
+ nfsd_file_dispose_list(&dispose);
+ }
+}
+
+/*
+ * Allocate a disposal context for @net with an empty freeme list and its
+ * work item set to run nfsd_file_delayed_close(). The list linkage is not
+ * set up here; that happens when the context is added to laundrettes.
+ * Returns NULL on allocation failure.
+ */
+static struct nfsd_fcache_disposal *
+nfsd_alloc_fcache_disposal(struct net *net)
+{
+ struct nfsd_fcache_disposal *l;
+
+ l = kmalloc(sizeof(*l), GFP_KERNEL);
+ if (!l)
+ return NULL;
+ INIT_WORK(&l->work, nfsd_file_delayed_close);
+ l->net = net;
+ spin_lock_init(&l->lock);
+ INIT_LIST_HEAD(&l->freeme);
+ return l;
+}
+
+/*
+ * Tear down a disposal context: clear its net pointer so lookups by net
+ * no longer match it, wait for or cancel its work item, release anything
+ * still queued on freeme, and free the context after an RCU grace period.
+ */
+static void
+nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l)
+{
+ rcu_assign_pointer(l->net, NULL);
+ cancel_work_sync(&l->work);
+ nfsd_file_dispose_list(&l->freeme);
+ kfree_rcu(l, rcu);
+}
+
+/* Publish @l at the tail of the RCU-traversed laundrettes list, under laundrette_lock. */
+static void
+nfsd_add_fcache_disposal(struct nfsd_fcache_disposal *l)
+{
+ spin_lock(&laundrette_lock);
+ list_add_tail_rcu(&l->list, &laundrettes);
+ spin_unlock(&laundrette_lock);
+}
+
+/* Unlink @l from the RCU-traversed laundrettes list, under laundrette_lock. */
+static void
+nfsd_del_fcache_disposal(struct nfsd_fcache_disposal *l)
+{
+ spin_lock(&laundrette_lock);
+ list_del_rcu(&l->list);
+ spin_unlock(&laundrette_lock);
+}
+
+/*
+ * Create the disposal context for @net and publish it on the laundrettes
+ * list. Yields 0 on success or -ENOMEM if the context cannot be allocated.
+ */
+static int
+nfsd_alloc_fcache_disposal_net(struct net *net)
+{
+	struct nfsd_fcache_disposal *disp = nfsd_alloc_fcache_disposal(net);
+
+	if (disp == NULL)
+		return -ENOMEM;
+
+	nfsd_add_fcache_disposal(disp);
+	return 0;
+}
+
+/*
+ * Find the first disposal context registered for @net, unlink it from the
+ * laundrettes list and free it. Does nothing if none is registered.
+ */
+static void
+nfsd_free_fcache_disposal_net(struct net *net)
+{
+ struct nfsd_fcache_disposal *l;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(l, &laundrettes, list) {
+ if (l->net != net)
+ continue;
+ nfsd_del_fcache_disposal(l);
+ /* leave the RCU read section before the teardown, which calls cancel_work_sync() */
+ rcu_read_unlock();
+ nfsd_free_fcache_disposal(l);
+ return;
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Per-net start of the file cache: set up the disposal context for @net.
+ * Returns 0 on success or -ENOMEM.
+ */
+int
+nfsd_file_cache_start_net(struct net *net)
+{
+ return nfsd_alloc_fcache_disposal_net(net);
+}
+
+/*
+ * Per-net shutdown of the file cache: purge the cached files of @net, then
+ * tear down its disposal context.
+ */
+void
+nfsd_file_cache_shutdown_net(struct net *net)
+{
+ nfsd_file_cache_purge(net);
+ nfsd_free_fcache_disposal_net(net);
+}
+
+/*
+ * Tear down the global file cache state set up by nfsd_file_cache_init().
+ *
+ * NFSD_FILE_SHUTDOWN is set first so the gc work is not queued again. The
+ * notifier and shrinker are then removed, the gc work is cancelled, every
+ * remaining entry is purged, and the LRU, fsnotify group, slabs, hash
+ * table and workqueue are released.
+ */
+void
+nfsd_file_cache_shutdown(void)
+{
+ set_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags);
+
+ lease_unregister_notifier(&nfsd_file_lease_notifier);
+ unregister_shrinker(&nfsd_file_shrinker);
+ /*
+ * make sure all callers of nfsd_file_lru_cb are done before
+ * calling nfsd_file_cache_purge
+ */
+ cancel_delayed_work_sync(&nfsd_filecache_laundrette);
+ nfsd_file_cache_purge(NULL);
+ list_lru_destroy(&nfsd_file_lru);
+ /* nfsd_files are freed via call_rcu(); wait for those callbacks before destroying the slab */
+ rcu_barrier();
+ fsnotify_put_group(nfsd_file_fsnotify_group);
+ nfsd_file_fsnotify_group = NULL;
+ kmem_cache_destroy(nfsd_file_slab);
+ nfsd_file_slab = NULL;
+ /* marks are freed through .free_mark; wait for that before destroying their slab */
+ fsnotify_wait_marks_destroyed();
+ kmem_cache_destroy(nfsd_file_mark_slab);
+ nfsd_file_mark_slab = NULL;
+ kvfree(nfsd_file_hashtbl);
+ nfsd_file_hashtbl = NULL;
+ destroy_workqueue(nfsd_filecache_wq);
+ nfsd_filecache_wq = NULL;
+}
+
+/*
+ * Decide whether two credentials are equivalent for file cache lookups:
+ * same fsuid, same fsgid, and the same supplementary groups in the same
+ * order. If either side has no group_info, they match only when both
+ * group_info pointers are equal.
+ */
+static bool
+nfsd_match_cred(const struct cred *c1, const struct cred *c2)
+{
+	const struct group_info *gi1, *gi2;
+	int idx;
+
+	if (!uid_eq(c1->fsuid, c2->fsuid) || !gid_eq(c1->fsgid, c2->fsgid))
+		return false;
+
+	gi1 = c1->group_info;
+	gi2 = c2->group_info;
+	if (!gi1 || !gi2)
+		return gi1 == gi2;
+	if (gi1->ngroups != gi2->ngroups)
+		return false;
+
+	for (idx = 0; idx < gi1->ngroups; idx++)
+		if (!gid_eq(gi1->gid[idx], gi2->gid[idx]))
+			return false;
+	return true;
+}
+
+/*
+ * Search bucket @hashval for a hashed nfsd_file that matches @inode, @net,
+ * the NFSD_FILE_MAY_MASK bits of @may_flags, and the current task's
+ * credentials. Returns the first match on which a reference could be
+ * taken, or NULL. Callers in this file hold either rcu_read_lock() or the
+ * bucket lock.
+ */
+static struct nfsd_file *
+nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
+ unsigned int hashval, struct net *net)
+{
+ struct nfsd_file *nf;
+ unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
+
+ hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
+ nf_node, lockdep_is_held(&nfsd_file_hashtbl[hashval].nfb_lock)) {
+ if (nf->nf_may != need)
+ continue;
+ if (nf->nf_inode != inode)
+ continue;
+ if (nf->nf_net != net)
+ continue;
+ if (!nfsd_match_cred(nf->nf_cred, current_cred()))
+ continue;
+ if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
+ continue;
+ if (nfsd_file_get(nf) != NULL)
+ return nf;
+ }
+ return NULL;
+}
+
+/**
+ * nfsd_file_is_cached - are there any cached open files for this inode?
+ * @inode: inode of the file to check
+ *
+ * Scan, under RCU, the hash bucket that @inode maps to for any entry whose
+ * nf_inode is @inode. Returns true if one is found and false otherwise. No
+ * reference is taken on the entry.
+ */
+bool
+nfsd_file_is_cached(struct inode *inode)
+{
+	unsigned int bucket = (unsigned int)hash_long(inode->i_ino,
+						      NFSD_FILE_HASH_BITS);
+	struct nfsd_file *nf;
+	bool found = false;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[bucket].nfb_head,
+				 nf_node) {
+		if (nf->nf_inode != inode)
+			continue;
+		found = true;
+		break;
+	}
+	rcu_read_unlock();
+	trace_nfsd_file_is_cached(inode, bucket, (int)found);
+	return found;
+}
+
+/*
+ * Get a referenced nfsd_file for the file handle @fhp with the access
+ * requested in @may_flags, either from the cache or by opening the file
+ * and inserting a new entry.
+ *
+ * On success returns nfs_ok and stores the nfsd_file in *@pnf. On failure
+ * returns an nfs error status and leaves *@pnf untouched.
+ *
+ * A new entry is hashed with NFSD_FILE_PENDING set before the file is
+ * opened, so concurrent callers find it and wait for construction to end.
+ * A waiter that finds the entry unhashed afterwards retries the lookup
+ * once, then gives up with nfserr_jukebox.
+ */
+__be32
+nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf)
+{
+ __be32 status;
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_file *nf, *new;
+ struct inode *inode;
+ unsigned int hashval;
+ bool retry = true;
+
+ /* FIXME: skip this if fh_dentry is already set? */
+ status = fh_verify(rqstp, fhp, S_IFREG,
+ may_flags|NFSD_MAY_OWNER_OVERRIDE);
+ if (status != nfs_ok)
+ return status;
+
+ inode = d_inode(fhp->fh_dentry);
+ hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
+retry:
+ /* first try a lockless (RCU) lookup */
+ rcu_read_lock();
+ nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
+ rcu_read_unlock();
+ if (nf)
+ goto wait_for_construction;
+
+ new = nfsd_file_alloc(inode, may_flags, hashval, net);
+ if (!new) {
+ trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags,
+ NULL, nfserr_jukebox);
+ return nfserr_jukebox;
+ }
+
+ /* look again under the bucket lock in case another task inserted one */
+ spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
+ nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
+ if (nf == NULL)
+ goto open_file;
+ spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+ /* lost the race: new was never hashed, so free it directly */
+ nfsd_file_slab_free(&new->nf_rcu);
+
+wait_for_construction:
+ wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE);
+
+ /* Did construction of this file fail? */
+ if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ if (!retry) {
+ status = nfserr_jukebox;
+ goto out;
+ }
+ retry = false;
+ nfsd_file_put_noref(nf);
+ goto retry;
+ }
+
+ this_cpu_inc(nfsd_file_cache_hits);
+
+ /*
+ * The entry may have been opened without breaking leases; if this
+ * caller does want leases broken, do it now and clear the flags
+ * that record the pending break.
+ */
+ if (!(may_flags & NFSD_MAY_NOT_BREAK_LEASE)) {
+ bool write = (may_flags & NFSD_MAY_WRITE);
+
+ if (test_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags) ||
+ (test_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags) && write)) {
+ status = nfserrno(nfsd_open_break_lease(
+ file_inode(nf->nf_file), may_flags));
+ if (status == nfs_ok) {
+ clear_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
+ if (write)
+ clear_bit(NFSD_FILE_BREAK_WRITE,
+ &nf->nf_flags);
+ }
+ }
+ }
+out:
+ if (status == nfs_ok) {
+ *pnf = nf;
+ } else {
+ nfsd_file_put(nf);
+ nf = NULL;
+ }
+
+ trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, nf, status);
+ return status;
+open_file:
+ /* reached with the bucket lock held */
+ nf = new;
+ /* Take reference for the hashtable */
+ refcount_inc(&nf->nf_ref);
+ __set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
+ __set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
+ list_lru_add(&nfsd_file_lru, &nf->nf_lru);
+ hlist_add_head_rcu(&nf->nf_node, &nfsd_file_hashtbl[hashval].nfb_head);
+ ++nfsd_file_hashtbl[hashval].nfb_count;
+ nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount,
+ nfsd_file_hashtbl[hashval].nfb_count);
+ spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+ if (atomic_long_inc_return(&nfsd_filecache_count) >= NFSD_FILE_LRU_THRESHOLD)
+ nfsd_file_gc();
+
+ nf->nf_mark = nfsd_file_mark_find_or_create(nf);
+ if (nf->nf_mark)
+ status = nfsd_open_verified(rqstp, fhp, S_IFREG,
+ may_flags, &nf->nf_file);
+ else
+ status = nfserr_jukebox;
+ /*
+ * If construction failed, or we raced with a call to unlink()
+ * then unhash.
+ */
+ if (status != nfs_ok || inode->i_nlink == 0) {
+ bool do_free;
+ spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
+ do_free = nfsd_file_unhash(nf);
+ spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+ /* drop the hash table's reference if we were the ones to unhash */
+ if (do_free)
+ nfsd_file_put_noref(nf);
+ }
+ /* construction finished (either way): release anyone waiting on PENDING */
+ clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
+ goto out;
+}
+
+/*
+ * seq_file show routine for the file cache statistics: prints the total
+ * number of hashed entries, the longest current hash chain, and the sum of
+ * the per-cpu cache hit counters. Always returns 0.
+ *
+ * Note that fields may be added, removed or reordered in the future. Programs
+ * scraping this file for info should test the labels to ensure they're
+ * getting the correct field.
+ */
+static int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
+{
+ unsigned int i, count = 0, longest = 0;
+ unsigned long hits = 0;
+
+ /*
+ * No need for spinlocks here since we're not terribly interested in
+ * accuracy. We do take the nfsd_mutex simply to ensure that we
+ * don't end up racing with server shutdown
+ */
+ mutex_lock(&nfsd_mutex);
+ if (nfsd_file_hashtbl) {
+ for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+ count += nfsd_file_hashtbl[i].nfb_count;
+ longest = max(longest, nfsd_file_hashtbl[i].nfb_count);
+ }
+ }
+ mutex_unlock(&nfsd_mutex);
+
+ for_each_possible_cpu(i)
+ hits += per_cpu(nfsd_file_cache_hits, i);
+
+ seq_printf(m, "total entries: %u\n", count);
+ seq_printf(m, "longest chain: %u\n", longest);
+ seq_printf(m, "cache hits: %lu\n", hits);
+ return 0;
+}
+
+/* open handler for the statistics file: single_open() with nfsd_file_cache_stats_show(). */
+int nfsd_file_cache_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, nfsd_file_cache_stats_show, NULL);
+}
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
new file mode 100644
index 000000000..435ceab27
--- /dev/null
+++ b/fs/nfsd/filecache.h
@@ -0,0 +1,63 @@
+#ifndef _FS_NFSD_FILECACHE_H
+#define _FS_NFSD_FILECACHE_H
+
+#include <linux/fsnotify_backend.h>
+
+/*
+ * This is the fsnotify_mark container that nfsd attaches to the files that it
+ * is holding open. Note that we have a separate refcount here aside from the
+ * one in the fsnotify_mark. We only want a single fsnotify_mark attached to
+ * the inode, and for each nfsd_file to hold a reference to it.
+ *
+ * The fsnotify_mark is itself refcounted, but that's not sufficient to tell us
+ * how to put that reference. If there are still outstanding nfsd_files that
+ * reference the mark, then we would want to call fsnotify_put_mark on it.
+ * If there were not, then we'd need to call fsnotify_destroy_mark. Since we
+ * can't really tell the difference, we use nfm_ref to keep track of how
+ * many nfsd_files hold references to the mark. When that counter goes to zero
+ * then we know to call fsnotify_destroy_mark on it.
+ */
+struct nfsd_file_mark {
+ struct fsnotify_mark nfm_mark; /* the embedded fsnotify mark */
+ refcount_t nfm_ref; /* number of nfsd_files referencing this mark */
+};
+
+/*
+ * A representation of a file that has been opened by knfsd. These are hashed
+ * in the hashtable by inode pointer value. Note that this object doesn't
+ * hold a reference to the inode by itself, so the nf_inode pointer should
+ * never be dereferenced, only used for comparison.
+ *
+ * The NFSD_FILE_* constants defined inside the struct are bit numbers
+ * within nf_flags (they are passed to the *_bit() helpers), not masks.
+ */
+struct nfsd_file {
+ struct hlist_node nf_node; /* hash chain linkage */
+ struct list_head nf_lru;
+ struct rcu_head nf_rcu;
+ struct file *nf_file; /* the open file */
+ const struct cred *nf_cred;
+ struct net *nf_net;
+#define NFSD_FILE_HASHED (0)
+#define NFSD_FILE_PENDING (1) /* cleared, and waiters woken, once construction finishes */
+#define NFSD_FILE_BREAK_READ (2)
+#define NFSD_FILE_BREAK_WRITE (3)
+#define NFSD_FILE_REFERENCED (4)
+ unsigned long nf_flags; /* bitmap of the NFSD_FILE_* bits above */
+ struct inode *nf_inode; /* comparison key only; never dereference (see above) */
+ unsigned int nf_hashval; /* presumably the bucket index in nfsd_file_hashtbl -- confirm */
+ refcount_t nf_ref;
+ unsigned char nf_may;
+ struct nfsd_file_mark *nf_mark; /* shared fsnotify mark container */
+};
+
+int nfsd_file_cache_init(void);
+void nfsd_file_cache_purge(struct net *);
+void nfsd_file_cache_shutdown(void);
+int nfsd_file_cache_start_net(struct net *net);
+void nfsd_file_cache_shutdown_net(struct net *net);
+void nfsd_file_put(struct nfsd_file *nf);
+struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
+void nfsd_file_close_inode_sync(struct inode *inode);
+bool nfsd_file_is_cached(struct inode *inode);
+__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **nfp);
+int nfsd_file_cache_stats_open(struct inode *, struct file *);
+#endif /* _FS_NFSD_FILECACHE_H */
diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
new file mode 100644
index 000000000..db7ef07ae
--- /dev/null
+++ b/fs/nfsd/flexfilelayout.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com>
+ *
+ * The following implements a super-simple flex-file server
+ * where the NFSv4.1 mds is also the ds. And the storage is
+ * the same. I.e., writing to the mds via a NFSv4.1 WRITE
+ * goes to the same location as the NFSv3 WRITE.
+ */
+#include <linux/slab.h>
+
+#include <linux/nfsd/debug.h>
+
+#include <linux/sunrpc/addr.h>
+
+#include "flexfilelayoutxdr.h"
+#include "pnfs.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+static __be32
+nfsd4_ff_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
+ struct nfsd4_layoutget *args)
+{
+ struct nfsd4_layout_seg *seg = &args->lg_seg;
+ u32 device_generation = 0; /* always 0; handed to nfsd4_set_deviceid() below */
+ int error;
+ uid_t u;
+
+ struct pnfs_ff_layout *fl;
+
+ /*
+ * The super simple flex file server has 1 mirror, 1 data server,
+ * and 1 file handle. So instead of 4 allocs, do 1 for now.
+ * Zero it out for the stateid - don't want junk in there!
+ */
+ error = -ENOMEM;
+ fl = kzalloc(sizeof(*fl), GFP_KERNEL);
+ if (!fl)
+ goto out_error;
+ args->lg_content = fl; /* stored in args; this function never frees it, not even on the error path */
+
+ /*
+ * Avoid layout commit, try to force the I/O to the DS,
+ * and for fun, cause all IOMODE_RW layout segments to
+ * effectively be WRITE only.
+ */
+ fl->flags = FF_FLAGS_NO_LAYOUTCOMMIT | FF_FLAGS_NO_IO_THRU_MDS |
+ FF_FLAGS_NO_READ_IO;
+
+ /* Do not allow an IOMODE_READ segment to have write permissions */
+ if (seg->iomode == IOMODE_READ) {
+ u = from_kuid(&init_user_ns, inode->i_uid) + 1; /* owner's uid + 1, i.e. deliberately not the owner */
+ fl->uid = make_kuid(&init_user_ns, u);
+ } else
+ fl->uid = inode->i_uid;
+ fl->gid = inode->i_gid;
+
+ error = nfsd4_set_deviceid(&fl->deviceid, fhp, device_generation);
+ if (error)
+ goto out_error;
+
+ fl->fh.size = fhp->fh_handle.fh_size; /* the layout carries a copy of the request's file handle */
+ memcpy(fl->fh.data, &fhp->fh_handle.fh_base, fl->fh.size);
+
+ /* Give whole file layout segments */
+ seg->offset = 0;
+ seg->length = NFS4_MAX_UINT64;
+
+ dprintk("GET: 0x%llx:0x%llx %d\n", seg->offset, seg->length,
+ seg->iomode);
+ return 0;
+
+out_error:
+ seg->length = 0; /* failure: report an empty segment and map the errno to an nfserr status */
+ return nfserrno(error);
+}
+
+/*
+ * Build the GETDEVICEINFO result for the flex-file layout: one NFSv3 data
+ * server located at the address this request was received on
+ * (rqstp->rq_daddr), written as "<address>.<port high byte>.<port low byte>".
+ *
+ * On success gdp->gd_device points at a newly allocated
+ * struct pnfs_ff_device_addr; this function does not free it.
+ *
+ * Returns 0 on success, or an nfserr status when the local address cannot
+ * be turned into text or the allocation fails.
+ */
+static __be32
+nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp,
+		struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp)
+{
+	struct pnfs_ff_device_addr *da;
+
+	u16 port;
+	char addr[INET6_ADDRSTRLEN];
+
+	/*
+	 * rpc_ntop() returns 0 when it cannot format the address, and addr
+	 * may then never have been written. Check before anything is
+	 * allocated so that uninitialized stack bytes are never copied into
+	 * the reply.
+	 */
+	if (!rpc_ntop((struct sockaddr *)&rqstp->rq_daddr,
+		      addr, INET6_ADDRSTRLEN))
+		return nfserrno(-EINVAL);
+
+	da = kzalloc(sizeof(*da), GFP_KERNEL);
+	if (!da)
+		return nfserrno(-ENOMEM);
+
+	gdp->gd_device = da;
+
+	da->version = 3;
+	da->minor_version = 0;
+
+	da->rsize = svc_max_payload(rqstp);
+	da->wsize = da->rsize;
+
+	/* Anything that is not AF_INET is handled as IPv6. */
+	if (rqstp->rq_daddr.ss_family == AF_INET) {
+		struct sockaddr_in *sin;
+
+		sin = (struct sockaddr_in *)&rqstp->rq_daddr;
+		port = ntohs(sin->sin_port);
+		snprintf(da->netaddr.netid, FF_NETID_LEN + 1, "tcp");
+		da->netaddr.netid_len = 3;
+	} else {
+		struct sockaddr_in6 *sin6;
+
+		sin6 = (struct sockaddr_in6 *)&rqstp->rq_daddr;
+		port = ntohs(sin6->sin6_port);
+		snprintf(da->netaddr.netid, FF_NETID_LEN + 1, "tcp6");
+		da->netaddr.netid_len = 4;
+	}
+
+	da->netaddr.addr_len =
+		snprintf(da->netaddr.addr, FF_ADDR_LEN + 1,
+			 "%s.%hhu.%hhu", addr, port >> 8, port & 0xff);
+
+	da->tightly_coupled = false;
+
+	return 0;
+}
+
+const struct nfsd4_layout_ops ff_layout_ops = { /* flex-file layout operations: proc_* handlers from this file plus their XDR encoders */
+ .notify_types =
+ NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
+ .disable_recalls = true,
+ .proc_getdeviceinfo = nfsd4_ff_proc_getdeviceinfo,
+ .encode_getdeviceinfo = nfsd4_ff_encode_getdeviceinfo,
+ .proc_layoutget = nfsd4_ff_proc_layoutget,
+ .encode_layoutget = nfsd4_ff_encode_layoutget,
+};
diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c
new file mode 100644
index 000000000..bb205328e
--- /dev/null
+++ b/fs/nfsd/flexfilelayoutxdr.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com>
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/nfs4.h>
+
+#include "nfsd.h"
+#include "flexfilelayoutxdr.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+struct ff_idmap {
+ char buf[11]; /* an id printed with "%u": at most 10 decimal digits plus the NUL */
+ int len; /* length of the text in buf, as returned by sprintf() */
+};
+
+__be32
+nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
+ struct nfsd4_layoutget *lgp)
+{
+ struct pnfs_ff_layout *fl = lgp->lg_content;
+ int len, mirror_len, ds_len, fh_len;
+ __be32 *p;
+
+ /*
+ * Unlike nfsd4_encode_user, we know these will
+ * always be stringified.
+ */
+ struct ff_idmap uid;
+ struct ff_idmap gid;
+
+ fh_len = 4 + fl->fh.size;
+
+ uid.len = sprintf(uid.buf, "%u", from_kuid(&init_user_ns, fl->uid));
+ gid.len = sprintf(gid.buf, "%u", from_kgid(&init_user_ns, fl->gid));
+
+ /*
+ * 8 + len for recording the length, name, and padding
+ *
+ * NOTE(review): 8 + len is an allowance rather than the exact encoded
+ * size of an XDR opaque (4-byte length plus data rounded up to 4
+ * bytes), so the total is an estimate -- confirm it matches the items
+ * actually encoded below.
+ */
+ ds_len = 20 + sizeof(stateid_opaque_t) + 4 + fh_len +
+ 8 + uid.len + 8 + gid.len;
+
+ mirror_len = 4 + ds_len;
+
+ /* The layout segment */
+ len = 20 + mirror_len;
+
+ p = xdr_reserve_space(xdr, sizeof(__be32) + len); /* one extra word for the length prefix written next */
+ if (!p)
+ return nfserr_toosmall;
+
+ *p++ = cpu_to_be32(len);
+ p = xdr_encode_hyper(p, 0); /* stripe unit; NOTE(review): the original comment said "of 1" but the value encoded is 0 */
+
+ *p++ = cpu_to_be32(1); /* single mirror */
+ *p++ = cpu_to_be32(1); /* single data server */
+
+ p = xdr_encode_opaque_fixed(p, &fl->deviceid,
+ sizeof(struct nfsd4_deviceid));
+
+ *p++ = cpu_to_be32(1); /* efficiency */
+
+ *p++ = cpu_to_be32(fl->stateid.si_generation);
+ p = xdr_encode_opaque_fixed(p, &fl->stateid.si_opaque,
+ sizeof(stateid_opaque_t));
+
+ *p++ = cpu_to_be32(1); /* single file handle */
+ p = xdr_encode_opaque(p, fl->fh.data, fl->fh.size);
+
+ p = xdr_encode_opaque(p, uid.buf, uid.len); /* owner uid as a decimal string */
+ p = xdr_encode_opaque(p, gid.buf, gid.len); /* owner gid as a decimal string */
+
+ *p++ = cpu_to_be32(fl->flags);
+ *p++ = cpu_to_be32(0); /* No stats collect hint */
+
+ return 0;
+}
+
+__be32
+nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
+ struct nfsd4_getdeviceinfo *gdp)
+{
+ struct pnfs_ff_device_addr *da = gdp->gd_device;
+ int len;
+ int ver_len;
+ int addr_len;
+ __be32 *p;
+
+ /*
+ * See paragraph 5 of RFC 8881 S18.40.3.
+ *
+ * With a zero gd_maxcount only a single zero word is encoded.
+ */
+ if (!gdp->gd_maxcount) {
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+ return nfserr_resource;
+ return nfs_ok;
+ }
+
+ /* len + padding for two strings */
+ addr_len = 16 + da->netaddr.netid_len + da->netaddr.addr_len;
+ ver_len = 20; /* the five 32-bit words written at the end of this function */
+
+ len = 4 + ver_len + 4 + addr_len; /* the two 4s are the netaddr count and the version count */
+
+ p = xdr_reserve_space(xdr, len + sizeof(__be32)); /* one extra word for the length prefix */
+ if (!p)
+ return nfserr_resource;
+
+ /*
+ * Fill in the overall length and number of volumes at the beginning
+ * of the layout.
+ */
+ *p++ = cpu_to_be32(len);
+ *p++ = cpu_to_be32(1); /* 1 netaddr */
+ p = xdr_encode_opaque(p, da->netaddr.netid, da->netaddr.netid_len);
+ p = xdr_encode_opaque(p, da->netaddr.addr, da->netaddr.addr_len);
+
+ *p++ = cpu_to_be32(1); /* 1 versions */
+
+ *p++ = cpu_to_be32(da->version);
+ *p++ = cpu_to_be32(da->minor_version);
+ *p++ = cpu_to_be32(da->rsize);
+ *p++ = cpu_to_be32(da->wsize);
+ *p++ = cpu_to_be32(da->tightly_coupled);
+
+ return 0;
+}
diff --git a/fs/nfsd/flexfilelayoutxdr.h b/fs/nfsd/flexfilelayoutxdr.h
new file mode 100644
index 000000000..8e195aeca
--- /dev/null
+++ b/fs/nfsd/flexfilelayoutxdr.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com>
+ */
+#ifndef _NFSD_FLEXFILELAYOUTXDR_H
+#define _NFSD_FLEXFILELAYOUTXDR_H 1
+
+#include <linux/inet.h>
+#include "xdr4.h"
+
+#define FF_FLAGS_NO_LAYOUTCOMMIT 1
+#define FF_FLAGS_NO_IO_THRU_MDS 2
+#define FF_FLAGS_NO_READ_IO 4
+
+struct xdr_stream;
+
+#define FF_NETID_LEN (4) /* length of the longest netid stored, "tcp6" */
+#define FF_ADDR_LEN (INET6_ADDRSTRLEN + 8) /* address text plus a ".<0-255>.<0-255>" port suffix */
+struct pnfs_ff_netaddr {
+ char netid[FF_NETID_LEN + 1]; /* netid string, with room for the NUL */
+ char addr[FF_ADDR_LEN + 1]; /* "<address>.<port high byte>.<port low byte>", with room for the NUL */
+ u32 netid_len; /* length of netid, excluding the NUL */
+ u32 addr_len; /* length of addr, as returned by snprintf() */
+};
+
+struct pnfs_ff_device_addr { /* GETDEVICEINFO payload for the flex-file layout; every field below is XDR-encoded by nfsd4_ff_encode_getdeviceinfo() */
+ struct pnfs_ff_netaddr netaddr;
+ u32 version;
+ u32 minor_version;
+ u32 rsize;
+ u32 wsize;
+ bool tightly_coupled;
+};
+
+struct pnfs_ff_layout { /* LAYOUTGET payload for the flex-file layout */
+ u32 flags; /* OR of the FF_FLAGS_* values */
+ u32 stats_collect_hint;
+ kuid_t uid; /* encoded to the client as a decimal string */
+ kgid_t gid; /* encoded to the client as a decimal string */
+ struct nfsd4_deviceid deviceid;
+ stateid_t stateid;
+ struct nfs_fh fh;
+};
+
+__be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
+ struct nfsd4_getdeviceinfo *gdp);
+__be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
+ struct nfsd4_layoutget *lgp);
+
+#endif /* _NFSD_FLEXFILELAYOUTXDR_H */
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
new file mode 100644
index 000000000..23cc85d1e
--- /dev/null
+++ b/fs/nfsd/idmap.h
@@ -0,0 +1,60 @@
+/*
+ * Mapping of UID to name and vice versa.
+ *
+ * Copyright (c) 2002, 2003 The Regents of the University of
+ * Michigan. All rights reserved.
+ *
+ * Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef LINUX_NFSD_IDMAP_H
+#define LINUX_NFSD_IDMAP_H
+
+#include <linux/in.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/nfs_idmap.h>
+
+#ifdef CONFIG_NFSD_V4
+int nfsd_idmap_init(struct net *);
+void nfsd_idmap_shutdown(struct net *);
+#else /* !CONFIG_NFSD_V4: no-op stubs so callers need no #ifdef */
+static inline int nfsd_idmap_init(struct net *net)
+{
+ return 0; /* nothing to set up; report success */
+}
+static inline void nfsd_idmap_shutdown(struct net *net)
+{
+}
+#endif /* CONFIG_NFSD_V4 */
+
+__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *);
+__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *);
+__be32 nfsd4_encode_user(struct xdr_stream *, struct svc_rqst *, kuid_t);
+__be32 nfsd4_encode_group(struct xdr_stream *, struct svc_rqst *, kgid_t);
+
+#endif /* LINUX_NFSD_IDMAP_H */
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
new file mode 100644
index 000000000..3f5b3d7b6
--- /dev/null
+++ b/fs/nfsd/lockd.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains all the stubs needed when communicating with lockd.
+ * This level of indirection is necessary so we can run nfsd+lockd without
+ * requiring the nfs client to be compiled in/loaded, and vice versa.
+ *
+ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/file.h>
+#include <linux/lockd/bind.h>
+#include "nfsd.h"
+#include "vfs.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_LOCKD
+
+#ifdef CONFIG_LOCKD_V4
+#define nlm_stale_fh nlm4_stale_fh
+#define nlm_failed nlm4_failed
+#else
+#define nlm_stale_fh nlm_lck_denied_nolocks
+#define nlm_failed nlm_lck_denied_nolocks
+#endif
+/*
+ * Open the file named by lockd's file handle @f on behalf of @rqstp and hand
+ * it back through @filp.
+ *
+ * Note: we hold the dentry use count while the file is open.
+ *
+ * nlm doesn't know about nfsd, but nfsd does know about nlm, so the
+ * nfsd_open() result is translated to an nlm status here: 0 on success,
+ * nlm_drop_reply for nfserr_dropit, nlm_stale_fh for nfserr_stale and
+ * nlm_failed for anything else.
+ */
+static __be32
+nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
+{
+	struct svc_fh fh;
+	__be32 status;
+
+	/* must initialize before using! but maxsize doesn't matter */
+	fh_init(&fh, 0);
+	fh.fh_handle.fh_size = f->size;
+	memcpy((char *)&fh.fh_handle.fh_base, f->data, f->size);
+	fh.fh_export = NULL;
+
+	status = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp);
+	fh_put(&fh);
+
+	if (status == nfs_ok)
+		return 0;
+	if (status == nfserr_dropit)
+		return nlm_drop_reply;
+	if (status == nfserr_stale)
+		return nlm_stale_fh;
+	return nlm_failed;
+}
+
+static void
+nlm_fclose(struct file *filp)
+{
+ fput(filp); /* drop the file reference; counterpart of nlm_fopen() */
+}
+
+static const struct nlmsvc_binding nfsd_nlm_ops = { /* installed into nlmsvc_ops by nfsd_lockd_init() */
+ .fopen = nlm_fopen, /* open file for locking */
+ .fclose = nlm_fclose, /* close file */
+};
+
+void
+nfsd_lockd_init(void)
+{
+ dprintk("nfsd: initializing lockd\n");
+ nlmsvc_ops = &nfsd_nlm_ops; /* publish nfsd's fopen/fclose callbacks */
+}
+
+void
+nfsd_lockd_shutdown(void)
+{
+ nlmsvc_ops = NULL; /* withdraw the callbacks installed by nfsd_lockd_init() */
+}
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
new file mode 100644
index 000000000..02d3d2f0e
--- /dev/null
+++ b/fs/nfsd/netns.h
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * per net namespace data structures for nfsd
+ *
+ * Copyright (C) 2012, Jeff Layton <jlayton@redhat.com>
+ */
+
+#ifndef __NFSD_NETNS_H__
+#define __NFSD_NETNS_H__
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+/* Hash tables for nfs4_clientid state */
+#define CLIENT_HASH_BITS 4
+#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
+#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
+
+#define SESSION_HASH_SIZE 512
+
+struct cld_net;
+struct nfsd4_client_tracking_ops;
+
+/*
+ * Represents a nfsd "container". With respect to nfsv4 state tracking, the
+ * fields of interest are the *_id_hashtbls and the *_name_tree. These track
+ * the nfs4_client objects by either short or long form clientid.
+ *
+ * Each nfsd_net runs a nfs4_laundromat workqueue job when necessary to clean
+ * up expired clients and delegations within the container.
+ */
+struct nfsd_net {
+ struct cld_net *cld_net;
+
+ struct cache_detail *svc_expkey_cache;
+ struct cache_detail *svc_export_cache;
+
+ struct cache_detail *idtoname_cache;
+ struct cache_detail *nametoid_cache;
+
+ struct lock_manager nfsd4_manager;
+ bool grace_ended;
+ time64_t boot_time;
+
+ struct dentry *nfsd_client_dir;
+
+ /*
+ * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
+ * used in reboot/reset lease grace period processing
+ *
+ * conf_id_hashtbl[], and conf_name_tree hold confirmed
+ * setclientid_confirmed info.
+ *
+ * unconf_id_hashtbl[] and unconf_name_tree hold unconfirmed
+ * setclientid info.
+ */
+ struct list_head *reclaim_str_hashtbl;
+ int reclaim_str_hashtbl_size;
+ struct list_head *conf_id_hashtbl;
+ struct rb_root conf_name_tree;
+ struct list_head *unconf_id_hashtbl;
+ struct rb_root unconf_name_tree;
+ struct list_head *sessionid_hashtbl; /* also tested by nfsd_netns_ready() below this struct */
+ /*
+ * client_lru holds client queue ordered by nfs4_client.cl_time
+ * for lease renewal.
+ *
+ * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time
+ * for last close replay.
+ *
+ * All of the above fields are protected by the client_mutex.
+ */
+ struct list_head client_lru;
+ struct list_head close_lru;
+ struct list_head del_recall_lru;
+
+ /* protected by blocked_locks_lock */
+ struct list_head blocked_locks_lru;
+
+ struct delayed_work laundromat_work;
+
+ /* client_lock protects the client lru list and session hash table */
+ spinlock_t client_lock;
+
+ /* protects blocked_locks_lru */
+ spinlock_t blocked_locks_lock;
+
+ struct file *rec_file;
+ bool in_grace;
+ const struct nfsd4_client_tracking_ops *client_tracking_ops;
+
+ time64_t nfsd4_lease;
+ time64_t nfsd4_grace;
+ bool somebody_reclaimed;
+
+ bool track_reclaim_completes;
+ atomic_t nr_reclaim_complete;
+
+ bool nfsd_net_up;
+ bool lockd_up;
+
+ /* Time of server startup */
+ struct timespec64 nfssvc_boot;
+ seqlock_t boot_lock;
+
+ /*
+ * Max number of connections this nfsd container will allow. Defaults
+ * to '0' which means that it bases this on the number of threads.
+ */
+ unsigned int max_connections;
+
+ u32 clientid_base;
+ u32 clientid_counter;
+ u32 clverifier_counter;
+
+ struct svc_serv *nfsd_serv;
+
+ wait_queue_head_t ntf_wq;
+ atomic_t ntf_refcnt;
+
+ /* Allow umount to wait for nfsd state cleanup */
+ struct completion nfsd_shutdown_complete;
+
+ /*
+ * clientid and stateid data for construction of net unique COPY
+ * stateids.
+ */
+ u32 s2s_cp_cl_id;
+ struct idr s2s_cp_stateids;
+ spinlock_t s2s_cp_lock;
+
+ /*
+ * Version information
+ */
+ bool *nfsd_versions;
+ bool *nfsd4_minorversions;
+
+ /*
+ * Duplicate reply cache
+ */
+ struct nfsd_drc_bucket *drc_hashtbl;
+
+ /* max number of entries allowed in the cache */
+ unsigned int max_drc_entries;
+
+ /* number of significant bits in the hash value */
+ unsigned int maskbits;
+ unsigned int drc_hashsize;
+
+ /*
+ * Stats and other tracking of the duplicate reply cache.
+ * These fields and the "rc" fields in nfsdstats are modified
+ * with only the per-bucket cache lock, which isn't really safe
+ * and should be fixed if we want the statistics to be
+ * completely accurate.
+ */
+
+ /* total number of entries */
+ atomic_t num_drc_entries;
+
+ /* cache misses due only to checksum comparison failures */
+ unsigned int payload_misses;
+
+ /* amount of memory (in bytes) currently consumed by the DRC */
+ unsigned int drc_mem_usage;
+
+ /* longest hash chain seen */
+ unsigned int longest_chain;
+
+ /* size of cache when we saw the longest hash chain */
+ unsigned int longest_chain_cachesize;
+
+ struct shrinker nfsd_reply_cache_shrinker;
+ /* utsname taken from the process that starts the server */
+ char nfsd_name[UNX_MAXNODENAME+1];
+};
+
+/* Simple check to find out if a given net was properly initialized */
+#define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl)
+
+extern void nfsd_netns_free_versions(struct nfsd_net *nn);
+
+extern unsigned int nfsd_net_id;
+
+void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn);
+void nfsd_reset_boot_verifier(struct nfsd_net *nn);
+#endif /* __NFSD_NETNS_H__ */
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
new file mode 100644
index 000000000..6a900f770
--- /dev/null
+++ b/fs/nfsd/nfs2acl.c
@@ -0,0 +1,438 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Process version 2 NFSACL requests.
+ *
+ * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de>
+ */
+
+#include "nfsd.h"
+/* FIXME: nfsacl.h is a broken header */
+#include <linux/nfsacl.h>
+#include <linux/gfp.h>
+#include "cache.h"
+#include "xdr3.h"
+#include "vfs.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PROC
+
+/*
+ * NULL call.
+ *
+ * Does no work and always returns rpc_success.
+ */
+static __be32
+nfsacld_proc_null(struct svc_rqst *rqstp)
+{
+ return rpc_success;
+}
+
+/*
+ * Get the Access and/or Default ACL of a file.
+ *
+ * The NFS-level outcome is stored in resp->status; the RPC-level return
+ * value is always rpc_success. ACLs handed back in resp->acl_access and
+ * resp->acl_default stay referenced until nfsaclsvc_release_getacl() runs.
+ */
+static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp)
+{
+	struct nfsd3_getaclargs *argp = rqstp->rq_argp;
+	struct nfsd3_getaclres *resp = rqstp->rq_resp;
+	struct posix_acl *acl;
+	struct inode *inode;
+	svc_fh *fh;
+
+	dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh));
+
+	fh = fh_copy(&resp->fh, &argp->fh);
+	resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+	if (resp->status != nfs_ok)
+		goto out;
+
+	inode = d_inode(fh->fh_dentry);
+
+	if (argp->mask & ~NFS_ACL_MASK) {
+		resp->status = nfserr_inval;
+		goto out;
+	}
+	resp->mask = argp->mask;
+
+	resp->status = fh_getattr(fh, &resp->stat);
+	if (resp->status != nfs_ok)
+		goto out;
+
+	if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
+		acl = get_acl(inode, ACL_TYPE_ACCESS);
+		if (acl == NULL) {
+			/* Solaris returns the inode's minimum ACL. */
+			acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+		}
+		if (IS_ERR(acl)) {
+			resp->status = nfserrno(PTR_ERR(acl));
+			goto fail;
+		}
+		resp->acl_access = acl;
+	}
+	if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
+		/* Check how Solaris handles requests for the Default ACL
+		   of a non-directory! */
+		acl = get_acl(inode, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl)) {
+			resp->status = nfserrno(PTR_ERR(acl));
+			goto fail;
+		}
+		resp->acl_default = acl;
+	}
+
+	/* resp->acl_{access,default} are released in nfsaclsvc_release_getacl. */
+out:
+	return rpc_success;
+
+fail:
+	posix_acl_release(resp->acl_access);
+	posix_acl_release(resp->acl_default);
+	/*
+	 * nfsaclsvc_release_getacl() releases both pointers again. Clear them
+	 * so the references just dropped are not put a second time.
+	 */
+	resp->acl_access = NULL;
+	resp->acl_default = NULL;
+	goto out;
+}
+
+/*
+ * Set the Access and/or Default ACL of a file.
+ *
+ * The NFS-level outcome is stored in resp->status; the RPC-level return
+ * value is always rpc_success. Every path leaves through "out", which
+ * releases argp->acl_access and argp->acl_default.
+ */
+static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp)
+{
+ struct nfsd3_setaclargs *argp = rqstp->rq_argp;
+ struct nfsd_attrstat *resp = rqstp->rq_resp;
+ struct inode *inode;
+ svc_fh *fh;
+ int error;
+
+ dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh));
+
+ fh = fh_copy(&resp->fh, &argp->fh);
+ resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
+ if (resp->status != nfs_ok)
+ goto out;
+
+ inode = d_inode(fh->fh_dentry);
+
+ error = fh_want_write(fh);
+ if (error)
+ goto out_errno;
+
+ fh_lock(fh); /* both ACLs are set while the file handle is locked */
+
+ error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access);
+ if (error)
+ goto out_drop_lock;
+ error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default);
+ if (error)
+ goto out_drop_lock;
+
+ fh_unlock(fh);
+
+ fh_drop_write(fh);
+
+ resp->status = fh_getattr(fh, &resp->stat);
+
+out:
+ /* argp->acl_{access,default} may have been allocated in
+ nfsaclsvc_decode_setaclargs. */
+ posix_acl_release(argp->acl_access);
+ posix_acl_release(argp->acl_default);
+ return rpc_success;
+
+out_drop_lock:
+ fh_unlock(fh); /* undo fh_lock() and fh_want_write() before reporting the errno */
+ fh_drop_write(fh);
+out_errno:
+ resp->status = nfserrno(error);
+ goto out;
+}
+
+/*
+ * Check file attributes
+ *
+ * Verifies the file handle and, when that succeeds, fetches the attributes
+ * into resp->stat. The outcome is reported in resp->status; the RPC-level
+ * return value is always rpc_success.
+ */
+static __be32 nfsacld_proc_getattr(struct svc_rqst *rqstp)
+{
+	struct nfsd_attrstat *resp = rqstp->rq_resp;
+	struct nfsd_fhandle *argp = rqstp->rq_argp;
+
+	dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
+
+	fh_copy(&resp->fh, &argp->fh);
+	resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+	if (resp->status == nfs_ok)
+		resp->status = fh_getattr(&resp->fh, &resp->stat);
+
+	return rpc_success;
+}
+
+/*
+ * Check file access
+ *
+ * Runs nfsd_access() on the requested access mask and, when that succeeds,
+ * fetches the attributes into resp->stat. The outcome is reported in
+ * resp->status; the RPC-level return value is always rpc_success.
+ */
+static __be32 nfsacld_proc_access(struct svc_rqst *rqstp)
+{
+	struct nfsd3_accessres *resp = rqstp->rq_resp;
+	struct nfsd3_accessargs *argp = rqstp->rq_argp;
+
+	dprintk("nfsd: ACCESS(2acl) %s 0x%x\n",
+			SVCFH_fmt(&argp->fh),
+			argp->access);
+
+	fh_copy(&resp->fh, &argp->fh);
+	resp->access = argp->access;
+	resp->status = nfsd_access(rqstp, &resp->fh, &resp->access, NULL);
+	if (resp->status == nfs_ok)
+		resp->status = fh_getattr(&resp->fh, &resp->stat);
+
+	return rpc_success;
+}
+
+/*
+ * XDR decode functions
+ */
+static int nfsaclsvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
+{
+ return 1; /* no arguments to decode; always 1 (the other decoders in this file return 0 on failure) */
+}
+
+/*
+ * Decode GETACL arguments: a file handle followed by the 32-bit ACL mask.
+ * Returns 0 when the file handle cannot be decoded, otherwise the result
+ * of xdr_argsize_check().
+ */
+static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_getaclargs *args = rqstp->rq_argp;
+
+	p = nfs2svc_decode_fh(p, &args->fh);
+	if (p == NULL)
+		return 0;
+
+	args->mask = ntohl(*p++);
+	return xdr_argsize_check(rqstp, p);
+}
+
+
+static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
+{
+ struct nfsd3_setaclargs *argp = rqstp->rq_argp;
+ struct kvec *head = rqstp->rq_arg.head;
+ unsigned int base;
+ int n;
+
+ p = nfs2svc_decode_fh(p, &argp->fh);
+ if (!p)
+ return 0;
+ argp->mask = ntohl(*p++);
+ if (argp->mask & ~NFS_ACL_MASK ||
+ !xdr_argsize_check(rqstp, p))
+ return 0; /* reject unknown mask bits or a failed argument size check */
+
+ base = (char *)p - (char *)head->iov_base; /* byte offset of the ACL data from the start of the head kvec */
+ n = nfsacl_decode(&rqstp->rq_arg, base, NULL,
+ (argp->mask & NFS_ACL) ?
+ &argp->acl_access : NULL); /* a NULL destination is passed when the mask does not select this ACL */
+ if (n > 0)
+ n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL,
+ (argp->mask & NFS_DFACL) ?
+ &argp->acl_default : NULL); /* the default ACL follows the n bytes used by the access ACL */
+ return (n > 0);
+}
+
+static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p)
+{
+ struct nfsd_fhandle *argp = rqstp->rq_argp;
+
+ p = nfs2svc_decode_fh(p, &argp->fh); /* the file handle is the only argument */
+ if (!p)
+ return 0;
+ return xdr_argsize_check(rqstp, p);
+}
+
+static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p)
+{
+ struct nfsd3_accessargs *argp = rqstp->rq_argp;
+
+ p = nfs2svc_decode_fh(p, &argp->fh);
+ if (!p)
+ return 0;
+ argp->access = ntohl(*p++); /* 32-bit access mask follows the file handle */
+
+ return xdr_argsize_check(rqstp, p);
+}
+
+/*
+ * XDR encode functions
+ */
+
+/*
+ * There must be an encoding function for void results so svc_process
+ * will work properly.
+ *
+ * Nothing is written; only the result size check is performed.
+ */
+static int nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
+{
+ return xdr_ressize_check(rqstp, p);
+}
+
+/* GETACL: encode status, then (on success) attributes, mask and the selected ACLs */
+static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
+{
+ struct nfsd3_getaclres *resp = rqstp->rq_resp;
+ struct dentry *dentry = resp->fh.fh_dentry;
+ struct inode *inode;
+ struct kvec *head = rqstp->rq_res.head;
+ unsigned int base;
+ int n;
+ int w;
+
+ *p++ = resp->status;
+ if (resp->status != nfs_ok)
+ return xdr_ressize_check(rqstp, p); /* an error reply carries the status word only */
+
+ /*
+ * Since this is version 2, the check for nfserr in
+ * nfsd_dispatch actually ensures the following cannot happen.
+ * However, it seems fragile to depend on that.
+ */
+ if (dentry == NULL || d_really_is_negative(dentry))
+ return 0;
+ inode = d_inode(dentry);
+
+ p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+ *p++ = htonl(resp->mask);
+ if (!xdr_ressize_check(rqstp, p))
+ return 0;
+ base = (char *)p - (char *)head->iov_base; /* byte offset of the ACL data from the start of the head kvec */
+
+ rqstp->rq_res.page_len = w = nfsacl_size(
+ (resp->mask & NFS_ACL) ? resp->acl_access : NULL,
+ (resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
+ while (w > 0) { /* consume one reply page per PAGE_SIZE of ACL data; fail if a page slot is empty */
+ if (!*(rqstp->rq_next_page++))
+ return 0;
+ w -= PAGE_SIZE;
+ }
+
+ n = nfsacl_encode(&rqstp->rq_res, base, inode,
+ resp->acl_access,
+ resp->mask & NFS_ACL, 0);
+ if (n > 0)
+ n = nfsacl_encode(&rqstp->rq_res, base + n, inode,
+ resp->acl_default,
+ resp->mask & NFS_DFACL,
+ NFS_ACL_DEFAULT); /* the default ACL follows the n bytes used by the access ACL */
+ return (n > 0);
+}
+
+/*
+ * Encode an attrstat result: the status word, followed by the file
+ * attributes only when the status is nfs_ok.
+ */
+static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd_attrstat *res = rqstp->rq_resp;
+
+	*p++ = res->status;
+	if (res->status == nfs_ok)
+		p = nfs2svc_encode_fattr(rqstp, p, &res->fh, &res->stat);
+
+	return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * ACCESS
+ *
+ * Encode the status word and, only when the status is nfs_ok, the file
+ * attributes followed by the access mask.
+ */
+static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_accessres *res = rqstp->rq_resp;
+
+	*p++ = res->status;
+	if (res->status == nfs_ok) {
+		p = nfs2svc_encode_fattr(rqstp, p, &res->fh, &res->stat);
+		*p++ = htonl(res->access);
+	}
+
+	return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * XDR release functions
+ */
+static void nfsaclsvc_release_getacl(struct svc_rqst *rqstp)
+{
+ struct nfsd3_getaclres *resp = rqstp->rq_resp;
+
+ fh_put(&resp->fh);
+ posix_acl_release(resp->acl_access); /* drops the ACL references stored by nfsacld_proc_getacl() */
+ posix_acl_release(resp->acl_default);
+}
+
+static void nfsaclsvc_release_attrstat(struct svc_rqst *rqstp)
+{
+ struct nfsd_attrstat *resp = rqstp->rq_resp;
+
+ fh_put(&resp->fh); /* release the file handle copied into the result by the proc handler */
+}
+
+static void nfsaclsvc_release_access(struct svc_rqst *rqstp)
+{
+ struct nfsd3_accessres *resp = rqstp->rq_resp;
+
+ fh_put(&resp->fh); /* release the file handle copied into the result by nfsacld_proc_access() */
+}
+
+struct nfsd3_voidargs { int dummy; };
+
+#define ST 1 /* status*/
+#define AT 21 /* attributes */
+#define pAT (1+AT) /* post attributes - conditional */
+#define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */
+
+static const struct svc_procedure nfsd_acl_procedures2[5] = { /* indexed by ACLPROC2_* procedure number */
+ [ACLPROC2_NULL] = {
+ .pc_func = nfsacld_proc_null,
+ .pc_decode = nfsaclsvc_decode_voidarg,
+ .pc_encode = nfsaclsvc_encode_voidres,
+ .pc_argsize = sizeof(struct nfsd3_voidargs),
+ .pc_ressize = sizeof(struct nfsd3_voidargs),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST,
+ },
+ [ACLPROC2_GETACL] = {
+ .pc_func = nfsacld_proc_getacl,
+ .pc_decode = nfsaclsvc_decode_getaclargs,
+ .pc_encode = nfsaclsvc_encode_getaclres,
+ .pc_release = nfsaclsvc_release_getacl, /* also drops the ACL references held in the result */
+ .pc_argsize = sizeof(struct nfsd3_getaclargs),
+ .pc_ressize = sizeof(struct nfsd3_getaclres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+1+2*(1+ACL),
+ },
+ [ACLPROC2_SETACL] = {
+ .pc_func = nfsacld_proc_setacl,
+ .pc_decode = nfsaclsvc_decode_setaclargs,
+ .pc_encode = nfsaclsvc_encode_attrstatres,
+ .pc_release = nfsaclsvc_release_attrstat,
+ .pc_argsize = sizeof(struct nfsd3_setaclargs),
+ .pc_ressize = sizeof(struct nfsd_attrstat),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+AT,
+ },
+ [ACLPROC2_GETATTR] = {
+ .pc_func = nfsacld_proc_getattr,
+ .pc_decode = nfsaclsvc_decode_fhandleargs,
+ .pc_encode = nfsaclsvc_encode_attrstatres,
+ .pc_release = nfsaclsvc_release_attrstat,
+ .pc_argsize = sizeof(struct nfsd_fhandle),
+ .pc_ressize = sizeof(struct nfsd_attrstat),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+AT,
+ },
+ [ACLPROC2_ACCESS] = {
+ .pc_func = nfsacld_proc_access,
+ .pc_decode = nfsaclsvc_decode_accessargs,
+ .pc_encode = nfsaclsvc_encode_accessres,
+ .pc_release = nfsaclsvc_release_access,
+ .pc_argsize = sizeof(struct nfsd3_accessargs),
+ .pc_ressize = sizeof(struct nfsd3_accessres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+AT+1,
+ },
+};
+
+static unsigned int nfsd_acl_count2[ARRAY_SIZE(nfsd_acl_procedures2)]; /* one counter slot per procedure */
+const struct svc_version nfsd_acl_version2 = {
+ .vs_vers = 2,
+ .vs_nproc = 5, /* matches the size of nfsd_acl_procedures2[5] */
+ .vs_proc = nfsd_acl_procedures2,
+ .vs_count = nfsd_acl_count2,
+ .vs_dispatch = nfsd_dispatch,
+ .vs_xdrsize = NFS3_SVC_XDRSIZE,
+};
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
new file mode 100644
index 000000000..34a394e50
--- /dev/null
+++ b/fs/nfsd/nfs3acl.c
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Process version 3 NFSACL requests.
+ *
+ * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de>
+ */
+
+#include "nfsd.h"
+/* FIXME: nfsacl.h is a broken header */
+#include <linux/nfsacl.h>
+#include <linux/gfp.h>
+#include "cache.h"
+#include "xdr3.h"
+#include "vfs.h"
+
+/*
+ * ACLPROC3_NULL: does no work and reports RPC-level success.
+ */
+static __be32 nfsd3_proc_null(struct svc_rqst *rqstp)
+{
+	return rpc_success;
+}
+
+/*
+ * Get the Access and/or Default ACL of a file.
+ *
+ * The NFS-level result is stored in resp->status; the function itself
+ * always returns rpc_success.  ACL references left in
+ * resp->acl_{access,default} are dropped by nfs3svc_release_getacl().
+ */
+static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp)
+{
+	struct nfsd3_getaclargs *argp = rqstp->rq_argp;
+	struct nfsd3_getaclres *resp = rqstp->rq_resp;
+	struct posix_acl *acl;
+	struct inode *inode;
+	svc_fh *fh;
+
+	fh = fh_copy(&resp->fh, &argp->fh);
+	resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+	if (resp->status != nfs_ok)
+		goto out;
+
+	inode = d_inode(fh->fh_dentry);
+
+	/* Reject any mask bit outside NFS_ACL_MASK. */
+	if (argp->mask & ~NFS_ACL_MASK) {
+		resp->status = nfserr_inval;
+		goto out;
+	}
+	resp->mask = argp->mask;
+
+	if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
+		acl = get_acl(inode, ACL_TYPE_ACCESS);
+		if (acl == NULL) {
+			/* Solaris returns the inode's minimum ACL. */
+			acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+		}
+		if (IS_ERR(acl)) {
+			resp->status = nfserrno(PTR_ERR(acl));
+			goto fail;
+		}
+		resp->acl_access = acl;
+	}
+	if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
+		/* Check how Solaris handles requests for the Default ACL
+		   of a non-directory! */
+		acl = get_acl(inode, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl)) {
+			resp->status = nfserrno(PTR_ERR(acl));
+			goto fail;
+		}
+		resp->acl_default = acl;
+	}
+
+	/* resp->acl_{access,default} are released in nfs3svc_release_getacl. */
+out:
+	return rpc_success;
+
+fail:
+	/*
+	 * Drop what was already looked up and clear the pointers.
+	 * nfs3svc_release_getacl() releases resp->acl_{access,default}
+	 * too, so leaving them set here would drop the same reference
+	 * twice.
+	 */
+	posix_acl_release(resp->acl_access);
+	resp->acl_access = NULL;
+	posix_acl_release(resp->acl_default);
+	resp->acl_default = NULL;
+	goto out;
+}
+
+/*
+ * Set the Access and/or Default ACL of a file.
+ *
+ * The handle is verified with NFSD_MAY_SATTR, then write access
+ * (fh_want_write) and the fh lock are taken.  The access ACL is applied
+ * first; the default ACL is applied only if that succeeded.  The errno
+ * from these steps is converted with nfserrno() into resp->status; a
+ * failed fh_verify() leaves its own status in place instead.
+ *
+ * The ACLs decoded into argp are released on every path, and the
+ * function always returns rpc_success.
+ */
+static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp)
+{
+ struct nfsd3_setaclargs *argp = rqstp->rq_argp;
+ struct nfsd3_attrstat *resp = rqstp->rq_resp;
+ struct inode *inode;
+ svc_fh *fh;
+ int error;
+
+ fh = fh_copy(&resp->fh, &argp->fh);
+ resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
+ if (resp->status != nfs_ok)
+ goto out;
+
+ inode = d_inode(fh->fh_dentry);
+
+ error = fh_want_write(fh);
+ if (error)
+ goto out_errno;
+
+ fh_lock(fh);
+
+ error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access);
+ if (error)
+ goto out_drop_lock;
+ error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default);
+
+out_drop_lock:
+ fh_unlock(fh);
+ fh_drop_write(fh);
+out_errno:
+ resp->status = nfserrno(error);
+out:
+ /* argp->acl_{access,default} may have been allocated in
+ nfs3svc_decode_setaclargs; they are dropped here on success
+ and on every error path alike. */
+ posix_acl_release(argp->acl_access);
+ posix_acl_release(argp->acl_default);
+ return rpc_success;
+}
+
+/*
+ * XDR decode functions
+ */
+
+/*
+ * GETACL arguments: a file handle followed by one 32-bit mask word.
+ * Returns 0 if the handle cannot be decoded, otherwise the result of
+ * xdr_argsize_check().
+ */
+static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_getaclargs *argp = rqstp->rq_argp;
+
+	p = nfs3svc_decode_fh(p, &argp->fh);
+	if (p == NULL)
+		return 0;
+
+	argp->mask = ntohl(*p++);
+	return xdr_argsize_check(rqstp, p);
+}
+
+
+/*
+ * SETACL arguments: file handle, mask word, then two encoded ACLs.
+ * An ACL is stored in args only when its mask bit is set; otherwise it
+ * is decoded with a NULL destination.  Returns 1 on success, 0 on any
+ * decoding failure.
+ */
+static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_setaclargs *argp = rqstp->rq_argp;
+	struct posix_acl **access = NULL;
+	struct posix_acl **dflt = NULL;
+	unsigned int offset;
+	int len;
+
+	p = nfs3svc_decode_fh(p, &argp->fh);
+	if (p == NULL)
+		return 0;
+
+	argp->mask = ntohl(*p++);
+	if (argp->mask & ~NFS_ACL_MASK)
+		return 0;
+	if (!xdr_argsize_check(rqstp, p))
+		return 0;
+
+	if (argp->mask & NFS_ACL)
+		access = &argp->acl_access;
+	if (argp->mask & NFS_DFACL)
+		dflt = &argp->acl_default;
+
+	/* Byte offset of the first ACL within the head kvec. */
+	offset = (char *)p - (char *)rqstp->rq_arg.head->iov_base;
+
+	len = nfsacl_decode(&rqstp->rq_arg, offset, NULL, access);
+	if (len <= 0)
+		return 0;
+
+	len = nfsacl_decode(&rqstp->rq_arg, offset + len, NULL, dflt);
+	return len > 0;
+}
+
+/*
+ * XDR encode functions
+ */
+
+/*
+ * GETACL
+ *
+ * Encodes the status word and post-op attributes.  When the status is 0
+ * and the handle has a positive dentry, it also encodes the mask word
+ * and then the access and default ACLs via nfsacl_encode().
+ * Returns 1 on success and 0 on any size-check or encoding failure.
+ */
+static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
+{
+ struct nfsd3_getaclres *resp = rqstp->rq_resp;
+ struct dentry *dentry = resp->fh.fh_dentry;
+
+ *p++ = resp->status;
+ p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh);
+ if (resp->status == 0 && dentry && d_really_is_positive(dentry)) {
+ struct inode *inode = d_inode(dentry);
+ struct kvec *head = rqstp->rq_res.head;
+ unsigned int base;
+ int n;
+ int w;
+
+ *p++ = htonl(resp->mask);
+ if (!xdr_ressize_check(rqstp, p))
+ return 0;
+ base = (char *)p - (char *)head->iov_base; /* byte offset of the ACL data from the start of the head kvec */
+
+ rqstp->rq_res.page_len = w = nfsacl_size(
+ (resp->mask & NFS_ACL) ? resp->acl_access : NULL,
+ (resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
+ while (w > 0) { /* advance rq_next_page once per PAGE_SIZE of ACL data; fail on an empty slot */
+ if (!*(rqstp->rq_next_page++))
+ return 0;
+ w -= PAGE_SIZE;
+ }
+
+ n = nfsacl_encode(&rqstp->rq_res, base, inode,
+ resp->acl_access,
+ resp->mask & NFS_ACL, 0);
+ if (n > 0) /* default ACL goes right after the access ACL */
+ n = nfsacl_encode(&rqstp->rq_res, base + n, inode,
+ resp->acl_default,
+ resp->mask & NFS_DFACL,
+ NFS_ACL_DEFAULT);
+ if (n <= 0)
+ return 0;
+ } else
+ if (!xdr_ressize_check(rqstp, p))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * SETACL reply: the status word followed by post-op attributes.
+ * Returns the result of xdr_ressize_check().
+ */
+static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_attrstat *res = rqstp->rq_resp;
+
+	*p = res->status;
+	p = nfs3svc_encode_post_op_attr(rqstp, p + 1, &res->fh);
+
+	return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * XDR release functions
+ */
+
+/* Drop the file handle and both ACL references held by a GETACL reply. */
+static void nfs3svc_release_getacl(struct svc_rqst *rqstp)
+{
+	struct nfsd3_getaclres *res = rqstp->rq_resp;
+
+	fh_put(&res->fh);
+	posix_acl_release(res->acl_access);
+	posix_acl_release(res->acl_default);
+}
+
+struct nfsd3_voidargs { int dummy; }; /* placeholder arg/result type for the NULL procedure */
+
+#define ST 1 /* status */
+#define AT 21 /* attributes */
+#define pAT (1+AT) /* post attributes - conditional: one extra word ahead of AT */
+#define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List: 1 + 3 per entry, at most NFS_ACL_MAX_ENTRIES entries */
+
+static const struct svc_procedure nfsd_acl_procedures3[3] = { /* indexed by ACLPROC3_*; sizes above are presumably in 4-byte XDR words - confirm */
+ [ACLPROC3_NULL] = {
+ .pc_func = nfsd3_proc_null,
+ .pc_decode = nfs3svc_decode_voidarg,
+ .pc_encode = nfs3svc_encode_voidres,
+ .pc_argsize = sizeof(struct nfsd3_voidargs),
+ .pc_ressize = sizeof(struct nfsd3_voidargs),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST,
+ },
+ [ACLPROC3_GETACL] = {
+ .pc_func = nfsd3_proc_getacl,
+ .pc_decode = nfs3svc_decode_getaclargs,
+ .pc_encode = nfs3svc_encode_getaclres,
+ .pc_release = nfs3svc_release_getacl,
+ .pc_argsize = sizeof(struct nfsd3_getaclargs),
+ .pc_ressize = sizeof(struct nfsd3_getaclres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+1+2*(1+ACL),
+ },
+ [ACLPROC3_SETACL] = {
+ .pc_func = nfsd3_proc_setacl,
+ .pc_decode = nfs3svc_decode_setaclargs,
+ .pc_encode = nfs3svc_encode_setaclres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_setaclargs),
+ .pc_ressize = sizeof(struct nfsd3_attrstat),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+pAT,
+ },
+};
+
+/* Counter array handed to the RPC layer via .vs_count; one slot per table entry. */
+static unsigned int nfsd_acl_count3[ARRAY_SIZE(nfsd_acl_procedures3)];
+
+/* Version descriptor tying NFSACL version 3 to its procedure table. */
+const struct svc_version nfsd_acl_version3 = {
+	.vs_vers	= 3,
+	.vs_nproc	= ARRAY_SIZE(nfsd_acl_procedures3),
+	.vs_proc	= nfsd_acl_procedures3,
+	.vs_count	= nfsd_acl_count3,
+	.vs_xdrsize	= NFS3_SVC_XDRSIZE,
+	.vs_dispatch	= nfsd_dispatch,
+};
+
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
new file mode 100644
index 000000000..981a4e4c9
--- /dev/null
+++ b/fs/nfsd/nfs3proc.c
@@ -0,0 +1,935 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Process version 3 NFS requests.
+ *
+ * Copyright (C) 1996, 1997, 1998 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/fs.h>
+#include <linux/ext2_fs.h>
+#include <linux/magic.h>
+
+#include "cache.h"
+#include "xdr3.h"
+#include "vfs.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PROC
+
+/* S_IF* file type bits for each NFSv3 file type, indexed by NF3* value. */
+static int nfs3_ftypes[] = {
+	[NF3NON]	= 0,
+	[NF3REG]	= S_IFREG,
+	[NF3DIR]	= S_IFDIR,
+	[NF3BLK]	= S_IFBLK,
+	[NF3CHR]	= S_IFCHR,
+	[NF3LNK]	= S_IFLNK,
+	[NF3SOCK]	= S_IFSOCK,
+	[NF3FIFO]	= S_IFIFO,
+};
+
+/*
+ * NFS3PROC_NULL: does no work and reports RPC-level success.
+ */
+static __be32 nfsd3_proc_null(struct svc_rqst *rqstp)
+{
+	return rpc_success;
+}
+
+/*
+ * GETATTR: verify the file handle, then fetch its attributes into
+ * resp->stat.  The NFS status is reported through resp->status.
+ */
+static __be32 nfsd3_proc_getattr(struct svc_rqst *rqstp)
+{
+	struct nfsd_fhandle *argp = rqstp->rq_argp;
+	struct nfsd3_attrstat *resp = rqstp->rq_resp;
+
+	dprintk("nfsd: GETATTR(3) %s\n", SVCFH_fmt(&argp->fh));
+
+	fh_copy(&resp->fh, &argp->fh);
+	resp->status = fh_verify(rqstp, &resp->fh, 0,
+				 NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
+	if (resp->status == nfs_ok)
+		resp->status = fh_getattr(&resp->fh, &resp->stat);
+
+	return rpc_success;
+}
+
+/*
+ * SETATTR: apply the decoded attributes to the file, passing along the
+ * client's guard flag and guard time.
+ */
+static __be32 nfsd3_proc_setattr(struct svc_rqst *rqstp)
+{
+	struct nfsd3_sattrargs *argp = rqstp->rq_argp;
+	struct nfsd3_attrstat *resp = rqstp->rq_resp;
+
+	dprintk("nfsd: SETATTR(3) %s\n", SVCFH_fmt(&argp->fh));
+
+	fh_copy(&resp->fh, &argp->fh);
+	resp->status = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,
+				    argp->check_guard, argp->guardtime);
+
+	return rpc_success;
+}
+
+/*
+ * LOOKUP: resolve one name inside the directory given by the request's
+ * file handle.  resp->dirfh gets the directory handle, resp->fh the result.
+ */
+static __be32 nfsd3_proc_lookup(struct svc_rqst *rqstp)
+{
+	struct nfsd3_diropargs *argp = rqstp->rq_argp;
+	struct nfsd3_diropres *resp = rqstp->rq_resp;
+
+	dprintk("nfsd: LOOKUP(3) %s %.*s\n",
+		SVCFH_fmt(&argp->fh), argp->len, argp->name);
+
+	fh_copy(&resp->dirfh, &argp->fh);
+	fh_init(&resp->fh, NFS3_FHSIZE);
+	resp->status = nfsd_lookup(rqstp, &resp->dirfh, argp->name,
+				   argp->len, &resp->fh);
+
+	return rpc_success;
+}
+
+/*
+ * ACCESS: seed resp->access with the requested bits and let
+ * nfsd_access() update them.
+ */
+static __be32 nfsd3_proc_access(struct svc_rqst *rqstp)
+{
+	struct nfsd3_accessargs *argp = rqstp->rq_argp;
+	struct nfsd3_accessres *resp = rqstp->rq_resp;
+
+	dprintk("nfsd: ACCESS(3) %s 0x%x\n",
+		SVCFH_fmt(&argp->fh), argp->access);
+
+	fh_copy(&resp->fh, &argp->fh);
+	resp->access = argp->access;
+	resp->status = nfsd_access(rqstp, &resp->fh, &resp->access, NULL);
+
+	return rpc_success;
+}
+
+/*
+ * READLINK: read a symlink's target into argp->buffer.  resp->len is
+ * set to NFS3_MAXPATHLEN before the call and passed by address.
+ */
+static __be32 nfsd3_proc_readlink(struct svc_rqst *rqstp)
+{
+	struct nfsd3_readlinkargs *argp = rqstp->rq_argp;
+	struct nfsd3_readlinkres *resp = rqstp->rq_resp;
+
+	dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh));
+
+	fh_copy(&resp->fh, &argp->fh);
+	resp->len = NFS3_MAXPATHLEN;
+	resp->status = nfsd_readlink(rqstp, &resp->fh,
+				     argp->buffer, &resp->len);
+
+	return rpc_success;
+}
+
+/*
+ * READ: read part of a file.  The requested count is clamped to the
+ * transport's maximum payload before reply space is reserved.
+ */
+static __be32 nfsd3_proc_read(struct svc_rqst *rqstp)
+{
+	struct nfsd3_readargs *argp = rqstp->rq_argp;
+	struct nfsd3_readres *resp = rqstp->rq_resp;
+	u32 max_blocksize = svc_max_payload(rqstp);
+	unsigned long cnt = min(argp->count, max_blocksize);
+
+	dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
+				SVCFH_fmt(&argp->fh),
+				(unsigned long) argp->count,
+				(unsigned long long) argp->offset);
+
+	/*
+	 * Reserve reply space: 1 (status) + 22 (post_op_attr) + 1 (count)
+	 * + 1 (eof) + 1 (xdr opaque byte count) = 26 words, plus the
+	 * payload and four extra bytes.
+	 */
+	resp->count = cnt;
+	svc_reserve_auth(rqstp,
+			 ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
+
+	fh_copy(&resp->fh, &argp->fh);
+	resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
+				 rqstp->rq_vec, argp->vlen,
+				 &resp->count, &resp->eof);
+
+	return rpc_success;
+}
+
+/*
+ * WRITE: write data to a file.  A range that starts or ends beyond
+ * OFFSET_MAX is refused with nfserr_fbig before the handle is touched.
+ */
+static __be32 nfsd3_proc_write(struct svc_rqst *rqstp)
+{
+	struct nfsd3_writeargs *argp = rqstp->rq_argp;
+	struct nfsd3_writeres *resp = rqstp->rq_resp;
+	unsigned long cnt = argp->len;
+	unsigned int nvecs;
+
+	dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n",
+				SVCFH_fmt(&argp->fh),
+				argp->len,
+				(unsigned long long) argp->offset,
+				argp->stable? " stable" : "");
+
+	resp->status = nfserr_fbig;
+	if (argp->offset > (u64)OFFSET_MAX)
+		return rpc_success;
+	if (argp->offset + argp->len > (u64)OFFSET_MAX)
+		return rpc_success;
+
+	fh_copy(&resp->fh, &argp->fh);
+	resp->committed = argp->stable;
+
+	nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages,
+				      &argp->first, cnt);
+	if (nvecs == 0) {
+		resp->status = nfserr_io;
+		return rpc_success;
+	}
+
+	resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
+				  rqstp->rq_vec, nvecs, &cnt,
+				  resp->committed, resp->verf);
+	resp->count = cnt;
+
+	return rpc_success;
+}
+
+/*
+ * CREATE: create a regular file.  Whatever file-type bits the client
+ * sent are replaced by S_IFREG, and ATTR_MODE is forced on so a mode is
+ * always passed down to do_nfsd_create().
+ */
+static __be32 nfsd3_proc_create(struct svc_rqst *rqstp)
+{
+	struct nfsd3_createargs *argp = rqstp->rq_argp;
+	struct nfsd3_diropres *resp = rqstp->rq_resp;
+	struct iattr *attr = &argp->attrs;
+	svc_fh *parent, *child;
+
+	dprintk("nfsd: CREATE(3) %s %.*s\n",
+				SVCFH_fmt(&argp->fh),
+				argp->len,
+				argp->name);
+
+	parent = fh_copy(&resp->dirfh, &argp->fh);
+	child = fh_init(&resp->fh, NFS3_FHSIZE);
+
+	/* Unfudge the mode bits: the result is always a regular file. */
+	if (attr->ia_valid & ATTR_MODE) {
+		attr->ia_mode = (attr->ia_mode & ~S_IFMT) | S_IFREG;
+	} else {
+		attr->ia_valid |= ATTR_MODE;
+		attr->ia_mode = S_IFREG;
+	}
+
+	/* Now create the file and set attributes */
+	resp->status = do_nfsd_create(rqstp, parent, argp->name, argp->len,
+				      attr, child, argp->createmode,
+				      (u32 *)argp->verf, NULL, NULL);
+
+	return rpc_success;
+}
+
+/*
+ * MKDIR: create a directory.  This operation is not idempotent.
+ * ATTR_SIZE is cleared from the requested attributes first.
+ */
+static __be32 nfsd3_proc_mkdir(struct svc_rqst *rqstp)
+{
+	struct nfsd3_createargs *argp = rqstp->rq_argp;
+	struct nfsd3_diropres *resp = rqstp->rq_resp;
+
+	dprintk("nfsd: MKDIR(3) %s %.*s\n",
+		SVCFH_fmt(&argp->fh), argp->len, argp->name);
+
+	argp->attrs.ia_valid &= ~ATTR_SIZE;
+
+	fh_copy(&resp->dirfh, &argp->fh);
+	fh_init(&resp->fh, NFS3_FHSIZE);
+	resp->status = nfsd_create(rqstp, &resp->dirfh,
+				   argp->name, argp->len,
+				   &argp->attrs, S_IFDIR, 0, &resp->fh);
+	fh_unlock(&resp->dirfh);
+
+	return rpc_success;
+}
+
+/*
+ * SYMLINK: create a symbolic link.  The target length is validated,
+ * the target path is obtained from svc_fill_symlink_pathname(), and
+ * that string is freed once nfsd_symlink() has run.
+ */
+static __be32 nfsd3_proc_symlink(struct svc_rqst *rqstp)
+{
+	struct nfsd3_symlinkargs *argp = rqstp->rq_argp;
+	struct nfsd3_diropres *resp = rqstp->rq_resp;
+
+	if (argp->tlen == 0) {
+		resp->status = nfserr_inval;
+		return rpc_success;
+	}
+	if (argp->tlen > NFS3_MAXPATHLEN) {
+		resp->status = nfserr_nametoolong;
+		return rpc_success;
+	}
+
+	argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first,
+				page_address(rqstp->rq_arg.pages[0]),
+				argp->tlen);
+	if (IS_ERR(argp->tname)) {
+		resp->status = nfserrno(PTR_ERR(argp->tname));
+		return rpc_success;
+	}
+
+	dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n",
+				SVCFH_fmt(&argp->ffh),
+				argp->flen, argp->fname,
+				argp->tlen, argp->tname);
+
+	fh_copy(&resp->dirfh, &argp->ffh);
+	fh_init(&resp->fh, NFS3_FHSIZE);
+	resp->status = nfsd_symlink(rqstp, &resp->dirfh, argp->fname,
+				    argp->flen, argp->tname, &resp->fh);
+	kfree(argp->tname);
+
+	return rpc_success;
+}
+
+/*
+ * MKNOD: make a socket, fifo or device node.  Device numbers must
+ * survive a MKDEV()/MAJOR()/MINOR() round trip; any other file type is
+ * answered with nfserr_badtype.
+ */
+static __be32 nfsd3_proc_mknod(struct svc_rqst *rqstp)
+{
+	struct nfsd3_mknodargs *argp = rqstp->rq_argp;
+	struct nfsd3_diropres *resp = rqstp->rq_resp;
+	dev_t rdev = 0;
+	int type;
+
+	dprintk("nfsd: MKNOD(3) %s %.*s\n",
+				SVCFH_fmt(&argp->fh),
+				argp->len,
+				argp->name);
+
+	fh_copy(&resp->dirfh, &argp->fh);
+	fh_init(&resp->fh, NFS3_FHSIZE);
+
+	switch (argp->ftype) {
+	case NF3CHR:
+	case NF3BLK:
+		rdev = MKDEV(argp->major, argp->minor);
+		if (MAJOR(rdev) != argp->major ||
+		    MINOR(rdev) != argp->minor) {
+			resp->status = nfserr_inval;
+			return rpc_success;
+		}
+		break;
+	case NF3SOCK:
+	case NF3FIFO:
+		break;
+	default:
+		resp->status = nfserr_badtype;
+		return rpc_success;
+	}
+
+	type = nfs3_ftypes[argp->ftype];
+	resp->status = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
+				   &argp->attrs, type, rdev, &resp->fh);
+	fh_unlock(&resp->dirfh);
+
+	return rpc_success;
+}
+
+/*
+ * REMOVE: unlink a file, fifo, socket etc.  Passing -S_IFDIR tells
+ * nfsd_unlink() that the victim must not be a directory.
+ */
+static __be32 nfsd3_proc_remove(struct svc_rqst *rqstp)
+{
+	struct nfsd3_diropargs *argp = rqstp->rq_argp;
+	struct nfsd3_attrstat *resp = rqstp->rq_resp;
+
+	dprintk("nfsd: REMOVE(3) %s %.*s\n",
+		SVCFH_fmt(&argp->fh), argp->len, argp->name);
+
+	fh_copy(&resp->fh, &argp->fh);
+	resp->status = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR,
+				   argp->name, argp->len);
+	fh_unlock(&resp->fh);
+
+	return rpc_success;
+}
+
+/*
+ * RMDIR: remove a directory via nfsd_unlink() with type S_IFDIR.
+ */
+static __be32 nfsd3_proc_rmdir(struct svc_rqst *rqstp)
+{
+	struct nfsd3_diropargs *argp = rqstp->rq_argp;
+	struct nfsd3_attrstat *resp = rqstp->rq_resp;
+
+	dprintk("nfsd: RMDIR(3) %s %.*s\n",
+		SVCFH_fmt(&argp->fh), argp->len, argp->name);
+
+	fh_copy(&resp->fh, &argp->fh);
+	resp->status = nfsd_unlink(rqstp, &resp->fh, S_IFDIR,
+				   argp->name, argp->len);
+	fh_unlock(&resp->fh);
+
+	return rpc_success;
+}
+
+/*
+ * RENAME: move the name (ffh, fname) to (tfh, tname).  Both directory
+ * handles are copied into the reply before nfsd_rename() runs.
+ */
+static __be32 nfsd3_proc_rename(struct svc_rqst *rqstp)
+{
+	struct nfsd3_renameargs *argp = rqstp->rq_argp;
+	struct nfsd3_renameres *resp = rqstp->rq_resp;
+
+	dprintk("nfsd: RENAME(3) %s %.*s ->\n",
+		SVCFH_fmt(&argp->ffh), argp->flen, argp->fname);
+	dprintk("nfsd: -> %s %.*s\n",
+		SVCFH_fmt(&argp->tfh), argp->tlen, argp->tname);
+
+	fh_copy(&resp->ffh, &argp->ffh);
+	fh_copy(&resp->tfh, &argp->tfh);
+	resp->status = nfsd_rename(rqstp,
+				   &resp->ffh, argp->fname, argp->flen,
+				   &resp->tfh, argp->tname, argp->tlen);
+
+	return rpc_success;
+}
+
+/*
+ * LINK: create the name tname in directory tfh, referring to the
+ * existing object ffh.
+ */
+static __be32 nfsd3_proc_link(struct svc_rqst *rqstp)
+{
+	struct nfsd3_linkargs *argp = rqstp->rq_argp;
+	struct nfsd3_linkres *resp = rqstp->rq_resp;
+
+	dprintk("nfsd: LINK(3) %s ->\n", SVCFH_fmt(&argp->ffh));
+	dprintk("nfsd: -> %s %.*s\n",
+		SVCFH_fmt(&argp->tfh), argp->tlen, argp->tname);
+
+	fh_copy(&resp->fh, &argp->ffh);
+	fh_copy(&resp->tfh, &argp->tfh);
+	resp->status = nfsd_link(rqstp, &resp->tfh,
+				 argp->tname, argp->tlen, &resp->fh);
+
+	return rpc_success;
+}
+
+/*
+ * Read a portion of a directory.
+ *
+ * Entries are encoded by nfs3svc_encode_entry while nfsd_readdir() walks
+ * the directory.  Afterwards the amount of encoded data is worked out by
+ * walking the response pages from rq_respages + 1 up to the page that
+ * holds resp->buffer, and is stored in resp->count as 32-bit words.
+ * If resp->offset is set, the final directory cookie is written there.
+ * The NFS status is reported through resp->status.
+ */
+static __be32
+nfsd3_proc_readdir(struct svc_rqst *rqstp)
+{
+ struct nfsd3_readdirargs *argp = rqstp->rq_argp;
+ struct nfsd3_readdirres *resp = rqstp->rq_resp;
+ int count = 0;
+ struct page **p;
+ caddr_t page_addr = NULL;
+
+ dprintk("nfsd: READDIR(3) %s %d bytes at %d\n",
+ SVCFH_fmt(&argp->fh),
+ argp->count, (u32) argp->cookie);
+
+ /* Make sure we've room for the NULL ptr & eof flag, and shrink to
+ * client read size: bytes become words (>> 2), minus two words */
+ count = (argp->count >> 2) - 2;
+
+ /* Read directory and encode entries on the fly */
+ fh_copy(&resp->fh, &argp->fh);
+
+ resp->buflen = count;
+ resp->common.err = nfs_ok;
+ resp->buffer = argp->buffer;
+ resp->rqstp = rqstp;
+ resp->status = nfsd_readdir(rqstp, &resp->fh, (loff_t *)&argp->cookie,
+ &resp->common, nfs3svc_encode_entry);
+ memcpy(resp->verf, argp->verf, 8);
+ count = 0; /* from here on: bytes of encoded data */
+ for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) {
+ page_addr = page_address(*p);
+
+ if (((caddr_t)resp->buffer >= page_addr) &&
+ ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) {
+ count += (caddr_t)resp->buffer - page_addr; /* partial last page */
+ break;
+ }
+ count += PAGE_SIZE; /* a page lying wholly before resp->buffer */
+ }
+ resp->count = count >> 2; /* bytes -> 32-bit words */
+ if (resp->offset) {
+ loff_t offset = argp->cookie;
+
+ if (unlikely(resp->offset1)) {
+ /* we ended up with offset on a page boundary:
+ * the two halves are stored through separate pointers */
+ *resp->offset = htonl(offset >> 32);
+ *resp->offset1 = htonl(offset & 0xffffffff);
+ resp->offset1 = NULL;
+ } else {
+ xdr_encode_hyper(resp->offset, offset);
+ }
+ resp->offset = NULL;
+ }
+
+ return rpc_success;
+}
+
+/*
+ * Read a portion of a directory, including file handles and attrs.
+ * For now, we choose to ignore the dircount parameter.
+ *
+ * The handle must verify as a directory, and the export must not have
+ * NFSEXP_NOREADDIRPLUS set (that yields nfserr_notsupp).  Both checks
+ * jump straight to the exit, skipping the verifier copy and the
+ * count/cookie fix-ups below.  Otherwise entries are encoded by
+ * nfs3svc_encode_entry_plus and resp->count is recomputed as the number
+ * of 32-bit words of encoded data.
+ */
+static __be32
+nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
+{
+ struct nfsd3_readdirargs *argp = rqstp->rq_argp;
+ struct nfsd3_readdirres *resp = rqstp->rq_resp;
+ int count = 0;
+ loff_t offset;
+ struct page **p;
+ caddr_t page_addr = NULL;
+
+ dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n",
+ SVCFH_fmt(&argp->fh),
+ argp->count, (u32) argp->cookie);
+
+ /* Convert byte count to number of words (i.e. >> 2),
+ * and reserve room for the NULL ptr & eof flag (-2 words) */
+ resp->count = (argp->count >> 2) - 2;
+
+ /* Read directory and encode entries on the fly */
+ fh_copy(&resp->fh, &argp->fh);
+
+ resp->common.err = nfs_ok;
+ resp->buffer = argp->buffer;
+ resp->buflen = resp->count;
+ resp->rqstp = rqstp;
+ offset = argp->cookie;
+
+ resp->status = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP);
+ if (resp->status != nfs_ok)
+ goto out;
+
+ if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS) {
+ resp->status = nfserr_notsupp;
+ goto out;
+ }
+
+ resp->status = nfsd_readdir(rqstp, &resp->fh, &offset,
+ &resp->common, nfs3svc_encode_entry_plus);
+ memcpy(resp->verf, argp->verf, 8);
+ for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) {
+ page_addr = page_address(*p);
+
+ if (((caddr_t)resp->buffer >= page_addr) &&
+ ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) {
+ count += (caddr_t)resp->buffer - page_addr; /* partial last page */
+ break;
+ }
+ count += PAGE_SIZE; /* a page lying wholly before resp->buffer */
+ }
+ resp->count = count >> 2; /* bytes -> 32-bit words */
+ if (resp->offset) {
+ if (unlikely(resp->offset1)) {
+ /* we ended up with offset on a page boundary:
+ * the two halves are stored through separate pointers */
+ *resp->offset = htonl(offset >> 32);
+ *resp->offset1 = htonl(offset & 0xffffffff);
+ resp->offset1 = NULL;
+ } else {
+ xdr_encode_hyper(resp->offset, offset);
+ }
+ resp->offset = NULL;
+ }
+
+out:
+ return rpc_success;
+}
+
+/*
+ * FSSTAT: fetch file system statistics.  This works on the argument
+ * handle directly and puts it before returning.
+ */
+static __be32 nfsd3_proc_fsstat(struct svc_rqst *rqstp)
+{
+	struct nfsd_fhandle *argp = rqstp->rq_argp;
+	struct nfsd3_fsstatres *resp = rqstp->rq_resp;
+
+	dprintk("nfsd: FSSTAT(3) %s\n", SVCFH_fmt(&argp->fh));
+
+	resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
+	fh_put(&argp->fh);
+
+	return rpc_success;
+}
+
+/*
+ * FSINFO: report transfer size limits and file system properties.
+ * Defaults are filled in first; if the handle verifies, the superblock
+ * supplies the maximum file size and may override the properties.
+ */
+static __be32 nfsd3_proc_fsinfo(struct svc_rqst *rqstp)
+{
+	struct nfsd_fhandle *argp = rqstp->rq_argp;
+	struct nfsd3_fsinfores *resp = rqstp->rq_resp;
+	u32 max_blocksize = svc_max_payload(rqstp);
+
+	dprintk("nfsd: FSINFO(3) %s\n", SVCFH_fmt(&argp->fh));
+
+	/* Read limits */
+	resp->f_rtmax = max_blocksize;
+	resp->f_rtpref = max_blocksize;
+	resp->f_rtmult = PAGE_SIZE;
+	/* Write limits */
+	resp->f_wtmax = max_blocksize;
+	resp->f_wtpref = max_blocksize;
+	resp->f_wtmult = PAGE_SIZE;
+	/* Directory read preference and generic defaults */
+	resp->f_dtpref = max_blocksize;
+	resp->f_maxfilesize = ~(u32) 0;
+	resp->f_properties = NFS3_FSF_DEFAULT;
+
+	resp->status = fh_verify(rqstp, &argp->fh, 0,
+				 NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
+
+	/*
+	 * Check special features of the file system. May request
+	 * different read/write sizes for file systems known to have
+	 * problems with large blocks.
+	 */
+	if (resp->status == nfs_ok) {
+		struct super_block *sb = argp->fh.fh_dentry->d_sb;
+
+		/* Note that we don't care for remote fs's here */
+		if (sb->s_magic == MSDOS_SUPER_MAGIC)
+			resp->f_properties = NFS3_FSF_BILLYBOY;
+		resp->f_maxfilesize = sb->s_maxbytes;
+	}
+
+	fh_put(&argp->fh);
+	return rpc_success;
+}
+
+/*
+ * PATHCONF: report pathconf values for the given file.  Generic
+ * defaults are set first and then adjusted for ext2 and MSDOS
+ * superblocks when the handle verifies.
+ */
+static __be32 nfsd3_proc_pathconf(struct svc_rqst *rqstp)
+{
+	struct nfsd_fhandle *argp = rqstp->rq_argp;
+	struct nfsd3_pathconfres *resp = rqstp->rq_resp;
+
+	dprintk("nfsd: PATHCONF(3) %s\n", SVCFH_fmt(&argp->fh));
+
+	/* Set default pathconf */
+	resp->p_link_max = 255;		/* at least */
+	resp->p_name_max = 255;		/* at least */
+	resp->p_no_trunc = 0;
+	resp->p_chown_restricted = 1;
+	resp->p_case_insensitive = 0;
+	resp->p_case_preserving = 1;
+
+	resp->status = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP);
+	if (resp->status == nfs_ok) {
+		struct super_block *sb = argp->fh.fh_dentry->d_sb;
+
+		/* Note that we don't care for remote fs's here */
+		if (sb->s_magic == EXT2_SUPER_MAGIC) {
+			resp->p_link_max = EXT2_LINK_MAX;
+			resp->p_name_max = EXT2_NAME_LEN;
+		} else if (sb->s_magic == MSDOS_SUPER_MAGIC) {
+			resp->p_case_insensitive = 1;
+			resp->p_case_preserving = 0;
+		}
+	}
+
+	fh_put(&argp->fh);
+	return rpc_success;
+}
+
+/*
+ * COMMIT: commit a file range to stable storage.  An offset above
+ * NFS_OFFSET_MAX is rejected with nfserr_inval before the handle is
+ * copied.
+ */
+static __be32 nfsd3_proc_commit(struct svc_rqst *rqstp)
+{
+	struct nfsd3_commitargs *argp = rqstp->rq_argp;
+	struct nfsd3_commitres *resp = rqstp->rq_resp;
+
+	dprintk("nfsd: COMMIT(3) %s %u@%Lu\n",
+				SVCFH_fmt(&argp->fh),
+				argp->count,
+				(unsigned long long) argp->offset);
+
+	if (argp->offset > NFS_OFFSET_MAX) {
+		resp->status = nfserr_inval;
+		return rpc_success;
+	}
+
+	fh_copy(&resp->fh, &argp->fh);
+	resp->status = nfsd_commit(rqstp, &resp->fh, argp->offset,
+				   argp->count, resp->verf);
+
+	return rpc_success;
+}
+
+
+/*
+ * NFSv3 Server procedures.
+ * Only the results of non-idempotent operations are cached.
+ *
+ * The #defines below alias argument/result type names and XDR helpers so
+ * that every table entry can follow the same naming pattern.
+ * ST/FH/AT/pAT/WC are building blocks for .pc_xdrressize (presumably
+ * counted in 4-byte XDR words - confirm against the svc_procedure
+ * definition).  The table is indexed by NFS3PROC_* procedure number.
+ * Entries using RC_REPLBUFF are the cached ones; READDIR and READDIRPLUS
+ * set no .pc_xdrressize, and FSSTAT, FSINFO and PATHCONF set no
+ * .pc_release (their handlers call fh_put() themselves).
+ */
+#define nfs3svc_decode_fhandleargs nfs3svc_decode_fhandle
+#define nfs3svc_encode_attrstatres nfs3svc_encode_attrstat
+#define nfs3svc_encode_wccstatres nfs3svc_encode_wccstat
+#define nfsd3_mkdirargs nfsd3_createargs
+#define nfsd3_readdirplusargs nfsd3_readdirargs
+#define nfsd3_fhandleargs nfsd_fhandle
+#define nfsd3_fhandleres nfsd3_attrstat
+#define nfsd3_attrstatres nfsd3_attrstat
+#define nfsd3_wccstatres nfsd3_attrstat
+#define nfsd3_createres nfsd3_diropres
+#define nfsd3_voidres nfsd3_voidargs
+struct nfsd3_voidargs { int dummy; };
+
+#define ST 1 /* status */
+#define FH 17 /* filehandle with length */
+#define AT 21 /* attributes */
+#define pAT (1+AT) /* post attributes - conditional */
+#define WC (7+pAT) /* WCC attributes */
+
+static const struct svc_procedure nfsd_procedures3[22] = {
+ [NFS3PROC_NULL] = {
+ .pc_func = nfsd3_proc_null,
+ .pc_decode = nfs3svc_decode_voidarg,
+ .pc_encode = nfs3svc_encode_voidres,
+ .pc_argsize = sizeof(struct nfsd3_voidargs),
+ .pc_ressize = sizeof(struct nfsd3_voidres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST,
+ },
+ [NFS3PROC_GETATTR] = {
+ .pc_func = nfsd3_proc_getattr,
+ .pc_decode = nfs3svc_decode_fhandleargs,
+ .pc_encode = nfs3svc_encode_attrstatres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_fhandleargs),
+ .pc_ressize = sizeof(struct nfsd3_attrstatres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+AT,
+ },
+ [NFS3PROC_SETATTR] = {
+ .pc_func = nfsd3_proc_setattr,
+ .pc_decode = nfs3svc_decode_sattrargs,
+ .pc_encode = nfs3svc_encode_wccstatres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_sattrargs),
+ .pc_ressize = sizeof(struct nfsd3_wccstatres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+WC,
+ },
+ [NFS3PROC_LOOKUP] = {
+ .pc_func = nfsd3_proc_lookup,
+ .pc_decode = nfs3svc_decode_diropargs,
+ .pc_encode = nfs3svc_encode_diropres,
+ .pc_release = nfs3svc_release_fhandle2,
+ .pc_argsize = sizeof(struct nfsd3_diropargs),
+ .pc_ressize = sizeof(struct nfsd3_diropres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+FH+pAT+pAT,
+ },
+ [NFS3PROC_ACCESS] = {
+ .pc_func = nfsd3_proc_access,
+ .pc_decode = nfs3svc_decode_accessargs,
+ .pc_encode = nfs3svc_encode_accessres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_accessargs),
+ .pc_ressize = sizeof(struct nfsd3_accessres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+pAT+1,
+ },
+ [NFS3PROC_READLINK] = {
+ .pc_func = nfsd3_proc_readlink,
+ .pc_decode = nfs3svc_decode_readlinkargs,
+ .pc_encode = nfs3svc_encode_readlinkres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_readlinkargs),
+ .pc_ressize = sizeof(struct nfsd3_readlinkres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4,
+ },
+ [NFS3PROC_READ] = {
+ .pc_func = nfsd3_proc_read,
+ .pc_decode = nfs3svc_decode_readargs,
+ .pc_encode = nfs3svc_encode_readres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_readargs),
+ .pc_ressize = sizeof(struct nfsd3_readres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4,
+ },
+ [NFS3PROC_WRITE] = {
+ .pc_func = nfsd3_proc_write,
+ .pc_decode = nfs3svc_decode_writeargs,
+ .pc_encode = nfs3svc_encode_writeres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_writeargs),
+ .pc_ressize = sizeof(struct nfsd3_writeres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+WC+4,
+ },
+ [NFS3PROC_CREATE] = {
+ .pc_func = nfsd3_proc_create,
+ .pc_decode = nfs3svc_decode_createargs,
+ .pc_encode = nfs3svc_encode_createres,
+ .pc_release = nfs3svc_release_fhandle2,
+ .pc_argsize = sizeof(struct nfsd3_createargs),
+ .pc_ressize = sizeof(struct nfsd3_createres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+(1+FH+pAT)+WC,
+ },
+ [NFS3PROC_MKDIR] = {
+ .pc_func = nfsd3_proc_mkdir,
+ .pc_decode = nfs3svc_decode_mkdirargs,
+ .pc_encode = nfs3svc_encode_createres,
+ .pc_release = nfs3svc_release_fhandle2,
+ .pc_argsize = sizeof(struct nfsd3_mkdirargs),
+ .pc_ressize = sizeof(struct nfsd3_createres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+(1+FH+pAT)+WC,
+ },
+ [NFS3PROC_SYMLINK] = {
+ .pc_func = nfsd3_proc_symlink,
+ .pc_decode = nfs3svc_decode_symlinkargs,
+ .pc_encode = nfs3svc_encode_createres,
+ .pc_release = nfs3svc_release_fhandle2,
+ .pc_argsize = sizeof(struct nfsd3_symlinkargs),
+ .pc_ressize = sizeof(struct nfsd3_createres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+(1+FH+pAT)+WC,
+ },
+ [NFS3PROC_MKNOD] = {
+ .pc_func = nfsd3_proc_mknod,
+ .pc_decode = nfs3svc_decode_mknodargs,
+ .pc_encode = nfs3svc_encode_createres,
+ .pc_release = nfs3svc_release_fhandle2,
+ .pc_argsize = sizeof(struct nfsd3_mknodargs),
+ .pc_ressize = sizeof(struct nfsd3_createres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+(1+FH+pAT)+WC,
+ },
+ [NFS3PROC_REMOVE] = {
+ .pc_func = nfsd3_proc_remove,
+ .pc_decode = nfs3svc_decode_diropargs,
+ .pc_encode = nfs3svc_encode_wccstatres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_diropargs),
+ .pc_ressize = sizeof(struct nfsd3_wccstatres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+WC,
+ },
+ [NFS3PROC_RMDIR] = {
+ .pc_func = nfsd3_proc_rmdir,
+ .pc_decode = nfs3svc_decode_diropargs,
+ .pc_encode = nfs3svc_encode_wccstatres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_diropargs),
+ .pc_ressize = sizeof(struct nfsd3_wccstatres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+WC,
+ },
+ [NFS3PROC_RENAME] = {
+ .pc_func = nfsd3_proc_rename,
+ .pc_decode = nfs3svc_decode_renameargs,
+ .pc_encode = nfs3svc_encode_renameres,
+ .pc_release = nfs3svc_release_fhandle2,
+ .pc_argsize = sizeof(struct nfsd3_renameargs),
+ .pc_ressize = sizeof(struct nfsd3_renameres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+WC+WC,
+ },
+ [NFS3PROC_LINK] = {
+ .pc_func = nfsd3_proc_link,
+ .pc_decode = nfs3svc_decode_linkargs,
+ .pc_encode = nfs3svc_encode_linkres,
+ .pc_release = nfs3svc_release_fhandle2,
+ .pc_argsize = sizeof(struct nfsd3_linkargs),
+ .pc_ressize = sizeof(struct nfsd3_linkres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+pAT+WC,
+ },
+ [NFS3PROC_READDIR] = {
+ .pc_func = nfsd3_proc_readdir,
+ .pc_decode = nfs3svc_decode_readdirargs,
+ .pc_encode = nfs3svc_encode_readdirres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_readdirargs),
+ .pc_ressize = sizeof(struct nfsd3_readdirres),
+ .pc_cachetype = RC_NOCACHE,
+ },
+ [NFS3PROC_READDIRPLUS] = {
+ .pc_func = nfsd3_proc_readdirplus,
+ .pc_decode = nfs3svc_decode_readdirplusargs,
+ .pc_encode = nfs3svc_encode_readdirres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_readdirplusargs),
+ .pc_ressize = sizeof(struct nfsd3_readdirres),
+ .pc_cachetype = RC_NOCACHE,
+ },
+ [NFS3PROC_FSSTAT] = {
+ .pc_func = nfsd3_proc_fsstat,
+ .pc_decode = nfs3svc_decode_fhandleargs,
+ .pc_encode = nfs3svc_encode_fsstatres,
+ .pc_argsize = sizeof(struct nfsd3_fhandleargs),
+ .pc_ressize = sizeof(struct nfsd3_fsstatres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+pAT+2*6+1,
+ },
+ [NFS3PROC_FSINFO] = {
+ .pc_func = nfsd3_proc_fsinfo,
+ .pc_decode = nfs3svc_decode_fhandleargs,
+ .pc_encode = nfs3svc_encode_fsinfores,
+ .pc_argsize = sizeof(struct nfsd3_fhandleargs),
+ .pc_ressize = sizeof(struct nfsd3_fsinfores),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+pAT+12,
+ },
+ [NFS3PROC_PATHCONF] = {
+ .pc_func = nfsd3_proc_pathconf,
+ .pc_decode = nfs3svc_decode_fhandleargs,
+ .pc_encode = nfs3svc_encode_pathconfres,
+ .pc_argsize = sizeof(struct nfsd3_fhandleargs),
+ .pc_ressize = sizeof(struct nfsd3_pathconfres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+pAT+6,
+ },
+ [NFS3PROC_COMMIT] = {
+ .pc_func = nfsd3_proc_commit,
+ .pc_decode = nfs3svc_decode_commitargs,
+ .pc_encode = nfs3svc_encode_commitres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_commitargs),
+ .pc_ressize = sizeof(struct nfsd3_commitres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+WC+2,
+ },
+};
+
+/* Counter array handed to the RPC layer via .vs_count; one slot per table entry. */
+static unsigned int nfsd_count3[ARRAY_SIZE(nfsd_procedures3)];
+
+/* Version descriptor tying NFS version 3 to its procedure table. */
+const struct svc_version nfsd_version3 = {
+	.vs_vers	= 3,
+	.vs_nproc	= ARRAY_SIZE(nfsd_procedures3),
+	.vs_proc	= nfsd_procedures3,
+	.vs_count	= nfsd_count3,
+	.vs_xdrsize	= NFS3_SVC_XDRSIZE,
+	.vs_dispatch	= nfsd_dispatch,
+};
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
new file mode 100644
index 000000000..716566da4
--- /dev/null
+++ b/fs/nfsd/nfs3xdr.c
@@ -0,0 +1,1171 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * XDR support for nfsd/protocol version 3.
+ *
+ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
+ *
+ * 2003-08-09 Jamie Lokier: Use htonl() for nanoseconds, not htons()!
+ */
+
+#include <linux/namei.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include "xdr3.h"
+#include "auth.h"
+#include "netns.h"
+#include "vfs.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_XDR
+
+
+/*
+ * Mapping of S_IF* types to NFS file types.
+ * The table is indexed with (mode & S_IFMT) >> 12 and is only ever read,
+ * so it is declared const.
+ */
+static const u32 nfs3_ftypes[] = {
+	NF3NON,  NF3FIFO, NF3CHR, NF3BAD,
+	NF3DIR,  NF3BAD,  NF3BLK, NF3BAD,
+	NF3REG,  NF3BAD,  NF3LNK, NF3BAD,
+	NF3SOCK, NF3BAD,  NF3LNK, NF3BAD,
+};
+
+
+/*
+ * XDR functions for basic NFS types
+ */
+/*
+ * Write a timestamp as two XDR words: seconds (cast to u32) and then
+ * nanoseconds, both converted with htonl().  Returns the next free word.
+ */
+static __be32 *
+encode_time3(__be32 *p, struct timespec64 *time)
+{
+	p[0] = htonl((u32) time->tv_sec);
+	p[1] = htonl(time->tv_nsec);
+	return p + 2;
+}
+
+/*
+ * Read a timestamp from two XDR words (seconds, nanoseconds) into *time.
+ * Returns the pointer just past the two words consumed.
+ */
+static __be32 *
+decode_time3(__be32 *p, struct timespec64 *time)
+{
+	time->tv_sec = ntohl(p[0]);
+	time->tv_nsec = ntohl(p[1]);
+	return p + 2;
+}
+
+/*
+ * Decode a length-prefixed file handle into fhp.
+ * Returns the pointer past the (word-padded) handle, or NULL when the
+ * advertised length exceeds NFS3_FHSIZE.
+ */
+static __be32 *
+decode_fh(__be32 *p, struct svc_fh *fhp)
+{
+	unsigned int fhlen;
+
+	fh_init(fhp, NFS3_FHSIZE);
+
+	fhlen = ntohl(*p);
+	if (fhlen > NFS3_FHSIZE)
+		return NULL;
+	p++;
+
+	memcpy(&fhp->fh_handle.fh_base, p, fhlen);
+	fhp->fh_handle.fh_size = fhlen;
+
+	return p + XDR_QUADLEN(fhlen);
+}
+
+/* Non-static wrapper around decode_fh(), for the NFSv3 ACL code */
+__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp)
+{
+	__be32 *next = decode_fh(p, fhp);
+
+	return next;
+}
+
+/*
+ * Encode the file handle held in fhp as a length word followed by the
+ * handle bytes.  Returns the pointer past the (word-padded) handle.
+ */
+static __be32 *
+encode_fh(__be32 *p, struct svc_fh *fhp)
+{
+	unsigned int fhlen = fhp->fh_handle.fh_size;
+	__be32 *body = p + 1;
+
+	*p = htonl(fhlen);
+	/* clear the final word first so any trailing pad bytes are zero */
+	if (fhlen)
+		body[XDR_QUADLEN(fhlen) - 1] = 0;
+	memcpy(body, &fhp->fh_handle.fh_base, fhlen);
+
+	return body + XDR_QUADLEN(fhlen);
+}
+
+/*
+ * Decode a file name in place (at most NFS3_MAXNAMLEN bytes) and reject
+ * it if any byte is a NUL or a slash.
+ * Returns the pointer past the name, or NULL on failure.
+ */
+static __be32 *
+decode_filename(__be32 *p, char **namp, unsigned int *lenp)
+{
+	const char *cp;
+	unsigned int pos;
+
+	p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN);
+	if (p == NULL)
+		return NULL;
+
+	cp = *namp;
+	for (pos = 0; pos < *lenp; pos++) {
+		if (cp[pos] == '\0' || cp[pos] == '/')
+			return NULL;
+	}
+
+	return p;
+}
+
+/*
+ * Decode a settable-attributes structure into *iap.
+ * Each attribute is preceded by a discriminator word; iap->ia_valid
+ * collects the ATTR_* bits for the attributes that were present (uid and
+ * gid only when they map to a valid id in userns).
+ * Returns the pointer past the decoded data.
+ */
+static __be32 *
+decode_sattr3(__be32 *p, struct iattr *iap, struct user_namespace *userns)
+{
+	u32 how;
+
+	iap->ia_valid = 0;
+
+	/* mode */
+	if (*p++) {
+		iap->ia_valid |= ATTR_MODE;
+		iap->ia_mode = ntohl(*p++);
+	}
+
+	/* owner */
+	if (*p++) {
+		iap->ia_uid = make_kuid(userns, ntohl(*p++));
+		if (uid_valid(iap->ia_uid))
+			iap->ia_valid |= ATTR_UID;
+	}
+
+	/* group */
+	if (*p++) {
+		iap->ia_gid = make_kgid(userns, ntohl(*p++));
+		if (gid_valid(iap->ia_gid))
+			iap->ia_valid |= ATTR_GID;
+	}
+
+	/* size, clamped to NFS_OFFSET_MAX */
+	if (*p++) {
+		u64 wanted;
+
+		iap->ia_valid |= ATTR_SIZE;
+		p = xdr_decode_hyper(p, &wanted);
+		iap->ia_size = min_t(u64, wanted, NFS_OFFSET_MAX);
+	}
+
+	/* atime */
+	how = ntohl(*p++);
+	switch (how) {
+	case 1:		/* set to server time */
+		iap->ia_valid |= ATTR_ATIME;
+		break;
+	case 2:		/* set to client time */
+		iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET;
+		iap->ia_atime.tv_sec = ntohl(*p++);
+		iap->ia_atime.tv_nsec = ntohl(*p++);
+		break;
+	default:
+		break;
+	}
+
+	/* mtime */
+	how = ntohl(*p++);
+	switch (how) {
+	case 1:		/* set to server time */
+		iap->ia_valid |= ATTR_MTIME;
+		break;
+	case 2:		/* set to client time */
+		iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET;
+		iap->ia_mtime.tv_sec = ntohl(*p++);
+		iap->ia_mtime.tv_nsec = ntohl(*p++);
+		break;
+	default:
+		break;
+	}
+
+	return p;
+}
+
+/*
+ * Encode the 64-bit fsid for fhp.  The value comes from the device
+ * number, the export's fsid, or the two 64-bit halves of the export's
+ * uuid XORed together, depending on fsid_source(fhp).
+ */
+static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp)
+{
+	u64 fsid;
+
+	switch (fsid_source(fhp)) {
+	case FSIDSOURCE_FSID:
+		fsid = (u64) fhp->fh_export->ex_fsid;
+		break;
+	case FSIDSOURCE_UUID:
+		fsid = ((u64*)fhp->fh_export->ex_uuid)[0];
+		fsid ^= ((u64*)fhp->fh_export->ex_uuid)[1];
+		break;
+	case FSIDSOURCE_DEV:
+	default:
+		fsid = (u64)huge_encode_dev(fhp->fh_dentry->d_sb->s_dev);
+		break;
+	}
+
+	return xdr_encode_hyper(p, fsid);
+}
+
+/*
+ * Encode the attributes in *stat: type, mode, nlink, uid, gid, size,
+ * bytes used, rdev major/minor, fsid, inode number, then atime, mtime
+ * and ctime.  Returns the pointer past the encoded attributes.
+ */
+static __be32 *
+encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
+	      struct kstat *stat)
+{
+	struct user_namespace *userns = nfsd_user_namespace(rqstp);
+	u64 size = (u64) stat->size;
+
+	/* a symlink is never reported longer than NFS3_MAXPATHLEN */
+	if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN)
+		size = (u64) NFS3_MAXPATHLEN;
+
+	*p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
+	*p++ = htonl((u32) (stat->mode & S_IALLUGO));
+	*p++ = htonl((u32) stat->nlink);
+	*p++ = htonl((u32) from_kuid_munged(userns, stat->uid));
+	*p++ = htonl((u32) from_kgid_munged(userns, stat->gid));
+	p = xdr_encode_hyper(p, size);
+	/* blocks are shifted left by 9 to give a byte count */
+	p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9);
+	*p++ = htonl((u32) MAJOR(stat->rdev));
+	*p++ = htonl((u32) MINOR(stat->rdev));
+	p = encode_fsid(p, fhp);
+	p = xdr_encode_hyper(p, stat->ino);
+	p = encode_time3(p, &stat->atime);
+	p = encode_time3(p, &stat->mtime);
+
+	return encode_time3(p, &stat->ctime);
+}
+
+/*
+ * Encode the post-operation attributes already saved in
+ * fhp->fh_post_attr, preceded by an "attributes follow" marker.
+ */
+static __be32 *
+encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
+{
+	*p = xdr_one;		/* attributes follow */
+	return encode_fattr3(rqstp, p + 1, fhp, &fhp->fh_post_attr);
+}
+
+/*
+ * Encode post-operation attributes.
+ * Attributes are fetched fresh with fh_getattr().  When the handle has no
+ * positive dentry (e.g. the call failed on a stale file handle) or the
+ * getattr fails, only a "no attributes" marker is emitted.
+ */
+static __be32 *
+encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
+{
+	struct dentry *dentry = fhp->fh_dentry;
+	struct kstat stat;
+
+	if (!dentry || !d_really_is_positive(dentry))
+		goto no_attrs;
+	if (fh_getattr(fhp, &stat))
+		goto no_attrs;
+
+	*p++ = xdr_one;		/* attributes follow */
+	lease_get_mtime(d_inode(dentry), &stat.mtime);
+	return encode_fattr3(rqstp, p, fhp, &stat);
+
+no_attrs:
+	*p++ = xdr_zero;
+	return p;
+}
+
+/* Non-static wrapper around encode_post_op_attr(), for the NFSv3 ACL code */
+__be32 *
+nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
+{
+	__be32 *next = encode_post_op_attr(rqstp, p, fhp);
+
+	return next;
+}
+
+/*
+ * Encode weak cache consistency data: optional pre-operation attributes
+ * (size, mtime, ctime) followed by post-operation attributes.
+ * Saved values are used only when post-op attributes were saved on a
+ * positive dentry; otherwise no pre-op attributes are sent and the
+ * post-op attributes are fetched by encode_post_op_attr().
+ */
+static __be32 *
+encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
+{
+	struct dentry *dentry = fhp->fh_dentry;
+
+	if (!dentry || !d_really_is_positive(dentry) || !fhp->fh_post_saved) {
+		/* no pre- or post-attrs */
+		*p++ = xdr_zero;
+		return encode_post_op_attr(rqstp, p, fhp);
+	}
+
+	if (!fhp->fh_pre_saved) {
+		*p++ = xdr_zero;
+	} else {
+		*p++ = xdr_one;
+		p = xdr_encode_hyper(p, (u64) fhp->fh_pre_size);
+		p = encode_time3(p, &fhp->fh_pre_mtime);
+		p = encode_time3(p, &fhp->fh_pre_ctime);
+	}
+
+	return encode_saved_post_attr(rqstp, p, fhp);
+}
+
+/*
+ * Record the pre-operation attributes (mtime, ctime, size and change
+ * attribute) in fhp for later use as wcc data.  Does nothing when they
+ * have already been saved.
+ */
+void fill_pre_wcc(struct svc_fh *fhp)
+{
+	struct inode *inode;
+	struct kstat stat;
+
+	if (fhp->fh_pre_saved)
+		return;
+
+	inode = d_inode(fhp->fh_dentry);
+	if (fh_getattr(fhp, &stat)) {
+		/* getattr failed: fall back to the values cached in the inode */
+		stat.mtime = inode->i_mtime;
+		stat.ctime = inode->i_ctime;
+		stat.size  = inode->i_size;
+	}
+
+	fhp->fh_pre_mtime  = stat.mtime;
+	fhp->fh_pre_ctime  = stat.ctime;
+	fhp->fh_pre_size   = stat.size;
+	fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
+	fhp->fh_pre_saved  = true;
+}
+
+/*
+ * Fill in the post_op attr for the wcc data.
+ *
+ * fhp->fh_post_attr is refreshed with fh_getattr() and fh_post_saved
+ * records whether that succeeded.  On failure the ctime is still taken
+ * from the inode.
+ *
+ * The change attribute is computed only after the getattr result has been
+ * handled, so that on failure it is derived from the fallback ctime
+ * rather than from whatever fh_post_attr held before the call.  This
+ * matches fill_pre_wcc(), which also fills in its fallback values before
+ * calling nfsd4_change_attribute().
+ */
+void fill_post_wcc(struct svc_fh *fhp)
+{
+	struct inode *inode = d_inode(fhp->fh_dentry);
+	__be32 err;
+
+	if (fhp->fh_post_saved)
+		printk("nfsd: inode locked twice during operation.\n");
+
+	err = fh_getattr(fhp, &fhp->fh_post_attr);
+	if (err) {
+		fhp->fh_post_saved = false;
+		/* Grab the ctime anyway - set_change_info might use it */
+		fhp->fh_post_attr.ctime = inode->i_ctime;
+	} else
+		fhp->fh_post_saved = true;
+
+	fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr,
+						     inode);
+}
+
+/*
+ * XDR decode functions
+ */
+/* Decoder for procedures that take no arguments: always succeeds. */
+int
+nfs3svc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
+{
+	const int decoded_ok = 1;
+
+	return decoded_ok;
+}
+
+/*
+ * Decode arguments consisting of a single file handle.
+ * Returns 0 if the handle is rejected, else the xdr_argsize_check() result.
+ */
+int
+nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd_fhandle *args = rqstp->rq_argp;
+	__be32 *end = decode_fh(p, &args->fh);
+
+	if (end == NULL)
+		return 0;
+	return xdr_argsize_check(rqstp, end);
+}
+
+/*
+ * Decode file handle, new attributes and the optional guard time.
+ * Only the seconds part of the guard time is kept.
+ * Returns 0 if the handle is rejected, else the xdr_argsize_check() result.
+ */
+int
+nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_sattrargs *args = rqstp->rq_argp;
+
+	p = decode_fh(p, &args->fh);
+	if (p == NULL)
+		return 0;
+	p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
+
+	args->check_guard = ntohl(*p++);
+	if (args->check_guard != 0) {
+		struct timespec64 guard;
+
+		p = decode_time3(p, &guard);
+		args->guardtime = guard.tv_sec;
+	}
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+/*
+ * Decode a directory file handle followed by a file name.
+ * Returns 0 if either is rejected, else the xdr_argsize_check() result.
+ */
+int
+nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_diropargs *args = rqstp->rq_argp;
+
+	p = decode_fh(p, &args->fh);
+	if (p == NULL)
+		return 0;
+
+	p = decode_filename(p, &args->name, &args->len);
+	if (p == NULL)
+		return 0;
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+/*
+ * Decode a file handle followed by the requested access word.
+ * Returns 0 if the handle is rejected, else the xdr_argsize_check() result.
+ */
+int
+nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_accessargs *args = rqstp->rq_argp;
+
+	p = decode_fh(p, &args->fh);
+	if (p == NULL)
+		return 0;
+
+	args->access = ntohl(p[0]);
+
+	return xdr_argsize_check(rqstp, p + 1);
+}
+
+/*
+ * Decode file handle, offset and count, then populate rqstp->rq_vec with
+ * one entry per page taken from rq_next_page, covering
+ * min(count, svc_max_payload()) bytes.  args->vlen is the number of
+ * entries set up.
+ * Returns 0 if the handle is rejected, else the xdr_argsize_check() result.
+ */
+int
+nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_readargs *args = rqstp->rq_argp;
+	u32 max_blocksize = svc_max_payload(rqstp);
+	unsigned int remaining;
+	int v;
+
+	p = decode_fh(p, &args->fh);
+	if (p == NULL)
+		return 0;
+	p = xdr_decode_hyper(p, &args->offset);
+
+	args->count = ntohl(*p++);
+	remaining = min(args->count, max_blocksize);
+
+	/* set up the kvec */
+	for (v = 0; remaining > 0; v++) {
+		struct page *page = *(rqstp->rq_next_page++);
+
+		rqstp->rq_vec[v].iov_base = page_address(page);
+		rqstp->rq_vec[v].iov_len = min_t(unsigned int, remaining,
+						 PAGE_SIZE);
+		remaining -= rqstp->rq_vec[v].iov_len;
+	}
+	args->vlen = v;
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+/*
+ * Decode the header of a write request (file handle, offset, count,
+ * stable flag, data length) and check that enough payload was received.
+ * On success args->first describes the part of the payload that sits in
+ * the head kvec.  Returns 1 on success, 0 on any failure.
+ */
+int
+nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_writeargs *args = rqstp->rq_argp;
+	struct kvec *head = rqstp->rq_arg.head;
+	struct kvec *tail = rqstp->rq_arg.tail;
+	u32 max_blocksize = svc_max_payload(rqstp);
+	unsigned int hdr_bytes, avail;
+
+	p = decode_fh(p, &args->fh);
+	if (p == NULL)
+		return 0;
+	p = xdr_decode_hyper(p, &args->offset);
+
+	args->count = ntohl(*p++);
+	args->stable = ntohl(*p++);
+	args->len = ntohl(*p++);
+
+	/* the fixed-size header must lie entirely inside the head kvec */
+	if ((void *)p > head->iov_base + head->iov_len)
+		return 0;
+
+	/* The count must equal the amount of data passed. */
+	if (args->count != args->len)
+		return 0;
+
+	/*
+	 * Compare the data length, rounded up to a whole number of XDR
+	 * words, against what was actually received.  With RPCSEC/GSS
+	 * (for example) the buffer can be padded, so more data than
+	 * needed is acceptable; less is not.
+	 */
+	hdr_bytes = (void*)p - head->iov_base;
+	avail = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len
+		- hdr_bytes;
+	if (avail < XDR_QUADLEN(args->len)*4)
+		return 0;
+
+	/* clamp over-sized writes to the maximum payload */
+	if (args->count > max_blocksize) {
+		args->count = max_blocksize;
+		args->len = max_blocksize;
+	}
+
+	args->first.iov_base = (void *)p;
+	args->first.iov_len = head->iov_len - hdr_bytes;
+	return 1;
+}
+
+/*
+ * Decode directory handle, file name and create mode.  UNCHECKED and
+ * GUARDED carry attributes; EXCLUSIVE carries a two-word verifier that is
+ * referenced in place.  Any other mode is rejected.
+ * Returns 0 on failure, else the xdr_argsize_check() result.
+ */
+int
+nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_createargs *args = rqstp->rq_argp;
+
+	p = decode_fh(p, &args->fh);
+	if (p == NULL)
+		return 0;
+	p = decode_filename(p, &args->name, &args->len);
+	if (p == NULL)
+		return 0;
+
+	args->createmode = ntohl(*p++);
+	switch (args->createmode) {
+	case NFS3_CREATE_UNCHECKED:
+	case NFS3_CREATE_GUARDED:
+		p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
+		break;
+	case NFS3_CREATE_EXCLUSIVE:
+		args->verf = p;
+		p += 2;
+		break;
+	default:
+		return 0;
+	}
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+/*
+ * Decode directory handle, new directory name and initial attributes.
+ * Returns 0 if handle or name is rejected, else the xdr_argsize_check()
+ * result.
+ */
+int
+nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_createargs *args = rqstp->rq_argp;
+
+	p = decode_fh(p, &args->fh);
+	if (p == NULL)
+		return 0;
+	p = decode_filename(p, &args->name, &args->len);
+	if (p == NULL)
+		return 0;
+
+	p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+/*
+ * Decode directory handle, link name, attributes and the target length.
+ * args->first describes what remains of the head kvec after the decoded
+ * header.  The request is accepted only when the remaining head bytes
+ * plus page and tail data cover the word-padded target length.
+ * Returns 1 on success, 0 on failure.
+ */
+int
+nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_symlinkargs *args = rqstp->rq_argp;
+	char *start = (char *)p;
+	size_t avail;
+
+	p = decode_fh(p, &args->ffh);
+	if (p == NULL)
+		return 0;
+	p = decode_filename(p, &args->fname, &args->flen);
+	if (p == NULL)
+		return 0;
+	p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
+
+	args->tlen = ntohl(*p++);
+
+	args->first.iov_base = p;
+	args->first.iov_len = rqstp->rq_arg.head[0].iov_len;
+	args->first.iov_len -= (char *)p - start;
+
+	avail = args->first.iov_len + rqstp->rq_arg.page_len +
+		rqstp->rq_arg.tail[0].iov_len;
+	if (avail < XDR_QUADLEN(args->tlen) << 2)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Decode directory handle, node name and file type.  Block, character,
+ * socket and fifo nodes carry attributes; block and character nodes are
+ * additionally followed by major and minor numbers.
+ * Returns 0 if handle or name is rejected, else the xdr_argsize_check()
+ * result.
+ */
+int
+nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_mknodargs *args = rqstp->rq_argp;
+
+	p = decode_fh(p, &args->fh);
+	if (p == NULL)
+		return 0;
+	p = decode_filename(p, &args->name, &args->len);
+	if (p == NULL)
+		return 0;
+
+	args->ftype = ntohl(*p++);
+
+	switch (args->ftype) {
+	case NF3BLK:
+	case NF3CHR:
+		p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
+		args->major = ntohl(*p++);
+		args->minor = ntohl(*p++);
+		break;
+	case NF3SOCK:
+	case NF3FIFO:
+		p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
+		break;
+	default:
+		break;
+	}
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+/*
+ * Decode source directory handle and name, then target directory handle
+ * and name.  Returns 0 as soon as any of the four is rejected, else the
+ * xdr_argsize_check() result.
+ */
+int
+nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_renameargs *args = rqstp->rq_argp;
+
+	p = decode_fh(p, &args->ffh);
+	if (p == NULL)
+		return 0;
+	p = decode_filename(p, &args->fname, &args->flen);
+	if (p == NULL)
+		return 0;
+	p = decode_fh(p, &args->tfh);
+	if (p == NULL)
+		return 0;
+	p = decode_filename(p, &args->tname, &args->tlen);
+	if (p == NULL)
+		return 0;
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+/*
+ * Decode the file handle and point args->buffer at the next page taken
+ * from rq_next_page.
+ * Returns 0 if the handle is rejected, else the xdr_argsize_check() result.
+ */
+int
+nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_readlinkargs *args = rqstp->rq_argp;
+	struct page *page;
+
+	p = decode_fh(p, &args->fh);
+	if (p == NULL)
+		return 0;
+
+	page = *(rqstp->rq_next_page++);
+	args->buffer = page_address(page);
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+/*
+ * Decode the handle of the existing file, the target directory handle
+ * and the new link name.  Returns 0 as soon as one of them is rejected,
+ * else the xdr_argsize_check() result.
+ */
+int
+nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_linkargs *args = rqstp->rq_argp;
+
+	p = decode_fh(p, &args->ffh);
+	if (p == NULL)
+		return 0;
+	p = decode_fh(p, &args->tfh);
+	if (p == NULL)
+		return 0;
+	p = decode_filename(p, &args->tname, &args->tlen);
+	if (p == NULL)
+		return 0;
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+/*
+ * Decode handle, cookie, verifier (referenced in place) and count.
+ * dircount is set to all ones and count is capped at svc_max_payload().
+ * One page per PAGE_SIZE of count is taken from rq_next_page; the first
+ * one becomes args->buffer if no buffer is set yet.
+ * Returns 0 if the handle is rejected, else the xdr_argsize_check() result.
+ */
+int
+nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_readdirargs *args = rqstp->rq_argp;
+	u32 max_blocksize = svc_max_payload(rqstp);
+	int left;
+
+	p = decode_fh(p, &args->fh);
+	if (p == NULL)
+		return 0;
+	p = xdr_decode_hyper(p, &args->cookie);
+	args->verf = p;
+	p += 2;
+	args->dircount = ~0;
+	args->count = ntohl(*p++);
+	args->count = min_t(u32, args->count, max_blocksize);
+
+	for (left = args->count; left > 0; left -= PAGE_SIZE) {
+		struct page *page = *(rqstp->rq_next_page++);
+
+		if (!args->buffer)
+			args->buffer = page_address(page);
+	}
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+/*
+ * Decode handle, cookie, verifier (referenced in place), dircount and
+ * count.  count is capped at svc_max_payload().  One page per PAGE_SIZE
+ * of count is taken from rq_next_page; the first one becomes
+ * args->buffer if no buffer is set yet.
+ * Returns 0 if the handle is rejected, else the xdr_argsize_check() result.
+ */
+int
+nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_readdirargs *args = rqstp->rq_argp;
+	u32 max_blocksize = svc_max_payload(rqstp);
+	int left;
+
+	p = decode_fh(p, &args->fh);
+	if (p == NULL)
+		return 0;
+	p = xdr_decode_hyper(p, &args->cookie);
+	args->verf = p;
+	p += 2;
+	args->dircount = ntohl(*p++);
+	args->count = ntohl(*p++);
+
+	args->count = min(args->count, max_blocksize);
+	for (left = args->count; left > 0; left -= PAGE_SIZE) {
+		struct page *page = *(rqstp->rq_next_page++);
+
+		if (!args->buffer)
+			args->buffer = page_address(page);
+	}
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+/*
+ * Decode file handle, 64-bit offset and count.
+ * Returns 0 if the handle is rejected, else the xdr_argsize_check() result.
+ */
+int
+nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_commitargs *args = rqstp->rq_argp;
+
+	p = decode_fh(p, &args->fh);
+	if (p == NULL)
+		return 0;
+
+	p = xdr_decode_hyper(p, &args->offset);
+	args->count = ntohl(p[0]);
+
+	return xdr_argsize_check(rqstp, p + 1);
+}
+
+/*
+ * XDR encode functions
+ */
+
+/* Encoder for replies with no body: only runs the result size check. */
+int
+nfs3svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
+{
+	int ok = xdr_ressize_check(rqstp, p);
+
+	return ok;
+}
+
+/*
+ * GETATTR
+ * Status word, followed on success by the file attributes.
+ */
+int
+nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_attrstat *resp = rqstp->rq_resp;
+
+	*p++ = resp->status;
+	if (resp->status != 0)
+		return xdr_ressize_check(rqstp, p);
+
+	lease_get_mtime(d_inode(resp->fh.fh_dentry), &resp->stat.mtime);
+	p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat);
+
+	return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * SETATTR, REMOVE, RMDIR
+ * Status word followed by wcc data, regardless of status.
+ */
+int
+nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_attrstat *resp = rqstp->rq_resp;
+
+	p[0] = resp->status;
+	p = encode_wcc_data(rqstp, p + 1, &resp->fh);
+
+	return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * LOOKUP
+ * Status word; on success the object's handle and post-op attributes;
+ * then, in every case, the directory's post-op attributes.
+ */
+int
+nfs3svc_encode_diropres(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_diropres *resp = rqstp->rq_resp;
+	int found = (resp->status == 0);
+
+	*p++ = resp->status;
+	if (found) {
+		p = encode_fh(p, &resp->fh);
+		p = encode_post_op_attr(rqstp, p, &resp->fh);
+	}
+	p = encode_post_op_attr(rqstp, p, &resp->dirfh);
+
+	return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * ACCESS
+ * Status word and post-op attributes; on success also the access word.
+ */
+int
+nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_accessres *resp = rqstp->rq_resp;
+
+	p[0] = resp->status;
+	p = encode_post_op_attr(rqstp, p + 1, &resp->fh);
+	if (resp->status != 0)
+		return xdr_ressize_check(rqstp, p);
+
+	*p++ = htonl(resp->access);
+	return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * READLINK
+ * Status word and post-op attributes.  On success the link length
+ * follows, rq_res.page_len is set to that length, and when the length is
+ * not a multiple of four a zeroed pad is placed in the tail kvec.
+ */
+int
+nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_readlinkres *resp = rqstp->rq_resp;
+	struct kvec *tail = &rqstp->rq_res.tail[0];
+
+	*p++ = resp->status;
+	p = encode_post_op_attr(rqstp, p, &resp->fh);
+	if (resp->status != 0)
+		return xdr_ressize_check(rqstp, p);
+
+	*p++ = htonl(resp->len);
+	xdr_ressize_check(rqstp, p);
+	rqstp->rq_res.page_len = resp->len;
+	if (resp->len & 3) {
+		/* need to pad the tail */
+		tail->iov_base = p;
+		*p = 0;
+		tail->iov_len = 4 - (resp->len&3);
+	}
+	return 1;
+}
+
+/*
+ * READ
+ * Status word and post-op attributes.  On success: count, eof flag and
+ * the opaque length (count again); rq_res.page_len is set to count, and
+ * when count is not a multiple of four a zeroed pad is placed in the
+ * tail kvec.
+ */
+int
+nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_readres *resp = rqstp->rq_resp;
+	struct kvec *tail = &rqstp->rq_res.tail[0];
+
+	*p++ = resp->status;
+	p = encode_post_op_attr(rqstp, p, &resp->fh);
+	if (resp->status != 0)
+		return xdr_ressize_check(rqstp, p);
+
+	p[0] = htonl(resp->count);
+	p[1] = htonl(resp->eof);
+	p[2] = htonl(resp->count);	/* xdr opaque count */
+	p += 3;
+	xdr_ressize_check(rqstp, p);
+
+	/* now update rqstp->rq_res to reflect data as well */
+	rqstp->rq_res.page_len = resp->count;
+	if (resp->count & 3) {
+		/* need to pad the tail */
+		tail->iov_base = p;
+		*p = 0;
+		tail->iov_len = 4 - (resp->count & 3);
+	}
+	return 1;
+}
+
+/*
+ * WRITE
+ * Status word and wcc data; on success also count, committed and the
+ * two-word verifier.
+ */
+int
+nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_writeres *resp = rqstp->rq_resp;
+
+	*p++ = resp->status;
+	p = encode_wcc_data(rqstp, p, &resp->fh);
+	if (resp->status != 0)
+		return xdr_ressize_check(rqstp, p);
+
+	p[0] = htonl(resp->count);
+	p[1] = htonl(resp->committed);
+	p[2] = resp->verf[0];
+	p[3] = resp->verf[1];
+
+	return xdr_ressize_check(rqstp, p + 4);
+}
+
+/*
+ * CREATE, MKDIR, SYMLINK, MKNOD
+ * Status word; on success a "handle follows" marker, the new object's
+ * handle and its post-op attributes; then, in every case, the wcc data
+ * of the directory.
+ */
+int
+nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_diropres *resp = rqstp->rq_resp;
+	int created = (resp->status == 0);
+
+	*p++ = resp->status;
+	if (created) {
+		*p++ = xdr_one;
+		p = encode_fh(p, &resp->fh);
+		p = encode_post_op_attr(rqstp, p, &resp->fh);
+	}
+	p = encode_wcc_data(rqstp, p, &resp->dirfh);
+
+	return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * RENAME
+ * Status word followed by the wcc data of the source and then of the
+ * target handle.
+ */
+int
+nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_renameres *resp = rqstp->rq_resp;
+
+	p[0] = resp->status;
+	p = encode_wcc_data(rqstp, p + 1, &resp->ffh);
+	p = encode_wcc_data(rqstp, p, &resp->tfh);
+
+	return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * LINK
+ * Status word, post-op attributes of resp->fh, then wcc data of
+ * resp->tfh.
+ */
+int
+nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_linkres *resp = rqstp->rq_resp;
+
+	p[0] = resp->status;
+	p = encode_post_op_attr(rqstp, p + 1, &resp->fh);
+	p = encode_wcc_data(rqstp, p, &resp->tfh);
+
+	return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * READDIR
+ * Status word and post-op attributes.  On success the 8-byte resp->verf
+ * follows, rq_res.page_len is set to resp->count words, and a two-word
+ * trailer ("no more entries" plus the eof flag) is placed in the tail
+ * kvec right behind the head data - unless the head has no room for it.
+ */
+int
+nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_readdirres *resp = rqstp->rq_resp;
+	struct kvec *tail = &rqstp->rq_res.tail[0];
+
+	*p++ = resp->status;
+	p = encode_post_op_attr(rqstp, p, &resp->fh);
+	if (resp->status != 0)
+		return xdr_ressize_check(rqstp, p);
+
+	memcpy(p, resp->verf, 8);
+	p += 2;
+	xdr_ressize_check(rqstp, p);
+	if (rqstp->rq_res.head[0].iov_len + (2<<2) > PAGE_SIZE)
+		return 1;	/* No room for trailer */
+	rqstp->rq_res.page_len = (resp->count) << 2;
+
+	/* add the 'tail' to the end of the 'head' page - page 0. */
+	tail->iov_base = p;
+	p[0] = 0;		/* no more entries */
+	p[1] = htonl(resp->common.err == nfserr_eof);
+	tail->iov_len = 2<<2;
+	return 1;
+}
+
+/*
+ * Encode the fixed part of a directory entry: "entry present" marker,
+ * file id, name, and a 64-bit cookie slot initialised to NFS_OFFSET_MAX.
+ * The address of the cookie slot is remembered in cd->offset.
+ */
+static __be32 *
+encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
+		     int namlen, u64 ino)
+{
+	*p++ = xdr_one;				/* mark entry present */
+	p = xdr_encode_hyper(p, ino);		/* file id */
+	p = xdr_encode_array(p, name, namlen);	/* name length & name */
+
+	cd->offset = p;				/* remember pointer */
+
+	/* offset of next entry */
+	return xdr_encode_hyper(p, NFS_OFFSET_MAX);
+}
+
+/*
+ * Build a file handle in fhp for the directory entry "name" found in the
+ * directory described by cd->fh.
+ *
+ * Returns the result of fh_compose() when a handle is built, otherwise
+ * nfserr_noent (lookup failure, ".." at the filesystem or export root,
+ * a mountpoint, or an inode number that differs from ino).
+ * Every dentry reference obtained here is dropped again before returning.
+ */
+static __be32
+compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
+ const char *name, int namlen, u64 ino)
+{
+ struct svc_export *exp;
+ struct dentry *dparent, *dchild;
+ __be32 rv = nfserr_noent;
+
+ dparent = cd->fh.fh_dentry;
+ exp = cd->fh.fh_export;
+
+ if (isdotent(name, namlen)) {
+ if (namlen == 2) {
+ /* two-character dot entry: take a reference on the parent */
+ dchild = dget_parent(dparent);
+ /*
+ * Don't return filehandle for ".." if we're at
+ * the filesystem or export root:
+ */
+ if (dchild == dparent)
+ goto out;
+ if (dparent == exp->ex_path.dentry)
+ goto out;
+ } else
+ /* one-character dot entry: the directory itself */
+ dchild = dget(dparent);
+ } else
+ dchild = lookup_positive_unlocked(name, dparent, namlen);
+ /* no reference is held when the lookup returned an error pointer */
+ if (IS_ERR(dchild))
+ return rv;
+ if (d_mountpoint(dchild))
+ goto out;
+ /* the dentry must still refer to the inode number that was passed in */
+ if (dchild->d_inode->i_ino != ino)
+ goto out;
+ rv = fh_compose(fhp, exp, dchild, &cd->fh);
+out:
+ dput(dchild);
+ return rv;
+}
+
+/*
+ * Encode the READDIRPLUS extras for one entry: post-op attributes, a
+ * "handle follows" marker and the file handle.  When no handle can be
+ * composed, two zero words are written instead.
+ * cd->scratch is used for the handle and is released before returning.
+ */
+static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen, u64 ino)
+{
+	struct svc_fh *fh = &cd->scratch;
+
+	fh_init(fh, NFS3_FHSIZE);
+	if (compose_entry_fh(cd, fh, name, namlen, ino)) {
+		*p++ = 0;
+		*p++ = 0;
+	} else {
+		p = encode_post_op_attr(cd->rqstp, p, fh);
+		*p++ = xdr_one;	/* yes, a file handle follows */
+		p = encode_fh(p, fh);
+	}
+	fh_put(fh);
+
+	return p;
+}
+
+/*
+ * Encode a directory entry. This one works for both normal readdir
+ * and readdirplus.
+ * The normal readdir reply requires 2 (fileid) + 1 (stringlen)
+ * + string + 2 (cookie) + 1 (next) words, i.e. 6 + strlen.
+ *
+ * The readdirplus baggage is 1+21 words for post_op_attr, plus the
+ * file handle.
+ *
+ * Returns 0 after encoding the entry (cd->common.err = nfs_ok), or
+ * -EINVAL with cd->common.err = nfserr_toosmall when the entry does not
+ * fit.  On success cd->buffer is advanced past the entry and cd->buflen
+ * is reduced by the number of words actually written.
+ */
+
+#define NFS3_ENTRY_BAGGAGE (2 + 1 + 2 + 1)
+#define NFS3_ENTRYPLUS_BAGGAGE (1 + 21 + 1 + (NFS3_FHSIZE >> 2))
+static int
+encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type, int plus)
+{
+ struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres,
+ common);
+ __be32 *p = cd->buffer;
+ caddr_t curr_page_addr = NULL;
+ struct page ** page;
+ int slen; /* string (name) length */
+ int elen; /* estimated entry length in words */
+ int num_entry_words = 0; /* actual number of words */
+
+ /*
+ * cd->offset, when set, is the cookie slot left by the previous call
+ * (see encode_entry_baggage()); store this entry's offset there now.
+ */
+ if (cd->offset) {
+ u64 offset64 = offset;
+
+ if (unlikely(cd->offset1)) {
+ /* we ended up with offset on a page boundary */
+ *cd->offset = htonl(offset64 >> 32);
+ *cd->offset1 = htonl(offset64 & 0xffffffff);
+ cd->offset1 = NULL;
+ } else {
+ xdr_encode_hyper(cd->offset, offset64);
+ }
+ cd->offset = NULL;
+ }
+
+ /*
+ dprintk("encode_entry(%.*s @%ld%s)\n",
+ namlen, name, (long) offset, plus? " plus" : "");
+ */
+
+ /* truncate filename if too long */
+ namlen = min(namlen, NFS3_MAXNAMLEN);
+
+ slen = XDR_QUADLEN(namlen);
+ elen = slen + NFS3_ENTRY_BAGGAGE
+ + (plus? NFS3_ENTRYPLUS_BAGGAGE : 0);
+
+ /* refuse the entry when the estimate exceeds the remaining words */
+ if (cd->buflen < elen) {
+ cd->common.err = nfserr_toosmall;
+ return -EINVAL;
+ }
+
+ /* determine which page in rq_respages[] we are currently filling */
+ for (page = cd->rqstp->rq_respages + 1;
+ page < cd->rqstp->rq_next_page; page++) {
+ curr_page_addr = page_address(*page);
+
+ if (((caddr_t)cd->buffer >= curr_page_addr) &&
+ ((caddr_t)cd->buffer < curr_page_addr + PAGE_SIZE))
+ break;
+ }
+
+ if ((caddr_t)(cd->buffer + elen) < (curr_page_addr + PAGE_SIZE)) {
+ /* encode entry in current page */
+
+ p = encode_entry_baggage(cd, p, name, namlen, ino);
+
+ if (plus)
+ p = encode_entryplus_baggage(cd, p, name, namlen, ino);
+ num_entry_words = p - cd->buffer;
+ } else if (*(page+1) != NULL) {
+ /* temporarily encode entry into next page, then move back to
+ * current and next page in rq_respages[] */
+ __be32 *p1, *tmp;
+ int len1, len2;
+
+ /* grab next page for temporary storage of entry */
+ p1 = tmp = page_address(*(page+1));
+
+ p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
+
+ if (plus)
+ p1 = encode_entryplus_baggage(cd, p1, name, namlen, ino);
+
+ /* determine entry word length and lengths to go in pages */
+ num_entry_words = p1 - tmp;
+ /* len1: bytes still free in the current page */
+ len1 = curr_page_addr + PAGE_SIZE - (caddr_t)cd->buffer;
+ if ((num_entry_words << 2) < len1) {
+ /* the actual number of words in the entry is less
+ * than elen and can still fit in the current page
+ */
+ memmove(p, tmp, num_entry_words << 2);
+ p += num_entry_words;
+
+ /* update offset */
+ cd->offset = cd->buffer + (cd->offset - tmp);
+ } else {
+ /* byte position of the cookie slot inside the temporary copy */
+ unsigned int offset_r = (cd->offset - tmp) << 2;
+
+ /* update pointer to offset location.
+ * This is a 64bit quantity, so we need to
+ * deal with 3 cases:
+ * - entirely in first page
+ * - entirely in second page
+ * - 4 bytes in each page
+ */
+ if (offset_r + 8 <= len1) {
+ cd->offset = p + (cd->offset - tmp);
+ } else if (offset_r >= len1) {
+ cd->offset -= len1 >> 2;
+ } else {
+ /* sitting on the fence */
+ BUG_ON(offset_r != len1 - 4);
+ cd->offset = p + (cd->offset - tmp);
+ cd->offset1 = tmp;
+ }
+
+ /* len2: bytes of the entry that spill into the next page */
+ len2 = (num_entry_words << 2) - len1;
+
+ /* move from temp page to current and next pages */
+ memmove(p, tmp, len1);
+ memmove(tmp, (caddr_t)tmp+len1, len2);
+
+ p = tmp + (len2 >> 2);
+ }
+ }
+ else {
+ /* entry does not fit in this page and there is no following page */
+ cd->common.err = nfserr_toosmall;
+ return -EINVAL;
+ }
+
+ cd->buflen -= num_entry_words;
+ cd->buffer = p;
+ cd->common.err = nfs_ok;
+ return 0;
+
+}
+
+/* Entry callback for plain READDIR: encode_entry() with plus == 0. */
+int
+nfs3svc_encode_entry(void *cd, const char *name,
+		     int namlen, loff_t offset, u64 ino, unsigned int d_type)
+{
+	const int plus = 0;
+
+	return encode_entry(cd, name, namlen, offset, ino, d_type, plus);
+}
+
+/* Entry callback for READDIRPLUS: encode_entry() with plus == 1. */
+int
+nfs3svc_encode_entry_plus(void *cd, const char *name,
+			  int namlen, loff_t offset, u64 ino,
+			  unsigned int d_type)
+{
+	const int plus = 1;
+
+	return encode_entry(cd, name, namlen, offset, ino, d_type, plus);
+}
+
+/*
+ * FSSTAT
+ * Status word and an empty post_op_attr; on success six 64-bit values
+ * (byte counts are block counts multiplied by f_bsize) followed by
+ * invarsec.
+ */
+int
+nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_fsstatres *resp = rqstp->rq_resp;
+	struct kstatfs *st = &resp->stats;
+	u64 blksz = st->f_bsize;
+
+	*p++ = resp->status;
+	*p++ = xdr_zero;	/* no post_op_attr */
+	if (resp->status != 0)
+		return xdr_ressize_check(rqstp, p);
+
+	p = xdr_encode_hyper(p, blksz * st->f_blocks);	/* total bytes */
+	p = xdr_encode_hyper(p, blksz * st->f_bfree);	/* free bytes */
+	p = xdr_encode_hyper(p, blksz * st->f_bavail);	/* user available bytes */
+	p = xdr_encode_hyper(p, st->f_files);		/* total inodes */
+	p = xdr_encode_hyper(p, st->f_ffree);		/* free inodes */
+	p = xdr_encode_hyper(p, st->f_ffree);		/* user available inodes */
+	*p++ = htonl(resp->invarsec);			/* mean unchanged time */
+
+	return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * FSINFO
+ * Status word and an empty post_op_attr; on success the read/write/
+ * readdir transfer size hints, the maximum file size, a constant
+ * (1, 0) pair and the properties word.
+ */
+int
+nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_fsinfores *resp = rqstp->rq_resp;
+
+	*p++ = resp->status;
+	*p++ = xdr_zero;	/* no post_op_attr */
+	if (resp->status != 0)
+		return xdr_ressize_check(rqstp, p);
+
+	p[0] = htonl(resp->f_rtmax);
+	p[1] = htonl(resp->f_rtpref);
+	p[2] = htonl(resp->f_rtmult);
+	p[3] = htonl(resp->f_wtmax);
+	p[4] = htonl(resp->f_wtpref);
+	p[5] = htonl(resp->f_wtmult);
+	p[6] = htonl(resp->f_dtpref);
+	p = xdr_encode_hyper(p + 7, resp->f_maxfilesize);
+	/* NOTE(review): presumably time_delta = 1 s, 0 ns (RFC 1813) - confirm */
+	p[0] = xdr_one;
+	p[1] = xdr_zero;
+	p[2] = htonl(resp->f_properties);
+
+	return xdr_ressize_check(rqstp, p + 3);
+}
+
+/*
+ * PATHCONF
+ * Status word and an empty post_op_attr; on success the six pathconf
+ * values in the order link_max, name_max, no_trunc, chown_restricted,
+ * case_insensitive, case_preserving.
+ */
+int
+nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_pathconfres *resp = rqstp->rq_resp;
+
+	*p++ = resp->status;
+	*p++ = xdr_zero;	/* no post_op_attr */
+	if (resp->status != 0)
+		return xdr_ressize_check(rqstp, p);
+
+	p[0] = htonl(resp->p_link_max);
+	p[1] = htonl(resp->p_name_max);
+	p[2] = htonl(resp->p_no_trunc);
+	p[3] = htonl(resp->p_chown_restricted);
+	p[4] = htonl(resp->p_case_insensitive);
+	p[5] = htonl(resp->p_case_preserving);
+
+	return xdr_ressize_check(rqstp, p + 6);
+}
+
+/*
+ * COMMIT
+ * Status word and wcc data; on success also the two-word write verifier.
+ */
+int
+nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd3_commitres *resp = rqstp->rq_resp;
+
+	*p++ = resp->status;
+	p = encode_wcc_data(rqstp, p, &resp->fh);
+	if (resp->status != 0)
+		return xdr_ressize_check(rqstp, p);
+
+	/* Write verifier */
+	p[0] = resp->verf[0];
+	p[1] = resp->verf[1];
+
+	return xdr_ressize_check(rqstp, p + 2);
+}
+
+/*
+ * XDR release functions
+ */
+/* Release the single file handle held in the result structure. */
+void
+nfs3svc_release_fhandle(struct svc_rqst *rqstp)
+{
+	struct nfsd3_attrstat *resp = rqstp->rq_resp;
+	struct svc_fh *fh = &resp->fh;
+
+	fh_put(fh);
+}
+
+/* Release both file handles held in the result structure, fh1 first. */
+void
+nfs3svc_release_fhandle2(struct svc_rqst *rqstp)
+{
+	struct nfsd3_fhandle_pair *resp = rqstp->rq_resp;
+	struct svc_fh *first = &resp->fh1;
+	struct svc_fh *second = &resp->fh2;
+
+	fh_put(first);
+	fh_put(second);
+}
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
new file mode 100644
index 000000000..71292a0d6
--- /dev/null
+++ b/fs/nfsd/nfs4acl.c
@@ -0,0 +1,884 @@
+/*
+ * Common NFSv4 ACL handling code.
+ *
+ * Copyright (c) 2002, 2003 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Marius Aamodt Eriksen <marius@umich.edu>
+ * Jeff Sedlak <jsedlak@umich.edu>
+ * J. Bruce Fields <bfields@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/posix_acl.h>
+
+#include "nfsfh.h"
+#include "nfsd.h"
+#include "acl.h"
+#include "vfs.h"
+
+/*
+ * Conversion flags carried in the "flags" argument of the helpers below:
+ *  NFS4_ACL_TYPE_DEFAULT - the POSIX ACL being converted is a default ACL
+ *  NFS4_ACL_DIR          - the object is a directory
+ *  NFS4_ACL_OWNER        - the ACE being built is for the file owner
+ */
+#define NFS4_ACL_TYPE_DEFAULT 0x01
+#define NFS4_ACL_DIR 0x02
+#define NFS4_ACL_OWNER 0x04
+
+/* mode bit translations: */
+#define NFS4_READ_MODE (NFS4_ACE_READ_DATA)
+#define NFS4_WRITE_MODE (NFS4_ACE_WRITE_DATA | NFS4_ACE_APPEND_DATA)
+#define NFS4_EXECUTE_MODE NFS4_ACE_EXECUTE
+/* bits that mask_from_posix() grants in every ALLOW ACE it builds */
+#define NFS4_ANYONE_MODE (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL | NFS4_ACE_SYNCHRONIZE)
+/* extra bits that mask_from_posix() grants when NFS4_ACL_OWNER is set */
+#define NFS4_OWNER_MODE (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL)
+
+/* flags used to simulate posix default ACLs */
+#define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \
+ | NFS4_ACE_DIRECTORY_INHERIT_ACE)
+
+/* ACE flags accepted by nfs4_acl_nfsv4_to_posix(); any other flag is rejected */
+#define NFS4_SUPPORTED_FLAGS (NFS4_INHERITANCE_FLAGS \
+ | NFS4_ACE_INHERIT_ONLY_ACE \
+ | NFS4_ACE_IDENTIFIER_GROUP)
+
+/*
+ * Translate POSIX permission bits into the access mask of an NFSv4
+ * ALLOW ACE.
+ *
+ * @perm:  POSIX permission bits (ACL_READ, ACL_WRITE, ACL_EXECUTE).
+ * @flags: NFS4_ACL_* conversion flags.  NFS4_ACL_OWNER adds the
+ *         owner-only bits; NFS4_ACL_DIR makes write permission also
+ *         grant NFS4_ACE_DELETE_CHILD.
+ *
+ * Every mask includes NFS4_ANYONE_MODE.  The accumulator is a u32, the
+ * same type as the return value and as deny_mask_from_posix() uses, so
+ * the bitmask never passes through a signed type.
+ */
+static u32
+mask_from_posix(unsigned short perm, unsigned int flags)
+{
+	u32 mask = NFS4_ANYONE_MODE;
+
+	if (flags & NFS4_ACL_OWNER)
+		mask |= NFS4_OWNER_MODE;
+	if (perm & ACL_READ)
+		mask |= NFS4_READ_MODE;
+	if (perm & ACL_WRITE)
+		mask |= NFS4_WRITE_MODE;
+	if ((perm & ACL_WRITE) && (flags & NFS4_ACL_DIR))
+		mask |= NFS4_ACE_DELETE_CHILD;
+	if (perm & ACL_EXECUTE)
+		mask |= NFS4_EXECUTE_MODE;
+	return mask;
+}
+
+/*
+ * Translate POSIX permission bits into the access mask of an NFSv4 DENY
+ * ACE.  Unlike the ALLOW translation, no unconditional bits are added.
+ * On directories (NFS4_ACL_DIR) denying write also denies DELETE_CHILD.
+ */
+static u32
+deny_mask_from_posix(unsigned short perm, u32 flags)
+{
+	u32 write_bits = NFS4_WRITE_MODE;
+	u32 denied = 0;
+
+	if (flags & NFS4_ACL_DIR)
+		write_bits |= NFS4_ACE_DELETE_CHILD;
+
+	if (perm & ACL_READ)
+		denied |= NFS4_READ_MODE;
+	if (perm & ACL_WRITE)
+		denied |= write_bits;
+	if (perm & ACL_EXECUTE)
+		denied |= NFS4_EXECUTE_MODE;
+
+	return denied;
+}
+
+/* XXX: modify functions to return NFS errors; they're only ever
+ * used by nfs code, after all.... */
+
+/* We only map from NFSv4 to POSIX ACLs when setting ACLs, when we err on the
+ * side of being more restrictive, so the mode bit mapping below is
+ * pessimistic. An optimistic version would be needed to handle DENY's,
+ * but we expect to coalesce all ALLOWs and DENYs before mapping to mode
+ * bits. */
+
+/*
+ * Pessimistic NFSv4-mask to POSIX-mode translation: a POSIX bit is set in
+ * *mode only when every NFSv4 bit that maps to it is present in @perm.
+ * For directories (NFS4_ACL_DIR) write additionally needs DELETE_CHILD.
+ */
+static void
+low_mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags)
+{
+	u32 need_write = (flags & NFS4_ACL_DIR) ?
+			 (NFS4_WRITE_MODE | NFS4_ACE_DELETE_CHILD) :
+			 NFS4_WRITE_MODE;
+	unsigned short bits = 0;
+
+	if ((perm & NFS4_READ_MODE) == NFS4_READ_MODE)
+		bits |= ACL_READ;
+	if ((perm & need_write) == need_write)
+		bits |= ACL_WRITE;
+	if ((perm & NFS4_EXECUTE_MODE) == NFS4_EXECUTE_MODE)
+		bits |= ACL_EXECUTE;
+
+	*mode = bits;
+}
+
+static short ace2type(struct nfs4_ace *);
+static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *,
+ unsigned int);
+
+/*
+ * Build an NFSv4 ACL for @dentry from the inode's POSIX ACLs.
+ *
+ * The access ACL is fetched with get_acl(); if the inode has none, one is
+ * synthesized from the mode bits.  For directories the default ACL, when
+ * present, is converted as well and appended with inheritance flags.
+ *
+ * On success returns 0 and stores a kmalloc()ed nfs4_acl in *acl.  On
+ * failure returns a negative errno; *acl is left untouched unless the
+ * allocation itself failed, in which case it is NULL.  @rqstp is not
+ * referenced in this function body.
+ */
+int
+nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
+ struct nfs4_acl **acl)
+{
+ struct inode *inode = d_inode(dentry);
+ int error = 0;
+ struct posix_acl *pacl = NULL, *dpacl = NULL;
+ unsigned int flags = 0;
+ int size = 0;
+
+ pacl = get_acl(inode, ACL_TYPE_ACCESS);
+ if (!pacl)
+ pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+
+ /* Covers an error from either get_acl() or posix_acl_from_mode(). */
+ if (IS_ERR(pacl))
+ return PTR_ERR(pacl);
+
+ /* allocate for worst case: one (deny, allow) pair each: */
+ size += 2 * pacl->a_count;
+
+ if (S_ISDIR(inode->i_mode)) {
+ flags = NFS4_ACL_DIR;
+ dpacl = get_acl(inode, ACL_TYPE_DEFAULT);
+ if (IS_ERR(dpacl)) {
+ error = PTR_ERR(dpacl);
+ /* dpacl holds an error pointer here, so skip its release. */
+ goto rel_pacl;
+ }
+
+ if (dpacl)
+ size += 2 * dpacl->a_count;
+ }
+
+ *acl = kmalloc(nfs4_acl_bytes(size), GFP_KERNEL);
+ if (*acl == NULL) {
+ error = -ENOMEM;
+ goto out;
+ }
+ (*acl)->naces = 0;
+
+ /* Access ACEs first, then (for directories) the inherit-only ACEs. */
+ _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT);
+
+ if (dpacl)
+ _posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT);
+
+out:
+ posix_acl_release(dpacl);
+rel_pacl:
+ posix_acl_release(pacl);
+ return error;
+}
+
+/*
+ * Per-class permission summary of a POSIX ACL, filled in by
+ * summarize_posix_acl().  users, group and groups are stored already
+ * ANDed with the ACL mask.
+ */
+struct posix_acl_summary {
+ unsigned short owner; /* ACL_USER_OBJ permissions */
+ unsigned short users; /* union of all ACL_USER permissions */
+ unsigned short group; /* ACL_GROUP_OBJ permissions */
+ unsigned short groups; /* union of all ACL_GROUP permissions */
+ unsigned short other; /* ACL_OTHER permissions */
+ unsigned short mask; /* ACL_MASK permissions; 07 when no mask entry */
+};
+
+/*
+ * Collapse @acl into one permission word per entry class, stored in @pas.
+ *
+ * Named users and named groups are OR-ed together.  If the ACL has no
+ * ACL_MASK entry the mask defaults to 07.  On return users, group and
+ * groups hold effective permissions, i.e. already ANDed with the mask.
+ */
+static void
+summarize_posix_acl(struct posix_acl *acl, struct posix_acl_summary *pas)
+{
+	struct posix_acl_entry *entry, *end;
+
+	/*
+	 * A validated ACL sets every field except users and groups in the
+	 * loop below; zero everything anyway so no field is left undefined.
+	 */
+	*pas = (struct posix_acl_summary){ .mask = 07 };
+
+	end = &acl->a_entries[acl->a_count];
+
+	FOREACH_ACL_ENTRY(entry, acl, end) {
+		unsigned short perm = entry->e_perm;
+
+		if (entry->e_tag == ACL_USER_OBJ)
+			pas->owner = perm;
+		else if (entry->e_tag == ACL_GROUP_OBJ)
+			pas->group = perm;
+		else if (entry->e_tag == ACL_USER)
+			pas->users |= perm;
+		else if (entry->e_tag == ACL_GROUP)
+			pas->groups |= perm;
+		else if (entry->e_tag == ACL_OTHER)
+			pas->other = perm;
+		else if (entry->e_tag == ACL_MASK)
+			pas->mask = perm;
+	}
+
+	/* Only effective permissions matter from here on. */
+	pas->users &= pas->mask;
+	pas->group &= pas->mask;
+	pas->groups &= pas->mask;
+}
+
+/*
+ * Append to @acl the NFSv4 ACEs equivalent to the POSIX ACL @pacl.
+ *
+ * We assume the acl has been verified with posix_acl_valid.  The code
+ * walks pacl->a_entries in order and relies on that layout: USER_OBJ,
+ * any USER entries, GROUP_OBJ, any GROUP entries, an optional MASK,
+ * then OTHER.
+ *
+ * ACEs are emitted in this order:
+ *  - owner: optional DENY, then ALLOW
+ *  - each named user: optional DENY, then ALLOW
+ *  - owning group ALLOW, then one ALLOW per named group
+ *  - owning group optional DENY, then optional DENY per named group
+ *  - EVERYONE@ ALLOW
+ *
+ * With NFS4_ACL_TYPE_DEFAULT in @flags every ACE is marked inherit-only
+ * with both inheritance flags.  No bounds check is made on acl->aces;
+ * the caller must have allocated room (nfsd4_get_nfs4_acl() reserves two
+ * ACEs per POSIX entry).
+ */
+static void
+_posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
+ unsigned int flags)
+{
+ struct posix_acl_entry *pa, *group_owner_entry;
+ struct nfs4_ace *ace;
+ struct posix_acl_summary pas;
+ unsigned short deny;
+ int eflag = ((flags & NFS4_ACL_TYPE_DEFAULT) ?
+ NFS4_INHERITANCE_FLAGS | NFS4_ACE_INHERIT_ONLY_ACE : 0);
+
+ BUG_ON(pacl->a_count < 3);
+ summarize_posix_acl(pacl, &pas);
+
+ pa = pacl->a_entries;
+ /* Append after whatever ACEs the caller has already stored. */
+ ace = acl->aces + acl->naces;
+
+ /* We could deny everything not granted by the owner: */
+ deny = ~pas.owner;
+ /*
+ * but it is equivalent (and simpler) to deny only what is not
+ * granted by later entries:
+ */
+ deny &= pas.users | pas.group | pas.groups | pas.other;
+ if (deny) {
+ ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
+ ace->flag = eflag;
+ ace->access_mask = deny_mask_from_posix(deny, flags);
+ ace->whotype = NFS4_ACL_WHO_OWNER;
+ ace++;
+ acl->naces++;
+ }
+
+ ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE;
+ ace->flag = eflag;
+ ace->access_mask = mask_from_posix(pa->e_perm, flags | NFS4_ACL_OWNER);
+ ace->whotype = NFS4_ACL_WHO_OWNER;
+ ace++;
+ acl->naces++;
+ pa++;
+
+ /* Named users: deny what later (group/other) entries would grant. */
+ while (pa->e_tag == ACL_USER) {
+ deny = ~(pa->e_perm & pas.mask);
+ deny &= pas.groups | pas.group | pas.other;
+ if (deny) {
+ ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
+ ace->flag = eflag;
+ ace->access_mask = deny_mask_from_posix(deny, flags);
+ ace->whotype = NFS4_ACL_WHO_NAMED;
+ ace->who_uid = pa->e_uid;
+ ace++;
+ acl->naces++;
+ }
+ ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE;
+ ace->flag = eflag;
+ ace->access_mask = mask_from_posix(pa->e_perm & pas.mask,
+ flags);
+ ace->whotype = NFS4_ACL_WHO_NAMED;
+ ace->who_uid = pa->e_uid;
+ ace++;
+ acl->naces++;
+ pa++;
+ }
+
+ /* In the case of groups, we apply allow ACEs first, then deny ACEs,
+ * since a user can be in more than one group. */
+
+ /* allow ACEs */
+
+ /* Remember the GROUP_OBJ position so the deny pass can rewind to it. */
+ group_owner_entry = pa;
+
+ ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE;
+ ace->flag = eflag;
+ ace->access_mask = mask_from_posix(pas.group, flags);
+ ace->whotype = NFS4_ACL_WHO_GROUP;
+ ace++;
+ acl->naces++;
+ pa++;
+
+ while (pa->e_tag == ACL_GROUP) {
+ ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE;
+ ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP;
+ ace->access_mask = mask_from_posix(pa->e_perm & pas.mask,
+ flags);
+ ace->whotype = NFS4_ACL_WHO_NAMED;
+ ace->who_gid = pa->e_gid;
+ ace++;
+ acl->naces++;
+ pa++;
+ }
+
+ /* deny ACEs */
+
+ pa = group_owner_entry;
+
+ deny = ~pas.group & pas.other;
+ if (deny) {
+ ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
+ ace->flag = eflag;
+ ace->access_mask = deny_mask_from_posix(deny, flags);
+ ace->whotype = NFS4_ACL_WHO_GROUP;
+ ace++;
+ acl->naces++;
+ }
+ pa++;
+
+ while (pa->e_tag == ACL_GROUP) {
+ deny = ~(pa->e_perm & pas.mask);
+ deny &= pas.other;
+ if (deny) {
+ ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
+ ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP;
+ ace->access_mask = deny_mask_from_posix(deny, flags);
+ ace->whotype = NFS4_ACL_WHO_NAMED;
+ ace->who_gid = pa->e_gid;
+ ace++;
+ acl->naces++;
+ }
+ pa++;
+ }
+
+ /* Skip the mask entry if present; pa then points at ACL_OTHER. */
+ if (pa->e_tag == ACL_MASK)
+ pa++;
+ ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE;
+ ace->flag = eflag;
+ ace->access_mask = mask_from_posix(pa->e_perm, flags);
+ ace->whotype = NFS4_ACL_WHO_EVERYONE;
+ acl->naces++;
+}
+
+/*
+ * Ordering predicate used when sorting POSIX ACL entries: true when
+ * @pace1 must come after @pace2.  Entries with different tags order by
+ * tag value; equal USER or GROUP tags order by uid or gid; any other
+ * equal tags are never "greater".
+ */
+static bool
+pace_gt(struct posix_acl_entry *pace1, struct posix_acl_entry *pace2)
+{
+	if (pace1->e_tag == pace2->e_tag) {
+		switch (pace1->e_tag) {
+		case ACL_USER:
+			return uid_gt(pace1->e_uid, pace2->e_uid);
+		case ACL_GROUP:
+			return gid_gt(pace1->e_gid, pace2->e_gid);
+		default:
+			return false;
+		}
+	}
+	return pace1->e_tag > pace2->e_tag;
+}
+
+/*
+ * Sort pacl->a_entries[start..end] (both inclusive) in place according to
+ * pace_gt().  A bubble sort is used: it works in place and ACLs are not
+ * expected to be long enough to justify anything more elaborate.
+ */
+static void
+sort_pacl_range(struct posix_acl *pacl, int start, int end)
+{
+	bool swapped;
+	int k;
+
+	do {
+		swapped = false;
+		for (k = start; k < end; k++) {
+			struct posix_acl_entry *lo = &pacl->a_entries[k];
+			struct posix_acl_entry *hi = &pacl->a_entries[k + 1];
+
+			if (!pace_gt(lo, hi))
+				continue;
+			swap(*lo, *hi);
+			swapped = true;
+		}
+	} while (swapped);
+}
+
+/*
+ * Put the named-user and named-group runs of @pacl into uid/gid order,
+ * as posix_acl_valid requires.  A NULL ACL or one with four or fewer
+ * entries has no named users or groups and is left alone.
+ */
+static void
+sort_pacl(struct posix_acl *pacl)
+{
+	int first_group, idx;
+
+	if (!pacl || pacl->a_count <= 4)
+		return;
+
+	/* Entry 0 is skipped; the ACL_USER run starts at index 1. */
+	for (idx = 1; pacl->a_entries[idx].e_tag == ACL_USER; idx++)
+		;
+	sort_pacl_range(pacl, 1, idx - 1);
+
+	/* The user run must be followed directly by ACL_GROUP_OBJ. */
+	BUG_ON(pacl->a_entries[idx].e_tag != ACL_GROUP_OBJ);
+
+	first_group = idx + 1;
+	for (idx = first_group; pacl->a_entries[idx].e_tag == ACL_GROUP; idx++)
+		;
+	sort_pacl_range(pacl, first_group, idx - 1);
+}
+
+/*
+ * While processing the NFSv4 ACE, this maintains bitmasks representing
+ * which permission bits have been allowed and which denied to a given
+ * entity: */
+struct posix_ace_state {
+ u32 allow; /* NFSv4 access bits allowed so far */
+ u32 deny; /* NFSv4 access bits denied so far */
+};
+
+/* Allow/deny state for one named user or one named group. */
+struct posix_user_ace_state {
+ /* uid for entries in the users array, gid for the groups array */
+ union {
+ kuid_t uid;
+ kgid_t gid;
+ };
+ struct posix_ace_state perms;
+};
+
+/* Counted array of per-id states; storage is allocated by init_state(). */
+struct posix_ace_state_array {
+ int n; /* number of aces[] slots in use */
+ struct posix_user_ace_state aces[];
+};
+
+/*
+ * While processing the NFSv4 ACE, this maintains the partial permissions
+ * calculated so far: */
+
+struct posix_acl_state {
+ int empty; /* 1 until process_one_v4_ace() sees an ACE */
+ struct posix_ace_state owner;
+ struct posix_ace_state group;
+ struct posix_ace_state other;
+ /* running EVERYONE@ state; seeds newly seen named users and groups */
+ struct posix_ace_state everyone;
+ struct posix_ace_state mask; /* Deny unused in this case */
+ struct posix_ace_state_array *users; /* named users, allocated by init_state() */
+ struct posix_ace_state_array *groups; /* named groups, allocated by init_state() */
+};
+
+/*
+ * Reset @state and allocate its named-user and named-group arrays.
+ *
+ * In the worst case every one of the @cnt ACEs names a distinct user or a
+ * distinct group, and we cannot tell which in advance, so both arrays are
+ * sized for @cnt entries.
+ *
+ * Returns 0 on success or -ENOMEM; on failure nothing remains allocated.
+ */
+static int
+init_state(struct posix_acl_state *state, int cnt)
+{
+	int bytes = sizeof(struct posix_ace_state_array)
+		+ cnt*sizeof(struct posix_user_ace_state);
+
+	memset(state, 0, sizeof(struct posix_acl_state));
+	state->empty = 1;
+
+	state->users = kzalloc(bytes, GFP_KERNEL);
+	if (!state->users)
+		return -ENOMEM;
+
+	state->groups = kzalloc(bytes, GFP_KERNEL);
+	if (state->groups)
+		return 0;
+
+	kfree(state->users);
+	return -ENOMEM;
+}
+
+/* Free the two arrays that init_state() allocated for @state. */
+static void
+free_state(struct posix_acl_state *state)
+{
+	kfree(state->groups);
+	kfree(state->users);
+}
+
+/* Fold the bits allowed in @astate into the accumulated mask of @state. */
+static inline void add_to_mask(struct posix_acl_state *state, struct posix_ace_state *astate)
+{
+	u32 allowed = astate->allow;
+
+	state->mask.allow |= allowed;
+}
+
+/*
+ * Convert the accumulated allow state in @state into a newly allocated
+ * POSIX ACL.
+ *
+ * Entries are written in the order USER_OBJ, named users, GROUP_OBJ,
+ * named groups, MASK (only when there is at least one named user or
+ * group), OTHER.  While doing so, state->mask.allow is updated with the
+ * bits allowed to named users, the owning group and named groups.
+ *
+ * Returns the ACL, NULL for an empty default ACL, or ERR_PTR(-ENOMEM).
+ */
+static struct posix_acl *
+posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
+{
+ struct posix_acl_entry *pace;
+ struct posix_acl *pacl;
+ int nace;
+ int i;
+
+ /*
+ * ACLs with no ACEs are treated differently in the inheritable
+ * and effective cases: when there are no inheritable ACEs we
+ * return NULL, so the caller ends up setting a NULL default ACL.
+ */
+ if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT))
+ return NULL;
+
+ /*
+ * When there are no effective ACEs, the following will end
+ * up setting a 3-element effective posix ACL with all
+ * permissions zero.
+ */
+ if (!state->users->n && !state->groups->n)
+ nace = 3;
+ else /* Note we also include a MASK ACE in this case: */
+ nace = 4 + state->users->n + state->groups->n;
+ pacl = posix_acl_alloc(nace, GFP_KERNEL);
+ if (!pacl)
+ return ERR_PTR(-ENOMEM);
+
+ pace = pacl->a_entries;
+ pace->e_tag = ACL_USER_OBJ;
+ low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags);
+
+ for (i=0; i < state->users->n; i++) {
+ pace++;
+ pace->e_tag = ACL_USER;
+ low_mode_from_nfs4(state->users->aces[i].perms.allow,
+ &pace->e_perm, flags);
+ pace->e_uid = state->users->aces[i].uid;
+ add_to_mask(state, &state->users->aces[i].perms);
+ }
+
+ pace++;
+ pace->e_tag = ACL_GROUP_OBJ;
+ low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags);
+ add_to_mask(state, &state->group);
+
+ for (i=0; i < state->groups->n; i++) {
+ pace++;
+ pace->e_tag = ACL_GROUP;
+ low_mode_from_nfs4(state->groups->aces[i].perms.allow,
+ &pace->e_perm, flags);
+ pace->e_gid = state->groups->aces[i].gid;
+ add_to_mask(state, &state->groups->aces[i].perms);
+ }
+
+ /* Same condition that reserved the extra MASK slot in nace above. */
+ if (state->users->n || state->groups->n) {
+ pace++;
+ pace->e_tag = ACL_MASK;
+ low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
+ }
+
+ pace++;
+ pace->e_tag = ACL_OTHER;
+ low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags);
+
+ return pacl;
+}
+
+/* Allow every bit of @mask that an earlier ACE has not already denied. */
+static inline void allow_bits(struct posix_ace_state *astate, u32 mask)
+{
+	u32 still_open = mask & ~astate->deny;
+
+	astate->allow |= still_open;
+}
+
+/* Deny every bit of @mask that an earlier ACE has not already allowed. */
+static inline void deny_bits(struct posix_ace_state *astate, u32 mask)
+{
+	u32 still_open = mask & ~astate->allow;
+
+	astate->deny |= still_open;
+}
+
+/*
+ * Return the index of @uid in state->users.  If it is not there yet, a
+ * new slot is appended, seeded with the current EVERYONE@ allow/deny
+ * state, and its index is returned.  No capacity check is made.
+ */
+static int find_uid(struct posix_acl_state *state, kuid_t uid)
+{
+	struct posix_ace_state_array *users = state->users;
+	struct posix_user_ace_state *slot;
+	int idx;
+
+	for (idx = 0; idx < users->n; idx++) {
+		if (uid_eq(users->aces[idx].uid, uid))
+			return idx;
+	}
+
+	/* Not found: claim the next free slot. */
+	idx = users->n++;
+	slot = &users->aces[idx];
+	slot->uid = uid;
+	slot->perms = state->everyone;
+
+	return idx;
+}
+
+/*
+ * Return the index of @gid in state->groups.  If it is not there yet, a
+ * new slot is appended, seeded with the current EVERYONE@ allow/deny
+ * state, and its index is returned.  No capacity check is made.
+ */
+static int find_gid(struct posix_acl_state *state, kgid_t gid)
+{
+	struct posix_ace_state_array *groups = state->groups;
+	struct posix_user_ace_state *slot;
+	int idx;
+
+	for (idx = 0; idx < groups->n; idx++) {
+		if (gid_eq(groups->aces[idx].gid, gid))
+			return idx;
+	}
+
+	/* Not found: claim the next free slot. */
+	idx = groups->n++;
+	slot = &groups->aces[idx];
+	slot->gid = gid;
+	slot->perms = state->everyone;
+
+	return idx;
+}
+
+/* Apply deny_bits() with @mask to every in-use entry of @a. */
+static void deny_bits_array(struct posix_ace_state_array *a, u32 mask)
+{
+	struct posix_user_ace_state *cur = a->aces;
+	struct posix_user_ace_state *stop = a->aces + a->n;
+
+	for (; cur < stop; cur++)
+		deny_bits(&cur->perms, mask);
+}
+
+/* Apply allow_bits() with @mask to every in-use entry of @a. */
+static void allow_bits_array(struct posix_ace_state_array *a, u32 mask)
+{
+	struct posix_user_ace_state *cur = a->aces;
+	struct posix_user_ace_state *stop = a->aces + a->n;
+
+	for (; cur < stop; cur++)
+		allow_bits(&cur->perms, mask);
+}
+
+/*
+ * Fold one NFSv4 ACE into the running POSIX state.
+ *
+ * The ACE is classified with ace2type() and its access mask is applied to
+ * the matching entity with allow_bits() or deny_bits(), so a bit decided
+ * by an earlier ACE is never overridden by a later one.
+ *
+ * A DENY on a named user, the owning group or a named group is also
+ * propagated to other entities, using that entity's accumulated deny
+ * bits rather than just this ACE's mask.  EVERYONE@ ACEs are applied to
+ * every entity, including all named users and groups seen so far.
+ *
+ * The caller has already checked that ace->type is ALLOW or DENY; any
+ * type other than ALLOW is treated as DENY here.
+ */
+static void process_one_v4_ace(struct posix_acl_state *state,
+ struct nfs4_ace *ace)
+{
+ u32 mask = ace->access_mask;
+ int i;
+
+ state->empty = 0;
+
+ switch (ace2type(ace)) {
+ case ACL_USER_OBJ:
+ if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ allow_bits(&state->owner, mask);
+ } else {
+ deny_bits(&state->owner, mask);
+ }
+ break;
+ case ACL_USER:
+ i = find_uid(state, ace->who_uid);
+ if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ allow_bits(&state->users->aces[i].perms, mask);
+ } else {
+ deny_bits(&state->users->aces[i].perms, mask);
+ /* Propagate this user's accumulated denials to the owner. */
+ mask = state->users->aces[i].perms.deny;
+ deny_bits(&state->owner, mask);
+ }
+ break;
+ case ACL_GROUP_OBJ:
+ if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ allow_bits(&state->group, mask);
+ } else {
+ deny_bits(&state->group, mask);
+ /* Propagate the group's accumulated denials to everyone else. */
+ mask = state->group.deny;
+ deny_bits(&state->owner, mask);
+ deny_bits(&state->everyone, mask);
+ deny_bits_array(state->users, mask);
+ deny_bits_array(state->groups, mask);
+ }
+ break;
+ case ACL_GROUP:
+ i = find_gid(state, ace->who_gid);
+ if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ allow_bits(&state->groups->aces[i].perms, mask);
+ } else {
+ deny_bits(&state->groups->aces[i].perms, mask);
+ /* Propagate this group's accumulated denials to everyone else. */
+ mask = state->groups->aces[i].perms.deny;
+ deny_bits(&state->owner, mask);
+ deny_bits(&state->group, mask);
+ deny_bits(&state->everyone, mask);
+ deny_bits_array(state->users, mask);
+ deny_bits_array(state->groups, mask);
+ }
+ break;
+ case ACL_OTHER:
+ if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ allow_bits(&state->owner, mask);
+ allow_bits(&state->group, mask);
+ allow_bits(&state->other, mask);
+ allow_bits(&state->everyone, mask);
+ allow_bits_array(state->users, mask);
+ allow_bits_array(state->groups, mask);
+ } else {
+ deny_bits(&state->owner, mask);
+ deny_bits(&state->group, mask);
+ deny_bits(&state->other, mask);
+ deny_bits(&state->everyone, mask);
+ deny_bits_array(state->users, mask);
+ deny_bits_array(state->groups, mask);
+ }
+ }
+}
+
+/*
+ * Convert the NFSv4 ACL @acl into a POSIX access ACL (*pacl) and default
+ * ACL (*dpacl).
+ *
+ * ACEs without inheritance flags feed the access ACL only.  ACEs with an
+ * inheritance flag feed the default ACL and, unless marked inherit-only,
+ * the access ACL too; they are rejected on non-directories.
+ *
+ * Returns 0 on success.  Returns -EINVAL for an ACE whose type is not
+ * ALLOW or DENY, an unsupported ACE flag, or an inheritable ACE on a
+ * non-directory.  Returns -ENOMEM on allocation failure.  On success
+ * *dpacl may be NULL (no inheritable ACEs).  On an allocation failure in
+ * posix_state_to_acl() both outputs are set to NULL; on earlier failures
+ * they are left untouched.
+ */
+static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl,
+ struct posix_acl **pacl, struct posix_acl **dpacl,
+ unsigned int flags)
+{
+ struct posix_acl_state effective_acl_state, default_acl_state;
+ struct nfs4_ace *ace;
+ int ret;
+
+ ret = init_state(&effective_acl_state, acl->naces);
+ if (ret)
+ return ret;
+ ret = init_state(&default_acl_state, acl->naces);
+ if (ret)
+ goto out_estate;
+ /* Every early exit from the loop below reports -EINVAL. */
+ ret = -EINVAL;
+ for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) {
+ if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE &&
+ ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE)
+ goto out_dstate;
+ if (ace->flag & ~NFS4_SUPPORTED_FLAGS)
+ goto out_dstate;
+ if ((ace->flag & NFS4_INHERITANCE_FLAGS) == 0) {
+ process_one_v4_ace(&effective_acl_state, ace);
+ continue;
+ }
+ if (!(flags & NFS4_ACL_DIR))
+ goto out_dstate;
+ /*
+ * Note that when only one of FILE_INHERIT or DIRECTORY_INHERIT
+ * is set, we're effectively turning on the other. That's OK,
+ * according to rfc 3530.
+ */
+ process_one_v4_ace(&default_acl_state, ace);
+
+ if (!(ace->flag & NFS4_ACE_INHERIT_ONLY_ACE))
+ process_one_v4_ace(&effective_acl_state, ace);
+ }
+ *pacl = posix_state_to_acl(&effective_acl_state, flags);
+ if (IS_ERR(*pacl)) {
+ ret = PTR_ERR(*pacl);
+ *pacl = NULL;
+ goto out_dstate;
+ }
+ *dpacl = posix_state_to_acl(&default_acl_state,
+ flags | NFS4_ACL_TYPE_DEFAULT);
+ if (IS_ERR(*dpacl)) {
+ ret = PTR_ERR(*dpacl);
+ *dpacl = NULL;
+ /* Do not hand back a half result: drop the access ACL too. */
+ posix_acl_release(*pacl);
+ *pacl = NULL;
+ goto out_dstate;
+ }
+ sort_pacl(*pacl);
+ sort_pacl(*dpacl);
+ ret = 0;
+out_dstate:
+ free_state(&default_acl_state);
+out_estate:
+ free_state(&effective_acl_state);
+ return ret;
+}
+
+/*
+ * Apply the NFSv4 ACL @acl to the file identified by @fhp.
+ *
+ * The ACL is converted to POSIX access and default ACLs, which are then
+ * installed with set_posix_acl() while the file handle is locked.  The
+ * default ACL is only set on directories.
+ *
+ * Returns an NFS status: the fh_verify() error if verification fails,
+ * nfserr_attrnotsupp when the conversion reports -EINVAL or a host call
+ * reports -EOPNOTSUPP, otherwise nfserrno() of the last host error.
+ */
+__be32
+nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfs4_acl *acl)
+{
+ __be32 error;
+ int host_error;
+ struct dentry *dentry;
+ struct inode *inode;
+ struct posix_acl *pacl = NULL, *dpacl = NULL;
+ unsigned int flags = 0;
+
+ /* Get inode */
+ error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR);
+ if (error)
+ return error;
+
+ dentry = fhp->fh_dentry;
+ inode = d_inode(dentry);
+
+ if (S_ISDIR(inode->i_mode))
+ flags = NFS4_ACL_DIR;
+
+ host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
+ if (host_error == -EINVAL)
+ return nfserr_attrnotsupp;
+ /* Conversion failed: nothing was handed back, so skip the releases. */
+ if (host_error < 0)
+ goto out_nfserr;
+
+ fh_lock(fhp);
+
+ host_error = set_posix_acl(inode, ACL_TYPE_ACCESS, pacl);
+ if (host_error < 0)
+ goto out_drop_lock;
+
+ if (S_ISDIR(inode->i_mode)) {
+ host_error = set_posix_acl(inode, ACL_TYPE_DEFAULT, dpacl);
+ }
+
+out_drop_lock:
+ fh_unlock(fhp);
+
+ posix_acl_release(pacl);
+ posix_acl_release(dpacl);
+out_nfserr:
+ if (host_error == -EOPNOTSUPP)
+ return nfserr_attrnotsupp;
+ else
+ return nfserrno(host_error);
+}
+
+
+/*
+ * Map an NFSv4 ACE's "who" to the POSIX ACL tag it corresponds to.  A
+ * named principal is a group when NFS4_ACE_IDENTIFIER_GROUP is set and a
+ * user otherwise.  An unknown whotype is a bug.
+ */
+static short
+ace2type(struct nfs4_ace *ace)
+{
+	if (ace->whotype == NFS4_ACL_WHO_NAMED) {
+		if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
+			return ACL_GROUP;
+		return ACL_USER;
+	}
+	if (ace->whotype == NFS4_ACL_WHO_OWNER)
+		return ACL_USER_OBJ;
+	if (ace->whotype == NFS4_ACL_WHO_GROUP)
+		return ACL_GROUP_OBJ;
+	if (ace->whotype == NFS4_ACL_WHO_EVERYONE)
+		return ACL_OTHER;
+
+	BUG();
+	return -1;
+}
+
+/*
+ * return the size of the struct nfs4_acl required to represent an acl
+ * with @entries entries.
+ */
+int nfs4_acl_bytes(int entries)
+{
+	/* Fixed header plus one struct nfs4_ace per entry. */
+	return entries * sizeof(struct nfs4_ace) + sizeof(struct nfs4_acl);
+}
+
+/*
+ * Special "who" strings and the NFS4_ACL_WHO_* type each stands for.
+ * Used to translate in both directions by nfs4_acl_get_whotype() and
+ * nfs4_acl_write_who().  The table is never modified, so it is const and
+ * its strings are pointers to const.
+ */
+static const struct {
+	const char *string;
+	int stringlen;	/* length of string without the trailing NUL */
+	int type;
+} s2t_map[] = {
+	{
+		.string = "OWNER@",
+		.stringlen = sizeof("OWNER@") - 1,
+		.type = NFS4_ACL_WHO_OWNER,
+	},
+	{
+		.string = "GROUP@",
+		.stringlen = sizeof("GROUP@") - 1,
+		.type = NFS4_ACL_WHO_GROUP,
+	},
+	{
+		.string = "EVERYONE@",
+		.stringlen = sizeof("EVERYONE@") - 1,
+		.type = NFS4_ACL_WHO_EVERYONE,
+	},
+};
+
+/*
+ * Classify the @len-byte "who" string at @p.  Returns the matching
+ * special type from s2t_map, or NFS4_ACL_WHO_NAMED when the string is
+ * none of the special names.  @p need not be NUL-terminated.
+ */
+int
+nfs4_acl_get_whotype(char *p, u32 len)
+{
+	int idx = 0;
+
+	while (idx < ARRAY_SIZE(s2t_map)) {
+		if (s2t_map[idx].stringlen == len &&
+		    memcmp(s2t_map[idx].string, p, len) == 0)
+			return s2t_map[idx].type;
+		idx++;
+	}
+
+	return NFS4_ACL_WHO_NAMED;
+}
+
+/*
+ * Encode the special "who" string for type @who into @xdr as an opaque.
+ *
+ * Returns 0 on success, nfserr_resource when the stream has no room, and
+ * nfserr_serverfault (after a one-time warning) when @who is not one of
+ * the types listed in s2t_map.
+ */
+__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who)
+{
+	__be32 *p;
+	int idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(s2t_map); idx++) {
+		if (s2t_map[idx].type == who)
+			break;
+	}
+	if (idx == ARRAY_SIZE(s2t_map)) {
+		WARN_ON_ONCE(1);
+		return nfserr_serverfault;
+	}
+
+	/* Four bytes of length word plus the string itself. */
+	p = xdr_reserve_space(xdr, s2t_map[idx].stringlen + 4);
+	if (!p)
+		return nfserr_resource;
+	xdr_encode_opaque(p, s2t_map[idx].string, s2t_map[idx].stringlen);
+	return 0;
+}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
new file mode 100644
index 000000000..f5b7ad084
--- /dev/null
+++ b/fs/nfsd/nfs4callback.c
@@ -0,0 +1,1381 @@
+/*
+ * Copyright (c) 2001 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Kendrick Smith <kmsmith@umich.edu>
+ * Andy Adamson <andros@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/slab.h>
+#include "nfsd.h"
+#include "state.h"
+#include "netns.h"
+#include "trace.h"
+#include "xdr4cb.h"
+#include "xdr4.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PROC
+
+static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
+
+#define NFSPROC4_CB_NULL 0
+#define NFSPROC4_CB_COMPOUND 1
+
+/* Index of predefined Linux callback client operations */
+
+/*
+ * Bookkeeping for one CB_COMPOUND call: header fields written by
+ * encode_cb_compound4args(), and results stored by
+ * decode_cb_compound4res().
+ */
+struct nfs4_cb_compound_hdr {
+ /* args */
+ u32 ident; /* minorversion 0 only */
+ u32 nops; /* op count; bumped by each op encoder, overwritten on decode */
+ __be32 *nops_p; /* where the op count sits in the encoded buffer */
+ u32 minorversion;
+ /* res */
+ int status;
+};
+
+/* Write a zero element count at @p and return the position after it. */
+static __be32 *xdr_encode_empty_array(__be32 *p)
+{
+	*p = xdr_zero;
+	return p + 1;
+}
+
+/*
+ * Encode/decode NFSv4 CB basic data types
+ *
+ * Basic NFSv4 callback data types are defined in section 15 of RFC
+ * 3530: "Network File System (NFS) version 4 Protocol" and section
+ * 20 of RFC 5661: "Network File System (NFS) Version 4 Minor Version
+ * 1 Protocol"
+ */
+
+/*
+ * nfs_cb_opnum4
+ *
+ * enum nfs_cb_opnum4 {
+ * OP_CB_GETATTR = 3,
+ * ...
+ * };
+ */
+/*
+ * Callback operation numbers as they appear on the wire; written by
+ * encode_nfs_cb_opnum4() and compared against in decode_cb_op_status().
+ */
+enum nfs_cb_opnum4 {
+ OP_CB_GETATTR = 3,
+ OP_CB_RECALL = 4,
+ OP_CB_LAYOUTRECALL = 5,
+ OP_CB_NOTIFY = 6,
+ OP_CB_PUSH_DELEG = 7,
+ OP_CB_RECALL_ANY = 8,
+ OP_CB_RECALLABLE_OBJ_AVAIL = 9,
+ OP_CB_RECALL_SLOT = 10,
+ OP_CB_SEQUENCE = 11,
+ OP_CB_WANTS_CANCELLED = 12,
+ OP_CB_NOTIFY_LOCK = 13,
+ OP_CB_NOTIFY_DEVICEID = 14,
+ OP_CB_OFFLOAD = 15,
+ OP_CB_ILLEGAL = 10044
+};
+
+/*
+ * Append the operation number @op to @xdr as one big-endian word.  The
+ * reservation is not checked for failure.
+ */
+static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op)
+{
+	__be32 *slot = xdr_reserve_space(xdr, 4);
+
+	*slot = cpu_to_be32(op);
+}
+
+/*
+ * nfs_fh4
+ *
+ * typedef opaque nfs_fh4<NFS4_FHSIZE>;
+ */
+/*
+ * Append file handle @fh to @xdr as a variable-length opaque (length word
+ * followed by the handle bytes).  A handle longer than NFS4_FHSIZE is a
+ * bug.  The reservation is not checked for failure.
+ */
+static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh)
+{
+	u32 fh_len = fh->fh_size;
+	__be32 *dst;
+
+	BUG_ON(fh_len > NFS4_FHSIZE);
+
+	dst = xdr_reserve_space(xdr, 4 + fh_len);
+	xdr_encode_opaque(dst, &fh->fh_base, fh_len);
+}
+
+/*
+ * stateid4
+ *
+ * struct stateid4 {
+ * uint32_t seqid;
+ * opaque other[12];
+ * };
+ */
+/*
+ * Append stateid @sid to @xdr: the generation as a big-endian word, then
+ * the fixed-size opaque part.  The reservation is not checked for failure.
+ */
+static void encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid)
+{
+	__be32 *dst = xdr_reserve_space(xdr, NFS4_STATEID_SIZE);
+
+	dst[0] = cpu_to_be32(sid->si_generation);
+	xdr_encode_opaque_fixed(&dst[1], &sid->si_opaque,
+				NFS4_STATEID_OTHER_SIZE);
+}
+
+/*
+ * sessionid4
+ *
+ * typedef opaque sessionid4[NFS4_SESSIONID_SIZE];
+ */
+/*
+ * Append the session id of @session to @xdr as a fixed-size opaque of
+ * NFS4_MAX_SESSIONID_LEN bytes.  The reservation is not checked for
+ * failure.
+ */
+static void encode_sessionid4(struct xdr_stream *xdr,
+			      const struct nfsd4_session *session)
+{
+	__be32 *dst = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
+
+	xdr_encode_opaque_fixed(dst, session->se_sessionid.data,
+				NFS4_MAX_SESSIONID_LEN);
+}
+
+/*
+ * nfsstat4
+ */
+/*
+ * Translation table from NFSv4 status codes to negative errno values,
+ * searched linearly by nfs_cb_stat_to_errno().  The entry with stat == -1
+ * terminates the table; the lookup stops there without reading its errno.
+ */
+static const struct {
+ int stat;
+ int errno;
+} nfs_cb_errtbl[] = {
+ { NFS4_OK, 0 },
+ { NFS4ERR_PERM, -EPERM },
+ { NFS4ERR_NOENT, -ENOENT },
+ { NFS4ERR_IO, -EIO },
+ { NFS4ERR_NXIO, -ENXIO },
+ { NFS4ERR_ACCESS, -EACCES },
+ { NFS4ERR_EXIST, -EEXIST },
+ { NFS4ERR_XDEV, -EXDEV },
+ { NFS4ERR_NOTDIR, -ENOTDIR },
+ { NFS4ERR_ISDIR, -EISDIR },
+ { NFS4ERR_INVAL, -EINVAL },
+ { NFS4ERR_FBIG, -EFBIG },
+ { NFS4ERR_NOSPC, -ENOSPC },
+ { NFS4ERR_ROFS, -EROFS },
+ { NFS4ERR_MLINK, -EMLINK },
+ { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG },
+ { NFS4ERR_NOTEMPTY, -ENOTEMPTY },
+ { NFS4ERR_DQUOT, -EDQUOT },
+ { NFS4ERR_STALE, -ESTALE },
+ { NFS4ERR_BADHANDLE, -EBADHANDLE },
+ { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
+ { NFS4ERR_NOTSUPP, -ENOTSUPP },
+ { NFS4ERR_TOOSMALL, -ETOOSMALL },
+ { NFS4ERR_SERVERFAULT, -ESERVERFAULT },
+ { NFS4ERR_BADTYPE, -EBADTYPE },
+ { NFS4ERR_LOCKED, -EAGAIN },
+ { NFS4ERR_RESOURCE, -EREMOTEIO },
+ { NFS4ERR_SYMLINK, -ELOOP },
+ { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
+ { NFS4ERR_DEADLOCK, -EDEADLK },
+ { -1, -EIO }
+};
+
+/*
+ * If we cannot translate the error, the recovery routines should
+ * handle it.
+ *
+ * Note: remaining NFSv4 error codes have values > 10000, so should
+ * not conflict with native Linux error codes.
+ */
+/*
+ * Translate an NFSv4 callback status into a negative errno using
+ * nfs_cb_errtbl.  A status that is not in the table is logged and
+ * returned negated.
+ */
+static int nfs_cb_stat_to_errno(int status)
+{
+	int idx = 0;
+
+	/* The table ends at the entry whose stat is -1. */
+	while (nfs_cb_errtbl[idx].stat != -1) {
+		if (nfs_cb_errtbl[idx].stat == status)
+			return nfs_cb_errtbl[idx].errno;
+		idx++;
+	}
+
+	dprintk("NFSD: Unrecognized NFS CB status value: %u\n", status);
+	return -status;
+}
+
+/*
+ * Decode the (opnum, status) pair that starts each callback op result.
+ *
+ * Returns -EIO if the reply is too short or the op number is not
+ * @expected.  Otherwise returns 0 and stores the status, translated to an
+ * errno, in *status.
+ */
+static int decode_cb_op_status(struct xdr_stream *xdr,
+			       enum nfs_cb_opnum4 expected, int *status)
+{
+	__be32 *p = xdr_inline_decode(xdr, 4 + 4);
+	u32 op;
+
+	if (unlikely(p == NULL))
+		return -EIO;
+
+	op = be32_to_cpup(p++);
+	if (unlikely(op != expected)) {
+		dprintk("NFSD: Callback server returned operation %d but "
+			"we issued a request for %d\n", op, expected);
+		return -EIO;
+	}
+
+	*status = nfs_cb_stat_to_errno(be32_to_cpup(p));
+	return 0;
+}
+
+/*
+ * CB_COMPOUND4args
+ *
+ * struct CB_COMPOUND4args {
+ * utf8str_cs tag;
+ * uint32_t minorversion;
+ * uint32_t callback_ident;
+ * nfs_cb_argop4 argarray<>;
+ * };
+*/
+/*
+ * Encode the CB_COMPOUND header: an empty tag, the minor version, the
+ * callback ident, and the op count.  The position of the op count is
+ * saved in hdr->nops_p so encode_cb_nops() can rewrite it once all ops
+ * have been encoded.  The reservation is not checked for failure.
+ */
+static void encode_cb_compound4args(struct xdr_stream *xdr,
+				    struct nfs4_cb_compound_hdr *hdr)
+{
+	__be32 *p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
+
+	p = xdr_encode_empty_array(p);		/* empty tag */
+	p[0] = cpu_to_be32(hdr->minorversion);
+	p[1] = cpu_to_be32(hdr->ident);
+
+	hdr->nops_p = &p[2];
+	*hdr->nops_p = cpu_to_be32(hdr->nops);	/* argarray element count */
+}
+
+/*
+ * Update argarray element count
+ */
+/*
+ * Store the final op count in the slot remembered by
+ * encode_cb_compound4args().  More ops than NFS4_MAX_BACK_CHANNEL_OPS is
+ * a bug.
+ */
+static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr)
+{
+	u32 count = hdr->nops;
+
+	BUG_ON(count > NFS4_MAX_BACK_CHANNEL_OPS);
+	*hdr->nops_p = cpu_to_be32(count);
+}
+
+/*
+ * CB_COMPOUND4res
+ *
+ * struct CB_COMPOUND4res {
+ * nfsstat4 status;
+ * utf8str_cs tag;
+ * nfs_cb_resop4 resarray<>;
+ * };
+ */
+/*
+ * Decode the fixed part of a CB_COMPOUND reply: the overall status, the
+ * (ignored) tag, and the result-array element count.
+ *
+ * The tag length comes off the wire and is therefore untrusted.  It is
+ * consumed with its own xdr_inline_decode() call rather than being folded
+ * into "length + 4": that sum is computed in 32 bits and wraps for
+ * lengths near U32_MAX, which let the bounds check pass for a tiny size
+ * while the pointer was then advanced by XDR_QUADLEN(length) words, far
+ * outside the receive buffer.
+ *
+ * Returns 0 with hdr->status and hdr->nops filled in, or -EIO if the
+ * reply is too short for what it claims to contain.
+ */
+static int decode_cb_compound4res(struct xdr_stream *xdr,
+				  struct nfs4_cb_compound_hdr *hdr)
+{
+	u32 length;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4 + 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	hdr->status = be32_to_cpup(p++);
+	length = be32_to_cpup(p);
+
+	/* Ignore the tag, but make the stream verify it is really there. */
+	if (unlikely(xdr_inline_decode(xdr, length) == NULL))
+		goto out_overflow;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	hdr->nops = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	return -EIO;
+}
+
+/*
+ * CB_RECALL4args
+ *
+ * struct CB_RECALL4args {
+ * stateid4 stateid;
+ * bool truncate;
+ * nfs_fh4 fh;
+ * };
+ */
+/*
+ * Encode a CB_RECALL op for delegation @dp: op number, the delegation's
+ * stateid, a false "truncate" flag, and the file handle.  Bumps
+ * hdr->nops.  The reservation is not checked for failure.
+ */
+static void encode_cb_recall4args(struct xdr_stream *xdr,
+				  const struct nfs4_delegation *dp,
+				  struct nfs4_cb_compound_hdr *hdr)
+{
+	__be32 *truncate;
+
+	encode_nfs_cb_opnum4(xdr, OP_CB_RECALL);
+	encode_stateid4(xdr, &dp->dl_stid.sc_stateid);
+
+	truncate = xdr_reserve_space(xdr, 4);
+	*truncate = xdr_zero;
+
+	encode_nfs_fh4(xdr, &dp->dl_stid.sc_file->fi_fhandle);
+
+	hdr->nops += 1;
+}
+
+/*
+ * CB_SEQUENCE4args
+ *
+ * struct CB_SEQUENCE4args {
+ * sessionid4 csa_sessionid;
+ * sequenceid4 csa_sequenceid;
+ * slotid4 csa_slotid;
+ * slotid4 csa_highest_slotid;
+ * bool csa_cachethis;
+ * referring_call_list4 csa_referring_call_lists<>;
+ * };
+ */
+/*
+ * Encode a CB_SEQUENCE op for callback @cb.  Nothing is encoded for
+ * minor version 0.  Otherwise the op carries the session id, the
+ * session's callback sequence number, zero for slot id, highest slot id
+ * and cachethis, and an empty referring-call list; hdr->nops is bumped.
+ * The reservation is not checked for failure.
+ */
+static void encode_cb_sequence4args(struct xdr_stream *xdr,
+				    const struct nfsd4_callback *cb,
+				    struct nfs4_cb_compound_hdr *hdr)
+{
+	struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
+	__be32 *w;
+
+	if (hdr->minorversion == 0)
+		return;
+
+	encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE);
+	encode_sessionid4(xdr, session);
+
+	w = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
+	w[0] = cpu_to_be32(session->se_cb_seq_nr);	/* csa_sequenceid */
+	w[1] = xdr_zero;				/* csa_slotid */
+	w[2] = xdr_zero;				/* csa_highest_slotid */
+	w[3] = xdr_zero;				/* csa_cachethis */
+	xdr_encode_empty_array(&w[4]);	/* csa_referring_call_lists */
+
+	hdr->nops += 1;
+}
+
+/*
+ * CB_SEQUENCE4resok
+ *
+ * struct CB_SEQUENCE4resok {
+ * sessionid4 csr_sessionid;
+ * sequenceid4 csr_sequenceid;
+ * slotid4 csr_slotid;
+ * slotid4 csr_highest_slotid;
+ * slotid4 csr_target_highest_slotid;
+ * };
+ *
+ * union CB_SEQUENCE4res switch (nfsstat4 csr_status) {
+ * case NFS4_OK:
+ * CB_SEQUENCE4resok csr_resok4;
+ * default:
+ * void;
+ * };
+ *
+ * Our current backchannel implementation supports a single backchannel
+ * with a single slot.
+ */
+static int decode_cb_sequence4resok(struct xdr_stream *xdr,
+				    struct nfsd4_callback *cb)
+{
+	struct nfsd4_session *ses = cb->cb_clp->cl_cb_session;
+	int result;
+	__be32 *p;
+
+	/*
+	 * Pull the whole CB_SEQUENCE4resok body in one go: session id plus
+	 * four 32-bit words.  A short reply is reported as -EIO.
+	 */
+	p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4);
+	if (unlikely(!p)) {
+		result = -EIO;
+		goto done;
+	}
+
+	/*
+	 * From here on, any field that does not echo what we sent (session
+	 * id, sequence number, slot 0) yields -ESERVERFAULT.
+	 */
+	result = -ESERVERFAULT;
+
+	if (memcmp(p, ses->se_sessionid.data, NFS4_MAX_SESSIONID_LEN) != 0) {
+		dprintk("NFS: %s Invalid session id\n", __func__);
+		goto done;
+	}
+	p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
+
+	if (be32_to_cpup(p++) != ses->se_cb_seq_nr) {
+		dprintk("NFS: %s Invalid sequence number\n", __func__);
+		goto done;
+	}
+
+	if (be32_to_cpup(p) != 0) {
+		dprintk("NFS: %s Invalid slotid\n", __func__);
+		goto done;
+	}
+
+	/*
+	 * FIXME: process highest slotid and target highest slotid
+	 */
+	result = 0;
+done:
+	/* the outcome is both recorded on the callback and returned */
+	cb->cb_seq_status = result;
+	return result;
+}
+
+static int decode_cb_sequence4res(struct xdr_stream *xdr,
+				  struct nfsd4_callback *cb)
+{
+	int err;
+
+	/* nothing to decode for minor version 0 */
+	if (!cb->cb_clp->cl_minorversion)
+		return 0;
+
+	err = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_seq_status);
+	if (likely(!err && !cb->cb_seq_status))
+		return decode_cb_sequence4resok(xdr, cb);
+
+	/*
+	 * Either the status word could not be decoded (err != 0), or the
+	 * operation reported a non-zero status, now held in cb_seq_status.
+	 */
+	return err;
+}
+
+/*
+ * NFSv4.0 and NFSv4.1 XDR encode functions
+ *
+ * NFSv4.0 callback argument types are defined in section 15 of RFC
+ * 3530: "Network File System (NFS) version 4 Protocol" and section 20
+ * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1
+ * Protocol".
+ */
+
+/*
+ * Argument encoder for the CB_NULL procedure.  Neither @req nor the data
+ * pointer is used; the only work done is a zero-byte reservation on @xdr.
+ *
+ * NB: Without this zero space reservation, callbacks over krb5p fail
+ */
+static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const void *__unused)
+{
+ xdr_reserve_space(xdr, 0);
+}
+
+/*
+ * 20.2. Operation 4: CB_RECALL - Recall a Delegation
+ *
+ * Emits, in order: the compound header, CB_SEQUENCE (skipped by its
+ * encoder for minor version 0), the CB_RECALL operation, and finally
+ * encode_cb_nops() on the header.
+ */
+static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
+				   const void *data)
+{
+	const struct nfsd4_callback *callback = data;
+	struct nfs4_cb_compound_hdr compound = {
+		.ident = callback->cb_clp->cl_cb_ident,
+		.minorversion = callback->cb_clp->cl_minorversion,
+	};
+
+	encode_cb_compound4args(xdr, &compound);
+	encode_cb_sequence4args(xdr, callback, &compound);
+	/* the callback is embedded in the delegation being recalled */
+	encode_cb_recall4args(xdr, cb_to_delegation(callback), &compound);
+	encode_cb_nops(&compound);
+}
+
+
+/*
+ * NFSv4.0 and NFSv4.1 XDR decode functions
+ *
+ * NFSv4.0 callback result types are defined in section 15 of RFC
+ * 3530: "Network File System (NFS) version 4 Protocol" and section 20
+ * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1
+ * Protocol".
+ */
+
+static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
+ void *__unused)
+{
+ return 0; /* CB_NULL result decoder: the reply is not examined, success is always reported */
+}
+
+/*
+ * 20.2. Operation 4: CB_RECALL - Recall a Delegation
+ *
+ * Decodes the compound header, then the CB_SEQUENCE result, then the
+ * CB_RECALL status into cb_status.  Returns a decode error, or 0.
+ *
+ * NOTE(review): the CB_RECALL status is only decoded when cb_seq_status
+ * is zero; for minor version 0 decode_cb_sequence4res() leaves that field
+ * untouched, so the result depends on its value before the call - confirm
+ * this is intended.
+ */
+static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
+				  struct xdr_stream *xdr,
+				  void *data)
+{
+	struct nfsd4_callback *callback = data;
+	struct nfs4_cb_compound_hdr compound;
+	int err;
+
+	err = decode_cb_compound4res(xdr, &compound);
+	if (unlikely(err))
+		return err;
+
+	err = decode_cb_sequence4res(xdr, callback);
+	if (likely(!err && !callback->cb_seq_status))
+		return decode_cb_op_status(xdr, OP_CB_RECALL,
+					   &callback->cb_status);
+
+	return err;
+}
+
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * CB_LAYOUTRECALL4args
+ *
+ * struct layoutrecall_file4 {
+ * nfs_fh4 lor_fh;
+ * offset4 lor_offset;
+ * length4 lor_length;
+ * stateid4 lor_stateid;
+ * };
+ *
+ * union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) {
+ * case LAYOUTRECALL4_FILE:
+ * layoutrecall_file4 lor_layout;
+ * case LAYOUTRECALL4_FSID:
+ * fsid4 lor_fsid;
+ * case LAYOUTRECALL4_ALL:
+ * void;
+ * };
+ *
+ * struct CB_LAYOUTRECALL4args {
+ * layouttype4 clora_type;
+ * layoutiomode4 clora_iomode;
+ * bool clora_changed;
+ * layoutrecall4 clora_recall;
+ * };
+ */
+static void encode_cb_layout4args(struct xdr_stream *xdr,
+				  const struct nfs4_layout_stateid *ls,
+				  struct nfs4_cb_compound_hdr *hdr)
+{
+	__be32 *w;
+
+	/* only ever built for minor version 1 and later */
+	BUG_ON(hdr->minorversion == 0);
+
+	/* opnum plus the four leading words of CB_LAYOUTRECALL4args */
+	w = xdr_reserve_space(xdr, 5 * 4);
+	w[0] = cpu_to_be32(OP_CB_LAYOUTRECALL);
+	w[1] = cpu_to_be32(ls->ls_layout_type);		/* clora_type */
+	w[2] = cpu_to_be32(IOMODE_ANY);			/* clora_iomode */
+	w[3] = cpu_to_be32(1);				/* clora_changed */
+	w[4] = cpu_to_be32(RETURN_FILE);		/* lor_recalltype */
+
+	/* lor_fh */
+	encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle);
+
+	/* the recalled range: offset 0, length NFS4_MAX_UINT64 */
+	w = xdr_reserve_space(xdr, 2 * 8);
+	w = xdr_encode_hyper(w, 0);			/* lor_offset */
+	xdr_encode_hyper(w, NFS4_MAX_UINT64);		/* lor_length */
+
+	/* lor_stateid */
+	encode_stateid4(xdr, &ls->ls_recall_sid);
+
+	hdr->nops++;
+}
+
+/*
+ * Argument encoder for a CB_LAYOUTRECALL compound: header, CB_SEQUENCE,
+ * the layout recall itself, then encode_cb_nops() on the header.
+ */
+static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const void *data)
+{
+	const struct nfsd4_callback *callback = data;
+	/* the callback is the ls_recall member of its layout stateid */
+	const struct nfs4_layout_stateid *layout =
+		container_of(callback, struct nfs4_layout_stateid, ls_recall);
+	struct nfs4_cb_compound_hdr compound = {
+		.ident = 0,
+		.minorversion = callback->cb_clp->cl_minorversion,
+	};
+
+	encode_cb_compound4args(xdr, &compound);
+	encode_cb_sequence4args(xdr, callback, &compound);
+	encode_cb_layout4args(xdr, layout, &compound);
+	encode_cb_nops(&compound);
+}
+
+/*
+ * Result decoder for a CB_LAYOUTRECALL compound: compound header, then
+ * the CB_SEQUENCE result, then the CB_LAYOUTRECALL status into cb_status.
+ */
+static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
+				  struct xdr_stream *xdr,
+				  void *data)
+{
+	struct nfsd4_callback *callback = data;
+	struct nfs4_cb_compound_hdr compound;
+	int err;
+
+	err = decode_cb_compound4res(xdr, &compound);
+	if (unlikely(err))
+		return err;
+
+	err = decode_cb_sequence4res(xdr, callback);
+	if (likely(!err && !callback->cb_seq_status))
+		return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL,
+					   &callback->cb_status);
+
+	/* stop early: decode error or non-zero CB_SEQUENCE status */
+	return err;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
+/*
+ * Encode a state owner as the 8-byte client id of its client followed by
+ * the owner string as a variable-length opaque.
+ */
+static void encode_stateowner(struct xdr_stream *xdr, struct nfs4_stateowner *so)
+{
+	__be32 *cursor;
+
+	/* client id (8) + opaque length word (4) + owner bytes */
+	cursor = xdr_reserve_space(xdr, 8 + 4 + so->so_owner.len);
+	cursor = xdr_encode_opaque_fixed(cursor, &so->so_client->cl_clientid, 8);
+	xdr_encode_opaque(cursor, so->so_owner.data, so->so_owner.len);
+}
+
+/*
+ * Argument encoder for a CB_NOTIFY_LOCK compound: header, CB_SEQUENCE,
+ * then the operation itself (opnum, file handle, lock owner).
+ */
+static void nfs4_xdr_enc_cb_notify_lock(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					const void *data)
+{
+	const struct nfsd4_callback *callback = data;
+	/* the callback is the nbl_cb member of its blocked-lock record */
+	const struct nfsd4_blocked_lock *blocked =
+		container_of(callback, struct nfsd4_blocked_lock, nbl_cb);
+	struct nfs4_lockowner *owner =
+		(struct nfs4_lockowner *)blocked->nbl_lock.fl_owner;
+	struct nfs4_cb_compound_hdr compound = {
+		.ident = 0,
+		.minorversion = callback->cb_clp->cl_minorversion,
+	};
+	__be32 *opnum;
+
+	/* never built for minor version 0 */
+	BUG_ON(compound.minorversion == 0);
+
+	encode_cb_compound4args(xdr, &compound);
+	encode_cb_sequence4args(xdr, callback, &compound);
+
+	opnum = xdr_reserve_space(xdr, 4);
+	*opnum = cpu_to_be32(OP_CB_NOTIFY_LOCK);
+	encode_nfs_fh4(xdr, &blocked->nbl_fh);
+	encode_stateowner(xdr, &owner->lo_owner);
+	compound.nops++;
+
+	encode_cb_nops(&compound);
+}
+
+/*
+ * Result decoder for a CB_NOTIFY_LOCK compound: compound header, then
+ * the CB_SEQUENCE result, then the CB_NOTIFY_LOCK status into cb_status.
+ */
+static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp,
+				       struct xdr_stream *xdr,
+				       void *data)
+{
+	struct nfsd4_callback *callback = data;
+	struct nfs4_cb_compound_hdr compound;
+	int err;
+
+	err = decode_cb_compound4res(xdr, &compound);
+	if (unlikely(err))
+		return err;
+
+	err = decode_cb_sequence4res(xdr, callback);
+	if (likely(!err && !callback->cb_seq_status))
+		return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK,
+					   &callback->cb_status);
+
+	/* stop early: decode error or non-zero CB_SEQUENCE status */
+	return err;
+}
+
+/*
+ * struct write_response4 {
+ * stateid4 wr_callback_id<1>;
+ * length4 wr_count;
+ * stable_how4 wr_committed;
+ * verifier4 wr_writeverf;
+ * };
+ * union offload_info4 switch (nfsstat4 coa_status) {
+ * case NFS4_OK:
+ * write_response4 coa_resok4;
+ * default:
+ * length4 coa_bytes_copied;
+ * };
+ * struct CB_OFFLOAD4args {
+ * nfs_fh4 coa_fh;
+ * stateid4 coa_stateid;
+ * offload_info4 coa_offload_info;
+ * };
+ */
+static void encode_offload_info4(struct xdr_stream *xdr,
+				 __be32 nfserr,
+				 const struct nfsd4_copy *cp)
+{
+	__be32 *p;
+
+	/* coa_status: selects which arm of offload_info4 follows */
+	p = xdr_reserve_space(xdr, 4);
+	*p = nfserr;
+
+	if (nfserr) {
+		/* error arm: coa_bytes_copied */
+		p = xdr_reserve_space(xdr, 8);
+		/* We always return success if bytes were written */
+		xdr_encode_hyper(p, 0);
+		return;
+	}
+
+	/* success arm: write_response4 */
+	p = xdr_reserve_space(xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE);
+	p = xdr_encode_empty_array(p);			/* wr_callback_id<1> */
+	p = xdr_encode_hyper(p, cp->cp_res.wr_bytes_written);	/* wr_count */
+	*p++ = cpu_to_be32(cp->cp_res.wr_stable_how);	/* wr_committed */
+	xdr_encode_opaque_fixed(p, cp->cp_res.wr_verifier.data,
+				NFS4_VERIFIER_SIZE);	/* wr_writeverf */
+}
+
+static void encode_cb_offload4args(struct xdr_stream *xdr,
+				   __be32 nfserr,
+				   const struct knfsd_fh *fh,
+				   const struct nfsd4_copy *cp,
+				   struct nfs4_cb_compound_hdr *hdr)
+{
+	__be32 *opnum;
+
+	opnum = xdr_reserve_space(xdr, 4);
+	*opnum = cpu_to_be32(OP_CB_OFFLOAD);
+
+	encode_nfs_fh4(xdr, fh);				/* coa_fh */
+	encode_stateid4(xdr, &cp->cp_res.cb_stateid);		/* coa_stateid */
+	encode_offload_info4(xdr, nfserr, cp);			/* coa_offload_info */
+
+	/* one more operation in this compound */
+	hdr->nops += 1;
+}
+
+/*
+ * Argument encoder for a CB_OFFLOAD compound: header, CB_SEQUENCE, the
+ * CB_OFFLOAD operation, then encode_cb_nops() on the header.
+ */
+static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const void *data)
+{
+	const struct nfsd4_callback *callback = data;
+	/* the callback is the cp_cb member of the copy it reports on */
+	const struct nfsd4_copy *copy =
+		container_of(callback, struct nfsd4_copy, cp_cb);
+	struct nfs4_cb_compound_hdr compound = {
+		.ident = 0,
+		.minorversion = callback->cb_clp->cl_minorversion,
+	};
+
+	encode_cb_compound4args(xdr, &compound);
+	encode_cb_sequence4args(xdr, callback, &compound);
+	encode_cb_offload4args(xdr, copy->nfserr, &copy->fh, copy, &compound);
+	encode_cb_nops(&compound);
+}
+
+/*
+ * Result decoder for a CB_OFFLOAD compound: compound header, then the
+ * CB_SEQUENCE result, then the CB_OFFLOAD status into cb_status.
+ */
+static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp,
+				   struct xdr_stream *xdr,
+				   void *data)
+{
+	struct nfsd4_callback *callback = data;
+	struct nfs4_cb_compound_hdr compound;
+	int err;
+
+	err = decode_cb_compound4res(xdr, &compound);
+	if (unlikely(err))
+		return err;
+
+	err = decode_cb_sequence4res(xdr, callback);
+	if (likely(!err && !callback->cb_seq_status))
+		return decode_cb_op_status(xdr, OP_CB_OFFLOAD,
+					   &callback->cb_status);
+
+	/* stop early: decode error or non-zero CB_SEQUENCE status */
+	return err;
+}
+/*
+ * RPC procedure tables
+ *
+ * PROC(proc, call, argtype, restype) expands to a designated initializer
+ * for table slot NFSPROC4_CLNT_<proc>.  The procedure number and the
+ * statistics index are both NFSPROC4_CB_<call>; the encoder and decoder
+ * are nfs4_xdr_enc_<argtype> and nfs4_xdr_dec_<restype>; the argument and
+ * reply lengths are NFS4_enc_<argtype>_sz and NFS4_dec_<restype>_sz; the
+ * name is the stringified <proc>.
+ */
+#define PROC(proc, call, argtype, restype) \
+[NFSPROC4_CLNT_##proc] = { \
+ .p_proc = NFSPROC4_CB_##call, \
+ .p_encode = nfs4_xdr_enc_##argtype, \
+ .p_decode = nfs4_xdr_dec_##restype, \
+ .p_arglen = NFS4_enc_##argtype##_sz, \
+ .p_replen = NFS4_dec_##restype##_sz, \
+ .p_statidx = NFSPROC4_CB_##call, \
+ .p_name = #proc, \
+}
+
+static const struct rpc_procinfo nfs4_cb_procedures[] = {
+ PROC(CB_NULL, NULL, cb_null, cb_null),
+ PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall), /* every entry except CB_NULL goes out as NFSPROC4_CB_COMPOUND */
+#ifdef CONFIG_NFSD_PNFS
+ PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout), /* only present when pNFS support is configured */
+#endif
+ PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock),
+ PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload),
+};
+
+static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)]; /* one counter per procedure table entry */
+static const struct rpc_version nfs_cb_version4 = {
+/*
+ * Note on the callback rpc program version number: despite language in rfc
+ * 5661 section 18.36.3 requiring servers to use 4 in this field, the
+ * official xdr descriptions for both 4.0 and 4.1 specify version 1, and
+ * in practice that appears to be what implementations use. The section
+ * 18.36.3 language is expected to be fixed in an erratum.
+ */
+ .number = 1,
+ .nrprocs = ARRAY_SIZE(nfs4_cb_procedures),
+ .procs = nfs4_cb_procedures,
+ .counts = nfs4_cb_counts,
+};
+
+static const struct rpc_version *nfs_cb_version[2] = {
+ [1] = &nfs_cb_version4, /* indexed by version number: slot 0 stays NULL, slot 1 is the only version */
+};
+
+static const struct rpc_program cb_program; /* forward declaration: cb_stats and cb_program point at each other */
+
+static struct rpc_stat cb_stats = {
+ .program = &cb_program
+};
+
+#define NFS4_CALLBACK 0x40000000
+static const struct rpc_program cb_program = {
+ .name = "nfs4_cb",
+ .number = NFS4_CALLBACK, /* default only: setup_callback_client() passes its own args.prognumber */
+ .nrvers = ARRAY_SIZE(nfs_cb_version),
+ .version = nfs_cb_version,
+ .stats = &cb_stats,
+ .pipe_dir_name = "nfsd4_cb",
+};
+
+/*
+ * Callback timeout in jiffies: one tenth of the lease time of @net's nfsd
+ * instance, but never less than one second.
+ */
+static int max_cb_time(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	u32 tenth_of_lease;
+
+	/*
+	 * nfsd4_lease is set to at most one hour in __nfsd4_write_time,
+	 * so we can use 32-bit math on it. Warn if that assumption
+	 * ever stops being true, and fall back to a tenth of an hour.
+	 */
+	if (WARN_ON_ONCE(nn->nfsd4_lease > 3600))
+		return 360 * HZ;
+
+	tenth_of_lease = ((u32)nn->nfsd4_lease) / 10;
+	return max(tenth_of_lease, 1u) * HZ;
+}
+
+static struct workqueue_struct *callback_wq;
+
+static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
+{
+ return queue_work(callback_wq, &cb->cb_work); /* hand cb_work to callback_wq; the result of queue_work() is passed straight back */
+}
+
+static void nfsd41_cb_inflight_begin(struct nfs4_client *clp)
+{
+ atomic_inc(&clp->cl_cb_inflight); /* one more callback in flight; paired with nfsd41_cb_inflight_end() */
+}
+
+/*
+ * Drop one in-flight callback; when the count reaches zero, wake anyone
+ * waiting on the counter (see nfsd41_cb_inflight_wait_complete()).
+ */
+static void nfsd41_cb_inflight_end(struct nfs4_client *clp)
+{
+	if (!atomic_dec_and_test(&clp->cl_cb_inflight))
+		return;
+
+	wake_up_var(&clp->cl_cb_inflight);
+}
+
+static void nfsd41_cb_inflight_wait_complete(struct nfs4_client *clp)
+{
+ wait_var_event(&clp->cl_cb_inflight, /* block until the in-flight count reads zero */
+ !atomic_read(&clp->cl_cb_inflight));
+}
+
+/*
+ * Pick the credential used for callbacks to @clp.  Returns a credential
+ * reference, or NULL if a kernel credential could not be prepared.
+ */
+static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses)
+{
+	struct cred *kcred;
+
+	if (clp->cl_minorversion == 0) {
+		const char *princ = clp->cl_cred.cr_targ_princ;
+
+		/* minor version 0: machine credential, "nfs" as default principal */
+		client->cl_principal = princ ? princ : "nfs";
+		return get_cred(rpc_machine_cred());
+	}
+
+	/* sessions: a kernel credential carrying the session's callback uid/gid */
+	kcred = prepare_kernel_cred(NULL);
+	if (kcred) {
+		kcred->fsuid = ses->se_cb_sec.uid;
+		kcred->fsgid = ses->se_cb_sec.gid;
+	}
+	return kcred;
+}
+
+static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
+{ /* Create the callback rpc client and credential for clp from conn (and ses for sessions); returns 0 or a negative errno. */
+ int maxtime = max_cb_time(clp->net);
+ struct rpc_timeout timeparms = {
+ .to_initval = maxtime,
+ .to_retries = 0, /* no retries: initial and maximum timeout are the same value */
+ .to_maxval = maxtime,
+ };
+ struct rpc_create_args args = {
+ .net = clp->net,
+ .address = (struct sockaddr *) &conn->cb_addr,
+ .addrsize = conn->cb_addrlen,
+ .saddress = (struct sockaddr *) &conn->cb_saddr,
+ .timeout = &timeparms,
+ .program = &cb_program,
+ .version = 1,
+ .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
+ .cred = current_cred(),
+ };
+ struct rpc_clnt *client;
+ const struct cred *cred;
+
+ if (clp->cl_minorversion == 0) { /* minor version 0: TCP to the address in conn, using the client's own flavor */
+ if (!clp->cl_cred.cr_principal &&
+ (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5)) { /* a krb5 flavor without a principal is rejected */
+ trace_nfsd_cb_setup_err(clp, -EINVAL);
+ return -EINVAL;
+ }
+ args.client_name = clp->cl_cred.cr_principal;
+ args.prognumber = conn->cb_prog;
+ args.protocol = XPRT_TRANSPORT_TCP;
+ args.authflavor = clp->cl_cred.cr_flavor;
+ clp->cl_cb_ident = conn->cb_ident;
+ } else { /* sessions: reuse the transport in conn->cb_xprt with the session's callback parameters */
+ if (!conn->cb_xprt)
+ return -EINVAL; /* no transport to send on */
+ clp->cl_cb_session = ses;
+ args.bc_xprt = conn->cb_xprt;
+ args.prognumber = clp->cl_cb_session->se_cb_prog;
+ args.protocol = conn->cb_xprt->xpt_class->xcl_ident |
+ XPRT_TRANSPORT_BC;
+ args.authflavor = ses->se_cb_sec.flavor;
+ }
+ /* Create RPC client */
+ client = rpc_create(&args);
+ if (IS_ERR(client)) {
+ trace_nfsd_cb_setup_err(clp, PTR_ERR(client));
+ return PTR_ERR(client);
+ }
+ cred = get_backchannel_cred(clp, client, ses);
+ if (!cred) { /* undo rpc_create() before reporting the failure */
+ trace_nfsd_cb_setup_err(clp, -ENOMEM);
+ rpc_shutdown_client(client);
+ return -ENOMEM;
+ }
+
+ if (clp->cl_minorversion != 0)
+ clp->cl_cb_conn.cb_xprt = conn->cb_xprt; /* remembered so nfsd4_process_cb_update() can svc_xprt_put() it later */
+ clp->cl_cb_client = client;
+ clp->cl_cb_cred = cred;
+ trace_nfsd_cb_setup(clp);
+ return 0;
+}
+
+/*
+ * Record the callback channel as down, unless a callback update is
+ * pending, in which case the state is left alone.  @reason is unused.
+ */
+static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
+{
+	if (!test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) {
+		clp->cl_cb_state = NFSD4_CB_DOWN;
+		trace_nfsd_cb_state(clp);
+	}
+}
+
+/*
+ * Record the callback channel as faulty, unless a callback update is
+ * pending, in which case the state is left alone.  @reason is unused.
+ */
+static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
+{
+	if (!test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) {
+		clp->cl_cb_state = NFSD4_CB_FAULT;
+		trace_nfsd_cb_state(clp);
+	}
+}
+
+/*
+ * Completion handler for the null probe: a clean result marks the
+ * callback channel up, any error marks it down.
+ */
+static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
+{
+	/* calldata is the cl_cb_null callback embedded in the client */
+	struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
+
+	trace_nfsd_cb_done(clp, task->tk_status);
+
+	if (!task->tk_status) {
+		clp->cl_cb_state = NFSD4_CB_UP;
+		trace_nfsd_cb_state(clp);
+		return;
+	}
+
+	nfsd4_mark_cb_down(clp, task->tk_status);
+}
+
+static void nfsd4_cb_probe_release(void *calldata)
+{
+ struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); /* calldata is the client's embedded cl_cb_null */
+
+ nfsd41_cb_inflight_end(clp); /* balances the nfsd41_cb_inflight_begin() done in nfsd4_run_cb() */
+
+}
+
+static const struct rpc_call_ops nfsd4_cb_probe_ops = { /* used by nfsd4_run_cb_work() for callbacks that have no cb_ops */
+ /* XXX: release method to ensure we set the cb channel down if
+ * necessary on early failure? */
+ .rpc_call_done = nfsd4_cb_probe_done,
+ .rpc_release = nfsd4_cb_probe_release,
+};
+
+/*
+ * Poke the callback thread to process any updates to the callback
+ * parameters, and send a null probe.
+ *
+ * The channel state is reset to unknown first; the CB_UPDATE flag set
+ * here is what makes nfsd4_run_cb_work() call nfsd4_process_cb_update().
+ */
+void nfsd4_probe_callback(struct nfs4_client *clp)
+{
+ clp->cl_cb_state = NFSD4_CB_UNKNOWN;
+ trace_nfsd_cb_state(clp);
+ set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
+ nfsd4_run_cb(&clp->cl_cb_null);
+}
+
+void nfsd4_probe_callback_sync(struct nfs4_client *clp)
+{
+ nfsd4_probe_callback(clp);
+ flush_workqueue(callback_wq); /* wait for the queued work to run; this flushes the whole queue, not only this client's work */
+}
+
+void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
+{ /* Replace the client's stored callback connection parameters with *conn and reset the channel state to unknown. */
+ clp->cl_cb_state = NFSD4_CB_UNKNOWN;
+ spin_lock(&clp->cl_lock);
+ memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn)); /* copied under cl_lock; nfsd4_process_cb_update() reads it under the same lock */
+ spin_unlock(&clp->cl_lock);
+ trace_nfsd_cb_state(clp);
+}
+
+/*
+ * There's currently a single callback channel slot.
+ * If the slot is available, then mark it busy. Otherwise, set the
+ * thread for sleeping on the callback RPC wait queue.
+ *
+ * Returns true when @cb holds the slot on return (including the case
+ * where it already held it), false when @task was left sleeping.
+ */
+static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task)
+{
+ struct nfs4_client *clp = cb->cb_clp;
+
+ if (!cb->cb_holds_slot &&
+ test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { /* bit 0 is the one slot; already set means someone else owns it */
+ rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
+ /* Race breaker */
+ if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { /* retry once in case the slot was released before we got onto the queue */
+ dprintk("%s slot is busy\n", __func__);
+ return false;
+ }
+ rpc_wake_up_queued_task(&clp->cl_cb_waitq, task); /* slot obtained after all: undo the sleep */
+ }
+ cb->cb_holds_slot = true;
+ return true;
+}
+
+/*
+ * Give up the callback slot if @cb holds it, then wake the next task
+ * waiting for it.  Does nothing when @cb does not hold the slot.
+ */
+static void nfsd41_cb_release_slot(struct nfsd4_callback *cb)
+{
+	struct nfs4_client *clp = cb->cb_clp;
+
+	if (!cb->cb_holds_slot)
+		return;
+
+	cb->cb_holds_slot = false;
+	clear_bit(0, &clp->cl_cb_slot_busy);
+	rpc_wake_up_next(&clp->cl_cb_waitq);
+}
+
+static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
+{ /* Final teardown of one callback: release the slot, run the owner's release hook, drop the in-flight count. */
+ struct nfs4_client *clp = cb->cb_clp; /* read up front; NOTE(review): presumably because ->release() may free cb - confirm */
+
+ nfsd41_cb_release_slot(cb);
+ if (cb->cb_ops && cb->cb_ops->release)
+ cb->cb_ops->release(cb);
+ nfsd41_cb_inflight_end(clp);
+}
+
+/*
+ * TODO: cb_sequence should support referring call lists, cachethis, multiple
+ * slots, and mark callback channel down on communication errors.
+ *
+ * rpc_call_prepare hook: resets the per-call status fields and, for
+ * minor version 1 and later, starts the call only once the slot is held.
+ */
+static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfsd4_callback *cb = calldata;
+ struct nfs4_client *clp = cb->cb_clp;
+ u32 minorversion = clp->cl_minorversion;
+
+ /*
+ * cb_seq_status is only set in decode_cb_sequence4res,
+ * and so will remain 1 if an rpc level failure occurs.
+ */
+ cb->cb_seq_status = 1;
+ cb->cb_status = 0;
+ if (minorversion && !nfsd41_cb_get_slot(cb, task))
+ return; /* slot busy: the task was put to sleep on cl_cb_waitq by nfsd41_cb_get_slot() */
+ rpc_call_start(task);
+}
+
+static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback *cb)
+{ /* Act on the CB_SEQUENCE outcome; true lets nfsd4_cb_done() carry on, false makes it return at once. */
+ struct nfs4_client *clp = cb->cb_clp;
+ struct nfsd4_session *session = clp->cl_cb_session;
+ bool ret = true;
+
+ if (!clp->cl_minorversion) { /* minor version 0: no slot and no CB_SEQUENCE status to look at */
+ /*
+ * If the backchannel connection was shut down while this
+ * task was queued, we need to resubmit it after setting up
+ * a new backchannel connection.
+ *
+ * Note that if we lost our callback connection permanently
+ * the submission code will error out, so we don't need to
+ * handle that case here.
+ */
+ if (RPC_SIGNALLED(task))
+ goto need_restart;
+
+ return true;
+ }
+
+ if (!cb->cb_holds_slot) /* the call finished without this callback ever holding the slot */
+ goto need_restart;
+
+ switch (cb->cb_seq_status) {
+ case 0:
+ /*
+ * No need for lock, access serialized in nfsd4_cb_prepare
+ *
+ * RFC5661 20.9.3
+ * If CB_SEQUENCE returns an error, then the state of the slot
+ * (sequence ID, cached reply) MUST NOT change.
+ */
+ ++session->se_cb_seq_nr;
+ break;
+ case -ESERVERFAULT: /* set by decode_cb_sequence4resok() when the reply does not echo our session, sequence or slot */
+ ++session->se_cb_seq_nr;
+ fallthrough;
+ case 1: /* still the value stored by nfsd4_cb_prepare(): the decoder never wrote a status */
+ case -NFS4ERR_BADSESSION:
+ nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status);
+ ret = false;
+ break;
+ case -NFS4ERR_DELAY:
+ if (!rpc_restart_call(task))
+ goto out; /* could not restart: returns with ret still true, skipping the slot release below */
+
+ rpc_delay(task, 2 * HZ); /* retry after two seconds; the slot is not released on this path */
+ return false;
+ case -NFS4ERR_BADSLOT:
+ goto retry_nowait;
+ case -NFS4ERR_SEQ_MISORDERED:
+ if (session->se_cb_seq_nr != 1) { /* reset the sequence number to 1 and retry, once */
+ session->se_cb_seq_nr = 1;
+ goto retry_nowait;
+ }
+ break;
+ default:
+ nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status);
+ dprintk("%s: unprocessed error %d\n", __func__,
+ cb->cb_seq_status);
+ }
+
+ nfsd41_cb_release_slot(cb); /* reached from every "break" above */
+ dprintk("%s: freed slot, new seqid=%d\n", __func__,
+ clp->cl_cb_session->se_cb_seq_nr);
+
+ if (RPC_SIGNALLED(task))
+ goto need_restart;
+out:
+ return ret;
+retry_nowait:
+ if (rpc_restart_call_prepare(task)) /* re-run from the prepare step without releasing the slot */
+ ret = false;
+ goto out;
+need_restart:
+ if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) { /* unless the client is being killed, have nfsd4_cb_release() requeue the work */
+ task->tk_status = 0;
+ cb->cb_need_restart = true;
+ }
+ return false;
+}
+
+static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
+{ /* rpc_call_done hook for callbacks that have cb_ops: sequence handling first, then the owner's ->done(). */
+ struct nfsd4_callback *cb = calldata;
+ struct nfs4_client *clp = cb->cb_clp;
+
+ trace_nfsd_cb_done(clp, task->tk_status);
+
+ if (!nfsd4_cb_sequence_done(task, cb))
+ return; /* restarted, delayed, or failed at the sequence level: ->done() is not called */
+
+ if (cb->cb_status) { /* the operation's own status replaces the (expected zero) rpc status */
+ WARN_ON_ONCE(task->tk_status);
+ task->tk_status = cb->cb_status;
+ }
+
+ switch (cb->cb_ops->done(cb, task)) {
+ case 0: /* ->done() asks for the call to be sent again */
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
+ return;
+ case 1: /* ->done() is finished with the call */
+ switch (task->tk_status) {
+ case -EIO:
+ case -ETIMEDOUT:
+ case -EACCES:
+ nfsd4_mark_cb_down(clp, task->tk_status); /* only these three errors mark the channel down */
+ }
+ break;
+ default: /* ->done() must return 0 or 1 */
+ BUG();
+ }
+}
+
+/*
+ * rpc_release hook: a callback flagged for restart is put back on the
+ * workqueue; any other callback is torn down.
+ */
+static void nfsd4_cb_release(void *calldata)
+{
+	struct nfsd4_callback *cb = calldata;
+
+	if (cb->cb_need_restart) {
+		nfsd4_queue_cb(cb);
+		return;
+	}
+
+	nfsd41_destroy_cb(cb);
+}
+
+static const struct rpc_call_ops nfsd4_cb_ops = { /* used by nfsd4_run_cb_work() for callbacks that have cb_ops */
+ .rpc_call_prepare = nfsd4_cb_prepare,
+ .rpc_call_done = nfsd4_cb_done,
+ .rpc_release = nfsd4_cb_release,
+};
+
+/*
+ * Allocate the ordered workqueue on which all callback work runs.
+ * Returns 0, or -ENOMEM if the queue could not be allocated.
+ */
+int nfsd4_create_callback_queue(void)
+{
+	callback_wq = alloc_ordered_workqueue("nfsd4_callbacks", 0);
+
+	return callback_wq ? 0 : -ENOMEM;
+}
+
+void nfsd4_destroy_callback_queue(void)
+{
+ destroy_workqueue(callback_wq); /* counterpart of nfsd4_create_callback_queue(); callback_wq is not reset afterwards */
+}
+
+/* must be called under the state lock */
+void nfsd4_shutdown_callback(struct nfs4_client *clp)
+{
+ set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags);
+ /*
+ * Note this won't actually result in a null callback;
+ * instead, nfsd4_run_cb_work() (through nfsd4_process_cb_update())
+ * will detect the killed client, destroy the rpc client, and stop:
+ */
+ nfsd4_run_cb(&clp->cl_cb_null);
+ flush_workqueue(callback_wq); /* wait for that work item to have run */
+ nfsd41_cb_inflight_wait_complete(clp); /* then wait until no callback for this client is in flight */
+}
+
+/*
+ * Return the first connection, over all of the client's sessions, that
+ * has NFS4_CDFC4_BACK set, or NULL if there is none.
+ *
+ * requires cl_lock:
+ */
+static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
+{
+	struct nfsd4_session *ses;
+	struct nfsd4_conn *conn;
+
+	list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) {
+		list_for_each_entry(conn, &ses->se_conns, cn_persession) {
+			if (!(conn->cn_flags & NFS4_CDFC4_BACK))
+				continue;
+			return conn;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Note there isn't a lot of locking in this code; instead we depend on
+ * the fact that it is run from the callback_wq, which won't run two
+ * work items at once.  So, for example, callback_wq handles all access
+ * of cl_cb_client and all calls to rpc_create or rpc_shutdown_client.
+ *
+ * Tears down the client's current callback rpc client, credential and
+ * transport reference.  Unless the client is being killed, it then builds
+ * a new rpc client from the stored connection parameters and, if one
+ * exists, the client's backchannel connection.
+ */
+static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
+{
+	struct nfs4_cb_conn conn;
+	struct nfs4_client *clp = cb->cb_clp;
+	struct nfsd4_session *ses = NULL;
+	struct nfsd4_conn *c;
+	int err;
+
+	/*
+	 * This is either an update, or the client dying; in either case,
+	 * kill the old client:
+	 */
+	if (clp->cl_cb_client) {
+		trace_nfsd_cb_shutdown(clp);
+		rpc_shutdown_client(clp->cl_cb_client);
+		clp->cl_cb_client = NULL;
+		put_cred(clp->cl_cb_cred);
+		clp->cl_cb_cred = NULL;
+	}
+	if (clp->cl_cb_conn.cb_xprt) {
+		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
+		clp->cl_cb_conn.cb_xprt = NULL;
+	}
+	if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
+		return;
+	spin_lock(&clp->cl_lock);
+	/*
+	 * Only serialized callback code is allowed to clear these
+	 * flags; main nfsd code can only set them:
+	 */
+	BUG_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK));
+	clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
+	/* snapshot the parameters while the lock keeps them stable */
+	memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
+	c = __nfsd4_find_backchannel(clp);
+	if (c) {
+		/* pin the transport before the lock is dropped */
+		svc_xprt_get(c->cn_xprt);
+		conn.cb_xprt = c->cn_xprt;
+		ses = c->cn_session;
+	}
+	spin_unlock(&clp->cl_lock);
+
+	err = setup_callback_client(clp, &conn, ses);
+	if (err) {
+		nfsd4_mark_cb_down(clp, err);
+		/*
+		 * Drop the transport reference taken above.  Go through the
+		 * pointer saved in conn rather than c->cn_xprt: cl_lock is
+		 * no longer held, so nothing here keeps c itself valid,
+		 * whereas the transport is pinned by our reference.
+		 */
+		if (c)
+			svc_xprt_put(conn.cb_xprt);
+		return;
+	}
+}
+
+static void
+nfsd4_run_cb_work(struct work_struct *work)
+{ /* Work function for every callback: apply pending channel updates, then issue the rpc asynchronously. */
+ struct nfsd4_callback *cb =
+ container_of(work, struct nfsd4_callback, cb_work);
+ struct nfs4_client *clp = cb->cb_clp;
+ struct rpc_clnt *clnt;
+ int flags;
+
+ trace_nfsd_cb_work(clp, cb->cb_msg.rpc_proc->p_name);
+
+ if (cb->cb_need_restart) { /* a requeued callback: ->prepare() is not run a second time */
+ cb->cb_need_restart = false;
+ } else {
+ if (cb->cb_ops && cb->cb_ops->prepare)
+ cb->cb_ops->prepare(cb);
+ }
+
+ if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) /* an update or kill was requested since the last run */
+ nfsd4_process_cb_update(cb);
+
+ clnt = clp->cl_cb_client;
+ if (!clnt) {
+ /* Callback channel broken, or client killed; give up: */
+ nfsd41_destroy_cb(cb);
+ return;
+ }
+
+ /*
+ * Don't send probe messages for 4.1 or later.
+ */
+ if (!cb->cb_ops && clp->cl_minorversion) { /* no cb_ops means this is the null probe */
+ clp->cl_cb_state = NFSD4_CB_UP;
+ nfsd41_destroy_cb(cb);
+ return;
+ }
+
+ cb->cb_msg.rpc_cred = clp->cl_cb_cred;
+ flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN;
+ rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags,
+ cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb); /* return value not checked; teardown is left to the ops' rpc_release hook */
+}
+
+/*
+ * Initialise @cb as a callback to @clp that uses procedure table entry
+ * @op and completion hooks @ops (NULL for the null probe).
+ */
+void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
+		const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
+{
+	/* target client and completion hooks */
+	cb->cb_clp = clp;
+	cb->cb_ops = ops;
+
+	/* the callback itself is both the rpc argument and the rpc result */
+	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op];
+	cb->cb_msg.rpc_argp = cb;
+	cb->cb_msg.rpc_resp = cb;
+
+	/* per-call state, matching what nfsd4_cb_prepare() resets */
+	cb->cb_seq_status = 1;
+	cb->cb_status = 0;
+	cb->cb_need_restart = false;
+	cb->cb_holds_slot = false;
+
+	INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
+}
+
+/*
+ * Queue @cb on the callback workqueue and count it as in flight.  If the
+ * work could not be queued, the count is dropped again straight away.
+ */
+void nfsd4_run_cb(struct nfsd4_callback *cb)
+{
+	struct nfs4_client *clp = cb->cb_clp;
+
+	nfsd41_cb_inflight_begin(clp);
+	if (nfsd4_queue_cb(cb))
+		return;
+
+	nfsd41_cb_inflight_end(clp);
+}
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
new file mode 100644
index 000000000..f92161ce1
--- /dev/null
+++ b/fs/nfsd/nfs4idmap.c
@@ -0,0 +1,686 @@
+/*
+ * Mapping of UID/GIDs to name and vice versa.
+ *
+ * Copyright (c) 2002, 2003 The Regents of the University of
+ * Michigan. All rights reserved.
+ *
+ * Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include <net/net_namespace.h>
+#include "idmap.h"
+#include "nfsd.h"
+#include "netns.h"
+
+/*
+ * Turn off idmapping when using AUTH_SYS.
+ */
+static bool nfs4_disable_idmapping = true;
+module_param(nfs4_disable_idmapping, bool, 0644);
+MODULE_PARM_DESC(nfs4_disable_idmapping,
+ "Turn off server's NFSv4 idmapping when using 'sec=sys'");
+
+/*
+ * Cache entry
+ */
+
+/*
+ * XXX we know that IDMAP_NAMESZ < PAGE_SIZE, but it's ugly to rely on
+ * that.
+ */
+
+ /* One idmap cache entry; used by both the idtoname and nametoid caches. */
+ struct ent {
+ struct cache_head h; /* embedded sunrpc cache bookkeeping */
+ int type; /* User / Group */
+ u32 id; /* numeric id: key for idtoname, value for nametoid */
+ char name[IDMAP_NAMESZ]; /* name: value for idtoname, key for nametoid */
+ char authname[IDMAP_NAMESZ]; /* auth domain name, part of both keys */
+ struct rcu_head rcu_head; /* used by kfree_rcu() in ent_put() */
+ };
+
+ /* Common entry handling */
+
+ /* Both idmap caches are created with ENT_HASHMAX (2^ENT_HASHBITS) buckets. */
+ #define ENT_HASHBITS 8
+ #define ENT_HASHMAX (1 << ENT_HASHBITS)
+
+ /*
+ * cache_detail .init and .update hook for both idmap caches: copy type,
+ * id, name and authname from @citm's entry into @cnew's entry. The
+ * strings are copied with strlcpy() bounded by the destination size.
+ */
+ static void
+ ent_init(struct cache_head *cnew, struct cache_head *citm)
+ {
+ 	struct ent *dst = container_of(cnew, struct ent, h);
+ 	struct ent *src = container_of(citm, struct ent, h);
+
+ 	strlcpy(dst->authname, src->authname, sizeof(dst->authname));
+ 	strlcpy(dst->name, src->name, sizeof(dst->name));
+ 	dst->type = src->type;
+ 	dst->id = src->id;
+ }
+
+ /*
+ * cache_detail .cache_put hook: kref release function for an entry.
+ * The entry is freed through kfree_rcu() using its embedded rcu_head.
+ */
+ static void
+ ent_put(struct kref *ref)
+ {
+ 	struct ent *e = container_of(ref, struct ent, h.ref);
+
+ 	kfree_rcu(e, rcu_head);
+ }
+
+ /*
+ * cache_detail .alloc hook: allocate an (uninitialised) entry and
+ * return its embedded cache_head, or NULL if the allocation fails.
+ */
+ static struct cache_head *
+ ent_alloc(void)
+ {
+ 	struct ent *e = kmalloc(sizeof(*e), GFP_KERNEL);
+
+ 	return e ? &e->h : NULL;
+ }
+
+/*
+ * ID -> Name cache
+ */
+
+ /*
+ * Hash an id->name key: the authname hash is mixed with the numeric id
+ * and the lowest bit is flipped for group entries.
+ */
+ static uint32_t
+ idtoname_hash(struct ent *ent)
+ {
+ 	uint32_t h = hash_str(ent->authname, ENT_HASHBITS);
+
+ 	h = hash_long(h ^ ent->id, ENT_HASHBITS);
+
+ 	/* Flip LSB for user/group */
+ 	return ent->type == IDMAP_TYPE_GROUP ? h ^ 1 : h;
+ }
+
+ /* cache_detail .cache_upcall hook: forwards to the sunrpc pipe upcall helper. */
+ static int
+ idtoname_upcall(struct cache_detail *cd, struct cache_head *h)
+ {
+ return sunrpc_cache_pipe_upcall_timeout(cd, h);
+ }
+
+ /*
+ * cache_detail .cache_request hook: format the upcall line for an
+ * id->name lookup as "<authname> <user|group> <id>\n". The character
+ * just before the write position is overwritten with the newline.
+ */
+ static void
+ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
+ 		int *blen)
+ {
+ 	struct ent *ent = container_of(ch, struct ent, h);
+ 	char *kind = ent->type == IDMAP_TYPE_GROUP ? "group" : "user";
+ 	char idstr[11];	/* up to 10 decimal digits of a u32 plus NUL */
+
+ 	snprintf(idstr, sizeof(idstr), "%u", ent->id);
+
+ 	qword_add(bpp, blen, ent->authname);
+ 	qword_add(bpp, blen, kind);
+ 	qword_add(bpp, blen, idstr);
+
+ 	(*bpp)[-1] = '\n';
+ }
+
+ /*
+ * cache_detail .match hook: two id->name entries match when id, type
+ * and authname are all equal. Returns 1 on match, 0 otherwise.
+ */
+ static int
+ idtoname_match(struct cache_head *ca, struct cache_head *cb)
+ {
+ 	struct ent *a = container_of(ca, struct ent, h);
+ 	struct ent *b = container_of(cb, struct ent, h);
+
+ 	if (a->id != b->id || a->type != b->type)
+ 		return 0;
+ 	return strcmp(a->authname, b->authname) == 0;
+ }
+
+ /*
+ * cache_detail .cache_show hook: print one id->name entry, or the
+ * column header when @h is NULL. The name is only printed for entries
+ * that have CACHE_VALID set.
+ */
+ static int
+ idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h)
+ {
+ 	struct ent *ent;
+ 	const char *kind;
+
+ 	if (!h) {
+ 		seq_puts(m, "#domain type id [name]\n");
+ 		return 0;
+ 	}
+
+ 	ent = container_of(h, struct ent, h);
+ 	kind = ent->type == IDMAP_TYPE_GROUP ? "group" : "user";
+
+ 	seq_printf(m, "%s %s %u", ent->authname, kind, ent->id);
+ 	if (test_bit(CACHE_VALID, &h->flags))
+ 		seq_printf(m, " %s", ent->name);
+ 	seq_putc(m, '\n');
+ 	return 0;
+ }
+
+ /*
+ * cache_detail .warn_no_listener hook shared by both idmap caches:
+ * log that idmapping is failing and whether idmapd died or was never
+ * started, according to @has_died.
+ */
+ static void
+ warn_no_idmapd(struct cache_detail *detail, int has_died)
+ {
+ 	const char *state = has_died ? "died" : "not been started";
+
+ 	printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n", state);
+ }
+
+
+ static int idtoname_parse(struct cache_detail *, char *, int);
+ static struct ent *idtoname_lookup(struct cache_detail *, struct ent *);
+ static struct ent *idtoname_update(struct cache_detail *, struct ent *,
+ struct ent *);
+
+ /*
+ * Template for the per-net "nfs4.idtoname" cache; nfsd_idmap_init()
+ * passes it to cache_create_net(). .init and .update share ent_init().
+ */
+ static const struct cache_detail idtoname_cache_template = {
+ .owner = THIS_MODULE,
+ .hash_size = ENT_HASHMAX,
+ .name = "nfs4.idtoname",
+ .cache_put = ent_put,
+ .cache_upcall = idtoname_upcall,
+ .cache_request = idtoname_request,
+ .cache_parse = idtoname_parse,
+ .cache_show = idtoname_show,
+ .warn_no_listener = warn_no_idmapd,
+ .match = idtoname_match,
+ .init = ent_init,
+ .update = ent_init,
+ .alloc = ent_alloc,
+ };
+
+ /*
+ * cache_detail .cache_parse hook for the id->name cache.
+ *
+ * @buf holds one newline-terminated line of @buflen bytes with the
+ * fields: authname, type ("user"; anything else is taken as group),
+ * decimal id, expiry and an optional name. An empty name marks the
+ * entry negative.
+ *
+ * Returns 0 on success, -EINVAL for a malformed line, or -ENOMEM when
+ * the scratch buffer or a cache entry cannot be obtained.
+ */
+ static int
+ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
+ {
+ 	struct ent ent, *res;
+ 	char *buf1, *bp;
+ 	int len;
+ 	int error = -EINVAL;
+
+ 	if (buf[buflen - 1] != '\n')
+ 		return -EINVAL;
+ 	buf[buflen - 1] = '\0';
+
+ 	buf1 = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ 	if (buf1 == NULL)
+ 		return -ENOMEM;
+
+ 	memset(&ent, 0, sizeof(ent));
+
+ 	/* Authentication name */
+ 	len = qword_get(&buf, buf1, PAGE_SIZE);
+ 	if (len <= 0 || len >= IDMAP_NAMESZ)
+ 		goto out;
+ 	memcpy(ent.authname, buf1, sizeof(ent.authname));
+
+ 	/* Type */
+ 	if (qword_get(&buf, buf1, PAGE_SIZE) <= 0)
+ 		goto out;
+ 	ent.type = strcmp(buf1, "user") == 0 ?
+ 		IDMAP_TYPE_USER : IDMAP_TYPE_GROUP;
+
+ 	/* ID */
+ 	if (qword_get(&buf, buf1, PAGE_SIZE) <= 0)
+ 		goto out;
+ 	ent.id = simple_strtoul(buf1, &bp, 10);
+ 	if (bp == buf1)
+ 		goto out;
+
+ 	/* expiry */
+ 	ent.h.expiry_time = get_expiry(&buf);
+ 	if (ent.h.expiry_time == 0)
+ 		goto out;
+
+ 	error = -ENOMEM;
+ 	res = idtoname_lookup(cd, &ent);
+ 	if (!res)
+ 		goto out;
+
+ 	/*
+ 	 * Name. From here on we hold a reference on @res (callers of the
+ 	 * lookup helpers in this file always cache_put() what they get
+ 	 * back), so a malformed name must drop it instead of leaking it.
+ 	 */
+ 	error = -EINVAL;
+ 	len = qword_get(&buf, buf1, PAGE_SIZE);
+ 	if (len < 0 || len >= IDMAP_NAMESZ)
+ 		goto out_put;
+ 	if (len == 0)
+ 		set_bit(CACHE_NEGATIVE, &ent.h.flags);
+ 	else
+ 		memcpy(ent.name, buf1, sizeof(ent.name));
+
+ 	/*
+ 	 * NOTE(review): idtoname_update() is assumed to consume the old
+ 	 * reference even when it fails, as the pre-existing code never
+ 	 * dropped it on that path -- confirm against sunrpc_cache_update().
+ 	 */
+ 	error = -ENOMEM;
+ 	res = idtoname_update(cd, &ent, res);
+ 	if (res == NULL)
+ 		goto out;
+
+ 	error = 0;
+ out_put:
+ 	cache_put(&res->h, cd);
+ out:
+ 	kfree(buf1);
+ 	return error;
+ }
+
+ /*
+ * Look @item up in the id->name cache @cd via sunrpc_cache_lookup_rcu().
+ * Returns the entry containing the returned cache_head, or NULL if the
+ * lookup returned NULL.
+ */
+ static struct ent *
+ idtoname_lookup(struct cache_detail *cd, struct ent *item)
+ {
+ 	struct cache_head *ch;
+
+ 	ch = sunrpc_cache_lookup_rcu(cd, &item->h, idtoname_hash(item));
+ 	if (!ch)
+ 		return NULL;
+ 	return container_of(ch, struct ent, h);
+ }
+
+ /*
+ * Update id->name entry @old with the contents of @new through
+ * sunrpc_cache_update(). Returns the resulting entry, or NULL if the
+ * update returned NULL.
+ */
+ static struct ent *
+ idtoname_update(struct cache_detail *cd, struct ent *new, struct ent *old)
+ {
+ 	struct cache_head *ch;
+
+ 	ch = sunrpc_cache_update(cd, &new->h, &old->h, idtoname_hash(new));
+ 	if (!ch)
+ 		return NULL;
+ 	return container_of(ch, struct ent, h);
+ }
+
+
+/*
+ * Name -> ID cache
+ */
+
+ /* Hash a name->id key; only the name string contributes to the hash. */
+ static inline int
+ nametoid_hash(struct ent *ent)
+ {
+ return hash_str(ent->name, ENT_HASHBITS);
+ }
+
+ /* cache_detail .cache_upcall hook: forwards to the sunrpc pipe upcall helper. */
+ static int
+ nametoid_upcall(struct cache_detail *cd, struct cache_head *h)
+ {
+ return sunrpc_cache_pipe_upcall_timeout(cd, h);
+ }
+
+ /*
+ * cache_detail .cache_request hook: format the upcall line for a
+ * name->id lookup as "<authname> <user|group> <name>\n". The character
+ * just before the write position is overwritten with the newline.
+ */
+ static void
+ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
+ 		int *blen)
+ {
+ 	struct ent *ent = container_of(ch, struct ent, h);
+ 	char *kind = ent->type == IDMAP_TYPE_GROUP ? "group" : "user";
+
+ 	qword_add(bpp, blen, ent->authname);
+ 	qword_add(bpp, blen, kind);
+ 	qword_add(bpp, blen, ent->name);
+
+ 	(*bpp)[-1] = '\n';
+ }
+
+ /*
+ * cache_detail .match hook: two name->id entries match when type, name
+ * and authname are all equal. Returns 1 on match, 0 otherwise.
+ */
+ static int
+ nametoid_match(struct cache_head *ca, struct cache_head *cb)
+ {
+ 	struct ent *a = container_of(ca, struct ent, h);
+ 	struct ent *b = container_of(cb, struct ent, h);
+
+ 	if (a->type != b->type)
+ 		return 0;
+ 	if (strcmp(a->name, b->name) != 0)
+ 		return 0;
+ 	return strcmp(a->authname, b->authname) == 0;
+ }
+
+ /*
+ * cache_detail .cache_show hook: print one name->id entry, or the
+ * column header when @h is NULL. The id is only printed for entries
+ * that have CACHE_VALID set.
+ */
+ static int
+ nametoid_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h)
+ {
+ 	struct ent *ent;
+ 	const char *kind;
+
+ 	if (!h) {
+ 		seq_puts(m, "#domain type name [id]\n");
+ 		return 0;
+ 	}
+
+ 	ent = container_of(h, struct ent, h);
+ 	kind = ent->type == IDMAP_TYPE_GROUP ? "group" : "user";
+
+ 	seq_printf(m, "%s %s %s", ent->authname, kind, ent->name);
+ 	if (test_bit(CACHE_VALID, &h->flags))
+ 		seq_printf(m, " %u", ent->id);
+ 	seq_putc(m, '\n');
+ 	return 0;
+ }
+
+ static struct ent *nametoid_lookup(struct cache_detail *, struct ent *);
+ static struct ent *nametoid_update(struct cache_detail *, struct ent *,
+ struct ent *);
+ static int nametoid_parse(struct cache_detail *, char *, int);
+
+ /*
+ * Template for the per-net "nfs4.nametoid" cache; nfsd_idmap_init()
+ * passes it to cache_create_net(). .init and .update share ent_init().
+ */
+ static const struct cache_detail nametoid_cache_template = {
+ .owner = THIS_MODULE,
+ .hash_size = ENT_HASHMAX,
+ .name = "nfs4.nametoid",
+ .cache_put = ent_put,
+ .cache_upcall = nametoid_upcall,
+ .cache_request = nametoid_request,
+ .cache_parse = nametoid_parse,
+ .cache_show = nametoid_show,
+ .warn_no_listener = warn_no_idmapd,
+ .match = nametoid_match,
+ .init = ent_init,
+ .update = ent_init,
+ .alloc = ent_alloc,
+ };
+
+ /*
+ * cache_detail .cache_parse hook for the name->id cache.
+ *
+ * @buf holds one newline-terminated line of @buflen bytes with the
+ * fields: authname, type ("user"; anything else is taken as group),
+ * name, expiry and an optional id. When get_int() reports -ENOENT for
+ * the id the entry is marked negative.
+ *
+ * Returns 0 on success, -EINVAL for a malformed line, or -ENOMEM when
+ * the scratch buffer or a cache entry cannot be obtained.
+ */
+ static int
+ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
+ {
+ 	struct ent ent, *res;
+ 	char *word;
+ 	int len, error = -EINVAL;
+
+ 	if (buf[buflen - 1] != '\n')
+ 		return -EINVAL;
+ 	buf[buflen - 1] = '\0';
+
+ 	word = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ 	if (!word)
+ 		return -ENOMEM;
+
+ 	memset(&ent, 0, sizeof(ent));
+
+ 	/* Authentication name */
+ 	len = qword_get(&buf, word, PAGE_SIZE);
+ 	if (len <= 0 || len >= IDMAP_NAMESZ)
+ 		goto out;
+ 	memcpy(ent.authname, word, sizeof(ent.authname));
+
+ 	/* Type */
+ 	if (qword_get(&buf, word, PAGE_SIZE) <= 0)
+ 		goto out;
+ 	if (strcmp(word, "user") == 0)
+ 		ent.type = IDMAP_TYPE_USER;
+ 	else
+ 		ent.type = IDMAP_TYPE_GROUP;
+
+ 	/* Name */
+ 	len = qword_get(&buf, word, PAGE_SIZE);
+ 	if (len <= 0 || len >= IDMAP_NAMESZ)
+ 		goto out;
+ 	memcpy(ent.name, word, sizeof(ent.name));
+
+ 	/* expiry */
+ 	ent.h.expiry_time = get_expiry(&buf);
+ 	if (ent.h.expiry_time == 0)
+ 		goto out;
+
+ 	/* ID */
+ 	error = get_int(&buf, &ent.id);
+ 	if (error == -EINVAL)
+ 		goto out;
+ 	if (error == -ENOENT)
+ 		set_bit(CACHE_NEGATIVE, &ent.h.flags);
+
+ 	error = -ENOMEM;
+ 	res = nametoid_lookup(cd, &ent);
+ 	if (!res)
+ 		goto out;
+ 	res = nametoid_update(cd, &ent, res);
+ 	if (!res)
+ 		goto out;
+
+ 	cache_put(&res->h, cd);
+ 	error = 0;
+ out:
+ 	kfree(word);
+ 	return error;
+ }
+
+
+ /*
+ * Look @item up in the name->id cache @cd via sunrpc_cache_lookup_rcu().
+ * Returns the entry containing the returned cache_head, or NULL if the
+ * lookup returned NULL.
+ */
+ static struct ent *
+ nametoid_lookup(struct cache_detail *cd, struct ent *item)
+ {
+ 	struct cache_head *ch;
+
+ 	ch = sunrpc_cache_lookup_rcu(cd, &item->h, nametoid_hash(item));
+ 	if (!ch)
+ 		return NULL;
+ 	return container_of(ch, struct ent, h);
+ }
+
+ /*
+ * Update name->id entry @old with the contents of @new through
+ * sunrpc_cache_update(). Returns the resulting entry, or NULL if the
+ * update returned NULL.
+ */
+ static struct ent *
+ nametoid_update(struct cache_detail *cd, struct ent *new, struct ent *old)
+ {
+ 	struct cache_head *ch;
+
+ 	ch = sunrpc_cache_update(cd, &new->h, &old->h, nametoid_hash(new));
+ 	if (!ch)
+ 		return NULL;
+ 	return container_of(ch, struct ent, h);
+ }
+
+/*
+ * Exported API
+ */
+
+ /*
+ * Create and register the idtoname and nametoid caches for @net and
+ * store them in the namespace's nfsd_net.
+ *
+ * Returns 0 on success or the error from cache creation/registration;
+ * on failure everything set up so far is torn down in reverse order.
+ */
+ int
+ nfsd_idmap_init(struct net *net)
+ {
+ int rv;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ nn->idtoname_cache = cache_create_net(&idtoname_cache_template, net);
+ if (IS_ERR(nn->idtoname_cache))
+ return PTR_ERR(nn->idtoname_cache);
+ rv = cache_register_net(nn->idtoname_cache, net);
+ if (rv)
+ goto destroy_idtoname_cache;
+ nn->nametoid_cache = cache_create_net(&nametoid_cache_template, net);
+ if (IS_ERR(nn->nametoid_cache)) {
+ rv = PTR_ERR(nn->nametoid_cache);
+ goto unregister_idtoname_cache;
+ }
+ rv = cache_register_net(nn->nametoid_cache, net);
+ if (rv)
+ goto destroy_nametoid_cache;
+ return 0;
+
+ /* Unwind: each label undoes one successful step above. */
+ destroy_nametoid_cache:
+ cache_destroy_net(nn->nametoid_cache, net);
+ unregister_idtoname_cache:
+ cache_unregister_net(nn->idtoname_cache, net);
+ destroy_idtoname_cache:
+ cache_destroy_net(nn->idtoname_cache, net);
+ return rv;
+ }
+
+ /*
+ * Counterpart of nfsd_idmap_init(): unregister both idmap caches of
+ * @net and then destroy them.
+ */
+ void
+ nfsd_idmap_shutdown(struct net *net)
+ {
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ cache_unregister_net(nn->idtoname_cache, net);
+ cache_unregister_net(nn->nametoid_cache, net);
+ cache_destroy_net(nn->idtoname_cache, net);
+ cache_destroy_net(nn->nametoid_cache, net);
+ }
+
+ /*
+ * Look @key up in @detail with @lookup_fn and validate the result with
+ * cache_check().
+ *
+ * On a 0 return *@item is the entry found and the caller must
+ * cache_put() it (as idmap_name_to_id() and idmap_id_to_name() do).
+ * Returns -ENOMEM if @lookup_fn yields no entry, otherwise the
+ * cache_check() result.
+ *
+ * If cache_check() times out the key is looked up again; a different
+ * entry is checked afresh, the same entry has the extra reference from
+ * the second lookup dropped and the timeout is returned.
+ */
+ static int
+ idmap_lookup(struct svc_rqst *rqstp,
+ 		struct ent *(*lookup_fn)(struct cache_detail *, struct ent *),
+ 		struct ent *key, struct cache_detail *detail, struct ent **item)
+ {
+ 	int ret;
+
+ 	*item = lookup_fn(detail, key);
+ 	if (!*item)
+ 		return -ENOMEM;
+ retry:
+ 	ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle);
+
+ 	if (ret == -ETIMEDOUT) {
+ 		struct ent *prev_item = *item;
+
+ 		*item = lookup_fn(detail, key);
+ 		/*
+ 		 * The second lookup can fail just like the first one; do
+ 		 * not loop back and hand a NULL entry to cache_check().
+ 		 */
+ 		if (!*item)
+ 			return -ENOMEM;
+ 		if (*item != prev_item)
+ 			goto retry;
+ 		cache_put(&(*item)->h, detail);
+ 	}
+ 	return ret;
+ }
+
+ /*
+ * Return the auth domain name used as the authname key for @rqstp:
+ * the GSS client's domain if one is set, the regular client's domain
+ * otherwise.
+ */
+ static char *
+ rqst_authname(struct svc_rqst *rqstp)
+ {
+ 	struct auth_domain *dom = rqstp->rq_gssclient;
+
+ 	if (!dom)
+ 		dom = rqstp->rq_client;
+ 	return dom->name;
+ }
+
+ /*
+ * Map @name (@namelen bytes, not NUL-terminated) to a numeric id of
+ * the given @type through the nametoid cache.
+ *
+ * Returns 0 with *@id set; nfserr_badowner if the name does not fit in
+ * a cache entry or the lookup reports -ENOENT; otherwise the nfserrno()
+ * translation of the lookup error.
+ */
+ static __be32
+ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen,
+ 		u32 *id)
+ {
+ 	struct ent *item, key = {
+ 		.type = type,
+ 	};
+ 	int ret;
+ 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ 	/*
+ 	 * Compare without adding to namelen: "namelen + 1" is computed in
+ 	 * 32 bits and wraps to 0 for the largest u32, which would let the
+ 	 * memcpy() below run past key.name.
+ 	 */
+ 	if (namelen >= sizeof(key.name))
+ 		return nfserr_badowner;
+ 	memcpy(key.name, name, namelen);
+ 	key.name[namelen] = '\0';
+ 	strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
+ 	ret = idmap_lookup(rqstp, nametoid_lookup, &key, nn->nametoid_cache, &item);
+ 	if (ret == -ENOENT)
+ 		return nfserr_badowner;
+ 	if (ret)
+ 		return nfserrno(ret);
+ 	*id = item->id;
+ 	cache_put(&item->h, nn->nametoid_cache);
+ 	return 0;
+ }
+
+ /*
+ * Encode @id into @xdr as an opaque holding its decimal ASCII form.
+ * Returns 0, or nfserr_resource if the stream has no room.
+ */
+ static __be32 encode_ascii_id(struct xdr_stream *xdr, u32 id)
+ {
+ 	char buf[11];	/* up to 10 decimal digits of a u32 plus NUL */
+ 	__be32 *p;
+ 	int len = sprintf(buf, "%u", id);
+
+ 	/* 4 extra bytes are reserved alongside the digits. */
+ 	p = xdr_reserve_space(xdr, len + 4);
+ 	if (!p)
+ 		return nfserr_resource;
+ 	xdr_encode_opaque(p, buf, len);
+ 	return 0;
+ }
+
+ /*
+ * Encode the name mapped to (@type, @id) into @xdr using the idtoname
+ * cache.
+ *
+ * If the lookup reports -ENOENT the id is encoded as a decimal string
+ * instead. Returns 0 on success, nfserr_resource if @xdr has no room,
+ * or the nfserrno() translation of any other lookup error.
+ */
+ static __be32 idmap_id_to_name(struct xdr_stream *xdr,
+ 		struct svc_rqst *rqstp, int type, u32 id)
+ {
+ 	struct ent *item, key = {
+ 		.id = id,
+ 		.type = type,
+ 	};
+ 	__be32 status = 0;
+ 	__be32 *p;
+ 	int ret;
+ 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ 	strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
+ 	ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item);
+ 	if (ret == -ENOENT)
+ 		return encode_ascii_id(xdr, id);
+ 	if (ret)
+ 		return nfserrno(ret);
+ 	ret = strlen(item->name);
+ 	WARN_ON_ONCE(ret > IDMAP_NAMESZ);
+ 	p = xdr_reserve_space(xdr, ret + 4);
+ 	if (!p) {
+ 		/* Still holding the cache reference: drop it before failing. */
+ 		status = nfserr_resource;
+ 		goto out_put;
+ 	}
+ 	xdr_encode_opaque(p, item->name, ret);
+ out_put:
+ 	cache_put(&item->h, nn->idtoname_cache);
+ 	return status;
+ }
+
+ /*
+ * Try to interpret @name (@namelen bytes, not NUL-terminated) as a
+ * decimal 32-bit id. Returns true with *@id set when kstrtouint()
+ * accepts the string, false otherwise. @rqstp and @type are unused.
+ */
+ static bool
+ numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id)
+ {
+ 	int ret;
+ 	char buf[11];
+
+ 	/*
+ 	 * Too long to represent a 32-bit id. Compare without adding to
+ 	 * namelen: "namelen + 1" is computed in 32 bits and wraps to 0
+ 	 * for the largest u32, which would defeat this bound.
+ 	 */
+ 	if (namelen >= sizeof(buf))
+ 		return false;
+ 	/* Just to make sure it's null-terminated: */
+ 	memcpy(buf, name, namelen);
+ 	buf[namelen] = '\0';
+ 	ret = kstrtouint(buf, 10, id);
+ 	return ret == 0;
+ }
+
+ /*
+ * Map an owner string to an id. With idmapping disabled and an auth
+ * flavor below RPC_AUTH_GSS a purely numeric string is accepted
+ * directly; everything else goes through the idmap cache.
+ */
+ static __be32
+ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id)
+ {
+ 	bool try_numeric = nfs4_disable_idmapping &&
+ 			   rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS;
+
+ 	if (try_numeric && numeric_name_to_id(rqstp, type, name, namelen, id))
+ 		return 0;
+ 	/*
+ 	 * otherwise, fall through and try idmapping, for
+ 	 * backwards compatibility with clients sending names:
+ 	 */
+ 	return idmap_name_to_id(rqstp, type, name, namelen, id);
+ }
+
+ /*
+ * Encode the owner string for (@type, @id): a decimal string when
+ * idmapping is disabled and the auth flavor is below RPC_AUTH_GSS,
+ * the idmap cache result otherwise.
+ */
+ static __be32 encode_name_from_id(struct xdr_stream *xdr,
+ 		struct svc_rqst *rqstp, int type, u32 id)
+ {
+ 	bool numeric = nfs4_disable_idmapping &&
+ 		       rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS;
+
+ 	if (numeric)
+ 		return encode_ascii_id(xdr, id);
+ 	return idmap_id_to_name(xdr, rqstp, type, id);
+ }
+
+ /*
+ * Map the owner string @name (@namelen bytes) to a kuid in the
+ * request's user namespace.
+ *
+ * Returns nfserr_inval for a NULL or empty name, nfserr_badowner when
+ * the resulting kuid is not valid, and the do_name_to_id() status
+ * otherwise. *@uid is written whenever the name is non-empty.
+ *
+ * NOTE(review): id starts out as (u32)-1, so when the lookup fails
+ * without setting it the validity check presumably replaces the
+ * original error with nfserr_badowner -- confirm this is intended.
+ */
+ __be32
+ nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
+ 		kuid_t *uid)
+ {
+ 	u32 id = -1;
+ 	__be32 status;
+
+ 	if (!name || !namelen)
+ 		return nfserr_inval;
+
+ 	status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id);
+ 	*uid = make_kuid(nfsd_user_namespace(rqstp), id);
+ 	return uid_valid(*uid) ? status : nfserr_badowner;
+ }
+
+ /*
+ * Map the owner_group string @name (@namelen bytes) to a kgid in the
+ * request's user namespace.
+ *
+ * Returns nfserr_inval for a NULL or empty name, nfserr_badowner when
+ * the resulting kgid is not valid, and the do_name_to_id() status
+ * otherwise. *@gid is written whenever the name is non-empty.
+ *
+ * NOTE(review): id starts out as (u32)-1, so when the lookup fails
+ * without setting it the validity check presumably replaces the
+ * original error with nfserr_badowner -- confirm this is intended.
+ */
+ __be32
+ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
+ 		kgid_t *gid)
+ {
+ 	u32 id = -1;
+ 	__be32 status;
+
+ 	if (!name || !namelen)
+ 		return nfserr_inval;
+
+ 	status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id);
+ 	*gid = make_kgid(nfsd_user_namespace(rqstp), id);
+ 	return gid_valid(*gid) ? status : nfserr_badowner;
+ }
+
+ /*
+ * Encode the owner string for @uid into @xdr. The kuid is first
+ * converted to a u32 in the request's user namespace with
+ * from_kuid_munged().
+ */
+ __be32 nfsd4_encode_user(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+ kuid_t uid)
+ {
+ u32 id = from_kuid_munged(nfsd_user_namespace(rqstp), uid);
+ return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_USER, id);
+ }
+
+ /*
+ * Encode the owner_group string for @gid into @xdr. The kgid is first
+ * converted to a u32 in the request's user namespace with
+ * from_kgid_munged().
+ */
+ __be32 nfsd4_encode_group(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+ kgid_t gid)
+ {
+ u32 id = from_kgid_munged(nfsd_user_namespace(rqstp), gid);
+ return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_GROUP, id);
+ }
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
new file mode 100644
index 000000000..2673019d3
--- /dev/null
+++ b/fs/nfsd/nfs4layouts.c
@@ -0,0 +1,786 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/blkdev.h>
+#include <linux/kmod.h>
+#include <linux/file.h>
+#include <linux/jhash.h>
+#include <linux/sched.h>
+#include <linux/sunrpc/addr.h>
+
+#include "pnfs.h"
+#include "netns.h"
+#include "trace.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+ /* One layout segment tracked under a layout stateid. */
+ struct nfs4_layout {
+ struct list_head lo_perstate; /* entry in ls_layouts, or in a reap list */
+ struct nfs4_layout_stateid *lo_state; /* owning layout stateid */
+ struct nfsd4_layout_seg lo_seg; /* the segment itself */
+ };
+
+ /* Slab caches for struct nfs4_layout and layout stateids. */
+ static struct kmem_cache *nfs4_layout_cache;
+ static struct kmem_cache *nfs4_layout_stateid_cache;
+
+ /* Tentative definitions; both are referenced before this point in the file defines them. */
+ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
+ static const struct lock_manager_operations nfsd4_layouts_lm_ops;
+
+ /*
+ * Layout operations indexed by layout type. Only the layout types
+ * enabled in the kernel configuration get an entry; the others stay
+ * NULL.
+ */
+ const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
+ #ifdef CONFIG_NFSD_FLEXFILELAYOUT
+ [LAYOUT_FLEX_FILES] = &ff_layout_ops,
+ #endif
+ #ifdef CONFIG_NFSD_BLOCKLAYOUT
+ [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
+ #endif
+ #ifdef CONFIG_NFSD_SCSILAYOUT
+ [LAYOUT_SCSI] = &scsi_layout_ops,
+ #endif
+ };
+
+ /*
+ * pNFS device ID to export fsid mapping
+ *
+ * Maps live in nfsd_devid_hash, bucketed by devid_hashfn() of their
+ * index. Indices are handed out from nfsd_devid_seq, and insertions
+ * are serialised by nfsd_devid_lock.
+ */
+ #define DEVID_HASH_BITS 8
+ #define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS)
+ #define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1)
+ static u64 nfsd_devid_seq = 1;
+ static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE];
+ static DEFINE_SPINLOCK(nfsd_devid_lock);
+
+ /* Hash a 64-bit device-map index into a nfsd_devid_hash bucket number. */
+ static inline u32 devid_hashfn(u64 idx)
+ {
+ 	u32 lo = idx;
+ 	u32 hi = idx >> 32;
+
+ 	return jhash_2words(lo, hi, 0) & DEVID_HASH_MASK;
+ }
+
+ /*
+ * Make sure the export behind @fhp has a device-ID map.
+ *
+ * A candidate map keyed by the file handle's fsid is allocated first.
+ * Under nfsd_devid_lock the export then either keeps a map it already
+ * has, adopts an existing map with the same fsid type and fsid bytes,
+ * or gets the candidate published under the next sequence number. An
+ * unused candidate is freed.
+ *
+ * Allocation failure is silent; nfsd4_set_deviceid() checks
+ * ex_devid_map afterwards.
+ */
+ static void
+ nfsd4_alloc_devid_map(const struct svc_fh *fhp)
+ {
+ const struct knfsd_fh *fh = &fhp->fh_handle;
+ size_t fsid_len = key_len(fh->fh_fsid_type);
+ struct nfsd4_deviceid_map *map, *old;
+ int i;
+
+ map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL);
+ if (!map)
+ return;
+
+ map->fsid_type = fh->fh_fsid_type;
+ memcpy(&map->fsid, fh->fh_fsid, fsid_len);
+
+ spin_lock(&nfsd_devid_lock);
+ /* Someone else may have installed a map while we were allocating. */
+ if (fhp->fh_export->ex_devid_map)
+ goto out_unlock;
+
+ /* Search every bucket: maps are hashed by index, not by fsid. */
+ for (i = 0; i < DEVID_HASH_SIZE; i++) {
+ list_for_each_entry(old, &nfsd_devid_hash[i], hash) {
+ if (old->fsid_type != fh->fh_fsid_type)
+ continue;
+ if (memcmp(old->fsid, fh->fh_fsid,
+ key_len(old->fsid_type)))
+ continue;
+
+ fhp->fh_export->ex_devid_map = old;
+ goto out_unlock;
+ }
+ }
+
+ map->idx = nfsd_devid_seq++;
+ list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]);
+ fhp->fh_export->ex_devid_map = map;
+ /* Ownership moved to the hash table; keep kfree() below from freeing it. */
+ map = NULL;
+
+ out_unlock:
+ spin_unlock(&nfsd_devid_lock);
+ kfree(map);
+ }
+
+ /*
+ * Find the device-ID map with index @idx, or NULL if there is none.
+ * The bucket is walked under rcu_read_lock(); the walk does not stop
+ * at the first match, so the last matching entry in the bucket wins.
+ *
+ * NOTE(review): @idx is an int while nfsd4_alloc_devid_map() assigns
+ * map->idx from the u64 nfsd_devid_seq -- confirm the narrower type
+ * is intended. The returned pointer is used after rcu_read_unlock();
+ * nothing in view removes maps from the table -- TODO confirm.
+ */
+ struct nfsd4_deviceid_map *
+ nfsd4_find_devid_map(int idx)
+ {
+ struct nfsd4_deviceid_map *map, *ret = NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash)
+ if (map->idx == idx)
+ ret = map;
+ rcu_read_unlock();
+
+ return ret;
+ }
+
+ /*
+ * Fill in device ID @id for the export behind @fhp, creating the
+ * export's device-ID map on first use.
+ *
+ * Returns 0 on success or -ENOMEM if no map could be set up.
+ */
+ int
+ nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
+ 		u32 device_generation)
+ {
+ 	struct svc_export *exp = fhp->fh_export;
+
+ 	if (!exp->ex_devid_map) {
+ 		nfsd4_alloc_devid_map(fhp);
+ 		if (!exp->ex_devid_map)
+ 			return -ENOMEM;
+ 	}
+
+ 	id->pad = 0;
+ 	id->generation = device_generation;
+ 	id->fsid_idx = exp->ex_devid_map->idx;
+ 	return 0;
+ }
+
+ /*
+ * Record in exp->ex_layout_types which of the compiled-in layout types
+ * export @exp can offer. Nothing is set unless the export has
+ * NFSEXP_PNFS.
+ */
+ void nfsd4_setup_layout_type(struct svc_export *exp)
+ {
+ #if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT)
+ struct super_block *sb = exp->ex_path.mnt->mnt_sb;
+ #endif
+
+ if (!(exp->ex_flags & NFSEXP_PNFS))
+ return;
+
+ #ifdef CONFIG_NFSD_FLEXFILELAYOUT
+ exp->ex_layout_types |= 1 << LAYOUT_FLEX_FILES;
+ #endif
+ #ifdef CONFIG_NFSD_BLOCKLAYOUT
+ /* Block layout needs all three export operations from the filesystem. */
+ if (sb->s_export_op->get_uuid &&
+ sb->s_export_op->map_blocks &&
+ sb->s_export_op->commit_blocks)
+ exp->ex_layout_types |= 1 << LAYOUT_BLOCK_VOLUME;
+ #endif
+ #ifdef CONFIG_NFSD_SCSILAYOUT
+ /* SCSI layout additionally needs a block device with pr_ops and SCSI passthrough. */
+ if (sb->s_export_op->map_blocks &&
+ sb->s_export_op->commit_blocks &&
+ sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops &&
+ blk_queue_scsi_passthrough(sb->s_bdev->bd_disk->queue))
+ exp->ex_layout_types |= 1 << LAYOUT_SCSI;
+ #endif
+ }
+
+ /*
+ * Free callback passed to nfs4_alloc_stid() for layout stateids.
+ *
+ * Unlinks the stateid from its client and file lists, removes the
+ * lease unless the layout type disables recalls, drops the file
+ * reference, undoes the recall accounting and frees the object.
+ */
+ static void
+ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
+ {
+ struct nfs4_layout_stateid *ls = layoutstateid(stid);
+ struct nfs4_client *clp = ls->ls_stid.sc_client;
+ struct nfs4_file *fp = ls->ls_stid.sc_file;
+
+ trace_nfsd_layoutstate_free(&ls->ls_stid.sc_stateid);
+
+ spin_lock(&clp->cl_lock);
+ list_del_init(&ls->ls_perclnt);
+ spin_unlock(&clp->cl_lock);
+
+ spin_lock(&fp->fi_lock);
+ list_del_init(&ls->ls_perfile);
+ spin_unlock(&fp->fi_lock);
+
+ /* Mirrors nfsd4_layout_setlease(), which skips the lease in the same case. */
+ if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
+ vfs_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls);
+ nfsd_file_put(ls->ls_file);
+
+ /* Balances the atomic_inc() in nfsd4_recall_file_layout(). */
+ if (ls->ls_recalled)
+ atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
+
+ kmem_cache_free(nfs4_layout_stateid_cache, ls);
+ }
+
+ /*
+ * Take an F_RDLCK lease flagged FL_LAYOUT on the file behind @ls, with
+ * @ls as the lease owner.
+ *
+ * Returns 0 on success, and also when the layout type disables recalls
+ * (no lease is taken then); -ENOMEM if no file_lock can be allocated;
+ * otherwise the vfs_setlease() error.
+ */
+ static int
+ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
+ {
+ struct file_lock *fl;
+ int status;
+
+ if (nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
+ return 0;
+
+ fl = locks_alloc_lock();
+ if (!fl)
+ return -ENOMEM;
+ locks_init_lock(fl);
+ fl->fl_lmops = &nfsd4_layouts_lm_ops;
+ fl->fl_flags = FL_LAYOUT;
+ fl->fl_type = F_RDLCK;
+ fl->fl_end = OFFSET_MAX;
+ fl->fl_owner = ls;
+ fl->fl_pid = current->tgid;
+ fl->fl_file = ls->ls_file->nf_file;
+
+ status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
+ if (status) {
+ locks_free_lock(fl);
+ return status;
+ }
+ /* On success vfs_setlease() is expected to have taken fl and cleared our pointer. */
+ BUG_ON(fl != NULL);
+ return 0;
+ }
+
+ /*
+ * Allocate a layout stateid of @layout_type for the client in @cstate,
+ * derived from the existing stateid @parent.
+ *
+ * The new stateid takes a reference on @parent's nfs4_file and on an
+ * nfsd_file for it (the delegation file when @parent is a delegation,
+ * any open file otherwise), sets up the layout lease, and is linked
+ * into the client's and file's layout stateid lists.
+ *
+ * Returns the new layout stateid, or NULL if allocation or lease setup
+ * fails.
+ */
+ static struct nfs4_layout_stateid *
+ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
+ struct nfs4_stid *parent, u32 layout_type)
+ {
+ struct nfs4_client *clp = cstate->clp;
+ struct nfs4_file *fp = parent->sc_file;
+ struct nfs4_layout_stateid *ls;
+ struct nfs4_stid *stp;
+
+ stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache,
+ nfsd4_free_layout_stateid);
+ if (!stp)
+ return NULL;
+
+ get_nfs4_file(fp);
+ stp->sc_file = fp;
+
+ ls = layoutstateid(stp);
+ INIT_LIST_HEAD(&ls->ls_perclnt);
+ INIT_LIST_HEAD(&ls->ls_perfile);
+ spin_lock_init(&ls->ls_lock);
+ INIT_LIST_HEAD(&ls->ls_layouts);
+ mutex_init(&ls->ls_mutex);
+ ls->ls_layout_type = layout_type;
+ nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
+ NFSPROC4_CLNT_CB_LAYOUT);
+
+ if (parent->sc_type == NFS4_DELEG_STID)
+ ls->ls_file = nfsd_file_get(fp->fi_deleg_file);
+ else
+ ls->ls_file = find_any_file(fp);
+ BUG_ON(!ls->ls_file);
+
+ /*
+ * Lease setup failed: undo the references taken above and free the
+ * object directly.
+ * NOTE(review): this bypasses nfs4_put_stid(); confirm nothing set
+ * up by nfs4_alloc_stid() needs separate teardown here.
+ */
+ if (nfsd4_layout_setlease(ls)) {
+ nfsd_file_put(ls->ls_file);
+ put_nfs4_file(fp);
+ kmem_cache_free(nfs4_layout_stateid_cache, ls);
+ return NULL;
+ }
+
+ spin_lock(&clp->cl_lock);
+ stp->sc_type = NFS4_LAYOUT_STID;
+ list_add(&ls->ls_perclnt, &clp->cl_lo_states);
+ spin_unlock(&clp->cl_lock);
+
+ spin_lock(&fp->fi_lock);
+ list_add(&ls->ls_perfile, &fp->fi_lo_states);
+ spin_unlock(&fp->fi_lock);
+
+ trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid);
+ return ls;
+ }
+
+ /*
+ * Resolve @stateid to a layout stateid of @layout_type for the current
+ * file handle.
+ *
+ * With @create set, open, lock and delegation stateids are accepted
+ * too, and a fresh layout stateid is allocated from them. On success
+ * 0 is returned, *@lsp is set and ls->ls_mutex is held; the caller
+ * must unlock it and put the stateid.
+ *
+ * Errors: the nfsd4_lookup_stateid() status; nfserr_bad_stateid when
+ * the file handle does not match, the supplied generation is ahead of
+ * the stored one, or the layout type differs; nfserr_jukebox when a
+ * new layout stateid cannot be allocated.
+ */
+ __be32
+ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, stateid_t *stateid,
+ bool create, u32 layout_type, struct nfs4_layout_stateid **lsp)
+ {
+ struct nfs4_layout_stateid *ls;
+ struct nfs4_stid *stid;
+ unsigned char typemask = NFS4_LAYOUT_STID;
+ __be32 status;
+
+ if (create)
+ typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID);
+
+ status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid,
+ net_generic(SVC_NET(rqstp), nfsd_net_id));
+ if (status)
+ goto out;
+
+ if (!fh_match(&cstate->current_fh.fh_handle,
+ &stid->sc_file->fi_fhandle)) {
+ status = nfserr_bad_stateid;
+ goto out_put_stid;
+ }
+
+ if (stid->sc_type != NFS4_LAYOUT_STID) {
+ /* Parent stateid: create the layout stateid, then drop the parent. */
+ ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type);
+ nfs4_put_stid(stid);
+
+ status = nfserr_jukebox;
+ if (!ls)
+ goto out;
+ mutex_lock(&ls->ls_mutex);
+ } else {
+ ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
+
+ status = nfserr_bad_stateid;
+ mutex_lock(&ls->ls_mutex);
+ if (nfsd4_stateid_generation_after(stateid, &stid->sc_stateid))
+ goto out_unlock_stid;
+ if (layout_type != ls->ls_layout_type)
+ goto out_unlock_stid;
+ }
+
+ *lsp = ls;
+ return 0;
+
+ out_unlock_stid:
+ mutex_unlock(&ls->ls_mutex);
+ out_put_stid:
+ nfs4_put_stid(stid);
+ out:
+ return status;
+ }
+
+ /*
+ * Start a layout recall for @ls unless one was already started or the
+ * stateid holds no layouts. An extra reference on the stateid is taken
+ * before the recall callback is queued.
+ */
+ static void
+ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
+ {
+ spin_lock(&ls->ls_lock);
+ if (ls->ls_recalled)
+ goto out_unlock;
+
+ if (list_empty(&ls->ls_layouts))
+ goto out_unlock;
+
+ ls->ls_recalled = true;
+ /* Undone in nfsd4_free_layout_stateid(). */
+ atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
+ trace_nfsd_layout_recall(&ls->ls_stid.sc_stateid);
+
+ refcount_inc(&ls->ls_stid.sc_count);
+ nfsd4_run_cb(&ls->ls_recall);
+
+ out_unlock:
+ spin_unlock(&ls->ls_lock);
+ }
+
+ /*
+ * Return the exclusive end offset of @seg, or NFS4_MAX_UINT64 when
+ * offset + length wraps around.
+ */
+ static inline u64
+ layout_end(struct nfsd4_layout_seg *seg)
+ {
+ 	u64 end = seg->offset + seg->length;
+
+ 	if (end < seg->offset)
+ 		return NFS4_MAX_UINT64;
+ 	return end;
+ }
+
+ /*
+ * Set the length of @lo so that it ends at @end. An @end of
+ * NFS4_MAX_UINT64 is stored as a length of NFS4_MAX_UINT64.
+ */
+ static void
+ layout_update_len(struct nfsd4_layout_seg *lo, u64 end)
+ {
+ 	lo->length = (end == NFS4_MAX_UINT64) ?
+ 		NFS4_MAX_UINT64 : end - lo->offset;
+ }
+
+ /*
+ * Return true if held layout @lo overlaps segment @s. The iomodes must
+ * be equal unless @s has IOMODE_ANY, and the byte ranges must
+ * intersect.
+ */
+ static bool
+ layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s)
+ {
+ 	struct nfsd4_layout_seg *held = &lo->lo_seg;
+
+ 	if (s->iomode != IOMODE_ANY && s->iomode != held->iomode)
+ 		return false;
+
+ 	return layout_end(held) > s->offset &&
+ 	       layout_end(s) > held->offset;
+ }
+
+ /*
+ * Try to merge segment @new into the held segment @lo.
+ *
+ * The segments are merged when they have the same iomode and their
+ * ranges overlap or touch; @lo then covers the union of both ranges.
+ * Returns true if @lo was extended, false if the segments cannot be
+ * merged (in which case @lo is untouched).
+ */
+ static bool
+ layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new)
+ {
+ 	u64 lo_end, new_end;
+
+ 	if (lo->iomode != new->iomode)
+ 		return false;
+
+ 	/*
+ 	 * Take both end offsets before touching lo->offset: layout_end()
+ 	 * derives the end from offset + length, so evaluating it after
+ 	 * the offset has been lowered would make the old segment's end
+ 	 * appear earlier and could cut the tail off the merged range.
+ 	 */
+ 	lo_end = layout_end(lo);
+ 	new_end = layout_end(new);
+
+ 	if (new_end < lo->offset)
+ 		return false;
+ 	if (lo_end < new->offset)
+ 		return false;
+
+ 	lo->offset = min(lo->offset, new->offset);
+ 	layout_update_len(lo, max(lo_end, new_end));
+ 	return true;
+ }
+
+ /*
+ * Recall the layouts of every other layout stateid on @ls's file.
+ *
+ * The caller must hold fp->fi_lock. Returns nfserr_recallconflict if
+ * the file has any layout stateid besides @ls (whether or not that
+ * stateid actually had layouts to recall), nfs_ok otherwise.
+ */
+ static __be32
+ nfsd4_recall_conflict(struct nfs4_layout_stateid *ls)
+ {
+ struct nfs4_file *fp = ls->ls_stid.sc_file;
+ struct nfs4_layout_stateid *l, *n;
+ __be32 nfserr = nfs_ok;
+
+ assert_spin_locked(&fp->fi_lock);
+
+ list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) {
+ if (l != ls) {
+ nfsd4_recall_file_layout(l);
+ nfserr = nfserr_recallconflict;
+ }
+ }
+
+ return nfserr;
+ }
+
+ /*
+ * Record the segment granted by LAYOUTGET @lgp under layout stateid
+ * @ls.
+ *
+ * The segment is merged into an existing layout where possible;
+ * otherwise a new nfs4_layout is allocated and appended, taking a
+ * reference on the stateid. On success the stateid is bumped and
+ * copied into lgp->lg_sid.
+ *
+ * Returns nfs_ok, nfserr_recallconflict if another layout stateid
+ * exists on the file, or nfserr_jukebox if allocation fails.
+ */
+ __be32
+ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
+ {
+ struct nfsd4_layout_seg *seg = &lgp->lg_seg;
+ struct nfs4_file *fp = ls->ls_stid.sc_file;
+ struct nfs4_layout *lp, *new = NULL;
+ __be32 nfserr;
+
+ /* First pass: try to merge without allocating. */
+ spin_lock(&fp->fi_lock);
+ nfserr = nfsd4_recall_conflict(ls);
+ if (nfserr)
+ goto out;
+ spin_lock(&ls->ls_lock);
+ list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
+ if (layouts_try_merge(&lp->lo_seg, seg))
+ goto done;
+ }
+ spin_unlock(&ls->ls_lock);
+ spin_unlock(&fp->fi_lock);
+
+ /* Both spinlocks are dropped around the GFP_KERNEL allocation. */
+ new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
+ if (!new)
+ return nfserr_jukebox;
+ memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg));
+ new->lo_state = ls;
+
+ /* Second pass: the lists may have changed while unlocked, so recheck. */
+ spin_lock(&fp->fi_lock);
+ nfserr = nfsd4_recall_conflict(ls);
+ if (nfserr)
+ goto out;
+ spin_lock(&ls->ls_lock);
+ list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
+ if (layouts_try_merge(&lp->lo_seg, seg))
+ goto done;
+ }
+
+ /* Each listed layout holds a stateid reference; see nfsd4_free_layouts(). */
+ refcount_inc(&ls->ls_stid.sc_count);
+ list_add_tail(&new->lo_perstate, &ls->ls_layouts);
+ new = NULL;
+ done:
+ nfs4_inc_and_copy_stateid(&lgp->lg_sid, &ls->ls_stid);
+ spin_unlock(&ls->ls_lock);
+ out:
+ spin_unlock(&fp->fi_lock);
+ /* A non-NULL new was not inserted (conflict or merged) and is freed. */
+ if (new)
+ kmem_cache_free(nfs4_layout_cache, new);
+ return nfserr;
+ }
+
+ /*
+ * Free every layout on @reaplist. For each one the stateid reference
+ * taken in nfsd4_insert_layout() is dropped before the layout itself
+ * is freed.
+ */
+ static void
+ nfsd4_free_layouts(struct list_head *reaplist)
+ {
+ while (!list_empty(reaplist)) {
+ struct nfs4_layout *lp = list_first_entry(reaplist,
+ struct nfs4_layout, lo_perstate);
+
+ list_del(&lp->lo_perstate);
+ nfs4_put_stid(&lp->lo_state->ls_stid);
+ kmem_cache_free(nfs4_layout_cache, lp);
+ }
+ }
+
+ /*
+ * Apply the return of range @seg to held layout @lp.
+ *
+ * A layout fully covered by @seg is moved to @reaplist. A partial
+ * return trims the layout from the front or from the back. A return
+ * that would split the layout in two is not supported and leaves the
+ * layout unchanged.
+ */
+ static void
+ nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg,
+ struct list_head *reaplist)
+ {
+ struct nfsd4_layout_seg *lo = &lp->lo_seg;
+ u64 end = layout_end(lo);
+
+ if (seg->offset <= lo->offset) {
+ if (layout_end(seg) >= end) {
+ list_move_tail(&lp->lo_perstate, reaplist);
+ return;
+ }
+ /* Front trimmed: the layout now starts where the returned range ends. */
+ lo->offset = layout_end(seg);
+ } else {
+ /* retain the whole layout segment on a split. */
+ if (layout_end(seg) < end) {
+ dprintk("%s: split not supported\n", __func__);
+ return;
+ }
+ /* Back trimmed: the layout now ends where the returned range starts. */
+ end = seg->offset;
+ }
+
+ layout_update_len(lo, end);
+ }
+
+ /*
+ * Handle a LAYOUTRETURN for a single file.
+ *
+ * Every layout under the stateid that overlaps lrp->lr_seg is trimmed
+ * or removed. If layouts remain, lrs_present is set and the stateid is
+ * bumped and copied back when something overlapped; otherwise the
+ * stateid is unhashed and lrs_present cleared.
+ *
+ * Returns nfs_ok, or the nfsd4_preprocess_layout_stateid() error.
+ */
+ __be32
+ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutreturn *lrp)
+ {
+ struct nfs4_layout_stateid *ls;
+ struct nfs4_layout *lp, *n;
+ LIST_HEAD(reaplist);
+ __be32 nfserr;
+ int found = 0;
+
+ nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid,
+ false, lrp->lr_layout_type,
+ &ls);
+ if (nfserr) {
+ trace_nfsd_layout_return_lookup_fail(&lrp->lr_sid);
+ return nfserr;
+ }
+
+ spin_lock(&ls->ls_lock);
+ list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) {
+ if (layouts_overlapping(lp, &lrp->lr_seg)) {
+ nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist);
+ found++;
+ }
+ }
+ if (!list_empty(&ls->ls_layouts)) {
+ if (found)
+ nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid);
+ lrp->lrs_present = 1;
+ } else {
+ trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid);
+ nfs4_unhash_stid(&ls->ls_stid);
+ lrp->lrs_present = 0;
+ }
+ spin_unlock(&ls->ls_lock);
+
+ /* Release the mutex and reference obtained by the preprocess call. */
+ mutex_unlock(&ls->ls_mutex);
+ nfs4_put_stid(&ls->ls_stid);
+ /* Reaped layouts are freed only after all locks are dropped. */
+ nfsd4_free_layouts(&reaplist);
+ return nfs_ok;
+ }
+
+ /*
+ * Handle a LAYOUTRETURN that covers many files of the calling client.
+ *
+ * Walks the client's layout stateids of the requested layout type
+ * (for RETURN_FSID only those whose file handle matches the fsid of
+ * the current file handle) and removes every layout whose iomode
+ * matches lr_seg.iomode, or all of them for IOMODE_ANY.
+ *
+ * Always returns 0 and sets lrs_present to 0.
+ */
+ __be32
+ nfsd4_return_client_layouts(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutreturn *lrp)
+ {
+ struct nfs4_layout_stateid *ls, *n;
+ struct nfs4_client *clp = cstate->clp;
+ struct nfs4_layout *lp, *t;
+ LIST_HEAD(reaplist);
+
+ lrp->lrs_present = 0;
+
+ spin_lock(&clp->cl_lock);
+ list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
+ if (ls->ls_layout_type != lrp->lr_layout_type)
+ continue;
+
+ if (lrp->lr_return_type == RETURN_FSID &&
+ !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle,
+ &cstate->current_fh.fh_handle))
+ continue;
+
+ spin_lock(&ls->ls_lock);
+ list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) {
+ if (lrp->lr_seg.iomode == IOMODE_ANY ||
+ lrp->lr_seg.iomode == lp->lo_seg.iomode)
+ list_move_tail(&lp->lo_perstate, &reaplist);
+ }
+ spin_unlock(&ls->ls_lock);
+ }
+ spin_unlock(&clp->cl_lock);
+
+ /* Reaped layouts are freed only after cl_lock is dropped. */
+ nfsd4_free_layouts(&reaplist);
+ return 0;
+ }
+
+ /* Move every layout of @ls onto @reaplist, leaving ls_layouts empty. */
+ static void
+ nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls,
+ struct list_head *reaplist)
+ {
+ spin_lock(&ls->ls_lock);
+ list_splice_init(&ls->ls_layouts, reaplist);
+ spin_unlock(&ls->ls_lock);
+ }
+
+ /*
+ * Drop every layout held by client @clp. The layouts are collected
+ * under cl_lock and freed after the lock is released.
+ */
+ void
+ nfsd4_return_all_client_layouts(struct nfs4_client *clp)
+ {
+ struct nfs4_layout_stateid *ls, *n;
+ LIST_HEAD(reaplist);
+
+ spin_lock(&clp->cl_lock);
+ list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt)
+ nfsd4_return_all_layouts(ls, &reaplist);
+ spin_unlock(&clp->cl_lock);
+
+ nfsd4_free_layouts(&reaplist);
+ }
+
+/*
+ * Collect the layouts of every layout stateid on fp->fi_lo_states that
+ * belongs to @clp (under fp->fi_lock) and free them after the lock is
+ * dropped. Stateids owned by other clients are left untouched.
+ */
+void
+nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp)
+{
+ struct nfs4_layout_stateid *ls, *n;
+ LIST_HEAD(reaplist);
+
+ spin_lock(&fp->fi_lock);
+ list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) {
+ if (ls->ls_stid.sc_client == clp)
+ nfsd4_return_all_layouts(ls, &reaplist);
+ }
+ spin_unlock(&fp->fi_lock);
+
+ nfsd4_free_layouts(&reaplist);
+}
+
+/*
+ * Fencing fallback used by nfsd4_cb_layout_done() when the layout type
+ * provides no ->fence_client operation.
+ *
+ * Logs a warning naming the client's address, then runs the usermode
+ * helper /sbin/nfsd-recall-failed with two arguments: the client address
+ * string and the s_id of the superblock backing the layout's file. The
+ * helper is invoked with UMH_WAIT_PROC; a non-zero result is logged but
+ * not propagated (the function returns void).
+ */
+static void
+nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
+{
+ struct nfs4_client *clp = ls->ls_stid.sc_client;
+ char addr_str[INET6_ADDRSTRLEN];
+ static char const nfsd_recall_failed[] = "/sbin/nfsd-recall-failed";
+ static char *envp[] = {
+ "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ NULL
+ };
+ char *argv[8];
+ int error;
+
+ rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
+
+ printk(KERN_WARNING
+ "nfsd: client %s failed to respond to layout recall. "
+ " Fencing..\n", addr_str);
+
+ /* Only argv[0..3] are used; argv[3] terminates the vector. */
+ argv[0] = (char *)nfsd_recall_failed;
+ argv[1] = addr_str;
+ argv[2] = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_id;
+ argv[3] = NULL;
+
+ error = call_usermodehelper(nfsd_recall_failed, argv, envp,
+ UMH_WAIT_PROC);
+ if (error) {
+ printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n",
+ addr_str, error);
+ }
+}
+
+/*
+ * ->prepare hook of nfsd4_cb_layout_ops: with ls->ls_mutex held, update
+ * ls->ls_recall_sid from ls->ls_stid via nfs4_inc_and_copy_stateid().
+ */
+static void
+nfsd4_cb_layout_prepare(struct nfsd4_callback *cb)
+{
+ struct nfs4_layout_stateid *ls =
+ container_of(cb, struct nfs4_layout_stateid, ls_recall);
+
+ mutex_lock(&ls->ls_mutex);
+ nfs4_inc_and_copy_stateid(&ls->ls_recall_sid, &ls->ls_stid);
+ mutex_unlock(&ls->ls_mutex);
+}
+
+/*
+ * ->done hook of nfsd4_cb_layout_ops: decide, from task->tk_status, whether
+ * the layout recall is finished.
+ *
+ * - 0 or -NFS4ERR_DELAY: finished if no layouts remain on the stateid;
+ *   otherwise re-arm the task with rpc_delay() and return 0 while still
+ *   inside two lease periods measured from task->tk_start. Past that
+ *   cutoff, fall through to fencing.
+ * - -NFS4ERR_NOMATCHING_LAYOUT: treated as success; tk_status is reset to 0.
+ * - anything else: fence the client, using the layout type's
+ *   ->fence_client if present, else nfsd4_cb_layout_fail().
+ *
+ * Returns 0 only on the delayed-retry path and 1 in every other case.
+ */
+static int
+nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
+{
+ struct nfs4_layout_stateid *ls =
+ container_of(cb, struct nfs4_layout_stateid, ls_recall);
+ struct nfsd_net *nn;
+ ktime_t now, cutoff;
+ const struct nfsd4_layout_ops *ops;
+
+
+ switch (task->tk_status) {
+ case 0:
+ case -NFS4ERR_DELAY:
+ /*
+ * Anything left? If not, then call it done. Note that we don't
+ * take the spinlock since this is an optimization and nothing
+ * should get added until the cb counter goes to zero.
+ */
+ if (list_empty(&ls->ls_layouts))
+ return 1;
+
+ /* Poll the client until it's done with the layout */
+ now = ktime_get();
+ nn = net_generic(ls->ls_stid.sc_client->net, nfsd_net_id);
+
+ /* Client gets 2 lease periods to return it */
+ cutoff = ktime_add_ns(task->tk_start,
+ (u64)nn->nfsd4_lease * NSEC_PER_SEC * 2);
+
+ if (ktime_before(now, cutoff)) {
+ rpc_delay(task, HZ/100); /* 10 milliseconds */
+ return 0;
+ }
+ fallthrough;
+ default:
+ /*
+ * Unknown error or non-responding client, we'll need to fence.
+ */
+ trace_nfsd_layout_recall_fail(&ls->ls_stid.sc_stateid);
+
+ ops = nfsd4_layout_ops[ls->ls_layout_type];
+ if (ops->fence_client)
+ ops->fence_client(ls);
+ else
+ nfsd4_cb_layout_fail(ls);
+ return 1;
+ case -NFS4ERR_NOMATCHING_LAYOUT:
+ /* The client reports nothing to return: treat as success. */
+ trace_nfsd_layout_recall_done(&ls->ls_stid.sc_stateid);
+ task->tk_status = 0;
+ return 1;
+ }
+}
+
+/*
+ * ->release hook of nfsd4_cb_layout_ops: strip and free every layout still
+ * attached to the stateid, then drop a reference on the stateid with
+ * nfs4_put_stid().
+ */
+static void
+nfsd4_cb_layout_release(struct nfsd4_callback *cb)
+{
+ struct nfs4_layout_stateid *ls =
+ container_of(cb, struct nfs4_layout_stateid, ls_recall);
+ LIST_HEAD(reaplist);
+
+ trace_nfsd_layout_recall_release(&ls->ls_stid.sc_stateid);
+
+ nfsd4_return_all_layouts(ls, &reaplist);
+ nfsd4_free_layouts(&reaplist);
+ nfs4_put_stid(&ls->ls_stid);
+}
+
+/* Callback operations (prepare/done/release) for the layout recall callback. */
+static const struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+ .prepare = nfsd4_cb_layout_prepare,
+ .done = nfsd4_cb_layout_done,
+ .release = nfsd4_cb_layout_release,
+};
+
+/*
+ * ->lm_break hook for layout leases: clear the lease's break time, start a
+ * layout recall for the lease owner via nfsd4_recall_file_layout(), and
+ * return false.
+ */
+static bool
+nfsd4_layout_lm_break(struct file_lock *fl)
+{
+ /*
+ * We don't want the locks code to timeout the lease for us;
+ * we'll remove it ourselves if a layout isn't returned
+ * in time:
+ */
+ fl->fl_break_time = 0;
+ nfsd4_recall_file_layout(fl->fl_owner);
+ return false;
+}
+
+/*
+ * ->lm_change hook for layout leases: only requests carrying F_UNLCK are
+ * expected (anything else trips BUG_ON); the change itself is delegated to
+ * lease_modify().
+ */
+static int
+nfsd4_layout_lm_change(struct file_lock *onlist, int arg,
+ struct list_head *dispose)
+{
+ BUG_ON(!(arg & F_UNLCK));
+ return lease_modify(onlist, arg, dispose);
+}
+
+/* Lock-manager hooks (break and change) used for layout leases. */
+static const struct lock_manager_operations nfsd4_layouts_lm_ops = {
+ .lm_break = nfsd4_layout_lm_break,
+ .lm_change = nfsd4_layout_lm_change,
+};
+
+/*
+ * Initialise pNFS server state: empty every bucket of nfsd_devid_hash and
+ * create the "nfs4_layout" and "nfs4_layout_stateid" slab caches.
+ *
+ * Returns 0 on success or -ENOMEM if either cache cannot be created; if
+ * the second creation fails, the first cache is destroyed before returning.
+ */
+int
+nfsd4_init_pnfs(void)
+{
+ int i;
+
+ for (i = 0; i < DEVID_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&nfsd_devid_hash[i]);
+
+ nfs4_layout_cache = kmem_cache_create("nfs4_layout",
+ sizeof(struct nfs4_layout), 0, 0, NULL);
+ if (!nfs4_layout_cache)
+ return -ENOMEM;
+
+ nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid",
+ sizeof(struct nfs4_layout_stateid), 0, 0, NULL);
+ if (!nfs4_layout_stateid_cache) {
+ kmem_cache_destroy(nfs4_layout_cache);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+/*
+ * Tear down pNFS server state: destroy both slab caches and kfree() every
+ * device-id map still linked into nfsd_devid_hash.
+ */
+void
+nfsd4_exit_pnfs(void)
+{
+ int i;
+
+ kmem_cache_destroy(nfs4_layout_cache);
+ kmem_cache_destroy(nfs4_layout_stateid_cache);
+
+ for (i = 0; i < DEVID_HASH_SIZE; i++) {
+ struct nfsd4_deviceid_map *map, *n;
+
+ /* Entries are freed in place; the list heads are not reset. */
+ list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash)
+ kfree(map);
+ }
+}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
new file mode 100644
index 000000000..e84996c38
--- /dev/null
+++ b/fs/nfsd/nfs4proc.c
@@ -0,0 +1,3329 @@
+/*
+ * Server-side procedures for NFSv4.
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Kendrick Smith <kmsmith@umich.edu>
+ * Andy Adamson <andros@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/fs_struct.h>
+#include <linux/file.h>
+#include <linux/falloc.h>
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/nfs_ssc.h>
+
+#include "idmap.h"
+#include "cache.h"
+#include "xdr4.h"
+#include "vfs.h"
+#include "current_stateid.h"
+#include "netns.h"
+#include "acl.h"
+#include "pnfs.h"
+#include "trace.h"
+
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#include <linux/security.h>
+
+/*
+ * Apply the security label in @label to the inode behind @resfh, holding
+ * the inode lock around security_inode_setsecctx(). On failure nothing is
+ * returned to the caller; instead the SECURITY_LABEL bit is cleared from
+ * bmval[2]. When CONFIG_NFSD_V4_SECURITY_LABEL is not set, the stub in the
+ * #else branch does nothing.
+ */
+static inline void
+nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
+{
+ struct inode *inode = d_inode(resfh->fh_dentry);
+ int status;
+
+ inode_lock(inode);
+ status = security_inode_setsecctx(resfh->fh_dentry,
+ label->data, label->len);
+ inode_unlock(inode);
+
+ if (status)
+ /*
+ * XXX: We should really fail the whole open, but we may
+ * already have created a new file, so it may be too
+ * late. For now this seems the least of evils:
+ */
+ bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+
+ return;
+}
+#else
+static inline void
+nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
+{ }
+#endif
+
+#define NFSDDBG_FACILITY NFSDDBG_PROC
+
+/*
+ * Three-word bitmap of writeable attributes, passed as the "writable" mask
+ * to check_attr_support() by the open (UNCHECKED/GUARDED), create and
+ * setattr paths in this file.
+ */
+static u32 nfsd_attrmask[] = {
+ NFSD_WRITEABLE_ATTRS_WORD0,
+ NFSD_WRITEABLE_ATTRS_WORD1,
+ NFSD_WRITEABLE_ATTRS_WORD2
+};
+
+/*
+ * Three-word attribute bitmap passed as the "writable" mask to
+ * check_attr_support() for NFS4_CREATE_EXCLUSIVE4_1 opens.
+ */
+static u32 nfsd41_ex_attrmask[] = {
+ NFSD_SUPPATTR_EXCLCREAT_WORD0,
+ NFSD_SUPPATTR_EXCLCREAT_WORD1,
+ NFSD_SUPPATTR_EXCLCREAT_WORD2
+};
+
+/*
+ * Validate a requested attribute bitmap against the current filehandle.
+ *
+ * @bmval:    three-word attribute bitmap from the request
+ * @writable: optional bitmap of attributes that may be set; NULL skips the
+ *            writability checks
+ *
+ * Returns nfserr_attrnotsupp if the bitmap contains attributes unsupported
+ * for this minor version, an ACL on an inode without POSIX ACL support, or
+ * a security label on an export without NFSEXP_SECURITY_LABEL. Returns
+ * nfserr_inval if @writable is given and @bmval is not a subset of it, or
+ * if both MODE_UMASK and MODE are requested. Otherwise returns nfs_ok.
+ */
+static __be32
+check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ u32 *bmval, u32 *writable)
+{
+ struct dentry *dentry = cstate->current_fh.fh_dentry;
+ struct svc_export *exp = cstate->current_fh.fh_export;
+
+ if (!nfsd_attrs_supported(cstate->minorversion, bmval))
+ return nfserr_attrnotsupp;
+ if ((bmval[0] & FATTR4_WORD0_ACL) && !IS_POSIXACL(d_inode(dentry)))
+ return nfserr_attrnotsupp;
+ if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) &&
+ !(exp->ex_flags & NFSEXP_SECURITY_LABEL))
+ return nfserr_attrnotsupp;
+ if (writable && !bmval_is_subset(bmval, writable))
+ return nfserr_inval;
+ /* MODE_UMASK and MODE may not be set together. */
+ if (writable && (bmval[2] & FATTR4_WORD2_MODE_UMASK) &&
+ (bmval[1] & FATTR4_WORD1_MODE))
+ return nfserr_inval;
+ return nfs_ok;
+}
+
+/*
+ * For an OPEN that creates a file, validate the supplied attribute bitmap:
+ * UNCHECKED and GUARDED creates are checked against nfsd_attrmask,
+ * EXCLUSIVE4_1 creates against nfsd41_ex_attrmask. Any other case (no
+ * create, or another create mode) returns nfs_ok without checking.
+ */
+static __be32
+nfsd4_check_open_attributes(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
+{
+ __be32 status = nfs_ok;
+
+ if (open->op_create == NFS4_OPEN_CREATE) {
+ if (open->op_createmode == NFS4_CREATE_UNCHECKED
+ || open->op_createmode == NFS4_CREATE_GUARDED)
+ status = check_attr_support(rqstp, cstate,
+ open->op_bmval, nfsd_attrmask);
+ else if (open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1)
+ status = check_attr_support(rqstp, cstate,
+ open->op_bmval, nfsd41_ex_attrmask);
+ }
+
+ return status;
+}
+
+/*
+ * Return non-zero when @open is a create whose mode is UNCHECKED, GUARDED
+ * or EXCLUSIVE4_1; zero otherwise.
+ */
+static int
+is_create_with_attrs(struct nfsd4_open *open)
+{
+ return open->op_create == NFS4_OPEN_CREATE
+ && (open->op_createmode == NFS4_CREATE_UNCHECKED
+ || open->op_createmode == NFS4_CREATE_GUARDED
+ || open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1);
+}
+
+/*
+ * Set @acl on the file behind @fhp. If an error occurs when setting the
+ * ACL, the error is not reported to the caller; the ACL bit is just
+ * cleared in the returned attribute bitmap (@bmval).
+ */
+static void
+do_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfs4_acl *acl, u32 *bmval)
+{
+ __be32 status;
+
+ status = nfsd4_set_nfs4_acl(rqstp, fhp, acl);
+ if (status)
+ /*
+ * We should probably fail the whole open at this point,
+ * but we've already created the file, so it's too late;
+ * So this seems the least of evils:
+ */
+ bmval[0] &= ~FATTR4_WORD0_ACL;
+}
+
+/*
+ * Replace @dst with a copy of @src: release whatever @dst held with
+ * fh_put(), take a reference on @src's dentry and (when set) its export,
+ * then copy the structure by value.
+ */
+static inline void
+fh_dup2(struct svc_fh *dst, struct svc_fh *src)
+{
+ fh_put(dst);
+ dget(src->fh_dentry);
+ if (src->fh_export)
+ exp_get(src->fh_export);
+ *dst = *src;
+}
+
+/*
+ * Build the access mode implied by an OPEN's share access/deny bits and
+ * verify it against @current_fh, which must be a regular file.
+ *
+ * @accmode: extra NFSD_MAY_* bits supplied by the caller
+ *
+ * Returns nfserr_inval if truncation was requested without write access;
+ * otherwise returns the result of fh_verify().
+ */
+static __be32
+do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, int accmode)
+{
+ __be32 status;
+
+ if (open->op_truncate &&
+ !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
+ return nfserr_inval;
+
+ accmode |= NFSD_MAY_READ_IF_EXEC;
+
+ if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
+ accmode |= NFSD_MAY_READ;
+ if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
+ accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC);
+ /* A deny-read request additionally requires write permission. */
+ if (open->op_share_deny & NFS4_SHARE_DENY_READ)
+ accmode |= NFSD_MAY_WRITE;
+
+ status = fh_verify(rqstp, current_fh, S_IFREG, accmode);
+
+ return status;
+}
+
+/*
+ * Classify the object behind @fh by inode mode: nfs_ok for a regular file,
+ * nfserr_isdir for a directory, nfserr_symlink for everything else.
+ */
+static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
+{
+ umode_t mode = d_inode(fh->fh_dentry)->i_mode;
+
+ if (S_ISREG(mode))
+ return nfs_ok;
+ if (S_ISDIR(mode))
+ return nfserr_isdir;
+ /*
+ * Using err_symlink as our catch-all case may look odd; but
+ * there's no other obvious error for this case in 4.0, and we
+ * happen to know that it will cause the linux v4 client to do
+ * the right thing on attempts to open something other than a
+ * regular file.
+ */
+ return nfserr_symlink;
+}
+
+/*
+ * For compounds without a session, record @resfh's file handle in the
+ * openowner's replay state (so_replay.rp_openfh). Does nothing when the
+ * compound has a session.
+ */
+static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh *resfh)
+{
+ if (nfsd4_has_session(cstate))
+ return;
+ fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
+ &resfh->fh_handle);
+}
+
+/*
+ * Resolve (and optionally create) the file named by an OPEN relative to
+ * the current filehandle.
+ *
+ * Allocates a new svc_fh into *@resfh; it is NOT freed here on any path,
+ * so the caller owns it whenever *@resfh is non-NULL on return. Returns
+ * nfserr_jukebox if that allocation fails.
+ *
+ * With op_create set, the file is created through do_nfsd_create() with
+ * the task umask temporarily set to op_umask, and any security label is
+ * applied afterwards; otherwise a plain nfsd_lookup() is done. On success
+ * the result must be a regular file, any requested ACL is applied, the
+ * openowner replay cache is updated, open permissions are checked and the
+ * parent's change info is recorded in open->op_cinfo.
+ */
+static __be32
+do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh)
+{
+ struct svc_fh *current_fh = &cstate->current_fh;
+ int accmode;
+ __be32 status;
+
+ *resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
+ if (!*resfh)
+ return nfserr_jukebox;
+ fh_init(*resfh, NFS4_FHSIZE);
+ open->op_truncate = false;
+
+ if (open->op_create) {
+ /* FIXME: check session persistence and pnfs flags.
+ * The nfsv4.1 spec requires the following semantics:
+ *
+ * Persistent | pNFS | Server REQUIRED | Client Allowed
+ * Reply Cache | server | |
+ * -------------+--------+-----------------+--------------------
+ * no | no | EXCLUSIVE4_1 | EXCLUSIVE4_1
+ * | | | (SHOULD)
+ * | | and EXCLUSIVE4 | or EXCLUSIVE4
+ * | | | (SHOULD NOT)
+ * no | yes | EXCLUSIVE4_1 | EXCLUSIVE4_1
+ * yes | no | GUARDED4 | GUARDED4
+ * yes | yes | GUARDED4 | GUARDED4
+ */
+
+ /*
+ * Note: create modes (UNCHECKED,GUARDED...) are the same
+ * in NFSv4 as in v3 except EXCLUSIVE4_1.
+ */
+ current->fs->umask = open->op_umask;
+ status = do_nfsd_create(rqstp, current_fh, open->op_fname.data,
+ open->op_fname.len, &open->op_iattr,
+ *resfh, open->op_createmode,
+ (u32 *)open->op_verf.data,
+ &open->op_truncate, &open->op_created);
+ current->fs->umask = 0;
+
+ if (!status && open->op_label.len)
+ nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval);
+
+ /*
+ * Following rfc 3530 14.2.16, and rfc 5661 18.16.4
+ * use the returned bitmask to indicate which attributes
+ * we used to store the verifier:
+ */
+ if (nfsd_create_is_exclusive(open->op_createmode) && status == 0)
+ open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS |
+ FATTR4_WORD1_TIME_MODIFY);
+ } else
+ /*
+ * Note this may exit with the parent still locked.
+ * We will hold the lock until nfsd4_open's final
+ * lookup, to prevent renames or unlinks until we've had
+ * a chance to acquire a delegation if appropriate.
+ */
+ status = nfsd_lookup(rqstp, current_fh,
+ open->op_fname.data, open->op_fname.len, *resfh);
+ if (status)
+ goto out;
+ status = nfsd_check_obj_isreg(*resfh);
+ if (status)
+ goto out;
+
+ if (is_create_with_attrs(open) && open->op_acl != NULL)
+ do_set_nfs4_acl(rqstp, *resfh, open->op_acl, open->op_bmval);
+
+ nfsd4_set_open_owner_reply_cache(cstate, open, *resfh);
+ accmode = NFSD_MAY_NOP;
+ /* Newly created files and DELEGATE_CUR claims get owner override. */
+ if (open->op_created ||
+ open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
+ accmode |= NFSD_MAY_OWNER_OVERRIDE;
+ status = do_open_permission(rqstp, *resfh, open, accmode);
+ set_change_info(&open->op_cinfo, current_fh);
+out:
+ return status;
+}
+
+/*
+ * Handle an OPEN that targets the current filehandle directly rather than
+ * a name in a directory. Zeroes open->op_cinfo, updates the openowner
+ * replay cache, derives op_truncate from a size-0 ATTR_SIZE request and
+ * checks open permissions (with owner override for CLAIM_DELEG_CUR_FH).
+ * Returns the result of do_open_permission().
+ */
+static __be32
+do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
+{
+ struct svc_fh *current_fh = &cstate->current_fh;
+ __be32 status;
+ int accmode = 0;
+
+ /* We don't know the target directory, and therefore can not
+ * set the change info
+ */
+
+ memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
+
+ nfsd4_set_open_owner_reply_cache(cstate, open, current_fh);
+
+ open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
+ (open->op_iattr.ia_size == 0);
+ /*
+ * In the delegation case, the client is telling us about an
+ * open that it *already* performed locally, some time ago. We
+ * should let it succeed now if possible.
+ *
+ * In the case of a CLAIM_FH open, on the other hand, the client
+ * may be counting on us to enforce permissions (the Linux 4.1
+ * client uses this for normal opens, for example).
+ */
+ if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH)
+ accmode = NFSD_MAY_OWNER_OVERRIDE;
+
+ status = do_open_permission(rqstp, current_fh, open, accmode);
+
+ return status;
+}
+
+/*
+ * Fill @clid with the client id (cl_boot and cl_id) embedded in the
+ * session's session id, which is read as a struct nfsd4_sessionid.
+ */
+static void
+copy_clientid(clientid_t *clid, struct nfsd4_session *session)
+{
+ struct nfsd4_sessionid *sid =
+ (struct nfsd4_sessionid *)session->se_sessionid.data;
+
+ clid->cl_boot = sid->clientid.cl_boot;
+ clid->cl_id = sid->clientid.cl_id;
+}
+
+/*
+ * NFSv4 OPEN operation.
+ *
+ * Outline:
+ *  1. Reject a create combined with any claim type other than CLAIM_NULL,
+ *     and (for sessions) non-reclaim opens before RECLAIM_COMPLETE.
+ *  2. nfsd4_process_open1() checks the seqid and sets the openowner; a
+ *     replay restores the filehandle saved in the openowner's replay state.
+ *  3. Propagate any deferred XDR decode error, validate create attributes,
+ *     and enforce the grace-period rules.
+ *  4. Dispatch on claim type to do_open_lookup() or do_open_fhandle().
+ *  5. nfsd4_process_open2() performs the actual open.
+ *
+ * On exit, a separately allocated result filehandle becomes the current
+ * filehandle and is freed; open state is cleaned up and the seqid bumped
+ * regardless of the outcome.
+ */
+static __be32
+nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_open *open = &u->open;
+ __be32 status;
+ struct svc_fh *resfh = NULL;
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ bool reclaim = false;
+
+ dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
+ (int)open->op_fname.len, open->op_fname.data,
+ open->op_openowner);
+
+ /* This check required by spec. */
+ if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
+ return nfserr_inval;
+
+ open->op_created = false;
+ /*
+ * RFC5661 18.51.3
+ * Before RECLAIM_COMPLETE done, server should deny new lock
+ */
+ if (nfsd4_has_session(cstate) &&
+ !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE,
+ &cstate->session->se_client->cl_flags) &&
+ open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+ return nfserr_grace;
+
+ if (nfsd4_has_session(cstate))
+ copy_clientid(&open->op_clientid, cstate->session);
+
+ /* check seqid for replay. set nfs4_owner */
+ status = nfsd4_process_open1(cstate, open, nn);
+ if (status == nfserr_replay_me) {
+ struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay;
+ fh_put(&cstate->current_fh);
+ fh_copy_shallow(&cstate->current_fh.fh_handle,
+ &rp->rp_openfh);
+ status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
+ if (status)
+ dprintk("nfsd4_open: replay failed"
+ " restoring previous filehandle\n");
+ else
+ status = nfserr_replay_me;
+ }
+ if (status)
+ goto out;
+ /* Set by nfsd4_open_omfg() when decoding failed with a seqid-mutating error. */
+ if (open->op_xdr_error) {
+ status = open->op_xdr_error;
+ goto out;
+ }
+
+ status = nfsd4_check_open_attributes(rqstp, cstate, open);
+ if (status)
+ goto out;
+
+ /* Openowner is now set, so sequence id will get bumped. Now we need
+ * these checks before we do any creates: */
+ status = nfserr_grace;
+ if (opens_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+ goto out;
+ status = nfserr_no_grace;
+ if (!opens_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
+ goto out;
+
+ switch (open->op_claim_type) {
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_NULL:
+ /* Name-based open: resfh is heap-allocated by do_open_lookup(). */
+ status = do_open_lookup(rqstp, cstate, open, &resfh);
+ if (status)
+ goto out;
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ status = nfs4_check_open_reclaim(&open->op_clientid,
+ cstate, nn);
+ if (status)
+ goto out;
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+ reclaim = true;
+ fallthrough;
+ case NFS4_OPEN_CLAIM_FH:
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ /* Filehandle-based open: resfh aliases the current filehandle. */
+ status = do_open_fhandle(rqstp, cstate, open);
+ if (status)
+ goto out;
+ resfh = &cstate->current_fh;
+ break;
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+ case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+ dprintk("NFSD: unsupported OPEN claim type %d\n",
+ open->op_claim_type);
+ status = nfserr_notsupp;
+ goto out;
+ default:
+ dprintk("NFSD: Invalid OPEN claim type %d\n",
+ open->op_claim_type);
+ status = nfserr_inval;
+ goto out;
+ }
+ /*
+ * nfsd4_process_open2() does the actual opening of the file. If
+ * successful, it (1) truncates the file if open->op_truncate was
+ * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
+ */
+ status = nfsd4_process_open2(rqstp, resfh, open);
+ WARN(status && open->op_created,
+ "nfsd4_process_open2 failed to open newly-created file! status=%u\n",
+ be32_to_cpu(status));
+ if (reclaim && !status)
+ nn->somebody_reclaimed = true;
+out:
+ /*
+ * Only a filehandle allocated by do_open_lookup() needs installing as
+ * the current filehandle and freeing; this also runs when that lookup
+ * returned an error after allocating.
+ */
+ if (resfh && resfh != &cstate->current_fh) {
+ fh_dup2(&cstate->current_fh, resfh);
+ fh_put(resfh);
+ kfree(resfh);
+ }
+ nfsd4_cleanup_open_state(cstate, open);
+ nfsd4_bump_seqid(cstate, status);
+ return status;
+}
+
+/*
+ * OPEN is the only seqid-mutating operation whose decoding can fail
+ * with a seqid-mutating error (specifically, decoding of user names in
+ * the attributes). Therefore we have to do some processing to look up
+ * the stateowner so that we can bump the seqid.
+ *
+ * Returns op->status unchanged when the error is not seqid-mutating or
+ * the compound has a session; otherwise stores the error in
+ * open->op_xdr_error and runs nfsd4_open(), returning its result.
+ */
+static __be32 nfsd4_open_omfg(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_op *op)
+{
+ struct nfsd4_open *open = &op->u.open;
+
+ if (!seqid_mutating_err(ntohl(op->status)))
+ return op->status;
+ if (nfsd4_has_session(cstate))
+ return op->status;
+ open->op_xdr_error = op->status;
+ return nfsd4_open(rqstp, cstate, &op->u);
+}
+
+/*
+ * filehandle-manipulating ops.
+ */
+/* GETFH: point u->getfh at the compound's current filehandle; always nfs_ok. */
+static __be32
+nfsd4_getfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ u->getfh = &cstate->current_fh;
+ return nfs_ok;
+}
+
+/*
+ * PUTFH: release the current filehandle, copy the client-supplied handle
+ * (pf_fhval, pf_fhlen bytes) into it and verify it with
+ * NFSD_MAY_BYPASS_GSS.
+ *
+ * With CONFIG_NFSD_V4_2_INTER_SSC, a stale handle is accepted when
+ * putfh->no_verify is set: the filehandle is flagged NFSD4_FH_FOREIGN and
+ * 0 is returned instead of nfserr_stale.
+ */
+static __be32
+nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_putfh *putfh = &u->putfh;
+ __be32 ret;
+
+ fh_put(&cstate->current_fh);
+ cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen;
+ /* NOTE(review): pf_fhlen is not bounded here; presumably validated at decode time. */
+ memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval,
+ putfh->pf_fhlen);
+ ret = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS);
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+ if (ret == nfserr_stale && putfh->no_verify) {
+ SET_FH_FLAG(&cstate->current_fh, NFSD4_FH_FOREIGN);
+ ret = 0;
+ }
+#endif
+ return ret;
+}
+
+/*
+ * PUTROOTFH: release the current filehandle and replace it with the
+ * pseudo-root obtained from exp_pseudoroot(); returns that call's status.
+ */
+static __be32
+nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ __be32 status;
+
+ fh_put(&cstate->current_fh);
+ status = exp_pseudoroot(rqstp, &cstate->current_fh);
+ return status;
+}
+
+/*
+ * RESTOREFH: copy the saved filehandle into the current one. If a stateid
+ * was saved alongside it, restore that too and mark the current stateid as
+ * valid. Returns nfserr_restorefh when no filehandle has been saved.
+ */
+static __be32
+nfsd4_restorefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ if (!cstate->save_fh.fh_dentry)
+ return nfserr_restorefh;
+
+ fh_dup2(&cstate->current_fh, &cstate->save_fh);
+ if (HAS_CSTATE_FLAG(cstate, SAVED_STATE_ID_FLAG)) {
+ memcpy(&cstate->current_stateid, &cstate->save_stateid, sizeof(stateid_t));
+ SET_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG);
+ }
+ return nfs_ok;
+}
+
+/*
+ * SAVEFH: copy the current filehandle into the saved slot. If a current
+ * stateid is set, save it as well and mark the saved stateid as valid.
+ * Always returns nfs_ok.
+ */
+static __be32
+nfsd4_savefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ fh_dup2(&cstate->save_fh, &cstate->current_fh);
+ if (HAS_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG)) {
+ memcpy(&cstate->save_stateid, &cstate->current_stateid, sizeof(stateid_t));
+ SET_CSTATE_FLAG(cstate, SAVED_STATE_ID_FLAG);
+ }
+ return nfs_ok;
+}
+
+/*
+ * misc nfsv4 ops
+ */
+/*
+ * ACCESS: reject any requested bit outside the allowed set
+ * (NFS3_ACCESS_FULL, plus the three extended-attribute access bits for
+ * minor version 2 and above) with nfserr_inval; otherwise seed
+ * ac_resp_access with the request and let nfsd_access() fill in the result.
+ */
+static __be32
+nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_access *access = &u->access;
+ u32 access_full;
+
+ access_full = NFS3_ACCESS_FULL;
+ if (cstate->minorversion >= 2)
+ access_full |= NFS4_ACCESS_XALIST | NFS4_ACCESS_XAREAD |
+ NFS4_ACCESS_XAWRITE;
+
+ if (access->ac_req_access & ~access_full)
+ return nfserr_inval;
+
+ access->ac_resp_access = access->ac_req_access;
+ return nfsd_access(rqstp, &cstate->current_fh, &access->ac_resp_access,
+ &access->ac_supported);
+}
+
+/*
+ * Fill @verifier with the per-net boot verifier via
+ * nfsd_copy_boot_verifier(). The build-time check asserts that the
+ * verifier data is exactly two __be32 words.
+ */
+static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
+{
+ __be32 *verf = (__be32 *)verifier->data;
+
+ BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data));
+
+ nfsd_copy_boot_verifier(verf, net_generic(net, nfsd_net_id));
+}
+
+/*
+ * COMMIT: forward the offset/count from the request to nfsd_commit() on
+ * the current filehandle, passing co_verf.data as the verifier buffer.
+ */
+static __be32
+nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_commit *commit = &u->commit;
+
+ return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
+ commit->co_count,
+ (__be32 *)commit->co_verf.data);
+}
+
+/*
+ * CREATE: create a non-regular object (symlink, block/char device, socket,
+ * FIFO or directory) in the directory named by the current filehandle.
+ *
+ * The current filehandle must be a directory and the attribute bitmap must
+ * pass check_attr_support(). The task umask is set to cr_umask for the
+ * duration of the operation and reset to 0 on every exit path after that
+ * point. For device nodes, specdata values that do not survive a
+ * MKDEV/MAJOR/MINOR round trip yield nfserr_inval; unknown types yield
+ * nfserr_badtype.
+ *
+ * On success the security label and ACL (if supplied) are applied, the
+ * parent's change info is recorded, and the new object becomes the
+ * current filehandle.
+ */
+static __be32
+nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_create *create = &u->create;
+ struct svc_fh resfh;
+ __be32 status;
+ dev_t rdev;
+
+ fh_init(&resfh, NFS4_FHSIZE);
+
+ status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_NOP);
+ if (status)
+ return status;
+
+ status = check_attr_support(rqstp, cstate, create->cr_bmval,
+ nfsd_attrmask);
+ if (status)
+ return status;
+
+ current->fs->umask = create->cr_umask;
+ switch (create->cr_type) {
+ case NF4LNK:
+ status = nfsd_symlink(rqstp, &cstate->current_fh,
+ create->cr_name, create->cr_namelen,
+ create->cr_data, &resfh);
+ break;
+
+ case NF4BLK:
+ /* Reject major/minor values that do not round-trip through dev_t. */
+ status = nfserr_inval;
+ rdev = MKDEV(create->cr_specdata1, create->cr_specdata2);
+ if (MAJOR(rdev) != create->cr_specdata1 ||
+ MINOR(rdev) != create->cr_specdata2)
+ goto out_umask;
+ status = nfsd_create(rqstp, &cstate->current_fh,
+ create->cr_name, create->cr_namelen,
+ &create->cr_iattr, S_IFBLK, rdev, &resfh);
+ break;
+
+ case NF4CHR:
+ /* Same round-trip check as for block devices. */
+ status = nfserr_inval;
+ rdev = MKDEV(create->cr_specdata1, create->cr_specdata2);
+ if (MAJOR(rdev) != create->cr_specdata1 ||
+ MINOR(rdev) != create->cr_specdata2)
+ goto out_umask;
+ status = nfsd_create(rqstp, &cstate->current_fh,
+ create->cr_name, create->cr_namelen,
+ &create->cr_iattr,S_IFCHR, rdev, &resfh);
+ break;
+
+ case NF4SOCK:
+ status = nfsd_create(rqstp, &cstate->current_fh,
+ create->cr_name, create->cr_namelen,
+ &create->cr_iattr, S_IFSOCK, 0, &resfh);
+ break;
+
+ case NF4FIFO:
+ status = nfsd_create(rqstp, &cstate->current_fh,
+ create->cr_name, create->cr_namelen,
+ &create->cr_iattr, S_IFIFO, 0, &resfh);
+ break;
+
+ case NF4DIR:
+ /* A size attribute is ignored when creating a directory. */
+ create->cr_iattr.ia_valid &= ~ATTR_SIZE;
+ status = nfsd_create(rqstp, &cstate->current_fh,
+ create->cr_name, create->cr_namelen,
+ &create->cr_iattr, S_IFDIR, 0, &resfh);
+ break;
+
+ default:
+ status = nfserr_badtype;
+ }
+
+ if (status)
+ goto out;
+
+ if (create->cr_label.len)
+ nfsd4_security_inode_setsecctx(&resfh, &create->cr_label, create->cr_bmval);
+
+ if (create->cr_acl != NULL)
+ do_set_nfs4_acl(rqstp, &resfh, create->cr_acl,
+ create->cr_bmval);
+
+ fh_unlock(&cstate->current_fh);
+ set_change_info(&create->cr_cinfo, &cstate->current_fh);
+ fh_dup2(&cstate->current_fh, &resfh);
+out:
+ fh_put(&resfh);
+out_umask:
+ /* Reached directly only before resfh was populated, so no fh_put needed. */
+ current->fs->umask = 0;
+ return status;
+}
+
+/*
+ * GETATTR: verify the current filehandle, reject requests for write-only
+ * attributes (word 1) with nfserr_inval, mask the requested bitmap down to
+ * the attributes supported for this minor version, and record the
+ * filehandle in ga_fhp. This function itself reads no attributes.
+ */
+static __be32
+nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_getattr *getattr = &u->getattr;
+ __be32 status;
+
+ status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
+ if (status)
+ return status;
+
+ if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
+ return nfserr_inval;
+
+ getattr->ga_bmval[0] &= nfsd_suppattrs[cstate->minorversion][0];
+ getattr->ga_bmval[1] &= nfsd_suppattrs[cstate->minorversion][1];
+ getattr->ga_bmval[2] &= nfsd_suppattrs[cstate->minorversion][2];
+
+ getattr->ga_fhp = &cstate->current_fh;
+ return nfs_ok;
+}
+
+/*
+ * LINK: call nfsd_link() with the current filehandle (the directory), the
+ * new name, and the saved filehandle (the link target). On success, record
+ * the current filehandle's change info in li_cinfo.
+ */
+static __be32
+nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_link *link = &u->link;
+ __be32 status;
+
+ status = nfsd_link(rqstp, &cstate->current_fh,
+ link->li_name, link->li_namelen, &cstate->save_fh);
+ if (!status)
+ set_change_info(&link->li_cinfo, &cstate->current_fh);
+ return status;
+}
+
+/*
+ * Replace @fh with its parent directory by looking up "..".
+ *
+ * The pseudo-root is fetched into a temporary filehandle first; if @fh
+ * already refers to the same dentry, nfserr_noent is returned instead of
+ * performing the lookup. The temporary filehandle is released on both
+ * paths.
+ */
+static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh)
+{
+ struct svc_fh tmp_fh;
+ __be32 ret;
+
+ fh_init(&tmp_fh, NFS4_FHSIZE);
+ ret = exp_pseudoroot(rqstp, &tmp_fh);
+ if (ret)
+ return ret;
+ if (tmp_fh.fh_dentry == fh->fh_dentry) {
+ fh_put(&tmp_fh);
+ return nfserr_noent;
+ }
+ fh_put(&tmp_fh);
+ return nfsd_lookup(rqstp, fh, "..", 2, fh);
+}
+
+/* LOOKUPP: move the current filehandle to its parent via nfsd4_do_lookupp(). */
+static __be32
+nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ return nfsd4_do_lookupp(rqstp, &cstate->current_fh);
+}
+
+/*
+ * LOOKUP: look up lo_name in the current filehandle; the current
+ * filehandle is passed as both the directory and the result slot.
+ */
+static __be32
+nfsd4_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ return nfsd_lookup(rqstp, &cstate->current_fh,
+ u->lookup.lo_name, u->lookup.lo_len,
+ &cstate->current_fh);
+}
+
+/*
+ * READ (processing phase): validate the offset and stateid and stash what
+ * the later phases need (rd_nf, rd_rqstp, rd_fhp). No data is read here.
+ *
+ * Returns nfserr_inval for offsets at or beyond OFFSET_MAX, otherwise the
+ * status of nfs4_preprocess_stateid_op().
+ */
+static __be32
+nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_read *read = &u->read;
+ __be32 status;
+
+ /* Cleared first so nfsd4_read_release() sees NULL on early exits. */
+ read->rd_nf = NULL;
+ /*
+ * NOTE(review): this early return leaves rd_rqstp and rd_fhp unset,
+ * while nfsd4_read_release() passes both to a tracepoint -- confirm
+ * the fields are otherwise initialised on this path.
+ */
+ if (read->rd_offset >= OFFSET_MAX)
+ return nfserr_inval;
+
+ trace_nfsd_read_start(rqstp, &cstate->current_fh,
+ read->rd_offset, read->rd_length);
+
+ /*
+ * If we do a zero copy read, then a client will see read data
+ * that reflects the state of the file *after* performing the
+ * following compound.
+ *
+ * To ensure proper ordering, we therefore turn off zero copy if
+ * the client wants us to do more in this compound:
+ */
+ if (!nfsd4_last_compound_op(rqstp))
+ clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
+
+ /* check stateid */
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ &read->rd_stateid, RD_STATE,
+ &read->rd_nf, NULL);
+ if (status) {
+ dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
+ goto out;
+ }
+ status = nfs_ok;
+out:
+ /* Recorded on both the success and the stateid-failure paths. */
+ read->rd_rqstp = rqstp;
+ read->rd_fhp = &cstate->current_fh;
+ return status;
+}
+
+
+/*
+ * Release hook for READ: drop the nfsd_file reference if one was taken
+ * and emit the read-done tracepoint.
+ */
+static void
+nfsd4_read_release(union nfsd4_op_u *u)
+{
+ if (u->read.rd_nf)
+ nfsd_file_put(u->read.rd_nf);
+ trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
+ u->read.rd_offset, u->read.rd_length);
+}
+
+/*
+ * READDIR (processing phase): validate the request and stash rqstp and the
+ * current filehandle for later use. No directory entries are read here.
+ *
+ * Returns nfserr_inval if write-only attributes are requested, and
+ * nfserr_bad_cookie for cookie values 1 or 2, or for cookie 0 accompanied
+ * by a non-zero verifier. The attribute bitmap is masked to what this
+ * minor version supports.
+ */
+static __be32
+nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_readdir *readdir = &u->readdir;
+ u64 cookie = readdir->rd_cookie;
+ static const nfs4_verifier zeroverf;
+
+ /* no need to check permission - this will be done in nfsd_readdir() */
+
+ if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
+ return nfserr_inval;
+
+ readdir->rd_bmval[0] &= nfsd_suppattrs[cstate->minorversion][0];
+ readdir->rd_bmval[1] &= nfsd_suppattrs[cstate->minorversion][1];
+ readdir->rd_bmval[2] &= nfsd_suppattrs[cstate->minorversion][2];
+
+ if ((cookie == 1) || (cookie == 2) ||
+ (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
+ return nfserr_bad_cookie;
+
+ readdir->rd_rqstp = rqstp;
+ readdir->rd_fhp = &cstate->current_fh;
+ return nfs_ok;
+}
+
+/*
+ * READLINK (processing phase): only records rqstp and the current
+ * filehandle in the op arguments; always returns nfs_ok.
+ */
+static __be32
+nfsd4_readlink(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ u->readlink.rl_rqstp = rqstp;
+ u->readlink.rl_fhp = &cstate->current_fh;
+ return nfs_ok;
+}
+
+/*
+ * REMOVE: refuse with nfserr_grace during the grace period; otherwise
+ * unlink rm_name from the current directory. On success the directory is
+ * unlocked and its change info is recorded in rm_cinfo.
+ */
+static __be32
+nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_remove *remove = &u->remove;
+ __be32 status;
+
+ if (opens_in_grace(SVC_NET(rqstp)))
+ return nfserr_grace;
+ status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
+ remove->rm_name, remove->rm_namelen);
+ if (!status) {
+ fh_unlock(&cstate->current_fh);
+ set_change_info(&remove->rm_cinfo, &cstate->current_fh);
+ }
+ return status;
+}
+
+/*
+ * RENAME: refuse with nfserr_grace during the grace period; otherwise
+ * rename rn_sname in the saved filehandle (source directory) to rn_tname
+ * in the current filehandle (target directory). On success, change info
+ * for both directories is recorded.
+ */
+static __be32
+nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_rename *rename = &u->rename;
+ __be32 status;
+
+ if (opens_in_grace(SVC_NET(rqstp)))
+ return nfserr_grace;
+ status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
+ rename->rn_snamelen, &cstate->current_fh,
+ rename->rn_tname, rename->rn_tnamelen);
+ if (status)
+ return status;
+ set_change_info(&rename->rn_sinfo, &cstate->save_fh);
+ set_change_info(&rename->rn_tinfo, &cstate->current_fh);
+ return nfs_ok;
+}
+
+/*
+ * SECINFO: look up si_name in the current directory and record the export
+ * it lives on in si_exp.
+ *
+ * The export reference obtained by the lookup is either stored in si_exp
+ * or, when the name resolves to a negative dentry, dropped and
+ * nfserr_noent returned. The dentry reference is always dropped. For
+ * minor versions above 0 the current filehandle is released before
+ * returning, whatever the status.
+ */
+static __be32
+nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_secinfo *secinfo = &u->secinfo;
+ struct svc_export *exp;
+ struct dentry *dentry;
+ __be32 err;
+
+ err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC);
+ if (err)
+ return err;
+ err = nfsd_lookup_dentry(rqstp, &cstate->current_fh,
+ secinfo->si_name, secinfo->si_namelen,
+ &exp, &dentry);
+ if (err)
+ return err;
+ fh_unlock(&cstate->current_fh);
+ if (d_really_is_negative(dentry)) {
+ exp_put(exp);
+ err = nfserr_noent;
+ } else
+ secinfo->si_exp = exp;
+ dput(dentry);
+ if (cstate->minorversion)
+ /* See rfc 5661 section 2.6.3.1.1.8 */
+ fh_put(&cstate->current_fh);
+ return err;
+}
+
+/*
+ * SECINFO_NO_NAME: report on the current filehandle itself
+ * (STYLE4_CURRENT_FH) or on its parent (STYLE4_PARENT, which first moves
+ * the current filehandle via nfsd4_do_lookupp()). Any other style returns
+ * nfserr_inval.
+ *
+ * On success a reference to the filehandle's export is stored in sin_exp
+ * and the current filehandle is released.
+ */
+static __be32
+nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ __be32 err;
+
+ switch (u->secinfo_no_name.sin_style) {
+ case NFS4_SECINFO_STYLE4_CURRENT_FH:
+ break;
+ case NFS4_SECINFO_STYLE4_PARENT:
+ err = nfsd4_do_lookupp(rqstp, &cstate->current_fh);
+ if (err)
+ return err;
+ break;
+ default:
+ return nfserr_inval;
+ }
+
+ u->secinfo_no_name.sin_exp = exp_get(cstate->current_fh.fh_export);
+ fh_put(&cstate->current_fh);
+ return nfs_ok;
+}
+
+/* Drop the export reference stored in si_exp, if one was stored. */
+static void
+nfsd4_secinfo_release(union nfsd4_op_u *u)
+{
+	struct svc_export *exp = u->secinfo.si_exp;
+
+	if (!exp)
+		return;
+	exp_put(exp);
+}
+
+/* Drop the export reference stored in sin_exp, if one was stored. */
+static void
+nfsd4_secinfo_no_name_release(union nfsd4_op_u *u)
+{
+	struct svc_export *exp = u->secinfo_no_name.sin_exp;
+
+	if (!exp)
+		return;
+	exp_put(exp);
+}
+
+/*
+ * SETATTR on the current filehandle.
+ *
+ * A size change is validated against sa_stateid first. Write access to
+ * the mount is then held across the attribute checks, the optional ACL
+ * and label updates and the final nfsd_setattr() call, and is dropped
+ * again on every path that acquired it.
+ */
+static __be32
+nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	      union nfsd4_op_u *u)
+{
+	struct nfsd4_setattr *setattr = &u->setattr;
+	struct svc_fh *fhp = &cstate->current_fh;
+	__be32 status;
+	int err;
+
+	if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
+		status = nfs4_preprocess_stateid_op(rqstp, cstate, fhp,
+				&setattr->sa_stateid, WR_STATE, NULL, NULL);
+		if (status) {
+			dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
+			return status;
+		}
+	}
+
+	err = fh_want_write(fhp);
+	if (err)
+		return nfserrno(err);
+
+	status = check_attr_support(rqstp, cstate, setattr->sa_bmval,
+				    nfsd_attrmask);
+	if (status)
+		goto out;
+
+	if (setattr->sa_acl != NULL) {
+		status = nfsd4_set_nfs4_acl(rqstp, fhp, setattr->sa_acl);
+		if (status)
+			goto out;
+	}
+	if (setattr->sa_label.len) {
+		status = nfsd4_set_nfs4_label(rqstp, fhp, &setattr->sa_label);
+		if (status)
+			goto out;
+	}
+	status = nfsd_setattr(rqstp, fhp, &setattr->sa_iattr,
+			      0, (time64_t)0);
+out:
+	fh_drop_write(fhp);
+	return status;
+}
+
+/*
+ * WRITE wr_buflen bytes at wr_offset into the file named by the current
+ * filehandle, using wr_stateid for access checking.
+ *
+ * Offsets (or offset + length) beyond OFFSET_MAX are rejected with
+ * nfserr_fbig before any state is looked up. The number of bytes
+ * actually written is reported in wr_bytes_written.
+ */
+static __be32
+nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_write *write = &u->write;
+ stateid_t *stateid = &write->wr_stateid;
+ struct nfsd_file *nf = NULL;
+ __be32 status = nfs_ok;
+ unsigned long cnt;
+ int nvecs;
+
+ if (write->wr_offset > (u64)OFFSET_MAX ||
+ write->wr_offset + write->wr_buflen > (u64)OFFSET_MAX)
+ return nfserr_fbig;
+
+ cnt = write->wr_buflen;
+ trace_nfsd_write_start(rqstp, &cstate->current_fh,
+ write->wr_offset, cnt);
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ stateid, WR_STATE, &nf, NULL);
+ if (status) {
+ dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
+ return status;
+ }
+
+ /* start from the stability level the client asked for */
+ write->wr_how_written = write->wr_stable_how;
+
+ nvecs = svc_fill_write_vector(rqstp, write->wr_pagelist,
+ &write->wr_head, write->wr_buflen);
+ WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
+
+ /* cnt is in/out: requested length in, bytes written out */
+ status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf,
+ write->wr_offset, rqstp->rq_vec, nvecs, &cnt,
+ write->wr_how_written,
+ (__be32 *)write->wr_verifier.data);
+ /* release the file reference obtained through the stateid lookup */
+ nfsd_file_put(nf);
+
+ write->wr_bytes_written = cnt;
+ trace_nfsd_write_done(rqstp, &cstate->current_fh,
+ write->wr_offset, cnt);
+ return status;
+}
+
+/*
+ * Common stateid checks for CLONE and intra-server COPY.
+ *
+ * Resolves @src_stateid against the saved filehandle (read state) and
+ * @dst_stateid against the current filehandle (write state), returning
+ * the files through @src and @dst. Both must be regular files. On any
+ * failure after a file was obtained, it is put again and the
+ * corresponding output pointer is reset to NULL.
+ */
+static __be32
+nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		  stateid_t *src_stateid, struct nfsd_file **src,
+		  stateid_t *dst_stateid, struct nfsd_file **dst)
+{
+	__be32 status;
+
+	if (!cstate->save_fh.fh_dentry)
+		return nfserr_nofilehandle;
+
+	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh,
+					    src_stateid, RD_STATE, src, NULL);
+	if (status) {
+		dprintk("NFSD: %s: couldn't process src stateid!\n", __func__);
+		return status;
+	}
+
+	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+					    dst_stateid, WR_STATE, dst, NULL);
+	if (status) {
+		dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__);
+		goto drop_src;
+	}
+
+	/* fix up for NFS-specific error code */
+	if (S_ISREG(file_inode((*src)->nf_file)->i_mode) &&
+	    S_ISREG(file_inode((*dst)->nf_file)->i_mode))
+		return status;
+
+	status = nfserr_wrong_type;
+	nfsd_file_put(*dst);
+	*dst = NULL;
+drop_src:
+	nfsd_file_put(*src);
+	*src = NULL;
+	return status;
+}
+
+/*
+ * CLONE: verify both stateids, clone cl_count bytes from the source
+ * file to the destination file, then drop both file references.
+ */
+static __be32
+nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	    union nfsd4_op_u *u)
+{
+	struct nfsd4_clone *clone = &u->clone;
+	struct nfsd_file *src, *dst;
+	__be32 status;
+
+	status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src,
+				   &clone->cl_dst_stateid, &dst);
+	if (status)
+		return status;
+
+	status = nfsd4_clone_file_range(src, clone->cl_src_pos,
+			dst, clone->cl_dst_pos, clone->cl_count,
+			EX_ISSYNC(cstate->current_fh.fh_export));
+
+	nfsd_file_put(dst);
+	nfsd_file_put(src);
+	return status;
+}
+
+/* Drop one reference on @copy and free it when that was the last one. */
+void nfs4_put_copy(struct nfsd4_copy *copy)
+{
+	if (refcount_dec_and_test(&copy->refcount))
+		kfree(copy);
+}
+
+/*
+ * Mark @copy as stopped under the client's async_lock.
+ * Returns the previous value of the flag, i.e. true when the copy had
+ * already been stopped before this call.
+ */
+static bool
+check_and_set_stop_copy(struct nfsd4_copy *copy)
+{
+	bool was_stopped;
+
+	spin_lock(&copy->cp_clp->async_lock);
+	was_stopped = copy->stopped;
+	copy->stopped = true;
+	spin_unlock(&copy->cp_clp->async_lock);
+	return was_stopped;
+}
+
+/*
+ * Stop the copy thread of @copy (unless some other caller already did)
+ * and drop the caller's reference on @copy.
+ */
+static void nfsd4_stop_copy(struct nfsd4_copy *copy)
+{
+ /* only 1 thread should stop the copy */
+ if (!check_and_set_stop_copy(copy))
+ kthread_stop(copy->copy_task);
+ nfs4_put_copy(copy);
+}
+
+/*
+ * Return the first entry on @clp's async_copies list with an extra
+ * reference taken, or NULL when the list is empty.
+ */
+static struct nfsd4_copy *nfsd4_get_copy(struct nfs4_client *clp)
+{
+	struct nfsd4_copy *first;
+
+	spin_lock(&clp->async_lock);
+	if (list_empty(&clp->async_copies)) {
+		first = NULL;
+	} else {
+		first = list_first_entry(&clp->async_copies,
+					 struct nfsd4_copy, copies);
+		refcount_inc(&first->refcount);
+	}
+	spin_unlock(&clp->async_lock);
+	return first;
+}
+
+/* Stop every asynchronous copy still queued on @clp. */
+void nfsd4_shutdown_copy(struct nfs4_client *clp)
+{
+	struct nfsd4_copy *copy = nfsd4_get_copy(clp);
+
+	while (copy != NULL) {
+		nfsd4_stop_copy(copy);
+		copy = nfsd4_get_copy(clp);
+	}
+}
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+
+extern struct file *nfs42_ssc_open(struct vfsmount *ss_mnt,
+ struct nfs_fh *src_fh,
+ nfs4_stateid *stateid);
+extern void nfs42_ssc_close(struct file *filep);
+
+extern void nfs_sb_deactive(struct super_block *sb);
+
+#define NFSD42_INTERSSC_MOUNTOPS "vers=4.2,addr=%s,sec=sys"
+
+/*
+ * Support one copy source server for now.
+ *
+ * Mount the source server described by @nss->u.nl4_addr with an
+ * internal NFS mount and return the vfsmount through @mount. Only the
+ * "tcp" netid (or "tcp6" for an AF_INET6 address) is accepted.
+ *
+ * Returns 0 on success; nfserr_inval for an unparsable address, a
+ * netid mismatch, or failure to allocate the address/option buffers;
+ * nfserr_nodev when the "nfs" filesystem type, the device name or the
+ * mount itself cannot be obtained.
+ */
+static __be32
+nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
+		       struct vfsmount **mount)
+{
+	struct file_system_type *type;
+	struct vfsmount *ss_mnt;
+	struct nfs42_netaddr *naddr;
+	struct sockaddr_storage tmp_addr;
+	size_t tmp_addrlen, match_netid_len = 3;
+	char *startsep = "", *endsep = "", *match_netid = "tcp";
+	char *ipaddr, *dev_name, *raw_data;
+	int len, raw_len;
+	__be32 status = nfserr_inval;
+
+	naddr = &nss->u.nl4_addr;
+	tmp_addrlen = rpc_uaddr2sockaddr(SVC_NET(rqstp), naddr->addr,
+					 naddr->addr_len,
+					 (struct sockaddr *)&tmp_addr,
+					 sizeof(tmp_addr));
+	if (tmp_addrlen == 0)
+		goto out_err;
+
+	if (tmp_addr.ss_family == AF_INET6) {
+		startsep = "[";
+		endsep = "]";
+		match_netid = "tcp6";
+		match_netid_len = 4;
+	}
+
+	if (naddr->netid_len != match_netid_len ||
+	    strncmp(naddr->netid, match_netid, naddr->netid_len))
+		goto out_err;
+
+	/* Construct the raw data for the vfs_kern_mount call */
+	len = RPC_MAX_ADDRBUFLEN + 1;
+	ipaddr = kzalloc(len, GFP_KERNEL);
+	if (!ipaddr)
+		goto out_err;
+
+	rpc_ntop((struct sockaddr *)&tmp_addr, ipaddr, len);
+
+	/*
+	 * The "%s" in the option template is replaced by ipaddr, so the
+	 * template length plus the address length leaves room for the
+	 * terminating NUL.
+	 */
+	raw_len = strlen(NFSD42_INTERSSC_MOUNTOPS) + strlen(ipaddr);
+	raw_data = kzalloc(raw_len, GFP_KERNEL);
+	if (!raw_data)
+		goto out_free_ipaddr;
+
+	snprintf(raw_data, raw_len, NFSD42_INTERSSC_MOUNTOPS, ipaddr);
+
+	status = nfserr_nodev;
+	type = get_fs_type("nfs");
+	if (!type)
+		goto out_free_rawdata;
+
+	/*
+	 * Set the server:<export> for the vfs_kern_mount call.
+	 * 2 for ipv6 endsep and startsep. 3 for ":/" and trailing '\0'.
+	 */
+	dev_name = kzalloc(len + 5, GFP_KERNEL);
+	if (!dev_name) {
+		/* don't leak the module reference taken by get_fs_type() */
+		module_put(type->owner);
+		goto out_free_rawdata;
+	}
+	snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep);
+
+	/* Use an 'internal' mount: SB_KERNMOUNT -> MNT_INTERNAL */
+	ss_mnt = vfs_kern_mount(type, SB_KERNMOUNT, dev_name, raw_data);
+	module_put(type->owner);
+	if (IS_ERR(ss_mnt))
+		goto out_free_devname;
+
+	status = 0;
+	*mount = ss_mnt;
+
+out_free_devname:
+	kfree(dev_name);
+out_free_rawdata:
+	kfree(raw_data);
+out_free_ipaddr:
+	kfree(ipaddr);
+out_err:
+	return status;
+}
+
+/*
+ * Verify COPY destination stateid.
+ *
+ * Connect to the source server (mounted with the options in
+ * NFSD42_INTERSSC_MOUNTOPS) and record the source filehandle and
+ * stateid in @copy so that the source file can be opened later.
+ * Called with COPY cstate:
+ *    SAVED_FH: source filehandle
+ *    CURRENT_FH: destination filehandle
+ *
+ * On success copy->nf_dst holds a reference on the destination file
+ * and *mount the source server's vfsmount. On failure no file
+ * reference is left behind.
+ */
+static __be32
+nfsd4_setup_inter_ssc(struct svc_rqst *rqstp,
+		      struct nfsd4_compound_state *cstate,
+		      struct nfsd4_copy *copy, struct vfsmount **mount)
+{
+	struct svc_fh *s_fh;
+	stateid_t *s_stid = &copy->cp_src_stateid;
+	__be32 status;
+
+	/* Verify the destination stateid and set dst struct file*/
+	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+					    &copy->cp_dst_stateid,
+					    WR_STATE, &copy->nf_dst, NULL);
+	if (status)
+		goto out;
+
+	status = nfsd4_interssc_connect(&copy->cp_src, rqstp, mount);
+	if (status) {
+		/*
+		 * nfsd4_copy() returns straight away on failure, so the
+		 * destination file reference has to be dropped here.
+		 */
+		nfsd_file_put(copy->nf_dst);
+		copy->nf_dst = NULL;
+		goto out;
+	}
+
+	s_fh = &cstate->save_fh;
+
+	/* remember the source filehandle and stateid for the later open */
+	copy->c_fh.size = s_fh->fh_handle.fh_size;
+	memcpy(copy->c_fh.data, &s_fh->fh_handle.fh_base, copy->c_fh.size);
+	copy->stateid.seqid = cpu_to_be32(s_stid->si_generation);
+	memcpy(copy->stateid.other, (void *)&s_stid->si_opaque,
+	       sizeof(stateid_opaque_t));
+
+	status = 0;
+out:
+	return status;
+}
+
+/*
+ * Tear down an inter-server copy: close and release the source file,
+ * put the destination file and drop the reference on the source
+ * server's mount.
+ */
+static void
+nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
+ struct nfsd_file *dst)
+{
+ nfs42_ssc_close(src->nf_file);
+ fput(src->nf_file);
+ nfsd_file_put(dst);
+ mntput(ss_mnt);
+}
+
+#else /* CONFIG_NFSD_V4_2_INTER_SSC */
+
+/*
+ * Stub used when CONFIG_NFSD_V4_2_INTER_SSC is not set: inter-server
+ * copy is never set up; *mount is cleared and nfserr_inval returned.
+ */
+static __be32
+nfsd4_setup_inter_ssc(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_copy *copy,
+ struct vfsmount **mount)
+{
+ *mount = NULL;
+ return nfserr_inval;
+}
+
+/* Stub used when CONFIG_NFSD_V4_2_INTER_SSC is not set: nothing to clean up. */
+static void
+nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
+ struct nfsd_file *dst)
+{
+}
+
+/*
+ * Stub used when CONFIG_NFSD_V4_2_INTER_SSC is not set: always returns
+ * NULL.
+ * NOTE(review): the caller in nfsd4_do_async_copy() tests the result
+ * with IS_ERR(), which NULL does not satisfy; presumably unreachable
+ * because the stub nfsd4_setup_inter_ssc() always fails - confirm.
+ */
+static struct file *nfs42_ssc_open(struct vfsmount *ss_mnt,
+ struct nfs_fh *src_fh,
+ nfs4_stateid *stateid)
+{
+ return NULL;
+}
+#endif /* CONFIG_NFSD_V4_2_INTER_SSC */
+
+/*
+ * Set up an intra-server copy: validate the source and destination
+ * stateids of @copy and store the resulting files in copy->nf_src and
+ * copy->nf_dst.
+ */
+static __be32
+nfsd4_setup_intra_ssc(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_copy *copy)
+{
+ return nfsd4_verify_copy(rqstp, cstate, &copy->cp_src_stateid,
+ &copy->nf_src, &copy->cp_dst_stateid,
+ &copy->nf_dst);
+}
+
+/* Drop one reference each on the source and destination files of an intra-server copy. */
+static void
+nfsd4_cleanup_intra_ssc(struct nfsd_file *src, struct nfsd_file *dst)
+{
+ nfsd_file_put(src);
+ nfsd_file_put(dst);
+}
+
+/* Callback release hook: drop the reference on the copy embedding @cb. */
+static void nfsd4_cb_offload_release(struct nfsd4_callback *cb)
+{
+	nfs4_put_copy(container_of(cb, struct nfsd4_copy, cp_cb));
+}
+
+/*
+ * Callback completion hook: the task result is not inspected and 1 is
+ * always returned.
+ */
+static int nfsd4_cb_offload_done(struct nfsd4_callback *cb,
+ struct rpc_task *task)
+{
+ return 1;
+}
+
+/* Callback operations passed to nfsd4_init_cb() for NFSPROC4_CLNT_CB_OFFLOAD. */
+static const struct nfsd4_callback_ops nfsd4_cb_offload_ops = {
+ .release = nfsd4_cb_offload_release,
+ .done = nfsd4_cb_offload_done
+};
+
+/*
+ * Fill in the parts of the COPY result that do not depend on the data
+ * moved: the stability level (NFS_UNSTABLE), the synchronous flag and
+ * a write verifier generated by gen_boot_verifier().
+ */
+static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync)
+{
+ copy->cp_res.wr_stable_how = NFS_UNSTABLE;
+ copy->cp_synchronous = sync;
+ gen_boot_verifier(&copy->cp_res.wr_verifier, copy->cp_clp->net);
+}
+
+/*
+ * Copy cp_count bytes from copy->nf_src to copy->nf_dst, accumulating
+ * progress in cp_res.wr_bytes_written.
+ *
+ * A synchronous copy performs a single nfsd_copy_file_range() call; an
+ * asynchronous one loops until everything is copied, a call returns
+ * <= 0, or the thread is asked to stop.
+ *
+ * Returns the result of the last nfsd_copy_file_range() call (0 when
+ * none was made).
+ */
+static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
+{
+ struct file *dst = copy->nf_dst->nf_file;
+ struct file *src = copy->nf_src->nf_file;
+ ssize_t bytes_copied = 0;
+ size_t bytes_total = copy->cp_count;
+ u64 src_pos = copy->cp_src_pos;
+ u64 dst_pos = copy->cp_dst_pos;
+
+ do {
+ /* honour kthread_stop() from nfsd4_stop_copy() */
+ if (kthread_should_stop())
+ break;
+ bytes_copied = nfsd_copy_file_range(src, src_pos, dst, dst_pos,
+ bytes_total);
+ if (bytes_copied <= 0)
+ break;
+ bytes_total -= bytes_copied;
+ copy->cp_res.wr_bytes_written += bytes_copied;
+ src_pos += bytes_copied;
+ dst_pos += bytes_copied;
+ } while (bytes_total > 0 && !copy->cp_synchronous);
+ return bytes_copied;
+}
+
+/*
+ * Run the copy described by @copy and release the files it used.
+ *
+ * An error is only reported when nothing at all was written; otherwise
+ * the result fields are initialised and nfs_ok is returned. In both
+ * cases the inter- or intra-server cleanup helper is called.
+ */
+static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync)
+{
+ __be32 status;
+ ssize_t bytes;
+
+ bytes = _nfsd_copy_file_range(copy);
+ /* for async copy, we ignore the error, client can always retry
+ * to get the error
+ */
+ if (bytes < 0 && !copy->cp_res.wr_bytes_written)
+ status = nfserrno(bytes);
+ else {
+ nfsd4_init_copy_res(copy, sync);
+ status = nfs_ok;
+ }
+
+ if (!copy->cp_intra) /* Inter server SSC */
+ nfsd4_cleanup_inter_ssc(copy->ss_mnt, copy->nf_src,
+ copy->nf_dst);
+ else
+ nfsd4_cleanup_intra_ssc(copy->nf_src, copy->nf_dst);
+
+ return status;
+}
+
+/*
+ * Copy the fields an asynchronous copy needs from @src (the copy
+ * embedded in the compound) into @dst.
+ *
+ * An additional reference is taken on nf_dst and, for an intra-server
+ * copy, on nf_src. The refcount, list linkage and copy_task of @dst
+ * are not touched here.
+ */
+static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst)
+{
+ dst->cp_src_pos = src->cp_src_pos;
+ dst->cp_dst_pos = src->cp_dst_pos;
+ dst->cp_count = src->cp_count;
+ dst->cp_synchronous = src->cp_synchronous;
+ memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res));
+ memcpy(&dst->fh, &src->fh, sizeof(src->fh));
+ dst->cp_clp = src->cp_clp;
+ dst->nf_dst = nfsd_file_get(src->nf_dst);
+ dst->cp_intra = src->cp_intra;
+ if (src->cp_intra) /* for inter, file_src doesn't exist yet */
+ dst->nf_src = nfsd_file_get(src->nf_src);
+
+ memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid));
+ memcpy(&dst->cp_src, &src->cp_src, sizeof(struct nl4_server));
+ memcpy(&dst->stateid, &src->stateid, sizeof(src->stateid));
+ memcpy(&dst->c_fh, &src->c_fh, sizeof(src->c_fh));
+ dst->ss_mnt = src->ss_mnt;
+}
+
+/*
+ * Release an asynchronous copy: free its copy state, put its file
+ * references, unlink it from the client's list and drop a reference.
+ *
+ * Expects copy->cp_clp and copy->nf_dst to be set and copy->copies to
+ * be a valid list entry: all three are used unconditionally below.
+ */
+static void cleanup_async_copy(struct nfsd4_copy *copy)
+{
+ nfs4_free_copy_state(copy);
+ nfsd_file_put(copy->nf_dst);
+ /* for inter-server copies nf_src is owned by nfsd4_do_async_copy() */
+ if (copy->cp_intra)
+ nfsd_file_put(copy->nf_src);
+ spin_lock(&copy->cp_clp->async_lock);
+ list_del(&copy->copies);
+ spin_unlock(&copy->cp_clp->async_lock);
+ nfs4_put_copy(copy);
+}
+
+/*
+ * Thread function of an asynchronous COPY; @data is the nfsd4_copy
+ * duplicated by nfsd4_copy().
+ *
+ * For an inter-server copy the source file is opened on the source
+ * server's mount first. After the copy has run (or failed to start), a
+ * CB_OFFLOAD callback carrying the result is queued and the copy is
+ * cleaned up. Always returns 0.
+ *
+ * Reference accounting for nf_dst: the thread owns two references (the
+ * one taken when the operation was set up and the one taken by
+ * dup_copy_fields()). nfsd4_do_copy() drops one and
+ * cleanup_async_copy() the other, so the paths that skip
+ * nfsd4_do_copy() must drop the first one themselves.
+ */
+static int nfsd4_do_async_copy(void *data)
+{
+	struct nfsd4_copy *copy = (struct nfsd4_copy *)data;
+	struct nfsd4_copy *cb_copy;
+
+	if (!copy->cp_intra) { /* Inter server SSC */
+		copy->nf_src = kzalloc(sizeof(struct nfsd_file), GFP_KERNEL);
+		if (!copy->nf_src) {
+			copy->nfserr = nfserr_serverfault;
+			/* nfsd4_do_copy() will not run: drop its dst reference */
+			nfsd_file_put(copy->nf_dst);
+			/* ss_mnt will be unmounted by the laundromat */
+			goto do_callback;
+		}
+		copy->nf_src->nf_file = nfs42_ssc_open(copy->ss_mnt, &copy->c_fh,
+					      &copy->stateid);
+		if (IS_ERR(copy->nf_src->nf_file)) {
+			copy->nfserr = nfserr_offload_denied;
+			/* nfsd4_do_copy() will not run: drop its dst reference */
+			nfsd_file_put(copy->nf_dst);
+			/* ss_mnt will be unmounted by the laundromat */
+			goto do_callback;
+		}
+	}
+
+	copy->nfserr = nfsd4_do_copy(copy, 0);
+do_callback:
+	/* the callback outlives @copy, so it gets its own small nfsd4_copy */
+	cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
+	if (!cb_copy)
+		goto out;
+	refcount_set(&cb_copy->refcount, 1);
+	memcpy(&cb_copy->cp_res, &copy->cp_res, sizeof(copy->cp_res));
+	cb_copy->cp_clp = copy->cp_clp;
+	cb_copy->nfserr = copy->nfserr;
+	memcpy(&cb_copy->fh, &copy->fh, sizeof(copy->fh));
+	nfsd4_init_cb(&cb_copy->cp_cb, cb_copy->cp_clp,
+			&nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD);
+	nfsd4_run_cb(&cb_copy->cp_cb);
+out:
+	/* the inter-server nf_src is the private allocation made above */
+	if (!copy->cp_intra)
+		kfree(copy->nf_src);
+	cleanup_async_copy(copy);
+	return 0;
+}
+
+/*
+ * COPY: copy data from the saved filehandle (or, for an inter-server
+ * copy, from a file on another server) to the current filehandle.
+ *
+ * A synchronous copy is performed inline. An asynchronous copy gets a
+ * private nfsd4_copy, a copy stateid and a kernel thread running
+ * nfsd4_do_async_copy(); the operation then returns nfs_ok at once.
+ *
+ * Inter-server copies are refused with nfserr_notsupp unless
+ * inter_copy_offload_enable is set and the copy is asynchronous.
+ * Failure to set up the asynchronous copy yields nfserrno(-ENOMEM).
+ */
+static __be32
+nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	   union nfsd4_op_u *u)
+{
+	struct nfsd4_copy *copy = &u->copy;
+	__be32 status;
+	struct nfsd4_copy *async_copy = NULL;
+
+	if (!copy->cp_intra) { /* Inter server SSC */
+		if (!inter_copy_offload_enable || copy->cp_synchronous) {
+			status = nfserr_notsupp;
+			goto out;
+		}
+		status = nfsd4_setup_inter_ssc(rqstp, cstate, copy,
+					       &copy->ss_mnt);
+		if (status)
+			return nfserr_offload_denied;
+	} else {
+		status = nfsd4_setup_intra_ssc(rqstp, cstate, copy);
+		if (status)
+			return status;
+	}
+
+	copy->cp_clp = cstate->clp;
+	memcpy(&copy->fh, &cstate->current_fh.fh_handle,
+	       sizeof(struct knfsd_fh));
+	if (!copy->cp_synchronous) {
+		struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+		async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
+		if (!async_copy)
+			goto out_err;
+		/*
+		 * cleanup_async_copy() does an unconditional list_del(), so
+		 * the entry must be a valid (empty) list even if it is never
+		 * queued on the client.
+		 */
+		INIT_LIST_HEAD(&async_copy->copies);
+		if (!nfs4_init_copy_state(nn, copy)) {
+			/*
+			 * Nothing but the allocation exists yet:
+			 * cleanup_async_copy() would dereference the still
+			 * NULL cp_clp and nf_dst of the zeroed structure.
+			 */
+			kfree(async_copy);
+			async_copy = NULL;
+			goto out_err;
+		}
+		refcount_set(&async_copy->refcount, 1);
+		memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid.stid,
+		       sizeof(copy->cp_res.cb_stateid));
+		dup_copy_fields(copy, async_copy);
+		async_copy->copy_task = kthread_create(nfsd4_do_async_copy,
+				async_copy, "%s", "copy thread");
+		if (IS_ERR(async_copy->copy_task))
+			goto out_err;
+		spin_lock(&async_copy->cp_clp->async_lock);
+		list_add(&async_copy->copies,
+			 &async_copy->cp_clp->async_copies);
+		spin_unlock(&async_copy->cp_clp->async_lock);
+		wake_up_process(async_copy->copy_task);
+		status = nfs_ok;
+	} else {
+		status = nfsd4_do_copy(copy, 1);
+	}
+out:
+	return status;
+out_err:
+	/* drops the references dup_copy_fields() took, if it got that far */
+	if (async_copy)
+		cleanup_async_copy(async_copy);
+	/*
+	 * No copy thread will run, so the file references obtained by
+	 * nfsd4_setup_{inter,intra}_ssc() for this operation have to be
+	 * dropped here. nf_src only holds a reference for intra copies.
+	 */
+	nfsd_file_put(copy->nf_dst);
+	if (copy->cp_intra)
+		nfsd_file_put(copy->nf_src);
+	status = nfserrno(-ENOMEM);
+	/*
+	 * source's vfsmount of inter-copy will be unmounted
+	 * by the laundromat
+	 */
+	goto out;
+}
+
+/*
+ * Find the asynchronous copy of @clp whose copy stateid equals
+ * @stateid. Returns it with an extra reference taken, or NULL when no
+ * entry matches.
+ */
+struct nfsd4_copy *
+find_async_copy(struct nfs4_client *clp, stateid_t *stateid)
+{
+	struct nfsd4_copy *copy;
+	struct nfsd4_copy *found = NULL;
+
+	spin_lock(&clp->async_lock);
+	list_for_each_entry(copy, &clp->async_copies, copies) {
+		if (memcmp(&copy->cp_stateid.stid, stateid,
+			   NFS4_STATEID_SIZE) == 0) {
+			refcount_inc(&copy->refcount);
+			found = copy;
+			break;
+		}
+	}
+	spin_unlock(&clp->async_lock);
+	return found;
+}
+
+/*
+ * OFFLOAD_CANCEL: stop the asynchronous copy identified by the
+ * stateid. When no such copy exists the stateid is passed on to
+ * manage_cpntf_state() and its result is returned.
+ */
+static __be32
+nfsd4_offload_cancel(struct svc_rqst *rqstp,
+		     struct nfsd4_compound_state *cstate,
+		     union nfsd4_op_u *u)
+{
+	struct nfsd4_offload_status *os = &u->offload_status;
+	struct nfs4_client *clp = cstate->clp;
+	struct nfsd4_copy *copy = find_async_copy(clp, &os->stateid);
+
+	if (copy) {
+		nfsd4_stop_copy(copy);
+		return nfs_ok;
+	}
+
+	return manage_cpntf_state(net_generic(SVC_NET(rqstp), nfsd_net_id),
+				  &os->stateid, clp, NULL);
+}
+
+/*
+ * COPY_NOTIFY: validate the source stateid against the current
+ * filehandle, allocate a copy-notify state for it and return that
+ * state's stateid together with the address the client used to reach
+ * this server. The reported lifetime is the nfsd4 lease time.
+ */
+static __be32
+nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_copy_notify *cn = &u->copy_notify;
+ __be32 status;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct nfs4_stid *stid;
+ struct nfs4_cpntf_state *cps;
+ struct nfs4_client *clp = cstate->clp;
+
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ &cn->cpn_src_stateid, RD_STATE, NULL,
+ &stid);
+ if (status)
+ return status;
+
+ cn->cpn_sec = nn->nfsd4_lease;
+ cn->cpn_nsec = 0;
+
+ status = nfserrno(-ENOMEM);
+ cps = nfs4_alloc_init_cpntf_state(nn, stid);
+ if (!cps)
+ goto out;
+ /* hand the new stateid to the client and remember its parent */
+ memcpy(&cn->cpn_cnr_stateid, &cps->cp_stateid.stid, sizeof(stateid_t));
+ memcpy(&cps->cp_p_stateid, &stid->sc_stateid, sizeof(stateid_t));
+ memcpy(&cps->cp_p_clid, &clp->cl_clientid, sizeof(clientid_t));
+
+ /* For now, only return one server address in cpn_src, the
+ * address used by the client to connect to this server.
+ */
+ cn->cpn_src.nl4_type = NL4_NETADDR;
+ status = nfsd4_set_netaddr((struct sockaddr *)&rqstp->rq_daddr,
+ &cn->cpn_src.u.nl4_addr);
+ WARN_ON_ONCE(status);
+ if (status) {
+ nfs4_put_cpntf_state(nn, cps);
+ goto out;
+ }
+out:
+ /* drop the stid reference returned by the stateid lookup */
+ nfs4_put_stid(stid);
+ return status;
+}
+
+/*
+ * Common helper for ALLOCATE and DEALLOCATE: validate the stateid for
+ * writing and call nfsd4_vfs_fallocate() with the given @flags on the
+ * requested range of the current filehandle.
+ */
+static __be32
+nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_fallocate *fallocate, int flags)
+{
+ __be32 status;
+ struct nfsd_file *nf;
+
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ &fallocate->falloc_stateid,
+ WR_STATE, &nf, NULL);
+ if (status != nfs_ok) {
+ dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n");
+ return status;
+ }
+
+ status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file,
+ fallocate->falloc_offset,
+ fallocate->falloc_length,
+ flags);
+ /* release the file reference obtained through the stateid lookup */
+ nfsd_file_put(nf);
+ return status;
+}
+/*
+ * OFFLOAD_STATUS: report how many bytes the asynchronous copy named by
+ * the stateid has written so far; nfserr_bad_stateid when no such copy
+ * exists for this client.
+ */
+static __be32
+nfsd4_offload_status(struct svc_rqst *rqstp,
+		     struct nfsd4_compound_state *cstate,
+		     union nfsd4_op_u *u)
+{
+	struct nfsd4_offload_status *os = &u->offload_status;
+	struct nfsd4_copy *copy = find_async_copy(cstate->clp, &os->stateid);
+
+	if (!copy)
+		return nfserr_bad_stateid;
+
+	os->count = copy->cp_res.wr_bytes_written;
+	nfs4_put_copy(copy);
+	return 0;
+}
+
+/* ALLOCATE: nfsd4_fallocate() with no fallocate flags. */
+static __be32
+nfsd4_allocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ return nfsd4_fallocate(rqstp, cstate, &u->allocate, 0);
+}
+
+/*
+ * DEALLOCATE: nfsd4_fallocate() with FALLOC_FL_PUNCH_HOLE and
+ * FALLOC_FL_KEEP_SIZE.
+ */
+static __be32
+nfsd4_deallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ return nfsd4_fallocate(rqstp, cstate, &u->deallocate,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE);
+}
+
+/*
+ * SEEK: find the next data or hole at or after seek_offset in the file
+ * named by the current filehandle.
+ *
+ * seek_whence values other than NFS4_CONTENT_DATA / NFS4_CONTENT_HOLE
+ * yield nfserr_union_notsupp. seek_eof is set when the resulting
+ * position is at or beyond the file size.
+ */
+static __be32
+nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_seek *seek = &u->seek;
+ int whence;
+ __be32 status;
+ struct nfsd_file *nf;
+
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ &seek->seek_stateid,
+ RD_STATE, &nf, NULL);
+ if (status) {
+ dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
+ return status;
+ }
+
+ switch (seek->seek_whence) {
+ case NFS4_CONTENT_DATA:
+ whence = SEEK_DATA;
+ break;
+ case NFS4_CONTENT_HOLE:
+ whence = SEEK_HOLE;
+ break;
+ default:
+ status = nfserr_union_notsupp;
+ goto out;
+ }
+
+ /*
+ * Note: This call does change file->f_pos, but nothing in NFSD
+ * should ever use file->f_pos.
+ */
+ seek->seek_pos = vfs_llseek(nf->nf_file, seek->seek_offset, whence);
+ if (seek->seek_pos < 0)
+ status = nfserrno(seek->seek_pos);
+ else if (seek->seek_pos >= i_size_read(file_inode(nf->nf_file)))
+ seek->seek_eof = true;
+
+out:
+ nfsd_file_put(nf);
+ return status;
+}
+
+/* This routine never returns NFS_OK! If there are no other errors, it
+ * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
+ * attributes matched. VERIFY is implemented by mapping NFSERR_SAME
+ * to NFS_OK after the call; NVERIFY by mapping NFSERR_NOT_SAME to NFS_OK.
+ *
+ * The comparison is done by encoding the requested attributes of the
+ * current filehandle into a scratch buffer and comparing the encoded
+ * attribute data byte-for-byte with the data supplied by the client.
+ */
+static __be32
+_nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_verify *verify)
+{
+ __be32 *buf, *p;
+ int count;
+ __be32 status;
+
+ status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
+ if (status)
+ return status;
+
+ status = check_attr_support(rqstp, cstate, verify->ve_bmval, NULL);
+ if (status)
+ return status;
+
+ /* rdattr_error and write-only attributes cannot be compared */
+ if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)
+ || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1))
+ return nfserr_inval;
+ /* the attribute data must be a whole number of 4-byte words */
+ if (verify->ve_attrlen & 3)
+ return nfserr_inval;
+
+ /* count in words:
+ * bitmap_len(1) + bitmap(2) + attr_len(1) = 4
+ */
+ count = 4 + (verify->ve_attrlen >> 2);
+ buf = kmalloc(count << 2, GFP_KERNEL);
+ if (!buf)
+ return nfserr_jukebox;
+
+ p = buf;
+ status = nfsd4_encode_fattr_to_buf(&p, count, &cstate->current_fh,
+ cstate->current_fh.fh_export,
+ cstate->current_fh.fh_dentry,
+ verify->ve_bmval,
+ rqstp, 0);
+ /*
+ * If nfsd4_encode_fattr() ran out of space, assume that's because
+ * the attributes are longer (hence different) than those given:
+ */
+ if (status == nfserr_resource)
+ status = nfserr_not_same;
+ if (status)
+ goto out_kfree;
+
+ /* skip bitmap */
+ p = buf + 1 + ntohl(buf[0]);
+ status = nfserr_not_same;
+ /* a different encoded length already means "not the same" */
+ if (ntohl(*p++) != verify->ve_attrlen)
+ goto out_kfree;
+ if (!memcmp(p, verify->ve_attrval, verify->ve_attrlen))
+ status = nfserr_same;
+
+out_kfree:
+ kfree(buf);
+ return status;
+}
+
+/*
+ * NVERIFY: succeeds (nfs_ok) when the supplied attributes do NOT match
+ * those of the current filehandle. Every other result of
+ * _nfsd4_verify(), including nfserr_same, is returned unchanged.
+ */
+static __be32
+nfsd4_nverify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	      union nfsd4_op_u *u)
+{
+	__be32 status;
+
+	/* use this operation's own union member for its arguments */
+	status = _nfsd4_verify(rqstp, cstate, &u->nverify);
+	return status == nfserr_not_same ? nfs_ok : status;
+}
+
+/*
+ * VERIFY: succeeds (nfs_ok) when the supplied attributes match those
+ * of the current filehandle. Every other result of _nfsd4_verify(),
+ * including nfserr_not_same, is returned unchanged.
+ */
+static __be32
+nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	     union nfsd4_op_u *u)
+{
+	__be32 status;
+
+	/* use this operation's own union member for its arguments */
+	status = _nfsd4_verify(rqstp, cstate, &u->verify);
+	return status == nfserr_same ? nfs_ok : status;
+}
+
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * Return the layout operations for @layout_type if @exp supports pNFS
+ * and that layout type, NULL otherwise.
+ */
+static const struct nfsd4_layout_ops *
+nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
+{
+ if (!exp->ex_layout_types) {
+ dprintk("%s: export does not support pNFS\n", __func__);
+ return NULL;
+ }
+
+ /* range check first: layout_type indexes a bitmask and a table */
+ if (layout_type >= LAYOUT_TYPE_MAX ||
+ !(exp->ex_layout_types & (1 << layout_type))) {
+ dprintk("%s: layout type %d not supported\n",
+ __func__, layout_type);
+ return NULL;
+ }
+
+ return nfsd4_layout_ops[layout_type];
+}
+
+/*
+ * GETDEVICEINFO: map the device ID to an export, check that the export
+ * supports the requested layout type and, unless gd_maxcount is 0, let
+ * the layout driver fill in the device information. The notification
+ * types are masked down to those the layout driver supports.
+ */
+static __be32
+nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, union nfsd4_op_u *u)
+{
+ struct nfsd4_getdeviceinfo *gdp = &u->getdeviceinfo;
+ const struct nfsd4_layout_ops *ops;
+ struct nfsd4_deviceid_map *map;
+ struct svc_export *exp;
+ __be32 nfserr;
+
+ dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n",
+ __func__,
+ gdp->gd_layout_type,
+ gdp->gd_devid.fsid_idx, gdp->gd_devid.generation,
+ gdp->gd_maxcount);
+
+ map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx);
+ if (!map) {
+ dprintk("%s: couldn't find device ID to export mapping!\n",
+ __func__);
+ return nfserr_noent;
+ }
+
+ exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid);
+ if (IS_ERR(exp)) {
+ dprintk("%s: could not find device id\n", __func__);
+ return nfserr_noent;
+ }
+
+ nfserr = nfserr_layoutunavailable;
+ ops = nfsd4_layout_verify(exp, gdp->gd_layout_type);
+ if (!ops)
+ goto out;
+
+ nfserr = nfs_ok;
+ /* with a zero maxcount the layout driver is not consulted at all */
+ if (gdp->gd_maxcount != 0) {
+ nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb,
+ rqstp, cstate->session->se_client, gdp);
+ }
+
+ gdp->gd_notify_types &= ops->notify_types;
+out:
+ exp_put(exp);
+ return nfserr;
+}
+
+/* Free the gd_device buffer of a GETDEVICEINFO operation. */
+static void
+nfsd4_getdeviceinfo_release(union nfsd4_op_u *u)
+{
+ kfree(u->getdeviceinfo.gd_device);
+}
+
+/*
+ * LAYOUTGET: validate the request (iomode, access to the file, layout
+ * type, length/offset arithmetic), look up or create the layout
+ * stateid, ask the layout driver for a layout and insert it.
+ *
+ * The layout stateid's ls_mutex is released and its reference put on
+ * every path once nfsd4_preprocess_layout_stateid() has succeeded.
+ */
+static __be32
+nfsd4_layoutget(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, union nfsd4_op_u *u)
+{
+ struct nfsd4_layoutget *lgp = &u->layoutget;
+ struct svc_fh *current_fh = &cstate->current_fh;
+ const struct nfsd4_layout_ops *ops;
+ struct nfs4_layout_stateid *ls;
+ __be32 nfserr;
+ int accmode = NFSD_MAY_READ_IF_EXEC;
+
+ /* translate the requested iomode into the access to check for */
+ switch (lgp->lg_seg.iomode) {
+ case IOMODE_READ:
+ accmode |= NFSD_MAY_READ;
+ break;
+ case IOMODE_RW:
+ accmode |= NFSD_MAY_READ | NFSD_MAY_WRITE;
+ break;
+ default:
+ dprintk("%s: invalid iomode %d\n",
+ __func__, lgp->lg_seg.iomode);
+ nfserr = nfserr_badiomode;
+ goto out;
+ }
+
+ nfserr = fh_verify(rqstp, current_fh, 0, accmode);
+ if (nfserr)
+ goto out;
+
+ nfserr = nfserr_layoutunavailable;
+ ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type);
+ if (!ops)
+ goto out;
+
+ /*
+ * Verify minlength and range as per RFC5661:
+ * o If loga_length is less than loga_minlength,
+ * the metadata server MUST return NFS4ERR_INVAL.
+ * o If the sum of loga_offset and loga_minlength exceeds
+ * NFS4_UINT64_MAX, and loga_minlength is not
+ * NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result.
+ * o If the sum of loga_offset and loga_length exceeds
+ * NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX,
+ * the error NFS4ERR_INVAL MUST result.
+ */
+ nfserr = nfserr_inval;
+ if (lgp->lg_seg.length < lgp->lg_minlength ||
+ (lgp->lg_minlength != NFS4_MAX_UINT64 &&
+ lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) ||
+ (lgp->lg_seg.length != NFS4_MAX_UINT64 &&
+ lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset))
+ goto out;
+ /* a zero-length layout request is rejected as well */
+ if (lgp->lg_seg.length == 0)
+ goto out;
+
+ nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid,
+ true, lgp->lg_layout_type, &ls);
+ if (nfserr) {
+ trace_nfsd_layout_get_lookup_fail(&lgp->lg_sid);
+ goto out;
+ }
+
+ /* refuse new layouts while recalls are outstanding on the file */
+ nfserr = nfserr_recallconflict;
+ if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls))
+ goto out_put_stid;
+
+ nfserr = ops->proc_layoutget(d_inode(current_fh->fh_dentry),
+ current_fh, lgp);
+ if (nfserr)
+ goto out_put_stid;
+
+ nfserr = nfsd4_insert_layout(lgp, ls);
+
+out_put_stid:
+ mutex_unlock(&ls->ls_mutex);
+ nfs4_put_stid(&ls->ls_stid);
+out:
+ return nfserr;
+}
+
+/* Free the lg_content buffer of a LAYOUTGET operation. */
+static void
+nfsd4_layoutget_release(union nfsd4_op_u *u)
+{
+ kfree(u->layoutget.lg_content);
+}
+
+/*
+ * LAYOUTCOMMIT: check that the last written byte lies inside the
+ * committed layout segment, look up the layout stateid and let the
+ * layout driver commit, flagging a size change when the write extended
+ * the file.
+ */
+static __be32
+nfsd4_layoutcommit(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, union nfsd4_op_u *u)
+{
+ struct nfsd4_layoutcommit *lcp = &u->layoutcommit;
+ const struct nfsd4_layout_seg *seg = &lcp->lc_seg;
+ struct svc_fh *current_fh = &cstate->current_fh;
+ const struct nfsd4_layout_ops *ops;
+ /*
+ * NOTE(review): lc_last_wr + 1 is stored in a signed loff_t;
+ * presumably lc_last_wr is bounded elsewhere - confirm.
+ */
+ loff_t new_size = lcp->lc_last_wr + 1;
+ struct inode *inode;
+ struct nfs4_layout_stateid *ls;
+ __be32 nfserr;
+
+ nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE);
+ if (nfserr)
+ goto out;
+
+ nfserr = nfserr_layoutunavailable;
+ ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type);
+ if (!ops)
+ goto out;
+ inode = d_inode(current_fh->fh_dentry);
+
+ nfserr = nfserr_inval;
+ if (new_size <= seg->offset) {
+ dprintk("pnfsd: last write before layout segment\n");
+ goto out;
+ }
+ if (new_size > seg->offset + seg->length) {
+ dprintk("pnfsd: last write beyond layout segment\n");
+ goto out;
+ }
+ if (!lcp->lc_newoffset && new_size > i_size_read(inode)) {
+ dprintk("pnfsd: layoutcommit beyond EOF\n");
+ goto out;
+ }
+
+ nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid,
+ false, lcp->lc_layout_type,
+ &ls);
+ if (nfserr) {
+ trace_nfsd_layout_commit_lookup_fail(&lcp->lc_sid);
+ /* fixup error code as per RFC5661 */
+ if (nfserr == nfserr_bad_stateid)
+ nfserr = nfserr_badlayout;
+ goto out;
+ }
+
+ /* LAYOUTCOMMIT does not require any serialization */
+ mutex_unlock(&ls->ls_mutex);
+
+ /* tell the layout driver whether the file grows */
+ if (new_size > i_size_read(inode)) {
+ lcp->lc_size_chg = 1;
+ lcp->lc_newsize = new_size;
+ } else {
+ lcp->lc_size_chg = 0;
+ }
+
+ nfserr = ops->proc_layoutcommit(inode, lcp);
+ nfs4_put_stid(&ls->ls_stid);
+out:
+ return nfserr;
+}
+
+/*
+ * LAYOUTRETURN: after validating the filehandle, layout type and
+ * iomode, return either the layouts of one file (RETURN_FILE) or those
+ * of the client (RETURN_FSID / RETURN_ALL). Unknown iomodes and return
+ * types yield nfserr_inval.
+ */
+static __be32
+nfsd4_layoutreturn(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, union nfsd4_op_u *u)
+{
+ struct nfsd4_layoutreturn *lrp = &u->layoutreturn;
+ struct svc_fh *current_fh = &cstate->current_fh;
+ __be32 nfserr;
+
+ nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
+ if (nfserr)
+ goto out;
+
+ nfserr = nfserr_layoutunavailable;
+ if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type))
+ goto out;
+
+ switch (lrp->lr_seg.iomode) {
+ case IOMODE_READ:
+ case IOMODE_RW:
+ case IOMODE_ANY:
+ break;
+ default:
+ dprintk("%s: invalid iomode %d\n", __func__,
+ lrp->lr_seg.iomode);
+ nfserr = nfserr_inval;
+ goto out;
+ }
+
+ switch (lrp->lr_return_type) {
+ case RETURN_FILE:
+ nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp);
+ break;
+ case RETURN_FSID:
+ case RETURN_ALL:
+ nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp);
+ break;
+ default:
+ dprintk("%s: invalid return_type %d\n", __func__,
+ lrp->lr_return_type);
+ nfserr = nfserr_inval;
+ break;
+ }
+out:
+ return nfserr;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
+/*
+ * GETXATTR: fetch the extended attribute getxa_name of the current
+ * filehandle; nfsd_getxattr() fills in getxa_buf and getxa_len.
+ */
+static __be32
+nfsd4_getxattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_getxattr *getxattr = &u->getxattr;
+
+ return nfsd_getxattr(rqstp, &cstate->current_fh,
+ getxattr->getxa_name, &getxattr->getxa_buf,
+ &getxattr->getxa_len);
+}
+
+/*
+ * SETXATTR: set an extended attribute on the current filehandle.
+ * Refused with nfserr_grace while opens_in_grace() is true. On success
+ * the change info is recorded for the reply.
+ */
+static __be32
+nfsd4_setxattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	       union nfsd4_op_u *u)
+{
+	struct nfsd4_setxattr *setxattr = &u->setxattr;
+	struct svc_fh *fhp = &cstate->current_fh;
+	__be32 ret;
+
+	if (opens_in_grace(SVC_NET(rqstp)))
+		return nfserr_grace;
+
+	ret = nfsd_setxattr(rqstp, fhp, setxattr->setxa_name,
+			    setxattr->setxa_buf, setxattr->setxa_len,
+			    setxattr->setxa_flags);
+	if (ret)
+		return ret;
+
+	set_change_info(&setxattr->setxa_cinfo, fhp);
+	return ret;
+}
+
+/*
+ * LISTXATTRS: retrieve the extended attribute list of the current
+ * filehandle into lsxa_buf / lsxa_len.
+ */
+static __be32
+nfsd4_listxattrs(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ /*
+ * Get the entire list, then copy out only the user attributes
+ * in the encode function.
+ */
+ return nfsd_listxattr(rqstp, &cstate->current_fh,
+ &u->listxattrs.lsxa_buf, &u->listxattrs.lsxa_len);
+}
+
+/*
+ * REMOVEXATTR: remove an extended attribute from the current
+ * filehandle. Refused with nfserr_grace while opens_in_grace() is
+ * true. On success the change info is recorded for the reply.
+ */
+static __be32
+nfsd4_removexattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		  union nfsd4_op_u *u)
+{
+	struct nfsd4_removexattr *removexattr = &u->removexattr;
+	struct svc_fh *fhp = &cstate->current_fh;
+	__be32 ret;
+
+	if (opens_in_grace(SVC_NET(rqstp)))
+		return nfserr_grace;
+
+	ret = nfsd_removexattr(rqstp, fhp, removexattr->rmxa_name);
+	if (ret)
+		return ret;
+
+	set_change_info(&removexattr->rmxa_cinfo, fhp);
+	return ret;
+}
+
+/*
+ * NULL call: does nothing and always returns rpc_success.
+ */
+static __be32
+nfsd4_proc_null(struct svc_rqst *rqstp)
+{
+ return rpc_success;
+}
+
+/*
+ * Count one execution of operation @opnum; numbers outside
+ * [FIRST_NFS4_OP, LAST_NFS4_OP] are ignored.
+ */
+static inline void nfsd4_increment_op_stats(u32 opnum)
+{
+	if (opnum < FIRST_NFS4_OP || opnum > LAST_NFS4_OP)
+		return;
+	nfsdstats.nfs4_opcount[opnum]++;
+}
+
+static const struct nfsd4_operation nfsd4_ops[];
+
+static const char *nfsd4_op_name(unsigned opnum);
+
+/*
+ * Enforce NFSv4.1 COMPOUND ordering rules:
+ * - the first op must be one that is allowed as first op, otherwise
+ *   NFS4ERR_OP_NOT_IN_SESSION results;
+ * - a first op other than SEQUENCE must be the only op in the
+ *   compound, otherwise NFS4ERR_NOT_ONLY_OP results.
+ *
+ * Also note, enforced elsewhere:
+ * - SEQUENCE other than as first op results in
+ * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
+ * - BIND_CONN_TO_SESSION must be the only op in its compound.
+ * (Enforced in nfsd4_bind_conn_to_session().)
+ * - DESTROY_SESSION must be the final operation in a compound, if
+ * sessionid's in SEQUENCE and DESTROY_SESSION are the same.
+ * (Enforced in nfsd4_destroy_session().)
+ */
+static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args)
+{
+ struct nfsd4_op *first_op = &args->ops[0];
+
+ /* These ordering requirements don't apply to NFSv4.0: */
+ if (args->minorversion == 0)
+ return nfs_ok;
+ /* This is weird, but OK, not our problem: */
+ if (args->opcnt == 0)
+ return nfs_ok;
+ /* an illegal first op is not looked up in nfsd4_ops[] */
+ if (first_op->status == nfserr_op_illegal)
+ return nfs_ok;
+ if (!(nfsd4_ops[first_op->opnum].op_flags & ALLOWED_AS_FIRST_OP))
+ return nfserr_op_not_in_session;
+ if (first_op->opnum == OP_SEQUENCE)
+ return nfs_ok;
+ /*
+ * So first_op is something allowed outside a session, like
+ * EXCHANGE_ID; but then it has to be the only op in the
+ * compound:
+ */
+ if (args->opcnt != 1)
+ return nfserr_not_only_op;
+ return nfs_ok;
+}
+
+/*
+ * Return the nfsd4_ops[] entry selected by op->opnum.  The opcode is used
+ * as an index without any range check here.
+ */
+const struct nfsd4_operation *OPDESC(struct nfsd4_op *op)
+{
+	return nfsd4_ops + op->opnum;
+}
+
+/*
+ * True when the descriptor for @op carries OP_CACHEME.  OP_ILLEGAL is
+ * answered with false before any table lookup.
+ */
+bool nfsd4_cache_this_op(struct nfsd4_op *op)
+{
+	return op->opnum != OP_ILLEGAL &&
+	       (OPDESC(op)->op_flags & OP_CACHEME);
+}
+
+/*
+ * Decide whether the caller should check export security itself after the
+ * op that was just processed, i.e. argp->ops[resp->opcnt - 1].
+ */
+static bool need_wrongsec_check(struct svc_rqst *rqstp)
+{
+	struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfsd4_op *cur = &argp->ops[resp->opcnt - 1];
+	struct nfsd4_op *following;
+
+	/*
+	 * Only the putfh-like ops have special rules; most ops check
+	 * wrongsec on their own.
+	 */
+	if (!(OPDESC(cur)->op_flags & OP_IS_PUTFH_LIKE))
+		return false;
+	/*
+	 * rfc 5661 2.6.3.1.1.6: don't bother erroring out a
+	 * put-filehandle operation if we're not going to use the
+	 * result:
+	 */
+	if (resp->opcnt == argp->opcnt)
+		return false;
+
+	following = &argp->ops[resp->opcnt];
+	if (following->opnum == OP_ILLEGAL)
+		return false;
+	/*
+	 * Rest of 2.6.3.1.1: certain operations will return WRONGSEC
+	 * errors themselves as necessary; others should check for them
+	 * now:
+	 */
+	if (OPDESC(following)->op_flags & OP_HANDLES_WRONGSEC)
+		return false;
+	return true;
+}
+
+/*
+ * Set up resp->xdr for encoding into rqstp->rq_res: the stream starts at
+ * the current end of the head kvec, and both the stream end and the buffer
+ * length limit leave rq_auth_slack bytes unused.
+ */
+static void svcxdr_init_encode(struct svc_rqst *rqstp,
+			       struct nfsd4_compoundres *resp)
+{
+	struct xdr_buf *res = &rqstp->rq_res;
+	struct xdr_stream *stream = &resp->xdr;
+	struct kvec *hd = res->head;
+
+	stream->buf = res;
+	stream->iov = hd;
+	stream->p = hd->iov_base + hd->iov_len;
+	stream->end = hd->iov_base + PAGE_SIZE - rqstp->rq_auth_slack;
+	stream->scratch.iov_len = 0;
+	stream->page_ptr = res->pages - 1;
+
+	/* Tail and page_len should be zero at this point: */
+	res->len = hd->iov_len;
+	res->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - res->pages)
+		- rqstp->rq_auth_slack;
+}
+
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+/*
+ * Walk every op of the compound, tracking which PUTFH op supplied the
+ * current filehandle and which one supplied the saved filehandle.  For a
+ * COPY whose cp_intra is false, set no_verify on the PUTFH behind the
+ * saved (source) filehandle so that its verification is skipped.  A COPY
+ * met while no saved filehandle is tracked gets nfserr_nofilehandle and
+ * ends the walk.
+ */
+static void
+check_if_stalefh_allowed(struct nfsd4_compoundargs *args)
+{
+	struct nfsd4_op *cur_putfh = NULL, *saved_putfh = NULL;
+	struct nfsd4_putfh *putfh;
+	struct nfsd4_copy *copy;
+	struct nfsd4_op *op;
+	int idx;
+
+	for (idx = 0; idx < args->opcnt; idx++) {
+		op = &args->ops[idx];
+
+		switch (op->opnum) {
+		case OP_PUTFH:
+			cur_putfh = op;
+			break;
+		case OP_SAVEFH:
+			saved_putfh = cur_putfh;
+			break;
+		case OP_RESTOREFH:
+			cur_putfh = saved_putfh;
+			break;
+		case OP_COPY:
+			copy = (struct nfsd4_copy *)&op->u;
+			if (!saved_putfh) {
+				op->status = nfserr_nofilehandle;
+				return;
+			}
+			putfh = (struct nfsd4_putfh *)&saved_putfh->u;
+			if (!copy->cp_intra)
+				putfh->no_verify = true;
+			break;
+		default:
+			break;
+		}
+	}
+}
+#else
+/* Without inter-server copy support there is nothing to mark. */
+static void
+check_if_stalefh_allowed(struct nfsd4_compoundargs *args)
+{
+}
+#endif
+
+/*
+ * COMPOUND call.
+ *
+ * Processes the operations of one decoded COMPOUND in order. Each op is
+ * checked (pre-set decode status, filehandle requirements, reply space for
+ * ops flagged OP_MODIFIES_SOMETHING), executed through its op_func, and its
+ * result is encoded into the reply. The loop ends after the first op that
+ * finishes with a non-zero status, or when all ops have been handled.
+ *
+ * The NFS status of the compound is stored in cstate->status; the function
+ * itself always returns rpc_success.
+ */
+static __be32
+nfsd4_proc_compound(struct svc_rqst *rqstp)
+{
+ struct nfsd4_compoundargs *args = rqstp->rq_argp;
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct nfsd4_op *op;
+ struct nfsd4_compound_state *cstate = &resp->cstate;
+ struct svc_fh *current_fh = &cstate->current_fh;
+ struct svc_fh *save_fh = &cstate->save_fh;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ __be32 status;
+
+ svcxdr_init_encode(rqstp, resp);
+ resp->tagp = resp->xdr.p;
+ /* reserve space for: taglen, tag, and opcnt */
+ xdr_reserve_space(&resp->xdr, 8 + args->taglen);
+ resp->taglen = args->taglen;
+ resp->tag = args->tag;
+ resp->rqstp = rqstp;
+ cstate->minorversion = args->minorversion;
+ fh_init(current_fh, NFS4_FHSIZE);
+ fh_init(save_fh, NFS4_FHSIZE);
+ /*
+ * Don't use the deferral mechanism for NFSv4; compounds make it
+ * too hard to avoid non-idempotency problems.
+ */
+ clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
+
+ /*
+ * According to RFC3010, this takes precedence over all other errors.
+ */
+ status = nfserr_minor_vers_mismatch;
+ if (nfsd_minorversion(nn, args->minorversion, NFSD_TEST) <= 0)
+ goto out;
+ status = nfserr_resource;
+ if (args->opcnt > NFSD_MAX_OPS_PER_COMPOUND)
+ goto out;
+
+ /*
+ * An ordering violation is reported as the result of the first op:
+ * jump straight to its encoding with resp->opcnt set to 1.
+ */
+ status = nfs41_check_op_ordering(args);
+ if (status) {
+ op = &args->ops[0];
+ op->status = status;
+ resp->opcnt = 1;
+ goto encode_op;
+ }
+ check_if_stalefh_allowed(args);
+
+ rqstp->rq_lease_breaker = (void **)&cstate->clp;
+
+ trace_nfsd_compound(rqstp, args->opcnt);
+ while (!status && resp->opcnt < args->opcnt) {
+ op = &args->ops[resp->opcnt++];
+
+ /*
+ * The XDR decode routines may have pre-set op->status;
+ * for example, if there is a miscellaneous XDR error
+ * it will be set to nfserr_bad_xdr.
+ */
+ if (op->status) {
+ if (op->opnum == OP_OPEN)
+ op->status = nfsd4_open_omfg(rqstp, cstate, op);
+ goto encode_op;
+ }
+ /*
+ * Without a current filehandle (and no foreign-filehandle flag)
+ * only ops flagged ALLOWED_WITHOUT_FH may run; on an export marked
+ * as migrated only ops flagged ALLOWED_ON_ABSENT_FS may run.
+ */
+ if (!current_fh->fh_dentry &&
+ !HAS_FH_FLAG(current_fh, NFSD4_FH_FOREIGN)) {
+ if (!(op->opdesc->op_flags & ALLOWED_WITHOUT_FH)) {
+ op->status = nfserr_nofilehandle;
+ goto encode_op;
+ }
+ } else if (current_fh->fh_export &&
+ current_fh->fh_export->ex_fslocs.migrated &&
+ !(op->opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) {
+ op->status = nfserr_moved;
+ goto encode_op;
+ }
+
+ fh_clear_wcc(current_fh);
+
+ /* If op is non-idempotent */
+ if (op->opdesc->op_flags & OP_MODIFIES_SOMETHING) {
+ /*
+ * Don't execute this op if we couldn't encode a
+ * successful reply:
+ */
+ u32 plen = op->opdesc->op_rsize_bop(rqstp, op);
+ /*
+ * Plus if there's another operation, make sure
+ * we'll have space to at least encode an error:
+ */
+ if (resp->opcnt < args->opcnt)
+ plen += COMPOUND_ERR_SLACK_SPACE;
+ op->status = nfsd4_check_resp_size(resp, plen);
+ }
+
+ /* The reply-size check above may have failed the op. */
+ if (op->status)
+ goto encode_op;
+
+ if (op->opdesc->op_get_currentstateid)
+ op->opdesc->op_get_currentstateid(cstate, &op->u);
+ op->status = op->opdesc->op_func(rqstp, cstate, &op->u);
+
+ /*
+ * Only from SEQUENCE
+ *
+ * NOTE(review): this exit skips the encoding of the op as well as
+ * the fh_put() calls and the replay_owner check after the loop;
+ * presumably the cached reply is used instead -- confirm against
+ * nfsd4_sequence().
+ */
+ if (cstate->status == nfserr_replay_cache) {
+ dprintk("%s NFS4.1 replay from cache\n", __func__);
+ status = op->status;
+ goto out;
+ }
+ if (!op->status) {
+ if (op->opdesc->op_set_currentstateid)
+ op->opdesc->op_set_currentstateid(cstate, &op->u);
+
+ if (op->opdesc->op_flags & OP_CLEAR_STATEID)
+ clear_current_stateid(cstate);
+
+ if (current_fh->fh_export &&
+ need_wrongsec_check(rqstp))
+ op->status = check_nfsd_access(current_fh->fh_export, rqstp);
+ }
+encode_op:
+ /*
+ * nfserr_replay_me: encode the reply saved in the replay owner and
+ * take the op status from it; otherwise encode the op normally.
+ */
+ if (op->status == nfserr_replay_me) {
+ op->replay = &cstate->replay_owner->so_replay;
+ nfsd4_encode_replay(&resp->xdr, op);
+ status = op->status = op->replay->rp_status;
+ } else {
+ nfsd4_encode_operation(resp, op);
+ status = op->status;
+ }
+
+ trace_nfsd_compound_status(args->opcnt, resp->opcnt, status,
+ nfsd4_op_name(op->opnum));
+
+ nfsd4_cstate_clear_replay(cstate);
+ nfsd4_increment_op_stats(op->opnum);
+ }
+
+ fh_put(current_fh);
+ fh_put(save_fh);
+ BUG_ON(cstate->replay_owner);
+out:
+ cstate->status = status;
+ /* Reset deferral mechanism for RPC deferrals */
+ set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
+ return rpc_success;
+}
+
+/*
+ * Building blocks for the *_rsize() reply-size estimators below. The
+ * estimators multiply sums of these values by sizeof(__be32), so each
+ * value is a count of 4-byte XDR words, not bytes.
+ */
+#define op_encode_hdr_size (2)
+#define op_encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define op_encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define op_encode_change_info_maxsz (5)
+#define nfs4_fattr_bitmap_maxsz (4)
+
+/* We'll fall back on returning no lockowner if run out of space: */
+#define op_encode_lockowner_maxsz (0)
+#define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz)
+
+#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+
+#define op_encode_ace_maxsz (3 + nfs4_owner_maxsz)
+#define op_encode_delegation_maxsz (1 + op_encode_stateid_maxsz + 1 + \
+ op_encode_ace_maxsz)
+
+#define op_encode_channel_attrs_maxsz (6 + 1 + 1)
+
+/* Reply-size bound, in bytes, for ops that encode nothing but the op header. */
+static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size;
+
+	return nwords * sizeof(__be32);
+}
+
+/* Reply-size bound, in bytes: op header followed by one stateid. */
+static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size + op_encode_stateid_maxsz;
+
+	return nwords * sizeof(__be32);
+}
+
+/* Reply-size bound, in bytes, for ACCESS. */
+static inline u32 nfsd4_access_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size;
+
+	nwords += 2;	/* ac_supported, ac_resp_access */
+	return nwords * sizeof(__be32);
+}
+
+/* Reply-size bound, in bytes, for COMMIT: op header plus a verifier. */
+static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size + op_encode_verifier_maxsz;
+
+	return nwords * sizeof(__be32);
+}
+
+/*
+ * Reply-size bound, in bytes, for CREATE: op header, change info and an
+ * attribute bitmap.
+ */
+static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size;
+
+	nwords += op_encode_change_info_maxsz;
+	nwords += nfs4_fattr_bitmap_maxsz;
+	return nwords * sizeof(__be32);
+}
+
+/*
+ * Estimate the size of a GETATTR reply from the requested attribute
+ * bitmap.
+ *
+ * Note since this is an idempotent operation we won't insist on failing
+ * the op prematurely if the estimate is too large.  We may turn off splice
+ * reads unnecessarily.
+ */
+static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
+				      struct nfsd4_op *op)
+{
+	u32 *requested = op->u.getattr.ga_bmval;
+	u32 word0 = requested[0];
+	u32 word1 = requested[1];
+	u32 word2 = requested[2];
+	u32 size = 20;	/* bitmask, length */
+
+	/* ACL and fs_locations requests are estimated at the maximum payload: */
+	if (word0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_FS_LOCATIONS))
+		return svc_max_payload(rqstp);
+
+	/*
+	 * Attributes with their own bound are counted individually and then
+	 * masked out so the generic per-bit estimate skips them.
+	 */
+	if (word0 & FATTR4_WORD0_FILEHANDLE) {
+		size += NFS4_FHSIZE + 4;
+		word0 &= ~FATTR4_WORD0_FILEHANDLE;
+	}
+	if (word1 & FATTR4_WORD1_OWNER) {
+		size += IDMAP_NAMESZ + 4;
+		word1 &= ~FATTR4_WORD1_OWNER;
+	}
+	if (word1 & FATTR4_WORD1_OWNER_GROUP) {
+		size += IDMAP_NAMESZ + 4;
+		word1 &= ~FATTR4_WORD1_OWNER_GROUP;
+	}
+	if (word2 & FATTR4_WORD2_SECURITY_LABEL) {
+		size += NFS4_MAXLABELLEN + 12;
+		word2 &= ~FATTR4_WORD2_SECURITY_LABEL;
+	}
+	/*
+	 * Largest of remaining attributes are 16 bytes (e.g.,
+	 * supported_attributes)
+	 */
+	size += 16 * (hweight32(word0) + hweight32(word1) + hweight32(word2));
+	return size;
+}
+
+/*
+ * Reply-size bound, in bytes, for GETFH: op header plus one word, followed
+ * by up to NFS4_FHSIZE bytes.
+ */
+static inline u32 nfsd4_getfh_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size + 1;
+
+	return nwords * sizeof(__be32) + NFS4_FHSIZE;
+}
+
+/* Reply-size bound, in bytes, for LINK: op header plus change info. */
+static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size + op_encode_change_info_maxsz;
+
+	return nwords * sizeof(__be32);
+}
+
+/*
+ * Reply-size bound, in bytes, shared by LOCK and LOCKT: op header plus the
+ * lock-denied result.
+ */
+static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size + op_encode_lock_denied_maxsz;
+
+	return nwords * sizeof(__be32);
+}
+
+/*
+ * Reply-size bound, in bytes, for OPEN: op header, stateid, change info,
+ * one extra word, an attribute bitmap and a delegation.
+ */
+static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size;
+
+	nwords += op_encode_stateid_maxsz;
+	nwords += op_encode_change_info_maxsz + 1;
+	nwords += nfs4_fattr_bitmap_maxsz;
+	nwords += op_encode_delegation_maxsz;
+	return nwords * sizeof(__be32);
+}
+
+/*
+ * Reply-size bound, in bytes, for READ: the requested length is capped at
+ * svc_max_payload() before being added to the fixed part.
+ */
+static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 maxcount = svc_max_payload(rqstp);
+	u32 rlen = min(op->u.read.rd_length, maxcount);
+	u32 nwords = op_encode_hdr_size + 2 + XDR_QUADLEN(rlen);
+
+	return nwords * sizeof(__be32);
+}
+
+/*
+ * Reply-size bound, in bytes, for READ_PLUS: like READ, with extra room
+ * for segment headers.
+ */
+static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 maxcount;
+	u32 rlen;
+	u32 nwords = op_encode_hdr_size + 2;
+
+	/*
+	 * If we detect that the file changed during hole encoding, then we
+	 * recover by encoding the remaining reply as data.  This means we need
+	 * to set aside enough room to encode two data segments.
+	 */
+	nwords += 2 * (1 + 2 + 1);
+
+	maxcount = svc_max_payload(rqstp);
+	rlen = min(op->u.read.rd_length, maxcount);
+	nwords += XDR_QUADLEN(rlen);
+
+	return nwords * sizeof(__be32);
+}
+
+/*
+ * Reply-size bound, in bytes, for READDIR: op header, verifier, and the
+ * client's rd_maxcount capped at svc_max_payload().
+ */
+static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 maxcount = svc_max_payload(rqstp);
+	u32 rlen = min(op->u.readdir.rd_maxcount, maxcount);
+	u32 nwords = op_encode_hdr_size + op_encode_verifier_maxsz;
+
+	nwords += XDR_QUADLEN(rlen);
+	return nwords * sizeof(__be32);
+}
+
+/*
+ * Reply-size bound, in bytes, for READLINK: op header plus one word,
+ * followed by up to PAGE_SIZE bytes.
+ */
+static inline u32 nfsd4_readlink_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size + 1;
+
+	return nwords * sizeof(__be32) + PAGE_SIZE;
+}
+
+/* Reply-size bound, in bytes, for REMOVE: op header plus change info. */
+static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size + op_encode_change_info_maxsz;
+
+	return nwords * sizeof(__be32);
+}
+
+/*
+ * Reply-size bound, in bytes, for RENAME: op header plus two change info
+ * structures.
+ */
+static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size;
+
+	nwords += op_encode_change_info_maxsz;
+	nwords += op_encode_change_info_maxsz;
+	return nwords * sizeof(__be32);
+}
+
+/*
+ * Reply-size bound, in bytes, for SEQUENCE: op header, session id and five
+ * further words.
+ */
+static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp,
+				       struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size;
+
+	nwords += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5;
+	return nwords * sizeof(__be32);
+}
+
+/*
+ * Reply-size bound, in bytes, for TEST_STATEID: op header, one word, and
+ * one word per stateid in the request (ts_num_ids).
+ */
+static inline u32 nfsd4_test_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size + 1 + op->u.test_stateid.ts_num_ids;
+
+	return nwords * sizeof(__be32);
+}
+
+/* Reply-size bound, in bytes, for SETATTR: op header plus an attribute bitmap. */
+static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size + nfs4_fattr_bitmap_maxsz;
+
+	return nwords * sizeof(__be32);
+}
+
+/*
+ * Reply-size bound, in bytes, shared by SECINFO and SECINFO_NO_NAME: op
+ * header plus RPC_AUTH_MAXFLAVOR entries of four words and a
+ * GSS_OID_MAX_LEN-byte OID each.
+ */
+static inline u32 nfsd4_secinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 per_flavor = 4 + XDR_QUADLEN(GSS_OID_MAX_LEN);
+	u32 nwords = op_encode_hdr_size + RPC_AUTH_MAXFLAVOR * per_flavor;
+
+	return nwords * sizeof(__be32);
+}
+
+/*
+ * Reply-size bound, in bytes, for SETCLIENTID: op header, two words and a
+ * verifier.
+ */
+static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size + 2;
+
+	nwords += XDR_QUADLEN(NFS4_VERIFIER_SIZE);
+	return nwords * sizeof(__be32);
+}
+
+/*
+ * Reply-size bound, in bytes, for WRITE: op header, two words and a
+ * verifier.
+ */
+static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size + 2 + op_encode_verifier_maxsz;
+
+	return nwords * sizeof(__be32);
+}
+
+/* Reply-size bound, in bytes, for EXCHANGE_ID, summed field by field. */
+static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size;
+
+	nwords += 2 + 1;	/* eir_clientid, eir_sequenceid */
+	nwords += 1 + 1;	/* eir_flags, spr_how */
+	nwords += 4;		/* spo_must_enforce & _allow with bitmap */
+	nwords += 2;		/* eir_server_owner.so_minor_id */
+	/* eir_server_owner.so_major_id<> */
+	nwords += XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1;
+	/* eir_server_scope<> */
+	nwords += XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1;
+	nwords += 1;		/* eir_server_impl_id array length */
+	nwords += 0;		/* ignored eir_server_impl_id contents */
+
+	return nwords * sizeof(__be32);
+}
+
+/* Reply-size bound, in bytes, for BIND_CONN_TO_SESSION. */
+static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size;
+
+	nwords += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);	/* bctsr_sessid */
+	nwords += 2;	/* bctsr_dir, use_conn_in_rdma_mode */
+
+	return nwords * sizeof(__be32);
+}
+
+/* Reply-size bound, in bytes, for CREATE_SESSION. */
+static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size;
+
+	nwords += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);	/* sessionid */
+	nwords += 2;	/* csr_sequence, csr_flags */
+	/* two sets of channel attributes */
+	nwords += op_encode_channel_attrs_maxsz;
+	nwords += op_encode_channel_attrs_maxsz;
+
+	return nwords * sizeof(__be32);
+}
+
+/* Reply-size bound, in bytes, for COPY, summed field by field. */
+static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size;
+
+	nwords += 1;				/* wr_callback */
+	nwords += op_encode_stateid_maxsz;	/* wr_callback */
+	nwords += 2;				/* wr_count */
+	nwords += 1;				/* wr_committed */
+	nwords += op_encode_verifier_maxsz;
+	nwords += 1;				/* cr_consecutive */
+	nwords += 1;				/* cr_synchronous */
+
+	return nwords * sizeof(__be32);
+}
+
+/* Reply-size bound, in bytes, for OFFLOAD_STATUS. */
+static inline u32 nfsd4_offload_status_rsize(struct svc_rqst *rqstp,
+					     struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size;
+
+	nwords += 2;	/* osr_count */
+	nwords += 1;	/* osr_complete<1> optional 0 for now */
+
+	return nwords * sizeof(__be32);
+}
+
+/* Reply-size bound, in bytes, for COPY_NOTIFY, summed field by field. */
+static inline u32 nfsd4_copy_notify_rsize(struct svc_rqst *rqstp,
+					  struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size;
+
+	nwords += 3;				/* cnr_lease_time */
+	nwords += 1;				/* We support one cnr_source_server */
+	nwords += 1;				/* cnr_stateid seq */
+	nwords += op_encode_stateid_maxsz;	/* cnr_stateid */
+	nwords += 1;				/* num cnr_source_server */
+	nwords += 1;				/* nl4_type */
+	nwords += 1;				/* nl4 size */
+	nwords += XDR_QUADLEN(NFS4_OPAQUE_LIMIT); /* nl4_loc + nl4_loc_sz */
+
+	return nwords * sizeof(__be32);
+}
+
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * Reply-size bound, in bytes, for GETDEVICEINFO: the client's gd_maxcount
+ * is capped at svc_max_payload() before being added to the fixed part.
+ */
+static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 maxcount = svc_max_payload(rqstp);
+	u32 rlen = min(op->u.getdeviceinfo.gd_maxcount, maxcount);
+	u32 nwords = op_encode_hdr_size;
+
+	nwords += 1;			/* gd_layout_type */
+	nwords += XDR_QUADLEN(rlen);
+	nwords += 2;			/* gd_notify_types */
+
+	return nwords * sizeof(__be32);
+}
+
+/*
+ * At this stage we don't really know what layout driver will handle the
+ * request, so MAX_LAYOUT_SIZE is an arbitrary upper bound for the layout
+ * body.  It is added to the word count below, before the multiplication
+ * by sizeof(__be32).
+ */
+#define MAX_LAYOUT_SIZE		128
+static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size;
+
+	nwords += 1;				/* logr_return_on_close */
+	nwords += op_encode_stateid_maxsz;
+	nwords += 1;				/* nr of layouts */
+	nwords += MAX_LAYOUT_SIZE;
+
+	return nwords * sizeof(__be32);
+}
+
+/* Reply-size bound, in bytes, for LAYOUTCOMMIT. */
+static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size;
+
+	nwords += 1;	/* locr_newsize */
+	nwords += 2;	/* ns_size */
+
+	return nwords * sizeof(__be32);
+}
+
+/* Reply-size bound, in bytes, for LAYOUTRETURN. */
+static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size;
+
+	nwords += 1;	/* lrs_stateid */
+	nwords += op_encode_stateid_maxsz;
+
+	return nwords * sizeof(__be32);
+}
+#endif /* CONFIG_NFSD_PNFS */
+
+
+/* Reply-size bound, in bytes, for SEEK: op header plus three words. */
+static inline u32 nfsd4_seek_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size + 3;
+
+	return nwords * sizeof(__be32);
+}
+
+/*
+ * Reply-size bound, in bytes, for GETXATTR: the value is bounded by the
+ * smaller of XATTR_SIZE_MAX and svc_max_payload().
+ */
+static inline u32 nfsd4_getxattr_rsize(struct svc_rqst *rqstp,
+				       struct nfsd4_op *op)
+{
+	u32 maxcount = svc_max_payload(rqstp);
+	u32 rlen = min_t(u32, XATTR_SIZE_MAX, maxcount);
+	u32 nwords = op_encode_hdr_size + 1 + XDR_QUADLEN(rlen);
+
+	return nwords * sizeof(__be32);
+}
+
+/* Reply-size bound, in bytes, for SETXATTR: op header plus change info. */
+static inline u32 nfsd4_setxattr_rsize(struct svc_rqst *rqstp,
+				       struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size + op_encode_change_info_maxsz;
+
+	return nwords * sizeof(__be32);
+}
+/*
+ * Reply-size bound, in bytes, for LISTXATTRS: the client's lsxa_maxcount
+ * is capped at svc_max_payload() before being added to the fixed part.
+ */
+static inline u32 nfsd4_listxattrs_rsize(struct svc_rqst *rqstp,
+					 struct nfsd4_op *op)
+{
+	u32 maxcount = svc_max_payload(rqstp);
+	u32 rlen = min(op->u.listxattrs.lsxa_maxcount, maxcount);
+	u32 nwords = op_encode_hdr_size + 4 + XDR_QUADLEN(rlen);
+
+	return nwords * sizeof(__be32);
+}
+
+/* Reply-size bound, in bytes, for REMOVEXATTR: op header plus change info. */
+static inline u32 nfsd4_removexattr_rsize(struct svc_rqst *rqstp,
+					  struct nfsd4_op *op)
+{
+	u32 nwords = op_encode_hdr_size + op_encode_change_info_maxsz;
+
+	return nwords * sizeof(__be32);
+}
+
+
+/*
+ * Operation table, indexed by NFSv4 opcode (see OPDESC()).
+ *
+ * Each entry names the handler (op_func), the flags consulted by the
+ * compound code in this file (e.g. ALLOWED_WITHOUT_FH, OP_MODIFIES_SOMETHING,
+ * OP_IS_PUTFH_LIKE, OP_HANDLES_WRONGSEC), a printable op_name, and the
+ * op_rsize_bop reply-size estimator. Some entries also set op_release and
+ * the op_get_currentstateid / op_set_currentstateid hooks.
+ *
+ * The table uses designated initializers, so any opcode slot without an
+ * entry here is zero-filled (NULL op_func, op_name and op_rsize_bop).
+ */
+static const struct nfsd4_operation nfsd4_ops[] = {
+ [OP_ACCESS] = {
+ .op_func = nfsd4_access,
+ .op_name = "OP_ACCESS",
+ .op_rsize_bop = nfsd4_access_rsize,
+ },
+ [OP_CLOSE] = {
+ .op_func = nfsd4_close,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_CLOSE",
+ .op_rsize_bop = nfsd4_status_stateid_rsize,
+ .op_get_currentstateid = nfsd4_get_closestateid,
+ .op_set_currentstateid = nfsd4_set_closestateid,
+ },
+ [OP_COMMIT] = {
+ .op_func = nfsd4_commit,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_COMMIT",
+ .op_rsize_bop = nfsd4_commit_rsize,
+ },
+ [OP_CREATE] = {
+ .op_func = nfsd4_create,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME | OP_CLEAR_STATEID,
+ .op_name = "OP_CREATE",
+ .op_rsize_bop = nfsd4_create_rsize,
+ },
+ [OP_DELEGRETURN] = {
+ .op_func = nfsd4_delegreturn,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_DELEGRETURN",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ .op_get_currentstateid = nfsd4_get_delegreturnstateid,
+ },
+ [OP_GETATTR] = {
+ .op_func = nfsd4_getattr,
+ .op_flags = ALLOWED_ON_ABSENT_FS,
+ .op_rsize_bop = nfsd4_getattr_rsize,
+ .op_name = "OP_GETATTR",
+ },
+ [OP_GETFH] = {
+ .op_func = nfsd4_getfh,
+ .op_name = "OP_GETFH",
+ .op_rsize_bop = nfsd4_getfh_rsize,
+ },
+ [OP_LINK] = {
+ .op_func = nfsd4_link,
+ .op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING
+ | OP_CACHEME,
+ .op_name = "OP_LINK",
+ .op_rsize_bop = nfsd4_link_rsize,
+ },
+ [OP_LOCK] = {
+ .op_func = nfsd4_lock,
+ .op_flags = OP_MODIFIES_SOMETHING |
+ OP_NONTRIVIAL_ERROR_ENCODE,
+ .op_name = "OP_LOCK",
+ .op_rsize_bop = nfsd4_lock_rsize,
+ .op_set_currentstateid = nfsd4_set_lockstateid,
+ },
+ [OP_LOCKT] = {
+ .op_func = nfsd4_lockt,
+ .op_flags = OP_NONTRIVIAL_ERROR_ENCODE,
+ .op_name = "OP_LOCKT",
+ .op_rsize_bop = nfsd4_lock_rsize,
+ },
+ [OP_LOCKU] = {
+ .op_func = nfsd4_locku,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_LOCKU",
+ .op_rsize_bop = nfsd4_status_stateid_rsize,
+ .op_get_currentstateid = nfsd4_get_lockustateid,
+ },
+ [OP_LOOKUP] = {
+ .op_func = nfsd4_lookup,
+ .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID,
+ .op_name = "OP_LOOKUP",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_LOOKUPP] = {
+ .op_func = nfsd4_lookupp,
+ .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID,
+ .op_name = "OP_LOOKUPP",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_NVERIFY] = {
+ .op_func = nfsd4_nverify,
+ .op_name = "OP_NVERIFY",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_OPEN] = {
+ .op_func = nfsd4_open,
+ .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING,
+ .op_name = "OP_OPEN",
+ .op_rsize_bop = nfsd4_open_rsize,
+ .op_set_currentstateid = nfsd4_set_openstateid,
+ },
+ [OP_OPEN_CONFIRM] = {
+ .op_func = nfsd4_open_confirm,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_OPEN_CONFIRM",
+ .op_rsize_bop = nfsd4_status_stateid_rsize,
+ },
+ [OP_OPEN_DOWNGRADE] = {
+ .op_func = nfsd4_open_downgrade,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_OPEN_DOWNGRADE",
+ .op_rsize_bop = nfsd4_status_stateid_rsize,
+ .op_get_currentstateid = nfsd4_get_opendowngradestateid,
+ .op_set_currentstateid = nfsd4_set_opendowngradestateid,
+ },
+ [OP_PUTFH] = {
+ .op_func = nfsd4_putfh,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID,
+ .op_name = "OP_PUTFH",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_PUTPUBFH] = {
+ .op_func = nfsd4_putrootfh,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID,
+ .op_name = "OP_PUTPUBFH",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_PUTROOTFH] = {
+ .op_func = nfsd4_putrootfh,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID,
+ .op_name = "OP_PUTROOTFH",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_READ] = {
+ .op_func = nfsd4_read,
+ .op_release = nfsd4_read_release,
+ .op_name = "OP_READ",
+ .op_rsize_bop = nfsd4_read_rsize,
+ .op_get_currentstateid = nfsd4_get_readstateid,
+ },
+ [OP_READDIR] = {
+ .op_func = nfsd4_readdir,
+ .op_name = "OP_READDIR",
+ .op_rsize_bop = nfsd4_readdir_rsize,
+ },
+ [OP_READLINK] = {
+ .op_func = nfsd4_readlink,
+ .op_name = "OP_READLINK",
+ .op_rsize_bop = nfsd4_readlink_rsize,
+ },
+ [OP_REMOVE] = {
+ .op_func = nfsd4_remove,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_name = "OP_REMOVE",
+ .op_rsize_bop = nfsd4_remove_rsize,
+ },
+ [OP_RENAME] = {
+ .op_func = nfsd4_rename,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_name = "OP_RENAME",
+ .op_rsize_bop = nfsd4_rename_rsize,
+ },
+ [OP_RENEW] = {
+ .op_func = nfsd4_renew,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_MODIFIES_SOMETHING,
+ .op_name = "OP_RENEW",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+
+ },
+ [OP_RESTOREFH] = {
+ .op_func = nfsd4_restorefh,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING,
+ .op_name = "OP_RESTOREFH",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_SAVEFH] = {
+ .op_func = nfsd4_savefh,
+ .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING,
+ .op_name = "OP_SAVEFH",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_SECINFO] = {
+ .op_func = nfsd4_secinfo,
+ .op_release = nfsd4_secinfo_release,
+ .op_flags = OP_HANDLES_WRONGSEC,
+ .op_name = "OP_SECINFO",
+ .op_rsize_bop = nfsd4_secinfo_rsize,
+ },
+ [OP_SETATTR] = {
+ .op_func = nfsd4_setattr,
+ .op_name = "OP_SETATTR",
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME
+ | OP_NONTRIVIAL_ERROR_ENCODE,
+ .op_rsize_bop = nfsd4_setattr_rsize,
+ .op_get_currentstateid = nfsd4_get_setattrstateid,
+ },
+ [OP_SETCLIENTID] = {
+ .op_func = nfsd4_setclientid,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_MODIFIES_SOMETHING | OP_CACHEME
+ | OP_NONTRIVIAL_ERROR_ENCODE,
+ .op_name = "OP_SETCLIENTID",
+ .op_rsize_bop = nfsd4_setclientid_rsize,
+ },
+ [OP_SETCLIENTID_CONFIRM] = {
+ .op_func = nfsd4_setclientid_confirm,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_name = "OP_SETCLIENTID_CONFIRM",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_VERIFY] = {
+ .op_func = nfsd4_verify,
+ .op_name = "OP_VERIFY",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_WRITE] = {
+ .op_func = nfsd4_write,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_name = "OP_WRITE",
+ .op_rsize_bop = nfsd4_write_rsize,
+ .op_get_currentstateid = nfsd4_get_writestateid,
+ },
+ [OP_RELEASE_LOCKOWNER] = {
+ .op_func = nfsd4_release_lockowner,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+ | OP_MODIFIES_SOMETHING,
+ .op_name = "OP_RELEASE_LOCKOWNER",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+
+ /* NFSv4.1 operations */
+ [OP_EXCHANGE_ID] = {
+ .op_func = nfsd4_exchange_id,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
+ .op_name = "OP_EXCHANGE_ID",
+ .op_rsize_bop = nfsd4_exchange_id_rsize,
+ },
+ [OP_BACKCHANNEL_CTL] = {
+ .op_func = nfsd4_backchannel_ctl,
+ .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
+ .op_name = "OP_BACKCHANNEL_CTL",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_BIND_CONN_TO_SESSION] = {
+ .op_func = nfsd4_bind_conn_to_session,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
+ .op_name = "OP_BIND_CONN_TO_SESSION",
+ .op_rsize_bop = nfsd4_bind_conn_to_session_rsize,
+ },
+ [OP_CREATE_SESSION] = {
+ .op_func = nfsd4_create_session,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
+ .op_name = "OP_CREATE_SESSION",
+ .op_rsize_bop = nfsd4_create_session_rsize,
+ },
+ [OP_DESTROY_SESSION] = {
+ .op_func = nfsd4_destroy_session,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
+ .op_name = "OP_DESTROY_SESSION",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_SEQUENCE] = {
+ .op_func = nfsd4_sequence,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_name = "OP_SEQUENCE",
+ .op_rsize_bop = nfsd4_sequence_rsize,
+ },
+ [OP_DESTROY_CLIENTID] = {
+ .op_func = nfsd4_destroy_clientid,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
+ | OP_MODIFIES_SOMETHING,
+ .op_name = "OP_DESTROY_CLIENTID",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_RECLAIM_COMPLETE] = {
+ .op_func = nfsd4_reclaim_complete,
+ .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
+ .op_name = "OP_RECLAIM_COMPLETE",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_SECINFO_NO_NAME] = {
+ .op_func = nfsd4_secinfo_no_name,
+ .op_release = nfsd4_secinfo_no_name_release,
+ .op_flags = OP_HANDLES_WRONGSEC,
+ .op_name = "OP_SECINFO_NO_NAME",
+ .op_rsize_bop = nfsd4_secinfo_rsize,
+ },
+ [OP_TEST_STATEID] = {
+ .op_func = nfsd4_test_stateid,
+ .op_flags = ALLOWED_WITHOUT_FH,
+ .op_name = "OP_TEST_STATEID",
+ .op_rsize_bop = nfsd4_test_stateid_rsize,
+ },
+ [OP_FREE_STATEID] = {
+ .op_func = nfsd4_free_stateid,
+ .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
+ .op_name = "OP_FREE_STATEID",
+ .op_get_currentstateid = nfsd4_get_freestateid,
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+#ifdef CONFIG_NFSD_PNFS
+ [OP_GETDEVICEINFO] = {
+ .op_func = nfsd4_getdeviceinfo,
+ .op_release = nfsd4_getdeviceinfo_release,
+ .op_flags = ALLOWED_WITHOUT_FH,
+ .op_name = "OP_GETDEVICEINFO",
+ .op_rsize_bop = nfsd4_getdeviceinfo_rsize,
+ },
+ [OP_LAYOUTGET] = {
+ .op_func = nfsd4_layoutget,
+ .op_release = nfsd4_layoutget_release,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_LAYOUTGET",
+ .op_rsize_bop = nfsd4_layoutget_rsize,
+ },
+ [OP_LAYOUTCOMMIT] = {
+ .op_func = nfsd4_layoutcommit,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_LAYOUTCOMMIT",
+ .op_rsize_bop = nfsd4_layoutcommit_rsize,
+ },
+ [OP_LAYOUTRETURN] = {
+ .op_func = nfsd4_layoutreturn,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_LAYOUTRETURN",
+ .op_rsize_bop = nfsd4_layoutreturn_rsize,
+ },
+#endif /* CONFIG_NFSD_PNFS */
+
+ /* NFSv4.2 operations */
+ [OP_ALLOCATE] = {
+ .op_func = nfsd4_allocate,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_ALLOCATE",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_DEALLOCATE] = {
+ .op_func = nfsd4_deallocate,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_DEALLOCATE",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_CLONE] = {
+ .op_func = nfsd4_clone,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_CLONE",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_COPY] = {
+ .op_func = nfsd4_copy,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_COPY",
+ .op_rsize_bop = nfsd4_copy_rsize,
+ },
+ [OP_READ_PLUS] = {
+ .op_func = nfsd4_read,
+ .op_release = nfsd4_read_release,
+ .op_name = "OP_READ_PLUS",
+ .op_rsize_bop = nfsd4_read_plus_rsize,
+ .op_get_currentstateid = nfsd4_get_readstateid,
+ },
+ [OP_SEEK] = {
+ .op_func = nfsd4_seek,
+ .op_name = "OP_SEEK",
+ .op_rsize_bop = nfsd4_seek_rsize,
+ },
+ [OP_OFFLOAD_STATUS] = {
+ .op_func = nfsd4_offload_status,
+ .op_name = "OP_OFFLOAD_STATUS",
+ .op_rsize_bop = nfsd4_offload_status_rsize,
+ },
+ [OP_OFFLOAD_CANCEL] = {
+ .op_func = nfsd4_offload_cancel,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_OFFLOAD_CANCEL",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
+ [OP_COPY_NOTIFY] = {
+ .op_func = nfsd4_copy_notify,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_COPY_NOTIFY",
+ .op_rsize_bop = nfsd4_copy_notify_rsize,
+ },
+ [OP_GETXATTR] = {
+ .op_func = nfsd4_getxattr,
+ .op_name = "OP_GETXATTR",
+ .op_rsize_bop = nfsd4_getxattr_rsize,
+ },
+ [OP_SETXATTR] = {
+ .op_func = nfsd4_setxattr,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_name = "OP_SETXATTR",
+ .op_rsize_bop = nfsd4_setxattr_rsize,
+ },
+ [OP_LISTXATTRS] = {
+ .op_func = nfsd4_listxattrs,
+ .op_name = "OP_LISTXATTRS",
+ .op_rsize_bop = nfsd4_listxattrs_rsize,
+ },
+ [OP_REMOVEXATTR] = {
+ .op_func = nfsd4_removexattr,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_name = "OP_REMOVEXATTR",
+ .op_rsize_bop = nfsd4_removexattr_rsize,
+ },
+};
+
+/**
+ * nfsd4_spo_must_allow - Determine if the compound op contains an
+ * operation that is allowed to be sent with machine credentials
+ *
+ * @rqstp: a pointer to the struct svc_rqst
+ *
+ * Checks to see if the compound contains a spo_must_allow op
+ * and confirms that it was sent with the proper machine creds.
+ *
+ * Only the ops that have not been processed yet (index >= resp->opcnt)
+ * are examined.  The answer is remembered in cstate->spo_must_allowed, and
+ * a remembered "true" short-circuits later calls for the same compound.
+ *
+ * Return: true if such an op is present; false otherwise, including for
+ * NFSv4.0 and for any request that is not an NFSv4 COMPOUND.
+ */
+bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+	struct nfsd4_compound_state *cstate = &resp->cstate;
+	struct nfs4_op_map *allow;
+	struct nfsd4_op *this;
+	u32 opiter;
+
+	/*
+	 * This function is not static, so it may be reached while some
+	 * other procedure is executing.  rq_resp and rq_argp hold compound
+	 * structures only for the NFSv4 COMPOUND procedure; for anything
+	 * else, bail out before interpreting them.
+	 */
+	if (rqstp->rq_procinfo != &nfsd_version4.vs_proc[NFSPROC4_COMPOUND])
+		return false;
+
+	if (!cstate->minorversion)
+		return false;
+
+	if (cstate->spo_must_allowed)
+		return true;
+
+	/* Read the client pointer only once the request type is known. */
+	allow = &cstate->clp->cl_spo_must_allow;
+
+	opiter = resp->opcnt;
+	while (opiter < argp->opcnt) {
+		this = &argp->ops[opiter++];
+		if (test_bit(this->opnum, allow->u.longs) &&
+			cstate->clp->cl_mach_cred &&
+			nfsd4_mach_creds_match(cstate->clp, rqstp)) {
+			cstate->spo_must_allowed = true;
+			return true;
+		}
+	}
+	cstate->spo_must_allowed = false;
+	return false;
+}
+
+/*
+ * Upper bound, in bytes, on the encoded reply to @op.  Illegal or
+ * unsupported ops only need room for the op header; every other op must
+ * have an op_rsize_bop estimator in its descriptor.
+ */
+int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	const struct nfsd4_operation *opdesc;
+
+	if (op->opnum == OP_ILLEGAL || op->status == nfserr_notsupp)
+		return op_encode_hdr_size * sizeof(__be32);
+
+	opdesc = OPDESC(op);
+	BUG_ON(opdesc->op_rsize_bop == NULL);
+	return opdesc->op_rsize_bop(rqstp, op);
+}
+
+/*
+ * Log an error and trigger a one-time warning when @op is flagged
+ * OP_MODIFIES_SOMETHING; ops without that flag are ignored.
+ */
+void warn_on_nonidempotent_op(struct nfsd4_op *op)
+{
+	if (!(OPDESC(op)->op_flags & OP_MODIFIES_SOMETHING))
+		return;
+
+	pr_err("unable to encode reply to nonidempotent op %d (%s)\n",
+	       op->opnum, nfsd4_op_name(op->opnum));
+	WARN_ON_ONCE(1);
+}
+
+/*
+ * Map an opcode to a printable name for log and trace messages.
+ *
+ * nfsd4_ops[] is filled with designated initializers, so any slot without
+ * an entry has a NULL op_name.  Such opcodes get the same placeholder as
+ * opcodes beyond the end of the table, so this never returns NULL.
+ */
+static const char *nfsd4_op_name(unsigned opnum)
+{
+	const char *name = NULL;
+
+	if (opnum < ARRAY_SIZE(nfsd4_ops))
+		name = nfsd4_ops[opnum].op_name;
+	return name ? name : "unknown_operation";
+}
+
+/*
+ * Placeholder type used for the argument and result sizes of the NULL
+ * procedure below; nfsd4_voidres is simply another name for nfsd4_voidargs.
+ */
+#define nfsd4_voidres nfsd4_voidargs
+struct nfsd4_voidargs { int dummy; };
+
+/*
+ * The two RPC procedures of NFSv4, NULL and COMPOUND, each with its
+ * handler, XDR decode/encode routines and argument/result sizes. Both are
+ * marked RC_NOCACHE; only COMPOUND has a release callback.
+ */
+static const struct svc_procedure nfsd_procedures4[2] = {
+ [NFSPROC4_NULL] = {
+ .pc_func = nfsd4_proc_null,
+ .pc_decode = nfs4svc_decode_voidarg,
+ .pc_encode = nfs4svc_encode_voidres,
+ .pc_argsize = sizeof(struct nfsd4_voidargs),
+ .pc_ressize = sizeof(struct nfsd4_voidres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = 1,
+ },
+ [NFSPROC4_COMPOUND] = {
+ .pc_func = nfsd4_proc_compound,
+ .pc_decode = nfs4svc_decode_compoundargs,
+ .pc_encode = nfs4svc_encode_compoundres,
+ .pc_argsize = sizeof(struct nfsd4_compoundargs),
+ .pc_ressize = sizeof(struct nfsd4_compoundres),
+ .pc_release = nfsd4_release_compoundargs,
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = NFSD_BUFSIZE/4,
+ },
+};
+
+/* Counter array handed over through .vs_count, one slot per procedure. */
+static unsigned int nfsd_count4[ARRAY_SIZE(nfsd_procedures4)];
+
+/* Version 4 of the NFS service, built on the procedure table above. */
+const struct svc_version nfsd_version4 = {
+	.vs_vers		= 4,
+	.vs_nproc		= ARRAY_SIZE(nfsd_procedures4),
+	.vs_proc		= nfsd_procedures4,
+	.vs_count		= nfsd_count4,
+	.vs_dispatch		= nfsd_dispatch,
+	.vs_xdrsize		= NFS4_SVC_XDRSIZE,
+	.vs_rpcb_optnl		= true,
+	.vs_need_cong_ctrl	= true,
+};
+
+/*
+ * Local variables:
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
new file mode 100644
index 000000000..83c4e6883
--- /dev/null
+++ b/fs/nfsd/nfs4recover.c
@@ -0,0 +1,2169 @@
+/*
+* Copyright (c) 2004 The Regents of the University of Michigan.
+* Copyright (c) 2012 Jeff Layton <jlayton@redhat.com>
+* All rights reserved.
+*
+* Andy Adamson <andros@citi.umich.edu>
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* 1. Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+* 2. Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the distribution.
+* 3. Neither the name of the University nor the names of its
+* contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include <crypto/hash.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <net/net_namespace.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfsd/cld.h>
+
+#include "nfsd.h"
+#include "state.h"
+#include "vfs.h"
+#include "netns.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PROC
+
+/* Declarations */
+/*
+ * Method table implemented by each client-tracking backend in this file
+ * (the legacy recovery-directory tracker and the nfsdcld upcall trackers).
+ */
+struct nfsd4_client_tracking_ops {
+ /* set up / tear down the backend for one network namespace */
+ int (*init)(struct net *);
+ void (*exit)(struct net *);
+ /* add / drop the stable-storage record for a client */
+ void (*create)(struct nfs4_client *);
+ void (*remove)(struct nfs4_client *);
+ /* returns 0 when a record for the client is found */
+ int (*check)(struct nfs4_client *);
+ void (*grace_done)(struct nfsd_net *);
+ /* copied into cm_vers of every upcall message */
+ uint8_t version;
+ /* length of an upcall message and the only accepted downcall length */
+ size_t msglen;
+};
+
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops;
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2;
+
+/* Globals */
+static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
+
+/*
+ * Switch the current task to credentials whose fsuid/fsgid are global
+ * root, handing the previous credentials back through *original_creds so
+ * that nfs4_reset_creds() can restore them.
+ *
+ * Returns 0 on success or -ENOMEM if no credentials could be prepared.
+ */
+static int
+nfs4_save_creds(const struct cred **original_creds)
+{
+	struct cred *root_fs_cred = prepare_creds();
+
+	if (root_fs_cred == NULL)
+		return -ENOMEM;
+
+	root_fs_cred->fsgid = GLOBAL_ROOT_GID;
+	root_fs_cred->fsuid = GLOBAL_ROOT_UID;
+
+	*original_creds = override_creds(root_fs_cred);
+	/* drop the reference obtained from prepare_creds() */
+	put_cred(root_fs_cred);
+	return 0;
+}
+
+/* Restore the credentials previously returned by nfs4_save_creds(). */
+static void
+nfs4_reset_creds(const struct cred *original)
+{
+ revert_creds(original);
+}
+
+/*
+ * Render the 16 bytes at md5 as 32 lowercase hexadecimal characters in
+ * out, followed by a terminating NUL (33 bytes written in total).
+ */
+static void
+md5_to_hex(char *out, char *md5)
+{
+	static const char hexdigits[] = "0123456789abcdef";
+	int idx;
+
+	for (idx = 0; idx < 16; idx++) {
+		unsigned char byte = md5[idx];
+
+		out[2 * idx] = hexdigits[byte >> 4];
+		out[2 * idx + 1] = hexdigits[byte & 0x0f];
+	}
+	out[32] = '\0';
+}
+
+/*
+ * Derive the recovery-directory name for a client: the MD5 digest of the
+ * client name, written to dname as a hex string by md5_to_hex().
+ *
+ * Returns 0 on success, the error from allocating or running the "md5"
+ * transform, or -ENOMEM if the digest buffer cannot be allocated.
+ */
+static int
+nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)
+{
+	struct crypto_shash *md5_tfm;
+	struct xdr_netobj cksum;
+	int status;
+
+	dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
+			clname->len, clname->data);
+
+	md5_tfm = crypto_alloc_shash("md5", 0, 0);
+	if (IS_ERR(md5_tfm))
+		return PTR_ERR(md5_tfm);
+
+	cksum.len = crypto_shash_digestsize(md5_tfm);
+	cksum.data = kmalloc(cksum.len, GFP_KERNEL);
+	if (cksum.data != NULL) {
+		status = crypto_shash_tfm_digest(md5_tfm, clname->data,
+						 clname->len, cksum.data);
+		if (status == 0)
+			md5_to_hex(dname, cksum.data);
+	} else {
+		status = -ENOMEM;
+	}
+
+	/* kfree() tolerates the NULL left behind by a failed kmalloc() */
+	kfree(cksum.data);
+	crypto_free_shash(md5_tfm);
+	return status;
+}
+
+/*
+ * Report a failure to generate the legacy recovery-directory name for
+ * clp.  Every error is logged; only -ENOENT is treated as permanent and
+ * causes client tracking to be shut down for the client's namespace.
+ */
+static void
+legacy_recdir_name_error(struct nfs4_client *clp, int error)
+{
+	printk(KERN_ERR "NFSD: unable to generate recoverydir "
+			"name (%d).\n", error);
+
+	if (error != -ENOENT)
+		return;
+
+	/*
+	 * -ENOENT means the algorithm itself is unavailable (the crypto
+	 * libs will generally return this if FIPS is enabled as well), so
+	 * retrying is pointless: disable the recovery tracker altogether.
+	 */
+	printk(KERN_ERR "NFSD: disabling legacy clientid tracking. "
+			"Reboot recovery will not function correctly!\n");
+	nfsd4_client_tracking_exit(clp->net);
+}
+
+/*
+ * Add a reclaim record for clp, keyed by a private copy of the first len
+ * bytes of dname, and link the record back to clp.  Allocation failures
+ * are silently tolerated (a dprintk at most).
+ */
+static void
+__nfsd4_create_reclaim_record_grace(struct nfs4_client *clp,
+		const char *dname, int len, struct nfsd_net *nn)
+{
+	struct xdr_netobj princhash = { .len = 0, .data = NULL };
+	struct nfs4_client_reclaim *record;
+	struct xdr_netobj name;
+
+	name.len = len;
+	name.data = kmemdup(dname, len, GFP_KERNEL);
+	if (name.data == NULL) {
+		dprintk("%s: failed to allocate memory for name.data!\n",
+			__func__);
+		return;
+	}
+
+	record = nfs4_client_to_reclaim(name, princhash, nn);
+	if (record)
+		record->cr_clp = clp;
+	else
+		kfree(name.data);	/* copy was not taken over; free it */
+}
+
+/*
+ * Legacy tracker ->create method: record clp on stable storage by making
+ * a directory, named after the hex MD5 of the client name, inside the
+ * recovery directory held open in nn->rec_file.
+ *
+ * The NFSD4_CLIENT_STABLE flag is set up front (and left set even if the
+ * mkdir later fails); a client that already has it is skipped.  Failures
+ * are reported through the kernel log only.
+ */
+static void
+nfsd4_create_clid_dir(struct nfs4_client *clp)
+{
+	const struct cred *original_cred;
+	char dname[HEXDIR_LEN];
+	struct dentry *dir, *dentry;
+	int status;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return;
+	if (!nn->rec_file)
+		return;
+
+	status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+	if (status)
+		return legacy_recdir_name_error(clp, status);
+
+	/* directory operations below run with root fsuid/fsgid */
+	status = nfs4_save_creds(&original_cred);
+	if (status < 0)
+		return;
+
+	status = mnt_want_write_file(nn->rec_file);
+	if (status)
+		goto out_creds;
+
+	dir = nn->rec_file->f_path.dentry;
+	/* lock the parent */
+	inode_lock(d_inode(dir));
+
+	dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
+	if (IS_ERR(dentry)) {
+		status = PTR_ERR(dentry);
+		goto out_unlock;
+	}
+	if (d_really_is_positive(dentry))
+		/*
+		 * In the 4.1 case, where we're called from
+		 * reclaim_complete(), records from the previous reboot
+		 * may still be left, so this is OK.
+		 *
+		 * In the 4.0 case, we should never get here; but we may
+		 * as well be forgiving and just succeed silently.
+		 */
+		goto out_put;
+	status = vfs_mkdir(d_inode(dir), dentry, S_IRWXU);
+out_put:
+	dput(dentry);
+out_unlock:
+	inode_unlock(d_inode(dir));
+	if (status == 0) {
+		if (nn->in_grace)
+			__nfsd4_create_reclaim_record_grace(clp, dname,
+					HEXDIR_LEN, nn);
+		vfs_fsync(nn->rec_file, 0);
+	} else {
+		/*
+		 * Terminate the message with a newline so it is emitted as
+		 * a complete log line instead of being left open for the
+		 * next printk to continue.
+		 */
+		printk(KERN_ERR "NFSD: failed to write recovery record"
+				" (err %d); please check that %s exists"
+				" and is writeable\n", status,
+				user_recovery_dirname);
+	}
+	mnt_drop_write_file(nn->rec_file);
+out_creds:
+	nfs4_reset_creds(original_cred);
+}
+
+/*
+ * Callback applied by nfsd4_list_rec_dir() to each entry; it is called
+ * as f(dir, dentry, nn) with the recovery directory and the looked-up
+ * child.
+ */
+typedef int (recdir_func)(struct dentry *, struct dentry *, struct nfsd_net *);
+
+/* One directory entry name collected while iterating the recovery dir. */
+struct name_list {
+ char name[HEXDIR_LEN];
+ struct list_head list;
+};
+
+/* dir_context wrapper that accumulates name_list entries on ->names. */
+struct nfs4_dir_ctx {
+ struct dir_context ctx;
+ struct list_head names;
+};
+
+/*
+ * Directory-iteration actor: remember every entry whose name is exactly
+ * HEXDIR_LEN - 1 characters long by adding a NUL-terminated copy to the
+ * context's name list.  Other entries are skipped.
+ *
+ * Returns 0, or -ENOMEM if a list entry cannot be allocated.
+ */
+static int
+nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen,
+		loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct nfs4_dir_ctx *dir_ctx =
+		container_of(__ctx, struct nfs4_dir_ctx, ctx);
+	struct name_list *item;
+
+	if (namlen == HEXDIR_LEN - 1) {
+		item = kmalloc(sizeof(*item), GFP_KERNEL);
+		if (!item)
+			return -ENOMEM;
+
+		memcpy(item->name, name, HEXDIR_LEN - 1);
+		item->name[HEXDIR_LEN - 1] = '\0';
+		list_add(&item->list, &dir_ctx->names);
+	}
+	return 0;
+}
+
+/*
+ * Walk the recovery directory and apply f to every entry whose name has
+ * the expected length.
+ *
+ * Names are first collected with iterate_dir(); the callback is then run
+ * on each looked-up dentry with the directory inode locked and root fs
+ * credentials in effect.  Returns 0 or the first error encountered; once
+ * an error has been recorded no further callbacks are made.
+ */
+static int
+nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
+{
+ const struct cred *original_cred;
+ struct dentry *dir = nn->rec_file->f_path.dentry;
+ struct nfs4_dir_ctx ctx = {
+ .ctx.actor = nfsd4_build_namelist,
+ .names = LIST_HEAD_INIT(ctx.names)
+ };
+ struct name_list *entry, *tmp;
+ int status;
+
+ status = nfs4_save_creds(&original_cred);
+ if (status < 0)
+ return status;
+
+ /* rewind so the iteration starts from the first entry */
+ status = vfs_llseek(nn->rec_file, 0, SEEK_SET);
+ if (status < 0) {
+ nfs4_reset_creds(original_cred);
+ return status;
+ }
+
+ status = iterate_dir(nn->rec_file, &ctx.ctx);
+ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
+
+ /*
+ * If iterate_dir() failed, status is already non-zero and this loop
+ * only frees the collected names.
+ */
+ list_for_each_entry_safe(entry, tmp, &ctx.names, list) {
+ if (!status) {
+ struct dentry *dentry;
+ dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
+ if (IS_ERR(dentry)) {
+ status = PTR_ERR(dentry);
+ /* remaining entries are freed by the loop below */
+ break;
+ }
+ status = f(dir, dentry, nn);
+ dput(dentry);
+ }
+ list_del(&entry->list);
+ kfree(entry);
+ }
+ inode_unlock(d_inode(dir));
+ nfs4_reset_creds(original_cred);
+
+ /* free whatever was left on the list after an early break above */
+ list_for_each_entry_safe(entry, tmp, &ctx.names, list) {
+ dprintk("NFSD: %s. Left entry %s\n", __func__, entry->name);
+ list_del(&entry->list);
+ kfree(entry);
+ }
+ return status;
+}
+
+/*
+ * Remove the child directory called name (namlen bytes) from the
+ * recovery directory, holding the parent's inode lock for the duration.
+ *
+ * Returns 0 on success, -ENOENT if no such entry exists, or the error
+ * from the lookup or rmdir.
+ */
+static int
+nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
+{
+	struct dentry *parent = nn->rec_file->f_path.dentry;
+	struct dentry *victim;
+	int status;
+
+	dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
+
+	inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
+	victim = lookup_one_len(name, parent, namlen);
+	if (IS_ERR(victim)) {
+		status = PTR_ERR(victim);
+	} else {
+		if (d_really_is_negative(victim))
+			status = -ENOENT;
+		else
+			status = vfs_rmdir(d_inode(parent), victim);
+		dput(victim);
+	}
+	inode_unlock(d_inode(parent));
+	return status;
+}
+
+/*
+ * Look up the reclaim record keyed by the first len bytes of dname and,
+ * if one exists, remove it.  The lookup uses a temporary heap copy of the
+ * name; if that copy cannot be allocated nothing is removed.
+ */
+static void
+__nfsd4_remove_reclaim_record_grace(const char *dname, int len,
+		struct nfsd_net *nn)
+{
+	struct nfs4_client_reclaim *record;
+	struct xdr_netobj key;
+
+	key.len = len;
+	key.data = kmemdup(dname, len, GFP_KERNEL);
+	if (key.data == NULL) {
+		dprintk("%s: failed to allocate memory for name.data!\n",
+			__func__);
+		return;
+	}
+
+	record = nfsd4_find_reclaim_client(key, nn);
+	kfree(key.data);
+	if (record != NULL)
+		nfs4_remove_reclaim_record(record, nn);
+}
+
+/*
+ * Legacy tracker ->remove method: delete the recovery directory entry for
+ * clp.  Does nothing unless a recovery directory is open and the client
+ * is flagged NFSD4_CLIENT_STABLE.  Failures are only logged.
+ */
+static void
+nfsd4_remove_clid_dir(struct nfs4_client *clp)
+{
+ const struct cred *original_cred;
+ char dname[HEXDIR_LEN];
+ int status;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ if (!nn->rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return;
+
+ status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+ if (status)
+ return legacy_recdir_name_error(clp, status);
+
+ status = mnt_want_write_file(nn->rec_file);
+ if (status)
+ goto out;
+ /* the flag is cleared before the unlink is attempted, whatever its outcome */
+ clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+
+ status = nfs4_save_creds(&original_cred);
+ if (status < 0)
+ goto out_drop_write;
+
+ status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1, nn);
+ nfs4_reset_creds(original_cred);
+ if (status == 0) {
+ vfs_fsync(nn->rec_file, 0);
+ /* during grace, also drop the matching in-memory reclaim record */
+ if (nn->in_grace)
+ __nfsd4_remove_reclaim_record_grace(dname,
+ HEXDIR_LEN, nn);
+ }
+out_drop_write:
+ mnt_drop_write_file(nn->rec_file);
+out:
+ if (status)
+ printk("NFSD: Failed to remove expired client state directory"
+ " %.*s\n", HEXDIR_LEN, dname);
+}
+
+/*
+ * recdir_func used after the grace period: remove the recovery directory
+ * child unless the client it names has reclaimed state.
+ *
+ * Always returns 0 so that the directory walk continues regardless of
+ * individual failures.
+ */
+static int
+purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
+{
+	struct xdr_netobj name;
+	int status;
+
+	if (child->d_name.len != HEXDIR_LEN - 1) {
+		printk("%s: illegal name %pd in recovery directory\n",
+				__func__, child);
+		/* Keep trying; maybe the others are OK: */
+		return 0;
+	}
+
+	name.data = kmemdup_nul(child->d_name.name, child->d_name.len, GFP_KERNEL);
+	if (name.data == NULL) {
+		dprintk("%s: failed to allocate memory for name.data!\n",
+			__func__);
+		return 0;
+	}
+	name.len = HEXDIR_LEN;
+
+	if (!nfs4_has_reclaimed_state(name, nn)) {
+		status = vfs_rmdir(d_inode(parent), child);
+		if (status)
+			printk("failed to remove client recovery directory %pd\n",
+					child);
+	}
+
+	kfree(name.data);
+	/* Keep trying, success or failure: */
+	return 0;
+}
+
+/*
+ * Legacy tracker ->grace_done method: mark the grace period as over,
+ * run purge_old() over the recovery directory, and release the in-memory
+ * reclaim records.
+ */
+static void
+nfsd4_recdir_purge_old(struct nfsd_net *nn)
+{
+ int status;
+
+ nn->in_grace = false;
+ if (!nn->rec_file)
+ return;
+ status = mnt_want_write_file(nn->rec_file);
+ if (status)
+ goto out;
+ status = nfsd4_list_rec_dir(purge_old, nn);
+ if (status == 0)
+ vfs_fsync(nn->rec_file, 0);
+ mnt_drop_write_file(nn->rec_file);
+out:
+ /* reclaim records are released even when the purge could not run */
+ nfs4_release_reclaim(nn);
+ if (status)
+ printk("nfsd4: failed to purge old clients from recovery"
+ " directory %pD\n", nn->rec_file);
+}
+
+/*
+ * recdir_func used at startup: turn one recovery-directory entry into a
+ * reclaim record keyed by the entry's name (with an empty principal
+ * hash).
+ *
+ * Always returns 0 so that the directory walk continues; malformed names
+ * and allocation failures merely skip the entry.
+ */
+static int
+load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
+{
+	struct xdr_netobj princhash = { .len = 0, .data = NULL };
+	struct xdr_netobj name;
+
+	if (child->d_name.len != HEXDIR_LEN - 1) {
+		printk("%s: illegal name %pd in recovery directory\n",
+				__func__, child);
+		/* Keep trying; maybe the others are OK: */
+		return 0;
+	}
+
+	name.data = kmemdup_nul(child->d_name.name, child->d_name.len, GFP_KERNEL);
+	if (name.data == NULL) {
+		dprintk("%s: failed to allocate memory for name.data!\n",
+			__func__);
+		return 0;
+	}
+	name.len = HEXDIR_LEN;
+
+	/* free our copy only when no record was created from it */
+	if (nfs4_client_to_reclaim(name, princhash, nn) == NULL)
+		kfree(name.data);
+	return 0;
+}
+
+/*
+ * Populate the reclaim records from the recovery directory of net.
+ * Returns 0 when there is no recovery directory or loading succeeded,
+ * otherwise the error from the directory walk.
+ */
+static int
+nfsd4_recdir_load(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	int status = 0;
+
+	if (nn->rec_file) {
+		status = nfsd4_list_rec_dir(load_recdir, nn);
+		if (status)
+			printk("nfsd4: failed loading clients from recovery"
+				" directory %pD\n", nn->rec_file);
+	}
+	return status;
+}
+
+/*
+ * Open the recovery directory named by user_recovery_dirname and keep the
+ * resulting file in nn->rec_file as a long-lived reference.  On success
+ * the namespace is marked as being in grace.  Returns 0 or a negative
+ * errno.
+ */
+
+static int
+nfsd4_init_recdir(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ const struct cred *original_cred;
+ int status;
+
+ printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
+ user_recovery_dirname);
+
+ /* must not be called while a recovery directory is already held */
+ BUG_ON(nn->rec_file);
+
+ status = nfs4_save_creds(&original_cred);
+ if (status < 0) {
+ printk("NFSD: Unable to change credentials to find recovery"
+ " directory: error %d\n",
+ status);
+ return status;
+ }
+
+ nn->rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
+ if (IS_ERR(nn->rec_file)) {
+ printk("NFSD: unable to find recovery directory %s\n",
+ user_recovery_dirname);
+ status = PTR_ERR(nn->rec_file);
+ /* never leave an ERR_PTR behind; other code tests rec_file for NULL */
+ nn->rec_file = NULL;
+ }
+
+ nfs4_reset_creds(original_cred);
+ if (!status)
+ nn->in_grace = true;
+ return status;
+}
+
+/*
+ * Drop the reference on the recovery directory of net, if one is held,
+ * and clear nn->rec_file.
+ */
+static void
+nfsd4_shutdown_recdir(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	if (nn->rec_file) {
+		fput(nn->rec_file);
+		nn->rec_file = NULL;
+	}
+}
+
+/*
+ * Allocate the reclaim-string hash table for net and initialise each of
+ * its CLIENT_HASH_SIZE buckets as an empty list.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int
+nfs4_legacy_state_init(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct list_head *buckets;
+	int bucket;
+
+	buckets = kmalloc_array(CLIENT_HASH_SIZE, sizeof(struct list_head),
+				GFP_KERNEL);
+	nn->reclaim_str_hashtbl = buckets;
+	if (buckets == NULL)
+		return -ENOMEM;
+
+	for (bucket = 0; bucket < CLIENT_HASH_SIZE; bucket++)
+		INIT_LIST_HEAD(&buckets[bucket]);
+	nn->reclaim_str_hashtbl_size = 0;
+
+	return 0;
+}
+
+/*
+ * Free the reclaim-string hash table allocated by
+ * nfs4_legacy_state_init().  Only the bucket array is freed here.
+ */
+static void
+nfs4_legacy_state_shutdown(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ kfree(nn->reclaim_str_hashtbl);
+}
+
+/*
+ * Open the recovery directory for net and load its entries as reclaim
+ * records.  If loading fails the directory reference is dropped again.
+ * Returns 0 or a negative errno.
+ */
+static int
+nfsd4_load_reboot_recovery_data(struct net *net)
+{
+	int status = nfsd4_init_recdir(net);
+
+	if (status == 0) {
+		status = nfsd4_recdir_load(net);
+		if (status)
+			nfsd4_shutdown_recdir(net);
+	}
+	return status;
+}
+
+/*
+ * Legacy tracker ->init method.  Only the initial network namespace is
+ * supported; any other namespace is refused with -EINVAL.  Otherwise the
+ * reclaim hash table is set up and the recovery directory is loaded,
+ * undoing the former if the latter fails.
+ */
+static int
+nfsd4_legacy_tracking_init(struct net *net)
+{
+	int status;
+
+	/* XXX: The legacy code won't work in a container */
+	if (net != &init_net) {
+		pr_warn("NFSD: attempt to initialize legacy client tracking in a container ignored.\n");
+		return -EINVAL;
+	}
+
+	status = nfs4_legacy_state_init(net);
+	if (status)
+		return status;
+
+	status = nfsd4_load_reboot_recovery_data(net);
+	if (status) {
+		nfs4_legacy_state_shutdown(net);
+		return status;
+	}
+
+	printk("NFSD: Using legacy client tracking operations.\n");
+	return 0;
+}
+
+/*
+ * Legacy tracker ->exit method: release the reclaim records, drop the
+ * recovery directory reference, then free the reclaim hash table.
+ */
+static void
+nfsd4_legacy_tracking_exit(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ nfs4_release_reclaim(nn);
+ nfsd4_shutdown_recdir(net);
+ nfs4_legacy_state_shutdown(net);
+}
+
+/*
+ * Change the NFSv4 recovery directory to recdir.
+ *
+ * recdir must resolve (following symlinks) to an existing directory and
+ * must fit, including its terminating NUL, in user_recovery_dirname.
+ *
+ * Returns 0 on success, -ENAMETOOLONG if recdir is too long to store,
+ * -ENOTDIR if it does not name a directory, or the error returned by
+ * kern_path().  The stored name is left untouched on any failure.
+ */
+int
+nfs4_reset_recoverydir(char *recdir)
+{
+	int status;
+	struct path path;
+
+	/*
+	 * Bound the copy explicitly rather than relying on the path lookup
+	 * to have rejected over-long names.
+	 */
+	if (strlen(recdir) >= sizeof(user_recovery_dirname))
+		return -ENAMETOOLONG;
+
+	status = kern_path(recdir, LOOKUP_FOLLOW, &path);
+	if (status)
+		return status;
+	status = -ENOTDIR;
+	if (d_is_dir(path.dentry)) {
+		strscpy(user_recovery_dirname, recdir,
+			sizeof(user_recovery_dirname));
+		status = 0;
+	}
+	path_put(&path);
+	return status;
+}
+
+/*
+ * Return the currently configured recovery directory name.  The pointer
+ * refers to the file-static buffer itself, not to a copy.
+ */
+char *
+nfs4_recoverydir(void)
+{
+ return user_recovery_dirname;
+}
+
+/*
+ * Legacy tracker ->check method: decide whether clp has a record on
+ * stable storage.
+ *
+ * Returns 0 if the client is already flagged NFSD4_CLIENT_STABLE or a
+ * reclaim record matching its hashed name exists (in which case the flag
+ * is set and the record is linked to clp).  Returns the name-generation
+ * error if the hashed name cannot be built, and -ENOENT otherwise.
+ */
+static int
+nfsd4_check_legacy_client(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct nfs4_client_reclaim *record;
+	struct xdr_netobj key;
+	char dname[HEXDIR_LEN];
+	int status;
+
+	/* did we already find that this client is stable? */
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return 0;
+
+	status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+	if (status) {
+		legacy_recdir_name_error(clp, status);
+		return status;
+	}
+
+	/* look for it in the reclaim hashtable otherwise */
+	key.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL);
+	if (key.data == NULL) {
+		dprintk("%s: failed to allocate memory for name.data!\n",
+			__func__);
+		return -ENOENT;
+	}
+	key.len = HEXDIR_LEN;
+	record = nfsd4_find_reclaim_client(key, nn);
+	kfree(key.data);
+
+	if (record == NULL)
+		return -ENOENT;
+
+	set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+	record->cr_clp = clp;
+	return 0;
+}
+
+/*
+ * Method table for the legacy, recovery-directory based tracker.  It
+ * sends no upcall messages, hence msglen is 0.
+ */
+static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
+ .init = nfsd4_legacy_tracking_init,
+ .exit = nfsd4_legacy_tracking_exit,
+ .create = nfsd4_create_clid_dir,
+ .remove = nfsd4_remove_clid_dir,
+ .check = nfsd4_check_legacy_client,
+ .grace_done = nfsd4_recdir_purge_old,
+ .version = 1,
+ .msglen = 0,
+};
+
+/* Globals */
+#define NFSD_PIPE_DIR "nfsd"
+#define NFSD_CLD_PIPE "cld"
+
+/* per-net-ns structure for holding cld upcall info */
+struct cld_net {
+ /* rpc_pipefs pipe used to exchange messages with the daemon */
+ struct rpc_pipe *cn_pipe;
+ /* held while walking or modifying cn_list and while picking an xid */
+ spinlock_t cn_lock;
+ /* upcalls currently in flight (struct cld_upcall.cu_list) */
+ struct list_head cn_list;
+ /* next candidate transaction id for a new upcall */
+ unsigned int cn_xid;
+ /* set once a downcall delivers a name carrying the "hash:" prefix */
+ bool cn_has_legacy;
+ /* hash transform used to digest client principals (v2 operations) */
+ struct crypto_shash *cn_tfm;
+};
+
+/*
+ * One in-flight upcall: list linkage, the owning cld_net, a completion
+ * the sender waits on, and the message buffer itself.
+ */
+struct cld_upcall {
+ struct list_head cu_list;
+ struct cld_net *cu_net;
+ struct completion cu_done;
+ /* the same buffer viewed as bare header, v1 message or v2 message */
+ union {
+ struct cld_msg_hdr cu_hdr;
+ struct cld_msg cu_msg;
+ struct cld_msg_v2 cu_msg_v2;
+ } cu_u;
+};
+
+/*
+ * Queue cmsg (the message embedded in a struct cld_upcall) on pipe and
+ * block until the upcall's completion fires.
+ *
+ * Returns the negative error from queueing, the negative pipe-message
+ * errno if one was recorded, or otherwise the (non-negative) queueing
+ * result.
+ */
+static int
+__cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg, struct nfsd_net *nn)
+{
+	struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_u);
+	struct rpc_pipe_msg msg;
+	int ret;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.len = nn->client_tracking_ops->msglen;
+	msg.data = cmsg;
+
+	ret = rpc_queue_upcall(pipe, &msg);
+	if (ret >= 0) {
+		wait_for_completion(&cup->cu_done);
+
+		if (msg.errno < 0)
+			ret = msg.errno;
+	}
+	return ret;
+}
+
+/*
+ * Send an upcall, retrying for as long as the attempt fails with -EAGAIN
+ * (which occurs when the pipe is closed and reopened while there are
+ * upcalls queued).  Returns the result of the last attempt.
+ */
+static int
+cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg, struct nfsd_net *nn)
+{
+	int ret;
+
+	for (;;) {
+		ret = __cld_pipe_upcall(pipe, cmsg, nn);
+		if (ret != -EAGAIN)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * Handle a downcall whose status is -EINPROGRESS.  Only Cld_GraceStart is
+ * accepted: the daemon uses it to hand over one client record, which is
+ * added to the reclaim table.
+ *
+ * For tracker version >= 2 the message carries a client name and an
+ * optional principal hash; older versions carry the name only.  A name
+ * starting with "hash:" has the prefix stripped and marks the namespace
+ * as having legacy records.
+ *
+ * All lengths come from userspace and are validated against the size of
+ * the message fields they describe before anything is copied.
+ *
+ * Returns the tracker's message length on success, -EFAULT if userspace
+ * memory cannot be read, the command is not Cld_GraceStart or the record
+ * cannot be added, -EINVAL for a zero or oversized length, or the error
+ * from memdup_user().
+ */
+static ssize_t
+__cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
+		struct nfsd_net *nn)
+{
+	uint8_t cmd, princhashlen;
+	struct xdr_netobj name, princhash = { .len = 0, .data = NULL };
+	uint16_t namelen;
+	struct cld_net *cn = nn->cld_net;
+
+	if (get_user(cmd, &cmsg->cm_cmd)) {
+		dprintk("%s: error when copying cmd from userspace", __func__);
+		return -EFAULT;
+	}
+	if (cmd != Cld_GraceStart)
+		return -EFAULT;
+
+	if (nn->client_tracking_ops->version >= 2) {
+		const struct cld_clntinfo __user *ci;
+
+		ci = &cmsg->cm_u.cm_clntinfo;
+		if (get_user(namelen, &ci->cc_name.cn_len))
+			return -EFAULT;
+		if (namelen == 0 || namelen > sizeof(ci->cc_name.cn_id))
+			return -EINVAL;
+		if (get_user(princhashlen, &ci->cc_princhash.cp_len))
+			return -EFAULT;
+		if (princhashlen > sizeof(ci->cc_princhash.cp_data))
+			return -EINVAL;
+
+		name.data = memdup_user(&ci->cc_name.cn_id, namelen);
+		if (IS_ERR(name.data))
+			return PTR_ERR(name.data);
+		name.len = namelen;
+
+		if (princhashlen > 0) {
+			princhash.data = memdup_user(
+					&ci->cc_princhash.cp_data,
+					princhashlen);
+			if (IS_ERR(princhash.data)) {
+				kfree(name.data);
+				return PTR_ERR(princhash.data);
+			}
+			princhash.len = princhashlen;
+		}
+	} else {
+		const struct cld_name __user *cnm;
+
+		cnm = &cmsg->cm_u.cm_name;
+		if (get_user(namelen, &cnm->cn_len))
+			return -EFAULT;
+		if (namelen == 0 || namelen > sizeof(cnm->cn_id))
+			return -EINVAL;
+
+		name.data = memdup_user(&cnm->cn_id, namelen);
+		if (IS_ERR(name.data))
+			return PTR_ERR(name.data);
+		name.len = namelen;
+	}
+
+	/* strip the legacy marker in place and remember that we saw one */
+	if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) {
+		name.len = name.len - 5;
+		memmove(name.data, name.data + 5, name.len);
+		cn->cn_has_legacy = true;
+	}
+
+	/* the buffers stay ours only if no record was created from them */
+	if (!nfs4_client_to_reclaim(name, princhash, nn)) {
+		kfree(name.data);
+		kfree(princhash.data);
+		return -EFAULT;
+	}
+	return nn->client_tracking_ops->msglen;
+}
+
+/*
+ * rpc_pipefs ->downcall handler: process a reply written by the daemon.
+ *
+ * The message must be exactly the tracker's msglen bytes.  Its xid is
+ * matched against the in-flight upcalls; an -EINPROGRESS status is routed
+ * to __cld_pipe_inprogress_downcall() and leaves the upcall pending, any
+ * other status copies the reply into the upcall and wakes its sender.
+ *
+ * Returns mlen (or the in-progress handler's result) on success, -EINVAL
+ * for a bad length or unknown xid, -EFAULT if userspace cannot be read.
+ */
+static ssize_t
+cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
+{
+ struct cld_upcall *tmp, *cup;
+ struct cld_msg_hdr __user *hdr = (struct cld_msg_hdr __user *)src;
+ struct cld_msg_v2 __user *cmsg = (struct cld_msg_v2 __user *)src;
+ uint32_t xid;
+ struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
+ nfsd_net_id);
+ struct cld_net *cn = nn->cld_net;
+ int16_t status;
+
+ if (mlen != nn->client_tracking_ops->msglen) {
+ dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen,
+ nn->client_tracking_ops->msglen);
+ return -EINVAL;
+ }
+
+ /* copy just the xid so we can try to find that */
+ if (copy_from_user(&xid, &hdr->cm_xid, sizeof(xid)) != 0) {
+ dprintk("%s: error when copying xid from userspace", __func__);
+ return -EFAULT;
+ }
+
+ /*
+ * copy the status so we know whether to remove the upcall from the
+ * list (for -EINPROGRESS, we just want to make sure the xid is
+ * valid, not remove the upcall from the list)
+ */
+ if (get_user(status, &hdr->cm_status)) {
+ dprintk("%s: error when copying status from userspace", __func__);
+ return -EFAULT;
+ }
+
+ /* walk the list and find corresponding xid */
+ cup = NULL;
+ spin_lock(&cn->cn_lock);
+ list_for_each_entry(tmp, &cn->cn_list, cu_list) {
+ if (get_unaligned(&tmp->cu_u.cu_hdr.cm_xid) == xid) {
+ cup = tmp;
+ if (status != -EINPROGRESS)
+ list_del_init(&cup->cu_list);
+ break;
+ }
+ }
+ spin_unlock(&cn->cn_lock);
+
+ /* couldn't find upcall? */
+ if (!cup) {
+ dprintk("%s: couldn't find upcall -- xid=%u\n", __func__, xid);
+ return -EINVAL;
+ }
+
+ if (status == -EINPROGRESS)
+ return __cld_pipe_inprogress_downcall(cmsg, nn);
+
+ /*
+ * NOTE(review): on this failure the upcall has already been unlinked
+ * from cn_list above but cu_done is not completed here -- confirm
+ * that the waiter is woken by some other path.
+ */
+ if (copy_from_user(&cup->cu_u.cu_msg_v2, src, mlen) != 0)
+ return -EFAULT;
+
+ complete(&cup->cu_done);
+ return mlen;
+}
+
+/*
+ * rpc_pipefs ->destroy_msg handler.  A negative errno means the message
+ * is being torn down without a downcall having arrived, so the waiting
+ * sender is woken here; errno >= 0 means we got a downcall and there is
+ * nothing to do.
+ */
+static void
+cld_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+	struct cld_msg *cmsg = msg->data;
+	struct cld_upcall *cup = container_of(cmsg, struct cld_upcall,
+						 cu_u.cu_msg);
+
+	if (msg->errno < 0)
+		complete(&cup->cu_done);
+}
+
+/*
+ * Pipe operations for the cld pipe: the generic upcall routine plus the
+ * downcall and message-destruction handlers defined above.
+ */
+static const struct rpc_pipe_ops cld_upcall_ops = {
+ .upcall = rpc_pipe_generic_upcall,
+ .downcall = cld_pipe_downcall,
+ .destroy_msg = cld_pipe_destroy_msg,
+};
+
+/*
+ * Create the NFSD_CLD_PIPE pipe dentry under the NFSD_PIPE_DIR directory
+ * of the given rpc_pipefs superblock.
+ *
+ * Returns the result of rpc_mkpipe_dentry(), or ERR_PTR(-ENOENT) when
+ * the parent directory cannot be found.
+ */
+static struct dentry *
+nfsd4_cld_register_sb(struct super_block *sb, struct rpc_pipe *pipe)
+{
+	struct dentry *parent = rpc_d_lookup_sb(sb, NFSD_PIPE_DIR);
+	struct dentry *pipe_dentry;
+
+	if (!parent)
+		return ERR_PTR(-ENOENT);
+
+	pipe_dentry = rpc_mkpipe_dentry(parent, NFSD_CLD_PIPE, NULL, pipe);
+	dput(parent);
+	return pipe_dentry;
+}
+
+/* Unlink the pipe's dentry, if it has one. */
+static void
+nfsd4_cld_unregister_sb(struct rpc_pipe *pipe)
+{
+ if (pipe->dentry)
+ rpc_unlink(pipe->dentry);
+}
+
+/*
+ * Register the cld pipe in the rpc_pipefs instance of net.
+ *
+ * Returns NULL when the namespace has no rpc_pipefs superblock; otherwise
+ * whatever nfsd4_cld_register_sb() returns (a dentry or an ERR_PTR).
+ */
+static struct dentry *
+nfsd4_cld_register_net(struct net *net, struct rpc_pipe *pipe)
+{
+	struct super_block *sb = rpc_get_sb_net(net);
+	struct dentry *dentry = NULL;
+
+	if (sb) {
+		dentry = nfsd4_cld_register_sb(sb, pipe);
+		rpc_put_sb_net(net);
+	}
+	return dentry;
+}
+
+/*
+ * Remove the cld pipe from the rpc_pipefs instance of net.  Nothing is
+ * done when the namespace has no rpc_pipefs superblock.
+ */
+static void
+nfsd4_cld_unregister_net(struct net *net, struct rpc_pipe *pipe)
+{
+	struct super_block *sb = rpc_get_sb_net(net);
+
+	if (!sb)
+		return;
+
+	nfsd4_cld_unregister_sb(pipe);
+	rpc_put_sb_net(net);
+}
+
+/*
+ * Initialize rpc_pipefs pipe for communication with client tracking daemon.
+ *
+ * Allocates the per-namespace cld_net, creates its pipe data and
+ * registers the pipe dentry.  Does nothing if nn->cld_net is already set.
+ * Returns 0 on success or a negative errno (also logged).
+ */
+static int
+__nfsd4_init_cld_pipe(struct net *net)
+{
+ int ret;
+ struct dentry *dentry;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct cld_net *cn;
+
+ if (nn->cld_net)
+ return 0;
+
+ cn = kzalloc(sizeof(*cn), GFP_KERNEL);
+ if (!cn) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ cn->cn_pipe = rpc_mkpipe_data(&cld_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN);
+ if (IS_ERR(cn->cn_pipe)) {
+ ret = PTR_ERR(cn->cn_pipe);
+ goto err;
+ }
+ spin_lock_init(&cn->cn_lock);
+ INIT_LIST_HEAD(&cn->cn_list);
+
+ /* NOTE(review): a NULL return (no rpc_pipefs sb) passes this check and is stored as the pipe's dentry */
+ dentry = nfsd4_cld_register_net(net, cn->cn_pipe);
+ if (IS_ERR(dentry)) {
+ ret = PTR_ERR(dentry);
+ goto err_destroy_data;
+ }
+
+ cn->cn_pipe->dentry = dentry;
+ cn->cn_has_legacy = false;
+ nn->cld_net = cn;
+ return 0;
+
+err_destroy_data:
+ rpc_destroy_pipe_data(cn->cn_pipe);
+err:
+ /* cn is NULL here when its own allocation failed; kfree() accepts that */
+ kfree(cn);
+ printk(KERN_ERR "NFSD: unable to create nfsdcld upcall pipe (%d)\n",
+ ret);
+ return ret;
+}
+
+/*
+ * Set up the cld upcall pipe for net and, on success, announce that the
+ * old nfsdcld tracking operations are in use.  Returns 0 or a negative
+ * errno.
+ */
+static int
+nfsd4_init_cld_pipe(struct net *net)
+{
+	int status = __nfsd4_init_cld_pipe(net);
+
+	if (status == 0)
+		printk("NFSD: Using old nfsdcld client tracking operations.\n");
+	return status;
+}
+
+/*
+ * Tear down what __nfsd4_init_cld_pipe() set up: unregister and destroy
+ * the pipe, free the hash transform if one was allocated, and free the
+ * cld_net itself.  nn->cld_net is dereferenced unconditionally, so it
+ * must be set when this is called.
+ */
+static void
+nfsd4_remove_cld_pipe(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct cld_net *cn = nn->cld_net;
+
+ nfsd4_cld_unregister_net(net, cn->cn_pipe);
+ rpc_destroy_pipe_data(cn->cn_pipe);
+ if (cn->cn_tfm)
+ crypto_free_shash(cn->cn_tfm);
+ kfree(nn->cld_net);
+ nn->cld_net = NULL;
+}
+
+/*
+ * Allocate a zeroed upcall, give it an xid that no in-flight upcall is
+ * using, stamp it with the tracker version and add it to the in-flight
+ * list.  Returns the new upcall or NULL on allocation failure.
+ */
+static struct cld_upcall *
+alloc_cld_upcall(struct nfsd_net *nn)
+{
+ struct cld_upcall *new, *tmp;
+ struct cld_net *cn = nn->cld_net;
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return new;
+
+ /* FIXME: hard cap on number in flight? */
+restart_search:
+ spin_lock(&cn->cn_lock);
+ /* if the candidate xid is taken, bump it and rescan from the start */
+ list_for_each_entry(tmp, &cn->cn_list, cu_list) {
+ if (tmp->cu_u.cu_msg.cm_xid == cn->cn_xid) {
+ cn->cn_xid++;
+ spin_unlock(&cn->cn_lock);
+ goto restart_search;
+ }
+ }
+ /* still holding cn_lock: claim the xid and publish the upcall */
+ init_completion(&new->cu_done);
+ new->cu_u.cu_msg.cm_vers = nn->client_tracking_ops->version;
+ put_unaligned(cn->cn_xid++, &new->cu_u.cu_msg.cm_xid);
+ new->cu_net = cn;
+ list_add(&new->cu_list, &cn->cn_list);
+ spin_unlock(&cn->cn_lock);
+
+ dprintk("%s: allocated xid %u\n", __func__, new->cu_u.cu_msg.cm_xid);
+
+ return new;
+}
+
+/* Unlink an upcall from its cld_net's list under cn_lock and free it. */
+static void
+free_cld_upcall(struct cld_upcall *victim)
+{
+ struct cld_net *cn = victim->cu_net;
+
+ spin_lock(&cn->cn_lock);
+ list_del(&victim->cu_list);
+ spin_unlock(&cn->cn_lock);
+ kfree(victim);
+}
+
+/*
+ * Ask daemon to create a new record.
+ *
+ * Sends a Cld_Create upcall carrying the client name, unless the client
+ * is already flagged NFSD4_CLIENT_STABLE.  Failures are only logged.
+ */
+static void
+nfsd4_cld_create(struct nfs4_client *clp)
+{
+ int ret;
+ struct cld_upcall *cup;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+ struct cld_net *cn = nn->cld_net;
+
+ /* Don't upcall if it's already stored */
+ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return;
+
+ cup = alloc_cld_upcall(nn);
+ if (!cup) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ cup->cu_u.cu_msg.cm_cmd = Cld_Create;
+ cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+ memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+ clp->cl_name.len);
+
+ ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn);
+ if (!ret) {
+ /*
+ * The upcall itself went through: report the daemon's status.
+ * NOTE(review): the flag is set whatever that status is.
+ */
+ ret = cup->cu_u.cu_msg.cm_status;
+ set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ }
+
+ free_cld_upcall(cup);
+out_err:
+ if (ret)
+ printk(KERN_ERR "NFSD: Unable to create client "
+ "record on stable storage: %d\n", ret);
+}
+
+/*
+ * Ask daemon to create a new record (v2 message format).
+ *
+ * Like nfsd4_cld_create(), but the message additionally carries a digest
+ * of the client's principal, computed with the per-namespace transform,
+ * when the client has one.  Failures are only logged.
+ */
+static void
+nfsd4_cld_create_v2(struct nfs4_client *clp)
+{
+ int ret;
+ struct cld_upcall *cup;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+ struct cld_net *cn = nn->cld_net;
+ struct cld_msg_v2 *cmsg;
+ struct crypto_shash *tfm = cn->cn_tfm;
+ struct xdr_netobj cksum;
+ char *principal = NULL;
+
+ /* Don't upcall if it's already stored */
+ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return;
+
+ cup = alloc_cld_upcall(nn);
+ if (!cup) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ cmsg = &cup->cu_u.cu_msg_v2;
+ cmsg->cm_cmd = Cld_Create;
+ cmsg->cm_u.cm_clntinfo.cc_name.cn_len = clp->cl_name.len;
+ memcpy(cmsg->cm_u.cm_clntinfo.cc_name.cn_id, clp->cl_name.data,
+ clp->cl_name.len);
+ /* prefer the raw principal, fall back to the plain one */
+ if (clp->cl_cred.cr_raw_principal)
+ principal = clp->cl_cred.cr_raw_principal;
+ else if (clp->cl_cred.cr_principal)
+ principal = clp->cl_cred.cr_principal;
+ if (principal) {
+ cksum.len = crypto_shash_digestsize(tfm);
+ cksum.data = kmalloc(cksum.len, GFP_KERNEL);
+ if (cksum.data == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = crypto_shash_tfm_digest(tfm, principal, strlen(principal),
+ cksum.data);
+ if (ret) {
+ kfree(cksum.data);
+ goto out;
+ }
+ cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len;
+ memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data,
+ cksum.data, cksum.len);
+ kfree(cksum.data);
+ } else
+ /* no principal: send an empty hash */
+ cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0;
+
+ ret = cld_pipe_upcall(cn->cn_pipe, cmsg, nn);
+ if (!ret) {
+ ret = cmsg->cm_status;
+ set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ }
+
+out:
+ free_cld_upcall(cup);
+out_err:
+ if (ret)
+ pr_err("NFSD: Unable to create client record on stable storage: %d\n",
+ ret);
+}
+
+/*
+ * Ask daemon to remove a record.
+ *
+ * Sends a Cld_Remove upcall carrying the client name, unless the client
+ * is not flagged NFSD4_CLIENT_STABLE.  Failures are only logged.
+ */
+static void
+nfsd4_cld_remove(struct nfs4_client *clp)
+{
+ int ret;
+ struct cld_upcall *cup;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+ struct cld_net *cn = nn->cld_net;
+
+ /* Don't upcall if it's already removed */
+ if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return;
+
+ cup = alloc_cld_upcall(nn);
+ if (!cup) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ cup->cu_u.cu_msg.cm_cmd = Cld_Remove;
+ cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+ memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+ clp->cl_name.len);
+
+ ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn);
+ if (!ret) {
+ /* upcall delivered: report the daemon's status and clear the flag */
+ ret = cup->cu_u.cu_msg.cm_status;
+ clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ }
+
+ free_cld_upcall(cup);
+out_err:
+ if (ret)
+ printk(KERN_ERR "NFSD: Unable to remove client "
+ "record from stable storage: %d\n", ret);
+}
+
+/*
+ * ->check method for older nfsdcld's that do not allow us to "slurp" the
+ * clients from the tracking database during startup.
+ *
+ * Asks the daemon, via a Cld_Check upcall, whether a record exists for
+ * clp (which also updates the record's timestamp).  Returns 0 at once if
+ * the client is already flagged NFSD4_CLIENT_STABLE, -ENOMEM if no upcall
+ * can be allocated, the upcall error, or the status the daemon replied
+ * with.
+ */
+static int
+nfsd4_cld_check_v0(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct cld_net *cn = nn->cld_net;
+	struct cld_upcall *cup;
+	struct cld_msg *cmsg;
+	int ret;
+
+	/* Don't upcall if one was already stored during this grace pd */
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return 0;
+
+	cup = alloc_cld_upcall(nn);
+	if (cup == NULL) {
+		printk(KERN_ERR "NFSD: Unable to check client record on "
+				"stable storage: %d\n", -ENOMEM);
+		return -ENOMEM;
+	}
+
+	cmsg = &cup->cu_u.cu_msg;
+	cmsg->cm_cmd = Cld_Check;
+	cmsg->cm_u.cm_name.cn_len = clp->cl_name.len;
+	memcpy(cmsg->cm_u.cm_name.cn_id, clp->cl_name.data,
+			clp->cl_name.len);
+
+	ret = cld_pipe_upcall(cn->cn_pipe, cmsg, nn);
+	if (ret == 0) {
+		ret = cmsg->cm_status;
+		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+	}
+
+	free_cld_upcall(cup);
+	return ret;
+}
+
+/*
+ * ->check method for newer nfsdcld's that allow us to "slurp" the clients
+ * from the tracking database during startup.
+ *
+ * Looks for a record in the reclaim_str_hashtbl, first under the client's
+ * own name and then, if legacy records were loaded, under its hashed
+ * legacy name.  On a hit the record is linked to clp and 0 is returned;
+ * 0 is also returned if the client is already flagged
+ * NFSD4_CLIENT_STABLE.  Everything else yields -ENOENT.
+ */
+static int
+nfsd4_cld_check(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct cld_net *cn = nn->cld_net;
+	struct nfs4_client_reclaim *crp;
+	struct xdr_netobj name;
+	char dname[HEXDIR_LEN];
+
+	/* did we already find that this client is stable? */
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return 0;
+
+	/* look for it in the reclaim hashtable otherwise */
+	crp = nfsd4_find_reclaim_client(clp->cl_name, nn);
+
+	if (!crp && cn->cn_has_legacy) {
+		if (nfs4_make_rec_clidname(dname, &clp->cl_name))
+			return -ENOENT;
+
+		name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL);
+		if (!name.data) {
+			dprintk("%s: failed to allocate memory for name.data!\n",
+				__func__);
+			return -ENOENT;
+		}
+		name.len = HEXDIR_LEN;
+		crp = nfsd4_find_reclaim_client(name, nn);
+		kfree(name.data);
+	}
+
+	if (!crp)
+		return -ENOENT;
+
+	crp->cr_clp = clp;
+	return 0;
+}
+
+/*
+ * Like nfsd4_cld_check(), but when the matching reclaim record carries a
+ * principal hash (cr_princhash), additionally require that the digest of the
+ * client's principal, computed with cn_tfm, matches it.
+ *
+ * Returns 0 when a matching record is found (and ties the record to @clp),
+ * -ENOENT in every other case, including allocation and digest failures.
+ */
+static int
+nfsd4_cld_check_v2(struct nfs4_client *clp)
+{
+	struct nfs4_client_reclaim *crp;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct cld_net *cn = nn->cld_net;
+	int status;
+	char dname[HEXDIR_LEN];
+	struct xdr_netobj name;
+	struct crypto_shash *tfm = cn->cn_tfm;
+	struct xdr_netobj cksum;
+	char *principal = NULL;
+	bool mismatch;
+
+	/* did we already find that this client is stable? */
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return 0;
+
+	/* look for it in the reclaim hashtable otherwise */
+	crp = nfsd4_find_reclaim_client(clp->cl_name, nn);
+	if (crp)
+		goto found;
+
+	if (cn->cn_has_legacy) {
+		status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+		if (status)
+			return -ENOENT;
+
+		name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL);
+		if (!name.data) {
+			dprintk("%s: failed to allocate memory for name.data\n",
+				__func__);
+			return -ENOENT;
+		}
+		name.len = HEXDIR_LEN;
+		crp = nfsd4_find_reclaim_client(name, nn);
+		kfree(name.data);
+		if (crp)
+			goto found;
+
+	}
+	return -ENOENT;
+found:
+	if (crp->cr_princhash.len) {
+		/* Prefer the raw principal, fall back to the cooked one. */
+		if (clp->cl_cred.cr_raw_principal)
+			principal = clp->cl_cred.cr_raw_principal;
+		else if (clp->cl_cred.cr_principal)
+			principal = clp->cl_cred.cr_principal;
+		if (principal == NULL)
+			return -ENOENT;
+		cksum.len = crypto_shash_digestsize(tfm);
+		cksum.data = kmalloc(cksum.len, GFP_KERNEL);
+		if (cksum.data == NULL)
+			return -ENOENT;
+		status = crypto_shash_tfm_digest(tfm, principal,
+				strlen(principal), cksum.data);
+		/*
+		 * cksum.data holds only cksum.len bytes. A stored hash longer
+		 * than the digest can never match, and comparing it would
+		 * read past the end of the buffer, so reject it up front.
+		 * NOTE(review): cr_princhash is filled in outside this
+		 * function; its length is not validated here otherwise.
+		 */
+		mismatch = status ||
+			   crp->cr_princhash.len > cksum.len ||
+			   memcmp(crp->cr_princhash.data, cksum.data,
+				  crp->cr_princhash.len);
+		kfree(cksum.data);
+		if (mismatch)
+			return -ENOENT;
+	}
+	crp->cr_clp = clp;
+	return 0;
+}
+
+/*
+ * Send a Cld_GraceStart upcall. Returns 0 on success, -ENOMEM if no upcall
+ * could be allocated, the upcall error, or the status carried in the reply.
+ */
+static int
+nfsd4_cld_grace_start(struct nfsd_net *nn)
+{
+	struct cld_net *cn = nn->cld_net;
+	struct cld_upcall *cup;
+	int ret = -ENOMEM;
+
+	cup = alloc_cld_upcall(nn);
+	if (cup) {
+		cup->cu_u.cu_msg.cm_cmd = Cld_GraceStart;
+		ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn);
+		if (ret == 0)
+			ret = cup->cu_u.cu_msg.cm_status;
+		free_cld_upcall(cup);
+	}
+
+	if (ret)
+		dprintk("%s: Unable to get clients from userspace: %d\n",
+			__func__, ret);
+	return ret;
+}
+
+/*
+ * For older nfsdcld's that need cm_gracetime: send Cld_GraceDone with the
+ * boot time filled in. Failures are only logged.
+ */
+static void
+nfsd4_cld_grace_done_v0(struct nfsd_net *nn)
+{
+	struct cld_net *cn = nn->cld_net;
+	struct cld_upcall *cup;
+	int ret = -ENOMEM;
+
+	cup = alloc_cld_upcall(nn);
+	if (cup) {
+		cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone;
+		cup->cu_u.cu_msg.cm_u.cm_gracetime = nn->boot_time;
+		ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn);
+		if (ret == 0)
+			ret = cup->cu_u.cu_msg.cm_status;
+		free_cld_upcall(cup);
+	}
+
+	if (ret)
+		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
+}
+
+/*
+ * For newer nfsdcld's that do not need cm_gracetime. Whether or not the
+ * Cld_GraceDone upcall succeeds, nfs4_release_reclaim() is called to clear
+ * out the reclaim_str_hashtbl. Failures are only logged.
+ */
+static void
+nfsd4_cld_grace_done(struct nfsd_net *nn)
+{
+	struct cld_net *cn = nn->cld_net;
+	struct cld_upcall *cup;
+	int ret = -ENOMEM;
+
+	cup = alloc_cld_upcall(nn);
+	if (cup) {
+		cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone;
+		ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn);
+		if (ret == 0)
+			ret = cup->cu_u.cu_msg.cm_status;
+		free_cld_upcall(cup);
+	}
+
+	nfs4_release_reclaim(nn);
+	if (ret)
+		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
+}
+
+/*
+ * Allocate and initialise the per-net reclaim string hash table and reset
+ * the reclaim-complete bookkeeping. Returns 0 or -ENOMEM.
+ */
+static int
+nfs4_cld_state_init(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	int bucket;
+
+	nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE,
+						sizeof(struct list_head),
+						GFP_KERNEL);
+	if (nn->reclaim_str_hashtbl == NULL)
+		return -ENOMEM;
+
+	/* Every bucket starts out as an empty list. */
+	for (bucket = 0; bucket < CLIENT_HASH_SIZE; bucket++)
+		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[bucket]);
+
+	atomic_set(&nn->nr_reclaim_complete, 0);
+	nn->track_reclaim_completes = true;
+	nn->reclaim_str_hashtbl_size = 0;
+
+	return 0;
+}
+
+/*
+ * Undo nfs4_cld_state_init(): stop tracking reclaim completes and free the
+ * hash table array itself.
+ */
+static void
+nfs4_cld_state_shutdown(struct net *net)
+{
+	struct nfsd_net *nn;
+
+	nn = net_generic(net, nfsd_net_id);
+	nn->track_reclaim_completes = false;
+	kfree(nn->reclaim_str_hashtbl);
+}
+
+/* True when the cld pipe currently has at least one reader or writer. */
+static bool
+cld_running(struct nfsd_net *nn)
+{
+	struct rpc_pipe *cld_pipe = nn->cld_net->cn_pipe;
+
+	if (cld_pipe->nreaders)
+		return true;
+	return cld_pipe->nwriters;
+}
+
+/*
+ * Send a Cld_GetVersion upcall and pick the client tracking ops that match
+ * the version in the reply, clamped to the range 1..CLD_UPCALL_VERSION.
+ * Returns 0 on success, otherwise -ENOMEM, the upcall error, or the status
+ * carried in the reply.
+ */
+static int
+nfsd4_cld_get_version(struct nfsd_net *nn)
+{
+	struct cld_net *cn = nn->cld_net;
+	struct cld_upcall *cup;
+	uint8_t version;
+	int ret;
+
+	cup = alloc_cld_upcall(nn);
+	if (!cup) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	cup->cu_u.cu_msg.cm_cmd = Cld_GetVersion;
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn);
+	if (ret)
+		goto out_free;
+
+	ret = cup->cu_u.cu_msg.cm_status;
+	if (ret)
+		goto out_free;
+
+	version = cup->cu_u.cu_msg.cm_u.cm_version;
+	dprintk("%s: userspace returned version %u\n",
+		__func__, version);
+
+	/* Clamp to the versions this kernel knows how to speak. */
+	if (version > CLD_UPCALL_VERSION)
+		version = CLD_UPCALL_VERSION;
+	else if (version < 1)
+		version = 1;
+
+	if (version == 1)
+		nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
+	else if (version == 2)
+		nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v2;
+
+out_free:
+	free_cld_upcall(cup);
+out_err:
+	if (ret)
+		dprintk("%s: Unable to get version from userspace: %d\n",
+			__func__, ret);
+	return ret;
+}
+
+/*
+ * Bring up nfsdcld-based client tracking: set up the reclaim hash table and
+ * the pipe, wait briefly for nfsdcld to open it, allocate the sha256
+ * transform, then issue the GetVersion and GraceStart upcalls.
+ * Returns 0 on success or a negative errno after undoing the setup.
+ */
+static int
+nfsd4_cld_tracking_init(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct crypto_shash *tfm;
+	int retries = 10;
+	bool running;
+	int status;
+
+	status = nfs4_cld_state_init(net);
+	if (status)
+		return status;
+
+	status = __nfsd4_init_cld_pipe(net);
+	if (status)
+		goto err_shutdown;
+
+	/*
+	 * rpc pipe upcalls take 30 seconds to time out, so no upcall is
+	 * queued unless nfsdcld is known to be running; this must fail fast
+	 * so that nfsd4_client_tracking_init() can try the next client
+	 * tracking method. nfsdcld should already be running before nfsd is
+	 * started, so the wait here is only for nfsdcld to open the pipefs
+	 * file that was just created.
+	 */
+	for (;;) {
+		running = cld_running(nn);
+		if (running || !retries--)
+			break;
+		msleep(100);
+	}
+
+	if (!running) {
+		status = -ETIMEDOUT;
+		goto err_remove;
+	}
+
+	tfm = crypto_alloc_shash("sha256", 0, 0);
+	if (IS_ERR(tfm)) {
+		status = PTR_ERR(tfm);
+		goto err_remove;
+	}
+	nn->cld_net->cn_tfm = tfm;
+
+	/* A GetVersion failure is not fatal; it is only reported. */
+	status = nfsd4_cld_get_version(nn);
+	if (status == -EOPNOTSUPP)
+		pr_warn("NFSD: nfsdcld GetVersion upcall failed. Please upgrade nfsdcld.\n");
+
+	status = nfsd4_cld_grace_start(nn);
+	if (!status) {
+		printk("NFSD: Using nfsdcld client tracking operations.\n");
+		return 0;
+	}
+
+	if (status == -EOPNOTSUPP)
+		pr_warn("NFSD: nfsdcld GraceStart upcall failed. Please upgrade nfsdcld.\n");
+	nfs4_release_reclaim(nn);
+
+err_remove:
+	nfsd4_remove_cld_pipe(net);
+err_shutdown:
+	nfs4_cld_state_shutdown(net);
+	return status;
+}
+
+/*
+ * Tear down nfsdcld-based tracking: drop the reclaim records, remove the
+ * pipe, then release the hash table state.
+ */
+static void
+nfsd4_cld_tracking_exit(struct net *net)
+{
+	struct nfsd_net *nn;
+
+	nn = net_generic(net, nfsd_net_id);
+	nfs4_release_reclaim(nn);
+
+	nfsd4_remove_cld_pipe(net);
+	nfs4_cld_state_shutdown(net);
+}
+
+/*
+ * For older nfsdcld's.
+ *
+ * The _v0 check handler upcalls to nfsdcld (Cld_Check) and the _v0
+ * grace_done handler passes cm_gracetime with Cld_GraceDone.
+ * nfsd4_client_tracking_init() falls back to this table when the newer
+ * nfsdcld ops fail to initialize for a reason other than -ETIMEDOUT.
+ */
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v0 = {
+ .init = nfsd4_init_cld_pipe,
+ .exit = nfsd4_remove_cld_pipe,
+ .create = nfsd4_cld_create,
+ .remove = nfsd4_cld_remove,
+ .check = nfsd4_cld_check_v0,
+ .grace_done = nfsd4_cld_grace_done_v0,
+ .version = 1,
+ .msglen = sizeof(struct cld_msg),
+};
+
+/*
+ * For newer nfsdcld's.
+ *
+ * init issues the GetVersion and GraceStart upcalls; check consults the
+ * reclaim_str_hashtbl instead of upcalling; grace_done also releases the
+ * reclaim records. This is the first table nfsd4_client_tracking_init()
+ * tries.
+ */
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
+ .init = nfsd4_cld_tracking_init,
+ .exit = nfsd4_cld_tracking_exit,
+ .create = nfsd4_cld_create,
+ .remove = nfsd4_cld_remove,
+ .check = nfsd4_cld_check,
+ .grace_done = nfsd4_cld_grace_done,
+ .version = 1,
+ .msglen = sizeof(struct cld_msg),
+};
+
+/*
+ * v2 create/check ops include the principal, if available.
+ *
+ * Installed by nfsd4_cld_get_version() when the (clamped) version reported
+ * by userspace is 2. Messages use struct cld_msg_v2.
+ */
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2 = {
+ .init = nfsd4_cld_tracking_init,
+ .exit = nfsd4_cld_tracking_exit,
+ .create = nfsd4_cld_create_v2,
+ .remove = nfsd4_cld_remove,
+ .check = nfsd4_cld_check_v2,
+ .grace_done = nfsd4_cld_grace_done,
+ .version = 2,
+ .msglen = sizeof(struct cld_msg_v2),
+};
+
+/*
+ * upcall via usermodehelper
+ *
+ * cltrack_prog is the helper binary run by nfsd4_umh_cltrack_upcall(); an
+ * empty string disables the upcall. The *_ENV_PREFIX macros below are the
+ * "NAME=" prefixes of the environment strings built for that helper.
+ */
+static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack";
+module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog),
+ S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cltrack_prog, "Path to the nfsdcltrack upcall program");
+
+static bool cltrack_legacy_disable;
+module_param(cltrack_legacy_disable, bool, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cltrack_legacy_disable,
+ "Disable legacy recoverydir conversion. Default: false");
+
+#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR="
+#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR="
+#define HAS_SESSION_ENV_PREFIX "NFSDCLTRACK_CLIENT_HAS_SESSION="
+#define GRACE_START_ENV_PREFIX "NFSDCLTRACK_GRACE_START="
+
+/*
+ * Build the "NFSDCLTRACK_LEGACY_TOPDIR=<recoverydir>" environment string.
+ * Returns a kmalloc'ed string the caller must kfree, or NULL when legacy
+ * conversion is disabled, allocation fails, or the output was truncated.
+ */
+static char *
+nfsd4_cltrack_legacy_topdir(void)
+{
+	size_t buflen;
+	char *envstr;
+	int written;
+
+	if (cltrack_legacy_disable)
+		return NULL;
+
+	/* sizeof() of the prefix literal already counts the trailing NUL */
+	buflen = sizeof(LEGACY_TOPDIR_ENV_PREFIX) + strlen(nfs4_recoverydir());
+
+	envstr = kmalloc(buflen, GFP_KERNEL);
+	if (!envstr)
+		return NULL;
+
+	written = snprintf(envstr, buflen, LEGACY_TOPDIR_ENV_PREFIX "%s",
+			   nfs4_recoverydir());
+	if (written < buflen)
+		return envstr;
+
+	/* just return nothing if output was truncated */
+	kfree(envstr);
+	return NULL;
+}
+
+/*
+ * Build the "NFSDCLTRACK_LEGACY_RECDIR=<recoverydir>/<clidname>" environment
+ * string, where <clidname> is written by nfs4_make_rec_clidname().
+ * Returns a kmalloc'ed string the caller must kfree, or NULL when legacy
+ * conversion is disabled or any step fails.
+ */
+static char *
+nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name)
+{
+	size_t buflen;
+	char *envstr;
+	int written;
+	int status;
+
+	if (cltrack_legacy_disable)
+		return NULL;
+
+	/* +1 is for '/' between "topdir" and "recdir" */
+	buflen = strlen(LEGACY_RECDIR_ENV_PREFIX) +
+		 strlen(nfs4_recoverydir()) + 1 + HEXDIR_LEN;
+
+	envstr = kmalloc(buflen, GFP_KERNEL);
+	if (!envstr)
+		return NULL;
+
+	written = snprintf(envstr, buflen, LEGACY_RECDIR_ENV_PREFIX "%s/",
+			   nfs4_recoverydir());
+	if (written > (buflen - HEXDIR_LEN))
+		goto out_free;	/* output would be truncated */
+
+	/* Append the client name right after the '/'. */
+	status = nfs4_make_rec_clidname(envstr + written, name);
+	if (status)
+		goto out_free;
+
+	return envstr;
+
+out_free:
+	kfree(envstr);
+	return NULL;
+}
+
+/*
+ * Build the "NFSDCLTRACK_CLIENT_HAS_SESSION=Y|N" environment string; 'Y'
+ * when cl_minorversion is non-zero. Returns a kmalloc'ed string the caller
+ * must kfree, or NULL on allocation failure or truncation.
+ */
+static char *
+nfsd4_cltrack_client_has_session(struct nfs4_client *clp)
+{
+	/* prefix (sizeof counts its NUL) + one Y/N character */
+	size_t buflen = sizeof(HAS_SESSION_ENV_PREFIX) + 1;
+	char flag = clp->cl_minorversion ? 'Y' : 'N';
+	char *envstr;
+	int written;
+
+	envstr = kmalloc(buflen, GFP_KERNEL);
+	if (!envstr)
+		return NULL;
+
+	written = snprintf(envstr, buflen, HAS_SESSION_ENV_PREFIX "%c", flag);
+	if (written < buflen)
+		return envstr;
+
+	/* just return nothing if output was truncated */
+	kfree(envstr);
+	return NULL;
+}
+
+/*
+ * Build the "NFSDCLTRACK_GRACE_START=<seconds>" environment string.
+ * Returns a kmalloc'ed string the caller must kfree, or NULL on allocation
+ * failure or truncation.
+ */
+static char *
+nfsd4_cltrack_grace_start(time64_t grace_start)
+{
+	/* prefix (sizeof counts its NUL) + max width of int64_t string */
+	size_t buflen = sizeof(GRACE_START_ENV_PREFIX) + 22;
+	char *envstr;
+	int written;
+
+	envstr = kmalloc(buflen, GFP_KERNEL);
+	if (!envstr)
+		return NULL;
+
+	written = snprintf(envstr, buflen, GRACE_START_ENV_PREFIX "%lld",
+			   grace_start);
+	if (written < buflen)
+		return envstr;
+
+	/* just return nothing if output was truncated */
+	kfree(envstr);
+	return NULL;
+}
+
+/*
+ * Run cltrack_prog as "<prog> <cmd> [<arg>]" with up to two environment
+ * strings and wait for it to finish.
+ *
+ * @cmd:  subcommand passed as argv[1]
+ * @arg:  optional argument passed as argv[2] (may be NULL)
+ * @env0: first environment string (may be NULL, which also hides @env1
+ *        because envp is NULL-terminated)
+ * @env1: second environment string (may be NULL)
+ *
+ * Returns -EACCES without running anything when cltrack_prog is empty,
+ * otherwise the result of call_usermodehelper(). An -ENOENT or -EACCES
+ * result blanks cltrack_prog, disabling further upcalls.
+ */
+static int
+nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1)
+{
+	char *envp[3];
+	char *argv[4];
+	int ret;
+
+	if (unlikely(!cltrack_prog[0])) {
+		dprintk("%s: cltrack_prog is disabled\n", __func__);
+		return -EACCES;
+	}
+
+	dprintk("%s: cmd: %s\n", __func__, cmd);
+	dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)");
+	dprintk("%s: env0: %s\n", __func__, env0 ? env0 : "(null)");
+	dprintk("%s: env1: %s\n", __func__, env1 ? env1 : "(null)");
+
+	envp[0] = env0;
+	envp[1] = env1;
+	envp[2] = NULL;
+
+	argv[0] = (char *)cltrack_prog;
+	argv[1] = cmd;
+	argv[2] = arg;
+	argv[3] = NULL;
+
+	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	/* Log before cltrack_prog may be blanked so the name is still shown. */
+	dprintk("%s: %s return value: %d\n", __func__, cltrack_prog, ret);
+
+	/*
+	 * Disable the upcall mechanism if we're getting an ENOENT or EACCES
+	 * error. The admin can re-enable it on the fly by using sysfs
+	 * once the problem has been fixed.
+	 */
+	if (ret == -ENOENT || ret == -EACCES) {
+		dprintk("NFSD: %s was not found or isn't executable (%d). "
+			"Setting cltrack_prog to blank string!\n",
+			cltrack_prog, ret);
+		cltrack_prog[0] = '\0';
+	}
+
+	return ret;
+}
+
+/*
+ * Return a newly allocated, NUL-terminated hex rendering of @srclen bytes
+ * at @src, or NULL if the allocation fails. The caller frees the result.
+ */
+static char *
+bin_to_hex_dup(const unsigned char *src, int srclen)
+{
+	/* two hex digits per byte, +1 for the terminating NUL */
+	char *hex = kzalloc((srclen * 2) + 1, GFP_KERNEL);
+
+	if (hex)
+		bin2hex(hex, src, srclen);
+	return hex;
+}
+
+/*
+ * Initialise UMH-based client tracking by running the helper's "init"
+ * command with the grace-start environment string.
+ *
+ * Returns -EINVAL for any net namespace other than init_net, otherwise the
+ * result of the upcall.
+ */
+static int
+nfsd4_umh_cltrack_init(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	char *grace_start;
+	int ret;
+
+	/* XXX: The usermode helper is not working in containers yet. */
+	if (net != &init_net) {
+		pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n");
+		return -EINVAL;
+	}
+
+	/* Only build the env string once we know the upcall will be made. */
+	grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
+	ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL);
+	kfree(grace_start);
+	if (!ret)
+		printk("NFSD: Using UMH upcall client tracking operations.\n");
+	return ret;
+}
+
+/*
+ * Take the per-client upcall bit lock (NFSD4_CLIENT_UPCALL_LOCK in
+ * cl_flags), sleeping uninterruptibly until it is available.
+ */
+static void
+nfsd4_cltrack_upcall_lock(struct nfs4_client *clp)
+{
+	unsigned long *flags = &clp->cl_flags;
+
+	wait_on_bit_lock(flags, NFSD4_CLIENT_UPCALL_LOCK, TASK_UNINTERRUPTIBLE);
+}
+
+static void
+nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp)
+{
+ smp_mb__before_atomic(); /* barrier ahead of clearing the lock bit */
+ clear_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags); /* drop the bit taken by nfsd4_cltrack_upcall_lock() */
+ smp_mb__after_atomic(); /* barrier between the clear and the wakeup */
+ wake_up_bit(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK); /* wake tasks waiting on this bit */
+}
+
+/*
+ * Record @clp on stable storage via the helper's "create" command and set
+ * NFSD4_CLIENT_STABLE when the upcall succeeds.
+ */
+static void
+nfsd4_umh_cltrack_create(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	char *clid_hex, *session_env, *grace_env;
+	int err;
+
+	/*
+	 * With v4.0 clients, there's little difference in outcome between a
+	 * create and check operation, and this function can be entered
+	 * multiple times per client (once for each openowner). So, for v4.0
+	 * clients skip upcalling once the client has been recorded on stable
+	 * storage.
+	 *
+	 * For v4.1+ clients, the outcome of the two operations is different,
+	 * so the create upcall must always be made. v4.1+ clients call this
+	 * on RECLAIM_COMPLETE though, so there should only be a single
+	 * create upcall per client.
+	 */
+	if (clp->cl_minorversion == 0 &&
+	    test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return;
+
+	clid_hex = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+	if (!clid_hex) {
+		dprintk("%s: can't allocate memory for upcall!\n", __func__);
+		return;
+	}
+
+	/* Either env string may be NULL; the upcall copes with that. */
+	session_env = nfsd4_cltrack_client_has_session(clp);
+	grace_env = nfsd4_cltrack_grace_start(nn->boot_time);
+
+	nfsd4_cltrack_upcall_lock(clp);
+	err = nfsd4_umh_cltrack_upcall("create", clid_hex, session_env,
+				       grace_env);
+	if (err == 0)
+		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+	nfsd4_cltrack_upcall_unlock(clp);
+
+	kfree(session_env);
+	kfree(grace_env);
+	kfree(clid_hex);
+}
+
+/*
+ * Remove @clp's record via the helper's "remove" command. Does nothing
+ * unless the client is marked NFSD4_CLIENT_STABLE; the bit is cleared only
+ * when the upcall succeeds.
+ */
+static void
+nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
+{
+	char *clid_hex;
+
+	if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return;
+
+	clid_hex = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+	if (!clid_hex) {
+		dprintk("%s: can't allocate memory for upcall!\n", __func__);
+		return;
+	}
+
+	nfsd4_cltrack_upcall_lock(clp);
+	/* Re-check under the lock before upcalling. */
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) {
+		if (nfsd4_umh_cltrack_upcall("remove", clid_hex, NULL, NULL) == 0)
+			clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+	}
+	nfsd4_cltrack_upcall_unlock(clp);
+
+	kfree(clid_hex);
+}
+
+/*
+ * Check for @clp's record via the helper's "check" command. Returns 0 when
+ * the client is (or becomes) marked NFSD4_CLIENT_STABLE, -ENOMEM when the
+ * hex id cannot be allocated, otherwise the upcall's result.
+ */
+static int
+nfsd4_umh_cltrack_check(struct nfs4_client *clp)
+{
+	char *clid_hex, *session_env, *legacy_env;
+	int ret = 0;
+
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return 0;
+
+	clid_hex = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+	if (!clid_hex) {
+		dprintk("%s: can't allocate memory for upcall!\n", __func__);
+		return -ENOMEM;
+	}
+
+	session_env = nfsd4_cltrack_client_has_session(clp);
+	legacy_env = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
+
+	nfsd4_cltrack_upcall_lock(clp);
+	/* Re-check under the lock; upcall only if still not stable. */
+	if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) {
+		ret = nfsd4_umh_cltrack_upcall("check", clid_hex, session_env,
+					       legacy_env);
+		if (ret == 0)
+			set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+	}
+	nfsd4_cltrack_upcall_unlock(clp);
+
+	kfree(session_env);
+	kfree(legacy_env);
+	kfree(clid_hex);
+
+	return ret;
+}
+
+/*
+ * Run the helper's "gracedone" command with the boot time as its argument
+ * and the legacy topdir (if any) in the environment. The upcall result is
+ * ignored.
+ */
+static void
+nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn)
+{
+	char timestr[22]; /* FIXME: better way to determine max size? */
+	char *legacy_env;
+
+	sprintf(timestr, "%lld", nn->boot_time);
+
+	legacy_env = nfsd4_cltrack_legacy_topdir();
+	nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy_env, NULL);
+	kfree(legacy_env);
+}
+
+static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
+ .init = nfsd4_umh_cltrack_init,
+ .exit = NULL, /* optional hook: nfsd4_client_tracking_exit() skips it when NULL */
+ .create = nfsd4_umh_cltrack_create,
+ .remove = nfsd4_umh_cltrack_remove,
+ .check = nfsd4_umh_cltrack_check,
+ .grace_done = nfsd4_umh_cltrack_grace_done,
+ .version = 1,
+ .msglen = 0,
+};
+
+/*
+ * Pick and initialise a client tracking method for @net. Unless a method
+ * is already set, try in order: nfsdcld, older nfsdcld (skipped when the
+ * first attempt timed out), the UMH upcall, and finally the legacy
+ * recoverydir ops. On failure client_tracking_ops is reset to NULL and the
+ * error is returned.
+ */
+int
+nfsd4_client_tracking_init(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct path path;
+	int status;
+
+	/* just run the init if the method is already decided */
+	if (nn->client_tracking_ops)
+		goto do_init;
+
+	/* First, try to use nfsdcld */
+	nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
+	status = nn->client_tracking_ops->init(net);
+	if (status == 0)
+		return 0;
+
+	/* Anything but a timeout: give the older nfsdcld ops a chance. */
+	if (status != -ETIMEDOUT) {
+		nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v0;
+		status = nn->client_tracking_ops->init(net);
+		if (status == 0)
+			return 0;
+	}
+
+	/*
+	 * Next, try the UMH upcall.
+	 */
+	nn->client_tracking_ops = &nfsd4_umh_tracking_ops;
+	status = nn->client_tracking_ops->init(net);
+	if (status == 0)
+		return 0;
+
+	/*
+	 * Finally, see if the recoverydir exists and is a directory.
+	 * If it is, then use the legacy ops. A path that resolves to a
+	 * non-directory is rejected here; a failed lookup still falls
+	 * through to the legacy init below.
+	 */
+	nn->client_tracking_ops = &nfsd4_legacy_tracking_ops;
+	if (kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path) == 0) {
+		bool is_dir = d_is_dir(path.dentry);
+
+		path_put(&path);
+		if (!is_dir) {
+			status = -EINVAL;
+			goto out;
+		}
+	}
+
+do_init:
+	status = nn->client_tracking_ops->init(net);
+out:
+	if (status) {
+		printk(KERN_WARNING "NFSD: Unable to initialize client "
+				    "recovery tracking! (%d)\n", status);
+		nn->client_tracking_ops = NULL;
+	}
+	return status;
+}
+
+/*
+ * Shut down the active client tracking method, if any: call its optional
+ * exit hook and clear client_tracking_ops.
+ */
+void
+nfsd4_client_tracking_exit(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	if (!nn->client_tracking_ops)
+		return;
+
+	if (nn->client_tracking_ops->exit)
+		nn->client_tracking_ops->exit(net);
+	nn->client_tracking_ops = NULL;
+}
+
+/* Forward to the active tracking method's create op; no-op without one. */
+void
+nfsd4_client_record_create(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (!nn->client_tracking_ops)
+		return;
+	nn->client_tracking_ops->create(clp);
+}
+
+/* Forward to the active tracking method's remove op; no-op without one. */
+void
+nfsd4_client_record_remove(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (!nn->client_tracking_ops)
+		return;
+	nn->client_tracking_ops->remove(clp);
+}
+
+/*
+ * Forward to the active tracking method's check op. Returns -EOPNOTSUPP
+ * when no tracking method is set.
+ */
+int
+nfsd4_client_record_check(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (!nn->client_tracking_ops)
+		return -EOPNOTSUPP;
+
+	return nn->client_tracking_ops->check(clp);
+}
+
+/* Forward to the active tracking method's grace_done op; no-op without one. */
+void
+nfsd4_record_grace_done(struct nfsd_net *nn)
+{
+	if (!nn->client_tracking_ops)
+		return;
+	nn->client_tracking_ops->grace_done(nn);
+}
+
+/*
+ * rpc_pipefs notifier callback. On RPC_PIPEFS_MOUNT the cld pipe is
+ * registered on the new superblock; on RPC_PIPEFS_UMOUNT it is unregistered
+ * if it has a dentry. Returns 0 when the module reference cannot be taken
+ * or no cld_net exists, -ENOTSUPP for unknown events, or the registration
+ * error.
+ */
+static int
+rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr)
+{
+	struct super_block *sb = ptr;
+	struct net *net = sb->s_fs_info;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct cld_net *cn = nn->cld_net;
+	struct dentry *dentry;
+	int ret = 0;
+
+	if (!try_module_get(THIS_MODULE))
+		return 0;
+
+	if (!cn)
+		goto out_put;
+
+	switch (event) {
+	case RPC_PIPEFS_MOUNT:
+		dentry = nfsd4_cld_register_sb(sb, cn->cn_pipe);
+		if (IS_ERR(dentry))
+			ret = PTR_ERR(dentry);
+		else
+			cn->cn_pipe->dentry = dentry;
+		break;
+	case RPC_PIPEFS_UMOUNT:
+		if (cn->cn_pipe->dentry)
+			nfsd4_cld_unregister_sb(cn->cn_pipe);
+		break;
+	default:
+		ret = -ENOTSUPP;
+		break;
+	}
+
+out_put:
+	module_put(THIS_MODULE);
+	return ret;
+}
+
+static struct notifier_block nfsd4_cld_block = {
+ .notifier_call = rpc_pipefs_event, /* handles RPC_PIPEFS_MOUNT / RPC_PIPEFS_UMOUNT */
+};
+
+/*
+ * Register the cld rpc_pipefs notifier. Warns if nfsd_net_id is still zero
+ * at this point.
+ */
+int
+register_cld_notifier(void)
+{
+	int ret;
+
+	WARN_ON(!nfsd_net_id);
+	ret = rpc_pipefs_notifier_register(&nfsd4_cld_block);
+	return ret;
+}
+
+/* Unregister the cld rpc_pipefs notifier. */
+void
+unregister_cld_notifier(void)
+{
+	struct notifier_block *nb = &nfsd4_cld_block;
+
+	rpc_pipefs_notifier_unregister(nb);
+}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
new file mode 100644
index 000000000..d402ca0b5
--- /dev/null
+++ b/fs/nfsd/nfs4state.c
@@ -0,0 +1,7588 @@
+/*
+* Copyright (c) 2001 The Regents of the University of Michigan.
+* All rights reserved.
+*
+* Kendrick Smith <kmsmith@umich.edu>
+* Andy Adamson <kandros@umich.edu>
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* 1. Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+* 2. Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the distribution.
+* 3. Neither the name of the University nor the names of its
+* contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/ratelimit.h>
+#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/jhash.h>
+#include <linux/string_helpers.h>
+#include "xdr4.h"
+#include "xdr4cb.h"
+#include "vfs.h"
+#include "current_stateid.h"
+
+#include "netns.h"
+#include "pnfs.h"
+#include "filecache.h"
+#include "trace.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PROC
+
+#define all_ones {{~0,~0},~0} /* initializer setting every si_opaque member to ~0 */
+static const stateid_t one_stateid = {
+ .si_generation = ~0,
+ .si_opaque = all_ones,
+};
+static const stateid_t zero_stateid = {
+ /* all fields zero */
+};
+static const stateid_t currentstateid = {
+ .si_generation = 1, /* remaining fields are zero */
+};
+static const stateid_t close_stateid = {
+ .si_generation = 0xffffffffU, /* remaining fields are zero */
+};
+
+static u64 current_sessionid = 1;
+
+#define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t)))
+#define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t)))
+#define CURRENT_STATEID(stateid) (!memcmp((stateid), &currentstateid, sizeof(stateid_t)))
+#define CLOSE_STATEID(stateid) (!memcmp((stateid), &close_stateid, sizeof(stateid_t)))
+
+/* forward declarations */
+static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner);
+static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
+void nfsd4_end_grace(struct nfsd_net *nn);
+static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps);
+
+/* Locking: */
+
+/*
+ * Currently used for the del_recall_lru and file hash table. In an
+ * effort to decrease the scope of the client_mutex, this spinlock may
+ * eventually cover more:
+ */
+static DEFINE_SPINLOCK(state_lock);
+
+enum nfsd4_st_mutex_lock_subclass {
+ OPEN_STATEID_MUTEX = 0,
+ LOCK_STATEID_MUTEX = 1,
+};
+
+/*
+ * A waitqueue for all in-progress 4.0 CLOSE operations that are waiting for
+ * the refcount on the open stateid to drop.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(close_wq);
+
+/*
+ * A waitqueue where a writer to clients/#/ctl destroying a client can
+ * wait for cl_rpc_users to drop to 0 and then for the client to be
+ * unhashed.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(expiry_wq);
+
+static struct kmem_cache *client_slab;
+static struct kmem_cache *openowner_slab;
+static struct kmem_cache *lockowner_slab;
+static struct kmem_cache *file_slab;
+static struct kmem_cache *stateid_slab;
+static struct kmem_cache *deleg_slab;
+static struct kmem_cache *odstate_slab;
+
+static void free_session(struct nfsd4_session *);
+
+static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops;
+
+/* True once NFS4_SESSION_DEAD has been set in the session's flags. */
+static bool is_session_dead(struct nfsd4_session *ses)
+{
+	return (ses->se_flags & NFS4_SESSION_DEAD) != 0;
+}
+
+/*
+ * Mark @ses dead unless it has more references than the @ref_held_by_me
+ * the caller accounts for, in which case nfserr_jukebox is returned.
+ */
+static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me)
+{
+	if (atomic_read(&ses->se_ref) <= ref_held_by_me) {
+		ses->se_flags |= NFS4_SESSION_DEAD;
+		return nfs_ok;
+	}
+	return nfserr_jukebox;
+}
+
+/* A client counts as expired when its cl_time is zero. */
+static bool is_client_expired(struct nfs4_client *clp)
+{
+	return !clp->cl_time;
+}
+
+/*
+ * Take an rpc-user reference on @clp. Requires client_lock; returns
+ * nfserr_expired without taking a reference if the client has expired.
+ */
+static __be32 get_client_locked(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	lockdep_assert_held(&nn->client_lock);
+
+	if (!is_client_expired(clp)) {
+		atomic_inc(&clp->cl_rpc_users);
+		return nfs_ok;
+	}
+	return nfserr_expired;
+}
+
+/*
+ * Move @clp to the tail of the client LRU and stamp it with the current
+ * boottime. Warns and does nothing for an already expired client.
+ *
+ * must be called under the client_lock
+ */
+static inline void
+renew_client_locked(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (!is_client_expired(clp)) {
+		list_move_tail(&clp->cl_lru, &nn->client_lru);
+		clp->cl_time = ktime_get_boottime_seconds();
+		return;
+	}
+
+	WARN_ON(1);
+	printk("%s: client (clientid %08x/%08x) already expired\n",
+		__func__,
+		clp->cl_clientid.cl_boot,
+		clp->cl_clientid.cl_id);
+}
+
+/*
+ * Drop an rpc-user reference with client_lock held. When the last one
+ * goes, either renew the client or, if it has expired, wake everyone
+ * waiting on expiry_wq.
+ */
+static void put_client_renew_locked(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	lockdep_assert_held(&nn->client_lock);
+
+	if (!atomic_dec_and_test(&clp->cl_rpc_users))
+		return;
+
+	if (is_client_expired(clp))
+		wake_up_all(&expiry_wq);
+	else
+		renew_client_locked(clp);
+}
+
+/*
+ * Drop an rpc-user reference; client_lock is taken only when the count
+ * reaches zero. Then either renew the client or, if it has expired, wake
+ * everyone waiting on expiry_wq.
+ */
+static void put_client_renew(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (!atomic_dec_and_lock(&clp->cl_rpc_users, &nn->client_lock))
+		return;
+
+	if (is_client_expired(clp))
+		wake_up_all(&expiry_wq);
+	else
+		renew_client_locked(clp);
+
+	spin_unlock(&nn->client_lock);
+}
+
+/*
+ * Take a reference on @ses and on its client. Fails with nfserr_badsession
+ * for a dead session, or with whatever get_client_locked() reports.
+ */
+static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses)
+{
+	__be32 status;
+
+	if (is_session_dead(ses))
+		return nfserr_badsession;
+
+	status = get_client_locked(ses->se_client);
+	if (status == nfs_ok)
+		atomic_inc(&ses->se_ref);
+	return status;
+}
+
+/*
+ * Drop a session reference with client_lock held, freeing the session if
+ * that was the last reference and the session is dead, then drop the
+ * matching client reference.
+ */
+static void nfsd4_put_session_locked(struct nfsd4_session *ses)
+{
+	/* Grab the client first: @ses may be freed below. */
+	struct nfs4_client *clp = ses->se_client;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	lockdep_assert_held(&nn->client_lock);
+
+	if (atomic_dec_and_test(&ses->se_ref)) {
+		if (is_session_dead(ses))
+			free_session(ses);
+	}
+	put_client_renew_locked(clp);
+}
+
+/* Lock-taking wrapper around nfsd4_put_session_locked(). */
+static void nfsd4_put_session(struct nfsd4_session *ses)
+{
+	struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
+
+	spin_lock(&nn->client_lock);
+	nfsd4_put_session_locked(ses);
+	spin_unlock(&nn->client_lock);
+}
+
+/*
+ * Find the first blocked lock of @lo whose filehandle matches @fh, unlink
+ * it from both of its lists and return it (after locks_delete_block()).
+ * Returns NULL when nothing matches.
+ */
+static struct nfsd4_blocked_lock *
+find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
+			struct nfsd_net *nn)
+{
+	struct nfsd4_blocked_lock *nbl, *match = NULL;
+
+	spin_lock(&nn->blocked_locks_lock);
+	list_for_each_entry(nbl, &lo->lo_blocked, nbl_list) {
+		if (!fh_match(fh, &nbl->nbl_fh))
+			continue;
+		list_del_init(&nbl->nbl_list);
+		list_del_init(&nbl->nbl_lru);
+		match = nbl;
+		break;
+	}
+	spin_unlock(&nn->blocked_locks_lock);
+
+	if (match)
+		locks_delete_block(&match->nbl_lock);
+	return match;
+}
+
+/*
+ * Return the existing blocked lock for (@lo, @fh), or allocate and
+ * initialise a fresh one. Returns NULL only if the allocation fails.
+ */
+static struct nfsd4_blocked_lock *
+find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
+			struct nfsd_net *nn)
+{
+	struct nfsd4_blocked_lock *nbl;
+
+	nbl = find_blocked_lock(lo, fh, nn);
+	if (nbl)
+		return nbl;
+
+	nbl = kmalloc(sizeof(*nbl), GFP_KERNEL);
+	if (!nbl)
+		return NULL;
+
+	INIT_LIST_HEAD(&nbl->nbl_list);
+	INIT_LIST_HEAD(&nbl->nbl_lru);
+	fh_copy_shallow(&nbl->nbl_fh, fh);
+	locks_init_lock(&nbl->nbl_lock);
+	nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client,
+			&nfsd4_cb_notify_lock_ops,
+			NFSPROC4_CLNT_CB_NOTIFY_LOCK);
+	return nbl;
+}
+
+/*
+ * Detach @nbl's file_lock from whatever it blocks on, release the lock's
+ * private data and free the structure.
+ */
+static void
+free_blocked_lock(struct nfsd4_blocked_lock *nbl)
+{
+	struct file_lock *fl = &nbl->nbl_lock;
+
+	locks_delete_block(fl);
+	locks_release_private(fl);
+	kfree(nbl);
+}
+
+/*
+ * Free every blocked lock owned by @lo. Entries are first moved to a
+ * private list under blocked_locks_lock and freed after it is dropped.
+ */
+static void
+remove_blocked_locks(struct nfs4_lockowner *lo)
+{
+	struct nfs4_client *clp = lo->lo_owner.so_client;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct nfsd4_blocked_lock *nbl;
+	LIST_HEAD(reaplist);
+
+	/* Dequeue all blocked locks */
+	spin_lock(&nn->blocked_locks_lock);
+	while ((nbl = list_first_entry_or_null(&lo->lo_blocked,
+					       struct nfsd4_blocked_lock,
+					       nbl_list)) != NULL) {
+		list_del_init(&nbl->nbl_list);
+		list_move(&nbl->nbl_lru, &reaplist);
+	}
+	spin_unlock(&nn->blocked_locks_lock);
+
+	/* Now free them */
+	while ((nbl = list_first_entry_or_null(&reaplist,
+					       struct nfsd4_blocked_lock,
+					       nbl_lru)) != NULL) {
+		list_del_init(&nbl->nbl_lru);
+		free_blocked_lock(nbl);
+	}
+}
+
+/* Callback prepare hook: call locks_delete_block() on the owning nbl's lock. */
+static void
+nfsd4_cb_notify_lock_prepare(struct nfsd4_callback *cb)
+{
+	struct nfsd4_blocked_lock *nbl;
+
+	nbl = container_of(cb, struct nfsd4_blocked_lock, nbl_cb);
+	locks_delete_block(&nbl->nbl_lock);
+}
+
+/*
+ * Callback done hook. Returns 0 after scheduling a one second delay when
+ * the task status is -NFS4ERR_DELAY, and 1 for every other status.
+ */
+static int
+nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task)
+{
+	/*
+	 * Since this is just an optimization, we don't try very hard if it
+	 * turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and
+	 * just quit trying on anything else.
+	 */
+	if (task->tk_status != -NFS4ERR_DELAY)
+		return 1;
+
+	rpc_delay(task, 1 * HZ);
+	return 0;
+}
+
+/* Callback release hook: free the blocked lock that embeds @cb. */
+static void
+nfsd4_cb_notify_lock_release(struct nfsd4_callback *cb)
+{
+	free_blocked_lock(container_of(cb, struct nfsd4_blocked_lock, nbl_cb));
+}
+
+static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = {
+ .prepare = nfsd4_cb_notify_lock_prepare, /* calls locks_delete_block() on nbl_lock */
+ .done = nfsd4_cb_notify_lock_done, /* delays and returns 0 on -NFS4ERR_DELAY, else 1 */
+ .release = nfsd4_cb_notify_lock_release, /* frees the nfsd4_blocked_lock */
+};
+
+/* Bump so_count and return @sop, so the call can be chained. */
+static inline struct nfs4_stateowner *
+nfs4_get_stateowner(struct nfs4_stateowner *sop)
+{
+	atomic_t *refs = &sop->so_count;
+
+	atomic_inc(refs);
+	return sop;
+}
+
+/* Return 1 when @sop's owner string equals @owner byte for byte, else 0. */
+static int
+same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner)
+{
+	if (sop->so_owner.len != owner->len)
+		return 0;
+	return memcmp(sop->so_owner.data, owner->data, owner->len) == 0;
+}
+
+/*
+ * Search bucket @hashval of @clp's owner-string hash table for an open
+ * owner matching open->op_owner. Returns it with a reference taken, or
+ * NULL. Requires cl_lock.
+ */
+static struct nfs4_openowner *
+find_openstateowner_str_locked(unsigned int hashval, struct nfsd4_open *open,
+			struct nfs4_client *clp)
+{
+	struct list_head *bucket = &clp->cl_ownerstr_hashtbl[hashval];
+	struct nfs4_stateowner *so;
+
+	lockdep_assert_held(&clp->cl_lock);
+
+	list_for_each_entry(so, bucket, so_strhash) {
+		if (so->so_is_open_owner &&
+		    same_owner_str(so, &open->op_owner))
+			return openowner(nfs4_get_stateowner(so));
+	}
+	return NULL;
+}
+
+/* Lock-taking wrapper around find_openstateowner_str_locked(). */
+static struct nfs4_openowner *
+find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open,
+			struct nfs4_client *clp)
+{
+	struct nfs4_openowner *owner;
+
+	spin_lock(&clp->cl_lock);
+	owner = find_openstateowner_str_locked(hashval, open, clp);
+	spin_unlock(&clp->cl_lock);
+
+	return owner;
+}
+
+/*
+ * Hash @nbytes bytes starting at @ptr: for each byte the running value is
+ * multiplied by 37 and the byte is added (arithmetic wraps in u32).
+ */
+static inline u32
+opaque_hashval(const void *ptr, int nbytes)
+{
+	const unsigned char *byte = ptr;
+	u32 hash = 0;
+
+	while (nbytes != 0) {
+		nbytes--;
+		hash = hash * 37 + *byte++;
+	}
+	return hash;
+}
+
+/*
+ * RCU callback queued by put_nfs4_file(): return the nfs4_file that
+ * embeds @rcu to file_slab.
+ */
+static void nfsd4_free_file_rcu(struct rcu_head *rcu)
+{
+	kmem_cache_free(file_slab, container_of(rcu, struct nfs4_file, fi_rcu));
+}
+
+/*
+ * Drop a reference on @fi. When the last reference goes away the file is
+ * removed from its hash chain under state_lock and freed via call_rcu().
+ */
+void
+put_nfs4_file(struct nfs4_file *fi)
+{
+	might_lock(&state_lock);
+
+	/* Not the final reference: nothing more to do, lock was not taken. */
+	if (!refcount_dec_and_lock(&fi->fi_ref, &state_lock))
+		return;
+
+	hlist_del_rcu(&fi->fi_hash);
+	spin_unlock(&state_lock);
+
+	/* Both lists are expected to be empty by now. */
+	WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
+	WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
+	call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
+}
+
+/*
+ * Return a new reference (via nfsd_file_get()) to the nfsd_file cached in
+ * slot @oflag of f->fi_fds, or NULL when that slot is empty.
+ */
+static struct nfsd_file *
+__nfs4_get_fd(struct nfs4_file *f, int oflag)
+{
+	return f->fi_fds[oflag] ? nfsd_file_get(f->fi_fds[oflag]) : NULL;
+}
+
+/*
+ * Get a reference to a file usable for writing: the O_WRONLY slot is tried
+ * first, then O_RDWR. Caller must hold f->fi_lock. May return NULL.
+ */
+static struct nfsd_file *
+find_writeable_file_locked(struct nfs4_file *f)
+{
+	struct nfsd_file *nf;
+
+	lockdep_assert_held(&f->fi_lock);
+
+	nf = __nfs4_get_fd(f, O_WRONLY);
+	if (nf)
+		return nf;
+	return __nfs4_get_fd(f, O_RDWR);
+}
+
+/* Same as find_writeable_file_locked(), taking f->fi_lock around the lookup. */
+static struct nfsd_file *
+find_writeable_file(struct nfs4_file *f)
+{
+	struct nfsd_file *nf;
+
+	spin_lock(&f->fi_lock);
+	nf = find_writeable_file_locked(f);
+	spin_unlock(&f->fi_lock);
+	return nf;
+}
+
+/*
+ * Get a reference to a file usable for reading: the O_RDONLY slot is tried
+ * first, then O_RDWR. Caller must hold f->fi_lock. May return NULL.
+ */
+static struct nfsd_file *
+find_readable_file_locked(struct nfs4_file *f)
+{
+	struct nfsd_file *nf;
+
+	lockdep_assert_held(&f->fi_lock);
+
+	nf = __nfs4_get_fd(f, O_RDONLY);
+	if (nf)
+		return nf;
+	return __nfs4_get_fd(f, O_RDWR);
+}
+
+/* Same as find_readable_file_locked(), taking f->fi_lock around the lookup. */
+static struct nfsd_file *
+find_readable_file(struct nfs4_file *f)
+{
+	struct nfsd_file *nf;
+
+	spin_lock(&f->fi_lock);
+	nf = find_readable_file_locked(f);
+	spin_unlock(&f->fi_lock);
+	return nf;
+}
+
+/*
+ * Get a reference to any open file cached on @f, preferring O_RDWR, then
+ * O_WRONLY, then O_RDONLY. Takes f->fi_lock for the lookup.
+ *
+ * Returns NULL when @f is NULL or when no slot is populated.
+ */
+struct nfsd_file *
+find_any_file(struct nfs4_file *f)
+{
+	struct nfsd_file *nf;
+
+	if (!f)
+		return NULL;
+
+	spin_lock(&f->fi_lock);
+	nf = __nfs4_get_fd(f, O_RDWR);
+	if (!nf)
+		nf = __nfs4_get_fd(f, O_WRONLY);
+	if (!nf)
+		nf = __nfs4_get_fd(f, O_RDONLY);
+	spin_unlock(&f->fi_lock);
+
+	return nf;
+}
+
+/*
+ * Return the first populated entry of f->fi_fds in the order O_RDWR,
+ * O_WRONLY, O_RDONLY, or NULL if all are empty. Unlike find_any_file()
+ * no reference is taken on the result. Caller must hold f->fi_lock.
+ */
+static struct nfsd_file *find_any_file_locked(struct nfs4_file *f)
+{
+	static const int preference[] = { O_RDWR, O_WRONLY, O_RDONLY };
+	unsigned int i;
+
+	lockdep_assert_held(&f->fi_lock);
+
+	for (i = 0; i < sizeof(preference) / sizeof(preference[0]); i++) {
+		if (f->fi_fds[preference[i]])
+			return f->fi_fds[preference[i]];
+	}
+	return NULL;
+}
+
+/*
+ * Return f->fi_deleg_file (possibly NULL) without taking a reference.
+ * Caller must hold f->fi_lock.
+ */
+static struct nfsd_file *find_deleg_file_locked(struct nfs4_file *f)
+{
+	lockdep_assert_held(&f->fi_lock);
+
+	return f->fi_deleg_file;
+}
+
+/*
+ * Delegation accounting: num_delegations is incremented by
+ * alloc_init_deleg() and decremented by nfs4_free_deleg() (or immediately
+ * when alloc_init_deleg() fails); alloc_init_deleg() refuses to go above
+ * max_delegations.
+ */
+static atomic_long_t num_delegations;
+unsigned long max_delegations;
+
+/*
+ * Open owner state (share locks)
+ */
+
+/* hash tables for lock and open owners */
+#define OWNER_HASH_BITS 8
+#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS)
+#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1)
+
+/*
+ * Map an owner name to a bucket index: opaque_hashval() over the whole
+ * name, reduced with OWNER_HASH_MASK.
+ */
+static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
+{
+	return opaque_hashval(ownername->data, ownername->len) &
+	       OWNER_HASH_MASK;
+}
+
+/* hash table for nfs4_file */
+#define FILE_HASH_BITS 8
+#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
+
+/*
+ * Full-width hash of a filehandle: jhash2() over fh_pad, with the length
+ * given as XDR_QUADLEN(fh_size) and an initial value of 0.
+ */
+static unsigned int nfsd_fh_hashval(struct knfsd_fh *fh)
+{
+	unsigned int nwords = XDR_QUADLEN(fh->fh_size);
+
+	return jhash2(fh->fh_base.fh_pad, nwords, 0);
+}
+
+/* Reduce nfsd_fh_hashval() to an index below FILE_HASH_SIZE. */
+static unsigned int file_hashval(struct knfsd_fh *fh)
+{
+	const unsigned int mask = FILE_HASH_SIZE - 1;
+
+	return nfsd_fh_hashval(fh) & mask;
+}
+
+/* Hash table with FILE_HASH_SIZE buckets; put_nfs4_file() unlinks entries from it. */
+static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
+
+/*
+ * Bump the per-file access counters for each share-access bit set in
+ * @access: fi_access[O_WRONLY] for WRITE, fi_access[O_RDONLY] for READ.
+ * Caller must hold fp->fi_lock. No validation is done here.
+ */
+static void
+__nfs4_file_get_access(struct nfs4_file *fp, u32 access)
+{
+	const bool wants_write = (access & NFS4_SHARE_ACCESS_WRITE) != 0;
+	const bool wants_read = (access & NFS4_SHARE_ACCESS_READ) != 0;
+
+	lockdep_assert_held(&fp->fi_lock);
+
+	if (wants_write)
+		atomic_inc(&fp->fi_access[O_WRONLY]);
+	if (wants_read)
+		atomic_inc(&fp->fi_access[O_RDONLY]);
+}
+
+/*
+ * Validate @access and, if acceptable, account for it on @fp.
+ *
+ * Returns nfserr_inval when @access has bits outside
+ * NFS4_SHARE_ACCESS_BOTH, nfserr_share_denied when it overlaps the file's
+ * current deny mode, and nfs_ok after the counters were bumped.
+ * Caller must hold fp->fi_lock.
+ */
+static __be32
+nfs4_file_get_access(struct nfs4_file *fp, u32 access)
+{
+	lockdep_assert_held(&fp->fi_lock);
+
+	/* Reject access modes that make no sense. */
+	if (access & ~NFS4_SHARE_ACCESS_BOTH)
+		return nfserr_inval;
+
+	/* Reject anything an existing deny mode forbids. */
+	if (access & fp->fi_share_deny)
+		return nfserr_share_denied;
+
+	__nfs4_file_get_access(fp, access);
+	return nfs_ok;
+}
+
+/*
+ * Check whether deny mode @deny can be applied to @fp.
+ *
+ * Returns nfs_ok when @deny is 0 or conflicts with nothing,
+ * nfserr_inval when it has bits outside NFS4_SHARE_DENY_BOTH, and
+ * nfserr_share_denied when the file currently has a nonzero access count
+ * for a mode that @deny wants to forbid.
+ */
+static __be32 nfs4_file_check_deny(struct nfs4_file *fp, u32 deny)
+{
+	/* Common case: no deny mode at all. */
+	if (!deny)
+		return nfs_ok;
+
+	/* Does this deny mode make sense? */
+	if (deny & ~NFS4_SHARE_DENY_BOTH)
+		return nfserr_inval;
+
+	if ((deny & NFS4_SHARE_DENY_READ) &&
+	    atomic_read(&fp->fi_access[O_RDONLY]))
+		return nfserr_share_denied;
+
+	if ((deny & NFS4_SHARE_DENY_WRITE) &&
+	    atomic_read(&fp->fi_access[O_WRONLY]))
+		return nfserr_share_denied;
+
+	return nfs_ok;
+}
+
+/*
+ * Drop one access count for @oflag on @fp. When that count reaches zero
+ * the cached file for @oflag is released, and if the counter for the
+ * other access mode is zero as well the O_RDWR file is released too.
+ * The nfsd_file_put() calls happen after fi_lock has been dropped.
+ */
+static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
+{
+	struct nfsd_file *mode_file;
+	struct nfsd_file *rdwr_file = NULL;
+
+	might_lock(&fp->fi_lock);
+
+	/* Counter still nonzero: the lock was not taken, nothing to release. */
+	if (!atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock))
+		return;
+
+	/* Detach the per-mode file while holding fi_lock. */
+	mode_file = fp->fi_fds[oflag];
+	fp->fi_fds[oflag] = NULL;
+
+	/* 1 - oflag is the index of the other access counter. */
+	if (atomic_read(&fp->fi_access[1 - oflag]) == 0) {
+		rdwr_file = fp->fi_fds[O_RDWR];
+		fp->fi_fds[O_RDWR] = NULL;
+	}
+	spin_unlock(&fp->fi_lock);
+
+	if (mode_file)
+		nfsd_file_put(mode_file);
+	if (rdwr_file)
+		nfsd_file_put(rdwr_file);
+}
+
+/*
+ * Release the access counts on @fp for every share-access bit set in
+ * @access (WRITE first, then READ). Bits outside NFS4_SHARE_ACCESS_BOTH
+ * trigger a one-time warning.
+ */
+static void nfs4_file_put_access(struct nfs4_file *fp, u32 access)
+{
+ WARN_ON_ONCE(access & ~NFS4_SHARE_ACCESS_BOTH);
+
+ if (access & NFS4_SHARE_ACCESS_WRITE)
+ __nfs4_file_put_access(fp, O_WRONLY);
+ if (access & NFS4_SHARE_ACCESS_READ)
+ __nfs4_file_put_access(fp, O_RDONLY);
+}
+
+/*
+ * Allocate a new open/delegation state counter. This is needed for
+ * pNFS for proper return on close semantics.
+ *
+ * Note that we only allocate it for pNFS-enabled exports, otherwise
+ * all pointers to struct nfs4_clnt_odstate are always NULL.
+ *
+ * Returns a zeroed object bound to @clp with a reference count of 1, or
+ * NULL if the slab allocation fails.
+ */
+static struct nfs4_clnt_odstate *
+alloc_clnt_odstate(struct nfs4_client *clp)
+{
+	struct nfs4_clnt_odstate *odstate;
+
+	odstate = kmem_cache_zalloc(odstate_slab, GFP_KERNEL);
+	if (!odstate)
+		return NULL;
+
+	odstate->co_client = clp;
+	refcount_set(&odstate->co_odcount, 1);
+	return odstate;
+}
+
+/*
+ * Link @co onto the fi_clnt_odstate list of the file it points at
+ * (co->co_file). That file's fi_lock must be held.
+ */
+static void
+hash_clnt_odstate_locked(struct nfs4_clnt_odstate *co)
+{
+	lockdep_assert_held(&co->co_file->fi_lock);
+	list_add(&co->co_perfile, &co->co_file->fi_clnt_odstate);
+}
+
+/* Take a reference on @co; a NULL @co is accepted and ignored. */
+static inline void
+get_clnt_odstate(struct nfs4_clnt_odstate *co)
+{
+	if (!co)
+		return;
+	refcount_inc(&co->co_odcount);
+}
+
+/*
+ * Drop a reference on @co (NULL is ignored). On the final put the object
+ * is unlinked from its file under fi_lock, then
+ * nfsd4_return_all_file_layouts() is called for the client/file pair and
+ * the object is returned to odstate_slab.
+ */
+static void
+put_clnt_odstate(struct nfs4_clnt_odstate *co)
+{
+	struct nfs4_file *file;
+
+	if (!co)
+		return;
+
+	file = co->co_file;
+	if (!refcount_dec_and_lock(&co->co_odcount, &file->fi_lock))
+		return;
+
+	list_del(&co->co_perfile);
+	spin_unlock(&file->fi_lock);
+
+	nfsd4_return_all_file_layouts(co->co_client, file);
+	kmem_cache_free(odstate_slab, co);
+}
+
+/*
+ * Look for an odstate on @fp that belongs to the same client as @new.
+ *
+ * If one exists it is returned with an extra reference and @new is left
+ * untouched. Otherwise @new is bound to @fp, linked onto the file's list
+ * and returned. A NULL @new yields NULL.
+ */
+static struct nfs4_clnt_odstate *
+find_or_hash_clnt_odstate(struct nfs4_file *fp, struct nfs4_clnt_odstate *new)
+{
+	struct nfs4_clnt_odstate *cur;
+	struct nfs4_client *client;
+
+	if (!new)
+		return NULL;
+
+	client = new->co_client;
+
+	spin_lock(&fp->fi_lock);
+	list_for_each_entry(cur, &fp->fi_clnt_odstate, co_perfile) {
+		if (cur->co_client != client)
+			continue;
+		get_clnt_odstate(cur);
+		spin_unlock(&fp->fi_lock);
+		return cur;
+	}
+
+	/* No existing entry for this client: publish the new one. */
+	new->co_file = fp;
+	hash_clnt_odstate_locked(new);
+	spin_unlock(&fp->fi_lock);
+	return new;
+}
+
+/*
+ * Allocate a zeroed stateid object from @slab for client @cl and register
+ * it in the client's cl_stateids IDR.
+ *
+ * @sc_free is stored as the object's sc_free callback. Returns the new
+ * stid with sc_count set to 1, or NULL if either the slab allocation or
+ * the id allocation fails.
+ */
+struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab,
+				  void (*sc_free)(struct nfs4_stid *))
+{
+	struct nfs4_stid *stid = kmem_cache_zalloc(slab, GFP_KERNEL);
+	int id;
+
+	if (!stid)
+		return NULL;
+
+	/*
+	 * It shouldn't be a problem to reuse an opaque stateid value.
+	 * I don't think it is for 4.1. But with 4.0 I worry that, for
+	 * example, a stray write retransmission could be accepted by
+	 * the server when it should have been rejected. Therefore,
+	 * adopt a trick from the sctp code to attempt to maximize the
+	 * amount of time until an id is reused, by ensuring they always
+	 * "increase" (mod INT_MAX), which is why the id is allocated
+	 * cyclically.
+	 *
+	 * The range starts at 1, reserving 0 for start of file in the
+	 * nfsdfs "states" file.
+	 */
+	idr_preload(GFP_KERNEL);
+	spin_lock(&cl->cl_lock);
+	id = idr_alloc_cyclic(&cl->cl_stateids, stid, 1, 0, GFP_NOWAIT);
+	spin_unlock(&cl->cl_lock);
+	idr_preload_end();
+	if (id < 0) {
+		kmem_cache_free(slab, stid);
+		return NULL;
+	}
+
+	stid->sc_free = sc_free;
+	stid->sc_client = cl;
+	stid->sc_stateid.si_opaque.so_id = id;
+	stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
+	/* Will be incremented before return to client: */
+	refcount_set(&stid->sc_count, 1);
+	spin_lock_init(&stid->sc_lock);
+	INIT_LIST_HEAD(&stid->sc_cp_list);
+
+	return stid;
+}
+
+/*
+ * Create a unique stateid_t to represent each COPY.
+ *
+ * Fills in the clientid part of @stid from @nn (boot time and
+ * s2s_cp_cl_id), records @sc_type, and allocates an id for it in
+ * nn->s2s_cp_stateids under s2s_cp_lock. The generation is set to 1.
+ *
+ * Returns 1 on success and 0 if no id could be allocated.
+ */
+static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid,
+			      unsigned char sc_type)
+{
+	int id;
+
+	stid->stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time;
+	stid->stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id;
+	stid->sc_type = sc_type;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock(&nn->s2s_cp_lock);
+	id = idr_alloc_cyclic(&nn->s2s_cp_stateids, stid, 0, 0, GFP_NOWAIT);
+	stid->stid.si_opaque.so_id = id;
+	stid->stid.si_generation = 1;
+	spin_unlock(&nn->s2s_cp_lock);
+	idr_preload_end();
+
+	return id >= 0;
+}
+
+/*
+ * Initialise the stateid embedded in @copy as an NFS4_COPY_STID.
+ * Returns what nfs4_init_cp_state() returns: 1 on success, 0 on failure.
+ */
+int nfs4_init_copy_state(struct nfsd_net *nn, struct nfsd4_copy *copy)
+{
+ return nfs4_init_cp_state(nn, &copy->cp_stateid, NFS4_COPY_STID);
+}
+
+/*
+ * Allocate a copy-notify state, give it an NFS4_COPYNOTIFY_STID stateid
+ * and link it onto the sc_cp_list of the parent stateid @p_stid.
+ *
+ * Returns the new state with a reference count of 1, or NULL if the
+ * allocation or the stateid initialisation fails.
+ */
+struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn,
+						     struct nfs4_stid *p_stid)
+{
+	struct nfs4_cpntf_state *state;
+
+	state = kzalloc(sizeof(*state), GFP_KERNEL);
+	if (!state)
+		return NULL;
+
+	state->cpntf_time = ktime_get_boottime_seconds();
+	refcount_set(&state->cp_stateid.sc_count, 1);
+	if (!nfs4_init_cp_state(nn, &state->cp_stateid, NFS4_COPYNOTIFY_STID)) {
+		kfree(state);
+		return NULL;
+	}
+
+	spin_lock(&nn->s2s_cp_lock);
+	list_add(&state->cp_list, &p_stid->sc_cp_list);
+	spin_unlock(&nn->s2s_cp_lock);
+
+	return state;
+}
+
+/*
+ * Remove the id of @copy's stateid from the per-net s2s_cp_stateids IDR,
+ * under s2s_cp_lock. Warns once if the stateid is not an NFS4_COPY_STID.
+ * Only the IDR entry is removed here; @copy itself is not freed.
+ */
+void nfs4_free_copy_state(struct nfsd4_copy *copy)
+{
+ struct nfsd_net *nn;
+
+ WARN_ON_ONCE(copy->cp_stateid.sc_type != NFS4_COPY_STID);
+ nn = net_generic(copy->cp_clp->net, nfsd_net_id);
+ spin_lock(&nn->s2s_cp_lock);
+ idr_remove(&nn->s2s_cp_stateids,
+ copy->cp_stateid.stid.si_opaque.so_id);
+ spin_unlock(&nn->s2s_cp_lock);
+}
+
+/*
+ * Hand every copy-notify state linked on stid->sc_cp_list to
+ * _free_cpntf_state_locked(), holding the per-net s2s_cp_lock throughout.
+ * The loop runs until the list is empty.
+ */
+static void nfs4_free_cpntf_statelist(struct net *net, struct nfs4_stid *stid)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct list_head *head = &stid->sc_cp_list;
+
+	spin_lock(&nn->s2s_cp_lock);
+	while (!list_empty(head))
+		_free_cpntf_state_locked(nn,
+			list_first_entry(head, struct nfs4_cpntf_state,
+					 cp_list));
+	spin_unlock(&nn->s2s_cp_lock);
+}
+
+/*
+ * Allocate an open/lock stateid for @clp from stateid_slab, with
+ * nfs4_free_ol_stateid() as its free callback. Returns NULL on failure.
+ */
+static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
+{
+	struct nfs4_stid *stid;
+
+	stid = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_ol_stateid);
+	return stid ? openlockstateid(stid) : NULL;
+}
+
+/*
+ * sc_free callback installed by alloc_init_deleg(): returns the object to
+ * deleg_slab and drops the global delegation count. Warns if copy-notify
+ * states are still linked on the stateid.
+ */
+static void nfs4_free_deleg(struct nfs4_stid *stid)
+{
+ WARN_ON(!list_empty(&stid->sc_cp_list));
+ kmem_cache_free(deleg_slab, stid);
+ atomic_long_dec(&num_delegations);
+}
+
+/*
+ * When we recall a delegation, we should be careful not to hand it
+ * out again straight away.
+ * To ensure this we keep a pair of bloom filters ('new' and 'old')
+ * in which the filehandles of recalled delegations are "stored".
+ * If a filehandle appears in either filter, a delegation is blocked.
+ * When a delegation is recalled, the filehandle is stored in the "new"
+ * filter.
+ * Every 30 seconds we swap the filters and clear the "new" one,
+ * unless both are empty of course.
+ *
+ * Each filter is 256 bits. We hash the filehandle to 32bit and use the
+ * low 3 bytes as hash-table indices.
+ *
+ * 'blocked_delegations_lock', which is always taken in block_delegations(),
+ * is used to manage concurrent access. Testing does not need the lock
+ * except when swapping the two filters.
+ */
+static DEFINE_SPINLOCK(blocked_delegations_lock);
+static struct bloom_pair {
+ /*
+ * entries: bumped by one for every block_delegations() call; at
+ * swap time the previous old_entries is subtracted from it.
+ * old_entries: value of entries right after the last swap.
+ */
+ int entries, old_entries;
+ /* Time of the last swap, or of the first insertion when entries was 0. */
+ time64_t swap_time;
+ int new; /* index into 'set' */
+ /* The two 256-bit filters; set[new] receives insertions. */
+ DECLARE_BITMAP(set[2], 256);
+} blocked_delegations;
+
+/* True iff the three 8-bit indices taken from @hash are all set in @set. */
+static bool bloom_filter_has_hash(u32 hash, const unsigned long *set)
+{
+	return test_bit(hash & 255, set) &&
+	       test_bit((hash >> 8) & 255, set) &&
+	       test_bit((hash >> 16) & 255, set);
+}
+
+/*
+ * Return 1 if filehandle @fh hits either bloom filter (so a delegation on
+ * it should not be handed out right now), 0 otherwise.
+ *
+ * When more than 30 seconds have passed since the last swap, the filters
+ * are rotated first: the time check is repeated under
+ * blocked_delegations_lock so only one caller performs the swap.
+ */
+static int delegation_blocked(struct knfsd_fh *fh)
+{
+	struct bloom_pair *bd = &blocked_delegations;
+	u32 hash;
+
+	if (bd->entries == 0)
+		return 0;
+
+	if (ktime_get_seconds() - bd->swap_time > 30) {
+		spin_lock(&blocked_delegations_lock);
+		if (ktime_get_seconds() - bd->swap_time > 30) {
+			bd->entries -= bd->old_entries;
+			bd->old_entries = bd->entries;
+			memset(bd->set[bd->new], 0,
+			       sizeof(bd->set[0]));
+			bd->new = 1-bd->new;
+			bd->swap_time = ktime_get_seconds();
+		}
+		spin_unlock(&blocked_delegations_lock);
+	}
+
+	hash = jhash(&fh->fh_base, fh->fh_size, 0);
+	if (bloom_filter_has_hash(hash, bd->set[0]) ||
+	    bloom_filter_has_hash(hash, bd->set[1]))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Record filehandle @fh in the "new" bloom filter: three bits, indexed by
+ * the low three bytes of its jhash, are set under
+ * blocked_delegations_lock. The swap timer is started when this is the
+ * first entry.
+ */
+static void block_delegations(struct knfsd_fh *fh)
+{
+	struct bloom_pair *bd = &blocked_delegations;
+	unsigned long *filter;
+	u32 hash = jhash(&fh->fh_base, fh->fh_size, 0);
+
+	spin_lock(&blocked_delegations_lock);
+	filter = bd->set[bd->new];
+	__set_bit(hash & 255, filter);
+	__set_bit((hash >> 8) & 255, filter);
+	__set_bit((hash >> 16) & 255, filter);
+	if (bd->entries == 0)
+		bd->swap_time = ktime_get_seconds();
+	bd->entries++;
+	spin_unlock(&blocked_delegations_lock);
+}
+
+/*
+ * Allocate and initialise a read delegation for client @clp on file @fp.
+ *
+ * Fails (returning NULL and undoing the num_delegations increment) when
+ * the global delegation count would be negative or exceed
+ * max_delegations, when the filehandle in @current_fh is currently
+ * blocked, or when the stateid allocation fails.
+ *
+ * On success the delegation holds a reference on @odstate (if any) and
+ * on @fp.
+ */
+static struct nfs4_delegation *
+alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
+		 struct svc_fh *current_fh,
+		 struct nfs4_clnt_odstate *odstate)
+{
+	struct nfs4_delegation *dp;
+	long count;
+
+	dprintk("NFSD alloc_init_deleg\n");
+	count = atomic_long_inc_return(&num_delegations);
+	if (count < 0 || count > max_delegations)
+		goto out_dec;
+	if (delegation_blocked(&current_fh->fh_handle))
+		goto out_dec;
+	dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg));
+	if (!dp)
+		goto out_dec;
+
+	/*
+	 * delegation seqid's are never incremented. The 4.1 special
+	 * meaning of seqid 0 isn't meaningful, really, but let's avoid
+	 * 0 anyway just for consistency and use 1:
+	 */
+	dp->dl_stid.sc_stateid.si_generation = 1;
+
+	INIT_LIST_HEAD(&dp->dl_perfile);
+	INIT_LIST_HEAD(&dp->dl_perclnt);
+	INIT_LIST_HEAD(&dp->dl_recall_lru);
+
+	dp->dl_clnt_odstate = odstate;
+	get_clnt_odstate(odstate);
+
+	dp->dl_type = NFS4_OPEN_DELEGATE_READ;
+	dp->dl_retries = 1;
+	nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
+		      &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
+
+	get_nfs4_file(fp);
+	dp->dl_stid.sc_file = fp;
+	return dp;
+
+out_dec:
+	atomic_long_dec(&num_delegations);
+	return NULL;
+}
+
+/*
+ * Drop a reference on stateid @s.
+ *
+ * If other references remain, waiters on close_wq are woken and nothing
+ * else happens. On the final put the id is removed from the client's
+ * IDR, any copy-notify states are released, the type-specific sc_free
+ * callback is run, and finally the reference on the associated file (if
+ * any) is dropped.
+ */
+void
+nfs4_put_stid(struct nfs4_stid *s)
+{
+ /* Read these up front: @s must not be touched after sc_free(). */
+ struct nfs4_file *fp = s->sc_file;
+ struct nfs4_client *clp = s->sc_client;
+
+ might_lock(&clp->cl_lock);
+
+ if (!refcount_dec_and_lock(&s->sc_count, &clp->cl_lock)) {
+ wake_up_all(&close_wq);
+ return;
+ }
+ idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id);
+ nfs4_free_cpntf_statelist(clp->net, s);
+ spin_unlock(&clp->cl_lock);
+ s->sc_free(s);
+ if (fp)
+ put_nfs4_file(fp);
+}
+
+/*
+ * Advance the generation of @stid's stateid and copy the result to @dst,
+ * all under stid->sc_lock. A generation that wraps to 0 is replaced by 1.
+ */
+void
+nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid)
+{
+	stateid_t *current_id = &stid->sc_stateid;
+
+	spin_lock(&stid->sc_lock);
+	current_id->si_generation++;
+	if (unlikely(current_id->si_generation == 0))
+		current_id->si_generation = 1;
+	memcpy(dst, current_id, sizeof(*dst));
+	spin_unlock(&stid->sc_lock);
+}
+
+/*
+ * Drop one delegee from @fp. When the count reaches zero the cached
+ * delegation file is detached under fi_lock and its reference is put
+ * after the lock has been released.
+ */
+static void put_deleg_file(struct nfs4_file *fp)
+{
+	struct nfsd_file *deleg_file = NULL;
+
+	spin_lock(&fp->fi_lock);
+	if (--fp->fi_delegees == 0) {
+		deleg_file = fp->fi_deleg_file;
+		fp->fi_deleg_file = NULL;
+	}
+	spin_unlock(&fp->fi_lock);
+
+	if (deleg_file)
+		nfsd_file_put(deleg_file);
+}
+
+/*
+ * Remove the lease backing delegation @dp with vfs_setlease(F_UNLCK) on
+ * the file's delegation file, then drop this delegation's share of that
+ * file via put_deleg_file(). Warns once if the file has no delegees.
+ */
+static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp)
+{
+ struct nfs4_file *fp = dp->dl_stid.sc_file;
+ struct nfsd_file *nf = fp->fi_deleg_file;
+
+ WARN_ON_ONCE(!fp->fi_delegees);
+
+ vfs_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp);
+ put_deleg_file(fp);
+}
+
+/*
+ * Tear down a delegation that has already been unhashed: put its odstate
+ * reference, release the lease, then put the stateid reference.
+ */
+static void destroy_unhashed_deleg(struct nfs4_delegation *dp)
+{
+ put_clnt_odstate(dp->dl_clnt_odstate);
+ nfs4_unlock_deleg_lease(dp);
+ nfs4_put_stid(&dp->dl_stid);
+}
+
+/* Mark stateid @s as unhashed by clearing its type field to 0. */
+void nfs4_unhash_stid(struct nfs4_stid *s)
+{
+ s->sc_type = 0;
+}
+
+/**
+ * nfs4_delegation_exists - Discover if this delegation already exists
+ * @clp: a pointer to the nfs4_client we're granting a delegation to
+ * @fp: a pointer to the nfs4_file we're granting a delegation on
+ *
+ * Walks the file's fi_delegations list. Both state_lock and fp->fi_lock
+ * must be held by the caller.
+ *
+ * Return:
+ * true iff a delegation held by @clp is found on @fp
+ */
+
+static bool
+nfs4_delegation_exists(struct nfs4_client *clp, struct nfs4_file *fp)
+{
+	struct nfs4_delegation *dp;
+
+	lockdep_assert_held(&state_lock);
+	lockdep_assert_held(&fp->fi_lock);
+
+	list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) {
+		if (dp->dl_stid.sc_client == clp)
+			return true;
+	}
+	return false;
+}
+
+/**
+ * hash_delegation_locked - Add a delegation to the appropriate lists
+ * @dp: a pointer to the nfs4_delegation we are adding.
+ * @fp: a pointer to the nfs4_file we're granting a delegation on
+ *
+ * Both state_lock and @fp->fi_lock must be held by the caller.
+ *
+ * Return:
+ * On success: 0 if the delegation was successfully hashed.
+ *
+ * On error: -EAGAIN if one was previously granted to this
+ * nfs4_client for this nfs4_file. Delegation is not hashed.
+ *
+ */
+
+static int
+hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
+{
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
+
+ lockdep_assert_held(&state_lock);
+ lockdep_assert_held(&fp->fi_lock);
+
+ if (nfs4_delegation_exists(clp, fp))
+ return -EAGAIN;
+ /* Extra stateid reference taken for the hashed delegation. */
+ refcount_inc(&dp->dl_stid.sc_count);
+ dp->dl_stid.sc_type = NFS4_DELEG_STID;
+ list_add(&dp->dl_perfile, &fp->fi_delegations);
+ list_add(&dp->dl_perclnt, &clp->cl_delegations);
+ return 0;
+}
+
+/* A delegation counts as hashed while it is linked on a file's delegation list. */
+static bool delegation_hashed(struct nfs4_delegation *dp)
+{
+	return !list_empty(&dp->dl_perfile);
+}
+
+/*
+ * Unlink @dp from the per-client, recall-LRU and per-file lists and mark
+ * its stateid as NFS4_CLOSED_DELEG_STID. state_lock must be held; the
+ * file's fi_lock is taken around the list removals.
+ *
+ * Returns false (doing nothing) if @dp was not hashed, true otherwise.
+ */
+static bool
+unhash_delegation_locked(struct nfs4_delegation *dp)
+{
+ struct nfs4_file *fp = dp->dl_stid.sc_file;
+
+ lockdep_assert_held(&state_lock);
+
+ if (!delegation_hashed(dp))
+ return false;
+
+ dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID;
+ /* Ensure that deleg break won't try to requeue it */
+ ++dp->dl_time;
+ spin_lock(&fp->fi_lock);
+ list_del_init(&dp->dl_perclnt);
+ list_del_init(&dp->dl_recall_lru);
+ list_del_init(&dp->dl_perfile);
+ spin_unlock(&fp->fi_lock);
+ return true;
+}
+
+/*
+ * Unhash @dp under state_lock and, only if this call is the one that
+ * unhashed it, tear it down with destroy_unhashed_deleg().
+ */
+static void destroy_delegation(struct nfs4_delegation *dp)
+{
+	bool did_unhash;
+
+	spin_lock(&state_lock);
+	did_unhash = unhash_delegation_locked(dp);
+	spin_unlock(&state_lock);
+
+	if (!did_unhash)
+		return;
+	destroy_unhashed_deleg(dp);
+}
+
+/*
+ * Revoke delegation @dp, which must already be off the recall LRU
+ * (a warning fires otherwise).
+ *
+ * For clients with a nonzero minor version the stateid is marked
+ * NFS4_REVOKED_DELEG_STID, gains a reference and is parked on the
+ * client's cl_revoked list. In all cases the delegation is then torn
+ * down via destroy_unhashed_deleg().
+ */
+static void revoke_delegation(struct nfs4_delegation *dp)
+{
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
+
+ WARN_ON(!list_empty(&dp->dl_recall_lru));
+
+ if (clp->cl_minorversion) {
+ spin_lock(&clp->cl_lock);
+ dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
+ refcount_inc(&dp->dl_stid.sc_count);
+ list_add(&dp->dl_recall_lru, &clp->cl_revoked);
+ spin_unlock(&clp->cl_lock);
+ }
+ destroy_unhashed_deleg(dp);
+}
+
+/*
+ * SETCLIENTID state
+ */
+
+/* Reduce a numeric client id to a bucket index with CLIENT_HASH_MASK. */
+static unsigned int clientid_hashval(u32 id)
+{
+ return id & CLIENT_HASH_MASK;
+}
+
+/*
+ * Hash a client name string into a bucket index (masked with
+ * CLIENT_HASH_MASK).
+ *
+ * At most the first 8 bytes of @name contribute to the hash. The byte
+ * count is capped at name.len so that a name shorter than 8 bytes is
+ * never read past its end; names of 8 bytes or more hash exactly as
+ * before.
+ */
+static unsigned int clientstr_hashval(struct xdr_netobj name)
+{
+	unsigned int nbytes = min_t(unsigned int, name.len, 8);
+
+	return opaque_hashval(name.data, nbytes) & CLIENT_HASH_MASK;
+}
+
+/*
+ * We store the NONE, READ, WRITE, and BOTH bits separately in the
+ * st_{access,deny}_bmap field of the stateid, in order to track not
+ * only what share bits are currently in force, but also what
+ * combinations of share bits previous opens have used. This allows us
+ * to enforce the recommendation of rfc 3530 14.2.19 that the server
+ * return an error if the client attempts to downgrade to a combination
+ * of share bits not explicable by closing some of its previous opens.
+ *
+ * XXX: This enforcement is actually incomplete, since we don't keep
+ * track of access/deny bit combinations; so, e.g., we allow:
+ *
+ * OPEN allow read, deny write
+ * OPEN allow both, deny none
+ * DOWNGRADE allow read, deny none
+ *
+ * which we should reject.
+ */
+/*
+ * Collapse a share bitmap into a share mode: for every bit position 1..3
+ * that is set in @bmap, OR that position's number into the result.
+ */
+static unsigned int
+bmap_to_share_mode(unsigned long bmap) {
+	unsigned int mode = 0;
+	int bit;
+
+	for (bit = 1; bit <= 3; bit++)
+		if (test_bit(bit, &bmap))
+			mode |= bit;
+
+	return mode;
+}
+
+/*
+ * Set bit number @access in the stateid's access bitmap. Warns once if
+ * @access is above NFS4_SHARE_ACCESS_BOTH.
+ */
+static inline void
+set_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+	unsigned char bit = 1 << access;
+
+	WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
+	stp->st_access_bmap |= bit;
+}
+
+/*
+ * Clear bit number @access in the stateid's access bitmap. Warns once if
+ * @access is above NFS4_SHARE_ACCESS_BOTH.
+ */
+static inline void
+clear_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+	unsigned char bit = 1 << access;
+
+	WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
+	stp->st_access_bmap &= ~bit;
+}
+
+/* Report whether bit number @access is set in the stateid's access bitmap. */
+static inline bool
+test_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+	unsigned char bit = 1 << access;
+
+	return (stp->st_access_bmap & bit) != 0;
+}
+
+/*
+ * Set bit number @deny in the stateid's deny bitmap. Warns once if @deny
+ * is above NFS4_SHARE_DENY_BOTH.
+ */
+static inline void
+set_deny(u32 deny, struct nfs4_ol_stateid *stp)
+{
+	unsigned char bit = 1 << deny;
+
+	WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
+	stp->st_deny_bmap |= bit;
+}
+
+/*
+ * Clear bit number @deny in the stateid's deny bitmap. Warns once if
+ * @deny is above NFS4_SHARE_DENY_BOTH.
+ */
+static inline void
+clear_deny(u32 deny, struct nfs4_ol_stateid *stp)
+{
+	unsigned char bit = 1 << deny;
+
+	WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
+	stp->st_deny_bmap &= ~bit;
+}
+
+/* Report whether bit number @deny is set in the stateid's deny bitmap. */
+static inline bool
+test_deny(u32 deny, struct nfs4_ol_stateid *stp)
+{
+	unsigned char bit = 1 << deny;
+
+	return (stp->st_deny_bmap & bit) != 0;
+}
+
+/*
+ * Translate NFSv4 share-access bits into an open mode: READ maps to
+ * O_RDONLY, WRITE to O_WRONLY and BOTH to O_RDWR. Any other value (after
+ * masking with NFS4_SHARE_ACCESS_BOTH) warns once and falls back to
+ * O_RDONLY.
+ */
+static int nfs4_access_to_omode(u32 access)
+{
+	u32 share = access & NFS4_SHARE_ACCESS_BOTH;
+
+	if (share == NFS4_SHARE_ACCESS_READ)
+		return O_RDONLY;
+	if (share == NFS4_SHARE_ACCESS_WRITE)
+		return O_WRONLY;
+	if (share == NFS4_SHARE_ACCESS_BOTH)
+		return O_RDWR;
+
+	WARN_ON_ONCE(1);
+	return O_RDONLY;
+}
+
+/*
+ * A stateid that had a deny mode associated with it is being released
+ * or downgraded. Recalculate the deny mode on the file.
+ *
+ * Under fi_lock the file's fi_share_deny is reset to 0 and rebuilt as the
+ * OR of the deny modes of every stateid on the file's fi_stateids list.
+ */
+static void
+recalculate_deny_mode(struct nfs4_file *fp)
+{
+ struct nfs4_ol_stateid *stp;
+
+ spin_lock(&fp->fi_lock);
+ fp->fi_share_deny = 0;
+ list_for_each_entry(stp, &fp->fi_stateids, st_perfile)
+ fp->fi_share_deny |= bmap_to_share_mode(stp->st_deny_bmap);
+ spin_unlock(&fp->fi_lock);
+}
+
+/*
+ * Clear from @stp's deny bitmap every deny value 1..3 that is not fully
+ * contained in @deny, then recompute the file's deny mode if at least one
+ * value fell outside @deny.
+ */
+static void
+reset_union_bmap_deny(u32 deny, struct nfs4_ol_stateid *stp)
+{
+	bool changed = false;
+	int mode;
+
+	for (mode = 1; mode < 4; mode++) {
+		if ((mode & deny) == mode)
+			continue;
+		changed = true;
+		clear_deny(mode, stp);
+	}
+
+	/* Recalculate per-file deny mode if there was a change */
+	if (changed)
+		recalculate_deny_mode(stp->st_stid.sc_file);
+}
+
+/*
+ * Release all access and file references for a given stateid.
+ *
+ * If the stateid carried any deny bits the file's deny mode is
+ * recalculated first. Then, for each access value 1..3 recorded in the
+ * stateid, the matching file access is put and the bit is cleared.
+ */
+static void
+release_all_access(struct nfs4_ol_stateid *stp)
+{
+	struct nfs4_file *fp = stp->st_stid.sc_file;
+	int access;
+
+	if (fp && stp->st_deny_bmap != 0)
+		recalculate_deny_mode(fp);
+
+	for (access = 1; access < 4; access++) {
+		if (test_access(access, stp))
+			nfs4_file_put_access(fp, access);
+		clear_access(access, stp);
+	}
+}
+
+/*
+ * Free a stateowner: the owner-name buffer is kfree()d first, then the
+ * type-specific so_free operation disposes of the owner itself.
+ */
+static inline void nfs4_free_stateowner(struct nfs4_stateowner *sop)
+{
+ kfree(sop->so_owner.data);
+ sop->so_ops->so_free(sop);
+}
+
+/*
+ * Drop a reference on @sop. On the final put the owner is unhashed via
+ * its so_unhash operation under the client's cl_lock and then freed.
+ */
+static void nfs4_put_stateowner(struct nfs4_stateowner *sop)
+{
+	struct nfs4_client *clp = sop->so_client;
+
+	might_lock(&clp->cl_lock);
+
+	if (atomic_dec_and_lock(&sop->so_count, &clp->cl_lock)) {
+		sop->so_ops->so_unhash(sop);
+		spin_unlock(&clp->cl_lock);
+		nfs4_free_stateowner(sop);
+	}
+}
+
+/* An open/lock stateid is unhashed when it is not linked on a file's list. */
+static bool
+nfs4_ol_stateid_unhashed(const struct nfs4_ol_stateid *stp)
+{
+ return list_empty(&stp->st_perfile);
+}
+
+/*
+ * Unlink @stp from its file (under fi_lock) and from its stateowner.
+ * The owning client's cl_lock must be held.
+ *
+ * Returns false if @stp was already unhashed, true if this call did it.
+ */
+static bool unhash_ol_stateid(struct nfs4_ol_stateid *stp)
+{
+	struct nfs4_file *file = stp->st_stid.sc_file;
+
+	lockdep_assert_held(&stp->st_stateowner->so_client->cl_lock);
+
+	if (nfs4_ol_stateid_unhashed(stp))
+		return false;
+
+	spin_lock(&file->fi_lock);
+	list_del_init(&stp->st_perfile);
+	spin_unlock(&file->fi_lock);
+
+	list_del(&stp->st_perstateowner);
+	return true;
+}
+
+/*
+ * sc_free callback for open/lock stateids: puts the odstate reference,
+ * releases all access held through the stateid, puts the stateowner (if
+ * set) and returns the object to stateid_slab. Warns if copy-notify
+ * states are still linked on it.
+ */
+static void nfs4_free_ol_stateid(struct nfs4_stid *stid)
+{
+ struct nfs4_ol_stateid *stp = openlockstateid(stid);
+
+ put_clnt_odstate(stp->st_clnt_odstate);
+ release_all_access(stp);
+ if (stp->st_stateowner)
+ nfs4_put_stateowner(stp->st_stateowner);
+ WARN_ON(!list_empty(&stid->sc_cp_list));
+ kmem_cache_free(stateid_slab, stid);
+}
+
+/*
+ * Free a lock stateid. If any file is still open for it, filp_close() is
+ * called on that file with the lockowner as owner before the generic
+ * open/lock stateid teardown runs.
+ */
+static void nfs4_free_lock_stateid(struct nfs4_stid *stid)
+{
+ struct nfs4_ol_stateid *stp = openlockstateid(stid);
+ struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
+ struct nfsd_file *nf;
+
+ nf = find_any_file(stp->st_stid.sc_file);
+ if (nf) {
+ /* NOTE(review): presumably filp_close() consumes the file reference taken here - confirm. */
+ get_file(nf->nf_file);
+ filp_close(nf->nf_file, (fl_owner_t)lo);
+ nfsd_file_put(nf);
+ }
+ nfs4_free_ol_stateid(stid);
+}
+
+/*
+ * Put the persistent reference to an already unhashed generic stateid, while
+ * holding the cl_lock. If it's the last reference, then put it onto the
+ * reaplist for later destruction.
+ *
+ * When other references remain, waiters on close_wq are woken instead.
+ * The stateid is queued on @reaplist through its st_locks list head,
+ * which is expected to be empty at this point.
+ */
+static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp,
+				  struct list_head *reaplist)
+{
+	struct nfs4_stid *stid = &stp->st_stid;
+	struct nfs4_client *clp = stid->sc_client;
+
+	lockdep_assert_held(&clp->cl_lock);
+
+	WARN_ON_ONCE(!list_empty(&stp->st_locks));
+
+	if (refcount_dec_and_test(&stid->sc_count)) {
+		idr_remove(&clp->cl_stateids, stid->sc_stateid.si_opaque.so_id);
+		list_add(&stp->st_locks, reaplist);
+		return;
+	}
+
+	wake_up_all(&close_wq);
+}
+
+/*
+ * Unhash lock stateid @stp: besides the generic unhashing it is removed
+ * from its open stateid's st_locks list and its type is cleared.
+ * cl_lock must be held. Returns false if it was already unhashed.
+ */
+static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp)
+{
+	lockdep_assert_held(&stp->st_stid.sc_client->cl_lock);
+
+	if (unhash_ol_stateid(stp)) {
+		list_del_init(&stp->st_locks);
+		nfs4_unhash_stid(&stp->st_stid);
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Unhash lock stateid @stp under the client's cl_lock and, if this call
+ * did the unhashing, drop the stateid reference afterwards.
+ */
+static void release_lock_stateid(struct nfs4_ol_stateid *stp)
+{
+	struct nfs4_client *clp = stp->st_stid.sc_client;
+	bool did_unhash;
+
+	spin_lock(&clp->cl_lock);
+	did_unhash = unhash_lock_stateid(stp);
+	spin_unlock(&clp->cl_lock);
+
+	if (!did_unhash)
+		return;
+	nfs4_put_stid(&stp->st_stid);
+}
+
+/*
+ * Remove lockowner @lo from the owner-string hash chain. The owning
+ * client's cl_lock must be held.
+ */
+static void unhash_lockowner_locked(struct nfs4_lockowner *lo)
+{
+	struct nfs4_stateowner *owner = &lo->lo_owner;
+
+	lockdep_assert_held(&owner->so_client->cl_lock);
+
+	list_del_init(&owner->so_strhash);
+}
+
+/*
+ * Free a list of generic stateids that were collected earlier after being
+ * fully unhashed.
+ *
+ * Each entry is taken off @reaplist, freed through its sc_free callback,
+ * and the reference it held on its file (if any) is dropped. May sleep.
+ */
+static void
+free_ol_stateid_reaplist(struct list_head *reaplist)
+{
+	might_sleep();
+
+	while (!list_empty(reaplist)) {
+		struct nfs4_ol_stateid *victim;
+		struct nfs4_file *file;
+
+		victim = list_first_entry(reaplist, struct nfs4_ol_stateid,
+					  st_locks);
+		list_del(&victim->st_locks);
+
+		/* Remember the file: the stateid is gone after sc_free(). */
+		file = victim->st_stid.sc_file;
+		victim->st_stid.sc_free(&victim->st_stid);
+		if (file)
+			put_nfs4_file(file);
+	}
+}
+
+/*
+ * Unhash every lock stateid hanging off @open_stp and drop its persistent
+ * reference, collecting those that reach zero on @reaplist. cl_lock must
+ * be held. A lock stateid that turns out to be already unhashed triggers
+ * a warning.
+ */
+static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp,
+				       struct list_head *reaplist)
+{
+	struct list_head *locks = &open_stp->st_locks;
+	struct nfs4_ol_stateid *lock_stp;
+
+	lockdep_assert_held(&open_stp->st_stid.sc_client->cl_lock);
+
+	while (!list_empty(locks)) {
+		lock_stp = list_first_entry(locks, struct nfs4_ol_stateid,
+					    st_locks);
+		WARN_ON(!unhash_lock_stateid(lock_stp));
+		put_ol_stateid_locked(lock_stp, reaplist);
+	}
+}
+
+/*
+ * Unhash open stateid @stp and release all lock stateids attached to it
+ * (queued on @reaplist as needed). cl_lock must be held. Returns false
+ * without touching the locks if @stp was already unhashed.
+ */
+static bool unhash_open_stateid(struct nfs4_ol_stateid *stp,
+				struct list_head *reaplist)
+{
+	lockdep_assert_held(&stp->st_stid.sc_client->cl_lock);
+
+	if (unhash_ol_stateid(stp)) {
+		release_open_stateid_locks(stp, reaplist);
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Unhash open stateid @stp together with its lock stateids under
+ * cl_lock, then free everything that dropped to zero references once the
+ * lock has been released.
+ */
+static void release_open_stateid(struct nfs4_ol_stateid *stp)
+{
+	struct nfs4_client *clp = stp->st_stid.sc_client;
+	LIST_HEAD(reaplist);
+
+	spin_lock(&clp->cl_lock);
+	if (unhash_open_stateid(stp, &reaplist))
+		put_ol_stateid_locked(stp, &reaplist);
+	spin_unlock(&clp->cl_lock);
+
+	free_ol_stateid_reaplist(&reaplist);
+}
+
+/*
+ * Remove openowner @oo from the owner-string hash chain and from the
+ * per-client openowner list. The owning client's cl_lock must be held.
+ */
+static void unhash_openowner_locked(struct nfs4_openowner *oo)
+{
+	lockdep_assert_held(&oo->oo_owner.so_client->cl_lock);
+
+	list_del_init(&oo->oo_owner.so_strhash);
+	list_del_init(&oo->oo_perclient);
+}
+
+/*
+ * Detach the last-closed stateid remembered by @oo, if there is one:
+ * under the per-net client_lock the owner is taken off the close LRU and
+ * the pointer cleared, and the stateid reference is put after the lock
+ * has been released.
+ */
+static void release_last_closed_stateid(struct nfs4_openowner *oo)
+{
+	struct nfsd_net *nn = net_generic(oo->oo_owner.so_client->net,
+					  nfsd_net_id);
+	struct nfs4_ol_stateid *last;
+
+	spin_lock(&nn->client_lock);
+	last = oo->oo_last_closed_stid;
+	if (last) {
+		list_del_init(&oo->oo_close_lru);
+		oo->oo_last_closed_stid = NULL;
+	}
+	spin_unlock(&nn->client_lock);
+
+	if (!last)
+		return;
+	nfs4_put_stid(&last->st_stid);
+}
+
+/*
+ * Tear down openowner @oo: unhash it and all of its open stateids under
+ * cl_lock, free the collected stateids, release the last-closed stateid
+ * and finally drop the reference on the owner itself.
+ */
+static void release_openowner(struct nfs4_openowner *oo)
+{
+	struct nfs4_client *clp = oo->oo_owner.so_client;
+	struct list_head *stateids = &oo->oo_owner.so_stateids;
+	struct nfs4_ol_stateid *stp;
+	LIST_HEAD(reaplist);
+
+	spin_lock(&clp->cl_lock);
+	unhash_openowner_locked(oo);
+	while (!list_empty(stateids)) {
+		stp = list_first_entry(stateids, struct nfs4_ol_stateid,
+				       st_perstateowner);
+		if (unhash_open_stateid(stp, &reaplist))
+			put_ol_stateid_locked(stp, &reaplist);
+	}
+	spin_unlock(&clp->cl_lock);
+
+	free_ol_stateid_reaplist(&reaplist);
+	release_last_closed_stateid(oo);
+	nfs4_put_stateowner(&oo->oo_owner);
+}
+
+/*
+ * Bucket index for a session id: the id is viewed as a struct
+ * nfsd4_sessionid and its sequence field is reduced modulo
+ * SESSION_HASH_SIZE.
+ */
+static inline int
+hash_sessionid(struct nfs4_sessionid *sessionid)
+{
+ struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid;
+
+ return sid->sequence % SESSION_HASH_SIZE;
+}
+
+#ifdef CONFIG_SUNRPC_DEBUG
+/*
+ * Debug helper: print the first four 32-bit words of @sessionid via
+ * dprintk(), prefixed with the caller-supplied name @fn.
+ */
+static inline void
+dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
+{
+ u32 *ptr = (u32 *)(&sessionid->data[0]);
+ dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]);
+}
+#else
+/* Without CONFIG_SUNRPC_DEBUG this is an empty stub. */
+static inline void
+dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
+{
+}
+#endif
+
+/*
+ * Bump the seqid on cstate->replay_owner, and clear replay_owner if it
+ * won't be used for replay.
+ *
+ * Nothing happens for nfserr_replay_me. For results that do not mutate
+ * the seqid the replay state is cleared and the seqid left alone.
+ * Otherwise, if there is a replay owner, its seqid is incremented; for
+ * an open owner the last-closed stateid is released first.
+ */
+void nfsd4_bump_seqid(struct nfsd4_compound_state *cstate, __be32 nfserr)
+{
+	struct nfs4_stateowner *owner = cstate->replay_owner;
+
+	if (nfserr == nfserr_replay_me)
+		return;
+
+	if (!seqid_mutating_err(ntohl(nfserr))) {
+		nfsd4_cstate_clear_replay(cstate);
+		return;
+	}
+
+	if (!owner)
+		return;
+
+	if (owner->so_is_open_owner)
+		release_last_closed_stateid(openowner(owner));
+	owner->so_seqid++;
+}
+
+/*
+ * Fill in the session id of @ses: the owning client's clientid, the next
+ * value of the global current_sessionid counter (post-incremented), and a
+ * zero reserved field.
+ */
+static void
+gen_sessionid(struct nfsd4_session *ses)
+{
+	struct nfsd4_sessionid *sid =
+		(struct nfsd4_sessionid *)ses->se_sessionid.data;
+
+	sid->clientid = ses->se_client->cl_clientid;
+	sid->sequence = current_sessionid++;
+	sid->reserved = 0;
+}
+
+/*
+ * The protocol defines ca_maxresponsesize_cached to include the size of
+ * the rpc header, but all we need to cache is the data starting after
+ * the end of the initial SEQUENCE operation--the rest we regenerate
+ * each time. Therefore we can advertise a ca_maxresponsesize_cached
+ * value that is the number of bytes in our cache plus a few additional
+ * bytes. In order to stay on the safe side, and not promise more than
+ * we can cache, those additional bytes must be the minimum possible: 24
+ * bytes of rpc header (xid through accept state, with AUTH_NULL
+ * verifier), 12 for the compound header (with zero-length tag), and 44
+ * for the SEQUENCE op response:
+ */
+#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44)
+
+/*
+ * Free every slot of @ses: for each of the se_fchannel.maxreqs slots the
+ * cached credential is released and the slot itself is kfree()d. The
+ * se_slots pointer array itself is not touched.
+ */
+static void
+free_session_slots(struct nfsd4_session *ses)
+{
+	int i;
+
+	for (i = 0; i < ses->se_fchannel.maxreqs; i++) {
+		struct nfsd4_slot *slot = ses->se_slots[i];
+
+		free_svc_cred(&slot->sl_cred);
+		kfree(slot);
+	}
+}
+
+/*
+ * We don't actually need to cache the rpc and session headers, so we
+ * can allocate a little less for each slot:
+ *
+ * the per-slot size is sizeof(struct nfsd4_slot) plus whatever part of
+ * ca->maxresp_cached exceeds NFSD_MIN_HDR_SEQ_SZ (nothing extra if it
+ * is smaller than that).
+ */
+static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca)
+{
+	u32 cached = 0;
+
+	if (ca->maxresp_cached >= NFSD_MIN_HDR_SEQ_SZ)
+		cached = ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+
+	return cached + sizeof(struct nfsd4_slot);
+}
+
+/*
+ * XXX: If we run out of reserved DRC memory we could (up to a point)
+ * re-negotiate active sessions and reduce their slot usage to make
+ * room for new connections. For now we just fail the create session.
+ *
+ * Decide how many of the ca->maxreqs requested slots to grant, charge
+ * their memory (slots * slot_bytes(ca)) to nfsd_drc_mem_used under
+ * nfsd_drc_lock, and return the slot count. The result is never below 1
+ * and never above the requested number.
+ */
+static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn)
+{
+ u32 slotsize = slot_bytes(ca);
+ u32 num = ca->maxreqs;
+ unsigned long avail, total_avail;
+ unsigned int scale_factor;
+
+ spin_lock(&nfsd_drc_lock);
+ if (nfsd_drc_max_mem > nfsd_drc_mem_used)
+ total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used;
+ else
+ /* We have handed out more space than we chose in
+ * set_max_drc() to allow. That isn't really a
+ * problem as long as that doesn't make us think we
+ * have lots more due to integer overflow.
+ */
+ total_avail = 0;
+ /* Per-session cap on top of what is globally left. */
+ avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail);
+ /*
+ * Never use more than a fraction of the remaining memory,
+ * unless it's the only way to give this client a slot.
+ * The chosen fraction is either 1/8 or 1/number of threads,
+ * whichever is smaller. This ensures there are adequate
+ * slots to support multiple clients per thread.
+ * Give the client one slot even if that would require
+ * over-allocation--it is better than failure.
+ */
+ scale_factor = max_t(unsigned int, 8, nn->nfsd_serv->sv_nrthreads);
+
+ avail = clamp_t(unsigned long, avail, slotsize,
+ total_avail/scale_factor);
+ num = min_t(int, num, avail / slotsize);
+ /* Always grant at least one slot. */
+ num = max_t(int, num, 1);
+ nfsd_drc_mem_used += num * slotsize;
+ spin_unlock(&nfsd_drc_lock);
+
+ return num;
+}
+
+/*
+ * Give back the DRC memory accounted for a channel: subtract
+ * slot_bytes(ca) * ca->maxreqs from nfsd_drc_mem_used under nfsd_drc_lock.
+ */
+static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca)
+{
+	int per_slot = slot_bytes(ca);
+
+	spin_lock(&nfsd_drc_lock);
+	nfsd_drc_mem_used -= per_slot * ca->maxreqs;
+	spin_unlock(&nfsd_drc_lock);
+}
+
+/*
+ * Allocate a zeroed session with fattrs->maxreqs forechannel slots, each
+ * slot_bytes(fattrs) long, and copy both channel attribute sets into it.
+ * Returns NULL if any allocation fails; everything allocated so far is
+ * freed in that case.
+ */
+static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
+					   struct nfsd4_channel_attrs *battrs)
+{
+	int numslots = fattrs->maxreqs;
+	int slotsize = slot_bytes(fattrs);
+	struct nfsd4_session *ses;
+	int ptr_bytes, n;
+
+	/* The session plus its slot pointer table must fit in one page. */
+	BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *)
+			+ sizeof(struct nfsd4_session) > PAGE_SIZE);
+	ptr_bytes = numslots * sizeof(struct nfsd4_slot *);
+
+	ses = kzalloc(sizeof(*ses) + ptr_bytes, GFP_KERNEL);
+	if (ses == NULL)
+		return NULL;
+
+	/* each struct nfsd4_slot and its data cache come from one allocation */
+	for (n = 0; n < numslots; n++) {
+		ses->se_slots[n] = kzalloc(slotsize, GFP_KERNEL);
+		if (ses->se_slots[n] == NULL)
+			goto out_free;
+	}
+
+	memcpy(&ses->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
+	memcpy(&ses->se_bchannel, battrs, sizeof(struct nfsd4_channel_attrs));
+
+	return ses;
+
+out_free:
+	/* slot n failed; free slots n-1 down to 0 */
+	for (n--; n >= 0; n--)
+		kfree(ses->se_slots[n]);
+	kfree(ses);
+	return NULL;
+}
+
+/*
+ * Drop the transport reference held by @c (taken in alloc_conn()) and
+ * free the connection structure.
+ */
+static void
+free_conn(struct nfsd4_conn *c)
+{
+	svc_xprt_put(c->cn_xprt);
+	kfree(c);
+}
+
+/*
+ * Transport-user callback (installed by nfsd4_register_conn()).  Under the
+ * client's cl_lock: if the connection is still on its session's list,
+ * unlink and free it; then probe the callback channel.
+ */
+static void nfsd4_conn_lost(struct svc_xpt_user *u)
+{
+	struct nfsd4_conn *conn = container_of(u, struct nfsd4_conn,
+					       cn_xpt_user);
+	struct nfs4_client *client = conn->cn_session->se_client;
+	bool still_hashed;
+
+	spin_lock(&client->cl_lock);
+	still_hashed = !list_empty(&conn->cn_persession);
+	if (still_hashed) {
+		list_del(&conn->cn_persession);
+		free_conn(conn);
+	}
+	nfsd4_probe_callback(client);
+	spin_unlock(&client->cl_lock);
+}
+
+/*
+ * Allocate a connection object for the transport @rqstp arrived on,
+ * taking a reference on that transport.  @flags is stored as cn_flags.
+ * Only cn_xprt, cn_flags and cn_xpt_user.list are initialised here.
+ * Returns NULL on allocation failure (no transport reference is taken).
+ */
+static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
+{
+	struct nfsd4_conn *c = kmalloc(sizeof(*c), GFP_KERNEL);
+
+	if (c == NULL)
+		return NULL;
+
+	svc_xprt_get(rqstp->rq_xprt);
+	c->cn_xprt = rqstp->rq_xprt;
+	c->cn_flags = flags;
+	INIT_LIST_HEAD(&c->cn_xpt_user.list);
+	return c;
+}
+
+/*
+ * Attach @conn to @ses: record the owning session and add the connection
+ * to the session's se_conns list.  Takes no lock itself; the wrapper
+ * nfsd4_hash_conn() calls it with the client's cl_lock held.
+ */
+static void
+__nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
+{
+	conn->cn_session = ses;
+	list_add(&conn->cn_persession, &ses->se_conns);
+}
+
+/*
+ * Locked wrapper around __nfsd4_hash_conn(): attaches @conn to @ses while
+ * holding the cl_lock of the session's client.
+ */
+static void
+nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
+{
+	spinlock_t *lock = &ses->se_client->cl_lock;
+
+	spin_lock(lock);
+	__nfsd4_hash_conn(conn, ses);
+	spin_unlock(lock);
+}
+
+/*
+ * Install nfsd4_conn_lost() as the callback of @conn's transport-user
+ * record and register that record with the connection's transport.
+ * Returns whatever register_xpt_user() returns; nfsd4_init_conn() treats
+ * a non-zero result as "transport already down".
+ */
+static int
+nfsd4_register_conn(struct nfsd4_conn *conn)
+{
+	struct svc_xpt_user *user = &conn->cn_xpt_user;
+
+	user->callback = nfsd4_conn_lost;
+	return register_xpt_user(conn->cn_xprt, user);
+}
+
+/*
+ * Bind @conn to @ses and register it with its transport.  If registration
+ * fails the connection is torn down again via nfsd4_conn_lost().  In
+ * either case the client's callback channel is re-probed synchronously.
+ * @rqstp is not used by this function.
+ */
+static void nfsd4_init_conn(struct svc_rqst *rqstp, struct nfsd4_conn *conn,
+			    struct nfsd4_session *ses)
+{
+	nfsd4_hash_conn(conn, ses);
+
+	/* oops; a non-zero return means the xprt is already down: */
+	if (nfsd4_register_conn(conn))
+		nfsd4_conn_lost(&conn->cn_xpt_user);
+
+	/* We may have gained or lost a callback channel: */
+	nfsd4_probe_callback_sync(ses->se_client);
+}
+
+/*
+ * Allocate a connection for a CREATE_SESSION request.  The connection is
+ * always marked for the fore channel, and additionally for the back
+ * channel when the request carries SESSION4_BACK_CHAN.
+ */
+static struct nfsd4_conn *alloc_conn_from_crses(struct svc_rqst *rqstp,
+						struct nfsd4_create_session *cses)
+{
+	bool want_back = cses->flags & SESSION4_BACK_CHAN;
+	u32 dir = want_back ? (NFS4_CDFC4_FORE | NFS4_CDFC4_BACK)
+			    : NFS4_CDFC4_FORE;
+
+	return alloc_conn(rqstp, dir);
+}
+
+/*
+ * Detach and free every connection of session @s.
+ *
+ * must be called under client_lock
+ *
+ * cl_lock is dropped around unregister_xpt_user()/free_conn() for each
+ * connection and re-taken before the list is examined again.
+ */
+static void nfsd4_del_conns(struct nfsd4_session *s)
+{
+	struct nfs4_client *clp = s->se_client;
+	struct nfsd4_conn *conn;
+
+	spin_lock(&clp->cl_lock);
+	for (;;) {
+		if (list_empty(&s->se_conns))
+			break;
+		conn = list_first_entry(&s->se_conns, struct nfsd4_conn,
+					cn_persession);
+		list_del_init(&conn->cn_persession);
+		spin_unlock(&clp->cl_lock);
+
+		unregister_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
+		free_conn(conn);
+
+		spin_lock(&clp->cl_lock);
+	}
+	spin_unlock(&clp->cl_lock);
+}
+
+/*
+ * Free the slots of @ses and then the session structure itself.  Does not
+ * touch connections or DRC accounting; free_session() handles those.
+ */
+static void
+__free_session(struct nfsd4_session *ses)
+{
+	free_session_slots(ses);
+	kfree(ses);
+}
+
+/*
+ * Full teardown of @ses: remove all of its connections, return the
+ * forechannel's DRC memory reservation, then free slots and session.
+ */
+static void
+free_session(struct nfsd4_session *ses)
+{
+	nfsd4_del_conns(ses);
+	nfsd4_put_drc_mem(&ses->se_fchannel);
+	__free_session(ses);
+}
+
+/*
+ * Initialise the freshly allocated session @new for client @clp from the
+ * CREATE_SESSION arguments @cses: generate its session ID, copy the
+ * callback parameters, hash it into the per-net session table and add it
+ * to the client's session list (the latter under cl_lock).  Finally the
+ * client's callback address is set to the peer address of @rqstp.
+ */
+static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new,
+			 struct nfs4_client *clp,
+			 struct nfsd4_create_session *cses)
+{
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	struct sockaddr *peer = svc_addr(rqstp);
+	int bucket;
+
+	new->se_client = clp;
+	gen_sessionid(new);
+
+	INIT_LIST_HEAD(&new->se_conns);
+
+	new->se_cb_seq_nr = 1;
+	new->se_flags = cses->flags;
+	new->se_cb_prog = cses->callback_prog;
+	new->se_cb_sec = cses->cb_sec;
+	atomic_set(&new->se_ref, 0);
+
+	bucket = hash_sessionid(&new->se_sessionid);
+	list_add(&new->se_hash, &nn->sessionid_hashtbl[bucket]);
+
+	spin_lock(&clp->cl_lock);
+	list_add(&new->se_perclnt, &clp->cl_sessions);
+	spin_unlock(&clp->cl_lock);
+
+	/*
+	 * This is a little silly; with sessions there's no real
+	 * use for the callback address. Use the peer address
+	 * as a reasonable default for now, but consider fixing
+	 * the rpc client not to require an address in the
+	 * future:
+	 */
+	rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, peer);
+	clp->cl_cb_conn.cb_addrlen = svc_addr_len(peer);
+}
+
+/*
+ * Look up a session by ID in the per-net session hash table.
+ * Returns the matching session, or NULL if there is none.  No reference
+ * is taken on the result.
+ *
+ * caller must hold client_lock
+ */
+static struct nfsd4_session *
+__find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct nfsd4_session *cur;
+	struct list_head *bucket;
+
+	lockdep_assert_held(&nn->client_lock);
+
+	dump_sessionid(__func__, sessionid);
+	bucket = &nn->sessionid_hashtbl[hash_sessionid(sessionid)];
+
+	/* Walk the one bucket this ID hashes to */
+	list_for_each_entry(cur, bucket, se_hash) {
+		if (memcmp(cur->se_sessionid.data, sessionid->data,
+			   NFS4_MAX_SESSIONID_LEN) == 0)
+			return cur;
+	}
+
+	dprintk("%s: session not found\n", __func__);
+	return NULL;
+}
+
+/*
+ * Look up a session by ID and take a reference on it via
+ * nfsd4_get_session_locked().
+ *
+ * On success returns the session and stores the (zero) status in *ret.
+ * Returns NULL with *ret = nfserr_badsession if no session matches, or
+ * NULL with *ret set to the error from nfsd4_get_session_locked().
+ */
+static struct nfsd4_session *
+find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net,
+			  __be32 *ret)
+{
+	__be32 status = nfserr_badsession;
+	struct nfsd4_session *found;
+
+	found = __find_in_sessionid_hashtbl(sessionid, net);
+	if (found) {
+		status = nfsd4_get_session_locked(found);
+		if (status)
+			found = NULL;
+	}
+
+	*ret = status;
+	return found;
+}
+
+/*
+ * Remove @ses from the session ID hash and, under the client's cl_lock,
+ * from its client's list of sessions.
+ *
+ * caller must hold client_lock
+ */
+static void
+unhash_session(struct nfsd4_session *ses)
+{
+	struct nfs4_client *owner = ses->se_client;
+	struct nfsd_net *nn = net_generic(owner->net, nfsd_net_id);
+
+	lockdep_assert_held(&nn->client_lock);
+
+	list_del(&ses->se_hash);
+
+	spin_lock(&owner->cl_lock);
+	list_del(&ses->se_perclnt);
+	spin_unlock(&owner->cl_lock);
+}
+
+/* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
+
+/*
+ * Returns 1 (and emits the nfsd_clid_stale tracepoint) if @clid carries a
+ * boot time different from this server instance's, 0 if it matches.
+ */
+static int
+STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
+{
+	/*
+	 * Only the low 32 bits of the boot time are compared, so we're
+	 * assuming the clid was not given out from a boot precisely 2^32
+	 * seconds (about 136 years) before this one. That seems a safe
+	 * assumption:
+	 */
+	if (clid->cl_boot != (u32)nn->boot_time) {
+		trace_nfsd_clid_stale(clid);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Allocate a zeroed nfs4_client from client_slab, duplicate @name into
+ * it, allocate the owner-string hash table and initialise the lists,
+ * locks, idr and callback wait queue visible below.  Identity fields
+ * (clientid, verifier, credentials) and the cl_nfsdfs reference count are
+ * left for the caller to set up.
+ *
+ * Returns NULL if any allocation fails; nothing is leaked in that case.
+ */
+static struct nfs4_client *alloc_client(struct xdr_netobj name)
+{
+	struct nfs4_client *clp;
+	int bucket;
+
+	clp = kmem_cache_zalloc(client_slab, GFP_KERNEL);
+	if (!clp)
+		return NULL;
+
+	xdr_netobj_dup(&clp->cl_name, &name, GFP_KERNEL);
+	if (!clp->cl_name.data)
+		goto err_no_name;
+
+	clp->cl_ownerstr_hashtbl = kmalloc_array(OWNER_HASH_SIZE,
+						 sizeof(struct list_head),
+						 GFP_KERNEL);
+	if (clp->cl_ownerstr_hashtbl == NULL)
+		goto err_no_hashtbl;
+	for (bucket = 0; bucket < OWNER_HASH_SIZE; bucket++)
+		INIT_LIST_HEAD(&clp->cl_ownerstr_hashtbl[bucket]);
+
+	INIT_LIST_HEAD(&clp->cl_sessions);
+	idr_init(&clp->cl_stateids);
+	atomic_set(&clp->cl_rpc_users, 0);
+	clp->cl_cb_state = NFSD4_CB_UNKNOWN;
+	INIT_LIST_HEAD(&clp->cl_idhash);
+	INIT_LIST_HEAD(&clp->cl_openowners);
+	INIT_LIST_HEAD(&clp->cl_delegations);
+	INIT_LIST_HEAD(&clp->cl_lru);
+	INIT_LIST_HEAD(&clp->cl_revoked);
+#ifdef CONFIG_NFSD_PNFS
+	INIT_LIST_HEAD(&clp->cl_lo_states);
+#endif
+	INIT_LIST_HEAD(&clp->async_copies);
+	spin_lock_init(&clp->async_lock);
+	spin_lock_init(&clp->cl_lock);
+	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
+	return clp;
+
+err_no_hashtbl:
+	kfree(clp->cl_name.data);
+err_no_name:
+	kmem_cache_free(client_slab, clp);
+	return NULL;
+}
+
+/*
+ * kref release function for a client's cl_nfsdfs.cl_ref (see
+ * drop_client()).  Frees the credential, the owner hash table, the name
+ * and implementation-id strings, the stateid idr, and finally returns the
+ * client to client_slab.
+ */
+static void __free_client(struct kref *k)
+{
+	struct nfsdfs_client *nc = container_of(k, struct nfsdfs_client,
+						cl_ref);
+	struct nfs4_client *client = container_of(nc, struct nfs4_client,
+						  cl_nfsdfs);
+
+	free_svc_cred(&client->cl_cred);
+	kfree(client->cl_ownerstr_hashtbl);
+	kfree(client->cl_name.data);
+	kfree(client->cl_nii_domain.data);
+	kfree(client->cl_nii_name.data);
+	idr_destroy(&client->cl_stateids);
+	kmem_cache_free(client_slab, client);
+}
+
+/*
+ * Drop one reference on @clp's cl_nfsdfs.cl_ref; the final put frees the
+ * client through __free_client().
+ */
+static void
+drop_client(struct nfs4_client *clp)
+{
+	struct kref *ref = &clp->cl_nfsdfs.cl_ref;
+
+	kref_put(ref, __free_client);
+}
+
+/*
+ * Tear down a client: free every session still on its list (warning once
+ * if a session is still referenced), destroy the callback wait queue,
+ * remove the client's nfsdfs directory if it has one (waking expiry_wq
+ * waiters), and drop a reference on the client.
+ */
+static void
+free_client(struct nfs4_client *clp)
+{
+	struct nfsd4_session *ses;
+
+	while (!list_empty(&clp->cl_sessions)) {
+		ses = list_first_entry(&clp->cl_sessions, struct nfsd4_session,
+				       se_perclnt);
+		list_del(&ses->se_perclnt);
+		WARN_ON_ONCE(atomic_read(&ses->se_ref));
+		free_session(ses);
+	}
+
+	rpc_destroy_wait_queue(&clp->cl_cb_waitq);
+
+	if (clp->cl_nfsd_dentry != NULL) {
+		nfsd_client_rmdir(clp->cl_nfsd_dentry);
+		clp->cl_nfsd_dentry = NULL;
+		/* force_expire_client() may be waiting for the dentry to go */
+		wake_up_all(&expiry_wq);
+	}
+
+	drop_client(clp);
+}
+
+/*
+ * Make @clp unreachable: mark it expired, remove it from the clientid
+ * hash and from whichever name tree (confirmed or unconfirmed) it is in,
+ * take it off the LRU, and unhash all of its sessions.
+ *
+ * must be called under the client_lock
+ */
+static void
+unhash_client_locked(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct nfsd4_session *ses;
+	struct rb_root *tree;
+
+	lockdep_assert_held(&nn->client_lock);
+
+	/* Mark the client as expired! */
+	clp->cl_time = 0;
+
+	/* Make it invisible */
+	if (!list_empty(&clp->cl_idhash)) {
+		list_del_init(&clp->cl_idhash);
+		if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
+			tree = &nn->conf_name_tree;
+		else
+			tree = &nn->unconf_name_tree;
+		rb_erase(&clp->cl_namenode, tree);
+	}
+	list_del_init(&clp->cl_lru);
+
+	spin_lock(&clp->cl_lock);
+	list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
+		list_del_init(&ses->se_hash);
+	spin_unlock(&clp->cl_lock);
+}
+
+/*
+ * Locked wrapper: take the per-net client_lock and unhash @clp.
+ */
+static void
+unhash_client(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	spinlock_t *lock = &nn->client_lock;
+
+	spin_lock(lock);
+	unhash_client_locked(clp);
+	spin_unlock(lock);
+}
+
+/*
+ * Unhash @clp unless it is currently in use.  Returns nfserr_jukebox when
+ * cl_rpc_users is non-zero (client left untouched), nfs_ok once the
+ * client has been unhashed.  Calls unhash_client_locked(), so the
+ * client_lock must be held.
+ */
+static __be32 mark_client_expired_locked(struct nfs4_client *clp)
+{
+	bool busy = atomic_read(&clp->cl_rpc_users) != 0;
+
+	if (!busy) {
+		unhash_client_locked(clp);
+		return nfs_ok;
+	}
+	return nfserr_jukebox;
+}
+
+/*
+ * Release all state held by an already-unhashed client and free it:
+ * delegations (unhashed under state_lock, destroyed after it is dropped),
+ * revoked delegations, open owners, blocked locks of the remaining
+ * owners, layouts, async copies and the callback channel.  Waiters on
+ * expiry_wq are woken at the end.
+ */
+static void
+__destroy_client(struct nfs4_client *clp)
+{
+	struct nfs4_stateowner *so, *next_so;
+	struct nfs4_delegation *dp;
+	struct nfs4_openowner *oo;
+	LIST_HEAD(reaplist);
+	int bucket;
+
+	/* Collect delegations under the lock, destroy them outside it. */
+	spin_lock(&state_lock);
+	while (!list_empty(&clp->cl_delegations)) {
+		dp = list_first_entry(&clp->cl_delegations,
+				      struct nfs4_delegation, dl_perclnt);
+		WARN_ON(!unhash_delegation_locked(dp));
+		list_add(&dp->dl_recall_lru, &reaplist);
+	}
+	spin_unlock(&state_lock);
+
+	while (!list_empty(&reaplist)) {
+		dp = list_first_entry(&reaplist, struct nfs4_delegation,
+				      dl_recall_lru);
+		list_del_init(&dp->dl_recall_lru);
+		destroy_unhashed_deleg(dp);
+	}
+
+	while (!list_empty(&clp->cl_revoked)) {
+		dp = list_first_entry(&clp->cl_revoked, struct nfs4_delegation,
+				      dl_recall_lru);
+		list_del_init(&dp->dl_recall_lru);
+		nfs4_put_stid(&dp->dl_stid);
+	}
+
+	while (!list_empty(&clp->cl_openowners)) {
+		oo = list_first_entry(&clp->cl_openowners,
+				      struct nfs4_openowner, oo_perclient);
+		nfs4_get_stateowner(&oo->oo_owner);
+		release_openowner(oo);
+	}
+
+	for (bucket = 0; bucket < OWNER_HASH_SIZE; bucket++) {
+		list_for_each_entry_safe(so, next_so,
+					 &clp->cl_ownerstr_hashtbl[bucket],
+					 so_strhash) {
+			/* Should be no openowners at this point */
+			WARN_ON_ONCE(so->so_is_open_owner);
+			remove_blocked_locks(lockowner(so));
+		}
+	}
+
+	nfsd4_return_all_client_layouts(clp);
+	nfsd4_shutdown_copy(clp);
+	nfsd4_shutdown_callback(clp);
+	if (clp->cl_cb_conn.cb_xprt != NULL)
+		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
+	free_client(clp);
+	wake_up_all(&expiry_wq);
+}
+
+/*
+ * Unhash @clp (taking client_lock internally) and then destroy it.
+ * Unlike expire_client(), the client's stable-storage record is not
+ * removed.
+ */
+static void destroy_client(struct nfs4_client *clp)
+{
+	unhash_client(clp);
+	__destroy_client(clp);
+}
+
+/*
+ * Account one client finishing reclaim.  Nothing happens unless reclaim
+ * completes are tracked for this net and the client is found by
+ * nfsd4_find_reclaim_client().  When the incremented counter equals
+ * reclaim_str_hashtbl_size the grace period is ended.
+ */
+static void inc_reclaim_complete(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (!nn->track_reclaim_completes ||
+	    !nfsd4_find_reclaim_client(clp->cl_name, nn))
+		return;
+
+	if (atomic_inc_return(&nn->nr_reclaim_complete) !=
+	    nn->reclaim_str_hashtbl_size)
+		return;
+
+	printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
+	       clp->net->ns.inum);
+	nfsd4_end_grace(nn);
+}
+
+/*
+ * Expire @clp: unhash it, remove its client record via
+ * nfsd4_client_record_remove(), then destroy it.
+ */
+static void
+expire_client(struct nfs4_client *clp)
+{
+	unhash_client(clp);
+	nfsd4_client_record_remove(clp);
+	__destroy_client(clp);
+}
+
+/*
+ * Copy the verifier bytes of @source into @target's cl_verifier.  The
+ * length copied is the size of the destination array.
+ */
+static void
+copy_verf(struct nfs4_client *target, nfs4_verifier *source)
+{
+	const size_t len = sizeof(target->cl_verifier.data);
+
+	memcpy(target->cl_verifier.data, source->data, len);
+}
+
+/*
+ * Copy both halves of the clientid (boot time and id) from @source to
+ * @target.
+ */
+static void
+copy_clid(struct nfs4_client *target, struct nfs4_client *source)
+{
+	clientid_t *dst = &target->cl_clientid;
+	clientid_t *src = &source->cl_clientid;
+
+	dst->cl_boot = src->cl_boot;
+	dst->cl_id = src->cl_id;
+}
+
+/*
+ * Duplicate @source into @target: the three principal strings are copied
+ * with kstrdup(), scalar fields are assigned, and references are taken on
+ * the group info and (if present) the GSS mechanism.
+ *
+ * Returns 0 on success or -ENOMEM if a non-NULL source string could not
+ * be duplicated.  On failure the strings that were duplicated remain in
+ * @target; this function does not free them.
+ */
+static int copy_cred(struct svc_cred *target, struct svc_cred *source)
+{
+	target->cr_principal = kstrdup(source->cr_principal, GFP_KERNEL);
+	target->cr_raw_principal = kstrdup(source->cr_raw_principal,
+					   GFP_KERNEL);
+	target->cr_targ_princ = kstrdup(source->cr_targ_princ, GFP_KERNEL);
+
+	/* A NULL copy is only an error when the source string was set. */
+	if (source->cr_principal && !target->cr_principal)
+		return -ENOMEM;
+	if (source->cr_raw_principal && !target->cr_raw_principal)
+		return -ENOMEM;
+	if (source->cr_targ_princ && !target->cr_targ_princ)
+		return -ENOMEM;
+
+	target->cr_flavor = source->cr_flavor;
+	target->cr_uid = source->cr_uid;
+	target->cr_gid = source->cr_gid;
+
+	target->cr_group_info = source->cr_group_info;
+	get_group_info(target->cr_group_info);
+
+	target->cr_gss_mech = source->cr_gss_mech;
+	if (source->cr_gss_mech)
+		gss_mech_get(source->cr_gss_mech);
+
+	return 0;
+}
+
+/*
+ * Three-way comparison of two opaque blobs: shorter sorts first (-1 / 1),
+ * and blobs of equal length are ordered by memcmp() of their contents.
+ */
+static int
+compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2)
+{
+	if (o1->len != o2->len)
+		return o1->len < o2->len ? -1 : 1;
+
+	return memcmp(o1->data, o2->data, o1->len);
+}
+
+/*
+ * Returns non-zero iff the two verifiers hold identical bytes.
+ */
+static int
+same_verf(nfs4_verifier *v1, nfs4_verifier *v2)
+{
+	return memcmp(v1->data, v2->data, sizeof(v1->data)) == 0;
+}
+
+/*
+ * Returns non-zero iff both the boot time and the id of the two
+ * clientids are equal.
+ */
+static int
+same_clid(clientid_t *cl1, clientid_t *cl2)
+{
+	if (cl1->cl_boot != cl2->cl_boot)
+		return 0;
+	return cl1->cl_id == cl2->cl_id;
+}
+
+/*
+ * Returns true iff the two group lists have the same length and the same
+ * gid at every index (order matters).
+ */
+static bool groups_equal(struct group_info *g1, struct group_info *g2)
+{
+	int idx;
+
+	if (g1->ngroups != g2->ngroups)
+		return false;
+
+	for (idx = 0; idx < g1->ngroups; idx++) {
+		if (!gid_eq(g1->gid[idx], g2->gid[idx]))
+			return false;
+	}
+	return true;
+}
+
+/*
+ * RFC 3530 language requires clid_inuse be returned when the
+ * "principal" associated with a request differs from that previously
+ * used. We use uid, gids, and gss principal string as our best
+ * approximation. We also don't want to allow non-gss use of a client
+ * established using gss: in theory cr_principal should catch that
+ * change, but in practice cr_principal can be null even in the gss case
+ * since gssd doesn't always pass down a principal string.
+ *
+ * is_gss_cred() reports whether @cr's flavor is one of the gss
+ * "pseudoflavors", i.e. numerically above RPC_AUTH_MAXFLAVOR.
+ */
+static bool is_gss_cred(struct svc_cred *cr)
+{
+	return cr->cr_flavor > RPC_AUTH_MAXFLAVOR;
+}
+
+
+/*
+ * Returns true iff two credentials are considered the same principal:
+ * both or neither are gss, uid/gid/supplementary groups match, and the
+ * principal strings are either the same pointer (including both NULL) or
+ * compare equal.
+ */
+static bool
+same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
+{
+	if (is_gss_cred(cr1) != is_gss_cred(cr2))
+		return false;
+	if (!uid_eq(cr1->cr_uid, cr2->cr_uid))
+		return false;
+	if (!gid_eq(cr1->cr_gid, cr2->cr_gid))
+		return false;
+	if (!groups_equal(cr1->cr_group_info, cr2->cr_group_info))
+		return false;
+
+	/* XXX: check that cr_targ_princ fields match ? */
+	if (cr1->cr_principal == cr2->cr_principal)
+		return true;
+	/* Exactly one of them is NULL at this point, or both are set. */
+	if (cr1->cr_principal == NULL || cr2->cr_principal == NULL)
+		return false;
+	return strcmp(cr1->cr_principal, cr2->cr_principal) == 0;
+}
+
+/*
+ * Returns true iff the request was made with a GSS mechanism whose
+ * service level is integrity or privacy.
+ */
+static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp)
+{
+	struct svc_cred *cred = &rqstp->rq_cred;
+
+	if (cred->cr_gss_mech == NULL)
+		return false;
+
+	switch (gss_pseudoflavor_to_service(cred->cr_gss_mech,
+					    cred->cr_flavor)) {
+	case RPC_GSS_SVC_INTEGRITY:
+	case RPC_GSS_SVC_PRIVACY:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * Check whether the credential of @rqstp matches the machine credential
+ * the client @cl was established with.
+ *
+ * Returns true if the client does not use a machine credential at all.
+ * Otherwise the request must use the same GSS mechanism, be integrity or
+ * privacy protected, and present the same principal: the raw principal is
+ * compared when the client recorded one, else the cr_principal string.
+ * A request lacking the principal needed for the comparison never matches.
+ */
+bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
+{
+	struct svc_cred *cr = &rqstp->rq_cred;
+
+	if (!cl->cl_mach_cred)
+		return true;
+	if (cl->cl_cred.cr_gss_mech != cr->cr_gss_mech)
+		return false;
+	if (!svc_rqst_integrity_protected(rqstp))
+		return false;
+	if (cl->cl_cred.cr_raw_principal) {
+		/* Guard the strcmp() the same way cr_principal is guarded. */
+		if (!cr->cr_raw_principal)
+			return false;
+		return 0 == strcmp(cl->cl_cred.cr_raw_principal,
+				   cr->cr_raw_principal);
+	}
+	if (!cr->cr_principal)
+		return false;
+	return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
+}
+
+/*
+ * Generate a new confirm verifier for @clp from the current wall-clock
+ * seconds and the per-net clverifier_counter (post-incremented).
+ */
+static void gen_confirm(struct nfs4_client *clp, struct nfsd_net *nn)
+{
+	__be32 words[2];
+
+	/*
+	 * This is opaque to client, so no need to byte-swap. Use
+	 * __force to keep sparse happy
+	 */
+	words[0] = (__force __be32)(u32)ktime_get_real_seconds();
+	words[1] = (__force __be32)nn->clverifier_counter++;
+
+	memcpy(clp->cl_confirm.data, words, sizeof(clp->cl_confirm.data));
+}
+
+/*
+ * Assign @clp a new clientid (low 32 bits of the per-net boot time plus
+ * the post-incremented clientid_counter) and a fresh confirm verifier.
+ */
+static void
+gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
+{
+	clientid_t *id = &clp->cl_clientid;
+
+	id->cl_boot = (u32)nn->boot_time;
+	id->cl_id = nn->clientid_counter++;
+	gen_confirm(clp, nn);
+}
+
+/*
+ * Look up the stateid @t in @cl's idr by its so_id.  Returns the stid, or
+ * NULL if there is no entry or the entry's sc_type is zero.  No reference
+ * is taken; find_stateid_by_type() calls this with cl_lock held.
+ */
+static struct nfs4_stid *
+find_stateid_locked(struct nfs4_client *cl, stateid_t *t)
+{
+	struct nfs4_stid *stid = idr_find(&cl->cl_stateids,
+					  t->si_opaque.so_id);
+
+	if (stid && stid->sc_type)
+		return stid;
+	return NULL;
+}
+
+/*
+ * Look up stateid @t for client @cl and return it with an extra
+ * reference (sc_count) if its type is one of the bits in @typemask.
+ * Returns NULL if the stateid is not found or has a different type.
+ * The lookup and the reference bump happen under cl_lock.
+ */
+static struct nfs4_stid *
+find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
+{
+	struct nfs4_stid *stid;
+
+	spin_lock(&cl->cl_lock);
+	stid = find_stateid_locked(cl, t);
+	if (stid != NULL && !(typemask & stid->sc_type))
+		stid = NULL;
+	if (stid != NULL)
+		refcount_inc(&stid->sc_count);
+	spin_unlock(&cl->cl_lock);
+
+	return stid;
+}
+
+/*
+ * Map an nfsdfs inode to the nfs4_client embedding its nfsdfs_client.
+ * Returns NULL if get_nfsdfs_client() finds nothing.  Callers in this
+ * file pair a successful call with drop_client().
+ */
+static struct nfs4_client *get_nfsdfs_clp(struct inode *inode)
+{
+	struct nfsdfs_client *nc = get_nfsdfs_client(inode);
+
+	return nc ? container_of(nc, struct nfs4_client, cl_nfsdfs) : NULL;
+}
+
+/*
+ * Emit @len bytes of @data to @m, escaped with seq_escape_mem_ascii() and
+ * wrapped in double quotes.
+ */
+static void seq_quote_mem(struct seq_file *m, char *data, int len)
+{
+	seq_putc(m, '"');
+	seq_escape_mem_ascii(m, data, len);
+	seq_putc(m, '"');
+}
+
+/*
+ * seq_file show routine for a client's "info" file: prints the clientid,
+ * address, quoted name, minor version and, if the client supplied one,
+ * its implementation id.  Returns -ENXIO if the inode no longer maps to a
+ * client.  The reference obtained for the lookup is dropped before return.
+ */
+static int client_info_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct nfs4_client *clp = get_nfsdfs_clp(inode);
+	u64 clid;
+
+	if (clp == NULL)
+		return -ENXIO;
+
+	/* print the whole clientid structure as one 64-bit number */
+	memcpy(&clid, &clp->cl_clientid, sizeof(clid));
+	seq_printf(m, "clientid: 0x%llx\n", clid);
+	seq_printf(m, "address: \"%pISpc\"\n", (struct sockaddr *)&clp->cl_addr);
+
+	seq_puts(m, "name: ");
+	seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len);
+	seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion);
+
+	if (clp->cl_nii_domain.data != NULL) {
+		seq_puts(m, "Implementation domain: ");
+		seq_quote_mem(m, clp->cl_nii_domain.data,
+			      clp->cl_nii_domain.len);
+		seq_puts(m, "\nImplementation name: ");
+		seq_quote_mem(m, clp->cl_nii_name.data, clp->cl_nii_name.len);
+		seq_printf(m, "\nImplementation time: [%lld, %ld]\n",
+			   clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec);
+	}
+
+	drop_client(clp);
+	return 0;
+}
+
+/*
+ * ->open for the "info" file: a single-record seq_file whose private data
+ * is the inode, rendered by client_info_show().
+ */
+static int
+client_info_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, client_info_show, inode);
+}
+
+static const struct file_operations client_info_fops = { /* ops for the "info" entry of client_files[] */
+ .open = client_info_open, /* single_open() bound to client_info_show() */
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release, /* pairs with single_open() in client_info_open() */
+};
+
+/*
+ * seq_file ->start for the "states" file: take the client's cl_lock and
+ * return the first stateid whose id is >= *pos (NULL if none), updating
+ * *pos to the id found.  The lock is released in states_stop().
+ */
+static void *states_start(struct seq_file *s, loff_t *pos)
+	__acquires(&clp->cl_lock)
+{
+	struct nfs4_client *clp = s->private;
+	unsigned long next_id = *pos;
+	void *stid;
+
+	spin_lock(&clp->cl_lock);
+	stid = idr_get_next_ul(&clp->cl_stateids, &next_id);
+	*pos = next_id;
+	return stid;
+}
+
+/*
+ * seq_file ->next for the "states" file: return the first stateid with an
+ * id greater than *pos (NULL if none) and store the id reached in *pos.
+ * Runs between states_start() and states_stop(), i.e. with cl_lock held.
+ */
+static void *states_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct nfs4_client *clp = s->private;
+	unsigned long next_id = *pos;
+	void *stid;
+
+	next_id++;
+	stid = idr_get_next_ul(&clp->cl_stateids, &next_id);
+	*pos = next_id;
+	return stid;
+}
+
+/*
+ * seq_file ->stop for the "states" file: release the cl_lock taken in
+ * states_start().
+ */
+static void states_stop(struct seq_file *s, void *v)
+	__releases(&clp->cl_lock)
+{
+	struct nfs4_client *clp = s->private;
+	spinlock_t *lock = &clp->cl_lock;
+
+	spin_unlock(lock);
+}
+
+/*
+ * Print `filename: "<name>"` for @f, formatting f->nf_file with %pD2.
+ */
+static void
+nfs4_show_fname(struct seq_file *s, struct nfsd_file *f)
+{
+	seq_printf(s, "filename: \"%pD2\"", f->nf_file);
+}
+
+/*
+ * Print `superblock: "<major>:<minor>:<ino>"` for the inode behind @f:
+ * device major and minor in hex, inode number in decimal.
+ *
+ * The inode number is an unsigned long, so it is printed with %lu; %ld
+ * would show numbers with the top bit set as negative.
+ */
+static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f)
+{
+	struct inode *inode = f->nf_inode;
+
+	seq_printf(s, "superblock: \"%02x:%02x:%lu\"",
+		   MAJOR(inode->i_sb->s_dev),
+		   MINOR(inode->i_sb->s_dev),
+		   inode->i_ino);
+}
+
+/*
+ * Print `owner: ` followed by the state owner's opaque name, quoted and
+ * escaped.
+ */
+static void nfs4_show_owner(struct seq_file *s, struct nfs4_stateowner *oo)
+{
+	seq_puts(s, "owner: ");
+	seq_quote_mem(s, oo->so_owner.data, oo->so_owner.len);
+}
+
+/*
+ * Print a stateid as "0x" + 8 hex digits of the generation, immediately
+ * followed by the 12 opaque bytes in hex.
+ */
+static void
+nfs4_show_stateid(struct seq_file *s, stateid_t *stid)
+{
+	seq_printf(s, "0x%.8x", stid->si_generation);
+	seq_printf(s, "%12phN", &stid->si_opaque);
+}
+
+/*
+ * Emit one "states" line describing an open stateid: stateid, access and
+ * deny modes, superblock, filename and owner.  Nothing is printed if the
+ * stid is neither an open nor a lock stateid, or if no file is found for
+ * it.  Always returns 0.  The file lookup and all output happen under the
+ * nfs4_file's fi_lock.
+ */
+static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
+{
+	struct nfs4_ol_stateid *ols;
+	struct nfs4_stateowner *owner;
+	struct nfs4_file *nf;
+	struct nfsd_file *file;
+	unsigned int access, deny;
+
+	if (!(st->sc_type == NFS4_OPEN_STID || st->sc_type == NFS4_LOCK_STID))
+		return 0; /* XXX: or SEQ_SKIP? */
+
+	ols = openlockstateid(st);
+	owner = ols->st_stateowner;
+	nf = st->sc_file;
+
+	spin_lock(&nf->fi_lock);
+	file = find_any_file_locked(nf);
+	if (file) {
+		seq_puts(s, "- ");
+		nfs4_show_stateid(s, &st->sc_stateid);
+		seq_puts(s, ": { type: open, ");
+
+		access = bmap_to_share_mode(ols->st_access_bmap);
+		deny = bmap_to_share_mode(ols->st_deny_bmap);
+
+		seq_printf(s, "access: %s%s, ",
+			   access & NFS4_SHARE_ACCESS_READ ? "r" : "-",
+			   access & NFS4_SHARE_ACCESS_WRITE ? "w" : "-");
+		seq_printf(s, "deny: %s%s, ",
+			   deny & NFS4_SHARE_ACCESS_READ ? "r" : "-",
+			   deny & NFS4_SHARE_ACCESS_WRITE ? "w" : "-");
+
+		nfs4_show_superblock(s, file);
+		seq_puts(s, ", ");
+		nfs4_show_fname(s, file);
+		seq_puts(s, ", ");
+		nfs4_show_owner(s, owner);
+		seq_puts(s, " }\n");
+	}
+	spin_unlock(&nf->fi_lock);
+
+	return 0;
+}
+
+/*
+ * Emit one "states" line describing a lock stateid: stateid, superblock,
+ * filename and owner.  Nothing is printed if no file is found for the
+ * stateid.  Always returns 0.  The file lookup and all output happen
+ * under the nfs4_file's fi_lock.
+ */
+static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
+{
+	struct nfs4_ol_stateid *ols = openlockstateid(st);
+	struct nfs4_stateowner *owner = ols->st_stateowner;
+	struct nfs4_file *nf = st->sc_file;
+	struct nfsd_file *file;
+
+	spin_lock(&nf->fi_lock);
+	file = find_any_file_locked(nf);
+	if (file) {
+		seq_puts(s, "- ");
+		nfs4_show_stateid(s, &st->sc_stateid);
+		seq_puts(s, ": { type: lock, ");
+
+		/*
+		 * Note: a lock stateid isn't really the same thing as a
+		 * lock, it's the locking state held by one owner on a
+		 * file, and there may be multiple (or no) lock ranges
+		 * associated with it.  (For that matter, the same is true
+		 * of open stateids.)
+		 */
+
+		nfs4_show_superblock(s, file);
+		/* XXX: open stateid? */
+		seq_puts(s, ", ");
+		nfs4_show_fname(s, file);
+		seq_puts(s, ", ");
+		nfs4_show_owner(s, owner);
+		seq_puts(s, " }\n");
+	}
+	spin_unlock(&nf->fi_lock);
+
+	return 0;
+}
+
+/*
+ * Emit one "states" line describing a delegation stateid: stateid,
+ * access type, superblock and filename.  Nothing is printed if the
+ * delegation has no file.  Always returns 0.  The file lookup and all
+ * output happen under the nfs4_file's fi_lock.
+ */
+static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
+{
+	struct nfs4_delegation *deleg = delegstateid(st);
+	struct nfs4_file *nf = st->sc_file;
+	struct nfsd_file *file;
+
+	spin_lock(&nf->fi_lock);
+	file = find_deleg_file_locked(nf);
+	if (file) {
+		seq_puts(s, "- ");
+		nfs4_show_stateid(s, &st->sc_stateid);
+		seq_puts(s, ": { type: deleg, ");
+
+		/* Kinda dead code as long as we only support read delegs: */
+		seq_printf(s, "access: %s, ",
+			   deleg->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w");
+
+		/* XXX: lease time, whether it's being recalled. */
+
+		nfs4_show_superblock(s, file);
+		seq_puts(s, ", ");
+		nfs4_show_fname(s, file);
+		seq_puts(s, " }\n");
+	}
+	spin_unlock(&nf->fi_lock);
+
+	return 0;
+}
+
+/*
+ * Emit one "states" line describing a layout stateid: stateid,
+ * superblock and filename of the layout's ls_file.  Always returns 0.
+ */
+static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st)
+{
+	struct nfs4_layout_stateid *layout =
+		container_of(st, struct nfs4_layout_stateid, ls_stid);
+	struct nfsd_file *file = layout->ls_file;
+
+	seq_puts(s, "- ");
+	nfs4_show_stateid(s, &st->sc_stateid);
+	seq_puts(s, ": { type: layout, ");
+
+	/* XXX: What else would be useful? */
+
+	nfs4_show_superblock(s, file);
+	seq_puts(s, ", ");
+	nfs4_show_fname(s, file);
+	seq_puts(s, " }\n");
+
+	return 0;
+}
+
+/*
+ * seq_file ->show for the "states" file: dispatch on the stateid type to
+ * the matching printer.  Types without a printer produce no output and
+ * return 0.
+ */
+static int states_show(struct seq_file *s, void *v)
+{
+	struct nfs4_stid *stid = v;
+	int ret = 0; /* XXX: or SEQ_SKIP for unhandled types? */
+
+	switch (stid->sc_type) {
+	case NFS4_OPEN_STID:
+		ret = nfs4_show_open(s, stid);
+		break;
+	case NFS4_LOCK_STID:
+		ret = nfs4_show_lock(s, stid);
+		break;
+	case NFS4_DELEG_STID:
+		ret = nfs4_show_deleg(s, stid);
+		break;
+	case NFS4_LAYOUT_STID:
+		ret = nfs4_show_layout(s, stid);
+		break;
+	default:
+		/* XXX: copy stateids? */
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Iterator for the per-client "states" file.  start/stop bracket the walk
+ * with the client's cl_lock; the table is never modified, hence const.
+ */
+static const struct seq_operations states_seq_ops = {
+	.start = states_start,
+	.next = states_next,
+	.stop = states_stop,
+	.show = states_show
+};
+
+/*
+ * ->open for the "states" file.  Looks up the client for @inode (taking a
+ * reference) and opens a seq_file whose private pointer is that client.
+ * The reference is held for the lifetime of the open file and dropped in
+ * client_opens_release().
+ *
+ * Returns 0 on success, -ENXIO if the inode no longer maps to a client,
+ * or the error from seq_open().
+ */
+static int client_states_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *s;
+	struct nfs4_client *clp;
+	int ret;
+
+	clp = get_nfsdfs_clp(inode);
+	if (!clp)
+		return -ENXIO;
+
+	ret = seq_open(file, &states_seq_ops);
+	if (ret) {
+		/*
+		 * ->release is not called for a file whose ->open failed,
+		 * so the reference taken above must be dropped here or it
+		 * is leaked.
+		 */
+		drop_client(clp);
+		return ret;
+	}
+	s = file->private_data;
+	s->private = clp;
+	return 0;
+}
+
+/*
+ * ->release for the "states" file: drop the client reference stored in
+ * the seq_file by client_states_open(), then release the seq_file.
+ */
+static int client_opens_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+
+	/* XXX: alternatively, we could get/drop in seq start/stop */
+	drop_client((struct nfs4_client *)seq->private);
+
+	return seq_release(inode, file);
+}
+
+static const struct file_operations client_states_fops = { /* ops for the "states" entry of client_files[] */
+ .open = client_states_open, /* takes a client reference and opens states_seq_ops */
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = client_opens_release, /* drops the reference taken at open */
+};
+
+/*
+ * Normally we refuse to destroy clients that are in use, but here the
+ * administrator is telling us to just do it.  We also want to wait
+ * so the caller has a guarantee that the client's locks are gone by
+ * the time the write returns:
+ *
+ *  1. mark the client expired (cl_time = 0) under client_lock;
+ *  2. wait on expiry_wq until cl_rpc_users drops to zero;
+ *  3. if the client is still on the LRU, unhash and expire it ourselves;
+ *     otherwise somebody else got there first, so wait until its nfsdfs
+ *     dentry has been removed.
+ */
+static void force_expire_client(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	bool already_expired;
+
+	spin_lock(&nn->client_lock);
+	clp->cl_time = 0;
+	spin_unlock(&nn->client_lock);
+
+	wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0);
+
+	spin_lock(&nn->client_lock);
+	already_expired = list_empty(&clp->cl_lru);
+	if (!already_expired)
+		unhash_client_locked(clp);
+	spin_unlock(&nn->client_lock);
+
+	if (already_expired) {
+		wait_event(expiry_wq, clp->cl_nfsd_dentry == NULL);
+		return;
+	}
+	expire_client(clp);
+}
+
+/*
+ * ->write for the per-client "ctl" file.  The only accepted input is the
+ * exact 7-byte string "expire\n", which forcibly expires the client.
+ *
+ * Returns 7 on success, -EINVAL for any other input, -ENXIO if the inode
+ * no longer maps to a client, or the error from simple_transaction_get().
+ */
+static ssize_t client_ctl_write(struct file *file, const char __user *buf,
+				size_t size, loff_t *pos)
+{
+	struct nfs4_client *clp;
+	char *cmd;
+
+	cmd = simple_transaction_get(file, buf, size);
+	if (IS_ERR(cmd))
+		return PTR_ERR(cmd);
+
+	if (size != 7)
+		return -EINVAL;
+	if (memcmp(cmd, "expire\n", 7) != 0)
+		return -EINVAL;
+
+	clp = get_nfsdfs_clp(file_inode(file));
+	if (clp == NULL)
+		return -ENXIO;
+
+	force_expire_client(clp);
+	drop_client(clp);
+	return 7;
+}
+
+static const struct file_operations client_ctl_fops = { /* ops for the "ctl" entry of client_files[] */
+ .write = client_ctl_write, /* accepts only "expire\n" */
+ .release = simple_transaction_release, /* pairs with simple_transaction_get() in the write path */
+};
+
+static const struct tree_descr client_files[] = { /* passed to nfsd_client_mkdir() by create_client() */
+ [0] = {"info", &client_info_fops, S_IRUSR}, /* owner read-only */
+ [1] = {"states", &client_states_fops, S_IRUSR}, /* owner read-only */
+ [2] = {"ctl", &client_ctl_fops, S_IWUSR}, /* owner write-only */
+ [3] = {""}, /* empty name: presumably the end-of-table marker -- confirm against nfsd_client_mkdir() */
+};
+
+/*
+ * Allocate and initialise a new client named @name for the request
+ * @rqstp: copy the request's credential, generate a clientid and confirm
+ * verifier, record verifier @verf and the peer address, and create the
+ * client's nfsdfs directory.
+ *
+ * Returns the new client (holding the initial cl_nfsdfs reference), or
+ * NULL on any failure, in which case everything allocated is released.
+ */
+static struct nfs4_client *create_client(struct xdr_netobj name,
+		struct svc_rqst *rqstp, nfs4_verifier *verf)
+{
+	struct nfs4_client *clp;
+	struct sockaddr *sa = svc_addr(rqstp);
+	int ret;
+	struct net *net = SVC_NET(rqstp);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	clp = alloc_client(name);
+	if (clp == NULL)
+		return NULL;
+
+	/*
+	 * free_client() ends in kref_put() on cl_nfsdfs.cl_ref, so the
+	 * reference count has to be live before the first error path that
+	 * calls it.  With the count still zero from the zeroing allocator,
+	 * the put would underflow and the client would never be freed.
+	 */
+	kref_init(&clp->cl_nfsdfs.cl_ref);
+
+	ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
+	if (ret) {
+		free_client(clp);
+		return NULL;
+	}
+	gen_clid(clp, nn);
+	nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
+	clp->cl_time = ktime_get_boottime_seconds();
+	clear_bit(0, &clp->cl_cb_slot_busy);
+	copy_verf(clp, verf);
+	memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
+	clp->cl_cb_session = NULL;
+	clp->net = net;
+	/* the directory is named after the clientid's offset from clientid_base */
+	clp->cl_nfsd_dentry = nfsd_client_mkdir(nn, &clp->cl_nfsdfs,
+			clp->cl_clientid.cl_id - nn->clientid_base,
+			client_files);
+	if (!clp->cl_nfsd_dentry) {
+		free_client(clp);
+		return NULL;
+	}
+	return clp;
+}
+
+/*
+ * Insert @new_clp into the red-black tree @root, ordered by client name
+ * using compare_blob().  A new node goes to the right of existing nodes
+ * with an equal name.
+ */
+static void
+add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct nfs4_client *cur;
+
+	while (*link != NULL) {
+		parent = *link;
+		cur = rb_entry(parent, struct nfs4_client, cl_namenode);
+
+		if (compare_blob(&cur->cl_name, &new_clp->cl_name) > 0)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(&new_clp->cl_namenode, parent, link);
+	rb_insert_color(&new_clp->cl_namenode, root);
+}
+
+/*
+ * Search the name tree @root for a client whose name equals @name
+ * (per compare_blob()).  Returns the client or NULL.
+ */
+static struct nfs4_client *
+find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root)
+{
+	struct rb_node *node;
+
+	for (node = root->rb_node; node != NULL; ) {
+		struct nfs4_client *cur = rb_entry(node, struct nfs4_client,
+						   cl_namenode);
+		int order = compare_blob(&cur->cl_name, name);
+
+		if (order == 0)
+			return cur;
+		node = order > 0 ? node->rb_left : node->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Publish @clp as an unconfirmed client: clear its CONFIRMED flag, add it
+ * to the unconfirmed name tree and clientid hash, and renew it.
+ * client_lock must be held (asserted).
+ */
+static void
+add_to_unconfirmed(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	unsigned int bucket;
+
+	lockdep_assert_held(&nn->client_lock);
+
+	clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
+	add_clp_to_name_tree(clp, &nn->unconf_name_tree);
+
+	bucket = clientid_hashval(clp->cl_clientid.cl_id);
+	list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[bucket]);
+
+	renew_client_locked(clp);
+}
+
+/*
+ * Promote @clp from unconfirmed to confirmed: move it to the confirmed
+ * clientid hash, move it from the unconfirmed to the confirmed name tree,
+ * set its CONFIRMED flag and renew it.  client_lock must be held
+ * (asserted).
+ */
+static void
+move_to_confirmed(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	unsigned int bucket = clientid_hashval(clp->cl_clientid.cl_id);
+
+	lockdep_assert_held(&nn->client_lock);
+
+	dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
+
+	list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[bucket]);
+
+	rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
+	add_clp_to_name_tree(clp, &nn->conf_name_tree);
+
+	set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
+	renew_client_locked(clp);
+}
+
+/*
+ * Look up a client by clientid in the hash table @tbl.  A match is only
+ * returned (and renewed) if the client's use of sessions, i.e. a non-zero
+ * minor version, agrees with @sessions; a clientid match with the wrong
+ * kind yields NULL, as does no match at all.
+ */
+static struct nfs4_client *
+find_client_in_id_table(struct list_head *tbl, clientid_t *clid, bool sessions)
+{
+	struct list_head *bucket = &tbl[clientid_hashval(clid->cl_id)];
+	struct nfs4_client *cur;
+
+	list_for_each_entry(cur, bucket, cl_idhash) {
+		if (!same_clid(&cur->cl_clientid, clid))
+			continue;
+		if ((bool)cur->cl_minorversion != sessions)
+			return NULL;
+		renew_client_locked(cur);
+		return cur;
+	}
+	return NULL;
+}
+
+/*
+ * Look up a confirmed client by clientid (see find_client_in_id_table()).
+ * client_lock must be held (asserted).
+ */
+static struct nfs4_client *
+find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
+{
+	lockdep_assert_held(&nn->client_lock);
+
+	return find_client_in_id_table(nn->conf_id_hashtbl, clid, sessions);
+}
+
+/*
+ * Look up an unconfirmed client by clientid (see
+ * find_client_in_id_table()).  client_lock must be held (asserted).
+ */
+static struct nfs4_client *
+find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
+{
+	lockdep_assert_held(&nn->client_lock);
+
+	return find_client_in_id_table(nn->unconf_id_hashtbl, clid, sessions);
+}
+
+/* True when @clp has any exchange flag set (cl_exchange_flags non-zero). */
+static bool clp_used_exchangeid(struct nfs4_client *clp)
+{
+	if (clp->cl_exchange_flags)
+		return true;
+	return false;
+}
+
+/*
+ * Look a client up by @name in the confirmed name tree of @nn.
+ * The caller must hold nn->client_lock (checked via lockdep).
+ */
+static struct nfs4_client *
+find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
+{
+	struct nfs4_client *found;
+
+	lockdep_assert_held(&nn->client_lock);
+	found = find_clp_in_name_tree(name, &nn->conf_name_tree);
+	return found;
+}
+
+/*
+ * Look a client up by @name in the unconfirmed name tree of @nn.
+ * The caller must hold nn->client_lock (checked via lockdep).
+ */
+static struct nfs4_client *
+find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
+{
+	struct nfs4_client *found;
+
+	lockdep_assert_held(&nn->client_lock);
+	found = find_clp_in_name_tree(name, &nn->unconf_name_tree);
+	return found;
+}
+
+/*
+ * Fill in clp->cl_cb_conn from the callback arguments of a SETCLIENTID
+ * request.
+ *
+ * Only the netids "tcp" and "tcp6" are accepted, and the parsed
+ * universal address must belong to the matching address family.  For
+ * IPv6 the scope id of the request's source address is copied into the
+ * callback address.  On any failure the callback address is reset to
+ * AF_UNSPEC with zero length and trace_nfsd_cb_nodelegs() is emitted.
+ */
+static void
+gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp)
+{
+	struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
+	u32 scopeid = rpc_get_scope_id(svc_addr(rqstp));
+	unsigned short expected_family;
+
+	/* Currently, we only support tcp and tcp6 for the callback channel */
+	switch (se->se_callback_netid_len) {
+	case 3:
+		if (memcmp(se->se_callback_netid_val, "tcp", 3))
+			goto out_err;
+		expected_family = AF_INET;
+		break;
+	case 4:
+		if (memcmp(se->se_callback_netid_val, "tcp6", 4))
+			goto out_err;
+		expected_family = AF_INET6;
+		break;
+	default:
+		goto out_err;
+	}
+
+	cb->cb_addrlen = rpc_uaddr2sockaddr(clp->net, se->se_callback_addr_val,
+					    se->se_callback_addr_len,
+					    (struct sockaddr *)&cb->cb_addr,
+					    sizeof(cb->cb_addr));
+
+	if (!cb->cb_addrlen)
+		goto out_err;
+	if (cb->cb_addr.ss_family != expected_family)
+		goto out_err;
+
+	if (cb->cb_addr.ss_family == AF_INET6) {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cb->cb_addr;
+
+		sin6->sin6_scope_id = scopeid;
+	}
+
+	cb->cb_prog = se->se_callback_prog;
+	cb->cb_ident = se->se_callback_ident;
+	memcpy(&cb->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen);
+	trace_nfsd_cb_args(clp, cb);
+	return;
+
+out_err:
+	cb->cb_addr.ss_family = AF_UNSPEC;
+	cb->cb_addrlen = 0;
+	trace_nfsd_cb_nodelegs(clp);
+}
+
+/*
+ * Record the outcome of a compound in its session slot.
+ *
+ * The op count, status and request credential are always saved and the
+ * slot is marked INITIALIZED.  The encoded reply bytes (from
+ * cstate.data_offset to the end of the xdr buffer) are copied into the
+ * slot only when nfsd4_cache_this() says so; the CACHED flag reflects
+ * that decision.  nfsd4_check_resp_size() has bounded the cache size.
+ */
+static void
+nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_slot *slot = resp->cstate.slot;
+	struct xdr_buf *buf = resp->xdr.buf;
+	unsigned int offset;
+
+	dprintk("--> %s slot %p\n", __func__, slot);
+
+	slot->sl_flags |= NFSD4_SLOT_INITIALIZED;
+	slot->sl_opcnt = resp->opcnt;
+	slot->sl_status = resp->cstate.status;
+	free_svc_cred(&slot->sl_cred);
+	copy_cred(&slot->sl_cred, &resp->rqstp->rq_cred);
+
+	if (nfsd4_cache_this(resp)) {
+		slot->sl_flags |= NFSD4_SLOT_CACHED;
+
+		offset = resp->cstate.data_offset;
+		slot->sl_datalen = buf->len - offset;
+		WARN(read_bytes_from_xdr_buf(buf, offset, slot->sl_data,
+					     slot->sl_datalen),
+		     "%s: sessions DRC could not cache compound\n",
+		     __func__);
+	} else {
+		slot->sl_flags &= ~NFSD4_SLOT_CACHED;
+	}
+}
+
+/*
+ * Encode the replayed SEQUENCE operation from the slot values.
+ *
+ * If the slot holds a cached reply, only the SEQUENCE op is encoded and
+ * its status returned.  Otherwise:
+ *  - a solo SEQUENCE retry is answered with nfserr_seq_false_retry;
+ *  - a longer compound gets nfserr_retry_uncached_rep encoded on the
+ *    next operation, which also bumps resp->opcnt for
+ *    nfs4svc_encode_compoundres.
+ */
+static __be32
+nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args,
+			  struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_slot *slot = resp->cstate.slot;
+	struct nfsd4_op *seq_op = &args->ops[resp->opcnt - 1];
+	struct nfsd4_op *next_op;
+
+	/* Encode the replayed sequence operation */
+	nfsd4_encode_operation(resp, seq_op);
+
+	if (slot->sl_flags & NFSD4_SLOT_CACHED)
+		return seq_op->status;
+
+	if (args->opcnt == 1) {
+		/*
+		 * The original operation wasn't a solo sequence--we
+		 * always cache those--so this retry must not match the
+		 * original:
+		 */
+		seq_op->status = nfserr_seq_false_retry;
+		return seq_op->status;
+	}
+
+	next_op = &args->ops[resp->opcnt];
+	resp->opcnt++;
+	next_op->status = nfserr_retry_uncached_rep;
+	nfsd4_encode_operation(resp, next_op);
+	return next_op->status;
+}
+
+/*
+ * Replay a cached compound reply from the session slot.
+ *
+ * The SEQUENCE operation itself is not cached because it can be
+ * rebuilt from the slot and session values; it is re-encoded first.
+ * If that yields a non-zero status it is returned as is.  Otherwise the
+ * cached bytes are appended to the reply, resp->opcnt is restored from
+ * the slot, and the cached status is returned.
+ */
+static __be32
+nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
+			 struct nfsd4_sequence *seq)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	struct nfsd4_slot *slot = resp->cstate.slot;
+	__be32 err;
+	__be32 *dst;
+
+	dprintk("--> %s slot %p\n", __func__, slot);
+
+	err = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp);
+	if (err)
+		return err;
+
+	dst = xdr_reserve_space(xdr, slot->sl_datalen);
+	if (WARN_ON_ONCE(!dst))
+		return nfserr_serverfault;
+
+	xdr_encode_opaque_fixed(dst, slot->sl_data, slot->sl_datalen);
+	xdr_commit_encode(xdr);
+
+	resp->opcnt = slot->sl_opcnt;
+	return slot->sl_status;
+}
+
+/*
+ * Set the exchange_id flags returned by the server: OR the server role
+ * (pNFS MDS when CONFIG_NFSD_PNFS is defined, non-pNFS otherwise) and
+ * referral support into @new's flags, then copy the result into the
+ * wire reply @clid.
+ */
+static void
+nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
+{
+	u32 role_flag;
+
+#ifdef CONFIG_NFSD_PNFS
+	role_flag = EXCHGID4_FLAG_USE_PNFS_MDS;
+#else
+	role_flag = EXCHGID4_FLAG_USE_NON_PNFS;
+#endif
+
+	/* Referrals are supported, Migration is not. */
+	new->cl_exchange_flags |= role_flag | EXCHGID4_FLAG_SUPP_MOVED_REFER;
+
+	/* set the wire flags to return to client. */
+	clid->flags = new->cl_exchange_flags;
+}
+
+/*
+ * True when at least one openowner on @clp's cl_openowners list has a
+ * non-empty so_stateids list.
+ */
+static bool client_has_openowners(struct nfs4_client *clp)
+{
+	struct nfs4_openowner *owner;
+
+	list_for_each_entry(owner, &clp->cl_openowners, oo_perclient) {
+		if (list_empty(&owner->oo_owner.so_stateids))
+			continue;
+		return true;
+	}
+	return false;
+}
+
+/*
+ * True when @clp has openowners with stateids, or any entry on its
+ * layout-state (CONFIG_NFSD_PNFS only), delegation, session or async
+ * copy lists.
+ */
+static bool client_has_state(struct nfs4_client *clp)
+{
+	if (client_has_openowners(clp))
+		return true;
+#ifdef CONFIG_NFSD_PNFS
+	if (!list_empty(&clp->cl_lo_states))
+		return true;
+#endif
+	if (!list_empty(&clp->cl_delegations))
+		return true;
+	if (!list_empty(&clp->cl_sessions))
+		return true;
+	return !list_empty(&clp->async_copies);
+}
+
+/*
+ * Duplicate the client implementation id (domain, name, time) from the
+ * EXCHANGE_ID arguments into @clp.
+ *
+ * Does nothing and returns 0 when the request carries no domain data.
+ * Returns nfserr_jukebox when either duplication leaves a NULL data
+ * pointer; whatever was already duplicated is left in @clp in that
+ * case.
+ */
+static __be32 copy_impl_id(struct nfs4_client *clp,
+			   struct nfsd4_exchange_id *exid)
+{
+	struct xdr_netobj *domain = &clp->cl_nii_domain;
+	struct xdr_netobj *impl_name = &clp->cl_nii_name;
+
+	if (!exid->nii_domain.data)
+		return 0;
+
+	xdr_netobj_dup(domain, &exid->nii_domain, GFP_KERNEL);
+	if (!domain->data)
+		return nfserr_jukebox;
+
+	xdr_netobj_dup(impl_name, &exid->nii_name, GFP_KERNEL);
+	if (!impl_name->data)
+		return nfserr_jukebox;
+
+	clp->cl_nii_time = exid->nii_time;
+	return 0;
+}
+
+/*
+ * Handle the EXCHANGE_ID operation.
+ *
+ * A candidate client record is created up front, outside the lock.
+ * After the state-protection arguments are validated, the confirmed and
+ * (if needed) unconfirmed name trees are consulted under
+ * nn->client_lock to pick one of the cases of RFC 5661 section 18.35.4.
+ * Either an existing confirmed record is reported back (out_copy), or
+ * the candidate is hashed as a new unconfirmed record (out_new).
+ *
+ * Whatever ends up in 'new' and 'unconf' at exit is expired after the
+ * lock is dropped: on error that is the unused candidate, on the
+ * out_new path it is the superseded confirmed record (or NULL).
+ *
+ * Returns nfs_ok on success or an nfserr_* status.
+ */
+__be32
+nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		union nfsd4_op_u *u)
+{
+	struct nfsd4_exchange_id *exid = &u->exchange_id;
+	struct nfs4_client *conf, *new;
+	struct nfs4_client *unconf = NULL;
+	__be32 status;
+	char addr_str[INET6_ADDRSTRLEN];
+	nfs4_verifier verf = exid->verifier;
+	struct sockaddr *sa = svc_addr(rqstp);
+	bool update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+	rpc_ntop(sa, addr_str, sizeof(addr_str));
+	dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
+		"ip_addr=%s flags %x, spa_how %d\n",
+		__func__, rqstp, exid, exid->clname.len, exid->clname.data,
+		addr_str, exid->flags, exid->spa_how);
+
+	if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
+		return nfserr_inval;
+
+	new = create_client(exid->clname, rqstp, &verf);
+	if (new == NULL)
+		return nfserr_jukebox;
+	status = copy_impl_id(new, exid);
+	if (status)
+		goto out_nolock;
+
+	switch (exid->spa_how) {
+	case SP4_MACH_CRED:
+		exid->spo_must_enforce[0] = 0;
+		exid->spo_must_enforce[1] = (
+			1 << (OP_BIND_CONN_TO_SESSION - 32) |
+			1 << (OP_EXCHANGE_ID - 32) |
+			1 << (OP_CREATE_SESSION - 32) |
+			1 << (OP_DESTROY_SESSION - 32) |
+			1 << (OP_DESTROY_CLIENTID - 32));
+
+		/* Only keep the must_allow bits the server is willing to grant. */
+		exid->spo_must_allow[0] &= (1 << (OP_CLOSE) |
+					1 << (OP_OPEN_DOWNGRADE) |
+					1 << (OP_LOCKU) |
+					1 << (OP_DELEGRETURN));
+
+		exid->spo_must_allow[1] &= (
+					1 << (OP_TEST_STATEID - 32) |
+					1 << (OP_FREE_STATEID - 32));
+		if (!svc_rqst_integrity_protected(rqstp)) {
+			status = nfserr_inval;
+			goto out_nolock;
+		}
+		/*
+		 * Sometimes userspace doesn't give us a principal.
+		 * Which is a bug, really.  Anyway, we can't enforce
+		 * MACH_CRED in that case, better to give up now:
+		 */
+		if (!new->cl_cred.cr_principal &&
+		    !new->cl_cred.cr_raw_principal) {
+			status = nfserr_serverfault;
+			goto out_nolock;
+		}
+		new->cl_mach_cred = true;
+		/* Explicit break: do not rely on implicit fall-through. */
+		break;
+	case SP4_NONE:
+		break;
+	default:				/* checked by xdr code */
+		WARN_ON_ONCE(1);
+		fallthrough;
+	case SP4_SSV:
+		status = nfserr_encr_alg_unsupp;
+		goto out_nolock;
+	}
+
+	/* Cases below refer to rfc 5661 section 18.35.4: */
+	spin_lock(&nn->client_lock);
+	conf = find_confirmed_client_by_name(&exid->clname, nn);
+	if (conf) {
+		bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred);
+		bool verfs_match = same_verf(&verf, &conf->cl_verifier);
+
+		if (update) {
+			if (!clp_used_exchangeid(conf)) { /* buggy client */
+				status = nfserr_inval;
+				goto out;
+			}
+			if (!nfsd4_mach_creds_match(conf, rqstp)) {
+				status = nfserr_wrong_cred;
+				goto out;
+			}
+			if (!creds_match) { /* case 9 */
+				status = nfserr_perm;
+				goto out;
+			}
+			if (!verfs_match) { /* case 8 */
+				status = nfserr_not_same;
+				goto out;
+			}
+			/* case 6 */
+			/*
+			 * NOTE(review): nfsd4_set_ex_flags() at out_copy
+			 * overwrites exid->flags with conf->cl_exchange_flags,
+			 * so this bit only reaches the client if conf already
+			 * carries it -- confirm this is intended.
+			 */
+			exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
+			goto out_copy;
+		}
+		if (!creds_match) { /* case 3 */
+			if (client_has_state(conf)) {
+				status = nfserr_clid_inuse;
+				goto out;
+			}
+			goto out_new;
+		}
+		if (verfs_match) { /* case 2 */
+			conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
+			goto out_copy;
+		}
+		/* case 5, client reboot */
+		conf = NULL;
+		goto out_new;
+	}
+
+	if (update) { /* case 7 */
+		status = nfserr_noent;
+		goto out;
+	}
+
+	unconf = find_unconfirmed_client_by_name(&exid->clname, nn);
+	if (unconf) /* case 4, possible retry or client restart */
+		unhash_client_locked(unconf);
+
+	/* case 1 (normal case) */
+out_new:
+	if (conf) {
+		status = mark_client_expired_locked(conf);
+		if (status)
+			goto out;
+	}
+	new->cl_minorversion = cstate->minorversion;
+	new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0];
+	new->cl_spo_must_allow.u.words[1] = exid->spo_must_allow[1];
+
+	add_to_unconfirmed(new);
+	/*
+	 * From here on 'conf' is the record reported to the client and
+	 * 'new' is the superseded confirmed record (or NULL), which the
+	 * common exit path expires.
+	 */
+	swap(new, conf);
+out_copy:
+	exid->clientid.cl_boot = conf->cl_clientid.cl_boot;
+	exid->clientid.cl_id = conf->cl_clientid.cl_id;
+
+	exid->seqid = conf->cl_cs_slot.sl_seqid + 1;
+	nfsd4_set_ex_flags(conf, exid);
+
+	dprintk("nfsd4_exchange_id seqid %d flags %x\n",
+		conf->cl_cs_slot.sl_seqid, conf->cl_exchange_flags);
+	status = nfs_ok;
+
+out:
+	spin_unlock(&nn->client_lock);
+out_nolock:
+	/* expire_client() is called only after client_lock is released. */
+	if (new)
+		expire_client(new);
+	if (unconf)
+		expire_client(unconf);
+	return status;
+}
+
+/*
+ * Classify a request's @seqid against the slot's last seqid.
+ *
+ * While the slot is in use (no response sent yet) a matching seqid
+ * yields nfserr_jukebox and anything else nfserr_seq_misordered.
+ * Otherwise: slot_seqid + 1 is a new request (nfs_ok), an equal seqid
+ * is a replay (nfserr_replay_cache), and everything else is
+ * nfserr_seq_misordered.
+ */
+static __be32
+check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
+{
+	bool same = (seqid == slot_seqid);
+
+	dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid,
+		slot_seqid);
+
+	/* The slot is in use, and no response has been sent. */
+	if (slot_inuse)
+		return same ? nfserr_jukebox : nfserr_seq_misordered;
+
+	/* Note unsigned 32-bit arithmetic handles wraparound: */
+	if (likely(seqid == slot_seqid + 1))
+		return nfs_ok;
+
+	return same ? nfserr_replay_cache : nfserr_seq_misordered;
+}
+
+/*
+ * Save the CREATE_SESSION result in the client's single create-session
+ * slot: a byte copy of the xdr structure plus the status.  sl_seqid has
+ * already been set by the caller.  Used for both solo and embedded
+ * create session operations.
+ */
+static void
+nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses,
+			   struct nfsd4_clid_slot *slot, __be32 nfserr)
+{
+	memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses));
+	slot->sl_status = nfserr;
+}
+
+/*
+ * Replay a cached CREATE_SESSION: copy the saved xdr structure from
+ * @slot back into @cr_ses and return the saved status.
+ */
+static __be32
+nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses,
+			    struct nfsd4_clid_slot *slot)
+{
+	__be32 cached_status;
+
+	memcpy(cr_ses, &slot->sl_cr_ses, sizeof(*cr_ses));
+	cached_status = slot->sl_status;
+	return cached_status;
+}
+
+/*
+ * Minimum request size accepted for the fore channel: the itemised
+ * count of 32-bit words below, multiplied by sizeof(__be32).
+ */
+#define NFSD_MIN_REQ_HDR_SEQ_SZ ((\
+ 2 * 2 + /* credential,verifier: AUTH_NULL, length 0 */ \
+ 1 + /* MIN tag is length with zero, only length */ \
+ 3 + /* version, opcount, opcode */ \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+ /* seqid, slotID, slotID, cache */ \
+ 4 ) * sizeof(__be32))
+
+/*
+ * Minimum response size accepted for the fore channel, computed the
+ * same way from the itemised reply fields.
+ */
+#define NFSD_MIN_RESP_HDR_SEQ_SZ ((\
+ 2 + /* verifier: AUTH_NULL, length 0 */\
+ 1 + /* status */ \
+ 1 + /* MIN tag is length with zero, only length */ \
+ 3 + /* opcount, opcode, opstatus*/ \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+ /* seqid, slotID, slotID, slotID, status */ \
+ 5 ) * sizeof(__be32))
+
+/*
+ * Validate and clamp the fore channel attributes requested by the
+ * client.
+ *
+ * Returns nfserr_toosmall (leaving @ca untouched) when the request or
+ * response size cannot hold a minimal SEQUENCE compound.  Otherwise
+ * header padding is zeroed, the sizes, op count, cached response size
+ * and slot count are capped to the server limits, and the slot count is
+ * finally replaced by the result of nfsd4_get_drc_mem().
+ */
+static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn)
+{
+	const u32 rpc_limit = nn->nfsd_serv->sv_max_mesg;
+
+	if (ca->maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ ||
+	    ca->maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ)
+		return nfserr_toosmall;
+
+	ca->headerpadsz = 0;
+	ca->maxreq_sz = min_t(u32, ca->maxreq_sz, rpc_limit);
+	ca->maxresp_sz = min_t(u32, ca->maxresp_sz, rpc_limit);
+	ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND);
+	ca->maxresp_cached = min_t(u32, ca->maxresp_cached,
+				   NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ);
+	ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION);
+
+	/*
+	 * Shrinking the slot size below what the client asked for may
+	 * stop it from working correctly, whereas granting fewer slots
+	 * should only cost performance.  So when memory is short we
+	 * reduce the number of slots rather than their size; clients
+	 * that ask for larger slots than they need get poor results.
+	 * At least one slot is always allowed, because the accounting
+	 * is soft and provides no guarantees either way.
+	 */
+	ca->maxreqs = nfsd4_get_drc_mem(ca, nn);
+
+	return nfs_ok;
+}
+
+/*
+ * Server's NFSv4.1 backchannel support is AUTH_SYS-only for now.
+ * These are based on similar macros in linux/sunrpc/msg_prot.h .
+ *
+ * The two RPC_MAX_* macros are RPC header allowances; the NFSD_CB_MAX_*
+ * macros add the cb_recall encode/decode sizes to them and multiply by
+ * sizeof(__be32).  check_backchannel_attrs() uses the results as the
+ * smallest backchannel request/response sizes it accepts.
+ */
+#define RPC_MAX_HEADER_WITH_AUTH_SYS \
+ (RPC_CALLHDRSIZE + 2 * (2 + UNX_CALLSLACK))
+
+#define RPC_MAX_REPHEADER_WITH_AUTH_SYS \
+ (RPC_REPHDRSIZE + (2 + NUL_REPLYSLACK))
+
+#define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \
+ RPC_MAX_HEADER_WITH_AUTH_SYS) * sizeof(__be32))
+#define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \
+ RPC_MAX_REPHEADER_WITH_AUTH_SYS) * \
+ sizeof(__be32))
+
+/*
+ * Validate the back channel attributes requested by the client.
+ *
+ * Header padding is always zeroed.  Returns nfserr_toosmall when the
+ * request or response size is below the cb_recall-based minimum, or
+ * (after the cached response size has been zeroed) when fewer than two
+ * ops are allowed; nfs_ok otherwise.
+ */
+static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
+{
+	ca->headerpadsz = 0;
+
+	if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ ||
+	    ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ)
+		return nfserr_toosmall;
+
+	ca->maxresp_cached = 0;
+
+	return ca->maxops < 2 ? nfserr_toosmall : nfs_ok;
+}
+
+/*
+ * Check the callback security flavor: RPC_AUTH_NULL and RPC_AUTH_UNIX
+ * are accepted, every other flavor gets nfserr_encr_alg_unsupp.
+ */
+static __be32 nfsd4_check_cb_sec(struct nfsd4_cb_sec *cbs)
+{
+	if (cbs->flavor == RPC_AUTH_NULL || cbs->flavor == RPC_AUTH_UNIX)
+		return nfs_ok;
+
+	/*
+	 * GSS case: the spec doesn't allow us to return this
+	 * error.  But it also doesn't allow us not to support
+	 * GSS.
+	 * I'd rather this fail hard than return some error the
+	 * client might think it can already handle:
+	 */
+	return nfserr_encr_alg_unsupp;
+}
+
+/*
+ * Handle the CREATE_SESSION operation.
+ *
+ * The request flags, callback security flavor and channel attributes
+ * are validated first, then a session and a connection are allocated.
+ * Under nn->client_lock the clientid is looked up in both the
+ * unconfirmed and confirmed tables:
+ *  - confirmed: machine creds and the create-session slot seqid are
+ *    checked; a replay returns the cached result.
+ *  - unconfirmed: creds, source address, machine creds and seqid are
+ *    checked, any confirmed client with the same name is marked
+ *    expired, and this client is moved to the confirmed tables.
+ *  - neither: nfserr_stale_clientid.
+ * On success the session is initialised and the reply is cached in the
+ * client's create-session slot before the lock is dropped.
+ *
+ * Error paths unwind through the out_* labels, releasing resources in
+ * the reverse order of acquisition.
+ */
+__be32
+nfsd4_create_session(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, union nfsd4_op_u *u)
+{
+ struct nfsd4_create_session *cr_ses = &u->create_session;
+ struct sockaddr *sa = svc_addr(rqstp);
+ struct nfs4_client *conf, *unconf;
+ struct nfs4_client *old = NULL;
+ struct nfsd4_session *new;
+ struct nfsd4_conn *conn;
+ struct nfsd4_clid_slot *cs_slot = NULL;
+ __be32 status = 0;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
+ return nfserr_inval;
+ status = nfsd4_check_cb_sec(&cr_ses->cb_sec);
+ if (status)
+ return status;
+ status = check_forechannel_attrs(&cr_ses->fore_channel, nn);
+ if (status)
+ return status;
+ /*
+ * check_forechannel_attrs() succeeded and called nfsd4_get_drc_mem();
+ * every failure from here on goes through out_release_drc_mem, which
+ * calls nfsd4_put_drc_mem() (presumably undoing that accounting).
+ */
+ status = check_backchannel_attrs(&cr_ses->back_channel);
+ if (status)
+ goto out_release_drc_mem;
+ status = nfserr_jukebox;
+ new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel);
+ if (!new)
+ goto out_release_drc_mem;
+ conn = alloc_conn_from_crses(rqstp, cr_ses);
+ if (!conn)
+ goto out_free_session;
+
+ spin_lock(&nn->client_lock);
+ unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn);
+ conf = find_confirmed_client(&cr_ses->clientid, true, nn);
+ WARN_ON_ONCE(conf && unconf);
+
+ if (conf) {
+ status = nfserr_wrong_cred;
+ if (!nfsd4_mach_creds_match(conf, rqstp))
+ goto out_free_conn;
+ cs_slot = &conf->cl_cs_slot;
+ status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
+ if (status) {
+ /* A replay returns the cached reply; the new session is discarded. */
+ if (status == nfserr_replay_cache)
+ status = nfsd4_replay_create_session(cr_ses, cs_slot);
+ goto out_free_conn;
+ }
+ } else if (unconf) {
+ if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
+ !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
+ status = nfserr_clid_inuse;
+ goto out_free_conn;
+ }
+ status = nfserr_wrong_cred;
+ if (!nfsd4_mach_creds_match(unconf, rqstp))
+ goto out_free_conn;
+ cs_slot = &unconf->cl_cs_slot;
+ status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
+ if (status) {
+ /* an unconfirmed replay returns misordered */
+ status = nfserr_seq_misordered;
+ goto out_free_conn;
+ }
+ old = find_confirmed_client_by_name(&unconf->cl_name, nn);
+ if (old) {
+ status = mark_client_expired_locked(old);
+ if (status) {
+ /* Could not mark it expired: do not expire it on the way out. */
+ old = NULL;
+ goto out_free_conn;
+ }
+ }
+ move_to_confirmed(unconf);
+ conf = unconf;
+ } else {
+ status = nfserr_stale_clientid;
+ goto out_free_conn;
+ }
+ status = nfs_ok;
+ /* Persistent sessions are not supported */
+ cr_ses->flags &= ~SESSION4_PERSIST;
+ /* Upshifting from TCP to RDMA is not supported */
+ cr_ses->flags &= ~SESSION4_RDMA;
+
+ init_session(rqstp, new, conf, cr_ses);
+ nfsd4_get_session_locked(new);
+
+ memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
+ NFS4_MAX_SESSIONID_LEN);
+ cs_slot->sl_seqid++;
+ cr_ses->seqid = cs_slot->sl_seqid;
+
+ /* cache solo and embedded create sessions under the client_lock */
+ nfsd4_cache_create_session(cr_ses, cs_slot, status);
+ spin_unlock(&nn->client_lock);
+ /* init connection and backchannel */
+ nfsd4_init_conn(rqstp, conn, new);
+ nfsd4_put_session(new);
+ /* expire_client() is called only after client_lock is released. */
+ if (old)
+ expire_client(old);
+ return status;
+out_free_conn:
+ spin_unlock(&nn->client_lock);
+ free_conn(conn);
+ if (old)
+ expire_client(old);
+out_free_session:
+ __free_session(new);
+out_release_drc_mem:
+ nfsd4_put_drc_mem(&cr_ses->fore_channel);
+ return status;
+}
+
+/*
+ * Map the direction requested by BIND_CONN_TO_SESSION to the direction
+ * the server will use.  FORE and BACK are kept as they are, the two
+ * *_OR_BOTH variants are rewritten to BOTH, and any other value is
+ * rejected with nfserr_inval (leaving *dir unchanged).
+ */
+static __be32 nfsd4_map_bcts_dir(u32 *dir)
+{
+	u32 requested = *dir;
+
+	if (requested == NFS4_CDFC4_FORE || requested == NFS4_CDFC4_BACK)
+		return nfs_ok;
+
+	if (requested == NFS4_CDFC4_FORE_OR_BOTH ||
+	    requested == NFS4_CDFC4_BACK_OR_BOTH) {
+		*dir = NFS4_CDFC4_BOTH;
+		return nfs_ok;
+	}
+
+	return nfserr_inval;
+}
+
+/*
+ * Handle the BACKCHANNEL_CTL operation: after checking the requested
+ * callback security flavor, store the new callback program and
+ * security parameters in the current session under nn->client_lock,
+ * then probe the client's callback channel.
+ */
+__be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate,
+		union nfsd4_op_u *u)
+{
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	struct nfsd4_backchannel_ctl *bc = &u->backchannel_ctl;
+	struct nfsd4_session *ses = cstate->session;
+	__be32 err;
+
+	err = nfsd4_check_cb_sec(&bc->bc_cb_sec);
+	if (err)
+		return err;
+
+	spin_lock(&nn->client_lock);
+	ses->se_cb_prog = bc->bc_cb_program;
+	ses->se_cb_sec = bc->bc_cb_sec;
+	spin_unlock(&nn->client_lock);
+
+	nfsd4_probe_callback(ses->se_client);
+
+	return nfs_ok;
+}
+
+/*
+ * Return the connection of session @s that uses transport @xpt, or NULL
+ * if the session has none.  Takes no lock itself.
+ */
+static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s)
+{
+	struct nfsd4_conn *conn;
+
+	list_for_each_entry(conn, &s->se_conns, cn_persession)
+		if (conn->cn_xprt == xpt)
+			return conn;
+
+	return NULL;
+}
+
+/*
+ * Check whether the request's transport is already bound to @session
+ * in a way that satisfies the requested direction @req.
+ *
+ * Returns nfserr_noent when the transport is not bound at all, nfs_ok
+ * when the existing binding is compatible (and then stores it in *conn
+ * if @conn is non-NULL), and nfserr_inval otherwise.  The lookup and
+ * flag comparison run under the client's cl_lock.
+ */
+static __be32 nfsd4_match_existing_connection(struct svc_rqst *rqst,
+		struct nfsd4_session *session, u32 req, struct nfsd4_conn **conn)
+{
+	struct nfs4_client *clp = session->se_client;
+	struct svc_xprt *xpt = rqst->rq_xprt;
+	struct nfsd4_conn *found;
+	__be32 status;
+
+	/* Following the last paragraph of RFC 5661 Section 18.34.3: */
+	spin_lock(&clp->cl_lock);
+	found = __nfsd4_find_conn(xpt, session);
+	if (!found) {
+		status = nfserr_noent;
+	} else {
+		bool compatible =
+			req == found->cn_flags ||
+			(req == NFS4_CDFC4_FORE_OR_BOTH &&
+			 found->cn_flags != NFS4_CDFC4_BACK) ||
+			(req == NFS4_CDFC4_BACK_OR_BOTH &&
+			 found->cn_flags != NFS4_CDFC4_FORE);
+
+		status = compatible ? nfs_ok : nfserr_inval;
+	}
+	spin_unlock(&clp->cl_lock);
+
+	if (conn && status == nfs_ok)
+		*conn = found;
+	return status;
+}
+
+/*
+ * Handle the BIND_CONN_TO_SESSION operation.
+ *
+ * The operation is rejected with nfserr_not_only_op unless
+ * nfsd4_last_compound_op() holds.  The session is looked up under
+ * nn->client_lock and the machine credentials are checked.  If the
+ * request's transport is already bound compatibly, the BACK flag is
+ * added when the request asked for it and the callback channel is
+ * probed.  Otherwise the direction is mapped, and a new connection is
+ * allocated and initialised for the session.
+ */
+__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session;
+ __be32 status;
+ struct nfsd4_conn *conn;
+ struct nfsd4_session *session;
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ if (!nfsd4_last_compound_op(rqstp))
+ return nfserr_not_only_op;
+ spin_lock(&nn->client_lock);
+ /* On failure the lookup reports the error through &status. */
+ session = find_in_sessionid_hashtbl(&bcts->sessionid, net, &status);
+ spin_unlock(&nn->client_lock);
+ if (!session)
+ goto out_no_session;
+ status = nfserr_wrong_cred;
+ if (!nfsd4_mach_creds_match(session->se_client, rqstp))
+ goto out;
+ status = nfsd4_match_existing_connection(rqstp, session,
+ bcts->dir, &conn);
+ if (status == nfs_ok) {
+ /* conn was set by the call above because it returned nfs_ok. */
+ if (bcts->dir == NFS4_CDFC4_FORE_OR_BOTH ||
+ bcts->dir == NFS4_CDFC4_BACK)
+ conn->cn_flags |= NFS4_CDFC4_BACK;
+ nfsd4_probe_callback(session->se_client);
+ goto out;
+ }
+ /* nfserr_inval: bound already, but incompatibly with the request. */
+ if (status == nfserr_inval)
+ goto out;
+ /* Remaining case: the transport is not bound yet; create a binding. */
+ status = nfsd4_map_bcts_dir(&bcts->dir);
+ if (status)
+ goto out;
+ conn = alloc_conn(rqstp, bcts->dir);
+ status = nfserr_jukebox;
+ if (!conn)
+ goto out;
+ nfsd4_init_conn(rqstp, conn, session);
+ status = nfs_ok;
+out:
+ /* Presumably drops the reference taken by the hash-table lookup -- TODO confirm. */
+ nfsd4_put_session(session);
+out_no_session:
+ return status;
+}
+
+/*
+ * True when the compound is running in a session and that session's id
+ * is byte-for-byte equal to @sid.
+ */
+static bool nfsd4_compound_in_session(struct nfsd4_compound_state *cstate, struct nfs4_sessionid *sid)
+{
+	struct nfsd4_session *ses = cstate->session;
+
+	return ses && !memcmp(sid, &ses->se_sessionid, sizeof(*sid));
+}
+
+/*
+ * Handle the DESTROY_SESSION operation.
+ *
+ * When the compound is destroying the session it is itself running in,
+ * the operation must be the last one in the compound, otherwise
+ * nfserr_not_only_op is returned.  The session is looked up under
+ * nn->client_lock, the machine credentials are checked, and the session
+ * is marked dead and unhashed.  The lock is dropped around the
+ * synchronous callback probe and retaken to put the session reference.
+ */
+__be32
+nfsd4_destroy_session(struct svc_rqst *r, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfs4_sessionid *sessionid = &u->destroy_session.sessionid;
+ struct nfsd4_session *ses;
+ __be32 status;
+ /* 1 when this compound runs in the session being destroyed, else 0. */
+ int ref_held_by_me = 0;
+ struct net *net = SVC_NET(r);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ status = nfserr_not_only_op;
+ if (nfsd4_compound_in_session(cstate, sessionid)) {
+ if (!nfsd4_last_compound_op(r))
+ goto out;
+ ref_held_by_me++;
+ }
+ dump_sessionid(__func__, sessionid);
+ spin_lock(&nn->client_lock);
+ /* On failure the lookup reports the error through &status. */
+ ses = find_in_sessionid_hashtbl(sessionid, net, &status);
+ if (!ses)
+ goto out_client_lock;
+ status = nfserr_wrong_cred;
+ if (!nfsd4_mach_creds_match(ses->se_client, r))
+ goto out_put_session;
+ /*
+ * The second argument is presumably the number of references this
+ * caller accounts for (the lookup's plus the compound's own, if
+ * any) -- TODO confirm against mark_session_dead_locked().
+ */
+ status = mark_session_dead_locked(ses, 1 + ref_held_by_me);
+ if (status)
+ goto out_put_session;
+ unhash_session(ses);
+ spin_unlock(&nn->client_lock);
+
+ /* Called with client_lock dropped. */
+ nfsd4_probe_callback_sync(ses->se_client);
+
+ spin_lock(&nn->client_lock);
+ status = nfs_ok;
+out_put_session:
+ nfsd4_put_session_locked(ses);
+out_client_lock:
+ spin_unlock(&nn->client_lock);
+out:
+ return status;
+}
+
+/*
+ * Make sure the transport carrying a SEQUENCE is bound to session @ses.
+ *
+ * @new is a preallocated connection that this function always
+ * consumes: it is freed when the transport is already bound (nfs_ok)
+ * or when the client uses machine-credential protection and so may not
+ * be bound implicitly (nfserr_conn_not_bound_to_session); otherwise it
+ * is hashed into the session under cl_lock and registered.  A
+ * registration failure is handled by nfsd4_conn_lost() and still
+ * reported as nfs_ok.
+ */
+static __be32 nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
+{
+	struct nfs4_client *clp = ses->se_client;
+	__be32 status;
+
+	spin_lock(&clp->cl_lock);
+	if (__nfsd4_find_conn(new->cn_xprt, ses)) {
+		status = nfs_ok;
+		goto discard;
+	}
+	if (clp->cl_mach_cred) {
+		status = nfserr_conn_not_bound_to_session;
+		goto discard;
+	}
+	__nfsd4_hash_conn(new, ses);
+	spin_unlock(&clp->cl_lock);
+
+	if (nfsd4_register_conn(new)) {
+		/* oops; xprt is already down: */
+		nfsd4_conn_lost(&new->cn_xpt_user);
+	}
+	return nfs_ok;
+
+discard:
+	spin_unlock(&clp->cl_lock);
+	free_conn(new);
+	return status;
+}
+
+/*
+ * True when the compound carries more operations than the session's
+ * fore channel maxops allows.
+ */
+static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session)
+{
+	struct nfsd4_compoundargs *compound = rqstp->rq_argp;
+
+	if (compound->opcnt > session->se_fchannel.maxops)
+		return true;
+	return false;
+}
+
+/*
+ * True when the length of the request's argument buffer exceeds the
+ * session's fore channel maxreq_sz.
+ */
+static bool nfsd4_request_too_big(struct svc_rqst *rqstp,
+				  struct nfsd4_session *session)
+{
+	struct xdr_buf *arg = &rqstp->rq_arg;
+
+	if (arg->len > session->se_fchannel.maxreq_sz)
+		return true;
+	return false;
+}
+
+/*
+ * Sanity-check that a request with a replayed seqid plausibly is a
+ * retransmission of the request whose reply is stored in @slot.
+ *
+ * Compared are the cachethis setting, the op counts, and the
+ * credentials.  Returns false as soon as one of them rules a genuine
+ * replay out.
+ */
+static bool replay_matches_cache(struct svc_rqst *rqstp,
+		 struct nfsd4_sequence *seq, struct nfsd4_slot *slot)
+{
+	struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+	bool slot_cachethis = slot->sl_flags & NFSD4_SLOT_CACHETHIS;
+	bool req_cachethis = seq->cachethis;
+
+	if (slot_cachethis != req_cachethis)
+		return false;
+
+	/*
+	 * A reply may legitimately contain fewer ops than the call, but
+	 * only when it ended in an error.
+	 */
+	if (!slot->sl_status && slot->sl_opcnt < argp->opcnt)
+		return false;
+
+	/*
+	 * A cached reply with *more* ops than the call now being sent
+	 * cannot belong to it, so this is clearly not a replay of the
+	 * old call:
+	 */
+	if (slot->sl_opcnt > argp->opcnt)
+		return false;
+
+	/*
+	 * The credential check is the only one explicitly called for by
+	 * the spec.  Further comparisons would be possible, but the spec
+	 * doesn't require us to catch every mismatch (that would mean
+	 * caching the call as well as the reply), so we don't bother.
+	 */
+	return same_creds(&rqstp->rq_cred, &slot->sl_cred);
+}
+
+/*
+ * Handle the SEQUENCE operation.
+ *
+ * SEQUENCE must be the first operation of the compound
+ * (resp->opcnt == 1), otherwise nfserr_sequence_pos is returned.  A
+ * connection is preallocated before nn->client_lock is taken.  Under
+ * the lock the session is looked up, the op count, request size and
+ * slot id are validated, and the slot seqid is classified:
+ *  - replay: the cached reply is re-encoded, cstate->status is set to
+ *    nfserr_replay_cache and the cached status is returned;
+ *  - new request: the transport is bound to the session if necessary,
+ *    the reply buffer is restricted to the negotiated size, and the
+ *    slot is marked in use with the new seqid.
+ * On both of those paths the session reference is kept and cstate is
+ * filled in; nfsd4_sequence_done() drops the reference later.  Every
+ * other path puts the session reference before returning.
+ */
+__be32
+nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_sequence *seq = &u->sequence;
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct xdr_stream *xdr = &resp->xdr;
+ struct nfsd4_session *session;
+ struct nfs4_client *clp;
+ struct nfsd4_slot *slot;
+ struct nfsd4_conn *conn;
+ __be32 status;
+ int buflen;
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ if (resp->opcnt != 1)
+ return nfserr_sequence_pos;
+
+ /*
+ * Will be either used or freed by nfsd4_sequence_check_conn
+ * below.
+ */
+ conn = alloc_conn(rqstp, NFS4_CDFC4_FORE);
+ if (!conn)
+ return nfserr_jukebox;
+
+ spin_lock(&nn->client_lock);
+ /* On failure the lookup reports the error through &status. */
+ session = find_in_sessionid_hashtbl(&seq->sessionid, net, &status);
+ if (!session)
+ goto out_no_session;
+ clp = session->se_client;
+
+ status = nfserr_too_many_ops;
+ if (nfsd4_session_too_many_ops(rqstp, session))
+ goto out_put_session;
+
+ status = nfserr_req_too_big;
+ if (nfsd4_request_too_big(rqstp, session))
+ goto out_put_session;
+
+ status = nfserr_badslot;
+ if (seq->slotid >= session->se_fchannel.maxreqs)
+ goto out_put_session;
+
+ slot = session->se_slots[seq->slotid];
+ dprintk("%s: slotid %d\n", __func__, seq->slotid);
+
+ /* We do not negotiate the number of slots yet, so set the
+ * maxslots to the session maxreqs which is used to encode
+ * sr_highest_slotid and the sr_target_slot id to maxslots */
+ seq->maxslots = session->se_fchannel.maxreqs;
+
+ status = check_slot_seqid(seq->seqid, slot->sl_seqid,
+ slot->sl_flags & NFSD4_SLOT_INUSE);
+ if (status == nfserr_replay_cache) {
+ /* A slot that never stored a result has nothing to replay. */
+ status = nfserr_seq_misordered;
+ if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED))
+ goto out_put_session;
+ status = nfserr_seq_false_retry;
+ if (!replay_matches_cache(rqstp, seq, slot))
+ goto out_put_session;
+ cstate->slot = slot;
+ cstate->session = session;
+ cstate->clp = clp;
+ /* Return the cached reply status and set cstate->status
+ * for nfsd4_proc_compound processing */
+ status = nfsd4_replay_cache_entry(resp, seq);
+ cstate->status = nfserr_replay_cache;
+ goto out;
+ }
+ if (status)
+ goto out_put_session;
+
+ status = nfsd4_sequence_check_conn(conn, session);
+ /* conn was consumed (hashed or freed) above; keep the exit path from freeing it. */
+ conn = NULL;
+ if (status)
+ goto out_put_session;
+
+ buflen = (seq->cachethis) ?
+ session->se_fchannel.maxresp_cached :
+ session->se_fchannel.maxresp_sz;
+ /* Preset the error for the case that the buffer cannot be restricted. */
+ status = (seq->cachethis) ? nfserr_rep_too_big_to_cache :
+ nfserr_rep_too_big;
+ if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack))
+ goto out_put_session;
+ svc_reserve(rqstp, buflen);
+
+ status = nfs_ok;
+ /* Success! bump slot seqid */
+ slot->sl_seqid = seq->seqid;
+ slot->sl_flags |= NFSD4_SLOT_INUSE;
+ if (seq->cachethis)
+ slot->sl_flags |= NFSD4_SLOT_CACHETHIS;
+ else
+ slot->sl_flags &= ~NFSD4_SLOT_CACHETHIS;
+
+ cstate->slot = slot;
+ cstate->session = session;
+ cstate->clp = clp;
+
+out:
+ /* Reached on success and on replay only: report callback channel state. */
+ switch (clp->cl_cb_state) {
+ case NFSD4_CB_DOWN:
+ seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
+ break;
+ case NFSD4_CB_FAULT:
+ seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT;
+ break;
+ default:
+ seq->status_flags = 0;
+ }
+ if (!list_empty(&clp->cl_revoked))
+ seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED;
+out_no_session:
+ if (conn)
+ free_conn(conn);
+ spin_unlock(&nn->client_lock);
+ return status;
+out_put_session:
+ nfsd4_put_session_locked(session);
+ goto out_no_session;
+}
+
+/*
+ * Finish compound processing for the session / client references held
+ * in the compound state.
+ *
+ * With a session: unless the reply was a replay, store the result in
+ * the slot and clear the slot's INUSE flag; then drop the session
+ * reference that was taken in nfsd4_sequence().  Without a session:
+ * release the client via put_client_renew() if one is attached.
+ */
+void
+nfsd4_sequence_done(struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_compound_state *cs = &resp->cstate;
+
+	if (!nfsd4_has_session(cs)) {
+		if (cs->clp)
+			put_client_renew(cs->clp);
+		return;
+	}
+
+	if (cs->status != nfserr_replay_cache) {
+		nfsd4_store_cache_entry(resp);
+		cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
+	}
+	/* Drop session reference that was taken in nfsd4_sequence() */
+	nfsd4_put_session(cs->session);
+}
+
+/*
+ * Handle the DESTROY_CLIENTID operation.
+ *
+ * Under nn->client_lock the clientid is looked up in the unconfirmed
+ * and confirmed tables.  A confirmed client that still has state is
+ * refused with nfserr_clientid_busy; otherwise it is marked expired.
+ * An unknown clientid yields nfserr_stale_clientid.  After the machine
+ * credential check the chosen client is unhashed, and it is expired
+ * once the lock has been dropped.
+ */
+__be32
+nfsd4_destroy_clientid(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_destroy_clientid *dc = &u->destroy_clientid;
+ struct nfs4_client *conf, *unconf;
+ /* Client to expire after unlocking; stays NULL on every error path. */
+ struct nfs4_client *clp = NULL;
+ __be32 status = 0;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ spin_lock(&nn->client_lock);
+ unconf = find_unconfirmed_client(&dc->clientid, true, nn);
+ conf = find_confirmed_client(&dc->clientid, true, nn);
+ WARN_ON_ONCE(conf && unconf);
+
+ if (conf) {
+ if (client_has_state(conf)) {
+ status = nfserr_clientid_busy;
+ goto out;
+ }
+ status = mark_client_expired_locked(conf);
+ if (status)
+ goto out;
+ clp = conf;
+ } else if (unconf)
+ clp = unconf;
+ else {
+ status = nfserr_stale_clientid;
+ goto out;
+ }
+ /*
+ * NOTE(review): for a confirmed client, mark_client_expired_locked()
+ * has already succeeded by the time this check can fail; confirm
+ * that leaving it marked but still hashed is intended.
+ */
+ if (!nfsd4_mach_creds_match(clp, rqstp)) {
+ clp = NULL;
+ status = nfserr_wrong_cred;
+ goto out;
+ }
+ unhash_client_locked(clp);
+out:
+ spin_unlock(&nn->client_lock);
+ /* expire_client() is called only after client_lock is released. */
+ if (clp)
+ expire_client(clp);
+ return status;
+}
+
+/*
+ * Handle the RECLAIM_COMPLETE operation.
+ *
+ * The per-filesystem variant (rca_one_fs) only requires a current
+ * filehandle and is otherwise ignored.  For the global variant the
+ * client's RECLAIM_COMPLETE flag is set atomically; a second call gets
+ * nfserr_complete_already, an expired client gets
+ * nfserr_stale_clientid, and otherwise the client record is created
+ * and the reclaim-complete count is bumped.
+ */
+__be32
+nfsd4_reclaim_complete(struct svc_rqst *rqstp,
+		struct nfsd4_compound_state *cstate, union nfsd4_op_u *u)
+{
+	struct nfsd4_reclaim_complete *rc = &u->reclaim_complete;
+	struct nfs4_client *clp;
+
+	if (rc->rca_one_fs) {
+		/*
+		 * We don't take advantage of the rca_one_fs case.
+		 * That's OK, it's optional, we can safely ignore it.
+		 */
+		return cstate->current_fh.fh_dentry ? nfs_ok
+						    : nfserr_nofilehandle;
+	}
+
+	clp = cstate->session->se_client;
+
+	if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
+		return nfserr_complete_already;
+
+	/*
+	 * The following error isn't really legal.
+	 * But we only get here if the client just explicitly
+	 * destroyed the client.  Surely it no longer cares what
+	 * error it gets back on an operation for the dead
+	 * client.
+	 */
+	if (is_client_expired(clp))
+		return nfserr_stale_clientid;
+
+	nfsd4_client_record_create(clp);
+	inc_reclaim_complete(clp);
+	return nfs_ok;
+}
+
+/*
+ * Handle the NFSv4.0 SETCLIENTID operation.
+ *
+ * A candidate client record is created before nn->client_lock is
+ * taken.  If a confirmed client with the same name still holds state
+ * and either was established via EXCHANGE_ID or has different
+ * credentials, the request fails with nfserr_clid_inuse.  Any
+ * unconfirmed record with that name is unhashed (and expired after
+ * unlocking).  The candidate is then hashed as unconfirmed and its
+ * clientid and confirm verifier are returned to the client.
+ */
+__be32
+nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_setclientid *setclid = &u->setclientid;
+ struct xdr_netobj clname = setclid->se_name;
+ nfs4_verifier clverifier = setclid->se_verf;
+ struct nfs4_client *conf, *new;
+ struct nfs4_client *unconf = NULL;
+ __be32 status;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ new = create_client(clname, rqstp, &clverifier);
+ if (new == NULL)
+ return nfserr_jukebox;
+ /* Cases below refer to rfc 3530 section 14.2.33: */
+ spin_lock(&nn->client_lock);
+ conf = find_confirmed_client_by_name(&clname, nn);
+ if (conf && client_has_state(conf)) {
+ /* case 0: */
+ status = nfserr_clid_inuse;
+ if (clp_used_exchangeid(conf))
+ goto out;
+ if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
+ trace_nfsd_clid_inuse_err(conf);
+ goto out;
+ }
+ }
+ unconf = find_unconfirmed_client_by_name(&clname, nn);
+ if (unconf)
+ unhash_client_locked(unconf);
+ /* We need to handle only case 1: probable callback update */
+ if (conf && same_verf(&conf->cl_verifier, &clverifier)) {
+ copy_clid(new, conf);
+ gen_confirm(new, nn);
+ }
+ new->cl_minorversion = 0;
+ gen_callback(new, setclid, rqstp);
+ add_to_unconfirmed(new);
+ setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
+ setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
+ memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data));
+ /* The record is now in the unconfirmed tables; do not free it below. */
+ new = NULL;
+ status = nfs_ok;
+out:
+ spin_unlock(&nn->client_lock);
+ /* Both cleanups run only after client_lock is released. */
+ if (new)
+ free_client(new);
+ if (unconf)
+ expire_client(unconf);
+ return status;
+}
+
+
+/*
+ * SETCLIENTID_CONFIRM (NFSv4.0): confirm a clientid handed out earlier.
+ *
+ * Looks up the confirmed and unconfirmed records for the clientid under
+ * client_lock and then either updates the callback of the confirmed record
+ * (case 1) or moves the unconfirmed record to the confirmed set (case 3),
+ * expiring any older confirmed client with the same name.
+ *
+ * Returns nfs_ok, nfserr_stale_clientid, nfserr_clid_inuse, or whatever
+ * mark_client_expired_locked() reports for the old client.
+ */
+__be32
+nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_setclientid_confirm *setclientid_confirm =
+ &u->setclientid_confirm;
+ struct nfs4_client *conf, *unconf;
+ struct nfs4_client *old = NULL;
+ nfs4_verifier confirm = setclientid_confirm->sc_confirm;
+ clientid_t * clid = &setclientid_confirm->sc_clientid;
+ __be32 status;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ if (STALE_CLIENTID(clid, nn))
+ return nfserr_stale_clientid;
+
+ spin_lock(&nn->client_lock);
+ conf = find_confirmed_client(clid, false, nn);
+ unconf = find_unconfirmed_client(clid, false, nn);
+ /*
+ * We try hard to give out unique clientid's, so if we get an
+ * attempt to confirm the same clientid with a different cred,
+ * the client may be buggy; this should never happen.
+ *
+ * Nevertheless, RFC 7530 recommends INUSE for this case:
+ */
+ status = nfserr_clid_inuse;
+ if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred))
+ goto out;
+ if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred))
+ goto out;
+ /* cases below refer to rfc 3530 section 14.2.34: */
+ if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) {
+ if (conf && same_verf(&confirm, &conf->cl_confirm)) {
+ /* case 2: probable retransmit */
+ status = nfs_ok;
+ } else /* case 4: client hasn't noticed we rebooted yet? */
+ status = nfserr_stale_clientid;
+ goto out;
+ }
+ status = nfs_ok;
+ if (conf) { /* case 1: callback update */
+ old = unconf;
+ unhash_client_locked(old);
+ nfsd4_change_callback(conf, &unconf->cl_cb_conn);
+ } else { /* case 3: normal case; new or rebooted client */
+ old = find_confirmed_client_by_name(&unconf->cl_name, nn);
+ if (old) {
+ status = nfserr_clid_inuse;
+ if (client_has_state(old)
+ && !same_creds(&unconf->cl_cred,
+ &old->cl_cred)) {
+ /* old stays in place: do not expire it at out: */
+ old = NULL;
+ goto out;
+ }
+ status = mark_client_expired_locked(old);
+ if (status) {
+ old = NULL;
+ goto out;
+ }
+ }
+ move_to_confirmed(unconf);
+ conf = unconf;
+ }
+ /*
+ * Take a reference on conf so client_lock can be dropped around
+ * nfsd4_probe_callback(); the reference is put again under the lock.
+ */
+ get_client_locked(conf);
+ spin_unlock(&nn->client_lock);
+ nfsd4_probe_callback(conf);
+ spin_lock(&nn->client_lock);
+ put_client_renew_locked(conf);
+out:
+ spin_unlock(&nn->client_lock);
+ /* expire_client() is called without client_lock held. */
+ if (old)
+ expire_client(old);
+ return status;
+}
+
+/* Grab one nfs4_file object from file_slab with GFP_KERNEL. */
+static struct nfs4_file *nfsd4_alloc_file(void)
+{
+	struct nfs4_file *fp;
+
+	fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
+	return fp;
+}
+
+/* OPEN Share state helper functions */
+/*
+ * Initialise @fp for file handle @fh and insert it into
+ * file_hashtbl[@hashval].
+ *
+ * The reference count starts at 1. The caller must hold state_lock
+ * (asserted below). The hash insertion is the last step, after every
+ * field has been set up.
+ */
+static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
+ struct nfs4_file *fp)
+{
+ lockdep_assert_held(&state_lock);
+
+ refcount_set(&fp->fi_ref, 1);
+ spin_lock_init(&fp->fi_lock);
+ INIT_LIST_HEAD(&fp->fi_stateids);
+ INIT_LIST_HEAD(&fp->fi_delegations);
+ INIT_LIST_HEAD(&fp->fi_clnt_odstate);
+ fh_copy_shallow(&fp->fi_fhandle, fh);
+ fp->fi_deleg_file = NULL;
+ fp->fi_had_conflict = false;
+ fp->fi_share_deny = 0;
+ memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
+ memset(fp->fi_access, 0, sizeof(fp->fi_access));
+#ifdef CONFIG_NFSD_PNFS
+ INIT_LIST_HEAD(&fp->fi_lo_states);
+ atomic_set(&fp->fi_lo_recalls, 0);
+#endif
+ /* Make the file visible to lookups only now that it is set up. */
+ hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
+}
+
+/*
+ * Destroy the seven slab caches created by nfsd4_init_slabs(), in the
+ * order they were created.
+ */
+void
+nfsd4_free_slabs(void)
+{
+ kmem_cache_destroy(client_slab);
+ kmem_cache_destroy(openowner_slab);
+ kmem_cache_destroy(lockowner_slab);
+ kmem_cache_destroy(file_slab);
+ kmem_cache_destroy(stateid_slab);
+ kmem_cache_destroy(deleg_slab);
+ kmem_cache_destroy(odstate_slab);
+}
+
+/*
+ * Create the slab caches used for NFSv4 state objects (clients, open and
+ * lock owners, files, stateids, delegations and odstate).
+ *
+ * Returns 0 on success. If any cache cannot be created, the caches created
+ * so far are destroyed in reverse order and -ENOMEM is returned.
+ */
+int
+nfsd4_init_slabs(void)
+{
+ client_slab = kmem_cache_create("nfsd4_clients",
+ sizeof(struct nfs4_client), 0, 0, NULL);
+ if (client_slab == NULL)
+ goto out;
+ openowner_slab = kmem_cache_create("nfsd4_openowners",
+ sizeof(struct nfs4_openowner), 0, 0, NULL);
+ if (openowner_slab == NULL)
+ goto out_free_client_slab;
+ lockowner_slab = kmem_cache_create("nfsd4_lockowners",
+ sizeof(struct nfs4_lockowner), 0, 0, NULL);
+ if (lockowner_slab == NULL)
+ goto out_free_openowner_slab;
+ file_slab = kmem_cache_create("nfsd4_files",
+ sizeof(struct nfs4_file), 0, 0, NULL);
+ if (file_slab == NULL)
+ goto out_free_lockowner_slab;
+ stateid_slab = kmem_cache_create("nfsd4_stateids",
+ sizeof(struct nfs4_ol_stateid), 0, 0, NULL);
+ if (stateid_slab == NULL)
+ goto out_free_file_slab;
+ deleg_slab = kmem_cache_create("nfsd4_delegations",
+ sizeof(struct nfs4_delegation), 0, 0, NULL);
+ if (deleg_slab == NULL)
+ goto out_free_stateid_slab;
+ odstate_slab = kmem_cache_create("nfsd4_odstate",
+ sizeof(struct nfs4_clnt_odstate), 0, 0, NULL);
+ if (odstate_slab == NULL)
+ goto out_free_deleg_slab;
+ return 0;
+
+ /* Unwind ladder: each label undoes one more successful creation. */
+out_free_deleg_slab:
+ kmem_cache_destroy(deleg_slab);
+out_free_stateid_slab:
+ kmem_cache_destroy(stateid_slab);
+out_free_file_slab:
+ kmem_cache_destroy(file_slab);
+out_free_lockowner_slab:
+ kmem_cache_destroy(lockowner_slab);
+out_free_openowner_slab:
+ kmem_cache_destroy(openowner_slab);
+out_free_client_slab:
+ kmem_cache_destroy(client_slab);
+out:
+ return -ENOMEM;
+}
+
+/*
+ * Reset a replay cache: initialise its mutex, point rp_buf at rp_ibuf with
+ * a length of zero, and store nfserr_serverfault as the cached status.
+ */
+static void init_nfs4_replay(struct nfs4_replay *rp)
+{
+	mutex_init(&rp->rp_mutex);
+	rp->rp_buf = rp->rp_ibuf;
+	rp->rp_buflen = 0;
+	rp->rp_status = nfserr_serverfault;
+}
+
+/*
+ * For compounds without a session, lock @so's replay mutex and remember a
+ * referenced @so in cstate->replay_owner. Does nothing when the compound
+ * has a session.
+ */
+static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
+				       struct nfs4_stateowner *so)
+{
+	if (nfsd4_has_session(cstate))
+		return;
+
+	mutex_lock(&so->so_replay.rp_mutex);
+	cstate->replay_owner = nfs4_get_stateowner(so);
+}
+
+/*
+ * Undo nfsd4_cstate_assign_replay(): if a replay owner is recorded, forget
+ * it, unlock its replay mutex and drop the stateowner reference.
+ */
+void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
+{
+	struct nfs4_stateowner *owner = cstate->replay_owner;
+
+	if (!owner)
+		return;
+
+	cstate->replay_owner = NULL;
+	mutex_unlock(&owner->so_replay.rp_mutex);
+	nfs4_put_stateowner(owner);
+}
+
+/*
+ * Allocate a stateowner from @slab and give it a private copy of @owner.
+ *
+ * The new owner has an empty stateid list, an initialised replay cache, a
+ * reference count of 1 and so_client set to @clp. Returns NULL when either
+ * the object or the owner-name copy cannot be allocated.
+ */
+static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp)
+{
+	struct nfs4_stateowner *so = kmem_cache_alloc(slab, GFP_KERNEL);
+
+	if (so == NULL)
+		return NULL;
+
+	xdr_netobj_dup(&so->so_owner, owner, GFP_KERNEL);
+	if (so->so_owner.data == NULL)
+		goto out_free;
+
+	INIT_LIST_HEAD(&so->so_stateids);
+	so->so_client = clp;
+	init_nfs4_replay(&so->so_replay);
+	atomic_set(&so->so_count, 1);
+	return so;
+
+out_free:
+	kmem_cache_free(slab, so);
+	return NULL;
+}
+
+/*
+ * Link @oo into @clp: onto the owner-string hash chain selected by
+ * @strhashval and onto the client's list of open owners.
+ * The caller must hold clp->cl_lock (asserted below).
+ */
+static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
+{
+ lockdep_assert_held(&clp->cl_lock);
+
+ list_add(&oo->oo_owner.so_strhash,
+ &clp->cl_ownerstr_hashtbl[strhashval]);
+ list_add(&oo->oo_perclient, &clp->cl_openowners);
+}
+
+/* so_unhash hook for open owners: unhash the openowner behind @so. */
+static void nfs4_unhash_openowner(struct nfs4_stateowner *so)
+{
+	struct nfs4_openowner *oo = openowner(so);
+
+	unhash_openowner_locked(oo);
+}
+
+/* so_free hook for open owners: hand the openowner back to its slab. */
+static void nfs4_free_openowner(struct nfs4_stateowner *so)
+{
+	kmem_cache_free(openowner_slab, openowner(so));
+}
+
+/* Stateowner callbacks installed on every open owner (see so_ops). */
+static const struct nfs4_stateowner_operations openowner_ops = {
+ .so_unhash = nfs4_unhash_openowner,
+ .so_free = nfs4_free_openowner,
+};
+
+/*
+ * Search @fp's stateids for an open stateid (NFS4_OPEN_STID) owned by
+ * open->op_openowner.
+ *
+ * On a hit the stateid's sc_count is incremented and the stateid is
+ * returned; otherwise NULL. The caller must hold fp->fi_lock.
+ */
+static struct nfs4_ol_stateid *
+nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
+{
+	struct nfs4_openowner *owner = open->op_openowner;
+	struct nfs4_ol_stateid *stp;
+
+	lockdep_assert_held(&fp->fi_lock);
+
+	list_for_each_entry(stp, &fp->fi_stateids, st_perfile) {
+		/* stateids held by lock owners never match */
+		if (!stp->st_stateowner->so_is_open_owner)
+			continue;
+		if (stp->st_stateowner == &owner->oo_owner &&
+		    stp->st_stid.sc_type == NFS4_OPEN_STID) {
+			refcount_inc(&stp->st_stid.sc_count);
+			return stp;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Map a stateid's current type to a status: nfserr_bad_stateid for type 0
+ * and the two closed types, nfserr_deleg_revoked for a revoked delegation,
+ * nfs_ok for anything else.
+ */
+static __be32
+nfsd4_verify_open_stid(struct nfs4_stid *s)
+{
+	switch (s->sc_type) {
+	case 0:
+	case NFS4_CLOSED_STID:
+	case NFS4_CLOSED_DELEG_STID:
+		return nfserr_bad_stateid;
+	case NFS4_REVOKED_DELEG_STID:
+		return nfserr_deleg_revoked;
+	default:
+		return nfs_ok;
+	}
+}
+
+/*
+ * Take stp->st_mutex, then re-validate the stateid to catch a race with
+ * CLOSE. On nfs_ok the mutex stays held; on any other status it has been
+ * released again before returning.
+ */
+static __be32
+nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp)
+{
+	__be32 status;
+
+	mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX);
+	status = nfsd4_verify_open_stid(&stp->st_stid);
+	if (status == nfs_ok)
+		return nfs_ok;
+
+	mutex_unlock(&stp->st_mutex);
+	return status;
+}
+
+/*
+ * Find the open stateid matching @open on @fp and return it with st_mutex
+ * held, or NULL if there is none. A candidate that fails
+ * nfsd4_lock_ol_stateid() is released and the search starts over.
+ */
+static struct nfs4_ol_stateid *
+nfsd4_find_and_lock_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
+{
+	struct nfs4_ol_stateid *stp;
+
+	while (1) {
+		spin_lock(&fp->fi_lock);
+		stp = nfsd4_find_existing_open(fp, open);
+		spin_unlock(&fp->fi_lock);
+
+		if (stp == NULL)
+			return NULL;
+		if (nfsd4_lock_ol_stateid(stp) == nfs_ok)
+			return stp;
+
+		/* lost a race; drop the reference taken by the lookup */
+		nfs4_put_stid(&stp->st_stid);
+	}
+}
+
+/*
+ * Allocate and initialise an open owner for open->op_owner and hash it on
+ * the compound's client.
+ *
+ * The owner is set up before cl_lock is taken. Under the lock the hash is
+ * searched again: if a matching owner is already there, the freshly
+ * allocated one is freed and the existing one is returned instead.
+ * When the compound has a session the owner starts out with
+ * NFS4_OO_CONFIRMED set.
+ *
+ * Returns the owner to use, or NULL if allocation failed.
+ */
+static struct nfs4_openowner *
+alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
+ struct nfsd4_compound_state *cstate)
+{
+ struct nfs4_client *clp = cstate->clp;
+ struct nfs4_openowner *oo, *ret;
+
+ oo = alloc_stateowner(openowner_slab, &open->op_owner, clp);
+ if (!oo)
+ return NULL;
+ oo->oo_owner.so_ops = &openowner_ops;
+ oo->oo_owner.so_is_open_owner = 1;
+ oo->oo_owner.so_seqid = open->op_seqid;
+ oo->oo_flags = 0;
+ if (nfsd4_has_session(cstate))
+ oo->oo_flags |= NFS4_OO_CONFIRMED;
+ oo->oo_time = 0;
+ oo->oo_last_closed_stid = NULL;
+ INIT_LIST_HEAD(&oo->oo_close_lru);
+ spin_lock(&clp->cl_lock);
+ ret = find_openstateowner_str_locked(strhashval, open, clp);
+ if (ret == NULL) {
+ hash_openowner(oo, clp, strhashval);
+ ret = oo;
+ } else
+ /* an owner with this name is already hashed; discard ours */
+ nfs4_free_stateowner(&oo->oo_owner);
+
+ spin_unlock(&clp->cl_lock);
+ return ret;
+}
+
+/*
+ * Turn the preallocated open->op_stp into the open stateid for
+ * (open->op_openowner, @fp), unless such a stateid already exists.
+ *
+ * Returns a stateid whose st_mutex is held. If the preallocated stateid
+ * was used, open->op_stp is cleared; if an existing one was found instead,
+ * open->op_stp is left in place and the existing stateid is returned.
+ * An existing stateid that cannot be locked is dropped and the lookup is
+ * retried.
+ */
+static struct nfs4_ol_stateid *
+init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open)
+{
+
+ struct nfs4_openowner *oo = open->op_openowner;
+ struct nfs4_ol_stateid *retstp = NULL;
+ struct nfs4_ol_stateid *stp;
+
+ stp = open->op_stp;
+ /* We are moving these outside of the spinlocks to avoid the warnings */
+ mutex_init(&stp->st_mutex);
+ mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX);
+
+retry:
+ /* Lock order here: the client's cl_lock first, then fp->fi_lock. */
+ spin_lock(&oo->oo_owner.so_client->cl_lock);
+ spin_lock(&fp->fi_lock);
+
+ retstp = nfsd4_find_existing_open(fp, open);
+ if (retstp)
+ goto out_unlock;
+
+ open->op_stp = NULL;
+ refcount_inc(&stp->st_stid.sc_count);
+ stp->st_stid.sc_type = NFS4_OPEN_STID;
+ INIT_LIST_HEAD(&stp->st_locks);
+ stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner);
+ get_nfs4_file(fp);
+ stp->st_stid.sc_file = fp;
+ stp->st_access_bmap = 0;
+ stp->st_deny_bmap = 0;
+ stp->st_openstp = NULL;
+ list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
+ list_add(&stp->st_perfile, &fp->fi_stateids);
+
+out_unlock:
+ spin_unlock(&fp->fi_lock);
+ spin_unlock(&oo->oo_owner.so_client->cl_lock);
+ if (retstp) {
+ /* Handle races with CLOSE */
+ if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) {
+ nfs4_put_stid(&retstp->st_stid);
+ goto retry;
+ }
+ /* To keep mutex tracking happy */
+ mutex_unlock(&stp->st_mutex);
+ stp = retstp;
+ }
+ return stp;
+}
+
+/*
+ * In the 4.0 case we need to keep the owners around a little while to handle
+ * CLOSE replay. We still do need to release any file access that is held by
+ * them before returning however.
+ *
+ * @s becomes the owner's oo_last_closed_stid and the owner is moved to the
+ * tail of the per-net close_lru with oo_time set to the current boot time.
+ * The previously remembered stateid, if any, is put after client_lock has
+ * been released.
+ */
+static void
+move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
+{
+ struct nfs4_ol_stateid *last;
+ struct nfs4_openowner *oo = openowner(s->st_stateowner);
+ struct nfsd_net *nn = net_generic(s->st_stid.sc_client->net,
+ nfsd_net_id);
+
+ dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo);
+
+ /*
+ * We know that we hold one reference via nfsd4_close, and another
+ * "persistent" reference for the client. If the refcount is higher
+ * than 2, then there are still calls in progress that are using this
+ * stateid. We can't put the sc_file reference until they are finished.
+ * Wait for the refcount to drop to 2. Since it has been unhashed,
+ * there should be no danger of the refcount going back up again at
+ * this point.
+ */
+ wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2);
+
+ release_all_access(s);
+ if (s->st_stid.sc_file) {
+ put_nfs4_file(s->st_stid.sc_file);
+ s->st_stid.sc_file = NULL;
+ }
+
+ spin_lock(&nn->client_lock);
+ last = oo->oo_last_closed_stid;
+ oo->oo_last_closed_stid = s;
+ list_move_tail(&oo->oo_close_lru, &nn->close_lru);
+ oo->oo_time = ktime_get_boottime_seconds();
+ spin_unlock(&nn->client_lock);
+ if (last)
+ nfs4_put_stid(&last->st_stid);
+}
+
+/*
+ * Walk file_hashtbl[@hashval] for an entry whose file handle matches @fh.
+ * Returns the first match whose reference count could be raised from a
+ * non-zero value, or NULL when there is none.
+ */
+static struct nfs4_file *
+find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
+{
+	struct nfs4_file *fp;
+
+	hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
+				 lockdep_is_held(&state_lock)) {
+		if (!fh_match(&fp->fi_fhandle, fh))
+			continue;
+		if (refcount_inc_not_zero(&fp->fi_ref))
+			return fp;
+	}
+	return NULL;
+}
+
+/*
+ * Look up the nfs4_file for @fh under rcu_read_lock().
+ * Returns a referenced file or NULL.
+ */
+struct nfs4_file *
+find_file(struct knfsd_fh *fh)
+{
+	struct nfs4_file *found;
+
+	rcu_read_lock();
+	found = find_file_locked(fh, file_hashval(fh));
+	rcu_read_unlock();
+
+	return found;
+}
+
+/*
+ * Return the nfs4_file for @fh, hashing @new if none exists yet.
+ *
+ * A lockless RCU lookup is tried first. On a miss the lookup is repeated
+ * under state_lock, and only if it misses again is @new initialised and
+ * inserted. The caller can tell which happened by comparing the result
+ * with @new.
+ */
+static struct nfs4_file *
+find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh)
+{
+ struct nfs4_file *fp;
+ unsigned int hashval = file_hashval(fh);
+
+ rcu_read_lock();
+ fp = find_file_locked(fh, hashval);
+ rcu_read_unlock();
+ if (fp)
+ return fp;
+
+ spin_lock(&state_lock);
+ /* Re-check under the lock: another thread may have added it meanwhile. */
+ fp = find_file_locked(fh, hashval);
+ if (likely(fp == NULL)) {
+ nfsd4_init_file(fh, hashval, new);
+ fp = new;
+ }
+ spin_unlock(&state_lock);
+
+ return fp;
+}
+
+/*
+ * Called to check deny when READ with all zero stateid or
+ * WRITE with all zero or all one stateid.
+ *
+ * Returns nfserr_locked when the file's fi_share_deny has any bit of
+ * @deny_type set, nfs_ok otherwise (including when no nfs4_file exists
+ * for the handle).
+ */
+static __be32
+nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
+{
+	struct nfs4_file *fp = find_file(&current_fh->fh_handle);
+	__be32 status = nfs_ok;
+
+	if (fp == NULL)
+		return nfs_ok;
+
+	/* Check for conflicting share reservations */
+	spin_lock(&fp->fi_lock);
+	if ((fp->fi_share_deny & deny_type) != 0)
+		status = nfserr_locked;
+	spin_unlock(&fp->fi_lock);
+
+	put_nfs4_file(fp);
+	return status;
+}
+
+/*
+ * .prepare hook of the delegation recall callback.
+ *
+ * Blocks new delegations on the file handle and, if the delegation is
+ * still hashed and not yet timestamped, stamps dl_time and queues it on
+ * the per-net del_recall_lru.
+ */
+static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb)
+{
+ struct nfs4_delegation *dp = cb_to_delegation(cb);
+ struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net,
+ nfsd_net_id);
+
+ block_delegations(&dp->dl_stid.sc_file->fi_fhandle);
+
+ /*
+ * We can't do this in nfsd_break_deleg_cb because it is
+ * already holding inode->i_lock.
+ *
+ * If the dl_time != 0, then we know that it has already been
+ * queued for a lease break. Don't queue it again.
+ */
+ spin_lock(&state_lock);
+ if (delegation_hashed(dp) && dp->dl_time == 0) {
+ dp->dl_time = ktime_get_boottime_seconds();
+ list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
+ }
+ spin_unlock(&state_lock);
+}
+
+/*
+ * .done hook of the delegation recall callback.
+ *
+ * Returns 1 when the callback is finished and 0 after scheduling a retry
+ * with a 2 * HZ delay. A retry is scheduled for -NFS4ERR_DELAY, and for
+ * -EBADHANDLE / -NFS4ERR_BAD_STATEID while dl_retries is non-zero (it is
+ * decremented on each such reply). Nothing is retried once the delegation
+ * is closed or revoked.
+ */
+static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
+				struct rpc_task *task)
+{
+	struct nfs4_delegation *dp = cb_to_delegation(cb);
+	bool retry = false;
+
+	if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID ||
+	    dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID)
+		return 1;
+
+	switch (task->tk_status) {
+	case -NFS4ERR_DELAY:
+		retry = true;
+		break;
+	case -EBADHANDLE:
+	case -NFS4ERR_BAD_STATEID:
+		/*
+		 * Race: client probably got cb_recall before open reply
+		 * granting delegation.
+		 */
+		retry = dp->dl_retries-- != 0;
+		break;
+	default:
+		break;
+	}
+
+	if (!retry)
+		return 1;
+
+	rpc_delay(task, 2 * HZ);
+	return 0;
+}
+
+/* .release hook of the recall callback: put the delegation's stid. */
+static void nfsd4_cb_recall_release(struct nfsd4_callback *cb)
+{
+	struct nfs4_delegation *deleg = cb_to_delegation(cb);
+
+	nfs4_put_stid(&deleg->dl_stid);
+}
+
+/* Callback operations for delegation recall: prepare, done and release. */
+static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
+ .prepare = nfsd4_cb_recall_prepare,
+ .done = nfsd4_cb_recall_done,
+ .release = nfsd4_cb_recall_release,
+};
+
+/*
+ * Start recalling @dp: take a reference on its stid and run the recall
+ * callback (dl_recall).
+ */
+static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
+{
+ /*
+ * We're assuming the state code never drops its reference
+ * without first removing the lease. Since we're in this lease
+ * callback (and since the lease code is serialized by the
+ * i_lock) we know the server hasn't removed the lease yet, and
+ * we know it's safe to take a reference.
+ */
+ refcount_inc(&dp->dl_stid.sc_count);
+ nfsd4_run_cb(&dp->dl_recall);
+}
+
+/*
+ * Called from break_lease() with i_lock held.
+ *
+ * lm_break hook: marks the file as having had a conflict and kicks off the
+ * recall of the delegation that owns @fl. Always returns false; ret is
+ * never changed after initialisation.
+ */
+static bool
+nfsd_break_deleg_cb(struct file_lock *fl)
+{
+ bool ret = false;
+ struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
+ struct nfs4_file *fp = dp->dl_stid.sc_file;
+
+ trace_nfsd_deleg_break(&dp->dl_stid.sc_stateid);
+
+ /*
+ * We don't want the locks code to timeout the lease for us;
+ * we'll remove it ourself if a delegation isn't returned
+ * in time:
+ */
+ fl->fl_break_time = 0;
+
+ spin_lock(&fp->fi_lock);
+ fp->fi_had_conflict = true;
+ nfsd_break_one_deleg(dp);
+ spin_unlock(&fp->fi_lock);
+ return ret;
+}
+
+/**
+ * nfsd_breaker_owns_lease - Check if lease conflict was resolved
+ * @fl: Lock state to check
+ *
+ * The conflict counts as resolved only when the current task is an nfsd
+ * thread serving an NFS request of version 4 or later whose recorded lease
+ * breaker is the client that owns the delegation behind @fl.
+ *
+ * Return values:
+ * %true: Lease conflict was resolved
+ * %false: Lease conflict was not resolved.
+ */
+static bool nfsd_breaker_owns_lease(struct file_lock *fl)
+{
+	struct nfs4_delegation *deleg = fl->fl_owner;
+	struct svc_rqst *rqstp;
+
+	if (!i_am_nfsd())
+		return false;
+
+	rqstp = kthread_data(current);
+	/* Note rq_prog == NFS_ACL_PROGRAM is also possible: */
+	if (rqstp->rq_prog == NFS_PROGRAM && rqstp->rq_vers >= 4)
+		return deleg->dl_stid.sc_client == *(rqstp->rq_lease_breaker);
+
+	return false;
+}
+
+/*
+ * lm_change hook: only requests with F_UNLCK set in @arg are passed on to
+ * lease_modify(); every other change is refused with -EAGAIN.
+ */
+static int
+nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
+		     struct list_head *dispose)
+{
+	if (!(arg & F_UNLCK))
+		return -EAGAIN;
+
+	return lease_modify(onlist, arg, dispose);
+}
+
+/* Lock-manager hooks attached to every lease nfsd takes for a delegation. */
+static const struct lock_manager_operations nfsd_lease_mng_ops = {
+ .lm_breaker_owns_lease = nfsd_breaker_owns_lease,
+ .lm_break = nfsd_break_deleg_cb,
+ .lm_change = nfsd_change_deleg_cb,
+};
+
+/*
+ * Validate an owner sequence id for compounds without a session.
+ *
+ * Returns nfs_ok when the compound has a session or @seqid equals the
+ * owner's so_seqid, nfserr_replay_me when @seqid is one behind it, and
+ * nfserr_bad_seqid otherwise.
+ */
+static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid)
+{
+	if (nfsd4_has_session(cstate) || seqid == so->so_seqid)
+		return nfs_ok;
+
+	return seqid == so->so_seqid - 1 ? nfserr_replay_me : nfserr_bad_seqid;
+}
+
+/*
+ * Make sure cstate->clp refers to the confirmed client identified by @clid.
+ *
+ * If a client is already cached in @cstate it is only compared against
+ * @clid. Otherwise the confirmed client is looked up under client_lock,
+ * its cl_rpc_users count is incremented, and it is cached in @cstate.
+ *
+ * Returns nfs_ok, nfserr_stale_clientid (mismatch with the cached client
+ * or STALE_CLIENTID() true) or nfserr_expired (no confirmed client found).
+ */
+static __be32 lookup_clientid(clientid_t *clid,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd_net *nn,
+ bool sessions)
+{
+ struct nfs4_client *found;
+
+ if (cstate->clp) {
+ found = cstate->clp;
+ if (!same_clid(&found->cl_clientid, clid))
+ return nfserr_stale_clientid;
+ return nfs_ok;
+ }
+
+ if (STALE_CLIENTID(clid, nn))
+ return nfserr_stale_clientid;
+
+ /*
+ * For v4.1+ we get the client in the SEQUENCE op. If we don't have one
+ * cached already then we know this is for v4.0 and "sessions"
+ * will be false.
+ */
+ WARN_ON_ONCE(cstate->session);
+ spin_lock(&nn->client_lock);
+ found = find_confirmed_client(clid, sessions, nn);
+ if (!found) {
+ spin_unlock(&nn->client_lock);
+ return nfserr_expired;
+ }
+ atomic_inc(&found->cl_rpc_users);
+ spin_unlock(&nn->client_lock);
+
+ /* Cache the nfs4_client in cstate! */
+ cstate->clp = found;
+ return nfs_ok;
+}
+
+/*
+ * First stage of OPEN processing: resolve the client and open owner and
+ * preallocate everything the second stage may need.
+ *
+ * On return @open may carry a preallocated op_file, op_openowner, op_stp
+ * and (for sessions on an export with NFSEXP_PNFS) op_odstate. These stay
+ * attached to @open even when an error is returned part-way through;
+ * NOTE(review): presumably the caller's cleanup releases them - confirm.
+ *
+ * Returns nfs_ok, nfserr_stale_clientid, nfserr_jukebox on allocation
+ * failure, or the status from lookup_clientid() / nfsd4_check_seqid().
+ */
+__be32
+nfsd4_process_open1(struct nfsd4_compound_state *cstate,
+ struct nfsd4_open *open, struct nfsd_net *nn)
+{
+ clientid_t *clientid = &open->op_clientid;
+ struct nfs4_client *clp = NULL;
+ unsigned int strhashval;
+ struct nfs4_openowner *oo = NULL;
+ __be32 status;
+
+ if (STALE_CLIENTID(&open->op_clientid, nn))
+ return nfserr_stale_clientid;
+ /*
+ * In case we need it later, after we've already created the
+ * file and don't want to risk a further failure:
+ */
+ open->op_file = nfsd4_alloc_file();
+ if (open->op_file == NULL)
+ return nfserr_jukebox;
+
+ status = lookup_clientid(clientid, cstate, nn, false);
+ if (status)
+ return status;
+ clp = cstate->clp;
+
+ strhashval = ownerstr_hashval(&open->op_owner);
+ oo = find_openstateowner_str(strhashval, open, clp);
+ open->op_openowner = oo;
+ if (!oo) {
+ goto new_owner;
+ }
+ if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
+ /* Replace unconfirmed owners without checking for replay. */
+ release_openowner(oo);
+ open->op_openowner = NULL;
+ goto new_owner;
+ }
+ status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
+ if (status)
+ return status;
+ goto alloc_stateid;
+new_owner:
+ oo = alloc_init_open_stateowner(strhashval, open, cstate);
+ if (oo == NULL)
+ return nfserr_jukebox;
+ open->op_openowner = oo;
+alloc_stateid:
+ open->op_stp = nfs4_alloc_open_stateid(clp);
+ if (!open->op_stp)
+ return nfserr_jukebox;
+
+ /* odstate is only preallocated for sessions on NFSEXP_PNFS exports */
+ if (nfsd4_has_session(cstate) &&
+ (cstate->current_fh.fh_export->ex_flags & NFSEXP_PNFS)) {
+ open->op_odstate = alloc_clnt_odstate(clp);
+ if (!open->op_odstate)
+ return nfserr_jukebox;
+ }
+
+ return nfs_ok;
+}
+
+/*
+ * Refuse write access through a read delegation: returns nfserr_openmode
+ * when @flags contains WR_STATE and @dp is a read delegation, nfs_ok in
+ * every other case.
+ */
+static inline __be32
+nfs4_check_delegmode(struct nfs4_delegation *dp, int flags)
+{
+	if (!(flags & WR_STATE))
+		return nfs_ok;
+	if (dp->dl_type != NFS4_OPEN_DELEGATE_READ)
+		return nfs_ok;
+
+	return nfserr_openmode;
+}
+
+/*
+ * Translate a share_access value into a state flag: RD_STATE when it is
+ * exactly NFS4_SHARE_ACCESS_READ, WR_STATE for any other value.
+ */
+static int share_access_to_flags(u32 share_access)
+{
+	if (share_access == NFS4_SHARE_ACCESS_READ)
+		return RD_STATE;
+
+	return WR_STATE;
+}
+
+/*
+ * Look up stateid @s on client @cl, accepting only active or revoked
+ * delegation stateids. Returns the delegation, or NULL if none is found.
+ */
+static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s)
+{
+	struct nfs4_stid *stid;
+
+	stid = find_stateid_by_type(cl, s,
+				    NFS4_DELEG_STID|NFS4_REVOKED_DELEG_STID);
+	return stid ? delegstateid(stid) : NULL;
+}
+
+/*
+ * True when the OPEN's claim type is one of the two "delegate current"
+ * claims (NFS4_OPEN_CLAIM_DELEGATE_CUR or NFS4_OPEN_CLAIM_DELEG_CUR_FH).
+ */
+static bool nfsd4_is_deleg_cur(struct nfsd4_open *open)
+{
+	if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
+		return true;
+
+	return open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH;
+}
+
+/*
+ * Look up the delegation named by open->op_delegate_stateid and check that
+ * it permits the requested share access.
+ *
+ * On a successful lookup and mode check *@dp is set to the delegation,
+ * whose reference is handed to the caller. In every failure path the
+ * reference is put and *@dp is left untouched.
+ *
+ * The computed status is only reported for "delegate current" claims; for
+ * every other claim type the function returns nfs_ok regardless. For a
+ * successful "delegate current" claim the open owner is marked confirmed.
+ */
+static __be32
+nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open,
+ struct nfs4_delegation **dp)
+{
+ int flags;
+ __be32 status = nfserr_bad_stateid;
+ struct nfs4_delegation *deleg;
+
+ deleg = find_deleg_stateid(cl, &open->op_delegate_stateid);
+ if (deleg == NULL)
+ goto out;
+ if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) {
+ nfs4_put_stid(&deleg->dl_stid);
+ /* only a non-zero minor version gets the more specific error */
+ if (cl->cl_minorversion)
+ status = nfserr_deleg_revoked;
+ goto out;
+ }
+ flags = share_access_to_flags(open->op_share_access);
+ status = nfs4_check_delegmode(deleg, flags);
+ if (status) {
+ nfs4_put_stid(&deleg->dl_stid);
+ goto out;
+ }
+ *dp = deleg;
+out:
+ if (!nfsd4_is_deleg_cur(open))
+ return nfs_ok;
+ if (status)
+ return status;
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+ return nfs_ok;
+}
+
+/*
+ * Convert NFSv4 share-access bits into NFSD_MAY_* permission bits:
+ * ACCESS_READ maps to NFSD_MAY_READ and ACCESS_WRITE to NFSD_MAY_WRITE.
+ */
+static inline int nfs4_access_to_access(u32 nfs4_access)
+{
+	int may_read = (nfs4_access & NFS4_SHARE_ACCESS_READ) ?
+			NFSD_MAY_READ : 0;
+	int may_write = (nfs4_access & NFS4_SHARE_ACCESS_WRITE) ?
+			NFSD_MAY_WRITE : 0;
+
+	return may_read | may_write;
+}
+
+/*
+ * Truncate the file to length zero if the OPEN asked for it.
+ *
+ * Returns 0 when no truncation was requested, nfserr_inval when it was
+ * requested without write share access, and otherwise the result of
+ * nfsd_setattr().
+ */
+static inline __be32
+nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
+		struct nfsd4_open *open)
+{
+	struct iattr size_zero = {
+		.ia_valid	= ATTR_SIZE,
+		.ia_size	= 0,
+	};
+
+	if (!open->op_truncate)
+		return 0;
+
+	if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
+		return nfsd_setattr(rqstp, fh, &size_zero, 0, (time64_t)0);
+
+	return nfserr_inval;
+}
+
+/*
+ * Give stateid @stp the share access and deny modes requested by @open on
+ * file @fp, acquiring the underlying nfsd_file if @fp does not yet have
+ * one for this open mode.
+ *
+ * After the access/deny bookkeeping has been updated, any lease on the
+ * inode is broken and the file is truncated if requested. If file
+ * acquisition, the lease break or the truncate fails, the stateid's
+ * access and deny bitmaps are rolled back and the file access is put.
+ *
+ * Returns nfs_ok or the first error encountered.
+ */
+static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
+ struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp,
+ struct nfsd4_open *open)
+{
+ struct nfsd_file *nf = NULL;
+ __be32 status;
+ int oflag = nfs4_access_to_omode(open->op_share_access);
+ int access = nfs4_access_to_access(open->op_share_access);
+ unsigned char old_access_bmap, old_deny_bmap;
+
+ spin_lock(&fp->fi_lock);
+
+ /*
+ * Are we trying to set a deny mode that would conflict with
+ * current access?
+ */
+ status = nfs4_file_check_deny(fp, open->op_share_deny);
+ if (status != nfs_ok) {
+ spin_unlock(&fp->fi_lock);
+ goto out;
+ }
+
+ /* set access to the file */
+ status = nfs4_file_get_access(fp, open->op_share_access);
+ if (status != nfs_ok) {
+ spin_unlock(&fp->fi_lock);
+ goto out;
+ }
+
+ /* Set access bits in stateid */
+ old_access_bmap = stp->st_access_bmap;
+ set_access(open->op_share_access, stp);
+
+ /* Set new deny mask */
+ old_deny_bmap = stp->st_deny_bmap;
+ set_deny(open->op_share_deny, stp);
+ fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH);
+
+ if (!fp->fi_fds[oflag]) {
+ /* fi_lock is dropped around nfsd_file_acquire() */
+ spin_unlock(&fp->fi_lock);
+ status = nfsd_file_acquire(rqstp, cur_fh, access, &nf);
+ if (status)
+ goto out_put_access;
+ spin_lock(&fp->fi_lock);
+ /* re-check: the slot may have been filled while unlocked */
+ if (!fp->fi_fds[oflag]) {
+ fp->fi_fds[oflag] = nf;
+ nf = NULL;
+ }
+ }
+ spin_unlock(&fp->fi_lock);
+ /* nf is still set only if someone else filled the slot first */
+ if (nf)
+ nfsd_file_put(nf);
+
+ status = nfserrno(nfsd_open_break_lease(cur_fh->fh_dentry->d_inode,
+ access));
+ if (status)
+ goto out_put_access;
+
+ status = nfsd4_truncate(rqstp, cur_fh, open);
+ if (status)
+ goto out_put_access;
+out:
+ return status;
+out_put_access:
+ stp->st_access_bmap = old_access_bmap;
+ nfs4_file_put_access(fp, open->op_share_access);
+ reset_union_bmap_deny(bmap_to_share_mode(old_deny_bmap), stp);
+ goto out;
+}
+
+/*
+ * Upgrade an existing open stateid @stp with the access and deny modes of
+ * a new OPEN by the same owner.
+ *
+ * If the stateid does not yet have the requested access, the work is
+ * delegated to nfs4_get_vfs_file(). Otherwise only the deny mode is
+ * checked and merged under fi_lock, followed by the optional truncate.
+ * If the truncate fails the stateid's previous deny state is restored.
+ *
+ * Returns nfs_ok or the first error encountered.
+ */
+static __be32
+nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open)
+{
+	__be32 status;
+	unsigned char old_deny_bmap = stp->st_deny_bmap;
+
+	if (!test_access(open->op_share_access, stp))
+		return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open);
+
+	/* test and set deny mode */
+	spin_lock(&fp->fi_lock);
+	status = nfs4_file_check_deny(fp, open->op_share_deny);
+	if (status == nfs_ok) {
+		set_deny(open->op_share_deny, stp);
+		fp->fi_share_deny |=
+			(open->op_share_deny & NFS4_SHARE_DENY_BOTH);
+	}
+	spin_unlock(&fp->fi_lock);
+
+	if (status != nfs_ok)
+		return status;
+
+	status = nfsd4_truncate(rqstp, cur_fh, open);
+	if (status != nfs_ok) {
+		/*
+		 * old_deny_bmap is a copy of st_deny_bmap, i.e. a bitmap;
+		 * convert it to a share mode before the rollback, exactly
+		 * as the error path of nfs4_get_vfs_file() does.
+		 */
+		reset_union_bmap_deny(bmap_to_share_mode(old_deny_bmap), stp);
+	}
+	return status;
+}
+
+/*
+ * Should we give out recallable state?
+ *
+ * Yes when the callback channel is known to be up. For clients with a
+ * non-zero minor version an NFSD4_CB_UNKNOWN state is also accepted: in
+ * the sessions case no separate callback connection has to be
+ * established, so the channel is assumed OK until we hear otherwise.
+ */
+static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
+{
+	if (clp->cl_cb_state == NFSD4_CB_UP)
+		return true;
+	if (clp->cl_cb_state != NFSD4_CB_UNKNOWN)
+		return false;
+
+	return clp->cl_minorversion != 0;
+}
+
+/*
+ * Allocate a file_lock describing the lease that backs delegation @dp.
+ *
+ * The lease is a FL_DELEG lock up to OFFSET_MAX, owned by @dp, using
+ * nfsd_lease_mng_ops, on the file's fi_deleg_file. It is a read lease when
+ * @flag is NFS4_OPEN_DELEGATE_READ and a write lease otherwise.
+ * Returns NULL if the lock cannot be allocated.
+ */
+static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp,
+						int flag)
+{
+	struct file_lock *lease = locks_alloc_lock();
+
+	if (lease == NULL)
+		return NULL;
+
+	lease->fl_lmops = &nfsd_lease_mng_ops;
+	lease->fl_flags = FL_DELEG;
+	if (flag == NFS4_OPEN_DELEGATE_READ)
+		lease->fl_type = F_RDLCK;
+	else
+		lease->fl_type = F_WRLCK;
+	lease->fl_end = OFFSET_MAX;
+	lease->fl_owner = (fl_owner_t)dp;
+	lease->fl_pid = current->tgid;
+	lease->fl_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file;
+	return lease;
+}
+
+/*
+ * Create a read delegation on @fp for client @clp and back it with a
+ * lease.
+ *
+ * Steps: reserve the file's delegation slot (fi_deleg_file/fi_delegees),
+ * allocate the delegation, set the lease with vfs_setlease(), and finally
+ * hash the delegation, re-checking fi_had_conflict under the locks.
+ *
+ * Returns the new delegation or an ERR_PTR: -EAGAIN on a conflict or an
+ * already existing delegation for this client, -EBADF when no readable
+ * file is available, -ENOMEM on allocation failure, or the error from
+ * vfs_setlease() / hash_delegation_locked().
+ */
+static struct nfs4_delegation *
+nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
+ struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate)
+{
+ int status = 0;
+ struct nfs4_delegation *dp;
+ struct nfsd_file *nf;
+ struct file_lock *fl;
+
+ /*
+ * The fi_had_conflict and nfs4_delegation_exists checks
+ * here are just optimizations; we'll need to recheck them at
+ * the end:
+ */
+ if (fp->fi_had_conflict)
+ return ERR_PTR(-EAGAIN);
+
+ nf = find_readable_file(fp);
+ if (!nf) {
+ /* We should always have a readable file here */
+ WARN_ON_ONCE(1);
+ return ERR_PTR(-EBADF);
+ }
+ spin_lock(&state_lock);
+ spin_lock(&fp->fi_lock);
+ if (nfs4_delegation_exists(clp, fp))
+ status = -EAGAIN;
+ else if (!fp->fi_deleg_file) {
+ fp->fi_deleg_file = nf;
+ /* increment early to prevent fi_deleg_file from being
+ * cleared */
+ fp->fi_delegees = 1;
+ nf = NULL;
+ } else
+ fp->fi_delegees++;
+ spin_unlock(&fp->fi_lock);
+ spin_unlock(&state_lock);
+ /* nf is still set unless it was installed as fi_deleg_file above */
+ if (nf)
+ nfsd_file_put(nf);
+ if (status)
+ return ERR_PTR(status);
+
+ status = -ENOMEM;
+ dp = alloc_init_deleg(clp, fp, fh, odstate);
+ if (!dp)
+ goto out_delegees;
+
+ fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ);
+ if (!fl)
+ goto out_clnt_odstate;
+
+ status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL);
+ /* free the lock only if vfs_setlease() left it with us */
+ if (fl)
+ locks_free_lock(fl);
+ if (status)
+ goto out_clnt_odstate;
+
+ spin_lock(&state_lock);
+ spin_lock(&fp->fi_lock);
+ if (fp->fi_had_conflict)
+ status = -EAGAIN;
+ else
+ status = hash_delegation_locked(dp, fp);
+ spin_unlock(&fp->fi_lock);
+ spin_unlock(&state_lock);
+
+ if (status)
+ goto out_unlock;
+
+ return dp;
+out_unlock:
+ vfs_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp);
+out_clnt_odstate:
+ put_clnt_odstate(dp->dl_clnt_odstate);
+ nfs4_put_stid(&dp->dl_stid);
+out_delegees:
+ put_deleg_file(fp);
+ return ERR_PTR(status);
+}
+
+/*
+ * Fill in the "no delegation, with reason" reply for an OPEN.
+ *
+ * The delegation type becomes NFS4_OPEN_DELEGATE_NONE_EXT. The reason is
+ * WND4_CONTENTION when @status is -EAGAIN; otherwise WND4_CANCELLED for a
+ * NFS4_SHARE_WANT_CANCEL request and WND4_RESOURCE for everything else.
+ * NFS4_SHARE_WANT_NO_DELEG triggers a one-time warning.
+ */
+static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
+{
+	open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT;
+
+	if (status == -EAGAIN) {
+		open->op_why_no_deleg = WND4_CONTENTION;
+		return;
+	}
+
+	open->op_why_no_deleg = WND4_RESOURCE;
+	if (open->op_deleg_want == NFS4_SHARE_WANT_CANCEL)
+		open->op_why_no_deleg = WND4_CANCELLED;
+	else if (open->op_deleg_want == NFS4_SHARE_WANT_NO_DELEG)
+		WARN_ON_ONCE(1);
+}
+
+/*
+ * Attempt to hand out a delegation.
+ *
+ * Note we don't support write delegations, and won't until the vfs has
+ * proper support for them.
+ *
+ * This never fails the OPEN. When no delegation is granted the reply
+ * carries NFS4_OPEN_DELEGATE_NONE, or - if op_deleg_want is set - whatever
+ * nfsd4_open_deleg_none_ext() selects from the recorded failure status.
+ */
+static void
+nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
+			struct nfs4_ol_stateid *stp)
+{
+	struct nfs4_delegation *dp;
+	struct nfs4_openowner *oo = openowner(stp->st_stateowner);
+	struct nfs4_client *clp = stp->st_stid.sc_client;
+	int cb_up;
+	int status = 0;
+
+	cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
+	open->op_recall = 0;
+	switch (open->op_claim_type) {
+	case NFS4_OPEN_CLAIM_PREVIOUS:
+		if (!cb_up)
+			open->op_recall = 1;
+		if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ)
+			goto out_no_deleg;
+		break;
+	case NFS4_OPEN_CLAIM_NULL:
+	case NFS4_OPEN_CLAIM_FH:
+		/*
+		 * Let's not give out any delegations till everyone's
+		 * had the chance to reclaim theirs, *and* until
+		 * NLM locks have all been reclaimed:
+		 */
+		if (locks_in_grace(clp->net))
+			goto out_no_deleg;
+		if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
+			goto out_no_deleg;
+		/*
+		 * Also, if the file was opened for write or
+		 * create, there's a good chance the client's
+		 * about to write to it, resulting in an
+		 * immediate recall (since we don't support
+		 * write delegations):
+		 */
+		if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
+			goto out_no_deleg;
+		if (open->op_create == NFS4_OPEN_CREATE)
+			goto out_no_deleg;
+		break;
+	default:
+		goto out_no_deleg;
+	}
+	dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file, stp->st_clnt_odstate);
+	if (IS_ERR(dp)) {
+		/*
+		 * Remember why it failed so that -EAGAIN can be reported
+		 * to the client as contention rather than as a resource
+		 * problem.
+		 */
+		status = PTR_ERR(dp);
+		goto out_no_deleg;
+	}
+
+	memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid));
+
+	trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid);
+	open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
+	nfs4_put_stid(&dp->dl_stid);
+	return;
+out_no_deleg:
+	/*
+	 * Look at the delegation type the client tried to reclaim before
+	 * it is overwritten with NONE; tested afterwards, the condition
+	 * could never be true.
+	 */
+	if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
+	    open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) {
+		dprintk("NFSD: WARNING: refusing delegation reclaim\n");
+		open->op_recall = 1;
+	}
+	open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
+
+	/* 4.1 client asking for a delegation? */
+	if (open->op_deleg_want)
+		nfsd4_open_deleg_none_ext(open, status);
+	return;
+}
+
+/*
+ * Answer a client that already holds delegation @dp and asks for one of a
+ * different type.
+ *
+ * Wanting a read delegation while holding a write one is a downgrade;
+ * wanting a write delegation while holding a read one is an upgrade.
+ * Neither is supported, so the reply is NFS4_OPEN_DELEGATE_NONE_EXT with
+ * the matching WND4_NOT_SUPP_* reason. In all other cases @open is left
+ * untouched.
+ */
+static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open,
+					struct nfs4_delegation *dp)
+{
+	if (open->op_deleg_want == NFS4_SHARE_WANT_READ_DELEG &&
+	    dp->dl_type == NFS4_OPEN_DELEGATE_WRITE) {
+		open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT;
+		open->op_why_no_deleg = WND4_NOT_SUPP_DOWNGRADE;
+	} else if (open->op_deleg_want == NFS4_SHARE_WANT_WRITE_DELEG &&
+		   dp->dl_type == NFS4_OPEN_DELEGATE_READ) {
+		/* an upgrade starts from a read delegation, not a write one */
+		open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT;
+		open->op_why_no_deleg = WND4_NOT_SUPP_UPGRADE;
+	}
+	/* Otherwise the client must be confused wanting a delegation
+	 * it already has, therefore we don't return
+	 * NFS4_OPEN_DELEGATE_NONE_EXT and reason.
+	 */
+}
+
+/*
+ * Second stage of OPEN processing for @current_fh.
+ *
+ * Finds or installs the nfs4_file and the open stateid, opens the file (or
+ * upgrades an existing open), copies the resulting stateid into
+ * open->op_stateid, optionally attempts a delegation, and fills in
+ * open->op_rflags.
+ *
+ * Returns nfs_ok or an nfserr_* status. The references taken here on the
+ * file, on any delegation found by nfs4_check_deleg() and on the open
+ * stateid are all dropped before returning.
+ */
+__be32
+nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
+{
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
+ struct nfs4_file *fp = NULL;
+ struct nfs4_ol_stateid *stp = NULL;
+ struct nfs4_delegation *dp = NULL;
+ __be32 status;
+ bool new_stp = false;
+
+ /*
+ * Lookup file; if found, lookup stateid and check open request,
+ * and check for delegations in the process of being recalled.
+ * If not found, create the nfs4_file struct
+ */
+ fp = find_or_add_file(open->op_file, &current_fh->fh_handle);
+ if (fp != open->op_file) {
+ status = nfs4_check_deleg(cl, open, &dp);
+ if (status)
+ goto out;
+ stp = nfsd4_find_and_lock_existing_open(fp, open);
+ } else {
+ /*
+ * open->op_file itself was returned: clear the pointer so that
+ * nfsd4_cleanup_open_state() does not free it.
+ */
+ open->op_file = NULL;
+ status = nfserr_bad_stateid;
+ if (nfsd4_is_deleg_cur(open))
+ goto out;
+ }
+
+ if (!stp) {
+ stp = init_open_stateid(fp, open);
+ /* presumably op_stp is cleared only when it was installed as a new stateid - confirm */
+ if (!open->op_stp)
+ new_stp = true;
+ }
+
+ /*
+ * OPEN the file, or upgrade an existing OPEN.
+ * If truncate fails, the OPEN fails.
+ *
+ * stp is already locked.
+ */
+ if (!new_stp) {
+ /* Stateid was found, this is an OPEN upgrade */
+ status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
+ if (status) {
+ mutex_unlock(&stp->st_mutex);
+ goto out;
+ }
+ } else {
+ status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
+ if (status) {
+ /* Mark the brand-new stateid closed and release it again */
+ stp->st_stid.sc_type = NFS4_CLOSED_STID;
+ release_open_stateid(stp);
+ mutex_unlock(&stp->st_mutex);
+ goto out;
+ }
+
+ stp->st_clnt_odstate = find_or_hash_clnt_odstate(fp,
+ open->op_odstate);
+ /* op_odstate was used: keep nfsd4_cleanup_open_state() from freeing it */
+ if (stp->st_clnt_odstate == open->op_odstate)
+ open->op_odstate = NULL;
+ }
+
+ nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid);
+ mutex_unlock(&stp->st_mutex);
+
+ if (nfsd4_has_session(&resp->cstate)) {
+ if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
+ open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT;
+ open->op_why_no_deleg = WND4_NOT_WANTED;
+ goto nodeleg;
+ }
+ }
+
+ /*
+ * Attempt to hand out a delegation. No error return, because the
+ * OPEN succeeds even if we fail.
+ */
+ nfs4_open_delegation(current_fh, open, stp);
+nodeleg:
+ status = nfs_ok;
+ trace_nfsd_open(&stp->st_stid.sc_stateid);
+out:
+ /* 4.1 client trying to upgrade/downgrade delegation? */
+ if (open->op_delegate_type == NFS4_OPEN_DELEGATE_NONE && dp &&
+ open->op_deleg_want)
+ nfsd4_deleg_xgrade_none_ext(open, dp);
+
+ if (fp)
+ put_nfs4_file(fp);
+ if (status == 0 && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+ /*
+ * To finish the open response, we just need to set the rflags.
+ */
+ open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
+ if (nfsd4_has_session(&resp->cstate))
+ open->op_rflags |= NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK;
+ else if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED))
+ open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
+
+ if (dp)
+ nfs4_put_stid(&dp->dl_stid);
+ if (stp)
+ nfs4_put_stid(&stp->st_stid);
+
+ return status;
+}
+
+/*
+ * Drop whatever is still attached to @open once OPEN processing is over:
+ * the openowner reference (after recording the owner for replay in
+ * @cstate) and any file, stateid or odstate still hanging off @open.
+ */
+void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
+			      struct nfsd4_open *open)
+{
+	struct nfs4_openowner *oo = open->op_openowner;
+
+	if (oo != NULL) {
+		nfsd4_cstate_assign_replay(cstate, &oo->oo_owner);
+		nfs4_put_stateowner(&oo->oo_owner);
+	}
+
+	if (open->op_file != NULL)
+		kmem_cache_free(file_slab, open->op_file);
+
+	if (open->op_stp != NULL)
+		nfs4_put_stid(&open->op_stp->st_stid);
+
+	if (open->op_odstate != NULL)
+		kmem_cache_free(odstate_slab, open->op_odstate);
+}
+
+/*
+ * RENEW: look up the client named by the clientid in @u. Fails with
+ * nfserr_cb_path_down when that client holds delegations but its callback
+ * channel is not in state NFSD4_CB_UP; otherwise returns nfs_ok (or the
+ * lookup error).
+ */
+__be32
+nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	    union nfsd4_op_u *u)
+{
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	clientid_t *clid = &u->renew;
+	struct nfs4_client *clp;
+	__be32 status;
+
+	trace_nfsd_clid_renew(clid);
+
+	status = lookup_clientid(clid, cstate, nn, false);
+	if (status)
+		return status;
+
+	clp = cstate->clp;
+	if (!list_empty(&clp->cl_delegations) &&
+	    clp->cl_cb_state != NFSD4_CB_UP)
+		return nfserr_cb_path_down;
+
+	return nfs_ok;
+}
+
+/*
+ * End the NFSv4 grace period for @nn. Idempotent: does nothing once
+ * nn->grace_ended is set. Otherwise marks the grace period ended, records
+ * that fact through nfsd4_record_grace_done() and then ends the grace
+ * period for nn->nfsd4_manager via locks_end_grace().
+ */
+void
+nfsd4_end_grace(struct nfsd_net *nn)
+{
+ /* do nothing if grace period already ended */
+ if (nn->grace_ended)
+ return;
+
+ trace_nfsd_grace_complete(nn);
+ nn->grace_ended = true;
+ /*
+ * If the server goes down again right now, an NFSv4
+ * client will still be allowed to reclaim after it comes back up,
+ * even if it hasn't yet had a chance to reclaim state this time.
+ *
+ */
+ nfsd4_record_grace_done(nn);
+ /*
+ * At this point, NFSv4 clients can still reclaim. But if the
+ * server crashes, any that have not yet reclaimed will be out
+ * of luck on the next boot.
+ *
+ * (NFSv4.1+ clients are considered to have reclaimed once they
+ * call RECLAIM_COMPLETE. NFSv4.0 clients are considered to
+ * have reclaimed after their first OPEN.)
+ */
+ locks_end_grace(&nn->nfsd4_manager);
+ /*
+ * At this point, and once lockd and/or any other containers
+ * exit their grace period, further reclaims will fail and
+ * regular locking can resume.
+ */
+}
+
+/*
+ * Decide whether the grace period should be extended a little longer.
+ *
+ * Returns true only while reclaim activity has been seen since the last
+ * call (nn->somebody_reclaimed, which is reset here) and no more than two
+ * lease periods have passed since nn->boot_time. When reclaim completions
+ * are tracked and every expected client has reported, nobody is left to
+ * wait for and the answer is false.
+ */
+static bool clients_still_reclaiming(struct nfsd_net *nn)
+{
+	time64_t deadline = nn->boot_time + 2 * nn->nfsd4_lease;
+	bool everyone_done;
+
+	everyone_done = nn->track_reclaim_completes &&
+			atomic_read(&nn->nr_reclaim_complete) ==
+			nn->reclaim_str_hashtbl_size;
+	if (everyone_done)
+		return false;
+
+	if (!nn->somebody_reclaimed)
+		return false;
+	nn->somebody_reclaimed = false;
+
+	/* Two full lease periods is the most we are willing to wait. */
+	return !(ktime_get_boottime_seconds() > deadline);
+}
+
+/*
+ * Periodic state reaper for one nfsd network namespace.
+ *
+ * Unless clients are still reclaiming, ends the grace period and then
+ * walks, in order: copy-notify stateids, the client LRU, the delegation
+ * recall LRU, the close LRU and the blocked-lock LRU, disposing of entries
+ * whose timestamp is not newer than the cutoff (now minus one lease).
+ *
+ * Returns the number of seconds until the next run: the time left before
+ * the first not-yet-expired entry found on the LRU lists hits the cutoff,
+ * capped at one lease period and never less than
+ * NFSD_LAUNDROMAT_MINTIMEOUT.
+ */
+static time64_t
+nfs4_laundromat(struct nfsd_net *nn)
+{
+ struct nfs4_client *clp;
+ struct nfs4_openowner *oo;
+ struct nfs4_delegation *dp;
+ struct nfs4_ol_stateid *stp;
+ struct nfsd4_blocked_lock *nbl;
+ struct list_head *pos, *next, reaplist;
+ time64_t cutoff = ktime_get_boottime_seconds() - nn->nfsd4_lease;
+ time64_t t, new_timeo = nn->nfsd4_lease;
+ struct nfs4_cpntf_state *cps;
+ copy_stateid_t *cps_t;
+ int i;
+
+ if (clients_still_reclaiming(nn)) {
+ /* Raised to NFSD_LAUNDROMAT_MINTIMEOUT at "out" */
+ new_timeo = 0;
+ goto out;
+ }
+ nfsd4_end_grace(nn);
+ INIT_LIST_HEAD(&reaplist);
+
+ /* Drop a reference on copy-notify stateids last used before the cutoff */
+ spin_lock(&nn->s2s_cp_lock);
+ idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) {
+ cps = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid);
+ if (cps->cp_stateid.sc_type == NFS4_COPYNOTIFY_STID &&
+ cps->cpntf_time < cutoff)
+ _free_cpntf_state_locked(nn, cps);
+ }
+ spin_unlock(&nn->s2s_cp_lock);
+
+ /* Collect expired clients under client_lock, expire them after unlocking */
+ spin_lock(&nn->client_lock);
+ list_for_each_safe(pos, next, &nn->client_lru) {
+ clp = list_entry(pos, struct nfs4_client, cl_lru);
+ if (clp->cl_time > cutoff) {
+ t = clp->cl_time - cutoff;
+ new_timeo = min(new_timeo, t);
+ break;
+ }
+ if (mark_client_expired_locked(clp)) {
+ trace_nfsd_clid_expired(&clp->cl_clientid);
+ continue;
+ }
+ list_add(&clp->cl_lru, &reaplist);
+ }
+ spin_unlock(&nn->client_lock);
+ list_for_each_safe(pos, next, &reaplist) {
+ clp = list_entry(pos, struct nfs4_client, cl_lru);
+ trace_nfsd_clid_purged(&clp->cl_clientid);
+ list_del_init(&clp->cl_lru);
+ expire_client(clp);
+ }
+ /* Same two-phase pattern for delegations on the recall LRU */
+ spin_lock(&state_lock);
+ list_for_each_safe(pos, next, &nn->del_recall_lru) {
+ dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
+ if (dp->dl_time > cutoff) {
+ t = dp->dl_time - cutoff;
+ new_timeo = min(new_timeo, t);
+ break;
+ }
+ WARN_ON(!unhash_delegation_locked(dp));
+ list_add(&dp->dl_recall_lru, &reaplist);
+ }
+ spin_unlock(&state_lock);
+ while (!list_empty(&reaplist)) {
+ dp = list_first_entry(&reaplist, struct nfs4_delegation,
+ dl_recall_lru);
+ list_del_init(&dp->dl_recall_lru);
+ revoke_delegation(dp);
+ }
+
+ /*
+ * Close LRU: release the last-closed stateid of each expired openowner.
+ * client_lock is dropped around nfs4_put_stid() and retaken afterwards,
+ * which is why the list head is re-read on every iteration.
+ */
+ spin_lock(&nn->client_lock);
+ while (!list_empty(&nn->close_lru)) {
+ oo = list_first_entry(&nn->close_lru, struct nfs4_openowner,
+ oo_close_lru);
+ if (oo->oo_time > cutoff) {
+ t = oo->oo_time - cutoff;
+ new_timeo = min(new_timeo, t);
+ break;
+ }
+ list_del_init(&oo->oo_close_lru);
+ stp = oo->oo_last_closed_stid;
+ oo->oo_last_closed_stid = NULL;
+ spin_unlock(&nn->client_lock);
+ nfs4_put_stid(&stp->st_stid);
+ spin_lock(&nn->client_lock);
+ }
+ spin_unlock(&nn->client_lock);
+
+ /*
+ * It's possible for a client to try and acquire an already held lock
+ * that is being held for a long time, and then lose interest in it.
+ * So, we clean out any un-revisited request after a lease period
+ * under the assumption that the client is no longer interested.
+ *
+ * RFC5661, sec. 9.6 states that the client must not rely on getting
+ * notifications and must continue to poll for locks, even when the
+ * server supports them. Thus this shouldn't lead to clients blocking
+ * indefinitely once the lock does become free.
+ */
+ BUG_ON(!list_empty(&reaplist));
+ spin_lock(&nn->blocked_locks_lock);
+ while (!list_empty(&nn->blocked_locks_lru)) {
+ nbl = list_first_entry(&nn->blocked_locks_lru,
+ struct nfsd4_blocked_lock, nbl_lru);
+ if (nbl->nbl_time > cutoff) {
+ t = nbl->nbl_time - cutoff;
+ new_timeo = min(new_timeo, t);
+ break;
+ }
+ list_move(&nbl->nbl_lru, &reaplist);
+ list_del_init(&nbl->nbl_list);
+ }
+ spin_unlock(&nn->blocked_locks_lock);
+
+ while (!list_empty(&reaplist)) {
+ nbl = list_first_entry(&reaplist,
+ struct nfsd4_blocked_lock, nbl_lru);
+ list_del_init(&nbl->nbl_lru);
+ free_blocked_lock(nbl);
+ }
+out:
+ new_timeo = max_t(time64_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
+ return new_timeo;
+}
+
+/* Workqueue on which laundromat_main() re-queues each net's laundromat_work */
+static struct workqueue_struct *laundry_wq;
+static void laundromat_main(struct work_struct *);
+
+/*
+ * Delayed-work handler: run the laundromat for the nfsd_net that embeds
+ * this work item, then re-queue the work to fire again after the number
+ * of seconds the laundromat asked for.
+ */
+static void
+laundromat_main(struct work_struct *laundry)
+{
+	struct delayed_work *dwork = to_delayed_work(laundry);
+	struct nfsd_net *nn;
+	time64_t delay_secs;
+
+	nn = container_of(dwork, struct nfsd_net, laundromat_work);
+	delay_secs = nfs4_laundromat(nn);
+	queue_delayed_work(laundry_wq, &nn->laundromat_work, delay_secs*HZ);
+}
+
+/*
+ * A stateid may only be used with the file it was issued for: compare the
+ * filehandle in @fhp with the one recorded in the stateid's nfs4_file.
+ */
+static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
+{
+	return fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle) ?
+		nfs_ok : nfserr_bad_stateid;
+}
+
+/*
+ * Non-zero when the open stateid may be used for reading. Any recorded
+ * access mode (READ, BOTH or WRITE) is accepted.
+ */
+static inline int
+access_permit_read(struct nfs4_ol_stateid *stp)
+{
+	if (test_access(NFS4_SHARE_ACCESS_READ, stp))
+		return 1;
+	if (test_access(NFS4_SHARE_ACCESS_BOTH, stp))
+		return 1;
+	if (test_access(NFS4_SHARE_ACCESS_WRITE, stp))
+		return 1;
+	return 0;
+}
+
+/*
+ * Non-zero when the open stateid may be used for writing, i.e. it was
+ * opened with WRITE or BOTH access.
+ */
+static inline int
+access_permit_write(struct nfs4_ol_stateid *stp)
+{
+	if (test_access(NFS4_SHARE_ACCESS_WRITE, stp))
+		return 1;
+	if (test_access(NFS4_SHARE_ACCESS_BOTH, stp))
+		return 1;
+	return 0;
+}
+
+/*
+ * Check that the open behind @stp permits the I/O described by @flags
+ * (WR_STATE and/or RD_STATE). Returns nfs_ok or nfserr_openmode.
+ */
+static
+__be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags)
+{
+	struct nfs4_ol_stateid *open_stp = stp;
+
+	/* A lock stateid is judged by the open stateid it hangs off. */
+	if (stp->st_openstp)
+		open_stp = stp->st_openstp;
+
+	if ((flags & WR_STATE) && !access_permit_write(open_stp))
+		return nfserr_openmode;
+	if ((flags & RD_STATE) && !access_permit_read(open_stp))
+		return nfserr_openmode;
+	return nfs_ok;
+}
+
+/*
+ * Access check for I/O that uses one of the special (all-zero or all-one)
+ * stateids instead of a real open/lock/delegation stateid.
+ */
+static inline __be32
+check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid, int flags)
+{
+	/* A read with the all-ones stateid is let through unconditionally. */
+	if (ONE_STATEID(stateid) && (flags & RD_STATE))
+		return nfs_ok;
+
+	/*
+	 * Everything else depends on whether conflicting state exists,
+	 * which cannot be answered until the grace period is over.
+	 */
+	if (opens_in_grace(net))
+		return nfserr_grace;
+
+	if (flags & WR_STATE)
+		return nfs4_share_conflict(current_fh,
+					   NFS4_SHARE_DENY_WRITE);
+
+	/* (flags & RD_STATE) && ZERO_STATEID(stateid) */
+	return nfs4_share_conflict(current_fh, NFS4_SHARE_DENY_READ);
+}
+
+/*
+ * During the grace period, READ/WRITE on recovered state is refused for
+ * files on which mandatory locking applies; all other files may be
+ * accessed. Returns non-zero when the I/O must be refused.
+ */
+static inline int
+grace_disallows_io(struct net *net, struct inode *inode)
+{
+	if (!opens_in_grace(net))
+		return 0;
+	return mandatory_lock(inode) ? 1 : 0;
+}
+
+/*
+ * Compare the generation of the client-supplied stateid @in against the
+ * server's current copy @ref.
+ *
+ * Returns nfs_ok on a match (or, with sessions, when the client sent a
+ * zero generation), nfserr_bad_stateid when @in is newer than @ref, and
+ * nfserr_old_stateid when it is older.
+ */
+static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
+{
+	/* With sessions a zero generation means "whatever is current". */
+	bool wildcard = has_session && in->si_generation == 0;
+
+	if (wildcard || in->si_generation == ref->si_generation)
+		return nfs_ok;
+
+	/*
+	 * A generation from the future can only come from a buggy client.
+	 * One from the past is legitimate: e.g. a LOCK may bump
+	 * si_generation while earlier I/O using the previous value is
+	 * still in flight. Clients could wait for all I/O replies first,
+	 * but retrying on old_stateid tends to perform better when
+	 * requests are rarely reordered.
+	 */
+	return nfsd4_stateid_generation_after(in, ref) ?
+		nfserr_bad_stateid : nfserr_old_stateid;
+}
+
+/*
+ * Under s->sc_lock: run nfsd4_verify_open_stid() on @s and, if that
+ * succeeds, check the generation of @in against the stid's stateid.
+ */
+static __be32 nfsd4_stid_check_stateid_generation(stateid_t *in, struct nfs4_stid *s, bool has_session)
+{
+	__be32 status;
+
+	spin_lock(&s->sc_lock);
+	status = nfsd4_verify_open_stid(s);
+	if (status != nfs_ok)
+		goto unlock;
+	status = check_stateid_generation(in, &s->sc_stateid, has_session);
+unlock:
+	spin_unlock(&s->sc_lock);
+	return status;
+}
+
+/*
+ * A stateid owned by an openowner is unusable until that openowner has
+ * been confirmed. Stateids whose owner is not an openowner always pass.
+ */
+static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols)
+{
+	struct nfs4_stateowner *so = ols->st_stateowner;
+
+	if (!so->so_is_open_owner)
+		return nfs_ok;
+	if (openowner(so)->oo_flags & NFS4_OO_CONFIRMED)
+		return nfs_ok;
+	return nfserr_bad_stateid;
+}
+
+/*
+ * Work out the status to report for @stateid on behalf of client @cl
+ * (used by nfsd4_test_stateid()). Special stateids and stateids that are
+ * not found are nfserr_bad_stateid. Otherwise the generation is checked
+ * (with session semantics) and the result depends on the stid type.
+ * The lookup and checks run under cl->cl_lock.
+ */
+static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
+{
+ struct nfs4_stid *s;
+ __be32 status = nfserr_bad_stateid;
+
+ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) ||
+ CLOSE_STATEID(stateid))
+ return status;
+ spin_lock(&cl->cl_lock);
+ s = find_stateid_locked(cl, stateid);
+ if (!s)
+ goto out_unlock;
+ status = nfsd4_stid_check_stateid_generation(stateid, s, 1);
+ if (status)
+ goto out_unlock;
+ switch (s->sc_type) {
+ case NFS4_DELEG_STID:
+ status = nfs_ok;
+ break;
+ case NFS4_REVOKED_DELEG_STID:
+ status = nfserr_deleg_revoked;
+ break;
+ case NFS4_OPEN_STID:
+ case NFS4_LOCK_STID:
+ status = nfsd4_check_openowner_confirmed(openlockstateid(s));
+ break;
+ default:
+ printk("unknown stateid type %x\n", s->sc_type);
+ fallthrough;
+ case NFS4_CLOSED_STID:
+ case NFS4_CLOSED_DELEG_STID:
+ status = nfserr_bad_stateid;
+ }
+out_unlock:
+ spin_unlock(&cl->cl_lock);
+ return status;
+}
+
+/*
+ * Look up the stid matching @stateid, restricted to the types in
+ * @typemask, for the client identified by the clientid embedded in the
+ * stateid.
+ *
+ * On nfs_ok, *s points at the stid; callers in this file release it with
+ * nfs4_put_stid(). Special stateids yield nfserr_bad_stateid. A revoked
+ * delegation is only returned when NFS4_REVOKED_DELEG_STID was explicitly
+ * requested; otherwise its reference is dropped here and an error is
+ * returned (nfserr_deleg_revoked for minorversion != 0, else
+ * nfserr_bad_stateid).
+ */
+__be32
+nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
+ stateid_t *stateid, unsigned char typemask,
+ struct nfs4_stid **s, struct nfsd_net *nn)
+{
+ __be32 status;
+ bool return_revoked = false;
+
+ /*
+ * only return revoked delegations if explicitly asked.
+ * otherwise we report revoked or bad_stateid status.
+ */
+ if (typemask & NFS4_REVOKED_DELEG_STID)
+ return_revoked = true;
+ else if (typemask & NFS4_DELEG_STID)
+ typemask |= NFS4_REVOKED_DELEG_STID;
+
+ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) ||
+ CLOSE_STATEID(stateid))
+ return nfserr_bad_stateid;
+ status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn,
+ false);
+ /* Translate a stale clientid into the matching stateid error */
+ if (status == nfserr_stale_clientid) {
+ if (cstate->session)
+ return nfserr_bad_stateid;
+ return nfserr_stale_stateid;
+ }
+ if (status)
+ return status;
+ *s = find_stateid_by_type(cstate->clp, stateid, typemask);
+ if (!*s)
+ return nfserr_bad_stateid;
+ if (((*s)->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) {
+ nfs4_put_stid(*s);
+ if (cstate->minorversion)
+ return nfserr_deleg_revoked;
+ return nfserr_bad_stateid;
+ }
+ return nfs_ok;
+}
+
+/*
+ * Return an nfsd_file suitable for I/O under stid @s, or NULL when @s is
+ * NULL, is of another type, or has no such file: the delegation file for
+ * a delegation stateid, otherwise a readable or writeable file depending
+ * on whether RD_STATE is set in @flags.
+ */
+static struct nfsd_file *
+nfs4_find_file(struct nfs4_stid *s, int flags)
+{
+	if (!s)
+		return NULL;
+
+	switch (s->sc_type) {
+	case NFS4_DELEG_STID:
+		if (WARN_ON_ONCE(!s->sc_file->fi_deleg_file))
+			return NULL;
+		return nfsd_file_get(s->sc_file->fi_deleg_file);
+	case NFS4_OPEN_STID:
+	case NFS4_LOCK_STID:
+		return (flags & RD_STATE) ?
+			find_readable_file(s->sc_file) :
+			find_writeable_file(s->sc_file);
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * Checks for an open/lock stateid: the owning openowner must be
+ * confirmed, and the open mode must allow the access in @flags.
+ */
+static __be32
+nfs4_check_olstateid(struct nfs4_ol_stateid *ols, int flags)
+{
+	__be32 status = nfsd4_check_openowner_confirmed(ols);
+
+	return status ? status : nfs4_check_openmode(ols, flags);
+}
+
+/*
+ * Obtain the nfsd_file to use for the I/O and store it in *nfp.
+ *
+ * If the stid already provides a file, only a permission check is done
+ * (with owner override); otherwise a file is acquired through
+ * nfsd_file_acquire(). On error nothing is stored in *nfp and no file
+ * reference is kept.
+ */
+static __be32
+nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s,
+		struct nfsd_file **nfp, int flags)
+{
+	int acc = (flags & RD_STATE) ? NFSD_MAY_READ : NFSD_MAY_WRITE;
+	struct nfsd_file *nf = nfs4_find_file(s, flags);
+	__be32 status;
+
+	if (!nf) {
+		status = nfsd_file_acquire(rqstp, fhp, acc, &nf);
+		if (status)
+			return status;
+		*nfp = nf;
+		return status;
+	}
+
+	status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
+				 acc | NFSD_MAY_OWNER_OVERRIDE);
+	if (status) {
+		nfsd_file_put(nf);
+		return status;
+	}
+	*nfp = nf;
+	return status;
+}
+/*
+ * Drop one reference on copy-notify state @cps; when it was the last one,
+ * unlink it from its list, remove it from nn->s2s_cp_stateids and free it.
+ * Every caller in this file holds nn->s2s_cp_lock.
+ */
+static void
+_free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps)
+{
+ WARN_ON_ONCE(cps->cp_stateid.sc_type != NFS4_COPYNOTIFY_STID);
+ if (!refcount_dec_and_test(&cps->cp_stateid.sc_count))
+ return;
+ list_del(&cps->cp_list);
+ idr_remove(&nn->s2s_cp_stateids,
+ cps->cp_stateid.stid.si_opaque.so_id);
+ kfree(cps);
+}
+/*
+ * A READ from an inter server to server COPY will have a
+ * copy stateid. Look up the copy notify stateid from the
+ * idr structure and take a reference on it.
+ *
+ * With @clp == NULL a reference is taken and the state is returned through
+ * *cps. With a non-NULL @clp a reference is dropped instead and *cps is
+ * left untouched. Returns nfserr_bad_stateid when the stateid does not
+ * carry this net's copy-notify client id, is not found, or is not of type
+ * NFS4_COPYNOTIFY_STID; 0 otherwise.
+ */
+__be32 manage_cpntf_state(struct nfsd_net *nn, stateid_t *st,
+ struct nfs4_client *clp,
+ struct nfs4_cpntf_state **cps)
+{
+ copy_stateid_t *cps_t;
+ struct nfs4_cpntf_state *state = NULL;
+
+ if (st->si_opaque.so_clid.cl_id != nn->s2s_cp_cl_id)
+ return nfserr_bad_stateid;
+ spin_lock(&nn->s2s_cp_lock);
+ cps_t = idr_find(&nn->s2s_cp_stateids, st->si_opaque.so_id);
+ if (cps_t) {
+ state = container_of(cps_t, struct nfs4_cpntf_state,
+ cp_stateid);
+ if (state->cp_stateid.sc_type != NFS4_COPYNOTIFY_STID) {
+ state = NULL;
+ goto unlock;
+ }
+ if (!clp)
+ refcount_inc(&state->cp_stateid.sc_count);
+ else
+ _free_cpntf_state_locked(nn, state);
+ }
+unlock:
+ spin_unlock(&nn->s2s_cp_lock);
+ if (!state)
+ return nfserr_bad_stateid;
+ if (!clp && state)
+ *cps = state;
+ return 0;
+}
+
+/*
+ * Resolve copy-notify stateid @st to the stid it was issued against: find
+ * the copy-notify state, refresh its timestamp, then look up the recorded
+ * parent clientid and parent stateid. On success *stid holds the result of
+ * nfsd4_lookup_stateid(). The temporary reference on the copy-notify state
+ * is dropped before returning.
+ */
+static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st,
+ struct nfs4_stid **stid)
+{
+ __be32 status;
+ struct nfs4_cpntf_state *cps = NULL;
+ struct nfsd4_compound_state cstate;
+
+ status = manage_cpntf_state(nn, st, NULL, &cps);
+ if (status)
+ return status;
+
+ cps->cpntf_time = ktime_get_boottime_seconds();
+ /* Scratch compound state used only for the two lookups below */
+ memset(&cstate, 0, sizeof(cstate));
+ status = lookup_clientid(&cps->cp_p_clid, &cstate, nn, true);
+ if (status)
+ goto out;
+ status = nfsd4_lookup_stateid(&cstate, &cps->cp_p_stateid,
+ NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
+ stid, nn);
+ put_client_renew(cstate.clp);
+out:
+ nfs4_put_cpntf_state(nn, cps);
+ return status;
+}
+
+/*
+ * Drop a reference on copy-notify state @cps, taking nn->s2s_cp_lock
+ * around the locked helper.
+ */
+void nfs4_put_cpntf_state(struct nfsd_net *nn, struct nfs4_cpntf_state *cps)
+{
+ spin_lock(&nn->s2s_cp_lock);
+ _free_cpntf_state_locked(nn, cps);
+ spin_unlock(&nn->s2s_cp_lock);
+}
+
+/*
+ * Checks for stateid operations
+ *
+ * Validates @stateid for an I/O style operation on @fhp with access @flags
+ * (RD_STATE/WR_STATE):
+ *  - special stateids are handled by check_special_stateids();
+ *  - otherwise the stid is looked up (falling back to a copy-notify
+ *    stateid on nfserr_bad_stateid), its generation, mode and filehandle
+ *    are checked;
+ *  - if @nfp is non-NULL and all checks passed, a file is obtained into
+ *    *nfp (which is set to NULL up front).
+ * On success with @cstid non-NULL and a stid found, the stid reference is
+ * handed to the caller through *cstid; in every other case it is dropped
+ * here.
+ */
+__be32
+nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
+ stateid_t *stateid, int flags, struct nfsd_file **nfp,
+ struct nfs4_stid **cstid)
+{
+ struct inode *ino = d_inode(fhp->fh_dentry);
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct nfs4_stid *s = NULL;
+ __be32 status;
+
+ if (nfp)
+ *nfp = NULL;
+
+ if (grace_disallows_io(net, ino))
+ return nfserr_grace;
+
+ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) {
+ status = check_special_stateids(net, fhp, stateid, flags);
+ goto done;
+ }
+
+ status = nfsd4_lookup_stateid(cstate, stateid,
+ NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
+ &s, nn);
+ /* Not a regular stateid: it may be a copy-notify stateid */
+ if (status == nfserr_bad_stateid)
+ status = find_cpntf_state(nn, stateid, &s);
+ if (status)
+ return status;
+ status = nfsd4_stid_check_stateid_generation(stateid, s,
+ nfsd4_has_session(cstate));
+ if (status)
+ goto out;
+
+ switch (s->sc_type) {
+ case NFS4_DELEG_STID:
+ status = nfs4_check_delegmode(delegstateid(s), flags);
+ break;
+ case NFS4_OPEN_STID:
+ case NFS4_LOCK_STID:
+ status = nfs4_check_olstateid(openlockstateid(s), flags);
+ break;
+ default:
+ status = nfserr_bad_stateid;
+ break;
+ }
+ if (status)
+ goto out;
+ status = nfs4_check_fh(fhp, s);
+
+done:
+ if (status == nfs_ok && nfp)
+ status = nfs4_check_file(rqstp, fhp, s, nfp, flags);
+out:
+ if (s) {
+ if (!status && cstid)
+ *cstid = s;
+ else
+ nfs4_put_stid(s);
+ }
+ return status;
+}
+
+/*
+ * TEST_STATEID: validate every stateid in the request against the
+ * session's client and record a per-stateid status. The operation itself
+ * always returns nfs_ok.
+ */
+__be32
+nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		   union nfsd4_op_u *u)
+{
+	struct nfsd4_test_stateid *test_stateid = &u->test_stateid;
+	struct nfs4_client *cl = cstate->session->se_client;
+	struct nfsd4_test_stateid_id *entry;
+
+	list_for_each_entry(entry, &test_stateid->ts_stateid_list, ts_id_list) {
+		entry->ts_id_status =
+			nfsd4_validate_stateid(cl, &entry->ts_id_stateid);
+	}
+
+	return nfs_ok;
+}
+
+/*
+ * FREE_STATEID on a lock stateid. The reference on @s is always dropped
+ * before returning. Fails with nfserr_locks_held when check_for_locks()
+ * reports locks for the owner on the file; on success the lock stateid is
+ * released.
+ */
+static __be32
+nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s)
+{
+ struct nfs4_ol_stateid *stp = openlockstateid(s);
+ __be32 ret;
+
+ ret = nfsd4_lock_ol_stateid(stp);
+ if (ret)
+ goto out_put_stid;
+
+ ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
+ if (ret)
+ goto out;
+
+ ret = nfserr_locks_held;
+ if (check_for_locks(stp->st_stid.sc_file,
+ lockowner(stp->st_stateowner)))
+ goto out;
+
+ release_lock_stateid(stp);
+ ret = nfs_ok;
+
+out:
+ mutex_unlock(&stp->st_mutex);
+out_put_stid:
+ nfs4_put_stid(s);
+ return ret;
+}
+
+/*
+ * FREE_STATEID: the outcome depends on the type of the stid found for the
+ * session's client:
+ *  - delegation: nfserr_locks_held;
+ *  - open: generation error if any, else nfserr_locks_held;
+ *  - lock: handed to nfsd4_free_lock_stateid() with an extra reference,
+ *    after both spinlocks are released;
+ *  - revoked delegation: removed from its recall list and put, nfs_ok;
+ *  - not found / anything else: nfserr_bad_stateid.
+ */
+__be32
+nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_free_stateid *free_stateid = &u->free_stateid;
+ stateid_t *stateid = &free_stateid->fr_stateid;
+ struct nfs4_stid *s;
+ struct nfs4_delegation *dp;
+ struct nfs4_client *cl = cstate->session->se_client;
+ __be32 ret = nfserr_bad_stateid;
+
+ spin_lock(&cl->cl_lock);
+ s = find_stateid_locked(cl, stateid);
+ if (!s)
+ goto out_unlock;
+ spin_lock(&s->sc_lock);
+ switch (s->sc_type) {
+ case NFS4_DELEG_STID:
+ ret = nfserr_locks_held;
+ break;
+ case NFS4_OPEN_STID:
+ ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
+ if (ret)
+ break;
+ ret = nfserr_locks_held;
+ break;
+ case NFS4_LOCK_STID:
+ spin_unlock(&s->sc_lock);
+ /* Reference for nfsd4_free_lock_stateid(), which puts it */
+ refcount_inc(&s->sc_count);
+ spin_unlock(&cl->cl_lock);
+ ret = nfsd4_free_lock_stateid(stateid, s);
+ goto out;
+ case NFS4_REVOKED_DELEG_STID:
+ spin_unlock(&s->sc_lock);
+ dp = delegstateid(s);
+ list_del_init(&dp->dl_recall_lru);
+ spin_unlock(&cl->cl_lock);
+ nfs4_put_stid(s);
+ ret = nfs_ok;
+ goto out;
+ /* Default falls through and returns nfserr_bad_stateid */
+ }
+ spin_unlock(&s->sc_lock);
+out_unlock:
+ spin_unlock(&cl->cl_lock);
+out:
+ return ret;
+}
+
+/* Map an NFSv4 lock type to the stateid access flag it needs. */
+static inline int
+setlkflg (int type)
+{
+	if (type == NFS4_READW_LT || type == NFS4_READ_LT)
+		return RD_STATE;
+	return WR_STATE;
+}
+
+/*
+ * Common checks for seqid-mutating operations on @stp: seqid, stateid
+ * generation and filehandle.
+ *
+ * On nfs_ok the stateid is returned locked through nfsd4_lock_ol_stateid()
+ * (st_mutex, judging by the unlock below) and the caller must unlock it.
+ * On failure after the lock was taken, st_mutex is released here.
+ */
+static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp)
+{
+ struct svc_fh *current_fh = &cstate->current_fh;
+ struct nfs4_stateowner *sop = stp->st_stateowner;
+ __be32 status;
+
+ status = nfsd4_check_seqid(cstate, sop, seqid);
+ if (status)
+ return status;
+ status = nfsd4_lock_ol_stateid(stp);
+ if (status != nfs_ok)
+ return status;
+ status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
+ if (status == nfs_ok)
+ status = nfs4_check_fh(current_fh, &stp->st_stid);
+ if (status != nfs_ok)
+ mutex_unlock(&stp->st_mutex);
+ return status;
+}
+
+/*
+ * Checks for sequence id mutating operations.
+ *
+ * Looks up @stateid (restricted to @typemask), records its stateowner for
+ * replay in @cstate and runs nfs4_seqid_op_checks(). On success *stpp
+ * holds the stateid, referenced and with st_mutex held; on failure *stpp
+ * is NULL and no reference is kept.
+ */
+static __be32
+nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
+ stateid_t *stateid, char typemask,
+ struct nfs4_ol_stateid **stpp,
+ struct nfsd_net *nn)
+{
+ __be32 status;
+ struct nfs4_stid *s;
+ struct nfs4_ol_stateid *stp = NULL;
+
+ trace_nfsd_preprocess(seqid, stateid);
+
+ *stpp = NULL;
+ status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
+ if (status)
+ return status;
+ stp = openlockstateid(s);
+ nfsd4_cstate_assign_replay(cstate, stp->st_stateowner);
+
+ status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp);
+ if (!status)
+ *stpp = stp;
+ else
+ nfs4_put_stid(&stp->st_stid);
+ return status;
+}
+
+/*
+ * Like nfs4_preprocess_seqid_op() for open stateids, but additionally
+ * requires the openowner to be confirmed. If it is not, the stateid is
+ * unlocked and put and nfserr_bad_stateid is returned.
+ */
+static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
+ stateid_t *stateid, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn)
+{
+ __be32 status;
+ struct nfs4_openowner *oo;
+ struct nfs4_ol_stateid *stp;
+
+ status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
+ NFS4_OPEN_STID, &stp, nn);
+ if (status)
+ return status;
+ oo = openowner(stp->st_stateowner);
+ if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
+ mutex_unlock(&stp->st_mutex);
+ nfs4_put_stid(&stp->st_stid);
+ return nfserr_bad_stateid;
+ }
+ *stpp = stp;
+ return nfs_ok;
+}
+
+/*
+ * OPEN_CONFIRM: mark the openowner of the given open stateid confirmed,
+ * bump and return the stateid, and create the client record. Confirming
+ * an already confirmed openowner fails with nfserr_bad_stateid. The seqid
+ * is bumped according to the final status, except when fh_verify() fails.
+ */
+__be32
+nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_open_confirm *oc = &u->open_confirm;
+ __be32 status;
+ struct nfs4_openowner *oo;
+ struct nfs4_ol_stateid *stp;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ dprintk("NFSD: nfsd4_open_confirm on file %pd\n",
+ cstate->current_fh.fh_dentry);
+
+ status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0);
+ if (status)
+ return status;
+
+ status = nfs4_preprocess_seqid_op(cstate,
+ oc->oc_seqid, &oc->oc_req_stateid,
+ NFS4_OPEN_STID, &stp, nn);
+ if (status)
+ goto out;
+ oo = openowner(stp->st_stateowner);
+ status = nfserr_bad_stateid;
+ if (oo->oo_flags & NFS4_OO_CONFIRMED) {
+ mutex_unlock(&stp->st_mutex);
+ goto put_stateid;
+ }
+ oo->oo_flags |= NFS4_OO_CONFIRMED;
+ nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid);
+ mutex_unlock(&stp->st_mutex);
+ trace_nfsd_open_confirm(oc->oc_seqid, &stp->st_stid.sc_stateid);
+ nfsd4_client_record_create(oo->oo_owner.so_client);
+ status = nfs_ok;
+put_stateid:
+ nfs4_put_stid(&stp->st_stid);
+out:
+ nfsd4_bump_seqid(cstate, status);
+ return status;
+}
+
+/*
+ * If @stp currently records @access, give that access back on the file
+ * and clear it from the stateid; otherwise do nothing.
+ */
+static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access)
+{
+	if (test_access(access, stp)) {
+		nfs4_file_put_access(stp->st_stid.sc_file, access);
+		clear_access(access, stp);
+	}
+}
+
+/*
+ * Reduce the access recorded on @stp to @to_access by dropping the other
+ * access modes. Downgrading to BOTH drops nothing; any other value
+ * triggers a one-time warning.
+ */
+static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access)
+{
+ switch (to_access) {
+ case NFS4_SHARE_ACCESS_READ:
+ nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_WRITE);
+ nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH);
+ break;
+ case NFS4_SHARE_ACCESS_WRITE:
+ nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_READ);
+ nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH);
+ break;
+ case NFS4_SHARE_ACCESS_BOTH:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+}
+
+/*
+ * OPEN_DOWNGRADE: reduce the share access/deny of a confirmed open stateid.
+ * The requested access and deny values must already be set on the stateid,
+ * otherwise nfserr_inval is returned. On success the stateid is bumped and
+ * copied back into the request. The seqid is bumped according to the final
+ * status.
+ */
+__be32
+nfsd4_open_downgrade(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, union nfsd4_op_u *u)
+{
+ struct nfsd4_open_downgrade *od = &u->open_downgrade;
+ __be32 status;
+ struct nfs4_ol_stateid *stp;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ dprintk("NFSD: nfsd4_open_downgrade on file %pd\n",
+ cstate->current_fh.fh_dentry);
+
+ /* We don't yet support WANT bits: */
+ if (od->od_deleg_want)
+ dprintk("NFSD: %s: od_deleg_want=0x%x ignored\n", __func__,
+ od->od_deleg_want);
+
+ status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid,
+ &od->od_stateid, &stp, nn);
+ if (status)
+ goto out;
+ status = nfserr_inval;
+ if (!test_access(od->od_share_access, stp)) {
+ dprintk("NFSD: access not a subset of current bitmap: 0x%hhx, input access=%08x\n",
+ stp->st_access_bmap, od->od_share_access);
+ goto put_stateid;
+ }
+ if (!test_deny(od->od_share_deny, stp)) {
+ dprintk("NFSD: deny not a subset of current bitmap: 0x%hhx, input deny=%08x\n",
+ stp->st_deny_bmap, od->od_share_deny);
+ goto put_stateid;
+ }
+ nfs4_stateid_downgrade(stp, od->od_share_access);
+ reset_union_bmap_deny(od->od_share_deny, stp);
+ nfs4_inc_and_copy_stateid(&od->od_stateid, &stp->st_stid);
+ status = nfs_ok;
+put_stateid:
+ /* Undo the lock and reference taken by the preprocess helper */
+ mutex_unlock(&stp->st_mutex);
+ nfs4_put_stid(&stp->st_stid);
+out:
+ nfsd4_bump_seqid(cstate, status);
+ return status;
+}
+
+/*
+ * Unhash open stateid @s as part of CLOSE and free what was collected on
+ * the local reap list. For clients with a non-zero minor version the
+ * stateid itself is put onto the reap list too and the copy-notify state
+ * of each reaped stateid is freed; for minor version 0 a successfully
+ * unhashed stateid is moved to the close LRU instead.
+ */
+static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
+{
+ struct nfs4_client *clp = s->st_stid.sc_client;
+ bool unhashed;
+ LIST_HEAD(reaplist);
+ struct nfs4_ol_stateid *stp;
+
+ spin_lock(&clp->cl_lock);
+ unhashed = unhash_open_stateid(s, &reaplist);
+
+ if (clp->cl_minorversion) {
+ if (unhashed)
+ put_ol_stateid_locked(s, &reaplist);
+ spin_unlock(&clp->cl_lock);
+ list_for_each_entry(stp, &reaplist, st_locks)
+ nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
+ free_ol_stateid_reaplist(&reaplist);
+ } else {
+ spin_unlock(&clp->cl_lock);
+ free_ol_stateid_reaplist(&reaplist);
+ if (unhashed)
+ move_to_close_lru(s, clp->net);
+ }
+}
+
+/*
+ * nfs4_unlock_state() called after encode
+ *
+ * CLOSE: look up the open (or already closed) stateid, mark it
+ * NFS4_CLOSED_STID, bump it, tear it down through
+ * nfsd4_close_open_stateid() and reply with the special close_stateid.
+ * The seqid is bumped right after the lookup, whatever its outcome.
+ */
+__be32
+nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_close *close = &u->close;
+ __be32 status;
+ struct nfs4_ol_stateid *stp;
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ dprintk("NFSD: nfsd4_close on file %pd\n",
+ cstate->current_fh.fh_dentry);
+
+ status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
+ &close->cl_stateid,
+ NFS4_OPEN_STID|NFS4_CLOSED_STID,
+ &stp, nn);
+ nfsd4_bump_seqid(cstate, status);
+ if (status)
+ goto out;
+
+ stp->st_stid.sc_type = NFS4_CLOSED_STID;
+
+ /*
+ * Technically we don't _really_ have to increment or copy it, since
+ * it should just be gone after this operation and we clobber the
+ * copied value below, but we continue to do so here just to ensure
+ * that racing ops see that there was a state change.
+ */
+ nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
+
+ nfsd4_close_open_stateid(stp);
+ mutex_unlock(&stp->st_mutex);
+
+ /* v4.1+ suggests that we send a special stateid in here, since the
+ * clients should just ignore this anyway. Since this is not useful
+ * for v4.0 clients either, we set it to the special close_stateid
+ * universally.
+ *
+ * See RFC5661 section 18.2.4, and RFC7530 section 16.2.5
+ */
+ memcpy(&close->cl_stateid, &close_stateid, sizeof(close->cl_stateid));
+
+ /* put reference from nfs4_preprocess_seqid_op */
+ nfs4_put_stid(&stp->st_stid);
+out:
+ return status;
+}
+
+/*
+ * DELEGRETURN: after verifying that the current filehandle is a regular
+ * file, look up the delegation stateid, check its generation and destroy
+ * the delegation. The reference taken by the lookup is dropped before
+ * returning.
+ */
+__be32
+nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_delegreturn *dr = &u->delegreturn;
+ struct nfs4_delegation *dp;
+ stateid_t *stateid = &dr->dr_stateid;
+ struct nfs4_stid *s;
+ __be32 status;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
+ return status;
+
+ status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID, &s, nn);
+ if (status)
+ goto out;
+ dp = delegstateid(s);
+ status = nfsd4_stid_check_stateid_generation(stateid, &dp->dl_stid, nfsd4_has_session(cstate));
+ if (status)
+ goto put_stateid;
+
+ destroy_delegation(dp);
+put_stateid:
+ nfs4_put_stid(&dp->dl_stid);
+out:
+ return status;
+}
+
+/*
+ * First offset past the range [start, start + len), saturating at
+ * NFS4_MAX_UINT64 when the addition wraps around.
+ */
+static inline u64
+end_offset(u64 start, u64 len)
+{
+	u64 end = start + len;
+
+	if (end < start)
+		return NFS4_MAX_UINT64;
+	return end;
+}
+
+/*
+ * Offset of the last octet in the range [start, start + len). Saturates
+ * at NFS4_MAX_UINT64 when the addition wraps. @len is expected to be
+ * non-zero; a zero length triggers a one-time warning.
+ */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+	u64 end;
+
+	WARN_ON_ONCE(!len);
+	end = start + len;
+	if (end <= start)
+		return NFS4_MAX_UINT64;
+	return end - 1;
+}
+
+/*
+ * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
+ * we can't properly handle lock requests that go beyond the (2^63 - 1)-th
+ * byte, because of sign extension problems. Since NFSv4 calls for 64-bit
+ * locking, this prevents us from being completely protocol-compliant. The
+ * real solution to this problem is to start using unsigned file offsets in
+ * the VFS, but this is a very deep change!
+ */
+/* Clamp negative fl_start/fl_end of @lock to OFFSET_MAX. */
+static inline void
+nfs4_transform_lock_offset(struct file_lock *lock)
+{
+ if (lock->fl_start < 0)
+ lock->fl_start = OFFSET_MAX;
+ if (lock->fl_end < 0)
+ lock->fl_end = OFFSET_MAX;
+}
+
+/*
+ * lm_get_owner callback: @owner is an nfs4_lockowner; take a stateowner
+ * reference on it and hand the same owner back.
+ */
+static fl_owner_t
+nfsd4_fl_get_owner(fl_owner_t owner)
+{
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner;
+
+ nfs4_get_stateowner(&lo->lo_owner);
+ return owner;
+}
+
+/*
+ * lm_put_owner callback: drop the stateowner reference on the
+ * nfs4_lockowner behind @owner. A NULL owner is ignored.
+ */
+static void
+nfsd4_fl_put_owner(fl_owner_t owner)
+{
+	struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner;
+
+	if (!lo)
+		return;
+	nfs4_put_stateowner(&lo->lo_owner);
+}
+
+/*
+ * lm_notify callback for blocked lock @fl (embedded in an
+ * nfsd4_blocked_lock). If the entry is still queued on nbl_list, remove it
+ * from both lists under blocked_locks_lock and then run its callback.
+ */
+static void
+nfsd4_lm_notify(struct file_lock *fl)
+{
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner;
+ struct net *net = lo->lo_owner.so_client->net;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct nfsd4_blocked_lock *nbl = container_of(fl,
+ struct nfsd4_blocked_lock, nbl_lock);
+ bool queue = false;
+
+ /* An empty list means that something else is going to be using it */
+ spin_lock(&nn->blocked_locks_lock);
+ if (!list_empty(&nbl->nbl_list)) {
+ list_del_init(&nbl->nbl_list);
+ list_del_init(&nbl->nbl_lru);
+ queue = true;
+ }
+ spin_unlock(&nn->blocked_locks_lock);
+
+ if (queue)
+ nfsd4_run_cb(&nbl->nbl_cb);
+}
+
+/*
+ * Lock-manager callbacks that nfsd installs in fl_lmops on the POSIX locks
+ * it requests; nfs4_set_lock_denied() also uses this table's address to
+ * recognise locks whose fl_owner is a struct nfs4_lockowner.
+ */
+static const struct lock_manager_operations nfsd_posix_mng_ops = {
+ .lm_notify = nfsd4_lm_notify,
+ .lm_get_owner = nfsd4_fl_get_owner,
+ .lm_put_owner = nfsd4_fl_put_owner,
+};
+
+/*
+ * Fill in the LOCK/LOCKT "denied" reply @deny from the conflicting lock @fl.
+ *
+ * The owner string and clientid are only reported when @fl was set by nfsd
+ * (fl_lmops == &nfsd_posix_mng_ops) and the owner string could be
+ * duplicated; otherwise an empty owner and a zero clientid are returned.
+ */
+static inline void
+nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
+{
+	struct nfs4_lockowner *lo;
+	bool owner_known = false;
+
+	if (fl->fl_lmops == &nfsd_posix_mng_ops) {
+		lo = (struct nfs4_lockowner *) fl->fl_owner;
+		xdr_netobj_dup(&deny->ld_owner, &lo->lo_owner.so_owner,
+			       GFP_KERNEL);
+		/* On allocation failure we just don't care that much */
+		if (deny->ld_owner.data) {
+			deny->ld_clientid = lo->lo_owner.so_client->cl_clientid;
+			owner_known = true;
+		}
+	}
+	if (!owner_known) {
+		deny->ld_owner.len = 0;
+		deny->ld_owner.data = NULL;
+		deny->ld_clientid.cl_boot = 0;
+		deny->ld_clientid.cl_id = 0;
+	}
+
+	deny->ld_start = fl->fl_start;
+	if (fl->fl_end != NFS4_MAX_UINT64)
+		deny->ld_length = fl->fl_end - fl->fl_start + 1;
+	else
+		deny->ld_length = NFS4_MAX_UINT64;
+
+	if (fl->fl_type != F_RDLCK)
+		deny->ld_type = NFS4_WRITE_LT;
+	else
+		deny->ld_type = NFS4_READ_LT;
+}
+
+/*
+ * Look up the lock owner of @clp whose owner string matches @owner.
+ * Caller must hold clp->cl_lock.  Open owners sharing the hash bucket are
+ * skipped.  On a match the owner is passed through nfs4_get_stateowner()
+ * before being returned; NULL is returned when nothing matches.
+ */
+static struct nfs4_lockowner *
+find_lockowner_str_locked(struct nfs4_client *clp, struct xdr_netobj *owner)
+{
+	unsigned int bucket = ownerstr_hashval(owner);
+	struct nfs4_stateowner *candidate;
+
+	lockdep_assert_held(&clp->cl_lock);
+
+	list_for_each_entry(candidate, &clp->cl_ownerstr_hashtbl[bucket],
+			    so_strhash) {
+		if (!candidate->so_is_open_owner &&
+		    same_owner_str(candidate, owner))
+			return lockowner(nfs4_get_stateowner(candidate));
+	}
+	return NULL;
+}
+
+/*
+ * Locking wrapper around find_lockowner_str_locked(): takes and releases
+ * clp->cl_lock around the lookup.  Returns the matching lock owner or NULL.
+ */
+static struct nfs4_lockowner *
+find_lockowner_str(struct nfs4_client *clp, struct xdr_netobj *owner)
+{
+ struct nfs4_lockowner *lo;
+
+ spin_lock(&clp->cl_lock);
+ lo = find_lockowner_str_locked(clp, owner);
+ spin_unlock(&clp->cl_lock);
+ return lo;
+}
+
+/* so_unhash hook for lock owners: forwards to unhash_lockowner_locked(). */
+static void nfs4_unhash_lockowner(struct nfs4_stateowner *sop)
+{
+ unhash_lockowner_locked(lockowner(sop));
+}
+
+/* so_free hook for lock owners: return the object to lockowner_slab. */
+static void nfs4_free_lockowner(struct nfs4_stateowner *sop)
+{
+	kmem_cache_free(lockowner_slab, lockowner(sop));
+}
+
+/* Stateowner operations assigned to so_ops of every lock owner. */
+static const struct nfs4_stateowner_operations lockowner_ops = {
+ .so_unhash = nfs4_unhash_lockowner,
+ .so_free = nfs4_free_lockowner,
+};
+
+/*
+ * Allocate a lock owner for the owner string in @lock and hash it on @clp.
+ * Reached from the LOCK operation - therefore OPEN and OPEN_CONFIRM (if
+ * needed) have already occurred.
+ *
+ * @strhashval: ownerstr_hashval() of lock->lk_new_owner
+ *
+ * Returns the owner to use: the new one, or a matching owner that was
+ * already hashed when cl_lock was taken (the new allocation is then
+ * freed).  Returns NULL if the allocation fails.
+ */
+static struct nfs4_lockowner *
+alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp,
+			   struct nfs4_ol_stateid *open_stp,
+			   struct nfsd4_lock *lock)
+{
+	struct nfs4_lockowner *fresh, *winner;
+
+	fresh = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp);
+	if (fresh == NULL)
+		return NULL;
+
+	INIT_LIST_HEAD(&fresh->lo_blocked);
+	INIT_LIST_HEAD(&fresh->lo_owner.so_stateids);
+	fresh->lo_owner.so_is_open_owner = 0;
+	fresh->lo_owner.so_seqid = lock->lk_new_lock_seqid;
+	fresh->lo_owner.so_ops = &lockowner_ops;
+
+	spin_lock(&clp->cl_lock);
+	winner = find_lockowner_str_locked(clp, &lock->lk_new_owner);
+	if (winner != NULL) {
+		/* somebody hashed the same owner first: discard ours */
+		nfs4_free_stateowner(&fresh->lo_owner);
+	} else {
+		list_add(&fresh->lo_owner.so_strhash,
+			 &clp->cl_ownerstr_hashtbl[strhashval]);
+		winner = fresh;
+	}
+	spin_unlock(&clp->cl_lock);
+
+	return winner;
+}
+
+/*
+ * Find the lock stateid owned by @lo among those attached to open stateid
+ * @ost.  Caller must hold the client's cl_lock.  A match is returned with
+ * its sc_count incremented; NULL is returned when @ost is unhashed or no
+ * lock stateid matches.
+ */
+static struct nfs4_ol_stateid *
+find_lock_stateid(const struct nfs4_lockowner *lo,
+		  const struct nfs4_ol_stateid *ost)
+{
+	struct nfs4_ol_stateid *lst;
+
+	lockdep_assert_held(&ost->st_stid.sc_client->cl_lock);
+
+	/* If ost is not hashed, ost->st_locks will not be valid */
+	if (nfs4_ol_stateid_unhashed(ost))
+		return NULL;
+
+	list_for_each_entry(lst, &ost->st_locks, st_locks) {
+		if (lst->st_stateowner != &lo->lo_owner)
+			continue;
+		refcount_inc(&lst->st_stid.sc_count);
+		return lst;
+	}
+	return NULL;
+}
+
+/*
+ * Set up @stp as the lock stateid for lock owner @lo on file @fp, attached
+ * to open stateid @open_stp, unless a matching lock stateid already exists.
+ * @inode is not referenced in this function.
+ *
+ * Returns:
+ *  - @stp, linked on the lists below and with its st_mutex still held;
+ *  - an existing stateid found by find_lock_stateid(), once
+ *    nfsd4_lock_ol_stateid() has succeeded on it (@stp's mutex is released);
+ *  - NULL if @open_stp is found to be unhashed (@stp's mutex is released).
+ */
+static struct nfs4_ol_stateid *
+init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
+ struct nfs4_file *fp, struct inode *inode,
+ struct nfs4_ol_stateid *open_stp)
+{
+ struct nfs4_client *clp = lo->lo_owner.so_client;
+ struct nfs4_ol_stateid *retstp;
+
+ mutex_init(&stp->st_mutex);
+ mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX);
+retry:
+ spin_lock(&clp->cl_lock);
+ if (nfs4_ol_stateid_unhashed(open_stp))
+ goto out_close;
+ retstp = find_lock_stateid(lo, open_stp);
+ if (retstp)
+ goto out_found;
+ /* No existing lock stateid: initialise @stp and link it in. */
+ refcount_inc(&stp->st_stid.sc_count);
+ stp->st_stid.sc_type = NFS4_LOCK_STID;
+ stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner);
+ get_nfs4_file(fp);
+ stp->st_stid.sc_file = fp;
+ stp->st_access_bmap = 0;
+ stp->st_deny_bmap = open_stp->st_deny_bmap;
+ stp->st_openstp = open_stp;
+ spin_lock(&fp->fi_lock);
+ list_add(&stp->st_locks, &open_stp->st_locks);
+ list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
+ list_add(&stp->st_perfile, &fp->fi_stateids);
+ spin_unlock(&fp->fi_lock);
+ spin_unlock(&clp->cl_lock);
+ return stp;
+out_found:
+ spin_unlock(&clp->cl_lock);
+ /* If the stateid we found cannot be locked, drop it and search again. */
+ if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) {
+ nfs4_put_stid(&retstp->st_stid);
+ goto retry;
+ }
+ /* To keep mutex tracking happy */
+ mutex_unlock(&stp->st_mutex);
+ return retstp;
+out_close:
+ spin_unlock(&clp->cl_lock);
+ mutex_unlock(&stp->st_mutex);
+ return NULL;
+}
+
+/*
+ * Return the lock stateid for lock owner @lo under open stateid @ost,
+ * creating one if none can be found and locked.
+ *
+ * *@new is set to true only when the stateid allocated here is the one
+ * returned; if init_lock_stateid() hands back anything else, the new
+ * allocation is put again.  Returns NULL if nfs4_alloc_stid() fails or
+ * init_lock_stateid() returns NULL.
+ */
+static struct nfs4_ol_stateid *
+find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi,
+ struct inode *inode, struct nfs4_ol_stateid *ost,
+ bool *new)
+{
+ struct nfs4_stid *ns = NULL;
+ struct nfs4_ol_stateid *lst;
+ struct nfs4_openowner *oo = openowner(ost->st_stateowner);
+ struct nfs4_client *clp = oo->oo_owner.so_client;
+
+ *new = false;
+ spin_lock(&clp->cl_lock);
+ lst = find_lock_stateid(lo, ost);
+ spin_unlock(&clp->cl_lock);
+ if (lst != NULL) {
+ if (nfsd4_lock_ol_stateid(lst) == nfs_ok)
+ goto out;
+ /* found one but could not lock it: drop it and allocate a new one */
+ nfs4_put_stid(&lst->st_stid);
+ }
+ ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid);
+ if (ns == NULL)
+ return NULL;
+
+ lst = init_lock_stateid(openlockstateid(ns), lo, fi, inode, ost);
+ if (lst == openlockstateid(ns))
+ *new = true;
+ else
+ nfs4_put_stid(ns);
+out:
+ return lst;
+}
+
+/*
+ * Returns non-zero when (offset, length) is not an acceptable lock range:
+ * the length is zero, or length exceeds ~offset (so offset + length would
+ * not fit in a u64).  The special length NFS4_MAX_UINT64 is always
+ * accepted.
+ */
+static int
+check_lock_length(u64 offset, u64 length)
+{
+	if (length == 0)
+		return 1;
+	if (length == NFS4_MAX_UINT64)
+		return 0;
+	return length > ~offset;
+}
+
+/*
+ * Make sure lock stateid @lock_stp has @access recorded: if it is not set
+ * yet, take the access on the file and mark it in the stateid.
+ * Caller must hold the file's fi_lock.
+ */
+static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
+{
+	struct nfs4_file *file = lock_stp->st_stid.sc_file;
+
+	lockdep_assert_held(&file->fi_lock);
+
+	if (!test_access(access, lock_stp)) {
+		__nfs4_file_get_access(file, access);
+		set_access(access, lock_stp);
+	}
+}
+
+/*
+ * Resolve the lock owner named in @lock (creating it if necessary) and then
+ * the lock stateid for that owner under open stateid @ost.
+ *
+ * On success *@plst is set and *@new tells whether the stateid was created
+ * here.  Returns nfserr_jukebox if the owner or stateid cannot be obtained,
+ * and nfserr_bad_seqid when, for minorversion 0, an existing owner's seqid
+ * differs from lk_new_lock_seqid.  The lock owner obtained here is put
+ * again on every path that reaches "out".
+ */
+static __be32
+lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
+ struct nfs4_ol_stateid *ost,
+ struct nfsd4_lock *lock,
+ struct nfs4_ol_stateid **plst, bool *new)
+{
+ __be32 status;
+ struct nfs4_file *fi = ost->st_stid.sc_file;
+ struct nfs4_openowner *oo = openowner(ost->st_stateowner);
+ struct nfs4_client *cl = oo->oo_owner.so_client;
+ struct inode *inode = d_inode(cstate->current_fh.fh_dentry);
+ struct nfs4_lockowner *lo;
+ struct nfs4_ol_stateid *lst;
+ unsigned int strhashval;
+
+ lo = find_lockowner_str(cl, &lock->lk_new_owner);
+ if (!lo) {
+ strhashval = ownerstr_hashval(&lock->lk_new_owner);
+ lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock);
+ if (lo == NULL)
+ return nfserr_jukebox;
+ } else {
+ /* with an existing lockowner, seqids must be the same */
+ status = nfserr_bad_seqid;
+ if (!cstate->minorversion &&
+ lock->lk_new_lock_seqid != lo->lo_owner.so_seqid)
+ goto out;
+ }
+
+ lst = find_or_create_lock_stateid(lo, fi, inode, ost, new);
+ if (lst == NULL) {
+ status = nfserr_jukebox;
+ goto out;
+ }
+
+ status = nfs_ok;
+ *plst = lst;
+out:
+ nfs4_put_stateowner(&lo->lo_owner);
+ return status;
+}
+
+/*
+ * LOCK operation
+ *
+ * Validates the request, resolves the lock stateid (creating lock owner and
+ * stateid when lk_is_new is set), then requests a POSIX lock through
+ * vfs_lock_file().  On success lk_resp_stateid is filled in; on a conflict
+ * the result is nfserr_denied with lk_denied set from the conflicting lock.
+ * All exits after the initial checks go through "out", which releases the
+ * blocked-lock entry, file, stateids and conflock that were obtained.
+ */
+__be32
+nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_lock *lock = &u->lock;
+ struct nfs4_openowner *open_sop = NULL;
+ struct nfs4_lockowner *lock_sop = NULL;
+ struct nfs4_ol_stateid *lock_stp = NULL;
+ struct nfs4_ol_stateid *open_stp = NULL;
+ struct nfs4_file *fp;
+ struct nfsd_file *nf = NULL;
+ struct nfsd4_blocked_lock *nbl = NULL;
+ struct file_lock *file_lock = NULL;
+ struct file_lock *conflock = NULL;
+ __be32 status = 0;
+ int lkflg;
+ int err;
+ bool new = false;
+ unsigned char fl_type;
+ unsigned int fl_flags = FL_POSIX;
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
+ (long long) lock->lk_offset,
+ (long long) lock->lk_length);
+
+ if (check_lock_length(lock->lk_offset, lock->lk_length))
+ return nfserr_inval;
+
+ if ((status = fh_verify(rqstp, &cstate->current_fh,
+ S_IFREG, NFSD_MAY_LOCK))) {
+ dprintk("NFSD: nfsd4_lock: permission denied!\n");
+ return status;
+ }
+
+ if (lock->lk_is_new) {
+ if (nfsd4_has_session(cstate))
+ /* See rfc 5661 18.10.3: given clientid is ignored: */
+ memcpy(&lock->lk_new_clientid,
+ &cstate->session->se_client->cl_clientid,
+ sizeof(clientid_t));
+
+ status = nfserr_stale_clientid;
+ if (STALE_CLIENTID(&lock->lk_new_clientid, nn))
+ goto out;
+
+ /* validate and update open stateid and open seqid */
+ status = nfs4_preprocess_confirmed_seqid_op(cstate,
+ lock->lk_new_open_seqid,
+ &lock->lk_new_open_stateid,
+ &open_stp, nn);
+ if (status)
+ goto out;
+ mutex_unlock(&open_stp->st_mutex);
+ open_sop = openowner(open_stp->st_stateowner);
+ status = nfserr_bad_stateid;
+ if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
+ &lock->lk_new_clientid))
+ goto out;
+ status = lookup_or_create_lock_state(cstate, open_stp, lock,
+ &lock_stp, &new);
+ } else {
+ status = nfs4_preprocess_seqid_op(cstate,
+ lock->lk_old_lock_seqid,
+ &lock->lk_old_lock_stateid,
+ NFS4_LOCK_STID, &lock_stp, nn);
+ }
+ if (status)
+ goto out;
+ lock_sop = lockowner(lock_stp->st_stateowner);
+
+ lkflg = setlkflg(lock->lk_type);
+ status = nfs4_check_openmode(lock_stp, lkflg);
+ if (status)
+ goto out;
+
+ /* reclaims are only accepted during grace, everything else only after */
+ status = nfserr_grace;
+ if (locks_in_grace(net) && !lock->lk_reclaim)
+ goto out;
+ status = nfserr_no_grace;
+ if (!locks_in_grace(net) && lock->lk_reclaim)
+ goto out;
+
+ fp = lock_stp->st_stid.sc_file;
+ /* the blocking lock types only set FL_SLEEP when a session is in use */
+ switch (lock->lk_type) {
+ case NFS4_READW_LT:
+ if (nfsd4_has_session(cstate))
+ fl_flags |= FL_SLEEP;
+ fallthrough;
+ case NFS4_READ_LT:
+ spin_lock(&fp->fi_lock);
+ nf = find_readable_file_locked(fp);
+ if (nf)
+ get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
+ spin_unlock(&fp->fi_lock);
+ fl_type = F_RDLCK;
+ break;
+ case NFS4_WRITEW_LT:
+ if (nfsd4_has_session(cstate))
+ fl_flags |= FL_SLEEP;
+ fallthrough;
+ case NFS4_WRITE_LT:
+ spin_lock(&fp->fi_lock);
+ nf = find_writeable_file_locked(fp);
+ if (nf)
+ get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
+ spin_unlock(&fp->fi_lock);
+ fl_type = F_WRLCK;
+ break;
+ default:
+ status = nfserr_inval;
+ goto out;
+ }
+
+ if (!nf) {
+ status = nfserr_openmode;
+ goto out;
+ }
+
+ nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
+ if (!nbl) {
+ dprintk("NFSD: %s: unable to allocate block!\n", __func__);
+ status = nfserr_jukebox;
+ goto out;
+ }
+
+ /* the file_lock handed to the VFS is the one embedded in the block */
+ file_lock = &nbl->nbl_lock;
+ file_lock->fl_type = fl_type;
+ file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner));
+ file_lock->fl_pid = current->tgid;
+ file_lock->fl_file = nf->nf_file;
+ file_lock->fl_flags = fl_flags;
+ file_lock->fl_lmops = &nfsd_posix_mng_ops;
+ file_lock->fl_start = lock->lk_offset;
+ file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
+ nfs4_transform_lock_offset(file_lock);
+
+ conflock = locks_alloc_lock();
+ if (!conflock) {
+ dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
+ status = nfserr_jukebox;
+ goto out;
+ }
+
+ /* for a blocking request, queue the block before trying the lock */
+ if (fl_flags & FL_SLEEP) {
+ nbl->nbl_time = ktime_get_boottime_seconds();
+ spin_lock(&nn->blocked_locks_lock);
+ list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked);
+ list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru);
+ spin_unlock(&nn->blocked_locks_lock);
+ }
+
+ err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, conflock);
+ switch (err) {
+ case 0: /* success! */
+ nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid);
+ status = 0;
+ if (lock->lk_reclaim)
+ nn->somebody_reclaimed = true;
+ break;
+ case FILE_LOCK_DEFERRED:
+ /* clearing nbl skips the dequeue and free under "out" */
+ nbl = NULL;
+ fallthrough;
+ case -EAGAIN: /* conflock holds conflicting lock */
+ status = nfserr_denied;
+ dprintk("NFSD: nfsd4_lock: conflicting lock found!\n");
+ nfs4_set_lock_denied(conflock, &lock->lk_denied);
+ break;
+ case -EDEADLK:
+ status = nfserr_deadlock;
+ break;
+ default:
+ dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err);
+ status = nfserrno(err);
+ break;
+ }
+out:
+ if (nbl) {
+ /* dequeue it if we queued it before */
+ if (fl_flags & FL_SLEEP) {
+ spin_lock(&nn->blocked_locks_lock);
+ list_del_init(&nbl->nbl_list);
+ list_del_init(&nbl->nbl_lru);
+ spin_unlock(&nn->blocked_locks_lock);
+ }
+ free_blocked_lock(nbl);
+ }
+ if (nf)
+ nfsd_file_put(nf);
+ if (lock_stp) {
+ /* Bump seqid manually if the 4.0 replay owner is openowner */
+ if (cstate->replay_owner &&
+ cstate->replay_owner != &lock_sop->lo_owner &&
+ seqid_mutating_err(ntohl(status)))
+ lock_sop->lo_owner.so_seqid++;
+
+ /*
+ * If this is a new, never-before-used stateid, and we are
+ * returning an error, then just go ahead and release it.
+ */
+ if (status && new)
+ release_lock_stateid(lock_stp);
+
+ mutex_unlock(&lock_stp->st_mutex);
+
+ nfs4_put_stid(&lock_stp->st_stid);
+ }
+ if (open_stp)
+ nfs4_put_stid(&open_stp->st_stid);
+ nfsd4_bump_seqid(cstate, status);
+ if (conflock)
+ locks_free_lock(conflock);
+ return status;
+}
+
+/*
+ * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN,
+ * so we do a temporary open here just to get an open file to pass to
+ * vfs_test_lock.
+ *
+ * @lock->fl_file points at the temporary file only for the duration of the
+ * vfs_test_lock() call and is reset to NULL afterwards.  Returns an nfs
+ * status: the nfsd_file_acquire() error, or the nfserrno() translation of
+ * the lease-break or vfs_test_lock() result.
+ */
+static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
+{
+ struct nfsd_file *nf;
+ __be32 err;
+
+ err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
+ if (err)
+ return err;
+ fh_lock(fhp); /* to block new leases till after test_lock: */
+ err = nfserrno(nfsd_open_break_lease(fhp->fh_dentry->d_inode,
+ NFSD_MAY_READ));
+ if (err)
+ goto out;
+ lock->fl_file = nf->nf_file;
+ err = nfserrno(vfs_test_lock(nf->nf_file, lock));
+ lock->fl_file = NULL;
+out:
+ fh_unlock(fhp);
+ nfsd_file_put(nf);
+ return err;
+}
+
+/*
+ * LOCKT operation
+ *
+ * Tests whether the lock described by u->lockt conflicts with an existing
+ * lock, without taking it.  Refused with nfserr_grace during the grace
+ * period and nfserr_inval for a bad range or lock type.  When
+ * nfsd_test_lock() leaves fl_type different from F_UNLCK, the result is
+ * nfserr_denied with lt_denied describing the conflicting lock.
+ */
+__be32
+nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_lockt *lockt = &u->lockt;
+ struct file_lock *file_lock = NULL;
+ struct nfs4_lockowner *lo = NULL;
+ __be32 status;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ if (locks_in_grace(SVC_NET(rqstp)))
+ return nfserr_grace;
+
+ if (check_lock_length(lockt->lt_offset, lockt->lt_length))
+ return nfserr_inval;
+
+ /* the clientid in the request is only looked up when no session is in use */
+ if (!nfsd4_has_session(cstate)) {
+ status = lookup_clientid(&lockt->lt_clientid, cstate, nn,
+ false);
+ if (status)
+ goto out;
+ }
+
+ if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
+ goto out;
+
+ file_lock = locks_alloc_lock();
+ if (!file_lock) {
+ dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
+ status = nfserr_jukebox;
+ goto out;
+ }
+
+ switch (lockt->lt_type) {
+ case NFS4_READ_LT:
+ case NFS4_READW_LT:
+ file_lock->fl_type = F_RDLCK;
+ break;
+ case NFS4_WRITE_LT:
+ case NFS4_WRITEW_LT:
+ file_lock->fl_type = F_WRLCK;
+ break;
+ default:
+ dprintk("NFSD: nfs4_lockt: bad lock type!\n");
+ status = nfserr_inval;
+ goto out;
+ }
+
+ /* if the owner is already known, test on its behalf; else fl_owner stays unset */
+ lo = find_lockowner_str(cstate->clp, &lockt->lt_owner);
+ if (lo)
+ file_lock->fl_owner = (fl_owner_t)lo;
+ file_lock->fl_pid = current->tgid;
+ file_lock->fl_flags = FL_POSIX;
+
+ file_lock->fl_start = lockt->lt_offset;
+ file_lock->fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
+
+ nfs4_transform_lock_offset(file_lock);
+
+ status = nfsd_test_lock(rqstp, &cstate->current_fh, file_lock);
+ if (status)
+ goto out;
+
+ if (file_lock->fl_type != F_UNLCK) {
+ status = nfserr_denied;
+ nfs4_set_lock_denied(file_lock, &lockt->lt_denied);
+ }
+out:
+ if (lo)
+ nfs4_put_stateowner(&lo->lo_owner);
+ if (file_lock)
+ locks_free_lock(file_lock);
+ return status;
+}
+
+/*
+ * LOCKU operation
+ *
+ * Validates the range and the lock stateid, then asks the VFS to drop the
+ * range by issuing an F_UNLCK request through vfs_lock_file().  On success
+ * the updated stateid is copied back into lu_stateid.  Returns
+ * nfserr_lock_range when no file can be found for the stateid and
+ * nfserr_jukebox when the file_lock cannot be allocated.
+ */
+__be32
+nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_locku *locku = &u->locku;
+ struct nfs4_ol_stateid *stp;
+ struct nfsd_file *nf = NULL;
+ struct file_lock *file_lock = NULL;
+ __be32 status;
+ int err;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n",
+ (long long) locku->lu_offset,
+ (long long) locku->lu_length);
+
+ if (check_lock_length(locku->lu_offset, locku->lu_length))
+ return nfserr_inval;
+
+ status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
+ &locku->lu_stateid, NFS4_LOCK_STID,
+ &stp, nn);
+ if (status)
+ goto out;
+ nf = find_any_file(stp->st_stid.sc_file);
+ if (!nf) {
+ status = nfserr_lock_range;
+ goto put_stateid;
+ }
+ file_lock = locks_alloc_lock();
+ if (!file_lock) {
+ dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
+ status = nfserr_jukebox;
+ goto put_file;
+ }
+
+ file_lock->fl_type = F_UNLCK;
+ file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner));
+ file_lock->fl_pid = current->tgid;
+ file_lock->fl_file = nf->nf_file;
+ file_lock->fl_flags = FL_POSIX;
+ file_lock->fl_lmops = &nfsd_posix_mng_ops;
+ file_lock->fl_start = locku->lu_offset;
+
+ file_lock->fl_end = last_byte_offset(locku->lu_offset,
+ locku->lu_length);
+ nfs4_transform_lock_offset(file_lock);
+
+ err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, NULL);
+ if (err) {
+ dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
+ goto out_nfserr;
+ }
+ nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid);
+put_file:
+ nfsd_file_put(nf);
+put_stateid:
+ mutex_unlock(&stp->st_mutex);
+ nfs4_put_stid(&stp->st_stid);
+out:
+ nfsd4_bump_seqid(cstate, status);
+ if (file_lock)
+ locks_free_lock(file_lock);
+ return status;
+
+out_nfserr:
+ /* translate the VFS errno, then rejoin the common cleanup path */
+ status = nfserrno(err);
+ goto put_file;
+}
+
+/*
+ * Report whether @lowner holds any POSIX lock on the inode behind @fp.
+ *
+ * returns
+ *	true:  locks held by lockowner
+ *	false: no locks held by lockowner (also when no file can be found
+ *	       for @fp, which triggers a one-time warning)
+ */
+static bool
+check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
+{
+	struct nfsd_file *nf = find_any_file(fp);
+	struct file_lock_context *flctx;
+	struct file_lock *fl;
+	bool held = false;
+
+	if (!nf) {
+		/* Any valid lock stateid should have some sort of access */
+		WARN_ON_ONCE(1);
+		return false;
+	}
+
+	flctx = locks_inode(nf->nf_file)->i_flctx;
+	if (flctx && !list_empty_careful(&flctx->flc_posix)) {
+		spin_lock(&flctx->flc_lock);
+		list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
+			if (fl->fl_owner != (fl_owner_t)lowner)
+				continue;
+			held = true;
+			break;
+		}
+		spin_unlock(&flctx->flc_lock);
+	}
+	nfsd_file_put(nf);
+	return held;
+}
+
+/*
+ * RELEASE_LOCKOWNER operation
+ *
+ * Looks up the client, then the lock owner matching rl_owner in that
+ * client's owner hash.  If the owner's so_count is not exactly 1 the
+ * request fails with nfserr_locks_held.  Otherwise the owner is unhashed,
+ * its lock stateids are unhashed and reaped, and its blocked locks are
+ * removed.  If no owner matches, the (zero) lookup status is returned.
+ */
+__be32
+nfsd4_release_lockowner(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner;
+ clientid_t *clid = &rlockowner->rl_clientid;
+ struct nfs4_stateowner *sop;
+ struct nfs4_lockowner *lo = NULL;
+ struct nfs4_ol_stateid *stp;
+ struct xdr_netobj *owner = &rlockowner->rl_owner;
+ unsigned int hashval = ownerstr_hashval(owner);
+ __be32 status;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct nfs4_client *clp;
+ LIST_HEAD (reaplist);
+
+ dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
+ clid->cl_boot, clid->cl_id);
+
+ status = lookup_clientid(clid, cstate, nn, false);
+ if (status)
+ return status;
+
+ clp = cstate->clp;
+ /* Find the matching lock stateowner */
+ spin_lock(&clp->cl_lock);
+ list_for_each_entry(sop, &clp->cl_ownerstr_hashtbl[hashval],
+ so_strhash) {
+
+ if (sop->so_is_open_owner || !same_owner_str(sop, owner))
+ continue;
+
+ /* any count other than 1 is reported to the client as locks held */
+ if (atomic_read(&sop->so_count) != 1) {
+ spin_unlock(&clp->cl_lock);
+ return nfserr_locks_held;
+ }
+
+ lo = lockowner(sop);
+ nfs4_get_stateowner(sop);
+ break;
+ }
+ if (!lo) {
+ spin_unlock(&clp->cl_lock);
+ return status;
+ }
+
+ unhash_lockowner_locked(lo);
+ /* collect the owner's stateids on reaplist; they are freed after unlock */
+ while (!list_empty(&lo->lo_owner.so_stateids)) {
+ stp = list_first_entry(&lo->lo_owner.so_stateids,
+ struct nfs4_ol_stateid,
+ st_perstateowner);
+ WARN_ON(!unhash_lock_stateid(stp));
+ put_ol_stateid_locked(stp, &reaplist);
+ }
+ spin_unlock(&clp->cl_lock);
+ free_ol_stateid_reaplist(&reaplist);
+ remove_blocked_locks(lo);
+ nfs4_put_stateowner(&lo->lo_owner);
+
+ return status;
+}
+
+/*
+ * Allocate an uninitialised reclaim record with GFP_KERNEL; returns NULL on
+ * allocation failure.
+ */
+static inline struct nfs4_client_reclaim *
+alloc_reclaim(void)
+{
+ return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL);
+}
+
+/*
+ * True when a reclaim record exists for @name in @nn and that record has a
+ * client attached (cr_clp is set).
+ */
+bool
+nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn)
+{
+	struct nfs4_client_reclaim *rec = nfsd4_find_reclaim_client(name, nn);
+
+	if (!rec)
+		return false;
+	return rec->cr_clp != NULL;
+}
+
+/*
+ * Add a reclaim record for @name / @princhash to @nn's reclaim hash table.
+ *
+ * Returns the new record, or NULL on allocation failure (in which case all
+ * reclaim bets are off, nfserr_no_grace...).
+ *
+ * The record takes over name.data and princhash.data.  The caller is
+ * responsible for freeing name.data if NULL is returned (it will be freed
+ * in nfs4_remove_reclaim_record in the normal case).
+ */
+struct nfs4_client_reclaim *
+nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash,
+		struct nfsd_net *nn)
+{
+	struct nfs4_client_reclaim *rec = alloc_reclaim();
+	unsigned int bucket;
+
+	if (rec == NULL)
+		return NULL;
+
+	bucket = clientstr_hashval(name);
+	INIT_LIST_HEAD(&rec->cr_strhash);
+	list_add(&rec->cr_strhash, &nn->reclaim_str_hashtbl[bucket]);
+	rec->cr_name.data = name.data;
+	rec->cr_name.len = name.len;
+	rec->cr_princhash.data = princhash.data;
+	rec->cr_princhash.len = princhash.len;
+	rec->cr_clp = NULL;
+	nn->reclaim_str_hashtbl_size++;
+
+	return rec;
+}
+
+/*
+ * Unlink reclaim record @crp from its hash chain, free its name and
+ * principal-hash buffers and the record itself, and decrement the table's
+ * entry count.
+ */
+void
+nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn)
+{
+ list_del(&crp->cr_strhash);
+ kfree(crp->cr_name.data);
+ kfree(crp->cr_princhash.data);
+ kfree(crp);
+ nn->reclaim_str_hashtbl_size--;
+}
+
+/*
+ * Remove and free every reclaim record in @nn's reclaim hash table, then
+ * warn (once) if the table's entry count did not drop to zero.
+ */
+void
+nfs4_release_reclaim(struct nfsd_net *nn)
+{
+	struct list_head *bucket;
+	struct nfs4_client_reclaim *rec;
+	int i;
+
+	for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+		bucket = &nn->reclaim_str_hashtbl[i];
+		while (!list_empty(bucket)) {
+			rec = list_first_entry(bucket,
+					       struct nfs4_client_reclaim,
+					       cr_strhash);
+			nfs4_remove_reclaim_record(rec, nn);
+		}
+	}
+	WARN_ON_ONCE(nn->reclaim_str_hashtbl_size);
+}
+
+/*
+ * Called from OPEN, CLAIM_PREVIOUS with a new clientid.
+ *
+ * Returns the reclaim record in @nn whose name equals @name, or NULL if
+ * there is none.
+ */
+struct nfs4_client_reclaim *
+nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn)
+{
+	struct list_head *bucket =
+		&nn->reclaim_str_hashtbl[clientstr_hashval(name)];
+	struct nfs4_client_reclaim *rec;
+
+	list_for_each_entry(rec, bucket, cr_strhash) {
+		if (compare_blob(&rec->cr_name, &name) != 0)
+			continue;
+		return rec;
+	}
+	return NULL;
+}
+
+/*
+ * Called from OPEN. Look for clientid in reclaim list.
+ *
+ * Returns nfs_ok if the client may reclaim, nfserr_reclaim_bad if the
+ * clientid lookup or the client record check fails, and nfserr_no_grace if
+ * the client already has NFSD4_CLIENT_RECLAIM_COMPLETE set.
+ */
+__be32
+nfs4_check_open_reclaim(clientid_t *clid,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd_net *nn)
+{
+ __be32 status;
+
+ /* find clientid in conf_id_hashtbl */
+ status = lookup_clientid(clid, cstate, nn, false);
+ if (status)
+ return nfserr_reclaim_bad;
+
+ if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags))
+ return nfserr_no_grace;
+
+ if (nfsd4_client_record_check(cstate->clp))
+ return nfserr_reclaim_bad;
+
+ return nfs_ok;
+}
+
+/*
+ * Since the lifetime of a delegation isn't limited to that of an open, a
+ * client may quite reasonably hang on to a delegation as long as it has
+ * the inode cached. This becomes an obvious problem the first time a
+ * client's inode cache approaches the size of the server's total memory.
+ *
+ * For now we avoid this problem by imposing a hard limit on the number
+ * of delegations, which varies according to the server's memory size.
+ */
+static void
+set_max_delegations(void)
+{
+ /*
+ * Allow at most 4 delegations per megabyte of RAM. Quick
+ * estimates suggest that in the worst case (where every delegation
+ * is for a different inode), a delegation could take about 1.5K,
+ * giving a worst case usage of about 6% of memory.
+ *
+ * The shift turns a page count into that limit: pages << PAGE_SHIFT
+ * is bytes, >> 20 is megabytes, << 2 is four per megabyte.
+ */
+ max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT);
+}
+
+/*
+ * Allocate and initialise the per-net NFSv4 state in @net's nfsd_net: the
+ * confirmed/unconfirmed client and session hash tables, name trees, LRU
+ * lists, locks, the copy-stateid IDR and the laundromat work item.  Takes a
+ * reference on @net on success.
+ *
+ * Returns 0, or -ENOMEM after freeing whatever tables were allocated.
+ */
+static int nfs4_state_create_net(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ int i;
+
+ nn->conf_id_hashtbl = kmalloc_array(CLIENT_HASH_SIZE,
+ sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!nn->conf_id_hashtbl)
+ goto err;
+ nn->unconf_id_hashtbl = kmalloc_array(CLIENT_HASH_SIZE,
+ sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!nn->unconf_id_hashtbl)
+ goto err_unconf_id;
+ nn->sessionid_hashtbl = kmalloc_array(SESSION_HASH_SIZE,
+ sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!nn->sessionid_hashtbl)
+ goto err_sessionid;
+
+ /* kmalloc_array() does not zero: every bucket head must be initialised */
+ for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+ INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]);
+ INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]);
+ }
+ for (i = 0; i < SESSION_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]);
+ nn->conf_name_tree = RB_ROOT;
+ nn->unconf_name_tree = RB_ROOT;
+ nn->boot_time = ktime_get_real_seconds();
+ nn->grace_ended = false;
+ nn->nfsd4_manager.block_opens = true;
+ INIT_LIST_HEAD(&nn->nfsd4_manager.list);
+ INIT_LIST_HEAD(&nn->client_lru);
+ INIT_LIST_HEAD(&nn->close_lru);
+ INIT_LIST_HEAD(&nn->del_recall_lru);
+ spin_lock_init(&nn->client_lock);
+ spin_lock_init(&nn->s2s_cp_lock);
+ idr_init(&nn->s2s_cp_stateids);
+
+ spin_lock_init(&nn->blocked_locks_lock);
+ INIT_LIST_HEAD(&nn->blocked_locks_lru);
+
+ INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
+ get_net(net);
+
+ return 0;
+
+ /* unwind in reverse order of allocation */
+err_sessionid:
+ kfree(nn->unconf_id_hashtbl);
+err_unconf_id:
+ kfree(nn->conf_id_hashtbl);
+err:
+ return -ENOMEM;
+}
+
+/*
+ * Tear down the per-net NFSv4 state: destroy every confirmed and then every
+ * unconfirmed client, free the hash tables and drop the reference on @net.
+ * Warns if blocked locks are still queued once the confirmed clients are
+ * gone.
+ */
+static void
+nfs4_state_destroy_net(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct list_head *bucket;
+	int i;
+
+	for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+		bucket = &nn->conf_id_hashtbl[i];
+		while (!list_empty(bucket))
+			destroy_client(list_first_entry(bucket,
+					struct nfs4_client, cl_idhash));
+	}
+
+	WARN_ON(!list_empty(&nn->blocked_locks_lru));
+
+	for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+		bucket = &nn->unconf_id_hashtbl[i];
+		while (!list_empty(bucket))
+			destroy_client(list_first_entry(bucket,
+					struct nfs4_client, cl_idhash));
+	}
+
+	kfree(nn->sessionid_hashtbl);
+	kfree(nn->unconf_id_hashtbl);
+	kfree(nn->conf_id_hashtbl);
+	put_net(net);
+}
+
+/*
+ * Bring up NFSv4 state for @net: create the per-net state, start the grace
+ * period, initialise client tracking and schedule the laundromat.
+ *
+ * When track_reclaim_completes is set and the reclaim table is empty, the
+ * grace period is ended immediately and the laundromat is scheduled after
+ * the lease time instead of the grace time.
+ *
+ * Returns 0, or the error from nfs4_state_create_net().
+ */
+int
+nfs4_state_start_net(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ int ret;
+
+ ret = nfs4_state_create_net(net);
+ if (ret)
+ return ret;
+ locks_start_grace(net, &nn->nfsd4_manager);
+ nfsd4_client_tracking_init(net);
+ if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0)
+ goto skip_grace;
+ printk(KERN_INFO "NFSD: starting %lld-second grace period (net %x)\n",
+ nn->nfsd4_grace, net->ns.inum);
+ trace_nfsd_grace_start(nn);
+ queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
+ return 0;
+
+skip_grace:
+ printk(KERN_INFO "NFSD: no clients to reclaim, skipping NFSv4 grace period (net %x)\n",
+ net->ns.inum);
+ queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_lease * HZ);
+ nfsd4_end_grace(nn);
+ return 0;
+}
+
+/*
+ * initialization to perform when the nfsd service is started:
+ *
+ * Creates the "nfsd4" laundromat workqueue and the callback queue, then
+ * computes max_delegations.  Returns 0, -ENOMEM if the workqueue cannot be
+ * allocated, or the error from nfsd4_create_callback_queue() (after
+ * destroying the workqueue again).
+ */
+
+int
+nfs4_state_start(void)
+{
+ int ret;
+
+ laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4");
+ if (laundry_wq == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = nfsd4_create_callback_queue();
+ if (ret)
+ goto out_free_laundry;
+
+ set_max_delegations();
+ return 0;
+
+out_free_laundry:
+ destroy_workqueue(laundry_wq);
+out:
+ return ret;
+}
+
+/*
+ * Shut down NFSv4 state for @net: stop the laundromat, end the grace
+ * period, destroy the delegations still on the recall LRU, then tear down
+ * client tracking and the per-net state.
+ */
+void
+nfs4_state_shutdown_net(struct net *net)
+{
+ struct nfs4_delegation *dp = NULL;
+ struct list_head *pos, *next, reaplist;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ cancel_delayed_work_sync(&nn->laundromat_work);
+ locks_end_grace(&nn->nfsd4_manager);
+
+ INIT_LIST_HEAD(&reaplist);
+ /* unhash under state_lock, collecting onto a private list ... */
+ spin_lock(&state_lock);
+ list_for_each_safe(pos, next, &nn->del_recall_lru) {
+ dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
+ WARN_ON(!unhash_delegation_locked(dp));
+ list_add(&dp->dl_recall_lru, &reaplist);
+ }
+ spin_unlock(&state_lock);
+ /* ... and destroy the collected delegations with the lock dropped */
+ list_for_each_safe(pos, next, &reaplist) {
+ dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
+ list_del_init(&dp->dl_recall_lru);
+ destroy_unhashed_deleg(dp);
+ }
+
+ nfsd4_client_tracking_exit(net);
+ nfs4_state_destroy_net(net);
+}
+
+/*
+ * Global counterpart of nfs4_state_start(): destroy the laundromat
+ * workqueue and the callback queue.
+ */
+void
+nfs4_state_shutdown(void)
+{
+ destroy_workqueue(laundry_wq);
+ nfsd4_destroy_callback_queue();
+}
+
+/*
+ * If the compound has a current stateid saved (CURRENT_STATE_ID_FLAG) and
+ * @stateid satisfies CURRENT_STATEID(), overwrite @stateid with the saved
+ * current stateid; otherwise leave it untouched.
+ */
+static void
+get_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid)
+{
+ if (HAS_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG) &&
+ CURRENT_STATEID(stateid))
+ memcpy(stateid, &cstate->current_stateid, sizeof(stateid_t));
+}
+
+/*
+ * Save @stateid as the compound's current stateid and flag it as valid.
+ * Does nothing when cstate->minorversion is zero.
+ */
+static void
+put_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid)
+{
+	if (!cstate->minorversion)
+		return;
+
+	memcpy(&cstate->current_stateid, stateid, sizeof(stateid_t));
+	SET_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG);
+}
+
+/* Forget the compound's saved current stateid by clearing its flag. */
+void
+clear_current_stateid(struct nfsd4_compound_state *cstate)
+{
+ CLEAR_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG);
+}
+
+/*
+ * functions to set current state id
+ */
+/* Save OPEN_DOWNGRADE's od_stateid as the current stateid. */
+void
+nfsd4_set_opendowngradestateid(struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ put_stateid(cstate, &u->open_downgrade.od_stateid);
+}
+
+/* Save OPEN's op_stateid as the current stateid. */
+void
+nfsd4_set_openstateid(struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ put_stateid(cstate, &u->open.op_stateid);
+}
+
+/* Save CLOSE's cl_stateid as the current stateid. */
+void
+nfsd4_set_closestateid(struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ put_stateid(cstate, &u->close.cl_stateid);
+}
+
+/* Save LOCK's response stateid (lk_resp_stateid) as the current stateid. */
+void
+nfsd4_set_lockstateid(struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ put_stateid(cstate, &u->lock.lk_resp_stateid);
+}
+
+/*
+ * functions to consume current state id
+ */
+
+/* Let OPEN_DOWNGRADE's od_stateid be replaced by the current stateid. */
+void
+nfsd4_get_opendowngradestateid(struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ get_stateid(cstate, &u->open_downgrade.od_stateid);
+}
+
+/* Let DELEGRETURN's dr_stateid be replaced by the current stateid. */
+void
+nfsd4_get_delegreturnstateid(struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ get_stateid(cstate, &u->delegreturn.dr_stateid);
+}
+
+/* Let FREE_STATEID's fr_stateid be replaced by the current stateid. */
+void
+nfsd4_get_freestateid(struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ get_stateid(cstate, &u->free_stateid.fr_stateid);
+}
+
+/* Let SETATTR's sa_stateid be replaced by the current stateid. */
+void
+nfsd4_get_setattrstateid(struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ get_stateid(cstate, &u->setattr.sa_stateid);
+}
+
+/* Let CLOSE's cl_stateid be replaced by the current stateid. */
+void
+nfsd4_get_closestateid(struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ get_stateid(cstate, &u->close.cl_stateid);
+}
+
+/* Let LOCKU's lu_stateid be replaced by the current stateid. */
+void
+nfsd4_get_lockustateid(struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ get_stateid(cstate, &u->locku.lu_stateid);
+}
+
+/* Let READ's rd_stateid be replaced by the current stateid. */
+void
+nfsd4_get_readstateid(struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ get_stateid(cstate, &u->read.rd_stateid);
+}
+
+/* Let WRITE's wr_stateid be replaced by the current stateid. */
+void
+nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ get_stateid(cstate, &u->write.wr_stateid);
+}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
new file mode 100644
index 000000000..dbfa24cf3
--- /dev/null
+++ b/fs/nfsd/nfs4xdr.c
@@ -0,0 +1,5354 @@
+/*
+ * Server-side XDR for NFSv4
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Kendrick Smith <kmsmith@umich.edu>
+ * Andy Adamson <andros@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/statfs.h>
+#include <linux/utsname.h>
+#include <linux/pagemap.h>
+#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/xattr.h>
+#include <uapi/linux/xattr.h>
+
+#include "idmap.h"
+#include "acl.h"
+#include "xdr4.h"
+#include "vfs.h"
+#include "state.h"
+#include "cache.h"
+#include "netns.h"
+#include "pnfs.h"
+#include "filecache.h"
+
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#include <linux/security.h>
+#endif
+
+
+#define NFSDDBG_FACILITY NFSDDBG_XDR
+
+/*
+ * Supported-attribute bitmaps: three rows of three 32-bit words each.
+ * Row 0 uses the NFSD4_ constants, row 1 the NFSD4_1_ constants, and row 2
+ * reuses the NFSD4_1_ words 0 and 1 with the NFSD4_2_ word 2.
+ * NOTE(review): the row index is presumably the NFSv4 minor version --
+ * confirm against nfsd_attrs_supported().
+ */
+const u32 nfsd_suppattrs[3][3] = {
+ {NFSD4_SUPPORTED_ATTRS_WORD0,
+ NFSD4_SUPPORTED_ATTRS_WORD1,
+ NFSD4_SUPPORTED_ATTRS_WORD2},
+
+ {NFSD4_1_SUPPORTED_ATTRS_WORD0,
+ NFSD4_1_SUPPORTED_ATTRS_WORD1,
+ NFSD4_1_SUPPORTED_ATTRS_WORD2},
+
+ {NFSD4_1_SUPPORTED_ATTRS_WORD0,
+ NFSD4_1_SUPPORTED_ATTRS_WORD1,
+ NFSD4_2_SUPPORTED_ATTRS_WORD2},
+};
+
+/*
+ * As per the referral draft, the fsid for a referral MUST be different from
+ * the fsid of the containing directory, in order to indicate to the client
+ * that a filesystem boundary is present.  We use a fixed fsid for a referral.
+ */
+#define NFS4_REFERRAL_FSID_MAJOR 0x8000000ULL
+#define NFS4_REFERRAL_FSID_MINOR 0x8000000ULL
+
+/*
+ * Validate one name component of @len bytes at @str.
+ *
+ * Returns nfserr_inval for a zero-length name, nfserr_badname when
+ * isdotent() matches the name or when it contains a '/', and 0 otherwise.
+ */
+static __be32
+check_filename(char *str, int len)
+{
+	int pos = 0;
+
+	if (!len)
+		return nfserr_inval;
+	if (isdotent(str, len))
+		return nfserr_badname;
+
+	/* a slash would let a single component name a different directory */
+	while (pos < len) {
+		if (str[pos] == '/')
+			return nfserr_badname;
+		pos++;
+	}
+	return 0;
+}
+
+/*
+ * DECODE_HEAD declares the two locals every macro-based decoder relies on:
+ * the read cursor "p" and the result "status".
+ */
+#define DECODE_HEAD \
+ __be32 *p; \
+ __be32 status
+/*
+ * DECODE_TAIL ends a decoder: falling into it returns 0.  It also provides
+ * the "out" label (return the current status) and the "xdr_error" label
+ * (log, set status to nfserr_bad_xdr, then leave through "out") that
+ * READ_BUF and SAVEMEM jump to.
+ */
+#define DECODE_TAIL \
+ status = 0; \
+out: \
+ return status; \
+xdr_error: \
+ dprintk("NFSD: xdr error (%s:%d)\n", \
+ __FILE__, __LINE__); \
+ status = nfserr_bad_xdr; \
+ goto out
+
+/*
+ * READMEM: point x directly at the bytes under the cursor "p" (no copy) and
+ * step p over nbytes rounded up to whole XDR words.
+ */
+#define READMEM(x,nbytes) do { \
+ x = (char *)p; \
+ p += XDR_QUADLEN(nbytes); \
+} while (0)
+/*
+ * SAVEMEM: like READMEM, except that when p points at argp->tmp or
+ * argp->tmpp (the bounce buffers read_buf() may return) the bytes are
+ * duplicated with savemem() instead of being referenced in place.  A failed
+ * duplication logs and jumps to the caller's xdr_error label.
+ */
+#define SAVEMEM(x,nbytes) do { \
+ if (!(x = (p==argp->tmp || p == argp->tmpp) ? \
+ savemem(argp, p, nbytes) : \
+ (char *)p)) { \
+ dprintk("NFSD: xdr error (%s:%d)\n", \
+ __FILE__, __LINE__); \
+ goto xdr_error; \
+ } \
+ p += XDR_QUADLEN(nbytes); \
+} while (0)
+/*
+ * COPYMEM: memcpy nbytes from the cursor into x, then step p over nbytes
+ * rounded up to whole XDR words.
+ */
+#define COPYMEM(x,nbytes) do { \
+ memcpy((x), p, nbytes); \
+ p += XDR_QUADLEN(nbytes); \
+} while (0)
+
+/*
+ * READ_BUF, read_buf(): nbytes must be <= PAGE_SIZE
+ *
+ * Make nbytes of argument data addressable through the local cursor "p".
+ * When the bytes fit between argp->p and argp->end they are used in place
+ * and argp->p is advanced; otherwise read_buf() is asked for them.  If
+ * read_buf() returns NULL the failure is logged and control jumps to the
+ * caller's xdr_error label.  Requires "p", "argp" and "xdr_error" to exist
+ * in the calling function; nbytes is evaluated more than once.
+ */
+#define READ_BUF(nbytes) do { \
+ if (nbytes <= (u32)((char *)argp->end - (char *)argp->p)) { \
+ p = argp->p; \
+ argp->p += XDR_QUADLEN(nbytes); \
+ } else if (!(p = read_buf(argp, nbytes))) { \
+ dprintk("NFSD: xdr error (%s:%d)\n", \
+ __FILE__, __LINE__); \
+ goto xdr_error; \
+ } \
+} while (0)
+
+/*
+ * Move the decode window to the page at the front of argp->pagelist.
+ *
+ * The window covers a whole page when at least PAGE_SIZE bytes of page data
+ * remain, otherwise only the remaining argp->pagelen bytes (rounded up to
+ * XDR words).  argp->pagelen is reduced by what the window now covers.
+ */
+static void next_decode_page(struct nfsd4_compoundargs *argp)
+{
+	struct page *pg = argp->pagelist[0];
+
+	argp->pagelist++;
+	argp->p = page_address(pg);
+
+	if (argp->pagelen >= PAGE_SIZE) {
+		argp->end = argp->p + (PAGE_SIZE>>2);
+		argp->pagelen -= PAGE_SIZE;
+		return;
+	}
+
+	/* final, partially filled page */
+	argp->end = argp->p + XDR_QUADLEN(argp->pagelen);
+	argp->pagelen = 0;
+}
+
+/*
+ * Slow path of READ_BUF: return a pointer to @nbytes contiguous bytes of
+ * argument data when they do not fit in the current decode window, or NULL
+ * if they cannot be provided.
+ *
+ * With no page data left (argp->pagelen == 0) the window is switched, once,
+ * to rq_arg.tail[0] and the bytes must fit there.  Otherwise the request
+ * straddles the current window and the next page: the two pieces are copied
+ * into argp->tmp (small requests) or a freshly allocated argp->tmpp, and
+ * that copy is returned.
+ */
+static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
+{
+ /* We want more bytes than seem to be available.
+ * Maybe we need a new page, maybe we have just run out
+ */
+ unsigned int avail = (char *)argp->end - (char *)argp->p;
+ __be32 *p;
+
+ if (argp->pagelen == 0) {
+ struct kvec *vec = &argp->rqstp->rq_arg.tail[0];
+
+ /* first time past the pages: restart the window on the tail kvec */
+ if (!argp->tail) {
+ argp->tail = true;
+ avail = vec->iov_len;
+ argp->p = vec->iov_base;
+ argp->end = vec->iov_base + avail;
+ }
+
+ if (avail < nbytes)
+ return NULL;
+
+ p = argp->p;
+ argp->p += XDR_QUADLEN(nbytes);
+ return p;
+ }
+
+ /* not enough data left in the whole request */
+ if (avail + argp->pagelen < nbytes)
+ return NULL;
+ if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */
+ return NULL;
+ /* ok, we can do it with the current plus the next page */
+ if (nbytes <= sizeof(argp->tmp))
+ p = argp->tmp;
+ else {
+ /* any previous bounce buffer is released before a new one is made */
+ kfree(argp->tmpp);
+ p = argp->tmpp = kmalloc(nbytes, GFP_KERNEL);
+ if (!p)
+ return NULL;
+
+ }
+ /*
+ * The following memcpy is safe because read_buf is always
+ * called with nbytes > avail, and the two cases above both
+ * guarantee p points to at least nbytes bytes.
+ */
+ memcpy(p, argp->p, avail);
+ next_decode_page(argp);
+ memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
+ argp->p += XDR_QUADLEN(nbytes - avail);
+ return p;
+}
+
+/*
+ * Bytes still undecoded: what remains in the current decode window plus
+ * the page data that has not been entered yet (argp->pagelen).
+ */
+static unsigned int compoundargs_bytes_left(struct nfsd4_compoundargs *argp)
+{
+	unsigned int in_window;
+
+	in_window = (char *)argp->end - (char *)argp->p;
+	return in_window + argp->pagelen;
+}
+
+/* Returns 1 when both cl_boot and cl_id of @clid are zero, else 0. */
+static int zero_clientid(clientid_t *clid)
+{
+	return !(clid->cl_boot != 0 || clid->cl_id != 0);
+}
+
+/**
+ * svcxdr_tmpalloc - allocate scratch memory tied to a compound
+ * @argp: NFSv4 compound argument structure
+ * @len: number of payload bytes wanted
+ *
+ * The allocation is pushed onto the @argp->to_free list so that it can be
+ * released once processing of the compound described by @argp finishes.
+ *
+ * Returns the payload area of the new buffer, or NULL if kmalloc() fails.
+ */
+static void *
+svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len)
+{
+	struct svcxdr_tmpbuf *node = kmalloc(sizeof(*node) + len, GFP_KERNEL);
+
+	if (node == NULL)
+		return NULL;
+
+	/* push onto the head of the per-compound free list */
+	node->next = argp->to_free;
+	argp->to_free = node;
+	return node->buf;
+}
+
+/*
+ * Produce a NUL-terminated copy of the @len bytes at @buf, for xdr strings
+ * that must be handed to kernel interfaces expecting C strings.
+ *
+ * Terminating the string in place is usually not safe because the buffer
+ * might end on a page boundary, so the copy lives in memory obtained from
+ * svcxdr_tmpalloc().  Returns NULL if that allocation fails.
+ */
+static char *
+svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len)
+{
+	char *copy;
+
+	copy = svcxdr_tmpalloc(argp, len + 1);
+	if (copy) {
+		memcpy(copy, buf, len);
+		copy[len] = '\0';
+	}
+	return copy;
+}
+
+/*
+ * Describe the next @buflen bytes of argument data without copying them:
+ * @head receives the remainder of the current decode window and *@pagelist
+ * the current position in the page array.  The decode cursor is then moved
+ * past the payload (rounded up to XDR words), entering a new page through
+ * next_decode_page() when the payload reaches the end of the window.
+ *
+ * Returns nfserr_bad_xdr when fewer than @buflen bytes remain, else 0.
+ */
+static __be32
+svcxdr_construct_vector(struct nfsd4_compoundargs *argp, struct kvec *head,
+ struct page ***pagelist, u32 buflen)
+{
+ int avail;
+ int len;
+ int pages;
+
+ /* Sorry .. no magic macros for this.. *
+ * READ_BUF(write->wr_buflen);
+ * SAVEMEM(write->wr_buf, write->wr_buflen);
+ */
+ avail = (char *)argp->end - (char *)argp->p;
+ if (avail + argp->pagelen < buflen) {
+ dprintk("NFSD: xdr error (%s:%d)\n",
+ __FILE__, __LINE__);
+ return nfserr_bad_xdr;
+ }
+ head->iov_base = argp->p;
+ head->iov_len = avail;
+ *pagelist = argp->pagelist;
+
+ /* padded payload length */
+ len = XDR_QUADLEN(buflen) << 2;
+ if (len >= avail) {
+ len -= avail;
+
+ /* skip the pages that are wholly consumed by the payload */
+ pages = len >> PAGE_SHIFT;
+ argp->pagelist += pages;
+ argp->pagelen -= pages * PAGE_SIZE;
+ len -= pages * PAGE_SIZE;
+
+ next_decode_page(argp);
+ }
+ /* len is now the offset of the cursor inside the current window */
+ argp->p += XDR_QUADLEN(len);
+
+ return 0;
+}
+
+/**
+ * savemem - keep a private copy of decoded bytes
+ * @argp: NFSv4 compound argument structure that owns the copy
+ * @p: start of the bytes to copy
+ * @nbytes: number of bytes to copy
+ *
+ * Returns a copy of the @nbytes bytes at @p, allocated with
+ * svcxdr_tmpalloc() so that it stays valid until processing of the compound
+ * described by @argp finishes, or NULL if the allocation fails.
+ */
+static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
+{
+	void *dup = svcxdr_tmpalloc(argp, nbytes);
+
+	if (dup)
+		memcpy(dup, p, nbytes);
+	return dup;
+}
+
+/*
+ * Decode a 12-byte time value: 64-bit seconds followed by 32-bit
+ * nanoseconds.  A nanosecond field of 1000000000 or more yields
+ * nfserr_inval.
+ */
+static __be32
+nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec64 *tv)
+{
+	DECODE_HEAD;
+
+	READ_BUF(12);
+	p = xdr_decode_hyper(p, &tv->tv_sec);
+	tv->tv_nsec = be32_to_cpup(p);
+	if (tv->tv_nsec >= (u32)1000000000) {
+		status = nfserr_inval;
+		goto out;
+	}
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode an attribute bitmap into the three words at @bmval.
+ *
+ * Words the client did not send are left zero; words beyond the third are
+ * consumed from the stream but ignored.  A word count above 1000 is
+ * rejected as an XDR error.
+ */
+static __be32
+nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
+{
+	u32 nwords, idx;
+	DECODE_HEAD;
+
+	bmval[0] = bmval[1] = bmval[2] = 0;
+
+	READ_BUF(4);
+	nwords = be32_to_cpup(p++);
+	if (nwords > 1000)
+		goto xdr_error;
+
+	READ_BUF(nwords << 2);
+	for (idx = 0; idx < nwords && idx < 3; idx++)
+		bmval[idx] = be32_to_cpup(p++);
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode a settable-attribute list (bitmap, byte length, attribute values).
+ *
+ * @bmval receives the bitmap.  Bits outside NFSD_WRITEABLE_ATTRS_WORD{0,1,2}
+ * are refused with nfserr_inval when nfsd_attrs_supported() accepts the
+ * bitmap and nfserr_attrnotsupp otherwise.  Each attribute whose bit is set
+ * is then decoded in the order tested below, filling @iattr (and its
+ * ia_valid flags), *@acl (allocated with svcxdr_tmpalloc(), or NULL when no
+ * ACL was sent), @label and *@umask.  @umask may be NULL; a MODE_UMASK
+ * attribute is then an XDR error.
+ *
+ * "len" counts the bytes consumed for attribute values and must equal the
+ * length announced by the client, otherwise the result is an XDR error.
+ */
+static __be32
+nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
+ struct iattr *iattr, struct nfs4_acl **acl,
+ struct xdr_netobj *label, int *umask)
+{
+ int expected_len, len = 0;
+ u32 dummy32;
+ char *buf;
+
+ DECODE_HEAD;
+ iattr->ia_valid = 0;
+ if ((status = nfsd4_decode_bitmap(argp, bmval)))
+ return status;
+
+ if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0
+ || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1
+ || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2) {
+ if (nfsd_attrs_supported(argp->minorversion, bmval))
+ return nfserr_inval;
+ return nfserr_attrnotsupp;
+ }
+
+ /* byte length of the attribute values that follow */
+ READ_BUF(4);
+ expected_len = be32_to_cpup(p++);
+
+ if (bmval[0] & FATTR4_WORD0_SIZE) {
+ READ_BUF(8);
+ len += 8;
+ p = xdr_decode_hyper(p, &iattr->ia_size);
+ iattr->ia_valid |= ATTR_SIZE;
+ }
+ if (bmval[0] & FATTR4_WORD0_ACL) {
+ u32 nace;
+ struct nfs4_ace *ace;
+
+ READ_BUF(4); len += 4;
+ nace = be32_to_cpup(p++);
+
+ if (nace > compoundargs_bytes_left(argp)/20)
+ /*
+ * Even with 4-byte names there wouldn't be
+ * space for that many aces; something fishy is
+ * going on:
+ */
+ return nfserr_fbig;
+
+ *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace));
+ if (*acl == NULL)
+ return nfserr_jukebox;
+
+ (*acl)->naces = nace;
+ for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) {
+ /* type, flag, access mask and the length of the "who" string */
+ READ_BUF(16); len += 16;
+ ace->type = be32_to_cpup(p++);
+ ace->flag = be32_to_cpup(p++);
+ ace->access_mask = be32_to_cpup(p++);
+ dummy32 = be32_to_cpup(p++);
+ READ_BUF(dummy32);
+ len += XDR_QUADLEN(dummy32) << 2;
+ READMEM(buf, dummy32);
+ ace->whotype = nfs4_acl_get_whotype(buf, dummy32);
+ status = nfs_ok;
+ /* only named principals need a name-to-id mapping */
+ if (ace->whotype != NFS4_ACL_WHO_NAMED)
+ ;
+ else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
+ status = nfsd_map_name_to_gid(argp->rqstp,
+ buf, dummy32, &ace->who_gid);
+ else
+ status = nfsd_map_name_to_uid(argp->rqstp,
+ buf, dummy32, &ace->who_uid);
+ if (status)
+ return status;
+ }
+ } else
+ *acl = NULL;
+ if (bmval[1] & FATTR4_WORD1_MODE) {
+ READ_BUF(4);
+ len += 4;
+ iattr->ia_mode = be32_to_cpup(p++);
+ iattr->ia_mode &= (S_IFMT | S_IALLUGO);
+ iattr->ia_valid |= ATTR_MODE;
+ }
+ if (bmval[1] & FATTR4_WORD1_OWNER) {
+ READ_BUF(4);
+ len += 4;
+ dummy32 = be32_to_cpup(p++);
+ READ_BUF(dummy32);
+ len += (XDR_QUADLEN(dummy32) << 2);
+ READMEM(buf, dummy32);
+ if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
+ return status;
+ iattr->ia_valid |= ATTR_UID;
+ }
+ if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) {
+ READ_BUF(4);
+ len += 4;
+ dummy32 = be32_to_cpup(p++);
+ READ_BUF(dummy32);
+ len += (XDR_QUADLEN(dummy32) << 2);
+ READMEM(buf, dummy32);
+ if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
+ return status;
+ iattr->ia_valid |= ATTR_GID;
+ }
+ if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
+ READ_BUF(4);
+ len += 4;
+ dummy32 = be32_to_cpup(p++);
+ switch (dummy32) {
+ case NFS4_SET_TO_CLIENT_TIME:
+ /* an explicit time value follows */
+ len += 12;
+ status = nfsd4_decode_time(argp, &iattr->ia_atime);
+ if (status)
+ return status;
+ iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
+ break;
+ case NFS4_SET_TO_SERVER_TIME:
+ iattr->ia_valid |= ATTR_ATIME;
+ break;
+ default:
+ goto xdr_error;
+ }
+ }
+ if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
+ READ_BUF(4);
+ len += 4;
+ dummy32 = be32_to_cpup(p++);
+ switch (dummy32) {
+ case NFS4_SET_TO_CLIENT_TIME:
+ /* an explicit time value follows */
+ len += 12;
+ status = nfsd4_decode_time(argp, &iattr->ia_mtime);
+ if (status)
+ return status;
+ iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET);
+ break;
+ case NFS4_SET_TO_SERVER_TIME:
+ iattr->ia_valid |= ATTR_MTIME;
+ break;
+ default:
+ goto xdr_error;
+ }
+ }
+
+ label->len = 0;
+ if (IS_ENABLED(CONFIG_NFSD_V4_SECURITY_LABEL) &&
+ bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
+ READ_BUF(4);
+ len += 4;
+ dummy32 = be32_to_cpup(p++); /* lfs: we don't use it */
+ READ_BUF(4);
+ len += 4;
+ dummy32 = be32_to_cpup(p++); /* pi: we don't use it either */
+ READ_BUF(4);
+ len += 4;
+ dummy32 = be32_to_cpup(p++);
+ READ_BUF(dummy32);
+ if (dummy32 > NFS4_MAXLABELLEN)
+ return nfserr_badlabel;
+ len += (XDR_QUADLEN(dummy32) << 2);
+ READMEM(buf, dummy32);
+ label->len = dummy32;
+ /* the label is kept as a NUL-terminated copy */
+ label->data = svcxdr_dupstr(argp, buf, dummy32);
+ if (!label->data)
+ return nfserr_jukebox;
+ }
+ if (bmval[2] & FATTR4_WORD2_MODE_UMASK) {
+ /* callers that pass no umask pointer do not accept this attribute */
+ if (!umask)
+ goto xdr_error;
+ READ_BUF(8);
+ len += 8;
+ dummy32 = be32_to_cpup(p++);
+ iattr->ia_mode = dummy32 & (S_IFMT | S_IALLUGO);
+ dummy32 = be32_to_cpup(p++);
+ *umask = dummy32 & S_IRWXUGO;
+ iattr->ia_valid |= ATTR_MODE;
+ }
+ /* the bytes consumed must match what the client announced */
+ if (len != expected_len)
+ goto xdr_error;
+
+ DECODE_TAIL;
+}
+
+/*
+ * Decode a stateid: a 32-bit generation counter followed by the opaque
+ * part, which is copied verbatim.
+ */
+static __be32
+nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid)
+{
+	DECODE_HEAD;
+
+	READ_BUF(sizeof(stateid_t));
+	sid->si_generation = be32_to_cpup(p);
+	p++;
+	memcpy(&sid->si_opaque, p, sizeof(stateid_opaque_t));
+
+	DECODE_TAIL;
+}
+
+/* Decode ACCESS arguments: a single 32-bit mask of requested access bits. */
+static __be32
+nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	access->ac_req_access = be32_to_cpup(p);
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode callback_sec_params4: a counted array of security flavors.
+ *
+ * Every entry is parsed so that the decode position ends up behind the
+ * array, but only the first usable one is recorded in @cbs: RPC_AUTH_NULL,
+ * or RPC_AUTH_UNIX whose uid and gid both map to valid ids in the request's
+ * user namespace.  RPC_AUTH_GSS entries are skipped.  cbs->flavor stays
+ * (u32)-1 when entries were sent but none was usable, and is 0 when the
+ * array is empty.
+ *
+ * Returns 0 on success, nfserr_inval for an unknown flavor and
+ * nfserr_bad_xdr for truncated or malformed input.
+ */
+static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs)
+{
+	DECODE_HEAD;
+	struct user_namespace *userns = nfsd_user_namespace(argp->rqstp);
+	u32 dummy, uid, gid;
+	int i;
+	int nr_secflavs;
+
+	/* callback_sec_params4 */
+	READ_BUF(4);
+	nr_secflavs = be32_to_cpup(p++);
+	if (nr_secflavs)
+		cbs->flavor = (u32)(-1);
+	else
+		/* Is this legal? Be generous, take it to mean AUTH_NONE: */
+		cbs->flavor = 0;
+	for (i = 0; i < nr_secflavs; ++i) {
+		READ_BUF(4);
+		dummy = be32_to_cpup(p++);
+		switch (dummy) {
+		case RPC_AUTH_NULL:
+			/* Nothing to read */
+			if (cbs->flavor == (u32)(-1))
+				cbs->flavor = RPC_AUTH_NULL;
+			break;
+		case RPC_AUTH_UNIX:
+			READ_BUF(8);
+			/* stamp */
+			dummy = be32_to_cpup(p++);
+
+			/*
+			 * machine name: nothing uses it, so it is only
+			 * stepped over (READ_BUF already advanced argp->p).
+			 */
+			dummy = be32_to_cpup(p++);
+			READ_BUF(dummy);
+
+			/* uid, gid */
+			READ_BUF(8);
+			uid = be32_to_cpup(p++);
+			gid = be32_to_cpup(p++);
+
+			/* more gids */
+			READ_BUF(4);
+			dummy = be32_to_cpup(p++);
+			/*
+			 * The count comes from the client: refuse values
+			 * whose byte length would wrap around in 32 bits and
+			 * make READ_BUF skip too little.
+			 */
+			if (dummy > (u32)~0U / 4)
+				goto xdr_error;
+			READ_BUF(dummy * 4);
+			if (cbs->flavor == (u32)(-1)) {
+				kuid_t kuid = make_kuid(userns, uid);
+				kgid_t kgid = make_kgid(userns, gid);
+				if (uid_valid(kuid) && gid_valid(kgid)) {
+					cbs->uid = kuid;
+					cbs->gid = kgid;
+					cbs->flavor = RPC_AUTH_UNIX;
+				} else {
+					dprintk("RPC_AUTH_UNIX with invalid "
+						"uid or gid ignoring!\n");
+				}
+			}
+			break;
+		case RPC_AUTH_GSS:
+			dprintk("RPC_AUTH_GSS callback secflavor "
+				"not supported!\n");
+			READ_BUF(8);
+			/* gcbp_service */
+			dummy = be32_to_cpup(p++);
+			/* gcbp_handle_from_server */
+			dummy = be32_to_cpup(p++);
+			READ_BUF(dummy);
+			/* gcbp_handle_from_client */
+			READ_BUF(4);
+			dummy = be32_to_cpup(p++);
+			READ_BUF(dummy);
+			break;
+		default:
+			dprintk("Illegal callback secflavor\n");
+			return nfserr_inval;
+		}
+	}
+	DECODE_TAIL;
+}
+
+/*
+ * Decode BACKCHANNEL_CTL arguments: the callback program number followed
+ * by the callback security parameters.
+ *
+ * Returns 0 on success, nfserr_bad_xdr if the program number cannot be
+ * read, or whatever nfsd4_decode_cb_sec() reports for the security
+ * parameters -- a failure there must not be reported as success.
+ */
+static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	bc->bc_cb_program = be32_to_cpup(p++);
+	status = nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
+	if (status)
+		goto out;
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode BIND_CONN_TO_SESSION arguments: the session id and the channel
+ * direction.  The trailing ctsa_use_conn_in_rdma_mode word is consumed from
+ * the stream but not stored.
+ */
+static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
+{
+	DECODE_HEAD;
+
+	/* session id + direction + rdma-mode flag */
+	READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
+	COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+	bcts->dir = be32_to_cpup(p);
+	/*
+	 * XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker
+	 * could help us figure out whether we should be using it.
+	 */
+	DECODE_TAIL;
+}
+
+/* Decode CLOSE arguments: a 32-bit seqid followed by the open stateid. */
+static __be32
+nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	close->cl_seqid = be32_to_cpup(p);
+	status = nfsd4_decode_stateid(argp, &close->cl_stateid);
+	if (status)
+		goto out;
+
+	DECODE_TAIL;
+}
+
+
+/* Decode COMMIT arguments: a 64-bit offset followed by a 32-bit count. */
+static __be32
+nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit)
+{
+	DECODE_HEAD;
+
+	READ_BUF(12);
+	p = xdr_decode_hyper(p, &commit->co_offset);
+	commit->co_count = be32_to_cpup(p);
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode CREATE arguments: object type, type-specific data, the new name
+ * and the initial attributes.
+ *
+ * Symlinks carry their target (kept as a NUL-terminated copy), block and
+ * character devices carry two specdata words, every other type carries
+ * nothing.  The name is validated with check_filename().
+ */
+static __be32
+nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	create->cr_type = be32_to_cpup(p++);
+	if (create->cr_type == NF4LNK) {
+		READ_BUF(4);
+		create->cr_datalen = be32_to_cpup(p++);
+		READ_BUF(create->cr_datalen);
+		create->cr_data = svcxdr_dupstr(argp, p, create->cr_datalen);
+		if (!create->cr_data) {
+			status = nfserr_jukebox;
+			goto out;
+		}
+	} else if (create->cr_type == NF4BLK || create->cr_type == NF4CHR) {
+		READ_BUF(8);
+		create->cr_specdata1 = be32_to_cpup(p++);
+		create->cr_specdata2 = be32_to_cpup(p++);
+	}
+	/* NF4SOCK, NF4FIFO, NF4DIR and anything else: no type data */
+
+	READ_BUF(4);
+	create->cr_namelen = be32_to_cpup(p++);
+	READ_BUF(create->cr_namelen);
+	SAVEMEM(create->cr_name, create->cr_namelen);
+	status = check_filename(create->cr_name, create->cr_namelen);
+	if (status)
+		goto out;
+
+	status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
+				    &create->cr_acl, &create->cr_label,
+				    &create->cr_umask);
+	if (status)
+		goto out;
+
+	DECODE_TAIL;
+}
+
+/* Decode DELEGRETURN arguments: just the delegation stateid. */
+static inline __be32
+nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
+{
+	__be32 status = nfsd4_decode_stateid(argp, &dr->dr_stateid);
+
+	return status;
+}
+
+/* Decode GETATTR arguments: just the bitmap of requested attributes. */
+static inline __be32
+nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr)
+{
+	__be32 status = nfsd4_decode_bitmap(argp, getattr->ga_bmval);
+
+	return status;
+}
+
+/*
+ * Decode LINK arguments: the length-prefixed name of the new link, which
+ * must pass check_filename().
+ */
+static __be32
+nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	link->li_namelen = be32_to_cpup(p++);
+	READ_BUF(link->li_namelen);
+	SAVEMEM(link->li_name, link->li_namelen);
+
+	status = check_filename(link->li_name, link->li_namelen);
+	if (status)
+		goto out;
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode LOCK arguments.
+ *
+ * The fixed part holds the lock type (which must lie between NFS4_READ_LT
+ * and NFS4_WRITEW_LT), the reclaim flag, offset, length and the
+ * new_lock_owner flag.  That flag selects what follows: either an open
+ * seqid, open stateid, lock seqid, clientid and owner string, or an
+ * existing lock stateid with its seqid.
+ */
+static __be32
+nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
+{
+	DECODE_HEAD;
+
+	/* type, reclaim(boolean), offset, length, new_lock_owner(boolean) */
+	READ_BUF(28);
+	lock->lk_type = be32_to_cpup(p++);
+	if (lock->lk_type < NFS4_READ_LT || lock->lk_type > NFS4_WRITEW_LT)
+		goto xdr_error;
+	lock->lk_reclaim = be32_to_cpup(p++);
+	p = xdr_decode_hyper(p, &lock->lk_offset);
+	p = xdr_decode_hyper(p, &lock->lk_length);
+	lock->lk_is_new = be32_to_cpup(p++);
+
+	if (!lock->lk_is_new) {
+		/* existing lock owner */
+		status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid);
+		if (status)
+			goto out;
+		READ_BUF(4);
+		lock->lk_old_lock_seqid = be32_to_cpup(p++);
+	} else {
+		/* new lock owner, derived from an open */
+		READ_BUF(4);
+		lock->lk_new_open_seqid = be32_to_cpup(p++);
+		status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid);
+		if (status)
+			goto out;
+		READ_BUF(8 + sizeof(clientid_t));
+		lock->lk_new_lock_seqid = be32_to_cpup(p++);
+		COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t));
+		lock->lk_new_owner.len = be32_to_cpup(p++);
+		READ_BUF(lock->lk_new_owner.len);
+		READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len);
+	}
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode LOCKT arguments: lock type (which must lie between NFS4_READ_LT
+ * and NFS4_WRITEW_LT), offset, length, clientid and the owner string.  The
+ * owner data is referenced in place, not copied.
+ */
+static __be32
+nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
+{
+	DECODE_HEAD;
+
+	/* type + offset + length + clientid + owner length */
+	READ_BUF(32);
+	lockt->lt_type = be32_to_cpup(p++);
+	if (lockt->lt_type < NFS4_READ_LT || lockt->lt_type > NFS4_WRITEW_LT)
+		goto xdr_error;
+	p = xdr_decode_hyper(p, &lockt->lt_offset);
+	p = xdr_decode_hyper(p, &lockt->lt_length);
+	COPYMEM(&lockt->lt_clientid, 8);
+	lockt->lt_owner.len = be32_to_cpup(p++);
+
+	READ_BUF(lockt->lt_owner.len);
+	READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode LOCKU arguments: lock type (which must lie between NFS4_READ_LT
+ * and NFS4_WRITEW_LT), seqid, lock stateid, offset and length.
+ */
+static __be32
+nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
+{
+	DECODE_HEAD;
+
+	READ_BUF(8);
+	locku->lu_type = be32_to_cpup(p++);
+	if (locku->lu_type < NFS4_READ_LT || locku->lu_type > NFS4_WRITEW_LT)
+		goto xdr_error;
+	locku->lu_seqid = be32_to_cpup(p++);
+
+	status = nfsd4_decode_stateid(argp, &locku->lu_stateid);
+	if (status)
+		goto out;
+
+	READ_BUF(16);
+	p = xdr_decode_hyper(p, &locku->lu_offset);
+	p = xdr_decode_hyper(p, &locku->lu_length);
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode LOOKUP arguments: the length-prefixed component name, which must
+ * pass check_filename().
+ */
+static __be32
+nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	lookup->lo_len = be32_to_cpup(p++);
+	READ_BUF(lookup->lo_len);
+	SAVEMEM(lookup->lo_name, lookup->lo_len);
+
+	status = check_filename(lookup->lo_name, lookup->lo_len);
+	if (status)
+		goto out;
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode and validate a share_access word.
+ *
+ * The word is split with the NFS4_SHARE_{ACCESS,WANT,WHEN}_MASK masks into
+ * *@share_access, *@deleg_want and, if @deleg_when is not NULL,
+ * *@deleg_when.  The access part must be READ, WRITE or BOTH.  Any further
+ * bits are only accepted for minor versions above 0, the "want" part must
+ * be one of the known values, and what is left after that must be one of
+ * the known "when" combinations.  Callers passing a NULL @deleg_when get
+ * nfserr_inval if such leftover bits are present.
+ */
+static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *share_access, u32 *deleg_want, u32 *deleg_when)
+{
+	__be32 *p;
+	u32 word, access, rest;
+
+	READ_BUF(4);
+	word = be32_to_cpup(p);
+	access = word & NFS4_SHARE_ACCESS_MASK;
+
+	*share_access = access;
+	*deleg_want = word & NFS4_SHARE_WANT_MASK;
+	if (deleg_when)
+		*deleg_when = word & NFS4_SHARE_WHEN_MASK;
+
+	if (access != NFS4_SHARE_ACCESS_READ &&
+	    access != NFS4_SHARE_ACCESS_WRITE &&
+	    access != NFS4_SHARE_ACCESS_BOTH)
+		return nfserr_bad_xdr;
+
+	rest = word & ~NFS4_SHARE_ACCESS_MASK;
+	if (rest == 0)
+		return nfs_ok;
+	/* everything beyond the access bits requires a minor version > 0 */
+	if (!argp->minorversion)
+		return nfserr_bad_xdr;
+
+	switch (rest & NFS4_SHARE_WANT_MASK) {
+	case NFS4_SHARE_WANT_NO_PREFERENCE:
+	case NFS4_SHARE_WANT_READ_DELEG:
+	case NFS4_SHARE_WANT_WRITE_DELEG:
+	case NFS4_SHARE_WANT_ANY_DELEG:
+	case NFS4_SHARE_WANT_NO_DELEG:
+	case NFS4_SHARE_WANT_CANCEL:
+		break;
+	default:
+		return nfserr_bad_xdr;
+	}
+
+	rest &= ~NFS4_SHARE_WANT_MASK;
+	if (rest == 0)
+		return nfs_ok;
+
+	if (!deleg_when)	/* open_downgrade */
+		return nfserr_inval;
+
+	if (rest == NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL ||
+	    rest == NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED ||
+	    rest == (NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL |
+		     NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED))
+		return nfs_ok;
+xdr_error:
+	return nfserr_bad_xdr;
+}
+
+/*
+ * Decode a share_deny word into *@x.  Bits outside NFS4_SHARE_DENY_BOTH
+ * make the word an XDR error; the value is stored in *@x either way.
+ */
+static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x)
+{
+	__be32 *p;
+	u32 deny;
+
+	READ_BUF(4);
+	deny = be32_to_cpup(p);
+	*x = deny;
+	/* Note: unlike access bits, deny bits may be zero. */
+	return (deny & ~NFS4_SHARE_DENY_BOTH) ? nfserr_bad_xdr : nfs_ok;
+xdr_error:
+	return nfserr_bad_xdr;
+}
+
+/*
+ * Decode a length-prefixed opaque into @o.  The length must be non-zero
+ * and no larger than NFS4_OPAQUE_LIMIT; anything else, or running out of
+ * data, yields nfserr_bad_xdr.
+ */
+static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o)
+{
+	__be32 *p;
+
+	READ_BUF(4);
+	o->len = be32_to_cpup(p);
+	if (!o->len || o->len > NFS4_OPAQUE_LIMIT)
+		goto xdr_error;
+
+	READ_BUF(o->len);
+	SAVEMEM(o->data, o->len);
+	return nfs_ok;
+xdr_error:
+	return nfserr_bad_xdr;
+}
+
+/*
+ * Decode OPEN arguments into @open.
+ *
+ * The stream holds, in order: seqid, share_access, share_deny, clientid,
+ * owner, the create discriminator (with create mode, attributes and/or
+ * verifier when creating) and finally the claim.  The EXCLUSIVE4_1 create
+ * mode and the *_FH claim types are XDR errors for minor version 0.
+ * Failures reported by the share_access, share_deny and owner helpers are
+ * all turned into nfserr_bad_xdr; failures from the attribute, stateid and
+ * filename helpers are returned unchanged.
+ */
+static __be32
+nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
+{
+ DECODE_HEAD;
+ u32 dummy;
+
+ /* start from a clean slate for the fields that are decoded optionally */
+ memset(open->op_bmval, 0, sizeof(open->op_bmval));
+ open->op_iattr.ia_valid = 0;
+ open->op_openowner = NULL;
+
+ open->op_xdr_error = 0;
+ /* seqid, share_access, share_deny, clientid, ownerlen */
+ READ_BUF(4);
+ open->op_seqid = be32_to_cpup(p++);
+ /* decode, yet ignore deleg_when until supported */
+ status = nfsd4_decode_share_access(argp, &open->op_share_access,
+ &open->op_deleg_want, &dummy);
+ if (status)
+ goto xdr_error;
+ status = nfsd4_decode_share_deny(argp, &open->op_share_deny);
+ if (status)
+ goto xdr_error;
+ READ_BUF(sizeof(clientid_t));
+ COPYMEM(&open->op_clientid, sizeof(clientid_t));
+ status = nfsd4_decode_opaque(argp, &open->op_owner);
+ if (status)
+ goto xdr_error;
+ /* create discriminator */
+ READ_BUF(4);
+ open->op_create = be32_to_cpup(p++);
+ switch (open->op_create) {
+ case NFS4_OPEN_NOCREATE:
+ break;
+ case NFS4_OPEN_CREATE:
+ READ_BUF(4);
+ open->op_createmode = be32_to_cpup(p++);
+ switch (open->op_createmode) {
+ case NFS4_CREATE_UNCHECKED:
+ case NFS4_CREATE_GUARDED:
+ /* attributes only */
+ status = nfsd4_decode_fattr(argp, open->op_bmval,
+ &open->op_iattr, &open->op_acl, &open->op_label,
+ &open->op_umask);
+ if (status)
+ goto out;
+ break;
+ case NFS4_CREATE_EXCLUSIVE:
+ /* verifier only */
+ READ_BUF(NFS4_VERIFIER_SIZE);
+ COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
+ break;
+ case NFS4_CREATE_EXCLUSIVE4_1:
+ /* verifier followed by attributes */
+ if (argp->minorversion < 1)
+ goto xdr_error;
+ READ_BUF(NFS4_VERIFIER_SIZE);
+ COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
+ status = nfsd4_decode_fattr(argp, open->op_bmval,
+ &open->op_iattr, &open->op_acl, &open->op_label,
+ &open->op_umask);
+ if (status)
+ goto out;
+ break;
+ default:
+ goto xdr_error;
+ }
+ break;
+ default:
+ goto xdr_error;
+ }
+
+ /* open_claim */
+ READ_BUF(4);
+ open->op_claim_type = be32_to_cpup(p++);
+ switch (open->op_claim_type) {
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+ /* file name only */
+ READ_BUF(4);
+ open->op_fname.len = be32_to_cpup(p++);
+ READ_BUF(open->op_fname.len);
+ SAVEMEM(open->op_fname.data, open->op_fname.len);
+ if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
+ return status;
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ READ_BUF(4);
+ open->op_delegate_type = be32_to_cpup(p++);
+ break;
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ /* delegation stateid followed by the file name */
+ status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
+ if (status)
+ return status;
+ READ_BUF(4);
+ open->op_fname.len = be32_to_cpup(p++);
+ READ_BUF(open->op_fname.len);
+ SAVEMEM(open->op_fname.data, open->op_fname.len);
+ if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
+ return status;
+ break;
+ case NFS4_OPEN_CLAIM_FH:
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+ if (argp->minorversion < 1)
+ goto xdr_error;
+ /* void */
+ break;
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ if (argp->minorversion < 1)
+ goto xdr_error;
+ status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
+ if (status)
+ return status;
+ break;
+ default:
+ goto xdr_error;
+ }
+
+ DECODE_TAIL;
+}
+
+/*
+ * Decode OPEN_CONFIRM arguments: the open stateid followed by a seqid.
+ * The operation is refused with nfserr_notsupp for minor versions >= 1.
+ */
+static __be32
+nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf)
+{
+	DECODE_HEAD;
+
+	if (argp->minorversion >= 1) {
+		status = nfserr_notsupp;
+		goto out;
+	}
+
+	status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
+	if (status)
+		goto out;
+	READ_BUF(4);
+	open_conf->oc_seqid = be32_to_cpup(p);
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode OPEN_DOWNGRADE arguments: open stateid, seqid, share_access and
+ * share_deny.  No deleg_when output is passed to the share_access decoder.
+ */
+static __be32
+nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down)
+{
+	DECODE_HEAD;
+
+	status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
+	if (status)
+		goto out;
+
+	READ_BUF(4);
+	open_down->od_seqid = be32_to_cpup(p);
+
+	status = nfsd4_decode_share_access(argp, &open_down->od_share_access,
+					   &open_down->od_deleg_want, NULL);
+	if (status)
+		goto out;
+
+	status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny);
+	if (status)
+		goto out;
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode PUTFH arguments: a length-prefixed file handle.  Lengths above
+ * NFS4_FHSIZE are rejected as an XDR error before any handle bytes are read.
+ */
+static __be32
+nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	putfh->pf_fhlen = be32_to_cpup(p++);
+	if (putfh->pf_fhlen > NFS4_FHSIZE)
+		goto xdr_error;
+
+	READ_BUF(putfh->pf_fhlen);
+	SAVEMEM(putfh->pf_fhval, putfh->pf_fhlen);
+
+	DECODE_TAIL;
+}
+
+/*
+ * PUTPUBFH has no arguments to decode; it is accepted for minor version 0
+ * only and answered with nfserr_notsupp otherwise.
+ */
+static __be32
+nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p)
+{
+	return argp->minorversion ? nfserr_notsupp : nfs_ok;
+}
+
+/*
+ * Decode READ arguments: stateid, 64-bit offset and 32-bit length.
+ */
+static __be32
+nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
+{
+	DECODE_HEAD;
+
+	status = nfsd4_decode_stateid(argp, &read->rd_stateid);
+	if (status)
+		goto out;
+
+	READ_BUF(12);
+	p = xdr_decode_hyper(p, &read->rd_offset);
+	read->rd_length = be32_to_cpup(p);
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode READDIR arguments: 64-bit cookie, cookie verifier, dircount,
+ * maxcount and the bitmap of requested attributes.
+ */
+static __be32
+nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir)
+{
+	DECODE_HEAD;
+
+	READ_BUF(24);
+	p = xdr_decode_hyper(p, &readdir->rd_cookie);
+	COPYMEM(readdir->rd_verf.data, sizeof(readdir->rd_verf.data));
+	readdir->rd_dircount = be32_to_cpup(p++);
+	readdir->rd_maxcount = be32_to_cpup(p++);
+
+	status = nfsd4_decode_bitmap(argp, readdir->rd_bmval);
+	if (status)
+		goto out;
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode REMOVE arguments: the length-prefixed name to remove, which must
+ * pass check_filename().
+ */
+static __be32
+nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	remove->rm_namelen = be32_to_cpup(p++);
+	READ_BUF(remove->rm_namelen);
+	SAVEMEM(remove->rm_name, remove->rm_namelen);
+
+	status = check_filename(remove->rm_name, remove->rm_namelen);
+	if (status)
+		goto out;
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode RENAME arguments: the source name followed by the target name.
+ * Both names are read from the stream first and only then validated with
+ * check_filename(), source before target.
+ */
+static __be32
+nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename)
+{
+	DECODE_HEAD;
+
+	/* source name */
+	READ_BUF(4);
+	rename->rn_snamelen = be32_to_cpup(p++);
+	READ_BUF(rename->rn_snamelen);
+	SAVEMEM(rename->rn_sname, rename->rn_snamelen);
+
+	/* target name */
+	READ_BUF(4);
+	rename->rn_tnamelen = be32_to_cpup(p++);
+	READ_BUF(rename->rn_tnamelen);
+	SAVEMEM(rename->rn_tname, rename->rn_tnamelen);
+
+	status = check_filename(rename->rn_sname, rename->rn_snamelen);
+	if (status)
+		goto out;
+	status = check_filename(rename->rn_tname, rename->rn_tnamelen);
+	if (status)
+		goto out;
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode RENEW arguments: a clientid, copied verbatim.  The operation is
+ * refused with nfserr_notsupp for minor versions >= 1.
+ */
+static __be32
+nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
+{
+	DECODE_HEAD;
+
+	if (argp->minorversion >= 1) {
+		status = nfserr_notsupp;
+		goto out;
+	}
+
+	READ_BUF(sizeof(clientid_t));
+	memcpy(clientid, p, sizeof(clientid_t));
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode SECINFO arguments: the length-prefixed component name, which must
+ * pass check_filename().
+ */
+static __be32
+nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
+		     struct nfsd4_secinfo *secinfo)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	secinfo->si_namelen = be32_to_cpup(p++);
+	READ_BUF(secinfo->si_namelen);
+	SAVEMEM(secinfo->si_name, secinfo->si_namelen);
+
+	status = check_filename(secinfo->si_name, secinfo->si_namelen);
+	if (status)
+		goto out;
+
+	DECODE_TAIL;
+}
+
+/* Decode SECINFO_NO_NAME arguments: a single 32-bit style selector. */
+static __be32
+nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
+			     struct nfsd4_secinfo_no_name *sin)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	sin->sin_style = be32_to_cpup(p);
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode SETATTR arguments: a stateid followed by the attributes to set.
+ * No umask pointer is handed to nfsd4_decode_fattr() here.
+ */
+static __be32
+nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
+{
+	__be32 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
+
+	if (!status)
+		status = nfsd4_decode_fattr(argp, setattr->sa_bmval,
+					    &setattr->sa_iattr,
+					    &setattr->sa_acl,
+					    &setattr->sa_label, NULL);
+	return status;
+}
+
+/*
+ * Decode SETCLIENTID arguments: verifier, client name, callback program,
+ * callback netid and address strings, and the callback ident.  The
+ * operation is refused with nfserr_notsupp for minor versions >= 1, and
+ * any failure decoding the client name is reported as nfserr_bad_xdr.
+ */
+static __be32
+nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid)
+{
+	DECODE_HEAD;
+
+	if (argp->minorversion >= 1) {
+		status = nfserr_notsupp;
+		goto out;
+	}
+
+	READ_BUF(NFS4_VERIFIER_SIZE);
+	COPYMEM(setclientid->se_verf.data, NFS4_VERIFIER_SIZE);
+
+	if (nfsd4_decode_opaque(argp, &setclientid->se_name)) {
+		status = nfserr_bad_xdr;
+		goto out;
+	}
+
+	/* callback program and netid */
+	READ_BUF(8);
+	setclientid->se_callback_prog = be32_to_cpup(p++);
+	setclientid->se_callback_netid_len = be32_to_cpup(p++);
+	READ_BUF(setclientid->se_callback_netid_len);
+	SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len);
+
+	/* callback address */
+	READ_BUF(4);
+	setclientid->se_callback_addr_len = be32_to_cpup(p++);
+	READ_BUF(setclientid->se_callback_addr_len);
+	SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len);
+
+	READ_BUF(4);
+	setclientid->se_callback_ident = be32_to_cpup(p);
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode SETCLIENTID_CONFIRM arguments: an 8-byte clientid followed by the
+ * confirmation verifier.  The operation is refused with nfserr_notsupp for
+ * minor versions >= 1.
+ */
+static __be32
+nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c)
+{
+	DECODE_HEAD;
+
+	if (argp->minorversion >= 1) {
+		status = nfserr_notsupp;
+		goto out;
+	}
+
+	READ_BUF(8 + NFS4_VERIFIER_SIZE);
+	COPYMEM(&scd_c->sc_clientid, 8);
+	memcpy(&scd_c->sc_confirm, p, NFS4_VERIFIER_SIZE);
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode VERIFY arguments: an attribute bitmap followed by a 32-bit length
+ * and that many bytes of attribute data.  The attribute data is not
+ * decoded here; ve_attrval is left pointing at the raw XDR bytes.
+ *
+ * Also used for NVERIFY.
+ */
+static __be32
+nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify)
+{
+ DECODE_HEAD;
+
+ if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval)))
+ goto out;
+
+ /* For convenience's sake, we compare raw xdr'd attributes in
+ * nfsd4_proc_verify */
+
+ READ_BUF(4);
+ verify->ve_attrlen = be32_to_cpup(p++);
+ READ_BUF(verify->ve_attrlen);
+ SAVEMEM(verify->ve_attrval, verify->ve_attrlen);
+
+ DECODE_TAIL;
+}
+
+/*
+ * Decode WRITE arguments: stateid, 64-bit offset, stable_how and the
+ * 32-bit data length.  A stable_how value above NFS_FILE_SYNC is rejected
+ * through the xdr_error path.  The data itself is not copied here;
+ * svcxdr_construct_vector() sets up wr_head and wr_pagelist for
+ * wr_buflen bytes.
+ */
+static __be32
+nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
+{
+ DECODE_HEAD;
+
+ status = nfsd4_decode_stateid(argp, &write->wr_stateid);
+ if (status)
+ return status;
+ READ_BUF(16);
+ p = xdr_decode_hyper(p, &write->wr_offset);
+ write->wr_stable_how = be32_to_cpup(p++);
+ if (write->wr_stable_how > NFS_FILE_SYNC)
+ goto xdr_error;
+ write->wr_buflen = be32_to_cpup(p++);
+
+ status = svcxdr_construct_vector(argp, &write->wr_head,
+ &write->wr_pagelist, write->wr_buflen);
+ if (status)
+ return status;
+
+ DECODE_TAIL;
+}
+
+/*
+ * Decode RELEASE_LOCKOWNER arguments: a clientid followed by an opaque
+ * lock owner (32-bit length plus bytes).  Returns nfserr_notsupp when
+ * argp->minorversion >= 1.
+ */
+static __be32
+nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner)
+{
+ DECODE_HEAD;
+
+ if (argp->minorversion >= 1)
+ return nfserr_notsupp;
+
+ /* 12 bytes: clientid plus the owner length word */
+ READ_BUF(12);
+ COPYMEM(&rlockowner->rl_clientid, sizeof(clientid_t));
+ rlockowner->rl_owner.len = be32_to_cpup(p++);
+ READ_BUF(rlockowner->rl_owner.len);
+ READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
+
+ /*
+ * NOTE(review): minorversion >= 1 has already returned above, so this
+ * test looks unreachable if minorversion is unsigned -- confirm.
+ */
+ if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid))
+ return nfserr_inval;
+ DECODE_TAIL;
+}
+
+/*
+ * Decode EXCHANGE_ID arguments.
+ *
+ * Fields are read in this order: verifier, client name (opaque), flags,
+ * state protection selector (spa_how) with its arm, and finally the
+ * optional implementation id (an array of at most one entry made of
+ * domain, name and date).
+ *
+ * State protection arms:
+ *   SP4_NONE      - nothing further.
+ *   SP4_MACH_CRED - two bitmaps, kept in spo_must_enforce/spo_must_allow.
+ *   SP4_SSV       - parsed only to step over it; nothing is stored.
+ * Any other selector is an XDR error.
+ *
+ * Counts and lengths taken from the wire are held in u32.  With a signed
+ * type, a value of 2^31 or more became negative: "dummy * 4" could
+ * overflow, and an implementation-id count that large slipped past the
+ * "dummy > 1" check and was silently treated as an empty array.
+ */
+static __be32
+nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
+			 struct nfsd4_exchange_id *exid)
+{
+	u32 dummy, tmp;
+	DECODE_HEAD;
+
+	READ_BUF(NFS4_VERIFIER_SIZE);
+	COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
+
+	/* any failure decoding the client name is reported as bad XDR */
+	status = nfsd4_decode_opaque(argp, &exid->clname);
+	if (status)
+		return nfserr_bad_xdr;
+
+	READ_BUF(4);
+	exid->flags = be32_to_cpup(p++);
+
+	/* state_protect4_a: only the selector and the MACH_CRED bitmaps are kept */
+	READ_BUF(4);
+	exid->spa_how = be32_to_cpup(p++);
+	switch (exid->spa_how) {
+	case SP4_NONE:
+		break;
+	case SP4_MACH_CRED:
+		/* spo_must_enforce */
+		status = nfsd4_decode_bitmap(argp,
+					exid->spo_must_enforce);
+		if (status)
+			goto out;
+		/* spo_must_allow */
+		status = nfsd4_decode_bitmap(argp, exid->spo_must_allow);
+		if (status)
+			goto out;
+		break;
+	case SP4_SSV:
+		/* ssp_ops: two counted arrays of 32-bit words, both skipped */
+		READ_BUF(4);
+		dummy = be32_to_cpup(p++);
+		READ_BUF(dummy * 4);
+		p += dummy;
+
+		READ_BUF(4);
+		dummy = be32_to_cpup(p++);
+		READ_BUF(dummy * 4);
+		p += dummy;
+
+		/* ssp_hash_algs<>: counted array of opaques, skipped */
+		READ_BUF(4);
+		tmp = be32_to_cpup(p++);
+		while (tmp--) {
+			READ_BUF(4);
+			dummy = be32_to_cpup(p++);
+			READ_BUF(dummy);
+			p += XDR_QUADLEN(dummy);
+		}
+
+		/* ssp_encr_algs<>: counted array of opaques, skipped */
+		READ_BUF(4);
+		tmp = be32_to_cpup(p++);
+		while (tmp--) {
+			READ_BUF(4);
+			dummy = be32_to_cpup(p++);
+			READ_BUF(dummy);
+			p += XDR_QUADLEN(dummy);
+		}
+
+		/* ignore ssp_window and ssp_num_gss_handles: */
+		READ_BUF(8);
+		break;
+	default:
+		goto xdr_error;
+	}
+
+	READ_BUF(4);	/* nfs_impl_id4 array length */
+	dummy = be32_to_cpup(p++);
+
+	/* the implementation id array holds zero or one entry */
+	if (dummy > 1)
+		goto xdr_error;
+
+	if (dummy == 1) {
+		/* nii_domain */
+		status = nfsd4_decode_opaque(argp, &exid->nii_domain);
+		if (status)
+			goto xdr_error;
+
+		/* nii_name */
+		status = nfsd4_decode_opaque(argp, &exid->nii_name);
+		if (status)
+			goto xdr_error;
+
+		/* nii_date */
+		status = nfsd4_decode_time(argp, &exid->nii_time);
+		if (status)
+			goto xdr_error;
+	}
+	DECODE_TAIL;
+}
+
+/*
+ * Decode CREATE_SESSION arguments.
+ *
+ * Fields are read in this order: clientid, sequence id, flags, fore
+ * channel attributes, back channel attributes, callback program and the
+ * callback security parameters.
+ *
+ * Each channel attribute block is seven words: header pad size (skipped),
+ * max request size, max response size, max cached response size, max ops,
+ * max requests, and the length of the RDMA attribute array.  That array
+ * may hold at most one entry; more is an XDR error.
+ *
+ * A failure while decoding the callback security parameters is returned
+ * to the caller instead of being discarded.
+ */
+static __be32
+nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
+			    struct nfsd4_create_session *sess)
+{
+	DECODE_HEAD;
+
+	READ_BUF(16);
+	COPYMEM(&sess->clientid, 8);
+	sess->seqid = be32_to_cpup(p++);
+	sess->flags = be32_to_cpup(p++);
+
+	/* Fore channel attrs */
+	READ_BUF(28);
+	p++; /* headerpadsz is always 0 */
+	sess->fore_channel.maxreq_sz = be32_to_cpup(p++);
+	sess->fore_channel.maxresp_sz = be32_to_cpup(p++);
+	sess->fore_channel.maxresp_cached = be32_to_cpup(p++);
+	sess->fore_channel.maxops = be32_to_cpup(p++);
+	sess->fore_channel.maxreqs = be32_to_cpup(p++);
+	sess->fore_channel.nr_rdma_attrs = be32_to_cpup(p++);
+	if (sess->fore_channel.nr_rdma_attrs == 1) {
+		READ_BUF(4);
+		sess->fore_channel.rdma_attrs = be32_to_cpup(p++);
+	} else if (sess->fore_channel.nr_rdma_attrs > 1) {
+		dprintk("Too many fore channel attr bitmaps!\n");
+		goto xdr_error;
+	}
+
+	/* Back channel attrs */
+	READ_BUF(28);
+	p++; /* headerpadsz is always 0 */
+	sess->back_channel.maxreq_sz = be32_to_cpup(p++);
+	sess->back_channel.maxresp_sz = be32_to_cpup(p++);
+	sess->back_channel.maxresp_cached = be32_to_cpup(p++);
+	sess->back_channel.maxops = be32_to_cpup(p++);
+	sess->back_channel.maxreqs = be32_to_cpup(p++);
+	sess->back_channel.nr_rdma_attrs = be32_to_cpup(p++);
+	if (sess->back_channel.nr_rdma_attrs == 1) {
+		READ_BUF(4);
+		sess->back_channel.rdma_attrs = be32_to_cpup(p++);
+	} else if (sess->back_channel.nr_rdma_attrs > 1) {
+		dprintk("Too many back channel attr bitmaps!\n");
+		goto xdr_error;
+	}
+
+	READ_BUF(4);
+	sess->callback_prog = be32_to_cpup(p++);
+
+	/* Do not report success when the callback security block is bad. */
+	status = nfsd4_decode_cb_sec(argp, &sess->cb_sec);
+	if (status)
+		return status;
+	DECODE_TAIL;
+}
+
+/*
+ * Decode DESTROY_SESSION arguments: a session id of
+ * NFS4_MAX_SESSIONID_LEN bytes, copied into destroy_session->sessionid.
+ */
+static __be32
+nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
+ struct nfsd4_destroy_session *destroy_session)
+{
+ DECODE_HEAD;
+ READ_BUF(NFS4_MAX_SESSIONID_LEN);
+ COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+
+ DECODE_TAIL;
+}
+
+/*
+ * Decode FREE_STATEID arguments: one stateid, read as a 32-bit generation
+ * number followed by the opaque part, which is copied as raw bytes.
+ */
+static __be32
+nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp,
+ struct nfsd4_free_stateid *free_stateid)
+{
+ DECODE_HEAD;
+
+ READ_BUF(sizeof(stateid_t));
+ free_stateid->fr_stateid.si_generation = be32_to_cpup(p++);
+ COPYMEM(&free_stateid->fr_stateid.si_opaque, sizeof(stateid_opaque_t));
+
+ DECODE_TAIL;
+}
+
+/*
+ * Decode SEQUENCE arguments: session id (NFS4_MAX_SESSIONID_LEN bytes)
+ * followed by four 32-bit words: sequence id, slot id, highest slot
+ * (stored in maxslots) and the cache-this flag.
+ */
+static __be32
+nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
+ struct nfsd4_sequence *seq)
+{
+ DECODE_HEAD;
+
+ READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
+ COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+ seq->seqid = be32_to_cpup(p++);
+ seq->slotid = be32_to_cpup(p++);
+ seq->maxslots = be32_to_cpup(p++);
+ seq->cachethis = be32_to_cpup(p++);
+
+ DECODE_TAIL;
+}
+
+/*
+ * Decode TEST_STATEID arguments: a 32-bit count followed by that many
+ * stateids.
+ *
+ * Every stateid gets its own list entry obtained from svcxdr_tmpalloc()
+ * and is appended to test_stateid->ts_stateid_list in wire order.  The
+ * entry is linked before its stateid is decoded, so on a decode failure
+ * the list already contains the entry being worked on.
+ *
+ * Returns 0 on success, the error from nfsd4_decode_stateid(), the
+ * nfserrno() translation of -ENOMEM, or nfserr_bad_xdr.
+ */
+static __be32
+nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid)
+{
+	struct nfsd4_test_stateid_id *entry;
+	__be32 *p, status;
+	int idx;
+
+	READ_BUF(4);
+	test_stateid->ts_num_ids = be32_to_cpup(p++);
+
+	INIT_LIST_HEAD(&test_stateid->ts_stateid_list);
+
+	idx = 0;
+	while (idx < test_stateid->ts_num_ids) {
+		entry = svcxdr_tmpalloc(argp, sizeof(*entry));
+		if (entry == NULL) {
+			status = nfserrno(-ENOMEM);
+			goto out;
+		}
+
+		INIT_LIST_HEAD(&entry->ts_id_list);
+		list_add_tail(&entry->ts_id_list, &test_stateid->ts_stateid_list);
+
+		status = nfsd4_decode_stateid(argp, &entry->ts_id_stateid);
+		if (status)
+			goto out;
+
+		idx++;
+	}
+
+	status = 0;
+out:
+	return status;
+xdr_error:
+	dprintk("NFSD: xdr error (%s:%d)\n", __FILE__, __LINE__);
+	status = nfserr_bad_xdr;
+	goto out;
+}
+
+/*
+ * Decode DESTROY_CLIENTID arguments: an 8-byte clientid copied as raw
+ * bytes into dc->clientid.
+ */
+static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc)
+{
+ DECODE_HEAD;
+
+ READ_BUF(8);
+ COPYMEM(&dc->clientid, 8);
+
+ DECODE_TAIL;
+}
+
+/*
+ * Decode RECLAIM_COMPLETE arguments: a single 32-bit word stored in
+ * rc->rca_one_fs.
+ */
+static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc)
+{
+ DECODE_HEAD;
+
+ READ_BUF(4);
+ rc->rca_one_fs = be32_to_cpup(p++);
+
+ DECODE_TAIL;
+}
+
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * Decode GETDEVICEINFO arguments: device id, layout type, maxcount and a
+ * notification bitmap given as a word count followed by that many 32-bit
+ * words.
+ *
+ * Only the first bitmap word is kept (gd_notify_types).  A bitmap longer
+ * than 1000 words is an XDR error, and any non-zero word after the first
+ * yields nfserr_inval.
+ */
+static __be32
+nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
+ struct nfsd4_getdeviceinfo *gdev)
+{
+ DECODE_HEAD;
+ u32 num, i;
+
+ READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4);
+ COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid));
+ gdev->gd_layout_type = be32_to_cpup(p++);
+ gdev->gd_maxcount = be32_to_cpup(p++);
+ num = be32_to_cpup(p++);
+ if (num) {
+ /* bound the count before it is used to size a read */
+ if (num > 1000)
+ goto xdr_error;
+ READ_BUF(4 * num);
+ gdev->gd_notify_types = be32_to_cpup(p++);
+ for (i = 1; i < num; i++) {
+ if (be32_to_cpup(p++)) {
+ status = nfserr_inval;
+ goto out;
+ }
+ }
+ }
+ DECODE_TAIL;
+}
+
+/*
+ * Decode LAYOUTGET arguments.  Fields are read in this order: signal
+ * flag, layout type, iomode, 64-bit offset, 64-bit length, 64-bit minimum
+ * length, stateid and finally maxcount.
+ */
+static __be32
+nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
+ struct nfsd4_layoutget *lgp)
+{
+ DECODE_HEAD;
+
+ /* 36 bytes: three 32-bit words plus three 64-bit values */
+ READ_BUF(36);
+ lgp->lg_signal = be32_to_cpup(p++);
+ lgp->lg_layout_type = be32_to_cpup(p++);
+ lgp->lg_seg.iomode = be32_to_cpup(p++);
+ p = xdr_decode_hyper(p, &lgp->lg_seg.offset);
+ p = xdr_decode_hyper(p, &lgp->lg_seg.length);
+ p = xdr_decode_hyper(p, &lgp->lg_minlength);
+
+ status = nfsd4_decode_stateid(argp, &lgp->lg_sid);
+ if (status)
+ return status;
+
+ READ_BUF(4);
+ lgp->lg_maxcount = be32_to_cpup(p++);
+
+ DECODE_TAIL;
+}
+
+/*
+ * Decode LAYOUTCOMMIT arguments.  Fields are read in this order: 64-bit
+ * offset, 64-bit length, reclaim flag, stateid, optional last-write
+ * offset, optional modification time, layout type and the opaque layout
+ * update.
+ *
+ * When no last-write offset is supplied lc_last_wr is set to 0; when no
+ * time is supplied lc_mtime.tv_nsec is set to UTIME_NOW.
+ */
+static __be32
+nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
+ struct nfsd4_layoutcommit *lcp)
+{
+ DECODE_HEAD;
+ u32 timechange;
+
+ READ_BUF(20);
+ p = xdr_decode_hyper(p, &lcp->lc_seg.offset);
+ p = xdr_decode_hyper(p, &lcp->lc_seg.length);
+ lcp->lc_reclaim = be32_to_cpup(p++);
+
+ status = nfsd4_decode_stateid(argp, &lcp->lc_sid);
+ if (status)
+ return status;
+
+ /* lc_newoffset says whether a last-write offset follows */
+ READ_BUF(4);
+ lcp->lc_newoffset = be32_to_cpup(p++);
+ if (lcp->lc_newoffset) {
+ READ_BUF(8);
+ p = xdr_decode_hyper(p, &lcp->lc_last_wr);
+ } else
+ lcp->lc_last_wr = 0;
+ /* timechange says whether a modification time follows */
+ READ_BUF(4);
+ timechange = be32_to_cpup(p++);
+ if (timechange) {
+ status = nfsd4_decode_time(argp, &lcp->lc_mtime);
+ if (status)
+ return status;
+ } else {
+ lcp->lc_mtime.tv_nsec = UTIME_NOW;
+ }
+ READ_BUF(8);
+ lcp->lc_layout_type = be32_to_cpup(p++);
+
+ /*
+ * Save the layout update in XDR format and let the layout driver deal
+ * with it later.
+ */
+ lcp->lc_up_len = be32_to_cpup(p++);
+ if (lcp->lc_up_len > 0) {
+ READ_BUF(lcp->lc_up_len);
+ READMEM(lcp->lc_up_layout, lcp->lc_up_len);
+ }
+
+ DECODE_TAIL;
+}
+
+/*
+ * Decode LAYOUTRETURN arguments: reclaim flag, layout type, iomode and
+ * return type.
+ *
+ * For RETURN_FILE the segment offset and length, the stateid and an
+ * opaque body (length plus bytes) follow.  For every other return type
+ * the segment is set to cover everything: offset 0, length
+ * NFS4_MAX_UINT64.
+ */
+static __be32
+nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
+ struct nfsd4_layoutreturn *lrp)
+{
+ DECODE_HEAD;
+
+ READ_BUF(16);
+ lrp->lr_reclaim = be32_to_cpup(p++);
+ lrp->lr_layout_type = be32_to_cpup(p++);
+ lrp->lr_seg.iomode = be32_to_cpup(p++);
+ lrp->lr_return_type = be32_to_cpup(p++);
+ if (lrp->lr_return_type == RETURN_FILE) {
+ READ_BUF(16);
+ p = xdr_decode_hyper(p, &lrp->lr_seg.offset);
+ p = xdr_decode_hyper(p, &lrp->lr_seg.length);
+
+ status = nfsd4_decode_stateid(argp, &lrp->lr_sid);
+ if (status)
+ return status;
+
+ READ_BUF(4);
+ lrp->lrf_body_len = be32_to_cpup(p++);
+ if (lrp->lrf_body_len > 0) {
+ READ_BUF(lrp->lrf_body_len);
+ READMEM(lrp->lrf_body, lrp->lrf_body_len);
+ }
+ } else {
+ lrp->lr_seg.offset = 0;
+ lrp->lr_seg.length = NFS4_MAX_UINT64;
+ }
+
+ DECODE_TAIL;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
+/*
+ * Decode the arguments shared by ALLOCATE and DEALLOCATE: a stateid
+ * followed by a 64-bit offset and a 64-bit length.
+ */
+static __be32
+nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
+ struct nfsd4_fallocate *fallocate)
+{
+ DECODE_HEAD;
+
+ status = nfsd4_decode_stateid(argp, &fallocate->falloc_stateid);
+ if (status)
+ return status;
+
+ READ_BUF(16);
+ p = xdr_decode_hyper(p, &fallocate->falloc_offset);
+ xdr_decode_hyper(p, &fallocate->falloc_length);
+
+ DECODE_TAIL;
+}
+
+/*
+ * Decode CLONE arguments: source stateid, destination stateid, then three
+ * 64-bit values: source position, destination position and byte count.
+ */
+static __be32
+nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone)
+{
+ DECODE_HEAD;
+
+ status = nfsd4_decode_stateid(argp, &clone->cl_src_stateid);
+ if (status)
+ return status;
+ status = nfsd4_decode_stateid(argp, &clone->cl_dst_stateid);
+ if (status)
+ return status;
+
+ READ_BUF(8 + 8 + 8);
+ p = xdr_decode_hyper(p, &clone->cl_src_pos);
+ p = xdr_decode_hyper(p, &clone->cl_dst_pos);
+ p = xdr_decode_hyper(p, &clone->cl_count);
+ DECODE_TAIL;
+}
+
+/*
+ * Decode one network location: a 32-bit type followed by a type-specific
+ * body.  Only NL4_NETADDR is accepted; every other type is an XDR error.
+ *
+ * An NL4_NETADDR body is a netid string and a universal address string,
+ * each a 32-bit length plus bytes.  Both lengths are checked against
+ * their maximum (RPCBIND_MAXNETIDLEN, RPCBIND_MAXUADDRLEN) before the
+ * bytes are copied into the fixed buffers inside ns->u.nl4_addr.
+ */
+static __be32 nfsd4_decode_nl4_server(struct nfsd4_compoundargs *argp,
+ struct nl4_server *ns)
+{
+ DECODE_HEAD;
+ struct nfs42_netaddr *naddr;
+
+ READ_BUF(4);
+ ns->nl4_type = be32_to_cpup(p++);
+
+ /* currently support for 1 inter-server source server */
+ switch (ns->nl4_type) {
+ case NL4_NETADDR:
+ naddr = &ns->u.nl4_addr;
+
+ READ_BUF(4);
+ naddr->netid_len = be32_to_cpup(p++);
+ if (naddr->netid_len > RPCBIND_MAXNETIDLEN)
+ goto xdr_error;
+
+ READ_BUF(naddr->netid_len + 4); /* 4 for uaddr len */
+ COPYMEM(naddr->netid, naddr->netid_len);
+
+ naddr->addr_len = be32_to_cpup(p++);
+ if (naddr->addr_len > RPCBIND_MAXUADDRLEN)
+ goto xdr_error;
+
+ READ_BUF(naddr->addr_len);
+ COPYMEM(naddr->addr, naddr->addr_len);
+ break;
+ default:
+ goto xdr_error;
+ }
+ DECODE_TAIL;
+}
+
+/*
+ * Decode COPY arguments.
+ *
+ * Fields are read in this order: source stateid, destination stateid,
+ * source position, destination position, byte count, the "consecutive"
+ * flag (skipped), the "synchronous" flag and the list of source servers.
+ *
+ * An empty server list marks an intra-server copy (cp_intra is set).
+ * Otherwise the first server is decoded into copy->cp_src and the
+ * remaining ones are decoded into a scratch buffer and dropped.
+ *
+ * The server count is held in u32.  With a signed type a count of 2^31 or
+ * more made "i < count - 1" false, so such a request was accepted as if
+ * it carried exactly one server.  The scratch buffer is only allocated
+ * when there is something to skip.
+ */
+static __be32
+nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
+{
+	DECODE_HEAD;
+	struct nl4_server *ns_dummy;
+	u32 i, count;
+
+	status = nfsd4_decode_stateid(argp, &copy->cp_src_stateid);
+	if (status)
+		return status;
+	status = nfsd4_decode_stateid(argp, &copy->cp_dst_stateid);
+	if (status)
+		return status;
+
+	READ_BUF(8 + 8 + 8 + 4 + 4 + 4);
+	p = xdr_decode_hyper(p, &copy->cp_src_pos);
+	p = xdr_decode_hyper(p, &copy->cp_dst_pos);
+	p = xdr_decode_hyper(p, &copy->cp_count);
+	p++; /* ca_consecutive: we always do consecutive copies */
+	copy->cp_synchronous = be32_to_cpup(p++);
+
+	count = be32_to_cpup(p++);
+
+	copy->cp_intra = false;
+	if (count == 0) { /* intra-server copy */
+		copy->cp_intra = true;
+		goto intra;
+	}
+
+	/* decode all the supplied server addresses but use first */
+	status = nfsd4_decode_nl4_server(argp, &copy->cp_src);
+	if (status)
+		return status;
+
+	if (count > 1) {
+		ns_dummy = kmalloc(sizeof(struct nl4_server), GFP_KERNEL);
+		if (ns_dummy == NULL)
+			return nfserrno(-ENOMEM);
+		/* entry 0 is already in cp_src; step over entries 1..count-1 */
+		for (i = 1; i < count; i++) {
+			status = nfsd4_decode_nl4_server(argp, ns_dummy);
+			if (status)
+				break;
+		}
+		kfree(ns_dummy);
+		if (status)
+			return status;
+	}
+intra:
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode the argument shared by OFFLOAD_STATUS and OFFLOAD_CANCEL: a
+ * single stateid, stored in os->stateid.
+ */
+static __be32
+nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp,
+			    struct nfsd4_offload_status *os)
+{
+	__be32 status;
+
+	status = nfsd4_decode_stateid(argp, &os->stateid);
+	return status;
+}
+
+/*
+ * Decode COPY_NOTIFY arguments: the source stateid followed by the
+ * destination server's network location.  The first decoding error is
+ * returned.
+ */
+static __be32
+nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp,
+			 struct nfsd4_copy_notify *cn)
+{
+	__be32 status = nfsd4_decode_stateid(argp, &cn->cpn_src_stateid);
+
+	if (!status)
+		status = nfsd4_decode_nl4_server(argp, &cn->cpn_dst);
+	return status;
+}
+
+/*
+ * Decode SEEK arguments: a stateid, a 64-bit offset and a 32-bit whence
+ * value.  The whence value is stored as received, without validation.
+ */
+static __be32
+nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
+{
+ DECODE_HEAD;
+
+ status = nfsd4_decode_stateid(argp, &seek->seek_stateid);
+ if (status)
+ return status;
+
+ READ_BUF(8 + 4);
+ p = xdr_decode_hyper(p, &seek->seek_offset);
+ seek->seek_whence = be32_to_cpup(p);
+
+ DECODE_TAIL;
+}
+
+/*
+ * XDR data that is more than PAGE_SIZE in size is normally part of a
+ * read or write. However, the size of extended attributes is limited
+ * by the maximum request size, and then further limited by the underlying
+ * filesystem limits. This can exceed PAGE_SIZE (currently, XATTR_SIZE_MAX
+ * is 64k). Since there is no kvec- or page-based interface to xattrs,
+ * and we're not dealing with contiguous pages, we need to do some copying.
+ */
+
+/*
+ * Produce a contiguous view of @buflen bytes described by @head and
+ * @pages (as constructed by svcxdr_construct_vector).
+ *
+ * If the head kvec alone holds at least @buflen bytes, *@bufp simply
+ * points at the head and nothing is copied.  Otherwise a buffer of
+ * @buflen bytes is obtained from svcxdr_tmpalloc() and filled with the
+ * head followed by the pages, up to PAGE_SIZE bytes from each.
+ *
+ * Returns 0 on success or nfserr_jukebox if the allocation fails.
+ */
+static __be32
+nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct kvec *head,
+		       struct page **pages, char **bufp, u32 buflen)
+{
+	char *buf, *cursor;
+	u32 remaining, chunk;
+
+	/* Fast path: the head already covers the request, hand it out as is. */
+	if (head->iov_len >= buflen) {
+		*bufp = head->iov_base;
+		return 0;
+	}
+
+	buf = svcxdr_tmpalloc(argp, buflen);
+	if (!buf)
+		return nfserr_jukebox;
+
+	/* The head goes first ... */
+	memcpy(buf, head->iov_base, head->iov_len);
+	cursor = buf + head->iov_len;
+	remaining = buflen - head->iov_len;
+
+	/* ... then one page at a time until the request is satisfied. */
+	for (; remaining > 0; remaining -= chunk, cursor += chunk, pages++) {
+		chunk = min_t(u32, remaining, PAGE_SIZE);
+		memcpy(cursor, page_address(*pages), chunk);
+	}
+
+	*bufp = buf;
+	return 0;
+}
+
+/*
+ * Get a user extended attribute name from the XDR buffer.
+ * It will not have the "user." prefix, so prepend it.
+ * Lastly, check for nul characters in the name.
+ *
+ * On success *namep points at a nul-terminated string obtained from
+ * svcxdr_tmpalloc() holding XATTR_USER_PREFIX followed by the name.
+ *
+ * Returns nfserr_nametoolong if the name would not fit in XATTR_NAME_MAX
+ * once prefixed, nfserr_jukebox if the allocation fails, and takes the
+ * xdr_error path for an empty name or a name containing a nul byte.
+ */
+static __be32
+nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep)
+{
+ DECODE_HEAD;
+ char *name, *sp, *dp;
+ u32 namelen, cnt;
+
+ READ_BUF(4);
+ namelen = be32_to_cpup(p++);
+
+ if (namelen > (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN))
+ return nfserr_nametoolong;
+
+ if (namelen == 0)
+ goto xdr_error;
+
+ READ_BUF(namelen);
+
+ /* room for the prefix, the name and the terminating nul */
+ name = svcxdr_tmpalloc(argp, namelen + XATTR_USER_PREFIX_LEN + 1);
+ if (!name)
+ return nfserr_jukebox;
+
+ memcpy(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+
+ /*
+ * Copy the extended attribute name over while checking for 0
+ * characters.
+ */
+ sp = (char *)p;
+ dp = name + XATTR_USER_PREFIX_LEN;
+ cnt = namelen;
+
+ while (cnt-- > 0) {
+ if (*sp == '\0')
+ goto xdr_error;
+ *dp++ = *sp++;
+ }
+ *dp = '\0';
+
+ *namep = name;
+
+ DECODE_TAIL;
+}
+
+/*
+ * Decode GETXATTR arguments.
+ *
+ * The request carries only the attribute name, with no length.  The upper
+ * bound for the reply, getxa_len, is therefore computed here as the
+ * smaller of XATTR_SIZE_MAX and the maximum payload of this request.
+ * nfsd_getxattr will probe the length of the xattr, check it against
+ * getxa_len, and allocate + return the value.
+ */
+static __be32
+nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp,
+		      struct nfsd4_getxattr *getxattr)
+{
+	__be32 status = nfsd4_decode_xattr_name(argp, &getxattr->getxa_name);
+	u32 limit;
+
+	if (status)
+		return status;
+
+	limit = svc_max_payload(argp->rqstp);
+	getxattr->getxa_len = min_t(u32, XATTR_SIZE_MAX, limit);
+
+	return status;
+}
+
+/*
+ * Decode SETXATTR arguments: flags, attribute name, then the value as a
+ * 32-bit length followed by the data.
+ *
+ * Flags above SETXATTR4_REPLACE yield nfserr_inval.  A value longer than
+ * the smaller of XATTR_SIZE_MAX and the request's maximum payload yields
+ * nfserr_xattr2big.  For a non-empty value, setxa_buf is set up by
+ * nfsd4_vbuf_from_vector(); for an empty value it is left untouched.
+ *
+ * Every helper failure is returned to the caller, including the one from
+ * nfsd4_vbuf_from_vector().
+ */
+static __be32
+nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp,
+		      struct nfsd4_setxattr *setxattr)
+{
+	DECODE_HEAD;
+	u32 flags, maxcount, size;
+	struct kvec head;
+	struct page **pagelist;
+
+	READ_BUF(4);
+	flags = be32_to_cpup(p++);
+
+	if (flags > SETXATTR4_REPLACE)
+		return nfserr_inval;
+	setxattr->setxa_flags = flags;
+
+	status = nfsd4_decode_xattr_name(argp, &setxattr->setxa_name);
+	if (status)
+		return status;
+
+	maxcount = svc_max_payload(argp->rqstp);
+	maxcount = min_t(u32, XATTR_SIZE_MAX, maxcount);
+
+	READ_BUF(4);
+	size = be32_to_cpup(p++);
+	if (size > maxcount)
+		return nfserr_xattr2big;
+
+	setxattr->setxa_len = size;
+	if (size > 0) {
+		status = svcxdr_construct_vector(argp, &head, &pagelist, size);
+		if (status)
+			return status;
+
+		status = nfsd4_vbuf_from_vector(argp, &head, pagelist,
+						&setxattr->setxa_buf, size);
+		/*
+		 * Return the failure here rather than relying on the common
+		 * tail to preserve it; otherwise an allocation failure could
+		 * be reported as success with setxa_buf never filled in.
+		 */
+		if (status)
+			return status;
+	}
+
+	DECODE_TAIL;
+}
+
+/*
+ * Decode LISTXATTRS arguments: a 64-bit cookie followed by a 32-bit
+ * maxcount.
+ *
+ * Returns nfserr_badcookie for a cookie that is too large (see below) and
+ * nfserr_inval for a maxcount below 8.  The stored maxcount is capped at
+ * the request's maximum payload.
+ */
+static __be32
+nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp,
+ struct nfsd4_listxattrs *listxattrs)
+{
+ DECODE_HEAD;
+ u32 maxcount;
+
+ READ_BUF(12);
+ p = xdr_decode_hyper(p, &listxattrs->lsxa_cookie);
+
+ /*
+ * If the cookie is too large to have even one user.x attribute
+ * plus trailing '\0' left in a maximum size buffer, it's invalid.
+ */
+ if (listxattrs->lsxa_cookie >=
+ (XATTR_LIST_MAX / (XATTR_USER_PREFIX_LEN + 2)))
+ return nfserr_badcookie;
+
+ maxcount = be32_to_cpup(p++);
+ if (maxcount < 8)
+ /* Always need at least 2 words (length and one character) */
+ return nfserr_inval;
+
+ maxcount = min(maxcount, svc_max_payload(argp->rqstp));
+ listxattrs->lsxa_maxcount = maxcount;
+
+ DECODE_TAIL;
+}
+
+/*
+ * Decode REMOVEXATTR arguments: just the attribute name, decoded (and
+ * prefixed) by nfsd4_decode_xattr_name() into removexattr->rmxa_name.
+ */
+static __be32
+nfsd4_decode_removexattr(struct nfsd4_compoundargs *argp,
+ struct nfsd4_removexattr *removexattr)
+{
+ return nfsd4_decode_xattr_name(argp, &removexattr->rmxa_name);
+}
+
+/*
+ * Decoder that reads nothing from the request and always succeeds.  Both
+ * arguments are unused.
+ */
+static __be32
+nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
+{
+ return nfs_ok;
+}
+
+/*
+ * Decoder that reads nothing from the request and always fails with
+ * nfserr_notsupp.  Both arguments are unused.
+ */
+static __be32
+nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
+{
+ return nfserr_notsupp;
+}
+
+/*
+ * Common signature for all argument decoders: the second parameter is the
+ * operation-specific argument structure, passed as an untyped pointer.
+ */
+typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
+
+/*
+ * Decoder dispatch table, indexed by operation number.  Each entry is
+ * cast to the common nfsd4_dec signature.  Operations that take no
+ * arguments use nfsd4_decode_noop; operations that are not implemented
+ * use nfsd4_decode_notsupp.
+ */
+static const nfsd4_dec nfsd4_dec_ops[] = {
+ [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access,
+ [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close,
+ [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit,
+ [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create,
+ [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn,
+ [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr,
+ [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop,
+ [OP_LINK] = (nfsd4_dec)nfsd4_decode_link,
+ [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock,
+ [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt,
+ [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku,
+ [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup,
+ [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop,
+ [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify,
+ [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open,
+ [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm,
+ [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
+ [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
+ [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_putpubfh,
+ [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
+ [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
+ [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
+ [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop,
+ [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove,
+ [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename,
+ [OP_RENEW] = (nfsd4_dec)nfsd4_decode_renew,
+ [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop,
+ [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop,
+ [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo,
+ [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr,
+ [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_setclientid,
+ [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm,
+ [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify,
+ [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write,
+ [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner,
+
+ /* new operations for NFSv4.1 */
+ [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl,
+ [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
+ [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
+ [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
+ [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
+ [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid,
+ [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
+#ifdef CONFIG_NFSD_PNFS
+ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo,
+ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit,
+ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget,
+ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn,
+#else
+ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
+#endif
+ [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
+ [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
+ [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid,
+ [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid,
+ [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
+
+ /* new operations for NFSv4.2 */
+ [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate,
+ [OP_COPY] = (nfsd4_dec)nfsd4_decode_copy,
+ [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_copy_notify,
+ [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate,
+ [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_offload_status,
+ [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status,
+ [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_read,
+ [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
+ [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone,
+ /* RFC 8276 extended attributes operations */
+ [OP_GETXATTR] = (nfsd4_dec)nfsd4_decode_getxattr,
+ [OP_SETXATTR] = (nfsd4_dec)nfsd4_decode_setxattr,
+ [OP_LISTXATTRS] = (nfsd4_dec)nfsd4_decode_listxattrs,
+ [OP_REMOVEXATTR] = (nfsd4_dec)nfsd4_decode_removexattr,
+};
+
+/*
+ * Report whether op->opnum is a valid operation number for the minor
+ * version of this compound.
+ *
+ * Anything below FIRST_NFS4_OP is invalid.  For minor versions 0, 1 and 2
+ * the upper bound is LAST_NFS40_OP, LAST_NFS41_OP and LAST_NFS42_OP
+ * respectively; for any other minor version no upper bound is applied
+ * here.
+ */
+static inline bool
+nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op)
+{
+	if (op->opnum < FIRST_NFS4_OP)
+		return false;
+
+	switch (argp->minorversion) {
+	case 0:
+		return op->opnum <= LAST_NFS40_OP;
+	case 1:
+		return op->opnum <= LAST_NFS41_OP;
+	case 2:
+		return op->opnum <= LAST_NFS42_OP;
+	default:
+		return true;
+	}
+}
+
+/*
+ * Decode a whole COMPOUND request: the header (tag, minor version,
+ * operation count) followed by each operation, dispatched through
+ * nfsd4_dec_ops[].
+ *
+ * While decoding, an estimate of the reply size is accumulated.  It is
+ * handed to svc_reserve() and also decides whether RQ_SPLICE_OK is
+ * cleared.  rq_cachetype is chosen from whether any operation asked to be
+ * cached.
+ *
+ * Decoding stops at the first operation whose decoder reports an error;
+ * argp->opcnt is trimmed so that this operation is the last one.
+ */
+static __be32
+nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
+{
+ DECODE_HEAD;
+ struct nfsd4_op *op;
+ bool cachethis = false;
+ int auth_slack= argp->rqstp->rq_auth_slack;
+ int max_reply = auth_slack + 8; /* opcnt, status */
+ int readcount = 0;
+ int readbytes = 0;
+ int i;
+
+ READ_BUF(4);
+ argp->taglen = be32_to_cpup(p++);
+ READ_BUF(argp->taglen);
+ SAVEMEM(argp->tag, argp->taglen);
+ READ_BUF(8);
+ argp->minorversion = be32_to_cpup(p++);
+ argp->opcnt = be32_to_cpup(p++);
+ /* the tag comes back in the reply: length word plus padded bytes */
+ max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2);
+
+ if (argp->taglen > NFSD4_MAX_TAGLEN)
+ goto xdr_error;
+ /*
+ * NFS4ERR_RESOURCE is a more helpful error than GARBAGE_ARGS
+ * here, so we return success at the xdr level so that
+ * nfsd4_proc can handle this is an NFS-level error.
+ */
+ if (argp->opcnt > NFSD_MAX_OPS_PER_COMPOUND)
+ return 0;
+
+ /* more operations than the inline iops array holds: allocate */
+ if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
+ argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
+ if (!argp->ops) {
+ argp->ops = argp->iops;
+ dprintk("nfsd: couldn't allocate room for COMPOUND\n");
+ goto xdr_error;
+ }
+ }
+
+ /* unsupported minor version: decode no operations at all */
+ if (argp->minorversion > NFSD_SUPPORTED_MINOR_VERSION)
+ argp->opcnt = 0;
+
+ for (i = 0; i < argp->opcnt; i++) {
+ op = &argp->ops[i];
+ op->replay = NULL;
+
+ READ_BUF(4);
+ op->opnum = be32_to_cpup(p++);
+
+ /* out-of-range operation numbers become OP_ILLEGAL */
+ if (nfsd4_opnum_in_range(argp, op))
+ op->status = nfsd4_dec_ops[op->opnum](argp, &op->u);
+ else {
+ op->opnum = OP_ILLEGAL;
+ op->status = nfserr_op_illegal;
+ }
+ op->opdesc = OPDESC(op);
+ /*
+ * We'll try to cache the result in the DRC if any one
+ * op in the compound wants to be cached:
+ */
+ cachethis |= nfsd4_cache_this_op(op);
+
+ /* read replies are accounted separately from everything else */
+ if (op->opnum == OP_READ || op->opnum == OP_READ_PLUS) {
+ readcount++;
+ readbytes += nfsd4_max_reply(argp->rqstp, op);
+ } else
+ max_reply += nfsd4_max_reply(argp->rqstp, op);
+ /*
+ * OP_LOCK and OP_LOCKT may return a conflicting lock.
+ * (Special case because it will just skip encoding this
+ * if it runs out of xdr buffer space, and it is the only
+ * operation that behaves this way.)
+ */
+ if (op->opnum == OP_LOCK || op->opnum == OP_LOCKT)
+ max_reply += NFS4_OPAQUE_LIMIT;
+
+ /* stop at the first failed operation and make it the last one */
+ if (op->status) {
+ argp->opcnt = i+1;
+ break;
+ }
+ }
+ /* Sessions make the DRC unnecessary: */
+ if (argp->minorversion)
+ cachethis = false;
+ svc_reserve(argp->rqstp, max_reply + readbytes);
+ argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE;
+
+ /*
+ * Splicing is turned off when there is more than one read operation
+ * or when the non-read part of the reply may not fit in one page.
+ */
+ if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack)
+ clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
+
+ DECODE_TAIL;
+}
+
+/*
+ * Encode the 64-bit change attribute at @p and return the position just
+ * past it.  The caller must already have reserved the two words.
+ *
+ * Three sources are used, in this order of preference:
+ *  - for an NFSEXP_V4ROOT export, the export cache flush time (as wall
+ *    clock) in the first word and zero in the second;
+ *  - for an inode with IS_I_VERSION set, nfsd4_change_attribute();
+ *  - otherwise the ctime seconds and nanoseconds.
+ */
+static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
+			     struct svc_export *exp)
+{
+	if (exp->ex_flags & NFSEXP_V4ROOT) {
+		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
+		*p++ = 0;
+		return p;
+	}
+
+	if (IS_I_VERSION(inode))
+		return xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode));
+
+	*p++ = cpu_to_be32(stat->ctime.tv_sec);
+	*p++ = cpu_to_be32(stat->ctime.tv_nsec);
+	return p;
+}
+
+/*
+ * Encode the time_delta attribute: 64-bit seconds followed by 32-bit
+ * nanoseconds.  Returns the position just past the encoded value.
+ *
+ * ctime (in NFSv4, time_metadata) is not writeable, and the client
+ * doesn't really care what resolution could theoretically be stored by
+ * the filesystem.
+ *
+ * The client cares how close together changes can be while still
+ * guaranteeing ctime changes.  For most filesystems (which have
+ * timestamps with nanosecond fields) that is limited by the resolution
+ * of the time returned from current_time() (which I'm assuming to be
+ * 1/HZ).  The value reported is therefore the larger of one tick and the
+ * superblock's s_time_gran.
+ */
+static __be32 *encode_time_delta(__be32 *p, struct inode *inode)
+{
+	u32 granularity = max_t(u32, NSEC_PER_SEC/HZ, inode->i_sb->s_time_gran);
+	struct timespec64 delta = ns_to_timespec64(granularity);
+
+	p = xdr_encode_hyper(p, delta.tv_sec);
+	*p = cpu_to_be32(delta.tv_nsec);
+
+	return p + 1;
+}
+
+/*
+ * Encode a change_info: the atomic flag followed by the "before" and
+ * "after" change values, each 64 bits wide.  Returns the position just
+ * past the encoded data.
+ *
+ * When c->change_supported is set, the stored change attributes are
+ * used; otherwise each value is built from ctime seconds and nanoseconds.
+ */
+static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c)
+{
+	*p++ = cpu_to_be32(c->atomic);
+
+	if (!c->change_supported) {
+		*p++ = cpu_to_be32(c->before_ctime_sec);
+		*p++ = cpu_to_be32(c->before_ctime_nsec);
+		*p++ = cpu_to_be32(c->after_ctime_sec);
+		*p++ = cpu_to_be32(c->after_ctime_nsec);
+		return p;
+	}
+
+	p = xdr_encode_hyper(p, c->before_change);
+	return xdr_encode_hyper(p, c->after_change);
+}
+
+/*
+ * Encode the string @components as an XDR array of strings.
+ *
+ * Components are separated by @sep.  A component that starts with
+ * @esc_enter and is closed by @esc_exit immediately before a separator or
+ * the end of the string is emitted without the two escape characters, and
+ * separators inside it are not treated as such.  Empty components are
+ * dropped.
+ *
+ * The element count is not known up front: one word is reserved for it
+ * first and filled in once all components have been written.
+ *
+ * Returns 0 on success or nfserr_resource if the stream runs out of
+ * space.
+ */
+static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep,
+ char *components, char esc_enter,
+ char esc_exit)
+{
+ __be32 *p;
+ __be32 pathlen;
+ int pathlen_offset;
+ int strlen, count=0;
+ char *str, *end, *next;
+
+ dprintk("nfsd4_encode_components(%s)\n", components);
+
+ /* remember where the count word lives so it can be patched later */
+ pathlen_offset = xdr->buf->len;
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ p++; /* We will fill this in with @count later */
+
+ end = str = components;
+ while (*end) {
+ bool found_esc = false;
+
+ /* try to parse as esc_enter, ..., esc_exit, sep */
+ if (*str == esc_enter) {
+ for (; *end && (*end != esc_exit); end++)
+ /* find esc_exit or end of string */;
+ next = end + 1;
+ if (*end && (!*next || *next == sep)) {
+ str++;
+ found_esc = true;
+ }
+ }
+
+ /* not a well-formed escape: continue scanning for a plain separator */
+ if (!found_esc)
+ for (; *end && (*end != sep); end++)
+ /* find sep or end of string */;
+
+ /* note: this local shadows the library function of the same name */
+ strlen = end - str;
+ if (strlen) {
+ p = xdr_reserve_space(xdr, strlen + 4);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque(p, str, strlen);
+ count++;
+ }
+ else
+ end++;
+ /* step past the closing escape character */
+ if (found_esc)
+ end = next;
+
+ str = end;
+ }
+ pathlen = htonl(count);
+ write_bytes_to_xdr_buf(xdr->buf, pathlen_offset, &pathlen, 4);
+ return 0;
+}
+
+/*
+ * Encode the string @components as an XDR array of strings, split at
+ * every occurrence of @sep.  This is the escaping variant called with 0
+ * for both escape characters.
+ */
+static __be32 nfsd4_encode_components(struct xdr_stream *xdr, char sep,
+				      char *components)
+{
+	const char no_esc = 0;
+
+	return nfsd4_encode_components_esc(xdr, sep, components,
+					   no_esc, no_esc);
+}
+
+/*
+ * Encode one location element of a fs_locations structure: the host list
+ * followed by the path.
+ *
+ * Hosts are separated by ':' and may be wrapped in '[' ... ']' so that a
+ * host containing ':' is kept in one piece.  The path is split at '/'.
+ * The first encoding error is returned.
+ */
+static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr,
+					struct nfsd4_fs_location *location)
+{
+	__be32 err;
+
+	err = nfsd4_encode_components_esc(xdr, ':', location->hosts, '[', ']');
+	if (!err)
+		err = nfsd4_encode_components(xdr, '/', location->path);
+	return err;
+}
+
+/*
+ * Encode a path in RFC3530 'pathname4' format
+ *
+ * The path is expressed relative to @root: the code walks from @path up
+ * towards @root, collecting one dentry per component, then emits the
+ * component count followed by the names from the outermost component
+ * down to the innermost.
+ *
+ * Returns 0 on success, nfserr_jukebox if the walk fails (allocation
+ * failure, or the top of the mount tree is reached without meeting
+ * @root), and nfserr_resource if the stream runs out of space.
+ */
+static __be32 nfsd4_encode_path(struct xdr_stream *xdr,
+ const struct path *root,
+ const struct path *path)
+{
+ struct path cur = *path;
+ __be32 *p;
+ struct dentry **components = NULL;
+ unsigned int ncomponents = 0;
+ __be32 err = nfserr_jukebox;
+
+ dprintk("nfsd4_encode_components(");
+
+ path_get(&cur);
+ /* First walk the path up to the nfsd root, and store the
+ * dentries/path components in an array.
+ */
+ for (;;) {
+ if (path_equal(&cur, root))
+ break;
+ /* at the root of a mount: cross to the parent mount or give up */
+ if (cur.dentry == cur.mnt->mnt_root) {
+ if (follow_up(&cur))
+ continue;
+ goto out_free;
+ }
+ /* grow the array in steps of 16 entries */
+ if ((ncomponents & 15) == 0) {
+ struct dentry **new;
+ new = krealloc(components,
+ sizeof(*new) * (ncomponents + 16),
+ GFP_KERNEL);
+ if (!new)
+ goto out_free;
+ components = new;
+ }
+ /* every stored dentry is released with dput() further down */
+ components[ncomponents++] = cur.dentry;
+ cur.dentry = dget_parent(cur.dentry);
+ }
+ err = nfserr_resource;
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_free;
+ *p++ = cpu_to_be32(ncomponents);
+
+ /* the array was filled leaf first, so emit it back to front */
+ while (ncomponents) {
+ struct dentry *dentry = components[ncomponents - 1];
+ unsigned int len;
+
+ /* d_lock is held while the name is read and copied */
+ spin_lock(&dentry->d_lock);
+ len = dentry->d_name.len;
+ p = xdr_reserve_space(xdr, len + 4);
+ if (!p) {
+ spin_unlock(&dentry->d_lock);
+ goto out_free;
+ }
+ p = xdr_encode_opaque(p, dentry->d_name.name, len);
+ dprintk("/%pd", dentry);
+ spin_unlock(&dentry->d_lock);
+ dput(dentry);
+ ncomponents--;
+ }
+
+ err = 0;
+out_free:
+ dprintk(")\n");
+ /* release whatever was collected but not yet emitted */
+ while (ncomponents)
+ dput(components[--ncomponents]);
+ kfree(components);
+ path_put(&cur);
+ return err;
+}
+
+/*
+ * Encode @path as a pathname relative to the export returned by
+ * rqst_find_fsidzero_export() for this request.
+ *
+ * Returns nfserrno() of the lookup error when no such export can be
+ * found, otherwise the result of nfsd4_encode_path().  The export
+ * reference obtained here is dropped again before returning.
+ */
+static __be32 nfsd4_encode_fsloc_fsroot(struct xdr_stream *xdr,
+			struct svc_rqst *rqstp, const struct path *path)
+{
+	struct svc_export *fsroot_exp = rqst_find_fsidzero_export(rqstp);
+	__be32 status;
+
+	if (IS_ERR(fsroot_exp))
+		return nfserrno(PTR_ERR(fsroot_exp));
+
+	status = nfsd4_encode_path(xdr, &fsroot_exp->ex_path, path);
+	exp_put(fsroot_exp);
+	return status;
+}
+
+/*
+ * Encode the fs_locations attribute of @exp.
+ *
+ * Layout written to @xdr: the export's own path (see
+ * nfsd4_encode_fsloc_fsroot()), a 32-bit count of locations, then each
+ * location via nfsd4_encode_fs_location4().
+ *
+ * Returns 0 on success, nfserr_resource if the count cannot be
+ * reserved, or the first error reported by a helper.
+ */
+static __be32 nfsd4_encode_fs_locations(struct xdr_stream *xdr,
+			struct svc_rqst *rqstp, struct svc_export *exp)
+{
+	struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
+	__be32 *countp;
+	__be32 err;
+	int idx;
+
+	err = nfsd4_encode_fsloc_fsroot(xdr, rqstp, &exp->ex_path);
+	if (err)
+		return err;
+
+	countp = xdr_reserve_space(xdr, 4);
+	if (!countp)
+		return nfserr_resource;
+	*countp = cpu_to_be32(fslocs->locations_count);
+
+	for (idx = 0; idx < fslocs->locations_count; idx++) {
+		err = nfsd4_encode_fs_location4(xdr, &fslocs->locations[idx]);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/*
+ * Map the file-type bits of a Unix mode to the matching NF4* constant.
+ * Any type not listed here yields NF4BAD.
+ */
+static u32 nfs4_file_type(umode_t mode)
+{
+	const umode_t fmt = mode & S_IFMT;
+
+	if (fmt == S_IFIFO)
+		return NF4FIFO;
+	if (fmt == S_IFCHR)
+		return NF4CHR;
+	if (fmt == S_IFDIR)
+		return NF4DIR;
+	if (fmt == S_IFBLK)
+		return NF4BLK;
+	if (fmt == S_IFLNK)
+		return NF4LNK;
+	if (fmt == S_IFREG)
+		return NF4REG;
+	if (fmt == S_IFSOCK)
+		return NF4SOCK;
+	return NF4BAD;
+}
+
+/*
+ * Encode the "who" field of one ACE.
+ *
+ * Anything other than a named principal is written by
+ * nfs4_acl_write_who() from the whotype alone; a named principal is
+ * encoded as a group or as a user depending on the
+ * NFS4_ACE_IDENTIFIER_GROUP flag.
+ */
+static inline __be32
+nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+		     struct nfs4_ace *ace)
+{
+	if (ace->whotype != NFS4_ACL_WHO_NAMED)
+		return nfs4_acl_write_who(xdr, ace->whotype);
+
+	if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
+		return nfsd4_encode_group(xdr, rqstp, ace->who_gid);
+
+	return nfsd4_encode_user(xdr, rqstp, ace->who_uid);
+}
+
+/*
+ * Encode a list of layout types from the bitmask @layout_types.
+ *
+ * Writes a 32-bit count (the number of bits set in the mask) followed
+ * by one 32-bit word for each set bit in the range
+ * [LAYOUT_NFSV4_1_FILES, LAYOUT_TYPE_MAX), the word being the bit
+ * number.  Returns nfserr_resource if the space cannot be reserved.
+ */
+static inline __be32
+nfsd4_encode_layout_types(struct xdr_stream *xdr, u32 layout_types)
+{
+	unsigned long count = hweight_long(layout_types);
+	unsigned long type;
+	__be32 *wp;
+
+	wp = xdr_reserve_space(xdr, 4 + 4 * count);
+	if (!wp)
+		return nfserr_resource;
+
+	*wp++ = cpu_to_be32(count);
+
+	for (type = LAYOUT_NFSV4_1_FILES; type < LAYOUT_TYPE_MAX; ++type) {
+		if (layout_types & (1 << type))
+			*wp++ = cpu_to_be32(type);
+	}
+
+	return 0;
+}
+/*
+ * Per-word masks of the attribute bits that fattr_handle_absent_fs()
+ * leaves set in a request; it ANDs each requested bitmap word with the
+ * matching mask, so every other bit is cleared.
+ */
+#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
+ FATTR4_WORD0_RDATTR_ERROR)
+#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID
+#define WORD2_ABSENT_FS_ATTRS 0
+
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+/*
+ * Encode a security label: two zero words (lfs and pi) followed by
+ * @context as a variable-length opaque of @len bytes.
+ *
+ * Returns nfserr_resource if the space cannot be reserved, else 0.
+ */
+static inline __be32
+nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+			    void *context, int len)
+{
+	__be32 *wp = xdr_reserve_space(xdr, len + 4 + 4 + 4);
+
+	if (!wp)
+		return nfserr_resource;
+
+	/*
+	 * Zero stands for the null translation for now; a call into
+	 * translation code may be placed here in the future.
+	 */
+	wp[0] = cpu_to_be32(0);	/* lfs */
+	wp[1] = cpu_to_be32(0);	/* pi */
+	xdr_encode_opaque(wp + 2, context, len);
+	return 0;
+}
+#else
+/* Security labels compiled out: nothing is written and 0 is returned. */
+static inline __be32
+nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+			    void *context, int len)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Restrict a requested attribute bitmap to the WORD*_ABSENT_FS_ATTRS
+ * sets.
+ *
+ * If word 0 or word 1 asks for anything outside those sets, the request
+ * is only tolerated when it also asks for RDATTR_ERROR or FS_LOCATIONS;
+ * in that case *@rdattr_err is set to NFSERR_MOVED, otherwise
+ * nfserr_moved is returned and the bitmaps are left untouched.  Word 2
+ * is masked but never examined for this test.
+ */
+static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *bmval2, u32 *rdattr_err)
+{
+	const bool wants_other = (*bmval0 & ~WORD0_ABSENT_FS_ATTRS) ||
+				 (*bmval1 & ~WORD1_ABSENT_FS_ATTRS);
+
+	/* As per referral draft: */
+	if (wants_other) {
+		if (!(*bmval0 & (FATTR4_WORD0_RDATTR_ERROR |
+				 FATTR4_WORD0_FS_LOCATIONS)))
+			return nfserr_moved;
+		*rdattr_err = NFSERR_MOVED;
+	}
+
+	*bmval0 &= WORD0_ABSENT_FS_ATTRS;
+	*bmval1 &= WORD1_ABSENT_FS_ATTRS;
+	*bmval2 &= WORD2_ABSENT_FS_ATTRS;
+	return 0;
+}
+
+
+/*
+ * Stat the location that @exp's root is mounted on.
+ *
+ * Starting from the export path, follow_up() is applied repeatedly for
+ * as long as it succeeds and the resulting dentry is still the root of
+ * its mount; the path reached that way is passed to vfs_getattr().
+ *
+ * Returns the vfs_getattr() result.
+ */
+static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
+{
+	struct path parent = exp->ex_path;
+	int rc;
+
+	path_get(&parent);
+	while (follow_up(&parent) &&
+	       parent.dentry == parent.mnt->mnt_root)
+		continue;
+
+	rc = vfs_getattr(&parent, stat, STATX_BASIC_STATS,
+			 AT_STATX_SYNC_AS_STAT);
+	path_put(&parent);
+	return rc;
+}
+
+/*
+ * Encode an attribute bitmap in its shortest form.
+ *
+ * Three words are sent when @bmval2 is non-zero, two when only @bmval1
+ * is, and one otherwise; the words are preceded by their count.
+ * Returns nfserr_resource if the space cannot be reserved.
+ */
+static __be32
+nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2)
+{
+	const u32 words[3] = { bmval0, bmval1, bmval2 };
+	unsigned int nwords;
+	unsigned int i;
+	__be32 *wp;
+
+	if (bmval2)
+		nwords = 3;
+	else if (bmval1)
+		nwords = 2;
+	else
+		nwords = 1;
+
+	/* one length word plus the bitmap words themselves */
+	wp = xdr_reserve_space(xdr, 4 + 4 * nwords);
+	if (!wp)
+		return nfserr_resource;
+
+	*wp++ = cpu_to_be32(nwords);
+	for (i = 0; i < nwords; i++)
+		*wp++ = cpu_to_be32(words[i]);
+
+	return 0;
+}
+
+/*
+ * Encode the attributes of @dentry (on export @exp) into @xdr: the
+ * bitmap of attributes actually returned, a 32-bit attribute length,
+ * then the attribute values.
+ *
+ * @bmval holds the three requested bitmap words.  Bits may be cleared
+ * locally before the bitmap is emitted (absent filesystem, ACLs or
+ * security labels not supported).  @ignore_crossmnt is only consulted
+ * for MOUNTED_ON_FILEID.
+ *
+ * Note: @fhp can be NULL; in this case, we might have to compose the filehandle
+ * ourselves.
+ *
+ * Returns nfs_ok on success.  On any failure the stream is truncated
+ * back to the length it had on entry before the status is returned.
+ */
+static __be32
+nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
+ struct svc_export *exp,
+ struct dentry *dentry, u32 *bmval,
+ struct svc_rqst *rqstp, int ignore_crossmnt)
+{
+ u32 bmval0 = bmval[0];
+ u32 bmval1 = bmval[1];
+ u32 bmval2 = bmval[2];
+ struct kstat stat;
+ struct svc_fh *tempfh = NULL;
+ struct kstatfs statfs;
+ __be32 *p;
+ int starting_len = xdr->buf->len; /* rollback point used at out: on failure */
+ int attrlen_offset;
+ __be32 attrlen;
+ u32 dummy;
+ u64 dummy64;
+ u32 rdattr_err = 0;
+ __be32 status;
+ int err;
+ struct nfs4_acl *acl = NULL;
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ void *context = NULL;
+ int contextlen;
+#endif
+ bool contextsupport = false;
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ u32 minorversion = resp->cstate.minorversion;
+ struct path path = {
+ .mnt = exp->ex_path.mnt,
+ .dentry = dentry,
+ };
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
+ BUG_ON(!nfsd_attrs_supported(minorversion, bmval));
+
+ if (exp->ex_fslocs.migrated) { /* may clear bits and set rdattr_err */
+ status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err);
+ if (status)
+ goto out;
+ }
+
+ err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
+ if (err)
+ goto out_nfserr;
+ if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
+ FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) ||
+ (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
+ FATTR4_WORD1_SPACE_TOTAL))) {
+ err = vfs_statfs(&path, &statfs); /* only fetched when one of the attrs above is wanted */
+ if (err)
+ goto out_nfserr;
+ }
+ if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) {
+ tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); /* released at out: */
+ status = nfserr_jukebox;
+ if (!tempfh)
+ goto out;
+ fh_init(tempfh, NFS4_FHSIZE);
+ status = fh_compose(tempfh, exp, dentry, NULL);
+ if (status)
+ goto out;
+ fhp = tempfh;
+ }
+ if (bmval0 & FATTR4_WORD0_ACL) {
+ err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
+ if (err == -EOPNOTSUPP)
+ bmval0 &= ~FATTR4_WORD0_ACL; /* not supported: drop the attr rather than fail */
+ else if (err == -EINVAL) {
+ status = nfserr_attrnotsupp;
+ goto out;
+ } else if (err != 0)
+ goto out_nfserr;
+ }
+
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) ||
+ bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
+ if (exp->ex_flags & NFSEXP_SECURITY_LABEL)
+ err = security_inode_getsecctx(d_inode(dentry),
+ &context, &contextlen);
+ else
+ err = -EOPNOTSUPP;
+ contextsupport = (err == 0); /* also feeds SUPPORTED_ATTRS below */
+ if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+ if (err == -EOPNOTSUPP)
+ bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL;
+ else if (err)
+ goto out_nfserr;
+ }
+ }
+#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
+
+ status = nfsd4_encode_bitmap(xdr, bmval0, bmval1, bmval2); /* bitmap with any bits cleared above */
+ if (status)
+ goto out;
+
+ attrlen_offset = xdr->buf->len; /* where the 4-byte attrlen will be written */
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ p++; /* to be backfilled later */
+
+ if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
+ u32 supp[3];
+
+ memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
+
+ if (!IS_POSIXACL(dentry->d_inode))
+ supp[0] &= ~FATTR4_WORD0_ACL;
+ if (!contextsupport)
+ supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ if (!supp[2]) { /* two-word form when word 2 is empty */
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(supp[0]);
+ *p++ = cpu_to_be32(supp[1]);
+ } else {
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(3);
+ *p++ = cpu_to_be32(supp[0]);
+ *p++ = cpu_to_be32(supp[1]);
+ *p++ = cpu_to_be32(supp[2]);
+ }
+ }
+ if (bmval0 & FATTR4_WORD0_TYPE) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ dummy = nfs4_file_type(stat.mode);
+ if (dummy == NF4BAD) {
+ status = nfserr_serverfault;
+ goto out;
+ }
+ *p++ = cpu_to_be32(dummy);
+ }
+ if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ if (exp->ex_flags & NFSEXP_NOSUBTREECHECK)
+ *p++ = cpu_to_be32(NFS4_FH_PERSISTENT);
+ else
+ *p++ = cpu_to_be32(NFS4_FH_PERSISTENT|
+ NFS4_FH_VOL_RENAME);
+ }
+ if (bmval0 & FATTR4_WORD0_CHANGE) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ p = encode_change(p, &stat, d_inode(dentry), exp);
+ }
+ if (bmval0 & FATTR4_WORD0_SIZE) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ p = xdr_encode_hyper(p, stat.size);
+ }
+ if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(1);
+ }
+ if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(1);
+ }
+ if (bmval0 & FATTR4_WORD0_NAMED_ATTR) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(0);
+ }
+ if (bmval0 & FATTR4_WORD0_FSID) {
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
+ goto out_resource;
+ if (exp->ex_fslocs.migrated) {
+ p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR);
+ p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR);
+ } else switch(fsid_source(fhp)) {
+ case FSIDSOURCE_FSID:
+ p = xdr_encode_hyper(p, (u64)exp->ex_fsid);
+ p = xdr_encode_hyper(p, (u64)0);
+ break;
+ case FSIDSOURCE_DEV:
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(MAJOR(stat.dev));
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(MINOR(stat.dev));
+ break;
+ case FSIDSOURCE_UUID:
+ p = xdr_encode_opaque_fixed(p, exp->ex_uuid,
+ EX_UUID_LEN);
+ break;
+ }
+ }
+ if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(0);
+ }
+ if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(nn->nfsd4_lease);
+ }
+ if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(rdattr_err); /* 0 unless set by fattr_handle_absent_fs() */
+ }
+ if (bmval0 & FATTR4_WORD0_ACL) {
+ struct nfs4_ace *ace;
+
+ if (acl == NULL) { /* no ACL was returned: encode an empty list */
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+
+ *p++ = cpu_to_be32(0);
+ goto out_acl;
+ }
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(acl->naces);
+
+ for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) {
+ p = xdr_reserve_space(xdr, 4*3);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(ace->type);
+ *p++ = cpu_to_be32(ace->flag);
+ *p++ = cpu_to_be32(ace->access_mask &
+ NFS4_ACE_MASK_ALL);
+ status = nfsd4_encode_aclname(xdr, rqstp, ace);
+ if (status)
+ goto out;
+ }
+ }
+out_acl:
+ if (bmval0 & FATTR4_WORD0_ACLSUPPORT) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(IS_POSIXACL(dentry->d_inode) ?
+ ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0);
+ }
+ if (bmval0 & FATTR4_WORD0_CANSETTIME) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(1);
+ }
+ if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(0);
+ }
+ if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(1);
+ }
+ if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(1);
+ }
+ if (bmval0 & FATTR4_WORD0_FILEHANDLE) {
+ p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4);
+ if (!p)
+ goto out_resource;
+ p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base,
+ fhp->fh_handle.fh_size);
+ }
+ if (bmval0 & FATTR4_WORD0_FILEID) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ p = xdr_encode_hyper(p, stat.ino);
+ }
+ if (bmval0 & FATTR4_WORD0_FILES_AVAIL) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ p = xdr_encode_hyper(p, (u64) statfs.f_ffree); /* NOTE(review): same field as FILES_FREE below */
+ }
+ if (bmval0 & FATTR4_WORD0_FILES_FREE) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ p = xdr_encode_hyper(p, (u64) statfs.f_ffree);
+ }
+ if (bmval0 & FATTR4_WORD0_FILES_TOTAL) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ p = xdr_encode_hyper(p, (u64) statfs.f_files);
+ }
+ if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
+ status = nfsd4_encode_fs_locations(xdr, rqstp, exp);
+ if (status)
+ goto out;
+ }
+ if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(1);
+ }
+ if (bmval0 & FATTR4_WORD0_MAXFILESIZE) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ p = xdr_encode_hyper(p, exp->ex_path.mnt->mnt_sb->s_maxbytes);
+ }
+ if (bmval0 & FATTR4_WORD0_MAXLINK) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(255);
+ }
+ if (bmval0 & FATTR4_WORD0_MAXNAME) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(statfs.f_namelen);
+ }
+ if (bmval0 & FATTR4_WORD0_MAXREAD) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp));
+ }
+ if (bmval0 & FATTR4_WORD0_MAXWRITE) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp));
+ }
+ if (bmval1 & FATTR4_WORD1_MODE) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(stat.mode & S_IALLUGO);
+ }
+ if (bmval1 & FATTR4_WORD1_NO_TRUNC) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(1);
+ }
+ if (bmval1 & FATTR4_WORD1_NUMLINKS) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(stat.nlink);
+ }
+ if (bmval1 & FATTR4_WORD1_OWNER) {
+ status = nfsd4_encode_user(xdr, rqstp, stat.uid);
+ if (status)
+ goto out;
+ }
+ if (bmval1 & FATTR4_WORD1_OWNER_GROUP) {
+ status = nfsd4_encode_group(xdr, rqstp, stat.gid);
+ if (status)
+ goto out;
+ }
+ if (bmval1 & FATTR4_WORD1_RAWDEV) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32((u32) MAJOR(stat.rdev));
+ *p++ = cpu_to_be32((u32) MINOR(stat.rdev));
+ }
+ if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize;
+ p = xdr_encode_hyper(p, dummy64);
+ }
+ if (bmval1 & FATTR4_WORD1_SPACE_FREE) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize;
+ p = xdr_encode_hyper(p, dummy64);
+ }
+ if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize;
+ p = xdr_encode_hyper(p, dummy64);
+ }
+ if (bmval1 & FATTR4_WORD1_SPACE_USED) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ dummy64 = (u64)stat.blocks << 9; /* stat.blocks * 512 */
+ p = xdr_encode_hyper(p, dummy64);
+ }
+ if (bmval1 & FATTR4_WORD1_TIME_ACCESS) {
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
+ goto out_resource;
+ p = xdr_encode_hyper(p, (s64)stat.atime.tv_sec);
+ *p++ = cpu_to_be32(stat.atime.tv_nsec);
+ }
+ if (bmval1 & FATTR4_WORD1_TIME_DELTA) {
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
+ goto out_resource;
+ p = encode_time_delta(p, d_inode(dentry));
+ }
+ if (bmval1 & FATTR4_WORD1_TIME_METADATA) {
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
+ goto out_resource;
+ p = xdr_encode_hyper(p, (s64)stat.ctime.tv_sec);
+ *p++ = cpu_to_be32(stat.ctime.tv_nsec);
+ }
+ if (bmval1 & FATTR4_WORD1_TIME_MODIFY) {
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
+ goto out_resource;
+ p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec);
+ *p++ = cpu_to_be32(stat.mtime.tv_nsec);
+ }
+ if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
+ struct kstat parent_stat;
+ u64 ino = stat.ino; /* default: the object's own inode number */
+
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ /*
+ * Get parent's attributes if not ignoring crossmount
+ * and this is the root of a cross-mounted filesystem.
+ */
+ if (ignore_crossmnt == 0 &&
+ dentry == exp->ex_path.mnt->mnt_root) {
+ err = get_parent_attributes(exp, &parent_stat);
+ if (err)
+ goto out_nfserr;
+ ino = parent_stat.ino;
+ }
+ p = xdr_encode_hyper(p, ino);
+ }
+#ifdef CONFIG_NFSD_PNFS
+ if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) {
+ status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types);
+ if (status)
+ goto out;
+ }
+
+ if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) {
+ status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types);
+ if (status)
+ goto out;
+ }
+
+ if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(stat.blksize);
+ }
+#endif /* CONFIG_NFSD_PNFS */
+ if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
+ u32 supp[3];
+
+ memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
+ supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0;
+ supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1;
+ supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2;
+
+ status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]);
+ if (status)
+ goto out;
+ }
+
+ if (bmval2 & FATTR4_WORD2_CHANGE_ATTR_TYPE) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ if (IS_I_VERSION(d_inode(dentry)))
+ *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR);
+ else
+ *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_TIME_METADATA);
+ }
+
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+ status = nfsd4_encode_security_label(xdr, rqstp, context,
+ contextlen);
+ if (status)
+ goto out;
+ }
+#endif
+
+ if (bmval2 & FATTR4_WORD2_XATTR_SUPPORT) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ err = xattr_supported_namespace(d_inode(dentry),
+ XATTR_USER_PREFIX);
+ *p++ = cpu_to_be32(err == 0); /* 1 when the check returned 0 */
+ }
+
+ attrlen = htonl(xdr->buf->len - attrlen_offset - 4); /* bytes written after the length slot */
+ write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4);
+ status = nfs_ok;
+
+out: /* common exit: release resources, and on error undo all output */
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ if (context)
+ security_release_secctx(context, contextlen);
+#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
+ kfree(acl);
+ if (tempfh) {
+ fh_put(tempfh);
+ kfree(tempfh);
+ }
+ if (status)
+ xdr_truncate_encode(xdr, starting_len);
+ return status;
+out_nfserr: /* err holds an errno-style value; convert it */
+ status = nfserrno(err);
+ goto out;
+out_resource: /* an xdr_reserve_space() call failed */
+ status = nfserr_resource;
+ goto out;
+}
+
+/*
+ * Set up @xdr and @buf so that encoding goes into the flat memory
+ * region of @bytes bytes starting at @p.
+ *
+ * @buf is cleared and given an empty head kvec based at @p; @xdr is
+ * pointed at that head with its write window running from @p to
+ * @p + @bytes.
+ */
+static void svcxdr_init_encode_from_buffer(struct xdr_stream *xdr,
+			struct xdr_buf *buf, __be32 *p, int bytes)
+{
+	/* the buffer: empty, with @bytes of capacity behind the head */
+	memset(buf, 0, sizeof(*buf));
+	buf->head[0].iov_base = p;
+	buf->head[0].iov_len = 0;
+	buf->len = 0;
+	buf->buflen = bytes;
+
+	/* the stream: no scratch space, writing into the head kvec */
+	xdr->scratch.iov_len = 0;
+	xdr->buf = buf;
+	xdr->iov = buf->head;
+	xdr->p = p;
+	xdr->end = (void *)p + bytes;
+}
+
+/*
+ * Encode attributes into a caller-supplied flat buffer.
+ *
+ * *@p is the start of a buffer of @words 32-bit words.  A temporary
+ * stream is built over it and handed to nfsd4_encode_fattr(); on return
+ * *@p is set to the stream's write position.  The remaining arguments
+ * are passed through unchanged.
+ *
+ * Returns the nfsd4_encode_fattr() status.
+ */
+__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words,
+		struct svc_fh *fhp, struct svc_export *exp,
+		struct dentry *dentry, u32 *bmval,
+		struct svc_rqst *rqstp, int ignore_crossmnt)
+{
+	struct xdr_stream stream;
+	struct xdr_buf flat;
+	__be32 status;
+
+	svcxdr_init_encode_from_buffer(&stream, &flat, *p, words << 2);
+	status = nfsd4_encode_fattr(&stream, fhp, exp, dentry, bmval, rqstp,
+				    ignore_crossmnt);
+	*p = stream.p;
+	return status;
+}
+
+/*
+ * Returns 1 if @bmval requests anything besides RDATTR_ERROR and
+ * LEASE_TIME in word 0 or MOUNTED_ON_FILEID in word 1, else 0.
+ * Word 2 is not examined.
+ */
+static inline int attributes_need_mount(u32 *bmval)
+{
+	const u32 word0_other = bmval[0] & ~(FATTR4_WORD0_RDATTR_ERROR |
+					     FATTR4_WORD0_LEASE_TIME);
+	const u32 word1_other = bmval[1] & ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+
+	return (word0_other || word1_other) ? 1 : 0;
+}
+
+/*
+ * Look up directory entry @name under cd->rd_fhp and encode the
+ * attributes requested in cd->rd_bmval for it.
+ *
+ * Returns nfserrno() of the lookup error if the entry cannot be found,
+ * otherwise the status of the mount crossing, the access check or
+ * nfsd4_encode_fattr(), whichever fails first.  The dentry and export
+ * references taken here are dropped before returning.
+ */
+static __be32
+nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
+			const char *name, int namlen)
+{
+	struct svc_export *exp = cd->rd_fhp->fh_export;
+	struct dentry *dentry;
+	int ignore_crossmnt = 0;
+	__be32 nfserr;
+
+	dentry = lookup_positive_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
+	if (IS_ERR(dentry))
+		return nfserrno(PTR_ERR(dentry));
+
+	exp_get(exp);
+	if (nfsd_mountpoint(dentry, exp)) {
+		/*
+		 * For a mountpoint the client may only want attributes
+		 * that belong to the underlying filesystem rather than
+		 * to the filesystem mounted on top of it.  In that case
+		 * the mount is not crossed and the attributes are
+		 * filled in straight from the mountpoint dentry.
+		 */
+		if (!(exp->ex_flags & NFSEXP_V4ROOT) &&
+		    !attributes_need_mount(cd->rd_bmval)) {
+			ignore_crossmnt = 1;
+		} else {
+			/*
+			 * Open question carried over from the original
+			 * code: why is nfsd_lookup not used here?
+			 * Different "."/".." handling?  Something else?
+			 */
+			int err = nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp);
+
+			if (err) {
+				nfserr = nfserrno(err);
+				goto out_put;
+			}
+			nfserr = check_nfsd_access(exp, cd->rd_rqstp);
+			if (nfserr)
+				goto out_put;
+		}
+	}
+
+	nfserr = nfsd4_encode_fattr(xdr, NULL, exp, dentry, cd->rd_bmval,
+				    cd->rd_rqstp, ignore_crossmnt);
+out_put:
+	dput(dentry);
+	exp_put(exp);
+	return nfserr;
+}
+
+/*
+ * Encode a minimal attribute set that carries only RDATTR_ERROR.
+ *
+ * Writes five words: a two-word bitmap with just the RDATTR_ERROR bit,
+ * an attribute length of 4, and @nfserr itself, stored as passed in
+ * without byte swapping.
+ *
+ * Returns the position just past the encoded data, or NULL if the
+ * 20 bytes cannot be reserved.
+ */
+static __be32 *
+nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr)
+{
+	__be32 *wp = xdr_reserve_space(xdr, 20);
+
+	if (!wp)
+		return NULL;
+
+	wp[0] = htonl(2);				/* bitmap length */
+	wp[1] = htonl(FATTR4_WORD0_RDATTR_ERROR);	/* bmval0 */
+	wp[2] = htonl(0);				/* bmval1 */
+	wp[3] = htonl(4);				/* attribute length */
+	wp[4] = nfserr;					/* no htonl */
+	return wp + 5;
+}
+/*
+ * Encode one directory entry into cd->xdr.
+ *
+ * @ccdv points at the readdir_cd embedded in a struct nfsd4_readdir.
+ * Each entry is written as a "present" marker, an 8-byte cookie
+ * placeholder, the name, and the attributes.  The placeholder is
+ * overwritten with the real offset when the next call arrives (see
+ * cd->cookie_offset).  @ino and @d_type are not used here.
+ *
+ * Returns 0 with cd->common.err = nfs_ok when the entry was encoded or
+ * deliberately skipped; otherwise everything written for this entry is
+ * truncated away, cd->common.err is set to the failure status and
+ * -EINVAL is returned.
+ */
+static int
+nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type)
+{
+ struct readdir_cd *ccd = ccdv;
+ struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common);
+ struct xdr_stream *xdr = cd->xdr;
+ int start_offset = xdr->buf->len; /* rollback point for this entry */
+ int cookie_offset;
+ u32 name_and_cookie;
+ int entry_bytes;
+ __be32 nfserr = nfserr_toosmall;
+ __be64 wire_offset;
+ __be32 *p;
+
+ /* In nfsv4, "." and ".." never make it onto the wire.. */
+ if (name && isdotent(name, namlen)) {
+ cd->common.err = nfs_ok;
+ return 0;
+ }
+
+ if (cd->cookie_offset) { /* backfill the previously encoded entry's cookie */
+ wire_offset = cpu_to_be64(offset);
+ write_bytes_to_xdr_buf(xdr->buf, cd->cookie_offset,
+ &wire_offset, 8);
+ }
+
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto fail;
+ *p++ = xdr_one; /* mark entry present */
+ cookie_offset = xdr->buf->len; /* where this entry's 8-byte cookie starts */
+ p = xdr_reserve_space(xdr, 3*4 + namlen);
+ if (!p)
+ goto fail;
+ p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */
+ p = xdr_encode_array(p, name, namlen); /* name length & name */
+
+ nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen);
+ switch (nfserr) {
+ case nfs_ok:
+ break;
+ case nfserr_resource:
+ nfserr = nfserr_toosmall;
+ goto fail;
+ case nfserr_noent:
+ xdr_truncate_encode(xdr, start_offset); /* drop the entry, but do not fail the call */
+ goto skip_entry;
+ case nfserr_jukebox:
+ /*
+ * The pseudoroot should only display dentries that lead to
+ * exports. If we get EJUKEBOX here, then we can't tell whether
+ * this entry should be included. Just fail the whole READDIR
+ * with NFS4ERR_DELAY in that case, and hope that the situation
+ * will resolve itself by the client's next attempt.
+ */
+ if (cd->rd_fhp->fh_export->ex_flags & NFSEXP_V4ROOT)
+ goto fail;
+ fallthrough;
+ default:
+ /*
+ * If the client requested the RDATTR_ERROR attribute,
+ * we stuff the error code into this attribute
+ * and continue. If this attribute was not requested,
+ * then in accordance with the spec, we fail the
+ * entire READDIR operation(!)
+ */
+ if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR))
+ goto fail;
+ p = nfsd4_encode_rdattr_error(xdr, nfserr);
+ if (p == NULL) {
+ nfserr = nfserr_toosmall;
+ goto fail;
+ }
+ }
+ nfserr = nfserr_toosmall; /* status for the size checks below */
+ entry_bytes = xdr->buf->len - start_offset;
+ if (entry_bytes > cd->rd_maxcount)
+ goto fail;
+ cd->rd_maxcount -= entry_bytes;
+ /*
+ * RFC 3530 14.2.24 describes rd_dircount as only a "hint", and
+ * notes that it could be zero. If it is zero, then the server
+ * should enforce only the rd_maxcount value.
+ */
+ if (cd->rd_dircount) {
+ name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8;
+ if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) /* cookie_offset == 0: nothing encoded yet, so let this one through */
+ goto fail;
+ cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
+ if (!cd->rd_dircount)
+ cd->rd_maxcount = 0; /* makes the maxcount test above reject the next entry */
+ }
+
+ cd->cookie_offset = cookie_offset; /* remembered for the backfill on the next call */
+skip_entry:
+ cd->common.err = nfs_ok;
+ return 0;
+fail:
+ xdr_truncate_encode(xdr, start_offset);
+ cd->common.err = nfserr;
+ return -EINVAL;
+}
+
+/*
+ * Encode a stateid: the generation counter as one 32-bit word followed
+ * by the opaque part as fixed-length data.
+ *
+ * Returns nfserr_resource if the space cannot be reserved, else 0.
+ */
+static __be32
+nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
+{
+	__be32 *wp = xdr_reserve_space(xdr, sizeof(stateid_t));
+
+	if (!wp)
+		return nfserr_resource;
+
+	*wp = cpu_to_be32(sid->si_generation);
+	xdr_encode_opaque_fixed(wp + 1, &sid->si_opaque,
+				sizeof(stateid_opaque_t));
+	return 0;
+}
+
+/*
+ * Encode the ACCESS result: the supported mask followed by the granted
+ * access mask.  The incoming @nfserr is not consulted.
+ */
+static __be32
+nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
+{
+	__be32 *wp = xdr_reserve_space(&resp->xdr, 8);
+
+	if (!wp)
+		return nfserr_resource;
+
+	wp[0] = cpu_to_be32(access->ac_supported);
+	wp[1] = cpu_to_be32(access->ac_resp_access);
+	return 0;
+}
+
+/*
+ * Encode the BIND_CONN_TO_SESSION result: the session id, the channel
+ * direction and a final zero word.  The incoming @nfserr is not
+ * consulted.
+ */
+static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts)
+{
+	__be32 *wp = xdr_reserve_space(&resp->xdr, NFS4_MAX_SESSIONID_LEN + 8);
+
+	if (!wp)
+		return nfserr_resource;
+
+	wp = xdr_encode_opaque_fixed(wp, bcts->sessionid.data,
+				     NFS4_MAX_SESSIONID_LEN);
+	wp[0] = cpu_to_be32(bcts->dir);
+	/* Upshifting from TCP to RDMA is not supported */
+	wp[1] = cpu_to_be32(0);
+	return 0;
+}
+
+/* Encode the CLOSE result, which consists of the stateid only. */
+static __be32
+nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
+{
+	return nfsd4_encode_stateid(&resp->xdr, &close->cl_stateid);
+}
+
+
+/*
+ * Encode the COMMIT result: the write verifier as fixed-length opaque
+ * data.  The incoming @nfserr is not consulted.
+ */
+static __be32
+nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
+{
+	__be32 *wp = xdr_reserve_space(&resp->xdr, NFS4_VERIFIER_SIZE);
+
+	if (!wp)
+		return nfserr_resource;
+
+	xdr_encode_opaque_fixed(wp, commit->co_verf.data, NFS4_VERIFIER_SIZE);
+	return 0;
+}
+
+/*
+ * Encode the CREATE result: the change info for the directory followed
+ * by the bitmap held in cr_bmval.  The incoming @nfserr is not
+ * consulted.
+ */
+static __be32
+nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *cinfo_slot = xdr_reserve_space(xdr, 20);
+
+	if (!cinfo_slot)
+		return nfserr_resource;
+
+	encode_cinfo(cinfo_slot, &create->cr_cinfo);
+	return nfsd4_encode_bitmap(xdr, create->cr_bmval[0],
+				   create->cr_bmval[1], create->cr_bmval[2]);
+}
+
+/*
+ * Encode the GETATTR result for the filehandle stored in the request,
+ * using its export and dentry and the requested bitmap.  Mount
+ * crossing is not ignored (ignore_crossmnt is passed as 0).
+ */
+static __be32
+nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr)
+{
+	struct svc_fh *target = getattr->ga_fhp;
+
+	return nfsd4_encode_fattr(&resp->xdr, target, target->fh_export,
+				  target->fh_dentry, getattr->ga_bmval,
+				  resp->rqstp, 0);
+}
+
+/*
+ * Encode the GETFH result: the filehandle bytes as variable-length
+ * opaque data.  The incoming @nfserr is not consulted.
+ */
+static __be32
+nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp)
+{
+	struct svc_fh *fhp = *fhpp;
+	unsigned int fh_len = fhp->fh_handle.fh_size;
+	__be32 *wp;
+
+	wp = xdr_reserve_space(&resp->xdr, fh_len + 4);
+	if (!wp)
+		return nfserr_resource;
+
+	xdr_encode_opaque(wp, &fhp->fh_handle.fh_base, fh_len);
+	return 0;
+}
+
+/*
+ * Encode a LOCK4denied structure describing the conflicting lock.
+ *
+ * Including all fields other than the name, a LOCK4denied structure
+ * requires 8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type)
+ * = 32 bytes.
+ *
+ * If there is no room for the owner name it is freed and dropped, and
+ * the reservation is retried without it.  The owner buffer is always
+ * freed by the time this returns successfully.
+ *
+ * Returns nfserr_denied once the structure has been encoded, or
+ * nfserr_resource if even the fixed part does not fit.
+ */
+static __be32
+nfsd4_encode_lock_denied(struct xdr_stream *xdr, struct nfsd4_lock_denied *ld)
+{
+	struct xdr_netobj *owner = &ld->ld_owner;
+	__be32 *wp;
+
+	for (;;) {
+		wp = xdr_reserve_space(xdr, 32 + XDR_LEN(owner->len));
+		if (wp)
+			break;
+		if (!owner->len)
+			return nfserr_resource;
+		/*
+		 * Don't fail to return the result just because we can't
+		 * return the conflicting open:
+		 */
+		kfree(owner->data);
+		owner->data = NULL;
+		owner->len = 0;
+	}
+
+	wp = xdr_encode_hyper(wp, ld->ld_start);
+	wp = xdr_encode_hyper(wp, ld->ld_length);
+	*wp++ = cpu_to_be32(ld->ld_type);
+
+	if (!owner->len) {
+		/* non - nfsv4 lock in conflict, no clientid nor owner */
+		wp = xdr_encode_hyper(wp, (u64)0);	/* clientid */
+		*wp++ = cpu_to_be32(0);	/* length of owner name */
+	} else {
+		wp = xdr_encode_opaque_fixed(wp, &ld->ld_clientid, 8);
+		wp = xdr_encode_opaque(wp, owner->data, owner->len);
+		kfree(owner->data);
+	}
+	return nfserr_denied;
+}
+
+/*
+ * Encode the LOCK result.
+ *
+ * Success carries the new lock stateid and nfserr_denied carries the
+ * conflicting lock; in both cases the encoder's status is returned.
+ * Any other incoming status has no body and is returned unchanged.
+ */
+static __be32
+nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+
+	if (!nfserr)
+		return nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid);
+
+	if (nfserr == nfserr_denied)
+		return nfsd4_encode_lock_denied(xdr, &lock->lk_denied);
+
+	return nfserr;
+}
+
+/*
+ * Encode the LOCKT result.
+ *
+ * Only nfserr_denied carries a body (the conflicting lock); every other
+ * incoming status is returned unchanged with nothing encoded.
+ *
+ * Returns nfserr_denied when the conflicting lock was encoded,
+ * nfserr_resource when there was no room for it, and @nfserr otherwise.
+ */
+static __be32
+nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+
+	/*
+	 * nfsd4_encode_lock_denied() returns nfserr_denied once the body
+	 * has been written and nfserr_resource when it cannot reserve
+	 * space.  Pass that status on, as nfsd4_encode_lock() does, so a
+	 * failed encode is not reported as DENIED with no body behind it.
+	 */
+	if (nfserr == nfserr_denied)
+		nfserr = nfsd4_encode_lock_denied(xdr, &lockt->lt_denied);
+	return nfserr;
+}
+
+/* Encode the LOCKU result, which consists of the updated stateid only. */
+static __be32
+nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
+{
+	return nfsd4_encode_stateid(&resp->xdr, &locku->lu_stateid);
+}
+
+
+/*
+ * Encode the LINK result: the change info held in li_cinfo, written
+ * into 20 reserved bytes.  The incoming @nfserr is not consulted.
+ */
+static __be32
+nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
+{
+	__be32 *cinfo_slot = xdr_reserve_space(&resp->xdr, 20);
+
+	if (!cinfo_slot)
+		return nfserr_resource;
+
+	encode_cinfo(cinfo_slot, &link->li_cinfo);
+	return 0;
+}
+
+
+/*
+ * Encode the OPEN result.
+ *
+ * Layout: open stateid, change info, result flags, attribute bitmap,
+ * delegation type, then a body that depends on the delegation type
+ * (nothing, a read delegation, a write delegation, or the reason no
+ * delegation was granted).
+ *
+ * The incoming @nfserr is overwritten straight away.  Returns 0 on
+ * success, nfserr_resource when space runs out, or the status of a
+ * failing helper.  An unknown delegation type is a BUG().
+ */
+static __be32
+nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *wp;
+
+	nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid);
+	if (nfserr)
+		return nfserr;
+
+	/* change info plus the result flags word */
+	wp = xdr_reserve_space(xdr, 24);
+	if (!wp)
+		return nfserr_resource;
+	wp = encode_cinfo(wp, &open->op_cinfo);
+	*wp = cpu_to_be32(open->op_rflags);
+
+	nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1],
+				     open->op_bmval[2]);
+	if (nfserr)
+		return nfserr;
+
+	wp = xdr_reserve_space(xdr, 4);
+	if (!wp)
+		return nfserr_resource;
+	*wp = cpu_to_be32(open->op_delegate_type);
+
+	switch (open->op_delegate_type) {
+	case NFS4_OPEN_DELEGATE_NONE:
+		break;
+
+	case NFS4_OPEN_DELEGATE_READ:
+		nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid);
+		if (nfserr)
+			return nfserr;
+		wp = xdr_reserve_space(xdr, 20);
+		if (!wp)
+			return nfserr_resource;
+		wp[0] = cpu_to_be32(open->op_recall);
+
+		/*
+		 * TODO: ACE's in delegations
+		 */
+		wp[1] = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
+		wp[2] = cpu_to_be32(0);
+		wp[3] = cpu_to_be32(0);
+		wp[4] = cpu_to_be32(0);   /* XXX: is NULL principal ok? */
+		break;
+
+	case NFS4_OPEN_DELEGATE_WRITE:
+		nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid);
+		if (nfserr)
+			return nfserr;
+		wp = xdr_reserve_space(xdr, 32);
+		if (!wp)
+			return nfserr_resource;
+		wp[0] = cpu_to_be32(open->op_recall);
+
+		/*
+		 * TODO: space_limit's in delegations
+		 */
+		wp[1] = cpu_to_be32(NFS4_LIMIT_SIZE);
+		wp[2] = cpu_to_be32(~(u32)0);
+		wp[3] = cpu_to_be32(~(u32)0);
+
+		/*
+		 * TODO: ACE's in delegations
+		 */
+		wp[4] = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
+		wp[5] = cpu_to_be32(0);
+		wp[6] = cpu_to_be32(0);
+		wp[7] = cpu_to_be32(0);   /* XXX: is NULL principal ok? */
+		break;
+
+	case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */
+		if (open->op_why_no_deleg == WND4_CONTENTION ||
+		    open->op_why_no_deleg == WND4_RESOURCE) {
+			wp = xdr_reserve_space(xdr, 8);
+			if (!wp)
+				return nfserr_resource;
+			wp[0] = cpu_to_be32(open->op_why_no_deleg);
+			/* deleg signaling not supported yet: */
+			wp[1] = cpu_to_be32(0);
+		} else {
+			wp = xdr_reserve_space(xdr, 4);
+			if (!wp)
+				return nfserr_resource;
+			*wp = cpu_to_be32(open->op_why_no_deleg);
+		}
+		break;
+
+	default:
+		BUG();
+	}
+	/* XXX save filehandle here */
+	return 0;
+}
+
+/* Encode the OPEN_CONFIRM result, which consists of the stateid only. */
+static __be32
+nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
+{
+	return nfsd4_encode_stateid(&resp->xdr, &oc->oc_resp_stateid);
+}
+
+/*
+ * Encode the OPEN_DOWNGRADE result, which consists solely of the stateid
+ * held in od_stateid. The incoming @nfserr is not consulted.
+ */
+static __be32
+nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
+{
+	return nfsd4_encode_stateid(&resp->xdr, &od->od_stateid);
+}
+/*
+ * Serve a READ through nfsd_splice_read() and fix up the reply xdr_buf
+ * by hand afterwards.
+ *
+ * The two words immediately before xdr->p (reserved by the caller,
+ * nfsd4_encode_read) are backfilled with the eof flag and the byte count.
+ * buf->page_len and buf->len then grow by the number of bytes read,
+ * xdr->page_ptr is advanced past the pages used, and the tail iovec is
+ * pointed at the remainder of the head so that padding and any following
+ * ops are encoded there.
+ *
+ * read->rd_length is set to the count reported by nfsd_splice_read().
+ *
+ * Returns nfserr_resource if there is no room left for a padding word,
+ * the status of nfsd_splice_read() if it fails, or 0 on success.
+ */
+static __be32 nfsd4_encode_splice_read(
+ struct nfsd4_compoundres *resp,
+ struct nfsd4_read *read,
+ struct file *file, unsigned long maxcount)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_buf *buf = xdr->buf;
+ u32 eof;
+ int space_left;
+ __be32 nfserr;
+ __be32 *p = xdr->p - 2; /* the eof and count words reserved by the caller */
+
+ /* Make sure there will be room for padding if needed */
+ if (xdr->end - xdr->p < 1)
+ return nfserr_resource;
+
+ nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp,
+ file, read->rd_offset, &maxcount, &eof);
+ read->rd_length = maxcount;
+ if (nfserr) {
+ /*
+ * nfsd_splice_actor may have already messed with the
+ * page length; reset it so as not to confuse
+ * xdr_truncate_encode:
+ */
+ buf->page_len = 0;
+ return nfserr;
+ }
+
+ *(p++) = htonl(eof);
+ *(p++) = htonl(maxcount);
+
+ buf->page_len = maxcount;
+ buf->len += maxcount;
+ xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1)
+ / PAGE_SIZE;
+
+ /* Use rest of head for padding and remaining ops: */
+ buf->tail[0].iov_base = xdr->p;
+ buf->tail[0].iov_len = 0;
+ xdr->iov = buf->tail;
+ if (maxcount&3) {
+ int pad = 4 - (maxcount&3);
+
+ *(xdr->p++) = 0; /* one zero word supplies the pad bytes */
+
+ buf->tail[0].iov_base += maxcount&3;
+ buf->tail[0].iov_len = pad;
+ buf->len += pad;
+ }
+
+ space_left = min_t(int, (void *)xdr->end - (void *)xdr->p,
+ buf->buflen - buf->len);
+ buf->buflen = buf->len + space_left;
+ xdr->end = (__be32 *)((void *)xdr->end + space_left);
+
+ return 0;
+}
+/*
+ * Serve a READ by reading into a kvec array (nfsd_readv) laid over space
+ * reserved in the reply stream.
+ *
+ * Up to @maxcount bytes are reserved with xdr_reserve_space_vec(); after
+ * the read, the stream is truncated to the XDR-aligned length of what was
+ * actually read, the eof flag and byte count are written into the two
+ * words reserved by the caller (nfsd4_encode_read), and any pad bytes
+ * after the data are zeroed.
+ *
+ * read->rd_length is set to the count reported by nfsd_readv().
+ *
+ * Returns nfserr_resource if the vector cannot be reserved, the status of
+ * nfsd_readv() if it fails, nfserr_io if svc_encode_read_payload() fails,
+ * or 0 on success.
+ */
+static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
+ struct nfsd4_read *read,
+ struct file *file, unsigned long maxcount)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ u32 eof;
+ int starting_len = xdr->buf->len - 8; /* offset of the caller's eof/count words */
+ __be32 nfserr;
+ __be32 tmp;
+ int pad;
+
+ read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, maxcount);
+ if (read->rd_vlen < 0)
+ return nfserr_resource;
+
+ nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
+ resp->rqstp->rq_vec, read->rd_vlen, &maxcount,
+ &eof);
+ read->rd_length = maxcount;
+ if (nfserr)
+ return nfserr;
+ if (svc_encode_read_payload(resp->rqstp, starting_len + 8, maxcount))
+ return nfserr_io;
+ xdr_truncate_encode(xdr, starting_len + 8 + xdr_align_size(maxcount));
+
+ tmp = htonl(eof);
+ write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4);
+ tmp = htonl(maxcount);
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
+
+ tmp = xdr_zero;
+ pad = (maxcount&3) ? 4 - (maxcount&3) : 0;
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount,
+ &tmp, pad);
+ return 0;
+
+}
+/*
+ * Encode the READ result: eof flag, byte count, then the file data.
+ *
+ * Eight bytes are reserved for eof and count and backfilled by the helper
+ * that performs the read. The amount read is capped by the smallest of
+ * svc_max_payload(), the space left in the reply buffer and the length the
+ * client asked for (read->rd_length). The splice path is used when the
+ * file has a splice_read method and RQ_SPLICE_OK is set on the request;
+ * otherwise the readv path is used.
+ *
+ * On failure of either helper the stream is truncated back to where this
+ * op began. Returns the incoming @nfserr unchanged if it is non-zero.
+ */
+static __be32
+nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_read *read)
+{
+ unsigned long maxcount;
+ struct xdr_stream *xdr = &resp->xdr;
+ struct file *file;
+ int starting_len = xdr->buf->len;
+ __be32 *p;
+
+ if (nfserr)
+ return nfserr;
+ file = read->rd_nf->nf_file;
+
+ p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */
+ if (!p) {
+ WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags));
+ return nfserr_resource;
+ }
+ if (resp->xdr.buf->page_len &&
+ test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) {
+ WARN_ON_ONCE(1);
+ return nfserr_serverfault;
+ }
+ xdr_commit_encode(xdr);
+
+ maxcount = svc_max_payload(resp->rqstp);
+ maxcount = min_t(unsigned long, maxcount,
+ (xdr->buf->buflen - xdr->buf->len));
+ maxcount = min_t(unsigned long, maxcount, read->rd_length);
+
+ if (file->f_op->splice_read &&
+ test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
+ nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
+ else
+ nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
+
+ if (nfserr)
+ xdr_truncate_encode(xdr, starting_len);
+
+ return nfserr;
+}
+/*
+ * Encode the READLINK result as an XDR opaque: a 4-byte length followed
+ * by the link text, zero-padded to a 4-byte boundary.
+ *
+ * A length word and PAGE_SIZE bytes are reserved up front; nfsd_readlink()
+ * writes the link text directly into the reserved space and updates
+ * maxcount to the real length. The length word is then backfilled and the
+ * stream is truncated to the aligned size actually used.
+ *
+ * nfserr_isdir from nfsd_readlink() is mapped to nfserr_inval. On any
+ * nfsd_readlink() failure the stream is truncated back to where this op
+ * began. The incoming @nfserr is overwritten, not consulted.
+ */
+static __be32
+nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink)
+{
+ int maxcount;
+ __be32 wire_count;
+ int zero = 0; /* source of pad bytes */
+ struct xdr_stream *xdr = &resp->xdr;
+ int length_offset = xdr->buf->len; /* where the length word lives */
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ maxcount = PAGE_SIZE;
+
+ p = xdr_reserve_space(xdr, maxcount);
+ if (!p)
+ return nfserr_resource;
+ /*
+ * XXX: By default, vfs_readlink() will truncate symlinks if they
+ * would overflow the buffer. Is this kosher in NFSv4? If not, one
+ * easy fix is: if vfs_readlink() precisely fills the buffer, assume
+ * that truncation occurred, and return NFS4ERR_RESOURCE.
+ */
+ nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp,
+ (char *)p, &maxcount);
+ if (nfserr == nfserr_isdir)
+ nfserr = nfserr_inval;
+ if (nfserr) {
+ xdr_truncate_encode(xdr, length_offset);
+ return nfserr;
+ }
+
+ wire_count = htonl(maxcount);
+ write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4);
+ xdr_truncate_encode(xdr, length_offset + 4 + ALIGN(maxcount, 4));
+ if (maxcount & 3)
+ write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount,
+ &zero, 4 - (maxcount&3));
+ return 0;
+}
+
+/*
+ * Encode the READDIR result: a (zeroed) cookie verifier, the directory
+ * entries produced by nfsd4_encode_dirent() via nfsd_readdir(), and the
+ * trailing "no more entries" word plus eof flag.
+ *
+ * The space offered to the entries is the smaller of the client's
+ * rd_maxcount (capped by svc_max_payload() and reduced by the 16 bytes of
+ * verifier and trailer) and what is left in the reply buffer after
+ * allowing for the trailer and COMPOUND_ERR_SLACK_SPACE.
+ *
+ * On every failure path the stream is truncated back to where this op
+ * began and a non-zero status is returned. The incoming @nfserr is
+ * overwritten, not consulted.
+ */
+static __be32
+nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir)
+{
+	int maxcount;
+	int bytes_left;
+	loff_t offset;
+	__be64 wire_offset;
+	struct xdr_stream *xdr = &resp->xdr;
+	int starting_len = xdr->buf->len;
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
+	if (!p)
+		return nfserr_resource;
+
+	/* XXX: Following NFSv3, we ignore the READDIR verifier for now. */
+	*p++ = cpu_to_be32(0);
+	*p++ = cpu_to_be32(0);
+	resp->xdr.buf->head[0].iov_len = ((char *)resp->xdr.p)
+				- (char *)resp->xdr.buf->head[0].iov_base;
+
+	/*
+	 * Number of bytes left for directory entries allowing for the
+	 * final 8 bytes of the readdir and a following failed op:
+	 */
+	bytes_left = xdr->buf->buflen - xdr->buf->len
+			- COMPOUND_ERR_SLACK_SPACE - 8;
+	if (bytes_left < 0) {
+		nfserr = nfserr_resource;
+		goto err_no_verf;
+	}
+	maxcount = svc_max_payload(resp->rqstp);
+	maxcount = min_t(u32, readdir->rd_maxcount, maxcount);
+	/*
+	 * Note the rfc defines rd_maxcount as the size of the
+	 * READDIR4resok structure, which includes the verifier above
+	 * and the 8 bytes encoded at the end of this function:
+	 */
+	if (maxcount < 16) {
+		nfserr = nfserr_toosmall;
+		goto err_no_verf;
+	}
+	maxcount = min_t(int, maxcount-16, bytes_left);
+
+	/* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */
+	if (!readdir->rd_dircount)
+		readdir->rd_dircount = svc_max_payload(resp->rqstp);
+
+	readdir->xdr = xdr;
+	readdir->rd_maxcount = maxcount;
+	readdir->common.err = 0;
+	readdir->cookie_offset = 0;
+
+	offset = readdir->rd_cookie;
+	nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp,
+			      &offset,
+			      &readdir->common, nfsd4_encode_dirent);
+	if (nfserr == nfs_ok &&
+	    readdir->common.err == nfserr_toosmall &&
+	    xdr->buf->len == starting_len + 8) {
+		/* nothing encoded; which limit did we hit?: */
+		if (maxcount - 16 < bytes_left)
+			/* It was the fault of rd_maxcount: */
+			nfserr = nfserr_toosmall;
+		else
+			/* We ran out of buffer space: */
+			nfserr = nfserr_resource;
+	}
+	if (nfserr)
+		goto err_no_verf;
+
+	/* Backfill the cookie of the last entry that was encoded, if any. */
+	if (readdir->cookie_offset) {
+		wire_offset = cpu_to_be64(offset);
+		write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset,
+							&wire_offset, 8);
+	}
+
+	p = xdr_reserve_space(xdr, 8);
+	if (!p) {
+		WARN_ON_ONCE(1);
+		/*
+		 * nfserr is still nfs_ok here; without an explicit error
+		 * we would truncate the reply and then report success.
+		 */
+		nfserr = nfserr_resource;
+		goto err_no_verf;
+	}
+	*p++ = 0;	/* no more entries */
+	*p++ = htonl(readdir->common.err == nfserr_eof);
+
+	return 0;
+err_no_verf:
+	xdr_truncate_encode(xdr, starting_len);
+	return nfserr;
+}
+
+/*
+ * Encode the REMOVE result: a single change_info (20 bytes) describing
+ * the directory. Always returns 0 once the space has been reserved.
+ */
+static __be32
+nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
+{
+	__be32 *cinfo_buf = xdr_reserve_space(&resp->xdr, 20);
+
+	if (cinfo_buf == NULL)
+		return nfserr_resource;
+
+	encode_cinfo(cinfo_buf, &remove->rm_cinfo);
+	return 0;
+}
+
+/*
+ * Encode the RENAME result: two back-to-back change_info structures
+ * (rn_sinfo then rn_tinfo), 20 bytes each.
+ */
+static __be32
+nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
+{
+	__be32 *cursor = xdr_reserve_space(&resp->xdr, 40);
+
+	if (cursor == NULL)
+		return nfserr_resource;
+
+	cursor = encode_cinfo(cursor, &rename->rn_sinfo);
+	encode_cinfo(cursor, &rename->rn_tinfo);
+	return 0;
+}
+/*
+ * Encode the list of security flavors for export @exp: a count word
+ * followed by one entry per flavor that could be encoded.
+ *
+ * If the export lists no flavors, defaults are derived from the client's
+ * auth flavour: AUTH_UNIX yields {AUTH_UNIX, AUTH_NULL}, AUTH_GSS yields
+ * the result of svcauth_gss_flavor(), anything else yields that flavour
+ * alone.
+ *
+ * A pseudoflavor known to rpcauth_get_gssinfo() is encoded as
+ * RPC_AUTH_GSS plus its oid, qop and service. Any other value below
+ * RPC_AUTH_MAXFLAVOR is encoded as a bare flavor number. Everything else
+ * is skipped with a warning; the static 'report' flag is cleared at the
+ * end of the first call that skipped a flavor, which suppresses the
+ * warning on later calls.
+ *
+ * The count word is reserved first and backfilled at the end with the
+ * number of entries actually written.
+ */
+static __be32
+nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
+{
+ u32 i, nflavs, supported;
+ struct exp_flavor_info *flavs;
+ struct exp_flavor_info def_flavs[2];
+ __be32 *p, *flavorsp;
+ static bool report = true;
+
+ if (exp->ex_nflavors) {
+ flavs = exp->ex_flavors;
+ nflavs = exp->ex_nflavors;
+ } else { /* Handling of some defaults in absence of real secinfo: */
+ flavs = def_flavs;
+ if (exp->ex_client->flavour->flavour == RPC_AUTH_UNIX) {
+ nflavs = 2;
+ flavs[0].pseudoflavor = RPC_AUTH_UNIX;
+ flavs[1].pseudoflavor = RPC_AUTH_NULL;
+ } else if (exp->ex_client->flavour->flavour == RPC_AUTH_GSS) {
+ nflavs = 1;
+ flavs[0].pseudoflavor
+ = svcauth_gss_flavor(exp->ex_client);
+ } else {
+ nflavs = 1;
+ flavs[0].pseudoflavor
+ = exp->ex_client->flavour->flavour;
+ }
+ }
+
+ supported = 0;
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ flavorsp = p++; /* to be backfilled later */
+
+ for (i = 0; i < nflavs; i++) {
+ rpc_authflavor_t pf = flavs[i].pseudoflavor;
+ struct rpcsec_gss_info info;
+
+ if (rpcauth_get_gssinfo(pf, &info) == 0) {
+ supported++;
+ p = xdr_reserve_space(xdr, 4 + 4 +
+ XDR_LEN(info.oid.len) + 4 + 4);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(RPC_AUTH_GSS);
+ p = xdr_encode_opaque(p, info.oid.data, info.oid.len);
+ *p++ = cpu_to_be32(info.qop);
+ *p++ = cpu_to_be32(info.service);
+ } else if (pf < RPC_AUTH_MAXFLAVOR) {
+ supported++;
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(pf);
+ } else {
+ if (report)
+ pr_warn("NFS: SECINFO: security flavor %u "
+ "is not supported\n", pf);
+ }
+ }
+
+ if (nflavs != supported)
+ report = false;
+ *flavorsp = htonl(supported);
+ return 0;
+}
+
+/*
+ * Encode the SECINFO result by handing the export saved in si_exp to the
+ * shared flavor-list encoder.
+ */
+static __be32
+nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
+		     struct nfsd4_secinfo *secinfo)
+{
+	return nfsd4_do_encode_secinfo(&resp->xdr, secinfo->si_exp);
+}
+
+/*
+ * Encode the SECINFO_NO_NAME result by handing the export saved in
+ * sin_exp to the shared flavor-list encoder.
+ */
+static __be32
+nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
+			     struct nfsd4_secinfo_no_name *secinfo)
+{
+	return nfsd4_do_encode_secinfo(&resp->xdr, secinfo->sin_exp);
+}
+
+/*
+ * SETATTR differs from the other encoders in that a three-word attribute
+ * bitmap is emitted whether or not the operation succeeded. On failure
+ * the bitmap words are all zero; on success they are sa_bmval[0..2].
+ * The incoming status is passed through as the return value.
+ */
+static __be32
+nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
+{
+	__be32 *bm = xdr_reserve_space(&resp->xdr, 16);
+	int word;
+
+	if (bm == NULL)
+		return nfserr_resource;
+
+	*bm++ = cpu_to_be32(3);		/* bitmap length, in words */
+	for (word = 0; word < 3; word++) {
+		if (nfserr)
+			*bm++ = cpu_to_be32(0);
+		else
+			*bm++ = cpu_to_be32(setattr->sa_bmval[word]);
+	}
+	return nfserr;
+}
+
+/*
+ * Encode the SETCLIENTID result.
+ *
+ * Success:            8-byte clientid followed by the confirm verifier.
+ * nfserr_clid_inuse:  two zero words.
+ * Any other status:   nothing is encoded.
+ *
+ * The incoming status is returned unless reserving reply space fails.
+ */
+static __be32
+nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	/* Errors other than "client id in use" carry no body. */
+	if (nfserr && nfserr != nfserr_clid_inuse)
+		return nfserr;
+
+	if (nfserr) {
+		p = xdr_reserve_space(xdr, 8);
+		if (!p)
+			return nfserr_resource;
+		p[0] = cpu_to_be32(0);
+		p[1] = cpu_to_be32(0);
+		return nfserr;
+	}
+
+	p = xdr_reserve_space(xdr, 8 + NFS4_VERIFIER_SIZE);
+	if (!p)
+		return nfserr_resource;
+	p = xdr_encode_opaque_fixed(p, &scd->se_clientid, 8);
+	xdr_encode_opaque_fixed(p, &scd->se_confirm, NFS4_VERIFIER_SIZE);
+	return nfserr;
+}
+
+/*
+ * Encode the WRITE result: bytes written, the wr_how_written value, and
+ * the write verifier (16 bytes in total).
+ */
+static __be32
+nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
+{
+	__be32 *out = xdr_reserve_space(&resp->xdr, 16);
+
+	if (out == NULL)
+		return nfserr_resource;
+
+	out[0] = cpu_to_be32(write->wr_bytes_written);
+	out[1] = cpu_to_be32(write->wr_how_written);
+	xdr_encode_opaque_fixed(out + 2, write->wr_verifier.data,
+				NFS4_VERIFIER_SIZE);
+	return 0;
+}
+/*
+ * Encode the EXCHANGE_ID result: clientid, sequence id, flags, the state
+ * protection selector (plus two bitmaps for SP4_MACH_CRED), the server
+ * owner (minor id 0 and a major id), the server scope, and an empty
+ * implementation-id array.
+ *
+ * Both the major id and the server scope are taken from nn->nfsd_name.
+ * An unexpected spa_how value only triggers a one-time warning; encoding
+ * continues. The incoming @nfserr is not consulted.
+ */
+static __be32
+nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_exchange_id *exid)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ __be32 *p;
+ char *major_id;
+ char *server_scope;
+ int major_id_sz;
+ int server_scope_sz;
+ uint64_t minor_id = 0;
+ struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id);
+
+ major_id = nn->nfsd_name;
+ major_id_sz = strlen(nn->nfsd_name);
+ server_scope = nn->nfsd_name;
+ server_scope_sz = strlen(nn->nfsd_name);
+
+ p = xdr_reserve_space(xdr,
+ 8 /* eir_clientid */ +
+ 4 /* eir_sequenceid */ +
+ 4 /* eir_flags */ +
+ 4 /* spr_how */);
+ if (!p)
+ return nfserr_resource;
+
+ p = xdr_encode_opaque_fixed(p, &exid->clientid, 8);
+ *p++ = cpu_to_be32(exid->seqid);
+ *p++ = cpu_to_be32(exid->flags);
+
+ *p++ = cpu_to_be32(exid->spa_how);
+
+ switch (exid->spa_how) {
+ case SP4_NONE:
+ break;
+ case SP4_MACH_CRED:
+ /* spo_must_enforce bitmap: */
+ nfserr = nfsd4_encode_bitmap(xdr,
+ exid->spo_must_enforce[0],
+ exid->spo_must_enforce[1],
+ exid->spo_must_enforce[2]);
+ if (nfserr)
+ return nfserr;
+ /* spo_must_allow bitmap: */
+ nfserr = nfsd4_encode_bitmap(xdr,
+ exid->spo_must_allow[0],
+ exid->spo_must_allow[1],
+ exid->spo_must_allow[2]);
+ if (nfserr)
+ return nfserr;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+
+ p = xdr_reserve_space(xdr,
+ 8 /* so_minor_id */ +
+ 4 /* so_major_id.len */ +
+ (XDR_QUADLEN(major_id_sz) * 4) +
+ 4 /* eir_server_scope.len */ +
+ (XDR_QUADLEN(server_scope_sz) * 4) +
+ 4 /* eir_server_impl_id.count (0) */);
+ if (!p)
+ return nfserr_resource;
+
+ /* The server_owner struct */
+ p = xdr_encode_hyper(p, minor_id); /* Minor id */
+ /* major id */
+ p = xdr_encode_opaque(p, major_id, major_id_sz);
+
+ /* Server scope */
+ p = xdr_encode_opaque(p, server_scope, server_scope_sz);
+
+ /* Implementation id */
+ *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */
+ return 0;
+}
+/*
+ * Encode the CREATE_SESSION result: session id, sequence id and flags,
+ * followed by the fore channel attributes and then the back channel
+ * attributes.
+ *
+ * Each channel block is seven words (headerpadsz is always sent as 0),
+ * followed by one extra word holding rdma_attrs when nr_rdma_attrs is
+ * non-zero. The incoming @nfserr is not consulted.
+ */
+static __be32
+nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_create_session *sess)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 24);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque_fixed(p, sess->sessionid.data,
+ NFS4_MAX_SESSIONID_LEN);
+ *p++ = cpu_to_be32(sess->seqid);
+ *p++ = cpu_to_be32(sess->flags);
+
+ p = xdr_reserve_space(xdr, 28);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(0); /* headerpadsz */
+ *p++ = cpu_to_be32(sess->fore_channel.maxreq_sz);
+ *p++ = cpu_to_be32(sess->fore_channel.maxresp_sz);
+ *p++ = cpu_to_be32(sess->fore_channel.maxresp_cached);
+ *p++ = cpu_to_be32(sess->fore_channel.maxops);
+ *p++ = cpu_to_be32(sess->fore_channel.maxreqs);
+ *p++ = cpu_to_be32(sess->fore_channel.nr_rdma_attrs);
+
+ if (sess->fore_channel.nr_rdma_attrs) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(sess->fore_channel.rdma_attrs);
+ }
+
+ p = xdr_reserve_space(xdr, 28);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(0); /* headerpadsz */
+ *p++ = cpu_to_be32(sess->back_channel.maxreq_sz);
+ *p++ = cpu_to_be32(sess->back_channel.maxresp_sz);
+ *p++ = cpu_to_be32(sess->back_channel.maxresp_cached);
+ *p++ = cpu_to_be32(sess->back_channel.maxops);
+ *p++ = cpu_to_be32(sess->back_channel.maxreqs);
+ *p++ = cpu_to_be32(sess->back_channel.nr_rdma_attrs);
+
+ if (sess->back_channel.nr_rdma_attrs) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(sess->back_channel.rdma_attrs);
+ }
+ return 0;
+}
+/*
+ * Encode the SEQUENCE result: session id, sequence id, slot id, highest
+ * slot id, target highest slot id (both sent as maxslots - 1) and the
+ * status flags.
+ *
+ * After encoding, the current reply length is recorded in
+ * resp->cstate.data_offset. The incoming @nfserr is not consulted.
+ */
+static __be32
+nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_sequence *seq)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque_fixed(p, seq->sessionid.data,
+ NFS4_MAX_SESSIONID_LEN);
+ *p++ = cpu_to_be32(seq->seqid);
+ *p++ = cpu_to_be32(seq->slotid);
+ /* Note slotid's are numbered from zero: */
+ *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_highest_slotid */
+ *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_target_highest_slotid */
+ *p++ = cpu_to_be32(seq->status_flags);
+
+ resp->cstate.data_offset = xdr->buf->len; /* DRC cache data pointer */
+ return 0;
+}
+/*
+ * Encode the TEST_STATEID result: a count word followed by one status
+ * word per entry on ts_stateid_list.
+ *
+ * Space is reserved for ts_num_ids status words, but the loop writes one
+ * word per list entry.
+ * NOTE(review): this relies on the list holding exactly ts_num_ids
+ * entries -- confirm against the decoder that builds the list.
+ * NOTE(review): ts_id_status is stored without a byte-order conversion,
+ * so it is presumably already in wire order -- confirm.
+ */
+static __be32
+nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_test_stateid *test_stateid)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ struct nfsd4_test_stateid_id *stateid, *next;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4 + (4 * test_stateid->ts_num_ids));
+ if (!p)
+ return nfserr_resource;
+ *p++ = htonl(test_stateid->ts_num_ids);
+
+ list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) {
+ *p++ = stateid->ts_id_status;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_NFSD_PNFS
+static __be32 /*
+ * Encode the GETDEVICEINFO result: the layout type, the layout driver's
+ * device description (ops->encode_getdeviceinfo), and a notification
+ * bitmap that is one word long when gd_notify_types is set and empty
+ * otherwise.
+ *
+ * If the driver fails and the reply would exceed gd_maxcount, everything
+ * encoded so far is discarded, the length that would have been needed is
+ * encoded instead, and nfserr_toosmall is returned. Any other driver
+ * failure is returned as is. The incoming @nfserr is overwritten.
+ */
+nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_getdeviceinfo *gdev)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ const struct nfsd4_layout_ops *ops;
+ u32 starting_len = xdr->buf->len, needed_len;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+
+ *p++ = cpu_to_be32(gdev->gd_layout_type);
+
+ ops = nfsd4_layout_ops[gdev->gd_layout_type];
+ nfserr = ops->encode_getdeviceinfo(xdr, gdev);
+ if (nfserr) {
+ /*
+ * We don't bother to burden the layout drivers with
+ * enforcing gd_maxcount, just tell the client to
+ * come back with a bigger buffer if it's not enough.
+ */
+ if (xdr->buf->len + 4 > gdev->gd_maxcount)
+ goto toosmall;
+ return nfserr;
+ }
+
+ if (gdev->gd_notify_types) {
+ p = xdr_reserve_space(xdr, 4 + 4);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(1); /* bitmap length */
+ *p++ = cpu_to_be32(gdev->gd_notify_types);
+ } else {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ *p++ = 0; /* empty bitmap */
+ }
+
+ return 0;
+toosmall:
+ dprintk("%s: maxcount too small\n", __func__);
+ needed_len = xdr->buf->len + 4 /* notifications */;
+ xdr_truncate_encode(xdr, starting_len);
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(needed_len);
+ return nfserr_toosmall;
+}
+/*
+ * Encode the LAYOUTGET result: return-on-close (always 1), the layout
+ * stateid, a layout count (always 1), the segment's offset, length and
+ * iomode, and the layout type. The layout body itself is then appended by
+ * the layout driver's encode_layoutget method, whose status is returned.
+ *
+ * The incoming @nfserr is not consulted.
+ */
+static __be32
+nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_layoutget *lgp)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ const struct nfsd4_layout_ops *ops;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t));
+ if (!p)
+ return nfserr_resource;
+
+ *p++ = cpu_to_be32(1); /* we always set return-on-close */
+ *p++ = cpu_to_be32(lgp->lg_sid.si_generation);
+ p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque,
+ sizeof(stateid_opaque_t));
+
+ *p++ = cpu_to_be32(1); /* we always return a single layout */
+ p = xdr_encode_hyper(p, lgp->lg_seg.offset);
+ p = xdr_encode_hyper(p, lgp->lg_seg.length);
+ *p++ = cpu_to_be32(lgp->lg_seg.iomode);
+ *p++ = cpu_to_be32(lgp->lg_layout_type);
+
+ ops = nfsd4_layout_ops[lgp->lg_layout_type];
+ return ops->encode_layoutget(xdr, lgp);
+}
+
+/*
+ * Encode the LAYOUTCOMMIT result: a "size changed" word, followed by the
+ * new 64-bit size only when that word is non-zero.
+ */
+static __be32
+nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
+			  struct nfsd4_layoutcommit *lcp)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p = xdr_reserve_space(xdr, 4);
+
+	if (!p)
+		return nfserr_resource;
+	*p = cpu_to_be32(lcp->lc_size_chg);
+
+	/* Nothing more to send when the size did not change. */
+	if (!lcp->lc_size_chg)
+		return 0;
+
+	p = xdr_reserve_space(xdr, 8);
+	if (!p)
+		return nfserr_resource;
+	xdr_encode_hyper(p, lcp->lc_newsize);
+	return 0;
+}
+
+/*
+ * Encode the LAYOUTRETURN result: a "stateid present" word, followed by
+ * the layout stateid only when that word is non-zero.
+ */
+static __be32
+nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
+			  struct nfsd4_layoutreturn *lrp)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *present = xdr_reserve_space(xdr, 4);
+
+	if (present == NULL)
+		return nfserr_resource;
+	*present = cpu_to_be32(lrp->lrs_present);
+
+	if (!lrp->lrs_present)
+		return 0;
+	return nfsd4_encode_stateid(xdr, &lrp->lr_sid);
+}
+#endif /* CONFIG_NFSD_PNFS */
+
+/*
+ * Encode a write response as used by COPY: an array of zero or one
+ * callback stateids (none when @sync is true, cb_stateid otherwise),
+ * followed by the 64-bit byte count, the stable_how value and the write
+ * verifier.
+ */
+static __be32
+nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
+		struct nfsd42_write_res *write, bool sync)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	__be32 *p;
+
+	/* Stateid array length: 0 for a synchronous copy, 1 otherwise. */
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return nfserr_resource;
+	*p = cpu_to_be32(sync ? 0 : 1);
+
+	if (!sync) {
+		__be32 status = nfsd4_encode_stateid(xdr, &write->cb_stateid);
+
+		if (status)
+			return status;
+	}
+
+	p = xdr_reserve_space(xdr, 8 + 4 + NFS4_VERIFIER_SIZE);
+	if (!p)
+		return nfserr_resource;
+
+	p = xdr_encode_hyper(p, write->wr_bytes_written);
+	*p++ = cpu_to_be32(write->wr_stable_how);
+	xdr_encode_opaque_fixed(p, write->wr_verifier.data,
+				NFS4_VERIFIER_SIZE);
+	return nfs_ok;
+}
+
+/*
+ * Encode a struct nl4_server: the location type word followed by the
+ * type-specific body.
+ *
+ * Only NL4_NETADDR is supported; its body is the netid and the universal
+ * address, each sent as a length word plus padded bytes. Any other type
+ * triggers a one-time warning and nfserr_inval (the type word has already
+ * been written at that point).
+ *
+ * Returns nfserr_resource when the reply buffer has no room.
+ */
+static __be32
+nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	struct nfs42_netaddr *addr;
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4);
+	/* xdr_reserve_space() returns NULL when the buffer is exhausted. */
+	if (!p)
+		return nfserr_resource;
+	*p++ = cpu_to_be32(ns->nl4_type);
+
+	switch (ns->nl4_type) {
+	case NL4_NETADDR:
+		addr = &ns->u.nl4_addr;
+
+		/* netid_len, netid, uaddr_len, uaddr (port included
+		 * in RPCBIND_MAXUADDRLEN)
+		 */
+		p = xdr_reserve_space(xdr,
+			4 /* netid len */ +
+			(XDR_QUADLEN(addr->netid_len) * 4) +
+			4 /* uaddr len */ +
+			(XDR_QUADLEN(addr->addr_len) * 4));
+		if (!p)
+			return nfserr_resource;
+
+		*p++ = cpu_to_be32(addr->netid_len);
+		p = xdr_encode_opaque_fixed(p, addr->netid,
+					    addr->netid_len);
+		*p++ = cpu_to_be32(addr->addr_len);
+		p = xdr_encode_opaque_fixed(p, addr->addr,
+					    addr->addr_len);
+		break;
+	default:
+		WARN_ON_ONCE(ns->nl4_type != NL4_NETADDR);
+		return nfserr_inval;
+	}
+
+	return 0;
+}
+
+/*
+ * Encode the COPY result: the write response (see
+ * nfsd42_encode_write_res), then cr_consecutive (always true) and a word
+ * carrying cp_synchronous.
+ *
+ * The incoming @nfserr is overwritten. Returns nfserr_resource when the
+ * reply buffer has no room.
+ */
+static __be32
+nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
+		  struct nfsd4_copy *copy)
+{
+	__be32 *p;
+
+	nfserr = nfsd42_encode_write_res(resp, &copy->cp_res,
+					 copy->cp_synchronous);
+	if (nfserr)
+		return nfserr;
+
+	p = xdr_reserve_space(&resp->xdr, 4 + 4);
+	/* xdr_reserve_space() returns NULL when the buffer is exhausted. */
+	if (!p)
+		return nfserr_resource;
+	*p++ = xdr_one; /* cr_consecutive */
+	*p++ = cpu_to_be32(copy->cp_synchronous);
+	return 0;
+}
+
+/*
+ * Encode the OFFLOAD_STATUS result: the 64-bit byte count followed by a
+ * zero word. The incoming status is passed through as the return value.
+ */
+static __be32
+nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
+			    struct nfsd4_offload_status *os)
+{
+	__be32 *out = xdr_reserve_space(&resp->xdr, 8 + 4);
+
+	if (out == NULL)
+		return nfserr_resource;
+
+	out = xdr_encode_hyper(out, os->count);
+	*out = cpu_to_be32(0);
+	return nfserr;
+}
+/*
+ * Encode one NFS4_CONTENT_DATA segment of a READ_PLUS reply: content
+ * type, 64-bit offset, byte count, then the data itself.
+ *
+ * The end of the data run is *pos when @pos is given, otherwise it is
+ * found with SEEK_HOLE. *maxcount is clamped to that run and to the space
+ * left in the reply buffer, and on return holds the number of bytes read
+ * by nfsd_readv(). *eof is set by nfsd_readv().
+ *
+ * The 16-byte header is reserved first and backfilled after the read,
+ * once the real length is known; pad bytes after the data are zeroed.
+ */
+static __be32
+nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
+ struct nfsd4_read *read,
+ unsigned long *maxcount, u32 *eof,
+ loff_t *pos)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ struct file *file = read->rd_nf->nf_file;
+ int starting_len = xdr->buf->len;
+ loff_t hole_pos;
+ __be32 nfserr;
+ __be32 *p, tmp;
+ __be64 tmp64;
+
+ hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
+ if (hole_pos > read->rd_offset)
+ *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
+ *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
+
+ /* Content type, offset, byte count */
+ p = xdr_reserve_space(xdr, 4 + 8 + 4);
+ if (!p)
+ return nfserr_resource;
+
+ read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
+ if (read->rd_vlen < 0)
+ return nfserr_resource;
+
+ nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
+ resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
+ if (nfserr)
+ return nfserr;
+ xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));
+
+ tmp = htonl(NFS4_CONTENT_DATA);
+ write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
+ tmp64 = cpu_to_be64(read->rd_offset);
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8);
+ tmp = htonl(*maxcount);
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4);
+
+ tmp = xdr_zero;
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
+ xdr_pad_size(*maxcount));
+ return nfs_ok;
+}
+/*
+ * Encode one NFS4_CONTENT_HOLE segment of a READ_PLUS reply: content
+ * type, 64-bit offset and 64-bit length of the hole.
+ *
+ * The hole runs from rd_offset to the next data position reported by
+ * SEEK_DATA, or to the file size when SEEK_DATA returns -ENXIO. If the
+ * next data is at or before rd_offset, or lies inside the file at a
+ * position that is not page-aligned, a data segment is encoded instead,
+ * with the file size passed as the end of the data run.
+ *
+ * On return *eof says whether the hole reaches the file size and
+ * *maxcount is clamped to the hole length.
+ */
+static __be32
+nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
+ struct nfsd4_read *read,
+ unsigned long *maxcount, u32 *eof)
+{
+ struct file *file = read->rd_nf->nf_file;
+ loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
+ loff_t f_size = i_size_read(file_inode(file));
+ unsigned long count;
+ __be32 *p;
+
+ if (data_pos == -ENXIO)
+ data_pos = f_size;
+ else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
+ return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
+ count = data_pos - read->rd_offset;
+
+ /* Content type, offset, byte count */
+ p = xdr_reserve_space(&resp->xdr, 4 + 8 + 8);
+ if (!p)
+ return nfserr_resource;
+
+ *p++ = htonl(NFS4_CONTENT_HOLE);
+ p = xdr_encode_hyper(p, read->rd_offset);
+ p = xdr_encode_hyper(p, count);
+
+ *eof = (read->rd_offset + count) >= f_size;
+ *maxcount = min_t(unsigned long, count, *maxcount);
+ return nfs_ok;
+}
+/*
+ * Encode the READ_PLUS result: eof flag, segment count, then alternating
+ * data and hole segments until the requested length is used up or eof is
+ * reached.
+ *
+ * The first segment is data when SEEK_HOLE from rd_offset lands beyond
+ * rd_offset, otherwise a hole; the kind is then flipped after every
+ * segment. read->rd_offset is advanced as segments are encoded.
+ *
+ * Error handling: a failure before any segment was encoded truncates the
+ * reply to where this op began and returns the error. A failure after at
+ * least one segment drops only the failed segment, clears eof and reports
+ * success for the segments already encoded.
+ *
+ * The eof flag and segment count are backfilled at the end into the two
+ * words reserved at the start.
+ */
+static __be32
+nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_read *read)
+{
+ unsigned long maxcount, count;
+ struct xdr_stream *xdr = &resp->xdr;
+ struct file *file;
+ int starting_len = xdr->buf->len;
+ int last_segment = xdr->buf->len; /* reply length after the last good segment */
+ int segments = 0;
+ __be32 *p, tmp;
+ bool is_data;
+ loff_t pos;
+ u32 eof;
+
+ if (nfserr)
+ return nfserr;
+ file = read->rd_nf->nf_file;
+
+ /* eof flag, segment count */
+ p = xdr_reserve_space(xdr, 4 + 4);
+ if (!p)
+ return nfserr_resource;
+ xdr_commit_encode(xdr);
+
+ maxcount = svc_max_payload(resp->rqstp);
+ maxcount = min_t(unsigned long, maxcount,
+ (xdr->buf->buflen - xdr->buf->len));
+ maxcount = min_t(unsigned long, maxcount, read->rd_length);
+ count = maxcount;
+
+ eof = read->rd_offset >= i_size_read(file_inode(file));
+ if (eof)
+ goto out;
+
+ pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
+ is_data = pos > read->rd_offset;
+
+ while (count > 0 && !eof) {
+ maxcount = count;
+ if (is_data)
+ nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
+ segments == 0 ? &pos : NULL);
+ else
+ nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
+ if (nfserr)
+ goto out;
+ count -= maxcount;
+ read->rd_offset += maxcount;
+ is_data = !is_data;
+ last_segment = xdr->buf->len;
+ segments++;
+ }
+
+out:
+ if (nfserr && segments == 0)
+ xdr_truncate_encode(xdr, starting_len);
+ else {
+ if (nfserr) {
+ xdr_truncate_encode(xdr, last_segment);
+ nfserr = nfs_ok;
+ eof = 0;
+ }
+ tmp = htonl(eof);
+ write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
+ tmp = htonl(segments);
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
+ }
+
+ return nfserr;
+}
+/*
+ * Encode the COPY_NOTIFY result: lease time (64-bit seconds plus 32-bit
+ * nanoseconds), the copy-notify stateid, a source-server count of 1, and
+ * the single source server from cpn_src.
+ *
+ * Returns the incoming @nfserr unchanged if it is non-zero.
+ */
+static __be32
+nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_copy_notify *cn)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ __be32 *p;
+
+ if (nfserr)
+ return nfserr;
+
+ /* 8 sec, 4 nsec */
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
+ return nfserr_resource;
+
+ /* cnr_lease_time */
+ p = xdr_encode_hyper(p, cn->cpn_sec);
+ *p++ = cpu_to_be32(cn->cpn_nsec);
+
+ /* cnr_stateid */
+ nfserr = nfsd4_encode_stateid(xdr, &cn->cpn_cnr_stateid);
+ if (nfserr)
+ return nfserr;
+
+ /* cnr_src.nl_nsvr */
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+
+ *p++ = cpu_to_be32(1);
+
+ return nfsd42_encode_nl4_server(resp, &cn->cpn_src);
+}
+
+/*
+ * Encode the SEEK result: the eof flag followed by the 64-bit position.
+ *
+ * Returns nfserr_resource when the reply buffer has no room, 0 otherwise.
+ * The incoming @nfserr is not consulted.
+ */
+static __be32
+nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
+		  struct nfsd4_seek *seek)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(&resp->xdr, 4 + 8);
+	/* xdr_reserve_space() returns NULL when the buffer is exhausted. */
+	if (!p)
+		return nfserr_resource;
+	*p++ = cpu_to_be32(seek->seek_eof);
+	p = xdr_encode_hyper(p, seek->seek_pos);
+
+	return 0;
+}
+/*
+ * Encoder for operations that have no result body: nothing is written to
+ * the reply and the incoming status is returned unchanged.
+ */
+static __be32
+nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
+{
+ return nfserr;
+}
+
+/*
+ * Copy @buflen bytes from @buf into the XDR stream.
+ *
+ * The first chunk is sized to whatever lies between xdr->p and xdr->end
+ * (or the whole buffer if it is smaller); the rest is copied in chunks of
+ * at most PAGE_SIZE. When the last chunk is shorter than PAGE_SIZE,
+ * xdr_encode_opaque_fixed() is called with a NULL source to zero the
+ * trailing pad bytes.
+ *
+ * Returns nfserr_resource if space cannot be reserved, 0 otherwise.
+ * NOTE(review): when everything fits in the first chunk no explicit
+ * padding step runs -- confirm that pad bytes are handled elsewhere.
+ */
+static __be32
+nfsd4_vbuf_to_stream(struct xdr_stream *xdr, char *buf, u32 buflen)
+{
+ u32 cplen;
+ __be32 *p;
+
+ cplen = min_t(unsigned long, buflen,
+ ((void *)xdr->end - (void *)xdr->p));
+ p = xdr_reserve_space(xdr, cplen);
+ if (!p)
+ return nfserr_resource;
+
+ memcpy(p, buf, cplen);
+ buf += cplen;
+ buflen -= cplen;
+
+ while (buflen) {
+ cplen = min_t(u32, buflen, PAGE_SIZE);
+ p = xdr_reserve_space(xdr, cplen);
+ if (!p)
+ return nfserr_resource;
+
+ memcpy(p, buf, cplen);
+
+ if (cplen < PAGE_SIZE) {
+ /*
+ * We're done, with a length that wasn't page
+ * aligned, so possibly not word aligned. Pad
+ * any trailing bytes with 0.
+ */
+ xdr_encode_opaque_fixed(p, NULL, cplen);
+ break;
+ }
+
+ buflen -= PAGE_SIZE;
+ buf += PAGE_SIZE;
+ }
+
+ return 0;
+}
+/*
+ * Encode the GETXATTR result: a length word followed by the attribute
+ * value copied from getxa_buf.
+ *
+ * A zero-length value is encoded as the length word alone. Otherwise the
+ * value is streamed with nfsd4_vbuf_to_stream() and getxa_buf is released
+ * with kvfree() whether or not the copy succeeded.
+ *
+ * NOTE(review): the two early returns (reserve failure, zero length) do
+ * not call kvfree() -- confirm getxa_buf cannot be allocated on those
+ * paths, or that it is released elsewhere.
+ */
+static __be32
+nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_getxattr *getxattr)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ __be32 *p, err;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+
+ *p = cpu_to_be32(getxattr->getxa_len);
+
+ if (getxattr->getxa_len == 0)
+ return 0;
+
+ err = nfsd4_vbuf_to_stream(xdr, getxattr->getxa_buf,
+ getxattr->getxa_len);
+
+ kvfree(getxattr->getxa_buf);
+
+ return err;
+}
+
+/*
+ * Encode the SETXATTR result: a single change_info (20 bytes) taken from
+ * setxa_cinfo.
+ */
+static __be32
+nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
+		      struct nfsd4_setxattr *setxattr)
+{
+	__be32 *cinfo_buf = xdr_reserve_space(&resp->xdr, 20);
+
+	if (cinfo_buf == NULL)
+		return nfserr_resource;
+
+	encode_cinfo(cinfo_buf, &setxattr->setxa_cinfo);
+	return 0;
+}
+
+/*
+ * See if there are cookie values that can be rejected outright.
+ *
+ * The cookie is used as a count of "user." attributes to skip. It is
+ * rejected with nfserr_badcookie when it exceeds either bound computed
+ * below: one derived from the listxattr buffer length (lsxa_len) and the
+ * shortest possible name, the other derived from lsxa_maxcount.
+ *
+ * On success the cookie is stored, truncated to 32 bits, in *offsetp and
+ * 0 is returned.
+ */
+static __be32
+nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs,
+ u32 *offsetp)
+{
+ u64 cookie = listxattrs->lsxa_cookie;
+
+ /*
+ * If the cookie is larger than the maximum number we can fit
+ * in either the buffer we just got back from vfs_listxattr, or,
+ * XDR-encoded, in the return buffer, it's invalid.
+ */
+ if (cookie > (listxattrs->lsxa_len) / (XATTR_USER_PREFIX_LEN + 2))
+ return nfserr_badcookie;
+
+ if (cookie > (listxattrs->lsxa_maxcount /
+ (XDR_QUADLEN(XATTR_USER_PREFIX_LEN + 2) + 4)))
+ return nfserr_badcookie;
+
+ *offsetp = (u32)cookie;
+ return 0;
+}
+
+/*
+ * Encode the LISTXATTRS result: a 64-bit cookie, a count word, the
+ * attribute names (with the "user." prefix removed), and an eof flag.
+ *
+ * lsxa_buf holds lsxa_len bytes of NUL-terminated names. Names that do
+ * not start with "user." are skipped. The incoming cookie is the number
+ * of "user." names to skip before encoding starts; the cookie sent back
+ * is that number plus the number of names encoded here.
+ *
+ * Encoding stops with eof = 0 when the next name would exceed
+ * lsxa_maxcount. If not even the first name fits, nfserr_toosmall is
+ * returned; if "user." names exist but the cookie skipped all of them,
+ * nfserr_badcookie is returned.
+ *
+ * The cookie and count are reserved first and backfilled at the end. The
+ * cookie is converted to big-endian before it is written, as every other
+ * 64-bit value in the reply is.
+ *
+ * lsxa_buf is released with kvfree() on every path when lsxa_len is
+ * non-zero. The incoming @nfserr is not consulted.
+ */
+static __be32
+nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
+			struct nfsd4_listxattrs *listxattrs)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	u32 cookie_offset, count_offset, eof;
+	u32 left, xdrleft, slen, count;
+	u32 xdrlen, offset;
+	u64 cookie;
+	__be64 wire_cookie;
+	char *sp;
+	__be32 status, tmp;
+	__be32 *p;
+	u32 nuser;
+
+	eof = 1;
+
+	status = nfsd4_listxattr_validate_cookie(listxattrs, &offset);
+	if (status)
+		goto out;
+
+	/*
+	 * Reserve space for the cookie and the name array count. Record
+	 * the offsets to save them later.
+	 */
+	cookie_offset = xdr->buf->len;
+	count_offset = cookie_offset + 8;
+	p = xdr_reserve_space(xdr, 12);
+	if (!p) {
+		status = nfserr_resource;
+		goto out;
+	}
+
+	count = 0;
+	left = listxattrs->lsxa_len;
+	sp = listxattrs->lsxa_buf;
+	nuser = 0;
+
+	xdrleft = listxattrs->lsxa_maxcount;
+
+	while (left > 0 && xdrleft > 0) {
+		slen = strlen(sp);
+
+		/*
+		 * Check if this is a "user." attribute, skip it if not.
+		 */
+		if (strncmp(sp, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+			goto contloop;
+
+		slen -= XATTR_USER_PREFIX_LEN;
+		xdrlen = 4 + ((slen + 3) & ~3);
+		if (xdrlen > xdrleft) {
+			if (count == 0) {
+				/*
+				 * Can't even fit the first attribute name.
+				 */
+				status = nfserr_toosmall;
+				goto out;
+			}
+			eof = 0;
+			goto wreof;
+		}
+
+		left -= XATTR_USER_PREFIX_LEN;
+		sp += XATTR_USER_PREFIX_LEN;
+		if (nuser++ < offset)
+			goto contloop;
+
+
+		p = xdr_reserve_space(xdr, xdrlen);
+		if (!p) {
+			status = nfserr_resource;
+			goto out;
+		}
+
+		xdr_encode_opaque(p, sp, slen);
+
+		xdrleft -= xdrlen;
+		count++;
+contloop:
+		sp += slen + 1;
+		left -= slen + 1;
+	}
+
+	/*
+	 * If there were user attributes to copy, but we didn't copy
+	 * any, the offset was too large (e.g. the cookie was invalid).
+	 */
+	if (nuser > 0 && count == 0) {
+		status = nfserr_badcookie;
+		goto out;
+	}
+
+wreof:
+	p = xdr_reserve_space(xdr, 4);
+	if (!p) {
+		status = nfserr_resource;
+		goto out;
+	}
+	*p = cpu_to_be32(eof);
+
+	cookie = offset + count;
+
+	/*
+	 * XDR hypers are big-endian on the wire; writing the host-order
+	 * value would hand little-endian clients a byte-swapped cookie.
+	 */
+	wire_cookie = cpu_to_be64(cookie);
+	write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &wire_cookie, 8);
+	tmp = cpu_to_be32(count);
+	write_bytes_to_xdr_buf(xdr->buf, count_offset, &tmp, 4);
+out:
+	if (listxattrs->lsxa_len)
+		kvfree(listxattrs->lsxa_buf);
+	return status;
+}
+
+/*
+ * Encode the REMOVEXATTR result, which is just the change info held in
+ * rmxa_cinfo.  20 bytes are reserved for it.  @nfserr is not consulted.
+ *
+ * Returns 0, or nfserr_resource when the space cannot be reserved.
+ */
+static __be32
+nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr,
+			 struct nfsd4_removexattr *removexattr)
+{
+	__be32 *buf = xdr_reserve_space(&resp->xdr, 20);
+
+	if (buf == NULL)
+		return nfserr_resource;
+
+	encode_cinfo(buf, &removexattr->rmxa_cinfo);
+	return 0;
+}
+
+/*
+ * Common signature of a result encoder: the compound reply being built,
+ * the status of the operation, and the operation's result union passed
+ * as an untyped pointer.
+ */
+typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
+
+/*
+ * Table of result encoders, indexed by operation number.
+ *
+ * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1
+ * since we don't need to filter out obsolete ops as this is
+ * done in the decoding phase.
+ *
+ * Operations mapped to nfsd4_encode_noop have no dedicated encoder.
+ * nfsd4_encode_operation() BUG()s on an opnum whose slot is NULL or
+ * lies outside the table.
+ */
+static const nfsd4_enc nfsd4_enc_ops[] = {
+ [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access,
+ [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close,
+ [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit,
+ [OP_CREATE] = (nfsd4_enc)nfsd4_encode_create,
+ [OP_DELEGPURGE] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_DELEGRETURN] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_GETATTR] = (nfsd4_enc)nfsd4_encode_getattr,
+ [OP_GETFH] = (nfsd4_enc)nfsd4_encode_getfh,
+ [OP_LINK] = (nfsd4_enc)nfsd4_encode_link,
+ [OP_LOCK] = (nfsd4_enc)nfsd4_encode_lock,
+ [OP_LOCKT] = (nfsd4_enc)nfsd4_encode_lockt,
+ [OP_LOCKU] = (nfsd4_enc)nfsd4_encode_locku,
+ [OP_LOOKUP] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open,
+ [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm,
+ [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade,
+ [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_PUTPUBFH] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_PUTROOTFH] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_READ] = (nfsd4_enc)nfsd4_encode_read,
+ [OP_READDIR] = (nfsd4_enc)nfsd4_encode_readdir,
+ [OP_READLINK] = (nfsd4_enc)nfsd4_encode_readlink,
+ [OP_REMOVE] = (nfsd4_enc)nfsd4_encode_remove,
+ [OP_RENAME] = (nfsd4_enc)nfsd4_encode_rename,
+ [OP_RENEW] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_RESTOREFH] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_SAVEFH] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_SECINFO] = (nfsd4_enc)nfsd4_encode_secinfo,
+ [OP_SETATTR] = (nfsd4_enc)nfsd4_encode_setattr,
+ [OP_SETCLIENTID] = (nfsd4_enc)nfsd4_encode_setclientid,
+ [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write,
+ [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop,
+
+ /* NFSv4.1 operations */
+ [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session,
+ [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
+ [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
+ [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
+#ifdef CONFIG_NFSD_PNFS
+ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo,
+ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit,
+ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget,
+ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn,
+#else
+ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
+#endif
+ [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
+ [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
+ [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_test_stateid,
+ [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
+
+ /* NFSv4.2 operations */
+ [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_COPY] = (nfsd4_enc)nfsd4_encode_copy,
+ [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_copy_notify,
+ [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_offload_status,
+ [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_read_plus,
+ [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
+ [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop,
+
+ /* RFC 8276 extended attributes operations */
+ [OP_GETXATTR] = (nfsd4_enc)nfsd4_encode_getxattr,
+ [OP_SETXATTR] = (nfsd4_enc)nfsd4_encode_setxattr,
+ [OP_LISTXATTRS] = (nfsd4_enc)nfsd4_encode_listxattrs,
+ [OP_REMOVEXATTR] = (nfsd4_enc)nfsd4_encode_removexattr,
+};
+
+/*
+ * Decide whether @respsize more bytes may still be added to the reply.
+ *
+ * Two limits are in play:
+ *  - for NFS versions >= 4.1 the reply has to stay within the session
+ *    limits;
+ *  - for every NFS version the reply has to stay within the limited
+ *    preallocated buffer space.
+ *
+ * Because this runs before the operation is processed it can only work
+ * from an upper estimate.  For operations without significant side
+ * effects (such as getattr) a wrong estimate is harmless: the operation
+ * can simply be failed after processing.
+ *
+ * Returns nfs_ok when the bytes fit.  Otherwise returns nfserr_resource
+ * for sessionless requests, nfserr_rep_too_big_to_cache (with a one-time
+ * warning) when the slot asked for the reply to be cached, and
+ * nfserr_rep_too_big in the remaining case.
+ */
+__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 respsize)
+{
+	struct xdr_buf *res = &resp->rqstp->rq_res;
+
+	if (res->len + respsize > res->buflen) {
+		if (!nfsd4_has_session(&resp->cstate))
+			return nfserr_resource;
+
+		if (resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS) {
+			WARN_ON_ONCE(1);
+			return nfserr_rep_too_big_to_cache;
+		}
+		return nfserr_rep_too_big;
+	}
+
+	return nfs_ok;
+}
+
+/*
+ * Encode one operation of a compound reply: the operation number, its
+ * status word and, via the nfsd4_enc_ops[] encoder, its result body.
+ *
+ * The status word is reserved first and patched in at the very end
+ * (label "status"), because the encoder and the size checks below may
+ * still change op->status.
+ */
+void
+nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ struct nfs4_stateowner *so = resp->cstate.replay_owner;
+ struct svc_rqst *rqstp = resp->rqstp;
+ const struct nfsd4_operation *opdesc = op->opdesc;
+ int post_err_offset;
+ nfsd4_enc encoder;
+ __be32 *p;
+
+ /* 4 bytes for the opnum plus 4 bytes for the status filled in later */
+ p = xdr_reserve_space(xdr, 8);
+ if (!p) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ *p++ = cpu_to_be32(op->opnum);
+ /* Offset of the first byte after the status word */
+ post_err_offset = xdr->buf->len;
+
+ /* Nothing beyond the status is encoded for these two cases */
+ if (op->opnum == OP_ILLEGAL)
+ goto status;
+ if (op->status && opdesc &&
+ !(opdesc->op_flags & OP_NONTRIVIAL_ERROR_ENCODE))
+ goto status;
+ BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
+ !nfsd4_enc_ops[op->opnum]);
+ encoder = nfsd4_enc_ops[op->opnum];
+ op->status = encoder(resp, op->status, &op->u);
+ /* Release per-op resources only after the encoder has used them */
+ if (opdesc && opdesc->op_release)
+ opdesc->op_release(&op->u);
+ xdr_commit_encode(xdr);
+
+ /* nfsd4_check_resp_size guarantees enough room for error status */
+ if (!op->status) {
+ int space_needed = 0;
+ if (!nfsd4_last_compound_op(rqstp))
+ space_needed = COMPOUND_ERR_SLACK_SPACE;
+ op->status = nfsd4_check_resp_size(resp, space_needed);
+ }
+ /* With a session, nfserr_resource is replaced by a rep_too_big error */
+ if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) {
+ struct nfsd4_slot *slot = resp->cstate.slot;
+
+ if (slot->sl_flags & NFSD4_SLOT_CACHETHIS)
+ op->status = nfserr_rep_too_big_to_cache;
+ else
+ op->status = nfserr_rep_too_big;
+ }
+ if (op->status == nfserr_resource ||
+ op->status == nfserr_rep_too_big ||
+ op->status == nfserr_rep_too_big_to_cache) {
+ /*
+ * The operation may have already been encoded or
+ * partially encoded. No op returns anything additional
+ * in the case of one of these three errors, so we can
+ * just truncate back to after the status. But it's a
+ * bug if we had to do this on a non-idempotent op:
+ */
+ warn_on_nonidempotent_op(op);
+ xdr_truncate_encode(xdr, post_err_offset);
+ }
+ if (so) {
+ /* Save status and encoded body in the stateowner's replay cache */
+ /* NOTE(review): len is not bounded here; presumably rp_buf is */
+ /* sized for every op that sets replay_owner - confirm. */
+ int len = xdr->buf->len - post_err_offset;
+
+ so->so_replay.rp_status = op->status;
+ so->so_replay.rp_buflen = len;
+ read_bytes_from_xdr_buf(xdr->buf, post_err_offset,
+ so->so_replay.rp_buf, len);
+ }
+status:
+ /* Note that op->status is already in network byte order: */
+ write_bytes_to_xdr_buf(xdr->buf, post_err_offset - 4, &op->status, 4);
+}
+
+/*
+ * Emit a reply taken from the stateowner reply cache: the operation
+ * number, the saved status and the saved, already encoded body.
+ *
+ * XDR note: rp->rp_buflen itself is not put on the wire, because the
+ * buffer holds an operation that was fully encoded when first sent.
+ */
+void
+nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op)
+{
+	struct nfs4_replay *cached = op->replay;
+	__be32 *wp = xdr_reserve_space(xdr, 8 + cached->rp_buflen);
+
+	if (!wp) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	wp[0] = cpu_to_be32(op->opnum);
+	wp[1] = cached->rp_status;	/* already xdr'ed */
+	xdr_encode_opaque_fixed(wp + 2, cached->rp_buf, cached->rp_buflen);
+}
+
+/*
+ * Encoder for a reply without a body: nothing is written, the result is
+ * whatever xdr_ressize_check() reports for the unchanged position @p.
+ */
+int
+nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
+{
+ return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * Free everything the compound decoder allocated for this request: a
+ * separately allocated ops array (anything other than the inline iops),
+ * the tmpp buffer and the chain of temporary buffers on to_free.  The
+ * corresponding fields are reset as they are released.
+ */
+void nfsd4_release_compoundargs(struct svc_rqst *rqstp)
+{
+	struct nfsd4_compoundargs *args = rqstp->rq_argp;
+	struct svcxdr_tmpbuf *tb;
+
+	if (args->ops != args->iops) {
+		kfree(args->ops);
+		args->ops = args->iops;
+	}
+
+	kfree(args->tmpp);
+	args->tmpp = NULL;
+
+	/* Unlink each buffer from the head of the chain before freeing it */
+	while ((tb = args->to_free) != NULL) {
+		args->to_free = tb->next;
+		kfree(tb);
+	}
+}
+
+/*
+ * Decoder for a call without arguments: nothing is read and 1 is
+ * always returned.
+ */
+int
+nfs4svc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
+{
+ return 1;
+}
+
+/*
+ * Prepare the compound argument structure for decoding and run the
+ * compound decoder over the request.
+ *
+ * A request whose head is not a multiple of four bytes long is rejected
+ * straight away.  Returns 1 when nfsd4_decode_compound() reports
+ * success (zero) and 0 otherwise.
+ */
+int
+nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd4_compoundargs *args = rqstp->rq_argp;
+	struct xdr_buf *arg = &rqstp->rq_arg;
+	struct kvec *head = &arg->head[0];
+
+	if (head->iov_len % 4) {
+		/* client is nuts */
+		dprintk("%s: compound not properly padded! (peeraddr=%pISc xid=0x%x)",
+			__func__, svc_addr(rqstp), be32_to_cpu(rqstp->rq_xid));
+		return 0;
+	}
+
+	/* Decoding window within the head, followed by the page data */
+	args->p = p;
+	args->end = head->iov_base + head->iov_len;
+	args->pagelist = arg->pages;
+	args->pagelen = arg->page_len;
+	args->tail = false;
+
+	/* No temporary allocations yet; start with the inline ops array */
+	args->tmpp = NULL;
+	args->to_free = NULL;
+	args->ops = args->iops;
+	args->rqstp = rqstp;
+
+	return !nfsd4_decode_compound(args);
+}
+
+/*
+ * Finish a compound reply: store the overall status at @p, then fill in
+ * the tag length, tag and operation count at the position saved in
+ * resp->tagp, and complete sequence processing.  Always returns 1.
+ */
+int
+nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct xdr_buf *buf = resp->xdr.buf;
+	__be32 *tagp;
+
+	/* The total length must equal the sum of head, pages and tail */
+	WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len +
+				 buf->tail[0].iov_len);
+
+	*p = resp->cstate.status;
+
+	rqstp->rq_next_page = resp->xdr.page_ptr + 1;
+
+	tagp = resp->tagp;
+	*tagp++ = htonl(resp->taglen);
+	memcpy(tagp, resp->tag, resp->taglen);
+	tagp += XDR_QUADLEN(resp->taglen);
+	*tagp = htonl(resp->opcnt);
+
+	nfsd4_sequence_done(resp);
+	return 1;
+}
+
+/*
+ * Local variables:
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
new file mode 100644
index 000000000..80c90fc23
--- /dev/null
+++ b/fs/nfsd/nfscache.c
@@ -0,0 +1,609 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Request reply cache. This is currently a global cache, but this may
+ * change in the future and be a per-client cache.
+ *
+ * This code is heavily inspired by the 44BSD implementation, although
+ * it does things a bit differently.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/highmem.h>
+#include <linux/log2.h>
+#include <linux/hash.h>
+#include <net/checksum.h>
+
+#include "nfsd.h"
+#include "cache.h"
+#include "trace.h"
+
+/*
+ * We use this value to determine the number of hash buckets from the max
+ * cache size, the idea being that when the cache is at its maximum number
+ * of entries, then this should be the average number of entries per bucket.
+ */
+#define TARGET_BUCKET_SIZE 64
+
+/* One hash bucket of the duplicate reply cache. */
+struct nfsd_drc_bucket {
+ struct rb_root rb_head; /* entries of this bucket, ordered by key */
+ struct list_head lru_head; /* same entries, least recently used first */
+ spinlock_t cache_lock; /* taken around lookups and updates of the bucket */
+};
+
+static struct kmem_cache *drc_slab;
+
+static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
+static unsigned long nfsd_reply_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc);
+static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc);
+
+/*
+ * Cap the size of the DRC according to the amount of low memory
+ * available in the machine.
+ *
+ * 64MB: 8192
+ * 128MB: 11585
+ * 256MB: 16384
+ * 512MB: 23170
+ * 1GB: 32768
+ * 2GB: 46340
+ * 4GB: 65536
+ * 8GB: 92681
+ * 16GB: 131072
+ *
+ * ...with a hard cap of 256k entries. In the worst case, each entry will be
+ * ~1k, so the above numbers should give a rough max of the amount of memory
+ * used in k.
+ *
+ * XXX: these limits are per-container, so memory used will increase
+ * linearly with number of containers. Maybe that's OK.
+ */
+static unsigned int
+nfsd_cache_size_limit(void)
+{
+	unsigned long low_pages = totalram_pages() - totalhigh_pages();
+	unsigned int limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10);
+
+	/* Hard cap of 256k entries */
+	if (limit > 256*1024)
+		return 256*1024;
+	return limit;
+}
+
+/*
+ * Work out how many hash buckets are needed: the maximum cache size
+ * divided by the "target" bucket size, rounded up to the next power of
+ * two.
+ */
+static unsigned int
+nfsd_hashsize(unsigned int limit)
+{
+	unsigned int nbuckets = limit / TARGET_BUCKET_SIZE;
+
+	return roundup_pow_of_two(nbuckets);
+}
+
+/*
+ * Map an RPC transaction id to a bucket index: the XID is converted to
+ * host byte order and hashed down to nn->maskbits bits.
+ */
+static u32
+nfsd_cache_hash(__be32 xid, struct nfsd_net *nn)
+{
+ return hash_32(be32_to_cpu(xid), nn->maskbits);
+}
+
+/*
+ * Allocate a cache entry and fill in its lookup key from the request:
+ * XID, procedure, client address and port, protocol, version, argument
+ * length and the checksum supplied by the caller.
+ *
+ * The new entry is in state RC_UNUSED with type RC_NOCACHE and is not
+ * linked into any tree or list.  Returns NULL if the allocation fails.
+ */
+static struct svc_cacherep *
+nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum,
+			struct nfsd_net *nn)
+{
+	struct svc_cacherep *rp = kmem_cache_alloc(drc_slab, GFP_KERNEL);
+	struct sockaddr *key_addr;
+
+	if (!rp)
+		return NULL;
+
+	rp->c_state = RC_UNUSED;
+	rp->c_type = RC_NOCACHE;
+	RB_CLEAR_NODE(&rp->c_node);
+	INIT_LIST_HEAD(&rp->c_lru);
+
+	/* Clear the whole key first: it is compared with memcmp() */
+	memset(&rp->c_key, 0, sizeof(rp->c_key));
+	key_addr = (struct sockaddr *)&rp->c_key.k_addr;
+
+	rp->c_key.k_xid = rqstp->rq_xid;
+	rp->c_key.k_proc = rqstp->rq_proc;
+	rpc_copy_addr(key_addr, svc_addr(rqstp));
+	rpc_set_port(key_addr, rpc_get_port(svc_addr(rqstp)));
+	rp->c_key.k_prot = rqstp->rq_prot;
+	rp->c_key.k_vers = rqstp->rq_vers;
+	rp->c_key.k_len = rqstp->rq_arg.len;
+	rp->c_key.k_csum = csum;
+
+	return rp;
+}
+
+/*
+ * Release a cache entry.  Any cached reply buffer is freed and
+ * subtracted from the memory accounting.  An entry that has left the
+ * RC_UNUSED state is also unlinked from the bucket's tree and LRU list
+ * and removed from the entry count.
+ *
+ * @b is only dereferenced for entries whose state is not RC_UNUSED.
+ */
+static void
+nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp,
+				struct nfsd_net *nn)
+{
+	struct kvec *reply = &rp->c_replvec;
+
+	if (rp->c_type == RC_REPLBUFF && reply->iov_base) {
+		nn->drc_mem_usage -= reply->iov_len;
+		kfree(reply->iov_base);
+	}
+
+	if (rp->c_state != RC_UNUSED) {
+		rb_erase(&rp->c_node, &b->rb_head);
+		list_del(&rp->c_lru);
+		atomic_dec(&nn->num_drc_entries);
+		nn->drc_mem_usage -= sizeof(*rp);
+	}
+
+	kmem_cache_free(drc_slab, rp);
+}
+
+/*
+ * Release a cache entry, taking the bucket's cache_lock around the
+ * call to nfsd_reply_cache_free_locked().
+ */
+static void
+nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp,
+ struct nfsd_net *nn)
+{
+ spin_lock(&b->cache_lock);
+ nfsd_reply_cache_free_locked(b, rp, nn);
+ spin_unlock(&b->cache_lock);
+}
+
+/*
+ * Create the slab cache that backs struct svc_cacherep allocations.
+ * Returns 0 on success or -ENOMEM if the cache cannot be created.
+ */
+int nfsd_drc_slab_create(void)
+{
+	drc_slab = kmem_cache_create("nfsd_drc",
+				     sizeof(struct svc_cacherep), 0, 0, NULL);
+	if (!drc_slab)
+		return -ENOMEM;
+	return 0;
+}
+
+/* Destroy the slab cache set up by nfsd_drc_slab_create(). */
+void nfsd_drc_slab_free(void)
+{
+ kmem_cache_destroy(drc_slab);
+}
+
+/*
+ * Set up the duplicate reply cache of one nfsd_net: compute the entry
+ * limit and the hash table geometry, register the shrinker and
+ * allocate and initialise the bucket array.
+ *
+ * Returns 0 on success and -ENOMEM on any failure.
+ */
+int nfsd_reply_cache_init(struct nfsd_net *nn)
+{
+ unsigned int hashsize;
+ unsigned int i;
+ int status = 0;
+
+ nn->max_drc_entries = nfsd_cache_size_limit();
+ atomic_set(&nn->num_drc_entries, 0);
+ hashsize = nfsd_hashsize(nn->max_drc_entries);
+ nn->maskbits = ilog2(hashsize);
+
+ nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
+ nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
+ nn->nfsd_reply_cache_shrinker.seeks = 1;
+ status = register_shrinker(&nn->nfsd_reply_cache_shrinker);
+ /* NOTE(review): the value of status is dropped; -ENOMEM is returned */
+ if (status)
+ goto out_nomem;
+
+ /* Zeroed allocation: rb_head of every bucket starts out empty */
+ nn->drc_hashtbl = kvzalloc(array_size(hashsize,
+ sizeof(*nn->drc_hashtbl)), GFP_KERNEL);
+ if (!nn->drc_hashtbl)
+ goto out_shrinker;
+
+ for (i = 0; i < hashsize; i++) {
+ INIT_LIST_HEAD(&nn->drc_hashtbl[i].lru_head);
+ spin_lock_init(&nn->drc_hashtbl[i].cache_lock);
+ }
+ /* Published last: the bucket walkers iterate up to drc_hashsize */
+ nn->drc_hashsize = hashsize;
+
+ return 0;
+out_shrinker:
+ unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
+out_nomem:
+ printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
+ return -ENOMEM;
+}
+
+/*
+ * Tear down the duplicate reply cache of one nfsd_net: unregister the
+ * shrinker, free every entry still on a bucket's LRU list and release
+ * the bucket array.  The bucket locks are not taken here.
+ */
+void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
+{
+	unsigned int i;
+
+	unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
+
+	for (i = 0; i < nn->drc_hashsize; i++) {
+		struct nfsd_drc_bucket *b = &nn->drc_hashtbl[i];
+
+		/* Keep taking the first entry until the list is drained */
+		while (!list_empty(&b->lru_head)) {
+			struct svc_cacherep *rp;
+
+			rp = list_first_entry(&b->lru_head,
+					      struct svc_cacherep, c_lru);
+			nfsd_reply_cache_free_locked(b, rp, nn);
+		}
+	}
+
+	kvfree(nn->drc_hashtbl);
+	nn->drc_hashtbl = NULL;
+	nn->drc_hashsize = 0;
+}
+
+/*
+ * Refresh the entry's timestamp and move it to the tail (most recently
+ * used end) of the bucket's LRU list.
+ */
+static void
+lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
+{
+ rp->c_timestamp = jiffies;
+ list_move_tail(&rp->c_lru, &b->lru_head);
+}
+
+/*
+ * Free entries from one bucket, walking its LRU list from the oldest
+ * end.  The walk stops at the first entry that is not yet expired,
+ * provided the cache as a whole is within its entry limit.  Returns the
+ * number of entries freed.
+ */
+static long
+prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn)
+{
+	struct svc_cacherep *rp, *next;
+	long nfreed = 0;
+
+	list_for_each_entry_safe(rp, next, &b->lru_head, c_lru) {
+		/*
+		 * Entries belonging to calls that are still being
+		 * processed are left alone, but the scan goes on.
+		 */
+		if (rp->c_state == RC_INPROG)
+			continue;
+
+		if (atomic_read(&nn->num_drc_entries) <= nn->max_drc_entries) {
+			/* Within the limit: only expired entries go */
+			if (time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
+				break;
+		}
+
+		nfsd_reply_cache_free_locked(b, rp, nn);
+		nfreed++;
+	}
+	return nfreed;
+}
+
+/*
+ * Go through every bucket and prune entries older than RC_EXPIRE, as
+ * well as the oldest entries while the total exceeds the maximum number
+ * of entries.  Returns the number of entries freed.
+ */
+static long
+prune_cache_entries(struct nfsd_net *nn)
+{
+	long total = 0;
+	unsigned int i;
+
+	for (i = 0; i < nn->drc_hashsize; i++) {
+		struct nfsd_drc_bucket *b = &nn->drc_hashtbl[i];
+
+		/* Unlocked peek: empty buckets are skipped without locking */
+		if (!list_empty(&b->lru_head)) {
+			spin_lock(&b->cache_lock);
+			total += prune_bucket(b, nn);
+			spin_unlock(&b->cache_lock);
+		}
+	}
+	return total;
+}
+
+/*
+ * Shrinker count_objects callback: report the current number of DRC
+ * entries of the nfsd_net that embeds @shrink.
+ */
+static unsigned long
+nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ struct nfsd_net *nn = container_of(shrink,
+ struct nfsd_net, nfsd_reply_cache_shrinker);
+
+ return atomic_read(&nn->num_drc_entries);
+}
+
+/*
+ * Shrinker scan_objects callback: prune the DRC of the nfsd_net that
+ * embeds @shrink and return the number of entries freed.  @sc is not
+ * consulted.
+ */
+static unsigned long
+nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ struct nfsd_net *nn = container_of(shrink,
+ struct nfsd_net, nfsd_reply_cache_shrinker);
+
+ return prune_cache_entries(nn);
+}
+/*
+ * Walk an xdr_buf and compute a checksum (csum_partial) over at most
+ * the first RC_CSUMLEN bytes of the request arguments: the head first,
+ * then as much of the page data as is needed.
+ */
+static __wsum
+nfsd_cache_csum(struct svc_rqst *rqstp)
+{
+ int idx;
+ unsigned int base;
+ __wsum csum;
+ struct xdr_buf *buf = &rqstp->rq_arg;
+ const unsigned char *p = buf->head[0].iov_base;
+ /* Total bytes to checksum, capped at RC_CSUMLEN */
+ size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len,
+ RC_CSUMLEN);
+ /* Portion of that which lives in the head */
+ size_t len = min(buf->head[0].iov_len, csum_len);
+
+ /* rq_arg.head first */
+ csum = csum_partial(p, len, 0);
+ csum_len -= len;
+
+ /* Continue into page array */
+ idx = buf->page_base / PAGE_SIZE;
+ base = buf->page_base & ~PAGE_MASK;
+ while (csum_len) {
+ p = page_address(buf->pages[idx]) + base;
+ len = min_t(size_t, PAGE_SIZE - base, csum_len);
+ csum = csum_partial(p, len, csum);
+ csum_len -= len;
+ /* Only the first page starts at a non-zero offset */
+ base = 0;
+ ++idx;
+ }
+ return csum;
+}
+
+/*
+ * Compare the lookup keys of two cache entries with memcmp() and
+ * return its result.
+ *
+ * As a side effect, a pair with the same XID but different checksums
+ * is counted in nn->payload_misses and traced.
+ */
+static int
+nfsd_cache_key_cmp(const struct svc_cacherep *key,
+			const struct svc_cacherep *rp, struct nfsd_net *nn)
+{
+	const bool same_xid = key->c_key.k_xid == rp->c_key.k_xid;
+
+	if (same_xid && key->c_key.k_csum != rp->c_key.k_csum) {
+		nn->payload_misses++;
+		trace_nfsd_drc_mismatch(nn, key, rp);
+	}
+
+	return memcmp(&key->c_key, &rp->c_key, sizeof(key->c_key));
+}
+
+/*
+ * Search the bucket's tree for an entry whose key matches @key.
+ * Must be called with cache_lock held. Returns the matching entry if
+ * there is one; otherwise links @key into the tree and returns @key.
+ * In both cases the returned entry is moved to the end of the LRU list.
+ */
+static struct svc_cacherep *
+nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key,
+ struct nfsd_net *nn)
+{
+ struct svc_cacherep *rp, *ret = key;
+ struct rb_node **p = &b->rb_head.rb_node,
+ *parent = NULL;
+ unsigned int entries = 0;
+ int cmp;
+
+ /* Descend the tree; entries counts the nodes visited on the way */
+ while (*p != NULL) {
+ ++entries;
+ parent = *p;
+ rp = rb_entry(parent, struct svc_cacherep, c_node);
+
+ cmp = nfsd_cache_key_cmp(key, rp, nn);
+ if (cmp < 0)
+ p = &parent->rb_left;
+ else if (cmp > 0)
+ p = &parent->rb_right;
+ else {
+ ret = rp;
+ goto out;
+ }
+ }
+ /* No match: link the new key at the leaf position that was reached */
+ rb_link_node(&key->c_node, parent, p);
+ rb_insert_color(&key->c_node, &b->rb_head);
+out:
+ /* tally hash chain length stats */
+ if (entries > nn->longest_chain) {
+ nn->longest_chain = entries;
+ nn->longest_chain_cachesize = atomic_read(&nn->num_drc_entries);
+ } else if (entries == nn->longest_chain) {
+ /* prefer to keep the smallest cachesize possible here */
+ nn->longest_chain_cachesize = min_t(unsigned int,
+ nn->longest_chain_cachesize,
+ atomic_read(&nn->num_drc_entries));
+ }
+
+ lru_put_end(b, ret);
+ return ret;
+}
+
+/**
+ * nfsd_cache_lookup - Find an entry in the duplicate reply cache
+ * @rqstp: Incoming Call to find
+ *
+ * Try to find an entry matching the current call in the cache. A new
+ * entry is allocated before the bucket lock is taken; if no match is
+ * found it is inserted and marked in progress, otherwise it is freed
+ * again and the existing entry is used to decide how to answer.
+ *
+ * Return values:
+ * %RC_DOIT: Process the request normally
+ * %RC_REPLY: Reply from cache
+ * %RC_DROPIT: Do not process the request further
+ */
+int nfsd_cache_lookup(struct svc_rqst *rqstp)
+{
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct svc_cacherep *rp, *found;
+ __be32 xid = rqstp->rq_xid;
+ __wsum csum;
+ u32 hash = nfsd_cache_hash(xid, nn);
+ struct nfsd_drc_bucket *b = &nn->drc_hashtbl[hash];
+ int type = rqstp->rq_cachetype;
+ int rtn = RC_DOIT;
+
+ rqstp->rq_cacherep = NULL;
+ if (type == RC_NOCACHE) {
+ nfsdstats.rcnocache++;
+ goto out;
+ }
+
+ csum = nfsd_cache_csum(rqstp);
+
+ /*
+ * Since the common case is a cache miss followed by an insert,
+ * preallocate an entry.
+ */
+ rp = nfsd_reply_cache_alloc(rqstp, csum, nn);
+ /* Allocation failure: process the request without caching it */
+ if (!rp)
+ goto out;
+
+ spin_lock(&b->cache_lock);
+ found = nfsd_cache_insert(b, rp, nn);
+ if (found != rp) {
+ /* rp is still RC_UNUSED, so the bucket argument is not used */
+ nfsd_reply_cache_free_locked(NULL, rp, nn);
+ rp = found;
+ goto found_entry;
+ }
+
+ /* Cache miss: the preallocated entry is now in the tree */
+ nfsdstats.rcmisses++;
+ rqstp->rq_cacherep = rp;
+ rp->c_state = RC_INPROG;
+
+ atomic_inc(&nn->num_drc_entries);
+ nn->drc_mem_usage += sizeof(*rp);
+
+ /* go ahead and prune the cache */
+ prune_bucket(b, nn);
+
+out_unlock:
+ spin_unlock(&b->cache_lock);
+out:
+ return rtn;
+
+found_entry:
+ /* We found a matching entry which is either in progress or done. */
+ nfsdstats.rchits++;
+ rtn = RC_DROPIT;
+
+ /* Request being processed */
+ if (rp->c_state == RC_INPROG)
+ goto out_trace;
+
+ /* From the hall of fame of impractical attacks:
+ * Is this a user who tries to snoop on the cache? */
+ rtn = RC_DOIT;
+ if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && rp->c_secure)
+ goto out_trace;
+
+ /* Compose RPC reply header */
+ switch (rp->c_type) {
+ case RC_NOCACHE:
+ break;
+ case RC_REPLSTAT:
+ svc_putu32(&rqstp->rq_res.head[0], rp->c_replstat);
+ rtn = RC_REPLY;
+ break;
+ case RC_REPLBUFF:
+ /* On failure rtn is still RC_DOIT and the trace point is skipped */
+ if (!nfsd_cache_append(rqstp, &rp->c_replvec))
+ goto out_unlock; /* should not happen */
+ rtn = RC_REPLY;
+ break;
+ default:
+ WARN_ONCE(1, "nfsd: bad repcache type %d\n", rp->c_type);
+ }
+
+out_trace:
+ trace_nfsd_drc_found(nn, rqstp, rtn);
+ goto out_unlock;
+}
+
+/**
+ * nfsd_cache_update - Update an entry in the duplicate reply cache.
+ * @rqstp: svc_rqst with a finished Reply
+ * @cachetype: which cache to update
+ * @statp: Reply's status code
+ *
+ * This is called from nfsd_dispatch when the procedure has been
+ * executed and the complete reply is in rqstp->rq_res.
+ *
+ * We're copying around data here rather than swapping buffers because
+ * the toplevel loop requires max-sized buffers, which would be a waste
+ * of memory for a cache with a max reply size of 100 bytes (diropokres).
+ *
+ * If we should start to use different types of cache entries tailored
+ * specifically for attrstat and fh's, we may save even more space.
+ *
+ * Also note that a cachetype of RC_NOCACHE can legally be passed when
+ * nfsd failed to encode a reply that otherwise would have been cached.
+ * In this case, nfsd_cache_update is called with statp == NULL.
+ *
+ * Nothing is done when the request has no cache entry attached.  The
+ * entry is dropped instead of updated when @statp is NULL, when the
+ * reply is longer than 256 bytes, when @cachetype is RC_NOCACHE or when
+ * the copy of the reply cannot be allocated.
+ */
+void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
+{
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	struct svc_cacherep *rp = rqstp->rq_cacherep;
+	struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
+	u32 hash;
+	struct nfsd_drc_bucket *b;
+	int len;
+	size_t bufsize = 0;
+
+	if (!rp)
+		return;
+
+	hash = nfsd_cache_hash(rp->c_key.k_xid, nn);
+	b = &nn->drc_hashtbl[hash];
+
+	/*
+	 * Don't cache XDR failures.  This has to be tested before statp
+	 * takes part in any pointer arithmetic.
+	 */
+	if (!statp) {
+		nfsd_reply_cache_free(b, rp, nn);
+		return;
+	}
+
+	/* Reply length from the status word onwards, in 32-bit words */
+	len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
+	len >>= 2;
+
+	/* Don't cache excessive amounts of data */
+	if (len > (256 >> 2)) {
+		nfsd_reply_cache_free(b, rp, nn);
+		return;
+	}
+
+	switch (cachetype) {
+	case RC_REPLSTAT:
+		if (len != 1)
+			printk("nfsd: RC_REPLSTAT/reply len %d!\n",len);
+		rp->c_replstat = *statp;
+		break;
+	case RC_REPLBUFF:
+		cachv = &rp->c_replvec;
+		bufsize = len << 2;
+		cachv->iov_base = kmalloc(bufsize, GFP_KERNEL);
+		if (!cachv->iov_base) {
+			nfsd_reply_cache_free(b, rp, nn);
+			return;
+		}
+		cachv->iov_len = bufsize;
+		memcpy(cachv->iov_base, statp, bufsize);
+		break;
+	case RC_NOCACHE:
+		nfsd_reply_cache_free(b, rp, nn);
+		return;
+	}
+
+	/* Publish the finished entry under the bucket lock */
+	spin_lock(&b->cache_lock);
+	nn->drc_mem_usage += bufsize;
+	lru_put_end(b, rp);
+	rp->c_secure = test_bit(RQ_SECURE, &rqstp->rq_flags);
+	rp->c_type = cachetype;
+	rp->c_state = RC_DONE;
+	spin_unlock(&b->cache_lock);
+	return;
+}
+
+/*
+ * Append a cached reply to the head of the current reply buffer. It
+ * should always fit.  Returns 1 on success; returns 0 and logs a
+ * warning if head plus cached data would exceed PAGE_SIZE.
+ * FIXME as reply is in a page, we should just attach the page, and
+ * keep a refcount....
+ */
+static int
+nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
+{
+	struct kvec *head = &rqstp->rq_res.head[0];
+	char *dst = (char *)head->iov_base + head->iov_len;
+
+	if (head->iov_len + data->iov_len > PAGE_SIZE) {
+		printk(KERN_WARNING "nfsd: cached reply too large (%zd).\n",
+		       data->iov_len);
+		return 0;
+	}
+
+	memcpy(dst, data->iov_base, data->iov_len);
+	head->iov_len += data->iov_len;
+	return 1;
+}
+
+/*
+ * seq_file show routine for the reply cache statistics: prints one
+ * "label: value" line per counter.
+ *
+ * Note that fields may be added, removed or reordered in the future. Programs
+ * scraping this file for info should test the labels to ensure they're
+ * getting the correct field.
+ */
+static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
+{
+ /* The nfsd_net handed to single_open() by the open routine */
+ struct nfsd_net *nn = m->private;
+
+ seq_printf(m, "max entries: %u\n", nn->max_drc_entries);
+ seq_printf(m, "num entries: %u\n",
+ atomic_read(&nn->num_drc_entries));
+ seq_printf(m, "hash buckets: %u\n", 1 << nn->maskbits);
+ seq_printf(m, "mem usage: %u\n", nn->drc_mem_usage);
+ /* The next three counters come from the global nfsdstats */
+ seq_printf(m, "cache hits: %u\n", nfsdstats.rchits);
+ seq_printf(m, "cache misses: %u\n", nfsdstats.rcmisses);
+ seq_printf(m, "not cached: %u\n", nfsdstats.rcnocache);
+ seq_printf(m, "payload misses: %u\n", nn->payload_misses);
+ seq_printf(m, "longest chain len: %u\n", nn->longest_chain);
+ seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize);
+ return 0;
+}
+
+/*
+ * Open routine for the reply cache statistics file: look up the
+ * nfsd_net of the network namespace stored in the superblock's
+ * s_fs_info and hand it to the show routine through single_open().
+ */
+int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file)
+{
+	struct super_block *sb = file_inode(file)->i_sb;
+	struct nfsd_net *nn = net_generic(sb->s_fs_info, nfsd_net_id);
+
+	return single_open(file, nfsd_reply_cache_stats_show, nn);
+}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
new file mode 100644
index 000000000..c4b11560a
--- /dev/null
+++ b/fs/nfsd/nfsctl.c
@@ -0,0 +1,1578 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Syscall interface to knfsd.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/ctype.h>
+#include <linux/fs_context.h>
+
+#include <linux/sunrpc/svcsock.h>
+#include <linux/lockd/lockd.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/gss_api.h>
+#include <linux/sunrpc/gss_krb5_enctypes.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/module.h>
+#include <linux/fsnotify.h>
+
+#include "idmap.h"
+#include "nfsd.h"
+#include "cache.h"
+#include "state.h"
+#include "netns.h"
+#include "pnfs.h"
+
+/*
+ * We have a single directory with several nodes in it.
+ */
+enum {
+ /*
+ * These values are compared against inode numbers:
+ * nfsctl_transaction_write() uses the written file's i_ino as the
+ * index into write_op[], which is keyed by these constants.
+ */
+ NFSD_Root = 1,
+ NFSD_List,
+ NFSD_Export_features,
+ NFSD_Fh,
+ NFSD_FO_UnlockIP,
+ NFSD_FO_UnlockFS,
+ NFSD_Threads,
+ NFSD_Pool_Threads,
+ NFSD_Pool_Stats,
+ NFSD_Reply_Cache_Stats,
+ NFSD_Versions,
+ NFSD_Ports,
+ NFSD_MaxBlkSize,
+ NFSD_MaxConnections,
+ NFSD_SupportedEnctypes,
+ /*
+ * The below MUST come last. Otherwise we leave a hole in nfsd_files[]
+ * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
+ */
+#ifdef CONFIG_NFSD_V4
+ NFSD_Leasetime,
+ NFSD_Gracetime,
+ NFSD_RecoveryDir,
+ NFSD_V4EndGrace,
+#endif
+ /* Passed to iunique() by nfsd_get_inode() as the reserved bound. */
+ NFSD_MaxReserved
+};
+
+/*
+ * write() for these nodes.
+ */
+/*
+ * Every handler receives the file being written, a kernel buffer holding
+ * the written data (also used for the reply), and the number of bytes
+ * written. A non-negative return is the reply length; a negative return
+ * is an errno.
+ */
+static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
+static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
+static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
+static ssize_t write_threads(struct file *file, char *buf, size_t size);
+static ssize_t write_pool_threads(struct file *file, char *buf, size_t size);
+static ssize_t write_versions(struct file *file, char *buf, size_t size);
+static ssize_t write_ports(struct file *file, char *buf, size_t size);
+static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
+static ssize_t write_maxconn(struct file *file, char *buf, size_t size);
+#ifdef CONFIG_NFSD_V4
+static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
+static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
+static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
+static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size);
+#endif
+
+/*
+ * Dispatch table for nfsctl_transaction_write(), indexed by the inode
+ * number of the file being written. Slots without a designated handler
+ * stay NULL and are rejected with -EINVAL by the caller.
+ */
+static ssize_t (*const write_op[])(struct file *, char *, size_t) = {
+ [NFSD_Fh] = write_filehandle,
+ [NFSD_FO_UnlockIP] = write_unlock_ip,
+ [NFSD_FO_UnlockFS] = write_unlock_fs,
+ [NFSD_Threads] = write_threads,
+ [NFSD_Pool_Threads] = write_pool_threads,
+ [NFSD_Versions] = write_versions,
+ [NFSD_Ports] = write_ports,
+ [NFSD_MaxBlkSize] = write_maxblksize,
+ [NFSD_MaxConnections] = write_maxconn,
+#ifdef CONFIG_NFSD_V4
+ [NFSD_Leasetime] = write_leasetime,
+ [NFSD_Gracetime] = write_gracetime,
+ [NFSD_RecoveryDir] = write_recoverydir,
+ [NFSD_V4EndGrace] = write_v4_end_grace,
+#endif
+};
+
+static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos)
+{
+	ino_t ino = file_inode(file)->i_ino;
+	ssize_t (*handler)(struct file *, char *, size_t);
+	ssize_t reply_len;
+	char *kbuf;
+
+	/* The inode number selects the handler; unknown files are refused. */
+	if (ino >= ARRAY_SIZE(write_op))
+		return -EINVAL;
+	handler = write_op[ino];
+	if (!handler)
+		return -EINVAL;
+
+	kbuf = simple_transaction_get(file, buf, size);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
+
+	reply_len = handler(file, kbuf, size);
+	if (reply_len < 0)
+		return reply_len;
+
+	/* Record the reply length, then report the whole write as consumed. */
+	simple_transaction_set(file, reply_len);
+	return size;
+}
+
+static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
+{
+	ssize_t rv;
+
+	/* A transaction is already pending: just hand back its reply. */
+	if (file->private_data)
+		return simple_transaction_read(file, buf, size, pos);
+
+	/*
+	 * Reading before anything was written: issue a 0-byte write so the
+	 * handler can fill in state information to be returned.
+	 */
+	rv = nfsctl_transaction_write(file, buf, 0, pos);
+	if (rv < 0)
+		return rv;
+
+	return simple_transaction_read(file, buf, size, pos);
+}
+
+/*
+ * File operations for the transaction-style control files: write() runs the
+ * handler chosen through write_op[], and read() goes through
+ * nfsctl_transaction_read(), which triggers a 0-byte write first when no
+ * transaction is pending.
+ */
+static const struct file_operations transaction_ops = {
+ .write = nfsctl_transaction_write,
+ .read = nfsctl_transaction_read,
+ .release = simple_transaction_release,
+ .llseek = default_llseek,
+};
+
+static int exports_net_open(struct net *net, struct file *file)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct seq_file *sf;
+	int rc;
+
+	rc = seq_open(file, &nfs_exports_op);
+	if (rc != 0)
+		return rc;
+
+	/* Point the freshly opened seq_file at this namespace's export cache. */
+	sf = file->private_data;
+	sf->private = nn->svc_export_cache;
+
+	return 0;
+}
+
+/* proc_ops open: use the network namespace of the calling task. */
+static int exports_proc_open(struct inode *inode, struct file *file)
+{
+	struct net *caller_net = current->nsproxy->net_ns;
+
+	return exports_net_open(caller_net, file);
+}
+
+static const struct proc_ops exports_proc_ops = {
+	.proc_open	= exports_proc_open,
+	.proc_read	= seq_read,
+	.proc_lseek	= seq_lseek,
+	.proc_release	= seq_release,
+};
+
+/* nfsd filesystem open: use the namespace stored in the superblock. */
+static int exports_nfsd_open(struct inode *inode, struct file *file)
+{
+	struct net *sb_net = inode->i_sb->s_fs_info;
+
+	return exports_net_open(sb_net, file);
+}
+
+static const struct file_operations exports_nfsd_operations = {
+	.open		= exports_nfsd_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/* Print the two export flag masks as hexadecimal values on one line. */
+static int export_features_show(struct seq_file *m, void *v)
+{
+	const unsigned int all_flags = NFSEXP_ALLFLAGS;
+	const unsigned int secinfo_flags = NFSEXP_SECINFO_FLAGS;
+
+	seq_printf(m, "0x%x 0x%x\n", all_flags, secinfo_flags);
+	return 0;
+}
+
+/* No private data is needed: the output is made of compile-time constants. */
+static int export_features_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, export_features_show, NULL);
+}
+
+static const struct file_operations export_features_operations = {
+	.open		= export_features_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
+/* Emit the KRB5_SUPPORTED_ENCTYPES string as the whole file content. */
+static int supported_enctypes_show(struct seq_file *m, void *v)
+{
+	/*
+	 * The text is a constant with nothing to format, so emit it with
+	 * seq_puts() instead of passing it to seq_printf() as a format string.
+	 */
+	seq_puts(m, KRB5_SUPPORTED_ENCTYPES);
+	return 0;
+}
+
+static int supported_enctypes_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, supported_enctypes_show, NULL);
+}
+
+static const struct file_operations supported_enctypes_ops = {
+	.open		= supported_enctypes_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
+
+/* Read-only seq_file with dedicated open/release helpers for pool stats. */
+static const struct file_operations pool_stats_operations = {
+ .open = nfsd_pool_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = nfsd_pool_stats_release,
+};
+
+/* Read-only seq_file opened through nfsd_reply_cache_stats_open(). */
+static const struct file_operations reply_cache_stats_operations = {
+ .open = nfsd_reply_cache_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*----------------------------------------------------------------------------*/
+/*
+ * payload - write methods
+ */
+
+/* Return the struct net stored in s_fs_info of the file's superblock. */
+static inline struct net *netns(struct file *file)
+{
+	struct super_block *sb = file_inode(file)->i_sb;
+
+	return sb->s_fs_info;
+}
+
+/*
+ * write_unlock_ip - Release all locks used by a client
+ *
+ * Experimental.
+ *
+ * Input:
+ * buf: '\n'-terminated C string containing a
+ * presentation format IP address
+ * size: length of C string in @buf
+ * Output:
+ * On success: returns zero if all specified locks were released;
+ * returns one if one or more locks were not released
+ * On error: return code is negative errno value
+ */
+static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
+{
+	struct net *net = netns(file);
+	struct sockaddr_storage storage;
+	struct sockaddr *sap = (struct sockaddr *)&storage;
+	size_t salen = sizeof(storage);
+	char *addr_text = buf;
+
+	/* Input must be a non-empty, newline-terminated string. */
+	if (size == 0 || buf[size-1] != '\n')
+		return -EINVAL;
+
+	/* Decode the first word in place; addr_text still marks its start. */
+	if (qword_get(&buf, addr_text, size) < 0)
+		return -EINVAL;
+
+	/* A zero return from rpc_pton() is treated as an unparsable address. */
+	if (rpc_pton(net, addr_text, size, sap, salen) == 0)
+		return -EINVAL;
+
+	return nlmsvc_unlock_all_by_ip(sap);
+}
+
+/*
+ * write_unlock_fs - Release all locks on a local file system
+ *
+ * Experimental.
+ *
+ * Input:
+ * buf: '\n'-terminated C string containing the
+ * absolute pathname of a local file system
+ * size: length of C string in @buf
+ * Output:
+ * On success: returns zero if all specified locks were released;
+ * returns one if one or more locks were not released
+ * On error: return code is negative errno value
+ */
+static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
+{
+	char *fs_path = buf;
+	struct path path;
+	int status;
+
+	/* Input must be a non-empty, newline-terminated string. */
+	if (size == 0 || buf[size-1] != '\n')
+		return -EINVAL;
+
+	/* Decode the first word in place; fs_path still marks its start. */
+	if (qword_get(&buf, fs_path, size) < 0)
+		return -EINVAL;
+
+	status = kern_path(fs_path, 0, &path);
+	if (status)
+		return status;
+
+	/*
+	 * XXX: Needs better sanity checking.  Otherwise we could end up
+	 * releasing locks on the wrong file system.
+	 *
+	 * For example:
+	 * 1.  Does the path refer to a directory?
+	 * 2.  Is that directory a mount point, or
+	 * 3.  Is that directory the root of an exported file system?
+	 */
+	status = nlmsvc_unlock_all_by_sb(path.dentry->d_sb);
+	path_put(&path);
+
+	return status;
+}
+
+/*
+ * write_filehandle - Get a variable-length NFS file handle by path
+ *
+ * On input, the buffer contains a '\n'-terminated C string comprised of
+ * three alphanumeric words separated by whitespace. The string may
+ * contain escape sequences.
+ *
+ * Input:
+ * buf:
+ * domain: client domain name
+ * path: export pathname
+ * maxsize: numeric maximum size of
+ * @buf
+ * size: length of C string in @buf
+ * Output:
+ * On success: passed-in buffer filled with '\n'-terminated C
+ * string containing an ASCII hex text version
+ * of the NFS file handle;
+ * return code is the size in bytes of the string
+ * On error: return code is negative errno value
+ */
+static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
+{
+ char *dname, *path;
+ int maxsize;
+ char *mesg = buf;
+ int len;
+ struct auth_domain *dom;
+ struct knfsd_fh fh;
+
+ if (size == 0)
+ return -EINVAL;
+
+ /* Require a trailing newline and turn it into the string terminator. */
+ if (buf[size-1] != '\n')
+ return -EINVAL;
+ buf[size-1] = 0;
+
+ /* First word: the domain name, decoded in place at the start of buf. */
+ dname = mesg;
+ len = qword_get(&mesg, dname, size);
+ if (len <= 0)
+ return -EINVAL;
+
+ /* Second word: the path, decoded just past dname and its terminator. */
+ path = dname+len+1;
+ len = qword_get(&mesg, path, size);
+ if (len <= 0)
+ return -EINVAL;
+
+ /* Third word: the maximum file handle size the caller accepts. */
+ len = get_int(&mesg, &maxsize);
+ if (len)
+ return len;
+
+ /* Refuse sizes below NFS_FHSIZE and cap the rest at NFS3_FHSIZE. */
+ if (maxsize < NFS_FHSIZE)
+ return -EINVAL;
+ maxsize = min(maxsize, NFS3_FHSIZE);
+
+ /* Any further word on the line is an error. */
+ if (qword_get(&mesg, mesg, size)>0)
+ return -EINVAL;
+
+ /* we have all the words, they are in buf.. */
+ dom = unix_domain_find(dname);
+ if (!dom)
+ return -ENOMEM;
+
+ /* The domain reference is only needed for the lookup itself. */
+ len = exp_rootfh(netns(file), dom, path, &fh, maxsize);
+ auth_domain_put(dom);
+ if (len)
+ return len;
+
+ /* Rewind and overwrite buf with the hex encoding of the handle. */
+ mesg = buf;
+ len = SIMPLE_TRANSACTION_LIMIT;
+ qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size);
+ /*
+ * Replace the last byte written with a newline - presumably the
+ * separator qword_addhex() appends after the hex text (TODO confirm).
+ */
+ mesg[-1] = '\n';
+ return mesg - buf;
+}
+
+/*
+ * write_threads - Start NFSD, or report the current number of running threads
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ * Output:
+ * On success: passed-in buffer filled with '\n'-terminated C
+ * string numeric value representing the number of
+ * running NFSD threads;
+ * return code is the size in bytes of the string
+ * On error: return code is zero
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing an unsigned
+ * integer value representing the
+ * number of NFSD threads to start
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: NFS service is started;
+ * passed-in buffer filled with '\n'-terminated C
+ * string numeric value representing the number of
+ * running NFSD threads;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ */
+static ssize_t write_threads(struct file *file, char *buf, size_t size)
+{
+	struct net *net = netns(file);
+	char *cursor = buf;
+	int count;
+
+	if (size == 0) {
+		/* Plain query: report the current thread count. */
+		count = nfsd_nrthreads(net);
+	} else {
+		int requested;
+
+		count = get_int(&cursor, &requested);
+		if (count)
+			return count;
+		if (requested < 0)
+			return -EINVAL;
+
+		/* The non-negative result of nfsd_svc() is what gets reported. */
+		count = nfsd_svc(requested, net, file->f_cred);
+		if (count < 0)
+			return count;
+	}
+
+	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", count);
+}
+
+/*
+ * write_pool_threads - Set or report the current number of threads per pool
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing whitespace-
+ * separated unsigned integer values
+ * representing the number of NFSD
+ * threads to start in each pool
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: passed-in buffer filled with '\n'-terminated C
+ * string containing integer values representing the
+ * number of NFSD threads in each pool;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ */
+static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
+{
+ /* if size > 0, look for an array of number of threads per node
+ * and apply them then write out number of threads per node as reply
+ */
+ char *mesg = buf;
+ int i;
+ int rv;
+ int len;
+ int npools;
+ int *nthreads;
+ struct net *net = netns(file);
+
+ /* nfsd_mutex is held from here until the function returns. */
+ mutex_lock(&nfsd_mutex);
+ npools = nfsd_nrpools(net);
+ if (npools == 0) {
+ /*
+ * NFS is shut down. The admin can start it by
+ * writing to the threads file but NOT the pool_threads
+ * file, sorry. Report zero threads.
+ */
+ mutex_unlock(&nfsd_mutex);
+ strcpy(buf, "0\n");
+ return strlen(buf);
+ }
+
+ /* One zero-initialised slot per pool. */
+ nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL);
+ rv = -ENOMEM;
+ if (nthreads == NULL)
+ goto out_free;
+
+ if (size > 0) {
+ for (i = 0; i < npools; i++) {
+ rv = get_int(&mesg, &nthreads[i]);
+ if (rv == -ENOENT)
+ break; /* fewer numbers than pools */
+ if (rv)
+ goto out_free; /* syntax error */
+ rv = -EINVAL;
+ if (nthreads[i] < 0)
+ goto out_free;
+ }
+ /* i is the number of values parsed; only that many are passed on. */
+ rv = nfsd_set_nrthreads(i, nthreads, net);
+ if (rv)
+ goto out_free;
+ }
+
+ /* Re-read the per-pool counts to build the reply. */
+ rv = nfsd_get_nrthreads(npools, nthreads, net);
+ if (rv)
+ goto out_free;
+
+ mesg = buf;
+ size = SIMPLE_TRANSACTION_LIMIT;
+ for (i = 0; i < npools && size > 0; i++) {
+ /* Values are space separated; the last one ends with a newline. */
+ snprintf(mesg, size, "%d%c", nthreads[i], (i == npools-1 ? '\n' : ' '));
+ /* Advance by what was actually stored, in case of truncation. */
+ len = strlen(mesg);
+ size -= len;
+ mesg += len;
+ }
+ rv = mesg - buf;
+out_free:
+ /* Also reached with nthreads == NULL when the allocation failed. */
+ kfree(nthreads);
+ mutex_unlock(&nfsd_mutex);
+ return rv;
+}
+
+/*
+ * Format one "+N", "-N", "+N.M" or "-N.M" token, preceded by @sep, into
+ * @buf. A negative @minor selects the form without a minor number.
+ * Returns what snprintf() returned, or 0 when nothing is printed.
+ */
+static ssize_t
+nfsd_print_version_support(struct nfsd_net *nn, char *buf, int remaining,
+		const char *sep, unsigned vers, int minor)
+{
+	const bool is_v4_minor = (vers == 4 && minor >= 0);
+	bool supported = !!nfsd_vers(nn, vers, NFSD_TEST);
+
+	if (is_v4_minor && !nfsd_minorversion(nn, minor, NFSD_TEST))
+		supported = false;
+
+	/*
+	 * Special case for backward compatibility: "+4.0" is never
+	 * reported, it is implied by "+4", unless "-4.0" is present.
+	 */
+	if (supported && minor == 0)
+		return 0;
+
+	if (minor < 0)
+		return snprintf(buf, remaining, "%s%c%u", sep,
+				supported ? '+' : '-', vers);
+
+	return snprintf(buf, remaining, "%s%c%u.%u", sep,
+			supported ? '+' : '-', vers, minor);
+}
+
+/*
+ * Apply an optional list of version words from @buf, then write the current
+ * version state back into @buf. Returns the reply length or a negative
+ * errno. The caller, write_versions(), holds nfsd_mutex around this call.
+ */
+static ssize_t __write_versions(struct file *file, char *buf, size_t size)
+{
+ char *mesg = buf;
+ char *vers, *minorp, sign;
+ int len, num, remaining;
+ ssize_t tlen = 0;
+ char *sep;
+ struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
+
+ if (size>0) {
+ if (nn->nfsd_serv)
+ /* Cannot change versions without updating
+ * nn->nfsd_serv->sv_xdrsize, and reallocing
+ * rq_argp and rq_resp
+ */
+ return -EBUSY;
+ /* Require a trailing newline and turn it into the terminator. */
+ if (buf[size-1] != '\n')
+ return -EINVAL;
+ buf[size-1] = 0;
+
+ /* Words are decoded in place, one per loop iteration. */
+ vers = mesg;
+ len = qword_get(&mesg, vers, size);
+ if (len <= 0) return -EINVAL;
+ do {
+ enum vers_op cmd;
+ unsigned minor;
+ /* An optional leading sign is skipped before parsing the number. */
+ sign = *vers;
+ if (sign == '+' || sign == '-')
+ num = simple_strtol((vers+1), &minorp, 0);
+ else
+ num = simple_strtol(vers, &minorp, 0);
+ /* Only major version 4 may carry a ".minor" suffix. */
+ if (*minorp == '.') {
+ if (num != 4)
+ return -EINVAL;
+ if (kstrtouint(minorp+1, 0, &minor) < 0)
+ return -EINVAL;
+ }
+
+ /* Anything other than an explicit '-' enables the version. */
+ cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET;
+ switch(num) {
+ case 2:
+ case 3:
+ nfsd_vers(nn, num, cmd);
+ break;
+ case 4:
+ if (*minorp == '.') {
+ if (nfsd_minorversion(nn, minor, cmd) < 0)
+ return -EINVAL;
+ } else if ((cmd == NFSD_SET) != nfsd_vers(nn, num, NFSD_TEST)) {
+ /*
+ * Either we have +4 and no minors are enabled,
+ * or we have -4 and at least one minor is enabled.
+ * In either case, propagate 'cmd' to all minors.
+ */
+ minor = 0;
+ while (nfsd_minorversion(nn, minor, cmd) >= 0)
+ minor++;
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+ /* Step past the decoded word and its terminator. */
+ vers += len + 1;
+ } while ((len = qword_get(&mesg, vers, size)) > 0);
+ /* If all get turned off, turn them back on, as
+ * having no versions is BAD
+ */
+ nfsd_reset_versions(nn);
+ }
+
+ /* Now write current state into reply buffer */
+ len = 0;
+ sep = "";
+ remaining = SIMPLE_TRANSACTION_LIMIT;
+ for (num=2 ; num <= 4 ; num++) {
+ int minor;
+ if (!nfsd_vers(nn, num, NFSD_AVAIL))
+ continue;
+
+ /*
+ * minor == -1 prints the bare major version; for version 4 the
+ * loop then continues through every supported minor version.
+ */
+ minor = -1;
+ do {
+ len = nfsd_print_version_support(nn, buf, remaining,
+ sep, num, minor);
+ /* Stop adding tokens once one no longer fits. */
+ if (len >= remaining)
+ goto out;
+ remaining -= len;
+ buf += len;
+ tlen += len;
+ minor++;
+ /* A separator is only needed once something was printed. */
+ if (len)
+ sep = " ";
+ } while (num == 4 && minor <= NFSD_SUPPORTED_MINOR_VERSION);
+ }
+out:
+ /* Terminate the reply; fail if even the newline does not fit. */
+ len = snprintf(buf, remaining, "\n");
+ if (len >= remaining)
+ return -EINVAL;
+ return tlen + len;
+}
+
+/*
+ * write_versions - Set or report the available NFS protocol versions
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ * Output:
+ * On success: passed-in buffer filled with '\n'-terminated C
+ * string containing positive or negative integer
+ * values representing the current status of each
+ * protocol version;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing whitespace-
+ * separated positive or negative
+ * integer values representing NFS
+ * protocol versions to enable ("+n")
+ * or disable ("-n")
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: status of zero or more protocol versions has
+ * been updated; passed-in buffer filled with
+ * '\n'-terminated C string containing positive
+ * or negative integer values representing the
+ * current status of each protocol version;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ */
+static ssize_t write_versions(struct file *file, char *buf, size_t size)
+{
+	ssize_t ret;
+
+	/* All of the parsing and reporting runs under nfsd_mutex. */
+	mutex_lock(&nfsd_mutex);
+	ret = __write_versions(file, buf, size);
+	mutex_unlock(&nfsd_mutex);
+
+	return ret;
+}
+
+/*
+ * Zero-length write. Return a list of NFSD's current listener
+ * transports.
+ */
+static ssize_t __write_ports_names(char *buf, struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	/* With no service set up the reply is empty. */
+	if (nn->nfsd_serv != NULL)
+		return svc_xprt_names(nn->nfsd_serv, buf,
+				      SIMPLE_TRANSACTION_LIMIT);
+
+	return 0;
+}
+
+/*
+ * A single 'fd' number was written, in which case it must be for
+ * a socket of a supported family/protocol, and we use it as an
+ * nfsd listener.
+ */
+static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred *cred)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	char *cursor = buf;
+	int fd;
+	int rc;
+
+	rc = get_int(&cursor, &fd);
+	if (rc != 0 || fd < 0)
+		return -EINVAL;
+
+	if (svc_alien_sock(net, fd)) {
+		printk(KERN_ERR "%s: socket net is different to NFSd's one\n", __func__);
+		return -EINVAL;
+	}
+
+	rc = nfsd_create_serv(net);
+	if (rc != 0)
+		return rc;
+
+	rc = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred);
+	if (rc >= 0) {
+		/* Decrease the count, but don't shut down the service */
+		nn->nfsd_serv->sv_nrthreads--;
+		return rc;
+	}
+
+	/* Adding the socket failed: undo nfsd_create_serv(). */
+	nfsd_destroy(net);
+	return rc;
+}
+
+/*
+ * A transport listener is added by writing its transport name and
+ * a port number.
+ */
+static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cred *cred)
+{
+ char transport[16];
+ struct svc_xprt *xprt;
+ int port, err;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ /*
+ * The name is limited to 15 characters so it fits transport[].
+ * NOTE(review): port is an int but is scanned with %u.
+ */
+ if (sscanf(buf, "%15s %5u", transport, &port) != 2)
+ return -EINVAL;
+
+ if (port < 1 || port > USHRT_MAX)
+ return -EINVAL;
+
+ err = nfsd_create_serv(net);
+ if (err != 0)
+ return err;
+
+ /* Create the IPv4 listener first. */
+ err = svc_create_xprt(nn->nfsd_serv, transport, net,
+ PF_INET, port, SVC_SOCK_ANONYMOUS, cred);
+ if (err < 0)
+ goto out_err;
+
+ /* -EAFNOSUPPORT from the IPv6 attempt is not treated as a failure. */
+ err = svc_create_xprt(nn->nfsd_serv, transport, net,
+ PF_INET6, port, SVC_SOCK_ANONYMOUS, cred);
+ if (err < 0 && err != -EAFNOSUPPORT)
+ goto out_close;
+
+ /* Decrease the count, but don't shut down the service */
+ nn->nfsd_serv->sv_nrthreads--;
+ return 0;
+out_close:
+ /* The IPv6 listener failed: look up and close the IPv4 one made above. */
+ xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port);
+ if (xprt != NULL) {
+ svc_close_xprt(xprt);
+ svc_xprt_put(xprt);
+ }
+out_err:
+ /*
+ * Keep the service when it still has entries on sv_permsocks;
+ * otherwise destroy it.
+ */
+ if (!list_empty(&nn->nfsd_serv->sv_permsocks))
+ nn->nfsd_serv->sv_nrthreads--;
+ else
+ nfsd_destroy(net);
+ return err;
+}
+
+static ssize_t __write_ports(struct file *file, char *buf, size_t size,
+			     struct net *net)
+{
+	char first;
+
+	/* Empty write: list the current listeners. */
+	if (size == 0)
+		return __write_ports_names(buf, net);
+
+	/* The first character decides how the request is interpreted. */
+	first = buf[0];
+
+	if (isdigit(first))
+		return __write_ports_addfd(buf, net, file->f_cred);
+
+	if (isalpha(first))
+		return __write_ports_addxprt(buf, net, file->f_cred);
+
+	return -EINVAL;
+}
+
+/*
+ * write_ports - Pass a socket file descriptor or transport name to listen on
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ * Output:
+ * On success: passed-in buffer filled with a '\n'-terminated C
+ * string containing a whitespace-separated list of
+ * named NFSD listeners;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing an unsigned
+ * integer value representing a bound
+ * but unconnected socket that is to be
+ * used as an NFSD listener; listen(3)
+ * must be called for a SOCK_STREAM
+ * socket, otherwise it is ignored
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: NFS service is started;
+ * passed-in buffer filled with a '\n'-terminated C
+ * string containing a unique alphanumeric name of
+ * the listener;
+ * return code is the size in bytes of the string
+ * On error: return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing a transport
+ * name and an unsigned integer value
+ * representing the port to listen on,
+ * separated by whitespace
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: returns zero; NFS service is started
+ * On error: return code is a negative errno value
+ */
+static ssize_t write_ports(struct file *file, char *buf, size_t size)
+{
+	struct net *net = netns(file);
+	ssize_t ret;
+
+	/* Listener changes are serialised by nfsd_mutex. */
+	mutex_lock(&nfsd_mutex);
+	ret = __write_ports(file, buf, size, net);
+	mutex_unlock(&nfsd_mutex);
+
+	return ret;
+}
+
+
+/* Block size value that write_maxblksize() reports and updates. */
+int nfsd_max_blksize;
+
+/*
+ * write_maxblksize - Set or report the current NFS blksize
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing an unsigned
+ * integer value representing the new
+ * NFS blksize
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: passed-in buffer filled with '\n'-terminated C string
+ * containing numeric value of the current NFS blksize
+ * setting;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ */
+static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
+{
+	struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
+
+	if (size > 0) {
+		char *cursor = buf;
+		bool busy;
+		int bsize;
+		int rv;
+
+		rv = get_int(&cursor, &bsize);
+		if (rv)
+			return rv;
+
+		/*
+		 * Clamp the request to [1024, NFSSVC_MAXBLKSIZE] and round
+		 * it down to a multiple of 1024.
+		 */
+		bsize = min_t(int, max_t(int, bsize, 1024), NFSSVC_MAXBLKSIZE);
+		bsize &= ~(1024-1);
+
+		/* The value may only change while no service exists. */
+		mutex_lock(&nfsd_mutex);
+		busy = nn->nfsd_serv != NULL;
+		if (!busy)
+			nfsd_max_blksize = bsize;
+		mutex_unlock(&nfsd_mutex);
+
+		if (busy)
+			return -EBUSY;
+	}
+
+	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n",
+			 nfsd_max_blksize);
+}
+
+/*
+ * write_maxconn - Set or report the current max number of connections
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ * OR
+ *
+ * Input:
+ * buf: C string containing an unsigned
+ * integer value representing the new
+ * number of max connections
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: passed-in buffer filled with '\n'-terminated C string
+ * containing numeric value of max_connections setting
+ * for this net namespace;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ */
+static ssize_t write_maxconn(struct file *file, char *buf, size_t size)
+{
+	struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
+	unsigned int limit = nn->max_connections;
+	char *cursor = buf;
+	int rc;
+
+	/* Plain query: report the value read above. */
+	if (size == 0)
+		return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", limit);
+
+	rc = get_uint(&cursor, &limit);
+	if (rc)
+		return rc;
+
+	/* Store the new limit and echo it back. */
+	nn->max_connections = limit;
+	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", limit);
+}
+
+#ifdef CONFIG_NFSD_V4
+static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
+				time64_t *time, struct nfsd_net *nn)
+{
+	if (size > 0) {
+		char *cursor = buf;
+		int seconds;
+		int rc;
+
+		/* The value cannot be changed once a service exists. */
+		if (nn->nfsd_serv)
+			return -EBUSY;
+
+		rc = get_int(&cursor, &seconds);
+		if (rc)
+			return rc;
+
+		/*
+		 * Sanity limits.  There is no particular reason for these
+		 * exact numbers, but the extremes cause trouble:
+		 *  - Too short: the briefest network outage may make
+		 *    clients lose all their locks, and the frequent
+		 *    polling may be wasteful.
+		 *  - Too long: reboot recovery could take more than an
+		 *    hour, and other clients could wait an hour before
+		 *    a dead client's locks can be revoked.
+		 */
+		if (seconds < 10 || seconds > 3600)
+			return -EINVAL;
+
+		*time = seconds;
+	}
+
+	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%lld\n", *time);
+}
+
+/* Locked wrapper: runs __nfsd4_write_time() with nfsd_mutex held. */
+static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size,
+				time64_t *time, struct nfsd_net *nn)
+{
+	ssize_t ret;
+
+	mutex_lock(&nfsd_mutex);
+	ret = __nfsd4_write_time(file, buf, size, time, nn);
+	mutex_unlock(&nfsd_mutex);
+
+	return ret;
+}
+
+/*
+ * write_leasetime - Set or report the current NFSv4 lease time
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing an unsigned
+ * integer value representing the new
+ * NFSv4 lease expiry time
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: passed-in buffer filled with '\n'-terminated C
+ * string containing unsigned integer value of the
+ * current lease expiry time;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ */
+static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
+{
+	struct net *net = netns(file);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	/* Operates on this namespace's nfsd4_lease value. */
+	return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn);
+}
+
+/*
+ * write_gracetime - Set or report current NFSv4 grace period time
+ *
+ * As above, but sets the time of the NFSv4 grace period.
+ *
+ * Note this should never be set to less than the *previous*
+ * lease-period time, but we don't try to enforce this. (In the common
+ * case (a new boot), we don't know what the previous lease time was
+ * anyway.)
+ */
+static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
+{
+	struct net *net = netns(file);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	/* Operates on this namespace's nfsd4_grace value. */
+	return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn);
+}
+
+static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size,
+				   struct nfsd_net *nn)
+{
+	if (size > 0) {
+		char *cursor = buf;
+		char *recdir = buf;
+		int status;
+
+		/* The directory cannot be changed once a service exists. */
+		if (nn->nfsd_serv)
+			return -EBUSY;
+
+		/* Bounded length, and the line must end with a newline. */
+		if (size > PATH_MAX || buf[size-1] != '\n')
+			return -EINVAL;
+		buf[size-1] = 0;
+
+		/* Decode the path in place; recdir still marks its start. */
+		if (qword_get(&cursor, recdir, size) <= 0)
+			return -EINVAL;
+
+		status = nfs4_reset_recoverydir(recdir);
+		if (status)
+			return status;
+	}
+
+	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n",
+			 nfs4_recoverydir());
+}
+
+/*
+ * write_recoverydir - Set or report the pathname of the recovery directory
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing the pathname
+ * of the directory on a local file
+ * system containing permanent NFSv4
+ * recovery data
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: passed-in buffer filled with '\n'-terminated C string
+ * containing the current recovery pathname setting;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ */
+static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
+{
+	struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
+	ssize_t ret;
+
+	/* The update and the report both run under nfsd_mutex. */
+	mutex_lock(&nfsd_mutex);
+	ret = __write_recoverydir(file, buf, size, nn);
+	mutex_unlock(&nfsd_mutex);
+
+	return ret;
+}
+
+/*
+ * write_v4_end_grace - release grace period for nfsd's v4.x lock manager
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ * OR
+ *
+ * Input:
+ * buf: any value
+ * size: non-zero length of C string in @buf
+ * Output:
+ * passed-in buffer filled with "Y" or "N" with a newline
+ * and NUL-terminated C string. This indicates whether
+ * the grace period has ended in the current net
+ * namespace. Return code is the size in bytes of the
+ * string. Writing a string that starts with 'Y', 'y', or
+ * '1' to the file will end the grace period for nfsd's v4
+ * lock manager.
+ */
+static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
+{
+	struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id);
+
+	if (size > 0) {
+		const char first = buf[0];
+
+		/* Only 'Y', 'y' and '1' are accepted as input. */
+		if (first != 'Y' && first != 'y' && first != '1')
+			return -EINVAL;
+
+		/* Without a service the request is refused with -EBUSY. */
+		if (!nn->nfsd_serv)
+			return -EBUSY;
+
+		nfsd4_end_grace(nn);
+	}
+
+	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n",
+			 nn->grace_ended ? 'Y' : 'N');
+}
+
+#endif
+
+/*----------------------------------------------------------------------------*/
+/*
+ * populating the filesystem.
+ */
+
+/*
+ * Basically copying rpc_get_inode.
+ *
+ * Allocate an inode on @sb with the given @mode and the current time in all
+ * three timestamps. Directories additionally get the simple_dir operations
+ * and one extra link. Returns the new inode, or NULL if allocation failed.
+ */
+static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode)
+{
+	struct inode *inode = new_inode(sb);
+	if (!inode)
+		return NULL;
+	/* Following advice from simple_fill_super documentation: */
+	inode->i_ino = iunique(sb, NFSD_MaxReserved);
+	inode->i_mode = mode;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+	switch (mode & S_IFMT) {
+	case S_IFDIR:
+		inode->i_fop = &simple_dir_operations;
+		inode->i_op = &simple_dir_inode_operations;
+		inc_nlink(inode);
+		/* Explicit break: no implicit fall-through into default */
+		break;
+	default:
+		break;
+	}
+	return inode;
+}
+
+/*
+ * Create the inode for @dentry in @dir. When @ncl is given, the inode takes
+ * a reference on it and stores it in i_private.
+ */
+static int __nfsd_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode, struct nfsdfs_client *ncl)
+{
+	struct inode *new_dir = nfsd_get_inode(dir->i_sb, mode);
+
+	if (!new_dir)
+		return -ENOMEM;
+
+	if (ncl) {
+		kref_get(&ncl->cl_ref);
+		new_dir->i_private = ncl;
+	}
+
+	d_add(dentry, new_dir);
+	inc_nlink(dir);
+	fsnotify_mkdir(dir, dentry);
+
+	return 0;
+}
+
+/*
+ * Create directory @name under @parent with the parent inode locked.
+ * Returns the new dentry, or an ERR_PTR() on failure.
+ */
+static struct dentry *nfsd_mkdir(struct dentry *parent, struct nfsdfs_client *ncl, char *name)
+{
+	struct inode *dir = parent->d_inode;
+	struct dentry *dentry;
+	int ret;
+
+	inode_lock(dir);
+
+	dentry = d_alloc_name(parent, name);
+	if (dentry)
+		ret = __nfsd_mkdir(d_inode(parent), dentry, S_IFDIR | 0600, ncl);
+	else
+		ret = -ENOMEM;
+
+	if (ret) {
+		/* dentry is NULL here when the allocation itself failed */
+		dput(dentry);
+		dentry = ERR_PTR(ret);
+	}
+
+	inode_unlock(dir);
+	return dentry;
+}
+
+/* Detach the client from @inode and drop the reference the inode held. */
+static void clear_ncl(struct inode *inode)
+{
+	struct nfsdfs_client *client = inode->i_private;
+
+	inode->i_private = NULL;
+	kref_put(&client->cl_ref, client->cl_release);
+}
+
+/* Return @inode's client with an extra reference, or NULL if it has none. */
+static struct nfsdfs_client *__get_nfsdfs_client(struct inode *inode)
+{
+	struct nfsdfs_client *client = inode->i_private;
+
+	if (!client)
+		return NULL;
+
+	kref_get(&client->cl_ref);
+	return client;
+}
+
+/* Same as __get_nfsdfs_client(), taken under the shared inode lock. */
+struct nfsdfs_client *get_nfsdfs_client(struct inode *inode)
+{
+	struct nfsdfs_client *client;
+
+	inode_lock_shared(inode);
+	client = __get_nfsdfs_client(inode);
+	inode_unlock_shared(inode);
+
+	return client;
+}
+/*
+ * from __rpc_unlink
+ *
+ * Drop the file's client reference, then unlink @dentry from @dir. An extra
+ * dentry reference is held across the unlink and released at the end.
+ */
+static void nfsdfs_remove_file(struct inode *dir, struct dentry *dentry)
+{
+	int err;
+
+	clear_ncl(d_inode(dentry));
+
+	dget(dentry);
+	err = simple_unlink(dir, dentry);
+	d_drop(dentry);
+	fsnotify_unlink(dir, dentry);
+	dput(dentry);
+
+	WARN_ON_ONCE(err);
+}
+
+/* Remove every positive child dentry of @root. */
+static void nfsdfs_remove_files(struct dentry *root)
+{
+	struct dentry *child, *next;
+
+	list_for_each_entry_safe(child, next, &root->d_subdirs, d_child) {
+		if (simple_positive(child)) {
+			nfsdfs_remove_file(d_inode(root), child);
+			continue;
+		}
+		WARN_ON_ONCE(1); /* I think this can't happen? */
+	}
+}
+
+/*
+ * Populate directory @root with one regular file per entry of @files.
+ *
+ * @files is terminated by the first entry whose name is NULL or empty.
+ * Each created inode gets the entry's file operations and its own
+ * reference on the client attached to @root's inode (NULL if there is
+ * none).  The directory's inode lock is held while files are created.
+ *
+ * Returns 0 on success.  On allocation failure every child of @root is
+ * removed again and -ENOMEM is returned.
+ *
+ * XXX: cut'n'paste from simple_fill_super; figure out if we could share
+ * code instead.
+ */
+static int nfsdfs_create_files(struct dentry *root,
+				const struct tree_descr *files)
+{
+	struct inode *dir = d_inode(root);
+	struct inode *inode;
+	struct dentry *dentry;
+
+	inode_lock(dir);
+	/*
+	 * The loop condition already rejects a NULL name, so no further
+	 * check is needed inside the body.
+	 */
+	for (; files->name && files->name[0]; files++) {
+		dentry = d_alloc_name(root, files->name);
+		if (!dentry)
+			goto out;
+		inode = nfsd_get_inode(dir->i_sb, S_IFREG | files->mode);
+		if (!inode) {
+			dput(dentry);
+			goto out;
+		}
+		inode->i_fop = files->ops;
+		/* every file pins the directory's client independently */
+		inode->i_private = __get_nfsdfs_client(dir);
+		d_add(dentry, inode);
+		fsnotify_create(dir, dentry);
+	}
+	inode_unlock(dir);
+	return 0;
+out:
+	/* undo the files created so far, still under the directory lock */
+	nfsdfs_remove_files(root);
+	inode_unlock(dir);
+	return -ENOMEM;
+}
+
+/*
+ * Create the directory for one client under nn->nfsd_client_dir.
+ *
+ * The directory is named after the decimal value of @id, has @ncl
+ * attached to its inode and is populated with the files described by
+ * @files.
+ *
+ * Returns the new directory's dentry on success, or NULL on any failure
+ * (the underlying error code is discarded).
+ */
+struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
+ struct nfsdfs_client *ncl, u32 id,
+ const struct tree_descr *files)
+{
+ struct dentry *dentry;
+ /* a u32 prints as at most 10 decimal digits, plus the terminating NUL */
+ char name[11];
+ int ret;
+
+ sprintf(name, "%u", id);
+
+ dentry = nfsd_mkdir(nn->nfsd_client_dir, ncl, name);
+ if (IS_ERR(dentry)) /* XXX: tossing errors? */
+ return NULL;
+ ret = nfsdfs_create_files(dentry, files);
+ if (ret) {
+ /* remove the directory created above again */
+ nfsd_client_rmdir(dentry);
+ return NULL;
+ }
+ return dentry;
+}
+
+/*
+ * Tear down a client directory: remove the files inside it, drop the
+ * client reference held by the directory's own inode, then remove the
+ * directory itself.  The parent directory's inode lock is held
+ * throughout.
+ *
+ * Taken from __rpc_rmdir.
+ */
+void nfsd_client_rmdir(struct dentry *dentry)
+{
+ struct inode *dir = d_inode(dentry->d_parent);
+ struct inode *inode = d_inode(dentry);
+ int ret;
+
+ inode_lock(dir);
+ nfsdfs_remove_files(dentry);
+ clear_ncl(inode);
+ /* hold our own dentry reference across the rmdir; dropped below */
+ dget(dentry);
+ ret = simple_rmdir(dir, dentry);
+ WARN_ON_ONCE(ret);
+ d_drop(dentry);
+ fsnotify_rmdir(dir, dentry);
+ dput(dentry);
+ inode_unlock(dir);
+}
+
+/*
+ * Fill in a new nfsd superblock.
+ *
+ * The fixed control files listed in nfsd_files[] are created through
+ * simple_fill_super(); afterwards a "clients" directory is added under
+ * the root and its dentry is remembered in the nfsd per-net data of the
+ * calling task's network namespace.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+ nfsd_net_id);
+ struct dentry *dentry;
+ int ret;
+
+ /* indexed by the NFSD_* file identifiers; terminated by an empty name */
+ static const struct tree_descr nfsd_files[] = {
+ [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO},
+ [NFSD_Export_features] = {"export_features",
+ &export_features_operations, S_IRUGO},
+ [NFSD_FO_UnlockIP] = {"unlock_ip",
+ &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_FO_UnlockFS] = {"unlock_filesystem",
+ &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
+ [NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &reply_cache_stats_operations, S_IRUGO},
+ [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
+ [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
+ [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO},
+#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
+ [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
+#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
+#ifdef CONFIG_NFSD_V4
+ [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO},
+#endif
+ /* last one */ {""}
+ };
+
+ /* 0x6e667364 is the ASCII string "nfsd" */
+ ret = simple_fill_super(sb, 0x6e667364, nfsd_files);
+ if (ret)
+ return ret;
+ /* no client is attached to the top-level "clients" directory */
+ dentry = nfsd_mkdir(sb->s_root, NULL, "clients");
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ nn->nfsd_client_dir = dentry;
+ return 0;
+}
+
+/*
+ * fs_context ->get_tree: obtain the superblock keyed by the context's
+ * network namespace, filling a new one with nfsd_fill_super().  A
+ * reference on the namespace is taken for the key.
+ */
+static int nfsd_fs_get_tree(struct fs_context *fc)
+{
+ return get_tree_keyed(fc, nfsd_fill_super, get_net(fc->net_ns));
+}
+
+/*
+ * fs_context ->free: if the context still carries a network namespace in
+ * s_fs_info, drop the reference on it.
+ */
+static void nfsd_fs_free_fc(struct fs_context *fc)
+{
+	struct net *net = fc->s_fs_info;
+
+	if (!net)
+		return;
+
+	put_net(net);
+}
+
+/* fs_context operations installed by nfsd_init_fs_context() */
+static const struct fs_context_operations nfsd_fs_context_ops = {
+ .free = nfsd_fs_free_fc,
+ .get_tree = nfsd_fs_get_tree,
+};
+
+/*
+ * file_system_type ->init_fs_context: replace the context's user
+ * namespace with the one that owns its network namespace, and install
+ * the nfsd context operations.  Always returns 0.
+ */
+static int nfsd_init_fs_context(struct fs_context *fc)
+{
+ /* drop the reference on the old user namespace before replacing it */
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(fc->net_ns->user_ns);
+ fc->ops = &nfsd_fs_context_ops;
+ return 0;
+}
+
+/*
+ * file_system_type ->kill_sb: call nfsd_shutdown_threads() for the
+ * network namespace stored in sb->s_fs_info, destroy the superblock, then
+ * drop the namespace reference.
+ * NOTE(review): presumably the reference taken by get_net() in
+ * nfsd_fs_get_tree() -- confirm.
+ */
+static void nfsd_umount(struct super_block *sb)
+{
+ struct net *net = sb->s_fs_info;
+
+ nfsd_shutdown_threads(net);
+
+ kill_litter_super(sb);
+ put_net(net);
+}
+
+/* The "nfsd" filesystem type, registered in init_nfsd(). */
+static struct file_system_type nfsd_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "nfsd",
+ .init_fs_context = nfsd_init_fs_context,
+ .kill_sb = nfsd_umount,
+};
+MODULE_ALIAS_FS("nfsd");
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Create the proc directory "fs/nfs" and the "exports" file inside it.
+ * If the file cannot be created the directory is removed again.
+ * Returns 0 on success or -ENOMEM on failure.
+ */
+static int create_proc_exports_entry(void)
+{
+	struct proc_dir_entry *dir;
+	struct proc_dir_entry *exports;
+
+	dir = proc_mkdir("fs/nfs", NULL);
+	if (!dir)
+		return -ENOMEM;
+
+	exports = proc_create("exports", 0, dir, &exports_proc_ops);
+	if (exports)
+		return 0;
+
+	remove_proc_entry("fs/nfs", NULL);
+	return -ENOMEM;
+}
+#else /* CONFIG_PROC_FS */
+/* Without procfs there is nothing to create; report success. */
+static int create_proc_exports_entry(void)
+{
+	return 0;
+}
+#endif
+
+/* pernet id of nfsd's per-namespace data; filled in via nfsd_net_ops.id */
+unsigned int nfsd_net_id;
+
+/*
+ * pernet ->init: set up nfsd's state for network namespace @net.
+ *
+ * Initialises the export and idmap state and the reply cache, then sets
+ * the per-net defaults.  On failure the steps already completed are
+ * undone in reverse order and the error is returned.
+ */
+static __net_init int nfsd_init_net(struct net *net)
+{
+ int retval;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ retval = nfsd_export_init(net);
+ if (retval)
+ goto out_export_error;
+ retval = nfsd_idmap_init(net);
+ if (retval)
+ goto out_idmap_error;
+ nn->nfsd_versions = NULL;
+ nn->nfsd4_minorversions = NULL;
+ retval = nfsd_reply_cache_init(nn);
+ if (retval)
+ goto out_drc_error;
+ nn->nfsd4_lease = 90; /* default lease time */
+ nn->nfsd4_grace = 90;
+ nn->somebody_reclaimed = false;
+ nn->track_reclaim_completes = false;
+ /* verifier counter and clientid base start from random values */
+ nn->clverifier_counter = prandom_u32();
+ nn->clientid_base = prandom_u32();
+ nn->clientid_counter = nn->clientid_base + 1;
+ /* s2s_cp_cl_id consumes the first value of the clientid counter */
+ nn->s2s_cp_cl_id = nn->clientid_counter++;
+
+ atomic_set(&nn->ntf_refcnt, 0);
+ init_waitqueue_head(&nn->ntf_wq);
+ seqlock_init(&nn->boot_lock);
+
+ return 0;
+
+out_drc_error:
+ nfsd_idmap_shutdown(net);
+out_idmap_error:
+ nfsd_export_shutdown(net);
+out_export_error:
+ return retval;
+}
+
+/*
+ * pernet ->exit: release nfsd's state for network namespace @net,
+ * undoing nfsd_init_net() in reverse order and freeing the per-net
+ * version tables.
+ */
+static __net_exit void nfsd_exit_net(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ nfsd_reply_cache_shutdown(nn);
+ nfsd_idmap_shutdown(net);
+ nfsd_export_shutdown(net);
+ /* net_generic(net, nfsd_net_id) is the same object as nn above */
+ nfsd_netns_free_versions(net_generic(net, nfsd_net_id));
+}
+
+/*
+ * Per-network-namespace hooks for nfsd; each namespace gets a
+ * struct nfsd_net, looked up with net_generic(net, nfsd_net_id).
+ */
+static struct pernet_operations nfsd_net_ops = {
+ .init = nfsd_init_net,
+ .exit = nfsd_exit_net,
+ .id = &nfsd_net_id,
+ .size = sizeof(struct nfsd_net),
+};
+
+/*
+ * Module initialisation.
+ *
+ * Sets up, in order: NFSv4 slabs, pNFS, statistics, the DRC slab, the
+ * lockd callbacks, the proc exports entry, the "nfsd" filesystem type,
+ * the pernet operations and the cld notifier.  If a step fails, the
+ * labels below undo the earlier steps in reverse order and the error is
+ * returned.
+ */
+static int __init init_nfsd(void)
+{
+ int retval;
+ printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n");
+
+ retval = nfsd4_init_slabs();
+ if (retval)
+ return retval;
+ retval = nfsd4_init_pnfs();
+ if (retval)
+ goto out_free_slabs;
+ nfsd_stat_init(); /* Statistics */
+ retval = nfsd_drc_slab_create();
+ if (retval)
+ goto out_free_stat;
+ nfsd_lockd_init(); /* lockd->nfsd callbacks */
+ retval = create_proc_exports_entry();
+ if (retval)
+ goto out_free_lockd;
+ retval = register_filesystem(&nfsd_fs_type);
+ if (retval)
+ goto out_free_exports;
+ retval = register_pernet_subsys(&nfsd_net_ops);
+ if (retval < 0)
+ goto out_free_filesystem;
+ retval = register_cld_notifier();
+ if (retval)
+ goto out_free_all;
+ return 0;
+out_free_all:
+ unregister_pernet_subsys(&nfsd_net_ops);
+out_free_filesystem:
+ unregister_filesystem(&nfsd_fs_type);
+out_free_exports:
+ remove_proc_entry("fs/nfs/exports", NULL);
+ remove_proc_entry("fs/nfs", NULL);
+out_free_lockd:
+ /* this label also releases the DRC slab created before lockd init */
+ nfsd_lockd_shutdown();
+ nfsd_drc_slab_free();
+out_free_stat:
+ /* this label also undoes nfsd4_init_pnfs() */
+ nfsd_stat_shutdown();
+ nfsd4_exit_pnfs();
+out_free_slabs:
+ nfsd4_free_slabs();
+ return retval;
+}
+
+/*
+ * Module exit: release everything init_nfsd() set up (cld notifier,
+ * pernet operations, DRC slab, proc entries, statistics, lockd
+ * callbacks, NFSv4 slabs, pNFS and the filesystem type).
+ */
+static void __exit exit_nfsd(void)
+{
+ unregister_cld_notifier();
+ unregister_pernet_subsys(&nfsd_net_ops);
+ nfsd_drc_slab_free();
+ remove_proc_entry("fs/nfs/exports", NULL);
+ remove_proc_entry("fs/nfs", NULL);
+ nfsd_stat_shutdown();
+ nfsd_lockd_shutdown();
+ nfsd4_free_slabs();
+ nfsd4_exit_pnfs();
+ unregister_filesystem(&nfsd_fs_type);
+}
+
+MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
+MODULE_LICENSE("GPL");
+module_init(init_nfsd)
+module_exit(exit_nfsd)
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
new file mode 100644
index 000000000..4362d295e
--- /dev/null
+++ b/fs/nfsd/nfsd.h
@@ -0,0 +1,489 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Hodge-podge collection of knfsd-related stuff.
+ * I will sort this out later.
+ *
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#ifndef LINUX_NFSD_NFSD_H
+#define LINUX_NFSD_NFSD_H
+
+#include <linux/types.h>
+#include <linux/mount.h>
+
+#include <linux/nfs.h>
+#include <linux/nfs2.h>
+#include <linux/nfs3.h>
+#include <linux/nfs4.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/sunrpc/msg_prot.h>
+#include <linux/sunrpc/addr.h>
+
+#include <uapi/linux/nfsd/debug.h>
+
+#include "netns.h"
+#include "stats.h"
+#include "export.h"
+
+/*
+ * ifdebug(flag): guard for debug-only statements.  With
+ * CONFIG_SUNRPC_DEBUG the guarded statement runs only when the
+ * NFSDDBG_<flag> bit is set in nfsd_debug; otherwise it expands to
+ * "if (0)".
+ */
+#undef ifdebug
+#ifdef CONFIG_SUNRPC_DEBUG
+# define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag)
+#else
+# define ifdebug(flag) if (0)
+#endif
+
+/*
+ * nfsd version
+ * NOTE(review): presumably the highest NFSv4 minor version supported --
+ * confirm against the users of this macro.
+ */
+#define NFSD_SUPPORTED_MINOR_VERSION 2
+/*
+ * Maximum blocksizes supported by daemon under various circumstances.
+ */
+#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD
+/* NFSv2 is limited by the protocol specification, see RFC 1094 */
+#define NFSSVC_MAXBLKSIZE_V2 (8*1024)
+
+
+/*
+ * Largest number of bytes we need to allocate for an NFS
+ * call or reply. Used to control buffer sizes. We use
+ * the length of v3 WRITE, READDIR and READDIR replies
+ * which are an RPC header, up to 26 XDR units of reply
+ * data, and some page data.
+ *
+ * Note that accuracy here doesn't matter too much as the
+ * size is rounded up to a page size when allocating space.
+ *
+ * That is: (RPC_MAX_HEADER_WITH_AUTH + 26) XDR units, plus
+ * NFSSVC_MAXBLKSIZE bytes.
+ */
+#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE)
+
+/* Readdir status holder: carries a single big-endian status word. */
+struct readdir_cd {
+ __be32 err; /* 0, nfserr, or nfserr_eof */
+};
+
+
+/* Global objects defined elsewhere in nfsd. */
+extern struct svc_program nfsd_program;
+extern const struct svc_version nfsd_version2, nfsd_version3,
+ nfsd_version4;
+extern struct mutex nfsd_mutex;
+extern spinlock_t nfsd_drc_lock;
+extern unsigned long nfsd_drc_max_mem;
+extern unsigned long nfsd_drc_mem_used;
+
+extern const struct seq_operations nfs_exports_op;
+
+/*
+ * Function prototypes.
+ */
+int nfsd_svc(int nrservs, struct net *net, const struct cred *cred);
+int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp);
+
+/* thread and pool management, per network namespace */
+int nfsd_nrthreads(struct net *);
+int nfsd_nrpools(struct net *);
+int nfsd_get_nrthreads(int n, int *, struct net *);
+int nfsd_set_nrthreads(int n, int *, struct net *);
+int nfsd_pool_stats_open(struct inode *, struct file *);
+int nfsd_pool_stats_release(struct inode *, struct file *);
+void nfsd_shutdown_threads(struct net *net);
+
+void nfsd_destroy(struct net *net);
+
+bool i_am_nfsd(void);
+
+/*
+ * Reference-counted handle that nfsd attaches to inodes of its
+ * filesystem (through i_private) for per-client directories and the
+ * files inside them.
+ */
+struct nfsdfs_client {
+ /* reference count; one reference is held per inode pointing here */
+ struct kref cl_ref;
+ /* release callback handed to kref_put() when a reference is dropped */
+ void (*cl_release)(struct kref *kref);
+};
+
+/* returns the client attached to the inode with a new reference, or NULL */
+struct nfsdfs_client *get_nfsdfs_client(struct inode *);
+/* returns the new client directory's dentry, or NULL on failure */
+struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
+ struct nfsdfs_client *ncl, u32 id, const struct tree_descr *);
+void nfsd_client_rmdir(struct dentry *dentry);
+
+
+/*
+ * ACL service versions.  When at least one of the two ACL options is
+ * enabled, the version that is not configured is defined as NULL.
+ */
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+#ifdef CONFIG_NFSD_V2_ACL
+extern const struct svc_version nfsd_acl_version2;
+#else
+#define nfsd_acl_version2 NULL
+#endif
+#ifdef CONFIG_NFSD_V3_ACL
+extern const struct svc_version nfsd_acl_version3;
+#else
+#define nfsd_acl_version3 NULL
+#endif
+#endif
+
+struct nfsd_net;
+
+/* operation selector passed as @change to nfsd_vers() and nfsd_minorversion() */
+enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL };
+int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change);
+int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change);
+void nfsd_reset_versions(struct nfsd_net *nn);
+int nfsd_create_serv(struct net *net);
+
+extern int nfsd_max_blksize;
+
+/* Returns 1 if @rq is an NFS program request with version 4, else 0. */
+static inline int nfsd_v4client(struct svc_rqst *rq)
+{
+	if (rq->rq_prog != NFS_PROGRAM)
+		return 0;
+
+	return rq->rq_vers == 4;
+}
+/*
+ * Return the user namespace of the credential attached to the request's
+ * transport, falling back to init_user_ns when the transport has no
+ * credential.
+ */
+static inline struct user_namespace *
+nfsd_user_namespace(const struct svc_rqst *rqstp)
+{
+	const struct cred *xprt_cred = rqstp->rq_xprt->xpt_cred;
+
+	if (!xprt_cred)
+		return &init_user_ns;
+
+	return xprt_cred->user_ns;
+}
+
+/*
+ * NFSv4 State
+ *
+ * Without CONFIG_NFSD_V4 the functions below become inline stubs that do
+ * nothing and return 0, NULL or false.
+ */
+#ifdef CONFIG_NFSD_V4
+extern unsigned long max_delegations;
+int nfsd4_init_slabs(void);
+void nfsd4_free_slabs(void);
+int nfs4_state_start(void);
+int nfs4_state_start_net(struct net *net);
+void nfs4_state_shutdown(void);
+void nfs4_state_shutdown_net(struct net *net);
+int nfs4_reset_recoverydir(char *recdir);
+char * nfs4_recoverydir(void);
+bool nfsd4_spo_must_allow(struct svc_rqst *rqstp);
+#else
+static inline int nfsd4_init_slabs(void) { return 0; }
+static inline void nfsd4_free_slabs(void) { }
+static inline int nfs4_state_start(void) { return 0; }
+static inline int nfs4_state_start_net(struct net *net) { return 0; }
+static inline void nfs4_state_shutdown(void) { }
+static inline void nfs4_state_shutdown_net(struct net *net) { }
+static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
+static inline char * nfs4_recoverydir(void) {return NULL; }
+static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
+{
+ return false;
+}
+#endif
+
+/*
+ * lockd binding: set up and torn down from module init and exit.
+ */
+void nfsd_lockd_init(void);
+void nfsd_lockd_shutdown(void);
+
+
+/*
+ * These macros provide pre-xdr'ed values for faster operation.
+ *
+ * Each nfserr_* name is the matching NFSERR_* or NFS4ERR_* status code
+ * passed through cpu_to_be32().  Note that nfserr_badcookie and
+ * nfserr_bad_cookie both expand to the same NFSERR_BAD_COOKIE value.
+ */
+#define nfs_ok cpu_to_be32(NFS_OK)
+#define nfserr_perm cpu_to_be32(NFSERR_PERM)
+#define nfserr_noent cpu_to_be32(NFSERR_NOENT)
+#define nfserr_io cpu_to_be32(NFSERR_IO)
+#define nfserr_nxio cpu_to_be32(NFSERR_NXIO)
+#define nfserr_eagain cpu_to_be32(NFSERR_EAGAIN)
+#define nfserr_acces cpu_to_be32(NFSERR_ACCES)
+#define nfserr_exist cpu_to_be32(NFSERR_EXIST)
+#define nfserr_xdev cpu_to_be32(NFSERR_XDEV)
+#define nfserr_nodev cpu_to_be32(NFSERR_NODEV)
+#define nfserr_notdir cpu_to_be32(NFSERR_NOTDIR)
+#define nfserr_isdir cpu_to_be32(NFSERR_ISDIR)
+#define nfserr_inval cpu_to_be32(NFSERR_INVAL)
+#define nfserr_fbig cpu_to_be32(NFSERR_FBIG)
+#define nfserr_nospc cpu_to_be32(NFSERR_NOSPC)
+#define nfserr_rofs cpu_to_be32(NFSERR_ROFS)
+#define nfserr_mlink cpu_to_be32(NFSERR_MLINK)
+#define nfserr_opnotsupp cpu_to_be32(NFSERR_OPNOTSUPP)
+#define nfserr_nametoolong cpu_to_be32(NFSERR_NAMETOOLONG)
+#define nfserr_notempty cpu_to_be32(NFSERR_NOTEMPTY)
+#define nfserr_dquot cpu_to_be32(NFSERR_DQUOT)
+#define nfserr_stale cpu_to_be32(NFSERR_STALE)
+#define nfserr_remote cpu_to_be32(NFSERR_REMOTE)
+#define nfserr_wflush cpu_to_be32(NFSERR_WFLUSH)
+#define nfserr_badhandle cpu_to_be32(NFSERR_BADHANDLE)
+#define nfserr_notsync cpu_to_be32(NFSERR_NOT_SYNC)
+#define nfserr_badcookie cpu_to_be32(NFSERR_BAD_COOKIE)
+#define nfserr_notsupp cpu_to_be32(NFSERR_NOTSUPP)
+#define nfserr_toosmall cpu_to_be32(NFSERR_TOOSMALL)
+#define nfserr_serverfault cpu_to_be32(NFSERR_SERVERFAULT)
+#define nfserr_badtype cpu_to_be32(NFSERR_BADTYPE)
+#define nfserr_jukebox cpu_to_be32(NFSERR_JUKEBOX)
+#define nfserr_denied cpu_to_be32(NFSERR_DENIED)
+#define nfserr_deadlock cpu_to_be32(NFSERR_DEADLOCK)
+#define nfserr_expired cpu_to_be32(NFSERR_EXPIRED)
+#define nfserr_bad_cookie cpu_to_be32(NFSERR_BAD_COOKIE)
+#define nfserr_same cpu_to_be32(NFSERR_SAME)
+#define nfserr_clid_inuse cpu_to_be32(NFSERR_CLID_INUSE)
+#define nfserr_stale_clientid cpu_to_be32(NFSERR_STALE_CLIENTID)
+#define nfserr_resource cpu_to_be32(NFSERR_RESOURCE)
+#define nfserr_moved cpu_to_be32(NFSERR_MOVED)
+#define nfserr_nofilehandle cpu_to_be32(NFSERR_NOFILEHANDLE)
+#define nfserr_minor_vers_mismatch cpu_to_be32(NFSERR_MINOR_VERS_MISMATCH)
+#define nfserr_share_denied cpu_to_be32(NFSERR_SHARE_DENIED)
+#define nfserr_stale_stateid cpu_to_be32(NFSERR_STALE_STATEID)
+#define nfserr_old_stateid cpu_to_be32(NFSERR_OLD_STATEID)
+#define nfserr_bad_stateid cpu_to_be32(NFSERR_BAD_STATEID)
+#define nfserr_bad_seqid cpu_to_be32(NFSERR_BAD_SEQID)
+#define nfserr_symlink cpu_to_be32(NFSERR_SYMLINK)
+#define nfserr_not_same cpu_to_be32(NFSERR_NOT_SAME)
+#define nfserr_lock_range cpu_to_be32(NFSERR_LOCK_RANGE)
+#define nfserr_restorefh cpu_to_be32(NFSERR_RESTOREFH)
+#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP)
+#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR)
+#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE)
+#define nfserr_badowner cpu_to_be32(NFSERR_BADOWNER)
+#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD)
+#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL)
+#define nfserr_grace cpu_to_be32(NFSERR_GRACE)
+#define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE)
+#define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD)
+#define nfserr_badname cpu_to_be32(NFSERR_BADNAME)
+#define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN)
+#define nfserr_locked cpu_to_be32(NFSERR_LOCKED)
+#define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC)
+#define nfserr_badiomode cpu_to_be32(NFS4ERR_BADIOMODE)
+#define nfserr_badlayout cpu_to_be32(NFS4ERR_BADLAYOUT)
+#define nfserr_bad_session_digest cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST)
+#define nfserr_badsession cpu_to_be32(NFS4ERR_BADSESSION)
+#define nfserr_badslot cpu_to_be32(NFS4ERR_BADSLOT)
+#define nfserr_complete_already cpu_to_be32(NFS4ERR_COMPLETE_ALREADY)
+#define nfserr_conn_not_bound_to_session cpu_to_be32(NFS4ERR_CONN_NOT_BOUND_TO_SESSION)
+#define nfserr_deleg_already_wanted cpu_to_be32(NFS4ERR_DELEG_ALREADY_WANTED)
+#define nfserr_back_chan_busy cpu_to_be32(NFS4ERR_BACK_CHAN_BUSY)
+#define nfserr_layouttrylater cpu_to_be32(NFS4ERR_LAYOUTTRYLATER)
+#define nfserr_layoutunavailable cpu_to_be32(NFS4ERR_LAYOUTUNAVAILABLE)
+#define nfserr_nomatching_layout cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT)
+#define nfserr_recallconflict cpu_to_be32(NFS4ERR_RECALLCONFLICT)
+#define nfserr_unknown_layouttype cpu_to_be32(NFS4ERR_UNKNOWN_LAYOUTTYPE)
+#define nfserr_seq_misordered cpu_to_be32(NFS4ERR_SEQ_MISORDERED)
+#define nfserr_sequence_pos cpu_to_be32(NFS4ERR_SEQUENCE_POS)
+#define nfserr_req_too_big cpu_to_be32(NFS4ERR_REQ_TOO_BIG)
+#define nfserr_rep_too_big cpu_to_be32(NFS4ERR_REP_TOO_BIG)
+#define nfserr_rep_too_big_to_cache cpu_to_be32(NFS4ERR_REP_TOO_BIG_TO_CACHE)
+#define nfserr_retry_uncached_rep cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP)
+#define nfserr_unsafe_compound cpu_to_be32(NFS4ERR_UNSAFE_COMPOUND)
+#define nfserr_too_many_ops cpu_to_be32(NFS4ERR_TOO_MANY_OPS)
+#define nfserr_op_not_in_session cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION)
+#define nfserr_hash_alg_unsupp cpu_to_be32(NFS4ERR_HASH_ALG_UNSUPP)
+#define nfserr_clientid_busy cpu_to_be32(NFS4ERR_CLIENTID_BUSY)
+#define nfserr_pnfs_io_hole cpu_to_be32(NFS4ERR_PNFS_IO_HOLE)
+#define nfserr_seq_false_retry cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY)
+#define nfserr_bad_high_slot cpu_to_be32(NFS4ERR_BAD_HIGH_SLOT)
+#define nfserr_deadsession cpu_to_be32(NFS4ERR_DEADSESSION)
+#define nfserr_encr_alg_unsupp cpu_to_be32(NFS4ERR_ENCR_ALG_UNSUPP)
+#define nfserr_pnfs_no_layout cpu_to_be32(NFS4ERR_PNFS_NO_LAYOUT)
+#define nfserr_not_only_op cpu_to_be32(NFS4ERR_NOT_ONLY_OP)
+#define nfserr_wrong_cred cpu_to_be32(NFS4ERR_WRONG_CRED)
+#define nfserr_wrong_type cpu_to_be32(NFS4ERR_WRONG_TYPE)
+#define nfserr_dirdeleg_unavail cpu_to_be32(NFS4ERR_DIRDELEG_UNAVAIL)
+#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG)
+#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT)
+#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED)
+#define nfserr_partner_notsupp cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP)
+#define nfserr_partner_no_auth cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH)
+#define nfserr_union_notsupp cpu_to_be32(NFS4ERR_UNION_NOTSUPP)
+#define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED)
+#define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS)
+#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL)
+#define nfserr_file_open cpu_to_be32(NFS4ERR_FILE_OPEN)
+#define nfserr_xattr2big cpu_to_be32(NFS4ERR_XATTR2BIG)
+#define nfserr_noxattr cpu_to_be32(NFS4ERR_NOXATTR)
+
+/*
+ * Error codes for internal use, pre-xdr'ed like the nfserr_* values.
+ */
+/* If a request fails due to kmalloc failure, it gets dropped.
+ * The client should resend eventually.
+ */
+#define nfserr_dropit cpu_to_be32(30000)
+/* end-of-file indicator in readdir */
+#define nfserr_eof cpu_to_be32(30001)
+/* replay detected */
+#define nfserr_replay_me cpu_to_be32(11001)
+/* nfs41 replay detected */
+#define nfserr_replay_cache cpu_to_be32(11002)
+
+/*
+ * Check for dir entries '.' and '..'
+ *
+ * @n is the entry name and @l its length.  Evaluates to non-zero only for
+ * the one-character name "." and the two-character name "..".  Both
+ * arguments are parenthesized so that expression arguments (for example a
+ * conditional expression passed as @l) are parsed as a unit.  Each
+ * argument may still be evaluated more than once, so avoid arguments
+ * with side effects.
+ */
+#define isdotent(n, l) ((l) < 3 && (n)[0] == '.' && ((l) == 1 || (n)[1] == '.'))
+
+#ifdef CONFIG_NFSD_V4
+
+/* Before processing a COMPOUND operation, we have to check that there
+ * is enough space in the buffer for XDR encode to succeed. Otherwise,
+ * we might process an operation with side effects, and be unable to
+ * tell the client that the operation succeeded.
+ *
+ * COMPOUND_SLACK_SPACE - this is the minimum bytes of buffer space
+ * needed to encode an "ordinary" _successful_ operation. (GETATTR,
+ * READ, READDIR, and READLINK have their own buffer checks.) If we
+ * fall below this level, we fail the next operation with NFS4ERR_RESOURCE.
+ *
+ * COMPOUND_ERR_SLACK_SPACE - this is the minimum bytes of buffer space
+ * needed to encode an operation which has failed with NFS4ERR_RESOURCE.
+ * Care is taken to ensure that we never fall below this level for any
+ * reason.
+ */
+#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
+#define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */
+
+#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */
+
+/*
+ * Attribute masks for NFSv4.0, one macro per 32-bit bitmap word.
+ *
+ * The following attributes are currently not supported by the NFSv4 server:
+ * ARCHIVE (deprecated anyway)
+ * HIDDEN (unlikely to be supported any time soon)
+ * MIMETYPE (unlikely to be supported any time soon)
+ * QUOTA_* (will be supported in a forthcoming patch)
+ * SYSTEM (unlikely to be supported any time soon)
+ * TIME_BACKUP (unlikely to be supported any time soon)
+ * TIME_CREATE (unlikely to be supported any time soon)
+ */
+#define NFSD4_SUPPORTED_ATTRS_WORD0 \
+(FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \
+ | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_LINK_SUPPORT \
+ | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR | FATTR4_WORD0_FSID \
+ | FATTR4_WORD0_UNIQUE_HANDLES | FATTR4_WORD0_LEASE_TIME | FATTR4_WORD0_RDATTR_ERROR \
+ | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_CANSETTIME | FATTR4_WORD0_CASE_INSENSITIVE \
+ | FATTR4_WORD0_CASE_PRESERVING | FATTR4_WORD0_CHOWN_RESTRICTED \
+ | FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FILEID | FATTR4_WORD0_FILES_AVAIL \
+ | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_HOMOGENEOUS \
+ | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \
+ | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_ACL)
+
+#define NFSD4_SUPPORTED_ATTRS_WORD1 \
+(FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \
+ | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \
+ | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \
+ | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \
+ | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \
+ | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID)
+
+/* no attributes from the third bitmap word at this level */
+#define NFSD4_SUPPORTED_ATTRS_WORD2 0
+
+/*
+ * 4.1: the 4.0 masks plus the pNFS layout attributes (only when
+ * CONFIG_NFSD_PNFS is set) and SUPPATTR_EXCLCREAT.
+ */
+#ifdef CONFIG_NFSD_PNFS
+#define PNFSD_SUPPORTED_ATTRS_WORD1 FATTR4_WORD1_FS_LAYOUT_TYPES
+#define PNFSD_SUPPORTED_ATTRS_WORD2 \
+(FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_LAYOUT_TYPES)
+#else
+#define PNFSD_SUPPORTED_ATTRS_WORD1 0
+#define PNFSD_SUPPORTED_ATTRS_WORD2 0
+#endif /* CONFIG_NFSD_PNFS */
+
+#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
+ NFSD4_SUPPORTED_ATTRS_WORD0
+
+#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
+ (NFSD4_SUPPORTED_ATTRS_WORD1 | PNFSD_SUPPORTED_ATTRS_WORD1)
+
+#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
+ (NFSD4_SUPPORTED_ATTRS_WORD2 | PNFSD_SUPPORTED_ATTRS_WORD2 | \
+ FATTR4_WORD2_SUPPATTR_EXCLCREAT)
+
+/*
+ * 4.2: only the third word grows beyond 4.1; the security label bit is
+ * included only with CONFIG_NFSD_V4_SECURITY_LABEL.
+ */
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL
+#else
+#define NFSD4_2_SECURITY_ATTRS 0
+#endif
+
+#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
+ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \
+ FATTR4_WORD2_CHANGE_ATTR_TYPE | \
+ FATTR4_WORD2_MODE_UMASK | \
+ NFSD4_2_SECURITY_ATTRS | \
+ FATTR4_WORD2_XATTR_SUPPORT)
+
+/* supported-attribute bitmaps: first index is the minor version, second the bitmap word */
+extern const u32 nfsd_suppattrs[3][3];
+
+/*
+ * Fill @netaddr from the socket address @addr.
+ *
+ * The netid is set to "tcp" for AF_INET and "tcp6" for AF_INET6.  The
+ * address string is the rpc_ntop() presentation form followed by
+ * ".<port high byte>.<port low byte>".
+ *
+ * Returns 0 on success or nfserr_inval for any other address family.
+ *
+ * NOTE(review): the return value of rpc_ntop() is not checked, and when
+ * the port suffix would not fit the WARN_ON() fires but addr_len is
+ * still computed from the untruncated length -- confirm callers cope.
+ */
+static inline __be32 nfsd4_set_netaddr(struct sockaddr *addr,
+ struct nfs42_netaddr *netaddr)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
+ unsigned int port;
+ size_t ret_addr, ret_port;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ port = ntohs(sin->sin_port);
+ sprintf(netaddr->netid, "tcp");
+ netaddr->netid_len = 3;
+ break;
+ case AF_INET6:
+ port = ntohs(sin6->sin6_port);
+ sprintf(netaddr->netid, "tcp6");
+ netaddr->netid_len = 4;
+ break;
+ default:
+ return nfserr_inval;
+ }
+ ret_addr = rpc_ntop(addr, netaddr->addr, sizeof(netaddr->addr));
+ /* append the port as two decimal bytes, high byte first */
+ ret_port = snprintf(netaddr->addr + ret_addr,
+ RPCBIND_MAXUADDRLEN + 1 - ret_addr,
+ ".%u.%u", port >> 8, port & 0xff);
+ WARN_ON(ret_port >= RPCBIND_MAXUADDRLEN + 1 - ret_addr);
+ netaddr->addr_len = ret_addr + ret_port;
+ return 0;
+}
+
+/*
+ * Return true if every bit set in the three-word bitmap @bm1 is also set
+ * in the three-word bitmap @bm2.
+ */
+static inline bool bmval_is_subset(const u32 *bm1, const u32 *bm2)
+{
+	int word;
+
+	for (word = 0; word < 3; word++) {
+		/* a bit present in bm1 but absent from bm2 breaks the subset */
+		if (bm1[word] & ~bm2[word])
+			return false;
+	}
+	return true;
+}
+
+/*
+ * Return true if the attribute bitmap @bmval only requests attributes
+ * listed in nfsd_suppattrs[] for @minorversion.  @minorversion indexes
+ * the table directly and is not range-checked here.
+ */
+static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
+{
+	const u32 *supported = nfsd_suppattrs[minorversion];
+
+	return bmval_is_subset(bmval, supported);
+}
+
+/* These will return ERR_INVAL if specified in GETATTR or READDIR. */
+#define NFSD_WRITEONLY_ATTRS_WORD1 \
+ (FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+
+/*
+ * These are the only attrs allowed in CREATE/OPEN/SETATTR. Don't add
+ * a writeable attribute here without also adding code to parse it to
+ * nfsd4_decode_fattr().
+ */
+#define NFSD_WRITEABLE_ATTRS_WORD0 \
+ (FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL)
+#define NFSD_WRITEABLE_ATTRS_WORD1 \
+ (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
+ | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+/* the security label is writeable only with CONFIG_NFSD_V4_SECURITY_LABEL */
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#define MAYBE_FATTR4_WORD2_SECURITY_LABEL \
+ FATTR4_WORD2_SECURITY_LABEL
+#else
+#define MAYBE_FATTR4_WORD2_SECURITY_LABEL 0
+#endif
+#define NFSD_WRITEABLE_ATTRS_WORD2 \
+ (FATTR4_WORD2_MODE_UMASK \
+ | MAYBE_FATTR4_WORD2_SECURITY_LABEL)
+
+/*
+ * Attribute masks for exclusive create: the writeable attributes, minus
+ * the two time-set attributes in word 1 (see below).
+ */
+#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
+ NFSD_WRITEABLE_ATTRS_WORD0
+/*
+ * we currently store the exclusive create verifier in the v_{a,m}time
+ * attributes so the client can't set these at create time using EXCLUSIVE4_1
+ */
+#define NFSD_SUPPATTR_EXCLCREAT_WORD1 \
+ (NFSD_WRITEABLE_ATTRS_WORD1 & \
+ ~(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET))
+#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \
+ NFSD_WRITEABLE_ATTRS_WORD2
+
+/* NFSv4-only helpers; the non-V4 branch below provides no-op stand-ins. */
+extern int nfsd4_is_junction(struct dentry *dentry);
+extern int register_cld_notifier(void);
+extern void unregister_cld_notifier(void);
+#else /* CONFIG_NFSD_V4 */
+/* without NFSv4 no dentry is ever reported as a junction */
+static inline int nfsd4_is_junction(struct dentry *dentry)
+{
+ return 0;
+}
+
+/* registration trivially succeeds; unregistration is an empty statement */
+#define register_cld_notifier() 0
+#define unregister_cld_notifier() do { } while(0)
+
+#endif /* CONFIG_NFSD_V4 */
+
+#endif /* LINUX_NFSD_NFSD_H */
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
new file mode 100644
index 000000000..c81dbbad8
--- /dev/null
+++ b/fs/nfsd/nfsfh.c
@@ -0,0 +1,713 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NFS server file handle treatment.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ * Portions Copyright (C) 1999 G. Allen Morris III <gam3@acm.org>
+ * Extensive rewrite by Neil Brown <neilb@cse.unsw.edu.au> Southern-Spring 1999
+ * ... and again Southern-Winter 2001 to support export_operations
+ */
+
+#include <linux/exportfs.h>
+
+#include <linux/sunrpc/svcauth_gss.h>
+#include "nfsd.h"
+#include "vfs.h"
+#include "auth.h"
+#include "trace.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_FH
+
+
+/*
+ * our acceptability function.
+ * if NOSUBTREECHECK, accept anything
+ * if not, require that we can walk up to exp->ex_dentry
+ * doing some checks on the 'x' bits
+ */
+static int nfsd_acceptable(void *expv, struct dentry *dentry)
+{
+	struct svc_export *exp = expv;
+	struct dentry *export_root = exp->ex_path.dentry;
+	struct dentry *cur;
+	int reached;
+
+	/* Without subtree checking every dentry is acceptable. */
+	if (exp->ex_flags & NFSEXP_NOSUBTREECHECK)
+		return 1;
+
+	/*
+	 * Climb from @dentry towards the export root, holding a reference
+	 * on the current position only.  Stop at the export root, at a
+	 * filesystem root, or at the first parent lacking MAY_EXEC.
+	 */
+	cur = dget(dentry);
+	for (;;) {
+		struct dentry *up;
+
+		if (cur == export_root || IS_ROOT(cur))
+			break;
+		up = dget_parent(cur);
+		if (inode_permission(d_inode(up), MAY_EXEC) < 0) {
+			dput(up);
+			break;
+		}
+		dput(cur);
+		cur = up;
+	}
+
+	reached = (cur == export_root);
+	if (!reached)
+		dprintk("nfsd_acceptable failed at %p %pd\n", cur, cur);
+	dput(cur);
+	return reached;
+}
+
+/* Type check. The correct error return for type mismatches does not seem to be
+ * generally agreed upon. SunOS seems to use EISDIR if file isn't S_IFREG; a
+ * comment in the NFSv3 spec says this is incorrect (implementation notes for
+ * the write call).
+ */
+static inline __be32
+nfsd_mode_check(struct svc_rqst *rqstp, struct dentry *dentry,
+		umode_t requested)
+{
+	umode_t actual = d_inode(dentry)->i_mode & S_IFMT;
+
+	/* A zero @requested means the caller accepts any object type. */
+	if (!requested)
+		return nfs_ok;
+
+	if (actual != requested) {
+		/*
+		 * v4 has an error more specific than err_notdir which we
+		 * should return in preference to err_notdir:
+		 */
+		if (rqstp->rq_vers == 4 && actual == S_IFLNK)
+			return nfserr_symlink;
+		if (requested == S_IFDIR)
+			return nfserr_notdir;
+		return actual == S_IFDIR ? nfserr_isdir : nfserr_inval;
+	}
+
+	/* Types match; a directory must additionally support lookup. */
+	if (actual == S_IFDIR && !d_can_lookup(dentry)) {
+		WARN_ON_ONCE(1);
+		return nfserr_notdir;
+	}
+	return nfs_ok;
+}
+
+/*
+ * Decide whether the source port of @rqstp is acceptable for an export
+ * whose effective flags are @flags.
+ */
+static bool nfsd_originating_port_ok(struct svc_rqst *rqstp, int flags)
+{
+	bool insecure_allowed = flags & NFSEXP_INSECURE_PORT;
+	/* We don't require gss requests to use low ports. */
+	bool uses_gss = rqstp->rq_cred.cr_flavor >= RPC_AUTH_GSS;
+
+	if (insecure_allowed || uses_gss)
+		return true;
+	return test_bit(RQ_SECURE, &rqstp->rq_flags);
+}
+
+/*
+ * Reject requests arriving from an unacceptable port with nfserr_perm;
+ * otherwise set the user credentials for this export via nfsd_setuser()
+ * and return its result translated by nfserrno().
+ */
+static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
+					  struct svc_export *exp)
+{
+	int export_flags = nfsexp_flags(rqstp, exp);
+	bool port_ok = nfsd_originating_port_ok(rqstp, export_flags);
+
+	if (!port_ok) {
+		RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
+		dprintk("nfsd: request from insecure port %s!\n",
+			svc_print_addr(rqstp, buf, sizeof(buf)));
+		return nfserr_perm;
+	}
+
+	return nfserrno(nfsd_setuser(rqstp, exp));
+}
+
+/*
+ * Extra restrictions for NFSEXP_V4ROOT (pseudo-root) exports.  Exports
+ * without that flag pass unconditionally; every failure is reported as
+ * nfserr_stale.
+ */
+static inline __be32 check_pseudo_root(struct svc_rqst *rqstp,
+	struct dentry *dentry, struct svc_export *exp)
+{
+	bool traversable;
+
+	if (!(exp->ex_flags & NFSEXP_V4ROOT))
+		return nfs_ok;
+
+	/*
+	 * v2/v3 clients have no need for the V4ROOT export--they use
+	 * the mount protocol instead; also, further V4ROOT checks may be
+	 * in v4-specific code, in which case v2/v3 clients could bypass
+	 * them.
+	 */
+	if (!nfsd_v4client(rqstp))
+		return nfserr_stale;
+
+	/*
+	 * We're exposing only the directories and symlinks that have to be
+	 * traversed on the way to real exports:
+	 */
+	traversable = d_is_dir(dentry) || d_is_symlink(dentry);
+	if (unlikely(!traversable))
+		return nfserr_stale;
+
+	/*
+	 * A pseudoroot export gives permission to access only one
+	 * single directory; the kernel has to make another upcall
+	 * before granting access to anything else under it:
+	 */
+	return unlikely(dentry != exp->ex_path.dentry) ? nfserr_stale : nfs_ok;
+}
+
+/*
+ * Use the given filehandle to look up the corresponding export and
+ * dentry. On success, the results are used to set fh_export and
+ * fh_dentry.
+ */
+static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
+{
+ struct knfsd_fh *fh = &fhp->fh_handle;
+ struct fid *fid = NULL, sfid;
+ struct svc_export *exp;
+ struct dentry *dentry;
+ int fileid_type;
+ /* Remaining handle length, counted in 32-bit words. */
+ int data_left = fh->fh_size/4;
+ __be32 error;
+
+ /* Malformed handles are "stale" for v2 and "bad handle" for v3 and up. */
+ error = nfserr_stale;
+ if (rqstp->rq_vers > 2)
+ error = nfserr_badhandle;
+ if (rqstp->rq_vers == 4 && fh->fh_size == 0)
+ return nfserr_nofilehandle;
+
+ if (fh->fh_version == 1) {
+ int len;
+
+ /*
+ * One word is consumed ahead of the fsid; presumably the word
+ * holding the version/auth/fsid-type/fileid-type bytes
+ * (confirm against struct knfsd_fh).
+ */
+ if (--data_left < 0)
+ return error;
+ if (fh->fh_auth_type != 0)
+ return error;
+ /* key_len() yields 0 for an unknown fsid type. */
+ len = key_len(fh->fh_fsid_type) / 4;
+ if (len == 0)
+ return error;
+ if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
+ /* deprecated, convert to type 3 */
+ len = key_len(FSID_ENCODE_DEV)/4;
+ fh->fh_fsid_type = FSID_ENCODE_DEV;
+ /*
+ * struct knfsd_fh uses host-endian fields, which are
+ * sometimes used to hold net-endian values. This
+ * confuses sparse, so we must use __force here to
+ * keep it from complaining.
+ */
+ fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
+ ntohl((__force __be32)fh->fh_fsid[1])));
+ fh->fh_fsid[1] = fh->fh_fsid[2];
+ }
+ data_left -= len;
+ if (data_left < 0)
+ return error;
+ exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid);
+ /* The file id is taken from the words following the fsid. */
+ fid = (struct fid *)(fh->fh_fsid + len);
+ } else {
+ __u32 tfh[2];
+ dev_t xdev;
+ ino_t xino;
+
+ /* Any other version must be exactly NFS_FHSIZE bytes long. */
+ if (fh->fh_size != NFS_FHSIZE)
+ return error;
+ /* assume old filehandle format */
+ xdev = old_decode_dev(fh->ofh_xdev);
+ xino = u32_to_ino_t(fh->ofh_xino);
+ mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL);
+ exp = rqst_exp_find(rqstp, FSID_DEV, tfh);
+ }
+
+ error = nfserr_stale;
+ if (IS_ERR(exp)) {
+ trace_nfsd_set_fh_dentry_badexport(rqstp, fhp, PTR_ERR(exp));
+
+ /* A missing export is a stale handle; other errors are translated. */
+ if (PTR_ERR(exp) == -ENOENT)
+ return error;
+
+ return nfserrno(PTR_ERR(exp));
+ }
+
+ if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) {
+ /* Elevate privileges so that the lack of 'r' or 'x'
+ * permission on some parent directory will
+ * not stop exportfs_decode_fh from being able
+ * to reconnect a directory into the dentry cache.
+ * The same problem can affect "SUBTREECHECK" exports,
+ * but as nfsd_acceptable depends on correct
+ * access control settings being in effect, we cannot
+ * fix that case easily.
+ */
+ struct cred *new = prepare_creds();
+ if (!new) {
+ error = nfserrno(-ENOMEM);
+ goto out;
+ }
+ new->cap_effective =
+ cap_raise_nfsd_set(new->cap_effective,
+ new->cap_permitted);
+ /* Install the new creds, dropping the old ones and our own ref. */
+ put_cred(override_creds(new));
+ put_cred(new);
+ } else {
+ error = nfsd_setuser_and_check_port(rqstp, exp);
+ if (error)
+ goto out;
+ }
+
+ /*
+ * Look up the dentry using the NFS file handle.
+ */
+ error = nfserr_stale;
+ if (rqstp->rq_vers > 2)
+ error = nfserr_badhandle;
+
+ if (fh->fh_version != 1) {
+ /* Old-style handle: build a three-word fid from the ofh_* fields. */
+ sfid.i32.ino = fh->ofh_ino;
+ sfid.i32.gen = fh->ofh_generation;
+ sfid.i32.parent_ino = fh->ofh_dirino;
+ fid = &sfid;
+ data_left = 3;
+ if (fh->ofh_dirino == 0)
+ fileid_type = FILEID_INO32_GEN;
+ else
+ fileid_type = FILEID_INO32_GEN_PARENT;
+ } else
+ fileid_type = fh->fh_fileid_type;
+
+ /* FILEID_ROOT names the export root itself, so nothing is decoded. */
+ if (fileid_type == FILEID_ROOT)
+ dentry = dget(exp->ex_path.dentry);
+ else {
+ dentry = exportfs_decode_fh(exp->ex_path.mnt, fid,
+ data_left, fileid_type,
+ nfsd_acceptable, exp);
+ if (IS_ERR_OR_NULL(dentry))
+ trace_nfsd_set_fh_dentry_badhandle(rqstp, fhp,
+ dentry ? PTR_ERR(dentry) : -ESTALE);
+ }
+ /* NULL result: fail with the stale/badhandle default chosen above. */
+ if (dentry == NULL)
+ goto out;
+ if (IS_ERR(dentry)) {
+ /* -EINVAL keeps the default error; anything else is translated. */
+ if (PTR_ERR(dentry) != -EINVAL)
+ error = nfserrno(PTR_ERR(dentry));
+ goto out;
+ }
+
+ if (d_is_dir(dentry) &&
+ (dentry->d_flags & DCACHE_DISCONNECTED)) {
+ printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %pd2\n",
+ dentry);
+ }
+
+ /* Success: the dentry and export references are handed to @fhp. */
+ fhp->fh_dentry = dentry;
+ fhp->fh_export = exp;
+ return 0;
+out:
+ /* Failure after the export lookup: drop the export reference. */
+ exp_put(exp);
+ return error;
+}
+
+/**
+ * fh_verify - filehandle lookup and access checking
+ * @rqstp: pointer to current rpc request
+ * @fhp: filehandle to be verified
+ * @type: expected type of object pointed to by filehandle
+ * @access: type of access needed to object
+ *
+ * Look up a dentry from the on-the-wire filehandle, check the client's
+ * access to the export, and set the current task's credentials.
+ *
+ * Regardless of success or failure of fh_verify(), fh_put() should be
+ * called on @fhp when the caller is finished with the filehandle.
+ *
+ * fh_verify() may be called multiple times on a given filehandle, for
+ * example, when processing an NFSv4 compound. The first call will look
+ * up a dentry using the on-the-wire filehandle. Subsequent calls will
+ * skip the lookup and just perform the other checks and possibly change
+ * the current task's credentials.
+ *
+ * @type specifies the type of object expected using one of the S_IF*
+ * constants defined in include/linux/stat.h. The caller may use zero
+ * to indicate that it doesn't care. (Negative values are not handled:
+ * nfsd_mode_check() only tests for zero or an exact S_IFMT match.)
+ *
+ * @access is formed from the NFSD_MAY_* constants defined in
+ * fs/nfsd/vfs.h.
+ */
+__be32
+fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
+{
+ struct svc_export *exp;
+ struct dentry *dentry;
+ __be32 error;
+
+ dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp));
+
+ /* First use of this handle: resolve its dentry and export. */
+ if (!fhp->fh_dentry) {
+ error = nfsd_set_fh_dentry(rqstp, fhp);
+ if (error)
+ goto out;
+ }
+ dentry = fhp->fh_dentry;
+ exp = fhp->fh_export;
+ /*
+ * We still have to do all these permission checks, even when
+ * fh_dentry is already set:
+ * - fh_verify may be called multiple times with different
+ * "access" arguments (e.g. nfsd_proc_create calls
+ * fh_verify(...,NFSD_MAY_EXEC) first, then later (in
+ * nfsd_create) calls fh_verify(...,NFSD_MAY_CREATE).
+ * - in the NFSv4 case, the filehandle may have been filled
+ * in by fh_compose, and given a dentry, but further
+ * compound operations performed with that filehandle
+ * still need permissions checks. In the worst case, a
+ * mountpoint crossing may have changed the export
+ * options, and we may now need to use a different uid
+ * (for example, if different id-squashing options are in
+ * effect on the new filesystem).
+ */
+ error = check_pseudo_root(rqstp, dentry, exp);
+ if (error)
+ goto out;
+
+ error = nfsd_setuser_and_check_port(rqstp, exp);
+ if (error)
+ goto out;
+
+ /* Compare the object's file type with what the caller expects. */
+ error = nfsd_mode_check(rqstp, dentry, type);
+ if (error)
+ goto out;
+
+ /*
+ * pseudoflavor restrictions are not enforced on NLM,
+ * which clients virtually always use auth_sys for,
+ * even while using RPCSEC_GSS for NFS.
+ */
+ if (access & NFSD_MAY_LOCK || access & NFSD_MAY_BYPASS_GSS)
+ goto skip_pseudoflavor_check;
+ /*
+ * Clients may expect to be able to use auth_sys during mount,
+ * even if they use gss for everything else; see section 2.3.2
+ * of rfc 2623.
+ */
+ if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT
+ && exp->ex_path.dentry == dentry)
+ goto skip_pseudoflavor_check;
+
+ error = check_nfsd_access(exp, rqstp);
+ if (error)
+ goto out;
+
+skip_pseudoflavor_check:
+ /* Finally, check access permissions. */
+ error = nfsd_permission(rqstp, exp, dentry, access);
+
+ if (error) {
+ dprintk("fh_verify: %pd2 permission failure, "
+ "acc=%x, error=%d\n",
+ dentry,
+ access, ntohl(error));
+ }
+out:
+ /* Every exit path lands here so stale-handle results get counted. */
+ if (error == nfserr_stale)
+ nfsdstats.fh_stale++;
+ return error;
+}
+
+
+/*
+ * Compose a file handle for an NFS reply.
+ *
+ * Note that when first composed, the dentry may not yet have
+ * an inode. In this case a call to fh_update should be made
+ * before the fh goes out on the wire ...
+ */
+static void _fh_update(struct svc_fh *fhp, struct svc_export *exp,
+		struct dentry *dentry)
+{
+	struct knfsd_fh *fh = &fhp->fh_handle;
+	struct fid *fid;
+	int words_left;
+	int want_parent;
+
+	/* The export root needs no encoded file id. */
+	if (dentry == exp->ex_path.dentry) {
+		fh->fh_fileid_type = FILEID_ROOT;
+		return;
+	}
+
+	/* The file id is written right after what the handle holds so far. */
+	fid = (struct fid *)(fh->fh_fsid + fh->fh_size/4 - 1);
+	words_left = (fhp->fh_maxsize - fh->fh_size)/4;
+	want_parent = !(exp->ex_flags & NFSEXP_NOSUBTREECHECK);
+
+	/* exportfs_encode_fh() updates words_left; grow the handle by that. */
+	fh->fh_fileid_type =
+		exportfs_encode_fh(dentry, fid, &words_left, want_parent);
+	fh->fh_size += words_left * 4;
+}
+
+/*
+ * for composing old style file handles
+ */
+static inline void _fh_update_old(struct dentry *dentry,
+				  struct svc_export *exp,
+				  struct knfsd_fh *fh)
+{
+	struct inode *inode = d_inode(dentry);
+
+	fh->ofh_ino = ino_t_to_u32(inode->i_ino);
+	fh->ofh_generation = inode->i_generation;
+
+	/*
+	 * The parent inode number is kept only for non-directories on
+	 * exports that do subtree checking; otherwise it is zeroed.
+	 */
+	if (!d_is_dir(dentry) && !(exp->ex_flags & NFSEXP_NOSUBTREECHECK))
+		return;
+	fh->ofh_dirino = 0;
+}
+
+/* True when the exported dentry is the root dentry of its superblock. */
+static bool is_root_export(struct svc_export *exp)
+{
+	struct dentry *exported = exp->ex_path.dentry;
+
+	return exported == exported->d_sb->s_root;
+}
+
+/* Superblock of the dentry that @exp exports. */
+static struct super_block *exp_sb(struct svc_export *exp)
+{
+	struct dentry *exported = exp->ex_path.dentry;
+
+	return exported->d_sb;
+}
+
+/*
+ * Decide whether a filehandle using @fsid_type can identify @exp.
+ * Types not listed below are accepted.
+ */
+static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp)
+{
+	switch (fsid_type) {
+	case FSID_DEV:
+	case FSID_MAJOR_MINOR:
+	case FSID_ENCODE_DEV: {
+		struct super_block *sb = exp_sb(exp);
+
+		/* FSID_DEV additionally needs a device number old_valid_dev() accepts. */
+		if (fsid_type == FSID_DEV && !old_valid_dev(sb->s_dev))
+			return false;
+		return sb->s_type->fs_flags & FS_REQUIRES_DEV;
+	}
+	case FSID_NUM:
+		return exp->ex_flags & NFSEXP_FSID;
+	case FSID_UUID8:
+	case FSID_UUID16:
+	case FSID_UUID4_INUM:
+	case FSID_UUID16_INUM: {
+		bool root_only = fsid_type == FSID_UUID8 ||
+				 fsid_type == FSID_UUID16;
+
+		/* The forms without an inode number only fit root exports. */
+		if (root_only && !is_root_export(exp))
+			return false;
+		return exp->ex_uuid != NULL;
+	}
+	default:
+		return true;
+	}
+}
+
+
+/*
+ * Choose the filehandle version and fsid type for @fhp.
+ *
+ * If @ref_fh refers to the same export, its version and fsid type are
+ * reused when they are usable for @exp.  Otherwise version 1 is used and
+ * the fsid type is derived from the export: FSID_NUM when NFSEXP_FSID is
+ * set, one of the FSID_UUID* types when the export has a uuid (chosen by
+ * fh_maxsize and whether the export is a filesystem root), and a
+ * device-based type as the last resort.
+ */
+static void set_version_and_fsid_type(struct svc_fh *fhp, struct svc_export *exp, struct svc_fh *ref_fh)
+{
+ u8 version;
+ u8 fsid_type;
+retry:
+ version = 1;
+ if (ref_fh && ref_fh->fh_export == exp) {
+ version = ref_fh->fh_handle.fh_version;
+ fsid_type = ref_fh->fh_handle.fh_fsid_type;
+
+ /* Any "goto retry" below must take the export-derived path. */
+ ref_fh = NULL;
+
+ switch (version) {
+ case 0xca:
+ /* old-style handle: always device based */
+ fsid_type = FSID_DEV;
+ break;
+ case 1:
+ break;
+ default:
+ goto retry;
+ }
+
+ /*
+ * As the fsid -> filesystem mapping was guided by
+ * user-space, there is no guarantee that the filesystem
+ * actually supports that fsid type. If it doesn't we
+ * loop around again without ref_fh set.
+ */
+ if (!fsid_type_ok_for_exp(fsid_type, exp))
+ goto retry;
+ } else if (exp->ex_flags & NFSEXP_FSID) {
+ fsid_type = FSID_NUM;
+ } else if (exp->ex_uuid) {
+ if (fhp->fh_maxsize >= 64) {
+ if (is_root_export(exp))
+ fsid_type = FSID_UUID16;
+ else
+ fsid_type = FSID_UUID16_INUM;
+ } else {
+ if (is_root_export(exp))
+ fsid_type = FSID_UUID8;
+ else
+ fsid_type = FSID_UUID4_INUM;
+ }
+ } else if (!old_valid_dev(exp_sb(exp)->s_dev))
+ /* for newer device numbers, we must use a newer fsid format */
+ fsid_type = FSID_ENCODE_DEV;
+ else
+ fsid_type = FSID_DEV;
+ fhp->fh_handle.fh_version = version;
+ /*
+ * NOTE(review): version is either 1 or 0xca when control reaches
+ * here, so this test looks always true -- confirm.
+ */
+ if (version)
+ fhp->fh_handle.fh_fsid_type = fsid_type;
+}
+
+__be32
+fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
+ struct svc_fh *ref_fh)
+{
+ /* ref_fh is a reference file handle.
+ * if it is non-null and for the same filesystem, then we should compose
+ * a filehandle which is of the same version, where possible.
+ * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca
+ * Then create a 32byte filehandle using nfs_fhbase_old
+ *
+ * Returns 0 on success, or nfserr_opnotsupp when the file id type
+ * ends up as FILEID_INVALID (the handle is released in that case).
+ */
+
+ struct inode * inode = d_inode(dentry);
+ dev_t ex_dev = exp_sb(exp)->s_dev;
+
+ dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n",
+ MAJOR(ex_dev), MINOR(ex_dev),
+ (long) d_inode(exp->ex_path.dentry)->i_ino,
+ dentry,
+ (inode ? inode->i_ino : 0));
+
+ /* Choose filehandle version and fsid type based on
+ * the reference filehandle (if it is in the same export)
+ * or the export options.
+ */
+ set_version_and_fsid_type(fhp, exp, ref_fh);
+
+ /* Composing over the reference handle: release what it holds first. */
+ if (ref_fh == fhp)
+ fh_put(ref_fh);
+
+ /* The two checks below only log; composition continues regardless. */
+ if (fhp->fh_locked || fhp->fh_dentry) {
+ printk(KERN_ERR "fh_compose: fh %pd2 not initialized!\n",
+ dentry);
+ }
+ if (fhp->fh_maxsize < NFS_FHSIZE)
+ printk(KERN_ERR "fh_compose: called with maxsize %d! %pd2\n",
+ fhp->fh_maxsize,
+ dentry);
+
+ fhp->fh_dentry = dget(dentry); /* our internal copy */
+ fhp->fh_export = exp_get(exp);
+
+ if (fhp->fh_handle.fh_version == 0xca) {
+ /* old style filehandle please */
+ memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE);
+ fhp->fh_handle.fh_size = NFS_FHSIZE;
+ fhp->fh_handle.ofh_dcookie = 0xfeebbaca;
+ fhp->fh_handle.ofh_dev = old_encode_dev(ex_dev);
+ fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev;
+ fhp->fh_handle.ofh_xino =
+ ino_t_to_u32(d_inode(exp->ex_path.dentry)->i_ino);
+ fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry));
+ if (inode)
+ _fh_update_old(dentry, exp, &fhp->fh_handle);
+ } else {
+ /* Initial size: the fsid key plus 4 more bytes. */
+ fhp->fh_handle.fh_size =
+ key_len(fhp->fh_handle.fh_fsid_type) + 4;
+ fhp->fh_handle.fh_auth_type = 0;
+
+ mk_fsid(fhp->fh_handle.fh_fsid_type,
+ fhp->fh_handle.fh_fsid,
+ ex_dev,
+ d_inode(exp->ex_path.dentry)->i_ino,
+ exp->ex_fsid, exp->ex_uuid);
+
+ /* With no inode yet, the file id is left for a later fh_update(). */
+ if (inode)
+ _fh_update(fhp, exp, dentry);
+ if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
+ /* Drop the dentry/export references taken above. */
+ fh_put(fhp);
+ return nfserr_opnotsupp;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Update file handle information after changing a dentry.
+ * This is only called by nfsd_create, nfsd_create_v3 and nfsd_proc_create
+ */
+__be32
+fh_update(struct svc_fh *fhp)
+{
+	struct dentry *dentry = fhp->fh_dentry;
+
+	/* The handle must already carry a dentry ... */
+	if (!dentry) {
+		printk(KERN_ERR "fh_update: fh not verified!\n");
+		return nfserr_serverfault;
+	}
+	/* ... and that dentry must have gained an inode by now. */
+	if (d_really_is_negative(dentry)) {
+		printk(KERN_ERR "fh_update: %pd2 still negative!\n",
+			dentry);
+		return nfserr_serverfault;
+	}
+
+	if (fhp->fh_handle.fh_version != 1) {
+		_fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle);
+		return 0;
+	}
+
+	/* A file id other than FILEID_ROOT was already filled in. */
+	if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT)
+		return 0;
+
+	_fh_update(fhp, fhp->fh_export, dentry);
+	if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
+		return nfserr_opnotsupp;
+	return 0;
+}
+
+/*
+ * Release a file handle.
+ */
+void
+fh_put(struct svc_fh *fhp)
+{
+	struct dentry *held_dentry = fhp->fh_dentry;
+	struct svc_export *held_exp = fhp->fh_export;
+
+	if (held_dentry) {
+		/* Unlock before the dentry pointer is cleared. */
+		fh_unlock(fhp);
+		fhp->fh_dentry = NULL;
+		dput(held_dentry);
+		fh_clear_wcc(fhp);
+	}
+	fh_drop_write(fhp);
+
+	if (!held_exp)
+		return;
+	exp_put(held_exp);
+	fhp->fh_export = NULL;
+}
+
+/*
+ * Shorthand for dprintk()'s
+ */
+char * SVCFH_fmt(struct svc_fh *fhp)
+{
+	/*
+	 * Format the handle size and the first six words of fh_pad[] as
+	 * text.
+	 *
+	 * The result lives in a function-static buffer: it is overwritten
+	 * by the next call, and concurrent callers share it.  Use it only
+	 * for debug output that is consumed immediately.
+	 */
+	struct knfsd_fh *fh = &fhp->fh_handle;
+	static char buf[80];
+
+	/* Bounded write: the output can never run past the end of buf. */
+	snprintf(buf, sizeof(buf), "%d: %08x %08x %08x %08x %08x %08x",
+		 fh->fh_size,
+		 fh->fh_base.fh_pad[0],
+		 fh->fh_base.fh_pad[1],
+		 fh->fh_base.fh_pad[2],
+		 fh->fh_base.fh_pad[3],
+		 fh->fh_base.fh_pad[4],
+		 fh->fh_base.fh_pad[5]);
+	return buf;
+}
+
+/*
+ * Report which export property (device, fsid= value or uuid) should be
+ * treated as the source of the filesystem id for @fhp.
+ */
+enum fsid_source fsid_source(struct svc_fh *fhp)
+{
+	struct svc_export *exp;
+	int type;
+	bool dev_based;
+
+	if (fhp->fh_handle.fh_version != 1)
+		return FSIDSOURCE_DEV;
+
+	exp = fhp->fh_export;
+	type = fhp->fh_handle.fh_fsid_type;
+	dev_based = type == FSID_DEV || type == FSID_ENCODE_DEV ||
+		    type == FSID_MAJOR_MINOR;
+
+	if (dev_based &&
+	    (exp_sb(exp)->s_type->fs_flags & FS_REQUIRES_DEV))
+		return FSIDSOURCE_DEV;
+
+	/*
+	 * FSID_NUM handles, UUID handles and handles that do not match
+	 * the export all end up here.
+	 */
+	if (exp->ex_flags & NFSEXP_FSID)
+		return FSIDSOURCE_FSID;
+	return exp->ex_uuid ? FSIDSOURCE_UUID : FSIDSOURCE_DEV;
+}
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
new file mode 100644
index 000000000..56cfbc361
--- /dev/null
+++ b/fs/nfsd/nfsfh.h
@@ -0,0 +1,326 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
+ *
+ * This file describes the layout of the file handles as passed
+ * over the wire.
+ */
+#ifndef _LINUX_NFSD_NFSFH_H
+#define _LINUX_NFSD_NFSFH_H
+
+#include <linux/crc32.h>
+#include <linux/sunrpc/svc.h>
+#include <uapi/linux/nfsd/nfsfh.h>
+#include <linux/iversion.h>
+
+/* Convert an inode number to __u32 with a plain cast. */
+static inline __u32 ino_t_to_u32(ino_t ino)
+{
+	__u32 narrowed = (__u32) ino;
+
+	return narrowed;
+}
+
+/* Convert a __u32 inode number back to ino_t with a plain cast. */
+static inline ino_t u32_to_ino_t(__u32 uino)
+{
+	ino_t widened = (ino_t) uino;
+
+	return widened;
+}
+
+/*
+ * This is the internal representation of an NFS handle used in knfsd.
+ * pre_mtime/post_version will be used to support wcc_attr's in NFSv3.
+ */
+typedef struct svc_fh {
+ struct knfsd_fh fh_handle; /* FH data */
+ int fh_maxsize; /* max size for fh_handle */
+ struct dentry * fh_dentry; /* validated dentry */
+ struct svc_export * fh_export; /* export pointer */
+
+ bool fh_locked; /* inode locked by us */
+ bool fh_want_write; /* remount protection taken */
+ int fh_flags; /* FH flags */
+#ifdef CONFIG_NFSD_V3
+ bool fh_post_saved; /* post-op attrs saved */
+ bool fh_pre_saved; /* pre-op attrs saved */
+
+ /* Pre-op attributes saved during fh_lock */
+ __u64 fh_pre_size; /* size before operation */
+ struct timespec64 fh_pre_mtime; /* mtime before oper */
+ struct timespec64 fh_pre_ctime; /* ctime before oper */
+ /*
+ * pre-op nfsv4 change attr: note must check IS_I_VERSION(inode)
+ * to find out if it is valid.
+ */
+ u64 fh_pre_change;
+
+ /* Post-op attributes saved in fh_unlock */
+ struct kstat fh_post_attr; /* full attrs after operation */
+ u64 fh_post_change; /* nfsv4 change; see above */
+#endif /* CONFIG_NFSD_V3 */
+
+} svc_fh;
+/* Bit values stored in svc_fh.fh_flags. */
+#define NFSD4_FH_FOREIGN (1<<0)
+/* Set, respectively test, flag bit(s) @f in the fh_flags of handle @c. */
+#define SET_FH_FLAG(c, f) ((c)->fh_flags |= (f))
+#define HAS_FH_FLAG(c, f) ((c)->fh_flags & (f))
+
+/*
+ * Filesystem-id encodings used in filehandles.  The byte counts are the
+ * ones returned by key_len(); the contents are what mk_fsid() writes.
+ */
+enum nfsd_fsid {
+ FSID_DEV = 0, /* 8 bytes: major/minor packed in one word, inode number */
+ FSID_NUM, /* 4 bytes: the fsid value passed to mk_fsid() */
+ FSID_MAJOR_MINOR, /* 12 bytes: major, minor, inode number */
+ FSID_ENCODE_DEV, /* 8 bytes: new_encode_dev() device, inode number */
+ FSID_UUID4_INUM, /* 8 bytes: inode number, uuid XOR-folded to one word */
+ FSID_UUID8, /* 8 bytes: uuid XOR-folded to two words */
+ FSID_UUID16, /* 16 bytes: the uuid itself */
+ FSID_UUID16_INUM, /* 24 bytes: 8-byte inode number, then the uuid */
+};
+
+/* Result of fsid_source(): which export property supplies the fsid. */
+enum fsid_source {
+ FSIDSOURCE_DEV, /* the device */
+ FSIDSOURCE_FSID, /* export has NFSEXP_FSID set */
+ FSIDSOURCE_UUID, /* export has a uuid (ex_uuid) */
+};
+extern enum fsid_source fsid_source(struct svc_fh *fhp);
+
+
+/*
+ * This might look a little large to "inline" but in all calls except
+ * one, 'vers' is constant so most of the function disappears.
+ *
+ * In some cases the values are considered to be host endian and in
+ * others, net endian. fsidv is always considered to be u32 as the
+ * callers don't know which it will be. So we must use __force to keep
+ * sparse from complaining. Since these values are opaque to the
+ * client, that shouldn't be a problem.
+ */
+static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino,
+			   u32 fsid, unsigned char *uuid)
+{
+	const u32 ino32 = ino_t_to_u32(ino);
+	/* Word view of the uuid; only dereferenced by the UUID cases. */
+	u32 *uuid_words = (u32 *)uuid;
+
+	switch (vers) {
+	case FSID_DEV:
+		fsidv[0] = (__force __u32)htonl((MAJOR(dev)<<16) |
+						MINOR(dev));
+		fsidv[1] = ino32;
+		return;
+
+	case FSID_NUM:
+		fsidv[0] = fsid;
+		return;
+
+	case FSID_MAJOR_MINOR:
+		fsidv[0] = (__force __u32)htonl(MAJOR(dev));
+		fsidv[1] = (__force __u32)htonl(MINOR(dev));
+		fsidv[2] = ino32;
+		return;
+
+	case FSID_ENCODE_DEV:
+		fsidv[0] = new_encode_dev(dev);
+		fsidv[1] = ino32;
+		return;
+
+	case FSID_UUID4_INUM:
+		/* inode number, then the uuid folded down to 4 bytes */
+		fsidv[0] = ino32;
+		fsidv[1] = uuid_words[0] ^ uuid_words[1] ^
+			   uuid_words[2] ^ uuid_words[3];
+		return;
+
+	case FSID_UUID8:
+		/* the uuid folded down to 8 bytes */
+		fsidv[0] = uuid_words[0] ^ uuid_words[2];
+		fsidv[1] = uuid_words[1] ^ uuid_words[3];
+		return;
+
+	case FSID_UUID16:
+		/* 16 byte fsid - NFSv3+ only */
+		memcpy(fsidv, uuid, 16);
+		return;
+
+	case FSID_UUID16_INUM:
+		/* 8 byte inode and 16 byte fsid */
+		*(u64*)fsidv = (u64)ino;
+		memcpy(fsidv+2, uuid, 16);
+		return;
+
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Length in bytes of the fsid stored for fsid type @type, or 0 when the
+ * type is not recognised.
+ */
+static inline int key_len(int type)
+{
+	switch (type) {
+	case FSID_NUM:
+		return 4;
+	case FSID_DEV:
+	case FSID_ENCODE_DEV:
+	case FSID_UUID4_INUM:
+	case FSID_UUID8:
+		return 8;
+	case FSID_MAJOR_MINOR:
+		return 12;
+	case FSID_UUID16:
+		return 16;
+	case FSID_UUID16_INUM:
+		return 24;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Shorthand for dprintk()'s
+ */
+extern char * SVCFH_fmt(struct svc_fh *fhp);
+
+/*
+ * Function prototypes
+ */
+__be32 fh_verify(struct svc_rqst *, struct svc_fh *, umode_t, int);
+__be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *);
+__be32 fh_update(struct svc_fh *);
+void fh_put(struct svc_fh *);
+
+/*
+ * Copy the whole svc_fh structure from @src to @dst and return @dst.
+ * Warns if @src already has a dentry or is locked.
+ */
+static __inline__ struct svc_fh *
+fh_copy(struct svc_fh *dst, struct svc_fh *src)
+{
+ WARN_ON(src->fh_dentry || src->fh_locked);
+
+ *dst = *src;
+ return dst;
+}
+
+/*
+ * Copy only the raw handle: the size and the first fh_size bytes of
+ * fh_base.
+ */
+static inline void
+fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src)
+{
+	memcpy(&dst->fh_base, &src->fh_base, src->fh_size);
+	dst->fh_size = src->fh_size;
+}
+
+/*
+ * Zero the whole svc_fh, record @maxsize as its maximum handle size and
+ * return @fhp.
+ */
+static __inline__ struct svc_fh *
+fh_init(struct svc_fh *fhp, int maxsize)
+{
+ memset(fhp, 0, sizeof(*fhp));
+ fhp->fh_maxsize = maxsize;
+ return fhp;
+}
+
+/* Two handles match when they have equal size and identical bytes. */
+static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+{
+	return fh1->fh_size == fh2->fh_size &&
+	       memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad,
+		      fh1->fh_size) == 0;
+}
+
+/*
+ * Two handles have matching fsids when the fsid types are equal and the
+ * key_len() bytes of fsid data are identical.
+ */
+static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+{
+	return fh1->fh_fsid_type == fh2->fh_fsid_type &&
+	       memcmp(fh1->fh_fsid, fh2->fh_fsid,
+		      key_len(fh1->fh_fsid_type)) == 0;
+}
+
+#ifdef CONFIG_CRC32
+/**
+ * knfsd_fh_hash - calculate the crc32 hash for the filehandle
+ * @fh: pointer to filehandle
+ *
+ * The hash covers the fh_size bytes starting at fh_base.
+ *
+ * returns a crc32 hash for the filehandle that is compatible with
+ * the one displayed by "wireshark".
+ */
+
+static inline u32
+knfsd_fh_hash(struct knfsd_fh *fh)
+{
+ return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size);
+}
+#else
+/* Without CONFIG_CRC32 every filehandle hashes to 0. */
+static inline u32
+knfsd_fh_hash(struct knfsd_fh *fh)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_NFSD_V3
+/*
+ * The wcc data stored in current_fh should be cleared
+ * between compound ops.
+ */
+static inline void
+fh_clear_wcc(struct svc_fh *fhp)
+{
+	/* Mark both the pre-op and the post-op attributes as not saved. */
+	fhp->fh_pre_saved = false;
+	fhp->fh_post_saved = false;
+}
+
+/*
+ * We could use i_version alone as the change attribute. However,
+ * i_version can go backwards after a reboot. On its own that doesn't
+ * necessarily cause a problem, but if i_version goes backwards and then
+ * is incremented again it could reuse a value that was previously used
+ * before boot, and a client who queried the two values might
+ * incorrectly assume nothing changed.
+ *
+ * By using both ctime and the i_version counter we guarantee that as
+ * long as time doesn't go backwards we never reuse an old value.
+ */
+static inline u64 nfsd4_change_attribute(struct kstat *stat,
+					 struct inode *inode)
+{
+	/* ctime seconds shifted up by 30 bits, plus nanoseconds ... */
+	u64 chattr = stat->ctime.tv_sec;
+
+	chattr = (chattr << 30) + stat->ctime.tv_nsec;
+	/* ... plus the inode's i_version counter. */
+	return chattr + inode_query_iversion(inode);
+}
+
+extern void fill_pre_wcc(struct svc_fh *fhp);
+extern void fill_post_wcc(struct svc_fh *fhp);
+#else
+#define fh_clear_wcc(ignored)
+#define fill_pre_wcc(ignored)
+#define fill_post_wcc(notused)
+#endif /* CONFIG_NFSD_V3 */
+
+
+/*
+ * Lock a file handle/inode
+ * NOTE: both fh_lock and fh_unlock are done "by hand" in
+ * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once
+ * so, any changes here should be reflected there.
+ */
+
+static inline void
+fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
+{
+	struct dentry *dentry = fhp->fh_dentry;
+
+	BUG_ON(!dentry);
+
+	if (!fhp->fh_locked) {
+		/* Lock the inode, then save the pre-op attributes under it. */
+		inode_lock_nested(d_inode(dentry), subclass);
+		fill_pre_wcc(fhp);
+		fhp->fh_locked = true;
+		return;
+	}
+
+	/* Already locked by this handle: warn and leave the lock alone. */
+	printk(KERN_WARNING "fh_lock: %pd2 already locked!\n",
+		dentry);
+}
+
+/* Lock the handle's inode using the I_MUTEX_NORMAL lock subclass. */
+static inline void
+fh_lock(struct svc_fh *fhp)
+{
+ fh_lock_nested(fhp, I_MUTEX_NORMAL);
+}
+
+/*
+ * Unlock a file handle/inode
+ */
+static inline void
+fh_unlock(struct svc_fh *fhp)
+{
+	/* Nothing to do unless this handle took the lock. */
+	if (!fhp->fh_locked)
+		return;
+
+	/* Save the post-op attributes while the inode is still locked. */
+	fill_post_wcc(fhp);
+	inode_unlock(d_inode(fhp->fh_dentry));
+	fhp->fh_locked = false;
+}
+
+#endif /* _LINUX_NFSD_NFSFH_H */
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
new file mode 100644
index 000000000..bbd01e839
--- /dev/null
+++ b/fs/nfsd/nfsproc.c
@@ -0,0 +1,856 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Process version 2 NFS requests.
+ *
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/namei.h>
+
+#include "cache.h"
+#include "xdr.h"
+#include "vfs.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PROC
+
+/* NULL procedure: does no work and always reports rpc_success. */
+static __be32 nfsd_proc_null(struct svc_rqst *rqstp)
+{
+	return rpc_success;
+}
+
+/*
+ * Get a file's attributes.
+ * The NFS status is stored in resp->status; the RPC-level result is
+ * always rpc_success.
+ * N.B. After this call resp->fh needs an fh_put
+ */
+static __be32
+nfsd_proc_getattr(struct svc_rqst *rqstp)
+{
+	struct nfsd_fhandle *argp = rqstp->rq_argp;
+	struct nfsd_attrstat *resp = rqstp->rq_resp;
+	struct svc_fh *fhp = &resp->fh;
+
+	dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
+
+	fh_copy(fhp, &argp->fh);
+	resp->status = fh_verify(rqstp, fhp, 0,
+				 NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
+	if (resp->status == nfs_ok)
+		resp->status = fh_getattr(fhp, &resp->stat);
+
+	return rpc_success;
+}
+
+/*
+ * Set a file's attributes.
+ * The NFS status is stored in resp->status; the RPC-level result is
+ * always rpc_success.
+ * N.B. After this call resp->fh needs an fh_put
+ */
+static __be32
+nfsd_proc_setattr(struct svc_rqst *rqstp)
+{
+	struct nfsd_sattrargs *argp = rqstp->rq_argp;
+	struct nfsd_attrstat *resp = rqstp->rq_resp;
+	struct iattr *iap = &argp->attrs;
+	struct svc_fh *fhp;
+
+	dprintk("nfsd: SETATTR %s, valid=%x, size=%ld\n",
+		SVCFH_fmt(&argp->fh),
+		argp->attrs.ia_valid, (long) argp->attrs.ia_size);
+
+	fhp = fh_copy(&resp->fh, &argp->fh);
+
+	/*
+	 * NFSv2 cannot tell "set-[ac]time-to-now" (needs only access)
+	 * apart from "set-[ac]time-to-X" (needs ownership).
+	 * When the request looks like "set both to the same time which
+	 * is close to now" and setattr_prepare fails, it is converted
+	 * into "set to now" rather than "set to explicit time".
+	 *
+	 * setattr_prepare is only called as the last test because,
+	 * technically, it is not an interface that we should be using.
+	 */
+#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET)
+#define	MAX_TOUCH_TIME_ERROR (30*60)
+	if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET &&
+	    iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec) {
+		/*
+		 * Looks probable; now check the time is in the right
+		 * ballpark. Solaris, at least, doesn't seem to care what
+		 * the time request is. We require it be within 30 minutes
+		 * of now.
+		 */
+		time64_t delta = iap->ia_atime.tv_sec - ktime_get_real_seconds();
+
+		resp->status = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
+		if (resp->status != nfs_ok)
+			return rpc_success;
+
+		if (delta < 0)
+			delta = -delta;
+		if (delta < MAX_TOUCH_TIME_ERROR &&
+		    setattr_prepare(fhp->fh_dentry, iap) != 0) {
+			/*
+			 * Clear ATTR_[AM]TIME_SET but keep ATTR_[AM]TIME,
+			 * which makes notify_change set these times
+			 * to "now".
+			 */
+			iap->ia_valid &= ~BOTH_TIME_SET;
+		}
+	}
+
+	resp->status = nfsd_setattr(rqstp, fhp, iap, 0, (time64_t)0);
+	if (resp->status == nfs_ok)
+		resp->status = fh_getattr(&resp->fh, &resp->stat);
+
+	return rpc_success;
+}
+
+/*
+ * Obsolete, replaced by MNTPROC_MNT.
+ * Does no work and always reports rpc_success.
+ */
+static __be32 nfsd_proc_root(struct svc_rqst *rqstp)
+{
+	return rpc_success;
+}
+
+/*
+ * Look up a path name component.
+ * Note: the dentry in the resp->fh may be negative if the file
+ * doesn't exist yet.
+ * The argument handle is released here; the NFS status is stored in
+ * resp->status.
+ * N.B. After this call resp->fh needs an fh_put
+ */
+static __be32
+nfsd_proc_lookup(struct svc_rqst *rqstp)
+{
+	struct nfsd_diropargs *argp = rqstp->rq_argp;
+	struct nfsd_diropres *resp = rqstp->rq_resp;
+	struct svc_fh *dirfhp = &argp->fh;
+	struct svc_fh *resfhp = &resp->fh;
+
+	dprintk("nfsd: LOOKUP %s %.*s\n",
+		SVCFH_fmt(dirfhp), argp->len, argp->name);
+
+	fh_init(resfhp, NFS_FHSIZE);
+	resp->status = nfsd_lookup(rqstp, dirfhp, argp->name, argp->len,
+				   resfhp);
+	fh_put(dirfhp);
+
+	/* Attributes are only fetched for a successful lookup. */
+	if (resp->status == nfs_ok)
+		resp->status = fh_getattr(resfhp, &resp->stat);
+
+	return rpc_success;
+}
+
+/*
+ * Read a symlink.
+ * The link text goes into argp->buffer and its length into resp->len,
+ * which starts out as the NFS_MAXPATHLEN limit.
+ */
+static __be32
+nfsd_proc_readlink(struct svc_rqst *rqstp)
+{
+	struct nfsd_readlinkargs *argp = rqstp->rq_argp;
+	struct nfsd_readlinkres *resp = rqstp->rq_resp;
+	struct svc_fh *fhp = &argp->fh;
+
+	dprintk("nfsd: READLINK %s\n", SVCFH_fmt(fhp));
+
+	resp->len = NFS_MAXPATHLEN;
+	resp->status = nfsd_readlink(rqstp, fhp, argp->buffer, &resp->len);
+	fh_put(fhp);
+
+	return rpc_success;
+}
+
+/*
+ * Read a portion of a file.
+ * Returns rpc_drop_reply when the read reports nfserr_jukebox,
+ * otherwise rpc_success with the NFS status in resp->status.
+ * N.B. After this call resp->fh needs an fh_put
+ */
+static __be32
+nfsd_proc_read(struct svc_rqst *rqstp)
+{
+	struct nfsd_readargs *argp = rqstp->rq_argp;
+	struct nfsd_readres *resp = rqstp->rq_resp;
+	struct svc_fh *fhp;
+	u32 eof;
+
+	dprintk("nfsd: READ %s %d bytes at %d\n",
+		SVCFH_fmt(&argp->fh),
+		argp->count, argp->offset);
+
+	/* Clamp oversized requests to the NFSv2 maximum and log the sender. */
+	if (argp->count > NFSSVC_MAXBLKSIZE_V2) {
+		char buf[RPC_MAX_ADDRBUFLEN];
+
+		printk(KERN_NOTICE
+			"oversized read request from %s (%d bytes)\n",
+				svc_print_addr(rqstp, buf, sizeof(buf)),
+				argp->count);
+		argp->count = NFSSVC_MAXBLKSIZE_V2;
+	}
+
+	/*
+	 * Reserve reply space: 19 words (1 word for status, 17 words for
+	 * fattr, and 1 word for the byte count), the payload itself and
+	 * 4 further bytes.
+	 */
+	svc_reserve_auth(rqstp, (19<<2) + argp->count + 4);
+
+	resp->count = argp->count;
+	fhp = fh_copy(&resp->fh, &argp->fh);
+	resp->status = nfsd_read(rqstp, fhp, argp->offset,
+				 rqstp->rq_vec, argp->vlen,
+				 &resp->count, &eof);
+
+	if (resp->status == nfserr_jukebox)
+		return rpc_drop_reply;
+	if (resp->status == nfs_ok)
+		resp->status = fh_getattr(&resp->fh, &resp->stat);
+
+	return rpc_success;
+}
+
+/*
+ * Reserved.
+ * Does no work and always reports rpc_success.
+ */
+static __be32 nfsd_proc_writecache(struct svc_rqst *rqstp)
+{
+	return rpc_success;
+}
+
+/*
+ * Write data to a file.
+ * Returns rpc_drop_reply when the write reports nfserr_jukebox,
+ * otherwise rpc_success with the NFS status in resp->status.
+ * N.B. After this call resp->fh needs an fh_put
+ */
+static __be32
+nfsd_proc_write(struct svc_rqst *rqstp)
+{
+	struct nfsd_writeargs *argp = rqstp->rq_argp;
+	struct nfsd_attrstat *resp = rqstp->rq_resp;
+	unsigned long nbytes = argp->len;
+	unsigned int nvecs;
+	struct svc_fh *fhp;
+
+	dprintk("nfsd: WRITE %s %u bytes at %d\n",
+		SVCFH_fmt(&argp->fh),
+		argp->len, argp->offset);
+
+	nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages,
+				      &argp->first, nbytes);
+	if (nvecs == 0) {
+		/* No vectors could be set up for the payload. */
+		resp->status = nfserr_io;
+		return rpc_success;
+	}
+
+	fhp = fh_copy(&resp->fh, &argp->fh);
+	resp->status = nfsd_write(rqstp, fhp, argp->offset,
+				  rqstp->rq_vec, nvecs,
+				  &nbytes, NFS_DATA_SYNC, NULL);
+
+	if (resp->status == nfserr_jukebox)
+		return rpc_drop_reply;
+	if (resp->status == nfs_ok)
+		resp->status = fh_getattr(&resp->fh, &resp->stat);
+
+	return rpc_success;
+}
+
+/*
+ * CREATE processing is complicated. The keyword here is `overloaded.'
+ * The parent directory is kept locked between the check for existence
+ * and the actual create() call in compliance with VFS protocols.
+ * N.B. After this call _both_ argp->fh and resp->fh need an fh_put
+ *
+ * Outline of the steps below:
+ *  1. verify the parent handle, answer nfserr_exist for names that
+ *     isdotent() matches (presumably "." and ".." -- confirm), and
+ *     take write access on the parent;
+ *  2. lock the parent, look the name up and compose resp->fh from the
+ *     resulting dentry (a negative dentry is noted as nfserr_noent and
+ *     means the target has no inode yet);
+ *  3. derive the file type and mode from the supplied attributes and,
+ *     when the target already exists, from its inode;
+ *  4. create the object if it has no inode; if it exists and is a
+ *     regular file, apply only the size attribute (when supplied);
+ *  5. unlock, drop write access, fh_put the parent and, on success,
+ *     fetch the attributes of resp->fh.
+ *
+ * The NFS status is left in resp->status; the function's own return
+ * value is always rpc_success.
+ */
+static __be32
+nfsd_proc_create(struct svc_rqst *rqstp)
+{
+ struct nfsd_createargs *argp = rqstp->rq_argp;
+ struct nfsd_diropres *resp = rqstp->rq_resp;
+ svc_fh *dirfhp = &argp->fh;
+ svc_fh *newfhp = &resp->fh;
+ struct iattr *attr = &argp->attrs;
+ struct inode *inode;
+ struct dentry *dchild;
+ int type, mode;
+ int hosterr;
+ dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size); /* device number is decoded from the size field */
+
+ dprintk("nfsd: CREATE %s %.*s\n",
+ SVCFH_fmt(dirfhp), argp->len, argp->name);
+
+ /* First verify the parent file handle */
+ resp->status = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC);
+ if (resp->status != nfs_ok)
+ goto done; /* must fh_put dirfhp even on error */
+
+ /* Check for NFSD_MAY_WRITE in nfsd_create if necessary */
+
+ resp->status = nfserr_exist;
+ if (isdotent(argp->name, argp->len))
+ goto done;
+ hosterr = fh_want_write(dirfhp);
+ if (hosterr) {
+ resp->status = nfserrno(hosterr);
+ goto done;
+ }
+
+ fh_lock_nested(dirfhp, I_MUTEX_PARENT);
+ dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len);
+ if (IS_ERR(dchild)) {
+ resp->status = nfserrno(PTR_ERR(dchild));
+ goto out_unlock;
+ }
+ fh_init(newfhp, NFS_FHSIZE);
+ resp->status = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp);
+ if (!resp->status && d_really_is_negative(dchild))
+ resp->status = nfserr_noent;
+ dput(dchild);
+ if (resp->status) {
+ if (resp->status != nfserr_noent)
+ goto out_unlock;
+ /*
+ * If the new file handle wasn't verified, we can't tell
+ * whether the file exists or not. Time to bail ...
+ */
+ resp->status = nfserr_acces;
+ if (!newfhp->fh_dentry) {
+ printk(KERN_WARNING
+ "nfsd_proc_create: file handle not verified\n");
+ goto out_unlock;
+ }
+ }
+
+ inode = d_inode(newfhp->fh_dentry);
+
+ /* Unfudge the mode bits */
+ if (attr->ia_valid & ATTR_MODE) {
+ type = attr->ia_mode & S_IFMT;
+ mode = attr->ia_mode & ~S_IFMT;
+ if (!type) {
+ /* no type, so if target exists, assume same as that,
+ * else assume a file */
+ if (inode) {
+ type = inode->i_mode & S_IFMT;
+ switch(type) {
+ case S_IFCHR:
+ case S_IFBLK:
+ /* reserve rdev for later checking */
+ rdev = inode->i_rdev;
+ attr->ia_valid |= ATTR_SIZE;
+
+ fallthrough;
+ case S_IFIFO:
+ /* this is probably a permission check..
+ * at least IRIX implements perm checking on
+ * echo thing > device-special-file-or-pipe
+ * by doing a CREATE with type==0
+ */
+ resp->status = nfsd_permission(rqstp,
+ newfhp->fh_export,
+ newfhp->fh_dentry,
+ NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS);
+ if (resp->status && resp->status != nfserr_rofs)
+ goto out_unlock;
+ }
+ } else
+ type = S_IFREG;
+ }
+ } else if (inode) {
+ type = inode->i_mode & S_IFMT;
+ mode = inode->i_mode & ~S_IFMT;
+ } else {
+ type = S_IFREG;
+ mode = 0; /* ??? */
+ }
+
+ attr->ia_valid |= ATTR_MODE;
+ attr->ia_mode = mode;
+
+ /* Special treatment for non-regular files according to the
+ * gospel of sun micro
+ */
+ if (type != S_IFREG) {
+ if (type != S_IFBLK && type != S_IFCHR) {
+ rdev = 0;
+ } else if (type == S_IFCHR && !(attr->ia_valid & ATTR_SIZE)) {
+ /* If you think you've seen the worst, grok this. */
+ type = S_IFIFO;
+ } else {
+ /* Okay, char or block special */
+ if (!rdev)
+ rdev = wanted;
+ }
+
+ /* we've used the SIZE information, so discard it */
+ attr->ia_valid &= ~ATTR_SIZE;
+
+ /* Make sure the type and device matches */
+ resp->status = nfserr_exist;
+ if (inode && inode_wrong_type(inode, type))
+ goto out_unlock;
+ }
+
+ resp->status = nfs_ok;
+ if (!inode) {
+ /* File doesn't exist. Create it and set attrs */
+ resp->status = nfsd_create_locked(rqstp, dirfhp, argp->name,
+ argp->len, attr, type, rdev,
+ newfhp);
+ } else if (type == S_IFREG) {
+ dprintk("nfsd: existing %s, valid=%x, size=%ld\n",
+ argp->name, attr->ia_valid, (long) attr->ia_size);
+ /* File already exists. We ignore all attributes except
+ * size, so that creat() behaves exactly like
+ * open(..., O_CREAT|O_TRUNC|O_WRONLY).
+ */
+ attr->ia_valid &= ATTR_SIZE;
+ if (attr->ia_valid)
+ resp->status = nfsd_setattr(rqstp, newfhp, attr, 0,
+ (time64_t)0);
+ }
+
+out_unlock:
+ /* We don't really need to unlock, as fh_put does it. */
+ fh_unlock(dirfhp);
+ fh_drop_write(dirfhp);
+done:
+ fh_put(dirfhp);
+ if (resp->status != nfs_ok)
+ goto out;
+ resp->status = fh_getattr(&resp->fh, &resp->stat);
+out:
+ return rpc_success;
+}
+
+/*
+ * Remove a directory entry that is not a directory.
+ * The argument handle is released here; the NFS status is stored in
+ * resp->status.
+ */
+static __be32
+nfsd_proc_remove(struct svc_rqst *rqstp)
+{
+	struct nfsd_diropargs *argp = rqstp->rq_argp;
+	struct nfsd_stat *resp = rqstp->rq_resp;
+	struct svc_fh *dirfhp = &argp->fh;
+
+	dprintk("nfsd: REMOVE %s %.*s\n", SVCFH_fmt(dirfhp),
+		argp->len, argp->name);
+
+	/* Unlink. -S_IFDIR means file must not be a directory */
+	resp->status = nfsd_unlink(rqstp, dirfhp, -S_IFDIR,
+				   argp->name, argp->len);
+	fh_put(dirfhp);
+
+	return rpc_success;
+}
+
+/*
+ * Rename fname in directory ffh to tname in directory tfh.
+ * Both argument handles are released here; the NFS status is stored
+ * in resp->status.
+ */
+static __be32
+nfsd_proc_rename(struct svc_rqst *rqstp)
+{
+	struct nfsd_renameargs *argp = rqstp->rq_argp;
+	struct nfsd_stat *resp = rqstp->rq_resp;
+	struct svc_fh *from = &argp->ffh;
+	struct svc_fh *to = &argp->tfh;
+
+	dprintk("nfsd: RENAME %s %.*s -> \n",
+		SVCFH_fmt(from), argp->flen, argp->fname);
+	dprintk("nfsd:        ->  %s %.*s\n",
+		SVCFH_fmt(to), argp->tlen, argp->tname);
+
+	resp->status = nfsd_rename(rqstp, from, argp->fname, argp->flen,
+				   to, argp->tname, argp->tlen);
+	fh_put(from);
+	fh_put(to);
+
+	return rpc_success;
+}
+
+/*
+ * Create the link tname in directory tfh that refers to ffh.
+ * Both argument handles are released here; the NFS status is stored
+ * in resp->status.
+ */
+static __be32
+nfsd_proc_link(struct svc_rqst *rqstp)
+{
+	struct nfsd_linkargs *argp = rqstp->rq_argp;
+	struct nfsd_stat *resp = rqstp->rq_resp;
+	struct svc_fh *target = &argp->ffh;
+	struct svc_fh *dir = &argp->tfh;
+
+	dprintk("nfsd: LINK     %s ->\n",
+		SVCFH_fmt(target));
+	dprintk("nfsd:    %s %.*s\n",
+		SVCFH_fmt(dir),
+		argp->tlen,
+		argp->tname);
+
+	resp->status = nfsd_link(rqstp, dir, argp->tname, argp->tlen,
+				 target);
+	fh_put(target);
+	fh_put(dir);
+
+	return rpc_success;
+}
+
+/*
+ * Create the symlink fname in directory ffh pointing at tname.
+ * Targets longer than NFS_MAXPATHLEN are refused with
+ * nfserr_nametoolong. The NFS status is stored in resp->status.
+ */
+static __be32
+nfsd_proc_symlink(struct svc_rqst *rqstp)
+{
+	struct nfsd_symlinkargs *argp = rqstp->rq_argp;
+	struct nfsd_stat *resp = rqstp->rq_resp;
+	struct svc_fh newfh;
+
+	if (argp->tlen > NFS_MAXPATHLEN) {
+		resp->status = nfserr_nametoolong;
+		return rpc_success;
+	}
+
+	/* Obtain the target path; it is kfree()d again further down. */
+	argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first,
+						page_address(rqstp->rq_arg.pages[0]),
+						argp->tlen);
+	if (IS_ERR(argp->tname)) {
+		resp->status = nfserrno(PTR_ERR(argp->tname));
+		return rpc_success;
+	}
+
+	dprintk("nfsd: SYMLINK  %s %.*s -> %.*s\n",
+		SVCFH_fmt(&argp->ffh), argp->flen, argp->fname,
+		argp->tlen, argp->tname);
+
+	fh_init(&newfh, NFS_FHSIZE);
+	resp->status = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen,
+				    argp->tname, &newfh);
+
+	kfree(argp->tname);
+	fh_put(&argp->ffh);
+	fh_put(&newfh);
+
+	return rpc_success;
+}
+
+/*
+ * Make directory. This operation is not idempotent.
+ * Any size attribute in the request is ignored. The argument handle
+ * is released here; the NFS status is stored in resp->status.
+ * N.B. After this call resp->fh needs an fh_put
+ */
+static __be32
+nfsd_proc_mkdir(struct svc_rqst *rqstp)
+{
+	struct nfsd_createargs *argp = rqstp->rq_argp;
+	struct nfsd_diropres *resp = rqstp->rq_resp;
+	struct svc_fh *dirfhp = &argp->fh;
+	struct svc_fh *newfhp = &resp->fh;
+
+	dprintk("nfsd: MKDIR    %s %.*s\n", SVCFH_fmt(dirfhp), argp->len, argp->name);
+
+	if (newfhp->fh_dentry) {
+		printk(KERN_WARNING
+			"nfsd_proc_mkdir: response already verified??\n");
+	}
+
+	argp->attrs.ia_valid &= ~ATTR_SIZE;
+	fh_init(newfhp, NFS_FHSIZE);
+	resp->status = nfsd_create(rqstp, dirfhp, argp->name, argp->len,
+				   &argp->attrs, S_IFDIR, 0, newfhp);
+	fh_put(dirfhp);
+
+	/* Attributes are only fetched for a successful create. */
+	if (resp->status == nfs_ok)
+		resp->status = fh_getattr(newfhp, &resp->stat);
+
+	return rpc_success;
+}
+
+/*
+ * Remove a directory.
+ * The argument handle is released here; the NFS status is stored in
+ * resp->status.
+ */
+static __be32
+nfsd_proc_rmdir(struct svc_rqst *rqstp)
+{
+	struct nfsd_diropargs *argp = rqstp->rq_argp;
+	struct nfsd_stat *resp = rqstp->rq_resp;
+	struct svc_fh *dirfhp = &argp->fh;
+
+	dprintk("nfsd: RMDIR    %s %.*s\n", SVCFH_fmt(dirfhp), argp->len, argp->name);
+
+	resp->status = nfsd_unlink(rqstp, dirfhp, S_IFDIR,
+				   argp->name, argp->len);
+	fh_put(dirfhp);
+
+	return rpc_success;
+}
+
+/*
+ * Read a portion of a directory.
+ * Entries are encoded straight into argp->buffer by
+ * nfssvc_encode_entry; resp->count records how far the buffer
+ * pointer advanced. The argument handle is released here.
+ */
+static __be32
+nfsd_proc_readdir(struct svc_rqst *rqstp)
+{
+	struct nfsd_readdirargs *argp = rqstp->rq_argp;
+	struct nfsd_readdirres *resp = rqstp->rq_resp;
+	loff_t offset = argp->cookie;
+	int nwords;
+
+	dprintk("nfsd: READDIR  %s %d bytes at %d\n",
+		SVCFH_fmt(&argp->fh),
+		argp->count, argp->cookie);
+
+	/* Shrink to the client read size */
+	nwords = (argp->count >> 2) - 2;
+
+	/* Make sure we've room for the NULL ptr & eof flag */
+	nwords -= 2;
+	if (nwords < 0)
+		nwords = 0;
+
+	resp->buffer = argp->buffer;
+	resp->offset = NULL;
+	resp->buflen = nwords;
+	resp->common.err = nfs_ok;
+
+	/* Read directory and encode entries on the fly */
+	resp->status = nfsd_readdir(rqstp, &argp->fh, &offset,
+				    &resp->common, nfssvc_encode_entry);
+
+	resp->count = resp->buffer - argp->buffer;
+	/* If an offset slot was recorded, fill it with the final offset. */
+	if (resp->offset)
+		*resp->offset = htonl(offset);
+
+	fh_put(&argp->fh);
+
+	return rpc_success;
+}
+
+/*
+ * Get file system info.
+ * The argument handle is released here; the NFS status is stored in
+ * resp->status.
+ */
+static __be32
+nfsd_proc_statfs(struct svc_rqst *rqstp)
+{
+	struct nfsd_fhandle *argp = rqstp->rq_argp;
+	struct nfsd_statfsres *resp = rqstp->rq_resp;
+	struct svc_fh *fhp = &argp->fh;
+
+	dprintk("nfsd: STATFS   %s\n", SVCFH_fmt(fhp));
+
+	resp->status = nfsd_statfs(rqstp, fhp, &resp->stats,
+				   NFSD_MAY_BYPASS_GSS_ON_ROOT);
+	fh_put(fhp);
+
+	return rpc_success;
+}
+
+/*
+ * NFSv2 Server procedures.
+ * Only the results of non-idempotent operations are cached.
+ *
+ * Each entry is indexed by its NFSPROC_* procedure number and names the
+ * handler (pc_func), the argument decoder and result encoder, an
+ * optional pc_release hook, the sizes of the argument and result
+ * structures, the reply cache type (pc_cachetype) and a reply size
+ * figure (pc_xdrressize) built from the ST/FH/AT terms below.
+ * NOTE(review): ST/FH/AT are presumably counted in 32-bit XDR words --
+ * confirm against the svc_procedure definition.
+ * NOTE(review): the NFSPROC_READDIR entry sets no pc_xdrressize, so that
+ * field is left at zero there -- confirm this is intended.
+ */
+struct nfsd_void { int dummy; };
+
+#define ST 1 /* status */
+#define FH 8 /* filehandle */
+#define AT 18 /* attributes */
+
+static const struct svc_procedure nfsd_procedures2[18] = {
+ [NFSPROC_NULL] = {
+ .pc_func = nfsd_proc_null,
+ .pc_decode = nfssvc_decode_void,
+ .pc_encode = nfssvc_encode_void,
+ .pc_argsize = sizeof(struct nfsd_void),
+ .pc_ressize = sizeof(struct nfsd_void),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = 0,
+ },
+ [NFSPROC_GETATTR] = {
+ .pc_func = nfsd_proc_getattr,
+ .pc_decode = nfssvc_decode_fhandle,
+ .pc_encode = nfssvc_encode_attrstat,
+ .pc_release = nfssvc_release_attrstat,
+ .pc_argsize = sizeof(struct nfsd_fhandle),
+ .pc_ressize = sizeof(struct nfsd_attrstat),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+AT,
+ },
+ [NFSPROC_SETATTR] = {
+ .pc_func = nfsd_proc_setattr,
+ .pc_decode = nfssvc_decode_sattrargs,
+ .pc_encode = nfssvc_encode_attrstat,
+ .pc_release = nfssvc_release_attrstat,
+ .pc_argsize = sizeof(struct nfsd_sattrargs),
+ .pc_ressize = sizeof(struct nfsd_attrstat),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+AT,
+ },
+ [NFSPROC_ROOT] = {
+ .pc_func = nfsd_proc_root,
+ .pc_decode = nfssvc_decode_void,
+ .pc_encode = nfssvc_encode_void,
+ .pc_argsize = sizeof(struct nfsd_void),
+ .pc_ressize = sizeof(struct nfsd_void),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = 0,
+ },
+ [NFSPROC_LOOKUP] = {
+ .pc_func = nfsd_proc_lookup,
+ .pc_decode = nfssvc_decode_diropargs,
+ .pc_encode = nfssvc_encode_diropres,
+ .pc_release = nfssvc_release_diropres,
+ .pc_argsize = sizeof(struct nfsd_diropargs),
+ .pc_ressize = sizeof(struct nfsd_diropres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+FH+AT,
+ },
+ [NFSPROC_READLINK] = {
+ .pc_func = nfsd_proc_readlink,
+ .pc_decode = nfssvc_decode_readlinkargs,
+ .pc_encode = nfssvc_encode_readlinkres,
+ .pc_argsize = sizeof(struct nfsd_readlinkargs),
+ .pc_ressize = sizeof(struct nfsd_readlinkres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+1+NFS_MAXPATHLEN/4,
+ },
+ [NFSPROC_READ] = {
+ .pc_func = nfsd_proc_read,
+ .pc_decode = nfssvc_decode_readargs,
+ .pc_encode = nfssvc_encode_readres,
+ .pc_release = nfssvc_release_readres,
+ .pc_argsize = sizeof(struct nfsd_readargs),
+ .pc_ressize = sizeof(struct nfsd_readres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4,
+ },
+ [NFSPROC_WRITECACHE] = {
+ .pc_func = nfsd_proc_writecache,
+ .pc_decode = nfssvc_decode_void,
+ .pc_encode = nfssvc_encode_void,
+ .pc_argsize = sizeof(struct nfsd_void),
+ .pc_ressize = sizeof(struct nfsd_void),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = 0,
+ },
+ [NFSPROC_WRITE] = {
+ .pc_func = nfsd_proc_write,
+ .pc_decode = nfssvc_decode_writeargs,
+ .pc_encode = nfssvc_encode_attrstat,
+ .pc_release = nfssvc_release_attrstat,
+ .pc_argsize = sizeof(struct nfsd_writeargs),
+ .pc_ressize = sizeof(struct nfsd_attrstat),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+AT,
+ },
+ [NFSPROC_CREATE] = {
+ .pc_func = nfsd_proc_create,
+ .pc_decode = nfssvc_decode_createargs,
+ .pc_encode = nfssvc_encode_diropres,
+ .pc_release = nfssvc_release_diropres,
+ .pc_argsize = sizeof(struct nfsd_createargs),
+ .pc_ressize = sizeof(struct nfsd_diropres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+FH+AT,
+ },
+ [NFSPROC_REMOVE] = {
+ .pc_func = nfsd_proc_remove,
+ .pc_decode = nfssvc_decode_diropargs,
+ .pc_encode = nfssvc_encode_stat,
+ .pc_argsize = sizeof(struct nfsd_diropargs),
+ .pc_ressize = sizeof(struct nfsd_stat),
+ .pc_cachetype = RC_REPLSTAT,
+ .pc_xdrressize = ST,
+ },
+ [NFSPROC_RENAME] = {
+ .pc_func = nfsd_proc_rename,
+ .pc_decode = nfssvc_decode_renameargs,
+ .pc_encode = nfssvc_encode_stat,
+ .pc_argsize = sizeof(struct nfsd_renameargs),
+ .pc_ressize = sizeof(struct nfsd_stat),
+ .pc_cachetype = RC_REPLSTAT,
+ .pc_xdrressize = ST,
+ },
+ [NFSPROC_LINK] = {
+ .pc_func = nfsd_proc_link,
+ .pc_decode = nfssvc_decode_linkargs,
+ .pc_encode = nfssvc_encode_stat,
+ .pc_argsize = sizeof(struct nfsd_linkargs),
+ .pc_ressize = sizeof(struct nfsd_stat),
+ .pc_cachetype = RC_REPLSTAT,
+ .pc_xdrressize = ST,
+ },
+ [NFSPROC_SYMLINK] = {
+ .pc_func = nfsd_proc_symlink,
+ .pc_decode = nfssvc_decode_symlinkargs,
+ .pc_encode = nfssvc_encode_stat,
+ .pc_argsize = sizeof(struct nfsd_symlinkargs),
+ .pc_ressize = sizeof(struct nfsd_stat),
+ .pc_cachetype = RC_REPLSTAT,
+ .pc_xdrressize = ST,
+ },
+ [NFSPROC_MKDIR] = {
+ .pc_func = nfsd_proc_mkdir,
+ .pc_decode = nfssvc_decode_createargs,
+ .pc_encode = nfssvc_encode_diropres,
+ .pc_release = nfssvc_release_diropres,
+ .pc_argsize = sizeof(struct nfsd_createargs),
+ .pc_ressize = sizeof(struct nfsd_diropres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+FH+AT,
+ },
+ [NFSPROC_RMDIR] = {
+ .pc_func = nfsd_proc_rmdir,
+ .pc_decode = nfssvc_decode_diropargs,
+ .pc_encode = nfssvc_encode_stat,
+ .pc_argsize = sizeof(struct nfsd_diropargs),
+ .pc_ressize = sizeof(struct nfsd_stat),
+ .pc_cachetype = RC_REPLSTAT,
+ .pc_xdrressize = ST,
+ },
+ [NFSPROC_READDIR] = {
+ .pc_func = nfsd_proc_readdir,
+ .pc_decode = nfssvc_decode_readdirargs,
+ .pc_encode = nfssvc_encode_readdirres,
+ .pc_argsize = sizeof(struct nfsd_readdirargs),
+ .pc_ressize = sizeof(struct nfsd_readdirres),
+ .pc_cachetype = RC_NOCACHE,
+ },
+ [NFSPROC_STATFS] = {
+ .pc_func = nfsd_proc_statfs,
+ .pc_decode = nfssvc_decode_fhandle,
+ .pc_encode = nfssvc_encode_statfsres,
+ .pc_argsize = sizeof(struct nfsd_fhandle),
+ .pc_ressize = sizeof(struct nfsd_statfsres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+5,
+ },
+};
+
+
+/* One counter slot per nfsd_procedures2 entry, handed over as vs_count. */
+static unsigned int nfsd_count2[ARRAY_SIZE(nfsd_procedures2)];
+
+/*
+ * Version descriptor for NFSv2.
+ * vs_nproc is derived from the procedure table itself, the same way
+ * nfsd_count2 is sized, so the two can never disagree with the table.
+ */
+const struct svc_version nfsd_version2 = {
+	.vs_vers	= 2,
+	.vs_nproc	= ARRAY_SIZE(nfsd_procedures2),
+	.vs_proc	= nfsd_procedures2,
+	.vs_count	= nfsd_count2,
+	.vs_dispatch	= nfsd_dispatch,
+	.vs_xdrsize	= NFS2_SVC_XDRSIZE,
+};
+
+/*
+ * Map errnos to NFS errnos.
+ *
+ * @errno: zero or a negative kernel errno value (the table holds the
+ *         negated E* constants).
+ *
+ * Returns the NFS status paired with @errno in the table below. A value
+ * that is not listed triggers a one-time warning and maps to nfserr_io.
+ */
+__be32
+nfserrno (int errno)
+{
+	/* Read-only lookup table; the first matching row wins. */
+	static const struct {
+		__be32	nfserr;
+		int	syserr;
+	} nfs_errtbl[] = {
+		{ nfs_ok, 0 },
+		{ nfserr_perm, -EPERM },
+		{ nfserr_noent, -ENOENT },
+		{ nfserr_io, -EIO },
+		{ nfserr_nxio, -ENXIO },
+		{ nfserr_fbig, -E2BIG },
+		{ nfserr_acces, -EACCES },
+		{ nfserr_exist, -EEXIST },
+		{ nfserr_xdev, -EXDEV },
+		{ nfserr_mlink, -EMLINK },
+		{ nfserr_nodev, -ENODEV },
+		{ nfserr_notdir, -ENOTDIR },
+		{ nfserr_isdir, -EISDIR },
+		{ nfserr_inval, -EINVAL },
+		{ nfserr_fbig, -EFBIG },
+		{ nfserr_nospc, -ENOSPC },
+		{ nfserr_rofs, -EROFS },
+		{ nfserr_nametoolong, -ENAMETOOLONG },
+		{ nfserr_notempty, -ENOTEMPTY },
+#ifdef EDQUOT
+		{ nfserr_dquot, -EDQUOT },
+#endif
+		{ nfserr_stale, -ESTALE },
+		{ nfserr_jukebox, -ETIMEDOUT },
+		{ nfserr_jukebox, -ERESTARTSYS },
+		{ nfserr_jukebox, -EAGAIN },
+		{ nfserr_jukebox, -EWOULDBLOCK },
+		{ nfserr_jukebox, -ENOMEM },
+		{ nfserr_io, -ETXTBSY },
+		{ nfserr_notsupp, -EOPNOTSUPP },
+		{ nfserr_toosmall, -ETOOSMALL },
+		{ nfserr_serverfault, -ESERVERFAULT },
+		{ nfserr_serverfault, -ENFILE },
+		{ nfserr_io, -EUCLEAN },
+		{ nfserr_perm, -ENOKEY },
+	};
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) {
+		if (nfs_errtbl[i].syserr == errno)
+			return nfs_errtbl[i].nfserr;
+	}
+	WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno);
+	return nfserr_io;
+}
+
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
new file mode 100644
index 000000000..2e61a565c
--- /dev/null
+++ b/fs/nfsd/nfssvc.c
@@ -0,0 +1,1135 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Central processing for nfsd.
+ *
+ * Authors: Olaf Kirch (okir@monad.swb.de)
+ *
+ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/sched/signal.h>
+#include <linux/freezer.h>
+#include <linux/module.h>
+#include <linux/fs_struct.h>
+#include <linux/swap.h>
+
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/lockd/bind.h>
+#include <linux/nfsacl.h>
+#include <linux/seq_file.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
+#include <net/net_namespace.h>
+#include "nfsd.h"
+#include "cache.h"
+#include "vfs.h"
+#include "netns.h"
+#include "filecache.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_SVC
+
+bool inter_copy_offload_enable; /* no initialiser, so it starts out false */
+EXPORT_SYMBOL_GPL(inter_copy_offload_enable);
+module_param(inter_copy_offload_enable, bool, 0644); /* mode 0644 */
+MODULE_PARM_DESC(inter_copy_offload_enable,
+ "Enable inter server to server copy offload. Default: false");
+
+extern struct svc_program nfsd_program;
+static int nfsd(void *vrqstp);
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+static int nfsd_acl_rpcbind_set(struct net *,
+ const struct svc_program *,
+ u32, int,
+ unsigned short,
+ unsigned short);
+static __be32 nfsd_acl_init_request(struct svc_rqst *,
+ const struct svc_program *,
+ struct svc_process_info *);
+#endif
+static int nfsd_rpcbind_set(struct net *,
+ const struct svc_program *,
+ u32, int,
+ unsigned short,
+ unsigned short);
+static __be32 nfsd_init_request(struct svc_rqst *,
+ const struct svc_program *,
+ struct svc_process_info *);
+
+/*
+ * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members
+ * of the svc_serv struct. In particular, ->sv_nrthreads but also to some
+ * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt
+ *
+ * If (outside the lock) nn->nfsd_serv is non-NULL, then it must point to a
+ * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number
+ * of nfsd threads must exist and each must be listed in ->sp_all_threads in
+ * each entry of ->sv_pools[].
+ *
+ * Transitions of the thread count between zero and non-zero are of particular
+ * interest since the svc_serv needs to be created and initialized at that
+ * point, or freed.
+ *
+ * Finally, the nfsd_mutex also protects some of the global variables that are
+ * accessed when nfsd starts and that are settable via the write_* routines in
+ * nfsctl.c. In particular:
+ *
+ * user_recovery_dirname
+ * user_lease_time
+ * nfsd_versions
+ */
+DEFINE_MUTEX(nfsd_mutex);
+
+/*
+ * nfsd_drc_lock protects nfsd_drc_max_mem and nfsd_drc_mem_used.
+ * nfsd_drc_max_mem limits the total amount of memory available for
+ * version 4.1 DRC caches.
+ * nfsd_drc_mem_used tracks the current version 4.1 DRC memory usage.
+ */
+spinlock_t nfsd_drc_lock;
+unsigned long nfsd_drc_max_mem;
+unsigned long nfsd_drc_mem_used;
+
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+static struct svc_stat nfsd_acl_svcstats; /* tentative definition; initialised below */
+static const struct svc_version *nfsd_acl_version[] = { /* indexed by version; slots 0 and 1 stay NULL */
+ [2] = &nfsd_acl_version2,
+ [3] = &nfsd_acl_version3,
+};
+
+#define NFSD_ACL_MINVERS 2
+#define NFSD_ACL_NRVERS ARRAY_SIZE(nfsd_acl_version) /* number of slots, i.e. highest index + 1 */
+
+static struct svc_program nfsd_acl_program = {
+ .pg_prog = NFS_ACL_PROGRAM,
+ .pg_nvers = NFSD_ACL_NRVERS,
+ .pg_vers = nfsd_acl_version,
+ .pg_name = "nfsacl",
+ .pg_class = "nfsd", /* same class string as nfsd_program */
+ .pg_stats = &nfsd_acl_svcstats,
+ .pg_authenticate = &svc_set_client,
+ .pg_init_request = nfsd_acl_init_request,
+ .pg_rpcbind_set = nfsd_acl_rpcbind_set,
+};
+
+static struct svc_stat nfsd_acl_svcstats = {
+ .program = &nfsd_acl_program, /* points back at the owning program */
+};
+#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
+
+static const struct svc_version *nfsd_version[] = { /* indexed by NFS version; slots 0 and 1 stay NULL */
+ [2] = &nfsd_version2,
+#if defined(CONFIG_NFSD_V3)
+ [3] = &nfsd_version3,
+#endif
+#if defined(CONFIG_NFSD_V4)
+ [4] = &nfsd_version4,
+#endif
+};
+
+#define NFSD_MINVERS 2
+#define NFSD_NRVERS ARRAY_SIZE(nfsd_version) /* number of slots (highest index + 1), not a count of versions */
+
+struct svc_program nfsd_program = {
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+ .pg_next = &nfsd_acl_program, /* chain the ACL program after this one */
+#endif
+ .pg_prog = NFS_PROGRAM, /* program number */
+ .pg_nvers = NFSD_NRVERS, /* nr of entries in nfsd_version */
+ .pg_vers = nfsd_version, /* version table */
+ .pg_name = "nfsd", /* program name */
+ .pg_class = "nfsd", /* authentication class */
+ .pg_stats = &nfsd_svcstats, /* service statistics */
+ .pg_authenticate = &svc_set_client, /* export authentication */
+ .pg_init_request = nfsd_init_request,
+ .pg_rpcbind_set = nfsd_rpcbind_set,
+};
+
+/*
+ * Report whether NFS version @vers was compiled in, i.e. whether it
+ * lies within [NFSD_MINVERS, NFSD_NRVERS) and has a non-NULL entry in
+ * nfsd_version[].
+ */
+static bool
+nfsd_support_version(int vers)
+{
+	if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
+		return false;
+
+	return nfsd_version[vers] != NULL;
+}
+
+/*
+ * Allocate a per-version enable array of NFSD_NRVERS entries.
+ * Returns NULL if the allocation fails.
+ */
+static bool *
+nfsd_alloc_versions(void)
+{
+	bool *enabled = kmalloc_array(NFSD_NRVERS, sizeof(bool), GFP_KERNEL);
+	unsigned v;
+
+	if (!enabled)
+		return NULL;
+
+	/* All compiled versions are enabled by default */
+	for (v = 0; v < NFSD_NRVERS; v++)
+		enabled[v] = nfsd_support_version(v);
+
+	return enabled;
+}
+
+/*
+ * Allocate an enable array with one entry for every minor version from
+ * 0 to NFSD_SUPPORTED_MINOR_VERSION. Returns NULL if the allocation
+ * fails.
+ */
+static bool *
+nfsd_alloc_minorversions(void)
+{
+	bool *enabled = kmalloc_array(NFSD_SUPPORTED_MINOR_VERSION + 1,
+				      sizeof(bool), GFP_KERNEL);
+	bool have_v4;
+	unsigned minor;
+
+	if (!enabled)
+		return NULL;
+
+	/*
+	 * All minor versions are enabled by default, which here means:
+	 * every entry mirrors whether version 4 is compiled in at all.
+	 */
+	have_v4 = nfsd_support_version(4);
+	for (minor = 0; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++)
+		enabled[minor] = have_v4;
+
+	return enabled;
+}
+
+/*
+ * Free both version arrays of @nn and reset the pointers to NULL, so
+ * that they read as "not allocated" afterwards.
+ */
+void
+nfsd_netns_free_versions(struct nfsd_net *nn)
+{
+	kfree(nn->nfsd_versions);
+	nn->nfsd_versions = NULL;
+
+	kfree(nn->nfsd4_minorversions);
+	nn->nfsd4_minorversions = NULL;
+}
+
+/*
+ * Allocate the version arrays of @nn unless nfsd_versions is already
+ * set. If either allocation fails, both arrays are released again so
+ * that the two pointers are left NULL together.
+ */
+static void
+nfsd_netns_init_versions(struct nfsd_net *nn)
+{
+	if (nn->nfsd_versions)
+		return;
+
+	nn->nfsd_versions = nfsd_alloc_versions();
+	nn->nfsd4_minorversions = nfsd_alloc_minorversions();
+	if (nn->nfsd_versions && nn->nfsd4_minorversions)
+		return;
+
+	nfsd_netns_free_versions(nn);
+}
+
+/*
+ * Set, clear or query the enable state of NFS version @vers in @nn.
+ *
+ * Versions outside [NFSD_MINVERS, NFSD_NRVERS) always yield 0.
+ * NFSD_SET and NFSD_CLEAR return 0. NFSD_TEST returns the per-net flag
+ * when the array exists and otherwise falls back to the compiled-in
+ * state, which is also what NFSD_AVAIL reports.
+ */
+int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change)
+{
+	if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
+		return 0;
+
+	switch (change) {
+	case NFSD_SET:
+		/* Without an array nothing is stored. */
+		if (nn->nfsd_versions)
+			nn->nfsd_versions[vers] = nfsd_support_version(vers);
+		return 0;
+	case NFSD_CLEAR:
+		/* Clearing needs the array, so allocate it on demand. */
+		nfsd_netns_init_versions(nn);
+		if (nn->nfsd_versions)
+			nn->nfsd_versions[vers] = false;
+		return 0;
+	case NFSD_TEST:
+		return nn->nfsd_versions ? nn->nfsd_versions[vers]
+					 : nfsd_support_version(vers);
+	case NFSD_AVAIL:
+		return nfsd_support_version(vers);
+	}
+	return 0;
+}
+
+/*
+ * Clear version 4 in @nn once no minor version is enabled any more.
+ * Dereferences nn->nfsd4_minorversions without a NULL check.
+ */
+static void
+nfsd_adjust_nfsd_versions4(struct nfsd_net *nn)
+{
+	unsigned minor = 0;
+
+	while (minor <= NFSD_SUPPORTED_MINOR_VERSION) {
+		/* One enabled minor version is enough to keep v4. */
+		if (nn->nfsd4_minorversions[minor])
+			return;
+		minor++;
+	}
+
+	nfsd_vers(nn, 4, NFSD_CLEAR);
+}
+
+/*
+ * Set, clear or query the enable state of NFSv4 minor version
+ * @minorversion in @nn.
+ *
+ * Returns -1 for a minor version above NFSD_SUPPORTED_MINOR_VERSION
+ * unless the operation is NFSD_AVAIL. NFSD_SET and NFSD_CLEAR return 0;
+ * NFSD_TEST and NFSD_AVAIL return the queried state.
+ */
+int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change)
+{
+	if (change != NFSD_AVAIL &&
+	    minorversion > NFSD_SUPPORTED_MINOR_VERSION)
+		return -1;
+
+	switch (change) {
+	case NFSD_SET:
+		if (nn->nfsd4_minorversions) {
+			/* Setting a minor version also sets version 4. */
+			nfsd_vers(nn, 4, NFSD_SET);
+			nn->nfsd4_minorversions[minorversion] =
+				nfsd_vers(nn, 4, NFSD_TEST);
+		}
+		return 0;
+	case NFSD_CLEAR:
+		nfsd_netns_init_versions(nn);
+		if (nn->nfsd4_minorversions) {
+			nn->nfsd4_minorversions[minorversion] = false;
+			nfsd_adjust_nfsd_versions4(nn);
+		}
+		return 0;
+	case NFSD_TEST:
+		/* Without an array the answer is that of version 4. */
+		if (!nn->nfsd4_minorversions)
+			return nfsd_vers(nn, 4, NFSD_TEST);
+		return nn->nfsd4_minorversions[minorversion];
+	case NFSD_AVAIL:
+		return minorversion <= NFSD_SUPPORTED_MINOR_VERSION &&
+			nfsd_vers(nn, 4, NFSD_AVAIL);
+	}
+	return 0;
+}
+
+/*
+ * Maximum number of nfsd processes
+ */
+#define NFSD_MAXSERVS 8192
+
+/*
+ * Return the sv_nrthreads value of @net's nfsd service, or 0 when no
+ * service exists. The value is read under nfsd_mutex.
+ */
+int nfsd_nrthreads(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	int nthreads;
+
+	mutex_lock(&nfsd_mutex);
+	nthreads = nn->nfsd_serv ? nn->nfsd_serv->sv_nrthreads : 0;
+	mutex_unlock(&nfsd_mutex);
+
+	return nthreads;
+}
+
+/*
+ * Create the "udp" and then the "tcp" PF_INET transports on NFS_PORT
+ * for @net's nfsd service, unless sv_permsocks is already non-empty.
+ * Returns 0 on success or the first negative error encountered.
+ */
+static int nfsd_init_socks(struct net *net, const struct cred *cred)
+{
+	static const char * const xprt_names[] = { "udp", "tcp" };
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	unsigned int i;
+
+	if (!list_empty(&nn->nfsd_serv->sv_permsocks))
+		return 0;
+
+	for (i = 0; i < ARRAY_SIZE(xprt_names); i++) {
+		int error = svc_create_xprt(nn->nfsd_serv, xprt_names[i], net,
+					    PF_INET, NFS_PORT,
+					    SVC_SOCK_DEFAULTS, cred);
+
+		/* Only negative results are errors. */
+		if (error < 0)
+			return error;
+	}
+
+	return 0;
+}
+
+static int nfsd_users = 0;
+
+/*
+ * Take a reference on the state counted by nfsd_users. Only the first
+ * user initialises the file cache and starts the NFSv4 state; a failure
+ * undoes whatever was set up and gives the reference back.
+ * @nrservs is not used here.
+ */
+static int nfsd_startup_generic(int nrservs)
+{
+	int ret;
+
+	if (nfsd_users++ != 0)
+		return 0;
+
+	ret = nfsd_file_cache_init();
+	if (ret == 0) {
+		ret = nfs4_state_start();
+		if (ret == 0)
+			return 0;
+		nfsd_file_cache_shutdown();
+	}
+
+	nfsd_users--;
+	return ret;
+}
+
+/*
+ * Drop one nfsd_users reference; the last user shuts down the NFSv4
+ * state and then the file cache.
+ */
+static void nfsd_shutdown_generic(void)
+{
+	if (--nfsd_users == 0) {
+		nfs4_state_shutdown();
+		nfsd_file_cache_shutdown();
+	}
+}
+
+/* True when NFS version 2 or version 3 tests as enabled in @nn. */
+static bool nfsd_needs_lockd(struct nfsd_net *nn)
+{
+	if (nfsd_vers(nn, 2, NFSD_TEST))
+		return true;
+
+	return nfsd_vers(nn, 3, NFSD_TEST);
+}
+ /*
+ * Copy the boot timestamp nn->nfssvc_boot into the two-word verifier @verf
+ * (tv_sec in verf[0], tv_nsec in verf[1]). The copy is repeated for as long
+ * as need_seqretry() on nn->boot_lock asks for another pass.
+ */
+ void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn)
+ {
+ int seq = 0;
+
+ do {
+ read_seqbegin_or_lock(&nn->boot_lock, &seq);
+ /*
+ * This is opaque to client, so no need to byte-swap. Use
+ * __force to keep sparse happy. y2038 time_t overflow is
+ * irrelevant in this usage
+ */
+ verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
+ verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec;
+ } while (need_seqretry(&nn->boot_lock, seq));
+ done_seqretry(&nn->boot_lock, seq);
+ }
+ /*
+ * Stamp nn->nfssvc_boot with the current real time. No lock is taken here;
+ * the "_locked" suffix suggests the caller already holds nn->boot_lock as a
+ * writer, which is what nfsd_reset_boot_verifier() does.
+ */
+ static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn)
+ {
+ ktime_get_real_ts64(&nn->nfssvc_boot);
+ }
+ /*
+ * Refresh the boot verifier timestamp of @nn while holding nn->boot_lock
+ * as a seqlock writer.
+ */
+ void nfsd_reset_boot_verifier(struct nfsd_net *nn)
+ {
+ write_seqlock(&nn->boot_lock);
+ nfsd_reset_boot_verifier_locked(nn);
+ write_sequnlock(&nn->boot_lock);
+ }
+ /*
+ * Bring up nfsd for one network namespace, in this order: generic state,
+ * listening sockets, lockd (only when v2 or v3 is enabled and this
+ * namespace has not started it yet), the per-net file cache and the per-net
+ * NFSv4 state. Returns 0 on success, or the failing step's error after
+ * unwinding through the out_* labels. Does nothing and returns 0 when
+ * nn->nfsd_net_up is already set.
+ */
+ static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cred)
+ {
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ int ret;
+
+ if (nn->nfsd_net_up)
+ return 0;
+
+ ret = nfsd_startup_generic(nrservs);
+ if (ret)
+ return ret;
+ ret = nfsd_init_socks(net, cred);
+ if (ret)
+ goto out_socks;
+
+ if (nfsd_needs_lockd(nn) && !nn->lockd_up) {
+ ret = lockd_up(net, cred);
+ if (ret)
+ goto out_socks;
+ nn->lockd_up = true; /* this namespace now owes a lockd_down() */
+ }
+
+ ret = nfsd_file_cache_start_net(net);
+ if (ret)
+ goto out_lockd;
+ ret = nfs4_state_start_net(net);
+ if (ret)
+ goto out_filecache;
+
+ nn->nfsd_net_up = true;
+ return 0;
+
+ out_filecache:
+ nfsd_file_cache_shutdown_net(net);
+ out_lockd:
+ if (nn->lockd_up) {
+ lockd_down(net);
+ nn->lockd_up = false;
+ }
+ out_socks:
+ nfsd_shutdown_generic(); /* balances nfsd_startup_generic() above */
+ return ret;
+ }
+ /*
+ * Tear down what nfsd_startup_net() set up for @net: per-net NFSv4 state,
+ * per-net file cache, lockd (only if this namespace started it), the
+ * nfsd_net_up flag, and finally one reference on the generic state.
+ */
+ static void nfsd_shutdown_net(struct net *net)
+ {
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ nfs4_state_shutdown_net(net);
+ nfsd_file_cache_shutdown_net(net);
+ if (nn->lockd_up) {
+ lockd_down(net);
+ nn->lockd_up = false;
+ }
+ nn->nfsd_net_up = false;
+ nfsd_shutdown_generic();
+ }
+
+ /*
+  * IPv4 address notifier.  When an address goes down (NETDEV_DOWN) and the
+  * owning namespace still accepts notifications (ntf_refcnt is non-zero),
+  * ask the service to age out temporary transports bound to that address.
+  * Always returns NOTIFY_DONE.
+  */
+ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event,
+ 	void *ptr)
+ {
+ 	struct in_ifaddr *ifa = ptr;
+ 	struct nfsd_net *nn = net_generic(dev_net(ifa->ifa_dev->dev),
+ 					  nfsd_net_id);
+ 	struct sockaddr_in sin;
+
+ 	if (event != NETDEV_DOWN)
+ 		return NOTIFY_DONE;
+ 	/* pin the namespace state; fails once ntf_refcnt has dropped to 0 */
+ 	if (!atomic_inc_not_zero(&nn->ntf_refcnt))
+ 		return NOTIFY_DONE;
+
+ 	if (nn->nfsd_serv != NULL) {
+ 		dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local);
+ 		sin.sin_family = AF_INET;
+ 		sin.sin_addr.s_addr = ifa->ifa_local;
+ 		svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin);
+ 	}
+
+ 	atomic_dec(&nn->ntf_refcnt);
+ 	wake_up(&nn->ntf_wq);
+ 	return NOTIFY_DONE;
+ }
+ /* Notifier block that routes IPv4 address events to nfsd_inetaddr_event(). */
+ static struct notifier_block nfsd_inetaddr_notifier = {
+ .notifier_call = nfsd_inetaddr_event,
+ };
+
+#if IS_ENABLED(CONFIG_IPV6)
+ /*
+  * IPv6 address notifier; the counterpart of the IPv4 handler.  For
+  * link-local addresses the interface index is recorded as the scope id
+  * before temporary transports on the removed address are aged out.
+  * Always returns NOTIFY_DONE.
+  */
+ static int nfsd_inet6addr_event(struct notifier_block *this,
+ 	unsigned long event, void *ptr)
+ {
+ 	struct inet6_ifaddr *ifa = ptr;
+ 	struct nfsd_net *nn = net_generic(dev_net(ifa->idev->dev),
+ 					  nfsd_net_id);
+ 	struct sockaddr_in6 sin6;
+
+ 	if (event != NETDEV_DOWN)
+ 		return NOTIFY_DONE;
+ 	/* pin the namespace state; fails once ntf_refcnt has dropped to 0 */
+ 	if (!atomic_inc_not_zero(&nn->ntf_refcnt))
+ 		return NOTIFY_DONE;
+
+ 	if (nn->nfsd_serv != NULL) {
+ 		dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr);
+ 		sin6.sin6_family = AF_INET6;
+ 		sin6.sin6_addr = ifa->addr;
+ 		if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
+ 			sin6.sin6_scope_id = ifa->idev->dev->ifindex;
+ 		svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6);
+ 	}
+
+ 	atomic_dec(&nn->ntf_refcnt);
+ 	wake_up(&nn->ntf_wq);
+ 	return NOTIFY_DONE;
+ }
+ /* Notifier block that routes IPv6 address events to nfsd_inet6addr_event(). */
+ static struct notifier_block nfsd_inet6addr_notifier = {
+ .notifier_call = nfsd_inet6addr_event,
+ };
+#endif
+
+ /*
+ * Number of nfsd services relying on the address notifiers:
+ * nfsd_create_serv() registers them on the 0 -> 1 transition and
+ * nfsd_last_thread() unregisters them when the count returns to 0.
+ * Only used under nfsd_mutex, so this atomic may be overkill:
+ */
+ static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0);
+ /*
+ * svo_shutdown hook of nfsd_thread_sv_ops. Drops the namespace's notifier
+ * reference, unregisters the address notifiers when no service needs them
+ * any more, waits until no notifier callback still holds ntf_refcnt, cleans
+ * up the rpcbind registration and, if the namespace was fully started,
+ * shuts it down and flushes the export cache.
+ */
+ static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
+ {
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ atomic_dec(&nn->ntf_refcnt); /* drops the reference taken in nfsd_create_serv() */
+ /* check if the notifier still has clients */
+ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) {
+ unregister_inetaddr_notifier(&nfsd_inetaddr_notifier);
+ #if IS_ENABLED(CONFIG_IPV6)
+ unregister_inet6addr_notifier(&nfsd_inet6addr_notifier);
+ #endif
+ }
+ wait_event(nn->ntf_wq, atomic_read(&nn->ntf_refcnt) == 0); /* event handlers wake ntf_wq after dropping their reference */
+
+ /*
+ * write_ports can create the server without actually starting
+ * any threads--if we get shut down before any threads are
+ * started, then nfsd_last_thread will be run before any of this
+ * other initialization has been done except the rpcb information.
+ */
+ svc_rpcb_cleanup(serv, net);
+ if (!nn->nfsd_net_up)
+ return;
+
+ nfsd_shutdown_net(net);
+ pr_info("nfsd: last server has exited, flushing export cache\n");
+ nfsd_export_flush(net);
+ }
+
+ /*
+  * If no NFS version at all is enabled for @nn, enable every version; for
+  * version 4 this means enabling minor versions 0, 1, ... until
+  * nfsd_minorversion() refuses one.  An existing selection is left alone.
+  */
+ void nfsd_reset_versions(struct nfsd_net *nn)
+ {
+ 	int vers;
+ 	int minor;
+
+ 	for (vers = 0; vers < NFSD_NRVERS; vers++) {
+ 		if (nfsd_vers(nn, vers, NFSD_TEST))
+ 			return;
+ 	}
+
+ 	for (vers = 0; vers < NFSD_NRVERS; vers++) {
+ 		if (vers != 4) {
+ 			nfsd_vers(nn, vers, NFSD_SET);
+ 			continue;
+ 		}
+ 		for (minor = 0;
+ 		     nfsd_minorversion(nn, minor, NFSD_SET) >= 0;
+ 		     minor++)
+ 			;
+ 	}
+ }
+
+/*
+ * Each session guarantees a negotiated per slot memory cache for replies
+ * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated
+ * NFSv4.1 server might want to use more memory for a DRC than a machine
+ * with mutiple services.
+ *
+ * Impose a hard limit on the number of pages for the DRC which varies
+ * according to the machines free pages. This is of course only a default.
+ *
+ * For now this is a #defined shift which could be under admin control
+ * in the future.
+ */
+static void set_max_drc(void)
+{
+ #define NFSD_DRC_SIZE_SHIFT 7
+ nfsd_drc_max_mem = (nr_free_buffer_pages()
+ >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
+ nfsd_drc_mem_used = 0;
+ spin_lock_init(&nfsd_drc_lock);
+ dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem);
+}
+
+ /*
+  * Pick the default maximum block size from the amount of low memory.
+  *
+  * The target is 1/4096 of memory per thread: 1MB on a 4GB machine, 32K on
+  * a 128MB machine.  Starting from NFSSVC_MAXBLKSIZE the size is halved
+  * until it fits the target, but never below 8K.  This is only a default.
+  */
+ static int nfsd_get_default_max_blksize(void)
+ {
+ 	struct sysinfo si;
+ 	unsigned long long target;
+ 	unsigned long blksize;
+
+ 	si_meminfo(&si);
+ 	target = (si.totalram - si.totalhigh) << PAGE_SHIFT;
+ 	target >>= 12;
+
+ 	for (blksize = NFSSVC_MAXBLKSIZE;
+ 	     blksize > target && blksize >= 8*1024*2;
+ 	     blksize /= 2)
+ 		;
+
+ 	return blksize;
+ }
+ /* Operations table passed to svc_create_pooled() by nfsd_create_serv(). */
+ static const struct svc_serv_ops nfsd_thread_sv_ops = {
+ .svo_shutdown = nfsd_last_thread,
+ .svo_function = nfsd,
+ .svo_enqueue_xprt = svc_xprt_do_enqueue,
+ .svo_setup = svc_set_num_threads,
+ .svo_module = THIS_MODULE,
+ };
+ /*
+ * Forget the namespace's svc_serv pointer and signal
+ * nn->nfsd_shutdown_complete. Warns if nfsd_mutex is not held.
+ */
+ static void nfsd_complete_shutdown(struct net *net)
+ {
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ WARN_ON(!mutex_is_locked(&nfsd_mutex));
+
+ nn->nfsd_serv = NULL;
+ complete(&nn->nfsd_shutdown_complete); /* nfsd_shutdown_threads() waits on this */
+ }
+ /*
+ * Stop all nfsd threads of @net and wait for the service shutdown to
+ * complete. Returns immediately when the namespace has no nfsd_serv.
+ */
+ void nfsd_shutdown_threads(struct net *net)
+ {
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct svc_serv *serv;
+
+ mutex_lock(&nfsd_mutex);
+ serv = nn->nfsd_serv;
+ if (serv == NULL) {
+ mutex_unlock(&nfsd_mutex);
+ return;
+ }
+
+ svc_get(serv); /* extra reference, released by nfsd_destroy() below */
+ /* Kill outstanding nfsd threads */
+ serv->sv_ops->svo_setup(serv, NULL, 0);
+ nfsd_destroy(net);
+ mutex_unlock(&nfsd_mutex);
+ /* Wait for shutdown of nfsd_serv to complete */
+ wait_for_completion(&nn->nfsd_shutdown_complete);
+ }
+ /* Return true when the current task is a kthread whose thread function is nfsd(). */
+ bool i_am_nfsd(void)
+ {
+ return kthread_func(current) == nfsd;
+ }
+ /*
+ * Create the nfsd svc_serv for @net, or take another reference with
+ * svc_get() when it already exists. Expects nfsd_mutex to be held (warns
+ * otherwise). Returns 0 on success, -ENOMEM when svc_create_pooled() fails,
+ * or the negative error from svc_bind().
+ */
+ int nfsd_create_serv(struct net *net)
+ {
+ int error;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ WARN_ON(!mutex_is_locked(&nfsd_mutex));
+ if (nn->nfsd_serv) {
+ svc_get(nn->nfsd_serv);
+ return 0;
+ }
+ if (nfsd_max_blksize == 0) /* not configured yet: derive a default from memory size */
+ nfsd_max_blksize = nfsd_get_default_max_blksize();
+ nfsd_reset_versions(nn);
+ nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
+ &nfsd_thread_sv_ops);
+ if (nn->nfsd_serv == NULL)
+ return -ENOMEM;
+ init_completion(&nn->nfsd_shutdown_complete);
+
+ nn->nfsd_serv->sv_maxconn = nn->max_connections;
+ error = svc_bind(nn->nfsd_serv, net);
+ if (error < 0) {
+ svc_destroy(nn->nfsd_serv);
+ nfsd_complete_shutdown(net); /* clears nn->nfsd_serv again */
+ return error;
+ }
+
+ set_max_drc();
+ /* check if the notifier is already set */
+ if (atomic_inc_return(&nfsd_notifier_refcount) == 1) {
+ register_inetaddr_notifier(&nfsd_inetaddr_notifier);
+ #if IS_ENABLED(CONFIG_IPV6)
+ register_inet6addr_notifier(&nfsd_inet6addr_notifier);
+ #endif
+ }
+ atomic_inc(&nn->ntf_refcnt); /* dropped again in nfsd_last_thread() */
+ nfsd_reset_boot_verifier(nn);
+ return 0;
+ }
+
+ /* Number of thread pools of the namespace's nfsd service; 0 without one. */
+ int nfsd_nrpools(struct net *net)
+ {
+ 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ 	return nn->nfsd_serv != NULL ? nn->nfsd_serv->sv_nrpools : 0;
+ }
+
+ /*
+  * Copy the per-pool thread counts into @nthreads, for at most @n pools and
+  * never more than the service has.  Entries beyond that are left untouched,
+  * as is the whole array when there is no service.  Always returns 0.
+  */
+ int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)
+ {
+ 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ 	int pool;
+
+ 	if (nn->nfsd_serv == NULL)
+ 		return 0;
+
+ 	for (pool = 0; pool < nn->nfsd_serv->sv_nrpools && pool < n; pool++)
+ 		nthreads[pool] = nn->nfsd_serv->sv_pools[pool].sp_nrthreads;
+
+ 	return 0;
+ }
+ /*
+ * Release one reference on the namespace's svc_serv through svc_destroy().
+ * When sv_nrthreads is 1 on entry, the per-net service state is shut down
+ * first and nfsd_complete_shutdown() runs afterwards.
+ * NOTE(review): sv_nrthreads == 1 presumably means this is the last
+ * reference; confirm against svc_destroy().
+ * nn->nfsd_serv is dereferenced without a NULL check.
+ */
+ void nfsd_destroy(struct net *net)
+ {
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ int destroy = (nn->nfsd_serv->sv_nrthreads == 1);
+
+ if (destroy)
+ svc_shutdown_net(nn->nfsd_serv, net);
+ svc_destroy(nn->nfsd_serv);
+ if (destroy)
+ nfsd_complete_shutdown(net);
+ }
+
+ /*
+  * Set the per-pool thread counts of the namespace's nfsd service.
+  *
+  * @n:        number of entries in @nthreads; clamped to sv_nrpools
+  * @nthreads: requested thread count per pool, rewritten in place with the
+  *            counts that are actually applied
+  * @net:      network namespace whose service is adjusted
+  *
+  * Expects nfsd_mutex to be held (warns otherwise).  Returns 0 when there is
+  * no service or @n is not positive, otherwise the result of the last
+  * svo_setup() call (0 on success).
+  */
+ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
+ {
+ 	int i;
+ 	int tot = 0;
+ 	int err = 0;
+ 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ 	WARN_ON(!mutex_is_locked(&nfsd_mutex));
+
+ 	if (nn->nfsd_serv == NULL || n <= 0)
+ 		return 0;
+
+ 	if (n > nn->nfsd_serv->sv_nrpools)
+ 		n = nn->nfsd_serv->sv_nrpools;
+
+ 	/* enforce a global maximum number of threads */
+ 	for (i = 0; i < n; i++) {
+ 		nthreads[i] = min(nthreads[i], NFSD_MAXSERVS);
+ 		tot += nthreads[i];
+ 	}
+ 	if (tot > NFSD_MAXSERVS) {
+ 		/*
+ 		 * Total too large: scale each pool by NFSD_MAXSERVS / tot,
+ 		 * always against the total that was originally requested.
+ 		 * Every quotient is rounded down, so for non-negative
+ 		 * requests the new sum cannot exceed NFSD_MAXSERVS and no
+ 		 * count drops below zero.  The product fits in an int
+ 		 * because each count was clamped to NFSD_MAXSERVS above.
+ 		 */
+ 		for (i = 0; i < n; i++)
+ 			nthreads[i] = nthreads[i] * NFSD_MAXSERVS / tot;
+ 	}
+
+ 	/*
+ 	 * There must always be a thread in pool 0; the admin
+ 	 * can't shut down NFS completely using pool_threads.
+ 	 */
+ 	if (nthreads[0] == 0)
+ 		nthreads[0] = 1;
+
+ 	/* apply the new numbers */
+ 	svc_get(nn->nfsd_serv);
+ 	for (i = 0; i < n; i++) {
+ 		err = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv,
+ 				&nn->nfsd_serv->sv_pools[i], nthreads[i]);
+ 		if (err)
+ 			break;
+ 	}
+ 	/* drops the reference taken by svc_get() above */
+ 	nfsd_destroy(net);
+ 	return err;
+ }
+
+/*
+ * Adjust the number of threads and return the new number of threads.
+ * This is also the function that starts the server if necessary, if
+ * this is the first time nrservs is nonzero.
+ */
+int
+nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
+{
+ int error;
+ bool nfsd_up_before;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ mutex_lock(&nfsd_mutex);
+ dprintk("nfsd: creating service\n");
+
+ nrservs = max(nrservs, 0);
+ nrservs = min(nrservs, NFSD_MAXSERVS);
+ error = 0;
+
+ if (nrservs == 0 && nn->nfsd_serv == NULL)
+ goto out;
+
+ strlcpy(nn->nfsd_name, utsname()->nodename,
+ sizeof(nn->nfsd_name));
+
+ error = nfsd_create_serv(net);
+ if (error)
+ goto out;
+
+ nfsd_up_before = nn->nfsd_net_up;
+
+ error = nfsd_startup_net(nrservs, net, cred);
+ if (error)
+ goto out_destroy;
+ error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv,
+ NULL, nrservs);
+ if (error)
+ goto out_shutdown;
+ /* We are holding a reference to nn->nfsd_serv which
+ * we don't want to count in the return value,
+ * so subtract 1
+ */
+ error = nn->nfsd_serv->sv_nrthreads - 1;
+out_shutdown:
+ if (error < 0 && !nfsd_up_before)
+ nfsd_shutdown_net(net);
+out_destroy:
+ nfsd_destroy(net); /* Release server */
+out:
+ mutex_unlock(&nfsd_mutex);
+ return error;
+}
+
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+ /*
+  * True when @vers lies inside [NFSD_ACL_MINVERS, NFSD_ACL_NRVERS) and an
+  * entry for it exists in nfsd_acl_version[].
+  */
+ static bool
+ nfsd_support_acl_version(int vers)
+ {
+ 	if (vers < NFSD_ACL_MINVERS || vers >= NFSD_ACL_NRVERS)
+ 		return false;
+
+ 	return nfsd_acl_version[vers] != NULL;
+ }
+
+ /*
+  * rpcbind registration hook for the NFSACL program: versions that have no
+  * ACL implementation or are disabled for this namespace are skipped
+  * (returning 0); everything else goes to svc_generic_rpcbind_set().
+  */
+ static int
+ nfsd_acl_rpcbind_set(struct net *net, const struct svc_program *progp,
+ 		     u32 version, int family, unsigned short proto,
+ 		     unsigned short port)
+ {
+ 	if (!nfsd_support_acl_version(version))
+ 		return 0;
+ 	if (!nfsd_vers(net_generic(net, nfsd_net_id), version, NFSD_TEST))
+ 		return 0;
+
+ 	return svc_generic_rpcbind_set(net, progp, version, family,
+ 				       proto, port);
+ }
+
+ /*
+  * Per-request version check for the NFSACL program.
+  *
+  * A request whose version has an ACL implementation and is enabled for the
+  * namespace is handed to svc_generic_init_request().  Otherwise
+  * @ret->mismatch is filled with the lowest and highest usable versions and
+  * rpc_prog_mismatch is returned, or rpc_prog_unavail when no version is
+  * usable at all.
+  */
+ static __be32
+ nfsd_acl_init_request(struct svc_rqst *rqstp,
+ 		      const struct svc_program *progp,
+ 		      struct svc_process_info *ret)
+ {
+ 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ 	int i;
+
+ 	if (likely(nfsd_support_acl_version(rqstp->rq_vers) &&
+ 	    nfsd_vers(nn, rqstp->rq_vers, NFSD_TEST)))
+ 		return svc_generic_init_request(rqstp, progp, ret);
+
+ 	/*
+ 	 * Probe each candidate version i rather than the requested one: the
+ 	 * requested version has already failed the test above, and when it
+ 	 * failed for lack of an ACL implementation a check on rq_vers could
+ 	 * never match, hiding the versions that are in fact available.
+ 	 */
+ 	ret->mismatch.lovers = NFSD_ACL_NRVERS;
+ 	for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) {
+ 		if (nfsd_support_acl_version(i) &&
+ 		    nfsd_vers(nn, i, NFSD_TEST)) {
+ 			ret->mismatch.lovers = i;
+ 			break;
+ 		}
+ 	}
+ 	if (ret->mismatch.lovers == NFSD_ACL_NRVERS)
+ 		return rpc_prog_unavail;
+ 	ret->mismatch.hivers = NFSD_ACL_MINVERS;
+ 	for (i = NFSD_ACL_NRVERS - 1; i >= NFSD_ACL_MINVERS; i--) {
+ 		if (nfsd_support_acl_version(i) &&
+ 		    nfsd_vers(nn, i, NFSD_TEST)) {
+ 			ret->mismatch.hivers = i;
+ 			break;
+ 		}
+ 	}
+ 	return rpc_prog_mismatch;
+ }
+#endif
+
+ /*
+  * rpcbind registration hook for the NFS program: versions disabled for the
+  * namespace are skipped (returning 0), the rest are registered through
+  * svc_generic_rpcbind_set().
+  */
+ static int
+ nfsd_rpcbind_set(struct net *net, const struct svc_program *progp,
+ 		 u32 version, int family, unsigned short proto,
+ 		 unsigned short port)
+ {
+ 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ 	if (nfsd_vers(nn, version, NFSD_TEST))
+ 		return svc_generic_rpcbind_set(net, progp, version, family,
+ 					       proto, port);
+ 	return 0;
+ }
+
+ /*
+  * Per-request version check for the NFS program.  Enabled versions go to
+  * svc_generic_init_request(); for anything else @ret->mismatch receives the
+  * lowest and highest enabled versions and rpc_prog_mismatch is returned, or
+  * rpc_prog_unavail when no version is enabled.
+  */
+ static __be32
+ nfsd_init_request(struct svc_rqst *rqstp,
+ 		  const struct svc_program *progp,
+ 		  struct svc_process_info *ret)
+ {
+ 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ 	int vers;
+
+ 	if (likely(nfsd_vers(nn, rqstp->rq_vers, NFSD_TEST)))
+ 		return svc_generic_init_request(rqstp, progp, ret);
+
+ 	/* lowest enabled version; ends at NFSD_NRVERS when there is none */
+ 	for (vers = NFSD_MINVERS; vers < NFSD_NRVERS; vers++) {
+ 		if (nfsd_vers(nn, vers, NFSD_TEST))
+ 			break;
+ 	}
+ 	ret->mismatch.lovers = vers;
+ 	if (vers == NFSD_NRVERS)
+ 		return rpc_prog_unavail;
+
+ 	/* highest enabled version, falling back to NFSD_MINVERS */
+ 	for (vers = NFSD_NRVERS - 1; vers >= NFSD_MINVERS; vers--) {
+ 		if (nfsd_vers(nn, vers, NFSD_TEST))
+ 			break;
+ 	}
+ 	if (vers < NFSD_MINVERS)
+ 		vers = NFSD_MINVERS;
+ 	ret->mismatch.hivers = vers;
+
+ 	return rpc_prog_mismatch;
+ }
+
+/*
+ * This is the NFS server kernel thread
+ */
+static int
+nfsd(void *vrqstp)
+{
+ struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
+ struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list);
+ struct net *net = perm_sock->xpt_net;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ int err;
+
+ /* Lock module and set up kernel thread */
+ mutex_lock(&nfsd_mutex);
+
+ /* At this point, the thread shares current->fs
+ * with the init process. We need to create files with the
+ * umask as defined by the client instead of init's umask. */
+ if (unshare_fs_struct() < 0) {
+ printk("Unable to start nfsd thread: out of memory\n");
+ goto out;
+ }
+
+ current->fs->umask = 0;
+
+ /*
+ * thread is spawned with all signals set to SIG_IGN, re-enable
+ * the ones that will bring down the thread
+ */
+ allow_signal(SIGKILL);
+ allow_signal(SIGHUP);
+ allow_signal(SIGINT);
+ allow_signal(SIGQUIT);
+
+ nfsdstats.th_cnt++;
+ mutex_unlock(&nfsd_mutex);
+
+ set_freezable();
+
+ /*
+ * The main request loop
+ */
+ for (;;) {
+ /* Update sv_maxconn if it has changed */
+ rqstp->rq_server->sv_maxconn = nn->max_connections;
+
+ /*
+ * Find a socket with data available and call its
+ * recvfrom routine.
+ */
+ while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN)
+ ;
+ if (err == -EINTR)
+ break;
+ validate_process_creds();
+ svc_process(rqstp);
+ validate_process_creds();
+ }
+
+ /* Clear signals before calling svc_exit_thread() */
+ flush_signals(current);
+
+ mutex_lock(&nfsd_mutex);
+ nfsdstats.th_cnt --;
+
+out:
+ rqstp->rq_server = NULL;
+
+ /* Release the thread */
+ svc_exit_thread(rqstp);
+
+ nfsd_destroy(net);
+
+ /* Release module */
+ mutex_unlock(&nfsd_mutex);
+ module_put_and_exit(0);
+ return 0;
+}
+
+/*
+ * A write procedure can have a large argument, and a read procedure can
+ * have a large reply, but no NFSv2 or NFSv3 procedure has argument and
+ * reply that can both be larger than a page. The xdr code has taken
+ * advantage of this assumption to be a sloppy about bounds checking in
+ * some cases. Pending a rewrite of the NFSv2/v3 xdr code to fix that
+ * problem, we enforce these assumptions here:
+ */
+static bool nfs_request_too_big(struct svc_rqst *rqstp,
+ const struct svc_procedure *proc)
+{
+ /*
+ * The ACL code has more careful bounds-checking and is not
+ * susceptible to this problem:
+ */
+ if (rqstp->rq_prog != NFS_PROGRAM)
+ return false;
+ /*
+ * Ditto NFSv4 (which can in theory have argument and reply both
+ * more than a page):
+ */
+ if (rqstp->rq_vers >= 4)
+ return false;
+ /* The reply will be small, we're OK: */
+ if (proc->pc_xdrressize > 0 &&
+ proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE))
+ return false;
+
+ return rqstp->rq_arg.len > PAGE_SIZE;
+}
+
+/**
+ * nfsd_dispatch - Process an NFS or NFSACL Request
+ * @rqstp: incoming request
+ * @statp: pointer to location of accept_stat field in RPC Reply buffer
+ *
+ * This RPC dispatcher integrates the NFS server's duplicate reply cache.
+ *
+ * Return values:
+ * %0: Processing complete; do not send a Reply
+ * %1: Processing complete; send Reply in rqstp->rq_res
+ */
+int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
+{
+ const struct svc_procedure *proc = rqstp->rq_procinfo;
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ struct kvec *resv = &rqstp->rq_res.head[0];
+ __be32 *p;
+
+ dprintk("nfsd_dispatch: vers %d proc %d\n",
+ rqstp->rq_vers, rqstp->rq_proc);
+
+ if (nfs_request_too_big(rqstp, proc))
+ goto out_too_large;
+
+ /*
+ * Give the xdr decoder a chance to change this if it wants
+ * (necessary in the NFSv4.0 compound case)
+ */
+ rqstp->rq_cachetype = proc->pc_cachetype;
+ if (!proc->pc_decode(rqstp, argv->iov_base))
+ goto out_decode_err;
+
+ switch (nfsd_cache_lookup(rqstp)) {
+ case RC_DOIT:
+ break;
+ case RC_REPLY:
+ goto out_cached_reply;
+ case RC_DROPIT:
+ goto out_dropit;
+ }
+
+ /*
+ * Need to grab the location to store the status, as
+ * NFSv4 does some encoding while processing
+ */
+ p = resv->iov_base + resv->iov_len;
+ resv->iov_len += sizeof(__be32);
+
+ *statp = proc->pc_func(rqstp);
+ if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags))
+ goto out_update_drop;
+
+ if (!proc->pc_encode(rqstp, p))
+ goto out_encode_err;
+
+ nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1);
+out_cached_reply:
+ return 1;
+
+out_too_large:
+ dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers);
+ *statp = rpc_garbage_args;
+ return 1;
+
+out_decode_err:
+ dprintk("nfsd: failed to decode arguments!\n");
+ *statp = rpc_garbage_args;
+ return 1;
+
+out_update_drop:
+ dprintk("nfsd: Dropping request; may be revisited later\n");
+ nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
+out_dropit:
+ return 0;
+
+out_encode_err:
+ dprintk("nfsd: failed to encode result!\n");
+ nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
+ *statp = rpc_system_err;
+ return 1;
+}
+
+ /*
+  * open() handler for the pool statistics file.  Returns -ENODEV when the
+  * namespace has no nfsd service; otherwise takes a reference on the
+  * service and returns the result of svc_pool_stats_open().
+  */
+ int nfsd_pool_stats_open(struct inode *inode, struct file *file)
+ {
+ 	struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id);
+ 	int ret = -ENODEV;
+
+ 	mutex_lock(&nfsd_mutex);
+ 	if (nn->nfsd_serv != NULL) {
+ 		/* bump up the pseudo refcount while traversing */
+ 		svc_get(nn->nfsd_serv);
+ 		ret = svc_pool_stats_open(nn->nfsd_serv, file);
+ 	}
+ 	mutex_unlock(&nfsd_mutex);
+
+ 	return ret;
+ }
+ /*
+ * release() handler for the pool statistics file: closes the seq_file and
+ * drops the service reference through nfsd_destroy() under nfsd_mutex.
+ * Returns the result of seq_release().
+ */
+ int nfsd_pool_stats_release(struct inode *inode, struct file *file)
+ {
+ int ret = seq_release(inode, file);
+ struct net *net = inode->i_sb->s_fs_info;
+
+ mutex_lock(&nfsd_mutex);
+ /* this function really, really should have been called svc_put() */
+ nfsd_destroy(net);
+ mutex_unlock(&nfsd_mutex);
+ return ret;
+ }
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
new file mode 100644
index 000000000..8a288c8fc
--- /dev/null
+++ b/fs/nfsd/nfsxdr.c
@@ -0,0 +1,616 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * XDR support for nfsd
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include "vfs.h"
+#include "xdr.h"
+#include "auth.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_XDR
+
+/*
+ * Mapping of S_IF* types to NFS file types
+ */
+static u32 nfs_ftypes[] = {
+ NFNON, NFCHR, NFCHR, NFBAD,
+ NFDIR, NFBAD, NFBLK, NFBAD,
+ NFREG, NFBAD, NFLNK, NFBAD,
+ NFSOCK, NFBAD, NFLNK, NFBAD,
+};
+
+
+/*
+ * XDR functions for basic NFS types
+ */
+ /*
+  * Decode a fixed-size NFSv2 file handle at @p into @fhp and return the
+  * position just past it.
+  */
+ static __be32 *
+ decode_fh(__be32 *p, struct svc_fh *fhp)
+ {
+ 	fh_init(fhp, NFS_FHSIZE);
+ 	fhp->fh_handle.fh_size = NFS_FHSIZE;
+ 	memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE);
+
+ 	/* FIXME: Look up export pointer here and verify
+ 	 * Sun Secure RPC if requested */
+ 	return p + XDR_QUADLEN(NFS_FHSIZE);
+ }
+
+ /* Helper function for NFSv2 ACL code: non-static wrapper around decode_fh() */
+ __be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp)
+ {
+ return decode_fh(p, fhp);
+ }
+
+ /*
+  * Copy the NFS_FHSIZE bytes of @fhp's file handle to @p and return the
+  * position just past them.
+  */
+ static __be32 *
+ encode_fh(__be32 *p, struct svc_fh *fhp)
+ {
+ 	memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE);
+
+ 	return p + XDR_QUADLEN(NFS_FHSIZE);
+ }
+
+/*
+ * Decode a file name and make sure that the path contains
+ * no slashes or null bytes.
+ */
+static __be32 *
+decode_filename(__be32 *p, char **namp, unsigned int *lenp)
+{
+ char *name;
+ unsigned int i;
+
+ if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) {
+ for (i = 0, name = *namp; i < *lenp; i++, name++) {
+ if (*name == '\0' || *name == '/')
+ return NULL;
+ }
+ }
+
+ return p;
+}
+ /*
+ * Decode an sattr (mode, uid, gid, size, atime, mtime: eight words) from @p
+ * into @iap. A field holding all ones is treated as "not supplied" and its
+ * ATTR_* bit stays clear in ia_valid. Returns the position after the
+ * eight words.
+ */
+ static __be32 *
+ decode_sattr(__be32 *p, struct iattr *iap, struct user_namespace *userns)
+ {
+ u32 tmp, tmp1;
+
+ iap->ia_valid = 0;
+
+ /* Sun client bug compatibility check: some sun clients seem to
+ * put 0xffff in the mode field when they mean 0xffffffff.
+ * Quoting the 4.4BSD nfs server code: Nah nah nah nah na nah.
+ */
+ if ((tmp = ntohl(*p++)) != (u32)-1 && tmp != 0xffff) {
+ iap->ia_valid |= ATTR_MODE;
+ iap->ia_mode = tmp;
+ }
+ if ((tmp = ntohl(*p++)) != (u32)-1) {
+ iap->ia_uid = make_kuid(userns, tmp);
+ if (uid_valid(iap->ia_uid)) /* ids that do not map into @userns are ignored */
+ iap->ia_valid |= ATTR_UID;
+ }
+ if ((tmp = ntohl(*p++)) != (u32)-1) {
+ iap->ia_gid = make_kgid(userns, tmp);
+ if (gid_valid(iap->ia_gid))
+ iap->ia_valid |= ATTR_GID;
+ }
+ if ((tmp = ntohl(*p++)) != (u32)-1) {
+ iap->ia_valid |= ATTR_SIZE;
+ iap->ia_size = tmp;
+ }
+ tmp = ntohl(*p++); tmp1 = ntohl(*p++);
+ if (tmp != (u32)-1 && tmp1 != (u32)-1) {
+ iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET;
+ iap->ia_atime.tv_sec = tmp;
+ iap->ia_atime.tv_nsec = tmp1 * 1000; /* wire value is in microseconds */
+ }
+ tmp = ntohl(*p++); tmp1 = ntohl(*p++);
+ if (tmp != (u32)-1 && tmp1 != (u32)-1) {
+ iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET;
+ iap->ia_mtime.tv_sec = tmp;
+ iap->ia_mtime.tv_nsec = tmp1 * 1000;
+ /*
+ * Passing the invalid value useconds=1000000 for mtime
+ * is a Sun convention for "set both mtime and atime to
+ * current server time". It's needed to make permissions
+ * checks for the "touch" program across v2 mounts to
+ * Solaris and Irix boxes work correctly. See description of
+ * sattr in section 6.1 of "NFS Illustrated" by
+ * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5
+ */
+ if (tmp1 == 1000000)
+ iap->ia_valid &= ~(ATTR_ATIME_SET|ATTR_MTIME_SET);
+ }
+ return p;
+ }
+ /*
+ * Encode the file attributes in @stat for the file behind @fhp at @p and
+ * return the position after them. Seventeen words are written: type, mode,
+ * nlink, uid, gid, size, blksize, rdev, blocks, fsid, fileid and three
+ * (seconds, microseconds) time pairs. Wider values are truncated to 32 bits
+ * by the casts.
+ */
+ static __be32 *
+ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
+ struct kstat *stat)
+ {
+ struct user_namespace *userns = nfsd_user_namespace(rqstp);
+ struct dentry *dentry = fhp->fh_dentry;
+ int type;
+ struct timespec64 time;
+ u32 f;
+
+ type = (stat->mode & S_IFMT);
+
+ *p++ = htonl(nfs_ftypes[type >> 12]);
+ *p++ = htonl((u32) stat->mode);
+ *p++ = htonl((u32) stat->nlink);
+ *p++ = htonl((u32) from_kuid_munged(userns, stat->uid));
+ *p++ = htonl((u32) from_kgid_munged(userns, stat->gid));
+
+ if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) { /* symlink size is capped at NFS_MAXPATHLEN */
+ *p++ = htonl(NFS_MAXPATHLEN);
+ } else {
+ *p++ = htonl((u32) stat->size);
+ }
+ *p++ = htonl((u32) stat->blksize);
+ if (S_ISCHR(type) || S_ISBLK(type))
+ *p++ = htonl(new_encode_dev(stat->rdev));
+ else
+ *p++ = htonl(0xffffffff); /* rdev is all ones for anything but char/block devices */
+ *p++ = htonl((u32) stat->blocks);
+ switch (fsid_source(fhp)) {
+ default:
+ case FSIDSOURCE_DEV:
+ *p++ = htonl(new_encode_dev(stat->dev));
+ break;
+ case FSIDSOURCE_FSID:
+ *p++ = htonl((u32) fhp->fh_export->ex_fsid);
+ break;
+ case FSIDSOURCE_UUID: /* fold four 32-bit words of the uuid into one by XOR */
+ f = ((u32*)fhp->fh_export->ex_uuid)[0];
+ f ^= ((u32*)fhp->fh_export->ex_uuid)[1];
+ f ^= ((u32*)fhp->fh_export->ex_uuid)[2];
+ f ^= ((u32*)fhp->fh_export->ex_uuid)[3];
+ *p++ = htonl(f);
+ break;
+ }
+ *p++ = htonl((u32) stat->ino);
+ *p++ = htonl((u32) stat->atime.tv_sec);
+ *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0); /* nanoseconds to microseconds */
+ time = stat->mtime;
+ lease_get_mtime(d_inode(dentry), &time); /* may replace the mtime that gets reported */
+ *p++ = htonl((u32) time.tv_sec);
+ *p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0);
+ *p++ = htonl((u32) stat->ctime.tv_sec);
+ *p++ = htonl(stat->ctime.tv_nsec ? stat->ctime.tv_nsec / 1000 : 0);
+
+ return p;
+ }
+
+ /* Helper function for NFSv2 ACL code: non-static wrapper around encode_fattr() */
+ __be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat)
+ {
+ return encode_fattr(rqstp, p, fhp, stat);
+ }
+
+/*
+ * XDR decode functions
+ */
+ int
+ nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p)
+ {
+ return xdr_argsize_check(rqstp, p); /* no arguments to decode; only the size check runs */
+ }
+
+ /*
+  * Decode arguments consisting of a single file handle.  Returns 0 when
+  * decode_fh() yields NULL, otherwise the result of xdr_argsize_check().
+  */
+ int
+ nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p)
+ {
+ 	struct nfsd_fhandle *args = rqstp->rq_argp;
+
+ 	p = decode_fh(p, &args->fh);
+
+ 	return p ? xdr_argsize_check(rqstp, p) : 0;
+ }
+
+ /*
+  * Decode a file handle followed by an sattr.  Returns 0 when decode_fh()
+  * yields NULL, otherwise the result of xdr_argsize_check().
+  */
+ int
+ nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p)
+ {
+ 	struct nfsd_sattrargs *args = rqstp->rq_argp;
+ 	struct user_namespace *userns;
+
+ 	p = decode_fh(p, &args->fh);
+ 	if (p == NULL)
+ 		return 0;
+
+ 	userns = nfsd_user_namespace(rqstp);
+ 	p = decode_sattr(p, &args->attrs, userns);
+
+ 	return xdr_argsize_check(rqstp, p);
+ }
+
+ /*
+  * Decode a directory file handle followed by a file name.  Returns 0 when
+  * either part fails to decode, otherwise the result of
+  * xdr_argsize_check().
+  */
+ int
+ nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p)
+ {
+ 	struct nfsd_diropargs *args = rqstp->rq_argp;
+
+ 	p = decode_fh(p, &args->fh);
+ 	if (p == NULL)
+ 		return 0;
+
+ 	p = decode_filename(p, &args->name, &args->len);
+ 	if (p == NULL)
+ 		return 0;
+
+ 	return xdr_argsize_check(rqstp, p);
+ }
+
+ /*
+  * Decode READ arguments (file handle, offset, count, unused totalcount) and
+  * line up response pages in rq_vec for min(count, NFSSVC_MAXBLKSIZE_V2)
+  * bytes.  Returns 0 when the file handle fails to decode, otherwise the
+  * result of xdr_argsize_check().
+  */
+ int
+ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p)
+ {
+ 	struct nfsd_readargs *args = rqstp->rq_argp;
+ 	unsigned int remaining;
+ 	int nvec;
+
+ 	p = decode_fh(p, &args->fh);
+ 	if (p == NULL)
+ 		return 0;
+
+ 	args->offset = ntohl(*p++);
+ 	args->count = ntohl(*p++);
+ 	p++; /* totalcount - unused */
+
+ 	remaining = min_t(unsigned int, args->count, NFSSVC_MAXBLKSIZE_V2);
+
+ 	/* set up somewhere to store response.
+ 	 * We take pages, put them on reslist and include in iovec
+ 	 */
+ 	for (nvec = 0; remaining > 0; nvec++) {
+ 		struct page *page = *(rqstp->rq_next_page++);
+
+ 		rqstp->rq_vec[nvec].iov_base = page_address(page);
+ 		rqstp->rq_vec[nvec].iov_len = min_t(unsigned int, remaining,
+ 						    PAGE_SIZE);
+ 		remaining -= rqstp->rq_vec[nvec].iov_len;
+ 	}
+ 	args->vlen = nvec;
+
+ 	return xdr_argsize_check(rqstp, p);
+ }
+ /*
+ * Decode WRITE arguments: file handle, beginoffset (skipped), offset,
+ * totalcount (skipped) and the data length. args->first is set to the part
+ * of the head buffer that follows the decoded header. Returns 1 on
+ * success and 0 when the arguments are rejected.
+ */
+ int
+ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
+ {
+ struct nfsd_writeargs *args = rqstp->rq_argp;
+ unsigned int len, hdr, dlen;
+ struct kvec *head = rqstp->rq_arg.head;
+
+ p = decode_fh(p, &args->fh);
+ if (!p)
+ return 0;
+
+ p++; /* beginoffset */
+ args->offset = ntohl(*p++); /* offset */
+ p++; /* totalcount */
+ len = args->len = ntohl(*p++);
+ /*
+ * The protocol specifies a maximum of 8192 bytes.
+ */
+ if (len > NFSSVC_MAXBLKSIZE_V2)
+ return 0;
+
+ /*
+ * Check to make sure that we got the right number of
+ * bytes.
+ */
+ hdr = (void*)p - head->iov_base; /* bytes of the head consumed by the header */
+ if (hdr > head->iov_len)
+ return 0;
+ dlen = head->iov_len + rqstp->rq_arg.page_len - hdr; /* bytes available after the header */
+
+ /*
+ * Round the length of the data which was specified up to
+ * the next multiple of XDR units and then compare that
+ * against the length which was actually received.
+ * Note that when RPCSEC/GSS (for example) is used, the
+ * data buffer can be padded so dlen might be larger
+ * than required. It must never be smaller.
+ */
+ if (dlen < XDR_QUADLEN(len)*4)
+ return 0;
+
+ args->first.iov_base = (void *)p;
+ args->first.iov_len = head->iov_len - hdr;
+ return 1;
+ }
+
+ /*
+  * Decode CREATE-style arguments: directory file handle, file name and
+  * sattr.  Returns 0 when the handle or the name fails to decode, otherwise
+  * the result of xdr_argsize_check().
+  */
+ int
+ nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p)
+ {
+ 	struct nfsd_createargs *args = rqstp->rq_argp;
+
+ 	p = decode_fh(p, &args->fh);
+ 	if (p == NULL)
+ 		return 0;
+
+ 	p = decode_filename(p, &args->name, &args->len);
+ 	if (p == NULL)
+ 		return 0;
+
+ 	p = decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp));
+
+ 	return xdr_argsize_check(rqstp, p);
+ }
+
+ /*
+  * Decode RENAME arguments: source handle and name, then target handle and
+  * name.  Returns 0 as soon as one part fails to decode, otherwise the
+  * result of xdr_argsize_check().
+  */
+ int
+ nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p)
+ {
+ 	struct nfsd_renameargs *args = rqstp->rq_argp;
+
+ 	p = decode_fh(p, &args->ffh);
+ 	if (p == NULL)
+ 		return 0;
+ 	p = decode_filename(p, &args->fname, &args->flen);
+ 	if (p == NULL)
+ 		return 0;
+ 	p = decode_fh(p, &args->tfh);
+ 	if (p == NULL)
+ 		return 0;
+ 	p = decode_filename(p, &args->tname, &args->tlen);
+ 	if (p == NULL)
+ 		return 0;
+
+ 	return xdr_argsize_check(rqstp, p);
+ }
+
+ /*
+  * Decode READLINK arguments (a file handle) and claim the next response
+  * page as the buffer for the link text.  Returns 0 when the file handle
+  * fails to decode, otherwise the result of xdr_argsize_check().
+  */
+ int
+ nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p)
+ {
+ 	struct nfsd_readlinkargs *args = rqstp->rq_argp;
+ 	struct page *page;
+
+ 	p = decode_fh(p, &args->fh);
+ 	if (p == NULL)
+ 		return 0;
+
+ 	page = *(rqstp->rq_next_page++);
+ 	args->buffer = page_address(page);
+
+ 	return xdr_argsize_check(rqstp, p);
+ }
+
+ /*
+  * Decode LINK arguments: the existing file's handle, the target
+  * directory's handle and the new name.  Returns 0 as soon as one part
+  * fails to decode, otherwise the result of xdr_argsize_check().
+  */
+ int
+ nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p)
+ {
+ 	struct nfsd_linkargs *args = rqstp->rq_argp;
+
+ 	p = decode_fh(p, &args->ffh);
+ 	if (p == NULL)
+ 		return 0;
+ 	p = decode_fh(p, &args->tfh);
+ 	if (p == NULL)
+ 		return 0;
+ 	p = decode_filename(p, &args->tname, &args->tlen);
+ 	if (p == NULL)
+ 		return 0;
+
+ 	return xdr_argsize_check(rqstp, p);
+ }
+
+ /*
+  * Decode SYMLINK arguments: directory file handle, new name, target path
+  * and sattr.
+  *
+  * args->first describes the part of the head buffer that starts at the
+  * target path.  The path is either delivered in the page list, with the
+  * sattr in the tail, or everything sits in the head buffer.
+  *
+  * Returns 1 on success and 0 when the arguments are rejected.
+  */
+ int
+ nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p)
+ {
+ 	struct nfsd_symlinkargs *args = rqstp->rq_argp;
+ 	char *base = (char *)p;
+ 	size_t xdrlen;
+ 	const size_t sattr_len = 8 * sizeof(__be32);
+
+ 	if ( !(p = decode_fh(p, &args->ffh))
+ 	 || !(p = decode_filename(p, &args->fname, &args->flen)))
+ 		return 0;
+
+ 	args->tlen = ntohl(*p++);
+ 	if (args->tlen == 0)
+ 		return 0;
+
+ 	args->first.iov_base = p;
+ 	args->first.iov_len = rqstp->rq_arg.head[0].iov_len;
+ 	args->first.iov_len -= (char *)p - base;
+
+ 	/* This request is never larger than a page. Therefore,
+ 	 * transport will deliver either:
+ 	 * 1. pathname in the pagelist -> sattr is in the tail.
+ 	 * 2. everything in the head buffer -> sattr is in the head.
+ 	 */
+ 	if (rqstp->rq_arg.page_len) {
+ 		if (args->tlen != rqstp->rq_arg.page_len)
+ 			return 0;
+ 		p = rqstp->rq_arg.tail[0].iov_base;
+ 	} else {
+ 		xdrlen = XDR_QUADLEN(args->tlen);
+ 		/*
+ 		 * The rest of the head must hold the padded pathname plus
+ 		 * the eight-word sattr.  Check the lower bound first so the
+ 		 * subtraction cannot wrap, and compare in 4-byte words
+ 		 * because xdrlen counts words, not bytes.
+ 		 */
+ 		if (args->first.iov_len < sattr_len)
+ 			return 0;
+ 		if (xdrlen > (args->first.iov_len - sattr_len) / sizeof(__be32))
+ 			return 0;
+ 		p += xdrlen;
+ 	}
+ 	decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp));
+
+ 	return 1;
+ }
+
+ /*
+  * Decode READDIR arguments: file handle, cookie and count.  The count is
+  * capped at PAGE_SIZE and the next response page becomes the entry buffer.
+  * Returns 0 when the file handle fails to decode, otherwise the result of
+  * xdr_argsize_check().
+  */
+ int
+ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
+ {
+ 	struct nfsd_readdirargs *args = rqstp->rq_argp;
+ 	struct page *page;
+
+ 	p = decode_fh(p, &args->fh);
+ 	if (p == NULL)
+ 		return 0;
+
+ 	args->cookie = ntohl(*p++);
+ 	args->count = ntohl(*p++);
+ 	args->count = min_t(u32, args->count, PAGE_SIZE);
+
+ 	page = *(rqstp->rq_next_page++);
+ 	args->buffer = page_address(page);
+
+ 	return xdr_argsize_check(rqstp, p);
+ }
+
+/*
+ * XDR encode functions
+ */
+ int
+ nfssvc_encode_void(struct svc_rqst *rqstp, __be32 *p)
+ {
+ return xdr_ressize_check(rqstp, p); /* nothing is encoded; only the size check runs */
+ }
+ /* Encode a result that consists of the status word only. */
+ int
+ nfssvc_encode_stat(struct svc_rqst *rqstp, __be32 *p)
+ {
+ struct nfsd_stat *resp = rqstp->rq_resp;
+
+ *p++ = resp->status;
+ return xdr_ressize_check(rqstp, p);
+ }
+
+ /*
+  * Encode the status word, followed by the file attributes when the status
+  * is nfs_ok.  Returns the result of xdr_ressize_check().
+  */
+ int
+ nfssvc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p)
+ {
+ 	struct nfsd_attrstat *resp = rqstp->rq_resp;
+
+ 	*p++ = resp->status;
+ 	if (resp->status == nfs_ok)
+ 		p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+
+ 	return xdr_ressize_check(rqstp, p);
+ }
+
+ /*
+  * Encode the status word, followed by the file handle and the file
+  * attributes when the status is nfs_ok.  Returns the result of
+  * xdr_ressize_check().
+  */
+ int
+ nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p)
+ {
+ 	struct nfsd_diropres *resp = rqstp->rq_resp;
+
+ 	*p++ = resp->status;
+ 	if (resp->status == nfs_ok) {
+ 		p = encode_fh(p, &resp->fh);
+ 		p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+ 	}
+
+ 	return xdr_ressize_check(rqstp, p);
+ }
+ /*
+ * Encode a READLINK result. On error only the status word is sent.
+ * Otherwise the head carries status and length, rq_res.page_len is set to
+ * the link length, and a zeroed pad of 1-3 bytes is placed in the tail
+ * when the length is not a multiple of four.
+ */
+ int
+ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p)
+ {
+ struct nfsd_readlinkres *resp = rqstp->rq_resp;
+
+ *p++ = resp->status;
+ if (resp->status != nfs_ok)
+ return xdr_ressize_check(rqstp, p);
+
+ *p++ = htonl(resp->len);
+ xdr_ressize_check(rqstp, p); /* return value is not used on this path */
+ rqstp->rq_res.page_len = resp->len;
+ if (resp->len & 3) {
+ /* need to pad the tail */
+ rqstp->rq_res.tail[0].iov_base = p;
+ *p = 0;
+ rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3);
+ }
+ return 1;
+ }
+ /*
+ * Encode a READ result. On error only the status word is sent. Otherwise
+ * the head carries status, file attributes and byte count,
+ * rq_res.page_len is set to the count, and a zeroed pad of 1-3 bytes is
+ * placed in the tail when the count is not a multiple of four.
+ */
+ int
+ nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p)
+ {
+ struct nfsd_readres *resp = rqstp->rq_resp;
+
+ *p++ = resp->status;
+ if (resp->status != nfs_ok)
+ return xdr_ressize_check(rqstp, p);
+
+ p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+ *p++ = htonl(resp->count);
+ xdr_ressize_check(rqstp, p); /* return value is not used on this path */
+
+ /* now update rqstp->rq_res to reflect data as well */
+ rqstp->rq_res.page_len = resp->count;
+ if (resp->count & 3) {
+ /* need to pad the tail */
+ rqstp->rq_res.tail[0].iov_base = p;
+ *p = 0;
+ rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3);
+ }
+ return 1;
+ }
+ /*
+ * Encode a READDIR result. On error only the status word is sent.
+ * Otherwise the entry buffer at resp->buffer is finished with a "no more
+ * entries" word and the EOF flag, and rq_res.page_len is derived from the
+ * offset of the end of that buffer within its page.
+ */
+ int
+ nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p)
+ {
+ struct nfsd_readdirres *resp = rqstp->rq_resp;
+
+ *p++ = resp->status;
+ if (resp->status != nfs_ok)
+ return xdr_ressize_check(rqstp, p);
+
+ xdr_ressize_check(rqstp, p); /* return value is not used on this path */
+ p = resp->buffer; /* continue writing after the last encoded entry */
+ *p++ = 0; /* no more entries */
+ *p++ = htonl((resp->common.err == nfserr_eof));
+ rqstp->rq_res.page_len = (((unsigned long)p-1) & ~PAGE_MASK)+1;
+
+ return 1;
+ }
+
+ /*
+  * Encode a STATFS result: the status word and, when the status is nfs_ok,
+  * the maximum transfer size, block size, total blocks, free blocks and
+  * available blocks.  Returns the result of xdr_ressize_check().
+  */
+ int
+ nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p)
+ {
+ 	struct nfsd_statfsres *resp = rqstp->rq_resp;
+ 	struct kstatfs *stat = &resp->stats;
+
+ 	*p++ = resp->status;
+ 	if (resp->status == nfs_ok) {
+ 		*p++ = htonl(NFSSVC_MAXBLKSIZE_V2);	/* max transfer size */
+ 		*p++ = htonl(stat->f_bsize);
+ 		*p++ = htonl(stat->f_blocks);
+ 		*p++ = htonl(stat->f_bfree);
+ 		*p++ = htonl(stat->f_bavail);
+ 	}
+
+ 	return xdr_ressize_check(rqstp, p);
+ }
+ /*
+ * Append one directory entry to the reply buffer of the nfsd_readdirres
+ * that embeds @ccdv. The previous entry's "next offset" slot is patched
+ * with @offset first. Returns 0 on success; on failure returns -EINVAL with
+ * cd->common.err set (nfserr_fbig when @offset or @ino does not fit in 32
+ * bits, nfserr_toosmall when the buffer is exhausted).
+ */
+ int
+ nfssvc_encode_entry(void *ccdv, const char *name,
+ int namlen, loff_t offset, u64 ino, unsigned int d_type)
+ {
+ struct readdir_cd *ccd = ccdv;
+ struct nfsd_readdirres *cd = container_of(ccd, struct nfsd_readdirres, common);
+ __be32 *p = cd->buffer;
+ int buflen, slen;
+
+ /*
+ dprintk("nfsd: entry(%.*s off %ld ino %ld)\n",
+ namlen, name, offset, ino);
+ */
+
+ if (offset > ~((u32) 0)) {
+ cd->common.err = nfserr_fbig;
+ return -EINVAL;
+ }
+ if (cd->offset)
+ *cd->offset = htonl(offset);
+
+ /* truncate filename */
+ namlen = min(namlen, NFS2_MAXNAMLEN);
+ slen = XDR_QUADLEN(namlen);
+
+ if ((buflen = cd->buflen - slen - 4) < 0) { /* 4 = present flag, file id, name length, next offset */
+ cd->common.err = nfserr_toosmall;
+ return -EINVAL;
+ }
+ if (ino > ~((u32) 0)) {
+ cd->common.err = nfserr_fbig;
+ return -EINVAL;
+ }
+ *p++ = xdr_one; /* mark entry present */
+ *p++ = htonl((u32) ino); /* file id */
+ p = xdr_encode_array(p, name, namlen);/* name length & name */
+ cd->offset = p; /* remember pointer */
+ *p++ = htonl(~0U); /* offset of next entry */
+
+ cd->buflen = buflen;
+ cd->buffer = p;
+ cd->common.err = nfs_ok;
+ return 0;
+ }
+
+/*
+ * XDR release functions
+ */
+ void nfssvc_release_attrstat(struct svc_rqst *rqstp)
+ {
+ struct nfsd_attrstat *resp = rqstp->rq_resp;
+
+ fh_put(&resp->fh); /* release the file handle held in the attrstat result */
+ }
+ /* Release the file handle held in a diropres result. */
+ void nfssvc_release_diropres(struct svc_rqst *rqstp)
+ {
+ struct nfsd_diropres *resp = rqstp->rq_resp;
+
+ fh_put(&resp->fh);
+ }
+ /* Release the file handle held in a READ result. */
+ void nfssvc_release_readres(struct svc_rqst *rqstp)
+ {
+ struct nfsd_readres *resp = rqstp->rq_resp;
+
+ fh_put(&resp->fh);
+ }
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
new file mode 100644
index 000000000..4f4282d4e
--- /dev/null
+++ b/fs/nfsd/pnfs.h
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_NFSD_PNFS_H
+#define _FS_NFSD_PNFS_H 1
+
+#ifdef CONFIG_NFSD_V4
+#include <linux/exportfs.h>
+#include <linux/nfsd/export.h>
+
+#include "state.h"
+#include "xdr4.h"
+
+struct xdr_stream;
+
+struct nfsd4_deviceid_map {
+ struct list_head hash;
+ u64 idx;
+ int fsid_type;
+ u32 fsid[];
+};
+
+struct nfsd4_layout_ops {
+ u32 notify_types;
+ bool disable_recalls;
+
+ __be32 (*proc_getdeviceinfo)(struct super_block *sb,
+ struct svc_rqst *rqstp,
+ struct nfs4_client *clp,
+ struct nfsd4_getdeviceinfo *gdevp);
+ __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
+ struct nfsd4_getdeviceinfo *gdevp);
+
+ __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp,
+ struct nfsd4_layoutget *lgp);
+ __be32 (*encode_layoutget)(struct xdr_stream *,
+ struct nfsd4_layoutget *lgp);
+
+ __be32 (*proc_layoutcommit)(struct inode *inode,
+ struct nfsd4_layoutcommit *lcp);
+
+ void (*fence_client)(struct nfs4_layout_stateid *ls);
+};
+
+extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
+extern const struct nfsd4_layout_ops bl_layout_ops;
+#endif
+#ifdef CONFIG_NFSD_SCSILAYOUT
+extern const struct nfsd4_layout_ops scsi_layout_ops;
+#endif
+#ifdef CONFIG_NFSD_FLEXFILELAYOUT
+extern const struct nfsd4_layout_ops ff_layout_ops;
+#endif
+
+__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, stateid_t *stateid,
+ bool create, u32 layout_type, struct nfs4_layout_stateid **lsp);
+__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp,
+ struct nfs4_layout_stateid *ls);
+__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutreturn *lrp);
+__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutreturn *lrp);
+int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
+ u32 device_generation);
+struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx);
+#endif /* CONFIG_NFSD_V4 */
+
+#ifdef CONFIG_NFSD_PNFS
+void nfsd4_setup_layout_type(struct svc_export *exp);
+void nfsd4_return_all_client_layouts(struct nfs4_client *);
+void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
+ struct nfs4_file *fp);
+int nfsd4_init_pnfs(void);
+void nfsd4_exit_pnfs(void);
+#else
+struct nfs4_client;
+struct nfs4_file;
+
+static inline void nfsd4_setup_layout_type(struct svc_export *exp)
+{
+}
+
+static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp)
+{
+}
+static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
+ struct nfs4_file *fp)
+{
+}
+static inline void nfsd4_exit_pnfs(void)
+{
+}
+static inline int nfsd4_init_pnfs(void)
+{
+ return 0;
+}
+#endif /* CONFIG_NFSD_PNFS */
+#endif /* _FS_NFSD_PNFS_H */
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
new file mode 100644
index 000000000..9eae11a9d
--- /dev/null
+++ b/fs/nfsd/state.h
@@ -0,0 +1,696 @@
+/*
+ * Copyright (c) 2001 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Kendrick Smith <kmsmith@umich.edu>
+ * Andy Adamson <andros@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef _NFSD4_STATE_H
+#define _NFSD4_STATE_H
+
+#include <linux/idr.h>
+#include <linux/refcount.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include "nfsfh.h"
+#include "nfsd.h"
+
+typedef struct {
+ u32 cl_boot;
+ u32 cl_id;
+} clientid_t;
+
+typedef struct {
+ clientid_t so_clid;
+ u32 so_id;
+} stateid_opaque_t;
+
+typedef struct {
+ u32 si_generation;
+ stateid_opaque_t si_opaque;
+} stateid_t;
+
+typedef struct {
+ stateid_t stid;
+#define NFS4_COPY_STID 1
+#define NFS4_COPYNOTIFY_STID 2
+ unsigned char sc_type;
+ refcount_t sc_count;
+} copy_stateid_t;
+
+struct nfsd4_callback {
+ struct nfs4_client *cb_clp;
+ struct rpc_message cb_msg;
+ const struct nfsd4_callback_ops *cb_ops;
+ struct work_struct cb_work;
+ int cb_seq_status;
+ int cb_status;
+ bool cb_need_restart;
+ bool cb_holds_slot;
+};
+
+struct nfsd4_callback_ops {
+ void (*prepare)(struct nfsd4_callback *);
+ int (*done)(struct nfsd4_callback *, struct rpc_task *);
+ void (*release)(struct nfsd4_callback *);
+};
+
+/*
+ * A core object that represents a "common" stateid. These are generally
+ * embedded within the different (more specific) stateid objects and contain
+ * fields that are of general use to any stateid.
+ */
+struct nfs4_stid {
+ refcount_t sc_count;
+#define NFS4_OPEN_STID 1
+#define NFS4_LOCK_STID 2
+#define NFS4_DELEG_STID 4
+/* For an open stateid kept around *only* to process close replays: */
+#define NFS4_CLOSED_STID 8
+/* For a deleg stateid kept around only to process free_stateid's: */
+#define NFS4_REVOKED_DELEG_STID 16
+#define NFS4_CLOSED_DELEG_STID 32
+#define NFS4_LAYOUT_STID 64
+ struct list_head sc_cp_list;
+ unsigned char sc_type;
+ stateid_t sc_stateid;
+ spinlock_t sc_lock;
+ struct nfs4_client *sc_client;
+ struct nfs4_file *sc_file;
+ void (*sc_free)(struct nfs4_stid *);
+};
+
+/* Keep a list of stateids issued by the COPY_NOTIFY, associate it with the
+ * parent OPEN/LOCK/DELEG stateid.
+ */
+struct nfs4_cpntf_state {
+ copy_stateid_t cp_stateid;
+ struct list_head cp_list; /* per parent nfs4_stid */
+ stateid_t cp_p_stateid; /* copy of parent's stateid */
+ clientid_t cp_p_clid; /* copy of parent's clid */
+ time64_t cpntf_time; /* last time stateid used */
+};
+
+/*
+ * Represents a delegation stateid. The nfs4_client holds references to these
+ * and they are put when it is being destroyed or when the delegation is
+ * returned by the client:
+ *
+ * o 1 reference as long as a delegation is still in force (taken when it's
+ * alloc'd, put when it's returned or revoked)
+ *
+ * o 1 reference as long as a recall rpc is in progress (taken when the lease
+ * is broken, put when the rpc exits)
+ *
+ * o 1 more ephemeral reference for each nfsd thread currently doing something
+ * with that delegation without holding the cl_lock
+ *
+ * If the server attempts to recall a delegation and the client doesn't do so
+ * before a timeout, the server may also revoke the delegation. In that case,
+ * the object will either be destroyed (v4.0) or moved to a per-client list of
+ * revoked delegations (v4.1+).
+ *
+ * This object is a superset of the nfs4_stid.
+ */
+struct nfs4_delegation {
+ struct nfs4_stid dl_stid; /* must be first field */
+ struct list_head dl_perfile;
+ struct list_head dl_perclnt;
+ struct list_head dl_recall_lru; /* delegation recalled */
+ struct nfs4_clnt_odstate *dl_clnt_odstate;
+ u32 dl_type;
+ time64_t dl_time;
+/* For recall: */
+ int dl_retries;
+ struct nfsd4_callback dl_recall;
+};
+
+#define cb_to_delegation(cb) \
+ container_of(cb, struct nfs4_delegation, dl_recall)
+
+/* client delegation callback info */
+struct nfs4_cb_conn {
+ /* SETCLIENTID info */
+ struct sockaddr_storage cb_addr;
+ struct sockaddr_storage cb_saddr;
+ size_t cb_addrlen;
+ u32 cb_prog; /* used only in 4.0 case;
+ per-session otherwise */
+ u32 cb_ident; /* minorversion 0 only */
+ struct svc_xprt *cb_xprt; /* minorversion 1 only */
+};
+
+static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
+{
+ return container_of(s, struct nfs4_delegation, dl_stid);
+}
+
+/* Maximum number of slots per session. 160 is useful for long haul TCP */
+#define NFSD_MAX_SLOTS_PER_SESSION 160
+/* Maximum number of operations per session compound */
+#define NFSD_MAX_OPS_PER_COMPOUND 16
+/* Maximum session per slot cache size */
+#define NFSD_SLOT_CACHE_SIZE 2048
+/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
+#define NFSD_CACHE_SIZE_SLOTS_PER_SESSION 32
+#define NFSD_MAX_MEM_PER_SESSION \
+ (NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE)
+
+struct nfsd4_slot {
+ u32 sl_seqid;
+ __be32 sl_status;
+ struct svc_cred sl_cred;
+ u32 sl_datalen;
+ u16 sl_opcnt;
+#define NFSD4_SLOT_INUSE (1 << 0)
+#define NFSD4_SLOT_CACHETHIS (1 << 1)
+#define NFSD4_SLOT_INITIALIZED (1 << 2)
+#define NFSD4_SLOT_CACHED (1 << 3)
+ u8 sl_flags;
+ char sl_data[];
+};
+
+struct nfsd4_channel_attrs {
+ u32 headerpadsz;
+ u32 maxreq_sz;
+ u32 maxresp_sz;
+ u32 maxresp_cached;
+ u32 maxops;
+ u32 maxreqs;
+ u32 nr_rdma_attrs;
+ u32 rdma_attrs;
+};
+
+struct nfsd4_cb_sec {
+ u32 flavor; /* (u32)(-1) used to mean "no valid flavor" */
+ kuid_t uid;
+ kgid_t gid;
+};
+
+struct nfsd4_create_session {
+ clientid_t clientid;
+ struct nfs4_sessionid sessionid;
+ u32 seqid;
+ u32 flags;
+ struct nfsd4_channel_attrs fore_channel;
+ struct nfsd4_channel_attrs back_channel;
+ u32 callback_prog;
+ struct nfsd4_cb_sec cb_sec;
+};
+
+struct nfsd4_backchannel_ctl {
+ u32 bc_cb_program;
+ struct nfsd4_cb_sec bc_cb_sec;
+};
+
+struct nfsd4_bind_conn_to_session {
+ struct nfs4_sessionid sessionid;
+ u32 dir;
+};
+
+/* The single slot clientid cache structure */
+struct nfsd4_clid_slot {
+ u32 sl_seqid;
+ __be32 sl_status;
+ struct nfsd4_create_session sl_cr_ses;
+};
+
+struct nfsd4_conn {
+ struct list_head cn_persession;
+ struct svc_xprt *cn_xprt;
+ struct svc_xpt_user cn_xpt_user;
+ struct nfsd4_session *cn_session;
+/* CDFC4_FORE, CDFC4_BACK: */
+ unsigned char cn_flags;
+};
+
+/*
+ * Representation of a v4.1+ session. These are refcounted in a similar fashion
+ * to the nfs4_client. References are only taken when the server is actively
+ * working on the object (primarily during the processing of compounds).
+ */
+struct nfsd4_session {
+ atomic_t se_ref;
+ struct list_head se_hash; /* hash by sessionid */
+ struct list_head se_perclnt;
+/* See SESSION4_PERSIST, etc. for standard flags; this is internal-only: */
+#define NFS4_SESSION_DEAD 0x010
+ u32 se_flags;
+ struct nfs4_client *se_client;
+ struct nfs4_sessionid se_sessionid;
+ struct nfsd4_channel_attrs se_fchannel;
+ struct nfsd4_channel_attrs se_bchannel;
+ struct nfsd4_cb_sec se_cb_sec;
+ struct list_head se_conns;
+ u32 se_cb_prog;
+ u32 se_cb_seq_nr;
+ struct nfsd4_slot *se_slots[]; /* forward channel slots */
+};
+
+/* formatted contents of nfs4_sessionid */
+struct nfsd4_sessionid {
+ clientid_t clientid;
+ u32 sequence;
+ u32 reserved;
+};
+
+#define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */
+
+/*
+ * struct nfs4_client - one per client. Clientids live here.
+ *
+ * The initial object created by an NFS client using SETCLIENTID (for NFSv4.0)
+ * or EXCHANGE_ID (for NFSv4.1+). These objects are refcounted and timestamped.
+ * Each nfsd_net_ns object contains a set of these and they are tracked via
+ * short and long form clientid. They are hashed and searched for under the
+ * per-nfsd_net client_lock spinlock.
+ *
+ * References to it are only held during the processing of compounds, and in
+ * certain other operations. In their "resting state" they have a refcount of
+ * 0. If they are not renewed within a lease period, they become eligible for
+ * destruction by the laundromat.
+ *
+ * These objects can also be destroyed prematurely by the fault injection code,
+ * or if the client sends certain forms of SETCLIENTID or EXCHANGE_ID updates.
+ * Care is taken *not* to do this however when the objects have an elevated
+ * refcount.
+ *
+ * o Each nfs4_client is hashed by clientid
+ *
+ * o Each nfs4_client is also hashed by name (the opaque quantity initially
+ * sent by the client to identify itself).
+ *
+ * o cl_perclient list is used to ensure no dangling stateowner references
+ * when we expire the nfs4_client
+ */
+struct nfs4_client {
+ struct list_head cl_idhash; /* hash by cl_clientid.id */
+ struct rb_node cl_namenode; /* link into by-name trees */
+ struct list_head *cl_ownerstr_hashtbl;
+ struct list_head cl_openowners;
+ struct idr cl_stateids; /* stateid lookup */
+ struct list_head cl_delegations;
+ struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */
+ struct list_head cl_lru; /* tail queue */
+#ifdef CONFIG_NFSD_PNFS
+ struct list_head cl_lo_states; /* outstanding layout states */
+#endif
+ struct xdr_netobj cl_name; /* id generated by client */
+ nfs4_verifier cl_verifier; /* generated by client */
+ time64_t cl_time; /* time of last lease renewal */
+ struct sockaddr_storage cl_addr; /* client ipaddress */
+ bool cl_mach_cred; /* SP4_MACH_CRED in force */
+ struct svc_cred cl_cred; /* setclientid principal */
+ clientid_t cl_clientid; /* generated by server */
+ nfs4_verifier cl_confirm; /* generated by server */
+ u32 cl_minorversion;
+ /* NFSv4.1 client implementation id: */
+ struct xdr_netobj cl_nii_domain;
+ struct xdr_netobj cl_nii_name;
+ struct timespec64 cl_nii_time;
+
+ /* for v4.0 and v4.1 callbacks: */
+ struct nfs4_cb_conn cl_cb_conn;
+#define NFSD4_CLIENT_CB_UPDATE (0)
+#define NFSD4_CLIENT_CB_KILL (1)
+#define NFSD4_CLIENT_STABLE (2) /* client on stable storage */
+#define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */
+#define NFSD4_CLIENT_CONFIRMED (4) /* client is confirmed */
+#define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */
+#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \
+ 1 << NFSD4_CLIENT_CB_KILL)
+ unsigned long cl_flags;
+ const struct cred *cl_cb_cred;
+ struct rpc_clnt *cl_cb_client;
+ u32 cl_cb_ident;
+#define NFSD4_CB_UP 0
+#define NFSD4_CB_UNKNOWN 1
+#define NFSD4_CB_DOWN 2
+#define NFSD4_CB_FAULT 3
+ int cl_cb_state;
+ struct nfsd4_callback cl_cb_null;
+ struct nfsd4_session *cl_cb_session;
+
+ /* for all client information that callback code might need: */
+ spinlock_t cl_lock;
+
+ /* for nfs41 */
+ struct list_head cl_sessions;
+ struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */
+ u32 cl_exchange_flags;
+ /* number of rpc's in progress over an associated session: */
+ atomic_t cl_rpc_users;
+ struct nfsdfs_client cl_nfsdfs;
+ struct nfs4_op_map cl_spo_must_allow;
+
+ /* debugging info directory under nfsd/clients/ : */
+ struct dentry *cl_nfsd_dentry;
+
+ /* for nfs41 callbacks */
+ /* We currently support a single back channel with a single slot */
+ unsigned long cl_cb_slot_busy;
+ struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
+ /* wait here for slots */
+ struct net *net;
+ struct list_head async_copies; /* list of async copies */
+ spinlock_t async_lock; /* lock for async copies */
+ atomic_t cl_cb_inflight; /* Outstanding callbacks */
+};
+
+/* struct nfs4_client_reclaim
+ * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl
+ * upon lease reset, or from upcall to state_daemon (to read in state
+ * from non-volatile storage) upon reboot.
+ */
+struct nfs4_client_reclaim {
+ struct list_head cr_strhash; /* hash by cr_name */
+ struct nfs4_client *cr_clp; /* pointer to associated clp */
+ struct xdr_netobj cr_name; /* recovery dir name */
+ struct xdr_netobj cr_princhash;
+};
+
+/* A reasonable value for REPLAY_ISIZE was estimated as follows:
+ * The OPEN response, typically the largest, requires
+ * 4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) + 8(verifier) +
+ * 4(deleg. type) + 8(deleg. stateid) + 4(deleg. recall flag) +
+ * 20(deleg. space limit) + ~32(deleg. ace) = 112 bytes
+ */
+
+#define NFSD4_REPLAY_ISIZE 112
+
+/*
+ * Replay buffer, where the result of the last seqid-mutating operation
+ * is cached.
+ */
+struct nfs4_replay {
+ __be32 rp_status;
+ unsigned int rp_buflen;
+ char *rp_buf;
+ struct knfsd_fh rp_openfh;
+ struct mutex rp_mutex;
+ char rp_ibuf[NFSD4_REPLAY_ISIZE];
+};
+
+struct nfs4_stateowner;
+
+struct nfs4_stateowner_operations {
+ void (*so_unhash)(struct nfs4_stateowner *);
+ void (*so_free)(struct nfs4_stateowner *);
+};
+
+/*
+ * A core object that represents either an open or lock owner. The open and
+ * lock owner objects have one of these embedded within them. Refcounts and
+ * other fields common to both owner types are contained within these
+ * structures.
+ */
+struct nfs4_stateowner {
+ struct list_head so_strhash;
+ struct list_head so_stateids;
+ struct nfs4_client *so_client;
+ const struct nfs4_stateowner_operations *so_ops;
+ /* NOTE(review): this presumably describes so_seqid, not so_count: after
+ * increment in nfsd4_bump_seqid, the next sequence id expected from the client: */
+ atomic_t so_count;
+ u32 so_seqid;
+ struct xdr_netobj so_owner; /* open owner name */
+ struct nfs4_replay so_replay;
+ bool so_is_open_owner;
+};
+
+/*
+ * When a file is opened, the client provides an open state owner opaque string
+ * that indicates the "owner" of that open. These objects are refcounted.
+ * References to it are held by each open state associated with it. This object
+ * is a superset of the nfs4_stateowner struct.
+ */
+struct nfs4_openowner {
+ struct nfs4_stateowner oo_owner; /* must be first field */
+ struct list_head oo_perclient;
+ /*
+ * We keep around openowners a little while after last close,
+ * which saves clients from having to confirm, and allows us to
+ * handle close replays if they come soon enough. The close_lru
+ * is a list of such openowners, to be reaped by the laundromat
+ * thread eventually if they remain unused:
+ */
+ struct list_head oo_close_lru;
+ struct nfs4_ol_stateid *oo_last_closed_stid;
+ time64_t oo_time; /* time of placement on oo_close_lru */
+#define NFS4_OO_CONFIRMED 1
+ unsigned char oo_flags;
+};
+
+/*
+ * Represents a generic "lockowner". Similar to an openowner. References to it
+ * are held by the lock stateids that are created on its behalf. This object is
+ * a superset of the nfs4_stateowner struct.
+ */
+struct nfs4_lockowner {
+ struct nfs4_stateowner lo_owner; /* must be first element */
+ struct list_head lo_blocked; /* blocked file_locks */
+};
+
+static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so)
+{
+ return container_of(so, struct nfs4_openowner, oo_owner);
+}
+
+static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
+{
+ return container_of(so, struct nfs4_lockowner, lo_owner);
+}
+
+/*
+ * Per-client state indicating no. of opens and outstanding delegations
+ * on a file from a particular client. 'od' stands for 'open & delegation'
+ */
+struct nfs4_clnt_odstate {
+ struct nfs4_client *co_client;
+ struct nfs4_file *co_file;
+ struct list_head co_perfile;
+ refcount_t co_odcount;
+};
+
+/*
+ * nfs4_file: a file opened by some number of (open) nfs4_stateowners.
+ *
+ * These objects are global. nfsd keeps one instance of a nfs4_file per
+ * filehandle (though it may keep multiple file descriptors for each). Each
+ * inode can have multiple filehandles associated with it, so there is
+ * (potentially) a many to one relationship between this struct and struct
+ * inode.
+ *
+ * These are hashed by filehandle in the file_hashtbl, which is protected by
+ * the global state_lock spinlock.
+ */
+struct nfs4_file {
+ refcount_t fi_ref;
+ spinlock_t fi_lock;
+ struct hlist_node fi_hash; /* hash on fi_fhandle */
+ struct list_head fi_stateids;
+ union {
+ struct list_head fi_delegations;
+ struct rcu_head fi_rcu;
+ };
+ struct list_head fi_clnt_odstate;
+ /* One each for O_RDONLY, O_WRONLY, O_RDWR: */
+ struct nfsd_file *fi_fds[3];
+ /*
+ * Each open or lock stateid contributes 0-4 to the counts
+ * below depending on which bits are set in st_access_bitmap:
+ * 1 to fi_access[O_RDONLY] if NFS4_SHARE_ACCESS_READ is set
+ * + 1 to fi_access[O_WRONLY] if NFS4_SHARE_ACCESS_WRITE is set
+ * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set.
+ */
+ atomic_t fi_access[2];
+ u32 fi_share_deny;
+ struct nfsd_file *fi_deleg_file;
+ int fi_delegees;
+ struct knfsd_fh fi_fhandle;
+ bool fi_had_conflict;
+#ifdef CONFIG_NFSD_PNFS
+ struct list_head fi_lo_states;
+ atomic_t fi_lo_recalls;
+#endif
+};
+
+/*
+ * A generic struct representing either an open or lock stateid. The nfs4_client
+ * holds a reference to each of these objects, and they in turn hold a
+ * reference to their respective stateowners. The client's reference is
+ * released in response to a close or unlock (depending on whether it's an open
+ * or lock stateid) or when the client is being destroyed.
+ *
+ * In the case of v4.0 open stateids, these objects are preserved for a little
+ * while after close in order to handle CLOSE replays. Those are eventually
+ * reclaimed via a LRU scheme by the laundromat.
+ *
+ * This object is a superset of the nfs4_stid. "ol" stands for "Open or Lock".
+ * Better suggestions welcome.
+ */
+struct nfs4_ol_stateid {
+ struct nfs4_stid st_stid;
+ struct list_head st_perfile;
+ struct list_head st_perstateowner;
+ struct list_head st_locks;
+ struct nfs4_stateowner *st_stateowner;
+ struct nfs4_clnt_odstate *st_clnt_odstate;
+ unsigned char st_access_bmap;
+ unsigned char st_deny_bmap;
+ struct nfs4_ol_stateid *st_openstp;
+ struct mutex st_mutex;
+};
+
+static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
+{
+ return container_of(s, struct nfs4_ol_stateid, st_stid);
+}
+
+struct nfs4_layout_stateid {
+ struct nfs4_stid ls_stid;
+ struct list_head ls_perclnt;
+ struct list_head ls_perfile;
+ spinlock_t ls_lock;
+ struct list_head ls_layouts;
+ u32 ls_layout_type;
+ struct nfsd_file *ls_file;
+ struct nfsd4_callback ls_recall;
+ stateid_t ls_recall_sid;
+ bool ls_recalled;
+ struct mutex ls_mutex;
+};
+
+static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
+{
+ return container_of(s, struct nfs4_layout_stateid, ls_stid);
+}
+
+/* flags for preprocess_seqid_op() */
+#define RD_STATE 0x00000010
+#define WR_STATE 0x00000020
+
+enum nfsd4_cb_op {
+ NFSPROC4_CLNT_CB_NULL = 0,
+ NFSPROC4_CLNT_CB_RECALL,
+ NFSPROC4_CLNT_CB_LAYOUT,
+ NFSPROC4_CLNT_CB_OFFLOAD,
+ NFSPROC4_CLNT_CB_SEQUENCE,
+ NFSPROC4_CLNT_CB_NOTIFY_LOCK,
+};
+
+/* Returns true iff a is later than b (signed 32-bit difference, so u32 wraparound is tolerated): */
+static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b)
+{
+ return (s32)(a->si_generation - b->si_generation) > 0;
+}
+
+/*
+ * When a client tries to get a lock on a file, we set one of these objects
+ * on the blocking lock. When the lock becomes free, we can then issue a
+ * CB_NOTIFY_LOCK to the client.
+ */
+struct nfsd4_blocked_lock {
+ struct list_head nbl_list;
+ struct list_head nbl_lru;
+ time64_t nbl_time;
+ struct file_lock nbl_lock;
+ struct knfsd_fh nbl_fh;
+ struct nfsd4_callback nbl_cb;
+};
+
+struct nfsd4_compound_state;
+struct nfsd_net;
+struct nfsd4_copy;
+
+extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
+ stateid_t *stateid, int flags, struct nfsd_file **filp,
+ struct nfs4_stid **cstid);
+__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
+ stateid_t *stateid, unsigned char typemask,
+ struct nfs4_stid **s, struct nfsd_net *nn);
+struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab,
+ void (*sc_free)(struct nfs4_stid *));
+int nfs4_init_copy_state(struct nfsd_net *nn, struct nfsd4_copy *copy);
+void nfs4_free_copy_state(struct nfsd4_copy *copy);
+struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn,
+ struct nfs4_stid *p_stid);
+void nfs4_unhash_stid(struct nfs4_stid *s);
+void nfs4_put_stid(struct nfs4_stid *s);
+void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid);
+void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
+extern void nfs4_release_reclaim(struct nfsd_net *);
+extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct xdr_netobj name,
+ struct nfsd_net *nn);
+extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
+ struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
+extern void nfsd4_probe_callback(struct nfs4_client *clp);
+extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
+extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
+extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
+ const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
+extern void nfsd4_run_cb(struct nfsd4_callback *cb);
+extern int nfsd4_create_callback_queue(void);
+extern void nfsd4_destroy_callback_queue(void);
+extern void nfsd4_shutdown_callback(struct nfs4_client *);
+extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
+extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp);
+extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name,
+ struct xdr_netobj princhash, struct nfsd_net *nn);
+extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn);
+
+struct nfs4_file *find_file(struct knfsd_fh *fh);
+void put_nfs4_file(struct nfs4_file *fi);
+extern void nfs4_put_copy(struct nfsd4_copy *copy);
+extern struct nfsd4_copy * /* presumably looks up one of clp's async copies by stateid - confirm at the definition */
+find_async_copy(struct nfs4_client *clp, stateid_t *stateid);
+extern void nfs4_put_cpntf_state(struct nfsd_net *nn,
+ struct nfs4_cpntf_state *cps);
+extern __be32 manage_cpntf_state(struct nfsd_net *nn, stateid_t *st,
+ struct nfs4_client *clp,
+ struct nfs4_cpntf_state **cps);
+static inline void get_nfs4_file(struct nfs4_file *fi)
+{
+ refcount_inc(&fi->fi_ref);
+}
+struct nfsd_file *find_any_file(struct nfs4_file *f);
+
+/* grace period management */
+void nfsd4_end_grace(struct nfsd_net *nn);
+
+/* nfs4recover operations */
+extern int nfsd4_client_tracking_init(struct net *net);
+extern void nfsd4_client_tracking_exit(struct net *net);
+extern void nfsd4_client_record_create(struct nfs4_client *clp);
+extern void nfsd4_client_record_remove(struct nfs4_client *clp);
+extern int nfsd4_client_record_check(struct nfs4_client *clp);
+extern void nfsd4_record_grace_done(struct nfsd_net *nn);
+
+#endif /* _NFSD4_STATE_H */
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
new file mode 100644
index 000000000..b1bc582b0
--- /dev/null
+++ b/fs/nfsd/stats.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * procfs-based user access to knfsd statistics
+ *
+ * /proc/net/rpc/nfsd
+ *
+ * Format:
+ * rc <hits> <misses> <nocache>
+ * Statistics for the reply cache
+ * fh <stale> <total-lookups> <anonlookups> <dir-not-in-dcache> <nondir-not-in-dcache>
+ * statistics for filehandle lookup
+ * io <bytes-read> <bytes-written>
+ * statistics for IO throughput
+ * th <threads> <fullcnt> <10%-20%> <20%-30%> ... <90%-100%> <100%>
+ * time (seconds) when nfsd thread usage above thresholds
+ * and number of times that all threads were in use
+ * ra cache-size <10% <20% <30% ... <100% not-found
+ * number of times that read-ahead entry was found that deep in
+ * the cache.
+ * plus generic RPC stats (see net/sunrpc/stats.c)
+ *
+ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/sunrpc/stats.h>
+#include <net/net_namespace.h>
+
+#include "nfsd.h"
+
+struct nfsd_stats nfsdstats;
+struct svc_stat nfsd_svcstats = {
+ .program = &nfsd_program,
+};
+
+static int nfsd_proc_show(struct seq_file *seq, void *v) /* show routine handed to single_open(); v is unused; always returns 0 */
+{
+ int i;
+
+ seq_printf(seq, "rc %u %u %u\nfh %u %u %u %u %u\nio %u %u\n",
+ nfsdstats.rchits,
+ nfsdstats.rcmisses,
+ nfsdstats.rcnocache,
+ nfsdstats.fh_stale,
+ nfsdstats.fh_lookup,
+ nfsdstats.fh_anon,
+ nfsdstats.fh_nocache_dir,
+ nfsdstats.fh_nocache_nondir,
+ nfsdstats.io_read,
+ nfsdstats.io_write);
+ /* thread usage: */
+ seq_printf(seq, "th %u %u", nfsdstats.th_cnt, nfsdstats.th_fullcnt);
+ for (i=0; i<10; i++) { /* one value per th_usage[] bucket */
+ unsigned int jifs = nfsdstats.th_usage[i];
+ unsigned int sec = jifs / HZ, msec = (jifs % HZ)*1000/HZ; /* ticks -> whole seconds + milliseconds */
+ seq_printf(seq, " %u.%03u", sec, msec);
+ }
+
+ /* newline and ra-cache */
+ seq_printf(seq, "\nra %u", nfsdstats.ra_size);
+ for (i=0; i<11; i++) /* one value per ra_depth[] bucket */
+ seq_printf(seq, " %u", nfsdstats.ra_depth[i]);
+ seq_putc(seq, '\n');
+
+ /* show my rpc info */
+ svc_seq_show(seq, &nfsd_svcstats);
+
+#ifdef CONFIG_NFSD_V4
+ /* Show count for individual nfsv4 operations */
+ /* Writing operation numbers 0 1 2 also for maintaining uniformity */
+ seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1);
+ for (i = 0; i <= LAST_NFS4_OP; i++)
+ seq_printf(seq, " %u", nfsdstats.nfs4_opcount[i]);
+
+ seq_putc(seq, '\n');
+#endif
+
+ return 0;
+}
+
+static int nfsd_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, nfsd_proc_show, NULL);
+}
+
+static const struct proc_ops nfsd_proc_ops = {
+ .proc_open = nfsd_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+};
+
+void
+nfsd_stat_init(void)
+{
+ svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops);
+}
+
+void
+nfsd_stat_shutdown(void)
+{
+ svc_proc_unregister(&init_net, "nfsd");
+}
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
new file mode 100644
index 000000000..b23fdac69
--- /dev/null
+++ b/fs/nfsd/stats.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Statistics for NFS server.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+#ifndef _NFSD_STATS_H
+#define _NFSD_STATS_H
+
+#include <uapi/linux/nfsd/stats.h>
+
+
+struct nfsd_stats {
+ unsigned int rchits; /* repcache hits */
+ unsigned int rcmisses; /* repcache misses */
+ unsigned int rcnocache; /* uncached reqs */
+ unsigned int fh_stale; /* FH stale error */
+ unsigned int fh_lookup; /* dentry cached */
+ unsigned int fh_anon; /* anon file dentry returned */
+ unsigned int fh_nocache_dir; /* directory filehandle not found in dcache */
+ unsigned int fh_nocache_nondir; /* non-directory filehandle not found in dcache */
+ unsigned int io_read; /* bytes returned to read requests */
+ unsigned int io_write; /* bytes passed in write requests */
+ unsigned int th_cnt; /* number of available threads */
+ unsigned int th_usage[10]; /* number of ticks during which n perdeciles
+ * of available threads were in use */
+ unsigned int th_fullcnt; /* number of times last free thread was used */
+ unsigned int ra_size; /* size of ra cache */
+ unsigned int ra_depth[11]; /* number of times ra entry was found that deep
+ * in the cache (10percentiles). [10] = not found */
+#ifdef CONFIG_NFSD_V4
+ unsigned int nfs4_opcount[LAST_NFS4_OP + 1]; /* count of individual nfsv4 operations */
+#endif
+
+};
+
+
+extern struct nfsd_stats nfsdstats;
+extern struct svc_stat nfsd_svcstats;
+
+void nfsd_stat_init(void);
+void nfsd_stat_shutdown(void);
+
+#endif /* _NFSD_STATS_H */
diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
new file mode 100644
index 000000000..90967466a
--- /dev/null
+++ b/fs/nfsd/trace.c
@@ -0,0 +1,3 @@
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
new file mode 100644
index 000000000..a952f4a9b
--- /dev/null
+++ b/fs/nfsd/trace.h
@@ -0,0 +1,756 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nfsd
+
+#if !defined(_NFSD_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _NFSD_TRACE_H
+
+#include <linux/tracepoint.h>
+#include "export.h"
+#include "nfsfh.h"
+
+TRACE_EVENT(nfsd_compound,
+ TP_PROTO(const struct svc_rqst *rqst,
+ u32 args_opcnt),
+ TP_ARGS(rqst, args_opcnt),
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(u32, args_opcnt)
+ ),
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqst->rq_xid);
+ __entry->args_opcnt = args_opcnt;
+ ),
+ TP_printk("xid=0x%08x opcnt=%u",
+ __entry->xid, __entry->args_opcnt)
+)
+
+TRACE_EVENT(nfsd_compound_status,
+ TP_PROTO(u32 args_opcnt,
+ u32 resp_opcnt,
+ __be32 status,
+ const char *name),
+ TP_ARGS(args_opcnt, resp_opcnt, status, name),
+ TP_STRUCT__entry(
+ __field(u32, args_opcnt)
+ __field(u32, resp_opcnt)
+ __field(int, status)
+ __string(name, name)
+ ),
+ TP_fast_assign(
+ __entry->args_opcnt = args_opcnt;
+ __entry->resp_opcnt = resp_opcnt;
+ __entry->status = be32_to_cpu(status);
+ __assign_str(name, name);
+ ),
+ TP_printk("op=%u/%u %s status=%d",
+ __entry->resp_opcnt, __entry->args_opcnt,
+ __get_str(name), __entry->status)
+)
+
+DECLARE_EVENT_CLASS(nfsd_fh_err_class,
+ TP_PROTO(struct svc_rqst *rqstp,
+ struct svc_fh *fhp,
+ int status),
+ TP_ARGS(rqstp, fhp, status),
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(u32, fh_hash)
+ __field(int, status)
+ ),
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __entry->status = status;
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x status=%d",
+ __entry->xid, __entry->fh_hash,
+ __entry->status)
+)
+
+#define DEFINE_NFSD_FH_ERR_EVENT(name) \
+DEFINE_EVENT(nfsd_fh_err_class, nfsd_##name, \
+ TP_PROTO(struct svc_rqst *rqstp, \
+ struct svc_fh *fhp, \
+ int status), \
+ TP_ARGS(rqstp, fhp, status))
+
+DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badexport);
+DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badhandle);
+
+TRACE_EVENT(nfsd_exp_find_key,
+ TP_PROTO(const struct svc_expkey *key,
+ int status),
+ TP_ARGS(key, status),
+ TP_STRUCT__entry(
+ __field(int, fsidtype)
+ __array(u32, fsid, 6)
+ __string(auth_domain, key->ek_client->name)
+ __field(int, status)
+ ),
+ TP_fast_assign(
+ __entry->fsidtype = key->ek_fsidtype;
+ memcpy(__entry->fsid, key->ek_fsid, 4*6);
+ __assign_str(auth_domain, key->ek_client->name);
+ __entry->status = status;
+ ),
+ TP_printk("fsid=%x::%s domain=%s status=%d",
+ __entry->fsidtype,
+ __print_array(__entry->fsid, 6, 4),
+ __get_str(auth_domain),
+ __entry->status
+ )
+);
+
+TRACE_EVENT(nfsd_expkey_update,
+ TP_PROTO(const struct svc_expkey *key, const char *exp_path),
+ TP_ARGS(key, exp_path),
+ TP_STRUCT__entry(
+ __field(int, fsidtype)
+ __array(u32, fsid, 6)
+ __string(auth_domain, key->ek_client->name)
+ __string(path, exp_path)
+ __field(bool, cache)
+ ),
+ TP_fast_assign(
+ __entry->fsidtype = key->ek_fsidtype;
+ memcpy(__entry->fsid, key->ek_fsid, 4*6);
+ __assign_str(auth_domain, key->ek_client->name);
+ __assign_str(path, exp_path);
+ __entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags);
+ ),
+ TP_printk("fsid=%x::%s domain=%s path=%s cache=%s",
+ __entry->fsidtype,
+ __print_array(__entry->fsid, 6, 4),
+ __get_str(auth_domain),
+ __get_str(path),
+ __entry->cache ? "pos" : "neg"
+ )
+);
+
+TRACE_EVENT(nfsd_exp_get_by_name,
+ TP_PROTO(const struct svc_export *key,
+ int status),
+ TP_ARGS(key, status),
+ TP_STRUCT__entry(
+ __string(path, key->ex_path.dentry->d_name.name)
+ __string(auth_domain, key->ex_client->name)
+ __field(int, status)
+ ),
+ TP_fast_assign(
+ __assign_str(path, key->ex_path.dentry->d_name.name);
+ __assign_str(auth_domain, key->ex_client->name);
+ __entry->status = status;
+ ),
+ TP_printk("path=%s domain=%s status=%d",
+ __get_str(path),
+ __get_str(auth_domain),
+ __entry->status
+ )
+);
+
+TRACE_EVENT(nfsd_export_update,
+ TP_PROTO(const struct svc_export *key),
+ TP_ARGS(key),
+ TP_STRUCT__entry(
+ __string(path, key->ex_path.dentry->d_name.name)
+ __string(auth_domain, key->ex_client->name)
+ __field(bool, cache)
+ ),
+ TP_fast_assign(
+ __assign_str(path, key->ex_path.dentry->d_name.name);
+ __assign_str(auth_domain, key->ex_client->name);
+ __entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags);
+ ),
+ TP_printk("path=%s domain=%s cache=%s",
+ __get_str(path),
+ __get_str(auth_domain),
+ __entry->cache ? "pos" : "neg"
+ )
+);
+
+DECLARE_EVENT_CLASS(nfsd_io_class,
+ TP_PROTO(struct svc_rqst *rqstp,
+ struct svc_fh *fhp,
+ u64 offset,
+ u32 len),
+ TP_ARGS(rqstp, fhp, offset, len),
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(u32, fh_hash)
+ __field(u64, offset)
+ __field(u32, len)
+ ),
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __entry->offset = offset;
+ __entry->len = len;
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu len=%u",
+ __entry->xid, __entry->fh_hash,
+ __entry->offset, __entry->len)
+)
+
+#define DEFINE_NFSD_IO_EVENT(name) \
+DEFINE_EVENT(nfsd_io_class, nfsd_##name, \
+ TP_PROTO(struct svc_rqst *rqstp, \
+ struct svc_fh *fhp, \
+ u64 offset, \
+ u32 len), \
+ TP_ARGS(rqstp, fhp, offset, len))
+
+DEFINE_NFSD_IO_EVENT(read_start);
+DEFINE_NFSD_IO_EVENT(read_splice);
+DEFINE_NFSD_IO_EVENT(read_vector);
+DEFINE_NFSD_IO_EVENT(read_io_done);
+DEFINE_NFSD_IO_EVENT(read_done);
+DEFINE_NFSD_IO_EVENT(write_start);
+DEFINE_NFSD_IO_EVENT(write_opened);
+DEFINE_NFSD_IO_EVENT(write_io_done);
+DEFINE_NFSD_IO_EVENT(write_done);
+
+DECLARE_EVENT_CLASS(nfsd_err_class,
+ TP_PROTO(struct svc_rqst *rqstp,
+ struct svc_fh *fhp,
+ loff_t offset,
+ int status),
+ TP_ARGS(rqstp, fhp, offset, status),
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(u32, fh_hash)
+ __field(loff_t, offset)
+ __field(int, status)
+ ),
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __entry->offset = offset;
+ __entry->status = status;
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld status=%d",
+ __entry->xid, __entry->fh_hash,
+ __entry->offset, __entry->status)
+)
+
+#define DEFINE_NFSD_ERR_EVENT(name) /* declares nfsd_<name> as an instance of nfsd_err_class */ \
+DEFINE_EVENT(nfsd_err_class, nfsd_##name, \
+ TP_PROTO(struct svc_rqst *rqstp, \
+ struct svc_fh *fhp, \
+ loff_t offset, \
+ int status), /* same name as in the class, which records it as "status" */ \
+ TP_ARGS(rqstp, fhp, offset, status))
+
+DEFINE_NFSD_ERR_EVENT(read_err);
+DEFINE_NFSD_ERR_EVENT(write_err);
+
+#include "state.h"
+#include "filecache.h"
+#include "vfs.h"
+
+DECLARE_EVENT_CLASS(nfsd_stateid_class,
+ TP_PROTO(stateid_t *stp),
+ TP_ARGS(stp),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, si_id)
+ __field(u32, si_generation)
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
+ __entry->cl_id = stp->si_opaque.so_clid.cl_id;
+ __entry->si_id = stp->si_opaque.so_id;
+ __entry->si_generation = stp->si_generation;
+ ),
+ TP_printk("client %08x:%08x stateid %08x:%08x",
+ __entry->cl_boot,
+ __entry->cl_id,
+ __entry->si_id,
+ __entry->si_generation)
+)
+
+#define DEFINE_STATEID_EVENT(name) \
+DEFINE_EVENT(nfsd_stateid_class, nfsd_##name, \
+ TP_PROTO(stateid_t *stp), \
+ TP_ARGS(stp))
+
+DEFINE_STATEID_EVENT(layoutstate_alloc);
+DEFINE_STATEID_EVENT(layoutstate_unhash);
+DEFINE_STATEID_EVENT(layoutstate_free);
+DEFINE_STATEID_EVENT(layout_get_lookup_fail);
+DEFINE_STATEID_EVENT(layout_commit_lookup_fail);
+DEFINE_STATEID_EVENT(layout_return_lookup_fail);
+DEFINE_STATEID_EVENT(layout_recall);
+DEFINE_STATEID_EVENT(layout_recall_done);
+DEFINE_STATEID_EVENT(layout_recall_fail);
+DEFINE_STATEID_EVENT(layout_recall_release);
+
+DEFINE_STATEID_EVENT(open);
+DEFINE_STATEID_EVENT(deleg_read);
+DEFINE_STATEID_EVENT(deleg_break);
+DEFINE_STATEID_EVENT(deleg_recall);
+
+DECLARE_EVENT_CLASS(nfsd_stateseqid_class,
+ TP_PROTO(u32 seqid, const stateid_t *stp),
+ TP_ARGS(seqid, stp),
+ TP_STRUCT__entry(
+ __field(u32, seqid)
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, si_id)
+ __field(u32, si_generation)
+ ),
+ TP_fast_assign(
+ __entry->seqid = seqid;
+ __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
+ __entry->cl_id = stp->si_opaque.so_clid.cl_id;
+ __entry->si_id = stp->si_opaque.so_id;
+ __entry->si_generation = stp->si_generation;
+ ),
+ TP_printk("seqid=%u client %08x:%08x stateid %08x:%08x",
+ __entry->seqid, __entry->cl_boot, __entry->cl_id,
+ __entry->si_id, __entry->si_generation)
+)
+
+#define DEFINE_STATESEQID_EVENT(name) \
+DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \
+ TP_PROTO(u32 seqid, const stateid_t *stp), \
+ TP_ARGS(seqid, stp))
+
+DEFINE_STATESEQID_EVENT(preprocess);
+DEFINE_STATESEQID_EVENT(open_confirm);
+
+DECLARE_EVENT_CLASS(nfsd_clientid_class,
+ TP_PROTO(const clientid_t *clid),
+ TP_ARGS(clid),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = clid->cl_boot;
+ __entry->cl_id = clid->cl_id;
+ ),
+ TP_printk("client %08x:%08x", __entry->cl_boot, __entry->cl_id)
+)
+
+#define DEFINE_CLIENTID_EVENT(name) \
+DEFINE_EVENT(nfsd_clientid_class, nfsd_clid_##name, \
+ TP_PROTO(const clientid_t *clid), \
+ TP_ARGS(clid))
+
+DEFINE_CLIENTID_EVENT(expired);
+DEFINE_CLIENTID_EVENT(purged);
+DEFINE_CLIENTID_EVENT(renew);
+DEFINE_CLIENTID_EVENT(stale);
+
+DECLARE_EVENT_CLASS(nfsd_net_class,
+ TP_PROTO(const struct nfsd_net *nn),
+ TP_ARGS(nn),
+ TP_STRUCT__entry(
+ __field(unsigned long long, boot_time)
+ ),
+ TP_fast_assign(
+ __entry->boot_time = nn->boot_time;
+ ),
+ TP_printk("boot_time=%16llx", __entry->boot_time)
+)
+
+#define DEFINE_NET_EVENT(name) \
+DEFINE_EVENT(nfsd_net_class, nfsd_##name, \
+ TP_PROTO(const struct nfsd_net *nn), \
+ TP_ARGS(nn))
+
+DEFINE_NET_EVENT(grace_start);
+DEFINE_NET_EVENT(grace_complete);
+
+TRACE_EVENT(nfsd_clid_inuse_err, /* reports an nfs4 client name that is already in use, with the holder's address and clientid */
+ TP_PROTO(const struct nfs4_client *clp),
+ TP_ARGS(clp),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __field(unsigned int, namelen)
+ __dynamic_array(unsigned char, name, clp->cl_name.len) /* sized to exactly cl_name.len bytes: no room for a terminator */
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ memcpy(__entry->addr, &clp->cl_addr,
+ sizeof(struct sockaddr_in6));
+ __entry->namelen = clp->cl_name.len; /* kept so the print side can bound the name */
+ memcpy(__get_dynamic_array(name), clp->cl_name.data,
+ clp->cl_name.len);
+ ),
+ TP_printk("nfs4_clientid %.*s already in use by %pISpc, client %08x:%08x",
+ __entry->namelen, __get_str(name), __entry->addr, /* %.*s limits the unterminated name to namelen bytes */
+ __entry->cl_boot, __entry->cl_id)
+)
+
+TRACE_DEFINE_ENUM(NFSD_FILE_HASHED);
+TRACE_DEFINE_ENUM(NFSD_FILE_PENDING);
+TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_READ);
+TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_WRITE);
+TRACE_DEFINE_ENUM(NFSD_FILE_REFERENCED);
+
+#define show_nf_flags(val) \
+ __print_flags(val, "|", \
+ { 1 << NFSD_FILE_HASHED, "HASHED" }, \
+ { 1 << NFSD_FILE_PENDING, "PENDING" }, \
+ { 1 << NFSD_FILE_BREAK_READ, "BREAK_READ" }, \
+ { 1 << NFSD_FILE_BREAK_WRITE, "BREAK_WRITE" }, \
+ { 1 << NFSD_FILE_REFERENCED, "REFERENCED"})
+
+/* FIXME: This should probably be fleshed out in the future. */
+#define show_nf_may(val) \
+ __print_flags(val, "|", \
+ { NFSD_MAY_READ, "READ" }, \
+ { NFSD_MAY_WRITE, "WRITE" }, \
+ { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" })
+
+DECLARE_EVENT_CLASS(nfsd_file_class,
+ TP_PROTO(struct nfsd_file *nf),
+ TP_ARGS(nf),
+ TP_STRUCT__entry(
+ __field(unsigned int, nf_hashval)
+ __field(void *, nf_inode)
+ __field(int, nf_ref)
+ __field(unsigned long, nf_flags)
+ __field(unsigned char, nf_may)
+ __field(struct file *, nf_file)
+ ),
+ TP_fast_assign(
+ __entry->nf_hashval = nf->nf_hashval;
+ __entry->nf_inode = nf->nf_inode;
+ __entry->nf_ref = refcount_read(&nf->nf_ref);
+ __entry->nf_flags = nf->nf_flags;
+ __entry->nf_may = nf->nf_may;
+ __entry->nf_file = nf->nf_file;
+ ),
+ TP_printk("hash=0x%x inode=0x%p ref=%d flags=%s may=%s file=%p",
+ __entry->nf_hashval,
+ __entry->nf_inode,
+ __entry->nf_ref,
+ show_nf_flags(__entry->nf_flags),
+ show_nf_may(__entry->nf_may),
+ __entry->nf_file)
+)
+
+#define DEFINE_NFSD_FILE_EVENT(name) \
+DEFINE_EVENT(nfsd_file_class, name, \
+ TP_PROTO(struct nfsd_file *nf), \
+ TP_ARGS(nf))
+
+DEFINE_NFSD_FILE_EVENT(nfsd_file_alloc);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_release_locked);
+
+TRACE_EVENT(nfsd_file_acquire,
+ TP_PROTO(struct svc_rqst *rqstp, unsigned int hash,
+ struct inode *inode, unsigned int may_flags,
+ struct nfsd_file *nf, __be32 status),
+
+ TP_ARGS(rqstp, hash, inode, may_flags, nf, status),
+
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(unsigned int, hash)
+ __field(void *, inode)
+ __field(unsigned int, may_flags)
+ __field(int, nf_ref)
+ __field(unsigned long, nf_flags)
+ __field(unsigned char, nf_may)
+ __field(struct file *, nf_file)
+ __field(u32, status)
+ ),
+
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->hash = hash;
+ __entry->inode = inode;
+ __entry->may_flags = may_flags;
+ __entry->nf_ref = nf ? refcount_read(&nf->nf_ref) : 0;
+ __entry->nf_flags = nf ? nf->nf_flags : 0;
+ __entry->nf_may = nf ? nf->nf_may : 0;
+ __entry->nf_file = nf ? nf->nf_file : NULL;
+ __entry->status = be32_to_cpu(status);
+ ),
+
+ TP_printk("xid=0x%x hash=0x%x inode=0x%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=0x%p status=%u",
+ __entry->xid, __entry->hash, __entry->inode,
+ show_nf_may(__entry->may_flags), __entry->nf_ref,
+ show_nf_flags(__entry->nf_flags),
+ show_nf_may(__entry->nf_may), __entry->nf_file,
+ __entry->status)
+);
+
+DECLARE_EVENT_CLASS(nfsd_file_search_class,
+ TP_PROTO(struct inode *inode, unsigned int hash, int found),
+ TP_ARGS(inode, hash, found),
+ TP_STRUCT__entry(
+ __field(struct inode *, inode)
+ __field(unsigned int, hash)
+ __field(int, found)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->hash = hash;
+ __entry->found = found;
+ ),
+ TP_printk("hash=0x%x inode=0x%p found=%d", __entry->hash,
+ __entry->inode, __entry->found)
+);
+
+#define DEFINE_NFSD_FILE_SEARCH_EVENT(name) \
+DEFINE_EVENT(nfsd_file_search_class, name, \
+ TP_PROTO(struct inode *inode, unsigned int hash, int found), \
+ TP_ARGS(inode, hash, found))
+
+DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync);
+DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode);
+DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_is_cached);
+
+TRACE_EVENT(nfsd_file_fsnotify_handle_event,
+ TP_PROTO(struct inode *inode, u32 mask),
+ TP_ARGS(inode, mask),
+ TP_STRUCT__entry(
+ __field(struct inode *, inode)
+ __field(unsigned int, nlink)
+ __field(umode_t, mode)
+ __field(u32, mask)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->nlink = inode->i_nlink;
+ __entry->mode = inode->i_mode;
+ __entry->mask = mask;
+ ),
+ TP_printk("inode=0x%p nlink=%u mode=0%ho mask=0x%x", __entry->inode,
+ __entry->nlink, __entry->mode, __entry->mask)
+);
+
+#include "cache.h"
+
+TRACE_DEFINE_ENUM(RC_DROPIT);
+TRACE_DEFINE_ENUM(RC_REPLY);
+TRACE_DEFINE_ENUM(RC_DOIT);
+
+#define show_drc_retval(x) \
+ __print_symbolic(x, \
+ { RC_DROPIT, "DROPIT" }, \
+ { RC_REPLY, "REPLY" }, \
+ { RC_DOIT, "DOIT" })
+
+TRACE_EVENT(nfsd_drc_found,
+ TP_PROTO(
+ const struct nfsd_net *nn,
+ const struct svc_rqst *rqstp,
+ int result
+ ),
+ TP_ARGS(nn, rqstp, result),
+ TP_STRUCT__entry(
+ __field(unsigned long long, boot_time)
+ __field(unsigned long, result)
+ __field(u32, xid)
+ ),
+ TP_fast_assign(
+ __entry->boot_time = nn->boot_time;
+ __entry->result = result;
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ ),
+ TP_printk("boot_time=%16llx xid=0x%08x result=%s",
+ __entry->boot_time, __entry->xid,
+ show_drc_retval(__entry->result))
+
+);
+
+TRACE_EVENT(nfsd_drc_mismatch, /* records the xid and both checksums of two svc_cacherep entries */
+ TP_PROTO(
+ const struct nfsd_net *nn,
+ const struct svc_cacherep *key,
+ const struct svc_cacherep *rp
+ ),
+ TP_ARGS(nn, key, rp),
+ TP_STRUCT__entry(
+ __field(unsigned long long, boot_time)
+ __field(u32, xid)
+ __field(u32, cached)
+ __field(u32, ingress)
+ ),
+ TP_fast_assign(
+ __entry->boot_time = nn->boot_time;
+ __entry->xid = be32_to_cpu(key->c_key.k_xid);
+ __entry->cached = (__force u32)key->c_key.k_csum; /* NOTE(review): "cached" is read from key and "ingress" from rp; */
+ __entry->ingress = (__force u32)rp->c_key.k_csum; /* confirm at the call site which argument is the cached entry */
+ ),
+ TP_printk("boot_time=%16llx xid=0x%08x cached-csum=0x%08x ingress-csum=0x%08x",
+ __entry->boot_time, __entry->xid, __entry->cached,
+ __entry->ingress)
+);
+
+TRACE_EVENT(nfsd_cb_args,
+ TP_PROTO(
+ const struct nfs4_client *clp,
+ const struct nfs4_cb_conn *conn
+ ),
+ TP_ARGS(clp, conn),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, prog)
+ __field(u32, ident)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ __entry->prog = conn->cb_prog;
+ __entry->ident = conn->cb_ident;
+ memcpy(__entry->addr, &conn->cb_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("client %08x:%08x callback addr=%pISpc prog=%u ident=%u",
+ __entry->cl_boot, __entry->cl_id,
+ __entry->addr, __entry->prog, __entry->ident)
+);
+
+TRACE_EVENT(nfsd_cb_nodelegs,
+ TP_PROTO(const struct nfs4_client *clp),
+ TP_ARGS(clp),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ ),
+ TP_printk("client %08x:%08x", __entry->cl_boot, __entry->cl_id)
+)
+
+TRACE_DEFINE_ENUM(NFSD4_CB_UP);
+TRACE_DEFINE_ENUM(NFSD4_CB_UNKNOWN);
+TRACE_DEFINE_ENUM(NFSD4_CB_DOWN);
+TRACE_DEFINE_ENUM(NFSD4_CB_FAULT);
+
+#define show_cb_state(val) \
+ __print_symbolic(val, \
+ { NFSD4_CB_UP, "UP" }, \
+ { NFSD4_CB_UNKNOWN, "UNKNOWN" }, \
+ { NFSD4_CB_DOWN, "DOWN" }, \
+ { NFSD4_CB_FAULT, "FAULT"})
+
+DECLARE_EVENT_CLASS(nfsd_cb_class,
+ TP_PROTO(const struct nfs4_client *clp),
+ TP_ARGS(clp),
+ TP_STRUCT__entry(
+ __field(unsigned long, state)
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ __entry->state = clp->cl_cb_state;
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("addr=%pISpc client %08x:%08x state=%s",
+ __entry->addr, __entry->cl_boot, __entry->cl_id,
+ show_cb_state(__entry->state))
+);
+
+#define DEFINE_NFSD_CB_EVENT(name) \
+DEFINE_EVENT(nfsd_cb_class, nfsd_cb_##name, \
+ TP_PROTO(const struct nfs4_client *clp), \
+ TP_ARGS(clp))
+
+DEFINE_NFSD_CB_EVENT(setup);
+DEFINE_NFSD_CB_EVENT(state);
+DEFINE_NFSD_CB_EVENT(shutdown);
+
+TRACE_EVENT(nfsd_cb_setup_err,
+ TP_PROTO(
+ const struct nfs4_client *clp,
+ long error
+ ),
+ TP_ARGS(clp, error),
+ TP_STRUCT__entry(
+ __field(long, error)
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ __entry->error = error;
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("addr=%pISpc client %08x:%08x error=%ld",
+ __entry->addr, __entry->cl_boot, __entry->cl_id, __entry->error)
+);
+
+TRACE_EVENT(nfsd_cb_work, /* logs a callback procedure name with the client id and callback address */
+ TP_PROTO(
+ const struct nfs4_client *clp,
+ const char *procedure
+ ),
+ TP_ARGS(clp, procedure),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __string(procedure, procedure)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ __assign_str(procedure, procedure); /* terminated explicitly, like every other __assign_str() in this file */
+ memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("addr=%pISpc client %08x:%08x procedure=%s",
+ __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_str(procedure))
+);
+
+TRACE_EVENT(nfsd_cb_done,
+ TP_PROTO(
+ const struct nfs4_client *clp,
+ int status
+ ),
+ TP_ARGS(clp, status),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(int, status)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ __entry->status = status;
+ memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("addr=%pISpc client %08x:%08x status=%d",
+ __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __entry->status)
+);
+
+#endif /* _NFSD_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
new file mode 100644
index 000000000..31edb883a
--- /dev/null
+++ b/fs/nfsd/vfs.c
@@ -0,0 +1,2407 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * File operations used by nfsd. Some of these have been ripped from
+ * other parts of the kernel because they weren't exported, others
+ * are partial duplicates with added or changed functionality.
+ *
+ * Note that several functions dget() the dentry upon which they want
+ * to act, most notably those that create directory entries. Response
+ * dentry's are dput()'d if necessary in the release callback.
+ * So if you notice code paths that apparently fail to dput() the
+ * dentry, don't worry--they have been taken care of.
+ *
+ * Copyright (C) 1995-1999 Olaf Kirch <okir@monad.swb.de>
+ * Zerocpy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp>
+ */
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/splice.h>
+#include <linux/falloc.h>
+#include <linux/fcntl.h>
+#include <linux/namei.h>
+#include <linux/delay.h>
+#include <linux/fsnotify.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+#include <linux/jhash.h>
+#include <linux/ima.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/exportfs.h>
+#include <linux/writeback.h>
+#include <linux/security.h>
+
+#ifdef CONFIG_NFSD_V3
+#include "xdr3.h"
+#endif /* CONFIG_NFSD_V3 */
+
+#ifdef CONFIG_NFSD_V4
+#include "../internal.h"
+#include "acl.h"
+#include "idmap.h"
+#endif /* CONFIG_NFSD_V4 */
+
+#include "nfsd.h"
+#include "vfs.h"
+#include "filecache.h"
+#include "trace.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_FILEOP
+
+/*
+ * Called from nfsd_lookup and encode_dirent. Check if we have crossed
+ * a mount point.
+ * Returns -EAGAIN or -ETIMEDOUT leaving *dpp and *expp unchanged,
+ * or nfs_ok having possibly changed *dpp and *expp
+ */
+int
+nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
+ struct svc_export **expp)
+{
+ struct svc_export *exp = *expp, *exp2 = NULL;
+ struct dentry *dentry = *dpp;
+ struct path path = {.mnt = mntget(exp->ex_path.mnt), /* private mnt/dentry references, released with path_put() */
+ .dentry = dget(dentry)};
+ int err = 0;
+
+ err = follow_down(&path);
+ if (err < 0) /* NOTE(review): this exit skips path_put(&path); confirm the references taken above are not leaked */
+ goto out;
+ if (path.mnt == exp->ex_path.mnt && path.dentry == dentry && /* path did not move at all */
+ nfsd_mountpoint(dentry, exp) == 2) {
+ /* This is only a mountpoint in some other namespace */
+ path_put(&path);
+ goto out;
+ }
+
+ exp2 = rqst_exp_get_by_name(rqstp, &path);
+ if (IS_ERR(exp2)) {
+ err = PTR_ERR(exp2);
+ /*
+ * We normally allow NFS clients to continue
+ * "underneath" a mountpoint that is not exported.
+ * The exception is V4ROOT, where no traversal is ever
+ * allowed without an explicit export of the new
+ * directory.
+ */
+ if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT))
+ err = 0; /* success with *dpp and *expp left untouched */
+ path_put(&path);
+ goto out;
+ }
+ if (nfsd_v4client(rqstp) ||
+ (exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
+ /* successfully crossed mount point */
+ /*
+ * This is subtle: path.dentry is *not* on path.mnt
+ * at this point. The only reason we are safe is that
+ * original mnt is pinned down by exp, so we should
+ * put path *before* putting exp
+ */
+ *dpp = path.dentry; /* the new dentry and its reference go to the caller */
+ path.dentry = dentry; /* so path_put() below releases the original dentry */
+ *expp = exp2;
+ exp2 = exp; /* so exp_put() below releases the old export */
+ }
+ path_put(&path);
+ exp_put(exp2); /* not crossed: the export just looked up; crossed: the previous one */
+out:
+ return err;
+}
+
+static void follow_to_parent(struct path *path) /* replace path->dentry with its parent, in place */
+{
+ struct dentry *dp;
+
+ while (path->dentry == path->mnt->mnt_root && follow_up(path)) /* repeat while on a mount root and follow_up() returns nonzero */
+ ;
+ dp = dget_parent(path->dentry);
+ dput(path->dentry); /* drop the child now that dp holds the parent */
+ path->dentry = dp;
+}
+
+static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, struct svc_export **exp, struct dentry **dentryp)
+{ /* resolves ".." from dparent; returns 0 with a referenced *dentryp, or a negative errno */
+ struct svc_export *exp2;
+ struct path path = {.mnt = mntget((*exp)->ex_path.mnt), /* private references, released by path_put() on every exit */
+ .dentry = dget(dparent)};
+
+ follow_to_parent(&path);
+
+ exp2 = rqst_exp_parent(rqstp, &path);
+ if (PTR_ERR(exp2) == -ENOENT) {
+ *dentryp = dget(dparent); /* lookup reported -ENOENT: stay on dparent, *exp unchanged */
+ } else if (IS_ERR(exp2)) {
+ path_put(&path);
+ return PTR_ERR(exp2); /* any other failure is passed up; *exp and *dentryp are untouched */
+ } else {
+ *dentryp = dget(path.dentry); /* the parent reached by follow_to_parent() */
+ exp_put(*exp); /* swap the caller's export reference for the new one */
+ *exp = exp2;
+ }
+ path_put(&path);
+ return 0;
+}
+
+/*
+ * For nfsd purposes, we treat V4ROOT exports as though there was an
+ * export at *every* directory.
+ * We return:
+ * '1' if this dentry *must* be an export point,
+ * '2' if it might be, if there is really a mount here, and
+ * '0' if there is no chance of an export point here.
+ */
+int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
+{
+ if (!d_inode(dentry)) /* no inode behind this dentry: cannot be an export point */
+ return 0;
+ if (exp->ex_flags & NFSEXP_V4ROOT) /* V4ROOT: every directory counts, see the comment above */
+ return 1;
+ if (nfsd4_is_junction(dentry))
+ return 1;
+ if (d_mountpoint(dentry))
+ /*
+ * Might only be a mountpoint in a different namespace,
+ * but we need to check.
+ */
+ return 2;
+ return 0;
+}
+
+__be32
+nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ const char *name, unsigned int len,
+ struct svc_export **exp_ret, struct dentry **dentry_ret)
+{ /* looks up name under fhp's dentry; returns 0 and fills both out-pointers, or an nfs error */
+ struct svc_export *exp;
+ struct dentry *dparent;
+ struct dentry *dentry;
+ int host_err;
+
+ dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
+
+ dparent = fhp->fh_dentry;
+ exp = exp_get(fhp->fh_export); /* own reference: returned through *exp_ret or dropped at out_nfserr */
+
+ /* Lookup the name, but don't follow links */
+ if (isdotent(name, len)) {
+ if (len==1) /* one-character dot entry */
+ dentry = dget(dparent);
+ else if (dparent != exp->ex_path.dentry) /* not at the export's root dentry: plain parent */
+ dentry = dget_parent(dparent);
+ else if (!EX_NOHIDE(exp) && !nfsd_v4client(rqstp))
+ dentry = dget(dparent); /* .. == . just like at / */
+ else {
+ /* checking mountpoint crossing is very different when stepping up */
+ host_err = nfsd_lookup_parent(rqstp, dparent, &exp, &dentry);
+ if (host_err)
+ goto out_nfserr;
+ }
+ } else {
+ /*
+ * In the nfsd4_open() case, this may be held across
+ * subsequent open and delegation acquisition which may
+ * need to take the child's i_mutex:
+ */
+ fh_lock_nested(fhp, I_MUTEX_PARENT);
+ dentry = lookup_one_len(name, dparent, len);
+ host_err = PTR_ERR(dentry); /* only used when IS_ERR(dentry) holds below */
+ if (IS_ERR(dentry))
+ goto out_nfserr; /* fhp is still locked here; this function does not unlock it on this path */
+ if (nfsd_mountpoint(dentry, exp)) {
+ /*
+ * We don't need the i_mutex after all. It's
+ * still possible we could open this (regular
+ * files can be mountpoints too), but the
+ * i_mutex is just there to prevent renames of
+ * something that we might be about to delegate,
+ * and a mountpoint won't be renamed:
+ */
+ fh_unlock(fhp);
+ if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
+ dput(dentry);
+ goto out_nfserr;
+ }
+ }
+ }
+ *dentry_ret = dentry; /* both references now belong to the caller */
+ *exp_ret = exp;
+ return 0;
+
+out_nfserr:
+ exp_put(exp);
+ return nfserrno(host_err); /* host errno converted to an NFS status */
+}
+
+/*
+ * Look up one component of a pathname.
+ * N.B. After this call _both_ fhp and resfh need an fh_put
+ *
+ * If the lookup would cross a mountpoint, and the mounted filesystem
+ * is exported to the client with NFSEXP_NOHIDE, then the lookup is
+ * accepted as it stands and the mounted directory is
+ * returned. Otherwise the covered directory is returned.
+ * NOTE: this mountpoint crossing is not supported properly by all
+ * clients and is explicitly disallowed for NFSv3
+ * NeilBrown <neilb@cse.unsw.edu.au>
+ */
+__be32
+nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
+ unsigned int len, struct svc_fh *resfh)
+{
+ struct svc_export *exp;
+ struct dentry *dentry;
+ __be32 err;
+
+ err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); /* fhp is verified as S_IFDIR with NFSD_MAY_EXEC */
+ if (err)
+ return err;
+ err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry);
+ if (err)
+ return err; /* nothing to release: the helper drops its export reference on failure */
+ err = check_nfsd_access(exp, rqstp);
+ if (err)
+ goto out;
+ /*
+ * Note: we compose the file handle now, but as the
+ * dentry may be negative, it may need to be updated.
+ */
+ err = fh_compose(resfh, exp, dentry, fhp);
+ if (!err && d_really_is_negative(dentry))
+ err = nfserr_noent; /* resfh was composed, but the name does not exist */
+out:
+ dput(dentry); /* release the references handed back by nfsd_lookup_dentry() */
+ exp_put(exp);
+ return err;
+}
+
+/*
+ * Commit metadata changes to stable storage.
+ */
+static int
+commit_inode_metadata(struct inode *inode)
+{
+ const struct export_operations *export_ops = inode->i_sb->s_export_op; /* dereferenced below without a NULL check */
+
+ if (export_ops->commit_metadata) /* prefer the filesystem's own hook when it provides one */
+ return export_ops->commit_metadata(inode);
+ return sync_inode_metadata(inode, 1); /* generic fallback; the 1 presumably asks it to wait - TODO confirm */
+}
+
+static int
+commit_metadata(struct svc_fh *fhp)
+{ /* commits the metadata of fhp's inode, but only when EX_ISSYNC() holds for its export */
+ struct inode *inode = d_inode(fhp->fh_dentry);
+
+ if (!EX_ISSYNC(fhp->fh_export)) /* otherwise report success without committing */
+ return 0;
+ return commit_inode_metadata(inode);
+}
+
+/*
+ * Go over the attributes and take care of the small differences between
+ * NFS semantics and what Linux expects.
+ */
+static void
+nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
+{
+ /* sanitize the mode change */
+ if (iap->ia_valid & ATTR_MODE) {
+ iap->ia_mode &= S_IALLUGO; /* from the request, keep only the bits covered by S_IALLUGO */
+ iap->ia_mode |= (inode->i_mode & ~S_IALLUGO); /* every other mode bit is taken from the inode */
+ }
+
+ /* Revoke setuid/setgid on chown */
+ if (!S_ISDIR(inode->i_mode) &&
+ ((iap->ia_valid & ATTR_UID) || (iap->ia_valid & ATTR_GID))) { /* owner or group change on a non-directory */
+ iap->ia_valid |= ATTR_KILL_PRIV;
+ if (iap->ia_valid & ATTR_MODE) {
+ /* we're setting mode too, just clear the s*id bits */
+ iap->ia_mode &= ~S_ISUID;
+ if (iap->ia_mode & S_IXGRP) /* setgid is cleared only when group-execute is set in the new mode */
+ iap->ia_mode &= ~S_ISGID;
+ } else {
+ /* set ATTR_KILL_* bits and let VFS handle it */
+ iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID);
+ }
+ }
+}
+
+static __be32
+nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct iattr *iap)
+{ /* prepares a size change; on success the caller is left holding write access on the inode */
+ struct inode *inode = d_inode(fhp->fh_dentry);
+ int host_err;
+
+ if (iap->ia_size < inode->i_size) { /* shrinking: needs NFSD_MAY_TRUNC permission first */
+ __be32 err;
+
+ err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
+ NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE);
+ if (err)
+ return err; /* already an NFS status, returned as is */
+ }
+
+ host_err = get_write_access(inode);
+ if (host_err)
+ goto out_nfserrno;
+
+ host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
+ if (host_err)
+ goto out_put_write_access;
+ return 0; /* write access stays held; nfsd_setattr() releases it with put_write_access() */
+
+out_put_write_access:
+ put_write_access(inode);
+out_nfserrno:
+ return nfserrno(host_err);
+}
+
+/*
+ * Set various file attributes. After this call fhp needs an fh_put.
+ */
+__be32
+nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
+ int check_guard, time64_t guardtime)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+ int accmode = NFSD_MAY_SATTR;
+ umode_t ftype = 0;
+ __be32 err;
+ int host_err;
+ bool get_write_count;
+ bool size_change = (iap->ia_valid & ATTR_SIZE); /* latched now: ATTR_SIZE is cleared from iap->ia_valid further down */
+
+ if (iap->ia_valid & ATTR_SIZE) {
+ accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
+ ftype = S_IFREG; /* fh_verify() below is then asked for a regular file */
+ }
+
+ /*
+ * If utimes(2) and friends are called with times not NULL, we should
+ * not set NFSD_MAY_WRITE bit. Otherwise fh_verify->nfsd_permission
+ * will return EACCES, when the caller's effective UID does not match
+ * the owner of the file, and the caller is not privileged. In this
+ * situation, we should return EPERM(notify_change will return this).
+ */
+ if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME)) {
+ accmode |= NFSD_MAY_OWNER_OVERRIDE;
+ if (!(iap->ia_valid & (ATTR_ATIME_SET | ATTR_MTIME_SET)))
+ accmode |= NFSD_MAY_WRITE;
+ }
+
+ /* Callers that do fh_verify should do the fh_want_write: */
+ get_write_count = !fhp->fh_dentry; /* true only when fhp has no dentry yet */
+
+ /* Get inode */
+ err = fh_verify(rqstp, fhp, ftype, accmode);
+ if (err)
+ return err;
+ if (get_write_count) {
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ goto out; /* host_err is nonzero, so the commit at out: is skipped */
+ }
+
+ dentry = fhp->fh_dentry;
+ inode = d_inode(dentry);
+
+ /* Ignore any mode updates on symlinks */
+ if (S_ISLNK(inode->i_mode))
+ iap->ia_valid &= ~ATTR_MODE;
+
+ if (!iap->ia_valid) /* nothing requested, or nothing left after the symlink filter */
+ return 0;
+
+ nfsd_sanitize_attrs(inode, iap);
+
+ if (check_guard && guardtime != inode->i_ctime.tv_sec) /* guard compares against ctime seconds only */
+ return nfserr_notsync;
+
+ /*
+ * The size case is special, it changes the file in addition to the
+ * attributes, and file systems don't expect it to be mixed with
+ * "random" attribute changes. We thus split out the size change
+ * into a separate call to ->setattr, and do the rest as a separate
+ * setattr call.
+ */
+ if (size_change) {
+ err = nfsd_get_write_access(rqstp, fhp, iap);
+ if (err)
+ return err;
+ }
+
+ fh_lock(fhp);
+ if (size_change) {
+ /*
+ * RFC5661, Section 18.30.4:
+ * Changing the size of a file with SETATTR indirectly
+ * changes the time_modify and change attributes.
+ *
+ * (and similar for the older RFCs)
+ */
+ struct iattr size_attr = {
+ .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME,
+ .ia_size = iap->ia_size,
+ };
+
+ host_err = notify_change(dentry, &size_attr, NULL);
+ if (host_err)
+ goto out_unlock;
+ iap->ia_valid &= ~ATTR_SIZE; /* size is done; the second call must not repeat it */
+
+ /*
+ * Avoid the additional setattr call below if the only other
+ * attribute that the client sends is the mtime, as we update
+ * it as part of the size change above.
+ */
+ if ((iap->ia_valid & ~ATTR_MTIME) == 0)
+ goto out_unlock; /* host_err is 0 here, so the commit at out: still runs */
+ }
+
+ iap->ia_valid |= ATTR_CTIME;
+ host_err = notify_change(dentry, iap, NULL);
+
+out_unlock:
+ fh_unlock(fhp);
+ if (size_change)
+ put_write_access(inode); /* pairs with the successful nfsd_get_write_access() above */
+out:
+ if (!host_err)
+ host_err = commit_metadata(fhp); /* commit only when every step so far succeeded */
+ return nfserrno(host_err);
+}
+
+#if defined(CONFIG_NFSD_V4)
+/*
+ * NFS junction information is stored in an extended attribute.
+ */
+#define NFSD_JUNCTION_XATTR_NAME XATTR_TRUSTED_PREFIX "junction.nfs"
+
+/**
+ * nfsd4_is_junction - Test if an object could be an NFS junction
+ *
+ * @dentry: object to test
+ *
+ * An object qualifies when it has an inode, none of its execute bits are
+ * set, its sticky bit is set, and the junction extended attribute exists
+ * with a non-zero length.
+ *
+ * Returns 1 if "dentry" appears to contain NFS junction information.
+ * Otherwise 0 is returned.
+ */
+int nfsd4_is_junction(struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	ssize_t xattr_len;
+
+	if (!inode)
+		return 0;
+	/* Mode test: no execute bit anywhere, sticky bit present. */
+	if ((inode->i_mode & S_IXUGO) || !(inode->i_mode & S_ISVTX))
+		return 0;
+	/* A NULL/0 buffer only probes for the attribute's size. */
+	xattr_len = vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0);
+	return xattr_len > 0 ? 1 : 0;
+}
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+/*
+ * Apply an NFSv4 security label to the object named by @fhp.
+ *
+ * The handle is verified for NFSD_MAY_SATTR, then the label bytes are
+ * handed to security_inode_setsecctx() with the inode lock held.
+ * Returns the fh_verify() status on failure, otherwise the converted
+ * result of the security hook.
+ */
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct xdr_netobj *label)
+{
+	struct inode *inode;
+	__be32 status;
+	int rc;
+
+	status = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
+	if (status)
+		return status;
+
+	inode = d_inode(fhp->fh_dentry);
+
+	inode_lock(inode);
+	rc = security_inode_setsecctx(fhp->fh_dentry, label->data, label->len);
+	inode_unlock(inode);
+
+	return nfserrno(rc);
+}
+#else
+/*
+ * Stub used when CONFIG_NFSD_V4_SECURITY_LABEL is not set: labels cannot
+ * be stored, so every request is answered with nfserr_notsupp.
+ */
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct xdr_netobj *label)
+{
+ return nfserr_notsupp;
+}
+#endif
+
+/*
+ * Clone @count bytes from @nf_src at @src_pos into @nf_dst at @dst_pos.
+ *
+ * A negative result from vfs_clone_file_range() is converted and returned.
+ * A short clone (when @count is non-zero) is reported as -EINVAL.  When
+ * @sync is set the destination range is fsync'ed, the writeback error
+ * cursor sampled before the clone is checked, and inode metadata is
+ * committed; a negative status from that sequence resets the boot verifier
+ * of the destination's net namespace and is returned to the caller.
+ */
+__be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
+		struct nfsd_file *nf_dst, u64 dst_pos, u64 count, bool sync)
+{
+	struct file *src = nf_src->nf_file;
+	struct file *dst = nf_dst->nf_file;
+	errseq_t since = READ_ONCE(dst->f_wb_err);
+	loff_t cloned;
+	loff_t dst_end;
+	int status;
+
+	cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
+	if (cloned < 0)
+		return nfserrno(cloned);
+	if (count && cloned != count)
+		return nfserrno(-EINVAL);
+	if (!sync)
+		return 0;
+
+	/* count == 0 means "to end of file": sync everything from dst_pos. */
+	dst_end = count ? dst_pos + count - 1 : LLONG_MAX;
+	status = vfs_fsync_range(dst, dst_pos, dst_end, 0);
+	if (!status)
+		status = filemap_check_wb_err(dst->f_mapping, since);
+	/*
+	 * NOTE(review): metadata is committed for the *source* inode here,
+	 * although the data was written to dst - confirm this is intended.
+	 */
+	if (!status)
+		status = commit_inode_metadata(file_inode(src));
+	if (status < 0) {
+		nfsd_reset_boot_verifier(net_generic(nf_dst->nf_net,
+						     nfsd_net_id));
+		return nfserrno(status);
+	}
+	return 0;
+}
+
+/*
+ * Copy up to @count bytes between two open files on behalf of a COPY.
+ *
+ * Returns whatever vfs_copy_file_range() returns: the number of bytes
+ * copied or a negative errno.
+ */
+ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
+		u64 dst_pos, u64 count)
+{
+	ssize_t copied;
+
+	/*
+	 * Cap a single copy at 4MB so that one request cannot tie up an
+	 * nfsd thread and a client rpc slot indefinitely.  The figure is
+	 * somewhat arbitrary: it could instead be derived from r/wsize,
+	 * made tunable, be a time limit rather than a byte limit, or be
+	 * replaced by asynchronous copy.  A client could in theory spot
+	 * the limit and pipeline several COPY requests.
+	 */
+	if (count > (1 << 22))
+		count = 1 << 22;
+
+	copied = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+	if (copied != -EOPNOTSUPP && copied != -EXDEV)
+		return copied;
+
+	/* Not supported or cross-device: retry through the splice path. */
+	return vfs_copy_file_range(src, src_pos, dst, dst_pos, count,
+				   COPY_FILE_SPLICE);
+}
+
+/*
+ * Run vfs_fallocate() on @file and, on success, commit the metadata of
+ * @fhp.  Anything other than a regular file is rejected with
+ * nfserr_inval before the VFS is called.
+ */
+__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct file *file, loff_t offset, loff_t len,
+		int flags)
+{
+	int rc;
+
+	if (!S_ISREG(file_inode(file)->i_mode))
+		return nfserr_inval;
+
+	rc = vfs_fallocate(file, flags, offset, len);
+	if (rc)
+		return nfserrno(rc);
+
+	return nfserrno(commit_metadata(fhp));
+}
+#endif /* defined(CONFIG_NFSD_V4) */
+
+#ifdef CONFIG_NFSD_V3
+/*
+ * Check server access rights to a file system object
+ *
+ * Each table below maps a protocol ACCESS bit to the NFSD_MAY_* mask that
+ * nfsd_access() passes to nfsd_permission(). Every table ends with a
+ * { 0, 0 } sentinel; nfsd_access() stops at the first entry whose
+ * "access" field is zero.
+ */
+struct accessmap {
+ u32 access; /* ACCESS bit as sent by the client */
+ int how; /* NFSD_MAY_* mask checked for that bit */
+};
+/* Mapping used when the object is a regular file. */
+static struct accessmap nfs3_regaccess[] = {
+ { NFS3_ACCESS_READ, NFSD_MAY_READ },
+ { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC },
+ { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_TRUNC },
+ { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE },
+
+#ifdef CONFIG_NFSD_V4
+ { NFS4_ACCESS_XAREAD, NFSD_MAY_READ },
+ { NFS4_ACCESS_XAWRITE, NFSD_MAY_WRITE },
+ { NFS4_ACCESS_XALIST, NFSD_MAY_READ },
+#endif
+
+ { 0, 0 }
+};
+
+/* Mapping used when the object is a directory. */
+static struct accessmap nfs3_diraccess[] = {
+ { NFS3_ACCESS_READ, NFSD_MAY_READ },
+ { NFS3_ACCESS_LOOKUP, NFSD_MAY_EXEC },
+ { NFS3_ACCESS_MODIFY, NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC},
+ { NFS3_ACCESS_EXTEND, NFSD_MAY_EXEC|NFSD_MAY_WRITE },
+ { NFS3_ACCESS_DELETE, NFSD_MAY_REMOVE },
+
+#ifdef CONFIG_NFSD_V4
+ { NFS4_ACCESS_XAREAD, NFSD_MAY_READ },
+ { NFS4_ACCESS_XAWRITE, NFSD_MAY_WRITE },
+ { NFS4_ACCESS_XALIST, NFSD_MAY_READ },
+#endif
+
+ { 0, 0 }
+};
+
+/* Mapping used for everything that is neither a regular file nor a directory. */
+static struct accessmap nfs3_anyaccess[] = {
+ /* Some clients - Solaris 2.6 at least - make an access call
+ * to the server to check for access for things like /dev/null
+ * (which really, the server doesn't care about). So
+ * we provide simple access checking for them, looking
+ * mainly at mode bits, and we make sure to ignore read-only
+ * filesystem checks.
+ */
+ { NFS3_ACCESS_READ, NFSD_MAY_READ },
+ { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC },
+ { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS },
+ { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS },
+
+ { 0, 0 }
+};
+
+/*
+ * Answer an ACCESS query for the object named by @fhp.
+ *
+ * On entry *@access holds the bits the client asks about.  On success it
+ * is replaced by the subset that was granted, and *@supported (when
+ * non-NULL) receives the subset of queried bits this server has a mapping
+ * for.  A denial (rofs/acces/perm) just leaves the bit clear; any other
+ * failure from nfsd_permission() aborts the query and is returned, leaving
+ * *@access and *@supported untouched.
+ */
+__be32
+nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported)
+{
+	struct accessmap *entry;
+	struct svc_export *exp;
+	struct dentry *dentry;
+	u32 wanted, granted = 0, known = 0;
+	__be32 error;
+
+	error = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
+	if (error)
+		return error;
+
+	exp = fhp->fh_export;
+	dentry = fhp->fh_dentry;
+
+	/* Pick the translation table that matches the object type. */
+	if (d_is_reg(dentry))
+		entry = nfs3_regaccess;
+	else if (d_is_dir(dentry))
+		entry = nfs3_diraccess;
+	else
+		entry = nfs3_anyaccess;
+
+	wanted = *access;
+	while (entry->access) {
+		__be32 perm;
+
+		if (!(entry->access & wanted)) {
+			entry++;
+			continue;
+		}
+
+		known |= entry->access;
+
+		perm = nfsd_permission(rqstp, exp, dentry, entry->how);
+		switch (perm) {
+		case nfs_ok:
+			granted |= entry->access;
+			break;
+
+		/* These only say "not allowed"; they are not failures of
+		 * the query itself, so the bit simply stays clear. */
+		case nfserr_rofs:
+		case nfserr_acces:
+		case nfserr_perm:
+			break;
+		default:
+			return perm;
+		}
+		entry++;
+	}
+
+	*access = granted;
+	if (supported)
+		*supported = known;
+
+	return error;
+}
+#endif /* CONFIG_NFSD_V3 */
+
+/*
+ * Break any lease on @inode that conflicts with the requested @access,
+ * without blocking.  Nothing is done (and 0 is returned) when the caller
+ * passed NFSD_MAY_NOT_BREAK_LEASE.
+ */
+int nfsd_open_break_lease(struct inode *inode, int access)
+{
+	unsigned int mode = O_RDONLY;
+
+	if (access & NFSD_MAY_NOT_BREAK_LEASE)
+		return 0;
+
+	if (access & NFSD_MAY_WRITE)
+		mode = O_WRONLY;
+
+	return break_lease(inode, mode | O_NONBLOCK);
+}
+
+/*
+ * Open an existing file or directory.
+ * The may_flags argument indicates the type of open (read/write/lock)
+ * and additional flags.
+ * N.B. After this call fhp needs an fh_put
+ *
+ * This helper does not call fh_verify(); fhp->fh_export and
+ * fhp->fh_dentry are dereferenced directly. The rqstp and type
+ * arguments are not used in the body. On success the opened file is
+ * stored in *filp; on failure *filp is not written.
+ */
+static __be32
+__nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
+ int may_flags, struct file **filp)
+{
+ struct path path;
+ struct inode *inode;
+ struct file *file;
+ int flags = O_RDONLY|O_LARGEFILE;
+ __be32 err;
+ int host_err = 0;
+
+ path.mnt = fhp->fh_export->ex_path.mnt;
+ path.dentry = fhp->fh_dentry;
+ inode = d_inode(path.dentry);
+
+ /* Disallow write access to files with the append-only bit set
+ * or any access when mandatory locking enabled
+ */
+ /* err is preset so each "goto out" below returns nfserr_perm. */
+ err = nfserr_perm;
+ if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE))
+ goto out;
+ /*
+ * We must ignore files (but only files) which might have mandatory
+ * locks on them because there is no way to know if the accesser has
+ * the lock.
+ */
+ if (S_ISREG((inode)->i_mode) && mandatory_lock(inode))
+ goto out;
+
+ /* No file operations at all: nothing can be opened. */
+ if (!inode->i_fop)
+ goto out;
+
+ host_err = nfsd_open_break_lease(inode, may_flags);
+ if (host_err) /* NOMEM or WOULDBLOCK */
+ goto out_nfserr;
+
+ /* Translate the NFSD_MAY_* access bits into open(2)-style flags. */
+ if (may_flags & NFSD_MAY_WRITE) {
+ if (may_flags & NFSD_MAY_READ)
+ flags = O_RDWR|O_LARGEFILE;
+ else
+ flags = O_WRONLY|O_LARGEFILE;
+ }
+
+ file = dentry_open(&path, flags, current_cred());
+ if (IS_ERR(file)) {
+ host_err = PTR_ERR(file);
+ goto out_nfserr;
+ }
+
+ host_err = ima_file_check(file, may_flags);
+ if (host_err) {
+ /* Drop the reference obtained from dentry_open(). */
+ fput(file);
+ goto out_nfserr;
+ }
+
+ /* Record which directory-cookie hash width was requested. */
+ if (may_flags & NFSD_MAY_64BIT_COOKIE)
+ file->f_mode |= FMODE_64BITHASH;
+ else
+ file->f_mode |= FMODE_32BITHASH;
+
+ *filp = file;
+ /* Success falls through with host_err == 0. */
+out_nfserr:
+ err = nfserrno(host_err);
+out:
+ return err;
+}
+
+/*
+ * Verify @fhp for the requested access and, if that succeeds, open it.
+ * The opened file is returned through @filp.
+ */
+__be32
+nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
+		int may_flags, struct file **filp)
+{
+	__be32 status;
+
+	validate_process_creds();
+
+	/*
+	 * Reaching this point means the client already performed an
+	 * "open" and (hopefully) had its permission checked, so grant
+	 * OWNER_OVERRIDE in case a later chmod revoked that permission.
+	 *
+	 * The same could arguably apply to directories, but it never
+	 * has and nobody seems to have been hurt by that.  Should it
+	 * ever change, our filldir callbacks would additionally need a
+	 * lookup_one_len variant that skips the permission check.
+	 */
+	if (type == S_IFREG)
+		may_flags |= NFSD_MAY_OWNER_OVERRIDE;
+
+	status = fh_verify(rqstp, fhp, type, may_flags);
+	if (status)
+		goto done;
+
+	status = __nfsd_open(rqstp, fhp, type, may_flags, filp);
+done:
+	validate_process_creds();
+	return status;
+}
+
+/*
+ * Open a file handle without calling fh_verify() first: __nfsd_open() is
+ * invoked directly, bracketed by validate_process_creds(). Unlike
+ * nfsd_open(), may_flags is passed through unmodified.
+ */
+__be32
+nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
+ int may_flags, struct file **filp)
+{
+ __be32 err;
+
+ validate_process_creds();
+ err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
+ validate_process_creds();
+ return err;
+}
+
+/*
+ * Grab and keep cached pages associated with a file in the svc_rqst
+ * so that they can be passed to the network sendmsg/sendpage routines
+ * directly. They will be released after the sending has completed.
+ *
+ * Called once per pipe buffer; sd->u.data carries the svc_rqst and
+ * sd->len the number of bytes in this buffer. Returns sd->len.
+ */
+static int
+nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+ struct splice_desc *sd)
+{
+ struct svc_rqst *rqstp = sd->u.data;
+ /* Snapshot of the next free slot; pp[-1] is the slot before it. */
+ struct page **pp = rqstp->rq_next_page;
+ struct page *page = buf->page;
+ size_t size;
+
+ size = sd->len;
+
+ if (rqstp->rq_res.page_len == 0) {
+ /*
+ * No page data in the reply yet: take a reference on the new
+ * page, drop the page currently in the slot, install the new
+ * one and record where the data starts inside it.
+ */
+ get_page(page);
+ put_page(*rqstp->rq_next_page);
+ *(rqstp->rq_next_page++) = page;
+ rqstp->rq_res.page_base = buf->offset;
+ rqstp->rq_res.page_len = size;
+ } else if (page != pp[-1]) {
+ /*
+ * A page different from the one in the previous slot: install
+ * it in the next slot, releasing the old occupant if any.
+ */
+ get_page(page);
+ if (*rqstp->rq_next_page)
+ put_page(*rqstp->rq_next_page);
+ *(rqstp->rq_next_page++) = page;
+ rqstp->rq_res.page_len += size;
+ } else
+ /* Same page as the previous slot: only the length grows. */
+ rqstp->rq_res.page_len += size;
+
+ return size;
+}
+
+/*
+ * Callback handed to splice_direct_to_actor(): drains the pipe through
+ * __splice_from_pipe(), using nfsd_splice_actor() for each buffer.
+ */
+static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
+ struct splice_desc *sd)
+{
+ return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
+}
+
+/*
+ * Decide whether a read that started at @offset and returned @len bytes
+ * (out of @expected requested) reached end of file.  Returns 1 for EOF,
+ * 0 otherwise.
+ */
+static u32 nfsd_eof_on_read(struct file *file, loff_t offset, ssize_t len,
+		size_t expected)
+{
+	/* Data was asked for but nothing came back. */
+	bool empty_read = expected != 0 && len == 0;
+
+	/* Otherwise EOF means the read ended at or past the file size. */
+	return empty_read || offset + len >= i_size_read(file_inode(file));
+}
+
+/*
+ * Common tail of the read paths.  @host_err is the byte count or negative
+ * errno produced by the VFS read.  On success the read statistics are
+ * updated, *@eof is computed against the requested *@count, *@count is
+ * replaced by the number of bytes read, and 0 is returned.  On failure
+ * the errno is traced and returned as an NFS status.
+ */
+static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct file *file, loff_t offset,
+		unsigned long *count, u32 *eof, ssize_t host_err)
+{
+	if (host_err < 0) {
+		trace_nfsd_read_err(rqstp, fhp, offset, host_err);
+		return nfserrno(host_err);
+	}
+
+	nfsdstats.io_read += host_err;
+	/* *count still holds the requested size at this point. */
+	*eof = nfsd_eof_on_read(file, offset, host_err, *count);
+	*count = host_err;
+	fsnotify_access(file);
+	trace_nfsd_read_io_done(rqstp, fhp, offset, *count);
+	return 0;
+}
+
+/*
+ * Read up to *count bytes at offset by splicing the file's pages into
+ * the reply held by rqstp. The result (byte count, eof flag, status) is
+ * produced by nfsd_finish_read().
+ */
+__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct file *file, loff_t offset, unsigned long *count,
+ u32 *eof)
+{
+ struct splice_desc sd = {
+ .len = 0,
+ .total_len = *count,
+ .pos = offset,
+ .u.data = rqstp, /* read back by nfsd_splice_actor() */
+ };
+ ssize_t host_err;
+
+ trace_nfsd_read_splice(rqstp, fhp, offset, *count);
+ /* Spliced pages are stored starting at the second response page slot. */
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+ host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
+ return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
+}
+
+/*
+ * Read up to *count bytes at offset into the vlen kvecs in vec using
+ * vfs_iter_read(). The result (byte count, eof flag, status) is
+ * produced by nfsd_finish_read().
+ */
+__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct file *file, loff_t offset,
+ struct kvec *vec, int vlen, unsigned long *count,
+ u32 *eof)
+{
+ struct iov_iter iter;
+ /* Local copy of the position, so "offset" is still the start below. */
+ loff_t ppos = offset;
+ ssize_t host_err;
+
+ trace_nfsd_read_vector(rqstp, fhp, offset, *count);
+ iov_iter_kvec(&iter, READ, vec, vlen, *count);
+ host_err = vfs_iter_read(file, &iter, &ppos, 0);
+ return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
+}
+
+/*
+ * Gathered writes: If another process is currently writing to the file,
+ * there's a high chance this is another nfsd (triggered by a bulk write
+ * from a client's biod). Rather than syncing the file with each write
+ * request, we sleep for 10 msec.
+ *
+ * I don't know if this roughly approximates C. Juszak's idea of
+ * gathered writes, but it's a nice and simple solution (IMHO), and it
+ * seems to work:-)
+ *
+ * Note: we do this only in the NFSv2 case, since v3 and higher have a
+ * better tool (separate unstable writes and commits) for solving this
+ * problem.
+ *
+ * Returns 0, or the result of vfs_fsync() when the inode was dirty.
+ */
+static int wait_for_concurrent_writes(struct file *file)
+{
+ struct inode *inode = file_inode(file);
+ /*
+ * Inode and device of the previous call, kept across calls.
+ * NOTE(review): these statics are read and written without any
+ * locking visible here; presumably acceptable for a heuristic - confirm.
+ */
+ static ino_t last_ino;
+ static dev_t last_dev;
+ int err = 0;
+
+ /* Defer if the file has other writers or was the last file written. */
+ if (atomic_read(&inode->i_writecount) > 1
+ || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
+ dprintk("nfsd: write defer %d\n", task_pid_nr(current));
+ msleep(10);
+ dprintk("nfsd: write resume %d\n", task_pid_nr(current));
+ }
+
+ if (inode->i_state & I_DIRTY) {
+ dprintk("nfsd: write sync %d\n", task_pid_nr(current));
+ err = vfs_fsync(file, 0);
+ }
+ last_ino = inode->i_ino;
+ last_dev = inode->i_sb->s_dev;
+ return err;
+}
+
+/*
+ * Write the @vlen kvecs in @vec to an already opened file at @offset.
+ *
+ * @rqstp:  request being served (selects the net namespace and, for local
+ *          requests, enables PF_LOCAL_THROTTLE for the duration)
+ * @fhp:    file handle; its export decides sync and write-gathering policy
+ * @nf:     opened file to write to
+ * @cnt:    in: bytes to write; out: bytes actually written (only updated
+ *          when the write itself succeeded)
+ * @stable: requested stability; forced to NFS_UNSTABLE when the export is
+ *          not sync
+ * @verf:   if non-NULL, receives the current boot verifier before the
+ *          write is issued
+ *
+ * A failed write resets the boot verifier, as does a failed flush in the
+ * NFSv2 write-gathering case.  Returns nfs_ok or the converted host
+ * errno.
+ */
+__be32
+nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
+				loff_t offset, struct kvec *vec, int vlen,
+				unsigned long *cnt, int stable,
+				__be32 *verf)
+{
+	struct file		*file = nf->nf_file;
+	struct svc_export	*exp;
+	struct iov_iter		iter;
+	errseq_t		since;
+	__be32			nfserr;
+	int			host_err;
+	int			use_wgather;
+	loff_t			pos = offset;
+	unsigned int		pflags = current->flags;
+	rwf_t			flags = 0;
+
+	trace_nfsd_write_opened(rqstp, fhp, offset, *cnt);
+
+	if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
+		/*
+		 * We want throttling in balance_dirty_pages()
+		 * and shrink_inactive_list() to only consider
+		 * the backingdev we are writing to, so that nfs to
+		 * localhost doesn't cause nfsd to lock up due to all
+		 * the client's dirty pages or its congested queue.
+		 */
+		current->flags |= PF_LOCAL_THROTTLE;
+
+	exp = fhp->fh_export;
+	use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
+
+	if (!EX_ISSYNC(exp))
+		stable = NFS_UNSTABLE;
+
+	/* With write gathering the flush is done separately further down. */
+	if (stable && !use_wgather)
+		flags |= RWF_SYNC;
+
+	iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt);
+	/* Sample the writeback error cursor before issuing the write. */
+	since = READ_ONCE(file->f_wb_err);
+
+	/*
+	 * The sync and non-sync cases are handled identically: hand out the
+	 * verifier, issue the write, and reset the verifier exactly once if
+	 * the write failed.
+	 */
+	if (verf)
+		nfsd_copy_boot_verifier(verf,
+				net_generic(SVC_NET(rqstp),
+				nfsd_net_id));
+	host_err = vfs_iter_write(file, &iter, &pos, flags);
+	if (host_err < 0) {
+		nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
+					 nfsd_net_id));
+		goto out_nfserr;
+	}
+	*cnt = host_err;
+	nfsdstats.io_write += *cnt;
+	fsnotify_modify(file);
+	/* Report writeback errors recorded since the sample taken above. */
+	host_err = filemap_check_wb_err(file->f_mapping, since);
+	if (host_err < 0)
+		goto out_nfserr;
+
+	if (stable && use_wgather) {
+		host_err = wait_for_concurrent_writes(file);
+		if (host_err < 0)
+			nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
+						 nfsd_net_id));
+	}
+
+out_nfserr:
+	if (host_err >= 0) {
+		trace_nfsd_write_io_done(rqstp, fhp, offset, *cnt);
+		nfserr = nfs_ok;
+	} else {
+		trace_nfsd_write_err(rqstp, fhp, offset, host_err);
+		nfserr = nfserrno(host_err);
+	}
+	/* Undo the PF_LOCAL_THROTTLE change made on entry. */
+	if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
+		current_restore_flags(pflags, PF_LOCAL_THROTTLE);
+	return nfserr;
+}
+
+/*
+ * Read data from a file.  On entry *count is the number of bytes wanted;
+ * on return it is the number of bytes actually read.  The splice path is
+ * used when the file supports it and the request allows it, otherwise the
+ * data is read into the supplied kvecs.
+ * N.B. After this call fhp needs an fh_put
+ */
+__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+	loff_t offset, struct kvec *vec, int vlen, unsigned long *count,
+	u32 *eof)
+{
+	struct nfsd_file *nf;
+	struct file *file;
+	bool use_splice;
+	__be32 status;
+
+	trace_nfsd_read_start(rqstp, fhp, offset, *count);
+
+	status = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
+	if (status)
+		return status;
+
+	file = nf->nf_file;
+	use_splice = file->f_op->splice_read &&
+		     test_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
+
+	if (use_splice)
+		status = nfsd_splice_read(rqstp, fhp, file, offset, count,
+					  eof);
+	else
+		status = nfsd_readv(rqstp, fhp, file, offset, vec, vlen,
+				    count, eof);
+
+	nfsd_file_put(nf);
+
+	trace_nfsd_read_done(rqstp, fhp, offset, *count);
+
+	return status;
+}
+
+/*
+ * Write data to a file: acquire an open file for @fhp and pass the
+ * request on to nfsd_vfs_write().
+ * The stable flag requests synchronous writes.
+ * The "done" tracepoint fires whether or not the file could be acquired.
+ * N.B. After this call fhp needs an fh_put
+ */
+__be32
+nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
+	   struct kvec *vec, int vlen, unsigned long *cnt, int stable,
+	   __be32 *verf)
+{
+	struct nfsd_file *nf;
+	__be32 status;
+
+	trace_nfsd_write_start(rqstp, fhp, offset, *cnt);
+
+	status = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf);
+	if (!status) {
+		status = nfsd_vfs_write(rqstp, fhp, nf, offset, vec,
+					vlen, cnt, stable, verf);
+		nfsd_file_put(nf);
+	}
+
+	trace_nfsd_write_done(rqstp, fhp, offset, *cnt);
+	return status;
+}
+
+#ifdef CONFIG_NFSD_V3
+/*
+ * Commit all pending writes to stable storage.
+ *
+ * Note: we only guarantee that data that lies within the range specified
+ * by the 'offset' and 'count' parameters will be synced.
+ *
+ * Unfortunately we cannot lock the file to make sure we return full WCC
+ * data to the client, as locking happens lower down in the filesystem.
+ *
+ * A count of zero syncs from offset to LLONG_MAX. A negative offset, or
+ * a range whose end wraps below offset, is rejected with nfserr_inval.
+ * On success the boot verifier is copied into verf.
+ */
+__be32
+nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ loff_t offset, unsigned long count, __be32 *verf)
+{
+ struct nfsd_file *nf;
+ loff_t end = LLONG_MAX;
+ __be32 err = nfserr_inval;
+
+ if (offset < 0)
+ goto out;
+ if (count != 0) {
+ /*
+ * NOTE(review): the "end < offset" test relies on the signed
+ * addition wrapping on overflow - confirm the build
+ * guarantees that.
+ */
+ end = offset + (loff_t)count - 1;
+ if (end < offset)
+ goto out;
+ }
+
+ err = nfsd_file_acquire(rqstp, fhp,
+ NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf);
+ if (err)
+ goto out;
+ if (EX_ISSYNC(fhp->fh_export)) {
+ /* Sample the writeback error cursor before syncing. */
+ errseq_t since = READ_ONCE(nf->nf_file->f_wb_err);
+ int err2;
+
+ err2 = vfs_fsync_range(nf->nf_file, offset, end, 0);
+ switch (err2) {
+ case 0:
+ nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net,
+ nfsd_net_id));
+ err2 = filemap_check_wb_err(nf->nf_file->f_mapping,
+ since);
+ err = nfserrno(err2);
+ break;
+ case -EINVAL:
+ /* -EINVAL from fsync is reported as "not supported". */
+ err = nfserr_notsupp;
+ break;
+ default:
+ nfsd_reset_boot_verifier(net_generic(nf->nf_net,
+ nfsd_net_id));
+ err = nfserrno(err2);
+ }
+ } else
+ /* Export is not sync: nothing is flushed, only the verifier is returned. */
+ nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net,
+ nfsd_net_id));
+
+ nfsd_file_put(nf);
+out:
+ return err;
+}
+#endif /* CONFIG_NFSD_V3 */
+
+/*
+ * Apply the remaining attributes from @iap to a freshly created object,
+ * or just commit its metadata when nothing is left to set.
+ */
+static __be32
+nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
+			struct iattr *iap)
+{
+	/* The create call itself already applied the mode. */
+	iap->ia_valid &= ~ATTR_MODE;
+
+	/*
+	 * Only root may set uid/gid.  Irix appears to pass a gid along
+	 * with create when it tries to implement setgid directories
+	 * over NFS, so silently drop both for everyone else.
+	 */
+	if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID))
+		iap->ia_valid &= ~(ATTR_UID|ATTR_GID);
+
+	/* Callers expect file metadata to be committed here */
+	if (!iap->ia_valid)
+		return nfserrno(commit_metadata(resfhp));
+
+	return nfsd_setattr(rqstp, resfhp, iap, 0, (time64_t)0);
+}
+
+/*
+ * The HPUX client sometimes creates a file with mode 000 and sets its size
+ * to 0.  On some file systems that resize fails the permission check,
+ * which wants WRITE permission that mode 000 does not give.  A file that
+ * was just created already has size 0, so a request to resize it to 0 is
+ * simply dropped.
+ *
+ * Call this only after vfs_create() has been called.
+ */
+static void
+nfsd_check_ignore_resizing(struct iattr *iap)
+{
+	if (!(iap->ia_valid & ATTR_SIZE))
+		return;
+	if (iap->ia_size != 0)
+		return;
+
+	iap->ia_valid &= ~ATTR_SIZE;
+}
+
+/*
+ * Create the object described by resfhp->fh_dentry inside the directory
+ * fhp, which must already be verified and locked by the caller.
+ *
+ * The parent directory should already be locked: if fhp->fh_locked is
+ * not set the call warns once and fails with nfserr_io.
+ *
+ * type selects what is created (S_IFREG, S_IFDIR, S_IFCHR, S_IFBLK,
+ * S_IFIFO or S_IFSOCK); rdev is only passed on for the vfs_mknod() cases.
+ * The fname and flen arguments are not used in the body.
+ */
+__be32
+nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ char *fname, int flen, struct iattr *iap,
+ int type, dev_t rdev, struct svc_fh *resfhp)
+{
+ struct dentry *dentry, *dchild;
+ struct inode *dirp;
+ __be32 err;
+ __be32 err2;
+ int host_err;
+
+ dentry = fhp->fh_dentry;
+ dirp = d_inode(dentry);
+
+ /* Extra reference on the child; dropped by the dput() at "out". */
+ dchild = dget(resfhp->fh_dentry);
+ if (!fhp->fh_locked) {
+ WARN_ONCE(1, "nfsd_create: parent %pd2 not locked!\n",
+ dentry);
+ err = nfserr_io;
+ goto out;
+ }
+
+ err = nfsd_permission(rqstp, fhp->fh_export, dentry, NFSD_MAY_CREATE);
+ if (err)
+ goto out;
+
+ /* No mode supplied means mode 0; the type bits are always forced in. */
+ if (!(iap->ia_valid & ATTR_MODE))
+ iap->ia_mode = 0;
+ iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type;
+
+ /* The umask is applied here only when the directory has no POSIX ACLs. */
+ if (!IS_POSIXACL(dirp))
+ iap->ia_mode &= ~current_umask();
+
+ err = 0;
+ host_err = 0;
+ switch (type) {
+ case S_IFREG:
+ host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
+ if (!host_err)
+ nfsd_check_ignore_resizing(iap);
+ break;
+ case S_IFDIR:
+ host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
+ /*
+ * mkdir succeeded but left our dentry unhashed: look the name
+ * up again and switch resfhp over to the dentry that is found.
+ */
+ if (!host_err && unlikely(d_unhashed(dchild))) {
+ struct dentry *d;
+ d = lookup_one_len(dchild->d_name.name,
+ dchild->d_parent,
+ dchild->d_name.len);
+ if (IS_ERR(d)) {
+ host_err = PTR_ERR(d);
+ break;
+ }
+ if (unlikely(d_is_negative(d))) {
+ dput(d);
+ err = nfserr_serverfault;
+ goto out;
+ }
+ dput(resfhp->fh_dentry);
+ resfhp->fh_dentry = dget(d);
+ err = fh_update(resfhp);
+ /* d replaces dchild as the reference released at "out". */
+ dput(dchild);
+ dchild = d;
+ if (err)
+ goto out;
+ }
+ break;
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
+ host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
+ break;
+ default:
+ printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
+ type);
+ host_err = -EINVAL;
+ }
+ if (host_err < 0)
+ goto out_nfserr;
+
+ err = nfsd_create_setattr(rqstp, resfhp, iap);
+
+ /*
+ * nfsd_create_setattr already committed the child. Transactional
+ * filesystems had a chance to commit changes for both parent and
+ * child simultaneously making the following commit_metadata a
+ * noop.
+ */
+ /* A failure to commit the parent overrides the setattr status. */
+ err2 = nfserrno(commit_metadata(fhp));
+ if (err2)
+ err = err2;
+ /*
+ * Update the file handle to get the new inode info.
+ */
+ if (!err)
+ err = fh_update(resfhp);
+out:
+ dput(dchild);
+ return err;
+
+out_nfserr:
+ err = nfserrno(host_err);
+ goto out;
+}
+
+/*
+ * Create a filesystem object (regular, directory, special).
+ * Note that the parent directory is left locked.
+ *
+ * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
+ *
+ * The names "." and ".." are rejected with nfserr_exist. After the
+ * directory has been verified, a write count taken and the directory
+ * locked, the name is looked up, resfhp is composed from the resulting
+ * dentry and the actual creation is delegated to nfsd_create_locked().
+ */
+__be32
+nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ char *fname, int flen, struct iattr *iap,
+ int type, dev_t rdev, struct svc_fh *resfhp)
+{
+ struct dentry *dentry, *dchild = NULL;
+ __be32 err;
+ int host_err;
+
+ if (isdotent(fname, flen))
+ return nfserr_exist;
+
+ err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_NOP);
+ if (err)
+ return err;
+
+ dentry = fhp->fh_dentry;
+
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ return nfserrno(host_err);
+
+ fh_lock_nested(fhp, I_MUTEX_PARENT);
+ /*
+ * From here on every return leaves the directory locked and the
+ * write count held; presumably both are released by the caller's
+ * fh_put (see the header comment) - confirm.
+ */
+ dchild = lookup_one_len(fname, dentry, flen);
+ host_err = PTR_ERR(dchild);
+ if (IS_ERR(dchild))
+ return nfserrno(host_err);
+ err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
+ /*
+ * We unconditionally drop our ref to dchild as fh_compose will have
+ * already grabbed its own ref for it.
+ */
+ dput(dchild);
+ if (err)
+ return err;
+ return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type,
+ rdev, resfhp);
+}
+
+#ifdef CONFIG_NFSD_V3
+
+/*
+ * NFSv3 and NFSv4 version of nfsd_create
+ *
+ * Creates (or, depending on createmode, re-uses) a regular file called
+ * fname in directory fhp and composes resfhp for it.
+ *
+ * createmode: one of the NFS3_CREATE_* / NFS4_CREATE_EXCLUSIVE4_1 values
+ * handled in the switch below.
+ * verifier: two 32-bit words; only read for exclusive creates, where they
+ * are stored in (and compared against) the file's mtime/atime.
+ * truncp: optional out-parameter, written only for an UNCHECKED create
+ * that finds an existing regular file.
+ * created: optional out-parameter, set to true when a file was created
+ * or an exclusive create matched an existing file's verifier.
+ */
+__be32
+do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ char *fname, int flen, struct iattr *iap,
+ struct svc_fh *resfhp, int createmode, u32 *verifier,
+ bool *truncp, bool *created)
+{
+ struct dentry *dentry, *dchild = NULL;
+ struct inode *dirp;
+ __be32 err;
+ int host_err;
+ __u32 v_mtime=0, v_atime=0;
+
+ /* Empty names are refused with nfserr_perm, "." and ".." with nfserr_exist. */
+ err = nfserr_perm;
+ if (!flen)
+ goto out;
+ err = nfserr_exist;
+ if (isdotent(fname, flen))
+ goto out;
+ if (!(iap->ia_valid & ATTR_MODE))
+ iap->ia_mode = 0;
+ err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
+ if (err)
+ goto out;
+
+ dentry = fhp->fh_dentry;
+ dirp = d_inode(dentry);
+
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ goto out_nfserr;
+
+ fh_lock_nested(fhp, I_MUTEX_PARENT);
+
+ /*
+ * Compose the response file handle.
+ */
+ dchild = lookup_one_len(fname, dentry, flen);
+ host_err = PTR_ERR(dchild);
+ if (IS_ERR(dchild))
+ goto out_nfserr;
+
+ /* If file doesn't exist, check for permissions to create one */
+ if (d_really_is_negative(dchild)) {
+ err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+ if (err)
+ goto out;
+ }
+
+ err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
+ if (err)
+ goto out;
+
+ if (nfsd_create_is_exclusive(createmode)) {
+ /* solaris7 gets confused (bugid 4218508) if these have
+ * the high bit set, so just clear the high bits. If this is
+ * ever changed to use different attrs for storing the
+ * verifier, then do_open_lookup() will also need to be fixed
+ * accordingly.
+ */
+ v_mtime = verifier[0]&0x7fffffff;
+ v_atime = verifier[1]&0x7fffffff;
+ }
+
+ /* The name already exists: what happens depends on createmode. */
+ if (d_really_is_positive(dchild)) {
+ err = 0;
+
+ switch (createmode) {
+ case NFS3_CREATE_UNCHECKED:
+ if (! d_is_reg(dchild))
+ goto out;
+ else if (truncp) {
+ /* in nfsv4, we need to treat this case a little
+ * differently. we don't want to truncate the
+ * file now; this would be wrong if the OPEN
+ * fails for some other reason. furthermore,
+ * if the size is nonzero, we should ignore it
+ * according to spec!
+ */
+ *truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size;
+ }
+ else {
+ /* Keep only the size attribute (if it was set at all). */
+ iap->ia_valid &= ATTR_SIZE;
+ goto set_attr;
+ }
+ break;
+ case NFS3_CREATE_EXCLUSIVE:
+ /* Same verifier and still empty: treat as a replay of our own create. */
+ if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime
+ && d_inode(dchild)->i_atime.tv_sec == v_atime
+ && d_inode(dchild)->i_size == 0 ) {
+ if (created)
+ *created = true;
+ break;
+ }
+ fallthrough;
+ case NFS4_CREATE_EXCLUSIVE4_1:
+ /* As above, but the supplied attributes are applied as well. */
+ if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime
+ && d_inode(dchild)->i_atime.tv_sec == v_atime
+ && d_inode(dchild)->i_size == 0 ) {
+ if (created)
+ *created = true;
+ goto set_attr;
+ }
+ fallthrough;
+ case NFS3_CREATE_GUARDED:
+ err = nfserr_exist;
+ }
+ /*
+ * NOTE(review): fh_drop_write() is called here and again at
+ * "out"; presumably a second call is harmless - confirm.
+ */
+ fh_drop_write(fhp);
+ goto out;
+ }
+
+ /* The umask is applied here only when the directory has no POSIX ACLs. */
+ if (!IS_POSIXACL(dirp))
+ iap->ia_mode &= ~current_umask();
+
+ host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
+ if (host_err < 0) {
+ fh_drop_write(fhp);
+ goto out_nfserr;
+ }
+ if (created)
+ *created = true;
+
+ nfsd_check_ignore_resizing(iap);
+
+ if (nfsd_create_is_exclusive(createmode)) {
+ /* Cram the verifier into atime/mtime */
+ iap->ia_valid = ATTR_MTIME|ATTR_ATIME
+ | ATTR_MTIME_SET|ATTR_ATIME_SET;
+ /* XXX someone who knows this better please fix it for nsec */
+ iap->ia_mtime.tv_sec = v_mtime;
+ iap->ia_atime.tv_sec = v_atime;
+ iap->ia_mtime.tv_nsec = 0;
+ iap->ia_atime.tv_nsec = 0;
+ }
+
+ set_attr:
+ err = nfsd_create_setattr(rqstp, resfhp, iap);
+
+ /*
+ * nfsd_create_setattr already committed the child
+ * (and possibly also the parent).
+ */
+ if (!err)
+ err = nfserrno(commit_metadata(fhp));
+
+ /*
+ * Update the filehandle to get the new inode info.
+ */
+ if (!err)
+ err = fh_update(resfhp);
+
+ out:
+ /* Common exit: unlock the directory, drop our child ref and the write count. */
+ fh_unlock(fhp);
+ if (dchild && !IS_ERR(dchild))
+ dput(dchild);
+ fh_drop_write(fhp);
+ return err;
+
+ out_nfserr:
+ err = nfserrno(host_err);
+ goto out;
+}
+#endif /* CONFIG_NFSD_V3 */
+
+/*
+ * Read a symlink.  On entry *lenp is the largest path length that fits
+ * in @buf; on return it has been lowered to the link's true length when
+ * that is shorter.  At most *lenp bytes are copied and no terminating
+ * NUL is added.
+ * N.B. After this call fhp needs an fh_put
+ */
+__be32
+nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
+{
+	DEFINE_DELAYED_CALL(done);
+	struct path path;
+	const char *target;
+	int target_len;
+	__be32 status;
+
+	status = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP);
+	if (unlikely(status))
+		return status;
+
+	path.mnt = fhp->fh_export->ex_path.mnt;
+	path.dentry = fhp->fh_dentry;
+
+	if (unlikely(!d_is_symlink(path.dentry)))
+		return nfserr_inval;
+
+	touch_atime(&path);
+
+	target = vfs_get_link(path.dentry, &done);
+	if (IS_ERR(target))
+		return nfserrno(PTR_ERR(target));
+
+	target_len = strlen(target);
+	if (*lenp > target_len)
+		*lenp = target_len;
+	memcpy(buf, target, *lenp);
+
+	/* Release whatever vfs_get_link() asked us to clean up. */
+	do_delayed_call(&done);
+	return 0;
+}
+
+/*
+ * Create a symlink and look up its inode
+ * N.B. After this call _both_ fhp and resfhp need an fh_put
+ *
+ * fname/flen name the new link inside directory fhp; path is the link
+ * target. An empty name or empty target yields nfserr_noent, "." and
+ * ".." yield nfserr_exist.
+ */
+__be32
+nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ char *fname, int flen,
+ char *path,
+ struct svc_fh *resfhp)
+{
+ struct dentry *dentry, *dnew;
+ __be32 err, cerr;
+ int host_err;
+
+ err = nfserr_noent;
+ if (!flen || path[0] == '\0')
+ goto out;
+ err = nfserr_exist;
+ if (isdotent(fname, flen))
+ goto out;
+
+ err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+ if (err)
+ goto out;
+
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ goto out_nfserr;
+
+ fh_lock(fhp);
+ dentry = fhp->fh_dentry;
+ dnew = lookup_one_len(fname, dentry, flen);
+ host_err = PTR_ERR(dnew);
+ /*
+ * NOTE(review): this error path returns with the directory still
+ * locked and the write count held; presumably the caller's fh_put
+ * releases both - confirm.
+ */
+ if (IS_ERR(dnew))
+ goto out_nfserr;
+
+ host_err = vfs_symlink(d_inode(dentry), dnew, path);
+ err = nfserrno(host_err);
+ if (!err)
+ err = nfserrno(commit_metadata(fhp));
+ fh_unlock(fhp);
+
+ fh_drop_write(fhp);
+
+ /*
+ * resfhp is composed even if the symlink call failed; the compose
+ * status is only reported when nothing else went wrong.
+ */
+ cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
+ dput(dnew);
+ if (err==0) err = cerr;
+out:
+ return err;
+
+out_nfserr:
+ err = nfserrno(host_err);
+ goto out;
+}
+
+/*
+ * Create a hardlink
+ * N.B. After this call _both_ ffhp and tfhp need an fh_put
+ *
+ * ffhp is the directory that receives the new name (name/len); tfhp is
+ * the existing object being linked to. Directories are refused with
+ * nfserr_isdir, an empty name with nfserr_perm, "." and ".." with
+ * nfserr_exist.
+ */
+__be32
+nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
+ char *name, int len, struct svc_fh *tfhp)
+{
+ struct dentry *ddir, *dnew, *dold;
+ struct inode *dirp;
+ __be32 err;
+ int host_err;
+
+ err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE);
+ if (err)
+ goto out;
+ err = fh_verify(rqstp, tfhp, 0, NFSD_MAY_NOP);
+ if (err)
+ goto out;
+ err = nfserr_isdir;
+ if (d_is_dir(tfhp->fh_dentry))
+ goto out;
+ err = nfserr_perm;
+ if (!len)
+ goto out;
+ err = nfserr_exist;
+ if (isdotent(name, len))
+ goto out;
+
+ /* The write count is taken on the target handle, the lock on the directory. */
+ host_err = fh_want_write(tfhp);
+ if (host_err) {
+ err = nfserrno(host_err);
+ goto out;
+ }
+
+ fh_lock_nested(ffhp, I_MUTEX_PARENT);
+ ddir = ffhp->fh_dentry;
+ dirp = d_inode(ddir);
+
+ dnew = lookup_one_len(name, ddir, len);
+ host_err = PTR_ERR(dnew);
+ if (IS_ERR(dnew))
+ goto out_nfserr;
+
+ dold = tfhp->fh_dentry;
+
+ /* The object being linked to has no inode: report nfserr_noent. */
+ err = nfserr_noent;
+ if (d_really_is_negative(dold))
+ goto out_dput;
+ host_err = vfs_link(dold, dirp, dnew, NULL);
+ if (!host_err) {
+ /* Commit the directory first, then the linked object. */
+ err = nfserrno(commit_metadata(ffhp));
+ if (!err)
+ err = nfserrno(commit_metadata(tfhp));
+ } else {
+ /* For version 2 requests a cross-device link is reported as nfserr_acces. */
+ if (host_err == -EXDEV && rqstp->rq_vers == 2)
+ err = nfserr_acces;
+ else
+ err = nfserrno(host_err);
+ }
+out_dput:
+ dput(dnew);
+out_unlock:
+ fh_unlock(ffhp);
+ fh_drop_write(tfhp);
+out:
+ return err;
+
+out_nfserr:
+ err = nfserrno(host_err);
+ goto out_unlock;
+}
+
+/*
+ * Pass the inode behind @dentry to nfsd_file_close_inode_sync().
+ * Negative dentries and anything that is not a regular file are left
+ * untouched.
+ */
+static void
+nfsd_close_cached_files(struct dentry *dentry)
+{
+	struct inode *target = d_inode(dentry);
+
+	if (!target)
+		return;
+	if (!S_ISREG(target->i_mode))
+		return;
+
+	nfsd_file_close_inode_sync(target);
+}
+
+/*
+ * Report what nfsd_file_is_cached() says about the inode behind @dentry.
+ * Always false for a negative dentry or a non-regular file.
+ */
+static bool
+nfsd_has_cached_files(struct dentry *dentry)
+{
+	struct inode *target = d_inode(dentry);
+
+	if (!target || !S_ISREG(target->i_mode))
+		return false;
+
+	return nfsd_file_is_cached(target);
+}
+
+/*
+ * Rename a file
+ * N.B. After this call _both_ ffhp and tfhp need an fh_put
+ *
+ * Moves the entry fname/flen in directory ffhp to tname/tlen in
+ * directory tfhp.  Both handles are fh_verify()'d as directories, with
+ * NFSD_MAY_REMOVE on the source and NFSD_MAY_CREATE on the target.
+ *
+ * Status decided before any lock is taken:
+ *  - nfserr_perm when either name is empty or matches isdotent();
+ *  - nfserr_acces (NFSv2) or nfserr_xdev (otherwise) when the two
+ *    handles differ in export mount or export root dentry.
+ * Every later failure is an errno translated by nfserrno().
+ *
+ * If the target dentry has cached open files, vfs_rename() is not
+ * attempted on that pass: the locks and the write access are released,
+ * the cached files are closed, the reference on the target dentry is
+ * dropped and the sequence starts over at the retry label.
+ */
+__be32
+nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
+ struct svc_fh *tfhp, char *tname, int tlen)
+{
+ struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap;
+ struct inode *fdir, *tdir;
+ __be32 err;
+ int host_err;
+ bool has_cached = false;
+
+ err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
+ if (err)
+ goto out;
+ err = fh_verify(rqstp, tfhp, S_IFDIR, NFSD_MAY_CREATE);
+ if (err)
+ goto out;
+
+ fdentry = ffhp->fh_dentry;
+ fdir = d_inode(fdentry);
+
+ tdentry = tfhp->fh_dentry;
+ tdir = d_inode(tdentry);
+
+ err = nfserr_perm;
+ if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
+ goto out;
+
+ err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
+ if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
+ goto out;
+ if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
+ goto out;
+
+retry:
+ host_err = fh_want_write(ffhp);
+ if (host_err) {
+ err = nfserrno(host_err);
+ goto out;
+ }
+
+ /* cannot use fh_lock as we need deadlock protective ordering
+ * so do it by hand */
+ trap = lock_rename(tdentry, fdentry);
+ ffhp->fh_locked = tfhp->fh_locked = true; /* bookkeeping fh_lock would otherwise have done */
+ fill_pre_wcc(ffhp);
+ fill_pre_wcc(tfhp);
+
+ odentry = lookup_one_len(fname, fdentry, flen);
+ host_err = PTR_ERR(odentry);
+ if (IS_ERR(odentry))
+ goto out_nfserr;
+
+ host_err = -ENOENT;
+ if (d_really_is_negative(odentry))
+ goto out_dput_old;
+ host_err = -EINVAL;
+ if (odentry == trap) /* source is the dentry lock_rename() handed back */
+ goto out_dput_old;
+
+ ndentry = lookup_one_len(tname, tdentry, tlen);
+ host_err = PTR_ERR(ndentry);
+ if (IS_ERR(ndentry))
+ goto out_dput_old;
+ host_err = -ENOTEMPTY;
+ if (ndentry == trap) /* target is the dentry lock_rename() handed back */
+ goto out_dput_new;
+
+ if (nfsd_has_cached_files(ndentry)) {
+ has_cached = true;
+ goto out_dput_old; /* deliberately skips dput(ndentry): it is still needed after unlocking */
+ } else {
+ host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
+ if (!host_err) {
+ host_err = commit_metadata(tfhp);
+ if (!host_err)
+ host_err = commit_metadata(ffhp);
+ }
+ }
+ out_dput_new:
+ dput(ndentry);
+ out_dput_old:
+ dput(odentry);
+ out_nfserr:
+ err = nfserrno(host_err); /* recomputed on the next pass when has_cached forces a retry */
+ /*
+ * We cannot rely on fh_unlock on the two filehandles,
+ * as that would do the wrong thing if the two directories
+ * were the same, so again we do it by hand.
+ */
+ if (!has_cached) {
+ fill_post_wcc(ffhp);
+ fill_post_wcc(tfhp);
+ }
+ unlock_rename(tdentry, fdentry);
+ ffhp->fh_locked = tfhp->fh_locked = false;
+ fh_drop_write(ffhp);
+
+ /*
+ * If the target dentry has cached open files, then we need to try to
+ * close them prior to doing the rename. Flushing delayed fput
+ * shouldn't be done with locks held however, so we delay it until this
+ * point and then reattempt the whole shebang.
+ */
+ if (has_cached) {
+ has_cached = false;
+ nfsd_close_cached_files(ndentry);
+ dput(ndentry); /* drops the reference kept when out_dput_new was bypassed */
+ goto retry;
+ }
+out:
+ return err;
+}
+
+/*
+ * Unlink a file or directory
+ * N.B. After this call fhp needs an fh_put
+ *
+ * Removes the entry fname/flen from the directory fhp.  A type of 0
+ * means "whatever the entry is": the S_IFMT bits of its inode mode are
+ * used instead.  Any type other than S_IFDIR goes through vfs_unlink(),
+ * after nfsd_close_cached_files() has been called on the entry; S_IFDIR
+ * goes through vfs_rmdir().
+ *
+ * Returns nfserr_acces for an empty name or one matching isdotent().
+ * -EBUSY becomes nfserr_file_open for NFSv4 clients and nfserr_acces
+ * for everyone else; any other errno is translated by nfserrno().
+ *
+ * NOTE(review): fh_lock_nested() is taken here but no fh_unlock() is
+ * visible in this function; presumably the lock is released by the
+ * fh_put() the caller owes -- confirm.
+ */
+__be32
+nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
+ char *fname, int flen)
+{
+ struct dentry *dentry, *rdentry;
+ struct inode *dirp;
+ __be32 err;
+ int host_err;
+
+ err = nfserr_acces;
+ if (!flen || isdotent(fname, flen))
+ goto out;
+ err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_REMOVE);
+ if (err)
+ goto out;
+
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ goto out_nfserr; /* write access was not obtained, so fh_drop_write() is bypassed */
+
+ fh_lock_nested(fhp, I_MUTEX_PARENT);
+ dentry = fhp->fh_dentry;
+ dirp = d_inode(dentry);
+
+ rdentry = lookup_one_len(fname, dentry, flen);
+ host_err = PTR_ERR(rdentry);
+ if (IS_ERR(rdentry))
+ goto out_drop_write;
+
+ if (d_really_is_negative(rdentry)) {
+ dput(rdentry);
+ host_err = -ENOENT;
+ goto out_drop_write;
+ }
+
+ if (!type) /* caller did not say what kind of object to remove */
+ type = d_inode(rdentry)->i_mode & S_IFMT;
+
+ if (type != S_IFDIR) {
+ nfsd_close_cached_files(rdentry);
+ host_err = vfs_unlink(dirp, rdentry, NULL);
+ } else {
+ host_err = vfs_rmdir(dirp, rdentry);
+ }
+
+ if (!host_err)
+ host_err = commit_metadata(fhp);
+ dput(rdentry);
+
+out_drop_write:
+ fh_drop_write(fhp);
+out_nfserr:
+ if (host_err == -EBUSY) {
+ /* name is mounted-on. There is no perfect
+ * error status.
+ */
+ if (nfsd_v4client(rqstp))
+ err = nfserr_file_open;
+ else
+ err = nfserr_acces;
+ } else {
+ err = nfserrno(host_err);
+ }
+out:
+ return err;
+}
+
+/*
+ * We do this buffering because we must not call back into the file
+ * system's ->lookup() method from the filldir callback. That may well
+ * deadlock a number of file systems.
+ *
+ * This is based heavily on the implementation of same in XFS.
+ *
+ * One buffered_dirent is stored per directory entry.  Records are packed
+ * back to back in the staging page, each one padded so that its total
+ * size is a multiple of sizeof(u64).
+ */
+struct buffered_dirent {
+ u64 ino; /* inode number as passed to the filldir callback */
+ loff_t offset; /* offset as passed to the filldir callback */
+ int namlen; /* number of bytes stored in name[] */
+ unsigned int d_type; /* d_type as passed to the filldir callback */
+ char name[]; /* namlen bytes copied with memcpy(); no NUL terminator is written */
+};
+
+struct readdir_data { /* state shared by nfsd_buffered_readdir() and its actor */
+ struct dir_context ctx; /* embedded so the actor can container_of() back to this struct */
+ char *dirent; /* one page from __get_free_page() holding the packed records */
+ size_t used; /* bytes of dirent filled so far */
+ int full; /* set to 1 when the next record would not fit within PAGE_SIZE */
+};
+
+/*
+ * dir_context actor: append one directory entry to the staging page of
+ * the enclosing struct readdir_data.
+ *
+ * Each record occupies sizeof(struct buffered_dirent) + namlen bytes,
+ * rounded up to a multiple of sizeof(u64).  When the record would push
+ * the fill level past PAGE_SIZE nothing is stored, ->full is set and
+ * -EINVAL is returned; otherwise the entry is copied and 0 is returned.
+ */
+static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
+				 int namlen, loff_t offset, u64 ino,
+				 unsigned int d_type)
+{
+	struct readdir_data *rd = container_of(ctx, struct readdir_data, ctx);
+	unsigned int need = ALIGN(sizeof(struct buffered_dirent) + namlen,
+				  sizeof(u64));
+	struct buffered_dirent *slot;
+
+	if (rd->used + need > PAGE_SIZE) {
+		rd->full = 1;
+		return -EINVAL;
+	}
+
+	slot = (void *)(rd->dirent + rd->used);
+	slot->ino = ino;
+	slot->offset = offset;
+	slot->namlen = namlen;
+	slot->d_type = d_type;
+	memcpy(slot->name, name, namlen);
+
+	rd->used += need;
+	return 0;
+}
+
+/*
+ * Read @file's directory entries through a one-page staging buffer and
+ * replay them to @func.
+ *
+ * Each pass lets iterate_dir() fill the page via nfsd_buffered_filldir(),
+ * then walks the stored records and hands them to @func one at a time.
+ * Replay stops as soon as @func returns non-zero or leaves something
+ * other than nfs_ok in cdp->err.  The outer loop ends on an iterate_dir()
+ * error, on an empty pass, or when a replay stopped early.
+ *
+ * Returns the nfserrno() translation of a failing iterate_dir() result;
+ * otherwise stores the resulting offset in *offsetp and returns cdp->err.
+ * Returns nfserrno(-ENOMEM) if the staging page cannot be allocated.
+ */
+static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
+				    struct readdir_cd *cdp, loff_t *offsetp)
+{
+	struct readdir_data rd = {
+		.ctx.actor = nfsd_buffered_filldir,
+		.dirent = (void *)__get_free_page(GFP_KERNEL)
+	};
+	struct buffered_dirent *ent;
+	loff_t pos;
+	int left;
+	int rc;
+
+	if (!rd.dirent)
+		return nfserrno(-ENOMEM);
+
+	pos = *offsetp;
+
+	for (;;) {
+		unsigned int step;
+
+		cdp->err = nfserr_eof; /* will be cleared on successful read */
+		rd.used = 0;
+		rd.full = 0;
+
+		rc = iterate_dir(file, &rd.ctx);
+		if (rd.full)
+			rc = 0;	/* the page filling up is not treated as a failure */
+		if (rc < 0)
+			break;
+
+		left = rd.used;
+		if (left == 0)
+			break;
+
+		/* left is known to be positive here, so test it at the bottom. */
+		ent = (struct buffered_dirent *)rd.dirent;
+		do {
+			pos = ent->offset;
+
+			if (func(cdp, ent->name, ent->namlen, ent->offset,
+				 ent->ino, ent->d_type) ||
+			    cdp->err != nfs_ok)
+				break;
+
+			step = ALIGN(sizeof(*ent) + ent->namlen, sizeof(u64));
+			left -= step;
+			ent = (struct buffered_dirent *)((char *)ent + step);
+		} while (left > 0);
+
+		if (left > 0) /* We bailed out early */
+			break;
+
+		pos = vfs_llseek(file, 0, SEEK_CUR);
+	}
+
+	free_page((unsigned long)(rd.dirent));
+
+	if (rc)
+		return nfserrno(rc);
+
+	*offsetp = pos;
+	return cdp->err;
+}
+
+/*
+ * Read entries from a directory.
+ * The NFSv3/4 verifier we ignore for now.
+ *
+ * Opens fhp as a directory, seeks to *offsetp and replays the entries to
+ * func through nfsd_buffered_readdir().  nfserr_eof and nfserr_toosmall
+ * coming back from that helper are reported as nfs_ok.
+ */
+__be32
+nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
+	     struct readdir_cd *cdp, nfsd_filldir_t func)
+{
+	struct file *dir;
+	loff_t pos = *offsetp;
+	int may_flags = NFSD_MAY_READ;
+	__be32 status;
+
+	/* NFSv2 only supports 32 bit cookies */
+	if (rqstp->rq_vers > 2)
+		may_flags |= NFSD_MAY_64BIT_COOKIE;
+
+	status = nfsd_open(rqstp, fhp, S_IFDIR, may_flags, &dir);
+	if (status)
+		return status;
+
+	pos = vfs_llseek(dir, pos, SEEK_SET);
+	if (pos < 0) {
+		status = nfserrno((int)pos);
+	} else {
+		status = nfsd_buffered_readdir(dir, func, cdp, offsetp);
+		if (status == nfserr_eof || status == nfserr_toosmall)
+			status = nfs_ok; /* can still be found in ->err */
+	}
+
+	fput(dir);
+	return status;
+}
+
+/*
+ * Get file system stats
+ * N.B. After this call fhp needs an fh_put
+ *
+ * fhp is verified with NFSD_MAY_NOP | access.  A failing vfs_statfs() is
+ * reported as nfserr_io, whatever errno it returned.
+ */
+__be32
+nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
+{
+	struct path where;
+	__be32 status;
+
+	status = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
+	if (status)
+		return status;
+
+	where.mnt = fhp->fh_export->ex_path.mnt;
+	where.dentry = fhp->fh_dentry;
+
+	if (vfs_statfs(&where, stat))
+		status = nfserr_io;
+	return status;
+}
+
+/*
+ * Non-zero when nfsexp_flags() for this request and export has
+ * NFSEXP_READONLY set.
+ */
+static int exp_rdonly(struct svc_rqst *rqstp, struct svc_export *exp)
+{
+	int flags = nfsexp_flags(rqstp, exp);
+
+	return flags & NFSEXP_READONLY;
+}
+
+#ifdef CONFIG_NFSD_V4
+/*
+ * Translate an errno from an xattr operation into an NFS status.  Three
+ * values need a mapping that differs from the generic nfserrno() one:
+ *
+ *   -ENODATA -> nfserr_noxattr
+ *   -E2BIG   -> nfserr_xattr2big
+ *   -ERANGE  -> nfserr_toosmall
+ *
+ * The -ERANGE case exists because of vfs_listxattr(): filesystems let
+ * extended attributes be added until they reach their own internal
+ * limit, which may be larger than XATTR_LIST_MAX, while the upper level
+ * xattr code enforces XATTR_LIST_MAX on retrieval.  A file can therefore
+ * hold attributes that are present and valid yet cannot be listed, and
+ * listxattr returns -ERANGE for it.  That is a bug in the xattr
+ * implementation; TOOSMALL is the best mapping available for it.
+ *
+ * Everything else is passed to nfserrno().
+ */
+static __be32
+nfsd_xattr_errno(int err)
+{
+	if (err == -ENODATA)
+		return nfserr_noxattr;
+	if (err == -E2BIG)
+		return nfserr_xattr2big;
+	if (err == -ERANGE)
+		return nfserr_toosmall;
+
+	return nfserrno(err);
+}
+
+/*
+ * Retrieve the specified user extended attribute.
+ *
+ * The RPC does not tell us a maximum size, so rather than always
+ * allocating the largest possible buffer the value is probed for its
+ * size first and then read into a buffer of exactly that size.  i_rwsem
+ * is held shared across both steps so the attribute cannot change size
+ * in between.
+ *
+ * On entry *lenp is the largest value the caller will accept; a larger
+ * attribute yields nfserr_toosmall.  When the buffer was allocated,
+ * *bufp and *lenp are written (NULL and the failing length if the read
+ * itself failed); a zero-length attribute yields NULL and 0.  A failed
+ * allocation is reported as nfserr_jukebox.
+ */
+__be32
+nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
+	      void **bufp, int *lenp)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+	ssize_t size;
+	char *value;
+	__be32 status;
+
+	status = fh_verify(rqstp, fhp, 0, NFSD_MAY_READ);
+	if (status)
+		return status;
+
+	status = nfs_ok;
+	dentry = fhp->fh_dentry;
+	inode = d_inode(dentry);
+
+	inode_lock_shared(inode);
+
+	/* Probe pass: no buffer, only the size is wanted. */
+	size = vfs_getxattr(dentry, name, NULL, 0);
+	if (size < 0) {
+		status = nfsd_xattr_errno(size);
+		goto unlock;
+	}
+	if (size == 0) {
+		/* Zero-length attribute, just return. */
+		*bufp = NULL;
+		*lenp = 0;
+		goto unlock;
+	}
+	if (size > *lenp) {
+		status = nfserr_toosmall;
+		goto unlock;
+	}
+
+	/*
+	 * NOTE(review): GFP_NOFS is GFP_KERNEL without __GFP_FS, so OR-ing
+	 * the two appears to evaluate to plain GFP_KERNEL -- confirm which
+	 * allocation context is intended while i_rwsem is held.
+	 */
+	value = kvmalloc(size, GFP_KERNEL | GFP_NOFS);
+	if (!value) {
+		status = nfserr_jukebox;
+		goto unlock;
+	}
+
+	size = vfs_getxattr(dentry, name, value, size);
+	if (size <= 0) {
+		kvfree(value);
+		value = NULL;
+		status = nfsd_xattr_errno(size);
+	}
+
+	*lenp = size;
+	*bufp = value;
+
+unlock:
+	inode_unlock_shared(inode);
+
+	return status;
+}
+
+/*
+ * Retrieve the xattr names.
+ *
+ * There is no way to know in advance how many of the names are user
+ * extended attributes, so all of them are fetched here and the XDR
+ * encode filters out the "user." ones.
+ *
+ * Always allocating XATTR_LIST_MAX would be a waste, so the list is
+ * probed for its size and then read into a buffer of that size, with
+ * i_rwsem held shared across both calls so nothing changes in between.
+ *
+ * *lenp is zeroed up front.  *bufp and *lenp are filled in only when the
+ * second vfs_listxattr() call returns a positive length.  A list longer
+ * than XATTR_LIST_MAX is reported as nfserr_xattr2big and a failed
+ * allocation as nfserr_jukebox.
+ */
+__be32
+nfsd_listxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char **bufp,
+	       int *lenp)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+	ssize_t size;
+	char *names;
+	__be32 status;
+
+	status = fh_verify(rqstp, fhp, 0, NFSD_MAY_READ);
+	if (status)
+		return status;
+
+	dentry = fhp->fh_dentry;
+	inode = d_inode(dentry);
+	*lenp = 0;
+
+	inode_lock_shared(inode);
+
+	/* Probe pass: no buffer, only the size is wanted. */
+	size = vfs_listxattr(dentry, NULL, 0);
+	if (size <= 0) {
+		status = nfsd_xattr_errno(size);
+		goto unlock;
+	}
+	if (size > XATTR_LIST_MAX) {
+		status = nfserr_xattr2big;
+		goto unlock;
+	}
+
+	/*
+	 * We're holding i_rwsem - use GFP_NOFS.
+	 *
+	 * NOTE(review): GFP_NOFS is GFP_KERNEL without __GFP_FS, so OR-ing
+	 * the two appears to evaluate to plain GFP_KERNEL -- confirm which
+	 * allocation context is intended here.
+	 */
+	names = kvmalloc(size, GFP_KERNEL | GFP_NOFS);
+	if (!names) {
+		status = nfserr_jukebox;
+		goto unlock;
+	}
+
+	size = vfs_listxattr(dentry, names, size);
+	if (size > 0) {
+		*lenp = size;
+		*bufp = names;
+		status = nfs_ok;
+	} else {
+		kvfree(names);
+		status = nfsd_xattr_errno(size);
+	}
+
+unlock:
+	inode_unlock_shared(inode);
+
+	return status;
+}
+
+/*
+ * Removexattr and setxattr need to call fh_lock to both lock the inode
+ * and set the change attribute.  The top-level vfs_removexattr and
+ * vfs_setxattr calls already do their own inode_lock calls, so the
+ * _locked variant is called instead.  A NULL pointer is passed for
+ * delegated_inode, leaving the client to deal with NFS4ERR_DELAY (same
+ * as with e.g. setattr and remove).
+ *
+ * Returns the fh_verify() status, the nfserrno() translation of a
+ * failing fh_want_write(), or the nfsd_xattr_errno() translation of the
+ * removal result.
+ */
+__be32
+nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name)
+{
+	__be32 status = fh_verify(rqstp, fhp, 0, NFSD_MAY_WRITE);
+	int rc;
+
+	if (status)
+		return status;
+
+	rc = fh_want_write(fhp);
+	if (rc)
+		return nfserrno(rc);
+
+	fh_lock(fhp);
+	rc = __vfs_removexattr_locked(fhp->fh_dentry, name, NULL);
+	fh_unlock(fhp);
+
+	fh_drop_write(fhp);
+
+	return nfsd_xattr_errno(rc);
+}
+
+/*
+ * Set the extended attribute @name on fhp to the @len bytes at @buf.
+ *
+ * The inode is locked with fh_lock() and the _locked VFS variant is
+ * called with a NULL delegated_inode.  Returns the fh_verify() status,
+ * the nfserrno() translation of a failing fh_want_write(), or the
+ * nfsd_xattr_errno() translation of the set result.
+ */
+__be32
+nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
+	      void *buf, u32 len, u32 flags)
+{
+	__be32 status = fh_verify(rqstp, fhp, 0, NFSD_MAY_WRITE);
+	int rc;
+
+	if (status)
+		return status;
+
+	rc = fh_want_write(fhp);
+	if (rc)
+		return nfserrno(rc);
+
+	fh_lock(fhp);
+	rc = __vfs_setxattr_locked(fhp->fh_dentry, name, buf, len, flags,
+				   NULL);
+	fh_unlock(fhp);
+
+	fh_drop_write(fhp);
+
+	return nfsd_xattr_errno(rc);
+}
+#endif
+
+/*
+ * Check for a user's access permissions to this inode.
+ *
+ * acc is a mask of NFSD_MAY_* bits.  Returns 0 when the access is
+ * allowed, otherwise nfserr_rofs, nfserr_perm, or the nfserrno()
+ * translation of what inode_permission() returned.
+ *
+ * Order of the checks: read-only export/mount and immutable inode (both
+ * skipped when NFSD_MAY_LOCAL_ACCESS is set), truncation of an
+ * append-only inode, the NFSD_MAY_LOCK special case, the owner
+ * override, and finally inode_permission().
+ */
+__be32
+nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
+ struct dentry *dentry, int acc)
+{
+ struct inode *inode = d_inode(dentry);
+ int err;
+
+ if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP)
+ return 0; /* no NFSD_MAY_MASK bit requested: nothing to check */
+#if 0
+ dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n",
+ acc,
+ (acc & NFSD_MAY_READ)? " read" : "",
+ (acc & NFSD_MAY_WRITE)? " write" : "",
+ (acc & NFSD_MAY_EXEC)? " exec" : "",
+ (acc & NFSD_MAY_SATTR)? " sattr" : "",
+ (acc & NFSD_MAY_TRUNC)? " trunc" : "",
+ (acc & NFSD_MAY_LOCK)? " lock" : "",
+ (acc & NFSD_MAY_OWNER_OVERRIDE)? " owneroverride" : "",
+ inode->i_mode,
+ IS_IMMUTABLE(inode)? " immut" : "",
+ IS_APPEND(inode)? " append" : "",
+ __mnt_is_readonly(exp->ex_path.mnt)? " ro" : "");
+ dprintk(" owner %d/%d user %d/%d\n",
+ inode->i_uid, inode->i_gid, current_fsuid(), current_fsgid());
+#endif
+
+ /* Normally we reject any write/sattr etc access on a read-only file
+ * system. But if it is IRIX doing check on write-access for a
+ * device special file, we ignore rofs.
+ */
+ if (!(acc & NFSD_MAY_LOCAL_ACCESS)) /* unbraced: governs only the braced if that follows */
+ if (acc & (NFSD_MAY_WRITE | NFSD_MAY_SATTR | NFSD_MAY_TRUNC)) {
+ if (exp_rdonly(rqstp, exp) ||
+ __mnt_is_readonly(exp->ex_path.mnt))
+ return nfserr_rofs;
+ if (/* (acc & NFSD_MAY_WRITE) && */ IS_IMMUTABLE(inode))
+ return nfserr_perm;
+ }
+ if ((acc & NFSD_MAY_TRUNC) && IS_APPEND(inode)) /* checked regardless of NFSD_MAY_LOCAL_ACCESS */
+ return nfserr_perm;
+
+ if (acc & NFSD_MAY_LOCK) {
+ /* If we cannot rely on authentication in NLM requests,
+ * just allow locks, otherwise require read permission, or
+ * ownership
+ */
+ if (exp->ex_flags & NFSEXP_NOAUTHNLM)
+ return 0;
+ else
+ acc = NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE; /* replaces the caller's mask for the checks below */
+ }
+ /*
+ * The file owner always gets access permission for accesses that
+ * would normally be checked at open time. This is to make
+ * file access work even when the client has done a fchmod(fd, 0).
+ *
+ * However, `cp foo bar' should fail nevertheless when bar is
+ * readonly. A sensible way to do this might be to reject all
+ * attempts to truncate a read-only file, because a creat() call
+ * always implies file truncation.
+ * ... but this isn't really fair. A process may reasonably call
+ * ftruncate on an open file descriptor on a file with perm 000.
+ * We must trust the client to do permission checking - using "ACCESS"
+ * with NFSv3.
+ */
+ if ((acc & NFSD_MAY_OWNER_OVERRIDE) &&
+ uid_eq(inode->i_uid, current_fsuid()))
+ return 0;
+
+ /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */
+ err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC));
+
+ /* Allow read access to binaries even when mode 111 */
+ if (err == -EACCES && S_ISREG(inode->i_mode) &&
+ (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) ||
+ acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC)))
+ err = inode_permission(inode, MAY_EXEC); /* second try: execute permission stands in for read */
+
+ return err? nfserrno(err) : 0;
+}
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
new file mode 100644
index 000000000..a2442ebe5
--- /dev/null
+++ b/fs/nfsd/vfs.h
@@ -0,0 +1,169 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#ifndef LINUX_NFSD_VFS_H
+#define LINUX_NFSD_VFS_H
+
+#include "nfsfh.h"
+#include "nfsd.h"
+
+/*
+ * Flags for nfsd_permission
+ *
+ * The low six bits (NFSD_MAY_MASK) name the kind of access requested;
+ * the bits above them are hints that modify how the check is made.
+ */
+#define NFSD_MAY_NOP 0 /* nfsd_permission() returns 0 at once when no NFSD_MAY_MASK bit is set */
+#define NFSD_MAY_EXEC 0x001 /* == MAY_EXEC */
+#define NFSD_MAY_WRITE 0x002 /* == MAY_WRITE */
+#define NFSD_MAY_READ 0x004 /* == MAY_READ */
+#define NFSD_MAY_SATTR 0x008 /* subject to the read-only and immutable checks, like WRITE and TRUNC */
+#define NFSD_MAY_TRUNC 0x010 /* additionally refused on append-only inodes */
+#define NFSD_MAY_LOCK 0x020 /* allowed outright on NFSEXP_NOAUTHNLM exports, else checked as READ|OWNER_OVERRIDE */
+#define NFSD_MAY_MASK 0x03f /* OR of the six access bits above */
+
+/* extra hints to permission and open routines: */
+#define NFSD_MAY_OWNER_OVERRIDE 0x040 /* the inode's owner is granted access without calling inode_permission() */
+#define NFSD_MAY_LOCAL_ACCESS 0x080 /* for device special files */
+#define NFSD_MAY_BYPASS_GSS_ON_ROOT 0x100
+#define NFSD_MAY_NOT_BREAK_LEASE 0x200
+#define NFSD_MAY_BYPASS_GSS 0x400
+#define NFSD_MAY_READ_IF_EXEC 0x800 /* with READ on a regular file: fall back to an execute check after -EACCES */
+
+#define NFSD_MAY_64BIT_COOKIE 0x1000 /* 64 bit readdir cookies for >= NFSv3 */
+
+#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) /* asked of the directory by nfsd_link() and the target side of nfsd_rename() */
+#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) /* asked of the directory by nfsd_unlink() and the source side of nfsd_rename() */
+
+struct nfsd_file;
+
+/*
+ * Callback function for readdir
+ */
+typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
+
+/* nfsd/vfs.c */
+int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
+ struct svc_export **expp);
+__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *,
+ const char *, unsigned int, struct svc_fh *);
+__be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
+ const char *, unsigned int,
+ struct svc_export **, struct dentry **);
+__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *,
+ struct iattr *, int, time64_t);
+int nfsd_mountpoint(struct dentry *, struct svc_export *);
+#ifdef CONFIG_NFSD_V4
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
+ struct xdr_netobj *);
+__be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *,
+ struct file *, loff_t, loff_t, int);
+__be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
+ struct nfsd_file *nf_dst, u64 dst_pos,
+ u64 count, bool sync);
+#endif /* CONFIG_NFSD_V4 */
+__be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *,
+ char *name, int len, struct iattr *attrs,
+ int type, dev_t rdev, struct svc_fh *res);
+__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
+ char *name, int len, struct iattr *attrs,
+ int type, dev_t rdev, struct svc_fh *res);
+#ifdef CONFIG_NFSD_V3
+__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
+__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
+ char *name, int len, struct iattr *attrs,
+ struct svc_fh *res, int createmode,
+ u32 *verifier, bool *truncp, bool *created);
+__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
+ loff_t, unsigned long, __be32 *verf);
+#endif /* CONFIG_NFSD_V3 */
+#ifdef CONFIG_NFSD_V4
+__be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ char *name, void **bufp, int *lenp);
+__be32 nfsd_listxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ char **bufp, int *lenp);
+__be32 nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ char *name);
+__be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ char *name, void *buf, u32 len, u32 flags);
+#endif
+int nfsd_open_break_lease(struct inode *, int);
+__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
+ int, struct file **);
+__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t,
+ int, struct file **);
+__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct file *file, loff_t offset,
+ unsigned long *count,
+ u32 *eof);
+__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct file *file, loff_t offset,
+ struct kvec *vec, int vlen,
+ unsigned long *count,
+ u32 *eof);
+__be32 nfsd_read(struct svc_rqst *, struct svc_fh *,
+ loff_t, struct kvec *, int, unsigned long *,
+ u32 *eof);
+__be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t,
+ struct kvec *, int, unsigned long *,
+ int stable, __be32 *verf);
+__be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfsd_file *nf, loff_t offset,
+ struct kvec *vec, int vlen, unsigned long *cnt,
+ int stable, __be32 *verf);
+__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
+ char *, int *);
+__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
+ char *name, int len, char *path,
+ struct svc_fh *res);
+__be32 nfsd_link(struct svc_rqst *, struct svc_fh *,
+ char *, int, struct svc_fh *);
+ssize_t nfsd_copy_file_range(struct file *, u64,
+ struct file *, u64, u64);
+__be32 nfsd_rename(struct svc_rqst *,
+ struct svc_fh *, char *, int,
+ struct svc_fh *, char *, int);
+__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
+ char *name, int len);
+__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *,
+ loff_t *, struct readdir_cd *, nfsd_filldir_t);
+__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
+ struct kstatfs *, int access);
+
+__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
+ struct dentry *, int);
+
+/*
+ * Take write access on the export's mount for @fh, once.
+ *
+ * If fh->fh_want_write is already set nothing is done and 0 is returned.
+ * Otherwise mnt_want_write() is called; on success the flag is set.
+ * Returns the mnt_want_write() result.
+ */
+static inline int fh_want_write(struct svc_fh *fh)
+{
+	int err = 0;
+
+	if (!fh->fh_want_write) {
+		err = mnt_want_write(fh->fh_export->ex_path.mnt);
+		if (err == 0)
+			fh->fh_want_write = true;
+	}
+	return err;
+}
+
+/*
+ * Undo fh_want_write(): if fh->fh_want_write is set, clear it and call
+ * mnt_drop_write() on the export's mount.  Does nothing otherwise.
+ */
+static inline void fh_drop_write(struct svc_fh *fh)
+{
+	if (!fh->fh_want_write)
+		return;
+
+	fh->fh_want_write = false;
+	mnt_drop_write(fh->fh_export->ex_path.mnt);
+}
+
+/*
+ * Fill @stat for the object behind @fh by calling vfs_getattr() with
+ * STATX_BASIC_STATS and AT_STATX_SYNC_AS_STAT on the export's mount and
+ * the handle's dentry.  Returns the result translated by nfserrno().
+ */
+static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat)
+{
+	struct path where = {
+		.mnt = fh->fh_export->ex_path.mnt,
+		.dentry = fh->fh_dentry,
+	};
+	int rc;
+
+	rc = vfs_getattr(&where, stat, STATX_BASIC_STATS,
+			 AT_STATX_SYNC_AS_STAT);
+	return nfserrno(rc);
+}
+
+/*
+ * Returns 1 when @createmode is NFS3_CREATE_EXCLUSIVE or
+ * NFS4_CREATE_EXCLUSIVE4_1, 0 for every other value.
+ */
+static inline int nfsd_create_is_exclusive(int createmode)
+{
+	if (createmode == NFS3_CREATE_EXCLUSIVE)
+		return 1;
+
+	return createmode == NFS4_CREATE_EXCLUSIVE4_1;
+}
+
+#endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
new file mode 100644
index 000000000..b8cc6a4b2
--- /dev/null
+++ b/fs/nfsd/xdr.h
@@ -0,0 +1,179 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* XDR types for nfsd. This is mainly a typing exercise. */
+
+#ifndef LINUX_NFSD_H
+#define LINUX_NFSD_H
+
+#include <linux/vfs.h>
+#include "nfsd.h"
+#include "nfsfh.h"
+
+struct nfsd_fhandle { /* a bare file handle; presumably what nfssvc_decode_fhandle() fills in -- confirm */
+ struct svc_fh fh;
+};
+
+struct nfsd_sattrargs { /* member of union nfsd_xdrstore */
+ struct svc_fh fh;
+ struct iattr attrs;
+};
+
+struct nfsd_diropargs { /* member of union nfsd_xdrstore */
+ struct svc_fh fh;
+ char * name;
+ unsigned int len;
+};
+
+struct nfsd_readargs { /* member of union nfsd_xdrstore */
+ struct svc_fh fh;
+ __u32 offset; /* 32 bits here; struct nfsd3_readargs carries a __u64 */
+ __u32 count;
+ int vlen;
+};
+
+struct nfsd_writeargs { /* member of union nfsd_xdrstore */
+ svc_fh fh; /* spelled via the svc_fh typedef; the other structs here write 'struct svc_fh' */
+ __u32 offset;
+ __u32 len;
+ struct kvec first;
+};
+
+struct nfsd_createargs { /* member of union nfsd_xdrstore */
+ struct svc_fh fh;
+ char * name;
+ unsigned int len;
+ struct iattr attrs;
+};
+
+struct nfsd_renameargs { /* member of union nfsd_xdrstore; fields mirror nfsd_rename()'s parameters */
+ struct svc_fh ffh;
+ char * fname;
+ unsigned int flen;
+ struct svc_fh tfh;
+ char * tname;
+ unsigned int tlen;
+};
+
+struct nfsd_readlinkargs { /* not listed in union nfsd_xdrstore */
+ struct svc_fh fh;
+ char * buffer;
+};
+
+struct nfsd_linkargs { /* member of union nfsd_xdrstore */
+ struct svc_fh ffh;
+ struct svc_fh tfh;
+ char * tname;
+ unsigned int tlen;
+};
+
+struct nfsd_symlinkargs { /* member of union nfsd_xdrstore */
+ struct svc_fh ffh;
+ char * fname;
+ unsigned int flen;
+ char * tname;
+ unsigned int tlen;
+ struct iattr attrs;
+ struct kvec first;
+};
+
+struct nfsd_readdirargs { /* member of union nfsd_xdrstore */
+ struct svc_fh fh;
+ __u32 cookie; /* 32 bits; nfsd_readdir() only asks for 64-bit cookies when rq_vers > 2 */
+ __u32 count;
+ __be32 * buffer;
+};
+
+struct nfsd_stat { /* every result struct in this run starts with a __be32 status */
+ __be32 status;
+};
+
+struct nfsd_attrstat {
+ __be32 status;
+ struct svc_fh fh;
+ struct kstat stat;
+};
+
+struct nfsd_diropres {
+ __be32 status;
+ struct svc_fh fh;
+ struct kstat stat;
+};
+
+struct nfsd_readlinkres {
+ __be32 status;
+ int len;
+};
+
+struct nfsd_readres {
+ __be32 status;
+ struct svc_fh fh;
+ unsigned long count;
+ struct kstat stat;
+};
+
+struct nfsd_readdirres {
+ __be32 status;
+
+ int count;
+
+ struct readdir_cd common; /* presumably the readdir_cd handed to nfsd_readdir() -- confirm against the caller */
+ __be32 * buffer;
+ int buflen;
+ __be32 * offset;
+};
+
+struct nfsd_statfsres {
+ __be32 status;
+ struct kstatfs stats; /* same type nfsd_statfs() fills in */
+};
+
+/*
+ * Storage requirements for XDR arguments and results.
+ *
+ * A union is as large as its largest member, so NFS2_SVC_XDRSIZE below
+ * is the size of the biggest structure listed here.  Only argument
+ * structures appear in the list; struct nfsd_fhandle, struct
+ * nfsd_readlinkargs and the result structures are not members.
+ */
+union nfsd_xdrstore {
+ struct nfsd_sattrargs sattr;
+ struct nfsd_diropargs dirop;
+ struct nfsd_readargs read;
+ struct nfsd_writeargs write;
+ struct nfsd_createargs create;
+ struct nfsd_renameargs rename;
+ struct nfsd_linkargs link;
+ struct nfsd_symlinkargs symlink;
+ struct nfsd_readdirargs readdir;
+};
+
+#define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore)
+
+
+int nfssvc_decode_void(struct svc_rqst *, __be32 *);
+int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *);
+int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *);
+int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *);
+int nfssvc_decode_readargs(struct svc_rqst *, __be32 *);
+int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *);
+int nfssvc_decode_createargs(struct svc_rqst *, __be32 *);
+int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *);
+int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *);
+int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *);
+int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *);
+int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *);
+int nfssvc_encode_void(struct svc_rqst *, __be32 *);
+int nfssvc_encode_stat(struct svc_rqst *, __be32 *);
+int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *);
+int nfssvc_encode_diropres(struct svc_rqst *, __be32 *);
+int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *);
+int nfssvc_encode_readres(struct svc_rqst *, __be32 *);
+int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *);
+int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *);
+
+int nfssvc_encode_entry(void *, const char *name,
+ int namlen, loff_t offset, u64 ino, unsigned int);
+
+void nfssvc_release_attrstat(struct svc_rqst *rqstp);
+void nfssvc_release_diropres(struct svc_rqst *rqstp);
+void nfssvc_release_readres(struct svc_rqst *rqstp);
+
+/* Helper functions for NFSv2 ACL code */
+__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat);
+__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp);
+
+#endif /* LINUX_NFSD_H */
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
new file mode 100644
index 000000000..ae6fa6c9c
--- /dev/null
+++ b/fs/nfsd/xdr3.h
@@ -0,0 +1,324 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * XDR types for NFSv3 in nfsd.
+ *
+ * Copyright (C) 1996-1998, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#ifndef _LINUX_NFSD_XDR3_H
+#define _LINUX_NFSD_XDR3_H
+
+#include "xdr.h"
+
+struct nfsd3_sattrargs { /* member of union nfsd3_xdrstore */
+ struct svc_fh fh;
+ struct iattr attrs;
+ int check_guard; /* presumably says whether guardtime is to be honoured -- confirm against the SETATTR handler */
+ time64_t guardtime;
+};
+
+struct nfsd3_diropargs { /* member of union nfsd3_xdrstore */
+ struct svc_fh fh;
+ char * name;
+ unsigned int len;
+};
+
+struct nfsd3_accessargs { /* not listed in union nfsd3_xdrstore */
+ struct svc_fh fh;
+ unsigned int access;
+};
+
+struct nfsd3_readargs { /* member of union nfsd3_xdrstore */
+ struct svc_fh fh;
+ __u64 offset; /* 64 bits; struct nfsd_readargs has a __u32 here */
+ __u32 count;
+ int vlen;
+};
+
+struct nfsd3_writeargs { /* member of union nfsd3_xdrstore */
+ svc_fh fh; /* spelled via the svc_fh typedef; the other structs here write 'struct svc_fh' */
+ __u64 offset;
+ __u32 count;
+ int stable;
+ __u32 len;
+ struct kvec first;
+};
+
+struct nfsd3_createargs { /* member of union nfsd3_xdrstore */
+ struct svc_fh fh;
+ char * name;
+ unsigned int len;
+ int createmode; /* presumably the value nfsd_create_is_exclusive() is asked about -- confirm */
+ struct iattr attrs;
+ __be32 * verf;
+};
+
+struct nfsd3_mknodargs { /* not listed in union nfsd3_xdrstore */
+ struct svc_fh fh;
+ char * name;
+ unsigned int len;
+ __u32 ftype;
+ __u32 major, minor;
+ struct iattr attrs;
+};
+
+struct nfsd3_renameargs { /* member of union nfsd3_xdrstore; fields mirror nfsd_rename()'s parameters */
+ struct svc_fh ffh;
+ char * fname;
+ unsigned int flen;
+ struct svc_fh tfh;
+ char * tname;
+ unsigned int tlen;
+};
+
+struct nfsd3_readlinkargs { /* not listed in union nfsd3_xdrstore */
+ struct svc_fh fh;
+ char * buffer;
+};
+
+struct nfsd3_linkargs { /* member of union nfsd3_xdrstore */
+ struct svc_fh ffh;
+ struct svc_fh tfh;
+ char * tname;
+ unsigned int tlen;
+};
+
+struct nfsd3_symlinkargs { /* member of union nfsd3_xdrstore */
+ struct svc_fh ffh;
+ char * fname;
+ unsigned int flen;
+ char * tname;
+ unsigned int tlen;
+ struct iattr attrs;
+ struct kvec first;
+};
+
+struct nfsd3_readdirargs { /* member of union nfsd3_xdrstore */
+ struct svc_fh fh;
+ __u64 cookie; /* 64 bits; struct nfsd_readdirargs has a __u32 here */
+ __u32 dircount;
+ __u32 count;
+ __be32 * verf;
+ __be32 * buffer;
+};
+
+struct nfsd3_commitargs { /* not listed in union nfsd3_xdrstore */
+ struct svc_fh fh;
+ __u64 offset;
+ __u32 count;
+};
+
+struct nfsd3_getaclargs { /* not listed in union nfsd3_xdrstore */
+ struct svc_fh fh;
+ int mask;
+};
+
+struct posix_acl; /* forward declaration: only pointers to it are used below */
+struct nfsd3_setaclargs { /* not listed in union nfsd3_xdrstore */
+ struct svc_fh fh;
+ int mask;
+ struct posix_acl *acl_access;
+ struct posix_acl *acl_default;
+};
+
+struct nfsd3_attrstat { /* not listed in union nfsd3_xdrstore */
+ __be32 status;
+ struct svc_fh fh;
+ struct kstat stat;
+};
+
+/* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */
+struct nfsd3_diropres {
+ __be32 status;
+ struct svc_fh dirfh;
+ struct svc_fh fh;
+};
+
+struct nfsd3_accessres {
+ __be32 status;
+ struct svc_fh fh;
+ __u32 access;
+ struct kstat stat;
+};
+
+struct nfsd3_readlinkres {
+ __be32 status;
+ struct svc_fh fh;
+ __u32 len;
+};
+
+struct nfsd3_readres {
+ __be32 status;
+ struct svc_fh fh;
+ unsigned long count;
+ __u32 eof;
+};
+
+struct nfsd3_writeres {
+ __be32 status;
+ struct svc_fh fh;
+ unsigned long count;
+ int committed;
+ __be32 verf[2];
+};
+
+struct nfsd3_renameres { /* two handles, matching nfsd_rename()'s ffhp and tfhp */
+ __be32 status;
+ struct svc_fh ffh;
+ struct svc_fh tfh;
+};
+
+struct nfsd3_linkres {
+ __be32 status;
+ struct svc_fh tfh;
+ struct svc_fh fh;
+};
+
+struct nfsd3_readdirres {
+ __be32 status;
+ struct svc_fh fh;
+ /* Just to save kmalloc on every readdirplus entry (svc_fh is a
+ * little large for the stack): */
+ struct svc_fh scratch;
+ int count;
+ __be32 verf[2];
+
+ struct readdir_cd common; /* presumably the readdir_cd handed to nfsd_readdir() -- confirm against the caller */
+ __be32 * buffer;
+ int buflen;
+ __be32 * offset;
+ __be32 * offset1;
+ struct svc_rqst * rqstp;
+
+};
+
+struct nfsd3_fsstatres {
+ __be32 status;
+ struct kstatfs stats; /* same type nfsd_statfs() fills in */
+ __u32 invarsec;
+};
+
+struct nfsd3_fsinfores {
+ __be32 status;
+ __u32 f_rtmax;
+ __u32 f_rtpref;
+ __u32 f_rtmult;
+ __u32 f_wtmax;
+ __u32 f_wtpref;
+ __u32 f_wtmult;
+ __u32 f_dtpref;
+ __u64 f_maxfilesize;
+ __u32 f_properties;
+};
+
+struct nfsd3_pathconfres {
+ __be32 status;
+ __u32 p_link_max;
+ __u32 p_name_max;
+ __u32 p_no_trunc;
+ __u32 p_chown_restricted;
+ __u32 p_case_insensitive;
+ __u32 p_case_preserving;
+};
+
+struct nfsd3_commitres {
+ __be32 status;
+ struct svc_fh fh;
+ __be32 verf[2];
+};
+
+struct nfsd3_getaclres {
+ __be32 status;
+ struct svc_fh fh;
+ int mask;
+ struct posix_acl *acl_access;
+ struct posix_acl *acl_default;
+ struct kstat stat;
+};
+
+/* dummy type for release */
+struct nfsd3_fhandle_pair {
+ __u32 dummy; /* occupies the slot where the result structs keep their 32-bit status */
+ struct svc_fh fh1;
+ struct svc_fh fh2;
+};
+
+/*
+ * Storage requirements for XDR arguments and results.
+ *
+ * A union is as large as its largest member, so NFS3_SVC_XDRSIZE below
+ * is the size of the biggest structure listed here.  Unlike union
+ * nfsd_xdrstore this one lists result structures as well as argument
+ * structures, but not every structure declared above is a member.
+ */
+union nfsd3_xdrstore {
+ struct nfsd3_sattrargs sattrargs;
+ struct nfsd3_diropargs diropargs;
+ struct nfsd3_readargs readargs;
+ struct nfsd3_writeargs writeargs;
+ struct nfsd3_createargs createargs;
+ struct nfsd3_renameargs renameargs;
+ struct nfsd3_linkargs linkargs;
+ struct nfsd3_symlinkargs symlinkargs;
+ struct nfsd3_readdirargs readdirargs;
+ struct nfsd3_diropres diropres;
+ struct nfsd3_accessres accessres;
+ struct nfsd3_readlinkres readlinkres;
+ struct nfsd3_readres readres;
+ struct nfsd3_writeres writeres;
+ struct nfsd3_renameres renameres;
+ struct nfsd3_linkres linkres;
+ struct nfsd3_readdirres readdirres;
+ struct nfsd3_fsstatres fsstatres;
+ struct nfsd3_fsinfores fsinfores;
+ struct nfsd3_pathconfres pathconfres;
+ struct nfsd3_commitres commitres;
+ struct nfsd3_getaclres getaclres;
+};
+
+#define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore)
+
+int nfs3svc_decode_voidarg(struct svc_rqst *, __be32 *);
+int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *);
+int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *);
+int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *);
+int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *);
+int nfs3svc_decode_readargs(struct svc_rqst *, __be32 *);
+int nfs3svc_decode_writeargs(struct svc_rqst *, __be32 *);
+int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *);
+int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *);
+int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *);
+int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *);
+int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *);
+int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *);
+int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *);
+int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *);
+int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *);
+int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_readres(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_writeres(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_createres(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_renameres(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_linkres(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_readdirres(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_fsstatres(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_fsinfores(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_pathconfres(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *);
+
+void nfs3svc_release_fhandle(struct svc_rqst *);
+void nfs3svc_release_fhandle2(struct svc_rqst *);
+int nfs3svc_encode_entry(void *, const char *name,
+ int namlen, loff_t offset, u64 ino,
+ unsigned int);
+int nfs3svc_encode_entry_plus(void *, const char *name,
+ int namlen, loff_t offset, u64 ino,
+ unsigned int);
+/* Helper functions for NFSv3 ACL code */
+__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p,
+ struct svc_fh *fhp);
+__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp);
+
+
+#endif /* _LINUX_NFSD_XDR3_H */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
new file mode 100644
index 000000000..679d40af1
--- /dev/null
+++ b/fs/nfsd/xdr4.h
@@ -0,0 +1,902 @@
+/*
+ * Server-side types for NFSv4.
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Kendrick Smith <kmsmith@umich.edu>
+ * Andy Adamson <andros@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef _LINUX_NFSD_XDR4_H
+#define _LINUX_NFSD_XDR4_H
+
+#include "state.h"
+#include "nfsd.h"
+
+#define NFSD4_MAX_TAGLEN 128
+#define XDR_LEN(n) (((n) + 3) & ~3) /* round n up to a multiple of 4 */
+
+#define CURRENT_STATE_ID_FLAG (1<<0) /* flag bit for sid_flags */
+#define SAVED_STATE_ID_FLAG (1<<1) /* flag bit for sid_flags */
+
+#define SET_CSTATE_FLAG(c, f) ((c)->sid_flags |= (f)) /* set the bits in f */
+#define HAS_CSTATE_FLAG(c, f) ((c)->sid_flags & (f)) /* nonzero if any bit in f is set */
+#define CLEAR_CSTATE_FLAG(c, f) ((c)->sid_flags &= ~(f)) /* clear the bits in f */
+
+struct nfsd4_compound_state {
+ struct svc_fh current_fh;
+ struct svc_fh save_fh;
+ struct nfs4_stateowner *replay_owner;
+ struct nfs4_client *clp;
+ /* For sessions DRC */
+ struct nfsd4_session *session;
+ struct nfsd4_slot *slot; /* non-NULL makes nfsd4_has_session() true */
+ int data_offset;
+ bool spo_must_allowed;
+ size_t iovlen;
+ u32 minorversion;
+ __be32 status;
+ stateid_t current_stateid;
+ stateid_t save_stateid;
+ /* indicates whether the current and saved stateids are present */
+ u32 sid_flags;
+};
+
+static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs)
+{
+ return cs->slot ? true : false; /* true exactly when cs->slot is set */
+}
+
+struct nfsd4_change_info { /* filled in by set_change_info() from a struct svc_fh */
+ u32 atomic; /* (u32)fh_post_saved */
+ bool change_supported; /* IS_I_VERSION() of the fh_dentry inode */
+ u32 before_ctime_sec; /* fh_pre_ctime.tv_sec */
+ u32 before_ctime_nsec; /* fh_pre_ctime.tv_nsec */
+ u64 before_change; /* fh_pre_change */
+ u32 after_ctime_sec; /* fh_post_attr.ctime.tv_sec */
+ u32 after_ctime_nsec; /* fh_post_attr.ctime.tv_nsec */
+ u64 after_change; /* fh_post_change */
+};
+
+struct nfsd4_access {
+ u32 ac_req_access; /* request */
+ u32 ac_supported; /* response */
+ u32 ac_resp_access; /* response */
+};
+
+struct nfsd4_close {
+ u32 cl_seqid; /* request */
+ stateid_t cl_stateid; /* request+response */
+};
+
+struct nfsd4_commit {
+ u64 co_offset; /* request */
+ u32 co_count; /* request */
+ nfs4_verifier co_verf; /* response */
+};
+
+struct nfsd4_create {
+ u32 cr_namelen; /* request */
+ char * cr_name; /* request */
+ u32 cr_type; /* request */
+ union { /* request */
+ struct {
+ u32 datalen;
+ char *data;
+ struct kvec first;
+ } link; /* NF4LNK */
+ struct {
+ u32 specdata1;
+ u32 specdata2;
+ } dev; /* NF4BLK, NF4CHR */
+ } u;
+ u32 cr_bmval[3]; /* request */
+ struct iattr cr_iattr; /* request */
+ int cr_umask; /* request */
+ struct nfsd4_change_info cr_cinfo; /* response */
+ struct nfs4_acl *cr_acl;
+ struct xdr_netobj cr_label;
+};
+#define cr_datalen u.link.datalen
+#define cr_data u.link.data
+#define cr_first u.link.first
+#define cr_specdata1 u.dev.specdata1
+#define cr_specdata2 u.dev.specdata2
+
+struct nfsd4_delegreturn {
+ stateid_t dr_stateid;
+};
+
+struct nfsd4_getattr {
+ u32 ga_bmval[3]; /* request */
+ struct svc_fh *ga_fhp; /* response */
+};
+
+struct nfsd4_link {
+ u32 li_namelen; /* request */
+ char * li_name; /* request */
+ struct nfsd4_change_info li_cinfo; /* response */
+};
+
+struct nfsd4_lock_denied {
+ clientid_t ld_clientid;
+ struct xdr_netobj ld_owner;
+ u64 ld_start;
+ u64 ld_length;
+ u32 ld_type;
+};
+
+struct nfsd4_lock {
+ /* request */
+ u32 lk_type;
+ u32 lk_reclaim; /* boolean */
+ u64 lk_offset;
+ u64 lk_length;
+ u32 lk_is_new;
+ union {
+ struct {
+ u32 open_seqid;
+ stateid_t open_stateid;
+ u32 lock_seqid;
+ clientid_t clientid;
+ struct xdr_netobj owner;
+ } new;
+ struct {
+ stateid_t lock_stateid;
+ u32 lock_seqid;
+ } old;
+ } v;
+
+ /* response */
+ union {
+ struct {
+ stateid_t stateid;
+ } ok;
+ struct nfsd4_lock_denied denied;
+ } u;
+};
+#define lk_new_open_seqid v.new.open_seqid
+#define lk_new_open_stateid v.new.open_stateid
+#define lk_new_lock_seqid v.new.lock_seqid
+#define lk_new_clientid v.new.clientid
+#define lk_new_owner v.new.owner
+#define lk_old_lock_stateid v.old.lock_stateid
+#define lk_old_lock_seqid v.old.lock_seqid
+
+#define lk_resp_stateid u.ok.stateid
+#define lk_denied u.denied
+
+
+struct nfsd4_lockt {
+ u32 lt_type;
+ clientid_t lt_clientid;
+ struct xdr_netobj lt_owner;
+ u64 lt_offset;
+ u64 lt_length;
+ struct nfsd4_lock_denied lt_denied;
+};
+
+
+struct nfsd4_locku {
+ u32 lu_type;
+ u32 lu_seqid;
+ stateid_t lu_stateid;
+ u64 lu_offset;
+ u64 lu_length;
+};
+
+
+struct nfsd4_lookup {
+ u32 lo_len; /* request */
+ char * lo_name; /* request */
+};
+
+struct nfsd4_putfh {
+ u32 pf_fhlen; /* request */
+ char *pf_fhval; /* request */
+ bool no_verify; /* represents foreign fh */
+};
+
+struct nfsd4_getxattr {
+ char *getxa_name; /* request */
+ u32 getxa_len; /* request */
+ void *getxa_buf;
+};
+
+struct nfsd4_setxattr {
+ u32 setxa_flags; /* request */
+ char *setxa_name; /* request */
+ char *setxa_buf; /* request */
+ u32 setxa_len; /* request */
+ struct nfsd4_change_info setxa_cinfo; /* response */
+};
+
+struct nfsd4_removexattr {
+ char *rmxa_name; /* request */
+ struct nfsd4_change_info rmxa_cinfo; /* response */
+};
+
+struct nfsd4_listxattrs {
+ u64 lsxa_cookie; /* request */
+ u32 lsxa_maxcount; /* request */
+ char *lsxa_buf; /* unfiltered buffer (reply) */
+ u32 lsxa_len; /* unfiltered len (reply) */
+};
+
+struct nfsd4_open {
+ u32 op_claim_type; /* request */
+ struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */
+ u32 op_delegate_type; /* request - CLAIM_PREV only */
+ stateid_t op_delegate_stateid; /* request - response */
+ u32 op_why_no_deleg; /* response - DELEG_NONE_EXT only */
+ u32 op_create; /* request */
+ u32 op_createmode; /* request */
+ int op_umask; /* request */
+ u32 op_bmval[3]; /* request */
+ struct iattr op_iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */
+ nfs4_verifier op_verf __attribute__((aligned(32)));
+ /* EXCLUSIVE4 */
+ clientid_t op_clientid; /* request */
+ struct xdr_netobj op_owner; /* request */
+ u32 op_seqid; /* request */
+ u32 op_share_access; /* request */
+ u32 op_share_deny; /* request */
+ u32 op_deleg_want; /* request */
+ stateid_t op_stateid; /* response */
+ __be32 op_xdr_error; /* see nfsd4_open_omfg() */
+ u32 op_recall; /* recall */
+ struct nfsd4_change_info op_cinfo; /* response */
+ u32 op_rflags; /* response */
+ bool op_truncate; /* used during processing */
+ bool op_created; /* used during processing */
+ struct nfs4_openowner *op_openowner; /* used during processing */
+ struct nfs4_file *op_file; /* used during processing */
+ struct nfs4_ol_stateid *op_stp; /* used during processing */
+ struct nfs4_clnt_odstate *op_odstate; /* used during processing */
+ struct nfs4_acl *op_acl;
+ struct xdr_netobj op_label;
+};
+
+struct nfsd4_open_confirm {
+ stateid_t oc_req_stateid /* request */;
+ u32 oc_seqid /* request */;
+ stateid_t oc_resp_stateid /* response */;
+};
+
+struct nfsd4_open_downgrade {
+ stateid_t od_stateid;
+ u32 od_seqid;
+ u32 od_share_access; /* request */
+ u32 od_deleg_want; /* request */
+ u32 od_share_deny; /* request */
+};
+
+
+struct nfsd4_read {
+ stateid_t rd_stateid; /* request */
+ u64 rd_offset; /* request */
+ u32 rd_length; /* request */
+ int rd_vlen;
+ struct nfsd_file *rd_nf;
+
+ struct svc_rqst *rd_rqstp; /* response */
+ struct svc_fh *rd_fhp; /* response */
+};
+
+struct nfsd4_readdir {
+ u64 rd_cookie; /* request */
+ nfs4_verifier rd_verf; /* request */
+ u32 rd_dircount; /* request */
+ u32 rd_maxcount; /* request */
+ u32 rd_bmval[3]; /* request */
+ struct svc_rqst *rd_rqstp; /* response */
+ struct svc_fh * rd_fhp; /* response */
+
+ struct readdir_cd common;
+ struct xdr_stream *xdr;
+ int cookie_offset;
+};
+
+struct nfsd4_release_lockowner {
+ clientid_t rl_clientid;
+ struct xdr_netobj rl_owner;
+};
+struct nfsd4_readlink {
+ struct svc_rqst *rl_rqstp; /* request */
+ struct svc_fh * rl_fhp; /* request */
+};
+
+struct nfsd4_remove {
+ u32 rm_namelen; /* request */
+ char * rm_name; /* request */
+ struct nfsd4_change_info rm_cinfo; /* response */
+};
+
+struct nfsd4_rename {
+ u32 rn_snamelen; /* request */
+ char * rn_sname; /* request */
+ u32 rn_tnamelen; /* request */
+ char * rn_tname; /* request */
+ struct nfsd4_change_info rn_sinfo; /* response */
+ struct nfsd4_change_info rn_tinfo; /* response */
+};
+
+struct nfsd4_secinfo {
+ u32 si_namelen; /* request */
+ char *si_name; /* request */
+ struct svc_export *si_exp; /* response */
+};
+
+struct nfsd4_secinfo_no_name {
+ u32 sin_style; /* request */
+ struct svc_export *sin_exp; /* response */
+};
+
+struct nfsd4_setattr {
+ stateid_t sa_stateid; /* request */
+ u32 sa_bmval[3]; /* request */
+ struct iattr sa_iattr; /* request */
+ struct nfs4_acl *sa_acl;
+ struct xdr_netobj sa_label;
+};
+
+struct nfsd4_setclientid {
+ nfs4_verifier se_verf; /* request */
+ struct xdr_netobj se_name;
+ u32 se_callback_prog; /* request */
+ u32 se_callback_netid_len; /* request */
+ char * se_callback_netid_val; /* request */
+ u32 se_callback_addr_len; /* request */
+ char * se_callback_addr_val; /* request */
+ u32 se_callback_ident; /* request */
+ clientid_t se_clientid; /* response */
+ nfs4_verifier se_confirm; /* response */
+};
+
+struct nfsd4_setclientid_confirm {
+ clientid_t sc_clientid;
+ nfs4_verifier sc_confirm;
+};
+
+struct nfsd4_saved_compoundargs {
+ __be32 *p;
+ __be32 *end;
+ int pagelen;
+ struct page **pagelist;
+};
+
+struct nfsd4_test_stateid_id {
+ __be32 ts_id_status;
+ stateid_t ts_id_stateid;
+ struct list_head ts_id_list;
+};
+
+struct nfsd4_test_stateid {
+ u32 ts_num_ids;
+ struct list_head ts_stateid_list;
+};
+
+struct nfsd4_free_stateid {
+ stateid_t fr_stateid; /* request */
+};
+
+/* also used for NVERIFY */
+struct nfsd4_verify {
+ u32 ve_bmval[3]; /* request */
+ u32 ve_attrlen; /* request */
+ char * ve_attrval; /* request */
+};
+
+struct nfsd4_write {
+ stateid_t wr_stateid; /* request */
+ u64 wr_offset; /* request */
+ u32 wr_stable_how; /* request */
+ u32 wr_buflen; /* request */
+ struct kvec wr_head;
+ struct page ** wr_pagelist; /* request */
+
+ u32 wr_bytes_written; /* response */
+ u32 wr_how_written; /* response */
+ nfs4_verifier wr_verifier; /* response */
+};
+
+struct nfsd4_exchange_id {
+ nfs4_verifier verifier;
+ struct xdr_netobj clname;
+ u32 flags;
+ clientid_t clientid;
+ u32 seqid;
+ int spa_how;
+ u32 spo_must_enforce[3];
+ u32 spo_must_allow[3];
+ struct xdr_netobj nii_domain;
+ struct xdr_netobj nii_name;
+ struct timespec64 nii_time;
+};
+
+struct nfsd4_sequence {
+ struct nfs4_sessionid sessionid; /* request/response */
+ u32 seqid; /* request/response */
+ u32 slotid; /* request/response */
+ u32 maxslots; /* request/response */
+ u32 cachethis; /* request */
+#if 0
+ u32 target_maxslots; /* response */
+#endif /* not yet */
+ u32 status_flags; /* response */
+};
+
+struct nfsd4_destroy_session {
+ struct nfs4_sessionid sessionid;
+};
+
+struct nfsd4_destroy_clientid {
+ clientid_t clientid;
+};
+
+struct nfsd4_reclaim_complete {
+ u32 rca_one_fs;
+};
+
+struct nfsd4_deviceid {
+ u64 fsid_idx;
+ u32 generation;
+ u32 pad;
+};
+
+struct nfsd4_layout_seg {
+ u32 iomode;
+ u64 offset;
+ u64 length;
+};
+
+struct nfsd4_getdeviceinfo {
+ struct nfsd4_deviceid gd_devid; /* request */
+ u32 gd_layout_type; /* request */
+ u32 gd_maxcount; /* request */
+ u32 gd_notify_types;/* request - response */
+ void *gd_device; /* response */
+};
+
+struct nfsd4_layoutget {
+ u64 lg_minlength; /* request */
+ u32 lg_signal; /* request */
+ u32 lg_layout_type; /* request */
+ u32 lg_maxcount; /* request */
+ stateid_t lg_sid; /* request/response */
+ struct nfsd4_layout_seg lg_seg; /* request/response */
+ void *lg_content; /* response */
+};
+
+struct nfsd4_layoutcommit {
+ stateid_t lc_sid; /* request */
+ struct nfsd4_layout_seg lc_seg; /* request */
+ u32 lc_reclaim; /* request */
+ u32 lc_newoffset; /* request */
+ u64 lc_last_wr; /* request */
+ struct timespec64 lc_mtime; /* request */
+ u32 lc_layout_type; /* request */
+ u32 lc_up_len; /* layout length */
+ void *lc_up_layout; /* decoded by callback */
+ u32 lc_size_chg; /* boolean for response */
+ u64 lc_newsize; /* response */
+};
+
+struct nfsd4_layoutreturn {
+ u32 lr_return_type; /* request */
+ u32 lr_layout_type; /* request */
+ struct nfsd4_layout_seg lr_seg; /* request */
+ u32 lr_reclaim; /* request */
+ u32 lrf_body_len; /* request */
+ void *lrf_body; /* request */
+ stateid_t lr_sid; /* request/response */
+ u32 lrs_present; /* response */
+};
+
+struct nfsd4_fallocate {
+ /* request */
+ stateid_t falloc_stateid;
+ loff_t falloc_offset;
+ u64 falloc_length;
+};
+
+struct nfsd4_clone {
+ /* request */
+ stateid_t cl_src_stateid;
+ stateid_t cl_dst_stateid;
+ u64 cl_src_pos;
+ u64 cl_dst_pos;
+ u64 cl_count;
+};
+
+struct nfsd42_write_res {
+ u64 wr_bytes_written;
+ u32 wr_stable_how;
+ nfs4_verifier wr_verifier;
+ stateid_t cb_stateid;
+};
+
+struct nfsd4_copy {
+ /* request */
+ stateid_t cp_src_stateid;
+ stateid_t cp_dst_stateid;
+ u64 cp_src_pos;
+ u64 cp_dst_pos;
+ u64 cp_count;
+ struct nl4_server cp_src;
+ bool cp_intra;
+
+ /* both */
+ bool cp_synchronous;
+
+ /* response */
+ struct nfsd42_write_res cp_res;
+
+ /* for cb_offload */
+ struct nfsd4_callback cp_cb;
+ __be32 nfserr;
+ struct knfsd_fh fh;
+
+ struct nfs4_client *cp_clp;
+
+ struct nfsd_file *nf_src;
+ struct nfsd_file *nf_dst;
+
+ copy_stateid_t cp_stateid;
+
+ struct list_head copies;
+ struct task_struct *copy_task;
+ refcount_t refcount;
+ bool stopped;
+
+ struct vfsmount *ss_mnt;
+ struct nfs_fh c_fh;
+ nfs4_stateid stateid;
+};
+extern bool inter_copy_offload_enable;
+
+struct nfsd4_seek {
+ /* request */
+ stateid_t seek_stateid;
+ loff_t seek_offset;
+ u32 seek_whence;
+
+ /* response */
+ u32 seek_eof;
+ loff_t seek_pos;
+};
+
+struct nfsd4_offload_status {
+ /* request */
+ stateid_t stateid;
+
+ /* response */
+ u64 count;
+ u32 status;
+};
+
+struct nfsd4_copy_notify {
+ /* request */
+ stateid_t cpn_src_stateid;
+ struct nl4_server cpn_dst;
+
+ /* response */
+ stateid_t cpn_cnr_stateid;
+ u64 cpn_sec;
+ u32 cpn_nsec;
+ struct nl4_server cpn_src;
+};
+
+struct nfsd4_op {
+ int opnum;
+ const struct nfsd4_operation * opdesc;
+ __be32 status;
+ union nfsd4_op_u {
+ struct nfsd4_access access;
+ struct nfsd4_close close;
+ struct nfsd4_commit commit;
+ struct nfsd4_create create;
+ struct nfsd4_delegreturn delegreturn;
+ struct nfsd4_getattr getattr;
+ struct svc_fh * getfh;
+ struct nfsd4_link link;
+ struct nfsd4_lock lock;
+ struct nfsd4_lockt lockt;
+ struct nfsd4_locku locku;
+ struct nfsd4_lookup lookup;
+ struct nfsd4_verify nverify;
+ struct nfsd4_open open;
+ struct nfsd4_open_confirm open_confirm;
+ struct nfsd4_open_downgrade open_downgrade;
+ struct nfsd4_putfh putfh;
+ struct nfsd4_read read;
+ struct nfsd4_readdir readdir;
+ struct nfsd4_readlink readlink;
+ struct nfsd4_remove remove;
+ struct nfsd4_rename rename;
+ clientid_t renew;
+ struct nfsd4_secinfo secinfo;
+ struct nfsd4_setattr setattr;
+ struct nfsd4_setclientid setclientid;
+ struct nfsd4_setclientid_confirm setclientid_confirm;
+ struct nfsd4_verify verify;
+ struct nfsd4_write write;
+ struct nfsd4_release_lockowner release_lockowner;
+
+ /* NFSv4.1 */
+ struct nfsd4_exchange_id exchange_id;
+ struct nfsd4_backchannel_ctl backchannel_ctl;
+ struct nfsd4_bind_conn_to_session bind_conn_to_session;
+ struct nfsd4_create_session create_session;
+ struct nfsd4_destroy_session destroy_session;
+ struct nfsd4_destroy_clientid destroy_clientid;
+ struct nfsd4_sequence sequence;
+ struct nfsd4_reclaim_complete reclaim_complete;
+ struct nfsd4_test_stateid test_stateid;
+ struct nfsd4_free_stateid free_stateid;
+ struct nfsd4_getdeviceinfo getdeviceinfo;
+ struct nfsd4_layoutget layoutget;
+ struct nfsd4_layoutcommit layoutcommit;
+ struct nfsd4_layoutreturn layoutreturn;
+ struct nfsd4_secinfo_no_name secinfo_no_name;
+
+ /* NFSv4.2 */
+ struct nfsd4_fallocate allocate;
+ struct nfsd4_fallocate deallocate;
+ struct nfsd4_clone clone;
+ struct nfsd4_copy copy;
+ struct nfsd4_offload_status offload_status;
+ struct nfsd4_copy_notify copy_notify;
+ struct nfsd4_seek seek;
+
+ struct nfsd4_getxattr getxattr;
+ struct nfsd4_setxattr setxattr;
+ struct nfsd4_listxattrs listxattrs;
+ struct nfsd4_removexattr removexattr;
+ } u;
+ struct nfs4_replay * replay;
+};
+
+bool nfsd4_cache_this_op(struct nfsd4_op *);
+
+/*
+ * Memory needed just for the duration of processing one compound:
+ */
+struct svcxdr_tmpbuf {
+ struct svcxdr_tmpbuf *next; /* link to the next buffer in the chain */
+ char buf[]; /* flexible array member holding the buffer bytes */
+};
+
+struct nfsd4_compoundargs {
+ /* scratch variables for XDR decode */
+ __be32 * p;
+ __be32 * end;
+ struct page ** pagelist;
+ int pagelen;
+ bool tail;
+ __be32 tmp[8];
+ __be32 * tmpp;
+ struct svcxdr_tmpbuf *to_free;
+
+ struct svc_rqst *rqstp;
+
+ u32 taglen;
+ char * tag;
+ u32 minorversion;
+ u32 opcnt;
+ struct nfsd4_op *ops;
+ struct nfsd4_op iops[8];
+ int cachetype;
+};
+
+struct nfsd4_compoundres {
+ /* scratch variables for XDR encode */
+ struct xdr_stream xdr;
+ struct svc_rqst * rqstp;
+
+ u32 taglen;
+ char * tag;
+ u32 opcnt;
+ __be32 * tagp; /* tag, opcount encode location */
+ struct nfsd4_compound_state cstate;
+};
+
+static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp)
+{
+ struct nfsd4_compoundargs *argp = resp->rqstp->rq_argp; /* args of this request */
+ return resp->opcnt != 1 ? false : argp->ops->opnum == OP_SEQUENCE;
+}
+
+/*
+ * Strictly, the session reply cache need only keep the replies a client
+ * asked to have cached (NFSD4_SLOT_CACHETHIS set on the slot).  Compounds
+ * made up of a lone SEQUENCE op cost next to nothing to keep, so cache
+ * those as well.  That also sidesteps a protocol gap: a replayed solo
+ * SEQUENCE that was not cached has no convenient reply, because
+ * RETRY_UNCACHED_REP may only be returned by the second op of a
+ * compound.
+ */
+static inline bool nfsd4_cache_this(struct nfsd4_compoundres *resp)
+{
+ bool requested = resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS;
+ return requested || nfsd4_is_solo_sequence(resp);
+}
+
+static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
+{
+ const struct nfsd4_compoundargs *req = rqstp->rq_argp;
+ const struct nfsd4_compoundres *rep = rqstp->rq_resp;
+ /* true when the response op count has reached the argument op count */
+ return rep->opcnt == req->opcnt;
+}
+
+const struct nfsd4_operation *OPDESC(struct nfsd4_op *op);
+int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op);
+void warn_on_nonidempotent_op(struct nfsd4_op *op);
+
+#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs)
+
+static inline void
+set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
+{
+ BUG_ON(!fhp->fh_pre_saved); /* fh_pre_saved must already be set */
+ cinfo->atomic = (u32)fhp->fh_post_saved;
+ cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry));
+ /* "before" values are copied from the fh_pre_* fields ... */
+ cinfo->before_change = fhp->fh_pre_change;
+ cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
+ cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
+ /* ... and "after" values from the fh_post_* fields */
+ cinfo->after_change = fhp->fh_post_change;
+ cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
+ cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
+}
+
+
+bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp);
+int nfs4svc_decode_voidarg(struct svc_rqst *, __be32 *);
+int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *);
+int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *);
+int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *);
+__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
+void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
+void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op);
+__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words,
+ struct svc_fh *fhp, struct svc_export *exp,
+ struct dentry *dentry,
+ u32 *bmval, struct svc_rqst *, int ignore_crossmnt);
+extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *, union nfsd4_op_u *u);
+extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *, union nfsd4_op_u *u);
+extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *, union nfsd4_op_u *u);
+extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *,
+ struct nfsd4_compound_state *, union nfsd4_op_u *u);
+extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *,
+ struct nfsd4_compound_state *, union nfsd4_op_u *u);
+extern __be32 nfsd4_create_session(struct svc_rqst *,
+ struct nfsd4_compound_state *, union nfsd4_op_u *u);
+extern __be32 nfsd4_sequence(struct svc_rqst *,
+ struct nfsd4_compound_state *, union nfsd4_op_u *u);
+extern void nfsd4_sequence_done(struct nfsd4_compoundres *resp);
+extern __be32 nfsd4_destroy_session(struct svc_rqst *,
+ struct nfsd4_compound_state *, union nfsd4_op_u *u);
+extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *,
+ union nfsd4_op_u *u);
+__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *,
+ union nfsd4_op_u *u);
+extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
+ struct nfsd4_open *open, struct nfsd_net *nn);
+extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
+ struct svc_fh *current_fh, struct nfsd4_open *open);
+extern void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate);
+extern void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
+ struct nfsd4_open *open);
+extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *, union nfsd4_op_u *u);
+extern __be32 nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
+ union nfsd4_op_u *u);
+extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *, union nfsd4_op_u *u);
+extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
+ union nfsd4_op_u *u);
+extern __be32 nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
+ union nfsd4_op_u *u);
+extern __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
+ union nfsd4_op_u *u);
+extern __be32
+nfsd4_release_lockowner(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *, union nfsd4_op_u *u);
+extern void nfsd4_release_compoundargs(struct svc_rqst *rqstp);
+extern __be32 nfsd4_delegreturn(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *, union nfsd4_op_u *u);
+extern __be32 nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
+ union nfsd4_op_u *u);
+extern __be32 nfsd4_test_stateid(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *, union nfsd4_op_u *);
+extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *, union nfsd4_op_u *);
+extern void nfsd4_bump_seqid(struct nfsd4_compound_state *, __be32 nfserr);
+
+enum nfsd4_op_flags {
+ ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
+ ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */
+ ALLOWED_AS_FIRST_OP = 1 << 2, /* ops required first in compound */
+ /* For rfc 5661 section 2.6.3.1.1: */
+ OP_HANDLES_WRONGSEC = 1 << 3,
+ OP_IS_PUTFH_LIKE = 1 << 4,
+ /*
+ * These are the ops whose result size we estimate before
+ * encoding, to avoid performing an op then not being able to
+ * respond or cache a response. This includes writes and setattrs
+ * as well as the operations usually called "nonidempotent":
+ */
+ OP_MODIFIES_SOMETHING = 1 << 5,
+ /*
+ * Cache compounds containing these ops in the xid-based drc:
+ * We use the DRC for compounds containing non-idempotent
+ * operations, *except* those that are 4.1-specific (since
+ * sessions provide their own EOS), and except for stateful
+ * operations other than setclientid and setclientid_confirm
+ * (since sequence numbers provide EOS for open, lock, etc in
+ * the v4.0 case).
+ */
+ OP_CACHEME = 1 << 6,
+ /*
+ * These are ops which clear current state id.
+ */
+ OP_CLEAR_STATEID = 1 << 7,
+ /* Most ops return only an error on failure; some may do more: */
+ OP_NONTRIVIAL_ERROR_ENCODE = 1 << 8,
+};
+
+struct nfsd4_operation {
+ __be32 (*op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
+ union nfsd4_op_u *);
+ void (*op_release)(union nfsd4_op_u *);
+ u32 op_flags;
+ char *op_name;
+ /* Try to get response size before operation */
+ u32 (*op_rsize_bop)(struct svc_rqst *, struct nfsd4_op *);
+ void (*op_get_currentstateid)(struct nfsd4_compound_state *,
+ union nfsd4_op_u *);
+ void (*op_set_currentstateid)(struct nfsd4_compound_state *,
+ union nfsd4_op_u *);
+};
+
+
+#endif
+
+/*
+ * Local variables:
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
new file mode 100644
index 000000000..547cf07cf
--- /dev/null
+++ b/fs/nfsd/xdr4cb.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#define NFS4_MAXTAGLEN 20
+
+#define NFS4_enc_cb_null_sz 0
+#define NFS4_dec_cb_null_sz 0
+#define cb_compound_enc_hdr_sz 4
+#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2))
+#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2) /* length divided by 4 */
+#define cb_sequence_enc_sz (sessionid_sz + 4 + \
+ 1 /* no referring calls list yet */)
+#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4) /* op_dec_sz is defined below; resolved at expansion */
+
+#define op_enc_sz 1
+#define op_dec_sz 2
+#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2)) /* 1 + length divided by 4 */
+#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2) /* length divided by 4 */
+#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
+ 1 + enc_stateid_sz + \
+ enc_nfs4_fh_sz)
+
+#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
+ op_dec_sz)
+#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
+ 1 + 3 + \
+ enc_nfs4_fh_sz + 4)
+#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
+ op_dec_sz)
+
+#define NFS4_enc_cb_notify_lock_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
+ 2 + 1 + \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+ enc_nfs4_fh_sz)
+#define NFS4_dec_cb_notify_lock_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
+ op_dec_sz)
+#define enc_cb_offload_info_sz (1 + 1 + 2 + 1 + \
+ XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define NFS4_enc_cb_offload_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
+ enc_nfs4_fh_sz + \
+ enc_stateid_sz + \
+ enc_cb_offload_info_sz)
+#define NFS4_dec_cb_offload_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
+ op_dec_sz)